update
This commit is contained in:
47
README.md
47
README.md
@@ -149,27 +149,38 @@ client = Client(
|
|||||||
concurrent=10, # Number of concurrent requests
|
concurrent=10, # Number of concurrent requests
|
||||||
)
|
)
|
||||||
|
|
||||||
# Translate a single file (auto-detects file type)
|
# Example 1: Translate plain text files (no PDF parsing engine needed)
|
||||||
result = client.translate("path/to/your/document.pdf")
|
result = client.translate("path/to/your/document.txt")
|
||||||
|
|
||||||
# Save with default format (PDF -> html by default)
|
|
||||||
print(f"Translation complete! Saved to: {result.save()}")
|
print(f"Translation complete! Saved to: {result.save()}")
|
||||||
|
|
||||||
# Or specify output format explicitly
|
# Example 2: Translate PDF files (requires mineru_token or local deployment)
|
||||||
# For PDF/markdown_based:
|
# Option A: Use online MinerU (token required: https://mineru.net/apiManage/token)
|
||||||
# - "markdown": Markdown with embedded base64 images (default)
|
result = client.translate(
|
||||||
# - "markdown_zip": Markdown with separate image files (ZIP archive)
|
"path/to/your/document.pdf",
|
||||||
# - "html": HTML format
|
convert_engine="mineru",
|
||||||
# For docx: "docx"
|
mineru_token="YOUR_MINERU_TOKEN", # Replace with your MinerU Token
|
||||||
# For xlsx: "xlsx"
|
formula_ocr=True, # Enable formula recognition
|
||||||
result.save(fmt="html") # Save as HTML
|
)
|
||||||
result.save(fmt="markdown") # Save as Markdown with embedded images
|
result.save(fmt="html")
|
||||||
result.save(fmt="markdown_zip") # Save as ZIP with separate images
|
|
||||||
|
|
||||||
# Save to custom location
|
# Option B: Use locally deployed MinerU (recommended for intranet/offline)
|
||||||
result.save(output_dir="./my_translations", name="my_document.html")
|
# Start the local MinerU service first; see https://github.com/opendatalab/MinerU
|
||||||
|
result = client.translate(
|
||||||
|
"path/to/your/document.pdf",
|
||||||
|
convert_engine="mineru_deploy",
|
||||||
|
mineru_deploy_base_url="http://127.0.0.1:8000", # Your local MinerU address
|
||||||
|
mineru_deploy_backend="hybrid-auto-engine", # Backend type
|
||||||
|
)
|
||||||
|
result.save(fmt="markdown")
|
||||||
|
|
||||||
# Export as base64 encoded string
|
# Example 3: Translate Docx files (preserve formatting)
|
||||||
|
result = client.translate(
|
||||||
|
"path/to/your/document.docx",
|
||||||
|
insert_mode="replace", # replace/append/prepend
|
||||||
|
)
|
||||||
|
result.save(fmt="docx") # Save as docx format
|
||||||
|
|
||||||
|
# Example 4: Export as base64 encoded string (for API transmission)
|
||||||
base64_content = result.export(fmt="html")
|
base64_content = result.export(fmt="html")
|
||||||
print(f"Exported content length: {len(base64_content)}")
|
print(f"Exported content length: {len(base64_content)}")
|
||||||
|
|
||||||
@@ -194,6 +205,8 @@ print(f"Exported content length: {len(base64_content)}")
|
|||||||
| **concurrent** | `int` | 10 | Number of concurrent LLM requests |
|
| **concurrent** | `int` | 10 | Number of concurrent LLM requests |
|
||||||
| **convert_engine** | `str` | `"mineru"` | PDF parsing engine: `"mineru"`, `"docling"`, `"mineru_deploy"` |
|
| **convert_engine** | `str` | `"identity"` | PDF parsing engine: `"identity"`, `"mineru"`, `"docling"`, `"mineru_deploy"` |
|
||||||
| **mineru_deploy_base_url** | `str` | - | Local minerU API address (when `convert_engine="mineru_deploy"`) |
|
| **mineru_deploy_base_url** | `str` | - | Local minerU API address (when `convert_engine="mineru_deploy"`) |
|
||||||
|
| **mineru_deploy_parse_method** | `str` | `"auto"` | Local minerU parsing method: `"auto"`, `"txt"`, `"ocr"` |
|
||||||
|
| **mineru_deploy_table_enable** | `bool` | `True` | Enable table recognition for local minerU |
|
||||||
| **mineru_token** | `str` | - | minerU API token (when using online minerU) |
|
| **mineru_token** | `str` | - | minerU API token (when using online minerU) |
|
||||||
| **skip_translate** | `bool` | `False` | Skip translation, only parse document |
|
| **skip_translate** | `bool` | `False` | Skip translation, only parse document |
|
||||||
| **output_dir** | `str` | `"./output"` | Default output directory for `save()` |
|
| **output_dir** | `str` | `"./output"` | Default output directory for `save()` |
|
||||||
|
|||||||
47
README_JP.md
47
README_JP.md
@@ -149,27 +149,38 @@ client = Client(
|
|||||||
concurrent=10, # 同時リクエスト数
|
concurrent=10, # 同時リクエスト数
|
||||||
)
|
)
|
||||||
|
|
||||||
# 単一ファイルを翻訳 (ファイルタイプを自動検出)
|
# 例 1: テキストファイルを翻訳 (PDF 解析エンジンが不要)
|
||||||
result = client.translate("path/to/your/document.pdf")
|
result = client.translate("path/to/your/document.txt")
|
||||||
|
|
||||||
# デフォルトフォーマットで保存 (PDF -> markdown with embedded images)
|
|
||||||
print(f"翻訳完了!保存先: {result.save()}")
|
print(f"翻訳完了!保存先: {result.save()}")
|
||||||
|
|
||||||
# または出力フォーマットを明示的に指定
|
# 例 2: PDF ファイルを翻訳 (mineru_token またはローカルデプロイが必要)
|
||||||
# PDF/markdown_based は以下をサポート:
|
# 方式 A: オンライン MinerU を使用 (token が必要: https://mineru.net/apiManage/token)
|
||||||
# - "markdown": Markdown フォーマット、base64 画像埋め込み (デフォルト)
|
result = client.translate(
|
||||||
# - "markdown_zip": Markdown フォーマット、画像分離保存 (ZIP アーカイブ)
|
"path/to/your/document.pdf",
|
||||||
# - "html": HTML フォーマット
|
convert_engine="mineru",
|
||||||
# docx は "docx" をサポート
|
mineru_token="YOUR_MINERU_TOKEN", # MinerU Token に置き換える
|
||||||
# xlsx は "xlsx" をサポート
|
formula_ocr=True, # 数式認識を有効化
|
||||||
result.save(fmt="html") # HTML として保存
|
)
|
||||||
result.save(fmt="markdown") # Markdown として保存(画像埋め込み)
|
result.save(fmt="html")
|
||||||
result.save(fmt="markdown_zip") # ZIP として保存(画像分離)
|
|
||||||
|
|
||||||
# カスタム場所に保存
|
# 方式 B: ローカルデプロイの MinerU を使用 (イントラネット/オフライン環境推奨)
|
||||||
result.save(output_dir="./my_translations", name="my_document.html")
|
# ローカル MinerU サービスを先に起動してください。参考: https://github.com/opendatalab/MinerU
|
||||||
|
result = client.translate(
|
||||||
|
"path/to/your/document.pdf",
|
||||||
|
convert_engine="mineru_deploy",
|
||||||
|
mineru_deploy_base_url="http://127.0.0.1:8000", # ローカル MinerU アドレス
|
||||||
|
mineru_deploy_backend="hybrid-auto-engine", # バックエンドタイプ
|
||||||
|
)
|
||||||
|
result.save(fmt="markdown")
|
||||||
|
|
||||||
# または Base64 エンコード文字列としてエクスポート
|
# 例 3: Docx ファイルを翻訳 (書式保持)
|
||||||
|
result = client.translate(
|
||||||
|
"path/to/your/document.docx",
|
||||||
|
insert_mode="replace", # replace/append/prepend
|
||||||
|
)
|
||||||
|
result.save(fmt="docx") # docx フォーマットで保存
|
||||||
|
|
||||||
|
# 例 4: Base64 エンコード文字列としてエクスポート (API 転送用)
|
||||||
base64_content = result.export(fmt="html")
|
base64_content = result.export(fmt="html")
|
||||||
print(f"エクスポートコンテンツ長さ: {len(base64_content)}")
|
print(f"エクスポートコンテンツ長さ: {len(base64_content)}")
|
||||||
|
|
||||||
@@ -194,6 +205,8 @@ print(f"エクスポートコンテンツ長さ: {len(base64_content)}")
|
|||||||
| **concurrent** | `int` | 10 | 同時 LLM リクエスト数 |
|
| **concurrent** | `int` | 10 | 同時 LLM リクエスト数 |
|
||||||
| **convert_engine** | `str` | `"mineru"` | PDF 解析エンジン: `"mineru"`、`"docling"`、`"mineru_deploy"` |
|
| **convert_engine** | `str` | `"identity"` | PDF 解析エンジン: `"identity"`、`"mineru"`、`"docling"`、`"mineru_deploy"` |
|
||||||
| **mineru_deploy_base_url** | `str` | - | ローカル minerU API アドレス(`convert_engine="mineru_deploy"` の場合) |
|
| **mineru_deploy_base_url** | `str` | - | ローカル minerU API アドレス(`convert_engine="mineru_deploy"` の場合) |
|
||||||
|
| **mineru_deploy_parse_method** | `str` | `"auto"` | ローカル minerU 解析方法: `"auto"`、`"txt"`、`"ocr"` |
|
||||||
|
| **mineru_deploy_table_enable** | `bool` | `True` | ローカル minerU テーブル認識を有効化するか |
|
||||||
| **mineru_token** | `str` | - | minerU API Token(オンライン minerU 使用時) |
|
| **mineru_token** | `str` | - | minerU API Token(オンライン minerU 使用時) |
|
||||||
| **skip_translate** | `bool` | `False` | 翻訳をスキップしてドキュメントのみを解析 |
|
| **skip_translate** | `bool` | `False` | 翻訳をスキップしてドキュメントのみを解析 |
|
||||||
| **output_dir** | `str` | `"./output"` | `save()` メソッドのデフォルト出力ディレクトリ |
|
| **output_dir** | `str` | `"./output"` | `save()` メソッドのデフォルト出力ディレクトリ |
|
||||||
|
|||||||
47
README_ZH.md
47
README_ZH.md
@@ -150,27 +150,38 @@ client = Client(
|
|||||||
concurrent=10, # 并发请求数
|
concurrent=10, # 并发请求数
|
||||||
)
|
)
|
||||||
|
|
||||||
# 翻译单个文件 (自动检测文件类型)
|
# 示例 1: 翻译纯文本文件 (无需 PDF 解析引擎)
|
||||||
result = client.translate("path/to/your/document.pdf")
|
result = client.translate("path/to/your/document.txt")
|
||||||
|
|
||||||
# 使用默认格式保存 (PDF -> markdown with embedded images)
|
|
||||||
print(f"翻译完成!保存位置: {result.save()}")
|
print(f"翻译完成!保存位置: {result.save()}")
|
||||||
|
|
||||||
# 或显式指定输出格式
|
# 示例 2: 翻译 PDF 文件 (需要指定 mineru_token 或使用本地部署)
|
||||||
# PDF/markdown_based 支持:
|
# 方式 A: 使用在线 MinerU (需要申请 token: https://mineru.net/apiManage/token)
|
||||||
# - "markdown": Markdown 格式,内嵌 base64 图片 (默认)
|
result = client.translate(
|
||||||
# - "markdown_zip": Markdown 格式,图片分离存储 (ZIP 压缩包)
|
"path/to/your/document.pdf",
|
||||||
# - "html": HTML 格式
|
convert_engine="mineru",
|
||||||
# docx 支持: "docx"
|
mineru_token="YOUR_MINERU_TOKEN", # 替换为您的 MinerU Token
|
||||||
# xlsx 支持: "xlsx"
|
formula_ocr=True, # 启用公式识别
|
||||||
result.save(fmt="html") # 保存为 HTML
|
)
|
||||||
result.save(fmt="markdown") # 保存为 Markdown(内嵌图片)
|
result.save(fmt="html")
|
||||||
result.save(fmt="markdown_zip") # 保存为 ZIP(图片分离)
|
|
||||||
|
|
||||||
# 保存到自定义位置
|
# 方式 B: 使用本地部署的 MinerU (推荐内网/离线环境)
|
||||||
result.save(output_dir="./my_translations", name="my_document.html")
|
# 需要先启动本地 MinerU 服务,参考: https://github.com/opendatalab/MinerU
|
||||||
|
result = client.translate(
|
||||||
|
"path/to/your/document.pdf",
|
||||||
|
convert_engine="mineru_deploy",
|
||||||
|
mineru_deploy_base_url="http://127.0.0.1:8000", # 您的本地 MinerU 地址
|
||||||
|
mineru_deploy_backend="hybrid-auto-engine", # 后端类型
|
||||||
|
)
|
||||||
|
result.save(fmt="markdown")
|
||||||
|
|
||||||
# 导出为 Base64 编码字符串
|
# 示例 3: 翻译 Docx 文件 (保持格式)
|
||||||
|
result = client.translate(
|
||||||
|
"path/to/your/document.docx",
|
||||||
|
insert_mode="replace", # replace/append/prepend
|
||||||
|
)
|
||||||
|
result.save(fmt="docx") # 保存为 docx 格式
|
||||||
|
|
||||||
|
# 示例 4: 导出为 Base64 编码字符串 (用于 API 传输)
|
||||||
base64_content = result.export(fmt="html")
|
base64_content = result.export(fmt="html")
|
||||||
print(f"导出内容长度: {len(base64_content)}")
|
print(f"导出内容长度: {len(base64_content)}")
|
||||||
|
|
||||||
@@ -195,6 +206,8 @@ print(f"导出内容长度: {len(base64_content)}")
|
|||||||
| **concurrent** | `int` | 10 | 并发 LLM 请求数 |
|
| **concurrent** | `int` | 10 | 并发 LLM 请求数 |
|
||||||
| **convert_engine** | `str` | `"mineru"` | PDF 解析引擎:`"mineru"`、`"docling"`、`"mineru_deploy"` |
|
| **convert_engine** | `str` | `"identity"` | PDF 解析引擎:`"identity"`、`"mineru"`、`"docling"`、`"mineru_deploy"` |
|
||||||
| **mineru_deploy_base_url** | `str` | - | 本地 minerU API 地址(当 `convert_engine="mineru_deploy"` 时) |
|
| **mineru_deploy_base_url** | `str` | - | 本地 minerU API 地址(当 `convert_engine="mineru_deploy"` 时) |
|
||||||
|
| **mineru_deploy_parse_method** | `str` | `"auto"` | 本地 minerU 解析方法:`"auto"`、`"txt"`、`"ocr"` |
|
||||||
|
| **mineru_deploy_table_enable** | `bool` | `True` | 本地 minerU 是否启用表格识别 |
|
||||||
| **mineru_token** | `str` | - | minerU API Token(使用在线 minerU 时) |
|
| **mineru_token** | `str` | - | minerU API Token(使用在线 minerU 时) |
|
||||||
| **skip_translate** | `bool` | `False` | 跳过翻译,仅解析文档 |
|
| **skip_translate** | `bool` | `False` | 跳过翻译,仅解析文档 |
|
||||||
| **output_dir** | `str` | `"./output"` | `save()` 方法的默认输出目录 |
|
| **output_dir** | `str` | `"./output"` | `save()` 方法的默认输出目录 |
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ class TranslationResult:
|
|||||||
method = getattr(self._workflow, method_name, None)
|
method = getattr(self._workflow, method_name, None)
|
||||||
if method:
|
if method:
|
||||||
method(name=name, output_dir=output_dir)
|
method(name=name, output_dir=output_dir)
|
||||||
return os.path.join(output_dir, name)
|
return str(Path(output_dir) / name)
|
||||||
raise AttributeError(f"Workflow 缺少方法 {method_name}")
|
raise AttributeError(f"Workflow 缺少方法 {method_name}")
|
||||||
|
|
||||||
def export(self, fmt: Optional[str] = None) -> str:
|
def export(self, fmt: Optional[str] = None) -> str:
|
||||||
@@ -126,11 +126,10 @@ class TranslationResult:
|
|||||||
fmt = self._supported_formats[0]
|
fmt = self._supported_formats[0]
|
||||||
export_key = fmt
|
export_key = fmt
|
||||||
|
|
||||||
method_name = f"export_to_{export_key.replace('markdown_zip', 'markdown_zip')}"
|
# 构建方法名
|
||||||
# 特殊处理 markdown_zip -> export_to_markdown_zip
|
|
||||||
if export_key == "markdown_zip":
|
if export_key == "markdown_zip":
|
||||||
method_name = "export_to_markdown_zip"
|
method_name = "export_to_markdown_zip"
|
||||||
elif export_key in ["html", "markdown"]:
|
else:
|
||||||
method_name = f"export_to_{export_key}"
|
method_name = f"export_to_{export_key}"
|
||||||
|
|
||||||
method = getattr(self._workflow, method_name, None)
|
method = getattr(self._workflow, method_name, None)
|
||||||
@@ -163,8 +162,8 @@ class Client:
|
|||||||
retry: int = default_params["retry"],
|
retry: int = default_params["retry"],
|
||||||
thinking: ThinkingMode = default_params["thinking"],
|
thinking: ThinkingMode = default_params["thinking"],
|
||||||
system_proxy_enable: bool = default_params["system_proxy_enable"],
|
system_proxy_enable: bool = default_params["system_proxy_enable"],
|
||||||
convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None,
|
convert_engine: Literal["identity", "mineru", "docling", "mineru_deploy"] = "identity",
|
||||||
mineru_token: Optional[str] = None,
|
mineru_token: str = "",
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@@ -204,22 +203,24 @@ class Client:
|
|||||||
rpm: Optional[int] = None,
|
rpm: Optional[int] = None,
|
||||||
tpm: Optional[int] = None,
|
tpm: Optional[int] = None,
|
||||||
provider: Optional[Union[ProviderType, str]] = None,
|
provider: Optional[Union[ProviderType, str]] = None,
|
||||||
insert_mode: Optional[InsertMode] = None,
|
insert_mode: Literal["replace", "append", "prepend"] = "replace",
|
||||||
separator: Optional[str] = None,
|
separator: str = "\n",
|
||||||
segment_mode: Optional[Literal["line", "paragraph", "none"]] = None,
|
segment_mode: Literal["line", "paragraph", "none"] = "line",
|
||||||
translate_regions: Optional[List[str]] = None,
|
translate_regions: Optional[List[str]] = None,
|
||||||
convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None,
|
convert_engine: Literal["identity", "mineru", "docling", "mineru_deploy"] = "identity",
|
||||||
mineru_token: Optional[str] = None,
|
mineru_token: str = "",
|
||||||
model_version: Optional[Literal["pipeline", "vlm"]] = None,
|
model_version: Literal["pipeline", "vlm"] = "vlm",
|
||||||
formula_ocr: Optional[bool] = None,
|
formula_ocr: bool = True,
|
||||||
code_ocr: Optional[bool] = None,
|
code_ocr: bool = True,
|
||||||
mineru_deploy_base_url: Optional[str] = None,
|
mineru_deploy_base_url: str = "http://127.0.0.1:8000",
|
||||||
mineru_deploy_backend: Optional[str] = None,
|
mineru_deploy_backend: Literal["pipeline", "vlm-auto-engine", "vlm-http-client", "hybrid-auto-engine", "hybrid-http-client"] = "hybrid-auto-engine",
|
||||||
mineru_deploy_formula_enable: Optional[bool] = None,
|
mineru_deploy_parse_method: Literal["auto", "txt", "ocr"] = "auto",
|
||||||
mineru_deploy_start_page_id: Optional[int] = None,
|
mineru_deploy_formula_enable: bool = True,
|
||||||
mineru_deploy_end_page_id: Optional[int] = None,
|
mineru_deploy_table_enable: bool = True,
|
||||||
|
mineru_deploy_start_page_id: int = 0,
|
||||||
|
mineru_deploy_end_page_id: int = 99999,
|
||||||
mineru_deploy_lang_list: Optional[List[str]] = None,
|
mineru_deploy_lang_list: Optional[List[str]] = None,
|
||||||
mineru_deploy_server_url: Optional[str] = None,
|
mineru_deploy_server_url: str = "",
|
||||||
json_paths: Optional[List[str]] = None,
|
json_paths: Optional[List[str]] = None,
|
||||||
glossary_generate_enable: Optional[bool] = None,
|
glossary_generate_enable: Optional[bool] = None,
|
||||||
glossary_dict: Optional[Dict[str, str]] = None,
|
glossary_dict: Optional[Dict[str, str]] = None,
|
||||||
@@ -270,26 +271,28 @@ class Client:
|
|||||||
provider: Optional[Union[ProviderType, str]] = None,
|
provider: Optional[Union[ProviderType, str]] = None,
|
||||||
|
|
||||||
# --- 格式参数 (Docx/Excel/Txt) ---
|
# --- 格式参数 (Docx/Excel/Txt) ---
|
||||||
insert_mode: Optional[InsertMode] = None,
|
insert_mode: Literal["replace", "append", "prepend"] = "replace",
|
||||||
separator: Optional[str] = None,
|
separator: str = "\n",
|
||||||
segment_mode: Optional[Literal["line", "paragraph", "none"]] = None,
|
segment_mode: Literal["line", "paragraph", "none"] = "line",
|
||||||
translate_regions: Optional[List[str]] = None,
|
translate_regions: Optional[List[str]] = None,
|
||||||
|
|
||||||
# --- 解析引擎 (PDF/OCR) ---
|
# --- 解析引擎 (PDF/OCR) ---
|
||||||
convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None,
|
convert_engine: Literal["identity", "mineru", "docling", "mineru_deploy"] = "identity",
|
||||||
mineru_token: Optional[str] = None,
|
mineru_token: str = "",
|
||||||
model_version: Optional[Literal["pipeline", "vlm"]] = None,
|
model_version: Literal["pipeline", "vlm"] = "vlm",
|
||||||
formula_ocr: Optional[bool] = None,
|
formula_ocr: bool = True,
|
||||||
code_ocr: Optional[bool] = None,
|
code_ocr: bool = True,
|
||||||
|
|
||||||
# --- Mineru 本地部署参数 ---
|
# --- Mineru 本地部署参数 ---
|
||||||
mineru_deploy_base_url: Optional[str] = None,
|
mineru_deploy_base_url: str = "http://127.0.0.1:8000",
|
||||||
mineru_deploy_backend: Optional[str] = None,
|
mineru_deploy_backend: Literal["pipeline", "vlm-auto-engine", "vlm-http-client", "hybrid-auto-engine", "hybrid-http-client"] = "hybrid-auto-engine",
|
||||||
mineru_deploy_formula_enable: Optional[bool] = None,
|
mineru_deploy_parse_method: Literal["auto", "txt", "ocr"] = "auto",
|
||||||
mineru_deploy_start_page_id: Optional[int] = None,
|
mineru_deploy_formula_enable: bool = True,
|
||||||
mineru_deploy_end_page_id: Optional[int] = None,
|
mineru_deploy_table_enable: bool = True,
|
||||||
|
mineru_deploy_start_page_id: int = 0,
|
||||||
|
mineru_deploy_end_page_id: int = 99999,
|
||||||
mineru_deploy_lang_list: Optional[List[str]] = None,
|
mineru_deploy_lang_list: Optional[List[str]] = None,
|
||||||
mineru_deploy_server_url: Optional[str] = None,
|
mineru_deploy_server_url: str = "",
|
||||||
|
|
||||||
# --- JSON / 术语表 ---
|
# --- JSON / 术语表 ---
|
||||||
json_paths: Optional[List[str]] = None,
|
json_paths: Optional[List[str]] = None,
|
||||||
|
|||||||
Reference in New Issue
Block a user