This commit is contained in:
xunbu
2026-01-11 13:38:45 +08:00
parent 3f827067fe
commit 253eb00510
4 changed files with 127 additions and 85 deletions

View File

@@ -149,27 +149,38 @@ client = Client(
concurrent=10, # Number of concurrent requests concurrent=10, # Number of concurrent requests
) )
# Translate a single file (auto-detects file type) # Example 1: Translate plain text files (no PDF parsing engine needed)
result = client.translate("path/to/your/document.pdf") result = client.translate("path/to/your/document.txt")
# Save with default format (PDF -> html by default)
print(f"Translation complete! Saved to: {result.save()}") print(f"Translation complete! Saved to: {result.save()}")
# Or specify output format explicitly # Example 2: Translate PDF files (requires mineru_token or local deployment)
# For PDF/markdown_based: # Option A: Use online MinerU (token required: https://mineru.net/apiManage/token)
# - "markdown": Markdown with embedded base64 images (default) result = client.translate(
# - "markdown_zip": Markdown with separate image files (ZIP archive) "path/to/your/document.pdf",
# - "html": HTML format convert_engine="mineru",
# For docx: "docx" mineru_token="YOUR_MINERU_TOKEN", # Replace with your MinerU Token
# For xlsx: "xlsx" formula_ocr=True, # Enable formula recognition
result.save(fmt="html") # Save as HTML )
result.save(fmt="markdown") # Save as Markdown with embedded images result.save(fmt="html")
result.save(fmt="markdown_zip") # Save as ZIP with separate images
# Save to custom location # Option B: Use locally deployed MinerU (recommended for intranet/offline)
result.save(output_dir="./my_translations", name="my_document.html") # First start local MinerU service, reference: https://github.com/opendatalab/MinerU
result = client.translate(
"path/to/your/document.pdf",
convert_engine="mineru_deploy",
mineru_deploy_base_url="http://127.0.0.1:8000", # Your local MinerU address
mineru_deploy_backend="hybrid-auto-engine", # Backend type
)
result.save(fmt="markdown")
# Export as base64 encoded string # Example 3: Translate Docx files (preserve formatting)
result = client.translate(
"path/to/your/document.docx",
insert_mode="replace", # replace/append/prepend
)
result.save(fmt="docx") # Save as docx format
# Example 4: Export as base64 encoded string (for API transmission)
base64_content = result.export(fmt="html") base64_content = result.export(fmt="html")
print(f"Exported content length: {len(base64_content)}") print(f"Exported content length: {len(base64_content)}")
@@ -194,6 +205,8 @@ print(f"Exported content length: {len(base64_content)}")
| **concurrent** | `int` | 10 | Number of concurrent LLM requests | | **concurrent** | `int` | 10 | Number of concurrent LLM requests |
| **convert_engine** | `str` | `"mineru"` | PDF parsing engine: `"mineru"`, `"docling"`, `"mineru_deploy"` | | **convert_engine** | `str` | `"mineru"` | PDF parsing engine: `"mineru"`, `"docling"`, `"mineru_deploy"` |
| **mineru_deploy_base_url** | `str` | - | Local minerU API address (when `convert_engine="mineru_deploy"`) | | **mineru_deploy_base_url** | `str` | - | Local minerU API address (when `convert_engine="mineru_deploy"`) |
| **mineru_deploy_parse_method** | `str` | `"auto"` | Local minerU parsing method: `"auto"`, `"txt"`, `"ocr"` |
| **mineru_deploy_table_enable** | `bool` | `True` | Enable table recognition for local minerU |
| **mineru_token** | `str` | - | minerU API token (when using online minerU) | | **mineru_token** | `str` | - | minerU API token (when using online minerU) |
| **skip_translate** | `bool` | `False` | Skip translation, only parse document | | **skip_translate** | `bool` | `False` | Skip translation, only parse document |
| **output_dir** | `str` | `"./output"` | Default output directory for `save()` | | **output_dir** | `str` | `"./output"` | Default output directory for `save()` |

View File

@@ -149,27 +149,38 @@ client = Client(
concurrent=10, # 同時リクエスト数 concurrent=10, # 同時リクエスト数
) )
# 単一ファイルを翻訳 (ファイルタイプを自動検出) # 例 1: テキストファイルを翻訳 (PDF 解析エンジンが不要)
result = client.translate("path/to/your/document.pdf") result = client.translate("path/to/your/document.txt")
# デフォルトフォーマットで保存 (PDF -> markdown with embedded images)
print(f"翻訳完了!保存先: {result.save()}") print(f"翻訳完了!保存先: {result.save()}")
# または出力フォーマットを明示的に指定 # 例 2: PDF ファイルを翻訳 (mineru_token またはローカルデプロイが必要)
# PDF/markdown_based は以下をサポート: # 方式 A: オンライン MinerU を使用 (token が必要: https://mineru.net/apiManage/token)
# - "markdown": Markdown フォーマット、base64 画像埋め込み (デフォルト) result = client.translate(
# - "markdown_zip": Markdown フォーマット、画像分離保存 (ZIP アーカイブ) "path/to/your/document.pdf",
# - "html": HTML フォーマット convert_engine="mineru",
# docx は "docx" をサポート mineru_token="YOUR_MINERU_TOKEN", # MinerU Token に置き換える
# xlsx は "xlsx" をサポート formula_ocr=True, # 数式認識を有効化
result.save(fmt="html") # HTML として保存 )
result.save(fmt="markdown") # Markdown として保存(画像埋め込み) result.save(fmt="html")
result.save(fmt="markdown_zip") # ZIP として保存(画像分離)
# カスタム場所に保存 # 方式 B: ローカルデプロイの MinerU を使用 (イントラネット/オフライン環境推奨)
result.save(output_dir="./my_translations", name="my_document.html") # ローカル MinerU サービスを先に起動してください, 参考: https://github.com/opendatalab/MinerU
result = client.translate(
"path/to/your/document.pdf",
convert_engine="mineru_deploy",
mineru_deploy_base_url="http://127.0.0.1:8000", # ローカル MinerU アドレス
mineru_deploy_backend="hybrid-auto-engine", # バックエンドタイプ
)
result.save(fmt="markdown")
# または Base64 エンコード文字列としてエクスポート # 例 3: Docx ファイルを翻訳 (書式保持)
result = client.translate(
"path/to/your/document.docx",
insert_mode="replace", # replace/append/prepend
)
result.save(fmt="docx") # docx フォーマットで保存
# 例 4: Base64 エンコード文字列としてエクスポート (API 転送用)
base64_content = result.export(fmt="html") base64_content = result.export(fmt="html")
print(f"エクスポートコンテンツ長さ: {len(base64_content)}") print(f"エクスポートコンテンツ長さ: {len(base64_content)}")
@@ -194,6 +205,8 @@ print(f"エクスポートコンテンツ長さ: {len(base64_content)}")
| **concurrent** | `int` | 10 | 同時 LLM リクエスト数 | | **concurrent** | `int` | 10 | 同時 LLM リクエスト数 |
| **convert_engine** | `str` | `"mineru"` | PDF 解析エンジン: `"mineru"`, `"docling"`, `"mineru_deploy"` | | **convert_engine** | `str` | `"mineru"` | PDF 解析エンジン: `"mineru"`, `"docling"`, `"mineru_deploy"` |
| **mineru_deploy_base_url** | `str` | - | ローカル minerU API アドレス(`convert_engine="mineru_deploy"` の場合) | | **mineru_deploy_base_url** | `str` | - | ローカル minerU API アドレス(`convert_engine="mineru_deploy"` の場合) |
| **mineru_deploy_parse_method** | `str` | `"auto"` | ローカル minerU 解析方法: `"auto"`, `"txt"`, `"ocr"` |
| **mineru_deploy_table_enable** | `bool` | `True` | ローカル minerU テーブル認識を有効化するか |
| **mineru_token** | `str` | - | minerU API Token(オンライン minerU 使用時) | | **mineru_token** | `str` | - | minerU API Token(オンライン minerU 使用時) |
| **skip_translate** | `bool` | `False` | 翻訳をスキップしてドキュメントのみを解析 | | **skip_translate** | `bool` | `False` | 翻訳をスキップしてドキュメントのみを解析 |
| **output_dir** | `str` | `"./output"` | `save()` メソッドのデフォルト出力ディレクトリ | | **output_dir** | `str` | `"./output"` | `save()` メソッドのデフォルト出力ディレクトリ |

View File

@@ -150,27 +150,38 @@ client = Client(
concurrent=10, # 并发请求数 concurrent=10, # 并发请求数
) )
# 翻译单个文件 (自动检测文件类型) # 示例 1: 翻译纯文本文件 (无需 PDF 解析引擎)
result = client.translate("path/to/your/document.pdf") result = client.translate("path/to/your/document.txt")
# 使用默认格式保存 (PDF -> markdown with embedded images)
print(f"翻译完成!保存位置: {result.save()}") print(f"翻译完成!保存位置: {result.save()}")
# 或显式指定输出格式 # 示例 2: 翻译 PDF 文件 (需要指定 mineru_token 或使用本地部署)
# PDF/markdown_based 支持: # 方式 A: 使用在线 MinerU (需要申请 token: https://mineru.net/apiManage/token)
# - "markdown": Markdown 格式,内嵌 base64 图片 (默认) result = client.translate(
# - "markdown_zip": Markdown 格式,图片分离存储 (ZIP 压缩包) "path/to/your/document.pdf",
# - "html": HTML 格式 convert_engine="mineru",
# docx 支持: "docx" mineru_token="YOUR_MINERU_TOKEN", # 替换为您的 MinerU Token
# xlsx 支持: "xlsx" formula_ocr=True, # 启用公式识别
result.save(fmt="html") # 保存为 HTML )
result.save(fmt="markdown") # 保存为 Markdown(内嵌图片) result.save(fmt="html")
result.save(fmt="markdown_zip") # 保存为 ZIP(图片分离)
# 保存到自定义位置 # 方式 B: 使用本地部署的 MinerU (推荐内网/离线环境)
result.save(output_dir="./my_translations", name="my_document.html") # 需要先启动本地 MinerU 服务,参考: https://github.com/opendatalab/MinerU
result = client.translate(
"path/to/your/document.pdf",
convert_engine="mineru_deploy",
mineru_deploy_base_url="http://127.0.0.1:8000", # 您的本地 MinerU 地址
mineru_deploy_backend="hybrid-auto-engine", # 后端类型
)
result.save(fmt="markdown")
# 导出为 Base64 编码字符串 # 示例 3: 翻译 Docx 文件 (保持格式)
result = client.translate(
"path/to/your/document.docx",
insert_mode="replace", # replace/append/prepend
)
result.save(fmt="docx") # 保存为 docx 格式
# 示例 4: 导出为 Base64 编码字符串 (用于 API 传输)
base64_content = result.export(fmt="html") base64_content = result.export(fmt="html")
print(f"导出内容长度: {len(base64_content)}") print(f"导出内容长度: {len(base64_content)}")
@@ -195,6 +206,8 @@ print(f"导出内容长度: {len(base64_content)}")
| **concurrent** | `int` | 10 | 并发 LLM 请求数 | | **concurrent** | `int` | 10 | 并发 LLM 请求数 |
| **convert_engine** | `str` | `"mineru"` | PDF 解析引擎:`"mineru"`, `"docling"`, `"mineru_deploy"` | | **convert_engine** | `str` | `"mineru"` | PDF 解析引擎:`"mineru"`, `"docling"`, `"mineru_deploy"` |
| **mineru_deploy_base_url** | `str` | - | 本地 minerU API 地址(当 `convert_engine="mineru_deploy"` 时) | | **mineru_deploy_base_url** | `str` | - | 本地 minerU API 地址(当 `convert_engine="mineru_deploy"` 时) |
| **mineru_deploy_parse_method** | `str` | `"auto"` | 本地 minerU 解析方法: `"auto"`, `"txt"`, `"ocr"` |
| **mineru_deploy_table_enable** | `bool` | `True` | 本地 minerU 是否启用表格识别 |
| **mineru_token** | `str` | - | minerU API Token(使用在线 minerU 时) | | **mineru_token** | `str` | - | minerU API Token(使用在线 minerU 时) |
| **skip_translate** | `bool` | `False` | 跳过翻译,仅解析文档 | | **skip_translate** | `bool` | `False` | 跳过翻译,仅解析文档 |
| **output_dir** | `str` | `"./output"` | `save()` 方法的默认输出目录 | | **output_dir** | `str` | `"./output"` | `save()` 方法的默认输出目录 |

View File

@@ -101,7 +101,7 @@ class TranslationResult:
method = getattr(self._workflow, method_name, None) method = getattr(self._workflow, method_name, None)
if method: if method:
method(name=name, output_dir=output_dir) method(name=name, output_dir=output_dir)
return os.path.join(output_dir, name) return str(Path(output_dir) / name)
raise AttributeError(f"Workflow 缺少方法 {method_name}") raise AttributeError(f"Workflow 缺少方法 {method_name}")
def export(self, fmt: Optional[str] = None) -> str: def export(self, fmt: Optional[str] = None) -> str:
@@ -126,11 +126,10 @@ class TranslationResult:
fmt = self._supported_formats[0] fmt = self._supported_formats[0]
export_key = fmt export_key = fmt
method_name = f"export_to_{export_key.replace('markdown_zip', 'markdown_zip')}" # 构建方法名
# 特殊处理 markdown_zip -> export_to_markdown_zip
if export_key == "markdown_zip": if export_key == "markdown_zip":
method_name = "export_to_markdown_zip" method_name = "export_to_markdown_zip"
elif export_key in ["html", "markdown"]: else:
method_name = f"export_to_{export_key}" method_name = f"export_to_{export_key}"
method = getattr(self._workflow, method_name, None) method = getattr(self._workflow, method_name, None)
@@ -163,8 +162,8 @@ class Client:
retry: int = default_params["retry"], retry: int = default_params["retry"],
thinking: ThinkingMode = default_params["thinking"], thinking: ThinkingMode = default_params["thinking"],
system_proxy_enable: bool = default_params["system_proxy_enable"], system_proxy_enable: bool = default_params["system_proxy_enable"],
convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None, convert_engine: Literal["identity", "mineru", "docling", "mineru_deploy"] = "identity",
mineru_token: Optional[str] = None, mineru_token: str = "",
**kwargs **kwargs
): ):
""" """
@@ -204,22 +203,24 @@ class Client:
rpm: Optional[int] = None, rpm: Optional[int] = None,
tpm: Optional[int] = None, tpm: Optional[int] = None,
provider: Optional[Union[ProviderType, str]] = None, provider: Optional[Union[ProviderType, str]] = None,
insert_mode: Optional[InsertMode] = None, insert_mode: Literal["replace", "append", "prepend"] = "replace",
separator: Optional[str] = None, separator: str = "\n",
segment_mode: Optional[Literal["line", "paragraph", "none"]] = None, segment_mode: Literal["line", "paragraph", "none"] = "line",
translate_regions: Optional[List[str]] = None, translate_regions: Optional[List[str]] = None,
convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None, convert_engine: Literal["identity", "mineru", "docling", "mineru_deploy"] = "identity",
mineru_token: Optional[str] = None, mineru_token: str = "",
model_version: Optional[Literal["pipeline", "vlm"]] = None, model_version: Literal["pipeline", "vlm"] = "vlm",
formula_ocr: Optional[bool] = None, formula_ocr: bool = True,
code_ocr: Optional[bool] = None, code_ocr: bool = True,
mineru_deploy_base_url: Optional[str] = None, mineru_deploy_base_url: str = "http://127.0.0.1:8000",
mineru_deploy_backend: Optional[str] = None, mineru_deploy_backend: Literal["pipeline", "vlm-auto-engine", "vlm-http-client", "hybrid-auto-engine", "hybrid-http-client"] = "hybrid-auto-engine",
mineru_deploy_formula_enable: Optional[bool] = None, mineru_deploy_parse_method: Literal["auto", "txt", "ocr"] = "auto",
mineru_deploy_start_page_id: Optional[int] = None, mineru_deploy_formula_enable: bool = True,
mineru_deploy_end_page_id: Optional[int] = None, mineru_deploy_table_enable: bool = True,
mineru_deploy_start_page_id: int = 0,
mineru_deploy_end_page_id: int = 99999,
mineru_deploy_lang_list: Optional[List[str]] = None, mineru_deploy_lang_list: Optional[List[str]] = None,
mineru_deploy_server_url: Optional[str] = None, mineru_deploy_server_url: str = "",
json_paths: Optional[List[str]] = None, json_paths: Optional[List[str]] = None,
glossary_generate_enable: Optional[bool] = None, glossary_generate_enable: Optional[bool] = None,
glossary_dict: Optional[Dict[str, str]] = None, glossary_dict: Optional[Dict[str, str]] = None,
@@ -270,26 +271,28 @@ class Client:
provider: Optional[Union[ProviderType, str]] = None, provider: Optional[Union[ProviderType, str]] = None,
# --- 格式参数 (Docx/Excel/Txt) --- # --- 格式参数 (Docx/Excel/Txt) ---
insert_mode: Optional[InsertMode] = None, insert_mode: Literal["replace", "append", "prepend"] = "replace",
separator: Optional[str] = None, separator: str = "\n",
segment_mode: Optional[Literal["line", "paragraph", "none"]] = None, segment_mode: Literal["line", "paragraph", "none"] = "line",
translate_regions: Optional[List[str]] = None, translate_regions: Optional[List[str]] = None,
# --- 解析引擎 (PDF/OCR) --- # --- 解析引擎 (PDF/OCR) ---
convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None, convert_engine: Literal["identity", "mineru", "docling", "mineru_deploy"] = "identity",
mineru_token: Optional[str] = None, mineru_token: str = "",
model_version: Optional[Literal["pipeline", "vlm"]] = None, model_version: Literal["pipeline", "vlm"] = "vlm",
formula_ocr: Optional[bool] = None, formula_ocr: bool = True,
code_ocr: Optional[bool] = None, code_ocr: bool = True,
# --- Mineru 本地部署参数 --- # --- Mineru 本地部署参数 ---
mineru_deploy_base_url: Optional[str] = None, mineru_deploy_base_url: str = "http://127.0.0.1:8000",
mineru_deploy_backend: Optional[str] = None, mineru_deploy_backend: Literal["pipeline", "vlm-auto-engine", "vlm-http-client", "hybrid-auto-engine", "hybrid-http-client"] = "hybrid-auto-engine",
mineru_deploy_formula_enable: Optional[bool] = None, mineru_deploy_parse_method: Literal["auto", "txt", "ocr"] = "auto",
mineru_deploy_start_page_id: Optional[int] = None, mineru_deploy_formula_enable: bool = True,
mineru_deploy_end_page_id: Optional[int] = None, mineru_deploy_table_enable: bool = True,
mineru_deploy_start_page_id: int = 0,
mineru_deploy_end_page_id: int = 99999,
mineru_deploy_lang_list: Optional[List[str]] = None, mineru_deploy_lang_list: Optional[List[str]] = None,
mineru_deploy_server_url: Optional[str] = None, mineru_deploy_server_url: str = "",
# --- JSON / 术语表 --- # --- JSON / 术语表 ---
json_paths: Optional[List[str]] = None, json_paths: Optional[List[str]] = None,