This commit is contained in:
xunbu
2026-01-11 13:38:45 +08:00
parent 3f827067fe
commit 253eb00510
4 changed files with 127 additions and 85 deletions

View File

@@ -149,27 +149,38 @@ client = Client(
concurrent=10, # Number of concurrent requests
)
# Translate a single file (auto-detects file type)
result = client.translate("path/to/your/document.pdf")
# Save with default format (PDF -> markdown with embedded images)
# Example 1: Translate plain text files (no PDF parsing engine needed)
result = client.translate("path/to/your/document.txt")
print(f"Translation complete! Saved to: {result.save()}")
# Or specify output format explicitly
# For PDF/markdown_based:
# - "markdown": Markdown with embedded base64 images (default)
# - "markdown_zip": Markdown with separate image files (ZIP archive)
# - "html": HTML format
# For docx: "docx"
# For xlsx: "xlsx"
result.save(fmt="html") # Save as HTML
result.save(fmt="markdown") # Save as Markdown with embedded images
result.save(fmt="markdown_zip") # Save as ZIP with separate images
# Example 2: Translate PDF files (requires mineru_token or local deployment)
# Option A: Use online MinerU (token required: https://mineru.net/apiManage/token)
result = client.translate(
"path/to/your/document.pdf",
convert_engine="mineru",
mineru_token="YOUR_MINERU_TOKEN", # Replace with your MinerU Token
formula_ocr=True, # Enable formula recognition
)
result.save(fmt="html")
# Save to custom location
result.save(output_dir="./my_translations", name="my_document.html")
# Option B: Use locally deployed MinerU (recommended for intranet/offline)
# First start local MinerU service, reference: https://github.com/opendatalab/MinerU
result = client.translate(
"path/to/your/document.pdf",
convert_engine="mineru_deploy",
mineru_deploy_base_url="http://127.0.0.1:8000", # Your local MinerU address
mineru_deploy_backend="hybrid-auto-engine", # Backend type
)
result.save(fmt="markdown")
# Export as base64 encoded string
# Example 3: Translate Docx files (preserve formatting)
result = client.translate(
"path/to/your/document.docx",
insert_mode="replace", # replace/append/prepend
)
result.save(fmt="docx") # Save as docx format
# Example 4: Export as base64 encoded string (for API transmission)
base64_content = result.export(fmt="html")
print(f"Exported content length: {len(base64_content)}")
@@ -194,6 +205,8 @@ print(f"Exported content length: {len(base64_content)}")
| **concurrent** | `int` | 10 | Number of concurrent LLM requests |
| **convert_engine** | `str` | `"mineru"` | PDF parsing engine: `"mineru"`, `"docling"`, `"mineru_deploy"` |
| **mineru_deploy_base_url** | `str` | - | Local minerU API address (when `convert_engine="mineru_deploy"`) |
| **mineru_deploy_parse_method** | `str` | `"auto"` | Local minerU parsing method: `"auto"`, `"txt"`, `"ocr"` |
| **mineru_deploy_table_enable** | `bool` | `True` | Enable table recognition for local minerU |
| **mineru_token** | `str` | - | minerU API token (when using online minerU) |
| **skip_translate** | `bool` | `False` | Skip translation, only parse document |
| **output_dir** | `str` | `"./output"` | Default output directory for `save()` |

View File

@@ -149,27 +149,38 @@ client = Client(
concurrent=10, # 同時リクエスト数
)
# 単一ファイルを翻訳 (ファイルタイプを自動検出)
result = client.translate("path/to/your/document.pdf")
# デフォルトフォーマットで保存 (PDF -> markdown with embedded images)
# 例 1: テキストファイルを翻訳 (PDF 解析エンジンが不要)
result = client.translate("path/to/your/document.txt")
print(f"翻訳完了!保存先: {result.save()}")
# または出力フォーマットを明示的に指定
# PDF/markdown_based は以下をサポート:
# - "markdown": Markdown フォーマット、base64 画像埋め込み (デフォルト)
# - "markdown_zip": Markdown フォーマット、画像分離保存 (ZIP アーカイブ)
# - "html": HTML フォーマット
# docx は "docx" をサポート
# xlsx は "xlsx" をサポート
result.save(fmt="html") # HTML として保存
result.save(fmt="markdown") # Markdown として保存(画像埋め込み)
result.save(fmt="markdown_zip") # ZIP として保存(画像分離)
# 例 2: PDF ファイルを翻訳 (mineru_token またはローカルデプロイが必要)
# 方式 A: オンライン MinerU を使用 (token が必要: https://mineru.net/apiManage/token)
result = client.translate(
"path/to/your/document.pdf",
convert_engine="mineru",
mineru_token="YOUR_MINERU_TOKEN", # MinerU Token に置き換える
formula_ocr=True, # 数式認識を有効化
)
result.save(fmt="html")
# カスタム場所に保存
result.save(output_dir="./my_translations", name="my_document.html")
# 方式 B: ローカルデプロイの MinerU を使用 (イントラネット/オフライン環境推奨)
# ローカル MinerU サービスを先に起動してください, 参考: https://github.com/opendatalab/MinerU
result = client.translate(
"path/to/your/document.pdf",
convert_engine="mineru_deploy",
mineru_deploy_base_url="http://127.0.0.1:8000", # ローカル MinerU アドレス
mineru_deploy_backend="hybrid-auto-engine", # バックエンドタイプ
)
result.save(fmt="markdown")
# または Base64 エンコード文字列としてエクスポート
# 例 3: Docx ファイルを翻訳 (書式保持)
result = client.translate(
"path/to/your/document.docx",
insert_mode="replace", # replace/append/prepend
)
result.save(fmt="docx") # docx フォーマットで保存
# 例 4: Base64 エンコード文字列としてエクスポート (API 転送用)
base64_content = result.export(fmt="html")
print(f"エクスポートコンテンツ長さ: {len(base64_content)}")
@@ -194,6 +205,8 @@ print(f"エクスポートコンテンツ長さ: {len(base64_content)}")
| **concurrent** | `int` | 10 | 同時 LLM リクエスト数 |
| **convert_engine** | `str` | `"mineru"` | PDF 解析エンジン: `"mineru"`、`"docling"`、`"mineru_deploy"` |
| **mineru_deploy_base_url** | `str` | - | ローカル minerU API アドレス(`convert_engine="mineru_deploy"` の場合) |
| **mineru_deploy_parse_method** | `str` | `"auto"` | ローカル minerU 解析方法: `"auto"`, `"txt"`, `"ocr"` |
| **mineru_deploy_table_enable** | `bool` | `True` | ローカル minerU テーブル認識を有効化するか |
| **mineru_token** | `str` | - | minerU API Token(オンライン minerU 使用時) |
| **skip_translate** | `bool` | `False` | 翻訳をスキップしてドキュメントのみを解析 |
| **output_dir** | `str` | `"./output"` | `save()` メソッドのデフォルト出力ディレクトリ |

View File

@@ -150,27 +150,38 @@ client = Client(
concurrent=10, # 并发请求数
)
# 翻译单个文件 (自动检测文件类型)
result = client.translate("path/to/your/document.pdf")
# 使用默认格式保存 (PDF -> markdown with embedded images)
# 示例 1: 翻译纯文本文件 (无需 PDF 解析引擎)
result = client.translate("path/to/your/document.txt")
print(f"翻译完成!保存位置: {result.save()}")
# 或显式指定输出格式
# PDF/markdown_based 支持:
# - "markdown": Markdown 格式,内嵌 base64 图片 (默认)
# - "markdown_zip": Markdown 格式,图片分离存储 (ZIP 压缩包)
# - "html": HTML 格式
# docx 支持: "docx"
# xlsx 支持: "xlsx"
result.save(fmt="html") # 保存为 HTML
result.save(fmt="markdown") # 保存为 Markdown(内嵌图片)
result.save(fmt="markdown_zip") # 保存为 ZIP(图片分离)
# 示例 2: 翻译 PDF 文件 (需要指定 mineru_token 或使用本地部署)
# 方式 A: 使用在线 MinerU (需要申请 token: https://mineru.net/apiManage/token)
result = client.translate(
"path/to/your/document.pdf",
convert_engine="mineru",
mineru_token="YOUR_MINERU_TOKEN", # 替换为您的 MinerU Token
formula_ocr=True, # 启用公式识别
)
result.save(fmt="html")
# 保存到自定义位置
result.save(output_dir="./my_translations", name="my_document.html")
# 方式 B: 使用本地部署的 MinerU (推荐内网/离线环境)
# 需要先启动本地 MinerU 服务,参考: https://github.com/opendatalab/MinerU
result = client.translate(
"path/to/your/document.pdf",
convert_engine="mineru_deploy",
mineru_deploy_base_url="http://127.0.0.1:8000", # 您的本地 MinerU 地址
mineru_deploy_backend="hybrid-auto-engine", # 后端类型
)
result.save(fmt="markdown")
# 导出为 Base64 编码字符串
# 示例 3: 翻译 Docx 文件 (保持格式)
result = client.translate(
"path/to/your/document.docx",
insert_mode="replace", # replace/append/prepend
)
result.save(fmt="docx") # 保存为 docx 格式
# 示例 4: 导出为 Base64 编码字符串 (用于 API 传输)
base64_content = result.export(fmt="html")
print(f"导出内容长度: {len(base64_content)}")
@@ -195,6 +206,8 @@ print(f"导出内容长度: {len(base64_content)}")
| **concurrent** | `int` | 10 | 并发 LLM 请求数 |
| **convert_engine** | `str` | `"mineru"` | PDF 解析引擎:`"mineru"`、`"docling"`、`"mineru_deploy"` |
| **mineru_deploy_base_url** | `str` | - | 本地 minerU API 地址(当 `convert_engine="mineru_deploy"` 时) |
| **mineru_deploy_parse_method** | `str` | `"auto"` | 本地 minerU 解析方法: `"auto"`, `"txt"`, `"ocr"` |
| **mineru_deploy_table_enable** | `bool` | `True` | 本地 minerU 是否启用表格识别 |
| **mineru_token** | `str` | - | minerU API Token(使用在线 minerU 时) |
| **skip_translate** | `bool` | `False` | 跳过翻译,仅解析文档 |
| **output_dir** | `str` | `"./output"` | `save()` 方法的默认输出目录 |

View File

@@ -101,7 +101,7 @@ class TranslationResult:
method = getattr(self._workflow, method_name, None)
if method:
method(name=name, output_dir=output_dir)
return os.path.join(output_dir, name)
return str(Path(output_dir) / name)
raise AttributeError(f"Workflow 缺少方法 {method_name}")
def export(self, fmt: Optional[str] = None) -> str:
@@ -126,11 +126,10 @@ class TranslationResult:
fmt = self._supported_formats[0]
export_key = fmt
method_name = f"export_to_{export_key.replace('markdown_zip', 'markdown_zip')}"
# 特殊处理 markdown_zip -> export_to_markdown_zip
# 构建方法名
if export_key == "markdown_zip":
method_name = "export_to_markdown_zip"
elif export_key in ["html", "markdown"]:
else:
method_name = f"export_to_{export_key}"
method = getattr(self._workflow, method_name, None)
@@ -163,8 +162,8 @@ class Client:
retry: int = default_params["retry"],
thinking: ThinkingMode = default_params["thinking"],
system_proxy_enable: bool = default_params["system_proxy_enable"],
convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None,
mineru_token: Optional[str] = None,
convert_engine: Literal["identity", "mineru", "docling", "mineru_deploy"] = "identity",
mineru_token: str = "",
**kwargs
):
"""
@@ -204,22 +203,24 @@ class Client:
rpm: Optional[int] = None,
tpm: Optional[int] = None,
provider: Optional[Union[ProviderType, str]] = None,
insert_mode: Optional[InsertMode] = None,
separator: Optional[str] = None,
segment_mode: Optional[Literal["line", "paragraph", "none"]] = None,
insert_mode: Literal["replace", "append", "prepend"] = "replace",
separator: str = "\n",
segment_mode: Literal["line", "paragraph", "none"] = "line",
translate_regions: Optional[List[str]] = None,
convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None,
mineru_token: Optional[str] = None,
model_version: Optional[Literal["pipeline", "vlm"]] = None,
formula_ocr: Optional[bool] = None,
code_ocr: Optional[bool] = None,
mineru_deploy_base_url: Optional[str] = None,
mineru_deploy_backend: Optional[str] = None,
mineru_deploy_formula_enable: Optional[bool] = None,
mineru_deploy_start_page_id: Optional[int] = None,
mineru_deploy_end_page_id: Optional[int] = None,
convert_engine: Literal["identity", "mineru", "docling", "mineru_deploy"] = "identity",
mineru_token: str = "",
model_version: Literal["pipeline", "vlm"] = "vlm",
formula_ocr: bool = True,
code_ocr: bool = True,
mineru_deploy_base_url: str = "http://127.0.0.1:8000",
mineru_deploy_backend: Literal["pipeline", "vlm-auto-engine", "vlm-http-client", "hybrid-auto-engine", "hybrid-http-client"] = "hybrid-auto-engine",
mineru_deploy_parse_method: Literal["auto", "txt", "ocr"] = "auto",
mineru_deploy_formula_enable: bool = True,
mineru_deploy_table_enable: bool = True,
mineru_deploy_start_page_id: int = 0,
mineru_deploy_end_page_id: int = 99999,
mineru_deploy_lang_list: Optional[List[str]] = None,
mineru_deploy_server_url: Optional[str] = None,
mineru_deploy_server_url: str = "",
json_paths: Optional[List[str]] = None,
glossary_generate_enable: Optional[bool] = None,
glossary_dict: Optional[Dict[str, str]] = None,
@@ -270,26 +271,28 @@ class Client:
provider: Optional[Union[ProviderType, str]] = None,
# --- 格式参数 (Docx/Excel/Txt) ---
insert_mode: Optional[InsertMode] = None,
separator: Optional[str] = None,
segment_mode: Optional[Literal["line", "paragraph", "none"]] = None,
insert_mode: Literal["replace", "append", "prepend"] = "replace",
separator: str = "\n",
segment_mode: Literal["line", "paragraph", "none"] = "line",
translate_regions: Optional[List[str]] = None,
# --- 解析引擎 (PDF/OCR) ---
convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None,
mineru_token: Optional[str] = None,
model_version: Optional[Literal["pipeline", "vlm"]] = None,
formula_ocr: Optional[bool] = None,
code_ocr: Optional[bool] = None,
convert_engine: Literal["identity", "mineru", "docling", "mineru_deploy"] = "identity",
mineru_token: str = "",
model_version: Literal["pipeline", "vlm"] = "vlm",
formula_ocr: bool = True,
code_ocr: bool = True,
# --- Mineru 本地部署参数 ---
mineru_deploy_base_url: Optional[str] = None,
mineru_deploy_backend: Optional[str] = None,
mineru_deploy_formula_enable: Optional[bool] = None,
mineru_deploy_start_page_id: Optional[int] = None,
mineru_deploy_end_page_id: Optional[int] = None,
mineru_deploy_base_url: str = "http://127.0.0.1:8000",
mineru_deploy_backend: Literal["pipeline", "vlm-auto-engine", "vlm-http-client", "hybrid-auto-engine", "hybrid-http-client"] = "hybrid-auto-engine",
mineru_deploy_parse_method: Literal["auto", "txt", "ocr"] = "auto",
mineru_deploy_formula_enable: bool = True,
mineru_deploy_table_enable: bool = True,
mineru_deploy_start_page_id: int = 0,
mineru_deploy_end_page_id: int = 99999,
mineru_deploy_lang_list: Optional[List[str]] = None,
mineru_deploy_server_url: Optional[str] = None,
mineru_deploy_server_url: str = "",
# --- JSON / 术语表 ---
json_paths: Optional[List[str]] = None,