diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py
index dbd8dd6..a96fd51 100644
--- a/docutranslate/agents/segments_agent.py
+++ b/docutranslate/agents/segments_agent.py
@@ -1,16 +1,18 @@
+import json
from dataclasses import dataclass
from docutranslate.agents import AgentConfig, Agent
+from docutranslate.utils.json_utils import flat_json_split
@dataclass
-class JsonTranslateAgentConfig(AgentConfig):
+class SegmentsTranslateAgentConfig(AgentConfig):
+    # Extra settings for SegmentsTranslateAgent on top of the base AgentConfig.
+    # to_lang: presumably the name of the target language used in the system
+    #     prompt - confirm against the prompt template.
+    # custom_prompt: optional extra rules/background; when set, the agent appends
+    #     it to its system prompt under a dedicated "important rules" heading.
to_lang: str
custom_prompt: str | None = None
-class JsonTranslateAgent(Agent):
- def __init__(self, config: JsonTranslateAgentConfig):
+class SegmentsTranslateAgent(Agent):
+ def __init__(self, config: SegmentsTranslateAgentConfig):
super().__init__(config)
self.system_prompt = f"""
# 角色
@@ -34,3 +36,26 @@ class JsonTranslateAgent(Agent):
"""
if config.custom_prompt:
self.system_prompt += "\n# 重要规则或背景【非常重要】\n" + config.custom_prompt + '\n'
+
+    def send_segments(self, segments: list[str], chunk_size: int):
+        """Translate ``segments`` and return the results in the original order.
+
+        Each segment is keyed by its list index, the keyed dict is split with
+        ``flat_json_split(..., chunk_size)`` and every chunk is sent as one JSON
+        prompt through ``send_prompts``.  Segments for which no usable
+        translation comes back keep their original text, so the returned list
+        always has exactly ``len(segments)`` items.
+
+        Args:
+            segments: Source texts to translate.
+            chunk_size: Size limit forwarded to ``flat_json_split``.
+
+        Returns:
+            A list of strings aligned index-by-index with ``segments``.
+        """
+        if not segments:
+            # Nothing to translate: avoid a pointless round trip to the backend.
+            return []
+        indexed_originals, prompts = self._build_segment_prompts(segments, chunk_size)
+        translated_chunks = super().send_prompts(prompts=prompts)
+        return self._merge_translated_chunks(indexed_originals, translated_chunks)
+
+    # TODO: increase coroutine granularity
+    async def send_segments_async(self, segments: list[str], chunk_size: int):
+        """Asynchronous counterpart of ``send_segments``.
+
+        Same contract as ``send_segments``; the prompts are dispatched with
+        ``send_prompts_async`` instead of ``send_prompts``.
+        """
+        if not segments:
+            return []
+        indexed_originals, prompts = self._build_segment_prompts(segments, chunk_size)
+        translated_chunks = await super().send_prompts_async(prompts=prompts)
+        return self._merge_translated_chunks(indexed_originals, translated_chunks)
+
+    @staticmethod
+    def _build_segment_prompts(segments, chunk_size):
+        """Return ``(index -> text dict, list of JSON prompt strings)`` for ``segments``."""
+        indexed_originals = {str(i): text for i, text in enumerate(segments)}
+        chunks = flat_json_split(indexed_originals, chunk_size)
+        return indexed_originals, [json.dumps(chunk) for chunk in chunks]
+
+    @staticmethod
+    def _merge_translated_chunks(indexed_originals, translated_chunks):
+        """Overlay the translated chunks on the originals and return the values in order."""
+        import logging
+
+        log = logging.getLogger(__name__)
+        # Start from the originals so a failed chunk degrades to untranslated
+        # text instead of breaking the index alignment callers rely on.
+        merged = dict(indexed_originals)
+        for chunk_str in translated_chunks:
+            try:
+                translated_part = json.loads(chunk_str)
+            except (TypeError, ValueError):
+                # json.JSONDecodeError is a ValueError; TypeError covers a
+                # non-string reply such as None.
+                log.warning("Discarding a translated chunk that is not valid JSON")
+                continue
+            if not isinstance(translated_part, dict):
+                log.warning("Discarding a translated chunk that is not a JSON object")
+                continue
+            for key, value in translated_part.items():
+                # Ignore keys that were never sent and non-string values: either
+                # would change the length or element type of the result.
+                if key in merged and isinstance(value, str):
+                    merged[key] = value
+        return list(merged.values())
diff --git a/docutranslate/exporter/xlsx/__init__.py b/docutranslate/exporter/xlsx/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docutranslate/exporter/xlsx/base.py b/docutranslate/exporter/xlsx/base.py
new file mode 100644
index 0000000..6272296
--- /dev/null
+++ b/docutranslate/exporter/xlsx/base.py
@@ -0,0 +1,8 @@
+from docutranslate.exporter.base import Exporter
+from docutranslate.ir.document import Document
+
+# TODO: decide, as the need arises, whether json deserves its own document type.
+class XlsxExporter(Exporter[Document]):
+    """Common base for exporters whose input is a ``Document``.
+
+    Concrete exporters override ``export``.
+    """
+
+    def export(self, document: Document) -> Document:
+        """Placeholder with no implementation; subclasses provide the conversion."""
+        ...
\ No newline at end of file
diff --git a/docutranslate/exporter/xlsx/xlsx2html_exporter.py b/docutranslate/exporter/xlsx/xlsx2html_exporter.py
new file mode 100644
index 0000000..a7b9849
--- /dev/null
+++ b/docutranslate/exporter/xlsx/xlsx2html_exporter.py
@@ -0,0 +1,62 @@
+from dataclasses import dataclass
+from io import BytesIO
+
+import jinja2
+import openpyxl
+
+from docutranslate.exporter.base import ExporterConfig
+from docutranslate.exporter.xlsx.base import XlsxExporter
+from docutranslate.ir.document import Document
+from docutranslate.utils.resource_utils import resource_path
+
+
+@dataclass
+class Xlsx2HTMLExporterConfig(ExporterConfig):
+    """Options for ``Xlsx2HTMLExporter``."""
+
+    # Copied onto the exporter, which uses it to pick the ``pico`` snippet
+    # handed to the HTML template.
+    cdn: bool = True
+
+
+class Xlsx2HTMLExporter(XlsxExporter):
+    """Render the active worksheet of an xlsx document as an HTML page."""
+
+    def __init__(self, config: Xlsx2HTMLExporterConfig | None = None):
+        """Create the exporter; a default ``Xlsx2HTMLExporterConfig`` is used when none is given."""
+        config = config or Xlsx2HTMLExporterConfig()
+        super().__init__(config=config)
+        self.cdn = config.cdn
+
+    def export(self, document: Document) -> Document:
+        """Convert ``document`` (xlsx bytes) into an HTML document.
+
+        The first row of the active sheet is treated as the table header and
+        every following row as a data row.  Cell values are HTML-escaped and
+        empty cells are rendered as empty strings.
+
+        Args:
+            document: Document whose ``content`` holds the xlsx file bytes.
+
+        Returns:
+            A new ``Document`` with suffix ``.html`` and the same stem.
+        """
+        from html import escape
+
+        def cell_text(cell) -> str:
+            # None would otherwise show up as the literal text "None".
+            return "" if cell.value is None else escape(str(cell.value))
+
+        # 1. Load the workbook and pick the active worksheet.
+        workbook = openpyxl.load_workbook(BytesIO(document.content))
+        sheet = workbook.active
+
+        # 2. Build the table markup; the pieces are joined once at the end.
+        parts = ['<table>\n', '  <thead>\n    <tr>\n']
+        # The first row is assumed to be the header.
+        parts.extend(f'      <th>{cell_text(cell)}</th>\n' for cell in sheet[1])
+        parts.append('    </tr>\n  </thead>\n')
+
+        parts.append('  <tbody>\n')
+        # iter_rows(min_row=2) starts at the second row.
+        for row in sheet.iter_rows(min_row=2):
+            parts.append('    <tr>\n')
+            parts.extend(f'      <td>{cell_text(cell)}</td>\n' for cell in row)
+            parts.append('    </tr>\n')
+        parts.append('  </tbody>\n')
+        parts.append('</table>')
+        table = "".join(parts)
+
+        html_template = resource_path("template/xlsx.html").read_text(encoding="utf-8")
+
+        # NOTE(review): both branches are empty strings as written here - confirm
+        # the intended inline-style / CDN-link markup for pico.
+        pico = f'' if not self.cdn else r''
+        render = jinja2.Template(html_template).render(
+            title=document.stem,
+            pico=pico,
+            body=table,
+        )
+        return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem)
diff --git a/docutranslate/exporter/xlsx/xlsx2xlsx_exporter.py b/docutranslate/exporter/xlsx/xlsx2xlsx_exporter.py
new file mode 100644
index 0000000..20cc3af
--- /dev/null
+++ b/docutranslate/exporter/xlsx/xlsx2xlsx_exporter.py
@@ -0,0 +1,8 @@
+from docutranslate.exporter.txt.base import TXTExporter
+from docutranslate.exporter.xlsx.base import XlsxExporter
+from docutranslate.ir.document import Document
+
+
+class Xlsx2XlsxExporter(XlsxExporter):
+    """Pass-through exporter: the output is a copy of the input document."""
+
+    def export(self, document: Document) -> Document:
+        """Return ``document.copy()`` without touching the content."""
+        duplicate = document.copy()
+        return duplicate
diff --git a/docutranslate/template/xlsx.html b/docutranslate/template/xlsx.html
new file mode 100644
index 0000000..dfcd23b
--- /dev/null
+++ b/docutranslate/template/xlsx.html
@@ -0,0 +1,17 @@
+
+
+
+
+ {{ title }}
+ {{pico}}
+
+
+
+{{ body }}
+
+
\ No newline at end of file
diff --git a/docutranslate/translator/ai_translator/json_translator.py b/docutranslate/translator/ai_translator/json_translator.py
index ce27f46..61403f3 100644
--- a/docutranslate/translator/ai_translator/json_translator.py
+++ b/docutranslate/translator/ai_translator/json_translator.py
@@ -4,7 +4,7 @@ from typing import Self, Any
from jsonpath_ng.ext import parse
-from docutranslate.agents.segments_agent import JsonTranslateAgentConfig, JsonTranslateAgent
+from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
@@ -20,18 +20,18 @@ class JsonTranslator(Translator):
def __init__(self, config: JsonTranslatorConfig):
super().__init__(config=config)
self.chunk_size = config.chunk_size
- agent_config = JsonTranslateAgentConfig(custom_prompt=config.custom_prompt,
- to_lang=config.to_lang,
- baseurl=config.base_url,
- key=config.api_key,
- model_id=config.model_id,
- system_prompt=None,
- temperature=config.temperature,
- thinking=config.thinking,
- max_concurrent=config.concurrent,
- timeout=config.timeout,
- logger=self.logger)
- self.translate_agent = JsonTranslateAgent(agent_config)
+ agent_config = SegmentsTranslateAgentConfig(custom_prompt=config.custom_prompt,
+ to_lang=config.to_lang,
+ baseurl=config.base_url,
+ key=config.api_key,
+ model_id=config.model_id,
+ system_prompt=None,
+ temperature=config.temperature,
+ thinking=config.thinking,
+ max_concurrent=config.concurrent,
+ timeout=config.timeout,
+ logger=self.logger)
+ self.translate_agent = SegmentsTranslateAgent(agent_config)
self.jsonpaths = config.json_paths
def _extract_matches(self, content: dict) -> list[Any]:
@@ -71,30 +71,6 @@ class JsonTranslator(Translator):
# 5. 按原始顺序返回翻译后的文本列表
return list(indexed_translated.values())
- async def _translate_texts_in_batches_async(self, texts: list[str]) -> list[str]:
- """
- 将文本列表打包、分块、发送翻译并返回翻译结果。
- 此函数封装了与翻译代理交互的所有细节。
- """
- # 1. 使用索引作为唯一ID,将文本列表转换为字典,便于API处理
- indexed_originals = {str(i): text for i, text in enumerate(texts)}
-
- # 2. 将大字典分割成小块,以满足API的限制
- chunks = flat_json_split(indexed_originals, self.chunk_size)
-
- # 3. 将每个块序列化为JSON字符串并发送翻译
- prompts = [json.dumps(chunk) for chunk in chunks]
- translated_chunks = await self.translate_agent.send_prompts_async(prompts)
-
- # 4. 将翻译结果合并回一个字典
- # 我们从原始字典的副本开始,以确保即使翻译失败,我们也能保持结构
- indexed_translated = indexed_originals.copy()
- for chunk_str in translated_chunks:
- translated_part = json.loads(chunk_str)
- indexed_translated.update(translated_part)
-
- # 5. 按原始顺序返回翻译后的文本列表
- return list(indexed_translated.values())
def _update_content_with_translations(self, content: dict, matches: list[Any], translated_texts: list[str]):
"""
使用翻译后的文本更新原始JSON内容。
@@ -127,7 +103,7 @@ class JsonTranslator(Translator):
original_texts = [match.value for match in all_matches]
# 步骤 2: 批量翻译提取出的文本
- translated_texts = self._translate_texts_in_batches(original_texts)
+ translated_texts = self.translate_agent.send_segments(original_texts,self.chunk_size)
# 健壮性检查:确保翻译回来的项目数量与发送的一致
if len(original_texts) != len(translated_texts):
@@ -141,6 +117,7 @@ class JsonTranslator(Translator):
return self
+    # TODO: increase coroutine granularity
async def translate_async(self, document: Document) -> Self:
content = json.loads(document.content.decode())
@@ -154,7 +131,7 @@ class JsonTranslator(Translator):
original_texts = [match.value for match in all_matches]
# 步骤 2: 批量翻译提取出的文本
- translated_texts = await self._translate_texts_in_batches_async(original_texts)
+ translated_texts = await self.translate_agent.send_segments_async(original_texts,self.chunk_size)
# 健壮性检查:确保翻译回来的项目数量与发送的一致
if len(original_texts) != len(translated_texts):
diff --git a/docutranslate/translator/ai_translator/xlsx_translator.py b/docutranslate/translator/ai_translator/xlsx_translator.py
new file mode 100644
index 0000000..f539852
--- /dev/null
+++ b/docutranslate/translator/ai_translator/xlsx_translator.py
@@ -0,0 +1,136 @@
+import asyncio
+from dataclasses import dataclass
+from io import BytesIO
+from typing import Self, Literal
+
+import openpyxl
+
+from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
+from docutranslate.ir.document import Document
+from docutranslate.translator.ai_translator.base import AiTranslatorConfig
+from docutranslate.translator.base import Translator
+
+
+@dataclass
+class XlsxTranslatorConfig(AiTranslatorConfig):
+    """Options for ``XlsxTranslator`` in addition to ``AiTranslatorConfig``."""
+
+    # How the translation is written back into a cell: instead of the original
+    # text ("replace"), after it ("append") or before it ("prepend").
+    position: Literal["replace", "append", "prepend"] = "replace"
+    # Placed between original and translated text for "append" / "prepend".
+    separator: str = "\n"
+
+
+class XlsxTranslator(Translator):
+    """Translate the plain-text cells of an xlsx workbook.
+
+    String cells (formulas excluded) from every sheet are collected, sent to
+    a ``SegmentsTranslateAgent`` in one batch and written back according to
+    ``position`` / ``separator``.
+    """
+
+    _POSITIONS = ("replace", "append", "prepend")
+
+    def __init__(self, config: XlsxTranslatorConfig):
+        """Build the translation agent from ``config`` and remember the write-back options."""
+        super().__init__(config=config)
+        self.chunk_size = config.chunk_size
+        agent_config = SegmentsTranslateAgentConfig(
+            custom_prompt=config.custom_prompt,
+            to_lang=config.to_lang,
+            baseurl=config.base_url,
+            key=config.api_key,
+            model_id=config.model_id,
+            system_prompt=None,
+            temperature=config.temperature,
+            thinking=config.thinking,
+            max_concurrent=config.concurrent,
+            timeout=config.timeout,
+            logger=self.logger,
+        )
+        self.translate_agent = SegmentsTranslateAgent(agent_config)
+        self.position = config.position
+        self.separator = config.separator
+
+    def _pre_translate(self, document: Document):
+        """Load the workbook and collect every cell that holds translatable text.
+
+        Returns:
+            ``(workbook, cells_to_translate, original_texts)`` where each entry of
+            ``cells_to_translate`` is a dict with ``sheet_name``, ``coordinate``
+            and ``original_text``, and ``original_texts`` lists the same texts
+            in the same order.
+        """
+        workbook = openpyxl.load_workbook(BytesIO(document.content))
+
+        # --- Step 1: collect all text cells that need translating ---
+        cells_to_translate = []
+        for sheet_name in workbook.sheetnames:
+            sheet = workbook[sheet_name]
+            for row in sheet.iter_rows():
+                for cell in row:
+                    # The value must be a str AND the data type 's' (string),
+                    # which excludes formulas ('f').
+                    if isinstance(cell.value, str) and cell.data_type == "s":
+                        cells_to_translate.append({
+                            "sheet_name": sheet_name,
+                            "coordinate": cell.coordinate,
+                            "original_text": cell.value,
+                        })
+        # Extract the source texts for the batch translation.
+        original_texts = [cell["original_text"] for cell in cells_to_translate]
+        return workbook, cells_to_translate, original_texts
+
+    def _compose_cell_text(self, original_text, translated_text):
+        """Combine original and translated text according to ``self.position``."""
+        if self.position == "replace":
+            return translated_text
+        if self.position == "append":
+            return original_text + self.separator + translated_text
+        return translated_text + self.separator + original_text
+
+    def _after_translate(self, workbook, cells_to_translate, translated_texts, original_texts):
+        """Write the translations into ``workbook`` and return it serialised as bytes.
+
+        The workbook is always closed, whether or not writing or saving succeeds.
+        With an unsupported ``position`` an error is logged once and the cells
+        are left unchanged.
+        """
+        workbook_output_stream = BytesIO()
+        try:
+            if self.position not in self._POSITIONS:
+                self.logger.error("不正确的XlsxTranslatorConfig参数")
+            else:
+                for cell_info, translated_text, original_text in zip(
+                        cells_to_translate, translated_texts, original_texts):
+                    # Locate the worksheet and overwrite the cell.
+                    sheet = workbook[cell_info["sheet_name"]]
+                    sheet[cell_info["coordinate"]] = self._compose_cell_text(
+                        original_text, translated_text)
+            # Serialise the modified workbook into the in-memory stream.
+            workbook.save(workbook_output_stream)
+        finally:
+            workbook.close()
+        return workbook_output_stream.getvalue()
+
+    def translate(self, document: Document) -> Self:
+        """Translate ``document`` in place and return ``self``.
+
+        When the workbook contains no translatable text the document is left
+        untouched.
+        """
+        workbook, cells_to_translate, original_texts = self._pre_translate(document)
+        if not cells_to_translate:
+            self.logger.info("文件中没有找到需要翻译的纯文本内容。")
+            workbook.close()
+            return self
+        # --- Step 2: run the translation ---
+        translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
+
+        document.content = self._after_translate(workbook, cells_to_translate, translated_texts, original_texts)
+        return self
+
+    async def translate_async(self, document: Document) -> Self:
+        """Asynchronous counterpart of ``translate``.
+
+        Workbook loading and saving run in worker threads via
+        ``asyncio.to_thread``; the translation itself is awaited on the agent.
+        """
+        workbook, cells_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document)
+        if not cells_to_translate:
+            self.logger.info("文件中没有找到需要翻译的纯文本内容。")
+            workbook.close()
+            return self
+        # --- Step 2: run the translation ---
+        translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
+
+        document.content = await asyncio.to_thread(self._after_translate, workbook, cells_to_translate,
+                                                   translated_texts, original_texts)
+        return self
+
+
+if __name__ == '__main__':
+    # Manual smoke test: translate one workbook and write the result to disk.
+    import asyncio
+    import os
+    from pathlib import Path
+
+    # The API key comes from the environment so that no credential lives in
+    # source control.
+    # NOTE(review): the key that used to be hard-coded here should be revoked.
+    config = XlsxTranslatorConfig(
+        base_url=r"https://open.bigmodel.cn/api/paas/v4/",
+        api_key=os.environ["DOCUTRANSLATE_API_KEY"],
+        model_id=r"glm-4-flash",
+        to_lang="英文",
+        position="append"
+    )
+    translator = XlsxTranslator(config)
+    document = Document.from_path(r"C:\Users\jxgm\Desktop\translate\docutranslate\tests\files\工业互联分组表.xlsx")
+
+
+    async def run():
+        await translator.translate_async(document)
+        path = Path(r"C:\Users\jxgm\Desktop\translate\docutranslate\tests\output\output.xlsx")
+        path.write_bytes(document.content)
+        print(f"已保存到{path.resolve()}")
+
+
+    asyncio.run(run())
diff --git a/pyproject.toml b/pyproject.toml
index 41fdddf..dc64a77 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,6 +8,7 @@ dependencies = [
"markdown2>=2.5.3",
"fastapi[standard]>=0.115.12",
"jsonpath-ng>=1.7.0",
+ "openpyxl>=3.1.5",
]
dynamic = ["version"]
diff --git a/uv.lock b/uv.lock
index c0bad3e..680b334 100644
--- a/uv.lock
+++ b/uv.lock
@@ -285,6 +285,7 @@ dependencies = [
{ name = "httpx" },
{ name = "jsonpath-ng" },
{ name = "markdown2" },
+ { name = "openpyxl" },
]
[package.optional-dependencies]
@@ -308,6 +309,7 @@ requires-dist = [
{ name = "jsonpath-ng", specifier = ">=1.7.0" },
{ name = "markdown2", specifier = ">=2.5.3" },
{ name = "opencv-python", marker = "extra == 'docling'", specifier = ">=4.11.0.86" },
+ { name = "openpyxl", specifier = ">=3.1.5" },
]
provides-extras = ["docling"]