From fb99ed78e3df8933fce6cea23060a97cb67aa826 Mon Sep 17 00:00:00 2001 From: xunbu Date: Tue, 5 Aug 2025 00:10:46 +0800 Subject: [PATCH] fix --- docutranslate/agents/segments_agent.py | 37 +++++++++++++---- .../exporter/xlsx/xlsx2html_exporter.py | 40 ++----------------- docutranslate/static/index.html | 2 +- pyproject.toml | 2 + uv.lock | 37 +++++++++++++++++ 5 files changed, 72 insertions(+), 46 deletions(-) diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py index 1621aba..6f05edf 100644 --- a/docutranslate/agents/segments_agent.py +++ b/docutranslate/agents/segments_agent.py @@ -1,4 +1,5 @@ import json +from json_repair import json_repair from dataclasses import dataclass from json import JSONDecodeError @@ -28,12 +29,18 @@ class SegmentsTranslateAgent(Agent): 翻译后的片段应该与源格式尽量相同 如果待翻译片段已经是目标语言,则保持原样 # 输出 -翻译后的片段序列,以json文本表示(文本而非代码块)。其中键是片段编号,值是翻译后的片段 +翻译后的片段序列,以json文本表示(注意不是代码块)。其中键是片段编号,值是翻译后的片段。 +返回的json文本必须能被json.loads转换为形如{{"片段编号":"译文"}}的字典。 # 示例 ## 输入 {r'{"0":"hello","1":"apple","2":true,"3":"false"}'} ## 输出 {r'{"0":"你好","1":"苹果","2":true,"3":"错误"}'} +# 错误输出示例 +"{r'"{\"0\":\"你好\",\"1\":\"苹果\"}"'}" +# 正确输出示例 +{r'{"0":"你好","1":"苹果"}'} +警告:绝不要将整个JSON对象用引号包裹成一个字符串。 """ if config.custom_prompt: self.system_prompt += "\n# 重要规则或背景【非常重要】\n" + config.custom_prompt + '\n' @@ -41,26 +48,40 @@ class SegmentsTranslateAgent(Agent): def send_segments(self, segments: list[str], chunk_size: int): indexed_originals = {str(i): text for i, text in enumerate(segments)} chunks = flat_json_split(indexed_originals, chunk_size) - prompts = [json.dumps(chunk) for chunk in chunks] + prompts = [json.dumps(chunk,ensure_ascii=False) for chunk in chunks] translated_chunks = super().send_prompts(prompts=prompts) indexed_translated = indexed_originals.copy() for chunk_str in translated_chunks: - translated_part = json.loads(chunk_str) - indexed_translated.update(translated_part) + try: + translated_part = json_repair.loads(chunk_str) + for key,val in 
translated_part.items(): + if key in indexed_translated: + indexed_translated[key]=val + except JSONDecodeError as e: + self.logger.error(f"json解析错误,解析文本:{chunk_str},错误:{e.__repr__()}") + except (ValueError, AttributeError) as e: + self.logger.error(f"value错误,更新对象:{indexed_translated},错误:{e.__repr__()}") + return list(indexed_translated.values()) #todo:增加协程粒度 async def send_segments_async(self, segments: list[str], chunk_size: int): indexed_originals = {str(i): text for i, text in enumerate(segments)} chunks = flat_json_split(indexed_originals, chunk_size) - prompts = [json.dumps(chunk) for chunk in chunks] + prompts = [json.dumps(chunk,ensure_ascii=False) for chunk in chunks] translated_chunks = await super().send_prompts_async(prompts=prompts) indexed_translated = indexed_originals.copy() for chunk_str in translated_chunks: try: - translated_part = json.loads(chunk_str) - indexed_translated.update(translated_part) + translated_part:dict = json_repair.loads(chunk_str) + for key,val in translated_part.items(): + if key in indexed_translated: + indexed_translated[key]=val except JSONDecodeError as e: - self.logger.info(f"json解析错误,解析文本:{chunk_str},错误:{e.__repr__()}") + self.logger.error(f"json解析错误,解析文本:{chunk_str},错误:{e.__repr__()}") + except ValueError as e: + self.logger.error(f"value错误,更新对象:{indexed_translated},错误:{e.__repr__()}") + except AttributeError as e: + self.logger.error(f"属性错误,chunk_str:{chunk_str},错误:{e.__repr__()}") return list(indexed_translated.values()) diff --git a/docutranslate/exporter/xlsx/xlsx2html_exporter.py b/docutranslate/exporter/xlsx/xlsx2html_exporter.py index f5310ff..8d04705 100644 --- a/docutranslate/exporter/xlsx/xlsx2html_exporter.py +++ b/docutranslate/exporter/xlsx/xlsx2html_exporter.py @@ -3,6 +3,7 @@ from io import BytesIO import jinja2 import openpyxl +from xlsx2html import xlsx2html from docutranslate.exporter.base import ExporterConfig from docutranslate.exporter.xlsx.base import XlsxExporter @@ -22,40 +23,5 @@ class Xlsx2HTMLExporter(XlsxExporter): 
self.cdn = config.cdn def export(self, document: Document) -> Document: - - # 1. 加载工作簿和工作表 - workbook = openpyxl.load_workbook(BytesIO(document.content)) - sheet = workbook.active - - # 2. 手动构建HTML字符串 - table = '\n' - - # 处理表头 - table += ' \n \n' - for cell in sheet[1]: # 假设第一行是表头 - table += f' \n' - table += ' \n \n' - - # 处理数据行 - table += ' \n' - # iter_rows(min_row=2) 从第二行开始遍历 - for row in sheet.iter_rows(min_row=2): - table += ' \n' - for cell in row: - # 处理None值,防止在HTML中显示"None" - cell_value = cell.value if cell.value is not None else "" - table += f' \n' - table += ' \n' - table += ' \n' - - table += '
{cell.value}
{cell_value}
' - - html_template = resource_path("template/xlsx.html").read_text(encoding="utf-8") - - pico = f'' if not self.cdn else r'' - render = jinja2.Template(html_template).render( - title=document.stem, - pico=pico, - body=table, - ) - return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem) + html_content = xlsx2html(BytesIO(document.content), output=None).getvalue() + return Document.from_bytes(content=html_content.encode("utf-8"), suffix=".html", stem=document.stem) diff --git a/docutranslate/static/index.html b/docutranslate/static/index.html index 3473bef..c6e3a59 100644 --- a/docutranslate/static/index.html +++ b/docutranslate/static/index.html @@ -1 +1 @@ - DocuTranslate - 交互式文档翻译

DocuTranslate

如果上传的文件本身是.md格式,此项可不选。

GitHub主页(欢迎star❤):
https://github.com/xunbu/docutranslate

交流QQ群: 1047781902

任务列表

当前没有任务,点击“新建任务”开始吧!

预览
原文
译文
\ No newline at end of file + DocuTranslate - 交互式文档翻译

DocuTranslate

如果上传的文件本身是.md格式,此项可不选。

GitHub主页(欢迎star❤):
https://github.com/xunbu/docutranslate

交流QQ群: 1047781902

任务列表

当前没有任务,点击“新建任务”开始吧!

预览
原文
译文
\ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index dc64a77..08c886f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,8 @@ dependencies = [ "fastapi[standard]>=0.115.12", "jsonpath-ng>=1.7.0", "openpyxl>=3.1.5", + "xlsx2html>=0.6.2", + "json-repair>=0.48.0", ] dynamic = ["version"] diff --git a/uv.lock b/uv.lock index 680b334..1ccda02 100644 --- a/uv.lock +++ b/uv.lock @@ -51,6 +51,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815 }, ] +[[package]] +name = "babel" +version = "2.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537 }, +] + [[package]] name = "beautifulsoup4" version = "4.13.4" @@ -283,9 +292,11 @@ source = { editable = "." 
} dependencies = [ { name = "fastapi", extra = ["standard"] }, { name = "httpx" }, + { name = "json-repair" }, { name = "jsonpath-ng" }, { name = "markdown2" }, { name = "openpyxl" }, + { name = "xlsx2html" }, ] [package.optional-dependencies] @@ -306,10 +317,12 @@ requires-dist = [ { name = "docling", marker = "extra == 'docling'", specifier = ">=2.40.0" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" }, { name = "httpx", specifier = "==0.27.2" }, + { name = "json-repair", specifier = ">=0.48.0" }, { name = "jsonpath-ng", specifier = ">=1.7.0" }, { name = "markdown2", specifier = ">=2.5.3" }, { name = "opencv-python", marker = "extra == 'docling'", specifier = ">=4.11.0.86" }, { name = "openpyxl", specifier = ">=3.1.5" }, + { name = "xlsx2html", specifier = ">=0.6.2" }, ] provides-extras = ["docling"] @@ -553,6 +566,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899 }, ] +[[package]] +name = "json-repair" +version = "0.48.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/b5/dd0e703abd6f69507c7ec0494c4d0bf5ecefaabaa454801bebcc8b80ff73/json_repair-0.48.0.tar.gz", hash = "sha256:030f826e6867dbc465be7163dfc23458c0776002c0878d239b29136cd2ae8f39", size = 34736 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ba/e9/22315fd481ed9dc007646181b8c2ebf8b63cd512e59c750d03d50a9ed838/json_repair-0.48.0-py3-none-any.whl", hash = "sha256:c3eb34518c39a7a58d963dbbbda8cdb44e16d819169a85d6d1882f7d2fa24774", size = 26354 }, +] + [[package]] name = "jsonlines" version = "3.1.0" @@ -2406,6 +2428,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = 
"sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743 }, ] +[[package]] +name = "xlsx2html" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "babel" }, + { name = "openpyxl" }, + { name = "packaging" }, + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/1b/770c7c3d73b7dd0309f8aa640be19782f6762f03caa871edcb8c3fbd0ae6/xlsx2html-0.6.2.tar.gz", hash = "sha256:e3be926dca7c3217eabe6b9e4e50447b1c65cd2a16a711e395f5e9a1ac18ce9e", size = 19638 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/26/a43d76e5ffddb1c2d958915fec5d99552a80559e6403ee3f0dd3db28fdea/xlsx2html-0.6.2-py2.py3-none-any.whl", hash = "sha256:d10d6c18be2e563a9fe8aee5dcadb2140be38e1090db717ede1b8ff09648452a", size = 17547 }, +] + [[package]] name = "xlsxwriter" version = "3.2.3"