This commit is contained in:
xunbu
2025-08-05 00:10:46 +08:00
parent 262abe88c6
commit fb99ed78e3
5 changed files with 72 additions and 46 deletions

View File

@@ -1,4 +1,5 @@
import json import json
from json_repair import json_repair
from dataclasses import dataclass from dataclasses import dataclass
from json import JSONDecodeError from json import JSONDecodeError
@@ -28,12 +29,18 @@ class SegmentsTranslateAgent(Agent):
翻译后的片段应该与源格式尽量相同 翻译后的片段应该与源格式尽量相同
如果待翻译片段已经是目标语言,则保持原样 如果待翻译片段已经是目标语言,则保持原样
# 输出 # 输出
翻译后的片段序列以json文本表示文本而非代码块)。其中键是片段编号,值是翻译后的片段 翻译后的片段序列以json文本表示注意不是代码块)。其中键是片段编号,值是翻译后的片段
返回的json文本必须能被json.loads转换为形如{{"片段编号":"译文"}}的字典。
# 示例 # 示例
## 输入 ## 输入
{r'{"0":"hello","1":"apple","2":true,"3":"false"}'} {r'{"0":"hello","1":"apple","2":true,"3":"false"}'}
## 输出 ## 输出
{r'{"0":"你好","1":"苹果","2":true,"3":"错误"}'} {r'{"0":"你好","1":"苹果","2":true,"3":"错误"}'}
# 错误输出示例
"{r'"{\"0\":\"你好\",\"1\":\"苹果\"}"'}"
# 正确输出示例
{r'{"0":"你好","1":"苹果"}'}
警告绝不要将整个JSON对象用引号包裹成一个字符串。
""" """
if config.custom_prompt: if config.custom_prompt:
self.system_prompt += "\n# 重要规则或背景【非常重要】\n" + config.custom_prompt + '\n' self.system_prompt += "\n# 重要规则或背景【非常重要】\n" + config.custom_prompt + '\n'
@@ -41,26 +48,40 @@ class SegmentsTranslateAgent(Agent):
def send_segments(self, segments: list[str], chunk_size: int):
    """Translate ``segments`` synchronously and return them in input order.

    Every segment is keyed by its position (as a string). The keyed mapping
    is split with ``flat_json_split`` using ``chunk_size`` and each chunk is
    sent as one JSON prompt. Replies are parsed with ``json_repair.loads``
    and merged back by key; keys that were never sent are ignored. A segment
    whose key is missing from the replies, or whose reply could not be used,
    keeps its original text.

    Args:
        segments: Source texts to translate.
        chunk_size: Passed through to ``flat_json_split``.

    Returns:
        A list with one entry per input segment, in input order.
    """
    indexed_originals = {str(i): text for i, text in enumerate(segments)}
    chunks = flat_json_split(indexed_originals, chunk_size)
    # ensure_ascii=False sends non-ASCII text as-is instead of \uXXXX escapes.
    prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
    translated_chunks = super().send_prompts(prompts=prompts)
    # Start from the originals so unusable replies fall back to source text.
    indexed_translated = indexed_originals.copy()
    for chunk_str in translated_chunks:
        try:
            translated_part = json_repair.loads(chunk_str)
            # Iterate over .items(): looping over the dict itself yields only
            # keys, and unpacking a key string into (key, val) either raises
            # ValueError or splits a two-character key such as "10" into the
            # wrong key/value pair.
            for key, val in translated_part.items():
                if key in indexed_translated:
                    indexed_translated[key] = val
        except JSONDecodeError as e:
            self.logger.error(f"json解析错误解析文本:{chunk_str},错误:{e.__repr__()}")
        except ValueError as e:
            self.logger.error(f"value错误更新对象:{indexed_translated},错误:{e.__repr__()}")
        except AttributeError as e:
            # Raised when the parsed reply is not a dict (no .items()); log and
            # keep the originals for this chunk, as send_segments_async does.
            self.logger.error(f"属性错误,chunk_str:{chunk_str},错误:{e.__repr__()}")
    return list(indexed_translated.values())
#todo:增加协程粒度 #todo:增加协程粒度
async def send_segments_async(self, segments: list[str], chunk_size: int):
    """Translate ``segments`` asynchronously and return them in input order.

    Segments are keyed by their position (as a string), split into chunks
    with ``flat_json_split``, serialized to JSON prompts and awaited through
    the parent class's ``send_prompts_async``. Each reply is parsed with
    ``json_repair.loads``; only keys that were originally sent are merged
    back. A chunk that raises one of the handled errors is logged and leaves
    its segments at their original text.

    Args:
        segments: Source texts to translate.
        chunk_size: Passed through to ``flat_json_split``.

    Returns:
        A list with one entry per input segment, in input order.
    """
    indexed_originals = {str(position): segment for position, segment in enumerate(segments)}
    prompts = [
        json.dumps(piece, ensure_ascii=False)
        for piece in flat_json_split(indexed_originals, chunk_size)
    ]
    translated_chunks = await super().send_prompts_async(prompts=prompts)

    # Seed the result with the originals; replies overwrite matching keys only.
    indexed_translated = dict(indexed_originals)
    for chunk_str in translated_chunks:
        try:
            parsed: dict = json_repair.loads(chunk_str)
            for segment_key, translation in parsed.items():
                if segment_key not in indexed_translated:
                    continue
                indexed_translated[segment_key] = translation
        except JSONDecodeError as e:
            self.logger.error(f"json解析错误解析文本:{chunk_str},错误:{e.__repr__()}")
        except ValueError as e:
            self.logger.error(f"value错误更新对象:{indexed_translated},错误:{e.__repr__()}")
        except AttributeError as e:
            # Reached when the parsed reply has no .items(), i.e. is not a dict.
            self.logger.error(f"属性错误,chunk_str:{chunk_str},错误:{e.__repr__()}")
    return [*indexed_translated.values()]

View File

@@ -3,6 +3,7 @@ from io import BytesIO
import jinja2 import jinja2
import openpyxl import openpyxl
from xlsx2html import xlsx2html
from docutranslate.exporter.base import ExporterConfig from docutranslate.exporter.base import ExporterConfig
from docutranslate.exporter.xlsx.base import XlsxExporter from docutranslate.exporter.xlsx.base import XlsxExporter
@@ -22,40 +23,5 @@ class Xlsx2HTMLExporter(XlsxExporter):
self.cdn = config.cdn self.cdn = config.cdn
def export(self, document: Document) -> Document:
    """Convert the workbook held in ``document.content`` to an HTML document.

    The bytes are wrapped in a ``BytesIO`` and handed to ``xlsx2html``; the
    text read back from its return value is encoded as UTF-8 and returned as
    a new ``Document`` with suffix ``.html`` and the same stem as the input.
    """
    workbook_stream = BytesIO(document.content)
    # output=None: the rendered HTML is read from the returned object.
    rendered = xlsx2html(workbook_stream, output=None)
    html_bytes = rendered.getvalue().encode("utf-8")
    return Document.from_bytes(content=html_bytes, suffix=".html", stem=document.stem)

File diff suppressed because one or more lines are too long

View File

@@ -9,6 +9,8 @@ dependencies = [
"fastapi[standard]>=0.115.12", "fastapi[standard]>=0.115.12",
"jsonpath-ng>=1.7.0", "jsonpath-ng>=1.7.0",
"openpyxl>=3.1.5", "openpyxl>=3.1.5",
"xlsx2html>=0.6.2",
"json-repair>=0.48.0",
] ]
dynamic = ["version"] dynamic = ["version"]

37
uv.lock generated
View File

@@ -51,6 +51,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815 }, { url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815 },
] ]
[[package]]
name = "babel"
version = "2.17.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537 },
]
[[package]] [[package]]
name = "beautifulsoup4" name = "beautifulsoup4"
version = "4.13.4" version = "4.13.4"
@@ -283,9 +292,11 @@ source = { editable = "." }
dependencies = [ dependencies = [
{ name = "fastapi", extra = ["standard"] }, { name = "fastapi", extra = ["standard"] },
{ name = "httpx" }, { name = "httpx" },
{ name = "json-repair" },
{ name = "jsonpath-ng" }, { name = "jsonpath-ng" },
{ name = "markdown2" }, { name = "markdown2" },
{ name = "openpyxl" }, { name = "openpyxl" },
{ name = "xlsx2html" },
] ]
[package.optional-dependencies] [package.optional-dependencies]
@@ -306,10 +317,12 @@ requires-dist = [
{ name = "docling", marker = "extra == 'docling'", specifier = ">=2.40.0" }, { name = "docling", marker = "extra == 'docling'", specifier = ">=2.40.0" },
{ name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" },
{ name = "httpx", specifier = "==0.27.2" }, { name = "httpx", specifier = "==0.27.2" },
{ name = "json-repair", specifier = ">=0.48.0" },
{ name = "jsonpath-ng", specifier = ">=1.7.0" }, { name = "jsonpath-ng", specifier = ">=1.7.0" },
{ name = "markdown2", specifier = ">=2.5.3" }, { name = "markdown2", specifier = ">=2.5.3" },
{ name = "opencv-python", marker = "extra == 'docling'", specifier = ">=4.11.0.86" }, { name = "opencv-python", marker = "extra == 'docling'", specifier = ">=4.11.0.86" },
{ name = "openpyxl", specifier = ">=3.1.5" }, { name = "openpyxl", specifier = ">=3.1.5" },
{ name = "xlsx2html", specifier = ">=0.6.2" },
] ]
provides-extras = ["docling"] provides-extras = ["docling"]
@@ -553,6 +566,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899 }, { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899 },
] ]
[[package]]
name = "json-repair"
version = "0.48.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/fc/b5/dd0e703abd6f69507c7ec0494c4d0bf5ecefaabaa454801bebcc8b80ff73/json_repair-0.48.0.tar.gz", hash = "sha256:030f826e6867dbc465be7163dfc23458c0776002c0878d239b29136cd2ae8f39", size = 34736 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ba/e9/22315fd481ed9dc007646181b8c2ebf8b63cd512e59c750d03d50a9ed838/json_repair-0.48.0-py3-none-any.whl", hash = "sha256:c3eb34518c39a7a58d963dbbbda8cdb44e16d819169a85d6d1882f7d2fa24774", size = 26354 },
]
[[package]] [[package]]
name = "jsonlines" name = "jsonlines"
version = "3.1.0" version = "3.1.0"
@@ -2406,6 +2428,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743 }, { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743 },
] ]
[[package]]
name = "xlsx2html"
version = "0.6.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "babel" },
{ name = "openpyxl" },
{ name = "packaging" },
{ name = "six" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c6/1b/770c7c3d73b7dd0309f8aa640be19782f6762f03caa871edcb8c3fbd0ae6/xlsx2html-0.6.2.tar.gz", hash = "sha256:e3be926dca7c3217eabe6b9e4e50447b1c65cd2a16a711e395f5e9a1ac18ce9e", size = 19638 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0c/26/a43d76e5ffddb1c2d958915fec5d99552a80559e6403ee3f0dd3db28fdea/xlsx2html-0.6.2-py2.py3-none-any.whl", hash = "sha256:d10d6c18be2e563a9fe8aee5dcadb2140be38e1090db717ede1b8ff09648452a", size = 17547 },
]
[[package]] [[package]]
name = "xlsxwriter" name = "xlsxwriter"
version = "3.2.3" version = "3.2.3"