From 0cd049e707cc6c29850ee1d79ed5288124634e26 Mon Sep 17 00:00:00 2001 From: xunbu Date: Mon, 18 Aug 2025 15:20:56 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E7=A4=BA=E8=AF=8D=E6=94=B9=E4=B8=BA?= =?UTF-8?q?=E8=8B=B1=E6=96=87=EF=BC=8C=E7=BD=91=E7=BB=9C=E8=AF=B7=E6=B1=82?= =?UTF-8?q?=E7=8E=B0=E5=9C=A8=E7=BB=8F=E8=BF=87=E4=BB=A3=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/agents/agent.py | 6 ++- docutranslate/agents/markdown_agent.py | 47 ++++++++++--------- docutranslate/agents/segments_agent.py | 36 +++++++------- docutranslate/agents/txt_agent.py | 25 +++++----- .../converter/x2md/converter_mineru.py | 10 ++-- .../exporter/docx/docx2html_exporter.py | 4 +- 6 files changed, 69 insertions(+), 59 deletions(-) diff --git a/docutranslate/agents/agent.py b/docutranslate/agents/agent.py index e220cbb..5b7d4c5 100644 --- a/docutranslate/agents/agent.py +++ b/docutranslate/agents/agent.py @@ -86,8 +86,10 @@ class Agent: self.model_id = config.model_id.strip() self.system_prompt = config.system_prompt or "" self.temperature = config.temperature - self.client = httpx.Client(trust_env=False, proxy=None, verify=False) - self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False) + # self.client = httpx.Client(trust_env=False, proxy=None, verify=False) + # self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False) + self.client = httpx.Client(verify=False) + self.client_async = httpx.AsyncClient(verify=False) self.max_concurrent = config.max_concurrent self.timeout = config.timeout self.thinking = config.thinking diff --git a/docutranslate/agents/markdown_agent.py b/docutranslate/agents/markdown_agent.py index 0e7ab9a..46c382b 100644 --- a/docutranslate/agents/markdown_agent.py +++ b/docutranslate/agents/markdown_agent.py @@ -11,32 +11,37 @@ class MDTranslateAgent(Agent): def __init__(self,config:MDTranslateAgentConfig): super().__init__(config) self.system_prompt = f""" -# 角色 -你是一个专业的机器翻译引擎 -# 工作 -翻译输入的markdown文本 -目标语言{config.to_lang} -# 要求 -翻译要求专业准确 -不输出任何解释和注释 -不能改变形如的占位符 -code、latex和HTML只翻译说明文字,其余保持原文 -所有公式无论长短必须表示为能被解析的合法latex公式,公式需被$或\\(\\)或$$正确包裹,如不正确则进行修正 -去除、修正明显异常的字符、但不能改变原意 -引用参考文献时请严格保持原文,不要翻译。参考文献格式示例如下: -[1] Author A, Author B. "Original Title". Journal, 2023. -[2] 作者C. 《中文标题》. 期刊, 2022. -# 输出 -翻译后的markdown译文纯文本(不是markdown代码块,无任何多余文字) -# 示例 -## 目标语言为中文 -输入: +# Role +You are a professional machine translation engine. + +# Task +Translate the input markdown text. +Target language: {config.to_lang} + +# Requirements +- The translation must be professional and accurate. +- Do not output any explanations or annotations. +- Do not change placeholders in the format of ``. +- For `code`, `LaTeX`, and `HTML`, only translate the descriptive text (e.g., comments, captions); keep the rest of the content in its original form. +- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it. +- Remove or correct any obviously abnormal characters, but without altering the original meaning. +- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows: + [1] Author A, Author B. "Original Title". Journal, 2023. + [2] 作者C. 《中文标题》. 期刊, 2022. + +# Output +The translated markdown text as plain text (not in a markdown code block, with no extraneous text). + +# Example +## Target language is Chinese +Input: hello, what's your nam*@e? ![photo title]() The equation is E=mc 2. This is famous. 1+1=2$$ (c_0,c_1_1,c_2^2)is a coordinate. -输出: + +Output: 你好,你叫什么名字? ![图像标题]() 这个方程是 $E=mc^2$。这很有名。 diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py index b16c800..4410654 100644 --- a/docutranslate/agents/segments_agent.py +++ b/docutranslate/agents/segments_agent.py @@ -19,26 +19,26 @@ class SegmentsTranslateAgent(Agent): def __init__(self, config: SegmentsTranslateAgentConfig): super().__init__(config) self.system_prompt = f""" -# 角色 -你是一个专业的机器翻译引擎 -# 工作 -你接收一个待翻译片段的序列,以json格式表示。其中键是待片段的编号,值是待翻译片段。 -你需要将待翻译片段翻译成目标语言。 -目标语言:{config.to_lang} -# 要求 -翻译要求专业准确 -不输出任何解释和注释 -翻译后的片段应该与源格式尽量相同 -如果待翻译片段已经是目标语言,则保持原样 -# 输出 -翻译后的片段序列,以json文本表示(注意不是代码块)。其中键是片段编号,值是翻译后的片段。 -返回的json文本必须能被json.loads转换为形如{{"片段编号":"译文"}}的字典。 -# 示例 -## 输入 +Role +You are a professional machine translation engine. +Task +You will receive a sequence of segments to be translated, represented in JSON format. The keys are the segment IDs, and the values are the segments for translation. +You need to translate these segments into the target language. +Target language: {config.to_lang} +Requirements +The translation must be professional and accurate. +Do not output any explanations or annotations. +The format of the translated segments should be as close as possible to the source format. +If a segment is already in the target language, keep it as is. +Output +The translated sequence of segments, represented as JSON text (note: not a code block). The keys are the segment IDs, and the values are the translated segments. +The returned JSON text must be parsable by json.loads into a dictionary of the form {r'{"segment_id": "translation"}'}. +Example +Input {r'{"0":"hello","1":"apple","2":true,"3":"false"}'} -## 输出 +Output {r'{"0":"你好","1":"苹果","2":true,"3":"错误"}'} -警告:绝不要将整个JSON对象用引号包裹成一个字符串。 +Warning: Never wrap the entire JSON object in quotes to make it a single string. Never wrap the JSON text in ```. """ if config.custom_prompt: self.system_prompt += "\n# 重要规则或背景【非常重要】\n" + config.custom_prompt + '\n' diff --git a/docutranslate/agents/txt_agent.py b/docutranslate/agents/txt_agent.py index 55a4ee4..3b70ea0 100644 --- a/docutranslate/agents/txt_agent.py +++ b/docutranslate/agents/txt_agent.py @@ -13,17 +13,20 @@ class TXTTranslateAgent(Agent): def __init__(self, config: TXTTranslateAgentConfig): super().__init__(config) self.system_prompt = f""" -# 角色 -你是一个专业的机器翻译引擎 -# 工作 -翻译输入的txt文本 -目标语言{config.to_lang} -# 要求 -翻译要求专业准确 -不输出任何解释和注释 -不能改变形如的占位符 -# 输出 -翻译后的txt译文纯文本 +# Role +You are a professional machine translation engine. + +# Task +Translate the input txt text. +Target language: {config.to_lang} + +# Requirements +- The translation must be professional and accurate. +- Do not output any explanations or annotations. +- Do not change placeholders in the format of ``. + +# Output +The translated txt text as plain text. """ if config.custom_prompt: self.system_prompt += "\n# 重要规则或背景【非常重要】\n" + config.custom_prompt + '\n' diff --git a/docutranslate/converter/x2md/converter_mineru.py b/docutranslate/converter/x2md/converter_mineru.py index 6d3ac21..4346827 100644 --- a/docutranslate/converter/x2md/converter_mineru.py +++ b/docutranslate/converter/x2md/converter_mineru.py @@ -2,7 +2,6 @@ import asyncio import time import zipfile from dataclasses import dataclass -from logging import Logger from typing import Hashable import httpx @@ -10,7 +9,6 @@ import httpx from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig from docutranslate.ir.document import Document from docutranslate.ir.markdown_document import MarkdownDocument -from docutranslate.logger import global_logger from docutranslate.utils.markdown_utils import embed_inline_image_from_zip URL = 'https://mineru.net/api/v4/file-urls/batch' @@ -21,7 +19,7 @@ class ConverterMineruConfig(X2MarkdownConverterConfig): mineru_token: str formula_ocr: bool = True - def gethash(self) ->Hashable: + def gethash(self) -> Hashable: return self.formula_ocr @@ -32,8 +30,10 @@ timeout = httpx.Timeout( pool=1.0 # 从连接池获取连接的超时时间 ) -client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False) -client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False) +# client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False) +# client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False) +client = httpx.Client(timeout=timeout, verify=False) +client_async = httpx.AsyncClient(timeout=timeout, verify=False) class ConverterMineru(X2MarkdownConverter): diff --git a/docutranslate/exporter/docx/docx2html_exporter.py b/docutranslate/exporter/docx/docx2html_exporter.py index 89cfcd0..6818f1f 100644 --- a/docutranslate/exporter/docx/docx2html_exporter.py +++ b/docutranslate/exporter/docx/docx2html_exporter.py @@ -4,7 +4,7 @@ from io import BytesIO import mammoth from docutranslate.exporter.base import ExporterConfig -from docutranslate.exporter.xlsx.base import XlsxExporter +from docutranslate.exporter.docx.base import DocxExporter from docutranslate.ir.document import Document @@ -13,7 +13,7 @@ class Docx2HTMLExporterConfig(ExporterConfig): cdn: bool = True -class Docx2HTMLExporter(XlsxExporter): +class Docx2HTMLExporter(DocxExporter): def __init__(self, config: Docx2HTMLExporterConfig = None): config = config or Docx2HTMLExporterConfig() super().__init__(config=config)