This commit is contained in:
xunbu
2025-05-10 12:32:17 +08:00
parent 64bb9a966a
commit 4684a39d54
6 changed files with 58 additions and 86 deletions

75
.idea/workspace.xml generated
View File

@@ -5,8 +5,11 @@
</component>
<component name="ChangeListManager">
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
<change beforePath="$PROJECT_DIR$/docutranslate/Agents/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/Agents/__init__.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/Agents/markdown_agent.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/Agents/markdown_agent.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -38,8 +41,10 @@
"JavaScript 调试.output.html (1).executor": "Run",
"JavaScript 调试.output.html.executor": "Run",
"JavaScript 调试.regex_中文.html.executor": "Run",
"JavaScript 调试.test2.html.executor": "Run",
"JavaScript 调试.test2_英文.html.executor": "Run",
"JavaScript 调试.test4-1_中文.html.executor": "Run",
"JavaScript 调试.互联网认证授权机制.html.executor": "Run",
"JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run",
"JavaScript 调试.毕业论文_英文.html.executor": "Run",
"ModuleVcsDetector.initialDetectionPerformed": "true",
@@ -106,7 +111,10 @@
<option name="OPTIONS" value="" />
<method v="2" />
</configuration>
<configuration name="互联网认证授权机制_英文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/resource/互联网认证授权机制_英文.html" useBuiltInWebServerPort="true">
<configuration name="互联网认证授权机制.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/output/互联网认证授权机制.html" useBuiltInWebServerPort="true">
<method v="2" />
</configuration>
<configuration name="test2.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/output/test2.html" useBuiltInWebServerPort="true">
<method v="2" />
</configuration>
<configuration default="true" type="PythonConfigurationType" factoryName="Python">
@@ -132,52 +140,6 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="agent" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/Agents" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/Agents/agent.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="markdown_splitter" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/utils" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test1" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
@@ -201,7 +163,7 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test3" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<configuration name="test2" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
@@ -215,7 +177,7 @@
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/tests/test3.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/tests/test2.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
@@ -347,10 +309,10 @@
<recent_temporary>
<list>
<item itemvalue="Python.test1" />
<item itemvalue="Python.test2" />
<item itemvalue="JavaScript 调试.互联网认证授权机制.html" />
<item itemvalue="JavaScript 调试.test2.html" />
<item itemvalue="Python.translater" />
<item itemvalue="Python.agent" />
<item itemvalue="Python.markdown_splitter" />
<item itemvalue="Python.test3" />
</list>
</recent_temporary>
</component>
@@ -397,7 +359,8 @@
<workItem from="1746788668813" duration="25000" />
<workItem from="1746791230782" duration="3129000" />
<workItem from="1746799824552" duration="317000" />
<workItem from="1746801217905" duration="14698000" />
<workItem from="1746801217905" duration="14895000" />
<workItem from="1746844613273" duration="6860000" />
</task>
<servers />
</component>
@@ -408,7 +371,7 @@
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746677277745" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746780691113" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746843295280" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746851336881" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
<SUITE FILE_PATH="coverage/PDFtranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746596984213" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
@@ -420,7 +383,7 @@
<SUITE FILE_PATH="coverage/PDFtranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746629433597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$PDFtranslater__2_.coverage" NAME="PDFtranslater (2) 覆盖结果" MODIFIED="1746679546680" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/filetranslate_packages" />
<SUITE FILE_PATH="coverage/filetranslate$agent.coverage" NAME="agent 覆盖结果" MODIFIED="1746805293987" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/Agents" />
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1746710994589" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1746850160458" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746843159560" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
<SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" />
</component>

View File

@@ -53,7 +53,7 @@ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
由于需要使用大语言模型进行markdown调整与翻译，所以需要预先获取模型的baseurl、key、model-id。
常见的大模型平台baseurl与api获取方式可见[常用ai平台](#常用ai平台)。
> 比较推荐的模型有智谱的glm-4-flash（免费）、glm-4-air，阿里云的qwen-plus等。
> 比较推荐的模型有智谱的glm-4-air、glm-4-flash（免费），阿里云的qwen-plus等。
> 推理模型需要支持在api请求响应中区分`reasoning_content`和`content`（详见平台开发手册；ollama、lmstudio需开启对应选项）。
# 使用方式
@@ -125,10 +125,10 @@ from docutranslate import FileTranslater
translater = FileTranslater(base_url="<baseurl>", # 默认的模型baseurl
key="<key>", # 默认的模型api-key
model_id="<model-id>", # 默认的模型id
chunksize=4000, # markdown分块长度分块越大效果越好不建议超过4096
max_concurrent=20, # 并发数受到ai平台并发量限制如果文章很长建议适当加大到20以上
chunksize=4000, # markdown分块长度（单位byte），分块越大效果越好，不建议超过4096
max_concurrent=10, # 并发数，受到ai平台并发量限制，如果文章很长，建议适当加大到20以上
docling_artifact=None, # 使用提前下载好的docling模型
timeout=1000,# 调用api的超时时间
timeout=2000,# 调用api的超时时间
tips=True # 开场提示
)

View File

@@ -12,9 +12,11 @@ class MDRefineAgent(Agent):
找到markdown片段的不合理之处。
对于缺失的句子,应该查看缺失的语句是否可能被错误的放在了其他位置,并通过重组段落修复不合理之处。
去掉异常字词,修复错误格式。
尽量忠实于原文。形如<ph-abc123>的占位符不要改变。code和latex保持原文。保留正确的空行。
# 要求
尽量忠实于原文。形如<ph-abc123>的占位符不要改变。
code块和latex块保持原文。
# 输出
修正后的markdown纯文本
修正后的markdown**纯文本**
# 示例
## 调整顺序
输入:
@@ -36,12 +38,15 @@ class MDTranslateAgent(Agent):
self.system_prompt=f"""# 角色
你是一个翻译markdown文本的专家。
# 工作
输入的markdown文本翻译成{to_lang}
请忠实于原文。修改明显错误的字符。保留正确的空行。
翻译输入的markdown文本
目标语言{to_lang}
# 要求
请忠实于原文,适当修复异常文本。
必要的专有名词不要翻译。
形如<ph-abc123>的占位符不要改变。
code和latex保持原文。
code和latex保持原文。
# 输出
翻译后的markdown纯文本
翻译后的markdown**纯文本**
# 示例
## 英文翻译为中文:
输入:

View File

@@ -14,9 +14,9 @@ from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_ur
class FileTranslater:
def __init__(self, file_path: Path | str | None = None, chunksize: int = 4096, base_url="", key=None,
model_id="", temperature=0.7, max_concurrent=20, docling_artifact: Path | str | None = None,
timeout=1000, tips=True):
def __init__(self, file_path: Path | str | None = None, chunksize: int = 5000, base_url="", key=None,
model_id="", temperature=0.7, max_concurrent=10, docling_artifact: Path | str | None = None,
timeout=2000, tips=True):
if isinstance(file_path, str):
file_path = Path(file_path)
self.file_path: Path = file_path
@@ -99,22 +99,25 @@ class FileTranslater:
def refine_markdown_by_agent(self, refine_agent: Agent | None = None) -> str:
print("正在修正markdown")
self._mask_uris_in_markdown()
chuncks = self._split_markdown_into_chunks()
if refine_agent is None:
refine_agent = MDRefineAgent(**self.default_agent_params())
result: list[str] = refine_agent.send_prompts(chuncks)
self.markdown = "\n".join(result)
self._unmask_uris_in_markdown()
print("markdown已修正")
return self.markdown
def translate_markdown_by_agent(self, translate_agent: Agent | None = None):
def translate_markdown_by_agent(self, translate_agent: Agent | None = None,to_lang="中文"):
print("正在翻译markdown")
self._mask_uris_in_markdown()
chuncks = self._split_markdown_into_chunks()
if translate_agent is None:
translate_agent = MDTranslateAgent(**self.default_agent_params())
translate_agent = MDTranslateAgent(to_lang=to_lang,**self.default_agent_params())
result: list[str] = translate_agent.send_prompts(chuncks)
self.markdown = "\n".join(result)
self._unmask_uris_in_markdown()
print("翻译完成")
return self.markdown
@@ -226,11 +229,9 @@ class FileTranslater:
if isinstance(file_path, str):
file_path = Path(file_path)
self.read_file(file_path, formula=formula, code=code)
self._mask_uris_in_markdown()
if refine:
self.refine_markdown_by_agent(refine_agent)
self.translate_markdown_by_agent(translate_agent)
self._unmask_uris_in_markdown()
self.translate_markdown_by_agent(translate_agent,to_lang=to_lang)
if output_format == "markdown":
filename = f"{file_path.stem}_{to_lang}.md"
self.save_as_markdown(filename=filename, output_dir=output_dir)

View File

@@ -3,7 +3,7 @@ from typing import List
class MarkdownBlockSplitter:
def __init__(self, max_block_size: int = 4096):
def __init__(self, max_block_size: int = 5000):
"""
初始化Markdown分块器
@@ -11,6 +11,9 @@ class MarkdownBlockSplitter:
max_block_size: maximum size of each block, measured in UTF-8 encoded bytes
"""
self.max_block_size = max_block_size
@staticmethod
def _get_bytes(text:str)->int:
return len(text.encode('utf-8'))
def split_markdown(self, markdown_text: str) -> List[str]:
"""
@@ -34,7 +37,7 @@ class MarkdownBlockSplitter:
pending_heading = None # 等待内容的标题
for block in blocks:
block_size = len(block)
block_size = self._get_bytes(block)
is_heading = bool(re.match(r'^#{1,6}\s+.+', block.strip()))
is_separator = block.strip() == '' and block.count('\n') > 0
@@ -57,10 +60,10 @@ class MarkdownBlockSplitter:
# 如果有等待内容的标题,尝试将其与内容保持在一起
if pending_heading and not is_heading and not is_separator:
# 如果只添加标题和此块能放下,则这样做
if len(pending_heading) + block_size + 1 <= self.max_block_size:
if self._get_bytes(pending_heading) + block_size + 1 <= self.max_block_size:
result_blocks.append('\n'.join(current_block[:-1])) # 输出不含标题的内容
current_block = [pending_heading, block]
current_size = len(pending_heading) + 1 + block_size
current_size = self._get_bytes(pending_heading) + 1 + block_size
pending_heading = None
continue
@@ -166,16 +169,16 @@ class MarkdownBlockSplitter:
result = []
current_chunk = [first_line]
current_size = len(first_line)
current_size = self._get_bytes(first_line)
for line in remaining_lines:
line_len = len(line) + 1 # +1是因为换行符
line_len = self._get_bytes(line) + 1 # +1是因为换行符
if current_size + line_len + len(closing_fence) > self.max_block_size:
if current_size + line_len + self._get_bytes(closing_fence) > self.max_block_size:
# 关闭当前块并开始新块
result.append('\n'.join(current_chunk + [closing_fence]))
current_chunk = [first_line] # 新块使用相同的开始标记
current_size = len(first_line)
current_size = self._get_bytes(first_line)
current_chunk.append(line)
current_size += line_len
@@ -193,7 +196,7 @@ class MarkdownBlockSplitter:
current_size = 0
for line in lines:
line_len = len(line) + 1 # +1是因为换行符
line_len = self._get_bytes(line) + 1 # +1是因为换行符
if current_size + line_len > self.max_block_size and current_chunk:
result.append('\n'.join(current_chunk))
@@ -213,7 +216,7 @@ class MarkdownBlockSplitter:
return result
def split_markdown_text(markdown_text, max_block_size=4096):
def split_markdown_text(markdown_text, max_block_size=5000):
"""
将Markdown字符串分割成不超过max_block_size的块
可以通过简单拼接重建原始文本(分割的代码块除外)

View File

@@ -1,6 +1,6 @@
[project]
name = "docutranslate"
version = "0.1.2"
version = "0.1.3"
description = "文件翻译工具"
readme = "README.md"
requires-python = ">=3.10"