This commit is contained in:
xunbu
2025-05-08 16:13:22 +08:00
parent 204c325209
commit 2798172528
14 changed files with 74 additions and 57 deletions

2
.gitignore vendored
View File

@@ -7,6 +7,6 @@ wheels/
*.egg-info *.egg-info
tests/resource/ tests/resource/
tests/ tests/
filetranslate/output/ docutranslate/output/
# Virtual environments # Virtual environments
.venv .venv

76
.idea/workspace.xml generated
View File

@@ -5,8 +5,20 @@
</component> </component>
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment=""> <list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
<change beforePath="$PROJECT_DIR$/.gitignore" beforeDir="false" afterPath="$PROJECT_DIR$/.gitignore" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" /> <change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
<change beforePath="$PROJECT_DIR$/filetranslate/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/__init__.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/filetranslate/decorator/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/decorator/__init__.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/filetranslate/decorator/markdown_mask.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/decorator/markdown_mask.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/filetranslate/decorator/time.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/decorator/time.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/filetranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/filetranslate/utils/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/__init__.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/filetranslate/utils/agent_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/agent_utils.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/filetranslate/utils/convert.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/convert.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/filetranslate/utils/markdown_splitter.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/filetranslate/utils/markdown_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/markdown_utils.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
</list> </list>
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" /> <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -32,38 +44,38 @@
<option name="hideEmptyMiddlePackages" value="true" /> <option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" /> <option name="showLibraryContents" value="true" />
</component> </component>
<component name="PropertiesComponent"><![CDATA[{ <component name="PropertiesComponent">{
"keyToString": { &quot;keyToString&quot;: {
"DefaultHtmlFileTemplate": "HTML File", &quot;DefaultHtmlFileTemplate&quot;: &quot;HTML File&quot;,
"JavaScript 调试.output.html (1).executor": "Run", &quot;JavaScript 调试.output.html (1).executor&quot;: &quot;Run&quot;,
"JavaScript 调试.output.html.executor": "Run", &quot;JavaScript 调试.output.html.executor&quot;: &quot;Run&quot;,
"JavaScript 调试.regex_中文.html.executor": "Run", &quot;JavaScript 调试.regex_中文.html.executor&quot;: &quot;Run&quot;,
"JavaScript 调试.test2_英文.html.executor": "Run", &quot;JavaScript 调试.test2_英文.html.executor&quot;: &quot;Run&quot;,
"ModuleVcsDetector.initialDetectionPerformed": "true", &quot;ModuleVcsDetector.initialDetectionPerformed&quot;: &quot;true&quot;,
"Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run", &quot;Python 测试.Python 测试 (markdown_mask.py 内).executor&quot;: &quot;Run&quot;,
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run", &quot;Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor&quot;: &quot;Run&quot;,
"Python.PDFtranslater (1).executor": "Run", &quot;Python.PDFtranslater (1).executor&quot;: &quot;Run&quot;,
"Python.PDFtranslater (2).executor": "Run", &quot;Python.PDFtranslater (2).executor&quot;: &quot;Run&quot;,
"Python.agent_utils.executor": "Run", &quot;Python.agent_utils.executor&quot;: &quot;Run&quot;,
"Python.convert.executor": "Run", &quot;Python.convert.executor&quot;: &quot;Run&quot;,
"Python.markdown_splitter.executor": "Run", &quot;Python.markdown_splitter.executor&quot;: &quot;Run&quot;,
"Python.markdown_utils.executor": "Run", &quot;Python.markdown_utils.executor&quot;: &quot;Run&quot;,
"Python.test.executor": "Run", &quot;Python.test.executor&quot;: &quot;Run&quot;,
"Python.test1.executor": "Run", &quot;Python.test1.executor&quot;: &quot;Run&quot;,
"Python.translater.executor": "Debug", &quot;Python.translater.executor&quot;: &quot;Debug&quot;,
"RunOnceActivity.ShowReadmeOnStart": "true", &quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
"RunOnceActivity.git.unshallow": "true", &quot;RunOnceActivity.git.unshallow&quot;: &quot;true&quot;,
"git-widget-placeholder": "master", &quot;git-widget-placeholder&quot;: &quot;master&quot;,
"last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/tests/resource", &quot;last_opened_file_path&quot;: &quot;C:/Users/jxgm/Desktop/FileTranslate/tests/resource&quot;,
"node.js.detected.package.eslint": "true", &quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
"node.js.detected.package.tslint": "true", &quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
"node.js.selected.package.eslint": "(autodetect)", &quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
"node.js.selected.package.tslint": "(autodetect)", &quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
"nodejs_package_manager_path": "npm", &quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
"settings.editor.selected.configurable": "Errors", &quot;settings.editor.selected.configurable&quot;: &quot;Errors&quot;,
"vue.rearranger.settings.migration": "true" &quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
} }
}]]></component> }</component>
<component name="RecentsManager"> <component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS"> <key name="CopyFile.RECENT_KEYS">
<recent name="C:\Users\jxgm\Desktop\FileTranslate\tests\resource" /> <recent name="C:\Users\jxgm\Desktop\FileTranslate\tests\resource" />
@@ -302,7 +314,7 @@
<workItem from="1746588383790" duration="2614000" /> <workItem from="1746588383790" duration="2614000" />
<workItem from="1746593417117" duration="25924000" /> <workItem from="1746593417117" duration="25924000" />
<workItem from="1746626070703" duration="7931000" /> <workItem from="1746626070703" duration="7931000" />
<workItem from="1746669839816" duration="17865000" /> <workItem from="1746669839816" duration="20170000" />
</task> </task>
<servers /> <servers />
</component> </component>

View File

@@ -1,15 +1,15 @@
# 简介 # 简介
## FileTranslate ## DocuTranslate
一个使用大语言模型(llm)翻译pdf和markdown的包 一个使用大语言模型(llm)翻译pdf和markdown的包
[github主页](https://github.com/xunbu/filetranslate) [github主页](https://github.com/xunbu/docutranslate)
# 安装 # 安装
使用pip 使用pip
`pip install filetranslate` `pip install docutranslate`
使用uv 使用uv
`uv add filetranslate` `uv add docutranslate`
# 前置条件:获取大模型平台的baseurl、key、model-id # 前置条件:获取大模型平台的baseurl、key、model-id
由于需要使用大语言模型进行markdown调整与翻译,所以需要预先获取模型的baseurl、key、model-id 由于需要使用大语言模型进行markdown调整与翻译,所以需要预先获取模型的baseurl、key、model-id
@@ -17,40 +17,45 @@
# 使用方式 # 使用方式
## 使用默认参数翻译pdf ## 使用默认参数翻译pdf
```python ```python
from filetranslate.translater import FileTranslater from docutranslate.translater import FileTranslater
#不开启公式、代码识别 # 不开启公式、代码识别
FileTranslater(base_url="<baseurl>",key="<key>",model_id="<model-id>").translate_pdf_file("<pdf路径>",to_lang="中文") FileTranslater(base_url="<baseurl>", key="<key>", model_id="<model-id>").translate_pdf_file("<pdf路径>", to_lang="中文")
#开启公式、代码识别(需要下载更多模型) # 开启公式、代码识别(需要下载更多模型)
FileTranslater(base_url="<baseurl>",key="<key>",model_id="<model-id>").translate_pdf_file("<pdf路径>",to_lang="中文",formula=True,code=True) FileTranslater(base_url="<baseurl>", key="<key>", model_id="<model-id>").translate_pdf_file("<pdf路径>", to_lang="中文",
formula=True, code=True)
``` ```
> 第一次使用时需要下载模型(约1G;使用公式、代码识别需要多下载约0.5G),请稍作等待 > 第一次使用时需要下载模型(约1G;使用公式、代码识别需要多下载约0.5G),请稍作等待
> 输出文件默认放在`./output`中 > 输出文件默认放在`./output`中
## 使用不同的agent分别进行文本修正和翻译 ## 使用不同的agent分别进行文本修正和翻译
```python ```python
from filetranslate.translater import FileTranslater from docutranslate.translater import FileTranslater
translater = FileTranslater() translater = FileTranslater()
refine_agent=translater.create_refine_agent(baseurl="<baseurl-1>",key="<key-1>",model_id="<model-id-1>") refine_agent = translater.create_refine_agent(baseurl="<baseurl-1>", key="<key-1>", model_id="<model-id-1>")
translate_agent=translater.create_translate_agent(baseurl="<baseurl-2>",key="<key-2>",model_id="<model-id-2>") translate_agent = translater.create_translate_agent(baseurl="<baseurl-2>", key="<key-2>", model_id="<model-id-2>")
translater.translate_pdf_file(pdf_path="<pdf路径>",to_lang="中文",refine_agent=refine_agent,translate_agent=translate_agent) translater.translate_pdf_file(pdf_path="<pdf路径>", to_lang="中文", refine_agent=refine_agent,
translate_agent=translate_agent)
``` ```
## 参数说明 ## 参数说明
### 创建FileTranslater ### 创建FileTranslater
```python ```python
from filetranslate.translater import FileTranslater from docutranslate.translater import FileTranslater
translater = FileTranslater(base_url="<baseurl>", translater = FileTranslater(base_url="<baseurl>",
key="<key>", key="<key>",
model_id="<model-id>",#使用的模型id model_id="<model-id>", # 使用的模型id
chunksize=4000,#【可选】markdown分块长度,分块越大效果越好,不建议超过4096 chunksize=4000, # 【可选】markdown分块长度,分块越大效果越好,不建议超过4096
max_concurrent=6#【可选】并发数受到ai平台并发量限制 max_concurrent=6 # 【可选】并发数受到ai平台并发量限制
) )
``` ```
### 翻译pdf文件 ### 翻译pdf文件

View File

@@ -2,7 +2,7 @@ from functools import wraps
from typing import Concatenate, ParamSpec, Callable from typing import Concatenate, ParamSpec, Callable
import re import re
from filetranslate.utils.markdown_utils import MaskDict from docutranslate.utils.markdown_utils import MaskDict
P=ParamSpec("P") P=ParamSpec("P")
def mask_uris_temp(func:Callable[Concatenate[str, P], str]) -> Callable[Concatenate[str, P], str]: def mask_uris_temp(func:Callable[Concatenate[str, P], str]) -> Callable[Concatenate[str, P], str]:

View File

@@ -3,11 +3,11 @@ from typing import Literal
import markdown2 import markdown2
from filetranslate.decorator.markdown_mask import MaskDict from docutranslate.decorator.markdown_mask import MaskDict
from filetranslate.utils.agent_utils import Agent from docutranslate.utils.agent_utils import Agent
from filetranslate.utils.convert import pdf2markdown_embed_images from docutranslate.utils.convert import pdf2markdown_embed_images
from filetranslate.utils.markdown_splitter import split_markdown_text from docutranslate.utils.markdown_splitter import split_markdown_text
from filetranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris
class FileTranslater: class FileTranslater:

View File

@@ -1,5 +1,5 @@
[project] [project]
name = "filetranslate" name = "docutranslate"
version = "0.0.1" version = "0.0.1"
description = "能翻译pdf和markdown的软件" description = "能翻译pdf和markdown的软件"
readme = "README.md" readme = "README.md"