diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index b7abe93..a3b8fbf 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -6,7 +6,10 @@
+
+
+
@@ -54,6 +57,7 @@
"Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
"Python 测试.pytest (test_html.py 内).executor": "Run",
+ "Python.2test2 (1).executor": "Run",
"Python.PDFtranslater (1).executor": "Run",
"Python.PDFtranslater (2).executor": "Run",
"Python.agent.executor": "Debug",
@@ -77,7 +81,7 @@
"RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true",
"RunOnceActivity.git.unshallow": "true",
"git-widget-placeholder": "main",
- "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate",
+ "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/tests/files",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
"node.js.selected.package.eslint": "(autodetect)",
@@ -89,11 +93,11 @@
}]]>
+
-
@@ -269,6 +273,52 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -315,75 +365,6 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
@@ -481,7 +462,7 @@
-
+
@@ -495,17 +476,17 @@
-
+
-
+
+
-
-
+
@@ -580,7 +561,8 @@
-
+
+
@@ -588,11 +570,11 @@
-
+
-
+
@@ -600,9 +582,10 @@
-
-
+
+
+
diff --git a/README.md b/README.md
index 2d1630c..fe0d246 100644
--- a/README.md
+++ b/README.md
@@ -144,7 +144,7 @@ translater = FileTranslater(base_url="", # 默认的模型baseurl
key="", # 默认的模型api-key
model_id="", # 默认的模型id
chunksize=2000, # markdown分块长度(单位byte),分块越大效果越好(也越慢),不建议超过8000
- max_concurrent=15, # 并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上
+ max_concurrent=20, # 并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上
docling_artifact=None, # 使用提前下载好的docling模型
timeout=2000, # 调用api的超时时间
tips=True # 开场提示
diff --git a/docutranslate/translater.py b/docutranslate/translater.py
index 6446a8d..8edddc1 100644
--- a/docutranslate/translater.py
+++ b/docutranslate/translater.py
@@ -9,7 +9,7 @@ from docling.datamodel.document import DocumentStream
from docutranslate.agents import Agent, AgentArgs
from docutranslate.agents import MDRefineAgent, MDTranslateAgent
from docutranslate.utils.convert import file2markdown_embed_images
-from docutranslate.utils.markdown_splitter import split_markdown_text
+from docutranslate.utils.markdown_splitter import split_markdown_text,join_markdown_texts
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict
from docutranslate.logger import translater_logger
@@ -17,7 +17,7 @@ from docutranslate.logger import translater_logger
class FileTranslater:
def __init__(self, file_path: Path | str | None = None, chunksize: int = 2000, base_url="", key=None,
- model_id="", temperature=0.7, max_concurrent=15, docling_artifact: Path | str | None = None,
+ model_id="", temperature=0.7, max_concurrent=20, docling_artifact: Path | str | None = None,
timeout=2000, tips=True):
if isinstance(file_path, str):
file_path = Path(file_path)
@@ -129,7 +129,7 @@ class FileTranslater:
if refine_agent is None:
refine_agent = MDRefineAgent(**self.default_agent_params())
result: list[str] = refine_agent.send_prompts(chuncks)
- self.markdown = "\n\n".join(result)
+ self.markdown=join_markdown_texts(result)
self._unmask_uris_in_markdown()
translater_logger.info("markdown已修正")
return self.markdown
@@ -141,7 +141,7 @@ class FileTranslater:
if translate_agent is None:
translate_agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params())
result: list[str] = translate_agent.send_prompts(chuncks)
- self.markdown = "\n\n".join(result)
+ self.markdown=join_markdown_texts(result)
self._unmask_uris_in_markdown()
translater_logger.info("翻译完成")
return self.markdown
@@ -154,7 +154,7 @@ class FileTranslater:
if refine_agent is None:
refine_agent = MDRefineAgent(**self.default_agent_params())
result: list[str] = await refine_agent.send_prompts_async(chuncks)
- self.markdown = "\n\n".join(result)
+ self.markdown=join_markdown_texts(result)
self._unmask_uris_in_markdown()
translater_logger.info("markdown已修正")
return self.markdown
@@ -166,7 +166,7 @@ class FileTranslater:
if translate_agent is None:
translate_agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params())
result: list[str] = await translate_agent.send_prompts_async(chuncks)
- self.markdown = "\n\n".join(result)
+ self.markdown=join_markdown_texts(result)
self._unmask_uris_in_markdown()
translater_logger.info("翻译完成")
return self.markdown
diff --git a/docutranslate/utils/convert.py b/docutranslate/utils/convert.py
index 902f8ff..f3d23fb 100644
--- a/docutranslate/utils/convert.py
+++ b/docutranslate/utils/convert.py
@@ -17,6 +17,7 @@ def file2markdown_embed_images(file_path: Path | str|DocumentStream, formula=Fal
pipeline_options.do_ocr=False
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_picture_images = True
+ pipeline_options.table_structure_options.do_cell_matching=False
if formula:
pipeline_options.do_formula_enrichment=True
if code:
diff --git a/docutranslate/utils/markdown_splitter.py b/docutranslate/utils/markdown_splitter.py
index 2c1e394..63ec8f3 100644
--- a/docutranslate/utils/markdown_splitter.py
+++ b/docutranslate/utils/markdown_splitter.py
@@ -233,5 +233,15 @@ def split_markdown_text(markdown_text, max_block_size=5000):
return splitter.split_markdown(markdown_text)
def join_markdown_texts(markdown_texts: list[str]) -> str:
    """Join markdown chunks back into a single markdown document.

    Consecutive chunks are separated by a blank line (``"\\n\\n"``), except
    when the following chunk starts with a table row (its first
    non-whitespace character is ``|``): such a chunk is attached with a
    single newline, because only tables are affected by a superfluous
    blank line.

    No separator is emitted before the first chunk, so the result does not
    start with stray newlines and an empty input yields an empty string.

    Args:
        markdown_texts: The markdown chunks, in document order.

    Returns:
        The concatenated markdown text.
    """
    parts: list[str] = []
    for index, text in enumerate(markdown_texts):
        if index:
            # A separator goes only *between* chunks; its width depends on
            # whether the upcoming chunk is a table row.
            parts.append("\n" if text.lstrip().startswith("|") else "\n\n")
        parts.append(text)
    # Single join instead of repeated string concatenation.
    return "".join(parts)
+
if __name__ == '__main__':
pass
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index fb08b7b..70ab53a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "docutranslate"
-version = "0.2.15"
+version = "0.2.16"
description = "文件翻译工具"
readme = "README.md"
requires-python = ">=3.10"
@@ -10,6 +10,5 @@ dependencies = [
"markdown2",
"fastapi[standard]>=0.115.12",
]
-
[project.scripts]
docutranslate="docutranslate.cli:main"