优化表格合并逻辑;txt直接读取
This commit is contained in:
5
.idea/workspace.xml
generated
5
.idea/workspace.xml
generated
@@ -6,7 +6,8 @@
|
|||||||
<component name="ChangeListManager">
|
<component name="ChangeListManager">
|
||||||
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
||||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||||
<change beforePath="$PROJECT_DIR$/docutranslate/agents/markdown_agent.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/agents/markdown_agent.py" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" afterDir="false" />
|
||||||
</list>
|
</list>
|
||||||
<option name="SHOW_DIALOG" value="false" />
|
<option name="SHOW_DIALOG" value="false" />
|
||||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||||
@@ -612,7 +613,7 @@
|
|||||||
</option>
|
</option>
|
||||||
</component>
|
</component>
|
||||||
<component name="com.intellij.coverage.CoverageDataManagerImpl">
|
<component name="com.intellij.coverage.CoverageDataManagerImpl">
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$app_test__1_.coverage" NAME="app_test (1) 覆盖结果" MODIFIED="1748402710866" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$app_test__1_.coverage" NAME="app_test (1) 覆盖结果" MODIFIED="1748410339788" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1747472297913" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1747472297913" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746963490689" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746963490689" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ class FileTranslater:
|
|||||||
document = Document(filename=name, filebytes=file)
|
document = Document(filename=name, filebytes=file)
|
||||||
file_path = Path(name)
|
file_path = Path(name)
|
||||||
# 如果是markdown,直接读取
|
# 如果是markdown,直接读取
|
||||||
if file_path.suffix == ".md":
|
if file_path.suffix in [".md", ".txt"]:
|
||||||
self.markdown = file.decode()
|
self.markdown = file.decode()
|
||||||
else:
|
else:
|
||||||
self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact)
|
self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact)
|
||||||
@@ -139,7 +139,7 @@ class FileTranslater:
|
|||||||
document = Document(filename=name, filebytes=file)
|
document = Document(filename=name, filebytes=file)
|
||||||
file_path = Path(name)
|
file_path = Path(name)
|
||||||
# 如果是markdown,直接读取
|
# 如果是markdown,直接读取
|
||||||
if file_path.suffix == ".md":
|
if file_path.suffix in [".md", ".txt"]:
|
||||||
self.markdown = file.decode()
|
self.markdown = file.decode()
|
||||||
else:
|
else:
|
||||||
self.markdown = await self._convert2markdown_async(document, formula=formula, code=code,
|
self.markdown = await self._convert2markdown_async(document, formula=formula, code=code,
|
||||||
@@ -165,7 +165,7 @@ class FileTranslater:
|
|||||||
file_path = Path(file_path)
|
file_path = Path(file_path)
|
||||||
translater_logger.info(f"读取文件:{file_path.name}")
|
translater_logger.info(f"读取文件:{file_path.name}")
|
||||||
# 如果是markdown,直接读取
|
# 如果是markdown,直接读取
|
||||||
if file_path.suffix == ".md":
|
if file_path.suffix in [".md", ".txt"]:
|
||||||
with open(file_path, "r") as f:
|
with open(file_path, "r") as f:
|
||||||
self.markdown = f.read()
|
self.markdown = f.read()
|
||||||
else:
|
else:
|
||||||
@@ -192,7 +192,7 @@ class FileTranslater:
|
|||||||
file_path = Path(file_path)
|
file_path = Path(file_path)
|
||||||
translater_logger.info(f"读取文件:{file_path.name}")
|
translater_logger.info(f"读取文件:{file_path.name}")
|
||||||
# 如果是markdown,直接读取
|
# 如果是markdown,直接读取
|
||||||
if file_path.suffix == ".md":
|
if file_path.suffix in [".md", ".txt"]:
|
||||||
with open(file_path, "r") as f:
|
with open(file_path, "r") as f:
|
||||||
self.markdown = f.read()
|
self.markdown = f.read()
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ class MarkdownBlockSplitter:
|
|||||||
def _get_bytes(text: str) -> int:
|
def _get_bytes(text: str) -> int:
|
||||||
return len(text.encode('utf-8'))
|
return len(text.encode('utf-8'))
|
||||||
|
|
||||||
#TODO: 修复分块有时候会有空白块的问题
|
# TODO: 修复分块有时候会有空白块的问题
|
||||||
def split_markdown(self, markdown_text: str) -> List[str]:
|
def split_markdown(self, markdown_text: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
将Markdown文本分割成指定大小的块
|
将Markdown文本分割成指定大小的块
|
||||||
@@ -233,19 +233,21 @@ def split_markdown_text(markdown_text, max_block_size=5000):
|
|||||||
"""
|
"""
|
||||||
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
|
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
|
||||||
chunks = splitter.split_markdown(markdown_text)
|
chunks = splitter.split_markdown(markdown_text)
|
||||||
#过滤空白块
|
# 过滤空白块
|
||||||
chunks=[chunk for chunk in chunks if chunk.strip()]
|
chunks = [chunk for chunk in chunks if chunk.strip()]
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
def join_markdown_texts(markdown_texts: list[str]) -> str:
|
def join_markdown_texts(markdown_texts: list[str]) -> str:
|
||||||
result = ""
|
result = ""
|
||||||
|
pre = ""
|
||||||
for text in markdown_texts:
|
for text in markdown_texts:
|
||||||
# 只有表格会收到多余空行的影响
|
# 只有表格会收到多余空行的影响
|
||||||
if text.lstrip().startswith("|"):
|
if text.lstrip().startswith("|") and pre.rstrip().endswith("|"):
|
||||||
result = result + "\n" + text
|
result = result + "\n" + text
|
||||||
else:
|
else:
|
||||||
result += "\n\n" + text
|
result += "\n\n" + text
|
||||||
|
pre = text
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user