diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 64cb11a..9dffb3f 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -6,7 +6,8 @@
-
+
+
@@ -612,7 +613,7 @@
-
+
diff --git a/docutranslate/translater.py b/docutranslate/translater.py
index 8fea3fa..c83b12a 100644
--- a/docutranslate/translater.py
+++ b/docutranslate/translater.py
@@ -120,7 +120,7 @@ class FileTranslater:
document = Document(filename=name, filebytes=file)
file_path = Path(name)
# 如果是markdown,直接读取
- if file_path.suffix == ".md":
+ if file_path.suffix in [".md", ".txt"]:
self.markdown = file.decode()
else:
self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact)
@@ -139,7 +139,7 @@ class FileTranslater:
document = Document(filename=name, filebytes=file)
file_path = Path(name)
# 如果是markdown,直接读取
- if file_path.suffix == ".md":
+ if file_path.suffix in [".md", ".txt"]:
self.markdown = file.decode()
else:
self.markdown = await self._convert2markdown_async(document, formula=formula, code=code,
@@ -165,7 +165,7 @@ class FileTranslater:
file_path = Path(file_path)
translater_logger.info(f"读取文件:{file_path.name}")
# 如果是markdown,直接读取
- if file_path.suffix == ".md":
+ if file_path.suffix in [".md", ".txt"]:
with open(file_path, "r") as f:
self.markdown = f.read()
else:
@@ -192,7 +192,7 @@ class FileTranslater:
file_path = Path(file_path)
translater_logger.info(f"读取文件:{file_path.name}")
# 如果是markdown,直接读取
- if file_path.suffix == ".md":
+ if file_path.suffix in [".md", ".txt"]:
with open(file_path, "r") as f:
self.markdown = f.read()
else:
diff --git a/docutranslate/utils/markdown_splitter.py b/docutranslate/utils/markdown_splitter.py
index 807049f..44f27b6 100644
--- a/docutranslate/utils/markdown_splitter.py
+++ b/docutranslate/utils/markdown_splitter.py
@@ -16,7 +16,7 @@ class MarkdownBlockSplitter:
def _get_bytes(text: str) -> int:
return len(text.encode('utf-8'))
- #TODO: 修复分块有时候会有空白块的问题
+ # TODO: 修复分块有时候会有空白块的问题
def split_markdown(self, markdown_text: str) -> List[str]:
"""
将Markdown文本分割成指定大小的块
@@ -233,19 +233,21 @@ def split_markdown_text(markdown_text, max_block_size=5000):
"""
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
chunks = splitter.split_markdown(markdown_text)
- #过滤空白块
- chunks=[chunk for chunk in chunks if chunk.strip()]
+ # 过滤空白块
+ chunks = [chunk for chunk in chunks if chunk.strip()]
return chunks
def join_markdown_texts(markdown_texts: list[str]) -> str:
result = ""
+ pre = ""
for text in markdown_texts:
# 只有表格会收到多余空行的影响
- if text.lstrip().startswith("|"):
+ if text.lstrip().startswith("|") and pre.rstrip().endswith("|"):
result = result + "\n" + text
else:
result += "\n\n" + text
+ pre = text
return result