From 686b73e840f20792297257d4ef7e86a32f6b99a1 Mon Sep 17 00:00:00 2001 From: xunbu Date: Wed, 28 May 2025 13:58:13 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E8=A1=A8=E6=A0=BC=E5=90=88?= =?UTF-8?q?=E5=B9=B6=E9=80=BB=E8=BE=91=EF=BC=9Btxt=E7=9B=B4=E6=8E=A5?= =?UTF-8?q?=E8=AF=BB=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/workspace.xml | 5 +++-- docutranslate/translater.py | 8 ++++---- docutranslate/utils/markdown_splitter.py | 10 ++++++---- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 64cb11a..9dffb3f 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -6,7 +6,8 @@ - + + - + diff --git a/docutranslate/translater.py b/docutranslate/translater.py index 8fea3fa..c83b12a 100644 --- a/docutranslate/translater.py +++ b/docutranslate/translater.py @@ -120,7 +120,7 @@ class FileTranslater: document = Document(filename=name, filebytes=file) file_path = Path(name) # 如果是markdown,直接读取 - if file_path.suffix == ".md": + if file_path.suffix in [".md", ".txt"]: self.markdown = file.decode() else: self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact) @@ -139,7 +139,7 @@ class FileTranslater: document = Document(filename=name, filebytes=file) file_path = Path(name) # 如果是markdown,直接读取 - if file_path.suffix == ".md": + if file_path.suffix in [".md", ".txt"]: self.markdown = file.decode() else: self.markdown = await self._convert2markdown_async(document, formula=formula, code=code, @@ -165,7 +165,7 @@ class FileTranslater: file_path = Path(file_path) translater_logger.info(f"读取文件:{file_path.name}") # 如果是markdown,直接读取 - if file_path.suffix == ".md": + if file_path.suffix in [".md", ".txt"]: with open(file_path, "r") as f: self.markdown = f.read() else: @@ -192,7 +192,7 @@ class FileTranslater: file_path = Path(file_path) translater_logger.info(f"读取文件:{file_path.name}") # 如果是markdown,直接读取 - if file_path.suffix == ".md": + if file_path.suffix in [".md", ".txt"]: with open(file_path, "r") as f: self.markdown = f.read() else: diff --git a/docutranslate/utils/markdown_splitter.py b/docutranslate/utils/markdown_splitter.py index 807049f..44f27b6 100644 --- a/docutranslate/utils/markdown_splitter.py +++ b/docutranslate/utils/markdown_splitter.py @@ -16,7 +16,7 @@ class MarkdownBlockSplitter: def _get_bytes(text: str) -> int: return len(text.encode('utf-8')) - #TODO: 修复分块有时候会有空白块的问题 + # TODO: 修复分块有时候会有空白块的问题 def split_markdown(self, markdown_text: str) -> List[str]: """ 将Markdown文本分割成指定大小的块 @@ -233,19 +233,21 @@ def split_markdown_text(markdown_text, max_block_size=5000): """ splitter = MarkdownBlockSplitter(max_block_size=max_block_size) chunks = splitter.split_markdown(markdown_text) - #过滤空白块 - chunks=[chunk for chunk in chunks if chunk.strip()] + # 过滤空白块 + chunks = [chunk for chunk in chunks if chunk.strip()] return chunks def join_markdown_texts(markdown_texts: list[str]) -> str: result = "" + pre = "" for text in markdown_texts: # 只有表格会收到多余空行的影响 - if text.lstrip().startswith("|"): + if text.lstrip().startswith("|") and pre.rstrip().endswith("|"): result = result + "\n" + text else: result += "\n\n" + text + pre = text return result