diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 64cb11a..9dffb3f 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -6,7 +6,8 @@ - + + - + diff --git a/docutranslate/translater.py b/docutranslate/translater.py index 8fea3fa..c83b12a 100644 --- a/docutranslate/translater.py +++ b/docutranslate/translater.py @@ -120,7 +120,7 @@ class FileTranslater: document = Document(filename=name, filebytes=file) file_path = Path(name) # 如果是markdown,直接读取 - if file_path.suffix == ".md": + if file_path.suffix in [".md", ".txt"]: self.markdown = file.decode() else: self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact) @@ -139,7 +139,7 @@ class FileTranslater: document = Document(filename=name, filebytes=file) file_path = Path(name) # 如果是markdown,直接读取 - if file_path.suffix == ".md": + if file_path.suffix in [".md", ".txt"]: self.markdown = file.decode() else: self.markdown = await self._convert2markdown_async(document, formula=formula, code=code, @@ -165,7 +165,7 @@ class FileTranslater: file_path = Path(file_path) translater_logger.info(f"读取文件:{file_path.name}") # 如果是markdown,直接读取 - if file_path.suffix == ".md": + if file_path.suffix in [".md", ".txt"]: with open(file_path, "r") as f: self.markdown = f.read() else: @@ -192,7 +192,7 @@ class FileTranslater: file_path = Path(file_path) translater_logger.info(f"读取文件:{file_path.name}") # 如果是markdown,直接读取 - if file_path.suffix == ".md": + if file_path.suffix in [".md", ".txt"]: with open(file_path, "r") as f: self.markdown = f.read() else: diff --git a/docutranslate/utils/markdown_splitter.py b/docutranslate/utils/markdown_splitter.py index 807049f..44f27b6 100644 --- a/docutranslate/utils/markdown_splitter.py +++ b/docutranslate/utils/markdown_splitter.py @@ -16,7 +16,7 @@ class MarkdownBlockSplitter: def _get_bytes(text: str) -> int: return len(text.encode('utf-8')) - #TODO: 修复分块有时候会有空白块的问题 + # TODO: 修复分块有时候会有空白块的问题 def split_markdown(self, markdown_text: str) -> List[str]: """ 将Markdown文本分割成指定大小的块 @@ -233,19 +233,21 @@ def split_markdown_text(markdown_text, max_block_size=5000): """ splitter = MarkdownBlockSplitter(max_block_size=max_block_size) chunks = splitter.split_markdown(markdown_text) - #过滤空白块 - chunks=[chunk for chunk in chunks if chunk.strip()] + # 过滤空白块 + chunks = [chunk for chunk in chunks if chunk.strip()] return chunks def join_markdown_texts(markdown_texts: list[str]) -> str: result = "" + pre = "" for text in markdown_texts: # 只有表格会收到多余空行的影响 - if text.lstrip().startswith("|"): + if text.lstrip().startswith("|") and pre.rstrip().endswith("|"): result = result + "\n" + text else: result += "\n\n" + text + pre = text return result