From 66621fae50b5c8916f16f1723256b2a41f70359d Mon Sep 17 00:00:00 2001 From: xunbu Date: Wed, 28 May 2025 21:02:43 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E6=AD=A5=E6=94=AF=E6=8C=81txt?= =?UTF-8?q?=E7=BF=BB=E8=AF=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/converter/converter.py | 2 + docutranslate/translater.py | 141 ++++++++++++++------------- 2 files changed, 74 insertions(+), 69 deletions(-) diff --git a/docutranslate/converter/converter.py b/docutranslate/converter/converter.py index 2f94ad4..3fb5854 100644 --- a/docutranslate/converter/converter.py +++ b/docutranslate/converter/converter.py @@ -15,6 +15,8 @@ class Document: self.path=path self.filename=path.name self.filebytes=path.read_bytes() + self.suffix=Path(self.filename).suffix + self.stem=Path(self.filename).stem class Converter(Protocol): #转换为markdown diff --git a/docutranslate/translater.py b/docutranslate/translater.py index c83b12a..922b30e 100644 --- a/docutranslate/translater.py +++ b/docutranslate/translater.py @@ -1,6 +1,7 @@ import asyncio +import html from pathlib import Path -from typing import Literal +from typing import Literal, ParamSpec, TypedDict import markdown2 import jinja2 from docutranslate.agents import Agent, AgentArgs @@ -46,6 +47,7 @@ class FileTranslater: translater_logger.info("检测到docling_artifact文件夹") self.docling_artifact = artifact_path self.timeout = timeout + self.file_suffix: str | None = None # 现在处理的文件后缀如".md"、".txt" def _markdown_format(self): # 该方法还需要改进 @@ -83,7 +85,9 @@ class FileTranslater: return MDTranslateAgent(custom_prompt=custom_prompt, to_lang=to_lang, **self._default_agent_params()) def _convert2markdown(self, document: Document, formula: bool, code: bool, artifact: Path = None) -> str: - translater_logger.info(f"正在使用{self.convert_engin}转换文件为markdown") + if document.suffix in [".md", ".txt"]: + return document.filebytes.decode("utf-8") + translater_logger.info("正在转化为markdown") if self.convert_engin == "docling": if artifact is None: artifact = self.docling_artifact @@ -100,6 +104,9 @@ class FileTranslater: async def _convert2markdown_async(self, document: Document, formula: bool, code: bool, artifact: Path = None) -> str: + if document.suffix in [".md", ".txt"]: + return document.filebytes.decode("utf-8") + translater_logger.info("正在转化为markdown") if self.convert_engin == "docling": if artifact is None: artifact = self.docling_artifact @@ -114,43 +121,49 @@ class FileTranslater: result = await mdconverter.convert_async(document) return result - def read_bytes(self, name: str, file: bytes, formula=True, code=True, save=False, - save_format: Literal["markdown", "html"] = "markdown", refine=False, - refine_agent: Agent | None = None): - document = Document(filename=name, filebytes=file) - file_path = Path(name) - # 如果是markdown,直接读取 - if file_path.suffix in [".md", ".txt"]: - self.markdown = file.decode() - else: - self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact) + def read_document(self, document: Document, formula: bool, code: bool, save: bool, + save_format: Literal["markdown", "html"], refine: bool, + refine_agent: Agent | None): + self.file_suffix = document.suffix + self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact) if refine: self.refine_markdown_by_agent(refine_agent) if save: if save_format == "html": - self.save_as_html(filename=f"{file_path.stem}.html") + self.save_as_html(filename=f"{document.stem}.html") else: - self.save_as_markdown(filename=f"{file_path.stem}.md") + self.save_as_markdown(filename=f"{document.stem}.md") + return self + + async def read_document_async(self, document: Document, formula: bool, code: bool, save: bool, + save_format: Literal["markdown", "html"], refine: bool, + refine_agent: Agent | None): + self.file_suffix = document.suffix + self.markdown = await self._convert2markdown_async(document, formula=formula, code=code, + artifact=self.docling_artifact) + if refine: + await self.refine_markdown_by_agent_async(refine_agent) + if save: + if save_format == "html": + self.save_as_html(filename=f"{document.stem}.html") + else: + self.save_as_markdown(filename=f"{document.stem}.md") + return self + + def read_bytes(self, name: str, file: bytes, formula=True, code=True, save=False, + save_format: Literal["markdown", "html"] = "markdown", refine=False, + refine_agent: Agent | None = None): + document = Document(filename=name, filebytes=file) + self.read_document(document, formula=formula, code=code, save=save, save_format=save_format, + refine=refine, refine_agent=refine_agent) return self async def read_bytes_async(self, name: str, file: bytes, formula=True, code=True, save=False, save_format: Literal["markdown", "html"] = "markdown", refine=False, refine_agent: Agent | None = None): document = Document(filename=name, filebytes=file) - file_path = Path(name) - # 如果是markdown,直接读取 - if file_path.suffix in [".md", ".txt"]: - self.markdown = file.decode() - else: - self.markdown = await self._convert2markdown_async(document, formula=formula, code=code, - artifact=self.docling_artifact) - if refine: - await self.refine_markdown_by_agent_async(refine_agent) - if save: - if save_format == "html": - self.save_as_html(filename=f"{file_path.stem}.html") - else: - self.save_as_markdown(filename=f"{file_path.stem}.md") + await self.read_document_async(document, formula=formula, code=code, save=save, save_format=save_format, + refine=refine, refine_agent=refine_agent) return self def read_file(self, file_path: Path | str | None = None, formula=True, code=True, save=False, @@ -161,23 +174,10 @@ class FileTranslater: translater_logger.debug("未设置文件路径") raise Exception("未设置文件路径") file_path = self.file_path - if isinstance(file_path, str): - file_path = Path(file_path) - translater_logger.info(f"读取文件:{file_path.name}") - # 如果是markdown,直接读取 - if file_path.suffix in [".md", ".txt"]: - with open(file_path, "r") as f: - self.markdown = f.read() - else: - document = Document(file_path) - self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact) - if refine: - self.refine_markdown_by_agent(refine_agent) - if save: - if save_format == "html": - self.save_as_html(filename=f"{file_path.stem}.html") - else: - self.save_as_markdown(filename=f"{file_path.stem}.md") + document = Document(path=file_path) + translater_logger.info(f"读取文件:{document.filename}") + self.read_document(document, formula=formula, code=code, save=save, save_format=save_format, refine=refine, + refine_agent=refine_agent) return self async def read_file_async(self, file_path: Path | str | None = None, formula=True, code=True, save=False, @@ -188,24 +188,11 @@ class FileTranslater: translater_logger.debug("未设置文件路径") raise Exception("未设置文件路径") file_path = self.file_path - if isinstance(file_path, str): - file_path = Path(file_path) - translater_logger.info(f"读取文件:{file_path.name}") + document = Document(file_path) + translater_logger.info(f"读取文件:{document.filename}") # 如果是markdown,直接读取 - if file_path.suffix in [".md", ".txt"]: - with open(file_path, "r") as f: - self.markdown = f.read() - else: - document = Document(file_path) - self.markdown = await self._convert2markdown_async(document, formula=formula, code=code, - artifact=self.docling_artifact) - if refine: - await self.refine_markdown_by_agent_async(refine_agent) - if save: - if save_format == "html": - self.save_as_html(filename=f"{file_path.stem}.html") - else: - self.save_as_markdown(filename=f"{file_path.stem}.md") + await self.read_document_async(document, formula=formula, code=code, save=save, save_format=save_format, + refine=refine, refine_agent=refine_agent) return self def refine_markdown_by_agent(self, refine_agent: Agent | None = None, custom_prompt=None) -> str: @@ -215,7 +202,10 @@ class FileTranslater: if refine_agent is None: refine_agent = self.default_refine_agent(custom_prompt) result: list[str] = refine_agent.send_prompts(chuncks) - self.markdown = join_markdown_texts(result) + if self.file_suffix == ".txt": + self.markdown = "\n".join(result) + else: + self.markdown = join_markdown_texts(result) self._unmask_uris_in_markdown() translater_logger.info("markdown已修正") return self.markdown @@ -227,7 +217,10 @@ class FileTranslater: if translate_agent is None: translate_agent = self.default_translate_agent(custom_prompt=custom_prompt, to_lang=to_lang) result: list[str] = translate_agent.send_prompts(chuncks) - self.markdown = join_markdown_texts(result) + if self.file_suffix == ".txt": + self.markdown = "\n".join(result) + else: + self.markdown = join_markdown_texts(result) self._unmask_uris_in_markdown() translater_logger.info("翻译完成") return self.markdown @@ -239,7 +232,10 @@ class FileTranslater: if refine_agent is None: refine_agent = self.default_refine_agent(custom_prompt=custom_prompt) result: list[str] = await refine_agent.send_prompts_async(chuncks) - self.markdown = join_markdown_texts(result) + if self.file_suffix == ".txt": + self.markdown = "\n".join(result) + else: + self.markdown = join_markdown_texts(result) self._unmask_uris_in_markdown() translater_logger.info("markdown已修正") return self.markdown @@ -252,7 +248,10 @@ class FileTranslater: if translate_agent is None: translate_agent = self.default_translate_agent(to_lang=to_lang, custom_prompt=custom_prompt) result: list[str] = await translate_agent.send_prompts_async(chuncks) - self.markdown = join_markdown_texts(result) + if self.file_suffix == ".txt": + self.markdown = "\n".join(result) + else: + self.markdown = join_markdown_texts(result) self._unmask_uris_in_markdown() translater_logger.info("翻译完成") return self.markdown @@ -305,19 +304,23 @@ class FileTranslater: markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"]) # language=html pico = f"" - html = resource_path("template/markdown.html").read_text(encoding='utf-8') + html_template = resource_path("template/markdown.html").read_text(encoding='utf-8') katex_css = f"" if not cdn else r"""""" katex_js = f"" if not cdn else r"""""" auto_render = f'' if not cdn else r"""""" mermaid = f'' + if self.file_suffix == ".txt": + content = html.escape(self.markdown).replace("\n", "
") + else: + content = markdowner.convert(self.markdown.replace("\\", "\\\\")) # TODO:实现MathJax本地化 - render = jinja2.Template(html).render( + render = jinja2.Template(html_template).render( title=title, pico=pico, katexCss=katex_css, katexJs=katex_js, autoRender=auto_render, - markdown=markdowner.convert(self.markdown.replace("\\", "\\\\")), + markdown=content, mermaid=mermaid, ) return render