From 49fe7b6db33d857aa91dad26c8951130262de7af Mon Sep 17 00:00:00 2001 From: xunbu Date: Thu, 29 May 2025 22:52:38 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E7=BC=93=E5=AD=98=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=EF=BC=8C=E4=BC=98=E5=8C=96=E6=8F=90=E7=A4=BA=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/agents/markdown_agent.py | 2 +- docutranslate/cache/__init__.py | 1 + docutranslate/cache/document_cacher.py | 21 +++++++++++++++++++++ docutranslate/translater.py | 25 +++++++++++++++++++------ 4 files changed, 42 insertions(+), 7 deletions(-) create mode 100644 docutranslate/cache/__init__.py create mode 100644 docutranslate/cache/document_cacher.py diff --git a/docutranslate/agents/markdown_agent.py b/docutranslate/agents/markdown_agent.py index 3d7d028..5ba0acf 100644 --- a/docutranslate/agents/markdown_agent.py +++ b/docutranslate/agents/markdown_agent.py @@ -56,7 +56,7 @@ class MDTranslateAgent(Agent): 不输出任何解释和注释 不能改变形如的占位符 code、latex和HTML只翻译说明文字,其余保持原文 -公式无论长短必须表示为能被解析的合法latex公式,公式需被$或\\(\\)或$$正确包裹 +公式无论长短必须表示为能被解析的合法latex公式,公式需被$或\\(\\)或$$正确包裹,如不正确则进行修正 去除、修正明显异常的字符、但不能改变原意 参考文献的引用及其作者名保持源语言不要翻译。引用的参考文献示例如下: > [1] Nofer M, Gomber P, Hinz O, et al. Blockchain[J]. systems engineering, 2017, 59: 183-187. diff --git a/docutranslate/cache/__init__.py b/docutranslate/cache/__init__.py new file mode 100644 index 0000000..95b49d2 --- /dev/null +++ b/docutranslate/cache/__init__.py @@ -0,0 +1 @@ +from .document_cacher import DocumentCacher, document_cacher_global diff --git a/docutranslate/cache/document_cacher.py b/docutranslate/cache/document_cacher.py new file mode 100644 index 0000000..0805765 --- /dev/null +++ b/docutranslate/cache/document_cacher.py @@ -0,0 +1,21 @@ +from docutranslate.converter import Document + + +class DocumentCacher: + def __init__(self): + self.cache_dict:dict[str:str] = {} + @staticmethod + def _get_hashcode(document: Document, formula: bool, code: bool, convert_engin: str) -> str: + obj = (document.suffix, document.filebytes, formula, code, convert_engin) + return str(hash(obj)) + + def get_cached_result(self, document: Document, formula: bool, code: bool, convert_engin: str)->str|None: + return self.cache_dict.get(self._get_hashcode(document, formula, code, convert_engin)) + + def cache_result(self, result: str, document: Document, formula: bool, code: bool, convert_engin: str): + hash_code = self._get_hashcode(document, formula, code, convert_engin) + self.cache_dict[hash_code] = result + return result + + +document_cacher_global = DocumentCacher() diff --git a/docutranslate/translater.py b/docutranslate/translater.py index 1319af1..42e048f 100644 --- a/docutranslate/translater.py +++ b/docutranslate/translater.py @@ -6,6 +6,7 @@ import markdown2 import jinja2 from docutranslate.agents import Agent, AgentArgs from docutranslate.agents import MDRefineAgent, MDTranslateAgent +from docutranslate.cache import document_cacher_global from docutranslate.converter import Document, ConverterMineru from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict @@ -24,7 +25,7 @@ class FileTranslater: max_concurrent=20, timeout=2000, convert_engin: Literal["docling", "mineru"] = "mineru", docling_artifact: Path | str | None = None, - mineru_token: str = None): + mineru_token: str = None, cache=True): self.convert_engin = convert_engin self.mineru_token = mineru_token.strip() if mineru_token is not None else None if isinstance(file_path, str): @@ -48,11 +49,13 @@ class FileTranslater: self.docling_artifact = artifact_path self.timeout = timeout self.file_suffix: str | None = None # 现在处理的文件后缀如".md"、".txt" + self.cache = cache + self.cacher=document_cacher_global def _markdown_format(self): # 该方法还需要改进 # self.markdown=mdformat.text(self.markdown) - self.markdown=self.markdown.replace(r'\(',r'\(') + self.markdown = self.markdown.replace(r'\(', r'\(') self.markdown = self.markdown.replace(r'\)', r'\)') pass @@ -86,7 +89,13 @@ class FileTranslater: def default_translate_agent(self, custom_prompt=None, to_lang="中文") -> MDTranslateAgent: return MDTranslateAgent(custom_prompt=custom_prompt, to_lang=to_lang, **self._default_agent_params()) + + def _convert2markdown(self, document: Document, formula: bool, code: bool, artifact: Path = None) -> str: + cached_result = self.cacher.get_cached_result(document, formula, code, convert_engin=self.convert_engin) + if cached_result: + translater_logger.info("正在获取缓存结果") + return cached_result if document.suffix in [".md", ".txt"]: return document.filebytes.decode("utf-8") translater_logger.info("正在转化为markdown") @@ -102,10 +111,14 @@ class FileTranslater: translater_logger.info("mineru暂不支持code识别") mdconverter = ConverterMineru(token=self.mineru_token, formula=formula) result = mdconverter.convert(document) - return result + return self.cacher.cache_result(result, document, formula, code, convert_engin=self.convert_engin) async def _convert2markdown_async(self, document: Document, formula: bool, code: bool, artifact: Path = None) -> str: + cached_result = self.cacher.get_cached_result(document, formula, code, convert_engin=self.convert_engin) + if cached_result: + translater_logger.info("解析结果已缓存,获取缓存结果") + return cached_result if document.suffix in [".md", ".txt"]: return document.filebytes.decode("utf-8") translater_logger.info("正在转化为markdown") @@ -121,7 +134,7 @@ class FileTranslater: translater_logger.info("mineru暂不支持code识别") mdconverter = ConverterMineru(token=self.mineru_token, formula=formula) result = await mdconverter.convert_async(document) - return result + return self.cacher.cache_result(result, document, formula, code, convert_engin=self.convert_engin) def read_document(self, document: Document, formula: bool, code: bool, save: bool, save_format: Literal["markdown", "html"], refine: bool, @@ -296,9 +309,9 @@ class FileTranslater: # 确保输出目录存在 output_dir.mkdir(parents=True, exist_ok=True) full_name = output_dir / filename - html = self.export_to_html(str(filename.resolve().stem)) + html_content = self.export_to_html(str(filename.resolve().stem)) with open(full_name, "w") as file: - file.write(html) + file.write(html_content) translater_logger.info(f"文件已写入{full_name.resolve()}") return self