添加缓存功能,优化提示词
This commit is contained in:
@@ -56,7 +56,7 @@ class MDTranslateAgent(Agent):
|
|||||||
不输出任何解释和注释
|
不输出任何解释和注释
|
||||||
不能改变形如<ph-xxxxxx>的占位符
|
不能改变形如<ph-xxxxxx>的占位符
|
||||||
code、latex和HTML只翻译说明文字,其余保持原文
|
code、latex和HTML只翻译说明文字,其余保持原文
|
||||||
公式无论长短必须表示为能被解析的合法latex公式,公式需被$或\\(\\)或$$正确包裹
|
公式无论长短必须表示为能被解析的合法latex公式,公式需被$或\\(\\)或$$正确包裹,如不正确则进行修正
|
||||||
去除、修正明显异常的字符、但不能改变原意
|
去除、修正明显异常的字符、但不能改变原意
|
||||||
参考文献的引用及其作者名保持源语言不要翻译。引用的参考文献示例如下:
|
参考文献的引用及其作者名保持源语言不要翻译。引用的参考文献示例如下:
|
||||||
> [1] Nofer M, Gomber P, Hinz O, et al. Blockchain[J]. systems engineering, 2017, 59: 183-187.
|
> [1] Nofer M, Gomber P, Hinz O, et al. Blockchain[J]. systems engineering, 2017, 59: 183-187.
|
||||||
|
|||||||
1
docutranslate/cache/__init__.py
vendored
Normal file
1
docutranslate/cache/__init__.py
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
from .document_cacher import DocumentCacher, document_cacher_global
|
||||||
21
docutranslate/cache/document_cacher.py
vendored
Normal file
21
docutranslate/cache/document_cacher.py
vendored
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from docutranslate.converter import Document
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentCacher:
|
||||||
|
def __init__(self):
|
||||||
|
self.cache_dict:dict[str:str] = {}
|
||||||
|
@staticmethod
|
||||||
|
def _get_hashcode(document: Document, formula: bool, code: bool, convert_engin: str) -> str:
|
||||||
|
obj = (document.suffix, document.filebytes, formula, code, convert_engin)
|
||||||
|
return str(hash(obj))
|
||||||
|
|
||||||
|
def get_cached_result(self, document: Document, formula: bool, code: bool, convert_engin: str)->str|None:
|
||||||
|
return self.cache_dict.get(self._get_hashcode(document, formula, code, convert_engin))
|
||||||
|
|
||||||
|
def cache_result(self, result: str, document: Document, formula: bool, code: bool, convert_engin: str):
|
||||||
|
hash_code = self._get_hashcode(document, formula, code, convert_engin)
|
||||||
|
self.cache_dict[hash_code] = result
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level DocumentCacher instance, created once at import time.
document_cacher_global = DocumentCacher()
|
||||||
@@ -6,6 +6,7 @@ import markdown2
|
|||||||
import jinja2
|
import jinja2
|
||||||
from docutranslate.agents import Agent, AgentArgs
|
from docutranslate.agents import Agent, AgentArgs
|
||||||
from docutranslate.agents import MDRefineAgent, MDTranslateAgent
|
from docutranslate.agents import MDRefineAgent, MDTranslateAgent
|
||||||
|
from docutranslate.cache import document_cacher_global
|
||||||
from docutranslate.converter import Document, ConverterMineru
|
from docutranslate.converter import Document, ConverterMineru
|
||||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||||
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict
|
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict
|
||||||
@@ -24,7 +25,7 @@ class FileTranslater:
|
|||||||
max_concurrent=20, timeout=2000,
|
max_concurrent=20, timeout=2000,
|
||||||
convert_engin: Literal["docling", "mineru"] = "mineru",
|
convert_engin: Literal["docling", "mineru"] = "mineru",
|
||||||
docling_artifact: Path | str | None = None,
|
docling_artifact: Path | str | None = None,
|
||||||
mineru_token: str = None):
|
mineru_token: str = None, cache=True):
|
||||||
self.convert_engin = convert_engin
|
self.convert_engin = convert_engin
|
||||||
self.mineru_token = mineru_token.strip() if mineru_token is not None else None
|
self.mineru_token = mineru_token.strip() if mineru_token is not None else None
|
||||||
if isinstance(file_path, str):
|
if isinstance(file_path, str):
|
||||||
@@ -48,6 +49,8 @@ class FileTranslater:
|
|||||||
self.docling_artifact = artifact_path
|
self.docling_artifact = artifact_path
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
self.file_suffix: str | None = None # 现在处理的文件后缀如".md"、".txt"
|
self.file_suffix: str | None = None # 现在处理的文件后缀如".md"、".txt"
|
||||||
|
self.cache = cache
|
||||||
|
self.cacher=document_cacher_global
|
||||||
|
|
||||||
def _markdown_format(self):
|
def _markdown_format(self):
|
||||||
# 该方法还需要改进
|
# 该方法还需要改进
|
||||||
@@ -86,7 +89,13 @@ class FileTranslater:
|
|||||||
def default_translate_agent(self, custom_prompt=None, to_lang="中文") -> MDTranslateAgent:
|
def default_translate_agent(self, custom_prompt=None, to_lang="中文") -> MDTranslateAgent:
|
||||||
return MDTranslateAgent(custom_prompt=custom_prompt, to_lang=to_lang, **self._default_agent_params())
|
return MDTranslateAgent(custom_prompt=custom_prompt, to_lang=to_lang, **self._default_agent_params())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _convert2markdown(self, document: Document, formula: bool, code: bool, artifact: Path = None) -> str:
|
def _convert2markdown(self, document: Document, formula: bool, code: bool, artifact: Path = None) -> str:
|
||||||
|
cached_result = self.cacher.get_cached_result(document, formula, code, convert_engin=self.convert_engin)
|
||||||
|
if cached_result:
|
||||||
|
translater_logger.info("正在获取缓存结果")
|
||||||
|
return cached_result
|
||||||
if document.suffix in [".md", ".txt"]:
|
if document.suffix in [".md", ".txt"]:
|
||||||
return document.filebytes.decode("utf-8")
|
return document.filebytes.decode("utf-8")
|
||||||
translater_logger.info("正在转化为markdown")
|
translater_logger.info("正在转化为markdown")
|
||||||
@@ -102,10 +111,14 @@ class FileTranslater:
|
|||||||
translater_logger.info("mineru暂不支持code识别")
|
translater_logger.info("mineru暂不支持code识别")
|
||||||
mdconverter = ConverterMineru(token=self.mineru_token, formula=formula)
|
mdconverter = ConverterMineru(token=self.mineru_token, formula=formula)
|
||||||
result = mdconverter.convert(document)
|
result = mdconverter.convert(document)
|
||||||
return result
|
return self.cacher.cache_result(result, document, formula, code, convert_engin=self.convert_engin)
|
||||||
|
|
||||||
async def _convert2markdown_async(self, document: Document, formula: bool, code: bool,
|
async def _convert2markdown_async(self, document: Document, formula: bool, code: bool,
|
||||||
artifact: Path = None) -> str:
|
artifact: Path = None) -> str:
|
||||||
|
cached_result = self.cacher.get_cached_result(document, formula, code, convert_engin=self.convert_engin)
|
||||||
|
if cached_result:
|
||||||
|
translater_logger.info("解析结果已缓存,获取缓存结果")
|
||||||
|
return cached_result
|
||||||
if document.suffix in [".md", ".txt"]:
|
if document.suffix in [".md", ".txt"]:
|
||||||
return document.filebytes.decode("utf-8")
|
return document.filebytes.decode("utf-8")
|
||||||
translater_logger.info("正在转化为markdown")
|
translater_logger.info("正在转化为markdown")
|
||||||
@@ -121,7 +134,7 @@ class FileTranslater:
|
|||||||
translater_logger.info("mineru暂不支持code识别")
|
translater_logger.info("mineru暂不支持code识别")
|
||||||
mdconverter = ConverterMineru(token=self.mineru_token, formula=formula)
|
mdconverter = ConverterMineru(token=self.mineru_token, formula=formula)
|
||||||
result = await mdconverter.convert_async(document)
|
result = await mdconverter.convert_async(document)
|
||||||
return result
|
return self.cacher.cache_result(result, document, formula, code, convert_engin=self.convert_engin)
|
||||||
|
|
||||||
def read_document(self, document: Document, formula: bool, code: bool, save: bool,
|
def read_document(self, document: Document, formula: bool, code: bool, save: bool,
|
||||||
save_format: Literal["markdown", "html"], refine: bool,
|
save_format: Literal["markdown", "html"], refine: bool,
|
||||||
@@ -296,9 +309,9 @@ class FileTranslater:
|
|||||||
# 确保输出目录存在
|
# 确保输出目录存在
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
full_name = output_dir / filename
|
full_name = output_dir / filename
|
||||||
html = self.export_to_html(str(filename.resolve().stem))
|
html_content = self.export_to_html(str(filename.resolve().stem))
|
||||||
with open(full_name, "w") as file:
|
with open(full_name, "w") as file:
|
||||||
file.write(html)
|
file.write(html_content)
|
||||||
translater_logger.info(f"文件已写入{full_name.resolve()}")
|
translater_logger.info(f"文件已写入{full_name.resolve()}")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user