From 80634fe74923cc8585d3b843a82b79d20ea0090c Mon Sep 17 00:00:00 2001 From: xunbu Date: Mon, 28 Jul 2025 23:41:35 +0800 Subject: [PATCH] =?UTF-8?q?=E9=87=8D=E6=9E=84=E4=BB=A3=E7=A0=81=EF=BC=8C?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E4=BA=86MarkdownBasedManager=E5=92=8CTXTMana?= =?UTF-8?q?ger=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/__init__.py | 2 +- docutranslate/agents/agent.py | 28 ++--- docutranslate/agents/markdown_agent.py | 7 +- docutranslate/agents/txt_agent.py | 29 +++++ docutranslate/cacher/__init__.py | 2 +- docutranslate/cacher/document_cacher.py | 30 ------ .../cacher/md_based_convert_cacher.py | 36 +++++++ docutranslate/converter/__init__.py | 12 +-- docutranslate/converter/converter.py | 27 ----- docutranslate/converter/interfaces.py | 12 +++ docutranslate/converter/x2md/__init__.py | 0 .../converter/{ => x2md}/converter_docling.py | 44 +++++--- .../converter/x2md/converter_identity.py | 15 +++ .../converter/{ => x2md}/converter_mineru.py | 88 +++++++++------ docutranslate/converter/x2md/interfaces.py | 22 ++++ docutranslate/document_context/__init__.py | 0 .../document_context/md_mask_context.py | 15 +++ docutranslate/exporter/__init__.py | 3 + docutranslate/exporter/export_config.py | 8 ++ docutranslate/exporter/interfaces.py | 16 +++ docutranslate/exporter/md2x/__init__.py | 0 docutranslate/exporter/md2x/interfaces.py | 12 +++ .../exporter/md2x/md2html_exporter.py | 73 +++++++++++++ docutranslate/exporter/md2x/md2md_exporter.py | 26 +++++ docutranslate/exporter/md2x/types.py | 4 + docutranslate/exporter/txt2x/__init__.py | 0 docutranslate/exporter/txt2x/interfaces.py | 8 ++ .../exporter/txt2x/txt2html_exporter.py | 33 ++++++ docutranslate/ir/__init__.py | 0 docutranslate/ir/document.py | 24 +++++ docutranslate/ir/markdown_document.py | 7 ++ docutranslate/manager/__init__.py | 0 docutranslate/manager/base_manager.py | 51 +++++++++ docutranslate/manager/interfaces.py | 
34 ++++++ docutranslate/manager/md_based_manager.py | 102 ++++++++++++++++++ docutranslate/manager/txt_manager.py | 66 ++++++++++++ docutranslate/template/txt.html | 17 +++ docutranslate/translater.py | 10 +- docutranslate/translater/__init__.py | 0 docutranslate/translater/base.py | 16 +++ docutranslate/translater/interfaces.py | 21 ++++ docutranslate/translater/md_translator.py | 70 ++++++++++++ docutranslate/translater/txt_translator.py | 50 +++++++++ docutranslate/utils/markdown_splitter.py | 2 +- docutranslate/utils/markdown_utils.py | 2 +- 45 files changed, 885 insertions(+), 139 deletions(-) create mode 100644 docutranslate/agents/txt_agent.py delete mode 100644 docutranslate/cacher/document_cacher.py create mode 100644 docutranslate/cacher/md_based_convert_cacher.py delete mode 100644 docutranslate/converter/converter.py create mode 100644 docutranslate/converter/interfaces.py create mode 100644 docutranslate/converter/x2md/__init__.py rename docutranslate/converter/{ => x2md}/converter_docling.py (65%) create mode 100644 docutranslate/converter/x2md/converter_identity.py rename docutranslate/converter/{ => x2md}/converter_mineru.py (72%) create mode 100644 docutranslate/converter/x2md/interfaces.py create mode 100644 docutranslate/document_context/__init__.py create mode 100644 docutranslate/document_context/md_mask_context.py create mode 100644 docutranslate/exporter/__init__.py create mode 100644 docutranslate/exporter/export_config.py create mode 100644 docutranslate/exporter/interfaces.py create mode 100644 docutranslate/exporter/md2x/__init__.py create mode 100644 docutranslate/exporter/md2x/interfaces.py create mode 100644 docutranslate/exporter/md2x/md2html_exporter.py create mode 100644 docutranslate/exporter/md2x/md2md_exporter.py create mode 100644 docutranslate/exporter/md2x/types.py create mode 100644 docutranslate/exporter/txt2x/__init__.py create mode 100644 docutranslate/exporter/txt2x/interfaces.py create mode 100644 
docutranslate/exporter/txt2x/txt2html_exporter.py create mode 100644 docutranslate/ir/__init__.py create mode 100644 docutranslate/ir/document.py create mode 100644 docutranslate/ir/markdown_document.py create mode 100644 docutranslate/manager/__init__.py create mode 100644 docutranslate/manager/base_manager.py create mode 100644 docutranslate/manager/interfaces.py create mode 100644 docutranslate/manager/md_based_manager.py create mode 100644 docutranslate/manager/txt_manager.py create mode 100644 docutranslate/template/txt.html create mode 100644 docutranslate/translater/__init__.py create mode 100644 docutranslate/translater/base.py create mode 100644 docutranslate/translater/interfaces.py create mode 100644 docutranslate/translater/md_translator.py create mode 100644 docutranslate/translater/txt_translator.py diff --git a/docutranslate/__init__.py b/docutranslate/__init__.py index 5281f42..509d45b 100644 --- a/docutranslate/__init__.py +++ b/docutranslate/__init__.py @@ -2,4 +2,4 @@ __version__="0.3.4b1" -from .translater import FileTranslater \ No newline at end of file +# from .translater import FileTranslater \ No newline at end of file diff --git a/docutranslate/agents/agent.py b/docutranslate/agents/agent.py index d22464d..c1ff2b9 100644 --- a/docutranslate/agents/agent.py +++ b/docutranslate/agents/agent.py @@ -17,23 +17,23 @@ class AgentArgs(TypedDict, total=False): baseurl: str key: str model_id: str - system_prompt: str + system_prompt: str | None temperature: float max_concurrent: int timeout: int - logger:logging.Logger + logger: logging.Logger class TotalErrorCounter: - def __init__(self,logger:logging.Logger): + def __init__(self, logger: logging.Logger): self.lock = Lock() self.count = 0 - self.logger=logger + self.logger = logger def add(self): self.lock.acquire() self.count += 1 - if self.count>MAX_TOTAL_ERROR_COUNT: + if self.count > MAX_TOTAL_ERROR_COUNT: self.logger.info(f"错误响应过多") self.lock.release() return self.reach_limit() @@ -42,14 
+42,13 @@ class TotalErrorCounter: return self.count > MAX_TOTAL_ERROR_COUNT - # 仅使用多线程时用以计数 class PromptsCounter: - def __init__(self, total: int,logger:logging.Logger): + def __init__(self, total: int, logger: logging.Logger): self.lock = Lock() self.count = 0 self.total = total - self.logger=logger + self.logger = logger def add(self): self.lock.acquire() @@ -62,22 +61,23 @@ TIMEOUT = 600 class Agent: - def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7, - max_concurrent=15, timeout: int = TIMEOUT,logger:logging.Logger|None=None): + def __init__(self, baseurl: str, key: str | None, model_id: str, system_prompt: str | None = None, temperature=0.7, + max_concurrent=15, timeout: int = TIMEOUT, logger: logging.Logger | None = None): self.baseurl = baseurl.strip() if self.baseurl.endswith("/"): self.baseurl = self.baseurl[:-1] - self.key = key.strip() + self.key = key.strip() or "xx" self.model_id = model_id.strip() - self.system_prompt = system_prompt + self.system_prompt = system_prompt or "" self.temperature = temperature self.client = httpx.Client(trust_env=False, proxy=None, verify=False) self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False) self.max_concurrent = max_concurrent self.timeout = timeout - self.logger=logger if logger else global_logger + self.logger = logger if logger else global_logger self.total_error_counter = TotalErrorCounter(logger=self.logger) + def _prepare_request_data(self, prompt: str, system_prompt: str, temperature=None, top_p=0.9): if temperature is None: temperature = self.temperature @@ -210,7 +210,7 @@ class Agent: system_prompt: str | None = None, ) -> list[str]: system_prompts = [system_prompt] * len(prompts) - counts = [PromptsCounter(len(prompts),self.logger)] * len(prompts) + counts = [PromptsCounter(len(prompts), self.logger)] * len(prompts) output_list = [] with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor: 
results_iterator = executor.map(self._send_prompt_count, prompts, system_prompts, counts) diff --git a/docutranslate/agents/markdown_agent.py b/docutranslate/agents/markdown_agent.py index c5ec9e1..bfd075f 100644 --- a/docutranslate/agents/markdown_agent.py +++ b/docutranslate/agents/markdown_agent.py @@ -1,7 +1,10 @@ -from typing import Unpack +from typing import Unpack, NotRequired -from .agent import (Agent, AgentArgs) +from .agent import Agent, AgentArgs +class MDTranslateAgentArgs(AgentArgs, total=True): + to_lang:str + custom_prompt:NotRequired[str] class MDRefineAgent(Agent): def __init__(self, custom_prompt=None, **kwargs: Unpack[AgentArgs]): diff --git a/docutranslate/agents/txt_agent.py b/docutranslate/agents/txt_agent.py new file mode 100644 index 0000000..0b6df69 --- /dev/null +++ b/docutranslate/agents/txt_agent.py @@ -0,0 +1,29 @@ +from typing import NotRequired, Unpack + +from docutranslate.agents import AgentArgs, Agent + + +class TXTTranslateAgentArgs(AgentArgs, total=True): + to_lang: str + custom_prompt: NotRequired[str] + + +class TXTTranslateAgent(Agent): + def __init__(self, custom_prompt=None, to_lang="中文", **kwargs: Unpack[AgentArgs]): + super().__init__(**kwargs) + self.system_prompt = f""" +# 角色 +你是一个专业的机器翻译引擎 +# 工作 +翻译输入的txt文本 +目标语言{to_lang} +# 要求 +翻译要求专业准确 +不输出任何解释和注释 +不能改变形如的占位符 +# 输出 +翻译后的txt译文纯文本 +""" + if custom_prompt: + self.system_prompt += "\n# 重要规则或背景【非常重要】\n" + custom_prompt + '\n' + self.system_prompt += r'\no_think' diff --git a/docutranslate/cacher/__init__.py b/docutranslate/cacher/__init__.py index 95b49d2..8f6cebd 100644 --- a/docutranslate/cacher/__init__.py +++ b/docutranslate/cacher/__init__.py @@ -1 +1 @@ -from .document_cacher import DocumentCacher, document_cacher_global +from .md_based_convert_cacher import MDBasedCovertCacher, md_based_convert_cacher diff --git a/docutranslate/cacher/document_cacher.py b/docutranslate/cacher/document_cacher.py deleted file mode 100644 index b89ab05..0000000 --- 
a/docutranslate/cacher/document_cacher.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -from collections import OrderedDict - -from docutranslate.converter import Document - -CACHE_NUM=os.getenv("DOCUTRANSLATE_CACHE_NUM",default="10") - -class DocumentCacher: - def __init__(self): - self.cache_dict = OrderedDict() - @staticmethod - def _get_hashcode(document: Document, formula: bool, code: bool, convert_engin: str) -> str: - obj = (document.suffix, document.filebytes, formula, code, convert_engin) - return str(hash(obj)) - - def get_cached_result(self, document: Document, formula: bool, code: bool, convert_engin: str)->str|None: - return self.cache_dict.get(self._get_hashcode(document, formula, code, convert_engin)) - - def cache_result(self, result: str, document: Document, formula: bool, code: bool, convert_engin: str): - hash_code = self._get_hashcode(document, formula, code, convert_engin) - if len(self.cache_dict)>=int(CACHE_NUM): - self.cache_dict.popitem(last=False) - self.cache_dict[hash_code] = result - return result - - def clear(self): - self.cache_dict.clear() - - -document_cacher_global = DocumentCacher() diff --git a/docutranslate/cacher/md_based_convert_cacher.py b/docutranslate/cacher/md_based_convert_cacher.py new file mode 100644 index 0000000..04bd7cd --- /dev/null +++ b/docutranslate/cacher/md_based_convert_cacher.py @@ -0,0 +1,36 @@ +import os +from collections import OrderedDict + +from docutranslate.exporter.md2x.types import x2md_convert_config_type +from docutranslate.ir.document import Document +from docutranslate.ir.markdown_document import MarkdownDocument + +CACHE_NUM = os.getenv("DOCUTRANSLATE_CACHE_NUM", default="10") + + +class MDBasedCovertCacher: + def __init__(self): + self.cache_dict = OrderedDict() + + @staticmethod + def _get_hashcode(document: Document, convert_engin: str, convert_config: x2md_convert_config_type) -> str: + obj = (document.suffix, document.content, convert_engin, convert_config) + return str(hash(obj)) + + def 
get_cached_result(self, document: Document, convert_engin: str, + convert_config: x2md_convert_config_type) -> MarkdownDocument | None: + return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config)) + + def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str, + convert_config: x2md_convert_config_type) -> MarkdownDocument: + hash_code = self._get_hashcode(document, convert_engin, convert_config) + if len(self.cache_dict) > int(CACHE_NUM): + self.cache_dict.popitem(last=False) + self.cache_dict[hash_code] = convert_result + return convert_result + + def clear(self): + self.cache_dict.clear() + + +md_based_convert_cacher = MDBasedCovertCacher() diff --git a/docutranslate/converter/__init__.py b/docutranslate/converter/__init__.py index 25cb957..b9636a8 100644 --- a/docutranslate/converter/__init__.py +++ b/docutranslate/converter/__init__.py @@ -1,9 +1,3 @@ -from .converter import Document,Converter -from .converter_mineru import ConverterMineru - -from docutranslate.global_values import conditional_import -if conditional_import("docling"): - from .converter_docling import ConverterDocling - -# 打包docling时取消下面一行注释 -# from .converter_docling import ConverterDocling +""" +这个包用来处理document之间的格式转换 +""" diff --git a/docutranslate/converter/converter.py b/docutranslate/converter/converter.py deleted file mode 100644 index c8bd5a5..0000000 --- a/docutranslate/converter/converter.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Protocol -from pathlib import Path - - -class Document: - def __init__(self,path:Path|str=None,filename:str=None,filebytes:bytes=None): - if path is None and (filename is None or filebytes is None): - raise Exception("Document的路径或filename、filebytes不能同时为空") - self.filebytes = filebytes - self.filename = filename - self.path = path - if path: - if isinstance(path,str): - path=Path(path) - self.path=path - self.filename=path.name - self.filebytes=path.read_bytes() - 
self.suffix=Path(self.filename).suffix - self.stem=Path(self.filename).stem - -class Converter(Protocol): - #转换为markdown - def convert(self,document:Document)->str: - ... - - async def convert_async(self,document:Document)->str: - ... \ No newline at end of file diff --git a/docutranslate/converter/interfaces.py b/docutranslate/converter/interfaces.py new file mode 100644 index 0000000..5e975b6 --- /dev/null +++ b/docutranslate/converter/interfaces.py @@ -0,0 +1,12 @@ +from typing import Protocol, runtime_checkable + +from docutranslate.ir.document import Document + + +@runtime_checkable +class Converter(Protocol): + def convert(self, document: Document) -> Document: + ... + + async def convert_async(self, document: Document) -> Document: + ... diff --git a/docutranslate/converter/x2md/__init__.py b/docutranslate/converter/x2md/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docutranslate/converter/converter_docling.py b/docutranslate/converter/x2md/converter_docling.py similarity index 65% rename from docutranslate/converter/converter_docling.py rename to docutranslate/converter/x2md/converter_docling.py index 0a00c30..e006ab1 100644 --- a/docutranslate/converter/converter_docling.py +++ b/docutranslate/converter/x2md/converter_docling.py @@ -1,8 +1,9 @@ import asyncio -import logging import os import time +from dataclasses import dataclass from io import BytesIO +from logging import Logger from pathlib import Path from docling.datamodel.base_models import InputFormat @@ -13,34 +14,49 @@ from docling.document_converter import DocumentConverter, PdfFormatOption from docling_core.types.doc import ImageRefMode from huggingface_hub.errors import LocalEntryNotFoundError -from docutranslate.converter import Converter, Document +from docutranslate.converter.x2md.interfaces import X2MarkdownConverter +from docutranslate.ir.document import Document +from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.logger import 
global_logger IMAGE_RESOLUTION_SCALE = 4 -class ConverterDocling(Converter): - def __init__(self, code=True, formula=True, artifact=None, logger: logging.Logger | None = None): - self.code = code - self.formula = formula - self.artifact = artifact - self.logger = logger if logger else global_logger +@dataclass(frozen=True) +class ConverterDoclingConfig: + code: bool = True + formula: bool = True + artifact: Path | None = None - def convert(self, document): - assert isinstance(document.filename, str) + +class ConverterDocling(X2MarkdownConverter): + def __init__(self, config: ConverterDoclingConfig, logger: Logger = global_logger): + self.config = config + self.code = config.code + self.formula = config.formula + self.artifact = config.artifact + self.logger = logger + + def convert(self, document) -> MarkdownDocument: + assert isinstance(document.name, str) self.logger.info(f"正在将文档转换为markdown") time1 = time.time() - document_stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes)) - result = self.file2markdown_embed_images(document_stream) + document_stream = DocumentStream(name=document.name, stream=BytesIO(document.content)) + content = self.file2markdown_embed_images(document_stream) self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒") - return result + md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem) + return md_document - async def convert_async(self, document: Document) -> str: + async def convert_async(self, document: Document) -> MarkdownDocument: return await asyncio.to_thread( self.convert, document ) + def support_format(self) -> list[str]: + return [".pdf", ".docx", ".pptx", ".xlsx", ".md", "html", "xhtml", "csv", ".png", ".jpg", ".jpeg", ".tiff", + ".bmp", ".webp"] + def file2markdown_embed_images(self, file_path: Path | str | DocumentStream) -> str: pipeline_options = PdfPipelineOptions(artifacts_path=self.artifact) pipeline_options.do_ocr = False diff --git 
a/docutranslate/converter/x2md/converter_identity.py b/docutranslate/converter/x2md/converter_identity.py new file mode 100644 index 0000000..e139f2a --- /dev/null +++ b/docutranslate/converter/x2md/converter_identity.py @@ -0,0 +1,15 @@ +from docutranslate.converter.x2md.interfaces import X2MarkdownConverter +from docutranslate.ir.document import Document +from docutranslate.ir.markdown_document import MarkdownDocument + + +class ConverterIdentity(X2MarkdownConverter): + + def convert(self, document: Document) -> MarkdownDocument: + return MarkdownDocument.from_bytes(content=document.content, suffix=".md", stem=document.stem) + + async def convert_async(self, document: Document) -> MarkdownDocument: + return MarkdownDocument.from_bytes(content=document.content, suffix=".md", stem=document.stem) + + def support_format(self) -> list[str]: + return [".md"] diff --git a/docutranslate/converter/converter_mineru.py b/docutranslate/converter/x2md/converter_mineru.py similarity index 72% rename from docutranslate/converter/converter_mineru.py rename to docutranslate/converter/x2md/converter_mineru.py index dcd06a5..18ec95d 100644 --- a/docutranslate/converter/converter_mineru.py +++ b/docutranslate/converter/x2md/converter_mineru.py @@ -1,31 +1,43 @@ import asyncio -import logging import time import zipfile +from dataclasses import dataclass +from logging import Logger + import httpx -from docutranslate.converter import Converter, Document + +from docutranslate.converter.x2md.interfaces import X2MarkdownConverter +from docutranslate.ir.document import Document +from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.logger import global_logger from docutranslate.utils.markdown_utils import embed_inline_image_from_zip URL = 'https://mineru.net/api/v4/file-urls/batch' + +@dataclass(frozen=True) +class ConverterMineruConfig: + mineru_token: str + formula: bool = True + + timeout = httpx.Timeout( - connect=5.0, # 连接超时 (建立连接的最长时间) - read=200.0, # 
读取超时 (等待服务器响应的最长时间) - write=200.0, # 写入超时 (发送数据的最长时间) - pool=1.0 # 从连接池获取连接的超时时间 + connect=5.0, # 连接超时 (建立连接的最长时间) + read=200.0, # 读取超时 (等待服务器响应的最长时间) + write=200.0, # 写入超时 (发送数据的最长时间) + pool=1.0 # 从连接池获取连接的超时时间 ) +client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False) +client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False) -client = httpx.Client(trust_env=False,timeout=timeout,proxy=None,verify=False) -client_async=httpx.AsyncClient(trust_env=False,timeout=timeout,proxy=None,verify=False) -# TODO: 提供更详细的logger -class ConverterMineru(Converter): - def __init__(self, token: str, formula=True,logger:logging.Logger|None=None): - self.mineru_token = token.strip() - self.formula = formula - self.logger=logger if logger else global_logger +class ConverterMineru(X2MarkdownConverter): + def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger): + self.config = config + self.mineru_token = config.mineru_token.strip() + self.formula = config.formula + self.logger = logger def _get_header(self): return { @@ -39,7 +51,7 @@ class ConverterMineru(Converter): "language": "auto", "enable_table": True, "files": [ - {"name": f"{document.filename}", "is_ocr": True} + {"name": f"{document.name}", "is_ocr": True} ] } @@ -54,7 +66,7 @@ class ConverterMineru(Converter): urls = result["data"]["file_urls"] # print('batch_id:{},urls:{}'.format(batch_id, urls)) # 获取 - res_upload = client.put(urls[0], content=document.filebytes) + res_upload = client.put(urls[0], content=document.content) res_upload.raise_for_status() # print(f"{urls[0]} upload success") return batch_id @@ -72,7 +84,7 @@ class ConverterMineru(Converter): urls = result["data"]["file_urls"] # print('batch_id:{},urls:{}'.format(batch_id, urls)) # 获取 - res_upload = await client_async.put(urls[0], content=document.filebytes) + res_upload = await client_async.put(urls[0], content=document.content) res_upload.raise_for_status() # 
print(f"{urls[0]} upload success") return batch_id @@ -87,8 +99,8 @@ class ConverterMineru(Converter): res.raise_for_status() fileinfo = res.json()["data"]["extract_result"][0] if fileinfo["state"] == "done": - fileurl = fileinfo["full_zip_url"] - return fileurl + file_url = fileinfo["full_zip_url"] + return file_url else: time.sleep(3) @@ -100,36 +112,40 @@ class ConverterMineru(Converter): res.raise_for_status() fileinfo = res.json()["data"]["extract_result"][0] if fileinfo["state"] == "done": - fileurl = fileinfo["full_zip_url"] - return fileurl + file_url = fileinfo["full_zip_url"] + return file_url else: await asyncio.sleep(3) - def convert(self, document: Document) -> str: + def convert(self, document: Document) -> MarkdownDocument: self.logger.info(f"正在将文档转换为markdown") time1 = time.time() batch_id = self.upload(document) file_url = self.get_file_url(batch_id) - result = get_md_from_zip_url_with_inline_images(zip_url=file_url) + content = get_md_from_zip_url_with_inline_images(zip_url=file_url) self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒") - return result + md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem) + return md_document - async def convert_async(self, document: Document) -> str: - # 待优化 + async def convert_async(self, document: Document) -> MarkdownDocument: self.logger.info(f"正在将文档转换为markdown") time1 = time.time() batch_id = await self.upload_async(document) file_url = await self.get_file_url_async(batch_id) - result = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url) + content = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url) self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒") - return result + md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem) + return md_document + + def support_format(self) -> list[str]: + return [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", 
".jpeg"] def get_md_from_zip_url_with_inline_images( zip_url: str, filename_in_zip: str = "full.md", encoding: str = "utf-8" -) -> str | None: +) -> str: """ 从给定的ZIP文件URL中下载并提取指定文件的内容, 并将Markdown文件中的相对路径图片转换为内联Base64图片。 @@ -152,7 +168,8 @@ def get_md_from_zip_url_with_inline_images( except httpx.HTTPStatusError as e: - raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...") + raise Exception( + f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...") except httpx.RequestError as e: raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}") except zipfile.BadZipFile: @@ -169,7 +186,7 @@ async def get_md_from_zip_url_with_inline_images_async( zip_url: str, filename_in_zip: str = "full.md", encoding: str = "utf-8" -) -> str | None: +) -> str: """ 从给定的ZIP文件URL中下载并提取指定文件的内容, 并将Markdown文件中的相对路径图片转换为内联Base64图片。 @@ -181,18 +198,20 @@ async def get_md_from_zip_url_with_inline_images_async( encoding (str): 目标文件的预期编码。默认为 "utf-8"。 Returns: - str | None: 如果成功,返回处理后的Markdown文本内容;否则返回 None。 + str : 如果成功,返回处理后的Markdown文本内容。 """ try: print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...") response = await client_async.get(zip_url) # 增加超时 response.raise_for_status() print("ZIP文件下载完成。") - return await asyncio.to_thread(embed_inline_image_from_zip,response.content, filename_in_zip=filename_in_zip, encoding=encoding) + return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip, + encoding=encoding) except httpx.HTTPStatusError as e: - raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...") + raise Exception( + f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...") except httpx.RequestError as e: raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}") except zipfile.BadZipFile: @@ -204,5 +223,6 @@ async def get_md_from_zip_url_with_inline_images_async( 
traceback.print_exc() # 打印完整的堆栈跟踪,便于调试 raise Exception(f"发生未知错误: {e}") + if __name__ == '__main__': pass diff --git a/docutranslate/converter/x2md/interfaces.py b/docutranslate/converter/x2md/interfaces.py new file mode 100644 index 0000000..dc081bb --- /dev/null +++ b/docutranslate/converter/x2md/interfaces.py @@ -0,0 +1,22 @@ +from typing import runtime_checkable + +from typing import Protocol +from docutranslate.converter.interfaces import Converter +from docutranslate.ir.document import Document +from docutranslate.ir.markdown_document import MarkdownDocument + + + +@runtime_checkable +class X2MarkdownConverter(Converter,Protocol): + """ + 负责将其它格式的文件转换为markdown + """ + def convert(self, document: Document) -> MarkdownDocument: + ... + + async def convert_async(self, document: Document) -> MarkdownDocument: + ... + + def support_format(self)->list[str]: + ... \ No newline at end of file diff --git a/docutranslate/document_context/__init__.py b/docutranslate/document_context/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docutranslate/document_context/md_mask_context.py b/docutranslate/document_context/md_mask_context.py new file mode 100644 index 0000000..bc26c7d --- /dev/null +++ b/docutranslate/document_context/md_mask_context.py @@ -0,0 +1,15 @@ +from docutranslate.ir.markdown_document import MarkdownDocument +from docutranslate.utils.markdown_utils import MaskDict, uris2placeholder, placeholder2uris + + +class MDMaskUrisContext: + def __init__(self, document: MarkdownDocument): + self.document = document + self.mask_dict = MaskDict() + + def __enter__(self): + self.document.content = uris2placeholder(self.document.content.decode(), self.mask_dict).encode() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.document.content = placeholder2uris(self.document.content.decode(), self.mask_dict).encode() diff --git a/docutranslate/exporter/__init__.py b/docutranslate/exporter/__init__.py new file mode 100644 index 
0000000..0cb105e --- /dev/null +++ b/docutranslate/exporter/__init__.py @@ -0,0 +1,3 @@ +""" +这个包用于将Document导出为其它格式 +""" \ No newline at end of file diff --git a/docutranslate/exporter/export_config.py b/docutranslate/exporter/export_config.py new file mode 100644 index 0000000..315f4f7 --- /dev/null +++ b/docutranslate/exporter/export_config.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass + +@dataclass +class ExportConfig: + pass + + + diff --git a/docutranslate/exporter/interfaces.py b/docutranslate/exporter/interfaces.py new file mode 100644 index 0000000..1f3c1e2 --- /dev/null +++ b/docutranslate/exporter/interfaces.py @@ -0,0 +1,16 @@ +from typing import Protocol, runtime_checkable, TypeVar, Any, Self + +from docutranslate.exporter.export_config import ExportConfig +from docutranslate.ir.document import Document + +D_in = TypeVar('D_in', bound=Document) + + +@runtime_checkable +class Exporter(Protocol[D_in]): + @classmethod + def from_config(cls, export_config: ExportConfig | None = None) -> Self: + ... + + def export(self, document: D_in) -> Any: + ... diff --git a/docutranslate/exporter/md2x/__init__.py b/docutranslate/exporter/md2x/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docutranslate/exporter/md2x/interfaces.py b/docutranslate/exporter/md2x/interfaces.py new file mode 100644 index 0000000..515b86f --- /dev/null +++ b/docutranslate/exporter/md2x/interfaces.py @@ -0,0 +1,12 @@ +from typing import Self + +from docutranslate.exporter.export_config import ExportConfig +from docutranslate.exporter.interfaces import Exporter +from docutranslate.ir.document import Document +from docutranslate.ir.markdown_document import MarkdownDocument + + +class MDExporter(Exporter): + + def export(self,document:MarkdownDocument)->Document: + ... 
diff --git a/docutranslate/exporter/md2x/md2html_exporter.py b/docutranslate/exporter/md2x/md2html_exporter.py new file mode 100644 index 0000000..9755e30 --- /dev/null +++ b/docutranslate/exporter/md2x/md2html_exporter.py @@ -0,0 +1,73 @@ +from dataclasses import dataclass + +import jinja2 +import markdown2 + +from docutranslate.exporter.export_config import ExportConfig +from docutranslate.exporter.md2x.interfaces import MDExporter +from docutranslate.ir.document import Document +from docutranslate.ir.markdown_document import MarkdownDocument +from docutranslate.utils.resource_utils import resource_path + +@dataclass +class MD2HTMLExportConfig(ExportConfig): + cdn: bool = True + +class MD2HTMLExporter(MDExporter): + def __init__(self, export_config: MD2HTMLExportConfig = None): + export_config = export_config or MD2HTMLExportConfig() + self.cdn=export_config.cdn + + def export(self, document: MarkdownDocument) -> Document: + cdn = self.cdn + markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"]) + # language=html + pico = f'' if not cdn else r'' + html_template = resource_path("template/markdown.html").read_text(encoding="utf-8") + katex_css = f'' if not cdn else r"""""" + katex_js = f'' if not cdn else r"""""" + auto_render = f'' if not cdn else r"""""" + # language=javascript + render_math_in_element = r""" + """ if cdn else r""" + """ + mermaid = f'' + content = markdowner.convert(document.content.decode().replace("\\", "\\\\")) + # TODO:实现MathJax本地化 + render = jinja2.Template(html_template).render( + title=document.stem, + pico=pico, + katexCss=katex_css, + katexJs=katex_js, + autoRender=auto_render, + markdown=content, + renderMathInElement=render_math_in_element, + mermaid=mermaid, + ) + return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem) diff --git a/docutranslate/exporter/md2x/md2md_exporter.py b/docutranslate/exporter/md2x/md2md_exporter.py new file mode 100644 index 
0000000..1f8896c
--- /dev/null
+++ b/docutranslate/exporter/md2x/md2md_exporter.py
@@ -0,0 +1,32 @@
+from dataclasses import dataclass
+from typing import runtime_checkable
+
+from docutranslate.exporter.export_config import ExportConfig
+from docutranslate.exporter.md2x.interfaces import MDExporter
+from docutranslate.ir.markdown_document import MarkdownDocument,Document
+from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip
+
+
+@dataclass
+class MD2MDExportConfig(ExportConfig):
+    """Options for MD2MDExporter."""
+    # True: export() emits a single ".md"; False: it emits a ".zip" instead.
+    embed_images: bool = True
+
+
+class MD2MDExporter(MDExporter):
+    """Export a MarkdownDocument as one ".md" file or as a ".zip" archive."""
+
+    def __init__(self, export_config: MD2MDExportConfig | None=None):
+        export_config=export_config or MD2MDExportConfig()
+        self.embed_images=export_config.embed_images
+
+    def export(self,document:MarkdownDocument)->Document:
+        """Return a new Document (".md" or ".zip") built from *document*."""
+        if self.embed_images:
+            return Document.from_bytes(suffix=".md",content=document.content,stem=document.stem)
+        # Otherwise ship whatever unembed_base64_images_to_zip builds from the text.
+        archive=unembed_base64_images_to_zip(document.content.decode(), markdown_name=document.name)
+        return Document.from_bytes(suffix=".zip",content=archive,stem=document.stem)
+
+
diff --git a/docutranslate/exporter/md2x/types.py b/docutranslate/exporter/md2x/types.py
new file mode 100644
index 0000000..e58d790
--- /dev/null
+++ b/docutranslate/exporter/md2x/types.py
@@ -0,0 +1,4 @@
+from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
+from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
+
+x2md_convert_config_type=ConverterDoclingConfig | ConverterMineruConfig
\ No newline at end of file
diff --git a/docutranslate/exporter/txt2x/__init__.py b/docutranslate/exporter/txt2x/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docutranslate/exporter/txt2x/interfaces.py b/docutranslate/exporter/txt2x/interfaces.py
new file mode 100644
index 0000000..fb29634
--- /dev/null
+++ b/docutranslate/exporter/txt2x/interfaces.py
@@ -0,0 +1,8 @@
+from docutranslate.exporter.interfaces import Exporter
+from docutranslate.ir.document import Document
+
+# TODO: decide whether TXT needs its own dedicated document type
+class TXTExporter(Exporter):
+
+    def export(self,document:Document)->Document:
+        ...
\ No newline at end of file
diff --git a/docutranslate/exporter/txt2x/txt2html_exporter.py b/docutranslate/exporter/txt2x/txt2html_exporter.py
new file mode 100644
index 0000000..4100846
--- /dev/null
+++ b/docutranslate/exporter/txt2x/txt2html_exporter.py
@@ -0,0 +1,40 @@
+from dataclasses import dataclass
+from html import escape
+
+import jinja2
+
+from docutranslate.exporter.export_config import ExportConfig
+from docutranslate.exporter.txt2x.interfaces import TXTExporter
+from docutranslate.ir.document import Document
+from docutranslate.ir.markdown_document import MarkdownDocument
+from docutranslate.utils.resource_utils import resource_path
+
+
+@dataclass
+class TXT2HTMLExportConfig(ExportConfig):
+    cdn: bool = True
+
+
+class TXT2HTMLExporter(TXTExporter):
+    """Render a plain-text document into the bundled ``template/txt.html`` page."""
+
+    def __init__(self, export_config: TXT2HTMLExportConfig = None):
+        export_config = export_config or TXT2HTMLExportConfig()
+        self.cdn = export_config.cdn
+
+    def export(self, document: Document) -> Document:
+        """Return an ``.html`` Document whose page body is the text of *document*."""
+        cdn = self.cdn
+        html_template = resource_path("template/txt.html").read_text(encoding="utf-8")
+
+        # language=html
+        pico = f'' if not cdn else r''
+
+        render = jinja2.Template(html_template).render(
+            title=document.stem,
+            pico=pico,
+            # The template has a {{ body }} slot; without this the page is empty.
+            # jinja2.Template does not autoescape, so escape the raw text here.
+            body=escape(document.content.decode("utf-8")),
+        )
+        return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem)
diff --git a/docutranslate/ir/__init__.py b/docutranslate/ir/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docutranslate/ir/document.py b/docutranslate/ir/document.py
new file mode 100644
index 0000000..327a101
--- /dev/null
+++ b/docutranslate/ir/document.py
@@ -0,0 +1,28 @@
+import dataclasses
+from pathlib import Path
+
+class Document:
+    """In-memory file: raw bytes plus the stem/suffix used to name it."""
+
+    def __init__(self,suffix:str,content:bytes,stem:str|None=None,path:Path=None):
+        self.suffix=suffix
+        self.content=content
+        self.stem=stem
+        self.path=path
+
+    @property
+    def name(self)->str|None:
+        """File name (stem + suffix), or None when no stem is set."""
+        if not self.stem:
+            return None
+        return self.stem+self.suffix
+
+    @classmethod
+    def from_path(cls,path:Path|str):
+        if isinstance(path,str):
+            path=Path(path)
+        return cls(suffix=path.suffix,content=path.read_bytes(),stem=path.stem,path=path)
+
+    @classmethod
+    def from_bytes(cls,content:bytes,suffix:str,stem:str|None):
+        return cls(content=content,suffix=suffix,stem=stem)
diff --git a/docutranslate/ir/markdown_document.py b/docutranslate/ir/markdown_document.py
new file mode 100644
index 0000000..0e1dfad
--- /dev/null
+++ b/docutranslate/ir/markdown_document.py
@@ -0,0 +1,9 @@
+from docutranslate.ir.document import Document
+
+
+class MarkdownDocument(Document):
+    """Document whose suffix is always ".md", whatever suffix was passed in."""
+
+    def __init__(self,*args,**kwargs):
+        super().__init__(*args,**kwargs)
+        self.suffix=".md"
\ No newline at end of file
diff --git a/docutranslate/manager/__init__.py b/docutranslate/manager/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docutranslate/manager/base_manager.py b/docutranslate/manager/base_manager.py
new file mode 100644
index 0000000..becf3ea
--- /dev/null
+++ b/docutranslate/manager/base_manager.py
@@ -0,0 +1,60 @@
+from abc import ABC, abstractmethod
+from logging import Logger
+from pathlib import Path
+from typing import Self, Generic, TypeVar
+
+from docutranslate.exporter.interfaces import Exporter
+from docutranslate.ir.document import Document
+from docutranslate.logger import global_logger
+
+T_Translated = TypeVar('T_Translated', bound=Document)
+
+
+class BaseManager(ABC, Generic[T_Translated]):
+    """Common read / translate / export workflow shared by the concrete managers."""
+
+    def __init__(self, logger: Logger = global_logger):
+        self.logger = logger
+        self.document_original: Document | None = None
+        self.document_translated: T_Translated | None = None
+
+    def read_path(self, path: Path | str):
+        document = Document.from_path(path)
+        self.document_original = document
+
+    def read_bytes(self, content: bytes, stem: str, suffix: str):
+        document = Document.from_bytes(content=content, stem=stem, suffix=suffix)
+        self.document_original = document
+
+    @abstractmethod
+    def translate(self, *args, **kwargs) -> Self:
+        ...
+
+    @abstractmethod
+    async def translate_async(self, *args, **kwargs) -> Self:
+        ...
+
+    def _export(self, exporter: Exporter) -> Document:
+        """Run *exporter* on the translated document; RuntimeError if there is none."""
+        if self.document_translated is None:
+            raise RuntimeError("Document has not been translated yet. Call translate() first.")
+        docu = exporter.export(self.document_translated)
+        return docu
+
+    def _save(self, exporter: Exporter, name: str = None, out_put_dir: Path | str = "./output"):
+        """Export and write the result to ``out_put_dir / name`` (default: exported name)."""
+        docu = self._export(exporter)
+        name = name or docu.name
+        if not name:
+            # Document.name is None when the document has no stem; Path(None) would
+            # otherwise fail with an unhelpful TypeError.
+            raise ValueError("No output file name: pass `name` or read a document that has a stem.")
+        output_path = Path(out_put_dir) / Path(name)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_bytes(docu.content)
+        self.logger.info(f"文件已保存到{output_path.resolve()}")
+        return self
+
+    @abstractmethod
+    def support_export_format(self)->list[str]:
+        ...
\ No newline at end of file
diff --git a/docutranslate/manager/interfaces.py b/docutranslate/manager/interfaces.py
new file mode 100644
index 0000000..3a58e03
--- /dev/null
+++ b/docutranslate/manager/interfaces.py
@@ -0,0 +1,34 @@
+from pathlib import Path
+from typing import Protocol, runtime_checkable, Self, TypeVar
+
+from docutranslate.exporter.export_config import ExportConfig
+
+T = TypeVar("T", bound=ExportConfig)
+
+
+
+@runtime_checkable
+class HTMLExportable(Protocol):
+    def export_to_html(self, export_config: T) -> str:
+        ...
+
+    def save_as_html(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
+        ...
+
+
+@runtime_checkable
+class MDExportable(Protocol):
+    def export_to_markdown(self, export_config: T) -> str:
+        ...
+
+    def save_as_markdown(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
+        ...
+
+
+@runtime_checkable
+class TXTExportable(Protocol):
+    def export_to_txt(self, export_config: T) -> str:
+        ...
+
+    def save_as_txt(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
+        ...
diff --git a/docutranslate/manager/md_based_manager.py b/docutranslate/manager/md_based_manager.py
new file mode 100644
index 0000000..d77f2ba
--- /dev/null
+++ b/docutranslate/manager/md_based_manager.py
@@ -0,0 +1,120 @@
+import asyncio
+from copy import copy
+from pathlib import Path
+from typing import Self, Literal, overload
+
+from docutranslate.cacher import md_based_convert_cacher
+from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
+from docutranslate.converter.x2md.converter_identity import ConverterIdentity
+from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
+from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
+from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
+from docutranslate.exporter.md2x.types import x2md_convert_config_type
+from docutranslate.manager.base_manager import BaseManager
+from docutranslate.manager.interfaces import HTMLExportable, MDExportable
+from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
+
+
+class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
+    """Manager for inputs that are converted to markdown, translated, then exported."""
+
+    def support_export_format(self) -> list[str]:
+        return [".md",".html",".zip"]
+
+    def _get_document_md(self, convert_engin, convert_config):
+        """Return the markdown form of the original document, using the convert cache.
+
+        Raises RuntimeError if nothing has been read or ``convert_config`` does not
+        match ``convert_engin``; ValueError for an unknown ``convert_engin``.
+        """
+        if self.document_original is None:
+            raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
+        # Look for an already converted document in the cache.
+        document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
+                                                                     convert_config)
+        # Use the cached result, or convert now.
+        if document_cached:
+            document_md = document_cached
+        else:
+            if convert_engin is None:
+                converter = ConverterIdentity()
+            elif convert_engin == "mineru":
+                if not isinstance(convert_config, ConverterMineruConfig):
+                    raise RuntimeError(f"未传入正确的convert_config,应传入{ConverterMineruConfig}")
+                converter = ConverterMineru(convert_config, logger=self.logger)
+            elif convert_engin == "docling":
+                if not isinstance(convert_config, ConverterDoclingConfig):
+                    raise RuntimeError(f"未传入正确的convert_config,应传入{ConverterDoclingConfig}")
+                converter = ConverterDocling(convert_config, logger=self.logger)
+            else:
+                raise ValueError(f"不存在{convert_engin}解析引擎")
+            document_md = converter.convert(self.document_original)
+            # Store the freshly converted document in the cache.
+            md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
+        # MDTranslator rewrites document.content in place, so hand out a shallow copy:
+        # the object held by the cache (if it keeps a reference) must stay untranslated.
+        return copy(document_md)
+
+    @overload
+    def translate(self, convert_engin: None,
+                  convert_config: None, translate_config: MDTranslateConfig) -> Self:
+        ...
+
+    @overload
+    def translate(self, convert_engin: Literal["docling"],
+                  convert_config: ConverterDoclingConfig, translate_config: MDTranslateConfig) -> Self:
+        ...
+
+    @overload
+    def translate(self, convert_engin: Literal["mineru"],
+                  convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self:
+        ...
+
+    def translate(self, convert_engin: Literal["mineru", "docling"] | None,
+                  convert_config: x2md_convert_config_type | None,
+                  translate_config: MDTranslateConfig) -> Self:
+        """Convert the loaded document to markdown and translate it; returns self."""
+        document_md = self._get_document_md(convert_engin, convert_config)
+        # Translate the converted document in place.
+        translator = MDTranslator(translate_config)
+        translator.translate(document_md)
+        self.document_translated = document_md
+        return self
+
+    async def translate_async(self, convert_engin: Literal["mineru", "docling"] | None,
+                              convert_config: x2md_convert_config_type | None,
+                              translate_config: MDTranslateConfig) -> Self:
+        """Async variant of translate(); conversion runs in a worker thread."""
+        document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
+        # Translate the converted document in place.
+        translator = MDTranslator(translate_config)
+        await translator.translate_async(document_md)
+        self.document_translated = document_md
+        return self
+
+    def export_to_html(self, export_config: MD2HTMLExportConfig | None = None) -> str:
+        docu = self._export(MD2HTMLExporter(export_config))
+        return docu.content.decode()
+
+    def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str:
+        """Return the translated markdown as text.
+
+        Raises ValueError when the export is a zip archive (``embed_images=False``),
+        which is binary and cannot be returned as ``str``; use save_as_markdown().
+        """
+        docu = self._export(MD2MDExporter(export_config))
+        if docu.suffix != ".md":
+            raise ValueError("Markdown export with embed_images=False is a zip archive; "
+                             "use save_as_markdown() to write it to disk.")
+        return docu.content.decode()
+
+    def save_as_html(self, name: str = None, out_put_dir: Path | str = "./output",
+                     export_config: MD2HTMLExportConfig | None = None) -> Self:
+        self._save(exporter=MD2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir)
+        return self
+
+    def save_as_markdown(self, name: str = None, out_put_dir: Path | str = "./output",
+                         export_config: MD2MDExportConfig | None = None) -> Self:
+
+        self._save(exporter=MD2MDExporter(export_config), name=name, out_put_dir=out_put_dir)
+        return self
diff --git a/docutranslate/manager/txt_manager.py b/docutranslate/manager/txt_manager.py
new file mode 100644
index 0000000..6121dc0
--- /dev/null
+++ b/docutranslate/manager/txt_manager.py
@@ -0,0 +1,89 @@
+from copy import copy
+from dataclasses import dataclass, field
+from logging import Logger
+from pathlib import Path
+from typing import Self
+
+from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
+from docutranslate.manager.base_manager import BaseManager
+from docutranslate.manager.interfaces import HTMLExportable
+from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
+
+
+@dataclass
+class TXTManagerConfig:
+    """Settings bundle for TXT translation.
+
+    api_key, temperature, timeout and cache used to be un-annotated, so the
+    dataclass ignored them (and ``api_key = None,`` was the tuple ``(None,)``).
+    They are now real fields, declared keyword-only so the positional order of
+    the pre-existing fields is unchanged.
+    """
+    chunk_size: int = 3000
+    base_url: str | None = None
+    api_key: str | None = field(default=None, kw_only=True)
+    model_id: str | None = None
+    temperature: float = field(default=0.7, kw_only=True)
+    concurrent: int = 30
+    timeout: int = field(default=2000, kw_only=True)
+    cache: bool = field(default=True, kw_only=True)
+    logger: Logger | None = None
+
+
+class TXTManager(BaseManager, HTMLExportable):
+    """Manager that translates a plain-text document and exports it as txt or html."""
+
+    def support_export_format(self) -> list[str]:
+        return [".txt", ".html"]
+
+    def _copy_original(self):
+        """Return a shallow copy of the loaded document so the original stays untouched."""
+        if self.document_original is None:
+            raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
+        return copy(self.document_original)
+
+    def _require_translated(self):
+        """Return the translated document, or raise if translate() has not run yet."""
+        if self.document_translated is None:
+            raise RuntimeError("Document has not been translated yet. Call translate() first.")
+        return self.document_translated
+
+    def translate(self, translate_config: TXTTranslateConfig) -> Self:
+        document = self._copy_original()
+        # Translate the copy in place.
+        translator = TXTTranslator(translate_config)
+        translator.translate(document)
+        self.document_translated = document
+        return self
+
+    async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
+        document = self._copy_original()
+        # Translate the copy in place.
+        translator = TXTTranslator(translate_config)
+        await translator.translate_async(document)
+        self.document_translated = document
+        return self
+
+    def export_to_html(self, export_config: TXT2HTMLExportConfig | None = None) -> str:
+        docu = self._export(TXT2HTMLExporter(export_config))
+        return docu.content.decode()
+
+    def export_to_txt(self) -> str:
+        return self._require_translated().content.decode()
+
+    def save_as_html(self, name: str = None, out_put_dir: Path | str = "./output",
+                     export_config: TXT2HTMLExportConfig | None = None) -> Self:
+        self._save(exporter=TXT2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir)
+        return self
+
+    def save_as_txt(self, name: str = None, out_put_dir: Path | str = "./output", ) -> Self:
+        """Write the translated text to ``out_put_dir / name``; returns self."""
+        document = self._require_translated()
+        name = name or document.name
+        if not name:
+            raise ValueError("No output file name: pass `name` or read a document that has a stem.")
+        output_path = Path(out_put_dir) / Path(name)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_bytes(document.content)
+        self.logger.info(f"文件已保存到{output_path.resolve()}")
+        return self
diff --git a/docutranslate/template/txt.html b/docutranslate/template/txt.html
new file mode 100644
index 0000000..dfcd23b
--- /dev/null
+++ b/docutranslate/template/txt.html
@@ -0,0 +1,17 @@
+
+
+
+
+    {{ title }}
+    {{pico}}
+
+
+
+{{ body }}
+
+
\ No newline at end of file
diff --git a/docutranslate/translater.py b/docutranslate/translater.py
index 9bc9b1c..062e043 100644
--- a/docutranslate/translater.py
+++ b/docutranslate/translater.py
@@ -11,12 +11,12 @@ import markdown2
 
 from docutranslate.agents import Agent, AgentArgs
 from docutranslate.agents import MDRefineAgent, MDTranslateAgent
-from docutranslate.cacher import document_cacher_global
-from docutranslate.converter import Document, ConverterMineru
+from docutranslate.cacher import md_based_convert_cacher
+from docutranslate.ir.document import Document
 from docutranslate.global_values import available_packages
 from docutranslate.logger import global_logger
 from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
-from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \
+from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2uris, MaskDict, clean_markdown_math_block, \
     unembed_base64_images_to_zip,
embed_inline_image_from_zip, find_markdown_in_zip
 from docutranslate.utils.resource_utils import resource_path
 
@@ -62,7 +62,7 @@ class FileTranslater:
         self.timeout = timeout
         self.document: Document | None = None
         self.cache = cache
-        self.cacher = document_cacher_global
+        self.cacher = md_based_convert_cacher
         if file_path:
             self.read_file(file_path=file_path)
 
@@ -79,7 +79,7 @@ class FileTranslater:
         return self
 
     def _unmask_uris_in_markdown(self):
-        self.markdown = placeholder2_uris(self.markdown, self._mask_dict)
+        self.markdown = placeholder2uris(self.markdown, self._mask_dict)
         return self
 
     def _split_markdown_into_chunks(self) -> list[str]:
diff --git a/docutranslate/translater/__init__.py b/docutranslate/translater/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docutranslate/translater/base.py b/docutranslate/translater/base.py
new file mode 100644
index 0000000..97ff2e6
--- /dev/null
+++ b/docutranslate/translater/base.py
@@ -0,0 +1,16 @@
+from dataclasses import dataclass
+from logging import Logger
+
+
+@dataclass
+class AiTranslateConfig:
+    base_url: str
+    api_key: str
+    model_id: str
+    to_lang: str
+    custom_prompt: str | None = None
+    temperature: float = 0.7
+    timeout: int = 2000
+    chunk_size: int = 3000
+    concurrent: int = 30
+    logger: Logger | None = None
\ No newline at end of file
diff --git a/docutranslate/translater/interfaces.py b/docutranslate/translater/interfaces.py
new file mode 100644
index 0000000..77e7327
--- /dev/null
+++ b/docutranslate/translater/interfaces.py
@@ -0,0 +1,21 @@
+from typing import runtime_checkable, Protocol, TypeVar
+
+from docutranslate.agents import Agent
+from docutranslate.ir.document import Document
+
+T=TypeVar('T',bound=Document)
+V=TypeVar('V',bound=Agent)
+
+@runtime_checkable
+class Translator(Protocol[T,V]):
+    """
+    Translates the intermediate text in place; a Translator does no format conversion.
+    """
+    def translate(self, document:T) -> Document:
+        ...
+
+    async def translate_async(self, document: T) -> Document:
+        ...
+
+    def log(self,info:str):
+        ...
\ No newline at end of file
diff --git a/docutranslate/translater/md_translator.py b/docutranslate/translater/md_translator.py
new file mode 100644
index 0000000..054fb81
--- /dev/null
+++ b/docutranslate/translater/md_translator.py
@@ -0,0 +1,72 @@
+import asyncio
+from dataclasses import dataclass
+from logging import Logger
+from typing import Self
+
+from docutranslate.agents import MDTranslateAgent
+from docutranslate.document_context.md_mask_context import MDMaskUrisContext
+from docutranslate.ir.markdown_document import MarkdownDocument
+from docutranslate.logger import global_logger
+from docutranslate.translater.base import AiTranslateConfig
+from docutranslate.translater.interfaces import Translator
+from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
+from docutranslate.utils.markdown_utils import clean_markdown_math_block
+
+
+@dataclass
+class MDTranslateConfig(AiTranslateConfig):
+    ...
+
+
+
+class MDTranslator(Translator):
+    """Translate a MarkdownDocument in place, chunk by chunk, with MDTranslateAgent."""
+
+    def __init__(self, config: MDTranslateConfig):
+        self.logger = config.logger or global_logger
+        self.chunk_size = config.chunk_size
+        self.translate_agent = MDTranslateAgent(custom_prompt=config.custom_prompt,
+                                                to_lang=config.to_lang,
+                                                baseurl=config.base_url,
+                                                key=config.api_key,
+                                                model_id=config.model_id,
+                                                system_prompt=None,
+                                                temperature=config.temperature,
+                                                max_concurrent=config.concurrent,
+                                                timeout=config.timeout,
+                                                logger=self.logger)
+
+    @staticmethod
+    def _postprocess(translated_chunks: list[str]) -> bytes:
+        """Join the translated chunks, tidy the markdown and return it encoded."""
+        content = join_markdown_texts(translated_chunks)
+        # Robustness: collapse double-escaped inline-math delimiters back to \( and \).
+        # (Both replace() calls previously had identical arguments, i.e. were no-ops.)
+        content = content.replace(r'\\(', r'\(')
+        content = content.replace(r'\\)', r'\)')
+        content = clean_markdown_math_block(content)
+        return content.encode()
+
+    def translate(self, document: MarkdownDocument) -> Self:
+        """Translate *document* in place and return this translator."""
+        self.logger.info("正在翻译markdown")
+        # NOTE(review): content is reassigned inside the MDMaskUrisContext block,
+        # presumably so its exit step sees the translated text -- confirm.
+        with MDMaskUrisContext(document):
+            chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
+            self.logger.info(f"markdown分为{len(chunks)}块")
+            result: list[str] = self.translate_agent.send_prompts(chunks)
+            document.content = self._postprocess(result)
+        self.logger.info("翻译完成")
+        return self
+
+    async def translate_async(self, document: MarkdownDocument) -> Self:
+        """Async variant of translate(); post-processing runs in a worker thread."""
+        self.logger.info("正在翻译markdown")
+        with MDMaskUrisContext(document):
+            chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
+            self.logger.info(f"markdown分为{len(chunks)}块")
+            result: list[str] = await self.translate_agent.send_prompts_async(chunks)
+            document.content = await asyncio.to_thread(self._postprocess, result)
+        self.logger.info("翻译完成")
+        return self
diff --git a/docutranslate/translater/txt_translator.py b/docutranslate/translater/txt_translator.py
new file mode 100644
index 0000000..05200e4
--- /dev/null
+++ b/docutranslate/translater/txt_translator.py
@@ -0,0 +1,54 @@
+from dataclasses import dataclass
+from typing import Self
+
+from docutranslate.agents.txt_agent import TXTTranslateAgent
+from docutranslate.ir.document import Document
+from docutranslate.logger import global_logger
+from docutranslate.translater.base import AiTranslateConfig
+from docutranslate.translater.interfaces import Translator
+from docutranslate.utils.markdown_splitter import split_markdown_text
+
+
+@dataclass
+class TXTTranslateConfig(AiTranslateConfig):
+    ...
+
+
+class TXTTranslator(Translator):
+    """Translate a plain-text Document in place, chunk by chunk, with TXTTranslateAgent."""
+
+    def __init__(self, config: TXTTranslateConfig):
+        self.logger = config.logger or global_logger
+        self.chunk_size = config.chunk_size
+        self.translate_agent = TXTTranslateAgent(custom_prompt=config.custom_prompt,
+                                                 to_lang=config.to_lang,
+                                                 baseurl=config.base_url,
+                                                 key=config.api_key,
+                                                 model_id=config.model_id,
+                                                 system_prompt=None,
+                                                 temperature=config.temperature,
+                                                 max_concurrent=config.concurrent,
+                                                 timeout=config.timeout,
+                                                 logger=self.logger)
+
+    def translate(self, document: Document) -> Self:
+        """Translate *document* in place (chunks re-joined with newlines); returns this translator."""
+        self.logger.info("正在翻译txt")
+        chunks: list[str] = split_markdown_text(document.content.decode(), max_block_size=self.chunk_size)
+        self.logger.info(f"txt分为{len(chunks)}块")
+        result: list[str] = self.translate_agent.send_prompts(chunks)
+        content = "\n".join(result)
+        document.content = content.encode()
+        self.logger.info("翻译完成")
+        return self
+
+    async def translate_async(self, document: Document) -> Self:
+        """Async variant of translate(), using the agent's send_prompts_async."""
+        self.logger.info("正在翻译txt")
+        chunks: list[str] = split_markdown_text(document.content.decode(), max_block_size=self.chunk_size)
+        self.logger.info(f"txt分为{len(chunks)}块")
+        result: list[str] = await self.translate_agent.send_prompts_async(chunks)
+        content = "\n".join(result)
+        document.content = content.encode()
+        self.logger.info("翻译完成")
+        return self
diff --git a/docutranslate/utils/markdown_splitter.py b/docutranslate/utils/markdown_splitter.py
index ece4178..8bfaac8 100644
--- a/docutranslate/utils/markdown_splitter.py
+++ b/docutranslate/utils/markdown_splitter.py
@@ -218,7 +218,7 @@ class MarkdownBlockSplitter:
         return result
 
 
-def split_markdown_text(markdown_text, max_block_size=5000):
+def split_markdown_text(markdown_text:str, max_block_size=5000):
     """
     将Markdown字符串分割成不超过max_block_size的块
     可以通过简单拼接重建原始文本(分割的代码块除外)
diff --git a/docutranslate/utils/markdown_utils.py b/docutranslate/utils/markdown_utils.py
index 8960e1b..4361d99 100644
--- a/docutranslate/utils/markdown_utils.py
+++ 
b/docutranslate/utils/markdown_utils.py @@ -69,7 +69,7 @@ def uris2placeholder(markdown: str, mask_dict: MaskDict): return markdown -def placeholder2_uris(markdown: str, mask_dict: MaskDict): +def placeholder2uris(markdown: str, mask_dict: MaskDict): def placeholder2uri(match: re.Match): id = match.group(1) uri = mask_dict.get(id)