重构代码,新增了MarkdownBasedManager和TXTManager实现

This commit is contained in:
xunbu
2025-07-28 23:41:35 +08:00
parent 6ab3278481
commit 80634fe749
45 changed files with 885 additions and 139 deletions

View File

@@ -2,4 +2,4 @@ __version__="0.3.4b1"
from .translater import FileTranslater
# from .translater import FileTranslater

View File

@@ -17,23 +17,23 @@ class AgentArgs(TypedDict, total=False):
baseurl: str
key: str
model_id: str
system_prompt: str
system_prompt: str | None
temperature: float
max_concurrent: int
timeout: int
logger:logging.Logger
logger: logging.Logger
class TotalErrorCounter:
def __init__(self,logger:logging.Logger):
def __init__(self, logger: logging.Logger):
self.lock = Lock()
self.count = 0
self.logger=logger
self.logger = logger
def add(self):
self.lock.acquire()
self.count += 1
if self.count>MAX_TOTAL_ERROR_COUNT:
if self.count > MAX_TOTAL_ERROR_COUNT:
self.logger.info(f"错误响应过多")
self.lock.release()
return self.reach_limit()
@@ -42,14 +42,13 @@ class TotalErrorCounter:
return self.count > MAX_TOTAL_ERROR_COUNT
# 仅使用多线程时用以计数
class PromptsCounter:
def __init__(self, total: int,logger:logging.Logger):
def __init__(self, total: int, logger: logging.Logger):
self.lock = Lock()
self.count = 0
self.total = total
self.logger=logger
self.logger = logger
def add(self):
self.lock.acquire()
@@ -62,22 +61,23 @@ TIMEOUT = 600
class Agent:
def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7,
max_concurrent=15, timeout: int = TIMEOUT,logger:logging.Logger|None=None):
def __init__(self, baseurl: str, key: str | None, model_id: str, system_prompt: str | None = None, temperature=0.7,
max_concurrent=15, timeout: int = TIMEOUT, logger: logging.Logger | None = None):
self.baseurl = baseurl.strip()
if self.baseurl.endswith("/"):
self.baseurl = self.baseurl[:-1]
self.key = key.strip()
self.key = key.strip() or "xx"
self.model_id = model_id.strip()
self.system_prompt = system_prompt
self.system_prompt = system_prompt or ""
self.temperature = temperature
self.client = httpx.Client(trust_env=False, proxy=None, verify=False)
self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False)
self.max_concurrent = max_concurrent
self.timeout = timeout
self.logger=logger if logger else global_logger
self.logger = logger if logger else global_logger
self.total_error_counter = TotalErrorCounter(logger=self.logger)
def _prepare_request_data(self, prompt: str, system_prompt: str, temperature=None, top_p=0.9):
if temperature is None:
temperature = self.temperature
@@ -210,7 +210,7 @@ class Agent:
system_prompt: str | None = None,
) -> list[str]:
system_prompts = [system_prompt] * len(prompts)
counts = [PromptsCounter(len(prompts),self.logger)] * len(prompts)
counts = [PromptsCounter(len(prompts), self.logger)] * len(prompts)
output_list = []
with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor:
results_iterator = executor.map(self._send_prompt_count, prompts, system_prompts, counts)

View File

@@ -1,7 +1,10 @@
from typing import Unpack
from typing import Unpack, NotRequired
from .agent import (Agent, AgentArgs)
from .agent import Agent, AgentArgs
class MDTranslateAgentArgs(AgentArgs, total=True):
to_lang:str
custom_prompt:NotRequired[str]
class MDRefineAgent(Agent):
def __init__(self, custom_prompt=None, **kwargs: Unpack[AgentArgs]):

View File

@@ -0,0 +1,29 @@
from typing import NotRequired, Unpack
from docutranslate.agents import AgentArgs, Agent
class TXTTranslateAgentArgs(AgentArgs, total=True):
    """Typed keyword-argument dictionary extending ``AgentArgs`` with TXT translation options."""

    # Target language; required because this class is declared with total=True.
    to_lang: str
    # Optional extra prompt text; may be omitted entirely.
    custom_prompt: NotRequired[str]
class TXTTranslateAgent(Agent):
    """Agent whose system prompt instructs the model to translate plain text."""

    def __init__(self, custom_prompt=None, to_lang="中文", **kwargs: Unpack[AgentArgs]):
        """Initialise the base agent, then replace its system prompt.

        Args:
            custom_prompt: Optional extra rules/background appended to the prompt.
            to_lang: Target language name interpolated into the prompt.
            **kwargs: Forwarded unchanged to ``Agent.__init__``.
        """
        super().__init__(**kwargs)
        # The prompt text is runtime data sent to the model; keep it verbatim.
        prompt_parts = [f"""
# 角色
你是一个专业的机器翻译引擎
# 工作
翻译输入的txt文本
目标语言{to_lang}
# 要求
翻译要求专业准确
不输出任何解释和注释
不能改变形如<ph-xxxxxx>的占位符
# 输出
翻译后的txt译文纯文本
"""]
        if custom_prompt:
            prompt_parts.append("\n# 重要规则或背景【非常重要】\n" + custom_prompt + '\n')
        # Raw string: appends a literal backslash followed by "no_think" (not a newline).
        # NOTE(review): presumably a "disable thinking" switch for the model - confirm the expected spelling.
        prompt_parts.append(r'\no_think')
        self.system_prompt = "".join(prompt_parts)

View File

@@ -1 +1 @@
from .document_cacher import DocumentCacher, document_cacher_global
from .md_based_convert_cacher import MDBasedCovertCacher, md_based_convert_cacher

View File

@@ -1,30 +0,0 @@
import os
from collections import OrderedDict
from docutranslate.converter import Document
CACHE_NUM=os.getenv("DOCUTRANSLATE_CACHE_NUM",default="10")
class DocumentCacher:
def __init__(self):
self.cache_dict = OrderedDict()
@staticmethod
def _get_hashcode(document: Document, formula: bool, code: bool, convert_engin: str) -> str:
obj = (document.suffix, document.filebytes, formula, code, convert_engin)
return str(hash(obj))
def get_cached_result(self, document: Document, formula: bool, code: bool, convert_engin: str)->str|None:
return self.cache_dict.get(self._get_hashcode(document, formula, code, convert_engin))
def cache_result(self, result: str, document: Document, formula: bool, code: bool, convert_engin: str):
hash_code = self._get_hashcode(document, formula, code, convert_engin)
if len(self.cache_dict)>=int(CACHE_NUM):
self.cache_dict.popitem(last=False)
self.cache_dict[hash_code] = result
return result
def clear(self):
self.cache_dict.clear()
document_cacher_global = DocumentCacher()

View File

@@ -0,0 +1,36 @@
import os
from collections import OrderedDict
from docutranslate.exporter.md2x.types import x2md_convert_config_type
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
# Maximum number of conversion results kept in memory (string form; parsed with int() on use).
CACHE_NUM = os.getenv("DOCUTRANSLATE_CACHE_NUM", default="10")


class MDBasedCovertCacher:
    """Bounded in-memory cache of document-to-markdown conversion results.

    Entries are evicted oldest-first once ``CACHE_NUM`` entries are stored.
    Cached documents are stored and handed out as shallow copies so that
    callers mutating a document (for example translating it in place) cannot
    alter what later cache hits return.
    """

    def __init__(self):
        # Insertion-ordered so the oldest entry can be evicted first.
        self.cache_dict = OrderedDict()

    @staticmethod
    def _get_hashcode(document: Document, convert_engin: str, convert_config: x2md_convert_config_type) -> str:
        """Build the cache key from the document's suffix and bytes plus the conversion settings."""
        obj = (document.suffix, document.content, convert_engin, convert_config)
        return str(hash(obj))

    def get_cached_result(self, document: Document, convert_engin: str,
                          convert_config: x2md_convert_config_type) -> MarkdownDocument | None:
        """Return a copy of the cached conversion for these inputs, or ``None`` on a miss."""
        from copy import copy

        cached = self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config))
        if cached is None:
            return None
        # Hand out a copy so the caller cannot modify the cached entry.
        return copy(cached)

    def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str,
                     convert_config: x2md_convert_config_type) -> MarkdownDocument:
        """Store a copy of ``convert_result`` for these inputs and return ``convert_result`` itself.

        At most ``CACHE_NUM`` entries are kept; a non-positive ``CACHE_NUM``
        disables caching.
        """
        from copy import copy

        capacity = int(CACHE_NUM)
        if capacity <= 0:
            return convert_result
        hash_code = self._get_hashcode(document, convert_engin, convert_config)
        # Only a brand-new key needs room; overwriting an existing key keeps the size unchanged.
        if hash_code not in self.cache_dict:
            while len(self.cache_dict) >= capacity:
                self.cache_dict.popitem(last=False)
        # Store a snapshot so later in-place changes to convert_result do not leak into the cache.
        self.cache_dict[hash_code] = copy(convert_result)
        return convert_result

    def clear(self):
        """Drop every cached entry."""
        self.cache_dict.clear()


# Shared module-level cache instance.
md_based_convert_cacher = MDBasedCovertCacher()

View File

@@ -1,9 +1,3 @@
from .converter import Document,Converter
from .converter_mineru import ConverterMineru
from docutranslate.global_values import conditional_import
if conditional_import("docling"):
from .converter_docling import ConverterDocling
# 打包docling时取消下面一行注释
# from .converter_docling import ConverterDocling
"""
这个包用来处理document之间的格式转换
"""

View File

@@ -1,27 +0,0 @@
from typing import Protocol
from pathlib import Path
class Document:
def __init__(self,path:Path|str=None,filename:str=None,filebytes:bytes=None):
if path is None and (filename is None or filebytes is None):
raise Exception("Document的路径或filename、filebytes不能同时为空")
self.filebytes = filebytes
self.filename = filename
self.path = path
if path:
if isinstance(path,str):
path=Path(path)
self.path=path
self.filename=path.name
self.filebytes=path.read_bytes()
self.suffix=Path(self.filename).suffix
self.stem=Path(self.filename).stem
class Converter(Protocol):
#转换为markdown
def convert(self,document:Document)->str:
...
async def convert_async(self,document:Document)->str:
...

View File

@@ -0,0 +1,12 @@
from typing import Protocol, runtime_checkable
from docutranslate.ir.document import Document
@runtime_checkable
class Converter(Protocol):
    """Structural interface for objects that turn one ``Document`` into another."""

    def convert(self, document: Document) -> Document:
        """Convert ``document`` and return the resulting document."""

    async def convert_async(self, document: Document) -> Document:
        """Coroutine counterpart of ``convert``."""

View File

View File

@@ -1,8 +1,9 @@
import asyncio
import logging
import os
import time
from dataclasses import dataclass
from io import BytesIO
from logging import Logger
from pathlib import Path
from docling.datamodel.base_models import InputFormat
@@ -13,34 +14,49 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode
from huggingface_hub.errors import LocalEntryNotFoundError
from docutranslate.converter import Converter, Document
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
IMAGE_RESOLUTION_SCALE = 4
class ConverterDocling(Converter):
def __init__(self, code=True, formula=True, artifact=None, logger: logging.Logger | None = None):
self.code = code
self.formula = formula
self.artifact = artifact
self.logger = logger if logger else global_logger
@dataclass(frozen=True)
class ConverterDoclingConfig:
code: bool = True
formula: bool = True
artifact: Path | None = None
def convert(self, document):
assert isinstance(document.filename, str)
class ConverterDocling(X2MarkdownConverter):
def __init__(self, config: ConverterDoclingConfig, logger: Logger = global_logger):
self.config = config
self.code = config.code
self.formula = config.formula
self.artifact = config.artifact
self.logger = logger
def convert(self, document) -> MarkdownDocument:
assert isinstance(document.name, str)
self.logger.info(f"正在将文档转换为markdown")
time1 = time.time()
document_stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes))
result = self.file2markdown_embed_images(document_stream)
document_stream = DocumentStream(name=document.name, stream=BytesIO(document.content))
content = self.file2markdown_embed_images(document_stream)
self.logger.info(f"已转换为markdown耗时{time.time() - time1}")
return result
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
return md_document
async def convert_async(self, document: Document) -> str:
async def convert_async(self, document: Document) -> MarkdownDocument:
return await asyncio.to_thread(
self.convert,
document
)
def support_format(self) -> list[str]:
return [".pdf", ".docx", ".pptx", ".xlsx", ".md", "html", "xhtml", "csv", ".png", ".jpg", ".jpeg", ".tiff",
".bmp", ".webp"]
def file2markdown_embed_images(self, file_path: Path | str | DocumentStream) -> str:
pipeline_options = PdfPipelineOptions(artifacts_path=self.artifact)
pipeline_options.do_ocr = False

View File

@@ -0,0 +1,15 @@
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
class ConverterIdentity(X2MarkdownConverter):
    """Pass-through converter: wraps the input bytes unchanged in a ``MarkdownDocument``."""

    def convert(self, document: Document) -> MarkdownDocument:
        """Return a ``MarkdownDocument`` carrying the same content and stem as ``document``."""
        markdown_fields = {"content": document.content, "suffix": ".md", "stem": document.stem}
        return MarkdownDocument.from_bytes(**markdown_fields)

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Coroutine variant of ``convert``; no awaiting is involved."""
        raw, stem = document.content, document.stem
        return MarkdownDocument.from_bytes(content=raw, suffix=".md", stem=stem)

    def support_format(self) -> list[str]:
        """List the file suffixes this converter accepts."""
        accepted = [".md"]
        return accepted

View File

@@ -1,31 +1,43 @@
import asyncio
import logging
import time
import zipfile
from dataclasses import dataclass
from logging import Logger
import httpx
from docutranslate.converter import Converter, Document
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
URL = 'https://mineru.net/api/v4/file-urls/batch'
@dataclass(frozen=True)
class ConverterMineruConfig:
mineru_token: str
formula: bool = True
timeout = httpx.Timeout(
connect=5.0, # 连接超时 (建立连接的最长时间)
read=200.0, # 读取超时 (等待服务器响应的最长时间)
write=200.0, # 写入超时 (发送数据的最长时间)
pool=1.0 # 从连接池获取连接的超时时间
connect=5.0, # 连接超时 (建立连接的最长时间)
read=200.0, # 读取超时 (等待服务器响应的最长时间)
write=200.0, # 写入超时 (发送数据的最长时间)
pool=1.0 # 从连接池获取连接的超时时间
)
client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False)
client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
client = httpx.Client(trust_env=False,timeout=timeout,proxy=None,verify=False)
client_async=httpx.AsyncClient(trust_env=False,timeout=timeout,proxy=None,verify=False)
# TODO: 提供更详细的logger
class ConverterMineru(Converter):
def __init__(self, token: str, formula=True,logger:logging.Logger|None=None):
self.mineru_token = token.strip()
self.formula = formula
self.logger=logger if logger else global_logger
class ConverterMineru(X2MarkdownConverter):
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
self.config = config
self.mineru_token = config.mineru_token.strip()
self.formula = config.formula
self.logger = logger
def _get_header(self):
return {
@@ -39,7 +51,7 @@ class ConverterMineru(Converter):
"language": "auto",
"enable_table": True,
"files": [
{"name": f"{document.filename}", "is_ocr": True}
{"name": f"{document.name}", "is_ocr": True}
]
}
@@ -54,7 +66,7 @@ class ConverterMineru(Converter):
urls = result["data"]["file_urls"]
# print('batch_id:{},urls:{}'.format(batch_id, urls))
# 获取
res_upload = client.put(urls[0], content=document.filebytes)
res_upload = client.put(urls[0], content=document.content)
res_upload.raise_for_status()
# print(f"{urls[0]} upload success")
return batch_id
@@ -72,7 +84,7 @@ class ConverterMineru(Converter):
urls = result["data"]["file_urls"]
# print('batch_id:{},urls:{}'.format(batch_id, urls))
# 获取
res_upload = await client_async.put(urls[0], content=document.filebytes)
res_upload = await client_async.put(urls[0], content=document.content)
res_upload.raise_for_status()
# print(f"{urls[0]} upload success")
return batch_id
@@ -87,8 +99,8 @@ class ConverterMineru(Converter):
res.raise_for_status()
fileinfo = res.json()["data"]["extract_result"][0]
if fileinfo["state"] == "done":
fileurl = fileinfo["full_zip_url"]
return fileurl
file_url = fileinfo["full_zip_url"]
return file_url
else:
time.sleep(3)
@@ -100,36 +112,40 @@ class ConverterMineru(Converter):
res.raise_for_status()
fileinfo = res.json()["data"]["extract_result"][0]
if fileinfo["state"] == "done":
fileurl = fileinfo["full_zip_url"]
return fileurl
file_url = fileinfo["full_zip_url"]
return file_url
else:
await asyncio.sleep(3)
def convert(self, document: Document) -> str:
def convert(self, document: Document) -> MarkdownDocument:
self.logger.info(f"正在将文档转换为markdown")
time1 = time.time()
batch_id = self.upload(document)
file_url = self.get_file_url(batch_id)
result = get_md_from_zip_url_with_inline_images(zip_url=file_url)
content = get_md_from_zip_url_with_inline_images(zip_url=file_url)
self.logger.info(f"已转换为markdown耗时{time.time() - time1}")
return result
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
return md_document
async def convert_async(self, document: Document) -> str:
# 待优化
async def convert_async(self, document: Document) -> MarkdownDocument:
self.logger.info(f"正在将文档转换为markdown")
time1 = time.time()
batch_id = await self.upload_async(document)
file_url = await self.get_file_url_async(batch_id)
result = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
content = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
self.logger.info(f"已转换为markdown耗时{time.time() - time1}")
return result
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
return md_document
def support_format(self) -> list[str]:
return [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", ".jpeg"]
def get_md_from_zip_url_with_inline_images(
zip_url: str,
filename_in_zip: str = "full.md",
encoding: str = "utf-8"
) -> str | None:
) -> str:
"""
从给定的ZIP文件URL中下载并提取指定文件的内容
并将Markdown文件中的相对路径图片转换为内联Base64图片
@@ -152,7 +168,8 @@ def get_md_from_zip_url_with_inline_images(
except httpx.HTTPStatusError as e:
raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
raise Exception(
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
except httpx.RequestError as e:
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
except zipfile.BadZipFile:
@@ -169,7 +186,7 @@ async def get_md_from_zip_url_with_inline_images_async(
zip_url: str,
filename_in_zip: str = "full.md",
encoding: str = "utf-8"
) -> str | None:
) -> str:
"""
从给定的ZIP文件URL中下载并提取指定文件的内容
并将Markdown文件中的相对路径图片转换为内联Base64图片
@@ -181,18 +198,20 @@ async def get_md_from_zip_url_with_inline_images_async(
encoding (str): 目标文件的预期编码默认为 "utf-8"
Returns:
str | None: 如果成功返回处理后的Markdown文本内容否则返回 None
str : 如果成功返回处理后的Markdown文本内容
"""
try:
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
response = await client_async.get(zip_url) # 增加超时
response.raise_for_status()
print("ZIP文件下载完成。")
return await asyncio.to_thread(embed_inline_image_from_zip,response.content, filename_in_zip=filename_in_zip, encoding=encoding)
return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip,
encoding=encoding)
except httpx.HTTPStatusError as e:
raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
raise Exception(
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
except httpx.RequestError as e:
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
except zipfile.BadZipFile:
@@ -204,5 +223,6 @@ async def get_md_from_zip_url_with_inline_images_async(
traceback.print_exc() # 打印完整的堆栈跟踪,便于调试
raise Exception(f"发生未知错误: {e}")
if __name__ == '__main__':
pass

View File

@@ -0,0 +1,22 @@
from typing import runtime_checkable
from typing import Protocol
from docutranslate.converter.interfaces import Converter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
@runtime_checkable
class X2MarkdownConverter(Converter, Protocol):
    """Interface for converters that turn files of other formats into markdown."""

    def convert(self, document: Document) -> MarkdownDocument:
        """Convert ``document`` and return it as a ``MarkdownDocument``."""

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Coroutine counterpart of ``convert``."""

    def support_format(self) -> list[str]:
        """Return the list of supported input formats as strings."""

View File

@@ -0,0 +1,15 @@
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.utils.markdown_utils import MaskDict, uris2placeholder, placeholder2uris
class MDMaskUrisContext:
    """Context manager that rewrites a markdown document's content on entry and exit.

    On entry the decoded content is passed through ``uris2placeholder``; on
    exit it is passed through ``placeholder2uris``. Both calls share the same
    ``MaskDict`` instance, and the document is modified in place.
    """

    def __init__(self, document: MarkdownDocument):
        self.document = document
        self.mask_dict = MaskDict()

    def __enter__(self):
        text = self.document.content.decode()
        masked = uris2placeholder(text, self.mask_dict)
        self.document.content = masked.encode()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Runs on both normal exit and exceptions; exceptions are not suppressed.
        text = self.document.content.decode()
        restored = placeholder2uris(text, self.mask_dict)
        self.document.content = restored.encode()

View File

@@ -0,0 +1,3 @@
"""
这个包用于将Document导出为其它格式
"""

View File

@@ -0,0 +1,8 @@
from dataclasses import dataclass
@dataclass
class ExportConfig:
    """Base class for exporter option objects; declares no fields of its own."""

View File

@@ -0,0 +1,16 @@
from typing import Protocol, runtime_checkable, TypeVar, Any, Self
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.ir.document import Document
D_in = TypeVar('D_in', bound=Document)
@runtime_checkable
class Exporter(Protocol[D_in]):
    """Interface for objects that export a document of type ``D_in``."""

    @classmethod
    def from_config(cls, export_config: ExportConfig | None = None) -> Self:
        """Alternate constructor taking an optional export configuration."""

    def export(self, document: D_in) -> Any:
        """Export ``document``; the result type is left to the implementation."""

View File

View File

@@ -0,0 +1,12 @@
from typing import Self
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.interfaces import Exporter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
class MDExporter(Exporter):
    """Exporter interface specialised to ``MarkdownDocument`` input."""

    def export(self, document: MarkdownDocument) -> Document:
        """Export the markdown document as a ``Document``; subclasses supply the behaviour."""

View File

@@ -0,0 +1,73 @@
from dataclasses import dataclass
import jinja2
import markdown2
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.md2x.interfaces import MDExporter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.utils.resource_utils import resource_path
@dataclass
class MD2HTMLExportConfig(ExportConfig):
    """Options for exporting markdown to HTML."""

    # True: reference CSS/JS assets via CDN tags; False: inline the bundled static files.
    cdn: bool = True
class MD2HTMLExporter(MDExporter):
    """Render a markdown document into an HTML page using the bundled jinja2 template."""

    def __init__(self, export_config: MD2HTMLExportConfig | None = None):
        # Fall back to default options when no config is supplied.
        export_config = export_config or MD2HTMLExportConfig()
        self.cdn=export_config.cdn

    def export(self, document: MarkdownDocument) -> Document:
        """Convert ``document`` to HTML and return it as a ``.html`` ``Document``.

        When ``self.cdn`` is true, Pico CSS, KaTeX CSS/JS and the KaTeX
        auto-render script are referenced through CDN tags; otherwise the
        files under ``static/`` are read via ``resource_path`` and inlined.
        The mermaid script is always inlined. The returned document keeps the
        input's stem and holds the rendered page encoded as UTF-8.
        """
        cdn = self.cdn
        markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"])
        # language=html
        pico = f'<style>{resource_path("static/pico.css").read_text(encoding="utf-8")}</style>' if not cdn else r'<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/picocss/2.1.1/pico.min.css" integrity="sha512-+4kjFgVD0n6H3xt19Ox84B56MoS7srFn60tgdWFuO4hemtjhySKyW4LnftYZn46k3THUEiTTsbVjrHai+0MOFw==" crossorigin="anonymous" referrerpolicy="no-referrer" />'
        html_template = resource_path("template/markdown.html").read_text(encoding="utf-8")
        katex_css = f'<style>{resource_path("static/katex.css").read_text(encoding="utf-8")}</style>' if not cdn else r"""<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/katex.min.css" integrity="sha512-fHwaWebuwA7NSF5Qg/af4UeDx9XqUpYpOGgubo3yWu+b2IQR4UeQwbb42Ti7gVAjNtVoI/I9TEoYeu9omwcC6g==" crossorigin="anonymous" referrerpolicy="no-referrer" />"""
        katex_js = f'<script>{resource_path("static/katex.js").read_text(encoding="utf-8")}</script>' if not cdn else r"""<script defer src="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/katex.min.js" integrity="sha512-LQNxIMR5rXv7o+b1l8+N1EZMfhG7iFZ9HhnbJkTp4zjNr5Wvst75AqUeFDxeRUa7l5vEDyUiAip//r+EFLLCyA==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>"""
        auto_render = f'<script>{resource_path("static/autoRender.js").read_text(encoding="utf-8")}</script>' if not cdn else r"""<script defer src="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js" integrity="sha512-iWiuBS5nt6r60fCz26Nd0Zqe0nbk1ZTIQbl3Kv7kYsX+yKMUFHzjaH2+AnM6vp2Xs+gNmaBAVWJjSmuPw76Efg==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>"""
        # Inline script that calls renderMathInElement once the DOM is loaded;
        # the non-CDN variant additionally passes `fonts: false`.
        # language=javascript
        render_math_in_element = r"""
<script>
document.addEventListener("DOMContentLoaded", function () {
renderMathInElement(document.body, {
delimiters: [
{left: '$$', right: '$$', display: true},
{left: '\\[', right: '\\]', display: true},
{left: '$', right: '$', display: false},
{left: '\\(', right: '\\)', display: false}
],
throwOnError: false
})
});
</script>""" if cdn else r"""
<script>
document.addEventListener("DOMContentLoaded", function
() {
renderMathInElement(document.body, {
delimiters: [
{left: '$$', right: '$$', display: true},
{left: '\\[', right: '\\]', display: true},
{left: '$', right: '$', display: false},
{left: '\\(', right: '\\)', display: false}
],
fonts: false,
throwOnError: false
})
});
</script>"""
        mermaid = f'<script>{resource_path("static/mermaid.js").read_text(encoding="utf-8")}</script>'
        # Every backslash is doubled before markdown conversion.
        # NOTE(review): presumably so LaTeX backslashes survive markdown2's escape handling - confirm.
        content = markdowner.convert(document.content.decode().replace("\\", "\\\\"))
        # TODO: implement a local (offline) MathJax option
        render = jinja2.Template(html_template).render(
            title=document.stem,
            pico=pico,
            katexCss=katex_css,
            katexJs=katex_js,
            autoRender=auto_render,
            markdown=content,
            renderMathInElement=render_math_in_element,
            mermaid=mermaid,
        )
        return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem)

View File

@@ -0,0 +1,26 @@
from dataclasses import dataclass
from typing import runtime_checkable
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.md2x.interfaces import MDExporter
from docutranslate.ir.markdown_document import MarkdownDocument,Document
from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip
@dataclass
class MD2MDExportConfig(ExportConfig):
    """Options for exporting markdown as markdown."""

    # True: export a single .md with the content unchanged; False: export a .zip built by
    # unembed_base64_images_to_zip.
    embed_images: bool = True
class MD2MDExporter(MDExporter):
    """Export a markdown document either as a single ``.md`` file or as a ``.zip`` archive."""

    def __init__(self, export_config: MD2MDExportConfig | None = None):
        if not export_config:
            # No options supplied: use the defaults.
            export_config = MD2MDExportConfig()
        self.embed_images = export_config.embed_images

    def export(self, document: MarkdownDocument) -> Document:
        """Return the exported document.

        With ``embed_images`` false, the decoded markdown is handed to
        ``unembed_base64_images_to_zip`` and the result is returned with a
        ``.zip`` suffix; otherwise the content is returned unchanged with a
        ``.md`` suffix. The stem is preserved in both cases.
        """
        if not self.embed_images:
            archive = unembed_base64_images_to_zip(document.content.decode(), markdown_name=document.name)
            return Document.from_bytes(suffix=".zip", content=archive, stem=document.stem)
        return Document.from_bytes(suffix=".md", content=document.content, stem=document.stem)

View File

@@ -0,0 +1,4 @@
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
x2md_convert_config_type=ConverterDoclingConfig | ConverterMineruConfig

View File

View File

@@ -0,0 +1,8 @@
from docutranslate.exporter.interfaces import Exporter
from docutranslate.ir.document import Document
# TODO: decide whether TXT needs a dedicated document type of its own
class TXTExporter(Exporter):
    """Exporter interface for plain-text documents."""

    def export(self, document: Document) -> Document:
        """Export ``document`` as another ``Document``; subclasses supply the behaviour."""

View File

@@ -0,0 +1,33 @@
from dataclasses import dataclass
import jinja2
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.txt2x.interfaces import TXTExporter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.utils.resource_utils import resource_path
@dataclass
class TXT2HTMLExportConfig(ExportConfig):
    """Options for exporting plain text to HTML."""

    # True: link Pico CSS via a CDN tag; False: inline the bundled static/pico.css.
    cdn: bool = True
class TXT2HTMLExporter(TXTExporter):
    """Render a text document into an HTML page using the bundled ``template/txt.html``."""

    def __init__(self, export_config: TXT2HTMLExportConfig | None = None):
        # Fall back to default options when no config is supplied.
        export_config = export_config or TXT2HTMLExportConfig()
        self.cdn = export_config.cdn

    def export(self, document: Document) -> Document:
        """Render the template and return the page as a ``.html`` ``Document`` with the same stem."""
        cdn = self.cdn
        html_template = resource_path("template/txt.html").read_text(encoding="utf-8")
        # language=html
        pico = f'<style>{resource_path("static/pico.css").read_text(encoding="utf-8")}</style>' if not cdn else r'<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/picocss/2.1.1/pico.min.css" integrity="sha512-+4kjFgVD0n6H3xt19Ox84B56MoS7srFn60tgdWFuO4hemtjhySKyW4LnftYZn46k3THUEiTTsbVjrHai+0MOFw==" crossorigin="anonymous" referrerpolicy="no-referrer" />'
        # NOTE(review): only the title and stylesheet are passed to the template; the document's
        # text content is not - confirm whether the template is expected to receive it.
        render = jinja2.Template(html_template).render(
            title=document.stem,
            pico=pico,
        )
        return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem)

View File

View File

@@ -0,0 +1,24 @@
import dataclasses
from pathlib import Path
class Document:
def __init__(self,suffix:str,content:bytes,stem:str|None=None,path:Path=None):
self.suffix=suffix
self.content=content
self.stem=stem
self.path=path
@property
def name(self)->str|None:
if not self.stem:
return None
return self.stem+self.suffix
@classmethod
def from_path(cls,path:Path|str):
if isinstance(path,str):
path=Path(path)
return cls(suffix=path.suffix,content=path.read_bytes(),stem=path.stem,path=path)
@classmethod
def from_bytes(cls,content:bytes,suffix:str,stem:str|None):
return cls(content=content,suffix=suffix,stem=stem)

View File

@@ -0,0 +1,7 @@
from docutranslate.ir.document import Document
class MarkdownDocument(Document):
    """``Document`` whose suffix is always ``.md``."""

    def __init__(self,*args,**kwargs):
        super().__init__(*args,**kwargs)
        # Override whatever suffix was passed to the base initialiser.
        self.suffix=".md"

View File

View File

@@ -0,0 +1,51 @@
from abc import ABC, abstractmethod
from logging import Logger
from pathlib import Path
from typing import Self, Generic, TypeVar
from docutranslate.exporter.interfaces import Exporter
from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
T_Translated = TypeVar('T_Translated', bound=Document)
class BaseManager(ABC, Generic[T_Translated]):
    """Abstract base for translation managers: holds the input document and the translated result."""

    def __init__(self, logger: Logger = global_logger):
        self.logger = logger
        # Set by read_path / read_bytes.
        self.document_original: Document | None = None
        # Expected to be set by the subclass's translate implementation.
        self.document_translated: T_Translated | None = None

    def read_path(self, path: Path | str):
        """Load the original document from a file path."""
        self.document_original = Document.from_path(path)

    def read_bytes(self, content: bytes, stem: str, suffix: str):
        """Load the original document from in-memory bytes."""
        self.document_original = Document.from_bytes(content=content, stem=stem, suffix=suffix)

    @abstractmethod
    def translate(self, *args, **kwargs) -> Self:
        """Translate the loaded document; implemented by subclasses."""

    @abstractmethod
    async def translate_async(self, *args, **kwargs) -> Self:
        """Coroutine counterpart of ``translate``; implemented by subclasses."""

    def _export(self, exporter: Exporter) -> Document:
        """Run ``exporter`` on the translated document.

        Raises:
            RuntimeError: If no translated document is available yet.
        """
        translated = self.document_translated
        if translated is None:
            raise RuntimeError("Document has not been translated yet. Call translate() first.")
        return exporter.export(translated)

    def _save(self, exporter: Exporter, name: str = None, out_put_dir: Path | str = "./output"):
        """Export the translated document and write it under ``out_put_dir``.

        ``name`` defaults to the exported document's own name. Missing parent
        directories are created. Returns ``self``.
        """
        exported = self._export(exporter)
        file_name = name if name else exported.name
        destination = Path(out_put_dir) / Path(file_name)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(exported.content)
        self.logger.info(f"文件已保存到{destination.resolve()}")
        return self

    @abstractmethod
    def support_export_format(self) -> list[str]:
        """Return the export formats this manager supports; implemented by subclasses."""

View File

@@ -0,0 +1,34 @@
from pathlib import Path
from typing import Protocol, runtime_checkable, Self, TypeVar
from docutranslate.exporter.export_config import ExportConfig
T = TypeVar("T", bound=ExportConfig)
@runtime_checkable
class HTMLExportable(Protocol):
def export_to_html(self, export_config: T) -> str:
...
def save_as_html(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
...
@runtime_checkable
class MDExportable(Protocol):
def export_to_markdown(self, export_config: T) -> str:
...
def save_as_markdown(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
...
@runtime_checkable
class TXTExportable(Protocol):
def export_to_txt(self, export_config: T) -> str:
...
def save_as_txt(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
...

View File

@@ -0,0 +1,102 @@
import asyncio
from pathlib import Path
from typing import Self, Literal, overload
from docutranslate.cacher import md_based_convert_cacher
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
from docutranslate.exporter.md2x.types import x2md_convert_config_type
from docutranslate.manager.base_manager import BaseManager
from docutranslate.manager.interfaces import HTMLExportable, MDExportable
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
    """Manager that translates a document through a Markdown intermediate form.

    The original document is converted to Markdown (or fetched from the
    conversion cache), translated by ``MDTranslator`` and can afterwards be
    exported or saved as HTML or Markdown.
    """

    def support_export_format(self) -> list[str]:
        """Return the file suffixes this manager reports as exportable."""
        return [".md", ".html", ".zip"]

    def _get_document_md(self, convert_engin, convert_config):
        """Return the Markdown form of the original document.

        A cached conversion result is returned when one exists; otherwise the
        converter selected by ``convert_engin`` is run and its result is stored
        in the cache before being returned.

        Args:
            convert_engin: ``None`` (identity conversion), ``"mineru"`` or
                ``"docling"``.
            convert_config: Configuration object matching ``convert_engin``.

        Raises:
            RuntimeError: If no document has been read yet, or if
                ``convert_config`` is not of the type the engine requires.
            ValueError: If ``convert_engin`` names an unknown engine.
        """
        if self.document_original is None:
            raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
        # Look for an already converted document in the cache.
        document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
                                                                    convert_config)
        if document_cached:
            return document_cached
        # Cache miss: pick the converter for the requested engine.
        if convert_engin is None:
            converter = ConverterIdentity()
        elif convert_engin == "mineru":
            if not isinstance(convert_config, ConverterMineruConfig):
                raise RuntimeError(f"未传入正确的convert_config应传入{ConverterMineruConfig}")
            converter = ConverterMineru(convert_config, logger=self.logger)
        elif convert_engin == "docling":
            if not isinstance(convert_config, ConverterDoclingConfig):
                raise RuntimeError(f"未传入正确的convert_config应传入{ConverterDoclingConfig}")
            converter = ConverterDocling(convert_config, logger=self.logger)
        else:
            raise ValueError(f"不存在{convert_engin}解析引擎")
        document_md = converter.convert(self.document_original)
        # Store the freshly converted document in the cache.
        md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
        return document_md

    def _get_translation_copy(self, convert_engin, convert_config):
        """Return a shallow copy of the converted document that is safe to translate.

        The translator replaces ``document.content`` in place. Translating the
        object that was just stored in (or fetched from) the conversion cache
        would leave translated text behind for the next cache hit, so the
        translation works on a copy, mirroring what ``TXTManager`` does with its
        original document.
        NOTE(review): if the cacher already copies on store/load this copy is
        merely redundant.
        """
        from copy import copy  # local import: the module's import block is not in view

        return copy(self._get_document_md(convert_engin, convert_config))

    @overload
    def translate(self, convert_engin: None,
                  convert_config: None, translate_config: MDTranslateConfig) -> Self:
        ...

    @overload
    def translate(self, convert_engin: Literal["docling"],
                  convert_config: ConverterDoclingConfig, translate_config: MDTranslateConfig) -> Self:
        ...

    @overload
    def translate(self, convert_engin: Literal["mineru"],
                  convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self:
        ...

    def translate(self, convert_engin: Literal["mineru", "docling"] | None,
                  convert_config: x2md_convert_config_type | None,
                  translate_config: MDTranslateConfig) -> Self:
        """Convert the original document to Markdown and translate it.

        The translated document is stored in ``self.document_translated``.

        Returns:
            This manager, to allow call chaining.
        """
        document_md = self._get_translation_copy(convert_engin, convert_config)
        # Translate the converted document (the translator edits it in place).
        translator = MDTranslator(translate_config)
        translator.translate(document_md)
        self.document_translated = document_md
        return self

    async def translate_async(self, convert_engin: Literal["mineru", "docling"] | None,
                              convert_config: x2md_convert_config_type | None,
                              translate_config: MDTranslateConfig) -> Self:
        """Asynchronous variant of :meth:`translate`.

        The conversion step runs in a worker thread via ``asyncio.to_thread``;
        the translation itself is awaited.

        Returns:
            This manager, to allow call chaining.
        """
        document_md = await asyncio.to_thread(self._get_translation_copy, convert_engin, convert_config)
        # Translate the converted document (the translator edits it in place).
        translator = MDTranslator(translate_config)
        await translator.translate_async(document_md)
        self.document_translated = document_md
        return self

    def export_to_html(self, export_config: MD2HTMLExportConfig | None = None) -> str:
        """Export through ``MD2HTMLExporter`` and return the decoded result."""
        docu = self._export(MD2HTMLExporter(export_config))
        return docu.content.decode()

    def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str:
        """Export through ``MD2MDExporter`` and return the decoded result."""
        docu = self._export(MD2MDExporter(export_config))
        return docu.content.decode()

    def save_as_html(self, name: str | None = None, out_put_dir: Path | str = "./output",
                     export_config: MD2HTMLExportConfig | None = None) -> Self:
        """Save via ``MD2HTMLExporter`` into ``out_put_dir`` and return this manager."""
        self._save(exporter=MD2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir)
        return self

    def save_as_markdown(self, name: str | None = None, out_put_dir: Path | str = "./output",
                         export_config: MD2MDExportConfig | None = None) -> Self:
        """Save via ``MD2MDExporter`` into ``out_put_dir`` and return this manager."""
        self._save(exporter=MD2MDExporter(export_config), name=name, out_put_dir=out_put_dir)
        return self

View File

@@ -0,0 +1,66 @@
from copy import copy
from dataclasses import dataclass
from logging import Logger
from pathlib import Path
from typing import Self
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
from docutranslate.manager.base_manager import BaseManager
from docutranslate.manager.interfaces import HTMLExportable
from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
@dataclass
class TXTManagerConfig:
chunk_size: int = 3000
base_url: str | None = None
api_key = None,
model_id: str | None = None
temperature = 0.7
concurrent: int = 30
timeout = 2000
cache = True
logger: Logger | None = None
class TXTManager(BaseManager, HTMLExportable):
    """Manager that translates plain-text documents and exports them as TXT or HTML."""

    def support_export_format(self) -> list[str]:
        """Return the file suffixes this manager reports as exportable."""
        return [".txt", ".html"]

    def _copy_original(self):
        """Return a shallow copy of the original document to translate in place.

        Raises:
            RuntimeError: If no document has been read yet.
        """
        if self.document_original is None:
            # Fail early with a clear message instead of an AttributeError from
            # deep inside the translator.
            raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
        return copy(self.document_original)

    def _require_translated(self):
        """Return the translated document.

        Raises:
            RuntimeError: If nothing has been translated yet.
        """
        if self.document_translated is None:
            raise RuntimeError("Document has not been translated yet. Call translate() first.")
        return self.document_translated

    def translate(self, translate_config: TXTTranslateConfig) -> Self:
        """Translate a copy of the original document and keep it as the result.

        Returns:
            This manager, to allow call chaining.

        Raises:
            RuntimeError: If no document has been read yet.
        """
        document = self._copy_original()
        # The translator rewrites the document content in place.
        translator = TXTTranslator(translate_config)
        translator.translate(document)
        self.document_translated = document
        return self

    async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
        """Asynchronous variant of :meth:`translate`.

        Returns:
            This manager, to allow call chaining.

        Raises:
            RuntimeError: If no document has been read yet.
        """
        document = self._copy_original()
        # The translator rewrites the document content in place.
        translator = TXTTranslator(translate_config)
        await translator.translate_async(document)
        self.document_translated = document
        return self

    def export_to_html(self, export_config: TXT2HTMLExportConfig | None = None) -> str:
        """Export through ``TXT2HTMLExporter`` and return the decoded result.

        ``export_config`` is optional, matching :meth:`save_as_html`, which
        already hands a possibly-``None`` config to the same exporter.
        """
        docu = self._export(TXT2HTMLExporter(export_config))
        return docu.content.decode()

    def export_to_txt(self) -> str:
        """Return the translated text as a string.

        Raises:
            RuntimeError: If nothing has been translated yet.
        """
        return self._require_translated().content.decode()

    def save_as_html(self, name: str | None = None, out_put_dir: Path | str = "./output",
                     export_config: TXT2HTMLExportConfig | None = None) -> Self:
        """Save via ``TXT2HTMLExporter`` into ``out_put_dir`` and return this manager."""
        self._save(exporter=TXT2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir)
        return self

    def save_as_txt(self, name: str | None = None, out_put_dir: Path | str = "./output", ) -> Self:
        """Write the translated bytes to ``out_put_dir / name``.

        Args:
            name: Target file name; defaults to the translated document's name.
            out_put_dir: Directory to write into; created when missing.

        Returns:
            This manager, to allow call chaining.

        Raises:
            RuntimeError: If nothing has been translated yet.
        """
        document = self._require_translated()
        name = name or document.name
        output_path = Path(out_put_dir) / Path(name)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(document.content)
        self.logger.info(f"文件已保存到{output_path.resolve()}")
        return self

View File

@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{{ title }}</title>
{{pico}}
<style>
html {
padding: 2vh 10vw;
font-size: 15px;
}
</style>
</head>
<body>
{{ body }}
</body>
</html>

View File

@@ -11,12 +11,12 @@ import markdown2
from docutranslate.agents import Agent, AgentArgs
from docutranslate.agents import MDRefineAgent, MDTranslateAgent
from docutranslate.cacher import document_cacher_global
from docutranslate.converter import Document, ConverterMineru
from docutranslate.cacher import md_based_convert_cacher
from docutranslate.ir.document import Document
from docutranslate.global_values import available_packages
from docutranslate.logger import global_logger
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2uris, MaskDict, clean_markdown_math_block, \
unembed_base64_images_to_zip, embed_inline_image_from_zip, find_markdown_in_zip
from docutranslate.utils.resource_utils import resource_path
@@ -62,7 +62,7 @@ class FileTranslater:
self.timeout = timeout
self.document: Document | None = None
self.cache = cache
self.cacher = document_cacher_global
self.cacher = md_based_convert_cacher
if file_path:
self.read_file(file_path=file_path)
@@ -79,7 +79,7 @@ class FileTranslater:
return self
def _unmask_uris_in_markdown(self):
self.markdown = placeholder2_uris(self.markdown, self._mask_dict)
self.markdown = placeholder2uris(self.markdown, self._mask_dict)
return self
def _split_markdown_into_chunks(self) -> list[str]:

View File

View File

@@ -0,0 +1,16 @@
from dataclasses import dataclass
from logging import Logger
@dataclass
class AiTranslateConfig:
    """Common settings for the AI-backed translators.

    The MD and TXT translators forward these values to their translate agent
    and use ``chunk_size`` when splitting the document text.
    """
    # Endpoint of the model service; forwarded to the agent as ``baseurl``.
    base_url: str
    # Credential for the model service; forwarded to the agent as ``key``.
    api_key: str
    # Identifier of the model to call.
    model_id: str
    # Target language of the translation.
    to_lang: str
    # Optional extra prompt text forwarded to the agent.
    custom_prompt: str | None = None
    # Sampling temperature forwarded to the agent.
    temperature: float = 0.7
    # Request timeout forwarded to the agent (presumably seconds — TODO confirm).
    timeout: int = 2000
    # Maximum block size handed to the text splitter.
    chunk_size: int = 3000
    # Forwarded to the agent as ``max_concurrent``.
    concurrent: int = 30
    # Logger to use; the translators fall back to the global logger when None.
    logger: Logger | None = None

View File

@@ -0,0 +1,21 @@
from typing import runtime_checkable, Protocol, TypeVar
from docutranslate.agents import Agent
from docutranslate.ir.document import Document
# T: the document type a translator accepts.
T=TypeVar('T',bound=Document)
# V: an agent type parameter; no method of the protocol below references it.
V=TypeVar('V',bound=Agent)
@runtime_checkable
class Translator(Protocol[T,V]):
    """
    Translates the intermediate text in place; a Translator performs no format conversion.
    """
    # NOTE(review): the methods below declare ``Document`` as return type, whereas
    # the MDTranslator/TXTTranslator implementations return the translator
    # itself — confirm which contract is intended.
    def translate(self, document:T) -> Document:
        """Translate ``document`` synchronously."""
        ...
    async def translate_async(self, document: T) -> Document:
        """Translate ``document`` asynchronously."""
        ...
    def log(self,info:str):
        """Emit the message ``info``."""
        ...

View File

@@ -0,0 +1,70 @@
import asyncio
from dataclasses import dataclass
from logging import Logger
from typing import Self
from docutranslate.agents import MDTranslateAgent
from docutranslate.document_context.md_mask_context import MDMaskUrisContext
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
from docutranslate.translater.base import AiTranslateConfig
from docutranslate.translater.interfaces import Translator
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
from docutranslate.utils.markdown_utils import clean_markdown_math_block
@dataclass
class MDTranslateConfig(AiTranslateConfig):
    """Configuration accepted by ``MDTranslator``; adds no fields to ``AiTranslateConfig``."""
    ...
class MDTranslator(Translator):
    """Translates a Markdown document in place using ``MDTranslateAgent``."""

    def __init__(self, config: MDTranslateConfig):
        """Build the translate agent from ``config``.

        Args:
            config: Translation settings; ``config.logger`` falls back to the
                global logger when it is ``None``.
        """
        self.logger = config.logger or global_logger
        self.chunk_size = config.chunk_size
        self.translate_agent = MDTranslateAgent(custom_prompt=config.custom_prompt,
                                                to_lang=config.to_lang,
                                                baseurl=config.base_url,
                                                key=config.api_key,
                                                model_id=config.model_id,
                                                system_prompt=None,
                                                temperature=config.temperature,
                                                max_concurrent=config.concurrent,
                                                timeout=config.timeout,
                                                logger=self.logger)

    @staticmethod
    def _post_process(translated_chunks: list[str]) -> bytes:
        """Join translated chunks and normalise the result.

        Shared by the synchronous and asynchronous paths so both apply exactly
        the same clean-up.
        """
        content = join_markdown_texts(translated_chunks)
        # Robustness: collapse doubled backslashes in front of inline-math
        # delimiters, i.e. ``\\(`` -> ``\(`` and ``\\)`` -> ``\)``.
        content = content.replace(r'\\(', r'\(')
        content = content.replace(r'\\)', r'\)')
        content = clean_markdown_math_block(content)
        return content.encode()

    def translate(self, document: MarkdownDocument) -> Self:
        """Translate ``document`` in place.

        The document text is split into chunks of at most ``chunk_size``, sent
        to the agent, and the joined result replaces ``document.content``. All
        of this happens inside ``MDMaskUrisContext``.

        Returns:
            This translator.
        """
        self.logger.info("正在翻译markdown")
        with MDMaskUrisContext(document):
            chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
            self.logger.info(f"markdown分为{len(chunks)}")
            result: list[str] = self.translate_agent.send_prompts(chunks)
            # Assign inside the context so it sees the translated content on exit.
            document.content = self._post_process(result)
        self.logger.info("翻译完成")
        return self

    async def translate_async(self, document: MarkdownDocument) -> Self:
        """Asynchronous variant of :meth:`translate`.

        The agent call is awaited and the post-processing runs in a worker
        thread via ``asyncio.to_thread``.

        Returns:
            This translator.
        """
        self.logger.info("正在翻译markdown")
        with MDMaskUrisContext(document):
            chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
            self.logger.info(f"markdown分为{len(chunks)}")
            result: list[str] = await self.translate_agent.send_prompts_async(chunks)
            # Assign inside the context so it sees the translated content on exit.
            document.content = await asyncio.to_thread(self._post_process, result)
        self.logger.info("翻译完成")
        return self

View File

@@ -0,0 +1,50 @@
from dataclasses import dataclass
from typing import Self
from docutranslate.agents.txt_agent import TXTTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
from docutranslate.translater.base import AiTranslateConfig
from docutranslate.translater.interfaces import Translator
from docutranslate.utils.markdown_splitter import split_markdown_text
@dataclass
class TXTTranslateConfig(AiTranslateConfig):
    """Configuration accepted by ``TXTTranslator``; adds no fields to ``AiTranslateConfig``."""
    ...
class TXTTranslator(Translator):
    """Translates a plain-text document in place using ``TXTTranslateAgent``."""

    def __init__(self, config: TXTTranslateConfig):
        """Create the translate agent from ``config``.

        ``config.logger`` is used when set; otherwise the global logger is.
        """
        self.logger = config.logger or global_logger
        self.chunk_size = config.chunk_size
        agent_options = dict(
            custom_prompt=config.custom_prompt,
            to_lang=config.to_lang,
            baseurl=config.base_url,
            key=config.api_key,
            model_id=config.model_id,
            system_prompt=None,
            temperature=config.temperature,
            max_concurrent=config.concurrent,
            timeout=config.timeout,
            logger=self.logger,
        )
        self.translate_agent = TXTTranslateAgent(**agent_options)

    def _split(self, document: Document) -> list[str]:
        """Log the start of the job and cut the document text into chunks."""
        self.logger.info("正在翻译txt")
        text = document.content.decode()
        pieces: list[str] = split_markdown_text(text, max_block_size=self.chunk_size)
        self.logger.info(f"txt分为{len(pieces)}")
        return pieces

    def _store(self, document: Document, translated: list[str]) -> None:
        """Write the newline-joined translation back and log completion."""
        merged = "\n".join(translated)
        document.content = merged.encode()
        self.logger.info("翻译完成")

    def translate(self, document: Document) -> Self:
        """Translate ``document`` in place and return this translator."""
        pieces = self._split(document)
        translated: list[str] = self.translate_agent.send_prompts(pieces)
        self._store(document, translated)
        return self

    async def translate_async(self, document: Document) -> Self:
        """Asynchronous variant of :meth:`translate`; awaits the agent call."""
        pieces = self._split(document)
        translated: list[str] = await self.translate_agent.send_prompts_async(pieces)
        self._store(document, translated)
        return self

View File

@@ -218,7 +218,7 @@ class MarkdownBlockSplitter:
return result
def split_markdown_text(markdown_text, max_block_size=5000):
def split_markdown_text(markdown_text:str, max_block_size=5000):
"""
将Markdown字符串分割成不超过max_block_size的块
可以通过简单拼接重建原始文本(分割的代码块除外)

View File

@@ -69,7 +69,7 @@ def uris2placeholder(markdown: str, mask_dict: MaskDict):
return markdown
def placeholder2_uris(markdown: str, mask_dict: MaskDict):
def placeholder2uris(markdown: str, mask_dict: MaskDict):
def placeholder2uri(match: re.Match):
id = match.group(1)
uri = mask_dict.get(id)