重构代码,新增了MarkdownBasedManager和TXTManager实现
This commit is contained in:
@@ -2,4 +2,4 @@ __version__="0.3.4b1"
|
||||
|
||||
|
||||
|
||||
from .translater import FileTranslater
|
||||
# from .translater import FileTranslater
|
||||
@@ -17,23 +17,23 @@ class AgentArgs(TypedDict, total=False):
|
||||
baseurl: str
|
||||
key: str
|
||||
model_id: str
|
||||
system_prompt: str
|
||||
system_prompt: str | None
|
||||
temperature: float
|
||||
max_concurrent: int
|
||||
timeout: int
|
||||
logger:logging.Logger
|
||||
logger: logging.Logger
|
||||
|
||||
|
||||
class TotalErrorCounter:
|
||||
def __init__(self,logger:logging.Logger):
|
||||
def __init__(self, logger: logging.Logger):
|
||||
self.lock = Lock()
|
||||
self.count = 0
|
||||
self.logger=logger
|
||||
self.logger = logger
|
||||
|
||||
def add(self):
|
||||
self.lock.acquire()
|
||||
self.count += 1
|
||||
if self.count>MAX_TOTAL_ERROR_COUNT:
|
||||
if self.count > MAX_TOTAL_ERROR_COUNT:
|
||||
self.logger.info(f"错误响应过多")
|
||||
self.lock.release()
|
||||
return self.reach_limit()
|
||||
@@ -42,14 +42,13 @@ class TotalErrorCounter:
|
||||
return self.count > MAX_TOTAL_ERROR_COUNT
|
||||
|
||||
|
||||
|
||||
# 仅使用多线程时用以计数
|
||||
class PromptsCounter:
|
||||
def __init__(self, total: int,logger:logging.Logger):
|
||||
def __init__(self, total: int, logger: logging.Logger):
|
||||
self.lock = Lock()
|
||||
self.count = 0
|
||||
self.total = total
|
||||
self.logger=logger
|
||||
self.logger = logger
|
||||
|
||||
def add(self):
|
||||
self.lock.acquire()
|
||||
@@ -62,22 +61,23 @@ TIMEOUT = 600
|
||||
|
||||
|
||||
class Agent:
|
||||
def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7,
|
||||
max_concurrent=15, timeout: int = TIMEOUT,logger:logging.Logger|None=None):
|
||||
def __init__(self, baseurl: str, key: str | None, model_id: str, system_prompt: str | None = None, temperature=0.7,
|
||||
max_concurrent=15, timeout: int = TIMEOUT, logger: logging.Logger | None = None):
|
||||
self.baseurl = baseurl.strip()
|
||||
if self.baseurl.endswith("/"):
|
||||
self.baseurl = self.baseurl[:-1]
|
||||
self.key = key.strip()
|
||||
self.key = key.strip() or "xx"
|
||||
self.model_id = model_id.strip()
|
||||
self.system_prompt = system_prompt
|
||||
self.system_prompt = system_prompt or ""
|
||||
self.temperature = temperature
|
||||
self.client = httpx.Client(trust_env=False, proxy=None, verify=False)
|
||||
self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False)
|
||||
self.max_concurrent = max_concurrent
|
||||
self.timeout = timeout
|
||||
|
||||
self.logger=logger if logger else global_logger
|
||||
self.logger = logger if logger else global_logger
|
||||
self.total_error_counter = TotalErrorCounter(logger=self.logger)
|
||||
|
||||
def _prepare_request_data(self, prompt: str, system_prompt: str, temperature=None, top_p=0.9):
|
||||
if temperature is None:
|
||||
temperature = self.temperature
|
||||
@@ -210,7 +210,7 @@ class Agent:
|
||||
system_prompt: str | None = None,
|
||||
) -> list[str]:
|
||||
system_prompts = [system_prompt] * len(prompts)
|
||||
counts = [PromptsCounter(len(prompts),self.logger)] * len(prompts)
|
||||
counts = [PromptsCounter(len(prompts), self.logger)] * len(prompts)
|
||||
output_list = []
|
||||
with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor:
|
||||
results_iterator = executor.map(self._send_prompt_count, prompts, system_prompts, counts)
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
from typing import Unpack
|
||||
from typing import Unpack, NotRequired
|
||||
|
||||
from .agent import (Agent, AgentArgs)
|
||||
from .agent import Agent, AgentArgs
|
||||
|
||||
class MDTranslateAgentArgs(AgentArgs, total=True):
|
||||
to_lang:str
|
||||
custom_prompt:NotRequired[str]
|
||||
|
||||
class MDRefineAgent(Agent):
|
||||
def __init__(self, custom_prompt=None, **kwargs: Unpack[AgentArgs]):
|
||||
|
||||
29
docutranslate/agents/txt_agent.py
Normal file
29
docutranslate/agents/txt_agent.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from typing import NotRequired, Unpack
|
||||
|
||||
from docutranslate.agents import AgentArgs, Agent
|
||||
|
||||
|
||||
class TXTTranslateAgentArgs(AgentArgs, total=True):
    """``AgentArgs`` extended with the translation-specific keys below."""
    # Target language; a required key because this TypedDict is declared total.
    to_lang: str
    # Extra instructions for the prompt; this key may be left out.
    custom_prompt: NotRequired[str]
|
||||
|
||||
|
||||
class TXTTranslateAgent(Agent):
    """Agent whose system prompt tells the model to translate plain txt text.

    The prompt names the target language, asks for output without explanations
    or notes, and forbids altering placeholders of the form ``<ph-xxxxxx>``.
    """

    def __init__(self, custom_prompt=None, to_lang="中文", **kwargs: Unpack[AgentArgs]):
        """Initialise the base agent, then install the translation prompt.

        Args:
            custom_prompt: Optional extra rules or background, appended to the
                prompt under its own heading when truthy.
            to_lang: Target language interpolated into the prompt.
            **kwargs: Passed through unchanged to ``Agent.__init__``.
        """
        super().__init__(**kwargs)
        prompt_parts = [f"""
# 角色
你是一个专业的机器翻译引擎
# 工作
翻译输入的txt文本
目标语言{to_lang}
# 要求
翻译要求专业准确
不输出任何解释和注释
不能改变形如<ph-xxxxxx>的占位符
# 输出
翻译后的txt译文纯文本
"""]
        if custom_prompt:
            prompt_parts.append("\n# 重要规则或背景【非常重要】\n" + custom_prompt + '\n')
        # The literal marker below always comes last, after any custom rules.
        prompt_parts.append(r'\no_think')
        self.system_prompt = "".join(prompt_parts)
|
||||
@@ -1 +1 @@
|
||||
from .document_cacher import DocumentCacher, document_cacher_global
|
||||
from .md_based_convert_cacher import MDBasedCovertCacher, md_based_convert_cacher
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
|
||||
from docutranslate.converter import Document
|
||||
|
||||
CACHE_NUM=os.getenv("DOCUTRANSLATE_CACHE_NUM",default="10")
|
||||
|
||||
class DocumentCacher:
|
||||
def __init__(self):
|
||||
self.cache_dict = OrderedDict()
|
||||
@staticmethod
|
||||
def _get_hashcode(document: Document, formula: bool, code: bool, convert_engin: str) -> str:
|
||||
obj = (document.suffix, document.filebytes, formula, code, convert_engin)
|
||||
return str(hash(obj))
|
||||
|
||||
def get_cached_result(self, document: Document, formula: bool, code: bool, convert_engin: str)->str|None:
|
||||
return self.cache_dict.get(self._get_hashcode(document, formula, code, convert_engin))
|
||||
|
||||
def cache_result(self, result: str, document: Document, formula: bool, code: bool, convert_engin: str):
|
||||
hash_code = self._get_hashcode(document, formula, code, convert_engin)
|
||||
if len(self.cache_dict)>=int(CACHE_NUM):
|
||||
self.cache_dict.popitem(last=False)
|
||||
self.cache_dict[hash_code] = result
|
||||
return result
|
||||
|
||||
def clear(self):
|
||||
self.cache_dict.clear()
|
||||
|
||||
|
||||
document_cacher_global = DocumentCacher()
|
||||
36
docutranslate/cacher/md_based_convert_cacher.py
Normal file
36
docutranslate/cacher/md_based_convert_cacher.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
|
||||
from docutranslate.exporter.md2x.types import x2md_convert_config_type
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
|
||||
CACHE_NUM = os.getenv("DOCUTRANSLATE_CACHE_NUM", default="10")


class MDBasedCovertCacher:
    """Bounded in-memory cache of document-to-markdown conversion results.

    Entries are keyed by a hash of the source document's suffix and content,
    the conversion engine name and the engine configuration. At most
    ``int(CACHE_NUM)`` entries are kept; the oldest inserted entry is evicted
    first.
    """

    def __init__(self):
        # Insertion-ordered so the oldest entry can be dropped with popitem(last=False).
        self.cache_dict = OrderedDict()

    @staticmethod
    def _get_hashcode(document: Document, convert_engin: str, convert_config: x2md_convert_config_type) -> str:
        """Return the cache key for one (document, engine, config) combination."""
        obj = (document.suffix, document.content, convert_engin, convert_config)
        return str(hash(obj))

    def get_cached_result(self, document: Document, convert_engin: str,
                          convert_config: x2md_convert_config_type) -> MarkdownDocument | None:
        """Return the cached conversion result, or ``None`` when nothing is cached."""
        return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config))

    def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str,
                     convert_config: x2md_convert_config_type) -> MarkdownDocument:
        """Store ``convert_result`` for the given inputs and return it unchanged.

        The cache never holds more than ``int(CACHE_NUM)`` entries. A limit of
        zero or less disables caching instead of failing on an empty cache.
        """
        limit = int(CACHE_NUM)
        if limit <= 0:
            return convert_result
        hash_code = self._get_hashcode(document, convert_engin, convert_config)
        # Overwriting an existing key does not grow the cache, so only evict
        # when a new key is about to be added. ">=" keeps the size at the
        # limit; the previous ">" let it grow to limit + 1.
        if hash_code not in self.cache_dict:
            while len(self.cache_dict) >= limit:
                self.cache_dict.popitem(last=False)
        self.cache_dict[hash_code] = convert_result
        return convert_result

    def clear(self):
        """Drop every cached entry."""
        self.cache_dict.clear()
|
||||
|
||||
|
||||
md_based_convert_cacher = MDBasedCovertCacher()
|
||||
@@ -1,9 +1,3 @@
|
||||
from .converter import Document,Converter
|
||||
from .converter_mineru import ConverterMineru
|
||||
|
||||
from docutranslate.global_values import conditional_import
|
||||
if conditional_import("docling"):
|
||||
from .converter_docling import ConverterDocling
|
||||
|
||||
# 打包docling时取消下面一行注释
|
||||
# from .converter_docling import ConverterDocling
|
||||
"""
|
||||
这个包用来处理document之间的格式转换
|
||||
"""
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
from typing import Protocol
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class Document:
|
||||
def __init__(self,path:Path|str=None,filename:str=None,filebytes:bytes=None):
|
||||
if path is None and (filename is None or filebytes is None):
|
||||
raise Exception("Document的路径或filename、filebytes不能同时为空")
|
||||
self.filebytes = filebytes
|
||||
self.filename = filename
|
||||
self.path = path
|
||||
if path:
|
||||
if isinstance(path,str):
|
||||
path=Path(path)
|
||||
self.path=path
|
||||
self.filename=path.name
|
||||
self.filebytes=path.read_bytes()
|
||||
self.suffix=Path(self.filename).suffix
|
||||
self.stem=Path(self.filename).stem
|
||||
|
||||
class Converter(Protocol):
|
||||
#转换为markdown
|
||||
def convert(self,document:Document)->str:
|
||||
...
|
||||
|
||||
async def convert_async(self,document:Document)->str:
|
||||
...
|
||||
12
docutranslate/converter/interfaces.py
Normal file
12
docutranslate/converter/interfaces.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from typing import Protocol, runtime_checkable
|
||||
|
||||
from docutranslate.ir.document import Document
|
||||
|
||||
|
||||
@runtime_checkable
class Converter(Protocol):
    """Structural interface for objects that turn one ``Document`` into another."""

    def convert(self, document: Document) -> Document:
        """Convert ``document`` and return the resulting document."""
        ...

    async def convert_async(self, document: Document) -> Document:
        """Coroutine counterpart of ``convert`` with the same parameter and return type."""
        ...
|
||||
0
docutranslate/converter/x2md/__init__.py
Normal file
0
docutranslate/converter/x2md/__init__.py
Normal file
@@ -1,8 +1,9 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from logging import Logger
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
@@ -13,34 +14,49 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from huggingface_hub.errors import LocalEntryNotFoundError
|
||||
|
||||
from docutranslate.converter import Converter, Document
|
||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
from docutranslate.logger import global_logger
|
||||
|
||||
IMAGE_RESOLUTION_SCALE = 4
|
||||
|
||||
|
||||
class ConverterDocling(Converter):
|
||||
def __init__(self, code=True, formula=True, artifact=None, logger: logging.Logger | None = None):
|
||||
self.code = code
|
||||
self.formula = formula
|
||||
self.artifact = artifact
|
||||
self.logger = logger if logger else global_logger
|
||||
@dataclass(frozen=True)
class ConverterDoclingConfig:
    """Frozen (immutable, hashable) settings object passed to ``ConverterDocling``."""
    # Copied onto the converter as ``code``; presumably toggles code handling - TODO confirm.
    code: bool = True
    # Copied onto the converter as ``formula``; presumably toggles formula handling - TODO confirm.
    formula: bool = True
    # Copied onto the converter as ``artifact``, which it hands to PdfPipelineOptions(artifacts_path=...).
    artifact: Path | None = None
|
||||
|
||||
def convert(self, document):
|
||||
assert isinstance(document.filename, str)
|
||||
|
||||
class ConverterDocling(X2MarkdownConverter):
|
||||
def __init__(self, config: ConverterDoclingConfig, logger: Logger = global_logger):
|
||||
self.config = config
|
||||
self.code = config.code
|
||||
self.formula = config.formula
|
||||
self.artifact = config.artifact
|
||||
self.logger = logger
|
||||
|
||||
def convert(self, document) -> MarkdownDocument:
|
||||
assert isinstance(document.name, str)
|
||||
self.logger.info(f"正在将文档转换为markdown")
|
||||
time1 = time.time()
|
||||
document_stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes))
|
||||
result = self.file2markdown_embed_images(document_stream)
|
||||
document_stream = DocumentStream(name=document.name, stream=BytesIO(document.content))
|
||||
content = self.file2markdown_embed_images(document_stream)
|
||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||
return result
|
||||
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
||||
return md_document
|
||||
|
||||
async def convert_async(self, document: Document) -> str:
|
||||
async def convert_async(self, document: Document) -> MarkdownDocument:
|
||||
return await asyncio.to_thread(
|
||||
self.convert,
|
||||
document
|
||||
)
|
||||
|
||||
def support_format(self) -> list[str]:
    """Return the file suffixes this converter accepts, each with a leading dot."""
    # Every entry carries the dot so it compares equal to a path-style suffix
    # such as ".html"; the html, xhtml and csv entries previously lacked it.
    return [".pdf", ".docx", ".pptx", ".xlsx", ".md", ".html", ".xhtml", ".csv", ".png", ".jpg", ".jpeg", ".tiff",
            ".bmp", ".webp"]
|
||||
|
||||
def file2markdown_embed_images(self, file_path: Path | str | DocumentStream) -> str:
|
||||
pipeline_options = PdfPipelineOptions(artifacts_path=self.artifact)
|
||||
pipeline_options.do_ocr = False
|
||||
15
docutranslate/converter/x2md/converter_identity.py
Normal file
15
docutranslate/converter/x2md/converter_identity.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
|
||||
|
||||
class ConverterIdentity(X2MarkdownConverter):
    """Pass-through converter: re-wraps a document's bytes as a ``MarkdownDocument``."""

    def convert(self, document: Document) -> MarkdownDocument:
        """Return a ``.md`` ``MarkdownDocument`` built from the input's content and stem."""
        payload = document.content
        stem = document.stem
        return MarkdownDocument.from_bytes(suffix=".md", content=payload, stem=stem)

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Coroutine form of ``convert``; builds the same result without awaiting anything."""
        payload = document.content
        stem = document.stem
        return MarkdownDocument.from_bytes(suffix=".md", content=payload, stem=stem)

    def support_format(self) -> list[str]:
        """Return the accepted suffixes: markdown only."""
        accepted = [".md"]
        return accepted
|
||||
@@ -1,14 +1,26 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
import zipfile
|
||||
from dataclasses import dataclass
|
||||
from logging import Logger
|
||||
|
||||
import httpx
|
||||
from docutranslate.converter import Converter, Document
|
||||
|
||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
from docutranslate.logger import global_logger
|
||||
from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
|
||||
|
||||
URL = 'https://mineru.net/api/v4/file-urls/batch'
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ConverterMineruConfig:
    """Frozen (immutable, hashable) settings object passed to ``ConverterMineru``."""
    # API token; the converter strips surrounding whitespace before storing it.
    mineru_token: str
    # Copied onto the converter as ``formula``; presumably enables formula recognition - TODO confirm.
    formula: bool = True
|
||||
|
||||
|
||||
timeout = httpx.Timeout(
|
||||
connect=5.0, # 连接超时 (建立连接的最长时间)
|
||||
read=200.0, # 读取超时 (等待服务器响应的最长时间)
|
||||
@@ -16,16 +28,16 @@ timeout = httpx.Timeout(
|
||||
pool=1.0 # 从连接池获取连接的超时时间
|
||||
)
|
||||
|
||||
client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
||||
client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
||||
|
||||
client = httpx.Client(trust_env=False,timeout=timeout,proxy=None,verify=False)
|
||||
client_async=httpx.AsyncClient(trust_env=False,timeout=timeout,proxy=None,verify=False)
|
||||
|
||||
# TODO: 提供更详细的logger
|
||||
class ConverterMineru(Converter):
|
||||
def __init__(self, token: str, formula=True,logger:logging.Logger|None=None):
|
||||
self.mineru_token = token.strip()
|
||||
self.formula = formula
|
||||
self.logger=logger if logger else global_logger
|
||||
class ConverterMineru(X2MarkdownConverter):
|
||||
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
|
||||
self.config = config
|
||||
self.mineru_token = config.mineru_token.strip()
|
||||
self.formula = config.formula
|
||||
self.logger = logger
|
||||
|
||||
def _get_header(self):
|
||||
return {
|
||||
@@ -39,7 +51,7 @@ class ConverterMineru(Converter):
|
||||
"language": "auto",
|
||||
"enable_table": True,
|
||||
"files": [
|
||||
{"name": f"{document.filename}", "is_ocr": True}
|
||||
{"name": f"{document.name}", "is_ocr": True}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -54,7 +66,7 @@ class ConverterMineru(Converter):
|
||||
urls = result["data"]["file_urls"]
|
||||
# print('batch_id:{},urls:{}'.format(batch_id, urls))
|
||||
# 获取
|
||||
res_upload = client.put(urls[0], content=document.filebytes)
|
||||
res_upload = client.put(urls[0], content=document.content)
|
||||
res_upload.raise_for_status()
|
||||
# print(f"{urls[0]} upload success")
|
||||
return batch_id
|
||||
@@ -72,7 +84,7 @@ class ConverterMineru(Converter):
|
||||
urls = result["data"]["file_urls"]
|
||||
# print('batch_id:{},urls:{}'.format(batch_id, urls))
|
||||
# 获取
|
||||
res_upload = await client_async.put(urls[0], content=document.filebytes)
|
||||
res_upload = await client_async.put(urls[0], content=document.content)
|
||||
res_upload.raise_for_status()
|
||||
# print(f"{urls[0]} upload success")
|
||||
return batch_id
|
||||
@@ -87,8 +99,8 @@ class ConverterMineru(Converter):
|
||||
res.raise_for_status()
|
||||
fileinfo = res.json()["data"]["extract_result"][0]
|
||||
if fileinfo["state"] == "done":
|
||||
fileurl = fileinfo["full_zip_url"]
|
||||
return fileurl
|
||||
file_url = fileinfo["full_zip_url"]
|
||||
return file_url
|
||||
else:
|
||||
time.sleep(3)
|
||||
|
||||
@@ -100,36 +112,40 @@ class ConverterMineru(Converter):
|
||||
res.raise_for_status()
|
||||
fileinfo = res.json()["data"]["extract_result"][0]
|
||||
if fileinfo["state"] == "done":
|
||||
fileurl = fileinfo["full_zip_url"]
|
||||
return fileurl
|
||||
file_url = fileinfo["full_zip_url"]
|
||||
return file_url
|
||||
else:
|
||||
await asyncio.sleep(3)
|
||||
|
||||
def convert(self, document: Document) -> str:
|
||||
def convert(self, document: Document) -> MarkdownDocument:
|
||||
self.logger.info(f"正在将文档转换为markdown")
|
||||
time1 = time.time()
|
||||
batch_id = self.upload(document)
|
||||
file_url = self.get_file_url(batch_id)
|
||||
result = get_md_from_zip_url_with_inline_images(zip_url=file_url)
|
||||
content = get_md_from_zip_url_with_inline_images(zip_url=file_url)
|
||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||
return result
|
||||
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
||||
return md_document
|
||||
|
||||
async def convert_async(self, document: Document) -> str:
|
||||
# 待优化
|
||||
async def convert_async(self, document: Document) -> MarkdownDocument:
|
||||
self.logger.info(f"正在将文档转换为markdown")
|
||||
time1 = time.time()
|
||||
batch_id = await self.upload_async(document)
|
||||
file_url = await self.get_file_url_async(batch_id)
|
||||
result = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
|
||||
content = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
|
||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||
return result
|
||||
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
||||
return md_document
|
||||
|
||||
def support_format(self) -> list[str]:
    """Return the file suffixes this converter accepts."""
    document_suffixes = [".pdf", ".doc", ".docx", ".ppt", ".pptx"]
    image_suffixes = [".png", ".jpg", ".jpeg"]
    return document_suffixes + image_suffixes
|
||||
|
||||
|
||||
def get_md_from_zip_url_with_inline_images(
|
||||
zip_url: str,
|
||||
filename_in_zip: str = "full.md",
|
||||
encoding: str = "utf-8"
|
||||
) -> str | None:
|
||||
) -> str:
|
||||
"""
|
||||
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
||||
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
||||
@@ -152,7 +168,8 @@ def get_md_from_zip_url_with_inline_images(
|
||||
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
||||
raise Exception(
|
||||
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
||||
except httpx.RequestError as e:
|
||||
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
|
||||
except zipfile.BadZipFile:
|
||||
@@ -169,7 +186,7 @@ async def get_md_from_zip_url_with_inline_images_async(
|
||||
zip_url: str,
|
||||
filename_in_zip: str = "full.md",
|
||||
encoding: str = "utf-8"
|
||||
) -> str | None:
|
||||
) -> str:
|
||||
"""
|
||||
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
||||
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
||||
@@ -181,18 +198,20 @@ async def get_md_from_zip_url_with_inline_images_async(
|
||||
encoding (str): 目标文件的预期编码。默认为 "utf-8"。
|
||||
|
||||
Returns:
|
||||
str | None: 如果成功,返回处理后的Markdown文本内容;否则返回 None。
|
||||
str : 如果成功,返回处理后的Markdown文本内容。
|
||||
"""
|
||||
try:
|
||||
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
|
||||
response = await client_async.get(zip_url) # 增加超时
|
||||
response.raise_for_status()
|
||||
print("ZIP文件下载完成。")
|
||||
return await asyncio.to_thread(embed_inline_image_from_zip,response.content, filename_in_zip=filename_in_zip, encoding=encoding)
|
||||
return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip,
|
||||
encoding=encoding)
|
||||
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
||||
raise Exception(
|
||||
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
||||
except httpx.RequestError as e:
|
||||
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
|
||||
except zipfile.BadZipFile:
|
||||
@@ -204,5 +223,6 @@ async def get_md_from_zip_url_with_inline_images_async(
|
||||
traceback.print_exc() # 打印完整的堆栈跟踪,便于调试
|
||||
raise Exception(f"发生未知错误: {e}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pass
|
||||
22
docutranslate/converter/x2md/interfaces.py
Normal file
22
docutranslate/converter/x2md/interfaces.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from typing import runtime_checkable
|
||||
|
||||
from typing import Protocol
|
||||
from docutranslate.converter.interfaces import Converter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
|
||||
|
||||
|
||||
@runtime_checkable
class X2MarkdownConverter(Converter,Protocol):
    """
    Responsible for converting files of other formats into markdown.
    """
    def convert(self, document: Document) -> MarkdownDocument:
        """Convert ``document`` and return the result as a ``MarkdownDocument``."""
        ...

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Coroutine counterpart of ``convert`` with the same parameter and return type."""
        ...

    def support_format(self)->list[str]:
        """Return the list of format strings (file suffixes in the visible implementations) the converter accepts."""
        ...
|
||||
0
docutranslate/document_context/__init__.py
Normal file
0
docutranslate/document_context/__init__.py
Normal file
15
docutranslate/document_context/md_mask_context.py
Normal file
15
docutranslate/document_context/md_mask_context.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
from docutranslate.utils.markdown_utils import MaskDict, uris2placeholder, placeholder2uris
|
||||
|
||||
|
||||
class MDMaskUrisContext:
    """Context manager that rewrites a markdown document's content in place.

    On entry the decoded content is passed through ``uris2placeholder``; on
    exit it is passed through ``placeholder2uris``. Both calls share one
    ``MaskDict``.
    """

    def __init__(self, document: MarkdownDocument):
        self.mask_dict = MaskDict()
        self.document = document

    def __enter__(self):
        original_text = self.document.content.decode()
        masked_text = uris2placeholder(original_text, self.mask_dict)
        self.document.content = masked_text.encode()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Runs whether or not the body raised; returns None, so exceptions propagate.
        current_text = self.document.content.decode()
        restored_text = placeholder2uris(current_text, self.mask_dict)
        self.document.content = restored_text.encode()
|
||||
3
docutranslate/exporter/__init__.py
Normal file
3
docutranslate/exporter/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
这个包用于将Document导出为其它格式
|
||||
"""
|
||||
8
docutranslate/exporter/export_config.py
Normal file
8
docutranslate/exporter/export_config.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
class ExportConfig:
    """Base class for exporter configuration objects; declares no fields of its own."""
|
||||
|
||||
|
||||
|
||||
16
docutranslate/exporter/interfaces.py
Normal file
16
docutranslate/exporter/interfaces.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from typing import Protocol, runtime_checkable, TypeVar, Any, Self
|
||||
|
||||
from docutranslate.exporter.export_config import ExportConfig
|
||||
from docutranslate.ir.document import Document
|
||||
|
||||
D_in = TypeVar('D_in', bound=Document)
|
||||
|
||||
|
||||
@runtime_checkable
class Exporter(Protocol[D_in]):
    """Structural interface for exporters, generic in the document type ``D_in`` they accept."""

    @classmethod
    def from_config(cls, export_config: ExportConfig | None = None) -> Self:
        """Build an exporter from an optional ``ExportConfig``."""
        ...

    def export(self, document: D_in) -> Any:
        """Export ``document``; the result type is left open (``Any``)."""
        ...
|
||||
0
docutranslate/exporter/md2x/__init__.py
Normal file
0
docutranslate/exporter/md2x/__init__.py
Normal file
12
docutranslate/exporter/md2x/interfaces.py
Normal file
12
docutranslate/exporter/md2x/interfaces.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from typing import Self
|
||||
|
||||
from docutranslate.exporter.export_config import ExportConfig
|
||||
from docutranslate.exporter.interfaces import Exporter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
|
||||
|
||||
class MDExporter(Exporter):
    """Exporter whose ``export`` takes a ``MarkdownDocument`` and returns a ``Document``."""

    def export(self,document:MarkdownDocument)->Document:
        """Export a markdown document; this body is only a stub."""
        ...
|
||||
73
docutranslate/exporter/md2x/md2html_exporter.py
Normal file
73
docutranslate/exporter/md2x/md2html_exporter.py
Normal file
@@ -0,0 +1,73 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
import jinja2
|
||||
import markdown2
|
||||
|
||||
from docutranslate.exporter.export_config import ExportConfig
|
||||
from docutranslate.exporter.md2x.interfaces import MDExporter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
from docutranslate.utils.resource_utils import resource_path
|
||||
|
||||
@dataclass
class MD2HTMLExportConfig(ExportConfig):
    """Settings for ``MD2HTMLExporter``."""
    # True: the exporter emits CDN <link>/<script> tags for pico and KaTeX.
    # False: it inlines the bundled static files instead.
    cdn: bool = True
|
||||
|
||||
class MD2HTMLExporter(MDExporter):
    """Renders a markdown document into an HTML ``Document`` using ``template/markdown.html``."""

    def __init__(self, export_config: MD2HTMLExportConfig | None = None):
        """Store the ``cdn`` flag, using a default config when none (or a falsy value) is passed."""
        export_config = export_config or MD2HTMLExportConfig()
        self.cdn=export_config.cdn

    def export(self, document: MarkdownDocument) -> Document:
        """Convert the markdown content to HTML and render it into the page template.

        With ``cdn`` set, pico and KaTeX assets are referenced through CDN tags;
        otherwise the bundled static files are read and inlined. The mermaid
        script is inlined in both cases.

        Returns:
            A ``Document`` with suffix ``.html`` and the input document's stem.
        """
        cdn = self.cdn
        markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"])
        # language=html
        pico = f'<style>{resource_path("static/pico.css").read_text(encoding="utf-8")}</style>' if not cdn else r'<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/picocss/2.1.1/pico.min.css" integrity="sha512-+4kjFgVD0n6H3xt19Ox84B56MoS7srFn60tgdWFuO4hemtjhySKyW4LnftYZn46k3THUEiTTsbVjrHai+0MOFw==" crossorigin="anonymous" referrerpolicy="no-referrer" />'
        html_template = resource_path("template/markdown.html").read_text(encoding="utf-8")
        katex_css = f'<style>{resource_path("static/katex.css").read_text(encoding="utf-8")}</style>' if not cdn else r"""<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/katex.min.css" integrity="sha512-fHwaWebuwA7NSF5Qg/af4UeDx9XqUpYpOGgubo3yWu+b2IQR4UeQwbb42Ti7gVAjNtVoI/I9TEoYeu9omwcC6g==" crossorigin="anonymous" referrerpolicy="no-referrer" />"""
        katex_js = f'<script>{resource_path("static/katex.js").read_text(encoding="utf-8")}</script>' if not cdn else r"""<script defer src="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/katex.min.js" integrity="sha512-LQNxIMR5rXv7o+b1l8+N1EZMfhG7iFZ9HhnbJkTp4zjNr5Wvst75AqUeFDxeRUa7l5vEDyUiAip//r+EFLLCyA==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>"""
        auto_render = f'<script>{resource_path("static/autoRender.js").read_text(encoding="utf-8")}</script>' if not cdn else r"""<script defer src="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js" integrity="sha512-iWiuBS5nt6r60fCz26Nd0Zqe0nbk1ZTIQbl3Kv7kYsX+yKMUFHzjaH2+AnM6vp2Xs+gNmaBAVWJjSmuPw76Efg==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>"""
        # The two script variants below differ only in the extra "fonts: false"
        # option used when assets are inlined (cdn is False).
        # language=javascript
        render_math_in_element = r"""
<script>
document.addEventListener("DOMContentLoaded", function () {
renderMathInElement(document.body, {
delimiters: [
{left: '$$', right: '$$', display: true},
{left: '\\[', right: '\\]', display: true},
{left: '$', right: '$', display: false},
{left: '\\(', right: '\\)', display: false}
],
throwOnError: false
})
});
</script>""" if cdn else r"""
<script>
document.addEventListener("DOMContentLoaded", function
() {
renderMathInElement(document.body, {
delimiters: [
{left: '$$', right: '$$', display: true},
{left: '\\[', right: '\\]', display: true},
{left: '$', right: '$', display: false},
{left: '\\(', right: '\\)', display: false}
],
fonts: false,
throwOnError: false
})
});
</script>"""
        mermaid = f'<script>{resource_path("static/mermaid.js").read_text(encoding="utf-8")}</script>'
        # Every backslash is doubled before conversion; presumably so LaTeX
        # backslashes survive markdown escaping - TODO confirm.
        content = markdowner.convert(document.content.decode().replace("\\", "\\\\"))
        # TODO: support a locally bundled MathJax
        render = jinja2.Template(html_template).render(
            title=document.stem,
            pico=pico,
            katexCss=katex_css,
            katexJs=katex_js,
            autoRender=auto_render,
            markdown=content,
            renderMathInElement=render_math_in_element,
            mermaid=mermaid,
        )
        return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem)
|
||||
26
docutranslate/exporter/md2x/md2md_exporter.py
Normal file
26
docutranslate/exporter/md2x/md2md_exporter.py
Normal file
@@ -0,0 +1,26 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import runtime_checkable
|
||||
|
||||
from docutranslate.exporter.export_config import ExportConfig
|
||||
from docutranslate.exporter.md2x.interfaces import MDExporter
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument,Document
|
||||
from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip
|
||||
|
||||
|
||||
@dataclass
class MD2MDExportConfig(ExportConfig):
    """Settings for ``MD2MDExporter``."""
    # True: export the markdown content unchanged as a .md document.
    # False: export a .zip built by unembed_base64_images_to_zip.
    embed_images: bool = True
|
||||
|
||||
|
||||
class MD2MDExporter(MDExporter):
    """Exports a markdown document either as-is (``.md``) or as a ``.zip`` package."""

    def __init__(self, export_config: MD2MDExportConfig | None=None):
        """Store the ``embed_images`` flag, using a default config when none is passed."""
        if not export_config:
            export_config = MD2MDExportConfig()
        self.embed_images = export_config.embed_images

    def export(self,document:MarkdownDocument)->Document:
        """Return a ``Document`` with the input's stem.

        With ``embed_images`` set the content is passed through unchanged;
        otherwise the result of ``unembed_base64_images_to_zip`` becomes the
        content of a ``.zip`` document.
        """
        if not self.embed_images:
            archive = unembed_base64_images_to_zip(document.content.decode(), markdown_name=document.name)
            return Document.from_bytes(suffix=".zip", content=archive, stem=document.stem)
        return Document.from_bytes(suffix=".md", content=document.content, stem=document.stem)
|
||||
|
||||
|
||||
4
docutranslate/exporter/md2x/types.py
Normal file
4
docutranslate/exporter/md2x/types.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
|
||||
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
|
||||
|
||||
x2md_convert_config_type=ConverterDoclingConfig | ConverterMineruConfig
|
||||
0
docutranslate/exporter/txt2x/__init__.py
Normal file
0
docutranslate/exporter/txt2x/__init__.py
Normal file
8
docutranslate/exporter/txt2x/interfaces.py
Normal file
8
docutranslate/exporter/txt2x/interfaces.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from docutranslate.exporter.interfaces import Exporter
|
||||
from docutranslate.ir.document import Document
|
||||
|
||||
# TODO: decide whether TXT needs its own dedicated document type
class TXTExporter(Exporter):
    """Exporter whose ``export`` takes a ``Document`` and returns a ``Document``."""

    def export(self,document:Document)->Document:
        """Export a document; this body is only a stub."""
        ...
|
||||
33
docutranslate/exporter/txt2x/txt2html_exporter.py
Normal file
33
docutranslate/exporter/txt2x/txt2html_exporter.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
import jinja2
|
||||
|
||||
from docutranslate.exporter.export_config import ExportConfig
|
||||
from docutranslate.exporter.txt2x.interfaces import TXTExporter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
from docutranslate.utils.resource_utils import resource_path
|
||||
|
||||
|
||||
@dataclass
class TXT2HTMLExportConfig(ExportConfig):
    """Options controlling how a plain-text document is exported as HTML."""

    # True: link the Pico stylesheet from a CDN.
    # False: inline the bundled ``static/pico.css`` into the page.
    cdn: bool = True
|
||||
|
||||
|
||||
class TXT2HTMLExporter(TXTExporter):
    """Render a plain-text document into a standalone HTML page."""

    def __init__(self, export_config: TXT2HTMLExportConfig | None = None):
        """Store the export options, using default options when none are given."""
        export_config = export_config or TXT2HTMLExportConfig()
        self.cdn = export_config.cdn

    def export(self, document: Document) -> Document:
        """Return an ``.html`` document built from ``template/txt.html``.

        The template exposes three placeholders: ``title`` (the document
        stem), ``pico`` (the stylesheet markup) and ``body``. The body was
        previously never supplied, so the rendered page contained no text;
        it is now filled with the document's lines, HTML-escaped and wrapped
        in ``<p>`` elements (blank lines are skipped).
        """
        import html  # local import: only this method needs it

        html_template = resource_path("template/txt.html").read_text(encoding="utf-8")

        # language=html
        if self.cdn:
            pico = r'<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/picocss/2.1.1/pico.min.css" integrity="sha512-+4kjFgVD0n6H3xt19Ox84B56MoS7srFn60tgdWFuO4hemtjhySKyW4LnftYZn46k3THUEiTTsbVjrHai+0MOFw==" crossorigin="anonymous" referrerpolicy="no-referrer" />'
        else:
            pico = f'<style>{resource_path("static/pico.css").read_text(encoding="utf-8")}</style>'

        # The template is rendered without autoescaping (``pico`` is raw markup),
        # so the document text must be escaped here before it is inserted.
        text = document.content.decode("utf-8")
        body = "\n".join(
            f"<p>{html.escape(line)}</p>" for line in text.splitlines() if line.strip()
        )

        render = jinja2.Template(html_template).render(
            title=document.stem,
            pico=pico,
            body=body,
        )
        return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem)
|
||||
0
docutranslate/ir/__init__.py
Normal file
0
docutranslate/ir/__init__.py
Normal file
24
docutranslate/ir/document.py
Normal file
24
docutranslate/ir/document.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import dataclasses
|
||||
from pathlib import Path
|
||||
|
||||
class Document:
|
||||
def __init__(self,suffix:str,content:bytes,stem:str|None=None,path:Path=None):
|
||||
self.suffix=suffix
|
||||
self.content=content
|
||||
self.stem=stem
|
||||
self.path=path
|
||||
@property
|
||||
def name(self)->str|None:
|
||||
if not self.stem:
|
||||
return None
|
||||
return self.stem+self.suffix
|
||||
|
||||
@classmethod
|
||||
def from_path(cls,path:Path|str):
|
||||
if isinstance(path,str):
|
||||
path=Path(path)
|
||||
return cls(suffix=path.suffix,content=path.read_bytes(),stem=path.stem,path=path)
|
||||
|
||||
@classmethod
|
||||
def from_bytes(cls,content:bytes,suffix:str,stem:str|None):
|
||||
return cls(content=content,suffix=suffix,stem=stem)
|
||||
7
docutranslate/ir/markdown_document.py
Normal file
7
docutranslate/ir/markdown_document.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from docutranslate.ir.document import Document
|
||||
|
||||
|
||||
class MarkdownDocument(Document):
    """A ``Document`` whose suffix is always ``.md``."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``Document`` and then force the suffix to ``.md``."""
        # Initialise the base document first; whatever suffix was passed is overridden below.
        super().__init__(*args, **kwargs)
        self.suffix = ".md"
|
||||
0
docutranslate/manager/__init__.py
Normal file
0
docutranslate/manager/__init__.py
Normal file
51
docutranslate/manager/base_manager.py
Normal file
51
docutranslate/manager/base_manager.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from logging import Logger
|
||||
from pathlib import Path
|
||||
from typing import Self, Generic, TypeVar
|
||||
|
||||
from docutranslate.exporter.interfaces import Exporter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.logger import global_logger
|
||||
|
||||
T_Translated = TypeVar('T_Translated', bound=Document)


class BaseManager(ABC, Generic[T_Translated]):
    """Common workflow shared by managers: read a document, translate it, export it.

    Subclasses implement ``translate`` / ``translate_async`` (which must set
    ``document_translated``) and ``support_export_format``.
    """

    def __init__(self, logger: Logger = global_logger):
        """Create a manager with no document loaded yet."""
        self.logger = logger
        self.document_original: Document | None = None
        self.document_translated: T_Translated | None = None

    def read_path(self, path: Path | str):
        """Load the original document from a file on disk."""
        self.document_original = Document.from_path(path)

    def read_bytes(self, content: bytes, stem: str, suffix: str):
        """Load the original document from bytes already in memory."""
        self.document_original = Document.from_bytes(content=content, stem=stem, suffix=suffix)

    @abstractmethod
    def translate(self, *args, **kwargs) -> Self:
        """Translate the loaded document and store it in ``document_translated``."""
        ...

    @abstractmethod
    async def translate_async(self, *args, **kwargs) -> Self:
        """Asynchronous counterpart of ``translate``."""
        ...

    def _export(self, exporter: Exporter) -> Document:
        """Run ``exporter`` on the translated document.

        Raises:
            RuntimeError: if no translation has been produced yet.
        """
        if self.document_translated is None:
            raise RuntimeError("Document has not been translated yet. Call translate() first.")
        return exporter.export(self.document_translated)

    def _save(self, exporter: Exporter, name: str | None = None, out_put_dir: Path | str = "./output"):
        """Export the translated document and write it under ``out_put_dir``.

        Args:
            exporter: exporter used to produce the output document.
            name: output file name; defaults to the exported document's name.
            out_put_dir: target directory, created if it does not exist.

        Returns:
            The manager itself, so calls can be chained.

        Raises:
            RuntimeError: if no translation has been produced yet.
            ValueError: if ``name`` is not given and the exported document has
                no name (it was created without a stem).
        """
        docu = self._export(exporter)
        name = name or docu.name
        if not name:
            # Without this check Path(None) would fail with an opaque TypeError.
            raise ValueError("No output file name: pass `name` or load the document with a stem.")
        output_path = Path(out_put_dir) / Path(name)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(docu.content)
        self.logger.info(f"文件已保存到{output_path.resolve()}")
        return self

    @abstractmethod
    def support_export_format(self) -> list[str]:
        """Return the file suffixes this manager can export to."""
        ...
|
||||
34
docutranslate/manager/interfaces.py
Normal file
34
docutranslate/manager/interfaces.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from pathlib import Path
|
||||
from typing import Protocol, runtime_checkable, Self, TypeVar
|
||||
|
||||
from docutranslate.exporter.export_config import ExportConfig
|
||||
|
||||
T = TypeVar("T", bound=ExportConfig)


@runtime_checkable
class HTMLExportable(Protocol):
    """Structural interface for objects that can export their result as HTML."""

    def export_to_html(self, export_config: T) -> str:
        """Return the exported HTML as a string."""

    def save_as_html(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
        """Write the exported HTML as ``name`` under ``out_put_dir`` and return ``self``."""
|
||||
|
||||
def save_as_html(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
|
||||
...
|
||||
|
||||
|
||||
@runtime_checkable
class MDExportable(Protocol):
    """Structural interface for objects that can export their result as Markdown."""

    def export_to_markdown(self, export_config: T) -> str:
        """Return the exported Markdown as a string."""

    def save_as_markdown(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
        """Write the exported Markdown as ``name`` under ``out_put_dir`` and return ``self``."""
|
||||
|
||||
|
||||
@runtime_checkable
class TXTExportable(Protocol):
    """Structural interface for objects that can export their result as plain text."""

    def export_to_txt(self, export_config: T) -> str:
        """Return the exported text as a string."""

    def save_as_txt(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
        """Write the exported text as ``name`` under ``out_put_dir`` and return ``self``."""
|
||||
102
docutranslate/manager/md_based_manager.py
Normal file
102
docutranslate/manager/md_based_manager.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Self, Literal, overload
|
||||
|
||||
from docutranslate.cacher import md_based_convert_cacher
|
||||
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
|
||||
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
|
||||
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
|
||||
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
|
||||
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
|
||||
from docutranslate.exporter.md2x.types import x2md_convert_config_type
|
||||
from docutranslate.manager.base_manager import BaseManager
|
||||
from docutranslate.manager.interfaces import HTMLExportable, MDExportable
|
||||
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
|
||||
|
||||
|
||||
class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
    """Manager for documents that are converted to Markdown before translation.

    Workflow: read a file, convert it to Markdown with the chosen engine
    (results go through ``md_based_convert_cacher``), translate the Markdown,
    then export it as Markdown or HTML.
    """

    def support_export_format(self) -> list[str]:
        """Return the suffixes this manager can export to."""
        return [".md", ".html", ".zip"]

    @staticmethod
    def _working_copy(document):
        """Return a shallow copy of ``document`` for in-place translation.

        The translator rewrites ``document.content``. Translating the object
        that was just stored in (or fetched from) the conversion cache would
        risk leaving translated text in the cache, so a copy is translated
        instead.
        NOTE(review): whether the cacher already hands out copies is not
        visible here; the extra shallow copy is harmless if it does.
        """
        from copy import copy  # local import: keeps this class self-contained

        return copy(document)

    def _get_document_md(self, convert_engin, convert_config):
        """Return the Markdown form of the original document, using the cache when possible.

        Args:
            convert_engin: ``None`` (identity conversion), ``"mineru"`` or ``"docling"``.
            convert_config: configuration matching the chosen engine.

        Raises:
            RuntimeError: if no document has been read, or if ``convert_config``
                is not of the type required by the engine.
            ValueError: if ``convert_engin`` names an unknown engine.
        """
        if self.document_original is None:
            raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")

        # Try the cached conversion result first.
        document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
                                                                    convert_config)
        if document_cached:
            return document_cached

        # No cache hit: pick the converter for the requested engine.
        if convert_engin is None:
            converter = ConverterIdentity()
        elif convert_engin == "mineru":
            if not isinstance(convert_config, ConverterMineruConfig):
                raise RuntimeError(f"未传入正确的convert_config,应传入{ConverterMineruConfig}")
            converter = ConverterMineru(convert_config, logger=self.logger)
        elif convert_engin == "docling":
            if not isinstance(convert_config, ConverterDoclingConfig):
                raise RuntimeError(f"未传入正确的convert_config,应传入{ConverterDoclingConfig}")
            converter = ConverterDocling(convert_config, logger=self.logger)
        else:
            raise ValueError(f"不存在{convert_engin}解析引擎")

        document_md = converter.convert(self.document_original)
        # Store the freshly converted document in the cache.
        md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
        return document_md

    @overload
    def translate(self, convert_engin: None,
                  convert_config: None, translate_config: MDTranslateConfig) -> Self:
        ...

    @overload
    def translate(self, convert_engin: Literal["docling"],
                  convert_config: ConverterDoclingConfig, translate_config: MDTranslateConfig) -> Self:
        ...

    @overload
    def translate(self, convert_engin: Literal["mineru"],
                  convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self:
        ...

    def translate(self, convert_engin: Literal["mineru", "docling"] | None,
                  convert_config: x2md_convert_config_type | None,
                  translate_config: MDTranslateConfig) -> Self:
        """Convert the original document to Markdown and translate it.

        The translated document is stored in ``document_translated``.

        Returns:
            The manager itself, so calls can be chained.
        """
        document_md = self._working_copy(self._get_document_md(convert_engin, convert_config))
        # Translate the converted document (in place, on the working copy).
        translator = MDTranslator(translate_config)
        translator.translate(document_md)
        self.document_translated = document_md
        return self

    async def translate_async(self, convert_engin: Literal["mineru", "docling"] | None,
                              convert_config: x2md_convert_config_type | None,
                              translate_config: MDTranslateConfig) -> Self:
        """Asynchronous counterpart of ``translate``.

        The conversion step runs in a worker thread via ``asyncio.to_thread``.
        """
        converted = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
        document_md = self._working_copy(converted)
        # Translate the converted document (in place, on the working copy).
        translator = MDTranslator(translate_config)
        await translator.translate_async(document_md)
        self.document_translated = document_md
        return self

    def export_to_html(self, export_config: MD2HTMLExportConfig | None = None) -> str:
        """Return the translated document rendered as an HTML string."""
        docu = self._export(MD2HTMLExporter(export_config))
        return docu.content.decode()

    def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str:
        """Return the decoded bytes produced by the Markdown exporter."""
        docu = self._export(MD2MDExporter(export_config))
        return docu.content.decode()

    def save_as_html(self, name: str | None = None, out_put_dir: Path | str = "./output",
                     export_config: MD2HTMLExportConfig | None = None) -> Self:
        """Export to HTML and write the file under ``out_put_dir``; returns ``self``."""
        self._save(exporter=MD2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir)
        return self

    def save_as_markdown(self, name: str | None = None, out_put_dir: Path | str = "./output",
                         export_config: MD2MDExportConfig | None = None) -> Self:
        """Export with the Markdown exporter and write the file under ``out_put_dir``; returns ``self``."""
        self._save(exporter=MD2MDExporter(export_config), name=name, out_put_dir=out_put_dir)
        return self
|
||||
66
docutranslate/manager/txt_manager.py
Normal file
66
docutranslate/manager/txt_manager.py
Normal file
@@ -0,0 +1,66 @@
|
||||
from copy import copy
|
||||
from dataclasses import dataclass
|
||||
from logging import Logger
|
||||
from pathlib import Path
|
||||
from typing import Self
|
||||
|
||||
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
|
||||
from docutranslate.manager.base_manager import BaseManager
|
||||
from docutranslate.manager.interfaces import HTMLExportable
|
||||
from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
|
||||
|
||||
|
||||
@dataclass
|
||||
class TXTManagerConfig:
|
||||
chunk_size: int = 3000
|
||||
base_url: str | None = None
|
||||
api_key = None,
|
||||
model_id: str | None = None
|
||||
temperature = 0.7
|
||||
concurrent: int = 30
|
||||
timeout = 2000
|
||||
cache = True
|
||||
logger: Logger | None = None
|
||||
|
||||
|
||||
class TXTManager(BaseManager, HTMLExportable):
    """Manager that translates a plain-text document and exports it as text or HTML."""

    def support_export_format(self) -> list[str]:
        """Return the suffixes this manager can export to."""
        return [".txt", ".html"]

    def _copy_original(self):
        """Return a shallow copy of the original document for in-place translation.

        Raises:
            RuntimeError: if no document has been read yet.
        """
        if self.document_original is None:
            # copy(None) would return None and fail later with an AttributeError.
            raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
        return copy(self.document_original)

    def _require_translated(self):
        """Return the translated document.

        Raises:
            RuntimeError: if no translation has been produced yet.
        """
        if self.document_translated is None:
            raise RuntimeError("Document has not been translated yet. Call translate() first.")
        return self.document_translated

    def translate(self, translate_config: TXTTranslateConfig) -> Self:
        """Translate a copy of the original document and store it in ``document_translated``."""
        document = self._copy_original()
        # The translator rewrites the document content in place.
        translator = TXTTranslator(translate_config)
        translator.translate(document)
        self.document_translated = document
        return self

    async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
        """Asynchronous counterpart of ``translate``."""
        document = self._copy_original()
        # The translator rewrites the document content in place.
        translator = TXTTranslator(translate_config)
        await translator.translate_async(document)
        self.document_translated = document
        return self

    def export_to_html(self, export_config: TXT2HTMLExportConfig | None = None) -> str:
        """Return the translated document rendered as an HTML string.

        ``export_config`` is optional; the exporter uses its default options
        when it is ``None``.
        """
        docu = self._export(TXT2HTMLExporter(export_config))
        return docu.content.decode()

    def export_to_txt(self) -> str:
        """Return the translated text.

        Raises:
            RuntimeError: if no translation has been produced yet.
        """
        return self._require_translated().content.decode()

    def save_as_html(self, name: str | None = None, out_put_dir: Path | str = "./output",
                     export_config: TXT2HTMLExportConfig | None = None) -> Self:
        """Export to HTML and write the file under ``out_put_dir``; returns ``self``."""
        self._save(exporter=TXT2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir)
        return self

    def save_as_txt(self, name: str | None = None, out_put_dir: Path | str = "./output", ) -> Self:
        """Write the translated text under ``out_put_dir``; returns ``self``.

        Args:
            name: output file name; defaults to the translated document's name.
            out_put_dir: target directory, created if it does not exist.

        Raises:
            RuntimeError: if no translation has been produced yet.
            ValueError: if ``name`` is not given and the document has no name.
        """
        document = self._require_translated()
        name = name or document.name
        if not name:
            raise ValueError("No output file name: pass `name` or load the document with a stem.")
        output_path = Path(out_put_dir) / Path(name)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(document.content)
        self.logger.info(f"文件已保存到{output_path.resolve()}")
        return self
|
||||
17
docutranslate/template/txt.html
Normal file
17
docutranslate/template/txt.html
Normal file
@@ -0,0 +1,17 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>{{ title }}</title>
|
||||
{{pico}}
|
||||
<style>
|
||||
html {
|
||||
padding: 2vh 10vw;
|
||||
font-size: 15px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
{{ body }}
|
||||
</body>
|
||||
</html>
|
||||
@@ -11,12 +11,12 @@ import markdown2
|
||||
|
||||
from docutranslate.agents import Agent, AgentArgs
|
||||
from docutranslate.agents import MDRefineAgent, MDTranslateAgent
|
||||
from docutranslate.cacher import document_cacher_global
|
||||
from docutranslate.converter import Document, ConverterMineru
|
||||
from docutranslate.cacher import md_based_convert_cacher
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.global_values import available_packages
|
||||
from docutranslate.logger import global_logger
|
||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \
|
||||
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2uris, MaskDict, clean_markdown_math_block, \
|
||||
unembed_base64_images_to_zip, embed_inline_image_from_zip, find_markdown_in_zip
|
||||
from docutranslate.utils.resource_utils import resource_path
|
||||
|
||||
@@ -62,7 +62,7 @@ class FileTranslater:
|
||||
self.timeout = timeout
|
||||
self.document: Document | None = None
|
||||
self.cache = cache
|
||||
self.cacher = document_cacher_global
|
||||
self.cacher = md_based_convert_cacher
|
||||
if file_path:
|
||||
self.read_file(file_path=file_path)
|
||||
|
||||
@@ -79,7 +79,7 @@ class FileTranslater:
|
||||
return self
|
||||
|
||||
def _unmask_uris_in_markdown(self):
|
||||
self.markdown = placeholder2_uris(self.markdown, self._mask_dict)
|
||||
self.markdown = placeholder2uris(self.markdown, self._mask_dict)
|
||||
return self
|
||||
|
||||
def _split_markdown_into_chunks(self) -> list[str]:
|
||||
|
||||
0
docutranslate/translater/__init__.py
Normal file
0
docutranslate/translater/__init__.py
Normal file
16
docutranslate/translater/base.py
Normal file
16
docutranslate/translater/base.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from dataclasses import dataclass
|
||||
from logging import Logger
|
||||
|
||||
|
||||
@dataclass
|
||||
class AiTranslateConfig:
|
||||
base_url: str
|
||||
api_key: str
|
||||
model_id: str
|
||||
to_lang: str
|
||||
custom_prompt: str | None = None
|
||||
temperature: float = 0.7
|
||||
timeout: int = 2000
|
||||
chunk_size: int = 3000
|
||||
concurrent: int = 30
|
||||
logger: Logger | None = None
|
||||
21
docutranslate/translater/interfaces.py
Normal file
21
docutranslate/translater/interfaces.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from typing import runtime_checkable, Protocol, TypeVar
|
||||
|
||||
from docutranslate.agents import Agent
|
||||
from docutranslate.ir.document import Document
|
||||
|
||||
T = TypeVar('T', bound=Document)
V = TypeVar('V', bound=Agent)


@runtime_checkable
class Translator(Protocol[T, V]):
    """
    Translate the intermediate text (replacing it in place); a Translator performs no format conversion.
    """

    def translate(self, document: T) -> Document:
        """Translate ``document`` synchronously."""

    async def translate_async(self, document: T) -> Document:
        """Translate ``document`` asynchronously."""

    def log(self, info: str):
        """Logging hook taking a message string; implementations decide where it goes."""
|
||||
70
docutranslate/translater/md_translator.py
Normal file
70
docutranslate/translater/md_translator.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from logging import Logger
|
||||
from typing import Self
|
||||
|
||||
from docutranslate.agents import MDTranslateAgent
|
||||
from docutranslate.document_context.md_mask_context import MDMaskUrisContext
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
from docutranslate.logger import global_logger
|
||||
from docutranslate.translater.base import AiTranslateConfig
|
||||
from docutranslate.translater.interfaces import Translator
|
||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||
from docutranslate.utils.markdown_utils import clean_markdown_math_block
|
||||
|
||||
|
||||
@dataclass
class MDTranslateConfig(AiTranslateConfig):
    """Translation settings for Markdown; adds no fields to ``AiTranslateConfig``."""
|
||||
|
||||
|
||||
|
||||
class MDTranslator(Translator):
    """Translate the content of a Markdown document in place using an AI agent."""

    def __init__(self, config: MDTranslateConfig):
        """Create the translation agent from ``config``.

        The global logger is used when ``config.logger`` is ``None``.
        """
        self.logger = config.logger or global_logger
        self.chunk_size = config.chunk_size
        self.translate_agent = MDTranslateAgent(custom_prompt=config.custom_prompt,
                                                to_lang=config.to_lang,
                                                baseurl=config.base_url,
                                                key=config.api_key,
                                                model_id=config.model_id,
                                                system_prompt=None,
                                                temperature=config.temperature,
                                                max_concurrent=config.concurrent,
                                                timeout=config.timeout,
                                                logger=self.logger)

    @staticmethod
    def _postprocess(translated_chunks: list[str]) -> bytes:
        """Join translated chunks and normalise the result into document bytes."""
        content = join_markdown_texts(translated_chunks)
        # Robustness fix-ups for model output. The original replaced r'\(' with
        # r'\(' (and the same for r'\)'), which is a no-op; collapsing a doubled
        # backslash before the math delimiter is the presumed intent.
        # NOTE(review): confirm against the pre-refactor implementation.
        content = content.replace(r'\\(', r'\(')
        content = content.replace(r'\\)', r'\)')
        content = clean_markdown_math_block(content)
        return content.encode()

    def _split(self, document: MarkdownDocument) -> list[str]:
        """Split the document text into chunks of at most ``chunk_size`` and log the count."""
        chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
        self.logger.info(f"markdown分为{len(chunks)}块")
        return chunks

    def translate(self, document: MarkdownDocument) -> Self:
        """Translate ``document`` in place and return this translator.

        The work happens inside ``MDMaskUrisContext`` so the agent sees the
        masked form of the document (presumably URIs replaced by placeholders
        and restored on exit — verify against ``MDMaskUrisContext``).
        """
        self.logger.info("正在翻译markdown")
        with MDMaskUrisContext(document):
            chunks = self._split(document)
            result: list[str] = self.translate_agent.send_prompts(chunks)
            document.content = self._postprocess(result)
        self.logger.info("翻译完成")
        return self

    async def translate_async(self, document: MarkdownDocument) -> Self:
        """Asynchronous counterpart of ``translate``.

        Post-processing of the translated chunks runs in a worker thread via
        ``asyncio.to_thread``.
        """
        self.logger.info("正在翻译markdown")
        with MDMaskUrisContext(document):
            chunks = self._split(document)
            result: list[str] = await self.translate_agent.send_prompts_async(chunks)
            document.content = await asyncio.to_thread(self._postprocess, result)
        self.logger.info("翻译完成")
        return self
|
||||
50
docutranslate/translater/txt_translator.py
Normal file
50
docutranslate/translater/txt_translator.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Self
|
||||
|
||||
from docutranslate.agents.txt_agent import TXTTranslateAgent
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.logger import global_logger
|
||||
from docutranslate.translater.base import AiTranslateConfig
|
||||
from docutranslate.translater.interfaces import Translator
|
||||
from docutranslate.utils.markdown_splitter import split_markdown_text
|
||||
|
||||
|
||||
@dataclass
class TXTTranslateConfig(AiTranslateConfig):
    """Translation settings for plain text; adds no fields to ``AiTranslateConfig``."""
|
||||
|
||||
|
||||
class TXTTranslator(Translator):
    """Translate the content of a plain-text document in place using an AI agent."""

    def __init__(self, config: TXTTranslateConfig):
        """Create the translation agent from ``config``.

        The global logger is used when ``config.logger`` is ``None``.
        """
        self.logger = config.logger or global_logger
        self.chunk_size = config.chunk_size
        agent_options = dict(
            custom_prompt=config.custom_prompt,
            to_lang=config.to_lang,
            baseurl=config.base_url,
            key=config.api_key,
            model_id=config.model_id,
            system_prompt=None,
            temperature=config.temperature,
            max_concurrent=config.concurrent,
            timeout=config.timeout,
            logger=self.logger,
        )
        self.translate_agent = TXTTranslateAgent(**agent_options)

    def translate(self, document: Document) -> Self:
        """Translate ``document`` in place and return this translator.

        The text is split into blocks of at most ``chunk_size``, each block is
        sent to the agent, and the results are joined with newlines.
        """
        self.logger.info("正在翻译txt")
        source_text = document.content.decode()
        pieces: list[str] = split_markdown_text(source_text, max_block_size=self.chunk_size)
        self.logger.info(f"txt分为{len(pieces)}块")
        translated: list[str] = self.translate_agent.send_prompts(pieces)
        document.content = "\n".join(translated).encode()
        self.logger.info("翻译完成")
        return self

    async def translate_async(self, document: Document) -> Self:
        """Asynchronous counterpart of ``translate``."""
        self.logger.info("正在翻译txt")
        source_text = document.content.decode()
        pieces: list[str] = split_markdown_text(source_text, max_block_size=self.chunk_size)
        self.logger.info(f"txt分为{len(pieces)}块")
        translated: list[str] = await self.translate_agent.send_prompts_async(pieces)
        document.content = "\n".join(translated).encode()
        self.logger.info("翻译完成")
        return self
|
||||
@@ -218,7 +218,7 @@ class MarkdownBlockSplitter:
|
||||
return result
|
||||
|
||||
|
||||
def split_markdown_text(markdown_text, max_block_size=5000):
|
||||
def split_markdown_text(markdown_text:str, max_block_size=5000):
|
||||
"""
|
||||
将Markdown字符串分割成不超过max_block_size的块
|
||||
可以通过简单拼接重建原始文本(分割的代码块除外)
|
||||
|
||||
@@ -69,7 +69,7 @@ def uris2placeholder(markdown: str, mask_dict: MaskDict):
|
||||
return markdown
|
||||
|
||||
|
||||
def placeholder2_uris(markdown: str, mask_dict: MaskDict):
|
||||
def placeholder2uris(markdown: str, mask_dict: MaskDict):
|
||||
def placeholder2uri(match: re.Match):
|
||||
id = match.group(1)
|
||||
uri = mask_dict.get(id)
|
||||
|
||||
Reference in New Issue
Block a user