重构代码,新增了MarkdownBasedManager和TXTManager实现
This commit is contained in:
@@ -2,4 +2,4 @@ __version__="0.3.4b1"
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
from .translater import FileTranslater
|
# from .translater import FileTranslater
|
||||||
@@ -17,7 +17,7 @@ class AgentArgs(TypedDict, total=False):
|
|||||||
baseurl: str
|
baseurl: str
|
||||||
key: str
|
key: str
|
||||||
model_id: str
|
model_id: str
|
||||||
system_prompt: str
|
system_prompt: str | None
|
||||||
temperature: float
|
temperature: float
|
||||||
max_concurrent: int
|
max_concurrent: int
|
||||||
timeout: int
|
timeout: int
|
||||||
@@ -42,7 +42,6 @@ class TotalErrorCounter:
|
|||||||
return self.count > MAX_TOTAL_ERROR_COUNT
|
return self.count > MAX_TOTAL_ERROR_COUNT
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# 仅使用多线程时用以计数
|
# 仅使用多线程时用以计数
|
||||||
class PromptsCounter:
|
class PromptsCounter:
|
||||||
def __init__(self, total: int, logger: logging.Logger):
|
def __init__(self, total: int, logger: logging.Logger):
|
||||||
@@ -62,14 +61,14 @@ TIMEOUT = 600
|
|||||||
|
|
||||||
|
|
||||||
class Agent:
|
class Agent:
|
||||||
def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7,
|
def __init__(self, baseurl: str, key: str | None, model_id: str, system_prompt: str | None = None, temperature=0.7,
|
||||||
max_concurrent=15, timeout: int = TIMEOUT, logger: logging.Logger | None = None):
|
max_concurrent=15, timeout: int = TIMEOUT, logger: logging.Logger | None = None):
|
||||||
self.baseurl = baseurl.strip()
|
self.baseurl = baseurl.strip()
|
||||||
if self.baseurl.endswith("/"):
|
if self.baseurl.endswith("/"):
|
||||||
self.baseurl = self.baseurl[:-1]
|
self.baseurl = self.baseurl[:-1]
|
||||||
self.key = key.strip()
|
self.key = key.strip() or "xx"
|
||||||
self.model_id = model_id.strip()
|
self.model_id = model_id.strip()
|
||||||
self.system_prompt = system_prompt
|
self.system_prompt = system_prompt or ""
|
||||||
self.temperature = temperature
|
self.temperature = temperature
|
||||||
self.client = httpx.Client(trust_env=False, proxy=None, verify=False)
|
self.client = httpx.Client(trust_env=False, proxy=None, verify=False)
|
||||||
self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False)
|
self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False)
|
||||||
@@ -78,6 +77,7 @@ class Agent:
|
|||||||
|
|
||||||
self.logger = logger if logger else global_logger
|
self.logger = logger if logger else global_logger
|
||||||
self.total_error_counter = TotalErrorCounter(logger=self.logger)
|
self.total_error_counter = TotalErrorCounter(logger=self.logger)
|
||||||
|
|
||||||
def _prepare_request_data(self, prompt: str, system_prompt: str, temperature=None, top_p=0.9):
|
def _prepare_request_data(self, prompt: str, system_prompt: str, temperature=None, top_p=0.9):
|
||||||
if temperature is None:
|
if temperature is None:
|
||||||
temperature = self.temperature
|
temperature = self.temperature
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
from typing import Unpack
|
from typing import Unpack, NotRequired
|
||||||
|
|
||||||
from .agent import (Agent, AgentArgs)
|
from .agent import Agent, AgentArgs
|
||||||
|
|
||||||
|
class MDTranslateAgentArgs(AgentArgs, total=True):
|
||||||
|
to_lang:str
|
||||||
|
custom_prompt:NotRequired[str]
|
||||||
|
|
||||||
class MDRefineAgent(Agent):
|
class MDRefineAgent(Agent):
|
||||||
def __init__(self, custom_prompt=None, **kwargs: Unpack[AgentArgs]):
|
def __init__(self, custom_prompt=None, **kwargs: Unpack[AgentArgs]):
|
||||||
|
|||||||
29
docutranslate/agents/txt_agent.py
Normal file
29
docutranslate/agents/txt_agent.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
from typing import NotRequired, Unpack
|
||||||
|
|
||||||
|
from docutranslate.agents import AgentArgs, Agent
|
||||||
|
|
||||||
|
|
||||||
|
class TXTTranslateAgentArgs(AgentArgs, total=True):
|
||||||
|
to_lang: str
|
||||||
|
custom_prompt: NotRequired[str]
|
||||||
|
|
||||||
|
|
||||||
|
class TXTTranslateAgent(Agent):
|
||||||
|
def __init__(self, custom_prompt=None, to_lang="中文", **kwargs: Unpack[AgentArgs]):
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
self.system_prompt = f"""
|
||||||
|
# 角色
|
||||||
|
你是一个专业的机器翻译引擎
|
||||||
|
# 工作
|
||||||
|
翻译输入的txt文本
|
||||||
|
目标语言{to_lang}
|
||||||
|
# 要求
|
||||||
|
翻译要求专业准确
|
||||||
|
不输出任何解释和注释
|
||||||
|
不能改变形如<ph-xxxxxx>的占位符
|
||||||
|
# 输出
|
||||||
|
翻译后的txt译文纯文本
|
||||||
|
"""
|
||||||
|
if custom_prompt:
|
||||||
|
self.system_prompt += "\n# 重要规则或背景【非常重要】\n" + custom_prompt + '\n'
|
||||||
|
self.system_prompt += r'\no_think'
|
||||||
@@ -1 +1 @@
|
|||||||
from .document_cacher import DocumentCacher, document_cacher_global
|
from .md_based_convert_cacher import MDBasedCovertCacher, md_based_convert_cacher
|
||||||
|
|||||||
@@ -1,30 +0,0 @@
|
|||||||
import os
|
|
||||||
from collections import OrderedDict
|
|
||||||
|
|
||||||
from docutranslate.converter import Document
|
|
||||||
|
|
||||||
CACHE_NUM=os.getenv("DOCUTRANSLATE_CACHE_NUM",default="10")
|
|
||||||
|
|
||||||
class DocumentCacher:
|
|
||||||
def __init__(self):
|
|
||||||
self.cache_dict = OrderedDict()
|
|
||||||
@staticmethod
|
|
||||||
def _get_hashcode(document: Document, formula: bool, code: bool, convert_engin: str) -> str:
|
|
||||||
obj = (document.suffix, document.filebytes, formula, code, convert_engin)
|
|
||||||
return str(hash(obj))
|
|
||||||
|
|
||||||
def get_cached_result(self, document: Document, formula: bool, code: bool, convert_engin: str)->str|None:
|
|
||||||
return self.cache_dict.get(self._get_hashcode(document, formula, code, convert_engin))
|
|
||||||
|
|
||||||
def cache_result(self, result: str, document: Document, formula: bool, code: bool, convert_engin: str):
|
|
||||||
hash_code = self._get_hashcode(document, formula, code, convert_engin)
|
|
||||||
if len(self.cache_dict)>=int(CACHE_NUM):
|
|
||||||
self.cache_dict.popitem(last=False)
|
|
||||||
self.cache_dict[hash_code] = result
|
|
||||||
return result
|
|
||||||
|
|
||||||
def clear(self):
|
|
||||||
self.cache_dict.clear()
|
|
||||||
|
|
||||||
|
|
||||||
document_cacher_global = DocumentCacher()
|
|
||||||
36
docutranslate/cacher/md_based_convert_cacher.py
Normal file
36
docutranslate/cacher/md_based_convert_cacher.py
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
import os
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
from docutranslate.exporter.md2x.types import x2md_convert_config_type
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
|
||||||
|
# Maximum number of conversion results kept in memory. Read once at import
# time from the DOCUTRANSLATE_CACHE_NUM environment variable (default "10").
CACHE_NUM = os.getenv("DOCUTRANSLATE_CACHE_NUM", default="10")


class MDBasedCovertCacher:
    """In-memory, insertion-ordered cache of document-to-markdown conversions.

    Entries are keyed by a hash of the source document's suffix and content
    together with the conversion engine name and its configuration. When the
    cache is full the oldest inserted entry is dropped first.
    """

    def __init__(self):
        # OrderedDict keeps insertion order so the oldest entry can be evicted.
        self.cache_dict = OrderedDict()

    @staticmethod
    def _get_hashcode(document: Document, convert_engin: str, convert_config: x2md_convert_config_type) -> str:
        """Build the cache key for one (document, engine, config) combination.

        ``convert_config`` must be hashable because it is part of the hashed
        tuple.
        """
        obj = (document.suffix, document.content, convert_engin, convert_config)
        return str(hash(obj))

    def get_cached_result(self, document: Document, convert_engin: str,
                          convert_config: x2md_convert_config_type) -> MarkdownDocument | None:
        """Return the cached conversion result, or ``None`` on a cache miss."""
        return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config))

    def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str,
                     convert_config: x2md_convert_config_type) -> MarkdownDocument:
        """Store ``convert_result`` and return it unchanged.

        The cache never holds more than ``int(CACHE_NUM)`` entries (at least
        one entry is always kept, even for a non-positive limit).

        Raises:
            ValueError: if ``CACHE_NUM`` is not an integer string.
        """
        hash_code = self._get_hashcode(document, convert_engin, convert_config)
        limit = int(CACHE_NUM)
        # Overwriting an existing key does not grow the cache, so only make
        # room when a genuinely new key is about to be inserted.
        if hash_code not in self.cache_dict:
            # ">=" (not ">") so the size after insertion stays within the limit;
            # the emptiness guard keeps popitem() from raising when limit <= 0.
            while self.cache_dict and len(self.cache_dict) >= limit:
                self.cache_dict.popitem(last=False)
        self.cache_dict[hash_code] = convert_result
        return convert_result

    def clear(self):
        """Drop every cached conversion result."""
        self.cache_dict.clear()
|
||||||
|
|
||||||
|
|
||||||
|
md_based_convert_cacher = MDBasedCovertCacher()
|
||||||
@@ -1,9 +1,3 @@
|
|||||||
from .converter import Document,Converter
|
"""
|
||||||
from .converter_mineru import ConverterMineru
|
这个包用来处理document之间的格式转换
|
||||||
|
"""
|
||||||
from docutranslate.global_values import conditional_import
|
|
||||||
if conditional_import("docling"):
|
|
||||||
from .converter_docling import ConverterDocling
|
|
||||||
|
|
||||||
# 打包docling时取消下面一行注释
|
|
||||||
# from .converter_docling import ConverterDocling
|
|
||||||
|
|||||||
@@ -1,27 +0,0 @@
|
|||||||
from typing import Protocol
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
class Document:
|
|
||||||
def __init__(self,path:Path|str=None,filename:str=None,filebytes:bytes=None):
|
|
||||||
if path is None and (filename is None or filebytes is None):
|
|
||||||
raise Exception("Document的路径或filename、filebytes不能同时为空")
|
|
||||||
self.filebytes = filebytes
|
|
||||||
self.filename = filename
|
|
||||||
self.path = path
|
|
||||||
if path:
|
|
||||||
if isinstance(path,str):
|
|
||||||
path=Path(path)
|
|
||||||
self.path=path
|
|
||||||
self.filename=path.name
|
|
||||||
self.filebytes=path.read_bytes()
|
|
||||||
self.suffix=Path(self.filename).suffix
|
|
||||||
self.stem=Path(self.filename).stem
|
|
||||||
|
|
||||||
class Converter(Protocol):
|
|
||||||
#转换为markdown
|
|
||||||
def convert(self,document:Document)->str:
|
|
||||||
...
|
|
||||||
|
|
||||||
async def convert_async(self,document:Document)->str:
|
|
||||||
...
|
|
||||||
12
docutranslate/converter/interfaces.py
Normal file
12
docutranslate/converter/interfaces.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from typing import Protocol, runtime_checkable
|
||||||
|
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class Converter(Protocol):
    """Structural interface for objects that turn one Document into another.

    Being ``runtime_checkable``, ``isinstance`` only verifies that both
    methods exist; it does not check their signatures.
    """

    def convert(self, document: Document) -> Document:
        """Convert ``document`` synchronously and return the result."""

    async def convert_async(self, document: Document) -> Document:
        """Coroutine counterpart of :meth:`convert`."""
|
||||||
0
docutranslate/converter/x2md/__init__.py
Normal file
0
docutranslate/converter/x2md/__init__.py
Normal file
@@ -1,8 +1,9 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
from logging import Logger
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@@ -13,34 +14,49 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
|||||||
from docling_core.types.doc import ImageRefMode
|
from docling_core.types.doc import ImageRefMode
|
||||||
from huggingface_hub.errors import LocalEntryNotFoundError
|
from huggingface_hub.errors import LocalEntryNotFoundError
|
||||||
|
|
||||||
from docutranslate.converter import Converter, Document
|
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
from docutranslate.logger import global_logger
|
from docutranslate.logger import global_logger
|
||||||
|
|
||||||
IMAGE_RESOLUTION_SCALE = 4
|
IMAGE_RESOLUTION_SCALE = 4
|
||||||
|
|
||||||
|
|
||||||
class ConverterDocling(Converter):
|
@dataclass(frozen=True)
|
||||||
def __init__(self, code=True, formula=True, artifact=None, logger: logging.Logger | None = None):
|
class ConverterDoclingConfig:
|
||||||
self.code = code
|
code: bool = True
|
||||||
self.formula = formula
|
formula: bool = True
|
||||||
self.artifact = artifact
|
artifact: Path | None = None
|
||||||
self.logger = logger if logger else global_logger
|
|
||||||
|
|
||||||
def convert(self, document):
|
|
||||||
assert isinstance(document.filename, str)
|
class ConverterDocling(X2MarkdownConverter):
|
||||||
|
def __init__(self, config: ConverterDoclingConfig, logger: Logger = global_logger):
|
||||||
|
self.config = config
|
||||||
|
self.code = config.code
|
||||||
|
self.formula = config.formula
|
||||||
|
self.artifact = config.artifact
|
||||||
|
self.logger = logger
|
||||||
|
|
||||||
|
def convert(self, document) -> MarkdownDocument:
|
||||||
|
assert isinstance(document.name, str)
|
||||||
self.logger.info(f"正在将文档转换为markdown")
|
self.logger.info(f"正在将文档转换为markdown")
|
||||||
time1 = time.time()
|
time1 = time.time()
|
||||||
document_stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes))
|
document_stream = DocumentStream(name=document.name, stream=BytesIO(document.content))
|
||||||
result = self.file2markdown_embed_images(document_stream)
|
content = self.file2markdown_embed_images(document_stream)
|
||||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||||
return result
|
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
||||||
|
return md_document
|
||||||
|
|
||||||
async def convert_async(self, document: Document) -> str:
|
async def convert_async(self, document: Document) -> MarkdownDocument:
|
||||||
return await asyncio.to_thread(
|
return await asyncio.to_thread(
|
||||||
self.convert,
|
self.convert,
|
||||||
document
|
document
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def support_format(self) -> list[str]:
    """Return the file suffixes this converter accepts.

    Every entry carries its leading dot so the list can be compared directly
    against suffixes such as ``".md"``.
    """
    return [
        ".pdf", ".docx", ".pptx", ".xlsx", ".md",
        # These three were previously listed without the dot ("html", "xhtml",
        # "csv") and so could never match a dotted file suffix.
        ".html", ".xhtml", ".csv",
        ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp",
    ]
|
||||||
|
|
||||||
def file2markdown_embed_images(self, file_path: Path | str | DocumentStream) -> str:
|
def file2markdown_embed_images(self, file_path: Path | str | DocumentStream) -> str:
|
||||||
pipeline_options = PdfPipelineOptions(artifacts_path=self.artifact)
|
pipeline_options = PdfPipelineOptions(artifacts_path=self.artifact)
|
||||||
pipeline_options.do_ocr = False
|
pipeline_options.do_ocr = False
|
||||||
15
docutranslate/converter/x2md/converter_identity.py
Normal file
15
docutranslate/converter/x2md/converter_identity.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
|
||||||
|
|
||||||
|
class ConverterIdentity(X2MarkdownConverter):
    """Pass-through converter: rewraps the input bytes as a MarkdownDocument.

    The content is not transformed; only the suffix is set to ``.md`` and the
    original stem is kept.
    """

    def convert(self, document: Document) -> MarkdownDocument:
        """Wrap ``document``'s content in a MarkdownDocument."""
        raw_bytes = document.content
        base_name = document.stem
        return MarkdownDocument.from_bytes(content=raw_bytes, suffix=".md", stem=base_name)

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Async variant of :meth:`convert`; performs the same wrapping inline."""
        raw_bytes = document.content
        base_name = document.stem
        return MarkdownDocument.from_bytes(content=raw_bytes, suffix=".md", stem=base_name)

    def support_format(self) -> list[str]:
        """Return the accepted suffixes (markdown only)."""
        accepted = [".md"]
        return accepted
|
||||||
@@ -1,14 +1,26 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
|
||||||
import time
|
import time
|
||||||
import zipfile
|
import zipfile
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from logging import Logger
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from docutranslate.converter import Converter, Document
|
|
||||||
|
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
from docutranslate.logger import global_logger
|
from docutranslate.logger import global_logger
|
||||||
from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
|
from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
|
||||||
|
|
||||||
URL = 'https://mineru.net/api/v4/file-urls/batch'
|
URL = 'https://mineru.net/api/v4/file-urls/batch'
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ConverterMineruConfig:
    """Immutable settings for the MinerU-backed converter.

    Frozen, so instances are hashable and cannot be modified after creation.
    """

    # API token; the converter strips surrounding whitespace before use.
    mineru_token: str
    # Formula-handling switch stored on the converter (defaults to enabled).
    formula: bool = True
|
||||||
|
|
||||||
|
|
||||||
timeout = httpx.Timeout(
|
timeout = httpx.Timeout(
|
||||||
connect=5.0, # 连接超时 (建立连接的最长时间)
|
connect=5.0, # 连接超时 (建立连接的最长时间)
|
||||||
read=200.0, # 读取超时 (等待服务器响应的最长时间)
|
read=200.0, # 读取超时 (等待服务器响应的最长时间)
|
||||||
@@ -16,16 +28,16 @@ timeout = httpx.Timeout(
|
|||||||
pool=1.0 # 从连接池获取连接的超时时间
|
pool=1.0 # 从连接池获取连接的超时时间
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
||||||
client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
||||||
|
|
||||||
# TODO: 提供更详细的logger
|
|
||||||
class ConverterMineru(Converter):
|
class ConverterMineru(X2MarkdownConverter):
|
||||||
def __init__(self, token: str, formula=True,logger:logging.Logger|None=None):
|
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
|
||||||
self.mineru_token = token.strip()
|
self.config = config
|
||||||
self.formula = formula
|
self.mineru_token = config.mineru_token.strip()
|
||||||
self.logger=logger if logger else global_logger
|
self.formula = config.formula
|
||||||
|
self.logger = logger
|
||||||
|
|
||||||
def _get_header(self):
|
def _get_header(self):
|
||||||
return {
|
return {
|
||||||
@@ -39,7 +51,7 @@ class ConverterMineru(Converter):
|
|||||||
"language": "auto",
|
"language": "auto",
|
||||||
"enable_table": True,
|
"enable_table": True,
|
||||||
"files": [
|
"files": [
|
||||||
{"name": f"{document.filename}", "is_ocr": True}
|
{"name": f"{document.name}", "is_ocr": True}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -54,7 +66,7 @@ class ConverterMineru(Converter):
|
|||||||
urls = result["data"]["file_urls"]
|
urls = result["data"]["file_urls"]
|
||||||
# print('batch_id:{},urls:{}'.format(batch_id, urls))
|
# print('batch_id:{},urls:{}'.format(batch_id, urls))
|
||||||
# 获取
|
# 获取
|
||||||
res_upload = client.put(urls[0], content=document.filebytes)
|
res_upload = client.put(urls[0], content=document.content)
|
||||||
res_upload.raise_for_status()
|
res_upload.raise_for_status()
|
||||||
# print(f"{urls[0]} upload success")
|
# print(f"{urls[0]} upload success")
|
||||||
return batch_id
|
return batch_id
|
||||||
@@ -72,7 +84,7 @@ class ConverterMineru(Converter):
|
|||||||
urls = result["data"]["file_urls"]
|
urls = result["data"]["file_urls"]
|
||||||
# print('batch_id:{},urls:{}'.format(batch_id, urls))
|
# print('batch_id:{},urls:{}'.format(batch_id, urls))
|
||||||
# 获取
|
# 获取
|
||||||
res_upload = await client_async.put(urls[0], content=document.filebytes)
|
res_upload = await client_async.put(urls[0], content=document.content)
|
||||||
res_upload.raise_for_status()
|
res_upload.raise_for_status()
|
||||||
# print(f"{urls[0]} upload success")
|
# print(f"{urls[0]} upload success")
|
||||||
return batch_id
|
return batch_id
|
||||||
@@ -87,8 +99,8 @@ class ConverterMineru(Converter):
|
|||||||
res.raise_for_status()
|
res.raise_for_status()
|
||||||
fileinfo = res.json()["data"]["extract_result"][0]
|
fileinfo = res.json()["data"]["extract_result"][0]
|
||||||
if fileinfo["state"] == "done":
|
if fileinfo["state"] == "done":
|
||||||
fileurl = fileinfo["full_zip_url"]
|
file_url = fileinfo["full_zip_url"]
|
||||||
return fileurl
|
return file_url
|
||||||
else:
|
else:
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
@@ -100,36 +112,40 @@ class ConverterMineru(Converter):
|
|||||||
res.raise_for_status()
|
res.raise_for_status()
|
||||||
fileinfo = res.json()["data"]["extract_result"][0]
|
fileinfo = res.json()["data"]["extract_result"][0]
|
||||||
if fileinfo["state"] == "done":
|
if fileinfo["state"] == "done":
|
||||||
fileurl = fileinfo["full_zip_url"]
|
file_url = fileinfo["full_zip_url"]
|
||||||
return fileurl
|
return file_url
|
||||||
else:
|
else:
|
||||||
await asyncio.sleep(3)
|
await asyncio.sleep(3)
|
||||||
|
|
||||||
def convert(self, document: Document) -> str:
|
def convert(self, document: Document) -> MarkdownDocument:
|
||||||
self.logger.info(f"正在将文档转换为markdown")
|
self.logger.info(f"正在将文档转换为markdown")
|
||||||
time1 = time.time()
|
time1 = time.time()
|
||||||
batch_id = self.upload(document)
|
batch_id = self.upload(document)
|
||||||
file_url = self.get_file_url(batch_id)
|
file_url = self.get_file_url(batch_id)
|
||||||
result = get_md_from_zip_url_with_inline_images(zip_url=file_url)
|
content = get_md_from_zip_url_with_inline_images(zip_url=file_url)
|
||||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||||
return result
|
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
||||||
|
return md_document
|
||||||
|
|
||||||
async def convert_async(self, document: Document) -> str:
|
async def convert_async(self, document: Document) -> MarkdownDocument:
|
||||||
# 待优化
|
|
||||||
self.logger.info(f"正在将文档转换为markdown")
|
self.logger.info(f"正在将文档转换为markdown")
|
||||||
time1 = time.time()
|
time1 = time.time()
|
||||||
batch_id = await self.upload_async(document)
|
batch_id = await self.upload_async(document)
|
||||||
file_url = await self.get_file_url_async(batch_id)
|
file_url = await self.get_file_url_async(batch_id)
|
||||||
result = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
|
content = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
|
||||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||||
return result
|
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
||||||
|
return md_document
|
||||||
|
|
||||||
|
def support_format(self) -> list[str]:
    """Return the file suffixes (with leading dot) this converter accepts."""
    office_suffixes = [".pdf", ".doc", ".docx", ".ppt", ".pptx"]
    image_suffixes = [".png", ".jpg", ".jpeg"]
    return office_suffixes + image_suffixes
|
||||||
|
|
||||||
|
|
||||||
def get_md_from_zip_url_with_inline_images(
|
def get_md_from_zip_url_with_inline_images(
|
||||||
zip_url: str,
|
zip_url: str,
|
||||||
filename_in_zip: str = "full.md",
|
filename_in_zip: str = "full.md",
|
||||||
encoding: str = "utf-8"
|
encoding: str = "utf-8"
|
||||||
) -> str | None:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
||||||
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
||||||
@@ -152,7 +168,8 @@ def get_md_from_zip_url_with_inline_images(
|
|||||||
|
|
||||||
|
|
||||||
except httpx.HTTPStatusError as e:
|
except httpx.HTTPStatusError as e:
|
||||||
raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
raise Exception(
|
||||||
|
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
||||||
except httpx.RequestError as e:
|
except httpx.RequestError as e:
|
||||||
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
|
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
|
||||||
except zipfile.BadZipFile:
|
except zipfile.BadZipFile:
|
||||||
@@ -169,7 +186,7 @@ async def get_md_from_zip_url_with_inline_images_async(
|
|||||||
zip_url: str,
|
zip_url: str,
|
||||||
filename_in_zip: str = "full.md",
|
filename_in_zip: str = "full.md",
|
||||||
encoding: str = "utf-8"
|
encoding: str = "utf-8"
|
||||||
) -> str | None:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
||||||
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
||||||
@@ -181,18 +198,20 @@ async def get_md_from_zip_url_with_inline_images_async(
|
|||||||
encoding (str): 目标文件的预期编码。默认为 "utf-8"。
|
encoding (str): 目标文件的预期编码。默认为 "utf-8"。
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str | None: 如果成功,返回处理后的Markdown文本内容;否则返回 None。
|
str : 如果成功,返回处理后的Markdown文本内容。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
|
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
|
||||||
response = await client_async.get(zip_url) # 增加超时
|
response = await client_async.get(zip_url) # 增加超时
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
print("ZIP文件下载完成。")
|
print("ZIP文件下载完成。")
|
||||||
return await asyncio.to_thread(embed_inline_image_from_zip,response.content, filename_in_zip=filename_in_zip, encoding=encoding)
|
return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip,
|
||||||
|
encoding=encoding)
|
||||||
|
|
||||||
|
|
||||||
except httpx.HTTPStatusError as e:
|
except httpx.HTTPStatusError as e:
|
||||||
raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
raise Exception(
|
||||||
|
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
||||||
except httpx.RequestError as e:
|
except httpx.RequestError as e:
|
||||||
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
|
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
|
||||||
except zipfile.BadZipFile:
|
except zipfile.BadZipFile:
|
||||||
@@ -204,5 +223,6 @@ async def get_md_from_zip_url_with_inline_images_async(
|
|||||||
traceback.print_exc() # 打印完整的堆栈跟踪,便于调试
|
traceback.print_exc() # 打印完整的堆栈跟踪,便于调试
|
||||||
raise Exception(f"发生未知错误: {e}")
|
raise Exception(f"发生未知错误: {e}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
pass
|
pass
|
||||||
22
docutranslate/converter/x2md/interfaces.py
Normal file
22
docutranslate/converter/x2md/interfaces.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
from typing import runtime_checkable
|
||||||
|
|
||||||
|
from typing import Protocol
|
||||||
|
from docutranslate.converter.interfaces import Converter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class X2MarkdownConverter(Converter, Protocol):
    """Converter protocol for turning files of other formats into markdown.

    Narrows the return type of the base ``Converter`` methods to
    ``MarkdownDocument`` and adds :meth:`support_format`.
    """

    def convert(self, document: Document) -> MarkdownDocument:
        """Convert ``document`` to markdown synchronously."""

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Coroutine counterpart of :meth:`convert`."""

    def support_format(self) -> list[str]:
        """Return the list of input formats this converter accepts."""
|
||||||
0
docutranslate/document_context/__init__.py
Normal file
0
docutranslate/document_context/__init__.py
Normal file
15
docutranslate/document_context/md_mask_context.py
Normal file
15
docutranslate/document_context/md_mask_context.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
from docutranslate.utils.markdown_utils import MaskDict, uris2placeholder, placeholder2uris
|
||||||
|
|
||||||
|
|
||||||
|
class MDMaskUrisContext:
    """Context manager that masks a markdown document's content in place.

    On entry the document's bytes are decoded, passed through
    ``uris2placeholder`` with this context's ``MaskDict`` and re-encoded; on
    exit ``placeholder2uris`` is applied the same way with the same
    ``MaskDict``. Presumably this swaps URIs for short placeholders and back;
    confirm against ``markdown_utils``.
    """

    def __init__(self, document: MarkdownDocument):
        self.document = document
        # Shared between __enter__ and __exit__ for the two helper calls.
        self.mask_dict = MaskDict()

    def __enter__(self):
        plain_text = self.document.content.decode()
        masked_text = uris2placeholder(plain_text, self.mask_dict)
        self.document.content = masked_text.encode()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None implicitly, so exceptions raised in the with-body propagate.
        masked_text = self.document.content.decode()
        restored_text = placeholder2uris(masked_text, self.mask_dict)
        self.document.content = restored_text.encode()
|
||||||
3
docutranslate/exporter/__init__.py
Normal file
3
docutranslate/exporter/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
"""
|
||||||
|
这个包用于将Document导出为其它格式
|
||||||
|
"""
|
||||||
8
docutranslate/exporter/export_config.py
Normal file
8
docutranslate/exporter/export_config.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
@dataclass
class ExportConfig:
    """Field-less base dataclass for exporter configuration objects."""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
16
docutranslate/exporter/interfaces.py
Normal file
16
docutranslate/exporter/interfaces.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from typing import Protocol, runtime_checkable, TypeVar, Any, Self
|
||||||
|
|
||||||
|
from docutranslate.exporter.export_config import ExportConfig
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
D_in = TypeVar('D_in', bound=Document)
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class Exporter(Protocol[D_in]):
    """Structural interface for objects that export a document of type ``D_in``.

    Being ``runtime_checkable``, ``isinstance`` only verifies that the members
    exist; it does not check their signatures.
    """

    @classmethod
    def from_config(cls, export_config: ExportConfig | None = None) -> Self:
        """Alternate constructor: build an exporter from an optional config."""

    def export(self, document: D_in) -> Any:
        """Export ``document``; the result type is left to the implementation."""
|
||||||
0
docutranslate/exporter/md2x/__init__.py
Normal file
0
docutranslate/exporter/md2x/__init__.py
Normal file
12
docutranslate/exporter/md2x/interfaces.py
Normal file
12
docutranslate/exporter/md2x/interfaces.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from docutranslate.exporter.export_config import ExportConfig
|
||||||
|
from docutranslate.exporter.interfaces import Exporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
|
||||||
|
|
||||||
|
class MDExporter(Exporter):
    """Base for exporters that take a MarkdownDocument and produce a Document."""

    def export(self, document: MarkdownDocument) -> Document:
        """Export ``document``; this base implementation is a stub returning None."""
|
||||||
73
docutranslate/exporter/md2x/md2html_exporter.py
Normal file
73
docutranslate/exporter/md2x/md2html_exporter.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import jinja2
|
||||||
|
import markdown2
|
||||||
|
|
||||||
|
from docutranslate.exporter.export_config import ExportConfig
|
||||||
|
from docutranslate.exporter.md2x.interfaces import MDExporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
from docutranslate.utils.resource_utils import resource_path
|
||||||
|
|
||||||
|
@dataclass
class MD2HTMLExportConfig(ExportConfig):
    """Settings for the markdown-to-HTML exporter."""

    # When True (the default) the exporter links CSS/JS assets from a CDN;
    # when False it inlines the bundled static files instead.
    cdn: bool = True
|
||||||
|
|
||||||
|
class MD2HTMLExporter(MDExporter):
    """Render a Markdown document into a single HTML page."""

    def __init__(self, export_config: MD2HTMLExportConfig = None):
        # Use the default configuration when the caller supplies none.
        config = export_config or MD2HTMLExportConfig()
        self.cdn = config.cdn

    def export(self, document: MarkdownDocument) -> Document:
        """Convert ``document`` to HTML and wrap it in the page template.

        When ``self.cdn`` is true the pico/KaTeX assets are referenced through
        CDN tags; otherwise the bundled files are read and inlined. The
        mermaid script is always inlined from the bundled file.

        Returns:
            A new ``Document`` with suffix ``.html`` and the same stem.
        """
        use_cdn = self.cdn

        def bundled(rel_path: str) -> str:
            # Text of a resource shipped with the package.
            return resource_path(rel_path).read_text(encoding="utf-8")

        converter = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"])
        # language=html
        pico = r'<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/picocss/2.1.1/pico.min.css" integrity="sha512-+4kjFgVD0n6H3xt19Ox84B56MoS7srFn60tgdWFuO4hemtjhySKyW4LnftYZn46k3THUEiTTsbVjrHai+0MOFw==" crossorigin="anonymous" referrerpolicy="no-referrer" />' if use_cdn else f'<style>{bundled("static/pico.css")}</style>'
        page_template = bundled("template/markdown.html")
        katex_css = r"""<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/katex.min.css" integrity="sha512-fHwaWebuwA7NSF5Qg/af4UeDx9XqUpYpOGgubo3yWu+b2IQR4UeQwbb42Ti7gVAjNtVoI/I9TEoYeu9omwcC6g==" crossorigin="anonymous" referrerpolicy="no-referrer" />""" if use_cdn else f'<style>{bundled("static/katex.css")}</style>'
        katex_js = r"""<script defer src="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/katex.min.js" integrity="sha512-LQNxIMR5rXv7o+b1l8+N1EZMfhG7iFZ9HhnbJkTp4zjNr5Wvst75AqUeFDxeRUa7l5vEDyUiAip//r+EFLLCyA==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>""" if use_cdn else f'<script>{bundled("static/katex.js")}</script>'
        auto_render = r"""<script defer src="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js" integrity="sha512-iWiuBS5nt6r60fCz26Nd0Zqe0nbk1ZTIQbl3Kv7kYsX+yKMUFHzjaH2+AnM6vp2Xs+gNmaBAVWJjSmuPw76Efg==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>""" if use_cdn else f'<script>{bundled("static/autoRender.js")}</script>'
        # The two bootstrap snippets differ only in layout and in the extra
        # `fonts: false` option used by the non-CDN variant.
        # language=javascript
        if use_cdn:
            render_math_in_element = r"""
<script>
document.addEventListener("DOMContentLoaded", function () {
renderMathInElement(document.body, {
delimiters: [
{left: '$$', right: '$$', display: true},
{left: '\\[', right: '\\]', display: true},
{left: '$', right: '$', display: false},
{left: '\\(', right: '\\)', display: false}
],
throwOnError: false
})
});
</script>"""
        else:
            render_math_in_element = r"""
<script>
document.addEventListener("DOMContentLoaded", function
() {
renderMathInElement(document.body, {
delimiters: [
{left: '$$', right: '$$', display: true},
{left: '\\[', right: '\\]', display: true},
{left: '$', right: '$', display: false},
{left: '\\(', right: '\\)', display: false}
],
fonts: false,
throwOnError: false
})
});
</script>"""
        mermaid = f'<script>{bundled("static/mermaid.js")}</script>'
        # Backslashes are doubled before the Markdown conversion.
        markdown_source = document.content.decode().replace("\\", "\\\\")
        body_html = converter.convert(markdown_source)
        # TODO: support a localized (non-CDN) MathJax
        template = jinja2.Template(page_template)
        page = template.render(
            title=document.stem,
            pico=pico,
            katexCss=katex_css,
            katexJs=katex_js,
            autoRender=auto_render,
            markdown=body_html,
            renderMathInElement=render_math_in_element,
            mermaid=mermaid,
        )
        return Document.from_bytes(content=page.encode("utf-8"), suffix=".html", stem=document.stem)
|
||||||
26
docutranslate/exporter/md2x/md2md_exporter.py
Normal file
26
docutranslate/exporter/md2x/md2md_exporter.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import runtime_checkable
|
||||||
|
|
||||||
|
from docutranslate.exporter.export_config import ExportConfig
|
||||||
|
from docutranslate.exporter.md2x.interfaces import MDExporter
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument,Document
|
||||||
|
from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MD2MDExportConfig(ExportConfig):
    """Options for the Markdown-to-Markdown exporter."""

    # True: export the Markdown bytes unchanged as ".md";
    # False: export a ".zip" built by unembed_base64_images_to_zip.
    embed_images: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
class MD2MDExporter(MDExporter):
    """Export a Markdown document as ``.md`` or as a ``.zip`` archive."""

    def __init__(self, export_config: MD2MDExportConfig | None = None):
        # Use the default configuration when the caller supplies none.
        config = export_config or MD2MDExportConfig()
        self.embed_images = config.embed_images

    def export(self, document: MarkdownDocument) -> Document:
        """Return the exported document.

        With ``embed_images`` disabled, the decoded Markdown is passed to
        ``unembed_base64_images_to_zip`` and the result is returned as a
        ``.zip`` document. Otherwise the original bytes are returned as ``.md``.
        """
        if not self.embed_images:
            archive = unembed_base64_images_to_zip(document.content.decode(), markdown_name=document.name)
            return Document.from_bytes(suffix=".zip", content=archive, stem=document.stem)
        return Document.from_bytes(suffix=".md", content=document.content, stem=document.stem)
|
||||||
|
|
||||||
|
|
||||||
4
docutranslate/exporter/md2x/types.py
Normal file
4
docutranslate/exporter/md2x/types.py
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
|
||||||
|
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
|
||||||
|
|
||||||
|
# Union of the configuration classes accepted by the x2md converters.
x2md_convert_config_type = ConverterDoclingConfig | ConverterMineruConfig
|
||||||
0
docutranslate/exporter/txt2x/__init__.py
Normal file
0
docutranslate/exporter/txt2x/__init__.py
Normal file
8
docutranslate/exporter/txt2x/interfaces.py
Normal file
8
docutranslate/exporter/txt2x/interfaces.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
from docutranslate.exporter.interfaces import Exporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
# TODO: decide whether TXT needs its own dedicated document type
class TXTExporter(Exporter):
    """Base type for exporters that consume a plain-text document."""

    def export(self, document: Document) -> Document:
        """Export the text ``document``; implemented by subclasses."""
|
||||||
33
docutranslate/exporter/txt2x/txt2html_exporter.py
Normal file
33
docutranslate/exporter/txt2x/txt2html_exporter.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import jinja2
|
||||||
|
|
||||||
|
from docutranslate.exporter.export_config import ExportConfig
|
||||||
|
from docutranslate.exporter.txt2x.interfaces import TXTExporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
from docutranslate.utils.resource_utils import resource_path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TXT2HTMLExportConfig(ExportConfig):
    """Options for the TXT-to-HTML exporter."""

    # True: reference pico.css through a CDN tag; False: inline the bundled copy.
    cdn: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
class TXT2HTMLExporter(TXTExporter):
    """Render a plain-text document into a single HTML page."""

    def __init__(self, export_config: TXT2HTMLExportConfig | None = None):
        # Use the default configuration when the caller supplies none.
        export_config = export_config or TXT2HTMLExportConfig()
        self.cdn = export_config.cdn

    def export(self, document: Document) -> Document:
        """Wrap the text of ``document`` in the ``template/txt.html`` page.

        The text is HTML-escaped and its line breaks are kept as ``<br>``
        tags, then passed to the template as ``body``. Previously no ``body``
        was passed, so the rendered page contained no text.

        Returns:
            A new ``Document`` with suffix ``.html`` and the same stem.
        """
        # Local import: ``html`` is not among this module's top-level imports.
        import html

        cdn = self.cdn
        html_template = resource_path("template/txt.html").read_text(encoding="utf-8")

        # language=html
        pico = f'<style>{resource_path("static/pico.css").read_text(encoding="utf-8")}</style>' if not cdn else r'<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/picocss/2.1.1/pico.min.css" integrity="sha512-+4kjFgVD0n6H3xt19Ox84B56MoS7srFn60tgdWFuO4hemtjhySKyW4LnftYZn46k3THUEiTTsbVjrHai+0MOFw==" crossorigin="anonymous" referrerpolicy="no-referrer" />'

        # Escape first so text such as "<b>" shows literally, then restore
        # the line structure that HTML would otherwise collapse.
        text = document.content.decode()
        body = "<br>\n".join(html.escape(line) for line in text.splitlines())

        render = jinja2.Template(html_template).render(
            title=document.stem,
            pico=pico,
            body=body,
        )
        return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem)
|
||||||
0
docutranslate/ir/__init__.py
Normal file
0
docutranslate/ir/__init__.py
Normal file
24
docutranslate/ir/document.py
Normal file
24
docutranslate/ir/document.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
import dataclasses
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
class Document:
|
||||||
|
def __init__(self,suffix:str,content:bytes,stem:str|None=None,path:Path=None):
|
||||||
|
self.suffix=suffix
|
||||||
|
self.content=content
|
||||||
|
self.stem=stem
|
||||||
|
self.path=path
|
||||||
|
@property
|
||||||
|
def name(self)->str|None:
|
||||||
|
if not self.stem:
|
||||||
|
return None
|
||||||
|
return self.stem+self.suffix
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_path(cls,path:Path|str):
|
||||||
|
if isinstance(path,str):
|
||||||
|
path=Path(path)
|
||||||
|
return cls(suffix=path.suffix,content=path.read_bytes(),stem=path.stem,path=path)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_bytes(cls,content:bytes,suffix:str,stem:str|None):
|
||||||
|
return cls(content=content,suffix=suffix,stem=stem)
|
||||||
7
docutranslate/ir/markdown_document.py
Normal file
7
docutranslate/ir/markdown_document.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownDocument(Document):
    """A ``Document`` whose suffix is always ``.md``."""

    def __init__(self, *args, **kwargs):
        # Forward everything to Document, then override any suffix passed in.
        super().__init__(*args, **kwargs)
        self.suffix = ".md"
|
||||||
0
docutranslate/manager/__init__.py
Normal file
0
docutranslate/manager/__init__.py
Normal file
51
docutranslate/manager/base_manager.py
Normal file
51
docutranslate/manager/base_manager.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from logging import Logger
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self, Generic, TypeVar
|
||||||
|
|
||||||
|
from docutranslate.exporter.interfaces import Exporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.logger import global_logger
|
||||||
|
|
||||||
|
T_Translated = TypeVar('T_Translated', bound=Document)


class BaseManager(ABC, Generic[T_Translated]):
    """Common workflow for document managers: read, translate, export, save.

    Subclasses implement ``translate``/``translate_async`` (expected to set
    ``document_translated``) and ``support_export_format``.
    """

    def __init__(self, logger: Logger = global_logger):
        self.logger = logger
        self.document_original: Document | None = None
        self.document_translated: T_Translated | None = None

    def read_path(self, path: Path | str):
        """Load the original document from a file path."""
        document = Document.from_path(path)
        self.document_original = document

    def read_bytes(self, content: bytes, stem: str, suffix: str):
        """Load the original document from raw bytes plus its stem and suffix."""
        document = Document.from_bytes(content=content, stem=stem, suffix=suffix)
        self.document_original = document

    @abstractmethod
    def translate(self, *args, **kwargs) -> Self:
        ...

    @abstractmethod
    async def translate_async(self, *args, **kwargs) -> Self:
        ...

    def _export(self, exporter: Exporter) -> Document:
        """Run ``exporter`` on the translated document.

        Raises:
            RuntimeError: if no translated document is available yet.
        """
        if self.document_translated is None:
            raise RuntimeError("Document has not been translated yet. Call translate() first.")
        docu = exporter.export(self.document_translated)
        return docu

    def _save(self, exporter: Exporter, name: str | None = None, out_put_dir: Path | str = "./output"):
        """Export the translated document and write it under ``out_put_dir``.

        Args:
            exporter: exporter used to produce the output document.
            name: output file name; defaults to the exported document's name.
            out_put_dir: target directory, created if missing.

        Raises:
            RuntimeError: if no translated document is available yet.
            ValueError: if ``name`` is not given and the exported document
                has no name (it has no stem).
        """
        docu = self._export(exporter)
        name = name or docu.name
        if not name:
            # Without this check Path(None) fails with an opaque TypeError.
            raise ValueError("Cannot determine the output file name. Pass `name` explicitly.")
        output_path = Path(out_put_dir) / Path(name)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(docu.content)
        self.logger.info(f"文件已保存到{output_path.resolve()}")
        return self

    @abstractmethod
    def support_export_format(self) -> list[str]:
        ...
|
||||||
34
docutranslate/manager/interfaces.py
Normal file
34
docutranslate/manager/interfaces.py
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Protocol, runtime_checkable, Self, TypeVar
|
||||||
|
|
||||||
|
from docutranslate.exporter.export_config import ExportConfig
|
||||||
|
|
||||||
|
# Any export configuration type.
T = TypeVar("T", bound=ExportConfig)


@runtime_checkable
class HTMLExportable(Protocol):
    """Structural interface for managers that can produce HTML output."""

    def export_to_html(self, export_config: T) -> str:
        """Return the HTML export as a string."""

    def save_as_html(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
        """Write the HTML export as ``name`` under ``out_put_dir``."""
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class MDExportable(Protocol):
    """Structural interface for managers that can produce Markdown output."""

    def export_to_markdown(self, export_config: T) -> str:
        """Return the Markdown export as a string."""

    def save_as_markdown(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
        """Write the Markdown export as ``name`` under ``out_put_dir``."""
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class TXTExportable(Protocol):
    """Structural interface for managers that can produce plain-text output."""

    def export_to_txt(self, export_config: T) -> str:
        """Return the text export as a string."""

    def save_as_txt(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
        """Write the text export as ``name`` under ``out_put_dir``."""
|
||||||
102
docutranslate/manager/md_based_manager.py
Normal file
102
docutranslate/manager/md_based_manager.py
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self, Literal, overload
|
||||||
|
|
||||||
|
from docutranslate.cacher import md_based_convert_cacher
|
||||||
|
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
|
||||||
|
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
|
||||||
|
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
|
||||||
|
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
|
||||||
|
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
|
||||||
|
from docutranslate.exporter.md2x.types import x2md_convert_config_type
|
||||||
|
from docutranslate.manager.base_manager import BaseManager
|
||||||
|
from docutranslate.manager.interfaces import HTMLExportable, MDExportable
|
||||||
|
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
    """Manager that converts the input to Markdown, translates it and exports it."""

    def support_export_format(self) -> list[str]:
        return [".md", ".html", ".zip"]

    def _get_document_md(self, convert_engin, convert_config):
        """Return the Markdown form of the original document.

        A cached conversion is used when the cacher has one; otherwise the
        document is converted with the selected engine and the result is
        stored in the cache.

        Raises:
            RuntimeError: if no file has been read, or ``convert_config`` is
                not of the type required by ``convert_engin``.
            ValueError: if ``convert_engin`` is not a known engine.
        """
        if self.document_original is None:
            raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
        # Look up a previously converted document.
        document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
                                                                    convert_config)
        if document_cached:
            document_md = document_cached
        else:
            # No cache hit: pick the converter for the requested engine.
            if convert_engin is None:
                converter = ConverterIdentity()
            elif convert_engin == "mineru":
                if not isinstance(convert_config, ConverterMineruConfig):
                    raise RuntimeError(f"未传入正确的convert_config,应传入{ConverterMineruConfig}")
                converter = ConverterMineru(convert_config, logger=self.logger)
            elif convert_engin == "docling":
                if not isinstance(convert_config, ConverterDoclingConfig):
                    raise RuntimeError(f"未传入正确的convert_config,应传入{ConverterDoclingConfig}")
                converter = ConverterDocling(convert_config, logger=self.logger)
            else:
                raise ValueError(f"不存在{convert_engin}解析引擎")
            document_md = converter.convert(self.document_original)
            # Store the freshly converted document in the cache.
            md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
        return document_md

    @overload
    def translate(self, convert_engin: None,
                  convert_config: None, translate_config: MDTranslateConfig) -> Self:
        ...

    @overload
    def translate(self, convert_engin: Literal["docling"],
                  convert_config: ConverterDoclingConfig, translate_config: MDTranslateConfig) -> Self:
        ...

    @overload
    def translate(self, convert_engin: Literal["mineru"],
                  convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self:
        ...

    def translate(self, convert_engin: Literal["mineru", "docling"] | None,
                  convert_config: x2md_convert_config_type | None,
                  translate_config: MDTranslateConfig) -> Self:
        """Convert the original document to Markdown and translate it.

        The result is stored in ``document_translated``.
        """
        # Local import: ``copy`` is not among this module's top-level imports.
        from copy import copy

        # The translator replaces ``content`` in place. Work on a shallow copy
        # so the object handed to / returned by the convert cache is not
        # overwritten (NOTE(review): assumes the cacher may keep a reference
        # to the same object — confirm against the cacher implementation).
        document_md = copy(self._get_document_md(convert_engin, convert_config))
        # Translate the converted document.
        translator = MDTranslator(translate_config)
        translator.translate(document_md)
        self.document_translated = document_md
        return self

    async def translate_async(self, convert_engin: Literal["mineru", "docling"] | None,
                              convert_config: x2md_convert_config_type | None,
                              translate_config: MDTranslateConfig) -> Self:
        """Async variant of :meth:`translate`; the conversion runs in a worker thread."""
        # Local import: ``copy`` is not among this module's top-level imports.
        from copy import copy

        converted = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
        # Same shallow copy as in translate(): keep the cached object untouched.
        document_md = copy(converted)
        # Translate the converted document.
        translator = MDTranslator(translate_config)
        await translator.translate_async(document_md)
        self.document_translated = document_md
        return self

    def export_to_html(self, export_config: MD2HTMLExportConfig | None = None) -> str:
        """Return the translated document rendered as an HTML string."""
        docu = self._export(MD2HTMLExporter(export_config))
        return docu.content.decode()

    def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str:
        """Return the decoded content produced by the Markdown exporter."""
        docu = self._export(MD2MDExporter(export_config))
        return docu.content.decode()

    def save_as_html(self, name: str = None, out_put_dir: Path | str = "./output",
                     export_config: MD2HTMLExportConfig | None = None) -> Self:
        """Export to HTML and write the file under ``out_put_dir``."""
        self._save(exporter=MD2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir)
        return self

    def save_as_markdown(self, name: str = None, out_put_dir: Path | str = "./output",
                         export_config: MD2MDExportConfig | None = None) -> Self:
        """Export with the Markdown exporter and write the file under ``out_put_dir``."""
        self._save(exporter=MD2MDExporter(export_config), name=name, out_put_dir=out_put_dir)
        return self
|
||||||
66
docutranslate/manager/txt_manager.py
Normal file
66
docutranslate/manager/txt_manager.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
from copy import copy
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from logging import Logger
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
|
||||||
|
from docutranslate.manager.base_manager import BaseManager
|
||||||
|
from docutranslate.manager.interfaces import HTMLExportable
|
||||||
|
from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class TXTManagerConfig:
|
||||||
|
chunk_size: int = 3000
|
||||||
|
base_url: str | None = None
|
||||||
|
api_key = None,
|
||||||
|
model_id: str | None = None
|
||||||
|
temperature = 0.7
|
||||||
|
concurrent: int = 30
|
||||||
|
timeout = 2000
|
||||||
|
cache = True
|
||||||
|
logger: Logger | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class TXTManager(BaseManager, HTMLExportable):
    """Manager that translates a plain-text document and exports it as TXT or HTML."""

    def support_export_format(self) -> list[str]:
        return [".txt", ".html"]

    def _copy_original(self):
        """Return a shallow copy of the original document.

        Raises:
            RuntimeError: if no file has been read yet.
        """
        if self.document_original is None:
            raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
        return copy(self.document_original)

    def _require_translated(self):
        """Return the translated document.

        Raises:
            RuntimeError: if the document has not been translated yet.
        """
        if self.document_translated is None:
            raise RuntimeError("Document has not been translated yet. Call translate() first.")
        return self.document_translated

    def translate(self, translate_config: TXTTranslateConfig) -> Self:
        """Translate a copy of the original document and store it in ``document_translated``."""
        document = self._copy_original()
        translator = TXTTranslator(translate_config)
        translator.translate(document)
        self.document_translated = document
        return self

    async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
        """Async variant of :meth:`translate`."""
        document = self._copy_original()
        translator = TXTTranslator(translate_config)
        await translator.translate_async(document)
        self.document_translated = document
        return self

    def export_to_html(self, export_config: TXT2HTMLExportConfig | None = None) -> str:
        """Return the translated document rendered as an HTML string.

        ``export_config`` is optional, matching ``save_as_html``.
        """
        docu = self._export(TXT2HTMLExporter(export_config))
        return docu.content.decode()

    def export_to_txt(self) -> str:
        """Return the translated text."""
        return self._require_translated().content.decode()

    def save_as_html(self, name: str = None, out_put_dir: Path | str = "./output",
                     export_config: TXT2HTMLExportConfig | None = None) -> Self:
        """Export to HTML and write the file under ``out_put_dir``."""
        self._save(exporter=TXT2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir)
        return self

    def save_as_txt(self, name: str = None, out_put_dir: Path | str = "./output", ) -> Self:
        """Write the translated text under ``out_put_dir``.

        Raises:
            RuntimeError: if the document has not been translated yet.
            ValueError: if ``name`` is not given and the document has no name.
        """
        document = self._require_translated()
        name = name or document.name
        if not name:
            # Without this check Path(None) fails with an opaque TypeError.
            raise ValueError("Cannot determine the output file name. Pass `name` explicitly.")
        output_path = Path(out_put_dir) / Path(name)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(document.content)
        self.logger.info(f"文件已保存到{output_path.resolve()}")
        return self
|
||||||
17
docutranslate/template/txt.html
Normal file
17
docutranslate/template/txt.html
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <!-- Without a viewport declaration mobile browsers render at desktop width. -->
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>{{ title }}</title>
    <!-- pico: stylesheet markup (CDN link or inlined style) supplied by the exporter -->
    {{pico}}
    <style>
        html {
            padding: 2vh 10vw;
            font-size: 15px;
        }
    </style>
</head>
<body>
<!-- body: page content supplied by the exporter -->
{{ body }}
</body>
</html>
|
||||||
@@ -11,12 +11,12 @@ import markdown2
|
|||||||
|
|
||||||
from docutranslate.agents import Agent, AgentArgs
|
from docutranslate.agents import Agent, AgentArgs
|
||||||
from docutranslate.agents import MDRefineAgent, MDTranslateAgent
|
from docutranslate.agents import MDRefineAgent, MDTranslateAgent
|
||||||
from docutranslate.cacher import document_cacher_global
|
from docutranslate.cacher import md_based_convert_cacher
|
||||||
from docutranslate.converter import Document, ConverterMineru
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.global_values import available_packages
|
from docutranslate.global_values import available_packages
|
||||||
from docutranslate.logger import global_logger
|
from docutranslate.logger import global_logger
|
||||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||||
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \
|
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2uris, MaskDict, clean_markdown_math_block, \
|
||||||
unembed_base64_images_to_zip, embed_inline_image_from_zip, find_markdown_in_zip
|
unembed_base64_images_to_zip, embed_inline_image_from_zip, find_markdown_in_zip
|
||||||
from docutranslate.utils.resource_utils import resource_path
|
from docutranslate.utils.resource_utils import resource_path
|
||||||
|
|
||||||
@@ -62,7 +62,7 @@ class FileTranslater:
|
|||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
self.document: Document | None = None
|
self.document: Document | None = None
|
||||||
self.cache = cache
|
self.cache = cache
|
||||||
self.cacher = document_cacher_global
|
self.cacher = md_based_convert_cacher
|
||||||
if file_path:
|
if file_path:
|
||||||
self.read_file(file_path=file_path)
|
self.read_file(file_path=file_path)
|
||||||
|
|
||||||
@@ -79,7 +79,7 @@ class FileTranslater:
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def _unmask_uris_in_markdown(self):
|
def _unmask_uris_in_markdown(self):
|
||||||
self.markdown = placeholder2_uris(self.markdown, self._mask_dict)
|
self.markdown = placeholder2uris(self.markdown, self._mask_dict)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def _split_markdown_into_chunks(self) -> list[str]:
|
def _split_markdown_into_chunks(self) -> list[str]:
|
||||||
|
|||||||
0
docutranslate/translater/__init__.py
Normal file
0
docutranslate/translater/__init__.py
Normal file
16
docutranslate/translater/base.py
Normal file
16
docutranslate/translater/base.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from logging import Logger
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class AiTranslateConfig:
|
||||||
|
base_url: str
|
||||||
|
api_key: str
|
||||||
|
model_id: str
|
||||||
|
to_lang: str
|
||||||
|
custom_prompt: str | None = None
|
||||||
|
temperature: float = 0.7
|
||||||
|
timeout: int = 2000
|
||||||
|
chunk_size: int = 3000
|
||||||
|
concurrent: int = 30
|
||||||
|
logger: Logger | None = None
|
||||||
21
docutranslate/translater/interfaces.py
Normal file
21
docutranslate/translater/interfaces.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from typing import runtime_checkable, Protocol, TypeVar
|
||||||
|
|
||||||
|
from docutranslate.agents import Agent
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
T = TypeVar('T', bound=Document)
V = TypeVar('V', bound=Agent)


@runtime_checkable
class Translator(Protocol[T, V]):
    """
    Translates the intermediate text (replacing it in place); a Translator does no format conversion.
    """

    def translate(self, document: T) -> Document:
        """Translate ``document`` synchronously."""

    async def translate_async(self, document: T) -> Document:
        """Translate ``document`` asynchronously."""

    def log(self, info: str):
        """Log ``info``."""
|
||||||
70
docutranslate/translater/md_translator.py
Normal file
70
docutranslate/translater/md_translator.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
import asyncio
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from logging import Logger
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from docutranslate.agents import MDTranslateAgent
|
||||||
|
from docutranslate.document_context.md_mask_context import MDMaskUrisContext
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
from docutranslate.logger import global_logger
|
||||||
|
from docutranslate.translater.base import AiTranslateConfig
|
||||||
|
from docutranslate.translater.interfaces import Translator
|
||||||
|
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||||
|
from docutranslate.utils.markdown_utils import clean_markdown_math_block
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MDTranslateConfig(AiTranslateConfig):
    """Configuration for ``MDTranslator``; adds no fields to ``AiTranslateConfig``."""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class MDTranslator(Translator):
    """Translate the content of a Markdown document in place via ``MDTranslateAgent``."""

    def __init__(self, config: MDTranslateConfig):
        self.logger = config.logger or global_logger
        self.chunk_size = config.chunk_size
        agent_options = dict(
            custom_prompt=config.custom_prompt,
            to_lang=config.to_lang,
            baseurl=config.base_url,
            key=config.api_key,
            model_id=config.model_id,
            system_prompt=None,
            temperature=config.temperature,
            max_concurrent=config.concurrent,
            timeout=config.timeout,
            logger=self.logger,
        )
        self.translate_agent = MDTranslateAgent(**agent_options)

    @staticmethod
    def _write_back(document: MarkdownDocument, pieces: list[str]) -> None:
        """Join the translated chunks, post-process them and store them on ``document``."""
        merged = join_markdown_texts(pieces)
        # Robustness pass.
        # NOTE(review): as written, each replace maps a sequence to itself —
        # confirm the intended source pattern.
        merged = merged.replace(r'\(', r'\(')
        merged = merged.replace(r'\)', r'\)')
        merged = clean_markdown_math_block(merged)
        document.content = merged.encode()

    def translate(self, document: MarkdownDocument) -> Self:
        """Translate ``document`` in place and return this translator."""
        self.logger.info("正在翻译markdown")
        with MDMaskUrisContext(document):
            source_text = document.content.decode()
            chunks: list[str] = split_markdown_text(source_text, self.chunk_size)
            self.logger.info(f"markdown分为{len(chunks)}块")
            translated: list[str] = self.translate_agent.send_prompts(chunks)
            self._write_back(document, translated)
        self.logger.info("翻译完成")
        return self

    async def translate_async(self, document: MarkdownDocument) -> Self:
        """Async variant of :meth:`translate`; post-processing runs in a worker thread."""
        self.logger.info("正在翻译markdown")
        with MDMaskUrisContext(document):
            source_text = document.content.decode()
            chunks: list[str] = split_markdown_text(source_text, self.chunk_size)
            self.logger.info(f"markdown分为{len(chunks)}块")
            translated: list[str] = await self.translate_agent.send_prompts_async(chunks)
            await asyncio.to_thread(self._write_back, document, translated)
        self.logger.info("翻译完成")
        return self
|
||||||
50
docutranslate/translater/txt_translator.py
Normal file
50
docutranslate/translater/txt_translator.py
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from docutranslate.agents.txt_agent import TXTTranslateAgent
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.logger import global_logger
|
||||||
|
from docutranslate.translater.base import AiTranslateConfig
|
||||||
|
from docutranslate.translater.interfaces import Translator
|
||||||
|
from docutranslate.utils.markdown_splitter import split_markdown_text
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TXTTranslateConfig(AiTranslateConfig):
    """Configuration for ``TXTTranslator``; adds no fields to ``AiTranslateConfig``."""
|
||||||
|
|
||||||
|
|
||||||
|
class TXTTranslator(Translator):
    """Translate the content of a plain-text document in place via ``TXTTranslateAgent``."""

    def __init__(self, config: TXTTranslateConfig):
        self.logger = config.logger or global_logger
        self.chunk_size = config.chunk_size
        agent_options = dict(
            custom_prompt=config.custom_prompt,
            to_lang=config.to_lang,
            baseurl=config.base_url,
            key=config.api_key,
            model_id=config.model_id,
            system_prompt=None,
            temperature=config.temperature,
            max_concurrent=config.concurrent,
            timeout=config.timeout,
            logger=self.logger,
        )
        self.translate_agent = TXTTranslateAgent(**agent_options)

    def _split(self, document: Document) -> list[str]:
        """Log the start of the run and split the decoded text into chunks."""
        self.logger.info("正在翻译txt")
        pieces: list[str] = split_markdown_text(document.content.decode(), max_block_size=self.chunk_size)
        self.logger.info(f"txt分为{len(pieces)}块")
        return pieces

    def _store(self, document: Document, translated: list[str]) -> None:
        """Join the translated chunks with newlines and store them on ``document``."""
        document.content = "\n".join(translated).encode()
        self.logger.info("翻译完成")

    def translate(self, document: Document) -> Self:
        """Translate ``document`` in place and return this translator."""
        pieces = self._split(document)
        translated: list[str] = self.translate_agent.send_prompts(pieces)
        self._store(document, translated)
        return self

    async def translate_async(self, document: Document) -> Self:
        """Async variant of :meth:`translate`."""
        pieces = self._split(document)
        translated: list[str] = await self.translate_agent.send_prompts_async(pieces)
        self._store(document, translated)
        return self
|
||||||
@@ -218,7 +218,7 @@ class MarkdownBlockSplitter:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def split_markdown_text(markdown_text, max_block_size=5000):
|
def split_markdown_text(markdown_text:str, max_block_size=5000):
|
||||||
"""
|
"""
|
||||||
将Markdown字符串分割成不超过max_block_size的块
|
将Markdown字符串分割成不超过max_block_size的块
|
||||||
可以通过简单拼接重建原始文本(分割的代码块除外)
|
可以通过简单拼接重建原始文本(分割的代码块除外)
|
||||||
|
|||||||
@@ -69,7 +69,7 @@ def uris2placeholder(markdown: str, mask_dict: MaskDict):
|
|||||||
return markdown
|
return markdown
|
||||||
|
|
||||||
|
|
||||||
def placeholder2_uris(markdown: str, mask_dict: MaskDict):
|
def placeholder2uris(markdown: str, mask_dict: MaskDict):
|
||||||
def placeholder2uri(match: re.Match):
|
def placeholder2uri(match: re.Match):
|
||||||
id = match.group(1)
|
id = match.group(1)
|
||||||
uri = mask_dict.get(id)
|
uri = mask_dict.get(id)
|
||||||
|
|||||||
Reference in New Issue
Block a user