This commit is contained in:
xunbu
2025-05-08 14:28:15 +08:00
commit 41bd1e8945
23 changed files with 2891 additions and 0 deletions

262
filetranslate/translater.py Normal file
View File

@@ -0,0 +1,262 @@
from pathlib import Path
from typing import Literal
import markdown2
from filetranslate.decorator.markdown_mask import MaskDict
from filetranslate.utils.agent_utils import Agent
from filetranslate.utils.convert import pdf2markdown_embed_images
from filetranslate.utils.markdown_splitter import split_markdown_text
from filetranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris
class FileTranslater:
def __init__(self, file_path: Path | str | None = None, chunksize: int = 4096, base_url="", key=None,
model_id="", temperature=0.7, max_concurrent=6):
if isinstance(file_path, str):
file_path = Path(file_path)
self.file_path: Path = file_path
self.file_path: Path = file_path
self._mask_dict = MaskDict()
self.markdown: str = ""
self.chunksize = chunksize
self.max_concurrent = max_concurrent
self.base_url: str = base_url
self.key: str = key if key is not None else "xx"
self.model_id: str = model_id
self.temperature = temperature
def _mask_uris_in_markdown(self):
self.markdown = uris2placeholder(self.markdown, self._mask_dict)
return self
def _unmask_uris_in_markdown(self):
self.markdown = placeholder2_uris(self.markdown, self._mask_dict)
return self
def _split_markdown_into_chunks(self) -> list[str]:
chunks: list[str] = split_markdown_text(self.markdown, self.chunksize)
print(f"markdown分为{len(chunks)}")
return chunks
def create_refine_agent(self, baseurl=None, key=None, model_id=None, temperature=None):
baseurl = self.base_url if baseurl is None else baseurl
key = self.key if key is None else key
model_id = self.model_id if model_id is None else model_id
temperature = self.temperature if temperature is None else temperature
agent = Agent(baseurl=baseurl,
key=key,
model_id=model_id,
temperature=temperature,
max_concurrent=self.max_concurrent)
agent.system_prompt = r"""# 角色
你是一个修正markdown文本的专家。
# 工作
找到markdown片段的不合理之处对于缺失的句子应该查看缺失的语句是否可能被错误的放在了其他位置并通过重组段落、去掉异常字词修复不合理之处。
尽量忠实于原文。形如<ph-abc123>的占位符不要改变。latex不要改变。
# 输出
修正后的markdown纯文本
# 示例
## 调整顺序
输入:
applications and scenarios becoming more and more extensive.
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its
输出:
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.
## 去掉异常字词
输入:
你@好,你叫什么\名#字
输出:
你好,你叫什么名字\no_think"""
return agent
def create_translate_agent(self, baseurl=None, key=None, model_id=None, temperature=None, to_lang="中文"):
baseurl = self.base_url if baseurl is None else baseurl
key = self.key if key is None else key
model_id = self.model_id if model_id is None else model_id
temperature = self.temperature if temperature is None else temperature
agent = Agent(baseurl=baseurl,
key=key,
model_id=model_id,
temperature=temperature,
max_concurrent=self.max_concurrent)
agent.system_prompt = r"""# 角色
你是一个翻译markdown文本的专家。
# 工作
将输入的markdown文本翻译成{0}
尽量忠实于原文。
形如<ph-abc123>的占位符不要改变。
latex不要改变。
# 输出
翻译后的markdown纯文本
# 示例
## 英文翻译为中文:
输入:
hello<ph-aaaaaa>, what's your name?
输出:
你好<ph-aaaaaa>,你叫什么名字?\no_think""".format(to_lang)
return agent
def read_pdf_as_markdown(self, pdf: Path | None = None, formula=False, code=False, save=False):
print("正在将pdf转换为markdown")
if pdf is None:
pdf = self.file_path
self.markdown = pdf2markdown_embed_images(pdf, formula, code)
print("pdf已转换")
if save:
self.save_as_markdown(filename=f"{pdf.stem}.md")
return self
def read_markdown(self, markdown_path: Path | str):
if isinstance(markdown_path, str):
markdown_path = Path(markdown_path)
self.file_path = markdown_path
with open(markdown_path, "r") as f:
self.markdown = f.read()
return self
def refine_markdown(self, refine_agent: Agent | None = None) -> str:
if refine_agent is None:
refine_agent = self.create_refine_agent(self.base_url, self.key, self.model_id, self.temperature)
chuncks = self._split_markdown_into_chunks()
result: list[str] = refine_agent.send_prompts(chuncks, timeout=10000)
self.markdown = "".join(result)
print("markdown已修正")
return self.markdown
def translate_markdown(self, translate_agent: Agent | None = None):
print("正在翻译markdown")
if translate_agent is None:
translate_agent = self.create_translate_agent()
chuncks = self._split_markdown_into_chunks()
result: list[str] = translate_agent.send_prompts(chuncks, timeout=10000)
self.markdown = "".join(result)
print("翻译完成")
return self.markdown
def save_as_markdown(self, filename: str | Path = "output.md", output_dir: str | Path = "./output"):
if isinstance(filename, str):
filename = Path(filename)
if isinstance(output_dir, str):
output_dir = Path(output_dir)
# 确保输出目录存在
output_dir.mkdir(parents=True, exist_ok=True)
full_name = output_dir / filename
with open(full_name, "w") as file:
file.write(self.markdown)
print(f"文件已写入{full_name}")
return self
def save_as_html(self, filename: str | Path = "output.html", output_dir: str | Path = "./output"):
if isinstance(filename, str):
filename = Path(filename)
if isinstance(output_dir, str):
output_dir = Path(output_dir)
# 确保输出目录存在
output_dir.mkdir(parents=True, exist_ok=True)
full_name = output_dir / filename
markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid'])
html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{filename}</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@picocss/pico@latest/css/pico.min.css">
<style>
html {{
padding:2vh 10vw;
font-size: 15px;
}}
</style>
</head>
<body>
{markdowner.convert(self.markdown)}
</body>
<script src="https://cdn.jsdelivr.net/npm/mathjax@3.2.2/es5/tex-mml-chtml.min.js"></script>
<script type="module" defer>
import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@9/dist/mermaid.esm.min.mjs';
mermaid.initialize({{
securityLevel: 'loose',
startOnLoad: true
}});
let observer = new MutationObserver(mutations => {{
for(let mutation of mutations) {{
mutation.target.style.visibility = "visible";
}}
}});
document.querySelectorAll("pre.mermaid-pre div.mermaid").forEach(item => {{
observer.observe(item, {{
attributes: true,
attributeFilter: ['data-processed']
}});
}});
</script>
</html>
"""
with open(full_name, "w") as file:
file.write(html)
print(f"文件已写入{full_name}")
return self
def translate_pdf_file(self, pdf_path: Path | str | None = None, to_lang="中文", output_dir="./output",
formula=False,
code=False, output_format: Literal["markdown", "html"] = "markdown", refine=True,
refine_agent: Agent | None = None, translate_agent: Agent | None = None):
assert output_format in ("markdown", "html"), "output_format格式错误"
if pdf_path is None:
assert self.file_path is not None, "未输入文件路径"
pdf_path = self.file_path
if isinstance(pdf_path, str):
pdf_path = Path(pdf_path)
self.read_pdf_as_markdown(pdf_path, formula=formula, code=code)
self._mask_uris_in_markdown()
if refine:
if refine_agent is None:
refine_agent = self.create_refine_agent()
self.refine_markdown(refine_agent)
if translate_agent is None:
translate_agent = self.create_translate_agent(to_lang=to_lang)
self.translate_markdown(translate_agent)
self._unmask_uris_in_markdown()
if output_format == "markdown":
filename = f"{pdf_path.stem}_{to_lang}.md"
self.save_as_markdown(filename=filename, output_dir=output_dir)
elif output_format == "html":
filename = f"{pdf_path.stem}_{to_lang}.html"
self.save_as_html(filename=filename, output_dir=output_dir)
return self
def translate_markdown_file(self, markdown_path: Path | str | None = None, to_lang="中文", output_dir="./output",
output_format: Literal["markdown", "html"] = "markdown",
refine=False, refine_agent: Agent | None = None, translate_agent: Agent | None = None):
assert output_format in ("markdown", "html"), "output_format格式错误"
if markdown_path is None:
assert self.file_path is not None, "未输入文件路径"
markdown_path = self.file_path
elif isinstance(markdown_path, str):
markdown_path = Path(markdown_path)
with open(markdown_path, "r") as f:
self.markdown = f.read()
self._mask_uris_in_markdown()
if refine:
if refine_agent is None:
refine_agent = self.create_refine_agent()
self.refine_markdown(refine_agent)
if translate_agent is None:
translate_agent = self.create_translate_agent(to_lang=to_lang)
self.translate_markdown(translate_agent)
self._unmask_uris_in_markdown()
if output_format == "markdown":
filename = f"{markdown_path.stem}_{to_lang}.md"
self.save_as_markdown(filename=filename, output_dir=output_dir)
elif output_format == "html":
filename = f"{markdown_path.stem}_{to_lang}.html"
self.save_as_html(filename=filename, output_dir=output_dir)
return self
if __name__ == '__main__':
pass