This commit is contained in:
xunbu
2025-05-08 14:28:15 +08:00
commit 41bd1e8945
23 changed files with 2891 additions and 0 deletions

View File

View File

View File

@@ -0,0 +1,32 @@
from functools import wraps
from typing import Concatenate, ParamSpec, Callable
import re
from filetranslate.utils.markdown_utils import MaskDict
P = ParamSpec("P")


def mask_uris_temp(func: Callable[Concatenate[str, P], str]) -> Callable[Concatenate[str, P], str]:
    """Decorate a markdown-processing function so links/images are hidden from it.

    Before ``func`` runs, every ``[text](target)`` / ``![alt](target)`` span in
    its first argument is replaced by a ``<ph-ID>`` placeholder. After ``func``
    returns, placeholders found in the result are swapped back for the text
    they replaced. A fresh ``MaskDict`` is used for each call; placeholders
    with an unknown id are left untouched.
    """
    link_regex = r'!?\[.*?\]\(.*?\)'
    placeholder_regex = r"<ph-([a-zA-Z0-9]+)>"

    @wraps(func)
    def wrapper(markdown: str, *args: P.args, **kwargs: P.kwargs) -> str:
        registry = MaskDict()

        def _mask(found: re.Match):
            # Remember the matched span under a new id and emit its placeholder.
            token = registry.create_id()
            registry.set(token, found.group())
            return f"<ph-{token}>"

        def _unmask(found: re.Match):
            # Unknown ids are returned verbatim.
            original = registry.get(found.group(1))
            return found.group() if original is None else original

        masked = re.sub(link_regex, _mask, markdown)
        processed = func(masked, *args, **kwargs)
        return re.sub(placeholder_regex, _unmask, processed)

    return wrapper
# No-op entry point: running this module directly does nothing.
if __name__ == '__main__':
    pass

View File

262
filetranslate/translater.py Normal file
View File

@@ -0,0 +1,262 @@
from pathlib import Path
from typing import Literal
import markdown2
from filetranslate.decorator.markdown_mask import MaskDict
from filetranslate.utils.agent_utils import Agent
from filetranslate.utils.convert import pdf2markdown_embed_images
from filetranslate.utils.markdown_splitter import split_markdown_text
from filetranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris
class FileTranslater:
    """Load a PDF or markdown file, refine/translate it with an LLM agent and save it.

    The text being worked on is kept in ``self.markdown``. Markdown links and
    images are replaced by ``<ph-ID>`` placeholders (tracked in
    ``self._mask_dict``) before the text is sent to an agent and restored
    afterwards. Most methods return ``self`` so calls can be chained.
    """

    def __init__(self, file_path: Path | str | None = None, chunksize: int = 4096, base_url="", key=None,
                 model_id="", temperature=0.7, max_concurrent=6):
        """Store the default input file and the agent connection settings.

        Args:
            file_path: Default input file; a ``str`` is converted to ``Path``.
            chunksize: Maximum chunk size passed to ``split_markdown_text``.
            base_url: Base URL handed to the agents created by this object.
            key: API key; the literal ``"xx"`` is used when ``None``.
            model_id: Model identifier handed to the agents.
            temperature: Sampling temperature handed to the agents.
            max_concurrent: Concurrency limit handed to the agents.
        """
        if isinstance(file_path, str):
            file_path = Path(file_path)
        self.file_path: Path | None = file_path
        self._mask_dict = MaskDict()
        self.markdown: str = ""
        self.chunksize = chunksize
        self.max_concurrent = max_concurrent
        self.base_url: str = base_url
        self.key: str = key if key is not None else "xx"
        self.model_id: str = model_id
        self.temperature = temperature

    def _mask_uris_in_markdown(self):
        """Replace links/images in ``self.markdown`` with placeholders; return ``self``."""
        self.markdown = uris2placeholder(self.markdown, self._mask_dict)
        return self

    def _unmask_uris_in_markdown(self):
        """Restore the placeholders in ``self.markdown`` to their original text; return ``self``."""
        self.markdown = placeholder2_uris(self.markdown, self._mask_dict)
        return self

    def _split_markdown_into_chunks(self) -> list[str]:
        """Split ``self.markdown`` into chunks using ``self.chunksize`` and report the count."""
        chunks: list[str] = split_markdown_text(self.markdown, self.chunksize)
        print(f"markdown分为{len(chunks)}")
        return chunks

    def _build_agent(self, baseurl, key, model_id, temperature, system_prompt: str) -> Agent:
        """Create an ``Agent``; every ``None`` setting falls back to this object's own value."""
        agent = Agent(baseurl=self.base_url if baseurl is None else baseurl,
                      key=self.key if key is None else key,
                      model_id=self.model_id if model_id is None else model_id,
                      temperature=self.temperature if temperature is None else temperature,
                      max_concurrent=self.max_concurrent)
        agent.system_prompt = system_prompt
        return agent

    def _process_chunks(self, agent: Agent) -> str:
        """Send every chunk of ``self.markdown`` to ``agent`` and reassemble the answers."""
        chunks = self._split_markdown_into_chunks()
        results: list[str] = agent.send_prompts(chunks, timeout=10000)
        # The splitter cuts between blocks that were separated by blank lines, so the
        # answers are re-joined with a blank line. Joining with "" glued the end of one
        # chunk onto the first line of the next.
        return "\n\n".join(results)

    def create_refine_agent(self, baseurl=None, key=None, model_id=None, temperature=None):
        """Return an ``Agent`` whose system prompt asks it to repair broken markdown.

        Each argument left as ``None`` falls back to the value stored on this object.
        """
        system_prompt = r"""# 角色
你是一个修正markdown文本的专家。
# 工作
找到markdown片段的不合理之处对于缺失的句子应该查看缺失的语句是否可能被错误的放在了其他位置并通过重组段落、去掉异常字词修复不合理之处。
尽量忠实于原文。形如<ph-abc123>的占位符不要改变。latex不要改变。
# 输出
修正后的markdown纯文本
# 示例
## 调整顺序
输入:
applications and scenarios becoming more and more extensive.
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its
输出:
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.
## 去掉异常字词
输入:
你@好,你叫什么\名#字
输出:
你好,你叫什么名字\no_think"""
        return self._build_agent(baseurl, key, model_id, temperature, system_prompt)

    def create_translate_agent(self, baseurl=None, key=None, model_id=None, temperature=None, to_lang="中文"):
        """Return an ``Agent`` whose system prompt asks it to translate markdown into ``to_lang``.

        Each connection argument left as ``None`` falls back to the value stored on this object.
        """
        system_prompt = r"""# 角色
你是一个翻译markdown文本的专家。
# 工作
将输入的markdown文本翻译成{0}
尽量忠实于原文。
形如<ph-abc123>的占位符不要改变。
latex不要改变。
# 输出
翻译后的markdown纯文本
# 示例
## 英文翻译为中文:
输入:
hello<ph-aaaaaa>, what's your name?
输出:
你好<ph-aaaaaa>,你叫什么名字?\no_think""".format(to_lang)
        return self._build_agent(baseurl, key, model_id, temperature, system_prompt)

    def read_pdf_as_markdown(self, pdf: Path | str | None = None, formula=False, code=False, save=False):
        """Convert a PDF to markdown and store it in ``self.markdown``.

        Args:
            pdf: PDF to convert; ``self.file_path`` is used when ``None``.
            formula: Forwarded to ``pdf2markdown_embed_images``.
            code: Forwarded to ``pdf2markdown_embed_images``.
            save: When true, also write the markdown to ``<pdf stem>.md`` via
                ``save_as_markdown`` (default output directory).

        Returns:
            ``self``.
        """
        print("正在将pdf转换为markdown")
        if pdf is None:
            pdf = self.file_path
        elif isinstance(pdf, str):
            # ``pdf.stem`` is needed below when ``save`` is set, so normalise to Path.
            pdf = Path(pdf)
        self.markdown = pdf2markdown_embed_images(pdf, formula, code)
        print("pdf已转换")
        if save:
            self.save_as_markdown(filename=f"{pdf.stem}.md")
        return self

    def read_markdown(self, markdown_path: Path | str):
        """Read a markdown file (UTF-8) into ``self.markdown`` and remember its path; return ``self``."""
        if isinstance(markdown_path, str):
            markdown_path = Path(markdown_path)
        self.file_path = markdown_path
        with open(markdown_path, "r", encoding="utf-8") as f:
            self.markdown = f.read()
        return self

    def refine_markdown(self, refine_agent: Agent | None = None) -> str:
        """Run ``self.markdown`` through the refine agent chunk by chunk.

        A default refine agent is created when none is given. The refined text
        replaces ``self.markdown`` and is also returned.
        """
        if refine_agent is None:
            refine_agent = self.create_refine_agent()
        self.markdown = self._process_chunks(refine_agent)
        print("markdown已修正")
        return self.markdown

    def translate_markdown(self, translate_agent: Agent | None = None):
        """Run ``self.markdown`` through the translate agent chunk by chunk.

        A default translate agent is created when none is given. The translated
        text replaces ``self.markdown`` and is also returned.
        """
        print("正在翻译markdown")
        if translate_agent is None:
            translate_agent = self.create_translate_agent()
        self.markdown = self._process_chunks(translate_agent)
        print("翻译完成")
        return self.markdown

    @staticmethod
    def _prepare_output_path(filename: str | Path, output_dir: str | Path) -> Path:
        """Create ``output_dir`` if needed and return ``output_dir / filename``."""
        output_dir = Path(output_dir)
        # Make sure the output directory exists.
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir / Path(filename)

    def save_as_markdown(self, filename: str | Path = "output.md", output_dir: str | Path = "./output"):
        """Write ``self.markdown`` to ``output_dir / filename`` as UTF-8; return ``self``."""
        full_name = self._prepare_output_path(filename, output_dir)
        # Explicit UTF-8: the locale default cannot be relied on for non-ASCII text.
        with open(full_name, "w", encoding="utf-8") as file:
            file.write(self.markdown)
        print(f"文件已写入{full_name}")
        return self

    def save_as_html(self, filename: str | Path = "output.html", output_dir: str | Path = "./output"):
        """Render ``self.markdown`` with markdown2 and write a standalone HTML page.

        The page is written to ``output_dir / filename``. It declares
        ``charset="UTF-8"``, so the file is written as UTF-8 to match.

        Returns:
            ``self``.
        """
        filename = Path(filename)
        full_name = self._prepare_output_path(filename, output_dir)
        markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid'])
        html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{filename}</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@picocss/pico@latest/css/pico.min.css">
<style>
html {{
padding:2vh 10vw;
font-size: 15px;
}}
</style>
</head>
<body>
{markdowner.convert(self.markdown)}
</body>
<script src="https://cdn.jsdelivr.net/npm/mathjax@3.2.2/es5/tex-mml-chtml.min.js"></script>
<script type="module" defer>
import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@9/dist/mermaid.esm.min.mjs';
mermaid.initialize({{
securityLevel: 'loose',
startOnLoad: true
}});
let observer = new MutationObserver(mutations => {{
for(let mutation of mutations) {{
mutation.target.style.visibility = "visible";
}}
}});
document.querySelectorAll("pre.mermaid-pre div.mermaid").forEach(item => {{
observer.observe(item, {{
attributes: true,
attributeFilter: ['data-processed']
}});
}});
</script>
</html>
"""
        with open(full_name, "w", encoding="utf-8") as file:
            file.write(html)
        print(f"文件已写入{full_name}")
        return self

    def _translate_and_save(self, stem: str, to_lang, output_dir, output_format,
                            refine, refine_agent: Agent | None, translate_agent: Agent | None):
        """Mask links, optionally refine, translate, unmask and save ``self.markdown``.

        The output file is named ``<stem>_<to_lang>.md`` or ``.html`` depending
        on ``output_format``. Returns ``self``.
        """
        self._mask_uris_in_markdown()
        if refine:
            if refine_agent is None:
                refine_agent = self.create_refine_agent()
            self.refine_markdown(refine_agent)
        if translate_agent is None:
            translate_agent = self.create_translate_agent(to_lang=to_lang)
        self.translate_markdown(translate_agent)
        self._unmask_uris_in_markdown()
        if output_format == "markdown":
            self.save_as_markdown(filename=f"{stem}_{to_lang}.md", output_dir=output_dir)
        elif output_format == "html":
            self.save_as_html(filename=f"{stem}_{to_lang}.html", output_dir=output_dir)
        return self

    def translate_pdf_file(self, pdf_path: Path | str | None = None, to_lang="中文", output_dir="./output",
                           formula=False,
                           code=False, output_format: Literal["markdown", "html"] = "markdown", refine=True,
                           refine_agent: Agent | None = None, translate_agent: Agent | None = None):
        """Convert a PDF to markdown, translate it and save the result.

        Args:
            pdf_path: PDF to translate; ``self.file_path`` is used when ``None``.
            to_lang: Target language, inserted into the translate prompt and the file name.
            output_dir: Directory the result is written to.
            formula: Forwarded to the PDF converter.
            code: Forwarded to the PDF converter.
            output_format: ``"markdown"`` or ``"html"``.
            refine: Whether to run the refine agent before translating.
            refine_agent: Agent used for refining; a default one is created when ``None``.
            translate_agent: Agent used for translating; a default one is created when ``None``.

        Returns:
            ``self``.

        Raises:
            AssertionError: If ``output_format`` is invalid or no path is available.
        """
        assert output_format in ("markdown", "html"), "output_format格式错误"
        if pdf_path is None:
            assert self.file_path is not None, "未输入文件路径"
            pdf_path = self.file_path
        if isinstance(pdf_path, str):
            pdf_path = Path(pdf_path)
        self.read_pdf_as_markdown(pdf_path, formula=formula, code=code)
        return self._translate_and_save(pdf_path.stem, to_lang, output_dir, output_format,
                                        refine, refine_agent, translate_agent)

    def translate_markdown_file(self, markdown_path: Path | str | None = None, to_lang="中文", output_dir="./output",
                                output_format: Literal["markdown", "html"] = "markdown",
                                refine=False, refine_agent: Agent | None = None, translate_agent: Agent | None = None):
        """Read a markdown file (UTF-8), translate it and save the result.

        Args:
            markdown_path: File to translate; ``self.file_path`` is used when ``None``.
            to_lang: Target language, inserted into the translate prompt and the file name.
            output_dir: Directory the result is written to.
            output_format: ``"markdown"`` or ``"html"``.
            refine: Whether to run the refine agent before translating.
            refine_agent: Agent used for refining; a default one is created when ``None``.
            translate_agent: Agent used for translating; a default one is created when ``None``.

        Returns:
            ``self``.

        Raises:
            AssertionError: If ``output_format`` is invalid or no path is available.
        """
        assert output_format in ("markdown", "html"), "output_format格式错误"
        if markdown_path is None:
            assert self.file_path is not None, "未输入文件路径"
            markdown_path = self.file_path
        elif isinstance(markdown_path, str):
            markdown_path = Path(markdown_path)
        with open(markdown_path, "r", encoding="utf-8") as f:
            self.markdown = f.read()
        return self._translate_and_save(markdown_path.stem, to_lang, output_dir, output_format,
                                        refine, refine_agent, translate_agent)
# No-op entry point: running this module directly does nothing.
if __name__ == '__main__':
    pass

View File

View File

@@ -0,0 +1,109 @@
import asyncio
import httpx
class Agent:
def __init__(self, baseurl="", key="", model_id="", system_prompt="", temperature=0.7, max_concurrent=5):
self.baseurl = baseurl
self.key = key
self.model_id = model_id
self.system_prompt = system_prompt
self.temperature = temperature
# self.client=httpx.Client()
self.client_async = httpx.AsyncClient()
self.max_concurrent = max_concurrent
def _prepare_request_data(self, prompt, system_prompt, temperature=None, top_p=0.9):
if temperature is None:
temperature = self.temperature
headers = {"Content-Type": "application/json",
"Authorization": f"Bearer {self.key}"}
data = {
"model": self.model_id,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt}
],
"temperature": temperature,
"top_p": top_p
}
return headers, data
# def send_prompt(self,prompt,system_prompt=None,timeout=50):
# if system_prompt is None:
# system_prompt=self.system_prompt
# headers,data=self._prepare_request_data(prompt,system_prompt)
# response=self.client.post(f"{self.baseurl}/chat/completions",json=data,headers=headers,timeout=timeout)
# response.raise_for_status()
# return response.json()["choices"][0]["message"]["content"].lstrip()
async def send_async(self, prompt: str, system_prompt: None | str = None, timeout: int = 200) -> str:
if system_prompt is None:
system_prompt = self.system_prompt
"""Sends a single prompt asynchronously."""
headers, data = self._prepare_request_data(prompt, system_prompt)
try:
response = await self.client_async.post(
f"{self.baseurl}/chat/completions",
json=data,
headers=headers,
timeout=timeout
)
response.raise_for_status()
result: str = response.json()["choices"][0]["message"]["content"]
return result.lstrip()
except httpx.HTTPStatusError as e:
raise Exception(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}") from e
except httpx.RequestError as e:
raise Exception(f"AI请求连接错误 (async): {e}") from e
except (KeyError, IndexError) as e:
raise Exception(f"AI响应格式错误 (async): {e}") from e
async def send_prompts_async(
self,
prompts: list[str],
system_prompt: str | None = None,
timeout: int = 50,
max_concurrent: int = 5 # 新增参数默认并发数为5
) -> list[str]:
total = len(prompts)
count = 0
"""
Sends multiple prompts asynchronously, limiting concurrent requests.
"""
semaphore = asyncio.Semaphore(max_concurrent)
tasks = []
# 辅助协程,用于包装 self.send_async 并使用信号量
async def send_with_semaphore(p_text: str):
async with semaphore: # 在进入代码块前获取信号量,退出时释放
result = await self.send_async(
prompt=p_text,
system_prompt=system_prompt,
timeout=timeout
)
nonlocal count
count += 1
print(f"进行到{count}/{total}")
return result
for p_text in prompts:
task = asyncio.create_task(send_with_semaphore(p_text))
tasks.append(task)
results = await asyncio.gather(*tasks, return_exceptions=False)
return results
def send_prompts(
self,
prompts: list[str],
system_prompt: str | None = None,
timeout: int = 50,
) -> list[str]:
result = asyncio.run(self.send_prompts_async(prompts, system_prompt, timeout, self.max_concurrent))
return result
# No-op entry point: running this module directly does nothing.
if __name__ == '__main__':
    pass

View File

@@ -0,0 +1,25 @@
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.types.doc import ImageRefMode
from pathlib import Path
from docling.document_converter import DocumentConverter, PdfFormatOption
IMAGE_RESOLUTION_SCALE = 4


def pdf2markdown_embed_images(pdf: Path | str, formula=False, code=False) -> str:
    """Convert a PDF to markdown with docling, exporting images in EMBEDDED mode.

    Args:
        pdf: Path of the PDF handed to ``DocumentConverter.convert``.
        formula: When truthy, switch on ``do_formula_enrichment``.
        code: When truthy, switch on ``do_code_enrichment``.

    Returns:
        The markdown produced by ``export_to_markdown``.
    """
    options = PdfPipelineOptions()
    options.images_scale = IMAGE_RESOLUTION_SCALE
    options.generate_picture_images = True
    # The enrichment flags are only ever switched on here; when the argument is
    # falsy the pipeline's own default is left untouched.
    if formula:
        options.do_formula_enrichment = True
    if code:
        options.do_code_enrichment = True

    pdf_format = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})
    document = converter.convert(pdf).document
    return document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
# No-op entry point: running this module directly does nothing.
if __name__ == '__main__':
    pass

View File

@@ -0,0 +1,200 @@
import re
from typing import List
class MarkdownBlockSplitter:
    """Split markdown text into chunks of roughly ``max_block_size`` characters.

    Text is first cut into logical blocks (fenced code blocks and
    blank-line-separated paragraphs). Small blocks are merged up to the limit
    and oversized blocks are split further. A single sentence or line longer
    than the limit is kept whole, so such a chunk can exceed the limit.
    """

    def __init__(self, max_block_size: int = 4096):
        """
        Initialise the splitter.

        Args:
            max_block_size: Maximum size of each chunk, in characters.
        """
        self.max_block_size = max_block_size

    def split_markdown(self, markdown_text: str) -> List[str]:
        """
        Split markdown text into chunks bounded by ``max_block_size``.

        Args:
            markdown_text: Input markdown text.

        Returns:
            List of markdown chunks; empty for empty or whitespace-only input.
        """
        blocks = self._split_into_logical_blocks(markdown_text)
        result_blocks: List[str] = []
        current_block = ""
        for block in blocks:
            if len(block) > self.max_block_size:
                # Flush what has been accumulated, then split the oversized block on its own.
                if current_block:
                    result_blocks.append(current_block)
                    current_block = ""
                result_blocks.extend(self._split_large_block(block))
                continue
            # +2 accounts for the blank line inserted between merged blocks.
            if current_block and len(current_block) + len(block) + 2 > self.max_block_size:
                result_blocks.append(current_block)
                current_block = block
            elif current_block:
                current_block += "\n\n" + block
            else:
                current_block = block
        if current_block:
            result_blocks.append(current_block)
        return result_blocks

    def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
        """
        Split markdown text into logical blocks (code fences, paragraphs, ...).

        Args:
            markdown_text: Input markdown text.

        Returns:
            Fenced code blocks verbatim; every other block stripped of
            surrounding whitespace, with empty ones dropped.
        """
        # Normalise Windows line endings.
        markdown_text = markdown_text.replace('\r\n', '\n')
        # Fenced code blocks delimited by ``` or ~~~.
        code_block_pattern = r'(```[\s\S]*?```|~~~[\s\S]*?~~~)'
        # The pattern has one capturing group, so odd indices are the code blocks.
        parts = re.split(code_block_pattern, markdown_text)
        blocks: List[str] = []
        for i, part in enumerate(parts):
            if i % 2 == 1:
                blocks.append(part)
            else:
                # Non-code text is split on blank lines.
                part_blocks = re.split(r'\n\s*\n', part)
                blocks.extend([b.strip() for b in part_blocks if b.strip()])
        return blocks

    def _split_large_block(self, block: str) -> List[str]:
        """
        Split a block that is larger than ``max_block_size``.

        Fenced code blocks are split by line, with each part re-fenced. Other
        blocks are split after sentence punctuation (``.``, ``!``, ``?``) when
        they contain any, otherwise by line.

        Args:
            block: A large markdown block.

        Returns:
            List of smaller blocks.
        """
        if block.startswith(('```', '~~~')):
            return self._split_code_block(block)
        if '.' in block or '!' in block or '?' in block:
            # Capture the whitespace so the original separators (e.g. the newlines
            # of a list or table) can be restored instead of collapsing to a space.
            pieces = re.split(r'(?<=[.!?])(\s+)', block)
        else:
            pieces = re.split(r'(\n)', block)
        return self._pack_pieces(pieces)

    def _pack_pieces(self, pieces: List[str]) -> List[str]:
        """Greedily pack ``[text, sep, text, sep, ..., text]`` into parts within the limit.

        Inside a part the original separators are kept; the separator at a
        part boundary is dropped.
        """
        texts = pieces[0::2]
        # Separator that precedes each text (none before the first one).
        seps = [""] + pieces[1::2]
        parts: List[str] = []
        current = ""
        for sep, text in zip(seps, texts):
            if current and len(current) + len(sep) + len(text) > self.max_block_size:
                parts.append(current)
                current = text
            elif current:
                current += sep + text
            else:
                current = text
        if current:
            parts.append(current)
        return parts

    def _split_code_block(self, block: str) -> List[str]:
        """Split a fenced code block by line, re-opening and closing the fence on every part."""
        fence_marker = '```' if block.startswith('```') else '~~~'
        first_line_end = block.find('\n')
        if first_line_end == -1:
            # A one-line fence has no line boundary to split on.
            return [block]
        first_line = block[:first_line_end]
        # Language specifier after the opening fence, if any.
        language_spec = first_line[3:].strip()
        # Drop only surrounding newlines/trailing whitespace; leading spaces of the
        # first code line are indentation and must survive.
        code_content = block[first_line_end + 1:-3].rstrip().lstrip('\n')
        reopened_fence = f"{fence_marker}{language_spec}"

        result: List[str] = []
        current_part = [first_line]
        current_size = len(first_line) + 1  # +1 for the newline
        for line in code_content.split('\n'):
            line_size = len(line) + 1  # +1 for the newline
            # +3 leaves room for the closing fence. Only flush when the part already
            # holds code, otherwise an empty fenced block would be emitted.
            if len(current_part) > 1 and current_size + line_size + 3 > self.max_block_size:
                current_part.append(fence_marker)
                result.append('\n'.join(current_part))
                current_part = [reopened_fence]
                current_size = len(reopened_fence) + 1
            current_part.append(line)
            current_size += line_size
        current_part.append(fence_marker)
        result.append('\n'.join(current_part))
        return result
def split_markdown_text(markdown_text, max_block_size=4096):
    """
    Split a markdown string into chunks using ``MarkdownBlockSplitter``.

    Args:
        markdown_text: Input markdown text.
        max_block_size: Target maximum number of characters per chunk.

    Returns:
        List of markdown chunks.
    """
    return MarkdownBlockSplitter(max_block_size=max_block_size).split_markdown(markdown_text)

View File

@@ -0,0 +1,61 @@
import re
import threading
import uuid
class MaskDict:
    """Lock-protected mapping from short placeholder ids to the text they stand for."""

    def __init__(self):
        self._dict = {}
        # Ids handed out by ``create_id``, whether or not a value was stored yet.
        self._issued = set()
        self._lock = threading.Lock()

    def create_id(self):
        """Return a new 6-character hex id that has not been issued or stored before.

        The id is reserved before the lock is released, so two callers can
        never receive the same id even if neither has called ``set`` yet.
        Reserved ids are not reissued after ``delete``.
        """
        with self._lock:
            while True:
                # Random rather than time-derived: the leading hex digits of a
                # uuid1 change slowly, so back-to-back calls produced the same prefix.
                new_id = uuid.uuid4().hex[:6]
                if new_id not in self._dict and new_id not in self._issued:
                    self._issued.add(new_id)
                    return new_id

    def get(self, key):
        """Return the value stored under ``key``, or ``None`` when there is none."""
        with self._lock:
            return self._dict.get(key)

    def set(self, key, value):
        """Store ``value`` under ``key``, replacing any previous value."""
        with self._lock:
            self._dict[key] = value

    def delete(self, key):
        """Remove ``key`` if present; do nothing otherwise."""
        with self._lock:
            self._dict.pop(key, None)

    def __contains__(self, item):
        """Return whether a value is stored under ``item``."""
        with self._lock:
            return item in self._dict
def uris2placeholder(markdown: str, mask_dict: MaskDict):
    """Replace each ``[text](target)`` / ``![alt](target)`` span with a ``<ph-ID>`` placeholder.

    The replaced text is stored in ``mask_dict`` under the id used in its
    placeholder. Returns the masked markdown.
    """
    def _store(found: re.Match):
        token = mask_dict.create_id()
        mask_dict.set(token, found.group())
        return f"<ph-{token}>"

    return re.sub(r'!?\[.*?\]\(.*?\)', _store, markdown)
def placeholder2_uris(markdown: str, mask_dict: MaskDict):
    """Replace each ``<ph-ID>`` placeholder with the text stored for ``ID`` in ``mask_dict``.

    Placeholders whose id has no stored value are left unchanged. Returns the
    restored markdown.
    """
    def _restore(found: re.Match):
        original = mask_dict.get(found.group(1))
        return found.group() if original is None else original

    return re.sub(r"<ph-([a-zA-Z0-9]+)>", _restore, markdown)
# No-op entry point: running this module directly does nothing.
if __name__ == '__main__':
    pass