v0.0.1
This commit is contained in:
0
filetranslate/__init__.py
Normal file
0
filetranslate/__init__.py
Normal file
0
filetranslate/decorator/__init__.py
Normal file
0
filetranslate/decorator/__init__.py
Normal file
32
filetranslate/decorator/markdown_mask.py
Normal file
32
filetranslate/decorator/markdown_mask.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from functools import wraps
|
||||
from typing import Concatenate, ParamSpec, Callable
|
||||
import re
|
||||
|
||||
from filetranslate.utils.markdown_utils import MaskDict
|
||||
|
||||
# Parameter specification for the extra positional/keyword arguments (everything after
# the leading markdown string) of the functions wrapped by ``mask_uris_temp``.
P=ParamSpec("P")
|
||||
def mask_uris_temp(func: Callable[Concatenate[str, P], str]) -> Callable[Concatenate[str, P], str]:
    """Decorator that hides markdown links and images from ``func``.

    Before ``func`` is called, every ``[text](target)`` / ``![alt](target)`` span in
    its first argument is replaced by a ``<ph-ID>`` placeholder. After ``func``
    returns, placeholders found in the result are replaced by the text they stood
    for. Placeholders whose ID is unknown are left untouched. A fresh ``MaskDict``
    is used for every call of the wrapped function.
    """

    @wraps(func)
    def wrapper(markdown: str, *args: P.args, **kwargs: P.kwargs) -> str:
        masks = MaskDict()

        def hide(found: re.Match):
            # Remember the matched link/image text under a new ID.
            token = masks.create_id()
            masks.set(token, found.group())
            return f"<ph-{token}>"

        def reveal(found: re.Match):
            original = masks.get(found.group(1))
            # Unknown IDs keep their placeholder text as-is.
            return found.group() if original is None else original

        masked = re.sub(r'!?\[.*?\]\(.*?\)', hide, markdown)
        processed = func(masked, *args, **kwargs)
        return re.sub(r"<ph-([a-zA-Z0-9]+)>", reveal, processed)

    return wrapper
|
||||
# Running this module directly does nothing.
if __name__ == '__main__':
    pass
|
||||
0
filetranslate/decorator/time.py
Normal file
0
filetranslate/decorator/time.py
Normal file
262
filetranslate/translater.py
Normal file
262
filetranslate/translater.py
Normal file
@@ -0,0 +1,262 @@
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
import markdown2
|
||||
|
||||
from filetranslate.decorator.markdown_mask import MaskDict
|
||||
from filetranslate.utils.agent_utils import Agent
|
||||
from filetranslate.utils.convert import pdf2markdown_embed_images
|
||||
from filetranslate.utils.markdown_splitter import split_markdown_text
|
||||
from filetranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris
|
||||
|
||||
|
||||
class FileTranslater:
    """Pipeline that turns a PDF or markdown file into translated markdown or HTML.

    The working text lives in ``self.markdown``. Link/image spans are swapped for
    ``<ph-ID>`` placeholders before the text is sent to the agents and swapped
    back afterwards; the mapping is kept in ``self._mask_dict``.

    All files are read and written as UTF-8.
    """

    def __init__(self, file_path: Path | str | None = None, chunksize: int = 4096, base_url="", key=None,
                 model_id="", temperature=0.7, max_concurrent=6):
        """Store the default input file and the agent connection settings.

        Args:
            file_path: Default input file used when a method gets no explicit path.
            chunksize: Maximum chunk size passed to ``split_markdown_text``.
            base_url: Default base URL handed to ``Agent``.
            key: Default API key handed to ``Agent``; ``"xx"`` is used when None.
            model_id: Default model identifier handed to ``Agent``.
            temperature: Default sampling temperature handed to ``Agent``.
            max_concurrent: Concurrency limit handed to ``Agent``.
        """
        if isinstance(file_path, str):
            file_path = Path(file_path)
        self.file_path: Path | None = file_path
        self._mask_dict = MaskDict()
        self.markdown: str = ""
        self.chunksize = chunksize
        self.max_concurrent = max_concurrent
        self.base_url: str = base_url
        self.key: str = key if key is not None else "xx"
        self.model_id: str = model_id
        self.temperature = temperature

    def _mask_uris_in_markdown(self):
        """Replace link/image spans in ``self.markdown`` with placeholders; return self."""
        self.markdown = uris2placeholder(self.markdown, self._mask_dict)
        return self

    def _unmask_uris_in_markdown(self):
        """Replace known placeholders in ``self.markdown`` with their original text; return self."""
        self.markdown = placeholder2_uris(self.markdown, self._mask_dict)
        return self

    def _split_markdown_into_chunks(self) -> list[str]:
        """Split ``self.markdown`` into chunks of at most ``self.chunksize`` and report the count."""
        chunks: list[str] = split_markdown_text(self.markdown, self.chunksize)
        print(f"markdown分为{len(chunks)}块")
        return chunks

    def _create_agent(self, system_prompt: str, baseurl=None, key=None, model_id=None, temperature=None):
        """Build an ``Agent``; every argument left as None falls back to the instance default."""
        agent = Agent(baseurl=self.base_url if baseurl is None else baseurl,
                      key=self.key if key is None else key,
                      model_id=self.model_id if model_id is None else model_id,
                      temperature=self.temperature if temperature is None else temperature,
                      max_concurrent=self.max_concurrent)
        agent.system_prompt = system_prompt
        return agent

    def create_refine_agent(self, baseurl=None, key=None, model_id=None, temperature=None):
        """Return an ``Agent`` whose system prompt asks for markdown clean-up.

        Arguments left as None fall back to the values given to ``__init__``.
        """
        system_prompt = r"""# 角色
你是一个修正markdown文本的专家。
# 工作
找到markdown片段的不合理之处,对于缺失的句子,应该查看缺失的语句是否可能被错误的放在了其他位置,并通过重组段落、去掉异常字词修复不合理之处。
尽量忠实于原文。形如<ph-abc123>的占位符不要改变。latex不要改变。
# 输出
修正后的markdown纯文本
# 示例
## 调整顺序
输入:
applications and scenarios becoming more and more extensive.
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its
输出:
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.
## 去掉异常字词
输入:
你@好,你叫什么\名#字
输出:
你好,你叫什么名字\no_think"""
        return self._create_agent(system_prompt, baseurl, key, model_id, temperature)

    def create_translate_agent(self, baseurl=None, key=None, model_id=None, temperature=None, to_lang="中文"):
        """Return an ``Agent`` whose system prompt asks for translation into ``to_lang``.

        Arguments left as None fall back to the values given to ``__init__``.
        """
        system_prompt = r"""# 角色
你是一个翻译markdown文本的专家。
# 工作
将输入的markdown文本翻译成{0}。
尽量忠实于原文。
形如<ph-abc123>的占位符不要改变。
latex不要改变。
# 输出
翻译后的markdown纯文本
# 示例
## 英文翻译为中文:
输入:
hello<ph-aaaaaa>, what's your name?
输出:
你好<ph-aaaaaa>,你叫什么名字?\no_think""".format(to_lang)
        return self._create_agent(system_prompt, baseurl, key, model_id, temperature)

    def read_pdf_as_markdown(self, pdf: Path | None = None, formula=False, code=False, save=False):
        """Convert a PDF to markdown and store it in ``self.markdown``.

        Args:
            pdf: PDF to convert; ``self.file_path`` is used when None.
            formula: Forwarded to ``pdf2markdown_embed_images``.
            code: Forwarded to ``pdf2markdown_embed_images``.
            save: When true, also write the converted markdown as ``<pdf stem>.md``
                via ``save_as_markdown``.

        Returns:
            self, to allow chaining.
        """
        print("正在将pdf转换为markdown")
        if pdf is None:
            pdf = self.file_path
        elif isinstance(pdf, str):
            # A plain string has no ``.stem``; normalise so ``save=True`` works.
            pdf = Path(pdf)
        self.markdown = pdf2markdown_embed_images(pdf, formula, code)
        print("pdf已转换")
        if save:
            self.save_as_markdown(filename=f"{pdf.stem}.md")
        return self

    def read_markdown(self, markdown_path: Path | str):
        """Load a markdown file into ``self.markdown`` and remember it as ``self.file_path``."""
        if isinstance(markdown_path, str):
            markdown_path = Path(markdown_path)
        self.file_path = markdown_path
        # Explicit UTF-8: the platform default encoding may not be able to decode the file.
        self.markdown = markdown_path.read_text(encoding="utf-8")
        return self

    def refine_markdown(self, refine_agent: Agent | None = None) -> str:
        """Send every chunk of ``self.markdown`` to the refine agent and store the result.

        Args:
            refine_agent: Agent to use; a default refine agent is created when None.

        Returns:
            The refined markdown (also stored in ``self.markdown``).
        """
        if refine_agent is None:
            refine_agent = self.create_refine_agent()
        chunks = self._split_markdown_into_chunks()
        result: list[str] = refine_agent.send_prompts(chunks, timeout=10000)
        # Chunks were cut at block boundaries; rejoin with a blank line so the last
        # block of one chunk is not glued to the first block of the next.
        self.markdown = "\n\n".join(result)
        print("markdown已修正")
        return self.markdown

    def translate_markdown(self, translate_agent: Agent | None = None):
        """Send every chunk of ``self.markdown`` to the translate agent and store the result.

        Args:
            translate_agent: Agent to use; a default translate agent is created when None.

        Returns:
            The translated markdown (also stored in ``self.markdown``).
        """
        print("正在翻译markdown")
        if translate_agent is None:
            translate_agent = self.create_translate_agent()
        chunks = self._split_markdown_into_chunks()
        result: list[str] = translate_agent.send_prompts(chunks, timeout=10000)
        # See refine_markdown: keep a blank line between chunks.
        self.markdown = "\n\n".join(result)
        print("翻译完成")
        return self.markdown

    @staticmethod
    def _prepare_output_path(filename: str | Path, output_dir: str | Path) -> Path:
        """Create ``output_dir`` if needed and return ``output_dir / filename``."""
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir / Path(filename)

    def save_as_markdown(self, filename: str | Path = "output.md", output_dir: str | Path = "./output"):
        """Write ``self.markdown`` to ``output_dir / filename`` (UTF-8); return self."""
        full_name = self._prepare_output_path(filename, output_dir)
        full_name.write_text(self.markdown, encoding="utf-8")
        print(f"文件已写入{full_name}")
        return self

    def save_as_html(self, filename: str | Path = "output.html", output_dir: str | Path = "./output"):
        """Render ``self.markdown`` with markdown2 and write a standalone HTML page.

        The page is written as UTF-8, matching the ``<meta charset>`` it declares.
        It references Pico CSS, MathJax and mermaid from a CDN.

        Returns:
            self, to allow chaining.
        """
        filename = Path(filename)
        full_name = self._prepare_output_path(filename, output_dir)
        markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid'])

        html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{filename}</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@picocss/pico@latest/css/pico.min.css">
<style>
html {{
padding:2vh 10vw;
font-size: 15px;
}}
</style>
</head>
<body>
{markdowner.convert(self.markdown)}
</body>
<script src="https://cdn.jsdelivr.net/npm/mathjax@3.2.2/es5/tex-mml-chtml.min.js"></script>
<script type="module" defer>
import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@9/dist/mermaid.esm.min.mjs';
mermaid.initialize({{
securityLevel: 'loose',
startOnLoad: true
}});
let observer = new MutationObserver(mutations => {{
for(let mutation of mutations) {{
mutation.target.style.visibility = "visible";
}}
}});
document.querySelectorAll("pre.mermaid-pre div.mermaid").forEach(item => {{
observer.observe(item, {{
attributes: true,
attributeFilter: ['data-processed']
}});
}});
</script>
</html>
"""
        full_name.write_text(html, encoding="utf-8")
        print(f"文件已写入{full_name}")
        return self

    def _process_and_save(self, stem: str, to_lang, output_dir, output_format,
                          refine, refine_agent, translate_agent):
        """Mask, optionally refine, translate, unmask and save ``self.markdown``.

        The output file is named ``<stem>_<to_lang>.md`` or ``<stem>_<to_lang>.html``.
        """
        self._mask_uris_in_markdown()
        if refine:
            if refine_agent is None:
                refine_agent = self.create_refine_agent()
            self.refine_markdown(refine_agent)
        if translate_agent is None:
            translate_agent = self.create_translate_agent(to_lang=to_lang)
        self.translate_markdown(translate_agent)
        self._unmask_uris_in_markdown()
        if output_format == "markdown":
            self.save_as_markdown(filename=f"{stem}_{to_lang}.md", output_dir=output_dir)
        elif output_format == "html":
            self.save_as_html(filename=f"{stem}_{to_lang}.html", output_dir=output_dir)
        return self

    def translate_pdf_file(self, pdf_path: Path | str | None = None, to_lang="中文", output_dir="./output",
                           formula=False,
                           code=False, output_format: Literal["markdown", "html"] = "markdown", refine=True,
                           refine_agent: Agent | None = None, translate_agent: Agent | None = None):
        """Convert a PDF to markdown, optionally refine it, translate it and save the result.

        Args:
            pdf_path: PDF to translate; ``self.file_path`` is used when None.
            to_lang: Target language name inserted into the translate prompt and the file name.
            output_dir: Directory the result is written to.
            formula: Forwarded to ``read_pdf_as_markdown``.
            code: Forwarded to ``read_pdf_as_markdown``.
            output_format: ``"markdown"`` or ``"html"``.
            refine: Whether to run the refine step before translating.
            refine_agent: Agent for the refine step; a default one is created when None.
            translate_agent: Agent for the translate step; a default one is created when None.

        Returns:
            self, to allow chaining.

        Raises:
            AssertionError: If ``output_format`` is invalid or no path is available.
        """
        assert output_format in ("markdown", "html"), "output_format格式错误"
        if pdf_path is None:
            assert self.file_path is not None, "未输入文件路径"
            pdf_path = self.file_path
        if isinstance(pdf_path, str):
            pdf_path = Path(pdf_path)
        self.read_pdf_as_markdown(pdf_path, formula=formula, code=code)
        return self._process_and_save(pdf_path.stem, to_lang, output_dir, output_format,
                                      refine, refine_agent, translate_agent)

    def translate_markdown_file(self, markdown_path: Path | str | None = None, to_lang="中文", output_dir="./output",
                                output_format: Literal["markdown", "html"] = "markdown",
                                refine=False, refine_agent: Agent | None = None, translate_agent: Agent | None = None):
        """Read a markdown file, optionally refine it, translate it and save the result.

        Args:
            markdown_path: File to translate; ``self.file_path`` is used when None.
            to_lang: Target language name inserted into the translate prompt and the file name.
            output_dir: Directory the result is written to.
            output_format: ``"markdown"`` or ``"html"``.
            refine: Whether to run the refine step before translating.
            refine_agent: Agent for the refine step; a default one is created when None.
            translate_agent: Agent for the translate step; a default one is created when None.

        Returns:
            self, to allow chaining.

        Raises:
            AssertionError: If ``output_format`` is invalid or no path is available.
        """
        assert output_format in ("markdown", "html"), "output_format格式错误"
        if markdown_path is None:
            assert self.file_path is not None, "未输入文件路径"
            markdown_path = self.file_path
        elif isinstance(markdown_path, str):
            markdown_path = Path(markdown_path)
        # Explicit UTF-8: the platform default encoding may not be able to decode the file.
        self.markdown = markdown_path.read_text(encoding="utf-8")
        return self._process_and_save(markdown_path.stem, to_lang, output_dir, output_format,
                                      refine, refine_agent, translate_agent)
|
||||
|
||||
|
||||
# Running this module directly does nothing.
if __name__ == '__main__':
    pass
|
||||
0
filetranslate/utils/__init__.py
Normal file
0
filetranslate/utils/__init__.py
Normal file
109
filetranslate/utils/agent_utils.py
Normal file
109
filetranslate/utils/agent_utils.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import asyncio
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
class Agent:
|
||||
def __init__(self, baseurl="", key="", model_id="", system_prompt="", temperature=0.7, max_concurrent=5):
|
||||
self.baseurl = baseurl
|
||||
self.key = key
|
||||
self.model_id = model_id
|
||||
self.system_prompt = system_prompt
|
||||
self.temperature = temperature
|
||||
# self.client=httpx.Client()
|
||||
self.client_async = httpx.AsyncClient()
|
||||
self.max_concurrent = max_concurrent
|
||||
|
||||
def _prepare_request_data(self, prompt, system_prompt, temperature=None, top_p=0.9):
|
||||
if temperature is None:
|
||||
temperature = self.temperature
|
||||
headers = {"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {self.key}"}
|
||||
data = {
|
||||
"model": self.model_id,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": prompt}
|
||||
],
|
||||
"temperature": temperature,
|
||||
"top_p": top_p
|
||||
}
|
||||
return headers, data
|
||||
|
||||
# def send_prompt(self,prompt,system_prompt=None,timeout=50):
|
||||
# if system_prompt is None:
|
||||
# system_prompt=self.system_prompt
|
||||
# headers,data=self._prepare_request_data(prompt,system_prompt)
|
||||
# response=self.client.post(f"{self.baseurl}/chat/completions",json=data,headers=headers,timeout=timeout)
|
||||
# response.raise_for_status()
|
||||
# return response.json()["choices"][0]["message"]["content"].lstrip()
|
||||
|
||||
async def send_async(self, prompt: str, system_prompt: None | str = None, timeout: int = 200) -> str:
|
||||
if system_prompt is None:
|
||||
system_prompt = self.system_prompt
|
||||
"""Sends a single prompt asynchronously."""
|
||||
headers, data = self._prepare_request_data(prompt, system_prompt)
|
||||
try:
|
||||
response = await self.client_async.post(
|
||||
f"{self.baseurl}/chat/completions",
|
||||
json=data,
|
||||
headers=headers,
|
||||
timeout=timeout
|
||||
)
|
||||
response.raise_for_status()
|
||||
result: str = response.json()["choices"][0]["message"]["content"]
|
||||
return result.lstrip()
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise Exception(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}") from e
|
||||
except httpx.RequestError as e:
|
||||
raise Exception(f"AI请求连接错误 (async): {e}") from e
|
||||
except (KeyError, IndexError) as e:
|
||||
raise Exception(f"AI响应格式错误 (async): {e}") from e
|
||||
|
||||
async def send_prompts_async(
|
||||
self,
|
||||
prompts: list[str],
|
||||
system_prompt: str | None = None,
|
||||
timeout: int = 50,
|
||||
max_concurrent: int = 5 # 新增参数,默认并发数为5
|
||||
) -> list[str]:
|
||||
total = len(prompts)
|
||||
count = 0
|
||||
"""
|
||||
Sends multiple prompts asynchronously, limiting concurrent requests.
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(max_concurrent)
|
||||
tasks = []
|
||||
|
||||
# 辅助协程,用于包装 self.send_async 并使用信号量
|
||||
async def send_with_semaphore(p_text: str):
|
||||
async with semaphore: # 在进入代码块前获取信号量,退出时释放
|
||||
result = await self.send_async(
|
||||
prompt=p_text,
|
||||
system_prompt=system_prompt,
|
||||
timeout=timeout
|
||||
)
|
||||
nonlocal count
|
||||
count += 1
|
||||
print(f"进行到{count}/{total}")
|
||||
return result
|
||||
|
||||
for p_text in prompts:
|
||||
task = asyncio.create_task(send_with_semaphore(p_text))
|
||||
tasks.append(task)
|
||||
|
||||
results = await asyncio.gather(*tasks, return_exceptions=False)
|
||||
return results
|
||||
|
||||
def send_prompts(
|
||||
self,
|
||||
prompts: list[str],
|
||||
system_prompt: str | None = None,
|
||||
timeout: int = 50,
|
||||
) -> list[str]:
|
||||
result = asyncio.run(self.send_prompts_async(prompts, system_prompt, timeout, self.max_concurrent))
|
||||
return result
|
||||
|
||||
|
||||
# Running this module directly does nothing.
if __name__ == '__main__':
    pass
|
||||
25
filetranslate/utils/convert.py
Normal file
25
filetranslate/utils/convert.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from pathlib import Path
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
# Value assigned to ``PdfPipelineOptions.images_scale`` in pdf2markdown_embed_images.
IMAGE_RESOLUTION_SCALE = 4
|
||||
|
||||
|
||||
def pdf2markdown_embed_images(pdf: Path | str, formula=False, code=False) -> str:
    """Convert a PDF to markdown with docling, exporting images in EMBEDDED mode.

    Args:
        pdf: Path of the PDF to convert.
        formula: When truthy, turn on ``do_formula_enrichment`` in the pipeline options.
        code: When truthy, turn on ``do_code_enrichment`` in the pipeline options.

    Returns:
        The markdown text exported from the converted document.
    """
    options = PdfPipelineOptions()
    options.generate_picture_images = True
    options.images_scale = IMAGE_RESOLUTION_SCALE
    # Enrichment flags are only touched when requested; otherwise docling's
    # own defaults stay in place.
    if formula:
        options.do_formula_enrichment = True
    if code:
        options.do_code_enrichment = True

    pdf_option = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})
    document = converter.convert(pdf).document
    return document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
|
||||
|
||||
# Running this module directly does nothing.
if __name__ == '__main__':
    pass
|
||||
200
filetranslate/utils/markdown_splitter.py
Normal file
200
filetranslate/utils/markdown_splitter.py
Normal file
@@ -0,0 +1,200 @@
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
|
||||
class MarkdownBlockSplitter:
    """Split markdown text into chunks of roughly at most ``max_block_size`` characters.

    The text is first cut into logical blocks (fenced code blocks and
    blank-line-separated paragraphs). Small blocks are merged greedily; a block
    that is too large on its own is split further by sentences and/or lines.
    A single sentence, line or code line longer than the limit is never cut,
    so such a chunk can still exceed ``max_block_size``.
    """

    def __init__(self, max_block_size: int = 4096):
        """
        Args:
            max_block_size: Target maximum size of each chunk, in characters.
        """
        self.max_block_size = max_block_size

    def split_markdown(self, markdown_text: str) -> List[str]:
        """Split markdown text into chunks.

        Args:
            markdown_text: The markdown text to split.

        Returns:
            List of chunks. Logical blocks merged into one chunk are separated
            by a blank line.
        """
        result_blocks: List[str] = []
        current_block = ""

        for block in self._split_into_logical_blocks(markdown_text):
            if len(block) > self.max_block_size:
                # Flush what has been accumulated, then split the oversized block on its own.
                if current_block:
                    result_blocks.append(current_block)
                    current_block = ""
                result_blocks.extend(self._split_large_block(block))
                continue

            # +2 accounts for the "\n\n" separator between merged blocks.
            if current_block and len(current_block) + len(block) + 2 > self.max_block_size:
                result_blocks.append(current_block)
                current_block = block
            elif current_block:
                current_block += "\n\n" + block
            else:
                current_block = block

        if current_block:
            result_blocks.append(current_block)

        return result_blocks

    def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
        """Cut markdown text into fenced code blocks and blank-line-separated paragraphs.

        Args:
            markdown_text: The markdown text to split.

        Returns:
            List of blocks. Code blocks are kept verbatim; other blocks are
            stripped and empty ones are dropped.
        """
        # Normalise Windows line endings.
        markdown_text = markdown_text.replace('\r\n', '\n')

        # Fenced code blocks delimited by ``` or ~~~.
        code_block_pattern = r'(```[\s\S]*?```|~~~[\s\S]*?~~~)'

        # The capturing group makes re.split return the code blocks at odd indices.
        parts = re.split(code_block_pattern, markdown_text)

        blocks: List[str] = []
        for i, part in enumerate(parts):
            if i % 2 == 1:
                blocks.append(part)
            else:
                stripped = (b.strip() for b in re.split(r'\n\s*\n', part))
                blocks.extend(b for b in stripped if b)

        return blocks

    def _pack_pieces(self, pieces: List[str], separators: List[str]) -> List[str]:
        """Greedily merge ``pieces`` into parts no longer than ``max_block_size``.

        ``separators[i]`` is the text placed before ``pieces[i]`` when it is
        appended to a non-empty part; it is dropped at part boundaries.
        """
        parts: List[str] = []
        current: List[str] = []
        size = 0
        for separator, piece in zip(separators, pieces):
            if current and size + len(separator) + len(piece) > self.max_block_size:
                parts.append(''.join(current))
                current = [piece]
                size = len(piece)
                continue
            if current:
                current.append(separator)
                size += len(separator)
            current.append(piece)
            size += len(piece)
        if current:
            parts.append(''.join(current))
        return parts

    def _split_code_block(self, block: str) -> List[str]:
        """Split a fenced code block by lines, re-fencing every part."""
        fence_marker = block[:3]
        first_line, newline, body = block.partition('\n')
        if not newline:
            # Everything is on the fence line; there is no line boundary to split on.
            return [block]

        # Language specifier (if any) is repeated on every part.
        language_spec = first_line[3:].strip()

        # Drop the closing fence only if it is really there (the block may be
        # an unclosed fence), so no code characters are lost.
        if body.rstrip().endswith(fence_marker):
            body = body.rstrip()[:-len(fence_marker)]
        # Trim surrounding newlines but keep the first code line's indentation.
        lines = body.strip('\n').split('\n')

        result: List[str] = []
        current_part = [first_line]
        current_size = len(first_line) + 1  # +1 for the newline

        for line in lines:
            line_size = len(line) + 1  # +1 for the newline
            # +3 for the closing fence. Only flush a part that already holds
            # code, otherwise an empty fenced block would be emitted.
            if current_size + line_size + 3 > self.max_block_size and len(current_part) > 1:
                current_part.append(fence_marker)
                result.append('\n'.join(current_part))
                current_part = [f"{fence_marker}{language_spec}"]
                current_size = len(current_part[0]) + 1
            current_part.append(line)
            current_size += line_size

        current_part.append(fence_marker)
        result.append('\n'.join(current_part))
        return result

    def _split_large_block(self, block: str) -> List[str]:
        """Split a block that exceeds ``max_block_size``.

        Code blocks are split by lines and re-fenced. Other blocks are split
        after sentence-ending ``.``, ``!`` or ``?`` followed by whitespace; the
        original whitespace between sentences is kept inside a part. Parts that
        are still too large (and blocks without such punctuation) are split by
        lines.

        Args:
            block: A large markdown block.

        Returns:
            List of smaller blocks.
        """
        if block.startswith(('```', '~~~')):
            return self._split_code_block(block)

        if '.' in block or '!' in block or '?' in block:
            # The capturing group keeps the whitespace: sentence, ws, sentence, ...
            pieces = re.split(r'(?<=[.!?])(\s+)', block)
            parts = self._pack_pieces(pieces[0::2], [''] + pieces[1::2])
        else:
            parts = [block]

        result: List[str] = []
        for part in parts:
            if len(part) > self.max_block_size:
                lines = part.split('\n')
                result.extend(self._pack_pieces(lines, ['\n'] * len(lines)))
            else:
                result.append(part)
        return result
|
||||
|
||||
|
||||
def split_markdown_text(markdown_text, max_block_size=4096):
    """Split a markdown string into chunks using ``MarkdownBlockSplitter``.

    Args:
        markdown_text: The markdown text to split.
        max_block_size: Maximum number of characters per chunk, passed to the splitter.

    Returns:
        List of markdown chunks.
    """
    return MarkdownBlockSplitter(max_block_size=max_block_size).split_markdown(markdown_text)
|
||||
61
filetranslate/utils/markdown_utils.py
Normal file
61
filetranslate/utils/markdown_utils.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import re
|
||||
import threading
|
||||
import uuid
|
||||
|
||||
|
||||
|
||||
class MaskDict:
    """Lock-protected mapping from short placeholder IDs to the text they replace."""

    def __init__(self):
        self._dict = {}
        # IDs handed out by create_id() that have not been stored with set() yet.
        self._reserved = set()
        self._lock = threading.Lock()

    def create_id(self):
        """Return a new 6-character hexadecimal ID.

        The ID is reserved immediately, so it is never returned again even if
        the caller has not called ``set`` yet.
        """
        with self._lock:
            while True:
                # Random IDs: a timestamp-based prefix repeats for calls made in
                # quick succession.
                new_id = uuid.uuid4().hex[:6]
                if new_id not in self._dict and new_id not in self._reserved:
                    self._reserved.add(new_id)
                    return new_id

    def get(self, key):
        """Return the value stored under ``key``, or None if there is none."""
        with self._lock:
            return self._dict.get(key)

    def set(self, key, value):
        """Store ``value`` under ``key``."""
        with self._lock:
            self._dict[key] = value
            # The key is now protected by its presence in the dict.
            self._reserved.discard(key)

    def delete(self, key):
        """Remove ``key`` if present; do nothing otherwise."""
        with self._lock:
            self._dict.pop(key, None)
            self._reserved.discard(key)

    def __contains__(self, item):
        """Return whether a value has been stored under ``item``."""
        with self._lock:
            return item in self._dict
|
||||
def uris2placeholder(markdown:str, mask_dict:MaskDict):
    """Replace every ``[text](target)`` / ``![alt](target)`` span with ``<ph-ID>``.

    The matched text is stored in ``mask_dict`` under a newly created ID.

    Returns:
        The markdown with the matched spans replaced by placeholders.
    """
    def hide(found: re.Match):
        token = mask_dict.create_id()
        mask_dict.set(token, found.group())
        return f"<ph-{token}>"

    return re.sub(r'!?\[.*?\]\(.*?\)', hide, markdown)
|
||||
|
||||
def placeholder2_uris(markdown:str, mask_dict:MaskDict):
    """Replace ``<ph-ID>`` placeholders with the text stored in ``mask_dict``.

    Placeholders whose ID has no stored value are left unchanged.

    Returns:
        The markdown with known placeholders restored.
    """
    def reveal(found: re.Match):
        original = mask_dict.get(found.group(1))
        return found.group() if original is None else original

    return re.sub(r"<ph-([a-zA-Z0-9]+)>", reveal, markdown)
|
||||
|
||||
|
||||
|
||||
|
||||
# Running this module directly does nothing.
if __name__ == '__main__':
    pass
|
||||
Reference in New Issue
Block a user