fix: 印尼语fallback缺失 + docx格式100%保留
- index.html: catch块增加id回退、setLang动态更新title、标题栏初始化
- docx_translator.py: 从分段翻译改为段落级翻译,Run结构100%保留
  删除SIGNIFICANT_STYLES/is_tab_run/_process_element_children等~120行死代码

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>DocuTranslate - 交互式文档翻译</title>
|
||||
<title>DocuTranslate</title>
|
||||
<link rel="icon" href="/static/favicon.ico" type="image/x-icon">
|
||||
<!-- Bootstrap CSS -->
|
||||
<link href="/static/bootstrap.css" rel="stylesheet" crossorigin="anonymous">
|
||||
@@ -1860,6 +1860,8 @@
|
||||
localStorage.setItem('ui_language', l);
|
||||
const langMap = {zh: 'zh-CN', en: 'en', id: 'id'};
|
||||
document.documentElement.lang = langMap[l] || 'en';
|
||||
const dict = i18nData.value[l] || i18nData.value['en'] || {};
|
||||
document.title = dict['pageTitle'] || 'DocuTranslate';
|
||||
};
|
||||
const setTheme = (t) => {
|
||||
localStorage.setItem('theme', t);
|
||||
@@ -1886,6 +1888,7 @@
|
||||
if(i18nData.value.en) Object.assign(i18nData.value.en, extraEn);
|
||||
|
||||
} catch (e) {
|
||||
console.error("i18n load failed", e);
|
||||
i18nData.value = {
|
||||
zh: {
|
||||
pageTitle: "DocuTranslate",
|
||||
@@ -1921,10 +1924,32 @@
|
||||
mineruDeployServerUrlPlaceholder: "http://127.0.0.1:30000",
|
||||
mineruDeployParseMethodLabel: "Parse Method",
|
||||
mineruDeployTableEnableLabel: "Table Recognition"
|
||||
},
|
||||
id: {
|
||||
pageTitle: "DocuTranslate",
|
||||
tutorialBtn: "Tutorial",
|
||||
projectContributeBtn: "Kolaborasi",
|
||||
workflowTitle: "Pilih Alur Kerja",
|
||||
autoWorkflowLabel: "Pilih Otomatis",
|
||||
modelPresetLabel: "Preset Model",
|
||||
modelPresetPlaceholder: "Pilih preset model",
|
||||
modelPresetEmpty: "Konfigurasi preset di server",
|
||||
modelPresetRuntimeHint: "Provider, endpoint, dan API key akan dibaca dari environment server.",
|
||||
workflowOptionPptx: "Presentasi PPTX",
|
||||
pptxSettingsTitleText: "Pengaturan PPTX",
|
||||
mineruDeployServerUrlLabel: "Server URL",
|
||||
mineruDeployLangListLabel: "Daftar Bahasa",
|
||||
mineruDeployServerUrlPlaceholder: "http://127.0.0.1:30000",
|
||||
mineruDeployParseMethodLabel: "Parse Method",
|
||||
mineruDeployTableEnableLabel: "Table Recognition"
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Set initial page title based on detected language
|
||||
const initDict = i18nData.value[currentLang.value] || i18nData.value['en'] || {};
|
||||
document.title = initDict['pageTitle'] || 'DocuTranslate';
|
||||
|
||||
// Backend Metadata
|
||||
try {
|
||||
const [metaRes, enginRes, paramsRes, configRes] = await Promise.all([
|
||||
|
||||
@@ -24,19 +24,6 @@ from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTr
|
||||
|
||||
# ---------------- 辅助函数 ----------------
|
||||
|
||||
# [v6.2] 定义一组具有显著视觉效果的格式标签。
|
||||
# 我们只在 Run 包含这些格式时才将其视为空白格式边界。
|
||||
# 这避免了因字体、字号等微小变化导致的过度文本切分。
|
||||
SIGNIFICANT_STYLES = frozenset([
|
||||
qn('w:u'), # 下划线
|
||||
qn('w:strike'), # 删除线
|
||||
qn('w:dstrike'), # 双删除线
|
||||
qn('w:shd'), # 底纹/背景色
|
||||
qn('w:highlight'), # 荧光笔高亮
|
||||
qn('w:bdr'), # 边框
|
||||
qn('w:effectLst'), # 文本效果 (如发光、阴影)
|
||||
qn('w:em'), # 强调标记 (着重号)
|
||||
])
|
||||
|
||||
|
||||
def is_image_run(run: Run) -> bool:
|
||||
@@ -45,33 +32,6 @@ def is_image_run(run: Run) -> bool:
|
||||
return '<w:drawing' in xml or '<w:pict' in xml
|
||||
|
||||
|
||||
def is_formatting_only_run(run: Run) -> bool:
|
||||
"""
|
||||
检查一个 Run 是否仅用于格式化,不包含任何应被渲染的文本。
|
||||
这仅适用于其 .text 属性为 "" 的情况。
|
||||
"""
|
||||
return run.text == ""
|
||||
|
||||
|
||||
# ---------- 新增修改部分 1: is_styled_whitespace_run 函数被移除 ----------
|
||||
# 此函数不再需要,因为新的逻辑会根据格式变化来切分,而不是根据带格式的空格。
|
||||
# ---------------------- 修改结束 ----------------------
|
||||
|
||||
def is_tab_run(run: Run) -> bool:
|
||||
"""
|
||||
检查一个 Run 是否主要代表一个制表符,应被视作格式边界。
|
||||
仅当 Run 的文本内容为空或仅包含空白,且 XML 中存在 <w:tab/> 时,
|
||||
才将其视为纯格式化用途的 Run。
|
||||
"""
|
||||
# .text 属性会将 <w:tab/> 转换成 '\t'
|
||||
# 如果 .text 在去除空白后仍有内容,说明这个 Run 不仅仅是个制表符。
|
||||
if run.text.strip():
|
||||
return False
|
||||
|
||||
xml = getattr(run.element, 'xml', '')
|
||||
return '<w:tab' in xml or '<w:ptab' in xml
|
||||
|
||||
|
||||
# ---------------- 配置类 ----------------
|
||||
@dataclass
|
||||
class DocxTranslatorConfig(AiTranslatorConfig):
|
||||
@@ -110,14 +70,6 @@ class DocxTranslator(AiTranslator):
|
||||
[v6.0 - 语义切分重构版]
|
||||
- 重构核心逻辑,不再跳过域结果,而是将其作为语义边界来切分文本,增强了鲁棒性。
|
||||
"""
|
||||
IGNORED_TAGS = {
|
||||
qn('w:proofErr'), qn('w:lastRenderedPageBreak'), qn('w:bookmarkStart'),
|
||||
qn('w:bookmarkEnd'), qn('w:commentRangeStart'), qn('w:commentRangeEnd'),
|
||||
qn('w:del'), qn('w:ins'), qn('w:moveFrom'), qn('w:moveTo'),
|
||||
}
|
||||
RECURSIVE_CONTAINER_TAGS = {
|
||||
qn('w:smartTag'), qn('w:sdtContent'), qn('w:hyperlink'),
|
||||
}
|
||||
|
||||
def __init__(self, config: DocxTranslatorConfig):
|
||||
super().__init__(config=config)
|
||||
@@ -139,126 +91,35 @@ class DocxTranslator(AiTranslator):
|
||||
self.insert_mode = config.insert_mode
|
||||
self.separator = config.separator
|
||||
|
||||
# ---------- 新增修改部分 2: 增加用于比较格式的辅助函数 ----------
|
||||
def _get_significant_styles(self, run: Run) -> frozenset:
|
||||
"""从一个 Run 中提取“显著”格式标签的集合。"""
|
||||
if run is None:
|
||||
return frozenset()
|
||||
rPr = run.element.rPr
|
||||
if rPr is None:
|
||||
return frozenset()
|
||||
return frozenset(child.tag for child in rPr if child.tag in SIGNIFICANT_STYLES)
|
||||
|
||||
def _have_same_significant_styles(self, run1: Run, run2: Run) -> bool:
|
||||
"""检查两个 Run 是否具有相同的“显著”格式集合。"""
|
||||
styles1 = self._get_significant_styles(run1)
|
||||
styles2 = self._get_significant_styles(run2)
|
||||
return styles1 == styles2
|
||||
|
||||
# ---------------------- 修改结束 ----------------------
|
||||
|
||||
# ---------- 代码修改部分 1: 形状翻译逻辑的核心实现 ----------
|
||||
def _process_element_children(self, element, parent_paragraph: Paragraph, elements: List[Dict[str, Any]],
|
||||
texts: List[str],
|
||||
state: Dict[str, Any],
|
||||
top_level_para: Paragraph):
|
||||
|
||||
def flush_segment():
|
||||
current_runs = state['current_runs']
|
||||
if not current_runs:
|
||||
return
|
||||
full_text = "".join(r.text for r in current_runs)
|
||||
if full_text.strip():
|
||||
# 在 elements 中增加对父段落和顶级段落的引用
|
||||
elements.append({
|
||||
"type": "text_runs",
|
||||
"runs": list(current_runs),
|
||||
"paragraph": parent_paragraph,
|
||||
"top_level_paragraph": top_level_para
|
||||
})
|
||||
texts.append(full_text)
|
||||
state['current_runs'].clear()
|
||||
|
||||
for child in element:
|
||||
if child.tag in self.IGNORED_TAGS:
|
||||
continue
|
||||
|
||||
if child.tag in self.RECURSIVE_CONTAINER_TAGS:
|
||||
flush_segment()
|
||||
self._process_element_children(child, parent_paragraph, elements, texts, state, top_level_para)
|
||||
flush_segment() # 在递归容器后也刷新,确保其内容成为独立片段
|
||||
continue
|
||||
|
||||
field_char_element = child.find(qn('w:fldChar')) if isinstance(child, CT_R) else None
|
||||
if field_char_element is not None:
|
||||
fld_type = field_char_element.get(qn('w:fldCharType'))
|
||||
if fld_type == 'begin' or fld_type == 'end':
|
||||
flush_segment()
|
||||
continue
|
||||
|
||||
if isinstance(child, CT_R):
|
||||
# 传入 parent_paragraph 以确保 Run 对象具有正确的上下文
|
||||
run = Run(child, parent_paragraph)
|
||||
|
||||
# 新增逻辑:处理形状(drawing/pict)内的文本
|
||||
# 形状可以包含文本框,需要优先于图片处理逻辑进行解析
|
||||
if '<w:drawing' in run.element.xml or '<w:pict' in run.element.xml:
|
||||
# 使用 list() 消耗迭代器,以便检查是否找到了文本框
|
||||
text_boxes = list(run.element.iter(qn('w:txbxContent')))
|
||||
if text_boxes:
|
||||
flush_segment() # 包含文本的形状是一个边界,刷新前面的文本
|
||||
for txbx_content in text_boxes:
|
||||
# 遍历文本框内的所有段落
|
||||
for p_element in txbx_content.findall(qn('w:p')):
|
||||
# 创建新的段落对象,并传入父级上下文
|
||||
shape_para = Paragraph(p_element, parent_paragraph)
|
||||
# 递归处理该段落,并传递顶级段落上下文
|
||||
self._process_paragraph(shape_para, elements, texts, top_level_para=top_level_para)
|
||||
|
||||
# 如果处理了形状内的文本,则该 Run 的任务已完成
|
||||
continue
|
||||
|
||||
# 保留原有逻辑: 检查绝对边界(图片、制表符等)
|
||||
if is_image_run(run) or is_formatting_only_run(run) or is_tab_run(run):
|
||||
flush_segment()
|
||||
continue # 这些 Run 本身不包含在任何文本片段中
|
||||
|
||||
# 保留原有逻辑: 基于格式变化进行切分
|
||||
last_run_in_segment = state['current_runs'][-1] if state['current_runs'] else None
|
||||
if last_run_in_segment and not self._have_same_significant_styles(last_run_in_segment, run):
|
||||
flush_segment()
|
||||
|
||||
# 将当前 Run 添加到片段中
|
||||
state['current_runs'].append(run)
|
||||
else:
|
||||
# 遇到任何非 Run 的块级元素(如在单元格中嵌套的表格),都应结束当前文本片段。
|
||||
flush_segment()
|
||||
|
||||
def _process_paragraph(self, para: Paragraph, elements: List[Dict[str, Any]], texts: List[str],
|
||||
top_level_para: Paragraph = None):
|
||||
# 如果是首次进入段落处理(非递归调用),则当前段落是顶级段落
|
||||
"""
|
||||
简化版段落处理:将段落内所有文本 Run 收集为一个翻译单元,完整保留格式结构。
|
||||
不再按格式变化切分 segment,翻译质量更好、格式保留 100%。
|
||||
"""
|
||||
if top_level_para is None:
|
||||
top_level_para = para
|
||||
|
||||
state = {
|
||||
'current_runs': [],
|
||||
}
|
||||
# 修改调用:传入 `para` 对象、其顶级上下文
|
||||
self._process_element_children(para._p, para, elements, texts, state, top_level_para)
|
||||
text_runs = []
|
||||
for run in para.runs:
|
||||
if is_image_run(run):
|
||||
continue # 跳过图片
|
||||
if not run.text.strip():
|
||||
# 保留带格式的空 Run(如下划线空格),但不加入文本
|
||||
continue
|
||||
|
||||
# 确保在段落处理结束时,刷新所有剩余的 Run
|
||||
current_runs = state['current_runs']
|
||||
if current_runs:
|
||||
full_text = "".join(r.text for r in current_runs)
|
||||
text_runs.append(run)
|
||||
|
||||
if text_runs:
|
||||
full_text = "".join(r.text for r in text_runs)
|
||||
if full_text.strip():
|
||||
elements.append({
|
||||
"type": "text_runs",
|
||||
"runs": list(current_runs),
|
||||
"runs": list(text_runs),
|
||||
"paragraph": para,
|
||||
"top_level_paragraph": top_level_para
|
||||
})
|
||||
texts.append(full_text)
|
||||
current_runs.clear()
|
||||
|
||||
# ---------------------- 修改结束 ----------------------
|
||||
|
||||
|
||||
Reference in New Issue
Block a user