fix: MT模式翻译残留、docx格式丢失、语言切换器及provider域名修复

- provider.py: 域名匹配改为包含匹配,覆盖dashscope-intl国际站
- segments_agent.py: MT模式改用<<<SEG:n>>>纯文本标记替代JSON,避免qwen-mt模型原文残留
- docx_translator.py: _apply_translation改为按字符比例分配译文到各Run,保留原始格式
- i18nData.json: vi(越南语)替换为id(印尼语),含完整175键翻译
- index.html: 语言切换器移至顶部标题栏,新增浏览器语言自动检测

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-06-08 14:07:13 +08:00
parent 9d8eacf0b4
commit 8a5f62342a
5 changed files with 377 additions and 331 deletions

View File

@@ -5,7 +5,7 @@ ProviderType: TypeAlias = Literal["ollama", "bigmodel", "aliyuncs", "volces", "g
def get_provider_by_domain(domain:str)->ProviderType: def get_provider_by_domain(domain:str)->ProviderType:
if domain == "open.bigmodel.cn": if domain == "open.bigmodel.cn":
return "bigmodel" return "bigmodel"
elif domain == "dashscope.aliyuncs.com": elif "dashscope.aliyuncs.com" in domain:
return "aliyuncs" return "aliyuncs"
elif domain == "ark.cn-beijing.volces.com": elif domain == "ark.cn-beijing.volces.com":
return "volces" return "volces"

View File

@@ -15,6 +15,9 @@ from docutranslate.agents.agent import PartialAgentResultError, AgentResultError
from docutranslate.glossary.glossary import Glossary from docutranslate.glossary.glossary import Glossary
from docutranslate.utils.json_utils import segments2json_chunks, fix_json_string from docutranslate.utils.json_utils import segments2json_chunks, fix_json_string
# MT mode plain-text segment marker — designed to survive machine translation unchanged.
# Group 1 is the numeric segment key; group 2 is the text up to the next marker or
# the end of the input. Whitespace after the marker is optional rather than a
# mandatory newline, so text the MT model left on the marker's own line, and a
# final marker with nothing after it, are still matched instead of being dropped.
# Group 2 may carry trailing whitespace; strip() it before use.
MT_SEG_MARKER_RE = re.compile(r'<<<SEG:(\d+)>>>\s*(.*?)(?=<<<SEG:\d+>>>|\Z)', re.DOTALL)
def generate_prompt(json_segments: str, to_lang: str): def generate_prompt(json_segments: str, to_lang: str):
return f""" return f"""
@@ -58,8 +61,8 @@ Below is an example of how merging should be done when necessary:
input: input:
```json ```json
{{ {{
"EXAMPLE_KEY_1":"汤姆说:杰克你", "EXAMPLE_KEY_1":"汤姆说:\"杰克你",
"EXAMPLE_KEY_2":"" "EXAMPLE_KEY_2":"\""
}} }}
``` ```
output: output:
@@ -92,6 +95,44 @@ def get_target_segments(result: str):
return result return result
def _chunk_to_mt_prompt(chunk: dict) -> str:
"""Convert a JSON chunk like {'0': 'text1', '1': 'text2'} to MT-friendly plain text."""
parts = []
for key in sorted(chunk.keys(), key=int):
parts.append(f"<<<SEG:{key}>>>\n{chunk[key]}")
return "\n".join(parts)
def _parse_mt_prompt_to_dict(mt_prompt: str) -> dict:
    """Recover the segment dict from a marker-formatted MT prompt.

    Each ``<<<SEG:n>>>`` match contributes one entry whose value is the
    captured text with surrounding whitespace stripped. When no marker
    matches at all, the whole prompt is returned as a single segment
    under key ``"0"``.
    """
    segments = {
        found.group(1): found.group(2).strip()
        for found in MT_SEG_MARKER_RE.finditer(mt_prompt)
    }
    if segments:
        return segments
    # No marker matched — wrap the entire prompt as one segment.
    return {"0": mt_prompt}
def _parse_mt_response(text: str, original_chunk: dict) -> dict:
    """Parse a marker-formatted MT response into a dict keyed like the original chunk.

    Only markers whose key exists in ``original_chunk`` are kept; their text
    is stripped of surrounding whitespace. Keys of ``original_chunk`` that
    the response never mentioned are added with an empty string (not the
    original text), so the returned dict always covers every original key.
    """
    translated = {}
    for found in MT_SEG_MARKER_RE.finditer(text):
        seg_key = found.group(1)
        if seg_key not in original_chunk:
            # Marker the prompt never contained — ignore it.
            continue
        translated[seg_key] = found.group(2).strip()

    # Back-fill keys absent from the response with an empty placeholder.
    for seg_key in original_chunk:
        translated.setdefault(seg_key, "")
    return translated
@dataclass(kw_only=True) @dataclass(kw_only=True)
class SegmentsTranslateAgentConfig(AgentConfig): class SegmentsTranslateAgentConfig(AgentConfig):
to_lang: str to_lang: str
@@ -123,20 +164,16 @@ class SegmentsTranslateAgent(Agent):
def _result_handler(self, result: str, origin_prompt: str, logger: Logger): def _result_handler(self, result: str, origin_prompt: str, logger: Logger):
""" """
处理成功的API响应。 处理成功的API响应。
- 如果键完全匹配,返回翻译结果 MT模式下使用 <<<SEG:n>>> 标记解析纯文本响应避免JSON格式不兼容问题
- 如果键不匹配,构造一个部分成功的结果,并通过 PartialTranslationError 异常抛出,以触发重试。
- 其他错误如JSON解析失败、模型偷懒则抛出普通 ValueError 触发重试。
- MT模式下如果返回的是纯文本而非JSON将其按行分割并映射到原始键。
""" """
# MT模式下直接解析origin_prompt为JSON纯净JSON没有<input>包装)
if self.is_mt_mode: if self.is_mt_mode:
original_segments = origin_prompt return self._result_handler_mt(result, origin_prompt, logger)
else:
# --- Non-MT mode (JSON-based) ---
original_segments = get_original_segments(origin_prompt) original_segments = get_original_segments(origin_prompt)
result = get_target_segments(result) result = get_target_segments(result)
if result == "": if result == "":
if original_segments.strip() != "": if original_segments.strip() != "":
# print(f"【测试】origin_prompt:\n{origin_prompt}\nresult:\n{result}")
raise AgentResultError("result为空值但原文不为空") raise AgentResultError("result为空值但原文不为空")
return {} return {}
try: try:
@@ -144,37 +181,6 @@ class SegmentsTranslateAgent(Agent):
original_chunk = json_repair.loads(original_segments) original_chunk = json_repair.loads(original_segments)
repaired_result = json_repair.loads(result) repaired_result = json_repair.loads(result)
# MT模式兼容处理各种非标准返回格式
if self.is_mt_mode:
# 如果是列表,尝试合并所有字典
if isinstance(repaired_result, list):
logger.debug(f"[MT模式] 返回结果是列表,包含 {len(repaired_result)} 个元素")
merged_result = {}
for item in repaired_result:
if isinstance(item, dict):
merged_result.update(item)
repaired_result = merged_result
# 如果返回的是纯文本(字符串),尝试将其映射到原始键
if isinstance(repaired_result, str):
original_keys = list(original_chunk.keys())
# 按行分割结果,去除空行
result_lines = [line.strip() for line in repaired_result.split('\n') if line.strip()]
# 如果只有一行结果但多个键,将整个结果分配给第一个键,其余为空
if len(result_lines) == 1 and len(original_keys) > 1:
repaired_result = {original_keys[0]: result_lines[0]}
for key in original_keys[1:]:
repaired_result[key] = ""
# 如果结果行数与键数匹配,逐行对应
elif len(result_lines) == len(original_keys):
repaired_result = {original_keys[i]: result_lines[i] for i in range(len(original_keys))}
# 如果结果行数不匹配,将所有结果合并给第一个键
else:
repaired_result = {original_keys[0]: repaired_result}
for key in original_keys[1:]:
repaired_result[key] = ""
if not isinstance(repaired_result, dict): if not isinstance(repaired_result, dict):
raise AgentResultError(f"Agent返回结果不是dict的json形式, result: {result}") raise AgentResultError(f"Agent返回结果不是dict的json形式, result: {result}")
@@ -184,9 +190,7 @@ class SegmentsTranslateAgent(Agent):
original_keys = set(original_chunk.keys()) original_keys = set(original_chunk.keys())
result_keys = set(repaired_result.keys()) result_keys = set(repaired_result.keys())
# 如果键不完全匹配
if original_keys != result_keys: if original_keys != result_keys:
# 仍然先构造一个最完整的“部分结果”
final_chunk = {} final_chunk = {}
common_keys = original_keys.intersection(result_keys) common_keys = original_keys.intersection(result_keys)
missing_keys = original_keys - result_keys missing_keys = original_keys - result_keys
@@ -201,74 +205,104 @@ class SegmentsTranslateAgent(Agent):
for key in missing_keys: for key in missing_keys:
final_chunk[key] = str(original_chunk[key]) final_chunk[key] = str(original_chunk[key])
raise PartialAgentResultError("键不匹配,触发重试", partial_result=final_chunk,
append_prompt=f"\nBe careful not to omit any keys from the input; do not combine sentences when translating.\n")
# 抛出自定义异常,将部分结果和错误信息一起传递出去
raise PartialAgentResultError("键不匹配,触发重试", partial_result=final_chunk,append_prompt=f"\nBe careful not to omit any keys from the input; do not combine sentences when translating.\n")
# 如果键完全匹配(理想情况),正常返回
for key, value in repaired_result.items(): for key, value in repaired_result.items():
repaired_result[key] = str(value) repaired_result[key] = str(value)
return repaired_result return repaired_result
except (RuntimeError, JSONDecodeError) as e: except (RuntimeError, JSONDecodeError) as e:
# MT模式兼容如果JSON解析失败尝试将结果作为纯文本处理
if self.is_mt_mode:
try:
original_chunk = json_repair.loads(original_segments)
original_keys = list(original_chunk.keys())
result_lines = [line.strip() for line in result.split('\n') if line.strip()]
if len(result_lines) == 1 and len(original_keys) > 1:
repaired_result = {original_keys[0]: result_lines[0]}
for key in original_keys[1:]:
repaired_result[key] = ""
elif len(result_lines) == len(original_keys):
repaired_result = {original_keys[i]: result_lines[i] for i in range(len(original_keys))}
else:
repaired_result = {original_keys[0]: result}
for key in original_keys[1:]:
repaired_result[key] = ""
# 验证结果
if set(repaired_result.keys()) != set(original_chunk.keys()):
raise AgentResultError(f"MT模式解析后键不匹配")
return repaired_result
except Exception as mt_e:
raise AgentResultError(f"MT模式纯文本处理失败: {mt_e.__repr__()}")
# 对于JSON解析等硬性错误继续抛出普通ValueError
raise AgentResultError(f"结果处理失败: {e.__repr__()}") raise AgentResultError(f"结果处理失败: {e.__repr__()}")
def _result_handler_mt(self, result: str, origin_prompt: str, logger: Logger) -> dict:
    """Handle a successful MT-mode response written in ``<<<SEG:n>>>`` marker format.

    Args:
        result: Raw text returned by the MT model.
        origin_prompt: The marker-formatted prompt that was sent.
        logger: Not used here; accepted so the call from ``_result_handler``
            can pass it through.

    Returns:
        A dict mapping every segment key of the prompt to its translated
        text, or ``{}`` when both the response and the prompt are blank.

    Raises:
        AgentResultError: The response is blank while the prompt is not,
            the response carries markers but no text at all, or the parsed
            result is identical to the source text.
        PartialAgentResultError: Some, but not all, of the prompt's markers
            came back. The partial result keeps the translated segments and
            falls back to the source text for the missing ones.
    """
    result_clean = result.strip()
    if result_clean == "":
        if origin_prompt.strip() != "":
            raise AgentResultError("result为空值但原文不为空")
        return {}

    original_chunk = _parse_mt_prompt_to_dict(origin_prompt)
    original_keys = set(original_chunk)
    # Prompt order; used for positional fallbacks so text is written to the
    # chunk's real keys rather than to an assumed "0".."n-1" range.
    ordered_keys = list(original_chunk)

    def _same_as_source(candidate: dict) -> bool:
        """Return True when every segment equals its source text (nothing was translated)."""
        return all(
            candidate.get(k, "").strip() == str(original_chunk.get(k, "")).strip()
            for k in original_keys
        )

    # Keys whose marker really appears in the response. _parse_mt_response
    # back-fills absent keys with "", so its key set cannot reveal omissions.
    marked_keys = {
        found.group(1) for found in MT_SEG_MARKER_RE.finditer(result_clean)
    } & original_keys

    if marked_keys:
        parsed = _parse_mt_response(result_clean, original_chunk)
        if not any(v.strip() for v in parsed.values()):
            # Markers came back without any text; a positional fallback would
            # only map the marker lines themselves onto segments.
            raise AgentResultError("MT模式无法解析响应")

        if marked_keys != original_keys:
            # Some segments were dropped: keep what was translated, fall back
            # to the source text for the rest, and ask for a retry.
            final_chunk = {
                key: parsed[key] if key in marked_keys else str(original_chunk[key])
                for key in ordered_keys
            }
            raise PartialAgentResultError(
                "MT模式键不匹配触发重试",
                partial_result=final_chunk,
                append_prompt="\nPreserve all <<<SEG:n>>> markers exactly as they appear.\n"
            )

        if _same_as_source(parsed):
            raise AgentResultError("翻译结果与原文完全相同,疑似翻译失败,将进行重试。")
        return parsed

    # Fallback: the MT model removed the markers — map non-empty lines to
    # segments positionally. result_clean is non-blank, so there is at least one line.
    result_lines = [line.strip() for line in result_clean.split('\n') if line.strip()]
    if len(result_lines) == len(ordered_keys):
        repaired = dict(zip(ordered_keys, result_lines))
        if _same_as_source(repaired):
            raise AgentResultError("翻译结果与原文完全相同(逐行),疑似翻译失败,将进行重试。")
        return repaired

    # Last fallback: line count does not match — give all text to the first segment.
    repaired = {key: "" for key in ordered_keys}
    repaired[ordered_keys[0]] = "\n".join(result_lines)
    return repaired
def _error_result_handler(self, origin_prompt: str, logger: Logger): def _error_result_handler(self, origin_prompt: str, logger: Logger):
""" """
处理在所有重试后仍然失败的请求。 处理在所有重试后仍然失败的请求。
作为备用方案,返回原文内容,并将所有值转换为字符串 作为备用方案,返回原文内容。
""" """
# MT模式下直接解析origin_prompt为JSON纯净JSON没有<input>包装)
if self.is_mt_mode: if self.is_mt_mode:
original_segments = origin_prompt original_chunk = _parse_mt_prompt_to_dict(origin_prompt)
else: for key in list(original_chunk.keys()):
original_chunk[key] = f"{original_chunk[key]}"
return original_chunk
original_segments = get_original_segments(origin_prompt) original_segments = get_original_segments(origin_prompt)
if original_segments == "": if original_segments == "":
return {} return {}
try: try:
original_chunk = json_repair.loads(original_segments) original_chunk = json_repair.loads(original_segments)
# 此处逻辑保留,作为最终的兜底方案
for key, value in original_chunk.items(): for key, value in original_chunk.items():
original_chunk[key] = f"{value}" original_chunk[key] = f"{value}"
return original_chunk return original_chunk
except (RuntimeError, JSONDecodeError): except (RuntimeError, JSONDecodeError):
logger.error(f"原始prompt也不是有效的json格式: {original_segments}") logger.error(f"原始prompt也不是有效的json格式: {original_segments}")
# 如果原始prompt本身也无效返回一个清晰的错误对象
return {"error": f"{original_segments}"} return {"error": f"{original_segments}"}
def send_segments(self, segments: list[str], chunk_size: int) -> list[str]: def send_segments(self, segments: list[str], chunk_size: int) -> list[str]:
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size) indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
# MT模式下直接发送纯净JSON不添加额外提示词
if self.is_mt_mode: if self.is_mt_mode:
prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks] prompts = [_chunk_to_mt_prompt(chunk) for chunk in chunks]
else: else:
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks] prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
translated_chunks = super().send_prompts(prompts=prompts, json_format=self.force_json, translated_chunks = super().send_prompts(prompts=prompts, json_format=self.force_json,
@@ -292,7 +326,6 @@ class SegmentsTranslateAgent(Agent):
except Exception as e: except Exception as e:
self.logger.error(f"处理chunk时发生未知错误: {e.__repr__()}") self.logger.error(f"处理chunk时发生未知错误: {e.__repr__()}")
# 重建最终列表
result = [] result = []
last_end = 0 last_end = 0
ls = list(indexed_translated.values()) ls = list(indexed_translated.values())
@@ -308,9 +341,8 @@ class SegmentsTranslateAgent(Agent):
async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]: async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]:
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments, indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
chunk_size) chunk_size)
# MT模式下直接发送纯净JSON不添加额外提示词
if self.is_mt_mode: if self.is_mt_mode:
prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks] prompts = [_chunk_to_mt_prompt(chunk) for chunk in chunks]
else: else:
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks] prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
@@ -326,7 +358,6 @@ class SegmentsTranslateAgent(Agent):
continue continue
for key, val in chunk.items(): for key, val in chunk.items():
if key in indexed_translated: if key in indexed_translated:
# 此处不再需要 str(val),因为 _result_handler 已经处理好了
indexed_translated[key] = val indexed_translated[key] = val
else: else:
self.logger.warning(f"在结果chunk中发现未知键 '{key}',已忽略。") self.logger.warning(f"在结果chunk中发现未知键 '{key}',已忽略。")
@@ -335,7 +366,6 @@ class SegmentsTranslateAgent(Agent):
except Exception as e: except Exception as e:
self.logger.error(f"处理chunk时发生未知错误: {e.__repr__()}") self.logger.error(f"处理chunk时发生未知错误: {e.__repr__()}")
# 重建最终列表
result = [] result = []
last_end = 0 last_end = 0
ls = list(indexed_translated.values()) ls = list(indexed_translated.values())

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,5 @@
<!DOCTYPE html> <!DOCTYPE html>
<html lang="zh-CN" data-bs-theme="auto"> <html lang="en" data-bs-theme="auto">
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
@@ -159,15 +159,6 @@
white-space: pre; white-space: pre;
} }
.bottom-left-controls {
position: fixed;
bottom: 1rem;
left: 1rem;
z-index: 1050;
display: flex;
gap: 0.5rem;
}
.step-number { .step-number {
margin-right: 0.25rem; margin-right: 0.25rem;
} }
@@ -226,6 +217,31 @@
<div class="d-flex align-items-center"> <div class="d-flex align-items-center">
<h4 class="mb-0 me-3 fw-bold" :title="t('pageTitle')">DocuTranslate</h4> <h4 class="mb-0 me-3 fw-bold" :title="t('pageTitle')">DocuTranslate</h4>
</div> </div>
<!-- Language & Theme Controls -->
<div class="d-flex gap-2">
<div class="dropdown">
<button class="btn btn-outline-secondary btn-sm dropdown-toggle" type="button" data-bs-toggle="dropdown">
<i class="bi bi-translate me-1"></i><span>{{ {zh:'中文',en:'English',id:'Bahasa'}[currentLang] || 'Language' }}</span>
</button>
<ul class="dropdown-menu dropdown-menu-end">
<li><a class="dropdown-item" :class="{active: currentLang==='zh'}" href="#"
@click.prevent="setLang('zh')">中文</a></li>
<li><a class="dropdown-item" :class="{active: currentLang==='en'}" href="#"
@click.prevent="setLang('en')">English</a></li>
<li><a class="dropdown-item" :class="{active: currentLang==='id'}" href="#"
@click.prevent="setLang('id')">Bahasa Indonesia</a></li>
</ul>
</div>
<div class="dropdown">
<button class="btn btn-outline-secondary btn-sm dropdown-toggle" type="button" data-bs-toggle="dropdown"><i
class="bi bi-circle-half"></i></button>
<ul class="dropdown-menu dropdown-menu-end">
<li><button class="dropdown-item" @click="setTheme('light')"><i class="bi bi-sun-fill me-2"></i>Light</button></li>
<li><button class="dropdown-item" @click="setTheme('dark')"><i class="bi bi-moon-stars-fill me-2"></i>Dark</button></li>
<li><button class="dropdown-item" @click="setTheme('auto')"><i class="bi bi-circle-half me-2"></i>Auto</button></li>
</ul>
</div>
</div>
</div> </div>
<form id="translateForm" @submit.prevent> <form id="translateForm" @submit.prevent>
@@ -923,40 +939,7 @@
</div> </div>
<iframe id="printFrame" ref="printFrame" style="display: none;"></iframe> <iframe id="printFrame" ref="printFrame" style="display: none;"></iframe>
<!-- Controls --> <!-- Header controls now in left panel top-right -->
<div class="bottom-left-controls">
<div class="dropdown">
<button class="btn btn-secondary dropdown-toggle" type="button" data-bs-toggle="dropdown"><i
class="bi bi-translate"></i></button>
<ul class="dropdown-menu">
<li><a class="dropdown-item" :class="{active: currentLang==='zh'}" href="#"
@click.prevent="setLang('zh')">中文</a></li>
<li><a class="dropdown-item" :class="{active: currentLang==='en'}" href="#"
@click.prevent="setLang('en')">English</a></li>
<li><a class="dropdown-item" :class="{active: currentLang==='vi'}" href="#"
@click.prevent="setLang('vi')">Tiếng Việt</a></li>
</ul>
</div>
<div class="dropdown">
<button class="btn btn-secondary dropdown-toggle" type="button" data-bs-toggle="dropdown"><i
class="bi bi-circle-half"></i></button>
<ul class="dropdown-menu">
<li>
<button class="dropdown-item" @click="setTheme('light')"><i class="bi bi-sun-fill me-2"></i> Light
</button>
</li>
<li>
<button class="dropdown-item" @click="setTheme('dark')"><i class="bi bi-moon-stars-fill me-2"></i>
Dark
</button>
</li>
<li>
<button class="dropdown-item" @click="setTheme('auto')"><i class="bi bi-circle-half me-2"></i> Auto
</button>
</li>
</ul>
</div>
</div>
</div> </div>
<script src="/static/bootstrap.bundle.min.js"></script> <script src="/static/bootstrap.bundle.min.js"></script>
@@ -1048,7 +1031,14 @@
components: {SliderControl, ModelPresetSelector}, components: {SliderControl, ModelPresetSelector},
setup() { setup() {
const version = ref(""); const version = ref("");
const currentLang = ref(localStorage.getItem('ui_language') || 'zh'); function detectBrowserLang() {
const nav = navigator.language || navigator.userLanguage || '';
const lang = nav.split('-')[0].toLowerCase();
if (['zh', 'en', 'id'].includes(lang)) return lang;
if (lang === 'zh') return 'zh';
return 'en'; // default to English for unrecognized languages
}
const currentLang = ref(localStorage.getItem('ui_language') || detectBrowserLang());
const i18nData = ref({}); const i18nData = ref({});
const glossaryData = ref({}); const glossaryData = ref({});
const tasks = ref([]); const tasks = ref([]);
@@ -1868,7 +1858,8 @@
const setLang = (l) => { const setLang = (l) => {
currentLang.value = l; currentLang.value = l;
localStorage.setItem('ui_language', l); localStorage.setItem('ui_language', l);
document.documentElement.lang = l === 'zh' ? 'zh-CN' : 'en'; const langMap = {zh: 'zh-CN', en: 'en', id: 'id'};
document.documentElement.lang = langMap[l] || 'en';
}; };
const setTheme = (t) => { const setTheme = (t) => {
localStorage.setItem('theme', t); localStorage.setItem('theme', t);

View File

@@ -326,33 +326,58 @@ class DocxTranslator(AiTranslator):
runs = element_info["runs"] runs = element_info["runs"]
if not runs: return if not runs: return
first_real_run_index = -1 # Filter to runs that are still attached to the document
# 找到第一个可以写入文本的run valid_runs = []
for i, run in enumerate(runs): for run in runs:
if run.element.getparent() is not None: if run.element.getparent() is not None:
# 如果 run 是副本的一部分,其 _parent 可能仍然指向原始文档的段落
# 但我们需要确保它与 element_info["paragraph"] 同步
run._parent = element_info["paragraph"] run._parent = element_info["paragraph"]
run.text = final_text valid_runs.append(run)
first_real_run_index = i
break
# 如果没有找到有效的run例如它们都已被删除则记录警告 if not valid_runs:
if first_real_run_index == -1:
self.logger.warning(f"无法应用翻译 '{final_text}'因为找不到有效的run。") self.logger.warning(f"无法应用翻译 '{final_text}'因为找不到有效的run。")
return return
# 删除所有后续的run因为它们的文本已经被合并到第一个run中了 if len(valid_runs) == 1:
for i in range(first_real_run_index + 1, len(runs)): # Single run: just write the translation
run = runs[i] valid_runs[0].text = final_text
return
# Multiple runs: proportionally distribute translated text to preserve formatting
orig_lengths = [len(r.text) for r in valid_runs]
total_orig = sum(orig_lengths)
final_len = len(final_text)
if total_orig == 0:
valid_runs[0].text = final_text
for run in valid_runs[1:]:
self._remove_run_element(run)
return
# Distribute characters proportionally
char_pos = 0
for i, run in enumerate(valid_runs):
if i == len(valid_runs) - 1:
# Last run gets all remaining text
run.text = final_text[char_pos:]
else:
ratio = orig_lengths[i] / total_orig
run_char_count = max(1, round(final_len * ratio))
run_char_count = min(run_char_count, final_len - char_pos - (len(valid_runs) - i - 1))
if run_char_count <= 0:
# Remove runs that would get zero characters
self._remove_run_element(run)
continue
run.text = final_text[char_pos:char_pos + run_char_count]
char_pos += run_char_count
def _remove_run_element(self, run) -> None:
"""Safely remove a run element from its parent."""
parent_element = run.element.getparent() parent_element = run.element.getparent()
if parent_element is not None: if parent_element is not None:
try: try:
parent_element.remove(run.element) parent_element.remove(run.element)
except ValueError: except ValueError:
# 在某些复杂情况下一个run可能已经被其父元素隐式删除
self.logger.debug(f"尝试删除一个不存在的run元素。这通常是安全的。") self.logger.debug(f"尝试删除一个不存在的run元素。这通常是安全的。")
pass
# ---------- FIX START: 新增用于清理副本段落的辅助方法 ---------- # ---------- FIX START: 新增用于清理副本段落的辅助方法 ----------
def _prune_unwanted_elements_from_copy(self, p_element: OxmlElement): def _prune_unwanted_elements_from_copy(self, p_element: OxmlElement):