Compare commits
9 Commits
9d8eacf0b4
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 2d39c009cf | |||
| 4893163221 | |||
| 4f6bd1bc7b | |||
| a8b8c416dd | |||
| 4cf1a8c67d | |||
| 52bb8858c8 | |||
| 7f02abae0e | |||
| 97b7b20565 | |||
| 8a5f62342a |
@@ -5,7 +5,7 @@ ProviderType: TypeAlias = Literal["ollama", "bigmodel", "aliyuncs", "volces", "g
|
||||
def get_provider_by_domain(domain:str)->ProviderType:
|
||||
if domain == "open.bigmodel.cn":
|
||||
return "bigmodel"
|
||||
elif domain == "dashscope.aliyuncs.com":
|
||||
elif "dashscope.aliyuncs.com" in domain:
|
||||
return "aliyuncs"
|
||||
elif domain == "ark.cn-beijing.volces.com":
|
||||
return "volces"
|
||||
|
||||
@@ -15,10 +15,9 @@ from docutranslate.agents.agent import PartialAgentResultError, AgentResultError
|
||||
from docutranslate.glossary.glossary import Glossary
|
||||
from docutranslate.utils.json_utils import segments2json_chunks, fix_json_string
|
||||
|
||||
|
||||
def generate_prompt(json_segments: str, to_lang: str):
|
||||
return f"""
|
||||
You will receive a sequence of original text segments to be translated, represented in JSON format. The keys are segment IDs, and the values are the text content to be translated.
|
||||
You will receive a sequence of original text segments to be translated, represented in JSON format. The keys are segment IDs, and the values are the text content to be translated.
|
||||
Here is the input:
|
||||
|
||||
<input>
|
||||
@@ -58,8 +57,8 @@ Below is an example of how merging should be done when necessary:
|
||||
input:
|
||||
```json
|
||||
{{
|
||||
"EXAMPLE_KEY_1":"汤姆说:“杰克你",
|
||||
"EXAMPLE_KEY_2":"好”。"
|
||||
"EXAMPLE_KEY_1":"汤姆说:\"杰克你",
|
||||
"EXAMPLE_KEY_2":"好\"。"
|
||||
}}
|
||||
```
|
||||
output:
|
||||
@@ -121,22 +120,12 @@ class SegmentsTranslateAgent(Agent):
|
||||
return system_prompt, prompt
|
||||
|
||||
def _result_handler(self, result: str, origin_prompt: str, logger: Logger):
|
||||
"""
|
||||
处理成功的API响应。
|
||||
- 如果键完全匹配,返回翻译结果。
|
||||
- 如果键不匹配,构造一个部分成功的结果,并通过 PartialTranslationError 异常抛出,以触发重试。
|
||||
- 其他错误(如JSON解析失败、模型偷懒)则抛出普通 ValueError 触发重试。
|
||||
- MT模式下,如果返回的是纯文本而非JSON,将其按行分割并映射到原始键。
|
||||
"""
|
||||
# MT模式下直接解析origin_prompt为JSON(纯净JSON,没有<input>包装)
|
||||
if self.is_mt_mode:
|
||||
original_segments = origin_prompt
|
||||
else:
|
||||
original_segments = get_original_segments(origin_prompt)
|
||||
"""处理非MT模式的JSON翻译响应。"""
|
||||
# --- JSON-based ---
|
||||
original_segments = get_original_segments(origin_prompt)
|
||||
result = get_target_segments(result)
|
||||
if result == "":
|
||||
if original_segments.strip() != "":
|
||||
# print(f"【测试】origin_prompt:\n{origin_prompt}\nresult:\n{result}")
|
||||
raise AgentResultError("result为空值但原文不为空")
|
||||
return {}
|
||||
try:
|
||||
@@ -144,37 +133,6 @@ class SegmentsTranslateAgent(Agent):
|
||||
original_chunk = json_repair.loads(original_segments)
|
||||
repaired_result = json_repair.loads(result)
|
||||
|
||||
# MT模式兼容:处理各种非标准返回格式
|
||||
if self.is_mt_mode:
|
||||
# 如果是列表,尝试合并所有字典
|
||||
if isinstance(repaired_result, list):
|
||||
logger.debug(f"[MT模式] 返回结果是列表,包含 {len(repaired_result)} 个元素")
|
||||
merged_result = {}
|
||||
for item in repaired_result:
|
||||
if isinstance(item, dict):
|
||||
merged_result.update(item)
|
||||
repaired_result = merged_result
|
||||
|
||||
# 如果返回的是纯文本(字符串),尝试将其映射到原始键
|
||||
if isinstance(repaired_result, str):
|
||||
original_keys = list(original_chunk.keys())
|
||||
# 按行分割结果,去除空行
|
||||
result_lines = [line.strip() for line in repaired_result.split('\n') if line.strip()]
|
||||
|
||||
# 如果只有一行结果但多个键,将整个结果分配给第一个键,其余为空
|
||||
if len(result_lines) == 1 and len(original_keys) > 1:
|
||||
repaired_result = {original_keys[0]: result_lines[0]}
|
||||
for key in original_keys[1:]:
|
||||
repaired_result[key] = ""
|
||||
# 如果结果行数与键数匹配,逐行对应
|
||||
elif len(result_lines) == len(original_keys):
|
||||
repaired_result = {original_keys[i]: result_lines[i] for i in range(len(original_keys))}
|
||||
# 如果结果行数不匹配,将所有结果合并给第一个键
|
||||
else:
|
||||
repaired_result = {original_keys[0]: repaired_result}
|
||||
for key in original_keys[1:]:
|
||||
repaired_result[key] = ""
|
||||
|
||||
if not isinstance(repaired_result, dict):
|
||||
raise AgentResultError(f"Agent返回结果不是dict的json形式, result: {result}")
|
||||
|
||||
@@ -184,9 +142,7 @@ class SegmentsTranslateAgent(Agent):
|
||||
original_keys = set(original_chunk.keys())
|
||||
result_keys = set(repaired_result.keys())
|
||||
|
||||
# 如果键不完全匹配
|
||||
if original_keys != result_keys:
|
||||
# 仍然先构造一个最完整的“部分结果”
|
||||
final_chunk = {}
|
||||
common_keys = original_keys.intersection(result_keys)
|
||||
missing_keys = original_keys - result_keys
|
||||
@@ -201,76 +157,161 @@ class SegmentsTranslateAgent(Agent):
|
||||
for key in missing_keys:
|
||||
final_chunk[key] = str(original_chunk[key])
|
||||
|
||||
raise PartialAgentResultError("键不匹配,触发重试", partial_result=final_chunk,
|
||||
append_prompt=f"\nBe careful not to omit any keys from the input; do not combine sentences when translating.\n")
|
||||
|
||||
# 抛出自定义异常,将部分结果和错误信息一起传递出去
|
||||
raise PartialAgentResultError("键不匹配,触发重试", partial_result=final_chunk,append_prompt=f"\nBe careful not to omit any keys from the input; do not combine sentences when translating.\n")
|
||||
|
||||
# 如果键完全匹配(理想情况),正常返回
|
||||
for key, value in repaired_result.items():
|
||||
repaired_result[key] = str(value)
|
||||
|
||||
return repaired_result
|
||||
|
||||
except (RuntimeError, JSONDecodeError) as e:
|
||||
# MT模式兼容:如果JSON解析失败,尝试将结果作为纯文本处理
|
||||
if self.is_mt_mode:
|
||||
try:
|
||||
original_chunk = json_repair.loads(original_segments)
|
||||
original_keys = list(original_chunk.keys())
|
||||
result_lines = [line.strip() for line in result.split('\n') if line.strip()]
|
||||
|
||||
if len(result_lines) == 1 and len(original_keys) > 1:
|
||||
repaired_result = {original_keys[0]: result_lines[0]}
|
||||
for key in original_keys[1:]:
|
||||
repaired_result[key] = ""
|
||||
elif len(result_lines) == len(original_keys):
|
||||
repaired_result = {original_keys[i]: result_lines[i] for i in range(len(original_keys))}
|
||||
else:
|
||||
repaired_result = {original_keys[0]: result}
|
||||
for key in original_keys[1:]:
|
||||
repaired_result[key] = ""
|
||||
|
||||
# 验证结果
|
||||
if set(repaired_result.keys()) != set(original_chunk.keys()):
|
||||
raise AgentResultError(f"MT模式解析后键不匹配")
|
||||
|
||||
return repaired_result
|
||||
except Exception as mt_e:
|
||||
raise AgentResultError(f"MT模式纯文本处理失败: {mt_e.__repr__()}")
|
||||
|
||||
# 对于JSON解析等硬性错误,继续抛出普通ValueError
|
||||
raise AgentResultError(f"结果处理失败: {e.__repr__()}")
|
||||
|
||||
def _error_result_handler(self, origin_prompt: str, logger: Logger):
|
||||
"""
|
||||
处理在所有重试后仍然失败的请求。
|
||||
作为备用方案,返回原文内容,并将所有值转换为字符串。
|
||||
"""
|
||||
# MT模式下直接解析origin_prompt为JSON(纯净JSON,没有<input>包装)
|
||||
if self.is_mt_mode:
|
||||
original_segments = origin_prompt
|
||||
else:
|
||||
original_segments = get_original_segments(origin_prompt)
|
||||
"""非MT模式: 所有重试失败后返回原文。"""
|
||||
original_segments = get_original_segments(origin_prompt)
|
||||
if original_segments == "":
|
||||
return {}
|
||||
try:
|
||||
original_chunk = json_repair.loads(original_segments)
|
||||
# 此处逻辑保留,作为最终的兜底方案
|
||||
for key, value in original_chunk.items():
|
||||
original_chunk[key] = f"{value}"
|
||||
return original_chunk
|
||||
except (RuntimeError, JSONDecodeError):
|
||||
logger.error(f"原始prompt也不是有效的json格式: {original_segments}")
|
||||
# 如果原始prompt本身也无效,返回一个清晰的错误对象
|
||||
return {"error": f"{original_segments}"}
|
||||
|
||||
def _mt_json_result_handler(self, result: str, origin_prompt: str, logger: Logger) -> dict:
|
||||
"""MT JSON batch: 解析JSON响应,返回{key: translation}。"""
|
||||
try:
|
||||
original_chunk = json_repair.loads(origin_prompt)
|
||||
repaired = json_repair.loads(result)
|
||||
except (RuntimeError, JSONDecodeError):
|
||||
raise AgentResultError("MT JSON parse failed")
|
||||
|
||||
if not isinstance(repaired, dict):
|
||||
raise AgentResultError(f"MT JSON result not dict: {type(repaired)}")
|
||||
|
||||
out = {}
|
||||
for key in original_chunk:
|
||||
out[key] = str(repaired.get(key, original_chunk[key]))
|
||||
return out
|
||||
|
||||
def _mt_json_error_handler(self, origin_prompt: str, logger: Logger) -> dict:
|
||||
"""MT JSON batch error: 返回原文。"""
|
||||
try:
|
||||
original_chunk = json_repair.loads(origin_prompt)
|
||||
return {k: str(v) for k, v in original_chunk.items()}
|
||||
except Exception:
|
||||
return {"0": origin_prompt}
|
||||
|
||||
def _mt_individual_result_handler(self, result: str, origin_prompt: str, logger: Logger) -> str:
|
||||
"""MT individual: 直接返回翻译结果。"""
|
||||
return result.strip()
|
||||
|
||||
def _mt_individual_error_handler(self, origin_prompt: str, logger: Logger) -> str:
|
||||
"""MT individual error: 返回原文。"""
|
||||
return origin_prompt
|
||||
|
||||
def _apply_mt_batch_results(self, segments: list[str], batch_results: list,
|
||||
batch_indices: list[list[int]]) -> list[str]:
|
||||
"""应用批处理结果。对计数不匹配的批次,逐条回退重译。"""
|
||||
all_translated = [""] * len(segments)
|
||||
mismatch_batches = []
|
||||
|
||||
for batch_parts, indices in zip(batch_results, batch_indices):
|
||||
if len(batch_parts) == len(indices):
|
||||
for j, idx in enumerate(indices):
|
||||
all_translated[idx] = batch_parts[j]
|
||||
else:
|
||||
self.logger.warning(
|
||||
f"MT batch mismatch: got {len(batch_parts)} parts, expected {len(indices)}. "
|
||||
f"Falling back to individual translation."
|
||||
)
|
||||
mismatch_batches.append(indices)
|
||||
|
||||
return all_translated, mismatch_batches
|
||||
|
||||
def _retranslate_mismatched(self, segments: list[str],
|
||||
mismatch_batches: list[list[int]]) -> list[str]:
|
||||
"""对计数不匹配的批次,逐条重新翻译。"""
|
||||
# Collect all mismatched indices
|
||||
all_mismatched = []
|
||||
for indices in mismatch_batches:
|
||||
all_mismatched.extend(indices)
|
||||
|
||||
if not all_mismatched:
|
||||
return []
|
||||
|
||||
self.logger.info(f"Retranslating {len(all_mismatched)} mismatched segments individually")
|
||||
individual_segments = [segments[i] for i in all_mismatched]
|
||||
individual_results = super().send_prompts(
|
||||
prompts=individual_segments,
|
||||
result_handler=self._mt_individual_result_handler,
|
||||
error_result_handler=self._mt_individual_error_handler,
|
||||
)
|
||||
|
||||
result_map = {}
|
||||
for idx, trans in zip(all_mismatched, individual_results):
|
||||
result_map[idx] = trans
|
||||
return result_map
|
||||
|
||||
def send_segments(self, segments: list[str], chunk_size: int) -> list[str]:
|
||||
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
|
||||
# MT模式下直接发送纯净JSON,不添加额外提示词
|
||||
if self.is_mt_mode:
|
||||
prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks]
|
||||
else:
|
||||
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
|
||||
if not segments:
|
||||
return []
|
||||
# JSON batching: 616 segments → ~12 JSON chunks
|
||||
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
|
||||
prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
|
||||
translated_chunks = super().send_prompts(
|
||||
prompts=prompts,
|
||||
result_handler=self._mt_json_result_handler,
|
||||
error_result_handler=self._mt_json_error_handler,
|
||||
)
|
||||
|
||||
# Detect unchanged segments (MT model returned original text)
|
||||
indexed_translated = indexed_originals.copy()
|
||||
failed_indices = []
|
||||
for chunk_result, chunk_original in zip(translated_chunks, chunks):
|
||||
if not isinstance(chunk_result, dict):
|
||||
for k in chunk_original:
|
||||
failed_indices.append(int(k))
|
||||
continue
|
||||
for key in chunk_original:
|
||||
val = chunk_result.get(key, "")
|
||||
if isinstance(val, str) and val.strip() == str(chunk_original[key]).strip():
|
||||
failed_indices.append(int(key))
|
||||
indexed_translated[key] = str(val)
|
||||
|
||||
# Retry failed segments individually
|
||||
if failed_indices:
|
||||
self.logger.info(
|
||||
f"MT JSON batch: {len(failed_indices)}/{len(segments)} segments unchanged, retrying individually"
|
||||
)
|
||||
retry_segments = [segments[i] for i in failed_indices]
|
||||
retry_results = super().send_prompts(
|
||||
prompts=retry_segments,
|
||||
result_handler=self._mt_individual_result_handler,
|
||||
error_result_handler=self._mt_individual_error_handler,
|
||||
)
|
||||
for idx, trans in zip(failed_indices, retry_results):
|
||||
indexed_translated[str(idx)] = trans
|
||||
|
||||
# Reconstruct result list
|
||||
result = []
|
||||
last_end = 0
|
||||
ls = list(indexed_translated.values())
|
||||
for start, end in merged_indices_list:
|
||||
result.extend(ls[last_end:start])
|
||||
result.append("".join(map(str, ls[start:end])))
|
||||
last_end = end
|
||||
result.extend(ls[last_end:])
|
||||
return result
|
||||
|
||||
# Non-MT mode: JSON batch translation
|
||||
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
|
||||
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
|
||||
translated_chunks = super().send_prompts(prompts=prompts, json_format=self.force_json,
|
||||
pre_send_handler=self._pre_send_handler,
|
||||
result_handler=self._result_handler,
|
||||
@@ -292,7 +333,6 @@ class SegmentsTranslateAgent(Agent):
|
||||
except Exception as e:
|
||||
self.logger.error(f"处理chunk时发生未知错误: {e.__repr__()}")
|
||||
|
||||
# 重建最终列表
|
||||
result = []
|
||||
last_end = 0
|
||||
ls = list(indexed_translated.values())
|
||||
@@ -306,14 +346,63 @@ class SegmentsTranslateAgent(Agent):
|
||||
return result
|
||||
|
||||
async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]:
|
||||
if self.is_mt_mode:
|
||||
if not segments:
|
||||
return []
|
||||
# JSON batching: 616 segments → ~12 JSON chunks
|
||||
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(
|
||||
segments2json_chunks, segments, chunk_size
|
||||
)
|
||||
prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
|
||||
translated_chunks = await super().send_prompts_async(
|
||||
prompts=prompts,
|
||||
result_handler=self._mt_json_result_handler,
|
||||
error_result_handler=self._mt_json_error_handler,
|
||||
)
|
||||
|
||||
# Detect unchanged segments
|
||||
indexed_translated = indexed_originals.copy()
|
||||
failed_indices = []
|
||||
for chunk_result, chunk_original in zip(translated_chunks, chunks):
|
||||
if not isinstance(chunk_result, dict):
|
||||
for k in chunk_original:
|
||||
failed_indices.append(int(k))
|
||||
continue
|
||||
for key in chunk_original:
|
||||
val = chunk_result.get(key, "")
|
||||
if isinstance(val, str) and val.strip() == str(chunk_original[key]).strip():
|
||||
failed_indices.append(int(key))
|
||||
indexed_translated[key] = str(val)
|
||||
|
||||
# Retry failed segments individually
|
||||
if failed_indices:
|
||||
self.logger.info(
|
||||
f"MT JSON batch: {len(failed_indices)}/{len(segments)} segments unchanged, retrying individually"
|
||||
)
|
||||
retry_segments = [segments[i] for i in failed_indices]
|
||||
retry_results = await super().send_prompts_async(
|
||||
prompts=retry_segments,
|
||||
result_handler=self._mt_individual_result_handler,
|
||||
error_result_handler=self._mt_individual_error_handler,
|
||||
)
|
||||
for idx, trans in zip(failed_indices, retry_results):
|
||||
indexed_translated[str(idx)] = trans
|
||||
|
||||
# Reconstruct result list
|
||||
result = []
|
||||
last_end = 0
|
||||
ls = list(indexed_translated.values())
|
||||
for start, end in merged_indices_list:
|
||||
result.extend(ls[last_end:start])
|
||||
result.append("".join(map(str, ls[start:end])))
|
||||
last_end = end
|
||||
result.extend(ls[last_end:])
|
||||
return result
|
||||
|
||||
# Non-MT mode: JSON batch translation
|
||||
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
|
||||
chunk_size)
|
||||
# MT模式下直接发送纯净JSON,不添加额外提示词
|
||||
if self.is_mt_mode:
|
||||
prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks]
|
||||
else:
|
||||
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
|
||||
|
||||
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
|
||||
translated_chunks = await super().send_prompts_async(prompts=prompts, force_json=self.force_json,
|
||||
pre_send_handler=self._pre_send_handler,
|
||||
result_handler=self._result_handler,
|
||||
@@ -326,7 +415,6 @@ class SegmentsTranslateAgent(Agent):
|
||||
continue
|
||||
for key, val in chunk.items():
|
||||
if key in indexed_translated:
|
||||
# 此处不再需要 str(val),因为 _result_handler 已经处理好了
|
||||
indexed_translated[key] = val
|
||||
else:
|
||||
self.logger.warning(f"在结果chunk中发现未知键 '{key}',已忽略。")
|
||||
@@ -335,7 +423,6 @@ class SegmentsTranslateAgent(Agent):
|
||||
except Exception as e:
|
||||
self.logger.error(f"处理chunk时发生未知错误: {e.__repr__()}")
|
||||
|
||||
# 重建最终列表
|
||||
result = []
|
||||
last_end = 0
|
||||
ls = list(indexed_translated.values())
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -1,9 +1,9 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="zh-CN" data-bs-theme="auto">
|
||||
<html lang="en" data-bs-theme="auto">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>DocuTranslate - 交互式文档翻译</title>
|
||||
<title>DocuTranslate</title>
|
||||
<link rel="icon" href="/static/favicon.ico" type="image/x-icon">
|
||||
<!-- Bootstrap CSS -->
|
||||
<link href="/static/bootstrap.css" rel="stylesheet" crossorigin="anonymous">
|
||||
@@ -159,15 +159,6 @@
|
||||
white-space: pre;
|
||||
}
|
||||
|
||||
.bottom-left-controls {
|
||||
position: fixed;
|
||||
bottom: 1rem;
|
||||
left: 1rem;
|
||||
z-index: 1050;
|
||||
display: flex;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
|
||||
.step-number {
|
||||
margin-right: 0.25rem;
|
||||
}
|
||||
@@ -226,6 +217,31 @@
|
||||
<div class="d-flex align-items-center">
|
||||
<h4 class="mb-0 me-3 fw-bold" :title="t('pageTitle')">DocuTranslate</h4>
|
||||
</div>
|
||||
<!-- Language & Theme Controls -->
|
||||
<div class="d-flex gap-2">
|
||||
<div class="dropdown">
|
||||
<button class="btn btn-outline-secondary btn-sm dropdown-toggle" type="button" data-bs-toggle="dropdown">
|
||||
<i class="bi bi-translate me-1"></i><span>{{ {zh:'中文',en:'English',id:'Bahasa'}[currentLang] || 'Language' }}</span>
|
||||
</button>
|
||||
<ul class="dropdown-menu dropdown-menu-end">
|
||||
<li><a class="dropdown-item" :class="{active: currentLang==='zh'}" href="#"
|
||||
@click.prevent="setLang('zh')">中文</a></li>
|
||||
<li><a class="dropdown-item" :class="{active: currentLang==='en'}" href="#"
|
||||
@click.prevent="setLang('en')">English</a></li>
|
||||
<li><a class="dropdown-item" :class="{active: currentLang==='id'}" href="#"
|
||||
@click.prevent="setLang('id')">Bahasa Indonesia</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="dropdown">
|
||||
<button class="btn btn-outline-secondary btn-sm dropdown-toggle" type="button" data-bs-toggle="dropdown"><i
|
||||
class="bi bi-circle-half"></i></button>
|
||||
<ul class="dropdown-menu dropdown-menu-end">
|
||||
<li><button class="dropdown-item" @click="setTheme('light')"><i class="bi bi-sun-fill me-2"></i>Light</button></li>
|
||||
<li><button class="dropdown-item" @click="setTheme('dark')"><i class="bi bi-moon-stars-fill me-2"></i>Dark</button></li>
|
||||
<li><button class="dropdown-item" @click="setTheme('auto')"><i class="bi bi-circle-half me-2"></i>Auto</button></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<form id="translateForm" @submit.prevent>
|
||||
@@ -923,40 +939,7 @@
|
||||
</div>
|
||||
<iframe id="printFrame" ref="printFrame" style="display: none;"></iframe>
|
||||
|
||||
<!-- Controls -->
|
||||
<div class="bottom-left-controls">
|
||||
<div class="dropdown">
|
||||
<button class="btn btn-secondary dropdown-toggle" type="button" data-bs-toggle="dropdown"><i
|
||||
class="bi bi-translate"></i></button>
|
||||
<ul class="dropdown-menu">
|
||||
<li><a class="dropdown-item" :class="{active: currentLang==='zh'}" href="#"
|
||||
@click.prevent="setLang('zh')">中文</a></li>
|
||||
<li><a class="dropdown-item" :class="{active: currentLang==='en'}" href="#"
|
||||
@click.prevent="setLang('en')">English</a></li>
|
||||
<li><a class="dropdown-item" :class="{active: currentLang==='vi'}" href="#"
|
||||
@click.prevent="setLang('vi')">Tiếng Việt</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="dropdown">
|
||||
<button class="btn btn-secondary dropdown-toggle" type="button" data-bs-toggle="dropdown"><i
|
||||
class="bi bi-circle-half"></i></button>
|
||||
<ul class="dropdown-menu">
|
||||
<li>
|
||||
<button class="dropdown-item" @click="setTheme('light')"><i class="bi bi-sun-fill me-2"></i> Light
|
||||
</button>
|
||||
</li>
|
||||
<li>
|
||||
<button class="dropdown-item" @click="setTheme('dark')"><i class="bi bi-moon-stars-fill me-2"></i>
|
||||
Dark
|
||||
</button>
|
||||
</li>
|
||||
<li>
|
||||
<button class="dropdown-item" @click="setTheme('auto')"><i class="bi bi-circle-half me-2"></i> Auto
|
||||
</button>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<!-- Header controls now in left panel top-right -->
|
||||
</div>
|
||||
|
||||
<script src="/static/bootstrap.bundle.min.js"></script>
|
||||
@@ -1048,7 +1031,14 @@
|
||||
components: {SliderControl, ModelPresetSelector},
|
||||
setup() {
|
||||
const version = ref("");
|
||||
const currentLang = ref(localStorage.getItem('ui_language') || 'zh');
|
||||
function detectBrowserLang() {
|
||||
const nav = navigator.language || navigator.userLanguage || '';
|
||||
const lang = nav.split('-')[0].toLowerCase();
|
||||
if (['zh', 'en', 'id'].includes(lang)) return lang;
|
||||
if (lang === 'zh') return 'zh';
|
||||
return 'en'; // default to English for unrecognized languages
|
||||
}
|
||||
const currentLang = ref(localStorage.getItem('ui_language') || detectBrowserLang());
|
||||
const i18nData = ref({});
|
||||
const glossaryData = ref({});
|
||||
const tasks = ref([]);
|
||||
@@ -1247,7 +1237,7 @@
|
||||
};
|
||||
|
||||
const t = (k) => {
|
||||
const dict = i18nData.value[currentLang.value] || i18nData.value['zh'] || {};
|
||||
const dict = i18nData.value[currentLang.value] || i18nData.value['en'] || {};
|
||||
return dict[k] || k;
|
||||
};
|
||||
const capitalize = (s) => s.charAt(0).toUpperCase() + s.slice(1);
|
||||
@@ -1868,7 +1858,10 @@
|
||||
const setLang = (l) => {
|
||||
currentLang.value = l;
|
||||
localStorage.setItem('ui_language', l);
|
||||
document.documentElement.lang = l === 'zh' ? 'zh-CN' : 'en';
|
||||
const langMap = {zh: 'zh-CN', en: 'en', id: 'id'};
|
||||
document.documentElement.lang = langMap[l] || 'en';
|
||||
const dict = i18nData.value[l] || i18nData.value['en'] || {};
|
||||
document.title = dict['pageTitle'] || 'DocuTranslate';
|
||||
};
|
||||
const setTheme = (t) => {
|
||||
localStorage.setItem('theme', t);
|
||||
@@ -1882,6 +1875,12 @@
|
||||
const res = await fetch("/static/i18nData.json");
|
||||
i18nData.value = await res.json();
|
||||
|
||||
// Backward compat: ensure id lang pack exists on servers not yet updated
|
||||
if (!i18nData.value.id) {
|
||||
console.warn('id language pack missing, using en as fallback');
|
||||
i18nData.value.id = i18nData.value.en || {};
|
||||
}
|
||||
|
||||
// Add new missing translations for Mineru Deploy
|
||||
const extraZh = {
|
||||
mineruDeployParseMethodLabel: "解析方法 (Parse Method)",
|
||||
@@ -1895,6 +1894,7 @@
|
||||
if(i18nData.value.en) Object.assign(i18nData.value.en, extraEn);
|
||||
|
||||
} catch (e) {
|
||||
console.error("i18n load failed", e);
|
||||
i18nData.value = {
|
||||
zh: {
|
||||
pageTitle: "DocuTranslate",
|
||||
@@ -1930,10 +1930,32 @@
|
||||
mineruDeployServerUrlPlaceholder: "http://127.0.0.1:30000",
|
||||
mineruDeployParseMethodLabel: "Parse Method",
|
||||
mineruDeployTableEnableLabel: "Table Recognition"
|
||||
},
|
||||
id: {
|
||||
pageTitle: "DocuTranslate",
|
||||
tutorialBtn: "Tutorial",
|
||||
projectContributeBtn: "Kolaborasi",
|
||||
workflowTitle: "Pilih Alur Kerja",
|
||||
autoWorkflowLabel: "Pilih Otomatis",
|
||||
modelPresetLabel: "Preset Model",
|
||||
modelPresetPlaceholder: "Pilih preset model",
|
||||
modelPresetEmpty: "Konfigurasi preset di server",
|
||||
modelPresetRuntimeHint: "Provider, endpoint, dan API key akan dibaca dari environment server.",
|
||||
workflowOptionPptx: "Presentasi PPTX",
|
||||
pptxSettingsTitleText: "Pengaturan PPTX",
|
||||
mineruDeployServerUrlLabel: "Server URL",
|
||||
mineruDeployLangListLabel: "Daftar Bahasa",
|
||||
mineruDeployServerUrlPlaceholder: "http://127.0.0.1:30000",
|
||||
mineruDeployParseMethodLabel: "Parse Method",
|
||||
mineruDeployTableEnableLabel: "Table Recognition"
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Set initial page title based on detected language
|
||||
const initDict = i18nData.value[currentLang.value] || i18nData.value['en'] || {};
|
||||
document.title = initDict['pageTitle'] || 'DocuTranslate';
|
||||
|
||||
// Backend Metadata
|
||||
try {
|
||||
const [metaRes, enginRes, paramsRes, configRes] = await Promise.all([
|
||||
|
||||
@@ -5,7 +5,7 @@ from collections import defaultdict
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from typing import Self, Literal, List, Dict, Any, Tuple
|
||||
from typing import Self, Literal, List, Dict, Any, Tuple, Optional
|
||||
|
||||
import docx
|
||||
from docx.document import Document as DocumentObject
|
||||
@@ -24,19 +24,6 @@ from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTr
|
||||
|
||||
# ---------------- 辅助函数 ----------------
|
||||
|
||||
# [v6.2] 定义一组具有显著视觉效果的格式标签。
|
||||
# 我们只在 Run 包含这些格式时才将其视为空白格式边界。
|
||||
# 这避免了因字体、字号等微小变化导致的过度文本切分。
|
||||
SIGNIFICANT_STYLES = frozenset([
|
||||
qn('w:u'), # 下划线
|
||||
qn('w:strike'), # 删除线
|
||||
qn('w:dstrike'), # 双删除线
|
||||
qn('w:shd'), # 底纹/背景色
|
||||
qn('w:highlight'), # 荧光笔高亮
|
||||
qn('w:bdr'), # 边框
|
||||
qn('w:effectLst'), # 文本效果 (如发光、阴影)
|
||||
qn('w:em'), # 强调标记 (着重号)
|
||||
])
|
||||
|
||||
|
||||
def is_image_run(run: Run) -> bool:
|
||||
@@ -45,31 +32,13 @@ def is_image_run(run: Run) -> bool:
|
||||
return '<w:drawing' in xml or '<w:pict' in xml
|
||||
|
||||
|
||||
def is_formatting_only_run(run: Run) -> bool:
|
||||
def is_instr_text_run(run: Run) -> bool:
|
||||
"""
|
||||
检查一个 Run 是否仅用于格式化,不包含任何应被渲染的文本。
|
||||
这仅适用于其 .text 属性为 "" 的情况。
|
||||
检查 Run 是否包含域指令文本 (w:instrText)。
|
||||
目录(TOC)、页码等功能的指令代码存储在此标签中。
|
||||
必须跳过这些 Run,否则写入 text 会破坏域结构。
|
||||
"""
|
||||
return run.text == ""
|
||||
|
||||
|
||||
# ---------- 新增修改部分 1: is_styled_whitespace_run 函数被移除 ----------
|
||||
# 此函数不再需要,因为新的逻辑会根据格式变化来切分,而不是根据带格式的空格。
|
||||
# ---------------------- 修改结束 ----------------------
|
||||
|
||||
def is_tab_run(run: Run) -> bool:
|
||||
"""
|
||||
检查一个 Run 是否主要代表一个制表符,应被视作格式边界。
|
||||
仅当 Run 的文本内容为空或仅包含空白,且 XML 中存在 <w:tab/> 时,
|
||||
才将其视为纯格式化用途的 Run。
|
||||
"""
|
||||
# .text 属性会将 <w:tab/> 转换成 '\t'
|
||||
# 如果 .text 在去除空白后仍有内容,说明这个 Run 不仅仅是个制表符。
|
||||
if run.text.strip():
|
||||
return False
|
||||
|
||||
xml = getattr(run.element, 'xml', '')
|
||||
return '<w:tab' in xml or '<w:ptab' in xml
|
||||
return run.element.find(qn('w:instrText')) is not None
|
||||
|
||||
|
||||
# ---------------- 配置类 ----------------
|
||||
@@ -77,6 +46,7 @@ def is_tab_run(run: Run) -> bool:
|
||||
class DocxTranslatorConfig(AiTranslatorConfig):
|
||||
insert_mode: Literal["replace", "append", "prepend"] = "replace"
|
||||
separator: str = "\n"
|
||||
office_password: Optional[str] = None
|
||||
|
||||
|
||||
# ---------------- 主类 ----------------
|
||||
@@ -110,14 +80,6 @@ class DocxTranslator(AiTranslator):
|
||||
[v6.0 - 语义切分重构版]
|
||||
- 重构核心逻辑,不再跳过域结果,而是将其作为语义边界来切分文本,增强了鲁棒性。
|
||||
"""
|
||||
IGNORED_TAGS = {
|
||||
qn('w:proofErr'), qn('w:lastRenderedPageBreak'), qn('w:bookmarkStart'),
|
||||
qn('w:bookmarkEnd'), qn('w:commentRangeStart'), qn('w:commentRangeEnd'),
|
||||
qn('w:del'), qn('w:ins'), qn('w:moveFrom'), qn('w:moveTo'),
|
||||
}
|
||||
RECURSIVE_CONTAINER_TAGS = {
|
||||
qn('w:smartTag'), qn('w:sdtContent'), qn('w:hyperlink'),
|
||||
}
|
||||
|
||||
def __init__(self, config: DocxTranslatorConfig):
|
||||
super().__init__(config=config)
|
||||
@@ -138,127 +100,99 @@ class DocxTranslator(AiTranslator):
|
||||
self.translate_agent = SegmentsTranslateAgent(agent_config)
|
||||
self.insert_mode = config.insert_mode
|
||||
self.separator = config.separator
|
||||
self.office_password = config.office_password
|
||||
|
||||
# ---------- 新增修改部分 2: 增加用于比较格式的辅助函数 ----------
|
||||
def _get_significant_styles(self, run: Run) -> frozenset:
|
||||
"""从一个 Run 中提取“显著”格式标签的集合。"""
|
||||
if run is None:
|
||||
return frozenset()
|
||||
rPr = run.element.rPr
|
||||
if rPr is None:
|
||||
return frozenset()
|
||||
return frozenset(child.tag for child in rPr if child.tag in SIGNIFICANT_STYLES)
|
||||
def _decrypt_if_needed(self, content: bytes) -> bytes:
|
||||
"""如果文件加密则解密,否则返回原内容。"""
|
||||
try:
|
||||
import msoffcrypto
|
||||
from io import BytesIO as BIO
|
||||
file_stream = BIO(content)
|
||||
try:
|
||||
office_file = msoffcrypto.OfficeFile(file_stream)
|
||||
if office_file.is_encrypted():
|
||||
if not self.office_password:
|
||||
raise ValueError("此DOCX文件已加密,但未提供密码。")
|
||||
decrypted = BIO()
|
||||
office_file.load_key(password=self.office_password)
|
||||
office_file.decrypt(decrypted)
|
||||
return decrypted.getvalue()
|
||||
return content
|
||||
finally:
|
||||
file_stream.close()
|
||||
except ImportError:
|
||||
return content
|
||||
|
||||
def _have_same_significant_styles(self, run1: Run, run2: Run) -> bool:
|
||||
"""检查两个 Run 是否具有相同的“显著”格式集合。"""
|
||||
styles1 = self._get_significant_styles(run1)
|
||||
styles2 = self._get_significant_styles(run2)
|
||||
return styles1 == styles2
|
||||
@staticmethod
|
||||
def _run_format_key(run: Run):
|
||||
"""生成 Run 的格式签名,用于合并相同格式的 Run。"""
|
||||
return (
|
||||
run.bold,
|
||||
run.italic,
|
||||
run.underline,
|
||||
run.font.size if run.font.size else None,
|
||||
str(run.font.color.rgb) if run.font.color and run.font.color.rgb else None,
|
||||
run.font.name or None,
|
||||
)
|
||||
|
||||
# ---------------------- 修改结束 ----------------------
|
||||
def _merge_adjacent_runs(self, runs: List[Run]) -> List[Run]:
|
||||
"""
|
||||
合并相邻的、格式完全相同的 Run,同时累积文本到第一个 Run。
|
||||
解决 Word 因修订历史/变更追踪产生的微观 Run 碎片问题(单个字符一个 Run)。
|
||||
"""
|
||||
if len(runs) <= 1:
|
||||
return runs
|
||||
|
||||
# ---------- 代码修改部分 1: 形状翻译逻辑的核心实现 ----------
|
||||
def _process_element_children(self, element, parent_paragraph: Paragraph, elements: List[Dict[str, Any]],
|
||||
texts: List[str],
|
||||
state: Dict[str, Any],
|
||||
top_level_para: Paragraph):
|
||||
merged = []
|
||||
group_start = 0
|
||||
for i in range(1, len(runs)):
|
||||
if self._run_format_key(runs[i]) != self._run_format_key(runs[group_start]):
|
||||
# Format boundary: finalize the current group
|
||||
if i - group_start > 1:
|
||||
# Merge: accumulate all text into first run, delete the rest
|
||||
runs[group_start].text = "".join(r.text for r in runs[group_start:i])
|
||||
for r in runs[group_start + 1:i]:
|
||||
self._remove_run_element(r)
|
||||
merged.append(runs[group_start])
|
||||
group_start = i
|
||||
|
||||
def flush_segment():
|
||||
current_runs = state['current_runs']
|
||||
if not current_runs:
|
||||
return
|
||||
full_text = "".join(r.text for r in current_runs)
|
||||
if full_text.strip():
|
||||
# 在 elements 中增加对父段落和顶级段落的引用
|
||||
elements.append({
|
||||
"type": "text_runs",
|
||||
"runs": list(current_runs),
|
||||
"paragraph": parent_paragraph,
|
||||
"top_level_paragraph": top_level_para
|
||||
})
|
||||
texts.append(full_text)
|
||||
state['current_runs'].clear()
|
||||
# Final group
|
||||
if len(runs) - group_start > 1:
|
||||
runs[group_start].text = "".join(r.text for r in runs[group_start:])
|
||||
for r in runs[group_start + 1:]:
|
||||
self._remove_run_element(r)
|
||||
merged.append(runs[group_start])
|
||||
|
||||
for child in element:
|
||||
if child.tag in self.IGNORED_TAGS:
|
||||
continue
|
||||
|
||||
if child.tag in self.RECURSIVE_CONTAINER_TAGS:
|
||||
flush_segment()
|
||||
self._process_element_children(child, parent_paragraph, elements, texts, state, top_level_para)
|
||||
flush_segment() # 在递归容器后也刷新,确保其内容成为独立片段
|
||||
continue
|
||||
|
||||
field_char_element = child.find(qn('w:fldChar')) if isinstance(child, CT_R) else None
|
||||
if field_char_element is not None:
|
||||
fld_type = field_char_element.get(qn('w:fldCharType'))
|
||||
if fld_type == 'begin' or fld_type == 'end':
|
||||
flush_segment()
|
||||
continue
|
||||
|
||||
if isinstance(child, CT_R):
|
||||
# 传入 parent_paragraph 以确保 Run 对象具有正确的上下文
|
||||
run = Run(child, parent_paragraph)
|
||||
|
||||
# 新增逻辑:处理形状(drawing/pict)内的文本
|
||||
# 形状可以包含文本框,需要优先于图片处理逻辑进行解析
|
||||
if '<w:drawing' in run.element.xml or '<w:pict' in run.element.xml:
|
||||
# 使用 list() 消耗迭代器,以便检查是否找到了文本框
|
||||
text_boxes = list(run.element.iter(qn('w:txbxContent')))
|
||||
if text_boxes:
|
||||
flush_segment() # 包含文本的形状是一个边界,刷新前面的文本
|
||||
for txbx_content in text_boxes:
|
||||
# 遍历文本框内的所有段落
|
||||
for p_element in txbx_content.findall(qn('w:p')):
|
||||
# 创建新的段落对象,并传入父级上下文
|
||||
shape_para = Paragraph(p_element, parent_paragraph)
|
||||
# 递归处理该段落,并传递顶级段落上下文
|
||||
self._process_paragraph(shape_para, elements, texts, top_level_para=top_level_para)
|
||||
|
||||
# 如果处理了形状内的文本,则该 Run 的任务已完成
|
||||
continue
|
||||
|
||||
# 保留原有逻辑: 检查绝对边界(图片、制表符等)
|
||||
if is_image_run(run) or is_formatting_only_run(run) or is_tab_run(run):
|
||||
flush_segment()
|
||||
continue # 这些 Run 本身不包含在任何文本片段中
|
||||
|
||||
# 保留原有逻辑: 基于格式变化进行切分
|
||||
last_run_in_segment = state['current_runs'][-1] if state['current_runs'] else None
|
||||
if last_run_in_segment and not self._have_same_significant_styles(last_run_in_segment, run):
|
||||
flush_segment()
|
||||
|
||||
# 将当前 Run 添加到片段中
|
||||
state['current_runs'].append(run)
|
||||
else:
|
||||
# 遇到任何非 Run 的块级元素(如在单元格中嵌套的表格),都应结束当前文本片段。
|
||||
flush_segment()
|
||||
return merged
|
||||
|
||||
def _process_paragraph(self, para: Paragraph, elements: List[Dict[str, Any]], texts: List[str],
|
||||
top_level_para: Paragraph = None):
|
||||
# 如果是首次进入段落处理(非递归调用),则当前段落是顶级段落
|
||||
"""
|
||||
段落级翻译处理:收集所有文本 Run → 合并相邻同格式 Run → 整段翻译 → 按比例分配。
|
||||
"""
|
||||
if top_level_para is None:
|
||||
top_level_para = para
|
||||
|
||||
state = {
|
||||
'current_runs': [],
|
||||
}
|
||||
# 修改调用:传入 `para` 对象、其顶级上下文
|
||||
self._process_element_children(para._p, para, elements, texts, state, top_level_para)
|
||||
text_runs = []
|
||||
for run in para.runs:
|
||||
if is_image_run(run) or is_instr_text_run(run):
|
||||
continue
|
||||
if not run.text.strip():
|
||||
continue
|
||||
text_runs.append(run)
|
||||
|
||||
# 确保在段落处理结束时,刷新所有剩余的 Run
|
||||
current_runs = state['current_runs']
|
||||
if current_runs:
|
||||
full_text = "".join(r.text for r in current_runs)
|
||||
if text_runs:
|
||||
# Merge adjacent runs with identical formatting to reduce fragmentation
|
||||
text_runs = self._merge_adjacent_runs(text_runs)
|
||||
full_text = "".join(r.text for r in text_runs)
|
||||
if full_text.strip():
|
||||
elements.append({
|
||||
"type": "text_runs",
|
||||
"runs": list(current_runs),
|
||||
"runs": list(text_runs),
|
||||
"paragraph": para,
|
||||
"top_level_paragraph": top_level_para
|
||||
})
|
||||
texts.append(full_text)
|
||||
current_runs.clear()
|
||||
|
||||
# ---------------------- 修改结束 ----------------------
|
||||
|
||||
@@ -301,7 +235,8 @@ class DocxTranslator(AiTranslator):
|
||||
self._process_body_elements(parent_element, container, elements, texts)
|
||||
|
||||
def _pre_translate(self, document: Document) -> Tuple[DocumentObject, List[Dict[str, Any]], List[str]]:
|
||||
doc = docx.Document(BytesIO(document.content))
|
||||
content = self._decrypt_if_needed(document.content)
|
||||
doc = docx.Document(BytesIO(content))
|
||||
elements, texts = [], []
|
||||
|
||||
self._traverse_container(doc, elements, texts)
|
||||
@@ -326,33 +261,58 @@ class DocxTranslator(AiTranslator):
|
||||
runs = element_info["runs"]
|
||||
if not runs: return
|
||||
|
||||
first_real_run_index = -1
|
||||
# 找到第一个可以写入文本的run
|
||||
for i, run in enumerate(runs):
|
||||
# Filter to runs that are still attached to the document
|
||||
valid_runs = []
|
||||
for run in runs:
|
||||
if run.element.getparent() is not None:
|
||||
# 如果 run 是副本的一部分,其 _parent 可能仍然指向原始文档的段落
|
||||
# 但我们需要确保它与 element_info["paragraph"] 同步
|
||||
run._parent = element_info["paragraph"]
|
||||
run.text = final_text
|
||||
first_real_run_index = i
|
||||
break
|
||||
valid_runs.append(run)
|
||||
|
||||
# 如果没有找到有效的run(例如,它们都已被删除),则记录警告
|
||||
if first_real_run_index == -1:
|
||||
if not valid_runs:
|
||||
self.logger.warning(f"无法应用翻译 '{final_text}',因为找不到有效的run。")
|
||||
return
|
||||
|
||||
# 删除所有后续的run,因为它们的文本已经被合并到第一个run中了
|
||||
for i in range(first_real_run_index + 1, len(runs)):
|
||||
run = runs[i]
|
||||
parent_element = run.element.getparent()
|
||||
if parent_element is not None:
|
||||
try:
|
||||
parent_element.remove(run.element)
|
||||
except ValueError:
|
||||
# 在某些复杂情况下,一个run可能已经被其父元素隐式删除
|
||||
self.logger.debug(f"尝试删除一个不存在的run元素。这通常是安全的。")
|
||||
pass
|
||||
if len(valid_runs) == 1:
|
||||
# Single run: just write the translation
|
||||
valid_runs[0].text = final_text
|
||||
return
|
||||
|
||||
# Multiple runs: proportionally distribute translated text to preserve formatting
|
||||
orig_lengths = [len(r.text) for r in valid_runs]
|
||||
total_orig = sum(orig_lengths)
|
||||
final_len = len(final_text)
|
||||
|
||||
if total_orig == 0:
|
||||
valid_runs[0].text = final_text
|
||||
for run in valid_runs[1:]:
|
||||
self._remove_run_element(run)
|
||||
return
|
||||
|
||||
# Distribute characters proportionally
|
||||
char_pos = 0
|
||||
for i, run in enumerate(valid_runs):
|
||||
if i == len(valid_runs) - 1:
|
||||
# Last run gets all remaining text
|
||||
run.text = final_text[char_pos:]
|
||||
else:
|
||||
ratio = orig_lengths[i] / total_orig
|
||||
run_char_count = max(1, round(final_len * ratio))
|
||||
run_char_count = min(run_char_count, final_len - char_pos - (len(valid_runs) - i - 1))
|
||||
if run_char_count <= 0:
|
||||
# Remove runs that would get zero characters
|
||||
self._remove_run_element(run)
|
||||
continue
|
||||
run.text = final_text[char_pos:char_pos + run_char_count]
|
||||
char_pos += run_char_count
|
||||
|
||||
def _remove_run_element(self, run) -> None:
|
||||
"""Safely remove a run element from its parent."""
|
||||
parent_element = run.element.getparent()
|
||||
if parent_element is not None:
|
||||
try:
|
||||
parent_element.remove(run.element)
|
||||
except ValueError:
|
||||
self.logger.debug(f"尝试删除一个不存在的run元素。这通常是安全的。")
|
||||
|
||||
# ---------- FIX START: 新增用于清理副本段落的辅助方法 ----------
|
||||
def _prune_unwanted_elements_from_copy(self, p_element: OxmlElement):
|
||||
|
||||
Reference in New Issue
Block a user