translater现在可以拥有独立logger
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from threading import Lock
|
||||
@@ -6,7 +7,7 @@ from typing import TypedDict
|
||||
|
||||
import httpx
|
||||
|
||||
from docutranslate.logger import translater_logger
|
||||
from docutranslate.logger import global_logger
|
||||
|
||||
MAX_RETRY_COUNT = 2
|
||||
MAX_TOTAL_ERROR_COUNT = 10
|
||||
@@ -20,18 +21,20 @@ class AgentArgs(TypedDict, total=False):
|
||||
temperature: float
|
||||
max_concurrent: int
|
||||
timeout: int
|
||||
logger:logging.Logger
|
||||
|
||||
|
||||
class TotalErrorCounter:
|
||||
def __init__(self, ):
|
||||
def __init__(self,logger:logging.Logger):
|
||||
self.lock = Lock()
|
||||
self.count = 0
|
||||
self.logger=logger
|
||||
|
||||
def add(self):
|
||||
self.lock.acquire()
|
||||
self.count += 1
|
||||
if self.count>MAX_TOTAL_ERROR_COUNT:
|
||||
translater_logger.info(f"错误响应过多")
|
||||
self.logger.info(f"错误响应过多")
|
||||
self.lock.release()
|
||||
return self.reach_limit()
|
||||
|
||||
@@ -39,20 +42,19 @@ class TotalErrorCounter:
|
||||
return self.count > MAX_TOTAL_ERROR_COUNT
|
||||
|
||||
|
||||
total_error_counter = TotalErrorCounter()
|
||||
|
||||
|
||||
# 仅使用多线程时用以计数
|
||||
class PromptsCounter:
|
||||
def __init__(self, total: int):
|
||||
def __init__(self, total: int,logger:logging.Logger):
|
||||
self.lock = Lock()
|
||||
self.count = 0
|
||||
self.total = total
|
||||
self.logger=logger
|
||||
|
||||
def add(self):
|
||||
self.lock.acquire()
|
||||
self.count += 1
|
||||
translater_logger.info(f"多线程-已完成:{self.count}/{self.total}")
|
||||
self.logger.info(f"多线程-已完成:{self.count}/{self.total}")
|
||||
self.lock.release()
|
||||
|
||||
|
||||
@@ -61,7 +63,7 @@ TIMEOUT = 600
|
||||
|
||||
class Agent:
|
||||
def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7,
|
||||
max_concurrent=15, timeout: int = TIMEOUT):
|
||||
max_concurrent=15, timeout: int = TIMEOUT,logger:logging.Logger|None=None):
|
||||
self.baseurl = baseurl.strip()
|
||||
if self.baseurl.endswith("/"):
|
||||
self.baseurl = self.baseurl[:-1]
|
||||
@@ -74,6 +76,8 @@ class Agent:
|
||||
self.max_concurrent = max_concurrent
|
||||
self.timeout = timeout
|
||||
|
||||
self.logger=logger if logger else global_logger
|
||||
self.total_error_counter = TotalErrorCounter(logger=self.logger)
|
||||
def _prepare_request_data(self, prompt: str, system_prompt: str, temperature=None, top_p=0.9):
|
||||
if temperature is None:
|
||||
temperature = self.temperature
|
||||
@@ -109,23 +113,23 @@ class Agent:
|
||||
result = response.json()["choices"][0]["message"]["content"]
|
||||
return result
|
||||
except httpx.HTTPStatusError as e:
|
||||
translater_logger.warning(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}")
|
||||
self.logger.warning(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}")
|
||||
print(f"prompt:\n{prompt}")
|
||||
total_error_counter.add()
|
||||
self.total_error_counter.add()
|
||||
return prompt
|
||||
except httpx.RequestError as e:
|
||||
translater_logger.warning(f"AI请求连接错误 (async): {repr(e)}")
|
||||
self.logger.warning(f"AI请求连接错误 (async): {repr(e)}")
|
||||
except (KeyError, IndexError) as e:
|
||||
raise Exception(f"AI响应格式错误 (async): {repr(e)}")
|
||||
# 如果没有正常获取结果则重试
|
||||
if retry and retry_count < MAX_RETRY_COUNT:
|
||||
if total_error_counter.add():
|
||||
if self.total_error_counter.add():
|
||||
return prompt
|
||||
translater_logger.info(f"正在重试,重试次数{retry_count}")
|
||||
self.logger.info(f"正在重试,重试次数{retry_count}")
|
||||
await asyncio.sleep(0.5)
|
||||
return await self.send_async(prompt, system_prompt, retry=True, retry_count=retry_count + 1)
|
||||
else:
|
||||
translater_logger.error(f"达到重试次数上限")
|
||||
self.logger.error(f"达到重试次数上限")
|
||||
return prompt
|
||||
|
||||
async def send_prompts_async(
|
||||
@@ -149,7 +153,7 @@ class Agent:
|
||||
)
|
||||
nonlocal count
|
||||
count += 1
|
||||
translater_logger.info(f"协程-已完成{count}/{total}")
|
||||
self.logger.info(f"协程-已完成{count}/{total}")
|
||||
return result
|
||||
|
||||
for p_text in prompts:
|
||||
@@ -176,23 +180,23 @@ class Agent:
|
||||
result = response.json()["choices"][0]["message"]["content"]
|
||||
return result
|
||||
except httpx.HTTPStatusError as e:
|
||||
translater_logger.warning(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}")
|
||||
self.logger.warning(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}")
|
||||
print(f"prompt:\n{prompt}")
|
||||
total_error_counter.add()
|
||||
self.total_error_counter.add()
|
||||
return prompt
|
||||
except httpx.RequestError as e:
|
||||
translater_logger.warning(f"AI请求连接错误 (sync): {repr(e)}\nprompt:{prompt}")
|
||||
self.logger.warning(f"AI请求连接错误 (sync): {repr(e)}\nprompt:{prompt}")
|
||||
except (KeyError, IndexError) as e:
|
||||
raise Exception(f"AI响应格式错误 (sync): {repr(e)}")
|
||||
# 如果没有正常获取结果则重试
|
||||
if retry and retry_count < MAX_RETRY_COUNT:
|
||||
if total_error_counter.add():
|
||||
if self.total_error_counter.add():
|
||||
return prompt
|
||||
translater_logger.info(f"正在重试,重试次数{retry_count}")
|
||||
self.logger.info(f"正在重试,重试次数{retry_count}")
|
||||
time.sleep(0.5)
|
||||
return self.send(prompt, system_prompt, retry=True, retry_count=retry_count + 1)
|
||||
else:
|
||||
translater_logger.error(f"达到重试次数上限")
|
||||
self.logger.error(f"达到重试次数上限")
|
||||
return prompt
|
||||
|
||||
def _send_prompt_count(self, prompt: str, system_prompt: None | str, count: PromptsCounter) -> str:
|
||||
@@ -206,7 +210,7 @@ class Agent:
|
||||
system_prompt: str | None = None,
|
||||
) -> list[str]:
|
||||
system_prompts = [system_prompt] * len(prompts)
|
||||
counts = [PromptsCounter(len(prompts))] * len(prompts)
|
||||
counts = [PromptsCounter(len(prompts),self.logger)] * len(prompts)
|
||||
output_list = []
|
||||
with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor:
|
||||
results_iterator = executor.map(self._send_prompt_count, prompts, system_prompts, counts)
|
||||
|
||||
@@ -19,7 +19,7 @@ from pydantic import BaseModel, Field
|
||||
|
||||
from docutranslate import FileTranslater, __version__
|
||||
from docutranslate.global_values import available_packages
|
||||
from docutranslate.logger import translater_logger
|
||||
from docutranslate.logger import global_logger
|
||||
from docutranslate.translater import default_params
|
||||
from docutranslate.utils.resource_utils import resource_path
|
||||
|
||||
@@ -79,10 +79,10 @@ async def lifespan(app: FastAPI):
|
||||
tasks_state.clear()
|
||||
tasks_log_queues.clear()
|
||||
tasks_log_histories.clear()
|
||||
for handler in translater_logger.handlers[:]:
|
||||
translater_logger.removeHandler(handler)
|
||||
translater_logger.propagate = False
|
||||
translater_logger.setLevel(logging.INFO)
|
||||
for handler in global_logger.handlers[:]:
|
||||
global_logger.removeHandler(handler)
|
||||
global_logger.propagate = False
|
||||
global_logger.setLevel(logging.INFO)
|
||||
print("应用启动完成,多任务状态已初始化。")
|
||||
yield
|
||||
await httpx_client.aclose()
|
||||
@@ -100,12 +100,12 @@ async def _perform_translation(task_id: str, params: Dict[str, Any], file_conten
|
||||
log_filter = logging.Filter()
|
||||
log_filter.task_id = task_id
|
||||
task_handler.addFilter(log_filter)
|
||||
translater_logger.addHandler(task_handler)
|
||||
global_logger.addHandler(task_handler)
|
||||
|
||||
translater_logger.info(f"后台翻译任务开始: 文件 '{original_filename}'")
|
||||
global_logger.info(f"后台翻译任务开始: 文件 '{original_filename}'")
|
||||
task_state["status_message"] = f"正在处理 '{original_filename}'..."
|
||||
try:
|
||||
translater_logger.info(f"使用 Base URL: {params['base_url']}, Model: {params['model_id']}")
|
||||
global_logger.info(f"使用 Base URL: {params['base_url']}, Model: {params['model_id']}")
|
||||
ft = FileTranslater(
|
||||
base_url=params['base_url'], key=params['apikey'], model_id=params['model_id'],
|
||||
chunk_size=params['chunk_size'], concurrent=params['concurrent'],
|
||||
@@ -125,7 +125,7 @@ async def _perform_translation(task_id: str, params: Dict[str, Any], file_conten
|
||||
timeout=3)
|
||||
html_content = ft.export_to_html(title=task_state["original_filename_stem"], cdn=True)
|
||||
except (httpx.TimeoutException, httpx.RequestError):
|
||||
translater_logger.info("CDN连接失败,使用本地JS进行渲染。")
|
||||
global_logger.info("CDN连接失败,使用本地JS进行渲染。")
|
||||
html_content = ft.export_to_html(title=task_state["original_filename_stem"], cdn=False)
|
||||
end_time = time.time()
|
||||
duration = end_time - task_state["task_start_time"]
|
||||
@@ -134,11 +134,11 @@ async def _perform_translation(task_id: str, params: Dict[str, Any], file_conten
|
||||
"html_content": html_content, "status_message": f"翻译成功!用时 {duration:.2f} 秒。",
|
||||
"download_ready": True, "error_flag": False, "task_end_time": end_time,
|
||||
})
|
||||
translater_logger.info(f"翻译成功完成,用时 {duration:.2f} 秒。")
|
||||
global_logger.info(f"翻译成功完成,用时 {duration:.2f} 秒。")
|
||||
except asyncio.CancelledError:
|
||||
end_time = time.time()
|
||||
duration = end_time - task_state["task_start_time"]
|
||||
translater_logger.info(f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒).")
|
||||
global_logger.info(f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒).")
|
||||
task_state.update({
|
||||
"status_message": f"翻译任务已取消 (用时 {duration:.2f} 秒).", "error_flag": False,
|
||||
"download_ready": False, "markdown_content": None, "md_zip_content": None,
|
||||
@@ -148,7 +148,7 @@ async def _perform_translation(task_id: str, params: Dict[str, Any], file_conten
|
||||
end_time = time.time()
|
||||
duration = end_time - task_state["task_start_time"]
|
||||
error_message = f"翻译失败: {e}"
|
||||
translater_logger.error(error_message, exc_info=True)
|
||||
global_logger.error(error_message, exc_info=True)
|
||||
task_state.update({
|
||||
"status_message": f"翻译过程中发生错误 (用时 {duration:.2f} 秒): {e}",
|
||||
"error_flag": True, "download_ready": False, "markdown_content": None,
|
||||
@@ -157,8 +157,8 @@ async def _perform_translation(task_id: str, params: Dict[str, Any], file_conten
|
||||
finally:
|
||||
task_state["is_processing"] = False
|
||||
task_state["current_task_ref"] = None
|
||||
translater_logger.info(f"后台翻译任务 '{original_filename}' 处理结束。")
|
||||
translater_logger.removeHandler(task_handler)
|
||||
global_logger.info(f"后台翻译任务 '{original_filename}' 处理结束。")
|
||||
global_logger.removeHandler(task_handler)
|
||||
|
||||
|
||||
# --- 核心任务启动与取消逻辑 (仅由服务层调用) ---
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from io import BytesIO
|
||||
@@ -11,62 +13,26 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from huggingface_hub.errors import LocalEntryNotFoundError
|
||||
|
||||
from docutranslate.logger import translater_logger
|
||||
|
||||
from docutranslate.converter import Converter, Document
|
||||
|
||||
import asyncio
|
||||
from docutranslate.logger import global_logger
|
||||
|
||||
IMAGE_RESOLUTION_SCALE = 4
|
||||
|
||||
|
||||
def file2markdown_embed_images(file_path: Path | str | DocumentStream, formula=False, code=False,
|
||||
artifacts_path: Path | str | None = None) -> str:
|
||||
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
pipeline_options.generate_picture_images = True
|
||||
# pipeline_options.table_structure_options.mode = TableFormerMode.FAST
|
||||
pipeline_options.table_structure_options.do_cell_matching = False
|
||||
if formula:
|
||||
pipeline_options.do_formula_enrichment = True
|
||||
if code:
|
||||
pipeline_options.do_code_enrichment = True
|
||||
# pipeline_options.accelerator_options= AcceleratorOptions(
|
||||
# num_threads=4, device=AcceleratorDevice.AUTO
|
||||
# )
|
||||
# 打印时间
|
||||
settings.debug.profile_pipeline_timings = True
|
||||
converter = DocumentConverter(format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
|
||||
})
|
||||
try:
|
||||
conversion_result = converter.convert(file_path)
|
||||
result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
|
||||
except LocalEntryNotFoundError:
|
||||
translater_logger.info(f"无法连接huggingface,正在尝试换源")
|
||||
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
||||
conversion_result = converter.convert(file_path)
|
||||
result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
|
||||
# translater_logger.info(f"docling转换耗时: {conversion_result.timings["pipeline_total"].times}")
|
||||
return result
|
||||
|
||||
|
||||
class ConverterDocling(Converter):
|
||||
def __init__(self, code=True, formula=True, artifact=None):
|
||||
def __init__(self, code=True, formula=True, artifact=None, logger: logging.Logger | None = None):
|
||||
self.code = code
|
||||
self.formula = formula
|
||||
self.artifact = artifact
|
||||
self.logger = logger if logger else global_logger
|
||||
|
||||
def convert(self, document):
|
||||
assert isinstance(document.filename, str)
|
||||
translater_logger.info(f"正在将文档转换为markdown")
|
||||
self.logger.info(f"正在将文档转换为markdown")
|
||||
time1 = time.time()
|
||||
document_stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes))
|
||||
result = file2markdown_embed_images(document_stream, formula=self.formula, code=self.code,
|
||||
artifacts_path=self.artifact)
|
||||
translater_logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||
result = self.file2markdown_embed_images(document_stream)
|
||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||
return result
|
||||
|
||||
async def convert_async(self, document: Document) -> str:
|
||||
@@ -75,11 +41,43 @@ class ConverterDocling(Converter):
|
||||
document
|
||||
)
|
||||
|
||||
def set_config(self,cofig:dict):
|
||||
def file2markdown_embed_images(self, file_path: Path | str | DocumentStream) -> str:
|
||||
pipeline_options = PdfPipelineOptions(artifacts_path=self.artifact)
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
pipeline_options.generate_picture_images = True
|
||||
# pipeline_options.table_structure_options.mode = TableFormerMode.FAST
|
||||
pipeline_options.table_structure_options.do_cell_matching = False
|
||||
if self.formula:
|
||||
pipeline_options.do_formula_enrichment = True
|
||||
if self.code:
|
||||
pipeline_options.do_code_enrichment = True
|
||||
# pipeline_options.accelerator_options= AcceleratorOptions(
|
||||
# num_threads=4, device=AcceleratorDevice.AUTO
|
||||
# )
|
||||
# 打印时间
|
||||
settings.debug.profile_pipeline_timings = True
|
||||
converter = DocumentConverter(format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
|
||||
})
|
||||
try:
|
||||
conversion_result = converter.convert(file_path)
|
||||
result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
|
||||
except LocalEntryNotFoundError:
|
||||
self.logger.info(f"无法连接huggingface,正在尝试换源")
|
||||
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
||||
conversion_result = converter.convert(file_path)
|
||||
result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
|
||||
# translater_logger.info(f"docling转换耗时: {conversion_result.timings["pipeline_total"].times}")
|
||||
return result
|
||||
|
||||
def set_config(self, cofig: dict):
|
||||
pass
|
||||
|
||||
def get_config_list(self) ->list[str]|None:
|
||||
def get_config_list(self) -> list[str] | None:
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pass
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
import zipfile
|
||||
import httpx
|
||||
from docutranslate.converter import Converter, Document
|
||||
from docutranslate.logger import translater_logger
|
||||
from docutranslate.logger import global_logger
|
||||
from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
|
||||
|
||||
URL = 'https://mineru.net/api/v4/file-urls/batch'
|
||||
@@ -21,10 +22,11 @@ client = httpx.Client(trust_env=False,timeout=timeout,proxy=None,verify=False)
|
||||
|
||||
# TODO: 提供更详细的logger
|
||||
class ConverterMineru(Converter):
|
||||
def __init__(self, token: str, formula=True):
|
||||
def __init__(self, token: str, formula=True,logger:logging.Logger|None=None):
|
||||
self.mineru_token = token.strip()
|
||||
self.client_async = httpx.AsyncClient(timeout=timeout)
|
||||
self.formula = formula
|
||||
self.logger=logger if logger else global_logger
|
||||
|
||||
def _get_header(self):
|
||||
return {
|
||||
@@ -74,12 +76,12 @@ class ConverterMineru(Converter):
|
||||
time.sleep(3)
|
||||
|
||||
def convert(self, document: Document) -> str:
|
||||
translater_logger.info(f"正在将文档转换为markdown")
|
||||
self.logger.info(f"正在将文档转换为markdown")
|
||||
time1 = time.time()
|
||||
batch_id = self.upload(document)
|
||||
file_url = self.get_file_url(batch_id)
|
||||
result = get_md_from_zip_url_with_inline_images(zip_url=file_url)
|
||||
translater_logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||
return result
|
||||
|
||||
# TODO: 实现细粒度更高的协程
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .logger import translater_logger
|
||||
from .logger import global_logger
|
||||
@@ -3,8 +3,8 @@ import logging
|
||||
|
||||
|
||||
# 创建日志对象
|
||||
translater_logger = logging.getLogger("TranslaterLogger")
|
||||
translater_logger.setLevel(logging.DEBUG)
|
||||
global_logger = logging.getLogger("TranslaterLogger")
|
||||
global_logger.setLevel(logging.DEBUG)
|
||||
#输出到控制台
|
||||
console_handler = logging.StreamHandler()
|
||||
translater_logger.addHandler(console_handler)
|
||||
global_logger.addHandler(console_handler)
|
||||
@@ -1,20 +1,23 @@
|
||||
import asyncio
|
||||
import html
|
||||
import io
|
||||
import logging
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
import markdown2
|
||||
|
||||
import jinja2
|
||||
import markdown2
|
||||
|
||||
from docutranslate.agents import Agent, AgentArgs
|
||||
from docutranslate.agents import MDRefineAgent, MDTranslateAgent
|
||||
from docutranslate.cacher import document_cacher_global
|
||||
from docutranslate.converter import Document, ConverterMineru
|
||||
from docutranslate.global_values import available_packages
|
||||
from docutranslate.logger import global_logger
|
||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \
|
||||
unembed_base64_images_to_zip, embed_inline_image_from_zip, find_markdown_in_zip
|
||||
from docutranslate.logger import translater_logger
|
||||
from docutranslate.global_values import available_packages
|
||||
from docutranslate.utils.resource_utils import resource_path
|
||||
|
||||
DOCLING_FLAG = True if available_packages.get("docling") else False
|
||||
@@ -35,7 +38,10 @@ class FileTranslater:
|
||||
concurrent: int = default_params["concurrent"], timeout=2000,
|
||||
convert_engin: Literal["docling", "mineru"] = "mineru",
|
||||
docling_artifact: Path | str | None = None,
|
||||
mineru_token: str = None, cache=True):
|
||||
mineru_token: str = None, cache=True,
|
||||
logger: logging.Logger | None = None):
|
||||
self.logger = logger if logger else global_logger
|
||||
|
||||
self.convert_engin = convert_engin
|
||||
self.mineru_token = mineru_token.strip() if mineru_token is not None else None
|
||||
self._mask_dict = MaskDict()
|
||||
@@ -51,7 +57,7 @@ class FileTranslater:
|
||||
artifact_path = Path("./docling_artifact")
|
||||
print(f"artifact_path:{artifact_path.resolve()},existed:{artifact_path.is_dir()}")
|
||||
if artifact_path.is_dir():
|
||||
translater_logger.info("检测到docling_artifact文件夹")
|
||||
self.logger.info("检测到docling_artifact文件夹")
|
||||
self.docling_artifact = artifact_path
|
||||
self.timeout = timeout
|
||||
self.document: Document | None = None
|
||||
@@ -78,7 +84,7 @@ class FileTranslater:
|
||||
|
||||
def _split_markdown_into_chunks(self) -> list[str]:
|
||||
chunks: list[str] = split_markdown_text(self.markdown, self.chunk_size)
|
||||
translater_logger.info(f"markdown分为{len(chunks)}块")
|
||||
self.logger.info(f"markdown分为{len(chunks)}块")
|
||||
return chunks
|
||||
|
||||
def _default_agent_params(self) -> AgentArgs:
|
||||
@@ -92,7 +98,8 @@ class FileTranslater:
|
||||
"model_id": self.model_id,
|
||||
"temperature": self.temperature,
|
||||
"max_concurrent": self.concurrent,
|
||||
"timeout": self.timeout
|
||||
"timeout": self.timeout,
|
||||
"logger":self.logger
|
||||
}
|
||||
return result
|
||||
|
||||
@@ -105,26 +112,26 @@ class FileTranslater:
|
||||
def _convert2markdown(self, document: Document, formula: bool, code: bool, artifact: Path = None) -> str:
|
||||
cached_result = self.cacher.get_cached_result(document, formula, code, convert_engin=self.convert_engin)
|
||||
if cached_result:
|
||||
translater_logger.info("正在获取缓存结果")
|
||||
self.logger.info("正在获取缓存结果")
|
||||
return cached_result
|
||||
if document.suffix in [".md", ".txt"]:
|
||||
return document.filebytes.decode("utf-8")
|
||||
if document.suffix in ['.zip']:
|
||||
#寻找zip内的filename
|
||||
filename=find_markdown_in_zip(document.filebytes)
|
||||
return embed_inline_image_from_zip(document.filebytes,filename)
|
||||
translater_logger.info("正在转化为markdown")
|
||||
# 寻找zip内的filename
|
||||
filename = find_markdown_in_zip(document.filebytes)
|
||||
return embed_inline_image_from_zip(document.filebytes, filename)
|
||||
self.logger.info("正在转化为markdown")
|
||||
if self.convert_engin == "docling":
|
||||
if artifact is None:
|
||||
artifact = self.docling_artifact
|
||||
mdconverter = ConverterDocling(formula=formula, code=code, artifact=artifact)
|
||||
mdconverter = ConverterDocling(formula=formula, code=code, artifact=artifact,logger=self.logger)
|
||||
result = mdconverter.convert(document)
|
||||
else:
|
||||
if self.mineru_token is None:
|
||||
raise Exception("mineru_token未配置")
|
||||
if code:
|
||||
translater_logger.info("mineru暂不支持code识别")
|
||||
mdconverter = ConverterMineru(token=self.mineru_token, formula=formula)
|
||||
self.logger.info("mineru暂不支持code识别")
|
||||
mdconverter = ConverterMineru(token=self.mineru_token, formula=formula,logger=self.logger)
|
||||
result = mdconverter.convert(document)
|
||||
return self.cacher.cache_result(result, document, formula, code, convert_engin=self.convert_engin)
|
||||
|
||||
@@ -132,26 +139,26 @@ class FileTranslater:
|
||||
artifact: Path = None) -> str:
|
||||
cached_result = self.cacher.get_cached_result(document, formula, code, convert_engin=self.convert_engin)
|
||||
if cached_result:
|
||||
translater_logger.info("解析结果已缓存,获取缓存结果")
|
||||
self.logger.info("解析结果已缓存,获取缓存结果")
|
||||
return cached_result
|
||||
if document.suffix in [".md", ".txt"]:
|
||||
return document.filebytes.decode("utf-8")
|
||||
if document.suffix in ['.zip']:
|
||||
#寻找zip内的filename
|
||||
filename=find_markdown_in_zip(document.filebytes)
|
||||
return embed_inline_image_from_zip(document.filebytes,filename)
|
||||
translater_logger.info("正在转化为markdown")
|
||||
# 寻找zip内的filename
|
||||
filename = find_markdown_in_zip(document.filebytes)
|
||||
return embed_inline_image_from_zip(document.filebytes, filename)
|
||||
self.logger.info("正在转化为markdown")
|
||||
if self.convert_engin == "docling":
|
||||
if artifact is None:
|
||||
artifact = self.docling_artifact
|
||||
mdconverter = ConverterDocling(formula=formula, code=code, artifact=artifact)
|
||||
mdconverter = ConverterDocling(formula=formula, code=code, artifact=artifact,logger=self.logger)
|
||||
result = await mdconverter.convert_async(document)
|
||||
else:
|
||||
if self.mineru_token is None:
|
||||
raise Exception("mineru_token未配置")
|
||||
if code:
|
||||
translater_logger.info("mineru暂不支持code识别")
|
||||
mdconverter = ConverterMineru(token=self.mineru_token, formula=formula)
|
||||
self.logger.info("mineru暂不支持code识别")
|
||||
mdconverter = ConverterMineru(token=self.mineru_token, formula=formula,logger=self.logger)
|
||||
result = await mdconverter.convert_async(document)
|
||||
return self.cacher.cache_result(result, document, formula, code, convert_engin=self.convert_engin)
|
||||
|
||||
@@ -209,7 +216,7 @@ class FileTranslater:
|
||||
document = self.document
|
||||
if document is None:
|
||||
raise Exception("未读取文件")
|
||||
translater_logger.info(f"读取文件:{document.filename}")
|
||||
self.logger.info(f"读取文件:{document.filename}")
|
||||
self.read_document(document, formula=formula, code=code, save=save, save_format=save_format, refine=refine,
|
||||
refine_agent=refine_agent)
|
||||
return self
|
||||
@@ -223,14 +230,14 @@ class FileTranslater:
|
||||
document = self.document
|
||||
if document is None:
|
||||
raise Exception("未读取文件")
|
||||
translater_logger.info(f"读取文件:{document.filename}")
|
||||
self.logger.info(f"读取文件:{document.filename}")
|
||||
# 如果是markdown,直接读取
|
||||
await self.read_document_async(document, formula=formula, code=code, save=save, save_format=save_format,
|
||||
refine=refine, refine_agent=refine_agent)
|
||||
return self
|
||||
|
||||
def refine_markdown_by_agent(self, refine_agent: Agent | None = None, custom_prompt=None) -> str:
|
||||
translater_logger.info("正在修正markdown")
|
||||
self.logger.info("正在修正markdown")
|
||||
self._mask_uris_in_markdown()
|
||||
chuncks = self._split_markdown_into_chunks()
|
||||
if refine_agent is None:
|
||||
@@ -241,11 +248,11 @@ class FileTranslater:
|
||||
else:
|
||||
self.markdown = join_markdown_texts(result)
|
||||
self._unmask_uris_in_markdown()
|
||||
translater_logger.info("markdown已修正")
|
||||
self.logger.info("markdown已修正")
|
||||
return self.markdown
|
||||
|
||||
def translate_markdown_by_agent(self, translate_agent: Agent | None = None, to_lang="中文", custom_prompt=None):
|
||||
translater_logger.info("正在翻译markdown")
|
||||
self.logger.info("正在翻译markdown")
|
||||
self._mask_uris_in_markdown()
|
||||
chuncks = self._split_markdown_into_chunks()
|
||||
if translate_agent is None:
|
||||
@@ -256,11 +263,11 @@ class FileTranslater:
|
||||
else:
|
||||
self.markdown = join_markdown_texts(result)
|
||||
self._unmask_uris_in_markdown()
|
||||
translater_logger.info("翻译完成")
|
||||
self.logger.info("翻译完成")
|
||||
return self.markdown
|
||||
|
||||
async def refine_markdown_by_agent_async(self, refine_agent: Agent | None = None, custom_prompt=None) -> str:
|
||||
translater_logger.info("正在修正markdown")
|
||||
self.logger.info("正在修正markdown")
|
||||
self._mask_uris_in_markdown()
|
||||
chuncks = self._split_markdown_into_chunks()
|
||||
if refine_agent is None:
|
||||
@@ -271,12 +278,12 @@ class FileTranslater:
|
||||
else:
|
||||
self.markdown = join_markdown_texts(result)
|
||||
self._unmask_uris_in_markdown()
|
||||
translater_logger.info("markdown已修正")
|
||||
self.logger.info("markdown已修正")
|
||||
return self.markdown
|
||||
|
||||
async def translate_markdown_by_agent_async(self, translate_agent: Agent | None = None, to_lang="中文",
|
||||
custom_prompt=None):
|
||||
translater_logger.info("正在翻译markdown")
|
||||
self.logger.info("正在翻译markdown")
|
||||
self._mask_uris_in_markdown()
|
||||
chuncks = self._split_markdown_into_chunks()
|
||||
if translate_agent is None:
|
||||
@@ -287,7 +294,7 @@ class FileTranslater:
|
||||
else:
|
||||
self.markdown = join_markdown_texts(result)
|
||||
self._unmask_uris_in_markdown()
|
||||
translater_logger.info("翻译完成")
|
||||
self.logger.info("翻译完成")
|
||||
return self.markdown
|
||||
|
||||
def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output", embeded=True):
|
||||
@@ -303,9 +310,9 @@ class FileTranslater:
|
||||
full_name = output_dir / filename.name
|
||||
with open(full_name, "w", encoding="utf-8") as file:
|
||||
file.write(self.export_to_markdown())
|
||||
translater_logger.info(f"文件已写入{full_name.resolve()}")
|
||||
self.logger.info(f"文件已写入{full_name.resolve()}")
|
||||
else:
|
||||
output_dir=output_dir/filename.stem
|
||||
output_dir = output_dir / filename.stem
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
with zipfile.ZipFile(io.BytesIO(self.export_to_unembed_markdown())) as zip_ref:
|
||||
zip_ref.extractall(output_dir)
|
||||
@@ -336,9 +343,9 @@ class FileTranslater:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
full_name = output_dir / filename
|
||||
html_content = self.export_to_html(title=str(full_name.resolve().stem))
|
||||
with open(full_name, "w",encoding="utf-8") as file:
|
||||
with open(full_name, "w", encoding="utf-8") as file:
|
||||
file.write(html_content)
|
||||
translater_logger.info(f"文件已写入{full_name.resolve()}")
|
||||
self.logger.info(f"文件已写入{full_name.resolve()}")
|
||||
return self
|
||||
|
||||
def export_to_html(self, title="title", cdn=True) -> str:
|
||||
@@ -463,7 +470,8 @@ class FileTranslater:
|
||||
formula=True,
|
||||
code=True, output_format: Literal["markdown", "html"] = "markdown",
|
||||
custom_prompt_translate=None, refine=False,
|
||||
refine_agent: Agent | None = None, translate_agent: Agent | None = None, save=False):
|
||||
refine_agent: Agent | None = None, translate_agent: Agent | None = None,
|
||||
save=False):
|
||||
await self.read_bytes_async(name=name, file=file, formula=formula, code=code)
|
||||
|
||||
if refine:
|
||||
|
||||
Reference in New Issue
Block a user