class ConverterIdentity(Converter):
    """Pass-through converter: performs no format conversion.

    Both entry points build a new ``Document`` from the input's ``content``,
    ``suffix`` and ``stem`` through ``Document.from_bytes``.
    """

    @staticmethod
    def _rewrap(source: Document) -> Document:
        """Build a new ``Document`` carrying the same bytes, suffix and stem as *source*."""
        return Document.from_bytes(
            content=source.content,
            suffix=source.suffix,
            stem=source.stem,
        )

    def convert(self, document: Document) -> Document:
        """Return a re-wrapped copy of *document* (synchronous entry point)."""
        return self._rewrap(document)

    async def convert_async(self, document: Document) -> Document:
        """Return a re-wrapped copy of *document* (asynchronous entry point).

        Nothing is awaited here; the work is the same cheap re-wrap done by
        ``convert``.
        """
        return self._rewrap(document)
class ConverterCsv2Xlsx(X2XlsxConverter):
    """Convert a CSV ``Document`` into an XLSX ``Document``.

    Pipeline: guess the text encoding with ``chardet``, decode the bytes,
    sniff the CSV dialect, then append every parsed row to one worksheet of
    an ``openpyxl`` write-only workbook that is saved into an in-memory buffer.

    The CSV payload itself is decoded fully in memory; only the workbook is
    produced in write-only mode.
    """

    # Number of leading bytes used for the first (cheap) encoding guess.
    _ENCODING_SAMPLE_BYTES = 4096
    # Number of leading characters handed to csv.Sniffer.
    _DIALECT_SAMPLE_CHARS = 2048
    # Delimiters the sniffer may choose from. Without this restriction the
    # sniffer is free to pick letters or spaces on prose-heavy cells.
    _SNIFF_DELIMITERS = ",;\t|"

    def convert(self, document: Document) -> Document:
        """Synchronously convert a CSV document to XLSX.

        Args:
            document: Source document whose ``content`` holds the raw CSV bytes.

        Returns:
            A new ``Document`` with suffix ``.xlsx`` and the same ``stem``.

        Raises:
            Exception: Any failure is logged with its traceback and re-raised
                unchanged.
        """
        self.logger.info(f"开始转换文件 {document.name} (大小: {len(document.content)} bytes)")

        try:
            decoded_content = self._decode(document.content)

            # newline="" is what the csv module asks for: line endings reach the
            # parser untranslated and bare "\r" terminators are still split into
            # separate lines instead of raising csv.Error.
            csv_text_stream = StringIO(decoded_content, newline="")
            dialect = self._sniff_dialect(decoded_content[:self._DIALECT_SAMPLE_CHARS])
            csv_reader = csv.reader(csv_text_stream, dialect)

            # A write-only workbook starts without any sheet, so create one.
            wb = openpyxl.Workbook(write_only=True)
            ws = wb.create_sheet()

            row_count = 0
            for row_data in csv_reader:
                ws.append(row_data)
                row_count += 1

            self.logger.info(f"共处理 {row_count} 行数据。")

            output_buffer = BytesIO()
            wb.save(output_buffer)

            self.logger.info(f"文件 {document.name} 已成功转换为 XLSX 格式。")

            # getvalue() returns the whole buffer regardless of the stream position.
            return Document.from_bytes(
                content=output_buffer.getvalue(),
                suffix=".xlsx",
                stem=document.stem
            )

        except Exception as e:
            self.logger.error(f"转换文件 {document.name} 时发生严重错误: {e}", exc_info=True)
            raise

    def _decode(self, raw: bytes) -> str:
        """Decode raw CSV bytes, trying progressively more expensive strategies.

        Order: encoding guessed from the head sample, strict UTF-8, encoding
        guessed from the full payload, and finally UTF-8 with replacement
        characters so that a result is always produced.
        """
        detection_result = chardet.detect(raw[:self._ENCODING_SAMPLE_BYTES])
        encoding = detection_result['encoding'] or 'utf-8'
        confidence = detection_result['confidence'] or 0.0
        self.logger.info(f"检测到文件编码为: {encoding} (置信度: {confidence:.2%})")

        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: chardet may report a codec name Python does not know.
            self.logger.warning(f"使用检测到的编码 '{encoding}' 解码失败,尝试使用 'utf-8'。")

        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8 either; fall through to full-content detection.
            pass

        # The head sample can look like plain ASCII while later bytes use another
        # encoding, so re-detect on everything before giving up on accuracy.
        if len(raw) > self._ENCODING_SAMPLE_BYTES:
            full_guess = chardet.detect(raw)['encoding']
            if full_guess and full_guess.lower() != encoding.lower():
                try:
                    text = raw.decode(full_guess)
                except (UnicodeDecodeError, LookupError):
                    # Still undecodable; use the lossy fallback below.
                    pass
                else:
                    self.logger.info(f"基于完整内容重新检测到文件编码为: {full_guess}")
                    return text

        return raw.decode('utf-8', errors='replace')

    def _sniff_dialect(self, sample: str) -> "type[csv.Dialect] | str":
        """Return the sniffed CSV dialect for *sample*, or ``'excel'`` when sniffing fails."""
        try:
            dialect = csv.Sniffer().sniff(sample, delimiters=self._SNIFF_DELIMITERS)
        except csv.Error:
            # Raised when the sample is too small or ambiguous.
            self.logger.warning("无法自动识别CSV方言,将使用默认的逗号分隔符。")
            return 'excel'
        self.logger.info(f"检测到CSV分隔符为: '{dialect.delimiter}'")
        return dialect

    async def convert_async(self, document: Document) -> Document:
        """Run ``convert`` in a worker thread so the event loop is not blocked.

        Args:
            document: Source document whose ``content`` holds the raw CSV bytes.

        Returns:
            The ``Document`` produced by ``convert``.
        """
        self.logger.info(f"为文件 {document.name} 的转换任务创建新线程。")
        return await asyncio.to_thread(self.convert, document)

    def support_format(self) -> list[str]:
        """Return the source suffixes this converter accepts."""
        return [".csv"]
@runtime_checkable
class CsvExportable(Protocol[T_ExporterConfig]):
    """Structural interface for objects that can export their result as CSV."""

    def export_to_csv(self, config: T_ExporterConfig | None = None) -> bytes:
        """Return the CSV export as bytes; *config* is an optional exporter configuration."""

    def save_as_csv(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
        """Save the CSV export and return the implementing object.

        NOTE(review): presumably *name* is the output file name and
        *output_dir* the target directory; confirm against implementations.
        """
@runtime_checkable
class EpubExportable(Protocol[T_ExporterConfig]):
    """Structural interface for objects that can export their result as EPUB."""

    def export_to_epub(self, config: T_ExporterConfig | None = None) -> bytes:
        """Return the EPUB export as bytes; *config* is an optional exporter configuration."""

    def save_as_epub(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
        """Save the EPUB export and return the implementing object.

        NOTE(review): presumably *name* is the output file name and
        *output_dir* the target directory; confirm against implementations.
        """
# Maps a lower-case source suffix to the converter class that yields an XLSX document.
_converter_factory: dict[
    str, Type[X2XlsxConverter | ConverterIdentity]] = {
    ".csv": ConverterCsv2Xlsx,
    ".xlsx": ConverterIdentity
}

def _get_document_xlsx(self, document: Document) -> Document:
    """Return *document* converted to XLSX by the converter registered for its suffix.

    The suffix is matched case-insensitively, so ``.CSV`` and ``.XLSX`` are
    accepted as well as their lower-case forms.

    Args:
        document: The original input document.

    Returns:
        The document produced by the selected converter's ``convert``.

    Raises:
        ValueError: If no converter is registered for the document's suffix.
    """
    suffix = document.suffix
    # The table keys are lower-case; normalise only real strings so a missing
    # suffix still ends in the ValueError below rather than an AttributeError.
    lookup_key = suffix.lower() if isinstance(suffix, str) else suffix
    converter_type = self._converter_factory.get(lookup_key)
    if converter_type is None:
        raise ValueError(f"Xlsx工作流不支持{suffix}格式文件")
    converter = converter_type()
    return converter.convert(document)
def save_as_csv(self, name: str = None, output_dir: Path | str = "./output",
                _: ExporterConfig | None = None) -> Self:
    """Save the result as CSV through ``self._save`` and return ``self``.

    A fresh ``Xlsx2CsvExporter`` is passed to ``self._save`` together with
    *name* and *output_dir*. The third parameter is accepted but not used.
    """
    csv_exporter = Xlsx2CsvExporter()
    self._save(exporter=csv_exporter, name=name, output_dir=output_dir)
    return self
} dependencies = [ { name = "beautifulsoup4" }, + { name = "chardet" }, { name = "fastapi", extra = ["standard"] }, { name = "httpx" }, { name = "json-repair" }, @@ -346,6 +356,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "beautifulsoup4", specifier = ">=4.13.4" }, + { name = "chardet", specifier = ">=5.2.0" }, { name = "docling", marker = "extra == 'docling'", specifier = ">=2.40.0" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" }, { name = "httpx", specifier = "==0.27.2" },