workflow增加csv支持

This commit is contained in:
xunbu
2025-08-23 16:27:22 +08:00
parent 661f249874
commit 111599926d
11 changed files with 251 additions and 26 deletions

View File

@@ -0,0 +1,11 @@
from docutranslate.converter.base import Converter
from docutranslate.ir.document import Document
class ConverterIdentity(Converter):
    """Pass-through converter: the input's content, suffix and stem are forwarded unchanged."""

    @staticmethod
    def _rewrap(document: Document) -> Document:
        """Build the output via ``Document.from_bytes`` from the input's own fields."""
        return Document.from_bytes(
            content=document.content,
            suffix=document.suffix,
            stem=document.stem,
        )

    def convert(self, document: Document) -> Document:
        """Return ``Document.from_bytes`` applied to the input's content, suffix and stem."""
        return self._rewrap(document)

    async def convert_async(self, document: Document) -> Document:
        """Coroutine variant of :meth:`convert`; performs the same call without awaiting anything."""
        return self._rewrap(document)

View File

@@ -1,15 +0,0 @@
from docutranslate.converter.x2md.base import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
class ConverterIdentity(X2MarkdownConverter):
    """Identity converter for Markdown input: wraps the bytes as a ``MarkdownDocument``."""

    # TODO: support markdown_zip input

    @staticmethod
    def _as_markdown(document: Document) -> MarkdownDocument:
        """Build a ``MarkdownDocument`` from the input content, forcing the ``.md`` suffix."""
        return MarkdownDocument.from_bytes(
            content=document.content,
            suffix=".md",
            stem=document.stem,
        )

    def convert(self, document: Document) -> MarkdownDocument:
        """Return the input's content and stem as a ``MarkdownDocument`` with suffix ``.md``."""
        return self._as_markdown(document)

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Coroutine variant of :meth:`convert`; performs the same call without awaiting anything."""
        return self._as_markdown(document)

    def support_format(self) -> list[str]:
        """Return the accepted input suffixes (only ``.md``)."""
        supported_suffixes = [".md"]
        return supported_suffixes

View File

@@ -0,0 +1,30 @@
from abc import abstractmethod
from dataclasses import dataclass
from typing import Hashable
from docutranslate.converter.base import Converter, ConverterConfig
from docutranslate.ir.document import Document
@dataclass(kw_only=True)
class X2XlsxConverterConfig(ConverterConfig):
    """Base configuration for converters that produce xlsx output; declares no fields of its own."""

    @abstractmethod
    def gethash(self) -> Hashable:
        """Return a hashable value for this configuration (to be provided by subclasses)."""
class X2XlsxConverter(Converter):
    """Abstract base for converters that turn files of other formats into xlsx."""

    @abstractmethod
    def convert(self, document: Document) -> Document:
        """Synchronously convert ``document`` and return the resulting ``Document``."""

    @abstractmethod
    async def convert_async(self, document: Document) -> Document:
        """Asynchronously convert ``document`` and return the resulting ``Document``."""

    @abstractmethod
    def support_format(self) -> list[str]:
        """Return the list of source formats this converter supports."""

View File

@@ -0,0 +1,110 @@
import asyncio
import csv
import logging
from io import BytesIO, StringIO
# 引入 chardet 用于编码检测
import chardet
import openpyxl
from docutranslate.converter.x2xlsx.base import X2XlsxConverter
from docutranslate.ir.document import Document
# 配置一个基本的日志记录器(如果您的项目尚未配置)
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class ConverterCsv2Xlsx(X2XlsxConverter):
    """
    Convert CSV documents into XLSX documents.

    Features:
    - Rows are written through openpyxl's ``write_only`` workbook mode.
    - The text encoding is detected with chardet.
    - The CSV dialect (e.g. the delimiter) is detected with ``csv.Sniffer``.
    - Every stage is logged; failures are logged and re-raised.
    """

    def convert(self, document: Document) -> Document:
        """
        Synchronously convert a CSV ``Document`` into an XLSX ``Document``.

        Args:
            document: The source document; ``document.content`` holds the raw CSV bytes.

        Returns:
            The result of ``Document.from_bytes`` with the workbook bytes, suffix
            ``.xlsx`` and the stem of the input.

        Raises:
            Exception: Any error raised during conversion is logged and re-raised unchanged.
        """
        self.logger.info(f"开始转换文件 {document.name} (大小: {len(document.content)} bytes)")
        try:
            # --- 1. Detect the file encoding ---
            # Only the head of the file is inspected to keep detection cheap.
            detection_result = chardet.detect(document.content[:4096])
            encoding = detection_result['encoding'] or 'utf-8'  # default when nothing is detected
            # Guard against a missing/None confidence so the percent format below cannot fail.
            confidence = detection_result.get('confidence') or 0.0
            self.logger.info(f"检测到文件编码为: {encoding} (置信度: {confidence:.2%})")

            # --- 2. Decode and wrap the text in a stream ---
            try:
                decoded_content = document.content.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                # LookupError covers a detected encoding name that Python has no codec for.
                self.logger.warning(f"使用检测到的编码 '{encoding}' 解码失败,尝试使用 'utf-8'")
                decoded_content = document.content.decode('utf-8', errors='replace')

            # newline='' is what the csv module requires of its input: line endings are
            # recognised universally (including bare '\r') but handed to the reader
            # untranslated. With the StringIO default, lines would only split on '\n'.
            csv_text_stream = StringIO(decoded_content, newline='')

            # --- 3. Detect the CSV dialect (e.g. the delimiter) ---
            try:
                # The sniffer needs some sample data and may fail on very small files.
                dialect = csv.Sniffer().sniff(csv_text_stream.read(2048))
                self.logger.info(f"检测到CSV分隔符为: '{dialect.delimiter}'")
            except csv.Error:
                self.logger.warning("无法自动识别CSV方言将使用默认的逗号分隔符。")
                dialect = 'excel'  # fall back to the default dialect
            # Rewind in both cases: the sample read above consumed the start of the stream.
            csv_text_stream.seek(0)

            csv_reader = csv.reader(csv_text_stream, dialect)

            # --- 4. Create the workbook in write-only mode ---
            wb = openpyxl.Workbook(write_only=True)
            ws = wb.create_sheet()

            # --- 5. Copy the CSV rows into the sheet one by one ---
            row_count = 0
            for row_data in csv_reader:
                ws.append(row_data)
                row_count += 1
            self.logger.info(f"共处理 {row_count} 行数据。")

            # --- 6. Serialize the workbook into an in-memory buffer ---
            output_buffer = BytesIO()
            wb.save(output_buffer)
            # getvalue() returns the whole buffer regardless of the stream position,
            # so no rewind is needed here.
            self.logger.info(f"文件 {document.name} 已成功转换为 XLSX 格式。")
            return Document.from_bytes(
                content=output_buffer.getvalue(),
                suffix=".xlsx",
                stem=document.stem,
            )
        except Exception as e:
            self.logger.error(f"转换文件 {document.name} 时发生严重错误: {e}", exc_info=True)
            # Propagate to the caller after logging.
            raise

    async def convert_async(self, document: Document) -> Document:
        """
        Run :meth:`convert` in a worker thread and await its result.

        The conversion is synchronous, blocking work, so it is moved off the
        event loop with ``asyncio.to_thread``.
        """
        self.logger.info(f"为文件 {document.name} 的转换任务创建新线程。")
        return await asyncio.to_thread(self.convert, document)

    def support_format(self) -> list[str]:
        """Return the source file formats this converter supports (only ``.csv``)."""
        return [".csv"]

View File

@@ -0,0 +1,33 @@
from io import BytesIO, StringIO
import openpyxl
import csv
from docutranslate.exporter.xlsx.base import XlsxExporter
from docutranslate.ir.document import Document
class Xlsx2CsvExporter(XlsxExporter):
    """Exporter that writes the active worksheet of an xlsx document out as CSV."""

    def export(self, document: Document) -> Document:
        """
        Serialize the active sheet of ``document`` as UTF-8 encoded CSV.

        The workbook is loaded from ``document.content``; every row of the active
        sheet is written with the default ``csv.writer`` settings, using each
        cell's ``value``. The result is passed to ``Document.from_bytes`` with
        suffix ``.csv`` and the stem of the input.
        """
        # Load the workbook from memory and pick its active worksheet.
        active_sheet = openpyxl.load_workbook(BytesIO(document.content)).active

        # Collect the CSV text in an in-memory buffer, one record per sheet row.
        csv_buffer = StringIO()
        csv.writer(csv_buffer).writerows(
            [cell.value for cell in sheet_row] for sheet_row in active_sheet.rows
        )

        # Encode the text once at the boundary and wrap it as a .csv document.
        return Document.from_bytes(
            content=csv_buffer.getvalue().encode('utf-8'),
            suffix=".csv",
            stem=document.stem,
        )

View File

@@ -36,7 +36,7 @@ class MDZIPExportable(Protocol[T_ExporterConfig]):
@runtime_checkable @runtime_checkable
class MDFormatsExportable(MDZIPExportable[T_ExporterConfig], MDExportable[T_ExporterConfig],Protocol): class MDFormatsExportable(MDZIPExportable[T_ExporterConfig], MDExportable[T_ExporterConfig], Protocol):
... ...
@@ -48,6 +48,7 @@ class TXTExportable(Protocol[T_ExporterConfig]):
def save_as_txt(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: def save_as_txt(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...
@runtime_checkable @runtime_checkable
class JsonExportable(Protocol[T_ExporterConfig]): class JsonExportable(Protocol[T_ExporterConfig]):
def export_to_json(self, config: T_ExporterConfig | None = None) -> str: def export_to_json(self, config: T_ExporterConfig | None = None) -> str:
@@ -56,6 +57,7 @@ class JsonExportable(Protocol[T_ExporterConfig]):
def save_as_json(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: def save_as_json(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...
@runtime_checkable @runtime_checkable
class XlsxExportable(Protocol[T_ExporterConfig]): class XlsxExportable(Protocol[T_ExporterConfig]):
def export_to_xlsx(self, config: T_ExporterConfig | None = None) -> bytes: def export_to_xlsx(self, config: T_ExporterConfig | None = None) -> bytes:
@@ -64,6 +66,16 @@ class XlsxExportable(Protocol[T_ExporterConfig]):
def save_as_xlsx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: def save_as_xlsx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...
@runtime_checkable
class CsvExportable(Protocol[T_ExporterConfig]):
    """Structural interface for objects that can export to CSV and save the result."""

    def export_to_csv(self, config: T_ExporterConfig | None = None) -> bytes:
        """Return the CSV export as bytes; ``config`` is an optional exporter configuration."""
        ...

    def save_as_csv(
        self,
        name: str,
        output_dir: Path | str,
        config: T_ExporterConfig | None = None,
    ) -> Self:
        """Save the CSV export under ``name`` in ``output_dir`` and return the object itself."""
        ...
@runtime_checkable @runtime_checkable
class DocxExportable(Protocol[T_ExporterConfig]): class DocxExportable(Protocol[T_ExporterConfig]):
def export_to_docx(self, config: T_ExporterConfig | None = None) -> bytes: def export_to_docx(self, config: T_ExporterConfig | None = None) -> bytes:
@@ -72,6 +84,7 @@ class DocxExportable(Protocol[T_ExporterConfig]):
def save_as_docx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: def save_as_docx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...
@runtime_checkable @runtime_checkable
class SrtExportable(Protocol[T_ExporterConfig]): class SrtExportable(Protocol[T_ExporterConfig]):
def export_to_srt(self, config: T_ExporterConfig | None = None) -> str: def export_to_srt(self, config: T_ExporterConfig | None = None) -> str:
@@ -80,10 +93,11 @@ class SrtExportable(Protocol[T_ExporterConfig]):
def save_as_srt(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: def save_as_srt(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...
@runtime_checkable @runtime_checkable
class EpubExportable(Protocol[T_ExporterConfig]): class EpubExportable(Protocol[T_ExporterConfig]):
def export_to_epub(self, config: T_ExporterConfig | None = None) -> bytes: def export_to_epub(self, config: T_ExporterConfig | None = None) -> bytes:
... ...
def save_as_epub(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: def save_as_epub(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...

View File

@@ -11,7 +11,7 @@ from docutranslate.ir.markdown_document import MarkdownDocument
if DOCLING_EXIST: if DOCLING_EXIST:
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
from docutranslate.converter.x2md.converter_identity import ConverterIdentity from docutranslate.converter.converter_identity import ConverterIdentity
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
from docutranslate.converter.x2md.base import X2MarkdownConverterConfig, X2MarkdownConverter from docutranslate.converter.x2md.base import X2MarkdownConverterConfig, X2MarkdownConverter
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
@@ -35,7 +35,7 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark
HTMLExportable[MD2HTMLExporterConfig], HTMLExportable[MD2HTMLExporterConfig],
MDFormatsExportable[ExporterConfig]): MDFormatsExportable[ExporterConfig]):
_converter_factory: dict[ _converter_factory: dict[
ConvertEngineType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = { ConvertEngineType, Tuple[Type[X2MarkdownConverter|ConverterIdentity], Type[X2MarkdownConverterConfig]] | None] = {
"mineru": (ConverterMineru, ConverterMineruConfig), "mineru": (ConverterMineru, ConverterMineruConfig),
"identity": (ConverterIdentity, None) "identity": (ConverterIdentity, None)
} }

View File

@@ -1,14 +1,19 @@
import asyncio
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Self from typing import Self, Type
from docutranslate.converter.converter_identity import ConverterIdentity
from docutranslate.converter.x2xlsx.base import X2XlsxConverter
from docutranslate.converter.x2xlsx.converter_csv2xlsx import ConverterCsv2Xlsx
from docutranslate.exporter.base import ExporterConfig from docutranslate.exporter.base import ExporterConfig
from docutranslate.exporter.xlsx.xlsx2csv_exporter import Xlsx2CsvExporter
from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig, Xlsx2HTMLExporter from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig, Xlsx2HTMLExporter
from docutranslate.exporter.xlsx.xlsx2xlsx_exporter import Xlsx2XlsxExporter from docutranslate.exporter.xlsx.xlsx2xlsx_exporter import Xlsx2XlsxExporter
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.xlsx_translator import XlsxTranslatorConfig, XlsxTranslator from docutranslate.translator.ai_translator.xlsx_translator import XlsxTranslatorConfig, XlsxTranslator
from docutranslate.workflow.base import Workflow, WorkflowConfig from docutranslate.workflow.base import Workflow, WorkflowConfig
from docutranslate.workflow.interfaces import HTMLExportable, XlsxExportable from docutranslate.workflow.interfaces import HTMLExportable, XlsxExportable, CsvExportable
@dataclass(kw_only=True) @dataclass(kw_only=True)
@@ -18,7 +23,13 @@ class XlsxWorkflowConfig(WorkflowConfig):
class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExportable[Xlsx2HTMLExporterConfig], class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExportable[Xlsx2HTMLExporterConfig],
XlsxExportable[ExporterConfig]): XlsxExportable[ExporterConfig],CsvExportable[ExporterConfig]):
_converter_factory: dict[
str, Type[X2XlsxConverter | ConverterIdentity]] = {
".csv": ConverterCsv2Xlsx,
".xlsx": ConverterIdentity
}
def __init__(self, config: XlsxWorkflowConfig): def __init__(self, config: XlsxWorkflowConfig):
super().__init__(config=config) super().__init__(config=config)
if config.logger: if config.logger:
@@ -26,20 +37,30 @@ class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExporta
if sub_config: if sub_config:
sub_config.logger = config.logger sub_config.logger = config.logger
def _pre_translate(self, document_original: Document): def _get_document_xlsx(self, document: Document) -> Document:
document = document_original.copy() suffix = document.suffix
converter_type = self._converter_factory.get(suffix)
if converter_type is None:
raise ValueError(f"Xlsx工作流不支持{suffix}格式文件")
converter = converter_type()
return converter.convert(document)
def _pre_translate(self, document_pre_transalte: Document):
document = document_pre_transalte.copy()
translate_config = self.config.translator_config translate_config = self.config.translator_config
translator = XlsxTranslator(translate_config) translator = XlsxTranslator(translate_config)
return document, translator return document, translator
def translate(self) -> Self: def translate(self) -> Self:
document, translator = self._pre_translate(self.document_original) document_xlsx = self._get_document_xlsx(self.document_original)
document, translator = self._pre_translate(document_xlsx)
translator.translate(document) translator.translate(document)
self.document_translated = document self.document_translated = document
return self return self
async def translate_async(self) -> Self: async def translate_async(self) -> Self:
document, translator = self._pre_translate(self.document_original) document_xlsx = await asyncio.to_thread(self._get_document_xlsx, self.document_original)
document, translator = self._pre_translate(document_xlsx)
await translator.translate_async(document) await translator.translate_async(document)
self.document_translated = document self.document_translated = document
return self return self
@@ -53,6 +74,10 @@ class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExporta
docu = self._export(Xlsx2XlsxExporter()) docu = self._export(Xlsx2XlsxExporter())
return docu.content return docu.content
def export_to_csv(self, _: ExporterConfig | None = None) -> bytes:
    """
    Export through ``Xlsx2CsvExporter`` and return the content of the exported document.

    The positional config parameter is accepted but not used.
    """
    return self._export(Xlsx2CsvExporter()).content
def save_as_html(self, name: str = None, output_dir: Path | str = "./output", def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
config: Xlsx2HTMLExporter | None = None) -> Self: config: Xlsx2HTMLExporter | None = None) -> Self:
config = config or self.config.html_exporter_config config = config or self.config.html_exporter_config
@@ -63,3 +88,8 @@ class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExporta
_: ExporterConfig | None = None) -> Self: _: ExporterConfig | None = None) -> Self:
self._save(exporter=Xlsx2XlsxExporter(), name=name, output_dir=output_dir) self._save(exporter=Xlsx2XlsxExporter(), name=name, output_dir=output_dir)
return self return self
def save_as_csv(
    self,
    name: str = None,
    output_dir: Path | str = "./output",
    _: ExporterConfig | None = None,
) -> Self:
    """
    Save through ``self._save`` using ``Xlsx2CsvExporter`` and return ``self``.

    ``name`` and ``output_dir`` are forwarded to ``self._save``; the config
    parameter is accepted but not used.
    """
    csv_exporter = Xlsx2CsvExporter()
    self._save(exporter=csv_exporter, name=name, output_dir=output_dir)
    return self

View File

@@ -17,6 +17,7 @@ dependencies = [
"beautifulsoup4>=4.13.4", "beautifulsoup4>=4.13.4",
"markdown>=3.8.2", "markdown>=3.8.2",
"pymdown-extensions>=10.16.1", "pymdown-extensions>=10.16.1",
"chardet>=5.2.0",
] ]
dynamic = ["version"] dynamic = ["version"]

11
uv.lock generated
View File

@@ -96,6 +96,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5", size = 161216 }, { url = "https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5", size = 161216 },
] ]
[[package]]
name = "chardet"
version = "5.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385 },
]
[[package]] [[package]]
name = "charset-normalizer" name = "charset-normalizer"
version = "3.4.2" version = "3.4.2"
@@ -316,6 +325,7 @@ name = "docutranslate"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "beautifulsoup4" }, { name = "beautifulsoup4" },
{ name = "chardet" },
{ name = "fastapi", extra = ["standard"] }, { name = "fastapi", extra = ["standard"] },
{ name = "httpx" }, { name = "httpx" },
{ name = "json-repair" }, { name = "json-repair" },
@@ -346,6 +356,7 @@ dev = [
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "beautifulsoup4", specifier = ">=4.13.4" }, { name = "beautifulsoup4", specifier = ">=4.13.4" },
{ name = "chardet", specifier = ">=5.2.0" },
{ name = "docling", marker = "extra == 'docling'", specifier = ">=2.40.0" }, { name = "docling", marker = "extra == 'docling'", specifier = ">=2.40.0" },
{ name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" },
{ name = "httpx", specifier = "==0.27.2" }, { name = "httpx", specifier = "==0.27.2" },