使用charset_normalizer替代chardet

This commit is contained in:
xunbu
2026-01-17 17:32:25 +08:00
parent 96e9404a76
commit 010ef31372
3 changed files with 2 additions and 15 deletions

View File

@@ -7,8 +7,7 @@ from dataclasses import dataclass
from io import BytesIO, StringIO
from typing import Hashable
# Import charset_normalizer for encoding detection
import chardet
import charset_normalizer
import openpyxl
from docutranslate.converter.x2xlsx.base import X2XlsxConverter, X2XlsxConverterConfig
@@ -47,7 +46,7 @@ class ConverterCsv2Xlsx(X2XlsxConverter):
try:
# --- 1. 自动检测文件编码 ---
# 为提高性能,只取文件头部一部分进行检测
detection_result = chardet.detect(document.content[:4096])
detection_result = charset_normalizer.detect(document.content[:4096])
encoding = detection_result['encoding'] or 'utf-8' # 提供一个默认值
confidence = detection_result['confidence']
self.logger.info(f"检测到文件编码为: {encoding} (置信度: {confidence:.2%})")

View File

@@ -16,7 +16,6 @@ dependencies = [
"beautifulsoup4>=4.13.4",
"markdown>=3.8.2",
"pymdown-extensions>=10.16.1",
"chardet>=5.2.0",
"pysubs2>=1.8.0",
"httpx>=0.28.1",
"python-pptx>=1.0.2",

11
uv.lock generated
View File

@@ -101,15 +101,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438 },
]
[[package]]
name = "chardet"
version = "5.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385 },
]
[[package]]
name = "charset-normalizer"
version = "3.4.4"
@@ -379,7 +370,6 @@ name = "docutranslate"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "chardet" },
{ name = "fastapi", extra = ["standard"] },
{ name = "httpx" },
{ name = "json-repair" },
@@ -414,7 +404,6 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "beautifulsoup4", specifier = ">=4.13.4" },
{ name = "chardet", specifier = ">=5.2.0" },
{ name = "docling", marker = "extra == 'docling'", specifier = ">=2.40.0" },
{ name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" },
{ name = "hf-xet", marker = "extra == 'docling'", specifier = ">=1.1.10" },