使用charset_normalizer替代chardet
This commit is contained in:
@@ -7,8 +7,7 @@ from dataclasses import dataclass
|
||||
from io import BytesIO, StringIO
|
||||
from typing import Hashable
|
||||
|
||||
# 引入 chardet 用于编码检测
|
||||
import chardet
|
||||
import charset_normalizer
|
||||
import openpyxl
|
||||
|
||||
from docutranslate.converter.x2xlsx.base import X2XlsxConverter, X2XlsxConverterConfig
|
||||
@@ -47,7 +46,7 @@ class ConverterCsv2Xlsx(X2XlsxConverter):
|
||||
try:
|
||||
# --- 1. 自动检测文件编码 ---
|
||||
# 为提高性能,只取文件头部一部分进行检测
|
||||
detection_result = chardet.detect(document.content[:4096])
|
||||
detection_result = charset_normalizer.detect(document.content[:4096])
|
||||
encoding = detection_result['encoding'] or 'utf-8' # 提供一个默认值
|
||||
confidence = detection_result['confidence']
|
||||
self.logger.info(f"检测到文件编码为: {encoding} (置信度: {confidence:.2%})")
|
||||
|
||||
@@ -16,7 +16,6 @@ dependencies = [
|
||||
"beautifulsoup4>=4.13.4",
|
||||
"markdown>=3.8.2",
|
||||
"pymdown-extensions>=10.16.1",
|
||||
"chardet>=5.2.0",
|
||||
"pysubs2>=1.8.0",
|
||||
"httpx>=0.28.1",
|
||||
"python-pptx>=1.0.2",
|
||||
|
||||
uv.lock (generated file, 11 lines changed)
@@ -101,15 +101,6 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chardet"
|
||||
version = "5.2.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "3.4.4"
|
||||
@@ -379,7 +370,6 @@ name = "docutranslate"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "chardet" },
|
||||
{ name = "fastapi", extra = ["standard"] },
|
||||
{ name = "httpx" },
|
||||
{ name = "json-repair" },
|
||||
@@ -414,7 +404,6 @@ dev = [
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "beautifulsoup4", specifier = ">=4.13.4" },
|
||||
{ name = "chardet", specifier = ">=5.2.0" },
|
||||
{ name = "docling", marker = "extra == 'docling'", specifier = ">=2.40.0" },
|
||||
{ name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" },
|
||||
{ name = "hf-xet", marker = "extra == 'docling'", specifier = ">=1.1.10" },
|
||||
|
||||
Reference in New Issue
Block a user