使用charset_normalizer替代chardet
This commit is contained in:
@@ -7,8 +7,7 @@ from dataclasses import dataclass
|
|||||||
from io import BytesIO, StringIO
|
from io import BytesIO, StringIO
|
||||||
from typing import Hashable
|
from typing import Hashable
|
||||||
|
|
||||||
# 引入 chardet 用于编码检测
|
import charset_normalizer
|
||||||
import chardet
|
|
||||||
import openpyxl
|
import openpyxl
|
||||||
|
|
||||||
from docutranslate.converter.x2xlsx.base import X2XlsxConverter, X2XlsxConverterConfig
|
from docutranslate.converter.x2xlsx.base import X2XlsxConverter, X2XlsxConverterConfig
|
||||||
@@ -47,7 +46,7 @@ class ConverterCsv2Xlsx(X2XlsxConverter):
|
|||||||
try:
|
try:
|
||||||
# --- 1. 自动检测文件编码 ---
|
# --- 1. 自动检测文件编码 ---
|
||||||
# 为提高性能,只取文件头部一部分进行检测
|
# 为提高性能,只取文件头部一部分进行检测
|
||||||
detection_result = chardet.detect(document.content[:4096])
|
detection_result = charset_normalizer.detect(document.content[:4096])
|
||||||
encoding = detection_result['encoding'] or 'utf-8' # 提供一个默认值
|
encoding = detection_result['encoding'] or 'utf-8' # 提供一个默认值
|
||||||
confidence = detection_result['confidence']
|
confidence = detection_result['confidence']
|
||||||
self.logger.info(f"检测到文件编码为: {encoding} (置信度: {confidence:.2%})")
|
self.logger.info(f"检测到文件编码为: {encoding} (置信度: {confidence:.2%})")
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ dependencies = [
|
|||||||
"beautifulsoup4>=4.13.4",
|
"beautifulsoup4>=4.13.4",
|
||||||
"markdown>=3.8.2",
|
"markdown>=3.8.2",
|
||||||
"pymdown-extensions>=10.16.1",
|
"pymdown-extensions>=10.16.1",
|
||||||
"chardet>=5.2.0",
|
|
||||||
"pysubs2>=1.8.0",
|
"pysubs2>=1.8.0",
|
||||||
"httpx>=0.28.1",
|
"httpx>=0.28.1",
|
||||||
"python-pptx>=1.0.2",
|
"python-pptx>=1.0.2",
|
||||||
|
|||||||
11
uv.lock
generated
11
uv.lock
generated
@@ -101,15 +101,6 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438 },
|
{ url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438 },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "chardet"
|
|
||||||
version = "5.2.0"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618 }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385 },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "charset-normalizer"
|
name = "charset-normalizer"
|
||||||
version = "3.4.4"
|
version = "3.4.4"
|
||||||
@@ -379,7 +370,6 @@ name = "docutranslate"
|
|||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "beautifulsoup4" },
|
{ name = "beautifulsoup4" },
|
||||||
{ name = "chardet" },
|
|
||||||
{ name = "fastapi", extra = ["standard"] },
|
{ name = "fastapi", extra = ["standard"] },
|
||||||
{ name = "httpx" },
|
{ name = "httpx" },
|
||||||
{ name = "json-repair" },
|
{ name = "json-repair" },
|
||||||
@@ -414,7 +404,6 @@ dev = [
|
|||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "beautifulsoup4", specifier = ">=4.13.4" },
|
{ name = "beautifulsoup4", specifier = ">=4.13.4" },
|
||||||
{ name = "chardet", specifier = ">=5.2.0" },
|
|
||||||
{ name = "docling", marker = "extra == 'docling'", specifier = ">=2.40.0" },
|
{ name = "docling", marker = "extra == 'docling'", specifier = ">=2.40.0" },
|
||||||
{ name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" },
|
{ name = "fastapi", extras = ["standard"], specifier = ">=0.115.12" },
|
||||||
{ name = "hf-xet", marker = "extra == 'docling'", specifier = ">=1.1.10" },
|
{ name = "hf-xet", marker = "extra == 'docling'", specifier = ">=1.1.10" },
|
||||||
|
|||||||
Reference in New Issue
Block a user