增加read方法
This commit is contained in:
7
.idea/workspace.xml
generated
7
.idea/workspace.xml
generated
@@ -6,7 +6,8 @@
|
|||||||
<component name="ChangeListManager">
|
<component name="ChangeListManager">
|
||||||
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
||||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||||
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/docutranslate/utils/convert.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/convert.py" afterDir="false" />
|
||||||
</list>
|
</list>
|
||||||
<option name="SHOW_DIALOG" value="false" />
|
<option name="SHOW_DIALOG" value="false" />
|
||||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||||
@@ -369,12 +370,12 @@
|
|||||||
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746677277745" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746677277745" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746780691113" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746780691113" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746851336881" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746857730545" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746596984213" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746596984213" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746785064481" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746857126577" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746600434803" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746600434803" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746805063874" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
<SUITE FILE_PATH="coverage/filetranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746805063874" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
import markdown2
|
import markdown2
|
||||||
|
from docling.datamodel.document import DocumentStream
|
||||||
|
|
||||||
from docutranslate.Agents import Agent, AgentArgs
|
from docutranslate.Agents import Agent, AgentArgs
|
||||||
from docutranslate.Agents import MDRefineAgent, MDTranslateAgent
|
from docutranslate.Agents import MDRefineAgent, MDTranslateAgent
|
||||||
@@ -70,6 +72,27 @@ class FileTranslater:
|
|||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def read(self, name: str, file: bytes, formula=False, code=False, save=False,
|
||||||
|
save_format: Literal["markdown", "html"] = "markdown", refine=False,
|
||||||
|
refine_agent: Agent | None = None):
|
||||||
|
ds = DocumentStream(name=name, stream=BytesIO(file))
|
||||||
|
file_path = Path(name)
|
||||||
|
# 如果是markdown,直接读取
|
||||||
|
if file_path.suffix == ".md":
|
||||||
|
self.markdown = file.decode()
|
||||||
|
else:
|
||||||
|
print(f"正在将{file_path.resolve().name}转换为markdown")
|
||||||
|
self.markdown = file2markdown_embed_images(ds, formula, code, artifacts_path=self.docling_artifact)
|
||||||
|
print("已转换为markdown")
|
||||||
|
if refine:
|
||||||
|
self.refine_markdown_by_agent(refine_agent)
|
||||||
|
if save:
|
||||||
|
if save_format == "html":
|
||||||
|
self.save_as_html(filename=f"{file_path.stem}.html")
|
||||||
|
else:
|
||||||
|
self.save_as_markdown(filename=f"{file_path.stem}.md")
|
||||||
|
return self
|
||||||
|
|
||||||
def read_file(self, file_path: Path | str | None = None, formula=False, code=False, save=False,
|
def read_file(self, file_path: Path | str | None = None, formula=False, code=False, save=False,
|
||||||
save_format: Literal["markdown", "html"] = "markdown", refine=False,
|
save_format: Literal["markdown", "html"] = "markdown", refine=False,
|
||||||
refine_agent: Agent | None = None):
|
refine_agent: Agent | None = None):
|
||||||
@@ -109,12 +132,12 @@ class FileTranslater:
|
|||||||
print("markdown已修正")
|
print("markdown已修正")
|
||||||
return self.markdown
|
return self.markdown
|
||||||
|
|
||||||
def translate_markdown_by_agent(self, translate_agent: Agent | None = None,to_lang="中文"):
|
def translate_markdown_by_agent(self, translate_agent: Agent | None = None, to_lang="中文"):
|
||||||
print("正在翻译markdown")
|
print("正在翻译markdown")
|
||||||
self._mask_uris_in_markdown()
|
self._mask_uris_in_markdown()
|
||||||
chuncks = self._split_markdown_into_chunks()
|
chuncks = self._split_markdown_into_chunks()
|
||||||
if translate_agent is None:
|
if translate_agent is None:
|
||||||
translate_agent = MDTranslateAgent(to_lang=to_lang,**self.default_agent_params())
|
translate_agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params())
|
||||||
result: list[str] = translate_agent.send_prompts(chuncks)
|
result: list[str] = translate_agent.send_prompts(chuncks)
|
||||||
self.markdown = "\n".join(result)
|
self.markdown = "\n".join(result)
|
||||||
self._unmask_uris_in_markdown()
|
self._unmask_uris_in_markdown()
|
||||||
@@ -231,7 +254,7 @@ class FileTranslater:
|
|||||||
self.read_file(file_path, formula=formula, code=code)
|
self.read_file(file_path, formula=formula, code=code)
|
||||||
if refine:
|
if refine:
|
||||||
self.refine_markdown_by_agent(refine_agent)
|
self.refine_markdown_by_agent(refine_agent)
|
||||||
self.translate_markdown_by_agent(translate_agent,to_lang=to_lang)
|
self.translate_markdown_by_agent(translate_agent, to_lang=to_lang)
|
||||||
if output_format == "markdown":
|
if output_format == "markdown":
|
||||||
filename = f"{file_path.stem}_{to_lang}.md"
|
filename = f"{file_path.stem}_{to_lang}.md"
|
||||||
self.save_as_markdown(filename=filename, output_dir=output_dir)
|
self.save_as_markdown(filename=filename, output_dir=output_dir)
|
||||||
|
|||||||
@@ -5,13 +5,12 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|||||||
from docling_core.types.doc import ImageRefMode
|
from docling_core.types.doc import ImageRefMode
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
from docling.datamodel.document import DocumentStream
|
||||||
|
|
||||||
IMAGE_RESOLUTION_SCALE = 4
|
IMAGE_RESOLUTION_SCALE = 4
|
||||||
|
|
||||||
|
|
||||||
def file2markdown_embed_images(file_path: Path | str, formula=False, code=False,artifacts_path:Path|str|None=None) -> str:
|
def file2markdown_embed_images(file_path: Path | str|DocumentStream, formula=False, code=False,artifacts_path:Path|str|None=None) -> str:
|
||||||
if isinstance(file_path,str):
|
|
||||||
file_path=Path(file_path)
|
|
||||||
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
|
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
|
||||||
# pipeline_options.do_ocr=False
|
# pipeline_options.do_ocr=False
|
||||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||||
|
|||||||
Reference in New Issue
Block a user