增加read方法
This commit is contained in:
7
.idea/workspace.xml
generated
7
.idea/workspace.xml
generated
@@ -6,7 +6,8 @@
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/utils/convert.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/convert.py" afterDir="false" />
|
||||
</list>
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
@@ -369,12 +370,12 @@
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746677277745" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746780691113" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746851336881" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746857730545" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746596984213" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746785064481" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746857126577" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746600434803" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746805063874" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
import markdown2
|
||||
from docling.datamodel.document import DocumentStream
|
||||
|
||||
from docutranslate.Agents import Agent, AgentArgs
|
||||
from docutranslate.Agents import MDRefineAgent, MDTranslateAgent
|
||||
@@ -70,6 +72,27 @@ class FileTranslater:
|
||||
}
|
||||
return result
|
||||
|
||||
def read(self, name: str, file: bytes, formula=False, code=False, save=False,
         save_format: Literal["markdown", "html"] = "markdown", refine=False,
         refine_agent: Agent | None = None):
    """Load an in-memory document into ``self.markdown``.

    Args:
        name: File name of the document; its suffix decides whether the
            bytes are treated as markdown, and its stem names saved output.
        file: Raw bytes of the document.
        formula: Forwarded to ``file2markdown_embed_images``.
        code: Forwarded to ``file2markdown_embed_images``.
        save: When true, write the result via ``save_as_html`` or
            ``save_as_markdown``.
        save_format: ``"html"`` selects ``save_as_html``; any other value
            selects ``save_as_markdown``.
        refine: When true, call ``refine_markdown_by_agent`` after loading.
        refine_agent: Agent passed to ``refine_markdown_by_agent``.

    Returns:
        ``self``, so calls can be chained.
    """
    file_path = Path(name)
    # Markdown input is used directly; no conversion is required.
    if file_path.suffix == ".md":
        self.markdown = file.decode()
    else:
        # ``name`` is only a label for in-memory bytes, so take its final
        # component without resolving it against the filesystem.
        print(f"正在将{file_path.name}转换为markdown")
        # Wrap the bytes in a stream only on the branch that converts.
        ds = DocumentStream(name=name, stream=BytesIO(file))
        self.markdown = file2markdown_embed_images(ds, formula, code, artifacts_path=self.docling_artifact)
        print("已转换为markdown")
    if refine:
        self.refine_markdown_by_agent(refine_agent)
    if save:
        if save_format == "html":
            self.save_as_html(filename=f"{file_path.stem}.html")
        else:
            self.save_as_markdown(filename=f"{file_path.stem}.md")
    return self
|
||||
|
||||
def read_file(self, file_path: Path | str | None = None, formula=False, code=False, save=False,
|
||||
save_format: Literal["markdown", "html"] = "markdown", refine=False,
|
||||
refine_agent: Agent | None = None):
|
||||
@@ -109,12 +132,12 @@ class FileTranslater:
|
||||
print("markdown已修正")
|
||||
return self.markdown
|
||||
|
||||
def translate_markdown_by_agent(self, translate_agent: Agent | None = None,to_lang="中文"):
|
||||
def translate_markdown_by_agent(self, translate_agent: Agent | None = None, to_lang="中文"):
|
||||
print("正在翻译markdown")
|
||||
self._mask_uris_in_markdown()
|
||||
chuncks = self._split_markdown_into_chunks()
|
||||
if translate_agent is None:
|
||||
translate_agent = MDTranslateAgent(to_lang=to_lang,**self.default_agent_params())
|
||||
translate_agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params())
|
||||
result: list[str] = translate_agent.send_prompts(chuncks)
|
||||
self.markdown = "\n".join(result)
|
||||
self._unmask_uris_in_markdown()
|
||||
@@ -231,7 +254,7 @@ class FileTranslater:
|
||||
self.read_file(file_path, formula=formula, code=code)
|
||||
if refine:
|
||||
self.refine_markdown_by_agent(refine_agent)
|
||||
self.translate_markdown_by_agent(translate_agent,to_lang=to_lang)
|
||||
self.translate_markdown_by_agent(translate_agent, to_lang=to_lang)
|
||||
if output_format == "markdown":
|
||||
filename = f"{file_path.stem}_{to_lang}.md"
|
||||
self.save_as_markdown(filename=filename, output_dir=output_dir)
|
||||
|
||||
@@ -5,13 +5,12 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from pathlib import Path
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.datamodel.document import DocumentStream
|
||||
|
||||
IMAGE_RESOLUTION_SCALE = 4
|
||||
|
||||
|
||||
def file2markdown_embed_images(file_path: Path | str, formula=False, code=False,artifacts_path:Path|str|None=None) -> str:
|
||||
if isinstance(file_path,str):
|
||||
file_path=Path(file_path)
|
||||
def file2markdown_embed_images(file_path: Path | str|DocumentStream, formula=False, code=False,artifacts_path:Path|str|None=None) -> str:
|
||||
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
|
||||
# pipeline_options.do_ocr=False
|
||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
|
||||
Reference in New Issue
Block a user