MyTools / pages /TranslatePaper.py
Aye10032's picture
添加文献翻译
0ad4048
import json
import os
import shutil
import time
import zipfile
import requests
import streamlit as st
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_text_splitters import MarkdownHeaderTextSplitter
from loguru import logger
from ui.Component import side_bar_links
from utils.Doc2x import pre_upload, put_file, get_status, get_md
# Browser-tab title and icon for this page; 'wide' uses the full page width.
st.set_page_config(page_title='工具箱', page_icon='🔨', layout='wide')

# Render the navigation links provided by ui.Component inside the sidebar.
sidebar = st.sidebar
with sidebar:
    side_bar_links()
def ac_translate(original_text: str):
    """Start a streaming English-to-Chinese translation of a Markdown fragment.

    A two-message chat prompt (fixed system instructions plus a human
    message containing ``original_text``) is piped into a ``ChatOpenAI``
    client configured for the ``glm-4-flash`` model at the
    ``open.bigmodel.cn`` endpoint. The API key is read from
    ``st.secrets['gml_key']``.

    Args:
        original_text: The Markdown text to translate; substituted into the
            ``{original_text}`` placeholder of the human message.

    Returns:
        Whatever ``(prompt | llm).stream(...)`` returns: an iterable of
        response chunks that the caller is expected to consume.
    """
    system_instructions = """你是一个能够高效准确翻译学术论文的助手。你的任务是将用户提供的学术论文从英文翻译成中文,并保留原始的Markdown格式完全不变。为了确保输出结果正确,请注意以下规则:
1. **保留Markdown结构**:包括标题、列表、表格、代码块、引用、脚注等,确保格式一致。
2. **翻译正文内容**:将正文内容翻译成流畅自然的中文,但保留学术术语、专有名词或引用标记(如`[引用]`、`<term>`),除非用户另有要求。
3. **避免错误**:
- 确保翻译结果中仅保留与原始Markdown相同的标题结构(如`#`等符号的使用)。
- 不要误将普通段落转换为标题。
4. **输出为纯Markdown文本**:不要添加额外的Markdown代码块标记(如`````markdown````或类似结构),直接返回翻译后的Markdown内容。
在完成任务时,请专注于准确性和格式一致性。如果有任何不确定的内容,请保持原文不变。
"""
    human_template = """请将以下Markdown格式的学术论文从英文翻译成中文,并严格按照以下要求处理:
1. 保留原始Markdown结构,包括标题、列表、表格、代码块等。
2. 翻译正文内容为流畅自然的中文,但保留特定术语或标记(如`[引用]`、`<term>`)不被翻译。
3. 确保段落与标题的区分准确,避免错误地将正文内容标记为标题。
4. 直接返回翻译后的Markdown文本,不要额外包裹在代码块标记中。
以下是需要翻译的内容:
{original_text}
"""

    # The system message is passed as a ready-made message object; only the
    # human message is a template with a placeholder to fill in.
    translation_prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessage(content=system_instructions),
            ("human", human_template),
        ]
    )

    translator = ChatOpenAI(
        model_name="glm-4-flash",
        openai_api_base='https://open.bigmodel.cn/api/paas/v4/',
        temperature=0.5,
        openai_api_key=st.secrets['gml_key'],
        streaming=True,
    )

    return (translation_prompt | translator).stream({"original_text": original_text})
def _front_matter_length(lines):
    """Return how many leading lines form a ``---``-delimited front-matter block.

    Returns 0 when ``lines`` is empty, does not start with ``---``, or has no
    closing ``---`` (a lone leading ``---`` is then treated as ordinary
    content rather than as an unterminated header).
    """
    if not lines or lines[0] != '---':
        return 0
    try:
        return lines.index('---', 1) + 1
    except ValueError:
        return 0


# --- Page widgets -----------------------------------------------------------
pdf_file = st.file_uploader(
    '选择PDF文件',
    type=['pdf'],
)
col1, col2, _ = st.columns([1, 1, 5])
# Fixed-height box into which the translation is streamed as it is produced.
translate_container = st.container(height=550, border=True)

# --- Upload -> parse -> export -> translate -> archive ----------------------
# The button is disabled until a PDF has been selected, so pdf_file is set
# whenever this branch runs.
if upload_btn := col1.button('翻译', disabled=pdf_file is None):
    doc2x_key = st.secrets['doc2x']
    try:
        os.makedirs('tmp/translate', exist_ok=True)
        # Use only the base name so a crafted upload name cannot escape tmp/.
        pdf_path = os.path.join('tmp', os.path.basename(pdf_file.name))
        with open(pdf_path, 'wb') as f:
            f.write(pdf_file.getbuffer())

        with st.status('pre upload...'):
            upload_data = pre_upload(doc2x_key)
            url = upload_data["url"]
            uid = upload_data["uid"]
            put_file(pdf_path, url)

        # Poll the parse job until it reaches a terminal state.
        progress_bar = st.progress(0, '解析中...')
        while True:
            status_data = get_status(uid, doc2x_key)
            status = status_data["status"]
            if status == "success":
                result = status_data["result"]
                # NOTE(review): this dump into the working directory looks
                # like a debugging aid; kept because other tooling may read it.
                with open("result.json", "w") as f:
                    json.dump(result, f)
                break
            if status == "failed":
                detail = status_data["detail"]
                raise Exception(f"parse failed: {detail}")
            if status == "processing":
                progress_bar.progress(status_data["progress"], '解析中...')
            # Sleep on every non-terminal status (including unrecognized
            # ones) so the loop can never spin without a delay.
            time.sleep(3)
        progress_bar.progress(100, '解析完毕')

        # The uid returned by pre_upload identifies this document; it must
        # not be replaced, otherwise a different document would be exported.
        logger.info(uid)

        with st.status('导出为markdown'):
            # Presumably the third argument selects "start export" (True)
            # versus "query export status" (False) -- confirm in utils.Doc2x.
            get_md(uid, doc2x_key, True)
            while True:
                status_data = get_md(uid, doc2x_key, False)
                status = status_data["status"]
                if status == "success":
                    download = requests.get(status_data["url"], timeout=60)
                    # Fail loudly instead of saving an HTTP error page as a zip.
                    download.raise_for_status()
                    with open('tmp/downloaded_file.zip', 'wb') as f:
                        f.write(download.content)
                    break
                if status == "failed":
                    detail = status_data["detail"]
                    raise Exception(f"export failed: {detail}")
                time.sleep(3)

            with zipfile.ZipFile('tmp/downloaded_file.zip', 'r') as zip_ref:
                zip_ref.extractall('tmp/translate')

        with open('tmp/translate/origin.md', 'r', encoding='utf-8') as md_file:
            md_docs = md_file.read().splitlines()

        front_matter_end = _front_matter_length(md_docs)
        with open('tmp/translate/translated.md', 'w', encoding='utf-8') as f:
            # Copy the front matter verbatim, one line per line (including
            # the opening '---').
            f.writelines(f"{line}\n" for line in md_docs[:front_matter_end])

            # Translate the body line by line.
            for doc in md_docs[front_matter_end:]:
                # Images, horizontal rules and blank lines carry no text to
                # translate; pass them through unchanged.
                if doc.startswith("![") or doc == '---' or not doc.strip():
                    f.write(f"{doc}\n")
                    continue

                response = ac_translate(doc)
                translate_result = translate_container.write_stream(response)
                if doc.startswith('#'):
                    f.write(f"{translate_result}\n")
                else:
                    # The source line was not a heading, so drop any leading
                    # '#' the model may have added.
                    f.write(f"{translate_result.lstrip('#')}\n \n")

        # Pack the extracted files plus translated.md into ./translate.zip.
        shutil.make_archive(
            'translate',
            'zip',
            'tmp/translate',
            './'
        )
    finally:
        # Always remove the scratch directory so files left by a failed run
        # cannot end up in the next archive.
        shutil.rmtree('tmp', ignore_errors=True)
# Offer translate.zip for download whenever it exists in the working
# directory; the browser receives it under the name 'downloaded_file.zip'.
archive_path = 'translate.zip'
if os.path.exists(archive_path):
    with open(archive_path, 'rb') as archive_file:
        col2.download_button(
            "下载",
            data=archive_file,
            file_name='downloaded_file.zip',
            mime="application/octet-stream",
            type="primary",
        )