File size: 6,281 Bytes
0ad4048 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
import json
import os
import shutil
import time
import zipfile
import requests
import streamlit as st
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_text_splitters import MarkdownHeaderTextSplitter
from loguru import logger
from ui.Component import side_bar_links
from utils.Doc2x import pre_upload, put_file, get_status, get_md
# Page-level settings for this Streamlit page: title, icon and wide layout.
st.set_page_config(page_title='工具箱', page_icon='🔨', layout='wide')

# Render the shared navigation links (from ui.Component) inside the sidebar.
with st.sidebar:
    side_bar_links()
def ac_translate(original_text: str):
    """Start a streaming English-to-Chinese translation of a Markdown fragment.

    The fragment is substituted into the ``{original_text}`` placeholder of
    the human prompt and sent, together with a fixed system prompt, to the
    ``glm-4-flash`` model through the OpenAI-compatible endpoint configured
    below.

    Args:
        original_text: Markdown text to translate.

    Returns:
        The iterator produced by ``chain.stream(...)``; the caller consumes
        it to receive the model output chunk by chunk.
    """
    # System prompt: translate while keeping the Markdown structure intact.
    system_prompt = """你是一个能够高效准确翻译学术论文的助手。你的任务是将用户提供的学术论文从英文翻译成中文,并保留原始的Markdown格式完全不变。为了确保输出结果正确,请注意以下规则:
1. **保留Markdown结构**:包括标题、列表、表格、代码块、引用、脚注等,确保格式一致。
2. **翻译正文内容**:将正文内容翻译成流畅自然的中文,但保留学术术语、专有名词或引用标记(如`[引用]`、`<term>`),除非用户另有要求。
3. **避免错误**:
- 确保翻译结果中仅保留与原始Markdown相同的标题结构(如`#`等符号的使用)。
- 不要误将普通段落转换为标题。
4. **输出为纯Markdown文本**:不要添加额外的Markdown代码块标记(如`````markdown````或类似结构),直接返回翻译后的Markdown内容。
在完成任务时,请专注于准确性和格式一致性。如果有任何不确定的内容,请保持原文不变。
"""
    # Human prompt template; ``{original_text}`` is filled in per call.
    human_template = """请将以下Markdown格式的学术论文从英文翻译成中文,并严格按照以下要求处理:
1. 保留原始Markdown结构,包括标题、列表、表格、代码块等。
2. 翻译正文内容为流畅自然的中文,但保留特定术语或标记(如`[引用]`、`<term>`)不被翻译。
3. 确保段落与标题的区分准确,避免错误地将正文内容标记为标题。
4. 直接返回翻译后的Markdown文本,不要额外包裹在代码块标记中。
以下是需要翻译的内容:
{original_text}
"""
    translation_prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessage(content=system_prompt),
            ("human", human_template),
        ]
    )

    # API key is read from Streamlit secrets under the key 'gml_key'.
    chat_model = ChatOpenAI(
        model_name="glm-4-flash",
        openai_api_base='https://open.bigmodel.cn/api/paas/v4/',
        temperature=0.5,
        openai_api_key=st.secrets['gml_key'],
        streaming=True
    )

    # Pipe the prompt into the model and hand back the lazy chunk stream.
    return (translation_prompt | chat_model).stream(
        {"original_text": original_text}
    )
# ---------------------------------------------------------------------------
# Page body: upload a PDF, parse it with Doc2x, export the result as Markdown,
# translate the Markdown line by line, and offer the translated bundle as a
# zip download.
# ---------------------------------------------------------------------------
pdf_file = st.file_uploader(
    '选择PDF文件',
    type=['pdf'],
)
col1, col2, _ = st.columns([1, 1, 5])
translate_container = st.container(height=550, border=True)

if upload_btn := col1.button('翻译', disabled=pdf_file is None):
    # Start from an empty work directory so leftovers from a run that failed
    # before the final cleanup cannot end up in this run's archive.
    shutil.rmtree('tmp/translate', ignore_errors=True)
    os.makedirs('tmp/translate', exist_ok=True)

    # The file name is supplied by the client; keep only its last path
    # component so it cannot point outside of tmp/.
    pdf_path = os.path.join('tmp', os.path.basename(pdf_file.name))
    with open(pdf_path, 'wb') as f:
        f.write(pdf_file.getbuffer())

    with st.status('pre upload...'):
        upload_data = pre_upload(st.secrets['doc2x'])
        url = upload_data["url"]
        uid = upload_data["uid"]
        put_file(pdf_path, url)

    # Poll the parse job until it reaches a terminal state.
    progress_bar = st.progress(0, '解析中...')
    while True:
        status_data = get_status(uid, st.secrets['doc2x'])
        status = status_data["status"]
        if status == "success":
            result = status_data["result"]
            # NOTE(review): result.json is written to the working directory
            # and not read again in this file — confirm it is still needed.
            with open("result.json", "w") as f:
                json.dump(result, f)
            break
        if status == "failed":
            detail = status_data["detail"]
            raise Exception(f"parse failed: {detail}")
        if status == "processing":
            progress_bar.progress(status_data["progress"], '解析中...')
        # Wait on every non-terminal status (including ones not listed
        # above) so the loop never hammers the API without a pause.
        time.sleep(3)
    progress_bar.progress(100, '解析完毕')

    # Use the uid returned by pre_upload for the export; it must refer to the
    # document that was just parsed.
    logger.info(uid)

    with st.status('导出为markdown'):
        # Presumably the third argument selects "start export" (True) versus
        # "query export status" (False) — confirm against utils.Doc2x.get_md.
        get_md(uid, st.secrets['doc2x'], True)
        while True:
            status_data = get_md(uid, st.secrets['doc2x'], False)
            status = status_data["status"]
            if status == "success":
                result_url = status_data["url"]
                # Bound the download and fail loudly on an HTTP error instead
                # of saving an error page as the zip file.
                download = requests.get(result_url, timeout=60)
                download.raise_for_status()
                with open('tmp/downloaded_file.zip', 'wb') as f:
                    f.write(download.content)
                break
            if status == "failed":
                detail = status_data["detail"]
                raise Exception(f"export failed: {detail}")
            time.sleep(3)

        with zipfile.ZipFile('tmp/downloaded_file.zip', 'r') as zip_ref:
            zip_ref.extractall('tmp/translate')

    with open('tmp/translate/origin.md', 'r', encoding='utf-8') as md_file:
        md_docs = md_file.read().splitlines()

    with open('tmp/translate/translated.md', 'w+', encoding='utf-8') as f:
        # Copy a leading front-matter block ('---' ... '---') verbatim. Only
        # treat it as front matter when a closing delimiter exists; otherwise
        # the lines fall through to the regular per-line handling below.
        if md_docs and md_docs[0] == '---':
            try:
                closing_index = md_docs.index('---', 1)
            except ValueError:
                closing_index = None
            if closing_index is not None:
                front_matter = md_docs[:closing_index + 1]
                md_docs = md_docs[closing_index + 1:]
                f.writelines(f"{line}\n" for line in front_matter)

        for doc in md_docs:
            # Image lines, horizontal rules and blank lines are not translated.
            if doc.startswith("![") or doc == '---' or len(doc) == 0:
                f.write(f"{doc}\n")
                continue
            chunks = ac_translate(doc)
            # write_stream renders the chunks live in the container and
            # returns the accumulated text.
            translate_result = translate_container.write_stream(chunks)
            if doc.startswith('#'):
                f.write(f"{translate_result}\n")
            else:
                # The source line was not a heading, so drop any leading '#'
                # the model may have added.
                f.write(f"{translate_result.lstrip('#')}\n \n")

    # Pack the work directory into ./translate.zip, then drop the scratch tree.
    shutil.make_archive(
        'translate',
        'zip',
        'tmp/translate',
        './'
    )
    shutil.rmtree('tmp')

# Offer the most recent archive for download whenever one exists.
if os.path.exists('translate.zip'):
    with open('translate.zip', 'rb') as f:
        col2.download_button(
            "下载",
            data=f,
            type="primary",
            file_name='downloaded_file.zip',
            mime="application/octet-stream",
        )
|