添加文献翻译
Browse files
- pages/TextToImage.py +0 -92
- pages/TranslatePaper.py +176 -0
- requirements.txt +1 -2
- ui/Component.py +1 -1
- utils/Doc2x.py +75 -0
pages/TextToImage.py
DELETED
@@ -1,92 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
-
import requests
|
4 |
-
import urllib3
|
5 |
-
|
6 |
-
import streamlit as st
|
7 |
-
from loguru import logger
|
8 |
-
|
9 |
-
from ui.Component import side_bar_links
|
10 |
-
|
11 |
-
# Page-level Streamlit settings: tab title, icon and wide layout.
page_options = dict(page_title='工具箱', page_icon='🔨', layout='wide')
st.set_page_config(**page_options)

# Sidebar: shared navigation links plus the API key field. The field's value
# is kept in st.session_state under the 'api_key' key.
with st.sidebar:
    side_bar_links()

    st.text_input('Api_key', type='password', key='api_key')

st.title('CogView 文生图')
|
23 |
-
|
24 |
-
|
25 |
-
def generate_image_url(prompt: str) -> str:
    """Request a CogView image for *prompt* and return the URL of the result.

    The API key is read from ``st.session_state['api_key']``. When no key is
    available an error is shown on the page and ``None`` is returned, so
    callers must check the result before using it.
    """
    # Imported inside the function, as in the original layout.
    from zhipuai import ZhipuAI

    api = st.session_state.get('api_key')
    # A missing entry yields None and an untouched input yields ''. Both mean
    # "no key", so test truthiness instead of comparing against '' only.
    if not api:
        st.error('请先输入API!')
        return None

    client = ZhipuAI(api_key=api)
    response = client.images.generations(
        model="cogview-3",
        prompt=prompt,
    )
    return response.data[0].url
|
40 |
-
|
41 |
-
|
42 |
-
def download_img(img_url: str) -> str:
    """Download the image at *img_url* and return the local file path.

    The file is stored under ``/home/aye/Service/MyTools/image/`` using the
    last path segment of the URL as its name. If the server does not answer
    with HTTP 200, an error is shown on the page and ``None`` is returned.
    """
    # The response is opened with stream=True, so it must be closed explicitly
    # to release the connection on every path (including the failure path).
    with requests.get(img_url, stream=True) as r:
        if r.status_code != 200:
            st.error('download fail')
            return None

        filename = img_url.split('/')[-1]
        filepath = f'/home/aye/Service/MyTools/image/{filename}'
        # Context manager guarantees the file handle is flushed and closed.
        with open(filepath, 'wb') as image_file:
            image_file.write(r.content)
        return filepath
|
52 |
-
|
53 |
-
|
54 |
-
def _show_image(path: str, download_name: str) -> None:
    """Render the image at *path* in an 'ai' chat bubble with a download button."""
    with st.chat_message('ai'):
        st.image(path)
        with open(path, "rb") as file:
            st.download_button(
                label="下载",
                data=file,
                file_name=download_name,
                mime="image/png"
            )


# Make sure the key exists before it is read below.
if 'filepath' not in st.session_state:
    st.session_state['filepath'] = ''

# On rerun, show the previously generated image again (if it is still on disk).
# The truthiness check keeps os.path.exists() from receiving None.
saved_path = st.session_state.get('filepath')
if saved_path and os.path.exists(saved_path):
    with st.chat_message('user'):
        st.write(st.session_state.get('image_prompt'))
    _show_image(saved_path, saved_path.split('/')[-1])

if image_prompt := st.chat_input(key='image_prompt'):
    with st.chat_message('user'):
        logger.info(image_prompt)
        st.write(image_prompt)

    with st.spinner('正在生成图片...'):
        url = generate_image_url(image_prompt)
        logger.info(url)

    # generate_image_url() returns None when no API key was entered and
    # download_img() returns None on a failed download; both helpers have
    # already reported the error, so just stop instead of crashing.
    path = None
    if url:
        with st.spinner('正在下载图片...'):
            path = download_img(url)

    if path:
        st.session_state['filepath'] = path
        _show_image(path, url.split('/')[-1])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pages/TranslatePaper.py
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import shutil
|
4 |
+
import time
|
5 |
+
import zipfile
|
6 |
+
|
7 |
+
import requests
|
8 |
+
import streamlit as st
|
9 |
+
from langchain_core.messages import SystemMessage
|
10 |
+
from langchain_core.prompts import ChatPromptTemplate
|
11 |
+
from langchain_openai import ChatOpenAI
|
12 |
+
from langchain_text_splitters import MarkdownHeaderTextSplitter
|
13 |
+
from loguru import logger
|
14 |
+
|
15 |
+
from ui.Component import side_bar_links
|
16 |
+
from utils.Doc2x import pre_upload, put_file, get_status, get_md
|
17 |
+
|
18 |
+
# Page-level Streamlit settings: tab title, icon and wide layout.
page_options = dict(page_title='工具箱', page_icon='🔨', layout='wide')
st.set_page_config(**page_options)

# Shared navigation links live in the sidebar.
with st.sidebar:
    side_bar_links()
|
26 |
+
|
27 |
+
|
28 |
+
def ac_translate(original_text: str):
    """Stream an English-to-Chinese translation of a Markdown fragment.

    Builds a two-message chat prompt (system rules + human request carrying
    *original_text*), pipes it into the ``glm-4-flash`` chat model and returns
    the iterator produced by ``chain.stream(...)``.

    The API key is read from ``st.secrets['gml_key']``.
    """
    # System message: fixed translation rules (not a template).
    system_rules = """你是一个能够高效准确翻译学术论文的助手。你的任务是将用户提供的学术论文从英文翻译成中文,并保留原始的Markdown格式完全不变。为了确保输出结果正确,请注意以下规则:

1. **保留Markdown结构**:包括标题、列表、表格、代码块、引用、脚注等,确保格式一致。
2. **翻译正文内容**:将正文内容翻译成流畅自然的中文,但保留学术术语、专有名词或引用标记(如`[引用]`、`<term>`),除非用户另有要求。
3. **避免错误**:
- 确保翻译结果中仅保留与原始Markdown相同的标题结构(如`#`等符号的使用)。
- 不要误将普通段落转换为标题。
4. **输出为纯Markdown文本**:不要添加额外的Markdown代码块标记(如`````markdown````或类似结构),直接返回翻译后的Markdown内容。

在完成任务时,请专注于准确性和格式一致性。如果有任何不确定的内容,请保持原文不变。
"""

    # Human message template: {original_text} is filled in at stream time.
    human_template = """请将以下Markdown格式的学术论文从英文翻译成中文,并严格按照以下要求处理:

1. 保留原始Markdown结构,包括标题、列表、表格、代码块等。
2. 翻译正文内容为流畅自然的中文,但保留特定术语或标记(如`[引用]`、`<term>`)不被翻译。
3. 确保段落与标题的区分准确,避免错误地将正文内容标记为标题。
4. 直接返回翻译后的Markdown文本,不要额外包裹在代码块标记中。

以下是需要翻译的内容:

{original_text}
"""

    translation_prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessage(content=system_rules),
            ("human", human_template),
        ]
    )

    chat_model = ChatOpenAI(
        model_name="glm-4-flash",
        openai_api_base='https://open.bigmodel.cn/api/paas/v4/',
        temperature=0.5,
        openai_api_key=st.secrets['gml_key'],
        streaming=True
    )

    # Prompt -> model pipeline; the caller consumes the returned stream.
    return (translation_prompt | chat_model).stream(
        {"original_text": original_text}
    )
|
69 |
+
|
70 |
+
|
71 |
+
# --- Page body -------------------------------------------------------------
# Flow: upload PDF -> Doc2x parse -> Doc2x export to markdown (zip) ->
# translate the markdown line by line -> zip the result for download.
pdf_file = st.file_uploader(
    '选择PDF文件',
    type=['pdf'],
)
col1, col2, _ = st.columns([1, 1, 5])
translate_container = st.container(height=550, border=True)

if col1.button('翻译', disabled=pdf_file is None):
    os.makedirs('tmp/translate', exist_ok=True)

    # basename() keeps an unexpected path component in the uploaded file name
    # from escaping the tmp directory.
    pdf_path = os.path.join('tmp', os.path.basename(pdf_file.name))
    with open(pdf_path, 'wb') as f:
        f.write(pdf_file.getbuffer())

    with st.status('pre upload...'):
        upload_data = pre_upload(st.secrets['doc2x'])
        url = upload_data["url"]
        uid = upload_data["uid"]

        put_file(pdf_path, url)

    progress_bar = st.progress(0, '解析中...')

    # Poll the parse job until it reaches a terminal state.
    while True:
        status_data = get_status(uid, st.secrets['doc2x'])
        status = status_data["status"]
        if status == "success":
            result = status_data["result"]
            # Raw parse result is dumped to the working directory; nothing in
            # this script reads it back.
            with open("result.json", "w") as f:
                json.dump(result, f)
            break
        if status == "failed":
            detail = status_data["detail"]
            raise Exception(f"parse failed: {detail}")
        if status == "processing":
            progress_bar.progress(status_data["progress"], '解析中...')
        # Sleep for every non-terminal status so an unexpected status value
        # cannot turn this into a busy loop against the API.
        time.sleep(3)

    progress_bar.progress(100, '解析完毕')

    # Use the uid returned by pre_upload; the previous hard-coded debug uid
    # made the export below operate on a different document.
    logger.info(uid)

    with st.status('导出为markdown'):
        # First call triggers the conversion, subsequent calls poll for it.
        get_md(uid, st.secrets['doc2x'], True)

        while True:
            status_data = get_md(uid, st.secrets['doc2x'], False)
            status = status_data["status"]

            if status == "success":
                result_url = status_data["url"]
                response = requests.get(result_url, timeout=60)
                # Fail loudly instead of saving an error page as a zip file.
                response.raise_for_status()
                with open('tmp/downloaded_file.zip', 'wb') as f:
                    f.write(response.content)
                break
            if status == "failed":
                detail = status_data["detail"]
                raise Exception(f"parse failed: {detail}")
            time.sleep(3)

    with zipfile.ZipFile('tmp/downloaded_file.zip', 'r') as zip_ref:
        zip_ref.extractall('tmp/translate')

    with open('tmp/translate/origin.md', 'r', encoding='utf-8') as md_file:
        md_docs = md_file.read().splitlines()

    with open('tmp/translate/translated.md', 'w+', encoding='utf-8') as f:
        # Copy a leading '---' ... '---' front-matter block verbatim. Every
        # line, including the opening '---', gets its own newline. An empty
        # document or an unterminated block no longer raises IndexError: an
        # unterminated block is copied through to the end untranslated.
        body_start = 0
        if md_docs and md_docs[0] == '---':
            try:
                body_start = md_docs.index('---', 1) + 1
            except ValueError:
                body_start = len(md_docs)
            f.writelines(f"{line}\n" for line in md_docs[:body_start])

        for doc in md_docs[body_start:]:
            # Images, separators and blank lines are passed through as-is.
            if not doc or doc == '---' or doc.startswith("!["):
                f.write(f"{doc}\n")
                continue

            response = ac_translate(doc)
            translate_result = translate_container.write_stream(response)
            if doc.startswith('#'):
                f.write(f"{translate_result}\n")
            else:
                # The source line was not a heading, so drop any leading '#'
                # the model may have added.
                f.write(f"{translate_result.lstrip('#')}\n \n")

    shutil.make_archive(
        'translate',
        'zip',
        'tmp/translate',
        './'
    )
    shutil.rmtree('tmp')

# Offer the archive whenever one exists in the working directory.
if os.path.exists('translate.zip'):
    with open('translate.zip', 'rb') as f:
        col2.download_button(
            "下载",
            data=f,
            type="primary",
            file_name='downloaded_file.zip',
            mime="application/octet-stream",
        )
|
requirements.txt
CHANGED
@@ -4,5 +4,4 @@ PyYAML
|
|
4 |
streamlit
|
5 |
langchain
|
6 |
langchain_openai
|
7 |
-
lxml
|
8 |
-
zhipuai
|
|
|
4 |
streamlit
|
5 |
langchain
|
6 |
langchain_openai
|
7 |
+
lxml
|
|
ui/Component.py
CHANGED
@@ -9,6 +9,6 @@ def side_bar_links():
|
|
9 |
st.page_link('pages/Reference.py', label='引用文献生成', icon='📙')
|
10 |
st.page_link('pages/Translate.py', label='翻译总结工具', icon='🌐')
|
11 |
st.page_link('pages/AcademicTranslate.py', label='学术中英互译', icon='🌐')
|
12 |
-
st.page_link('pages/
|
13 |
|
14 |
st.divider()
|
|
|
9 |
st.page_link('pages/Reference.py', label='引用文献生成', icon='📙')
|
10 |
st.page_link('pages/Translate.py', label='翻译总结工具', icon='🌐')
|
11 |
st.page_link('pages/AcademicTranslate.py', label='学术中英互译', icon='🌐')
|
12 |
+
st.page_link('pages/TranslatePaper.py', label='PDF解析翻译', icon='🌐')
|
13 |
|
14 |
st.divider()
|
utils/Doc2x.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import time
|
3 |
+
|
4 |
+
import requests as rq
|
5 |
+
|
6 |
+
base_url = "https://v2.doc2x.noedgeai.com"
|
7 |
+
|
8 |
+
|
9 |
+
def pre_upload(secret: str, timeout: float = 30.0):
    """Ask Doc2x for a pre-upload slot and return the response's ``data`` field.

    Args:
        secret: Doc2x API key, sent as a Bearer token.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``data`` member of the JSON response.

    Raises:
        Exception: If the HTTP status is not 200 or the response ``code`` is
            not ``"success"``.
    """
    res = rq.post(
        f"{base_url}/api/v2/parse/preupload",
        headers={"Authorization": f"Bearer {secret}"},
        # Without a timeout, requests waits indefinitely on a stalled server.
        timeout=timeout,
    )
    if res.status_code != 200:
        raise Exception(f"get preupload url failed: {res.text}")

    data = res.json()
    if data.get("code") != "success":
        raise Exception(f"get preupload url failed: {data}")
    return data["data"]
|
23 |
+
|
24 |
+
|
25 |
+
def put_file(path: str, url: str, timeout: float = 120.0):
    """Upload the file at *path* to *url* with an HTTP PUT.

    Args:
        path: Local path of the file to upload.
        url: Destination URL for the PUT request.
        timeout: Seconds to wait on the connection before giving up.

    Raises:
        Exception: If the server answers with a status other than 200.
    """
    with open(path, "rb") as f:
        # The open file object is passed as the body, so it is streamed as
        # the raw binary content of the request.
        res = rq.put(url, data=f, timeout=timeout)

    if res.status_code != 200:
        raise Exception(f"put file failed: {res.text}")
|
30 |
+
|
31 |
+
|
32 |
+
def get_status(uid: str, secret: str, timeout: float = 30.0):
    """Fetch the parse status of document *uid* from Doc2x.

    Args:
        uid: Identifier of the uploaded document.
        secret: Doc2x API key, sent as a Bearer token.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``data`` member of the JSON response.

    Raises:
        Exception: If the HTTP status is not 200 or the response ``code`` is
            not ``"success"``.
    """
    res = rq.get(
        f"{base_url}/api/v2/parse/status",
        # Let requests build the query string so uid is URL-encoded.
        params={"uid": uid},
        headers={"Authorization": f"Bearer {secret}"},
        timeout=timeout,
    )
    if res.status_code != 200:
        raise Exception(f"get status failed: {res.text}")

    data = res.json()
    if data.get("code") != "success":
        raise Exception(f"get status failed: {data}")
    return data["data"]
|
46 |
+
|
47 |
+
|
48 |
+
def get_md(uid: str, secret: str, trigger: bool = False, timeout: float = 30.0):
    """Start, or poll, the Doc2x markdown export for document *uid*.

    Args:
        uid: Identifier of the parsed document.
        secret: Doc2x API key, sent as a Bearer token.
        trigger: When True, POST a new conversion request; when False, GET
            the result of a previously requested conversion.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``data`` member of the JSON response.

    Raises:
        Exception: If the HTTP status is not 200 or the response ``code`` is
            not ``"success"``.
    """
    headers = {
        "Authorization": f"Bearer {secret}",
        "Content-Type": "application/json",
    }

    if trigger:
        # Conversion request: markdown output, '$'-delimited formulas, and
        # 'origin' as the output file name.
        payload = {
            "uid": uid,
            "to": "md",
            "formula_mode": "dollar",
            "filename": "origin",
        }
        res = rq.post(
            f"{base_url}/api/v2/convert/parse",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
    else:
        res = rq.get(
            f"{base_url}/api/v2/convert/parse/result",
            params={"uid": uid},
            headers=headers,
            timeout=timeout,
        )

    # Messages name this endpoint rather than reusing "get status failed".
    if res.status_code != 200:
        raise Exception(f"convert to md failed: {res.text}")

    body = res.json()
    if body.get("code") != "success":
        raise Exception(f"convert to md failed: {body}")
    return body["data"]
|