Aye10032 committed on
Commit
0ad4048
·
1 Parent(s): 826926d

添加文献翻译

Browse files
pages/TextToImage.py DELETED
@@ -1,92 +0,0 @@
1
- import os
2
-
3
- import requests
4
- import urllib3
5
-
6
- import streamlit as st
7
- from loguru import logger
8
-
9
- from ui.Component import side_bar_links
10
-
11
- st.set_page_config(
12
- page_title='工具箱',
13
- page_icon='🔨',
14
- layout='wide',
15
- )
16
-
17
- with st.sidebar:
18
- side_bar_links()
19
-
20
- st.text_input('Api_key', type='password', key='api_key')
21
-
22
- st.title('CogView 文生图')
23
-
24
-
25
def generate_image_url(prompt: str) -> "str | None":
    """Generate an image with the ``cogview-3`` model and return its URL.

    The API key is read from ``st.session_state['api_key']``.

    Args:
        prompt: Text description of the image to generate.

    Returns:
        The URL of the first generated image, or ``None`` when no API key
        is available (an error message is shown in the UI in that case).
    """
    # Kept as a function-local import, as in the original code.
    from zhipuai import ZhipuAI

    api = st.session_state.get('api_key')
    # A falsy check also covers None (key absent from the session state);
    # comparing against '' alone would hand None to the client.
    if not api:
        st.error('请先输入API!')
        return None

    client = ZhipuAI(api_key=api)
    response = client.images.generations(
        model="cogview-3",
        prompt=prompt,
    )
    return response.data[0].url
40
-
41
-
42
def download_img(img_url: str) -> "str | None":
    """Download the image at *img_url* into the local image directory.

    Args:
        img_url: URL of the image to fetch.

    Returns:
        The path of the saved file, or ``None`` when the server does not
        answer with HTTP 200 (an error message is shown in the UI).
    """
    from urllib.parse import urlsplit

    # The context manager releases the streamed connection on every path,
    # including the early return below; the timeout keeps a stalled server
    # from blocking the page forever.
    with requests.get(img_url, stream=True, timeout=30) as r:
        if r.status_code != 200:
            st.error('download fail')
            return None
        content = r.content

    # Derive the name from the URL path only, so a query string (for
    # example on a signed URL) does not end up in the file name.
    filename = os.path.basename(urlsplit(img_url).path)
    filepath = f'/home/aye/Service/MyTools/image/{filename}'
    with open(filepath, 'wb') as f:
        f.write(content)
    return filepath
52
-
53
-
54
# Path of the most recently generated image; '' until one exists.
if 'filepath' not in st.session_state:
    st.session_state['filepath'] = ''

# Re-render the previous result at the top of the page on each rerun.
last_path = st.session_state.get('filepath')
# Guard against a falsy value first: os.path.exists(None) raises TypeError.
if last_path and os.path.exists(last_path):
    with st.chat_message('user'):
        st.write(st.session_state.get('image_prompt'))
    with st.chat_message('ai'):
        st.image(last_path)
        with open(last_path, "rb") as file:
            st.download_button(
                label="下载",
                data=file,
                file_name=last_path.split('/')[-1],
                mime="image/png"
            )

if image_prompt := st.chat_input(key='image_prompt'):
    with st.chat_message('user'):
        logger.info(image_prompt)
        st.write(image_prompt)

    with st.spinner('正在生成图片...'):
        url = generate_image_url(image_prompt)
        logger.info(url)
    # generate_image_url has already reported the problem (missing API key);
    # end this run instead of trying to download a None URL.
    if not url:
        st.stop()

    with st.spinner('正在下载图片...'):
        path = download_img(url)
    # download_img returns None on a failed download; do not store that in
    # the session state or try to open it.
    if not path:
        st.stop()
    st.session_state['filepath'] = path

    with st.chat_message('ai'):
        st.image(path)
        with open(path, "rb") as file:
            st.download_button(
                label="下载",
                data=file,
                file_name=url.split('/')[-1],
                mime="image/png"
            )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/TranslatePaper.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import shutil
4
+ import time
5
+ import zipfile
6
+
7
+ import requests
8
+ import streamlit as st
9
+ from langchain_core.messages import SystemMessage
10
+ from langchain_core.prompts import ChatPromptTemplate
11
+ from langchain_openai import ChatOpenAI
12
+ from langchain_text_splitters import MarkdownHeaderTextSplitter
13
+ from loguru import logger
14
+
15
+ from ui.Component import side_bar_links
16
+ from utils.Doc2x import pre_upload, put_file, get_status, get_md
17
+
18
# Page-level settings; this is the first Streamlit call made by this script.
_page_settings = {
    'page_title': '工具箱',
    'page_icon': '🔨',
    'layout': 'wide',
}
st.set_page_config(**_page_settings)

# Render the shared navigation links inside the sidebar.
_sidebar = st.sidebar
with _sidebar:
    side_bar_links()
26
+
27
+
28
def ac_translate(original_text: str):
    """Stream an English-to-Chinese translation of a Markdown fragment.

    A system message and a human message are combined into a chat prompt,
    piped into a streaming ``glm-4-flash`` chat model (configured with the
    ``gml_key`` secret and the open.bigmodel.cn endpoint), and the chain is
    started with ``stream``.

    Args:
        original_text: Markdown text to translate; substituted for the
            ``{original_text}`` placeholder of the human message.

    Returns:
        Whatever ``chain.stream(...)`` returns (presumably an iterator of
        message chunks - confirm against the langchain version in use).
    """
    system_message = SystemMessage(content="""你是一个能够高效准确翻译学术论文的助手。你的任务是将用户提供的学术论文从英文翻译成中文,并保留原始的Markdown格式完全不变。为了确保输出结果正确,请注意以下规则:

1. **保留Markdown结构**:包括标题、列表、表格、代码块、引用、脚注等,确保格式一致。
2. **翻译正文内容**:将正文内容翻译成流畅自然的中文,但保留学术术语、专有名词或引用标记(如`[引用]`、`<term>`),除非用户另有要求。
3. **避免错误**:
- 确保翻译结果中仅保留与原始Markdown相同的标题结构(如`#`等符号的使用)。
- 不要误将普通段落转换为标题。
4. **输出为纯Markdown文本**:不要添加额外的Markdown代码块标记(如`````markdown````或类似结构),直接返回翻译后的Markdown内容。

在完成任务时,请专注于准确性和格式一致性。如果有任何不确定的内容,请保持原文不变。
""")
    human_message = ("human", """请将以下Markdown格式的学术论文从英文翻译成中文,并严格按照以下要求处理:

1. 保留原始Markdown结构,包括标题、列表、表格、代码块等。
2. 翻译正文内容为流畅自然的中文,但保留特定术语或标记(如`[引用]`、`<term>`)不被翻译。
3. 确保段落与标题的区分准确,避免错误地将正文内容标记为标题。
4. 直接返回翻译后的Markdown文本,不要额外包裹在代码块标记中。

以下是需要翻译的内容:

{original_text}
""")

    translator = ChatOpenAI(
        model_name="glm-4-flash",
        openai_api_base='https://open.bigmodel.cn/api/paas/v4/',
        temperature=0.5,
        openai_api_key=st.secrets['gml_key'],
        streaming=True
    )

    # prompt -> model pipeline; streaming starts as soon as it is invoked.
    pipeline = ChatPromptTemplate.from_messages([system_message, human_message]) | translator
    return pipeline.stream({"original_text": original_text})
69
+
70
+
71
def _poll_doc2x(fetch_status, on_progress=None, interval: float = 3):
    """Poll a Doc2x task until it succeeds and return its final status dict.

    Args:
        fetch_status: Zero-argument callable returning the task status dict
            (must contain a ``status`` key).
        on_progress: Optional callable receiving the ``progress`` value each
            time the task reports ``processing``.
        interval: Seconds to wait between two polls.

    Returns:
        The status dict whose ``status`` is ``"success"``.

    Raises:
        Exception: If the task reports ``failed``.
    """
    while True:
        status_data = fetch_status()
        status = status_data["status"]
        if status == "success":
            return status_data
        if status == "failed":
            detail = status_data["detail"]
            raise Exception(f"parse failed: {detail}")
        if status == "processing" and on_progress is not None:
            on_progress(status_data["progress"])
        # Sleep on every non-terminal status so that an unexpected value
        # cannot turn this loop into a busy spin against the API.
        time.sleep(interval)


pdf_file = st.file_uploader(
    '选择PDF文件',
    type=['pdf'],
)
col1, col2, _ = st.columns([1, 1, 5])
translate_container = st.container(height=550, border=True)
if upload_btn := col1.button('翻译', disabled=pdf_file is None):
    os.makedirs('tmp/translate', exist_ok=True)

    # Keep only the base name so the upload is always written inside tmp/.
    pdf_path = os.path.join('tmp', os.path.basename(pdf_file.name))
    with open(pdf_path, 'wb') as f:
        f.write(pdf_file.getbuffer())

    with st.status('pre upload...'):
        upload_data = pre_upload(st.secrets['doc2x'])
        url = upload_data["url"]
        uid = upload_data["uid"]

        put_file(pdf_path, url)
        progress_bar = st.progress(0, '解析中...')

        parse_data = _poll_doc2x(
            lambda: get_status(uid, st.secrets['doc2x']),
            on_progress=lambda value: progress_bar.progress(value, '解析中...'),
        )
        # Dump of the raw parse result, written to the working directory.
        with open("result.json", "w") as f:
            json.dump(parse_data["result"], f)

        progress_bar.progress(100, '解析完毕')

    # The export below must target the uid returned by pre_upload for this
    # upload, never a fixed document id.
    logger.info(uid)

    with st.status('导出为markdown'):
        # First call starts the export, later calls fetch its state.
        get_md(uid, st.secrets['doc2x'], True)
        export_data = _poll_doc2x(lambda: get_md(uid, st.secrets['doc2x'], False))

        response = requests.get(export_data["url"], timeout=60)
        # Fail with the HTTP error instead of saving an error page as a zip.
        response.raise_for_status()
        with open('tmp/downloaded_file.zip', 'wb') as f:
            f.write(response.content)

        with zipfile.ZipFile('tmp/downloaded_file.zip', 'r') as zip_ref:
            zip_ref.extractall('tmp/translate')

        with open('tmp/translate/origin.md', 'r', encoding='utf-8') as md_file:
            md_docs = md_file.read().splitlines()

    with open('tmp/translate/translated.md', 'w+', encoding='utf-8') as f:
        # Copy a leading front-matter section ('---' ... '---') verbatim.
        # Every line, including the opening delimiter, gets its own newline,
        # and the loop stops at end of input if the block is never closed.
        if md_docs and md_docs[0] == '---':
            f.write(f"{md_docs.pop(0)}\n")
            while md_docs:
                next_line = md_docs.pop(0)
                f.write(f"{next_line}\n")
                if next_line == '---':
                    break

        for doc in md_docs:
            # Images, rules and blank lines are copied through untranslated.
            if doc.startswith("![") or doc == '---' or len(doc) == 0:
                f.write(f"{doc}\n")
                continue

            response = ac_translate(doc)
            translate_result = translate_container.write_stream(response)
            if doc.startswith('#'):
                f.write(f"{translate_result}\n")
            else:
                # The source line was not a heading, so drop any leading '#'
                # the model may have added to its translation.
                f.write(f"{translate_result.lstrip('#')}\n \n")

    shutil.make_archive(
        'translate',
        'zip',
        'tmp/translate',
        './'
    )
    shutil.rmtree('tmp')

# Offer the archive whenever one exists in the working directory.
if os.path.exists('translate.zip'):
    with open('translate.zip', 'rb') as f:
        col2.download_button(
            "下载",
            data=f,
            type="primary",
            file_name='downloaded_file.zip',
            mime="application/octet-stream",
        )
requirements.txt CHANGED
@@ -4,5 +4,4 @@ PyYAML
4
  streamlit
5
  langchain
6
  langchain_openai
7
- lxml
8
- zhipuai
 
4
  streamlit
5
  langchain
6
  langchain_openai
7
+ lxml
 
ui/Component.py CHANGED
@@ -9,6 +9,6 @@ def side_bar_links():
9
  st.page_link('pages/Reference.py', label='引用文献生成', icon='📙')
10
  st.page_link('pages/Translate.py', label='翻译总结工具', icon='🌐')
11
  st.page_link('pages/AcademicTranslate.py', label='学术中英互译', icon='🌐')
12
- st.page_link('pages/TextToImage.py', label='文生图', icon='🎨')
13
 
14
  st.divider()
 
9
  st.page_link('pages/Reference.py', label='引用文献生成', icon='📙')
10
  st.page_link('pages/Translate.py', label='翻译总结工具', icon='🌐')
11
  st.page_link('pages/AcademicTranslate.py', label='学术中英互译', icon='🌐')
12
+ st.page_link('pages/TranslatePaper.py', label='PDF解析翻译', icon='🌐')
13
 
14
  st.divider()
utils/Doc2x.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import time
3
+
4
+ import requests as rq
5
+
6
+ base_url = "https://v2.doc2x.noedgeai.com"
7
+
8
+
9
def pre_upload(secret: str):
    """Call the Doc2x ``/api/v2/parse/preupload`` endpoint.

    Args:
        secret: API key, sent as a ``Bearer`` token.

    Returns:
        The ``data`` member of the JSON response.

    Raises:
        Exception: If the HTTP status is not 200 or the response ``code``
            is not ``"success"``.
    """
    response = rq.post(
        f"{base_url}/api/v2/parse/preupload",
        headers={"Authorization": f"Bearer {secret}"},
    )
    if response.status_code != 200:
        raise Exception(f"get preupload url failed: {response.text}")

    payload = response.json()
    if payload["code"] != "success":
        raise Exception(f"get preupload url failed: {payload}")
    return payload["data"]
23
+
24
+
25
def put_file(path: str, url: str):
    """Upload the file at *path* to *url* with an HTTP PUT.

    Args:
        path: Path of the local file to send.
        url: Destination URL of the PUT request.

    Raises:
        Exception: If the response status is not 200.
    """
    with open(path, "rb") as stream:
        # The request body is the file's binary stream.
        response = rq.put(url, data=stream)

    if response.status_code == 200:
        return
    raise Exception(f"put file failed: {response.text}")
30
+
31
+
32
def get_status(uid: str, secret: str):
    """Query the Doc2x ``/api/v2/parse/status`` endpoint for one task.

    Args:
        uid: Task identifier, passed as the ``uid`` query parameter.
        secret: API key, sent as a ``Bearer`` token.

    Returns:
        The ``data`` member of the JSON response.

    Raises:
        Exception: If the HTTP status is not 200 or the response ``code``
            is not ``"success"``.
    """
    auth = {"Authorization": f"Bearer {secret}"}
    response = rq.get(f"{base_url}/api/v2/parse/status?uid={uid}", headers=auth)
    if response.status_code != 200:
        raise Exception(f"get status failed: {response.text}")

    payload = response.json()
    if payload["code"] != "success":
        raise Exception(f"get status failed: {payload}")
    return payload["data"]
46
+
47
+
48
def get_md(uid: str, secret: str, trigger: bool = False):
    """Start a Markdown export of a parsed document, or fetch its state.

    Args:
        uid: Task identifier of the parsed document.
        secret: API key, sent as a ``Bearer`` token.
        trigger: When true, POST to ``/api/v2/convert/parse`` to start the
            export; otherwise GET ``/api/v2/convert/parse/result``.

    Returns:
        The ``data`` member of the JSON response.

    Raises:
        Exception: If the HTTP status is not 200 or the response ``code``
            is not ``"success"``. The message names the call that failed.
    """
    headers = {
        "Authorization": f"Bearer {secret}",
        "Content-Type": "application/json",
    }

    if trigger:
        action = "trigger markdown export"
        # The export options are only needed when starting the export.
        payload = {
            "uid": uid,
            "to": "md",
            "formula_mode": "dollar",
            "filename": "origin",
        }
        url = f"{base_url}/api/v2/convert/parse"
        res = rq.post(url, headers=headers, data=json.dumps(payload))
    else:
        action = "get markdown export result"
        url = f"{base_url}/api/v2/convert/parse/result?uid={uid}"
        res = rq.get(url, headers=headers)

    if res.status_code != 200:
        raise Exception(f"{action} failed: {res.text}")

    body = res.json()
    if body["code"] != "success":
        raise Exception(f"{action} failed: {body}")
    return body["data"]