Spaces:
Running
Running
Shunfeng Zheng
committed on
Update 1_SpatialParse.py
Browse files- 1_SpatialParse.py +84 -64
1_SpatialParse.py
CHANGED
@@ -9,9 +9,28 @@ import pandas as pd
|
|
9 |
# import en_core_web_md
|
10 |
from spacy.tokens import Span, Doc, Token
|
11 |
from utils import geoutil
|
12 |
-
|
13 |
import urllib.parse
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
colors = {'GPE': "#43c6fc", "LOC": "#fd9720", "RSE":"#a6e22d"}
|
17 |
options = {"ents": ['GPE', 'LOC', "RSE"], "colors": colors}
|
@@ -147,73 +166,74 @@ def set_selected_entities(doc):
|
|
147 |
return doc
|
148 |
|
149 |
def extract_spatial_entities(text):
|
150 |
-
# nlp = en_core_web_md.load()
|
151 |
-
|
152 |
-
# nlp = spacy.load("en_core_web_md")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
# nlp.add_pipe("spatial_pipeline", after="ner")
|
154 |
# doc = nlp(text)
|
155 |
-
|
156 |
-
#
|
157 |
-
#
|
158 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
# show_spatial_ent_table(doc, text)
|
160 |
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
# 分句处理
|
166 |
-
sent_ents = []
|
167 |
-
sent_texts = []
|
168 |
-
sent_rse_id = []
|
169 |
-
offset = 0 # 记录当前 token 偏移量
|
170 |
-
sent_start_positions = [0] # 记录句子信息
|
171 |
-
doc_copy = doc.copy() # 用于展示方程组合
|
172 |
-
for sent in doc.sents:
|
173 |
-
|
174 |
-
sent_doc = nlp(sent.text) # 逐句处理
|
175 |
-
sent_doc = set_selected_entities(sent_doc) # 这里处理实体
|
176 |
-
sent_texts.append(sent_doc.text)
|
177 |
-
|
178 |
-
for ent in sent_doc.ents:
|
179 |
-
sent_rse_id.append(ent._.rse_id)
|
180 |
-
# **调整每个实体的索引,使其匹配完整文本**
|
181 |
-
for ent in sent_doc.ents:
|
182 |
-
new_ent = Span(doc, ent.start + offset, ent.end + offset, label=ent.label_)
|
183 |
-
sent_ents.append(new_ent)
|
184 |
-
|
185 |
-
offset += len(sent) # 更新偏移量
|
186 |
-
sent_start_positions.append(sent_start_positions[-1] + len(sent)) # 记录句子起点
|
187 |
-
# **创建新 Doc**
|
188 |
-
final_doc = Doc(nlp.vocab, words=[token.text for token in doc], spaces=[token.whitespace_ for token in doc])
|
189 |
-
for i in sent_start_positions: # 手动标记句子起始点
|
190 |
-
if i < len(final_doc):
|
191 |
-
final_doc[i].is_sent_start = True
|
192 |
-
# **设置实体**
|
193 |
-
final_doc.set_ents(sent_ents)
|
194 |
-
|
195 |
-
for i in range(len(sent_rse_id)):
|
196 |
-
final_doc.ents[i]._.rse_id = sent_rse_id[i]
|
197 |
-
print(doc.ents[0].sent, '原始')
|
198 |
-
doc = final_doc
|
199 |
-
print(doc.ents[0].sent, '新')
|
200 |
-
# 分句处理完毕
|
201 |
-
|
202 |
-
# doc = set_selected_entities(doc)
|
203 |
-
# doc.to_disk("saved_doc.spacy")
|
204 |
-
doc.to_disk("/tmp/saved_doc.spacy")
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
html = displacy.render(doc,style="ent", options = options)
|
210 |
-
html = html.replace("\n","")
|
211 |
-
st.write(HTML_WRAPPER.format(html),unsafe_allow_html=True)
|
212 |
-
show_spatial_ent_table(doc, text)
|
213 |
-
|
214 |
-
st.markdown("123123")
|
215 |
-
|
216 |
-
show_sentence_selector_table(doc_copy)
|
217 |
|
218 |
def show_sentence_selector_table(doc_copy):
|
219 |
st.markdown("**______________________________________________________________________________________**")
|
|
|
9 |
# import en_core_web_md
|
10 |
from spacy.tokens import Span, Doc, Token
|
11 |
from utils import geoutil
|
|
|
12 |
import urllib.parse
|
13 |
|
14 |
+
# Bearer token sent to the backend; None when HF_API_TOKEN is not set.
API_TOKEN = os.getenv("HF_API_TOKEN")
# Endpoint that call_backend() POSTs to.
BACKEND_URL = "https://dsbb0707-dockerb2.hf.space/api/predict/"


def call_backend(input_text):
    """POST *input_text* to BACKEND_URL and return a display-ready status string.

    The request body is ``{"data": [input_text]}`` and the call times out
    after 10 seconds. This function never raises: every failure is turned
    into a human-readable message so the caller can show it directly.

    Args:
        input_text: Text to send to the backend.

    Returns:
        One of:
        * ``"✅ <result>\\n⏰ <timestamp>"`` on HTTP 200 with the expected payload,
        * ``"❌ Backend Error (HTTP <code>)"`` on any other status code,
        * ``"❌ Backend Error (unexpected response format: ...)"`` when a 200
          response is not JSON or lacks ``data[0]["result"/"timestamp"]``,
        * ``"⚠️ Connection Error: <reason>"`` when the request itself fails.
    """
    # Only send credentials when a token is configured; otherwise the header
    # would literally read "Bearer None".
    headers = {"Authorization": f"Bearer {API_TOKEN}"} if API_TOKEN else {}

    try:
        response = requests.post(
            BACKEND_URL,
            headers=headers,
            json={"data": [input_text]},
            timeout=10,
        )
    except Exception as e:
        # Deliberate best-effort boundary: the result is rendered in the UI,
        # so a failed request must become a message, not a traceback.
        return f"⚠️ Connection Error: {str(e)}"

    if response.status_code != 200:
        return f"❌ Backend Error (HTTP {response.status_code})"

    try:
        result = response.json()["data"][0]
        return f"✅ {result['result']}\n⏰ {result['timestamp']}"
    except (ValueError, LookupError, TypeError) as e:
        # ValueError: body is not valid JSON; LookupError/TypeError: payload
        # does not have the expected data[0]["result"/"timestamp"] shape.
        # This is a backend problem, not a connection problem.
        return f"❌ Backend Error (unexpected response format: {e!r})"
|
33 |
+
|
34 |
|
35 |
# Hex colour assigned to each of the entity labels 'GPE', 'LOC' and 'RSE'.
colors = {'GPE': "#43c6fc", "LOC": "#fd9720", "RSE":"#a6e22d"}
|
36 |
# Entity labels plus their colours, bundled as one dict.
# NOTE(review): presumably consumed by displacy.render(..., options=options);
# that call is commented out in extract_spatial_entities — confirm other users.
options = {"ents": ['GPE', 'LOC', "RSE"], "colors": colors}
|
|
|
166 |
return doc
|
167 |
|
168 |
def extract_spatial_entities(text):
    """Send *text* to the backend and show the reply on the Streamlit page.

    The reply string from ``call_backend(text)`` is rendered once as markdown
    and once inside a text area. The previous in-process spaCy pipeline is
    kept below, commented out, for reference.

    Args:
        text: Input text to analyse.

    Returns:
        None. The function only writes to the Streamlit page.
    """
    # --- Disabled: original single-pass pipeline -----------------------------
    # nlp = en_core_web_md.load()
    # nlp = spacy.load("en_core_web_md")
    # nlp.add_pipe("spatial_pipeline", after="ner")
    # doc = nlp(text)
    # doc = set_selected_entities(doc)
    # html = displacy.render(doc, style="ent", options=options)
    # html = html.replace("\n", "")
    # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
    # show_spatial_ent_table(doc, text)

    # --- Disabled: sentence-by-sentence pipeline ------------------------------
    # nlp = spacy.load("en_core_web_md")
    # nlp.add_pipe("spatial_pipeline", after="ner")
    # doc = nlp(text)
    #
    # # Sentence-by-sentence processing
    # sent_ents = []
    # sent_texts = []
    # sent_rse_id = []
    # offset = 0  # current token offset
    # sent_start_positions = [0]  # sentence start positions
    # doc_copy = doc.copy()  # used for displaying the equation combination
    # for sent in doc.sents:
    #     sent_doc = nlp(sent.text)  # process one sentence at a time
    #     sent_doc = set_selected_entities(sent_doc)  # entity handling happens here
    #     sent_texts.append(sent_doc.text)
    #
    #     for ent in sent_doc.ents:
    #         sent_rse_id.append(ent._.rse_id)
    #     # Shift each entity's indices so they match the full text
    #     for ent in sent_doc.ents:
    #         new_ent = Span(doc, ent.start + offset, ent.end + offset, label=ent.label_)
    #         sent_ents.append(new_ent)
    #
    #     offset += len(sent)  # advance the offset
    #     sent_start_positions.append(sent_start_positions[-1] + len(sent))  # record sentence start
    # # Build a new Doc
    # final_doc = Doc(nlp.vocab, words=[token.text for token in doc], spaces=[token.whitespace_ for token in doc])
    # for i in sent_start_positions:  # mark sentence starts manually
    #     if i < len(final_doc):
    #         final_doc[i].is_sent_start = True
    # # Set the entities
    # final_doc.set_ents(sent_ents)
    #
    # for i in range(len(sent_rse_id)):
    #     final_doc.ents[i]._.rse_id = sent_rse_id[i]
    # print(doc.ents[0].sent, '原始')
    # doc = final_doc
    # print(doc.ents[0].sent, '新')
    # # Sentence-by-sentence processing finished
    #
    # # doc = set_selected_entities(doc)
    # # doc.to_disk("saved_doc.spacy")
    # doc.to_disk("/tmp/saved_doc.spacy")

    api_result = call_backend(text)
    st.markdown(api_result)
    # st.text_area's first positional argument is the widget *label*; the
    # reply has to go in `value`, otherwise the text area is rendered empty.
    st.text_area("Backend response", value=api_result)

    # --- Disabled: rendering of the local pipeline result ---------------------
    # html = displacy.render(doc,style="ent", options = options)
    # html = html.replace("\n","")
    # st.write(HTML_WRAPPER.format(html),unsafe_allow_html=True)
    # show_spatial_ent_table(doc, text)

    # st.markdown("123123")

    # show_sentence_selector_table(doc_copy)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
|
238 |
def show_sentence_selector_table(doc_copy):
|
239 |
st.markdown("**______________________________________________________________________________________**")
|