Shunfeng Zheng committed on
Update 1_SpatialParse.py

1_SpatialParse.py (CHANGED, +7 -88)
@@ -6,7 +6,6 @@ from PIL import Image
 import base64
 import sys
 import pandas as pd
-# import en_core_web_md
 from spacy.tokens import Span, Doc, Token
 from utils import geoutil
 import urllib.parse
@@ -156,17 +155,6 @@ def set_input():
     text = st.text_area("Enter the text to extract {Spatial Entities}", params["text"][0])
     if(st.button("Extract")):
 
-        # return 'France has detected a highly pathogenic strain of bird flu in a pet shop near Paris, days after an identical outbreak in one of Corsica’s main cities.'
-
-        return 'Between Glebe and Pyrmont. Burwood.'
-        return 'I would like to know where is the area between Burwood and Glebe. Pyrmont.'
-        return '5 km east of Burwood. 3 km south of Glebe. Between Pyrmont and Glebe.'
-        # return 'Between Burwood and Pyrmont.'
-        # return 'Between Burwood and Glebe.'
-        # return 'Between Burwood and Darling Harbour.'
-        # return 'Between China and USA.'
-        # return 'The Burwood city.'
-        # text = "New York is north of Washington. Between Burwood and Pyrmont city."
         return text
 
 def set_selected_entities(doc):
@@ -177,61 +165,7 @@ def set_selected_entities(doc):
     return doc
 
 def extract_spatial_entities(text):
-
-
-    # # nlp = spacy.load("en_core_web_md")
-    # # nlp.add_pipe("spatial_pipeline", after="ner")
-    # # doc = nlp(text)
-    # # doc = set_selected_entities(doc)
-    # # html = displacy.render(doc, style="ent", options=options)
-    # # html = html.replace("\n", "")
-    # # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
-    # # show_spatial_ent_table(doc, text)
-
-    # nlp = spacy.load("en_core_web_md") #####
-    # nlp.add_pipe("spatial_pipeline", after="ner")
-    # doc = nlp(text)
-
-    # # Sentence-by-sentence processing
-    # sent_ents = []
-    # sent_texts = []
-    # sent_rse_id = []
-    # offset = 0  # running token offset into the full doc
-    # sent_start_positions = [0]  # sentence start positions
-    # doc_copy = doc.copy()  # kept for the composition display
-    # for sent in doc.sents:
-
-    #     sent_doc = nlp(sent.text)  # process one sentence at a time
-    #     sent_doc = set_selected_entities(sent_doc)  # entity filtering happens here
-    #     sent_texts.append(sent_doc.text)
-
-    #     for ent in sent_doc.ents:
-    #         sent_rse_id.append(ent._.rse_id)
-    #     # **Shift each entity's indices so they match the full text**
-    #     for ent in sent_doc.ents:
-    #         new_ent = Span(doc, ent.start + offset, ent.end + offset, label=ent.label_)
-    #         sent_ents.append(new_ent)
-
-    #     offset += len(sent)  # advance the offset
-    #     sent_start_positions.append(sent_start_positions[-1] + len(sent))  # record sentence start points
-    # # **Build a new Doc**
-    # final_doc = Doc(nlp.vocab, words=[token.text for token in doc], spaces=[token.whitespace_ for token in doc])
-    # for i in sent_start_positions:  # mark sentence starts manually
-    #     if i < len(final_doc):
-    #         final_doc[i].is_sent_start = True
-    # # **Set the entities**
-    # final_doc.set_ents(sent_ents)
-
-    # for i in range(len(sent_rse_id)):
-    #     final_doc.ents[i]._.rse_id = sent_rse_id[i]
-    # print(doc.ents[0].sent, 'original')
-    # doc = final_doc
-    # print(doc.ents[0].sent, 'new')
-    # # End of sentence-by-sentence processing
-
-    # # doc = set_selected_entities(doc)
-    # # doc.to_disk("saved_doc.spacy")
-    # doc.to_disk("/tmp/saved_doc.spacy")
+
 
     Span.set_extension("rse_id", default="", force=True)
     api_result = call_backend(text)
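Note: the block deleted above implemented per-sentence NER, shifting each sentence-local entity span by a running token offset and rebuilding a single Doc. A minimal runnable sketch of that re-indexing technique, assuming a stock spaCy model (the app's custom "spatial_pipeline" component and rse_id extension are omitted, and spans are anchored on the rebuilt Doc, which set_ents requires):

import spacy
from spacy.tokens import Doc, Span

# Sketch only; assumes en_core_web_md is installed
# (python -m spacy download en_core_web_md).
nlp = spacy.load("en_core_web_md")

def reindex_entities_per_sentence(text: str) -> Doc:
    """Run NER on each sentence in isolation, then rebuild one Doc whose
    entity spans use full-document token coordinates."""
    doc = nlp(text)
    ent_positions = []  # (start, end, label) in full-doc token indices
    offset = 0          # token offset of the current sentence
    for sent in doc.sents:
        sent_doc = nlp(sent.text)  # re-run the pipeline on this sentence only
        for ent in sent_doc.ents:
            # Shift sentence-local indices by the sentence's token offset.
            # Caveat: this assumes the isolated sentence tokenizes exactly
            # as it does inside the full document.
            ent_positions.append((ent.start + offset, ent.end + offset, ent.label_))
        offset += len(sent)
    final_doc = Doc(nlp.vocab,
                    words=[t.text for t in doc],
                    spaces=[t.whitespace_ for t in doc])
    final_doc.set_ents([Span(final_doc, s, e, label=lbl)
                        for s, e, lbl in ent_positions])
    return final_doc

doc = reindex_entities_per_sentence("Between Glebe and Pyrmont. Burwood.")
print([(ent.text, ent.label_) for ent in doc.ents])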
@@ -240,11 +174,6 @@ def extract_spatial_entities(text):
     st.markdown(type(api_result))
     st.markdown(doc_element)
 
-    # doc_element = {'text': 'Between Burwood and Glebe.', 'ents': [{'start': 8, 'end': 15, 'label': 'GPE'}, {'start': 20, 'end': 25, 'label': 'GPE'}], 'tokens': [{'id': 0, 'start': 0, 'end': 7}, {'id': 1, 'start': 8, 'end': 15}, {'id': 2, 'start': 16, 'end': 19}, {'id': 3, 'start': 20, 'end': 25}, {'id': 4, 'start': 25, 'end': 26}], 'ents_ext': [{'start': 8, 'end': 15, 'label': 'GPE', 'rse_id': 'Burwood'}, {'start': 20, 'end': 25, 'label': 'GPE', 'rse_id': 'Glebe'}]}
-    # doc_element = {'text': 'I would like to know where is the area between Burwood and Glebe. Pyrmont.', 'ents': [{'start': 47, 'end': 54, 'label': 'GPE'}, {'start': 59, 'end': 64, 'label': 'GPE'}, {'start': 66, 'end': 73, 'label': 'GPE'}], 'sents': [{'start': 0, 'end': 65}, {'start': 66, 'end': 74}], 'tokens': [{'id': 0, 'start': 0, 'end': 1}, {'id': 1, 'start': 2, 'end': 7}, {'id': 2, 'start': 8, 'end': 12}, {'id': 3, 'start': 13, 'end': 15}, {'id': 4, 'start': 16, 'end': 20}, {'id': 5, 'start': 21, 'end': 26}, {'id': 6, 'start': 27, 'end': 29}, {'id': 7, 'start': 30, 'end': 33}, {'id': 8, 'start': 34, 'end': 38}, {'id': 9, 'start': 39, 'end': 46}, {'id': 10, 'start': 47, 'end': 54}, {'id': 11, 'start': 55, 'end': 58}, {'id': 12, 'start': 59, 'end': 64}, {'id': 13, 'start': 64, 'end': 65}, {'id': 14, 'start': 66, 'end': 73}, {'id': 15, 'start': 73, 'end': 74}]}
-    # doc_element =
-
-
 
     nlp = English()
     nlp.add_pipe("sentencizer")
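Note: the commented-out doc_element samples above match displaCy's "manual" entity format, i.e. a dict holding the raw 'text' plus character-offset 'ents' (the extra 'ents_ext' list carries each entity's rse_id). A minimal sketch of rendering such a payload directly, assuming the backend returns a dict of this shape:

import streamlit as st
from spacy import displacy

# Hypothetical payload mirroring the commented-out sample above;
# manual rendering only needs the 'text' and 'ents' keys.
doc_element = {
    'text': 'Between Burwood and Glebe.',
    'ents': [{'start': 8, 'end': 15, 'label': 'GPE'},
             {'start': 20, 'end': 25, 'label': 'GPE'}],
}

# manual=True makes displaCy render the dict as-is: no Doc, no model.
html = displacy.render(doc_element, style="ent", manual=True)
st.write(html.replace("\n", ""), unsafe_allow_html=True)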
@@ -276,14 +205,14 @@ def show_sentence_selector_table(doc_copy):
     st.markdown("**______________________________________________________________________________________**")
     st.markdown("**Sentence Selector for Geographic Composition**")
 
-
+
     sentences = list(doc_copy.sents)
 
-
+
     rows = []
     for idx, sent in enumerate(sentences):
         sentence_text = sent.text.strip()
-
+
         url = BASE_URL + "Tagger?mode=geocombo&text=" + urllib.parse.quote(sentence_text)
         new_row = {
             'Sr.': idx + 1,
@@ -292,7 +221,6 @@ def show_sentence_selector_table(doc_copy):
         }
         rows.append(new_row)
 
-    # Convert to a DataFrame and render as HTML
     df = pd.DataFrame(rows)
     st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
 
@@ -304,9 +232,8 @@ def show_spatial_ent_table(doc, text):
     st.markdown("**______________________________________________________________________________________**")
     st.markdown("**Spatial Entities List**")
 
-    # Initialize an empty DataFrame
     df = pd.DataFrame(columns=['Sr.', 'entity', 'label', 'Map', 'GEOJson'])
-    rows = []
+    rows = []
 
     for ent in doc.ents:
         url_map = BASE_URL + "Tagger?map=true&type=" + types + "&model=" + model + "&text=" + text + "&entity=" + ent._.rse_id
@@ -314,7 +241,6 @@ def show_spatial_ent_table(doc, text):
         print(ent._.rse_id, 'pppp')
         url_json = BASE_URL + "Tagger?geojson=true&type=" + types + "&model=" + model + "&text=" + text + "&entity=" + ent._.rse_id
 
-        # Build a new row
         new_row = {
             'Sr.': len(rows) + 1,
             'entity': ent.text,
@@ -323,19 +249,12 @@ def show_spatial_ent_table(doc, text):
             'GEOJson': f'<a target="_self" href="{url_json}">View</a>'
         }
 
-        rows.append(new_row)
+        rows.append(new_row)
 
-    # Convert all rows to a DataFrame
     df = pd.DataFrame(rows)
 
-    # Display the HTML table with Streamlit
     st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
 
-    # params = st.experimental_get_query_params()
-    # params = st.query_params
-    # ase, level_1, level_2, level_3 = geoutil.get_ent(params["entity"][0])
-    # print(geoutil.get_ent(params), 'ppppp')
-
 def set_header():  # tetis Geospacy LOGO
     LOGO_IMAGE = "title.jpg"
 
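Note: the geocombo link in show_sentence_selector_table percent-encodes its text with urllib.parse.quote, while url_map and url_json above concatenate text and ent._.rse_id into the query string unescaped. A sketch of building the same links with every parameter encoded; tagger_url is a hypothetical helper and BASE_URL here is a placeholder for the app's own constant:

import urllib.parse

BASE_URL = "https://example.invalid/"  # placeholder for the app's BASE_URL

def tagger_url(text, entity, types, model, kind="map"):
    """Build a Tagger link with every query parameter percent-encoded."""
    params = {kind: "true", "type": types, "model": model,
              "text": text, "entity": entity}
    return BASE_URL + "Tagger?" + urllib.parse.urlencode(params)

# Usage inside show_spatial_ent_table (types and model are the app's globals):
# url_map  = tagger_url(text, ent._.rse_id, types, model, kind="map")
# url_json = tagger_url(text, ent._.rse_id, types, model, kind="geojson")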
@@ -442,7 +361,7 @@ def set_side_menu():
 
 def main():
     global gpe_selected, loc_selected, rse_selected, model
-
+
     set_header()
     set_side_menu()
 