Shunfeng Zheng committed on
Commit
8c23de1
·
verified ·
1 Parent(s): 9e246a3

Update 1_SpatialParse.py

Browse files
Files changed (1) hide show
  1. 1_SpatialParse.py +7 -88
1_SpatialParse.py CHANGED
@@ -6,7 +6,6 @@ from PIL import Image
6
  import base64
7
  import sys
8
  import pandas as pd
9
- # import en_core_web_md
10
  from spacy.tokens import Span, Doc, Token
11
  from utils import geoutil
12
  import urllib.parse
@@ -156,17 +155,6 @@ def set_input():
156
  text = st.text_area("Enter the text to extract {Spatial Entities}", params["text"][0])
157
  if(st.button("Extract")):
158
 
159
- # return 'France has detected a highly pathogenic strain of bird flu in a pet shop near Paris, days after an identical outbreak in one of Corsica’s main cities.'
160
-
161
- return 'Between Glebe and Pyrmont. Burwood.'
162
- return 'I would like to know where is the area between Burwood and Glebe. Pyrmont.'
163
- return '5 km east of Burwood. 3 km south of Glebe. Between Pyrmont and Glebe.'
164
- # return 'Between Burwood and Pyrmont.'
165
- # return 'Between Burwood and Glebe.'
166
- # return 'Between Burwood and Darling Harbour.'
167
- # return 'Between China and USA.'
168
- # return 'The Burwood city.'
169
- # text = "New York is north of Washington. Between Burwood and Pyrmont city."
170
  return text
171
 
172
  def set_selected_entities(doc):
@@ -177,61 +165,7 @@ def set_selected_entities(doc):
177
  return doc
178
 
179
  def extract_spatial_entities(text):
180
- # # nlp = en_core_web_md.load()
181
-
182
- # # nlp = spacy.load("en_core_web_md")
183
- # # nlp.add_pipe("spatial_pipeline", after="ner")
184
- # # doc = nlp(text)
185
- # # doc = set_selected_entities(doc)
186
- # # html = displacy.render(doc, style="ent", options=options)
187
- # # html = html.replace("\n", "")
188
- # # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
189
- # # show_spatial_ent_table(doc, text)
190
-
191
- # nlp = spacy.load("en_core_web_md") #####
192
- # nlp.add_pipe("spatial_pipeline", after="ner")
193
- # doc = nlp(text)
194
-
195
- # # 分句处理
196
- # sent_ents = []
197
- # sent_texts = []
198
- # sent_rse_id = []
199
- # offset = 0 # 记录当前 token 偏移量
200
- # sent_start_positions = [0] # 记录句子信息
201
- # doc_copy = doc.copy() # 用于展示方程组合
202
- # for sent in doc.sents:
203
-
204
- # sent_doc = nlp(sent.text) # 逐句处理
205
- # sent_doc = set_selected_entities(sent_doc) # 这里处理实体
206
- # sent_texts.append(sent_doc.text)
207
-
208
- # for ent in sent_doc.ents:
209
- # sent_rse_id.append(ent._.rse_id)
210
- # # **调整每个实体的索引,使其匹配完整文本**
211
- # for ent in sent_doc.ents:
212
- # new_ent = Span(doc, ent.start + offset, ent.end + offset, label=ent.label_)
213
- # sent_ents.append(new_ent)
214
-
215
- # offset += len(sent) # 更新偏移量
216
- # sent_start_positions.append(sent_start_positions[-1] + len(sent)) # 记录句子起点
217
- # # **创建新 Doc**
218
- # final_doc = Doc(nlp.vocab, words=[token.text for token in doc], spaces=[token.whitespace_ for token in doc])
219
- # for i in sent_start_positions: # 手动标记句子起始点
220
- # if i < len(final_doc):
221
- # final_doc[i].is_sent_start = True
222
- # # **设置实体**
223
- # final_doc.set_ents(sent_ents)
224
-
225
- # for i in range(len(sent_rse_id)):
226
- # final_doc.ents[i]._.rse_id = sent_rse_id[i]
227
- # print(doc.ents[0].sent, '原始')
228
- # doc = final_doc
229
- # print(doc.ents[0].sent, '新')
230
- # # 分句处理完毕
231
-
232
- # # doc = set_selected_entities(doc)
233
- # # doc.to_disk("saved_doc.spacy")
234
- # doc.to_disk("/tmp/saved_doc.spacy")
235
 
236
  Span.set_extension("rse_id", default="", force=True)
237
  api_result = call_backend(text)
@@ -240,11 +174,6 @@ def extract_spatial_entities(text):
240
  st.markdown(type(api_result))
241
  st.markdown(doc_element)
242
 
243
- # doc_element = {'text': 'Between Burwood and Glebe.', 'ents': [{'start': 8, 'end': 15, 'label': 'GPE'}, {'start': 20, 'end': 25, 'label': 'GPE'}], 'tokens': [{'id': 0, 'start': 0, 'end': 7}, {'id': 1, 'start': 8, 'end': 15}, {'id': 2, 'start': 16, 'end': 19}, {'id': 3, 'start': 20, 'end': 25}, {'id': 4, 'start': 25, 'end': 26}], 'ents_ext': [{'start': 8, 'end': 15, 'label': 'GPE', 'rse_id': 'Burwood'}, {'start': 20, 'end': 25, 'label': 'GPE', 'rse_id': 'Glebe'}]}
244
- # doc_element = {'text': 'I would like to know where is the area between Burwood and Glebe. Pyrmont.', 'ents': [{'start': 47, 'end': 54, 'label': 'GPE'}, {'start': 59, 'end': 64, 'label': 'GPE'}, {'start': 66, 'end': 73, 'label': 'GPE'}], 'sents': [{'start': 0, 'end': 65}, {'start': 66, 'end': 74}], 'tokens': [{'id': 0, 'start': 0, 'end': 1}, {'id': 1, 'start': 2, 'end': 7}, {'id': 2, 'start': 8, 'end': 12}, {'id': 3, 'start': 13, 'end': 15}, {'id': 4, 'start': 16, 'end': 20}, {'id': 5, 'start': 21, 'end': 26}, {'id': 6, 'start': 27, 'end': 29}, {'id': 7, 'start': 30, 'end': 33}, {'id': 8, 'start': 34, 'end': 38}, {'id': 9, 'start': 39, 'end': 46}, {'id': 10, 'start': 47, 'end': 54}, {'id': 11, 'start': 55, 'end': 58}, {'id': 12, 'start': 59, 'end': 64}, {'id': 13, 'start': 64, 'end': 65}, {'id': 14, 'start': 66, 'end': 73}, {'id': 15, 'start': 73, 'end': 74}]}
245
- # doc_element =
246
-
247
-
248
 
249
  nlp = English()
250
  nlp.add_pipe("sentencizer")
@@ -276,14 +205,14 @@ def show_sentence_selector_table(doc_copy):
276
  st.markdown("**______________________________________________________________________________________**")
277
  st.markdown("**Sentence Selector for Geographic Composition**")
278
 
279
- # 提取句子
280
  sentences = list(doc_copy.sents)
281
 
282
- # 构建表格数据
283
  rows = []
284
  for idx, sent in enumerate(sentences):
285
  sentence_text = sent.text.strip()
286
- # 生成跳转链接(定位到Tagger)
287
  url = BASE_URL + "Tagger?mode=geocombo&text=" + urllib.parse.quote(sentence_text)
288
  new_row = {
289
  'Sr.': idx + 1,
@@ -292,7 +221,6 @@ def show_sentence_selector_table(doc_copy):
292
  }
293
  rows.append(new_row)
294
 
295
- # 转为 DataFrame 并渲染为 HTML
296
  df = pd.DataFrame(rows)
297
  st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
298
 
@@ -304,9 +232,8 @@ def show_spatial_ent_table(doc, text):
304
  st.markdown("**______________________________________________________________________________________**")
305
  st.markdown("**Spatial Entities List**")
306
 
307
- # 初始化一个空 DataFrame
308
  df = pd.DataFrame(columns=['Sr.', 'entity', 'label', 'Map', 'GEOJson'])
309
- rows = [] # 用于存储所有行
310
 
311
  for ent in doc.ents:
312
  url_map = BASE_URL + "Tagger?map=true&type=" + types + "&model=" + model + "&text=" + text + "&entity=" + ent._.rse_id
@@ -314,7 +241,6 @@ def show_spatial_ent_table(doc, text):
314
  print(ent._.rse_id, 'pppp')
315
  url_json = BASE_URL + "Tagger?geojson=true&type=" + types + "&model=" + model + "&text=" + text + "&entity=" + ent._.rse_id
316
 
317
- # 创建新行
318
  new_row = {
319
  'Sr.': len(rows) + 1,
320
  'entity': ent.text,
@@ -323,19 +249,12 @@ def show_spatial_ent_table(doc, text):
323
  'GEOJson': f'<a target="_self" href="{url_json}">View</a>'
324
  }
325
 
326
- rows.append(new_row) # 将新行添加到列表中
327
 
328
- # 将所有行转为 DataFrame
329
  df = pd.DataFrame(rows)
330
 
331
- # 使用 Streamlit 显示 HTML 表格
332
  st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
333
 
334
- # params = st.experimental_get_query_params()
335
- # params = st.query_params
336
- # ase, level_1, level_2, level_3 = geoutil.get_ent(params["entity"][0])
337
- # print(geoutil.get_ent(params), 'ppppp')
338
-
339
  def set_header(): # tetis Geospacy LOGO
340
  LOGO_IMAGE = "title.jpg"
341
 
@@ -442,7 +361,7 @@ def set_side_menu():
442
 
443
  def main():
444
  global gpe_selected, loc_selected, rse_selected, model
445
- #print(displacy.templates.TPL_ENT)
446
  set_header()
447
  set_side_menu()
448
 
 
6
  import base64
7
  import sys
8
  import pandas as pd
 
9
  from spacy.tokens import Span, Doc, Token
10
  from utils import geoutil
11
  import urllib.parse
 
155
  text = st.text_area("Enter the text to extract {Spatial Entities}", params["text"][0])
156
  if(st.button("Extract")):
157
 
 
 
 
 
 
 
 
 
 
 
 
158
  return text
159
 
160
  def set_selected_entities(doc):
 
165
  return doc
166
 
167
  def extract_spatial_entities(text):
168
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  Span.set_extension("rse_id", default="", force=True)
171
  api_result = call_backend(text)
 
174
  st.markdown(type(api_result))
175
  st.markdown(doc_element)
176
 
 
 
 
 
 
177
 
178
  nlp = English()
179
  nlp.add_pipe("sentencizer")
 
205
  st.markdown("**______________________________________________________________________________________**")
206
  st.markdown("**Sentence Selector for Geographic Composition**")
207
 
208
+
209
  sentences = list(doc_copy.sents)
210
 
211
+
212
  rows = []
213
  for idx, sent in enumerate(sentences):
214
  sentence_text = sent.text.strip()
215
+
216
  url = BASE_URL + "Tagger?mode=geocombo&text=" + urllib.parse.quote(sentence_text)
217
  new_row = {
218
  'Sr.': idx + 1,
 
221
  }
222
  rows.append(new_row)
223
 
 
224
  df = pd.DataFrame(rows)
225
  st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
226
 
 
232
  st.markdown("**______________________________________________________________________________________**")
233
  st.markdown("**Spatial Entities List**")
234
 
 
235
  df = pd.DataFrame(columns=['Sr.', 'entity', 'label', 'Map', 'GEOJson'])
236
+ rows = []
237
 
238
  for ent in doc.ents:
239
  url_map = BASE_URL + "Tagger?map=true&type=" + types + "&model=" + model + "&text=" + text + "&entity=" + ent._.rse_id
 
241
  print(ent._.rse_id, 'pppp')
242
  url_json = BASE_URL + "Tagger?geojson=true&type=" + types + "&model=" + model + "&text=" + text + "&entity=" + ent._.rse_id
243
 
 
244
  new_row = {
245
  'Sr.': len(rows) + 1,
246
  'entity': ent.text,
 
249
  'GEOJson': f'<a target="_self" href="{url_json}">View</a>'
250
  }
251
 
252
+ rows.append(new_row)
253
 
 
254
  df = pd.DataFrame(rows)
255
 
 
256
  st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
257
 
 
 
 
 
 
258
  def set_header(): # tetis Geospacy LOGO
259
  LOGO_IMAGE = "title.jpg"
260
 
 
361
 
362
  def main():
363
  global gpe_selected, loc_selected, rse_selected, model
364
+
365
  set_header()
366
  set_side_menu()
367