Shunfeng Zheng committed on
Commit
855ad4a
·
verified ·
1 Parent(s): 674110d

Upload 1_SpatialParse.py

Browse files
Files changed (1) hide show
  1. 1_SpatialParse.py +404 -0
1_SpatialParse.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from spacy import displacy
3
+ import spacy
4
+ import geospacy
5
+ from PIL import Image
6
+ import base64
7
+ import sys
8
+ import pandas as pd
9
+ # import en_core_web_md
10
+ from spacy.tokens import Span, Doc, Token
11
+ from utils import geoutil
12
+ import llm_coding
13
+ import urllib.parse
14
+
15
+
16
# Highlight colour used by displaCy for each spatial entity label.
colors = {
    "GPE": "#43c6fc",
    "LOC": "#fd9720",
    "RSE": "#a6e22d",
}
# displaCy render options: restrict rendering to the labels that have a colour.
options = {"ents": list(colors), "colors": colors}

# Wrapper the rendered displaCy markup is formatted into before display.
HTML_WRAPPER = (
    '<div style="overflow-x: auto; border: none solid #a6e22d; '
    'border-radius: 0.25rem; padding: 1rem">{}</div>'
)

# Name of the model chosen in the sidebar (filled in by set_side_menu).
model = ""

# Entity labels kept by set_selected_entities.
gpe_selected = "GPE"
loc_selected = "LOC"
rse_selected = "RSE"

# Compact code of the ticked label checkboxes ("g", "l", "r"), rebuilt by
# set_side_menu and forwarded in the Tagger links.
types = ""

# Prefix for links to the Tagger page; empty means relative links.
# BASE_URL = "http://localhost:8080/"
BASE_URL = ""
30
+
31
+
32
+
33
def set_header():
    """Render the page header: a logo image next to the "GeOspaCy" title.

    The logo file ``tetis-1.png`` is read from the current working directory
    and inlined into the page as a base64 data URI.

    NOTE(review): another ``set_header`` is defined further down in this
    file; being defined later, that one replaces this definition.

    Raises:
        OSError: if the logo file cannot be opened or read.
    """
    LOGO_IMAGE = "tetis-1.png"

    # Styling for the header container, title text and logo.
    st.markdown(
        """
        <style>
        .container {
            display: flex;
        }
        .logo-text {
            font-weight:700 !important;
            font-size:50px !important;
            color: #f9a01b !important;
            padding-left: 10px !important;
        }
        .logo-img {
            float:right;
            width: 28%;
            height: 28%;
        }
        </style>
        """,
        unsafe_allow_html=True
    )

    # Read the logo inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(LOGO_IMAGE, "rb") as logo_file:
        logo_b64 = base64.b64encode(logo_file.read()).decode()

    st.markdown(
        f"""
        <div class="container">
            <img class="logo-img" src="data:image/png;base64,{logo_b64}">
            <p class="logo-text">GeOspaCy</p>
        </div>
        """,
        unsafe_allow_html=True
    )
66
+
67
+
68
+
69
def set_side_menu():
    """Render the sidebar: spaCy model picker and entity-label checkboxes.

    The ``model`` and ``type`` URL query parameters (first value of each)
    preselect the widgets. The outcome is stored in the module globals
    ``model`` (chosen model name), ``types`` (one letter per ticked box:
    "g", "l", "r") and the ``*_selected`` label globals.

    NOTE(review): another ``set_side_menu`` is defined further down in this
    file; being defined later, that one replaces this definition.
    """
    global gpe_selected, loc_selected, rse_selected, model, types
    types = ""
    params = st.experimental_get_query_params()

    st.sidebar.markdown("## Spacy Model")
    st.sidebar.markdown("You can **select** the values of the *spacy model* from Dropdown.")
    models = ['en_core_web_sm', 'en_core_web_md', 'en_core_web_lg', 'en_core_web_trf']
    default_ix = models.index('en_core_web_sm')
    # The query string is user-controlled: an unknown model name falls back
    # to the default instead of raising ValueError from list.index().
    if "model" in params and params["model"][0] in models:
        default_ix = models.index(params["model"][0])
    model = st.sidebar.selectbox('Spacy Model', models, index=default_ix)

    st.sidebar.markdown("## Spatial Entity Labels")
    st.sidebar.markdown("**Mark** the Spatial Entities you want to extract?")
    tpes = ""
    if "type" in params:
        tpes = params['type'][0]

    # Each box starts ticked when its letter appears in the "type" parameter.
    gpe = st.sidebar.checkbox('GPE', value="g" in tpes)
    loc = st.sidebar.checkbox('LOC', value="l" in tpes)
    rse = st.sidebar.checkbox('RSE', value="r" in tpes)

    # NOTE(review): the *_selected globals already hold these label strings at
    # module level and are never cleared here, so unticking a box does not
    # appear to remove that label from set_selected_entities - confirm intent.
    if gpe:
        gpe_selected = "GPE"
        types += "g"

    if loc:
        loc_selected = "LOC"
        types += "l"

    if rse:
        rse_selected = "RSE"
        types += "r"
116
+
117
+
118
+
119
def set_input():
    """Render the input text area and the "Extract" button.

    The text area is prefilled from the first ``text`` URL query parameter
    when one is present.

    Returns:
        The text-area content when "Extract" was clicked during this run,
        otherwise ``None`` (``main`` tests for ``None``).
    """
    params = st.experimental_get_query_params()

    if "text" in params:
        text = st.text_area("Enter the text to extract {Spatial Entities}", params["text"][0])
    else:
        text = st.text_area("Input unstructured text:", "")

    if st.button("Extract"):
        # Return what the user typed. The previous hard-coded sample
        # sentence returned here made the text area ineffective and left
        # the remaining return statements unreachable.
        return text
    return None
141
+
142
def set_selected_entities(doc):
    """Restrict ``doc.ents`` to the currently selected spatial labels.

    An entity is kept when its label equals one of the module globals
    ``gpe_selected``, ``loc_selected`` or ``rse_selected``. The document is
    modified in place and also returned.
    """
    wanted_labels = (gpe_selected, loc_selected, rse_selected)
    doc.ents = [span for span in doc.ents if span.label_ in wanted_labels]
    return doc
148
+
149
def extract_spatial_entities(text):
    """Run the spatial pipeline on *text* and render the results.

    Steps:
      1. Parse the full text, then re-parse it sentence by sentence and keep
         only the selected entity labels of each sentence.
      2. Rebuild one document from the full text's tokens, carrying the
         per-sentence entities (shifted to full-text token positions) and
         their ``rse_id`` values.
      3. Save that document to ``saved_doc.spacy``, render the highlighted
         entities, the entity table and the sentence selector table.

    NOTE(review): "spatial_pipeline" and the ``rse_id`` extension are
    presumably registered by the ``geospacy`` import - confirm.
    NOTE(review): shifting by ``len(sent)`` assumes a sentence parsed on its
    own tokenizes like the same sentence inside the full text - confirm.
    """
    nlp = spacy.load("en_core_web_md")
    nlp.add_pipe("spatial_pipeline", after="ner")
    doc = nlp(text)

    # Sentence-by-sentence processing.
    sent_ents = []
    sent_rse_id = []
    offset = 0  # token offset of the current sentence within the full text
    sent_start_positions = [0]  # token index at which each sentence starts
    doc_copy = doc.copy()  # untouched copy, used for the sentence selector
    for sent in doc.sents:
        sent_doc = set_selected_entities(nlp(sent.text))

        for ent in sent_doc.ents:
            sent_rse_id.append(ent._.rse_id)
            # Shift the entity so its indices match the full text.
            sent_ents.append(
                Span(doc, ent.start + offset, ent.end + offset, label=ent.label_)
            )

        offset += len(sent)
        sent_start_positions.append(offset)

    # Build a fresh Doc from the full text's tokens.
    final_doc = Doc(
        nlp.vocab,
        words=[token.text for token in doc],
        spaces=[bool(token.whitespace_) for token in doc],
    )
    # Mark sentence starts by hand; the last recorded position is the end of
    # the text and is skipped by the bounds check.
    for start in sent_start_positions:
        if start < len(final_doc):
            final_doc[start].is_sent_start = True

    final_doc.set_ents(sent_ents)

    # Copy the ids across in entity order; zip reads final_doc.ents once
    # instead of rebuilding it for every index.
    for new_ent, rse_id in zip(final_doc.ents, sent_rse_id):
        new_ent._.rse_id = rse_id

    # Debug output; guarded because either document may contain no entities.
    if doc.ents:
        print(doc.ents[0].sent, '原始')
    doc = final_doc
    if doc.ents:
        print(doc.ents[0].sent, '新')

    doc.to_disk("saved_doc.spacy")

    html = displacy.render(doc, style="ent", options=options)
    html = html.replace("\n", "")
    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
    show_spatial_ent_table(doc, text)

    show_sentence_selector_table(doc_copy)
216
+
217
def show_sentence_selector_table(doc_copy):
    """Render a table with one row per sentence and a link to the Tagger page.

    Args:
        doc_copy: parsed document whose ``sents`` are listed.

    Each row holds a running number, the sentence text and a
    "Select this sentence" link that passes the URL-quoted sentence to
    ``Tagger?mode=geocombo``.
    """
    import html

    st.markdown("**______________________________________________________________________________________**")
    st.markdown("**Sentence Selector for Geographic Composition**")

    rows = []
    for number, sent in enumerate(doc_copy.sents, start=1):
        sentence_text = sent.text.strip()
        # Link that jumps to the Tagger page for this sentence.
        url = BASE_URL + "Tagger?mode=geocombo&text=" + urllib.parse.quote(sentence_text)
        rows.append({
            'Sr.': number,
            # The table is rendered with escape=False, so user-supplied text
            # must be escaped here to be shown literally rather than as markup.
            'sentence': html.escape(sentence_text),
            'Select': f'<a target="_self" href="{url}">Select this sentence</a>',
        })

    # Convert to a DataFrame and render it as an HTML table.
    df = pd.DataFrame(rows)
    st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
240
+
241
+
242
+
243
def show_spatial_ent_table(doc, text):
    """Render a table of the document's entities with map / GeoJSON links.

    Args:
        doc: parsed document; each entity must expose ``_.rse_id``.
        text: the full input text, forwarded to the Tagger page.

    Nothing is rendered when the document has no entities. Each link targets
    ``Tagger`` with the parameters ``map`` or ``geojson``, ``type``,
    ``model``, ``text`` and ``entity``; the module globals ``types``,
    ``model`` and ``BASE_URL`` are read for them.
    """
    import html

    if not doc.ents:
        return

    st.markdown("**______________________________________________________________________________________**")
    st.markdown("**Spatial Entities List**")

    shared_query = {"type": types, "model": model, "text": text}
    rows = []
    for number, ent in enumerate(doc.ents, start=1):
        links = {}
        for flag in ("map", "geojson"):
            query = {flag: "true", **shared_query, "entity": ent._.rse_id}
            # Percent-encode every value so spaces, '&' or '#' in the text or
            # model name cannot break or truncate the query string.
            url = BASE_URL + "Tagger?" + urllib.parse.urlencode(
                query, quote_via=urllib.parse.quote
            )
            links[flag] = f'<a target="_self" href="{html.escape(url)}">View</a>'

        rows.append({
            'Sr.': number,
            # Rendered with escape=False below, so escape user text here.
            'entity': html.escape(ent.text),
            'label': ent.label_,
            'Map': links["map"],
            'GEOJson': links["geojson"],
        })

    # Convert all rows to a DataFrame and show it as an HTML table.
    df = pd.DataFrame(rows)
    st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
275
+
276
+ # params = st.experimental_get_query_params()
277
+ # params = st.query_params
278
+ # ase, level_1, level_2, level_3 = geoutil.get_ent(params["entity"][0])
279
+ # print(geoutil.get_ent(params), 'ppppp')
280
+
281
def set_header():
    """Render the page header: a logo image next to the "SpatialParse" title.

    The logo file ``title.jpg`` is read from the current working directory
    and inlined into the page as a base64 data URI.

    Raises:
        OSError: if the logo file cannot be opened or read.
    """
    LOGO_IMAGE = "title.jpg"

    # Styling for the header container, title text and logo.
    st.markdown(
        """
        <style>
        .container {
            display: flex;
        }
        .logo-text {
            font-weight:700 !important;
            font-size:50px !important;
            color: #52aee3 !important;
            padding-left: 10px !important;
        }
        .logo-img {
            float:right;
            width: 10%;
            height: 10%;
        }
        </style>
        """,
        unsafe_allow_html=True
    )

    # Read the logo inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(LOGO_IMAGE, "rb") as logo_file:
        logo_b64 = base64.b64encode(logo_file.read()).decode()

    # The file is a .jpg, so the data URI declares image/jpeg.
    st.markdown(
        f"""
        <div class="container">
            <img class="logo-img" src="data:image/jpeg;base64,{logo_b64}">
            <p class="logo-text">SpatialParse</p>
        </div>
        """,
        unsafe_allow_html=True
    )
314
+
315
+
316
def set_side_menu():
    """Render the sidebar: deployment method, model picker and label checkboxes.

    The ``model`` and ``type`` URL query parameters (first value of each)
    preselect the widgets. The outcome is stored in the module globals
    ``model`` (chosen model name), ``types`` (one letter per ticked box:
    "g", "l", "r") and the ``*_selected`` label globals.
    """
    global gpe_selected, loc_selected, rse_selected, model, types
    types = ""
    params = st.experimental_get_query_params()

    st.sidebar.markdown("## Deployment Method")
    st.sidebar.markdown("You can select the deployment method for the model.")
    deployment_options = ["API", "Local deployment"]
    use_local_model = st.sidebar.radio("Choose deployment method:", deployment_options, index=0) == "Local deployment"

    if use_local_model:
        # NOTE(review): the entered path is shown but not used anywhere in
        # this function - confirm whether it should be stored or returned.
        st.sidebar.text_input("Enter local model path:", "")

    st.sidebar.markdown("## LLM Model")
    st.sidebar.markdown("You can **select** different *LLM model* powered by API.")
    models = ['Llama-3-8B', 'Mistral-7B-0.3', 'Gemma-2-10B', 'GPT-4o', 'Gemini Pro', 'Deepseek-R1', 'en_core_web_sm', 'en_core_web_md', 'en_core_web_lg', 'en_core_web_trf']

    default_ix = models.index('GPT-4o')
    # The query string is user-controlled: an unknown model name falls back
    # to the default instead of raising ValueError from list.index().
    if "model" in params and params["model"][0] in models:
        default_ix = models.index(params["model"][0])

    model = st.sidebar.selectbox('LLM Model', models, index=default_ix)

    st.sidebar.markdown("## Spatial Entity Labels")

    st.sidebar.markdown("Please **Mark** the Spatial Entities you want to extract.")
    tpes = ""
    if "type" in params:
        tpes = params['type'][0]

    # Each box starts ticked when its letter appears in the "type" parameter.
    st.sidebar.markdown("### Absolute Spatial Entity:")
    gpe = st.sidebar.checkbox('GPE', value="g" in tpes)
    loc = st.sidebar.checkbox('LOC', value="l" in tpes)

    st.sidebar.markdown("### Relative Spatial Entity:")
    rse = st.sidebar.checkbox('RSE', value="r" in tpes)

    # NOTE(review): the *_selected globals already hold these label strings at
    # module level and are never cleared here, so unticking a box does not
    # appear to remove that label from set_selected_entities - confirm intent.
    if gpe:
        gpe_selected = "GPE"
        types += "g"

    if loc:
        loc_selected = "LOC"
        types += "l"

    if rse:
        rse_selected = "RSE"
        types += "r"
380
+
381
+
382
+
383
+
384
+
385
def main():
    """Build the page: header, sidebar, input box, then the extraction output.

    Extraction runs on the text returned by ``set_input``; when that is
    ``None`` it falls back to ``st.session_state.text`` if such a key exists,
    and otherwise nothing is extracted.
    """
    set_header()
    set_side_menu()

    text = set_input()
    if text is None:
        if "text" not in st.session_state:
            return
        text = st.session_state.text
    extract_spatial_entities(text)


if __name__ == '__main__':
    main()
403
+
404
+