Spaces:
Runtime error
Runtime error
Shunfeng Zheng
committed on
Delete 提取测试.py
Browse files
提取测试.py
DELETED
@@ -1,268 +0,0 @@
|
|
1 |
-
import math
|
2 |
-
import streamlit as st
|
3 |
-
from utils import geoutil
|
4 |
-
import pickle
|
5 |
-
|
6 |
-
|
7 |
-
def update_entities(doc, entity_texts, replace=True):
    """Label entities by their surface text and write them to ``doc.ents``.

    For every key of ``entity_texts`` the first occurrence of that text in
    ``doc.text`` is located and turned into an entity span carrying the
    mapped label. Texts that do not occur in the document are ignored.

    :param doc: spaCy ``Doc`` to annotate; it is modified in place.
    :param entity_texts: dict mapping entity text to its entity label.
    :param replace: if True, existing entities are discarded; if False, they
        are kept and the new ones are appended.
    """
    new_ents = [] if replace else list(doc.ents)

    for ent_text, ent_label in entity_texts.items():
        if not ent_text:
            # An empty string "matches" at offset 0 and would yield an empty span.
            continue

        start_char = doc.text.find(ent_text)
        if start_char == -1:
            continue

        # Let spaCy translate character offsets into token offsets. Counting
        # whitespace-separated words does not agree with spaCy's tokenizer
        # (punctuation is split into its own tokens), which shifted the span.
        # "expand" snaps a partial-token match outwards to whole tokens.
        span = doc.char_span(
            start_char,
            start_char + len(ent_text),
            label=ent_label,
            alignment_mode="expand",
        )
        if span is not None:
            new_ents.append(span)

    # NOTE(review): set_ents rejects overlapping spans; callers passing
    # replace=False must make sure new texts do not overlap existing entities.
    doc.set_ents(new_ents)
|
28 |
-
|
29 |
-
|
30 |
-
# def midpoint(x1, y1, x2, y2, angle):
|
31 |
-
def midpoint(y1, x1, y2, x2, angle):
    """Return the midpoint on a sphere between two points given in degrees.

    ``y1`` / ``y2`` are used as longitudes and ``x1`` / ``x2`` as latitudes.

    :param y1: longitude of the first point, in degrees.
    :param x1: latitude of the first point, in degrees.
    :param y2: longitude of the second point, in degrees.
    :param x2: latitude of the second point, in degrees.
    :param angle: opaque value passed through unchanged as the third element.
    :return: ``[longitude, latitude, angle]``; both coordinates are in
        degrees and rounded to 8 decimal places.
    """
    lon_start, lon_end = math.radians(y1), math.radians(y2)
    lat_start, lat_end = math.radians(x1), math.radians(x2)
    delta_lon = lon_end - lon_start

    # Contribution of the second point relative to the first point's meridian.
    b_x = math.cos(lat_end) * math.cos(delta_lon)
    b_y = math.cos(lat_end) * math.sin(delta_lon)
    along = math.cos(lat_start) + b_x

    lat_mid = math.atan2(
        math.sin(lat_start) + math.sin(lat_end),
        math.sqrt(along * along + b_y * b_y),
    )
    lon_mid = lon_start + math.atan2(b_y, along)
    # Wrap the longitude back into the [-pi, pi) range.
    lon_mid = (lon_mid + 3 * math.pi) % (2 * math.pi) - math.pi

    return [
        round(math.degrees(lon_mid), 8),
        round(math.degrees(lat_mid), 8),
        angle,
    ]
|
53 |
-
|
54 |
-
|
55 |
-
def get_midmid_point(centroid, point1, point2, is_midmid):
    """Return points lying between ``centroid`` and each of two other points.

    ``centroid`` is indexed at ``[0]`` and ``[1]``; ``point1`` and ``point2``
    are indexed at ``[0]``, ``[1]`` and ``[2]``. Every point is computed with
    ``midpoint``.

    :param is_midmid: if true, return the midpoints between the centroid and
        the first-level midpoints; otherwise return the first-level midpoints.
    :return: a 2-tuple of ``midpoint`` results, one per input point.
    """
    def toward(target):
        # Midpoint between the centroid and ``target``; target[2] rides along.
        return midpoint(centroid[0], centroid[1],
                        target[0], target[1], target[2])

    halfway_1 = toward(point1)
    halfway_2 = toward(point2)
    quarter_1 = toward(halfway_1)
    quarter_2 = toward(halfway_2)

    return (quarter_1, quarter_2) if is_midmid else (halfway_1, halfway_2)
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
import spacy
|
77 |
-
from spacy.language import Language
|
78 |
-
import regex_spatial
|
79 |
-
from spacy.tokens import Span, Doc, Token
|
80 |
-
import re
|
81 |
-
import llm_ent_extract
|
82 |
-
|
83 |
-
|
84 |
-
# Name of the custom extension attribute that stores the spatial-entity id.
rse_id = "rse_id"


def set_extension():
    """Register the ``rse_id`` extension on ``Span``, ``Doc`` and ``Token``.

    The attribute defaults to an empty string; ``force=True`` overwrites any
    earlier registration, so the function can be called repeatedly.
    """
    for holder in (Span, Doc, Token):
        holder.set_extension(rse_id, default="", force=True)
|
89 |
-
def find_ent_by_regex(doc, sentence, ent, regex):
    """Stretch ``ent`` to cover the first usable ``regex`` match in ``sentence``.

    The module-level global ``id`` accumulates the matched texts: it is seeded
    with ``ent.text`` when empty, and each accepted match is prepended to it
    with an underscore separator.

    :param doc: document whose full text is searched.
    :param sentence: object with ``start_char`` / ``end_char`` bounding where
        a match may start.
    :param ent: entity whose ``start_char`` or ``end_char`` is overwritten in
        place by the accepted match.
    :param regex: pattern handed to ``re.finditer``.
    :return: ``ent`` (the same object), modified or not.
    """
    global id

    if id == "":
        id = ent.text

    for hit in re.finditer(regex, doc.text):
        hit_start, hit_end = hit.span()

        # Only matches that begin inside the sentence window are considered.
        if not (sentence.start_char <= hit_start <= sentence.end_char):
            continue

        matched = doc.char_span(hit_start, hit_end)
        if matched is None:
            # Match does not line up with token boundaries; try the next one.
            continue

        id = matched.text + "_" + id
        if hit_start > ent.end_char:
            # Match follows the entity: extend the entity's right edge.
            ent.end_char = hit_end
        else:
            # Otherwise move the entity's left edge to the match start.
            ent.start_char = hit_start
        return ent

    return ent
|
108 |
-
def get_level1(doc, sentence, ent):
    """Run ``find_ent_by_regex`` with the level-1 pattern from ``regex_spatial``."""
    level1_pattern = regex_spatial.get_level1_regex()
    return find_ent_by_regex(doc, sentence, ent, level1_pattern)
|
110 |
-
|
111 |
-
def get_level2(doc, sentence, ent):
    """Run ``find_ent_by_regex`` with the level-2 pattern from ``regex_spatial``."""
    level2_pattern = regex_spatial.get_level2_regex()
    return find_ent_by_regex(doc, sentence, ent, level2_pattern)
|
113 |
-
|
114 |
-
def get_level3(doc, sentence, ent):
    """Run ``find_ent_by_regex`` with the level-3 pattern from ``regex_spatial``."""
    level3_pattern = regex_spatial.get_level3_regex()
    return find_ent_by_regex(doc, sentence, ent, level3_pattern)
|
116 |
-
|
117 |
-
def get_relative_entity(doc, sentence, ent):
    """Build the entity span for ``ent`` after applying the three regex levels.

    Resets the module-level global ``id``, then passes the entity through
    ``get_level1``, ``get_level2`` and ``get_level3`` in that order. If the
    resulting ``id`` contains an underscore, the returned span is labelled
    ``"RSE"``; otherwise it keeps ``ent``'s own label. In both cases the
    span's ``rse_id`` extension is set to ``id``.

    :param doc: document the spans are created on.
    :param sentence: search window forwarded to the level functions.
    :param ent: entity to start from.
    :return: the span produced by ``doc.char_span``.
    """
    global id
    id = ""

    candidate = ent
    for apply_level in (get_level1, get_level2, get_level3):
        candidate = apply_level(doc, sentence, candidate)

    if "_" in id:
        labelled = doc.char_span(candidate.start_char, candidate.end_char, "RSE")
    else:
        labelled = doc.char_span(ent.start_char, ent.end_char, ent.label_)

    labelled._.rse_id = id
    return labelled
|
137 |
-
|
138 |
-
|
139 |
-
@Language.component("spatial_pipeline")
def get_spatial_ent(doc):
    """Pipeline component that rewrites GPE/LOC entities as spatial entities.

    Each entity labelled ``"GPE"`` or ``"LOC"`` is passed to
    ``get_relative_entity`` together with a token window in which spatial
    expressions are searched. The window starts where the previous window
    ended (or at the start of the entity's sentence for the first entity) and
    ends at the end of the entity. ``doc.ents`` is replaced by the results.

    :param doc: spaCy ``Doc`` that already carries entity annotations.
    :return: the same ``doc`` with ``doc.ents`` replaced.
    """
    set_extension()
    new_ents = []

    ents = [ent for ent in doc.ents if ent.label_ in ("GPE", "LOC")]

    end = None
    for ent in ents:
        # Window start: continue from the previous window, else sentence start.
        start = end if end is not None else ent.sent.start

        if ent.end != len(doc):
            next_token = doc[ent.end]
            # NOTE(review): next_token is doc[ent.end], so next_token.i equals
            # ent.end and both branches give the same value. The original
            # author also suspected an index problem here - confirm intent.
            if next_token.text.lower() in regex_spatial.get_keywords():
                end = next_token.i
            else:
                end = ent.end
        else:
            # Entity reaches the end of the doc: there is no following token.
            # Without this branch start/end were left unbound or stale.
            end = ent.end

        rsi_ent = get_relative_entity(doc, Span(doc, start, end), ent)
        new_ents.append(rsi_ent)

    doc.ents = new_ents

    return doc
|
176 |
-
# Entity labels that survive filtering in set_selected_entities.
gpe_selected = "GPE"
loc_selected = "LOC"
rse_selected = "RSE"


def set_selected_entities(doc):
    """Keep only the entities whose label is one of the selected labels.

    The selected labels are read from the module-level ``gpe_selected``,
    ``loc_selected`` and ``rse_selected`` at call time.

    :param doc: object with an ``ents`` attribute; it is modified in place.
    :return: the same ``doc``.
    """
    wanted = (gpe_selected, loc_selected, rse_selected)
    doc.ents = [span for span in doc.ents if span.label_ in wanted]
    return doc
|
187 |
-
|
188 |
-
# text = 'Sydney is 6 kilometres to the east.'
|
189 |
-
def extract_spatial_entities(text):
    """Extract spatial entities sentence by sentence and save the result.

    The text is first parsed with the stock ``en_core_web_md`` pipeline to
    obtain sentences. The ``spatial_pipeline`` component is then added and
    each sentence is re-processed on its own. The entities found per sentence
    are shifted by the running token offset and attached to a fresh ``Doc``
    with the same tokens as the full text, which is written to
    ``saved_doc.spacy``.

    :param text: raw input text.
    :return: None; results are printed and written to disk.
    """
    nlp = spacy.load("en_core_web_md")
    # First pass runs without the spatial component; only sentences are needed.
    doc = nlp(text)

    nlp.add_pipe("spatial_pipeline", after="ner")

    # Fresh Doc mirroring the tokens of the full text. Sentence starts are
    # carried over so that Span.sent works on the new Doc; a Doc built from
    # words alone has no sentence boundaries and Span.sent raises.
    final_doc = Doc(
        nlp.vocab,
        words=[token.text for token in doc],
        spaces=[token.whitespace_ for token in doc],
        sent_starts=[token.is_sent_start for token in doc],
    )

    sent_ents = []
    offset = 0  # number of tokens in the sentences processed so far

    for sent in doc.sents:
        sent_doc = nlp(sent.text)
        sent_doc = set_selected_entities(sent_doc)

        # Shift sentence-local token indices to full-document indices.
        for ent in sent_doc.ents:
            new_ent = Span(final_doc, ent.start + offset, ent.end + offset, label=ent.label_)
            sent_ents.append(new_ent)

        offset += len(sent)

    final_doc.set_ents(sent_ents)

    print('-' * 50)
    print("修改后实体:", [(ent.text, ent.label_) for ent in final_doc.ents])

    # Debug output for the first entity; skipped when nothing was found
    # instead of failing with IndexError.
    if final_doc.ents:
        # NOTE(review): '11' is a hard-coded placeholder that overwrites
        # whatever rse_id the pipeline computed - confirm this is debug-only.
        final_doc.ents[0]._.rse_id = '11'
        print(final_doc.ents[0]._.rse_id, 'final_entO')
        print(final_doc.ents[0].sent, 'final_entO')

    final_doc.to_disk("saved_doc.spacy")
    print("Doc saved successfully!")
|
246 |
-
|
247 |
-
|
248 |
-
# Driver: run the extraction on a sample sentence, then reload the saved Doc
# and print its entities.
# Other sample inputs tried during development:
#   'Between Burwood and Pyrmont. Between Burwood and Pyrmont city.'
#   'Between Burwood and Pyrmont.'
#   "New York is north of Washington. Between Burwood and Pyrmont city."
text = "5 km east of Burwood."

extract_spatial_entities(text)

nlp = spacy.load("en_core_web_md")
doc = Doc(nlp.vocab).from_disk("saved_doc.spacy")

print("修改后实体:", [(ent.text, ent.label_) for ent in doc.ents])
# Guard the debug print: the reloaded Doc may contain no entities.
if doc.ents:
    print(doc.ents[0]._.rse_id, 'final_entO')
|
260 |
-
# print(doc.ents[0].sent, 'final_entO')
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|