# NOTE: apparently residue from the hosting page ("Spaces: Runtime error"); not part of the program.
from openai import OpenAI | |
import re | |
# SECURITY: an API key used to be hard-coded here. Secrets must never live in
# source code; the previously committed key should be treated as leaked and
# revoked. With no explicit api_key the OpenAI client reads the key from the
# OPENAI_API_KEY environment variable.
client = OpenAI()

# Chat model used by every extract_* function in this file.
model = "gpt-4o"
def extract_GPE(text):
    """Ask the chat model for the geopolitical entities mentioned in ``text``.

    A fixed system prompt and ``text`` (as the user message) are sent through
    the module-level ``client`` using the module-level ``model``; the
    ``###``-delimited names in the reply are then parsed by
    ``extract_to_dict``.

    Args:
        text: The passage to analyse.

    Returns:
        The result of ``extract_to_dict`` for the reply, labelled ``'GPE'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all geopolitical entities from a given text. Geopolitical entities can include countries, regions, cities, autonomous regions, or other administrative divisions. For each geopolitical entity, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"China and India are two of the most populous countries in Asia."\n\nExpected Output:\n[###China###, ###India###]'''
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # The API schema allows ``content`` to be None; fall back to an empty
    # reply so that parsing yields no entities instead of raising.
    result = chat_completion.choices[0].message.content or ""
    return extract_to_dict(result, 'GPE')
def extract_LOC(text):
    """Ask the chat model for the physical locations mentioned in ``text``.

    A fixed system prompt and ``text`` (as the user message) are sent through
    the module-level ``client`` using the module-level ``model``; the
    ``###``-delimited names in the reply are then parsed by
    ``extract_to_dict``.

    Args:
        text: The passage to analyse.

    Returns:
        The result of ``extract_to_dict`` for the reply, labelled ``'LOC'``.
    """
    # The worked example previously listed China and India as the expected
    # output, contradicting the instruction that countries are NOT location
    # entities. The only location in the example sentence is Asia.
    system_prompt = '''You are a professional geographer. Your task is to extract all location entities (LOC) from a given text. Location entities can include physical locations such as landmarks, geographical features, mountains, rivers, oceans, and places, but do not include political or administrative divisions such as countries or cities (these are considered geopolitical entities). For each location entity, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"China and India are two of the most populous countries in Asia."\n\nExpected Output:\n[###Asia###]'''
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # The API schema allows ``content`` to be None; fall back to an empty
    # reply so that parsing yields no entities instead of raising.
    result = chat_completion.choices[0].message.content or ""
    return extract_to_dict(result, 'LOC')
def extract_RSE_1(text):
    """Ask the chat model for the directional keywords mentioned in ``text``.

    A fixed system prompt and ``text`` (as the user message) are sent through
    the module-level ``client`` using the module-level ``model``; the
    ``###``-delimited keywords in the reply are then parsed by
    ``extract_to_dict``.

    Args:
        text: The passage to analyse.

    Returns:
        The result of ``extract_to_dict`` for the reply, labelled ``'RES_1'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all spatial entities (directional keywords) from a given text. Spatial entities can include directional keywords such as north, south, east, west, and more specific terms like northeast, northwest, southeast, southwest, as well as terms indicating locations like center, central, downtown, and midtown. For each spatial entity, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"The hotel is located in the downtown area of New York, just south of Central Park, with a beautiful view of the southeast corner."\n\nExpected Output:\n[###downtown###, ###south###, ###southeast###]'''
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # The API schema allows ``content`` to be None; fall back to an empty
    # reply so that parsing yields no entities instead of raising.
    result = chat_completion.choices[0].message.content or ""
    # NOTE(review): the label is spelled 'RES_1' although the function name
    # says RSE; kept unchanged because callers may depend on the value.
    return extract_to_dict(result, 'RES_1')
def extract_RSE_2(text):
    """Ask the chat model for the fuzzy spatial keywords mentioned in ``text``.

    A fixed system prompt and ``text`` (as the user message) are sent through
    the module-level ``client`` using the module-level ``model``; the
    ``###``-delimited keywords in the reply are then parsed by
    ``extract_to_dict``.

    Args:
        text: The passage to analyse.

    Returns:
        The result of ``extract_to_dict`` for the reply, labelled ``'RES_2'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all fuzzy spatial entities (keywords) from a given text. Fuzzy spatial keywords can include terms like nearby, near, vicinity, close, beside, next, adjacent, immediate, border, surrounding, neighbourhood, proximity, territory, locality, and similar terms. For each fuzzy spatial keyword, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"The park is located nearby the lake, with several cafes close to the walking paths, and a small garden adjacent to the main entrance."\n\nExpected Output:\n[###nearby###, ###close###, ###adjacent###]'''
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # The API schema allows ``content`` to be None; fall back to an empty
    # reply so that parsing yields no entities instead of raising.
    result = chat_completion.choices[0].message.content or ""
    # NOTE(review): the label is spelled 'RES_2' although the function name
    # says RSE; kept unchanged because callers may depend on the value.
    return extract_to_dict(result, 'RES_2')
def extract_RSE_3(text):
    """Ask the chat model for the distance expressions mentioned in ``text``.

    A fixed system prompt and ``text`` (as the user message) are sent through
    the module-level ``client`` using the module-level ``model``; the
    ``###``-delimited expressions in the reply are then parsed by
    ``extract_to_dict``.

    Args:
        text: The passage to analyse.

    Returns:
        The result of ``extract_to_dict`` for the reply, labelled ``'RES_3'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all fuzzy distance keywords from a given text. Fuzzy distance keywords include numeric values followed by distance units such as kilometer, mile, meter, foot, inch, centimeter, and other related units. The distance units can be in different formats, such as km, m, mi, ft, yd, cm, mm, or even in full words like kilometer, mile, or inch. For each fuzzy distance keyword, wrap the entire expression (number and unit) in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"The park is located 3 km away from the city center, while the nearest supermarket is only 500 meters from here, and the lake is about 1 mile further down the road."\n\nExpected Output:\n[###3 km###, ###500 meters###, ###1 mile###]'''
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # The API schema allows ``content`` to be None; fall back to an empty
    # reply so that parsing yields no entities instead of raising.
    result = chat_completion.choices[0].message.content or ""
    # NOTE(review): the label is spelled 'RES_3' although the function name
    # says RSE; kept unchanged because callers may depend on the value.
    return extract_to_dict(result, 'RES_3')
def extract_to_dict(respond, label):
    """Collect the ``###``-delimited names in a model reply into a dict.

    Args:
        respond: Reply text expected to contain entries such as
            ``###Paris###``. ``None`` or an empty string yields an empty
            dict.
        label: Value stored for every extracted name.

    Returns:
        A dict mapping each extracted name to ``label``. Surrounding
        whitespace is removed from the names and blank names are dropped;
        repeated names collapse into a single key.
    """
    if not respond:
        return {}
    entity_dict = {}
    for raw_name in re.findall(r'###(.*?)###', respond):
        # Callers look names up by exact key, so stray padding inside the
        # markers would otherwise turn a correct answer into a miss.
        name = raw_name.strip()
        if name:
            entity_dict[name] = label
    return entity_dict
# Evaluate extract_GPE against the location annotations of a NER test split.
from datasets import load_dataset


def _location_entities(tokens, tags, label_names):
    """Join consecutive B-LOC/I-LOC tokens into full location names."""
    entities = []
    current = []
    for token, tag in zip(tokens, tags):
        name = label_names[tag]
        if name == "I-LOC" and current:
            current.append(token)
            continue
        if current:
            entities.append(" ".join(current))
            current = []
        if name in ("B-LOC", "I-LOC"):
            current = [token]
    if current:
        entities.append(" ".join(current))
    return entities


# Load the dataset.
dataset = load_dataset("eriktks/conll2003")
# dataset = load_dataset("tner/ontonotes5")
test_data = dataset["test"]

# Resolve the location tag ids from the dataset's own label names instead of a
# magic number. The previously hard-coded id 4 is I-ORG according to the
# CoNLL-2003 dataset card, so the old filter selected organisation sentences.
label_names = test_data.features["ner_tags"].feature.names
location_tag_ids = {
    tag_id
    for tag_id, name in enumerate(label_names)
    if name in ("B-LOC", "I-LOC")
}

# Keep only the examples that contain at least one location tag.
filtered_data = test_data.filter(
    lambda example: not location_tag_ids.isdisjoint(example["ner_tags"])
)

# Randomly draw up to 350 examples; never ask for more than are available.
sample_size = min(350, len(filtered_data))
random_sample = filtered_data.shuffle(seed=42).select(range(sample_size))

# Count the examples for which at least one gold location was extracted.
correct_count = 0
for example in random_sample:
    # Gold locations as full names, so multi-token entities such as
    # "New York" can match the full-name keys returned by extract_GPE.
    gold_locations = _location_entities(
        example["tokens"], example["ner_tags"], label_names
    )
    # Rebuild the sentence from its tokens.
    tokens_string = " ".join(example["tokens"])
    # extract_GPE returns a dict keyed by the extracted entity names.
    extracted_gpe = extract_GPE(tokens_string)
    # An example counts once, as soon as any gold location was found.
    if any(location in extracted_gpe for location in gold_locations):
        correct_count += 1

# Compute the accuracy; an empty sample yields 0 rather than a ZeroDivisionError.
accuracy = correct_count / len(random_sample) if len(random_sample) else 0.0
print(f"Total samples: {len(random_sample)}")
print(f"Correctly identified samples: {correct_count}")
print(f"Accuracy: {accuracy:.2%}")