Spaces:
Runtime error
Runtime error
File size: 9,242 Bytes
17e77ea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
from openai import OpenAI
import re
# SECURITY: never commit an API key to source control. With no explicit
# ``api_key`` argument the OpenAI client reads the key from the
# OPENAI_API_KEY environment variable. The key that used to be hard-coded
# here must be treated as leaked and revoked.
client = OpenAI()

# Chat model used for every completion request in this file.
model = "gpt-4o"
def extract_GPE(text):
    """Ask the chat model for the geopolitical entities mentioned in ``text``.

    The system prompt instructs the model to wrap every entity as
    ``###name###``; the reply is then parsed by ``extract_to_dict``.

    Args:
        text: The sentence or passage sent to the model as the user message.

    Returns:
        Whatever ``extract_to_dict`` builds from the reply: a dict mapping
        each extracted entity string to the label ``'GPE'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all geopolitical entities from a given text. Geopolitical entities can include countries, regions, cities, autonomous regions, or other administrative divisions. For each geopolitical entity, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"China and India are two of the most populous countries in Asia."\n\nExpected Output:\n[###China###, ###India###]'''
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # Only the first returned choice is used.
    result = chat_completion.choices[0].message.content
    return extract_to_dict(result, 'GPE')
def extract_LOC(text):
    """Ask the chat model for the physical location entities in ``text``.

    The system prompt asks for landmarks and geographical features and
    explicitly excludes political or administrative divisions. The model is
    told to wrap every entity as ``###name###``; the reply is then parsed by
    ``extract_to_dict``.

    Args:
        text: The sentence or passage sent to the model as the user message.

    Returns:
        Whatever ``extract_to_dict`` builds from the reply: a dict mapping
        each extracted entity string to the label ``'LOC'``.
    """
    # The few-shot example must agree with the instructions: it lists only
    # physical features and leaves out the state name, which is a
    # geopolitical entity. (The earlier example listed countries as the
    # expected output, contradicting the exclusion rule in the same prompt.)
    system_prompt = '''You are a professional geographer. Your task is to extract all location entities (LOC) from a given text. Location entities can include physical locations such as landmarks, geographical features, mountains, rivers, oceans, and places, but do not include political or administrative divisions such as countries or cities (these are considered geopolitical entities). For each location entity, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"The Grand Canyon is located in the state of Arizona, where the Colorado River carves its way through the rugged terrain."\n\nExpected Output:\n[###Grand Canyon###, ###Colorado River###]'''
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # Only the first returned choice is used.
    result = chat_completion.choices[0].message.content
    return extract_to_dict(result, 'LOC')
def extract_RSE_1(text):
    """Ask the chat model for directional keywords found in ``text``.

    The system prompt targets terms such as north, south, southeast, central
    or downtown. The model is told to wrap every keyword as ``###name###``;
    the reply is then parsed by ``extract_to_dict``.

    Args:
        text: The sentence or passage sent to the model as the user message.

    Returns:
        Whatever ``extract_to_dict`` builds from the reply: a dict mapping
        each extracted keyword to the label ``'RES_1'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all spatial entities (directional keywords) from a given text. Spatial entities can include directional keywords such as north, south, east, west, and more specific terms like northeast, northwest, southeast, southwest, as well as terms indicating locations like center, central, downtown, and midtown. For each spatial entity, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"The hotel is located in the downtown area of New York, just south of Central Park, with a beautiful view of the southeast corner."\n\nExpected Output:\n[###downtown###, ###south###, ###southeast###]'''
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # Only the first returned choice is used.
    result = chat_completion.choices[0].message.content
    # NOTE(review): the label is spelled 'RES_1' while the function name says
    # RSE. Kept unchanged because consumers may match on the existing
    # spelling; confirm which one is intended.
    return extract_to_dict(result, 'RES_1')
def extract_RSE_2(text):
    """Ask the chat model for fuzzy proximity keywords found in ``text``.

    The system prompt targets terms such as nearby, close, beside, adjacent
    or vicinity. The model is told to wrap every keyword as ``###name###``;
    the reply is then parsed by ``extract_to_dict``.

    Args:
        text: The sentence or passage sent to the model as the user message.

    Returns:
        Whatever ``extract_to_dict`` builds from the reply: a dict mapping
        each extracted keyword to the label ``'RES_2'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all fuzzy spatial entities (keywords) from a given text. Fuzzy spatial keywords can include terms like nearby, near, vicinity, close, beside, next, adjacent, immediate, border, surrounding, neighbourhood, proximity, territory, locality, and similar terms. For each fuzzy spatial keyword, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"The park is located nearby the lake, with several cafes close to the walking paths, and a small garden adjacent to the main entrance."\n\nExpected Output:\n[###nearby###, ###close###, ###adjacent###]'''
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # Only the first returned choice is used.
    result = chat_completion.choices[0].message.content
    # NOTE(review): the label is spelled 'RES_2' while the function name says
    # RSE. Kept unchanged because consumers may match on the existing
    # spelling; confirm which one is intended.
    return extract_to_dict(result, 'RES_2')
def extract_RSE_3(text):
    """Ask the chat model for distance expressions found in ``text``.

    The system prompt targets a numeric value followed by a distance unit
    (for example "3 km" or "500 meters"). The model is told to wrap every
    expression as ``###name###``; the reply is then parsed by
    ``extract_to_dict``.

    Args:
        text: The sentence or passage sent to the model as the user message.

    Returns:
        Whatever ``extract_to_dict`` builds from the reply: a dict mapping
        each extracted expression to the label ``'RES_3'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all fuzzy distance keywords from a given text. Fuzzy distance keywords include numeric values followed by distance units such as kilometer, mile, meter, foot, inch, centimeter, and other related units. The distance units can be in different formats, such as km, m, mi, ft, yd, cm, mm, or even in full words like kilometer, mile, or inch. For each fuzzy distance keyword, wrap the entire expression (number and unit) in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"The park is located 3 km away from the city center, while the nearest supermarket is only 500 meters from here, and the lake is about 1 mile further down the road."\n\nExpected Output:\n[###3 km###, ###500 meters###, ###1 mile###]'''
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # Only the first returned choice is used.
    result = chat_completion.choices[0].message.content
    # NOTE(review): the label is spelled 'RES_3' while the function name says
    # RSE. Kept unchanged because consumers may match on the existing
    # spelling; confirm which one is intended.
    return extract_to_dict(result, 'RES_3')
def extract_to_dict(respond, label):
    """Turn a ``###``-delimited model reply into an ``{entity: label}`` dict.

    Args:
        respond: Reply text in which each entity is wrapped as
            ``###name###``. May be ``None`` or empty, e.g. when the model
            returned no text content.
        label: Value stored for every extracted entity.

    Returns:
        A dict whose keys are the strings found between ``###`` markers and
        whose values are all ``label``. Repeated entities collapse into one
        key. Empty when ``respond`` is ``None``/empty or holds no markers.
    """
    # Guard first: re.findall raises TypeError when handed None.
    if not respond:
        return {}
    # Non-greedy match so each ###...### pair yields its own entity.
    entities = re.findall(r'###(.*?)###', respond)
    return {entity: label for entity in entities}
# dataset = []
#
# # for i in dataset:
#
# with open('tweets.dev.bio', 'r') as file:
# for line in file:
# print(line.strip())
# test GPT GPE LOC
from datasets import load_dataset
# Run GPE extraction on a small, reproducible sample of CoNLL-2003 test
# sentences that contain at least one location mention.
dataset = load_dataset("eriktks/conll2003")
test_data = dataset["test"]

# Resolve the tag id by name instead of hard-coding a number. The earlier
# literal 4 corresponds to I-ORG in the label list published on the dataset
# card, which does not match the stated goal of testing GPE/LOC extraction.
label_names = test_data.features["ner_tags"].feature.names
loc_tag = label_names.index("B-LOC")
filtered_data = test_data.filter(lambda example: loc_tag in example['ner_tags'])

# Never ask for more rows than the filter kept; select() rejects
# out-of-range indices.
sample_size = min(10, len(filtered_data))
random_sample = filtered_data.shuffle(seed=42).select(range(sample_size))

for i, example in enumerate(random_sample):
    # Keep only tokens that carry an entity tag (tag id 0 is skipped).
    non_zero_tokens = [
        (token, tag)
        for token, tag in zip(example['tokens'], example['ner_tags'])
        if tag != 0
    ]
    tokens_string = ' '.join(example['tokens'])
    print(f"Sample {i + 1}:")
    print(f"Tokens: {tokens_string}")
    print(f"Non-zero ner_tags and corresponding tokens: {non_zero_tokens}")
    print(extract_GPE(tokens_string))
    print("-" * 50)
# test GPT RSE
# sentences = [
# "The parade took place about 10 kilometers east of downtown Sydney.",
# "The origin of the virus is believed to be near Houston, Texas.",
# "The company’s new headquarters is located roughly 5 miles south of the city center.",
# "The discovery was made just north of the Arctic Circle, in the remote tundra.",
# "The train station lies approximately 15 kilometers west of the airport.",
# "The ancient ruins are located on the hilltop, overlooking the valley to the south.",
# "The earthquake's epicenter was found around 50 kilometers southwest of Tokyo.",
# "The forest stretches for several kilometers to the east, beyond the small village.",
# "The research facility is positioned at the foot of the mountains, 30 kilometers southeast of the capital.",
# "The new shopping mall is situated directly across from the park, just a few blocks north of the river."
# ]
# for i in sentences:
# print(i)
# print(extract_RSE_1(i))
# print(extract_RSE_2(i))
# print(extract_RSE_3(i))
# print("-" * 50)
|