# File size: 9,242 Bytes
# 17e77ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
from openai import OpenAI
import re

# Credentials are taken from the environment: when no ``api_key`` argument is
# given, the OpenAI client reads the OPENAI_API_KEY environment variable.
# SECURITY: an API key used to be hard-coded on this line. It has been exposed
# in source control and must be treated as compromised and revoked.
client = OpenAI()

# Chat model used by every extract_* helper in this file.
model = "gpt-4o"

def extract_GPE(text):
    """Extract geopolitical entities (GPE) from ``text`` with the chat model.

    ``text`` is sent as the user message alongside a system prompt that asks
    the model to wrap every geopolitical entity in ``###`` markers. The reply
    is then parsed by ``extract_to_dict``.

    Args:
        text: The text to search for geopolitical entities.

    Returns:
        A dict mapping each entity name found in the model's reply to the
        label ``'GPE'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all geopolitical entities from a given text. Geopolitical entities can include countries, regions, cities, autonomous regions, or other administrative divisions. For each geopolitical entity, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"China and India are two of the most populous countries in Asia."\n\nExpected Output:\n[###China###, ###India###]'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )

    # The SDK types the message content as optional; fall back to an empty
    # string so the parser always receives text.
    result = chat_completion.choices[0].message.content or ''
    return extract_to_dict(result, 'GPE')


def extract_LOC(text):
    """Extract physical location entities (LOC) from ``text`` with the chat model.

    ``text`` is sent as the user message alongside a system prompt that asks
    the model to wrap every location entity in ``###`` markers while leaving
    out political or administrative divisions. The reply is then parsed by
    ``extract_to_dict``.

    Args:
        text: The text to search for location entities.

    Returns:
        A dict mapping each entity name found in the model's reply to the
        label ``'LOC'``.
    """
    # The few-shot example previously listed China and India as the expected
    # output, contradicting the instruction in the same prompt to exclude
    # countries. The expected output now names only the non-political
    # location in the example sentence.
    system_prompt = '''You are a professional geographer. Your task is to extract all location entities (LOC) from a given text. Location entities can include physical locations such as landmarks, geographical features, mountains, rivers, oceans, and places, but do not include political or administrative divisions such as countries or cities (these are considered geopolitical entities). For each location entity, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"China and India are two of the most populous countries in Asia."\n\nExpected Output:\n[###Asia###]'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )

    # The SDK types the message content as optional; fall back to an empty
    # string so the parser always receives text.
    result = chat_completion.choices[0].message.content or ''
    return extract_to_dict(result, 'LOC')


def extract_RSE_1(text):
    """Extract directional spatial keywords from ``text`` with the chat model.

    ``text`` is sent as the user message alongside a system prompt that asks
    the model to wrap directional keywords (north, southeast, downtown, ...)
    in ``###`` markers. The reply is then parsed by ``extract_to_dict``.

    Args:
        text: The text to search for directional keywords.

    Returns:
        A dict mapping each keyword found in the model's reply to the label
        ``'RES_1'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all spatial entities (directional keywords) from a given text. Spatial entities can include directional keywords such as north, south, east, west, and more specific terms like northeast, northwest, southeast, southwest, as well as terms indicating locations like center, central, downtown, and midtown. For each spatial entity, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"The hotel is located in the downtown area of New York, just south of Central Park, with a beautiful view of the southeast corner."\n\nExpected Output:\n[###downtown###, ###south###, ###southeast###]'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )

    # The SDK types the message content as optional; fall back to an empty
    # string so the parser always receives text.
    result = chat_completion.choices[0].message.content or ''
    # NOTE(review): the label is spelled 'RES_1' although the function is named
    # RSE_1. Kept unchanged because consumers may depend on the value - confirm.
    return extract_to_dict(result, 'RES_1')


def extract_RSE_2(text):
    """Extract fuzzy spatial keywords from ``text`` with the chat model.

    ``text`` is sent as the user message alongside a system prompt that asks
    the model to wrap proximity terms (near, beside, adjacent, ...) in ``###``
    markers. The reply is then parsed by ``extract_to_dict``.

    Args:
        text: The text to search for fuzzy spatial keywords.

    Returns:
        A dict mapping each keyword found in the model's reply to the label
        ``'RES_2'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all fuzzy spatial entities (keywords) from a given text. Fuzzy spatial keywords can include terms like nearby, near, vicinity, close, beside, next, adjacent, immediate, border, surrounding, neighbourhood, proximity, territory, locality, and similar terms. For each fuzzy spatial keyword, wrap the name in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"The park is located nearby the lake, with several cafes close to the walking paths, and a small garden adjacent to the main entrance."\n\nExpected Output:\n[###nearby###, ###close###, ###adjacent###]'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )

    # The SDK types the message content as optional; fall back to an empty
    # string so the parser always receives text.
    result = chat_completion.choices[0].message.content or ''
    # NOTE(review): the label is spelled 'RES_2' although the function is named
    # RSE_2. Kept unchanged because consumers may depend on the value - confirm.
    return extract_to_dict(result, 'RES_2')


def extract_RSE_3(text):
    """Extract distance expressions from ``text`` with the chat model.

    ``text`` is sent as the user message alongside a system prompt that asks
    the model to wrap each number-plus-unit distance expression (``3 km``,
    ``500 meters``, ...) in ``###`` markers. The reply is then parsed by
    ``extract_to_dict``.

    Args:
        text: The text to search for distance expressions.

    Returns:
        A dict mapping each expression found in the model's reply to the
        label ``'RES_3'``.
    """
    system_prompt = '''You are a professional geographer. Your task is to extract all fuzzy distance keywords from a given text. Fuzzy distance keywords include numeric values followed by distance units such as kilometer, mile, meter, foot, inch, centimeter, and other related units. The distance units can be in different formats, such as km, m, mi, ft, yd, cm, mm, or even in full words like kilometer, mile, or inch. For each fuzzy distance keyword, wrap the entire expression (number and unit) in a unique character sequence, such as [###ENTITY###]. If there are multiple entities, output them in the following format:\n[###ENTITY1###, ###ENTITY2###, ###ENTITY3###]\nHere is an example:\n Example:\n\nText:\n"The park is located 3 km away from the city center, while the nearest supermarket is only 500 meters from here, and the lake is about 1 mile further down the road."\n\nExpected Output:\n[###3 km###, ###500 meters###, ###1 mile###]'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )

    # The SDK types the message content as optional; fall back to an empty
    # string so the parser always receives text.
    result = chat_completion.choices[0].message.content or ''
    # NOTE(review): the label is spelled 'RES_3' although the function is named
    # RSE_3. Kept unchanged because consumers may depend on the value - confirm.
    return extract_to_dict(result, 'RES_3')


def extract_to_dict(respond, label):
    """Parse ``###``-delimited entities out of a model reply.

    Args:
        respond: Raw reply text in which entities are wrapped as
            ``###name###``. ``None`` or an empty string yields an empty dict.
        label: Value stored for every extracted entity (for example ``'GPE'``).

    Returns:
        A dict mapping each entity name, with surrounding whitespace removed,
        to ``label``. Repeated names collapse into a single key.
    """
    if not respond:
        return {}

    # Non-greedy match so several entities on one line are captured separately.
    entities = (entity.strip() for entity in re.findall(r'###(.*?)###', respond))

    # Skip blank captures such as the one produced by a bare '######'.
    return {entity: label for entity in entities if entity}


# dataset = []
#
# # for i in dataset:
#
# with open('tweets.dev.bio', 'r') as file:
#     for line in file:
#         print(line.strip())


# Smoke test: run GPE extraction on ten CoNLL-2003 test sentences and print the
# model output next to the gold tags.
from datasets import load_dataset
dataset = load_dataset("eriktks/conll2003")
test_data = dataset["test"]
# NOTE(review): in the label scheme published with CoNLL-2003, tag id 4 appears
# to be I-ORG rather than a location tag (B-LOC would be 5) - confirm that this
# filter selects the intended sentences.
filtered_data = test_data.filter(lambda example: 4 in example['ner_tags'])
# A fixed seed keeps the ten sampled sentences the same on every run.
random_sample = filtered_data.shuffle(seed=42).select(range(10))
for i, example in enumerate(random_sample):
    # Pair every token with its tag and keep only tokens whose tag is not 0.
    non_zero_tokens = [
        (token, tag)
        for token, tag in zip(example['tokens'], example['ner_tags'])
        if tag != 0
    ]
    tokens_string = ' '.join(example['tokens'])
    print(f"Sample {i + 1}:")
    print(f"Tokens: {tokens_string}")
    print(f"Non-zero ner_tags and corresponding tokens: {non_zero_tokens}")
    print(extract_GPE(tokens_string))
    print("-" * 50)

# test GPT RSE
# sentences = [
#     "The parade took place about 10 kilometers east of downtown Sydney.",
#     "The origin of the virus is believed to be near Houston, Texas.",
#     "The company’s new headquarters is located roughly 5 miles south of the city center.",
#     "The discovery was made just north of the Arctic Circle, in the remote tundra.",
#     "The train station lies approximately 15 kilometers west of the airport.",
#     "The ancient ruins are located on the hilltop, overlooking the valley to the south.",
#     "The earthquake's epicenter was found around 50 kilometers southwest of Tokyo.",
#     "The forest stretches for several kilometers to the east, beyond the small village.",
#     "The research facility is positioned at the foot of the mountains, 30 kilometers southeast of the capital.",
#     "The new shopping mall is situated directly across from the park, just a few blocks north of the river."
# ]
# for i in sentences:
#     print(i)
#     print(extract_RSE_1(i))
#     print(extract_RSE_2(i))
#     print(extract_RSE_3(i))
#     print("-" * 50)