Spaces:
Sleeping
Sleeping
Update appStore/prep_data.py
Browse files - appStore/prep_data.py +11 -11
appStore/prep_data.py
CHANGED
@@ -58,19 +58,21 @@ def process_iati():
|
|
58 |
|
59 |
def process_giz_worldwide():
|
60 |
"""
|
61 |
-
This
|
62 |
-
|
63 |
- Reads the file 'giz_worldwide_api_download_23_02_2025.json'
|
64 |
- Renames 'name.en' to 'project_name'
|
65 |
-
- Uses the 'merged_text' column for
|
66 |
- Creates an empty 'url' column (since the new dataset has an empty URL)
|
67 |
- Renames 'country' to 'countries'
|
68 |
- Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
|
69 |
"""
|
70 |
# Read the new JSON file
|
71 |
giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
|
|
|
72 |
# Sample random rows for quick embeddings (seed set for reproducibility)
|
73 |
-
giz_df = giz_df.sample(n=
|
|
|
74 |
# Rename columns per new dataset requirements
|
75 |
giz_df = giz_df.rename(columns={
|
76 |
'name.en': 'project_name',
|
@@ -79,20 +81,18 @@ def process_giz_worldwide():
|
|
79 |
'duration.project.end': 'end_year'
|
80 |
})
|
81 |
|
82 |
-
|
83 |
giz_df['end_year'] = giz_df['end_year'].apply(convert_to_date)
|
84 |
|
85 |
# Create an empty 'url' column as the new dataset has an empty URL
|
86 |
giz_df['url'] = ''
|
87 |
|
88 |
-
#
|
89 |
giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
|
90 |
-
giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: create_chunks(text) if isinstance(text, str) else [])
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
|
97 |
giz_df['source'] = 'GIZ_WORLDWIDE'
|
98 |
return giz_df
|
|
|
58 |
|
59 |
def process_giz_worldwide():
|
60 |
"""
|
61 |
+
This function reads the new giz_worldwide file and prepares the data for embedding.
|
62 |
+
Adjustments made:
|
63 |
- Reads the file 'giz_worldwide_api_download_23_02_2025.json'
|
64 |
- Renames 'name.en' to 'project_name'
|
65 |
+
- Uses the 'merged_text' column for embedding the whole text (no chunking)
|
66 |
- Creates an empty 'url' column (since the new dataset has an empty URL)
|
67 |
- Renames 'country' to 'countries'
|
68 |
- Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
|
69 |
"""
|
70 |
# Read the new JSON file
|
71 |
giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
|
72 |
+
|
73 |
# Sample random rows for quick embeddings (seed set for reproducibility)
|
74 |
+
giz_df = giz_df.sample(n=30, random_state=42)
|
75 |
+
|
76 |
# Rename columns per new dataset requirements
|
77 |
giz_df = giz_df.rename(columns={
|
78 |
'name.en': 'project_name',
|
|
|
81 |
'duration.project.end': 'end_year'
|
82 |
})
|
83 |
|
|
|
84 |
giz_df['end_year'] = giz_df['end_year'].apply(convert_to_date)
|
85 |
|
86 |
# Create an empty 'url' column as the new dataset has an empty URL
|
87 |
giz_df['url'] = ''
|
88 |
|
89 |
+
# Compute text_size based on merged_text and assign full text to the 'chunks' column
|
90 |
giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
|
|
|
91 |
|
92 |
+
# Use the full merged_text instead of creating chunks.
|
93 |
+
# If your downstream code expects a list of texts, use:
|
94 |
+
# giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
|
95 |
+
giz_df['chunks'] = giz_df['merged_text']
|
96 |
|
97 |
giz_df['source'] = 'GIZ_WORLDWIDE'
|
98 |
return giz_df
|