TEST-GIZ-Project-Search

Sleeping

App Files Files Community

annikwag committed on Mar 4

Commit

b4aa482

verified ·

1 Parent(s): 3114b48

Update appStore/prep_data.py

Browse files

Files changed (1) hide show

appStore/prep_data.py +19 -13

appStore/prep_data.py CHANGED Viewed

@@ -56,6 +56,18 @@ def process_iati():
     return projects_df
 def process_giz_worldwide():
     """
     This function reads the new giz_worldwide file and prepares the data for embedding.
@@ -66,14 +78,12 @@ def process_giz_worldwide():
       - Creates an empty 'url' column (since the new dataset has an empty URL)
       - Renames 'country' to 'countries'
       - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
     """
     # Read the new JSON file
     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
-    # Sample random rows for quick embeddings (seed set for reproducibility)
-    #giz_df = giz_df.sample(n=100, random_state=42)
-    # Reset the index so that create_documents can iterate using integer indices
     giz_df = giz_df.reset_index(drop=True)
     # Rename columns per new dataset requirements
@@ -88,19 +98,15 @@ def process_giz_worldwide():
     # Create an empty 'url' column as the new dataset has an empty URL
     giz_df['url'] = ''
-    # Convert the CRS value to integer (if the column exists)
     if 'crs_value' in giz_df.columns:
-        giz_df['crs_value'] = giz_df['crs_value'].apply(
-            lambda x: int(float(x)) if pd.notnull(x) and str(x).strip() != "" else x
-        )
     # Compute text_size based on merged_text and assign full text to the 'chunks' column
     giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
-    # Use the full merged_text for embedding (no chunking).
-    # If downstream code expects a list, you could instead wrap it in a list:
-    # giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
     giz_df['chunks'] = giz_df['merged_text']
     giz_df['source'] = 'GIZ_WORLDWIDE'

     return projects_df
def convert_crs_value(x):
    """
    This function normalises a single 'crs_value' cell to an integer whenever
    that can be done without losing information.
      - Missing values (None / NaN) are returned unchanged.
      - List-like values are returned unchanged (they cannot be a single code).
      - Whole floats (e.g. 11110.0) are converted to int.
      - Strings are stripped and an all-zero fractional part is dropped
        ("11110.0", "11110.00" -> 11110).
      - Anything that still cannot be parsed as an integer (e.g. 12.5, "abc",
        "") is returned unchanged.
    """
    # pd.isnull returns an array for list-like input, whose truth value is
    # ambiguous; hand such cells back untouched instead of raising.
    if pd.api.types.is_list_like(x):
        return x
    if pd.isnull(x):
        return x
    # Real floats are handled numerically so that values whose repr is in
    # exponent form (e.g. 1e16) are still converted; inf and fractional
    # values are left as they are.
    if isinstance(x, float):
        return int(x) if x.is_integer() else x
    text = str(x).strip()
    whole, dot, fraction = text.partition(".")
    # Drop the fractional part only when it consists solely of zeros.
    if dot and fraction.strip("0") == "":
        text = whole
    try:
        return int(text)
    except ValueError:
        # Not an integer representation: keep the original value.
        return x
 def process_giz_worldwide():
     """
     This function reads the new giz_worldwide file and prepares the data for embedding.
       - Creates an empty 'url' column (since the new dataset has an empty URL)
       - Renames 'country' to 'countries'
       - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
+      - Converts 'crs_value' to an integer (dropping any .0) if present.
     """
     # Read the new JSON file
     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
+    # Reset index for safety
     giz_df = giz_df.reset_index(drop=True)
     # Rename columns per new dataset requirements
     # Create an empty 'url' column as the new dataset has an empty URL
     giz_df['url'] = ''
+    # Convert CRS value: use string replacement and then integer conversion.
     if 'crs_value' in giz_df.columns:
+        giz_df['crs_value'] = giz_df['crs_value'].apply(convert_crs_value)
     # Compute text_size based on merged_text and assign full text to the 'chunks' column
     giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
+    # Use the full merged_text for embedding (no chunking)
     giz_df['chunks'] = giz_df['merged_text']
     giz_df['source'] = 'GIZ_WORLDWIDE'