TEST-GIZ-Project-Search

Sleeping

App Files Community

annikwag committed on Feb 26

Commit

21fcb5a

verified ·

1 Parent(s): bf862a6

Update appStore/prep_data.py

Browse files

Files changed (1) hide show

appStore/prep_data.py +11 -11

appStore/prep_data.py CHANGED Viewed

@@ -58,19 +58,21 @@ def process_iati():
 def process_giz_worldwide():
     """
-    This will read the new giz_worldwide file and create the chunks.
-    The following adjustments have been made:
       - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
       - Renames 'name.en' to 'project_name'
-      - Uses the 'merged_text' column for creating chunks and computing text size
       - Creates an empty 'url' column (since the new dataset has an empty URL)
       - Renames 'country' to 'countries'
       - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
     """
     # Read the new JSON file
     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
     # Sample random rows for quick embeddings (seed set for reproducibility)
-    giz_df = giz_df.sample(n=15, random_state=42)
     # Rename columns per new dataset requirements
     giz_df = giz_df.rename(columns={
         'name.en': 'project_name',
@@ -79,20 +81,18 @@ def process_giz_worldwide():
         'duration.project.end': 'end_year'
     })
     giz_df['end_year'] = giz_df['end_year'].apply(convert_to_date)
     # Create an empty 'url' column as the new dataset has an empty URL
     giz_df['url'] = ''
-    # Create text_size based on merged_text and create chunks from merged_text
     giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
-    giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: create_chunks(text) if isinstance(text, str) else [])
-    print("initial df length:", len(giz_df))
-    giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
-    print("new df length:", len(giz_df))
-    print(giz_df.columns)
     giz_df['source'] = 'GIZ_WORLDWIDE'
     return giz_df

 def process_giz_worldwide():
     """
+    This function reads the new giz_worldwide file and prepares the data for embedding.
+    Adjustments made:
       - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
       - Renames 'name.en' to 'project_name'
+      - Uses the 'merged_text' column for embedding the whole text (no chunking)
       - Creates an empty 'url' column (since the new dataset has an empty URL)
       - Renames 'country' to 'countries'
       - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
     """
     # Read the new JSON file
     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
     # Sample random rows for quick embeddings (seed set for reproducibility)
+    giz_df = giz_df.sample(n=30, random_state=42)
     # Rename columns per new dataset requirements
     giz_df = giz_df.rename(columns={
         'name.en': 'project_name',
         'duration.project.end': 'end_year'
     })
     giz_df['end_year'] = giz_df['end_year'].apply(convert_to_date)
     # Create an empty 'url' column as the new dataset has an empty URL
     giz_df['url'] = ''
+    # Compute text_size based on merged_text and assign full text to the 'chunks' column
     giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
+    # Use the full merged_text instead of creating chunks.
+    # If your downstream code expects a list of texts, use:
+    # giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
+    giz_df['chunks'] = giz_df['merged_text']
     giz_df['source'] = 'GIZ_WORLDWIDE'
     return giz_df