annikwag committed on
Commit
21fcb5a
·
verified ·
1 Parent(s): bf862a6

Update appStore/prep_data.py

Browse files
Files changed (1) hide show
  1. appStore/prep_data.py +11 -11
appStore/prep_data.py CHANGED
@@ -58,19 +58,21 @@ def process_iati():
58
 
59
  def process_giz_worldwide():
60
  """
61
- This will read the new giz_worldwide file and create the chunks.
62
- The following adjustments have been made:
63
  - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
64
  - Renames 'name.en' to 'project_name'
65
- - Uses the 'merged_text' column for creating chunks and computing text size
66
  - Creates an empty 'url' column (since the new dataset has an empty URL)
67
  - Renames 'country' to 'countries'
68
  - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
69
  """
70
  # Read the new JSON file
71
  giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
 
72
  # Sample random rows for quick embeddings (seed set for reproducibility)
73
- giz_df = giz_df.sample(n=15, random_state=42)
 
74
  # Rename columns per new dataset requirements
75
  giz_df = giz_df.rename(columns={
76
  'name.en': 'project_name',
@@ -79,20 +81,18 @@ def process_giz_worldwide():
79
  'duration.project.end': 'end_year'
80
  })
81
 
82
-
83
  giz_df['end_year'] = giz_df['end_year'].apply(convert_to_date)
84
 
85
  # Create an empty 'url' column as the new dataset has an empty URL
86
  giz_df['url'] = ''
87
 
88
- # Create text_size based on merged_text and create chunks from merged_text
89
  giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
90
- giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: create_chunks(text) if isinstance(text, str) else [])
91
 
92
- print("initial df length:", len(giz_df))
93
- giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
94
- print("new df length:", len(giz_df))
95
- print(giz_df.columns)
96
 
97
  giz_df['source'] = 'GIZ_WORLDWIDE'
98
  return giz_df
 
58
 
59
  def process_giz_worldwide():
60
  """
61
+ This function reads the new giz_worldwide file and prepares the data for embedding.
62
+ Adjustments made:
63
  - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
64
  - Renames 'name.en' to 'project_name'
65
+ - Uses the 'merged_text' column for embedding the whole text (no chunking)
66
  - Creates an empty 'url' column (since the new dataset has an empty URL)
67
  - Renames 'country' to 'countries'
68
  - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
69
  """
70
  # Read the new JSON file
71
  giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
72
+
73
  # Sample random rows for quick embeddings (seed set for reproducibility)
74
+ giz_df = giz_df.sample(n=30, random_state=42)
75
+
76
  # Rename columns per new dataset requirements
77
  giz_df = giz_df.rename(columns={
78
  'name.en': 'project_name',
 
81
  'duration.project.end': 'end_year'
82
  })
83
 
 
84
  giz_df['end_year'] = giz_df['end_year'].apply(convert_to_date)
85
 
86
  # Create an empty 'url' column as the new dataset has an empty URL
87
  giz_df['url'] = ''
88
 
89
+ # Compute text_size based on merged_text and assign full text to the 'chunks' column
90
  giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
 
91
 
92
+ # Use the full merged_text instead of creating chunks.
93
+ # If your downstream code expects a list of texts, use:
94
+ # giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
95
+ giz_df['chunks'] = giz_df['merged_text']
96
 
97
  giz_df['source'] = 'GIZ_WORLDWIDE'
98
  return giz_df