annikwag committed on
Commit
b4aa482
·
verified ·
1 Parent(s): 3114b48

Update appStore/prep_data.py

Browse files
Files changed (1) hide show
  1. appStore/prep_data.py +19 -13
appStore/prep_data.py CHANGED
@@ -56,6 +56,18 @@ def process_iati():
56
  return projects_df
57
 
58
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def process_giz_worldwide():
60
  """
61
  This function reads the new giz_worldwide file and prepares the data for embedding.
@@ -66,14 +78,12 @@ def process_giz_worldwide():
66
  - Creates an empty 'url' column (since the new dataset has an empty URL)
67
  - Renames 'country' to 'countries'
68
  - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
 
69
  """
70
  # Read the new JSON file
71
  giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
72
 
73
- # Sample random rows for quick embeddings (seed set for reproducibility)
74
- #giz_df = giz_df.sample(n=100, random_state=42)
75
-
76
- # Reset the index so that create_documents can iterate using integer indices
77
  giz_df = giz_df.reset_index(drop=True)
78
 
79
  # Rename columns per new dataset requirements
@@ -88,19 +98,15 @@ def process_giz_worldwide():
88
 
89
  # Create an empty 'url' column as the new dataset has an empty URL
90
  giz_df['url'] = ''
91
-
92
- # Convert the CRS value to integer (if the column exists)
93
  if 'crs_value' in giz_df.columns:
94
- giz_df['crs_value'] = giz_df['crs_value'].apply(
95
- lambda x: int(float(x)) if pd.notnull(x) and str(x).strip() != "" else x
96
- )
97
-
98
  # Compute text_size based on merged_text and assign full text to the 'chunks' column
99
  giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
100
 
101
- # Use the full merged_text for embedding (no chunking).
102
- # If downstream code expects a list, you could instead wrap it in a list:
103
- # giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: [text] if isinstance(text, str) else [])
104
  giz_df['chunks'] = giz_df['merged_text']
105
 
106
  giz_df['source'] = 'GIZ_WORLDWIDE'
 
56
  return projects_df
57
 
58
 
59
def convert_crs_value(x):
    """
    Convert a single 'crs_value' cell to an integer when it is a whole number.

    The value is stringified, surrounding whitespace is removed and a
    fractional part made up only of zeros is dropped before calling int():
    - 43010.0, "43010.0", "43010.00" and " 43010 " all become 43010
    - Null scalars (None, NaN, ...) are returned unchanged
    - Non-scalar cells (lists, dicts, arrays) are returned unchanged
    - Anything int() cannot parse (e.g. "", "abc", 12.5) is returned unchanged

    Intended to be used element-wise, e.g. via Series.apply.
    """
    # pd.isnull returns an array for list-likes, which would make the null
    # check below raise "truth value is ambiguous"; pass such cells through.
    if not pd.api.types.is_scalar(x):
        return x
    if pd.isnull(x):
        return x

    text = str(x).strip()

    # Drop a fractional part only when it consists solely of zeros, so that
    # "43010.0" / "43010.00" convert while "43010.5" is left untouched.
    whole, dot, fraction = text.partition(".")
    if dot and fraction and not fraction.strip("0"):
        text = whole

    try:
        return int(text)
    except ValueError:
        # Not an integer representation: keep the original value as-is.
        return x
70
+
71
  def process_giz_worldwide():
72
  """
73
  This function reads the new giz_worldwide file and prepares the data for embedding.
 
78
  - Creates an empty 'url' column (since the new dataset has an empty URL)
79
  - Renames 'country' to 'countries'
80
  - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
81
+ - Converts 'crs_value' to an integer (dropping any .0) if present.
82
  """
83
  # Read the new JSON file
84
  giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
85
 
86
+ # Reset index for safety
 
 
 
87
  giz_df = giz_df.reset_index(drop=True)
88
 
89
  # Rename columns per new dataset requirements
 
98
 
99
  # Create an empty 'url' column as the new dataset has an empty URL
100
  giz_df['url'] = ''
101
+
102
+ # Convert CRS value: use string replacement and then integer conversion.
103
  if 'crs_value' in giz_df.columns:
104
+ giz_df['crs_value'] = giz_df['crs_value'].apply(convert_crs_value)
105
+
 
 
106
  # Compute text_size based on merged_text and assign full text to the 'chunks' column
107
  giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
108
 
109
+ # Use the full merged_text for embedding (no chunking)
 
 
110
  giz_df['chunks'] = giz_df['merged_text']
111
 
112
  giz_df['source'] = 'GIZ_WORLDWIDE'