m7n committed on
Commit
d748d3b
·
1 Parent(s): 4c3ab20

Enhance data processing in app.py and openalex_utils.py by improving handling of referenced works and filling missing publication values with spaces.

Browse files
Files changed (2) hide show
  1. app.py +2 -1
  2. openalex_utils.py +4 -1
app.py CHANGED
@@ -524,7 +524,8 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
524
  # Export relevant column
525
  export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
526
  export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
527
- export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
 
528
  if locally_approximate_publication_date_checkbox and plot_time_checkbox:
529
  export_df['approximate_publication_year'] = local_years
530
  export_df.to_csv(csv_file_path, index=False)
 
524
  # Export relevant column
525
  export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
526
  export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
527
+ export_df['referenced_works'] = [x if isinstance(x, str) else ', '.join(x) if isinstance(x, (list, tuple)) and not pd.isna(x) else '' for x in records_df['referenced_works']]
528
+
529
  if locally_approximate_publication_date_checkbox and plot_time_checkbox:
530
  export_df['approximate_publication_year'] = local_years
531
  export_df.to_csv(csv_file_path, index=False)
openalex_utils.py CHANGED
@@ -99,14 +99,17 @@ def process_records_to_df(records):
99
  records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
100
  if 'primary_location' in records_df.columns:
101
  records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
 
 
102
  else:
103
  # Process raw records as before
104
  records_df = pd.DataFrame(records)
105
  records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
106
  records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
 
107
 
108
  # Fill missing values and deduplicate
109
- records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
110
  records_df['abstract'] = records_df['abstract'].fillna(' ')
111
  records_df['title'] = records_df['title'].fillna(' ')
112
  records_df = records_df.drop_duplicates(subset=['id']).reset_index(drop=True)
 
99
  records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
100
  if 'primary_location' in records_df.columns:
101
  records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
102
+ records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ') # fill missing values with space, only if we have them.
103
+
104
  else:
105
  # Process raw records as before
106
  records_df = pd.DataFrame(records)
107
  records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
108
  records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
109
+ records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
110
 
111
  # Fill missing values and deduplicate
112
+
113
  records_df['abstract'] = records_df['abstract'].fillna(' ')
114
  records_df['title'] = records_df['title'].fillna(' ')
115
  records_df = records_df.drop_duplicates(subset=['id']).reset_index(drop=True)