latticetower committed on
Commit
ca7444f
·
1 Parent(s): 5710643

add fix for int64 conversion error

Browse files
Files changed (2) hide show
  1. app.py +9 -1
  2. mpl_data_plotter.py +2 -1
app.py CHANGED
@@ -8,14 +8,22 @@ from constants import *
8
 
9
  from mpl_data_plotter import MatplotlibDataPlotter
10
 
 
 
 
 
 
 
11
 
12
  print(f"Loading domains data...")
13
  single_df = pd.read_csv(SINGLE_DOMAINS_FILE, compression='gzip')
14
  single_df['biosyn_class_index'] = single_df.bgc_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
 
15
 
16
  pair_df = pd.read_csv(PAIR_DOMAINS_FILE, compression='gzip')
17
  pair_df['biosyn_class_index'] = pair_df.bgc_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
18
- # unique_domain_lengths = single_df.dom_location_len.unique()
 
19
  num_domains_in_region_df = single_df.groupby('cds_region_id', as_index=False).agg({'as_domain_id': 'count'}).rename(
20
  columns={'as_domain_id': 'num_domains'})
21
 
 
8
 
9
  from mpl_data_plotter import MatplotlibDataPlotter
10
 
11
+ def convert_int64_to_int32(df):
12
+ for col in df.columns:
13
+ if df[col].dtype == 'int64':
14
+ print(col)
15
+ df[col] = df[col].astype('int32')
16
+ return df
17
 
18
  print(f"Loading domains data...")
19
  single_df = pd.read_csv(SINGLE_DOMAINS_FILE, compression='gzip')
20
  single_df['biosyn_class_index'] = single_df.bgc_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
21
+ single_df = convert_int64_to_int32(single_df)
22
 
23
  pair_df = pd.read_csv(PAIR_DOMAINS_FILE, compression='gzip')
24
  pair_df['biosyn_class_index'] = pair_df.bgc_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
25
+ pair_df = convert_int64_to_int32(pair_df)
26
+
27
  num_domains_in_region_df = single_df.groupby('cds_region_id', as_index=False).agg({'as_domain_id': 'count'}).rename(
28
  columns={'as_domain_id': 'num_domains'})
29
 
mpl_data_plotter.py CHANGED
@@ -26,6 +26,7 @@ class MatplotlibDataPlotter:
26
  'cds_region_id'].values
27
  single_df_subset = self.single_df.loc[self.single_df.cds_region_id.isin(selected_region_ids)]
28
 
 
29
  split_name = 'stratified'
30
  column_name = f'cosine_similarity_{split_name}'
31
  # single_df_subset = single_df.loc[single_df.dom_location_len >= num_domains]
@@ -69,7 +70,7 @@ class MatplotlibDataPlotter:
69
  self.num_domains_in_region_df.num_domains >= num_domains,
70
  'cds_region_id'].values
71
  pair_df_subset = self.pair_df.loc[self.pair_df.cds_region_id.isin(selected_region_ids)]
72
-
73
  split_name = 'stratified'
74
  column_name = f'cosine_similarity_{split_name}'
75
  # pair_df_subset = pair_df.loc[pair_df.dom_location_len >= num_domains]
 
26
  'cds_region_id'].values
27
  single_df_subset = self.single_df.loc[self.single_df.cds_region_id.isin(selected_region_ids)]
28
 
29
+ return self.single_domains_fig
30
  split_name = 'stratified'
31
  column_name = f'cosine_similarity_{split_name}'
32
  # single_df_subset = single_df.loc[single_df.dom_location_len >= num_domains]
 
70
  self.num_domains_in_region_df.num_domains >= num_domains,
71
  'cds_region_id'].values
72
  pair_df_subset = self.pair_df.loc[self.pair_df.cds_region_id.isin(selected_region_ids)]
73
+ return self.pair_domains_fig
74
  split_name = 'stratified'
75
  column_name = f'cosine_similarity_{split_name}'
76
  # pair_df_subset = pair_df.loc[pair_df.dom_location_len >= num_domains]