keyword-embeddings-space / mpl_data_plotter.py
latticetower's picture
cleanup
af49af1
raw
history blame
3.85 kB
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('agg')
import plot_utils
from constants import *
class MatplotlibDataPlotter:
    """Render bar plots of the best-matching profile per region.

    Two persistent figures are created once and redrawn in place on every
    call: one for the single-domain table and one for the domain-pair table.
    Both plots share the same pipeline (see ``_plot_best_matches``); they
    differ only in the data frame they read and the figure they draw on.

    The data frames are expected to provide the columns this class reads:
    ``cds_region_id``, ``biosyn_class``, ``biosyn_class_index``,
    ``profile_name`` and ``cosine_similarity_<split_name>`` on the
    single/pair frames, and ``cds_region_id`` / ``num_domains`` on
    ``num_domains_in_region_df``.
    """

    # Layout settings forwarded unchanged to ``plot_utils.draw_barplots``.
    _TOP_N = 5
    _BIN_WIDTH = 1
    _HUE_GROUP_OFFSET = 0.5
    _BAR_WIDTH = 0.9

    def __init__(self, single_df, pair_df, num_domains_in_region_df):
        """Store the source tables and allocate the two reusable figures.

        Args:
            single_df: Per-row table for single domains.
            pair_df: Per-row table for domain pairs.
            num_domains_in_region_df: Table with one ``num_domains`` value
                per ``cds_region_id``, used to filter regions by size.
        """
        self.single_df = single_df
        self.pair_df = pair_df
        self.num_domains_in_region_df = num_domains_in_region_df
        self.single_domains_fig = plt.figure(figsize=(5, 10))
        self.pair_domains_fig = plt.figure(figsize=(5, 10))

    def _plot_best_matches(self, df, fig, num_domains, split_name):
        """Redraw ``fig`` from ``df`` and return it.

        Steps:
          1. Keep rows whose region has at least ``num_domains`` domains.
          2. Count distinct regions per ``biosyn_class`` (passed to the
             plotting helper as ``hue2count``).
          3. For every region pick the row with the highest
             ``cosine_similarity_<split_name>``.
          4. Hand the class indices and profile names of those rows to
             ``plot_utils.draw_barplots``.
        """
        region_sizes = self.num_domains_in_region_df
        selected_region_ids = region_sizes.loc[
            region_sizes.num_domains >= num_domains, 'cds_region_id'
        ].values
        subset = df.loc[df.cds_region_id.isin(selected_region_ids)]

        # One row per (region, class) pair, then count regions per class.
        class_counts = (
            subset[['cds_region_id', 'biosyn_class']]
            .drop_duplicates()
            .groupby("biosyn_class", as_index=False)
            .count()
        )
        # The two-column frame yields (biosyn_class, count) pairs.
        hue2count = dict(class_counts.values)

        column_name = f'cosine_similarity_{split_name}'
        # Index labels of the top-scoring row within each region.
        best_row_index = subset.groupby('cds_region_id').agg(
            {column_name: 'idxmax'}
        ).values.flatten()

        targets_list = subset.loc[best_row_index, 'biosyn_class_index'].values
        label_list = subset.loc[best_row_index, 'profile_name'].values

        # Reuse the same figure object: wipe it and draw on a fresh axes.
        fig.clf()
        ax = fig.gca()
        plot_utils.draw_barplots(
            targets_list,
            label_list=label_list,
            top_n=self._TOP_N,
            bin_width=self._BIN_WIDTH,
            hue_group_offset=self._HUE_GROUP_OFFSET,
            hue_order=BIOSYN_CLASS_NAMES,
            hue2count=hue2count,
            width=self._BAR_WIDTH,
            ax=ax,
            show_legend=False,
            palette=COLOR_PALETTE,
        )
        fig.tight_layout()
        return fig

    def plot_single_domains(self, num_domains, split_name="stratified"):
        """Redraw and return the single-domain figure.

        Args:
            num_domains: Minimum ``num_domains`` a region must have to be
                included.
            split_name: Suffix selecting the similarity column
                ``cosine_similarity_<split_name>``.

        Returns:
            ``self.single_domains_fig`` after it has been cleared and redrawn.
        """
        return self._plot_best_matches(
            self.single_df, self.single_domains_fig, num_domains, split_name
        )

    def plot_pair_domains(self, num_domains, split_name="stratified"):
        """Redraw and return the domain-pair figure.

        Args:
            num_domains: Minimum ``num_domains`` a region must have to be
                included.
            split_name: Suffix selecting the similarity column
                ``cosine_similarity_<split_name>``.

        Returns:
            ``self.pair_domains_fig`` after it has been cleared and redrawn.
        """
        return self._plot_best_matches(
            self.pair_df, self.pair_domains_fig, num_domains, split_name
        )