Spaces:

latticetower
/

keyword-embeddings-space

Sleeping

App Files Files

keyword-embeddings-space / mpl_data_plotter.py

latticetower

cleanup

af49af1 2 months ago

raw

history blame

3.85 kB


	import matplotlib.pyplot as plt
	import matplotlib
	# Select the non-interactive Agg backend so figures are rendered off-screen
	# (no GUI/display is needed to build them).
	matplotlib.use('agg')

	import plot_utils
	# NOTE(review): presumably supplies BIOSYN_CLASS_NAMES and COLOR_PALETTE,
	# which are used below but not defined in this file - confirm in constants.
	from constants import *


	class MatplotlibDataPlotter:
	    """Draw per-class bar plots for single-domain and domain-pair tables.

	    Two matplotlib figures are created once in ``__init__`` and reused: every
	    ``plot_*`` call clears its figure, redraws it through
	    ``plot_utils.draw_barplots`` and returns that same ``Figure`` object, so a
	    figure returned by an earlier call is overwritten by the next call.
	    """

	    # Fixed layout settings forwarded unchanged to plot_utils.draw_barplots.
	    _TOP_N = 5
	    _BIN_WIDTH = 1
	    _HUE_GROUP_OFFSET = 0.5
	    _BAR_WIDTH = 0.9

	    def __init__(self, single_df, pair_df, num_domains_in_region_df):
	        """Store the input tables and allocate the two reusable figures.

	        Args:
	            single_df: table read by ``plot_single_domains``. The columns
	                accessed are ``cds_region_id``, ``biosyn_class``,
	                ``biosyn_class_index``, ``profile_name`` and
	                ``cosine_similarity_<split_name>``.
	            pair_df: table read by ``plot_pair_domains``; the same columns
	                are accessed.
	            num_domains_in_region_df: table with ``cds_region_id`` and
	                ``num_domains`` columns, used to filter regions.
	        """
	        self.single_df = single_df
	        self.pair_df = pair_df

	        self.num_domains_in_region_df = num_domains_in_region_df

	        self.single_domains_fig = plt.figure(figsize=(5, 10))
	        self.pair_domains_fig = plt.figure(figsize=(5, 10))

	    def plot_single_domains(self, num_domains, split_name="stratified"):
	        """Redraw and return the figure built from ``single_df``.

	        Args:
	            num_domains: only regions whose ``num_domains`` is at least this
	                value are plotted.
	            split_name: suffix of the ``cosine_similarity_<split_name>``
	                column used to pick one row per region.

	        Returns:
	            ``self.single_domains_fig`` after it has been cleared and redrawn.
	        """
	        return self._plot_domains(
	            self.single_df, self.single_domains_fig, num_domains, split_name
	        )

	    def plot_pair_domains(self, num_domains, split_name="stratified"):
	        """Redraw and return the figure built from ``pair_df``.

	        Args:
	            num_domains: only regions whose ``num_domains`` is at least this
	                value are plotted.
	            split_name: suffix of the ``cosine_similarity_<split_name>``
	                column used to pick one row per region.

	        Returns:
	            ``self.pair_domains_fig`` after it has been cleared and redrawn.
	        """
	        return self._plot_domains(
	            self.pair_df, self.pair_domains_fig, num_domains, split_name
	        )

	    def _plot_domains(self, df, fig, num_domains, split_name):
	        """Filter ``df`` by region size, pick one row per region, draw on ``fig``.

	        Shared implementation of both public ``plot_*`` methods, which differ
	        only in the table they read and the figure they draw on.
	        """
	        region_sizes = self.num_domains_in_region_df
	        selected_region_ids = region_sizes.loc[
	            region_sizes.num_domains >= num_domains,
	            'cds_region_id'].values

	        df_subset = df.loc[df.cds_region_id.isin(selected_region_ids)]

	        # For every biosyn_class, count the distinct (region, class) pairs left
	        # after filtering; the two-column result converts to {class: count}.
	        biosyn_counts = (
	            df_subset[['cds_region_id', 'biosyn_class']]
	            .drop_duplicates()
	            .groupby("biosyn_class", as_index=False)
	            .count()
	        )
	        hue2count = dict(biosyn_counts.values)

	        # Per region, take the index label of the row with the highest
	        # similarity value for the requested split.
	        # NOTE(review): the .loc lookups below assume index labels are unique
	        # and that every region has a non-NaN similarity value - confirm.
	        column_name = f'cosine_similarity_{split_name}'
	        best_row_labels = df_subset.groupby('cds_region_id').agg(
	            {column_name: 'idxmax'}
	        ).values.flatten()
	        targets_list = df_subset.loc[best_row_labels, 'biosyn_class_index'].values
	        label_list = df_subset.loc[best_row_labels, 'profile_name'].values

	        # Clear only after the data has been prepared, so a failure above
	        # leaves the previously drawn figure untouched.
	        fig.clf()

	        ax = fig.gca()
	        plot_utils.draw_barplots(
	            targets_list,
	            label_list=label_list,
	            top_n=self._TOP_N,
	            bin_width=self._BIN_WIDTH,
	            hue_group_offset=self._HUE_GROUP_OFFSET,
	            hue_order=BIOSYN_CLASS_NAMES,
	            hue2count=hue2count,
	            width=self._BAR_WIDTH,
	            ax=ax,
	            show_legend=False,
	            palette=COLOR_PALETTE
	        )
	        fig.tight_layout()
	        return fig