Given the TPM data matrix a user provides to get_top_gene_set.py
, take some time to visually inspect each step of the processing pipeline and visualize some helpful summaries of the data.
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from Bio import SeqIO
import sys
sys.path.append('../') # use modules in main directory
import genbank_utils as gu
import get_top_gene_set as gtgs
Use the same data loading function from get_top_gene_set.py
# load TPM and feature data
data_file = '../data/extract_TPM_counts.tsv'
sample2condition_file = '../data/sample2condition.txt'
sample_file = None #'../config/samples_to_include.txt'
condition_file = '../config/conditions_to_include.txt'
gb_file = '../data/5GB1c_sequence.gb'
df, sample2condition, samples, conditions, pos_feats, neg_feats = gtgs.load_data(data_file,
sample2condition_file,
sample_file,
condition_file,
gb_file)
df.head()
feat2meta = gu.get_feat2meta_dict(gb_file)
feat2meta['EQU24_RS19310']
Use the operon flagging function to call out which loci in the genome seem to be too close together on the same strand. The choice of 120 was determined for this organism (Methylotubimicrobium buryatense 5GB1) by exploring known operons inter-gene distances. See operon_distance_exploration.ipynb
for more details.
op_min_dist = 120
maybe_operon_loci = gtgs.flag_potential_operon_loci(pos_feats, neg_feats, op_min_dist)
print(f"Number of possible operon loci at distance of {op_min_dist}bp: {len(maybe_operon_loci)}/{len(pos_feats)+len(neg_feats)}")
Pivot the data matrix and combine samples that belong to the same experimental condition. Values below are averages across samples.
# for the loaded TPM matrix, which column contains the unique gene ids?
LOCUS_ID_COL = 'locus_tag'
# use this column to get a full list of all genes for which expression was measured
LOCI = list(df[LOCUS_ID_COL].values)
df_means = gtgs.get_average_tpm_by_condition(df,samples,conditions,sample2condition,LOCI)
df_means
The printed output is of the same format of the output of get_top_gene_sets.py
# get top locus ids in all conditions
n = 3
top_locs = gtgs.get_top_n_perc_by_condition(df_means, LOCI, n)
# filter out loci maybe in operons
top_locs_op_filter_out = [x for x in top_locs if x in maybe_operon_loci]
# print top locus results
print(f'Top {n}%: {len(top_locs)} ({len(top_locs) - len(top_locs_op_filter_out)} filtered)\n')
print("Locus\toperon?")
for loc in top_locs:
print(f'{loc}\t{loc in top_locs_op_filter_out}')
The above example uses 3%, but how do you pick that threshold? There's a tradeoff between the number of genes in your top set and the TPM expression of genes in the top. If you are more strict (smaller n
), your top set will only consist of the very top genes, however you might not get very many of them. If you are more lenient (choose larger n
), you'll get more top genes, but then you start to sample genes which on average reach rather low TPM values...
The following visuals are to help show the trade off and help you decide how the minimum TPM expression you will tolerate in your "top gene set" while showing you how many genes that qualify as being in the top.
We also build a tsv showing which genes qualify as "top" at each n% threshold.
tradeoff_data = []
top_n_loci_qualifers = []
# examine top sets using a range of top 1% to top 10%
for i in range(1,11):
# get top locus ids in all conditions
top_locs = gtgs.get_top_n_perc_by_condition(df_means, LOCI, i)
top_locs_op_filter_out = [x for x in top_locs if x in maybe_operon_loci]
# get the average expression of each locus across conditions
means = []
for loc in top_locs:
mean_exp = np.mean(df_means[loc].values)
means.append(mean_exp)
# add row to the data frame consisting of:
# n: top % threshold
# locus count: number of top loci identified
# locus count filtered: locus count after filtering operon genes
# min expression: the minimum avg expression value from the top gene set
row = [i,
len(top_locs),
len(top_locs)-len(top_locs_op_filter_out),
min(means)
]
tradeoff_data.append(row)
# also, add all these loci to the top n gene sets
for loc in top_locs:
g = feat2meta[loc]['gene_symbol']
p = feat2meta[loc]['product']
t = feat2meta[loc]['type']
op = loc in top_locs_op_filter_out
i
top_n_loci_qualifers.append([loc,g,p,t,op,i])
tradeoff_df = pd.DataFrame(tradeoff_data, columns=['n','loc_count','loc_count_filt','min_exp'])
all_top_n_df = pd.DataFrame(top_n_loci_qualifers, columns=['locus_tag','gene_symbol','product','type','op?','top_perc']).groupby('locus_tag').first().reset_index().sort_values('top_perc')
tradeoff_df
all_top_n_df.head(20)
# write out tsv of top loci and which threshold at which they qualify as "top"
all_top_n_df.to_csv('top_n_loci_qualifiers.tsv',sep='\t',index=False)
def tradeoff_plot(df):
# number of genes in the top set
num_genes_layer = alt.Chart(df).mark_circle(size=100).encode(
x=alt.X('loc_count:Q', axis=alt.Axis(title='Number of loci in top n%',grid=False)),
y=alt.Y('min_exp:Q', axis=alt.Axis(title='Min. mean TPM of loci in top n%',grid=False)),
color=alt.Color('n:O', scale=alt.Scale(scheme='viridis'),legend=None),
tooltip=[alt.Tooltip('n:Q', title='Top n%'),
alt.Tooltip('loc_count:Q', title='Num loci in top n%'),
alt.Tooltip('loc_count_filt:Q', title='Num loci in top n% after operon filtering'),
alt.Tooltip('min_exp:Q', title='Min. ave. expr. in top n%')
]
)
# number of genes in the top set after filtering operon genes
op_ex_layer = alt.Chart(df).mark_point(size=100).encode(
x='loc_count_filt:Q',
y='min_exp:Q',
color=alt.Color('n:O', scale=alt.Scale(scheme='viridis'),legend=None),
tooltip=[alt.Tooltip('n:Q', title='Top N%'),
alt.Tooltip('loc_count:Q', title='Num loci in top n%'),
alt.Tooltip('loc_count_filt:Q', title='Num loci in top n% after operon filtering'),
alt.Tooltip('min_exp:Q', title='Min. ave. expr. in top n%')
]
)
# connecting line between top set total and operon-filtered sets
rule = alt.Chart(df).mark_rule().encode(
x='loc_count_filt:Q',
x2='loc_count:Q',
y='min_exp:Q',
color=alt.Color('n:O', scale=alt.Scale(scheme='viridis'),legend=None),
)
# top n% text label
text = alt.Chart(df).mark_text(
align='left',
baseline='middle',
angle=335,
dx=5,
dy=-7,
size=14,
color='black'
).encode(
x='loc_count:Q',
y='min_exp:Q',
text=alt.Text('n:N'),
#color=alt.Color('n:O', scale=alt.Scale(scheme='viridis'),legend=None),
).transform_calculate(n='"Top " + datum.n + "%"')
# combine layers into chart
chart = num_genes_layer + op_ex_layer + rule + text
# final chart configs
chart = chart.properties(
title='Tradeoff between count and expression of top n% threshold'
).configure_title(
fontSize=16,
).configure_axis(
labelFontSize=14,
titleFontSize=14
).configure_view(
strokeOpacity=0
).interactive()
#chart.save('tradeoff.html')
return chart
Instructions:
* hover over points to see the exact TPM and count values
* pan and zoom with mouse and scroll
tradeoff_plot(tradeoff_df)
This plot shows the tradeoff between number of genes in the top set (on the x-axis) and the mean TPM expression of the minimum gene in the top set (so the TPM value of the weakest gene that still qualified as being in the top n%). The filled circles represent the total number of genes in the top while the open circles are the same set but with loci-possibly-in-operons (because they were too close to another locus on the same strand) filtered out. So it is a subset of the filled circle.
The above visuals show a summary of all loci after being grouped into top sets at varying thresholds. The next visuals will instead explore the individual loci TPM profiles across conditions and compare the top set to the "rest" (all loci that did not make the top set).
# melt df to format for Altair parallel coordinates viz
alt_df = pd.melt(df_means, id_vars=['exp_condition'], value_vars=LOCI,value_name='mean_exp')
# add metadata columns from dict
alt_df['gene_symbol'] = alt_df['locus_tag'].apply(lambda x: feat2meta[x]['gene_symbol'])
alt_df['product'] = alt_df['locus_tag'].apply(lambda x: feat2meta[x]['product'])
alt_df['type'] = alt_df['locus_tag'].apply(lambda x: feat2meta[x]['type'])
alt_df['desc_string'] = alt_df.apply(lambda row: f"{row['locus_tag']}|{row['gene_symbol']}|{row['product']}",axis=1)
# impose a specific x-axis sort order on the df
# choose custom exp_condition order!
list_ordering = ['uMax','lowCH4','NoCu','lowCu','medCu','highCu','NO3_lowO2_slow_growth','highO2_slow_growth','lowO2_fast_growth','MeOH','NoLanthanum','WithLanthanum']
alt_df["exp_condition_order"] = pd.Categorical(alt_df["exp_condition"], categories=list_ordering)
alt_df['exp_id'] = alt_df['exp_condition'].apply(lambda x: list_ordering.index(x))
alt_df.head()
def view_df(df):
'''Small function to display df gene info '''
print(f"Gene count: {len(df['locus_tag'].unique())}")
return df[['locus_tag','gene_symbol','product','type']].drop_duplicates().sort_values('locus_tag').reset_index()
view_df(alt_df)
As an example, use 3 sets of top loci: 1%, 3% and 10%
top_locs_1_perc = gtgs.get_top_n_perc_by_condition(df_means, LOCI, 1)
df_1 = alt_df[alt_df['locus_tag'].isin(top_locs_1_perc)]
top_locs_3_perc = gtgs.get_top_n_perc_by_condition(df_means, LOCI, 3)
df_3 = alt_df[alt_df['locus_tag'].isin(top_locs_3_perc)]
top_locs_10_perc = gtgs.get_top_n_perc_by_condition(df_means, LOCI, 10)
df_10 = alt_df[alt_df['locus_tag'].isin(top_locs_10_perc)]
def altair_pcoords_plot_select_legend_and_highlight(df,n,xorder='exp_condition_order'):
# If more than 30 entries, make 2 columns
col_num = 1 if len(df['locus_tag'].unique()) <=50 else 2
# selections
highlight = alt.selection(type='single', on='mouseover',
fields=['desc_string'], nearest=True)
selection = alt.selection_multi(fields=['desc_string'], bind='legend')
# base?
title=f'Average TPM of top {n}% of loci across conditions'
base = alt.Chart(
df.sort_values(xorder),
title=title,
#titleFontSize=20
).encode(
x=alt.X(f'{xorder}:N',
sort=alt.EncodingSortField(field=f"{xorder}:N", op="count"),
axis=alt.Axis(title='Experimental Condition')
),
y=alt.Y('mean_exp:Q',scale=alt.Scale(type='log'),axis=alt.Axis(title='Log TPM expression')),
size=alt.value(100),
color=alt.Color('desc_string:N',
legend=alt.Legend(title='Gene',
orient = 'right',
labelLimit=0,
columns=col_num,
symbolLimit=200
)),
)
# lines
lines = base.mark_line().encode(
size=alt.condition((selection|highlight), alt.value(3), alt.value(1)),
opacity=alt.condition((selection|highlight), alt.value(1), alt.value(0.5))
).add_selection(
selection,
).properties(
width=600,
height=400
).interactive()
# points
points = base.mark_circle().encode(
tooltip=['locus_tag','product','type','gene_symbol','desc_string'],
opacity=alt.condition((selection|highlight), alt.value(1), alt.value(0.2)),
size=alt.condition((selection|highlight), alt.value(100), alt.value(3))
).add_selection(highlight)
chart = lines + points
chart = chart.configure_axis(
labelFontSize=14,
titleFontSize=14
).properties(
title=f"Expression strength of loci in top {n}%"
).configure_title(
fontSize=16,
).interactive()
#chart.save(f"pcoords_top{n}.html")
return chart
This plot shows individual gene expression profiles across the 12 conditions used in this analysis.
Instructions:
* hover over lines to show gene info.
* select colors in the legend to highlight specific genes
* shift click to select multiple genes from the legend
* pan and zoom with mouse and scroll
altair_pcoords_plot_select_legend_and_highlight(df_1,"1")
# df of genes in top 1%
view_df(df_1)
altair_pcoords_plot_select_legend_and_highlight(df_3,"3")
# df view of genes in top 3%
view_df(df_3)
Now let's compare the top set to the rest of the loci - did we really pick a super strong set?
To explore this visually, we with still plot each locus with its own line, but now we will add grey bands showing the first and second standard deviations of the remaining genes. The mean of the "rest" is also depicted as a dotted line.
def get_stdev_bounds(df_comp):
'''
Given a dataframe of TPM expression for a top set complement (aka
all loci NOT in the top set), determin the mean and first/second
standard deviations
'''
rows = []
for cond, cond_df in df_comp.groupby('exp_condition'):
vals = cond_df['mean_exp'].values
m = np.mean(vals)
sd = np.std(vals)
sd1_high = m + sd
sd1_low = max((m - sd), 0)
sd2_high = m + (2*sd)
sd2_low = max((m - (2*sd)),0)
#print(cond, m, sd)
rows.append([cond, m, sd, sd1_high, sd1_low, sd2_high, sd2_low])
return pd.DataFrame(rows, columns=['exp_condition','mean_exp','stdev','stdev1_high','stdev1_low','stdev2_high','stdev2_low'])
def stdev_plot(n,short=False,save=False):
'''
Given an n% threshold, get the top locs for this gene set,
get it's complement and produce
a lineplot of the top genes vs the 1st/2nd standard deviation
of the remaining genes
'''
# get loci in top n% and filter altair-formatted df
top_locs = gtgs.get_top_n_perc_by_condition(df_means, LOCI, n)
df_n = alt_df[alt_df['locus_tag'].isin(top_locs)]
# num loci in the top - used for plotting color palette
num_loci = len(df_n['locus_tag'].unique())
# get complement of top locs (everything NOT in the top set)
df_n_comp = alt_df[~alt_df['locus_tag'].isin(top_locs)]
# get stdev boundary df of complement
df_n_comp_stdev = get_stdev_bounds(df_n_comp)
df_n_comp_stdev["exp_condition_order"] = pd.Categorical(df_n_comp_stdev["exp_condition"], categories=list_ordering)
df_n_comp_stdev['exp_id'] = df_n_comp_stdev['exp_condition'].apply(lambda x: list_ordering.index(x))
# if more than 40 lines are drawn, make them slightly transparent
alpha = 0.2 if len(df_n['desc_string'].unique()) > 40 else 1
# +--------------+
# | start figure |
# +--------------+
fig_dim = (8,4) if short else (8,6)
fig = plt.figure(figsize=fig_dim)
# top loci lines
sns.lineplot(data=df_n, x='exp_id', y='mean_exp',hue='desc_string',
palette=sns.color_palette("hls",num_loci),alpha=alpha,legend=None)
# get x-axis order for stdev fill
x_ax = df_n_comp_stdev.sort_values('exp_id')['exp_id'].values
# mean of not-top loci
#sns.lineplot(data=df_n_comp, x='exp_condition_order', y='mean_exp',color='black',lw=3)
comp_means = df_n_comp_stdev.sort_values('exp_condition_order')['mean_exp'].values
plt.plot(x_ax,comp_means,color='black',linestyle='--',lw=3)
# plot second stdev
lower_bound = df_n_comp_stdev.sort_values('exp_condition_order')['stdev2_low'].values
upper_bound = df_n_comp_stdev.sort_values('exp_condition_order')['stdev2_high'].values
plt.fill_between(x_ax,
lower_bound,
upper_bound,
color=sns.xkcd_rgb['light grey']
)
# first stdev
lower_bound = df_n_comp_stdev.sort_values('exp_condition_order')['stdev1_low'].values
upper_bound = df_n_comp_stdev.sort_values('exp_condition_order')['stdev1_high'].values
plt.fill_between(x_ax,
lower_bound,
upper_bound,
color=sns.xkcd_rgb['greyish']
)
# chart config
plt.yscale('log')
plt.ylim(1)
plt.title(f"Expression strength of loci in top {n}% compared to rest", fontsize=18)
plt.xticks(x_ax,fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel("Experimental condition ID",fontsize=14)
plt.ylabel("Log TPM expression",fontsize=14)
sns.despine(bottom = False, left = False)
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
if save:
short_str = '_short' if short else ''
plt.savefig(f'figs/top{n}_loci_vs_rest{short_str}.eps', format='eps')
plt.savefig(f'figs/top{n}_loci_vs_rest{short_str}.png', format='png')
plt.show()
stdev_plot(1)
The black dotted line is the mean of the "rest" of the genes in each condition, the dark grey is the 1st standard deviation, and the light grey is the 2nd standard deviation
stdev_plot(3)
stdev_plot(10)
I tried a few other versions for visualizing the summary of the "remaining loci." They each have pros and cons but ultimately I thought the standard deviation bands (in the above visualizations) mostly cleanly communicated the message.
alt.data_transformers.disable_max_rows()
# get complement of top n% of genes
df_1_comp = alt_df[~alt_df['locus_tag'].isin(top_locs_1_perc)]
#df_3_comp = alt_df[~alt_df['locus_tag'].isin(top_locs_3_perc)]
#df_10_comp = alt_df[~alt_df['locus_tag'].isin(top_locs_10_perc)]
def overlapping_lines(df,comp_df,n,xorder='exp_condition_order'):
# If more than 30 entries, make 2 columns
col_num = 1 if len(df['locus_tag'].unique()) <=50 else 2
title=f'Average TPM of top {n}% of loci across conditions'
# lines
lines = alt.Chart(
df.sort_values(xorder)
).mark_line().encode(
x=alt.X(f'{xorder}:N',
sort=list_ordering,
#sort=alt.EncodingSortField(field=f"{xorder}:N", op="count"),
axis=alt.Axis(title='Experimental Condition')
),
y=alt.Y('mean_exp:Q',scale=alt.Scale(type='log')),
size=alt.value(5),
opacity=alt.value(0.8),
color=alt.Color('desc_string:N',
legend=alt.Legend(title=f'Loci in top {n}%',
orient = 'right',
labelLimit=0,
columns=col_num,
symbolLimit=200
))
).properties(
width=600,
height=400
).interactive()
# complement
lines_comp = alt.Chart(
comp_df.sort_values(xorder)
).mark_line().encode(
x=alt.X(f'{xorder}:N',
sort=list_ordering,
#sort=alt.EncodingSortField(field=f"{xorder}:N", op="count"),
axis=alt.Axis(title='Experimental Condition')
),
y=alt.Y('mean_exp:Q',scale=alt.Scale(type='log'),axis=alt.Axis(title='Log TPM Expression')),
size=alt.value(0.5),
opacity=alt.value(0.1),
detail='desc_string:N',
color=alt.value('gray')
).properties(
width=600,
height=400
).interactive()
chart = lines + lines_comp
chart = chart.configure_axis(
grid=False
).configure_view(
strokeOpacity=0
).properties(
title=f"Expression strength of loci in top {n}% compared to rest"
).configure_title(
fontSize=16,
)
#chart.save(f"pcoords_overlap_top{n}.html")
return chart
overlapping_lines(df_1,df_1_comp,'1')