When testing the framework on a new organism, it is important to go through the computational validation to assess the quality of the predicted motif. This notebook:
E. coli and B. subtilis were tested at varying thresholds of n (the top n% of genes used to derived a consensus motif). The template for comparing a new organism is demonstrated with T. maritima.
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('../') # use modules in main directory
import genbank_utils as gu
import consensus_viz_utils as cu
The computational pipeline was previously run for each of these organism at varying thresholds of n. The final _SELECTION.fa
file is provided for each n% threshold. The run commands to generate the _SELECTION.fa
files are listed below. (See Github readme for more details)
for N = [1,2,3,4,5,6,10,20]:
python get_top_gene_set.py data/extract_TPM_counts.tsv locus_tag [N] 120 data/sample2condition.txt data/5GB1c_sequence.gb mbur_out -c config/conditions_to_include.txt -s config/samples_to_include.txt
python extract_upstream_regions.py mbur_out/loci_in_top_[N]perc.txt data/5GB1c_sequence.gb mbur_out
python predict_promoter_signal.py mbur_out/loci_in_top_[N]perc_upstream_regions_w300_min20_trunc.fa config/bioprospector_config.txt mbur_out
for N = 1 thru 10:
python get_top_gene_set.py data/ecoli_log_TPM_counts.txt locus_tag [N] 83 data/ecoli_sample2condition.txt data/ecoli_NC_000913.3.gb ecoli_out -s config/ecoli_samples_to_include.txt -c config/ecoli_conditions_to_include.txt
python extract_upstream_regions.py ecoli_out/loci_in_top_[N]perc.txt data/ecoli_NC_000913.3.gb ecoli_out
python predict_promoter_signal.py ecoli_out/loci_in_top_[N]perc_upstream_regions_w300_min20_trunc.fa config/bioprospector_config.txt ecoli_out
for N = 3 thru 10:
python get_top_gene_set.py data/bsubtilis_nicolas.tsv locus_tag [N] 80 data/bsub_sample2condition.txt data/bsubtilis_AL009126.gbff bsub_out -s config/bsub_samples_to_include.txt -c config/bsub_conditions_to_include.txt
python extract_upstream_regions.py bsub_out/loci_in_top_[N]perc.txt data/bsubtilis_AL009126.gbff bsub_out -r
python predict_promoter_signal.py bsub_out/loci_in_top_[N]perc_upstream_regions_w300_minus15_min20_trunc.fa config/bioprospector_config.txt bsub_out
# --- M. buryatense ---
mb_f_dict = {
1:'../mbur_out/loci_in_top_1perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1598952145_SELECTION.fa',
2:'../mbur_out/loci_in_top_2perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1599038857_SELECTION.fa',
3:'../mbur_out/loci_in_top_3perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1598952202_SELECTION.fa',
4:'../mbur_out/loci_in_top_4perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1599039044_SELECTION.fa',
5:'../mbur_out/loci_in_top_5perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1598952321_SELECTION.fa',
6:'../mbur_out/loci_in_top_6perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1599039246_SELECTION.fa',
10:'../mbur_out/loci_in_top_10perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1598952515_SELECTION.fa',
20:'../mbur_out/loci_in_top_20perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1598953063_SELECTION.fa',
}
mb_gbFile = "../data/5GB1c_sequence.gb"
# --- E. coli ---
ec_f_dict = {
1:'../ecoli_out/loci_in_top_1perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616202668_SELECTION.fa',
2:'../ecoli_out/loci_in_top_2perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616202614_SELECTION.fa',
3:'../ecoli_out/loci_in_top_3perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1615799844_SELECTION.fa',
4:'../ecoli_out/loci_in_top_4perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1615799724_SELECTION.fa',
5:'../ecoli_out/loci_in_top_5perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1615799564_SELECTION.fa',
6:'../ecoli_out/loci_in_top_6perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1615799334_SELECTION.fa',
7:'../ecoli_out/loci_in_top_7perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1615799001_SELECTION.fa',
8:'../ecoli_out/loci_in_top_8perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1615798640_SELECTION.fa',
9:'../ecoli_out/loci_in_top_9perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1615798208_SELECTION.fa',
10:'../ecoli_out/loci_in_top_10perc_upstream_regions_w300_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1615797619_SELECTION.fa'
}
ec_gbFile = '../data/ecoli_NC_000913.3.gb'
# --- B. subtilis ---
bs_f_dict = {
3:'../bsub_out/loci_in_top_3perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1617146461_SELECTION.fa',
4:'../bsub_out/loci_in_top_4perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1617146394_SELECTION.fa',
5:'../bsub_out/loci_in_top_5perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616122559_SELECTION.fa',
6:'../bsub_out/loci_in_top_6perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616122046_SELECTION.fa',
7:'../bsub_out/loci_in_top_7perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616121902_SELECTION.fa',
8:'../bsub_out/loci_in_top_8perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616121073_SELECTION.fa',
9:'../bsub_out/loci_in_top_9perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616121645_SELECTION.fa',
10:'../bsub_out/loci_in_top_10perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616120092_SELECTION.fa',
}
bs_gbFile = "../data/bsubtilis_AL009126.gbff"
The reload is much shorter than running the genome match finding code
mb_match_file = "../mbur_out/mbur1-20_all_motif_match_df.tsv"
mb_motif_dict, \
mb_all_motif_match_cat_df, \
mb_all_top_motif_match_cat_df = cu.reload_match_df_info(mb_match_file,mb_f_dict,mb_gbFile)
ec_match_file = "../ecoli_out/ecoli1-10_all_motif_match_df.tsv"
ec_motif_dict, \
ec_all_motif_match_cat_df, \
ec_all_top_motif_match_cat_df = cu.reload_match_df_info(ec_match_file,ec_f_dict,ec_gbFile, threshold=10)
bs_match_file = "../bsub_out/bsub3-10_all_motif_match_df.tsv"
bs_motif_dict, \
bs_all_motif_match_cat_df, \
bs_all_top_motif_match_cat_df = cu.reload_match_df_info(bs_match_file,bs_f_dict,bs_gbFile)
Don't run this section of cells if you've already run the reload section above! If this is your first time through, uncomment and run these cells.
# mb_motif_dict, \
# mb_all_hex_df, \
# mb_all_motif_match_df, \
# mb_all_motif_match_cat_df, \
# mb_all_top_motif_match_cat_df = cu.compare_consensus_motifs(mb_f_dict,mb_gbFile,threshold=12)
# ec_motif_dict, \
# ec_all_hex_df, \
# ec_all_motif_match_df, \
# ec_all_motif_match_cat_df, \
# ec_all_top_motif_match_cat_df = cu.compare_consensus_motifs(ec_f_dict,ec_gbFile,threshold=10) #threshold=10
# bs_motif_dict, \
# bs_all_hex_df, \
# bs_all_motif_match_df, \
# bs_all_motif_match_cat_df, \
# bs_all_top_motif_match_cat_df = cu.compare_consensus_motifs(bs_f_dict,bs_gbFile,threshold=12)
# write out if you haven't before (saves time later)
# mb_all_motif_match_df.to_csv("../mbur_out/mbur1-20_all_motif_match_df.tsv",index=False,sep='\t')
# ec_all_motif_match_df.to_csv("../ecoli_out/ecoli1-10_all_motif_match_df.tsv",index=False,sep='\t')
# bs_all_motif_match_df.to_csv("../bsub_out/bsub3-10_all_motif_match_df.tsv",index=False,sep='\t')
Same as Figure S3, but now for M. buryatense, E. coli and B. subtilis results
# smaller subset of n% groups used in supp figure visualizations
n_subset = [3,5,8,9,10]
# all pssm matches
print("Motif enrichment for all matches")
cu.compare_genome_cat_enrichment(mb_all_motif_match_cat_df)
# high scoring pssm above 12
print("Motif enrichment for strong matches > 12.0")
cu.compare_genome_cat_enrichment(mb_all_top_motif_match_cat_df)
# all pssm matches
print("Motif enrichment for all matches")
cu.compare_genome_cat_enrichment(ec_all_motif_match_cat_df)
print("Motif enrichment for strong matches > 10.0")
# high scoring pssm above 10
cu.compare_genome_cat_enrichment(ec_all_top_motif_match_cat_df)
print("Subset of n% groups used in Fig S4")
print("Motif enrichment for all matches")
cu.compare_genome_cat_enrichment(ec_all_motif_match_cat_df[ec_all_motif_match_cat_df['nperc'].isin(n_subset)])
print("Motif enrichment for strong matches > 10.0")
cu.compare_genome_cat_enrichment(ec_all_top_motif_match_cat_df[ec_all_top_motif_match_cat_df['nperc'].isin(n_subset)])
# all pssm matches
print("Motif enrichment for all matches")
cu.compare_genome_cat_enrichment(bs_all_motif_match_cat_df)
# high scoring pssm above 12
print("Motif enrichment for strong matches > 12.0")
cu.compare_genome_cat_enrichment(bs_all_top_motif_match_cat_df)
print("Subset of n% groups used in Fig S4")
print("Motif enrichment for all matches")
cu.compare_genome_cat_enrichment(bs_all_motif_match_cat_df[bs_all_motif_match_cat_df['nperc'].isin(n_subset)])
print("Motif enrichment for strong matches > 12.0")
cu.compare_genome_cat_enrichment(bs_all_top_motif_match_cat_df[bs_all_top_motif_match_cat_df['nperc'].isin(n_subset)])
log2(red_bar/orange_bar
). # M. buryatense
mb_enrich_df = cu.build_enrich_ratio_df(mb_all_motif_match_cat_df, mb_motif_dict)
mb_enrich_df
# E. coli
ec_enrich_df = cu.build_enrich_ratio_df(ec_all_motif_match_cat_df, ec_motif_dict)
ec_enrich_df
# E. coli subset used for Figure S4
ec_enrich_df_sub = cu.build_enrich_ratio_df(ec_all_motif_match_cat_df[ec_all_motif_match_cat_df['nperc'].isin(n_subset)], ec_motif_dict)
ec_enrich_df_sub
# B. subtilis
bs_enrich_df = cu.build_enrich_ratio_df(bs_all_motif_match_cat_df, bs_motif_dict)
bs_enrich_df
# B. subtilis subset used for Figure S4
bs_enrich_df_sub = cu.build_enrich_ratio_df(bs_all_motif_match_cat_df[bs_all_motif_match_cat_df['nperc'].isin(n_subset)], bs_motif_dict)
bs_enrich_df_sub
org_dict = {
'M. buryatense': mb_enrich_df,
'E. coli': ec_enrich_df,
'B. subtilis':bs_enrich_df
}
cu.compare_orgs_chart(org_dict)
# Top enrich dfs
mb_top_enrich_df = cu.build_enrich_ratio_df(mb_all_top_motif_match_cat_df, mb_motif_dict)
ec_top_enrich_df = cu.build_enrich_ratio_df(ec_all_top_motif_match_cat_df, ec_motif_dict)
bs_top_enrich_df = cu.build_enrich_ratio_df(bs_all_top_motif_match_cat_df, bs_motif_dict)
top_org_dict = {
'M. buryatense': mb_top_enrich_df,
'E. coli': ec_top_enrich_df,
'B. subtilis':bs_top_enrich_df
}
cu.compare_orgs_chart(top_org_dict)
# Top enrich dfs with subset for Figure S4
ec_top_enrich_df_sub = cu.build_enrich_ratio_df(ec_all_top_motif_match_cat_df[ec_all_top_motif_match_cat_df['nperc'].isin(n_subset)], ec_motif_dict)
bs_top_enrich_df_sub = cu.build_enrich_ratio_df(bs_all_top_motif_match_cat_df[bs_all_top_motif_match_cat_df['nperc'].isin(n_subset)], bs_motif_dict)
top_org_dict = {
'M. buryatense': mb_top_enrich_df,
'E. coli': ec_top_enrich_df_sub,
'B. subtilis':bs_top_enrich_df_sub
}
cu.compare_orgs_chart(top_org_dict)
Visualization to compare the quality of predicted motifs for different gene sets across different organisms. Each point is a predicted motif from M. buryatense, E. coli, or B. subtilis. The numerical label of each point is the n% gene set used to derive that motif. The x-axis represents the information content of each motif averaged across each of the 12 positions in the motif. The y-axis represents the log2 ratio of the frequency the motif was found <100bp from a gene start to the frequency the motif was found in intergenic regions (ratio of red bar to orange bar for each predicted motif in panels A and B). B. subilitis predictions had the highest overall information content and enrichment in promoter regions, with E. coli motif predictions resulting in lower information content and lower enrichment in promoter regions, consistent with findings from Latif et al suggesting E. coli promoters have higher variability relative to the consensus. The motif chosen for experimental follow up in M. buryatense, the top 3% motif highlighted in inset, appears in between the results for B. subtilis and E. coli in each dimension. When assessing the framework’s motif predictions for a new organism, users can use the provided visualization tutorial (below) to incorporate their organism results into this plot and use it as a guide to see if predictions are closer in quality to E. coli (e.g. potentially more variable and less robust) or closer to M. buryatense and B. subtilis (e.g. more robustly predicted with this framework).
If you have run this pipeline on a new organism for various thresholds of the top n% of genes, you can visualize your motif results relative to the M. buryatense, E. coli, and B. subtilis reults we've published. We've demonstrated using T. martima expression data from Latif et al 2012, but a user can fill in their own data using the commented template here.
# NEWORG_f_dict = {
# 3:'path/to/top3_SELECTION.fa',
# 4:'path/to/top4_SELECTION.fa',
# ...
# }
# # Genbank file
# NEWORG_gb = "path/to/genbank_file.gb"
# NEWORG_motif_dict, \
# NEWORG_all_hex_df, \
# NEWORG_all_motif_match_df, \
# NEWORG_all_motif_match_cat_df, \
# NEWORG_all_top_motif_match_cat_df = cu.compare_consensus_motifs(NEWORG_f_dict,NEWORG_gb)
tm_f_dict = {
3:'../tmari_out/loci_in_top_3perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616650987_SELECTION.fa',
4:'../tmari_out/loci_in_top_4perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616650860_SELECTION.fa',
5:'../tmari_out/loci_in_top_5perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616650836_SELECTION.fa',
6:'../tmari_out/loci_in_top_6perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616650811_SELECTION.fa',
7:'../tmari_out/loci_in_top_7perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616650784_SELECTION.fa',
8:'../tmari_out/loci_in_top_8perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616650751_SELECTION.fa',
9:'../tmari_out/loci_in_top_9perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616650715_SELECTION.fa',
10:'../tmari_out/loci_in_top_10perc_upstream_regions_w300_minus15_min20_trunc_W6_w6_G18_g15_d1_a1_n200_1616650674_SELECTION.fa'
}
# Genbank file
tm_gb = "../data/tmaritima3.gb"
tm_motif_dict, \
tm_all_hex_df, \
tm_all_motif_match_df, \
tm_all_motif_match_cat_df, \
tm_all_top_motif_match_cat_df = cu.compare_consensus_motifs(tm_f_dict,tm_gb,threshold=12)
# NEWORG_enrich_df = cu.build_enrich_ratio_df(NEWORG_all_motif_match_cat_df, NEWORG_motif_dict)
# extra_org_dict = {
# 'M. buryatense': mb_enrich_df,
# 'E. coli': ec_enrich_df,
# 'B. subtilis':bs_enrich_df,
# 'NEW ORG NAME':NEWORG_enrich_df
# }
# cu.compare_orgs_chart(extra_org_dict)
tm_enrich_df = cu.build_enrich_ratio_df(tm_all_motif_match_cat_df, tm_motif_dict)
extra_org_dict = {
'M. buryatense': mb_enrich_df,
'E. coli': ec_enrich_df,
'B. subtilis':bs_enrich_df,
'T. maritima':tm_enrich_df
}
cu.compare_orgs_chart(extra_org_dict)
# NEWORG_top_enrich_df = cu.build_enrich_ratio_df(NEWORG_all_top_motif_match_cat_df, NEWORG_motif_dict)
# extra_top_org_dict = {
# 'M. buryatense': mb_top_enrich_df,
# 'E. coli': ec_top_enrich_df,
# 'B. subtilis':bs_top_enrich_df,
# 'NEW ORG NAME':NEWORG_top_enrich_df
# }
# cu.compare_orgs_chart(extra_top_org_dict)
tm_top_enrich_df = cu.build_enrich_ratio_df(tm_all_top_motif_match_cat_df, tm_motif_dict)
extra_top_org_dict = {
'M. buryatense': mb_top_enrich_df,
'E. coli': ec_top_enrich_df,
'B. subtilis':bs_top_enrich_df,
'T. maritima':tm_top_enrich_df
}
cu.compare_orgs_chart(extra_top_org_dict)