Operon analysis¶

We don't yet have an exhaustive annotation of all operons in M. buryatense but there are a small handful of known operons (curated by Mary Lidstrom, saved in /data/operon_ncbi_ids,txt). We examined the spacing between genes within this known set and used this to help us decide how to set the min_dist parameter when estimating operons in our main framework.

Basic feature file loading/parsing¶

from Bio import SeqIO

import altair as alt
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import sys
sys.path.append('../') # use modules in main directory
import genbank_utils as gu

# genbank files
gbFile_5G = '../data/5GB1c_sequence.gb'
gb_5G = SeqIO.parse(gbFile_5G, "genbank").__next__()

print("___ 5G ____")
print("Genome length:", len(gb_5G.seq), "bps")
print("num features:", len(gb_5G.features))
print("num CDS features:", len([x for x in gb_5G.features if x.type=='CDS']))
print("num gene features:", len([x for x in gb_5G.features if x.type=='gene']))
print("num other features:", len([x for x in gb_5G.features if x.type not in ['gene', 'CDS']]))

___ 5G ____
Genome length: 4998879 bps
num features: 8871
num CDS features: 4373
num gene features: 4431
num other features: 67

# get feature coords from genbanks file
feats_5G = gu.get_feature_tuples_from_genbank(gbFile_5G)
feats_5G_filt = [x for x in feats_5G if x[5] not in ['gene']]

feats_5G_filt[:10]

[(0, 1317, 1, 'EQU24_RS00005', 'dnaA', 'CDS'),
 (1502, 2603, 1, 'EQU24_RS00010', '', 'CDS'),
 (3060, 4140, 1, 'EQU24_RS00015', 'recF', 'CDS'),
 (4185, 6600, 1, 'EQU24_RS00020', 'gyrB', 'CDS'),
 (6825, 7062, 1, 'EQU24_RS00025', '', 'CDS'),
 (7098, 7257, 1, 'EQU24_RS00030', '', 'CDS'),
 (7350, 7734, 1, 'EQU24_RS00035', '', 'CDS'),
 (7818, 9075, 1, 'EQU24_RS00040', '', 'CDS'),
 (9071, 10241, 1, 'EQU24_RS00045', '', 'CDS'),
 (10240, 13306, 1, 'EQU24_RS00050', '', 'CDS')]

# feat list indices
LEFT_IDX = 0
RIGHT_IDX = 1
STRAND_IDX = 2
LOCUS_IDX = 3
GENE_IDX = 4
TYPE_IDX = 5

Curated operon analysis¶

# load file of curated operons
op_df = pd.read_csv("../data/operon_ncbi_ids.txt", sep='\t')
op_df

Mary curatd the above 30 operons as respresentative examples of operons in M. buryatense. We analyzed these examples to better understand the typical distance between genes known to be in an operon.

# get list of ids of genes in operons 
all_op_genes = set(",".join(op_df['all_genes'].values).split(','))
# as well as a list of first-genes in operons
op_first_genes = set(op_df['ncbi_locus'].values)

print("Total genes in example operons:",len(all_op_genes))

Total genes in example operons: 175

Build a dict of all genes in Mary's list to its upstream distance to its nearest neighbor¶

upd = {} # key: gene, value: upstream_distance

# loop through all features
for i,(g_left,g_right,strand,locus,gene,typee) in enumerate(feats_5G_filt):
    # if this is a gene in one of Mary's operons, get it's upstream dist
    if locus in all_op_genes:
        # if we're on the negative strand, look to the right
        if strand == -1:
            # make sure we're not the very last gene
            if i < len(feats_5G_filt) -1:
                # get the FOLLOWING feature (because on -1 strand)
                upstream_gene = feats_5G_filt[i+1]
                upstream_dist = upstream_gene[LEFT_IDX] - g_right

        # otherwise, we're on the positive strand so look left
        else:
            # make sure we're not the very first gene
            if i != 0:
                # get the PREVIOUS feature (because on +1 strand)
                upstream_gene = feats_5G_filt[i-1]
                upstream_dist = g_left - upstream_gene[RIGHT_IDX]
                
        upd[locus] = upstream_dist
        
# print a few examples from the usptream distance dict
[(x,upd[x]) for x in list(upd.keys())[:10]]

[('EQU24_RS01455', 187),
 ('EQU24_RS01460', -8),
 ('EQU24_RS01465', 19),
 ('EQU24_RS01470', -4),
 ('EQU24_RS01475', 314),
 ('EQU24_RS02225', 115),
 ('EQU24_RS02230', 24),
 ('EQU24_RS02235', 42),
 ('EQU24_RS02240', 42),
 ('EQU24_RS02245', 5)]

For every example operon, visualize the upstream distance for all genes inside, as well as the upstream distance from the first gene in the operon to its nearest neighor NOT in the operon¶

# Make a full grid of operon upstream distances
all_dfs = []

for i,row in op_df.iterrows():
    start_gene = row['ncbi_locus']
    other_genes = [x for x in row['all_genes'].split(',') if x !=start_gene]
    start_up_dist = upd[start_gene]
    other_dists = [upd[x] for x in other_genes]

    
    df = pd.DataFrame()
    df['gene'] = row['all_genes'].split(',')
    df['upstream_dist'] = df['gene'].apply(lambda x: upd[x])
    df['start?'] = df['gene'].apply(lambda x: True if x == start_gene else False)
    df['operon'] = row['shortd']
    
    all_dfs.append(df)

all_ops_df = pd.concat(all_dfs)
all_ops_df.head(10)

# We can ask for ALL THE AXES and put them into axes
fig, axes = plt.subplots(nrows=6, ncols=6, sharex=False, sharey='row', figsize=(15,15))
axes_list = [item for sublist in axes for item in sublist] 

for operon, selection in all_ops_df.groupby("operon"):
    ax = axes_list.pop(0)
    sns.barplot(data=selection,x='gene',y='upstream_dist',hue='start?',dodge=False, ax=ax)
    ax.set_title(operon,fontsize=14)
    ax.tick_params(
        right=False,
        top=False
    )
    ax.get_legend().remove()
    ax.grid(linewidth=0.25)
    ax.set_ylim((-50, 1700))
    ax.set_xlabel("")
    ax.set_xticklabels("")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Now use the matplotlib .remove() method to 
# delete anything we didn't use
for ax in axes_list:
    ax.remove()


plt.tight_layout()

Each operon has it's leading gene in orange and all internal genes in blue. The height of the bar is the distance between each gene and it's nearest upstream neighbor on the same strand. Overall, leading genes have the biggest upstream distance, though some internal genes (such as those for the "hydrogenase" operon) have large distances (over 250)

Calculate the average distance between genes inside operons¶

all_non_start_dists = []
mean_dist_per_operon = []

for i,row in op_df.iterrows():
    start_gene = row['ncbi_locus']
    other_genes = [x for x in row['all_genes'].split(',') if x !=start_gene]
    start_up_dist = upd[start_gene]
    other_dists = [upd[x] for x in other_genes]
    
    # collect averages
    op_ave = np.mean(other_dists)
    all_non_start_dists += other_dists
    mean_dist_per_operon.append(op_ave)
    
    # build dataframe for seaborn
    df = pd.DataFrame()
    df['gene'] = row['all_genes'].split(',')
    df['upstream_dist'] = df['gene'].apply(lambda x: upd[x])
    df['start?'] = df['gene'].apply(lambda x: True if x == start_gene else False)
    

# average over all non-start genes
print("Mean:",np.mean(all_non_start_dists))
print("Median:",np.median(all_non_start_dists))

Mean: 44.675862068965515
Median: 16.0

Visualize the distibution of upstream distances for all inside-operon genes¶

plt.figure(figsize=(10,5))
ax = sns.swarmplot(all_non_start_dists,s=6)
ax.set_xticks(np.arange(-50,400,50))
plt.xlabel("Upstream distance")
plt.title("Every non-starting operon gene's upstream distance",fontsize=20)
plt.show()

# interavtive version with some jitter to simulate swarm plot 
selection = alt.selection_multi(fields=['operon'], bind='legend')

stripplot = alt.Chart(
    all_ops_df[all_ops_df['start?']==False],
).mark_point().encode(
    y=alt.Y(
        'jitter:Q',
        title=None,
        axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        scale=alt.Scale()
    ),
    x=alt.X(
        'upstream_dist:Q',
        axis=alt.Axis(title="Upstream Distance")),
    color=alt.Color('operon:N',scale=alt.Scale(scheme="sinebow")),
    size=alt.condition(selection, alt.value(100), alt.value(10)),
    tooltip=["operon:N","gene:N", "upstream_dist:Q"],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
).transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
).properties(
    height=300, 
    width=500,
    title="Every non-starting operon gene's upstream distance"
).add_selection(
        selection,
).interactive()

stripplot

Interactive plot!¶

Click operon in the legend to highlight points in the scatter plot
- < Shift > + click to select multiple points in the legend
Hover over points to see more detail
Use mouse scroll to zoom in and out

Most of the distribution of upstream distance for internal operon genes is shorter than 50 bases but there are a number of genes that do have larger distances. In particular the, distances for genes in the pMMO operon is 93 and 108. Ultimately we chose 120 as our default value for the miniumum distance genes must be within to be considered "possibly in an operon." 120 includes most genes from this hand curated set but misses a few.

	shortd	ncbi_locus	all_genes
0	central_metab_edd	EQU24_RS06110	EQU24_RS06110,EQU24_RS06105,EQU24_RS06100
1	central_metab2	EQU24_RS06095	EQU24_RS06095,EQU24_RS06090,EQU24_RS06085
2	central_metab3	EQU24_RS06080	EQU24_RS06080,EQU24_RS06070,EQU24_RS06065
3	sMMO	EQU24_RS05885	EQU24_RS05910,EQU24_RS05905,EQU24_RS05900,EQU2...
4	flagellar_sys	EQU24_RS04900	EQU24_RS04945,EQU24_RS04940,EQU24_RS04935,EQU2...
5	serine_cycle	EQU24_RS04570	EQU24_RS04570,EQU24_RS04565,EQU24_RS04560,EQU2...
6	PQQ_synth	EQU24_RS02970	EQU24_RS02970,EQU24_RS02965,EQU24_RS02960,EQU2...
7	ATP_synthase	EQU24_RS02225	EQU24_RS02265,EQU24_RS02260,EQU24_RS02255,EQU2...
8	molyb_form_dh	EQU24_RS01475	EQU24_RS01475,EQU24_RS01470,EQU24_RS01465,EQU2...
9	RuMP_cycle1	EQU24_RS21565	EQU24_RS21565,EQU24_RS21560,EQU24_RS21555,EQU2...
10	Rnf_elec_transp	EQU24_RS19930	EQU24_RS19965,EQU24_RS19960,EQU24_RS19955,EQU2...
11	urease	EQU24_RS14900	EQU24_RS14900,EQU24_RS14895,EQU24_RS14890,EQU2...
12	cyt_c_synth	EQU24_RS14455	EQU24_RS14455,EQU24_RS14450,EQU24_RS14445,EQU2...
13	cyt_c_oxi	EQU24_RS14130	EQU24_RS14130,EQU24_RS14125
14	pMMO	EQU24_RS19315	EQU24_RS19315,EQU24_RS19310,EQU24_RS19305
15	mxa_methanol_dh	EQU24_RS18140	EQU24_RS18140,EQU24_RS18135,EQU24_RS18130,EQU2...
16	nif_elec_transp	EQU24_RS17265	EQU24_RS17265,EQU24_RS17260,EQU24_RS17255
17	nif_cofactor1	EQU24_RS17230	EQU24_RS17230,EQU24_RS17225,EQU24_RS17220,EQU2...
18	nif_cofactor2	EQU24_RS17170	EQU24_RS17170,EQU24_RS17165,EQU24_RS17160,EQU2...
19	nif_structural	EQU24_RS17065	EQU24_RS17095,EQU24_RS17090,EQU24_RS17085,EQU2...
20	flagellum	EQU24_RS16845	EQU24_RS16845,EQU24_RS16840,EQU24_RS16835,EQU2...
21	hopanoid_synth	EQU24_RS15360	EQU24_RS15380,EQU24_RS15375,EQU24_RS15370,EQU2...
22	b12_synth	EQU24_RS12000	EQU24_RS12060,EQU24_RS12055,EQU24_RS12050,EQU2...
23	heat_shock	EQU24_RS11555	EQU24_RS11555,EQU24_RS11550,EQU24_RS11545,EQU2...
24	w_formate_dh	EQU24_RS11515	EQU24_RS11520,EQU24_RS11515
25	FeS_cofactor	EQU24_RS10230	EQU24_RS10245,EQU24_RS10240,EQU24_RS10235,EQU2...
26	NADH_dh	EQU24_RS09760	EQU24_RS09785,EQU24_RS09780,EQU24_RS09775,EQU2...
27	hydrogenase	EQU24_RS08690	EQU24_RS08690,EQU24_RS08685,EQU24_RS08680,EQU2...
28	transhydrogenase	EQU24_RS08400	EQU24_RS08400,EQU24_RS08395
29	succ_dh	EQU24_RS06960	EQU24_RS06980,EQU24_RS06975,EQU24_RS06970,EQU2...

	gene	upstream_dist	start?	operon
0	EQU24_RS06110	980	True	central_metab_edd
1	EQU24_RS06105	64	False	central_metab_edd
2	EQU24_RS06100	238	False	central_metab_edd
0	EQU24_RS06095	760	True	central_metab2
1	EQU24_RS06090	16	False	central_metab2
2	EQU24_RS06085	100	False	central_metab2
0	EQU24_RS06080	756	True	central_metab3
1	EQU24_RS06070	134	False	central_metab3
2	EQU24_RS06065	-1	False	central_metab3
0	EQU24_RS05910	23	False	sMMO