import pandas as pd
import os
# Directory containing your .csv files
= '../results/compartments/'
csv_dir
# Create a dictionary to store the DataFrames
= {}
dataframes
# Iterate over all .csv files in the directory
for filename in os.listdir(csv_dir):
if filename.endswith('.csv'): # Check for .csv files
# Construct the full file path
= os.path.join(csv_dir, filename)
filepath
# Load the CSV into a DataFrame
# Use the filename (without extension) as the dictionary key
= filename.replace('_a_comp_coords_', '_')
key = os.path.splitext(key)[0]
key = pd.read_csv(filepath)
dataframes[key] 'length'] = dataframes[key]['end'] - dataframes[key]['start']
dataframes[key][
# The `dataframes` dictionary now contains the DataFrames
dataframes.keys()
= pd.read_csv('../data/ech90_human_Mmul_10.csv')
ech90 'length'] = ech90['end'] - ech90['start'] ech90[
E1 vs. ECH
Overlaying the A-Compartments with Extended Common Haplotypes
The genomic regions in question
In 03_compartments.ipynb
we extracted the genomic intervals of A compartments on all cell types in all combinations of the following parameters:
- Cell type: fibroblast, spermatocyte, pachytene spermatocyte, round spermatid, sperm
- Chromosome: X
- E1 restriction: full-chromosome, chromosome arms, 10Mb windows
- Resolution: 100 kb, 500 kb
The following parameter was only changed for 100kb resolution:
- Smoothing: No smoothing, 5 bins (500kb)
Resulting in 45 .csv files. They are saved to ../results/compartments/
.
Load the data
Time to unleash genominterv
on the .csv files
Define a plotting function
# Kaspers plotting function
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%config InlineBackend.figure_format = 'svg'
def plot_intervals(query=None, annot=None, **kwargs):
= []
tups if query is not None:
'query', query))
tups.append((if annot is not None:
'annot', annot))
tups.append((
tups.extend(kwargs.items())= reversed(tups)
tups
= []
df_list = []
labels for label, df in tups:
labels.append(label)'label'] = label
df[
df_list.append(df)= pd.concat(df_list)
bigdf
'chrom'] = pd.Categorical(bigdf['chrom'], bigdf['chrom'].unique())
bigdf['label'] = pd.Categorical(bigdf['label'], bigdf['label'].unique())
bigdf[
= bigdf.groupby('chrom', observed=False)
gr = plt.subplots(gr.ngroups, 1, figsize=(8, 1.5*gr.ngroups),
fig, axes =True
sharey# sharex=True
)if type(axes) is not np.ndarray:
# in case there is only one axis so it not returned as a list
= np.array([axes])
axes
# with plt.style.context(('default')):
for i, chrom in enumerate(gr.groups):
= gr.get_group(chrom)
_df = _df.groupby('label', observed=False)
_gr for y, label in enumerate(_gr.groups):
try:
= _gr.get_group(label)
df except KeyError:
continue
= np.repeat(y, df.index.size)
y =0.5, lw=5, colors=f'C{y[0]}')
axes[i].hlines(y, df.start.tolist(), df.end.tolist(), alpha= len(labels)/10
delta -delta, y+delta, alpha=0.5, lw=2, colors=f'C{y[0]}')
axes[i].vlines(df.start.tolist(), y-delta, y+delta, alpha=0.5, lw=2, colors=f'C{y[0]}')
axes[i].vlines(df.end.tolist(), y
'top'].set_visible(False)
axes[i].spines['left'].set_visible(False)
axes[i].spines['right'].set_visible(False)
axes[i].spines[
list(range(len(labels))), labels)
axes[i].set_yticks(='y', which='both', left=False)
axes[i].tick_params(axis-1, len(labels)-0.7)
axes[i].set_ylim(# axes[i].set_xlim(df.start.min()-delta, df.end.max()+delta)
if i != gr.ngroups-1:
='x', which='both', bottom=False)
axes[i].tick_params(axis
='left', fontsize=10)
axes[i].set_title(chrom, loc plt.tight_layout()
# My plotting function
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Rectangle
from mpl_toolkits.axes_grid1 import make_axes_locatable
from genominterv import interval_intersect
%config InlineBackend.figure_format = 'svg'
def plot_regions(query=None, annot=None, intersect=None):
= annot['chrom'].unique()[0]
chrom = pd.read_csv('../data/rheMac10.filtered.chrom.sizes', sep='\t', header=None, names=['chrom', 'size'])
chromsize = chromsize[chromsize['chrom'] == chrom]['size'].values[0]
chromsize
# Define the plot
= 1 + (1 if query is not None else 0) + (1 if intersect is not None else 0)
height = height * 0.75
height
= plt.subplots(figsize=(10, height), sharex=True)
f, ax False)
ax.spines[:].set_visible(
# Plot the annot
# Iterate over each interval in the DataFrame
for start, end in zip(annot['start'], annot['end']):
= Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:red', linewidth=0, alpha=0.6)
rect
ax.add_patch(rect)'bottom'].set_visible(True)
ax.spines[= annot['label'].unique()[0] if 'label' in annot.columns else 'A-Comp'
lbl =0, fontsize=10, labelpad=30)
ax.set_ylabel(lbl, rotation
= make_axes_locatable(ax)
divider
if query is not None:
= divider.append_axes("top", size="100%", pad=0.2, sharex=ax)
qax False)
qax.xaxis.set_visible(# Plot the query
for start, end in zip(query['start'], query['end']):
= Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:blue', linewidth=0, alpha=0.6)
rect
qax.add_patch(rect)False)
qax.spines[:].set_visible(
qax.set_yticks([]) ='left', fontsize=10)
qax.set_title(chrom, loc'ECH90', rotation=0, fontsize=10, labelpad=30)
qax.set_ylabel(
if intersect is not None:
= divider.append_axes("bottom", size="100%", pad=0.2, sharex=ax)
iax # Invisible x-axis for 'annot' (intersect ie below)
False)
ax.xaxis.set_visible(# Plot the intersect
for start, end in zip(intersect['start'], intersect['end']):
= Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:green', linewidth=0, alpha=0.6)
rect
iax.add_patch(rect)False)
iax.spines[:].set_visible(
iax.set_yticks([]) 'bottom'].set_visible(False)
ax.spines['bottom'].set_visible(True)
iax.spines['Intersect', rotation=0, fontsize=10, labelpad=30)
iax.set_ylabel(
ax.set_yticks([])0, chromsize)
ax.set_xlim(= np.linspace(0, chromsize, num=5)
ticks
ax.set_xticks(ticks) f'{int(t/1e6)} Mbp' for t in ticks])
ax.set_xticklabels([
plt.tight_layout()return f, ax
Test with a subsample of the data
= dataframes['round_spermatid_100kb_arms']
annot = ech90
query = interval_intersect(annot, query)
intersect
=intersect)
plot_intervals(query, annot, intersection plot_regions(query, annot, intersect)
from genominterv import proximity_test, interval_collapse, interval_diff, interval_intersect, jaccard_stat
= dataframes['round_spermatid_100kb_arms']
annot = ech90
query
#plot_intervals(query=query, annot=annot)
for key,annot in dataframes.items():
# Filter out subset
if ('round_spermatid_100') in key and not 'full' in key:
# Plot the intervals
= interval_intersect(query, annot)
intersection =query, annot=annot, intersection=intersection)
plot_intervals(query
plt.title(key)
# Do a proximity test
print(f"Tests for {key}")
= interval_collapse(annot)
annot_collapsed = interval_diff(query, annot_collapsed)
non_ovl_query print("Proximity:", proximity_test(non_ovl_query, annot_collapsed))
print("Jaccard:", jaccard_stat(query, annot))
print()
Tests for round_spermatid_100kb_arms_smoothed
Proximity: TestResult(statistic=0.20566666666666641, pvalue=0.105)
Jaccard: 0.03319511172796144
Tests for round_spermatid_100kb_arms
Proximity: TestResult(statistic=0.49242857142857144, pvalue=0.0004)
Jaccard: 0.03916232332293147
Tests for round_spermatid_100kb_10Mb
Proximity: TestResult(statistic=0.3223076923076922, pvalue=0.0209)
Jaccard: 0.04512778341139746
Tests for round_spermatid_100kb_10Mb_smoothed
Proximity: TestResult(statistic=0.4658333333333337, pvalue=0.0017)
Jaccard: 0.04494391747197651
Bootstrap to get a p-value
from genominterv import bootstrap
= dataframes['round_spermatid_100kb_arms']
annot = (
chromsizes '../data/rheMac10.filtered.chrom.sizes',
pd.read_csv(='\t',
sep='chrom',
index_col=None,
header=['chrom','size'])
names'size']
.to_dict()[
)#display(chromsizes)
@bootstrap(chromsizes, samples=1000)
def jaccard_bootstrap(query, annot):
return jaccard_stat(query, annot)
= jaccard_bootstrap(query, annot) jacccard_stat, p_value
jacccard_stat, p_value
(0.03916232332293147, 0.31)
Partition the A-compartments into regions around the edges
= dataframes['round_spermatid_100kb_arms']
df
= pd.DataFrame({
start_edge 'chrom': df['chrom'],
'start': df['start']-1*df['resolution'],
'end': df['start']+1*df['resolution'],
'resolution': df['resolution'],
'label': 'start_edge'
})= pd.DataFrame({
end_edge 'chrom': df['chrom'],
'start': df['end']-1*df['resolution'],
'end': df['end']+1*df['resolution'],
'resolution': df['resolution'],
'label': 'end_edge'
})
#df
= pd.concat([start_edge, end_edge]).sort_values(['chrom', 'start', 'end'])
test_df interval_collapse(test_df)
start | end | chrom | |
---|---|---|---|
0 | 800000 | 1000000 | chrX |
1 | 2500000 | 2700000 | chrX |
2 | 3100000 | 3300000 | chrX |
3 | 3400000 | 3600000 | chrX |
4 | 6500000 | 6800000 | chrX |
... | ... | ... | ... |
86 | 147000000 | 147200000 | chrX |
87 | 148800000 | 149400000 | chrX |
88 | 150800000 | 151000000 | chrX |
89 | 152200000 | 152400000 | chrX |
90 | 153200000 | 153400000 | chrX |
91 rows × 3 columns
import os
for key, df in dataframes.items():
= '../results/edges'
outdir = os.path.join(outdir,f'{key+'_edges.csv'}')
edge_csv_name if not os.path.exists(edge_csv_name):
= df['resolution'].unique()[0]
res
= pd.DataFrame({
start_edge 'chrom': df['chrom'],
'start': df['start']-1*df['resolution'],
'end': df['start']+1*df['resolution'],
'resolution': df['resolution'],
'label': 'start_edge'
})= pd.DataFrame({
end_edge 'chrom': df['chrom'],
'start': df['end']-1*df['resolution'],
'end': df['end']+1*df['resolution'],
'resolution': df['resolution'],
'label': 'end_edge'
})
if not os.path.exists(outdir):
os.makedirs(outdir)
= pd.concat([start_edge, end_edge]).sort_values(['chrom', 'start', 'end'])
tmp =res).to_csv(edge_csv_name, index=False) interval_collapse(tmp).assign(resolution
Import edges
import pandas as pd
import os
# Directory containing your .csv files
= '../results/edges/'
csv_dir
# Create a dictionary to store the DataFrames
= {}
edges
# Iterate over all .csv files in the directory
for filename in os.listdir(csv_dir):
if filename.endswith('.csv'): # Check for .csv files
# Construct the full file path
= os.path.join(csv_dir, filename)
filepath
# Load the CSV into a DataFrame
# Use the filename (without extension) as the dictionary key
= filename.replace('_edges_', '')
key = os.path.splitext(key)[0]
key = pd.read_csv(filepath)
edges[key] 'length'] = edges[key]['end'] - edges[key]['start']
edges[key][
# The `edges` dictionary now contains the DataFrames
print(edges.keys())
print(edges['fibroblast_100kb_10Mb_edges'].columns)
#ech90 = pd.read_csv('../data/ech90_human_Mmul_10.csv')
dict_keys(['sperm_100kb_arms_smoothed_edges', 'spermatogonia_500kb_full_edges', 'pachytene_spermatocyte_100kb_10Mb_smoothed_edges', 'spermatogonia_100kb_arms_edges', 'fibroblast_500kb_full_edges', 'round_spermatid_500kb_10Mb_edges', 'fibroblast_100kb_arms_edges', 'spermatogonia_100kb_full_smoothed_edges', 'round_spermatid_100kb_full_smoothed_edges', 'round_spermatid_100kb_full_edges', 'fibroblast_100kb_10Mb_edges', 'round_spermatid_500kb_arms_edges', 'fibroblast_100kb_full_smoothed_edges', 'spermatogonia_100kb_10Mb_edges', 'sperm_500kb_arms_edges', 'spermatogonia_100kb_10Mb_smoothed_edges', 'sperm_100kb_full_edges', 'pachytene_spermatocyte_500kb_10Mb_edges', 'pachytene_spermatocyte_100kb_full_smoothed_edges', 'pachytene_spermatocyte_100kb_full_edges', 'pachytene_spermatocyte_500kb_arms_edges', 'fibroblast_100kb_10Mb_smoothed_edges', 'sperm_500kb_10Mb_edges', 'round_spermatid_100kb_10Mb_smoothed_edges', 'spermatogonia_500kb_arms_edges', 'spermatogonia_100kb_full_edges', 'spermatogonia_100kb_arms_smoothed_edges', 'fibroblast_500kb_arms_edges', 'round_spermatid_100kb_10Mb_edges', 'fibroblast_100kb_full_edges', 'sperm_100kb_full_smoothed_edges', 'fibroblast_100kb_arms_smoothed_edges', 'round_spermatid_100kb_arms_edges', 'fibroblast_500kb_10Mb_edges', 'round_spermatid_500kb_full_edges', 'round_spermatid_100kb_arms_smoothed_edges', 'spermatogonia_500kb_10Mb_edges', 'sperm_500kb_full_edges', 'sperm_100kb_10Mb_smoothed_edges', 'sperm_100kb_arms_edges', 'pachytene_spermatocyte_100kb_arms_smoothed_edges', 'pachytene_spermatocyte_100kb_10Mb_edges', 'pachytene_spermatocyte_100kb_arms_edges', 'pachytene_spermatocyte_500kb_full_edges', 'sperm_100kb_10Mb_edges'])
Index(['chrom', 'start', 'end', 'resolution', 'label', 'length'], dtype='object')
from genominterv import interval_intersect
= 'round_spermatid_100kb_arms'
sample
= dataframes[sample]
full_df = interval_intersect(full_df, ech90).assign(length=lambda x: x['end'] - x['start'])
full_intersect = edges[f'{sample}_edges']
edge_df
# Plot full
plot_regions(ech90, full_df, full_intersect)'All edges') plt.suptitle(
Text(0.5, 0.98, 'All edges')
### Some stats about the data and intersections ###
# Determine the proportion of total regions in ECH90 that lies on compartment edges
print("Proportion of ECH90 on edges (#count)")
# Proportion of ECH90 on full regions
print(f'\t{full_intersect.shape[0] / ech90.shape[0]}')
print("\nProportion of ECH90 on edges (#bpairs)")
# Proportion of ECH90 on full regions
print(f'\t{full_intersect['length'].sum() / ech90['length'].sum()}')
# What is the total length of ech90 regions
print("\nTotal length of:")
print(f'\tECH90: {(ech90['end'] - ech90['start']).sum()} bp')
print(f'\tEdges: {(edge_df["end"] - edge_df["start"]).sum()} bp')
Proportion of ECH90 on edges (#count)
0.631578947368421
Proportion of ECH90 on edges (#bpairs)
0.3932398045966063
Total length of:
ECH90: 5511675 bp
Edges: 28800000 bp
= plot_regions(edge_df, ech90, full_intersect)
f, ax
for ax in list(f.axes):
if ax.get_ylabel() == 'ECH90':
'edge_df', rotation=0, fontsize=10, labelpad=30)
ax.set_ylabel(if ax.get_ylabel() == "query":
'ECH90', rotation=0, fontsize=10, labelpad=30)
ax.set_ylabel(if ax.get_ylabel() == 'Intersect':
#ax.set_ylabel('end_edge \nintersect', rotation=0, fontsize=10, labelpad=30)
pass
Do testing of the edges
%%capture
# Define what we are testing
print("""
Goal: To test whether ECH90 regions are enriched in compartment edges
Query: ECH90
Annotation: Start and end edges of compartments
Hypothesis:
ECH90 regions are enriched in compartment edges
Null hypothesis:
ECH90 regions are not enriched in compartment edges
Tests:
* Proximity test:
tests whether the query regions are closer to
the annotation regions than expected by chance.
NB regions can not overlap, so we need to collapse the annotation regions
* Jaccard index:
tests the similarity between the query and annotation regions,
where a value of 1 indicates perfect overlap
""")
### Proximity test ###
from genominterv import proximity_test, interval_collapse, interval_diff, interval_intersect
# Define the query and annotation
= ech90
query = full_intersect
annot
# Calculate the non-overlapping query regions
= interval_diff(query, annot)
non_ovl_query_full
# Perform the proximity test
= proximity_test(non_ovl_query_full, annot, two_sided=False)
proximity_full
print("Proximity test results: All edges")
print(f"\tstatistic: {proximity_full.statistic}, \n\tp-value: {proximity_full.pvalue}")
Proximity test results: All edges
statistic: 0.6821666666666665,
p-value: 0.0
### Jaccard index ###
from genominterv import jaccard_stat, bootstrap
= (pd.read_csv(
chromsizes '../data/rheMac10.filtered.chrom.sizes',
='\t',
sep='chrom',
index_col=None,
header=['chrom','size'])
names'size']
.to_dict()[
)
# # Calculate the Jaccard index
# jaccard_start = jaccard_stat(query, annot_start)
# jaccard_end = jaccard_stat(query, annot_end)
# jaccard_concat = jaccard_stat(query, annot_concat)
# print("\nJaccard index results")
# print(f"Start edge: {jaccard_start}")
# print(f"End edge: {jaccard_end}")
# print(f"Concat edge: {jaccard_concat}")
# Test with bootstrap decorator
@bootstrap(chromsizes, samples=1000)
def jaccard_bootstrap(query, annot):
return jaccard_stat(query, annot)
= jaccard_bootstrap(query,annot)
jaccard_stat_full, p_value_full
print(f"Jaccard index: {jaccard_stat_full}, p-value: {p_value_full}")
Jaccard index: 0.3932398045966063, p-value: 0.0
p-value is zero smaller than 0.001. Increase nr samples to get actual p-value.
Check the length of the intervals
Bioframe genomic intervals support
import bioframe
Geneinfo
How does the edges align with genes?
This first plot is just to figure out how to plot with gene_plot
.
import geneinfo as gi
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
# Use the proximity test results to plot the ECH90 regions and the compartment edges
= full_intersect['start'][2]
start = full_intersect['end'][5]
end
= [Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:green', linewidth=0, alpha=0.6) for start, end in zip(full_intersect['start'][2:6], full_intersect['end'][2:6])]
rectangles
= PatchCollection(rectangles, match_original=True)
pc
= gi.gene_plot('chrX', start-100_000, end+100_000, assembly='rheMac10')
ax ax.add_collection(pc)
Get the geneinfo for all intersections between edges and ECH90
And write to a csv file. If the file exists, read it with pandas.
# Use get_genes_region
import os.path as op
import geneinfo as gi
import pandas as pd
= '../results/edge_genes/rs_edges_100kb_genes.csv'
genes_file
if not op.exists(genes_file):
= pd.concat(
genes apply(
full_intersect.lambda x: gi.get_genes_region_dataframe('chrX', x['start'], x['end'], assembly='rheMac10'),
=1
axis
).to_list(),=True
ignore_index
)=False)
genes.to_csv(genes_file, indexelse:
= pd.read_csv(genes_file) genes
= genes['name'].unique().tolist()
genes_list genes_list
['SH3KBP1',
'MIR7206',
'LANCL3',
'XK',
'CYBB',
'LOC696657',
'DYNLT3',
'PAGE4',
'USP27X',
'CLCN5',
'LOC114675180',
'MIR532',
'MIR188',
'MIR500A',
'MIR362',
'MIR501',
'MIR500B',
'MIR660',
'MIR502',
'AKAP4',
'CCNB3',
'LOC114675218',
'LOC695959',
'CENPVL3',
'FAM120C',
'WNK3',
'LOC114675302',
'ZC3H12B',
'LAS1L',
'MSN',
'ATRX',
'MAGT1',
'LOC114675151',
'COX7B',
'ATP7A',
'ALG13',
'RAP2C',
'DKC1',
'LOC114675231',
'MPP1',
'SMIM9',
'F8']
import geneinfo as gi
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
# Use the proximity test results to plot the ECH90 regions and the compartment edges
= 10
start_idx = 11
end_idx
= full_intersect['start'][start_idx]
start = full_intersect['end'][end_idx]
end
= [Rectangle(
rectangles 0.1), width=end-start, height=0.9, color='tab:green', linewidth=0, alpha=0.6) for start, end in zip(full_intersect['start'][start_idx:end_idx+1], full_intersect['end'][start_idx:end_idx+1])]
(start,
= PatchCollection(rectangles, match_original=True)
pc
= gi.gene_plot('chrX', start-100_000, end+100_000, assembly='rheMac10',
ax =genes_list,
highlight=True,
despine=(12, 5),
figsize=5,
aspect
) ax.add_collection(pc)
What can I do with the list of genes on the edges?
GO enrichment?
= gi.get_genes_region_dataframe('chrX', 0, 155_000_000, assembly='rheMac10') mmul_x_genes
= mmul_x_genes['name'].unique().tolist() mmul_x_genelist
= genes['name'].unique().tolist()
gene_list = 9544
taxid 'sojernj@gmail.com')
gi.email(#gi.go_annotation_table(taxid=taxid)
#gi.show_go_evidence_codes()
= gi.get_go_terms_for_genes(gene_list, taxid=taxid) go_terms
len(go_terms)
#gene_list[:5]
98
= gi.go_enrichment(
results 5],
gene_list[:# Use human as a start
=0.05,
alpha=go_terms
terms )
geneinfo_cache/go-basic.obo: fmt(1.2) rel(2024-10-27) 44,017 Terms; optional_attrs(def relationship)
Could not map gene symbol "MIR7206" to ncbi id