import matplotlib.pyplot as plt

# Define my params:
## Params will comply with my desired layout for the Manuscript (PDF)
notebook_rcparams = {
    'font.size': 7,
    'axes.titlesize': 8,
    'axes.labelsize': 7,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'figure.titlesize': 9,
    'figure.figsize': [6.0, 2.0],
    'figure.labelsize': 7,
    'legend.fontsize': 6,
}

# Apply config
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
%config InlineBackend.rc = notebook_rcparams
E1 vs. ECH (recommended params)
Overlaying the A-Compartments with Extended Common Haplotypes
The genomic regions in question
In 05_rec_compartments.ipynb we extracted the genomic intervals of A compartments for all cell types, in all combinations of the following parameters:
- Cell type: fibroblast, spermatogonia, pachytene spermatocyte, round spermatid, sperm
- Chromosome: X
- E1 restriction: full-chromosome, chromosome arms, 10Mb windows
- Resolution: 100 kb, 500 kb, ps500kb (smoothed 100kb)
This results in 45 .csv files, saved to ../results/rec_compartments/. A quick sanity check of the count is sketched below.
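A minimal sketch of the parameter grid, confirming the file count (the exact file naming used by the pipeline is assumed here, not taken from it):

from itertools import product

cell_types = ['fibroblast', 'spermatogonia', 'pachytene_spermatocyte', 'round_spermatid', 'sperm']
e1_views = ['full', 'arms', '10Mb']
resolutions = ['100kb', '500kb', 'ps500kb']

combos = list(product(cell_types, e1_views, resolutions))
print(len(combos))  # 5 * 3 * 3 = 45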
Setup inline backend
Configure InlineBackend.rc for consistent plots that fit the PDF manuscript.
print({key:plt.rcParams[key] for key in notebook_rcparams.keys()})
{'font.size': 10.0, 'axes.titlesize': 'large', 'axes.labelsize': 'medium', 'xtick.labelsize': 'medium', 'ytick.labelsize': 'medium', 'figure.titlesize': 'large', 'figure.figsize': [6.4, 4.8], 'figure.labelsize': 'large', 'legend.fontsize': 'medium'}
## Apparently IPython needs this twice to understand??
import matplotlib.pyplot as plt

# Define my params:
## Params will comply with my desired layout for the Manuscript (PDF)
notebook_rcparams = {
    'font.size': 7,
    'axes.titlesize': 8,
    'axes.labelsize': 7,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'figure.titlesize': 9,
    'figure.figsize': [6.0, 2.0],
    'figure.labelsize': 7,
    'legend.fontsize': 6,
}

# Apply config
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
%config InlineBackend.rc = notebook_rcparams
print({key:plt.rcParams[key] for key in notebook_rcparams.keys()})
{'font.size': 7.0, 'axes.titlesize': 8.0, 'axes.labelsize': 7.0, 'xtick.labelsize': 6.0, 'ytick.labelsize': 6.0, 'figure.titlesize': 9.0, 'figure.figsize': [6.0, 2.0], 'figure.labelsize': 7.0, 'legend.fontsize': 6.0}
Import compartments
import pandas as pd
import os

# Directory containing the .csv files
csv_dir = '../results/rec_compartments/'

# Create a dictionary to store the DataFrames
dataframes = {}

# Iterate over all .csv files in the directory
for filename in os.listdir(csv_dir):
    if filename.endswith('.csv') and 'e1' not in filename:  # Check for .csv files
        # Construct the full file path
        filepath = os.path.join(csv_dir, filename)

        # Load the CSV into a DataFrame
        # Use the filename (without extension) as the dictionary key
        key = filename.replace('_a_comp_coords_', '_')
        key = os.path.splitext(key)[0]
        dataframes[key] = pd.read_csv(filepath)
        dataframes[key]['length'] = dataframes[key]['end'] - dataframes[key]['start']

# The `dataframes` dictionary now contains the DataFrames
dataframes.keys()
ech90 = pd.read_csv('../data/ech90_human_Mmul_10.csv')
ech90['length'] = ech90['end'] - ech90['start']

# Load the chromosome sizes
chromsizes = (pd.read_csv(
    '../data/rheMac10.filtered.chrom.sizes',
    sep='\t',
    index_col='chrom',
    header=None,
    names=['chrom', 'size'])
    .to_dict()['size']
)
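The .to_dict()['size'] step turns the two-column table into a plain chromosome-to-length mapping, so later lookups are simple:

chromsizes['chrX']  # 153388924 bp in rheMac10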
Define a plotting function
# Kaspers plotting function
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%config InlineBackend.figure_format = 'svg'

def plot_intervals(query=None, annot=None, **kwargs):
    tups = []
    if query is not None:
        tups.append(('query', query))
    if annot is not None:
        tups.append(('annot', annot))
    tups.extend(kwargs.items())
    tups = reversed(tups)

    df_list = []
    labels = []
    for label, df in tups:
        labels.append(label)
        df['label'] = label
        df_list.append(df)
    bigdf = pd.concat(df_list)

    bigdf['chrom'] = pd.Categorical(bigdf['chrom'], bigdf['chrom'].unique())
    bigdf['label'] = pd.Categorical(bigdf['label'], bigdf['label'].unique())

    gr = bigdf.groupby('chrom', observed=False)
    fig, axes = plt.subplots(gr.ngroups, 1, figsize=(10, 1.5*gr.ngroups),
                             sharey=True
                             # sharex=True
                             )
    if type(axes) is not np.ndarray:
        # in case there is only one axis, it is not returned as an array
        axes = np.array([axes])

    # with plt.style.context(('default')):
    for i, chrom in enumerate(gr.groups):
        _df = gr.get_group(chrom)
        _gr = _df.groupby('label', observed=False)
        for y, label in enumerate(_gr.groups):
            try:
                df = _gr.get_group(label)
            except KeyError:
                continue
            y = np.repeat(y, df.index.size)
            axes[i].hlines(y, df.start.tolist(), df.end.tolist(), alpha=0.5, lw=5, colors=f'C{y[0]}')
            delta = len(labels)/10
            axes[i].vlines(df.start.tolist(), y-delta, y+delta, alpha=0.5, lw=2, colors=f'C{y[0]}')
            axes[i].vlines(df.end.tolist(), y-delta, y+delta, alpha=0.5, lw=2, colors=f'C{y[0]}')

        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)

        axes[i].set_yticks(list(range(len(labels))), labels)
        axes[i].tick_params(axis='y', which='both', left=False)
        axes[i].set_ylim(-1, len(labels)-0.7)
        # axes[i].set_xlim(df.start.min()-delta, df.end.max()+delta)
        if i != gr.ngroups-1:
            axes[i].tick_params(axis='x', which='both', bottom=False)

        axes[i].set_title(chrom, loc='left', fontsize=10)
    plt.tight_layout()
# MY PLOTTING FUNCTION
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Rectangle
from mpl_toolkits.axes_grid1 import make_axes_locatable
from genominterv import interval_intersect

def plot_regions(query=None, annot=None, intersect=None,
                 track_titles=['query', 'annot', 'intersect'],
                 figsize=(10, 1),
                 title='chrom',
                 alpha=0.8, lw=0):
    chrom = annot['chrom'].unique()[0]
    chromsizes = pd.read_csv('../data/rheMac10.filtered.chrom.sizes', sep='\t', header=None, names=['chrom', 'size'])

    base_size = chromsizes[chromsizes['chrom'] == chrom]['size'].values[0]
    annot_size = annot['end'].max()
    query_size = query['end'].max()
    chromsize = max(base_size, annot_size, query_size)

    if title == 'chrom':
        title = chrom

    # Define the plot size
    f, ax = plt.subplots(figsize=figsize, sharex=True)

    ax.spines[:].set_visible(False)

    # Plot the annot: iterate over each interval in the DataFrame
    for start, end in zip(annot['start'], annot['end']):
        rect = Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:red', linewidth=lw, alpha=alpha)
        ax.add_patch(rect)
    ax.spines['bottom'].set_visible(True)
    ax.set_ylabel(track_titles[1], rotation=0, labelpad=10, ha='right', va='center')

    divider = make_axes_locatable(ax)

    if query is not None:
        qax = divider.append_axes("top", size="100%", pad=0.05, sharex=ax)
        qax.xaxis.set_visible(False)
        # Plot the query
        for start, end in zip(query['start'], query['end']):
            rect = Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:blue', linewidth=lw, alpha=alpha)
            qax.add_patch(rect)
        qax.spines[:].set_visible(False)
        qax.set_yticks([])
        qax.set_title(title, loc='left')
        qax.set_ylabel(track_titles[0], rotation=0, labelpad=10, ha='right', va='center')

    if intersect is not None:
        iax = divider.append_axes("bottom", size="100%", pad=0.05, sharex=ax)
        # Invisible x-axis for 'annot' (the intersect track is below)
        ax.xaxis.set_visible(False)
        # Plot the intersect
        for start, end in zip(intersect['start'], intersect['end']):
            rect = Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:green', linewidth=lw, alpha=alpha)
            iax.add_patch(rect)
        iax.spines[:].set_visible(False)
        iax.set_yticks([])
        ax.spines['bottom'].set_visible(False)
        iax.spines['bottom'].set_visible(True)
        iax.set_ylabel(track_titles[2], rotation=0, labelpad=10, ha='right', va='center')

    ax.set_yticks([])
    ax.set_xlim(0, chromsize)
    ticks = np.linspace(0, chromsize, num=5)
    ax.set_xticks(ticks)
    ax.set_xticklabels([f'{int(t/1e6)} Mbp' for t in ticks])

    f.tight_layout()
    plt.show()
    #plt.tight_layout()
Time to unleash genominterv on the .csv files
Test with a subsample of the data
annot = dataframes['round_spermatid_100kb_arms']
query = ech90
intersect = interval_intersect(annot, query)

plot_intervals(query, annot, intersection=intersect)
plot_regions(query, annot, intersect)
from genominterv import proximity_test, interval_collapse, interval_diff, interval_intersect, jaccard_stat

annot = dataframes['round_spermatid_100kb_arms']
query = ech90

#plot_intervals(query=query, annot=annot)
for key, annot in dataframes.items():
    # Filter out subset
    if 'round_spermatid_100' in key and 'full' not in key:
        # Plot the intervals
        intersection = interval_intersect(query, annot)
        plot_intervals(query=query, annot=annot, intersection=intersection)
        plt.title(key)
        # Do a proximity test
        print(f"Tests for {key}")
        annot_collapsed = interval_collapse(annot)
        non_ovl_query = interval_diff(query, annot_collapsed)
        print("Proximity:", proximity_test(non_ovl_query, annot_collapsed))
        print("Jaccard:", jaccard_stat(query, annot))
        print()
Tests for round_spermatid_100kb_arms_smoothed
Proximity: TestResult(statistic=0.3711999999999999, pvalue=0.0204)
Jaccard: 0.03771152626674943
Tests for round_spermatid_100kb_10Mb
Proximity: TestResult(statistic=0.51525, pvalue=0.0)
Jaccard: 0.046023479610810235
Tests for round_spermatid_100kb_10Mb_smoothed
Proximity: TestResult(statistic=0.46520000000000006, pvalue=0.0049)
Jaccard: 0.042623832902054265
Tests for round_spermatid_100kb_arms
Proximity: TestResult(statistic=0.37199999999999966, pvalue=0.0121)
Jaccard: 0.046543092859550536
Bootstrap to get a p-value
from genominterv import bootstrap

annot = dataframes['round_spermatid_100kb_arms']
chromsizes = pd.read_csv('../data/rheMac10.filtered.chrom.sizes', sep='\t', index_col='chrom', header=None, names=['chrom', 'size']).to_dict()['size']
#display(chromsizes)

@bootstrap(chromsizes, samples=1000)
def jaccard_bootstrap(query, annot):
    return jaccard_stat(query, annot)

jaccard_index, p_value = jaccard_bootstrap(query, annot)
jaccard_index, p_value
(0.046543092859550536, 0.112)
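For intuition, the bootstrap p-value is the fraction of resampled statistics at least as extreme as the observed one. A minimal sketch of that logic with a stand-in null distribution (genominterv's @bootstrap decorator does the actual resampling of intervals along the chromosome):

import numpy as np

rng = np.random.default_rng(0)
observed = 0.0465                            # e.g. the Jaccard statistic observed above
null_stats = rng.normal(0.03, 0.01, 1000)    # stand-in for statistics on resampled intervals
p_value = (null_stats >= observed).mean()    # fraction at least as extreme as observed
print(p_value)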
Partition the A-compartments into regions around the edges
from genominterv import interval_collapse, interval_union

df = dataframes['round_spermatid_100kb_arms']

start_edge = pd.DataFrame({
    'chrom': df['chrom'],
    'start': df.apply(lambda x: max(x['start'] - x['resolution'], 0), axis=1),
    'end': df['start'] + 1*df['resolution'],
    'resolution': df['resolution'],
    'label': 'start_edge'
})
end_edge = pd.DataFrame({
    'chrom': df['chrom'],
    'start': df['end'] - 1*df['resolution'],
    # The end cannot exceed the chromosome size
    'end': df.apply(lambda x: min(x['end'] + x['resolution'], chromsizes['chrX']), axis=1),
    'resolution': df['resolution'],
    'label': 'end_edge'
})

test_df = pd.concat([start_edge, end_edge]).sort_values(['chrom', 'start', 'end'])
interval_collapse(test_df)
 | start | end | chrom |
---|---|---|---|
0 | 800000 | 1000000 | chrX |
1 | 1500000 | 1800000 | chrX |
2 | 2500000 | 2700000 | chrX |
3 | 2800000 | 3300000 | chrX |
4 | 3400000 | 3600000 | chrX |
... | ... | ... | ... |
91 | 148700000 | 148900000 | chrX |
92 | 149000000 | 149400000 | chrX |
93 | 151100000 | 151300000 | chrX |
94 | 152400000 | 152600000 | chrX |
95 | 153200000 | 153388924 | chrX |
96 rows × 3 columns
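To make the construction concrete, here is the edge arithmetic for a single hypothetical compartment at 100 kb resolution: each boundary becomes a window of one resolution on either side, clipped at the chromosome start.

resolution = 100_000
start, end = 1_000_000, 2_000_000                              # hypothetical compartment
start_edge = (max(start - resolution, 0), start + resolution)  # (900000, 1100000)
end_edge = (end - resolution, end + resolution)                # (1900000, 2100000)
print(start_edge, end_edge)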
import os
from genominterv import interval_collapse

for key, df in dataframes.items():
    outdir = '../results/rec_edges'
    edge_csv_name = os.path.join(outdir, f'{key}_edges.csv')
    if not os.path.exists(edge_csv_name):
        res = df['resolution'].unique()[0]

        start_edge = pd.DataFrame({
            'chrom': df['chrom'],
            'start': df.apply(lambda x: max(x['start'] - x['resolution'], 0), axis=1),
            'end': df['start'] + 1*df['resolution'],
            'resolution': df['resolution'],
            'label': 'start_edge'
        })
        end_edge = pd.DataFrame({
            'chrom': df['chrom'],
            'start': df['end'] - 1*df['resolution'],
            'end': df.apply(lambda x: min(x['end'] + x['resolution'], chromsizes['chrX']), axis=1),
            'resolution': df['resolution'],
            'label': 'end_edge'
        })

        if not os.path.exists(outdir):
            os.makedirs(outdir)
        tmp = pd.concat([start_edge, end_edge]).sort_values(['chrom', 'start', 'end'])
        interval_collapse(tmp).assign(resolution=res).to_csv(edge_csv_name, index=False)
Import edges
import pandas as pd
import os

# Directory containing the edge .csv files
csv_dir = '../results/rec_edges/'

# Create a dictionary to store the DataFrames
edge_df = {}

# Iterate over all .csv files in the directory
for filename in os.listdir(csv_dir):
    if filename.endswith('.csv'):  # Check for .csv files
        # Construct the full file path
        filepath = os.path.join(csv_dir, filename)

        # Load the CSV into a DataFrame
        # Use the filename (without extension) as the dictionary key
        key = filename.replace('_edges_', '')
        key = os.path.splitext(key)[0]
        edge_df[key] = pd.read_csv(filepath)
        edge_df[key]['length'] = edge_df[key]['end'] - edge_df[key]['start']

# The `edge_df` dictionary now contains the DataFrames
print(edge_df.keys())
print(edge_df['fibroblast_100kb_10Mb_edges'].columns)
#ech90 = pd.read_csv('../data/ech90_human_Mmul_10.csv')
dict_keys(['pachytene_spermatocyte_100kb_arms_smoothed_edges', 'round_spermatid_100kb_arms_smoothed_edges', 'fibroblast_500kb_arms_edges', 'round_spermatid_500kb_10Mb_edges', 'fibroblast_100kb_full_edges', 'round_spermatid_500kb_arms_edges', 'spermatogonia_100kb_arms_smoothed_edges', 'fibroblast_500kb_10Mb_edges', 'round_spermatid_100kb_full_edges', 'spermatogonia_500kb_arms_edges', 'spermatogonia_100kb_full_edges', 'pachytene_spermatocyte_100kb_arms_edges', 'sperm_500kb_full_edges', 'pachytene_spermatocyte_500kb_full_edges', 'sperm_100kb_arms_edges', 'sperm_100kb_arms_smoothed_edges', 'sperm_100kb_10Mb_edges', 'pachytene_spermatocyte_100kb_10Mb_edges', 'fibroblast_100kb_arms_smoothed_edges', 'spermatogonia_500kb_10Mb_edges', 'round_spermatid_100kb_full_smoothed_edges', 'fibroblast_500kb_full_edges', 'round_spermatid_100kb_10Mb_edges', 'pachytene_spermatocyte_100kb_full_smoothed_edges', 'fibroblast_100kb_arms_edges', 'sperm_100kb_10Mb_smoothed_edges', 'round_spermatid_500kb_full_edges', 'fibroblast_100kb_10Mb_edges', 'round_spermatid_100kb_arms_edges', 'fibroblast_100kb_10Mb_smoothed_edges', 'spermatogonia_100kb_full_smoothed_edges', 'spermatogonia_500kb_full_edges', 'spermatogonia_100kb_arms_edges', 'pachytene_spermatocyte_100kb_full_edges', 'sperm_500kb_arms_edges', 'sperm_100kb_full_smoothed_edges', 'pachytene_spermatocyte_100kb_10Mb_smoothed_edges', 'pachytene_spermatocyte_500kb_arms_edges', 'sperm_100kb_full_edges', 'round_spermatid_100kb_10Mb_smoothed_edges', 'spermatogonia_100kb_10Mb_smoothed_edges', 'fibroblast_100kb_full_smoothed_edges', 'pachytene_spermatocyte_500kb_10Mb_edges', 'sperm_500kb_10Mb_edges', 'spermatogonia_100kb_10Mb_edges'])
Index(['start', 'end', 'chrom', 'resolution', 'length'], dtype='object')
from genominterv import interval_intersect

samples = dataframes.keys()
#samples = ['round_spermatid_100kb_arms', 'round_spermatid_100kb_10Mb', 'sperm_100kb_arms', 'sperm_100kb_10Mb']
comps = {key: dataframes[key] for key in samples}
edges = {key: edge_df[f'{key}_edges'] for key in samples}

comp_intersects = {key: interval_intersect(comp, ech90).assign(length=lambda x: x['end'] - x['start']) for key, comp in comps.items()}
edge_intersects = {key: interval_intersect(edge, ech90).assign(length=lambda x: x['end'] - x['start']) for key, edge in edges.items()}

## For a single sample only ##
# sample = 'round_spermatid_100kb_arms'
# full_df = dataframes[sample]
# full_intersect = interval_intersect(full_df, ech90).assign(length=lambda x: x['end'] - x['start'])
# edge_intersect = interval_intersect(edge_df[f'{sample}_edges'], ech90).assign(length=lambda x: x['end'] - x['start'])
# edge_df = edge_df[f'{sample}_edges']
# # Plot full
# plot_regions(ech90, full_df, full_intersect)
# plt.suptitle('All edges')
# # Plot edge
# plot_regions(ech90, edge_df, edge_intersect)
# plt.suptitle('Edges only')

### Some stats about the data and intersections ###
stats = pd.DataFrame({
    'Sample': list(samples),
    'Total regions': [comp.shape[0] for comp in comps.values()],
    'Total regions on ECH90': [ci.shape[0] for ci in comp_intersects.values()],
    'Total regions on edges': [ei.shape[0] for ei in edge_intersects.values()],
    'Prop regions on ECH90': [ci.shape[0] / ech90.shape[0] for ci in comp_intersects.values()],
    'Total bp': [comp['length'].sum() for comp in comps.values()],
    'Total bp on ECH90': [ci['length'].sum() for ci in comp_intersects.values()],
    'Total bp on edges': [ei['length'].sum() for ei in edge_intersects.values()],
    'Prop bp on ECH90': [ci['length'].sum() / ech90['length'].sum() for ci in comp_intersects.values()],
})

#df[['Source', 'Rest']] = df['Sample'].str.extract(r'^(.*?)(_\d+\w+.*)')
stats[['source', 'res', 'view', 'smoothed']] = stats['Sample'].str.extract(r'^(.*?)_(100kb|500kb)_(full|arms|10Mb)(?:_(smoothed))?$')
stats['smoothed'] = stats['smoothed'].notna()  # Convert to boolean
stats[['source', 'res', 'view', 'smoothed', 'Prop regions on ECH90', 'Prop bp on ECH90']]
 | source | res | view | smoothed | Prop regions on ECH90 | Prop bp on ECH90 |
---|---|---|---|---|---|---|
0 | spermatogonia | 500kb | arms | False | 0.421053 | 0.489351 |
1 | round_spermatid | 500kb | full | False | 0.368421 | 0.418981 |
2 | fibroblast | 500kb | full | False | 0.684211 | 0.529916 |
3 | pachytene_spermatocyte | 100kb | arms | True | 0.684211 | 0.632145 |
4 | pachytene_spermatocyte | 500kb | full | False | 0.368421 | 0.362651 |
5 | pachytene_spermatocyte | 100kb | full | False | 0.473684 | 0.365265 |
6 | round_spermatid | 100kb | arms | True | 0.631579 | 0.412828 |
7 | fibroblast | 100kb | arms | True | 0.684211 | 0.499297 |
8 | fibroblast | 100kb | full | False | 0.842105 | 0.550167 |
9 | round_spermatid | 100kb | full | False | 0.421053 | 0.472204 |
10 | spermatogonia | 100kb | arms | False | 0.526316 | 0.550008 |
11 | spermatogonia | 100kb | 10Mb | False | 0.631579 | 0.567910 |
12 | sperm | 100kb | full | False | 0.631579 | 0.674746 |
13 | sperm | 100kb | arms | True | 0.578947 | 0.572887 |
14 | spermatogonia | 500kb | 10Mb | False | 0.684211 | 0.689415 |
15 | sperm | 500kb | full | False | 0.631579 | 0.559567 |
16 | spermatogonia | 100kb | arms | True | 0.526316 | 0.549381 |
17 | pachytene_spermatocyte | 100kb | 10Mb | False | 0.789474 | 0.630109 |
18 | round_spermatid | 100kb | 10Mb | False | 0.684211 | 0.519772 |
19 | round_spermatid | 100kb | 10Mb | True | 0.578947 | 0.483689 |
20 | fibroblast | 100kb | 10Mb | False | 0.684211 | 0.494509 |
21 | fibroblast | 100kb | 10Mb | True | 0.526316 | 0.414530 |
22 | fibroblast | 100kb | full | True | 0.736842 | 0.591105 |
23 | sperm | 100kb | arms | False | 0.736842 | 0.578219 |
24 | round_spermatid | 100kb | full | True | 0.368421 | 0.418981 |
25 | fibroblast | 500kb | 10Mb | False | 0.473684 | 0.359088 |
26 | round_spermatid | 500kb | 10Mb | False | 0.578947 | 0.496303 |
27 | sperm | 500kb | arms | False | 0.421053 | 0.395138 |
28 | pachytene_spermatocyte | 100kb | 10Mb | True | 0.631579 | 0.665339 |
29 | pachytene_spermatocyte | 500kb | 10Mb | False | 0.631579 | 0.538153 |
30 | pachytene_spermatocyte | 100kb | full | True | 0.368421 | 0.401552 |
31 | round_spermatid | 500kb | arms | False | 0.684211 | 0.655953 |
32 | spermatogonia | 500kb | full | False | 0.421053 | 0.392944 |
33 | fibroblast | 500kb | arms | False | 0.684211 | 0.550729 |
34 | spermatogonia | 100kb | 10Mb | True | 0.631579 | 0.580640 |
35 | spermatogonia | 100kb | full | True | 0.368421 | 0.386720 |
36 | sperm | 500kb | 10Mb | False | 0.526316 | 0.438818 |
37 | pachytene_spermatocyte | 500kb | arms | False | 0.631579 | 0.650289 |
38 | pachytene_spermatocyte | 100kb | arms | False | 0.789474 | 0.584619 |
39 | sperm | 100kb | 10Mb | True | 0.578947 | 0.549366 |
40 | sperm | 100kb | full | True | 0.578947 | 0.677841 |
41 | fibroblast | 100kb | arms | False | 0.736842 | 0.510641 |
42 | spermatogonia | 100kb | full | False | 0.473684 | 0.374801 |
43 | round_spermatid | 100kb | arms | False | 0.789474 | 0.500366 |
44 | sperm | 100kb | 10Mb | False | 0.684211 | 0.545798 |
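As a quick check of the sample-name parsing used above, the regex splits a key into its four fields (the trailing smoothed group is optional):

import re

pattern = r'^(.*?)_(100kb|500kb)_(full|arms|10Mb)(?:_(smoothed))?$'
print(re.match(pattern, 'round_spermatid_100kb_arms_smoothed').groups())
# ('round_spermatid', '100kb', 'arms', 'smoothed')
print(re.match(pattern, 'fibroblast_500kb_full').groups())
# ('fibroblast', '500kb', 'full', None)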
Do testing of the edges
%%capture
# Define what we are testing
print("""
Goal: to test whether ECH90 regions are enriched in compartment edges
Query: ECH90
Annotation: start and end edges of compartments
Hypothesis:
    ECH90 regions are enriched in compartment edges
Null hypothesis:
    ECH90 regions are not enriched in compartment edges
Tests:
* Proximity test:
    tests whether the query regions are closer to
    the annotation regions than expected by chance.
    NB: regions cannot overlap, so we collapse the annotation regions first.
* Jaccard index:
    measures the similarity between the query and annotation regions,
    where a value of 1 indicates perfect overlap.
""")
Big test, don't run
Parallelized proximity test
Set nsamples = 100_000.
import os.path as op
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
from genominterv.stats import proximity_test
from genominterv import interval_diff

def process_proximity(sample, nsamples=100000):
    query_comp = ech90
    query_edge = ech90
    annot_comp = comps[sample]
    annot_edge = edges[sample]

    # Get non-overlapping query regions
    non_ovl_query_comp = interval_diff(query_comp, annot_comp)
    non_ovl_query_edge = interval_diff(query_edge, annot_edge)

    proximity_comp = proximity_test(non_ovl_query_comp, annot_comp,
                                    nsamples, two_sided=False,
                                    overlap_as_zero=True)
    proximity_edge = proximity_test(non_ovl_query_edge, annot_edge,
                                    nsamples, two_sided=False,
                                    overlap_as_zero=True, span_as_zero=True)

    results = [
        {'Sample': sample, 'Query': 'ECH90', 'Comp Statistic': proximity_comp.statistic,
         'Edge Statistic': proximity_edge.statistic, 'Comp P-value': proximity_comp.pvalue,
         'Edge P-value': proximity_edge.pvalue}
    ]

    # Swap query and annotation
    query_comp, annot_comp = annot_comp, query_comp
    query_edge, annot_edge = annot_edge, query_edge

    non_ovl_query_comp = interval_diff(query_comp, annot_comp)
    non_ovl_query_edge = interval_diff(query_edge, annot_edge)

    proximity_comp = proximity_test(non_ovl_query_comp, annot_comp,
                                    nsamples, two_sided=False,
                                    overlap_as_zero=True)
    proximity_edge = proximity_test(non_ovl_query_edge, annot_edge,
                                    nsamples, two_sided=False,
                                    overlap_as_zero=True)

    results.append(
        {'Sample': sample, 'Query': 'Edge', 'Comp Statistic': proximity_comp.statistic,
         'Edge Statistic': proximity_edge.statistic, 'Comp P-value': proximity_comp.pvalue,
         'Edge P-value': proximity_edge.pvalue}
    )
    return results
### End of function definition ###

nsamples = 100_000
proximity_file = f'../results/proximity_test_{nsamples}.csv'
if not op.exists(proximity_file):  # only recompute when the cached results are missing
    proximity_res = Parallel(n_jobs=-1)(
        delayed(process_proximity)(sample, nsamples) for sample in tqdm(samples)
    )
    # Flatten the results
    proximity_res = pd.DataFrame([item for sublist in proximity_res for item in sublist])
    proximity_res.to_csv(proximity_file, index=False)
100%|██████████| 2/2 [00:00<00:00, 120.90it/s]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
[... full joblib worker traceback truncated ...]
TypeError: proximity_test() got an unexpected keyword argument 'span_as_zero'
Parallelized Jaccard test
from joblib import Parallel, delayed
from tqdm import tqdm
import os.path as op
from genominterv.stats import jaccard_stat
from genominterv.decorators import bootstrap
from datetime import datetime
import pandas as pd

b = 100_000

@bootstrap(chromsizes, samples=b)
def jaccard_bootstrap(query, annot):
    return jaccard_stat(query, annot)

def process_sample(sample):
    """Function to process a single sample."""
    query_comp = ech90
    query_edge = ech90
    annot_comp = comps[sample]
    annot_edge = edges[sample]

    jaccard_comp = jaccard_bootstrap(query_comp, annot_comp)
    jaccard_edge = jaccard_bootstrap(query_edge, annot_edge)

    result = [{
        'Sample': sample,
        'Query': 'ECH90',
        'Comp Index': jaccard_comp[0],
        'Edge Index': jaccard_edge[0],
        'Comp P-value': jaccard_comp[1],
        'Edge P-value': jaccard_edge[1]
    }]

    # Swap query and annotation for the Edge test
    query_comp, annot_comp = annot_comp, query_comp
    query_edge, annot_edge = annot_edge, query_edge

    jaccard_comp = jaccard_bootstrap(query_comp, annot_comp)
    jaccard_edge = jaccard_bootstrap(query_edge, annot_edge)

    result.append({
        'Sample': sample,
        'Query': 'Edge',
        'Comp Index': jaccard_comp[0],
        'Edge Index': jaccard_edge[0],
        'Comp P-value': jaccard_comp[1],
        'Edge P-value': jaccard_edge[1]
    })
    return result

jaccard_file = f'../results/jaccard_test_{b}.csv'

if not op.exists(jaccard_file):
    # Run parallel computation
    print(f"Running bootstrap ({b}) for all samples in parallel")
    results = Parallel(n_jobs=-1)(
        delayed(process_sample)(sample) for sample in tqdm(samples)
    )
    # Flatten and create DataFrame
    jaccard_res = pd.DataFrame([item for sublist in results for item in sublist])
    # Write to CSV
    jaccard_res.to_csv(jaccard_file, index=False)
else:
    # Read the cached results
    jaccard_res = pd.read_csv(jaccard_file)
Plot some summary statistics
# Split sample column:
proximity_res = pd.read_csv(proximity_file)

proximity_res[['source', 'resolution', 'view', 'smoothed']] = proximity_res['Sample'].str.extract(r'^(.*?)_(100kb|500kb)_(full|arms|10Mb)(?:_(smoothed))?$')
proximity_res['source'] = proximity_res['source'].map({'fibroblast': 'Fib', 'spermatogonia': 'SP', 'pachytene_spermatocyte': 'PAC', 'round_spermatid': 'RS', 'sperm': 'Sperm'})
proximity_res['smoothed'] = proximity_res['smoothed'].notna()
proximity_res['resolution'] = proximity_res.apply(lambda x: 'ps500kb' if x['smoothed'] else x['resolution'], axis=1)
proximity_res = proximity_res.melt(
    id_vars=['Sample', 'source', 'resolution', 'view', 'Query'],
    value_vars=['Comp P-value', 'Edge P-value'],
    var_name='type', value_name='p-value').query('Query == "ECH90"')
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

source_order = ['Fib', 'SP', 'PAC', 'RS', 'Sperm']
# Choose the combination of parameters to plot
source = ['fibroblast', 'round_spermatid']
res = ['100kb']
view = ['arms']
annot = ['Edge P-value']

# Plot the results from @group
plot_group_prox = proximity_res.query('resolution in @res and view in @view and type in @annot')

plot_group_prox = plot_group_prox.assign(
    minuslog10p=lambda x: -np.log10(x['p-value']),
    type='Proximity')

# Plot the results
sns.set_theme(style="whitegrid", rc=notebook_rcparams)

g = sns.catplot(
    plot_group_prox, kind='bar',
    x='source', y='minuslog10p', hue='type',
    order=source_order,
    errorbar=None, margin_titles=True, sharex=False,
)

g.despine(left=True)
g.set_axis_labels("", "-log10(p)")
g.legend.set_title('')

for axis in g.axes.flat:
    axis.tick_params(labelbottom=True)
    # rotate the x tick labels
    for item in axis.get_xticklabels():
        item.set_rotation(45)

g.tight_layout()
# Now do the same for the Jaccard index
jaccard_res = pd.read_csv(jaccard_file)

jaccard_res[['source', 'resolution', 'view', 'smoothed']] = jaccard_res['Sample'].str.extract(r'^(.*?)_(100kb|500kb)_(full|arms|10Mb)(?:_(smoothed))?$')
jaccard_res['source'] = jaccard_res['source'].map({'fibroblast': 'Fib', 'spermatogonia': 'SP', 'pachytene_spermatocyte': 'PAC', 'round_spermatid': 'RS', 'sperm': 'Sperm'})
jaccard_res['smoothed'] = jaccard_res['smoothed'].notna()
jaccard_res['resolution'] = jaccard_res.apply(lambda x: 'ps500kb' if x['smoothed'] else x['resolution'], axis=1)
jaccard_res = jaccard_res.melt(
    id_vars=['Sample', 'source', 'resolution', 'view', 'Query'],
    value_vars=['Comp P-value', 'Edge P-value'],
    var_name='type', value_name='p-value').query('Query == "ECH90"')

import numpy as np

source_order = ['Fib', 'SP', 'PAC', 'RS', 'Sperm']
# Choose the combination of parameters to plot
source = ['fibroblast', 'round_spermatid']
res = ['100kb']
view = ['arms']
annot = ['Edge P-value']

# Plot the results from @group
plot_group_jacc = jaccard_res.query('resolution in @res and view in @view and type in @annot')
plot_group_jacc = plot_group_jacc.assign(
    minuslog10p=lambda x: -np.log10(x['p-value']),
    type='Jaccard')

# Plot the results
sns.set_theme(style="whitegrid", rc=notebook_rcparams)

g = sns.catplot(
    plot_group_jacc, kind='bar',
    x='source', y='minuslog10p', hue='type',
    order=source_order,
    errorbar=None, margin_titles=True, sharex=False,
)

g.despine(left=True)
g.set_axis_labels("", "-log10(p)")
g.legend.set_title('')

for axis in g.axes.flat:
    axis.tick_params(labelbottom=True)
    # rotate the x tick labels
    for item in axis.get_xticklabels():
        item.set_rotation(45)

g.tight_layout()
Plot proximity result with Jaccard result
# Concatenate the two plot_groups
group = pd.concat([plot_group_prox, plot_group_jacc])
# display(group[['source', 'resolution', 'type', 'p-value', 'minuslog10p']].query(
#     'source in ("Fib", "RS")'
# ).reset_index(drop=True))

# g = sns.catplot(
#     group, kind='bar',
#     x='source', y='minuslog10p', hue='type',
#     order=source_order,
#     errorbar=None, margin_titles=True,
#     height=2, aspect=1.5
# )
# # Plot a horizontal line at p=0.05
# g.ax.axhline(-np.log10(0.05), color='tab:red', linestyle='--')

f, ax = plt.subplots(figsize=(3, 2))

# barplot
g = sns.barplot(
    data=group, x='source', y='minuslog10p', hue='type',
    order=source_order, ax=ax
)
ax.axhline(-np.log10(0.05), color='tab:red', linestyle='--')

sns.despine(ax=g, left=True)

g.set_ylabel('-log10(p)')
g.set_xlabel('')
g.get_legend().set_title('')
sns.move_legend(g, 'upper center', bbox_to_anchor=(0.5, 0.975))

f.tight_layout()
plt.show()
Plot all combinations of the parameters
We avoid this to reduce the risk of p-hacking.
# # Define the order of the x-axis if we plot all combinations
# view_order = ['full', 'arms', '10Mb']
# row_order = ['Fib', 'SP', 'PAC', 'RS', 'Sperm']
#
# sns.set_theme(style="whitegrid")
#
# g = sns.catplot(
# proximity_res, kind='bar', row='source',
# x='view', y='p-value', hue='type', col='resolution',
# order=view_order, row_order=row_order,
# errorbar=None, margin_titles=True, sharex=False,
# )
#
# g.despine(left=True)
# g.set_axis_labels("", "P-value")
# g.legend.set_title('')
#
#
# for axis in g.axes.flat:
# axis.tick_params(labelbottom=True)
# # rotate said labels
# for item in axis.get_xticklabels():
# item.set_rotation(45)
#
# g.tight_layout()
# # # Group the results by source and resolution
# # group = ["fibroblast", 'round_spermatid']
# # # Plot the results from @group
# # plot_group = jaccard_res.query(f'source in @group')
# sns.set_theme(style="whitegrid")
# view_order = ['full', 'arms', '10Mb']
# row_order = ['Fib', 'SP', 'PAC', 'RS', 'Sperm']
# g = sns.catplot(
# jaccard_res, kind='bar', row='source',
# x='view', y='p-value', hue='type', col='resolution',
# margin_titles=True, sharex=False,
# order=view_order, row_order=row_order,
# errorbar=None)
# g.despine(left=True)
# g.set_axis_labels("", "P-value")
# g.legend.set_title('')
# for axis in g.axes.flat:
# axis.tick_params(labelbottom=True)
# # rotate said labels
# for item in axis.get_xticklabels():
# item.set_rotation(45)
# item.set_va('top')
# g.tight_layout()
Revised test (baboon)
Baboon data
Here, data on hybrid incompatibility in baboons (carrying Papio hamadryas or Papio anubis ancestry) is introduced and compared with the edge regions of the compartments. The intervals were extracted in the baboondiversity project, in the notebook baboon_ancestry_segments.ipynb.
The data originates from a baboon population in a hybridization zone between olive and hamadryas baboons. The extracted intervals are genomic regions where (almost) all individuals carry the same ancestry: 100% olive or at least 95% hamadryas. The data has nothing to do with Hi-C, but maybe the intervals align anyway?
The intervals were lifted to rheMac10 via hg38 (panu_3.0 -> hg38 -> rheMac10); two intervals were deleted with no match in the last step:
#Deleted in new
chrX 58600000 58700000 high_olive
#Deleted in new
chrX 59400000 60800000 high_olive
Additionally, four intervals were lifted to the Y chromosome, so they were deleted as well.
The tests
We think the regions where hamadryas ancestry is preserved are the most relevant to this analysis, as those regions are shorter and there are fewer of them.
Here, we perform the tests as discussed at the meeting:
Edge.v.Edge (100kb +/- start, end):
- jaccard_test
  - query: [ech, high_hama]
  - annot: [rs100_arms_edge, fb100_arms_edge]
  - Total tests: 4
- proximity_test(overlap_as_zero=True, span_as_zero=True)
  - query: [ech, high_hama]
  - annot: [rs100_arms_edge, fb100_arms_edge]
  - Total tests: 4
Region.v.Region:
- jaccard_bootstrap()
  - query: [ech, high_hama]
  - annot: [rs100_arms_comp, fb100_arms_comp]
Load baboons
# Load baboons, define only high_hama
import pandas as pd
from genominterv import interval_collapse

### Lifted with standard UCSC LiftOver
# baboon_df = pd.read_csv('../results/high_baboon_rhemac10.bed', sep='\t', index_col=False, header=None, names=['chrom', 'start', 'end', 'group', 'score'])

baboon_nonlift = pd.read_csv('../results/hama_olive_high_intervals.tsv', sep='\t', index_col=False)

# # print("Original:")
# # print(baboon_df.groupby('group').size())
# high_hama = (interval_collapse(baboon_df.query('group == "high_hama"')).
#              assign(group = "high_hama").
#              query('chrom == "chrX"')
#              ).reset_index(drop=True)
# high_olive = (interval_collapse(baboon_df.query('group == "high_olive"')).
#              assign(group = "high_olive").
#              query('chrom == "chrX"')
#              )
# print("\nCollapsed:")
# #print(pd.concat([high_hama, high_olive]).groupby('group').size())
# #baboon_dict = {"P.hama": high_hama, "P.anubis": high_olive}
# print(f"high_hama: {high_hama.shape[0]}")

### Lifted with segment_liftover (`lift` env)
high_hama = pd.read_csv('../data/lift/rheMac10/high_hama_rhemac10.bed', sep='\t', index_col=False, header=None, names=['group', 'chrom', 'start', 'end'])
high_olive = pd.read_csv('../data/lift/rheMac10/high_olive_rhemac10.bed', sep='\t', index_col=False, header=None, names=['group', 'chrom', 'start', 'end'])
Define functions
from genominterv import interval_collapse
from multiprocessing import cpu_count
from genominterv.stats import jaccard_stat, proximity_stat
from genominterv.decorators import bootstrap
import numpy as np
import pandas as pd

# Make edges function
## Splits an interval into its edges (and collapses overlapping edges)
def make_edges(df, resolution):
    start_edge = pd.DataFrame({
        'chrom': df['chrom'],
        'start': df.apply(lambda x: max(x['start'] - resolution, 0), axis=1),
        'end': df['start'] + 1*resolution,
        'resolution': resolution,
    })
    end_edge = pd.DataFrame({
        'chrom': df['chrom'],
        'start': df['end'] - 1*resolution,
        'end': df.apply(lambda x: min(x['end'] + resolution, chromsizes['chrX']), axis=1),
        'resolution': resolution,
    })
    return interval_collapse(pd.concat([start_edge, end_edge]).sort_values(['chrom', 'start', 'end'])).assign(resolution=resolution)

nsamples = 100_000

@bootstrap(chromsizes, samples=nsamples)
def proximity_test(q, a):
    """
    Tests if the mean relative distance of query segments to the
    closest annotation is smaller than expected.
    """
    return proximity_stat(q, a)

@bootstrap(chromsizes, samples=nsamples)
def jaccard_test(q, a):
    """
    Tests if the overlap between query and annotation segments
    is smaller than expected.
    """
    return jaccard_stat(q, a)

def overlaps(df1, df2):
    """
    Establishes whether each query segment overlaps at least one
    annotation segment. Returns a boolean array with the same length
    as df1.index.
    """
    overlapping = []
    for i, (s1, e1) in enumerate(zip(df1.start, df1.end)):
        overlaps = False
        for s2, e2 in zip(df2.start, df2.end):
            if e1 > s2 and e2 > s1:
                overlaps = True
                break
        overlapping.append(overlaps)
    return np.array(overlapping)

def svedig_tabel(orig_df, index, columns, values, cmap='Reds'):
    df = (orig_df
          .assign(log10p=np.log10(results.p))
          .loc[(results.p < 0.05)]
          .pivot(index=index, columns=columns, values=values)
          )
    df = df.rename(columns={x: x.replace('_', '<br>') for x in df.columns.tolist()})
    df = (df.style
          .background_gradient(subset=df.columns, axis=None, cmap=cmap, vmin=0)
          .map(lambda x: 'color: transparent; background-color: transparent' if np.isnan(x) else '')
          .format('{:.3f}')
          .set_table_styles(
              {c: [{'selector': '',
                    'props': [('min-width', '100px')]}]
               for c in df.columns}, overwrite=False
          )
          )
    return df
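A quick sanity check of overlaps on toy frames (hypothetical coordinates). The double loop is quadratic, which is fine for the few hundred intervals per chromosome used here:

q = pd.DataFrame({'start': [0, 500], 'end': [100, 600]})
a = pd.DataFrame({'start': [50], 'end': [150]})
print(overlaps(q, a))  # [ True False]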
Load comps and edges
# Define the rest and get an overview of the selected samples
samples = ['fibroblast_100kb_arms', 'round_spermatid_100kb_arms']

fb100_dict = {
    'full': dataframes[samples[0]],
    'edges': make_edges(dataframes[samples[0]], 100_000),
    'limits': make_edges(dataframes[samples[0]], 1)}
rs100_dict = {
    'full': dataframes[samples[1]],
    'edges': edge_df[f'{samples[1]}_edges'],
    'limits': make_edges(dataframes[samples[1]], 1)}
hama_dict = {
    'full': high_hama,
    'edges': make_edges(high_hama, 100_000),
    'limits': make_edges(high_hama, 1)
}
olive_dict = {
    'full': high_olive,
    'edges': make_edges(high_olive, 100_000),
    'limits': make_edges(high_olive, 1)
}
hamaolive_dict = {
    'full': pd.concat([high_hama, high_olive]),
    'edges': make_edges(pd.concat([high_hama, high_olive]), 100_000),
    'limits': make_edges(pd.concat([high_hama, high_olive]), 1)
}
ech_dict = {'full': ech90,
            'edges': make_edges(ech90, 100_000),
            'limits': make_edges(ech90, 1)}
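The 'limits' entries call make_edges(df, 1), shrinking each edge window to a 2 bp bracket around the boundary itself. A hypothetical interval illustrates:

demo = pd.DataFrame({'chrom': ['chrX'], 'start': [1000], 'end': [2000]})
print(make_edges(demo, 1))  # two windows: (999, 1001) and (1999, 2001)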
Summarise
# Summarise how much the edges are reduced compared to the full regions
names = ['Fibroblast', 'Round spermatid', 'Hamadryas', 'Olive', 'HamaOlive', 'ECH']
samples = [fb100_dict, rs100_dict, hama_dict, olive_dict, hamaolive_dict, ech_dict]

summary = pd.DataFrame({
    'Sample': names,
    'Full_bp': [sum(sample['full']['end'] - sample['full']['start']) for sample in samples],
    'full_max_bp': [max(sample['full']['end'] - sample['full']['start']) for sample in samples],
    'full_avg_bp': [round(sum(sample['full']['end'] - sample['full']['start']) / sample['full'].shape[0], 1) for sample in samples],
    'full_min_bp': [min(sample['full']['end'] - sample['full']['start']) for sample in samples],
    'Edge_bp': [sum(sample['edges']['end'] - sample['edges']['start']) for sample in samples],
    'edge_avg_bp': [round(sum(sample['edges']['end'] - sample['edges']['start']) / sample['edges'].shape[0], 1) for sample in samples],
}).assign(edge_over_full=lambda x: round(x['Edge_bp'] / x['Full_bp'], 3))

# tell pandas to use a thousands separator
display(summary.style
        .format("{:,.0f}", subset=['Full_bp', 'full_max_bp', 'full_min_bp', 'Edge_bp'])
        .format("{:,.1f}", subset=['full_avg_bp', 'edge_avg_bp'])
        .format("{:.1%}", subset='edge_over_full')
        )

plot_regions(high_olive, high_hama, ech90,
             track_titles=['Hama', 'Olive', 'ECH90'],
             title="Edges")

plot_regions(high_olive, baboon_nonlift.query('group == "high_olive"'),
             track_titles=['Lifted High olive', 'Non-lifted high olive'],
             title="Compare liftOver")

plot_regions(hama_dict['limits'], olive_dict['limits'], ech_dict['limits'],
             track_titles=['Hama', 'Olive', 'ECH90'],
             title='Limits',
             alpha=1, lw=0.3)

plot_regions(olive_dict['limits'], fb100_dict['limits'], rs100_dict['limits'],
             track_titles=['Hama', 'Fibroblast', 'Round spermatid'],
             title='Limits',
             alpha=1, lw=0.3)
 | Sample | Full_bp | full_max_bp | full_avg_bp | full_min_bp | Edge_bp | edge_avg_bp | edge_over_full |
---|---|---|---|---|---|---|---|---|
0 | Fibroblast | 64,200,000 | 5,000,000 | 1,088,135.6 | 100,000 | 21,600,000 | 260,241.0 | 33.6% |
1 | Round spermatid | 56,500,000 | 4,500,000 | 733,766.2 | 100,000 | 27,088,924 | 282,176.3 | 47.9% |
2 | Hamadryas | 8,775,834 | 1,787,836 | 516,225.5 | 96,276 | 6,434,612 | 229,807.6 | 73.3% |
3 | Olive | 44,876,120 | 6,493,293 | 1,319,885.9 | 97,613 | 12,715,662 | 219,235.6 | 28.3% |
4 | HamaOlive | 53,651,954 | 6,493,293 | 1,051,999.1 | 96,276 | 18,941,689 | 228,213.1 | 35.3% |
5 | ECH | 5,511,675 | 623,108 | 290,088.2 | 88,572 | 7,195,159 | 224,848.7 | 130.5% |
Test
Test revised I
# Now define tests
from genominterv.decorators import bootstrap
from genominterv.stats import proximity_test, jaccard_stat
from joblib import Parallel, delayed
from tqdm import tqdm

n_samples = 100_000

## Proximity test
def proximity_parallel(query, annot, nsamples=100_000, oaz=False, saz=False):
    q_key, a_key = query, annot
    query = prox_dict[q_key]
    annot = prox_dict[a_key]

    test = proximity_test(query, annot, nsamples,
                          overlap_as_zero=oaz,
                          span_as_zero=saz
                          )

    results = [
        {'Query': q_key,
         'Annot': a_key,
         'Statistic': test.statistic,
         'P-value': test.pvalue,
         'Overlap as zero': oaz,
         'Span as zero': saz
         }
    ]
    return results

## Jaccard bootstrap
@bootstrap(chromsizes, samples=10_000)
def jaccard_bootstrap(query, annot):
    return jaccard_stat(query, annot)

def jaccard_parallel(query, annot):
    q_key = query
    a_key = annot
    query = jacc_dict[q_key]
    annot = jacc_dict[a_key]

    test = jaccard_bootstrap(query, annot)

    results = [
        {'Query': q_key,
         'Annot': a_key,
         'Index': test[0],
         'P-value': test[1]
         }]
    return results

# Now do the testing
# Define the samples
prox_dict = {'ECH90': ech_dict['full'],
             'Hamadryas': hama_dict['full'],
             'Olive': olive_dict['full'],
             'Fib100_edge': fb100_dict['edges'],
             'RS100_edge': rs100_dict['edges'],
             'RS100_full': rs100_dict['full']
             }

prox_samples = [('ECH90', 'Fib100_edge', n_samples),
                ('ECH90', 'RS100_edge', n_samples),
                ('Hamadryas', 'Fib100_edge', n_samples),
                ('Hamadryas', 'RS100_edge', n_samples),
                ('Olive', 'Fib100_edge', n_samples),
                ('Olive', 'RS100_edge', n_samples),
                ('Olive', 'RS100_full', n_samples),
                ('Hamadryas', 'RS100_full', n_samples),
                ]

# First, without oaz and saz
proximity_res1 = Parallel(n_jobs=-1)(
    delayed(proximity_parallel)(que, ann, n, False, False) for (que, ann, n) in tqdm(prox_samples)
)
proximity_res1 = pd.DataFrame([item for sublist in proximity_res1 for item in sublist])

# Now with both oaz and saz
proximity_res2 = Parallel(n_jobs=-1)(
    delayed(proximity_parallel)(que, ann, n, True, True) for (que, ann, n) in tqdm(prox_samples)
)
proximity_res2 = pd.DataFrame([item for sublist in proximity_res2 for item in sublist])

# Now with oaz only
proximity_res3 = Parallel(n_jobs=-1)(
    delayed(proximity_parallel)(que, ann, n, True, False) for (que, ann, n) in tqdm(prox_samples)
)
proximity_res3 = pd.DataFrame([item for sublist in proximity_res3 for item in sublist])

# Concatenate the results
proximity_res = pd.concat([proximity_res1, proximity_res2, proximity_res3])

# Write to CSV
#proximity_res.to_csv('../results/proximity_revised.csv', index=False)

# Now do the Jaccard index
jacc_samples = [(ech_dict['full'], fb100_dict['full']),
                (hama_dict['full'], rs100_dict['full'])
                ]  # (superseded by the keyed pairs below)

jacc_dict = {'ECH90': ech_dict['full'],
             'Hamadryas': hama_dict['full'],
             'Olive': olive_dict['full'],
             'Fib100_edge': fb100_dict['edges'],
             'Fib100_full': fb100_dict['full'],
             'RS100_full': rs100_dict['full'],
             'RS100_edge': rs100_dict['edges']
             }
jacc_samples = [
    ('ECH90', 'Fib100_full'),
    ('ECH90', 'Fib100_edge'),
    ('ECH90', 'RS100_full'),
    ('ECH90', 'RS100_edge'),
    ('Hamadryas', 'Fib100_full'),
    ('Hamadryas', 'Fib100_edge'),
    ('Hamadryas', 'RS100_full'),
    ('Hamadryas', 'RS100_edge'),
    ('Olive', 'Fib100_edge'),
    ('Olive', 'RS100_edge'),
]

jaccard_res = Parallel(n_jobs=-1)(
    delayed(jaccard_parallel)(que, ann) for (que, ann) in tqdm(jacc_samples)
)

# Flatten and create DataFrame
jaccard_res = pd.DataFrame([item for sublist in jaccard_res for item in sublist])

# Write to CSV
#jaccard_res.to_csv('../results/jaccard_revised.csv', index=False)
100%|██████████| 8/8 [00:00<00:00, 395.88it/s]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
[... joblib traceback truncated; the run was interrupted manually ...]
KeyboardInterrupt:
display('Proximity:', proximity_res.sort_values(['Query', 'Annot', 'Overlap as zero']),
        'Jaccard:', jaccard_res)
'Proximity:'
 | Query | Annot | Statistic | P-value | Overlap as zero | Span as zero |
---|---|---|---|---|---|---|
0 | ECH90 | Fib100_edge | 0.126462 | 0.18750 | False | False |
0 | ECH90 | Fib100_edge | 0.320105 | 0.00000 | True | True |
0 | ECH90 | Fib100_edge | 0.320105 | 0.00000 | True | False |
1 | ECH90 | RS100_edge | -0.051091 | 0.85932 | False | False |
1 | ECH90 | RS100_edge | 0.412667 | 0.00000 | True | True |
1 | ECH90 | RS100_edge | 0.383300 | 0.00000 | True | False |
2 | Hamadryas | Fib100_edge | 0.010583 | 0.49535 | False | False |
2 | Hamadryas | Fib100_edge | 0.121857 | 0.00057 | True | True |
2 | Hamadryas | Fib100_edge | 0.121857 | 0.32055 | True | False |
3 | Hamadryas | RS100_edge | 0.043037 | 0.46991 | False | False |
3 | Hamadryas | RS100_edge | 0.079267 | 0.10159 | True | True |
3 | Hamadryas | RS100_edge | 0.079267 | 0.00830 | True | False |
7 | Hamadryas | RS100_full | -0.038111 | 0.67801 | False | False |
7 | Hamadryas | RS100_full | 0.065700 | 0.55859 | True | True |
7 | Hamadryas | RS100_full | 0.065700 | 0.32201 | True | False |
4 | Olive | Fib100_edge | 0.267519 | 0.00000 | False | False |
4 | Olive | Fib100_edge | 0.281078 | 0.00000 | True | True |
4 | Olive | Fib100_edge | 0.277333 | 0.00000 | True | False |
5 | Olive | RS100_edge | 0.137642 | 0.00000 | False | False |
5 | Olive | RS100_edge | 0.156492 | 0.00000 | True | True |
5 | Olive | RS100_edge | 0.156492 | 0.00000 | True | False |
6 | Olive | RS100_full | 0.179701 | 0.00000 | False | False |
6 | Olive | RS100_full | 0.197664 | 0.00000 | True | True |
6 | Olive | RS100_full | 0.185778 | 0.00000 | True | False |
'Jaccard:'
 | Query | Annot | Index | P-value |
---|---|---|---|---|
0 | ECH90 | Fib100_full | 0.042072 | 0.1962 |
1 | ECH90 | Fib100_edge | 0.063394 | 0.0141 |
2 | ECH90 | RS100_full | 0.046543 | 0.1005 |
3 | ECH90 | RS100_edge | 0.062911 | 0.0121 |
4 | Hamadryas | Fib100_full | 0.032936 | 0.2989 |
5 | Hamadryas | Fib100_edge | 0.015783 | 0.7405 |
6 | Hamadryas | RS100_full | 0.031271 | 0.3467 |
7 | Hamadryas | RS100_edge | 0.022037 | 0.5719 |
8 | Olive | Fib100_edge | 0.039584 | 0.7753 |
9 | Olive | RS100_edge | 0.039312 | 0.8489 |
Test revised II
Kasper realized it is very hard to capture both proximity and overlap in the same statistic. We therefore go back to testing the overlapping regions with the Jaccard test, and use a new proximity test that measures the mean distance between query regions and annotation regions, with almost the same power as the previous one.
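Loosely, the new statistic averages, over query segments, the distance to the nearest annotation segment. A toy version of that idea (illustrative only; genominterv's proximity_stat uses relative, normalised distances):

import numpy as np

def toy_mean_distance(query, annot):
    # distance from each query midpoint to the closest annotation midpoint
    q_mid = (query['start'] + query['end']).to_numpy() / 2
    a_mid = (annot['start'] + annot['end']).to_numpy() / 2
    return np.mean([np.abs(a_mid - m).min() for m in q_mid])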
### Test the functions (took ~25 seconds for nsamples=100_000) ###
query = olive_dict['limits']
annot = make_edges(dataframes['sperm_100kb_10Mb'], 5)

stat, p = jaccard_test(query, annot)
print(f'''
Jaccard overlap test:
stat: {stat}
p: {p}
''')

query_non_ovl = query.loc[~overlaps(query, annot)]
stat, p = proximity_test(query_non_ovl, annot)
print(f'''
Proximity test:
stat: {stat}
p: {p}
''')
Jaccard overlap test:
stat: 0.0013218770654329147
p: 0.00053
Proximity test:
stat: 0.2930268116279996
p: 0.00929
### Make it a loop instead
a_names = ['Fb100arms_edge', 'RS100arms_edge',
           'Fb100_10Mb_edge', 'RS100_10Mb_edge',
           'Fb100arms_full', 'RS100arms_full']
q_names = ['ECH', 'Hama', 'Olive',
           'ECH_edge', 'Hama_edge', 'Olive_edge']
annots = [edge_df['fibroblast_100kb_arms_edges'], edge_df['round_spermatid_100kb_arms_edges'],
          edge_df['fibroblast_100kb_10Mb_edges'], edge_df['round_spermatid_100kb_10Mb_edges'],
          dataframes['fibroblast_100kb_arms'], dataframes['round_spermatid_100kb_arms']]
queries = [ech_dict['full'], hama_dict['full'], olive_dict['full'],
           ech_dict['edges'], hama_dict['edges'], olive_dict['edges']]

results = pd.DataFrame({
    'Query': [],
    'Annotation': [],
    'Jaccard Index': [],
    'Jaccard P-value': [],
    'Proximity Statistic': [],
    'Proximity P-value': []
})

for i, query in enumerate(queries):
    print(f'i = {i}')
    for j, annot in enumerate(annots):
        print(f'j = {j}')
        j_stat, j_p = jaccard_test(query, annot)
        query_non_ovl = query.loc[~overlaps(query, annot)]
        p_stat, p_p = proximity_test(query_non_ovl, annot)
        tmp = pd.DataFrame(
            {'Query': q_names[i],
             'Annotation': a_names[j],
             'Jaccard Index': j_stat,
             'Jaccard P-value': j_p,
             'Proximity Statistic': p_stat,
             'Proximity P-value': p_p
             },
            index=[0])
        results = pd.concat([results, tmp], ignore_index=True)

results
i = 0
j = 0
j = 1
j = 2
j = 3
j = 4
j = 5
[... the same j = 0-5 progress output repeats for i = 1 through i = 5 ...]
 | Query | Annotation | Jaccard Index | Jaccard P-value | Proximity Statistic | Proximity P-value |
---|---|---|---|---|---|---|
0 | ECH | Fb100arms_edge | 0.063394 | 0.01234 | 0.281690 | 0.31470 |
1 | ECH | RS100arms_edge | 0.062911 | 0.01029 | 0.237249 | 0.61862 |
2 | ECH | Fb100_10Mb_edge | 0.048480 | 0.08578 | 0.221680 | 0.73127 |
3 | ECH | RS100_10Mb_edge | 0.057965 | 0.02468 | 0.280611 | 0.33594 |
4 | ECH | Fb100arms_full | 0.042072 | 0.19746 | 0.188639 | 0.40612 |
5 | ECH | RS100arms_full | 0.046543 | 0.10295 | 0.277032 | 0.39865 |
6 | Hama | Fb100arms_edge | 0.021886 | 0.90170 | 0.253639 | 0.51690 |
7 | Hama | RS100arms_edge | 0.027870 | 0.85085 | 0.256731 | 0.50018 |
8 | Hama | Fb100_10Mb_edge | 0.031809 | 0.75087 | 0.253548 | 0.51142 |
9 | Hama | RS100_10Mb_edge | 0.034824 | 0.72564 | 0.298500 | 0.39744 |
10 | Hama | Fb100arms_full | 0.059582 | 0.33048 | 0.255948 | 0.65396 |
11 | Hama | RS100arms_full | 0.069205 | 0.15910 | 0.242701 | 0.43243 |
12 | Olive | Fb100arms_edge | 0.109276 | 0.39990 | 0.279857 | 0.27209 |
13 | Olive | RS100arms_edge | 0.139396 | 0.23299 | 0.239110 | 0.65271 |
14 | Olive | Fb100_10Mb_edge | 0.082725 | 0.93293 | 0.252348 | 0.52897 |
15 | Olive | RS100_10Mb_edge | 0.146962 | 0.17601 | 0.242756 | 0.61343 |
16 | Olive | Fb100arms_full | 0.281681 | 0.03051 | 0.272324 | 0.56266 |
17 | Olive | RS100arms_full | 0.209506 | 0.32056 | 0.235625 | 0.71433 |
18 | ECH_edge | Fb100arms_edge | 0.076356 | 0.00404 | 0.295478 | 0.18026 |
19 | ECH_edge | RS100arms_edge | 0.062408 | 0.03379 | 0.266103 | 0.38942 |
20 | ECH_edge | Fb100_10Mb_edge | 0.057708 | 0.06767 | 0.269624 | 0.34382 |
21 | ECH_edge | RS100_10Mb_edge | 0.074277 | 0.00498 | 0.273560 | 0.35419 |
22 | ECH_edge | Fb100arms_full | 0.051794 | 0.18634 | 0.295836 | 0.50053 |
23 | ECH_edge | RS100arms_full | 0.054435 | 0.11747 | 0.274170 | 0.81663 |
24 | Hama_edge | Fb100arms_edge | 0.014028 | 0.95055 | 0.240012 | 0.64811 |
25 | Hama_edge | RS100arms_edge | 0.033281 | 0.54103 | 0.244095 | 0.60123 |
26 | Hama_edge | Fb100_10Mb_edge | 0.029314 | 0.63328 | 0.262847 | 0.40403 |
27 | Hama_edge | RS100_10Mb_edge | 0.044633 | 0.21629 | 0.234146 | 0.67565 |
28 | Hama_edge | Fb100arms_full | 0.053583 | 0.05438 | 0.191426 | 0.12090 |
29 | Hama_edge | RS100arms_full | 0.061216 | 0.00873 | 0.200406 | 0.12220 |
30 | Olive_edge | Fb100arms_edge | 0.038987 | 0.87121 | 0.250063 | 0.54679 |
31 | Olive_edge | RS100arms_edge | 0.049514 | 0.75761 | 0.257123 | 0.43135 |
32 | Olive_edge | Fb100_10Mb_edge | 0.057027 | 0.47526 | 0.257397 | 0.42551 |
33 | Olive_edge | RS100_10Mb_edge | 0.051652 | 0.71941 | 0.243567 | 0.64567 |
34 | Olive_edge | Fb100arms_full | 0.079507 | 0.32217 | 0.258289 | 0.46750 |
35 | Olive_edge | RS100arms_full | 0.071486 | 0.51011 | 0.237859 | 0.67702 |
subset_annot = ['RS100arms_edge', 'RS100_10Mb_edge',
                'Fb100arms_edge', 'Fb100_10Mb_edge']
subset_query = ['ECH_edge', 'Hama_edge', 'Olive_edge']

display((results
         .query('Annotation in @subset_annot and Query in @subset_query')
         .sort_values(['Annotation', 'Query'])
         .filter(['Annotation', 'Query', 'Jaccard P-value', 'Proximity P-value'])
         ))
 | Annotation | Query | Jaccard P-value | Proximity P-value |
---|---|---|---|---|
20 | Fb100_10Mb_edge | ECH_edge | 0.06767 | 0.34382 |
26 | Fb100_10Mb_edge | Hama_edge | 0.63328 | 0.40403 |
32 | Fb100_10Mb_edge | Olive_edge | 0.47526 | 0.42551 |
18 | Fb100arms_edge | ECH_edge | 0.00404 | 0.18026 |
24 | Fb100arms_edge | Hama_edge | 0.95055 | 0.64811 |
30 | Fb100arms_edge | Olive_edge | 0.87121 | 0.54679 |
21 | RS100_10Mb_edge | ECH_edge | 0.00498 | 0.35419 |
27 | RS100_10Mb_edge | Hama_edge | 0.21629 | 0.67565 |
33 | RS100_10Mb_edge | Olive_edge | 0.71941 | 0.64567 |
19 | RS100arms_edge | ECH_edge | 0.03379 | 0.38942 |
25 | RS100arms_edge | Hama_edge | 0.54103 | 0.60123 |
31 | RS100arms_edge | Olive_edge | 0.75761 | 0.43135 |
# plot the significant proximity-result
query = ech_dict['edges']
annot = fb100_dict['full']

plot_regions(query, annot, interval_intersect(query, annot),
             track_titles=['ECH90', 'Fb100_full', 'Intersect'])
# plot some Jaccard-results
query = ech_dict['edges']
annot = edge_df['round_spermatid_100kb_10Mb_edges']

plot_regions(query, annot, interval_intersect(query, annot),
             track_titles=['ECH90', 'RS100_10Mb_edges', 'Intersect'])
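Both plots rely on interval_intersect, which is defined earlier in the notebook. As a rough sketch of the idea (assuming chrom/start/end columns, half-open coordinates, and a single chromosome; pandas is already imported as pd):

def interval_intersect_sketch(query, annot):
    # Collect the overlapping part of every query/annotation interval pair
    rows = []
    for _, q in query.iterrows():
        hits = annot[(annot.start < q.end) & (q.start < annot.end)]
        for _, a in hits.iterrows():
            rows.append({'chrom': q.chrom,
                         'start': max(q.start, a.start),
                         'end': min(q.end, a.end)})
    return pd.DataFrame(rows, columns=['chrom', 'start', 'end'])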
import matplotlib.pyplot as plt
import seaborn as sns

test_results = (
    results
    .query('Annotation in @subset_annot and Query in @subset_query')
    .filter(['Annotation', 'Query', 'Jaccard P-value', 'Proximity P-value'])
)

# Melt the results to a single p-value column
test_results_melt = (
    test_results
    .rename(columns={'Jaccard P-value': 'Jaccard', 'Proximity P-value': 'Proximity'})
    .melt(
        id_vars=['Query', 'Annotation'],
        value_vars=['Jaccard', 'Proximity'],
        var_name='type', value_name='p-value')
    .assign(logp=lambda x: -np.log10(x['p-value']))
)

source_order = ['Fb', 'Spa', 'Pac', 'RS', 'Spz']

f, ax = plt.subplots(figsize=(3, 2))

# NB: source_order holds cell-type labels ('Fb', ...), while x is Annotation
# ('Fb100arms_edge', ...); seaborn drops categories missing from `order`,
# so as written this likely plots nothing.
g = sns.barplot(
    data=test_results_melt, x='Annotation', y='logp', hue='type',
    order=source_order, ax=ax
)
ax.axhline(-np.log10(0.05), color='tab:red', linestyle='--')

sns.despine(ax=g, left=True)

g.set_ylabel('-log10(p)')
g.set_xlabel('')
g.get_legend().set_title('')
sns.move_legend(g, 'upper center', bbox_to_anchor=(0.4, 0.975))
Do a new test with limits (2 bp)
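make_edges is defined earlier in the notebook; as a hypothetical sketch of the interpretation used below (keep a window of +/- pad bp around each compartment boundary, so pad=1 gives the 2 bp limits in this heading):

def make_edges_sketch(df, pad):
    # Hypothetical: one interval of width 2*pad around each start and each end
    left = pd.DataFrame({'chrom': df['chrom'],
                         'start': df['start'] - pad,
                         'end': df['start'] + pad})
    right = pd.DataFrame({'chrom': df['chrom'],
                          'start': df['end'] - pad,
                          'end': df['end'] + pad})
    return (pd.concat([left, right], ignore_index=True)
              .sort_values(['chrom', 'start'])
              .reset_index(drop=True))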
# Calculate the proximity and Jaccard tests
# Annot: Fb, Spa, Pac, RS, Spz (edges)
# Query: ECH

# Define the samples
annots = {
    'Fb': dataframes['fibroblast_100kb_arms'],
    'Spa': dataframes['spermatogonia_100kb_arms'],
    'Pac': dataframes['pachytene_spermatocyte_100kb_arms'],
    'RS': dataframes['round_spermatid_100kb_arms'],
    'Spz': dataframes['sperm_100kb_arms']
}
queries = {
    'ECH': ech90,
    'ECH_lim': make_edges(ech90, 1),
    'Hama_lim': make_edges(high_hama, 1),
    'Olive_lim': make_edges(high_olive, 1)
}

# Run the tests
test_results = {
    'Query': [],
    'Source': [],
    'Type': [],
    'Jaccard': [],
    'Proximity': []
}
for i, (name, annot) in enumerate(annots.items()):
    print(f'{i+1} of {len(annots)}: {name}')
    # Test against the full compartments, their 100 kb edges, and their 2 bp limits
    tmp_annots = {'comps': annot,
                  'edges': make_edges(annot, 100_000),
                  'limits': make_edges(annot, 1)}
    for tmp_query_key, query in queries.items():
        print(f'\tQuery: {tmp_query_key}')
        for tmp_key, tmp_annot in tmp_annots.items():
            print(f'\t\t{tmp_key}', end=' ')
            _, j_p = jaccard_test(query, tmp_annot)
            print('--> Jaccard done', end=' ')
            # The proximity test requires a query that does not overlap the annotation
            query_non_ovl = query.loc[~overlaps(query, tmp_annot)]
            _, p_p = proximity_test(query_non_ovl, tmp_annot)
            print('--> Proximity done.')
            # Append to results
            test_results['Query'].append(tmp_query_key)
            test_results['Source'].append(name)
            test_results['Type'].append(tmp_key)
            test_results['Jaccard'].append(j_p)
            test_results['Proximity'].append(p_p)

results = pd.DataFrame(test_results)
results
1 of 5: Fb
    Query: ECH
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: ECH_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Hama_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Olive_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
... (the same progress lines repeat for 2 of 5: Spa, 3 of 5: Pac, 4 of 5: RS, and 5 of 5: Spz)
 | Query | Source | Type | Jaccard | Proximity |
---|---|---|---|---|---|
0 | ECH | Fb | comps | 0.19934 | 0.82548 |
1 | ECH | Fb | edges | 0.01199 | 0.31260 |
2 | ECH | Fb | limits | 0.00530 | 0.55282 |
3 | ECH_lim | Fb | comps | 0.40849 | 0.65961 |
4 | ECH_lim | Fb | edges | 0.03409 | 0.29474 |
5 | ECH_lim | Fb | limits | 1.00000 | 0.55628 |
6 | Hama_lim | Fb | comps | 0.12358 | 0.62059 |
7 | Hama_lim | Fb | edges | 0.87612 | 0.89615 |
8 | Hama_lim | Fb | limits | 1.00000 | 0.86046 |
9 | Olive_lim | Fb | comps | 0.27137 | 0.36505 |
10 | Olive_lim | Fb | edges | 0.85070 | 0.51847 |
11 | Olive_lim | Fb | limits | 1.00000 | 0.60821 |
12 | ECH | Spa | comps | 0.05156 | 0.00314 |
13 | ECH | Spa | edges | 0.49491 | 0.24411 |
14 | ECH | Spa | limits | 0.68780 | 0.23246 |
15 | ECH_lim | Spa | comps | 0.33333 | 0.03413 |
16 | ECH_lim | Spa | edges | 0.23260 | 0.09237 |
17 | ECH_lim | Spa | limits | 1.00000 | 0.04169 |
18 | Hama_lim | Spa | comps | 0.09449 | 0.51142 |
19 | Hama_lim | Spa | edges | 0.62895 | 0.61381 |
20 | Hama_lim | Spa | limits | 0.00001 | 0.50210 |
21 | Olive_lim | Spa | comps | 0.57259 | 0.58325 |
22 | Olive_lim | Spa | edges | 0.91817 | 0.19321 |
23 | Olive_lim | Spa | limits | 1.00000 | 0.46966 |
24 | ECH | Pac | comps | 0.59824 | 0.42413 |
25 | ECH | Pac | edges | 0.10229 | 0.74138 |
26 | ECH | Pac | limits | 0.15461 | 0.69764 |
27 | ECH_lim | Pac | comps | 0.46237 | 0.92811 |
28 | ECH_lim | Pac | edges | 0.10141 | 0.74043 |
29 | ECH_lim | Pac | limits | 1.00000 | 0.71976 |
30 | Hama_lim | Pac | comps | 0.39982 | 0.66599 |
31 | Hama_lim | Pac | edges | 0.48866 | 0.22423 |
32 | Hama_lim | Pac | limits | 0.00005 | 0.44405 |
33 | Olive_lim | Pac | comps | 0.03725 | 0.78024 |
34 | Olive_lim | Pac | edges | 0.58759 | 0.34903 |
35 | Olive_lim | Pac | limits | 0.00004 | 0.39258 |
36 | ECH | RS | comps | 0.10578 | 0.07079 |
37 | ECH | RS | edges | 0.01046 | 0.61816 |
38 | ECH | RS | limits | 0.00655 | 0.73480 |
39 | ECH_lim | RS | comps | 0.18603 | 0.34104 |
40 | ECH_lim | RS | edges | 0.11797 | 0.53686 |
41 | ECH_lim | RS | limits | 1.00000 | 0.35290 |
42 | Hama_lim | RS | comps | 0.01714 | 0.85609 |
43 | Hama_lim | RS | edges | 0.56556 | 0.78961 |
44 | Hama_lim | RS | limits | 1.00000 | 0.78258 |
45 | Olive_lim | RS | comps | 0.39299 | 0.78907 |
46 | Olive_lim | RS | edges | 0.66654 | 0.75863 |
47 | Olive_lim | RS | limits | 1.00000 | 0.56521 |
48 | ECH | Spz | comps | 0.03892 | 0.86643 |
49 | ECH | Spz | edges | 0.07330 | 0.63545 |
50 | ECH | Spz | limits | 0.08195 | 0.88817 |
51 | ECH_lim | Spz | comps | 0.10352 | 0.54812 |
52 | ECH_lim | Spz | edges | 0.80407 | 0.54553 |
53 | ECH_lim | Spz | limits | 1.00000 | 0.73543 |
54 | Hama_lim | Spz | comps | 0.20128 | 0.13174 |
55 | Hama_lim | Spz | edges | 0.53662 | 0.22328 |
56 | Hama_lim | Spz | limits | 1.00000 | 0.09604 |
57 | Olive_lim | Spz | comps | 0.18841 | 0.59118 |
58 | Olive_lim | Spz | edges | 0.84400 | 0.55331 |
59 | Olive_lim | Spz | limits | 1.00000 | 0.64852 |
q_filter = ['ECH', 'Hama_lim', 'Olive_lim']

results.query('Query in @q_filter and Type == "limits"').sort_values(['Source', 'Type'])
 | Query | Source | Type | Jaccard | Proximity |
---|---|---|---|---|---|
2 | ECH | Fb | limits | 0.00530 | 0.55282 |
8 | Hama_lim | Fb | limits | 1.00000 | 0.86046 |
11 | Olive_lim | Fb | limits | 1.00000 | 0.60821 |
26 | ECH | Pac | limits | 0.15461 | 0.69764 |
32 | Hama_lim | Pac | limits | 0.00005 | 0.44405 |
35 | Olive_lim | Pac | limits | 0.00004 | 0.39258 |
38 | ECH | RS | limits | 0.00655 | 0.73480 |
44 | Hama_lim | RS | limits | 1.00000 | 0.78258 |
47 | Olive_lim | RS | limits | 1.00000 | 0.56521 |
14 | ECH | Spa | limits | 0.68780 | 0.23246 |
20 | Hama_lim | Spa | limits | 0.00001 | 0.50210 |
23 | Olive_lim | Spa | limits | 1.00000 | 0.46966 |
50 | ECH | Spz | limits | 0.08195 | 0.88817 |
56 | Hama_lim | Spz | limits | 1.00000 | 0.09604 |
59 | Olive_lim | Spz | limits | 1.00000 | 0.64852 |
import matplotlib.pyplot as plt
import seaborn as sns

# Melt the results to a single p-value column
# NB: no query is named "ECH_limits" (the keys are 'ECH', 'ECH_lim', 'Hama_lim',
# 'Olive_lim'), so this filter matches nothing and the melted frame is empty,
# which is what triggers the AttributeError below.
test_results_melt = (
    results.query('Query == "ECH_limits"')
    .melt(
        id_vars=['Query', 'Source'],
        value_vars=['Jaccard', 'Proximity'],
        var_name='test', value_name='p-value')
    .assign(logp=lambda x: -np.log10(x['p-value']))
)

source_order = ['Fb', 'Spa', 'Pac', 'RS', 'Spz']

f, ax = plt.subplots(figsize=(3, 2))

g = sns.barplot(
    data=test_results_melt, x='Source', y='logp', hue='test',
    order=source_order, ax=ax
)
ax.axhline(-np.log10(0.05), color='tab:red', linestyle='--')

sns.despine(ax=g, left=True)

g.set_ylabel('-log10(p)')
g.set_xlabel('')
g.get_legend().set_title('')
sns.move_legend(g, 'upper center', bbox_to_anchor=(0.4, 0.975))
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[38], line 29
     27 g.set_ylabel('-log10(p)')
     28 g.set_xlabel('')
---> 29 g.get_legend().set_title('')
     30 sns.move_legend(g, 'upper center', bbox_to_anchor=(0.4,0.975))

AttributeError: 'NoneType' object has no attribute 'set_title'
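Since the empty query leaves the barplot without a legend, get_legend() returns None. A defensive variant of the last two lines (a sketch) guards against that:

legend = g.get_legend()
if legend is not None:
    legend.set_title('')
    sns.move_legend(g, 'upper center', bbox_to_anchor=(0.4, 0.975))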
Old test
# # Now we should test
# from genominterv import proximity_test, interval_diff, jaccard_stat, bootstrap
# #query = ech90
# query = edges['round_spermatid_100kb_arms']
# annot_hama = hama_edges
# #annot_olive = baboon_dict['P.anubis']
# b = 1_000
# # Proximity test
# non_ovl_query_hama = interval_diff(query, annot_hama)
# #non_ovl_query_olive = interval_diff(query, annot_olive)
# proximity_hama = proximity_test(non_ovl_query_hama, annot_hama, two_sided=True, samples=10_000)
# #proximity_olive = proximity_test(non_ovl_query_olive, annot_olive, two_sided=True, samples=10_000)
# # Jaccard index
# @bootstrap(chromsizes, samples=b, smaller=False)
# def jaccard_bootstrap(query, annot):
# return jaccard_stat(query, annot)
# jaccard_hama = jaccard_bootstrap(query, annot_hama)
# #jaccard_olive = jaccard_bootstrap(query, annot_olive)
# print(f"""
# Proximity test results:
# Hama: {proximity_hama}
# # Olive: {proximity_olive}
# Jaccard index results:
# Hama: {jaccard_hama}
# # Olive: {jaccard_olive}
# """)
# # # Swap annot and query
# # non_ovl_query_hama = interval_diff(annot_hama, query)
# # non_ovl_query_olive = interval_diff(annot_olive, query)
# # proximity_hama_flip = proximity_test(non_ovl_query_hama, query, two_sided=True, samples=b)
# # proximity_olive_flip = proximity_test(non_ovl_query_olive, query, two_sided=True, samples=b)
# # jaccard_hama_flip = jaccard_bootstrap(annot_hama, query)
# # jaccard_olive_flip = jaccard_bootstrap(annot_olive, query)
# Make a long dataframe
# pd.DataFrame({
# 'Group': ['P.hama', 'P.anubis'],
# #'Proximity Statistic': [proximity_hama.statistic, proximity_olive.statistic],
# 'Proximity P-value': [proximity_hama.pvalue, proximity_olive.pvalue],
# #'Jaccard Index': [jaccard_hama[0], jaccard_olive[0]],
# 'Jaccard P-value': [jaccard_hama[1], jaccard_olive[1]]
# }).melt(id_vars='Group', var_name='Test', value_name='Value')
pd.DataFrame({
    'Group': ['P.hama', 'P.anubis'],
    #'Proximity Statistic': [proximity_hama.statistic, proximity_olive.statistic],
    'Proximity P-value': [proximity_hama.pvalue, proximity_olive.pvalue],
    #'Jaccard Index': [jaccard_hama[0], jaccard_olive[0]],
    'Jaccard P-value': [jaccard_hama[1], jaccard_olive[1]],
    # #'Proximity Statistic (flip)': [proximity_hama_flip.statistic, proximity_olive_flip.statistic],
    # 'Proximity P-value (flip)': [proximity_hama_flip.pvalue, proximity_olive_flip.pvalue],
    # #'Jaccard Index (flip)': [jaccard_hama_flip[0], jaccard_olive_flip[0]],
    # 'Jaccard P-value (flip)': [jaccard_hama_flip[1], jaccard_olive_flip[1]],
}).melt(id_vars='Group', var_name='Test', value_name='Value')
 | Group | Test | Value |
---|---|---|---|
0 | P.hama | Proximity P-value | 0.0069 |
1 | P.anubis | Proximity P-value | 0.0274 |
2 | P.hama | Jaccard P-value | 0.2720 |
3 | P.anubis | Jaccard P-value | 0.8250 |
Plot the relative distances (proximities)
ech90.assign(start=ech90['start'] - 5000,
             end=ech90['end'] + 5000)
 | chrom | start | end | length |
---|---|---|---|---|
0 | chrX | 19531773 | 19641984 | 100211 |
1 | chrX | 20930188 | 21051124 | 110936 |
2 | chrX | 36218142 | 36427842 | 199700 |
3 | chrX | 37230935 | 37637028 | 396093 |
4 | chrX | 49511522 | 50019126 | 497604 |
5 | chrX | 50816385 | 51449493 | 623108 |
6 | chrX | 53897995 | 54111817 | 203822 |
7 | chrX | 62091538 | 62298349 | 196811 |
8 | chrX | 70884995 | 71154673 | 259678 |
9 | chrX | 71783766 | 72149275 | 355509 |
10 | chrX | 74336598 | 74640686 | 294088 |
11 | chrX | 96245950 | 96344522 | 88572 |
12 | chrX | 108173096 | 108564466 | 381370 |
13 | chrX | 111297881 | 111601540 | 293659 |
14 | chrX | 124073238 | 124182167 | 98929 |
15 | chrX | 126833587 | 127249031 | 405444 |
16 | chrX | 128398780 | 128706766 | 297986 |
17 | chrX | 129729920 | 130039306 | 299386 |
18 | chrX | 150800639 | 151219408 | 408769 |
# Size of each of the dataframes
print(f"""
rs100_full: {rs100_dict['full'].shape[0]}
rs100_edges: {rs100_dict['edges'].shape[0]}
fb100_full: {fb100_dict['full'].shape[0]}
fb100_edges: {fb100_dict['edges'].shape[0]}
hama_dict: {hama_dict['full'].shape[0]}
ech_dict: {ech_dict['full'].shape[0]}
""")
rs100_full: 77
rs100_edges: 96
fb100_full: 59
fb100_edges: 83
hama_dict: 31
ech_dict: 19
import seaborn as sns
from genominterv.remapping import remap_interval_data
n = 1000
a = np.sort(np.random.randint(1, 10_000_000, size=n))
annot = pd.DataFrame(dict(chrom='chrX', start=a, end=a+10))
q = np.sort(np.random.randint(1, 10_000_000, size=n))
query = pd.DataFrame(dict(chrom='chrX', start=q, end=q+10))
df1 = remap_interval_data(query, annot, relative=True)
df1['mid'] = (df1.start + df1.end) / 2
df1['absmid'] = df1.mid.abs()

# dummy data
a = np.sort(np.random.randint(1, 10_000_000, size=n))
annot = pd.DataFrame(dict(chrom='chrX', start=a, end=a+10))
q = a + np.random.randint(-1000, 1000, size=n)
query = pd.DataFrame(dict(chrom='chrX', start=q, end=q+10))
df2 = remap_interval_data(query, annot, relative=True)
df2['mid'] = (df2.start + df2.end) / 2
df2['absmid'] = df2.mid.abs()

# Real data
q1 = hama_dict['full']
a1 = rs100_dict['full']
df3 = remap_interval_data(q1, a1, relative=True)
df3['mid'] = (df3.start + df3.end) / 2
df3['absmid'] = df3.mid.abs()

q2 = hama_dict['full']
a2 = fb100_dict['full']
df4 = remap_interval_data(q2, a2, relative=True)
df4['mid'] = (df4.start + df4.end) / 2
df4['absmid'] = df4.mid.abs()

sns.histplot(df3, x='mid', bins=np.linspace(-0.5, 0.5, 20))
sns.histplot(df4, x='mid', bins=np.linspace(-0.5, 0.5, 20))

sns.histplot(df3, x='absmid', bins=np.linspace(0, 0.5, 20))
sns.histplot(df4, x='absmid', bins=np.linspace(0, 0.5, 20))
Geneinfo
How do the edges align with genes?
This first plot is just to figure out how to plot with gene_plot.
import geneinfo as gi
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
# Use the proximity test results to plot the ECH90 regions and the compartment edges
start = full_intersect['start'][2]
end = full_intersect['end'][5]

rectangles = [Rectangle((start, 0.1), width=end-start, height=0.9,
                        color='tab:green', linewidth=0, alpha=0.6)
              for start, end in zip(full_intersect['start'][2:6],
                                    full_intersect['end'][2:6])]

pc = PatchCollection(rectangles, match_original=True)

ax = gi.gene_plot('chrX', start-100_000, end+100_000, assembly='rheMac10')
ax.add_collection(pc)
Get the geneinfo for all intersections between edges and ECH90
and write it to a CSV file. If the file already exists, read it with pandas.
# Use get_genes_region
import os.path as op
import geneinfo as gi
import pandas as pd
from os import makedirs as mkdir

genes_dir = '../results/rec_edge_genes/'
if not op.exists(genes_dir):
    mkdir(genes_dir)

genes_file = op.join(genes_dir, 'rs_edges_100kb_genes.csv')

if not op.exists(genes_file):
    genes = pd.concat(
        full_intersect.apply(
            lambda x: gi.get_genes_region_dataframe('chrX', x['start'], x['end'], assembly='rheMac10'),
            axis=1
        ).to_list(),
        ignore_index=True
    )
    genes.to_csv(genes_file, index=False)
else:
    genes = pd.read_csv(genes_file)
genes_list = genes['name'].unique().tolist()
genes_list
['SH3KBP1',
'MIR7206',
'LANCL3',
'XK',
'CYBB',
'LOC696657',
'DYNLT3',
'PAGE4',
'USP27X',
'CLCN5',
'LOC114675180',
'MIR532',
'MIR188',
'MIR500A',
'MIR362',
'MIR501',
'MIR500B',
'MIR660',
'MIR502',
'AKAP4',
'CCNB3',
'LOC114675218',
'LOC695959',
'CENPVL3',
'FAM120C',
'WNK3',
'LOC114675302',
'ZC3H12B',
'LAS1L',
'MSN',
'ATRX',
'MAGT1',
'LOC114675151',
'COX7B',
'ATP7A',
'ALG13',
'LOC706958',
'TRPC5',
'ENOX2',
'RAP2C',
'LOC114675176',
'DKC1',
'LOC114675231',
'MPP1',
'SMIM9',
'F8',
'H2AFB3',
'FUNDC2',
'CMC4',
'MTCP1',
'BRCC3',
'LOC703257']
import geneinfo as gi
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
# Use the proximity test results to plot the ECH90 regions and the compartment edges
for i in full_intersect.index:
    start_idx = i
    end_idx = i

    start = full_intersect['start'][start_idx]
    end = full_intersect['end'][end_idx]

    rectangles = [Rectangle((start, 0.1), width=end-start, height=0.9,
                            color='tab:green', linewidth=0, alpha=0.6)
                  for start, end in zip(full_intersect['start'][start_idx:end_idx+1],
                                        full_intersect['end'][start_idx:end_idx+1])]

    pc = PatchCollection(rectangles, match_original=True)

    ax = gi.gene_plot('chrX', start-100_000, end+100_000, assembly='rheMac10',
                      highlight=genes_list,
                      despine=True,
                      figsize=(5, 5),
                      aspect=5,
                      )
    ax.add_collection(pc)
What can I do with the list of genes on the edges?
GO enrichment?
mmul_x_genes = gi.get_genes_region_dataframe('chrX', 0, 155_000_000, assembly='rheMac10')
mmul_x_genelist = mmul_x_genes['name'].unique().tolist()
gene_list = genes['name'].unique().tolist()
taxid = 9544
gi.email('sojernj@gmail.com')
#gi.go_annotation_table(taxid=taxid)
#gi.show_go_evidence_codes()
go_terms = gi.get_go_terms_for_genes(gene_list, taxid=taxid)
len(go_terms)
#gene_list[:5]
127
results = gi.go_enrichment(
    # Use human taxid as a start
    gene_list,
    alpha=0.05,
    terms=go_terms)
geneinfo_cache/go-basic.obo: fmt(1.2) rel(2024-10-27) 44,017 Terms; optional_attrs(def relationship)
Could not map gene symbol "MIR7206" to ncbi id
Could not map gene symbol "LOC696657" to ncbi id
Could not map gene symbol "LOC114675180" to ncbi id
---------------------------------------------------------------------------
KeyError: "['MIR7206', 'LOC696657', 'LOC114675180', 'MIR532', 'MIR188', 'MIR500A', 'MIR362', 'MIR501', 'MIR500B', 'MIR660', 'MIR502', 'LOC114675218', 'LOC695959', 'LOC114675302', 'LOC114675151', 'LOC706958', 'LOC114675176', 'LOC114675231', 'H2AFB3', 'LOC703257'] not in index"

During handling of the above exception, another exception occurred:

HTTPError: 400 Client Error: Bad Request for url: https://rest.ensembl.org/xrefs/id/%5B'ENSG00000207758'%5D

The enrichment call fails because these LOC/MIR symbols (and H2AFB3) have no NCBI id mapping; geneinfo then falls back to an Ensembl xrefs lookup, and that request is malformed (a list rather than a single id ends up in the URL), giving the 400 error.
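One way around this (a sketch; it simply skips the unmappable symbols rather than fixing the lookup) is to filter the gene list before rerunning the enrichment:

# Symbols geneinfo could not map to NCBI ids (taken from the error above)
unmapped = {'MIR7206', 'LOC696657', 'LOC114675180', 'MIR532', 'MIR188',
            'MIR500A', 'MIR362', 'MIR501', 'MIR500B', 'MIR660', 'MIR502',
            'LOC114675218', 'LOC695959', 'LOC114675302', 'LOC114675151',
            'LOC706958', 'LOC114675176', 'LOC114675231', 'H2AFB3', 'LOC703257'}

mappable_genes = [g for g in gene_list if g not in unmapped]
results = gi.go_enrichment(mappable_genes, alpha=0.05, terms=go_terms)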