E1 vs. ECH (recommended params)

Overlaying the A-Compartments with Extended Common Haplotypes

The genomic regions in question

In 05_rec_compartments.ipynb we extracted the genomic intervals of A compartments on all cell types in all combinations of the following parameters:

Cell type: fibroblast, spermatocyte, pachytene spermatocyte, round spermatid, sperm
Chromosome: X
E1 restriction: full-chromosome, chromosome arms, 10Mb windows
Resolution: 100 kb, 500 kb, ps500kb (smoothed 100kb)

Resulting in 45 .csv files. They are saved to ../results/rec_compartments/.

Setup inline backend

`InlineBackend.rc`

For consistent plots that fit with pdf manus.

import matplotlib.pyplot as plt

# Define my params:
## Params will comply with my desired layout for the Manuscript (PDF)

notebook_rcparams = {
    'font.size': 7,
    'axes.titlesize': 8,
    'axes.labelsize': 7,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'figure.titlesize': 9,
    'figure.figsize': [6.0, 2.0],
    'figure.labelsize': 7,
    'legend.fontsize': 6,
    
}

# Apply config
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
%config InlineBackend.rc = notebook_rcparams

print({key:plt.rcParams[key] for key in notebook_rcparams.keys()})

{'font.size': 10.0, 'axes.titlesize': 'large', 'axes.labelsize': 'medium', 'xtick.labelsize': 'medium', 'ytick.labelsize': 'medium', 'figure.titlesize': 'large', 'figure.figsize': [6.4, 4.8], 'figure.labelsize': 'large', 'legend.fontsize': 'medium'}

## Apparently IPython needs this twice to understand?? 
import matplotlib.pyplot as plt

# Define my params:
## Params will comply with my desired layout for the Manuscript (PDF)

notebook_rcparams = {
    'font.size': 7,
    'axes.titlesize': 8,
    'axes.labelsize': 7,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'figure.titlesize': 9,
    'figure.figsize': [6.0, 2.0],
    'figure.labelsize': 7,
    'legend.fontsize': 6,
    
}

# Apply config
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
%config InlineBackend.rc = notebook_rcparams

print({key:plt.rcParams[key] for key in notebook_rcparams.keys()})

{'font.size': 7.0, 'axes.titlesize': 8.0, 'axes.labelsize': 7.0, 'xtick.labelsize': 6.0, 'ytick.labelsize': 6.0, 'figure.titlesize': 9.0, 'figure.figsize': [6.0, 2.0], 'figure.labelsize': 7.0, 'legend.fontsize': 6.0}

Import compartments

import pandas as pd
import os

# Directory containing your .csv files
csv_dir = '../results/rec_compartments/'

# Create a dictionary to store the DataFrames
dataframes = {}

# Iterate over all .csv files in the directory
for filename in os.listdir(csv_dir):
    if filename.endswith('.csv') and 'e1' not in filename:  # Check for .csv files
        # Construct the full file path
        filepath = os.path.join(csv_dir, filename)
        
        # Load the CSV into a DataFrame
        # Use the filename (without extension) as the dictionary key
        key = filename.replace('_a_comp_coords_', '_')
        key = os.path.splitext(key)[0]
        dataframes[key] = pd.read_csv(filepath)
        dataframes[key]['length'] = dataframes[key]['end'] - dataframes[key]['start']

# The `dataframes` dictionary now contains the DataFrames
dataframes.keys()

ech90 = pd.read_csv('../data/ech90_human_Mmul_10.csv')
ech90['length'] = ech90['end'] - ech90['start']

# Load the chromosome sizes
chromsizes = (pd.read_csv(
    '../data/rheMac10.filtered.chrom.sizes', 
    sep='\t', 
    index_col='chrom', 
    header=None, 
    names=['chrom','size'])
.to_dict()['size']
)

Define a plotting function

# Kaspers plotting function
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%config InlineBackend.figure_format = 'svg'

def plot_intervals(query=None, annot=None, **kwargs):

    tups = []
    if query is not None:
        tups.append(('query', query))
    if annot is not None:
        tups.append(('annot', annot))
    tups.extend(kwargs.items())
    tups = reversed(tups)

    df_list = []
    labels = []
    for label, df in tups:
        labels.append(label)
        df['label'] = label
        df_list.append(df)
    bigdf = pd.concat(df_list)

    bigdf['chrom'] = pd.Categorical(bigdf['chrom'], bigdf['chrom'].unique())
    bigdf['label'] = pd.Categorical(bigdf['label'], bigdf['label'].unique())

    gr = bigdf.groupby('chrom', observed=False)
    fig, axes = plt.subplots(gr.ngroups, 1, figsize=(10, 1.5*gr.ngroups), 
                            sharey=True
                            #  sharex=True
                             )
    if type(axes) is not np.ndarray:
        # in case there is only one axis so it not returned as a list
        axes = np.array([axes])
    
    # with plt.style.context(('default')):

    for i, chrom in enumerate(gr.groups):
        _df = gr.get_group(chrom)
        _gr = _df.groupby('label', observed=False)
        for y, label in enumerate(_gr.groups):
            try:
                df = _gr.get_group(label)
            except KeyError:
                continue
            y = np.repeat(y, df.index.size)
            axes[i].hlines(y, df.start.tolist(), df.end.tolist(), alpha=0.5, lw=5, colors=f'C{y[0]}')
            delta = len(labels)/10
            axes[i].vlines(df.start.tolist(), y-delta, y+delta, alpha=0.5, lw=2, colors=f'C{y[0]}')
            axes[i].vlines(df.end.tolist(), y-delta, y+delta, alpha=0.5, lw=2, colors=f'C{y[0]}')

        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)

        axes[i].set_yticks(list(range(len(labels))), labels)
        axes[i].tick_params(axis='y', which='both', left=False)
        axes[i].set_ylim(-1, len(labels)-0.7)
        # axes[i].set_xlim(df.start.min()-delta, df.end.max()+delta)
        if i != gr.ngroups-1:
            axes[i].tick_params(axis='x', which='both', bottom=False)

        axes[i].set_title(chrom, loc='left', fontsize=10)
    plt.tight_layout()

# MY PLOTTING FUNCTION

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Rectangle
from mpl_toolkits.axes_grid1 import make_axes_locatable
from genominterv import interval_intersect


def plot_regions(query=None, annot=None, intersect=None, 
                 track_titles=['query', 'annot', 'intersect'], 
                 figsize=(10, 1),
                 title='chrom',
                 alpha=0.8, lw=0):
    chrom = annot['chrom'].unique()[0]
    chromsizes = pd.read_csv('../data/rheMac10.filtered.chrom.sizes', sep='\t', header=None, names=['chrom', 'size'])

    base_size = chromsizes[chromsizes['chrom'] == chrom]['size'].values[0]
    annot_size = annot['end'].max()
    query_size = query['end'].max()
    chromsize = max(base_size, annot_size, query_size)


    if title == 'chrom':
        title = chrom
    

    # Define the plot size

    f, ax = plt.subplots(figsize=figsize, sharex=True)

    ax.spines[:].set_visible(False)

    # Plot the annot
    # Iterate over each interval in the DataFrame
    for start, end in zip(annot['start'], annot['end']):
        rect = Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:red', linewidth=lw, alpha=alpha)
        ax.add_patch(rect)
        ax.spines['bottom'].set_visible(True)
        ax.set_ylabel(track_titles[1], rotation=0, labelpad=10, ha='right', va='center')

    
    divider = make_axes_locatable(ax)

    if query is not None:
        qax = divider.append_axes("top", size="100%", pad=0.05, sharex=ax)
        qax.xaxis.set_visible(False)
        # Plot the query
        for start, end in zip(query['start'], query['end']):
            rect = Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:blue', linewidth=lw, alpha=alpha)
            qax.add_patch(rect)
            qax.spines[:].set_visible(False)
            qax.set_yticks([]) 
            qax.set_title(title, loc='left')
            qax.set_ylabel(track_titles[0], rotation=0, labelpad=10, ha='right', va='center')
    
    if intersect is not None:
        iax = divider.append_axes("bottom", size="100%", pad=0.05, sharex=ax)
        # Invisible x-axis for 'annot' (intersect ie below) 
        ax.xaxis.set_visible(False)
        # Plot the intersect
        for start, end in zip(intersect['start'], intersect['end']):
            rect = Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:green', linewidth=lw, alpha=alpha)
            iax.add_patch(rect)
            iax.spines[:].set_visible(False)
            iax.set_yticks([]) 
            ax.spines['bottom'].set_visible(False)
            iax.spines['bottom'].set_visible(True)
            iax.set_ylabel(track_titles[2], rotation=0, labelpad=10, ha='right', va='center')



    ax.set_yticks([])
    ax.set_xlim(0, chromsize)
    ticks = np.linspace(0, chromsize, num=5)
    ax.set_xticks(ticks) 
    ax.set_xticklabels([f'{int(t/1e6)} Mbp' for t in ticks])
    f.tight_layout()
    plt.show()
    #plt.tight_layout()

Time to unleash `genominterv` on the .csv files

Test with a subsample of the data


annot = dataframes['round_spermatid_100kb_arms']
query = ech90
intersect = interval_intersect(annot, query)

plot_intervals(query, annot, intersection=intersect)
plot_regions(query, annot, intersect)

from genominterv import proximity_test, interval_collapse, interval_diff, interval_intersect, jaccard_stat

annot = dataframes['round_spermatid_100kb_arms']
query = ech90

#plot_intervals(query=query, annot=annot)

for key,annot in dataframes.items():
    # Filter out subset
    if ('round_spermatid_100') in key and not 'full' in key:
        # Plot the intervals
        intersection = interval_intersect(query, annot)
        plot_intervals(query=query, annot=annot, intersection=intersection)
        plt.title(key)

        # Do a proximity test
        print(f"Tests for {key}")
        annot_collapsed = interval_collapse(annot)
        non_ovl_query = interval_diff(query, annot_collapsed)
        print("Proximity:", proximity_test(non_ovl_query, annot_collapsed))
        print("Jaccard:", jaccard_stat(query, annot))
        print()

Tests for round_spermatid_100kb_arms_smoothed
Proximity: TestResult(statistic=0.3711999999999999, pvalue=0.0204)
Jaccard: 0.03771152626674943

Tests for round_spermatid_100kb_10Mb
Proximity: TestResult(statistic=0.51525, pvalue=0.0)
Jaccard: 0.046023479610810235

Tests for round_spermatid_100kb_10Mb_smoothed
Proximity: TestResult(statistic=0.46520000000000006, pvalue=0.0049)
Jaccard: 0.042623832902054265

Tests for round_spermatid_100kb_arms
Proximity: TestResult(statistic=0.37199999999999966, pvalue=0.0121)
Jaccard: 0.046543092859550536

Bootstrap to get a p-value

from genominterv import bootstrap

annot = dataframes['round_spermatid_100kb_arms']
chromsizes = pd.read_csv('../data/rheMac10.filtered.chrom.sizes', sep='\t', index_col='chrom', header=None, names=['chrom','size']).to_dict()['size']
#display(chromsizes)

@bootstrap(chromsizes, samples=1000)
def jaccard_bootstrap(query, annot):
    return jaccard_stat(query, annot)

jacccard_stat, p_value = jaccard_bootstrap(query, annot)

jacccard_stat, p_value

(0.046543092859550536, 0.112)

Partition the A-compartments into regions around the edges

from genominterv import interval_collapse, interval_union

df = dataframes['round_spermatid_100kb_arms']

start_edge = pd.DataFrame({
    'chrom': df['chrom'],
    'start': df.apply(lambda x: max(x['start'] - x['resolution'], 0), axis=1),
    'end': df['start']+1*df['resolution'],
    'resolution': df['resolution'],
    'label': 'start_edge'
})
end_edge = pd.DataFrame({
    'chrom': df['chrom'],
    'start': df['end']-1*df['resolution'],
    'end': df.apply(lambda x: min(x['end'] + x['resolution'], chromsizes['chrX']), axis=1),
    'resolution': df['resolution'],
    'label': 'end_edge'
})

# The end cannot exceed the chromosome size
test_df = pd.concat([start_edge, end_edge]).sort_values(['chrom', 'start', 'end'])
interval_collapse(test_df)

	start	end	chrom
0	800000	1000000	chrX
1	1500000	1800000	chrX
2	2500000	2700000	chrX
3	2800000	3300000	chrX
4	3400000	3600000	chrX
...	...	...	...
91	148700000	148900000	chrX
92	149000000	149400000	chrX
93	151100000	151300000	chrX
94	152400000	152600000	chrX
95	153200000	153388924	chrX

96 rows × 3 columns

import os
from genominterv import interval_collapse

for key, df in dataframes.items():
    outdir = '../results/rec_edges'
    edge_csv_name = os.path.join(outdir,f'{key+'_edges.csv'}')
    if not os.path.exists(edge_csv_name):
        res = df['resolution'].unique()[0]

        start_edge = pd.DataFrame({
            'chrom': df['chrom'],
            'start': df.apply(lambda x: max(x['start'] - x['resolution'], 0), axis=1),
            'end': df['start']+1*df['resolution'],
            'resolution': df['resolution'],
            'label': 'start_edge'
        })
        end_edge = pd.DataFrame({
            'chrom': df['chrom'],
            'start': df['end']-1*df['resolution'],
            'end': df.apply(lambda x: min(x['end'] + x['resolution'], chromsizes['chrX']), axis=1),
            'resolution': df['resolution'],
            'label': 'end_edge'
        })

        if not os.path.exists(outdir):
            os.makedirs(outdir)

        tmp = pd.concat([start_edge, end_edge]).sort_values(['chrom', 'start', 'end'])
        interval_collapse(tmp).assign(resolution=res).to_csv(edge_csv_name, index=False)

Import edges

import pandas as pd
import os

# Directory containing your .csv files
csv_dir = '../results/rec_edges/'

# Create a dictionary to store the DataFrames
edge_df = {}

# Iterate over all .csv files in the directory
for filename in os.listdir(csv_dir):
    if filename.endswith('.csv'):  # Check for .csv files
        # Construct the full file path
        filepath = os.path.join(csv_dir, filename)
        
        # Load the CSV into a DataFrame
        # Use the filename (without extension) as the dictionary key
        key = filename.replace('_edges_', '')
        key = os.path.splitext(key)[0]
        edge_df[key] = pd.read_csv(filepath)
        edge_df[key]['length'] = edge_df[key]['end'] - edge_df[key]['start']

# The `edges` dictionary now contains the DataFrames
print(edge_df.keys())
print(edge_df['fibroblast_100kb_10Mb_edges'].columns)

#ech90 = pd.read_csv('../data/ech90_human_Mmul_10.csv')

dict_keys(['pachytene_spermatocyte_100kb_arms_smoothed_edges', 'round_spermatid_100kb_arms_smoothed_edges', 'fibroblast_500kb_arms_edges', 'round_spermatid_500kb_10Mb_edges', 'fibroblast_100kb_full_edges', 'round_spermatid_500kb_arms_edges', 'spermatogonia_100kb_arms_smoothed_edges', 'fibroblast_500kb_10Mb_edges', 'round_spermatid_100kb_full_edges', 'spermatogonia_500kb_arms_edges', 'spermatogonia_100kb_full_edges', 'pachytene_spermatocyte_100kb_arms_edges', 'sperm_500kb_full_edges', 'pachytene_spermatocyte_500kb_full_edges', 'sperm_100kb_arms_edges', 'sperm_100kb_arms_smoothed_edges', 'sperm_100kb_10Mb_edges', 'pachytene_spermatocyte_100kb_10Mb_edges', 'fibroblast_100kb_arms_smoothed_edges', 'spermatogonia_500kb_10Mb_edges', 'round_spermatid_100kb_full_smoothed_edges', 'fibroblast_500kb_full_edges', 'round_spermatid_100kb_10Mb_edges', 'pachytene_spermatocyte_100kb_full_smoothed_edges', 'fibroblast_100kb_arms_edges', 'sperm_100kb_10Mb_smoothed_edges', 'round_spermatid_500kb_full_edges', 'fibroblast_100kb_10Mb_edges', 'round_spermatid_100kb_arms_edges', 'fibroblast_100kb_10Mb_smoothed_edges', 'spermatogonia_100kb_full_smoothed_edges', 'spermatogonia_500kb_full_edges', 'spermatogonia_100kb_arms_edges', 'pachytene_spermatocyte_100kb_full_edges', 'sperm_500kb_arms_edges', 'sperm_100kb_full_smoothed_edges', 'pachytene_spermatocyte_100kb_10Mb_smoothed_edges', 'pachytene_spermatocyte_500kb_arms_edges', 'sperm_100kb_full_edges', 'round_spermatid_100kb_10Mb_smoothed_edges', 'spermatogonia_100kb_10Mb_smoothed_edges', 'fibroblast_100kb_full_smoothed_edges', 'pachytene_spermatocyte_500kb_10Mb_edges', 'sperm_500kb_10Mb_edges', 'spermatogonia_100kb_10Mb_edges'])
Index(['start', 'end', 'chrom', 'resolution', 'length'], dtype='object')

from genominterv import interval_intersect

samples = dataframes.keys()
#samples = ['round_spermatid_100kb_arms', 'round_spermatid_100kb_10Mb', 'sperm_100kb_arms', 'sperm_100kb_10Mb']

comps = {key:dataframes[key] for key in samples}
edges = {key:edge_df[f'{key}_edges'] for key in samples}

comp_intersects = {key:interval_intersect(comp, ech90).assign(length = lambda x: x['end'] - x['start']) for key, comp in comps.items()}
edge_intersects = {key:interval_intersect(edge, ech90).assign(length = lambda x: x['end'] - x['start']) for key, edge in edges.items()}




## For a single sample only ##
# sample = 'round_spermatid_100kb_arms'

# full_df = dataframes[sample]
# full_intersect = interval_intersect(full_df, ech90).assign(length=lambda x: x['end'] - x['start'])
# edge_intersect = interval_intersect(edge_df[f'{sample}_edges'], ech90).assign(length=lambda x: x['end'] - x['start'])

# edge_df = edge_df[f'{sample}_edges']

# # Plot full
# plot_regions(ech90, full_df, full_intersect)
# plt.suptitle('All edges')

# # Plot edge
# plot_regions(ech90, edge_df, edge_intersect)
# plt.suptitle('Edges only')

### Some stats about the data and intersections ###

stats = pd.DataFrame({
    'Sample': samples,
    'Total regions': [comp.shape[0] for comp in comps.values()],
    'Total regions on ECH90': [comp_intersect.shape[0] for comp_intersect in comp_intersects.values()],
    'Total regions on edges': [edge_intersect.shape[0] for edge_intersect in edge_intersects.values()],
    'Prop regions on ECH90': [comp_intersect.shape[0] / ech90.shape[0] for comp, comp_intersect in zip(comps.values(), comp_intersects.values())],
    'Total bp': [comp['length'].sum() for comp in comps.values()],
    'Total bp on ECH90': [comp_intersect['length'].sum() for comp_intersect in comp_intersects.values()],
    'Total bp on edges': [edge_intersect['length'].sum() for edge_intersect in edge_intersects.values()],
    'Prop bp on ECH90': [comp_intersect['length'].sum() / ech90['length'].sum() for comp, comp_intersect in zip(comps.values(), comp_intersects.values())],
})


#df[['Source', 'Rest']] = df['Sample'].str.extract(r'^(.*?)(_\d+\w+.*)')
stats[['source', 'res', 'view', 'smoothed']] = stats['Sample'].str.extract(r'^(.*?)_(100kb|500kb)_(full|arms|10Mb)(?:_(smoothed))?$')
stats['smoothed'] = stats['smoothed'].notna() # Convert to boolean
stats[['source', 'res', 'view', 'smoothed','Prop regions on ECH90', 'Prop bp on ECH90']]

	source	res	view	smoothed	Prop regions on ECH90	Prop bp on ECH90
0	spermatogonia	500kb	arms	False	0.421053	0.489351
1	round_spermatid	500kb	full	False	0.368421	0.418981
2	fibroblast	500kb	full	False	0.684211	0.529916
3	pachytene_spermatocyte	100kb	arms	True	0.684211	0.632145
4	pachytene_spermatocyte	500kb	full	False	0.368421	0.362651
5	pachytene_spermatocyte	100kb	full	False	0.473684	0.365265
6	round_spermatid	100kb	arms	True	0.631579	0.412828
7	fibroblast	100kb	arms	True	0.684211	0.499297
8	fibroblast	100kb	full	False	0.842105	0.550167
9	round_spermatid	100kb	full	False	0.421053	0.472204
10	spermatogonia	100kb	arms	False	0.526316	0.550008
11	spermatogonia	100kb	10Mb	False	0.631579	0.567910
12	sperm	100kb	full	False	0.631579	0.674746
13	sperm	100kb	arms	True	0.578947	0.572887
14	spermatogonia	500kb	10Mb	False	0.684211	0.689415
15	sperm	500kb	full	False	0.631579	0.559567
16	spermatogonia	100kb	arms	True	0.526316	0.549381
17	pachytene_spermatocyte	100kb	10Mb	False	0.789474	0.630109
18	round_spermatid	100kb	10Mb	False	0.684211	0.519772
19	round_spermatid	100kb	10Mb	True	0.578947	0.483689
20	fibroblast	100kb	10Mb	False	0.684211	0.494509
21	fibroblast	100kb	10Mb	True	0.526316	0.414530
22	fibroblast	100kb	full	True	0.736842	0.591105
23	sperm	100kb	arms	False	0.736842	0.578219
24	round_spermatid	100kb	full	True	0.368421	0.418981
25	fibroblast	500kb	10Mb	False	0.473684	0.359088
26	round_spermatid	500kb	10Mb	False	0.578947	0.496303
27	sperm	500kb	arms	False	0.421053	0.395138
28	pachytene_spermatocyte	100kb	10Mb	True	0.631579	0.665339
29	pachytene_spermatocyte	500kb	10Mb	False	0.631579	0.538153
30	pachytene_spermatocyte	100kb	full	True	0.368421	0.401552
31	round_spermatid	500kb	arms	False	0.684211	0.655953
32	spermatogonia	500kb	full	False	0.421053	0.392944
33	fibroblast	500kb	arms	False	0.684211	0.550729
34	spermatogonia	100kb	10Mb	True	0.631579	0.580640
35	spermatogonia	100kb	full	True	0.368421	0.386720
36	sperm	500kb	10Mb	False	0.526316	0.438818
37	pachytene_spermatocyte	500kb	arms	False	0.631579	0.650289
38	pachytene_spermatocyte	100kb	arms	False	0.789474	0.584619
39	sperm	100kb	10Mb	True	0.578947	0.549366
40	sperm	100kb	full	True	0.578947	0.677841
41	fibroblast	100kb	arms	False	0.736842	0.510641
42	spermatogonia	100kb	full	False	0.473684	0.374801
43	round_spermatid	100kb	arms	False	0.789474	0.500366
44	sperm	100kb	10Mb	False	0.684211	0.545798

Do testing of the edges

%%capture
# Define what we are testing

print("""
Goal: To test whether ECH90 regions are enriched in compartment edges
Query: ECH90
Annotation: Start and end edges of compartments

Hypothesis: 
      ECH90 regions are enriched in compartment edges
Null hypothesis: 
      ECH90 regions are not enriched in compartment edges

Tests: 
* Proximity test: 
      tests whether the query regions are closer to 
      the annotation regions than expected by chance. 
      NB regions can not overlap, so we need to collapse the annotation regions

* Jaccard index: 
      tests the similarity between the query and annotation regions, 
      where a value of 1 indicates perfect overlap
""")

Big test don’t run

Parallelized proximity test

set samples = 100_000

import os.path as op
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
from genominterv.stats import proximity_test
from genominterv import interval_diff


def process_proximity(sample, nsamples = 100000):
    query_comp = ech90
    query_edge = ech90
    annot_comp = comps[sample]
    annot_edge = edges[sample]

    # Get non-overlapping query regions
    non_ovl_query_comp = interval_diff(query_comp, annot_comp)
    non_ovl_query_edge = interval_diff(query_edge, annot_edge)

    proximity_comp = proximity_test(non_ovl_query_comp, annot_comp, 
                                    nsamples, two_sided=False,
                                    overlap_as_zero=True)
    proximity_edge = proximity_test(non_ovl_query_edge, annot_edge, 
                                    nsamples, two_sided=False,
                                    overlap_as_zero=True,span_as_zero=True)

    results = [
        {'Sample': sample, 'Query': 'ECH90', 'Comp Statistic': proximity_comp.statistic, 
         'Edge Statistic': proximity_edge.statistic, 'Comp P-value': proximity_comp.pvalue, 
         'Edge P-value': proximity_edge.pvalue}
    ]

    # Swap query and annotation
    query_comp, annot_comp = annot_comp, query_comp
    query_edge, annot_edge = annot_edge, query_edge

    non_ovl_query_comp = interval_diff(query_comp, annot_comp)
    non_ovl_query_edge = interval_diff(query_edge, annot_edge)

    proximity_comp = proximity_test(non_ovl_query_comp, annot_comp, 
                                    nsamples, two_sided=False,
                                    overlap_as_zero=True)
    proximity_edge = proximity_test(non_ovl_query_edge, annot_edge, 
                                    nsamples, two_sided=False,
                                    overlap_as_zero=True)

    results.append(
        {'Sample': sample, 'Query': 'Edge', 'Comp Statistic': proximity_comp.statistic, 
         'Edge Statistic': proximity_edge.statistic, 'Comp P-value': proximity_comp.pvalue, 
         'Edge P-value': proximity_edge.pvalue}
    )

    return results

### End of function definition ###


nsamples = 100_000
proximity_file = f'../results/proximity_test_{nsamples}.csv'
if op.exists(proximity_file):
    proximity_res = Parallel(n_jobs=-1)(
        delayed(process_proximity)(sample, nsamples) for sample in tqdm(samples)
    )

    # Flatten the results
    proximity_res = pd.DataFrame([item for sublist in proximity_res for item in sublist])
    proximity_res.to_csv(proximity_file, index=False)

100%|██████████| 2/2 [00:00<00:00, 120.90it/s]

---------------------------------------------------------------------------
_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/sojern/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
    r = call_item()
        ^^^^^^^^^^^
  File "/home/sojern/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__
    return self.fn(*self.args, **self.kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sojern/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_2160200/2582913803.py", line 22, in process_proximity
TypeError: proximity_test() got an unexpected keyword argument 'span_as_zero'
"""

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[15], line 60
     58 proximity_file = f'../results/proximity_test_{nsamples}.csv'
     59 if op.exists(proximity_file):
---> 60     proximity_res = Parallel(n_jobs=-1)(
     61         delayed(process_proximity)(sample, nsamples) for sample in tqdm(samples)
     62     )
     64     # Flatten the results
     65     proximity_res = pd.DataFrame([item for sublist in proximity_res for item in sublist])

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/parallel.py:2007, in Parallel.__call__(self, iterable)
   2001 # The first item from the output is blank, but it makes the interpreter
   2002 # progress until it enters the Try/Except block of the generator and
   2003 # reaches the first `yield` statement. This starts the asynchronous
   2004 # dispatch of the tasks to the workers.
   2005 next(output)
-> 2007 return output if self.return_generator else list(output)

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/parallel.py:1650, in Parallel._get_outputs(self, iterator, pre_dispatch)
   1647     yield
   1649     with self._backend.retrieval_context():
-> 1650         yield from self._retrieve()
   1652 except GeneratorExit:
   1653     # The generator has been garbage collected before being fully
   1654     # consumed. This aborts the remaining tasks if possible and warn
   1655     # the user if necessary.
   1656     self._exception = True

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/parallel.py:1754, in Parallel._retrieve(self)
   1747 while self._wait_retrieval():
   1748 
   1749     # If the callback thread of a worker has signaled that its task
   1750     # triggered an exception, or if the retrieval loop has raised an
   1751     # exception (e.g. `GeneratorExit`), exit the loop and surface the
   1752     # worker traceback.
   1753     if self._aborting:
-> 1754         self._raise_error_fast()
   1755         break
   1757     # If the next job is not ready for retrieval yet, we just wait for
   1758     # async callbacks to progress.

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/parallel.py:1789, in Parallel._raise_error_fast(self)
   1785 # If this error job exists, immediately raise the error by
   1786 # calling get_result. This job might not exists if abort has been
   1787 # called directly or if the generator is gc'ed.
   1788 if error_job is not None:
-> 1789     error_job.get_result(self.timeout)

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/parallel.py:745, in BatchCompletionCallBack.get_result(self, timeout)
    739 backend = self.parallel._backend
    741 if backend.supports_retrieve_callback:
    742     # We assume that the result has already been retrieved by the
    743     # callback thread, and is stored internally. It's just waiting to
    744     # be returned.
--> 745     return self._return_or_raise()
    747 # For other backends, the main thread needs to run the retrieval step.
    748 try:

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/parallel.py:763, in BatchCompletionCallBack._return_or_raise(self)
    761 try:
    762     if self.status == TASK_ERROR:
--> 763         raise self._result
    764     return self._result
    765 finally:

TypeError: proximity_test() got an unexpected keyword argument 'span_as_zero'

Parallelized Jaccard test

from joblib import Parallel, delayed
from tqdm import tqdm
import os.path as op
from genominterv.stat import jaccard_stat
from genominterv.decorators import bootstrap
from datetime import datetime
import pandas as pd

b = 100_000
@bootstrap(chromsizes, samples=b)
def jaccard_bootstrap(query, annot):
    return jaccard_stat(query, annot)

def process_sample(sample):
    """Function to process a single sample."""
    query_comp = ech90
    query_edge = ech90
    annot_comp = comps[sample]
    annot_edge = edges[sample]

    jaccard_comp = jaccard_bootstrap(query_comp, annot_comp)
    jaccard_edge = jaccard_bootstrap(query_edge, annot_edge)

    result = [{
        'Sample': sample, 
        'Query': 'ECH90', 
        'Comp Index': jaccard_comp[0], 
        'Edge Index': jaccard_edge[0], 
        'Comp P-value': jaccard_comp[1], 
        'Edge P-value': jaccard_edge[1]
        }
    ]

    # Swap query and annotation for the Edge test
    query_comp, annot_comp = annot_comp, query_comp
    query_edge, annot_edge = annot_edge, query_edge

    jaccard_comp = jaccard_bootstrap(query_comp, annot_comp)
    jaccard_edge = jaccard_bootstrap(query_edge, annot_edge)

    result.append({
        'Sample': sample, 
        'Query': 'Edge', 
        'Comp Index': jaccard_comp[0], 
        'Edge Index': jaccard_edge[0], 
        'Comp P-value': jaccard_comp[1], 
        'Edge P-value': jaccard_edge[1]
        }
    )
    return result

jaccard_file = f'../results/jaccard_test_{b}.csv'

if not op.exists(jaccard_file):

    # Run parallel computation
    print(f"Running bootstrap ({b}) for all samples in parallel")
    results = Parallel(n_jobs=-1)(
        delayed(process_sample)(sample) for sample in tqdm(samples)
    )

    # Flatten and create DataFrame
    jaccard_res = pd.DataFrame([item for sublist in results for item in sublist])

    # Write to CSV
    jaccard_res.to_csv(jaccard_file, index=False)

else:
    # Read
    jaccard_res = pd.read_csv(jaccard_file)

Plot some summary statistics

# Split sample column:
proximity_res = pd.read_csv(proximity_file)

proximity_res[['source', 'resolution', 'view', 'smoothed']] = proximity_res['Sample'].str.extract(r'^(.*?)_(100kb|500kb)_(full|arms|10Mb)(?:_(smoothed))?$')
proximity_res['source'] = proximity_res['source'].map({'fibroblast':'Fib', 'spermatogonia': 'SP', 'pachytene_spermatocyte': 'PAC', 'round_spermatid': 'RS', 'sperm': 'Sperm'})
proximity_res['smoothed'] = proximity_res['smoothed'].notna()
proximity_res['resolution'] = proximity_res.apply(lambda x: 'ps500kb' if x['smoothed'] else x['resolution'], axis=1)
proximity_res = proximity_res.melt(
    id_vars=['Sample', 'source', 'resolution', 'view', 'Query'],
    value_vars=['Comp P-value', 'Edge P-value'], 
     var_name='type', value_name='p-value',).query('Query == "ECH90"')

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

source_order = ['Fib', 'SP', 'PAC', 'RS', 'Sperm']
# Choose the combination og parameters to plot
source = ['fibroblast','round_spermatid']
res = ['100kb']
view = ['arms']
annot = ['Edge P-value']

# Plot the results from @group
plot_group_prox = proximity_res.query('resolution in @res and view in @view and type in @annot')

plot_group_prox = plot_group_prox.assign(
    minuslog10p = lambda x: -np.log10(x['p-value']),
    type = 'Proximity')

# Plot the results

sns.set_theme(style="whitegrid", rc=notebook_rcparams)

g = sns.catplot(
    plot_group_prox, kind='bar',
    x='source', y='minuslog10p', hue='type', 
    order=source_order,
    errorbar=None, margin_titles=True, sharex=False,
    )

g.despine(left=True)
g.set_axis_labels("", "-log10(p)")
g.legend.set_title('')


for axis in g.axes.flat:
    axis.tick_params(labelbottom=True)
    # rotate said labels
    for item in axis.get_xticklabels():
        item.set_rotation(45)

g.tight_layout()

# Now do the same for the Jaccard index

jaccard_res = pd.read_csv(jaccard_file)

jaccard_res[['source', 'resolution', 'view', 'smoothed']] = jaccard_res['Sample'].str.extract(r'^(.*?)_(100kb|500kb)_(full|arms|10Mb)(?:_(smoothed))?$')
jaccard_res['source'] = jaccard_res['source'].map({'fibroblast':'Fib', 'spermatogonia': 'SP', 'pachytene_spermatocyte': 'PAC', 'round_spermatid': 'RS', 'sperm': 'Sperm'})
jaccard_res['smoothed'] = jaccard_res['smoothed'].notna()
jaccard_res['resolution'] = jaccard_res.apply(lambda x: 'ps500kb' if x['smoothed'] else x['resolution'], axis=1)
jaccard_res = jaccard_res.melt(
    id_vars=['Sample', 'source', 'resolution', 'view', 'Query'],
    value_vars=['Comp P-value', 'Edge P-value'], 
    var_name='type', value_name='p-value',).query('Query == "ECH90"')

import numpy as np

source_order = ['Fib', 'SP', 'PAC', 'RS', 'Sperm']
# Choose the combination og parameters to plot
source = ['fibroblast','round_spermatid']
res = ['100kb']
view = ['arms']
annot = ['Edge P-value']

# Plot the results from @group
plot_group_jacc = jaccard_res.query('resolution in @res and view in @view and type in @annot')
plot_group_jacc = plot_group_jacc.assign(
    minuslog10p = lambda x: -np.log10(x['p-value']),
    type = 'Jaccard')

# Plot the results

sns.set_theme(style="whitegrid", rc=notebook_rcparams)

g = sns.catplot(
    plot_group_jacc, kind='bar',
    x='source', y='minuslog10p', hue='type', 
    order=source_order,
    errorbar=None, margin_titles=True, sharex=False,
    )

g.despine(left=True)
g.set_axis_labels("", "-log10(p)")
g.legend.set_title('')


for axis in g.axes.flat:
    axis.tick_params(labelbottom=True)
    # rotate said labels
    for item in axis.get_xticklabels():
        item.set_rotation(45)

g.tight_layout()

Plot proximity result with Jaccard result

# Concatenate the two plot_groups

group = pd.concat([plot_group_prox, plot_group_jacc])
# display(group[['source', 'resolution', 'type', 'p-value', 'minuslog10p']].query(
#     'source in ("Fib", "RS")'
# ).reset_index(drop=True))

# g = sns.catplot(
#     group, kind='bar',
#     x='source', y='minuslog10p', hue='type', 
#     order=source_order,
#     errorbar=None, margin_titles=True,
#     height=2, aspect=1.5
#     )
# # Plot a horisontal line at p=0.05
# g.ax.axhline(-np.log10(0.05), color='tab:red', linestyle='--')

f,ax = plt.subplots(figsize=(3,2))

# barplot
g = sns.barplot(
    data=group, x='source', y='minuslog10p', hue='type', 
    order=source_order, ax = ax
    )

ax.axhline(-np.log10(0.05), color='tab:red', linestyle='--')


sns.despine(ax=g, left=True)

g.set_ylabel('-log10(p)')
g.set_xlabel('')
g.get_legend().set_title('')
sns.move_legend(g, 'upper center', bbox_to_anchor=(0.5,0.975))

f.tight_layout()
plt.show()

Figure 7.1: Proximity and Jaccard index p-values for ECH90 regions on compartment edges for all cell types at 100kb resolution at arms view \(p=0.05\) is marked as a red line.

Plot all cominations of the parameters

Avoid this to avoid p-hacking.


# # Define the order of the x-axis if we plot all combinations
# view_order = ['full', 'arms', '10Mb']
# row_order = ['Fib', 'SP', 'PAC', 'RS', 'Sperm']
#
# sns.set_theme(style="whitegrid")
#
# g = sns.catplot(
#     proximity_res, kind='bar', row='source',
#     x='view', y='p-value', hue='type', col='resolution', 
#     order=view_order, row_order=row_order,
#     errorbar=None, margin_titles=True, sharex=False,
#     )
#
# g.despine(left=True)
# g.set_axis_labels("", "P-value")
# g.legend.set_title('')
#
#
# for axis in g.axes.flat:
#     axis.tick_params(labelbottom=True)
#     # rotate said labels
#     for item in axis.get_xticklabels():
#         item.set_rotation(45)
#
# g.tight_layout()


# # # Group the results by source and resolution
# # group = ["fibroblast", 'round_spermatid']
# # # Plot the results from @group
# # plot_group = jaccard_res.query(f'source in @group')
# sns.set_theme(style="whitegrid")

# view_order = ['full', 'arms', '10Mb']
# row_order = ['Fib', 'SP', 'PAC', 'RS', 'Sperm']

# g = sns.catplot(
#     jaccard_res, kind='bar', row='source',
#     x='view', y='p-value', hue='type', col='resolution', 
#     margin_titles=True, sharex=False, 
#     order=view_order, row_order=row_order,
#     errorbar=None)

# g.despine(left=True)
# g.set_axis_labels("", "P-value")
# g.legend.set_title('')
# for axis in g.axes.flat:
#     axis.tick_params(labelbottom=True)
#     # rotate said labels
#     for item in axis.get_xticklabels():
#         item.set_rotation(45)
#         item.set_va('top')

# g.tight_layout()

Revised test (baboon)

Baboon data

Here, the data for hybrid incompatibility in baboons (having Papio hamadryas or Papio anubis ancestry) is introduced and compared with the edge regions of the compartments. The intervals are extracted from baboondiversity project, in the notebook baboon_ancestry_segments.ipynb.

The data originates from a baboon population in a hybridization zone between olive and hamadryas baboons, and the extracted intervals are genomic regions where (almost) all individiuals have the same ancestral allele, 100% from olive or 95% hamadryas. It has nothing to do with Hi-C data, but maybe the intervals align as well?

The intervals were lifted to rheMac10 via hg38: panu_3.0 -> hg38 -> rheMac10: two intervals were deleted with no match in the last step:

#Deleted in new
chrX    58600000    58700000    high_olive
#Deleted in new
chrX    59400000    60800000    high_olive

Additionally, 4 intervals were lifted to the Y chromosome, so they were deleted as well

The tests

We think the regions where hamadryas ancestry is preserved are the most relevant to this analysis, as the regions are shorter and there are not as many.

Here, perform the tests as discussed (meeting)

Edge.v.Edge: (100kb +/- start,end)

jaccard_test
- query: [ech, high_hama]
- annot: [rs100_arms_edge, fb100_arms_edge]
- Total tests: \(4\)
proximity_test(overlap_as_zero=True,span_as_zero=True)
- query: [ech, high_hama]
- annot: [rs100_arms_edge, fb100_arms_edge]
- Total tests: \(4\)

Region.v.Region

jaccard_bootstrap()
- query: [ech, high_hama]
- annot: [rs100_arms_comp, fb100_arms_comp]

Load baboons

# Load baboons, define only high_hama

import pandas as pd
from genominterv import interval_collapse


### Lifted with standard UCSC LiftOver
# baboon_df = pd.read_csv('../results/high_baboon_rhemac10.bed', sep='\t', index_col=False, header=None, names=['chrom', 'start', 'end', 'group', 'score'])
baboon_nonlift = pd.read_csv('../results/hama_olive_high_intervals.tsv', sep='\t', index_col=False)

# # print("Original:")
# # print(baboon_df.groupby('group').size())

# high_hama = (interval_collapse(baboon_df.query('group == "high_hama"')).
#              assign(group = "high_hama").
#              query('chrom == "chrX"')
#              ).reset_index(drop=True)
# high_olive = (interval_collapse(baboon_df.query('group == "high_olive"')).
#               assign(group = "high_olive").
#               query('chrom == "chrX"')
#               )
# print("\nCollapsed:")
# #print(pd.concat([high_hama, high_olive]).groupby('group').size())


# #baboon_dict = {"P.hama": high_hama, "P.anubis": high_olive}
# print(f"high_hama: {high_hama.shape[0]}")

### Lifted with  segment_liftover (`lift` env)

high_hama = pd.read_csv('../data/lift/rheMac10/high_hama_rhemac10.bed', sep='\t', index_col=False, header=None, names=['group', 'chrom', 'start', 'end'])

high_olive = pd.read_csv('../data/lift/rheMac10/high_olive_rhemac10.bed', sep='\t', index_col=False, header=None, names=['group', 'chrom', 'start', 'end'])

Define functions

from genominterv import interval_collapse
from multiprocessing import cpu_count
from genominterv.stats import jaccard_stat, proximity_stat
from genominterv.decorators import bootstrap
import numpy as np
import pandas as pd

# Make edges function

## Make a function that splits an interval into its edges (and collapses overlapping edges)

def make_edges(df, resolution):
    start_edge = pd.DataFrame({
        'chrom': df['chrom'],
        'start': df.apply(lambda x: max(x['start'] - resolution, 0), axis=1),
        'end': df['start']+1*resolution,
        'resolution': resolution,
    })
    end_edge = pd.DataFrame({
        'chrom': df['chrom'],
        'start': df['end']-1*resolution,
        'end': df.apply(lambda x: min(x['end'] + resolution, chromsizes['chrX']), axis=1),
        'resolution': resolution,
    })

    return interval_collapse(pd.concat([start_edge, end_edge]).sort_values(['chrom', 'start', 'end'])).assign(resolution=resolution)


nsamples=100_000

@bootstrap(chromsizes, samples=nsamples)
def proximity_test(q, a):
    """
    Tests if the mean relative distance of query segments to the
    closest annotation is smaller than expected.
    """
    return proximity_stat(q, a)

@bootstrap(chromsizes, samples=nsamples)
def jaccard_test(q, a):
    """"
    Tests if the overlap between query and annotation segments 
    is smaller than expected.
    """
    return jaccard_stat(q, a)

def overlaps(df1, df2):
    """
    Establishes whether each query segment overlaps at least one 
    annotation segment. Returns a boolean array with same length 
    as df1.index.
    """
    overlapping = []
    for i, (s1, e1) in enumerate(zip(df1.start, df1.end)):
        overlaps = False
        for s2, e2 in zip(df2.start, df2.end):
            if e1 > s2 and e2 > s1:
                overlaps = True
                break
        overlapping.append(overlaps)
    return np.array(overlapping)

def svedig_tabel(orig_df, index, columns, values, cmap='Reds'):
    df = (orig_df
        .assign(log10p=np.log10(results.p))
        .loc[(results.p < 0.05)]
        .pivot(index=index, columns=columns, values=values)
    )
    df = df.rename(columns = {x:x.replace('_', '<br>') for x in df.columns.tolist()})
    df = (df.style
        .background_gradient(subset=df.columns, axis=None, cmap=cmap, vmin=0)
        .map(lambda x: 'color: transparent; background-color: transparent' if np.isnan(x) else '')
        .format('{:.3f}')
        .set_table_styles(
                {c: [{'selector': '', 
                        'props': [('min-width', '100px')],
                        }] for c in df.columns}, overwrite=False
        )
    )
    return df

Load comps and edges

# Define the rest and get an overview of the selected samples

samples = ['fibroblast_100kb_arms', 'round_spermatid_100kb_arms']

fb100_dict = {
    'full':dataframes[samples[0]], 
    'edges':make_edges(dataframes[samples[0]], 100_000),
    'limits': make_edges(dataframes[samples[0]], 1)}

rs100_dict = {
    'full':dataframes[samples[1]],
    'edges':edge_df[f'{samples[1]}_edges'],
    'limits': make_edges(dataframes[samples[1]], 1)}

hama_dict = {
    'full': high_hama,
    'edges': make_edges(high_hama, 100_000),
    'limits': make_edges(high_hama, 1)
}

olive_dict = {
    'full': high_olive,
    'edges': make_edges(high_olive, 100_000),
    'limits': make_edges(high_olive, 1)
}

hamaolive_dict = {
    'full': pd.concat([high_hama, high_olive]),
    'edges': make_edges(pd.concat([high_hama, high_olive]), 100_000),
    'limits': make_edges(pd.concat([high_hama, high_olive]), 1)
}

ech_dict = {'full': ech90,
            'edges': make_edges(ech90, 100_000),
            'limits': make_edges(ech90, 1)}

Summarise

# Summarise how much the edges are reduced compared to the full regions

names = ['Fibroblast', 'Round spermatid', 'Hamadryas', 'Olive', 'HamaOlive', 'ECH']
samples = [fb100_dict, rs100_dict, hama_dict, olive_dict, hamaolive_dict, ech_dict]

# 

summary = pd.DataFrame({
    'Sample': names,
    'Full_bp': [sum(sample['full']['end'] - sample['full']['start']) for sample in samples],
    'full_max_bp': [max(sample['full']['end'] - sample['full']['start']) for sample in samples],
    'full_avg_bp': [round(sum(sample['full']['end'] - sample['full']['start']) / sample['full'].shape[0], 1) for sample in samples],
    'full_min_bp': [min(sample['full']['end'] - sample['full']['start']) for sample in samples],
    'Edge_bp': [sum(sample['edges']['end'] - sample['edges']['start']) for sample in samples],
    'edge_avg_bp': [round(sum(sample['edges']['end'] - sample['edges']['start']) / sample['edges'].shape[0], 1) for sample in samples],

}).assign(edge_over_full=lambda x: round(x['Edge_bp'] / x['Full_bp'],3))

# tell pandas to use thousands separator
display(summary.style
        .format("{:,.0f}", subset=['Full_bp', 'full_max_bp', 'full_min_bp', 'Edge_bp'])
        .format("{:,.1f}", subset=['full_avg_bp', 'edge_avg_bp'])
        .format("{:.1%}", subset='edge_over_full')
        )

plot_regions(high_olive, high_hama, ech90, 
             track_titles=['Hama', 'Olive', 'ECH90'],
             title="Edges")

plot_regions(high_olive,baboon_nonlift.query('group == "high_olive"'), 
             track_titles=['Lifted High olive', 'Non-lifted high olive'],
             title="Compare liftOver")

plot_regions(hama_dict['limits'], olive_dict['limits'], ech_dict['limits'], 
             track_titles=['Hama', 'Olive', 'ECH90'], 
             title='Limits', 
             alpha=1, lw=0.3)

plot_regions(olive_dict['limits'], fb100_dict['limits'], rs100_dict['limits'], 
             track_titles=['Hama', 'Fibroblast', 'Round spermatid'],
             title='Limits',
             alpha=1, lw=0.3)

	Sample	Full_bp	full_max_bp	full_avg_bp	full_min_bp	Edge_bp	edge_avg_bp	edge_over_full
0	Fibroblast	64,200,000	5,000,000	1,088,135.6	100,000	21,600,000	260,241.0	33.6%
1	Round spermatid	56,500,000	4,500,000	733,766.2	100,000	27,088,924	282,176.3	47.9%
2	Hamadryas	8,775,834	1,787,836	516,225.5	96,276	6,434,612	229,807.6	73.3%
3	Olive	44,876,120	6,493,293	1,319,885.9	97,613	12,715,662	219,235.6	28.3%
4	HamaOlive	53,651,954	6,493,293	1,051,999.1	96,276	18,941,689	228,213.1	35.3%
5	ECH	5,511,675	623,108	290,088.2	88,572	7,195,159	224,848.7	130.5%

Test

Test revised I

# Now define tests 
from genominterv.decorators import bootstrap
from genominterv.stats import proximity_test, jaccard_stat
from joblib import Parallel, delayed
from tqdm import tqdm

n_samples=100_000

## Proximity test

def proximity_parallel(query, annot, nsamples=100_000, oaz=False, saz=False):
    q_key, a_key = query, annot
    query = prox_dict[q_key]
    annot = prox_dict[a_key]
    
    test = proximity_test(query, annot, nsamples, 
                          overlap_as_zero=oaz, 
                          span_as_zero=saz
                          )

    results = [
        {'Query': q_key,
         'Annot': a_key, 
         'Statistic': test.statistic, 
         'P-value': test.pvalue,
         'Overlap as zero': oaz,
         'Span as zero': saz
         }
    ]

    return results

## Jaccard bootstrap
@bootstrap(chromsizes, samples=10_000)
def jaccard_bootstrap(query, annot):
    return jaccard_stat(query, annot)

def jaccard_parallel(query, annot):
    q_key = query
    a_key = annot
    query = jacc_dict[q_key]
    annot = jacc_dict[a_key]

    test = jaccard_bootstrap(query, annot)

    results = [
        {'Query': q_key,
            'Annot': a_key, 
            'Index': test[0], 
            'P-value': test[1]
            }]
    return results

# Now do the testing
# Define the samples
prox_dict = {'ECH90': ech_dict['full'],
             'Hamadryas': hama_dict['full'],
             'Olive': olive_dict['full'],
             'Fib100_edge': fb100_dict['edges'],
             'RS100_edge': rs100_dict['edges'],
             'RS100_full': rs100_dict['full']
            }

prox_samples = [('ECH90', 'Fib100_edge', n_samples),
                ('ECH90', 'RS100_edge', n_samples),
                ('Hamadryas', 'Fib100_edge', n_samples),
                ('Hamadryas', 'RS100_edge', n_samples),
                ('Olive', 'Fib100_edge', n_samples),
                ('Olive', 'RS100_edge', n_samples),
                ('Olive', 'RS100_full', n_samples),
                ('Hamadryas', 'RS100_full', n_samples),
                ]

# First, without oaz, saz
proximity_res1 = Parallel(n_jobs=-1)(
    delayed(proximity_parallel)(que, ann, n, False, False) for (que,ann,n) in tqdm(prox_samples)
)
proximity_res1 = pd.DataFrame([item for sublist in proximity_res1 for item in sublist])

# Now with oaz, saz
proximity_res2 = Parallel(n_jobs=-1)(
    delayed(proximity_parallel)(que, ann, n, True, True) for (que,ann,n) in tqdm(prox_samples)
)
proximity_res2 = pd.DataFrame([item for sublist in proximity_res2 for item in sublist])

# Now with oaz, saz
proximity_res3 = Parallel(n_jobs=-1)(
    delayed(proximity_parallel)(que, ann, n, True, False) for (que,ann,n) in tqdm(prox_samples)
)
proximity_res3 = pd.DataFrame([item for sublist in proximity_res3 for item in sublist])


# Concatenate the results
proximity_res = pd.concat([proximity_res1, proximity_res2, proximity_res3])

# Write to CSV
#proximity_res.to_csv('../results/proximity_revised.csv', index=False)

# Now do the Jaccard index
jacc_samples = [(ech_dict['full'], fb100_dict['full']), 
                (hama_dict['full'], rs100_dict['full'])
                ]

jacc_dict = {'ECH90': ech_dict['full'],
             'Hamadryas': hama_dict['full'],
             'Olive': olive_dict['full'],
             'Fib100_edge': fb100_dict['edges'],
             'Fib100_full': fb100_dict['full'],
             'RS100_full': rs100_dict['full'],
             'RS100_edge': rs100_dict['edges']
            }
jacc_samples = [
    ('ECH90', 'Fib100_full'),
    ('ECH90', 'Fib100_edge'),
    ('ECH90', 'RS100_full'),
    ('ECH90', 'RS100_edge'),
    ('Hamadryas', 'Fib100_full'),
    ('Hamadryas', 'Fib100_edge'),
    ('Hamadryas', 'RS100_full'),
    ('Hamadryas', 'RS100_edge'),
    ('Olive', 'Fib100_edge'),
    ('Olive', 'RS100_edge'),
    ]

jaccard_res = Parallel(n_jobs=-1)(
    delayed(jaccard_parallel)(que, ann) for (que,ann) in tqdm(jacc_samples)
)   

# Flatten and create DataFrame
jaccard_res = pd.DataFrame([item for sublist in jaccard_res for item in sublist])

# Write to CSV
#jaccard_res.to_csv('../results/jaccard_revised.csv', index=False)

100%|██████████| 8/8 [00:00<00:00, 395.88it/s]

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[15], line 75
     64 prox_samples = [('ECH90', 'Fib100_edge', n_samples),
     65                 ('ECH90', 'RS100_edge', n_samples),
     66                 ('Hamadryas', 'Fib100_edge', n_samples),
   (...)
     71                 ('Hamadryas', 'RS100_full', n_samples),
     72                 ]
     74 # First, without oaz, saz
---> 75 proximity_res1 = Parallel(n_jobs=-1)(
     76     delayed(proximity_parallel)(que, ann, n, False, False) for (que,ann,n) in tqdm(prox_samples)
     77 )
     78 proximity_res1 = pd.DataFrame([item for sublist in proximity_res1 for item in sublist])
     80 # Now with oaz, saz

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/parallel.py:2007, in Parallel.__call__(self, iterable)
   2001 # The first item from the output is blank, but it makes the interpreter
   2002 # progress until it enters the Try/Except block of the generator and
   2003 # reaches the first `yield` statement. This starts the asynchronous
   2004 # dispatch of the tasks to the workers.
   2005 next(output)
-> 2007 return output if self.return_generator else list(output)

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/parallel.py:1650, in Parallel._get_outputs(self, iterator, pre_dispatch)
   1647     yield
   1649     with self._backend.retrieval_context():
-> 1650         yield from self._retrieve()
   1652 except GeneratorExit:
   1653     # The generator has been garbage collected before being fully
   1654     # consumed. This aborts the remaining tasks if possible and warn
   1655     # the user if necessary.
   1656     self._exception = True

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/joblib/parallel.py:1762, in Parallel._retrieve(self)
   1757 # If the next job is not ready for retrieval yet, we just wait for
   1758 # async callbacks to progress.
   1759 if ((len(self._jobs) == 0) or
   1760     (self._jobs[0].get_status(
   1761         timeout=self.timeout) == TASK_PENDING)):
-> 1762     time.sleep(0.01)
   1763     continue
   1765 # We need to be careful: the job list can be filling up as
   1766 # we empty it and Python list are not thread-safe by
   1767 # default hence the use of the lock

KeyboardInterrupt:


display('Proximity:',proximity_res.sort_values(['Query', 'Annot','Overlap as zero']), 
        'Jaccard:',jaccard_res)

'Proximity:'

	Query	Annot	Statistic	P-value	Overlap as zero	Span as zero
0	ECH90	Fib100_edge	0.126462	0.18750	False	False
0	ECH90	Fib100_edge	0.320105	0.00000	True	True
0	ECH90	Fib100_edge	0.320105	0.00000	True	False
1	ECH90	RS100_edge	-0.051091	0.85932	False	False
1	ECH90	RS100_edge	0.412667	0.00000	True	True
1	ECH90	RS100_edge	0.383300	0.00000	True	False
2	Hamadryas	Fib100_edge	0.010583	0.49535	False	False
2	Hamadryas	Fib100_edge	0.121857	0.00057	True	True
2	Hamadryas	Fib100_edge	0.121857	0.32055	True	False
3	Hamadryas	RS100_edge	0.043037	0.46991	False	False
3	Hamadryas	RS100_edge	0.079267	0.10159	True	True
3	Hamadryas	RS100_edge	0.079267	0.00830	True	False
7	Hamadryas	RS100_full	-0.038111	0.67801	False	False
7	Hamadryas	RS100_full	0.065700	0.55859	True	True
7	Hamadryas	RS100_full	0.065700	0.32201	True	False
4	Olive	Fib100_edge	0.267519	0.00000	False	False
4	Olive	Fib100_edge	0.281078	0.00000	True	True
4	Olive	Fib100_edge	0.277333	0.00000	True	False
5	Olive	RS100_edge	0.137642	0.00000	False	False
5	Olive	RS100_edge	0.156492	0.00000	True	True
5	Olive	RS100_edge	0.156492	0.00000	True	False
6	Olive	RS100_full	0.179701	0.00000	False	False
6	Olive	RS100_full	0.197664	0.00000	True	True
6	Olive	RS100_full	0.185778	0.00000	True	False

'Jaccard:'

	Query	Annot	Index	P-value
0	ECH90	Fib100_full	0.042072	0.1962
1	ECH90	Fib100_edge	0.063394	0.0141
2	ECH90	RS100_full	0.046543	0.1005
3	ECH90	RS100_edge	0.062911	0.0121
4	Hamadryas	Fib100_full	0.032936	0.2989
5	Hamadryas	Fib100_edge	0.015783	0.7405
6	Hamadryas	RS100_full	0.031271	0.3467
7	Hamadryas	RS100_edge	0.022037	0.5719
8	Olive	Fib100_edge	0.039584	0.7753
9	Olive	RS100_edge	0.039312	0.8489

Test revised II

Kasper realized it is very hard to capture both proximity and intersect in the same statistic. Thus, we go back to test the overlapping regions with Jaccard test, and use a new proximity test, measuring the mean distance between query regions and annotation regions, almost with the same power as the previous one.


### Test the functions ### took 25 seconds for nsamples=100_000

query = olive_dict['limits']
annot = make_edges(dataframes['sperm_100kb_10Mb'], 5)

stat, p = jaccard_test(query, annot)
print(f'''
Jaccard overlap test: 
    stat: {stat} 
       p: {p}
''')

query_non_ovl = query.loc[~overlaps(query, annot)]   
stat, p = proximity_test(query_non_ovl, annot)
print(f'''
Proximity test: 
    stat: {stat} 
       p: {p} 
''')


Jaccard overlap test: 
    stat: 0.0013218770654329147 
       p: 0.00053


Proximity test: 
    stat: 0.2930268116279996 
       p: 0.00929

### Make it a loop in stead
a_names = ['Fb100arms_edge', 'RS100arms_edge', 
           'Fb100_10Mb_edge', 'RS100_10Mb_edge',
           'Fb100arms_full', 'RS100arms_full']
q_names = ['ECH', 'Hama', 'Olive',
           'ECH_edge', 'Hama_edge', 'Olive_edge']

annots = [edge_df['fibroblast_100kb_arms_edges'], edge_df['round_spermatid_100kb_arms_edges'], 
          edge_df['fibroblast_100kb_10Mb_edges'], edge_df['round_spermatid_100kb_10Mb_edges'],
          dataframes['fibroblast_100kb_arms'], dataframes['round_spermatid_100kb_arms']]
queries = [ech_dict['full'], hama_dict['full'], olive_dict['full'],
            ech_dict['edges'], hama_dict['edges'], olive_dict['edges']]

results = pd.DataFrame({
    'Query': [],
    'Annotation': [],
    'Jaccard Index': [],
    'Jaccard P-value': [],
    'Proximity Statistic': [],
    'Proximity P-value': []
})

for i,query in enumerate(queries):
    print(f'i = {i}')
    for j,annot in enumerate(annots):
        print(f'j = {j}')
        j_stat, j_p = jaccard_test(query, annot)
        query_non_ovl = query.loc[~overlaps(query, annot)]   
        p_stat, p_p = proximity_test(query_non_ovl, annot)
        tmp = pd.DataFrame(
            {
            'Query': q_names[i],
            'Annotation': a_names[j],
            'Jaccard Index': j_stat,
            'Jaccard P-value': j_p,
            'Proximity Statistic': p_stat,
            'Proximity P-value': p_p
            }, 
            index=[0])
        results = pd.concat([results, tmp], ignore_index=True)

results

i = 0
j = 0
j = 1
j = 2
j = 3
j = 4
j = 5
i = 1
j = 0
j = 1
j = 2
j = 3
j = 4
j = 5
i = 2
j = 0
j = 1
j = 2
j = 3
j = 4
j = 5
i = 3
j = 0
j = 1
j = 2
j = 3
j = 4
j = 5
i = 4
j = 0
j = 1
j = 2
j = 3
j = 4
j = 5
i = 5
j = 0
j = 1
j = 2
j = 3
j = 4
j = 5

	Query	Annotation	Jaccard Index	Jaccard P-value	Proximity Statistic	Proximity P-value
0	ECH	Fb100arms_edge	0.063394	0.01234	0.281690	0.31470
1	ECH	RS100arms_edge	0.062911	0.01029	0.237249	0.61862
2	ECH	Fb100_10Mb_edge	0.048480	0.08578	0.221680	0.73127
3	ECH	RS100_10Mb_edge	0.057965	0.02468	0.280611	0.33594
4	ECH	Fb100arms_full	0.042072	0.19746	0.188639	0.40612
5	ECH	RS100arms_full	0.046543	0.10295	0.277032	0.39865
6	Hama	Fb100arms_edge	0.021886	0.90170	0.253639	0.51690
7	Hama	RS100arms_edge	0.027870	0.85085	0.256731	0.50018
8	Hama	Fb100_10Mb_edge	0.031809	0.75087	0.253548	0.51142
9	Hama	RS100_10Mb_edge	0.034824	0.72564	0.298500	0.39744
10	Hama	Fb100arms_full	0.059582	0.33048	0.255948	0.65396
11	Hama	RS100arms_full	0.069205	0.15910	0.242701	0.43243
12	Olive	Fb100arms_edge	0.109276	0.39990	0.279857	0.27209
13	Olive	RS100arms_edge	0.139396	0.23299	0.239110	0.65271
14	Olive	Fb100_10Mb_edge	0.082725	0.93293	0.252348	0.52897
15	Olive	RS100_10Mb_edge	0.146962	0.17601	0.242756	0.61343
16	Olive	Fb100arms_full	0.281681	0.03051	0.272324	0.56266
17	Olive	RS100arms_full	0.209506	0.32056	0.235625	0.71433
18	ECH_edge	Fb100arms_edge	0.076356	0.00404	0.295478	0.18026
19	ECH_edge	RS100arms_edge	0.062408	0.03379	0.266103	0.38942
20	ECH_edge	Fb100_10Mb_edge	0.057708	0.06767	0.269624	0.34382
21	ECH_edge	RS100_10Mb_edge	0.074277	0.00498	0.273560	0.35419
22	ECH_edge	Fb100arms_full	0.051794	0.18634	0.295836	0.50053
23	ECH_edge	RS100arms_full	0.054435	0.11747	0.274170	0.81663
24	Hama_edge	Fb100arms_edge	0.014028	0.95055	0.240012	0.64811
25	Hama_edge	RS100arms_edge	0.033281	0.54103	0.244095	0.60123
26	Hama_edge	Fb100_10Mb_edge	0.029314	0.63328	0.262847	0.40403
27	Hama_edge	RS100_10Mb_edge	0.044633	0.21629	0.234146	0.67565
28	Hama_edge	Fb100arms_full	0.053583	0.05438	0.191426	0.12090
29	Hama_edge	RS100arms_full	0.061216	0.00873	0.200406	0.12220
30	Olive_edge	Fb100arms_edge	0.038987	0.87121	0.250063	0.54679
31	Olive_edge	RS100arms_edge	0.049514	0.75761	0.257123	0.43135
32	Olive_edge	Fb100_10Mb_edge	0.057027	0.47526	0.257397	0.42551
33	Olive_edge	RS100_10Mb_edge	0.051652	0.71941	0.243567	0.64567
34	Olive_edge	Fb100arms_full	0.079507	0.32217	0.258289	0.46750
35	Olive_edge	RS100arms_full	0.071486	0.51011	0.237859	0.67702

subset_annot = ['RS100arms_edge', 'RS100_10Mb_edge',
                'Fb100arms_edge', 'Fb100_10Mb_edge']
subset_query = ['ECH_edge', 'Hama_edge', 'Olive_edge']
display((results
 .query('Annotation in @subset_annot and Query in @subset_query')
 .sort_values(['Annotation','Query'])
 .filter(['Annotation','Query', 'Jaccard P-value', 'Proximity P-value'])
)
)

	Annotation	Query	Jaccard P-value	Proximity P-value
20	Fb100_10Mb_edge	ECH_edge	0.06767	0.34382
26	Fb100_10Mb_edge	Hama_edge	0.63328	0.40403
32	Fb100_10Mb_edge	Olive_edge	0.47526	0.42551
18	Fb100arms_edge	ECH_edge	0.00404	0.18026
24	Fb100arms_edge	Hama_edge	0.95055	0.64811
30	Fb100arms_edge	Olive_edge	0.87121	0.54679
21	RS100_10Mb_edge	ECH_edge	0.00498	0.35419
27	RS100_10Mb_edge	Hama_edge	0.21629	0.67565
33	RS100_10Mb_edge	Olive_edge	0.71941	0.64567
19	RS100arms_edge	ECH_edge	0.03379	0.38942
25	RS100arms_edge	Hama_edge	0.54103	0.60123
31	RS100arms_edge	Olive_edge	0.75761	0.43135

# plot the significant proximity-result
query=ech_dict['edges']
annot= fb100_dict['full']

plot_regions(query, annot, interval_intersect(query, annot), track_titles=['ECH90', 'Fb100_full', 'Intersect'])

# plot some Jaccard-results

query=ech_dict['edges']
annot= edge_df['round_spermatid_100kb_10Mb_edges']

plot_regions(query, annot, interval_intersect(query, annot), track_titles=['ECH90', 'RS100_10Mb_edges', 'Intersect'])

import matplotlib.pyplot as plt
import seaborn as sns

test_results = (
    results
    .query('Annotation in @subset_annot and Query in @subset_query')
    .filter(['Annotation','Query', 'Jaccard P-value', 'Proximity P-value'])
)

# Melt the results to a P-value column
test_results_melt = (
    test_results
    .rename(columns={'Jaccard P-value': 'Jaccard', 'Proximity P-value': 'Proximity'})
    .melt(
        id_vars=['Query', 'Annotation'],
        value_vars=['Jaccard', 'Proximity'], 
        var_name='type', value_name='p-value')
    .assign(logp=lambda x: -np.log10(x['p-value']))
)

source_order = ['Fb', 'Spa', 'Pac', 'RS', 'Spz']

f,ax = plt.subplots(figsize=(3,2))

g = sns.barplot(
    data=test_results_melt, x='Annotation', y='logp', hue='type', 
    order=source_order, ax = ax
    )
ax.axhline(-np.log10(0.05), color='tab:red', linestyle='--')


sns.despine(ax=g, left=True)

g.set_ylabel('-log10(p)')
g.set_xlabel('')
g.get_legend().set_title('')
sns.move_legend(g, 'upper center', bbox_to_anchor=(0.4,0.975))

Figure 7.2: Proximity and Jaccard index p-values for ECH90 regions on compartment edges for all cell types at 100kb resolution at arms view \(p=0.05\) is marked as a red line.

Do new test with limits (2bp)


# Calcilate the proximity and Jaccard tests
# Annot: Fb, SPA, PAC, RS, SP (edges)
# Query: ECH

# Define the samples

annots = {
    'Fb': dataframes['fibroblast_100kb_arms'], 
    'Spa': dataframes['spermatogonia_100kb_arms'],
    'Pac': dataframes['pachytene_spermatocyte_100kb_arms'],
    'RS': dataframes['round_spermatid_100kb_arms'],
    'Spz': dataframes['sperm_100kb_arms']
}
queries = {
    'ECH': ech90,
    'ECH_lim': make_edges(ech90, 1),
    'Hama_lim': make_edges(high_hama, 1),
    'Olive_lim': make_edges(high_olive, 1)
}

# Run the tests

test_results = {
    'Query': [],
    'Source': [],
    'Type': [],
    'Jaccard': [],
    'Proximity': []
}

for i,(name,annot) in enumerate(annots.items()):
    print(f'{i+1} of {len(annots)}: {name}')

    tmp_annots = {'comps': annot, 
                  'edges': make_edges(annot, 100_000), 
                  'limits': make_edges(annot, 1)}
    
    for tmp_query_key, query in queries.items():
        print(f'\tQuery: {tmp_query_key}')
        for tmp_key, tmp_annot in tmp_annots.items():
            print(f'\t\t{tmp_key}',end=' ')
            _, j_p = jaccard_test(query, tmp_annot)
            print('--> Jaccard done', end=' ')
            query_non_ovl = query.loc[~overlaps(query, tmp_annot)]   
            _, p_p = proximity_test(query_non_ovl, tmp_annot)
            print('--> Proximity done.')

            # Append to results
            test_results['Query'].append(tmp_query_key)
            test_results['Source'].append(name)
            test_results['Type'].append(tmp_key)
            test_results['Jaccard'].append(j_p)
            test_results['Proximity'].append(p_p)

results = pd.DataFrame(test_results)
results

1 of 5: Fb
    Query: ECH
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: ECH_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Hama_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Olive_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
2 of 5: Spa
    Query: ECH
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: ECH_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Hama_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Olive_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
3 of 5: Pac
    Query: ECH
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: ECH_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Hama_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Olive_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
4 of 5: RS
    Query: ECH
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: ECH_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Hama_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Olive_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
5 of 5: Spz
    Query: ECH
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: ECH_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Hama_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.
    Query: Olive_lim
        comps --> Jaccard done --> Proximity done.
        edges --> Jaccard done --> Proximity done.
        limits --> Jaccard done --> Proximity done.

	Query	Source	Type	Jaccard	Proximity
0	ECH	Fb	comps	0.19934	0.82548
1	ECH	Fb	edges	0.01199	0.31260
2	ECH	Fb	limits	0.00530	0.55282
3	ECH_lim	Fb	comps	0.40849	0.65961
4	ECH_lim	Fb	edges	0.03409	0.29474
5	ECH_lim	Fb	limits	1.00000	0.55628
6	Hama_lim	Fb	comps	0.12358	0.62059
7	Hama_lim	Fb	edges	0.87612	0.89615
8	Hama_lim	Fb	limits	1.00000	0.86046
9	Olive_lim	Fb	comps	0.27137	0.36505
10	Olive_lim	Fb	edges	0.85070	0.51847
11	Olive_lim	Fb	limits	1.00000	0.60821
12	ECH	Spa	comps	0.05156	0.00314
13	ECH	Spa	edges	0.49491	0.24411
14	ECH	Spa	limits	0.68780	0.23246
15	ECH_lim	Spa	comps	0.33333	0.03413
16	ECH_lim	Spa	edges	0.23260	0.09237
17	ECH_lim	Spa	limits	1.00000	0.04169
18	Hama_lim	Spa	comps	0.09449	0.51142
19	Hama_lim	Spa	edges	0.62895	0.61381
20	Hama_lim	Spa	limits	0.00001	0.50210
21	Olive_lim	Spa	comps	0.57259	0.58325
22	Olive_lim	Spa	edges	0.91817	0.19321
23	Olive_lim	Spa	limits	1.00000	0.46966
24	ECH	Pac	comps	0.59824	0.42413
25	ECH	Pac	edges	0.10229	0.74138
26	ECH	Pac	limits	0.15461	0.69764
27	ECH_lim	Pac	comps	0.46237	0.92811
28	ECH_lim	Pac	edges	0.10141	0.74043
29	ECH_lim	Pac	limits	1.00000	0.71976
30	Hama_lim	Pac	comps	0.39982	0.66599
31	Hama_lim	Pac	edges	0.48866	0.22423
32	Hama_lim	Pac	limits	0.00005	0.44405
33	Olive_lim	Pac	comps	0.03725	0.78024
34	Olive_lim	Pac	edges	0.58759	0.34903
35	Olive_lim	Pac	limits	0.00004	0.39258
36	ECH	RS	comps	0.10578	0.07079
37	ECH	RS	edges	0.01046	0.61816
38	ECH	RS	limits	0.00655	0.73480
39	ECH_lim	RS	comps	0.18603	0.34104
40	ECH_lim	RS	edges	0.11797	0.53686
41	ECH_lim	RS	limits	1.00000	0.35290
42	Hama_lim	RS	comps	0.01714	0.85609
43	Hama_lim	RS	edges	0.56556	0.78961
44	Hama_lim	RS	limits	1.00000	0.78258
45	Olive_lim	RS	comps	0.39299	0.78907
46	Olive_lim	RS	edges	0.66654	0.75863
47	Olive_lim	RS	limits	1.00000	0.56521
48	ECH	Spz	comps	0.03892	0.86643
49	ECH	Spz	edges	0.07330	0.63545
50	ECH	Spz	limits	0.08195	0.88817
51	ECH_lim	Spz	comps	0.10352	0.54812
52	ECH_lim	Spz	edges	0.80407	0.54553
53	ECH_lim	Spz	limits	1.00000	0.73543
54	Hama_lim	Spz	comps	0.20128	0.13174
55	Hama_lim	Spz	edges	0.53662	0.22328
56	Hama_lim	Spz	limits	1.00000	0.09604
57	Olive_lim	Spz	comps	0.18841	0.59118
58	Olive_lim	Spz	edges	0.84400	0.55331
59	Olive_lim	Spz	limits	1.00000	0.64852

q_filter = ['ECH', 'Hama_lim', 'Olive_lim']

results.query('Query in @q_filter and Type == "limits"').sort_values(['Source', 'Type'])

	Query	Source	Type	Jaccard	Proximity
2	ECH	Fb	limits	0.00530	0.55282
8	Hama_lim	Fb	limits	1.00000	0.86046
11	Olive_lim	Fb	limits	1.00000	0.60821
26	ECH	Pac	limits	0.15461	0.69764
32	Hama_lim	Pac	limits	0.00005	0.44405
35	Olive_lim	Pac	limits	0.00004	0.39258
38	ECH	RS	limits	0.00655	0.73480
44	Hama_lim	RS	limits	1.00000	0.78258
47	Olive_lim	RS	limits	1.00000	0.56521
14	ECH	Spa	limits	0.68780	0.23246
20	Hama_lim	Spa	limits	0.00001	0.50210
23	Olive_lim	Spa	limits	1.00000	0.46966
50	ECH	Spz	limits	0.08195	0.88817
56	Hama_lim	Spz	limits	1.00000	0.09604
59	Olive_lim	Spz	limits	1.00000	0.64852

import matplotlib.pyplot as plt
import seaborn as sns

# Melt the results to a P-value column
test_results_melt = (
    results.query('Query == "ECH_limits"')
    .melt(
        id_vars=['Query', 'Source'],
        value_vars=['Jaccard', 'Proximity'], 
        var_name='test', value_name='p-value')
    .assign(logp=lambda x: -np.log10(x['p-value']))
)

source_order = ['Fb', 'Spa', 'Pac', 'RS', 'Spz']

f,ax = plt.subplots(figsize=(3,2))

g = sns.barplot(
    data=test_results_melt, x='Source', y='logp', hue='test', 
    order=source_order, ax = ax
    )
ax.axhline(-np.log10(0.05), color='tab:red', linestyle='--')


sns.despine(ax=g, left=True)

g.set_ylabel('-log10(p)')
g.set_xlabel('')
g.get_legend().set_title('')
sns.move_legend(g, 'upper center', bbox_to_anchor=(0.4,0.975))

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[38], line 29
     27 g.set_ylabel('-log10(p)')
     28 g.set_xlabel('')
---> 29 g.get_legend().set_title('')
     30 sns.move_legend(g, 'upper center', bbox_to_anchor=(0.4,0.975))

AttributeError: 'NoneType' object has no attribute 'set_title'

Old test

# # Now we should test

# from genominterv import proximity_test, interval_diff, jaccard_stat, bootstrap

# #query = ech90
# query = edges['round_spermatid_100kb_arms']
# annot_hama = hama_edges
# #annot_olive = baboon_dict['P.anubis']
# b = 1_000

# # Proximity test
# non_ovl_query_hama = interval_diff(query, annot_hama)
# #non_ovl_query_olive = interval_diff(query, annot_olive)

# proximity_hama = proximity_test(non_ovl_query_hama, annot_hama, two_sided=True, samples=10_000)
# #proximity_olive = proximity_test(non_ovl_query_olive, annot_olive, two_sided=True, samples=10_000)

# # Jaccard index

# @bootstrap(chromsizes, samples=b, smaller=False)
# def jaccard_bootstrap(query, annot):
#     return jaccard_stat(query, annot)

# jaccard_hama = jaccard_bootstrap(query, annot_hama)
# #jaccard_olive = jaccard_bootstrap(query, annot_olive)

# print(f"""
# Proximity test results:
#     Hama: {proximity_hama}
#  #   Olive: {proximity_olive}

# Jaccard index results:
#     Hama: {jaccard_hama}
#   #  Olive: {jaccard_olive}
# """)

# # # Swap annot and query 

# # non_ovl_query_hama = interval_diff(annot_hama, query)
# # non_ovl_query_olive = interval_diff(annot_olive, query)

# # proximity_hama_flip = proximity_test(non_ovl_query_hama, query, two_sided=True, samples=b)
# # proximity_olive_flip = proximity_test(non_ovl_query_olive, query, two_sided=True, samples=b)

# # jaccard_hama_flip = jaccard_bootstrap(annot_hama, query)
# # jaccard_olive_flip = jaccard_bootstrap(annot_olive, query)


# Make a long dataframe

# pd.DataFrame({
#     'Group': ['P.hama', 'P.anubis'],
#     #'Proximity Statistic': [proximity_hama.statistic, proximity_olive.statistic],
#     'Proximity P-value': [proximity_hama.pvalue, proximity_olive.pvalue],
#     #'Jaccard Index': [jaccard_hama[0], jaccard_olive[0]],
#     'Jaccard P-value': [jaccard_hama[1], jaccard_olive[1]]
# }).melt(id_vars='Group', var_name='Test', value_name='Value')

pd.DataFrame({
    'Group': ['P.hama', 'P.anubis'],
    #'Proximity Statistic': [proximity_hama.statistic, proximity_olive.statistic],
    'Proximity P-value': [proximity_hama.pvalue, proximity_olive.pvalue],
    #'Jaccard Index': [jaccard_hama[0], jaccard_olive[0]],
    'Jaccard P-value': [jaccard_hama[1], jaccard_olive[1]],
    # #'Proximity Statistic (flip)': [proximity_hama_flip.statistic, proximity_olive_flip.statistic],
    # 'Proximity P-value (flip)': [proximity_hama_flip.pvalue, proximity_olive_flip.pvalue],
    # #'Jaccard Index (flip)': [jaccard_hama_flip[0], jaccard_olive_flip[0]],
    # 'Jaccard P-value (flip)': [jaccard_hama_flip[1], jaccard_olive_flip[1]],
}).melt(id_vars='Group', var_name='Test', value_name='Value')

	Group	Test	Value
0	P.hama	Proximity P-value	0.0069
1	P.anubis	Proximity P-value	0.0274
2	P.hama	Jaccard P-value	0.2720
3	P.anubis	Jaccard P-value	0.8250

Plot the relative distances (proximities)

ech90.assign(start=ech90['start']-5000,
             end  = ech90['end']+5000)

	chrom	start	end	length
0	chrX	19531773	19641984	100211
1	chrX	20930188	21051124	110936
2	chrX	36218142	36427842	199700
3	chrX	37230935	37637028	396093
4	chrX	49511522	50019126	497604
5	chrX	50816385	51449493	623108
6	chrX	53897995	54111817	203822
7	chrX	62091538	62298349	196811
8	chrX	70884995	71154673	259678
9	chrX	71783766	72149275	355509
10	chrX	74336598	74640686	294088
11	chrX	96245950	96344522	88572
12	chrX	108173096	108564466	381370
13	chrX	111297881	111601540	293659
14	chrX	124073238	124182167	98929
15	chrX	126833587	127249031	405444
16	chrX	128398780	128706766	297986
17	chrX	129729920	130039306	299386
18	chrX	150800639	151219408	408769

# Size of each of the dataframes

print(f"""
rs100_full: {rs100_dict['full'].shape[0]}
rs100_edges: {rs100_dict['edges'].shape[0]}
fb100_full: {fb100_dict['full'].shape[0]}
fb100_edges: {fb100_dict['edges'].shape[0]}
hama_dict: {hama_dict['full'].shape[0]}
ech_dict: {ech_dict['full'].shape[0]}

""")


rs100_full: 77
rs100_edges: 96
fb100_full: 59
fb100_edges: 83
hama_dict: 31
ech_dict: 19

import seaborn as sns
from genominterv.remapping import remap_interval_data

n = 1000
a = np.sort(np.random.randint(1, 10_000_000, size=n))
annot = pd.DataFrame(dict(chrom='chrX', start=a, end=a+10))
q = np.sort(np.random.randint(1, 10_000_000, size=n))
query = pd.DataFrame(dict(chrom='chrX', start=q, end=q+10))
df1 = remap_interval_data(query, annot, relative=True)
df1['mid'] = (df1.start + df1.end) / 2
df1['absmid'] = df1.mid.abs()

# dummy data
a = np.sort(np.random.randint(1, 10_000_000, size=n))
annot = pd.DataFrame(dict(chrom='chrX', start=a, end=a+10))
q = a + np.random.randint(-1000, 1000, size=n)
query = pd.DataFrame(dict(chrom='chrX', start=q, end=q+10))
df2 = remap_interval_data(query, annot, relative=True)
df2['mid'] = (df2.start + df2.end) / 2
df2['absmid'] = df2.mid.abs()

# Real data
q1 = hama_dict['full']
a1 = rs100_dict['full']
df3 = remap_interval_data(q1, a1, relative=True)
df3['mid'] = (df3.start + df3.end) / 2
df3['absmid'] = df3.mid.abs()

q2 = hama_dict['full']
a2 = fb100_dict['full']
df4 = remap_interval_data(q2, a2, relative=True)
df4['mid'] = (df4.start + df4.end) / 2
df4['absmid'] = df4.mid.abs()

sns.histplot(df3, x='mid', bins=np.linspace(-0.5, 0.5, 20))
sns.histplot(df4, x='mid', bins=np.linspace(-0.5, 0.5, 20))

sns.histplot(df3, x='absmid', bins=np.linspace(0, 0.5, 20))
sns.histplot(df4, x='absmid', bins=np.linspace(0, 0.5, 20))

df3['dist'] = 'uniform'
df4['dist'] = 'closer'
df = pd.concat([df3, df4])
sns.barplot(df, y='absmid', x='dist', hue='dist')

Geneinfo

How does the edges align with genes?

This first plot is just to figure out how to plot with gene_plot.

import geneinfo as gi
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection


# Use the proximity test results to plot the ECH90 regions and the compartment edges

start = full_intersect['start'][2]
end = full_intersect['end'][5]

rectangles = [Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:green', linewidth=0, alpha=0.6) for start, end in zip(full_intersect['start'][2:6], full_intersect['end'][2:6])]

pc = PatchCollection(rectangles, match_original=True)

ax = gi.gene_plot('chrX', start-100_000, end+100_000, assembly='rheMac10')
ax.add_collection(pc)

Get the geneinfo for all intersections between edges and ECH90

And write to a csv file. If the file exists, read it with pandas.

# Use get_genes_region
import os.path as op
import geneinfo as gi
import pandas as pd
from os import makedirs as mkdir

genes_dir = '../results/rec_edge_genes/'
if not op.exists(genes_dir):
    mkdir(genes_dir)
    
genes_file = op.join(genes_dir,'rs_edges_100kb_genes.csv')

if not op.exists(genes_file):
    genes = pd.concat(
        full_intersect.apply(
            lambda x: gi.get_genes_region_dataframe('chrX', x['start'], x['end'], assembly='rheMac10'), 
            axis =1
            ).to_list(),
        ignore_index=True
    )
    genes.to_csv(genes_file, index=False) 
else: 
    genes = pd.read_csv(genes_file)

genes_list = genes['name'].unique().tolist()
genes_list

['SH3KBP1',
 'MIR7206',
 'LANCL3',
 'XK',
 'CYBB',
 'LOC696657',
 'DYNLT3',
 'PAGE4',
 'USP27X',
 'CLCN5',
 'LOC114675180',
 'MIR532',
 'MIR188',
 'MIR500A',
 'MIR362',
 'MIR501',
 'MIR500B',
 'MIR660',
 'MIR502',
 'AKAP4',
 'CCNB3',
 'LOC114675218',
 'LOC695959',
 'CENPVL3',
 'FAM120C',
 'WNK3',
 'LOC114675302',
 'ZC3H12B',
 'LAS1L',
 'MSN',
 'ATRX',
 'MAGT1',
 'LOC114675151',
 'COX7B',
 'ATP7A',
 'ALG13',
 'LOC706958',
 'TRPC5',
 'ENOX2',
 'RAP2C',
 'LOC114675176',
 'DKC1',
 'LOC114675231',
 'MPP1',
 'SMIM9',
 'F8',
 'H2AFB3',
 'FUNDC2',
 'CMC4',
 'MTCP1',
 'BRCC3',
 'LOC703257']

import geneinfo as gi
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection


# Use the proximity test results to plot the ECH90 regions and the compartment edges

for i in full_intersect.index:
    start_idx = i
    end_idx = i

    start = full_intersect['start'][start_idx]
    end = full_intersect['end'][end_idx]

    rectangles = [Rectangle(
        (start, 0.1), width=end-start, height=0.9, color='tab:green', linewidth=0, alpha=0.6) for start, end in zip(full_intersect['start'][start_idx:end_idx+1], full_intersect['end'][start_idx:end_idx+1])]

    pc = PatchCollection(rectangles, match_original=True)

    ax = gi.gene_plot('chrX', start-100_000, end+100_000, assembly='rheMac10', 
                    highlight=genes_list,
                    despine=True,
                    figsize=(5, 5),
                    aspect=5,
                    )
    ax.add_collection(pc)

What can I do with the list of genes on the edges?

GO enrichment?

mmul_x_genes = gi.get_genes_region_dataframe('chrX', 0, 155_000_000, assembly='rheMac10')

mmul_x_genelist = mmul_x_genes['name'].unique().tolist()

gene_list = genes['name'].unique().tolist()
taxid = 9544
gi.email('sojernj@gmail.com')
#gi.go_annotation_table(taxid=taxid)

#gi.show_go_evidence_codes()

go_terms = gi.get_go_terms_for_genes(gene_list, taxid=taxid)

len(go_terms)
#gene_list[:5]

results = gi.go_enrichment(
    # Use human taxid as a start
    gene_list, 
    alpha=0.05,
    terms=go_terms)

geneinfo_cache/go-basic.obo: fmt(1.2) rel(2024-10-27) 44,017 Terms; optional_attrs(def relationship)

Could not map gene symbol "MIR7206" to ncbi id
Could not map gene symbol "LOC696657" to ncbi id
Could not map gene symbol "LOC114675180" to ncbi id

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/miniconda3/envs/hic/lib/python3.12/site-packages/geneinfo/__init__.py:1050, in _cached_symbol2ncbi(symbols, taxid)
   1049 try:    
-> 1050     return symbol2ncbi.loc[symbols].tolist() 
   1051 except KeyError:

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexing.py:1191, in _LocationIndexer.__getitem__(self, key)
   1190 maybe_callable = self._check_deprecated_callable_usage(key, maybe_callable)
-> 1191 return self._getitem_axis(maybe_callable, axis=axis)

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexing.py:1420, in _LocIndexer._getitem_axis(self, key, axis)
   1418         raise ValueError("Cannot index with multidimensional key")
-> 1420     return self._getitem_iterable(key, axis=axis)
   1422 # nested tuple slicing

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexing.py:1360, in _LocIndexer._getitem_iterable(self, key, axis)
   1359 # A collection of keys
-> 1360 keyarr, indexer = self._get_listlike_indexer(key, axis)
   1361 return self.obj._reindex_with_indexers(
   1362     {axis: [keyarr, indexer]}, copy=True, allow_dups=True
   1363 )

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexing.py:1558, in _LocIndexer._get_listlike_indexer(self, key, axis)
   1556 axis_name = self.obj._get_axis_name(axis)
-> 1558 keyarr, indexer = ax._get_indexer_strict(key, axis_name)
   1560 return keyarr, indexer

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexes/base.py:6200, in Index._get_indexer_strict(self, key, axis_name)
   6198     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 6200 self._raise_if_missing(keyarr, indexer, axis_name)
   6202 keyarr = self.take(indexer)

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexes/base.py:6252, in Index._raise_if_missing(self, key, indexer, axis_name)
   6251 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 6252 raise KeyError(f"{not_found} not in index")

KeyError: "['MIR7206', 'LOC696657', 'LOC114675180', 'MIR532', 'MIR188', 'MIR500A', 'MIR362', 'MIR501', 'MIR500B', 'MIR660', 'MIR502', 'LOC114675218', 'LOC695959', 'LOC114675302', 'LOC114675151', 'LOC706958', 'LOC114675176', 'LOC114675231', 'H2AFB3', 'LOC703257'] not in index"

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'MIR532'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
File ~/miniconda3/envs/hic/lib/python3.12/site-packages/geneinfo/__init__.py:1055, in _cached_symbol2ncbi(symbols, taxid)
   1054 try:
-> 1055     geneids.append(symbol2ncbi.loc[symbol])
   1056 except KeyError:

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexing.py:1191, in _LocationIndexer.__getitem__(self, key)
   1190 maybe_callable = self._check_deprecated_callable_usage(key, maybe_callable)
-> 1191 return self._getitem_axis(maybe_callable, axis=axis)

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexing.py:1431, in _LocIndexer._getitem_axis(self, key, axis)
   1430 self._validate_key(key, axis)
-> 1431 return self._get_label(key, axis=axis)

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexing.py:1381, in _LocIndexer._get_label(self, label, axis)
   1379 def _get_label(self, label, axis: AxisInt):
   1380     # GH#5567 this will fail if the label is not present in the axis.
-> 1381     return self.obj.xs(label, axis=axis)

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/generic.py:4301, in NDFrame.xs(self, key, axis, level, drop_level)
   4300 else:
-> 4301     loc = index.get_loc(key)
   4303     if isinstance(loc, np.ndarray):

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.

KeyError: 'MIR532'

During handling of the above exception, another exception occurred:

HTTPError                                 Traceback (most recent call last)
Cell In[55], line 1
----> 1 results = gi.go_enrichment(
      2     # Use human taxid as a start
      3     gene_list, 
      4     alpha=0.05,
      5     terms=go_terms)

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/geneinfo/__init__.py:1780, in go_enrichment(gene_list, taxid, background_chrom, background_genes, terms, list_study_genes, alpha)
   1777 GeneID2nt = background_genes.GENEID2NT
   1779 if not all(type(x) is int for x in gene_list):
-> 1780     gene_list = _cached_symbol2ncbi(gene_list, taxid=taxid)
   1782 goeaobj = GOEnrichmentStudyNS(
   1783         GeneID2nt, # List of mouse protein-coding genes
   1784         ns2assoc, # geneid/GO associations
   (...)
   1788         methods=['fdr_bh'],
   1789         pvalcalc='fisher_scipy_stats') 
   1791 goea_results_all = goeaobj.run_study(gene_list)

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/geneinfo/__init__.py:1058, in _cached_symbol2ncbi(symbols, taxid)
   1056 except KeyError:
   1057     try:
-> 1058         ncbi_id = hgcn_symbol(symbol)
   1059         if ncbi_id not in symbol2ncbi.index:
   1060             print(ncbi_id, 'not in symbol2ncbi index')

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/geneinfo/__init__.py:256, in hgcn_symbol(name)
    254     return [ensembl2symbol(ensembl_id(n)) for n in name]
    255 else:
--> 256     return ensembl2symbol(ensembl_id(name))

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/geneinfo/__init__.py:227, in ensembl2symbol(ensembl_id)
    225 r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
    226 if not r.ok:
--> 227     r.raise_for_status()
    228 decoded = r.json()
    229 symbols = [x['display_id'] for x in decoded if x['dbname'] == 'HGNC']

File ~/miniconda3/envs/hic/lib/python3.12/site-packages/requests/models.py:1024, in Response.raise_for_status(self)
   1019     http_error_msg = (
   1020         f"{self.status_code} Server Error: {reason} for url: {self.url}"
   1021     )
   1023 if http_error_msg:
-> 1024     raise HTTPError(http_error_msg, response=self)

HTTPError: 400 Client Error: Bad Request for url: https://rest.ensembl.org/xrefs/id/%5B'ENSG00000207758'%5D