# import standard python libraries
import numpy as np
import pandas as pd
# Import packages for working with cooler files and tools for plots
import cooler
import cooltools.lib.plotting
import cooltools
import seaborn as sns
from cooltools.lib.numutils import adaptive_coarsegrain, interp_nan
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1 import make_axes_locatable
Various Comparisons
Various plots made with a homemade plotting utility, comparing different parameters and methods for the manuscript.
Notes before we start
We want to create a plotting utility so we can easily plot and compare different matrices. We might move the functions to hicstuff.py
for import, but for now they are defined only in this notebook.
Setup
Imports for plots etc
InlineBackend.rc
For consistent plots that fit the PDF manuscript.
import matplotlib.pyplot as plt
# Define my params:
## Params will comply with my desired layout for the Manuscript (PDF)
notebook_rcparams = {
    'font.size': 7,
    'axes.titlesize': 8,
    'axes.labelsize': 7,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'figure.titlesize': 9,
    'figure.figsize': [6.0, 2.0],
    'figure.labelsize': 7,
}

# Apply config
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
%config InlineBackend.rc = notebook_rcparams

print({key: plt.rcParams[key] for key in notebook_rcparams.keys()})
{'font.size': 10.0, 'axes.titlesize': 'large', 'axes.labelsize': 'medium', 'xtick.labelsize': 'medium', 'ytick.labelsize': 'medium', 'figure.titlesize': 'large', 'figure.figsize': [6.4, 4.8], 'figure.labelsize': 'large'}
## Apparently IPython needs this twice to understand??
import matplotlib.pyplot as plt
# Define my params:
## Params will comply with my desired layout for the Manuscript (PDF)
notebook_rcparams = {
    'font.size': 7,
    'axes.titlesize': 8,
    'axes.labelsize': 7,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'figure.titlesize': 9,
    'figure.figsize': [6.0, 2.0],
    'figure.labelsize': 7,
}

# Apply config
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
%config InlineBackend.rc = notebook_rcparams

print({key: plt.rcParams[key] for key in notebook_rcparams.keys()})
{'font.size': 7.0, 'axes.titlesize': 8.0, 'axes.labelsize': 7.0, 'xtick.labelsize': 6.0, 'ytick.labelsize': 6.0, 'figure.titlesize': 9.0, 'figure.figsize': [6.0, 2.0], 'figure.labelsize': 7.0}
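A more direct way to get the same result, which avoids the apparent need to run the cell twice, is to update `plt.rcParams` in place. This is only a minimal sketch using the `notebook_rcparams` dict defined above, not how the notebook was actually run:

# Alternative sketch: apply the manuscript params straight to matplotlib,
# so they take effect immediately without going through InlineBackend.rc
plt.rcParams.update(notebook_rcparams)
print({key: plt.rcParams[key] for key in notebook_rcparams})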
Load all the coolers and eigenvectors
## Loading all coolers and their compartments
import glob
import os.path as op

import cooler

# Abbreviate the names (almost the same abbreviations as Wang et al. 2019)
names_abbr = {'fibroblast': 'Fib', 'spermatogonia': 'SPA', 'pachytene_spermatocyte': 'PAC', 'round_spermatid': 'RS', 'sperm': 'Sp'}

# Paths and resolutions
mcools = pd.Series(glob.glob("../steps/bwa/*/cool/*/*.mcool"))
res_100 = "::resolutions/100000"
res_500 = "::resolutions/500000"

# Create a DataFrame
clr_df = pd.DataFrame({
    'name': pd.concat([mcools.apply(lambda x: op.basename(op.dirname(x)))] * 3, ignore_index=True),
    'resolution': ['100'] * len(mcools) + ['ps500'] * len(mcools) + ['500'] * len(mcools),
    'mcool_path': pd.concat([mcools] * 3, ignore_index=True)
})

# Insert the 'sname' column (shortname/abbreviation)
clr_df['sname'] = clr_df['name'].apply(lambda x: names_abbr[x])

# Create the 'cooler_id' column
clr_df['cooler_id'] = clr_df.apply(
    lambda row: cooler.Cooler(
        row['mcool_path'] + (res_100 if row['resolution'] in ['100', 'ps500'] else res_500)
    ),
    axis=1
)

# Determine parsing run (dirname 3 levels up: PE or recPE)
clr_df['run'] = clr_df['mcool_path'].apply(lambda x: 'recPE' if 'rec' in x else 'PE')

# Add the 'nbins' column
clr_df['nbins'] = clr_df['cooler_id'].apply(lambda x: len(x.bins().fetch('chrX')))

# Add the path to the compartments (../results/{compartments,rec_compartments})
# make a comp_dirs dict
comp_dirs = {
    'PE': '../results/compartments/',
    'recPE': '../results/rec_compartments/'
}
clr_df['e1_dict'] = clr_df.apply(
    lambda row: {view: pd.read_csv(op.join(
                     comp_dirs[row['run']], f"{row['name']}_e1_{row['resolution']}kb_{view}.csv"
                 ))['e1']
                 if row['resolution'] != 'ps500'
                 else pd.read_csv(op.join(
                     comp_dirs[row['run']], f"{row['name']}_e1_100kb_{view}_smoothed.csv"
                 ))['e1']
                 for view in ['full', 'arms', '10Mb']},
    axis=1
)

# Now we have all the coolers and compartments ready for plotting (in semi-long format)
#clr_df
clr_df[['sname', 'resolution', 'nbins', 'run', 'e1_dict']].query('resolution == "ps500"')
#print(clr_df['e1_dict'].values)
|    | sname | resolution | nbins | run | e1_dict |
|----|-------|------------|-------|-----|---------|
| 10 | PAC | ps500 | 1534 | recPE | {'full': [nan, nan, nan, nan, nan, nan, nan, n... |
| 11 | SPA | ps500 | 1534 | recPE | {'full': [nan, nan, nan, nan, nan, nan, nan, n... |
| 12 | Fib | ps500 | 1534 | recPE | {'full': [nan, nan, nan, nan, nan, -0.15077432... |
| 13 | RS | ps500 | 1534 | recPE | {'full': [nan, nan, nan, nan, nan, nan, nan, n... |
| 14 | Sp | ps500 | 1534 | recPE | {'full': [nan, nan, nan, nan, nan, -0.00698684... |
| 15 | RS | ps500 | 1534 | PE | {'full': [nan, nan, nan, nan, nan, -0.17383142... |
| 16 | SPA | ps500 | 1534 | PE | {'full': [nan, nan, nan, nan, nan, nan, nan, -... |
| 17 | Sp | ps500 | 1534 | PE | {'full': [nan, nan, nan, -0.102536873416233, -... |
| 18 | Fib | ps500 | 1534 | PE | {'full': [nan, nan, nan, -0.2342462339281897, ... |
| 19 | PAC | ps500 | 1534 | PE | {'full': [nan, nan, nan, nan, nan, 0.595328434... |
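To show how the table is consumed downstream, here is a minimal usage sketch (assuming the `clr_df` built above): pull one row, fetch its balanced chrX matrix from the attached cooler, and grab the matching E1 track.

# Sketch: the recPE round spermatid cooler at 100 kb and its 'arms' E1 track
row = clr_df.query('sname == "RS" and resolution == "100" and run == "recPE"').iloc[0]
mat = row['cooler_id'].matrix(balance=True).fetch('chrX')  # balanced Hi-C matrix (numpy array)
e1_arms = row['e1_dict']['arms']                           # eigenvector track (pandas Series)
print(mat.shape, len(e1_arms))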
Load the genomic intervals
A-compartments
import pandas as pd
import os

# ECH regions and chromsizes
ech90 = pd.read_csv('../data/ech90_human_Mmul_10.csv').assign(length=lambda x: x.end - x.start)

chromsizes = (pd.read_csv(
        '../data/rheMac10.filtered.chrom.sizes',
        sep='\t',
        index_col='chrom',
        header=None,
        names=['chrom', 'size'])
    ['size']
    .to_dict()
)

# Directory containing your .csv files
comp_dirs = {
    'PE': '../results/compartments/',
    'recPE': '../results/rec_compartments/'
}

# Create a dictionary to store the comps
comps = {
    'PE': {},
    'recPE': {}
}

# Iterate over all .csv files in the directory
for run in comps.keys():
    comp_dir = comp_dirs[run]
    for filename in os.listdir(comp_dir):
        # We only want the .csv files and not e1 tracks
        if filename.endswith('.csv') and 'e1' not in filename:
            # Construct the full file path
            filepath = os.path.join(comp_dir, filename)

            # Load the CSV into a DataFrame
            # Use the filename (without extension) as the dictionary key
            key = filename.replace('_a_comp_coords_', '_')
            key = os.path.splitext(key)[0]
            comps[run][key] = pd.read_csv(filepath).assign(length=lambda x: x.end - x.start)

# The `comps` dictionary now contains the DataFrames
# Example usage:
comps['recPE']['round_spermatid_100kb_arms']
|    | chrom | start | end | resolution | length |
|----|-------|-------|-----|------------|--------|
| 0 | chrX | 900000 | 1600000 | 100000 | 700000 |
| 1 | chrX | 1700000 | 2600000 | 100000 | 900000 |
| 2 | chrX | 2900000 | 3100000 | 100000 | 200000 |
| 3 | chrX | 3200000 | 3500000 | 100000 | 300000 |
| 4 | chrX | 6600000 | 6700000 | 100000 | 100000 |
| ... | ... | ... | ... | ... | ... |
| 72 | chrX | 145400000 | 147100000 | 100000 | 1700000 |
| 73 | chrX | 147300000 | 147400000 | 100000 | 100000 |
| 74 | chrX | 148800000 | 149100000 | 100000 | 300000 |
| 75 | chrX | 149300000 | 151200000 | 100000 | 1900000 |
| 76 | chrX | 152500000 | 153300000 | 100000 | 800000 |

77 rows × 5 columns
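As a quick sanity check on what was loaded, the chrX A-compartment coverage can be summarised from the `length` column. A sketch, assuming the `comps` dict and the global `chromsizes` dict defined above:

# Sketch: fraction of chrX covered by A-compartment intervals, per tissue/view
for key, comp_df in comps['recPE'].items():
    frac = comp_df['length'].sum() / chromsizes['chrX']
    print(f"{key}: {frac:.1%} of chrX in A compartments")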
Edge intervals
import pandas as pd
import os

# Directory containing your .csv files
edge_dirs = {
    'PE': '../results/edges/',
    'recPE': '../results/rec_edges/'
}
edges = {
    'PE': {},
    'recPE': {}
}

# Iterate over all .csv files in the directories
for run in edges.keys():
    edge_dir = edge_dirs[run]
    for filename in os.listdir(edge_dir):
        if filename.endswith('.csv'):  # Check for .csv files
            # Construct the full file path
            filepath = os.path.join(edge_dir, filename)

            # Load the CSV into a DataFrame
            # Use the filename (without extension) as the dictionary key
            key = filename.replace('_edges', '')
            key = os.path.splitext(key)[0]
            edges[run][key] = pd.read_csv(filepath).assign(length=lambda x: x.end - x.start)

# The `edges` dictionary now contains the DataFrames
print(edges.keys())
display(edges['recPE']['fibroblast_100kb_10Mb'])
#ech90 = pd.read_csv('../data/ech90_human_Mmul_10.csv')
dict_keys(['PE', 'recPE'])
|    | start | end | chrom | resolution | length |
|----|-------|-----|-------|------------|--------|
| 0 | 700000 | 900000 | chrX | 100000 | 200000 |
| 1 | 3300000 | 3500000 | chrX | 100000 | 200000 |
| 2 | 8200000 | 8400000 | chrX | 100000 | 200000 |
| 3 | 13300000 | 13500000 | chrX | 100000 | 200000 |
| 4 | 14900000 | 15100000 | chrX | 100000 | 200000 |
| ... | ... | ... | ... | ... | ... |
| 87 | 144400000 | 144800000 | chrX | 100000 | 400000 |
| 88 | 145400000 | 145600000 | chrX | 100000 | 200000 |
| 89 | 146700000 | 147100000 | chrX | 100000 | 400000 |
| 90 | 148800000 | 149000000 | chrX | 100000 | 200000 |
| 91 | 152100000 | 152300000 | chrX | 100000 | 200000 |

92 rows × 5 columns
Baboons
# Load baboons
import pandas as pd
from genominterv import interval_collapse

hama_panu3 = pd.read_csv('../data/lift/papanu4/high_hama_papanu4.bed', sep='\t', index_col=False, header=None, names=['group', 'chrom', 'start', 'end'])
olive_panu3 = pd.read_csv('../data/lift/papanu4/high_olive_papanu4.bed', sep='\t', index_col=False, header=None, names=['group', 'chrom', 'start', 'end'])
baboon_df = pd.read_csv('../results/high_baboon_rhemac10.bed', sep='\t', index_col=False, header=None, names=['chrom', 'start', 'end', 'group', 'score'])

high_hama_ucsc = (interval_collapse(baboon_df.query('group == "high_hama"'))
                  .assign(group="high_hama")
                  .query('chrom == "chrX"')
                  )
high_olive_ucsc = (interval_collapse(baboon_df.query('group == "high_olive"'))
                   .assign(group="high_olive")
                   .query('chrom == "chrX"')
                   )

high_hama = pd.read_csv('../data/lift/rheMac10/high_hama_rhemac10.bed', sep='\t', index_col=False, header=None, names=['group', 'chrom', 'start', 'end'])
high_olive = pd.read_csv('../data/lift/rheMac10/high_olive_rhemac10.bed', sep='\t', index_col=False, header=None, names=['group', 'chrom', 'start', 'end'])

baboon_dict = {"P.hama": high_hama, "P.anubis": high_olive}
Define a plotting utility for compartments and E1
This plots a group/subset of coolers and their respective E1 compartments.
# We can vary the number of subplots based on the number of coolers
# We can vary the number of views for each cooler
# We can define the region (tuple) of the chromosome to plot (default chrX)
def plot_grouped(
    group: pd.DataFrame,
    region: str = 'chrX',  # or (chrom, start, end)
    include_compartments: bool = True,
    view_order: list[str] = ['10Mb', 'arms', 'full'],
    figsize: tuple[float, float] = (6, 6)
) -> None:

    group = group.reset_index(drop=True)

    # Define the number of subplots based on the number of coolers
    f, axs = plt.subplots(1, group.shape[0],
                          figsize=figsize,
                          sharex=True
                          )

    # Logtransform the colorscale
    norm = LogNorm(vmax=0.1)

    # If there is only one cooler, axs is not a list, so we need to make it a list
    if not isinstance(axs, np.ndarray):
        axs = [axs]

    # Loop through the clrs and their matrices: plot each matrix on its axis
    for row in group.itertuples():
        ax = axs[row.Index]
        # Define the title and y-axis label
        title = f'{row.sname}{row.resolution}kb: {row.run}'
        ylab = f'{region}:{row.resolution}kb [Mb]'

        # Extract some variables
        e1_dict = {k: v for k, v in row.e1_dict.items() if k in view_order}
        chrom_start = row.cooler_id.bins().fetch(region)['start'].values
        chrom_end = row.cooler_id.bins().fetch(region)['end'].values

        im = ax.matshow(
            row.cooler_id.matrix().fetch(region),
            norm=norm,
            cmap='fall',
        )
        ax.set_xlim(0, row.nbins)
        ax.set_ylim(row.nbins, 0)
        ax.set_ylabel(ylab)
        ticks = np.linspace(0, row.nbins, 5)
        ax.set_yticks(ticks)
        ax.set_yticklabels([f'{x*row.cooler_id.binsize/1_000_000:.0f}' for x in ticks])
        ax.xaxis.set_visible(False)

        divider = make_axes_locatable(ax)
        cax = divider.append_axes("right", size="2%", pad=0.05)
        plt.colorbar(im, cax=cax, label='corrected frequencies')
        # Only show the colorbar on the last plot and ytitle on first plot
        if row.Index != group.shape[0] - 1:
            cax.set_visible(False)
        elif row.Index != 0:
            ax.set_ylabel('')

        # Skip the E1 plot when compartments are not requested
        if not include_compartments:
            ax.set_title(title)
            continue

        for j, (view, e1) in enumerate(e1_dict.items()):
            ax1 = divider.append_axes("top", size="10%", pad=0.1, sharex=ax)

            # Create stairs
            x = np.zeros(2*chrom_start.size)
            y = np.zeros(2*chrom_start.size)
            x[0::2] = chrom_start/row.cooler_id.binsize
            x[1::2] = chrom_end/row.cooler_id.binsize
            y[0::2] = e1
            y[1::2] = e1

            ax1.fill_between(x, y, 0, where=(y > 0), color='tab:red', ec='None')
            ax1.fill_between(x, y, 0, where=(y < 0), color='tab:blue', ec='None')

            ax1.set_ylabel(view, rotation=0, ha='right', va='center')
            ax1.set_ylim(-0.8, 0.8)
            ax1.set_yticks([])
            ax1.set_xticks([])
            ax1.spines[:].set_visible(False)

        # The title goes on the topmost E1 track
        ax1.set_title(title)

    f.set_layout_engine('compressed')
    plt.show()
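A minimal call sketch (using the `clr_df` built above): one cooler per column, with the requested E1 views stacked on top of each matrix.

# Sketch: PE vs. recPE round spermatid at 100 kb, side by side
example_group = clr_df.query('name == "round_spermatid" and resolution == "100"')
plot_grouped(example_group, region='chrX', view_order=['10Mb', 'arms'], figsize=(6, 6))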
Plotting matrices separately
Plotting function that makes separate figures instead of using mpl.subplots. This makes it compatible with Quarto YAML header subfigure and layout options for embedding.
def plot_for_quarto(
    group: pd.DataFrame,
    region: str | tuple[str, int, int] = 'chrX',
    include_compartments: bool = True,
    view_order: list[str] = ['10Mb', 'arms', 'full'],
    figsize: tuple[float, float] = (6, 6),
    col_vmin: float = 1e-3,
    col_vmax: float = 0.1,
    save_png_name: str = None
) -> None:
    """
    Plot each cooler as a separate figure to allow Quarto YAML options for subfigure plots and captions.
    """
    group = group.reset_index(drop=True)

    # Scale figsize based on the number of coolers
    figsize = (figsize[0]/group.shape[0], figsize[1])

    # Set the region to a tuple if it is a string
    if isinstance(region, str):
        region = (region, 0, group.iloc[0].cooler_id.chromsizes[region])

    # Define start and end for the matrix plot
    start, end = region[1], region[2]

    # Loop through each cooler (row) in the DataFrame
    for row in group.itertuples():
        # Create a new figure for each cooler
        # Scale figsize
        fig, ax = plt.subplots(figsize=figsize)

        # Logtransform the colorscale
        norm = LogNorm(vmin=col_vmin, vmax=col_vmax)

        # Define the y-axis label
        # title = f'{row.sname}{row.resolution}kb: {row.run}'
        ylab = f'{row.resolution}kb'

        # Extract some variables
        e1_dict = {k: row.e1_dict[k] for k in view_order}
        chrom_start = row.cooler_id.bins().fetch(region)['start'].values
        chrom_end = row.cooler_id.bins().fetch(region)['end'].values
        nbins = chrom_start.size          # != row.nbins when subsetting chr
        binsize = row.cooler_id.binsize   # is used several times

        # Plot the Hi-C matrix
        im = ax.matshow(
            row.cooler_id.matrix().fetch(region),
            norm=norm,
            cmap='fall',
            extent=(0, nbins, nbins, 0)
        )
        ax.set_xlim(0, nbins)
        ax.set_ylim(nbins, 0)
        ax.set_ylabel(ylab)
        ticks = np.linspace(0, nbins, 5)
        ax.set_yticks(ticks)
        ax.set_yticklabels([f'{(start/binsize+x)*binsize/1_000_000:.0f}' for x in ticks])
        ax.xaxis.set_visible(False)

        # Add a colorbar
        divider = make_axes_locatable(ax)
        cax = divider.append_axes("right", size="2%", pad=0.05)
        plt.colorbar(im, cax=cax, label='corrected frequencies')

        # Skip E1 plot if not including compartments
        if include_compartments:
            for view, e1 in e1_dict.items():
                # subset the e1 to region (chrX:start-end)/binsize
                start_bin = int(chrom_start[0]/binsize)
                end_bin = int(chrom_start[-1]/binsize)
                e1 = e1.iloc[start_bin:end_bin+1].reset_index(drop=True)
                ax1 = divider.append_axes("top", size="10%", pad=0.05, sharex=ax)

                # Create stairs for E1 values
                x = np.zeros(2 * chrom_start.size)
                y = np.zeros(2 * chrom_start.size)
                x[0::2] = np.arange(chrom_start.size)
                x[1::2] = np.arange(chrom_start.size)
                y[0::2] = e1
                y[1::2] = e1

                ax1.fill_between(x, y, 0, where=(y > 0), color='tab:red', ec='None')
                ax1.fill_between(x, y, 0, where=(y < 0), color='tab:blue', ec='None')

                if len(e1_dict) > 1:
                    ax1.set_ylabel(view, rotation=0, ha='right', va='center', fontsize=7)
                else:
                    ax1.set_ylabel('E1', rotation=0, ha='right', va='center', fontsize=7)

                ax1.set_ylim(-0.8, 0.8)
                ax1.set_yticks([])
                ax1.set_xticks([])
                ax1.spines[:].set_visible(False)

        # # Set the figure title
        # ax1.set_title(title)
        fig.tight_layout()
        if save_png_name:
            plt.savefig(save_png_name, dpi=320)

        # Display the figure for Quarto to capture
        plt.show()
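For reference, this is the kind of Quarto cell header the separate figures are meant to feed into. The `#|` options below are standard Quarto cell options; the label, captions, and layout are assumptions about how the manuscript embeds the figures, not something `plot_for_quarto` sets itself.

#| label: fig-rs-matrices
#| fig-cap: "RS Hi-C matrices, PE vs. recPE"
#| fig-subcap:
#|   - "PE"
#|   - "recPE"
#| layout-ncol: 2
plot_for_quarto(clr_df.query('name == "round_spermatid" and resolution == "100"'),
                view_order=['arms'])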
Plot intervals
Plot genomic intervals as tracks (up to 3)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Rectangle
from mpl_toolkits.axes_grid1 import make_axes_locatable
from genominterv import interval_intersect
def plot_regions(query=None, annot=None, intersect=None,
                 track_titles=['query', 'annot', 'intersect'],
                 figsize=(10, 1),
                 title='chrom',
                 alpha=0.8, lw=0):
    chrom = annot['chrom'].unique()[0]
    # Local chrom sizes table (a DataFrame, separate from the global `chromsizes` dict)
    chromsizes = pd.read_csv('../data/rheMac10.filtered.chrom.sizes', sep='\t', header=None, names=['chrom', 'size'])

    base_size = chromsizes[chromsizes['chrom'] == chrom]['size'].values[0]
    annot_size = annot['end'].max()
    query_size = query['end'].max()
    chromsize = max(base_size, annot_size, query_size)

    if title == 'chrom':
        title = chrom

    # Define the plot size
    f, ax = plt.subplots(figsize=figsize, sharex=True)

    ax.spines[:].set_visible(False)

    # Plot the annot
    # Iterate over each interval in the DataFrame
    for start, end in zip(annot['start'], annot['end']):
        rect = Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:red', linewidth=lw, alpha=alpha)
        ax.add_patch(rect)
    ax.spines['bottom'].set_visible(True)
    ax.set_ylabel(track_titles[1], rotation=0, labelpad=10, ha='right', va='center')

    divider = make_axes_locatable(ax)

    if query is not None:
        qax = divider.append_axes("top", size="100%", pad=0.05, sharex=ax)
        qax.xaxis.set_visible(False)
        # Plot the query
        for start, end in zip(query['start'], query['end']):
            rect = Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:blue', linewidth=lw, alpha=alpha)
            qax.add_patch(rect)
        qax.spines[:].set_visible(False)
        qax.set_yticks([])
        qax.set_title(title, loc='left')
        qax.set_ylabel(track_titles[0], rotation=0, labelpad=10, ha='right', va='center')

    if intersect is not None:
        iax = divider.append_axes("bottom", size="100%", pad=0.05, sharex=ax)
        # Invisible x-axis for 'annot' (the intersect track sits below)
        ax.xaxis.set_visible(False)
        # Plot the intersect
        for start, end in zip(intersect['start'], intersect['end']):
            rect = Rectangle((start, 0.1), width=end-start, height=0.9, color='tab:green', linewidth=lw, alpha=alpha)
            iax.add_patch(rect)
        iax.spines[:].set_visible(False)
        iax.set_yticks([])
        ax.spines['bottom'].set_visible(False)
        iax.spines['bottom'].set_visible(True)
        iax.set_ylabel(track_titles[2], rotation=0, labelpad=10, ha='right', va='center')

    ax.set_yticks([])
    ax.set_xlim(0, chromsize)
    ticks = np.linspace(0, chromsize, num=5)
    ax.set_xticks(ticks)
    ax.set_xticklabels([f'{int(t/1e6)} Mbp' for t in ticks])

    f.tight_layout()
    plt.show()
    #plt.tight_layout()
Make edges
def make_edges(df, resolution):
    start_edge = pd.DataFrame({
        'chrom': df['chrom'],
        'start': df.apply(lambda x: max(x['start'] - resolution, 0), axis=1),
        'end': df['start'] + 1*resolution,
        'resolution': resolution,
    })
    end_edge = pd.DataFrame({
        'chrom': df['chrom'],
        'start': df['end'] - 1*resolution,
        'end': df.apply(lambda x: min(x['end'] + resolution, chromsizes['chrX']), axis=1),
        'resolution': resolution,
    })
    return interval_collapse(pd.concat([start_edge, end_edge]).sort_values(['chrom', 'start', 'end'])).assign(resolution=resolution)
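Note that `make_edges` relies on the global `chromsizes` dict and on `interval_collapse` imported earlier. A minimal usage sketch, mirroring the call in the ECH S100 limit section below that builds narrow "limit" intervals around the compartment boundaries:

# Sketch: 2 bp limit intervals around the recPE fibroblast A-compartment boundaries
limit_2bp = make_edges(comps['recPE']['fibroblast_100kb_arms'], 2)
limit_2bp.head()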
Svedig tabel (styled p-value table)
all_tests = pd.read_csv('../results/all_tests.csv', index_col=0)

def svedig_tabel(orig_df, index, columns, values,
                 cmap='Reds', col_order=None, col_snames=None):
    """
    Creates a styled pivot table from a DataFrame, filtering by significance,
    applying a log transformation, and adding visual styling.

    Parameters:
    ----------
    orig_df : pd.DataFrame
        The original DataFrame containing data to be processed.
    index : str
        Column name to use as the row index in the pivot table.
    columns : str
        Column name to use as the column headers in the pivot table.
    values : str
        Column name whose values populate the pivot table cells.
    cmap : str, optional
        Colormap to use for the background gradient styling (default is 'Reds').

    Returns:
    -------
    pd.io.formats.style.Styler
        A styled DataFrame object with the pivot table, including gradient-based
        cell coloring, transparent NaN values, formatted numbers, and table styles.
    """
    df = (orig_df
          .assign(log10p=np.log10(all_tests.p))
          .loc[(all_tests.p < 0.05)]
          .pivot(index=index, columns=columns, values=values)
          )
    if col_order:
        col_bool = [x in df.columns for x in col_order]
        col_order = [x for x in col_order if x in df.columns]
        df = df[col_order]
        if col_snames:
            col_snames = [col_snames[i] for i, x in enumerate(col_bool) if x]
            df = df.rename(columns={x: col_snames[i] for i, x in enumerate(df.columns.tolist())})
    else:
        df = df.rename(columns={x: x.replace('_', '<br>') for x in df.columns.tolist()})
    df = (df.style
          .background_gradient(subset=df.columns, axis=None, cmap=cmap, vmin=0)
          .map(lambda x: 'color: transparent; background-color: transparent' if np.isnan(x) else '')
          .format('{:.3f}')
          .set_table_styles(
              {c: [{'selector': '',
                    'props': [('min-width', '20px')],
                    }]
               for c in df.columns}, overwrite=False
          )
          )
    return df
print("That should be all the setup :-)")
That should be all the setup :-)
Matrices / E1
E1: RS100/500
# Compare the compartments of the round_spermatid at 100kb and 500kb, all views
res = ['100', '500']
name = ['round_spermatid']
run = ['recPE']

group = clr_df.query('name in @name and resolution in @res and run in @run')
plot_for_quarto(group)
PE vs recPE (RS100kb)
# Compare PE and recPE compartments for round_spermatid, 100kb
group = clr_df.query('resolution == "100" and name == "round_spermatid"')

plot_grouped(group, figsize=(6, 6))
PE vs. recPE: RS100kb
df = clr_df.query('resolution == "100" and name == "round_spermatid"')

# Check the order of the group
#print(df[['name', 'resolution', 'run']].reset_index(drop=True))

plot_for_quarto(df, view_order=['10Mb', 'arms'], figsize=(5, 5))

start, end = 70_000_000, 78_000_000
region = ('chrX', start, end)
plot_for_quarto(df, region=region, include_compartments=True, view_order=['10Mb', 'arms'], figsize=(5, 5))
Intervals
PE/recPE compare compartments (100kb)
from genominterv import interval_diff

### Plot the intervals and the diff between ###
# Round Spermatid arms
query = comps['PE']['round_spermatid_100kb_arms']
annot = comps['recPE']['round_spermatid_100kb_arms']
plot_regions(query=query, annot=annot,
             intersect=interval_diff(query, annot),
             track_titles=['PE', 'recPE', 'Diff'],
             figsize=(2.5, 0.8),
             title=None)

# Fibroblast arms
query = comps['PE']['fibroblast_100kb_arms']
annot = comps['recPE']['fibroblast_100kb_arms']
plot_regions(query=query, annot=annot,
             intersect=interval_diff(query, annot),
             track_titles=['PE', 'recPE', 'Diff'],
             figsize=(2.5, 0.8),
             title=None)

# Round Spermatid 10Mb
query = comps['PE']['round_spermatid_100kb_10Mb']
annot = comps['recPE']['round_spermatid_100kb_10Mb']
plot_regions(query=query, annot=annot,
             intersect=interval_diff(query, annot),
             track_titles=['PE', 'recPE', 'Diff'],
             figsize=(2.5, 0.8),
             title=None)

# Fibroblast 10Mb
query = comps['PE']['fibroblast_100kb_10Mb']
annot = comps['recPE']['fibroblast_100kb_10Mb']
plot_regions(query=query, annot=annot,
             intersect=interval_diff(query, annot),
             track_titles=['PE', 'recPE', 'Diff'],
             figsize=(2.5, 0.8),
             title=None)
from genominterv import interval_intersect

query = ech90

fb_ann_comp = comps['recPE']['fibroblast_100kb_arms']
rs_ann_comp = comps['recPE']['round_spermatid_100kb_arms']
fb_ann_edge = edges['recPE']['fibroblast_100kb_arms']
rs_ann_edge = edges['recPE']['round_spermatid_100kb_arms']

size = (3.2, 0.8)

# Fib and RS compartments
plot_regions(query=query, annot=fb_ann_comp,
             intersect=interval_intersect(query, fb_ann_comp),
             track_titles=['ECH90', 'A-Comp', 'Intersect'],
             figsize=size,
             title=None,
             lw=0.3)

plot_regions(query=query, annot=rs_ann_comp,
             intersect=interval_intersect(query, rs_ann_comp),
             track_titles=['ECH90', 'A-Comp', 'Intersect'],
             figsize=size,
             title=None,
             lw=0.3)

# Fib and RS edges
plot_regions(query=query, annot=fb_ann_edge,
             intersect=interval_intersect(query, fb_ann_edge),
             track_titles=['ECH90', 'Edges', 'Intersect'],
             figsize=size,
             title=None,
             lw=0.3, alpha=1)

plot_regions(query=query, annot=rs_ann_edge,
             intersect=interval_intersect(query, rs_ann_edge),
             track_titles=['ECH90', 'Edges', 'Intersect'],
             figsize=size,
             title=None,
             lw=0.3, alpha=1)
Visualize ECH enrichment
from genominterv import interval_diff, interval_intersect, interval_union

complist = [comps['recPE']['round_spermatid_100kb_arms'],
            comps['recPE']['fibroblast_100kb_arms']]
edgelist = [edges['recPE']['round_spermatid_100kb_arms'],
            edges['recPE']['fibroblast_100kb_arms']]

for comp, edge in zip(complist, edgelist):
    int_comp = interval_intersect(ech90, comp)
    int_edge = interval_intersect(ech90, edge)

    diff_int_compedge = interval_diff(int_edge, int_comp)

    plot_regions(int_comp, int_edge, diff_int_compedge,
                 track_titles=['CompInt', 'EdgeInt', 'DiffInt'],
                 title="ECH90 intersect",
                 figsize=(3.2, 1),
                 alpha=1, lw=0.5)
All 3 sets
plot_regions(ech90, high_olive, high_hama,
             track_titles=['ECH90', r"$\it{P.anubis}$", r"$\it{P.hama}$"],
             figsize=(5, 0.8),
             title=None,
             lw=0.5, alpha=0.8)
LiftOver comparison
track0 = [olive_panu3, hama_panu3]
track1 = [high_olive, high_hama]
track2 = [high_olive_ucsc, high_hama_ucsc]

for t0, t1, t2 in zip(track0, track1, track2):
    plot_regions(t0, t1, t2,
                 track_titles=['Panu_3.0', 'segment_lift', 'UCSCliftOver'],
                 # title='High Olive Ancestry Intervals',
                 title=None,
                 figsize=(6, 0.8),
                 alpha=0.8, lw=0.1)
Baboon-rs100-intersect
# Plot
size = (3.2, 0.8)
edge_rs = edges['recPE']['round_spermatid_100kb_arms']

for name, group in baboon_dict.items():
    plot_regions(query=edge_rs, annot=group,
                 intersect=interval_intersect(edge_rs, group),
                 #intersect=ech90,
                 track_titles=['RS100', fr"$\it{{{name}}}$", 'Intersect'],
                 figsize=size,
                 title=None,
                 lw=0.3, alpha=0.9)

edge_rs = edges['recPE']['round_spermatid_100kb_10Mb']

for name, group in baboon_dict.items():
    plot_regions(query=edge_rs, annot=group,
                 intersect=interval_intersect(edge_rs, group),
                 #intersect=ech90,
                 track_titles=['RS100', fr"$\it{{{name}}}$", 'Intersect'],
                 figsize=size,
                 title=None,
                 lw=0.2, alpha=1)
Table: Test 1bp limits
all_tests.head()  #.query('query == "hama_edge_1bp"')
|   | tissue | pc_scale | query | annot | test | value | p | -log10p |
|---|--------|----------|-------|-------|------|-------|---|---------|
| 0 | fibroblast | arms | ECH90 | comp_edge_1bp | jaccard | 0.000002 | 0.01656 | 1.780940 |
| 1 | fibroblast | arms | hama_edge_1bp | comp_edge_1bp | proximity | 0.221717 | 0.88421 | 0.053445 |
| 2 | fibroblast | arms | olive_edge_1bp | comp_edge_1bp | proximity | 0.243643 | 0.67235 | 0.172405 |
| 3 | fibroblast | arms | olivehama_edge_1bp | comp_edge_1bp | proximity | 0.243643 | 0.74099 | 0.130188 |
| 4 | fibroblast | 10Mb | ECH90 | comp_edge_1bp | jaccard | 0.080105 | 0.02146 | 1.668370 |
tissue_order = ['fibroblast', 'spermatogonia', 'pachytene_spermatocyte', 'round_spermatid', 'sperm']
tissue_sname = ['Fb', 'Spa', 'Pac', 'RS', 'Sperm']

t0 = (svedig_tabel(all_tests
                   .map(lambda x: x.replace('_edge_1bp', '') if isinstance(x, str) else x)
                   .rename(columns={'pc_scale': 'viewframe'}),
                   index=['viewframe', 'test', 'query'],
                   columns=["tissue"],
                   col_order=tissue_order,
                   col_snames=tissue_sname,
                   values='p',
                   cmap='Reds_r')
      )

t1 = (svedig_tabel(
          all_tests
          .map(lambda x: x.replace('_edge_1bp', '') if isinstance(x, str) else x)
          .query('pc_scale == "arms"'),
          index=['test', 'query'],
          columns=["tissue"],
          col_order=tissue_order,
          col_snames=tissue_sname,
          values='p',
          cmap='Reds_r')
      #.relabel_index(labels=[lbl for lbl in tissue_sname if lbl in t1.columns], axis=1)
      )

t2 = (svedig_tabel(all_tests
                   .map(lambda x: x.replace('_edge_1bp', '') if isinstance(x, str) else x)
                   .query('pc_scale == "10Mb"'),
                   index=['test', 'query'],
                   columns=["tissue"],
                   col_order=tissue_order,
                   col_snames=tissue_sname,
                   values='p',
                   cmap='Reds_r')
      )
display(t0)
| viewframe | test | query | Fb | Spa | Pac | RS | Sperm |
|-----------|------|-------|----|-----|-----|----|-------|
| 10Mb | jaccard | ECH90 | 0.021 | 0.006 | nan | nan | 0.048 |
| 10Mb | proximity | olive | nan | nan | nan | nan | 0.016 |
| 10Mb | proximity | olivehama | nan | 0.019 | 0.012 | nan | 0.001 |
| arms | jaccard | ECH90 | 0.017 | 0.006 | 0.005 | 0.028 | nan |
| arms | proximity | olivehama | nan | nan | 0.036 | nan | nan |
tissue_order = ['fibroblast', 'spermatogonia', 'pachytene_spermatocyte', 'round_spermatid', 'sperm']
tissue_sname = ['Fb', 'Spa', 'Pac', 'RS', 'Sperm']

df = (all_tests
      .map(lambda x: x.replace('_edge_1bp', '_1bp') if isinstance(x, str) else x)
      .assign(log10p=np.log10(all_tests.p))
      .loc[(all_tests.p < 0.05)]
      .pivot(index=['pc_scale', 'query', 'test'], columns=["tissue"], values="p")
      .filter(tissue_order)
      )

df = df.rename(columns={x: tissue_sname[i] for i, x in enumerate(df.columns.tolist())})
df.columns.name = None
df = df.reset_index().fillna('-')
df = df.rename(columns={'pc_scale': 'View', 'query': 'Query', 'test': 'Test'})
df.style.hide(axis='index')
| View | Query | Test | Fb | Spa | Pac | RS | Sperm |
|------|-------|------|----|-----|-----|----|-------|
| 10Mb | ECH90 | jaccard | 0.021460 | 0.006110 | - | - | 0.047500 |
| 10Mb | olive_1bp | proximity | - | - | - | - | 0.016140 |
| 10Mb | olivehama_1bp | proximity | - | 0.018790 | 0.012300 | - | 0.001280 |
| arms | ECH90 | jaccard | 0.016560 | 0.006120 | 0.005340 | 0.027690 | - |
| arms | olivehama_1bp | proximity | - | - | 0.035640 | - | - |
Figures for the defence
%config InlineBackend.figure_formats = ['retina']
= clr_df.query('name == "sperm" and resolution == "500" and run == "recPE"')
group
= "../slides/images/hic-example.png"
slide_path
=['10Mb'],
plot_for_quarto(group, view_order=1e-4, col_vmax=0.5,
col_vmin=slide_path) save_png_name
Introduce selected regions
Compare liftover
from genominterv import interval_union
track0 = [olive_panu3, hama_panu3]
track1 = [high_olive, high_hama]
track2 = [high_olive_ucsc, high_hama_ucsc]

track0 = [interval_union(olive_panu3, hama_panu3)]
track1 = [interval_union(high_olive, high_hama)]
track2 = [interval_union(high_olive_ucsc, high_hama_ucsc)]

for t0, t1, t2 in zip(track0, track1, track2):
    plot_regions(t0, t1, t2,
                 track_titles=['Panu_3.0', 'segment_lift', 'UCSCliftOver'],
                 # title='Baboon Intervals',
                 title=None,
                 figsize=(10, 1.5),
                 alpha=0.8, lw=0.2)
ECH S100 limit intersect
annot = make_edges(comps['recPE']['fibroblast_100kb_arms'], 2)
query = ech90

plot_regions(query=query, annot=annot,
             intersect=interval_intersect(query, annot),
             track_titles=['ECH90', 'Limit', 'Intersect'],
             figsize=(10, 1.5),
             title="Spermatogonia 1bp limit",
             lw=0.6, alpha=1)

# Plot
size = (3.2, 0.8)
edge_rs = edges['recPE']['sperm_100kb_arms']

for name, group in baboon_dict.items():
    plot_regions(query=edge_rs, annot=group,
                 intersect=interval_intersect(edge_rs, group),
                 #intersect=ech90,
                 track_titles=['S100', fr"$\it{{{name}}}$", 'Intersect'],
                 figsize=size,
                 title=None,
                 lw=0.3, alpha=0.9)

edge_rs = edges['recPE']['sperm_100kb_10Mb']

for name, group in baboon_dict.items():
    plot_regions(query=edge_rs, annot=group,
                 intersect=interval_intersect(edge_rs, group),
                 #intersect=ech90,
                 track_titles=['S100', fr"$\it{{{name}}}$", 'Intersect'],
                 figsize=size,
                 title=None,
                 lw=0.2, alpha=1)