from scipy.stats import fisher_exact
import pandas as pd
def read_google_sheet():
    """Download the gene-list Google Sheet and return it as a DataFrame.

    The sheet is requested as CSV through the spreadsheet's ``gviz/tq``
    export URL and parsed with ``header=1``, i.e. the second CSV row is
    used for the column names. Columns whose name starts with
    ``'Unnamed'`` are dropped.

    Returns:
        pandas.DataFrame: the sheet restricted to its named columns.
    """
    sheet_id = '1JSjSLuto3jqdEnnG7JqzeC_1pUZw76n7XueVAYrUOpk'
    sheet_name = 'Sheet1'
    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    df = pd.read_csv(url, header=1)
    # Boolean column mask: keep everything except the 'Unnamed...' columns.
    named = [not col.startswith('Unnamed') for col in df.columns]
    return df.loc[:, named]
def gene_list_names():
    """Return the column names of the gene-list sheet, sorted.

    Every call fetches the sheet again via ``read_google_sheet()``.

    Returns:
        list: the sheet's column names in ascending order.
    """
    df = read_google_sheet()
    return sorted(df.columns.tolist())
def gene_list(name):
    """Return the non-null entries of one column of the gene-list sheet.

    Every call fetches the sheet again via ``read_google_sheet()``.

    Args:
        name: column label in the sheet (see ``gene_list_names()``).

    Returns:
        pandas.Series: the values of column *name* with null cells removed
        (the original index labels are kept).

    Raises:
        KeyError: if *name* is not a column of the sheet.
    """
    df = read_google_sheet()
    # Presumably columns differ in length and the shorter ones are padded
    # with nulls in the CSV export, hence the dropna() -- TODO confirm.
    return df[name].dropna()
# Gene sets used by the tests below, one per sheet column.
# Note: each gene_list() call downloads the sheet again.
all_set = set(gene_list('all_npx'))
asd_set = set(gene_list('sfari_all_conf'))
spermatid_set = set(gene_list('meritxell_spermatid_expr'))
brain_set = set(gene_list('hpa_brain_prot'))
neuron_set = set(gene_list('matos_neuron'))
# GO enrichment analysis
# Candidate genes found under positive selection by RELATE.
# The commented-out entries are kept for reference; NOTE(review): they look
# like non-coding / uncharacterised loci (LINC*, LOC*, MIR*, *-AS*) that were
# deliberately excluded -- confirm.
selection_set = {
    'ACSL4',
    'AMMECR1',
    'ARSL',
    'BCOR',
    'CASK',
    'CDKL5',
    'CLCN4',
    'CLCN5',
    'DMD',
    'ENOX2',
    'FAM120C',
    'FRMPD4',
    'G6PD',
    'GNL3L',
    'HUWE1',
    'IGSF1',
    'IL1RAPL1',
    # 'LINC01278',
    # 'LOC101928359',
    # 'LOC105377212',
    # 'LOC112268307',
    # 'LOC124905191',
    'MAGT1',
    'MAMLD1',
    # 'MIR325HG',
    'NHS',
    'NYX',
    'PAK3',
    'PASD1',
    'PHKA1',
    'PRKX',
    'PTCHD1',
    # 'PTCHD1-AS',
    'RAB33A',
    # 'RAP2C-AS1',
    'RTL4',
    'TENM1',
    'TMEM164',
    'TMLHE',
    'WWC3',
    'XPNPEP2',
    'ZMYM3',
    'ZNF185',
}
print(len(selection_set))
# Recorded output: 35
# ASD-related genes are more likely to be expressed in brain+spermatids than in only brain
# One-sided Fisher's exact test inside the ASD gene set.
M = len(asd_set)                                # ASD genes (universe)
N = len(asd_set & brain_set)                    # ... also in the brain set
n = len(asd_set & spermatid_set)                # ... also in the spermatid set
x = len(asd_set & spermatid_set & brain_set)    # ... in both
# 2x2 contingency table:
#                      brain     not brain
#   spermatid            x         n - x
#   not spermatid      N - x   M - (n + N) + x
table = [[x, n - x],
         [N - x, M - (n + N) + x]]
# Printed explicitly so the result is also shown when run as a script,
# where a bare trailing expression would be discarded.
print((table, fisher_exact(table, alternative='greater').pvalue))
# Recorded output: ([[66, 2], [5, 4]], 0.0012471014633956233)
# H1: Positively selected genes are enriched for ASD
# One-sided Fisher's exact test inside the full gene set (all_set).
M = len(all_set)                                # all genes (universe)
N = len(all_set & asd_set)                      # ... that are ASD genes
n = len(all_set & selection_set)                # ... under positive selection
x = len(all_set & asd_set & selection_set)      # ... both
# 2x2 contingency table:
#                      ASD       not ASD
#   selected             x         n - x
#   not selected       N - x   M - (n + N) + x
table = [[x, n - x],
         [N - x, M - (n + N) + x]]
# Printed explicitly so the result is also shown when run as a script,
# where a bare trailing expression would be discarded.
print((table, fisher_exact(table, alternative='greater').pvalue))
# Recorded output: ([[9, 26], [68, 1309]], 5.947106735011354e-05)
# H2: Positive selection is more common in genes that are involved in both spermatid and neuron than just in neuron
# One-sided Fisher's exact test inside brain_set.
# NOTE(review): the hypothesis says "neuron" but the universe used here is
# brain_set, not neuron_set -- confirm this is intended.
M = len(brain_set)                                      # brain genes (universe)
N = len(brain_set & spermatid_set)                      # ... also in the spermatid set
n = len(brain_set & selection_set)                      # ... under positive selection
x = len(brain_set & spermatid_set & selection_set)      # ... both
# 2x2 contingency table:
#                    spermatid   not spermatid
#   selected             x          n - x
#   not selected       N - x    M - (n + N) + x
table = [[x, n - x],
         [N - x, M - (n + N) + x]]
# Printed explicitly so the result is also shown when run as a script,
# where a bare trailing expression would be discarded.
print((table, fisher_exact(table, alternative='greater').pvalue))
# Recorded output: ([[29, 2], [371, 160]], 0.001915857329301735)