GO enrichment analysis

from scipy.stats import fisher_exact
import pandas as pd

def read_google_sheet():
    """Download the gene-list spreadsheet and return it as a DataFrame.

    The sheet is requested as CSV from the Google Sheets ``gviz`` export
    endpoint on every call (nothing is cached). Row index 1 of the CSV is
    used as the header row, and every column whose label starts with
    ``'Unnamed'`` (the label pandas assigns to blank header cells) is
    dropped from the result.
    """
    sheet_id = '1JSjSLuto3jqdEnnG7JqzeC_1pUZw76n7XueVAYrUOpk'
    sheet_name = 'Sheet1'
    url = (
        f'https://docs.google.com/spreadsheets/d/{sheet_id}'
        f'/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    )
    table = pd.read_csv(url, header=1)
    # Boolean mask over the columns: True for every column with a real name.
    is_named = [not label.startswith('Unnamed') for label in table.columns]
    return table.loc[:, is_named]
    
def gene_list_names():
    """Return the column names of the spreadsheet as a sorted list."""
    names = list(read_google_sheet().columns)
    names.sort()
    return names

def gene_list(name):
    """Return the non-null entries of spreadsheet column *name* as a Series.

    The sheet is downloaded anew on each call. Indexing with a column name
    that is not in the sheet raises ``KeyError``.
    """
    column = read_google_sheet()[name]
    # Columns differ in length, so shorter ones are padded with nulls; keep
    # only the cells that actually hold a value.
    return column[column.notnull()]
    
# Download the spreadsheet once and build every gene set from that single
# snapshot. Calling gene_list() once per column would fetch the whole sheet
# five times and could mix rows from different revisions of the sheet.
_sheet = read_google_sheet()

# Each set holds the non-null cells of one spreadsheet column (the same
# filtering gene_list() applies).
all_set = set(_sheet['all_npx'].dropna())  # used as the background universe in H1
asd_set = set(_sheet['sfari_all_conf'].dropna())
spermatid_set = set(_sheet['meritxell_spermatid_expr'].dropna())
brain_set = set(_sheet['hpa_brain_prot'].dropna())
neuron_set = set(_sheet['matos_neuron'].dropna())

# Candidate genes found under positive selection by RELATE.
#
# Left out of the set on purpose (they were commented out in the original list):
#   LINC01278, LOC101928359, LOC105377212, LOC112268307, LOC124905191,
#   MIR325HG, PTCHD1-AS, RAP2C-AS1
# NOTE(review): these look like non-coding / uncharacterised loci -- confirm
# that this is the reason they are excluded.
selection_set = {
    'ACSL4', 'AMMECR1', 'ARSL', 'BCOR', 'CASK',
    'CDKL5', 'CLCN4', 'CLCN5', 'DMD', 'ENOX2',
    'FAM120C', 'FRMPD4', 'G6PD', 'GNL3L', 'HUWE1',
    'IGSF1', 'IL1RAPL1', 'MAGT1', 'MAMLD1', 'NHS',
    'NYX', 'PAK3', 'PASD1', 'PHKA1', 'PRKX',
    'PTCHD1', 'RAB33A', 'RTL4', 'TENM1', 'TMEM164',
    'TMLHE', 'WWC3', 'XPNPEP2', 'ZMYM3', 'ZNF185',
}
# Report how many candidate genes are in the set.
print(len(selection_set))

# ASD-related genes are more likely to be expressed in brain+spermatids than
# in only brain.
# Universe: asd_set. Tests whether membership of spermatid_set and membership
# of brain_set are associated among the ASD genes.
M = len(asd_set)                                # universe size
N = len(asd_set & brain_set)                    # ASD genes in brain_set
n = len(asd_set & spermatid_set)                # ASD genes in spermatid_set
x = len(asd_set & spermatid_set & brain_set)    # ASD genes in both

# 2x2 contingency table
#   rows:    in spermatid_set / not in spermatid_set
#   columns: in brain_set     / not in brain_set
table = [
    [x, n - x],
    [N - x, M - (n + N) + x],
]
# One-sided Fisher exact test; in a notebook this last expression displays
# the table together with its p-value.
table, fisher_exact(table, alternative='greater').pvalue

([[66, 2], [5, 4]], 0.0012471014633956233)

# H1: Positively selected genes are enriched for ASD.
# Universe: all_set. Both the ASD genes and the selected genes are restricted
# to that universe before counting.
M = len(all_set)                                # universe size
N = len(all_set & asd_set)                      # ASD genes in the universe
n = len(all_set & selection_set)                # selected genes in the universe
x = len(all_set & asd_set & selection_set)      # selected ASD genes

# 2x2 contingency table
#   rows:    in selection_set / not in selection_set
#   columns: in asd_set       / not in asd_set
table = [
    [x, n - x],
    [N - x, M - (n + N) + x],
]
# One-sided Fisher exact test; in a notebook this last expression displays
# the table together with its p-value.
table, fisher_exact(table, alternative='greater').pvalue

([[9, 26], [68, 1309]], 5.947106735011354e-05)

# H2: Positive selection is more common in genes that are involved in both
# spermatid and neuron than just in neuron.
# NOTE(review): the hypothesis is worded in terms of neurons, but the universe
# used below is brain_set; neuron_set is defined earlier and never used.
# Confirm which gene list is intended.
M = len(brain_set)                                    # universe size
N = len(brain_set & spermatid_set)                    # brain genes in spermatid_set
n = len(brain_set & selection_set)                    # brain genes in selection_set
x = len(brain_set & spermatid_set & selection_set)    # brain genes in both

# 2x2 contingency table
#   rows:    in selection_set / not in selection_set
#   columns: in spermatid_set / not in spermatid_set
table = [
    [x, n - x],
    [N - x, M - (n + N) + x],
]
# One-sided Fisher exact test; in a notebook this last expression displays
# the table together with its p-value.
table, fisher_exact(table, alternative='greater').pvalue

([[29, 2], [371, 160]], 0.001915857329301735)