‘LINC01278’,

from scipy.stats import fisher_exact
import pandas as pd

def read_google_sheet(sheet_id='1JSjSLuto3jqdEnnG7JqzeC_1pUZw76n7XueVAYrUOpk',
                      sheet_name='Sheet1'):
    """Download one tab of a Google Sheet as CSV and return it as a DataFrame.

    Args:
        sheet_id: Identifier of the spreadsheet, as it appears in its URL.
            Defaults to the sheet this notebook has always used.
        sheet_name: Name of the tab to export. Defaults to 'Sheet1'.

    Returns:
        A DataFrame whose header is taken from the second row of the export
        (``header=1``), with every column whose label starts with 'Unnamed'
        removed.

    Note:
        Every call performs a network request; nothing is cached.
    """
    # Local import so the block stays self-contained.
    from urllib.parse import quote

    # Percent-encode the tab name so names with spaces or '&' do not break
    # the query string; plain names such as 'Sheet1' are left unchanged.
    url = (
        f'https://docs.google.com/spreadsheets/d/{sheet_id}'
        f'/gviz/tq?tqx=out:csv&sheet={quote(sheet_name)}'
    )
    df = pd.read_csv(url, header=1)
    # pandas labels header-less columns 'Unnamed: N'; str() guards against
    # non-string labels, which have no .startswith().
    keep = [not str(label).startswith('Unnamed') for label in df.columns]
    return df.loc[:, keep]
    
def gene_list_names():
    """Return the column labels of the gene sheet as a sorted list."""
    names = read_google_sheet().columns.tolist()
    names.sort()
    return names

def gene_list(name):
    """Return the non-null entries of the sheet column called ``name``.

    The sheet is fetched via read_google_sheet() on every call.
    """
    column = read_google_sheet()[name]
    has_value = column.notna()
    return column[has_value]
    
# Download the sheet once and build every gene set from that single frame.
# Going through gene_list() for each column would fetch the sheet five times
# and could mix snapshots if the sheet is edited in between.
_sheet = read_google_sheet()

# Each set holds the non-null entries of one sheet column.
all_set = set(_sheet['all_npx'].dropna())
asd_set = set(_sheet['sfari_all_conf'].dropna())
spermatid_set = set(_sheet['meritxell_spermatid_expr'].dropna())
brain_set = set(_sheet['hpa_brain_prot'].dropna())
neuron_set = set(_sheet['matos_neuron'].dropna())

# Report the size of each set, in the order they were defined.
for _gene_set in (all_set, asd_set, spermatid_set, brain_set, neuron_set):
    print(len(_gene_set))
1412
77
759
562
466

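I removed the unannotated loci (the LOC… entries), the non-coding RNA genes (LINC01278 and the microRNA host gene MIR325HG) and the antisense genes (PTCHD1-AS, RAP2C-AS1). That leaves only well-annotated protein-coding genes.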

# Gene symbols that make up the selection set.
# Deliberately left out of the set (kept here for the record):
#   LINC01278, LOC101928359, LOC105377212, LOC112268307, LOC124905191,
#   MIR325HG, PTCHD1-AS, RAP2C-AS1
selection_set = {
    'ACSL4', 'AMMECR1', 'ARSL', 'BCOR', 'CASK',
    'CDKL5', 'CLCN4', 'CLCN5', 'DMD', 'ENOX2',
    'FAM120C', 'FRMPD4', 'G6PD', 'GNL3L', 'HUWE1',
    'IGSF1', 'IL1RAPL1', 'MAGT1', 'MAMLD1', 'NHS',
    'NYX', 'PAK3', 'PASD1', 'PHKA1', 'PRKX',
    'PTCHD1', 'RAB33A', 'RTL4', 'TENM1', 'TMEM164',
    'TMLHE', 'WWC3', 'XPNPEP2', 'ZMYM3', 'ZNF185',
}
len(selection_set)

# Selection-set genes that are in the brain set, and how many of those are
# also in the spermatid set.
# The count previously read from `neuron_sel`, a name this notebook never
# assigns (NameError on a fresh kernel); `brain_sel` is the set built here.
brain_sel = selection_set.intersection(brain_set)
print(len(brain_sel.intersection(spermatid_set)))
29
# Selection-set genes that are in the neuron set.
x = selection_set & neuron_set
print(x, len(x), end='\n\n')

# Selection-set genes that are in the brain set.
y = selection_set & brain_set
print(y, len(y), end='\n\n')

# Selection-set genes that are in both the neuron and the brain set.
brain_neuron = x & y
print(brain_neuron, len(brain_neuron))

# NYX is expressed in neurons but not in brain --> ?
# IL1RAPL1 and ARSL are brain genes but are not expressed in neurons --> OKAY
{'CLCN5', 'CLCN4', 'TMEM164', 'MAGT1', 'IGSF1', 'PRKX', 'PTCHD1', 'FAM120C', 'ZNF185', 'RAB33A', 'PAK3', 'FRMPD4', 'ACSL4', 'PHKA1', 'NYX', 'ZMYM3', 'G6PD', 'GNL3L', 'DMD', 'NHS', 'MAMLD1', 'HUWE1', 'ENOX2', 'AMMECR1', 'TENM1', 'TMLHE', 'WWC3', 'CASK', 'BCOR', 'CDKL5'} 30

{'CLCN5', 'IL1RAPL1', 'ARSL', 'CLCN4', 'TMEM164', 'MAGT1', 'IGSF1', 'PRKX', 'PTCHD1', 'FAM120C', 'ZNF185', 'RAB33A', 'PAK3', 'FRMPD4', 'ACSL4', 'PHKA1', 'ZMYM3', 'G6PD', 'GNL3L', 'DMD', 'NHS', 'MAMLD1', 'HUWE1', 'ENOX2', 'AMMECR1', 'TENM1', 'TMLHE', 'WWC3', 'CASK', 'BCOR', 'CDKL5'} 31

{'CLCN5', 'CLCN4', 'TMEM164', 'MAGT1', 'IGSF1', 'PRKX', 'PTCHD1', 'FAM120C', 'ZNF185', 'RAB33A', 'PAK3', 'FRMPD4', 'ACSL4', 'PHKA1', 'ZMYM3', 'G6PD', 'GNL3L', 'DMD', 'NHS', 'MAMLD1', 'HUWE1', 'ENOX2', 'AMMECR1', 'TENM1', 'TMLHE', 'WWC3', 'CASK', 'BCOR', 'CDKL5'} 29
# Selection-set genes that are in both the neuron set and the spermatid set.
z = selection_set & neuron_set & spermatid_set
print(z, len(z), end='\n\n')

# Every gene shared by the spermatid set and the neuron set.
t = spermatid_set & neuron_set
print(t, len(t))
{'CLCN5', 'CLCN4', 'TMEM164', 'MAGT1', 'PRKX', 'PTCHD1', 'FAM120C', 'ZNF185', 'RAB33A', 'PAK3', 'FRMPD4', 'ACSL4', 'PHKA1', 'ZMYM3', 'G6PD', 'GNL3L', 'DMD', 'NHS', 'MAMLD1', 'HUWE1', 'ENOX2', 'AMMECR1', 'TENM1', 'TMLHE', 'WWC3', 'CASK', 'BCOR', 'CDKL5'} 28

{'LANCL3', 'BRWD3', 'PCDH11X', 'ZFX', 'ATG4A', 'PIR', 'NUP62CL', 'MAGT1', 'MOSPD1', 'TBC1D8B', 'PRKX', 'BCORL1', 'AP1S2', 'PUDP', 'KLHL13', 'NHS', 'MAMLD1', 'RBMX2', 'HPRT1', 'ENOX2', 'MID1IP1', 'GLRA2', 'TXLNG', 'TIMP1', 'BEX4', 'ARHGEF9', 'PNPLA4', 'OFD1', 'PCSK1N', 'CUL4B', 'AIFM1', 'FRMPD3', 'TSPAN6', 'LAS1L', 'ATP6AP1', 'HMGB3', 'ARAF', 'NLGN4X', 'SCML2', 'MBTPS2', 'GLA', 'BCAP31', 'PHF6', 'AMMECR1', 'CCDC120', 'PHEX', 'RBM41', 'TFE3', 'HCCS', 'PORCN', 'LDOC1', 'CA5B', 'RBMX', 'ZDHHC15', 'CSTF2', 'ELK1', 'CLCN4', 'NONO', 'EBP', 'SLC9A7', 'ARMCX4', 'DMD', 'RENBP', 'YIPF6', 'SYN1', 'WNK3', 'WWC3', 'GRIA3', 'MECP2', 'ABCD1', 'RPL10', 'MCTS1', 'APOOL', 'PRICKLE3', 'FUNDC2', 'ZNF81', 'IGBP1', 'MED14', 'ARSD', 'SH3BGRL', 'IRAK1', 'MAP7D2', 'GABRA3', 'PHKA2', 'GEMIN8', 'FTSJ1', 'STAG2', 'PLXNA3', 'LAGE3', 'PRPS1', 'CTPS2', 'HDAC6', 'PPP1R3F', 'TMEM164', 'MSL3', 'FRMPD4', 'FGD1', 'RRAGB', 'EMD', 'EIF2S3', 'RADX', 'RBBP7', 'CD99L2', 'RPS4X', 'RBM3', 'RAB9B', 'NKRF', 'ACOT9', 'TAFAZZIN', 'UBE2A', 'PNCK', 'RAI2', 'PABIR3', 'ZMYM3', 'CYSLTR1', 'DYNLT3', 'FLNA', 'HSD17B10', 'MAOB', 'ZDHHC9', 'HDX', 'AFF2', 'ADGRG2', 'GLOD5', 'ABCB7', 'CDKL5', 'GPM6B', 'EFHC2', 'IL13RA1', 'DDX3X', 'SCML1', 'TSPYL2', 'TAF1', 'MORF4L2', 'DACH2', 'PHKA1', 'LRCH2', 'KRBOX4', 'KLF8', 'MBNL3', 'OCRL', 'FGF13', 'KLHL4', 'STEEP1', 'SH3KBP1', 'FAM9B', 'EIF1AX', 'GDI1', 'CETN2', 'ZNF41', 'CHM', 'TMEM47', 'NDUFA1', 'RPL39', 'DKC1', 'VMA21', 'ZNF182', 'FHL1', 'DLG3', 'RAB33A', 'RIBC1', 'ACSL4', 'MCF2', 'PHF8', 'CITED1', 'ZNF674', 'TIMM17B', 'MED12', 'PRAF2', 'HUWE1', 'RAB9A', 'RS1', 'CHIC1', 'MPP1', 'ZC3H12B', 'MAP7D3', 'PPEF1', 'PQBP1', 'FMR1', 'CDK16', 'PCDH19', 'IDS', 'PGK1', 'DIAPH2', 'FANCB', 'SNX12', 'SLC16A2', 'ALG13', 'SMC1A', 'HAUS7', 'G6PD', 'PDHA1', 'ANOS1', 'MID1', 'SLC25A53', 'NAA10', 'C1GALT1C1', 'GPR173', 'MAP3K15', 'AMOT', 'GK', 'ZNF75D', 'FAM199X', 'FAM50A', 'TSPAN7', 'PTCHD1', 'FUNDC1', 'NDUFB11', 'MORC4', 'JADE3', 'MMGT1', 'GNL3L', 'WDR13', 'ADGRG4', 'ZNF449', 'RP2', 'CNKSR2', 
'MAGED1', 'OTUD5', 'PGRMC1', 'ARHGAP6', 'OPHN1', 'TMSB4X', 'NLGN3', 'RPS6KA3', 'LONRF3', 'RAP2C', 'BEX1', 'MOSPD2', 'TAB3', 'FAM9C', 'TAF9B', 'TENM1', 'TMLHE', 'CASK', 'PBDC1', 'GPC3', 'UBA1', 'HNRNPH2', 'SAT1', 'RPGR', 'GYG2', 'KLHL15', 'CLCN5', 'MSN', 'FAM133A', 'PJA1', 'PDZD11', 'RPS6KA6', 'ATP7A', 'PCYT1B', 'SLC25A14', 'ARHGAP4', 'POLA1', 'ZMAT1', 'ZNF185', 'TCEAL4', 'XK', 'MAGED2', 'KDM6A', 'CHST7', 'CHRDL1', 'RPL36A', 'GSPT2', 'TMEM187', 'CFAP47', 'USP9X', 'RAB41', 'FAAH2', 'FAM3A', 'TCEAL8', 'ZIC3', 'HCFC1', 'PABIR2', 'PLP2', 'CLIC2', 'REPS2', 'SLC35A2', 'UPF3B', 'UXT', 'DNASE1L1', 'HTATSF1', 'MID2', 'ARHGEF6', 'OGT', 'RLIM', 'HMGN5', 'PDK3', 'PRDX4', 'UTP14A', 'SYAP1', 'TKTL1', 'BCOR', 'BEX2', 'TSC22D3', 'GPR82', 'CXorf58', 'TCEAL1', 'WDR45', 'RHOXF1', 'NXT2', 'PIGA', 'USP11', 'SYP', 'GUCY2F', 'XIAP', 'PLS3', 'IDH3G', 'SMS', 'PSMD10', 'TBL1X', 'TSR2', 'GPKOW', 'ATP11C', 'SLC25A5', 'ZRSR2', 'SHROOM2', 'ZNF280C', 'SLC6A8', 'CXorf38', 'SSR4', 'CMC4', 'PRRG1', 'COX7B', 'UPRT', 'NKAP', 'TIMM8A', 'STS', 'APOO', 'ZBTB33', 'GRIPAP1', 'SLC9A6', 'ATRX', 'SEPTIN6', 'BRCC3', 'HDAC8', 'PIN4', 'TRMT2B', 'OTUD6A', 'USP51', 'ZC4H2', 'TBC1D25', 'F8', 'ZNF711', 'THOC2', 'KDM5C', 'FAM120C', 'PAK3', 'ATP6AP2', 'WDR44', 'SMARCA1', 'MAOA', 'VBP1', 'PDZD4', 'LAMP2'} 355
# One-sided Fisher exact test on a 2x2 table built from fixed counts.
M = 466  # universe size used for the table
N = 355  # margin of the first column
n = 30   # margin of the first row
x = 28   # count in the top-left cell (in both margins)

first_row = [x, n - x]
second_row = [N - x, M - (n + N) + x]
table = [first_row, second_row]
table, fisher_exact(table, alternative='greater').pvalue
([[28, 2], [327, 109]], 0.012810619118361193)
# ASD-related genes: is membership in the spermatid set associated with
# membership in the brain set? (one-sided Fisher exact test within asd_set)
# NOTE(review): the earlier heading for this cell spoke of "neurons", but the
# sets intersected here are brain_set and spermatid_set -- confirm which was meant.
M = len(asd_set)
N = len(asd_set & brain_set)
n = len(asd_set & spermatid_set)
x = len(asd_set & spermatid_set & brain_set)

# Rows: in / not in spermatid_set. Columns: in / not in brain_set.
table = [[x, n - x], [N - x, M - n - N + x]]
table, fisher_exact(table, alternative='greater').pvalue
([[66, 2], [5, 4]], 0.0012471014633956233)
# H1: Positively selected genes are enriched for ASD
# Universe is all_set; one-sided Fisher exact test.
M = len(all_set)
N = len(all_set & asd_set)
n = len(all_set & selection_set)
x = len(all_set & asd_set & selection_set)

# Rows: in / not in selection_set. Columns: in / not in asd_set.
selected_row = [x, n - x]
unselected_row = [N - x, M - n - N + x]
table = [selected_row, unselected_row]
table, fisher_exact(table, alternative='greater').pvalue
([[9, 26], [68, 1309]], 5.947106735011354e-05)
# H2: Positive selection is more common in genes that are in both the
# spermatid set and the brain set than in genes that are only in the brain set.
# NOTE(review): the earlier heading said "neuron", but this cell uses
# brain_set as its universe -- confirm which was meant.
M = len(brain_set)
N = len(brain_set & spermatid_set)
n = len(brain_set & selection_set)
x = len(brain_set & spermatid_set & selection_set)

# Rows: in / not in selection_set. Columns: in / not in spermatid_set.
table = [[x, n - x], [N - x, M - n - N + x]]
table, fisher_exact(table, alternative='greater').pvalue
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 2
      1 # H2: Positive selection is more common in genes that are involved in both spermatid and neuron than just in neuron
----> 2 M = len(brain_set)
      3 N = len(brain_set.intersection(spermatid_set))
      4 n = len(brain_set.intersection(selection_set))

NameError: name 'brain_set' is not defined
# H2: Positive selection is more common in ASD genes also expressed in spermatids
# Universe is asd_set; one-sided Fisher exact test.
M = len(asd_set)
N = len(asd_set & spermatid_set)
n = len(asd_set & selection_set)
x = len(asd_set & spermatid_set & selection_set)

# Rows: in / not in selection_set. Columns: in / not in spermatid_set.
top = [x, n - x]
bottom = [N - x, M - n - N + x]
table = [top, bottom]
table, fisher_exact(table, alternative='greater').pvalue
([[9, 0], [59, 9]], 0.30547534876941307)
# H2: Positive selection is more common in genes that are involved in both spermatid and neuron than just in neuron
# Universe is neuron_set; one-sided Fisher exact test.
M = len(neuron_set)
N = len(neuron_set.intersection(spermatid_set))
# n must be counted inside the same universe as M, N and x. It used to be
# hard-coded to 31, which is not the number of selection-set genes found in
# neuron_set and made the table margins inconsistent.
n = len(neuron_set.intersection(selection_set))
x = len(neuron_set.intersection(spermatid_set).intersection(selection_set))

# Rows: in / not in selection_set. Columns: in / not in spermatid_set.
table = [[  x,           n - x          ],
         [ N - x,        M - (n + N) + x]]
table, fisher_exact(table, alternative='greater').pvalue
([[28, 3], [327, 108]], 0.03721007992246583)