Detect selection in populations

library("ggplot2")
library("gridExtra")
library("ggpubr")
library("RColorBrewer")

RELATE

AFRICAN POPULATIONS

LWK <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/LWK/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(LWK)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781642 . 1 1 1 1 1 1 1 1 -0.1336880 -0.1734100 -0.2221450 -0.3321430 -0.3804730 -2.01040e-06 -0.000017053 0 -0.450077 -0.0978628
2 2781865 . 1 1 1 1 1 1 1 1 -2.4857200 -3.0492300 -1.8755900 -2.8888600 -2.4228300 -1.39308e+00 -1.576160000 0 -2.019680 -1.8051100
3 2781927 . 1 1 1 1 1 1 1 1 -0.0954429 -0.0623195 -0.0632732 -0.1635510 -0.4259450 -7.05956e-02 -0.034994400 0 -0.228440 -0.5814090
4 2782572 . 1 1 1 1 1 1 1 1 -0.1458270 -0.2778760 -0.3514220 -0.0907462 -0.0452048 -1.40217e-01 -0.176239000 0 -0.167312 -0.0570220
5 2783658 . 1 1 1 1 1 1 1 1 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 -4.91135e-01 -0.676691000 0 -1.409400 -0.4134500
6 2784657 . 1 1 1 1 1 1 1 1 -0.1243950 -0.1766240 -0.4322030 -0.4919260 -0.6999360 -7.43392e-01 -0.332010000 0 -0.212519 -1.0023100
LWK_plot <- plot(LWK$pos, -(LWK$when_mutation_has_freq2), pch = 19, col = "#8DA0CB", xlab = "Genomic Position", ylab = "log10(p-value)", main = "LWK",cex = 1.5, yaxp = c(0, 10, 5),cex.lab = 1.2)

pdf("LWK_plot.pdf", width = 20, height = 5)
LWK_plot <- plot(LWK$pos, -(LWK$when_mutation_has_freq2), 
                 pch = 19, col = "#8DA0CB", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "LWK",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2
ESN <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/ESN/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(ESN)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781514 . 1 1 1 1 1 1 1 1 -0.241670 -0.163302 -0.211304 -0.385895 -0.654856 -0.342752 -0.492519 0 -0.418871 -0.757029
2 2781584 . 1 1 1 1 1 1 1 1 -0.327150 -0.196983 -0.174943 -0.153555 -0.248025 -0.261389 -0.445471 0 -0.293212 -0.658046
3 2781635 . 1 1 1 1 1 1 1 1 -0.486129 -0.319048 -0.511543 -0.744948 -0.701433 -0.315941 -0.226448 0 -0.849279 -0.203300
4 2781642 . 1 1 1 1 1 1 1 1 -0.355773 -0.505792 -0.598037 -0.100054 -0.162419 -0.290461 -0.382397 0 -0.320757 -0.190161
5 2781865 . 1 1 1 1 1 1 1 1 -0.584338 -0.299828 -0.218171 -0.373706 -0.187458 -0.249755 -0.502119 0 -0.312090 -0.580090
6 2781927 . 1 1 1 1 1 1 1 1 -0.600341 -0.785731 -0.495711 -0.520437 -0.477516 -1.343730 -1.212490 0 -0.488977 -0.879552
ESN_plot <- plot(ESN$pos, -(ESN$when_mutation_has_freq2), pch=19, col = "#66C2A5", xlab="Genomic Position", ylab="log10(p-value)", main ="ESN", cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("ESN_plot.pdf", width = 20, height = 5)
ESN_plot <- plot(ESN$pos, -(ESN$when_mutation_has_freq2), 
                 pch = 19, col = "#66C2A5", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "ESN",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

GWD <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/GWD/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(GWD)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781514 . 1 1 1 1 1 1 1 1 -0.407876 -0.463329 -0.743589 -0.597593 -0.507644 -0.253240 -0.349971000 0 -0.614516 -0.656961
2 2781584 . 1 1 1 1 1 1 1 1 -0.179203 -0.154162 -0.180094 -0.205703 -0.124971 -0.231529 -0.204117000 0 -0.240526 -0.268348
3 2781635 . 1 1 1 1 1 1 1 1 -0.111829 -0.212617 -0.202914 -0.378425 -0.322488 -0.277704 -0.282409000 0 -0.190963 -0.782312
4 2781642 . 1 1 1 1 1 1 1 1 1.000000 -0.539265 -0.347216 -0.565445 -0.308591 -0.153304 -0.000101637 0 -0.686005 -0.379514
5 2781865 . 1 1 1 1 1 1 1 1 -0.519564 -0.462069 -0.316058 -0.352527 -0.137388 -0.134281 -0.264966000 0 -0.590395 -0.857276
6 2782386 . 1 1 1 1 1 1 1 1 -0.402728 -0.221762 -0.361282 -0.213686 -0.360681 -0.502329 -0.307497000 0 -0.555433 -0.201529
GWD_plot <- plot(GWD$pos, -(GWD$when_mutation_has_freq2), pch = 19, col = "#FC8D62", xlab = "Genomic Position", ylab = "log10(p-value)", main = "GWD", cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("GWD_plot.pdf", width = 20, height = 5)
GWD_plot <- plot(GWD$pos, -(GWD$when_mutation_has_freq2), 
                 pch = 19, col = "#FC8D62", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "GWD",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

MSL <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/MSL/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(MSL)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781514 . 1 1 1 1 1 1 1 1 -0.00819344 -0.0230286 -0.0125319 -0.03071400 -0.0335299 -0.0137678 -0.0461147 0 -0.0346441 -0.1966720
2 2781584 . 1 1 1 1 1 1 1 1 -0.57859900 -0.6665050 -0.4813950 -0.19035200 -0.0996204 -0.3150730 -0.0203248 0 -0.6003820 -0.3105280
3 2781635 . 1 1 1 1 1 1 1 1 -0.48882200 -0.6708420 -0.4792100 -0.46056600 -0.7251980 -0.1401750 -0.2732000 0 -0.7939040 -1.4443700
4 2781865 . 1 1 1 1 1 1 1 1 -0.72332800 -0.4880510 -0.3577220 -0.49751700 -0.1608950 -0.2124540 -0.3456440 0 -0.7253440 -0.2675890
5 2781927 . 1 1 1 1 1 1 1 1 -0.40212600 -0.1816380 -0.2759440 -0.20043600 -0.1026650 -0.1161240 -0.1489970 0 -0.4218150 -0.4057530
6 2783658 . 1 1 1 1 1 1 1 1 -0.06384770 -0.0422283 -0.0282915 -0.00793583 -0.0163262 -0.0427755 0.0000000 0 -0.0417048 -0.0653668
MSL_plot <- plot(MSL$pos, -(MSL$when_mutation_has_freq2), pch = 19, col = "#E78AC3", xlab = "Genomic Position", ylab = "log10(p-value)", main = "MSL", cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("MSL_plot.pdf", width = 20, height = 5)
MSL_plot <- plot(MSL$pos, -(MSL$when_mutation_has_freq2), 
                 pch = 19, col = "#E78AC3", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "MSL",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

YRI <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/YRI/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(YRI)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781584 . 1 1 1 1 1 1 1 1 -1.202970 -0.649106 -1.2164900 -0.6031470 -0.860137 -1.338100 -1.0011900 0 -1.181280 -0.5485560
2 2781635 . 1 1 1 1 1 1 1 1 -0.174386 -0.318978 -0.0721344 -0.0799852 -0.184531 -0.190856 -0.3960370 0 -0.102744 -0.5156580
3 2781642 . 1 1 1 1 1 1 1 1 1.000000 1.000000 1.0000000 1.0000000 -0.621146 -0.869970 -1.2066100 0 -1.387990 -0.6117740
4 2781927 . 1 1 1 1 1 1 1 1 -0.668357 -0.674931 -0.6905980 -0.5737620 -0.274520 -0.216861 -0.3180610 0 -0.517044 -1.2772800
5 2782597 . 1 1 1 1 1 1 1 1 -0.262238 -0.398503 -0.5650450 -0.6882240 -0.131008 -0.191674 -0.0770765 0 -0.399880 -0.2131010
6 2783658 . 1 1 1 1 1 1 1 1 -0.514807 -0.961053 -0.9117400 -0.7238790 -1.248990 -1.566840 -1.3695800 0 -1.302540 -0.0112159
YRI_plot <- plot(YRI$pos, -(YRI$when_mutation_has_freq2), pch = 19, col = "#A6D854", xlab = "Genomic Position", ylab = "log10(p-value)", main = "YRI" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("YRI_plot.pdf", width = 20, height = 5)
YRI_plot <- plot(YRI$pos, -(YRI$when_mutation_has_freq2), 
                 pch = 19, col = "#A6D854", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "YRI",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

# Combine data frames
LWK$Population <- "LWK"
ESN$Population <- "ESN"
GWD$Population <- "GWD"
MSL$Population <- "MSL"
YRI$Population <- "YRI"

combined_data <- rbind(LWK, ESN, GWD, MSL, YRI)

custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

# Create the plot
plot_combined <- ggplot(data = combined_data, aes(x = pos, y = -(when_mutation_has_freq2), color = Population)) +
  geom_point(pch = 19, size = 4) +
  labs(x = "Genomic Position", y = "log10(p-value)", title = "Africans") +
  theme_light() + custom_theme +
  geom_hline(yintercept = 6, color = "red") +
  scale_color_brewer(palette = "Set2") +
  theme(
    text = element_text(size = 15),
    plot.title = element_text(hjust = 0.5) # Center the title
  )

# Print the plot
print(plot_combined)

# Save the plot
ggsave("africans_threshold_REL_SQUARE.png", plot_combined, width = 7, height = 5)

## SELECT JUST POSITION LOWER THAN THRESHOLD (log10 p-value < -6)

# Define file paths for the five .sele files
file_paths <- c(
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/LWK/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/ESN/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/GWD/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/MSL/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/YRI/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele"
)

# Read the files
LWK <- read.table(file_paths[1], header = TRUE)
ESN <- read.table(file_paths[2], header = TRUE)
GWD <- read.table(file_paths[3], header = TRUE)
MSL <- read.table(file_paths[4], header = TRUE)
YRI <- read.table(file_paths[5], header = TRUE)

# Function to subset data based on log10 p-value
subset_data <- function(data, population) {
  subset(data, when_mutation_has_freq2 < -6) %>%
    mutate(population = population)
}

# Define data frames and corresponding population names
file_pop <- list(
  list(data = LWK, population = "LWK"),
  list(data = ESN, population = "ESN"),
  list(data = GWD, population = "GWD"),
  list(data = MSL, population = "MSL"),
  list(data = YRI, population = "YRI")
)

# Subset and bind data from all files
africans_low_pvalue <- do.call(rbind, lapply(file_pop, function(fp) {
  subset_data(fp$data, fp$population)
}))

# Save the combined data to a single file
write.csv(africans_low_pvalue, "africans_low_pvalue.csv", row.names = FALSE)

filtered <- read.csv("/home/ari/ari-intern/people/ari/ariadna-intern/notebooks/africans_low_pvalue.csv")
head(filtered)
cat("Number of rows and columns:", paste(dim(filtered), collapse = "x"))
plot_threshold <- ggplot(filtered, aes(x = pos, y = -(when_mutation_has_freq2), color = population)) +
  geom_point(size = 4) +
  labs(x = "Genomic position", y = "log10 p-value", title = "AFRICAN") + 
  theme_minimal() + 
scale_color_brewer(palette = "Set2") + theme(text = element_text(size = 15),
        plot.title = element_text(hjust = 0.5))


print(plot_threshold)
ggsave("africans_low_pvalue_r.png", plot_threshold, width = 20, height = 8)

EUROPEAN POPULATIONS

GBR <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/GBR/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(GBR)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781604 . 1 1 1 1 1 1 1 1 -0.04246550 -0.0410563 -0.0277191 0.00000000 0.0000000 0.0000000 0.000000 0 -0.0652181 -0.140835
2 2781635 . 1 1 1 1 1 1 1 1 -0.07381820 -0.0622531 -0.1658510 -0.20821400 -0.1090100 -0.2489250 0.000000 0 -0.2119610 -0.159586
3 2781927 . 1 1 1 1 1 1 1 1 -1.63264000 -1.4569200 -1.0390600 -1.39977000 -1.1803100 -0.5330060 -0.624974 0 -2.2223900 -1.710600
4 2781986 . 1 1 1 1 1 1 1 1 -1.63264000 -1.4569200 -1.0390600 -1.39977000 -1.1803100 -0.5330060 -0.624974 0 -2.2223900 -1.710600
5 2782116 . 1 1 1 1 1 1 1 1 -0.06384060 -0.0555249 -0.0357912 0.00000000 0.0000000 0.0000000 0.000000 0 -0.1008840 -0.372146
6 2782161 . 1 1 1 1 1 1 1 1 -0.00997385 -0.0102032 -0.0278379 -0.00742967 -0.0071796 -0.0841723 0.000000 0 -0.0321039 -0.141249
GBR_plot <- plot(GBR$pos, -(GBR$when_mutation_has_freq2), pch = 19, col = "lightblue", xlab = "Genomic Position", ylab = "log10(p-value)", main = "GBR" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("GBR_plot.pdf", width = 20, height = 5)
YRI_plot <- plot(GBR$pos, -(GBR$when_mutation_has_freq2), 
                 pch = 19, col = "lightblue", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "GBR",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

FIN <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/FIN/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(FIN)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781604 . 1 1 1 1 1 1 1 1 -0.00440755 -0.00534391 -0.0121758 -0.0395929 -0.12861100 -0.07316910 -4.72098e-05 0 -0.00567415 -0.0443709
2 2781635 . 1 1 1 1 1 1 1 1 -0.40902500 -0.62795400 -0.3375140 -0.4141800 -0.58893800 -0.68931200 -1.13840e+00 0 -0.57942200 -0.4660740
3 2781927 . 1 1 1 1 1 1 1 1 -0.65529300 -0.20453900 -0.2242660 -0.0580616 -0.00696292 -0.00380284 -3.23889e-03 0 -0.45251000 -1.3097700
4 2781986 . 1 1 1 1 1 1 1 1 -0.65529300 -0.20453900 -0.2242660 -0.0580616 -0.00696292 -0.00380284 -3.23889e-03 0 -0.45251000 -1.3097700
5 2782116 . 1 1 1 1 1 1 1 1 -0.00440755 -0.00534391 -0.0121758 -0.0395929 -0.12861100 -0.07316910 -4.72098e-05 0 -0.00567415 -0.0443709
6 2782161 . 1 1 1 1 1 1 1 1 -0.04182390 -0.09758080 -0.0624887 -0.1448340 -0.37536700 -0.37688400 -4.58744e-01 0 -0.03356820 -0.0363945
FIN_plot <- plot(FIN$pos, -(FIN$when_mutation_has_freq2), pch = 19, col = "lightpink", xlab = "Genomic Position", ylab = "log10(p-value)", main = "FIN" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("FIN_plot.pdf", width = 20, height = 5)
YRI_plot <- plot(FIN$pos, -(FIN$when_mutation_has_freq2), 
                 pch = 19, col = "lightpink", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "FIN",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

IBS <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/IBS/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(IBS)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781514 . 1 1 1 1 1 1 1 1 -0.5030530 -0.62381400 -0.6142110 -0.5986810 -0.2936090 -0.5673030 -0.5812090 0 -0.5659250 -0.430028
2 2781604 . 1 1 1 1 1 1 1 1 -0.1494370 -0.13499200 -0.0992126 -0.0863440 -0.0437142 -0.0229475 -0.0256627 0 -0.0950446 -0.487534
3 2781635 . 1 1 1 1 1 1 1 1 -0.6871990 -0.41871100 -0.4638830 -0.4103800 -0.7481900 -0.2844150 -0.4797510 0 -0.9172860 -0.408764
4 2781927 . 1 1 1 1 1 1 1 1 -0.0408422 -0.00879619 -0.0348159 -0.0113192 -0.0213689 -0.0237058 -0.1975070 0 -0.0570564 -1.165120
5 2781986 . 1 1 1 1 1 1 1 1 -0.0408422 -0.00879619 -0.0348159 -0.0113192 -0.0213689 -0.0237058 -0.1975070 0 -0.0570564 -1.165120
6 2782048 . 1 1 1 1 1 1 1 1 1.0000000 1.00000000 1.0000000 1.0000000 1.0000000 -0.9076720 -1.1323700 0 -1.5064100 -0.838322
IBS_plot <- plot(IBS$pos, -(IBS$when_mutation_has_freq2), pch = 19, col = "gold", xlab = "Genomic Position", ylab = "log10(p-value)", main = "IBS" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("IBS_plot.pdf", width = 20, height = 5)
IBS_plot <- plot(IBS$pos, -(IBS$when_mutation_has_freq2), 
                 pch = 19, col = "gold", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "IBS",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

TSI <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/TSI/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(TSI)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781604 . 1 1 1 1 1 1 1 1 -0.2105130 -0.0994744 -0.00275779 -0.0264850 -0.1290760 -4.82756e-05 -2.05973e-05 0 -0.239337 -0.2886860
2 2781635 . 1 1 1 1 1 1 1 1 -0.0223887 -0.0254137 -0.05456920 -0.0224805 -0.0427598 -4.61010e-05 -2.00428e-05 0 -0.175876 -0.1633620
3 2781927 . 1 1 1 1 1 1 1 1 -1.7507600 -1.5958100 -2.17561000 -1.6023800 -0.9149070 -9.32056e-01 -1.16969e-01 0 -1.454490 -1.2938300
4 2781986 . 1 1 1 1 1 1 1 1 -1.7507600 -1.5958100 -2.17561000 -1.6023800 -0.9149070 -9.32056e-01 -1.16969e-01 0 -1.454490 -1.2938300
5 2782116 . 1 1 1 1 1 1 1 1 -0.1580850 -0.0765467 -0.00201294 -0.0222592 -0.1175840 -3.60871e-05 -1.89802e-05 0 -0.210797 -0.0313601
6 2782161 . 1 1 1 1 1 1 1 1 -0.3088920 -0.2460760 -0.54315200 -0.2379020 -0.5093210 -5.29402e-01 -1.07703e-01 0 -0.446141 -0.3001710
TSI_plot <- plot(TSI$pos, -(TSI$when_mutation_has_freq2), pch = 19, col = "lightgreen", xlab = "Genomic Position", ylab = "log10(p-value)", main = "TSI" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("TSI_plot.pdf", width = 20, height = 5)
TSI_plot <- plot(TSI$pos, -(TSI$when_mutation_has_freq2), 
                 pch = 19, col = "lightgreen", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "TSI",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

PUR <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/PUR/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
PUR_plot <- plot(PUR$pos, -(PUR$when_mutation_has_freq2), pch = 19, col = "orange2", xlab = "Genomic Position", ylab = "log10(p-value)", main = "PUR" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("PUR_plot.pdf", width = 20, height = 5)
PUR_plot <- plot(PUR$pos, -(PUR$when_mutation_has_freq2), 
                 pch = 19, col = "orange2", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "PUR",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

# Combine data frames
GBR$Population <- "GBR"
FIN$Population <- "FIN"
IBS$Population <- "IBS"
TSI$Population <- "TSI"
PUR$Population <- "PUR"

combined_data <- rbind(GBR, FIN, IBS, TSI, PUR)


custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

plot_combined <- ggplot(data = combined_data, aes(x = pos, y = -(when_mutation_has_freq2), color = Population)) +
  geom_point(pch = 19, size = 4) +
  labs(x = "Genomic Position", y = "log10(p-value)", title = "Europeans + PUR") +
  theme_light() + custom_theme +
  geom_hline(yintercept = 6, color = "red") +
  scale_color_manual(values = c("GBR" = "lightblue", "FIN" = "lightpink", "IBS" = "gold", "TSI" = "lightgreen", "PUR" = "orange2")) + theme(text = element_text(size = 15)) 

print(plot_combined)

ggsave("europeans_threshold_REL_SQUARE.png", plot_combined, width = 7, height = 5)

## SELECT JUST POSITION LOWER THAN THRESHOLD (log10 p-value < -6)

# Define file paths for the five .sele files
file_paths_euro <- c(
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/GBR/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/FIN/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/IBS/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/TSI/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/PUR/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele"
)

# Read the files
GBR <- read.table(file_paths_euro[1], header = TRUE)
FIN <- read.table(file_paths_euro[2], header = TRUE)
IBS <- read.table(file_paths_euro[3], header = TRUE)
TSI <- read.table(file_paths_euro[4], header = TRUE)
PUR <- read.table(file_paths_euro[5], header = TRUE)

# Function to subset data based on log10 p-value
subset_data <- function(data, population) {
  subset(data, when_mutation_has_freq2 < -6) %>%
    mutate(population = population)
}

# Define data frames and corresponding population names
file_pop <- list(
  list(data = GBR, population = "GBR"),
  list(data = FIN, population = "FIN"),
  list(data = IBS, population = "IBS"),
  list(data = TSI, population = "TSI"),
  list(data = PUR, population = "PUR")
)

# Subset and bind data from all files
europeans_low_pvalue <- do.call(rbind, lapply(file_pop, function(fp) {
  subset_data(fp$data, fp$population)
}))

# Save the combined data to a single file
write.csv(europeans_low_pvalue, "europeans_low_pvalue.csv", row.names = FALSE)

filtered_euro <- read.csv("/home/ari/ari-intern/people/ari/ariadna-intern/notebooks/europeans_low_pvalue.csv")
head(filtered_euro)
cat("Number of rows and columns:", paste(dim(filtered_euro), collapse = "x"))
A data.frame: 6 × 36
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2 population
<int> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <chr>
1 18861677 . 1 1 1 1 1 1 1 1 -0.597739 -1.048370 -0.741127 -0.417716 -0.544644 -0.187459 0 -0.890654 -6.38066 GBR
2 18865240 . 1 1 1 1 1 1 1 1 -0.639418 -0.897673 -0.582284 -0.612257 -0.702097 -0.241534 0 -0.991791 -6.72386 GBR
3 18867826 . 1 1 1 1 1 1 1 1 -1.137600 -0.715037 -0.976634 -0.572180 -0.612715 -0.132271 0 -1.339690 -6.54745 GBR
4 18867954 . 1 1 1 1 1 1 1 1 -1.127090 -0.618784 -1.215980 -0.720713 -0.718903 -0.155134 0 -1.423040 -7.78284 GBR
5 18868013 . 1 1 1 1 1 1 1 1 -1.137600 -0.715037 -0.976634 -0.572180 -0.612715 -0.132271 0 -1.339690 -6.54745 GBR
6 18868321 . 1 1 1 1 1 1 1 1 -1.137600 -0.715037 -0.976634 -0.572180 -0.612715 -0.132271 0 -1.339690 -6.54745 GBR
Number of rows and columns: 456x36
plot_threshold <- ggplot(filtered_euro, aes(x = pos, y = -(when_mutation_has_freq2), color = population)) +
  geom_point(size = 4) +
  labs(x = "Genomic position", y = "log10 p-value", title = "European Populations") + 
  theme_minimal() + scale_color_manual(values = c("GBR" = "lightblue", "FIN" = "lightpink", "IBS" = "gold", "TSI" = "lightgreen", "PUR" = "orange2")) +
theme(text = element_text(size = 15),
        plot.title = element_text(hjust = 0.5))
print(plot_threshold)

EAST ASIAN POPULATIONS

CDX <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/CDX/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(CDX)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781635 . 1 1 1 1 1 1 1 1 -1.0086400 -1.1085600 -1.1057000 -1.3585700 -0.943955 -1.1108200 -0.973984 0 -1.2454700 -0.4867200
2 2781927 . 1 1 1 1 1 1 1 1 -0.3131650 -0.2109630 -0.2253310 -0.0785041 -0.164815 -0.1097100 -0.108324 0 -0.3075960 -0.5534800
3 2781986 . 1 1 1 1 1 1 1 1 -0.3131650 -0.2109630 -0.2253310 -0.0785041 -0.164815 -0.1097100 -0.108324 0 -0.3075960 -0.5534800
4 2782161 . 1 1 1 1 1 1 1 1 -1.0339200 -1.0921500 -1.3285200 -1.5450600 -1.067340 -1.2125100 -1.057640 0 -1.2068300 -1.3679800
5 2782560 . 1 1 1 1 1 1 1 1 0.0000000 0.0000000 0.0000000 0.0000000 0.000000 0.0000000 0.000000 0 -0.0766008 -0.0269834
6 2782572 . 1 1 1 1 1 1 1 1 -0.0993169 -0.0502764 -0.0654512 -0.0539695 -0.238131 -0.0119095 -0.033889 0 -0.1106720 -1.6958700
CDX_plot <- plot(CDX$pos, -(CDX$when_mutation_has_freq2), pch = 19, col = "#A6CEE3", xlab = "Genomic Position", ylab = "log10(p-value)", main = "CDX" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("CDX_plot.pdf", width = 20, height = 5)
CDX_plot <- plot(CDX$pos, -(CDX$when_mutation_has_freq2), 
                 pch = 19, col = "#A6CEE3", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "CDX",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

CHB <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/CHB/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(CHB)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781927 . 1 1 1 1 1 1 1 1 -0.469389 -0.1350390 -0.0537125 -0.0683953 -0.1218410 -1.66876e-05 -4.30725e-07 0 -0.357821 -0.707989
2 2781986 . 1 1 1 1 1 1 1 1 -0.469389 -0.1350390 -0.0537125 -0.0683953 -0.1218410 -1.66876e-05 -4.30725e-07 0 -0.357821 -0.707989
3 2782161 . 1 1 1 1 1 1 1 1 -0.134001 -0.1969290 -0.3636970 -0.1665180 -0.0778992 -1.11767e-01 -1.29266e-06 0 -0.223143 -0.803470
4 2782572 . 1 1 1 1 1 1 1 1 -0.352616 -0.0748059 -0.0297824 -0.1350440 -0.0487577 0.00000e+00 0.00000e+00 0 -0.314035 -2.224800
5 2784758 . 1 1 1 1 1 1 1 1 -0.414356 -0.1018040 -0.1638560 -0.1877580 -0.0405383 -3.93919e-05 0.00000e+00 0 -0.292926 -0.067107
6 2784905 . 1 1 1 1 1 1 1 1 -0.414356 -0.1018040 -0.1638560 -0.1877580 -0.0405383 -3.93919e-05 0.00000e+00 0 -0.292926 -0.067107
CHB_plot <- plot(CHB$pos, -(CHB$when_mutation_has_freq2), pch = 19, col = "#1F78B4", xlab = "Genomic Position", ylab = "log10(p-value)", main = "CHB" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("CHB_plot.pdf", width = 20, height = 5)
CHB_plot <- plot(CHB$pos, -(CHB$when_mutation_has_freq2), 
                 pch = 19, col = "#1F78B4", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "CHB",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

CHS <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/CHS/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(CHS)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781635 . 1 1 1 1 1 1 1 1 -0.509651 -0.5295390 -0.540640 -0.2382600 -0.353003 -0.495016 -0.453266 0 -0.6016760 -0.926999
2 2781927 . 1 1 1 1 1 1 1 1 -0.191600 -0.0350659 -0.102777 -0.0583544 -0.278356 -0.185307 -0.293794 0 -0.0503824 -0.294591
3 2781986 . 1 1 1 1 1 1 1 1 -0.191600 -0.0350659 -0.102777 -0.0583544 -0.278356 -0.185307 -0.293794 0 -0.0503824 -0.294591
4 2782161 . 1 1 1 1 1 1 1 1 -0.921153 -1.5332800 -0.849335 -0.7494320 -0.419384 -0.937000 -0.393215 0 -0.9510930 -1.741520
5 2782572 . 1 1 1 1 1 1 1 1 -0.719856 -0.7079780 -0.505628 -0.5949980 -0.269845 -0.588185 -0.232325 0 -0.5436550 -0.533548
6 2782784 . 1 1 1 1 1 1 1 1 1.000000 1.0000000 1.000000 1.0000000 1.000000 1.000000 1.000000 0 1.0000000 -0.701893
CHS_plot <- plot(CHS$pos, -(CHS$when_mutation_has_freq2), pch = 19, col = "#B2DF8A", xlab = "Genomic Position", ylab = "log10(p-value)", main = "CHS" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("CHS_plot.pdf", width = 20, height = 5)
CHS_plot <- plot(CHS$pos, -(CHS$when_mutation_has_freq2), 
                 pch = 19, col = "#B2DF8A", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "CHS",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

JPT <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/JPT/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(JPT)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781514 . 1 1 1 1 1 1 1 1 -0.544881 -0.543074 -0.3360220 -0.5822880 -0.4354700 -0.6970980 -1.1308500 0 -0.548842 -0.746704
2 2781986 . 1 1 1 1 1 1 1 1 -0.354544 -0.370605 -0.0512392 -0.0805736 -0.0537353 -0.1730980 -0.4372470 0 -0.455204 -0.112781
3 2782572 . 1 1 1 1 1 1 1 1 -0.319862 -0.318994 -0.4698040 -0.2719470 -0.0442616 -0.0408214 -0.0908543 0 -0.396086 -1.069260
4 2784758 . 1 1 1 1 1 1 1 1 -0.629078 -0.263415 -0.7430090 -0.3709640 -0.8436410 -0.9887650 -0.7396860 0 -0.796063 -0.483658
5 2784905 . 1 1 1 1 1 1 1 1 -0.629078 -0.263415 -0.7430090 -0.3709640 -0.8436410 -0.9887650 -0.7396860 0 -0.796063 -0.483658
6 2785313 . 1 1 1 1 1 1 1 1 -0.629078 -0.263415 -0.7430090 -0.3709640 -0.8436410 -0.9887650 -0.7396860 0 -0.796063 -0.483658
JPT_plot <- plot(JPT$pos, -(JPT$when_mutation_has_freq2), pch = 19, col = "#33A02C", xlab = "Genomic Position", ylab = "log10(p-value)", main = "JPT" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("JPT_plot.pdf", width = 20, height = 5)
JPT_plot <- plot(JPT$pos, -(JPT$when_mutation_has_freq2), 
                 pch = 19, col = "#33A02C", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "JPT",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

KHV <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/KHV/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele", header = TRUE)
head(KHV)
A data.frame: 6 × 35
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X257.030640 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2
<int> <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2781635 . 1 1 1 1 1 1 1 1 -0.8030350 -0.6989060 -0.605342 -0.45654000 -0.342867 -0.503979 -0.69663300 0 -1.0258700 -0.847264
2 2781927 . 1 1 1 1 1 1 1 1 -0.0590766 -0.0280912 -0.045688 -0.00312863 -0.010792 -0.018873 -0.00296262 0 -0.0553768 -0.635131
3 2781986 . 1 1 1 1 1 1 1 1 -0.0590766 -0.0280912 -0.045688 -0.00312863 -0.010792 -0.018873 -0.00296262 0 -0.0553768 -0.635131
4 2782161 . 1 1 1 1 1 1 1 1 -0.9175290 -1.2623400 -0.941499 -1.60266000 -1.237230 -1.025680 -1.11297000 0 -1.2013300 -0.406086
5 2784102 . 1 1 1 1 1 1 1 1 -1.3344400 -2.3292500 -1.154890 -1.79781000 -1.452120 -1.538880 -1.08854000 0 -2.0425200 -0.471176
6 2784738 . 1 1 1 1 1 1 1 1 1.0000000 1.0000000 1.000000 1.00000000 1.000000 1.000000 -0.56223200 0 -0.9722480 -0.418237
KHV_plot <- plot(KHV$pos, -(KHV$when_mutation_has_freq2), pch = 19, col = "#FB9A99", xlab = "Genomic Position", ylab = "log10(p-value)", main = "KHV" , cex = 1.5, ylim = c(0, 10), cex.lab = 1.2)
pdf("KHV_plot.pdf", width = 20, height = 5)
KHV_plot <- plot(KHV$pos, -(KHV$when_mutation_has_freq2), 
                 pch = 19, col = "#FB9A99", 
                 xlab = "Genomic Position", 
                 ylab = "log10(p-value)", 
                 main = "KHV",
                 cex = 1.5, 
                 yaxp = c(0, 10, 5),
                 cex.lab = 1.3,
                 cex.main = 1.6,  # Adjust title size
                 cex.axis = 1.2)  # Adjust axis numbers size

dev.off()
png: 2

library(wesanderson)
# Combine data frames
CDX$Population <- "CDX"
CHB$Population <- "CHB"
CHS$Population <- "CHS"
JPT$Population <- "JPT"
KHV$Population <- "KHV"

combined_data <- rbind(CDX, CHB, CHS, JPT, KHV)

custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

plot_combined <- ggplot(data = combined_data, aes(x = pos, y = -(when_mutation_has_freq2), color = Population)) +
  geom_point(pch = 19, size = 4) +
  labs(x = "Genomic Position", y = "log10(p-value)", title = "East Asians") +
  theme_light() + custom_theme +
  geom_hline(yintercept = 6, color = "red") +
  scale_color_brewer(palette = "Paired") + 
  theme(text = element_text(size = 15))

print(plot_combined)

ggsave("asians_threshold_REL_SQUARE.png", plot_combined, width = 7, height = 5)

## SELECT JUST POSITION LOWER THAN THRESHOLD (log10 p-value < -6)

# Define file paths for the five .sele files
file_paths_asia <- c(
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/CDX/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/CHB/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/CHS/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/JPT/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/KHV/relate/run_relate/1000g_ppl_phased_haplotypes_selection.sele"
)

# Read the files
CDX <- read.table(file_paths_asia[1], header = TRUE)
CHB <- read.table(file_paths_asia[2], header = TRUE)
CHS <- read.table(file_paths_asia[3], header = TRUE)
JPT <- read.table(file_paths_asia[4], header = TRUE)
KHV <- read.table(file_paths_asia[5], header = TRUE)

# Function to subset data based on log10 p-value
subset_data <- function(data, population) {
  subset(data, when_mutation_has_freq2 < -6) %>%
    mutate(population = population)
}

# Define data frames and corresponding population names
file_pop <- list(
  list(data = CDX, population = "CDX"),
  list(data = CHB, population = "CHB"),
  list(data = CHS, population = "CHS"),
  list(data = JPT, population = "JPT"),
  list(data = KHV, population = "KHV")
)

# Subset and bind data from all files
asians_low_pvalue <- do.call(rbind, lapply(file_pop, function(fp) {
  subset_data(fp$data, fp$population)
}))

# Save the combined data to a single file
write.csv(asians_low_pvalue, "asians_low_pvalue.csv", row.names = FALSE)

filtered_asia <- read.csv("/home/ari/ari-intern/people/ari/ariadna-intern/notebooks/asians_low_pvalue.csv")
head(filtered_asia)
cat("Number of rows and columns:", paste(dim(filtered_asia), collapse = "x"))
A data.frame: 6 × 36
pos rs_id X3571428.500000 X357142.937500 X257030.656250 X184981.281250 X133128.375000 X95810.585938 X68953.507812 X49624.847656 X184.981262 X133.128357 X95.810577 X68.953499 X49.624844 X35.714287 X0.000000 when_DAF_is_half when_mutation_has_freq2 population
<int> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <chr>
1 10122953 . 1 1 1 1 1 1 1 1 -0.987813 -0.961790 -1.191430 -0.559613 -0.761888 -1.644260 0 -1.39061 -6.29509 CDX
2 11859476 . 1 1 1 1 1 1 1 1 -2.101280 -1.576190 -0.977768 -1.610050 -1.664170 -0.557903 0 -2.69332 -6.64483 CDX
3 11864438 . 1 1 1 1 1 1 1 1 -2.101280 -1.576190 -0.977768 -1.610050 -1.664170 -0.557903 0 -2.69332 -6.64483 CDX
4 32635171 . 1 1 1 1 1 1 1 1 -1.181500 -0.933064 -0.809430 -0.399554 -0.202704 -0.298736 0 -2.22411 -6.58804 CDX
5 105249963 . 1 1 1 1 1 1 1 1 -0.612386 -0.384370 -0.240353 -0.152790 -0.228024 -0.150998 0 -1.40614 -6.11750 CDX
6 3712725 . 1 1 1 1 1 1 1 1 -0.439584 -0.609713 -0.366916 -0.877111 -0.129806 0.000000 0 -1.08669 -6.50623 CHB
Number of rows and columns: 27x36
plot_threshold <- ggplot(filtered_asia, aes(x = pos, y = -(when_mutation_has_freq2), color = population)) +
  geom_point(size = 4) +
  labs(x = "Genomic position", y = "log10 p-value", title = "East Asian Populations") + 
  theme_minimal() + scale_color_brewer(palette = "Paired") + theme(text = element_text(size = 15),
        plot.title = element_text(hjust = 0.5))
print(plot_threshold)

## PLOT TOGETHER ALL POPULATIONS SNPS BELOW THRESHOLD
# Define theme with larger text size for legend
custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

# Create plots with the custom theme
africa <- ggplot(filtered, aes(x = pos, y = -(when_mutation_has_freq2), color = population)) +
  geom_point(size = 6) +
  labs(x = NULL, y = NULL, title = "Africans") + 
  custom_theme +  # Apply the custom theme
  scale_color_brewer(palette = "Set2") + 
  ylim(6, 10)

europe <- ggplot(filtered_euro, aes(x = pos, y = -(when_mutation_has_freq2), color = population)) +
  geom_point(size = 6) +
  labs(x = NULL, y = NULL, title = "Europeans + PUR") + 
  custom_theme +  # Apply the custom theme
  scale_color_manual(values = c("GBR" = "lightblue", "FIN" = "lightpink", "IBS" = "gold", "TSI" = "lightgreen", "PUR" = "orange2")) +
  ylim(6, 10)

asia <- ggplot(filtered_asia, aes(x = pos, y = -(when_mutation_has_freq2), color = population)) +
  geom_point(size = 6) +
  labs(x = NULL, y = NULL, title = "East Asians") + 
  custom_theme +  # Apply the custom theme
  scale_color_manual(values = c("CDX" = "#A6CEE3", "CHB" = "#1F78B4", "CHS" = "#B2DF8A", "JPT" = "#33A02C", "KHV" = "#FB9A99")) + 
  ylim(6, 10)

# Combine the plots
combined_plots <- grid.arrange(africa, europe, asia, ncol = 1, 
                                left = "log10(p-value)",         
                                bottom = "Genomic position")

# Save the combined plot with larger text sizes
ggsave("combined_threshold_RELATE.png", combined_plots, width = 7, height = 15) 
Warning message:
“Removed 5 rows containing missing values (`geom_point()`).”

RUNS TEST

AFRICAN POPULATIONS

# Read the CSV files
LWK <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_LWK_runstats.csv", sep=",", header=TRUE)
GWD <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_GWD_runstats.csv", sep=",", header=TRUE)
ESN <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_ESN_runstats.csv", sep=",", header=TRUE)
MSL <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_MSL_runstats.csv", sep=",", header=TRUE)
YRI <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_YRI_runstats.csv", sep=",", header=TRUE)

# Store the datasets in a list
datasets <- list(LWK=LWK, GWD=GWD, ESN=ESN, MSL=MSL, YRI=YRI)

# Loop over each dataset and modify them in place
for (name in names(datasets)) {
  # Get the dataset
  data <- datasets[[name]]
  
  # Filter the data
  filtered_data <- data %>%
    filter(stat_name == 'nr_runs' & nr_mut > 20)
  
  # Calculate -log10(p)
  filtered_data$log10p <- -log10(filtered_data$p)
  
  # Ensure 'stat' is treated as a factor
  filtered_data$stat <- as.factor(filtered_data$stat)
  
  # Assign the modified dataset back to the original variable name
  assign(name, filtered_data)
}

# Combine data frames
LWK$Population <- "LWK"
ESN$Population <- "ESN"
GWD$Population <- "GWD"
MSL$Population <- "MSL"
YRI$Population <- "YRI"

combined_data <- rbind(LWK, ESN, GWD, MSL, YRI)

custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

# Create the plot
plot_combined <- ggplot(data = combined_data, aes(x = pos, y = log10p, color = Population)) +
  geom_point(pch = 19, size = 4) +
  labs(x = "Genomic Position", y = "-log10(p-value)", title = "Africans") +
  theme_light() + custom_theme +
  geom_hline(yintercept = 6, color = "red") +
  scale_color_brewer(palette = "Set2") +
  theme(
    text = element_text(size = 15),
    plot.title = element_text(hjust = 0.5) # Center the title
  )

# Print the plot
print(plot_combined)
ggsave("africans_threshold_RUNS_SQUARE.png", plot_combined, width = 7, height = 5)
ERROR: Error: object 'stat_name' not found

Error: object 'stat_name' not found
Traceback:

1. data %>% filter(stat_name == "nr_runs" & nr_mut > 20)
2. filter(., stat_name == "nr_runs" & nr_mut > 20)
# Load necessary libraries
library(dplyr)
library(ggplot2)

# Define the file paths
file_paths <- list(
  LWK = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_LWK_runstats.csv",
  GWD = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_GWD_runstats.csv",
  ESN = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_ESN_runstats.csv",
  MSL = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_MSL_runstats.csv",
  YRI = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_YRI_runstats.csv"
)

# Read the CSV files
datasets <- lapply(file_paths, function(path) {
  read.csv(path, stringsAsFactors = FALSE)
})

# Filter and process each dataset
datasets <- lapply(datasets, function(data) {
  data %>%
    filter(stat_name == 'nr_runs' & nr_mut > 20) %>%
    mutate(
      log10p = -log10(p),
      stat = as.factor(stat)
    )
})

# Combine data frames and add Population column
populations <- names(datasets)
for (i in seq_along(datasets)) {
  datasets[[i]]$Population <- populations[i]
}
combined_data <- do.call(rbind, datasets)

# Define the custom theme
custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

# Create the plot
plot_combined <- ggplot(data = combined_data, aes(x = pos, y = log10p, color = Population)) +
  geom_point(pch = 19, size = 4) +
  labs(x = "Genomic Position", y = "-log10(p-value)", title = "Africans") +
  theme_light() + custom_theme +
  geom_hline(yintercept = 6, color = "red") +
  scale_color_brewer(palette = "Set2") +
  theme(
    text = element_text(size = 15),
    plot.title = element_text(hjust = 0.5) # Center the title
  )

# Print the plot
print(plot_combined)
ggsave("africans_threshold_RUNS_SQUARE.png", plot_combined, width = 7, height = 5)

Attaching package: ‘dplyr’


The following object is masked from ‘package:gridExtra’:

    combine


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Warning message:
“Removed 2353 rows containing missing values (`geom_point()`).”
Warning message:
“Removed 2353 rows containing missing values (`geom_point()`).”

EUROPEANS + PUR

# Read the CSV files
GBR <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_GBR_runstats.csv", sep=",", header=TRUE)
FIN <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_FIN_runstats.csv", sep=",", header=TRUE)
IBS <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_IBS_runstats.csv", sep=",", header=TRUE)
TSI <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_TSI_runstats.csv", sep=",", header=TRUE)
PUR <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_PUR_runstats.csv", sep=",", header=TRUE)

# Store the datasets in a list
datasets <- list(GBR=GBR, FIN=FIN, IBS=IBS, TSI=TSI, PUR=PUR)

# Loop over each dataset and modify them in place
for (name in names(datasets)) {
  # Get the dataset
  data <- datasets[[name]]
  
  # Filter the data
  filtered_data <- data %>%
    filter(stat_name == 'nr_runs' & nr_mut > 20)
  
  # Calculate -log10(p)
  filtered_data$log10p <- -log10(filtered_data$p)
  
  # Ensure 'stat' is treated as a factor
  filtered_data$stat <- as.factor(filtered_data$stat)
  
  # Assign the modified dataset back to the original variable name
  assign(name, filtered_data)
}

# Combine data frames
GBR$Population <- "GBR"
FIN$Population <- "FIN"
IBS$Population <- "IBS"
TSI$Population <- "TSI"
PUR$Population <- "PUR"

combined_data <- rbind(GBR, FIN, IBS, TSI, PUR)

custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

# Create the plot
plot_combined <- ggplot(data = combined_data, aes(x = pos, y = log10p, color = Population)) +
  geom_point(pch = 19, size = 4) +
  labs(x = "Genomic Position", y = "-log10(p-value)", title = "Europeans + PUR") +
  theme_light() + custom_theme +
  geom_hline(yintercept = 6, color = "red") +
  scale_color_manual(values = c("GBR" = "lightblue", "FIN" = "lightpink", "IBS" = "gold", "TSI" = "lightgreen", "PUR" = "orange2"))+
  theme(
    text = element_text(size = 15),
    plot.title = element_text(hjust = 0.5) # Center the title
  )

# Print the plot
print(plot_combined)

ggsave("europeans_threshold_RUNS_SQUARE.png", plot_combined, width = 7, height = 5)
Warning message:
“Removed 7004 rows containing missing values (`geom_point()`).”
Warning message:
“Removed 7004 rows containing missing values (`geom_point()`).”

EAST ASIANS

# Load necessary library
library(dplyr)
library(ggplot2)

# Read the CSV files
CDX <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_CDX_runstats.csv", sep=",", header=TRUE)
CHB <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_CHB_runstats.csv", sep=",", header=TRUE)
CHS <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_CHS_runstats.csv", sep=",", header=TRUE)
JPT <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_JPT_runstats.csv", sep=",", header=TRUE)
KHV <- read.table("/home/ari/ari-intern/people/ari/ariadna-intern/steps/runs_test/1000g_ppl_phased_haplotypes_KHV_runstats.csv", sep=",", header=TRUE)

# Store the datasets in a list
datasets <- list(CDX=CDX, CHB=CHB, CHS=CHS, JPT=JPT, KHV=KHV)

# Loop over each dataset and modify them in place
for (name in names(datasets)) {
  # Get the dataset
  data <- datasets[[name]]
  
  # Filter the data
  filtered_data <- data %>%
    filter(stat_name == 'nr_runs' & nr_mut > 20)
  
  # Calculate -log10(p)
  filtered_data$log10p <- -log10(filtered_data$p)
  
  # Ensure 'stat' is treated as a factor
  filtered_data$stat <- as.factor(filtered_data$stat)
  
  # Assign the modified dataset back to the original variable name
  assign(name, filtered_data)
}

# Combine data frames
CDX$Population <- "CDX"
CHB$Population <- "CHB"
CHS$Population <- "CHS"
JPT$Population <- "JPT"
KHV$Population <- "KHV"

combined_data <- rbind(CDX, CHB, CHS, JPT, KHV)

custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

# Create the plot
plot_combined <- ggplot(data = combined_data, aes(x = pos, y = log10p, color = Population)) +
  geom_point(pch = 19, size = 4) +
  labs(x = "Genomic Position", y = "-log10(p-value)", title = "East Asians") +
  theme_light() + custom_theme +
  geom_hline(yintercept = 6, color = "red") +
  scale_color_brewer(palette = "Paired") +
  theme(
    text = element_text(size = 15),
    plot.title = element_text(hjust = 0.5) # Center the title
  )

# Print the plot
print(plot_combined)

ggsave("asians_threshold_RUNS_SQUARE_correct.png", plot_combined, width = 7, height = 5)
Warning message:
“Removed 5769 rows containing missing values (`geom_point()`).”
Warning message:
“Removed 5769 rows containing missing values (`geom_point()`).”

COMBINED P-VALUES

# Load the necessary library
library(dplyr)

# Define file paths for the five .sele files
file_paths_africa <- c(
  LWK = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_LWK.csv",
  ESN = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_ESN.csv",
  GWD = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_GWD.csv",
  MSL = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_MSL.csv",
  YRI = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_YRI.csv"
)

# Read the files
LWK <- read.csv(file_paths_africa["LWK"], header = TRUE)
ESN <- read.csv(file_paths_africa["ESN"], header = TRUE)
GWD <- read.csv(file_paths_africa["GWD"], header = TRUE)
MSL <- read.csv(file_paths_africa["MSL"], header = TRUE)
YRI <- read.csv(file_paths_africa["YRI"], header = TRUE)

# Function to subset data based on combined_p_values threshold
subset_data <- function(data, population) {
  data %>%
    filter(combined_p_values >= 6) %>%
    mutate(population = population)
}

# Define data frames and corresponding population names
file_pop <- list(
  list(data = LWK, population = "LWK"),
  list(data = ESN, population = "ESN"),
  list(data = GWD, population = "GWD"),
  list(data = MSL, population = "MSL"),
  list(data = YRI, population = "YRI")
)

# Subset and bind data from all files
africans_high_pvalue <- do.call(rbind, lapply(file_pop, function(fp) {
  subset_data(fp$data, fp$population)
}))

# Save the combined data to a single file
write.csv(africans_high_pvalue, "africans_combined.csv", row.names = FALSE)

Attaching package: ‘dplyr’


The following object is masked from ‘package:gridExtra’:

    combine


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

library(dplyr)
## SELECT JUST POSITION LOWER THAN THRESHOLD (log10 p-value < -6)
# Define file paths for the five .sele files
file_paths_euro <- c(
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_GBR.csv",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_FIN.csv",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_IBS.csv",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_TSI.csv",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_PUR.csv"
)

# Read the files
GBR <- read.csv(file_paths_euro[1], header = TRUE)
FIN <- read.csv(file_paths_euro[2], header = TRUE)
IBS <- read.csv(file_paths_euro[3], header = TRUE)
TSI <- read.csv(file_paths_euro[4], header = TRUE)
PUR <- read.csv(file_paths_euro[5], header = TRUE)

# Function to subset data based on combined_p_values threshold
subset_data <- function(data, population) {
  data %>%
    filter(combined_p_values >= 6) %>%
    mutate(population = population)
}

# Define data frames and corresponding population names
file_pop <- list(
  list(data = GBR, population = "GBR"),
  list(data = FIN, population = "FIN"),
  list(data = IBS, population = "IBS"),
  list(data = TSI, population = "TSI"),
  list(data = PUR, population = "PUR")
)

# Subset and bind data from all files
europeans_low_pvalue <- do.call(rbind, lapply(file_pop, function(fp) {
  subset_data(fp$data, fp$population)
}))

# Save the combined data to a single file
write.csv(europeans_low_pvalue, "europeans_combined.csv", row.names = FALSE)
# Define file paths for the five .csv files
file_paths_asia_combined <- c(
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_CDX.csv",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_CHB.csv",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_CHS.csv",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_JPT.csv",
  "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_KHV.csv"
)

# Read the files
CDX <- read.csv(file_paths_asia_combined[1])
CHB <- read.csv(file_paths_asia_combined[2])
CHS <- read.csv(file_paths_asia_combined[3])
JPT <- read.csv(file_paths_asia_combined[4])
KHV <- read.csv(file_paths_asia_combined[5])

# Add population column to each data frame
CDX$population <- "CDX"
CHB$population <- "CHB"
CHS$population <- "CHS"
JPT$population <- "JPT"
KHV$population <- "KHV"

# Function to subset data based on combined_p_values
subset_data <- function(data, population) {
  data %>%
    filter(combined_p_values >= 6) %>%
    mutate(population = population)
}

# Define data frames and corresponding population names
file_pop <- list(
  list(data = CDX, population = "CDX"),
  list(data = CHB, population = "CHB"),
  list(data = CHS, population = "CHS"),
  list(data = JPT, population = "JPT"),
  list(data = KHV, population = "KHV")
)

# Subset and bind data from all files
asians_combined <- do.call(rbind, lapply(file_pop, function(fp) {
  subset_data(fp$data, fp$population)
}))

# Save the combined data to a single CSV file
write.csv(asians_combined, "asians_combined.csv", row.names = FALSE)
africa_file <- read.csv("/home/ari/ari-intern/people/ari/ariadna-intern/notebooks/africans_combined.csv")
europe_file <- read.csv("/home/ari/ari-intern/people/ari/ariadna-intern/notebooks/europeans_combined.csv")
asia_file <- read.csv("/home/ari/ari-intern/people/ari/ariadna-intern/notebooks/asians_combined.csv")

# Define theme with larger text size for legend
custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

# Create plots with the custom theme
africa <- ggplot(africa_file, aes(x = pos, y = (combined_p_values), color = population)) +
  geom_point(size = 6) +
  labs(x = NULL, y = NULL, title = "Africans") + 
  custom_theme +  # Apply the custom theme
  scale_color_brewer(palette = "Set2") + 
  ylim(6, 10)

europe <- ggplot(europe_file, aes(x = pos, y = (combined_p_values), color = population)) +
  geom_point(size = 6) +
  labs(x = NULL, y = NULL, title = "Europeans + PUR") + 
  custom_theme +  # Apply the custom theme
  scale_color_manual(values = c("GBR" = "lightblue", "FIN" = "lightpink", "IBS" = "gold", "TSI" = "lightgreen", "PUR" = "orange2")) +
  ylim(6, 10)

asia <- ggplot(asia_file, aes(x = pos, y = (combined_p_values), color = population)) +
  geom_point(size = 6) +
  labs(x = NULL, y = NULL, title = "East Asians") + 
  custom_theme +  # Apply the custom theme
  scale_color_manual(values = c("CDX" = "#A6CEE3", "CHB" = "#1F78B4", "CHS" = "#B2DF8A", "JPT" = "#33A02C", "KHV" = "#FB9A99")) + 
  ylim(6, 10)

# Combine the plots
combined_plots <- grid.arrange(africa, europe, asia, ncol = 1, 
                                left = "log10(p-value)",         
                                bottom = "Genomic position")

# Save the combined plot with larger text sizes
ggsave("combined_threshold_COMBINED_gap.png", combined_plots, width = 7, height = 15) 
Warning message:
“Removed 3 rows containing missing values (`geom_point()`).”

# Load the necessary libraries
library(dplyr)
library(ggplot2)

# Define file paths for the five .sele files
file_paths_africa <- c(
  LWK = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_LWK.csv",
  ESN = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_ESN.csv",
  GWD = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_GWD.csv",
  MSL = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_MSL.csv",
  YRI = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_YRI.csv"
)

# Read the files
LWK <- read.csv(file_paths_africa["LWK"], header = TRUE)
ESN <- read.csv(file_paths_africa["ESN"], header = TRUE)
GWD <- read.csv(file_paths_africa["GWD"], header = TRUE)
MSL <- read.csv(file_paths_africa["MSL"], header = TRUE)
YRI <- read.csv(file_paths_africa["YRI"], header = TRUE)

# Function to add population column to the data
add_population <- function(data, population) {
  data %>%
    mutate(population = population)
}

# Define data frames and corresponding population names
file_pop <- list(
  list(data = LWK, population = "LWK"),
  list(data = ESN, population = "ESN"),
  list(data = GWD, population = "GWD"),
  list(data = MSL, population = "MSL"),
  list(data = YRI, population = "YRI")
)

# Add population column and bind data from all files
africans_combined <- do.call(rbind, lapply(file_pop, function(fp) {
  add_population(fp$data, fp$population)
}))

# Save the combined data to a single file
write.csv(africans_combined, "africans_combined.csv", row.names = FALSE)

# Define theme with larger text size for legend
custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

# Create the plot
plot_combined <- ggplot(data = africans_combined, aes(x = pos, y = combined_p_values, color = population)) +
  geom_point(pch = 19, size = 4) +
  labs(x = "Genomic Position", y = "-log10(p-value)", title = "Africans") +
  theme_light() +
  custom_theme +
  scale_color_brewer(palette = "Set2") +
  geom_hline(yintercept = 6, color = "red")

# Print the plot
print(plot_combined)
ggsave("africans_combined_pvalues_SQUARE_big.png", plot_combined, width = 7, height = 5)

# Load the necessary libraries
library(dplyr)
library(ggplot2)

# Define file paths for the five .sele files
file_paths <- c(
  GBR = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_GBR.csv",
  FIN = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_FIN.csv",
  IBS = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_IBS.csv",
  TSI = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_TSI.csv",
  PUR = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_PUR.csv"
)

# Read the files
GBR <- read.csv(file_paths["GBR"], header = TRUE)
FIN <- read.csv(file_paths["FIN"], header = TRUE)
IBS <- read.csv(file_paths["IBS"], header = TRUE)
TSI <- read.csv(file_paths["TSI"], header = TRUE)
PUR <- read.csv(file_paths["PUR"], header = TRUE)

# Function to add population column to the data
add_population <- function(data, population) {
  data %>%
    mutate(population = population)
}

# Define data frames and corresponding population names
file_pop <- list(
  list(data = GBR, population = "GBR"),
  list(data = FIN, population = "FIN"),
  list(data = IBS, population = "IBS"),
  list(data = TSI, population = "TSI"),
  list(data = PUR, population = "PUR")
)

# Add population column and bind data from all files
combined_data <- do.call(rbind, lapply(file_pop, function(fp) {
  add_population(fp$data, fp$population)
}))

# Save the combined data to a single file
write.csv(combined_data, "european_populations.csv", row.names = FALSE)

# Create the plot
plot_combined <- ggplot(data = combined_data, aes(x = pos, y = combined_p_values, color = population)) +
  geom_point(pch = 19, size = 4) +
  labs(x = "Genomic Position", y = "-log10(p-value)", title = "Europeans+PUR") +
  theme_light() +
  custom_theme+
  scale_color_manual(values = c(
    "GBR" = "lightblue", 
    "FIN" = "lightpink", 
    "IBS" = "gold", 
    "TSI" = "lightgreen", 
    "PUR" = "orange2"
  )) +
  geom_hline(yintercept = 6, color = "red")

# Print the plot
print(plot_combined)
ggsave("europeans_combined_pvalues_SQUARE_big.png", plot_combined, width = 7, height = 5)

# Define file paths for the five .sele files
file_paths_asia <- c(
  CDX = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_CDX.csv",
  CHB = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_CHB.csv",
  CHS = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_CHS.csv",
  JPT = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_JPT.csv",
  KHV = "/home/ari/ari-intern/people/ari/ariadna-intern/steps/fisher/combined_KHV.csv"
)

# Read the files
CDX <- read.csv(file_paths_asia["CDX"], header = TRUE)
CHB <- read.csv(file_paths_asia["CHB"], header = TRUE)
CHS <- read.csv(file_paths_asia["CHS"], header = TRUE)
JPT <- read.csv(file_paths_asia["JPT"], header = TRUE)
KHV <- read.csv(file_paths_asia["KHV"], header = TRUE)

# Function to add population column to the data
add_population <- function(data, population) {
  data %>%
    mutate(population = population)
}

# Define data frames and corresponding population names
file_pop <- list(
  list(data = CDX, population = "CDX"),
  list(data = CHB, population = "CHB"),
  list(data = CHS, population = "CHS"),
  list(data = JPT, population = "JPT"),
  list(data = KHV, population = "KHV")
)

# Add population column and bind data from all files
asians_combined <- do.call(rbind, lapply(file_pop, function(fp) {
  add_population(fp$data, fp$population)
}))

# Save the combined data to a single file
write.csv(asians_combined, "asians_combined.csv", row.names = FALSE)

# Define theme with larger text size for legend
custom_theme <- theme_minimal() +
  theme(
    text = element_text(size = 14),  # Increase the text size for all text elements
    plot.title = element_text(size = 16, hjust = 0.5),  # Increase the title size
    axis.title = element_text(size = 15),  # Increase the axis label size
    axis.text = element_text(size = 12),  # Increase the axis text size
    legend.text = element_text(size = 14)  # Increase the legend text size
  )

# Create the plot
plot_combined <- ggplot(data = asians_combined, aes(x = pos, y = combined_p_values, color = population)) +
  geom_point(pch = 19, size = 4) +
  labs(x = "Genomic Position", y = "-log10(p-value)", title = "East Asians") +
  theme_light() +
  custom_theme +
  scale_color_brewer(palette = "Paired") +
  geom_hline(yintercept = 6, color = "red")

# Print the plot
print(plot_combined)
ggsave("asians_combined_pvalues_SQUARE_big.png", plot_combined, width = 7, height = 5)