diff --git a/COLOC/Snakefile b/COLOC/Snakefile index a503f63..b6387fe 100644 --- a/COLOC/Snakefile +++ b/COLOC/Snakefile @@ -5,24 +5,31 @@ gwas_data = config["gwas"] qtl_data = config["qtl"] outFolder = config["outFolder"] geneMeta = config["geneMeta"] - +sig_threshold = config["sig_threshold"] # sanity check - can the GWAS and QTL data be found in the database? -gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2) -qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1) +gwas_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 1) +qtl_df = pd.read_excel("/sc/arion/projects/ad-omics/data/references//GWAS/GWAS-QTL_data_dictionary.xlsx", sheet_name = 2) gwas_check = all(i in gwas_df["dataset"].tolist() for i in gwas_data ) qtl_check = all(i in qtl_df["dataset"].tolist() for i in qtl_data ) -if not all([gwas_check, qtl_check]): - print(" * QTL and/or GWAS cannot be found in database") +if not all([gwas_check]): + print(" * GWAS cannot be found in database") + sys.exit() + +if not all([qtl_check]): + print(" * QTL cannot be found in database") + print(qtl_check) sys.exit() + shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;") rule all: input: #expand(outFolder + "{GWAS}/{QTL}/{QTL}_{GWAS}_coloc_results.snp.1kgp3_ld.tsv.gz", GWAS = gwas_data, QTL = qtl_data) outFolder + "all_COLOC_results_merged_H4_0_no_LD.tsv.gz", outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz", + #outFolder + "pairwise_QTL_results.tsv.gz" #expand( outFolder + "{GWAS}/{QTL}_{GWAS}_COLOC.tsv", GWAS = gwas_data, QTL = qtl_data ) #outFolder + "all_COLOC_summary_results.tsv.gz" @@ -34,8 +41,8 @@ rule run_COLOC: script = "scripts/run_COLOC.R" shell: "mkdir -p {outFolder}{wildcards.GWAS};" - "ml R/4.0.3;" #ml arrow;" - "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ " + "ml R;" #ml arrow;" + "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} -o {outFolder}{wildcards.GWAS}/ -s 1e-5 --verbose --lowmem" rule merge_COLOC_snp_level: input: @@ -46,7 +53,7 @@ rule merge_COLOC_snp_level: params: script = "scripts/merge_COLOC_results.R" shell: - "ml R/4.0.3;" + "ml R;" # LDLINKR only installed on 3.6 "Rscript {params.script} --threshold 0 --inFolder {outFolder} --geneMeta {geneMeta};" "Rscript {params.script} --threshold 0.5 --ld --inFolder {outFolder} --geneMeta {geneMeta} " @@ -61,3 +68,25 @@ rule create_LD_tables: shell: "conda activate echoR;" "Rscript {params.script} -g {wildcards.GWAS} -q {wildcards.QTL} --inFile {input} --outFolder {params.out}" + +rule prepare_QTL_COLOC: + input: + outFolder + "all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz" + output: + outFolder + "pairwise_QTL_COLOC_targets.tsv.gz" + params: + script = "scripts/prepare_pairwise_qtl_coloc.R" + shell: + "ml R/4.2.0;" + "Rscript {params.script} -i {input} -o {output} -m across -p 0.8" + +rule run_QTL_COLOC: + input: + outFolder + "pairwise_QTL_COLOC_targets.tsv.gz" + output: + outFolder + "pairwise_QTL_results.tsv.gz" + params: + script = "scripts/run_COLOC.R" + shell: + "ml R/4.2.0;" + "Rscript {params.script} -t {input} -o {output}" diff --git a/COLOC/cluster.yaml b/COLOC/cluster.yaml index 337150f..bccd9de 100644 --- a/COLOC/cluster.yaml +++ b/COLOC/cluster.yaml @@ -1,10 +1,13 @@ __default__: - #partition: chimera queue: premium cores: 1 - mem: 24000 + mem: 48000 time: '120' name: $(basename $(pwd)):{rule}:{wildcards} output: logs/{rule}:{wildcards}.stdout error: logs/{rule}:{wildcards}.stderr himem: "" +pairwise_QTL_COLOC: + cores: 4 + mem: 16000 + time: 8640 diff --git a/COLOC/pairwise_qtl_coloc.smk b/COLOC/pairwise_qtl_coloc.smk new file mode 100644 index 0000000..a79653f --- /dev/null +++ b/COLOC/pairwise_qtl_coloc.smk @@ -0,0 +1,31 @@ +import sys +import pandas as pd +import glob +import re +import os + +# Pairwise QTL COLOC pipeline +# Jack Humphrey +outFolder = config["outFolder"] +inFolder = config["inFolder"] +MAF_file = config["MAF_file"] + +target_files = [ x for x in glob.glob(inFolder + "*genewide*pairs*tsv.gz" )] +target_ids = [os.path.basename(re.sub('.tsv.gz', '', x, count=1)) for x in target_files ] + + +shell.prefix("export PS1=""; ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate snakemake;") +rule all: + input: + expand( outFolder + "{target}_COLOC.tsv.gz", target = target_ids) + +rule pairwise_QTL_COLOC: + input: + inFolder + "{target}.tsv.gz" + output: + outFolder + "{target}_COLOC.tsv.gz" + params: + script = "scripts/run_COLOC.R" + shell: + "ml R;" + "Rscript {params.script} --targets {input} -o {outFolder} --lowmem --threads 1 --MAF {MAF_file}" diff --git a/COLOC/scripts/prepare_pairwise_qtl_coloc.R b/COLOC/scripts/prepare_pairwise_qtl_coloc.R new file mode 100644 index 0000000..1c5028e --- /dev/null +++ b/COLOC/scripts/prepare_pairwise_qtl_coloc.R @@ -0,0 +1,73 @@ +library(optparse) + +option_list <- list( + make_option(c('-o', '--outFile'), help='the path to the output file', default = ""), + make_option(c('-i', '--inFile'), help='the path to the input file', default = ""), + make_option(c('-m', '--mode'), help='mode - across or within?', default = "across"), + make_option(c('-p', '--pp'), help='min PP4', default = 0.5), + make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE) +) + +option.parser <- OptionParser(option_list=option_list) +opt <- parse_args(option.parser) + +inFile <- opt$inFile +outFile <- opt$outFile +mode <- opt$mode +PP4 <- opt$pp +library(tidyverse) + +#all_coloc <- read_tsv("../all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz") +all_coloc <- read_tsv(inFile) + +all_coloc$distance <- abs(all_coloc$GWAS_pos - all_coloc$QTL_pos) +all_coloc$distance_filter <- (!is.na(all_coloc$LD) & all_coloc$LD > 0.1) | all_coloc$distance < 5e5 + +qtl_coloc_input <- all_coloc %>% + filter(distance_filter == TRUE & PP.H4.abf > PP4) %>% + select(GWAS, QTL, locus, feature) %>% + distinct() + +message(" * ", nrow(qtl_coloc_input), " phenotypes colocalize at PP4 > ", PP4) +stopifnot( nrow(qtl_coloc_input) > 0) + + +shared_loci <- select(qtl_coloc_input, GWAS, locus, QTL) %>% + distinct() %>% + group_by(GWAS, locus) %>% + tally() %>% + filter(n > 1) +message(" * ", nrow(shared_loci), " loci are shared between QTL types") +stopifnot( nrow(shared_loci) > 0) + +qtl_coloc_input <- filter(qtl_coloc_input, locus %in% shared_loci$locus) + +# get all pairwise comparisons required +# make optional - do you want to test features within a dataset or just across? +qtl_coloc_split <- qtl_coloc_input %>% + split(paste0(.$GWAS, ":", .$locus)) %>% + map_df( ~{ + d <- select(.x, GWAS,locus, QTL, feature) %>% distinct() + qtl_pairs <- combn(unique(paste(d$QTL, d$feature) ), 2) + locus_df <- select(d, GWAS, locus) + pair_df <- + data.frame(qtl_1 = qtl_pairs[1,], qtl_2 = qtl_pairs[2,] ) %>% + tidyr::separate(qtl_1, into = c("qtl_1", "feature_1"), sep = " ") %>% + tidyr::separate(qtl_2, into = c("qtl_2", "feature_2"), sep = " ") %>% + mutate(GWAS = unique(d$GWAS), locus = unique(d$locus)) %>% + select(GWAS, locus, everything() ) + + }) %>% + distinct() + +if( mode == "across"){ + +qtl_coloc_split <- qtl_coloc_split %>% + mutate( within = qtl_1 == qtl_2) %>% filter(within == FALSE) +} + +write_tsv(qtl_coloc_split, outFile) #"test_eQTL_edQTL_pairwise_input.tsv") + + + + diff --git a/COLOC/scripts/run_COLOC.R b/COLOC/scripts/run_COLOC.R index 69bc401..1758c26 100644 --- a/COLOC/scripts/run_COLOC.R +++ b/COLOC/scripts/run_COLOC.R @@ -26,9 +26,24 @@ suppressPackageStartupMessages(library(optparse)) suppressPackageStartupMessages(library(GenomicRanges)) suppressPackageStartupMessages(library(rtracklayer)) +options(future.globals.maxSize = 8000 * 1024^2) #library(arrow) +message_ <- function(text){ + message(Sys.time(), " * ", text) +} +## verbose messaging +message_verbose <- function(input){ + if(verbose){ + if( "data.frame" %in% class(input)){ + print(input) + } + if( class(input) == "character"){ + message(Sys.time(), " * ", input) + } + } +} # flank coordinates by set number of bases (default 1MB) # work on either a coordinate string or a dataframe containing chr start and end columns joinCoords <- function(input, flank = 0){ @@ -36,6 +51,7 @@ joinCoords <- function(input, flank = 0){ coord <- splitCoords(input) coord$start <- coord$start - flank coord$end <- coord$end + flank + if( coord$start < 0){coord$start <- 0} coord <- paste0(coord$chr, ":", coord$start, "-", coord$end) return(coord) } @@ -43,10 +59,15 @@ joinCoords <- function(input, flank = 0){ stopifnot( all(c("chr", "start", "end") %in% names(input) ) | all(c("chr","pos") %in% names(input) ) ) stopifnot( flank >= 0) if( all( c("chr","start","end") %in% names(input) ) ){ - coords <- paste0(input$chr, ":", input$start - flank, "-",input$end + flank) + # deal with flanks that go negative + starts <- input$start - flank + starts[starts < 0 ] <- 1 + coords <- paste0(input$chr, ":", starts, "-",input$end + flank) } if( all( c("chr", "pos") %in% names(input) ) ){ - coords <- paste0(input$chr, ":", input$pos - (flank + 1), "-", input$pos + flank) + starts <- input$pos - (flank + 1) + starts[starts < 0 ] <- 1 + coords <- paste0(input$chr, ":", starts, "-", input$pos + flank) } return(coords) } @@ -61,10 +82,10 @@ splitCoords <- function(coords){ } pullData <- function(dataset, type = "GWAS"){ - message(Sys.time()," * selected dataset: ", dataset) + message_verbose(paste0("selected dataset: ", dataset)) db_path <- "/sc/arion/projects/ad-omics/data/references/GWAS/GWAS-QTL_data_dictionary.xlsx" - message(Sys.time()," * reading GWAS database from ", db_path) + message_verbose(paste0("reading GWAS database from ", db_path)) stopifnot( file.exists(db_path) ) gwas_db <- suppressMessages(readxl::read_excel(db_path, sheet = type, na= c("", "-","NA"))) @@ -132,9 +153,9 @@ extractLoci <- function(gwas){ # already filtered from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/supporting/EUR.2of4intersection_allele_freq.20100804.sites.vcf.gz # all sites with an RSID and 2+ alleles loadMAF <- function(path){ - message(" * loading in MAF from 1000 Genomes") + #message_("loading in MAF from 1000 Genomes") maf <- data.table::fread(path, nThread = 4) - names(maf) <- c("chr", "pos", "snp", "MAF") + names(maf)[1:4] <- c("chr", "pos", "snp", "MAF") maf$chr <- as.character(maf$chr) data.table::setkey(maf, "chr") maf$MAF <- suppressWarnings(as.numeric(maf$MAF)) @@ -153,13 +174,14 @@ matchMAF <- function(data, maf){ # subset maf by chromosome maf <- maf[chr] # match on RSID - message(" * before joining MAF: ", nrow(data), " SNPs") + message_verbose(paste0("before joining MAF: ", nrow(data), " SNPs")) data$MAF <- maf$MAF[match(data$snp, maf$snp)] - matched <- data[ !is.na(data$MAF) ,] - message(" * after joining MAF: ", nrow(matched), " SNPs") + matched <- data[ !is.na(data$MAF) & data$MAF >0 & data$MAF < 1,] + message_verbose(paste0("after joining MAF: ", nrow(matched), " SNPs")) if(nrow(matched) == 0 ){ - print(head(data) ) - stop("no SNPs match on RSID. Inspect data") + #message_verbose(head(data) ) + message_verbose("no SNPs match on RSID. Inspect data") + return(NULL) } return(matched) } @@ -198,11 +220,11 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da } } - cmd <- paste( "ml bcftools; tabix -h ", gwas_path, coord ) - message(" * running command: ", cmd) + cmd <- paste( "ml bcftools/1.9; tabix -h ", gwas_path, coord ) + message_verbose(paste0("running command: ", cmd) ) result <- as.data.frame(data.table::fread(cmd = cmd, nThread = 4) ) - message(" * GWAS extracted!") + message_verbose("GWAS extracted!") stopifnot( nrow(result) > 0 ) @@ -222,20 +244,18 @@ extractGWAS <- function(gwas, coord, refFolder = "/sc/arion/projects/ad-omics/da #return(result) # deal with MAF if missing - if( debug == TRUE){ result$MAF <- NA }else{ if( is.na(mafCol) | force_maf == TRUE ){ - message(" * MAF not present - using 1000 Genomes MAF") + message_verbose("MAF not present - using 1000 Genomes MAF") # how to get this in to the function? it can't find maf_1000g result <- matchMAF(result, maf = maf_1000g) }else{ - message(" * using supplied MAF") + message_verbose("using supplied MAF") names(result)[names(col_dict) == mafCol] <- "MAF" } - } # convert standard error to the variance # se is standard error - not standard deviation! result$varbeta <- result$varbeta^2 - print(head(result)) + #print(head(result)) #save.image("debug.RData") result <- dplyr::select( result, snp, pvalues, beta, varbeta, MAF, chr, pos, A1, A2) @@ -276,9 +296,9 @@ liftOverCoord <- function(coord_string, from = "hg19", to = "hg38"){ ) # lift over using rtracklayer # watch out for duplicate entries - message(" * lifting over....") + message_verbose("lifting over....") lifted_over <- rtracklayer::liftOver(coord_gr, chain ) - message(" * lifted!" ) + message_verbose("lifted!" ) #return(lifted_over) stopifnot(length(lifted_over) == length(coord_gr) ) @@ -308,7 +328,7 @@ extractQTL_parquet <- function(qtl, coord, sig_level = 0.05){ perm_res <- dplyr::bind_cols(perm_res, splitCoords(perm_res$variant_id) ) sig <- dplyr::filter(perm_res, qval < sig_level & chr == coord_split$chr & start >= coord_split$start & start <= coord_split$end ) - message( paste0( nrow(sig), " significant genes or splicing events at this locus" ) ) + message_verbose( paste0( nrow(sig), " significant genes or splicing events at this locus" ) ) if( nrow(sig) == 0 ){ return(NULL) } # read in nominal QTL associations @@ -332,23 +352,23 @@ extractQTL_tabix <- function(qtl, coord){ } } stopifnot( file.exists(qtl$full_path) ) - cmd <- paste( "ml bcftools; tabix -h ", qtl$full_path, coord ) - message(" * running command: ", cmd) + cmd <- paste( "ml bcftools/1.9; tabix -h ", qtl$full_path, coord ) + message_verbose(paste0("running command: ", cmd)) result <- as.data.frame(data.table::fread(cmd = cmd, nThread = 4) ) # deal with regions of no QTL association if( nrow(result) == 0){ return(NULL) } #stopifnot( nrow(result) > 0 ) - message(" * QTL extracted!") + message_verbose("QTL extracted!") return(result) } # extract all nominal QTL P-values overlapping the flanked GWAS hit # split by Gene being tested -# sig_level doesn't do anything yet -extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ +# now implemented sig_level filtering for minimum P-value for each phenotype +extractQTL <- function(qtl, coord, sig_level = NULL, force_maf = FALSE, targets = NULL){ # variables stored in qtl # if qtl type is parquet then read in parquet files # if qtl type is tabix then query the coordinates through tabix @@ -379,7 +399,7 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ columns <- colnames(data.table::fread( cmd = cmd )) col_dict <- setNames(1:length(columns), columns) - print(col_dict) + #print(col_dict) #stopifnot( ncol(result) == length(col_dict) ) col_dict <- col_dict[1:ncol(result)] # hacky workaround for now names(result)[names(col_dict) == phenoCol] <- "gene" @@ -392,42 +412,56 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ names(result)[names(col_dict) == seCol] <- "varbeta" # don't forget to square the standard error to get the variance result$varbeta <- result$varbeta^2 - + message_verbose(" remove associations with 0 variance") + message_verbose(paste0(" before: ", nrow(result), " associations")) # remove rows with variance of 0 - this will corrupt COLOC result <- dplyr::filter(result, varbeta != 0) + message_verbose(paste0(" after: ", nrow(result), " associations")) } names(result)[names(col_dict) == snpCol] <- "snp" # Young microglia use log10P - convert if( pvalCol == "log10_p"){ result$pvalues <- 10^result$pvalues } - + # P-value thresholding - methylation QTLs, there are way too many sites to test all of them + if( !is.null(sig_level) ){ + message_verbose(paste0(" currently: ", length(unique(result$gene) ), " phenotypes in window" ) ) + message_verbose(paste0(" removing phenotypes with P < ", sig_level)) + pheno_p <- result %>% dplyr::group_by(gene) %>% dplyr::slice_min(pvalues, n = 1, with_ties = FALSE) %>% dplyr::filter(pvalues < as.numeric(sig_level) ) + result <- dplyr::filter(result, gene %in% pheno_p$gene) + if(nrow(result) == 0){ + message_verbose(" 0 phenotypes remaining after P thresholding") + return(NULL) + } + message_verbose(paste0("retaining ", nrow(result), " associations from ", nrow(pheno_p), " phenotypes" )) + } # deal with MAF - meta-analysis outputs won't have it # this requires "chr" to be present in QTL data - if( debug == TRUE){ result$MAF <- NA }else{ + #if( debug == TRUE){ result$MAF <- NA }else{ # why do we do this? breaks it when debug is TRUE if( is.na(mafCol) | force_maf == TRUE ){ - message(" * MAF not present - using 1000 Genomes MAF") + message_verbose("MAF not present - using 1000 Genomes MAF") #stopifnot( "chr" %in% names(col_dict) ) names(result)[names(col_dict) == "chr"] <- "chr" result <- matchMAF(result, maf = maf_1000g) - print(head(result) ) + if( all(is.na(result$MAF) ) | is.null(result) ){ return(NULL) } + #print(head(result) ) }else{ - message(" * using supplied MAF") + message_verbose("using supplied MAF") names(result)[names(col_dict) == mafCol] <- "MAF" - print(col_dict) - print(head(result) ) - } + #print(col_dict) + #print(head(result) ) } + #} # deal with edge cases - 1 gene and the gene id is NA if( all( is.na(result$gene) ) ){ - message(" * no genes in QTL result") + message_verbose("no genes in QTL result") return(NULL) }else{ result <- dplyr::filter(result, !is.na(gene) ) } # if MAF needs matching then chrCol will be "chr" - change it back to "QTL_chr" - print( names(col_dict) ) - print( names(col_dict) == chrCol ) + #print( names(col_dict) ) + #print( names(col_dict) == chrCol ) names(result)[which(names(col_dict) == chrCol) ] <- "QTL_chr" names(result)[which(names(col_dict) == posCol) ] <- "QTL_pos" @@ -439,10 +473,28 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ }else{ res_subset <- dplyr::select(result, gene, snp, pvalues, MAF, QTL_chr, QTL_pos) } - print(head(res_subset) ) - print("passed this point") + # jan 2025: another edge case - NA pvalues + message_verbose("removing NA P-values") + res_subset <- filter(res_subset, !is.na(pvalues) ) + message_verbose(paste0("after: ", nrow(res_subset) )) + # edge case - multiallelic SNPs lead to two entries per SNP-gene - remove + message_verbose("removing multi-allelic SNPs") + res_subset <- dplyr::distinct(res_subset) + message_verbose(paste0("after: ", nrow(res_subset) )) + #print(head(res_subset) ) + message_verbose("passed this point") + #save(list = ls(all.names = TRUE), file = "image.RData", envir = + #environment()) #dplyr::filter( pos >= coord_split$start & pos <= coord_split$end) - + + if( !is.null(targets) ){ + res_subset <- dplyr::filter(res_subset, gene == targets) + message_verbose(paste0(length(unique(res_subset$gene)), " target features remain" ) ) + message_verbose(paste0(nrow(res_subset), " associations remain")) + if( nrow(res_subset) == 0 ){ + return(NULL) + } + } # split by gene, convert to list object res_split <- split(res_subset, res_subset$gene) %>% @@ -455,65 +507,8 @@ extractQTL <- function(qtl, coord, sig_level = 0.05, force_maf = FALSE){ return(res_split) } - - - -# running COLOC between a GWAS locus and all eQTLs within 1MB either side -# flank coordinates -# extract GWAS nominal SNPs from region -# find GWAS hit SNP for recording -# extract QTLs within range - at set significance level -# join together and run COLOC -# output a list of objects: -## COLOC results - this should include the GWAS locus, the gene of interest, the top QTL variant and the top QTL p-value -## object - this should be a table combining the two input datasets, inner joined on "snp", to be used for plotting. -runCOLOC <- function(gwas, qtl, hit){ - message(" * Analysing LOCUS: ", hit$locus ) - # hit is a dataframe containing "snp", "chr", "pos", "locus" - - hit_coord <- joinCoords(hit, flank = 0) - hit_range <- joinCoords(hit, flank = 1e6) - - # extract GWAS and QTL summary for given coord - message(" * extracting GWAS") - g <- extractGWAS(gwas, hit_range) - # get hit info from GWAS - hit_snp <- hit$snp - # this assumes that the hit SNP is within the g object - # if the top hit doesn't have an RSID or cannot be MAF matched then this is violated - # I will have to use the hit object instead and accept that not all top_loci lists include the MAF, P value etc. - #if( hit_snp == hit$snp ){ - #hit_info <- as.data.frame(g, stringsAsFactors = FALSE) %>% filter(snp == hit_snp) %>% - # select(GWAS_SNP = snp, GWAS_P = pvalues, GWAS_effect_size = beta, GWAS_MAF = MAF) %>% - # mutate(locus = hit$locus) %>% select(locus, everything() ) - #}else{ - # now just use top loci - - # problem when no p column in hit! - - hit_info <- hit %>% - select(locus, GWAS_SNP = snp, GWAS_P = p) - #} - - # if GWAS is hg19 then lift over to hg38 - # use the lifted over position in hit_info - if(gwas$build != qtl$build){ - qtl_coord <- liftOverCoord(hit_coord, from = gwas$build, to = qtl$build) - }else{ - qtl_coord <- hit_coord - } - # record the position of the GWAS top SNP - hit_info$GWAS_chr <- splitCoords(qtl_coord)$chr - hit_info$GWAS_pos <- splitCoords(qtl_coord)$end - - - qtl_range <- joinCoords(qtl_coord, flank = 1e6) - - message(" * extracting QTLs") - q <- extractQTL(qtl, qtl_range) - print(names(q[[1]])) - if( is.null(q) ){ return(NULL) } - # for each gene extract top QTL SNP +# for each gene extract top QTL SNP +getQTLinfo <- function(q, hit_info){ if( all( c("beta", "varbeta") %in% names(q[[1]]) ) ){ qtl_info <- q %>% map_df( ~{ @@ -538,7 +533,7 @@ runCOLOC <- function(gwas, qtl, hit){ } }) qtl_info <- bind_cols(qtl_info, gwas_info) - print(head(qtl_info) ) + message_verbose(head(qtl_info) ) }else{ qtl_info <- q %>% map_df( ~{ @@ -548,14 +543,91 @@ runCOLOC <- function(gwas, qtl, hit){ }) } - # actually run COLOC - message(" * running COLOC") + return(qtl_info) +} + + +# running COLOC between a GWAS locus and all eQTLs within 1MB either side +# flank coordinates +# extract GWAS nominal SNPs from region +# find GWAS hit SNP for recording +# extract QTLs within range - at set significance level +# join together and run COLOC +# output a list of objects: +## COLOC results - this should include the GWAS locus, the gene of interest, the top QTL variant and the top QTL p-value +## object - this should be a table combining the two input datasets, inner joined on "snp", to be used for plotting. +## sig.level now filters out QTLs with P > sig.level +## bare - if TRUE then only return COLOC summary, not full object (saves memory) +runCOLOC <- function(gwas, qtl, qtl2 = NULL, hit, sig.level = NULL, target.file = NULL, bare = FALSE){ + message_(paste0("Analysing locus: ", hit$locus )) + # hit is a dataframe containing "snp", "chr", "pos", "locus" - # run COLOC on each QTL gene + hit_coord <- joinCoords(hit, flank = 0) + hit_range <- joinCoords(hit, flank = 1e6) + + # extract GWAS and QTL summary for given coord + if( is.null(qtl2) ){ + message_verbose("extracting GWAS") + g <- extractGWAS(gwas, hit_range) + } + # get hit info from GWAS + hit_snp <- hit$snp + + # problem when no p column in hit! + if(is.null(gwas) ){ + hit_info <- hit %>% + select(locus) %>% mutate(GWAS_SNP = NA, GWAS_P = NA) + qtl_range <- hit_range + }else{ + hit_info <- hit %>% + select(locus, GWAS_SNP = snp, GWAS_P = p) + # if GWAS is hg19 then lift over to hg38 + if(gwas$build != qtl$build){ + qtl_coord <- liftOverCoord(hit_coord, from = gwas$build, to = qtl$build) + }else{ + qtl_coord <- hit_coord + } + # record the position of the GWAS top SNP + hit_info$GWAS_chr <- splitCoords(qtl_coord)$chr + hit_info$GWAS_pos <- splitCoords(qtl_coord)$end + qtl_range <- joinCoords(qtl_coord, flank = 1e6) + } + message_verbose("extracting QTLs") + q <- extractQTL(qtl, qtl_range, targets = target.file[1], sig_level = sig.level) + #print(names(q[[1]])) + if( is.null(q) ){ return(NULL) } + qtl_info <- getQTLinfo(q, hit_info) + # if sig filtering requested + if( !is.null(sig.level ) ){ + qtl_info <- filter(qtl_info, QTL_P < sig.level) + q <- q[ qtl_info$gene ] + message_verbose(paste0(length(q), " features in QTL")) + } + + if( !is.null(qtl2) ){ + # if second QTL dataset requested + q2 <- extractQTL(qtl2, qtl_range, targets = target.file[2]) + if( is.null(q2) ){ return(NULL) } + # get QTL betas for lead SNP in QTL 1 + hit_info_qtl1 <- data.frame(locus = hit_info$locus, GWAS_SNP = qtl_info$QTL_SNP, GWAS_P = qtl_info$QTL_P) + qtl2_info <- getQTLinfo(q2, hit_info_qtl1) + # get QTL betas for lead SNP in QTL 2 + hit_info_qtl2 <- data.frame(locus = hit_info$locus, GWAS_SNP = qtl2_info$QTL_SNP, GWAS_P = qtl2_info$QTL_P) + qtl_info <- getQTLinfo(q, hit_info_qtl2) + if( !is.null(sig.level ) ){ + qtl2_info <- filter(qtl2_info, QTL_P < sig.level) + q2 <- q2[ qtl2_info$gene ] + } + message_verbose( paste0(length(q2), " features in QTL 2")) + } + # actually run COLOC + message_("running COLOC") # return the results table and an object containing g and q + # GWAS-QTL COLOC + if( is.null(qtl2) ){ + message_("GWAS-QTL COLOC") coloc_res <- purrr::map( q, ~{ - # if QTL has no overlapping SNPs with GWAS summary then return NULL if( length(intersect(g$snp, .x$snp) ) == 0 ){ message("Hold up! GWAS and QTL have no SNPs in common") return(NULL) @@ -564,25 +636,58 @@ runCOLOC <- function(gwas, qtl, hit){ # add in g and q to coloc object gq <- dplyr::inner_join(as.data.frame(g),as.data.frame(.x), by = "snp", suffix = c(".gwas", ".qtl") ) coloc_object$results <- dplyr::inner_join(coloc_object$results, gq, by = "snp" ) - # pull out coloc summary as data.frame coloc_df <- as.data.frame(t(coloc_object$summary), stringsAsFactors = FALSE) return( list(df = coloc_df, object = coloc_object) ) }) - if( is.null(coloc_res) ){ return(NULL) } + }else{ + message_("QTL-QTL COLOC") + # remove features with lead SNPs > 1e-5 + # test all pairs of QTL features in the locus + qtl_combos <- expand.grid(1:length(q), 1:length(q2) ) + message_(paste0("testing ", nrow(qtl_combos), " pairs of QTLs")) + # qtl_info is matched on to COLOC results using gene names - but for QTL-QTL there are two genes, so instead use the index of the pair + qtl_info <- + full_join( + qtl_info[qtl_combos$Var1,] %>% mutate(index = 1:nrow(qtl_combos) ), + qtl2_info[qtl_combos$Var2,] %>% mutate(index = 1:nrow(qtl_combos) ), + by = "index", + suffix = c(".qtl1", ".qtl2") ) %>% + dplyr::mutate(gene = as.character(index)) %>% + select(-index) - message(" * COLOC finished") + coloc_res <- + purrr::map( 1:nrow(qtl_combos), ~{ + q_1 = q[[ qtl_combos[.x, "Var1"] ]] + q_2 = q2[[ qtl_combos[.x, "Var2"] ]] + message_( paste0( .x, ": ", unique(q_1$gene), " vs ", unique(q_2$gene) ) ) + if( length(intersect(q_1$snp, q_2$snp) ) == 0 ){ + message_("Hold up! GWAS and QTL have no SNPs in common") + return(NULL) + } + coloc_object <- coloc.abf(dataset1 = q_1, dataset2 = q_2) + qq <- dplyr::inner_join(as.data.frame(q_1),as.data.frame(q_2), by = "snp", suffix = c(".qtl1", ".qtl2") ) + coloc_object$results <- dplyr::inner_join(coloc_object$results, qq, by = "snp" ) + coloc_df <- as.data.frame(t(coloc_object$summary), stringsAsFactors = FALSE) + return( list(df = coloc_df, object = coloc_object) ) + }) + } + if( is.null(unlist(coloc_res)) ){ return(NULL) } + + message_("COLOC finished") coloc_df <- map_df(coloc_res, "df", .id = "gene") %>% dplyr::mutate(locus = hit$locus ) %>% dplyr::select(locus, gene, everything() ) - message(" * COLOC res created") + message_("COLOC res created") # bind coloc_res to other tables full_res <- full_join(qtl_info, coloc_df, by = "gene") full_res <- full_join(hit_info, full_res, by = "locus") - - # add pairwise LD between GWAS and QTL SNP - #full_res <- calc_LD(full_res) - - return(list(full_res = full_res, coloc_object = coloc_res) ) + # remove rows with no coloc run + full_res <- filter(full_res, !is.na(PP.H4.abf) ) + if(bare == TRUE){ + return(list(full_res = full_res) ) + }else{ + return(list(full_res = full_res, coloc_object = coloc_res) ) + } } # for each COLOC result per locus @@ -592,7 +697,7 @@ calc_LD <- function( coloc_res ){ # get LDlink token from .Renviron (only Jack has this, for access go to https://ldlink.nci.nih.gov/?tab=apiaccess) token <- Sys.getenv("LDLINK_TOKEN") if( token == ""){ - warning(" * LDlink token not found!" ) + warning(Sys.time(), " * LDlink token not found!" ) return(NA) } snps <- unique( c( unique(coloc_res$GWAS_SNP), unique( coloc_res$QTL_SNP) ) ) @@ -615,80 +720,143 @@ calc_LD <- function( coloc_res ){ option_list <- list( - make_option(c('-o', '--outFolder'), help='the path to the output file', default = ""), - make_option(c('--gwas', '-g'), help= "the dataset ID for a GWAS in the GWAS/QTL database" ), + make_option(c('-o', '--outFolder'), help='the path to the output file', default = "."), + make_option(c('--gwas', '-g'), help= "the dataset ID for a GWAS in the GWAS/QTL database", default = NULL ), make_option(c('--qtl', '-q'), help = "the dataset ID for a QTL dataset in the GWAS/QTL database"), - make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE) + make_option(c('--qtl2', '-r'), help = "a second QTL dataset - using this will trigger QTL-QTL COLOC", default = NULL), + make_option(c('--sig', '-s'), help = "the minimum p-value a QTL should have for testing", default = NULL), + make_option(c('--loci', '-l'), help = "text file of loci names", default = NULL), + make_option(c('--targets', '-t'), help = "TSV file of features, loci and GWAS for targeted analysis", default = NULL), + make_option(c('--debug'), help = "load all files and then save RData without running COLOC", action = "store_true", default = FALSE), + make_option(c('--lowmem', '-b'), help = "do not save COLOC objects, just the summaries", action = "store_true", default = FALSE), + make_option(c('--threads', '-c'), help = "how many threads to use", default = 1), + make_option(c('--MAF', '-m'), help = "path to custom MAF file", default = NULL), + make_option(c('--verbose', '-v'), help = "if selected then output more information", action = "store_true", default = FALSE) ) option.parser <- OptionParser(option_list=option_list) opt <- parse_args(option.parser) - outFolder <- opt$outFolder gwas_dataset <- opt$gwas qtl_dataset <- opt$qtl +qtl_dataset_2 <- opt$qtl2 debug <- opt$debug -#gwas_prefix <- "/sc/arion/projects/als-omics/ALS_GWAS/Nicolas_2018/processed/Nicolas_2018_processed_" -#qtl_prefix <- "/sc/arion/projects/als-omics/QTL/NYGC_Freeze02_European_Feb2020/QTL-mapping-pipeline/results/LumbarSpinalCord_expression/peer30/LumbarSpinalCord_expression_peer30" -#qtl_prefix <- "/sc/arion/projects/als-omics/QTL/NYGC_Freeze02_European_Feb2020/QTL-mapping-pipeline/results/LumbarSpinalCord_splicing/peer20/LumbarSpinalCord_splicing_peer20" - -#hits_file <- "/sc/arion/projects/als-omics/QTL/NYGC_Freeze02_European_Feb2020/downstream-QTL/COLOC/Nicolas_2018/Nicolas_2018_hits_1e-7.tsv" -#outFile <- "test_coloc_results.tsv" - -#options(echo = TRUE) +sig <- opt$sig +loci <- opt$loci +target_file <- opt$targets +low_mem <- opt$lowmem +threads <- opt$threads +verbose <- opt$verbose +MAF <- opt$MAF + +# default MAF is 1000g phase 3v5 +# but users can provide their own +# must have columns chr, pos, snp, MAF +if( is.null(MAF) ){ + #maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz" + MAF <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz" +} -# load in data -# extract top GWAS loci -# for each locus run COLOC -maf_1000gp1 <- "/sc/arion/projects/ad-omics/data/references/1KGP1/1000G_EUR_MAF.bed.gz" -maf_1000gp3 <- "/sc/arion/projects/ad-omics/data/references/1KGPp3v5/EUR_MAF/EUR.all.phase3_MAF.bed.gz" +if(verbose){print(opt)} # load in MAF table -if( !exists("maf_1000g")){ - - maf_1000g <- loadMAF(maf_1000gp3) - # load in liftover chain - chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain") - } +if( !exists("maf_1000g") ){ + message(paste0("loading MAF from ", MAF ) ) + maf_1000g <- loadMAF(MAF) + # load in liftover chain + chain_hg19_hg38 <- import.chain("/sc/arion/projects/ad-omics/data/references/liftOver/hg19ToHg38.over.chain") +} main <- function(){ - - #gwas_dataset <- "Kunkle_2019" - #gwas_dataset <- "Nalls23andMe_2019" - #gwas_dataset <- "Marioni_2018" - #qtl_dataset <- "Microglia_THA" - - gwas <- pullData(gwas_dataset, "GWAS") - qtl <- pullData(qtl_dataset, "QTL") - - top_loci <- extractLoci(gwas) - - # for testing - #top_loci <- top_loci[2,] + message_("starting COLOC pipeline") if( debug == TRUE){ save.image("debug.RData") } - - - all_coloc <- purrr::map( + # regular GWAS-QTL COLOC + if( is.null(target_file) ){ + message_("GWAS - QTL COLOC") + gwas <- pullData(gwas_dataset, "GWAS") + qtl <- pullData(qtl_dataset, "QTL") + + if( !is.null(qtl_dataset_2) ){ + qtl2 <- pullData(qtl_dataset_2, "QTL") + }else{ qtl2 <- NULL} + top_loci <- extractLoci(gwas) + message_(paste0(" testing ", nrow(top_loci), " loci ") ) + + all_coloc <- purrr::map( 1:nrow(top_loci), ~{ - res <- runCOLOC(gwas, qtl, hit = top_loci[.x,]) + res <- runCOLOC(gwas, qtl, qtl2, hit = top_loci[.x,], sig.level = sig) if(is.null(res) ){return(NULL) } return(res) + }) + } + # QTL-QTL COLOC with pairwise targets + if(!is.null(target_file) ){ + message_("QTL-QTL COLOC") + targets <- read_tsv(target_file) + # for TESTING + #targets <- head(targets, 3) + message_( paste0("testing ", nrow(targets), " pairs of QTLs" ) ) + # doesn't work properly - furrr loads the whole MAF object into memory for each parallel run, 100MB per run which is crazy. + # for parallel to work I would have to transition to querying the MAF database each time like we do with the sumstats. + if(threads > 1){ + suppressPackageStartupMessages(library(furrr)) + plan(multisession, workers = threads) + map <- future_map } - ) - all_res <- map_df(all_coloc, "full_res") - all_obj <- map(all_coloc, "coloc_object") - names(all_obj) <- top_loci$locus - - - # arrange by H4 - all_res <- arrange(all_res, desc(PP.H4.abf) ) + all_coloc <- map( + 1:nrow(targets), ~{ + message_verbose( paste0("Pair ", .x , " of ", nrow(targets) ) ) + pair_df <- targets[.x,] + if( "GWAS" %in% names(pair_df) ){ + gwas <- pullData(pair_df$GWAS, "GWAS") + locus <- extractLoci(gwas) %>% filter(locus == pair_df$locus) + }else{ + message_verbose("no GWAS specified - using chr and pos from targets file") + locus <- pair_df + gwas <- NULL + } + qtl1 <- pullData(pair_df$qtl_1, "QTL") + qtl2 <- pullData(pair_df$qtl_2, "QTL") + features <- c(pair_df$feature_1, pair_df$feature_2) + res <- runCOLOC(gwas, qtl1, qtl2, hit = locus, sig.level = sig, target.file = features, bare = low_mem) + gc() + if(is.null(res) ){return(NULL) } + return(res) + } + ) + } - outFile <- paste0(outFolder, qtl_dataset, "_", gwas_dataset, "_COLOC.tsv") - readr::write_tsv(all_res, path = outFile) + all_res <- purrr::map_df(all_coloc, "full_res") + if(!is.null(target_file) ){ + outFile <- file.path(outFolder, gsub(".tsv", "_COLOC.tsv", basename(target_file) ) ) + # here if any of the COLOCs did not proceed then the target df will be larger than the all_res results + + #all_res <- dplyr::bind_cols(targets, all_res %>% select(-locus) ) + }else{ + outFile <- paste0(outFolder,"/", qtl_dataset, "_", gwas_dataset, "_COLOC.tsv") + names(all_obj) <- top_loci$locus + } + dir.create(outFolder, showWarnings = FALSE) + # in case all COLOCs are null for some reason + if(nrow(all_res) > 0){ + all_res <- dplyr::arrange(all_res, desc(PP.H4.abf) ) + } + message("finished COLOC pipeline") + message_verbose(paste0("writing output to ", outFile)) + readr::write_tsv(all_res, file = outFile) # save COLOC objects for plotting - save(all_obj, file = gsub("tsv", "RData", outFile)) + if( !low_mem ){ + all_obj <- purrr::map(all_coloc, "coloc_object") + if( is.null(targets) ){ + names(all_obj) <- top_loci$locus + } + outData <- gsub("tsv.gz", "RData", outFile) + outData <- gsub("tsv", "RData", outFile) + save(all_obj, file = outData) + } } main() + diff --git a/GWAS/processGWAS.R b/GWAS/processGWAS.R index 12a1835..97fc82a 100644 --- a/GWAS/processGWAS.R +++ b/GWAS/processGWAS.R @@ -81,7 +81,7 @@ get_standard_error <- function(OR, P){ # write out sorted file to reference folder # tabix index the file # Ripke has loci with chr23 for X - remove -sortGWAS <- function(gwas, out_folder = "./"){ +sortGWAS <- function(gwas, out_folder = ""){ dataset <- gwas$dataset col_chr <- gwas$full_chrom @@ -127,7 +127,7 @@ sortGWAS <- function(gwas, out_folder = "./"){ # remove duplicate rows gwas_sorted <- dplyr::distinct(gwas_sorted) - out_path <- paste0(out_folder, dataset, ".processed.tsv") + out_path <- file.path(out_folder, paste0(dataset, ".processed.tsv") ) message(Sys.time()," * writing sorted GWAS file to ", out_path) readr::write_tsv(gwas_sorted, out_path) stopifnot(file.exists(out_path) ) diff --git a/qvalue_sharing/Snakefile b/qvalue_sharing/Snakefile index f6c9c85..4b086d5 100644 --- a/qvalue_sharing/Snakefile +++ b/qvalue_sharing/Snakefile @@ -18,7 +18,7 @@ rule run_qvalue: output: outFolder + "{source}:{target}:{threshold}.qvalue.tsv" shell: - "ml R/3.6.0;" + "ml R;" "Rscript {params.script} -s {params.source} -t {params.target} -q {threshold} -o {outFolder}" rule merge_qvalue: @@ -29,5 +29,5 @@ rule merge_qvalue: output: outFolder + "all_qvalue_merged." + threshold + ".tsv" shell: - "ml R/3.6.0;" + "ml R;" "Rscript {params.script} -o {outFolder} -q {threshold}" diff --git a/qvalue_sharing/run_qvalue.R b/qvalue_sharing/run_qvalue.R index 50a372b..38c6511 100644 --- a/qvalue_sharing/run_qvalue.R +++ b/qvalue_sharing/run_qvalue.R @@ -11,10 +11,10 @@ pullData <- function(dataset, type = "GWAS"){ message(Sys.time()," * reading GWAS database from ", db_path) stopifnot( file.exists(db_path) ) if( type == "GWAS"){ - n_sheet <- 3 + n_sheet <- 2 } if( type == "QTL"){ - n_sheet <- 2 + n_sheet <- 3 } gwas_db <- suppressMessages(readxl::read_excel(db_path, sheet = n_sheet,na= c("", "-","NA"))) @@ -78,7 +78,7 @@ extractTargetQTL <- function(qtl, chr, source_qtls){ stopifnot( file.exists(qtl$full_path) ) # read in QTL - cmd <- paste0("ml bcftools; tabix ", qtl$full_path, " ", chr) + cmd <- paste0("ml tabix; tabix ", qtl$full_path, " ", chr) message( " * ", cmd) result <- data.table::fread( cmd = cmd, nThread = 8 ) if( nrow(result) == 0){