From 62c1857c36043d4501cd84912198eeb083fa4631 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Thu, 12 Mar 2026 15:30:43 +0100 Subject: [PATCH 01/13] add fgsea for genewise significance estimation --- R/2-propd-genewise.R | 411 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 383 insertions(+), 28 deletions(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 4c614621..869bbce4 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -1,32 +1,35 @@ #' Convert pairwise propd results into genewise -#' +#' #' This function converts pairwise propd results into genewise results. The resulting -#' genewise results are direct indirectors of genes being differentially expressed. -#' +#' genewise results are direct indicators of genes being differentially expressed. +#' #' @param propd A \code{\link{propd}} object, with FDR values from updateF. Note: -#' for the moment, only theta results with F-stats are supported. Later we will look +#' for the moment, only theta results with F-stats are supported. Later we will look #' on the option to get FDR values based on permutations. -#' @param pairwise_fdr FDR threashold to consider a pairwise relationship as significant. +#' @param pairwise_fdr FDR threshold to consider a pairwise relationship as significant. #' Default is 0.05. +#' @param partner_fraction Numeric in (0, 1). Fraction of genes to use as +#' partners per batch. Default 0.017. Increase for more stable ES at the +#' cost of compute time. +#' @param n_iter Integer. Number of independent iterations to aggregate. +#' Default 5. Higher values give more stable results. +#' @param ... Additional arguments passed to \code{.compute_fgsea_padj_batches}, +#' such as \code{nperm}, \code{seed}, or \code{scoreType}. #' @return A data frame with genewise results that can be used to identify differentially #' expressed genes. It contains the following columns: #' - "id": gene identifier #' - "lfc": Log Fold Change of the gene, using the geometric mean of all genes as reference. -#' - "lrmD": Log Ratio Mean Difference of the gene. Equivalent to the LFC, but using a subset +#' - "lrmD": Log Ratio Mean Difference of the gene. Equivalent to the LFC, but using a subset #' of genes as reference (only the ones that are significantly connected to the gene). #' - "connectivity": number of significant pairwise relationships the gene has. -#' - "wconnectivity": weighted connectivity, which sums the strength of significant pairwise -#' relationships (defined as 1/theta) for the gene. -#' - "FDR_mean": average FDR value across all pairwise comparisons for the gene, which can be -#' used as a genewise significance statistic. Note that this is a simple average and may -#' not be the most robust method for determining genewise significance, but it provides a -#' starting point for identifying genes of interest based on their pairwise relationships. -#' Future updates may include more sophisticated methods for calculating genewise p-values -#' or FDR values based on the pairwise results. -#' +#' - "ES": Enrichment Score of the gene. +#' - "padj": adjusted p-value of the gene, based on the Enrichment Score. +#' #' @rdname propdGenewise #' @export -propdGenewise <- function(propd, pairwise_fdr = 0.05) { +propdGenewise <- function(propd, pairwise_fdr = 0.05, + partner_fraction = 0.017, + n_iter = 5L, ...) { # for the moment it only works with theta results with F-stats, # but later we will look into the option to get FDR values based on permutations. @@ -34,6 +37,12 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05) { stop("Please run updateF on the propd object to get FDR values before running propdGenewise.") } + n_na_fdr <- sum(is.na(propd@results$FDR)) + if (n_na_fdr > 0) { + stop("%d of %d pairs (%.1f%%) have NA FDR, likely due to zero counts. ", + "Try adjusting zero-handling options (e.g., adding pseudocounts) and re-running updateF.") + } + # not working for alpha != NA too if (!is.na(propd@alpha)) { stop("propdGenewise currently only works for alpha = NA. Future updates may include support for other alpha values.") @@ -56,12 +65,14 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05) { ## ---- Connectivity ---- connectivity <- rowSums(adj, na.rm = TRUE) - ## ---- Weighted connectivity ---- - weight_mat <- ifelse(adj, 1 / theta_mat, 0) - wconnectivity <- rowSums(weight_mat, na.rm = TRUE) + ## ---- GSEA-inspired enrichment score (vectorized) ---- + es_scores <- .compute_es(propd@results, features) + + ## ---- Estimate significance of ES via permutation ---- + fgsea_batches <- .compute_fgsea_padj_batches(propd, partner_fraction = 0.017, + n_iter = 5L, nperm = 1000L, + seed = 42L, scoreType = "pos") - ## ---- FDR mean ---- - fdr_mean <- rowMeans(fdr_mat, na.rm = TRUE) ## ---- Build lrm matrices ---- lrm1_all <- results_to_matrix(propd@results, what = "lrm1", features = features) @@ -75,22 +86,366 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05) { ## ---- LFC (CLR-based log fold change) ---- # lrm differences represent log fold changes; averaging across all genes as # reference is equivalent to using the geometric mean (CLR transformation) - lrm_diff <- lrm1_all - lrm2_all - lfc <- rowMeans(lrm_diff, na.rm = TRUE) / log(2) + lrm_diff_full <- lrm1_all - lrm2_all + lfc <- rowMeans(lrm_diff_full, na.rm = TRUE) / log(2) ## ---- lrmD (LFC using only significant partners as reference) ---- - lrm_diff <- ifelse(adj, lrm_diff, NA) # keep only significant pairwise relationships - lrmD <- apply(lrm_diff, 1, median, na.rm = TRUE) / log(2) + lrm_diff_sig <- ifelse(adj, lrm_diff_full, NA) # keep only significant pairwise relationships + lrmD <- apply(lrm_diff_sig, 1, median, na.rm = TRUE) / log(2) ## ---- Compile results into a data frame ---- - data.frame( + result <- data.frame( id = features, lfc = lfc, lrmD = lrmD, connectivity = connectivity, - wconnectivity = wconnectivity, - FDR_mean = fdr_mean, + ES = es_scores$es_pos, + padj = fgsea_batches$padj, + ES_batch = fgsea_batches$ES_batch, stringsAsFactors = FALSE, row.names = NULL ) + + ## ---- Warn if full and batch ES are poorly concordant (overall) ---- + es_cor <- cor(es_scores$es_pos, fgsea_batches$ES_batch, + method = "spearman", use = "complete.obs") + if (is.na(es_cor) || es_cor < 0.7) { + warning(sprintf( + paste0( + "Low concordance between full ES and batch ES (Spearman rho = %.2f). ", + "The redundancy assumption may be violated: partner subsampling in the ", + "batch procedure may not reflect the full gene-set signal. ", + "Treat 'padj' values with caution. Consider increasing partner_fraction ", + "or n_iter." + ), + es_cor + )) + } else { + message(sprintf("ES concordance (Spearman rho = %.2f): OK.", es_cor)) + } + + es_diff_threshold <- 0.5 + padj_threshold <- 0.05 + sig_mask <- result$padj < padj_threshold & !is.na(result$padj) + relative_diff <- abs(result$ES - result$ES_batch) / (abs(result$ES) + 1e-9) + discordant_sig <- sig_mask & (relative_diff > es_diff_threshold) + + if (any(discordant_sig)) { + warning(sprintf( + paste0( + "%d significant gene(s) (padj < %.2f) show high discordance between ", + "full ES and batch ES (relative difference > %.0f%%). ", + "The redundancy assumption may be violated for these genes and their ", + "'padj' values may be unreliable. ", + "Affected genes: %s" + ), + sum(discordant_sig), + padj_threshold, + es_diff_threshold * 100, + paste(result$id[discordant_sig], collapse = ", ") + )) + } else { + message("No significant genes show high discordance between full ES and batch ES.") + } + return(result) +} + + +#' Compute GSEA-inspired enrichment scores (vectorized, integer-native) +#' +#' Pairs are ranked by theta ascending (most differentially proportional first). +#' For each gene g, its "set" is the G-1 pairs it participates in. The running +#' sum increments by 1/(G-1) on a hit and decrements by 1/(N-(G-1)) on a miss. +#' +#' Key insight: the running sum at hit j (occurring at rank r_j) has a closed form: +#' rs(j) = j * (hit_inc + miss_inc) - r_j * miss_inc +#' The minimum always occurs just BEFORE a hit (rs(j) - hit_inc, which also +#' correctly handles the trough before the first hit) or in the tail after the +#' last hit. This gives a fully vectorized implementation via tapply. +#' +#' Partner and Pair columns in results are assumed to be 1-based integer indices +#' into features (as produced by propd), so no string matching is needed. +#' +#' @param results A data frame from propd@@results with columns Partner (int), +#' Pair (int), theta. +#' @param features Character vector of gene names (length G). +#' @return A data frame with columns es, es_pos, es_neg (one row per gene, +#' same order as features). +#' @keywords internal +.compute_es <- function(results, features) { + + G <- length(features) + N <- nrow(results) + + expected_N <- G * (G - 1L) / 2L + if (N != expected_N) { + warning(sprintf( + ".compute_es: expected %d pairs for %d genes but got %d. ES may be unreliable.", + expected_N, G, N + )) + } + + K <- G - 1L + hit_inc <- 1.0 / K + miss_inc <- 1.0 / (N - K) + + # Rank all pairs by theta ascending (1 = most DP) + hit_ranks <- rank(results$theta, ties.method = "first") + + # Partner and Pair are 1-based integer indices — use directly, no match() needed. + # Build long table: each pair contributes two rows (one per gene). + long_gene <- c(results$Partner, results$Pair) + long_hit_rank <- c(hit_ranks, hit_ranks) + + # Sort by gene index, then by rank within gene + ord <- order(long_gene, long_hit_rank) + long_gene <- long_gene[ord] + long_hit_rank <- long_hit_rank[ord] + + # Hit index j (1..K) within each gene + j <- as.numeric(ave(long_hit_rank, long_gene, FUN = seq_along)) + + # Running sum AT hit j: j*(hit_inc + miss_inc) - r_j * miss_inc + rs <- j * (hit_inc + miss_inc) - long_hit_rank * miss_inc + + # Trough just BEFORE hit j (also handles pre-first-hit descent correctly) + rs_before <- rs - hit_inc + + # Aggregate per gene (integer keys -> tapply result indexed 1..G) + es_pos <- as.numeric(tapply(rs, long_gene, max)) + last_rs <- as.numeric(tapply(rs, long_gene, function(x) x[length(x)])) + last_rank <- as.numeric(tapply(long_hit_rank, long_gene, max)) + tail_val <- last_rs - (N - last_rank) * miss_inc + min_before <- as.numeric(tapply(rs_before, long_gene, min)) + es_neg <- pmin(min_before, tail_val) + + es <- ifelse(abs(es_pos) >= abs(es_neg), es_pos, es_neg) + + data.frame(es = es, es_pos = es_pos, es_neg = es_neg, row.names = NULL) + } + + +#' Compute genewise ES (in batches) and permutation p-values from a propd object +#' +#' This function extends \code{\link{propdGenewise}} by adding permutation-based +#' p-values via a batched GSEA strategy. Because all gene-pair sets overlap +#' (every pair is shared by two genes), running fgsea naively would violate +#' the independence assumption of the permutation test. The solution is to +#' process genes in batches: within each batch, each focal gene is assigned a +#' disjoint random subset of partner genes, so the resulting pair sets are +#' independent by construction. fgsea is then run on this reduced, independent +#' score vector, yielding valid p-values. +#' +#' To reduce variance from the random partner assignment, the procedure is +#' repeated \code{n_iter} times with different seeds. The final ES is the mean +#' across iterations and the final p-value is the median (robust to outlier +#' iterations). Global BH correction is applied to the median p-values. +#' +#' @param propd A \code{\link{propd}} object. +#' @param partner_fraction Numeric in (0, 1). Fraction of all other genes to +#' use as partners for each focal gene per batch. Default 0.017 (~1.7\%), +#' giving ~300 partners at G=18080. A minimum of 100 partners is enforced. +#' @param n_iter Integer. Number of independent iterations to run and aggregate. +#' Default 5. Higher values give more stable ES and p-values at the cost of +#' proportionally more compute time. +#' @param nperm Integer. Number of permutations per fgseaSimple call. Default 1000. +#' @param seed Integer. Base random seed. Each iteration uses seed + i for +#' reproducibility. Default 42. +#' @param scoreType Character. Passed to fgseaSimple. "pos" (default) tests +#' enrichment at the low-theta end only. "std" tests both ends. +#' @return A data frame with one row per gene and columns: +#' - "id": gene identifier +#' - "ES_batch": median enrichment score across iterations (subsampled pairs) +#' - "pval": median p-value across iterations +#' - "padj": BH-adjusted median p-value (global, across all genes) +#' +#' @importFrom fgsea fgseaSimple +#' @rdname propdGenewisePval +#' @export +.compute_fgsea_padj_batches <- function(propd, + partner_fraction = 0.017, + n_iter = 5L, + nperm = 1000L, + seed = 42L, + scoreType = "pos") { + + if (!requireNamespace("fgsea", quietly = TRUE)) { + stop("Package 'fgsea' is required. Install with: BiocManager::install('fgsea')") + } + if (partner_fraction <= 0 || partner_fraction >= 1) { + stop("partner_fraction must be in (0, 1).") + } + n_iter <- as.integer(n_iter) + if (n_iter < 1L) stop("n_iter must be >= 1.") + + features <- colnames(propd@counts) + G <- length(features) + results <- propd@results + + # ---- Derive max_partners ---- + max_partners <- max(100L, floor((G - 1L) * partner_fraction)) + max_partners <- as.integer(min(max_partners, G - 1L)) + + batch_size <- floor(G / (1L + max_partners)) + if (batch_size < 1L) stop("Derived batch size < 1. Decrease partner_fraction.") + n_batches <- ceiling(G / batch_size) + + message(sprintf( + "G=%d | partner_fraction=%.3f | max_partners=%d (%.1f%%) | batch_size=%d | n_batches=%d | n_iter=%d", + G, partner_fraction, max_partners, + max_partners / (G - 1) * 100, + batch_size, n_batches, n_iter + )) + + # ---- Global score vector: -theta so high score = low theta = most DP ---- + pair_names <- paste(results$Partner, results$Pair, sep = ",") + global_scores <- setNames(-results$theta, pair_names) + + + # ---- Run n_iter independent iterations ---- + # Each iteration returns a G x 2 matrix (ES_batch, pval) per gene + iter_es <- matrix(NA_real_, nrow = G, ncol = n_iter) + iter_pval <- matrix(NA_real_, nrow = G, ncol = n_iter) + + for (iter in seq_len(n_iter)) { + message(sprintf("Iteration %d / %d", iter, n_iter)) + iter_result <- .run_batches_once( + results = results, + G = G, + global_scores = global_scores, + max_partners = max_partners, + batch_size = batch_size, + n_batches = n_batches, + nperm = nperm, + seed = seed + iter - 1L, + scoreType = scoreType + ) + iter_es[, iter] <- iter_result$ES_batch + iter_pval[, iter] <- iter_result$pval + } + + # ---- Aggregate across iterations ---- + # ES: median (stable summary of enrichment magnitude) + # pval: median (robust to unlucky subsample in one iteration) + es_median <- apply(iter_es, 1, median, na.rm = TRUE) + pval_median <- apply(iter_pval, 1, median, na.rm = TRUE) + + # Global BH correction on median p-values + padj <- p.adjust(pval_median, method = "BH") + + # ---- Assemble output ---- + data.frame( + id = features, + ES_batch = es_median, + pval = pval_median, + padj = padj, + stringsAsFactors = FALSE, + row.names = NULL + ) +} + + +#' Run one iteration of the batched fgsea procedure +#' +#' Internal helper called by \code{propdGenewisePval}. Processes all genes +#' in batches, assigning disjoint partner subsets per focal gene, running +#' fgseaSimple per batch, and returning a per-gene data frame of ES and pval. +#' +#' @keywords internal +.run_batches_once <- function(results, G, global_scores, max_partners, + batch_size, n_batches, nperm, seed, scoreType) { + + set.seed(seed) + all_gene_ids <- seq_len(G) + batch_starts <- seq(1L, G, by = batch_size) + batch_results <- vector("list", length(batch_starts)) + total_dropped <- 0L + + for (b in seq_along(batch_starts)) { + batch_start <- batch_starts[b] + batch_end <- min(batch_start + batch_size - 1L, G) + focal_ids <- all_gene_ids[batch_start:batch_end] + n_focal <- length(focal_ids) + + available <- sample(setdiff(all_gene_ids, focal_ids)) + effective_partners <- min(max_partners, floor(length(available) / n_focal)) + + if (effective_partners < 1L) { + warning(sprintf("Batch %d: not enough available partners, skipping.", b)) + next + } + + pathways <- vector("list", n_focal) + names(pathways) <- paste0("gene_", focal_ids) + + for (i in seq_len(n_focal)) { + partner_ids <- available[((i - 1L) * effective_partners + 1L): + (i * effective_partners)] + higher <- pmax(focal_ids[i], partner_ids) + lower <- pmin(focal_ids[i], partner_ids) + pathways[[i]] <- paste(higher, lower, sep = ",") + } + + batch_pair_names <- unique(unlist(pathways)) + batch_scores <- global_scores[batch_pair_names] + + if (anyNA(batch_scores)) { + warning(sprintf("Batch %d: %d pairs not found in results, dropping.", + b, sum(is.na(batch_scores)))) + batch_scores <- batch_scores[!is.na(batch_scores)] + pathways <- lapply(pathways, function(p) p[p %in% names(batch_scores)]) + } + + batch_scores <- sort(batch_scores, decreasing = TRUE) + + fgsea_res <- fgsea::fgseaSimple( + pathways = pathways, + stats = batch_scores, + scoreType = scoreType, + nperm = nperm + ) + + fgsea_res$gene_id <- as.integer(sub("^gene_", "", fgsea_res$pathway)) + + # Fill genes silently dropped by fgsea with ES=0, pval=1 + returned_pathways <- paste0("gene_", fgsea_res$gene_id) + missing_pathways <- setdiff(names(pathways), returned_pathways) + if (length(missing_pathways) > 0L) { + total_dropped <- total_dropped + length(missing_pathways) + missing_ids <- as.integer(sub("^gene_", "", missing_pathways)) + fgsea_res <- rbind( + data.frame(gene_id = fgsea_res$gene_id, + ES_batch = fgsea_res$ES, + pval = fgsea_res$pval, + stringsAsFactors = FALSE), + data.frame(gene_id = missing_ids, + ES_batch = 0, + pval = 1, + stringsAsFactors = FALSE) + ) + } else { + fgsea_res <- data.frame( + gene_id = fgsea_res$gene_id, + ES_batch = fgsea_res$ES, + pval = fgsea_res$pval, + stringsAsFactors = FALSE + ) + } + + batch_results[[b]] <- fgsea_res + } + + if (total_dropped > 0L) { + warning(sprintf( + "%d / %d genes (%.1f%%) dropped by fgseaSimple and filled with ES=0, pval=1. ", + total_dropped, G, total_dropped / G * 100, + "Consider increasing partner_fraction or nperm." + )) + } + + # Combine and sort by gene_id so rows align with features order + combined <- do.call(rbind, batch_results) + combined[order(combined$gene_id), ] +} + From 18739a12ca5308c5415b725c41f4367b5ffef227 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Thu, 2 Apr 2026 12:00:31 +0200 Subject: [PATCH 02/13] add propdGenewise tests --- R/2-propd-genewise.R | 17 ++- tests/testthat/test-PROPD-genewise.R | 160 +++++++++++++++++---------- 2 files changed, 115 insertions(+), 62 deletions(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 869bbce4..07d83c60 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -57,6 +57,18 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, features <- colnames(propd@counts) nfeatures <- length(features) + # warn for small datasets with only few genes. + if (nfeatures < 50) { + warning(sprintf( + paste0("propdGenewise: only %d genes detected. \n", + "Permutation p-values from fgsea are unreliable at this scale. \n" , + "Results are for testing purposes only. \n", + "Using connnectivity is prefered. " + ), + nfeatures + )) + } + ## ---- Build matrices needed for connectivity metrics ---- fdr_mat <- results_to_matrix(propd@results, what = "FDR", features = features) theta_mat <- results_to_matrix(propd@results, what = "theta", features = features) @@ -438,9 +450,8 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, if (total_dropped > 0L) { warning(sprintf( - "%d / %d genes (%.1f%%) dropped by fgseaSimple and filled with ES=0, pval=1. ", - total_dropped, G, total_dropped / G * 100, - "Consider increasing partner_fraction or nperm." + "%d / %d genes (%.1f%%) dropped by fgseaSimple and filled with ES=0, pval=1. Consider increasing partner_fraction or nperm.", + total_dropped, G, total_dropped / G * 100 )) } diff --git a/tests/testthat/test-PROPD-genewise.R b/tests/testthat/test-PROPD-genewise.R index 1b599731..c397a7d9 100644 --- a/tests/testthat/test-PROPD-genewise.R +++ b/tests/testthat/test-PROPD-genewise.R @@ -1,91 +1,86 @@ library(testthat) library(propr) -# data +# ---- shared test data ---- keep <- iris$Species %in% c("setosa", "versicolor") counts <- iris[keep, 1:4] * 10 -group <- ifelse(iris[keep, "Species"] == "setosa", "A", "B") +group <- ifelse(iris[keep, "Species"] == "setosa", "A", "B") -# calculate propd and F-statistics (adds FDR column) -pd <- propd(counts, group) -pd <- updateF(pd) +pd <- propd(counts, group) +pd_with_f <- updateF(pd) + +# Run genewise once; reuse across tests to keep the suite fast. +res <- propdGenewise(pd_with_f) + +# ---- error conditions ---- test_that("propdGenewise errors without FDR column", { - pd_no_fdr <- propd(counts, group) - expect_error( - propdGenewise(pd_no_fdr), - "Please run updateF" - ) + expect_error(propdGenewise(pd), "Please run updateF") +}) + +test_that("propdGenewise errors for alpha != NA", { + pd_alpha <- propd(counts, group, alpha = 0.5) + pd_alpha <- updateF(pd_alpha) + expect_error(propdGenewise(pd_alpha), "only works for alpha = NA") +}) + +test_that("propdGenewise errors for more than 2 groups", { + pd3 <- propd(iris[, 1:4], as.character(iris[, 5])) + pd3 <- updateF(pd3) + expect_error(propdGenewise(pd3), "only works for 2 groups") }) -test_that("propdGenewise returns correct structure", { - res <- propdGenewise(pd) +# ---- output structure ---- +test_that("propdGenewise returns a data frame with correct columns", { expect_s3_class(res, "data.frame") - expect_equal(colnames(res), c("id", "lfc", "lrmD", "connectivity", "wconnectivity", "FDR_mean")) + expect_equal(colnames(res), + c("id", "lfc", "lrmD", "connectivity", "ES", "padj", "ES_batch")) +}) + +test_that("propdGenewise has one row per gene", { expect_equal(nrow(res), ncol(counts)) expect_equal(res$id, colnames(counts)) }) -test_that("connectivity counts match manual calculation", { +# ---- connectivity ---- + +test_that("connectivity matches manual calculation", { fdr_cutoff <- 0.05 - res <- propdGenewise(pd, pairwise_fdr = fdr_cutoff) + res_c <- propdGenewise(pd_with_f, pairwise_fdr = fdr_cutoff) features <- colnames(counts) - # manually compute connectivity from the results table - sig <- pd@results[pd@results$FDR > 0 & pd@results$FDR < fdr_cutoff, ] + sig <- pd_with_f@results[ + pd_with_f@results$FDR > 0 & pd_with_f@results$FDR < fdr_cutoff, ] manual_conn <- setNames(rep(0L, length(features)), features) for (i in seq_len(nrow(sig))) { - manual_conn[sig$Pair[i]] <- manual_conn[sig$Pair[i]] + 1L + manual_conn[sig$Pair[i]] <- manual_conn[sig$Pair[i]] + 1L manual_conn[sig$Partner[i]] <- manual_conn[sig$Partner[i]] + 1L } - expect_equal(res$connectivity, as.integer(manual_conn[features])) + expect_equal(res_c$connectivity, as.integer(manual_conn[features])) }) -test_that("weighted connectivity matches manual calculation", { - fdr_cutoff <- 0.05 - res <- propdGenewise(pd, pairwise_fdr = fdr_cutoff) - features <- colnames(counts) - - # manually compute weighted connectivity - sig <- pd@results[pd@results$FDR > 0 & pd@results$FDR < fdr_cutoff, ] - manual_wconn <- setNames(rep(0, length(features)), features) - for (i in seq_len(nrow(sig))) { - w <- 1 / sig$theta[i] - manual_wconn[sig$Pair[i]] <- manual_wconn[sig$Pair[i]] + w - manual_wconn[sig$Partner[i]] <- manual_wconn[sig$Partner[i]] + w - } - - expect_equal(res$wconnectivity, as.numeric(manual_wconn[features])) +test_that("connectivity values are non-negative integers", { + expect_true(all(res$connectivity >= 0)) + expect_equal(res$connectivity, as.integer(res$connectivity)) }) -test_that("strict pairwise_fdr yields zero or fewer connections", { - res_loose <- propdGenewise(pd, pairwise_fdr = 0.5) - res_strict <- propdGenewise(pd, pairwise_fdr = 0.001) - +test_that("stricter pairwise_fdr yields fewer or equal connections", { + res_loose <- propdGenewise(pd_with_f, pairwise_fdr = 0.5) + res_strict <- propdGenewise(pd_with_f, pairwise_fdr = 0.001) expect_true(all(res_strict$connectivity <= res_loose$connectivity)) }) -test_that("pairwise_fdr of 0 gives zero connectivity for all genes", { - res <- propdGenewise(pd, pairwise_fdr = 0) - expect_true(all(res$connectivity == 0)) - expect_true(all(res$wconnectivity == 0)) +test_that("pairwise_fdr = 0 gives zero connectivity for all genes", { + res_zero <- propdGenewise(pd_with_f, pairwise_fdr = 0) + expect_true(all(res_zero$connectivity == 0)) }) -test_that("connectivity values are non-negative integers", { - res <- propdGenewise(pd) - expect_true(all(res$connectivity >= 0)) - expect_equal(res$connectivity, as.integer(res$connectivity)) -}) - -test_that("wconnectivity values are non-negative", { - res <- propdGenewise(pd) - expect_true(all(res$wconnectivity >= 0)) -}) +# ---- lfc ---- test_that("lfc is computed correctly via CLR", { - res <- propdGenewise(pd) + res <- propdGenewise(pd_with_f) # manually compute CLR-based LFC ct <- as.matrix(counts) @@ -94,20 +89,67 @@ test_that("lfc is computed correctly via CLR", { g1 <- group == "A" g2 <- group == "B" - clr1 <- propr:::logratio(ct[g1,], 'clr', NA) - clr2 <- propr:::logratio(ct[g2,], 'clr', NA) + clr1 <- propr:::logratio(ct[g1,], 'clr', NA) + clr2 <- propr:::logratio(ct[g2,], 'clr', NA) manual_lfc <- (colMeans(clr1 - clr2)) / log(2) expect_equal(res$lfc, as.numeric(manual_lfc)) }) +# ---- lrmD ---- + test_that("lrmD is NA when gene has no significant connections", { # use very strict cutoff so no pairs are significant - res <- propdGenewise(pd, pairwise_fdr = 0) + res <- propdGenewise(pd_with_f, pairwise_fdr = 0) expect_true(all(is.na(res$lrmD))) }) -test_that("FDR_mean is between 0 and 1", { - res <- propdGenewise(pd) - expect_true(all(res$FDR_mean >= 0 & res$FDR_mean <= 1)) +# ---- ES / padj ---- + +test_that("ES values are finite", { + expect_true(all(is.finite(res$ES))) +}) + +test_that("padj values are between 0 and 1", { + expect_true(all(res$padj >= 0 & res$padj <= 1, na.rm = TRUE)) +}) + +test_that("ES_batch values are finite", { + expect_true(all(is.finite(res$ES_batch))) +}) + +test_that("propdGenewise is reproducible with same inputs", { + res1 <- propdGenewise(pd_with_f) + res2 <- propdGenewise(pd_with_f) + expect_equal(res1$padj, res2$padj) + expect_equal(res1$ES_batch, res2$ES_batch) +}) + +test_that("ES values are non-negative when scoreType is pos", { + expect_true(all(res$ES >= 0)) +}) + +# ---- .compute_es internal function ---- + +test_that(".compute_es returns expected structure", { + es <- propr:::.compute_es(pd_with_f@results, colnames(counts)) + + expect_s3_class(es, "data.frame") + expect_equal(colnames(es), c("es", "es_pos", "es_neg")) + expect_equal(nrow(es), ncol(counts)) + expect_true(all(is.finite(es$es))) + expect_true(all(is.finite(es$es_pos))) + expect_true(all(is.finite(es$es_neg))) +}) + +test_that(".compute_es: es_pos is non-negative and es_neg is non-positive", { + es <- propr:::.compute_es(pd_with_f@results, colnames(counts)) + expect_true(all(es$es_pos >= 0)) + expect_true(all(es$es_neg <= 0)) +}) + +test_that(".compute_es: es is the larger-magnitude of es_pos and es_neg", { + es <- propr:::.compute_es(pd_with_f@results, colnames(counts)) + expected_es <- ifelse(abs(es$es_pos) >= abs(es$es_neg), es$es_pos, es$es_neg) + expect_equal(es$es, expected_es) }) From b56501e7d572b0f4303ff93eb5dac38928254d0e Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Tue, 7 Apr 2026 18:25:33 +0200 Subject: [PATCH 03/13] docs: run devtools::document() to update man/ and NAMESPACE --- DESCRIPTION | 2 +- NAMESPACE | 2 + R/2-propd-genewise.R | 2 +- man/dot-compute_es.Rd | 34 ++++++++++++++++ man/dot-run_batches_once.Rd | 24 ++++++++++++ man/propdGenewise.Rd | 78 ++++++++++++++++++++++++++++++------- 6 files changed, 125 insertions(+), 17 deletions(-) create mode 100644 man/dot-compute_es.Rd create mode 100644 man/dot-run_batches_once.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 77c881c2..d14504f6 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,7 +25,7 @@ Description: The bioinformatic evaluation of gene co-expression often begins wit License: GPL-2 LazyData: true VignetteBuilder: knitr -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Encoding: UTF-8 Depends: methods, diff --git a/NAMESPACE b/NAMESPACE index 898ffdf5..60f4f182 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(.compute_fgsea_padj_batches) export(aldex2propr) export(calculate_theta) export(compute_iqlr) @@ -43,6 +44,7 @@ exportClasses(propd) exportClasses(propr) exportMethods(show) importFrom(Rcpp,sourceCpp) +importFrom(fgsea,fgseaSimple) importFrom(methods,new) importFrom(methods,show) useDynLib(propr, .registration = TRUE) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 07d83c60..21c42bbe 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -273,7 +273,7 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, #' - "padj": BH-adjusted median p-value (global, across all genes) #' #' @importFrom fgsea fgseaSimple -#' @rdname propdGenewisePval +#' @rdname propdGenewise #' @export .compute_fgsea_padj_batches <- function(propd, partner_fraction = 0.017, diff --git a/man/dot-compute_es.Rd b/man/dot-compute_es.Rd new file mode 100644 index 00000000..70d58b4b --- /dev/null +++ b/man/dot-compute_es.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/2-propd-genewise.R +\name{.compute_es} +\alias{.compute_es} +\title{Compute GSEA-inspired enrichment scores (vectorized, integer-native)} +\usage{ +.compute_es(results, features) +} +\arguments{ +\item{results}{A data frame from propd@results with columns Partner (int), +Pair (int), theta.} + +\item{features}{Character vector of gene names (length G).} +} +\value{ +A data frame with columns es, es_pos, es_neg (one row per gene, + same order as features). +} +\description{ +Pairs are ranked by theta ascending (most differentially proportional first). +For each gene g, its "set" is the G-1 pairs it participates in. The running +sum increments by 1/(G-1) on a hit and decrements by 1/(N-(G-1)) on a miss. +} +\details{ +Key insight: the running sum at hit j (occurring at rank r_j) has a closed form: + rs(j) = j * (hit_inc + miss_inc) - r_j * miss_inc +The minimum always occurs just BEFORE a hit (rs(j) - hit_inc, which also +correctly handles the trough before the first hit) or in the tail after the +last hit. This gives a fully vectorized implementation via tapply. + +Partner and Pair columns in results are assumed to be 1-based integer indices +into features (as produced by propd), so no string matching is needed. +} +\keyword{internal} diff --git a/man/dot-run_batches_once.Rd b/man/dot-run_batches_once.Rd new file mode 100644 index 00000000..48fa98c3 --- /dev/null +++ b/man/dot-run_batches_once.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/2-propd-genewise.R +\name{.run_batches_once} +\alias{.run_batches_once} +\title{Run one iteration of the batched fgsea procedure} +\usage{ +.run_batches_once( + results, + G, + global_scores, + max_partners, + batch_size, + n_batches, + nperm, + seed, + scoreType +) +} +\description{ +Internal helper called by \code{propdGenewisePval}. Processes all genes +in batches, assigning disjoint partner subsets per focal gene, running +fgseaSimple per batch, and returning a per-gene data frame of ES and pval. +} +\keyword{internal} diff --git a/man/propdGenewise.Rd b/man/propdGenewise.Rd index f6a71c41..df11f887 100644 --- a/man/propdGenewise.Rd +++ b/man/propdGenewise.Rd @@ -2,36 +2,84 @@ % Please edit documentation in R/2-propd-genewise.R \name{propdGenewise} \alias{propdGenewise} +\alias{.compute_fgsea_padj_batches} \title{Convert pairwise propd results into genewise} \usage{ -propdGenewise(propd, pairwise_fdr = 0.05) +propdGenewise( + propd, + pairwise_fdr = 0.05, + partner_fraction = 0.017, + n_iter = 5L, + ... +) + +.compute_fgsea_padj_batches( + propd, + partner_fraction = 0.017, + n_iter = 5L, + nperm = 1000L, + seed = 42L, + scoreType = "pos" +) } \arguments{ -\item{propd}{A \code{\link{propd}} object, with FDR values from updateF. Note: -for the moment, only theta results with F-stats are supported. Later we will look -on the option to get FDR values based on permutations.} +\item{propd}{A \code{\link{propd}} object.} -\item{pairwise_fdr}{FDR threashold to consider a pairwise relationship as significant. +\item{pairwise_fdr}{FDR threshold to consider a pairwise relationship as significant. Default is 0.05.} + +\item{partner_fraction}{Numeric in (0, 1). Fraction of all other genes to +use as partners for each focal gene per batch. Default 0.017 (~1.7\%), +giving ~300 partners at G=18080. A minimum of 100 partners is enforced.} + +\item{n_iter}{Integer. Number of independent iterations to run and aggregate. +Default 5. Higher values give more stable ES and p-values at the cost of +proportionally more compute time.} + +\item{...}{Additional arguments passed to \code{.compute_fgsea_padj_batches}, +such as \code{nperm}, \code{seed}, or \code{scoreType}.} + +\item{nperm}{Integer. Number of permutations per fgseaSimple call. Default 1000.} + +\item{seed}{Integer. Base random seed. Each iteration uses seed + i for +reproducibility. Default 42.} + +\item{scoreType}{Character. Passed to fgseaSimple. "pos" (default) tests +enrichment at the low-theta end only. "std" tests both ends.} } \value{ A data frame with genewise results that can be used to identify differentially expressed genes. It contains the following columns: - "id": gene identifier - "lfc": Log Fold Change of the gene, using the geometric mean of all genes as reference. - - "lrmD": Log Ratio Mean Difference of the gene. Equivalent to the LFC, but using a subset + - "lrmD": Log Ratio Mean Difference of the gene. Equivalent to the LFC, but using a subset of genes as reference (only the ones that are significantly connected to the gene). - "connectivity": number of significant pairwise relationships the gene has. - - "wconnectivity": weighted connectivity, which sums the strength of significant pairwise - relationships (defined as 1/theta) for the gene. -- "FDR_mean": average FDR value across all pairwise comparisons for the gene, which can be - used as a genewise significance statistic. Note that this is a simple average and may - not be the most robust method for determining genewise significance, but it provides a - starting point for identifying genes of interest based on their pairwise relationships. - Future updates may include more sophisticated methods for calculating genewise p-values - or FDR values based on the pairwise results. + - "ES": Enrichment Score of the gene. + - "padj": adjusted p-value of the gene, based on the Enrichment Score. + +A data frame with one row per gene and columns: + - "id": gene identifier + - "ES_batch": median enrichment score across iterations (subsampled pairs) + - "pval": median p-value across iterations + - "padj": BH-adjusted median p-value (global, across all genes) } \description{ This function converts pairwise propd results into genewise results. The resulting -genewise results are direct indirectors of genes being differentially expressed. +genewise results are direct indicators of genes being differentially expressed. + +This function extends \code{\link{propdGenewise}} by adding permutation-based +p-values via a batched GSEA strategy. Because all gene-pair sets overlap +(every pair is shared by two genes), running fgsea naively would violate +the independence assumption of the permutation test. The solution is to +process genes in batches: within each batch, each focal gene is assigned a +disjoint random subset of partner genes, so the resulting pair sets are +independent by construction. fgsea is then run on this reduced, independent +score vector, yielding valid p-values. +} +\details{ +To reduce variance from the random partner assignment, the procedure is +repeated \code{n_iter} times with different seeds. The final ES is the mean +across iterations and the final p-value is the median (robust to outlier +iterations). Global BH correction is applied to the median p-values. } From 4d678bad99fde182a6430998ee2f8db4b449b404 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Tue, 7 Apr 2026 18:32:13 +0200 Subject: [PATCH 04/13] solve @keywords internal --- R/2-propd-genewise.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 21c42bbe..9e3b250d 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -274,7 +274,7 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, #' #' @importFrom fgsea fgseaSimple #' @rdname propdGenewise -#' @export +#' @keywords internal .compute_fgsea_padj_batches <- function(propd, partner_fraction = 0.017, n_iter = 5L, From 68d9980d38218fa1ea507d6d611c232dd5a13039 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Tue, 7 Apr 2026 18:36:15 +0200 Subject: [PATCH 05/13] update version in DESCRIPTION --- DESCRIPTION | 5 +++-- NEWS.md | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d14504f6..b1b8ea5f 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: propr Title: An R package to calculate proportionality and other measures for compositional data -Version: 5.1.8 +Version: 5.1.9 URL: https://github.com/tpq/propr BugReports: https://github.com/tpq/propr/issues Authors@R: c( @@ -43,7 +43,8 @@ Suggests: parallel, rmarkdown, testthat (>= 3.0.0), - vegan + vegan, + fgsea LinkingTo: Rcpp Config/testthat/edition: 3 diff --git a/NEWS.md b/NEWS.md index 48114a30..245efff6 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# propr 5.1.9 +* Added `propdGenewise()` to convert pairwise propd results into genewise + results, including CLR-based log fold change, connectivity, and GSEA-inspired + enrichment scores with permutation-based p-values via `fgsea`. + # propr 5.1.8 --------------------- * Use feature-wise permutation for propr() From c6312d2643c40d9b31a94b9e851c0f34915114f8 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Tue, 7 Apr 2026 18:54:26 +0200 Subject: [PATCH 06/13] solve duplicated default parameters --- R/2-propd-genewise.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 9e3b250d..0e63e592 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -81,9 +81,8 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, es_scores <- .compute_es(propd@results, features) ## ---- Estimate significance of ES via permutation ---- - fgsea_batches <- .compute_fgsea_padj_batches(propd, partner_fraction = 0.017, - n_iter = 5L, nperm = 1000L, - seed = 42L, scoreType = "pos") + fgsea_batches <- .compute_fgsea_padj_batches(propd, partner_fraction = partner_fraction, + n_iter = n_iter, ...) ## ---- Build lrm matrices ---- @@ -276,8 +275,8 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, #' @rdname propdGenewise #' @keywords internal .compute_fgsea_padj_batches <- function(propd, - partner_fraction = 0.017, - n_iter = 5L, + partner_fraction, + n_iter, nperm = 1000L, seed = 42L, scoreType = "pos") { From 5da3a77db44b7cdf2e7ad943c1698e9a793ae3a4 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Tue, 7 Apr 2026 18:59:25 +0200 Subject: [PATCH 07/13] add check Partner > Pair --- R/2-propd-genewise.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 0e63e592..01696922 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -284,6 +284,14 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, if (!requireNamespace("fgsea", quietly = TRUE)) { stop("Package 'fgsea' is required. Install with: BiocManager::install('fgsea')") } + + # Validate assumption that Partner > Pair on a random sample + sample_idx <- sample(nrow(results), min(1000L, nrow(results))) + if (any(results$Partner[sample_idx] <= results$Pair[sample_idx])) { + stop("results$Partner must always be greater than results$Pair. ", + "This is expected propd convention but appears to be violated.") + } + if (partner_fraction <= 0 || partner_fraction >= 1) { stop("partner_fraction must be in (0, 1).") } From a51e704d734eb4f3dc56ca69d92196bf6d8b5193 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Tue, 7 Apr 2026 19:10:52 +0200 Subject: [PATCH 08/13] remove deprecated fgseaSimple function --- NAMESPACE | 3 +-- R/2-propd-genewise.R | 13 +++++++------ man/propdGenewise.Rd | 6 ++++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 60f4f182..693eb6d7 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,5 @@ # Generated by roxygen2: do not edit by hand -export(.compute_fgsea_padj_batches) export(aldex2propr) export(calculate_theta) export(compute_iqlr) @@ -44,7 +43,7 @@ exportClasses(propd) exportClasses(propr) exportMethods(show) importFrom(Rcpp,sourceCpp) -importFrom(fgsea,fgseaSimple) +importFrom(fgsea,fgsea) importFrom(methods,new) importFrom(methods,show) useDynLib(propr, .registration = TRUE) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 01696922..bb24e76b 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -23,6 +23,7 @@ #' of genes as reference (only the ones that are significantly connected to the gene). #' - "connectivity": number of significant pairwise relationships the gene has. #' - "ES": Enrichment Score of the gene. +#' - "ES_batch": median Enrichment Score across batch iterations #' - "padj": adjusted p-value of the gene, based on the Enrichment Score. #' #' @rdname propdGenewise @@ -271,7 +272,7 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, #' - "pval": median p-value across iterations #' - "padj": BH-adjusted median p-value (global, across all genes) #' -#' @importFrom fgsea fgseaSimple +#' @importFrom fgsea fgsea #' @rdname propdGenewise #' @keywords internal .compute_fgsea_padj_batches <- function(propd, @@ -418,11 +419,11 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, batch_scores <- sort(batch_scores, decreasing = TRUE) - fgsea_res <- fgsea::fgseaSimple( - pathways = pathways, - stats = batch_scores, - scoreType = scoreType, - nperm = nperm + fgsea_res <- fgsea::fgsea( + pathways = pathways, + stats = batch_scores, + scoreType = scoreType, + nPermSimple = nperm ) fgsea_res$gene_id <- as.integer(sub("^gene_", "", fgsea_res$pathway)) diff --git a/man/propdGenewise.Rd b/man/propdGenewise.Rd index df11f887..ebda4ca6 100644 --- a/man/propdGenewise.Rd +++ b/man/propdGenewise.Rd @@ -15,8 +15,8 @@ propdGenewise( .compute_fgsea_padj_batches( propd, - partner_fraction = 0.017, - n_iter = 5L, + partner_fraction, + n_iter, nperm = 1000L, seed = 42L, scoreType = "pos" @@ -56,6 +56,7 @@ expressed genes. It contains the following columns: of genes as reference (only the ones that are significantly connected to the gene). - "connectivity": number of significant pairwise relationships the gene has. - "ES": Enrichment Score of the gene. + - "ES_batch": median Enrichment Score across batch iterations - "padj": adjusted p-value of the gene, based on the Enrichment Score. A data frame with one row per gene and columns: @@ -83,3 +84,4 @@ repeated \code{n_iter} times with different seeds. The final ES is the mean across iterations and the final p-value is the median (robust to outlier iterations). Global BH correction is applied to the median p-values. } +\keyword{internal} From 8cd0ebadad9fe2c3a6c7720d9e42c4786fbea16f Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Wed, 8 Apr 2026 08:54:38 +0200 Subject: [PATCH 09/13] remove n_batches as argument --- R/2-propd-genewise.R | 5 ++--- man/dot-run_batches_once.Rd | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index bb24e76b..7b74b689 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -336,7 +336,6 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, global_scores = global_scores, max_partners = max_partners, batch_size = batch_size, - n_batches = n_batches, nperm = nperm, seed = seed + iter - 1L, scoreType = scoreType @@ -374,10 +373,10 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, #' #' @keywords internal .run_batches_once <- function(results, G, global_scores, max_partners, - batch_size, n_batches, nperm, seed, scoreType) { + batch_size, nperm, seed, scoreType) { set.seed(seed) - all_gene_ids <- seq_len(G) + all_gene_ids <- sample(seq_len(G)) # Randomize the order batch_starts <- seq(1L, G, by = batch_size) batch_results <- vector("list", length(batch_starts)) total_dropped <- 0L diff --git a/man/dot-run_batches_once.Rd b/man/dot-run_batches_once.Rd index 48fa98c3..00514f0a 100644 --- a/man/dot-run_batches_once.Rd +++ b/man/dot-run_batches_once.Rd @@ -10,7 +10,6 @@ global_scores, max_partners, batch_size, - n_batches, nperm, seed, scoreType From b0d69ef582827c7019dcd71466a6f9f00b23b81b Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Wed, 8 Apr 2026 09:08:24 +0200 Subject: [PATCH 10/13] explain hardcoded thresholds --- R/2-propd-genewise.R | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 7b74b689..4759e918 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -64,7 +64,7 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, paste0("propdGenewise: only %d genes detected. \n", "Permutation p-values from fgsea are unreliable at this scale. \n" , "Results are for testing purposes only. \n", - "Using connnectivity is prefered. " + "Using connectivity is preferred. " ), nfeatures )) @@ -119,9 +119,23 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, ) ## ---- Warn if full and batch ES are poorly concordant (overall) ---- + + # Diagnostic thresholds for ES concordance warnings. + # Not exposed as parameters since these are internal quality checks, + # not analysis tuning parameters. + # These thresholds are heuristic and not formally benchmarked. + # 0.7 is a commonly used minimum for acceptable Spearman correlation. + # 0.5 relative difference flags cases where batch ES deviates substantially from full ES. + # Both values may be revisited with further benchmark in the future. + + es_cor_threshold <- 0.7 # minimum acceptable Spearman rho between full and batch ES + es_diff_threshold <- 0.5 # maximum acceptable relative difference for significant genes + padj_threshold <- 0.05 + es_cor <- cor(es_scores$es_pos, fgsea_batches$ES_batch, method = "spearman", use = "complete.obs") - if (is.na(es_cor) || es_cor < 0.7) { + + if (is.na(es_cor) || es_cor < es_cor_threshold) { warning(sprintf( paste0( "Low concordance between full ES and batch ES (Spearman rho = %.2f). ", @@ -136,8 +150,7 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, message(sprintf("ES concordance (Spearman rho = %.2f): OK.", es_cor)) } - es_diff_threshold <- 0.5 - padj_threshold <- 0.05 + sig_mask <- result$padj < padj_threshold & !is.na(result$padj) relative_diff <- abs(result$ES - result$ES_batch) / (abs(result$ES) + 1e-9) discordant_sig <- sig_mask & (relative_diff > es_diff_threshold) From d4ff34b5747c8a7c3e25188664892d79c5393a59 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Wed, 8 Apr 2026 09:20:40 +0200 Subject: [PATCH 11/13] extend documentation for propdGenewise --- R/2-propd-genewise.R | 122 ++++++++++++++++++++++++++++++++----------- man/propdGenewise.Rd | 87 ++++++++++++++++++++++++------ 2 files changed, 162 insertions(+), 47 deletions(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 4759e918..d2339783 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -1,30 +1,87 @@ -#' Convert pairwise propd results into genewise +#' Convert pairwise propd results into genewise results #' -#' This function converts pairwise propd results into genewise results. The resulting -#' genewise results are direct indicators of genes being differentially expressed. +#' This function summarises pairwise differential proportionality results at +#' the gene level, producing metrics that can be used to rank and identify +#' differentially expressed genes. #' -#' @param propd A \code{\link{propd}} object, with FDR values from updateF. Note: -#' for the moment, only theta results with F-stats are supported. Later we will look -#' on the option to get FDR values based on permutations. -#' @param pairwise_fdr FDR threshold to consider a pairwise relationship as significant. -#' Default is 0.05. -#' @param partner_fraction Numeric in (0, 1). Fraction of genes to use as -#' partners per batch. Default 0.017. Increase for more stable ES at the -#' cost of compute time. -#' @param n_iter Integer. Number of independent iterations to aggregate. -#' Default 5. Higher values give more stable results. +#' @details +#' ## What this function computes +#' +#' **Connectivity** counts how many significant pairwise relationships each +#' gene has (controlled by \code{pairwise_fdr}). A highly connected gene is +#' one whose log-ratio with many other genes changes between groups. +#' +#' **LFC** is a CLR-based log fold change, computed by averaging log-ratios +#' across all other genes as reference (equivalent to using the geometric mean +#' as reference). This is the standard compositionally-aware fold change. +#' +#' **lrmD** is similar to LFC but uses only significantly connected partners +#' as reference, making it more robust when only a subset of genes are +#' differentially proportional. +#' +#' **ES (Enrichment Score)** is a GSEA-inspired score. Pairs are ranked by +#' theta ascending (most differentially proportional first). For each gene, +#' its "gene set" is all pairs it participates in. The ES measures how much +#' those pairs are enriched at the top of the ranking (i.e. among the most +#' differentially proportional pairs). A high ES means the gene is +#' systematically involved in differentially proportional relationships. +#' +#' ## Permutation p-values and the batch procedure +#' +#' Because gene-pair sets overlap (every pair belongs to two genes), +#' standard GSEA permutation tests are invalid. To solve this, genes are +#' processed in batches where each focal gene is assigned a disjoint random +#' subset of partners. This ensures independence within each batch, yielding +#' valid permutation p-values. +#' +#' The procedure is repeated \code{n_iter} times with different random seeds +#' and results are aggregated (median ES and median p-value across iterations) +#' to reduce variance from the random partner assignment. +#' +#' ## How to tune the parameters +#' +#' - \code{partner_fraction}: controls how many partners each gene gets per +#' batch (as a fraction of all genes). Higher values give more stable ES +#' at the cost of compute time. If you see the ES concordance warning, +#' try increasing this first. +#' - \code{n_iter}: more iterations give more stable p-values. Increase if +#' results vary between runs. Each extra iteration adds proportional +#' compute time. +#' - \code{nperm} (passed via \code{...}): number of permutations per fgsea +#' call. Increase for more precise p-values, especially for very small +#' p-values. Default is 1000. +#' - \code{seed} (passed via \code{...}): random seed for reproducibility. +#' Default is 42. +#' +#' @param propd A \code{\link{propd}} object with FDR values from +#' \code{\link{updateF}}. +#' @param pairwise_fdr FDR threshold to consider a pairwise relationship as +#' significant. Controls connectivity and lrmD. Default is 0.05. +#' @param partner_fraction Numeric in (0, 1). Fraction of all genes to use +#' as partners per focal gene per batch. Higher values give more stable ES +#' at the cost of compute time. Default 0.017 (~1.7\%, giving ~300 partners +#' at G=18000). Increase to 0.05-0.1 if you see concordance warnings. +#' @param n_iter Integer. Number of independent iterations to run and +#' aggregate. Higher values give more stable results. Default 5. #' @param ... Additional arguments passed to \code{.compute_fgsea_padj_batches}, -#' such as \code{nperm}, \code{seed}, or \code{scoreType}. -#' @return A data frame with genewise results that can be used to identify differentially -#' expressed genes. It contains the following columns: -#' - "id": gene identifier -#' - "lfc": Log Fold Change of the gene, using the geometric mean of all genes as reference. -#' - "lrmD": Log Ratio Mean Difference of the gene. Equivalent to the LFC, but using a subset -#' of genes as reference (only the ones that are significantly connected to the gene). -#' - "connectivity": number of significant pairwise relationships the gene has. -#' - "ES": Enrichment Score of the gene. -#' - "ES_batch": median Enrichment Score across batch iterations -#' - "padj": adjusted p-value of the gene, based on the Enrichment Score. +#' such as \code{nperm} (number of permutations, default 1000), +#' \code{seed} (random seed, default 42), or \code{scoreType} (default +#' \code{"pos"}, testing enrichment at the low-theta end only). +#' +#' @return A data frame with one row per gene and the following columns: +#' \describe{ +#' \item{id}{gene identifier} +#' \item{lfc}{CLR-based log fold change, using the geometric mean of all +#' genes as reference.} +#' \item{lrmD}{log fold change using only significantly connected partners +#' as reference. NA if the gene has no significant connections.} +#' \item{connectivity}{number of significant pairwise relationships.} +#' \item{ES}{GSEA-inspired enrichment score. Higher values indicate the +#' gene is systematically involved in differentially proportional pairs.} +#' \item{ES_batch}{median ES across batch iterations, used internally for +#' concordance checking.} +#' \item{padj}{BH-adjusted permutation p-value for the ES.} +#' } #' #' @rdname propdGenewise #' @export @@ -411,13 +468,16 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, pathways <- vector("list", n_focal) names(pathways) <- paste0("gene_", focal_ids) - for (i in seq_len(n_focal)) { - partner_ids <- available[((i - 1L) * effective_partners + 1L): - (i * effective_partners)] - higher <- pmax(focal_ids[i], partner_ids) - lower <- pmin(focal_ids[i], partner_ids) - pathways[[i]] <- paste(higher, lower, sep = ",") - } + # vectorized replacement for the for loop + partner_matrix <- matrix(available[seq_len(n_focal * effective_partners)], + nrow = effective_partners, ncol = n_focal) + + higher <- pmax(focal_ids[col(partner_matrix)], partner_matrix) + lower <- pmin(focal_ids[col(partner_matrix)], partner_matrix) + + pathways <- lapply(seq_len(n_focal), function(i) { + paste(higher[, i], lower[, i], sep = ",") + }) batch_pair_names <- unique(unlist(pathways)) batch_scores <- global_scores[batch_pair_names] diff --git a/man/propdGenewise.Rd b/man/propdGenewise.Rd index ebda4ca6..0532c795 100644 --- a/man/propdGenewise.Rd +++ b/man/propdGenewise.Rd @@ -3,7 +3,7 @@ \name{propdGenewise} \alias{propdGenewise} \alias{.compute_fgsea_padj_batches} -\title{Convert pairwise propd results into genewise} +\title{Convert pairwise propd results into genewise results} \usage{ propdGenewise( propd, @@ -25,8 +25,8 @@ propdGenewise( \arguments{ \item{propd}{A \code{\link{propd}} object.} -\item{pairwise_fdr}{FDR threshold to consider a pairwise relationship as significant. -Default is 0.05.} +\item{pairwise_fdr}{FDR threshold to consider a pairwise relationship as +significant. Controls connectivity and lrmD. Default is 0.05.} \item{partner_fraction}{Numeric in (0, 1). Fraction of all other genes to use as partners for each focal gene per batch. Default 0.017 (~1.7\%), @@ -37,7 +37,9 @@ Default 5. Higher values give more stable ES and p-values at the cost of proportionally more compute time.} \item{...}{Additional arguments passed to \code{.compute_fgsea_padj_batches}, -such as \code{nperm}, \code{seed}, or \code{scoreType}.} +such as \code{nperm} (number of permutations, default 1000), +\code{seed} (random seed, default 42), or \code{scoreType} (default +\code{"pos"}, testing enrichment at the low-theta end only).} \item{nperm}{Integer. Number of permutations per fgseaSimple call. Default 1000.} @@ -48,16 +50,20 @@ reproducibility. Default 42.} enrichment at the low-theta end only. "std" tests both ends.} } \value{ -A data frame with genewise results that can be used to identify differentially -expressed genes. It contains the following columns: - - "id": gene identifier - - "lfc": Log Fold Change of the gene, using the geometric mean of all genes as reference. - - "lrmD": Log Ratio Mean Difference of the gene. Equivalent to the LFC, but using a subset - of genes as reference (only the ones that are significantly connected to the gene). - - "connectivity": number of significant pairwise relationships the gene has. - - "ES": Enrichment Score of the gene. - - "ES_batch": median Enrichment Score across batch iterations - - "padj": adjusted p-value of the gene, based on the Enrichment Score. +A data frame with one row per gene and the following columns: + \describe{ + \item{id}{gene identifier} + \item{lfc}{CLR-based log fold change, using the geometric mean of all + genes as reference.} + \item{lrmD}{log fold change using only significantly connected partners + as reference. NA if the gene has no significant connections.} + \item{connectivity}{number of significant pairwise relationships.} + \item{ES}{GSEA-inspired enrichment score. Higher values indicate the + gene is systematically involved in differentially proportional pairs.} + \item{ES_batch}{median ES across batch iterations, used internally for + concordance checking.} + \item{padj}{BH-adjusted permutation p-value for the ES.} + } A data frame with one row per gene and columns: - "id": gene identifier @@ -66,8 +72,9 @@ A data frame with one row per gene and columns: - "padj": BH-adjusted median p-value (global, across all genes) } \description{ -This function converts pairwise propd results into genewise results. The resulting -genewise results are direct indicators of genes being differentially expressed. +This function summarises pairwise differential proportionality results at +the gene level, producing metrics that can be used to rank and identify +differentially expressed genes. This function extends \code{\link{propdGenewise}} by adding permutation-based p-values via a batched GSEA strategy. Because all gene-pair sets overlap @@ -79,6 +86,54 @@ independent by construction. fgsea is then run on this reduced, independent score vector, yielding valid p-values. } \details{ +## What this function computes + +**Connectivity** counts how many significant pairwise relationships each +gene has (controlled by \code{pairwise_fdr}). A highly connected gene is +one whose log-ratio with many other genes changes between groups. + +**LFC** is a CLR-based log fold change, computed by averaging log-ratios +across all other genes as reference (equivalent to using the geometric mean +as reference). This is the standard compositionally-aware fold change. + +**lrmD** is similar to LFC but uses only significantly connected partners +as reference, making it more robust when only a subset of genes are +differentially proportional. + +**ES (Enrichment Score)** is a GSEA-inspired score. Pairs are ranked by +theta ascending (most differentially proportional first). For each gene, +its "gene set" is all pairs it participates in. The ES measures how much +those pairs are enriched at the top of the ranking (i.e. among the most +differentially proportional pairs). A high ES means the gene is +systematically involved in differentially proportional relationships. + +## Permutation p-values and the batch procedure + +Because gene-pair sets overlap (every pair belongs to two genes), +standard GSEA permutation tests are invalid. To solve this, genes are +processed in batches where each focal gene is assigned a disjoint random +subset of partners. This ensures independence within each batch, yielding +valid permutation p-values. + +The procedure is repeated \code{n_iter} times with different random seeds +and results are aggregated (median ES and median p-value across iterations) +to reduce variance from the random partner assignment. + +## How to tune the parameters + +- \code{partner_fraction}: controls how many partners each gene gets per + batch (as a fraction of all genes). Higher values give more stable ES + at the cost of compute time. If you see the ES concordance warning, + try increasing this first. +- \code{n_iter}: more iterations give more stable p-values. Increase if + results vary between runs. Each extra iteration adds proportional + compute time. +- \code{nperm} (passed via \code{...}): number of permutations per fgsea + call. Increase for more precise p-values, especially for very small + p-values. Default is 1000. +- \code{seed} (passed via \code{...}): random seed for reproducibility. + Default is 42. + To reduce variance from the random partner assignment, the procedure is repeated \code{n_iter} times with different seeds. The final ES is the mean across iterations and the final p-value is the median (robust to outlier From b3543d9fa0d735c005eae9dc32b5a2aaceebd408 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Wed, 8 Apr 2026 15:20:06 +0200 Subject: [PATCH 12/13] add test data (simulated) --- R/2-propd-genewise.R | 39 ++++++++++++++++------------ tests/testthat/test-PROPD-genewise.R | 30 ++++++++++++++++++--- 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index d2339783..a5d943ce 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -331,10 +331,10 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, #' @param n_iter Integer. Number of independent iterations to run and aggregate. #' Default 5. Higher values give more stable ES and p-values at the cost of #' proportionally more compute time. -#' @param nperm Integer. Number of permutations per fgseaSimple call. Default 1000. +#' @param nperm Integer. Number of permutations per fgsea call. Default 1000. #' @param seed Integer. Base random seed. Each iteration uses seed + i for #' reproducibility. Default 42. -#' @param scoreType Character. Passed to fgseaSimple. "pos" (default) tests +#' @param scoreType Character. Passed to fgsea. "pos" (default) tests #' enrichment at the low-theta end only. "std" tests both ends. #' @return A data frame with one row per gene and columns: #' - "id": gene identifier @@ -356,13 +356,6 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, stop("Package 'fgsea' is required. Install with: BiocManager::install('fgsea')") } - # Validate assumption that Partner > Pair on a random sample - sample_idx <- sample(nrow(results), min(1000L, nrow(results))) - if (any(results$Partner[sample_idx] <= results$Pair[sample_idx])) { - stop("results$Partner must always be greater than results$Pair. ", - "This is expected propd convention but appears to be violated.") - } - if (partner_fraction <= 0 || partner_fraction >= 1) { stop("partner_fraction must be in (0, 1).") } @@ -373,8 +366,15 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, G <- length(features) results <- propd@results + # Validate assumption that Partner > Pair on a random sample + sample_idx <- sample(nrow(results), min(1000L, nrow(results))) + if (any(results$Partner[sample_idx] <= results$Pair[sample_idx])) { + stop("results$Partner must always be greater than results$Pair. ", + "This is expected propd convention but appears to be violated.") + } + # ---- Derive max_partners ---- - max_partners <- max(100L, floor((G - 1L) * partner_fraction)) + max_partners <- max(10L, floor((G - 1L) * partner_fraction)) max_partners <- as.integer(min(max_partners, G - 1L)) batch_size <- floor(G / (1L + max_partners)) @@ -382,8 +382,8 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, n_batches <- ceiling(G / batch_size) message(sprintf( - "G=%d | partner_fraction=%.3f | max_partners=%d (%.1f%%) | batch_size=%d | n_batches=%d | n_iter=%d", - G, partner_fraction, max_partners, + "G=%d | used partner_fraction=%.3f | max_partners=%d (%.1f%%) | batch_size=%d | n_batches=%d | n_iter=%d", + G, max_partners/G, max_partners, max_partners / (G - 1) * 100, batch_size, n_batches, n_iter )) @@ -439,7 +439,7 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, #' #' Internal helper called by \code{propdGenewisePval}. Processes all genes #' in batches, assigning disjoint partner subsets per focal gene, running -#' fgseaSimple per batch, and returning a per-gene data frame of ES and pval. +#' fgsea per batch, and returning a per-gene data frame of ES and pval. #' #' @keywords internal .run_batches_once <- function(results, G, global_scores, max_partners, @@ -466,19 +466,22 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, } pathways <- vector("list", n_focal) - names(pathways) <- paste0("gene_", focal_ids) # vectorized replacement for the for loop partner_matrix <- matrix(available[seq_len(n_focal * effective_partners)], nrow = effective_partners, ncol = n_focal) - higher <- pmax(focal_ids[col(partner_matrix)], partner_matrix) - lower <- pmin(focal_ids[col(partner_matrix)], partner_matrix) + higher <- matrix(pmax(focal_ids[col(partner_matrix)], partner_matrix), + nrow = nrow(partner_matrix), ncol = ncol(partner_matrix)) + lower <- matrix(pmin(focal_ids[col(partner_matrix)], partner_matrix), + nrow = nrow(partner_matrix), ncol = ncol(partner_matrix)) pathways <- lapply(seq_len(n_focal), function(i) { paste(higher[, i], lower[, i], sep = ",") }) + names(pathways) <- paste0("gene_", focal_ids) + batch_pair_names <- unique(unlist(pathways)) batch_scores <- global_scores[batch_pair_names] @@ -491,6 +494,8 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, batch_scores <- sort(batch_scores, decreasing = TRUE) + #message("batch ", b, ": n_focal=", n_focal, " n_pairs=", length(batch_scores)) + fgsea_res <- fgsea::fgsea( pathways = pathways, stats = batch_scores, @@ -530,7 +535,7 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, if (total_dropped > 0L) { warning(sprintf( - "%d / %d genes (%.1f%%) dropped by fgseaSimple and filled with ES=0, pval=1. Consider increasing partner_fraction or nperm.", + "%d / %d genes (%.1f%%) dropped by fgsea and filled with ES=0 (conservative), pval=1. Consider increasing partner_fraction or nperm.", total_dropped, G, total_dropped / G * 100 )) } diff --git a/tests/testthat/test-PROPD-genewise.R b/tests/testthat/test-PROPD-genewise.R index c397a7d9..a26675a2 100644 --- a/tests/testthat/test-PROPD-genewise.R +++ b/tests/testthat/test-PROPD-genewise.R @@ -1,10 +1,32 @@ library(testthat) library(propr) -# ---- shared test data ---- -keep <- iris$Species %in% c("setosa", "versicolor") -counts <- iris[keep, 1:4] * 10 -group <- ifelse(iris[keep, "Species"] == "setosa", "A", "B") +# ---- shared test data (simulated, for genewise tests) ---- +set.seed(42) + +# simulate 100 genes, 40 samples, 2 groups +n_samples <- 40 +n_genes <- 100 +group <- rep(c("A", "B"), each = n_samples / 2) + +# base counts with some differential proportionality signal +counts_base <- matrix(rnbinom(n_samples * n_genes, mu = 100, size = 10), + nrow = n_samples, ncol = n_genes) +colnames(counts_base) <- paste0("gene", seq_len(n_genes)) +rownames(counts_base) <- paste0("sample", seq_len(n_samples)) + +# add signal: make a few genes differentially proportional in group B +signal_genes <- sample(seq(100))[1:10] +counts_base[group == "B", signal_genes] <- matrix( + rnbinom(sum(group == "B") * length(signal_genes), mu = 300, size = 5), + nrow = sum(group == "B")) + +propd_testdata <- list(counts = counts_base, group = group) + +# ---- Generate propd object ------------ + +counts <- propd_testdata$counts +group <- propd_testdata$group pd <- propd(counts, group) pd_with_f <- updateF(pd) From ef0f11502e76f5fb9d4d56bd4a09f8804e18a3d6 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Wed, 8 Apr 2026 15:27:38 +0200 Subject: [PATCH 13/13] add check for incomplete batches --- R/2-propd-genewise.R | 6 ++++++ man/dot-run_batches_once.Rd | 2 +- man/propdGenewise.Rd | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index a5d943ce..01a9f742 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -462,6 +462,12 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, if (effective_partners < 1L) { warning(sprintf("Batch %d: not enough available partners, skipping.", b)) + batch_results[[b]] <- data.frame( + gene_id = focal_ids, + ES_batch = 0, + pval = 1, + stringsAsFactors = FALSE + ) next } diff --git a/man/dot-run_batches_once.Rd b/man/dot-run_batches_once.Rd index 00514f0a..4c3846fc 100644 --- a/man/dot-run_batches_once.Rd +++ b/man/dot-run_batches_once.Rd @@ -18,6 +18,6 @@ \description{ Internal helper called by \code{propdGenewisePval}. Processes all genes in batches, assigning disjoint partner subsets per focal gene, running -fgseaSimple per batch, and returning a per-gene data frame of ES and pval. +fgsea per batch, and returning a per-gene data frame of ES and pval. } \keyword{internal} diff --git a/man/propdGenewise.Rd b/man/propdGenewise.Rd index 0532c795..d99983ad 100644 --- a/man/propdGenewise.Rd +++ b/man/propdGenewise.Rd @@ -41,12 +41,12 @@ such as \code{nperm} (number of permutations, default 1000), \code{seed} (random seed, default 42), or \code{scoreType} (default \code{"pos"}, testing enrichment at the low-theta end only).} -\item{nperm}{Integer. Number of permutations per fgseaSimple call. Default 1000.} +\item{nperm}{Integer. Number of permutations per fgsea call. Default 1000.} \item{seed}{Integer. Base random seed. Each iteration uses seed + i for reproducibility. Default 42.} -\item{scoreType}{Character. Passed to fgseaSimple. "pos" (default) tests +\item{scoreType}{Character. Passed to fgsea. "pos" (default) tests enrichment at the low-theta end only. "std" tests both ends.} } \value{