-
Notifications
You must be signed in to change notification settings - Fork 2
Open
Description
For using seurat wrapper, I have to use enhance_seurat_wrapper. But An error comes up saying
Error in colSums(data_raw) :
'x' must be an array of at least two dimensions
Looking into the code, I think it is because GetAssayData function returns a sparse matrix. If I edit the function to use matrix instead of sparse matrix, it works. If it's a gloabal issue you may rectify it, or it could be some interefernce in my R environment.
The function I used at the end is. Only change 'as.matrix' in the seurat wrapper function.
# ENHANCE denoising algorithm for single-cell RNA-Seq data
# Version: 0.1
#
# Authors: Dalia Barkley <dalia.barkley@nyulangone.org>, Florian Wagner <florian.wagner@nyu.edu>
# Copyright (c) 2019 New York University
#
# Notes
# =====
# - R implementation, depends on "rsvd" CRAN package.
normalize_median = function(
D,
med = NULL
){
# Normalizes the expression profiles in an expression matrix.
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# med (optional): The desired transcript count to normalize to.
# By default, the median transcript count across all cells is used.
#
# Returns:
# The normalized expression matrix.
tpercell = colSums(D)
if (is.null(med)){
med = median(tpercell)
}
D = med * t(t(D) / tpercell)
return(D)
}
transform_ft = function(
D
){
# Performs Freeman-Tukey transformation of a normalized expression matrix.
#
# Args:
# D: The normalized expression matrix (rows=genes, columns=cells).
#
# Returns:
# The transformed expression matrix.
D = sqrt(D) + sqrt(1+D)
return(D)
}
simulate_noise_matrix = function(
D
){
# Simulates a matrix in which all variance is noise.
# First, the mean expression level across the original matrix is calculated.
# Then, a new matrix is generated by poisson sampling from these mean values.
#
#
# Args:
# D: The normalized expression matrix (rows=genes, columns=cells).
#
# Returns:
# A simulated pure noise matrix of identical dimensions to the input matrix
ncells = dim(D)[2]
means = rowMeans(D)
D = t(sapply(means, function(m){
rpois(ncells, lambda = m)
}))
colnames(D) = 1:ncells
return(D)
}
perform_pca = function(
D,
med = NULL,
return_pcs = TRUE,
ratio_pcs = NULL
){
# Performs PCA
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# med (optional): The desired transcript count to normalize to.
# By default, the median transcript count across all cells is used.
# return_pcs (optional): Whether to determine the number of significant principal components
# using a simulated expression matrix.
# Default: FALSE
# ratio_pcs (optional): If determining the number of significant principal components
# using a simulated expression matrix, variance ratio between simulated and real matrix
# to use to determined the threshold.
# Default: 2
#
# Returns:
# A PCA object containing rotation (gene loadings), x (cell scores) and pcs (significant principal components) if return_pcs is TRUE.
if (is.null(med)){
tpercell = colSums(D)
med = median(tpercell)
}
PCA = rpca(t(transform_ft(normalize_median(D, med = med))), k = 50, center = TRUE, scale = FALSE, retx = TRUE, p = 10, q = 7)
if (return_pcs){
D_sim = simulate_noise_matrix(normalize_median(D, med=med))
PCA_sim = rpca(t(transform_ft(normalize_median(D_sim, med = med))), k = 50, center = TRUE, scale = FALSE, retx = TRUE, p = 10, q = 7)
pcs = which(PCA$sdev^2 > ratio_pcs*PCA_sim$sdev[1]^2)
if (length(pcs) < 2){
pcs = c(1,2)
}
return(c(PCA, list('pcs' = pcs)))
} else {
return(PCA)
}
}
find_nearest_neighbors = function(
D,
k_nn
){
# Finds the nearest neighbors of each cell in a matrix
# First, calculates the distance between each pair of cells.
# Then, for each cell, sorts the distances to it to return the nearest neighbors.
#
#
# Args:
# D: The matrix from which to calculate the distance (rows=variables, columns=cells).
# This can be an expression matrix (rows=genes) or principal component coordinates (rows=scores)
# k_nn: Number of neighbors to return
#
# Returns:
# A list containing, for each cell, the indices of its k_nn nearest neighbors
ncells = dim(D)[2]
distances = as.matrix(dist(t(D), method = 'euclidean'))
nn = lapply(1:ncells, function(c){
order(distances[,c])[1:k_nn]
})
return(nn)
}
aggregate_nearest_neighbors = function(
D,
nn
){
# For each cell, aggregates the expression values of its nearest neighbors
#
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# nn: The list containing, for each cell, the indices of its k_nn nearest neighbors
#
# Returns:
# The aggregated expression matrix
D_agg = sapply(nn, function(indices){
rowSums(D[, indices])
})
return(D_agg)
}
estimate_sizes = function(
D,
nn
){
# For each cell, estimates its size by taking the median size of its k_nn nearest neighbors
#
#
# Args:
# D: The count expression matrix (rows=genes, columns=cells).
# nn: The list containing, for each cell, the indices of its k_nn nearest neighbors
#
# Returns:
# The vector of estimated cell sizes
sizes = sapply(nn, function(indices){
median(colSums(D[, indices]))
})
return(sizes)
}
enhance = function(
data_raw = NULL,
ratio_pcs = 2,
k_nn = NULL,
target_transcripts = 2*10^5,
percent_cells_max = 2
){
# Main function, performs all the steps to return a denoised expression matrix
#
#
# Args:
# data_raw: The raw count expression matrix (rows=genes, columns=cells).
# ratio_pcs (optional): Variance ratio between simulated and real matrix
# to use to determined the threshold at which to keep principal components
# Default: 2
# k_nn (optional): Number of neighbors to aggregate
# Default: Calculated so that aggregation yields ~ target_transcripts per cell
# target_transcripts (optional): Number of transcripts per cell to aim for in aggregation
# Default: 2*10^5
# percent_cells_max (optional): Maximum percentage of cells to aggregate
# Default: 2
#
# Returns:
# The denoised expression matrix
library(rsvd)
# Determining k_nn
med_raw = median(colSums(data_raw))
if (is.null(k_nn)){
ncells = dim(data_raw)[2]
k_nn_max = ceiling(percent_cells_max/100 * ncells)
k_nn = ceiling(target_transcripts / med_raw)
print(paste('Calculating number of neighbors to aggregate to aim for', target_transcripts, 'transcripts'))
if (k_nn > k_nn_max){
k_nn = k_nn_max
print(paste('Calculated number of neighbors to aggregate was too high, reducing to', percent_cells_max, 'percent of cells'))
}
}
print(paste('Number of neighbors to aggregate:', k_nn))
# PCA on the raw data
pca_raw = perform_pca(D = data_raw, med = med_raw, return_pcs = TRUE, ratio_pcs = ratio_pcs)
print(paste('Number of principal components to use:', max(pca_raw$pcs)))
data_raw_pca_raw = t(pca_raw$x)[pca_raw$pcs, ]
# Find nearest neighbors based on PCA, then aggregate raw data
nn_1 = find_nearest_neighbors(D = data_raw_pca_raw, k_nn = k_nn)
data_agg1 = aggregate_nearest_neighbors(D = data_raw, nn = nn_1)
# PCA on the aggregated data, then projection of the raw data onto these PCs
pca_agg1 = perform_pca(D = data_agg1, med = med_raw, return_pcs = FALSE)
data_raw_pca_agg1 = t(t(transform_ft(normalize_median(data_raw, med = med_raw)) - pca_agg1$center) %*% pca_agg1$rotation)[pca_raw$pcs, ]
# Find nearest neighbors based on the projected PCA, then aggregate raw data
nn_2 = find_nearest_neighbors(D = data_raw_pca_agg1, k_nn = k_nn)
data_agg2 = aggregate_nearest_neighbors(D = data_raw, nn = nn_2)
sizes_agg2 = estimate_sizes(D = data_raw, nn = nn_2)
med_agg2 = median(colSums(data_agg2))
# PCA on the aggregated data
pca_agg2 = perform_pca(D = data_agg2, med = med_agg2, return_pcs = FALSE)
data_agg2_pca_agg2 = t(pca_agg2$x)[pca_raw$pcs, ]
# Reversing PCA, rescaling to estimated cell sizes, and reversing transform
data_denoise = pca_agg2$rotation[, pca_raw$pcs] %*% t(pca_agg2$x)[pca_raw$pcs, ] + pca_agg2$center
data_denoise[data_denoise < 1] = 1
data_denoise = (data_denoise^2 - 1)^2 / (4*data_denoise^2)
data_denoise = t(t(data_denoise) * sizes_agg2/colSums(data_denoise))
colnames(data_denoise) = colnames(data_raw)
return(data_denoise)
}
enhance_seurat_wrapper = function(
object,
setDefaultAssay = TRUE,
assay = 'RNA',
ratio_pcs = 2,
k_nn = NULL,
target_transcripts = 2*10^5,
percent_cells_max = 2
){
# Seurat wrapper for enhance
#
# Args:
# object: A Seurat object
# setDefaultAssay: Whether to set enhance as the default assay
# Default: TRUE
# assay: Which assay to run enhance on
# Default: "RNA"
# ratio_pcs (optional): Variance ratio between simulated and real matrix
# to use to determined the threshold at which to keep principal components
# Default: 2
# k_nn (optional): Number of neighbors to aggregate
# Default: Calculated so that aggregation yields ~ target_transcripts per cell
# target_transcripts (optional): Number of transcripts per cell to aim for in aggregation
# percent_cells_max (optional): Maximum percentage of cells to aggregate
# Default: 2
#
# Returns:
# A Seurat object with a new assay called "enhance", in which the "counts" slot is the denoised counts matrix
# Downstream analysis may include NormalizeData, ScaleData, FindVariableFeatures, RunPCA, etc
counts = as.matrix(GetAssayData(object, assay = assay, slot = "counts"))
counts_enhance = enhance(data_raw = counts, ratio_pcs = ratio_pcs, k_nn = k_nn, target_transcripts = target_transcripts, percent_cells_max = percent_cells_max)
counts_enhance = Matrix(data = counts_enhance, sparse = TRUE)
colnames(counts_enhance) = colnames(counts)
rownames(counts_enhance) = rownames(counts)
object[["enhance"]] = CreateAssayObject(counts = counts_enhance)
if (setDefaultAssay) {
message("Setting default assay as enhance")
DefaultAssay(object) = "enhance"
}
return(object)
}
Puriney
Metadata
Metadata
Assignees
Labels
No labels