From bd5a121cb33adcd759895c27eb37ebd3f3bbbff5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 13:48:47 +0100 Subject: [PATCH 1/8] perf: try out new clipping numba kernel --- benchmarks/benchmarks/preprocessing_log.py | 33 +++++++++++------- pyproject.toml | 2 ++ .../preprocessing/_highly_variable_genes.py | 34 +++++++++++++++---- 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 9633c8e208..8d79ce88e2 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -34,7 +34,7 @@ class PreprocessingSuite: # noqa: D101 def setup_cache(self) -> None: """Without this caching, asv was running several processes which meant the data was repeatedly downloaded.""" - for dataset, layer in product(*self.params): + for dataset, layer in product(*self.params[:2]): adata, _ = get_dataset(dataset, layer=layer) adata.write_h5ad(f"{dataset}_{layer}.h5ad") @@ -47,17 +47,6 @@ def time_pca(self, *_) -> None: def peakmem_pca(self, *_) -> None: sc.pp.pca(self.adata, svd_solver="arpack") - def time_highly_variable_genes(self, *_) -> None: - # the default flavor runs on log-transformed data - sc.pp.highly_variable_genes( - self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5 - ) - - def peakmem_highly_variable_genes(self, *_) -> None: - sc.pp.highly_variable_genes( - self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5 - ) - # regress_out is very slow for this dataset @skip_when(dataset={"pbmc3k"}) def time_regress_out(self, *_) -> None: @@ -72,3 +61,23 @@ def time_scale(self, *_) -> None: def peakmem_scale(self, *_) -> None: sc.pp.scale(self.adata, max_value=10) + + +class HVGSuite(PreprocessingSuite): # noqa: D101 + params = (*params, ["seurat_v3", "cell_ranger", "seurat"]) + param_names = (*param_names, "flavor") + + def setup(self, dataset, layer, flavor) -> None: + self.adata = ad.read_h5ad(f"{dataset}_{layer}.h5ad") 
+ self.flavor = flavor + + def time_highly_variable_genes(self, *_) -> None: + # the default flavor runs on log-transformed data + sc.pp.highly_variable_genes( + self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor=self.flavor + ) + + def peakmem_highly_variable_genes(self, *_) -> None: + sc.pp.highly_variable_genes( + self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor=self.flavor + ) diff --git a/pyproject.toml b/pyproject.toml index c61961606c..16fcae67d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -198,6 +198,8 @@ filterwarnings = [ "ignore:.*'(parseAll)'.*'(parse_all)':DeprecationWarning", # igraph vs leidenalg warning "ignore:The `igraph` implementation of leiden clustering:UserWarning", + "ignore:Detected unsupported threading environment:UserWarning", + "ignore:Cannot cache compiled function", ] [tool.coverage.run] diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py index 26d5a7763e..845b1fd21e 100644 --- a/src/scanpy/preprocessing/_highly_variable_genes.py +++ b/src/scanpy/preprocessing/_highly_variable_genes.py @@ -13,7 +13,7 @@ from fast_array_utils import stats from .. 
import logging as logg -from .._compat import CSBase, CSRBase, DaskArray, old_positionals, warn +from .._compat import CSBase, CSRBase, DaskArray, njit, old_positionals, warn from .._settings import Verbosity, settings from .._utils import ( check_nonnegative_integers, @@ -98,8 +98,7 @@ def _(data_batch: CSBase, clip_val: np.ndarray) -> tuple[np.ndarray, np.ndarray] ) -# parallel=False needed for accuracy -@numba.njit(cache=True, parallel=False) # noqa: TID251 +@njit def _sum_and_sum_squares_clipped( indices: NDArray[np.integer], data: NDArray[np.floating], @@ -108,13 +107,34 @@ def _sum_and_sum_squares_clipped( clip_val: NDArray[np.float64], nnz: int, ) -> tuple[NDArray[np.float64], NDArray[np.float64]]: - squared_batch_counts_sum = np.zeros(n_cols, dtype=np.float64) - batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + """ + Parallel implementation using thread-local buffers to avoid race conditions. + + Previous implementation used parallel=False due to race condition on shared arrays. + This version uses explicit thread-local reduction to restore both correctness + and parallelism. 
+ """ + # Thread-local accumulators for parallel reduction + n_threads = numba.get_num_threads() + squared_local = np.zeros((n_threads, n_cols), dtype=np.float64) + sum_local = np.zeros((n_threads, n_cols), dtype=np.float64) + + # Parallel accumulation into thread-local buffers (no race condition) for i in numba.prange(nnz): + tid = numba.get_thread_id() idx = indices[i] element = min(np.float64(data[i]), clip_val[idx]) - squared_batch_counts_sum[idx] += element**2 - batch_counts_sum[idx] += element + squared_local[tid, idx] += element**2 + sum_local[tid, idx] += element + + # Reduction phase: combine thread-local results + squared_batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + + for t in range(n_threads): + for j in range(n_cols): + squared_batch_counts_sum[j] += squared_local[t, j] + batch_counts_sum[j] += sum_local[t, j] return squared_batch_counts_sum, batch_counts_sum From 94b9b53df3477a39d0c71758b4484b8bbab82bc1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 13:56:57 +0100 Subject: [PATCH 2/8] fix: try scikit misc --- benchmarks/asv.conf.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json index d19b822178..32b9ca1a8f 100644 --- a/benchmarks/asv.conf.json +++ b/benchmarks/asv.conf.json @@ -83,7 +83,7 @@ // "psutil": [""] "pooch": [""], "scikit-image": [""], // https://github.com/conda-forge/scikit-misc-feedstock/pull/29 - // "scikit-misc": [""], + "scikit-misc": [""], }, // Combinations of libraries/python versions can be excluded/included @@ -104,7 +104,7 @@ // - environment_type // Environment type, as above. // - sys_platform - // Platform, as in sys.platform. Possible values for the common // cases: 'linux2', 'win32', 'cygwin', 'darwin'. 
// // "exclude": [ From de3ba0f3a814979adbc2d7764279a8f34facace7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 14:05:45 +0100 Subject: [PATCH 3/8] chore: try other datasets --- benchmarks/benchmarks/preprocessing_log.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 8d79ce88e2..3390d32795 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -63,12 +63,17 @@ def peakmem_scale(self, *_) -> None: sc.pp.scale(self.adata, max_value=10) -class HVGSuite(PreprocessingSuite): # noqa: D101 - params = (*params, ["seurat_v3", "cell_ranger", "seurat"]) - param_names = (*param_names, "flavor") +class HVGSuite: # noqa: D101 + params = (["seurat_v3", "cell_ranger", "seurat"],) + param_names = ("flavor",) + + def setup_cache(self) -> None: + """Without this caching, asv was running several processes which meant the data was repeatedly downloaded.""" + adata, _ = get_dataset("pbmc3k") + adata.write_h5ad("pbmc3k.h5ad") def setup(self, dataset, layer, flavor) -> None: - self.adata = ad.read_h5ad(f"{dataset}_{layer}.h5ad") + self.adata = sc.pp.filter_genes(ad.read_h5ad("pbmc3k.h5ad"), min_cells=3) self.flavor = flavor def time_highly_variable_genes(self, *_) -> None: From c462044e38d186e0907c31fbceda518df1eb32d8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 14:15:31 +0100 Subject: [PATCH 4/8] fix: oops --- benchmarks/benchmarks/preprocessing_log.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 3390d32795..318a8ce82c 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -72,8 +72,9 @@ def setup_cache(self) -> None: adata, _ = get_dataset("pbmc3k") adata.write_h5ad("pbmc3k.h5ad") - def setup(self, 
dataset, layer, flavor) -> None: - self.adata = sc.pp.filter_genes(ad.read_h5ad("pbmc3k.h5ad"), min_cells=3) + def setup(self, flavor) -> None: + self.adata = ad.read_h5ad("pbmc3k.h5ad") + sc.pp.filter_genes(self.adata, min_cells=3) self.flavor = flavor def time_highly_variable_genes(self, *_) -> None: From 1cf7a5eef63b4397ca5b12aacfc4a6df9937248e Mon Sep 17 00:00:00 2001 From: Intron7 Date: Mon, 26 Jan 2026 12:08:43 +0100 Subject: [PATCH 5/8] update kernel --- .../preprocessing/_highly_variable_genes.py | 59 +++++++++---------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py index 845b1fd21e..6c3a5d73d7 100644 --- a/src/scanpy/preprocessing/_highly_variable_genes.py +++ b/src/scanpy/preprocessing/_highly_variable_genes.py @@ -92,51 +92,50 @@ def _(data_batch: CSBase, clip_val: np.ndarray) -> tuple[np.ndarray, np.ndarray] return _sum_and_sum_squares_clipped( batch_counts.indices, batch_counts.data, + batch_counts.indptr, + n_rows=batch_counts.shape[0], n_cols=batch_counts.shape[1], clip_val=clip_val, - nnz=batch_counts.nnz, ) @njit def _sum_and_sum_squares_clipped( - indices: NDArray[np.integer], - data: NDArray[np.floating], + indices: np.ndarray, + data: np.ndarray, + indptr: np.ndarray, *, + n_rows: int, n_cols: int, - clip_val: NDArray[np.float64], - nnz: int, -) -> tuple[NDArray[np.float64], NDArray[np.float64]]: - """ - Parallel implementation using thread-local buffers to avoid race conditions. - - Previous implementation used parallel=False due to race condition on shared arrays. - This version uses explicit thread-local reduction to restore both correctness - and parallelism. 
- """ - # Thread-local accumulators for parallel reduction + clip_val: np.ndarray, +): n_threads = numba.get_num_threads() - squared_local = np.zeros((n_threads, n_cols), dtype=np.float64) - sum_local = np.zeros((n_threads, n_cols), dtype=np.float64) - # Parallel accumulation into thread-local buffers (no race condition) - for i in numba.prange(nnz): - tid = numba.get_thread_id() - idx = indices[i] - element = min(np.float64(data[i]), clip_val[idx]) - squared_local[tid, idx] += element**2 - sum_local[tid, idx] += element + # Each thread gets its own private buffer to avoid race conditions + sum_local = np.zeros((n_threads, n_cols), dtype=np.float64) + squared_local = np.zeros((n_threads, n_cols), dtype=np.float64) - # Reduction phase: combine thread-local results - squared_batch_counts_sum = np.zeros(n_cols, dtype=np.float64) - batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + # We parallelize over the ROWS (your original approach) + for tid in numba.prange(n_threads): + for r in range(tid, n_rows, n_threads): + for i in range(indptr[r], indptr[r + 1]): + col_idx = indices[i] + val = np.float64(data[i]) + element = min(val, clip_val[col_idx]) + # Use the thread's private buffer slice + sum_local[tid, col_idx] += element + squared_local[tid, col_idx] += element**2 + + # Reduction phase (merging the thread buffers) + final_sum = np.zeros(n_cols, dtype=np.float64) + final_squared = np.zeros(n_cols, dtype=np.float64) for t in range(n_threads): - for j in range(n_cols): - squared_batch_counts_sum[j] += squared_local[t, j] - batch_counts_sum[j] += sum_local[t, j] + for c in range(n_cols): + final_sum[c] += sum_local[t, c] + final_squared[c] += squared_local[t, c] - return squared_batch_counts_sum, batch_counts_sum + return final_squared, final_sum def _highly_variable_genes_seurat_v3( # noqa: PLR0912, PLR0915 From 967c842398bc555908e871603da8125dad2b7fd2 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Mon, 26 Jan 2026 12:09:25 +0100 Subject: [PATCH 6/8] fix --- 
src/scanpy/preprocessing/_highly_variable_genes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py index 6c3a5d73d7..fcd9257b5d 100644 --- a/src/scanpy/preprocessing/_highly_variable_genes.py +++ b/src/scanpy/preprocessing/_highly_variable_genes.py @@ -115,7 +115,7 @@ def _sum_and_sum_squares_clipped( sum_local = np.zeros((n_threads, n_cols), dtype=np.float64) squared_local = np.zeros((n_threads, n_cols), dtype=np.float64) - # We parallelize over the ROWS (your original approach) + # We parallelize over the rows of the sparse matrix for tid in numba.prange(n_threads): for r in range(tid, n_rows, n_threads): for i in range(indptr[r], indptr[r + 1]): From a2acd9faa1cd929ec0df6cb8c843869275c6e322 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Mon, 26 Jan 2026 13:17:48 +0100 Subject: [PATCH 7/8] remove unneeded computation --- .../preprocessing/_highly_variable_genes.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py index fcd9257b5d..dd62093cfd 100644 --- a/src/scanpy/preprocessing/_highly_variable_genes.py +++ b/src/scanpy/preprocessing/_highly_variable_genes.py @@ -195,13 +195,19 @@ def _highly_variable_genes_seurat_v3( # noqa: PLR0912, PLR0915 ) norm_gene_vars = [] - for b in np.unique(batch_info): + unique_batches = np.unique(batch_info) + n_batches = len(unique_batches) + + for b in unique_batches: data_batch = data[batch_info == b] - mean, var = stats.mean_var(data_batch, axis=0, correction=1) - # These get computed anyway for loess - if isinstance(mean, DaskArray): - mean, var = mean.compute(), var.compute() + if n_batches > 1: + mean, var = stats.mean_var(data_batch, axis=0, correction=1) + # Compute Dask arrays since loess requires in-memory data + if isinstance(mean, DaskArray): + mean, var = 
mean.compute(), var.compute() + else: + mean, var = df["means"].to_numpy(), df["variances"].to_numpy() not_const = var > 0 estimat_var = np.zeros(data.shape[1], dtype=np.float64) From 8b3f328748a0f2cc350509b14adb85fc9b2aa7b5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 30 Jan 2026 12:08:41 +0100 Subject: [PATCH 8/8] fix: maybe? --- benchmarks/benchmarks/preprocessing_log.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 318a8ce82c..84a356548c 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -34,7 +34,7 @@ class PreprocessingSuite: # noqa: D101 def setup_cache(self) -> None: """Without this caching, asv was running several processes which meant the data was repeatedly downloaded.""" - for dataset, layer in product(*self.params[:2]): + for dataset, layer in product(*self.params): adata, _ = get_dataset(dataset, layer=layer) adata.write_h5ad(f"{dataset}_{layer}.h5ad")