From bd5a121cb33adcd759895c27eb37ebd3f3bbbff5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 13:48:47 +0100 Subject: [PATCH 1/8] perf: try out new clipping numba kernel --- benchmarks/benchmarks/preprocessing_log.py | 33 +++++++++++------- pyproject.toml | 2 ++ .../preprocessing/_highly_variable_genes.py | 34 +++++++++++++++---- 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 9633c8e208..8d79ce88e2 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -34,7 +34,7 @@ class PreprocessingSuite: # noqa: D101 def setup_cache(self) -> None: """Without this caching, asv was running several processes which meant the data was repeatedly downloaded.""" - for dataset, layer in product(*self.params): + for dataset, layer in product(*self.params[:2]): adata, _ = get_dataset(dataset, layer=layer) adata.write_h5ad(f"{dataset}_{layer}.h5ad") @@ -47,17 +47,6 @@ def time_pca(self, *_) -> None: def peakmem_pca(self, *_) -> None: sc.pp.pca(self.adata, svd_solver="arpack") - def time_highly_variable_genes(self, *_) -> None: - # the default flavor runs on log-transformed data - sc.pp.highly_variable_genes( - self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5 - ) - - def peakmem_highly_variable_genes(self, *_) -> None: - sc.pp.highly_variable_genes( - self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5 - ) - # regress_out is very slow for this dataset @skip_when(dataset={"pbmc3k"}) def time_regress_out(self, *_) -> None: @@ -72,3 +61,23 @@ def time_scale(self, *_) -> None: def peakmem_scale(self, *_) -> None: sc.pp.scale(self.adata, max_value=10) + + +class HVGSuite(PreprocessingSuite): # noqa: D101 + params = (*params, ["seurat_v3", "cell_ranger", "seurat"]) + param_names = (*param_names, "flavor") + + def setup(self, dataset, layer, flavor) -> None: + self.adata = ad.read_h5ad(f"{dataset}_{layer}.h5ad") 
+ self.flavor = flavor + + def time_highly_variable_genes(self, *_) -> None: + # the default flavor runs on log-transformed data + sc.pp.highly_variable_genes( + self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor=self.flavor + ) + + def peakmem_highly_variable_genes(self, *_) -> None: + sc.pp.highly_variable_genes( + self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor=self.flavor + ) diff --git a/pyproject.toml b/pyproject.toml index c61961606c..16fcae67d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -198,6 +198,8 @@ filterwarnings = [ "ignore:.*'(parseAll)'.*'(parse_all)':DeprecationWarning", # igraph vs leidenalg warning "ignore:The `igraph` implementation of leiden clustering:UserWarning", + "ignore:Detected unsupported threading environment:UserWarning", + "ignore:Cannot cache compiled function", ] [tool.coverage.run] diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py index 26d5a7763e..845b1fd21e 100644 --- a/src/scanpy/preprocessing/_highly_variable_genes.py +++ b/src/scanpy/preprocessing/_highly_variable_genes.py @@ -13,7 +13,7 @@ from fast_array_utils import stats from .. 
import logging as logg -from .._compat import CSBase, CSRBase, DaskArray, old_positionals, warn +from .._compat import CSBase, CSRBase, DaskArray, njit, old_positionals, warn from .._settings import Verbosity, settings from .._utils import ( check_nonnegative_integers, @@ -98,8 +98,7 @@ def _(data_batch: CSBase, clip_val: np.ndarray) -> tuple[np.ndarray, np.ndarray] ) -# parallel=False needed for accuracy -@numba.njit(cache=True, parallel=False) # noqa: TID251 +@njit def _sum_and_sum_squares_clipped( indices: NDArray[np.integer], data: NDArray[np.floating], @@ -108,13 +107,34 @@ def _sum_and_sum_squares_clipped( clip_val: NDArray[np.float64], nnz: int, ) -> tuple[NDArray[np.float64], NDArray[np.float64]]: - squared_batch_counts_sum = np.zeros(n_cols, dtype=np.float64) - batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + """ + Parallel implementation using thread-local buffers to avoid race conditions. + + Previous implementation used parallel=False due to race condition on shared arrays. + This version uses explicit thread-local reduction to restore both correctness + and parallelism. 
+ """ + # Thread-local accumulators for parallel reduction + n_threads = numba.get_num_threads() + squared_local = np.zeros((n_threads, n_cols), dtype=np.float64) + sum_local = np.zeros((n_threads, n_cols), dtype=np.float64) + + # Parallel accumulation into thread-local buffers (no race condition) for i in numba.prange(nnz): + tid = numba.get_thread_id() idx = indices[i] element = min(np.float64(data[i]), clip_val[idx]) - squared_batch_counts_sum[idx] += element**2 - batch_counts_sum[idx] += element + squared_local[tid, idx] += element**2 + sum_local[tid, idx] += element + + # Reduction phase: combine thread-local results + squared_batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + + for t in range(n_threads): + for j in range(n_cols): + squared_batch_counts_sum[j] += squared_local[t, j] + batch_counts_sum[j] += sum_local[t, j] return squared_batch_counts_sum, batch_counts_sum From 94b9b53df3477a39d0c71758b4484b8bbab82bc1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 13:56:57 +0100 Subject: [PATCH 2/8] fix: try scikit misc --- benchmarks/asv.conf.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json index d19b822178..32b9ca1a8f 100644 --- a/benchmarks/asv.conf.json +++ b/benchmarks/asv.conf.json @@ -83,7 +83,7 @@ // "psutil": [""] "pooch": [""], "scikit-image": [""], // https://github.com/conda-forge/scikit-misc-feedstock/pull/29 - // "scikit-misc": [""], + "scikit-misc": [""], }, // Combinations of libraries/python versions can be excluded/included @@ -104,7 +104,7 @@ // - environment_type // Environment type, as above. // - sys_platform - // Platform, as in sys.platform. Possible values for the common // cases: 'linux2', 'win32', 'cygwin', 'darwin'. 
// // "exclude": [ From de3ba0f3a814979adbc2d7764279a8f34facace7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 14:05:45 +0100 Subject: [PATCH 3/8] chore: try other datasets --- benchmarks/benchmarks/preprocessing_log.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 8d79ce88e2..3390d32795 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -63,12 +63,17 @@ def peakmem_scale(self, *_) -> None: sc.pp.scale(self.adata, max_value=10) -class HVGSuite(PreprocessingSuite): # noqa: D101 - params = (*params, ["seurat_v3", "cell_ranger", "seurat"]) - param_names = (*param_names, "flavor") +class HVGSuite: # noqa: D101 + params = (["seurat_v3", "cell_ranger", "seurat"],) + param_names = ("flavor",) + + def setup_cache(self) -> None: + """Without this caching, asv was running several processes which meant the data was repeatedly downloaded.""" + adata, _ = get_dataset("pbmc3k") + adata.write_h5ad("pbmc3k.h5ad") def setup(self, dataset, layer, flavor) -> None: - self.adata = ad.read_h5ad(f"{dataset}_{layer}.h5ad") + self.adata = sc.pp.filter_genes(ad.read_h5ad("pbmc3k.h5ad"), min_cells=3) self.flavor = flavor def time_highly_variable_genes(self, *_) -> None: From c462044e38d186e0907c31fbceda518df1eb32d8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 14:15:31 +0100 Subject: [PATCH 4/8] fix: oops --- benchmarks/benchmarks/preprocessing_log.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 3390d32795..318a8ce82c 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -72,8 +72,9 @@ def setup_cache(self) -> None: adata, _ = get_dataset("pbmc3k") adata.write_h5ad("pbmc3k.h5ad") - def setup(self, 
dataset, layer, flavor) -> None: - self.adata = sc.pp.filter_genes(ad.read_h5ad("pbmc3k.h5ad"), min_cells=3) + def setup(self, flavor) -> None: + self.adata = ad.read_h5ad("pbmc3k.h5ad") + sc.pp.filter_genes(self.adata, min_cells=3) self.flavor = flavor def time_highly_variable_genes(self, *_) -> None: From 1cf7a5eef63b4397ca5b12aacfc4a6df9937248e Mon Sep 17 00:00:00 2001 From: Intron7 Date: Mon, 26 Jan 2026 12:08:43 +0100 Subject: [PATCH 5/8] update kernel --- .../preprocessing/_highly_variable_genes.py | 59 +++++++++---------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py index 845b1fd21e..6c3a5d73d7 100644 --- a/src/scanpy/preprocessing/_highly_variable_genes.py +++ b/src/scanpy/preprocessing/_highly_variable_genes.py @@ -92,51 +92,50 @@ def _(data_batch: CSBase, clip_val: np.ndarray) -> tuple[np.ndarray, np.ndarray] return _sum_and_sum_squares_clipped( batch_counts.indices, batch_counts.data, + batch_counts.indptr, + n_rows=batch_counts.shape[0], n_cols=batch_counts.shape[1], clip_val=clip_val, - nnz=batch_counts.nnz, ) @njit def _sum_and_sum_squares_clipped( - indices: NDArray[np.integer], - data: NDArray[np.floating], + indices: np.ndarray, + data: np.ndarray, + indptr: np.ndarray, *, + n_rows: int, n_cols: int, - clip_val: NDArray[np.float64], - nnz: int, -) -> tuple[NDArray[np.float64], NDArray[np.float64]]: - """ - Parallel implementation using thread-local buffers to avoid race conditions. - - Previous implementation used parallel=False due to race condition on shared arrays. - This version uses explicit thread-local reduction to restore both correctness - and parallelism. 
- """ - # Thread-local accumulators for parallel reduction + clip_val: np.ndarray, +): n_threads = numba.get_num_threads() - squared_local = np.zeros((n_threads, n_cols), dtype=np.float64) - sum_local = np.zeros((n_threads, n_cols), dtype=np.float64) - # Parallel accumulation into thread-local buffers (no race condition) - for i in numba.prange(nnz): - tid = numba.get_thread_id() - idx = indices[i] - element = min(np.float64(data[i]), clip_val[idx]) - squared_local[tid, idx] += element**2 - sum_local[tid, idx] += element + # Each thread gets its own private buffer to avoid race conditions + sum_local = np.zeros((n_threads, n_cols), dtype=np.float64) + squared_local = np.zeros((n_threads, n_cols), dtype=np.float64) - # Reduction phase: combine thread-local results - squared_batch_counts_sum = np.zeros(n_cols, dtype=np.float64) - batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + # We parallelize over the ROWS (your original approach) + for tid in numba.prange(n_threads): + for r in range(tid, n_rows, n_threads): + for i in range(indptr[r], indptr[r + 1]): + col_idx = indices[i] + val = np.float64(data[i]) + element = min(val, clip_val[col_idx]) + # Use the thread's private buffer slice + sum_local[tid, col_idx] += element + squared_local[tid, col_idx] += element**2 + + # Reduction phase (merging the thread buffers) + final_sum = np.zeros(n_cols, dtype=np.float64) + final_squared = np.zeros(n_cols, dtype=np.float64) for t in range(n_threads): - for j in range(n_cols): - squared_batch_counts_sum[j] += squared_local[t, j] - batch_counts_sum[j] += sum_local[t, j] + for c in range(n_cols): + final_sum[c] += sum_local[t, c] + final_squared[c] += squared_local[t, c] - return squared_batch_counts_sum, batch_counts_sum + return final_squared, final_sum def _highly_variable_genes_seurat_v3( # noqa: PLR0912, PLR0915 From 967c842398bc555908e871603da8125dad2b7fd2 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Mon, 26 Jan 2026 12:09:25 +0100 Subject: [PATCH 6/8] fix --- 
src/scanpy/preprocessing/_highly_variable_genes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py index 6c3a5d73d7..fcd9257b5d 100644 --- a/src/scanpy/preprocessing/_highly_variable_genes.py +++ b/src/scanpy/preprocessing/_highly_variable_genes.py @@ -115,7 +115,7 @@ def _sum_and_sum_squares_clipped( sum_local = np.zeros((n_threads, n_cols), dtype=np.float64) squared_local = np.zeros((n_threads, n_cols), dtype=np.float64) - # We parallelize over the ROWS (your original approach) + # We parallelize over the rows of the sparse matrix for tid in numba.prange(n_threads): for r in range(tid, n_rows, n_threads): for i in range(indptr[r], indptr[r + 1]): From a2acd9faa1cd929ec0df6cb8c843869275c6e322 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Mon, 26 Jan 2026 13:17:48 +0100 Subject: [PATCH 7/8] remove unneeded computation --- .../preprocessing/_highly_variable_genes.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py index fcd9257b5d..dd62093cfd 100644 --- a/src/scanpy/preprocessing/_highly_variable_genes.py +++ b/src/scanpy/preprocessing/_highly_variable_genes.py @@ -195,13 +195,19 @@ def _highly_variable_genes_seurat_v3( # noqa: PLR0912, PLR0915 ) norm_gene_vars = [] - for b in np.unique(batch_info): + unique_batches = np.unique(batch_info) + n_batches = len(unique_batches) + + for b in unique_batches: data_batch = data[batch_info == b] - mean, var = stats.mean_var(data_batch, axis=0, correction=1) - # These get computed anyway for loess - if isinstance(mean, DaskArray): - mean, var = mean.compute(), var.compute() + if n_batches > 1: + mean, var = stats.mean_var(data_batch, axis=0, correction=1) + # Compute Dask arrays since loess requires in-memory data + if isinstance(mean, DaskArray): + mean, var = 
mean.compute(), var.compute() + else: + mean, var = df["means"].to_numpy(), df["variances"].to_numpy() not_const = var > 0 estimat_var = np.zeros(data.shape[1], dtype=np.float64) From 8b3f328748a0f2cc350509b14adb85fc9b2aa7b5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 30 Jan 2026 12:08:41 +0100 Subject: [PATCH 8/8] fix: maybe? --- benchmarks/benchmarks/preprocessing_log.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 318a8ce82c..84a356548c 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -34,7 +34,7 @@ class PreprocessingSuite: # noqa: D101 def setup_cache(self) -> None: """Without this caching, asv was running several processes which meant the data was repeatedly downloaded.""" - for dataset, layer in product(*self.params[:2]): + for dataset, layer in product(*self.params): adata, _ = get_dataset(dataset, layer=layer) adata.write_h5ad(f"{dataset}_{layer}.h5ad")