From 17d150dccdeb28b113dedb24640056bf0ea368ce Mon Sep 17 00:00:00 2001
From: Philip Adams
Date: Mon, 9 Feb 2026 12:53:01 -0800
Subject: [PATCH 01/46] initial checkpoint of implementation state

---
 diskann-benchmark/example/pipe-search.json    |  83 ++
 .../src/backend/disk_index/search.rs          | 330 +++++---
 diskann-benchmark/src/inputs/disk.rs          |  34 +
 diskann-disk/src/data_model/graph_header.rs   |   1 +
 diskann-disk/src/search/mod.rs                |   3 +
 diskann-disk/src/search/pipelined/mod.rs      |  22 +
 .../src/search/pipelined/pipelined_reader.rs  | 144 ++++
 .../src/search/pipelined/pipelined_search.rs  | 795 ++++++++++++++++++
 .../search/pipelined/pipelined_searcher.rs    | 184 ++++
 9 files changed, 1499 insertions(+), 97 deletions(-)
 create mode 100644 diskann-benchmark/example/pipe-search.json
 create mode 100644 diskann-disk/src/search/pipelined/mod.rs
 create mode 100644 diskann-disk/src/search/pipelined/pipelined_reader.rs
 create mode 100644 diskann-disk/src/search/pipelined/pipelined_search.rs
 create mode 100644 diskann-disk/src/search/pipelined/pipelined_searcher.rs

diff --git a/diskann-benchmark/example/pipe-search.json b/diskann-benchmark/example/pipe-search.json
new file mode 100644
index 000000000..41349a3e7
--- /dev/null
+++ b/diskann-benchmark/example/pipe-search.json
@@ -0,0 +1,83 @@
+{
+  "search_directories": [
+    "test_data/disk_index_search"
+  ],
+  "jobs": [
+    {
+      "type": "disk-index",
+      "content": {
+        "source": {
+          "disk-index-source": "Load",
+          "data_type": "float32",
+          "load_path": "test_data/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search"
+        },
+        "search_phase": {
+          "queries": "disk_index_sample_query_10pts.fbin",
+          "groundtruth": "disk_index_10pts_idx_uint32_truth_search_res.bin",
+          "search_list": [10, 20, 40, 80],
+          "beam_width": 4,
+          "recall_at": 10,
+          "num_threads": 1,
+          "is_flat_search": false,
+          "distance": "squared_l2",
+          "vector_filters_file": null,
+          "search_mode": {
+            "mode": "BeamSearch"
+          }
+        }
+      }
+    },
+    {
+      "type": "disk-index",
+      "content": {
+        "source": {
+          "disk-index-source": "Load",
+          "data_type": "float32",
+          "load_path": "test_data/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search"
+        },
+        "search_phase": {
+          "queries": "disk_index_sample_query_10pts.fbin",
+          "groundtruth": "disk_index_10pts_idx_uint32_truth_search_res.bin",
+          "search_list": [10, 20, 40, 80],
+          "beam_width": 4,
+          "recall_at": 10,
+          "num_threads": 1,
+          "is_flat_search": false,
+          "distance": "squared_l2",
+          "vector_filters_file": null,
+          "search_mode": {
+            "mode": "PipeSearch",
+            "initial_beam_width": 4,
+            "relaxed_monotonicity_l": null
+          }
+        }
+      }
+    },
+    {
+      "type": "disk-index",
+      "content": {
+        "source": {
+          "disk-index-source": "Load",
+          "data_type": "float32",
+          "load_path": "test_data/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search"
+        },
+        "search_phase": {
+          "queries": "disk_index_sample_query_10pts.fbin",
+          "groundtruth": "disk_index_10pts_idx_uint32_truth_search_res.bin",
+          "search_list": [10, 20, 40, 80],
+          "beam_width": 8,
+          "recall_at": 10,
+          "num_threads": 1,
+          "is_flat_search": false,
+          "distance": "squared_l2",
+          "vector_filters_file": null,
+          "search_mode": {
+            "mode": "PipeSearch",
+            "initial_beam_width": 2,
+            "relaxed_monotonicity_l": 50
+          }
+        }
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs
index 65e5804a7..21c2ba9dd 100644
--- a/diskann-benchmark/src/backend/disk_index/search.rs
+++ b/diskann-benchmark/src/backend/disk_index/search.rs
@@ -16,9 +16,12 @@ use
diskann_disk::{
     search::provider::{
         disk_provider::DiskIndexSearcher, disk_vertex_provider_factory::DiskVertexProviderFactory,
     },
+    search::traits::VertexProviderFactory,
     storage::disk_index_reader::DiskIndexReader,
     utils::{instrumentation::PerfLogger, statistics, AlignedFileReaderFactory, QueryStatistics},
 };
+#[cfg(target_os = "linux")]
+use diskann_disk::search::pipelined::PipelinedSearcher;
 use diskann_providers::storage::StorageReadProvider;
 use diskann_providers::{
     storage::{
@@ -32,7 +35,7 @@ use serde::Serialize;

 use crate::{
     backend::disk_index::{graph_data_type::GraphData, json_spancollector::JsonSpanCollector},
-    inputs::disk::{DiskIndexLoad, DiskSearchPhase},
+    inputs::disk::{DiskIndexLoad, DiskSearchPhase, SearchMode},
     utils::{datafiles, SimilarityMeasure},
 };

@@ -44,6 +47,7 @@ pub(super) struct DiskSearchStats {
     pub(crate) is_flat_search: bool,
     pub(crate) distance: SimilarityMeasure,
     pub(crate) uses_vector_filters: bool,
+    pub(super) search_mode: String,
     pub(super) num_nodes_to_cache: Option<usize>,
     pub(super) search_results_per_l: Vec<DiskSearchResult>,
     span_metrics: serde_json::Value,
@@ -214,113 +218,243 @@ where
         CachingStrategy::None
     };

-    let reader_factory = AlignedFileReaderFactory::new(disk_index_path);
+    let reader_factory = AlignedFileReaderFactory::new(disk_index_path.clone());
     let vertex_provider_factory = DiskVertexProviderFactory::new(reader_factory, caching_strategy)?;
-    let searcher = &DiskIndexSearcher::<GraphData<T>, _>::new(
-        search_params.num_threads,
-        if let Some(lim) = search_params.search_io_limit {
-            lim
-        } else {
-            usize::MAX
-        },
-        &index_reader,
-        vertex_provider_factory,
-        search_params.distance.into(),
-        None,
-    )?;
-
-    logger.log_checkpoint("index_loaded");
-
     let pool = create_thread_pool(search_params.num_threads)?;
     let mut search_results_per_l = Vec::with_capacity(search_params.search_list.len());
     let has_any_search_failed = AtomicBool::new(false);
-    // Execute search iterations
-    for &l in search_params.search_list.iter() {
-        let mut statistics_vec: Vec<QueryStatistics> =
-            vec![QueryStatistics::default(); num_queries];
-        let mut result_counts: Vec<u32> = vec![0; num_queries];
-        let mut result_ids: Vec<u32> = vec![0; (search_params.recall_at as usize) * num_queries];
-        let mut result_dists: Vec<f32> =
-            vec![0.0; (search_params.recall_at as usize) * num_queries];
-
-        let start = Instant::now();
-
-        let mut l_span = {
-            let tracer = global::tracer("");
-            let span_name = format!("search-with-L={}-bw={}", l, search_params.beam_width);
-            tracer.start(span_name)
-        };
+    match &search_params.search_mode {
+        SearchMode::BeamSearch => {
+            let searcher = &DiskIndexSearcher::<GraphData<T>, _>::new(
+                search_params.num_threads,
+                search_params.search_io_limit.unwrap_or(usize::MAX),
+                &index_reader,
+                vertex_provider_factory,
+                search_params.distance.into(),
+                None,
+            )?;

-        let zipped = queries
-            .par_row_iter()
-            .zip(vector_filters.par_iter())
-            .zip(result_ids.par_chunks_mut(search_params.recall_at as usize))
-            .zip(result_dists.par_chunks_mut(search_params.recall_at as usize))
-            .zip(statistics_vec.par_iter_mut())
-            .zip(result_counts.par_iter_mut());
-
-        zipped.for_each_in_pool(&pool, |(((((q, vf), id_chunk), dist_chunk), stats), rc)| {
-            let vector_filter = if search_params.vector_filters_file.is_none() {
-                None
-            } else {
-                Some(Box::new(move |vid: &u32| vf.contains(vid))
-                    as Box<dyn Fn(&u32) -> bool + Send + Sync>)
-            };
-
-            match searcher.search(
-                q,
-                search_params.recall_at,
-                l,
-                Some(search_params.beam_width),
-                vector_filter,
-                search_params.is_flat_search,
-            ) {
-                Ok(search_result) => {
-                    *stats = search_result.stats.query_statistics;
-                    *rc = search_result.results.len() as u32;
-                    let actual_results = search_result
-                        .results
-                        .len()
-                        .min(search_params.recall_at as usize);
-                    for (i, result_item) in search_result
-                        .results
-                        .iter()
-                        .take(actual_results)
-                        .enumerate()
-                    {
-                        id_chunk[i] = result_item.vertex_id;
-                        dist_chunk[i] = result_item.distance;
-                    }
-                }
-                Err(e) => {
-                    eprintln!("Search failed for query: {:?}", e);
-                    *rc = 0;
-                    id_chunk.fill(0);
-                    dist_chunk.fill(0.0);
-                    has_any_search_failed.store(true, std::sync::atomic::Ordering::Release);
+            logger.log_checkpoint("index_loaded");
+
+            for &l in search_params.search_list.iter() {
+                let mut statistics_vec: Vec<QueryStatistics> =
+                    vec![QueryStatistics::default(); num_queries];
+                let mut result_counts: Vec<u32> = vec![0; num_queries];
+                let mut result_ids: Vec<u32> =
+                    vec![0; (search_params.recall_at as usize) * num_queries];
+                let mut result_dists: Vec<f32> =
+                    vec![0.0; (search_params.recall_at as usize) * num_queries];
+
+                let start = Instant::now();
+
+                let mut l_span = {
+                    let tracer = global::tracer("");
+                    let span_name =
+                        format!("search-with-L={}-bw={}", l, search_params.beam_width);
+                    tracer.start(span_name)
+                };
+
+                let zipped = queries
+                    .par_row_iter()
+                    .zip(vector_filters.par_iter())
+                    .zip(result_ids.par_chunks_mut(search_params.recall_at as usize))
+                    .zip(result_dists.par_chunks_mut(search_params.recall_at as usize))
+                    .zip(statistics_vec.par_iter_mut())
+                    .zip(result_counts.par_iter_mut());
+
+                zipped.for_each_in_pool(
+                    &pool,
+                    |(((((q, vf), id_chunk), dist_chunk), stats), rc)| {
+                        let vector_filter = if search_params.vector_filters_file.is_none() {
+                            None
+                        } else {
+                            Some(Box::new(move |vid: &u32| vf.contains(vid))
+                                as Box<dyn Fn(&u32) -> bool + Send + Sync>)
+                        };
+
+                        match searcher.search(
+                            q,
+                            search_params.recall_at,
+                            l,
+                            Some(search_params.beam_width),
+                            vector_filter,
+                            search_params.is_flat_search,
+                        ) {
+                            Ok(search_result) => {
+                                *stats = search_result.stats.query_statistics;
+                                *rc = search_result.results.len() as u32;
+                                let actual_results = search_result
+                                    .results
+                                    .len()
+                                    .min(search_params.recall_at as usize);
+                                for (i, result_item) in search_result
+                                    .results
+                                    .iter()
+                                    .take(actual_results)
+                                    .enumerate()
+                                {
+                                    id_chunk[i] = result_item.vertex_id;
+                                    dist_chunk[i] = result_item.distance;
+                                }
+                            }
+                            Err(e) => {
+                                eprintln!("Search failed for query: {:?}", e);
+                                *rc = 0;
+                                id_chunk.fill(0);
+                                dist_chunk.fill(0.0);
+                                has_any_search_failed
+                                    .store(true, std::sync::atomic::Ordering::Release);
+                            }
+                        }
+                    },
+                );
+                let total_time = start.elapsed();
+
+                if has_any_search_failed.load(std::sync::atomic::Ordering::Acquire) {
+                    anyhow::bail!("One or more searches failed. See logs for details.");
+                }
+
+                let search_result = DiskSearchResult::new(
+                    &statistics_vec,
+                    &result_ids,
+                    &result_counts,
+                    l,
+                    total_time.as_secs_f32(),
+                    num_queries,
+                    &gt_context,
+                )?;
+
+                l_span.end();
+                search_results_per_l.push(search_result);
+            }
+        }
+        SearchMode::PipeSearch {
+            initial_beam_width,
+            relaxed_monotonicity_l,
+        } => {
+            #[cfg(target_os = "linux")]
+            {
+                let graph_header = vertex_provider_factory.get_header()?;
+                let pq_data = index_reader.get_pq_data();
+                let metric = search_params.distance.into();
+                let search_io_limit = search_params.search_io_limit.unwrap_or(usize::MAX);
+                let initial_beam_width = *initial_beam_width;
+                let relaxed_monotonicity_l = *relaxed_monotonicity_l;
+
+                logger.log_checkpoint("index_loaded");
+
+                for &l in search_params.search_list.iter() {
+                    let mut statistics_vec: Vec<QueryStatistics> =
+                        vec![QueryStatistics::default(); num_queries];
+                    let mut result_counts: Vec<u32> = vec![0; num_queries];
+                    let mut result_ids: Vec<u32> =
+                        vec![0; (search_params.recall_at as usize) * num_queries];
+                    let mut result_dists: Vec<f32> =
+                        vec![0.0; (search_params.recall_at as usize) * num_queries];
+
+                    let start = Instant::now();
+
+                    let mut l_span = {
+                        let tracer = global::tracer("");
+                        let span_name =
+                            format!("pipesearch-with-L={}-bw={}", l, search_params.beam_width);
+                        tracer.start(span_name)
+                    };
+
+                    let zipped = queries
+                        .par_row_iter()
+                        .zip(result_ids.par_chunks_mut(search_params.recall_at as usize))
+                        .zip(result_dists.par_chunks_mut(search_params.recall_at as usize))
+                        .zip(statistics_vec.par_iter_mut())
+                        .zip(result_counts.par_iter_mut());
+
+                    zipped.for_each_in_pool(
+                        &pool,
+                        |((((q, id_chunk), dist_chunk), stats), rc)| {
+                            let pipe_searcher =
+                                match PipelinedSearcher::<GraphData<T>>::new(
+                                    graph_header.clone(),
+                                    pq_data.clone(),
+                                    metric,
+                                    search_io_limit,
+                                    initial_beam_width,
+                                    relaxed_monotonicity_l,
+                                    disk_index_path.clone(),
+                                ) {
+                                    Ok(s) => s,
+                                    Err(e) => {
+                                        eprintln!("Failed to create PipelinedSearcher: {:?}", e);
+                                        *rc = 0;
+                                        id_chunk.fill(0);
+                                        dist_chunk.fill(0.0);
+                                        has_any_search_failed
+                                            .store(true, std::sync::atomic::Ordering::Release);
+                                        return;
+                                    }
+                                };
+
+                            match pipe_searcher.search(
+                                q,
+                                search_params.recall_at,
+                                l,
+                                search_params.beam_width,
+                            ) {
+                                Ok(search_result) => {
+                                    *stats = search_result.stats.query_statistics;
+                                    *rc = search_result.results.len() as u32;
+                                    let actual_results = search_result
+                                        .results
+                                        .len()
+                                        .min(search_params.recall_at as usize);
+                                    for (i, result_item) in search_result
+                                        .results
+                                        .iter()
+                                        .take(actual_results)
+                                        .enumerate()
+                                    {
+                                        id_chunk[i] = result_item.vertex_id;
+                                        dist_chunk[i] = result_item.distance;
+                                    }
+                                }
+                                Err(e) => {
+                                    eprintln!("PipeSearch failed for query: {:?}", e);
+                                    *rc = 0;
+                                    id_chunk.fill(0);
+                                    dist_chunk.fill(0.0);
+                                    has_any_search_failed
+                                        .store(true, std::sync::atomic::Ordering::Release);
+                                }
+                            }
+                        },
+                    );
+                    let total_time = start.elapsed();
+
+                    if has_any_search_failed.load(std::sync::atomic::Ordering::Acquire) {
+                        anyhow::bail!("One or more searches failed. See logs for details.");
+                    }

-        let search_result = DiskSearchResult::new(
-            &statistics_vec,
-            &result_ids,
-            &result_counts,
-            l,
-            total_time.as_secs_f32(),
-            num_queries,
-            &gt_context,
-        )?;
-
-        l_span.end();
-        search_results_per_l.push(search_result);
+                    let search_result = DiskSearchResult::new(
+                        &statistics_vec,
+                        &result_ids,
+                        &result_counts,
+                        l,
+                        total_time.as_secs_f32(),
+                        num_queries,
+                        &gt_context,
+                    )?;
+
+                    l_span.end();
+                    search_results_per_l.push(search_result);
+                }
+            }
+            #[cfg(not(target_os = "linux"))]
+            {
+                let _ = (initial_beam_width, relaxed_monotonicity_l);
+                anyhow::bail!("PipeSearch is only supported on Linux");
+            }
+        }
     }

     // Log search completed checkpoint
@@ -343,6 +477,7 @@ where
         is_flat_search: search_params.is_flat_search,
         distance: search_params.distance,
         uses_vector_filters: search_params.vector_filters_file.is_some(),
+        search_mode: format!("{:?}", search_params.search_mode),
         num_nodes_to_cache: search_params.num_nodes_to_cache,
         search_results_per_l,
         span_metrics,
@@ -427,6 +562,7 @@ impl fmt::Display for DiskSearchStats {
         writeln!(f, "Flat search, : {}", self.is_flat_search)?;
         writeln!(f, "Distance, : {}", self.distance)?;
         writeln!(f, "Vector filters, : {}", self.uses_vector_filters)?;
+        writeln!(f, "Search mode, : {}", self.search_mode)?;
         writeln!(
             f,
             "Nodes to cache, : {}",
diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs
index bf843d72f..557aa7aed 100644
--- a/diskann-benchmark/src/inputs/disk.rs
+++ b/diskann-benchmark/src/inputs/disk.rs
@@ -71,6 +71,27 @@ pub(crate) struct DiskIndexBuild {
     pub(crate) save_path: String,
 }

+/// Search algorithm to use for disk index search.
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+#[serde(tag = "mode")]
+pub(crate) enum SearchMode {
+    /// Standard beam search (default, current behavior).
+    #[default]
+    BeamSearch,
+    /// PipeANN pipelined search with IO/compute overlap.
+    PipeSearch {
+        /// Initial beam width before adaptive adjustment (default: 4).
+        #[serde(default = "default_initial_beam_width")]
+        initial_beam_width: usize,
+        /// Optional relaxed monotonicity parameter for early termination.
+        relaxed_monotonicity_l: Option<usize>,
+    },
+}
+
+fn default_initial_beam_width() -> usize {
+    4
+}
+
 /// Search phase configuration
 #[derive(Debug, Deserialize, Serialize)]
 pub(crate) struct DiskSearchPhase {
@@ -85,6 +106,9 @@ pub(crate) struct DiskSearchPhase {
     pub(crate) vector_filters_file: Option<String>,
     pub(crate) num_nodes_to_cache: Option<usize>,
     pub(crate) search_io_limit: Option<usize>,
+    /// Search algorithm to use (defaults to BeamSearch).
+    #[serde(default)]
+    pub(crate) search_mode: SearchMode,
 }

 /////////
@@ -234,6 +258,14 @@ impl CheckDeserialization for DiskSearchPhase {
                 anyhow::bail!("search_io_limit must be positive if specified");
             }
         }
+        match &self.search_mode {
+            SearchMode::BeamSearch => {}
+            SearchMode::PipeSearch { initial_beam_width, .. } => {
+                if *initial_beam_width == 0 {
+                    anyhow::bail!("initial_beam_width must be positive");
+                }
+            }
+        }
         Ok(())
     }
 }
@@ -272,6 +304,7 @@ impl Example for DiskIndexOperation {
             vector_filters_file: None,
             num_nodes_to_cache: None,
             search_io_limit: None,
+            search_mode: SearchMode::default(),
         };

         Self {
@@ -397,6 +430,7 @@ impl DiskSearchPhase {
             Some(lim) => write_field!(f, "Search IO Limit", format!("{lim}"))?,
             None => write_field!(f, "Search IO Limit", "none (defaults to `usize::MAX`)")?,
         }
+        write_field!(f, "Search Mode", format!("{:?}", self.search_mode))?;
         Ok(())
     }
 }
diff --git a/diskann-disk/src/data_model/graph_header.rs b/diskann-disk/src/data_model/graph_header.rs
index f04803e4a..fc7ed78bc 100644
--- a/diskann-disk/src/data_model/graph_header.rs
+++ b/diskann-disk/src/data_model/graph_header.rs
@@ -12,6 +12,7 @@ use thiserror::Error;
 use super::{GraphLayoutVersion, GraphMetadata};

 /// GraphHeader. The header is stored in the first sector of the disk index file, or the first segment of the JET stream.
+#[derive(Clone)]
 pub struct GraphHeader {
     // Graph metadata.
     metadata: GraphMetadata,
diff --git a/diskann-disk/src/search/mod.rs b/diskann-disk/src/search/mod.rs
index 915956ad4..2a4009504 100644
--- a/diskann-disk/src/search/mod.rs
+++ b/diskann-disk/src/search/mod.rs
@@ -7,3 +7,6 @@

 pub mod provider;
 pub mod traits;
+
+#[cfg(target_os = "linux")]
+pub mod pipelined;
diff --git a/diskann-disk/src/search/pipelined/mod.rs b/diskann-disk/src/search/pipelined/mod.rs
new file mode 100644
index 000000000..31b9c4b63
--- /dev/null
+++ b/diskann-disk/src/search/pipelined/mod.rs
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Pipelined search module implementing the PipeANN algorithm.
+//!
+//! This module provides a pipelined disk search that overlaps IO and compute
+//! within a single query, using io_uring for non-blocking IO on Linux.
+
+#[cfg(target_os = "linux")]
+mod pipelined_reader;
+#[cfg(target_os = "linux")]
+pub use pipelined_reader::PipelinedReader;
+
+#[cfg(target_os = "linux")]
+mod pipelined_search;
+
+#[cfg(target_os = "linux")]
+mod pipelined_searcher;
+#[cfg(target_os = "linux")]
+pub use pipelined_searcher::PipelinedSearcher;
diff --git a/diskann-disk/src/search/pipelined/pipelined_reader.rs b/diskann-disk/src/search/pipelined/pipelined_reader.rs
new file mode 100644
index 000000000..e32931b4a
--- /dev/null
+++ b/diskann-disk/src/search/pipelined/pipelined_reader.rs
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Pipelined IO reader using io_uring with non-blocking submit/poll semantics.
+
+use std::{
+    fs::OpenOptions,
+    os::{fd::AsRawFd, unix::fs::OpenOptionsExt},
+};
+
+use diskann::{ANNError, ANNResult};
+use diskann_providers::common::AlignedBoxWithSlice;
+use io_uring::IoUring;
+
+/// Maximum number of concurrent IO operations supported by the ring.
+pub const MAX_IO_CONCURRENCY: usize = 128;
+
+/// A pipelined IO reader that wraps `io_uring` for non-blocking submit/poll.
+///
+/// Unlike `LinuxAlignedFileReader` which uses `submit_and_wait` (blocking),
+/// this reader submits reads and polls completions independently, enabling
+/// IO/compute overlap within a single search query.
+pub struct PipelinedReader {
+    ring: IoUring,
+    /// Pre-allocated sector-aligned read buffers, one per slot.
+    slot_bufs: AlignedBoxWithSlice<u8>,
+    /// Size of each slot buffer in bytes.
+    slot_size: usize,
+    /// Maximum number of slots available.
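+    /// (The searcher sizes this as twice the beam width, clamped to MAX_IO_CONCURRENCY.)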
+    max_slots: usize,
+    /// Number of currently in-flight (submitted but not completed) reads.
+    in_flight: usize,
+    /// Keep the file handle alive for the lifetime of the reader.
+    _file: std::fs::File,
+}
+
+impl PipelinedReader {
+    /// Create a new pipelined reader.
+    ///
+    /// # Arguments
+    /// * `file_path` - Path to the disk index file.
+    /// * `max_slots` - Number of buffer slots (must be <= MAX_IO_CONCURRENCY).
+    /// * `slot_size` - Size of each buffer slot in bytes (should be sector-aligned).
+    /// * `alignment` - Memory alignment for the buffer (typically 4096 for O_DIRECT).
+    pub fn new(
+        file_path: &str,
+        max_slots: usize,
+        slot_size: usize,
+        alignment: usize,
+    ) -> ANNResult<Self> {
+        let file = OpenOptions::new()
+            .read(true)
+            .custom_flags(libc::O_DIRECT)
+            .open(file_path)
+            .map_err(ANNError::log_io_error)?;
+
+        let ring = IoUring::new(max_slots.min(MAX_IO_CONCURRENCY) as u32)?;
+        let fd = file.as_raw_fd();
+        ring.submitter().register_files(std::slice::from_ref(&fd))?;
+
+        let slot_bufs = AlignedBoxWithSlice::new(max_slots * slot_size, alignment)?;
+
+        Ok(Self {
+            ring,
+            slot_bufs,
+            slot_size,
+            max_slots,
+            in_flight: 0,
+            _file: file,
+        })
+    }
+
+    /// Submit an asynchronous read into the buffer at `slot_id`.
+    ///
+    /// The read will fetch `slot_size` bytes from `sector_offset` (in bytes) into
+    /// the pre-allocated buffer for the given slot. The `slot_id` is stored as
+    /// `user_data` in the CQE for later retrieval.
+    pub fn submit_read(&mut self, sector_offset: u64, slot_id: usize) -> ANNResult<()> {
+        assert!(slot_id < self.max_slots, "slot_id out of range");
+
+        let buf_start = slot_id * self.slot_size;
+        let buf_ptr = self.slot_bufs[buf_start..buf_start + self.slot_size].as_mut_ptr();
+
+        let read_op = io_uring::opcode::Read::new(
+            io_uring::types::Fixed(0),
+            buf_ptr,
+            self.slot_size as u32,
+        )
+        .offset(sector_offset)
+        .build()
+        .user_data(slot_id as u64);
+
+        // SAFETY: The buffer at slot_id is pre-allocated and will remain valid
+        // for the duration of the IO operation. Each slot is used exclusively
+        // (caller must not reuse a slot while it is in-flight).
+        unsafe {
+            self.ring
+                .submission()
+                .push(&read_op)
+                .map_err(ANNError::log_push_error)?;
+        }
+
+        self.ring.submit()?;
+        self.in_flight += 1;
+        Ok(())
+    }
+
+    /// Non-blocking poll of completed IO operations.
+    ///
+    /// Returns the slot_ids of all completed reads since the last poll.
+    pub fn poll_completions(&mut self) -> ANNResult<Vec<usize>> {
+        let mut completed = Vec::new();
+        for cqe in self.ring.completion() {
+            if cqe.result() < 0 {
+                self.in_flight = self.in_flight.saturating_sub(1);
+                return Err(ANNError::log_io_error(std::io::Error::from_raw_os_error(
+                    -cqe.result(),
+                )));
+            }
+            let slot_id = cqe.user_data() as usize;
+            completed.push(slot_id);
+            self.in_flight = self.in_flight.saturating_sub(1);
+        }
+        Ok(completed)
+    }
+
+    /// Returns the read buffer for a completed slot.
+    pub fn get_slot_buf(&self, slot_id: usize) -> &[u8] {
+        let start = slot_id * self.slot_size;
+        &self.slot_bufs[start..start + self.slot_size]
+    }
+
+    /// Returns the number of submitted but not yet completed reads.
+    pub fn in_flight_count(&self) -> usize {
+        self.in_flight
+    }
+
+    /// Returns the slot size in bytes.
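+    /// (Callers size slots as num_sectors_per_node * block_size, so one slot holds one node.)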
+    pub fn slot_size(&self) -> usize {
+        self.slot_size
+    }
+}
diff --git a/diskann-disk/src/search/pipelined/pipelined_search.rs b/diskann-disk/src/search/pipelined/pipelined_search.rs
new file mode 100644
index 000000000..672d21a8e
--- /dev/null
+++ b/diskann-disk/src/search/pipelined/pipelined_search.rs
@@ -0,0 +1,795 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Core PipeANN pipelined search algorithm.
+
+use std::collections::{HashMap, HashSet, VecDeque};
+use std::time::Instant;
+
+use byteorder::{ByteOrder, LittleEndian};
+use diskann::{utils::VectorRepr, ANNResult};
+use diskann_providers::model::{compute_pq_distance, pq::quantizer_preprocess, PQData, PQScratch};
+use diskann_vector::{distance::Metric, DistanceFunction};
+
+use super::pipelined_reader::PipelinedReader;
+
+/// A candidate in the sorted candidate pool.
+struct Candidate {
+    id: u32,
+    distance: f32,
+    /// true = unvisited and not in-flight, false = in-flight or already-read
+    flag: bool,
+    /// true = node has been processed (neighbors expanded)
+    visited: bool,
+}
+
+/// Tracks an in-flight IO request.
+struct InFlightIo {
+    vertex_id: u32,
+    slot_id: usize,
+}
+
+/// A loaded node parsed from sector data.
+struct LoadedNode {
+    fp_vector: Vec<u8>,
+    adjacency_list: Vec<u32>,
+}
+
+/// Result of a pipelined search.
+pub struct PipeSearchResult {
+    pub ids: Vec<u32>,
+    pub distances: Vec<f32>,
+    pub stats: PipeSearchStats,
+}
+
+/// Statistics for a pipelined search.
+pub struct PipeSearchStats {
+    pub total_us: u128,
+    pub io_us: u128,
+    pub cpu_us: u128,
+    pub io_count: u32,
+    pub comparisons: u32,
+    pub hops: u32,
+}
+
+/// Compute the sector index that contains a given vertex.
+#[inline]
+fn node_sector_index(
+    vertex_id: u32,
+    num_nodes_per_sector: u64,
+    num_sectors_per_node: usize,
+) -> u64 {
+    1 + if num_nodes_per_sector > 0 {
+        vertex_id as u64 / num_nodes_per_sector
+    } else {
+        vertex_id as u64 * num_sectors_per_node as u64
+    }
+}
+
+/// Compute the byte offset of a node within its sector.
+#[inline]
+fn node_offset_in_sector(vertex_id: u32, num_nodes_per_sector: u64, node_len: u64) -> usize {
+    if num_nodes_per_sector == 0 {
+        0
+    } else {
+        (vertex_id as u64 % num_nodes_per_sector * node_len) as usize
+    }
+}
+
+/// Parse a node from raw sector buffer bytes.
+fn parse_node(
+    sector_buf: &[u8],
+    vertex_id: u32,
+    num_nodes_per_sector: u64,
+    node_len: u64,
+    fp_vector_len: u64,
+) -> LoadedNode {
+    let offset = node_offset_in_sector(vertex_id, num_nodes_per_sector, node_len);
+    let node_data = &sector_buf[offset..offset + node_len as usize];
+
+    let fp_vector = node_data[..fp_vector_len as usize].to_vec();
+
+    let neighbor_data = &node_data[fp_vector_len as usize..];
+    let num_neighbors = LittleEndian::read_u32(&neighbor_data[..4]) as usize;
+    let mut adjacency_list = Vec::with_capacity(num_neighbors);
+    for i in 0..num_neighbors {
+        let start = 4 + i * 4;
+        adjacency_list.push(LittleEndian::read_u32(&neighbor_data[start..start + 4]));
+    }
+
+    LoadedNode {
+        fp_vector,
+        adjacency_list,
+    }
+}
+
+/// Insert a candidate into the sorted retset, maintaining sort order by distance.
+/// Returns the insertion position.
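+/// For example, inserting distance 2.0 into a pool holding [1.0, 3.0, 5.0] shifts
+/// the last two entries right and returns position 1.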
+fn insert_into_pool(retset: &mut Vec<Candidate>, pool_size: &mut usize, candidate: Candidate) -> usize {
+    // Binary search for insertion point
+    let pos = retset[..*pool_size]
+        .binary_search_by(|probe| {
+            probe
+                .distance
+                .partial_cmp(&candidate.distance)
+                .unwrap_or(std::cmp::Ordering::Equal)
+        })
+        .unwrap_or_else(|x| x);
+
+    // If pool is full and candidate is worse than all existing, don't insert
+    if pos >= retset.len() {
+        return pos;
+    }
+
+    // Make room if needed
+    if *pool_size >= retset.len() {
+        retset.resize_with(retset.len() * 2, || Candidate {
+            id: 0,
+            distance: f32::MAX,
+            flag: false,
+            visited: false,
+        });
+    }
+
+    // Shift elements right
+    let end = (*pool_size).min(retset.len() - 1);
+    for i in (pos..end).rev() {
+        retset.swap(i, i + 1);
+    }
+    retset[pos] = candidate;
+
+    pos
+}
+
+/// Core pipelined search function implementing the PipeANN algorithm.
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn pipe_search<T: VectorRepr>(
+    reader: &mut PipelinedReader,
+    pq_data: &PQData,
+    distance_comparer: &T::Distance,
+    query: &[T],
+    k: usize,
+    search_l: usize,
+    beam_width: usize,
+    medoid: u32,
+    dims: usize,
+    node_len: u64,
+    num_nodes_per_sector: u64,
+    block_size: usize,
+    fp_vector_len: u64,
+    pq_scratch: &mut PQScratch,
+    relaxed_monotonicity_l: Option<usize>,
+    metric: Metric,
+) -> ANNResult<PipeSearchResult> {
+    let timer = Instant::now();
+    let mut io_count: u32 = 0;
+    let mut comparisons: u32 = 0;
+    let mut hops: u32 = 0;
+    let mut io_time = std::time::Duration::ZERO;
+    let mut cpu_time = std::time::Duration::ZERO;
+
+    let num_pq_chunks = pq_data.get_num_chunks();
+    let pq_compressed = pq_data.pq_compressed_data().get_data();
+
+    let num_sectors_per_node = if num_nodes_per_sector > 0 {
+        1
+    } else {
+        (node_len as usize).div_ceil(block_size)
+    };
+
+    // Prepare PQ distance table for the query and compute PQ distance to medoid
+    pq_scratch.set(dims, query, 1.0)?;
+    let medoid_ids = [medoid];
+    quantizer_preprocess(pq_scratch, pq_data, metric, &medoid_ids)?;
+    let medoid_dist = pq_scratch.aligned_dist_scratch[0];
+
+    // Initialize candidate pool
+    let initial_cap = search_l * 2 + 10;
+    let mut retset: Vec<Candidate> = Vec::with_capacity(initial_cap);
+    for _ in 0..initial_cap {
+        retset.push(Candidate {
+            id: 0,
+            distance: f32::MAX,
+            flag: false,
+            visited: false,
+        });
+    }
+    retset[0] = Candidate {
+        id: medoid,
+        distance: medoid_dist,
+        flag: true,
+        visited: false,
+    };
+    let mut cur_list_size: usize = 1;
+
+    let mut visited = HashSet::new();
+    visited.insert(medoid);
+
+    let mut full_retset: Vec<(u32, f32)> = Vec::with_capacity(search_l * 2);
+
+    let mut on_flight_ios: VecDeque<InFlightIo> = VecDeque::new();
+    let mut id_buf_map: HashMap<u32, LoadedNode> = HashMap::new();
+    let mut next_slot_id: usize = 0;
+
+    let mut cur_beam_width: usize = beam_width.min(4);
+    let mut max_marker: usize = 0;
+    let mut cur_n_in: usize = 0;
+    let mut cur_tot: usize = 0;
+    let mut converge_size: i64 = -1;
+
+    // Helper logic is inlined in the loop bodies below rather than factored into closures.
+
+    // Submit initial reads
+    {
+        let io_start = Instant::now();
+        let to_send = cur_beam_width.saturating_sub(on_flight_ios.len());
+        let mut n_sent = 0;
+        let mut marker = 0;
+        while marker < cur_list_size && n_sent < to_send {
+            if retset[marker].flag && !id_buf_map.contains_key(&retset[marker].id) {
+                // Send read for this candidate
+                let vid = retset[marker].id;
+                retset[marker].flag = false;
+
+                let sector_idx =
+                    node_sector_index(vid, num_nodes_per_sector, num_sectors_per_node);
+                let sector_offset = sector_idx * block_size as u64;
+                let slot_id = next_slot_id % max_slots(beam_width);
+                reader.submit_read(sector_offset, slot_id)?;
+                on_flight_ios.push_back(InFlightIo {
+                    vertex_id: vid,
+                    slot_id,
+                });
+                next_slot_id = (next_slot_id + 1) % max_slots(beam_width);
+                io_count += 1;
+                n_sent += 1;
+            }
+            marker += 1;
+        }
+        io_time += io_start.elapsed();
+    }
+
+    // Main search loop
+    loop {
+        // Check if there's a first unvisited candidate
+        let first_unvisited = retset[..cur_list_size]
+            .iter()
+            .position(|c| !c.visited);
+        if first_unvisited.is_none() {
+            break;
+        }
+
+        // Poll completions
+        let io_poll_start = Instant::now();
+        let completed_slots = reader.poll_completions()?;
+        io_time += io_poll_start.elapsed();
+        let mut n_in: usize = 0;
+        let mut n_out: usize = 0;
+
+        // Process completed IOs: move from on_flight to id_buf_map
+        if !completed_slots.is_empty() {
+            let completed_set: HashSet<usize> = completed_slots.into_iter().collect();
+            let mut remaining = VecDeque::new();
+            while let Some(io) = on_flight_ios.pop_front() {
+                if completed_set.contains(&io.slot_id) {
+                    let sector_buf = reader.get_slot_buf(io.slot_id);
+                    let node = parse_node(
+                        sector_buf,
+                        io.vertex_id,
+                        num_nodes_per_sector,
+                        node_len,
+                        fp_vector_len,
+                    );
+                    // Track convergence: is this node still in the top of retset?
+                    if cur_list_size > 0 {
+                        let last_dist = retset[cur_list_size - 1].distance;
+                        // Find this node's PQ distance in retset
+                        let in_pool = retset[..cur_list_size]
+                            .iter()
+                            .any(|c| c.id == io.vertex_id && c.distance <= last_dist);
+                        if in_pool {
+                            n_in += 1;
+                        } else {
+                            n_out += 1;
+                        }
+                    }
+                    id_buf_map.insert(io.vertex_id, node);
+                } else {
+                    remaining.push_back(io);
+                }
+            }
+            on_flight_ios = remaining;
+        }
+
+        // Track convergence and adjust beam width
+        if max_marker >= 5 && (n_in + n_out) > 0 {
+            cur_n_in += n_in;
+            cur_tot += n_in + n_out;
+            const WASTE_THRESHOLD: f64 = 0.1;
+            if (cur_tot - cur_n_in) as f64 / cur_tot as f64 <= WASTE_THRESHOLD {
+                cur_beam_width = (cur_beam_width + 1).max(4).min(beam_width);
+            }
+            if let Some(rm_l) = relaxed_monotonicity_l {
+                if rm_l > 0 && converge_size < 0 {
+                    converge_size = full_retset.len() as i64;
+                }
+            }
+        }
+
+        // Check relaxed monotonicity termination
+        if let Some(rm_l) = relaxed_monotonicity_l {
+            if rm_l > 0
+                && converge_size >= 0
+                && full_retset.len() >= (converge_size as usize) + rm_l
+            {
+                break;
+            }
+        }
+
+        // Submit more reads if room
+        if on_flight_ios.len() < cur_beam_width {
+            let io_submit_start = Instant::now();
+            let to_send = 1;
+            let mut n_sent = 0;
+            let mut marker = 0;
+            while marker < cur_list_size && n_sent < to_send {
+                let c = &retset[marker];
+                if c.flag && !id_buf_map.contains_key(&c.id) {
+                    let vid = retset[marker].id;
+                    retset[marker].flag = false;
+
+                    let sector_idx =
+                        node_sector_index(vid, num_nodes_per_sector, num_sectors_per_node);
+                    let sector_offset = sector_idx * block_size as u64;
+                    let slot_id = next_slot_id % max_slots(beam_width);
+                    reader.submit_read(sector_offset, slot_id)?;
+                    on_flight_ios.push_back(InFlightIo {
+                        vertex_id: vid,
+                        slot_id,
+                    });
+                    next_slot_id = (next_slot_id + 1) % max_slots(beam_width);
+                    io_count += 1;
+                    n_sent += 1;
+                }
+                marker += 1;
+            }
+            io_time += io_submit_start.elapsed();
+        }
+
+        // calc_best_node: find one node in id_buf_map that's in retset and unvisited, process it
+        let cpu_start = Instant::now();
+        let mut best_marker = cur_list_size;
+        let calc_limit = cur_list_size;
+        #[allow(clippy::needless_range_loop)]
+        for i in 0..calc_limit {
+            if !retset[i].visited && id_buf_map.contains_key(&retset[i].id) {
+                retset[i].flag = false;
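+                // Clearing `flag` prevents this entry from being re-submitted for IO;
+                // marking it `visited` below ensures its neighbors are expanded once.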
+                retset[i].visited = true;
+                let vid = retset[i].id;
+                hops += 1;
+
+                if let Some(node) = id_buf_map.get(&vid) {
+                    // Compute full-precision distance
+                    let fp_vec: &[T] = bytemuck::cast_slice(&node.fp_vector);
+                    let fp_dist = distance_comparer.evaluate_similarity(query, fp_vec);
+                    full_retset.push((vid, fp_dist));
+
+                    // Expand neighbors
+                    let mut nbors_to_compute: Vec<u32> = Vec::new();
+                    for &nbr_id in &node.adjacency_list {
+                        if visited.insert(nbr_id) {
+                            nbors_to_compute.push(nbr_id);
+                        }
+                    }
+
+                    if !nbors_to_compute.is_empty() {
+                        comparisons += nbors_to_compute.len() as u32;
+                        // Compute PQ distances for unvisited neighbors
+                        compute_pq_distance(
+                            &nbors_to_compute,
+                            num_pq_chunks,
+                            &pq_scratch.aligned_pqtable_dist_scratch,
+                            pq_compressed,
+                            &mut pq_scratch.aligned_pq_coord_scratch,
+                            &mut pq_scratch.aligned_dist_scratch,
+                        )?;
+
+                        let mut nk = cur_list_size;
+                        for (m, &nbr_id) in nbors_to_compute.iter().enumerate() {
+                            let nbr_dist = pq_scratch.aligned_dist_scratch[m];
+                            if cur_list_size == search_l
+                                && nbr_dist >= retset[cur_list_size - 1].distance
+                            {
+                                continue;
+                            }
+                            let nn = Candidate {
+                                id: nbr_id,
+                                distance: nbr_dist,
+                                flag: true,
+                                visited: false,
+                            };
+                            let r = insert_into_pool(&mut retset, &mut cur_list_size, nn);
+                            if cur_list_size < search_l {
+                                cur_list_size += 1;
+                            }
+                            if r < nk {
+                                nk = r;
+                            }
+                        }
+                    }
+                }
+
+                // Find first_unvisited_eager for convergence tracking
+                for (j, c) in retset.iter().enumerate().take(cur_list_size) {
+                    if !c.visited && c.flag && !id_buf_map.contains_key(&c.id) {
+                        best_marker = j;
+                        break;
+                    }
+                }
+                break;
+            }
+        }
+        max_marker = max_marker.max(best_marker);
+        cpu_time += cpu_start.elapsed();
+    }
+
+    // In relaxed monotonicity mode: drain remaining IOs and process unvisited nodes
+    if relaxed_monotonicity_l.is_some_and(|l| l > 0) {
+        // Drain all in-flight IOs
+        while !on_flight_ios.is_empty() {
+            let completed_slots = reader.poll_completions()?;
+            if !completed_slots.is_empty() {
+                let completed_set: HashSet<usize> = completed_slots.into_iter().collect();
+                let mut remaining = VecDeque::new();
+                while let Some(io) = on_flight_ios.pop_front() {
+                    if completed_set.contains(&io.slot_id) {
+                        let sector_buf = reader.get_slot_buf(io.slot_id);
+                        let node = parse_node(
+                            sector_buf,
+                            io.vertex_id,
+                            num_nodes_per_sector,
+                            node_len,
+                            fp_vector_len,
+                        );
+                        id_buf_map.insert(io.vertex_id, node);
+                    } else {
+                        remaining.push_back(io);
+                    }
+                }
+                on_flight_ios = remaining;
+            }
+        }
+        // Process remaining unvisited nodes
+        for c in retset.iter_mut().take(cur_list_size) {
+            if !c.visited {
+                if let Some(node) = id_buf_map.get(&c.id) {
+                    c.visited = true;
+                    let fp_vec: &[T] = bytemuck::cast_slice(&node.fp_vector);
+                    let fp_dist = distance_comparer.evaluate_similarity(query, fp_vec);
+                    full_retset.push((c.id, fp_dist));
+                }
+            }
+        }
+    }
+
+    // Sort full_retset and return top-k
+    full_retset.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
+
+    // Deduplicate
+    let mut ids = Vec::with_capacity(k);
+    let mut distances = Vec::with_capacity(k);
+    let mut seen = HashSet::new();
+    for (id, dist) in &full_retset {
+        if ids.len() >= k {
+            break;
+        }
+        if seen.insert(*id) {
+            ids.push(*id);
+            distances.push(*dist);
+        }
+    }
+
+    let total_us = timer.elapsed().as_micros();
+
+    Ok(PipeSearchResult {
+        ids,
+        distances,
+        stats: PipeSearchStats {
+            total_us,
+            io_us: io_time.as_micros(),
+            cpu_us: cpu_time.as_micros(),
+            io_count,
+            comparisons,
+            hops,
+        },
+    })
+}
+
+/// Max buffer slots to use, based on beam width.
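+/// Twice the beam width, clamped to [16, MAX_IO_CONCURRENCY].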
+#[inline]
+fn max_slots(beam_width: usize) -> usize {
+    (beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // ---- helpers ----
+
+    fn make_candidate(id: u32, distance: f32) -> Candidate {
+        Candidate {
+            id,
+            distance,
+            flag: true,
+            visited: false,
+        }
+    }
+
+    fn empty_pool(cap: usize) -> Vec<Candidate> {
+        (0..cap)
+            .map(|_| Candidate {
+                id: 0,
+                distance: f32::MAX,
+                flag: false,
+                visited: false,
+            })
+            .collect()
+    }
+
+    fn pool_distances(retset: &[Candidate], pool_size: usize) -> Vec<f32> {
+        retset[..pool_size].iter().map(|c| c.distance).collect()
+    }
+
+    fn pool_ids(retset: &[Candidate], pool_size: usize) -> Vec<u32> {
+        retset[..pool_size].iter().map(|c| c.id).collect()
+    }
+
+    // ---- insert_into_pool tests ----
+
+    #[test]
+    fn test_insert_into_pool_empty() {
+        let mut retset = empty_pool(8);
+        let mut pool_size: usize = 0;
+        let pos = insert_into_pool(&mut retset, &mut pool_size, make_candidate(1, 0.5));
+        // Pool was empty, should insert at position 0.
+        assert_eq!(pos, 0);
+        assert_eq!(retset[0].id, 1);
+        assert_eq!(retset[0].distance, 0.5);
+    }
+
+    #[test]
+    fn test_insert_into_pool_front() {
+        let mut retset = empty_pool(8);
+        let mut pool_size: usize = 0;
+        insert_into_pool(&mut retset, &mut pool_size, make_candidate(10, 5.0));
+        pool_size += 1;
+        insert_into_pool(&mut retset, &mut pool_size, make_candidate(20, 3.0));
+        pool_size += 1;
+
+        // Candidate with lowest distance should go to front.
+        let pos = insert_into_pool(&mut retset, &mut pool_size, make_candidate(30, 1.0));
+        pool_size += 1;
+        assert_eq!(pos, 0);
+        assert_eq!(pool_ids(&retset, pool_size), vec![30, 20, 10]);
+        assert_eq!(pool_distances(&retset, pool_size), vec![1.0, 3.0, 5.0]);
+    }
+
+    #[test]
+    fn test_insert_into_pool_end() {
+        let mut retset = empty_pool(8);
+        let mut pool_size: usize = 0;
+        insert_into_pool(&mut retset, &mut pool_size, make_candidate(1, 1.0));
+        pool_size += 1;
+        insert_into_pool(&mut retset, &mut pool_size, make_candidate(2, 2.0));
+        pool_size += 1;
+
+        let pos = insert_into_pool(&mut retset, &mut pool_size, make_candidate(3, 10.0));
+        pool_size += 1;
+        assert_eq!(pos, 2);
+        assert_eq!(pool_distances(&retset, pool_size), vec![1.0, 2.0, 10.0]);
+    }
+
+    #[test]
+    fn test_insert_into_pool_at_capacity_better_candidate() {
+        // Capacity = 4, pool full with 4 items. Insert one that is better.
+        let mut retset = empty_pool(4);
+        let mut pool_size: usize = 0;
+        for (id, d) in [(1, 1.0), (2, 3.0), (3, 5.0), (4, 7.0)] {
+            insert_into_pool(&mut retset, &mut pool_size, make_candidate(id, d));
+            pool_size += 1;
+        }
+        assert_eq!(pool_size, 4);
+
+        // Pool is at capacity (pool_size == retset.len()), insert a better candidate.
+        // insert_into_pool should grow the buffer to make room.
+        let pos = insert_into_pool(&mut retset, &mut pool_size, make_candidate(5, 2.0));
+        assert_eq!(pos, 1);
+        // The pool buffer should have grown and the element is in sorted order.
+        assert!(retset.len() >= 5);
+        assert_eq!(retset[0].id, 1);
+        assert_eq!(retset[1].id, 5);
+        assert_eq!(retset[1].distance, 2.0);
+    }
+
+    #[test]
+    fn test_insert_into_pool_at_capacity_worse_candidate() {
+        // Capacity = 4, pool full. Insert a candidate worse than all existing.
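+        // insert_into_pool should report an insertion point past the end of the
+        // full buffer and return early without inserting.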
+        let mut retset = empty_pool(4);
+        let mut pool_size: usize = 0;
+        for (id, d) in [(1, 1.0), (2, 3.0), (3, 5.0), (4, 7.0)] {
+            insert_into_pool(&mut retset, &mut pool_size, make_candidate(id, d));
+            pool_size += 1;
+        }
+
+        // Candidate 99 (distance 100.0) sorts after every real entry, so its
+        // insertion point is 4, which is past the end of the full buffer.
+        let pos = insert_into_pool(&mut retset, &mut pool_size, make_candidate(99, 100.0));
+        // pos is 4 (after the last real element); the pool is left unchanged.
+        assert_eq!(pos, 4);
+    }
+
+    #[test]
+    fn test_insert_into_pool_maintains_sort_order() {
+        let mut retset = empty_pool(16);
+        let mut pool_size: usize = 0;
+        let distances = [5.0, 1.0, 3.0, 7.0, 2.0, 6.0, 4.0];
+        for (i, &d) in distances.iter().enumerate() {
+            insert_into_pool(&mut retset, &mut pool_size, make_candidate(i as u32, d));
+            pool_size += 1;
+        }
+        let dists = pool_distances(&retset, pool_size);
+        for w in dists.windows(2) {
+            assert!(w[0] <= w[1], "Pool not sorted: {:?}", dists);
+        }
+        assert_eq!(dists, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
+    }
+
+    // ---- parse_node tests ----
+
+    /// Build a fake sector buffer containing a single node at a given offset.
+    fn build_sector_buf(
+        offset: usize,
+        fp_vector: &[u8],
+        neighbors: &[u32],
+        total_size: usize,
+    ) -> Vec<u8> {
+        let node_len = fp_vector.len() + 4 + neighbors.len() * 4;
+        let mut buf = vec![0u8; total_size.max(offset + node_len)];
+        buf[offset..offset + fp_vector.len()].copy_from_slice(fp_vector);
+        let neigh_offset = offset + fp_vector.len();
+        LittleEndian::write_u32(
+            &mut buf[neigh_offset..neigh_offset + 4],
+            neighbors.len() as u32,
+        );
+        for (i, &n) in neighbors.iter().enumerate() {
+            let start = neigh_offset + 4 + i * 4;
+            LittleEndian::write_u32(&mut buf[start..start + 4], n);
+        }
+        buf
+    }
+
+    #[test]
+    fn test_parse_node_basic() {
+        let fp_vec = vec![1u8, 2, 3, 4, 5, 6, 7, 8]; // 8-byte vector
+        let neighbors = vec![10u32, 20, 30];
+        let fp_vector_len = fp_vec.len() as u64;
+        let node_len = fp_vector_len + 4 + 3 * 4; // vec + count + 3 neighbors
+
+        let buf = build_sector_buf(0, &fp_vec, &neighbors, 4096);
+        let node = parse_node(&buf, 0, 1, node_len, fp_vector_len);
+
+        assert_eq!(node.fp_vector, fp_vec);
+        assert_eq!(node.adjacency_list, vec![10, 20, 30]);
+    }
+
+    #[test]
+    fn test_parse_node_multi_node_per_sector() {
+        let fp_vector_len = 8u64;
+        let node_len = fp_vector_len + 4 + 2 * 4; // 8-byte vec, 2 neighbors
+        let num_nodes_per_sector = 4u64;
+
+        // Place 4 nodes in the sector, each with different data.
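+        // Node i lives at byte offset i * node_len, matching node_offset_in_sector.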
+        let mut buf = vec![0u8; 4096];
+        for node_idx in 0u32..4 {
+            let offset = (node_idx as u64 * node_len) as usize;
+            let fp_vec: Vec<u8> = (0..8).map(|b| b + (node_idx as u8) * 10).collect();
+            let neighbors = vec![100 + node_idx, 200 + node_idx];
+            let partial = build_sector_buf(0, &fp_vec, &neighbors, node_len as usize);
+            buf[offset..offset + node_len as usize]
+                .copy_from_slice(&partial[..node_len as usize]);
+        }
+
+        // Parse node at index 2 (vertex_id=2 within same sector)
+        let node = parse_node(&buf, 2, num_nodes_per_sector, node_len, fp_vector_len);
+        let expected_fp: Vec<u8> = (0..8).map(|b| b + 20).collect();
+        assert_eq!(node.fp_vector, expected_fp);
+        assert_eq!(node.adjacency_list, vec![102, 202]);
+    }
+
+    #[test]
+    fn test_parse_node_zero_neighbors() {
+        let fp_vec = vec![42u8; 16];
+        let fp_vector_len = 16u64;
+        let neighbors: Vec<u32> = vec![];
+        let node_len = fp_vector_len + 4; // vec + count only
+
+        let buf = build_sector_buf(0, &fp_vec, &neighbors, 4096);
+        let node = parse_node(&buf, 0, 1, node_len, fp_vector_len);
+
+        assert_eq!(node.fp_vector, vec![42u8; 16]);
+        assert!(node.adjacency_list.is_empty());
+    }
+
+    // ---- node_sector_index tests ----
+
+    #[test]
+    fn test_node_sector_index_multi_node_per_sector() {
+        let num_nodes_per_sector = 4u64;
+        let num_sectors_per_node = 1usize;
+
+        // Matches disk_sector_graph.rs: sector = 1 + vertex_id / num_nodes_per_sector
+        assert_eq!(node_sector_index(0, num_nodes_per_sector, num_sectors_per_node), 1);
+        assert_eq!(node_sector_index(3, num_nodes_per_sector, num_sectors_per_node), 1);
+        assert_eq!(node_sector_index(4, num_nodes_per_sector, num_sectors_per_node), 2);
+        assert_eq!(node_sector_index(5, num_nodes_per_sector, num_sectors_per_node), 2);
+        assert_eq!(node_sector_index(7, num_nodes_per_sector, num_sectors_per_node), 2);
+        assert_eq!(node_sector_index(8, num_nodes_per_sector, num_sectors_per_node), 3);
+        assert_eq!(node_sector_index(1023, num_nodes_per_sector, num_sectors_per_node), 256);
+        assert_eq!(node_sector_index(1024, num_nodes_per_sector, num_sectors_per_node), 257);
+    }
+
+    #[test]
+    fn test_node_sector_index_multi_sector_per_node() {
+        let num_nodes_per_sector = 0u64;
+        let num_sectors_per_node = 2usize;
+
+        // sector = 1 + vertex_id * num_sectors_per_node
+        assert_eq!(node_sector_index(0, num_nodes_per_sector, num_sectors_per_node), 1);
+        assert_eq!(node_sector_index(3, num_nodes_per_sector, num_sectors_per_node), 7);
+        assert_eq!(node_sector_index(4, num_nodes_per_sector, num_sectors_per_node), 9);
+        assert_eq!(node_sector_index(5, num_nodes_per_sector, num_sectors_per_node), 11);
+        assert_eq!(node_sector_index(7, num_nodes_per_sector, num_sectors_per_node), 15);
+        assert_eq!(node_sector_index(8, num_nodes_per_sector, num_sectors_per_node), 17);
+        assert_eq!(node_sector_index(1023, num_nodes_per_sector, num_sectors_per_node), 2047);
+        assert_eq!(node_sector_index(1024, num_nodes_per_sector, num_sectors_per_node), 2049);
+    }
+
+    // ---- node_offset_in_sector tests ----
+
+    #[test]
+    fn test_node_offset_multi_node_per_sector() {
+        let num_nodes_per_sector = 4u64;
+        let node_len = 256u64;
+
+        // offset = (vertex_id % num_nodes_per_sector) * node_len
+        assert_eq!(node_offset_in_sector(0, num_nodes_per_sector, node_len), 0);
+        assert_eq!(node_offset_in_sector(1, num_nodes_per_sector, node_len), 256);
+        assert_eq!(node_offset_in_sector(2, num_nodes_per_sector, node_len), 512);
+        assert_eq!(node_offset_in_sector(3, num_nodes_per_sector, node_len), 768);
+        assert_eq!(node_offset_in_sector(4, num_nodes_per_sector, node_len), 0); // wraps
+        assert_eq!(node_offset_in_sector(5, num_nodes_per_sector, node_len), 256);
+    }
+
+    #[test]
+    fn test_node_offset_multi_sector_per_node() {
+        // When num_nodes_per_sector is 0 (multi-sector), offset is always 0.
+        assert_eq!(node_offset_in_sector(0, 0, 8192), 0);
+        assert_eq!(node_offset_in_sector(5, 0, 8192), 0);
+        assert_eq!(node_offset_in_sector(100, 0, 8192), 0);
+    }
+
+    // ---- max_slots tests ----
+
+    #[test]
+    fn test_max_slots() {
+        // beam_width * 2 clamped to [16, MAX_IO_CONCURRENCY]
+        assert_eq!(max_slots(1), 16); // 2 clamped up to 16
+        assert_eq!(max_slots(8), 16);
+        assert_eq!(max_slots(16), 32);
+        assert_eq!(max_slots(64), 128);
+        assert_eq!(max_slots(100), 128); // 200 clamped down to 128
+    }
+}
diff --git a/diskann-disk/src/search/pipelined/pipelined_searcher.rs b/diskann-disk/src/search/pipelined/pipelined_searcher.rs
new file mode 100644
index 000000000..864eca6ff
--- /dev/null
+++ b/diskann-disk/src/search/pipelined/pipelined_searcher.rs
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Public API for pipelined disk search.
+
+use std::sync::Arc;
+
+use diskann::{utils::VectorRepr, ANNResult};
+use diskann_providers::model::{
+    graph::traits::GraphDataType, PQData, PQScratch,
+};
+use diskann_vector::distance::Metric;
+
+use crate::{
+    data_model::GraphHeader,
+    search::provider::disk_provider::{SearchResult, SearchResultItem, SearchResultStats},
+    utils::QueryStatistics,
+};
+
+use super::pipelined_reader::PipelinedReader;
+use super::pipelined_search::{pipe_search, PipeSearchResult};
+
+/// A pipelined disk index searcher implementing the PipeANN algorithm.
+///
+/// Analogous to `DiskIndexSearcher` but uses pipelined IO (non-blocking io_uring
+/// submit/poll) to overlap IO and compute within a single query.
+pub struct PipelinedSearcher<Data: GraphDataType<VectorDataType: VectorRepr>> {
+    graph_header: GraphHeader,
+    distance_comparer: <Data::VectorDataType as VectorRepr>::Distance,
+    pq_data: Arc<PQData>,
+    metric: Metric,
+    /// Maximum IO operations per search (reserved for future IO budget enforcement).
+    #[allow(dead_code)]
+    search_io_limit: usize,
+    /// Default beam width when not overridden per-query.
+    #[allow(dead_code)]
+    initial_beam_width: usize,
+    relaxed_monotonicity_l: Option<usize>,
+    disk_index_path: String,
+}
+
+impl<Data> PipelinedSearcher<Data>
+where
+    Data: GraphDataType<VectorDataType: VectorRepr>,
+{
+    /// Create a new pipelined searcher.
+    ///
+    /// # Arguments
+    /// * `graph_header` - Graph metadata from the disk index.
+    /// * `pq_data` - Shared PQ data for approximate distance computation.
+    /// * `metric` - Distance metric (L2, InnerProduct, etc.).
+    /// * `search_io_limit` - Maximum IO operations per search.
+    /// * `initial_beam_width` - Initial number of concurrent IOs (adapts during search).
+    /// * `relaxed_monotonicity_l` - Optional early termination parameter.
+    /// * `disk_index_path` - Path to the disk index file for creating readers.
+    pub fn new(
+        graph_header: GraphHeader,
+        pq_data: Arc<PQData>,
+        metric: Metric,
+        search_io_limit: usize,
+        initial_beam_width: usize,
+        relaxed_monotonicity_l: Option<usize>,
+        disk_index_path: String,
+    ) -> ANNResult<Self> {
+        let dims = graph_header.metadata().dims;
+        let distance_comparer = Data::VectorDataType::distance(metric, Some(dims));
+        Ok(Self {
+            graph_header,
+            distance_comparer,
+            pq_data,
+            metric,
+            search_io_limit,
+            initial_beam_width,
+            relaxed_monotonicity_l,
+            disk_index_path,
+        })
+    }
+
+    /// Perform a pipelined search on the disk index.
+    ///
+    /// # Arguments
+    /// * `query` - The query vector.
+    /// * `return_list_size` - Number of results to return (k).
+    /// * `search_list_size` - Size of the candidate pool (L).
+    /// * `beam_width` - Maximum beam width for pipelined IO.
+    pub fn search(
+        &self,
+        query: &[Data::VectorDataType],
+        return_list_size: u32,
+        search_list_size: u32,
+        beam_width: usize,
+    ) -> ANNResult<SearchResult<Data::AssociatedDataType>> {
+        let metadata = self.graph_header.metadata();
+        let dims = metadata.dims;
+        let node_len = metadata.node_len;
+        let num_nodes_per_sector = metadata.num_nodes_per_block;
+        let fp_vector_len =
+            (dims * std::mem::size_of::<Data::VectorDataType>()) as u64;
+        let medoid = metadata.medoid as u32;
+
+        let mut block_size = self.graph_header.block_size() as usize;
+        let version = self.graph_header.layout_version();
+        if (version.major_version() == 0 && version.minor_version() == 0) || block_size == 0 {
+            block_size = 4096;
+        }
+
+        let num_sectors_per_node = if num_nodes_per_sector > 0 {
+            1
+        } else {
+            (node_len as usize).div_ceil(block_size)
+        };
+        let slot_size = num_sectors_per_node * block_size;
+
+        let max_slots = (beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY);
+
+        // Create a per-call reader
+        let mut reader = PipelinedReader::new(
+            &self.disk_index_path,
+            max_slots,
+            slot_size,
+            block_size,
+        )?;
+
+        let graph_degree = self.graph_header.max_degree::<Data::VectorDataType>()?;
+        let num_pq_chunks = self.pq_data.get_num_chunks();
+        let num_pq_centers = self.pq_data.get_num_centers();
+
+        let mut pq_scratch = PQScratch::new(
+            graph_degree,
+            dims,
+            num_pq_chunks,
+            num_pq_centers,
+        )?;
+
+        let result: PipeSearchResult = pipe_search::<Data::VectorDataType>(
+            &mut reader,
+            &self.pq_data,
+            &self.distance_comparer,
+            query,
+            return_list_size as usize,
+            search_list_size as usize,
+            beam_width,
+            medoid,
+            dims,
+            node_len,
+            num_nodes_per_sector,
+            block_size,
+            fp_vector_len,
+            &mut pq_scratch,
+            self.relaxed_monotonicity_l,
+            self.metric,
+        )?;
+
+        let query_statistics = QueryStatistics {
+            total_execution_time_us: result.stats.total_us,
+            io_time_us: result.stats.io_us,
+            cpu_time_us: result.stats.cpu_us,
+            total_io_operations: result.stats.io_count,
+            total_comparisons: result.stats.comparisons,
+            total_vertices_loaded: result.stats.io_count,
+            search_hops: result.stats.hops,
+            ..Default::default()
+        };
+
+        let stats = SearchResultStats {
+            cmps: result.stats.comparisons,
+            result_count: result.ids.len() as u32,
+            query_statistics,
+        };
+
+        let mut results = Vec::with_capacity(result.ids.len());
+        for (id, dist) in result.ids.iter().zip(result.distances.iter()) {
+            results.push(SearchResultItem {
+                vertex_id: *id,
+                distance: *dist,
+                data: Data::AssociatedDataType::default(),
+            });
+        }
+
+        Ok(SearchResult { results, stats })
+    }
+}

From 6e6d237a55d3f6a15d3aa9bc0d6bd73a920d4708 Mon Sep 17 00:00:00 2001
From: Philip Adams
Date: Mon, 9 Feb 2026 13:44:02 -0800
Subject: [PATCH 02/46] implementation of SQPoll and more testing

---
 Cargo.lock                                     |  17 +-
 diskann-benchmark/example/pipe-search.json     |  53 ++++
 .../src/backend/disk_index/search.rs           |  52 ++--
 diskann-benchmark/src/inputs/disk.rs           |   6 +
 diskann-disk/Cargo.toml                        |   2 +-
 diskann-disk/src/build/builder/core.rs         |  89 ++++++
 diskann-disk/src/build/builder/tests.rs        |  15 +-
 diskann-disk/src/search/pipelined/mod.rs       |  10 +
 .../src/search/pipelined/pipelined_reader.rs   |  51 +++-
 .../src/search/pipelined/pipelined_search.rs   |   3 +
 .../search/pipelined/pipelined_searcher.rs     | 267 ++++++++++++++++--
 .../src/search/provider/disk_provider.rs       | 197 +++++++++++++
 diskann-platform/Cargo.toml                    |   2 +-
 13 files changed, 712 insertions(+), 52 deletions(-)

diff --git 
a/Cargo.lock b/Cargo.lock index 426fe8795..d830c9aed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -110,7 +110,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cce2075c711f351f0aa52c05e645cc41f1b3cc0cdba1ad12c5f67c121c1bb7d" dependencies = [ "cfg-if", - "io-uring", + "io-uring 0.6.4", "libc", "rand 0.8.5", "rand 0.9.2", @@ -531,7 +531,7 @@ dependencies = [ "half", "hashbrown 0.16.1", "iai-callgrind", - "io-uring", + "io-uring 0.7.11", "libc", "opentelemetry", "rand 0.9.2", @@ -586,7 +586,7 @@ dependencies = [ name = "diskann-platform" version = "0.45.0" dependencies = [ - "io-uring", + "io-uring 0.7.11", "libc", "tracing", "windows-sys 0.59.0", @@ -1294,6 +1294,17 @@ dependencies = [ "libc", ] +[[package]] +name = "io-uring" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd7bddefd0a8833b88a4b68f90dae22c7450d11b354198baee3874fd811b344" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "libc", +] + [[package]] name = "is-terminal" version = "0.4.17" diff --git a/diskann-benchmark/example/pipe-search.json b/diskann-benchmark/example/pipe-search.json index 41349a3e7..bec885b8d 100644 --- a/diskann-benchmark/example/pipe-search.json +++ b/diskann-benchmark/example/pipe-search.json @@ -78,6 +78,59 @@ } } } + }, + { + "type": "disk-index", + "content": { + "source": { + "disk-index-source": "Load", + "data_type": "float32", + "load_path": "test_data/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search" + }, + "search_phase": { + "queries": "disk_index_sample_query_10pts.fbin", + "groundtruth": "disk_index_10pts_idx_uint32_truth_search_res.bin", + "search_list": [10, 20, 40, 80], + "beam_width": 4, + "recall_at": 10, + "num_threads": 1, + "is_flat_search": false, + "distance": "squared_l2", + "vector_filters_file": null, + "search_mode": { + "mode": "PipeSearch", + "initial_beam_width": 4, + "sqpoll_idle_ms": 1000, + "iopoll": false + } + } + } + }, + { + "type": "disk-index", + "content": { + "source": { + "disk-index-source": "Load", + "data_type": "float32", + "load_path": "test_data/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search" + }, + "search_phase": { + "queries": "disk_index_sample_query_10pts.fbin", + "groundtruth": "disk_index_10pts_idx_uint32_truth_search_res.bin", + "search_list": [10, 20, 40, 80], + "beam_width": 4, + "recall_at": 10, + "num_threads": 1, + "is_flat_search": false, + "distance": "squared_l2", + "vector_filters_file": null, + "search_mode": { + "mode": "PipeSearch", + "initial_beam_width": 4, + "iopoll": true + } + } + } } ] } diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 21c2ba9dd..23a835090 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -4,7 +4,7 @@ */ use rayon::prelude::*; -use std::{collections::HashSet, fmt, sync::atomic::AtomicBool, time::Instant}; +use std::{collections::HashSet, fmt, sync::atomic::AtomicBool, sync::Arc, time::Instant}; use opentelemetry::{global, trace::Span, trace::Tracer}; use opentelemetry_sdk::trace::SdkTracerProvider; @@ -21,7 +21,7 @@ use diskann_disk::{ utils::{instrumentation::PerfLogger, statistics, AlignedFileReaderFactory, QueryStatistics}, }; #[cfg(target_os = "linux")] -use diskann_disk::search::pipelined::PipelinedSearcher; +use diskann_disk::search::pipelined::{PipelinedSearcher, PipelinedReaderConfig}; use diskann_providers::storage::StorageReadProvider; use 
diskann_providers::{
     storage::{
@@ -330,9 +330,14 @@ where
                     search_results_per_l.push(search_result);
                 }
             }
+            // PipeANN pipelined search: for read-only search on completed (static) indices only.
+            // Searcher is created once; internal ObjectPool handles per-thread scratch allocation.
+            // Build's internal search always uses BeamSearch above.
             SearchMode::PipeSearch {
                 initial_beam_width,
                 relaxed_monotonicity_l,
+                sqpoll_idle_ms,
+                iopoll,
             } => {
                 #[cfg(target_os = "linux")]
                 {
@@ -343,6 +348,23 @@ where
                     let initial_beam_width = *initial_beam_width;
                     let relaxed_monotonicity_l = *relaxed_monotonicity_l;

+                    let reader_config = PipelinedReaderConfig {
+                        sqpoll_idle_ms: *sqpoll_idle_ms,
+                        iopoll: *iopoll,
+                    };
+
+                    // Create the searcher once; the pool handles per-thread scratch allocation.
+                    let pipe_searcher = Arc::new(PipelinedSearcher::<GraphData<T>>::new(
+                        graph_header.clone(),
+                        pq_data.clone(),
+                        metric,
+                        search_io_limit,
+                        initial_beam_width,
+                        relaxed_monotonicity_l,
+                        disk_index_path.clone(),
+                        reader_config,
+                    )?);
+
                     logger.log_checkpoint("index_loaded");

                     for &l in search_params.search_list.iter() {
@@ -363,6 +385,8 @@ where
                            tracer.start(span_name)
                         };

+                        let pipe_searcher = pipe_searcher.clone(); // Arc clone for this L iteration
+
                         let zipped = queries
                             .par_row_iter()
                             .zip(result_ids.par_chunks_mut(search_params.recall_at as usize))
@@ -373,28 +397,6 @@ where
                         zipped.for_each_in_pool(
                             &pool,
                             |((((q, id_chunk), dist_chunk), stats), rc)| {
-                                let pipe_searcher =
-                                    match PipelinedSearcher::<GraphData<T>>::new(
-                                        graph_header.clone(),
-                                        pq_data.clone(),
-                                        metric,
-                                        search_io_limit,
-                                        initial_beam_width,
-                                        relaxed_monotonicity_l,
-                                        disk_index_path.clone(),
-                                    ) {
-                                        Ok(s) => s,
-                                        Err(e) => {
-                                            eprintln!("Failed to create PipelinedSearcher: {:?}", e);
-                                            *rc = 0;
-                                            id_chunk.fill(0);
-                                            dist_chunk.fill(0.0);
-                                            has_any_search_failed
-                                                .store(true, std::sync::atomic::Ordering::Release);
-                                            return;
-                                        }
-                                    };
-
                                 match pipe_searcher.search(
                                     q,
                                     search_params.recall_at,
                                     l,
                                     search_params.beam_width,
@@ -451,7 +453,7 @@ where
                 }
                 #[cfg(not(target_os = "linux"))]
                 {
-                    let _ = (initial_beam_width, relaxed_monotonicity_l);
+                    let _ = (initial_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms, iopoll);
                     anyhow::bail!("PipeSearch is only supported on Linux");
                 }
             }
diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs
index 557aa7aed..405f33f28 100644
--- a/diskann-benchmark/src/inputs/disk.rs
+++ b/diskann-benchmark/src/inputs/disk.rs
@@ -85,6 +85,12 @@ pub(crate) enum SearchMode {
         initial_beam_width: usize,
         /// Optional relaxed monotonicity parameter for early termination.
         relaxed_monotonicity_l: Option<usize>,
+        /// Enable kernel-side SQ polling (ms idle timeout). None = disabled.
+        #[serde(default)]
+        sqpoll_idle_ms: Option<u32>,
+        /// Enable busy-wait IO polling. Default: false.
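+        /// (Maps to IORING_SETUP_IOPOLL in the reader; requires O_DIRECT.)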
+ #[serde(default)] + iopoll: bool, }, } diff --git a/diskann-disk/Cargo.toml b/diskann-disk/Cargo.toml index c68d65769..204cf36e0 100644 --- a/diskann-disk/Cargo.toml +++ b/diskann-disk/Cargo.toml @@ -47,7 +47,7 @@ vfs = { workspace = true } opentelemetry = { workspace = true, optional = true } [target.'cfg(target_os = "linux")'.dependencies] -io-uring = "0.6.4" +io-uring = "0.7" libc = "0.2.148" [dev-dependencies] diff --git a/diskann-disk/src/build/builder/core.rs b/diskann-disk/src/build/builder/core.rs index d27182e83..d4fcdc53f 100644 --- a/diskann-disk/src/build/builder/core.rs +++ b/diskann-disk/src/build/builder/core.rs @@ -1119,6 +1119,95 @@ pub(crate) mod disk_index_builder_tests { Ok(()) } + /// Verifies search results via PipelinedSearcher (PipeANN) have good recall + /// against ground truth computed from the dataset. + #[cfg(target_os = "linux")] + pub(crate) fn verify_search_result_with_ground_truth_pipelined< + G: GraphDataType, + >( + params: &TestParams, + top_k: usize, + search_l: u32, + storage_provider: &Arc>, + ) -> ANNResult<()> { + use crate::search::pipelined::{PipelinedSearcher, PipelinedReaderConfig}; + use crate::search::traits::vertex_provider_factory::VertexProviderFactory; + + let pq_pivot_path = get_pq_pivot_file(¶ms.index_path_prefix); + let pq_compressed_path = get_compressed_pq_file(¶ms.index_path_prefix); + + let index_reader = DiskIndexReader::::new( + pq_pivot_path, + pq_compressed_path, + storage_provider.as_ref(), + )?; + let pq_data = index_reader.get_pq_data(); + + let vertex_provider_factory = DiskVertexProviderFactory::::new( + VirtualAlignedReaderFactory::new( + get_disk_index_file(¶ms.index_path_prefix), + Arc::clone(storage_provider), + ), + CachingStrategy::None, + )?; + let graph_header = vertex_provider_factory.get_header()?; + + // Resolve real filesystem path (PipelinedSearcher uses O_DIRECT). + let vfs_suffix = params.index_path_prefix.trim_start_matches('/'); + let real_index_path = diskann_utils::test_data_root() + .join(format!("{}_disk.index", vfs_suffix)); + let real_index_path_str = real_index_path.to_str().unwrap(); + + let pipe_searcher = PipelinedSearcher::::new( + graph_header, + pq_data, + params.metric, + usize::MAX, + 4, + None, + real_index_path_str.to_string(), + PipelinedReaderConfig::default(), + )?; + + let (data, npoints, dim) = file_util::load_bin::( + storage_provider.as_ref(), + ¶ms.data_path, + 0, + )?; + let data = + diskann_utils::views::Matrix::try_from(data.into(), npoints, dim).bridge_err()?; + let distance = ::distance(params.metric, Some(dim)); + + for (q, query_data) in data.row_iter().enumerate() { + let gt = + diskann_providers::test_utils::groundtruth(data.as_view(), query_data, |a, b| { + distance.evaluate_similarity(a, b) + }); + + let result = + pipe_searcher.search(query_data, top_k as u32, search_l, 4)?; + let result_ids: Vec = + result.results.iter().map(|item| item.vertex_id).collect(); + + let gt_ids: Vec = gt.iter().take(top_k).map(|n| n.id).collect(); + let matching = result_ids + .iter() + .filter(|id| gt_ids.contains(id)) + .count(); + let recall = matching as f32 / top_k as f32; + assert!( + recall >= 0.8, + "PipeANN recall {:.0}% < 80% for query {}, got {:?}, expected {:?}", + recall * 100.0, + q, + result_ids, + gt_ids, + ); + } + + Ok(()) + } + // Compare that the index built in test is the same as the truth index. The truth index doesn't have associated data, we are only comparing the vector and neighbor data. 
pub fn compare_disk_index_graphs(graph_data: &[u8], truth_graph_data: &[u8]) { let graph_header = GraphHeader::try_from(&graph_data[8..]).unwrap(); diff --git a/diskann-disk/src/build/builder/tests.rs b/diskann-disk/src/build/builder/tests.rs index 347f834e4..cb73bf239 100644 --- a/diskann-disk/src/build/builder/tests.rs +++ b/diskann-disk/src/build/builder/tests.rs @@ -21,7 +21,8 @@ mod chunkable_disk_index_build_tests { use crate::{ build::{ builder::core::disk_index_builder_tests::{ - new_vfs, verify_search_result_with_ground_truth, CheckpointParams, + new_vfs, verify_search_result_with_ground_truth, + CheckpointParams, IndexBuildFixture, TestParams, }, chunking::{ @@ -35,6 +36,9 @@ mod chunkable_disk_index_build_tests { QuantizationType, }; + #[cfg(target_os = "linux")] + use crate::build::builder::core::disk_index_builder_tests::verify_search_result_with_ground_truth_pipelined; + #[derive(PartialEq)] enum BuildType { AsyncFP, @@ -191,6 +195,15 @@ mod chunkable_disk_index_build_tests { ) .unwrap(); + #[cfg(target_os = "linux")] + verify_search_result_with_ground_truth_pipelined::( + &fixture.params, + top_k, + search_l, + &fixture.storage_provider, + ) + .unwrap(); + remove_checkpoint_record_file(&index_path_prefix); } diff --git a/diskann-disk/src/search/pipelined/mod.rs b/diskann-disk/src/search/pipelined/mod.rs index 31b9c4b63..2005c0ce8 100644 --- a/diskann-disk/src/search/pipelined/mod.rs +++ b/diskann-disk/src/search/pipelined/mod.rs @@ -7,11 +7,21 @@ //! //! This module provides a pipelined disk search that overlaps IO and compute //! within a single query, using io_uring for non-blocking IO on Linux. +//! +//! # Safety +//! +//! This search implementation is designed for **read-only search on completed +//! (static) disk indices**. It bypasses the synchronized `DiskProvider` path and +//! reads raw sectors directly via O_DIRECT, so it must NOT be used concurrently +//! with index modifications (build, insert, delete). For search during streaming +//! operations, use `DiskIndexSearcher` (beam search) instead. #[cfg(target_os = "linux")] mod pipelined_reader; #[cfg(target_os = "linux")] pub use pipelined_reader::PipelinedReader; +#[cfg(target_os = "linux")] +pub use pipelined_reader::PipelinedReaderConfig; #[cfg(target_os = "linux")] mod pipelined_search; diff --git a/diskann-disk/src/search/pipelined/pipelined_reader.rs b/diskann-disk/src/search/pipelined/pipelined_reader.rs index e32931b4a..d97061adf 100644 --- a/diskann-disk/src/search/pipelined/pipelined_reader.rs +++ b/diskann-disk/src/search/pipelined/pipelined_reader.rs @@ -17,6 +17,20 @@ use io_uring::IoUring; /// Maximum number of concurrent IO operations supported by the ring. pub const MAX_IO_CONCURRENCY: usize = 128; +/// Configuration for io_uring-based pipelined reader. +#[derive(Debug, Clone, Default)] +pub struct PipelinedReaderConfig { + /// Enable kernel-side SQ polling. If `Some(idle_ms)`, a kernel thread polls + /// the submission queue, eliminating the syscall per submit. After `idle_ms` + /// milliseconds of inactivity the kernel thread sleeps (resumed automatically + /// on next `submit()`). Requires Linux kernel >= 5.11 (>= 5.13 unprivileged). + pub sqpoll_idle_ms: Option, + /// Enable busy-wait polling for IO completions (IORING_SETUP_IOPOLL). + /// Reduces latency at the cost of higher CPU usage. Requires O_DIRECT and + /// a file system that supports polling. + pub iopoll: bool, +} + /// A pipelined IO reader that wraps `io_uring` for non-blocking submit/poll. 
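The decisive property is the split between submission and completion: work is queued without waiting, and completions are reaped later, leaving the gap free for PQ distance computation. A self-contained toy of that pattern on the raw io-uring 0.7 crate (not this patch's reader; the file path and buffer size are arbitrary):

```rust
use std::os::unix::io::AsRawFd;
use io_uring::{opcode, types, IoUring};

fn main() -> std::io::Result<()> {
    let file = std::fs::File::open("/etc/hostname")?;
    let mut ring = IoUring::new(8)?;
    let mut buf = vec![0u8; 4096];

    let sqe = opcode::Read::new(types::Fd(file.as_raw_fd()), buf.as_mut_ptr(), buf.len() as u32)
        .build()
        .user_data(7);
    // SAFETY: `buf` outlives the read and is not touched until the CQE arrives.
    unsafe { ring.submission().push(&sqe).expect("submission queue full") };
    ring.submit()?; // non-blocking submit: useful compute can run here

    // Reap later, the way poll_completions() drains the completion queue.
    let cqe = loop {
        if let Some(cqe) = ring.completion().next() {
            break cqe;
        }
        std::hint::spin_loop(); // a real search would do PQ work instead
    };
    assert_eq!(cqe.user_data(), 7);
    println!("read {} bytes", cqe.result());
    Ok(())
}
```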
/// /// Unlike `LinuxAlignedFileReader` which uses `submit_and_wait` (blocking), @@ -32,6 +46,8 @@ pub struct PipelinedReader { max_slots: usize, /// Number of currently in-flight (submitted but not completed) reads. in_flight: usize, + /// Whether IOPOLL mode is active (requires active polling for completions). + iopoll: bool, /// Keep the file handle alive for the lifetime of the reader. _file: std::fs::File, } @@ -49,6 +65,7 @@ impl PipelinedReader { max_slots: usize, slot_size: usize, alignment: usize, + config: &PipelinedReaderConfig, ) -> ANNResult { let file = OpenOptions::new() .read(true) @@ -56,7 +73,19 @@ impl PipelinedReader { .open(file_path) .map_err(ANNError::log_io_error)?; - let ring = IoUring::new(max_slots.min(MAX_IO_CONCURRENCY) as u32)?; + let entries = max_slots.min(MAX_IO_CONCURRENCY) as u32; + let ring = if config.sqpoll_idle_ms.is_some() || config.iopoll { + let mut builder = IoUring::builder(); + if let Some(idle_ms) = config.sqpoll_idle_ms { + builder.setup_sqpoll(idle_ms); + } + if config.iopoll { + builder.setup_iopoll(); + } + builder.build(entries)? + } else { + IoUring::new(entries)? + }; let fd = file.as_raw_fd(); ring.submitter().register_files(std::slice::from_ref(&fd))?; @@ -68,6 +97,7 @@ impl PipelinedReader { slot_size, max_slots, in_flight: 0, + iopoll: config.iopoll, _file: file, }) } @@ -107,10 +137,18 @@ impl PipelinedReader { Ok(()) } - /// Non-blocking poll of completed IO operations. + /// Poll for completed IO operations. /// - /// Returns the slot_ids of all completed reads since the last poll. + /// In default mode, this is non-blocking (drains already-completed CQEs). + /// In IOPOLL mode, this actively polls the kernel for at least one completion + /// when there are in-flight IOs, since IOPOLL completions require active reaping. pub fn poll_completions(&mut self) -> ANNResult> { + // IOPOLL requires the kernel to actively poll for completions. + // Without this, ring.completion() will always be empty. + if self.iopoll && self.in_flight > 0 { + self.ring.submit_and_wait(1)?; + } + let mut completed = Vec::new(); for cqe in self.ring.completion() { if cqe.result() < 0 { @@ -132,6 +170,13 @@ impl PipelinedReader { &self.slot_bufs[start..start + self.slot_size] } + /// Reset the reader for reuse: clear in-flight count and drain remaining CQEs. + pub fn reset(&mut self) { + self.in_flight = 0; + // Drain any remaining completions from the ring. + for _cqe in self.ring.completion() {} + } + /// Returns the number of submitted but not yet completed reads. pub fn in_flight_count(&self) -> usize { self.in_flight diff --git a/diskann-disk/src/search/pipelined/pipelined_search.rs b/diskann-disk/src/search/pipelined/pipelined_search.rs index 672d21a8e..650b87e8e 100644 --- a/diskann-disk/src/search/pipelined/pipelined_search.rs +++ b/diskann-disk/src/search/pipelined/pipelined_search.rs @@ -93,6 +93,9 @@ fn parse_node( let neighbor_data = &node_data[fp_vector_len as usize..]; let num_neighbors = LittleEndian::read_u32(&neighbor_data[..4]) as usize; + // Clamp to the available data to avoid out-of-bounds reads. 
+    let max_neighbors = (neighbor_data.len().saturating_sub(4)) / 4;
+    let num_neighbors = num_neighbors.min(max_neighbors);
     let mut adjacency_list = Vec::with_capacity(num_neighbors);
     for i in 0..num_neighbors {
         let start = 4 + i * 4;
diff --git a/diskann-disk/src/search/pipelined/pipelined_searcher.rs
index 864eca6ff..f360692e6 100644
--- a/diskann-disk/src/search/pipelined/pipelined_searcher.rs
+++ b/diskann-disk/src/search/pipelined/pipelined_searcher.rs
@@ -7,7 +7,13 @@
 
 use std::sync::Arc;
 
-use diskann::{utils::VectorRepr, ANNResult};
+use diskann::{
+    utils::{
+        object_pool::{ObjectPool, PoolOption, TryAsPooled},
+        VectorRepr,
+    },
+    ANNError, ANNResult,
+};
 use diskann_providers::model::{
     graph::traits::GraphDataType, PQData, PQScratch,
 };
@@ -19,13 +25,76 @@ use crate::{
     utils::QueryStatistics,
 };
 
-use super::pipelined_reader::PipelinedReader;
+use super::pipelined_reader::{PipelinedReader, PipelinedReaderConfig};
 use super::pipelined_search::{pipe_search, PipeSearchResult};
 
+/// Scratch space for pipelined search operations, pooled for reuse across queries.
+struct PipelinedSearchScratch {
+    reader: PipelinedReader,
+    pq_scratch: PQScratch,
+}
+
+/// Arguments for creating or resetting a [`PipelinedSearchScratch`].
+#[derive(Clone)]
+struct PipelinedScratchArgs<'a> {
+    disk_index_path: &'a str,
+    max_slots: usize,
+    slot_size: usize,
+    alignment: usize,
+    graph_degree: usize,
+    dims: usize,
+    num_pq_chunks: usize,
+    num_pq_centers: usize,
+    reader_config: PipelinedReaderConfig,
+}
+
+impl TryAsPooled<&PipelinedScratchArgs<'_>> for PipelinedSearchScratch {
+    type Error = ANNError;
+
+    fn try_create(args: &PipelinedScratchArgs<'_>) -> Result<Self, Self::Error> {
+        let reader = PipelinedReader::new(
+            args.disk_index_path,
+            args.max_slots,
+            args.slot_size,
+            args.alignment,
+            &args.reader_config,
+        )?;
+        let pq_scratch = PQScratch::new(
+            args.graph_degree,
+            args.dims,
+            args.num_pq_chunks,
+            args.num_pq_centers,
+        )?;
+        Ok(Self { reader, pq_scratch })
+    }
+
+    fn try_modify(&mut self, _args: &PipelinedScratchArgs<'_>) -> Result<(), Self::Error> {
+        self.reader.reset();
+        Ok(())
+    }
+}
+
 /// A pipelined disk index searcher implementing the PipeANN algorithm.
 ///
 /// Analogous to `DiskIndexSearcher` but uses pipelined IO (non-blocking io_uring
 /// submit/poll) to overlap IO and compute within a single query.
+///
+/// # Safety
+///
+/// This searcher is designed for **read-only search on completed (static) disk indices**.
+/// It opens independent file descriptors with O_DIRECT and reads raw sectors without
+/// going through the synchronized `DiskProvider` path. It must NOT be used concurrently
+/// with index build, insert, or delete operations on the same index file.
+///
+/// For search during streaming or dynamic index operations, use [`DiskIndexSearcher`]
+/// (beam search) instead, which provides proper synchronization through the
+/// `DiskProvider` and `VertexProvider` abstractions.
+///
+/// # Thread Safety
+///
+/// Multiple concurrent `search()` calls on the same `PipelinedSearcher` are safe.
+/// Each search operates on its own `PipelinedReader` and `PQScratch` (pooled for
+/// amortized allocation). Shared state (`PQData`, `GraphHeader`) is immutable.
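The pooling contract in miniature: `try_create` runs when the pool is cold, `try_modify` when an instance is recycled, and the guard from `PoolOption::try_pooled` hands the object back on drop. A standalone analogue under those assumptions (`Buf` is hypothetical; the `ObjectPool` call shapes are taken from how this patch uses them):

```rust
use std::sync::Arc;
use diskann::utils::object_pool::{ObjectPool, PoolOption, TryAsPooled};

struct Buf(Vec<u8>);

impl TryAsPooled<&usize> for Buf {
    type Error = std::io::Error;
    fn try_create(len: &usize) -> Result<Self, Self::Error> {
        Ok(Buf(vec![0u8; *len])) // cold path: allocate fresh scratch
    }
    fn try_modify(&mut self, _len: &usize) -> Result<(), Self::Error> {
        self.0.fill(0); // warm path: reset a recycled instance
        Ok(())
    }
}

fn demo() -> Result<(), std::io::Error> {
    // Same shape as ObjectPool::try_new(&scratch_args, 0, None) above.
    let pool = Arc::new(ObjectPool::try_new(&4096usize, 0, None)?);
    let buf = PoolOption::try_pooled(&pool, &4096usize)?;
    drop(buf); // returns to the pool for the next caller
    Ok(())
}
```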
pub struct PipelinedSearcher> { graph_header: GraphHeader, distance_comparer: ::Distance, @@ -39,6 +108,9 @@ pub struct PipelinedSearcher> { initial_beam_width: usize, relaxed_monotonicity_l: Option, disk_index_path: String, + reader_config: PipelinedReaderConfig, + /// Pool of reusable reader + PQ scratch instances. + scratch_pool: Arc>, } impl PipelinedSearcher @@ -63,9 +135,48 @@ where initial_beam_width: usize, relaxed_monotonicity_l: Option, disk_index_path: String, + config: PipelinedReaderConfig, ) -> ANNResult { - let dims = graph_header.metadata().dims; + let metadata = graph_header.metadata(); + let dims = metadata.dims; let distance_comparer = Data::VectorDataType::distance(metric, Some(dims)); + + let node_len = metadata.node_len; + let num_nodes_per_sector = metadata.num_nodes_per_block; + + let mut block_size = graph_header.block_size() as usize; + let version = graph_header.layout_version(); + if (version.major_version() == 0 && version.minor_version() == 0) || block_size == 0 { + block_size = 4096; + } + + let num_sectors_per_node = if num_nodes_per_sector > 0 { + 1 + } else { + (node_len as usize).div_ceil(block_size) + }; + let slot_size = num_sectors_per_node * block_size; + + let max_slots = + (initial_beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY); + + let graph_degree = graph_header.max_degree::()?; + let num_pq_chunks = pq_data.get_num_chunks(); + let num_pq_centers = pq_data.get_num_centers(); + + let scratch_args = PipelinedScratchArgs { + disk_index_path: &disk_index_path, + max_slots, + slot_size, + alignment: block_size, + graph_degree, + dims, + num_pq_chunks, + num_pq_centers, + reader_config: config.clone(), + }; + let scratch_pool = Arc::new(ObjectPool::try_new(&scratch_args, 0, None)?); + Ok(Self { graph_header, distance_comparer, @@ -75,6 +186,8 @@ where initial_beam_width, relaxed_monotonicity_l, disk_index_path, + reader_config: config, + scratch_pool, }) } @@ -111,31 +224,32 @@ where } else { (node_len as usize).div_ceil(block_size) }; - let slot_size = num_sectors_per_node * block_size; - - let max_slots = (beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY); - - // Create a per-call reader - let mut reader = PipelinedReader::new( - &self.disk_index_path, - max_slots, - slot_size, - block_size, - )?; let graph_degree = self.graph_header.max_degree::()?; let num_pq_chunks = self.pq_data.get_num_chunks(); let num_pq_centers = self.pq_data.get_num_centers(); + let max_slots = (beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY); + let slot_size = num_sectors_per_node * block_size; - let mut pq_scratch = PQScratch::new( + let args = PipelinedScratchArgs { + disk_index_path: &self.disk_index_path, + max_slots, + slot_size, + alignment: block_size, graph_degree, dims, num_pq_chunks, num_pq_centers, - )?; + reader_config: self.reader_config.clone(), + }; + let mut scratch = PoolOption::try_pooled(&self.scratch_pool, &args)?; + let PipelinedSearchScratch { + ref mut reader, + ref mut pq_scratch, + } = *scratch; let result: PipeSearchResult = pipe_search::( - &mut reader, + reader, &self.pq_data, &self.distance_comparer, query, @@ -148,7 +262,7 @@ where num_nodes_per_sector, block_size, fp_vector_len, - &mut pq_scratch, + pq_scratch, self.relaxed_monotonicity_l, self.metric, )?; @@ -182,3 +296,120 @@ where Ok(SearchResult { results, stats }) } } + +#[cfg(test)] +#[cfg(target_os = "linux")] +mod tests { + use super::*; + use std::sync::Arc; + + use diskann_providers::storage::{get_disk_index_file, 
VirtualStorageProvider}; + use diskann_providers::test_utils::graph_data_type_utils::GraphDataF32VectorUnitData; + use diskann_utils::test_data_root; + use diskann_vector::distance::Metric; + use rayon::prelude::*; + + use crate::data_model::CachingStrategy; + use crate::search::provider::disk_vertex_provider_factory::DiskVertexProviderFactory; + use crate::search::traits::vertex_provider_factory::VertexProviderFactory; + use crate::storage::disk_index_reader::DiskIndexReader; + use crate::utils::VirtualAlignedReaderFactory; + + use super::PipelinedReaderConfig; + + const TEST_INDEX_PREFIX: &str = + "/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search"; + const TEST_PQ_PIVOT: &str = + "/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search_pq_pivots.bin"; + const TEST_PQ_COMPRESSED: &str = + "/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search_pq_compressed.bin"; + const TEST_QUERY: &str = "/disk_index_search/disk_index_sample_query_10pts.fbin"; + + fn create_test_searcher() -> PipelinedSearcher { + let storage_provider = Arc::new(VirtualStorageProvider::new_overlay(test_data_root())); + + let disk_index_reader = DiskIndexReader::::new( + TEST_PQ_PIVOT.to_string(), + TEST_PQ_COMPRESSED.to_string(), + storage_provider.as_ref(), + ) + .unwrap(); + let pq_data = disk_index_reader.get_pq_data(); + + let aligned_reader_factory = VirtualAlignedReaderFactory::new( + get_disk_index_file(TEST_INDEX_PREFIX), + Arc::clone(&storage_provider), + ); + let vertex_provider_factory = + DiskVertexProviderFactory::::new( + aligned_reader_factory, + CachingStrategy::None, + ) + .unwrap(); + let graph_header = vertex_provider_factory.get_header().unwrap(); + + let real_index_path = test_data_root().join( + "disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search_disk.index", + ); + + PipelinedSearcher::::new( + graph_header, + pq_data, + Metric::L2, + usize::MAX, + 4, + None, + real_index_path.to_str().unwrap().to_string(), + PipelinedReaderConfig::default(), + ) + .unwrap() + } + + fn load_test_query() -> Vec { + let storage_provider = Arc::new(VirtualStorageProvider::new_overlay(test_data_root())); + let (query_vector, _npts, _dim) = + diskann_providers::utils::file_util::load_bin::( + storage_provider.as_ref(), + TEST_QUERY, + 0, + ) + .unwrap(); + query_vector[0..128].to_vec() + } + + #[test] + fn test_pool_reuse_sequential_searches() { + let searcher = create_test_searcher(); + let query = load_test_query(); + + let r1 = searcher.search(&query, 10, 40, 4).unwrap(); + let r2 = searcher.search(&query, 10, 40, 4).unwrap(); + + assert!(!r1.results.is_empty()); + assert!(!r2.results.is_empty()); + // Same query must return same number of results. + assert_eq!(r1.results.len(), r2.results.len()); + // All distances must be non-negative. 
+ for item in r1.results.iter().chain(r2.results.iter()) { + assert!(item.distance >= 0.0); + } + } + + #[test] + fn test_pool_concurrent_searches() { + let searcher = Arc::new(create_test_searcher()); + let query = load_test_query(); + + let results: Vec<_> = (0..4) + .into_par_iter() + .map(|_| searcher.search(&query, 10, 40, 4).unwrap()) + .collect(); + + for r in &results { + assert!(!r.results.is_empty()); + for item in &r.results { + assert!(item.distance >= 0.0); + } + } + } +} diff --git a/diskann-disk/src/search/provider/disk_provider.rs b/diskann-disk/src/search/provider/disk_provider.rs index ab0a4f4e7..468ca82c6 100644 --- a/diskann-disk/src/search/provider/disk_provider.rs +++ b/diskann-disk/src/search/provider/disk_provider.rs @@ -2131,4 +2131,201 @@ mod disk_provider_tests { // search hits io_limit that it doesn't break and the recall degrades gracefully assert!(recall >= 60.0, "Match percentage is below 60%: {}", recall); } + + #[test] + #[cfg(target_os = "linux")] + fn test_pipe_search_k10_l100_128dim() { + use crate::search::pipelined::{PipelinedSearcher, PipelinedReaderConfig}; + use diskann_providers::storage::get_disk_index_file; + + let storage_provider = Arc::new(VirtualStorageProvider::new_overlay(test_data_root())); + + // Load PQ data via DiskIndexReader. + let disk_index_reader = DiskIndexReader::::new( + TEST_PQ_PIVOT_128DIM.to_string(), + TEST_PQ_COMPRESSED_128DIM.to_string(), + storage_provider.as_ref(), + ) + .unwrap(); + let pq_data = disk_index_reader.get_pq_data(); + + // Read graph header via DiskVertexProviderFactory. + let aligned_reader_factory = VirtualAlignedReaderFactory::new( + get_disk_index_file(TEST_INDEX_PREFIX_128DIM), + Arc::clone(&storage_provider), + ); + let vertex_provider_factory = + DiskVertexProviderFactory::::new( + aligned_reader_factory, + CachingStrategy::None, + ) + .unwrap(); + let graph_header = vertex_provider_factory.get_header().unwrap(); + + // Resolve real filesystem path for PipelinedSearcher (needs O_DIRECT). + let real_index_path = test_data_root().join( + "disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search_disk.index", + ); + let real_index_path_str = real_index_path.to_str().unwrap(); + + let pipe_searcher = PipelinedSearcher::::new( + graph_header, + pq_data, + Metric::L2, + usize::MAX, + 4, + None, + real_index_path_str.to_string(), + PipelinedReaderConfig::default(), + ) + .unwrap(); + + // Load queries and ground truth. + let (query_vector, _, _) = diskann_providers::utils::file_util::load_bin::( + storage_provider.as_ref(), + TEST_QUERY_10PTS_128DIM, + 0, + ) + .unwrap(); + let truth_result = + load_query_result(storage_provider.as_ref(), TEST_TRUTH_RESULT_10PTS_128DIM); + + let dim = 128usize; + let k = 10usize; + let l = 100u32; + let num_queries = query_vector.len() / dim; + + let mut total_recall = 0.0f32; + for q in 0..num_queries { + let query = &query_vector[q * dim..(q + 1) * dim]; + let result = pipe_searcher.search(query, k as u32, l, 4).unwrap(); + let indices: Vec = result.results.iter().map(|item| item.vertex_id).collect(); + let truth_slice = &truth_result[q * k..(q + 1) * k]; + + // Count recall overlap (PipeANN traversal order may differ from beam search ground truth). 
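The counting that follows is plain recall@k: for each query, recall@k = |returned top-k ∩ ground-truth top-k| / k, and the test then requires the average over all queries to stay at or above 0.8. That bound deliberately leaves headroom for PipeANN's different traversal order while still catching wholesale failures.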
+ let matching = indices + .iter() + .filter(|id| truth_slice.contains(id)) + .count(); + total_recall += matching as f32 / k as f32; + } + let avg_recall = total_recall / num_queries as f32; + assert!( + avg_recall >= 0.8, + "PipeANN average recall {:.0}% < 80%", + avg_recall * 100.0, + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn test_concurrent_beam_and_pipe_search_128dim() { + use crate::search::pipelined::{PipelinedSearcher, PipelinedReaderConfig}; + use diskann_providers::storage::get_disk_index_file; + use rayon::prelude::*; + + let storage_provider = Arc::new(VirtualStorageProvider::new_overlay(test_data_root())); + + // Create beam search engine (DiskIndexSearcher). + let beam_engine = create_disk_index_searcher::( + CreateDiskIndexSearcherParams { + max_thread_num: 2, + pq_pivot_file_path: TEST_PQ_PIVOT_128DIM, + pq_compressed_file_path: TEST_PQ_COMPRESSED_128DIM, + index_path: TEST_INDEX_128DIM, + index_path_prefix: TEST_INDEX_PREFIX_128DIM, + ..Default::default() + }, + &storage_provider, + ); + + // Create pipelined search engine (PipelinedSearcher). + let disk_index_reader = DiskIndexReader::::new( + TEST_PQ_PIVOT_128DIM.to_string(), + TEST_PQ_COMPRESSED_128DIM.to_string(), + storage_provider.as_ref(), + ) + .unwrap(); + let pq_data = disk_index_reader.get_pq_data(); + + let aligned_reader_factory = VirtualAlignedReaderFactory::new( + get_disk_index_file(TEST_INDEX_PREFIX_128DIM), + Arc::clone(&storage_provider), + ); + let vertex_provider_factory = + DiskVertexProviderFactory::::new( + aligned_reader_factory, + CachingStrategy::None, + ) + .unwrap(); + let graph_header = vertex_provider_factory.get_header().unwrap(); + + let real_index_path = test_data_root().join( + "disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search_disk.index", + ); + let pipe_searcher = Arc::new( + PipelinedSearcher::::new( + graph_header, + pq_data, + Metric::L2, + usize::MAX, + 4, + None, + real_index_path.to_str().unwrap().to_string(), + PipelinedReaderConfig::default(), + ) + .unwrap(), + ); + + // Load queries and ground truth. + let (query_vector, _, _) = diskann_providers::utils::file_util::load_bin::( + storage_provider.as_ref(), + TEST_QUERY_10PTS_128DIM, + 0, + ) + .unwrap(); + let truth_result = + load_query_result(storage_provider.as_ref(), TEST_TRUTH_RESULT_10PTS_128DIM); + + let dim = 128usize; + let k = 10usize; + let l = 100u32; + let num_queries = query_vector.len() / dim; + + // Run beam search and pipe search concurrently via rayon. + let queries: Vec<&[f32]> = (0..num_queries) + .map(|q| &query_vector[q * dim..(q + 1) * dim]) + .collect(); + let beam_ref = &beam_engine; + let pipe_ref = &pipe_searcher; + let truth_ref = &truth_result; + + queries.par_iter().enumerate().for_each(|(q, query)| { + // Beam search + let beam_result = beam_ref + .search(query, k as u32, l, None, None, false) + .unwrap(); + let beam_ids: Vec = beam_result.results.iter().map(|r| r.vertex_id).collect(); + let truth_slice = &truth_ref[q * k..(q + 1) * k]; + + // Pipe search (runs concurrently with beam search across rayon threads) + let pipe_result = pipe_ref.search(query, k as u32, l, 4).unwrap(); + let pipe_ids: Vec = pipe_result.results.iter().map(|r| r.vertex_id).collect(); + + // Both should produce results with reasonable overlap. + let beam_matching = beam_ids.iter().filter(|id| truth_slice.contains(id)).count(); + let pipe_matching = pipe_ids.iter().filter(|id| truth_slice.contains(id)).count(); + // Per-query: at least some overlap (>=30%) to guard against total failures. 
+            assert!(
+                beam_matching as f32 / k as f32 >= 0.3,
+                "Beam search overlap below 30% for query {}",
+                q,
+            );
+            assert!(
+                pipe_matching as f32 / k as f32 >= 0.3,
+                "Pipe search overlap below 30% for query {}",
+                q,
+            );
+        });
+    }
 }
diff --git a/diskann-platform/Cargo.toml
index 07bbd3a33..5eac8fe68 100644
--- a/diskann-platform/Cargo.toml
+++ b/diskann-platform/Cargo.toml
@@ -15,7 +15,7 @@ documentation.workspace = true
 tracing.workspace = true
 
 [target.'cfg(target_os = "linux")'.dependencies]
-io-uring = "0.6.4"
+io-uring = "0.7"
 libc = "0.2.148"
 
 [target.'cfg(target_os = "windows")'.dependencies.windows-sys]

From c051711a35dac8a6a7097df8c365c692f4d11939 Mon Sep 17 00:00:00 2001
From: Philip Adams 
Date: Mon, 9 Feb 2026 14:41:02 -0800
Subject: [PATCH 03/46] some simplifying refactoring

---
 diskann-benchmark/example/pipe-search.json    |  31 +-
 .../src/backend/disk_index/search.rs          | 363 +++++++++---------
 diskann-benchmark/src/inputs/disk.rs          |  25 +-
 diskann-disk/src/build/builder/core.rs        |   1 -
 diskann-disk/src/data_model/graph_header.rs   |  22 ++
 diskann-disk/src/search/mod.rs                |   2 +
 .../src/search/pipelined/pipelined_reader.rs  |  24 +-
 .../src/search/pipelined/pipelined_search.rs  |  69 ++--
 .../search/pipelined/pipelined_searcher.rs    | 111 +++---
 .../src/search/provider/disk_provider.rs      |   2 -
 .../src/search/provider/disk_sector_graph.rs  |  40 +-
 diskann-disk/src/search/sector_math.rs        |  33 ++
 12 files changed, 362 insertions(+), 361 deletions(-)
 create mode 100644 diskann-disk/src/search/sector_math.rs

diff --git a/diskann-benchmark/example/pipe-search.json
index bec885b8d..bd525e989 100644
--- a/diskann-benchmark/example/pipe-search.json
+++ b/diskann-benchmark/example/pipe-search.json
@@ -65,7 +65,7 @@
         "queries": "disk_index_sample_query_10pts.fbin",
         "groundtruth": "disk_index_10pts_idx_uint32_truth_search_res.bin",
         "search_list": [10, 20, 40, 80],
-        "beam_width": 8,
+        "beam_width": 4,
         "recall_at": 10,
         "num_threads": 1,
         "is_flat_search": false,
@@ -100,34 +100,7 @@
         "search_mode": {
           "mode": "PipeSearch",
           "initial_beam_width": 4,
-          "sqpoll_idle_ms": 1000,
-          "iopoll": false
-        }
-      }
-    }
-  },
-  {
-    "type": "disk-index",
-    "content": {
-      "source": {
-        "disk-index-source": "Load",
-        "data_type": "float32",
-        "load_path": "test_data/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search"
-      },
-      "search_phase": {
-        "queries": "disk_index_sample_query_10pts.fbin",
-        "groundtruth": "disk_index_10pts_idx_uint32_truth_search_res.bin",
-        "search_list": [10, 20, 40, 80],
-        "beam_width": 4,
-        "recall_at": 10,
-        "num_threads": 1,
-        "is_flat_search": false,
-        "distance": "squared_l2",
-        "vector_filters_file": null,
-        "search_mode": {
-          "mode": "PipeSearch",
-          "initial_beam_width": 4,
-          "iopoll": true
+          "sqpoll_idle_ms": 1000
         }
       }
     }
diff --git a/diskann-benchmark/src/backend/disk_index/search.rs
index 23a835090..8f1416d71 100644
--- a/diskann-benchmark/src/backend/disk_index/search.rs
+++ b/diskann-benchmark/src/backend/disk_index/search.rs
@@ -9,12 +9,13 @@
 use std::{collections::HashSet, fmt, sync::atomic::AtomicBool, sync::Arc, time::Instant};
 
 use opentelemetry::{global, trace::Span, trace::Tracer};
 use opentelemetry_sdk::trace::SdkTracerProvider;
 
-use diskann::utils::VectorRepr;
+use diskann::{utils::VectorRepr, ANNResult};
 use diskann_benchmark_runner::{files::InputFile, utils::MicroSeconds};
 use diskann_disk::{
     data_model::CachingStrategy,
     search::provider::{
-
disk_provider::DiskIndexSearcher, disk_vertex_provider_factory::DiskVertexProviderFactory, + disk_provider::{DiskIndexSearcher, SearchResult}, + disk_vertex_provider_factory::DiskVertexProviderFactory, }, search::traits::VertexProviderFactory, storage::disk_index_reader::DiskIndexReader, @@ -158,6 +159,96 @@ impl DiskSearchResult { } } +/// Write a single query's search result into pre-allocated buffers. +fn write_query_result( + result: ANNResult>, + recall_at: usize, + stats: &mut QueryStatistics, + rc: &mut u32, + id_chunk: &mut [u32], + dist_chunk: &mut [f32], + has_any_search_failed: &AtomicBool, + error_label: &str, +) { + match result { + Ok(search_result) => { + *stats = search_result.stats.query_statistics; + *rc = search_result.results.len() as u32; + let actual_results = search_result.results.len().min(recall_at); + for (i, result_item) in search_result.results.iter().take(actual_results).enumerate() { + id_chunk[i] = result_item.vertex_id; + dist_chunk[i] = result_item.distance; + } + } + Err(e) => { + eprintln!("{} failed for query: {:?}", error_label, e); + *rc = 0; + id_chunk.fill(0); + dist_chunk.fill(0.0); + has_any_search_failed.store(true, std::sync::atomic::Ordering::Release); + } + } +} + +/// Execute the per-L search iteration loop, handling buffer allocation, timing, +/// span management, error checking, and result aggregation. +fn run_search_loop( + search_list: &[u32], + recall_at: u32, + beam_width: usize, + num_queries: usize, + span_prefix: &str, + has_any_search_failed: &AtomicBool, + gt_context: &GroundTruthContext, + mut iteration_body: impl FnMut(u32, &mut [QueryStatistics], &mut [u32], &mut [u32], &mut [f32]), +) -> anyhow::Result> { + let mut results = Vec::with_capacity(search_list.len()); + + for &l in search_list.iter() { + let mut statistics_vec = vec![QueryStatistics::default(); num_queries]; + let mut result_counts = vec![0u32; num_queries]; + let mut result_ids = vec![0u32; (recall_at as usize) * num_queries]; + let mut result_dists = vec![0.0f32; (recall_at as usize) * num_queries]; + + let start = Instant::now(); + + let mut l_span = { + let tracer = global::tracer(""); + let span_name = format!("{}-with-L={}-bw={}", span_prefix, l, beam_width); + tracer.start(span_name) + }; + + iteration_body( + l, + &mut statistics_vec, + &mut result_counts, + &mut result_ids, + &mut result_dists, + ); + + let total_time = start.elapsed(); + + if has_any_search_failed.load(std::sync::atomic::Ordering::Acquire) { + anyhow::bail!("One or more searches failed. 
See logs for details."); + } + + let search_result = DiskSearchResult::new( + &statistics_vec, + &result_ids, + &result_counts, + l, + total_time.as_secs_f32(), + num_queries, + gt_context, + )?; + + l_span.end(); + results.push(search_result); + } + + Ok(results) +} + pub(super) fn search_disk_index( index_load: &DiskIndexLoad, search_params: &DiskSearchPhase, @@ -222,7 +313,7 @@ where let vertex_provider_factory = DiskVertexProviderFactory::new(reader_factory, caching_strategy)?; let pool = create_thread_pool(search_params.num_threads)?; - let mut search_results_per_l = Vec::with_capacity(search_params.search_list.len()); + let search_results_per_l; let has_any_search_failed = AtomicBool::new(false); match &search_params.search_mode { @@ -238,97 +329,54 @@ where logger.log_checkpoint("index_loaded"); - for &l in search_params.search_list.iter() { - let mut statistics_vec: Vec = - vec![QueryStatistics::default(); num_queries]; - let mut result_counts: Vec = vec![0; num_queries]; - let mut result_ids: Vec = - vec![0; (search_params.recall_at as usize) * num_queries]; - let mut result_dists: Vec = - vec![0.0; (search_params.recall_at as usize) * num_queries]; - - let start = Instant::now(); - - let mut l_span = { - let tracer = global::tracer(""); - let span_name = - format!("search-with-L={}-bw={}", l, search_params.beam_width); - tracer.start(span_name) - }; - - let zipped = queries - .par_row_iter() - .zip(vector_filters.par_iter()) - .zip(result_ids.par_chunks_mut(search_params.recall_at as usize)) - .zip(result_dists.par_chunks_mut(search_params.recall_at as usize)) - .zip(statistics_vec.par_iter_mut()) - .zip(result_counts.par_iter_mut()); - - zipped.for_each_in_pool( - &pool, - |(((((q, vf), id_chunk), dist_chunk), stats), rc)| { - let vector_filter = if search_params.vector_filters_file.is_none() { - None - } else { - Some(Box::new(move |vid: &u32| vf.contains(vid)) - as Box bool + Send + Sync>) - }; - - match searcher.search( - q, - search_params.recall_at, - l, - Some(search_params.beam_width), - vector_filter, - search_params.is_flat_search, - ) { - Ok(search_result) => { - *stats = search_result.stats.query_statistics; - *rc = search_result.results.len() as u32; - let actual_results = search_result - .results - .len() - .min(search_params.recall_at as usize); - for (i, result_item) in search_result - .results - .iter() - .take(actual_results) - .enumerate() - { - id_chunk[i] = result_item.vertex_id; - dist_chunk[i] = result_item.distance; - } - } - Err(e) => { - eprintln!("Search failed for query: {:?}", e); - *rc = 0; - id_chunk.fill(0); - dist_chunk.fill(0.0); - has_any_search_failed - .store(true, std::sync::atomic::Ordering::Release); - } - } - }, - ); - let total_time = start.elapsed(); - - if has_any_search_failed.load(std::sync::atomic::Ordering::Acquire) { - anyhow::bail!("One or more searches failed. 
See logs for details."); - } - - let search_result = DiskSearchResult::new( - &statistics_vec, - &result_ids, - &result_counts, - l, - total_time.as_secs_f32(), - num_queries, - >_context, - )?; + search_results_per_l = run_search_loop( + &search_params.search_list, + search_params.recall_at, + search_params.beam_width, + num_queries, + "search", + &has_any_search_failed, + >_context, + |l, statistics_vec, result_counts, result_ids, result_dists| { + let zipped = queries + .par_row_iter() + .zip(vector_filters.par_iter()) + .zip(result_ids.par_chunks_mut(search_params.recall_at as usize)) + .zip(result_dists.par_chunks_mut(search_params.recall_at as usize)) + .zip(statistics_vec.par_iter_mut()) + .zip(result_counts.par_iter_mut()); - l_span.end(); - search_results_per_l.push(search_result); - } + zipped.for_each_in_pool( + &pool, + |(((((q, vf), id_chunk), dist_chunk), stats), rc)| { + let vector_filter = if search_params.vector_filters_file.is_none() { + None + } else { + Some(Box::new(move |vid: &u32| vf.contains(vid)) + as Box bool + Send + Sync>) + }; + + write_query_result( + searcher.search( + q, + search_params.recall_at, + l, + Some(search_params.beam_width), + vector_filter, + search_params.is_flat_search, + ), + search_params.recall_at as usize, + stats, + rc, + id_chunk, + dist_chunk, + &has_any_search_failed, + "Search", + ); + }, + ); + }, + )?; } // PipeANN pipelined search — for read-only search on completed (static) indices only. // Searcher is created once; internal ObjectPool handles per-thread scratch allocation. @@ -337,20 +385,17 @@ where initial_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms, - iopoll, } => { #[cfg(target_os = "linux")] { let graph_header = vertex_provider_factory.get_header()?; let pq_data = index_reader.get_pq_data(); let metric = search_params.distance.into(); - let search_io_limit = search_params.search_io_limit.unwrap_or(usize::MAX); let initial_beam_width = *initial_beam_width; let relaxed_monotonicity_l = *relaxed_monotonicity_l; let reader_config = PipelinedReaderConfig { sqpoll_idle_ms: *sqpoll_idle_ms, - iopoll: *iopoll, }; // Create searcher once — pool handles per-thread scratch allocation @@ -358,7 +403,6 @@ where graph_header.clone(), pq_data.clone(), metric, - search_io_limit, initial_beam_width, relaxed_monotonicity_l, disk_index_path.clone(), @@ -367,93 +411,50 @@ where logger.log_checkpoint("index_loaded"); - for &l in search_params.search_list.iter() { - let mut statistics_vec: Vec = - vec![QueryStatistics::default(); num_queries]; - let mut result_counts: Vec = vec![0; num_queries]; - let mut result_ids: Vec = - vec![0; (search_params.recall_at as usize) * num_queries]; - let mut result_dists: Vec = - vec![0.0; (search_params.recall_at as usize) * num_queries]; - - let start = Instant::now(); - - let mut l_span = { - let tracer = global::tracer(""); - let span_name = - format!("pipesearch-with-L={}-bw={}", l, search_params.beam_width); - tracer.start(span_name) - }; - - let pipe_searcher = pipe_searcher.clone(); // Arc clone for this L iteration - - let zipped = queries - .par_row_iter() - .zip(result_ids.par_chunks_mut(search_params.recall_at as usize)) - .zip(result_dists.par_chunks_mut(search_params.recall_at as usize)) - .zip(statistics_vec.par_iter_mut()) - .zip(result_counts.par_iter_mut()); - - zipped.for_each_in_pool( - &pool, - |((((q, id_chunk), dist_chunk), stats), rc)| { - match pipe_searcher.search( - q, - search_params.recall_at, - l, - search_params.beam_width, - ) { - Ok(search_result) => { - *stats = 
search_result.stats.query_statistics; - *rc = search_result.results.len() as u32; - let actual_results = search_result - .results - .len() - .min(search_params.recall_at as usize); - for (i, result_item) in search_result - .results - .iter() - .take(actual_results) - .enumerate() - { - id_chunk[i] = result_item.vertex_id; - dist_chunk[i] = result_item.distance; - } - } - Err(e) => { - eprintln!("PipeSearch failed for query: {:?}", e); - *rc = 0; - id_chunk.fill(0); - dist_chunk.fill(0.0); - has_any_search_failed - .store(true, std::sync::atomic::Ordering::Release); - } - } - }, - ); - let total_time = start.elapsed(); - - if has_any_search_failed.load(std::sync::atomic::Ordering::Acquire) { - anyhow::bail!("One or more searches failed. See logs for details."); - } - - let search_result = DiskSearchResult::new( - &statistics_vec, - &result_ids, - &result_counts, - l, - total_time.as_secs_f32(), - num_queries, - >_context, - )?; - - l_span.end(); - search_results_per_l.push(search_result); - } + search_results_per_l = run_search_loop( + &search_params.search_list, + search_params.recall_at, + search_params.beam_width, + num_queries, + "pipesearch", + &has_any_search_failed, + >_context, + |l, statistics_vec, result_counts, result_ids, result_dists| { + let pipe_searcher = pipe_searcher.clone(); + + let zipped = queries + .par_row_iter() + .zip(result_ids.par_chunks_mut(search_params.recall_at as usize)) + .zip(result_dists.par_chunks_mut(search_params.recall_at as usize)) + .zip(statistics_vec.par_iter_mut()) + .zip(result_counts.par_iter_mut()); + + zipped.for_each_in_pool( + &pool, + |((((q, id_chunk), dist_chunk), stats), rc)| { + write_query_result( + pipe_searcher.search( + q, + search_params.recall_at, + l, + search_params.beam_width, + ), + search_params.recall_at as usize, + stats, + rc, + id_chunk, + dist_chunk, + &has_any_search_failed, + "PipeSearch", + ); + }, + ); + }, + )?; } #[cfg(not(target_os = "linux"))] { - let _ = (initial_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms, iopoll); + let _ = (initial_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms); anyhow::bail!("PipeSearch is only supported on Linux"); } } diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs index 405f33f28..d32ee93da 100644 --- a/diskann-benchmark/src/inputs/disk.rs +++ b/diskann-benchmark/src/inputs/disk.rs @@ -88,12 +88,31 @@ pub(crate) enum SearchMode { /// Enable kernel-side SQ polling (ms idle timeout). None = disabled. #[serde(default)] sqpoll_idle_ms: Option, - /// Enable busy-wait IO polling. Default: false. 
- #[serde(default)] - iopoll: bool, }, } +impl fmt::Display for SearchMode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + SearchMode::BeamSearch => write!(f, "BeamSearch"), + SearchMode::PipeSearch { + initial_beam_width, + relaxed_monotonicity_l, + sqpoll_idle_ms, + } => { + write!(f, "PipeSearch(bw={}", initial_beam_width)?; + if let Some(rm) = relaxed_monotonicity_l { + write!(f, ",rm={}", rm)?; + } + if let Some(sq) = sqpoll_idle_ms { + write!(f, ",sqpoll={}ms", sq)?; + } + write!(f, ")") + } + } + } +} + fn default_initial_beam_width() -> usize { 4 } diff --git a/diskann-disk/src/build/builder/core.rs b/diskann-disk/src/build/builder/core.rs index d4fcdc53f..271fe2f9f 100644 --- a/diskann-disk/src/build/builder/core.rs +++ b/diskann-disk/src/build/builder/core.rs @@ -1162,7 +1162,6 @@ pub(crate) mod disk_index_builder_tests { graph_header, pq_data, params.metric, - usize::MAX, 4, None, real_index_path_str.to_string(), diff --git a/diskann-disk/src/data_model/graph_header.rs b/diskann-disk/src/data_model/graph_header.rs index fc7ed78bc..67999ad36 100644 --- a/diskann-disk/src/data_model/graph_header.rs +++ b/diskann-disk/src/data_model/graph_header.rs @@ -86,6 +86,28 @@ impl GraphHeader { &self.layout_version } + /// Returns the effective block size, falling back to the default (4096) for + /// legacy (v0.0) layouts or when the stored value is zero. + pub fn effective_block_size(&self) -> usize { + let bs = self.block_size as usize; + if (self.layout_version.major_version() == 0 && self.layout_version.minor_version() == 0) + || bs == 0 + { + 4096 + } else { + bs + } + } + + /// Returns the number of disk sectors required to store a single graph node. + pub fn num_sectors_per_node(&self) -> usize { + if self.metadata.num_nodes_per_block > 0 { + 1 + } else { + (self.metadata.node_len as usize).div_ceil(self.effective_block_size()) + } + } + /// Returns the maximum degree of the graph /// /// # Type Parameters diff --git a/diskann-disk/src/search/mod.rs b/diskann-disk/src/search/mod.rs index 2a4009504..2c475e10a 100644 --- a/diskann-disk/src/search/mod.rs +++ b/diskann-disk/src/search/mod.rs @@ -8,5 +8,7 @@ pub mod provider; pub mod traits; +pub(crate) mod sector_math; + #[cfg(target_os = "linux")] pub mod pipelined; diff --git a/diskann-disk/src/search/pipelined/pipelined_reader.rs b/diskann-disk/src/search/pipelined/pipelined_reader.rs index d97061adf..3af5d9232 100644 --- a/diskann-disk/src/search/pipelined/pipelined_reader.rs +++ b/diskann-disk/src/search/pipelined/pipelined_reader.rs @@ -25,10 +25,6 @@ pub struct PipelinedReaderConfig { /// milliseconds of inactivity the kernel thread sleeps (resumed automatically /// on next `submit()`). Requires Linux kernel >= 5.11 (>= 5.13 unprivileged). pub sqpoll_idle_ms: Option, - /// Enable busy-wait polling for IO completions (IORING_SETUP_IOPOLL). - /// Reduces latency at the cost of higher CPU usage. Requires O_DIRECT and - /// a file system that supports polling. - pub iopoll: bool, } /// A pipelined IO reader that wraps `io_uring` for non-blocking submit/poll. @@ -46,8 +42,6 @@ pub struct PipelinedReader { max_slots: usize, /// Number of currently in-flight (submitted but not completed) reads. in_flight: usize, - /// Whether IOPOLL mode is active (requires active polling for completions). - iopoll: bool, /// Keep the file handle alive for the lifetime of the reader. 
_file: std::fs::File, } @@ -74,14 +68,11 @@ impl PipelinedReader { .map_err(ANNError::log_io_error)?; let entries = max_slots.min(MAX_IO_CONCURRENCY) as u32; - let ring = if config.sqpoll_idle_ms.is_some() || config.iopoll { + let ring = if config.sqpoll_idle_ms.is_some() { let mut builder = IoUring::builder(); if let Some(idle_ms) = config.sqpoll_idle_ms { builder.setup_sqpoll(idle_ms); } - if config.iopoll { - builder.setup_iopoll(); - } builder.build(entries)? } else { IoUring::new(entries)? @@ -97,7 +88,6 @@ impl PipelinedReader { slot_size, max_slots, in_flight: 0, - iopoll: config.iopoll, _file: file, }) } @@ -137,18 +127,10 @@ impl PipelinedReader { Ok(()) } - /// Poll for completed IO operations. + /// Poll for completed IO operations (non-blocking). /// - /// In default mode, this is non-blocking (drains already-completed CQEs). - /// In IOPOLL mode, this actively polls the kernel for at least one completion - /// when there are in-flight IOs, since IOPOLL completions require active reaping. + /// Drains already-completed CQEs from the io_uring completion queue. pub fn poll_completions(&mut self) -> ANNResult> { - // IOPOLL requires the kernel to actively poll for completions. - // Without this, ring.completion() will always be empty. - if self.iopoll && self.in_flight > 0 { - self.ring.submit_and_wait(1)?; - } - let mut completed = Vec::new(); for cqe in self.ring.completion() { if cqe.result() < 0 { diff --git a/diskann-disk/src/search/pipelined/pipelined_search.rs b/diskann-disk/src/search/pipelined/pipelined_search.rs index 650b87e8e..540bbb7a1 100644 --- a/diskann-disk/src/search/pipelined/pipelined_search.rs +++ b/diskann-disk/src/search/pipelined/pipelined_search.rs @@ -9,11 +9,12 @@ use std::collections::{HashMap, HashSet, VecDeque}; use std::time::Instant; use byteorder::{ByteOrder, LittleEndian}; -use diskann::{utils::VectorRepr, ANNResult}; +use diskann::{utils::VectorRepr, ANNError, ANNResult}; use diskann_providers::model::{compute_pq_distance, pq::quantizer_preprocess, PQData, PQScratch}; use diskann_vector::{distance::Metric, DistanceFunction}; use super::pipelined_reader::PipelinedReader; +use crate::search::sector_math::{node_offset_in_sector, node_sector_index}; /// A candidate in the sorted candidate pool. struct Candidate { @@ -54,30 +55,6 @@ pub struct PipeSearchStats { pub hops: u32, } -/// Compute the sector index that contains a given vertex. -#[inline] -fn node_sector_index( - vertex_id: u32, - num_nodes_per_sector: u64, - num_sectors_per_node: usize, -) -> u64 { - 1 + if num_nodes_per_sector > 0 { - vertex_id as u64 / num_nodes_per_sector - } else { - vertex_id as u64 * num_sectors_per_node as u64 - } -} - -/// Compute the byte offset of a node within its sector. -#[inline] -fn node_offset_in_sector(vertex_id: u32, num_nodes_per_sector: u64, node_len: u64) -> usize { - if num_nodes_per_sector == 0 { - 0 - } else { - (vertex_id as u64 % num_nodes_per_sector * node_len) as usize - } -} - /// Parse a node from raw sector buffer bytes. 
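For orientation, the byte layout the parser below decodes: each node occupies `node_len` bytes within its sector(s), stored as the full-precision vector followed by a little-endian neighbor list. A hypothetical helper (not patch code) capturing the same arithmetic the unit tests at the bottom of this file use (vector + count + R neighbors):

```rust
// [ fp_vector: fp_vector_len bytes ][ num_neighbors: u32 LE ][ ids: num_neighbors * u32 LE ]
fn max_node_len(fp_vector_len: u64, max_degree: u64) -> u64 {
    fp_vector_len        // full-precision vector bytes
        + 4              // u32 neighbor count
        + max_degree * 4 // u32 neighbor ids, worst case
}
```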
fn parse_node( sector_buf: &[u8], @@ -85,13 +62,33 @@ fn parse_node( num_nodes_per_sector: u64, node_len: u64, fp_vector_len: u64, -) -> LoadedNode { +) -> ANNResult { let offset = node_offset_in_sector(vertex_id, num_nodes_per_sector, node_len); - let node_data = §or_buf[offset..offset + node_len as usize]; + let end = offset + node_len as usize; + let node_data = sector_buf.get(offset..end).ok_or_else(|| { + ANNError::log_index_error(format_args!( + "Node data out of bounds: vertex {} offset {}..{} in buffer of len {}", + vertex_id, + offset, + end, + sector_buf.len() + )) + })?; + + let fp_vector_len_usize = fp_vector_len as usize; + if fp_vector_len_usize > node_data.len() { + return Err(ANNError::log_index_error(format_args!( + "fp_vector_len {} exceeds node_data len {}", + fp_vector_len_usize, + node_data.len() + ))); + } - let fp_vector = node_data[..fp_vector_len as usize].to_vec(); + // Copy required: the slot buffer will be reused for subsequent IOs while + // the parsed node remains in id_buf_map until visited. + let fp_vector = node_data[..fp_vector_len_usize].to_vec(); - let neighbor_data = &node_data[fp_vector_len as usize..]; + let neighbor_data = &node_data[fp_vector_len_usize..]; let num_neighbors = LittleEndian::read_u32(&neighbor_data[..4]) as usize; // Clamp to the available data to avoid out-of-bounds reads. let max_neighbors = (neighbor_data.len().saturating_sub(4)) / 4; @@ -102,10 +99,10 @@ fn parse_node( adjacency_list.push(LittleEndian::read_u32(&neighbor_data[start..start + 4])); } - LoadedNode { + Ok(LoadedNode { fp_vector, adjacency_list, - } + }) } /// Insert a candidate into the sorted retset, maintaining sort order by distance. @@ -284,7 +281,7 @@ pub(crate) fn pipe_search( num_nodes_per_sector, node_len, fp_vector_len, - ); + )?; // Track convergence: is this node still in the top of retset? 
if cur_list_size > 0 { let last_dist = retset[cur_list_size - 1].distance; @@ -455,7 +452,7 @@ pub(crate) fn pipe_search( num_nodes_per_sector, node_len, fp_vector_len, - ); + )?; id_buf_map.insert(io.vertex_id, node); } else { remaining.push_back(io); @@ -682,7 +679,7 @@ mod tests { let node_len = fp_vector_len + 4 + 3 * 4; // vec + count + 3 neighbors let buf = build_sector_buf(0, &fp_vec, &neighbors, 4096); - let node = parse_node(&buf, 0, 1, node_len, fp_vector_len); + let node = parse_node(&buf, 0, 1, node_len, fp_vector_len).unwrap(); assert_eq!(node.fp_vector, fp_vec); assert_eq!(node.adjacency_list, vec![10, 20, 30]); @@ -706,7 +703,7 @@ mod tests { } // Parse node at index 2 (vertex_id=2 within same sector) - let node = parse_node(&buf, 2, num_nodes_per_sector, node_len, fp_vector_len); + let node = parse_node(&buf, 2, num_nodes_per_sector, node_len, fp_vector_len).unwrap(); let expected_fp: Vec = (0..8).map(|b| b + 20).collect(); assert_eq!(node.fp_vector, expected_fp); assert_eq!(node.adjacency_list, vec![102, 202]); @@ -720,7 +717,7 @@ mod tests { let node_len = fp_vector_len + 4; // vec + count only let buf = build_sector_buf(0, &fp_vec, &neighbors, 4096); - let node = parse_node(&buf, 0, 1, node_len, fp_vector_len); + let node = parse_node(&buf, 0, 1, node_len, fp_vector_len).unwrap(); assert_eq!(node.fp_vector, vec![42u8; 16]); assert!(node.adjacency_list.is_empty()); diff --git a/diskann-disk/src/search/pipelined/pipelined_searcher.rs b/diskann-disk/src/search/pipelined/pipelined_searcher.rs index f360692e6..d282d3234 100644 --- a/diskann-disk/src/search/pipelined/pipelined_searcher.rs +++ b/diskann-disk/src/search/pipelined/pipelined_searcher.rs @@ -96,21 +96,31 @@ impl TryAsPooled<&PipelinedScratchArgs<'_>> for PipelinedSearchScratch { /// Each search operates on its own `PipelinedReader` and `PQScratch` (pooled for /// amortized allocation). Shared state (`PQData`, `GraphHeader`) is immutable. pub struct PipelinedSearcher> { + #[allow(dead_code)] graph_header: GraphHeader, distance_comparer: ::Distance, pq_data: Arc, metric: Metric, - /// Maximum IO operations per search (reserved for future IO budget enforcement). - #[allow(dead_code)] - search_io_limit: usize, - /// Default beam width when not overridden per-query. - #[allow(dead_code)] - initial_beam_width: usize, relaxed_monotonicity_l: Option, disk_index_path: String, reader_config: PipelinedReaderConfig, /// Pool of reusable reader + PQ scratch instances. scratch_pool: Arc>, + + // Precomputed values derived from graph_header / pq_data, cached to avoid + // re-derivation on every search() call. + block_size: usize, + #[allow(dead_code)] + num_sectors_per_node: usize, + slot_size: usize, + fp_vector_len: u64, + dims: usize, + node_len: u64, + num_nodes_per_sector: u64, + medoid: u32, + graph_degree: usize, + num_pq_chunks: usize, + num_pq_centers: usize, } impl PipelinedSearcher @@ -123,42 +133,33 @@ where /// * `graph_header` - Graph metadata from the disk index. /// * `pq_data` - Shared PQ data for approximate distance computation. /// * `metric` - Distance metric (L2, InnerProduct, etc.). - /// * `search_io_limit` - Maximum IO operations per search. - /// * `initial_beam_width` - Initial number of concurrent IOs (adapts during search). + /// * `beam_width` - Default beam width used for pool sizing. /// * `relaxed_monotonicity_l` - Optional early termination parameter. /// * `disk_index_path` - Path to the disk index file for creating readers. 
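With the derivation moved into the constructor, a call site now reduces to the following sketch, based on the updated tests in this patch (`index_path` is a placeholder for a real O_DIRECT-capable filesystem path):

```rust
// Post-refactor construction: search_io_limit is gone, and beam_width only
// sizes the scratch pool (max_slots = (beam_width * 2).clamp(16, MAX_IO_CONCURRENCY)).
let searcher = PipelinedSearcher::new(
    graph_header,                     // from DiskVertexProviderFactory::get_header()
    pq_data,                          // shared PQ data from DiskIndexReader
    Metric::L2,
    4,                                // beam_width, used for pool sizing
    None,                             // relaxed_monotonicity_l: no early termination
    index_path,                       // real filesystem path (opened with O_DIRECT)
    PipelinedReaderConfig::default(), // SQ polling disabled
)?;
let result = searcher.search(&query, 10, 40, 4)?;
```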
pub fn new( graph_header: GraphHeader, pq_data: Arc, metric: Metric, - search_io_limit: usize, - initial_beam_width: usize, + beam_width: usize, relaxed_monotonicity_l: Option, disk_index_path: String, config: PipelinedReaderConfig, ) -> ANNResult { let metadata = graph_header.metadata(); let dims = metadata.dims; - let distance_comparer = Data::VectorDataType::distance(metric, Some(dims)); - let node_len = metadata.node_len; let num_nodes_per_sector = metadata.num_nodes_per_block; + let fp_vector_len = + (dims * std::mem::size_of::()) as u64; + let medoid = metadata.medoid as u32; + let distance_comparer = Data::VectorDataType::distance(metric, Some(dims)); - let mut block_size = graph_header.block_size() as usize; - let version = graph_header.layout_version(); - if (version.major_version() == 0 && version.minor_version() == 0) || block_size == 0 { - block_size = 4096; - } - - let num_sectors_per_node = if num_nodes_per_sector > 0 { - 1 - } else { - (node_len as usize).div_ceil(block_size) - }; + let block_size = graph_header.effective_block_size(); + let num_sectors_per_node = graph_header.num_sectors_per_node(); let slot_size = num_sectors_per_node * block_size; let max_slots = - (initial_beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY); + (beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY); let graph_degree = graph_header.max_degree::()?; let num_pq_chunks = pq_data.get_num_chunks(); @@ -182,12 +183,21 @@ where distance_comparer, pq_data, metric, - search_io_limit, - initial_beam_width, relaxed_monotonicity_l, disk_index_path, reader_config: config, scratch_pool, + block_size, + num_sectors_per_node, + slot_size, + fp_vector_len, + dims, + node_len, + num_nodes_per_sector, + medoid, + graph_degree, + num_pq_chunks, + num_pq_centers, }) } @@ -205,41 +215,17 @@ where search_list_size: u32, beam_width: usize, ) -> ANNResult> { - let metadata = self.graph_header.metadata(); - let dims = metadata.dims; - let node_len = metadata.node_len; - let num_nodes_per_sector = metadata.num_nodes_per_block; - let fp_vector_len = - (dims * std::mem::size_of::()) as u64; - let medoid = metadata.medoid as u32; - - let mut block_size = self.graph_header.block_size() as usize; - let version = self.graph_header.layout_version(); - if (version.major_version() == 0 && version.minor_version() == 0) || block_size == 0 { - block_size = 4096; - } - - let num_sectors_per_node = if num_nodes_per_sector > 0 { - 1 - } else { - (node_len as usize).div_ceil(block_size) - }; - - let graph_degree = self.graph_header.max_degree::()?; - let num_pq_chunks = self.pq_data.get_num_chunks(); - let num_pq_centers = self.pq_data.get_num_centers(); let max_slots = (beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY); - let slot_size = num_sectors_per_node * block_size; let args = PipelinedScratchArgs { disk_index_path: &self.disk_index_path, max_slots, - slot_size, - alignment: block_size, - graph_degree, - dims, - num_pq_chunks, - num_pq_centers, + slot_size: self.slot_size, + alignment: self.block_size, + graph_degree: self.graph_degree, + dims: self.dims, + num_pq_chunks: self.num_pq_chunks, + num_pq_centers: self.num_pq_centers, reader_config: self.reader_config.clone(), }; let mut scratch = PoolOption::try_pooled(&self.scratch_pool, &args)?; @@ -256,12 +242,12 @@ where return_list_size as usize, search_list_size as usize, beam_width, - medoid, - dims, - node_len, - num_nodes_per_sector, - block_size, - fp_vector_len, + self.medoid, + self.dims, + self.node_len, + 
self.num_nodes_per_sector,
+            self.block_size,
+            self.fp_vector_len,
             pq_scratch,
             self.relaxed_monotonicity_l,
             self.metric,
@@ -356,7 +342,6 @@ mod tests {
             graph_header,
             pq_data,
             Metric::L2,
-            usize::MAX,
             4,
             None,
             real_index_path.to_str().unwrap().to_string(),
diff --git a/diskann-disk/src/search/provider/disk_provider.rs b/diskann-disk/src/search/provider/disk_provider.rs
index 468ca82c6..3dc6cc896 100644
--- a/diskann-disk/src/search/provider/disk_provider.rs
+++ b/diskann-disk/src/search/provider/disk_provider.rs
@@ -2172,7 +2172,6 @@ mod disk_provider_tests {
             graph_header,
             pq_data,
             Metric::L2,
-            usize::MAX,
             4,
             None,
             real_index_path_str.to_string(),
@@ -2268,7 +2267,6 @@ mod disk_provider_tests {
             graph_header,
             pq_data,
             Metric::L2,
-            usize::MAX,
             4,
             None,
             real_index_path.to_str().unwrap().to_string(),
diff --git a/diskann-disk/src/search/provider/disk_sector_graph.rs b/diskann-disk/src/search/provider/disk_sector_graph.rs
index 1f00ad6db..700cf84df 100644
--- a/diskann-disk/src/search/provider/disk_sector_graph.rs
+++ b/diskann-disk/src/search/provider/disk_sector_graph.rs
@@ -15,9 +15,7 @@ use crate::{
     utils::aligned_file_reader::{traits::AlignedFileReader, AlignedRead},
 };
 
-const DEFAULT_DISK_SECTOR_LEN: usize = 4096;
-
 /// Sector graph read from disk index
 pub struct DiskSectorGraph {
     /// Ensure `sector_reader` is dropped before `sectors_data` by placing it before `sectors_data`.
     /// Graph storage to read sectors
@@ -57,19 +55,11 @@ impl DiskSectorGraph {
         header: &GraphHeader,
         max_n_batch_sector_read: usize,
     ) -> ANNResult<Self> {
-        let mut block_size = header.block_size() as usize;
-        let version = header.layout_version();
-        if (version.major_version() == 0 && version.minor_version() == 0) || block_size == 0 {
-            block_size = DEFAULT_DISK_SECTOR_LEN;
-        }
+        let block_size = header.effective_block_size();
 
         let num_nodes_per_sector = header.metadata().num_nodes_per_block;
         let node_len = header.metadata().node_len;
-        let num_sectors_per_node = if num_nodes_per_sector > 0 {
-            1
-        } else {
-            (node_len as usize).div_ceil(block_size)
-        };
+        let num_sectors_per_node = header.num_sectors_per_node();
 
         Ok(Self {
             sector_reader,
@@ -152,23 +142,21 @@ impl DiskSectorGraph {
     /// Get offset of node in sectors_data
     #[inline]
    fn get_node_offset(&self, vertex_id: u32) -> usize {
-        if self.num_nodes_per_sector == 0 {
-            // multi-sector node
-            0
-        } else {
-            // multi node in a sector
-            (vertex_id as u64 % self.num_nodes_per_sector * self.node_len) as usize
-        }
+        crate::search::sector_math::node_offset_in_sector(
+            vertex_id,
+            self.num_nodes_per_sector,
+            self.node_len,
+        )
     }
 
     #[inline]
     /// Gets the index for the sector that contains the node with the given vertex_id
     pub fn node_sector_index(&self, vertex_id: u32) -> u64 {
-        1 + if self.num_nodes_per_sector > 0 {
-            vertex_id as u64 / self.num_nodes_per_sector
-        } else {
-            vertex_id as u64 * self.num_sectors_per_node as u64
-        }
+        crate::search::sector_math::node_sector_index(
+            vertex_id,
+            self.num_nodes_per_sector,
+            self.num_sectors_per_node,
+        )
     }
 }
 
@@ -190,6 +178,8 @@ mod disk_sector_graph_test {
     use super::*;
     use crate::data_model::{GraphLayoutVersion, GraphMetadata};
 
+    const DEFAULT_DISK_SECTOR_LEN: usize = 4096;
+
     fn test_index_path() -> String {
         test_data_root()
             .join("disk_index_misc/disk_index_siftsmall_learn_256pts_R4_L50_A1.2_aligned_reader_test.index")
diff --git a/diskann-disk/src/search/sector_math.rs b/diskann-disk/src/search/sector_math.rs
new file mode 100644
index 000000000..0313f33ab
--- /dev/null
+++ b/diskann-disk/src/search/sector_math.rs
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Shared sector-layout arithmetic used by both beam search and pipelined search.
+
+/// Compute the sector index that contains the given vertex.
+///
+/// The first sector (index 0) is reserved for the graph header, so data sectors
+/// start at index 1.
+#[inline]
+pub fn node_sector_index(
+    vertex_id: u32,
+    num_nodes_per_sector: u64,
+    num_sectors_per_node: usize,
+) -> u64 {
+    1 + if num_nodes_per_sector > 0 {
+        vertex_id as u64 / num_nodes_per_sector
+    } else {
+        vertex_id as u64 * num_sectors_per_node as u64
+    }
+}
+
+/// Compute the byte offset of a node within its sector.
+#[inline]
+pub fn node_offset_in_sector(vertex_id: u32, num_nodes_per_sector: u64, node_len: u64) -> usize {
+    if num_nodes_per_sector == 0 {
+        0
+    } else {
+        (vertex_id as u64 % num_nodes_per_sector * node_len) as usize
+    }
+}

From c197e5619234d35f6d56f51621eae463c7c2398b Mon Sep 17 00:00:00 2001
From: Philip Adams
Date: Mon, 9 Feb 2026 15:01:08 -0800
Subject: [PATCH 04/46] fix overflow and total order problems

---
 .../src/search/pipelined/pipelined_search.rs | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/diskann-disk/src/search/pipelined/pipelined_search.rs b/diskann-disk/src/search/pipelined/pipelined_search.rs
index 540bbb7a1..6d8b2f92a 100644
--- a/diskann-disk/src/search/pipelined/pipelined_search.rs
+++ b/diskann-disk/src/search/pipelined/pipelined_search.rs
@@ -110,12 +110,7 @@ fn parse_node(
 fn insert_into_pool(retset: &mut Vec<Candidate>, pool_size: &mut usize, candidate: Candidate) -> usize {
     // Binary search for insertion point
     let pos = retset[..*pool_size]
-        .binary_search_by(|probe| {
-            probe
-                .distance
-                .partial_cmp(&candidate.distance)
-                .unwrap_or(std::cmp::Ordering::Equal)
-        })
+        .binary_search_by(|probe| probe.distance.total_cmp(&candidate.distance))
         .unwrap_or_else(|x| x);
 
     // If pool is full and candidate is worse than all existing, don't insert
@@ -172,6 +167,7 @@ pub(crate) fn pipe_search(
     let num_pq_chunks = pq_data.get_num_chunks();
     let pq_compressed = pq_data.pq_compressed_data().get_data();
+    let num_pts = pq_compressed.len() / num_pq_chunks;
 
     let num_sectors_per_node = if num_nodes_per_sector > 0 {
         1
 
@@ -379,7 +375,7 @@ pub(crate) fn pipe_search(
             // Expand neighbors
             let mut nbors_to_compute: Vec<u32> = Vec::new();
             for &nbr_id in &node.adjacency_list {
-                if visited.insert(nbr_id) {
+                if (nbr_id as usize) < num_pts && visited.insert(nbr_id) {
                     nbors_to_compute.push(nbr_id);
                 }
             }
@@ -474,8 +470,8 @@ pub(crate) fn pipe_search(
         }
     }
 
-    // Sort full_retset and return top-k
-    full_retset.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
+    // Sort full_retset and return top-k (total_cmp handles NaN correctly)
+    full_retset.sort_by(|a, b| a.1.total_cmp(&b.1));
 
     // Deduplicate
     let mut ids = Vec::with_capacity(k);

From 338f1dd48906c569cfc20070ced393917259e23d Mon Sep 17 00:00:00 2001
From: Philip Adams
Date: Mon, 9 Feb 2026 20:50:12 -0800
Subject: [PATCH 05/46] wait completions, filtering, relaxed mono drain

---
 .../src/backend/disk_index/search.rs          |  1 +
 diskann-disk/src/build/builder/core.rs        |  2 +-
 .../src/search/pipelined/pipelined_reader.rs  | 17 +++++++++++++++++
 .../src/search/pipelined/pipelined_search.rs  | 19 +++++++++++++------
 .../search/pipelined/pipelined_searcher.rs    | 10 +++++++---
 .../src/search/provider/disk_provider.rs      |  4 ++--
 6 files changed, 41 insertions(+), 12 deletions(-)

diff --git
a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 8f1416d71..890251a02 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -438,6 +438,7 @@ where search_params.recall_at, l, search_params.beam_width, + None, ), search_params.recall_at as usize, stats, diff --git a/diskann-disk/src/build/builder/core.rs b/diskann-disk/src/build/builder/core.rs index 271fe2f9f..9c8fffa61 100644 --- a/diskann-disk/src/build/builder/core.rs +++ b/diskann-disk/src/build/builder/core.rs @@ -1184,7 +1184,7 @@ pub(crate) mod disk_index_builder_tests { }); let result = - pipe_searcher.search(query_data, top_k as u32, search_l, 4)?; + pipe_searcher.search(query_data, top_k as u32, search_l, 4, None)?; let result_ids: Vec = result.results.iter().map(|item| item.vertex_id).collect(); diff --git a/diskann-disk/src/search/pipelined/pipelined_reader.rs b/diskann-disk/src/search/pipelined/pipelined_reader.rs index 3af5d9232..0b922f81b 100644 --- a/diskann-disk/src/search/pipelined/pipelined_reader.rs +++ b/diskann-disk/src/search/pipelined/pipelined_reader.rs @@ -131,6 +131,23 @@ impl PipelinedReader { /// /// Drains already-completed CQEs from the io_uring completion queue. pub fn poll_completions(&mut self) -> ANNResult> { + self.drain_cqes() + } + + /// Block until at least one IO completes, then drain all available CQEs. + /// + /// Use this when [`poll_completions`] returned an empty vec but there are + /// in-flight reads — avoids busy-spinning while waiting for the kernel. + pub fn wait_completions(&mut self) -> ANNResult> { + if self.in_flight == 0 { + return Ok(Vec::new()); + } + self.ring.submit_and_wait(1)?; + self.drain_cqes() + } + + /// Drain all available CQEs from the completion queue. + fn drain_cqes(&mut self) -> ANNResult> { let mut completed = Vec::new(); for cqe in self.ring.completion() { if cqe.result() < 0 { diff --git a/diskann-disk/src/search/pipelined/pipelined_search.rs b/diskann-disk/src/search/pipelined/pipelined_search.rs index 6d8b2f92a..8b1c6bf0f 100644 --- a/diskann-disk/src/search/pipelined/pipelined_search.rs +++ b/diskann-disk/src/search/pipelined/pipelined_search.rs @@ -157,6 +157,7 @@ pub(crate) fn pipe_search( pq_scratch: &mut PQScratch, relaxed_monotonicity_l: Option, metric: Metric, + vector_filter: Option<&(dyn Fn(&u32) -> bool + Send + Sync)>, ) -> ANNResult { let timer = Instant::now(); let mut io_count: u32 = 0; @@ -257,7 +258,8 @@ pub(crate) fn pipe_search( break; } - // Poll completions + // Poll completions (non-blocking). Keeping this non-blocking is critical + // for overlapping IO and compute — blocking here would serialize the pipeline. let io_poll_start = Instant::now(); let completed_slots = reader.poll_completions()?; io_time += io_poll_start.elapsed(); @@ -367,10 +369,13 @@ pub(crate) fn pipe_search( hops += 1; if let Some(node) = id_buf_map.get(&vid) { - // Compute full-precision distance + // Compute full-precision distance; only add to results if + // filter is absent or the node passes the filter predicate. 
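+            // Filtered-out nodes still have their neighbors expanded below,
+            // so the filter narrows the result set without changing traversal.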
let fp_vec: &[T] = bytemuck::cast_slice(&node.fp_vector); let fp_dist = distance_comparer.evaluate_similarity(query, fp_vec); - full_retset.push((vid, fp_dist)); + if vector_filter.map_or(true, |f| f(&vid)) { + full_retset.push((vid, fp_dist)); + } // Expand neighbors let mut nbors_to_compute: Vec = Vec::new(); @@ -433,9 +438,9 @@ pub(crate) fn pipe_search( // In relaxed monotonicity mode: drain remaining IOs and process unvisited nodes if relaxed_monotonicity_l.is_some_and(|l| l > 0) { - // Drain all in-flight IOs + // Drain all in-flight IOs (block until each completes) while !on_flight_ios.is_empty() { - let completed_slots = reader.poll_completions()?; + let completed_slots = reader.wait_completions()?; if !completed_slots.is_empty() { let completed_set: HashSet = completed_slots.into_iter().collect(); let mut remaining = VecDeque::new(); @@ -464,7 +469,9 @@ pub(crate) fn pipe_search( c.visited = true; let fp_vec: &[T] = bytemuck::cast_slice(&node.fp_vector); let fp_dist = distance_comparer.evaluate_similarity(query, fp_vec); - full_retset.push((c.id, fp_dist)); + if vector_filter.map_or(true, |f| f(&c.id)) { + full_retset.push((c.id, fp_dist)); + } } } } diff --git a/diskann-disk/src/search/pipelined/pipelined_searcher.rs b/diskann-disk/src/search/pipelined/pipelined_searcher.rs index d282d3234..852330a16 100644 --- a/diskann-disk/src/search/pipelined/pipelined_searcher.rs +++ b/diskann-disk/src/search/pipelined/pipelined_searcher.rs @@ -208,12 +208,15 @@ where /// * `return_list_size` - Number of results to return (k). /// * `search_list_size` - Size of the candidate pool (L). /// * `beam_width` - Maximum beam width for pipelined IO. + /// * `vector_filter` - Optional predicate; only vertices passing the filter + /// are included in the result set. Graph traversal is unaffected. 
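+    ///
+    /// Illustrative call (the filter binding is an example, not part of the API):
+    ///
+    /// ```ignore
+    /// let keep: &(dyn Fn(&u32) -> bool + Send + Sync) = &|id| *id % 2 == 0;
+    /// let res = searcher.search(&query, 10, 40, 4, Some(keep))?;
+    /// ```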
pub fn search( &self, query: &[Data::VectorDataType], return_list_size: u32, search_list_size: u32, beam_width: usize, + vector_filter: Option<&(dyn Fn(&u32) -> bool + Send + Sync)>, ) -> ANNResult> { let max_slots = (beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY); @@ -251,6 +254,7 @@ where pq_scratch, self.relaxed_monotonicity_l, self.metric, + vector_filter, )?; let query_statistics = QueryStatistics { @@ -367,8 +371,8 @@ mod tests { let searcher = create_test_searcher(); let query = load_test_query(); - let r1 = searcher.search(&query, 10, 40, 4).unwrap(); - let r2 = searcher.search(&query, 10, 40, 4).unwrap(); + let r1 = searcher.search(&query, 10, 40, 4, None).unwrap(); + let r2 = searcher.search(&query, 10, 40, 4, None).unwrap(); assert!(!r1.results.is_empty()); assert!(!r2.results.is_empty()); @@ -387,7 +391,7 @@ mod tests { let results: Vec<_> = (0..4) .into_par_iter() - .map(|_| searcher.search(&query, 10, 40, 4).unwrap()) + .map(|_| searcher.search(&query, 10, 40, 4, None).unwrap()) .collect(); for r in &results { diff --git a/diskann-disk/src/search/provider/disk_provider.rs b/diskann-disk/src/search/provider/disk_provider.rs index 3dc6cc896..60506af20 100644 --- a/diskann-disk/src/search/provider/disk_provider.rs +++ b/diskann-disk/src/search/provider/disk_provider.rs @@ -2197,7 +2197,7 @@ mod disk_provider_tests { let mut total_recall = 0.0f32; for q in 0..num_queries { let query = &query_vector[q * dim..(q + 1) * dim]; - let result = pipe_searcher.search(query, k as u32, l, 4).unwrap(); + let result = pipe_searcher.search(query, k as u32, l, 4, None).unwrap(); let indices: Vec = result.results.iter().map(|item| item.vertex_id).collect(); let truth_slice = &truth_result[q * k..(q + 1) * k]; @@ -2307,7 +2307,7 @@ mod disk_provider_tests { let truth_slice = &truth_ref[q * k..(q + 1) * k]; // Pipe search (runs concurrently with beam search across rayon threads) - let pipe_result = pipe_ref.search(query, k as u32, l, 4).unwrap(); + let pipe_result = pipe_ref.search(query, k as u32, l, 4, None).unwrap(); let pipe_ids: Vec = pipe_result.results.iter().map(|r| r.vertex_id).collect(); // Both should produce results with reasonable overlap. From 7bb919139e46e588b159ace023a43f80e5016d56 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 13:14:39 -0800 Subject: [PATCH 06/46] Queue-based ExpandBeam: unified pipelined search via submit_expand/expand_available Refactors ExpandBeam trait to support pipelined IO through queue semantics: - submit_expand(): queue node IDs for async IO (no-op for non-pipelined) - expand_available(): expand nodes with completed IO (sync for non-pipelined) - has_pending(): check for in-flight IO operations Refactors search_internal() to use the queue-based loop with adaptive beam width and relaxed monotonicity support. Creates PipelinedDiskAccessor implementing ExpandBeam with io_uring-based IO, integrated via DiskIndexSearcher.search_pipelined(). Key advantage over PrefetchBeam approach: single trait instead of two, simpler loop, no leaked abstraction. All 14 existing ExpandBeam impls work unchanged via backwards-compatible defaults. 
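Sketch of the resulting driver loop (simplified; the real loop in
search_internal additionally handles adaptive beam width, stats, and relaxed
monotonicity, and take_closest is shorthand for the beam-selection step):

    while best.has_notvisited_node() || accessor.has_pending() {
        let beam = take_closest(cur_beam_width);
        accessor.submit_expand(beam.iter().copied());   // async IO; no-op when sync
        let n = accessor.expand_available(/* ... */)?;  // only nodes whose IO completed
        // discovered neighbors go back into the candidate pool
    }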
Benchmark results (SIFT-128d, 10 queries): BeamSearch: 597-202 QPS, 89-92% recall PipeSearch: 734-286 QPS, 72-91% recall UnifiedPipeSearch: 624-258 QPS, 93-92% recall --- .../src/backend/disk_index/search.rs | 74 ++ diskann-benchmark/src/inputs/disk.rs | 14 + diskann-disk/src/search/pipelined/mod.rs | 2 + .../search/pipelined/pipelined_searcher.rs | 9 +- .../src/search/provider/disk_provider.rs | 24 +- diskann-disk/src/search/provider/mod.rs | 3 + .../src/search/provider/pipelined_accessor.rs | 673 ++++++++++++++++++ diskann/src/graph/glue.rs | 58 ++ diskann/src/graph/index.rs | 88 ++- diskann/src/graph/misc.rs | 16 + 10 files changed, 929 insertions(+), 32 deletions(-) create mode 100644 diskann-disk/src/search/provider/pipelined_accessor.rs diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 890251a02..270ddf1fc 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -23,6 +23,8 @@ use diskann_disk::{ }; #[cfg(target_os = "linux")] use diskann_disk::search::pipelined::{PipelinedSearcher, PipelinedReaderConfig}; +#[cfg(target_os = "linux")] +use diskann_disk::search::provider::pipelined_accessor::PipelinedConfig; use diskann_providers::storage::StorageReadProvider; use diskann_providers::{ storage::{ @@ -459,6 +461,78 @@ where anyhow::bail!("PipeSearch is only supported on Linux"); } } + SearchMode::UnifiedPipeSearch { sqpoll_idle_ms } => { + #[cfg(target_os = "linux")] + { + let reader_config = PipelinedReaderConfig { + sqpoll_idle_ms: *sqpoll_idle_ms, + }; + + let mut searcher = DiskIndexSearcher::, _>::new( + search_params.num_threads, + search_params.search_io_limit.unwrap_or(usize::MAX), + &index_reader, + vertex_provider_factory, + search_params.distance.into(), + None, + )?; + + searcher.with_pipelined_config(PipelinedConfig { + disk_index_path: disk_index_path.clone(), + reader_config, + beam_width: search_params.beam_width, + }); + + let searcher = &searcher; + + logger.log_checkpoint("index_loaded"); + + search_results_per_l = run_search_loop( + &search_params.search_list, + search_params.recall_at, + search_params.beam_width, + num_queries, + "unified_pipesearch", + &has_any_search_failed, + >_context, + |l, statistics_vec, result_counts, result_ids, result_dists| { + let zipped = queries + .par_row_iter() + .zip(result_ids.par_chunks_mut(search_params.recall_at as usize)) + .zip(result_dists.par_chunks_mut(search_params.recall_at as usize)) + .zip(statistics_vec.par_iter_mut()) + .zip(result_counts.par_iter_mut()); + + zipped.for_each_in_pool( + &pool, + |((((q, id_chunk), dist_chunk), stats), rc)| { + write_query_result( + searcher.search_pipelined( + q, + search_params.recall_at, + l, + search_params.beam_width, + None, + ), + search_params.recall_at as usize, + stats, + rc, + id_chunk, + dist_chunk, + &has_any_search_failed, + "UnifiedPipeSearch", + ); + }, + ); + }, + )?; + } + #[cfg(not(target_os = "linux"))] + { + let _ = sqpoll_idle_ms; + anyhow::bail!("UnifiedPipeSearch is only supported on Linux"); + } + } } // Log search completed checkpoint diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs index d32ee93da..d056035a4 100644 --- a/diskann-benchmark/src/inputs/disk.rs +++ b/diskann-benchmark/src/inputs/disk.rs @@ -89,6 +89,12 @@ pub(crate) enum SearchMode { #[serde(default)] sqpoll_idle_ms: Option, }, + /// Unified pipelined search through the generic search loop (queue-based ExpandBeam). 
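+    ///
+    /// Example JSON (same shape as the other modes; value illustrative):
+    /// `{ "mode": "UnifiedPipeSearch", "sqpoll_idle_ms": 100 }`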
+ UnifiedPipeSearch { + /// Enable kernel-side SQ polling (ms idle timeout). None = disabled. + #[serde(default)] + sqpoll_idle_ms: Option, + }, } impl fmt::Display for SearchMode { @@ -109,6 +115,13 @@ impl fmt::Display for SearchMode { } write!(f, ")") } + SearchMode::UnifiedPipeSearch { sqpoll_idle_ms } => { + write!(f, "UnifiedPipeSearch")?; + if let Some(sq) = sqpoll_idle_ms { + write!(f, "(sqpoll={}ms)", sq)?; + } + Ok(()) + } } } } @@ -290,6 +303,7 @@ impl CheckDeserialization for DiskSearchPhase { anyhow::bail!("initial_beam_width must be positive"); } } + SearchMode::UnifiedPipeSearch { .. } => {} } Ok(()) } diff --git a/diskann-disk/src/search/pipelined/mod.rs b/diskann-disk/src/search/pipelined/mod.rs index 2005c0ce8..26378da4f 100644 --- a/diskann-disk/src/search/pipelined/mod.rs +++ b/diskann-disk/src/search/pipelined/mod.rs @@ -22,6 +22,8 @@ mod pipelined_reader; pub use pipelined_reader::PipelinedReader; #[cfg(target_os = "linux")] pub use pipelined_reader::PipelinedReaderConfig; +#[cfg(target_os = "linux")] +pub use pipelined_reader::MAX_IO_CONCURRENCY; #[cfg(target_os = "linux")] mod pipelined_search; diff --git a/diskann-disk/src/search/pipelined/pipelined_searcher.rs b/diskann-disk/src/search/pipelined/pipelined_searcher.rs index 852330a16..712c879c0 100644 --- a/diskann-disk/src/search/pipelined/pipelined_searcher.rs +++ b/diskann-disk/src/search/pipelined/pipelined_searcher.rs @@ -76,8 +76,12 @@ impl TryAsPooled<&PipelinedScratchArgs<'_>> for PipelinedSearchScratch { /// A pipelined disk index searcher implementing the PipeANN algorithm. /// -/// Analogous to `DiskIndexSearcher` but uses pipelined IO (non-blocking io_uring -/// submit/poll) to overlap IO and compute within a single query. +/// # Deprecation +/// +/// This standalone searcher duplicates the generic search loop. Prefer using +/// `DiskIndexSearcher::search_pipelined()` which integrates pipelined IO via the +/// queue-based `ExpandBeam` trait, providing the same IO/compute overlap without +/// code duplication. /// /// # Safety /// @@ -95,6 +99,7 @@ impl TryAsPooled<&PipelinedScratchArgs<'_>> for PipelinedSearchScratch { /// Multiple concurrent `search()` calls on the same `PipelinedSearcher` are safe. /// Each search operates on its own `PipelinedReader` and `PQScratch` (pooled for /// amortized allocation). Shared state (`PQData`, `GraphHeader`) is immutable. +#[deprecated(note = "Use DiskIndexSearcher::search_pipelined() instead for unified pipelined search")] pub struct PipelinedSearcher> { #[allow(dead_code)] graph_header: GraphHeader, diff --git a/diskann-disk/src/search/provider/disk_provider.rs b/diskann-disk/src/search/provider/disk_provider.rs index 60506af20..f65431097 100644 --- a/diskann-disk/src/search/provider/disk_provider.rs +++ b/diskann-disk/src/search/provider/disk_provider.rs @@ -71,19 +71,19 @@ where Data: GraphDataType, { /// Holds the graph header information that contains metadata about disk-index file. - graph_header: GraphHeader, + pub(crate) graph_header: GraphHeader, // Full precision distance comparer used in post_process to reorder results. - distance_comparer: ::Distance, + pub(crate) distance_comparer: ::Distance, /// The PQ data used for quantization. - pq_data: Arc, + pub(crate) pq_data: Arc, /// The number of points in the graph. - num_points: usize, + pub(crate) num_points: usize, /// Metric used for distance computation. - metric: Metric, + pub(crate) metric: Metric, /// The number of IO operations that can be done in parallel. 
search_io_limit: usize, @@ -373,8 +373,8 @@ where /// The query computer for the disk provider. This is used to compute the distance between the query vector and the PQ coordinates. pub struct DiskQueryComputer { - num_pq_chunks: usize, - query_centroid_l2_distance: Vec, + pub(crate) num_pq_chunks: usize, + pub(crate) query_centroid_l2_distance: Vec, } impl PreprocessedDistanceFunction<&[u8], f32> for DiskQueryComputer { @@ -783,14 +783,18 @@ pub struct DiskIndexSearcher< Data: GraphDataType, ProviderFactory: VertexProviderFactory, { - index: DiskANNIndex>, - runtime: Runtime, + pub(crate) index: DiskANNIndex>, + pub(crate) runtime: Runtime, /// The vertex provider factory is used to create the vertex provider for each search instance. vertex_provider_factory: ProviderFactory, /// Scratch pool for disk search operations that need allocations. scratch_pool: Arc>>, + + /// Optional pipelined search configuration (Linux only, io_uring-based). + #[cfg(target_os = "linux")] + pub(crate) pipelined_config: Option, } #[derive(Debug)] @@ -891,6 +895,8 @@ where runtime, vertex_provider_factory, scratch_pool, + #[cfg(target_os = "linux")] + pipelined_config: None, }) } diff --git a/diskann-disk/src/search/provider/mod.rs b/diskann-disk/src/search/provider/mod.rs index 69f697d84..2a168522a 100644 --- a/diskann-disk/src/search/provider/mod.rs +++ b/diskann-disk/src/search/provider/mod.rs @@ -13,3 +13,6 @@ pub mod disk_provider; pub mod disk_sector_graph; pub mod disk_vertex_provider; pub mod disk_vertex_provider_factory; + +#[cfg(target_os = "linux")] +pub mod pipelined_accessor; diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs new file mode 100644 index 000000000..09730362c --- /dev/null +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -0,0 +1,673 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Queue-based pipelined disk accessor that integrates with the generic search loop +//! via the `ExpandBeam` trait's `submit_expand` / `expand_available` / `has_pending` methods. +//! +//! Instead of duplicating the search loop (like `PipelinedSearcher`), this accessor +//! plugs into `DiskANNIndex::search_internal()` and overlaps IO with computation +//! using io_uring under the hood. + +use std::collections::{HashMap, VecDeque}; +use std::future::Future; +use std::ops::Range; + +use byteorder::{ByteOrder, LittleEndian}; +use diskann::{ + graph::{ + glue::{ExpandBeam, HybridPredicate, IdIterator, SearchExt, SearchPostProcess, SearchStrategy}, + search_output_buffer, AdjacencyList, SearchOutputBuffer, SearchParams, + }, + neighbor::Neighbor, + provider::{Accessor, BuildQueryComputer, DefaultContext, DelegateNeighbor, HasId, NeighborAccessor}, + ANNError, ANNResult, +}; +use diskann_providers::model::{ + compute_pq_distance, graph::traits::GraphDataType, pq::quantizer_preprocess, PQScratch, +}; +use diskann_vector::DistanceFunction; + +use crate::search::pipelined::{PipelinedReader, PipelinedReaderConfig, MAX_IO_CONCURRENCY}; +use crate::search::sector_math::{node_offset_in_sector, node_sector_index}; +use crate::search::traits::VertexProviderFactory; +use crate::utils::QueryStatistics; + +use super::disk_provider::{ + DiskIndexSearcher, DiskProvider, DiskQueryComputer, SearchResult, SearchResultItem, + SearchResultStats, +}; + +/// A loaded node parsed from sector data. +struct LoadedNode { + fp_vector: Vec, + adjacency_list: Vec, +} + +/// Tracks an in-flight IO request. 
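+///
+/// `slot_id` names the registered buffer in the `PipelinedReader` that the read
+/// targets; the entry stays queued until its completion (CQE) is observed in
+/// `drain_completions`.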
+struct InFlightIo { + vertex_id: u32, + slot_id: usize, +} + +/// Parse a node from raw sector buffer bytes. +fn parse_node( + sector_buf: &[u8], + vertex_id: u32, + num_nodes_per_sector: u64, + node_len: u64, + fp_vector_len: u64, +) -> ANNResult { + let offset = node_offset_in_sector(vertex_id, num_nodes_per_sector, node_len); + let end = offset + node_len as usize; + let node_data = sector_buf.get(offset..end).ok_or_else(|| { + ANNError::log_index_error(format_args!( + "Node data out of bounds: vertex {} offset {}..{} in buffer of len {}", + vertex_id, offset, end, sector_buf.len() + )) + })?; + + let fp_vector_len_usize = fp_vector_len as usize; + if fp_vector_len_usize > node_data.len() { + return Err(ANNError::log_index_error(format_args!( + "fp_vector_len {} exceeds node_data len {}", + fp_vector_len_usize, + node_data.len() + ))); + } + + let fp_vector = node_data[..fp_vector_len_usize].to_vec(); + let neighbor_data = &node_data[fp_vector_len_usize..]; + let num_neighbors = LittleEndian::read_u32(&neighbor_data[..4]) as usize; + let max_neighbors = (neighbor_data.len().saturating_sub(4)) / 4; + let num_neighbors = num_neighbors.min(max_neighbors); + let mut adjacency_list = Vec::with_capacity(num_neighbors); + for i in 0..num_neighbors { + let start = 4 + i * 4; + adjacency_list.push(LittleEndian::read_u32(&neighbor_data[start..start + 4])); + } + + Ok(LoadedNode { + fp_vector, + adjacency_list, + }) +} + +/// Max buffer slots to use, based on beam width. +#[inline] +fn max_slots(beam_width: usize) -> usize { + (beam_width * 2).clamp(16, MAX_IO_CONCURRENCY) +} + +/// Pipelined disk accessor that overlaps IO and compute via io_uring. +/// +/// Implements the `ExpandBeam` trait's queue-based methods: +/// - `submit_expand`: submits non-blocking io_uring reads for the given node IDs +/// - `expand_available`: polls for completed reads and expands those nodes +/// - `has_pending`: returns true when IO operations are in-flight +pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { + provider: &'a DiskProvider, + reader: PipelinedReader, + pq_scratch: PQScratch, + query: &'a [Data::VectorDataType], + + // Graph geometry (cached from GraphHeader) + num_nodes_per_sector: u64, + num_sectors_per_node: usize, + block_size: usize, + node_len: u64, + fp_vector_len: u64, + num_points: usize, + + // IO state + in_flight_ios: VecDeque, + loaded_nodes: HashMap, + next_slot_id: usize, + max_slots: usize, + + // Distance cache for post-processing rerank + distance_cache: HashMap, +} + +impl<'a, Data> PipelinedDiskAccessor<'a, Data> +where + Data: GraphDataType, +{ + /// Create a new pipelined disk accessor. 
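+    ///
+    /// Sizes the io_uring slot pool from `beam_width`, opens a reader on
+    /// `disk_index_path`, and preprocesses the PQ distance table for `query`
+    /// (seeded with the medoid).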
+ pub fn new( + provider: &'a DiskProvider, + query: &'a [Data::VectorDataType], + disk_index_path: &str, + beam_width: usize, + reader_config: &PipelinedReaderConfig, + ) -> ANNResult { + let metadata = provider.graph_header.metadata(); + let dims = metadata.dims; + let num_nodes_per_sector = metadata.num_nodes_per_block; + let node_len = metadata.node_len; + let fp_vector_len = (dims * std::mem::size_of::()) as u64; + + let block_size = provider.graph_header.effective_block_size(); + let num_sectors_per_node = provider.graph_header.num_sectors_per_node(); + let slot_size = num_sectors_per_node * block_size; + let slots = max_slots(beam_width); + + let reader = PipelinedReader::new( + disk_index_path, + slots, + slot_size, + block_size, + reader_config, + )?; + + let graph_degree = provider.graph_header.max_degree::()?; + let mut pq_scratch = PQScratch::new( + graph_degree, + dims, + provider.pq_data.get_num_chunks(), + provider.pq_data.get_num_centers(), + )?; + + // Preprocess PQ distance table for this query + let medoid = metadata.medoid as u32; + pq_scratch.set(dims, query, 1.0)?; + quantizer_preprocess(&mut pq_scratch, &provider.pq_data, provider.metric, &[medoid])?; + + Ok(Self { + provider, + reader, + pq_scratch, + query, + num_nodes_per_sector, + num_sectors_per_node, + block_size, + node_len, + fp_vector_len, + num_points: provider.num_points, + in_flight_ios: VecDeque::new(), + loaded_nodes: HashMap::new(), + next_slot_id: 0, + max_slots: slots, + distance_cache: HashMap::new(), + }) + } + + /// Compute PQ distances for a set of neighbor IDs. + fn pq_distances(&mut self, ids: &[u32], mut f: F) -> ANNResult<()> + where + F: FnMut(f32, u32), + { + compute_pq_distance( + ids, + self.provider.pq_data.get_num_chunks(), + &self.pq_scratch.aligned_pqtable_dist_scratch, + self.provider.pq_data.pq_compressed_data().get_data(), + &mut self.pq_scratch.aligned_pq_coord_scratch, + &mut self.pq_scratch.aligned_dist_scratch, + )?; + for (i, id) in ids.iter().enumerate() { + f(self.pq_scratch.aligned_dist_scratch[i], *id); + } + Ok(()) + } + + /// Poll completed IOs and move data from reader buffers into loaded_nodes. + fn drain_completions(&mut self) -> ANNResult<()> { + let completed_slots = if self.in_flight_ios.is_empty() { + Vec::new() + } else { + self.reader.poll_completions()? 
+ }; + + if !completed_slots.is_empty() { + let completed_set: std::collections::HashSet = + completed_slots.into_iter().collect(); + let mut remaining = VecDeque::new(); + while let Some(io) = self.in_flight_ios.pop_front() { + if completed_set.contains(&io.slot_id) { + let sector_buf = self.reader.get_slot_buf(io.slot_id); + let node = parse_node( + sector_buf, + io.vertex_id, + self.num_nodes_per_sector, + self.node_len, + self.fp_vector_len, + )?; + self.loaded_nodes.insert(io.vertex_id, node); + } else { + remaining.push_back(io); + } + } + self.in_flight_ios = remaining; + } + Ok(()) + } +} + +impl HasId for PipelinedDiskAccessor<'_, Data> +where + Data: GraphDataType, +{ + type Id = u32; +} + +impl<'a, Data> Accessor for PipelinedDiskAccessor<'a, Data> +where + Data: GraphDataType, +{ + type Extended = &'a [u8]; + type Element<'b> + = &'a [u8] + where + Self: 'b; + type ElementRef<'b> = &'b [u8]; + type GetError = ANNError; + + fn get_element( + &mut self, + id: Self::Id, + ) -> impl Future, Self::GetError>> + Send { + std::future::ready(self.provider.pq_data.get_compressed_vector(id as usize)) + } +} + +impl IdIterator> for PipelinedDiskAccessor<'_, Data> +where + Data: GraphDataType, +{ + async fn id_iterator(&mut self) -> Result, ANNError> { + Ok(0..self.num_points as u32) + } +} + +/// Delegate for neighbor access (required by AsNeighbor). +pub struct PipelinedNeighborDelegate<'a, 'b, Data: GraphDataType>( + #[allow(dead_code)] &'a mut PipelinedDiskAccessor<'b, Data>, +); + +impl HasId for PipelinedNeighborDelegate<'_, '_, Data> +where + Data: GraphDataType, +{ + type Id = u32; +} + +impl NeighborAccessor for PipelinedNeighborDelegate<'_, '_, Data> +where + Data: GraphDataType, +{ + fn get_neighbors( + self, + _id: Self::Id, + _neighbors: &mut AdjacencyList, + ) -> impl Future> + Send { + // Neighbor expansion is handled by expand_available, not get_neighbors + async { Ok(self) } + } +} + +impl<'a, 'b, Data> DelegateNeighbor<'a> for PipelinedDiskAccessor<'b, Data> +where + Data: GraphDataType, +{ + type Delegate = PipelinedNeighborDelegate<'a, 'b, Data>; + fn delegate_neighbor(&'a mut self) -> Self::Delegate { + PipelinedNeighborDelegate(self) + } +} + +impl BuildQueryComputer<[Data::VectorDataType]> for PipelinedDiskAccessor<'_, Data> +where + Data: GraphDataType, +{ + type QueryComputerError = ANNError; + type QueryComputer = DiskQueryComputer; + + fn build_query_computer( + &self, + _from: &[Data::VectorDataType], + ) -> Result { + Ok(DiskQueryComputer { + num_pq_chunks: self.provider.pq_data.get_num_chunks(), + query_centroid_l2_distance: self + .pq_scratch + .aligned_pqtable_dist_scratch + .as_slice() + .to_vec(), + }) + } + + async fn distances_unordered( + &mut self, + vec_id_itr: Itr, + _computer: &Self::QueryComputer, + f: F, + ) -> Result<(), Self::GetError> + where + F: Send + FnMut(f32, Self::Id), + Itr: Iterator, + { + self.pq_distances(&vec_id_itr.collect::>(), f) + } +} + +impl ExpandBeam<[Data::VectorDataType]> for PipelinedDiskAccessor<'_, Data> +where + Data: GraphDataType, +{ + /// Submit non-blocking io_uring reads for the given node IDs. 
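+    /// Slot buffers are assigned round-robin from the reader's fixed pool;
+    /// IDs already present in `loaded_nodes` are skipped.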
+ fn submit_expand(&mut self, ids: impl Iterator + Send) { + for id in ids { + if self.loaded_nodes.contains_key(&id) { + continue; // Already loaded + } + let sector_idx = + node_sector_index(id, self.num_nodes_per_sector, self.num_sectors_per_node); + let sector_offset = sector_idx * self.block_size as u64; + let slot_id = self.next_slot_id % self.max_slots; + // Best-effort: if submission fails, the node will be retried + if self.reader.submit_read(sector_offset, slot_id).is_ok() { + self.in_flight_ios.push_back(InFlightIo { + vertex_id: id, + slot_id, + }); + self.next_slot_id = (self.next_slot_id + 1) % self.max_slots; + } + } + } + + /// Poll for completed reads and expand nodes whose data is available. + fn expand_available( + &mut self, + _ids: impl Iterator + Send, + _computer: &Self::QueryComputer, + mut pred: P, + mut on_neighbors: F, + ) -> impl std::future::Future> + Send + where + P: HybridPredicate + Send + Sync, + F: FnMut(f32, Self::Id) + Send, + { + async move { + // Poll completions + self.drain_completions()?; + + // If nothing is loaded yet and we have in-flight IO, wait for at least one + if self.loaded_nodes.is_empty() && !self.in_flight_ios.is_empty() { + let completed = self.reader.wait_completions()?; + if !completed.is_empty() { + let completed_set: std::collections::HashSet = + completed.into_iter().collect(); + let mut remaining = VecDeque::new(); + while let Some(io) = self.in_flight_ios.pop_front() { + if completed_set.contains(&io.slot_id) { + let sector_buf = self.reader.get_slot_buf(io.slot_id); + let node = parse_node( + sector_buf, + io.vertex_id, + self.num_nodes_per_sector, + self.node_len, + self.fp_vector_len, + )?; + self.loaded_nodes.insert(io.vertex_id, node); + } else { + remaining.push_back(io); + } + } + self.in_flight_ios = remaining; + } + } + + // Expand loaded nodes: get neighbors, compute PQ distances + let loaded_ids: Vec = self.loaded_nodes.keys().copied().collect(); + let mut expanded = 0; + + for vid in loaded_ids { + let node = match self.loaded_nodes.remove(&vid) { + Some(n) => n, + None => continue, + }; + + // Compute full-precision distance and cache it for post-processing + let fp_vec: &[Data::VectorDataType] = bytemuck::cast_slice(&node.fp_vector); + let fp_dist = self + .provider + .distance_comparer + .evaluate_similarity(self.query, fp_vec); + self.distance_cache.insert(vid, fp_dist); + + // Get unvisited neighbors + let neighbors: Vec = node + .adjacency_list + .iter() + .copied() + .filter(|&nbr| (nbr as usize) < self.num_points && pred.eval_mut(&nbr)) + .collect(); + + if !neighbors.is_empty() { + self.pq_distances(&neighbors, &mut on_neighbors)?; + } + + expanded += 1; + } + + Ok(expanded) + } + } + + /// Returns true when there are in-flight IO operations. + fn has_pending(&self) -> bool { + !self.in_flight_ios.is_empty() || !self.loaded_nodes.is_empty() + } +} + +impl SearchExt for PipelinedDiskAccessor<'_, Data> +where + Data: GraphDataType, +{ + async fn starting_points(&self) -> ANNResult> { + let start_vertex_id = self.provider.graph_header.metadata().medoid as u32; + Ok(vec![start_vertex_id]) + } + + fn terminate_early(&mut self) -> bool { + false + } +} + +// --------------------------------------------------------------------------- +// SearchStrategy + PostProcessor for pipelined search +// --------------------------------------------------------------------------- + +/// Configuration for creating a pipelined search through DiskIndexSearcher. 
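+///
+/// `beam_width` here only sizes each search's reader slot pool; the beam width
+/// used during traversal is the one passed to `search_pipelined()`.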
+#[derive(Debug, Clone)] +pub struct PipelinedConfig { + pub disk_index_path: String, + pub reader_config: PipelinedReaderConfig, + pub beam_width: usize, +} + +/// Search strategy that creates PipelinedDiskAccessor instances. +pub struct PipelinedSearchStrategy<'a, Data: GraphDataType> { + query: &'a [Data::VectorDataType], + config: &'a PipelinedConfig, + vector_filter: &'a (dyn Fn(&u32) -> bool + Send + Sync), +} + +/// Post-processor for pipelined search that reranks using cached full-precision distances. +#[derive(Clone, Copy)] +pub struct PipelinedPostProcessor<'a> { + filter: &'a (dyn Fn(&u32) -> bool + Send + Sync), +} + +impl SearchPostProcess< + PipelinedDiskAccessor<'_, Data>, + [Data::VectorDataType], + (u32, Data::AssociatedDataType), +> for PipelinedPostProcessor<'_> +where + Data: GraphDataType, +{ + type Error = ANNError; + + async fn post_process( + &self, + accessor: &mut PipelinedDiskAccessor<'_, Data>, + _query: &[Data::VectorDataType], + _computer: &DiskQueryComputer, + candidates: I, + output: &mut B, + ) -> Result + where + I: Iterator> + Send, + B: SearchOutputBuffer<(u32, Data::AssociatedDataType)> + Send + ?Sized, + { + let mut reranked: Vec<((u32, Data::AssociatedDataType), f32)> = candidates + .map(|n| n.id) + .filter(|id| (self.filter)(id)) + .filter_map(|id| { + accessor + .distance_cache + .get(&id) + .map(|&dist| ((id, Data::AssociatedDataType::default()), dist)) + }) + .collect(); + + reranked.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); + Ok(output.extend(reranked)) + } +} + +impl<'this, Data> SearchStrategy< + DiskProvider, + [Data::VectorDataType], + (u32, Data::AssociatedDataType), +> for PipelinedSearchStrategy<'this, Data> +where + Data: GraphDataType, +{ + type QueryComputer = DiskQueryComputer; + type SearchAccessor<'a> = PipelinedDiskAccessor<'a, Data>; + type SearchAccessorError = ANNError; + type PostProcessor = PipelinedPostProcessor<'this>; + + fn search_accessor<'a>( + &'a self, + provider: &'a DiskProvider, + _context: &DefaultContext, + ) -> Result, Self::SearchAccessorError> { + PipelinedDiskAccessor::new( + provider, + self.query, + &self.config.disk_index_path, + self.config.beam_width, + &self.config.reader_config, + ) + } + + fn post_processor(&self) -> Self::PostProcessor { + PipelinedPostProcessor { + filter: self.vector_filter, + } + } +} + +// --------------------------------------------------------------------------- +// DiskIndexSearcher integration (search_pipelined method) +// --------------------------------------------------------------------------- + +impl DiskIndexSearcher +where + Data: GraphDataType, + ProviderFactory: VertexProviderFactory, +{ + /// Attach a pipelined configuration to this searcher. + pub fn with_pipelined_config(&mut self, config: PipelinedConfig) { + self.pipelined_config = Some(config); + } + + /// Perform a pipelined search through the unified search loop. + /// + /// Requires that `with_pipelined_config()` was called first. 
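+    ///
+    /// ```ignore
+    /// // Illustrative usage; assumes `searcher` was configured above.
+    /// let result = searcher.search_pipelined(&query, 10, 40, 4, None)?;
+    /// ```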
+ pub fn search_pipelined( + &self, + query: &[Data::VectorDataType], + return_list_size: u32, + search_list_size: u32, + beam_width: usize, + vector_filter: Option<&(dyn Fn(&u32) -> bool + Send + Sync)>, + ) -> ANNResult> { + let config = self + .pipelined_config + .as_ref() + .ok_or_else(|| ANNError::log_index_error("pipelined_config not set"))?; + + let default_filter: Box bool + Send + Sync> = Box::new(|_| true); + let filter: &(dyn Fn(&u32) -> bool + Send + Sync) = + vector_filter.unwrap_or(default_filter.as_ref()); + + let strategy = PipelinedSearchStrategy { + query, + config, + vector_filter: filter, + }; + + let search_params = SearchParams::new( + return_list_size as usize, + search_list_size as usize, + Some(beam_width), + )? + .with_adaptive_beam_width(); + + let mut indices = vec![0u32; return_list_size as usize]; + let mut distances = vec![0f32; return_list_size as usize]; + let mut associated_data = + vec![Data::AssociatedDataType::default(); return_list_size as usize]; + let mut result_output_buffer = search_output_buffer::IdDistanceAssociatedData::new( + &mut indices[..], + &mut distances[..], + &mut associated_data[..], + ); + + let mut query_stats = QueryStatistics::default(); + let timer = std::time::Instant::now(); + + // Preprocess PQ distance table: the accessor's build_query_computer relies + // on the pq_scratch having been preprocessed for this query. + let stats = self.runtime.block_on(self.index.search( + &strategy, + &DefaultContext, + query, + &search_params, + &mut result_output_buffer, + ))?; + + query_stats.total_comparisons = stats.cmps; + query_stats.search_hops = stats.hops; + query_stats.total_execution_time_us = timer.elapsed().as_micros(); + + let mut search_result = SearchResult { + results: Vec::with_capacity(return_list_size as usize), + stats: SearchResultStats { + cmps: stats.cmps, + result_count: stats.result_count, + query_statistics: query_stats, + }, + }; + + for ((vertex_id, distance), data) in indices + .into_iter() + .zip(distances.into_iter()) + .zip(associated_data.into_iter()) + { + search_result.results.push(SearchResultItem { + vertex_id, + distance, + data, + }); + } + + Ok(search_result) + } +} diff --git a/diskann/src/graph/glue.rs b/diskann/src/graph/glue.rs index 411a97031..1c942a7fc 100644 --- a/diskann/src/graph/glue.rs +++ b/diskann/src/graph/glue.rs @@ -255,6 +255,64 @@ pub trait ExpandBeam: BuildQueryComputer + AsNeighbor + Sized where T: ?Sized, { + /// Submit IDs to the expansion queue. + /// + /// For non-pipelined providers (default), IDs are stored in an internal buffer and + /// processed synchronously in [`expand_available`]. For pipelined providers, this + /// submits non-blocking IO requests (e.g., io_uring reads) so that data loading + /// overlaps with other computation. + /// + /// The default implementation delegates to [`expand_beam`] from within + /// [`expand_available`], so overriding this method is only necessary for pipelined + /// providers that need to separate submission from completion. + fn submit_expand(&mut self, _ids: impl Iterator + Send) { + // Default: no-op. IDs are passed directly to expand_beam in expand_available. + } + + /// Expand nodes whose data is available, invoking `on_neighbors` for each discovered + /// neighbor. + /// + /// For non-pipelined providers (default), this expands all the `ids` passed in + /// synchronously via [`expand_beam`]. 
For pipelined providers, this polls for + /// completed IO operations and expands only the nodes whose data has arrived, + /// returning immediately without blocking. + /// + /// Returns the number of nodes that were expanded in this call. + fn expand_available( + &mut self, + ids: impl Iterator + Send, + computer: &Self::QueryComputer, + pred: P, + on_neighbors: F, + ) -> impl std::future::Future> + Send + where + P: HybridPredicate + Send + Sync, + F: FnMut(f32, Self::Id) + Send, + { + async move { + let id_vec: Vec = ids.collect(); + let count = id_vec.len(); + self.expand_beam(id_vec.into_iter(), computer, pred, on_neighbors) + .await?; + Ok(count) + } + } + + /// Returns true if there are submitted but not-yet-expanded nodes pending. + /// + /// For non-pipelined providers (default), this always returns `false` since + /// [`expand_available`] processes everything synchronously. Pipelined providers + /// return `true` when IO operations are in-flight. + fn has_pending(&self) -> bool { + false + } + + /// Expand all `ids` synchronously: load data, get neighbors, compute distances. + /// + /// This is the original single-shot expansion method. For non-pipelined providers, + /// the default [`expand_available`] delegates to this. Pipelined providers may + /// override [`submit_expand`] and [`expand_available`] instead and leave this as + /// the default. fn expand_beam( &mut self, ids: Itr, diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index ea48adc0b..f5490de50 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -398,8 +398,10 @@ where let mut search_record = VisitedSearchRecord::new(self.estimate_visited_set_capacity(Some(search_l))); + let default_params = SearchParams::new(1, scratch.best.search_l(), None) + .expect("valid default search params"); self.search_internal( - None, // beam_width + &default_params, &start_ids, &mut accessor, &computer, @@ -522,8 +524,10 @@ where self.estimate_visited_set_capacity(Some(scratch.best.search_l())), ); + let default_params = SearchParams::new(1, scratch.best.search_l(), None) + .expect("valid default search params"); self.search_internal( - None, // beam_width + &default_params, &start_ids, &mut accessor, &computer, @@ -1330,8 +1334,10 @@ where let mut scratch = self.search_scratch(l_value, start_ids.len()); + let default_params = SearchParams::new(1, scratch.best.search_l(), None) + .expect("valid default search params"); self.search_internal( - None, // beam_width + &default_params, &start_ids, &mut search_accessor, &computer, @@ -2063,7 +2069,7 @@ where // A is the accessor type, T is the query type used for BuildQueryComputer fn search_internal( &self, - beam_width: Option, + search_params: &SearchParams, start_ids: &[DP::InternalId], accessor: &mut A, computer: &A::QueryComputer, @@ -2077,7 +2083,17 @@ where Q: NeighborQueue, { async move { - let beam_width = beam_width.unwrap_or(1); + let beam_width = search_params.beam_width.unwrap_or(1); + + // Adaptive beam width: start smaller and grow based on convergence + let mut cur_beam_width = if search_params.adaptive_beam_width { + beam_width.min(4) + } else { + beam_width + }; + + // Relaxed monotonicity: continue exploring after convergence + let mut converge_size: Option = None; // paged search can call search_internal multiple times, we only need to initialize // state if not already initialized. 
@@ -2095,20 +2111,30 @@ where } let mut neighbors = Vec::with_capacity(self.max_degree_with_slack()); - while scratch.best.has_notvisited_node() && !accessor.terminate_early() { - scratch.beam_nodes.clear(); - // In this loop we are going to find the beam_width number of nodes that are closest to the query. - // Each of these nodes will be a frontier node. - while scratch.best.has_notvisited_node() && scratch.beam_nodes.len() < beam_width { + while (scratch.best.has_notvisited_node() || accessor.has_pending()) + && !accessor.terminate_early() + { + // Select beam_width closest unvisited nodes + scratch.beam_nodes.clear(); + let available = cur_beam_width.saturating_sub( + if accessor.has_pending() { cur_beam_width / 2 } else { 0 } + ); + while scratch.best.has_notvisited_node() + && scratch.beam_nodes.len() < available + { let closest_node = scratch.best.closest_notvisited(); search_record.record(closest_node, scratch.hops, scratch.cmps); scratch.beam_nodes.push(closest_node.id); } + // Submit to expansion queue (no-op for non-pipelined) + accessor.submit_expand(scratch.beam_nodes.iter().copied()); + + // Expand whatever is available (all for non-pipelined, completed IO for pipelined) neighbors.clear(); - accessor - .expand_beam( + let expanded = accessor + .expand_available( scratch.beam_nodes.iter().copied(), computer, glue::NotInMut::new(&mut scratch.visited), @@ -2116,16 +2142,32 @@ where ) .await?; - // The predicate ensures that the contents of `neighbors` are unique. - // - // We insert into the priority queue outside of the expansion for - // code-locality purposes. neighbors .iter() .for_each(|neighbor| scratch.best.insert(*neighbor)); scratch.cmps += neighbors.len() as u32; - scratch.hops += scratch.beam_nodes.len() as u32; + scratch.hops += expanded as u32; + + // Adaptive beam width + if search_params.adaptive_beam_width && expanded > 0 { + // All expanded nodes are useful by definition + cur_beam_width = (cur_beam_width + 1).max(4).min(beam_width); + } + + // Relaxed monotonicity: detect convergence and extend search + if let Some(rm_l) = search_params.relaxed_monotonicity_l { + if rm_l > 0 { + if !scratch.best.has_notvisited_node() && converge_size.is_none() { + converge_size = Some(scratch.cmps as usize); + } + if let Some(cs) = converge_size { + if (scratch.cmps as usize) >= cs + rm_l { + break; + } + } + } + } } Ok(InternalSearchStats { @@ -2418,7 +2460,7 @@ where let stats = self .search_internal( - search_params.beam_width, + search_params, &start_ids, &mut accessor, &computer, @@ -2615,9 +2657,11 @@ where let mut scratch = self.search_scratch(search_params.starting_l_value, start_ids.len()); + let range_default_params = SearchParams::new(1, scratch.best.search_l(), search_params.beam_width) + .expect("valid default search params"); let initial_stats = self .search_internal( - search_params.beam_width, + &range_default_params, &start_ids, &mut accessor, &computer, @@ -2964,8 +3008,10 @@ where .into_ann_result()?; let start_ids = accessor.starting_points().await?; + let default_params = SearchParams::new(1, search_state.scratch.best.search_l(), None) + .expect("valid default search params"); self.search_internal( - None, // beam_width + &default_params, &start_ids, &mut accessor, &search_state.extra.1, @@ -3713,7 +3759,7 @@ where let stats = self .search_internal( - search_params.beam_width, + search_params, &start_ids, &mut accessor, &computer, diff --git a/diskann/src/graph/misc.rs b/diskann/src/graph/misc.rs index 8c58f6edb..95bf93363 100644 --- 
a/diskann/src/graph/misc.rs +++ b/diskann/src/graph/misc.rs @@ -41,6 +41,10 @@ pub struct SearchParams { pub k_value: usize, pub l_value: usize, pub beam_width: Option, + /// Enable adaptive beam width based on waste ratio tracking. + pub adaptive_beam_width: bool, + /// Optional relaxed monotonicity parameter. + pub relaxed_monotonicity_l: Option, } #[derive(Debug, Error)] @@ -80,12 +84,24 @@ impl SearchParams { k_value, l_value, beam_width, + adaptive_beam_width: false, + relaxed_monotonicity_l: None, }) } pub fn new_default(k_value: usize, l_value: usize) -> Result { SearchParams::new(k_value, l_value, None) } + + pub fn with_adaptive_beam_width(mut self) -> Self { + self.adaptive_beam_width = true; + self + } + + pub fn with_relaxed_monotonicity(mut self, l: usize) -> Self { + self.relaxed_monotonicity_l = Some(l); + self + } } // Parameters for the search algorithm From 3d1ade73955762b8b81834b91826b53462ba9185 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 14:06:53 -0800 Subject: [PATCH 07/46] Add node cache integration and IO tracking to PipelinedDiskAccessor - submit_expand() now checks the shared node cache (Arc>) before issuing io_uring reads. Cached nodes are served instantly without disk IO, matching the DiskAccessor's CachedDiskVertexProvider behavior. - Added io_count and cache_hits tracking with shared PipelinedIoStats (atomic counters) so search_pipelined() can populate QueryStatistics accurately. IO counts now show in benchmark output. - PipelinedConfig now carries Arc> and benchmark extracts the cache from DiskVertexProviderFactory before passing it. --- .../src/backend/disk_index/search.rs | 9 ++ .../src/search/provider/disk_provider.rs | 2 +- .../src/search/provider/pipelined_accessor.rs | 96 ++++++++++++++++++- 3 files changed, 101 insertions(+), 6 deletions(-) diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 270ddf1fc..53cb98d5b 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -464,10 +464,18 @@ where SearchMode::UnifiedPipeSearch { sqpoll_idle_ms } => { #[cfg(target_os = "linux")] { + use diskann_disk::data_model::Cache; + let reader_config = PipelinedReaderConfig { sqpoll_idle_ms: *sqpoll_idle_ms, }; + // Extract the node cache before moving vertex_provider_factory into the searcher + let node_cache: Arc>> = vertex_provider_factory + .cache + .clone() + .unwrap_or_else(|| Arc::new(Cache::new(0, 0).expect("empty cache"))); + let mut searcher = DiskIndexSearcher::, _>::new( search_params.num_threads, search_params.search_io_limit.unwrap_or(usize::MAX), @@ -481,6 +489,7 @@ where disk_index_path: disk_index_path.clone(), reader_config, beam_width: search_params.beam_width, + node_cache, }); let searcher = &searcher; diff --git a/diskann-disk/src/search/provider/disk_provider.rs b/diskann-disk/src/search/provider/disk_provider.rs index f65431097..b052f397f 100644 --- a/diskann-disk/src/search/provider/disk_provider.rs +++ b/diskann-disk/src/search/provider/disk_provider.rs @@ -794,7 +794,7 @@ pub struct DiskIndexSearcher< /// Optional pipelined search configuration (Linux only, io_uring-based). 
#[cfg(target_os = "linux")] - pub(crate) pipelined_config: Option, + pub(crate) pipelined_config: Option>, } #[derive(Debug)] diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 09730362c..8da82aa92 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -13,6 +13,8 @@ use std::collections::{HashMap, VecDeque}; use std::future::Future; use std::ops::Range; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Arc; use byteorder::{ByteOrder, LittleEndian}; use diskann::{ @@ -29,6 +31,7 @@ use diskann_providers::model::{ }; use diskann_vector::DistanceFunction; +use crate::data_model::Cache; use crate::search::pipelined::{PipelinedReader, PipelinedReaderConfig, MAX_IO_CONCURRENCY}; use crate::search::sector_math::{node_offset_in_sector, node_sector_index}; use crate::search::traits::VertexProviderFactory; @@ -120,6 +123,9 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { fp_vector_len: u64, num_points: usize, + // Node cache (shared, read-only) for avoiding disk IO on hot nodes + node_cache: Arc>, + // IO state in_flight_ios: VecDeque, loaded_nodes: HashMap, @@ -128,6 +134,12 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { // Distance cache for post-processing rerank distance_cache: HashMap, + + // IO statistics + io_count: u32, + cache_hits: u32, + // Shared stats written on drop so caller can read them after search + shared_io_stats: Arc, } impl<'a, Data> PipelinedDiskAccessor<'a, Data> @@ -141,6 +153,8 @@ where disk_index_path: &str, beam_width: usize, reader_config: &PipelinedReaderConfig, + node_cache: Arc>, + shared_io_stats: Arc, ) -> ANNResult { let metadata = provider.graph_header.metadata(); let dims = metadata.dims; @@ -185,11 +199,15 @@ where node_len, fp_vector_len, num_points: provider.num_points, + node_cache, in_flight_ios: VecDeque::new(), loaded_nodes: HashMap::new(), next_slot_id: 0, max_slots: slots, distance_cache: HashMap::new(), + io_count: 0, + cache_hits: 0, + shared_io_stats, }) } @@ -212,6 +230,16 @@ where Ok(()) } + /// Returns the number of disk IO operations performed. + pub fn io_count(&self) -> u32 { + self.io_count + } + + /// Returns the number of cache hits (nodes served from cache without IO). + pub fn cache_hits(&self) -> u32 { + self.cache_hits + } + /// Poll completed IOs and move data from reader buffers into loaded_nodes. fn drain_completions(&mut self) -> ANNResult<()> { let completed_slots = if self.in_flight_ios.is_empty() { @@ -357,11 +385,27 @@ where Data: GraphDataType, { /// Submit non-blocking io_uring reads for the given node IDs. + /// Nodes found in the node cache are placed directly into `loaded_nodes`, + /// skipping disk IO entirely. fn submit_expand(&mut self, ids: impl Iterator + Send) { for id in ids { if self.loaded_nodes.contains_key(&id) { - continue; // Already loaded + continue; // Already loaded from a previous IO } + + // Check node cache first — if the node is cached, build a LoadedNode + // from the cache and skip IO entirely. 
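+            // Both the vector and the adjacency list must be present in the
+            // cache; a partial hit falls through to a normal disk read below.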
+ if let (Some(vec_data), Some(adj_list)) = ( + self.node_cache.get_vector(&id), + self.node_cache.get_adjacency_list(&id), + ) { + let fp_vector: Vec = bytemuck::cast_slice(vec_data).to_vec(); + let adjacency_list: Vec = adj_list.iter().copied().collect(); + self.loaded_nodes.insert(id, LoadedNode { fp_vector, adjacency_list }); + self.cache_hits += 1; + continue; + } + let sector_idx = node_sector_index(id, self.num_nodes_per_sector, self.num_sectors_per_node); let sector_offset = sector_idx * self.block_size as u64; @@ -373,6 +417,7 @@ where slot_id, }); self.next_slot_id = (self.next_slot_id + 1) % self.max_slots; + self.io_count += 1; } } } @@ -476,23 +521,56 @@ where } } +impl Drop for PipelinedDiskAccessor<'_, Data> +where + Data: GraphDataType, +{ + fn drop(&mut self) { + self.shared_io_stats + .io_count + .fetch_add(self.io_count, Ordering::Relaxed); + self.shared_io_stats + .cache_hits + .fetch_add(self.cache_hits, Ordering::Relaxed); + } +} + // --------------------------------------------------------------------------- // SearchStrategy + PostProcessor for pipelined search // --------------------------------------------------------------------------- /// Configuration for creating a pipelined search through DiskIndexSearcher. -#[derive(Debug, Clone)] -pub struct PipelinedConfig { +pub struct PipelinedConfig> { pub disk_index_path: String, pub reader_config: PipelinedReaderConfig, pub beam_width: usize, + /// Shared node cache. Nodes found here skip disk IO entirely. + pub node_cache: Arc>, +} + +/// Shared IO statistics written by the accessor and read by the caller after search. +/// Uses atomics so the accessor (which lives inside search_internal) can write stats +/// that the caller can read after the search completes. +pub struct PipelinedIoStats { + pub io_count: AtomicU32, + pub cache_hits: AtomicU32, +} + +impl Default for PipelinedIoStats { + fn default() -> Self { + Self { + io_count: AtomicU32::new(0), + cache_hits: AtomicU32::new(0), + } + } } /// Search strategy that creates PipelinedDiskAccessor instances. pub struct PipelinedSearchStrategy<'a, Data: GraphDataType> { query: &'a [Data::VectorDataType], - config: &'a PipelinedConfig, + config: &'a PipelinedConfig, vector_filter: &'a (dyn Fn(&u32) -> bool + Send + Sync), + io_stats: Arc, } /// Post-processor for pipelined search that reranks using cached full-precision distances. @@ -563,6 +641,8 @@ where &self.config.disk_index_path, self.config.beam_width, &self.config.reader_config, + self.config.node_cache.clone(), + self.io_stats.clone(), ) } @@ -583,7 +663,7 @@ where ProviderFactory: VertexProviderFactory, { /// Attach a pipelined configuration to this searcher. 
- pub fn with_pipelined_config(&mut self, config: PipelinedConfig) { + pub fn with_pipelined_config(&mut self, config: PipelinedConfig) { self.pipelined_config = Some(config); } @@ -607,10 +687,13 @@ where let filter: &(dyn Fn(&u32) -> bool + Send + Sync) = vector_filter.unwrap_or(default_filter.as_ref()); + let io_stats = Arc::new(PipelinedIoStats::default()); + let strategy = PipelinedSearchStrategy { query, config, vector_filter: filter, + io_stats: io_stats.clone(), }; let search_params = SearchParams::new( @@ -646,6 +729,9 @@ where query_stats.total_comparisons = stats.cmps; query_stats.search_hops = stats.hops; query_stats.total_execution_time_us = timer.elapsed().as_micros(); + query_stats.total_io_operations = io_stats.io_count.load(Ordering::Relaxed); + query_stats.total_vertices_loaded = + io_stats.io_count.load(Ordering::Relaxed) + io_stats.cache_hits.load(Ordering::Relaxed); let mut search_result = SearchResult { results: Vec::with_capacity(return_list_size as usize), From 585bfd66e7042a7c249a0069c2b458feceab4311 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 14:34:12 -0800 Subject: [PATCH 08/46] Fix slot overflow and heap corruption in pipelined IO - Guard submit_expand against reusing io_uring slots still in-flight - Add Drop impl to PipelinedReader to drain all in-flight IOs before freeing buffers (prevents kernel DMA into freed memory) - Use f32::total_cmp in post-processor sort for NaN safety --- .../src/search/pipelined/pipelined_reader.rs | 20 +++++++++++++++++++ .../src/search/provider/pipelined_accessor.rs | 8 +++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/diskann-disk/src/search/pipelined/pipelined_reader.rs b/diskann-disk/src/search/pipelined/pipelined_reader.rs index 0b922f81b..1722db54b 100644 --- a/diskann-disk/src/search/pipelined/pipelined_reader.rs +++ b/diskann-disk/src/search/pipelined/pipelined_reader.rs @@ -185,4 +185,24 @@ impl PipelinedReader { pub fn slot_size(&self) -> usize { self.slot_size } + + /// Drain all in-flight IOs, blocking until they complete. + /// Must be called before freeing the slot buffers. + fn drain_all(&mut self) { + while self.in_flight > 0 { + let _ = self.ring.submit_and_wait(1); + for cqe in self.ring.completion() { + let _ = cqe; + self.in_flight = self.in_flight.saturating_sub(1); + } + } + } +} + +impl Drop for PipelinedReader { + fn drop(&mut self) { + // Must wait for all in-flight kernel IOs to complete before freeing + // the slot buffers — otherwise the kernel may DMA into freed memory. + self.drain_all(); + } } diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 8da82aa92..348119a9c 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -406,6 +406,12 @@ where continue; } + // Don't submit if all io_uring slots are occupied — prevents overwriting + // buffers that still have in-flight reads. 
+ if self.in_flight_ios.len() >= self.max_slots { + break; + } + let sector_idx = node_sector_index(id, self.num_nodes_per_sector, self.num_sectors_per_node); let sector_offset = sector_idx * self.block_size as u64; @@ -612,7 +618,7 @@ where }) .collect(); - reranked.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); + reranked.sort_unstable_by(|a, b| a.1.total_cmp(&b.1)); Ok(output.extend(reranked)) } } From ef026419186f8a99eccde6fd5e30dcfe4297fc15 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 14:44:08 -0800 Subject: [PATCH 09/46] Expose relaxed_monotonicity_l for UnifiedPipeSearch Wire the existing relaxed monotonicity support in search_internal through PipelinedConfig and the benchmark SearchMode enum, giving UnifiedPipeSearch feature parity with PipeSearch. --- .../src/backend/disk_index/search.rs | 5 +++-- diskann-benchmark/src/inputs/disk.rs | 19 ++++++++++++++++--- .../src/search/provider/pipelined_accessor.rs | 9 ++++++++- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 53cb98d5b..9e8f3897d 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -461,7 +461,7 @@ where anyhow::bail!("PipeSearch is only supported on Linux"); } } - SearchMode::UnifiedPipeSearch { sqpoll_idle_ms } => { + SearchMode::UnifiedPipeSearch { relaxed_monotonicity_l, sqpoll_idle_ms } => { #[cfg(target_os = "linux")] { use diskann_disk::data_model::Cache; @@ -489,6 +489,7 @@ where disk_index_path: disk_index_path.clone(), reader_config, beam_width: search_params.beam_width, + relaxed_monotonicity_l: *relaxed_monotonicity_l, node_cache, }); @@ -538,7 +539,7 @@ where } #[cfg(not(target_os = "linux"))] { - let _ = sqpoll_idle_ms; + let _ = (relaxed_monotonicity_l, sqpoll_idle_ms); anyhow::bail!("UnifiedPipeSearch is only supported on Linux"); } } diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs index d056035a4..54c395073 100644 --- a/diskann-benchmark/src/inputs/disk.rs +++ b/diskann-benchmark/src/inputs/disk.rs @@ -91,6 +91,9 @@ pub(crate) enum SearchMode { }, /// Unified pipelined search through the generic search loop (queue-based ExpandBeam). UnifiedPipeSearch { + /// Optional relaxed monotonicity parameter for early termination. + #[serde(default)] + relaxed_monotonicity_l: Option, /// Enable kernel-side SQ polling (ms idle timeout). None = disabled. 
#[serde(default)] sqpoll_idle_ms: Option, @@ -115,10 +118,20 @@ impl fmt::Display for SearchMode { } write!(f, ")") } - SearchMode::UnifiedPipeSearch { sqpoll_idle_ms } => { + SearchMode::UnifiedPipeSearch { relaxed_monotonicity_l, sqpoll_idle_ms } => { write!(f, "UnifiedPipeSearch")?; - if let Some(sq) = sqpoll_idle_ms { - write!(f, "(sqpoll={}ms)", sq)?; + let has_rm = relaxed_monotonicity_l.is_some(); + let has_sq = sqpoll_idle_ms.is_some(); + if has_rm || has_sq { + write!(f, "(")?; + if let Some(rm) = relaxed_monotonicity_l { + write!(f, "rm_l={}", rm)?; + if has_sq { write!(f, ", ")?; } + } + if let Some(sq) = sqpoll_idle_ms { + write!(f, "sqpoll={}ms", sq)?; + } + write!(f, ")")?; } Ok(()) } diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 348119a9c..5e7020ffc 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -550,6 +550,9 @@ pub struct PipelinedConfig> { pub disk_index_path: String, pub reader_config: PipelinedReaderConfig, pub beam_width: usize, + /// Optional relaxed monotonicity: continue exploring this many extra + /// comparisons after the candidate list converges. + pub relaxed_monotonicity_l: Option, /// Shared node cache. Nodes found here skip disk IO entirely. pub node_cache: Arc>, } @@ -702,13 +705,17 @@ where io_stats: io_stats.clone(), }; - let search_params = SearchParams::new( + let mut search_params = SearchParams::new( return_list_size as usize, search_list_size as usize, Some(beam_width), )? .with_adaptive_beam_width(); + if let Some(rm_l) = config.relaxed_monotonicity_l { + search_params = search_params.with_relaxed_monotonicity(rm_l); + } + let mut indices = vec![0u32; return_list_size as usize]; let mut distances = vec![0f32; return_list_size as usize]; let mut associated_data = From 47e9413b6c024591921368a8103d1e228cd2dcda Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 14:57:54 -0800 Subject: [PATCH 10/46] Make adaptive_beam_width configurable for UnifiedPipeSearch Add adaptive_beam_width bool to PipelinedConfig and SearchMode enum (defaults to true for backwards compatibility). Benchmark ablation shows SQPOLL is harmful for the queue-based submit/poll pattern. 
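For ablation runs, the three knobs compose like this (a hedged sketch —
the field names match the serde definitions in inputs/disk.rs below, but
in the benchmark these values normally arrive via JSON deserialization
rather than literal construction):

    // Adaptive growth off, relaxed monotonicity on, SQPOLL off (its
    // default — the ablation above shows it hurts this pattern).
    let mode = SearchMode::UnifiedPipeSearch {
        adaptive_beam_width: false,
        relaxed_monotonicity_l: Some(50),
        sqpoll_idle_ms: None,
    };
    // In JSON, omitting adaptive_beam_width deserializes to true via
    // default_true(), preserving the previous always-adaptive behavior.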
--- .../src/backend/disk_index/search.rs | 5 +++-- diskann-benchmark/src/inputs/disk.rs | 21 ++++++++++++++++--- .../src/search/provider/pipelined_accessor.rs | 8 +++++-- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 9e8f3897d..4f8b437b8 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -461,7 +461,7 @@ where anyhow::bail!("PipeSearch is only supported on Linux"); } } - SearchMode::UnifiedPipeSearch { relaxed_monotonicity_l, sqpoll_idle_ms } => { + SearchMode::UnifiedPipeSearch { adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms } => { #[cfg(target_os = "linux")] { use diskann_disk::data_model::Cache; @@ -489,6 +489,7 @@ where disk_index_path: disk_index_path.clone(), reader_config, beam_width: search_params.beam_width, + adaptive_beam_width: *adaptive_beam_width, relaxed_monotonicity_l: *relaxed_monotonicity_l, node_cache, }); @@ -539,7 +540,7 @@ where } #[cfg(not(target_os = "linux"))] { - let _ = (relaxed_monotonicity_l, sqpoll_idle_ms); + let _ = (adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms); anyhow::bail!("UnifiedPipeSearch is only supported on Linux"); } } diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs index 54c395073..373ca003b 100644 --- a/diskann-benchmark/src/inputs/disk.rs +++ b/diskann-benchmark/src/inputs/disk.rs @@ -91,6 +91,9 @@ pub(crate) enum SearchMode { }, /// Unified pipelined search through the generic search loop (queue-based ExpandBeam). UnifiedPipeSearch { + /// Start with a smaller beam and grow adaptively. Defaults to true. + #[serde(default = "default_true")] + adaptive_beam_width: bool, /// Optional relaxed monotonicity parameter for early termination. #[serde(default)] relaxed_monotonicity_l: Option, @@ -118,17 +121,25 @@ impl fmt::Display for SearchMode { } write!(f, ")") } - SearchMode::UnifiedPipeSearch { relaxed_monotonicity_l, sqpoll_idle_ms } => { + SearchMode::UnifiedPipeSearch { adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms } => { write!(f, "UnifiedPipeSearch")?; + let has_abw = *adaptive_beam_width; let has_rm = relaxed_monotonicity_l.is_some(); let has_sq = sqpoll_idle_ms.is_some(); - if has_rm || has_sq { + if has_abw || has_rm || has_sq { write!(f, "(")?; + let mut first = true; + if has_abw { + write!(f, "abw")?; + first = false; + } if let Some(rm) = relaxed_monotonicity_l { + if !first { write!(f, ", ")?; } write!(f, "rm_l={}", rm)?; - if has_sq { write!(f, ", ")?; } + first = false; } if let Some(sq) = sqpoll_idle_ms { + if !first { write!(f, ", ")?; } write!(f, "sqpoll={}ms", sq)?; } write!(f, ")")?; @@ -143,6 +154,10 @@ fn default_initial_beam_width() -> usize { 4 } +fn default_true() -> bool { + true +} + /// Search phase configuration #[derive(Debug, Deserialize, Serialize)] pub(crate) struct DiskSearchPhase { diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 5e7020ffc..8f5f74295 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -550,6 +550,8 @@ pub struct PipelinedConfig> { pub disk_index_path: String, pub reader_config: PipelinedReaderConfig, pub beam_width: usize, + /// Start with a smaller beam and grow adaptively. 
+ pub adaptive_beam_width: bool, /// Optional relaxed monotonicity: continue exploring this many extra /// comparisons after the candidate list converges. pub relaxed_monotonicity_l: Option, @@ -709,9 +711,11 @@ where return_list_size as usize, search_list_size as usize, Some(beam_width), - )? - .with_adaptive_beam_width(); + )?; + if config.adaptive_beam_width { + search_params = search_params.with_adaptive_beam_width(); + } if let Some(rm_l) = config.relaxed_monotonicity_l { search_params = search_params.with_relaxed_monotonicity(rm_l); } From f0ae5af7156c1d7d0cf35e3269b04940a46d3378 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 15:18:15 -0800 Subject: [PATCH 11/46] Add up_to parameter to expand_available for pipeline granularity - expand_available now takes an up_to limit on nodes to expand per call - search_internal uses process-one-submit-N pattern when pipelining: expand 1 node, then submit as many new IOs as were expanded (max 1) - This matches PipeSearch's tight loop behavior while keeping the generic search loop clean - Result: recall restored to 96.6% at L=100, IOs match BeamSearch, QPS within 15% of hand-tuned PipeSearch --- .../src/search/provider/pipelined_accessor.rs | 10 +++++-- diskann/src/graph/glue.rs | 6 ++++ diskann/src/graph/index.rs | 29 +++++++++++++++---- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 8f5f74295..165e01c89 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -428,13 +428,16 @@ where } } - /// Poll for completed reads and expand nodes whose data is available. + /// Poll for completed reads and expand up to `up_to` nodes. + /// Remaining loaded-but-unexpanded nodes stay buffered for the next call, + /// which lets the search loop submit new IOs sooner (process-few-submit-few). fn expand_available( &mut self, _ids: impl Iterator + Send, _computer: &Self::QueryComputer, mut pred: P, mut on_neighbors: F, + up_to: usize, ) -> impl std::future::Future> + Send where P: HybridPredicate + Send + Sync, @@ -470,8 +473,9 @@ where } } - // Expand loaded nodes: get neighbors, compute PQ distances - let loaded_ids: Vec = self.loaded_nodes.keys().copied().collect(); + // Expand up to `up_to` loaded nodes. Unexpanded nodes remain buffered + // in loaded_nodes for the next call. + let loaded_ids: Vec = self.loaded_nodes.keys().copied().take(up_to).collect(); let mut expanded = 0; for vid in loaded_ids { diff --git a/diskann/src/graph/glue.rs b/diskann/src/graph/glue.rs index 1c942a7fc..5d30fa012 100644 --- a/diskann/src/graph/glue.rs +++ b/diskann/src/graph/glue.rs @@ -277,6 +277,10 @@ where /// completed IO operations and expands only the nodes whose data has arrived, /// returning immediately without blocking. /// + /// `up_to` limits how many nodes are expanded in a single call. Pipelined + /// providers should respect this to keep the IO pipeline full (expand fewer → + /// submit sooner). Non-pipelined providers may ignore it. + /// /// Returns the number of nodes that were expanded in this call. 
fn expand_available( &mut self, @@ -284,12 +288,14 @@ where computer: &Self::QueryComputer, pred: P, on_neighbors: F, + up_to: usize, ) -> impl std::future::Future> + Send where P: HybridPredicate + Send + Sync, F: FnMut(f32, Self::Id) + Send, { async move { + let _ = up_to; // default impl processes everything let id_vec: Vec = ids.collect(); let count = id_vec.len(); self.expand_beam(id_vec.into_iter(), computer, pred, on_neighbors) diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index f5490de50..46f3d9773 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2111,17 +2111,29 @@ where } let mut neighbors = Vec::with_capacity(self.max_degree_with_slack()); + // Tracks how many nodes were expanded last iteration, so the + // pipelined submit can match its rate (process-N-submit-N). + let mut last_expanded: usize = 0; while (scratch.best.has_notvisited_node() || accessor.has_pending()) && !accessor.terminate_early() { - // Select beam_width closest unvisited nodes + let has_pending = accessor.has_pending(); + + // When pipelining, match the number of new submits to the number + // we just expanded (process-N-submit-N), keeping the pipeline + // steadily full without over-committing speculative reads. + // On the first iteration (nothing expanded yet), prime the pipe + // with cur_beam_width IOs. For non-pipelined, submit everything. + let submit_limit = if has_pending { + last_expanded.max(1) + } else { + cur_beam_width + }; + scratch.beam_nodes.clear(); - let available = cur_beam_width.saturating_sub( - if accessor.has_pending() { cur_beam_width / 2 } else { 0 } - ); while scratch.best.has_notvisited_node() - && scratch.beam_nodes.len() < available + && scratch.beam_nodes.len() < submit_limit { let closest_node = scratch.best.closest_notvisited(); search_record.record(closest_node, scratch.hops, scratch.cmps); @@ -2131,7 +2143,10 @@ where // Submit to expansion queue (no-op for non-pipelined) accessor.submit_expand(scratch.beam_nodes.iter().copied()); - // Expand whatever is available (all for non-pipelined, completed IO for pipelined) + // Expand available nodes. When pipelining, expand one at a time + // so we loop back to submit new IOs sooner (process-one-submit-one). + // For non-pipelined, expand all (usize::MAX). + let expand_limit = if has_pending { 1 } else { usize::MAX }; neighbors.clear(); let expanded = accessor .expand_available( @@ -2139,8 +2154,10 @@ where computer, glue::NotInMut::new(&mut scratch.visited), |distance, id| neighbors.push(Neighbor::new(id, distance)), + expand_limit, ) .await?; + last_expanded = expanded; neighbors .iter() From 0a4679161b89ee2d5cdccfff943628f9d4f7b042 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 15:30:44 -0800 Subject: [PATCH 12/46] Pool io_uring reader and PQ scratch across queries PipelinedDiskAccessor now borrows a pooled PipelinedScratch (io_uring ring + file descriptor + PQ buffers) via ObjectPool instead of creating them fresh per query. This eliminates the two largest per-query allocation costs. Closes the QPS gap with PipeSearch from ~15% to <5%. 
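The per-query checkout path, end to end (a hedged sketch — all names come
from this patch's diffs; generic parameters and error plumbing are elided,
and `provider`/`query` stand in for values already in scope inside
search_internal):

    // Once per run: build the pool from args derived from the graph header.
    let scratch_pool = Arc::new(ObjectPool::<PipelinedScratch>::try_new(
        scratch_args.clone(),
        0,
        None,
    )?);

    // Once per query: borrow scratch. try_modify() resets the pooled reader;
    // try_create() runs only when the pool is empty, so the io_uring ring,
    // file descriptor, and PQ buffers are paid for at most once per slot.
    let scratch = PoolOption::try_pooled(&scratch_pool, scratch_args.clone())?;
    let mut accessor = PipelinedDiskAccessor::new(
        provider,
        query,
        scratch,
        node_cache.clone(),
        io_stats.clone(),
    )?;
    accessor.preprocess_query()?; // PQ table prep, moved out of the constructor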
--- .../src/backend/disk_index/search.rs | 33 +++- .../src/search/pipelined/pipelined_reader.rs | 5 + .../src/search/provider/pipelined_accessor.rs | 148 ++++++++++++------ 3 files changed, 137 insertions(+), 49 deletions(-) diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 4f8b437b8..970d7460b 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -465,6 +465,10 @@ where #[cfg(target_os = "linux")] { use diskann_disk::data_model::Cache; + use diskann_disk::search::provider::pipelined_accessor::{ + PipelinedScratch, PipelinedScratchArgs, + }; + use diskann::utils::object_pool::ObjectPool; let reader_config = PipelinedReaderConfig { sqpoll_idle_ms: *sqpoll_idle_ms, @@ -476,6 +480,31 @@ where .clone() .unwrap_or_else(|| Arc::new(Cache::new(0, 0).expect("empty cache"))); + // Derive pool args from the graph header before moving factory into searcher + let graph_header = vertex_provider_factory.get_header()?; + let pq_data = index_reader.get_pq_data(); + let metadata = graph_header.metadata(); + let block_size = graph_header.effective_block_size(); + let num_sectors_per_node = graph_header.num_sectors_per_node(); + let slot_size = num_sectors_per_node * block_size; + let bw = search_params.beam_width; + let max_slots = (bw * 2).clamp(16, 128); + + let scratch_args = PipelinedScratchArgs { + disk_index_path: disk_index_path.clone(), + max_slots, + slot_size, + alignment: block_size, + graph_degree: graph_header.max_degree::()?, + dims: metadata.dims, + num_pq_chunks: pq_data.get_num_chunks(), + num_pq_centers: pq_data.get_num_centers(), + reader_config, + }; + let scratch_pool = Arc::new( + ObjectPool::::try_new(scratch_args.clone(), 0, None)? + ); + let mut searcher = DiskIndexSearcher::, _>::new( search_params.num_threads, search_params.search_io_limit.unwrap_or(usize::MAX), @@ -486,12 +515,12 @@ where )?; searcher.with_pipelined_config(PipelinedConfig { - disk_index_path: disk_index_path.clone(), - reader_config, beam_width: search_params.beam_width, adaptive_beam_width: *adaptive_beam_width, relaxed_monotonicity_l: *relaxed_monotonicity_l, node_cache, + scratch_pool, + scratch_args, }); let searcher = &searcher; diff --git a/diskann-disk/src/search/pipelined/pipelined_reader.rs b/diskann-disk/src/search/pipelined/pipelined_reader.rs index 1722db54b..42f637fb2 100644 --- a/diskann-disk/src/search/pipelined/pipelined_reader.rs +++ b/diskann-disk/src/search/pipelined/pipelined_reader.rs @@ -186,6 +186,11 @@ impl PipelinedReader { self.slot_size } + /// Returns the maximum number of buffer slots. + pub fn max_slots(&self) -> usize { + self.max_slots + } + /// Drain all in-flight IOs, blocking until they complete. /// Must be called before freeing the slot buffers. 
fn drain_all(&mut self) { diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 165e01c89..7afb267b9 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -24,6 +24,7 @@ use diskann::{ }, neighbor::Neighbor, provider::{Accessor, BuildQueryComputer, DefaultContext, DelegateNeighbor, HasId, NeighborAccessor}, + utils::object_pool::{ObjectPool, PoolOption, TryAsPooled}, ANNError, ANNResult, }; use diskann_providers::model::{ @@ -103,6 +104,61 @@ fn max_slots(beam_width: usize) -> usize { (beam_width * 2).clamp(16, MAX_IO_CONCURRENCY) } +// --------------------------------------------------------------------------- +// Poolable scratch: PipelinedReader + PQScratch, reused across queries +// --------------------------------------------------------------------------- + +/// Reusable scratch state for pipelined search, pooled to avoid per-query +/// allocation of io_uring rings, file descriptors, and PQ scratch buffers. +pub struct PipelinedScratch { + pub reader: PipelinedReader, + pub pq_scratch: PQScratch, +} + +/// Arguments for creating or resetting a [`PipelinedScratch`]. +#[derive(Clone)] +pub struct PipelinedScratchArgs { + pub disk_index_path: String, + pub max_slots: usize, + pub slot_size: usize, + pub alignment: usize, + pub graph_degree: usize, + pub dims: usize, + pub num_pq_chunks: usize, + pub num_pq_centers: usize, + pub reader_config: PipelinedReaderConfig, +} + +impl TryAsPooled for PipelinedScratch { + type Error = ANNError; + + fn try_create(args: PipelinedScratchArgs) -> Result { + let reader = PipelinedReader::new( + &args.disk_index_path, + args.max_slots, + args.slot_size, + args.alignment, + &args.reader_config, + )?; + let pq_scratch = PQScratch::new( + args.graph_degree, + args.dims, + args.num_pq_chunks, + args.num_pq_centers, + )?; + Ok(Self { reader, pq_scratch }) + } + + fn try_modify(&mut self, _args: PipelinedScratchArgs) -> Result<(), Self::Error> { + self.reader.reset(); + Ok(()) + } +} + +// --------------------------------------------------------------------------- +// PipelinedDiskAccessor +// --------------------------------------------------------------------------- + /// Pipelined disk accessor that overlaps IO and compute via io_uring. /// /// Implements the `ExpandBeam` trait's queue-based methods: @@ -111,8 +167,7 @@ fn max_slots(beam_width: usize) -> usize { /// - `has_pending`: returns true when IO operations are in-flight pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { provider: &'a DiskProvider, - reader: PipelinedReader, - pq_scratch: PQScratch, + scratch: PoolOption, query: &'a [Data::VectorDataType], // Graph geometry (cached from GraphHeader) @@ -146,13 +201,11 @@ impl<'a, Data> PipelinedDiskAccessor<'a, Data> where Data: GraphDataType, { - /// Create a new pipelined disk accessor. + /// Create a new pipelined disk accessor using a pooled scratch. 
pub fn new( provider: &'a DiskProvider, query: &'a [Data::VectorDataType], - disk_index_path: &str, - beam_width: usize, - reader_config: &PipelinedReaderConfig, + scratch: PoolOption, node_cache: Arc>, shared_io_stats: Arc, ) -> ANNResult { @@ -164,34 +217,11 @@ where let block_size = provider.graph_header.effective_block_size(); let num_sectors_per_node = provider.graph_header.num_sectors_per_node(); - let slot_size = num_sectors_per_node * block_size; - let slots = max_slots(beam_width); - - let reader = PipelinedReader::new( - disk_index_path, - slots, - slot_size, - block_size, - reader_config, - )?; - - let graph_degree = provider.graph_header.max_degree::()?; - let mut pq_scratch = PQScratch::new( - graph_degree, - dims, - provider.pq_data.get_num_chunks(), - provider.pq_data.get_num_centers(), - )?; - - // Preprocess PQ distance table for this query - let medoid = metadata.medoid as u32; - pq_scratch.set(dims, query, 1.0)?; - quantizer_preprocess(&mut pq_scratch, &provider.pq_data, provider.metric, &[medoid])?; + let slots = scratch.reader.max_slots(); Ok(Self { provider, - reader, - pq_scratch, + scratch, query, num_nodes_per_sector, num_sectors_per_node, @@ -211,21 +241,38 @@ where }) } + /// Preprocess PQ distance tables for this query. Must be called before search. + pub fn preprocess_query(&mut self) -> ANNResult<()> { + let metadata = self.provider.graph_header.metadata(); + let dims = metadata.dims; + let medoid = metadata.medoid as u32; + self.scratch.pq_scratch.set(dims, self.query, 1.0)?; + quantizer_preprocess( + &mut self.scratch.pq_scratch, + &self.provider.pq_data, + self.provider.metric, + &[medoid], + )?; + Ok(()) + } + /// Compute PQ distances for a set of neighbor IDs. fn pq_distances(&mut self, ids: &[u32], mut f: F) -> ANNResult<()> where F: FnMut(f32, u32), { + let pq = &mut self.scratch.pq_scratch; compute_pq_distance( ids, self.provider.pq_data.get_num_chunks(), - &self.pq_scratch.aligned_pqtable_dist_scratch, + &pq.aligned_pqtable_dist_scratch, self.provider.pq_data.pq_compressed_data().get_data(), - &mut self.pq_scratch.aligned_pq_coord_scratch, - &mut self.pq_scratch.aligned_dist_scratch, + &mut pq.aligned_pq_coord_scratch, + &mut pq.aligned_dist_scratch, )?; + let pq = &self.scratch.pq_scratch; for (i, id) in ids.iter().enumerate() { - f(self.pq_scratch.aligned_dist_scratch[i], *id); + f(pq.aligned_dist_scratch[i], *id); } Ok(()) } @@ -245,7 +292,7 @@ where let completed_slots = if self.in_flight_ios.is_empty() { Vec::new() } else { - self.reader.poll_completions()? + self.scratch.reader.poll_completions()? 
}; if !completed_slots.is_empty() { @@ -254,7 +301,7 @@ where let mut remaining = VecDeque::new(); while let Some(io) = self.in_flight_ios.pop_front() { if completed_set.contains(&io.slot_id) { - let sector_buf = self.reader.get_slot_buf(io.slot_id); + let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); let node = parse_node( sector_buf, io.vertex_id, @@ -359,6 +406,7 @@ where Ok(DiskQueryComputer { num_pq_chunks: self.provider.pq_data.get_num_chunks(), query_centroid_l2_distance: self + .scratch .pq_scratch .aligned_pqtable_dist_scratch .as_slice() @@ -417,7 +465,7 @@ where let sector_offset = sector_idx * self.block_size as u64; let slot_id = self.next_slot_id % self.max_slots; // Best-effort: if submission fails, the node will be retried - if self.reader.submit_read(sector_offset, slot_id).is_ok() { + if self.scratch.reader.submit_read(sector_offset, slot_id).is_ok() { self.in_flight_ios.push_back(InFlightIo { vertex_id: id, slot_id, @@ -449,14 +497,14 @@ where // If nothing is loaded yet and we have in-flight IO, wait for at least one if self.loaded_nodes.is_empty() && !self.in_flight_ios.is_empty() { - let completed = self.reader.wait_completions()?; + let completed = self.scratch.reader.wait_completions()?; if !completed.is_empty() { let completed_set: std::collections::HashSet = completed.into_iter().collect(); let mut remaining = VecDeque::new(); while let Some(io) = self.in_flight_ios.pop_front() { if completed_set.contains(&io.slot_id) { - let sector_buf = self.reader.get_slot_buf(io.slot_id); + let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); let node = parse_node( sector_buf, io.vertex_id, @@ -551,8 +599,6 @@ where /// Configuration for creating a pipelined search through DiskIndexSearcher. pub struct PipelinedConfig> { - pub disk_index_path: String, - pub reader_config: PipelinedReaderConfig, pub beam_width: usize, /// Start with a smaller beam and grow adaptively. pub adaptive_beam_width: bool, @@ -561,6 +607,10 @@ pub struct PipelinedConfig> { pub relaxed_monotonicity_l: Option, /// Shared node cache. Nodes found here skip disk IO entirely. pub node_cache: Arc>, + /// Pooled scratch (io_uring reader + PQ buffers), created once and reused. + pub scratch_pool: Arc>, + /// Args for retrieving/creating pooled scratch instances. + pub scratch_args: PipelinedScratchArgs, } /// Shared IO statistics written by the accessor and read by the caller after search. 
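A hedged sketch of the PipelinedIoStats lifecycle this configuration relies
on — the Drop flush and the caller-side reads appear verbatim elsewhere in
this series, and `query_stats` is the QueryStatistics being populated:

    // One Arc per query, shared between the strategy and the accessor.
    let io_stats = Arc::new(PipelinedIoStats::default());

    // The accessor counts into plain u32 fields during the search; its Drop
    // impl flushes them into the shared atomics:
    //     self.shared_io_stats.io_count.fetch_add(self.io_count, Ordering::Relaxed);
    // Once search_internal returns and the accessor is dropped, the caller reads:
    query_stats.total_io_operations = io_stats.io_count.load(Ordering::Relaxed);
    query_stats.total_vertices_loaded = io_stats.io_count.load(Ordering::Relaxed)
        + io_stats.cache_hits.load(Ordering::Relaxed);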
@@ -650,15 +700,19 @@ where provider: &'a DiskProvider, _context: &DefaultContext, ) -> Result, Self::SearchAccessorError> { - PipelinedDiskAccessor::new( + let scratch = PoolOption::try_pooled( + &self.config.scratch_pool, + self.config.scratch_args.clone(), + )?; + let mut accessor = PipelinedDiskAccessor::new( provider, self.query, - &self.config.disk_index_path, - self.config.beam_width, - &self.config.reader_config, + scratch, self.config.node_cache.clone(), self.io_stats.clone(), - ) + )?; + accessor.preprocess_query()?; + Ok(accessor) } fn post_processor(&self) -> Self::PostProcessor { From 375500b8a2b4801aaad0728194ff1d7a087691f5 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 16:40:03 -0800 Subject: [PATCH 13/46] Non-blocking expand, ranked expansion, spin_loop hint - Make expand_available non-blocking: return 0 instead of blocking when no data is loaded, letting the search loop submit more IOs - Add submission rank to LoadedNode/InFlightIo so expand_available processes the highest-priority (earliest-submitted) node first - Add std::hint::spin_loop() when nothing to submit or expand, reducing tail latency by ~50% at p99.9 - Add inflight_count() to ExpandBeam trait for submission gating - Keep inflight cap at cur_beam_width to avoid priority queue over-commitment --- .../src/search/provider/pipelined_accessor.rs | 86 +++++++++++-------- diskann/src/graph/glue.rs | 9 ++ diskann/src/graph/index.rs | 22 +++-- 3 files changed, 77 insertions(+), 40 deletions(-) diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 7afb267b9..650833392 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -47,12 +47,23 @@ use super::disk_provider::{ struct LoadedNode { fp_vector: Vec, adjacency_list: Vec, + /// Submission rank (lower = higher priority / submitted earlier). + /// Nodes submitted first via closest_notvisited() have better PQ distance, + /// so expanding them first (like PipeSearch) improves search quality. + rank: u64, } /// Tracks an in-flight IO request. struct InFlightIo { vertex_id: u32, slot_id: usize, + rank: u64, +} + +/// Parsed node data from a sector buffer (without rank metadata). +struct ParsedNode { + fp_vector: Vec, + adjacency_list: Vec, } /// Parse a node from raw sector buffer bytes. @@ -62,7 +73,7 @@ fn parse_node( num_nodes_per_sector: u64, node_len: u64, fp_vector_len: u64, -) -> ANNResult { +) -> ANNResult { let offset = node_offset_in_sector(vertex_id, num_nodes_per_sector, node_len); let end = offset + node_len as usize; let node_data = sector_buf.get(offset..end).ok_or_else(|| { @@ -92,7 +103,7 @@ fn parse_node( adjacency_list.push(LittleEndian::read_u32(&neighbor_data[start..start + 4])); } - Ok(LoadedNode { + Ok(ParsedNode { fp_vector, adjacency_list, }) @@ -186,6 +197,8 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { loaded_nodes: HashMap, next_slot_id: usize, max_slots: usize, + /// Monotonically increasing submission rank for priority-ordered expansion. 
+ next_rank: u64, // Distance cache for post-processing rerank distance_cache: HashMap, @@ -234,6 +247,7 @@ where loaded_nodes: HashMap::new(), next_slot_id: 0, max_slots: slots, + next_rank: 0, distance_cache: HashMap::new(), io_count: 0, cache_hits: 0, @@ -302,14 +316,18 @@ where while let Some(io) = self.in_flight_ios.pop_front() { if completed_set.contains(&io.slot_id) { let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); - let node = parse_node( + let parsed = parse_node( sector_buf, io.vertex_id, self.num_nodes_per_sector, self.node_len, self.fp_vector_len, )?; - self.loaded_nodes.insert(io.vertex_id, node); + self.loaded_nodes.insert(io.vertex_id, LoadedNode { + fp_vector: parsed.fp_vector, + adjacency_list: parsed.adjacency_list, + rank: io.rank, + }); } else { remaining.push_back(io); } @@ -449,7 +467,9 @@ where ) { let fp_vector: Vec = bytemuck::cast_slice(vec_data).to_vec(); let adjacency_list: Vec = adj_list.iter().copied().collect(); - self.loaded_nodes.insert(id, LoadedNode { fp_vector, adjacency_list }); + let rank = self.next_rank; + self.next_rank += 1; + self.loaded_nodes.insert(id, LoadedNode { fp_vector, adjacency_list, rank }); self.cache_hits += 1; continue; } @@ -464,11 +484,14 @@ where node_sector_index(id, self.num_nodes_per_sector, self.num_sectors_per_node); let sector_offset = sector_idx * self.block_size as u64; let slot_id = self.next_slot_id % self.max_slots; + let rank = self.next_rank; + self.next_rank += 1; // Best-effort: if submission fails, the node will be retried if self.scratch.reader.submit_read(sector_offset, slot_id).is_ok() { self.in_flight_ios.push_back(InFlightIo { vertex_id: id, slot_id, + rank, }); self.next_slot_id = (self.next_slot_id + 1) % self.max_slots; self.io_count += 1; @@ -492,41 +515,32 @@ where F: FnMut(f32, Self::Id) + Send, { async move { - // Poll completions + // Non-blocking poll for completions self.drain_completions()?; - // If nothing is loaded yet and we have in-flight IO, wait for at least one - if self.loaded_nodes.is_empty() && !self.in_flight_ios.is_empty() { - let completed = self.scratch.reader.wait_completions()?; - if !completed.is_empty() { - let completed_set: std::collections::HashSet = - completed.into_iter().collect(); - let mut remaining = VecDeque::new(); - while let Some(io) = self.in_flight_ios.pop_front() { - if completed_set.contains(&io.slot_id) { - let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); - let node = parse_node( - sector_buf, - io.vertex_id, - self.num_nodes_per_sector, - self.node_len, - self.fp_vector_len, - )?; - self.loaded_nodes.insert(io.vertex_id, node); - } else { - remaining.push_back(io); - } - } - self.in_flight_ios = remaining; - } + // If nothing loaded yet, return 0 so the search loop can submit + // more IOs before we block. This matches PipeSearch's non-blocking + // poll pattern and avoids stalling the pipeline. + if self.loaded_nodes.is_empty() { + return Ok(0); } - // Expand up to `up_to` loaded nodes. Unexpanded nodes remain buffered - // in loaded_nodes for the next call. - let loaded_ids: Vec = self.loaded_nodes.keys().copied().take(up_to).collect(); + // Expand loaded nodes in submission order (lowest rank first). + // Nodes submitted earlier had better PQ distance (came from + // closest_notvisited), so expanding them first — like PipeSearch's + // "best available" strategy — improves search quality. 
+ let mut ranked: Vec<(u64, u32)> = self + .loaded_nodes + .iter() + .map(|(&id, node)| (node.rank, id)) + .collect(); + ranked.sort_unstable(); let mut expanded = 0; - for vid in loaded_ids { + for (_, vid) in ranked { + if expanded >= up_to { + break; + } let node = match self.loaded_nodes.remove(&vid) { Some(n) => n, None => continue, @@ -563,6 +577,10 @@ where fn has_pending(&self) -> bool { !self.in_flight_ios.is_empty() || !self.loaded_nodes.is_empty() } + + fn inflight_count(&self) -> usize { + self.in_flight_ios.len() + } } impl SearchExt for PipelinedDiskAccessor<'_, Data> diff --git a/diskann/src/graph/glue.rs b/diskann/src/graph/glue.rs index 5d30fa012..6a1c7d405 100644 --- a/diskann/src/graph/glue.rs +++ b/diskann/src/graph/glue.rs @@ -313,6 +313,15 @@ where false } + /// Returns the number of IOs currently in-flight (submitted but not completed). + /// + /// The search loop uses this to cap submissions at `cur_beam_width`, matching + /// PipeSearch's behavior of not over-committing speculative reads. + /// Default: 0 (non-pipelined providers have no in-flight IO). + fn inflight_count(&self) -> usize { + 0 + } + /// Expand all `ids` synchronously: load data, get neighbors, compute distances. /// /// This is the original single-shot expansion method. For non-pipelined providers, diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 46f3d9773..9fc2f80d6 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2120,13 +2120,16 @@ where { let has_pending = accessor.has_pending(); - // When pipelining, match the number of new submits to the number - // we just expanded (process-N-submit-N), keeping the pipeline - // steadily full without over-committing speculative reads. - // On the first iteration (nothing expanded yet), prime the pipe - // with cur_beam_width IOs. For non-pipelined, submit everything. + // When pipelining, cap total in-flight IOs at cur_beam_width + // (like PipeSearch) to avoid over-committing the priority queue. + // Within that cap, submit at the rate we expand (process-N-submit-N). let submit_limit = if has_pending { - last_expanded.max(1) + let inflight = accessor.inflight_count(); + if inflight >= cur_beam_width { + 0 + } else { + last_expanded.max(1).min(cur_beam_width - inflight) + } } else { cur_beam_width }; @@ -2159,6 +2162,13 @@ where .await?; last_expanded = expanded; + // When pipelining and nothing was submitted or expanded, + // hint the CPU we're spin-waiting for IO to avoid burning + // cycles and hurting tail latency on shared cores. 
+ if expanded == 0 && scratch.beam_nodes.is_empty() && has_pending { + std::hint::spin_loop(); + } + neighbors .iter() .for_each(|neighbor| scratch.best.insert(*neighbor)); From a3c50e859e7fa7b825160b45307c4a3d4e77a21a Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 17:30:50 -0800 Subject: [PATCH 14/46] Add SearchTrace: per-query tracing and profiling infrastructure - New search_trace module in diskann-disk with SearchTrace, TraceEvent, SearchProfile, and OptionalTrace (zero-cost when disabled) - Events: Submit, Complete, CacheHit, Expand (with FP distance + neighbor counts), SpinWait, Done - Profile counters: io_poll, io_submit, fp_distance, pq_distance, queue_ops, spin_wait, parse_node (all in microseconds) - Instrumented PipeSearch: poll, submit, complete, expand phases - Instrumented UnifiedPipe: drain_completions, submit_expand, expand_available with per-phase profiling - Both accept optional trace (None = zero overhead in production) --- diskann-disk/src/search/mod.rs | 1 + .../src/search/pipelined/pipelined_search.rs | 43 +++ .../search/pipelined/pipelined_searcher.rs | 1 + .../src/search/provider/pipelined_accessor.rs | 52 ++++ diskann-disk/src/search/search_trace.rs | 294 ++++++++++++++++++ 5 files changed, 391 insertions(+) create mode 100644 diskann-disk/src/search/search_trace.rs diff --git a/diskann-disk/src/search/mod.rs b/diskann-disk/src/search/mod.rs index 2c475e10a..3b7c98c3f 100644 --- a/diskann-disk/src/search/mod.rs +++ b/diskann-disk/src/search/mod.rs @@ -9,6 +9,7 @@ pub mod provider; pub mod traits; pub(crate) mod sector_math; +pub mod search_trace; #[cfg(target_os = "linux")] pub mod pipelined; diff --git a/diskann-disk/src/search/pipelined/pipelined_search.rs b/diskann-disk/src/search/pipelined/pipelined_search.rs index 8b1c6bf0f..0a3c045eb 100644 --- a/diskann-disk/src/search/pipelined/pipelined_search.rs +++ b/diskann-disk/src/search/pipelined/pipelined_search.rs @@ -14,6 +14,7 @@ use diskann_providers::model::{compute_pq_distance, pq::quantizer_preprocess, PQ use diskann_vector::{distance::Metric, DistanceFunction}; use super::pipelined_reader::PipelinedReader; +use crate::search::search_trace::{OptionalTrace, SearchTrace, TraceEventKind}; use crate::search::sector_math::{node_offset_in_sector, node_sector_index}; /// A candidate in the sorted candidate pool. @@ -158,7 +159,9 @@ pub(crate) fn pipe_search( relaxed_monotonicity_l: Option, metric: Metric, vector_filter: Option<&(dyn Fn(&u32) -> bool + Send + Sync)>, + trace: Option<&mut SearchTrace>, ) -> ANNResult { + let mut trace = OptionalTrace(trace); let timer = Instant::now(); let mut io_count: u32 = 0; let mut comparisons: u32 = 0; @@ -261,7 +264,9 @@ pub(crate) fn pipe_search( // Poll completions (non-blocking). Keeping this non-blocking is critical // for overlapping IO and compute — blocking here would serialize the pipeline. 
let io_poll_start = Instant::now(); + trace.begin_phase(); let completed_slots = reader.poll_completions()?; + trace.end_phase_io_poll(); io_time += io_poll_start.elapsed(); let mut n_in: usize = 0; let mut n_out: usize = 0; @@ -273,6 +278,7 @@ pub(crate) fn pipe_search( while let Some(io) = on_flight_ios.pop_front() { if completed_set.contains(&io.slot_id) { let sector_buf = reader.get_slot_buf(io.slot_id); + trace.begin_phase(); let node = parse_node( sector_buf, io.vertex_id, @@ -280,6 +286,8 @@ pub(crate) fn pipe_search( node_len, fp_vector_len, )?; + trace.end_phase_parse_node(); + trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); // Track convergence: is this node still in the top of retset? if cur_list_size > 0 { let last_dist = retset[cur_list_size - 1].distance; @@ -329,6 +337,7 @@ pub(crate) fn pipe_search( // Submit more reads if room if on_flight_ios.len() < cur_beam_width { let io_submit_start = Instant::now(); + trace.begin_phase(); let to_send = 1; let mut n_sent = 0; let mut marker = 0; @@ -347,12 +356,17 @@ pub(crate) fn pipe_search( vertex_id: vid, slot_id, }); + trace.event(TraceEventKind::Submit { + node_id: vid, + inflight: on_flight_ios.len(), + }); next_slot_id = (next_slot_id + 1) % max_slots(beam_width); io_count += 1; n_sent += 1; } marker += 1; } + trace.end_phase_io_submit(); io_time += io_submit_start.elapsed(); } @@ -371,8 +385,10 @@ pub(crate) fn pipe_search( if let Some(node) = id_buf_map.get(&vid) { // Compute full-precision distance; only add to results if // filter is absent or the node passes the filter predicate. + trace.begin_phase(); let fp_vec: &[T] = bytemuck::cast_slice(&node.fp_vector); let fp_dist = distance_comparer.evaluate_similarity(query, fp_vec); + trace.end_phase_fp_distance(); if vector_filter.map_or(true, |f| f(&vid)) { full_retset.push((vid, fp_dist)); } @@ -385,9 +401,11 @@ pub(crate) fn pipe_search( } } + let num_new_candidates; if !nbors_to_compute.is_empty() { comparisons += nbors_to_compute.len() as u32; // Compute PQ distances for unvisited neighbors + trace.begin_phase(); compute_pq_distance( &nbors_to_compute, num_pq_chunks, @@ -396,8 +414,11 @@ pub(crate) fn pipe_search( &mut pq_scratch.aligned_pq_coord_scratch, &mut pq_scratch.aligned_dist_scratch, )?; + trace.end_phase_pq_distance(); + trace.begin_phase(); let mut nk = cur_list_size; + let mut n_inserted: u32 = 0; for (m, &nbr_id) in nbors_to_compute.iter().enumerate() { let nbr_dist = pq_scratch.aligned_dist_scratch[m]; if cur_list_size == search_l @@ -418,8 +439,21 @@ pub(crate) fn pipe_search( if r < nk { nk = r; } + n_inserted += 1; } + trace.end_phase_queue_ops(); + num_new_candidates = n_inserted; + } else { + num_new_candidates = 0; } + + trace.record_expand(); + trace.event(TraceEventKind::Expand { + node_id: vid, + fp_distance: fp_dist, + num_neighbors: node.adjacency_list.len() as u32, + num_new_candidates, + }); } // Find first_unvisited_eager for convergence tracking @@ -496,6 +530,15 @@ pub(crate) fn pipe_search( let total_us = timer.elapsed().as_micros(); + trace.event(TraceEventKind::Done { + total_hops: hops, + total_ios: io_count, + total_comparisons: comparisons, + }); + if let Some(t) = trace.0.as_mut() { + t.finish(); + } + Ok(PipeSearchResult { ids, distances, diff --git a/diskann-disk/src/search/pipelined/pipelined_searcher.rs b/diskann-disk/src/search/pipelined/pipelined_searcher.rs index 712c879c0..09129c3ed 100644 --- a/diskann-disk/src/search/pipelined/pipelined_searcher.rs +++ b/diskann-disk/src/search/pipelined/pipelined_searcher.rs 
@@ -260,6 +260,7 @@ where self.relaxed_monotonicity_l, self.metric, vector_filter, + None, // trace )?; let query_statistics = QueryStatistics { diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 650833392..c8044ccba 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -15,6 +15,7 @@ use std::future::Future; use std::ops::Range; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::Arc; +use std::time::Instant; use byteorder::{ByteOrder, LittleEndian}; use diskann::{ @@ -34,6 +35,7 @@ use diskann_vector::DistanceFunction; use crate::data_model::Cache; use crate::search::pipelined::{PipelinedReader, PipelinedReaderConfig, MAX_IO_CONCURRENCY}; +use crate::search::search_trace::{OptionalTrace, SearchTrace, TraceEventKind}; use crate::search::sector_math::{node_offset_in_sector, node_sector_index}; use crate::search::traits::VertexProviderFactory; use crate::utils::QueryStatistics; @@ -208,6 +210,9 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { cache_hits: u32, // Shared stats written on drop so caller can read them after search shared_io_stats: Arc, + + // Optional per-query trace for profiling and algorithmic comparison + trace: Option, } impl<'a, Data> PipelinedDiskAccessor<'a, Data> @@ -252,6 +257,7 @@ where io_count: 0, cache_hits: 0, shared_io_stats, + trace: None, }) } @@ -270,6 +276,19 @@ where Ok(()) } + /// Enable per-query tracing. Call before search. + pub fn enable_trace(&mut self) { + self.trace = Some(SearchTrace::new()); + } + + /// Take the completed trace (if any). Call after search. + pub fn take_trace(&mut self) -> Option { + if let Some(t) = self.trace.as_mut() { + t.finish(); + } + self.trace.take() + } + /// Compute PQ distances for a set of neighbor IDs. fn pq_distances(&mut self, ids: &[u32], mut f: F) -> ANNResult<()> where @@ -303,11 +322,14 @@ where /// Poll completed IOs and move data from reader buffers into loaded_nodes. fn drain_completions(&mut self) -> ANNResult<()> { + let mut trace = OptionalTrace(self.trace.as_mut()); + trace.begin_phase(); let completed_slots = if self.in_flight_ios.is_empty() { Vec::new() } else { self.scratch.reader.poll_completions()? }; + trace.end_phase_io_poll(); if !completed_slots.is_empty() { let completed_set: std::collections::HashSet = @@ -315,6 +337,7 @@ where let mut remaining = VecDeque::new(); while let Some(io) = self.in_flight_ios.pop_front() { if completed_set.contains(&io.slot_id) { + trace.begin_phase(); let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); let parsed = parse_node( sector_buf, @@ -323,6 +346,8 @@ where self.node_len, self.fp_vector_len, )?; + trace.end_phase_parse_node(); + trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); self.loaded_nodes.insert(io.vertex_id, LoadedNode { fp_vector: parsed.fp_vector, adjacency_list: parsed.adjacency_list, @@ -454,6 +479,8 @@ where /// Nodes found in the node cache are placed directly into `loaded_nodes`, /// skipping disk IO entirely. 
fn submit_expand(&mut self, ids: impl Iterator + Send) { + let mut trace = OptionalTrace(self.trace.as_mut()); + trace.begin_phase(); for id in ids { if self.loaded_nodes.contains_key(&id) { continue; // Already loaded from a previous IO @@ -471,6 +498,7 @@ where self.next_rank += 1; self.loaded_nodes.insert(id, LoadedNode { fp_vector, adjacency_list, rank }); self.cache_hits += 1; + trace.event(TraceEventKind::CacheHit { node_id: id }); continue; } @@ -493,10 +521,15 @@ where slot_id, rank, }); + trace.event(TraceEventKind::Submit { + node_id: id, + inflight: self.in_flight_ios.len(), + }); self.next_slot_id = (self.next_slot_id + 1) % self.max_slots; self.io_count += 1; } } + trace.end_phase_io_submit(); } /// Poll for completed reads and expand up to `up_to` nodes. @@ -547,11 +580,15 @@ where }; // Compute full-precision distance and cache it for post-processing + let fp_start = Instant::now(); let fp_vec: &[Data::VectorDataType] = bytemuck::cast_slice(&node.fp_vector); let fp_dist = self .provider .distance_comparer .evaluate_similarity(self.query, fp_vec); + if let Some(t) = self.trace.as_mut() { + t.profile.fp_distance_us += fp_start.elapsed().as_micros() as u64; + } self.distance_cache.insert(vid, fp_dist); // Get unvisited neighbors @@ -561,9 +598,24 @@ where .copied() .filter(|&nbr| (nbr as usize) < self.num_points && pred.eval_mut(&nbr)) .collect(); + let num_new = neighbors.len() as u32; if !neighbors.is_empty() { + let pq_start = Instant::now(); self.pq_distances(&neighbors, &mut on_neighbors)?; + if let Some(t) = self.trace.as_mut() { + t.profile.pq_distance_us += pq_start.elapsed().as_micros() as u64; + } + } + + if let Some(t) = self.trace.as_mut() { + t.record_expand(); + t.event(TraceEventKind::Expand { + node_id: vid, + fp_distance: fp_dist, + num_neighbors: node.adjacency_list.len() as u32, + num_new_candidates: num_new, + }); } expanded += 1; diff --git a/diskann-disk/src/search/search_trace.rs b/diskann-disk/src/search/search_trace.rs new file mode 100644 index 000000000..16b0d3f5d --- /dev/null +++ b/diskann-disk/src/search/search_trace.rs @@ -0,0 +1,294 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Per-query search tracing and profiling for comparing PipeSearch vs UnifiedPipeSearch. +//! +//! Captures two kinds of data: +//! - **Event trace**: Ordered list of search events (submit, complete, expand, etc.) +//! for side-by-side algorithmic comparison. +//! - **Profile counters**: Cumulative time in each phase (IO poll, IO submit, expand, +//! PQ distance, queue ops, spin-wait) for identifying bottlenecks. +//! +//! Tracing is opt-in: create a `SearchTrace` and pass it to the search function. +//! When disabled (None), all operations are zero-cost. + +use std::time::{Duration, Instant}; + +/// A single event in the search trace. +#[derive(Debug, Clone)] +pub struct TraceEvent { + /// Microseconds since the start of the search. + pub time_us: u64, + /// The event kind. + pub kind: TraceEventKind, +} + +/// Kinds of trace events. +#[derive(Debug, Clone)] +pub enum TraceEventKind { + /// IO submitted for a node. `inflight` is the count AFTER submission. + Submit { node_id: u32, inflight: usize }, + /// IO completed for a node (data loaded from disk). + Complete { node_id: u32 }, + /// Node loaded from cache (no IO needed). + CacheHit { node_id: u32 }, + /// Node expanded: FP distance computed, neighbors discovered. 
+ Expand { + node_id: u32, + fp_distance: f32, + num_neighbors: u32, + num_new_candidates: u32, + }, + /// Node selected from priority queue for submission. + Select { + node_id: u32, + pq_distance: f32, + queue_position: u32, + }, + /// Poll returned no completions (spin-wait iteration). + SpinWait, + /// Search terminated. + Done { + total_hops: u32, + total_ios: u32, + total_comparisons: u32, + }, +} + +/// Cumulative profiling counters for a single query. +#[derive(Debug, Clone, Default)] +pub struct SearchProfile { + /// Time spent polling io_uring for completions. + pub io_poll_us: u64, + /// Time spent submitting IO requests. + pub io_submit_us: u64, + /// Time spent computing full-precision distances. + pub fp_distance_us: u64, + /// Time spent computing PQ distances for neighbors. + pub pq_distance_us: u64, + /// Time spent on priority queue operations (insert, closest_notvisited). + pub queue_ops_us: u64, + /// Time spent in spin-wait (nothing to submit or expand). + pub spin_wait_us: u64, + /// Time spent parsing nodes from sector buffers. + pub parse_node_us: u64, + /// Number of spin-wait iterations. + pub spin_wait_count: u64, + /// Number of IO poll calls. + pub poll_count: u64, + /// Number of IO submit calls. + pub submit_count: u64, + /// Number of nodes expanded. + pub expand_count: u64, + /// Total search wall time. + pub total_us: u64, +} + +/// Per-query search trace collector. +/// +/// Create one per query, pass to search functions. After search completes, +/// inspect `events` and `profile` for analysis. +pub struct SearchTrace { + start: Instant, + pub events: Vec, + pub profile: SearchProfile, + phase_start: Option, +} + +impl SearchTrace { + pub fn new() -> Self { + Self { + start: Instant::now(), + events: Vec::with_capacity(256), + profile: SearchProfile::default(), + phase_start: None, + } + } + + /// Record a trace event with the current timestamp. + #[inline] + pub fn event(&mut self, kind: TraceEventKind) { + let time_us = self.start.elapsed().as_micros() as u64; + self.events.push(TraceEvent { time_us, kind }); + } + + /// Start timing a phase. Call `end_phase_*` to accumulate the duration. + #[inline] + pub fn begin_phase(&mut self) { + self.phase_start = Some(Instant::now()); + } + + /// End the current phase and add elapsed time to `io_poll_us`. 
+ #[inline] + pub fn end_phase_io_poll(&mut self) { + if let Some(start) = self.phase_start.take() { + self.profile.io_poll_us += start.elapsed().as_micros() as u64; + self.profile.poll_count += 1; + } + } + + #[inline] + pub fn end_phase_io_submit(&mut self) { + if let Some(start) = self.phase_start.take() { + self.profile.io_submit_us += start.elapsed().as_micros() as u64; + self.profile.submit_count += 1; + } + } + + #[inline] + pub fn end_phase_fp_distance(&mut self) { + if let Some(start) = self.phase_start.take() { + self.profile.fp_distance_us += start.elapsed().as_micros() as u64; + } + } + + #[inline] + pub fn end_phase_pq_distance(&mut self) { + if let Some(start) = self.phase_start.take() { + self.profile.pq_distance_us += start.elapsed().as_micros() as u64; + } + } + + #[inline] + pub fn end_phase_queue_ops(&mut self) { + if let Some(start) = self.phase_start.take() { + self.profile.queue_ops_us += start.elapsed().as_micros() as u64; + } + } + + #[inline] + pub fn end_phase_spin_wait(&mut self) { + if let Some(start) = self.phase_start.take() { + self.profile.spin_wait_us += start.elapsed().as_micros() as u64; + self.profile.spin_wait_count += 1; + } + } + + #[inline] + pub fn end_phase_parse_node(&mut self) { + if let Some(start) = self.phase_start.take() { + self.profile.parse_node_us += start.elapsed().as_micros() as u64; + } + } + + #[inline] + pub fn record_expand(&mut self) { + self.profile.expand_count += 1; + } + + /// Finalize the trace, recording total wall time. + pub fn finish(&mut self) { + self.profile.total_us = self.start.elapsed().as_micros() as u64; + } + + /// Print a summary of the profile to stderr (for debugging). + pub fn print_profile_summary(&self) { + let p = &self.profile; + let accounted = p.io_poll_us + p.io_submit_us + p.fp_distance_us + + p.pq_distance_us + p.queue_ops_us + p.spin_wait_us + p.parse_node_us; + let other = p.total_us.saturating_sub(accounted); + eprintln!( + "Profile: total={}us io_poll={}us({}) io_submit={}us({}) \ + fp_dist={}us pq_dist={}us queue={}us spin={}us({}) parse={}us other={}us | \ + expands={} polls={} submits={}", + p.total_us, + p.io_poll_us, p.poll_count, + p.io_submit_us, p.submit_count, + p.fp_distance_us, + p.pq_distance_us, + p.queue_ops_us, + p.spin_wait_us, p.spin_wait_count, + p.parse_node_us, + other, + p.expand_count, p.poll_count, p.submit_count, + ); + } + + /// Print the first N events to stderr (for debugging). + pub fn print_events(&self, max: usize) { + for (i, ev) in self.events.iter().enumerate().take(max) { + eprintln!(" [{:>4}] @{:>6}us {:?}", i, ev.time_us, ev.kind); + } + if self.events.len() > max { + eprintln!(" ... ({} more events)", self.events.len() - max); + } + } +} + +/// Optional trace wrapper — all methods are no-ops when None. +/// This avoids polluting call sites with `if let Some(trace) = ...`. 
+pub struct OptionalTrace<'a>(pub Option<&'a mut SearchTrace>); + +impl<'a> OptionalTrace<'a> { + #[inline] + pub fn event(&mut self, kind: TraceEventKind) { + if let Some(t) = self.0.as_mut() { + t.event(kind); + } + } + + #[inline] + pub fn begin_phase(&mut self) { + if let Some(t) = self.0.as_mut() { + t.begin_phase(); + } + } + + #[inline] + pub fn end_phase_io_poll(&mut self) { + if let Some(t) = self.0.as_mut() { + t.end_phase_io_poll(); + } + } + + #[inline] + pub fn end_phase_io_submit(&mut self) { + if let Some(t) = self.0.as_mut() { + t.end_phase_io_submit(); + } + } + + #[inline] + pub fn end_phase_fp_distance(&mut self) { + if let Some(t) = self.0.as_mut() { + t.end_phase_fp_distance(); + } + } + + #[inline] + pub fn end_phase_pq_distance(&mut self) { + if let Some(t) = self.0.as_mut() { + t.end_phase_pq_distance(); + } + } + + #[inline] + pub fn end_phase_queue_ops(&mut self) { + if let Some(t) = self.0.as_mut() { + t.end_phase_queue_ops(); + } + } + + #[inline] + pub fn end_phase_spin_wait(&mut self) { + if let Some(t) = self.0.as_mut() { + t.end_phase_spin_wait(); + } + } + + #[inline] + pub fn end_phase_parse_node(&mut self) { + if let Some(t) = self.0.as_mut() { + t.end_phase_parse_node(); + } + } + + #[inline] + pub fn record_expand(&mut self) { + if let Some(t) = self.0.as_mut() { + t.record_expand(); + } + } +} From 39b7ba79af14e350e44de6465d4b03c6afd8fd2c Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 17:52:55 -0800 Subject: [PATCH 15/46] Wire DISKANN_TRACE env var for per-query profiling When DISKANN_TRACE=1, both PipeSearch and UnifiedPipeSearch print per-query profile summaries to stderr. PipelinedConfig gets a trace_enabled field. Enables aggregate profiling analysis. --- diskann-benchmark/src/backend/disk_index/search.rs | 3 +++ .../src/search/pipelined/pipelined_searcher.rs | 14 +++++++++++++- .../src/search/provider/pipelined_accessor.rs | 12 ++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 970d7460b..20c4b7439 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -514,6 +514,8 @@ where None, )?; + let trace_enabled = std::env::var("DISKANN_TRACE").map_or(false, |v| v == "1"); + searcher.with_pipelined_config(PipelinedConfig { beam_width: search_params.beam_width, adaptive_beam_width: *adaptive_beam_width, @@ -521,6 +523,7 @@ where node_cache, scratch_pool, scratch_args, + trace_enabled, }); let searcher = &searcher; diff --git a/diskann-disk/src/search/pipelined/pipelined_searcher.rs b/diskann-disk/src/search/pipelined/pipelined_searcher.rs index 09129c3ed..34a68803d 100644 --- a/diskann-disk/src/search/pipelined/pipelined_searcher.rs +++ b/diskann-disk/src/search/pipelined/pipelined_searcher.rs @@ -27,6 +27,7 @@ use crate::{ use super::pipelined_reader::{PipelinedReader, PipelinedReaderConfig}; use super::pipelined_search::{pipe_search, PipeSearchResult}; +use crate::search::search_trace::SearchTrace; /// Scratch space for pipelined search operations, pooled for reuse across queries. 
struct PipelinedSearchScratch { @@ -242,6 +243,13 @@ where ref mut pq_scratch, } = *scratch; + let trace_enabled = std::env::var("DISKANN_TRACE").map_or(false, |v| v == "1"); + let mut trace = if trace_enabled { + Some(SearchTrace::new()) + } else { + None + }; + let result: PipeSearchResult = pipe_search::( reader, &self.pq_data, @@ -260,9 +268,13 @@ where self.relaxed_monotonicity_l, self.metric, vector_filter, - None, // trace + trace.as_mut(), )?; + if let Some(t) = &trace { + t.print_profile_summary(); + } + let query_statistics = QueryStatistics { total_execution_time_us: result.stats.total_us, io_time_us: result.stats.io_us, diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index c8044ccba..31d49db28 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -660,6 +660,12 @@ where self.shared_io_stats .cache_hits .fetch_add(self.cache_hits, Ordering::Relaxed); + + // Print trace if enabled + if let Some(trace) = self.trace.as_mut() { + trace.finish(); + trace.print_profile_summary(); + } } } @@ -681,6 +687,9 @@ pub struct PipelinedConfig> { pub scratch_pool: Arc>, /// Args for retrieving/creating pooled scratch instances. pub scratch_args: PipelinedScratchArgs, + /// Enable per-query SearchTrace. The trace profile is printed to stderr + /// after each query completes. Use for profiling, not production. + pub trace_enabled: bool, } /// Shared IO statistics written by the accessor and read by the caller after search. @@ -782,6 +791,9 @@ where self.io_stats.clone(), )?; accessor.preprocess_query()?; + if self.config.trace_enabled { + accessor.enable_trace(); + } Ok(accessor) } From 08b4f889b7326f773bddde0fbf00686ed2ebe8cb Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 21:13:11 -0800 Subject: [PATCH 16/46] Decouple IO submission from node visitation Speculative submission: nodes are no longer marked visited when their IO is submitted. Instead, a separate 'submitted' HashSet tracks which nodes have been submitted but not yet expanded. Nodes are only marked visited after actual expansion, matching PipeSearch's decoupled flag/visited model. Key changes: - Queue: add peek_best_unsubmitted() and mark_visited_by_id() (13 new tests) - Search loop: use peek_best_unsubmitted for pipelined submission, mark_visited_by_id after expansion, pass queue-ordered IDs to expand_available for best-available node selection - Accessor: expand_available prefers nodes by caller-supplied queue order (first loaded match), falling back to submission rank. Tracks expanded IDs via last_expanded_ids() for the search loop. - ExpandBeam trait: add last_expanded_ids() with empty default - Non-pipelined path completely unchanged (uses closest_notvisited) --- .../src/search/provider/pipelined_accessor.rs | 175 ++++++++----- diskann/src/graph/glue.rs | 25 +- diskann/src/graph/index.rs | 73 ++++-- diskann/src/neighbor/queue.rs | 246 +++++++++++++++++- 4 files changed, 432 insertions(+), 87 deletions(-) diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 31d49db28..ddb2e7b84 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -201,6 +201,8 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { max_slots: usize, /// Monotonically increasing submission rank for priority-ordered expansion. 
next_rank: u64, + /// IDs expanded in the most recent `expand_available` call. + expanded_ids: Vec, // Distance cache for post-processing rerank distance_cache: HashMap, @@ -253,6 +255,7 @@ where next_slot_id: 0, max_slots: slots, next_rank: 0, + expanded_ids: Vec::new(), distance_cache: HashMap::new(), io_count: 0, cache_hits: 0, @@ -331,6 +334,44 @@ where }; trace.end_phase_io_poll(); + if !completed_slots.is_empty() { + let completed_set: std::collections::HashSet = + completed_slots.into_iter().collect(); + let mut remaining = VecDeque::new(); + while let Some(io) = self.in_flight_ios.pop_front() { + if completed_set.contains(&io.slot_id) { + trace.begin_phase(); + let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); + let parsed = parse_node( + sector_buf, + io.vertex_id, + self.num_nodes_per_sector, + self.node_len, + self.fp_vector_len, + )?; + trace.end_phase_parse_node(); + trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); + self.loaded_nodes.insert(io.vertex_id, LoadedNode { + fp_vector: parsed.fp_vector, + adjacency_list: parsed.adjacency_list, + rank: io.rank, + }); + } else { + remaining.push_back(io); + } + } + self.in_flight_ios = remaining; + } + Ok(()) + } + /// Block until at least one IO completes, then eagerly drain all available. + /// Reuses the same completion → loaded_nodes logic as drain_completions. + fn wait_and_drain(&mut self) -> ANNResult<()> { + let mut trace = OptionalTrace(self.trace.as_mut()); + trace.begin_phase(); + let completed_slots = self.scratch.reader.wait_completions()?; + trace.end_phase_io_poll(); + if !completed_slots.is_empty() { let completed_set: std::collections::HashSet = completed_slots.into_iter().collect(); @@ -532,96 +573,93 @@ where trace.end_phase_io_submit(); } - /// Poll for completed reads and expand up to `up_to` nodes. - /// Remaining loaded-but-unexpanded nodes stay buffered for the next call, - /// which lets the search loop submit new IOs sooner (process-few-submit-few). + /// Poll for completed reads and expand all loaded nodes. fn expand_available( &mut self, - _ids: impl Iterator + Send, + ids: impl Iterator + Send, _computer: &Self::QueryComputer, mut pred: P, mut on_neighbors: F, - up_to: usize, ) -> impl std::future::Future> + Send where P: HybridPredicate + Send + Sync, F: FnMut(f32, Self::Id) + Send, { async move { + self.expanded_ids.clear(); + // Non-blocking poll for completions self.drain_completions()?; - // If nothing loaded yet, return 0 so the search loop can submit - // more IOs before we block. This matches PipeSearch's non-blocking - // poll pattern and avoids stalling the pipeline. if self.loaded_nodes.is_empty() { return Ok(0); } - // Expand loaded nodes in submission order (lowest rank first). - // Nodes submitted earlier had better PQ distance (came from - // closest_notvisited), so expanding them first — like PipeSearch's - // "best available" strategy — improves search quality. - let mut ranked: Vec<(u64, u32)> = self - .loaded_nodes - .iter() - .map(|(&id, node)| (node.rank, id)) - .collect(); - ranked.sort_unstable(); - let mut expanded = 0; - - for (_, vid) in ranked { - if expanded >= up_to { + // Prefer expanding a node the search loop ranks highest (first + // match in the caller-supplied `ids` iterator). Fall back to the + // loaded node with lowest submission rank for backward compat. 
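+        // (Lower rank means submitted earlier, i.e. the better PQ distance at the time.)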
+ let mut best_vid: Option = None; + for id in ids { + if self.loaded_nodes.contains_key(&id) { + best_vid = Some(id); break; } - let node = match self.loaded_nodes.remove(&vid) { - Some(n) => n, - None => continue, - }; - - // Compute full-precision distance and cache it for post-processing - let fp_start = Instant::now(); - let fp_vec: &[Data::VectorDataType] = bytemuck::cast_slice(&node.fp_vector); - let fp_dist = self - .provider - .distance_comparer - .evaluate_similarity(self.query, fp_vec); - if let Some(t) = self.trace.as_mut() { - t.profile.fp_distance_us += fp_start.elapsed().as_micros() as u64; - } - self.distance_cache.insert(vid, fp_dist); - - // Get unvisited neighbors - let neighbors: Vec = node - .adjacency_list + } + if best_vid.is_none() { + best_vid = self + .loaded_nodes .iter() - .copied() - .filter(|&nbr| (nbr as usize) < self.num_points && pred.eval_mut(&nbr)) - .collect(); - let num_new = neighbors.len() as u32; - - if !neighbors.is_empty() { - let pq_start = Instant::now(); - self.pq_distances(&neighbors, &mut on_neighbors)?; - if let Some(t) = self.trace.as_mut() { - t.profile.pq_distance_us += pq_start.elapsed().as_micros() as u64; - } - } + .min_by_key(|(_, node)| node.rank) + .map(|(&id, _)| id); + } + + let vid = match best_vid { + Some(id) => id, + None => return Ok(0), + }; + let node = self.loaded_nodes.remove(&vid).unwrap(); + self.expanded_ids.push(vid); + + // Compute full-precision distance and cache it for post-processing + let fp_start = Instant::now(); + let fp_vec: &[Data::VectorDataType] = bytemuck::cast_slice(&node.fp_vector); + let fp_dist = self + .provider + .distance_comparer + .evaluate_similarity(self.query, fp_vec); + if let Some(t) = self.trace.as_mut() { + t.profile.fp_distance_us += fp_start.elapsed().as_micros() as u64; + } + self.distance_cache.insert(vid, fp_dist); + // Get unvisited neighbors + let neighbors: Vec = node + .adjacency_list + .iter() + .copied() + .filter(|&nbr| (nbr as usize) < self.num_points && pred.eval_mut(&nbr)) + .collect(); + let num_new = neighbors.len() as u32; + + if !neighbors.is_empty() { + let pq_start = Instant::now(); + self.pq_distances(&neighbors, &mut on_neighbors)?; if let Some(t) = self.trace.as_mut() { - t.record_expand(); - t.event(TraceEventKind::Expand { - node_id: vid, - fp_distance: fp_dist, - num_neighbors: node.adjacency_list.len() as u32, - num_new_candidates: num_new, - }); + t.profile.pq_distance_us += pq_start.elapsed().as_micros() as u64; } + } - expanded += 1; + if let Some(t) = self.trace.as_mut() { + t.record_expand(); + t.event(TraceEventKind::Expand { + node_id: vid, + fp_distance: fp_dist, + num_neighbors: node.adjacency_list.len() as u32, + num_new_candidates: num_new, + }); } - Ok(expanded) + Ok(1) } } @@ -633,6 +671,17 @@ where fn inflight_count(&self) -> usize { self.in_flight_ios.len() } + + fn wait_for_io(&mut self) { + // Only block if there are actually in-flight IOs to wait for + if !self.in_flight_ios.is_empty() { + let _ = self.wait_and_drain(); + } + } + + fn last_expanded_ids(&self) -> &[u32] { + &self.expanded_ids + } } impl SearchExt for PipelinedDiskAccessor<'_, Data> diff --git a/diskann/src/graph/glue.rs b/diskann/src/graph/glue.rs index 6a1c7d405..ba95ecaf0 100644 --- a/diskann/src/graph/glue.rs +++ b/diskann/src/graph/glue.rs @@ -277,10 +277,6 @@ where /// completed IO operations and expands only the nodes whose data has arrived, /// returning immediately without blocking. /// - /// `up_to` limits how many nodes are expanded in a single call. 
Pipelined - /// providers should respect this to keep the IO pipeline full (expand fewer → - /// submit sooner). Non-pipelined providers may ignore it. - /// /// Returns the number of nodes that were expanded in this call. fn expand_available( &mut self, @@ -288,14 +284,12 @@ where computer: &Self::QueryComputer, pred: P, on_neighbors: F, - up_to: usize, ) -> impl std::future::Future> + Send where P: HybridPredicate + Send + Sync, F: FnMut(f32, Self::Id) + Send, { async move { - let _ = up_to; // default impl processes everything let id_vec: Vec = ids.collect(); let count = id_vec.len(); self.expand_beam(id_vec.into_iter(), computer, pred, on_neighbors) @@ -322,6 +316,25 @@ where 0 } + /// Block until at least one IO completes, then eagerly drain all available. + /// + /// Called by the search loop only when it cannot make progress: nothing was + /// submitted (no candidates or inflight cap reached) AND nothing was expanded + /// (no completions available). Blocking here yields the CPU thread instead of + /// spin-polling, while the eager drain ensures we process bursts efficiently. + /// + /// Default: no-op (non-pipelined providers never need to wait). + fn wait_for_io(&mut self) {} + + /// Return the IDs of nodes expanded in the most recent `expand_available` call. + /// + /// The search loop uses this to mark speculatively submitted nodes as visited + /// only after they have actually been expanded. Non-pipelined providers return + /// an empty slice (they mark visited at selection time). + fn last_expanded_ids(&self) -> &[Self::Id] { + &[] + } + /// Expand all `ids` synchronously: load data, get neighbors, compute distances. /// /// This is the original single-shot expansion method. For non-pipelined providers, diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 9fc2f80d6..7aff58115 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2114,11 +2114,17 @@ where // Tracks how many nodes were expanded last iteration, so the // pipelined submit can match its rate (process-N-submit-N). let mut last_expanded: usize = 0; + // Tracks speculatively submitted (but not yet visited/expanded) nodes + // so the pipelined path can decouple submission from visitation. + let mut submitted = std::collections::HashSet::::new(); - while (scratch.best.has_notvisited_node() || accessor.has_pending()) + while (scratch.best.has_notvisited_node() + || scratch.best.peek_best_unsubmitted(&submitted).is_some() + || accessor.has_pending()) && !accessor.terminate_early() { let has_pending = accessor.has_pending(); + let pipelining = has_pending || !submitted.is_empty(); // When pipelining, cap total in-flight IOs at cur_beam_width // (like PipeSearch) to avoid over-committing the priority queue. @@ -2135,38 +2141,73 @@ where }; scratch.beam_nodes.clear(); - while scratch.best.has_notvisited_node() - && scratch.beam_nodes.len() < submit_limit - { - let closest_node = scratch.best.closest_notvisited(); - search_record.record(closest_node, scratch.hops, scratch.cmps); - scratch.beam_nodes.push(closest_node.id); + if pipelining { + // Speculative submission: peek without marking visited. + // Nodes are marked visited only after actual expansion. 
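+                    // The `submitted` set is what prevents re-issuing an in-flight IO.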
+ while scratch.beam_nodes.len() < submit_limit { + if let Some(closest_node) = + scratch.best.peek_best_unsubmitted(&submitted) + { + search_record.record(closest_node, scratch.hops, scratch.cmps); + submitted.insert(closest_node.id); + scratch.beam_nodes.push(closest_node.id); + } else { + break; + } + } + } else { + // Non-pipelined: use the original visited-at-selection path. + while scratch.best.has_notvisited_node() + && scratch.beam_nodes.len() < submit_limit + { + let closest_node = scratch.best.closest_notvisited(); + search_record.record(closest_node, scratch.hops, scratch.cmps); + scratch.beam_nodes.push(closest_node.id); + } } // Submit to expansion queue (no-op for non-pipelined) accessor.submit_expand(scratch.beam_nodes.iter().copied()); - // Expand available nodes. When pipelining, expand one at a time - // so we loop back to submit new IOs sooner (process-one-submit-one). - // For non-pipelined, expand all (usize::MAX). - let expand_limit = if has_pending { 1 } else { usize::MAX }; + // Expand all available loaded nodes. The pipeline depth is + // controlled by the inflight cap on submissions, not by + // limiting expansions. For non-pipelined, this expands the + // full beam as before. neighbors.clear(); + // Pass submitted-but-not-yet-expanded nodes in queue priority + // order so the accessor expands the best available loaded node, + // matching PipeSearch's "best unvisited in retset" strategy. + let queue_ordered: Vec = if pipelining { + scratch.best.iter() + .filter(|n| submitted.contains(&n.id)) + .map(|n| n.id) + .collect() + } else { + Vec::new() + }; let expanded = accessor .expand_available( - scratch.beam_nodes.iter().copied(), + queue_ordered.iter().copied(), computer, glue::NotInMut::new(&mut scratch.visited), |distance, id| neighbors.push(Neighbor::new(id, distance)), - expand_limit, ) .await?; last_expanded = expanded; + // Mark expanded nodes as visited and remove from submitted set. + for &id in accessor.last_expanded_ids() { + scratch.best.mark_visited_by_id(&id); + submitted.remove(&id); + } + // When pipelining and nothing was submitted or expanded, - // hint the CPU we're spin-waiting for IO to avoid burning - // cycles and hurting tail latency on shared cores. + // block until at least one IO completes. This only fires + // when the loop truly has nothing to do (inflight cap reached + // AND no completions available), replacing ~400 spin iterations + // with a single kernel wait + eager drain of all ready CQEs. if expanded == 0 && scratch.beam_nodes.is_empty() && has_pending { - std::hint::spin_loop(); + accessor.wait_for_io(); } neighbors diff --git a/diskann/src/neighbor/queue.rs b/diskann/src/neighbor/queue.rs index 48453ca2f..ec1148fd0 100644 --- a/diskann/src/neighbor/queue.rs +++ b/diskann/src/neighbor/queue.rs @@ -4,6 +4,7 @@ */ use diskann_wide::{SIMDMask, SIMDPartialOrd, SIMDVector}; +use std::collections::HashSet; use std::marker::PhantomData; use super::Neighbor; @@ -11,14 +12,14 @@ use super::Neighbor; /// Shared trait for type the generic `I` parameter used by the /// `NeighborPeriorityQueue`. pub trait NeighborPriorityQueueIdType: - Default + Eq + Clone + Copy + std::fmt::Debug + std::fmt::Display + Send + Sync + Default + Eq + Clone + Copy + std::fmt::Debug + std::fmt::Display + std::hash::Hash + Send + Sync { } /// Any type that implements all the individual requirements for /// `NeighborPriorityQueueIdType` implements the full trait. 
impl NeighborPriorityQueueIdType for T where - T: Default + Eq + Clone + Copy + std::fmt::Debug + std::fmt::Display + Send + Sync + T: Default + Eq + Clone + Copy + std::fmt::Debug + std::fmt::Display + std::hash::Hash + Send + Sync { } @@ -59,6 +60,18 @@ pub trait NeighborQueue: std::fmt::Debug + Send /// Return an iterator over the best candidates. fn iter(&self) -> Self::Iter<'_>; + + /// Return the first node (by distance order) that is not visited and not in `submitted`, + /// scanning positions 0..min(size, search_l). Does not modify any state. + fn peek_best_unsubmitted(&self, _submitted: &HashSet) -> Option> { + None + } + + /// Find the node with matching `id`, mark it visited, and advance the cursor if needed. + /// Returns true if found and marked, false otherwise. + fn mark_visited_by_id(&mut self, _id: &I) -> bool { + false + } } /// Neighbor priority Queue based on the distance to the query node @@ -485,6 +498,38 @@ impl NeighborPriorityQueue { self.cursor = 0; } } + + /// Return the first node that is not visited and not in `submitted`, + /// scanning positions 0..min(size, search_param_l). Does not modify any state. + pub fn peek_best_unsubmitted(&self, submitted: &HashSet) -> Option> { + let limit = self.search_param_l.min(self.size); + for i in 0..limit { + let (id, visited) = self.id_visiteds[i]; + if !visited && !submitted.contains(&id) { + return Some(Neighbor::new(id, self.distances[i])); + } + } + None + } + + /// Find the node with matching `id`, mark it visited, and advance the cursor if needed. + /// Returns true if found and marked, false otherwise. + pub fn mark_visited_by_id(&mut self, id: &I) -> bool { + for i in 0..self.size { + if self.id_visiteds[i].0 == *id { + self.id_visiteds[i].1 = true; + // If the cursor was pointing at this node, advance past visited nodes + if self.cursor == i { + self.cursor += 1; + while self.cursor < self.size && self.get_visited(self.cursor) { + self.cursor += 1; + } + } + return true; + } + } + false + } } impl NeighborQueue for NeighborPriorityQueue { @@ -529,6 +574,14 @@ impl NeighborQueue for NeighborPriorityQueue< fn iter(&self) -> Self::Iter<'_> { self.iter() } + + fn peek_best_unsubmitted(&self, submitted: &HashSet) -> Option> { + self.peek_best_unsubmitted(submitted) + } + + fn mark_visited_by_id(&mut self, id: &I) -> bool { + self.mark_visited_by_id(id) + } } /// Enable the following syntax for iteration over the valid elements in the queue. 
@@ -1427,4 +1480,193 @@ mod neighbor_priority_queue_test { assert_eq!(queue.size(), 1); assert_eq!(queue.cursor, 0); // cursor is always reset to 0 } + + #[test] + fn test_peek_best_unsubmitted_basic() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + queue.insert(Neighbor::new(3, 1.5)); + // Queue sorted: [2(0.5), 1(1.0), 3(1.5)] + + let submitted = HashSet::new(); + let result = queue.peek_best_unsubmitted(&submitted); + assert!(result.is_some()); + assert_eq!(result.unwrap().id, 2); // closest unvisited, unsubmitted + } + + #[test] + fn test_peek_best_unsubmitted_skips_submitted() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + queue.insert(Neighbor::new(3, 1.5)); + // Queue sorted: [2(0.5), 1(1.0), 3(1.5)] + + let mut submitted = HashSet::new(); + submitted.insert(2u32); + let result = queue.peek_best_unsubmitted(&submitted); + assert!(result.is_some()); + assert_eq!(result.unwrap().id, 1); // 2 is submitted, so next is 1 + } + + #[test] + fn test_peek_best_unsubmitted_skips_visited() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + queue.insert(Neighbor::new(3, 1.5)); + // Queue sorted: [2(0.5), 1(1.0), 3(1.5)] + + queue.closest_notvisited(); // visits 2 + + let submitted = HashSet::new(); + let result = queue.peek_best_unsubmitted(&submitted); + assert!(result.is_some()); + assert_eq!(result.unwrap().id, 1); // 2 is visited, so next is 1 + } + + #[test] + fn test_peek_best_unsubmitted_none_when_all_excluded() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + + let mut submitted = HashSet::new(); + submitted.insert(1u32); + submitted.insert(2u32); + let result = queue.peek_best_unsubmitted(&submitted); + assert!(result.is_none()); + } + + #[test] + fn test_peek_best_unsubmitted_respects_search_l() { + let mut queue = NeighborPriorityQueue::auto_resizable_with_search_param_l(2); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + queue.insert(Neighbor::new(3, 1.5)); + queue.insert(Neighbor::new(4, 2.0)); + // Queue sorted: [2(0.5), 1(1.0), 3(1.5), 4(2.0)], search_l=2 + + let mut submitted = HashSet::new(); + submitted.insert(2u32); + submitted.insert(1u32); + // Both nodes within search_l window are submitted + let result = queue.peek_best_unsubmitted(&submitted); + assert!(result.is_none()); + } + + #[test] + fn test_peek_best_unsubmitted_does_not_modify_state() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + + let submitted = HashSet::new(); + let _ = queue.peek_best_unsubmitted(&submitted); + let _ = queue.peek_best_unsubmitted(&submitted); + + // Cursor should still be at 0 (no state modification) + assert_eq!(queue.cursor, 0); + assert!(queue.has_notvisited_node()); + } + + #[test] + fn test_peek_best_unsubmitted_empty_queue() { + let queue = NeighborPriorityQueue::::new(5); + let submitted = HashSet::new(); + assert!(queue.peek_best_unsubmitted(&submitted).is_none()); + } + + #[test] + fn test_mark_visited_by_id_basic() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + queue.insert(Neighbor::new(3, 1.5)); + // Queue sorted: [2(0.5), 1(1.0), 3(1.5)] + + 
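+        // id=1 sorted to index 1, so this exercises the non-cursor marking path.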
assert!(queue.mark_visited_by_id(&1)); + assert!(queue.get_visited(1)); // id=1 is at index 1 + } + + #[test] + fn test_mark_visited_by_id_not_found() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + + assert!(!queue.mark_visited_by_id(&99)); + } + + #[test] + fn test_mark_visited_by_id_advances_cursor() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + queue.insert(Neighbor::new(3, 1.5)); + // Queue sorted: [2(0.5), 1(1.0), 3(1.5)], cursor=0 + + // Mark the node at cursor (id=2 at index 0) + assert!(queue.mark_visited_by_id(&2)); + // Cursor should advance past this visited node to index 1 + assert_eq!(queue.cursor, 1); + } + + #[test] + fn test_mark_visited_by_id_cursor_skips_consecutive_visited() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + queue.insert(Neighbor::new(3, 1.5)); + // Queue sorted: [2(0.5), 1(1.0), 3(1.5)], cursor=0 + + // Visit id=1 (index 1) first - cursor stays at 0 + assert!(queue.mark_visited_by_id(&1)); + assert_eq!(queue.cursor, 0); + + // Now visit id=2 (index 0, where cursor is) - cursor should skip past both visited nodes + assert!(queue.mark_visited_by_id(&2)); + assert_eq!(queue.cursor, 2); // skips index 0 (visited) and index 1 (visited) + } + + #[test] + fn test_mark_visited_by_id_does_not_move_cursor_for_non_cursor_node() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + queue.insert(Neighbor::new(3, 1.5)); + // Queue sorted: [2(0.5), 1(1.0), 3(1.5)], cursor=0 + + // Mark id=3 (index 2) as visited - cursor should stay at 0 + assert!(queue.mark_visited_by_id(&3)); + assert_eq!(queue.cursor, 0); + } + + #[test] + fn test_peek_and_mark_workflow() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + queue.insert(Neighbor::new(3, 1.5)); + // Queue sorted: [2(0.5), 1(1.0), 3(1.5)] + + let mut submitted = HashSet::new(); + + // Peek - should return id=2 + let node = queue.peek_best_unsubmitted(&submitted).unwrap(); + assert_eq!(node.id, 2); + submitted.insert(node.id); + + // Peek again - should return id=1 (2 is submitted) + let node = queue.peek_best_unsubmitted(&submitted).unwrap(); + assert_eq!(node.id, 1); + submitted.insert(node.id); + + // Mark id=2 as visited (IO completed) + assert!(queue.mark_visited_by_id(&2)); + + // Peek - should return id=3 (2 visited, 1 submitted) + let node = queue.peek_best_unsubmitted(&submitted).unwrap(); + assert_eq!(node.id, 3); + } } From bbce45363ebe2820385dd026442d24c9d17c2419 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 21:27:02 -0800 Subject: [PATCH 17/46] Fix decoupled submission: is_pipelined flag, drop stale nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add is_pipelined() to ExpandBeam trait (default false), true for pipelined - Use peek_best_unsubmitted from first iteration when pipelined - has_pending() only counts in-flight IOs, not stale loaded_nodes - Remove rank fallback in expand_available — stale nodes are abandoned - Result: IO/hop pattern matches PipeSearch (35/20 at L=10) --- .../src/search/provider/pipelined_accessor.rs | 21 +++++++++---------- diskann/src/graph/glue.rs | 7 +++++++ diskann/src/graph/index.rs | 11 ++++++---- 3 files changed, 24 insertions(+), 15 deletions(-) 
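A minimal sketch of the resulting submit/visit protocol, using the queue API
added in the previous commit (the two driver functions are simplified
illustrations, assuming the queue types from diskann/src/neighbor/queue.rs are
in scope; the real logic lives in diskann/src/graph/index.rs):

    use std::collections::HashSet;

    // Reserve the best unsubmitted candidate without marking it visited,
    // so it keeps its slot in the priority queue while its read is in flight.
    fn submit_next(
        queue: &NeighborPriorityQueue<u32>,
        submitted: &mut HashSet<u32>,
    ) -> Option<u32> {
        let best = queue.peek_best_unsubmitted(submitted)?; // read-only peek
        submitted.insert(best.id); // prevents double submission
        Some(best.id) // caller issues the async sector read for this id
    }

    // Runs only once the node's IO has completed and it has been expanded.
    fn finish_expand(
        queue: &mut NeighborPriorityQueue<u32>,
        submitted: &mut HashSet<u32>,
        id: u32,
    ) {
        queue.mark_visited_by_id(&id);
        submitted.remove(&id);
    }

Nodes that are submitted but never expanded (stale loads) simply stay in the
submitted set, so future peeks skip them; that is what allows superseded
candidates to be abandoned rather than expanded.
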
diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index ddb2e7b84..05d0ca1dc 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -595,9 +595,11 @@ where return Ok(0); } - // Prefer expanding a node the search loop ranks highest (first - // match in the caller-supplied `ids` iterator). Fall back to the - // loaded node with lowest submission rank for backward compat. + // Expand the highest-priority loaded node according to the + // search loop's current queue ordering (passed via `ids`). + // If no queue-preferred node is loaded, return 0 — stale loaded + // nodes whose candidates have been superseded are abandoned, + // matching PipeSearch's behavior of not expanding evicted nodes. let mut best_vid: Option = None; for id in ids { if self.loaded_nodes.contains_key(&id) { @@ -605,13 +607,6 @@ where break; } } - if best_vid.is_none() { - best_vid = self - .loaded_nodes - .iter() - .min_by_key(|(_, node)| node.rank) - .map(|(&id, _)| id); - } let vid = match best_vid { Some(id) => id, @@ -665,7 +660,7 @@ where /// Returns true when there are in-flight IO operations. fn has_pending(&self) -> bool { - !self.in_flight_ios.is_empty() || !self.loaded_nodes.is_empty() + !self.in_flight_ios.is_empty() } fn inflight_count(&self) -> usize { @@ -682,6 +677,10 @@ where fn last_expanded_ids(&self) -> &[u32] { &self.expanded_ids } + + fn is_pipelined(&self) -> bool { + true + } } impl SearchExt for PipelinedDiskAccessor<'_, Data> diff --git a/diskann/src/graph/glue.rs b/diskann/src/graph/glue.rs index ba95ecaf0..51ffba6ab 100644 --- a/diskann/src/graph/glue.rs +++ b/diskann/src/graph/glue.rs @@ -326,6 +326,13 @@ where /// Default: no-op (non-pipelined providers never need to wait). fn wait_for_io(&mut self) {} + /// Whether this accessor uses asynchronous IO (pipelined). + /// When true, the search loop uses speculative submission (peek without + /// marking visited). When false, the classic visited-at-selection path. + fn is_pipelined(&self) -> bool { + false + } + /// Return the IDs of nodes expanded in the most recent `expand_available` call. /// /// The search loop uses this to mark speculatively submitted nodes as visited diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 7aff58115..41cebb022 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2124,7 +2124,7 @@ where && !accessor.terminate_early() { let has_pending = accessor.has_pending(); - let pipelining = has_pending || !submitted.is_empty(); + let pipelining = accessor.is_pipelined(); // When pipelining, cap total in-flight IOs at cur_beam_width // (like PipeSearch) to avoid over-committing the priority queue. @@ -2156,12 +2156,15 @@ where } } } else { - // Non-pipelined: use the original visited-at-selection path. + // Non-pipelined OR first iteration before pipelining starts. + // Use closest_notvisited but also track in submitted set so + // expand_available can find these nodes in queue_ordered. 
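+                // (closest_notvisited still marks nodes visited at selection time.)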
                while scratch.best.has_notvisited_node()
                     && scratch.beam_nodes.len() < submit_limit
                 {
                     let closest_node = scratch.best.closest_notvisited();
                     search_record.record(closest_node, scratch.hops, scratch.cmps);
+                    submitted.insert(closest_node.id);
                     scratch.beam_nodes.push(closest_node.id);
                 }
             }
@@ -2177,13 +2180,13 @@
             // Pass submitted-but-not-yet-expanded nodes in queue priority
             // order so the accessor expands the best available loaded node,
             // matching PipeSearch's "best unvisited in retset" strategy.
-            let queue_ordered: Vec<u32> = if pipelining {
+            let queue_ordered: Vec<u32> = if !submitted.is_empty() {
                 scratch.best.iter()
                     .filter(|n| submitted.contains(&n.id))
                     .map(|n| n.id)
                     .collect()
             } else {
-                Vec::new()
+                scratch.beam_nodes.clone()
             };
             let expanded = accessor
                 .expand_available(

From cfae61227ac9ae07c7d8fa3915df5261873455 Mon Sep 17 00:00:00 2001
From: Philip Adams
Date: Tue, 10 Feb 2026 23:27:44 -0800
Subject: [PATCH 18/46] Match PipeANN recall: full_retset reranking, send-1
 pacing, FIFO completion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four key changes that close the recall gap between UnifiedPipe and
PipeSearch:

1. Full retset reranking: post_process now uses ALL entries in
distance_cache (every expanded node's fp-distance), not just candidates
remaining in the priority queue. This matches PipeANN's full_retset
approach where every expanded node contributes to results regardless of
its PQ distance ranking.

2. Send-1 IO pacing: when pipelining, submit exactly 1 IO per iteration
(after initial burst), matching PipeANN's send_best_read_req(1). Each IO
decision benefits from the most recent neighbor information.

3. Reordered loop: pipelined path now does expand→insert→submit
(matching PipeANN's poll→calc→send order) instead of submit→expand.

4. FIFO completion processing: drain_completions processes one
completion per call in submission order using a pending_cqe_slots
buffer, matching PipeANN's while(front().finished()) pattern.

Results at L=10,BW=4 (4 threads):
  PipeSearch:  57.86% recall, 1757 QPS
  UnifiedPipe: 59.49% recall, 1643 QPS (was 52.3%)
  Gap: UnifiedPipe now +1.6% recall (was -5.6%)
---
 .../src/backend/disk_index/search.rs          |  10 +
 .../search/pipelined/pipelined_searcher.rs    |   3 +
 .../src/search/provider/pipelined_accessor.rs | 122 ++++++-----
 diskann/src/graph/index.rs                    | 193 ++++++++++--------
 4 files changed, 198 insertions(+), 130 deletions(-)

diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs
index 20c4b7439..dcca626cc 100644
--- a/diskann-benchmark/src/backend/disk_index/search.rs
+++ b/diskann-benchmark/src/backend/disk_index/search.rs
@@ -230,6 +230,16 @@ fn run_search_loop(

     let total_time = start.elapsed();

+    // Print per-query result IDs for trace comparison
+    if std::env::var("DISKANN_TRACE_EVENTS").is_ok() {
+        for qi in 0..num_queries {
+            let start_idx = qi * recall_at as usize;
+            let count = result_counts[qi] as usize;
+            let ids: Vec<u32> = result_ids[start_idx..start_idx + count.min(recall_at as usize)].to_vec();
+            eprintln!("RESULT q={} L={} ids={:?}", qi, l, ids);
+        }
+    }
+
     if has_any_search_failed.load(std::sync::atomic::Ordering::Acquire) {
         anyhow::bail!("One or more searches failed.
See logs for details."); } diff --git a/diskann-disk/src/search/pipelined/pipelined_searcher.rs b/diskann-disk/src/search/pipelined/pipelined_searcher.rs index 34a68803d..4c0691d7f 100644 --- a/diskann-disk/src/search/pipelined/pipelined_searcher.rs +++ b/diskann-disk/src/search/pipelined/pipelined_searcher.rs @@ -273,6 +273,9 @@ where if let Some(t) = &trace { t.print_profile_summary(); + if std::env::var("DISKANN_TRACE_EVENTS").is_ok() { + t.print_events(500); + } } let query_statistics = QueryStatistics { diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 05d0ca1dc..3fb676f74 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -197,6 +197,9 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { // IO state in_flight_ios: VecDeque, loaded_nodes: HashMap, + /// Buffered CQE slot IDs not yet matched to in-flight IOs. + /// Allows processing one completion at a time for strict pipelining. + pending_cqe_slots: Vec, next_slot_id: usize, max_slots: usize, /// Monotonically increasing submission rank for priority-ordered expansion. @@ -252,6 +255,7 @@ where node_cache, in_flight_ios: VecDeque::new(), loaded_nodes: HashMap::new(), + pending_cqe_slots: Vec::new(), next_slot_id: 0, max_slots: slots, next_rank: 0, @@ -326,41 +330,54 @@ where /// Poll completed IOs and move data from reader buffers into loaded_nodes. fn drain_completions(&mut self) -> ANNResult<()> { let mut trace = OptionalTrace(self.trace.as_mut()); + + // Poll new CQEs and buffer them trace.begin_phase(); - let completed_slots = if self.in_flight_ios.is_empty() { - Vec::new() - } else { - self.scratch.reader.poll_completions()? - }; + if !self.in_flight_ios.is_empty() { + let new_slots = self.scratch.reader.poll_completions()?; + self.pending_cqe_slots.extend(new_slots); + } trace.end_phase_io_poll(); - if !completed_slots.is_empty() { - let completed_set: std::collections::HashSet = - completed_slots.into_iter().collect(); - let mut remaining = VecDeque::new(); - while let Some(io) = self.in_flight_ios.pop_front() { - if completed_set.contains(&io.slot_id) { - trace.begin_phase(); - let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); - let parsed = parse_node( - sector_buf, - io.vertex_id, - self.num_nodes_per_sector, - self.node_len, - self.fp_vector_len, - )?; - trace.end_phase_parse_node(); - trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); - self.loaded_nodes.insert(io.vertex_id, LoadedNode { - fp_vector: parsed.fp_vector, - adjacency_list: parsed.adjacency_list, - rank: io.rank, - }); - } else { - remaining.push_back(io); - } + if self.pending_cqe_slots.is_empty() { + return Ok(()); + } + + // Process at most ONE completion in submission (FIFO) order, + // matching PipeANN's `while(front().finished())` which processes + // in-flight IOs strictly front-to-back. 
+ let completed_set: std::collections::HashSet = + self.pending_cqe_slots.iter().copied().collect(); + + // Scan in_flight_ios from front: process the first one whose CQE arrived + let mut found_idx = None; + for (i, io) in self.in_flight_ios.iter().enumerate() { + if completed_set.contains(&io.slot_id) { + found_idx = Some(i); + break; } - self.in_flight_ios = remaining; + } + + if let Some(idx) = found_idx { + let io = self.in_flight_ios.remove(idx).unwrap(); + self.pending_cqe_slots.retain(|&s| s != io.slot_id); + + trace.begin_phase(); + let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); + let parsed = parse_node( + sector_buf, + io.vertex_id, + self.num_nodes_per_sector, + self.node_len, + self.fp_vector_len, + )?; + trace.end_phase_parse_node(); + trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); + self.loaded_nodes.insert(io.vertex_id, LoadedNode { + fp_vector: parsed.fp_vector, + adjacency_list: parsed.adjacency_list, + rank: io.rank, + }); } Ok(()) } @@ -370,11 +387,13 @@ where let mut trace = OptionalTrace(self.trace.as_mut()); trace.begin_phase(); let completed_slots = self.scratch.reader.wait_completions()?; + self.pending_cqe_slots.extend(completed_slots); trace.end_phase_io_poll(); - if !completed_slots.is_empty() { - let completed_set: std::collections::HashSet = - completed_slots.into_iter().collect(); + // When blocking, drain ALL pending completions + let completed_set: std::collections::HashSet = + self.pending_cqe_slots.drain(..).collect(); + if !completed_set.is_empty() { let mut remaining = VecDeque::new(); while let Some(io) = self.in_flight_ios.pop_front() { if completed_set.contains(&io.slot_id) { @@ -595,11 +614,10 @@ where return Ok(0); } - // Expand the highest-priority loaded node according to the - // search loop's current queue ordering (passed via `ids`). - // If no queue-preferred node is loaded, return 0 — stale loaded - // nodes whose candidates have been superseded are abandoned, - // matching PipeSearch's behavior of not expanding evicted nodes. + // Expand the highest-priority loaded node, scanning `ids` in the + // caller's priority order (queue-ordered by PQ distance, then + // evicted-but-submitted nodes as fallback). This matches PipeANN's + // retset scan: best PQ-distance loaded node first. let mut best_vid: Option = None; for id in ids { if self.loaded_nodes.contains_key(&id) { @@ -660,7 +678,7 @@ where /// Returns true when there are in-flight IO operations. 
fn has_pending(&self) -> bool { - !self.in_flight_ios.is_empty() + !self.in_flight_ios.is_empty() || !self.pending_cqe_slots.is_empty() } fn inflight_count(&self) -> usize { @@ -713,6 +731,10 @@ where if let Some(trace) = self.trace.as_mut() { trace.finish(); trace.print_profile_summary(); + // Print events if DISKANN_TRACE_EVENTS is set + if std::env::var("DISKANN_TRACE_EVENTS").is_ok() { + trace.print_events(500); + } } } } @@ -786,22 +808,22 @@ where accessor: &mut PipelinedDiskAccessor<'_, Data>, _query: &[Data::VectorDataType], _computer: &DiskQueryComputer, - candidates: I, + _candidates: I, output: &mut B, ) -> Result where I: Iterator> + Send, B: SearchOutputBuffer<(u32, Data::AssociatedDataType)> + Send + ?Sized, { - let mut reranked: Vec<((u32, Data::AssociatedDataType), f32)> = candidates - .map(|n| n.id) - .filter(|id| (self.filter)(id)) - .filter_map(|id| { - accessor - .distance_cache - .get(&id) - .map(|&dist| ((id, Data::AssociatedDataType::default()), dist)) - }) + // Rerank using ALL expanded nodes' cached fp-distances, not just + // candidates from the priority queue. This matches PipeANN's + // full_retset approach: every expanded node contributes to results + // regardless of its PQ distance ranking. + let mut reranked: Vec<((u32, Data::AssociatedDataType), f32)> = accessor + .distance_cache + .iter() + .filter(|(id, _)| (self.filter)(id)) + .map(|(&id, &dist)| ((id, Data::AssociatedDataType::default()), dist)) .collect(); reranked.sort_unstable_by(|a, b| a.1.total_cmp(&b.1)); diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 41cebb022..75bfbf14a 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2126,104 +2126,137 @@ where let has_pending = accessor.has_pending(); let pipelining = accessor.is_pipelined(); - // When pipelining, cap total in-flight IOs at cur_beam_width - // (like PipeSearch) to avoid over-committing the priority queue. - // Within that cap, submit at the rate we expand (process-N-submit-N). - let submit_limit = if has_pending { - let inflight = accessor.inflight_count(); - if inflight >= cur_beam_width { - 0 + // PIPELINED ORDER (matching PipeANN's loop): + // 1. poll completions + expand one loaded node + // 2. insert neighbors into queue + // 3. submit one IO (with latest neighbor info) + // + // NON-PIPELINED ORDER (original beam search): + // 1. submit beam_width nodes + // 2. expand all (synchronous) + + if pipelining && has_pending { + // Step 1: Expand one loaded node (polls internally) + neighbors.clear(); + let queue_ordered: Vec = if !submitted.is_empty() { + scratch.best.iter() + .filter(|n| submitted.contains(&n.id)) + .map(|n| n.id) + .collect() } else { - last_expanded.max(1).min(cur_beam_width - inflight) + Vec::new() + }; + let expanded = accessor + .expand_available( + queue_ordered.iter().copied(), + computer, + glue::NotInMut::new(&mut scratch.visited), + |distance, id| neighbors.push(Neighbor::new(id, distance)), + ) + .await?; + last_expanded = expanded; + + for &id in accessor.last_expanded_ids() { + scratch.best.mark_visited_by_id(&id); + submitted.remove(&id); } - } else { - cur_beam_width - }; - scratch.beam_nodes.clear(); - if pipelining { - // Speculative submission: peek without marking visited. - // Nodes are marked visited only after actual expansion. 
- while scratch.beam_nodes.len() < submit_limit { + // Step 2: Insert neighbors (updates queue before IO decision) + neighbors + .iter() + .for_each(|neighbor| scratch.best.insert(*neighbor)); + scratch.cmps += neighbors.len() as u32; + scratch.hops += expanded as u32; + + // Adaptive beam width + if search_params.adaptive_beam_width && expanded > 0 { + cur_beam_width = (cur_beam_width + 1).max(4).min(beam_width); + } + + // Step 3: Submit one IO (with updated queue) + let inflight = accessor.inflight_count(); + if inflight < cur_beam_width { + scratch.beam_nodes.clear(); if let Some(closest_node) = scratch.best.peek_best_unsubmitted(&submitted) { search_record.record(closest_node, scratch.hops, scratch.cmps); submitted.insert(closest_node.id); scratch.beam_nodes.push(closest_node.id); - } else { - break; } + accessor.submit_expand(scratch.beam_nodes.iter().copied()); } - } else { - // Non-pipelined OR first iteration before pipelining starts. - // Use closest_notvisited but also track in submitted set so - // expand_available can find these nodes in queue_ordered. - while scratch.best.has_notvisited_node() - && scratch.beam_nodes.len() < submit_limit - { - let closest_node = scratch.best.closest_notvisited(); - search_record.record(closest_node, scratch.hops, scratch.cmps); - submitted.insert(closest_node.id); - scratch.beam_nodes.push(closest_node.id); - } - } - - // Submit to expansion queue (no-op for non-pipelined) - accessor.submit_expand(scratch.beam_nodes.iter().copied()); - // Expand all available loaded nodes. The pipeline depth is - // controlled by the inflight cap on submissions, not by - // limiting expansions. For non-pipelined, this expands the - // full beam as before. - neighbors.clear(); - // Pass submitted-but-not-yet-expanded nodes in queue priority - // order so the accessor expands the best available loaded node, - // matching PipeSearch's "best unvisited in retset" strategy. - let queue_ordered: Vec = if !submitted.is_empty() { - scratch.best.iter() - .filter(|n| submitted.contains(&n.id)) - .map(|n| n.id) - .collect() + // Block if truly idle + if expanded == 0 && has_pending { + let inflight = accessor.inflight_count(); + if inflight > 0 { + accessor.wait_for_io(); + } + } } else { - scratch.beam_nodes.clone() - }; - let expanded = accessor - .expand_available( - queue_ordered.iter().copied(), - computer, - glue::NotInMut::new(&mut scratch.visited), - |distance, id| neighbors.push(Neighbor::new(id, distance)), - ) - .await?; - last_expanded = expanded; + // Non-pipelined path OR initial burst (has_pending=false) + let submit_limit = if has_pending { 0 } else { cur_beam_width }; + + scratch.beam_nodes.clear(); + if pipelining { + while scratch.beam_nodes.len() < submit_limit { + if let Some(closest_node) = + scratch.best.peek_best_unsubmitted(&submitted) + { + search_record.record(closest_node, scratch.hops, scratch.cmps); + submitted.insert(closest_node.id); + scratch.beam_nodes.push(closest_node.id); + } else { + break; + } + } + } else { + while scratch.best.has_notvisited_node() + && scratch.beam_nodes.len() < submit_limit + { + let closest_node = scratch.best.closest_notvisited(); + search_record.record(closest_node, scratch.hops, scratch.cmps); + submitted.insert(closest_node.id); + scratch.beam_nodes.push(closest_node.id); + } + } - // Mark expanded nodes as visited and remove from submitted set. 
- for &id in accessor.last_expanded_ids() { - scratch.best.mark_visited_by_id(&id); - submitted.remove(&id); - } + accessor.submit_expand(scratch.beam_nodes.iter().copied()); - // When pipelining and nothing was submitted or expanded, - // block until at least one IO completes. This only fires - // when the loop truly has nothing to do (inflight cap reached - // AND no completions available), replacing ~400 spin iterations - // with a single kernel wait + eager drain of all ready CQEs. - if expanded == 0 && scratch.beam_nodes.is_empty() && has_pending { - accessor.wait_for_io(); - } + neighbors.clear(); + let queue_ordered: Vec = if !submitted.is_empty() { + scratch.best.iter() + .filter(|n| submitted.contains(&n.id)) + .map(|n| n.id) + .collect() + } else { + scratch.beam_nodes.clone() + }; + let expanded = accessor + .expand_available( + queue_ordered.iter().copied(), + computer, + glue::NotInMut::new(&mut scratch.visited), + |distance, id| neighbors.push(Neighbor::new(id, distance)), + ) + .await?; + last_expanded = expanded; - neighbors - .iter() - .for_each(|neighbor| scratch.best.insert(*neighbor)); + for &id in accessor.last_expanded_ids() { + scratch.best.mark_visited_by_id(&id); + submitted.remove(&id); + } - scratch.cmps += neighbors.len() as u32; - scratch.hops += expanded as u32; + neighbors + .iter() + .for_each(|neighbor| scratch.best.insert(*neighbor)); + scratch.cmps += neighbors.len() as u32; + scratch.hops += expanded as u32; - // Adaptive beam width - if search_params.adaptive_beam_width && expanded > 0 { - // All expanded nodes are useful by definition - cur_beam_width = (cur_beam_width + 1).max(4).min(beam_width); + if search_params.adaptive_beam_width && expanded > 0 { + cur_beam_width = (cur_beam_width + 1).max(4).min(beam_width); + } } // Relaxed monotonicity: detect convergence and extend search From 8ad3f017c704daf4a10d1a229842c713e6f8608c Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Tue, 10 Feb 2026 23:53:51 -0800 Subject: [PATCH 19/46] Optimize pipelined search: batch completions, rank-based expansion, IO timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Performance optimizations: - Batch drain_completions: process ALL CQEs per poll, not one-at-a-time. Removes pending_cqe_slots buffer and per-drain HashSet allocation. - Rank-based expansion fallback: when the caller passes no priority hints, expand the loaded node with lowest rank (earliest submitted = best PQ distance at submission time). Eliminates per-iteration queue_ordered Vec allocation in the pipelined search loop. - Simplified non-pipelined expand: pass beam_nodes directly instead of rebuilding queue_ordered Vec. 
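In sketch form, the rank-based fallback is just a min-by-rank scan over the
loaded map (simplified from the accessor diff below; loaded_nodes is the
accessor's HashMap<u32, LoadedNode>):

    // No priority hints from the caller: pick the loaded node that was
    // submitted earliest. Lowest rank == best PQ distance at submission time.
    let fallback: Option<u32> = loaded_nodes
        .iter()
        .min_by_key(|(_, node)| node.rank)
        .map(|(&id, _)| id);
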
Statistics fix: - Track io_time and cpu_time in PipelinedDiskAccessor (accumulated Durations) - Report io_us and cpu_us through PipelinedIoStats atomics - search_pipelined now populates io_time_us and cpu_time_us in QueryStatistics Results (4 threads, BW=4, sift1m, UnifiedPipe): L=10: QPS 1637→1789 (+9%), recall 59.6%→61.8%, p999 4106→3671us L=50: QPS 829→ 896 (+8%), p999 24151→6048us QPS gap vs PipeSearch: -10%→-4% at L=10 --- .../src/search/provider/pipelined_accessor.rs | 176 ++++++++++-------- diskann/src/graph/index.rs | 23 +-- 2 files changed, 103 insertions(+), 96 deletions(-) diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 3fb676f74..1d4104b87 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -197,9 +197,6 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { // IO state in_flight_ios: VecDeque, loaded_nodes: HashMap, - /// Buffered CQE slot IDs not yet matched to in-flight IOs. - /// Allows processing one completion at a time for strict pipelining. - pending_cqe_slots: Vec, next_slot_id: usize, max_slots: usize, /// Monotonically increasing submission rank for priority-ordered expansion. @@ -213,6 +210,10 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { // IO statistics io_count: u32, cache_hits: u32, + /// Accumulated IO time (submission + polling + waiting) + io_time: std::time::Duration, + /// Accumulated CPU time (fp distance + PQ distance + node parsing) + cpu_time: std::time::Duration, // Shared stats written on drop so caller can read them after search shared_io_stats: Arc, @@ -255,7 +256,6 @@ where node_cache, in_flight_ios: VecDeque::new(), loaded_nodes: HashMap::new(), - pending_cqe_slots: Vec::new(), next_slot_id: 0, max_slots: slots, next_rank: 0, @@ -263,6 +263,8 @@ where distance_cache: HashMap::new(), io_count: 0, cache_hits: 0, + io_time: std::time::Duration::ZERO, + cpu_time: std::time::Duration::ZERO, shared_io_stats, trace: None, }) @@ -329,96 +331,90 @@ where /// Poll completed IOs and move data from reader buffers into loaded_nodes. fn drain_completions(&mut self) -> ANNResult<()> { + if self.in_flight_ios.is_empty() { + return Ok(()); + } + let mut trace = OptionalTrace(self.trace.as_mut()); - // Poll new CQEs and buffer them + let io_start = Instant::now(); trace.begin_phase(); - if !self.in_flight_ios.is_empty() { - let new_slots = self.scratch.reader.poll_completions()?; - self.pending_cqe_slots.extend(new_slots); - } + let completed_slots = self.scratch.reader.poll_completions()?; trace.end_phase_io_poll(); + self.io_time += io_start.elapsed(); - if self.pending_cqe_slots.is_empty() { + if completed_slots.is_empty() { return Ok(()); } - // Process at most ONE completion in submission (FIFO) order, - // matching PipeANN's `while(front().finished())` which processes - // in-flight IOs strictly front-to-back. 
let completed_set: std::collections::HashSet = - self.pending_cqe_slots.iter().copied().collect(); + completed_slots.into_iter().collect(); - // Scan in_flight_ios from front: process the first one whose CQE arrived - let mut found_idx = None; - for (i, io) in self.in_flight_ios.iter().enumerate() { + let mut remaining = VecDeque::new(); + while let Some(io) = self.in_flight_ios.pop_front() { if completed_set.contains(&io.slot_id) { - found_idx = Some(i); - break; + trace.begin_phase(); + let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); + let parsed = parse_node( + sector_buf, + io.vertex_id, + self.num_nodes_per_sector, + self.node_len, + self.fp_vector_len, + )?; + trace.end_phase_parse_node(); + trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); + self.loaded_nodes.insert(io.vertex_id, LoadedNode { + fp_vector: parsed.fp_vector, + adjacency_list: parsed.adjacency_list, + rank: io.rank, + }); + } else { + remaining.push_back(io); } } - - if let Some(idx) = found_idx { - let io = self.in_flight_ios.remove(idx).unwrap(); - self.pending_cqe_slots.retain(|&s| s != io.slot_id); - - trace.begin_phase(); - let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); - let parsed = parse_node( - sector_buf, - io.vertex_id, - self.num_nodes_per_sector, - self.node_len, - self.fp_vector_len, - )?; - trace.end_phase_parse_node(); - trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); - self.loaded_nodes.insert(io.vertex_id, LoadedNode { - fp_vector: parsed.fp_vector, - adjacency_list: parsed.adjacency_list, - rank: io.rank, - }); - } + self.in_flight_ios = remaining; Ok(()) } /// Block until at least one IO completes, then eagerly drain all available. - /// Reuses the same completion → loaded_nodes logic as drain_completions. 
fn wait_and_drain(&mut self) -> ANNResult<()> { let mut trace = OptionalTrace(self.trace.as_mut()); + let io_start = Instant::now(); trace.begin_phase(); let completed_slots = self.scratch.reader.wait_completions()?; - self.pending_cqe_slots.extend(completed_slots); trace.end_phase_io_poll(); + self.io_time += io_start.elapsed(); + + if completed_slots.is_empty() { + return Ok(()); + } - // When blocking, drain ALL pending completions let completed_set: std::collections::HashSet = - self.pending_cqe_slots.drain(..).collect(); - if !completed_set.is_empty() { - let mut remaining = VecDeque::new(); - while let Some(io) = self.in_flight_ios.pop_front() { - if completed_set.contains(&io.slot_id) { - trace.begin_phase(); - let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); - let parsed = parse_node( - sector_buf, - io.vertex_id, - self.num_nodes_per_sector, - self.node_len, - self.fp_vector_len, - )?; - trace.end_phase_parse_node(); - trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); - self.loaded_nodes.insert(io.vertex_id, LoadedNode { - fp_vector: parsed.fp_vector, - adjacency_list: parsed.adjacency_list, - rank: io.rank, - }); - } else { - remaining.push_back(io); - } + completed_slots.into_iter().collect(); + let mut remaining = VecDeque::new(); + while let Some(io) = self.in_flight_ios.pop_front() { + if completed_set.contains(&io.slot_id) { + trace.begin_phase(); + let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); + let parsed = parse_node( + sector_buf, + io.vertex_id, + self.num_nodes_per_sector, + self.node_len, + self.fp_vector_len, + )?; + trace.end_phase_parse_node(); + trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); + self.loaded_nodes.insert(io.vertex_id, LoadedNode { + fp_vector: parsed.fp_vector, + adjacency_list: parsed.adjacency_list, + rank: io.rank, + }); + } else { + remaining.push_back(io); } - self.in_flight_ios = remaining; } + self.in_flight_ios = remaining; Ok(()) } } @@ -540,6 +536,7 @@ where /// skipping disk IO entirely. fn submit_expand(&mut self, ids: impl Iterator + Send) { let mut trace = OptionalTrace(self.trace.as_mut()); + let io_start = Instant::now(); trace.begin_phase(); for id in ids { if self.loaded_nodes.contains_key(&id) { @@ -590,9 +587,15 @@ where } } trace.end_phase_io_submit(); + self.io_time += io_start.elapsed(); } - /// Poll for completed reads and expand all loaded nodes. + /// Poll for completed reads and expand the best loaded node. + /// + /// Uses two selection strategies: + /// 1. If `ids` provides candidates, pick the first loaded match (queue order) + /// 2. Otherwise, pick the loaded node with the lowest submission rank + /// (earliest submitted = best PQ distance at submission time) fn expand_available( &mut self, ids: impl Iterator + Send, @@ -614,10 +617,7 @@ where return Ok(0); } - // Expand the highest-priority loaded node, scanning `ids` in the - // caller's priority order (queue-ordered by PQ distance, then - // evicted-but-submitted nodes as fallback). This matches PipeANN's - // retset scan: best PQ-distance loaded node first. 
+ // Try caller's priority order first let mut best_vid: Option = None; for id in ids { if self.loaded_nodes.contains_key(&id) { @@ -626,6 +626,15 @@ where } } + // Fallback: pick loaded node with lowest rank (best PQ at submission) + if best_vid.is_none() { + best_vid = self + .loaded_nodes + .iter() + .min_by_key(|(_, node)| node.rank) + .map(|(&id, _)| id); + } + let vid = match best_vid { Some(id) => id, None => return Ok(0), @@ -634,14 +643,14 @@ where self.expanded_ids.push(vid); // Compute full-precision distance and cache it for post-processing - let fp_start = Instant::now(); + let cpu_start = Instant::now(); let fp_vec: &[Data::VectorDataType] = bytemuck::cast_slice(&node.fp_vector); let fp_dist = self .provider .distance_comparer .evaluate_similarity(self.query, fp_vec); if let Some(t) = self.trace.as_mut() { - t.profile.fp_distance_us += fp_start.elapsed().as_micros() as u64; + t.profile.fp_distance_us += cpu_start.elapsed().as_micros() as u64; } self.distance_cache.insert(vid, fp_dist); @@ -661,6 +670,7 @@ where t.profile.pq_distance_us += pq_start.elapsed().as_micros() as u64; } } + self.cpu_time += cpu_start.elapsed(); if let Some(t) = self.trace.as_mut() { t.record_expand(); @@ -678,7 +688,7 @@ where /// Returns true when there are in-flight IO operations. fn has_pending(&self) -> bool { - !self.in_flight_ios.is_empty() || !self.pending_cqe_slots.is_empty() + !self.in_flight_ios.is_empty() } fn inflight_count(&self) -> usize { @@ -726,6 +736,12 @@ where self.shared_io_stats .cache_hits .fetch_add(self.cache_hits, Ordering::Relaxed); + self.shared_io_stats + .io_us + .fetch_add(self.io_time.as_micros() as u64, Ordering::Relaxed); + self.shared_io_stats + .cpu_us + .fetch_add(self.cpu_time.as_micros() as u64, Ordering::Relaxed); // Print trace if enabled if let Some(trace) = self.trace.as_mut() { @@ -768,6 +784,8 @@ pub struct PipelinedConfig> { pub struct PipelinedIoStats { pub io_count: AtomicU32, pub cache_hits: AtomicU32, + pub io_us: std::sync::atomic::AtomicU64, + pub cpu_us: std::sync::atomic::AtomicU64, } impl Default for PipelinedIoStats { @@ -775,6 +793,8 @@ impl Default for PipelinedIoStats { Self { io_count: AtomicU32::new(0), cache_hits: AtomicU32::new(0), + io_us: std::sync::atomic::AtomicU64::new(0), + cpu_us: std::sync::atomic::AtomicU64::new(0), } } } @@ -959,6 +979,8 @@ where query_stats.total_io_operations = io_stats.io_count.load(Ordering::Relaxed); query_stats.total_vertices_loaded = io_stats.io_count.load(Ordering::Relaxed) + io_stats.cache_hits.load(Ordering::Relaxed); + query_stats.io_time_us = io_stats.io_us.load(Ordering::Relaxed) as u128; + query_stats.cpu_time_us = io_stats.cpu_us.load(Ordering::Relaxed) as u128; let mut search_result = SearchResult { results: Vec::with_capacity(return_list_size as usize), diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 75bfbf14a..1e71596da 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2136,19 +2136,12 @@ where // 2. expand all (synchronous) if pipelining && has_pending { - // Step 1: Expand one loaded node (polls internally) + // Step 1: Expand one loaded node (polls internally). + // Pass empty iterator — the accessor picks by rank. 
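+            // (rank = submission order, a proxy for PQ distance at submit time)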
neighbors.clear(); - let queue_ordered: Vec = if !submitted.is_empty() { - scratch.best.iter() - .filter(|n| submitted.contains(&n.id)) - .map(|n| n.id) - .collect() - } else { - Vec::new() - }; let expanded = accessor .expand_available( - queue_ordered.iter().copied(), + std::iter::empty(), computer, glue::NotInMut::new(&mut scratch.visited), |distance, id| neighbors.push(Neighbor::new(id, distance)), @@ -2225,17 +2218,9 @@ where accessor.submit_expand(scratch.beam_nodes.iter().copied()); neighbors.clear(); - let queue_ordered: Vec = if !submitted.is_empty() { - scratch.best.iter() - .filter(|n| submitted.contains(&n.id)) - .map(|n| n.id) - .collect() - } else { - scratch.beam_nodes.clone() - }; let expanded = accessor .expand_available( - queue_ordered.iter().copied(), + scratch.beam_nodes.iter().copied(), computer, glue::NotInMut::new(&mut scratch.visited), |distance, id| neighbors.push(Neighbor::new(id, distance)), From 4c499c1b0e744083deea2b190c708649f072e331 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 00:04:15 -0800 Subject: [PATCH 20/46] Clean up: consolidate PipeSearch into unified implementation - Merge old PipeSearch and UnifiedPipeSearch enum variants into a single PipeSearch variant that uses the queue-based ExpandBeam implementation. The old standalone PipelinedSearcher is kept (deprecated) for existing tests but removed from the benchmark SearchMode enum. - Remove DISKANN_TRACE_EVENTS debug output and per-query RESULT printing from the benchmark. DISKANN_TRACE=1 profile summaries are retained. - Update pipe-search.json example config for the new API. - serde(alias = "UnifiedPipeSearch") preserves backward-compat with old configs. --- diskann-benchmark/example/pipe-search.json | 31 +----- .../src/backend/disk_index/search.rs | 103 ++---------------- diskann-benchmark/src/inputs/disk.rs | 44 +------- .../src/search/provider/pipelined_accessor.rs | 6 +- diskann-disk/src/search/search_trace.rs | 2 +- 5 files changed, 16 insertions(+), 170 deletions(-) diff --git a/diskann-benchmark/example/pipe-search.json b/diskann-benchmark/example/pipe-search.json index bd525e989..1a91353aa 100644 --- a/diskann-benchmark/example/pipe-search.json +++ b/diskann-benchmark/example/pipe-search.json @@ -46,9 +46,7 @@ "distance": "squared_l2", "vector_filters_file": null, "search_mode": { - "mode": "PipeSearch", - "initial_beam_width": 4, - "relaxed_monotonicity_l": null + "mode": "PipeSearch" } } } @@ -73,37 +71,10 @@ "vector_filters_file": null, "search_mode": { "mode": "PipeSearch", - "initial_beam_width": 2, "relaxed_monotonicity_l": 50 } } } - }, - { - "type": "disk-index", - "content": { - "source": { - "disk-index-source": "Load", - "data_type": "float32", - "load_path": "test_data/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search" - }, - "search_phase": { - "queries": "disk_index_sample_query_10pts.fbin", - "groundtruth": "disk_index_10pts_idx_uint32_truth_search_res.bin", - "search_list": [10, 20, 40, 80], - "beam_width": 4, - "recall_at": 10, - "num_threads": 1, - "is_flat_search": false, - "distance": "squared_l2", - "vector_filters_file": null, - "search_mode": { - "mode": "PipeSearch", - "initial_beam_width": 4, - "sqpoll_idle_ms": 1000 - } - } - } } ] } diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index dcca626cc..b0ef4a7f8 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -22,7 +22,7 @@ use 
diskann_disk::{ utils::{instrumentation::PerfLogger, statistics, AlignedFileReaderFactory, QueryStatistics}, }; #[cfg(target_os = "linux")] -use diskann_disk::search::pipelined::{PipelinedSearcher, PipelinedReaderConfig}; +use diskann_disk::search::pipelined::PipelinedReaderConfig; #[cfg(target_os = "linux")] use diskann_disk::search::provider::pipelined_accessor::PipelinedConfig; use diskann_providers::storage::StorageReadProvider; @@ -230,16 +230,6 @@ fn run_search_loop( let total_time = start.elapsed(); - // Print per-query result IDs for trace comparison - if std::env::var("DISKANN_TRACE_EVENTS").is_ok() { - for qi in 0..num_queries { - let start_idx = qi * recall_at as usize; - let count = result_counts[qi] as usize; - let ids: Vec = result_ids[start_idx..start_idx + count.min(recall_at as usize)].to_vec(); - eprintln!("RESULT q={} L={} ids={:?}", qi, l, ids); - } - } - if has_any_search_failed.load(std::sync::atomic::Ordering::Acquire) { anyhow::bail!("One or more searches failed. See logs for details."); } @@ -390,88 +380,9 @@ where }, )?; } - // PipeANN pipelined search — for read-only search on completed (static) indices only. - // Searcher is created once; internal ObjectPool handles per-thread scratch allocation. - // Build's internal search always uses BeamSearch above. - SearchMode::PipeSearch { - initial_beam_width, - relaxed_monotonicity_l, - sqpoll_idle_ms, - } => { - #[cfg(target_os = "linux")] - { - let graph_header = vertex_provider_factory.get_header()?; - let pq_data = index_reader.get_pq_data(); - let metric = search_params.distance.into(); - let initial_beam_width = *initial_beam_width; - let relaxed_monotonicity_l = *relaxed_monotonicity_l; - - let reader_config = PipelinedReaderConfig { - sqpoll_idle_ms: *sqpoll_idle_ms, - }; - - // Create searcher once — pool handles per-thread scratch allocation - let pipe_searcher = Arc::new(PipelinedSearcher::>::new( - graph_header.clone(), - pq_data.clone(), - metric, - initial_beam_width, - relaxed_monotonicity_l, - disk_index_path.clone(), - reader_config, - )?); - - logger.log_checkpoint("index_loaded"); - - search_results_per_l = run_search_loop( - &search_params.search_list, - search_params.recall_at, - search_params.beam_width, - num_queries, - "pipesearch", - &has_any_search_failed, - >_context, - |l, statistics_vec, result_counts, result_ids, result_dists| { - let pipe_searcher = pipe_searcher.clone(); - - let zipped = queries - .par_row_iter() - .zip(result_ids.par_chunks_mut(search_params.recall_at as usize)) - .zip(result_dists.par_chunks_mut(search_params.recall_at as usize)) - .zip(statistics_vec.par_iter_mut()) - .zip(result_counts.par_iter_mut()); - - zipped.for_each_in_pool( - &pool, - |((((q, id_chunk), dist_chunk), stats), rc)| { - write_query_result( - pipe_searcher.search( - q, - search_params.recall_at, - l, - search_params.beam_width, - None, - ), - search_params.recall_at as usize, - stats, - rc, - id_chunk, - dist_chunk, - &has_any_search_failed, - "PipeSearch", - ); - }, - ); - }, - )?; - } - #[cfg(not(target_os = "linux"))] - { - let _ = (initial_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms); - anyhow::bail!("PipeSearch is only supported on Linux"); - } - } - SearchMode::UnifiedPipeSearch { adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms } => { + // Pipelined search — for read-only search on completed (static) indices only. + // Uses io_uring for IO/compute overlap through the generic search loop. 
+ SearchMode::PipeSearch { adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms } => { #[cfg(target_os = "linux")] { use diskann_disk::data_model::Cache; @@ -545,7 +456,7 @@ where search_params.recall_at, search_params.beam_width, num_queries, - "unified_pipesearch", + "pipesearch", &has_any_search_failed, >_context, |l, statistics_vec, result_counts, result_ids, result_dists| { @@ -573,7 +484,7 @@ where id_chunk, dist_chunk, &has_any_search_failed, - "UnifiedPipeSearch", + "PipeSearch", ); }, ); @@ -583,7 +494,7 @@ where #[cfg(not(target_os = "linux"))] { let _ = (adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms); - anyhow::bail!("UnifiedPipeSearch is only supported on Linux"); + anyhow::bail!("PipeSearch is only supported on Linux"); } } } diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs index 373ca003b..d481297e4 100644 --- a/diskann-benchmark/src/inputs/disk.rs +++ b/diskann-benchmark/src/inputs/disk.rs @@ -78,19 +78,10 @@ pub(crate) enum SearchMode { /// Standard beam search (default, current behavior). #[default] BeamSearch, - /// PipeANN pipelined search with IO/compute overlap. + /// Pipelined search through the generic search loop (queue-based ExpandBeam). + /// Overlaps IO and compute using io_uring on Linux. + #[serde(alias = "UnifiedPipeSearch")] PipeSearch { - /// Initial beam width before adaptive adjustment (default: 4). - #[serde(default = "default_initial_beam_width")] - initial_beam_width: usize, - /// Optional relaxed monotonicity parameter for early termination. - relaxed_monotonicity_l: Option, - /// Enable kernel-side SQ polling (ms idle timeout). None = disabled. - #[serde(default)] - sqpoll_idle_ms: Option, - }, - /// Unified pipelined search through the generic search loop (queue-based ExpandBeam). - UnifiedPipeSearch { /// Start with a smaller beam and grow adaptively. Defaults to true. #[serde(default = "default_true")] adaptive_beam_width: bool, @@ -107,22 +98,8 @@ impl fmt::Display for SearchMode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { SearchMode::BeamSearch => write!(f, "BeamSearch"), - SearchMode::PipeSearch { - initial_beam_width, - relaxed_monotonicity_l, - sqpoll_idle_ms, - } => { - write!(f, "PipeSearch(bw={}", initial_beam_width)?; - if let Some(rm) = relaxed_monotonicity_l { - write!(f, ",rm={}", rm)?; - } - if let Some(sq) = sqpoll_idle_ms { - write!(f, ",sqpoll={}ms", sq)?; - } - write!(f, ")") - } - SearchMode::UnifiedPipeSearch { adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms } => { - write!(f, "UnifiedPipeSearch")?; + SearchMode::PipeSearch { adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms } => { + write!(f, "PipeSearch")?; let has_abw = *adaptive_beam_width; let has_rm = relaxed_monotonicity_l.is_some(); let has_sq = sqpoll_idle_ms.is_some(); @@ -150,10 +127,6 @@ impl fmt::Display for SearchMode { } } -fn default_initial_beam_width() -> usize { - 4 -} - fn default_true() -> bool { true } @@ -326,12 +299,7 @@ impl CheckDeserialization for DiskSearchPhase { } match &self.search_mode { SearchMode::BeamSearch => {} - SearchMode::PipeSearch { initial_beam_width, .. } => { - if *initial_beam_width == 0 { - anyhow::bail!("initial_beam_width must be positive"); - } - } - SearchMode::UnifiedPipeSearch { .. } => {} + SearchMode::PipeSearch { .. 
} => {} } Ok(()) } diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 1d4104b87..8def4ef31 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -743,14 +743,10 @@ where .cpu_us .fetch_add(self.cpu_time.as_micros() as u64, Ordering::Relaxed); - // Print trace if enabled + // Print trace profile if enabled (controlled by DISKANN_TRACE=1) if let Some(trace) = self.trace.as_mut() { trace.finish(); trace.print_profile_summary(); - // Print events if DISKANN_TRACE_EVENTS is set - if std::env::var("DISKANN_TRACE_EVENTS").is_ok() { - trace.print_events(500); - } } } } diff --git a/diskann-disk/src/search/search_trace.rs b/diskann-disk/src/search/search_trace.rs index 16b0d3f5d..ebcb253b2 100644 --- a/diskann-disk/src/search/search_trace.rs +++ b/diskann-disk/src/search/search_trace.rs @@ -14,7 +14,7 @@ //! Tracing is opt-in: create a `SearchTrace` and pass it to the search function. //! When disabled (None), all operations are zero-cost. -use std::time::{Duration, Instant}; +use std::time::Instant; /// A single event in the search trace. #[derive(Debug, Clone)] From 24a2d329aa298ff8ef3fa5046511f128cff8f4b8 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 00:10:16 -0800 Subject: [PATCH 21/46] Add SIFT1M benchmark script for ablation testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds diskann-benchmark/scripts/sift1m_benchmark.sh that: 1. Downloads SIFT1M from the texmex corpus (fvecs/ivecs format) 2. Converts to DiskANN binary format (fbin) using numpy 3. Builds a disk index (R=64, L=100, PQ_16) 4. Runs BeamSearch vs PipeSearch ablation at L=10,20,50,100 Supports --skip-download, --skip-build, --skip-index flags for incremental runs on the same machine. Configurable via --threads and --beam-width. Ablation results (SIFT1M, 4 threads, BW=4): L=10: Recall 52.5%→62.0%, QPS +20%, p95 -15%, p999 -14% L=50: Recall 89.9%→90.6%, QPS +23%, p95 -19%, p999 -38% L=100: Recall 96.6%→96.7%, QPS +27%, p95 -21%, p999 -48% --- .gitignore | 4 +- diskann-benchmark/scripts/sift1m_benchmark.sh | 274 ++++++++++++++++++ 2 files changed, 277 insertions(+), 1 deletion(-) create mode 100755 diskann-benchmark/scripts/sift1m_benchmark.sh diff --git a/.gitignore b/.gitignore index 948e2e563..8df99897b 100644 --- a/.gitignore +++ b/.gitignore @@ -333,4 +333,6 @@ target/ *.info # ignore VS Code local history -.history/ \ No newline at end of file +.history/ +# Benchmark data (downloaded by scripts/sift1m_benchmark.sh) +benchmark_data/ diff --git a/diskann-benchmark/scripts/sift1m_benchmark.sh b/diskann-benchmark/scripts/sift1m_benchmark.sh new file mode 100755 index 000000000..30401edb8 --- /dev/null +++ b/diskann-benchmark/scripts/sift1m_benchmark.sh @@ -0,0 +1,274 @@ +#!/usr/bin/env bash +# SIFT1M Pipelined Search Benchmark +# +# Downloads SIFT1M dataset, builds a disk index, and runs an ablation +# benchmark comparing BeamSearch vs PipeSearch (io_uring pipelining). +# +# Prerequisites: +# - Linux (PipeSearch requires io_uring) +# - Rust toolchain (cargo) +# - curl, tar, python3 with numpy +# - ~2GB free disk space for data + index +# +# Usage: +# ./diskann-benchmark/scripts/sift1m_benchmark.sh [--data-dir DIR] [--skip-download] [--skip-build] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" + +# Defaults +DATA_DIR="${DATA_DIR:-$REPO_ROOT/benchmark_data/sift1m}" +NUM_THREADS="${NUM_THREADS:-4}" +BEAM_WIDTH="${BEAM_WIDTH:-4}" +SKIP_DOWNLOAD=false +SKIP_BUILD=false +SKIP_INDEX=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --data-dir) DATA_DIR="$2"; shift 2 ;; + --skip-download) SKIP_DOWNLOAD=true; shift ;; + --skip-build) SKIP_BUILD=true; shift ;; + --skip-index) SKIP_INDEX=true; shift ;; + --threads) NUM_THREADS="$2"; shift 2 ;; + --beam-width) BEAM_WIDTH="$2"; shift 2 ;; + -h|--help) + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --data-dir DIR Data directory (default: \$REPO_ROOT/benchmark_data/sift1m)" + echo " --skip-download Skip downloading SIFT1M (use existing data)" + echo " --skip-build Skip building the benchmark binary" + echo " --skip-index Skip building the disk index (use existing index)" + echo " --threads N Number of search threads (default: 4)" + echo " --beam-width N Beam width / pipeline width (default: 4)" + exit 0 + ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +BIN_DIR="$DATA_DIR/bin" +INDEX_DIR="$DATA_DIR/index" +INDEX_PREFIX="$INDEX_DIR/sift1m_R64_L100" +CONFIG_FILE="$DATA_DIR/benchmark_config.json" +OUTPUT_FILE="$DATA_DIR/benchmark_results.json" + +echo "=== SIFT1M Pipelined Search Benchmark ===" +echo "Data directory: $DATA_DIR" +echo "Threads: $NUM_THREADS, Beam width: $BEAM_WIDTH" +echo "" + +# ------------------------------------------------------------------- +# Step 1: Download SIFT1M +# ------------------------------------------------------------------- +if [ "$SKIP_DOWNLOAD" = false ]; then + echo "--- Step 1: Downloading SIFT1M dataset ---" + mkdir -p "$BIN_DIR" + + SIFT_URL="ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz" + SIFT_TAR="$DATA_DIR/sift.tar.gz" + + if [ ! -f "$BIN_DIR/sift_base.fbin" ]; then + if [ ! -f "$SIFT_TAR" ]; then + echo "Downloading from $SIFT_URL ..." + curl -L -o "$SIFT_TAR" "$SIFT_URL" + fi + + echo "Extracting..." + EXTRACT_DIR="$DATA_DIR/extract" + mkdir -p "$EXTRACT_DIR" + tar xzf "$SIFT_TAR" -C "$EXTRACT_DIR" + + echo "Converting .bvecs/.fvecs to .fbin format..." 
+ python3 - "$EXTRACT_DIR/sift" "$BIN_DIR" << 'PYEOF' +import sys, struct, numpy as np +from pathlib import Path + +src_dir = Path(sys.argv[1]) +dst_dir = Path(sys.argv[2]) + +def read_fvecs(path): + """Read .fvecs format: [dim(int32), vec(float32*dim)] per row.""" + data = np.fromfile(path, dtype=np.float32) + dim = int(data[0].view(np.int32)) + return data.reshape(-1, dim + 1)[:, 1:] + +def read_ivecs(path): + """Read .ivecs format: [dim(int32), vec(int32*dim)] per row.""" + data = np.fromfile(path, dtype=np.int32) + dim = data[0] + return data.reshape(-1, dim + 1)[:, 1:] + +def write_fbin(path, data): + """Write DiskANN .fbin format: [npts(u32), dim(u32), data(float32*npts*dim)].""" + npts, dim = data.shape + with open(path, 'wb') as f: + f.write(struct.pack('II', npts, dim)) + data.astype(np.float32).tofile(f) + +def write_ibin(path, data): + """Write DiskANN groundtruth .bin: [npts(u32), dim(u32), data(uint32*npts*dim)].""" + npts, dim = data.shape + with open(path, 'wb') as f: + f.write(struct.pack('II', npts, dim)) + data.astype(np.uint32).tofile(f) + +# Convert base vectors +base = read_fvecs(src_dir / "sift_base.fvecs") +print(f" Base: {base.shape[0]} points, {base.shape[1]} dims") +write_fbin(dst_dir / "sift_base.fbin", base) + +# Convert query vectors +query = read_fvecs(src_dir / "sift_query.fvecs") +print(f" Query: {query.shape[0]} points, {query.shape[1]} dims") +write_fbin(dst_dir / "sift_query.fbin", query) + +# Convert ground truth (take top-100) +gt = read_ivecs(src_dir / "sift_groundtruth.ivecs") +print(f" Groundtruth: {gt.shape[0]} queries, top-{gt.shape[1]}") +write_ibin(dst_dir / "sift_groundtruth.bin", gt) + +print(" Conversion complete!") +PYEOF + + # Clean up extracted files + rm -rf "$EXTRACT_DIR" "$SIFT_TAR" + else + echo "SIFT1M data already exists at $BIN_DIR, skipping download." + fi + echo "" +fi + +# ------------------------------------------------------------------- +# Step 2: Build the benchmark binary +# ------------------------------------------------------------------- +if [ "$SKIP_BUILD" = false ]; then + echo "--- Step 2: Building diskann-benchmark ---" + cd "$REPO_ROOT" + cargo build --release -p diskann-benchmark --features disk-index 2>&1 | tail -3 + echo "" +fi + +BENCHMARK_BIN="$REPO_ROOT/target/release/diskann-benchmark" +if [ ! -x "$BENCHMARK_BIN" ]; then + echo "ERROR: benchmark binary not found at $BENCHMARK_BIN" + echo "Run without --skip-build or build manually:" + echo " cargo build --release -p diskann-benchmark --features disk-index" + exit 1 +fi + +# ------------------------------------------------------------------- +# Step 3: Build disk index (if needed) +# ------------------------------------------------------------------- +if [ "$SKIP_INDEX" = false ] && [ ! 
-f "${INDEX_PREFIX}_disk.index" ]; then + echo "--- Step 3: Building disk index (R=64, L=100, PQ_16) ---" + mkdir -p "$INDEX_DIR" + + cat > "$DATA_DIR/build_config.json" << BUILDEOF +{ + "search_directories": ["$BIN_DIR"], + "jobs": [ + { + "type": "disk-index", + "content": { + "source": { + "disk-index-source": "Build", + "data_type": "float32", + "data": "sift_base.fbin", + "distance": "squared_l2", + "dim": 128, + "max_degree": 64, + "l_build": 100, + "num_threads": $NUM_THREADS, + "build_ram_limit_gb": 4.0, + "num_pq_chunks": 16, + "quantization_type": "FP", + "save_path": "$INDEX_PREFIX" + } + } + } + ] +} +BUILDEOF + + "$BENCHMARK_BIN" run --input-file "$DATA_DIR/build_config.json" --output-file /dev/null + echo "" +elif [ "$SKIP_INDEX" = true ]; then + echo "--- Step 3: Skipping index build (--skip-index) ---" + echo "" +else + echo "--- Step 3: Disk index already exists, skipping build ---" + echo "" +fi + +if [ ! -f "${INDEX_PREFIX}_disk.index" ]; then + echo "ERROR: Disk index not found at ${INDEX_PREFIX}_disk.index" + exit 1 +fi + +# ------------------------------------------------------------------- +# Step 4: Run ablation benchmark (BeamSearch vs PipeSearch) +# ------------------------------------------------------------------- +echo "--- Step 4: Running ablation benchmark ---" +cat > "$CONFIG_FILE" << CFGEOF +{ + "search_directories": ["$BIN_DIR"], + "jobs": [ + { + "type": "disk-index", + "content": { + "source": { + "disk-index-source": "Load", + "data_type": "float32", + "load_path": "$INDEX_PREFIX" + }, + "search_phase": { + "queries": "sift_query.fbin", + "groundtruth": "sift_groundtruth.bin", + "search_list": [10, 20, 50, 100], + "beam_width": $BEAM_WIDTH, + "recall_at": 10, + "num_threads": $NUM_THREADS, + "is_flat_search": false, + "distance": "squared_l2", + "search_mode": {"mode": "BeamSearch"} + } + } + }, + { + "type": "disk-index", + "content": { + "source": { + "disk-index-source": "Load", + "data_type": "float32", + "load_path": "$INDEX_PREFIX" + }, + "search_phase": { + "queries": "sift_query.fbin", + "groundtruth": "sift_groundtruth.bin", + "search_list": [10, 20, 50, 100], + "beam_width": $BEAM_WIDTH, + "recall_at": 10, + "num_threads": $NUM_THREADS, + "is_flat_search": false, + "distance": "squared_l2", + "search_mode": {"mode": "PipeSearch"} + } + } + } + ] +} +CFGEOF + +"$BENCHMARK_BIN" run --input-file "$CONFIG_FILE" --output-file "$OUTPUT_FILE" + +echo "" +echo "=== Benchmark Complete ===" +echo "Results saved to: $OUTPUT_FILE" +echo "" +echo "To re-run with different parameters:" +echo " $0 --skip-download --skip-index --threads N --beam-width N" From a030ae7825eff7036a7e3a0d532827ef3f0d8721 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 09:37:31 -0800 Subject: [PATCH 22/46] Fix benchmark script: add search_phase to build config, thread sweep with charts Fixes: - Build config now includes required search_phase (was missing, caused 'missing field search_phase' error) New features: - Thread sweep: runs BeamSearch vs PipeSearch for 1..max_threads (configurable --max-threads, --thread-stride) - Generates 2x2 chart (QPS, mean latency, p95, p99.9 vs threads) with both modes as colored lines on same plot - Outputs CSV for external plotting tools - Configurable --search-l for single L value per sweep point --- diskann-benchmark/scripts/sift1m_benchmark.sh | 309 ++++++++++++++---- 1 file changed, 239 insertions(+), 70 deletions(-) diff --git a/diskann-benchmark/scripts/sift1m_benchmark.sh b/diskann-benchmark/scripts/sift1m_benchmark.sh 
index 30401edb8..7ea01b9ae 100755 --- a/diskann-benchmark/scripts/sift1m_benchmark.sh +++ b/diskann-benchmark/scripts/sift1m_benchmark.sh @@ -4,14 +4,17 @@ # Downloads SIFT1M dataset, builds a disk index, and runs an ablation # benchmark comparing BeamSearch vs PipeSearch (io_uring pipelining). # +# By default, sweeps thread counts from 1 to max_threads in strides of 4 +# and produces charts (QPS, mean latency, tail latency vs threads). +# # Prerequisites: # - Linux (PipeSearch requires io_uring) # - Rust toolchain (cargo) -# - curl, tar, python3 with numpy +# - curl, tar, python3 with numpy and matplotlib # - ~2GB free disk space for data + index # # Usage: -# ./diskann-benchmark/scripts/sift1m_benchmark.sh [--data-dir DIR] [--skip-download] [--skip-build] +# ./diskann-benchmark/scripts/sift1m_benchmark.sh [OPTIONS] set -euo pipefail @@ -20,8 +23,10 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" # Defaults DATA_DIR="${DATA_DIR:-$REPO_ROOT/benchmark_data/sift1m}" -NUM_THREADS="${NUM_THREADS:-4}" +MAX_THREADS="${MAX_THREADS:-48}" +THREAD_STRIDE="${THREAD_STRIDE:-4}" BEAM_WIDTH="${BEAM_WIDTH:-4}" +SEARCH_L="${SEARCH_L:-100}" SKIP_DOWNLOAD=false SKIP_BUILD=false SKIP_INDEX=false @@ -29,22 +34,26 @@ SKIP_INDEX=false # Parse arguments while [[ $# -gt 0 ]]; do case "$1" in - --data-dir) DATA_DIR="$2"; shift 2 ;; + --data-dir) DATA_DIR="$2"; shift 2 ;; --skip-download) SKIP_DOWNLOAD=true; shift ;; - --skip-build) SKIP_BUILD=true; shift ;; - --skip-index) SKIP_INDEX=true; shift ;; - --threads) NUM_THREADS="$2"; shift 2 ;; - --beam-width) BEAM_WIDTH="$2"; shift 2 ;; + --skip-build) SKIP_BUILD=true; shift ;; + --skip-index) SKIP_INDEX=true; shift ;; + --max-threads) MAX_THREADS="$2"; shift 2 ;; + --thread-stride) THREAD_STRIDE="$2"; shift 2 ;; + --beam-width) BEAM_WIDTH="$2"; shift 2 ;; + --search-l) SEARCH_L="$2"; shift 2 ;; -h|--help) echo "Usage: $0 [OPTIONS]" echo "" echo "Options:" - echo " --data-dir DIR Data directory (default: \$REPO_ROOT/benchmark_data/sift1m)" - echo " --skip-download Skip downloading SIFT1M (use existing data)" - echo " --skip-build Skip building the benchmark binary" - echo " --skip-index Skip building the disk index (use existing index)" - echo " --threads N Number of search threads (default: 4)" - echo " --beam-width N Beam width / pipeline width (default: 4)" + echo " --data-dir DIR Data directory (default: \$REPO_ROOT/benchmark_data/sift1m)" + echo " --skip-download Skip downloading SIFT1M (use existing data)" + echo " --skip-build Skip building the benchmark binary" + echo " --skip-index Skip building the disk index (use existing index)" + echo " --max-threads N Maximum thread count for sweep (default: 48)" + echo " --thread-stride N Thread count increment (default: 4)" + echo " --beam-width N Beam width / pipeline width (default: 4)" + echo " --search-l N Search list size L (default: 100)" exit 0 ;; *) echo "Unknown option: $1"; exit 1 ;; @@ -54,12 +63,12 @@ done BIN_DIR="$DATA_DIR/bin" INDEX_DIR="$DATA_DIR/index" INDEX_PREFIX="$INDEX_DIR/sift1m_R64_L100" -CONFIG_FILE="$DATA_DIR/benchmark_config.json" -OUTPUT_FILE="$DATA_DIR/benchmark_results.json" +OUTPUT_DIR="$DATA_DIR/results" echo "=== SIFT1M Pipelined Search Benchmark ===" echo "Data directory: $DATA_DIR" -echo "Threads: $NUM_THREADS, Beam width: $BEAM_WIDTH" +echo "Thread sweep: 1, 4..${MAX_THREADS} (stride ${THREAD_STRIDE})" +echo "Beam width: $BEAM_WIDTH, Search L: $SEARCH_L" echo "" # ------------------------------------------------------------------- @@ -92,50 +101,41 @@ src_dir = Path(sys.argv[1]) 
dst_dir = Path(sys.argv[2]) def read_fvecs(path): - """Read .fvecs format: [dim(int32), vec(float32*dim)] per row.""" data = np.fromfile(path, dtype=np.float32) dim = int(data[0].view(np.int32)) return data.reshape(-1, dim + 1)[:, 1:] def read_ivecs(path): - """Read .ivecs format: [dim(int32), vec(int32*dim)] per row.""" data = np.fromfile(path, dtype=np.int32) dim = data[0] return data.reshape(-1, dim + 1)[:, 1:] def write_fbin(path, data): - """Write DiskANN .fbin format: [npts(u32), dim(u32), data(float32*npts*dim)].""" npts, dim = data.shape with open(path, 'wb') as f: f.write(struct.pack('II', npts, dim)) data.astype(np.float32).tofile(f) def write_ibin(path, data): - """Write DiskANN groundtruth .bin: [npts(u32), dim(u32), data(uint32*npts*dim)].""" npts, dim = data.shape with open(path, 'wb') as f: f.write(struct.pack('II', npts, dim)) data.astype(np.uint32).tofile(f) -# Convert base vectors base = read_fvecs(src_dir / "sift_base.fvecs") print(f" Base: {base.shape[0]} points, {base.shape[1]} dims") write_fbin(dst_dir / "sift_base.fbin", base) -# Convert query vectors query = read_fvecs(src_dir / "sift_query.fvecs") print(f" Query: {query.shape[0]} points, {query.shape[1]} dims") write_fbin(dst_dir / "sift_query.fbin", query) -# Convert ground truth (take top-100) gt = read_ivecs(src_dir / "sift_groundtruth.ivecs") print(f" Groundtruth: {gt.shape[0]} queries, top-{gt.shape[1]}") write_ibin(dst_dir / "sift_groundtruth.bin", gt) - print(" Conversion complete!") PYEOF - # Clean up extracted files rm -rf "$EXTRACT_DIR" "$SIFT_TAR" else echo "SIFT1M data already exists at $BIN_DIR, skipping download." @@ -168,6 +168,8 @@ if [ "$SKIP_INDEX" = false ] && [ ! -f "${INDEX_PREFIX}_disk.index" ]; then echo "--- Step 3: Building disk index (R=64, L=100, PQ_16) ---" mkdir -p "$INDEX_DIR" + # Build job requires a search_phase; we include a minimal one that also + # validates the index works after building. cat > "$DATA_DIR/build_config.json" << BUILDEOF { "search_directories": ["$BIN_DIR"], @@ -183,11 +185,21 @@ if [ "$SKIP_INDEX" = false ] && [ ! -f "${INDEX_PREFIX}_disk.index" ]; then "dim": 128, "max_degree": 64, "l_build": 100, - "num_threads": $NUM_THREADS, + "num_threads": 4, "build_ram_limit_gb": 4.0, "num_pq_chunks": 16, "quantization_type": "FP", "save_path": "$INDEX_PREFIX" + }, + "search_phase": { + "queries": "sift_query.fbin", + "groundtruth": "sift_groundtruth.bin", + "search_list": [50], + "beam_width": 4, + "recall_at": 10, + "num_threads": 1, + "is_flat_search": false, + "distance": "squared_l2" } } } @@ -211,64 +223,221 @@ if [ ! 
-f "${INDEX_PREFIX}_disk.index" ]; then fi # ------------------------------------------------------------------- -# Step 4: Run ablation benchmark (BeamSearch vs PipeSearch) +# Step 4: Thread sweep benchmark # ------------------------------------------------------------------- -echo "--- Step 4: Running ablation benchmark ---" -cat > "$CONFIG_FILE" << CFGEOF -{ - "search_directories": ["$BIN_DIR"], - "jobs": [ +echo "--- Step 4: Running thread sweep benchmark ---" +mkdir -p "$OUTPUT_DIR" + +# Build thread list: 1, then 4, 8, ..., MAX_THREADS +THREAD_LIST="1" +for (( t=THREAD_STRIDE; t<=MAX_THREADS; t+=THREAD_STRIDE )); do + THREAD_LIST="$THREAD_LIST $t" +done +echo "Thread counts: $THREAD_LIST" + +# Generate a single config with all jobs (2 per thread count: Beam + Pipe) +JOBS="" +for T in $THREAD_LIST; do + [ -n "$JOBS" ] && JOBS="$JOBS," + JOBS="$JOBS { - "type": "disk-index", - "content": { - "source": { - "disk-index-source": "Load", - "data_type": "float32", - "load_path": "$INDEX_PREFIX" + \"type\": \"disk-index\", + \"content\": { + \"source\": { + \"disk-index-source\": \"Load\", + \"data_type\": \"float32\", + \"load_path\": \"$INDEX_PREFIX\" }, - "search_phase": { - "queries": "sift_query.fbin", - "groundtruth": "sift_groundtruth.bin", - "search_list": [10, 20, 50, 100], - "beam_width": $BEAM_WIDTH, - "recall_at": 10, - "num_threads": $NUM_THREADS, - "is_flat_search": false, - "distance": "squared_l2", - "search_mode": {"mode": "BeamSearch"} + \"search_phase\": { + \"queries\": \"sift_query.fbin\", + \"groundtruth\": \"sift_groundtruth.bin\", + \"search_list\": [$SEARCH_L], + \"beam_width\": $BEAM_WIDTH, + \"recall_at\": 10, + \"num_threads\": $T, + \"is_flat_search\": false, + \"distance\": \"squared_l2\", + \"search_mode\": {\"mode\": \"BeamSearch\"} } } }, { - "type": "disk-index", - "content": { - "source": { - "disk-index-source": "Load", - "data_type": "float32", - "load_path": "$INDEX_PREFIX" + \"type\": \"disk-index\", + \"content\": { + \"source\": { + \"disk-index-source\": \"Load\", + \"data_type\": \"float32\", + \"load_path\": \"$INDEX_PREFIX\" }, - "search_phase": { - "queries": "sift_query.fbin", - "groundtruth": "sift_groundtruth.bin", - "search_list": [10, 20, 50, 100], - "beam_width": $BEAM_WIDTH, - "recall_at": 10, - "num_threads": $NUM_THREADS, - "is_flat_search": false, - "distance": "squared_l2", - "search_mode": {"mode": "PipeSearch"} + \"search_phase\": { + \"queries\": \"sift_query.fbin\", + \"groundtruth\": \"sift_groundtruth.bin\", + \"search_list\": [$SEARCH_L], + \"beam_width\": $BEAM_WIDTH, + \"recall_at\": 10, + \"num_threads\": $T, + \"is_flat_search\": false, + \"distance\": \"squared_l2\", + \"search_mode\": {\"mode\": \"PipeSearch\"} } } - } + }" +done + +SWEEP_CONFIG="$OUTPUT_DIR/sweep_config.json" +SWEEP_OUTPUT="$OUTPUT_DIR/sweep_results.json" + +cat > "$SWEEP_CONFIG" << SWEEPEOF +{ + "search_directories": ["$BIN_DIR"], + "jobs": [$JOBS ] } -CFGEOF +SWEEPEOF + +"$BENCHMARK_BIN" run --input-file "$SWEEP_CONFIG" --output-file "$SWEEP_OUTPUT" + +echo "" +echo "--- Step 5: Generating charts ---" + +python3 - "$SWEEP_OUTPUT" "$OUTPUT_DIR" "$SEARCH_L" "$BEAM_WIDTH" << 'CHARTEOF' +import json, sys, os + +output_dir = sys.argv[2] +search_l = sys.argv[3] +beam_width = sys.argv[4] + +with open(sys.argv[1]) as f: + data = json.load(f) + +# Parse results: each job is data[i] with structure: +# data[i]["results"]["search"]["search_mode"] — "BeamSearch" or "PipeSearch(...)" +# data[i]["results"]["search"]["num_threads"] — thread count +# 
data[i]["results"]["search"]["search_results_per_l"][0] — first (only) L result +beam = {"threads": [], "qps": [], "mean_lat": [], "p95_lat": [], "p999_lat": [], "recall": []} +pipe = {"threads": [], "qps": [], "mean_lat": [], "p95_lat": [], "p999_lat": [], "recall": []} + +for job in data: + search = job.get("results", {}).get("search", {}) + if not search: + continue + results_per_l = search.get("search_results_per_l", []) + if not results_per_l: + continue + r = results_per_l[0] + threads = search.get("num_threads", 0) + mode = str(search.get("search_mode", "")) + + d = beam if "BeamSearch" in mode else pipe + + d["threads"].append(threads) + d["qps"].append(r.get("qps", 0)) + d["mean_lat"].append(r.get("mean_latency", 0)) + d["p95_lat"].append(r.get("p95_latency", 0)) + d["p999_lat"].append(r.get("p999_latency", 0)) + d["recall"].append(r.get("recall", 0)) + +# Sort by threads +for d in [beam, pipe]: + if d["threads"]: + order = sorted(range(len(d["threads"])), key=lambda i: d["threads"][i]) + for k in d: + d[k] = [d[k][i] for i in order] + +# Print table +print(f"\n{'Threads':>7s} {'BeamSearch QPS':>14s} {'PipeSearch QPS':>14s} " + f"{'Beam Mean':>10s} {'Pipe Mean':>10s} " + f"{'Beam p999':>10s} {'Pipe p999':>10s} " + f"{'Beam Recall':>11s} {'Pipe Recall':>11s}") +print("=" * 120) + +for i in range(len(beam["threads"])): + bt, bq, bm, bp9 = beam["threads"][i], beam["qps"][i], beam["mean_lat"][i], beam["p999_lat"][i] + br = beam["recall"][i] + if i < len(pipe["threads"]): + pt, pq, pm, pp9 = pipe["threads"][i], pipe["qps"][i], pipe["mean_lat"][i], pipe["p999_lat"][i] + pr = pipe["recall"][i] + else: + pt, pq, pm, pp9, pr = bt, 0, 0, 0, 0 + print(f"{bt:7d} {bq:14.1f} {pq:14.1f} {bm:9.0f}us {pm:9.0f}us " + f"{bp9:9d}us {pp9:9d}us {br:10.2f}% {pr:10.2f}%") + +# Generate charts +try: + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + + fig, axes = plt.subplots(2, 2, figsize=(14, 10)) + fig.suptitle(f'SIFT1M BeamSearch vs PipeSearch (L={search_l}, BW={beam_width})', fontsize=14) + + # QPS vs Threads + ax = axes[0][0] + ax.plot(beam["threads"], beam["qps"], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) + ax.plot(pipe["threads"], pipe["qps"], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) + ax.set_xlabel('Threads') + ax.set_ylabel('QPS') + ax.set_title('Throughput (QPS)') + ax.legend() + ax.grid(True, alpha=0.3) + + # Mean Latency vs Threads + ax = axes[0][1] + ax.plot(beam["threads"], [x/1000 for x in beam["mean_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) + ax.plot(pipe["threads"], [x/1000 for x in pipe["mean_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) + ax.set_xlabel('Threads') + ax.set_ylabel('Mean Latency (ms)') + ax.set_title('Mean Latency') + ax.legend() + ax.grid(True, alpha=0.3) + + # p95 Latency vs Threads + ax = axes[1][0] + ax.plot(beam["threads"], [x/1000 for x in beam["p95_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) + ax.plot(pipe["threads"], [x/1000 for x in pipe["p95_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) + ax.set_xlabel('Threads') + ax.set_ylabel('p95 Latency (ms)') + ax.set_title('p95 Tail Latency') + ax.legend() + ax.grid(True, alpha=0.3) + + # p99.9 Latency vs Threads + ax = axes[1][1] + ax.plot(beam["threads"], [x/1000 for x in beam["p999_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) + ax.plot(pipe["threads"], [x/1000 for x in 
pipe["p999_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) + ax.set_xlabel('Threads') + ax.set_ylabel('p99.9 Latency (ms)') + ax.set_title('p99.9 Tail Latency') + ax.legend() + ax.grid(True, alpha=0.3) + + plt.tight_layout() + chart_path = os.path.join(output_dir, 'thread_sweep.png') + plt.savefig(chart_path, dpi=150) + print(f"\nChart saved to: {chart_path}") + plt.close() + +except ImportError: + print("\nmatplotlib not available — skipping chart generation.") + print("Install with: pip install matplotlib") -"$BENCHMARK_BIN" run --input-file "$CONFIG_FILE" --output-file "$OUTPUT_FILE" +# Save CSV for external plotting +csv_path = os.path.join(output_dir, 'thread_sweep.csv') +with open(csv_path, 'w') as f: + f.write("threads,mode,qps,mean_lat_us,p95_lat_us,p999_lat_us,recall\n") + for d, mode in [(beam, "BeamSearch"), (pipe, "PipeSearch")]: + for i in range(len(d["threads"])): + f.write(f"{d['threads'][i]},{mode},{d['qps'][i]:.1f}," + f"{d['mean_lat'][i]:.0f},{d['p95_lat'][i]}," + f"{d['p999_lat'][i]},{d['recall'][i]:.3f}\n") +print(f"CSV saved to: {csv_path}") +CHARTEOF echo "" echo "=== Benchmark Complete ===" -echo "Results saved to: $OUTPUT_FILE" +echo "Results: $SWEEP_OUTPUT" +echo "Charts: $OUTPUT_DIR/thread_sweep.png" +echo "CSV: $OUTPUT_DIR/thread_sweep.csv" echo "" echo "To re-run with different parameters:" -echo " $0 --skip-download --skip-index --threads N --beam-width N" +echo " $0 --skip-download --skip-index --max-threads N --search-l N" From c9abe71ef23657b50e0696c3106a43f70632500a Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 10:02:53 -0800 Subject: [PATCH 23/46] more ablation --- diskann-benchmark/scripts/sift1m_benchmark.sh | 207 +++++++++--------- 1 file changed, 109 insertions(+), 98 deletions(-) diff --git a/diskann-benchmark/scripts/sift1m_benchmark.sh b/diskann-benchmark/scripts/sift1m_benchmark.sh index 7ea01b9ae..2c2e11cab 100755 --- a/diskann-benchmark/scripts/sift1m_benchmark.sh +++ b/diskann-benchmark/scripts/sift1m_benchmark.sh @@ -30,6 +30,7 @@ SEARCH_L="${SEARCH_L:-100}" SKIP_DOWNLOAD=false SKIP_BUILD=false SKIP_INDEX=false +SQPOLL_MS="" # Parse arguments while [[ $# -gt 0 ]]; do @@ -42,6 +43,7 @@ while [[ $# -gt 0 ]]; do --thread-stride) THREAD_STRIDE="$2"; shift 2 ;; --beam-width) BEAM_WIDTH="$2"; shift 2 ;; --search-l) SEARCH_L="$2"; shift 2 ;; + --sqpoll) SQPOLL_MS="${2:-1000}"; shift 2 ;; -h|--help) echo "Usage: $0 [OPTIONS]" echo "" @@ -54,6 +56,7 @@ while [[ $# -gt 0 ]]; do echo " --thread-stride N Thread count increment (default: 4)" echo " --beam-width N Beam width / pipeline width (default: 4)" echo " --search-l N Search list size L (default: 100)" + echo " --sqpoll MS Enable SQPOLL on all PipeSearch configs (idle timeout in ms)" exit 0 ;; *) echo "Unknown option: $1"; exit 1 ;; @@ -235,9 +238,11 @@ for (( t=THREAD_STRIDE; t<=MAX_THREADS; t+=THREAD_STRIDE )); do done echo "Thread counts: $THREAD_LIST" -# Generate a single config with all jobs (2 per thread count: Beam + Pipe) +# Generate a single config with all jobs (4 per thread count for ablation) +# Modes: BeamSearch, PipeSearch (base), PipeSearch+ABW, PipeSearch+ABW+RelaxedMono JOBS="" -for T in $THREAD_LIST; do +add_job() { + local threads="$1" mode_json="$2" [ -n "$JOBS" ] && JOBS="$JOBS," JOBS="$JOBS { @@ -254,34 +259,24 @@ for T in $THREAD_LIST; do \"search_list\": [$SEARCH_L], \"beam_width\": $BEAM_WIDTH, \"recall_at\": 10, - \"num_threads\": $T, - \"is_flat_search\": false, - \"distance\": \"squared_l2\", - \"search_mode\": {\"mode\": 
\"BeamSearch\"} - } - } - }, - { - \"type\": \"disk-index\", - \"content\": { - \"source\": { - \"disk-index-source\": \"Load\", - \"data_type\": \"float32\", - \"load_path\": \"$INDEX_PREFIX\" - }, - \"search_phase\": { - \"queries\": \"sift_query.fbin\", - \"groundtruth\": \"sift_groundtruth.bin\", - \"search_list\": [$SEARCH_L], - \"beam_width\": $BEAM_WIDTH, - \"recall_at\": 10, - \"num_threads\": $T, + \"num_threads\": $threads, \"is_flat_search\": false, \"distance\": \"squared_l2\", - \"search_mode\": {\"mode\": \"PipeSearch\"} + \"search_mode\": $mode_json } } }" +} +# Optional SQPOLL suffix for PipeSearch configs +SQPOLL_JSON="" +if [ -n "$SQPOLL_MS" ]; then + SQPOLL_JSON=", \"sqpoll_idle_ms\": $SQPOLL_MS" +fi +for T in $THREAD_LIST; do + add_job "$T" '{"mode": "BeamSearch"}' + add_job "$T" '{"mode": "PipeSearch", "adaptive_beam_width": false'"$SQPOLL_JSON"'}' + add_job "$T" '{"mode": "PipeSearch", "adaptive_beam_width": true'"$SQPOLL_JSON"'}' + add_job "$T" '{"mode": "PipeSearch", "adaptive_beam_width": true, "relaxed_monotonicity_l": '"$SEARCH_L$SQPOLL_JSON"'}' done SWEEP_CONFIG="$OUTPUT_DIR/sweep_config.json" @@ -310,12 +305,30 @@ beam_width = sys.argv[4] with open(sys.argv[1]) as f: data = json.load(f) -# Parse results: each job is data[i] with structure: -# data[i]["results"]["search"]["search_mode"] — "BeamSearch" or "PipeSearch(...)" -# data[i]["results"]["search"]["num_threads"] — thread count -# data[i]["results"]["search"]["search_results_per_l"][0] — first (only) L result -beam = {"threads": [], "qps": [], "mean_lat": [], "p95_lat": [], "p999_lat": [], "recall": []} -pipe = {"threads": [], "qps": [], "mean_lat": [], "p95_lat": [], "p999_lat": [], "recall": []} +# Classify each job into one of 4 modes based on search_mode string +# BeamSearch, PipeSearch (no abw), PipeSearch(abw), PipeSearch(abw, rm_l=N) +MODE_KEYS = [ + ("BeamSearch", "BeamSearch"), + ("PipeSearch(base)","PipeSearch"), # no abw, no rm + ("PipeSearch+ABW", "PipeSearch(abw)"), # abw only + ("PipeSearch+ABW+RM","PipeSearch(abw, rm_l="), # abw + relaxed mono +] + +def classify_mode(mode_str): + s = str(mode_str) + if "BeamSearch" in s: + return "BeamSearch" + if "rm_l=" in s: + return "PipeSearch+ABW+RM" + if "abw" in s: + return "PipeSearch+ABW" + # PipeSearch with no options or empty parens + return "PipeSearch(base)" + +def empty_series(): + return {"threads": [], "qps": [], "mean_lat": [], "p95_lat": [], "p999_lat": [], "recall": []} + +series = {k: empty_series() for k, _ in MODE_KEYS} for job in data: search = job.get("results", {}).get("search", {}) @@ -326,10 +339,10 @@ for job in data: continue r = results_per_l[0] threads = search.get("num_threads", 0) - mode = str(search.get("search_mode", "")) - - d = beam if "BeamSearch" in mode else pipe - + mode = classify_mode(search.get("search_mode", "")) + d = series.get(mode) + if d is None: + continue d["threads"].append(threads) d["qps"].append(r.get("qps", 0)) d["mean_lat"].append(r.get("mean_latency", 0)) @@ -337,79 +350,77 @@ for job in data: d["p999_lat"].append(r.get("p999_latency", 0)) d["recall"].append(r.get("recall", 0)) -# Sort by threads -for d in [beam, pipe]: +# Sort each series by threads +for d in series.values(): if d["threads"]: order = sorted(range(len(d["threads"])), key=lambda i: d["threads"][i]) for k in d: d[k] = [d[k][i] for i in order] # Print table -print(f"\n{'Threads':>7s} {'BeamSearch QPS':>14s} {'PipeSearch QPS':>14s} " - f"{'Beam Mean':>10s} {'Pipe Mean':>10s} " - f"{'Beam p999':>10s} {'Pipe p999':>10s} " - f"{'Beam 
Recall':>11s} {'Pipe Recall':>11s}") -print("=" * 120) - -for i in range(len(beam["threads"])): - bt, bq, bm, bp9 = beam["threads"][i], beam["qps"][i], beam["mean_lat"][i], beam["p999_lat"][i] - br = beam["recall"][i] - if i < len(pipe["threads"]): - pt, pq, pm, pp9 = pipe["threads"][i], pipe["qps"][i], pipe["mean_lat"][i], pipe["p999_lat"][i] - pr = pipe["recall"][i] - else: - pt, pq, pm, pp9, pr = bt, 0, 0, 0, 0 - print(f"{bt:7d} {bq:14.1f} {pq:14.1f} {bm:9.0f}us {pm:9.0f}us " - f"{bp9:9d}us {pp9:9d}us {br:10.2f}% {pr:10.2f}%") - -# Generate charts +header_modes = list(series.keys()) +print(f"\n{'Threads':>7s}", end="") +for m in header_modes: + print(f" {m+' QPS':>18s}", end="") +print() +print(f"{'':>7s}", end="") +for m in header_modes: + print(f" {'recall':>8s} {'mean':>8s} {'p999':>8s}", end="") +print() +print("=" * (7 + len(header_modes) * 30)) + +max_len = max(len(series[m]["threads"]) for m in header_modes) +for i in range(max_len): + t = None + for m in header_modes: + if i < len(series[m]["threads"]): + t = series[m]["threads"][i] + break + print(f"{t or 0:7d}", end="") + for m in header_modes: + d = series[m] + if i < len(d["threads"]): + print(f" {d['qps'][i]:8.0f}qps {d['recall'][i]:6.1f}% {d['mean_lat'][i]/1000:5.1f}ms {d['p999_lat'][i]/1000:5.1f}ms", end="") + else: + print(f" {'N/A':>30s}", end="") + print() + +# Chart styles per mode +STYLES = { + "BeamSearch": {"color": "#2196F3", "marker": "o", "ls": "-"}, + "PipeSearch(base)": {"color": "#FF9800", "marker": "D", "ls": "--"}, + "PipeSearch+ABW": {"color": "#FF5722", "marker": "s", "ls": "-"}, + "PipeSearch+ABW+RM": {"color": "#4CAF50", "marker": "^", "ls": "-"}, +} + try: import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt fig, axes = plt.subplots(2, 2, figsize=(14, 10)) - fig.suptitle(f'SIFT1M BeamSearch vs PipeSearch (L={search_l}, BW={beam_width})', fontsize=14) - - # QPS vs Threads - ax = axes[0][0] - ax.plot(beam["threads"], beam["qps"], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) - ax.plot(pipe["threads"], pipe["qps"], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) - ax.set_xlabel('Threads') - ax.set_ylabel('QPS') - ax.set_title('Throughput (QPS)') - ax.legend() - ax.grid(True, alpha=0.3) - - # Mean Latency vs Threads - ax = axes[0][1] - ax.plot(beam["threads"], [x/1000 for x in beam["mean_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) - ax.plot(pipe["threads"], [x/1000 for x in pipe["mean_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) - ax.set_xlabel('Threads') - ax.set_ylabel('Mean Latency (ms)') - ax.set_title('Mean Latency') - ax.legend() - ax.grid(True, alpha=0.3) - - # p95 Latency vs Threads - ax = axes[1][0] - ax.plot(beam["threads"], [x/1000 for x in beam["p95_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) - ax.plot(pipe["threads"], [x/1000 for x in pipe["p95_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) - ax.set_xlabel('Threads') - ax.set_ylabel('p95 Latency (ms)') - ax.set_title('p95 Tail Latency') - ax.legend() - ax.grid(True, alpha=0.3) - - # p99.9 Latency vs Threads - ax = axes[1][1] - ax.plot(beam["threads"], [x/1000 for x in beam["p999_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) - ax.plot(pipe["threads"], [x/1000 for x in pipe["p999_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) - ax.set_xlabel('Threads') - ax.set_ylabel('p99.9 Latency (ms)') 
- ax.set_title('p99.9 Tail Latency') - ax.legend() - ax.grid(True, alpha=0.3) + fig.suptitle(f'SIFT1M Ablation (L={search_l}, BW={beam_width})', fontsize=14) + + metrics = [ + (axes[0][0], "qps", "QPS", 1, False), + (axes[0][1], "mean_lat", "Mean Latency (ms)", 1000, True), + (axes[1][0], "p95_lat", "p95 Latency (ms)", 1000, True), + (axes[1][1], "p999_lat", "p99.9 Latency (ms)", 1000, True), + ] + + for ax, key, title, divisor, is_latency in metrics: + for mode_name, d in series.items(): + if not d["threads"]: + continue + st = STYLES.get(mode_name, {"color": "gray", "marker": ".", "ls": "-"}) + vals = [v / divisor for v in d[key]] if divisor != 1 else d[key] + ax.plot(d["threads"], vals, marker=st["marker"], linestyle=st["ls"], + color=st["color"], label=mode_name, linewidth=2, markersize=5) + ax.set_xlabel('Threads') + ax.set_ylabel(title) + ax.set_title(title) + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3) plt.tight_layout() chart_path = os.path.join(output_dir, 'thread_sweep.png') @@ -425,9 +436,9 @@ except ImportError: csv_path = os.path.join(output_dir, 'thread_sweep.csv') with open(csv_path, 'w') as f: f.write("threads,mode,qps,mean_lat_us,p95_lat_us,p999_lat_us,recall\n") - for d, mode in [(beam, "BeamSearch"), (pipe, "PipeSearch")]: + for mode_name, d in series.items(): for i in range(len(d["threads"])): - f.write(f"{d['threads'][i]},{mode},{d['qps'][i]:.1f}," + f.write(f"{d['threads'][i]},{mode_name},{d['qps'][i]:.1f}," f"{d['mean_lat'][i]:.0f},{d['p95_lat'][i]}," f"{d['p999_lat'][i]},{d['recall'][i]:.3f}\n") print(f"CSV saved to: {csv_path}") From 6f3d4bb2543dfa29ff3b3726ac4fd7cfea485c0b Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 10:08:49 -0800 Subject: [PATCH 24/46] Revert "more ablation" This reverts commit 545ee2cc100070f68c18a66188d5ca3b4f496d40. 
--- diskann-benchmark/scripts/sift1m_benchmark.sh | 207 +++++++++--------- 1 file changed, 98 insertions(+), 109 deletions(-) diff --git a/diskann-benchmark/scripts/sift1m_benchmark.sh b/diskann-benchmark/scripts/sift1m_benchmark.sh index 2c2e11cab..7ea01b9ae 100755 --- a/diskann-benchmark/scripts/sift1m_benchmark.sh +++ b/diskann-benchmark/scripts/sift1m_benchmark.sh @@ -30,7 +30,6 @@ SEARCH_L="${SEARCH_L:-100}" SKIP_DOWNLOAD=false SKIP_BUILD=false SKIP_INDEX=false -SQPOLL_MS="" # Parse arguments while [[ $# -gt 0 ]]; do @@ -43,7 +42,6 @@ while [[ $# -gt 0 ]]; do --thread-stride) THREAD_STRIDE="$2"; shift 2 ;; --beam-width) BEAM_WIDTH="$2"; shift 2 ;; --search-l) SEARCH_L="$2"; shift 2 ;; - --sqpoll) SQPOLL_MS="${2:-1000}"; shift 2 ;; -h|--help) echo "Usage: $0 [OPTIONS]" echo "" @@ -56,7 +54,6 @@ while [[ $# -gt 0 ]]; do echo " --thread-stride N Thread count increment (default: 4)" echo " --beam-width N Beam width / pipeline width (default: 4)" echo " --search-l N Search list size L (default: 100)" - echo " --sqpoll MS Enable SQPOLL on all PipeSearch configs (idle timeout in ms)" exit 0 ;; *) echo "Unknown option: $1"; exit 1 ;; @@ -238,11 +235,9 @@ for (( t=THREAD_STRIDE; t<=MAX_THREADS; t+=THREAD_STRIDE )); do done echo "Thread counts: $THREAD_LIST" -# Generate a single config with all jobs (4 per thread count for ablation) -# Modes: BeamSearch, PipeSearch (base), PipeSearch+ABW, PipeSearch+ABW+RelaxedMono +# Generate a single config with all jobs (2 per thread count: Beam + Pipe) JOBS="" -add_job() { - local threads="$1" mode_json="$2" +for T in $THREAD_LIST; do [ -n "$JOBS" ] && JOBS="$JOBS," JOBS="$JOBS { @@ -259,24 +254,34 @@ add_job() { \"search_list\": [$SEARCH_L], \"beam_width\": $BEAM_WIDTH, \"recall_at\": 10, - \"num_threads\": $threads, + \"num_threads\": $T, + \"is_flat_search\": false, + \"distance\": \"squared_l2\", + \"search_mode\": {\"mode\": \"BeamSearch\"} + } + } + }, + { + \"type\": \"disk-index\", + \"content\": { + \"source\": { + \"disk-index-source\": \"Load\", + \"data_type\": \"float32\", + \"load_path\": \"$INDEX_PREFIX\" + }, + \"search_phase\": { + \"queries\": \"sift_query.fbin\", + \"groundtruth\": \"sift_groundtruth.bin\", + \"search_list\": [$SEARCH_L], + \"beam_width\": $BEAM_WIDTH, + \"recall_at\": 10, + \"num_threads\": $T, \"is_flat_search\": false, \"distance\": \"squared_l2\", - \"search_mode\": $mode_json + \"search_mode\": {\"mode\": \"PipeSearch\"} } } }" -} -# Optional SQPOLL suffix for PipeSearch configs -SQPOLL_JSON="" -if [ -n "$SQPOLL_MS" ]; then - SQPOLL_JSON=", \"sqpoll_idle_ms\": $SQPOLL_MS" -fi -for T in $THREAD_LIST; do - add_job "$T" '{"mode": "BeamSearch"}' - add_job "$T" '{"mode": "PipeSearch", "adaptive_beam_width": false'"$SQPOLL_JSON"'}' - add_job "$T" '{"mode": "PipeSearch", "adaptive_beam_width": true'"$SQPOLL_JSON"'}' - add_job "$T" '{"mode": "PipeSearch", "adaptive_beam_width": true, "relaxed_monotonicity_l": '"$SEARCH_L$SQPOLL_JSON"'}' done SWEEP_CONFIG="$OUTPUT_DIR/sweep_config.json" @@ -305,30 +310,12 @@ beam_width = sys.argv[4] with open(sys.argv[1]) as f: data = json.load(f) -# Classify each job into one of 4 modes based on search_mode string -# BeamSearch, PipeSearch (no abw), PipeSearch(abw), PipeSearch(abw, rm_l=N) -MODE_KEYS = [ - ("BeamSearch", "BeamSearch"), - ("PipeSearch(base)","PipeSearch"), # no abw, no rm - ("PipeSearch+ABW", "PipeSearch(abw)"), # abw only - ("PipeSearch+ABW+RM","PipeSearch(abw, rm_l="), # abw + relaxed mono -] - -def classify_mode(mode_str): - s = str(mode_str) - if "BeamSearch" in s: - 
return "BeamSearch" - if "rm_l=" in s: - return "PipeSearch+ABW+RM" - if "abw" in s: - return "PipeSearch+ABW" - # PipeSearch with no options or empty parens - return "PipeSearch(base)" - -def empty_series(): - return {"threads": [], "qps": [], "mean_lat": [], "p95_lat": [], "p999_lat": [], "recall": []} - -series = {k: empty_series() for k, _ in MODE_KEYS} +# Parse results: each job is data[i] with structure: +# data[i]["results"]["search"]["search_mode"] — "BeamSearch" or "PipeSearch(...)" +# data[i]["results"]["search"]["num_threads"] — thread count +# data[i]["results"]["search"]["search_results_per_l"][0] — first (only) L result +beam = {"threads": [], "qps": [], "mean_lat": [], "p95_lat": [], "p999_lat": [], "recall": []} +pipe = {"threads": [], "qps": [], "mean_lat": [], "p95_lat": [], "p999_lat": [], "recall": []} for job in data: search = job.get("results", {}).get("search", {}) @@ -339,10 +326,10 @@ for job in data: continue r = results_per_l[0] threads = search.get("num_threads", 0) - mode = classify_mode(search.get("search_mode", "")) - d = series.get(mode) - if d is None: - continue + mode = str(search.get("search_mode", "")) + + d = beam if "BeamSearch" in mode else pipe + d["threads"].append(threads) d["qps"].append(r.get("qps", 0)) d["mean_lat"].append(r.get("mean_latency", 0)) @@ -350,77 +337,79 @@ for job in data: d["p999_lat"].append(r.get("p999_latency", 0)) d["recall"].append(r.get("recall", 0)) -# Sort each series by threads -for d in series.values(): +# Sort by threads +for d in [beam, pipe]: if d["threads"]: order = sorted(range(len(d["threads"])), key=lambda i: d["threads"][i]) for k in d: d[k] = [d[k][i] for i in order] # Print table -header_modes = list(series.keys()) -print(f"\n{'Threads':>7s}", end="") -for m in header_modes: - print(f" {m+' QPS':>18s}", end="") -print() -print(f"{'':>7s}", end="") -for m in header_modes: - print(f" {'recall':>8s} {'mean':>8s} {'p999':>8s}", end="") -print() -print("=" * (7 + len(header_modes) * 30)) - -max_len = max(len(series[m]["threads"]) for m in header_modes) -for i in range(max_len): - t = None - for m in header_modes: - if i < len(series[m]["threads"]): - t = series[m]["threads"][i] - break - print(f"{t or 0:7d}", end="") - for m in header_modes: - d = series[m] - if i < len(d["threads"]): - print(f" {d['qps'][i]:8.0f}qps {d['recall'][i]:6.1f}% {d['mean_lat'][i]/1000:5.1f}ms {d['p999_lat'][i]/1000:5.1f}ms", end="") - else: - print(f" {'N/A':>30s}", end="") - print() - -# Chart styles per mode -STYLES = { - "BeamSearch": {"color": "#2196F3", "marker": "o", "ls": "-"}, - "PipeSearch(base)": {"color": "#FF9800", "marker": "D", "ls": "--"}, - "PipeSearch+ABW": {"color": "#FF5722", "marker": "s", "ls": "-"}, - "PipeSearch+ABW+RM": {"color": "#4CAF50", "marker": "^", "ls": "-"}, -} - +print(f"\n{'Threads':>7s} {'BeamSearch QPS':>14s} {'PipeSearch QPS':>14s} " + f"{'Beam Mean':>10s} {'Pipe Mean':>10s} " + f"{'Beam p999':>10s} {'Pipe p999':>10s} " + f"{'Beam Recall':>11s} {'Pipe Recall':>11s}") +print("=" * 120) + +for i in range(len(beam["threads"])): + bt, bq, bm, bp9 = beam["threads"][i], beam["qps"][i], beam["mean_lat"][i], beam["p999_lat"][i] + br = beam["recall"][i] + if i < len(pipe["threads"]): + pt, pq, pm, pp9 = pipe["threads"][i], pipe["qps"][i], pipe["mean_lat"][i], pipe["p999_lat"][i] + pr = pipe["recall"][i] + else: + pt, pq, pm, pp9, pr = bt, 0, 0, 0, 0 + print(f"{bt:7d} {bq:14.1f} {pq:14.1f} {bm:9.0f}us {pm:9.0f}us " + f"{bp9:9d}us {pp9:9d}us {br:10.2f}% {pr:10.2f}%") + +# Generate charts try: import matplotlib 
matplotlib.use('Agg') import matplotlib.pyplot as plt fig, axes = plt.subplots(2, 2, figsize=(14, 10)) - fig.suptitle(f'SIFT1M Ablation (L={search_l}, BW={beam_width})', fontsize=14) - - metrics = [ - (axes[0][0], "qps", "QPS", 1, False), - (axes[0][1], "mean_lat", "Mean Latency (ms)", 1000, True), - (axes[1][0], "p95_lat", "p95 Latency (ms)", 1000, True), - (axes[1][1], "p999_lat", "p99.9 Latency (ms)", 1000, True), - ] - - for ax, key, title, divisor, is_latency in metrics: - for mode_name, d in series.items(): - if not d["threads"]: - continue - st = STYLES.get(mode_name, {"color": "gray", "marker": ".", "ls": "-"}) - vals = [v / divisor for v in d[key]] if divisor != 1 else d[key] - ax.plot(d["threads"], vals, marker=st["marker"], linestyle=st["ls"], - color=st["color"], label=mode_name, linewidth=2, markersize=5) - ax.set_xlabel('Threads') - ax.set_ylabel(title) - ax.set_title(title) - ax.legend(fontsize=8) - ax.grid(True, alpha=0.3) + fig.suptitle(f'SIFT1M BeamSearch vs PipeSearch (L={search_l}, BW={beam_width})', fontsize=14) + + # QPS vs Threads + ax = axes[0][0] + ax.plot(beam["threads"], beam["qps"], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) + ax.plot(pipe["threads"], pipe["qps"], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) + ax.set_xlabel('Threads') + ax.set_ylabel('QPS') + ax.set_title('Throughput (QPS)') + ax.legend() + ax.grid(True, alpha=0.3) + + # Mean Latency vs Threads + ax = axes[0][1] + ax.plot(beam["threads"], [x/1000 for x in beam["mean_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) + ax.plot(pipe["threads"], [x/1000 for x in pipe["mean_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) + ax.set_xlabel('Threads') + ax.set_ylabel('Mean Latency (ms)') + ax.set_title('Mean Latency') + ax.legend() + ax.grid(True, alpha=0.3) + + # p95 Latency vs Threads + ax = axes[1][0] + ax.plot(beam["threads"], [x/1000 for x in beam["p95_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) + ax.plot(pipe["threads"], [x/1000 for x in pipe["p95_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) + ax.set_xlabel('Threads') + ax.set_ylabel('p95 Latency (ms)') + ax.set_title('p95 Tail Latency') + ax.legend() + ax.grid(True, alpha=0.3) + + # p99.9 Latency vs Threads + ax = axes[1][1] + ax.plot(beam["threads"], [x/1000 for x in beam["p999_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) + ax.plot(pipe["threads"], [x/1000 for x in pipe["p999_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) + ax.set_xlabel('Threads') + ax.set_ylabel('p99.9 Latency (ms)') + ax.set_title('p99.9 Tail Latency') + ax.legend() + ax.grid(True, alpha=0.3) plt.tight_layout() chart_path = os.path.join(output_dir, 'thread_sweep.png') @@ -436,9 +425,9 @@ except ImportError: csv_path = os.path.join(output_dir, 'thread_sweep.csv') with open(csv_path, 'w') as f: f.write("threads,mode,qps,mean_lat_us,p95_lat_us,p999_lat_us,recall\n") - for mode_name, d in series.items(): + for d, mode in [(beam, "BeamSearch"), (pipe, "PipeSearch")]: for i in range(len(d["threads"])): - f.write(f"{d['threads'][i]},{mode_name},{d['qps'][i]:.1f}," + f.write(f"{d['threads'][i]},{mode},{d['qps'][i]:.1f}," f"{d['mean_lat'][i]:.0f},{d['p95_lat'][i]}," f"{d['p999_lat'][i]},{d['recall'][i]:.3f}\n") print(f"CSV saved to: {csv_path}") From 65655efae575310f1ddce2770f682fe0b4b1b6ce Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 
Feb 2026 10:15:38 -0800
Subject: [PATCH 25/46] track PQ preprocess time in pipelined path

---
 .../src/search/provider/pipelined_accessor.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs
index 8def4ef31..2d28b1e8c 100644
--- a/diskann-disk/src/search/provider/pipelined_accessor.rs
+++ b/diskann-disk/src/search/provider/pipelined_accessor.rs
@@ -214,6 +214,8 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> {
     io_time: std::time::Duration,
     /// Accumulated CPU time (fp distance + PQ distance + node parsing)
     cpu_time: std::time::Duration,
+    /// PQ preprocess time (distance table construction)
+    preprocess_time: std::time::Duration,

     // Shared stats written on drop so caller can read them after search
     shared_io_stats: Arc<PipelinedIoStats>,
@@ -265,6 +267,7 @@ where
             cache_hits: 0,
             io_time: std::time::Duration::ZERO,
             cpu_time: std::time::Duration::ZERO,
+            preprocess_time: std::time::Duration::ZERO,
             shared_io_stats,
             trace: None,
         })
@@ -272,6 +275,7 @@ where

     /// Preprocess PQ distance tables for this query. Must be called before search.
     pub fn preprocess_query(&mut self) -> ANNResult<()> {
+        let timer = std::time::Instant::now();
         let metadata = self.provider.graph_header.metadata();
         let dims = metadata.dims;
         let medoid = metadata.medoid as u32;
@@ -282,6 +286,7 @@ where
             self.provider.metric,
             &[medoid],
         )?;
+        self.preprocess_time = timer.elapsed();
         Ok(())
     }

@@ -742,6 +747,9 @@ where
         self.shared_io_stats
             .cpu_us
             .fetch_add(self.cpu_time.as_micros() as u64, Ordering::Relaxed);
+        self.shared_io_stats
+            .preprocess_us
+            .fetch_add(self.preprocess_time.as_micros() as u64, Ordering::Relaxed);

         // Print trace profile if enabled (controlled by DISKANN_TRACE=1)
         if let Some(trace) = self.trace.as_mut() {
@@ -782,6 +790,7 @@ pub struct PipelinedIoStats {
     pub cache_hits: AtomicU32,
     pub io_us: std::sync::atomic::AtomicU64,
     pub cpu_us: std::sync::atomic::AtomicU64,
+    pub preprocess_us: std::sync::atomic::AtomicU64,
 }

 impl Default for PipelinedIoStats {
@@ -791,6 +800,7 @@ impl Default for PipelinedIoStats {
             cache_hits: AtomicU32::new(0),
             io_us: std::sync::atomic::AtomicU64::new(0),
             cpu_us: std::sync::atomic::AtomicU64::new(0),
+            preprocess_us: std::sync::atomic::AtomicU64::new(0),
         }
     }
 }
@@ -977,6 +987,8 @@ where
             io_stats.io_count.load(Ordering::Relaxed) + io_stats.cache_hits.load(Ordering::Relaxed);
         query_stats.io_time_us = io_stats.io_us.load(Ordering::Relaxed) as u128;
         query_stats.cpu_time_us = io_stats.cpu_us.load(Ordering::Relaxed) as u128;
+        query_stats.query_pq_preprocess_time_us =
+            io_stats.preprocess_us.load(Ordering::Relaxed) as u128;

         let mut search_result = SearchResult {
             results: Vec::with_capacity(return_list_size as usize),

From 80c5ff175e9a7fcdf67177bb033d5429817cc518 Mon Sep 17 00:00:00 2001
From: Philip Adams
Date: Wed, 11 Feb 2026 10:36:14 -0800
Subject: [PATCH 26/46] Eliminate per-query allocations in pipelined search

Move VecDeque, HashMaps, and Vec scratch collections into pooled
PipelinedScratch so they retain capacity across queries. Replace the
per-poll HashSet+VecDeque in drain_completions/wait_and_drain with a
linear scan and in-place swap_remove_back. Reuse neighbor_buf for PQ
distance computation via split borrows.

Also track PQ preprocess time through PipelinedIoStats so it is
correctly reported (was previously 0 on the pipelined path).
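The drain rewrite is the subtle part of this change, so here is a
minimal standalone sketch of the pattern under the same assumptions
(completed slot ids arrive as a small slice bounded by the io_uring
slot count; the reduced InFlightIo and the drain_completed helper
below are illustrative only — the real entries also carry a vertex id
and submission rank, and the real code parses the node and records
trace events at the removal site):

    use std::collections::VecDeque;

    struct InFlightIo {
        slot_id: usize,
    }

    /// Remove completed entries in place. `completed` is bounded by the
    /// number of io_uring slots, so a linear `contains` scan is cheaper
    /// than building a fresh HashSet on every poll, and swap_remove_back
    /// avoids both element shifting and a replacement VecDeque.
    fn drain_completed(in_flight: &mut VecDeque<InFlightIo>, completed: &[usize]) {
        let mut i = 0;
        while i < in_flight.len() {
            if completed.contains(&in_flight[i].slot_id) {
                // O(1) removal; the relative order of the remaining
                // pending IOs is not relied on.
                let _done = in_flight.swap_remove_back(i);
                // Do not advance `i`: the former back element now sits
                // at index `i` and must be examined as well.
            } else {
                i += 1;
            }
        }
    }

Because the deque lives in the pooled scratch and keeps its capacity,
the steady state of this path performs no per-query allocation at all.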
---
 .../src/search/provider/pipelined_accessor.rs | 210 +++++++++++-------
 1 file changed, 124 insertions(+), 86 deletions(-)

diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs
index 2d28b1e8c..4774dfc5d 100644
--- a/diskann-disk/src/search/provider/pipelined_accessor.rs
+++ b/diskann-disk/src/search/provider/pipelined_accessor.rs
@@ -126,6 +126,13 @@ fn max_slots(beam_width: usize) -> usize {
 pub struct PipelinedScratch {
     pub reader: PipelinedReader,
     pub pq_scratch: PQScratch,
+    // Per-query scratch collections, cleared between queries but retain capacity
+    in_flight_ios: VecDeque<InFlightIo>,
+    loaded_nodes: HashMap<u32, LoadedNode>,
+    expanded_ids: Vec<u32>,
+    distance_cache: HashMap<u32, f32>,
+    /// Reusable buffer for neighbor IDs during expand_available
+    neighbor_buf: Vec<u32>,
 }

 /// Arguments for creating or resetting a [`PipelinedScratch`].
@@ -159,11 +166,24 @@ impl TryAsPooled<PipelinedScratchArgs> for PipelinedScratch {
             args.num_pq_chunks,
             args.num_pq_centers,
         )?;
-        Ok(Self { reader, pq_scratch })
+        Ok(Self {
+            reader,
+            pq_scratch,
+            in_flight_ios: VecDeque::new(),
+            loaded_nodes: HashMap::new(),
+            expanded_ids: Vec::new(),
+            distance_cache: HashMap::new(),
+            neighbor_buf: Vec::new(),
+        })
     }

     fn try_modify(&mut self, _args: PipelinedScratchArgs) -> Result<(), Self::Error> {
         self.reader.reset();
+        self.in_flight_ios.clear();
+        self.loaded_nodes.clear();
+        self.expanded_ids.clear();
+        self.distance_cache.clear();
+        self.neighbor_buf.clear();
         Ok(())
     }
 }
@@ -194,18 +214,11 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> {
     // Node cache (shared, read-only) for avoiding disk IO on hot nodes
     node_cache: Arc>,

-    // IO state
-    in_flight_ios: VecDeque<InFlightIo>,
-    loaded_nodes: HashMap<u32, LoadedNode>,
+    // IO state (now lives in scratch for reuse, accessed via self.scratch)
     next_slot_id: usize,
     max_slots: usize,
     /// Monotonically increasing submission rank for priority-ordered expansion.
     next_rank: u64,
-    /// IDs expanded in the most recent `expand_available` call.
-    expanded_ids: Vec<u32>,
-
-    // Distance cache for post-processing rerank
-    distance_cache: HashMap<u32, f32>,

     // IO statistics
     io_count: u32,
@@ -256,13 +269,9 @@ where
             fp_vector_len,
             num_points: provider.num_points,
             node_cache,
-            in_flight_ios: VecDeque::new(),
-            loaded_nodes: HashMap::new(),
             next_slot_id: 0,
             max_slots: slots,
             next_rank: 0,
-            expanded_ids: Vec::new(),
-            distance_cache: HashMap::new(),
             io_count: 0,
             cache_hits: 0,
             io_time: std::time::Duration::ZERO,
@@ -304,20 +313,31 @@ where
     }

     /// Compute PQ distances for a set of neighbor IDs.
+    /// `ids` must not alias any mutable scratch fields used by PQ computation.
     fn pq_distances<F>(&mut self, ids: &[u32], mut f: F) -> ANNResult<()>
     where
         F: FnMut(f32, u32),
     {
-        let pq = &mut self.scratch.pq_scratch;
+        Self::pq_distances_inner(&mut self.scratch.pq_scratch, self.provider, ids, &mut f)
+    }
+
+    fn pq_distances_inner<F>(
+        pq: &mut PQScratch,
+        provider: &DiskProvider,
+        ids: &[u32],
+        f: &mut F,
+    ) -> ANNResult<()>
+    where
+        F: FnMut(f32, u32),
+    {
         compute_pq_distance(
             ids,
-            self.provider.pq_data.get_num_chunks(),
+            provider.pq_data.get_num_chunks(),
             &pq.aligned_pqtable_dist_scratch,
-            self.provider.pq_data.pq_compressed_data().get_data(),
+            provider.pq_data.pq_compressed_data().get_data(),
             &mut pq.aligned_pq_coord_scratch,
             &mut pq.aligned_dist_scratch,
         )?;
-        let pq = &self.scratch.pq_scratch;

         for (i, id) in ids.iter().enumerate() {
             f(pq.aligned_dist_scratch[i], *id);
         }
@@ -336,7 +356,7 @@ where

     /// Poll completed IOs and move data from reader buffers into loaded_nodes.
fn drain_completions(&mut self) -> ANNResult<()> { - if self.in_flight_ios.is_empty() { + if self.scratch.in_flight_ios.is_empty() { return Ok(()); } @@ -352,34 +372,14 @@ where return Ok(()); } - let completed_set: std::collections::HashSet = - completed_slots.into_iter().collect(); - - let mut remaining = VecDeque::new(); - while let Some(io) = self.in_flight_ios.pop_front() { - if completed_set.contains(&io.slot_id) { - trace.begin_phase(); - let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); - let parsed = parse_node( - sector_buf, - io.vertex_id, - self.num_nodes_per_sector, - self.node_len, - self.fp_vector_len, - )?; - trace.end_phase_parse_node(); - trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); - self.loaded_nodes.insert(io.vertex_id, LoadedNode { - fp_vector: parsed.fp_vector, - adjacency_list: parsed.adjacency_list, - rank: io.rank, - }); - } else { - remaining.push_back(io); - } - } - self.in_flight_ios = remaining; - Ok(()) + Self::process_completed_ios_inner( + &mut self.scratch, + &completed_slots, + &mut trace, + self.num_nodes_per_sector, + self.node_len, + self.fp_vector_len, + ) } /// Block until at least one IO completes, then eagerly drain all available. fn wait_and_drain(&mut self) -> ANNResult<()> { @@ -394,32 +394,52 @@ where return Ok(()); } - let completed_set: std::collections::HashSet = - completed_slots.into_iter().collect(); - let mut remaining = VecDeque::new(); - while let Some(io) = self.in_flight_ios.pop_front() { - if completed_set.contains(&io.slot_id) { + Self::process_completed_ios_inner( + &mut self.scratch, + &completed_slots, + &mut trace, + self.num_nodes_per_sector, + self.node_len, + self.fp_vector_len, + ) + } + + /// Shared logic: process completed slot IDs, parse nodes, retain in-flight. + /// Uses linear scan on completed_slots (small, bounded by max_slots) to + /// avoid per-poll HashSet allocation. + fn process_completed_ios_inner( + scratch: &mut PipelinedScratch, + completed_slots: &[usize], + trace: &mut OptionalTrace<'_>, + num_nodes_per_sector: u64, + node_len: u64, + fp_vector_len: u64, + ) -> ANNResult<()> { + let mut i = 0; + while i < scratch.in_flight_ios.len() { + let io = &scratch.in_flight_ios[i]; + if completed_slots.contains(&io.slot_id) { + let io = scratch.in_flight_ios.swap_remove_back(i).unwrap(); trace.begin_phase(); - let sector_buf = self.scratch.reader.get_slot_buf(io.slot_id); + let sector_buf = scratch.reader.get_slot_buf(io.slot_id); let parsed = parse_node( sector_buf, io.vertex_id, - self.num_nodes_per_sector, - self.node_len, - self.fp_vector_len, + num_nodes_per_sector, + node_len, + fp_vector_len, )?; trace.end_phase_parse_node(); trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); - self.loaded_nodes.insert(io.vertex_id, LoadedNode { + scratch.loaded_nodes.insert(io.vertex_id, LoadedNode { fp_vector: parsed.fp_vector, adjacency_list: parsed.adjacency_list, rank: io.rank, }); } else { - remaining.push_back(io); + i += 1; } } - self.in_flight_ios = remaining; Ok(()) } } @@ -528,7 +548,16 @@ where F: Send + FnMut(f32, Self::Id), Itr: Iterator, { - self.pq_distances(&vec_id_itr.collect::>(), f) + self.scratch.neighbor_buf.clear(); + self.scratch.neighbor_buf.extend(vec_id_itr); + let mut f = f; + let PipelinedScratch { ref mut pq_scratch, ref neighbor_buf, .. 
} = *self.scratch; + Self::pq_distances_inner( + pq_scratch, + self.provider, + neighbor_buf, + &mut f, + ) } } @@ -544,7 +573,7 @@ where let io_start = Instant::now(); trace.begin_phase(); for id in ids { - if self.loaded_nodes.contains_key(&id) { + if self.scratch.loaded_nodes.contains_key(&id) { continue; // Already loaded from a previous IO } @@ -558,7 +587,7 @@ where let adjacency_list: Vec = adj_list.iter().copied().collect(); let rank = self.next_rank; self.next_rank += 1; - self.loaded_nodes.insert(id, LoadedNode { fp_vector, adjacency_list, rank }); + self.scratch.loaded_nodes.insert(id, LoadedNode { fp_vector, adjacency_list, rank }); self.cache_hits += 1; trace.event(TraceEventKind::CacheHit { node_id: id }); continue; @@ -566,7 +595,7 @@ where // Don't submit if all io_uring slots are occupied — prevents overwriting // buffers that still have in-flight reads. - if self.in_flight_ios.len() >= self.max_slots { + if self.scratch.in_flight_ios.len() >= self.max_slots { break; } @@ -578,14 +607,14 @@ where self.next_rank += 1; // Best-effort: if submission fails, the node will be retried if self.scratch.reader.submit_read(sector_offset, slot_id).is_ok() { - self.in_flight_ios.push_back(InFlightIo { + self.scratch.in_flight_ios.push_back(InFlightIo { vertex_id: id, slot_id, rank, }); trace.event(TraceEventKind::Submit { node_id: id, - inflight: self.in_flight_ios.len(), + inflight: self.scratch.in_flight_ios.len(), }); self.next_slot_id = (self.next_slot_id + 1) % self.max_slots; self.io_count += 1; @@ -613,19 +642,19 @@ where F: FnMut(f32, Self::Id) + Send, { async move { - self.expanded_ids.clear(); + self.scratch.expanded_ids.clear(); // Non-blocking poll for completions self.drain_completions()?; - if self.loaded_nodes.is_empty() { + if self.scratch.loaded_nodes.is_empty() { return Ok(0); } // Try caller's priority order first let mut best_vid: Option = None; for id in ids { - if self.loaded_nodes.contains_key(&id) { + if self.scratch.loaded_nodes.contains_key(&id) { best_vid = Some(id); break; } @@ -634,6 +663,7 @@ where // Fallback: pick loaded node with lowest rank (best PQ at submission) if best_vid.is_none() { best_vid = self + .scratch .loaded_nodes .iter() .min_by_key(|(_, node)| node.rank) @@ -644,8 +674,8 @@ where Some(id) => id, None => return Ok(0), }; - let node = self.loaded_nodes.remove(&vid).unwrap(); - self.expanded_ids.push(vid); + let node = self.scratch.loaded_nodes.remove(&vid).unwrap(); + self.scratch.expanded_ids.push(vid); // Compute full-precision distance and cache it for post-processing let cpu_start = Instant::now(); @@ -657,20 +687,27 @@ where if let Some(t) = self.trace.as_mut() { t.profile.fp_distance_us += cpu_start.elapsed().as_micros() as u64; } - self.distance_cache.insert(vid, fp_dist); - - // Get unvisited neighbors - let neighbors: Vec = node - .adjacency_list - .iter() - .copied() - .filter(|&nbr| (nbr as usize) < self.num_points && pred.eval_mut(&nbr)) - .collect(); - let num_new = neighbors.len() as u32; - - if !neighbors.is_empty() { + self.scratch.distance_cache.insert(vid, fp_dist); + + // Get unvisited neighbors into reusable buffer + self.scratch.neighbor_buf.clear(); + self.scratch.neighbor_buf.extend( + node.adjacency_list + .iter() + .copied() + .filter(|&nbr| (nbr as usize) < self.num_points && pred.eval_mut(&nbr)), + ); + let num_new = self.scratch.neighbor_buf.len() as u32; + + if !self.scratch.neighbor_buf.is_empty() { let pq_start = Instant::now(); - self.pq_distances(&neighbors, &mut on_neighbors)?; + let PipelinedScratch { 
ref mut pq_scratch, ref neighbor_buf, .. } = *self.scratch; + Self::pq_distances_inner( + pq_scratch, + self.provider, + neighbor_buf, + &mut on_neighbors, + )?; if let Some(t) = self.trace.as_mut() { t.profile.pq_distance_us += pq_start.elapsed().as_micros() as u64; } @@ -693,22 +730,22 @@ where /// Returns true when there are in-flight IO operations. fn has_pending(&self) -> bool { - !self.in_flight_ios.is_empty() + !self.scratch.in_flight_ios.is_empty() } fn inflight_count(&self) -> usize { - self.in_flight_ios.len() + self.scratch.in_flight_ios.len() } fn wait_for_io(&mut self) { // Only block if there are actually in-flight IOs to wait for - if !self.in_flight_ios.is_empty() { + if !self.scratch.in_flight_ios.is_empty() { let _ = self.wait_and_drain(); } } fn last_expanded_ids(&self) -> &[u32] { - &self.expanded_ids + &self.scratch.expanded_ids } fn is_pipelined(&self) -> bool { @@ -846,6 +883,7 @@ where // full_retset approach: every expanded node contributes to results // regardless of its PQ distance ranking. let mut reranked: Vec<((u32, Data::AssociatedDataType), f32)> = accessor + .scratch .distance_cache .iter() .filter(|(id, _)| (self.filter)(id)) From 1ca067b78f50ba59553456914e2d28b6c771ab97 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 11:11:01 -0800 Subject: [PATCH 27/46] Pool LoadedNode instances to eliminate per-IO Vec allocations Add node_pool freelist to PipelinedScratch. LoadedNode instances are acquired from the pool before parsing (reusing Vec/Vec capacity) and returned after expansion. parse_from() clears and extends existing Vecs instead of allocating new ones. Results at L=10: +10.8% QPS, -84% p999 tail latency vs prior commit. --- .../src/search/provider/pipelined_accessor.rs | 151 +++++++++++------- 1 file changed, 90 insertions(+), 61 deletions(-) diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 4774dfc5d..5482f88f2 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -55,60 +55,58 @@ struct LoadedNode { rank: u64, } -/// Tracks an in-flight IO request. -struct InFlightIo { - vertex_id: u32, - slot_id: usize, - rank: u64, -} +impl LoadedNode { + /// Reset and fill from sector buffer, reusing existing Vec capacity. + fn parse_from( + &mut self, + sector_buf: &[u8], + vertex_id: u32, + num_nodes_per_sector: u64, + node_len: u64, + fp_vector_len: u64, + rank: u64, + ) -> ANNResult<()> { + let offset = node_offset_in_sector(vertex_id, num_nodes_per_sector, node_len); + let end = offset + node_len as usize; + let node_data = sector_buf.get(offset..end).ok_or_else(|| { + ANNError::log_index_error(format_args!( + "Node data out of bounds: vertex {} offset {}..{} in buffer of len {}", + vertex_id, offset, end, sector_buf.len() + )) + })?; + + let fp_len = fp_vector_len as usize; + if fp_len > node_data.len() { + return Err(ANNError::log_index_error(format_args!( + "fp_vector_len {} exceeds node_data len {}", + fp_len, node_data.len() + ))); + } -/// Parsed node data from a sector buffer (without rank metadata). -struct ParsedNode { - fp_vector: Vec, - adjacency_list: Vec, -} + self.fp_vector.clear(); + self.fp_vector.extend_from_slice(&node_data[..fp_len]); -/// Parse a node from raw sector buffer bytes. 
-fn parse_node( - sector_buf: &[u8], - vertex_id: u32, - num_nodes_per_sector: u64, - node_len: u64, - fp_vector_len: u64, -) -> ANNResult { - let offset = node_offset_in_sector(vertex_id, num_nodes_per_sector, node_len); - let end = offset + node_len as usize; - let node_data = sector_buf.get(offset..end).ok_or_else(|| { - ANNError::log_index_error(format_args!( - "Node data out of bounds: vertex {} offset {}..{} in buffer of len {}", - vertex_id, offset, end, sector_buf.len() - )) - })?; - - let fp_vector_len_usize = fp_vector_len as usize; - if fp_vector_len_usize > node_data.len() { - return Err(ANNError::log_index_error(format_args!( - "fp_vector_len {} exceeds node_data len {}", - fp_vector_len_usize, - node_data.len() - ))); - } + let neighbor_data = &node_data[fp_len..]; + let num_neighbors = LittleEndian::read_u32(&neighbor_data[..4]) as usize; + let max_neighbors = (neighbor_data.len().saturating_sub(4)) / 4; + let num_neighbors = num_neighbors.min(max_neighbors); + + self.adjacency_list.clear(); + for i in 0..num_neighbors { + let start = 4 + i * 4; + self.adjacency_list.push(LittleEndian::read_u32(&neighbor_data[start..start + 4])); + } - let fp_vector = node_data[..fp_vector_len_usize].to_vec(); - let neighbor_data = &node_data[fp_vector_len_usize..]; - let num_neighbors = LittleEndian::read_u32(&neighbor_data[..4]) as usize; - let max_neighbors = (neighbor_data.len().saturating_sub(4)) / 4; - let num_neighbors = num_neighbors.min(max_neighbors); - let mut adjacency_list = Vec::with_capacity(num_neighbors); - for i in 0..num_neighbors { - let start = 4 + i * 4; - adjacency_list.push(LittleEndian::read_u32(&neighbor_data[start..start + 4])); + self.rank = rank; + Ok(()) } +} - Ok(ParsedNode { - fp_vector, - adjacency_list, - }) +/// Tracks an in-flight IO request. +struct InFlightIo { + vertex_id: u32, + slot_id: usize, + rank: u64, } /// Max buffer slots to use, based on beam width. @@ -133,6 +131,8 @@ pub struct PipelinedScratch { distance_cache: HashMap, /// Reusable buffer for neighbor IDs during expand_available neighbor_buf: Vec, + /// Freelist of LoadedNode instances to avoid per-node allocation + node_pool: Vec, } /// Arguments for creating or resetting a [`PipelinedScratch`]. @@ -174,13 +174,15 @@ impl TryAsPooled for PipelinedScratch { expanded_ids: Vec::new(), distance_cache: HashMap::new(), neighbor_buf: Vec::new(), + node_pool: Vec::new(), }) } fn try_modify(&mut self, _args: PipelinedScratchArgs) -> Result<(), Self::Error> { self.reader.reset(); + // Return all loaded_nodes back to the pool before clearing + self.node_pool.extend(self.loaded_nodes.drain().map(|(_, node)| node)); self.in_flight_ios.clear(); - self.loaded_nodes.clear(); self.expanded_ids.clear(); self.distance_cache.clear(); self.neighbor_buf.clear(); @@ -188,6 +190,22 @@ impl TryAsPooled for PipelinedScratch { } } +impl PipelinedScratch { + /// Get a LoadedNode from the pool, or create a new empty one. + fn acquire_node(&mut self) -> LoadedNode { + self.node_pool.pop().unwrap_or_else(|| LoadedNode { + fp_vector: Vec::new(), + adjacency_list: Vec::new(), + rank: 0, + }) + } + + /// Return a LoadedNode to the pool for reuse. + fn release_node(&mut self, node: LoadedNode) { + self.node_pool.push(node); + } +} + // --------------------------------------------------------------------------- // PipelinedDiskAccessor // --------------------------------------------------------------------------- @@ -406,7 +424,8 @@ where /// Shared logic: process completed slot IDs, parse nodes, retain in-flight. 
/// Uses linear scan on completed_slots (small, bounded by max_slots) to - /// avoid per-poll HashSet allocation. + /// avoid per-poll HashSet allocation. Reuses LoadedNode instances from the + /// node pool to avoid per-IO Vec allocations. fn process_completed_ios_inner( scratch: &mut PipelinedScratch, completed_slots: &[usize], @@ -421,21 +440,25 @@ where if completed_slots.contains(&io.slot_id) { let io = scratch.in_flight_ios.swap_remove_back(i).unwrap(); trace.begin_phase(); + // Acquire node first (mutably borrows node_pool), + // then get sector buf (immutably borrows reader) — no conflict. + let mut node = scratch.node_pool.pop().unwrap_or_else(|| LoadedNode { + fp_vector: Vec::new(), + adjacency_list: Vec::new(), + rank: 0, + }); let sector_buf = scratch.reader.get_slot_buf(io.slot_id); - let parsed = parse_node( + node.parse_from( sector_buf, io.vertex_id, num_nodes_per_sector, node_len, fp_vector_len, + io.rank, )?; trace.end_phase_parse_node(); trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); - scratch.loaded_nodes.insert(io.vertex_id, LoadedNode { - fp_vector: parsed.fp_vector, - adjacency_list: parsed.adjacency_list, - rank: io.rank, - }); + scratch.loaded_nodes.insert(io.vertex_id, node); } else { i += 1; } @@ -583,11 +606,14 @@ where self.node_cache.get_vector(&id), self.node_cache.get_adjacency_list(&id), ) { - let fp_vector: Vec = bytemuck::cast_slice(vec_data).to_vec(); - let adjacency_list: Vec = adj_list.iter().copied().collect(); - let rank = self.next_rank; + let mut node = self.scratch.acquire_node(); + node.fp_vector.clear(); + node.fp_vector.extend_from_slice(bytemuck::cast_slice(vec_data)); + node.adjacency_list.clear(); + node.adjacency_list.extend(adj_list.iter().copied()); + node.rank = self.next_rank; self.next_rank += 1; - self.scratch.loaded_nodes.insert(id, LoadedNode { fp_vector, adjacency_list, rank }); + self.scratch.loaded_nodes.insert(id, node); self.cache_hits += 1; trace.event(TraceEventKind::CacheHit { node_id: id }); continue; @@ -724,6 +750,9 @@ where }); } + // Return node to pool for reuse + self.scratch.release_node(node); + Ok(1) } } From 028e2159241356421e7ad150c06b081a42e58bf2 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 11:24:51 -0800 Subject: [PATCH 28/46] Remove old standalone PipelinedSearcher implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Delete pipelined_search.rs and pipelined_searcher.rs — the standalone search loop is fully replaced by PipelinedDiskAccessor which plugs into the generic search_internal() loop via the ExpandBeam trait. Remove associated tests from disk_provider.rs and builder/core.rs. Keep pipelined_reader.rs (io_uring reader used by the accessor). 
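To make the replacement concrete, the sketch below is an illustrative
reduction of the queue-style surface the generic loop now drives, not
the actual ExpandBeam definition: the trait and driver names are
hypothetical, the method names only loosely mirror accessor methods
visible in the earlier patches (submit/expand_available/has_pending/
wait_for_io), the real expand path is async, and the real loop keeps a
sorted candidate pool rather than this transient frontier:

    // Illustrative reduction of the queue-based surface the generic
    // search loop drives. Hypothetical names; the real ExpandBeam
    // integration is async and generic over the id type.
    trait PipelinedExpand {
        /// Best-effort: queue non-blocking reads for candidate ids,
        /// skipping already-loaded ids and stopping when slots run out.
        fn submit(&mut self, ids: &[u32]);
        /// Expand at most one already-loaded node, pushing its unvisited
        /// neighbors into `out`; returns how many nodes were expanded.
        fn expand_available(&mut self, out: &mut Vec<u32>) -> usize;
        /// True while any read is still in flight.
        fn has_pending(&self) -> bool;
        /// Block until at least one in-flight read completes.
        fn wait_for_io(&mut self);
    }

    // Skeleton of the overlap: reads stay queued while expansion
    // proceeds, and the thread blocks only when nothing is parsed and
    // ready to expand.
    fn drive(acc: &mut impl PipelinedExpand, mut frontier: Vec<u32>) {
        while !frontier.is_empty() || acc.has_pending() {
            acc.submit(&frontier);
            frontier.clear();
            if acc.expand_available(&mut frontier) == 0 && acc.has_pending() {
                acc.wait_for_io();
            }
        }
    }

With that surface living behind search_internal(), the standalone
pipe_search loop deleted below is pure duplication.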
--- diskann-disk/src/build/builder/core.rs | 88 -- diskann-disk/src/build/builder/tests.rs | 12 - diskann-disk/src/search/pipelined/mod.rs | 23 +- .../src/search/pipelined/pipelined_search.rs | 841 ------------------ .../search/pipelined/pipelined_searcher.rs | 425 --------- .../src/search/provider/disk_provider.rs | 196 +--- .../src/search/provider/pipelined_accessor.rs | 2 +- 7 files changed, 6 insertions(+), 1581 deletions(-) delete mode 100644 diskann-disk/src/search/pipelined/pipelined_search.rs delete mode 100644 diskann-disk/src/search/pipelined/pipelined_searcher.rs diff --git a/diskann-disk/src/build/builder/core.rs b/diskann-disk/src/build/builder/core.rs index 9c8fffa61..d27182e83 100644 --- a/diskann-disk/src/build/builder/core.rs +++ b/diskann-disk/src/build/builder/core.rs @@ -1119,94 +1119,6 @@ pub(crate) mod disk_index_builder_tests { Ok(()) } - /// Verifies search results via PipelinedSearcher (PipeANN) have good recall - /// against ground truth computed from the dataset. - #[cfg(target_os = "linux")] - pub(crate) fn verify_search_result_with_ground_truth_pipelined< - G: GraphDataType, - >( - params: &TestParams, - top_k: usize, - search_l: u32, - storage_provider: &Arc>, - ) -> ANNResult<()> { - use crate::search::pipelined::{PipelinedSearcher, PipelinedReaderConfig}; - use crate::search::traits::vertex_provider_factory::VertexProviderFactory; - - let pq_pivot_path = get_pq_pivot_file(¶ms.index_path_prefix); - let pq_compressed_path = get_compressed_pq_file(¶ms.index_path_prefix); - - let index_reader = DiskIndexReader::::new( - pq_pivot_path, - pq_compressed_path, - storage_provider.as_ref(), - )?; - let pq_data = index_reader.get_pq_data(); - - let vertex_provider_factory = DiskVertexProviderFactory::::new( - VirtualAlignedReaderFactory::new( - get_disk_index_file(¶ms.index_path_prefix), - Arc::clone(storage_provider), - ), - CachingStrategy::None, - )?; - let graph_header = vertex_provider_factory.get_header()?; - - // Resolve real filesystem path (PipelinedSearcher uses O_DIRECT). - let vfs_suffix = params.index_path_prefix.trim_start_matches('/'); - let real_index_path = diskann_utils::test_data_root() - .join(format!("{}_disk.index", vfs_suffix)); - let real_index_path_str = real_index_path.to_str().unwrap(); - - let pipe_searcher = PipelinedSearcher::::new( - graph_header, - pq_data, - params.metric, - 4, - None, - real_index_path_str.to_string(), - PipelinedReaderConfig::default(), - )?; - - let (data, npoints, dim) = file_util::load_bin::( - storage_provider.as_ref(), - ¶ms.data_path, - 0, - )?; - let data = - diskann_utils::views::Matrix::try_from(data.into(), npoints, dim).bridge_err()?; - let distance = ::distance(params.metric, Some(dim)); - - for (q, query_data) in data.row_iter().enumerate() { - let gt = - diskann_providers::test_utils::groundtruth(data.as_view(), query_data, |a, b| { - distance.evaluate_similarity(a, b) - }); - - let result = - pipe_searcher.search(query_data, top_k as u32, search_l, 4, None)?; - let result_ids: Vec = - result.results.iter().map(|item| item.vertex_id).collect(); - - let gt_ids: Vec = gt.iter().take(top_k).map(|n| n.id).collect(); - let matching = result_ids - .iter() - .filter(|id| gt_ids.contains(id)) - .count(); - let recall = matching as f32 / top_k as f32; - assert!( - recall >= 0.8, - "PipeANN recall {:.0}% < 80% for query {}, got {:?}, expected {:?}", - recall * 100.0, - q, - result_ids, - gt_ids, - ); - } - - Ok(()) - } - // Compare that the index built in test is the same as the truth index. 
The truth index doesn't have associated data, we are only comparing the vector and neighbor data. pub fn compare_disk_index_graphs(graph_data: &[u8], truth_graph_data: &[u8]) { let graph_header = GraphHeader::try_from(&graph_data[8..]).unwrap(); diff --git a/diskann-disk/src/build/builder/tests.rs b/diskann-disk/src/build/builder/tests.rs index cb73bf239..e73e552a7 100644 --- a/diskann-disk/src/build/builder/tests.rs +++ b/diskann-disk/src/build/builder/tests.rs @@ -36,9 +36,6 @@ mod chunkable_disk_index_build_tests { QuantizationType, }; - #[cfg(target_os = "linux")] - use crate::build::builder::core::disk_index_builder_tests::verify_search_result_with_ground_truth_pipelined; - #[derive(PartialEq)] enum BuildType { AsyncFP, @@ -195,15 +192,6 @@ mod chunkable_disk_index_build_tests { ) .unwrap(); - #[cfg(target_os = "linux")] - verify_search_result_with_ground_truth_pipelined::( - &fixture.params, - top_k, - search_l, - &fixture.storage_provider, - ) - .unwrap(); - remove_checkpoint_record_file(&index_path_prefix); } diff --git a/diskann-disk/src/search/pipelined/mod.rs b/diskann-disk/src/search/pipelined/mod.rs index 26378da4f..fd66d64d8 100644 --- a/diskann-disk/src/search/pipelined/mod.rs +++ b/diskann-disk/src/search/pipelined/mod.rs @@ -3,18 +3,11 @@ * Licensed under the MIT license. */ -//! Pipelined search module implementing the PipeANN algorithm. +//! Pipelined IO reader for disk search using io_uring. //! -//! This module provides a pipelined disk search that overlaps IO and compute -//! within a single query, using io_uring for non-blocking IO on Linux. -//! -//! # Safety -//! -//! This search implementation is designed for **read-only search on completed -//! (static) disk indices**. It bypasses the synchronized `DiskProvider` path and -//! reads raw sectors directly via O_DIRECT, so it must NOT be used concurrently -//! with index modifications (build, insert, delete). For search during streaming -//! operations, use `DiskIndexSearcher` (beam search) instead. +//! Provides [`PipelinedReader`] for non-blocking sector reads with O_DIRECT, +//! used by [`PipelinedDiskAccessor`](super::provider::pipelined_accessor::PipelinedDiskAccessor) +//! to overlap IO and compute within a single query. #[cfg(target_os = "linux")] mod pipelined_reader; @@ -24,11 +17,3 @@ pub use pipelined_reader::PipelinedReader; pub use pipelined_reader::PipelinedReaderConfig; #[cfg(target_os = "linux")] pub use pipelined_reader::MAX_IO_CONCURRENCY; - -#[cfg(target_os = "linux")] -mod pipelined_search; - -#[cfg(target_os = "linux")] -mod pipelined_searcher; -#[cfg(target_os = "linux")] -pub use pipelined_searcher::PipelinedSearcher; diff --git a/diskann-disk/src/search/pipelined/pipelined_search.rs b/diskann-disk/src/search/pipelined/pipelined_search.rs deleted file mode 100644 index 0a3c045eb..000000000 --- a/diskann-disk/src/search/pipelined/pipelined_search.rs +++ /dev/null @@ -1,841 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -//! Core PipeANN pipelined search algorithm. 
- -use std::collections::{HashMap, HashSet, VecDeque}; -use std::time::Instant; - -use byteorder::{ByteOrder, LittleEndian}; -use diskann::{utils::VectorRepr, ANNError, ANNResult}; -use diskann_providers::model::{compute_pq_distance, pq::quantizer_preprocess, PQData, PQScratch}; -use diskann_vector::{distance::Metric, DistanceFunction}; - -use super::pipelined_reader::PipelinedReader; -use crate::search::search_trace::{OptionalTrace, SearchTrace, TraceEventKind}; -use crate::search::sector_math::{node_offset_in_sector, node_sector_index}; - -/// A candidate in the sorted candidate pool. -struct Candidate { - id: u32, - distance: f32, - /// true = unvisited and not in-flight, false = in-flight or already-read - flag: bool, - /// true = node has been processed (neighbors expanded) - visited: bool, -} - -/// Tracks an in-flight IO request. -struct InFlightIo { - vertex_id: u32, - slot_id: usize, -} - -/// A loaded node parsed from sector data. -struct LoadedNode { - fp_vector: Vec, - adjacency_list: Vec, -} - -/// Result of a pipelined search. -pub struct PipeSearchResult { - pub ids: Vec, - pub distances: Vec, - pub stats: PipeSearchStats, -} - -/// Statistics for a pipelined search. -pub struct PipeSearchStats { - pub total_us: u128, - pub io_us: u128, - pub cpu_us: u128, - pub io_count: u32, - pub comparisons: u32, - pub hops: u32, -} - -/// Parse a node from raw sector buffer bytes. -fn parse_node( - sector_buf: &[u8], - vertex_id: u32, - num_nodes_per_sector: u64, - node_len: u64, - fp_vector_len: u64, -) -> ANNResult { - let offset = node_offset_in_sector(vertex_id, num_nodes_per_sector, node_len); - let end = offset + node_len as usize; - let node_data = sector_buf.get(offset..end).ok_or_else(|| { - ANNError::log_index_error(format_args!( - "Node data out of bounds: vertex {} offset {}..{} in buffer of len {}", - vertex_id, - offset, - end, - sector_buf.len() - )) - })?; - - let fp_vector_len_usize = fp_vector_len as usize; - if fp_vector_len_usize > node_data.len() { - return Err(ANNError::log_index_error(format_args!( - "fp_vector_len {} exceeds node_data len {}", - fp_vector_len_usize, - node_data.len() - ))); - } - - // Copy required: the slot buffer will be reused for subsequent IOs while - // the parsed node remains in id_buf_map until visited. - let fp_vector = node_data[..fp_vector_len_usize].to_vec(); - - let neighbor_data = &node_data[fp_vector_len_usize..]; - let num_neighbors = LittleEndian::read_u32(&neighbor_data[..4]) as usize; - // Clamp to the available data to avoid out-of-bounds reads. - let max_neighbors = (neighbor_data.len().saturating_sub(4)) / 4; - let num_neighbors = num_neighbors.min(max_neighbors); - let mut adjacency_list = Vec::with_capacity(num_neighbors); - for i in 0..num_neighbors { - let start = 4 + i * 4; - adjacency_list.push(LittleEndian::read_u32(&neighbor_data[start..start + 4])); - } - - Ok(LoadedNode { - fp_vector, - adjacency_list, - }) -} - -/// Insert a candidate into the sorted retset, maintaining sort order by distance. -/// Returns the insertion position. 
-fn insert_into_pool(retset: &mut Vec, pool_size: &mut usize, candidate: Candidate) -> usize { - // Binary search for insertion point - let pos = retset[..*pool_size] - .binary_search_by(|probe| probe.distance.total_cmp(&candidate.distance)) - .unwrap_or_else(|x| x); - - // If pool is full and candidate is worse than all existing, don't insert - if pos >= retset.len() { - return pos; - } - - // Make room if needed - if *pool_size >= retset.len() { - retset.resize_with(retset.len() * 2, || Candidate { - id: 0, - distance: f32::MAX, - flag: false, - visited: false, - }); - } - - // Shift elements right - let end = (*pool_size).min(retset.len() - 1); - for i in (pos..end).rev() { - retset.swap(i, i + 1); - } - retset[pos] = candidate; - - pos -} - -/// Core pipelined search function implementing the PipeANN algorithm. -#[allow(clippy::too_many_arguments)] -pub(crate) fn pipe_search( - reader: &mut PipelinedReader, - pq_data: &PQData, - distance_comparer: &T::Distance, - query: &[T], - k: usize, - search_l: usize, - beam_width: usize, - medoid: u32, - dims: usize, - node_len: u64, - num_nodes_per_sector: u64, - block_size: usize, - fp_vector_len: u64, - pq_scratch: &mut PQScratch, - relaxed_monotonicity_l: Option, - metric: Metric, - vector_filter: Option<&(dyn Fn(&u32) -> bool + Send + Sync)>, - trace: Option<&mut SearchTrace>, -) -> ANNResult { - let mut trace = OptionalTrace(trace); - let timer = Instant::now(); - let mut io_count: u32 = 0; - let mut comparisons: u32 = 0; - let mut hops: u32 = 0; - let mut io_time = std::time::Duration::ZERO; - let mut cpu_time = std::time::Duration::ZERO; - - let num_pq_chunks = pq_data.get_num_chunks(); - let pq_compressed = pq_data.pq_compressed_data().get_data(); - let num_pts = pq_compressed.len() / num_pq_chunks; - - let num_sectors_per_node = if num_nodes_per_sector > 0 { - 1 - } else { - (node_len as usize).div_ceil(block_size) - }; - - // Prepare PQ distance table for the query and compute PQ distance to medoid - pq_scratch.set(dims, query, 1.0)?; - let medoid_ids = [medoid]; - quantizer_preprocess(pq_scratch, pq_data, metric, &medoid_ids)?; - let medoid_dist = pq_scratch.aligned_dist_scratch[0]; - - // Initialize candidate pool - let initial_cap = search_l * 2 + 10; - let mut retset: Vec = Vec::with_capacity(initial_cap); - for _ in 0..initial_cap { - retset.push(Candidate { - id: 0, - distance: f32::MAX, - flag: false, - visited: false, - }); - } - retset[0] = Candidate { - id: medoid, - distance: medoid_dist, - flag: true, - visited: false, - }; - let mut cur_list_size: usize = 1; - - let mut visited = HashSet::new(); - visited.insert(medoid); - - let mut full_retset: Vec<(u32, f32)> = Vec::with_capacity(search_l * 2); - - let mut on_flight_ios: VecDeque = VecDeque::new(); - let mut id_buf_map: HashMap = HashMap::new(); - let mut next_slot_id: usize = 0; - - let mut cur_beam_width: usize = beam_width.min(4); - let mut max_marker: usize = 0; - let mut cur_n_in: usize = 0; - let mut cur_tot: usize = 0; - let mut converge_size: i64 = -1; - - // Closure-like helpers implemented as inline functions via the loop body - - // Submit initial reads - { - let io_start = Instant::now(); - let to_send = cur_beam_width.saturating_sub(on_flight_ios.len()); - let mut n_sent = 0; - let mut marker = 0; - while marker < cur_list_size && n_sent < to_send { - if retset[marker].flag && !id_buf_map.contains_key(&retset[marker].id) { - // Send read for this candidate - let vid = retset[marker].id; - retset[marker].flag = false; - - let sector_idx = - 
node_sector_index(vid, num_nodes_per_sector, num_sectors_per_node); - let sector_offset = sector_idx * block_size as u64; - let slot_id = next_slot_id % max_slots(beam_width); - reader.submit_read(sector_offset, slot_id)?; - on_flight_ios.push_back(InFlightIo { - vertex_id: vid, - slot_id, - }); - next_slot_id = (next_slot_id + 1) % max_slots(beam_width); - io_count += 1; - n_sent += 1; - } - marker += 1; - } - io_time += io_start.elapsed(); - } - - // Main search loop - loop { - // Check if there's a first unvisited candidate - let first_unvisited = retset[..cur_list_size] - .iter() - .position(|c| !c.visited); - if first_unvisited.is_none() { - break; - } - - // Poll completions (non-blocking). Keeping this non-blocking is critical - // for overlapping IO and compute — blocking here would serialize the pipeline. - let io_poll_start = Instant::now(); - trace.begin_phase(); - let completed_slots = reader.poll_completions()?; - trace.end_phase_io_poll(); - io_time += io_poll_start.elapsed(); - let mut n_in: usize = 0; - let mut n_out: usize = 0; - - // Process completed IOs: move from on_flight to id_buf_map - if !completed_slots.is_empty() { - let completed_set: HashSet = completed_slots.into_iter().collect(); - let mut remaining = VecDeque::new(); - while let Some(io) = on_flight_ios.pop_front() { - if completed_set.contains(&io.slot_id) { - let sector_buf = reader.get_slot_buf(io.slot_id); - trace.begin_phase(); - let node = parse_node( - sector_buf, - io.vertex_id, - num_nodes_per_sector, - node_len, - fp_vector_len, - )?; - trace.end_phase_parse_node(); - trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); - // Track convergence: is this node still in the top of retset? - if cur_list_size > 0 { - let last_dist = retset[cur_list_size - 1].distance; - // Find this node's PQ distance in retset - let in_pool = retset[..cur_list_size] - .iter() - .any(|c| c.id == io.vertex_id && c.distance <= last_dist); - if in_pool { - n_in += 1; - } else { - n_out += 1; - } - } - id_buf_map.insert(io.vertex_id, node); - } else { - remaining.push_back(io); - } - } - on_flight_ios = remaining; - } - - // Track convergence and adjust beam width - if max_marker >= 5 && (n_in + n_out) > 0 { - cur_n_in += n_in; - cur_tot += n_in + n_out; - const WASTE_THRESHOLD: f64 = 0.1; - if (cur_tot - cur_n_in) as f64 / cur_tot as f64 <= WASTE_THRESHOLD { - cur_beam_width = (cur_beam_width + 1).max(4).min(beam_width); - } - if let Some(rm_l) = relaxed_monotonicity_l { - if rm_l > 0 && converge_size < 0 { - converge_size = full_retset.len() as i64; - } - } - } - - // Check relaxed monotonicity termination - if let Some(rm_l) = relaxed_monotonicity_l { - if rm_l > 0 - && converge_size >= 0 - && full_retset.len() >= (converge_size as usize) + rm_l - { - break; - } - } - - // Submit more reads if room - if on_flight_ios.len() < cur_beam_width { - let io_submit_start = Instant::now(); - trace.begin_phase(); - let to_send = 1; - let mut n_sent = 0; - let mut marker = 0; - while marker < cur_list_size && n_sent < to_send { - let c = &retset[marker]; - if c.flag && !id_buf_map.contains_key(&c.id) { - let vid = retset[marker].id; - retset[marker].flag = false; - - let sector_idx = - node_sector_index(vid, num_nodes_per_sector, num_sectors_per_node); - let sector_offset = sector_idx * block_size as u64; - let slot_id = next_slot_id % max_slots(beam_width); - reader.submit_read(sector_offset, slot_id)?; - on_flight_ios.push_back(InFlightIo { - vertex_id: vid, - slot_id, - }); - trace.event(TraceEventKind::Submit { - 
node_id: vid, - inflight: on_flight_ios.len(), - }); - next_slot_id = (next_slot_id + 1) % max_slots(beam_width); - io_count += 1; - n_sent += 1; - } - marker += 1; - } - trace.end_phase_io_submit(); - io_time += io_submit_start.elapsed(); - } - - // calc_best_node: find one node in id_buf_map that's in retset and unvisited, process it - let cpu_start = Instant::now(); - let mut best_marker = cur_list_size; - let calc_limit = cur_list_size; - #[allow(clippy::needless_range_loop)] - for i in 0..calc_limit { - if !retset[i].visited && id_buf_map.contains_key(&retset[i].id) { - retset[i].flag = false; - retset[i].visited = true; - let vid = retset[i].id; - hops += 1; - - if let Some(node) = id_buf_map.get(&vid) { - // Compute full-precision distance; only add to results if - // filter is absent or the node passes the filter predicate. - trace.begin_phase(); - let fp_vec: &[T] = bytemuck::cast_slice(&node.fp_vector); - let fp_dist = distance_comparer.evaluate_similarity(query, fp_vec); - trace.end_phase_fp_distance(); - if vector_filter.map_or(true, |f| f(&vid)) { - full_retset.push((vid, fp_dist)); - } - - // Expand neighbors - let mut nbors_to_compute: Vec = Vec::new(); - for &nbr_id in &node.adjacency_list { - if (nbr_id as usize) < num_pts && visited.insert(nbr_id) { - nbors_to_compute.push(nbr_id); - } - } - - let num_new_candidates; - if !nbors_to_compute.is_empty() { - comparisons += nbors_to_compute.len() as u32; - // Compute PQ distances for unvisited neighbors - trace.begin_phase(); - compute_pq_distance( - &nbors_to_compute, - num_pq_chunks, - &pq_scratch.aligned_pqtable_dist_scratch, - pq_compressed, - &mut pq_scratch.aligned_pq_coord_scratch, - &mut pq_scratch.aligned_dist_scratch, - )?; - trace.end_phase_pq_distance(); - - trace.begin_phase(); - let mut nk = cur_list_size; - let mut n_inserted: u32 = 0; - for (m, &nbr_id) in nbors_to_compute.iter().enumerate() { - let nbr_dist = pq_scratch.aligned_dist_scratch[m]; - if cur_list_size == search_l - && nbr_dist >= retset[cur_list_size - 1].distance - { - continue; - } - let nn = Candidate { - id: nbr_id, - distance: nbr_dist, - flag: true, - visited: false, - }; - let r = insert_into_pool(&mut retset, &mut cur_list_size, nn); - if cur_list_size < search_l { - cur_list_size += 1; - } - if r < nk { - nk = r; - } - n_inserted += 1; - } - trace.end_phase_queue_ops(); - num_new_candidates = n_inserted; - } else { - num_new_candidates = 0; - } - - trace.record_expand(); - trace.event(TraceEventKind::Expand { - node_id: vid, - fp_distance: fp_dist, - num_neighbors: node.adjacency_list.len() as u32, - num_new_candidates, - }); - } - - // Find first_unvisited_eager for convergence tracking - for (j, c) in retset.iter().enumerate().take(cur_list_size) { - if !c.visited && c.flag && !id_buf_map.contains_key(&c.id) { - best_marker = j; - break; - } - } - break; - } - } - max_marker = max_marker.max(best_marker); - cpu_time += cpu_start.elapsed(); - } - - // In relaxed monotonicity mode: drain remaining IOs and process unvisited nodes - if relaxed_monotonicity_l.is_some_and(|l| l > 0) { - // Drain all in-flight IOs (block until each completes) - while !on_flight_ios.is_empty() { - let completed_slots = reader.wait_completions()?; - if !completed_slots.is_empty() { - let completed_set: HashSet = completed_slots.into_iter().collect(); - let mut remaining = VecDeque::new(); - while let Some(io) = on_flight_ios.pop_front() { - if completed_set.contains(&io.slot_id) { - let sector_buf = reader.get_slot_buf(io.slot_id); - let node = parse_node( - 
sector_buf, - io.vertex_id, - num_nodes_per_sector, - node_len, - fp_vector_len, - )?; - id_buf_map.insert(io.vertex_id, node); - } else { - remaining.push_back(io); - } - } - on_flight_ios = remaining; - } - } - // Process remaining unvisited nodes - for c in retset.iter_mut().take(cur_list_size) { - if !c.visited { - if let Some(node) = id_buf_map.get(&c.id) { - c.visited = true; - let fp_vec: &[T] = bytemuck::cast_slice(&node.fp_vector); - let fp_dist = distance_comparer.evaluate_similarity(query, fp_vec); - if vector_filter.map_or(true, |f| f(&c.id)) { - full_retset.push((c.id, fp_dist)); - } - } - } - } - } - - // Sort full_retset and return top-k (total_cmp handles NaN correctly) - full_retset.sort_by(|a, b| a.1.total_cmp(&b.1)); - - // Deduplicate - let mut ids = Vec::with_capacity(k); - let mut distances = Vec::with_capacity(k); - let mut seen = HashSet::new(); - for (id, dist) in &full_retset { - if ids.len() >= k { - break; - } - if seen.insert(*id) { - ids.push(*id); - distances.push(*dist); - } - } - - let total_us = timer.elapsed().as_micros(); - - trace.event(TraceEventKind::Done { - total_hops: hops, - total_ios: io_count, - total_comparisons: comparisons, - }); - if let Some(t) = trace.0.as_mut() { - t.finish(); - } - - Ok(PipeSearchResult { - ids, - distances, - stats: PipeSearchStats { - total_us, - io_us: io_time.as_micros(), - cpu_us: cpu_time.as_micros(), - io_count, - comparisons, - hops, - }, - }) -} - -/// Max buffer slots to use, based on beam width. -#[inline] -fn max_slots(beam_width: usize) -> usize { - (beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY) -} - -#[cfg(test)] -mod tests { - use super::*; - - // ---- helpers ---- - - fn make_candidate(id: u32, distance: f32) -> Candidate { - Candidate { - id, - distance, - flag: true, - visited: false, - } - } - - fn empty_pool(cap: usize) -> Vec { - (0..cap) - .map(|_| Candidate { - id: 0, - distance: f32::MAX, - flag: false, - visited: false, - }) - .collect() - } - - fn pool_distances(retset: &[Candidate], pool_size: usize) -> Vec { - retset[..pool_size].iter().map(|c| c.distance).collect() - } - - fn pool_ids(retset: &[Candidate], pool_size: usize) -> Vec { - retset[..pool_size].iter().map(|c| c.id).collect() - } - - // ---- insert_into_pool tests ---- - - #[test] - fn test_insert_into_pool_empty() { - let mut retset = empty_pool(8); - let mut pool_size: usize = 0; - let pos = insert_into_pool(&mut retset, &mut pool_size, make_candidate(1, 0.5)); - // Pool was empty, should insert at position 0. - assert_eq!(pos, 0); - assert_eq!(retset[0].id, 1); - assert_eq!(retset[0].distance, 0.5); - } - - #[test] - fn test_insert_into_pool_front() { - let mut retset = empty_pool(8); - let mut pool_size: usize = 0; - insert_into_pool(&mut retset, &mut pool_size, make_candidate(10, 5.0)); - pool_size += 1; - insert_into_pool(&mut retset, &mut pool_size, make_candidate(20, 3.0)); - pool_size += 1; - - // Candidate with lowest distance should go to front. 
- let pos = insert_into_pool(&mut retset, &mut pool_size, make_candidate(30, 1.0)); - pool_size += 1; - assert_eq!(pos, 0); - assert_eq!(pool_ids(&retset, pool_size), vec![30, 20, 10]); - assert_eq!(pool_distances(&retset, pool_size), vec![1.0, 3.0, 5.0]); - } - - #[test] - fn test_insert_into_pool_end() { - let mut retset = empty_pool(8); - let mut pool_size: usize = 0; - insert_into_pool(&mut retset, &mut pool_size, make_candidate(1, 1.0)); - pool_size += 1; - insert_into_pool(&mut retset, &mut pool_size, make_candidate(2, 2.0)); - pool_size += 1; - - let pos = insert_into_pool(&mut retset, &mut pool_size, make_candidate(3, 10.0)); - pool_size += 1; - assert_eq!(pos, 2); - assert_eq!(pool_distances(&retset, pool_size), vec![1.0, 2.0, 10.0]); - } - - #[test] - fn test_insert_into_pool_at_capacity_better_candidate() { - // Capacity = 4, pool full with 4 items. Insert one that is better. - let mut retset = empty_pool(4); - let mut pool_size: usize = 0; - for (id, d) in [(1, 1.0), (2, 3.0), (3, 5.0), (4, 7.0)] { - insert_into_pool(&mut retset, &mut pool_size, make_candidate(id, d)); - pool_size += 1; - } - assert_eq!(pool_size, 4); - - // Pool is at capacity (pool_size == retset.len()), insert a better candidate. - // insert_into_pool should grow the buffer to make room. - let pos = insert_into_pool(&mut retset, &mut pool_size, make_candidate(5, 2.0)); - assert_eq!(pos, 1); - // The pool buffer should have grown and the element is in sorted order. - assert!(retset.len() >= 5); - assert_eq!(retset[0].id, 1); - assert_eq!(retset[1].id, 5); - assert_eq!(retset[1].distance, 2.0); - } - - #[test] - fn test_insert_into_pool_at_capacity_worse_candidate() { - // Capacity = 4, pool full. Insert a candidate worse than all existing. - let mut retset = empty_pool(4); - let mut pool_size: usize = 0; - for (id, d) in [(1, 1.0), (2, 3.0), (3, 5.0), (4, 7.0)] { - insert_into_pool(&mut retset, &mut pool_size, make_candidate(id, d)); - pool_size += 1; - } - - // Candidate distance 100.0 is worse than the sentinel f32::MAX only if - // pool_size == retset.len(), the function grows the buffer. Verify sorted order. - let pos = insert_into_pool(&mut retset, &mut pool_size, make_candidate(99, 100.0)); - // pos should be 4 (after last real element); the buffer was grown. - assert_eq!(pos, 4); - } - - #[test] - fn test_insert_into_pool_maintains_sort_order() { - let mut retset = empty_pool(16); - let mut pool_size: usize = 0; - let distances = [5.0, 1.0, 3.0, 7.0, 2.0, 6.0, 4.0]; - for (i, &d) in distances.iter().enumerate() { - insert_into_pool(&mut retset, &mut pool_size, make_candidate(i as u32, d)); - pool_size += 1; - } - let dists = pool_distances(&retset, pool_size); - for w in dists.windows(2) { - assert!(w[0] <= w[1], "Pool not sorted: {:?}", dists); - } - assert_eq!(dists, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); - } - - // ---- parse_node tests ---- - - /// Build a fake sector buffer containing a single node at a given offset. 
- fn build_sector_buf( - offset: usize, - fp_vector: &[u8], - neighbors: &[u32], - total_size: usize, - ) -> Vec { - let node_len = fp_vector.len() + 4 + neighbors.len() * 4; - let mut buf = vec![0u8; total_size.max(offset + node_len)]; - buf[offset..offset + fp_vector.len()].copy_from_slice(fp_vector); - let neigh_offset = offset + fp_vector.len(); - LittleEndian::write_u32( - &mut buf[neigh_offset..neigh_offset + 4], - neighbors.len() as u32, - ); - for (i, &n) in neighbors.iter().enumerate() { - let start = neigh_offset + 4 + i * 4; - LittleEndian::write_u32(&mut buf[start..start + 4], n); - } - buf - } - - #[test] - fn test_parse_node_basic() { - let fp_vec = vec![1u8, 2, 3, 4, 5, 6, 7, 8]; // 8-byte vector - let neighbors = vec![10u32, 20, 30]; - let fp_vector_len = fp_vec.len() as u64; - let node_len = fp_vector_len + 4 + 3 * 4; // vec + count + 3 neighbors - - let buf = build_sector_buf(0, &fp_vec, &neighbors, 4096); - let node = parse_node(&buf, 0, 1, node_len, fp_vector_len).unwrap(); - - assert_eq!(node.fp_vector, fp_vec); - assert_eq!(node.adjacency_list, vec![10, 20, 30]); - } - - #[test] - fn test_parse_node_multi_node_per_sector() { - let fp_vector_len = 8u64; - let node_len = fp_vector_len + 4 + 2 * 4; // 8-byte vec, 2 neighbors - let num_nodes_per_sector = 4u64; - - // Place 4 nodes in the sector, each with different data. - let mut buf = vec![0u8; 4096]; - for node_idx in 0u32..4 { - let offset = (node_idx as u64 * node_len) as usize; - let fp_vec: Vec = (0..8).map(|b| b + (node_idx as u8) * 10).collect(); - let neighbors = vec![100 + node_idx, 200 + node_idx]; - let partial = build_sector_buf(0, &fp_vec, &neighbors, node_len as usize); - buf[offset..offset + node_len as usize] - .copy_from_slice(&partial[..node_len as usize]); - } - - // Parse node at index 2 (vertex_id=2 within same sector) - let node = parse_node(&buf, 2, num_nodes_per_sector, node_len, fp_vector_len).unwrap(); - let expected_fp: Vec = (0..8).map(|b| b + 20).collect(); - assert_eq!(node.fp_vector, expected_fp); - assert_eq!(node.adjacency_list, vec![102, 202]); - } - - #[test] - fn test_parse_node_zero_neighbors() { - let fp_vec = vec![42u8; 16]; - let fp_vector_len = 16u64; - let neighbors: Vec = vec![]; - let node_len = fp_vector_len + 4; // vec + count only - - let buf = build_sector_buf(0, &fp_vec, &neighbors, 4096); - let node = parse_node(&buf, 0, 1, node_len, fp_vector_len).unwrap(); - - assert_eq!(node.fp_vector, vec![42u8; 16]); - assert!(node.adjacency_list.is_empty()); - } - - // ---- node_sector_index tests ---- - - #[test] - fn test_node_sector_index_multi_node_per_sector() { - let num_nodes_per_sector = 4u64; - let num_sectors_per_node = 1usize; - - // Matches disk_sector_graph.rs: sector = 1 + vertex_id / num_nodes_per_sector - assert_eq!(node_sector_index(0, num_nodes_per_sector, num_sectors_per_node), 1); - assert_eq!(node_sector_index(3, num_nodes_per_sector, num_sectors_per_node), 1); - assert_eq!(node_sector_index(4, num_nodes_per_sector, num_sectors_per_node), 2); - assert_eq!(node_sector_index(5, num_nodes_per_sector, num_sectors_per_node), 2); - assert_eq!(node_sector_index(7, num_nodes_per_sector, num_sectors_per_node), 2); - assert_eq!(node_sector_index(8, num_nodes_per_sector, num_sectors_per_node), 3); - assert_eq!(node_sector_index(1023, num_nodes_per_sector, num_sectors_per_node), 256); - assert_eq!(node_sector_index(1024, num_nodes_per_sector, num_sectors_per_node), 257); - } - - #[test] - fn test_node_sector_index_multi_sector_per_node() { - let num_nodes_per_sector = 
0u64; - let num_sectors_per_node = 2usize; - - // sector = 1 + vertex_id * num_sectors_per_node - assert_eq!(node_sector_index(0, num_nodes_per_sector, num_sectors_per_node), 1); - assert_eq!(node_sector_index(3, num_nodes_per_sector, num_sectors_per_node), 7); - assert_eq!(node_sector_index(4, num_nodes_per_sector, num_sectors_per_node), 9); - assert_eq!(node_sector_index(5, num_nodes_per_sector, num_sectors_per_node), 11); - assert_eq!(node_sector_index(7, num_nodes_per_sector, num_sectors_per_node), 15); - assert_eq!(node_sector_index(8, num_nodes_per_sector, num_sectors_per_node), 17); - assert_eq!(node_sector_index(1023, num_nodes_per_sector, num_sectors_per_node), 2047); - assert_eq!(node_sector_index(1024, num_nodes_per_sector, num_sectors_per_node), 2049); - } - - // ---- node_offset_in_sector tests ---- - - #[test] - fn test_node_offset_multi_node_per_sector() { - let num_nodes_per_sector = 4u64; - let node_len = 256u64; - - // offset = (vertex_id % num_nodes_per_sector) * node_len - assert_eq!(node_offset_in_sector(0, num_nodes_per_sector, node_len), 0); - assert_eq!(node_offset_in_sector(1, num_nodes_per_sector, node_len), 256); - assert_eq!(node_offset_in_sector(2, num_nodes_per_sector, node_len), 512); - assert_eq!(node_offset_in_sector(3, num_nodes_per_sector, node_len), 768); - assert_eq!(node_offset_in_sector(4, num_nodes_per_sector, node_len), 0); // wraps - assert_eq!(node_offset_in_sector(5, num_nodes_per_sector, node_len), 256); - } - - #[test] - fn test_node_offset_multi_sector_per_node() { - // When num_nodes_per_sector is 0 (multi-sector), offset is always 0. - assert_eq!(node_offset_in_sector(0, 0, 8192), 0); - assert_eq!(node_offset_in_sector(5, 0, 8192), 0); - assert_eq!(node_offset_in_sector(100, 0, 8192), 0); - } - - // ---- max_slots tests ---- - - #[test] - fn test_max_slots() { - // beam_width * 2 clamped to [16, MAX_IO_CONCURRENCY] - assert_eq!(max_slots(1), 16); // 2 clamped up to 16 - assert_eq!(max_slots(8), 16); - assert_eq!(max_slots(16), 32); - assert_eq!(max_slots(64), 128); - assert_eq!(max_slots(100), 128); // 200 clamped down to 128 - } -} diff --git a/diskann-disk/src/search/pipelined/pipelined_searcher.rs b/diskann-disk/src/search/pipelined/pipelined_searcher.rs deleted file mode 100644 index 4c0691d7f..000000000 --- a/diskann-disk/src/search/pipelined/pipelined_searcher.rs +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -//! Public API for pipelined disk search. - -use std::sync::Arc; - -use diskann::{ - utils::{ - object_pool::{ObjectPool, PoolOption, TryAsPooled}, - VectorRepr, - }, - ANNError, ANNResult, -}; -use diskann_providers::model::{ - graph::traits::GraphDataType, PQData, PQScratch, -}; -use diskann_vector::distance::Metric; - -use crate::{ - data_model::GraphHeader, - search::provider::disk_provider::{SearchResult, SearchResultItem, SearchResultStats}, - utils::QueryStatistics, -}; - -use super::pipelined_reader::{PipelinedReader, PipelinedReaderConfig}; -use super::pipelined_search::{pipe_search, PipeSearchResult}; -use crate::search::search_trace::SearchTrace; - -/// Scratch space for pipelined search operations, pooled for reuse across queries. -struct PipelinedSearchScratch { - reader: PipelinedReader, - pq_scratch: PQScratch, -} - -/// Arguments for creating or resetting a [`PipelinedSearchScratch`]. 
-#[derive(Clone)] -struct PipelinedScratchArgs<'a> { - disk_index_path: &'a str, - max_slots: usize, - slot_size: usize, - alignment: usize, - graph_degree: usize, - dims: usize, - num_pq_chunks: usize, - num_pq_centers: usize, - reader_config: PipelinedReaderConfig, -} - -impl TryAsPooled<&PipelinedScratchArgs<'_>> for PipelinedSearchScratch { - type Error = ANNError; - - fn try_create(args: &PipelinedScratchArgs<'_>) -> Result { - let reader = PipelinedReader::new( - args.disk_index_path, - args.max_slots, - args.slot_size, - args.alignment, - &args.reader_config, - )?; - let pq_scratch = PQScratch::new( - args.graph_degree, - args.dims, - args.num_pq_chunks, - args.num_pq_centers, - )?; - Ok(Self { reader, pq_scratch }) - } - - fn try_modify(&mut self, _args: &PipelinedScratchArgs<'_>) -> Result<(), Self::Error> { - self.reader.reset(); - Ok(()) - } -} - -/// A pipelined disk index searcher implementing the PipeANN algorithm. -/// -/// # Deprecation -/// -/// This standalone searcher duplicates the generic search loop. Prefer using -/// `DiskIndexSearcher::search_pipelined()` which integrates pipelined IO via the -/// queue-based `ExpandBeam` trait, providing the same IO/compute overlap without -/// code duplication. -/// -/// # Safety -/// -/// This searcher is designed for **read-only search on completed (static) disk indices**. -/// It opens independent file descriptors with O_DIRECT and reads raw sectors without -/// going through the synchronized `DiskProvider` path. It must NOT be used concurrently -/// with index build, insert, or delete operations on the same index file. -/// -/// For search during streaming or dynamic index operations, use [`DiskIndexSearcher`] -/// (beam search) instead, which provides proper synchronization through the -/// `DiskProvider` and `VertexProvider` abstractions. -/// -/// # Thread Safety -/// -/// Multiple concurrent `search()` calls on the same `PipelinedSearcher` are safe. -/// Each search operates on its own `PipelinedReader` and `PQScratch` (pooled for -/// amortized allocation). Shared state (`PQData`, `GraphHeader`) is immutable. -#[deprecated(note = "Use DiskIndexSearcher::search_pipelined() instead for unified pipelined search")] -pub struct PipelinedSearcher> { - #[allow(dead_code)] - graph_header: GraphHeader, - distance_comparer: ::Distance, - pq_data: Arc, - metric: Metric, - relaxed_monotonicity_l: Option, - disk_index_path: String, - reader_config: PipelinedReaderConfig, - /// Pool of reusable reader + PQ scratch instances. - scratch_pool: Arc>, - - // Precomputed values derived from graph_header / pq_data, cached to avoid - // re-derivation on every search() call. - block_size: usize, - #[allow(dead_code)] - num_sectors_per_node: usize, - slot_size: usize, - fp_vector_len: u64, - dims: usize, - node_len: u64, - num_nodes_per_sector: u64, - medoid: u32, - graph_degree: usize, - num_pq_chunks: usize, - num_pq_centers: usize, -} - -impl PipelinedSearcher -where - Data: GraphDataType, -{ - /// Create a new pipelined searcher. - /// - /// # Arguments - /// * `graph_header` - Graph metadata from the disk index. - /// * `pq_data` - Shared PQ data for approximate distance computation. - /// * `metric` - Distance metric (L2, InnerProduct, etc.). - /// * `beam_width` - Default beam width used for pool sizing. - /// * `relaxed_monotonicity_l` - Optional early termination parameter. - /// * `disk_index_path` - Path to the disk index file for creating readers. 
- pub fn new( - graph_header: GraphHeader, - pq_data: Arc, - metric: Metric, - beam_width: usize, - relaxed_monotonicity_l: Option, - disk_index_path: String, - config: PipelinedReaderConfig, - ) -> ANNResult { - let metadata = graph_header.metadata(); - let dims = metadata.dims; - let node_len = metadata.node_len; - let num_nodes_per_sector = metadata.num_nodes_per_block; - let fp_vector_len = - (dims * std::mem::size_of::()) as u64; - let medoid = metadata.medoid as u32; - let distance_comparer = Data::VectorDataType::distance(metric, Some(dims)); - - let block_size = graph_header.effective_block_size(); - let num_sectors_per_node = graph_header.num_sectors_per_node(); - let slot_size = num_sectors_per_node * block_size; - - let max_slots = - (beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY); - - let graph_degree = graph_header.max_degree::()?; - let num_pq_chunks = pq_data.get_num_chunks(); - let num_pq_centers = pq_data.get_num_centers(); - - let scratch_args = PipelinedScratchArgs { - disk_index_path: &disk_index_path, - max_slots, - slot_size, - alignment: block_size, - graph_degree, - dims, - num_pq_chunks, - num_pq_centers, - reader_config: config.clone(), - }; - let scratch_pool = Arc::new(ObjectPool::try_new(&scratch_args, 0, None)?); - - Ok(Self { - graph_header, - distance_comparer, - pq_data, - metric, - relaxed_monotonicity_l, - disk_index_path, - reader_config: config, - scratch_pool, - block_size, - num_sectors_per_node, - slot_size, - fp_vector_len, - dims, - node_len, - num_nodes_per_sector, - medoid, - graph_degree, - num_pq_chunks, - num_pq_centers, - }) - } - - /// Perform a pipelined search on the disk index. - /// - /// # Arguments - /// * `query` - The query vector. - /// * `return_list_size` - Number of results to return (k). - /// * `search_list_size` - Size of the candidate pool (L). - /// * `beam_width` - Maximum beam width for pipelined IO. - /// * `vector_filter` - Optional predicate; only vertices passing the filter - /// are included in the result set. Graph traversal is unaffected. 
- pub fn search( - &self, - query: &[Data::VectorDataType], - return_list_size: u32, - search_list_size: u32, - beam_width: usize, - vector_filter: Option<&(dyn Fn(&u32) -> bool + Send + Sync)>, - ) -> ANNResult> { - let max_slots = (beam_width * 2).clamp(16, super::pipelined_reader::MAX_IO_CONCURRENCY); - - let args = PipelinedScratchArgs { - disk_index_path: &self.disk_index_path, - max_slots, - slot_size: self.slot_size, - alignment: self.block_size, - graph_degree: self.graph_degree, - dims: self.dims, - num_pq_chunks: self.num_pq_chunks, - num_pq_centers: self.num_pq_centers, - reader_config: self.reader_config.clone(), - }; - let mut scratch = PoolOption::try_pooled(&self.scratch_pool, &args)?; - let PipelinedSearchScratch { - ref mut reader, - ref mut pq_scratch, - } = *scratch; - - let trace_enabled = std::env::var("DISKANN_TRACE").map_or(false, |v| v == "1"); - let mut trace = if trace_enabled { - Some(SearchTrace::new()) - } else { - None - }; - - let result: PipeSearchResult = pipe_search::( - reader, - &self.pq_data, - &self.distance_comparer, - query, - return_list_size as usize, - search_list_size as usize, - beam_width, - self.medoid, - self.dims, - self.node_len, - self.num_nodes_per_sector, - self.block_size, - self.fp_vector_len, - pq_scratch, - self.relaxed_monotonicity_l, - self.metric, - vector_filter, - trace.as_mut(), - )?; - - if let Some(t) = &trace { - t.print_profile_summary(); - if std::env::var("DISKANN_TRACE_EVENTS").is_ok() { - t.print_events(500); - } - } - - let query_statistics = QueryStatistics { - total_execution_time_us: result.stats.total_us, - io_time_us: result.stats.io_us, - cpu_time_us: result.stats.cpu_us, - total_io_operations: result.stats.io_count, - total_comparisons: result.stats.comparisons, - total_vertices_loaded: result.stats.io_count, - search_hops: result.stats.hops, - ..Default::default() - }; - - let stats = SearchResultStats { - cmps: result.stats.comparisons, - result_count: result.ids.len() as u32, - query_statistics, - }; - - let mut results = Vec::with_capacity(result.ids.len()); - for (id, dist) in result.ids.iter().zip(result.distances.iter()) { - results.push(SearchResultItem { - vertex_id: *id, - distance: *dist, - data: Data::AssociatedDataType::default(), - }); - } - - Ok(SearchResult { results, stats }) - } -} - -#[cfg(test)] -#[cfg(target_os = "linux")] -mod tests { - use super::*; - use std::sync::Arc; - - use diskann_providers::storage::{get_disk_index_file, VirtualStorageProvider}; - use diskann_providers::test_utils::graph_data_type_utils::GraphDataF32VectorUnitData; - use diskann_utils::test_data_root; - use diskann_vector::distance::Metric; - use rayon::prelude::*; - - use crate::data_model::CachingStrategy; - use crate::search::provider::disk_vertex_provider_factory::DiskVertexProviderFactory; - use crate::search::traits::vertex_provider_factory::VertexProviderFactory; - use crate::storage::disk_index_reader::DiskIndexReader; - use crate::utils::VirtualAlignedReaderFactory; - - use super::PipelinedReaderConfig; - - const TEST_INDEX_PREFIX: &str = - "/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search"; - const TEST_PQ_PIVOT: &str = - "/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search_pq_pivots.bin"; - const TEST_PQ_COMPRESSED: &str = - "/disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search_pq_compressed.bin"; - const TEST_QUERY: &str = "/disk_index_search/disk_index_sample_query_10pts.fbin"; - - fn create_test_searcher() -> PipelinedSearcher { - let storage_provider = 
Arc::new(VirtualStorageProvider::new_overlay(test_data_root()));
-
- let disk_index_reader = DiskIndexReader::<GraphDataF32VectorUnitData>::new(
- TEST_PQ_PIVOT.to_string(),
- TEST_PQ_COMPRESSED.to_string(),
- storage_provider.as_ref(),
- )
- .unwrap();
- let pq_data = disk_index_reader.get_pq_data();
-
- let aligned_reader_factory = VirtualAlignedReaderFactory::new(
- get_disk_index_file(TEST_INDEX_PREFIX),
- Arc::clone(&storage_provider),
- );
- let vertex_provider_factory =
- DiskVertexProviderFactory::<GraphDataF32VectorUnitData, _>::new(
- aligned_reader_factory,
- CachingStrategy::None,
- )
- .unwrap();
- let graph_header = vertex_provider_factory.get_header().unwrap();
-
- let real_index_path = test_data_root().join(
- "disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search_disk.index",
- );
-
- PipelinedSearcher::<GraphDataF32VectorUnitData>::new(
- graph_header,
- pq_data,
- Metric::L2,
- 4,
- None,
- real_index_path.to_str().unwrap().to_string(),
- PipelinedReaderConfig::default(),
- )
- .unwrap()
- }
-
- fn load_test_query() -> Vec<f32> {
- let storage_provider = Arc::new(VirtualStorageProvider::new_overlay(test_data_root()));
- let (query_vector, _npts, _dim) =
- diskann_providers::utils::file_util::load_bin::<f32>(
- storage_provider.as_ref(),
- TEST_QUERY,
- 0,
- )
- .unwrap();
- query_vector[0..128].to_vec()
- }
-
- #[test]
- fn test_pool_reuse_sequential_searches() {
- let searcher = create_test_searcher();
- let query = load_test_query();
-
- let r1 = searcher.search(&query, 10, 40, 4, None).unwrap();
- let r2 = searcher.search(&query, 10, 40, 4, None).unwrap();
-
- assert!(!r1.results.is_empty());
- assert!(!r2.results.is_empty());
- // Same query must return same number of results.
- assert_eq!(r1.results.len(), r2.results.len());
- // All distances must be non-negative.
- for item in r1.results.iter().chain(r2.results.iter()) {
- assert!(item.distance >= 0.0);
- }
- }
-
- #[test]
- fn test_pool_concurrent_searches() {
- let searcher = Arc::new(create_test_searcher());
- let query = load_test_query();
-
- let results: Vec<_> = (0..4)
- .into_par_iter()
- .map(|_| searcher.search(&query, 10, 40, 4, None).unwrap())
- .collect();
-
- for r in &results {
- assert!(!r.results.is_empty());
- for item in &r.results {
- assert!(item.distance >= 0.0);
- }
- }
- }
-}
diff --git a/diskann-disk/src/search/provider/disk_provider.rs b/diskann-disk/src/search/provider/disk_provider.rs
index b052f397f..38a1579a8 100644
--- a/diskann-disk/src/search/provider/disk_provider.rs
+++ b/diskann-disk/src/search/provider/disk_provider.rs
@@ -2137,199 +2137,5 @@ mod disk_provider_tests {
 // search hits io_limit that it doesn't break and the recall degrades gracefully
 assert!(recall >= 60.0, "Match percentage is below 60%: {}", recall);
 }
-
- #[test]
- #[cfg(target_os = "linux")]
- fn test_pipe_search_k10_l100_128dim() {
- use crate::search::pipelined::{PipelinedSearcher, PipelinedReaderConfig};
- use diskann_providers::storage::get_disk_index_file;
-
- let storage_provider = Arc::new(VirtualStorageProvider::new_overlay(test_data_root()));
-
- // Load PQ data via DiskIndexReader.
- let disk_index_reader = DiskIndexReader::<GraphDataF32VectorUnitData>::new(
- TEST_PQ_PIVOT_128DIM.to_string(),
- TEST_PQ_COMPRESSED_128DIM.to_string(),
- storage_provider.as_ref(),
- )
- .unwrap();
- let pq_data = disk_index_reader.get_pq_data();
-
- // Read graph header via DiskVertexProviderFactory.
- let aligned_reader_factory = VirtualAlignedReaderFactory::new(
- get_disk_index_file(TEST_INDEX_PREFIX_128DIM),
- Arc::clone(&storage_provider),
- );
- let vertex_provider_factory =
- DiskVertexProviderFactory::<GraphDataF32VectorUnitData, _>::new(
- aligned_reader_factory,
- CachingStrategy::None,
- )
- .unwrap();
- let graph_header = vertex_provider_factory.get_header().unwrap();
-
- // Resolve real filesystem path for PipelinedSearcher (needs O_DIRECT).
- let real_index_path = test_data_root().join(
- "disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search_disk.index",
- );
- let real_index_path_str = real_index_path.to_str().unwrap();
-
- let pipe_searcher = PipelinedSearcher::<GraphDataF32VectorUnitData>::new(
- graph_header,
- pq_data,
- Metric::L2,
- 4,
- None,
- real_index_path_str.to_string(),
- PipelinedReaderConfig::default(),
- )
- .unwrap();
-
- // Load queries and ground truth.
- let (query_vector, _, _) = diskann_providers::utils::file_util::load_bin::<f32>(
- storage_provider.as_ref(),
- TEST_QUERY_10PTS_128DIM,
- 0,
- )
- .unwrap();
- let truth_result =
- load_query_result(storage_provider.as_ref(), TEST_TRUTH_RESULT_10PTS_128DIM);
-
- let dim = 128usize;
- let k = 10usize;
- let l = 100u32;
- let num_queries = query_vector.len() / dim;
-
- let mut total_recall = 0.0f32;
- for q in 0..num_queries {
- let query = &query_vector[q * dim..(q + 1) * dim];
- let result = pipe_searcher.search(query, k as u32, l, 4, None).unwrap();
- let indices: Vec<u32> = result.results.iter().map(|item| item.vertex_id).collect();
- let truth_slice = &truth_result[q * k..(q + 1) * k];
-
- // Count recall overlap (PipeANN traversal order may differ from beam search ground truth).
- let matching = indices
- .iter()
- .filter(|id| truth_slice.contains(id))
- .count();
- total_recall += matching as f32 / k as f32;
- }
- let avg_recall = total_recall / num_queries as f32;
- assert!(
- avg_recall >= 0.8,
- "PipeANN average recall {:.0}% < 80%",
- avg_recall * 100.0,
- );
- }
-
- #[test]
- #[cfg(target_os = "linux")]
- fn test_concurrent_beam_and_pipe_search_128dim() {
- use crate::search::pipelined::{PipelinedSearcher, PipelinedReaderConfig};
- use diskann_providers::storage::get_disk_index_file;
- use rayon::prelude::*;
-
- let storage_provider = Arc::new(VirtualStorageProvider::new_overlay(test_data_root()));
-
- // Create beam search engine (DiskIndexSearcher).
- let beam_engine = create_disk_index_searcher::<GraphDataF32VectorUnitData>(
- CreateDiskIndexSearcherParams {
- max_thread_num: 2,
- pq_pivot_file_path: TEST_PQ_PIVOT_128DIM,
- pq_compressed_file_path: TEST_PQ_COMPRESSED_128DIM,
- index_path: TEST_INDEX_128DIM,
- index_path_prefix: TEST_INDEX_PREFIX_128DIM,
- ..Default::default()
- },
- &storage_provider,
- );
-
- // Create pipelined search engine (PipelinedSearcher).
- let disk_index_reader = DiskIndexReader::<GraphDataF32VectorUnitData>::new(
- TEST_PQ_PIVOT_128DIM.to_string(),
- TEST_PQ_COMPRESSED_128DIM.to_string(),
- storage_provider.as_ref(),
- )
- .unwrap();
- let pq_data = disk_index_reader.get_pq_data();
-
- let aligned_reader_factory = VirtualAlignedReaderFactory::new(
- get_disk_index_file(TEST_INDEX_PREFIX_128DIM),
- Arc::clone(&storage_provider),
- );
- let vertex_provider_factory =
- DiskVertexProviderFactory::<GraphDataF32VectorUnitData, _>::new(
- aligned_reader_factory,
- CachingStrategy::None,
- )
- .unwrap();
- let graph_header = vertex_provider_factory.get_header().unwrap();
-
- let real_index_path = test_data_root().join(
- "disk_index_search/disk_index_sift_learn_R4_L50_A1.2_truth_search_disk.index",
- );
- let pipe_searcher = Arc::new(
- PipelinedSearcher::<GraphDataF32VectorUnitData>::new(
- graph_header,
- pq_data,
- Metric::L2,
- 4,
- None,
- real_index_path.to_str().unwrap().to_string(),
- PipelinedReaderConfig::default(),
- )
- .unwrap(),
- );
-
- // Load queries and ground truth.
- let (query_vector, _, _) = diskann_providers::utils::file_util::load_bin::<f32>(
- storage_provider.as_ref(),
- TEST_QUERY_10PTS_128DIM,
- 0,
- )
- .unwrap();
- let truth_result =
- load_query_result(storage_provider.as_ref(), TEST_TRUTH_RESULT_10PTS_128DIM);
-
- let dim = 128usize;
- let k = 10usize;
- let l = 100u32;
- let num_queries = query_vector.len() / dim;
-
- // Run beam search and pipe search concurrently via rayon.
- let queries: Vec<&[f32]> = (0..num_queries)
- .map(|q| &query_vector[q * dim..(q + 1) * dim])
- .collect();
- let beam_ref = &beam_engine;
- let pipe_ref = &pipe_searcher;
- let truth_ref = &truth_result;
-
- queries.par_iter().enumerate().for_each(|(q, query)| {
- // Beam search
- let beam_result = beam_ref
- .search(query, k as u32, l, None, None, false)
- .unwrap();
- let beam_ids: Vec<u32> = beam_result.results.iter().map(|r| r.vertex_id).collect();
- let truth_slice = &truth_ref[q * k..(q + 1) * k];
-
- // Pipe search (runs concurrently with beam search across rayon threads)
- let pipe_result = pipe_ref.search(query, k as u32, l, 4, None).unwrap();
- let pipe_ids: Vec<u32> = pipe_result.results.iter().map(|r| r.vertex_id).collect();
-
- // Both should produce results with reasonable overlap.
- let beam_matching = beam_ids.iter().filter(|id| truth_slice.contains(id)).count();
- let pipe_matching = pipe_ids.iter().filter(|id| truth_slice.contains(id)).count();
- // Per-query: at least some overlap (>=30%) to guard against total failures.
- assert!(
- beam_matching as f32 / k as f32 >= 0.3,
- "Beam search returned too few relevant results for query {}",
- q,
- );
- assert!(
- pipe_matching as f32 / k as f32 >= 0.3,
- "Pipe search returned too few relevant results for query {}",
- q,
- );
- });
- }
 }
+
diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs
index 5482f88f2..99410e399 100644
--- a/diskann-disk/src/search/provider/pipelined_accessor.rs
+++ b/diskann-disk/src/search/provider/pipelined_accessor.rs
@@ -6,7 +6,7 @@
 //! Queue-based pipelined disk accessor that integrates with the generic search loop
 //! via the `ExpandBeam` trait's `submit_expand` / `expand_available` / `has_pending` methods.
 //!
-//! Instead of duplicating the search loop (like `PipelinedSearcher`), this accessor
+//! This accessor
 //! plugs into `DiskANNIndex::search_internal()` and overlaps IO with computation
 //! using io_uring under the hood.
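The submit/poll/expand cycle that the queue-based accessor implements is easier to see in miniature than across the diffs. The following is a self-contained sketch of that control flow, not the crate's API: `next_candidate`, `expand`, and the `inflight` queue are illustrative stand-ins for the candidate priority queue, node expansion, and the io_uring slots managed by `PipelinedReader`.

    use std::collections::{HashSet, VecDeque};

    // Minimal model of the pipelined loop: keep up to `beam_width` reads in
    // flight, expand a completed one, feed new candidates back to the frontier.
    fn pipelined_loop(
        mut next_candidate: impl FnMut(&HashSet<u32>) -> Option<u32>,
        mut expand: impl FnMut(u32) -> Vec<u32>,
        beam_width: usize,
    ) {
        let mut submitted: HashSet<u32> = HashSet::new();
        let mut inflight: VecDeque<u32> = VecDeque::new(); // stand-in for io_uring slots

        loop {
            // Submit: top up the beam with the best unsubmitted candidates.
            while inflight.len() < beam_width {
                match next_candidate(&submitted) {
                    Some(id) => {
                        submitted.insert(id);
                        inflight.push_back(id); // real code: submit a sector read
                    }
                    None => break,
                }
            }
            // Expand: consume one completed read. The real loop polls io_uring
            // and expands whichever node's sector arrived first, so compute
            // overlaps with the reads still in flight.
            match inflight.pop_front() {
                Some(id) => {
                    for neighbor in expand(id) {
                        let _ = neighbor; // real code: insert into the priority queue
                    }
                    submitted.remove(&id);
                }
                None => break, // nothing in flight, nothing to submit: converged
            }
        }
    }
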
From 03acde71b78b00e7bc1d31976d66e6deb311065e Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 11:52:02 -0800 Subject: [PATCH 29/46] remove search tracing --- .../src/backend/disk_index/search.rs | 3 - diskann-disk/src/search/mod.rs | 1 - .../src/search/provider/pipelined_accessor.rs | 70 +---- diskann-disk/src/search/search_trace.rs | 294 ------------------ 4 files changed, 1 insertion(+), 367 deletions(-) delete mode 100644 diskann-disk/src/search/search_trace.rs diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index b0ef4a7f8..2a304e10a 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -435,8 +435,6 @@ where None, )?; - let trace_enabled = std::env::var("DISKANN_TRACE").map_or(false, |v| v == "1"); - searcher.with_pipelined_config(PipelinedConfig { beam_width: search_params.beam_width, adaptive_beam_width: *adaptive_beam_width, @@ -444,7 +442,6 @@ where node_cache, scratch_pool, scratch_args, - trace_enabled, }); let searcher = &searcher; diff --git a/diskann-disk/src/search/mod.rs b/diskann-disk/src/search/mod.rs index 3b7c98c3f..2c475e10a 100644 --- a/diskann-disk/src/search/mod.rs +++ b/diskann-disk/src/search/mod.rs @@ -9,7 +9,6 @@ pub mod provider; pub mod traits; pub(crate) mod sector_math; -pub mod search_trace; #[cfg(target_os = "linux")] pub mod pipelined; diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 99410e399..28427d306 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -35,7 +35,7 @@ use diskann_vector::DistanceFunction; use crate::data_model::Cache; use crate::search::pipelined::{PipelinedReader, PipelinedReaderConfig, MAX_IO_CONCURRENCY}; -use crate::search::search_trace::{OptionalTrace, SearchTrace, TraceEventKind}; + use crate::search::sector_math::{node_offset_in_sector, node_sector_index}; use crate::search::traits::VertexProviderFactory; use crate::utils::QueryStatistics; @@ -249,9 +249,6 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { preprocess_time: std::time::Duration, // Shared stats written on drop so caller can read them after search shared_io_stats: Arc, - - // Optional per-query trace for profiling and algorithmic comparison - trace: Option, } impl<'a, Data> PipelinedDiskAccessor<'a, Data> @@ -296,7 +293,6 @@ where cpu_time: std::time::Duration::ZERO, preprocess_time: std::time::Duration::ZERO, shared_io_stats, - trace: None, }) } @@ -317,19 +313,6 @@ where Ok(()) } - /// Enable per-query tracing. Call before search. - pub fn enable_trace(&mut self) { - self.trace = Some(SearchTrace::new()); - } - - /// Take the completed trace (if any). Call after search. - pub fn take_trace(&mut self) -> Option { - if let Some(t) = self.trace.as_mut() { - t.finish(); - } - self.trace.take() - } - /// Compute PQ distances for a set of neighbor IDs. /// `ids` must not alias any mutable scratch fields used by PQ computation. 
fn pq_distances(&mut self, ids: &[u32], mut f: F) -> ANNResult<()> @@ -378,12 +361,8 @@ where return Ok(()); } - let mut trace = OptionalTrace(self.trace.as_mut()); - let io_start = Instant::now(); - trace.begin_phase(); let completed_slots = self.scratch.reader.poll_completions()?; - trace.end_phase_io_poll(); self.io_time += io_start.elapsed(); if completed_slots.is_empty() { @@ -393,7 +372,6 @@ where Self::process_completed_ios_inner( &mut self.scratch, &completed_slots, - &mut trace, self.num_nodes_per_sector, self.node_len, self.fp_vector_len, @@ -401,11 +379,8 @@ where } /// Block until at least one IO completes, then eagerly drain all available. fn wait_and_drain(&mut self) -> ANNResult<()> { - let mut trace = OptionalTrace(self.trace.as_mut()); let io_start = Instant::now(); - trace.begin_phase(); let completed_slots = self.scratch.reader.wait_completions()?; - trace.end_phase_io_poll(); self.io_time += io_start.elapsed(); if completed_slots.is_empty() { @@ -415,7 +390,6 @@ where Self::process_completed_ios_inner( &mut self.scratch, &completed_slots, - &mut trace, self.num_nodes_per_sector, self.node_len, self.fp_vector_len, @@ -429,7 +403,6 @@ where fn process_completed_ios_inner( scratch: &mut PipelinedScratch, completed_slots: &[usize], - trace: &mut OptionalTrace<'_>, num_nodes_per_sector: u64, node_len: u64, fp_vector_len: u64, @@ -439,7 +412,6 @@ where let io = &scratch.in_flight_ios[i]; if completed_slots.contains(&io.slot_id) { let io = scratch.in_flight_ios.swap_remove_back(i).unwrap(); - trace.begin_phase(); // Acquire node first (mutably borrows node_pool), // then get sector buf (immutably borrows reader) — no conflict. let mut node = scratch.node_pool.pop().unwrap_or_else(|| LoadedNode { @@ -456,8 +428,6 @@ where fp_vector_len, io.rank, )?; - trace.end_phase_parse_node(); - trace.event(TraceEventKind::Complete { node_id: io.vertex_id }); scratch.loaded_nodes.insert(io.vertex_id, node); } else { i += 1; @@ -592,9 +562,7 @@ where /// Nodes found in the node cache are placed directly into `loaded_nodes`, /// skipping disk IO entirely. fn submit_expand(&mut self, ids: impl Iterator + Send) { - let mut trace = OptionalTrace(self.trace.as_mut()); let io_start = Instant::now(); - trace.begin_phase(); for id in ids { if self.scratch.loaded_nodes.contains_key(&id) { continue; // Already loaded from a previous IO @@ -615,7 +583,6 @@ where self.next_rank += 1; self.scratch.loaded_nodes.insert(id, node); self.cache_hits += 1; - trace.event(TraceEventKind::CacheHit { node_id: id }); continue; } @@ -638,15 +605,10 @@ where slot_id, rank, }); - trace.event(TraceEventKind::Submit { - node_id: id, - inflight: self.scratch.in_flight_ios.len(), - }); self.next_slot_id = (self.next_slot_id + 1) % self.max_slots; self.io_count += 1; } } - trace.end_phase_io_submit(); self.io_time += io_start.elapsed(); } @@ -710,9 +672,6 @@ where .provider .distance_comparer .evaluate_similarity(self.query, fp_vec); - if let Some(t) = self.trace.as_mut() { - t.profile.fp_distance_us += cpu_start.elapsed().as_micros() as u64; - } self.scratch.distance_cache.insert(vid, fp_dist); // Get unvisited neighbors into reusable buffer @@ -723,10 +682,8 @@ where .copied() .filter(|&nbr| (nbr as usize) < self.num_points && pred.eval_mut(&nbr)), ); - let num_new = self.scratch.neighbor_buf.len() as u32; if !self.scratch.neighbor_buf.is_empty() { - let pq_start = Instant::now(); let PipelinedScratch { ref mut pq_scratch, ref neighbor_buf, .. 
} = *self.scratch; Self::pq_distances_inner( pq_scratch, @@ -734,22 +691,9 @@ where neighbor_buf, &mut on_neighbors, )?; - if let Some(t) = self.trace.as_mut() { - t.profile.pq_distance_us += pq_start.elapsed().as_micros() as u64; - } } self.cpu_time += cpu_start.elapsed(); - if let Some(t) = self.trace.as_mut() { - t.record_expand(); - t.event(TraceEventKind::Expand { - node_id: vid, - fp_distance: fp_dist, - num_neighbors: node.adjacency_list.len() as u32, - num_new_candidates: num_new, - }); - } - // Return node to pool for reuse self.scratch.release_node(node); @@ -816,12 +760,6 @@ where self.shared_io_stats .preprocess_us .fetch_add(self.preprocess_time.as_micros() as u64, Ordering::Relaxed); - - // Print trace profile if enabled (controlled by DISKANN_TRACE=1) - if let Some(trace) = self.trace.as_mut() { - trace.finish(); - trace.print_profile_summary(); - } } } @@ -843,9 +781,6 @@ pub struct PipelinedConfig> { pub scratch_pool: Arc>, /// Args for retrieving/creating pooled scratch instances. pub scratch_args: PipelinedScratchArgs, - /// Enable per-query SearchTrace. The trace profile is printed to stderr - /// after each query completes. Use for profiling, not production. - pub trace_enabled: bool, } /// Shared IO statistics written by the accessor and read by the caller after search. @@ -954,9 +889,6 @@ where self.io_stats.clone(), )?; accessor.preprocess_query()?; - if self.config.trace_enabled { - accessor.enable_trace(); - } Ok(accessor) } diff --git a/diskann-disk/src/search/search_trace.rs b/diskann-disk/src/search/search_trace.rs deleted file mode 100644 index ebcb253b2..000000000 --- a/diskann-disk/src/search/search_trace.rs +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -//! Per-query search tracing and profiling for comparing PipeSearch vs UnifiedPipeSearch. -//! -//! Captures two kinds of data: -//! - **Event trace**: Ordered list of search events (submit, complete, expand, etc.) -//! for side-by-side algorithmic comparison. -//! - **Profile counters**: Cumulative time in each phase (IO poll, IO submit, expand, -//! PQ distance, queue ops, spin-wait) for identifying bottlenecks. -//! -//! Tracing is opt-in: create a `SearchTrace` and pass it to the search function. -//! When disabled (None), all operations are zero-cost. - -use std::time::Instant; - -/// A single event in the search trace. -#[derive(Debug, Clone)] -pub struct TraceEvent { - /// Microseconds since the start of the search. - pub time_us: u64, - /// The event kind. - pub kind: TraceEventKind, -} - -/// Kinds of trace events. -#[derive(Debug, Clone)] -pub enum TraceEventKind { - /// IO submitted for a node. `inflight` is the count AFTER submission. - Submit { node_id: u32, inflight: usize }, - /// IO completed for a node (data loaded from disk). - Complete { node_id: u32 }, - /// Node loaded from cache (no IO needed). - CacheHit { node_id: u32 }, - /// Node expanded: FP distance computed, neighbors discovered. - Expand { - node_id: u32, - fp_distance: f32, - num_neighbors: u32, - num_new_candidates: u32, - }, - /// Node selected from priority queue for submission. - Select { - node_id: u32, - pq_distance: f32, - queue_position: u32, - }, - /// Poll returned no completions (spin-wait iteration). - SpinWait, - /// Search terminated. - Done { - total_hops: u32, - total_ios: u32, - total_comparisons: u32, - }, -} - -/// Cumulative profiling counters for a single query. 
-#[derive(Debug, Clone, Default)] -pub struct SearchProfile { - /// Time spent polling io_uring for completions. - pub io_poll_us: u64, - /// Time spent submitting IO requests. - pub io_submit_us: u64, - /// Time spent computing full-precision distances. - pub fp_distance_us: u64, - /// Time spent computing PQ distances for neighbors. - pub pq_distance_us: u64, - /// Time spent on priority queue operations (insert, closest_notvisited). - pub queue_ops_us: u64, - /// Time spent in spin-wait (nothing to submit or expand). - pub spin_wait_us: u64, - /// Time spent parsing nodes from sector buffers. - pub parse_node_us: u64, - /// Number of spin-wait iterations. - pub spin_wait_count: u64, - /// Number of IO poll calls. - pub poll_count: u64, - /// Number of IO submit calls. - pub submit_count: u64, - /// Number of nodes expanded. - pub expand_count: u64, - /// Total search wall time. - pub total_us: u64, -} - -/// Per-query search trace collector. -/// -/// Create one per query, pass to search functions. After search completes, -/// inspect `events` and `profile` for analysis. -pub struct SearchTrace { - start: Instant, - pub events: Vec, - pub profile: SearchProfile, - phase_start: Option, -} - -impl SearchTrace { - pub fn new() -> Self { - Self { - start: Instant::now(), - events: Vec::with_capacity(256), - profile: SearchProfile::default(), - phase_start: None, - } - } - - /// Record a trace event with the current timestamp. - #[inline] - pub fn event(&mut self, kind: TraceEventKind) { - let time_us = self.start.elapsed().as_micros() as u64; - self.events.push(TraceEvent { time_us, kind }); - } - - /// Start timing a phase. Call `end_phase_*` to accumulate the duration. - #[inline] - pub fn begin_phase(&mut self) { - self.phase_start = Some(Instant::now()); - } - - /// End the current phase and add elapsed time to `io_poll_us`. - #[inline] - pub fn end_phase_io_poll(&mut self) { - if let Some(start) = self.phase_start.take() { - self.profile.io_poll_us += start.elapsed().as_micros() as u64; - self.profile.poll_count += 1; - } - } - - #[inline] - pub fn end_phase_io_submit(&mut self) { - if let Some(start) = self.phase_start.take() { - self.profile.io_submit_us += start.elapsed().as_micros() as u64; - self.profile.submit_count += 1; - } - } - - #[inline] - pub fn end_phase_fp_distance(&mut self) { - if let Some(start) = self.phase_start.take() { - self.profile.fp_distance_us += start.elapsed().as_micros() as u64; - } - } - - #[inline] - pub fn end_phase_pq_distance(&mut self) { - if let Some(start) = self.phase_start.take() { - self.profile.pq_distance_us += start.elapsed().as_micros() as u64; - } - } - - #[inline] - pub fn end_phase_queue_ops(&mut self) { - if let Some(start) = self.phase_start.take() { - self.profile.queue_ops_us += start.elapsed().as_micros() as u64; - } - } - - #[inline] - pub fn end_phase_spin_wait(&mut self) { - if let Some(start) = self.phase_start.take() { - self.profile.spin_wait_us += start.elapsed().as_micros() as u64; - self.profile.spin_wait_count += 1; - } - } - - #[inline] - pub fn end_phase_parse_node(&mut self) { - if let Some(start) = self.phase_start.take() { - self.profile.parse_node_us += start.elapsed().as_micros() as u64; - } - } - - #[inline] - pub fn record_expand(&mut self) { - self.profile.expand_count += 1; - } - - /// Finalize the trace, recording total wall time. - pub fn finish(&mut self) { - self.profile.total_us = self.start.elapsed().as_micros() as u64; - } - - /// Print a summary of the profile to stderr (for debugging). 
- pub fn print_profile_summary(&self) { - let p = &self.profile; - let accounted = p.io_poll_us + p.io_submit_us + p.fp_distance_us - + p.pq_distance_us + p.queue_ops_us + p.spin_wait_us + p.parse_node_us; - let other = p.total_us.saturating_sub(accounted); - eprintln!( - "Profile: total={}us io_poll={}us({}) io_submit={}us({}) \ - fp_dist={}us pq_dist={}us queue={}us spin={}us({}) parse={}us other={}us | \ - expands={} polls={} submits={}", - p.total_us, - p.io_poll_us, p.poll_count, - p.io_submit_us, p.submit_count, - p.fp_distance_us, - p.pq_distance_us, - p.queue_ops_us, - p.spin_wait_us, p.spin_wait_count, - p.parse_node_us, - other, - p.expand_count, p.poll_count, p.submit_count, - ); - } - - /// Print the first N events to stderr (for debugging). - pub fn print_events(&self, max: usize) { - for (i, ev) in self.events.iter().enumerate().take(max) { - eprintln!(" [{:>4}] @{:>6}us {:?}", i, ev.time_us, ev.kind); - } - if self.events.len() > max { - eprintln!(" ... ({} more events)", self.events.len() - max); - } - } -} - -/// Optional trace wrapper — all methods are no-ops when None. -/// This avoids polluting call sites with `if let Some(trace) = ...`. -pub struct OptionalTrace<'a>(pub Option<&'a mut SearchTrace>); - -impl<'a> OptionalTrace<'a> { - #[inline] - pub fn event(&mut self, kind: TraceEventKind) { - if let Some(t) = self.0.as_mut() { - t.event(kind); - } - } - - #[inline] - pub fn begin_phase(&mut self) { - if let Some(t) = self.0.as_mut() { - t.begin_phase(); - } - } - - #[inline] - pub fn end_phase_io_poll(&mut self) { - if let Some(t) = self.0.as_mut() { - t.end_phase_io_poll(); - } - } - - #[inline] - pub fn end_phase_io_submit(&mut self) { - if let Some(t) = self.0.as_mut() { - t.end_phase_io_submit(); - } - } - - #[inline] - pub fn end_phase_fp_distance(&mut self) { - if let Some(t) = self.0.as_mut() { - t.end_phase_fp_distance(); - } - } - - #[inline] - pub fn end_phase_pq_distance(&mut self) { - if let Some(t) = self.0.as_mut() { - t.end_phase_pq_distance(); - } - } - - #[inline] - pub fn end_phase_queue_ops(&mut self) { - if let Some(t) = self.0.as_mut() { - t.end_phase_queue_ops(); - } - } - - #[inline] - pub fn end_phase_spin_wait(&mut self) { - if let Some(t) = self.0.as_mut() { - t.end_phase_spin_wait(); - } - } - - #[inline] - pub fn end_phase_parse_node(&mut self) { - if let Some(t) = self.0.as_mut() { - t.end_phase_parse_node(); - } - } - - #[inline] - pub fn record_expand(&mut self) { - if let Some(t) = self.0.as_mut() { - t.record_expand(); - } - } -} From 948c5c95e6a6447d032cdc7baf31f9d52fc16b82 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 12:14:24 -0800 Subject: [PATCH 30/46] cleanup unused --- diskann/src/graph/index.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 1e71596da..5cc7f3725 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2111,9 +2111,6 @@ where } let mut neighbors = Vec::with_capacity(self.max_degree_with_slack()); - // Tracks how many nodes were expanded last iteration, so the - // pipelined submit can match its rate (process-N-submit-N). - let mut last_expanded: usize = 0; // Tracks speculatively submitted (but not yet visited/expanded) nodes // so the pipelined path can decouple submission from visitation. 
let mut submitted = std::collections::HashSet::::new(); @@ -2147,7 +2144,6 @@ where |distance, id| neighbors.push(Neighbor::new(id, distance)), ) .await?; - last_expanded = expanded; for &id in accessor.last_expanded_ids() { scratch.best.mark_visited_by_id(&id); @@ -2226,7 +2222,6 @@ where |distance, id| neighbors.push(Neighbor::new(id, distance)), ) .await?; - last_expanded = expanded; for &id in accessor.last_expanded_ids() { scratch.best.mark_visited_by_id(&id); From 61c80490ba9ddec44b99953abe66eebde1b68e6f Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 12:29:59 -0800 Subject: [PATCH 31/46] Waste-based adaptive beam width matching PipeANN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace unconditional beam growth with PipeANN's waste ratio tracking: - Start at initial_beam_width (default 4), grow to beam_width - Track useful vs wasted IOs after 5-hop convergence gate - Grow beam +1 only when waste ratio ≤ 10% - initial_beam_width configurable via SearchParams At BW=8: +43% QPS at L=100 vs BW=4 with minimal IO increase. --- diskann/src/graph/index.rs | 47 ++++++++++++++++++++++++++++++++++---- diskann/src/graph/misc.rs | 4 ++++ 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 5cc7f3725..1445ba923 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2085,12 +2085,16 @@ where async move { let beam_width = search_params.beam_width.unwrap_or(1); - // Adaptive beam width: start smaller and grow based on convergence + // Adaptive beam width: start at initial_beam_width and grow based on + // IO waste ratio. Mirrors PipeANN: grow +1 when ≤10% of IOs are wasted + // (expanded node no longer in top-L). Only kicks in after 5 hops. 
let mut cur_beam_width = if search_params.adaptive_beam_width { - beam_width.min(4) + beam_width.min(search_params.initial_beam_width) } else { beam_width }; + let mut abw_useful: u32 = 0; // IOs whose expansion was still useful + let mut abw_total: u32 = 0; // total IOs tracked for waste ratio // Relaxed monotonicity: continue exploring after convergence let mut converge_size: Option = None; @@ -2151,15 +2155,32 @@ where } // Step 2: Insert neighbors (updates queue before IO decision) + let worst_before = { + let sz = scratch.best.size().min(scratch.best.search_l()); + if sz > 0 { scratch.best.get(sz - 1).distance } else { f32::MAX } + }; neighbors .iter() .for_each(|neighbor| scratch.best.insert(*neighbor)); scratch.cmps += neighbors.len() as u32; scratch.hops += expanded as u32; - // Adaptive beam width + // Adaptive beam width: track IO waste after convergence gate if search_params.adaptive_beam_width && expanded > 0 { - cur_beam_width = (cur_beam_width + 1).max(4).min(beam_width); + if scratch.hops >= 5 { + let improved = neighbors.iter().any(|n| n.distance < worst_before); + abw_total += 1; + if improved { + abw_useful += 1; + } + // Grow when ≤10% waste (matching PipeANN's kWasteThreshold) + if abw_total > 0 + && (abw_total - abw_useful) as f64 / abw_total as f64 <= 0.1 + { + cur_beam_width = + (cur_beam_width + 1).max(search_params.initial_beam_width).min(beam_width); + } + } } // Step 3: Submit one IO (with updated queue) @@ -2228,6 +2249,10 @@ where submitted.remove(&id); } + let worst_before = { + let sz = scratch.best.size().min(scratch.best.search_l()); + if sz > 0 { scratch.best.get(sz - 1).distance } else { f32::MAX } + }; neighbors .iter() .for_each(|neighbor| scratch.best.insert(*neighbor)); @@ -2235,7 +2260,19 @@ where scratch.hops += expanded as u32; if search_params.adaptive_beam_width && expanded > 0 { - cur_beam_width = (cur_beam_width + 1).max(4).min(beam_width); + if scratch.hops >= 5 { + let improved = neighbors.iter().any(|n| n.distance < worst_before); + abw_total += 1; + if improved { + abw_useful += 1; + } + if abw_total > 0 + && (abw_total - abw_useful) as f64 / abw_total as f64 <= 0.1 + { + cur_beam_width = + (cur_beam_width + 1).max(search_params.initial_beam_width).min(beam_width); + } + } } } diff --git a/diskann/src/graph/misc.rs b/diskann/src/graph/misc.rs index 95bf93363..ca0641641 100644 --- a/diskann/src/graph/misc.rs +++ b/diskann/src/graph/misc.rs @@ -43,6 +43,9 @@ pub struct SearchParams { pub beam_width: Option, /// Enable adaptive beam width based on waste ratio tracking. pub adaptive_beam_width: bool, + /// Starting beam width when adaptive_beam_width is true. + /// Defaults to 4 (matching PipeANN). Grows up to beam_width. + pub initial_beam_width: usize, /// Optional relaxed monotonicity parameter. 
pub relaxed_monotonicity_l: Option, } @@ -85,6 +88,7 @@ impl SearchParams { l_value, beam_width, adaptive_beam_width: false, + initial_beam_width: 4, relaxed_monotonicity_l: None, }) } From b5b2eeaf939d44ad20ad43044df49af6d8ab1291 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 12:53:51 -0800 Subject: [PATCH 32/46] Make ABW convergence gate configurable (abw_convergence_hops, default 5) --- diskann/src/graph/index.rs | 5 ++--- diskann/src/graph/misc.rs | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 1445ba923..65e17f8a0 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2165,9 +2165,8 @@ where scratch.cmps += neighbors.len() as u32; scratch.hops += expanded as u32; - // Adaptive beam width: track IO waste after convergence gate if search_params.adaptive_beam_width && expanded > 0 { - if scratch.hops >= 5 { + if scratch.hops >= search_params.abw_convergence_hops { let improved = neighbors.iter().any(|n| n.distance < worst_before); abw_total += 1; if improved { @@ -2260,7 +2259,7 @@ where scratch.hops += expanded as u32; if search_params.adaptive_beam_width && expanded > 0 { - if scratch.hops >= 5 { + if scratch.hops >= search_params.abw_convergence_hops { let improved = neighbors.iter().any(|n| n.distance < worst_before); abw_total += 1; if improved { diff --git a/diskann/src/graph/misc.rs b/diskann/src/graph/misc.rs index ca0641641..df2670f64 100644 --- a/diskann/src/graph/misc.rs +++ b/diskann/src/graph/misc.rs @@ -46,6 +46,9 @@ pub struct SearchParams { /// Starting beam width when adaptive_beam_width is true. /// Defaults to 4 (matching PipeANN). Grows up to beam_width. pub initial_beam_width: usize, + /// Number of hops before adaptive beam width starts tracking waste. + /// Defaults to 5 (matching PipeANN's max_marker convergence gate). + pub abw_convergence_hops: u32, /// Optional relaxed monotonicity parameter. pub relaxed_monotonicity_l: Option, } @@ -89,6 +92,7 @@ impl SearchParams { beam_width, adaptive_beam_width: false, initial_beam_width: 4, + abw_convergence_hops: 5, relaxed_monotonicity_l: None, }) } From b538f74f14c85c46e5c52537255984fc719ff1d6 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 12:54:50 -0800 Subject: [PATCH 33/46] Fix relaxed monotonicity to use max_marker convergence matching PipeANN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add peek_best_unsubmitted_with_position to track queue depth (max_marker) - Both ABW waste tracking and RM convergence now gate on max_marker reaching abw_convergence_depth (default 5), matching PipeANN's max_marker >= 5 convergence gate - Rename abw_convergence_hops → abw_convergence_depth (it's queue depth) - RM counter uses hops (node expansions) matching PipeANN semantics --- diskann/src/graph/index.rs | 34 ++++++++++++++++++++++++---------- diskann/src/graph/misc.rs | 10 ++++++---- diskann/src/neighbor/queue.rs | 26 +++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 15 deletions(-) diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 65e17f8a0..9102f9473 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2095,6 +2095,11 @@ where }; let mut abw_useful: u32 = 0; // IOs whose expansion was still useful let mut abw_total: u32 = 0; // total IOs tracked for waste ratio + // Tracks the deepest position in the sorted queue where we found an + // unsubmitted candidate. 
Mirrors PipeANN's max_marker — when this + // reaches the convergence gate, the search has explored past initial + // warmup and convergence-dependent features (ABW, RM) activate. + let mut max_marker: usize = 0; // Relaxed monotonicity: continue exploring after convergence let mut converge_size: Option = None; @@ -2166,7 +2171,7 @@ where scratch.hops += expanded as u32; if search_params.adaptive_beam_width && expanded > 0 { - if scratch.hops >= search_params.abw_convergence_hops { + if max_marker >= search_params.abw_convergence_depth { let improved = neighbors.iter().any(|n| n.distance < worst_before); abw_total += 1; if improved { @@ -2186,9 +2191,10 @@ where let inflight = accessor.inflight_count(); if inflight < cur_beam_width { scratch.beam_nodes.clear(); - if let Some(closest_node) = - scratch.best.peek_best_unsubmitted(&submitted) + if let Some((pos, closest_node)) = + scratch.best.peek_best_unsubmitted_with_position(&submitted) { + max_marker = max_marker.max(pos); search_record.record(closest_node, scratch.hops, scratch.cmps); submitted.insert(closest_node.id); scratch.beam_nodes.push(closest_node.id); @@ -2210,9 +2216,10 @@ where scratch.beam_nodes.clear(); if pipelining { while scratch.beam_nodes.len() < submit_limit { - if let Some(closest_node) = - scratch.best.peek_best_unsubmitted(&submitted) + if let Some((pos, closest_node)) = + scratch.best.peek_best_unsubmitted_with_position(&submitted) { + max_marker = max_marker.max(pos); search_record.record(closest_node, scratch.hops, scratch.cmps); submitted.insert(closest_node.id); scratch.beam_nodes.push(closest_node.id); @@ -2259,7 +2266,7 @@ where scratch.hops += expanded as u32; if search_params.adaptive_beam_width && expanded > 0 { - if scratch.hops >= search_params.abw_convergence_hops { + if max_marker >= search_params.abw_convergence_depth { let improved = neighbors.iter().any(|n| n.distance < worst_before); abw_total += 1; if improved { @@ -2275,14 +2282,21 @@ where } } - // Relaxed monotonicity: detect convergence and extend search + // Relaxed monotonicity: detect convergence and extend search. + // Convergence is detected when max_marker reaches the convergence + // depth — meaning the best unsubmitted candidate is deep enough in + // the sorted queue that the top candidates have been explored. + // After convergence, the search continues for rm_l additional node + // expansions to improve recall beyond the greedy optimum. if let Some(rm_l) = search_params.relaxed_monotonicity_l { if rm_l > 0 { - if !scratch.best.has_notvisited_node() && converge_size.is_none() { - converge_size = Some(scratch.cmps as usize); + if max_marker >= search_params.abw_convergence_depth + && converge_size.is_none() + { + converge_size = Some(scratch.hops as usize); } if let Some(cs) = converge_size { - if (scratch.cmps as usize) >= cs + rm_l { + if (scratch.hops as usize) >= cs + rm_l { break; } } diff --git a/diskann/src/graph/misc.rs b/diskann/src/graph/misc.rs index df2670f64..1829cb1f9 100644 --- a/diskann/src/graph/misc.rs +++ b/diskann/src/graph/misc.rs @@ -46,9 +46,11 @@ pub struct SearchParams { /// Starting beam width when adaptive_beam_width is true. /// Defaults to 4 (matching PipeANN). Grows up to beam_width. pub initial_beam_width: usize, - /// Number of hops before adaptive beam width starts tracking waste. - /// Defaults to 5 (matching PipeANN's max_marker convergence gate). - pub abw_convergence_hops: u32, + /// Queue depth threshold before adaptive beam width and relaxed monotonicity + /// activate. 
Defaults to 5 (matching PipeANN's max_marker convergence gate). + /// When the best unsubmitted candidate is at position ≥ this value in the + /// sorted queue, the search is considered past initial warmup. + pub abw_convergence_depth: usize, /// Optional relaxed monotonicity parameter. pub relaxed_monotonicity_l: Option, } @@ -92,7 +94,7 @@ impl SearchParams { beam_width, adaptive_beam_width: false, initial_beam_width: 4, - abw_convergence_hops: 5, + abw_convergence_depth: 5, relaxed_monotonicity_l: None, }) } diff --git a/diskann/src/neighbor/queue.rs b/diskann/src/neighbor/queue.rs index ec1148fd0..9b5e8b635 100644 --- a/diskann/src/neighbor/queue.rs +++ b/diskann/src/neighbor/queue.rs @@ -67,6 +67,15 @@ pub trait NeighborQueue: std::fmt::Debug + Send None } + /// Like `peek_best_unsubmitted`, but also returns the queue position (0-indexed). + /// The position indicates how deep into the sorted queue we had to search. + fn peek_best_unsubmitted_with_position( + &self, + submitted: &HashSet, + ) -> Option<(usize, Neighbor)> { + self.peek_best_unsubmitted(submitted).map(|n| (0, n)) + } + /// Find the node with matching `id`, mark it visited, and advance the cursor if needed. /// Returns true if found and marked, false otherwise. fn mark_visited_by_id(&mut self, _id: &I) -> bool { @@ -502,11 +511,19 @@ impl NeighborPriorityQueue { /// Return the first node that is not visited and not in `submitted`, /// scanning positions 0..min(size, search_param_l). Does not modify any state. pub fn peek_best_unsubmitted(&self, submitted: &HashSet) -> Option> { + self.peek_best_unsubmitted_with_position(submitted) + .map(|(_, n)| n) + } + + pub fn peek_best_unsubmitted_with_position( + &self, + submitted: &HashSet, + ) -> Option<(usize, Neighbor)> { let limit = self.search_param_l.min(self.size); for i in 0..limit { let (id, visited) = self.id_visiteds[i]; if !visited && !submitted.contains(&id) { - return Some(Neighbor::new(id, self.distances[i])); + return Some((i, Neighbor::new(id, self.distances[i]))); } } None @@ -579,6 +596,13 @@ impl NeighborQueue for NeighborPriorityQueue< self.peek_best_unsubmitted(submitted) } + fn peek_best_unsubmitted_with_position( + &self, + submitted: &HashSet, + ) -> Option<(usize, Neighbor)> { + self.peek_best_unsubmitted_with_position(submitted) + } + fn mark_visited_by_id(&mut self, id: &I) -> bool { self.mark_visited_by_id(id) } From 3cd317d32f0392aff72979b5f42cd4c5783a13c6 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 14:20:00 -0800 Subject: [PATCH 34/46] Expose ABW and relaxed monotonicity for BeamSearch - Change expand_available return type from usize to Vec, eliminating last_expanded_ids() from the ExpandBeam trait entirely. Both pipelined and non-pipelined paths now uniformly iterate the returned IDs. - Add search_with_params() to DiskIndexSearcher for full SearchParams control. - Add adaptive_beam_width and relaxed_monotonicity_l fields to SearchMode::BeamSearch in the benchmark config. - ABW with BW=8 gives +21% QPS for BeamSearch at L=100 with same recall. 
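
Usage sketch (illustrative values only; assumes the builder methods and
signature introduced in the diffs below — with_adaptive_beam_width,
with_relaxed_monotonicity, search_with_params — called from a function
that returns ANNResult):

    // k = 10, L = 100, beam width cap = 8
    let sp = SearchParams::new(10, 100, Some(8))?
        .with_adaptive_beam_width()       // start at initial_beam_width (default 4)
        .with_relaxed_monotonicity(50);   // 50 extra expansions past convergence
    let result = searcher.search_with_params(&query, &sp, None, false)?;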
--- .../src/backend/disk_index/search.rs | 25 ++++-- diskann-benchmark/src/inputs/disk.rs | 43 ++++++++-- .../src/search/provider/disk_provider.rs | 78 +++++++++++++++++++ .../src/search/provider/pipelined_accessor.rs | 12 +-- diskann/src/graph/glue.rs | 18 +---- diskann/src/graph/index.rs | 46 +++++------ 6 files changed, 162 insertions(+), 60 deletions(-) diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 2a304e10a..0af6051cd 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -9,7 +9,7 @@ use std::{collections::HashSet, fmt, sync::atomic::AtomicBool, sync::Arc, time:: use opentelemetry::{global, trace::Span, trace::Tracer}; use opentelemetry_sdk::trace::SdkTracerProvider; -use diskann::{utils::VectorRepr, ANNResult}; +use diskann::{graph::SearchParams, utils::VectorRepr, ANNResult}; use diskann_benchmark_runner::{files::InputFile, utils::MicroSeconds}; use diskann_disk::{ data_model::CachingStrategy, @@ -319,7 +319,7 @@ where let has_any_search_failed = AtomicBool::new(false); match &search_params.search_mode { - SearchMode::BeamSearch => { + SearchMode::BeamSearch { adaptive_beam_width, relaxed_monotonicity_l } => { let searcher = &DiskIndexSearcher::, _>::new( search_params.num_threads, search_params.search_io_limit.unwrap_or(usize::MAX), @@ -329,6 +329,9 @@ where None, )?; + let abw = *adaptive_beam_width; + let rm_l = *relaxed_monotonicity_l; + logger.log_checkpoint("index_loaded"); search_results_per_l = run_search_loop( @@ -358,12 +361,22 @@ where as Box bool + Send + Sync>) }; + let mut sp = SearchParams::new( + search_params.recall_at as usize, + l as usize, + Some(search_params.beam_width), + ).unwrap(); + if abw { + sp = sp.with_adaptive_beam_width(); + } + if let Some(rm) = rm_l { + sp = sp.with_relaxed_monotonicity(rm); + } + write_query_result( - searcher.search( + searcher.search_with_params( q, - search_params.recall_at, - l, - Some(search_params.beam_width), + &sp, vector_filter, search_params.is_flat_search, ), diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs index d481297e4..662364f8a 100644 --- a/diskann-benchmark/src/inputs/disk.rs +++ b/diskann-benchmark/src/inputs/disk.rs @@ -72,12 +72,18 @@ pub(crate) struct DiskIndexBuild { } /// Search algorithm to use for disk index search. -#[derive(Debug, Serialize, Deserialize, Clone, Default)] +#[derive(Debug, Serialize, Deserialize, Clone)] #[serde(tag = "mode")] pub(crate) enum SearchMode { /// Standard beam search (default, current behavior). - #[default] - BeamSearch, + BeamSearch { + /// Start with a smaller beam and grow adaptively. Defaults to false. + #[serde(default)] + adaptive_beam_width: bool, + /// Optional relaxed monotonicity parameter for early termination. + #[serde(default)] + relaxed_monotonicity_l: Option, + }, /// Pipelined search through the generic search loop (queue-based ExpandBeam). /// Overlaps IO and compute using io_uring on Linux. 
#[serde(alias = "UnifiedPipeSearch")] @@ -94,10 +100,37 @@ pub(crate) enum SearchMode { }, } +impl Default for SearchMode { + fn default() -> Self { + SearchMode::BeamSearch { + adaptive_beam_width: false, + relaxed_monotonicity_l: None, + } + } +} + impl fmt::Display for SearchMode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - SearchMode::BeamSearch => write!(f, "BeamSearch"), + SearchMode::BeamSearch { adaptive_beam_width, relaxed_monotonicity_l } => { + write!(f, "BeamSearch")?; + let has_abw = *adaptive_beam_width; + let has_rm = relaxed_monotonicity_l.is_some(); + if has_abw || has_rm { + write!(f, "(")?; + let mut first = true; + if has_abw { + write!(f, "abw")?; + first = false; + } + if let Some(rm) = relaxed_monotonicity_l { + if !first { write!(f, ", ")?; } + write!(f, "rm_l={}", rm)?; + } + write!(f, ")")?; + } + Ok(()) + } SearchMode::PipeSearch { adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms } => { write!(f, "PipeSearch")?; let has_abw = *adaptive_beam_width; @@ -298,7 +331,7 @@ impl CheckDeserialization for DiskSearchPhase { } } match &self.search_mode { - SearchMode::BeamSearch => {} + SearchMode::BeamSearch { .. } => {} SearchMode::PipeSearch { .. } => {} } Ok(()) diff --git a/diskann-disk/src/search/provider/disk_provider.rs b/diskann-disk/src/search/provider/disk_provider.rs index 38a1579a8..f55d1c8ee 100644 --- a/diskann-disk/src/search/provider/disk_provider.rs +++ b/diskann-disk/src/search/provider/disk_provider.rs @@ -965,6 +965,84 @@ where Ok(search_result) } + /// Perform a search with explicit [`SearchParams`] for full control over + /// adaptive beam width, relaxed monotonicity, etc. + pub fn search_with_params( + &self, + query: &[Data::VectorDataType], + search_params: &SearchParams, + vector_filter: Option>, + is_flat_search: bool, + ) -> ANNResult> { + let k_value = search_params.k_value; + let mut query_stats = QueryStatistics::default(); + let mut indices = vec![0u32; k_value]; + let mut distances = vec![0f32; k_value]; + let mut associated_data = vec![Data::AssociatedDataType::default(); k_value]; + + let mut result_output_buffer = search_output_buffer::IdDistanceAssociatedData::new( + &mut indices[..k_value], + &mut distances[..k_value], + &mut associated_data[..k_value], + ); + + let filter = vector_filter.unwrap_or(default_vector_filter::()); + let strategy = self.search_strategy(query, &filter); + let timer = Instant::now(); + let stats = if is_flat_search { + self.runtime.block_on(self.index.flat_search( + &strategy, + &DefaultContext, + strategy.query, + &filter, + search_params, + &mut result_output_buffer, + ))? + } else { + self.runtime.block_on(self.index.search( + &strategy, + &DefaultContext, + strategy.query, + search_params, + &mut result_output_buffer, + ))? 
+ }; + query_stats.total_comparisons = stats.cmps; + query_stats.search_hops = stats.hops; + query_stats.total_execution_time_us = timer.elapsed().as_micros(); + query_stats.io_time_us = IOTracker::time(&strategy.io_tracker.io_time_us) as u128; + query_stats.total_io_operations = strategy.io_tracker.io_count() as u32; + query_stats.total_vertices_loaded = strategy.io_tracker.io_count() as u32; + query_stats.query_pq_preprocess_time_us = + IOTracker::time(&strategy.io_tracker.preprocess_time_us) as u128; + query_stats.cpu_time_us = query_stats.total_execution_time_us + - query_stats.io_time_us + - query_stats.query_pq_preprocess_time_us; + + let mut search_result = SearchResult { + results: Vec::with_capacity(k_value), + stats: SearchResultStats { + cmps: query_stats.total_comparisons, + result_count: stats.result_count, + query_statistics: query_stats, + }, + }; + + for ((vertex_id, distance), associated_data) in indices + .into_iter() + .zip(distances.into_iter()) + .zip(associated_data.into_iter()) + { + search_result.results.push(SearchResultItem { + vertex_id, + distance, + data: associated_data, + }); + } + + Ok(search_result) + } + /// Perform a raw search on the disk index. /// This is a lower-level API that allows more control over the search parameters and output buffers. #[allow(clippy::too_many_arguments)] diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 28427d306..eaab02f4c 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -624,7 +624,7 @@ where _computer: &Self::QueryComputer, mut pred: P, mut on_neighbors: F, - ) -> impl std::future::Future> + Send + ) -> impl std::future::Future>> + Send where P: HybridPredicate + Send + Sync, F: FnMut(f32, Self::Id) + Send, @@ -636,7 +636,7 @@ where self.drain_completions()?; if self.scratch.loaded_nodes.is_empty() { - return Ok(0); + return Ok(Vec::new()); } // Try caller's priority order first @@ -660,7 +660,7 @@ where let vid = match best_vid { Some(id) => id, - None => return Ok(0), + None => return Ok(Vec::new()), }; let node = self.scratch.loaded_nodes.remove(&vid).unwrap(); self.scratch.expanded_ids.push(vid); @@ -697,7 +697,7 @@ where // Return node to pool for reuse self.scratch.release_node(node); - Ok(1) + Ok(self.scratch.expanded_ids.clone()) } } @@ -717,10 +717,6 @@ where } } - fn last_expanded_ids(&self) -> &[u32] { - &self.scratch.expanded_ids - } - fn is_pipelined(&self) -> bool { true } diff --git a/diskann/src/graph/glue.rs b/diskann/src/graph/glue.rs index 51ffba6ab..c0db71859 100644 --- a/diskann/src/graph/glue.rs +++ b/diskann/src/graph/glue.rs @@ -277,24 +277,23 @@ where /// completed IO operations and expands only the nodes whose data has arrived, /// returning immediately without blocking. /// - /// Returns the number of nodes that were expanded in this call. + /// Returns the IDs of nodes that were actually expanded in this call. 
fn expand_available( &mut self, ids: impl Iterator + Send, computer: &Self::QueryComputer, pred: P, on_neighbors: F, - ) -> impl std::future::Future> + Send + ) -> impl std::future::Future>> + Send where P: HybridPredicate + Send + Sync, F: FnMut(f32, Self::Id) + Send, { async move { let id_vec: Vec = ids.collect(); - let count = id_vec.len(); - self.expand_beam(id_vec.into_iter(), computer, pred, on_neighbors) + self.expand_beam(id_vec.iter().copied(), computer, pred, on_neighbors) .await?; - Ok(count) + Ok(id_vec) } } @@ -333,15 +332,6 @@ where false } - /// Return the IDs of nodes expanded in the most recent `expand_available` call. - /// - /// The search loop uses this to mark speculatively submitted nodes as visited - /// only after they have actually been expanded. Non-pipelined providers return - /// an empty slice (they mark visited at selection time). - fn last_expanded_ids(&self) -> &[Self::Id] { - &[] - } - /// Expand all `ids` synchronously: load data, get neighbors, compute distances. /// /// This is the original single-shot expansion method. For non-pipelined providers, diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 9102f9473..e23ceaa40 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2145,7 +2145,7 @@ where // Step 1: Expand one loaded node (polls internally). // Pass empty iterator — the accessor picks by rank. neighbors.clear(); - let expanded = accessor + let expanded_ids = accessor .expand_available( std::iter::empty(), computer, @@ -2154,7 +2154,7 @@ where ) .await?; - for &id in accessor.last_expanded_ids() { + for &id in &expanded_ids { scratch.best.mark_visited_by_id(&id); submitted.remove(&id); } @@ -2168,9 +2168,9 @@ where .iter() .for_each(|neighbor| scratch.best.insert(*neighbor)); scratch.cmps += neighbors.len() as u32; - scratch.hops += expanded as u32; + scratch.hops += expanded_ids.len() as u32; - if search_params.adaptive_beam_width && expanded > 0 { + if search_params.adaptive_beam_width && !expanded_ids.is_empty() { if max_marker >= search_params.abw_convergence_depth { let improved = neighbors.iter().any(|n| n.distance < worst_before); abw_total += 1; @@ -2203,45 +2203,36 @@ where } // Block if truly idle - if expanded == 0 && has_pending { + if expanded_ids.is_empty() && has_pending { let inflight = accessor.inflight_count(); if inflight > 0 { accessor.wait_for_io(); } } } else { - // Non-pipelined path OR initial burst (has_pending=false) + // Non-pipelined path OR initial burst (has_pending=false). + // Both pipelined and non-pipelined use the same node selection + // to track max_marker for ABW/RM convergence detection. 
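+ // Worked example of the waste gate above, with illustrative numbers:
+ // once max_marker >= abw_convergence_depth, suppose abw_total = 20
+ // expansions of which abw_useful = 19 improved on worst_before; waste
+ // is (20 - 19) / 20 = 0.05 <= 0.1, so cur_beam_width grows by one,
+ // floored at initial_beam_width and capped at beam_width.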
let submit_limit = if has_pending { 0 } else { cur_beam_width }; scratch.beam_nodes.clear(); - if pipelining { - while scratch.beam_nodes.len() < submit_limit { - if let Some((pos, closest_node)) = - scratch.best.peek_best_unsubmitted_with_position(&submitted) - { - max_marker = max_marker.max(pos); - search_record.record(closest_node, scratch.hops, scratch.cmps); - submitted.insert(closest_node.id); - scratch.beam_nodes.push(closest_node.id); - } else { - break; - } - } - } else { - while scratch.best.has_notvisited_node() - && scratch.beam_nodes.len() < submit_limit + while scratch.beam_nodes.len() < submit_limit { + if let Some((pos, closest_node)) = + scratch.best.peek_best_unsubmitted_with_position(&submitted) { - let closest_node = scratch.best.closest_notvisited(); + max_marker = max_marker.max(pos); search_record.record(closest_node, scratch.hops, scratch.cmps); submitted.insert(closest_node.id); scratch.beam_nodes.push(closest_node.id); + } else { + break; } } accessor.submit_expand(scratch.beam_nodes.iter().copied()); neighbors.clear(); - let expanded = accessor + let expanded_ids = accessor .expand_available( scratch.beam_nodes.iter().copied(), computer, @@ -2250,7 +2241,8 @@ where ) .await?; - for &id in accessor.last_expanded_ids() { + // Mark expanded nodes visited. + for &id in &expanded_ids { scratch.best.mark_visited_by_id(&id); submitted.remove(&id); } @@ -2263,9 +2255,9 @@ where .iter() .for_each(|neighbor| scratch.best.insert(*neighbor)); scratch.cmps += neighbors.len() as u32; - scratch.hops += expanded as u32; + scratch.hops += expanded_ids.len() as u32; - if search_params.adaptive_beam_width && expanded > 0 { + if search_params.adaptive_beam_width && !expanded_ids.is_empty() { if max_marker >= search_params.abw_convergence_depth { let improved = neighbors.iter().any(|n| n.distance < worst_before); abw_total += 1; From 91ccb5208161bfb0fc691d25d7a19e5c2533f175 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 14:26:19 -0800 Subject: [PATCH 35/46] Add ABW and RM variants to sift1m benchmark script - Add --abw and --rm-l N flags to enable adaptive beam width and relaxed monotonicity variants for both BeamSearch and PipeSearch. - Dynamic mode generation: baseline (2 modes), +ABW adds 2 more, +RM adds 2 more. - Charts and CSV now handle arbitrary number of modes with distinct colors/markers. - Plot title includes ABW/RM parameters when enabled. - Switch search_mode JSON output from Debug to Display format for cleaner labels (e.g. 'BeamSearch(abw, rm_l=200)' instead of 'BeamSearch { adaptive_beam_width: true, ... }'). --- diskann-benchmark/scripts/sift1m_benchmark.sh | 249 +++++++++++------- .../src/backend/disk_index/search.rs | 2 +- 2 files changed, 153 insertions(+), 98 deletions(-) diff --git a/diskann-benchmark/scripts/sift1m_benchmark.sh b/diskann-benchmark/scripts/sift1m_benchmark.sh index 7ea01b9ae..c37ad148a 100755 --- a/diskann-benchmark/scripts/sift1m_benchmark.sh +++ b/diskann-benchmark/scripts/sift1m_benchmark.sh @@ -2,7 +2,8 @@ # SIFT1M Pipelined Search Benchmark # # Downloads SIFT1M dataset, builds a disk index, and runs an ablation -# benchmark comparing BeamSearch vs PipeSearch (io_uring pipelining). +# benchmark comparing BeamSearch vs PipeSearch (io_uring pipelining) +# with optional adaptive beam width (ABW) and relaxed monotonicity (RM). # # By default, sweeps thread counts from 1 to max_threads in strides of 4 # and produces charts (QPS, mean latency, tail latency vs threads). 
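An illustrative round-trip for the mode labels described above (a sketch,
assuming SearchMode derives Deserialize with #[serde(tag = "mode")],
consistent with the JSON configs in this series, and that serde_json is
available in tests):

    let j = r#"{"mode": "BeamSearch",
                "adaptive_beam_width": true,
                "relaxed_monotonicity_l": 200}"#;
    let m: SearchMode = serde_json::from_str(j).unwrap();
    assert_eq!(m.to_string(), "BeamSearch(abw, rm_l=200)");

The chart script below keys its per-mode result buckets on exactly this
Display string, so the serde shape and the Display impl have to agree.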
@@ -27,6 +28,8 @@ MAX_THREADS="${MAX_THREADS:-48}" THREAD_STRIDE="${THREAD_STRIDE:-4}" BEAM_WIDTH="${BEAM_WIDTH:-4}" SEARCH_L="${SEARCH_L:-100}" +ABW=false +RM_L="" SKIP_DOWNLOAD=false SKIP_BUILD=false SKIP_INDEX=false @@ -42,6 +45,8 @@ while [[ $# -gt 0 ]]; do --thread-stride) THREAD_STRIDE="$2"; shift 2 ;; --beam-width) BEAM_WIDTH="$2"; shift 2 ;; --search-l) SEARCH_L="$2"; shift 2 ;; + --abw) ABW=true; shift ;; + --rm-l) RM_L="$2"; shift 2 ;; -h|--help) echo "Usage: $0 [OPTIONS]" echo "" @@ -54,6 +59,8 @@ while [[ $# -gt 0 ]]; do echo " --thread-stride N Thread count increment (default: 4)" echo " --beam-width N Beam width / pipeline width (default: 4)" echo " --search-l N Search list size L (default: 100)" + echo " --abw Enable adaptive beam width for ABW variants" + echo " --rm-l N Enable relaxed monotonicity with budget N for RM variants" exit 0 ;; *) echo "Unknown option: $1"; exit 1 ;; @@ -69,6 +76,8 @@ echo "=== SIFT1M Pipelined Search Benchmark ===" echo "Data directory: $DATA_DIR" echo "Thread sweep: 1, 4..${MAX_THREADS} (stride ${THREAD_STRIDE})" echo "Beam width: $BEAM_WIDTH, Search L: $SEARCH_L" +echo "Adaptive beam width: $ABW" +[ -n "$RM_L" ] && echo "Relaxed monotonicity L: $RM_L" echo "" # ------------------------------------------------------------------- @@ -235,11 +244,53 @@ for (( t=THREAD_STRIDE; t<=MAX_THREADS; t+=THREAD_STRIDE )); do done echo "Thread counts: $THREAD_LIST" -# Generate a single config with all jobs (2 per thread count: Beam + Pipe) +# Build the list of search mode configurations to benchmark. +# Always includes baseline BeamSearch and PipeSearch. +# When --abw or --rm-l are specified, adds ABW and/or ABW+RM variants for both. +MODES=() + +# Helper: build a search_mode JSON fragment +beam_mode() { + local abw="${1:-false}" + local rm="${2:-}" + local json="{\"mode\": \"BeamSearch\", \"adaptive_beam_width\": $abw" + [ -n "$rm" ] && json="$json, \"relaxed_monotonicity_l\": $rm" + echo "$json}" +} +pipe_mode() { + local abw="${1:-true}" + local rm="${2:-}" + local json="{\"mode\": \"PipeSearch\", \"adaptive_beam_width\": $abw" + [ -n "$rm" ] && json="$json, \"relaxed_monotonicity_l\": $rm" + echo "$json}" +} + +# Baseline (no ABW, no RM) +MODES+=("$(beam_mode false)") +MODES+=("$(pipe_mode false)") + +# ABW variants +if [ "$ABW" = true ]; then + MODES+=("$(beam_mode true)") + MODES+=("$(pipe_mode true)") +fi + +# ABW+RM variants (RM requires ABW for the convergence gate to work) +if [ -n "$RM_L" ]; then + MODES+=("$(beam_mode true "$RM_L")") + MODES+=("$(pipe_mode true "$RM_L")") +fi + +NUM_MODES=${#MODES[@]} +echo "Search modes ($NUM_MODES per thread count):" +for m in "${MODES[@]}"; do echo " $m"; done + +# Generate a single config with all jobs JOBS="" for T in $THREAD_LIST; do - [ -n "$JOBS" ] && JOBS="$JOBS," - JOBS="$JOBS + for MODE_JSON in "${MODES[@]}"; do + [ -n "$JOBS" ] && JOBS="$JOBS," + JOBS="$JOBS { \"type\": \"disk-index\", \"content\": { @@ -257,31 +308,11 @@ for T in $THREAD_LIST; do \"num_threads\": $T, \"is_flat_search\": false, \"distance\": \"squared_l2\", - \"search_mode\": {\"mode\": \"BeamSearch\"} - } - } - }, - { - \"type\": \"disk-index\", - \"content\": { - \"source\": { - \"disk-index-source\": \"Load\", - \"data_type\": \"float32\", - \"load_path\": \"$INDEX_PREFIX\" - }, - \"search_phase\": { - \"queries\": \"sift_query.fbin\", - \"groundtruth\": \"sift_groundtruth.bin\", - \"search_list\": [$SEARCH_L], - \"beam_width\": $BEAM_WIDTH, - \"recall_at\": 10, - \"num_threads\": $T, - \"is_flat_search\": false, - \"distance\": 
\"squared_l2\", - \"search_mode\": {\"mode\": \"PipeSearch\"} + \"search_mode\": $MODE_JSON } } }" + done done SWEEP_CONFIG="$OUTPUT_DIR/sweep_config.json" @@ -300,22 +331,23 @@ SWEEPEOF echo "" echo "--- Step 5: Generating charts ---" -python3 - "$SWEEP_OUTPUT" "$OUTPUT_DIR" "$SEARCH_L" "$BEAM_WIDTH" << 'CHARTEOF' +python3 - "$SWEEP_OUTPUT" "$OUTPUT_DIR" "$SEARCH_L" "$BEAM_WIDTH" "$ABW" "$RM_L" << 'CHARTEOF' import json, sys, os +from collections import defaultdict output_dir = sys.argv[2] search_l = sys.argv[3] beam_width = sys.argv[4] +abw_flag = sys.argv[5] +rm_l = sys.argv[6] if len(sys.argv) > 6 else "" with open(sys.argv[1]) as f: data = json.load(f) -# Parse results: each job is data[i] with structure: -# data[i]["results"]["search"]["search_mode"] — "BeamSearch" or "PipeSearch(...)" -# data[i]["results"]["search"]["num_threads"] — thread count -# data[i]["results"]["search"]["search_results_per_l"][0] — first (only) L result -beam = {"threads": [], "qps": [], "mean_lat": [], "p95_lat": [], "p999_lat": [], "recall": []} -pipe = {"threads": [], "qps": [], "mean_lat": [], "p95_lat": [], "p999_lat": [], "recall": []} +# Parse results into per-mode buckets keyed by the search_mode display string. +# Each bucket holds lists of (threads, qps, mean_lat, p95_lat, p999_lat, recall). +modes = defaultdict(lambda: {"threads": [], "qps": [], "mean_lat": [], + "p95_lat": [], "p999_lat": [], "recall": []}) for job in data: search = job.get("results", {}).get("search", {}) @@ -328,8 +360,7 @@ for job in data: threads = search.get("num_threads", 0) mode = str(search.get("search_mode", "")) - d = beam if "BeamSearch" in mode else pipe - + d = modes[mode] d["threads"].append(threads) d["qps"].append(r.get("qps", 0)) d["mean_lat"].append(r.get("mean_latency", 0)) @@ -337,30 +368,70 @@ for job in data: d["p999_lat"].append(r.get("p999_latency", 0)) d["recall"].append(r.get("recall", 0)) -# Sort by threads -for d in [beam, pipe]: +# Sort each mode by threads +for d in modes.values(): if d["threads"]: order = sorted(range(len(d["threads"])), key=lambda i: d["threads"][i]) for k in d: d[k] = [d[k][i] for i in order] -# Print table -print(f"\n{'Threads':>7s} {'BeamSearch QPS':>14s} {'PipeSearch QPS':>14s} " - f"{'Beam Mean':>10s} {'Pipe Mean':>10s} " - f"{'Beam p999':>10s} {'Pipe p999':>10s} " - f"{'Beam Recall':>11s} {'Pipe Recall':>11s}") -print("=" * 120) - -for i in range(len(beam["threads"])): - bt, bq, bm, bp9 = beam["threads"][i], beam["qps"][i], beam["mean_lat"][i], beam["p999_lat"][i] - br = beam["recall"][i] - if i < len(pipe["threads"]): - pt, pq, pm, pp9 = pipe["threads"][i], pipe["qps"][i], pipe["mean_lat"][i], pipe["p999_lat"][i] - pr = pipe["recall"][i] +# Assign short labels and colors +COLORS = ['#2196F3', '#FF5722', '#4CAF50', '#9C27B0', '#FF9800', '#00BCD4'] +mode_names = sorted(modes.keys()) +labels = {} +for name in mode_names: + # Build a concise label from the search_mode string + if "BeamSearch" in name: + lbl = "Beam" + elif "PipeSearch" in name: + lbl = "Pipe" else: - pt, pq, pm, pp9, pr = bt, 0, 0, 0, 0 - print(f"{bt:7d} {bq:14.1f} {pq:14.1f} {bm:9.0f}us {pm:9.0f}us " - f"{bp9:9d}us {pp9:9d}us {br:10.2f}% {pr:10.2f}%") + lbl = name[:10] + if "abw" in name.lower() or "adaptive_beam_width: true" in name.lower(): + lbl += "+ABW" + if "rm_l=" in name.lower() or "relaxed_monotonicity_l: Some" in name: + lbl += "+RM" + labels[name] = lbl + +# Print table +header = f"{'Threads':>7s}" +for name in mode_names: + lbl = labels[name] + header += f" {lbl+' QPS':>14s}" +header += " " +for name 
in mode_names: + lbl = labels[name] + header += f" {lbl+' Recall':>12s}" +print(f"\n{header}") +print("=" * len(header)) + +max_rows = max(len(modes[n]["threads"]) for n in mode_names) +for i in range(max_rows): + row = "" + t = 0 + for name in mode_names: + d = modes[name] + if i < len(d["threads"]): + t = d["threads"][i] + row += f" {d['qps'][i]:14.1f}" + else: + row += f" {'':>14s}" + row += " " + for name in mode_names: + d = modes[name] + if i < len(d["threads"]): + row += f" {d['recall'][i]:11.2f}%" + else: + row += f" {'':>12s}" + print(f"{t:7d}{row}") + +# Build plot title +title_parts = [f"L={search_l}", f"BW={beam_width}"] +if abw_flag == "true": + title_parts.append("ABW") +if rm_l: + title_parts.append(f"RM_L={rm_l}") +plot_title = f"SIFT1M Search Benchmark ({', '.join(title_parts)})" # Generate charts try: @@ -369,47 +440,29 @@ try: import matplotlib.pyplot as plt fig, axes = plt.subplots(2, 2, figsize=(14, 10)) - fig.suptitle(f'SIFT1M BeamSearch vs PipeSearch (L={search_l}, BW={beam_width})', fontsize=14) - - # QPS vs Threads - ax = axes[0][0] - ax.plot(beam["threads"], beam["qps"], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) - ax.plot(pipe["threads"], pipe["qps"], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) - ax.set_xlabel('Threads') - ax.set_ylabel('QPS') - ax.set_title('Throughput (QPS)') - ax.legend() - ax.grid(True, alpha=0.3) - - # Mean Latency vs Threads - ax = axes[0][1] - ax.plot(beam["threads"], [x/1000 for x in beam["mean_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) - ax.plot(pipe["threads"], [x/1000 for x in pipe["mean_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) - ax.set_xlabel('Threads') - ax.set_ylabel('Mean Latency (ms)') - ax.set_title('Mean Latency') - ax.legend() - ax.grid(True, alpha=0.3) - - # p95 Latency vs Threads - ax = axes[1][0] - ax.plot(beam["threads"], [x/1000 for x in beam["p95_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) - ax.plot(pipe["threads"], [x/1000 for x in pipe["p95_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) - ax.set_xlabel('Threads') - ax.set_ylabel('p95 Latency (ms)') - ax.set_title('p95 Tail Latency') - ax.legend() - ax.grid(True, alpha=0.3) - - # p99.9 Latency vs Threads - ax = axes[1][1] - ax.plot(beam["threads"], [x/1000 for x in beam["p999_lat"]], 'o-', color='#2196F3', label='BeamSearch', linewidth=2, markersize=5) - ax.plot(pipe["threads"], [x/1000 for x in pipe["p999_lat"]], 's-', color='#FF5722', label='PipeSearch', linewidth=2, markersize=5) - ax.set_xlabel('Threads') - ax.set_ylabel('p99.9 Latency (ms)') - ax.set_title('p99.9 Tail Latency') - ax.legend() - ax.grid(True, alpha=0.3) + fig.suptitle(plot_title, fontsize=14) + + metrics = [ + (axes[0][0], "qps", "QPS", "Throughput (QPS)", 1, False), + (axes[0][1], "mean_lat", "Mean Latency (ms)", "Mean Latency", 1000, True), + (axes[1][0], "p95_lat", "p95 Latency (ms)", "p95 Tail Latency", 1000, True), + (axes[1][1], "p999_lat", "p99.9 Latency (ms)","p99.9 Tail Latency",1000, True), + ] + + markers = ['o', 's', '^', 'D', 'v', 'P'] + for ax, key, ylabel, title, divisor, _ in metrics: + for idx, name in enumerate(mode_names): + d = modes[name] + vals = [x / divisor for x in d[key]] + ax.plot(d["threads"], vals, + f'{markers[idx % len(markers)]}-', + color=COLORS[idx % len(COLORS)], + label=labels[name], linewidth=2, markersize=5) + ax.set_xlabel('Threads') + ax.set_ylabel(ylabel) + 
ax.set_title(title) + ax.legend() + ax.grid(True, alpha=0.3) plt.tight_layout() chart_path = os.path.join(output_dir, 'thread_sweep.png') @@ -425,9 +478,11 @@ except ImportError: csv_path = os.path.join(output_dir, 'thread_sweep.csv') with open(csv_path, 'w') as f: f.write("threads,mode,qps,mean_lat_us,p95_lat_us,p999_lat_us,recall\n") - for d, mode in [(beam, "BeamSearch"), (pipe, "PipeSearch")]: + for name in mode_names: + d = modes[name] + lbl = labels[name] for i in range(len(d["threads"])): - f.write(f"{d['threads'][i]},{mode},{d['qps'][i]:.1f}," + f.write(f"{d['threads'][i]},{lbl},{d['qps'][i]:.1f}," f"{d['mean_lat'][i]:.0f},{d['p95_lat'][i]}," f"{d['p999_lat'][i]},{d['recall'][i]:.3f}\n") print(f"CSV saved to: {csv_path}") @@ -440,4 +495,4 @@ echo "Charts: $OUTPUT_DIR/thread_sweep.png" echo "CSV: $OUTPUT_DIR/thread_sweep.csv" echo "" echo "To re-run with different parameters:" -echo " $0 --skip-download --skip-index --max-threads N --search-l N" +echo " $0 --skip-download --skip-index --max-threads N --search-l N --abw --rm-l N" diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 0af6051cd..5a2b82d8d 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -529,7 +529,7 @@ where is_flat_search: search_params.is_flat_search, distance: search_params.distance, uses_vector_filters: search_params.vector_filters_file.is_some(), - search_mode: format!("{:?}", search_params.search_mode), + search_mode: format!("{}", search_params.search_mode), num_nodes_to_cache: search_params.num_nodes_to_cache, search_results_per_l, span_metrics, From d1be70dd4aa02d46c4de2c1c2e2a4b7c398941d8 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 14:59:37 -0800 Subject: [PATCH 36/46] Fix review findings: slot safety, stale CQEs, cursor optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. CRITICAL: Replace round-robin slot allocation with free-list (VecDeque) in PipelinedDiskAccessor. Prevents reusing an in-flight slot when NVMe completions arrive out-of-order. Mark submit_read as unsafe fn. 2. HIGH: Change PipelinedReader::reset() to call drain_all(), blocking until all kernel IOs complete before the scratch is reused. Prevents stale CQEs from corrupting subsequent queries. 3. HIGH: Use cursor in peek_best_unsubmitted_with_position and mark_visited_by_id (scan from self.cursor instead of 0). Restores O(1) amortized node selection for non-pipelined search, fixing O(L²) regression from the unified loop. 4. MEDIUM: Replace expanded_ids.clone() with std::mem::take() in pipelined expand_available, eliminating per-call Vec allocation. --- .../src/search/pipelined/pipelined_reader.rs | 22 ++++++++----- .../src/search/provider/pipelined_accessor.rs | 31 +++++++++++-------- diskann/src/neighbor/queue.rs | 4 +-- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/diskann-disk/src/search/pipelined/pipelined_reader.rs b/diskann-disk/src/search/pipelined/pipelined_reader.rs index 42f637fb2..9a53eb12d 100644 --- a/diskann-disk/src/search/pipelined/pipelined_reader.rs +++ b/diskann-disk/src/search/pipelined/pipelined_reader.rs @@ -97,7 +97,15 @@ impl PipelinedReader { /// The read will fetch `slot_size` bytes from `sector_offset` (in bytes) into /// the pre-allocated buffer for the given slot. The `slot_id` is stored as /// `user_data` in the CQE for later retrieval. 
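A condensed sketch of the free-list discipline from finding 1, using the
names this patch introduces (illustrative, not a drop-in; the real submit
path also computes sector offsets and ranks):

    // Submission side: a slot is only used after leaving the free-list.
    if let Some(slot_id) = scratch.free_slots.pop_front() {
        // SAFETY: slot_id just came off the free-list, so no read is in
        // flight on its buffer.
        if unsafe { scratch.reader.submit_read(sector_offset, slot_id) }.is_ok() {
            scratch.in_flight_ios.push_back(InFlightIo { vertex_id, slot_id, rank });
        } else {
            scratch.free_slots.push_back(slot_id); // failed submit: slot stays free
        }
    }
    // Completion side: the slot returns to the free-list only after its
    // CQE has been consumed and the buffer contents copied out.
    scratch.free_slots.push_back(io.slot_id);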
- pub fn submit_read(&mut self, sector_offset: u64, slot_id: usize) -> ANNResult<()> { + /// Submit a read for the given sector offset into the specified buffer slot. + /// + /// # Safety + /// The caller must ensure `slot_id` is not currently in-flight (i.e., it has + /// been returned by a previous completion or was never submitted). Violating + /// this invariant allows the kernel to DMA into a buffer being read, causing + /// data corruption. When using a free-list for slot management (see + /// `PipelinedScratch::free_slots`), this invariant is structurally guaranteed. + pub unsafe fn submit_read(&mut self, sector_offset: u64, slot_id: usize) -> ANNResult<()> { assert!(slot_id < self.max_slots, "slot_id out of range"); let buf_start = slot_id * self.slot_size; @@ -169,11 +177,9 @@ impl PipelinedReader { &self.slot_bufs[start..start + self.slot_size] } - /// Reset the reader for reuse: clear in-flight count and drain remaining CQEs. + /// Reset the reader for reuse: drain all in-flight IOs, then clear state. pub fn reset(&mut self) { - self.in_flight = 0; - // Drain any remaining completions from the ring. - for _cqe in self.ring.completion() {} + self.drain_all(); } /// Returns the number of submitted but not yet completed reads. @@ -194,12 +200,12 @@ impl PipelinedReader { /// Drain all in-flight IOs, blocking until they complete. /// Must be called before freeing the slot buffers. fn drain_all(&mut self) { - while self.in_flight > 0 { - let _ = self.ring.submit_and_wait(1); + if self.in_flight > 0 { + let _ = self.ring.submit_and_wait(self.in_flight); for cqe in self.ring.completion() { let _ = cqe; - self.in_flight = self.in_flight.saturating_sub(1); } + self.in_flight = 0; } } } diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index eaab02f4c..0bd7bd8d6 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -133,6 +133,8 @@ pub struct PipelinedScratch { neighbor_buf: Vec, /// Freelist of LoadedNode instances to avoid per-node allocation node_pool: Vec, + /// Free io_uring buffer slot IDs available for new submissions. + free_slots: VecDeque, } /// Arguments for creating or resetting a [`PipelinedScratch`]. @@ -175,10 +177,12 @@ impl TryAsPooled for PipelinedScratch { distance_cache: HashMap::new(), neighbor_buf: Vec::new(), node_pool: Vec::new(), + free_slots: (0..args.max_slots).collect(), }) } fn try_modify(&mut self, _args: PipelinedScratchArgs) -> Result<(), Self::Error> { + let max_slots = self.reader.max_slots(); self.reader.reset(); // Return all loaded_nodes back to the pool before clearing self.node_pool.extend(self.loaded_nodes.drain().map(|(_, node)| node)); @@ -186,6 +190,8 @@ impl TryAsPooled for PipelinedScratch { self.expanded_ids.clear(); self.distance_cache.clear(); self.neighbor_buf.clear(); + self.free_slots.clear(); + self.free_slots.extend(0..max_slots); Ok(()) } } @@ -233,8 +239,6 @@ pub struct PipelinedDiskAccessor<'a, Data: GraphDataType> { node_cache: Arc>, // IO state (now lives in scratch for reuse, accessed via self.scratch) - next_slot_id: usize, - max_slots: usize, /// Monotonically increasing submission rank for priority-ordered expansion. 
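+ /// When expand_available has no caller-preferred candidate loaded, it
+ /// falls back to the loaded node with the lowest rank, i.e. the
+ /// earliest-submitted node, which had the best PQ distance at
+ /// submission time.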
next_rank: u64, @@ -271,7 +275,6 @@ where let block_size = provider.graph_header.effective_block_size(); let num_sectors_per_node = provider.graph_header.num_sectors_per_node(); - let slots = scratch.reader.max_slots(); Ok(Self { provider, @@ -284,8 +287,6 @@ where fp_vector_len, num_points: provider.num_points, node_cache, - next_slot_id: 0, - max_slots: slots, next_rank: 0, io_count: 0, cache_hits: 0, @@ -412,6 +413,8 @@ where let io = &scratch.in_flight_ios[i]; if completed_slots.contains(&io.slot_id) { let io = scratch.in_flight_ios.swap_remove_back(i).unwrap(); + // Return the slot to the free-list so it can be reused. + scratch.free_slots.push_back(io.slot_id); // Acquire node first (mutably borrows node_pool), // then get sector buf (immutably borrows reader) — no conflict. let mut node = scratch.node_pool.pop().unwrap_or_else(|| LoadedNode { @@ -586,27 +589,29 @@ where continue; } - // Don't submit if all io_uring slots are occupied — prevents overwriting - // buffers that still have in-flight reads. - if self.scratch.in_flight_ios.len() >= self.max_slots { + // Don't submit if no free io_uring slots are available. + if self.scratch.free_slots.is_empty() { break; } let sector_idx = node_sector_index(id, self.num_nodes_per_sector, self.num_sectors_per_node); let sector_offset = sector_idx * self.block_size as u64; - let slot_id = self.next_slot_id % self.max_slots; + let slot_id = self.scratch.free_slots.pop_front().unwrap(); let rank = self.next_rank; self.next_rank += 1; - // Best-effort: if submission fails, the node will be retried - if self.scratch.reader.submit_read(sector_offset, slot_id).is_ok() { + // Best-effort: if submission fails, return the slot and retry later + // SAFETY: slot_id was just popped from the free-list, guaranteeing + // it is not currently in-flight. + if unsafe { self.scratch.reader.submit_read(sector_offset, slot_id) }.is_ok() { self.scratch.in_flight_ios.push_back(InFlightIo { vertex_id: id, slot_id, rank, }); - self.next_slot_id = (self.next_slot_id + 1) % self.max_slots; self.io_count += 1; + } else { + self.scratch.free_slots.push_back(slot_id); } } self.io_time += io_start.elapsed(); @@ -697,7 +702,7 @@ where // Return node to pool for reuse self.scratch.release_node(node); - Ok(self.scratch.expanded_ids.clone()) + Ok(std::mem::take(&mut self.scratch.expanded_ids)) } } diff --git a/diskann/src/neighbor/queue.rs b/diskann/src/neighbor/queue.rs index 9b5e8b635..8027f47d5 100644 --- a/diskann/src/neighbor/queue.rs +++ b/diskann/src/neighbor/queue.rs @@ -520,7 +520,7 @@ impl NeighborPriorityQueue { submitted: &HashSet, ) -> Option<(usize, Neighbor)> { let limit = self.search_param_l.min(self.size); - for i in 0..limit { + for i in self.cursor..limit { let (id, visited) = self.id_visiteds[i]; if !visited && !submitted.contains(&id) { return Some((i, Neighbor::new(id, self.distances[i]))); @@ -532,7 +532,7 @@ impl NeighborPriorityQueue { /// Find the node with matching `id`, mark it visited, and advance the cursor if needed. /// Returns true if found and marked, false otherwise. 
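/// Scanning from `self.cursor` rather than 0 relies on the queue invariant
/// that every slot below the cursor is already visited, so a still-unvisited
/// `id` can only live at or past the cursor; this is what restores the O(1)
/// amortized selection noted in review finding 3.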
pub fn mark_visited_by_id(&mut self, id: &I) -> bool { - for i in 0..self.size { + for i in self.cursor..self.size { if self.id_visiteds[i].0 == *id { self.id_visiteds[i].1 = true; // If the cursor was pointing at this node, advance past visited nodes From 0d1c4ea03290e9ad27379ee478616a3f3b4b2533 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 15:41:14 -0800 Subject: [PATCH 37/46] remove benchmarking files --- .gitignore | 2 - diskann-benchmark/scripts/sift1m_benchmark.sh | 498 ------------------ 2 files changed, 500 deletions(-) delete mode 100755 diskann-benchmark/scripts/sift1m_benchmark.sh diff --git a/.gitignore b/.gitignore index 8df99897b..946937707 100644 --- a/.gitignore +++ b/.gitignore @@ -334,5 +334,3 @@ target/ # ignore VS Code local history .history/ -# Benchmark data (downloaded by scripts/sift1m_benchmark.sh) -benchmark_data/ diff --git a/diskann-benchmark/scripts/sift1m_benchmark.sh b/diskann-benchmark/scripts/sift1m_benchmark.sh deleted file mode 100755 index c37ad148a..000000000 --- a/diskann-benchmark/scripts/sift1m_benchmark.sh +++ /dev/null @@ -1,498 +0,0 @@ -#!/usr/bin/env bash -# SIFT1M Pipelined Search Benchmark -# -# Downloads SIFT1M dataset, builds a disk index, and runs an ablation -# benchmark comparing BeamSearch vs PipeSearch (io_uring pipelining) -# with optional adaptive beam width (ABW) and relaxed monotonicity (RM). -# -# By default, sweeps thread counts from 1 to max_threads in strides of 4 -# and produces charts (QPS, mean latency, tail latency vs threads). -# -# Prerequisites: -# - Linux (PipeSearch requires io_uring) -# - Rust toolchain (cargo) -# - curl, tar, python3 with numpy and matplotlib -# - ~2GB free disk space for data + index -# -# Usage: -# ./diskann-benchmark/scripts/sift1m_benchmark.sh [OPTIONS] - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" - -# Defaults -DATA_DIR="${DATA_DIR:-$REPO_ROOT/benchmark_data/sift1m}" -MAX_THREADS="${MAX_THREADS:-48}" -THREAD_STRIDE="${THREAD_STRIDE:-4}" -BEAM_WIDTH="${BEAM_WIDTH:-4}" -SEARCH_L="${SEARCH_L:-100}" -ABW=false -RM_L="" -SKIP_DOWNLOAD=false -SKIP_BUILD=false -SKIP_INDEX=false - -# Parse arguments -while [[ $# -gt 0 ]]; do - case "$1" in - --data-dir) DATA_DIR="$2"; shift 2 ;; - --skip-download) SKIP_DOWNLOAD=true; shift ;; - --skip-build) SKIP_BUILD=true; shift ;; - --skip-index) SKIP_INDEX=true; shift ;; - --max-threads) MAX_THREADS="$2"; shift 2 ;; - --thread-stride) THREAD_STRIDE="$2"; shift 2 ;; - --beam-width) BEAM_WIDTH="$2"; shift 2 ;; - --search-l) SEARCH_L="$2"; shift 2 ;; - --abw) ABW=true; shift ;; - --rm-l) RM_L="$2"; shift 2 ;; - -h|--help) - echo "Usage: $0 [OPTIONS]" - echo "" - echo "Options:" - echo " --data-dir DIR Data directory (default: \$REPO_ROOT/benchmark_data/sift1m)" - echo " --skip-download Skip downloading SIFT1M (use existing data)" - echo " --skip-build Skip building the benchmark binary" - echo " --skip-index Skip building the disk index (use existing index)" - echo " --max-threads N Maximum thread count for sweep (default: 48)" - echo " --thread-stride N Thread count increment (default: 4)" - echo " --beam-width N Beam width / pipeline width (default: 4)" - echo " --search-l N Search list size L (default: 100)" - echo " --abw Enable adaptive beam width for ABW variants" - echo " --rm-l N Enable relaxed monotonicity with budget N for RM variants" - exit 0 - ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -BIN_DIR="$DATA_DIR/bin" -INDEX_DIR="$DATA_DIR/index" -INDEX_PREFIX="$INDEX_DIR/sift1m_R64_L100" -OUTPUT_DIR="$DATA_DIR/results" - -echo "=== SIFT1M Pipelined Search Benchmark ===" -echo "Data directory: $DATA_DIR" -echo "Thread sweep: 1, 4..${MAX_THREADS} (stride ${THREAD_STRIDE})" -echo "Beam width: $BEAM_WIDTH, Search L: $SEARCH_L" -echo "Adaptive beam width: $ABW" -[ -n "$RM_L" ] && echo "Relaxed monotonicity L: $RM_L" -echo "" - -# ------------------------------------------------------------------- -# Step 1: Download SIFT1M -# ------------------------------------------------------------------- -if [ "$SKIP_DOWNLOAD" = false ]; then - echo "--- Step 1: Downloading SIFT1M dataset ---" - mkdir -p "$BIN_DIR" - - SIFT_URL="ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz" - SIFT_TAR="$DATA_DIR/sift.tar.gz" - - if [ ! -f "$BIN_DIR/sift_base.fbin" ]; then - if [ ! -f "$SIFT_TAR" ]; then - echo "Downloading from $SIFT_URL ..." - curl -L -o "$SIFT_TAR" "$SIFT_URL" - fi - - echo "Extracting..." - EXTRACT_DIR="$DATA_DIR/extract" - mkdir -p "$EXTRACT_DIR" - tar xzf "$SIFT_TAR" -C "$EXTRACT_DIR" - - echo "Converting .bvecs/.fvecs to .fbin format..." 
- python3 - "$EXTRACT_DIR/sift" "$BIN_DIR" << 'PYEOF' -import sys, struct, numpy as np -from pathlib import Path - -src_dir = Path(sys.argv[1]) -dst_dir = Path(sys.argv[2]) - -def read_fvecs(path): - data = np.fromfile(path, dtype=np.float32) - dim = int(data[0].view(np.int32)) - return data.reshape(-1, dim + 1)[:, 1:] - -def read_ivecs(path): - data = np.fromfile(path, dtype=np.int32) - dim = data[0] - return data.reshape(-1, dim + 1)[:, 1:] - -def write_fbin(path, data): - npts, dim = data.shape - with open(path, 'wb') as f: - f.write(struct.pack('II', npts, dim)) - data.astype(np.float32).tofile(f) - -def write_ibin(path, data): - npts, dim = data.shape - with open(path, 'wb') as f: - f.write(struct.pack('II', npts, dim)) - data.astype(np.uint32).tofile(f) - -base = read_fvecs(src_dir / "sift_base.fvecs") -print(f" Base: {base.shape[0]} points, {base.shape[1]} dims") -write_fbin(dst_dir / "sift_base.fbin", base) - -query = read_fvecs(src_dir / "sift_query.fvecs") -print(f" Query: {query.shape[0]} points, {query.shape[1]} dims") -write_fbin(dst_dir / "sift_query.fbin", query) - -gt = read_ivecs(src_dir / "sift_groundtruth.ivecs") -print(f" Groundtruth: {gt.shape[0]} queries, top-{gt.shape[1]}") -write_ibin(dst_dir / "sift_groundtruth.bin", gt) -print(" Conversion complete!") -PYEOF - - rm -rf "$EXTRACT_DIR" "$SIFT_TAR" - else - echo "SIFT1M data already exists at $BIN_DIR, skipping download." - fi - echo "" -fi - -# ------------------------------------------------------------------- -# Step 2: Build the benchmark binary -# ------------------------------------------------------------------- -if [ "$SKIP_BUILD" = false ]; then - echo "--- Step 2: Building diskann-benchmark ---" - cd "$REPO_ROOT" - cargo build --release -p diskann-benchmark --features disk-index 2>&1 | tail -3 - echo "" -fi - -BENCHMARK_BIN="$REPO_ROOT/target/release/diskann-benchmark" -if [ ! -x "$BENCHMARK_BIN" ]; then - echo "ERROR: benchmark binary not found at $BENCHMARK_BIN" - echo "Run without --skip-build or build manually:" - echo " cargo build --release -p diskann-benchmark --features disk-index" - exit 1 -fi - -# ------------------------------------------------------------------- -# Step 3: Build disk index (if needed) -# ------------------------------------------------------------------- -if [ "$SKIP_INDEX" = false ] && [ ! -f "${INDEX_PREFIX}_disk.index" ]; then - echo "--- Step 3: Building disk index (R=64, L=100, PQ_16) ---" - mkdir -p "$INDEX_DIR" - - # Build job requires a search_phase; we include a minimal one that also - # validates the index works after building. 
- cat > "$DATA_DIR/build_config.json" << BUILDEOF -{ - "search_directories": ["$BIN_DIR"], - "jobs": [ - { - "type": "disk-index", - "content": { - "source": { - "disk-index-source": "Build", - "data_type": "float32", - "data": "sift_base.fbin", - "distance": "squared_l2", - "dim": 128, - "max_degree": 64, - "l_build": 100, - "num_threads": 4, - "build_ram_limit_gb": 4.0, - "num_pq_chunks": 16, - "quantization_type": "FP", - "save_path": "$INDEX_PREFIX" - }, - "search_phase": { - "queries": "sift_query.fbin", - "groundtruth": "sift_groundtruth.bin", - "search_list": [50], - "beam_width": 4, - "recall_at": 10, - "num_threads": 1, - "is_flat_search": false, - "distance": "squared_l2" - } - } - } - ] -} -BUILDEOF - - "$BENCHMARK_BIN" run --input-file "$DATA_DIR/build_config.json" --output-file /dev/null - echo "" -elif [ "$SKIP_INDEX" = true ]; then - echo "--- Step 3: Skipping index build (--skip-index) ---" - echo "" -else - echo "--- Step 3: Disk index already exists, skipping build ---" - echo "" -fi - -if [ ! -f "${INDEX_PREFIX}_disk.index" ]; then - echo "ERROR: Disk index not found at ${INDEX_PREFIX}_disk.index" - exit 1 -fi - -# ------------------------------------------------------------------- -# Step 4: Thread sweep benchmark -# ------------------------------------------------------------------- -echo "--- Step 4: Running thread sweep benchmark ---" -mkdir -p "$OUTPUT_DIR" - -# Build thread list: 1, then 4, 8, ..., MAX_THREADS -THREAD_LIST="1" -for (( t=THREAD_STRIDE; t<=MAX_THREADS; t+=THREAD_STRIDE )); do - THREAD_LIST="$THREAD_LIST $t" -done -echo "Thread counts: $THREAD_LIST" - -# Build the list of search mode configurations to benchmark. -# Always includes baseline BeamSearch and PipeSearch. -# When --abw or --rm-l are specified, adds ABW and/or ABW+RM variants for both. 
-MODES=() - -# Helper: build a search_mode JSON fragment -beam_mode() { - local abw="${1:-false}" - local rm="${2:-}" - local json="{\"mode\": \"BeamSearch\", \"adaptive_beam_width\": $abw" - [ -n "$rm" ] && json="$json, \"relaxed_monotonicity_l\": $rm" - echo "$json}" -} -pipe_mode() { - local abw="${1:-true}" - local rm="${2:-}" - local json="{\"mode\": \"PipeSearch\", \"adaptive_beam_width\": $abw" - [ -n "$rm" ] && json="$json, \"relaxed_monotonicity_l\": $rm" - echo "$json}" -} - -# Baseline (no ABW, no RM) -MODES+=("$(beam_mode false)") -MODES+=("$(pipe_mode false)") - -# ABW variants -if [ "$ABW" = true ]; then - MODES+=("$(beam_mode true)") - MODES+=("$(pipe_mode true)") -fi - -# ABW+RM variants (RM requires ABW for the convergence gate to work) -if [ -n "$RM_L" ]; then - MODES+=("$(beam_mode true "$RM_L")") - MODES+=("$(pipe_mode true "$RM_L")") -fi - -NUM_MODES=${#MODES[@]} -echo "Search modes ($NUM_MODES per thread count):" -for m in "${MODES[@]}"; do echo " $m"; done - -# Generate a single config with all jobs -JOBS="" -for T in $THREAD_LIST; do - for MODE_JSON in "${MODES[@]}"; do - [ -n "$JOBS" ] && JOBS="$JOBS," - JOBS="$JOBS - { - \"type\": \"disk-index\", - \"content\": { - \"source\": { - \"disk-index-source\": \"Load\", - \"data_type\": \"float32\", - \"load_path\": \"$INDEX_PREFIX\" - }, - \"search_phase\": { - \"queries\": \"sift_query.fbin\", - \"groundtruth\": \"sift_groundtruth.bin\", - \"search_list\": [$SEARCH_L], - \"beam_width\": $BEAM_WIDTH, - \"recall_at\": 10, - \"num_threads\": $T, - \"is_flat_search\": false, - \"distance\": \"squared_l2\", - \"search_mode\": $MODE_JSON - } - } - }" - done -done - -SWEEP_CONFIG="$OUTPUT_DIR/sweep_config.json" -SWEEP_OUTPUT="$OUTPUT_DIR/sweep_results.json" - -cat > "$SWEEP_CONFIG" << SWEEPEOF -{ - "search_directories": ["$BIN_DIR"], - "jobs": [$JOBS - ] -} -SWEEPEOF - -"$BENCHMARK_BIN" run --input-file "$SWEEP_CONFIG" --output-file "$SWEEP_OUTPUT" - -echo "" -echo "--- Step 5: Generating charts ---" - -python3 - "$SWEEP_OUTPUT" "$OUTPUT_DIR" "$SEARCH_L" "$BEAM_WIDTH" "$ABW" "$RM_L" << 'CHARTEOF' -import json, sys, os -from collections import defaultdict - -output_dir = sys.argv[2] -search_l = sys.argv[3] -beam_width = sys.argv[4] -abw_flag = sys.argv[5] -rm_l = sys.argv[6] if len(sys.argv) > 6 else "" - -with open(sys.argv[1]) as f: - data = json.load(f) - -# Parse results into per-mode buckets keyed by the search_mode display string. -# Each bucket holds lists of (threads, qps, mean_lat, p95_lat, p999_lat, recall). 
-modes = defaultdict(lambda: {"threads": [], "qps": [], "mean_lat": [], - "p95_lat": [], "p999_lat": [], "recall": []}) - -for job in data: - search = job.get("results", {}).get("search", {}) - if not search: - continue - results_per_l = search.get("search_results_per_l", []) - if not results_per_l: - continue - r = results_per_l[0] - threads = search.get("num_threads", 0) - mode = str(search.get("search_mode", "")) - - d = modes[mode] - d["threads"].append(threads) - d["qps"].append(r.get("qps", 0)) - d["mean_lat"].append(r.get("mean_latency", 0)) - d["p95_lat"].append(r.get("p95_latency", 0)) - d["p999_lat"].append(r.get("p999_latency", 0)) - d["recall"].append(r.get("recall", 0)) - -# Sort each mode by threads -for d in modes.values(): - if d["threads"]: - order = sorted(range(len(d["threads"])), key=lambda i: d["threads"][i]) - for k in d: - d[k] = [d[k][i] for i in order] - -# Assign short labels and colors -COLORS = ['#2196F3', '#FF5722', '#4CAF50', '#9C27B0', '#FF9800', '#00BCD4'] -mode_names = sorted(modes.keys()) -labels = {} -for name in mode_names: - # Build a concise label from the search_mode string - if "BeamSearch" in name: - lbl = "Beam" - elif "PipeSearch" in name: - lbl = "Pipe" - else: - lbl = name[:10] - if "abw" in name.lower() or "adaptive_beam_width: true" in name.lower(): - lbl += "+ABW" - if "rm_l=" in name.lower() or "relaxed_monotonicity_l: Some" in name: - lbl += "+RM" - labels[name] = lbl - -# Print table -header = f"{'Threads':>7s}" -for name in mode_names: - lbl = labels[name] - header += f" {lbl+' QPS':>14s}" -header += " " -for name in mode_names: - lbl = labels[name] - header += f" {lbl+' Recall':>12s}" -print(f"\n{header}") -print("=" * len(header)) - -max_rows = max(len(modes[n]["threads"]) for n in mode_names) -for i in range(max_rows): - row = "" - t = 0 - for name in mode_names: - d = modes[name] - if i < len(d["threads"]): - t = d["threads"][i] - row += f" {d['qps'][i]:14.1f}" - else: - row += f" {'':>14s}" - row += " " - for name in mode_names: - d = modes[name] - if i < len(d["threads"]): - row += f" {d['recall'][i]:11.2f}%" - else: - row += f" {'':>12s}" - print(f"{t:7d}{row}") - -# Build plot title -title_parts = [f"L={search_l}", f"BW={beam_width}"] -if abw_flag == "true": - title_parts.append("ABW") -if rm_l: - title_parts.append(f"RM_L={rm_l}") -plot_title = f"SIFT1M Search Benchmark ({', '.join(title_parts)})" - -# Generate charts -try: - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - - fig, axes = plt.subplots(2, 2, figsize=(14, 10)) - fig.suptitle(plot_title, fontsize=14) - - metrics = [ - (axes[0][0], "qps", "QPS", "Throughput (QPS)", 1, False), - (axes[0][1], "mean_lat", "Mean Latency (ms)", "Mean Latency", 1000, True), - (axes[1][0], "p95_lat", "p95 Latency (ms)", "p95 Tail Latency", 1000, True), - (axes[1][1], "p999_lat", "p99.9 Latency (ms)","p99.9 Tail Latency",1000, True), - ] - - markers = ['o', 's', '^', 'D', 'v', 'P'] - for ax, key, ylabel, title, divisor, _ in metrics: - for idx, name in enumerate(mode_names): - d = modes[name] - vals = [x / divisor for x in d[key]] - ax.plot(d["threads"], vals, - f'{markers[idx % len(markers)]}-', - color=COLORS[idx % len(COLORS)], - label=labels[name], linewidth=2, markersize=5) - ax.set_xlabel('Threads') - ax.set_ylabel(ylabel) - ax.set_title(title) - ax.legend() - ax.grid(True, alpha=0.3) - - plt.tight_layout() - chart_path = os.path.join(output_dir, 'thread_sweep.png') - plt.savefig(chart_path, dpi=150) - print(f"\nChart saved to: {chart_path}") - plt.close() - 
-except ImportError: - print("\nmatplotlib not available — skipping chart generation.") - print("Install with: pip install matplotlib") - -# Save CSV for external plotting -csv_path = os.path.join(output_dir, 'thread_sweep.csv') -with open(csv_path, 'w') as f: - f.write("threads,mode,qps,mean_lat_us,p95_lat_us,p999_lat_us,recall\n") - for name in mode_names: - d = modes[name] - lbl = labels[name] - for i in range(len(d["threads"])): - f.write(f"{d['threads'][i]},{lbl},{d['qps'][i]:.1f}," - f"{d['mean_lat'][i]:.0f},{d['p95_lat'][i]}," - f"{d['p999_lat'][i]},{d['recall'][i]:.3f}\n") -print(f"CSV saved to: {csv_path}") -CHARTEOF - -echo "" -echo "=== Benchmark Complete ===" -echo "Results: $SWEEP_OUTPUT" -echo "Charts: $OUTPUT_DIR/thread_sweep.png" -echo "CSV: $OUTPUT_DIR/thread_sweep.csv" -echo "" -echo "To re-run with different parameters:" -echo " $0 --skip-download --skip-index --max-threads N --search-l N --abw --rm-l N" From 2ee89bc81385f166d4856723d28c0f965f7553cc Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 15:48:10 -0800 Subject: [PATCH 38/46] Fix clippy warnings and fmt cleanup - Collapse nested if-let chains in ABW/RM logic (index.rs) - Remove dead code: max_slots() fn and pq_distances() wrapper (pipelined_accessor.rs) - Simplify async fn signatures (pipelined_accessor.rs) - Remove unnecessary f64 cast (search_disk_index.rs) - Remove unused MAX_IO_CONCURRENCY import - Fix indentation after async move removal --- .../src/backend/disk_index/search.rs | 39 +++- diskann-benchmark/src/inputs/disk.rs | 23 +- diskann-disk/src/build/builder/tests.rs | 3 +- .../src/search/pipelined/pipelined_reader.rs | 13 +- .../src/search/provider/disk_provider.rs | 1 - .../src/search/provider/pipelined_accessor.rs | 211 +++++++++--------- diskann-tools/src/utils/search_disk_index.rs | 2 +- diskann/src/graph/index.rs | 106 ++++----- diskann/src/neighbor/queue.rs | 10 +- 9 files changed, 216 insertions(+), 192 deletions(-) diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 5a2b82d8d..c8b92c59f 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -11,6 +11,10 @@ use opentelemetry_sdk::trace::SdkTracerProvider; use diskann::{graph::SearchParams, utils::VectorRepr, ANNResult}; use diskann_benchmark_runner::{files::InputFile, utils::MicroSeconds}; +#[cfg(target_os = "linux")] +use diskann_disk::search::pipelined::PipelinedReaderConfig; +#[cfg(target_os = "linux")] +use diskann_disk::search::provider::pipelined_accessor::PipelinedConfig; use diskann_disk::{ data_model::CachingStrategy, search::provider::{ @@ -21,10 +25,6 @@ use diskann_disk::{ storage::disk_index_reader::DiskIndexReader, utils::{instrumentation::PerfLogger, statistics, AlignedFileReaderFactory, QueryStatistics}, }; -#[cfg(target_os = "linux")] -use diskann_disk::search::pipelined::PipelinedReaderConfig; -#[cfg(target_os = "linux")] -use diskann_disk::search::provider::pipelined_accessor::PipelinedConfig; use diskann_providers::storage::StorageReadProvider; use diskann_providers::{ storage::{ @@ -177,7 +177,12 @@ fn write_query_result( *stats = search_result.stats.query_statistics; *rc = search_result.results.len() as u32; let actual_results = search_result.results.len().min(recall_at); - for (i, result_item) in search_result.results.iter().take(actual_results).enumerate() { + for (i, result_item) in search_result + .results + .iter() + .take(actual_results) + .enumerate() + { 
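// Copy at most recall_at results into the fixed-size output chunks;
// hits beyond recall_at are dropped.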
id_chunk[i] = result_item.vertex_id; dist_chunk[i] = result_item.distance; } @@ -319,7 +324,10 @@ where let has_any_search_failed = AtomicBool::new(false); match &search_params.search_mode { - SearchMode::BeamSearch { adaptive_beam_width, relaxed_monotonicity_l } => { + SearchMode::BeamSearch { + adaptive_beam_width, + relaxed_monotonicity_l, + } => { let searcher = &DiskIndexSearcher::, _>::new( search_params.num_threads, search_params.search_io_limit.unwrap_or(usize::MAX), @@ -365,7 +373,8 @@ where search_params.recall_at as usize, l as usize, Some(search_params.beam_width), - ).unwrap(); + ) + .unwrap(); if abw { sp = sp.with_adaptive_beam_width(); } @@ -395,14 +404,18 @@ where } // Pipelined search — for read-only search on completed (static) indices only. // Uses io_uring for IO/compute overlap through the generic search loop. - SearchMode::PipeSearch { adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms } => { + SearchMode::PipeSearch { + adaptive_beam_width, + relaxed_monotonicity_l, + sqpoll_idle_ms, + } => { #[cfg(target_os = "linux")] { + use diskann::utils::object_pool::ObjectPool; use diskann_disk::data_model::Cache; use diskann_disk::search::provider::pipelined_accessor::{ PipelinedScratch, PipelinedScratchArgs, }; - use diskann::utils::object_pool::ObjectPool; let reader_config = PipelinedReaderConfig { sqpoll_idle_ms: *sqpoll_idle_ms, @@ -435,9 +448,11 @@ where num_pq_centers: pq_data.get_num_centers(), reader_config, }; - let scratch_pool = Arc::new( - ObjectPool::::try_new(scratch_args.clone(), 0, None)? - ); + let scratch_pool = Arc::new(ObjectPool::::try_new( + scratch_args.clone(), + 0, + None, + )?); let mut searcher = DiskIndexSearcher::, _>::new( search_params.num_threads, diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs index 662364f8a..fcc2739c9 100644 --- a/diskann-benchmark/src/inputs/disk.rs +++ b/diskann-benchmark/src/inputs/disk.rs @@ -112,7 +112,10 @@ impl Default for SearchMode { impl fmt::Display for SearchMode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - SearchMode::BeamSearch { adaptive_beam_width, relaxed_monotonicity_l } => { + SearchMode::BeamSearch { + adaptive_beam_width, + relaxed_monotonicity_l, + } => { write!(f, "BeamSearch")?; let has_abw = *adaptive_beam_width; let has_rm = relaxed_monotonicity_l.is_some(); @@ -124,14 +127,20 @@ impl fmt::Display for SearchMode { first = false; } if let Some(rm) = relaxed_monotonicity_l { - if !first { write!(f, ", ")?; } + if !first { + write!(f, ", ")?; + } write!(f, "rm_l={}", rm)?; } write!(f, ")")?; } Ok(()) } - SearchMode::PipeSearch { adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms } => { + SearchMode::PipeSearch { + adaptive_beam_width, + relaxed_monotonicity_l, + sqpoll_idle_ms, + } => { write!(f, "PipeSearch")?; let has_abw = *adaptive_beam_width; let has_rm = relaxed_monotonicity_l.is_some(); @@ -144,12 +153,16 @@ impl fmt::Display for SearchMode { first = false; } if let Some(rm) = relaxed_monotonicity_l { - if !first { write!(f, ", ")?; } + if !first { + write!(f, ", ")?; + } write!(f, "rm_l={}", rm)?; first = false; } if let Some(sq) = sqpoll_idle_ms { - if !first { write!(f, ", ")?; } + if !first { + write!(f, ", ")?; + } write!(f, "sqpoll={}ms", sq)?; } write!(f, ")")?; diff --git a/diskann-disk/src/build/builder/tests.rs b/diskann-disk/src/build/builder/tests.rs index e73e552a7..347f834e4 100644 --- a/diskann-disk/src/build/builder/tests.rs +++ b/diskann-disk/src/build/builder/tests.rs @@ -21,8 +21,7 @@ 
mod chunkable_disk_index_build_tests { use crate::{ build::{ builder::core::disk_index_builder_tests::{ - new_vfs, verify_search_result_with_ground_truth, - CheckpointParams, + new_vfs, verify_search_result_with_ground_truth, CheckpointParams, IndexBuildFixture, TestParams, }, chunking::{ diff --git a/diskann-disk/src/search/pipelined/pipelined_reader.rs b/diskann-disk/src/search/pipelined/pipelined_reader.rs index 9a53eb12d..b12fe24fe 100644 --- a/diskann-disk/src/search/pipelined/pipelined_reader.rs +++ b/diskann-disk/src/search/pipelined/pipelined_reader.rs @@ -111,14 +111,11 @@ impl PipelinedReader { let buf_start = slot_id * self.slot_size; let buf_ptr = self.slot_bufs[buf_start..buf_start + self.slot_size].as_mut_ptr(); - let read_op = io_uring::opcode::Read::new( - io_uring::types::Fixed(0), - buf_ptr, - self.slot_size as u32, - ) - .offset(sector_offset) - .build() - .user_data(slot_id as u64); + let read_op = + io_uring::opcode::Read::new(io_uring::types::Fixed(0), buf_ptr, self.slot_size as u32) + .offset(sector_offset) + .build() + .user_data(slot_id as u64); // SAFETY: The buffer at slot_id is pre-allocated and will remain valid // for the duration of the IO operation. Each slot is used exclusively diff --git a/diskann-disk/src/search/provider/disk_provider.rs b/diskann-disk/src/search/provider/disk_provider.rs index f55d1c8ee..a49abe22d 100644 --- a/diskann-disk/src/search/provider/disk_provider.rs +++ b/diskann-disk/src/search/provider/disk_provider.rs @@ -2216,4 +2216,3 @@ mod disk_provider_tests { assert!(recall >= 60.0, "Match percentage is below 60%: {}", recall); } } - diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 0bd7bd8d6..925b8f4ca 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -20,11 +20,15 @@ use std::time::Instant; use byteorder::{ByteOrder, LittleEndian}; use diskann::{ graph::{ - glue::{ExpandBeam, HybridPredicate, IdIterator, SearchExt, SearchPostProcess, SearchStrategy}, + glue::{ + ExpandBeam, HybridPredicate, IdIterator, SearchExt, SearchPostProcess, SearchStrategy, + }, search_output_buffer, AdjacencyList, SearchOutputBuffer, SearchParams, }, neighbor::Neighbor, - provider::{Accessor, BuildQueryComputer, DefaultContext, DelegateNeighbor, HasId, NeighborAccessor}, + provider::{ + Accessor, BuildQueryComputer, DefaultContext, DelegateNeighbor, HasId, NeighborAccessor, + }, utils::object_pool::{ObjectPool, PoolOption, TryAsPooled}, ANNError, ANNResult, }; @@ -34,7 +38,7 @@ use diskann_providers::model::{ use diskann_vector::DistanceFunction; use crate::data_model::Cache; -use crate::search::pipelined::{PipelinedReader, PipelinedReaderConfig, MAX_IO_CONCURRENCY}; +use crate::search::pipelined::{PipelinedReader, PipelinedReaderConfig}; use crate::search::sector_math::{node_offset_in_sector, node_sector_index}; use crate::search::traits::VertexProviderFactory; @@ -71,7 +75,10 @@ impl LoadedNode { let node_data = sector_buf.get(offset..end).ok_or_else(|| { ANNError::log_index_error(format_args!( "Node data out of bounds: vertex {} offset {}..{} in buffer of len {}", - vertex_id, offset, end, sector_buf.len() + vertex_id, + offset, + end, + sector_buf.len() )) })?; @@ -79,7 +86,8 @@ impl LoadedNode { if fp_len > node_data.len() { return Err(ANNError::log_index_error(format_args!( "fp_vector_len {} exceeds node_data len {}", - fp_len, node_data.len() + fp_len, + node_data.len() ))); } @@ -94,7 +102,8 
@@ impl LoadedNode { self.adjacency_list.clear(); for i in 0..num_neighbors { let start = 4 + i * 4; - self.adjacency_list.push(LittleEndian::read_u32(&neighbor_data[start..start + 4])); + self.adjacency_list + .push(LittleEndian::read_u32(&neighbor_data[start..start + 4])); } self.rank = rank; @@ -109,12 +118,6 @@ struct InFlightIo { rank: u64, } -/// Max buffer slots to use, based on beam width. -#[inline] -fn max_slots(beam_width: usize) -> usize { - (beam_width * 2).clamp(16, MAX_IO_CONCURRENCY) -} - // --------------------------------------------------------------------------- // Poolable scratch: PipelinedReader + PQScratch, reused across queries // --------------------------------------------------------------------------- @@ -185,7 +188,8 @@ impl TryAsPooled for PipelinedScratch { let max_slots = self.reader.max_slots(); self.reader.reset(); // Return all loaded_nodes back to the pool before clearing - self.node_pool.extend(self.loaded_nodes.drain().map(|(_, node)| node)); + self.node_pool + .extend(self.loaded_nodes.drain().map(|(_, node)| node)); self.in_flight_ios.clear(); self.expanded_ids.clear(); self.distance_cache.clear(); @@ -314,15 +318,6 @@ where Ok(()) } - /// Compute PQ distances for a set of neighbor IDs. - /// `ids` must not alias any mutable scratch fields used by PQ computation. - fn pq_distances(&mut self, ids: &[u32], mut f: F) -> ANNResult<()> - where - F: FnMut(f32, u32), - { - Self::pq_distances_inner(&mut self.scratch.pq_scratch, self.provider, ids, &mut f) - } - fn pq_distances_inner( pq: &mut PQScratch, provider: &DiskProvider, @@ -492,13 +487,13 @@ impl NeighborAccessor for PipelinedNeighborDelegate<'_, '_, Data> where Data: GraphDataType, { - fn get_neighbors( + async fn get_neighbors( self, _id: Self::Id, _neighbors: &mut AdjacencyList, - ) -> impl Future> + Send { + ) -> ANNResult { // Neighbor expansion is handled by expand_available, not get_neighbors - async { Ok(self) } + Ok(self) } } @@ -547,13 +542,12 @@ where self.scratch.neighbor_buf.clear(); self.scratch.neighbor_buf.extend(vec_id_itr); let mut f = f; - let PipelinedScratch { ref mut pq_scratch, ref neighbor_buf, .. } = *self.scratch; - Self::pq_distances_inner( - pq_scratch, - self.provider, - neighbor_buf, - &mut f, - ) + let PipelinedScratch { + ref mut pq_scratch, + ref neighbor_buf, + .. + } = *self.scratch; + Self::pq_distances_inner(pq_scratch, self.provider, neighbor_buf, &mut f) } } @@ -579,7 +573,8 @@ where ) { let mut node = self.scratch.acquire_node(); node.fp_vector.clear(); - node.fp_vector.extend_from_slice(bytemuck::cast_slice(vec_data)); + node.fp_vector + .extend_from_slice(bytemuck::cast_slice(vec_data)); node.adjacency_list.clear(); node.adjacency_list.extend(adj_list.iter().copied()); node.rank = self.next_rank; @@ -623,87 +618,84 @@ where /// 1. If `ids` provides candidates, pick the first loaded match (queue order) /// 2. 
Otherwise, pick the loaded node with the lowest submission rank /// (earliest submitted = best PQ distance at submission time) - fn expand_available( + async fn expand_available( &mut self, ids: impl Iterator + Send, _computer: &Self::QueryComputer, mut pred: P, mut on_neighbors: F, - ) -> impl std::future::Future>> + Send + ) -> ANNResult> where P: HybridPredicate + Send + Sync, F: FnMut(f32, Self::Id) + Send, { - async move { - self.scratch.expanded_ids.clear(); + self.scratch.expanded_ids.clear(); - // Non-blocking poll for completions - self.drain_completions()?; + // Non-blocking poll for completions + self.drain_completions()?; - if self.scratch.loaded_nodes.is_empty() { - return Ok(Vec::new()); - } - - // Try caller's priority order first - let mut best_vid: Option = None; - for id in ids { - if self.scratch.loaded_nodes.contains_key(&id) { - best_vid = Some(id); - break; - } - } + if self.scratch.loaded_nodes.is_empty() { + return Ok(Vec::new()); + } - // Fallback: pick loaded node with lowest rank (best PQ at submission) - if best_vid.is_none() { - best_vid = self - .scratch - .loaded_nodes - .iter() - .min_by_key(|(_, node)| node.rank) - .map(|(&id, _)| id); + // Try caller's priority order first + let mut best_vid: Option = None; + for id in ids { + if self.scratch.loaded_nodes.contains_key(&id) { + best_vid = Some(id); + break; } + } - let vid = match best_vid { - Some(id) => id, - None => return Ok(Vec::new()), - }; - let node = self.scratch.loaded_nodes.remove(&vid).unwrap(); - self.scratch.expanded_ids.push(vid); - - // Compute full-precision distance and cache it for post-processing - let cpu_start = Instant::now(); - let fp_vec: &[Data::VectorDataType] = bytemuck::cast_slice(&node.fp_vector); - let fp_dist = self - .provider - .distance_comparer - .evaluate_similarity(self.query, fp_vec); - self.scratch.distance_cache.insert(vid, fp_dist); - - // Get unvisited neighbors into reusable buffer - self.scratch.neighbor_buf.clear(); - self.scratch.neighbor_buf.extend( - node.adjacency_list - .iter() - .copied() - .filter(|&nbr| (nbr as usize) < self.num_points && pred.eval_mut(&nbr)), - ); - - if !self.scratch.neighbor_buf.is_empty() { - let PipelinedScratch { ref mut pq_scratch, ref neighbor_buf, .. 
} = *self.scratch; - Self::pq_distances_inner( - pq_scratch, - self.provider, - neighbor_buf, - &mut on_neighbors, - )?; - } - self.cpu_time += cpu_start.elapsed(); + // Fallback: pick loaded node with lowest rank (best PQ at submission) + if best_vid.is_none() { + best_vid = self + .scratch + .loaded_nodes + .iter() + .min_by_key(|(_, node)| node.rank) + .map(|(&id, _)| id); + } - // Return node to pool for reuse - self.scratch.release_node(node); + let vid = match best_vid { + Some(id) => id, + None => return Ok(Vec::new()), + }; + let node = self.scratch.loaded_nodes.remove(&vid).unwrap(); + self.scratch.expanded_ids.push(vid); + + // Compute full-precision distance and cache it for post-processing + let cpu_start = Instant::now(); + let fp_vec: &[Data::VectorDataType] = bytemuck::cast_slice(&node.fp_vector); + let fp_dist = self + .provider + .distance_comparer + .evaluate_similarity(self.query, fp_vec); + self.scratch.distance_cache.insert(vid, fp_dist); + + // Get unvisited neighbors into reusable buffer + self.scratch.neighbor_buf.clear(); + self.scratch.neighbor_buf.extend( + node.adjacency_list + .iter() + .copied() + .filter(|&nbr| (nbr as usize) < self.num_points && pred.eval_mut(&nbr)), + ); - Ok(std::mem::take(&mut self.scratch.expanded_ids)) + if !self.scratch.neighbor_buf.is_empty() { + let PipelinedScratch { + ref mut pq_scratch, + ref neighbor_buf, + .. + } = *self.scratch; + Self::pq_distances_inner(pq_scratch, self.provider, neighbor_buf, &mut on_neighbors)?; } + self.cpu_time += cpu_start.elapsed(); + + // Return node to pool for reuse + self.scratch.release_node(node); + + Ok(std::mem::take(&mut self.scratch.expanded_ids)) } /// Returns true when there are in-flight IO operations. @@ -821,11 +813,12 @@ pub struct PipelinedPostProcessor<'a> { filter: &'a (dyn Fn(&u32) -> bool + Send + Sync), } -impl SearchPostProcess< - PipelinedDiskAccessor<'_, Data>, - [Data::VectorDataType], - (u32, Data::AssociatedDataType), -> for PipelinedPostProcessor<'_> +impl + SearchPostProcess< + PipelinedDiskAccessor<'_, Data>, + [Data::VectorDataType], + (u32, Data::AssociatedDataType), + > for PipelinedPostProcessor<'_> where Data: GraphDataType, { @@ -860,11 +853,9 @@ where } } -impl<'this, Data> SearchStrategy< - DiskProvider, - [Data::VectorDataType], - (u32, Data::AssociatedDataType), -> for PipelinedSearchStrategy<'this, Data> +impl<'this, Data> + SearchStrategy, [Data::VectorDataType], (u32, Data::AssociatedDataType)> + for PipelinedSearchStrategy<'this, Data> where Data: GraphDataType, { @@ -878,10 +869,8 @@ where provider: &'a DiskProvider, _context: &DefaultContext, ) -> Result, Self::SearchAccessorError> { - let scratch = PoolOption::try_pooled( - &self.config.scratch_pool, - self.config.scratch_args.clone(), - )?; + let scratch = + PoolOption::try_pooled(&self.config.scratch_pool, self.config.scratch_args.clone())?; let mut accessor = PipelinedDiskAccessor::new( provider, self.query, diff --git a/diskann-tools/src/utils/search_disk_index.rs b/diskann-tools/src/utils/search_disk_index.rs index 1a2936f5e..575b18280 100644 --- a/diskann-tools/src/utils/search_disk_index.rs +++ b/diskann-tools/src/utils/search_disk_index.rs @@ -389,7 +389,7 @@ where _span.set_attribute(KeyValue::new("latency_95", latency_95 as f64)); _span.set_attribute(KeyValue::new("mean_cpus", mean_cpus)); _span.set_attribute(KeyValue::new("mean_io_time", mean_io_time)); - _span.set_attribute(KeyValue::new("mean_ios", mean_ios as f64)); + _span.set_attribute(KeyValue::new("mean_ios", mean_ios)); 
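// (mean_ios, like the neighboring mean_* values, is presumably already an
// f64, so the dropped cast was redundant rather than a behavior change.)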
_span.set_attribute(KeyValue::new("mean_comps", mean_comps)); _span.set_attribute(KeyValue::new("mean_hops", mean_hops)); _span.set_attribute(KeyValue::new("recall", recall as f64)); diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index e23ceaa40..c967176b4 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -398,8 +398,7 @@ where let mut search_record = VisitedSearchRecord::new(self.estimate_visited_set_capacity(Some(search_l))); - let default_params = SearchParams::new(1, scratch.best.search_l(), None) - .expect("valid default search params"); + let default_params = SearchParams::new(1, scratch.best.search_l(), None)?; self.search_internal( &default_params, &start_ids, @@ -524,8 +523,7 @@ where self.estimate_visited_set_capacity(Some(scratch.best.search_l())), ); - let default_params = SearchParams::new(1, scratch.best.search_l(), None) - .expect("valid default search params"); + let default_params = SearchParams::new(1, scratch.best.search_l(), None)?; self.search_internal( &default_params, &start_ids, @@ -1334,8 +1332,7 @@ where let mut scratch = self.search_scratch(l_value, start_ids.len()); - let default_params = SearchParams::new(1, scratch.best.search_l(), None) - .expect("valid default search params"); + let default_params = SearchParams::new(1, scratch.best.search_l(), None)?; self.search_internal( &default_params, &start_ids, @@ -2162,7 +2159,11 @@ where // Step 2: Insert neighbors (updates queue before IO decision) let worst_before = { let sz = scratch.best.size().min(scratch.best.search_l()); - if sz > 0 { scratch.best.get(sz - 1).distance } else { f32::MAX } + if sz > 0 { + scratch.best.get(sz - 1).distance + } else { + f32::MAX + } }; neighbors .iter() @@ -2170,20 +2171,22 @@ where scratch.cmps += neighbors.len() as u32; scratch.hops += expanded_ids.len() as u32; - if search_params.adaptive_beam_width && !expanded_ids.is_empty() { - if max_marker >= search_params.abw_convergence_depth { - let improved = neighbors.iter().any(|n| n.distance < worst_before); - abw_total += 1; - if improved { - abw_useful += 1; - } - // Grow when ≤10% waste (matching PipeANN's kWasteThreshold) - if abw_total > 0 - && (abw_total - abw_useful) as f64 / abw_total as f64 <= 0.1 - { - cur_beam_width = - (cur_beam_width + 1).max(search_params.initial_beam_width).min(beam_width); - } + if search_params.adaptive_beam_width + && !expanded_ids.is_empty() + && max_marker >= search_params.abw_convergence_depth + { + let improved = neighbors.iter().any(|n| n.distance < worst_before); + abw_total += 1; + if improved { + abw_useful += 1; + } + // Grow when ≤10% waste (matching PipeANN's kWasteThreshold) + if abw_total > 0 + && (abw_total - abw_useful) as f64 / abw_total as f64 <= 0.1 + { + cur_beam_width = (cur_beam_width + 1) + .max(search_params.initial_beam_width) + .min(beam_width); } } @@ -2249,7 +2252,11 @@ where let worst_before = { let sz = scratch.best.size().min(scratch.best.search_l()); - if sz > 0 { scratch.best.get(sz - 1).distance } else { f32::MAX } + if sz > 0 { + scratch.best.get(sz - 1).distance + } else { + f32::MAX + } }; neighbors .iter() @@ -2257,19 +2264,21 @@ where scratch.cmps += neighbors.len() as u32; scratch.hops += expanded_ids.len() as u32; - if search_params.adaptive_beam_width && !expanded_ids.is_empty() { - if max_marker >= search_params.abw_convergence_depth { - let improved = neighbors.iter().any(|n| n.distance < worst_before); - abw_total += 1; - if improved { - abw_useful += 1; - } - if abw_total > 0 - && (abw_total - abw_useful) 
as f64 / abw_total as f64 <= 0.1 - { - cur_beam_width = - (cur_beam_width + 1).max(search_params.initial_beam_width).min(beam_width); - } + if search_params.adaptive_beam_width + && !expanded_ids.is_empty() + && max_marker >= search_params.abw_convergence_depth + { + let improved = neighbors.iter().any(|n| n.distance < worst_before); + abw_total += 1; + if improved { + abw_useful += 1; + } + if abw_total > 0 + && (abw_total - abw_useful) as f64 / abw_total as f64 <= 0.1 + { + cur_beam_width = (cur_beam_width + 1) + .max(search_params.initial_beam_width) + .min(beam_width); } } } @@ -2280,18 +2289,13 @@ where // the sorted queue that the top candidates have been explored. // After convergence, the search continues for rm_l additional node // expansions to improve recall beyond the greedy optimum. - if let Some(rm_l) = search_params.relaxed_monotonicity_l { - if rm_l > 0 { - if max_marker >= search_params.abw_convergence_depth - && converge_size.is_none() - { - converge_size = Some(scratch.hops as usize); - } - if let Some(cs) = converge_size { - if (scratch.hops as usize) >= cs + rm_l { - break; - } - } + if let Some(rm_l) = search_params.relaxed_monotonicity_l.filter(|&l| l > 0) { + if max_marker >= search_params.abw_convergence_depth && converge_size.is_none() + { + converge_size = Some(scratch.hops as usize); + } + if converge_size.is_some_and(|cs| (scratch.hops as usize) >= cs + rm_l) { + break; } } } @@ -2783,8 +2787,8 @@ where let mut scratch = self.search_scratch(search_params.starting_l_value, start_ids.len()); - let range_default_params = SearchParams::new(1, scratch.best.search_l(), search_params.beam_width) - .expect("valid default search params"); + let range_default_params = + SearchParams::new(1, scratch.best.search_l(), search_params.beam_width)?; let initial_stats = self .search_internal( &range_default_params, @@ -3134,8 +3138,8 @@ where .into_ann_result()?; let start_ids = accessor.starting_points().await?; - let default_params = SearchParams::new(1, search_state.scratch.best.search_l(), None) - .expect("valid default search params"); + let default_params = + SearchParams::new(1, search_state.scratch.best.search_l(), None)?; self.search_internal( &default_params, &start_ids, diff --git a/diskann/src/neighbor/queue.rs b/diskann/src/neighbor/queue.rs index 8027f47d5..f90ee4213 100644 --- a/diskann/src/neighbor/queue.rs +++ b/diskann/src/neighbor/queue.rs @@ -19,7 +19,15 @@ pub trait NeighborPriorityQueueIdType: /// Any type that implements all the individual requirements for /// `NeighborPriorityQueueIdType` implements the full trait. impl NeighborPriorityQueueIdType for T where - T: Default + Eq + Clone + Copy + std::fmt::Debug + std::fmt::Display + std::hash::Hash + Send + Sync + T: Default + + Eq + + Clone + + Copy + + std::fmt::Debug + + std::fmt::Display + + std::hash::Hash + + Send + + Sync { } From 554ece9364e5045f67b2c27384bbaa2e8bd91227 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 18:05:29 -0800 Subject: [PATCH 39/46] Remove adaptive beam width and relaxed monotonicity Strip ABW and RM from the core search loop to minimize changes needed for the pipelining PR. These optimizations can be added back in a separate PR. 
- Remove adaptive_beam_width, relaxed_monotonicity_l, initial_beam_width, abw_convergence_depth from SearchParams - Remove ABW waste tracking and RM convergence logic from search_internal - Simplify SearchMode enum (BeamSearch is now a unit variant) - Simplify PipelinedConfig (no ABW/RM fields) - Update benchmark to use searcher.search() directly --- diskann-benchmark/example/pipe-search.json | 3 +- .../src/backend/disk_index/search.rs | 39 ++----- diskann-benchmark/src/inputs/disk.rs | 89 ++-------------- .../src/search/provider/pipelined_accessor.rs | 14 +-- diskann/src/graph/index.rs | 100 +----------------- diskann/src/graph/misc.rs | 26 ----- 6 files changed, 22 insertions(+), 249 deletions(-) diff --git a/diskann-benchmark/example/pipe-search.json b/diskann-benchmark/example/pipe-search.json index 1a91353aa..6b0a97871 100644 --- a/diskann-benchmark/example/pipe-search.json +++ b/diskann-benchmark/example/pipe-search.json @@ -70,8 +70,7 @@ "distance": "squared_l2", "vector_filters_file": null, "search_mode": { - "mode": "PipeSearch", - "relaxed_monotonicity_l": 50 + "mode": "PipeSearch" } } } diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index c8b92c59f..7feca085f 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -9,7 +9,7 @@ use std::{collections::HashSet, fmt, sync::atomic::AtomicBool, sync::Arc, time:: use opentelemetry::{global, trace::Span, trace::Tracer}; use opentelemetry_sdk::trace::SdkTracerProvider; -use diskann::{graph::SearchParams, utils::VectorRepr, ANNResult}; +use diskann::{utils::VectorRepr, ANNResult}; use diskann_benchmark_runner::{files::InputFile, utils::MicroSeconds}; #[cfg(target_os = "linux")] use diskann_disk::search::pipelined::PipelinedReaderConfig; @@ -324,10 +324,7 @@ where let has_any_search_failed = AtomicBool::new(false); match &search_params.search_mode { - SearchMode::BeamSearch { - adaptive_beam_width, - relaxed_monotonicity_l, - } => { + SearchMode::BeamSearch => { let searcher = &DiskIndexSearcher::, _>::new( search_params.num_threads, search_params.search_io_limit.unwrap_or(usize::MAX), @@ -337,9 +334,6 @@ where None, )?; - let abw = *adaptive_beam_width; - let rm_l = *relaxed_monotonicity_l; - logger.log_checkpoint("index_loaded"); search_results_per_l = run_search_loop( @@ -369,23 +363,12 @@ where as Box bool + Send + Sync>) }; - let mut sp = SearchParams::new( - search_params.recall_at as usize, - l as usize, - Some(search_params.beam_width), - ) - .unwrap(); - if abw { - sp = sp.with_adaptive_beam_width(); - } - if let Some(rm) = rm_l { - sp = sp.with_relaxed_monotonicity(rm); - } - write_query_result( - searcher.search_with_params( + searcher.search( q, - &sp, + search_params.recall_at, + l, + search_params.beam_width, vector_filter, search_params.is_flat_search, ), @@ -404,11 +387,7 @@ where } // Pipelined search — for read-only search on completed (static) indices only. // Uses io_uring for IO/compute overlap through the generic search loop. 
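// With ABW/RM gone, a matching benchmark config entry (cf. the
// pipe-search.json example earlier in the series) reduces to, e.g.:
//   "search_mode": { "mode": "PipeSearch", "sqpoll_idle_ms": 100 }
// (the value 100 is illustrative; the field is optional and Linux-only).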
- SearchMode::PipeSearch { - adaptive_beam_width, - relaxed_monotonicity_l, - sqpoll_idle_ms, - } => { + SearchMode::PipeSearch { sqpoll_idle_ms } => { #[cfg(target_os = "linux")] { use diskann::utils::object_pool::ObjectPool; @@ -465,8 +444,6 @@ where searcher.with_pipelined_config(PipelinedConfig { beam_width: search_params.beam_width, - adaptive_beam_width: *adaptive_beam_width, - relaxed_monotonicity_l: *relaxed_monotonicity_l, node_cache, scratch_pool, scratch_args, @@ -518,7 +495,7 @@ where } #[cfg(not(target_os = "linux"))] { - let _ = (adaptive_beam_width, relaxed_monotonicity_l, sqpoll_idle_ms); + let _ = sqpoll_idle_ms; anyhow::bail!("PipeSearch is only supported on Linux"); } } diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs index fcc2739c9..cf995cd09 100644 --- a/diskann-benchmark/src/inputs/disk.rs +++ b/diskann-benchmark/src/inputs/disk.rs @@ -74,98 +74,29 @@ pub(crate) struct DiskIndexBuild { /// Search algorithm to use for disk index search. #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(tag = "mode")] +#[derive(Default)] pub(crate) enum SearchMode { /// Standard beam search (default, current behavior). - BeamSearch { - /// Start with a smaller beam and grow adaptively. Defaults to false. - #[serde(default)] - adaptive_beam_width: bool, - /// Optional relaxed monotonicity parameter for early termination. - #[serde(default)] - relaxed_monotonicity_l: Option, - }, + #[default] + BeamSearch, /// Pipelined search through the generic search loop (queue-based ExpandBeam). /// Overlaps IO and compute using io_uring on Linux. #[serde(alias = "UnifiedPipeSearch")] PipeSearch { - /// Start with a smaller beam and grow adaptively. Defaults to true. - #[serde(default = "default_true")] - adaptive_beam_width: bool, - /// Optional relaxed monotonicity parameter for early termination. - #[serde(default)] - relaxed_monotonicity_l: Option, /// Enable kernel-side SQ polling (ms idle timeout). None = disabled. 
#[serde(default)] sqpoll_idle_ms: Option, }, } -impl Default for SearchMode { - fn default() -> Self { - SearchMode::BeamSearch { - adaptive_beam_width: false, - relaxed_monotonicity_l: None, - } - } -} - impl fmt::Display for SearchMode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - SearchMode::BeamSearch { - adaptive_beam_width, - relaxed_monotonicity_l, - } => { - write!(f, "BeamSearch")?; - let has_abw = *adaptive_beam_width; - let has_rm = relaxed_monotonicity_l.is_some(); - if has_abw || has_rm { - write!(f, "(")?; - let mut first = true; - if has_abw { - write!(f, "abw")?; - first = false; - } - if let Some(rm) = relaxed_monotonicity_l { - if !first { - write!(f, ", ")?; - } - write!(f, "rm_l={}", rm)?; - } - write!(f, ")")?; - } - Ok(()) - } - SearchMode::PipeSearch { - adaptive_beam_width, - relaxed_monotonicity_l, - sqpoll_idle_ms, - } => { + SearchMode::BeamSearch => write!(f, "BeamSearch"), + SearchMode::PipeSearch { sqpoll_idle_ms } => { write!(f, "PipeSearch")?; - let has_abw = *adaptive_beam_width; - let has_rm = relaxed_monotonicity_l.is_some(); - let has_sq = sqpoll_idle_ms.is_some(); - if has_abw || has_rm || has_sq { - write!(f, "(")?; - let mut first = true; - if has_abw { - write!(f, "abw")?; - first = false; - } - if let Some(rm) = relaxed_monotonicity_l { - if !first { - write!(f, ", ")?; - } - write!(f, "rm_l={}", rm)?; - first = false; - } - if let Some(sq) = sqpoll_idle_ms { - if !first { - write!(f, ", ")?; - } - write!(f, "sqpoll={}ms", sq)?; - } - write!(f, ")")?; + if let Some(sq) = sqpoll_idle_ms { + write!(f, "(sqpoll={}ms)", sq)?; } Ok(()) } @@ -173,10 +104,6 @@ impl fmt::Display for SearchMode { } } -fn default_true() -> bool { - true -} - /// Search phase configuration #[derive(Debug, Deserialize, Serialize)] pub(crate) struct DiskSearchPhase { @@ -344,7 +271,7 @@ impl CheckDeserialization for DiskSearchPhase { } } match &self.search_mode { - SearchMode::BeamSearch { .. } => {} + SearchMode::BeamSearch => {} SearchMode::PipeSearch { .. } => {} } Ok(()) diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 925b8f4ca..b543010a8 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -763,11 +763,6 @@ where /// Configuration for creating a pipelined search through DiskIndexSearcher. pub struct PipelinedConfig> { pub beam_width: usize, - /// Start with a smaller beam and grow adaptively. - pub adaptive_beam_width: bool, - /// Optional relaxed monotonicity: continue exploring this many extra - /// comparisons after the candidate list converges. - pub relaxed_monotonicity_l: Option, /// Shared node cache. Nodes found here skip disk IO entirely. pub node_cache: Arc>, /// Pooled scratch (io_uring reader + PQ buffers), created once and reused. 
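A minimal, self-contained sketch of the create-once/reuse contract these two fields describe, using a hypothetical Pool stand-in rather than the crate's ObjectPool/TryAsPooled machinery; buffers are cleared on release but keep their allocations, so steady-state queries perform no heap allocation:

use std::collections::VecDeque;

// Stand-in for the pooled scratch (think io_uring slots plus PQ buffers).
struct Scratch {
    buf: Vec<u32>,
}

// Hypothetical pool type, illustrative only.
struct Pool {
    free: VecDeque<Scratch>,
}

impl Pool {
    fn new(n: usize, cap: usize) -> Self {
        let free = (0..n)
            .map(|_| Scratch { buf: Vec::with_capacity(cap) })
            .collect();
        Pool { free }
    }

    // Check out a scratch; fall back to a fresh one if the pool is empty.
    fn acquire(&mut self) -> Scratch {
        self.free
            .pop_front()
            .unwrap_or_else(|| Scratch { buf: Vec::new() })
    }

    // Clear per-query state but keep the heap allocation, then return it.
    fn release(&mut self, mut s: Scratch) {
        s.buf.clear();
        self.free.push_back(s);
    }
}

fn main() {
    let mut pool = Pool::new(2, 1024);
    let mut s = pool.acquire();
    s.buf.extend([1, 2, 3]);
    pool.release(s); // the 1024-element capacity survives for the next query
    assert_eq!(pool.free.len(), 2);
}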
@@ -932,19 +927,12 @@ where io_stats: io_stats.clone(), }; - let mut search_params = SearchParams::new( + let search_params = SearchParams::new( return_list_size as usize, search_list_size as usize, Some(beam_width), )?; - if config.adaptive_beam_width { - search_params = search_params.with_adaptive_beam_width(); - } - if let Some(rm_l) = config.relaxed_monotonicity_l { - search_params = search_params.with_relaxed_monotonicity(rm_l); - } - let mut indices = vec![0u32; return_list_size as usize]; let mut distances = vec![0f32; return_list_size as usize]; let mut associated_data = diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index c967176b4..c58444156 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2082,25 +2082,6 @@ where async move { let beam_width = search_params.beam_width.unwrap_or(1); - // Adaptive beam width: start at initial_beam_width and grow based on - // IO waste ratio. Mirrors PipeANN: grow +1 when ≤10% of IOs are wasted - // (expanded node no longer in top-L). Only kicks in after 5 hops. - let mut cur_beam_width = if search_params.adaptive_beam_width { - beam_width.min(search_params.initial_beam_width) - } else { - beam_width - }; - let mut abw_useful: u32 = 0; // IOs whose expansion was still useful - let mut abw_total: u32 = 0; // total IOs tracked for waste ratio - // Tracks the deepest position in the sorted queue where we found an - // unsubmitted candidate. Mirrors PipeANN's max_marker — when this - // reaches the convergence gate, the search has explored past initial - // warmup and convergence-dependent features (ABW, RM) activate. - let mut max_marker: usize = 0; - - // Relaxed monotonicity: continue exploring after convergence - let mut converge_size: Option = None; - // paged search can call search_internal multiple times, we only need to initialize // state if not already initialized. if scratch.visited.is_empty() { @@ -2157,47 +2138,19 @@ where } // Step 2: Insert neighbors (updates queue before IO decision) - let worst_before = { - let sz = scratch.best.size().min(scratch.best.search_l()); - if sz > 0 { - scratch.best.get(sz - 1).distance - } else { - f32::MAX - } - }; neighbors .iter() .for_each(|neighbor| scratch.best.insert(*neighbor)); scratch.cmps += neighbors.len() as u32; scratch.hops += expanded_ids.len() as u32; - if search_params.adaptive_beam_width - && !expanded_ids.is_empty() - && max_marker >= search_params.abw_convergence_depth - { - let improved = neighbors.iter().any(|n| n.distance < worst_before); - abw_total += 1; - if improved { - abw_useful += 1; - } - // Grow when ≤10% waste (matching PipeANN's kWasteThreshold) - if abw_total > 0 - && (abw_total - abw_useful) as f64 / abw_total as f64 <= 0.1 - { - cur_beam_width = (cur_beam_width + 1) - .max(search_params.initial_beam_width) - .min(beam_width); - } - } - // Step 3: Submit one IO (with updated queue) let inflight = accessor.inflight_count(); - if inflight < cur_beam_width { + if inflight < beam_width { scratch.beam_nodes.clear(); - if let Some((pos, closest_node)) = + if let Some((_, closest_node)) = scratch.best.peek_best_unsubmitted_with_position(&submitted) { - max_marker = max_marker.max(pos); search_record.record(closest_node, scratch.hops, scratch.cmps); submitted.insert(closest_node.id); scratch.beam_nodes.push(closest_node.id); @@ -2214,16 +2167,13 @@ where } } else { // Non-pipelined path OR initial burst (has_pending=false). 
- // Both pipelined and non-pipelined use the same node selection - // to track max_marker for ABW/RM convergence detection. - let submit_limit = if has_pending { 0 } else { cur_beam_width }; + let submit_limit = if has_pending { 0 } else { beam_width }; scratch.beam_nodes.clear(); while scratch.beam_nodes.len() < submit_limit { - if let Some((pos, closest_node)) = + if let Some((_, closest_node)) = scratch.best.peek_best_unsubmitted_with_position(&submitted) { - max_marker = max_marker.max(pos); search_record.record(closest_node, scratch.hops, scratch.cmps); submitted.insert(closest_node.id); scratch.beam_nodes.push(closest_node.id); @@ -2250,53 +2200,11 @@ where submitted.remove(&id); } - let worst_before = { - let sz = scratch.best.size().min(scratch.best.search_l()); - if sz > 0 { - scratch.best.get(sz - 1).distance - } else { - f32::MAX - } - }; neighbors .iter() .for_each(|neighbor| scratch.best.insert(*neighbor)); scratch.cmps += neighbors.len() as u32; scratch.hops += expanded_ids.len() as u32; - - if search_params.adaptive_beam_width - && !expanded_ids.is_empty() - && max_marker >= search_params.abw_convergence_depth - { - let improved = neighbors.iter().any(|n| n.distance < worst_before); - abw_total += 1; - if improved { - abw_useful += 1; - } - if abw_total > 0 - && (abw_total - abw_useful) as f64 / abw_total as f64 <= 0.1 - { - cur_beam_width = (cur_beam_width + 1) - .max(search_params.initial_beam_width) - .min(beam_width); - } - } - } - - // Relaxed monotonicity: detect convergence and extend search. - // Convergence is detected when max_marker reaches the convergence - // depth — meaning the best unsubmitted candidate is deep enough in - // the sorted queue that the top candidates have been explored. - // After convergence, the search continues for rm_l additional node - // expansions to improve recall beyond the greedy optimum. - if let Some(rm_l) = search_params.relaxed_monotonicity_l.filter(|&l| l > 0) { - if max_marker >= search_params.abw_convergence_depth && converge_size.is_none() - { - converge_size = Some(scratch.hops as usize); - } - if converge_size.is_some_and(|cs| (scratch.hops as usize) >= cs + rm_l) { - break; - } } } diff --git a/diskann/src/graph/misc.rs b/diskann/src/graph/misc.rs index 1829cb1f9..8c58f6edb 100644 --- a/diskann/src/graph/misc.rs +++ b/diskann/src/graph/misc.rs @@ -41,18 +41,6 @@ pub struct SearchParams { pub k_value: usize, pub l_value: usize, pub beam_width: Option, - /// Enable adaptive beam width based on waste ratio tracking. - pub adaptive_beam_width: bool, - /// Starting beam width when adaptive_beam_width is true. - /// Defaults to 4 (matching PipeANN). Grows up to beam_width. - pub initial_beam_width: usize, - /// Queue depth threshold before adaptive beam width and relaxed monotonicity - /// activate. Defaults to 5 (matching PipeANN's max_marker convergence gate). - /// When the best unsubmitted candidate is at position ≥ this value in the - /// sorted queue, the search is considered past initial warmup. - pub abw_convergence_depth: usize, - /// Optional relaxed monotonicity parameter. 
- pub relaxed_monotonicity_l: Option, } #[derive(Debug, Error)] @@ -92,26 +80,12 @@ impl SearchParams { k_value, l_value, beam_width, - adaptive_beam_width: false, - initial_beam_width: 4, - abw_convergence_depth: 5, - relaxed_monotonicity_l: None, }) } pub fn new_default(k_value: usize, l_value: usize) -> Result { SearchParams::new(k_value, l_value, None) } - - pub fn with_adaptive_beam_width(mut self) -> Self { - self.adaptive_beam_width = true; - self - } - - pub fn with_relaxed_monotonicity(mut self, l: usize) -> Self { - self.relaxed_monotonicity_l = Some(l); - self - } } // Parameters for the search algorithm From 39259563aab3ac0a08e7d91cee6bb246c3523b14 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 18:11:08 -0800 Subject: [PATCH 40/46] Remove peek_best_unsubmitted_with_position The position return value was only used for ABW's max_marker convergence tracking, which was removed. Simplify to peek_best_unsubmitted everywhere. --- diskann/src/graph/glue.rs | 3 +-- diskann/src/graph/index.rs | 8 ++------ diskann/src/neighbor/queue.rs | 26 +------------------------- 3 files changed, 4 insertions(+), 33 deletions(-) diff --git a/diskann/src/graph/glue.rs b/diskann/src/graph/glue.rs index c0db71859..93e3ea1c1 100644 --- a/diskann/src/graph/glue.rs +++ b/diskann/src/graph/glue.rs @@ -308,8 +308,7 @@ where /// Returns the number of IOs currently in-flight (submitted but not completed). /// - /// The search loop uses this to cap submissions at `cur_beam_width`, matching - /// PipeSearch's behavior of not over-committing speculative reads. + /// The search loop uses this to cap submissions at `beam_width`. /// Default: 0 (non-pipelined providers have no in-flight IO). fn inflight_count(&self) -> usize { 0 diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index c58444156..e377edef1 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2148,9 +2148,7 @@ where let inflight = accessor.inflight_count(); if inflight < beam_width { scratch.beam_nodes.clear(); - if let Some((_, closest_node)) = - scratch.best.peek_best_unsubmitted_with_position(&submitted) - { + if let Some(closest_node) = scratch.best.peek_best_unsubmitted(&submitted) { search_record.record(closest_node, scratch.hops, scratch.cmps); submitted.insert(closest_node.id); scratch.beam_nodes.push(closest_node.id); @@ -2171,9 +2169,7 @@ where scratch.beam_nodes.clear(); while scratch.beam_nodes.len() < submit_limit { - if let Some((_, closest_node)) = - scratch.best.peek_best_unsubmitted_with_position(&submitted) - { + if let Some(closest_node) = scratch.best.peek_best_unsubmitted(&submitted) { search_record.record(closest_node, scratch.hops, scratch.cmps); submitted.insert(closest_node.id); scratch.beam_nodes.push(closest_node.id); diff --git a/diskann/src/neighbor/queue.rs b/diskann/src/neighbor/queue.rs index f90ee4213..3d0527c2c 100644 --- a/diskann/src/neighbor/queue.rs +++ b/diskann/src/neighbor/queue.rs @@ -75,15 +75,6 @@ pub trait NeighborQueue: std::fmt::Debug + Send None } - /// Like `peek_best_unsubmitted`, but also returns the queue position (0-indexed). - /// The position indicates how deep into the sorted queue we had to search. - fn peek_best_unsubmitted_with_position( - &self, - submitted: &HashSet, - ) -> Option<(usize, Neighbor)> { - self.peek_best_unsubmitted(submitted).map(|n| (0, n)) - } - /// Find the node with matching `id`, mark it visited, and advance the cursor if needed. /// Returns true if found and marked, false otherwise. 
fn mark_visited_by_id(&mut self, _id: &I) -> bool { @@ -519,19 +510,11 @@ impl NeighborPriorityQueue { /// Return the first node that is not visited and not in `submitted`, /// scanning positions 0..min(size, search_param_l). Does not modify any state. pub fn peek_best_unsubmitted(&self, submitted: &HashSet) -> Option> { - self.peek_best_unsubmitted_with_position(submitted) - .map(|(_, n)| n) - } - - pub fn peek_best_unsubmitted_with_position( - &self, - submitted: &HashSet, - ) -> Option<(usize, Neighbor)> { let limit = self.search_param_l.min(self.size); for i in self.cursor..limit { let (id, visited) = self.id_visiteds[i]; if !visited && !submitted.contains(&id) { - return Some((i, Neighbor::new(id, self.distances[i]))); + return Some(Neighbor::new(id, self.distances[i])); } } None @@ -604,13 +587,6 @@ impl NeighborQueue for NeighborPriorityQueue< self.peek_best_unsubmitted(submitted) } - fn peek_best_unsubmitted_with_position( - &self, - submitted: &HashSet, - ) -> Option<(usize, Neighbor)> { - self.peek_best_unsubmitted_with_position(submitted) - } - fn mark_visited_by_id(&mut self, id: &I) -> bool { self.mark_visited_by_id(id) } From aaab5ca76f606829bdb5d2d89512c2f71a16483c Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 18:21:52 -0800 Subject: [PATCH 41/46] Unify search_internal into a single loop path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the two-branch pipelined/non-pipelined loop with a single three-phase iteration: 1. Expand — process whatever data is available 2. Select + Submit — fill pipeline up to beam_width 3. Wait — block only when idle with pending IOs Both pipelined and non-pipelined accessors follow the same path; the ExpandBeam trait defaults (no-op submit, synchronous expand) make it transparent. - Remove is_pipelined() from ExpandBeam trait - Use beam_width - inflight_count() for unified submission cap - Pass beam_nodes from previous submit as priority hint to expand_available (improves pipelined node selection) - Net -61 lines from search_internal --- .../src/backend/disk_index/search.rs | 2 +- .../src/search/provider/pipelined_accessor.rs | 4 - diskann/src/graph/glue.rs | 7 - diskann/src/graph/index.rs | 157 ++++++------------ 4 files changed, 49 insertions(+), 121 deletions(-) diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 7feca085f..319f86cf6 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -368,7 +368,7 @@ where q, search_params.recall_at, l, - search_params.beam_width, + Some(search_params.beam_width), vector_filter, search_params.is_flat_search, ), diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index b543010a8..317169b69 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -713,10 +713,6 @@ where let _ = self.wait_and_drain(); } } - - fn is_pipelined(&self) -> bool { - true - } } impl SearchExt for PipelinedDiskAccessor<'_, Data> diff --git a/diskann/src/graph/glue.rs b/diskann/src/graph/glue.rs index 93e3ea1c1..33863a79a 100644 --- a/diskann/src/graph/glue.rs +++ b/diskann/src/graph/glue.rs @@ -324,13 +324,6 @@ where /// Default: no-op (non-pipelined providers never need to wait). fn wait_for_io(&mut self) {} - /// Whether this accessor uses asynchronous IO (pipelined). 
- /// When true, the search loop uses speculative submission (peek without - /// marking visited). When false, the classic visited-at-selection path. - fn is_pipelined(&self) -> bool { - false - } - /// Expand all `ids` synchronously: load data, get neighbors, compute distances. /// /// This is the original single-shot expansion method. For non-pipelined providers, diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index e377edef1..8067f1324 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -397,10 +397,8 @@ where for attempt in 0..num_insert_attempts { let mut search_record = VisitedSearchRecord::new(self.estimate_visited_set_capacity(Some(search_l))); - - let default_params = SearchParams::new(1, scratch.best.search_l(), None)?; self.search_internal( - &default_params, + None, // beam_width &start_ids, &mut accessor, &computer, @@ -522,10 +520,8 @@ where let mut search_record = VisitedSearchRecord::new( self.estimate_visited_set_capacity(Some(scratch.best.search_l())), ); - - let default_params = SearchParams::new(1, scratch.best.search_l(), None)?; self.search_internal( - &default_params, + None, // beam_width &start_ids, &mut accessor, &computer, @@ -1331,10 +1327,8 @@ where let start_ids = search_accessor.starting_points().await?; let mut scratch = self.search_scratch(l_value, start_ids.len()); - - let default_params = SearchParams::new(1, scratch.best.search_l(), None)?; self.search_internal( - &default_params, + None, // beam_width &start_ids, &mut search_accessor, &computer, @@ -2066,7 +2060,7 @@ where // A is the accessor type, T is the query type used for BuildQueryComputer fn search_internal( &self, - search_params: &SearchParams, + beam_width: Option, start_ids: &[DP::InternalId], accessor: &mut A, computer: &A::QueryComputer, @@ -2080,7 +2074,7 @@ where Q: NeighborQueue, { async move { - let beam_width = search_params.beam_width.unwrap_or(1); + let beam_width = beam_width.unwrap_or(1); // paged search can call search_internal multiple times, we only need to initialize // state if not already initialized. @@ -2107,100 +2101,50 @@ where || accessor.has_pending()) && !accessor.terminate_early() { - let has_pending = accessor.has_pending(); - let pipelining = accessor.is_pipelined(); - - // PIPELINED ORDER (matching PipeANN's loop): - // 1. poll completions + expand one loaded node - // 2. insert neighbors into queue - // 3. submit one IO (with latest neighbor info) - // - // NON-PIPELINED ORDER (original beam search): - // 1. submit beam_width nodes - // 2. expand all (synchronous) - - if pipelining && has_pending { - // Step 1: Expand one loaded node (polls internally). - // Pass empty iterator — the accessor picks by rank. 
- neighbors.clear(); - let expanded_ids = accessor - .expand_available( - std::iter::empty(), - computer, - glue::NotInMut::new(&mut scratch.visited), - |distance, id| neighbors.push(Neighbor::new(id, distance)), - ) - .await?; - - for &id in &expanded_ids { - scratch.best.mark_visited_by_id(&id); - submitted.remove(&id); - } - - // Step 2: Insert neighbors (updates queue before IO decision) - neighbors - .iter() - .for_each(|neighbor| scratch.best.insert(*neighbor)); - scratch.cmps += neighbors.len() as u32; - scratch.hops += expanded_ids.len() as u32; - - // Step 3: Submit one IO (with updated queue) - let inflight = accessor.inflight_count(); - if inflight < beam_width { - scratch.beam_nodes.clear(); - if let Some(closest_node) = scratch.best.peek_best_unsubmitted(&submitted) { - search_record.record(closest_node, scratch.hops, scratch.cmps); - submitted.insert(closest_node.id); - scratch.beam_nodes.push(closest_node.id); - } - accessor.submit_expand(scratch.beam_nodes.iter().copied()); - } - - // Block if truly idle - if expanded_ids.is_empty() && has_pending { - let inflight = accessor.inflight_count(); - if inflight > 0 { - accessor.wait_for_io(); - } - } - } else { - // Non-pipelined path OR initial burst (has_pending=false). - let submit_limit = if has_pending { 0 } else { beam_width }; - - scratch.beam_nodes.clear(); - while scratch.beam_nodes.len() < submit_limit { - if let Some(closest_node) = scratch.best.peek_best_unsubmitted(&submitted) { - search_record.record(closest_node, scratch.hops, scratch.cmps); - submitted.insert(closest_node.id); - scratch.beam_nodes.push(closest_node.id); - } else { - break; - } - } + // Phase 1: Expand nodes whose data is available. + // Non-pipelined: synchronously expands beam_nodes from previous submit. + // Pipelined: polls IO completions and expands one loaded node. + // On the first iteration beam_nodes is empty — a no-op for both paths. + neighbors.clear(); + let expanded_ids = accessor + .expand_available( + scratch.beam_nodes.iter().copied(), + computer, + glue::NotInMut::new(&mut scratch.visited), + |distance, id| neighbors.push(Neighbor::new(id, distance)), + ) + .await?; - accessor.submit_expand(scratch.beam_nodes.iter().copied()); + for &id in &expanded_ids { + scratch.best.mark_visited_by_id(&id); + submitted.remove(&id); + } - neighbors.clear(); - let expanded_ids = accessor - .expand_available( - scratch.beam_nodes.iter().copied(), - computer, - glue::NotInMut::new(&mut scratch.visited), - |distance, id| neighbors.push(Neighbor::new(id, distance)), - ) - .await?; + neighbors + .iter() + .for_each(|neighbor| scratch.best.insert(*neighbor)); + scratch.cmps += neighbors.len() as u32; + scratch.hops += expanded_ids.len() as u32; - // Mark expanded nodes visited. - for &id in &expanded_ids { - scratch.best.mark_visited_by_id(&id); - submitted.remove(&id); + // Phase 2: Select and submit candidates to fill the pipeline. + // Non-pipelined: inflight is always 0, so this submits beam_width nodes. + // Pipelined: submits enough to keep beam_width IOs in flight. 
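+ // Worked example: beam_width = 8 with 5 reads still in flight gives
+ // slots = 8 - 5 = 3, so at most three candidates are submitted below;
+ // a non-pipelined accessor always reports 0 in flight, so its slots
+ // value is the full beam_width every iteration.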
+ scratch.beam_nodes.clear(); + let slots = beam_width.saturating_sub(accessor.inflight_count()); + while scratch.beam_nodes.len() < slots { + if let Some(closest_node) = scratch.best.peek_best_unsubmitted(&submitted) { + search_record.record(closest_node, scratch.hops, scratch.cmps); + submitted.insert(closest_node.id); + scratch.beam_nodes.push(closest_node.id); + } else { + break; } + } + accessor.submit_expand(scratch.beam_nodes.iter().copied()); - neighbors - .iter() - .for_each(|neighbor| scratch.best.insert(*neighbor)); - scratch.cmps += neighbors.len() as u32; - scratch.hops += expanded_ids.len() as u32; + // Phase 3: Block only when no progress was made but IOs are pending. + if expanded_ids.is_empty() && accessor.has_pending() { + accessor.wait_for_io(); } } @@ -2494,7 +2438,7 @@ where let stats = self .search_internal( - search_params, + search_params.beam_width, &start_ids, &mut accessor, &computer, @@ -2690,12 +2634,9 @@ where let start_ids = accessor.starting_points().await?; let mut scratch = self.search_scratch(search_params.starting_l_value, start_ids.len()); - - let range_default_params = - SearchParams::new(1, scratch.best.search_l(), search_params.beam_width)?; let initial_stats = self .search_internal( - &range_default_params, + search_params.beam_width, &start_ids, &mut accessor, &computer, @@ -3042,10 +2983,8 @@ where .into_ann_result()?; let start_ids = accessor.starting_points().await?; - let default_params = - SearchParams::new(1, search_state.scratch.best.search_l(), None)?; self.search_internal( - &default_params, + None, // beam_width &start_ids, &mut accessor, &search_state.extra.1, @@ -3793,7 +3732,7 @@ where let stats = self .search_internal( - search_params, + search_params.beam_width, &start_ids, &mut accessor, &computer, From a27d7190234746fb3372bf8bb7bf548d5064a481 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 19:03:56 -0800 Subject: [PATCH 42/46] Replace unused id_scratch with neighbors/submitted buffers in SearchScratch - Remove dead id_scratch field (was never read by any search code) - Add neighbors: Vec> buffer (reused across search hops) - Add submitted: HashSet for pipelined submission tracking - Update search_internal and range_search_internal to use scratch buffers instead of allocating per-call - Update diverse search scratch construction and tests --- diskann/src/graph/index.rs | 41 +++++++++++++------------ diskann/src/graph/search/scratch.rs | 47 ++++++++++++++--------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 8067f1324..6fa161182 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2091,13 +2091,14 @@ where } } - let mut neighbors = Vec::with_capacity(self.max_degree_with_slack()); - // Tracks speculatively submitted (but not yet visited/expanded) nodes - // so the pipelined path can decouple submission from visitation. - let mut submitted = std::collections::HashSet::::new(); + scratch.neighbors.clear(); + scratch.submitted.clear(); while (scratch.best.has_notvisited_node() - || scratch.best.peek_best_unsubmitted(&submitted).is_some() + || scratch + .best + .peek_best_unsubmitted(&scratch.submitted) + .is_some() || accessor.has_pending()) && !accessor.terminate_early() { @@ -2105,25 +2106,26 @@ where // Non-pipelined: synchronously expands beam_nodes from previous submit. // Pipelined: polls IO completions and expands one loaded node. // On the first iteration beam_nodes is empty — a no-op for both paths. 
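// (The ids serve only as a priority hint for the pipelined accessor: it
// expands the first of them whose read has completed, and otherwise falls
// back to the loaded node with the lowest submission rank.)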
- neighbors.clear(); + scratch.neighbors.clear(); let expanded_ids = accessor .expand_available( scratch.beam_nodes.iter().copied(), computer, glue::NotInMut::new(&mut scratch.visited), - |distance, id| neighbors.push(Neighbor::new(id, distance)), + |distance, id| scratch.neighbors.push(Neighbor::new(id, distance)), ) .await?; for &id in &expanded_ids { scratch.best.mark_visited_by_id(&id); - submitted.remove(&id); + scratch.submitted.remove(&id); } - neighbors + scratch + .neighbors .iter() .for_each(|neighbor| scratch.best.insert(*neighbor)); - scratch.cmps += neighbors.len() as u32; + scratch.cmps += scratch.neighbors.len() as u32; scratch.hops += expanded_ids.len() as u32; // Phase 2: Select and submit candidates to fill the pipeline. @@ -2132,9 +2134,11 @@ where scratch.beam_nodes.clear(); let slots = beam_width.saturating_sub(accessor.inflight_count()); while scratch.beam_nodes.len() < slots { - if let Some(closest_node) = scratch.best.peek_best_unsubmitted(&submitted) { + if let Some(closest_node) = + scratch.best.peek_best_unsubmitted(&scratch.submitted) + { search_record.record(closest_node, scratch.hops, scratch.cmps); - submitted.insert(closest_node.id); + scratch.submitted.insert(closest_node.id); scratch.beam_nodes.push(closest_node.id); } else { break; @@ -2176,8 +2180,6 @@ where scratch.range_frontier.push_back(neighbor.id); } - let mut neighbors = Vec::with_capacity(self.max_degree_with_slack()); - let max_returned = search_params.max_returned.unwrap_or(usize::MAX); while !scratch.range_frontier.is_empty() { @@ -2192,18 +2194,18 @@ where } } - neighbors.clear(); + scratch.neighbors.clear(); accessor .expand_beam( scratch.beam_nodes.iter().copied(), computer, glue::NotInMut::new(&mut scratch.visited), - |distance, id| neighbors.push(Neighbor::new(id, distance)), + |distance, id| scratch.neighbors.push(Neighbor::new(id, distance)), ) .await?; // The predicate ensure that the contents of `neighbors` are unique. - for neighbor in neighbors.iter() { + for neighbor in scratch.neighbors.iter() { if neighbor.distance <= search_params.radius * search_params.range_search_slack && scratch.in_range.len() < max_returned { @@ -2211,7 +2213,7 @@ where scratch.range_frontier.push_back(neighbor.id); } } - scratch.cmps += neighbors.len() as u32; + scratch.cmps += scratch.neighbors.len() as u32; scratch.hops += scratch.beam_nodes.len() as u32; } @@ -3663,7 +3665,8 @@ where SearchScratch { best: diverse_queue, visited: HashSet::with_capacity(self.estimate_visited_set_capacity(Some(l_value))), - id_scratch: Vec::with_capacity(self.max_degree_with_slack()), + neighbors: Vec::with_capacity(self.max_degree_with_slack()), + submitted: std::collections::HashSet::new(), beam_nodes: Vec::with_capacity(beam_width.unwrap_or(1)), range_frontier: std::collections::VecDeque::new(), in_range: Vec::new(), diff --git a/diskann/src/graph/search/scratch.rs b/diskann/src/graph/search/scratch.rs index 2a4706821..c6eda864d 100644 --- a/diskann/src/graph/search/scratch.rs +++ b/diskann/src/graph/search/scratch.rs @@ -14,18 +14,16 @@ use crate::{ utils::{VectorId, object_pool::AsPooled}, }; use hashbrown::HashSet; +use std::collections::HashSet as StdHashSet; /// In-mem index related limits pub const GRAPH_SLACK_FACTOR: f64 = 1.3_f64; /// Scratch space used during graph search. /// -/// This struct contains three important members used by both the sync and async indexes: -/// `query`, `best`, and `visited`. -/// -/// The member `id_scratch` is only used by the sync index. 
-/// -/// Members `labels` and `beta` are used by the async index for beta-filtered search. +/// This struct holds reusable buffers that are cleared between searches but retain their +/// heap allocations. The key members are `best` (priority queue), `visited` (dedup set), +/// and `neighbors`/`submitted` (per-hop buffers used by the search loop). #[derive(Debug)] pub struct SearchScratch> where @@ -34,26 +32,20 @@ where /// A priority queue of the best candidates seen during search. This data structure is /// also responsible for determining the best unvisited candidate. /// - /// Used by both sync and async. - /// /// When used in a paged search context, this queue is unbounded. pub best: Q, /// A record of all ids visited during a search. /// - /// Used by both sync and async. - /// /// This is used to prevent multiple requests to the same `id` from the vector providers. pub visited: HashSet, - /// A buffer for adjacency lists. - /// - /// Only used by sync. - /// - /// Adjacency lists in the sync provider are guarded by read/write locks. The - /// `id_scratch` is used to copy out the contents of an adjacency list to minimize the - /// duration the lock is held. - pub id_scratch: Vec, + /// A reusable buffer for collecting neighbor distances during expansion. + pub neighbors: Vec>, + + /// Tracks speculatively submitted (but not yet visited/expanded) nodes so the pipelined + /// path can decouple submission from visitation. Empty for non-pipelined search. + pub submitted: StdHashSet, /// A list of beam search nodes used during search. This is used when beam search is enabled /// to temporarily hold beam of nodes in each hop. @@ -123,7 +115,8 @@ where Self { best, visited, - id_scratch: Vec::new(), + neighbors: Vec::new(), + submitted: StdHashSet::new(), beam_nodes: Vec::new(), in_range: Vec::new(), range_frontier: VecDeque::new(), @@ -147,7 +140,8 @@ where pub fn clear(&mut self) { self.best.clear(); self.visited.clear(); - self.id_scratch.clear(); + self.neighbors.clear(); + self.submitted.clear(); self.beam_nodes.clear(); self.in_range.clear(); self.range_frontier.clear(); @@ -244,7 +238,8 @@ mod tests { assert_eq!(x.visited.capacity(), 0); assert!(x.visited.is_empty()); - assert!(x.id_scratch.is_empty()); + assert!(x.neighbors.is_empty()); + assert!(x.submitted.is_empty()); assert!(x.hops == 0); assert!(x.cmps == 0); @@ -262,7 +257,8 @@ mod tests { assert_eq!(x.visited.capacity(), 0); assert!(x.visited.is_empty()); - assert!(x.id_scratch.is_empty()); + assert!(x.neighbors.is_empty()); + assert!(x.submitted.is_empty()); assert!(x.hops == 0); assert!(x.cmps == 0); @@ -299,8 +295,8 @@ mod tests { x.visited.insert(1); x.visited.insert(10); - x.id_scratch.push(1); - x.id_scratch.push(10); + x.neighbors.push(Neighbor::new(1, 1.0)); + x.neighbors.push(Neighbor::new(10, 2.0)); x.best.insert(Neighbor::new(1, 1.0)); x.best.insert(Neighbor::new(10, 2.0)); @@ -309,7 +305,8 @@ mod tests { // Do the clear. 
x.clear(); assert!(x.visited.is_empty()); - assert!(x.id_scratch.is_empty()); + assert!(x.neighbors.is_empty()); + assert!(x.submitted.is_empty()); assert_eq!(x.best.size(), 0); assert!(x.hops == 0); From ec24282642f7853ab783aacff744e57a48b37b04 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 20:11:27 -0800 Subject: [PATCH 43/46] Tri-state NodeState queue: eliminate submitted HashSet, fix silent drop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the (I, bool) visited flag in NeighborPriorityQueue with a tri-state NodeState enum (Unvisited, Submitted, Visited). This: 1. Eliminates the submitted HashSet from SearchScratch — node state is tracked directly in the queue with no hashing overhead. 2. Fixes silent drop infinite loops — submit_expand now returns rejected IDs, and the search loop calls revert_submitted() to transition them back to Unvisited for retry on the next iteration. 3. Implements peek_best_unsubmitted/mark_submitted/revert_submitted/ mark_visited_by_id on DiverseNeighborQueue by delegating to the inner global_queue. 4. Updates all tests to use the new tri-state API (246 pass). SIFT1M benchmark shows no regression: BeamSearch: L=40 246 QPS, 86.4% recall PipeSearch: L=40 308 QPS, 87.7% recall (+25% QPS) --- .../src/search/provider/pipelined_accessor.rs | 16 +- diskann/src/graph/glue.rs | 18 +- diskann/src/graph/index.rs | 19 +- diskann/src/graph/search/scratch.rs | 10 - .../src/neighbor/diverse_priority_queue.rs | 40 +++ diskann/src/neighbor/mod.rs | 2 +- diskann/src/neighbor/queue.rs | 263 ++++++++++++------ 7 files changed, 246 insertions(+), 122 deletions(-) diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 317169b69..454d95224 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -557,9 +557,11 @@ where { /// Submit non-blocking io_uring reads for the given node IDs. /// Nodes found in the node cache are placed directly into `loaded_nodes`, - /// skipping disk IO entirely. - fn submit_expand(&mut self, ids: impl Iterator + Send) { + /// skipping disk IO entirely. Returns IDs that could not be submitted. + fn submit_expand(&mut self, ids: impl Iterator + Send) -> Vec { let io_start = Instant::now(); + let mut rejected = Vec::new(); + let mut hit_slot_limit = false; for id in ids { if self.scratch.loaded_nodes.contains_key(&id) { continue; // Already loaded from a previous IO @@ -585,8 +587,10 @@ where } // Don't submit if no free io_uring slots are available. - if self.scratch.free_slots.is_empty() { - break; + if hit_slot_limit || self.scratch.free_slots.is_empty() { + hit_slot_limit = true; + rejected.push(id); + continue; } let sector_idx = @@ -595,7 +599,7 @@ where let slot_id = self.scratch.free_slots.pop_front().unwrap(); let rank = self.next_rank; self.next_rank += 1; - // Best-effort: if submission fails, return the slot and retry later + // Best-effort: if submission fails, return the slot and reject the ID // SAFETY: slot_id was just popped from the free-list, guaranteeing // it is not currently in-flight. if unsafe { self.scratch.reader.submit_read(sector_offset, slot_id) }.is_ok() { @@ -607,9 +611,11 @@ where self.io_count += 1; } else { self.scratch.free_slots.push_back(slot_id); + rejected.push(id); } } self.io_time += io_start.elapsed(); + rejected } /// Poll for completed reads and expand the best loaded node. 
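A minimal, self-contained sketch of the submit/revert contract established above, using stand-in types rather than the crate's NeighborPriorityQueue or ExpandBeam API: candidates are marked Submitted, a capacity-limited backend rejects the overflow, and the rejected IDs are reverted to Unvisited so a later iteration retries them instead of dropping them silently.

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[allow(dead_code)]
enum NodeState {
    Unvisited,
    Submitted,
    Visited,
}

struct Candidate {
    id: u32,
    state: NodeState,
}

/// Mark up to `slots` unvisited candidates as Submitted; a backend that can
/// only accept `capacity` reads rejects the rest (cf. hit_slot_limit above).
fn submit_round(queue: &mut [Candidate], slots: usize, capacity: usize) -> Vec<u32> {
    let mut picked = Vec::new();
    for c in queue
        .iter_mut()
        .filter(|c| c.state == NodeState::Unvisited)
        .take(slots)
    {
        c.state = NodeState::Submitted;
        picked.push(c.id);
    }
    // Everything past the backend's capacity comes back as rejected.
    picked.split_off(capacity.min(picked.len()))
}

fn main() {
    let mut queue = vec![
        Candidate { id: 7, state: NodeState::Unvisited },
        Candidate { id: 9, state: NodeState::Unvisited },
    ];
    let rejected = submit_round(&mut queue, 2, 1);
    assert_eq!(rejected, vec![9]);
    // Revert rejected IDs so they are retried, not silently dropped.
    for id in rejected {
        if let Some(c) = queue.iter_mut().find(|c| c.id == id) {
            c.state = NodeState::Unvisited;
        }
    }
    assert_eq!(queue[0].state, NodeState::Submitted);
    assert_eq!(queue[1].state, NodeState::Unvisited);
}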
diff --git a/diskann/src/graph/glue.rs b/diskann/src/graph/glue.rs index 33863a79a..3aaf064cf 100644 --- a/diskann/src/graph/glue.rs +++ b/diskann/src/graph/glue.rs @@ -255,18 +255,14 @@ pub trait ExpandBeam: BuildQueryComputer + AsNeighbor + Sized where T: ?Sized, { - /// Submit IDs to the expansion queue. + /// Submit IDs for expansion. /// - /// For non-pipelined providers (default), IDs are stored in an internal buffer and - /// processed synchronously in [`expand_available`]. For pipelined providers, this - /// submits non-blocking IO requests (e.g., io_uring reads) so that data loading - /// overlaps with other computation. - /// - /// The default implementation delegates to [`expand_beam`] from within - /// [`expand_available`], so overriding this method is only necessary for pipelined - /// providers that need to separate submission from completion. - fn submit_expand(&mut self, _ids: impl Iterator + Send) { - // Default: no-op. IDs are passed directly to expand_beam in expand_available. + /// For non-pipelined providers (default), this is a no-op — IDs are passed + /// directly to [`expand_beam`] in [`expand_available`]. For pipelined providers, + /// this submits non-blocking IO requests. Any IDs that could not be submitted + /// (e.g., no free IO slots) are returned so the caller can revert their state. + fn submit_expand(&mut self, _ids: impl Iterator + Send) -> Vec { + Vec::new() // Default: all accepted } /// Expand nodes whose data is available, invoking `on_neighbors` for each discovered diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index 6fa161182..f7c0bc0e0 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2092,13 +2092,9 @@ where } scratch.neighbors.clear(); - scratch.submitted.clear(); while (scratch.best.has_notvisited_node() - || scratch - .best - .peek_best_unsubmitted(&scratch.submitted) - .is_some() + || scratch.best.peek_best_unsubmitted().is_some() || accessor.has_pending()) && !accessor.terminate_early() { @@ -2118,7 +2114,6 @@ where for &id in &expanded_ids { scratch.best.mark_visited_by_id(&id); - scratch.submitted.remove(&id); } scratch @@ -2134,17 +2129,18 @@ where scratch.beam_nodes.clear(); let slots = beam_width.saturating_sub(accessor.inflight_count()); while scratch.beam_nodes.len() < slots { - if let Some(closest_node) = - scratch.best.peek_best_unsubmitted(&scratch.submitted) - { + if let Some(closest_node) = scratch.best.peek_best_unsubmitted() { search_record.record(closest_node, scratch.hops, scratch.cmps); - scratch.submitted.insert(closest_node.id); + scratch.best.mark_submitted(&closest_node.id); scratch.beam_nodes.push(closest_node.id); } else { break; } } - accessor.submit_expand(scratch.beam_nodes.iter().copied()); + let rejected = accessor.submit_expand(scratch.beam_nodes.iter().copied()); + for id in rejected { + scratch.best.revert_submitted(&id); + } // Phase 3: Block only when no progress was made but IOs are pending. 
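// (Non-pipelined providers never wait here: per the ExpandBeam defaults
// above, they have no in-flight IO and their wait_for_io() is a no-op, so
// only the io_uring-backed path can block.)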
if expanded_ids.is_empty() && accessor.has_pending() { @@ -3666,7 +3662,6 @@ where best: diverse_queue, visited: HashSet::with_capacity(self.estimate_visited_set_capacity(Some(l_value))), neighbors: Vec::with_capacity(self.max_degree_with_slack()), - submitted: std::collections::HashSet::new(), beam_nodes: Vec::with_capacity(beam_width.unwrap_or(1)), range_frontier: std::collections::VecDeque::new(), in_range: Vec::new(), diff --git a/diskann/src/graph/search/scratch.rs b/diskann/src/graph/search/scratch.rs index c6eda864d..75ca5f54d 100644 --- a/diskann/src/graph/search/scratch.rs +++ b/diskann/src/graph/search/scratch.rs @@ -14,7 +14,6 @@ use crate::{ utils::{VectorId, object_pool::AsPooled}, }; use hashbrown::HashSet; -use std::collections::HashSet as StdHashSet; /// In-mem index related limits pub const GRAPH_SLACK_FACTOR: f64 = 1.3_f64; @@ -43,10 +42,6 @@ where /// A reusable buffer for collecting neighbor distances during expansion. pub neighbors: Vec>, - /// Tracks speculatively submitted (but not yet visited/expanded) nodes so the pipelined - /// path can decouple submission from visitation. Empty for non-pipelined search. - pub submitted: StdHashSet, - /// A list of beam search nodes used during search. This is used when beam search is enabled /// to temporarily hold beam of nodes in each hop. pub beam_nodes: Vec, @@ -116,7 +111,6 @@ where best, visited, neighbors: Vec::new(), - submitted: StdHashSet::new(), beam_nodes: Vec::new(), in_range: Vec::new(), range_frontier: VecDeque::new(), @@ -141,7 +135,6 @@ where self.best.clear(); self.visited.clear(); self.neighbors.clear(); - self.submitted.clear(); self.beam_nodes.clear(); self.in_range.clear(); self.range_frontier.clear(); @@ -239,7 +232,6 @@ mod tests { assert!(x.visited.is_empty()); assert!(x.neighbors.is_empty()); - assert!(x.submitted.is_empty()); assert!(x.hops == 0); assert!(x.cmps == 0); @@ -258,7 +250,6 @@ mod tests { assert!(x.visited.is_empty()); assert!(x.neighbors.is_empty()); - assert!(x.submitted.is_empty()); assert!(x.hops == 0); assert!(x.cmps == 0); @@ -306,7 +297,6 @@ mod tests { x.clear(); assert!(x.visited.is_empty()); assert!(x.neighbors.is_empty()); - assert!(x.submitted.is_empty()); assert_eq!(x.best.size(), 0); assert!(x.hops == 0); diff --git a/diskann/src/neighbor/diverse_priority_queue.rs b/diskann/src/neighbor/diverse_priority_queue.rs index b5373ad44..058906d46 100644 --- a/diskann/src/neighbor/diverse_priority_queue.rs +++ b/diskann/src/neighbor/diverse_priority_queue.rs @@ -260,6 +260,46 @@ where let sz = self.global_queue.search_l().min(self.global_queue.size()); BestCandidatesIterator::new(sz, self) } + + fn peek_best_unsubmitted(&self) -> Option> { + self.global_queue + .peek_best_unsubmitted() + .map(|n| Neighbor::new(n.id.id, n.distance)) + } + + fn mark_visited_by_id(&mut self, id: &P::Id) -> bool { + // Scan global_queue for the compound ID matching this bare id + let limit = self.global_queue.search_l().min(self.global_queue.size()); + for i in self.global_queue.cursor..limit { + let entry = self.global_queue.get(i); + if entry.id.id == *id { + return self.global_queue.mark_visited_by_id(&entry.id); + } + } + false + } + + fn mark_submitted(&mut self, id: &P::Id) -> bool { + let limit = self.global_queue.search_l().min(self.global_queue.size()); + for i in self.global_queue.cursor..limit { + let entry = self.global_queue.get(i); + if entry.id.id == *id { + return self.global_queue.mark_submitted(&entry.id); + } + } + false + } + + fn revert_submitted(&mut self, id: &P::Id) -> bool { + let 
limit = self.global_queue.search_l().min(self.global_queue.size()); + for i in self.global_queue.cursor..limit { + let entry = self.global_queue.get(i); + if entry.id.id == *id { + return self.global_queue.revert_submitted(&entry.id); + } + } + false + } } /// Trait for providing attribute values for vector IDs. diff --git a/diskann/src/neighbor/mod.rs b/diskann/src/neighbor/mod.rs index 29ee87981..a29b6bb4b 100644 --- a/diskann/src/neighbor/mod.rs +++ b/diskann/src/neighbor/mod.rs @@ -10,7 +10,7 @@ use crate::graph::{SearchOutputBuffer, search_output_buffer}; // Exports mod queue; -pub use queue::{NeighborPriorityQueue, NeighborPriorityQueueIdType, NeighborQueue}; +pub use queue::{NeighborPriorityQueue, NeighborPriorityQueueIdType, NeighborQueue, NodeState}; #[cfg(feature = "experimental_diversity_search")] mod diverse_priority_queue; diff --git a/diskann/src/neighbor/queue.rs b/diskann/src/neighbor/queue.rs index 3d0527c2c..aae6a7025 100644 --- a/diskann/src/neighbor/queue.rs +++ b/diskann/src/neighbor/queue.rs @@ -4,11 +4,24 @@ */ use diskann_wide::{SIMDMask, SIMDPartialOrd, SIMDVector}; -use std::collections::HashSet; use std::marker::PhantomData; use super::Neighbor; +/// Tri-state for nodes in the priority queue. +/// +/// - `Unvisited`: candidate not yet selected for expansion. +/// - `Submitted`: selected and submitted for IO (pipelined) or expansion, but not yet expanded. +/// - `Visited`: fully expanded — neighbors have been processed. +#[repr(u8)] +#[derive(Clone, Copy, PartialEq, Eq, Debug, Default)] +pub enum NodeState { + #[default] + Unvisited = 0, + Submitted = 1, + Visited = 2, +} + /// Shared trait for type the generic `I` parameter used by the /// `NeighborPeriorityQueue`. pub trait NeighborPriorityQueueIdType: @@ -69,9 +82,9 @@ pub trait NeighborQueue: std::fmt::Debug + Send /// Return an iterator over the best candidates. fn iter(&self) -> Self::Iter<'_>; - /// Return the first node (by distance order) that is not visited and not in `submitted`, - /// scanning positions 0..min(size, search_l). Does not modify any state. - fn peek_best_unsubmitted(&self, _submitted: &HashSet) -> Option> { + /// Return the first node that is `Unvisited` (not `Submitted` or `Visited`), + /// scanning from the cursor. Does not modify any state. + fn peek_best_unsubmitted(&self) -> Option> { None } @@ -80,6 +93,18 @@ pub trait NeighborQueue: std::fmt::Debug + Send fn mark_visited_by_id(&mut self, _id: &I) -> bool { false } + + /// Transition a node from `Unvisited` to `Submitted`. + /// Returns true if found and transitioned, false otherwise. + fn mark_submitted(&mut self, _id: &I) -> bool { + false + } + + /// Transition a node from `Submitted` back to `Unvisited` (for rejected submissions). + /// Returns true if found and reverted, false otherwise. + fn revert_submitted(&mut self, _id: &I) -> bool { + false + } } /// Neighbor priority Queue based on the distance to the query node @@ -99,9 +124,9 @@ pub struct NeighborPriorityQueue { /// The current notvisited neighbor whose distance is smallest among all notvisited neighbor cursor: usize, - /// The neighbor (id, visited) collection. + /// The neighbor (id, state) collection. /// These are stored together to make inserts cheaper. 
- id_visiteds: Vec<(I, bool)>, + id_states: Vec<(I, NodeState)>, /// The neighbor distance collection distances: Vec, @@ -122,7 +147,7 @@ impl NeighborPriorityQueue { size: 0, capacity: search_param_l, cursor: 0, - id_visiteds: Vec::with_capacity(search_param_l), + id_states: Vec::with_capacity(search_param_l), distances: Vec::with_capacity(search_param_l), auto_resizable: false, search_param_l, @@ -135,7 +160,7 @@ impl NeighborPriorityQueue { size: 0, capacity: search_param_l, cursor: 0, - id_visiteds: Vec::with_capacity(search_param_l), + id_states: Vec::with_capacity(search_param_l), distances: Vec::with_capacity(search_param_l), auto_resizable: true, search_param_l, @@ -169,17 +194,18 @@ impl NeighborPriorityQueue { }; if self.size == self.capacity { - self.id_visiteds.truncate(self.size - 1); + self.id_states.truncate(self.size - 1); self.distances.truncate(self.size - 1); self.size -= 1; } - self.id_visiteds.insert(insert_idx, (nbr.id, false)); + self.id_states + .insert(insert_idx, (nbr.id, NodeState::Unvisited)); self.distances.insert(insert_idx, nbr.distance); self.size += 1; - debug_assert!(self.size == self.id_visiteds.len()); + debug_assert!(self.size == self.id_states.len()); debug_assert!(self.size == self.distances.len()); if insert_idx < self.cursor { @@ -196,11 +222,11 @@ impl NeighborPriorityQueue { // Copy the first L best candidates to the result vector for (i, res) in result.iter_mut().enumerate().take(extract_size) { - *res = Neighbor::new(self.id_visiteds[i].0, self.distances[i]); + *res = Neighbor::new(self.id_states[i].0, self.distances[i]); } // Remove the first L best candidates from the priority queue - self.id_visiteds.drain(0..extract_size); + self.id_states.drain(0..extract_size); self.distances.drain(0..extract_size); // Update the size and cursor of the priority queue @@ -213,7 +239,7 @@ impl NeighborPriorityQueue { /// Drain candidates from the front, signaling that they have been consumed. 
pub fn drain_best(&mut self, count: usize) { let count = count.min(self.size); - self.id_visiteds.drain(0..count); + self.id_states.drain(0..count); self.distances.drain(0..count); self.size -= count; self.cursor = 0; @@ -245,7 +271,7 @@ impl NeighborPriorityQueue { // Check if we found the exact neighbor (both id and distance must match) if index < self.size && self.get_unchecked(index).id == nbr.id { // Remove the neighbor from both collections - self.id_visiteds.remove(index); + self.id_states.remove(index); self.distances.remove(index); self.size -= 1; @@ -254,7 +280,7 @@ impl NeighborPriorityQueue { self.cursor -= 1; } - debug_assert!(self.size == self.id_visiteds.len()); + debug_assert!(self.size == self.id_states.len()); debug_assert!(self.size == self.distances.len()); return true; @@ -322,7 +348,7 @@ impl NeighborPriorityQueue { /// Get the neighbor at index - SAFETY: index must be less than size fn get_unchecked(&self, index: usize) -> Neighbor { debug_assert!(index < self.size); - let id = unsafe { self.id_visiteds.get_unchecked(index).0 }; + let id = unsafe { self.id_states.get_unchecked(index).0 }; let distance = unsafe { *self.distances.get_unchecked(index) }; Neighbor::new(id, distance) } @@ -336,11 +362,11 @@ impl NeighborPriorityQueue { /// Get the closest and notvisited neighbor pub fn closest_notvisited(&mut self) -> Neighbor { let current = self.cursor; - self.set_visited(current, true); + self.set_state(current, NodeState::Visited); - // Look for the next notvisited neighbor + // Advance cursor past Visited nodes (stop at Submitted or Unvisited) self.cursor += 1; - while self.cursor < self.size && self.get_visited(self.cursor) { + while self.cursor < self.size && self.get_state(self.cursor) == NodeState::Visited { self.cursor += 1; } self.get_unchecked(current) @@ -373,14 +399,14 @@ impl NeighborPriorityQueue { pub fn reconfigure(&mut self, search_param_l: usize) { self.search_param_l = search_param_l; if search_param_l < self.size { - self.id_visiteds.truncate(search_param_l); + self.id_states.truncate(search_param_l); self.distances.truncate(search_param_l); self.size = search_param_l; self.cursor = self.cursor.min(search_param_l); } else if search_param_l > self.capacity { // Grow the backing store. let additional = search_param_l - self.size; - self.id_visiteds.reserve(additional); + self.id_states.reserve(additional); self.distances.reserve(additional); } self.capacity = search_param_l; @@ -394,7 +420,7 @@ impl NeighborPriorityQueue { /// /// Most of the time, you want `reconfigure`. fn reserve(&mut self, additional: usize) { - self.id_visiteds.reserve(additional); + self.id_states.reserve(additional); self.distances.reserve(additional); self.capacity += additional; } @@ -402,23 +428,21 @@ impl NeighborPriorityQueue { /// Set size (and cursor) to 0. This must be called to reset the queue when reusing /// between searched. 
pub fn clear(&mut self) { - self.id_visiteds.clear(); + self.id_states.clear(); self.distances.clear(); self.size = 0; self.cursor = 0; } - fn set_visited(&mut self, index: usize, flag: bool) { - // SAFETY: index must be less than size + fn set_state(&mut self, index: usize, state: NodeState) { assert!(index <= self.size); assert!(self.size <= self.capacity); - unsafe { self.id_visiteds.get_unchecked_mut(index) }.1 = flag; + unsafe { self.id_states.get_unchecked_mut(index) }.1 = state; } - fn get_visited(&self, index: usize) -> bool { - // SAFETY: index must be less than size + pub(crate) fn get_state(&self, index: usize) -> NodeState { assert!(index < self.size); - unsafe { self.id_visiteds.get_unchecked(index).1 } + unsafe { self.id_states.get_unchecked(index).1 } } /// Return whether or not the queue is auto resizeable (for paged search). @@ -435,7 +459,7 @@ impl NeighborPriorityQueue { fn dbgassert_unique_insert(&self, id: I) { for i in 0..self.size { debug_assert!( - self.id_visiteds[i].0 != id, + self.id_states[i].0 != id, "Neighbor with ID {} already exists in the priority queue", id ); @@ -476,11 +500,11 @@ impl NeighborPriorityQueue { // If this item should be kept, move it to write position if f(&neighbor) { if write_idx != read_idx { - self.id_visiteds[write_idx] = self.id_visiteds[read_idx]; + self.id_states[write_idx] = self.id_states[read_idx]; self.distances[write_idx] = self.distances[read_idx]; } - // Reset visited state since compaction invalidates previous state - self.id_visiteds[write_idx].1 = false; + // Reset state since compaction invalidates previous state + self.id_states[write_idx].1 = NodeState::Unvisited; write_idx += 1; } } @@ -500,36 +524,37 @@ impl NeighborPriorityQueue { pub fn truncate(&mut self, len: usize) { let new_size = len; if new_size < self.size { - self.id_visiteds.truncate(new_size); + self.id_states.truncate(new_size); self.distances.truncate(new_size); self.size = new_size; self.cursor = 0; } } - /// Return the first node that is not visited and not in `submitted`, - /// scanning positions 0..min(size, search_param_l). Does not modify any state. - pub fn peek_best_unsubmitted(&self, submitted: &HashSet) -> Option> { + /// Return the first `Unvisited` node, scanning from cursor. + /// Does not modify any state. + pub fn peek_best_unsubmitted(&self) -> Option> { let limit = self.search_param_l.min(self.size); for i in self.cursor..limit { - let (id, visited) = self.id_visiteds[i]; - if !visited && !submitted.contains(&id) { - return Some(Neighbor::new(id, self.distances[i])); + if self.id_states[i].1 == NodeState::Unvisited { + return Some(Neighbor::new(self.id_states[i].0, self.distances[i])); } } None } - /// Find the node with matching `id`, mark it visited, and advance the cursor if needed. + /// Find the node with matching `id`, mark it `Visited`, and advance the cursor if needed. /// Returns true if found and marked, false otherwise. 
pub fn mark_visited_by_id(&mut self, id: &I) -> bool { for i in self.cursor..self.size { - if self.id_visiteds[i].0 == *id { - self.id_visiteds[i].1 = true; - // If the cursor was pointing at this node, advance past visited nodes + if self.id_states[i].0 == *id { + self.id_states[i].1 = NodeState::Visited; + // If the cursor was pointing at this node, advance past Visited nodes if self.cursor == i { self.cursor += 1; - while self.cursor < self.size && self.get_visited(self.cursor) { + while self.cursor < self.size + && self.get_state(self.cursor) == NodeState::Visited + { self.cursor += 1; } } @@ -538,6 +563,33 @@ impl NeighborPriorityQueue { } false } + + /// Transition a node from `Unvisited` to `Submitted`. + /// Returns true if found and transitioned, false otherwise. + pub fn mark_submitted(&mut self, id: &I) -> bool { + let limit = self.search_param_l.min(self.size); + for i in self.cursor..limit { + if self.id_states[i].0 == *id && self.id_states[i].1 == NodeState::Unvisited { + self.id_states[i].1 = NodeState::Submitted; + return true; + } + } + false + } + + /// Transition a node from `Submitted` back to `Unvisited`. + /// Used when submit_expand rejects an ID (no free IO slots). + /// Returns true if found and reverted, false otherwise. + pub fn revert_submitted(&mut self, id: &I) -> bool { + for i in self.cursor..self.size { + if self.id_states[i].0 == *id && self.id_states[i].1 == NodeState::Submitted { + debug_assert!(i >= self.cursor); + self.id_states[i].1 = NodeState::Unvisited; + return true; + } + } + false + } } impl NeighborQueue for NeighborPriorityQueue { @@ -583,13 +635,21 @@ impl NeighborQueue for NeighborPriorityQueue< self.iter() } - fn peek_best_unsubmitted(&self, submitted: &HashSet) -> Option> { - self.peek_best_unsubmitted(submitted) + fn peek_best_unsubmitted(&self) -> Option> { + self.peek_best_unsubmitted() } fn mark_visited_by_id(&mut self, id: &I) -> bool { self.mark_visited_by_id(id) } + + fn mark_submitted(&mut self, id: &I) -> bool { + self.mark_submitted(id) + } + + fn revert_submitted(&mut self, id: &I) -> bool { + self.revert_submitted(id) + } } /// Enable the following syntax for iteration over the valid elements in the queue. @@ -753,23 +813,23 @@ mod neighbor_priority_queue_test { let mut queue = NeighborPriorityQueue::new(3); queue.insert(Neighbor::new(1, 1.0)); queue.insert(Neighbor::new(2, 0.5)); - assert!(!queue.get_visited(0)); + assert!(queue.get_state(0) != NodeState::Visited); queue.insert(Neighbor::new(3, 1.5)); // node id in queue should be [2,1,3] assert!(queue.has_notvisited_node()); let nbr = queue.closest_notvisited(); assert_eq!(nbr.id, 2); assert_eq!(nbr.distance, 0.5); - assert!(queue.get_visited(0)); // super unfortunate test. We know based on above id 2 should be 0th index + assert!(queue.get_state(0) == NodeState::Visited); // super unfortunate test. 
We know from the inserts above that id 2 sits at index 0
         assert!(queue.has_notvisited_node());
         let nbr = queue.closest_notvisited();
         assert_eq!(nbr.id, 1);
         assert_eq!(nbr.distance, 1.0);
-        assert!(queue.get_visited(1));
+        assert!(queue.get_state(1) == NodeState::Visited);
         assert!(queue.has_notvisited_node());
         let nbr = queue.closest_notvisited();
         assert_eq!(nbr.id, 3);
         assert_eq!(nbr.distance, 1.5);
-        assert!(queue.get_visited(2));
+        assert!(queue.get_state(2) == NodeState::Visited);
         assert!(!queue.has_notvisited_node());
     }
@@ -789,7 +849,7 @@
     fn test_reserve() {
         let mut queue = NeighborPriorityQueue::<u32>::new(5);
         queue.reconfigure(10);
-        assert_eq!(queue.id_visiteds.len(), 0);
+        assert_eq!(queue.id_states.len(), 0);
         assert_eq!(queue.distances.len(), 0);
         assert_eq!(queue.capacity, 10);
     }
@@ -799,7 +859,7 @@
         let mut queue = NeighborPriorityQueue::<u32>::new(10);
         queue.reconfigure(5);
         assert_eq!(queue.capacity, 5);
-        assert_eq!(queue.id_visiteds.len(), 0);
+        assert_eq!(queue.id_states.len(), 0);
         assert_eq!(queue.distances.len(), 0);

         queue.reconfigure(11);
@@ -813,7 +873,7 @@
         assert_eq!(resizable_queue.capacity(), 10);
         assert_eq!(resizable_queue.size(), 0);
         assert!(resizable_queue.auto_resizable);
-        assert_eq!(resizable_queue.id_visiteds.len(), 0);
+        assert_eq!(resizable_queue.id_states.len(), 0);
         assert_eq!(resizable_queue.distances.len(), 0);
     }

@@ -1497,8 +1557,7 @@
         queue.insert(Neighbor::new(3, 1.5));
         // Queue sorted: [2(0.5), 1(1.0), 3(1.5)]

-        let submitted = HashSet::new();
-        let result = queue.peek_best_unsubmitted(&submitted);
+        let result = queue.peek_best_unsubmitted();
         assert!(result.is_some());
         assert_eq!(result.unwrap().id, 2); // closest unvisited, unsubmitted
     }
@@ -1511,9 +1570,8 @@
         queue.insert(Neighbor::new(3, 1.5));
         // Queue sorted: [2(0.5), 1(1.0), 3(1.5)]

-        let mut submitted = HashSet::new();
-        submitted.insert(2u32);
-        let result = queue.peek_best_unsubmitted(&submitted);
+        queue.mark_submitted(&2);
+        let result = queue.peek_best_unsubmitted();
         assert!(result.is_some());
         assert_eq!(result.unwrap().id, 1); // 2 is submitted, so next is 1
     }
@@ -1528,8 +1586,7 @@
         queue.closest_notvisited(); // visits 2

-        let submitted = HashSet::new();
-        let result = queue.peek_best_unsubmitted(&submitted);
+        let result = queue.peek_best_unsubmitted();
         assert!(result.is_some());
         assert_eq!(result.unwrap().id, 1); // 2 is visited, so next is 1
     }
@@ -1540,10 +1597,9 @@
         queue.insert(Neighbor::new(1, 1.0));
         queue.insert(Neighbor::new(2, 0.5));

-        let mut submitted = HashSet::new();
-        submitted.insert(1u32);
-        submitted.insert(2u32);
-        let result = queue.peek_best_unsubmitted(&submitted);
+        queue.mark_submitted(&1);
+        queue.mark_submitted(&2);
+        let result = queue.peek_best_unsubmitted();
         assert!(result.is_none());
     }

@@ -1556,11 +1612,10 @@
         queue.insert(Neighbor::new(4, 2.0));
         // Queue sorted: [2(0.5), 1(1.0), 3(1.5), 4(2.0)], search_l=2

-        let mut submitted = HashSet::new();
-        submitted.insert(2u32);
-        submitted.insert(1u32);
+        queue.mark_submitted(&2);
+        queue.mark_submitted(&1);
         // Both nodes within search_l window are submitted
-        let result = queue.peek_best_unsubmitted(&submitted);
+        let result = queue.peek_best_unsubmitted();
         assert!(result.is_none());
     }

@@ -1570,9 +1625,8 @@
queue.insert(Neighbor::new(1, 1.0)); queue.insert(Neighbor::new(2, 0.5)); - let submitted = HashSet::new(); - let _ = queue.peek_best_unsubmitted(&submitted); - let _ = queue.peek_best_unsubmitted(&submitted); + let _ = queue.peek_best_unsubmitted(); + let _ = queue.peek_best_unsubmitted(); // Cursor should still be at 0 (no state modification) assert_eq!(queue.cursor, 0); @@ -1582,8 +1636,7 @@ mod neighbor_priority_queue_test { #[test] fn test_peek_best_unsubmitted_empty_queue() { let queue = NeighborPriorityQueue::::new(5); - let submitted = HashSet::new(); - assert!(queue.peek_best_unsubmitted(&submitted).is_none()); + assert!(queue.peek_best_unsubmitted().is_none()); } #[test] @@ -1595,7 +1648,7 @@ mod neighbor_priority_queue_test { // Queue sorted: [2(0.5), 1(1.0), 3(1.5)] assert!(queue.mark_visited_by_id(&1)); - assert!(queue.get_visited(1)); // id=1 is at index 1 + assert_eq!(queue.get_state(1), NodeState::Visited); // id=1 is at index 1 } #[test] @@ -1658,23 +1711,67 @@ mod neighbor_priority_queue_test { queue.insert(Neighbor::new(3, 1.5)); // Queue sorted: [2(0.5), 1(1.0), 3(1.5)] - let mut submitted = HashSet::new(); - // Peek - should return id=2 - let node = queue.peek_best_unsubmitted(&submitted).unwrap(); + let node = queue.peek_best_unsubmitted().unwrap(); assert_eq!(node.id, 2); - submitted.insert(node.id); + queue.mark_submitted(&node.id); // Peek again - should return id=1 (2 is submitted) - let node = queue.peek_best_unsubmitted(&submitted).unwrap(); + let node = queue.peek_best_unsubmitted().unwrap(); assert_eq!(node.id, 1); - submitted.insert(node.id); + queue.mark_submitted(&node.id); // Mark id=2 as visited (IO completed) assert!(queue.mark_visited_by_id(&2)); // Peek - should return id=3 (2 visited, 1 submitted) - let node = queue.peek_best_unsubmitted(&submitted).unwrap(); + let node = queue.peek_best_unsubmitted().unwrap(); assert_eq!(node.id, 3); } + + #[test] + fn test_mark_submitted_and_revert() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + // Queue sorted: [2(0.5), 1(1.0)] + + // Mark id=2 as submitted + assert!(queue.mark_submitted(&2)); + assert_eq!(queue.get_state(0), NodeState::Submitted); + + // peek should skip submitted + let node = queue.peek_best_unsubmitted().unwrap(); + assert_eq!(node.id, 1); + + // Revert id=2 back to unvisited + assert!(queue.revert_submitted(&2)); + assert_eq!(queue.get_state(0), NodeState::Unvisited); + + // Now peek should return id=2 again + let node = queue.peek_best_unsubmitted().unwrap(); + assert_eq!(node.id, 2); + } + + #[test] + fn test_cursor_stops_at_submitted() { + let mut queue = NeighborPriorityQueue::new(5); + queue.insert(Neighbor::new(1, 1.0)); + queue.insert(Neighbor::new(2, 0.5)); + queue.insert(Neighbor::new(3, 1.5)); + // Queue sorted: [2(0.5), 1(1.0), 3(1.5)], cursor=0 + + // Mark id=2 as submitted, then visited — cursor should advance past it + // but stop at id=1 (Unvisited) + queue.mark_submitted(&2); + queue.mark_visited_by_id(&2); + assert_eq!(queue.cursor, 1); + + // Mark id=1 as submitted — cursor should NOT advance (Submitted ≠ Visited) + queue.mark_submitted(&1); + assert_eq!(queue.cursor, 1); + + // has_notvisited_node still true (cursor < limit and id=1 is Submitted, not Visited) + assert!(queue.has_notvisited_node()); + } } From 3cd039c436d70b2752d21f2adecf58fc4c8d356d Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Wed, 11 Feb 2026 20:41:53 -0800 Subject: [PATCH 44/46] Review fixes: fuse peek+mark, propagate 
IO errors, eliminate allocations Address findings from multi-model code review: 1. Fuse peek_best_unsubmitted + mark_submitted into pop_best_unsubmitted (eliminates redundant double-scan in Phase 2 submit loop) 2. Propagate wait_for_io errors instead of silently dropping them (prevents infinite spin on io_uring failures) 3. Fix set_state assert off-by-one: index <= size -> index < size 4. Eliminate per-hop expanded_ids Vec allocation by passing &mut Vec output parameter through expand_available trait method 5. Eliminate double-scan in DiverseNeighborQueue tri-state methods by setting state directly at found index SIFT1M benchmark: +3-7% QPS improvement, identical recall. --- .../src/search/provider/pipelined_accessor.rs | 21 ++++++----- diskann/src/graph/glue.rs | 16 +++++---- diskann/src/graph/index.rs | 10 +++--- .../src/neighbor/diverse_priority_queue.rs | 36 +++++++++++++------ diskann/src/neighbor/queue.rs | 27 ++++++++++++-- 5 files changed, 75 insertions(+), 35 deletions(-) diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 454d95224..6d9013244 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -130,7 +130,6 @@ pub struct PipelinedScratch { // Per-query scratch collections, cleared between queries but retain capacity in_flight_ios: VecDeque, loaded_nodes: HashMap, - expanded_ids: Vec, distance_cache: HashMap, /// Reusable buffer for neighbor IDs during expand_available neighbor_buf: Vec, @@ -176,7 +175,6 @@ impl TryAsPooled for PipelinedScratch { pq_scratch, in_flight_ios: VecDeque::new(), loaded_nodes: HashMap::new(), - expanded_ids: Vec::new(), distance_cache: HashMap::new(), neighbor_buf: Vec::new(), node_pool: Vec::new(), @@ -191,7 +189,6 @@ impl TryAsPooled for PipelinedScratch { self.node_pool .extend(self.loaded_nodes.drain().map(|(_, node)| node)); self.in_flight_ios.clear(); - self.expanded_ids.clear(); self.distance_cache.clear(); self.neighbor_buf.clear(); self.free_slots.clear(); @@ -630,18 +627,19 @@ where _computer: &Self::QueryComputer, mut pred: P, mut on_neighbors: F, - ) -> ANNResult> + expanded_ids: &mut Vec, + ) -> ANNResult<()> where P: HybridPredicate + Send + Sync, F: FnMut(f32, Self::Id) + Send, { - self.scratch.expanded_ids.clear(); + expanded_ids.clear(); // Non-blocking poll for completions self.drain_completions()?; if self.scratch.loaded_nodes.is_empty() { - return Ok(Vec::new()); + return Ok(()); } // Try caller's priority order first @@ -665,10 +663,10 @@ where let vid = match best_vid { Some(id) => id, - None => return Ok(Vec::new()), + None => return Ok(()), }; let node = self.scratch.loaded_nodes.remove(&vid).unwrap(); - self.scratch.expanded_ids.push(vid); + expanded_ids.push(vid); // Compute full-precision distance and cache it for post-processing let cpu_start = Instant::now(); @@ -701,7 +699,7 @@ where // Return node to pool for reuse self.scratch.release_node(node); - Ok(std::mem::take(&mut self.scratch.expanded_ids)) + Ok(()) } /// Returns true when there are in-flight IO operations. 
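The rework above threads a caller-owned `expanded_ids` buffer through `expand_available` instead of returning a freshly allocated Vec on every hop. A minimal standalone sketch of the output-parameter pattern, with toy names (`poll_loaded`, `u32` ids) standing in for the crate's types:

    // Fill a reusable buffer instead of allocating per call.
    fn poll_loaded(loaded: &[u32], expanded_ids: &mut Vec<u32>) {
        expanded_ids.clear(); // length resets; capacity from earlier hops is kept
        expanded_ids.extend_from_slice(loaded);
    }

    fn main() {
        let mut expanded_ids = Vec::new(); // allocated once per query
        for hop in [[1u32, 2], [3, 4], [5, 6]] {
            poll_loaded(&hop, &mut expanded_ids);
            assert_eq!(expanded_ids.len(), 2); // same buffer reused each hop
        }
    }

After the first hop the buffer's capacity is warm, so the search loop stops paying one of the per-hop allocation costs this commit message calls out.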
@@ -713,11 +711,12 @@ where self.scratch.in_flight_ios.len() } - fn wait_for_io(&mut self) { + fn wait_for_io(&mut self) -> ANNResult<()> { // Only block if there are actually in-flight IOs to wait for if !self.scratch.in_flight_ios.is_empty() { - let _ = self.wait_and_drain(); + self.wait_and_drain()?; } + Ok(()) } } diff --git a/diskann/src/graph/glue.rs b/diskann/src/graph/glue.rs index 3aaf064cf..581cc9838 100644 --- a/diskann/src/graph/glue.rs +++ b/diskann/src/graph/glue.rs @@ -273,23 +273,25 @@ where /// completed IO operations and expands only the nodes whose data has arrived, /// returning immediately without blocking. /// - /// Returns the IDs of nodes that were actually expanded in this call. + /// The IDs of nodes actually expanded are written into `expanded_ids`. fn expand_available( &mut self, ids: impl Iterator + Send, computer: &Self::QueryComputer, pred: P, on_neighbors: F, - ) -> impl std::future::Future>> + Send + expanded_ids: &mut Vec, + ) -> impl std::future::Future> + Send where P: HybridPredicate + Send + Sync, F: FnMut(f32, Self::Id) + Send, { async move { - let id_vec: Vec = ids.collect(); - self.expand_beam(id_vec.iter().copied(), computer, pred, on_neighbors) + expanded_ids.clear(); + expanded_ids.extend(ids); + self.expand_beam(expanded_ids.iter().copied(), computer, pred, on_neighbors) .await?; - Ok(id_vec) + Ok(()) } } @@ -318,7 +320,9 @@ where /// spin-polling, while the eager drain ensures we process bursts efficiently. /// /// Default: no-op (non-pipelined providers never need to wait). - fn wait_for_io(&mut self) {} + fn wait_for_io(&mut self) -> ANNResult<()> { + Ok(()) + } /// Expand all `ids` synchronously: load data, get neighbors, compute distances. /// diff --git a/diskann/src/graph/index.rs b/diskann/src/graph/index.rs index f7c0bc0e0..ad99ecbce 100644 --- a/diskann/src/graph/index.rs +++ b/diskann/src/graph/index.rs @@ -2093,6 +2093,8 @@ where scratch.neighbors.clear(); + let mut expanded_ids = Vec::new(); + while (scratch.best.has_notvisited_node() || scratch.best.peek_best_unsubmitted().is_some() || accessor.has_pending()) @@ -2103,12 +2105,13 @@ where // Pipelined: polls IO completions and expands one loaded node. // On the first iteration beam_nodes is empty — a no-op for both paths. scratch.neighbors.clear(); - let expanded_ids = accessor + accessor .expand_available( scratch.beam_nodes.iter().copied(), computer, glue::NotInMut::new(&mut scratch.visited), |distance, id| scratch.neighbors.push(Neighbor::new(id, distance)), + &mut expanded_ids, ) .await?; @@ -2129,9 +2132,8 @@ where scratch.beam_nodes.clear(); let slots = beam_width.saturating_sub(accessor.inflight_count()); while scratch.beam_nodes.len() < slots { - if let Some(closest_node) = scratch.best.peek_best_unsubmitted() { + if let Some(closest_node) = scratch.best.pop_best_unsubmitted() { search_record.record(closest_node, scratch.hops, scratch.cmps); - scratch.best.mark_submitted(&closest_node.id); scratch.beam_nodes.push(closest_node.id); } else { break; @@ -2144,7 +2146,7 @@ where // Phase 3: Block only when no progress was made but IOs are pending. 
if expanded_ids.is_empty() && accessor.has_pending() { - accessor.wait_for_io(); + accessor.wait_for_io()?; } } diff --git a/diskann/src/neighbor/diverse_priority_queue.rs b/diskann/src/neighbor/diverse_priority_queue.rs index 058906d46..8d8609916 100644 --- a/diskann/src/neighbor/diverse_priority_queue.rs +++ b/diskann/src/neighbor/diverse_priority_queue.rs @@ -12,7 +12,7 @@ use std::{ }; use crate::neighbor::{ - Neighbor, + Neighbor, NodeState, queue::{ BestCandidatesIterator, NeighborPriorityQueue, NeighborPriorityQueueIdType, NeighborQueue, }, @@ -267,13 +267,27 @@ where .map(|n| Neighbor::new(n.id.id, n.distance)) } + fn pop_best_unsubmitted(&mut self) -> Option> { + self.global_queue + .pop_best_unsubmitted() + .map(|n| Neighbor::new(n.id.id, n.distance)) + } + fn mark_visited_by_id(&mut self, id: &P::Id) -> bool { - // Scan global_queue for the compound ID matching this bare id let limit = self.global_queue.search_l().min(self.global_queue.size()); for i in self.global_queue.cursor..limit { - let entry = self.global_queue.get(i); - if entry.id.id == *id { - return self.global_queue.mark_visited_by_id(&entry.id); + if self.global_queue.get(i).id.id == *id { + self.global_queue.set_state(i, NodeState::Visited); + // Advance cursor past consecutive Visited nodes + if i == self.global_queue.cursor { + while self.global_queue.cursor < limit + && self.global_queue.get_state(self.global_queue.cursor) + == NodeState::Visited + { + self.global_queue.cursor += 1; + } + } + return true; } } false @@ -282,9 +296,9 @@ where fn mark_submitted(&mut self, id: &P::Id) -> bool { let limit = self.global_queue.search_l().min(self.global_queue.size()); for i in self.global_queue.cursor..limit { - let entry = self.global_queue.get(i); - if entry.id.id == *id { - return self.global_queue.mark_submitted(&entry.id); + if self.global_queue.get(i).id.id == *id { + self.global_queue.set_state(i, NodeState::Submitted); + return true; } } false @@ -293,9 +307,9 @@ where fn revert_submitted(&mut self, id: &P::Id) -> bool { let limit = self.global_queue.search_l().min(self.global_queue.size()); for i in self.global_queue.cursor..limit { - let entry = self.global_queue.get(i); - if entry.id.id == *id { - return self.global_queue.revert_submitted(&entry.id); + if self.global_queue.get(i).id.id == *id { + self.global_queue.set_state(i, NodeState::Unvisited); + return true; } } false diff --git a/diskann/src/neighbor/queue.rs b/diskann/src/neighbor/queue.rs index aae6a7025..917300ccf 100644 --- a/diskann/src/neighbor/queue.rs +++ b/diskann/src/neighbor/queue.rs @@ -88,6 +88,11 @@ pub trait NeighborQueue: std::fmt::Debug + Send None } + /// Find the first `Unvisited` node, mark it `Submitted`, and return it — single pass. + fn pop_best_unsubmitted(&mut self) -> Option> { + None + } + /// Find the node with matching `id`, mark it visited, and advance the cursor if needed. /// Returns true if found and marked, false otherwise. fn mark_visited_by_id(&mut self, _id: &I) -> bool { @@ -122,7 +127,7 @@ pub struct NeighborPriorityQueue { capacity: usize, /// The current notvisited neighbor whose distance is smallest among all notvisited neighbor - cursor: usize, + pub(crate) cursor: usize, /// The neighbor (id, state) collection. /// These are stored together to make inserts cheaper. 
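With `pop_best_unsubmitted` in place, the Phase 2 submit loop does a single scan per pop instead of a peek followed by a mark. The tri-state lifecycle in isolation, as a toy model (this `Queue` is illustrative, not the crate's `NeighborPriorityQueue`):

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    enum NodeState { Unvisited, Submitted, Visited }

    struct Queue { ids: Vec<u32>, states: Vec<NodeState> }

    impl Queue {
        // One pass: find the first Unvisited entry, mark it Submitted, return it.
        fn pop_best_unsubmitted(&mut self) -> Option<u32> {
            for i in 0..self.ids.len() {
                if self.states[i] == NodeState::Unvisited {
                    self.states[i] = NodeState::Submitted;
                    return Some(self.ids[i]);
                }
            }
            None
        }
        // IO completed for this id: Submitted -> Visited.
        fn mark_visited(&mut self, id: u32) {
            if let Some(i) = self.ids.iter().position(|&x| x == id) {
                self.states[i] = NodeState::Visited;
            }
        }
    }

    fn main() {
        // ids already sorted by distance, as in the real queue.
        let mut q = Queue { ids: vec![2, 1, 3], states: vec![NodeState::Unvisited; 3] };
        let best = q.pop_best_unsubmitted().unwrap(); // 2: Unvisited -> Submitted
        q.mark_visited(best);                         // 2: Submitted -> Visited
        assert_eq!(q.pop_best_unsubmitted(), Some(1)); // next-best Unvisited
    }

`revert_submitted` is the inverse of the first transition; it only fires when `submit_expand` rejects an ID for lack of a free IO slot.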
@@ -434,8 +439,8 @@ impl NeighborPriorityQueue { self.cursor = 0; } - fn set_state(&mut self, index: usize, state: NodeState) { - assert!(index <= self.size); + pub(crate) fn set_state(&mut self, index: usize, state: NodeState) { + assert!(index < self.size); assert!(self.size <= self.capacity); unsafe { self.id_states.get_unchecked_mut(index) }.1 = state; } @@ -543,6 +548,18 @@ impl NeighborPriorityQueue { None } + /// Find the first `Unvisited` node, mark it `Submitted`, and return it — single pass. + pub fn pop_best_unsubmitted(&mut self) -> Option> { + let limit = self.search_param_l.min(self.size); + for i in self.cursor..limit { + if self.id_states[i].1 == NodeState::Unvisited { + self.id_states[i].1 = NodeState::Submitted; + return Some(Neighbor::new(self.id_states[i].0, self.distances[i])); + } + } + None + } + /// Find the node with matching `id`, mark it `Visited`, and advance the cursor if needed. /// Returns true if found and marked, false otherwise. pub fn mark_visited_by_id(&mut self, id: &I) -> bool { @@ -639,6 +656,10 @@ impl NeighborQueue for NeighborPriorityQueue< self.peek_best_unsubmitted() } + fn pop_best_unsubmitted(&mut self) -> Option> { + self.pop_best_unsubmitted() + } + fn mark_visited_by_id(&mut self, id: &I) -> bool { self.mark_visited_by_id(id) } From 664f0c1c9df7f9e1c41649803b93f779545da90a Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Thu, 12 Feb 2026 19:09:31 -0800 Subject: [PATCH 45/46] fix(pipelined_reader): resolve io_uring safety issues from PR #769 review - Replace AlignedBoxWithSlice with raw pointer backing to fix aliasing UB - Add SlotState machine (Free/InFlight/Completed) with internal free-list - Make get_slot_buf state-checked (panics on non-Completed slots) - Separate enqueue_read/flush to batch submissions and prevent SQE leaks - Add short-read detection in drain_cqes - Make drain_all retry on EINTR, abort on fatal errors - Replace Vec-returning drain with caller-provided buffer - Update PipelinedDiskAccessor to use new safe API - Add 12 comprehensive safety tests --- .../src/backend/disk_index/search.rs | 2 +- diskann-disk/src/search/mod.rs | 3 - diskann-disk/src/search/pipelined/mod.rs | 19 - .../src/search/pipelined/pipelined_reader.rs | 216 ----- .../src/search/provider/pipelined_accessor.rs | 85 +- diskann-disk/src/storage/mod.rs | 5 + diskann-disk/src/storage/pipelined_reader.rs | 781 ++++++++++++++++++ 7 files changed, 838 insertions(+), 273 deletions(-) delete mode 100644 diskann-disk/src/search/pipelined/mod.rs delete mode 100644 diskann-disk/src/search/pipelined/pipelined_reader.rs create mode 100644 diskann-disk/src/storage/pipelined_reader.rs diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 319f86cf6..d5aa1d5c2 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -12,7 +12,7 @@ use opentelemetry_sdk::trace::SdkTracerProvider; use diskann::{utils::VectorRepr, ANNResult}; use diskann_benchmark_runner::{files::InputFile, utils::MicroSeconds}; #[cfg(target_os = "linux")] -use diskann_disk::search::pipelined::PipelinedReaderConfig; +use diskann_disk::storage::PipelinedReaderConfig; #[cfg(target_os = "linux")] use diskann_disk::search::provider::pipelined_accessor::PipelinedConfig; use diskann_disk::{ diff --git a/diskann-disk/src/search/mod.rs b/diskann-disk/src/search/mod.rs index 2c475e10a..1f0d8f148 100644 --- a/diskann-disk/src/search/mod.rs +++ b/diskann-disk/src/search/mod.rs @@ -9,6 
+9,3 @@ pub mod provider; pub mod traits; pub(crate) mod sector_math; - -#[cfg(target_os = "linux")] -pub mod pipelined; diff --git a/diskann-disk/src/search/pipelined/mod.rs b/diskann-disk/src/search/pipelined/mod.rs deleted file mode 100644 index fd66d64d8..000000000 --- a/diskann-disk/src/search/pipelined/mod.rs +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -//! Pipelined IO reader for disk search using io_uring. -//! -//! Provides [`PipelinedReader`] for non-blocking sector reads with O_DIRECT, -//! used by [`PipelinedDiskAccessor`](super::provider::pipelined_accessor::PipelinedDiskAccessor) -//! to overlap IO and compute within a single query. - -#[cfg(target_os = "linux")] -mod pipelined_reader; -#[cfg(target_os = "linux")] -pub use pipelined_reader::PipelinedReader; -#[cfg(target_os = "linux")] -pub use pipelined_reader::PipelinedReaderConfig; -#[cfg(target_os = "linux")] -pub use pipelined_reader::MAX_IO_CONCURRENCY; diff --git a/diskann-disk/src/search/pipelined/pipelined_reader.rs b/diskann-disk/src/search/pipelined/pipelined_reader.rs deleted file mode 100644 index b12fe24fe..000000000 --- a/diskann-disk/src/search/pipelined/pipelined_reader.rs +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -//! Pipelined IO reader using io_uring with non-blocking submit/poll semantics. - -use std::{ - fs::OpenOptions, - os::{fd::AsRawFd, unix::fs::OpenOptionsExt}, -}; - -use diskann::{ANNError, ANNResult}; -use diskann_providers::common::AlignedBoxWithSlice; -use io_uring::IoUring; - -/// Maximum number of concurrent IO operations supported by the ring. -pub const MAX_IO_CONCURRENCY: usize = 128; - -/// Configuration for io_uring-based pipelined reader. -#[derive(Debug, Clone, Default)] -pub struct PipelinedReaderConfig { - /// Enable kernel-side SQ polling. If `Some(idle_ms)`, a kernel thread polls - /// the submission queue, eliminating the syscall per submit. After `idle_ms` - /// milliseconds of inactivity the kernel thread sleeps (resumed automatically - /// on next `submit()`). Requires Linux kernel >= 5.11 (>= 5.13 unprivileged). - pub sqpoll_idle_ms: Option, -} - -/// A pipelined IO reader that wraps `io_uring` for non-blocking submit/poll. -/// -/// Unlike `LinuxAlignedFileReader` which uses `submit_and_wait` (blocking), -/// this reader submits reads and polls completions independently, enabling -/// IO/compute overlap within a single search query. -pub struct PipelinedReader { - ring: IoUring, - /// Pre-allocated sector-aligned read buffers, one per slot. - slot_bufs: AlignedBoxWithSlice, - /// Size of each slot buffer in bytes. - slot_size: usize, - /// Maximum number of slots available. - max_slots: usize, - /// Number of currently in-flight (submitted but not completed) reads. - in_flight: usize, - /// Keep the file handle alive for the lifetime of the reader. - _file: std::fs::File, -} - -impl PipelinedReader { - /// Create a new pipelined reader. - /// - /// # Arguments - /// * `file_path` - Path to the disk index file. - /// * `max_slots` - Number of buffer slots (must be <= MAX_IO_CONCURRENCY). - /// * `slot_size` - Size of each buffer slot in bytes (should be sector-aligned). - /// * `alignment` - Memory alignment for the buffer (typically 4096 for O_DIRECT). 
- pub fn new( - file_path: &str, - max_slots: usize, - slot_size: usize, - alignment: usize, - config: &PipelinedReaderConfig, - ) -> ANNResult { - let file = OpenOptions::new() - .read(true) - .custom_flags(libc::O_DIRECT) - .open(file_path) - .map_err(ANNError::log_io_error)?; - - let entries = max_slots.min(MAX_IO_CONCURRENCY) as u32; - let ring = if config.sqpoll_idle_ms.is_some() { - let mut builder = IoUring::builder(); - if let Some(idle_ms) = config.sqpoll_idle_ms { - builder.setup_sqpoll(idle_ms); - } - builder.build(entries)? - } else { - IoUring::new(entries)? - }; - let fd = file.as_raw_fd(); - ring.submitter().register_files(std::slice::from_ref(&fd))?; - - let slot_bufs = AlignedBoxWithSlice::new(max_slots * slot_size, alignment)?; - - Ok(Self { - ring, - slot_bufs, - slot_size, - max_slots, - in_flight: 0, - _file: file, - }) - } - - /// Submit an asynchronous read into the buffer at `slot_id`. - /// - /// The read will fetch `slot_size` bytes from `sector_offset` (in bytes) into - /// the pre-allocated buffer for the given slot. The `slot_id` is stored as - /// `user_data` in the CQE for later retrieval. - /// Submit a read for the given sector offset into the specified buffer slot. - /// - /// # Safety - /// The caller must ensure `slot_id` is not currently in-flight (i.e., it has - /// been returned by a previous completion or was never submitted). Violating - /// this invariant allows the kernel to DMA into a buffer being read, causing - /// data corruption. When using a free-list for slot management (see - /// `PipelinedScratch::free_slots`), this invariant is structurally guaranteed. - pub unsafe fn submit_read(&mut self, sector_offset: u64, slot_id: usize) -> ANNResult<()> { - assert!(slot_id < self.max_slots, "slot_id out of range"); - - let buf_start = slot_id * self.slot_size; - let buf_ptr = self.slot_bufs[buf_start..buf_start + self.slot_size].as_mut_ptr(); - - let read_op = - io_uring::opcode::Read::new(io_uring::types::Fixed(0), buf_ptr, self.slot_size as u32) - .offset(sector_offset) - .build() - .user_data(slot_id as u64); - - // SAFETY: The buffer at slot_id is pre-allocated and will remain valid - // for the duration of the IO operation. Each slot is used exclusively - // (caller must not reuse a slot while it is in-flight). - unsafe { - self.ring - .submission() - .push(&read_op) - .map_err(ANNError::log_push_error)?; - } - - self.ring.submit()?; - self.in_flight += 1; - Ok(()) - } - - /// Poll for completed IO operations (non-blocking). - /// - /// Drains already-completed CQEs from the io_uring completion queue. - pub fn poll_completions(&mut self) -> ANNResult> { - self.drain_cqes() - } - - /// Block until at least one IO completes, then drain all available CQEs. - /// - /// Use this when [`poll_completions`] returned an empty vec but there are - /// in-flight reads — avoids busy-spinning while waiting for the kernel. - pub fn wait_completions(&mut self) -> ANNResult> { - if self.in_flight == 0 { - return Ok(Vec::new()); - } - self.ring.submit_and_wait(1)?; - self.drain_cqes() - } - - /// Drain all available CQEs from the completion queue. 
- fn drain_cqes(&mut self) -> ANNResult> { - let mut completed = Vec::new(); - for cqe in self.ring.completion() { - if cqe.result() < 0 { - self.in_flight = self.in_flight.saturating_sub(1); - return Err(ANNError::log_io_error(std::io::Error::from_raw_os_error( - -cqe.result(), - ))); - } - let slot_id = cqe.user_data() as usize; - completed.push(slot_id); - self.in_flight = self.in_flight.saturating_sub(1); - } - Ok(completed) - } - - /// Returns the read buffer for a completed slot. - pub fn get_slot_buf(&self, slot_id: usize) -> &[u8] { - let start = slot_id * self.slot_size; - &self.slot_bufs[start..start + self.slot_size] - } - - /// Reset the reader for reuse: drain all in-flight IOs, then clear state. - pub fn reset(&mut self) { - self.drain_all(); - } - - /// Returns the number of submitted but not yet completed reads. - pub fn in_flight_count(&self) -> usize { - self.in_flight - } - - /// Returns the slot size in bytes. - pub fn slot_size(&self) -> usize { - self.slot_size - } - - /// Returns the maximum number of buffer slots. - pub fn max_slots(&self) -> usize { - self.max_slots - } - - /// Drain all in-flight IOs, blocking until they complete. - /// Must be called before freeing the slot buffers. - fn drain_all(&mut self) { - if self.in_flight > 0 { - let _ = self.ring.submit_and_wait(self.in_flight); - for cqe in self.ring.completion() { - let _ = cqe; - } - self.in_flight = 0; - } - } -} - -impl Drop for PipelinedReader { - fn drop(&mut self) { - // Must wait for all in-flight kernel IOs to complete before freeing - // the slot buffers — otherwise the kernel may DMA into freed memory. - self.drain_all(); - } -} diff --git a/diskann-disk/src/search/provider/pipelined_accessor.rs b/diskann-disk/src/search/provider/pipelined_accessor.rs index 6d9013244..d875f5b3f 100644 --- a/diskann-disk/src/search/provider/pipelined_accessor.rs +++ b/diskann-disk/src/search/provider/pipelined_accessor.rs @@ -7,7 +7,6 @@ //! via the `ExpandBeam` trait's `submit_expand` / `expand_available` / `has_pending` methods. //! //! Plugs into `DiskANNIndex::search_internal()` and overlaps IO with computation -//! plugs into `DiskANNIndex::search_internal()` and overlaps IO with computation //! using io_uring under the hood. use std::collections::{HashMap, VecDeque}; @@ -38,7 +37,7 @@ use diskann_providers::model::{ use diskann_vector::DistanceFunction; use crate::data_model::Cache; -use crate::search::pipelined::{PipelinedReader, PipelinedReaderConfig}; +use crate::storage::{PipelinedReader, PipelinedReaderConfig}; use crate::search::sector_math::{node_offset_in_sector, node_sector_index}; use crate::search::traits::VertexProviderFactory; @@ -135,8 +134,8 @@ pub struct PipelinedScratch { neighbor_buf: Vec, /// Freelist of LoadedNode instances to avoid per-node allocation node_pool: Vec, - /// Free io_uring buffer slot IDs available for new submissions. - free_slots: VecDeque, + /// Reusable buffer for completed slot IDs from poll/wait. + completed_buf: Vec, } /// Arguments for creating or resetting a [`PipelinedScratch`]. 
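For reference, the slot lifecycle under the reader API this patch introduces looks like the sketch below. It is a hedged usage example, not part of the patch: "index.bin" and the 4 KiB slot geometry are placeholders, the function name is illustrative, and it only runs on Linux against a real index file (io_uring plus O_DIRECT).

    use diskann::ANNResult;
    use diskann_disk::storage::{PipelinedReader, PipelinedReaderConfig};

    fn read_first_sectors() -> ANNResult<()> {
        let cfg = PipelinedReaderConfig::default();
        let mut reader = PipelinedReader::new("index.bin", 8, 4096, 4096, &cfg)?;

        // Batch SQEs, then submit them all with one syscall.
        reader.enqueue_read(0)?;
        reader.enqueue_read(4096)?;
        reader.flush()?;

        // Reuse one Vec across polls, mirroring PipelinedScratch::completed_buf.
        let mut done = Vec::new();
        while reader.in_flight_count() > 0 {
            reader.wait_completions(&mut done)?;
            for &slot in &done {
                let sector: &[u8] = reader.get_slot_buf(slot); // Completed slots only
                debug_assert_eq!(sector.len(), reader.slot_size());
                reader.release_slot(slot); // Completed -> Free, slot reusable
            }
        }
        Ok(())
    }

Note that no `unsafe` appears at the call site; the state machine inside the reader is what makes `get_slot_buf` sound.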
@@ -178,12 +177,11 @@ impl TryAsPooled for PipelinedScratch { distance_cache: HashMap::new(), neighbor_buf: Vec::new(), node_pool: Vec::new(), - free_slots: (0..args.max_slots).collect(), + completed_buf: Vec::new(), }) } fn try_modify(&mut self, _args: PipelinedScratchArgs) -> Result<(), Self::Error> { - let max_slots = self.reader.max_slots(); self.reader.reset(); // Return all loaded_nodes back to the pool before clearing self.node_pool @@ -191,8 +189,7 @@ impl TryAsPooled for PipelinedScratch { self.in_flight_ios.clear(); self.distance_cache.clear(); self.neighbor_buf.clear(); - self.free_slots.clear(); - self.free_slots.extend(0..max_slots); + self.completed_buf.clear(); Ok(()) } } @@ -355,16 +352,21 @@ where } let io_start = Instant::now(); - let completed_slots = self.scratch.reader.poll_completions()?; + // Split borrows: reader and completed_buf are separate fields. + let PipelinedScratch { + reader, + completed_buf, + .. + } = &mut *self.scratch; + reader.poll_completions(completed_buf)?; self.io_time += io_start.elapsed(); - if completed_slots.is_empty() { + if completed_buf.is_empty() { return Ok(()); } Self::process_completed_ios_inner( &mut self.scratch, - &completed_slots, self.num_nodes_per_sector, self.node_len, self.fp_vector_len, @@ -373,16 +375,20 @@ where /// Block until at least one IO completes, then eagerly drain all available. fn wait_and_drain(&mut self) -> ANNResult<()> { let io_start = Instant::now(); - let completed_slots = self.scratch.reader.wait_completions()?; + let PipelinedScratch { + reader, + completed_buf, + .. + } = &mut *self.scratch; + reader.wait_completions(completed_buf)?; self.io_time += io_start.elapsed(); - if completed_slots.is_empty() { + if completed_buf.is_empty() { return Ok(()); } Self::process_completed_ios_inner( &mut self.scratch, - &completed_slots, self.num_nodes_per_sector, self.node_len, self.fp_vector_len, @@ -390,12 +396,11 @@ where } /// Shared logic: process completed slot IDs, parse nodes, retain in-flight. - /// Uses linear scan on completed_slots (small, bounded by max_slots) to + /// Uses linear scan on completed_buf (small, bounded by max_slots) to /// avoid per-poll HashSet allocation. Reuses LoadedNode instances from the /// node pool to avoid per-IO Vec allocations. fn process_completed_ios_inner( scratch: &mut PipelinedScratch, - completed_slots: &[usize], num_nodes_per_sector: u64, node_len: u64, fp_vector_len: u64, @@ -403,10 +408,8 @@ where let mut i = 0; while i < scratch.in_flight_ios.len() { let io = &scratch.in_flight_ios[i]; - if completed_slots.contains(&io.slot_id) { + if scratch.completed_buf.contains(&io.slot_id) { let io = scratch.in_flight_ios.swap_remove_back(i).unwrap(); - // Return the slot to the free-list so it can be reused. - scratch.free_slots.push_back(io.slot_id); // Acquire node first (mutably borrows node_pool), // then get sector buf (immutably borrows reader) — no conflict. let mut node = scratch.node_pool.pop().unwrap_or_else(|| LoadedNode { @@ -423,6 +426,9 @@ where fp_vector_len, io.rank, )?; + // Release the slot back to the reader's free-list now that + // we've copied the data out. 
+ scratch.reader.release_slot(io.slot_id); scratch.loaded_nodes.insert(io.vertex_id, node); } else { i += 1; @@ -559,6 +565,7 @@ where let io_start = Instant::now(); let mut rejected = Vec::new(); let mut hit_slot_limit = false; + let mut enqueued = 0u32; for id in ids { if self.scratch.loaded_nodes.contains_key(&id) { continue; // Already loaded from a previous IO @@ -584,7 +591,7 @@ where } // Don't submit if no free io_uring slots are available. - if hit_slot_limit || self.scratch.free_slots.is_empty() { + if hit_slot_limit || !self.scratch.reader.has_free_slot() { hit_slot_limit = true; rejected.push(id); continue; @@ -593,22 +600,32 @@ where let sector_idx = node_sector_index(id, self.num_nodes_per_sector, self.num_sectors_per_node); let sector_offset = sector_idx * self.block_size as u64; - let slot_id = self.scratch.free_slots.pop_front().unwrap(); let rank = self.next_rank; self.next_rank += 1; - // Best-effort: if submission fails, return the slot and reject the ID - // SAFETY: slot_id was just popped from the free-list, guaranteeing - // it is not currently in-flight. - if unsafe { self.scratch.reader.submit_read(sector_offset, slot_id) }.is_ok() { - self.scratch.in_flight_ios.push_back(InFlightIo { - vertex_id: id, - slot_id, - rank, - }); - self.io_count += 1; - } else { - self.scratch.free_slots.push_back(slot_id); - rejected.push(id); + // enqueue_read allocates a slot internally and pushes the SQE. + // On failure the slot stays free inside the reader. + match self.scratch.reader.enqueue_read(sector_offset) { + Ok(slot_id) => { + self.scratch.in_flight_ios.push_back(InFlightIo { + vertex_id: id, + slot_id, + rank, + }); + self.io_count += 1; + enqueued += 1; + } + Err(_) => { + rejected.push(id); + } + } + } + // Flush all enqueued SQEs in a single syscall. + if enqueued > 0 { + if let Err(e) = self.scratch.reader.flush() { + // Slots remain InFlight; they'll be drained on drop/reset. + self.io_time += io_start.elapsed(); + tracing::warn!("PipelinedReader::flush failed: {e}"); + return rejected; } } self.io_time += io_start.elapsed(); diff --git a/diskann-disk/src/storage/mod.rs b/diskann-disk/src/storage/mod.rs index 410e39a0a..0e03d6875 100644 --- a/diskann-disk/src/storage/mod.rs +++ b/diskann-disk/src/storage/mod.rs @@ -21,4 +21,9 @@ pub use cached_writer::CachedWriter; pub mod quant; +#[cfg(target_os = "linux")] +pub(crate) mod pipelined_reader; +#[cfg(target_os = "linux")] +pub use pipelined_reader::{PipelinedReader, PipelinedReaderConfig, MAX_IO_CONCURRENCY}; + pub mod api; diff --git a/diskann-disk/src/storage/pipelined_reader.rs b/diskann-disk/src/storage/pipelined_reader.rs new file mode 100644 index 000000000..c502bfb6a --- /dev/null +++ b/diskann-disk/src/storage/pipelined_reader.rs @@ -0,0 +1,781 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Pipelined IO reader using io_uring with non-blocking submit/poll semantics. +//! +//! # Safety model +//! +//! The kernel writes to slot buffers via DMA, which is invisible to the Rust +//! compiler. To avoid aliasing UB we **never** form `&[u8]` or `&mut [u8]` +//! references to the backing allocation while any IO is in-flight. Instead we: +//! +//! 1. Obtain the base raw pointer (`*mut u8`) **once** at construction — before +//! any IO is submitted — and store it for later use. +//! 2. Pass raw pointers to io_uring for kernel DMA targets. +//! 3. Only materialise `&[u8]` slices via [`std::slice::from_raw_parts`] for +//! 
slots whose state is [`SlotState::Completed`] (kernel has finished writing). +//! +//! Slot lifecycle: `Free → InFlight → Completed → Free`. +//! +//! [`PipelinedReader`] owns the free-list and state machine so callers never +//! need `unsafe` for normal operation. + +use std::{ + collections::VecDeque, + fs::OpenOptions, + os::{fd::AsRawFd, unix::fs::OpenOptionsExt}, +}; + +use diskann::{ANNError, ANNResult}; +use diskann_providers::common::AlignedBoxWithSlice; +use io_uring::IoUring; + +/// Maximum number of concurrent IO operations supported by the ring. +pub const MAX_IO_CONCURRENCY: usize = 128; + +/// Configuration for io_uring-based pipelined reader. +#[derive(Debug, Clone, Default)] +pub struct PipelinedReaderConfig { + /// Enable kernel-side SQ polling. If `Some(idle_ms)`, a kernel thread polls + /// the submission queue, eliminating the syscall per submit. After `idle_ms` + /// milliseconds of inactivity the kernel thread sleeps (resumed automatically + /// on next `submit()`). Requires Linux kernel >= 5.11 (>= 5.13 unprivileged). + pub sqpoll_idle_ms: Option, +} + +/// State of each buffer slot in the pool. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum SlotState { + /// Slot is available for a new IO submission. + Free, + /// SQE has been pushed (and possibly submitted). Kernel may be DMA-ing. + InFlight, + /// CQE has been reaped — data is ready. Safe to create `&[u8]`. + Completed, +} + +/// A pipelined IO reader that wraps `io_uring` for non-blocking submit/poll. +/// +/// Unlike `LinuxAlignedFileReader` which uses `submit_and_wait` (blocking), +/// this reader submits reads and polls completions independently, enabling +/// IO/compute overlap within a single search query. +/// +/// The reader owns both the ring buffer allocation and the slot state machine. +/// Callers interact through a safe API: +/// +/// 1. [`enqueue_read`](Self::enqueue_read) — push an SQE, returns `slot_id`. +/// 2. [`flush`](Self::flush) — submit all enqueued SQEs to the kernel (one syscall). +/// 3. [`poll_completions`](Self::poll_completions) / +/// [`wait_completions`](Self::wait_completions) — drain CQEs. +/// 4. [`get_slot_buf`](Self::get_slot_buf) — borrow data for a `Completed` slot. +/// 5. [`release_slot`](Self::release_slot) — return a `Completed` slot to `Free`. +pub struct PipelinedReader { + ring: IoUring, + /// Owns the aligned allocation. **Must not be dereferenced** while any IO is + /// in-flight — see the module-level safety discussion. + _slot_bufs: AlignedBoxWithSlice, + /// Raw pointer to the start of the buffer, obtained once at construction. + /// All subsequent slot access goes through pointer arithmetic on this base. + buf_base: *mut u8, + /// Size of each slot buffer in bytes. + slot_size: usize, + /// Maximum number of slots available. + max_slots: usize, + /// Per-slot state. + slot_states: Vec, + /// FIFO free-list for O(1) slot allocation. + free_slots: VecDeque, + /// Number of slots whose SQEs have been submitted to the kernel (InFlight). + in_flight: usize, + /// Keep the file handle alive for the lifetime of the reader. + _file: std::fs::File, +} + +// SAFETY: The raw pointer `buf_base` is derived from an owned allocation +// (`_slot_bufs`) and is never shared — all mutable access requires `&mut self`. +// The io_uring ring and file descriptor are kernel-side resources with no +// thread-affinity. Moving the reader between threads is safe. 
+unsafe impl Send for PipelinedReader {} +// SAFETY: `&self` methods only access completed slot data (kernel has finished +// writing). All mutation requires `&mut self`. +unsafe impl Sync for PipelinedReader {} + +impl PipelinedReader { + /// Create a new pipelined reader. + /// + /// # Arguments + /// * `file_path` - Path to the disk index file. + /// * `max_slots` - Number of buffer slots (clamped to [`MAX_IO_CONCURRENCY`]). + /// * `slot_size` - Size of each buffer slot in bytes (should be sector-aligned). + /// * `alignment` - Memory alignment for the buffer (typically 4096 for O_DIRECT). + /// * `config` - Optional io_uring tuning (e.g. SQPOLL). + pub fn new( + file_path: &str, + max_slots: usize, + slot_size: usize, + alignment: usize, + config: &PipelinedReaderConfig, + ) -> ANNResult { + let file = OpenOptions::new() + .read(true) + .custom_flags(libc::O_DIRECT) + .open(file_path) + .map_err(ANNError::log_io_error)?; + + let max_slots = max_slots.min(MAX_IO_CONCURRENCY); + let entries = max_slots as u32; + let ring = if let Some(idle_ms) = config.sqpoll_idle_ms { + let mut builder = IoUring::builder(); + builder.setup_sqpoll(idle_ms); + builder.build(entries)? + } else { + IoUring::new(entries)? + }; + let fd = file.as_raw_fd(); + ring.submitter().register_files(std::slice::from_ref(&fd))?; + + let mut slot_bufs = AlignedBoxWithSlice::new(max_slots * slot_size, alignment)?; + + // SAFETY: No IOs are in-flight yet, so creating a `&mut [u8]` is sound. + // We extract the raw pointer here and never form a reference again. + let buf_base: *mut u8 = slot_bufs.as_mut_slice().as_mut_ptr(); + + Ok(Self { + ring, + _slot_bufs: slot_bufs, + buf_base, + slot_size, + max_slots, + slot_states: vec![SlotState::Free; max_slots], + free_slots: (0..max_slots).collect(), + in_flight: 0, + _file: file, + }) + } + + // ------------------------------------------------------------------ + // Submission + // ------------------------------------------------------------------ + + /// Enqueue an asynchronous read for `sector_offset` into a newly-allocated + /// buffer slot. Returns the `slot_id` on success. + /// + /// The SQE is pushed to the submission queue but **not submitted** to the + /// kernel. Call [`flush`](Self::flush) after enqueuing a batch to submit + /// them all in a single syscall. + /// + /// Returns an error if no free slots are available. + pub fn enqueue_read(&mut self, sector_offset: u64) -> ANNResult { + let slot_id = self.free_slots.pop_front().ok_or_else(|| { + ANNError::log_index_error(format_args!( + "PipelinedReader: no free slots (max_slots={})", + self.max_slots + )) + })?; + debug_assert_eq!(self.slot_states[slot_id], SlotState::Free); + + // Raw pointer arithmetic — no reference to the backing buffer. + let buf_ptr = unsafe { self.buf_base.add(slot_id * self.slot_size) }; + + let read_op = + io_uring::opcode::Read::new(io_uring::types::Fixed(0), buf_ptr, self.slot_size as u32) + .offset(sector_offset) + .build() + .user_data(slot_id as u64); + + // SAFETY: `buf_ptr` points into a pre-allocated, aligned region that + // outlives the reader. The slot is being transitioned to InFlight so no + // other code will access this memory region. + let push_result = unsafe { self.ring.submission().push(&read_op) }; + if let Err(e) = push_result { + // SQE queue full — return slot to free-list. 
+            self.free_slots.push_back(slot_id);
+            return Err(ANNError::log_push_error(e));
+        }
+
+        self.slot_states[slot_id] = SlotState::InFlight;
+        self.in_flight += 1;
+        Ok(slot_id)
+    }
+
+    /// Submit all enqueued SQEs to the kernel in a single syscall.
+    ///
+    /// Retries automatically on `EINTR`. On fatal errors the enqueued slots
+    /// remain `InFlight` and will be drained on [`Drop`].
+    pub fn flush(&mut self) -> ANNResult<()> {
+        loop {
+            match self.ring.submit() {
+                Ok(_) => return Ok(()),
+                Err(ref e) if e.raw_os_error() == Some(libc::EINTR) => continue,
+                Err(e) => return Err(ANNError::log_io_error(e)),
+            }
+        }
+    }
+
+    // ------------------------------------------------------------------
+    // Completion
+    // ------------------------------------------------------------------
+
+    /// Poll for completed IO operations (non-blocking).
+    ///
+    /// Fills `completed` with the newly completed `slot_id`s (any previous
+    /// contents are cleared). Slots transition from `InFlight` → `Completed`.
+    /// The caller must eventually call [`release_slot`](Self::release_slot)
+    /// for each returned slot.
+    ///
+    /// On IO errors or short reads the affected slot is freed automatically and
+    /// an error is returned. Successfully completed slots in `completed` are
+    /// still valid and should be processed first.
+    pub fn poll_completions(&mut self, completed: &mut Vec<usize>) -> ANNResult<()> {
+        self.drain_cqes(completed)
+    }
+
+    /// Block until at least one IO completes, then drain all available CQEs.
+    ///
+    /// Same contract as [`poll_completions`](Self::poll_completions).
+    pub fn wait_completions(&mut self, completed: &mut Vec<usize>) -> ANNResult<()> {
+        if self.in_flight == 0 {
+            completed.clear();
+            return Ok(());
+        }
+        // submit_and_wait also flushes any un-submitted SQEs.
+        loop {
+            match self.ring.submit_and_wait(1) {
+                Ok(_) => break,
+                Err(ref e) if e.raw_os_error() == Some(libc::EINTR) => continue,
+                Err(e) => return Err(ANNError::log_io_error(e)),
+            }
+        }
+        self.drain_cqes(completed)
+    }
+
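+    // A typical single-query overlap loop, as an illustrative sketch (not part
+    // of this patch; `expand_frontier` is a hypothetical compute step):
+    //
+    //     let mut done = Vec::new();
+    //     reader.poll_completions(&mut done)?;   // never blocks
+    //     if done.is_empty() && reader.in_flight_count() > 0 {
+    //         expand_frontier();                 // compute while IO runs
+    //     }
+    //     for &s in &done { /* get_slot_buf(s), then release_slot(s) */ }
+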
+    /// Drain all available CQEs from the completion queue.
+    ///
+    /// Processes every available CQE. On error or short read the affected slot
+    /// is returned to `Free` and the first error is propagated after all CQEs
+    /// have been consumed (so no CQEs are left unprocessed).
+    fn drain_cqes(&mut self, completed: &mut Vec<usize>) -> ANNResult<()> {
+        completed.clear();
+        let mut first_error: Option<ANNError> = None;
+
+        for cqe in self.ring.completion() {
+            let slot_id = cqe.user_data() as usize;
+            debug_assert!(slot_id < self.max_slots);
+            debug_assert_eq!(self.slot_states[slot_id], SlotState::InFlight);
+            self.in_flight -= 1;
+
+            if cqe.result() < 0 {
+                self.slot_states[slot_id] = SlotState::Free;
+                self.free_slots.push_back(slot_id);
+                if first_error.is_none() {
+                    first_error = Some(ANNError::log_io_error(
+                        std::io::Error::from_raw_os_error(-cqe.result()),
+                    ));
+                }
+                continue;
+            }
+
+            let bytes_read = cqe.result() as usize;
+            if bytes_read < self.slot_size {
+                self.slot_states[slot_id] = SlotState::Free;
+                self.free_slots.push_back(slot_id);
+                if first_error.is_none() {
+                    first_error = Some(ANNError::log_io_error(std::io::Error::new(
+                        std::io::ErrorKind::UnexpectedEof,
+                        format!(
+                            "short read: expected {} bytes, got {}",
+                            self.slot_size, bytes_read
+                        ),
+                    )));
+                }
+                continue;
+            }
+
+            self.slot_states[slot_id] = SlotState::Completed;
+            completed.push(slot_id);
+        }
+
+        match first_error {
+            Some(e) => Err(e),
+            None => Ok(()),
+        }
+    }
+
+    // ------------------------------------------------------------------
+    // Slot access
+    // ------------------------------------------------------------------
+
+    /// Returns the read buffer for a completed slot.
+    ///
+    /// # Panics
+    /// Panics if `slot_id` is out of range or the slot is not in `Completed`
+    /// state (i.e. data is not yet ready or has already been released).
+    pub fn get_slot_buf(&self, slot_id: usize) -> &[u8] {
+        assert!(slot_id < self.max_slots, "slot_id out of range");
+        assert_eq!(
+            self.slot_states[slot_id],
+            SlotState::Completed,
+            "slot {slot_id} is not Completed (state: {:?})",
+            self.slot_states[slot_id],
+        );
+        // SAFETY: The slot is Completed — the kernel has finished writing.
+        // `buf_base` was derived from a valid, aligned allocation that outlives
+        // `self`. The slice covers exactly `slot_size` bytes within bounds.
+        unsafe { std::slice::from_raw_parts(self.buf_base.add(slot_id * self.slot_size), self.slot_size) }
+    }
+
+    /// Release a completed slot back to the free-list for reuse.
+    ///
+    /// # Panics
+    /// Panics if the slot is not in `Completed` state.
+    pub fn release_slot(&mut self, slot_id: usize) {
+        assert!(slot_id < self.max_slots, "slot_id out of range");
+        assert_eq!(
+            self.slot_states[slot_id],
+            SlotState::Completed,
+            "cannot release slot {slot_id}: not Completed (state: {:?})",
+            self.slot_states[slot_id],
+        );
+        self.slot_states[slot_id] = SlotState::Free;
+        self.free_slots.push_back(slot_id);
+    }
+
+    // ------------------------------------------------------------------
+    // Lifecycle helpers
+    // ------------------------------------------------------------------
+
+    /// Returns `true` if a free slot is available for [`enqueue_read`](Self::enqueue_read).
+    pub fn has_free_slot(&self) -> bool {
+        !self.free_slots.is_empty()
+    }
+
+    /// Returns the number of submitted but not yet completed reads.
+    pub fn in_flight_count(&self) -> usize {
+        self.in_flight
+    }
+
+    /// Returns the slot size in bytes.
+    pub fn slot_size(&self) -> usize {
+        self.slot_size
+    }
+
+    /// Returns the maximum number of buffer slots.
+    pub fn max_slots(&self) -> usize {
+        self.max_slots
+    }
+
+    /// Reset the reader for reuse: drain all in-flight IOs, release all
+    /// completed slots, then restore every slot to `Free`.
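+    ///
+    /// A sketch of reuse between queries (illustrative, not from this patch):
+    /// ```ignore
+    /// reader.reset();                  // blocks until in-flight IOs finish
+    /// assert!(reader.has_free_slot()); // every slot is Free again
+    /// ```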
+    pub fn reset(&mut self) {
+        self.drain_all();
+    }
+
+    /// Drain all in-flight IOs, blocking until they complete, then reset all
+    /// slot states to `Free`.
+    ///
+    /// Retries automatically on transient errors (`EINTR`). On unrecoverable
+    /// errors it aborts the process — deallocating the buffer while the kernel
+    /// still holds DMA references would cause memory corruption.
+    fn drain_all(&mut self) {
+        let mut remaining = self.in_flight;
+        while remaining > 0 {
+            match self.ring.submit_and_wait(remaining) {
+                Ok(_) => {}
+                Err(ref e) if e.raw_os_error() == Some(libc::EINTR) => continue,
+                Err(_) => {
+                    // Cannot safely deallocate while kernel may have DMA refs.
+                    std::process::abort();
+                }
+            }
+            for cqe in self.ring.completion() {
+                let _ = cqe;
+                remaining = remaining.saturating_sub(1);
+            }
+        }
+        self.in_flight = 0;
+        for state in &mut self.slot_states {
+            *state = SlotState::Free;
+        }
+        self.free_slots.clear();
+        self.free_slots.extend(0..self.max_slots);
+    }
+}
+
+impl Drop for PipelinedReader {
+    fn drop(&mut self) {
+        // Must wait for all in-flight kernel IOs to complete before the
+        // allocation backing `_slot_bufs` is freed.
+        self.drain_all();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::rngs::StdRng;
+    use rand::{Rng, SeedableRng};
+    use std::io::Write;
+
+    const SECTOR: usize = 4096;
+
+    /// Create a temp file with `n_sectors` sectors of known data.
+    /// Each sector is filled with the byte `(sector_index & 0xFF) as u8`.
+    fn make_test_file(n_sectors: usize) -> tempfile::NamedTempFile {
+        let mut f = tempfile::NamedTempFile::new().expect("create tempfile");
+        for i in 0..n_sectors {
+            let byte = (i & 0xFF) as u8;
+            f.write_all(&vec![byte; SECTOR]).expect("write sector");
+        }
+        f.flush().expect("flush");
+        f
+    }
+
+    /// Create a reader backed by a temp file. Returns both so the file
+    /// outlives the reader.
+    fn make_reader(
+        n_sectors: usize,
+        max_slots: usize,
+    ) -> (tempfile::NamedTempFile, PipelinedReader) {
+        let file = make_test_file(n_sectors);
+        let reader = PipelinedReader::new(
+            file.path().to_str().unwrap(),
+            max_slots,
+            SECTOR,
+            SECTOR,
+            &PipelinedReaderConfig::default(),
+        )
+        .unwrap();
+        (file, reader)
+    }
+
+    /// Enqueue reads for `sectors`, flush, wait for all completions.
+    /// Returns the slot IDs in enqueue order.
+    fn enqueue_flush_wait(
+        reader: &mut PipelinedReader,
+        sectors: impl IntoIterator<Item = usize>,
+    ) -> Vec<usize> {
+        let mut slots = Vec::new();
+        for s in sectors {
+            slots.push(reader.enqueue_read((s * SECTOR) as u64).unwrap());
+        }
+        reader.flush().unwrap();
+        drain_all_completions(reader);
+        slots
+    }
+
+    /// Wait until all in-flight IOs complete.
+    fn drain_all_completions(reader: &mut PipelinedReader) {
+        let mut buf = Vec::new();
+        while reader.in_flight_count() > 0 {
+            reader.wait_completions(&mut buf).unwrap();
+        }
+    }
+
+    /// Assert that a completed slot contains the expected fill byte for a
+    /// given sector index (test files fill sector N with byte N & 0xFF).
+ fn assert_sector_data(reader: &PipelinedReader, slot: usize, sector: usize) { + let buf = reader.get_slot_buf(slot); + let expected = (sector & 0xFF) as u8; + assert!( + buf.iter().all(|&b| b == expected), + "slot {slot} (sector {sector}): expected 0x{expected:02x}, got 0x{:02x}", + buf[0], + ); + } + + // =================================================================== + // Unit tests — each tests a single API behavior + // =================================================================== + + #[test] + fn slot_lifecycle_round_trip() { + let (_f, mut reader) = make_reader(4, 4); + + // Enqueue → flush → wait → get_buf → release + let slot = reader.enqueue_read(0).unwrap(); + assert_eq!(reader.slot_states[slot], SlotState::InFlight); + + reader.flush().unwrap(); + drain_all_completions(&mut reader); + assert_eq!(reader.slot_states[slot], SlotState::Completed); + + assert_sector_data(&reader, slot, 0); + reader.release_slot(slot); + assert_eq!(reader.slot_states[slot], SlotState::Free); + + // Reuse the slot for a different sector + let slots = enqueue_flush_wait(&mut reader, [1]); + assert_sector_data(&reader, slots[0], 1); + reader.release_slot(slots[0]); + } + + #[test] + fn slot_exhaustion_returns_error() { + let (_f, mut reader) = make_reader(8, 4); + for i in 0..4 { + reader.enqueue_read((i * SECTOR) as u64).unwrap(); + } + assert!(reader.enqueue_read(0).is_err()); + } + + #[test] + #[should_panic(expected = "not Completed")] + fn double_release_panics() { + let (_f, mut reader) = make_reader(1, 2); + let slots = enqueue_flush_wait(&mut reader, [0]); + reader.release_slot(slots[0]); + reader.release_slot(slots[0]); // should panic + } + + #[test] + #[should_panic(expected = "not Completed")] + fn get_buf_on_free_slot_panics() { + let (_f, reader) = make_reader(1, 2); + reader.get_slot_buf(0); + } + + #[test] + #[should_panic(expected = "not Completed")] + fn get_buf_on_inflight_slot_panics() { + let (_f, mut reader) = make_reader(1, 2); + let slot = reader.enqueue_read(0).unwrap(); + reader.flush().unwrap(); + reader.get_slot_buf(slot); // still InFlight + } + + #[test] + fn drop_drains_in_flight() { + let (_f, mut reader) = make_reader(4, 4); + for i in 0..4 { + reader.enqueue_read((i * SECTOR) as u64).unwrap(); + } + reader.flush().unwrap(); + drop(reader); // must not panic or leak + } + + #[test] + fn data_integrity_multi_slot() { + let (_f, mut reader) = make_reader(8, 4); + let slots = enqueue_flush_wait(&mut reader, 0..4); + for (slot, sector) in slots.iter().zip(0..4) { + assert_sector_data(&reader, *slot, sector); + reader.release_slot(*slot); + } + } + + #[test] + fn reset_clears_all_state() { + let (_f, mut reader) = make_reader(4, 4); + enqueue_flush_wait(&mut reader, [0, 1]); + reader.enqueue_read(2 * SECTOR as u64).unwrap(); + reader.flush().unwrap(); + + reader.reset(); + assert_eq!(reader.in_flight, 0); + assert_eq!(reader.free_slots.len(), 4); + assert!(reader.slot_states.iter().all(|&s| s == SlotState::Free)); + } + + #[test] + fn poll_and_wait_return_empty_when_idle() { + let (_f, mut reader) = make_reader(1, 2); + let mut buf = Vec::new(); + reader.poll_completions(&mut buf).unwrap(); + assert!(buf.is_empty()); + reader.wait_completions(&mut buf).unwrap(); + assert!(buf.is_empty()); + } + + #[test] + fn short_read_detected_as_error() { + let mut f = tempfile::NamedTempFile::new().unwrap(); + f.write_all(&vec![0xABu8; 512]).unwrap(); // < SECTOR + f.flush().unwrap(); + + let mut reader = PipelinedReader::new( + f.path().to_str().unwrap(), + 1, + SECTOR, + 
SECTOR,
+            &PipelinedReaderConfig::default(),
+        )
+        .unwrap();
+        reader.enqueue_read(0).unwrap();
+        reader.flush().unwrap();
+
+        let mut completed = Vec::new();
+        let result = reader.wait_completions(&mut completed);
+        assert!(result.is_err(), "short read should be detected");
+        assert!(completed.is_empty());
+    }
+
+    #[test]
+    fn drop_with_unflushed_sqes() {
+        let (_f, mut reader) = make_reader(8, 8);
+        for i in 0..8 {
+            reader.enqueue_read((i * SECTOR) as u64).unwrap();
+        }
+        // Enqueued but never flushed — drain_all's submit_and_wait handles it.
+        drop(reader);
+    }
+
+    // ===================================================================
+    // Stress tests — exercise the state machine at scale
+    // ===================================================================
+
+    /// Randomized state-machine fuzzer using a seeded RNG for reproducibility.
+    /// Exercises random interleavings of enqueue, flush, poll, wait, release,
+    /// and reset with data verification.
+    #[test]
+    fn stress_random_slot_lifecycle() {
+        let (_f, mut reader) = make_reader(256, 16);
+        let mut rng = StdRng::seed_from_u64(0xDEAD_BEEF);
+        let mut pending_completed: Vec<usize> = Vec::new();
+        let mut total_verified = 0u64;
+
+        for _ in 0..2000 {
+            match rng.random_range(0u32..100) {
+                0..40 => {
+                    if reader.has_free_slot() {
+                        let sector = rng.random_range(0usize..256);
+                        reader.enqueue_read((sector * SECTOR) as u64).unwrap();
+                    }
+                }
+                40..55 => {
+                    reader.flush().unwrap();
+                }
+                55..70 => {
+                    let mut buf = Vec::new();
+                    reader.poll_completions(&mut buf).unwrap();
+                    pending_completed.extend_from_slice(&buf);
+                }
+                70..80 => {
+                    if reader.in_flight_count() > 0 {
+                        reader.flush().unwrap();
+                        let mut buf = Vec::new();
+                        reader.wait_completions(&mut buf).unwrap();
+                        pending_completed.extend_from_slice(&buf);
+                    }
+                }
+                80..95 => {
+                    if let Some(slot) = pending_completed.pop() {
+                        let buf = reader.get_slot_buf(slot);
+                        let first = buf[0];
+                        assert!(
+                            buf.iter().all(|&b| b == first),
+                            "data corruption in slot {slot}"
+                        );
+                        reader.release_slot(slot);
+                        total_verified += 1;
+                    }
+                }
+                _ => {
+                    pending_completed.clear();
+                    reader.reset();
+                }
+            }
+        }
+
+        // Cleanup: flush + drain remaining
+        reader.flush().unwrap();
+        let mut buf = Vec::new();
+        while reader.in_flight_count() > 0 {
+            reader.wait_completions(&mut buf).unwrap();
+            for &slot in &buf {
+                let data = reader.get_slot_buf(slot);
+                assert!(data.iter().all(|&b| b == data[0]));
+                reader.release_slot(slot);
+                total_verified += 1;
+            }
+        }
+        for &slot in &pending_completed {
+            let data = reader.get_slot_buf(slot);
+            assert!(data.iter().all(|&b| b == data[0]));
+            reader.release_slot(slot);
+            total_verified += 1;
+        }
+        assert!(total_verified > 0, "stress test verified zero reads");
+    }
+
+    /// Saturate all slots, drain, repeat — catches off-by-one in free-list.
+    #[test]
+    fn stress_saturate_and_drain_cycles() {
+        let max_slots = 32;
+        let (_f, mut reader) = make_reader(max_slots, max_slots);
+
+        for cycle in 0..100 {
+            let sectors: Vec<usize> =
+                (0..max_slots).map(|i| (cycle * max_slots + i) % max_slots).collect();
+            let slots = enqueue_flush_wait(&mut reader, sectors.iter().copied());
+            assert!(reader.enqueue_read(0).is_err());
+
+            for (slot, &sector) in slots.iter().zip(sectors.iter()) {
+                assert_sector_data(&reader, *slot, sector);
+                reader.release_slot(*slot);
+            }
+        }
+    }
+
+    /// 1-slot reader: max state transitions per slot.
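+    /// Each iteration drives one full `Free → InFlight → Completed → Free` cycle.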
+ #[test] + fn stress_single_slot_rapid_reuse() { + let n_sectors = 64; + let (_f, mut reader) = make_reader(n_sectors, 1); + + for i in 0..500 { + let sector = i % n_sectors; + let slots = enqueue_flush_wait(&mut reader, [sector]); + assert_sector_data(&reader, slots[0], sector); + reader.release_slot(slots[0]); + } + } + + /// Drop with 0, 1, 2, … max_slots in-flight IOs. + #[test] + fn stress_drop_at_various_inflight_counts() { + let max_slots = 16; + for inflight in 0..=max_slots { + let (_f, mut reader) = make_reader(max_slots, max_slots); + for i in 0..inflight { + reader.enqueue_read((i * SECTOR) as u64).unwrap(); + } + if inflight > 0 { + reader.flush().unwrap(); + } + drop(reader); + } + } + + /// Read every sector in a 256-sector file through 8 slots, verify all. + #[test] + fn stress_full_file_sequential_scan() { + let n_sectors = 256; + let max_slots = 8; + let (_f, mut reader) = make_reader(n_sectors, max_slots); + + let mut sectors_verified = vec![false; n_sectors]; + let mut slot_to_sector = [0usize; 128]; + let mut next_sector = 0usize; + let mut buf = Vec::new(); + + while next_sector < n_sectors || reader.in_flight_count() > 0 { + while next_sector < n_sectors && reader.has_free_slot() { + let slot = reader.enqueue_read((next_sector * SECTOR) as u64).unwrap(); + slot_to_sector[slot] = next_sector; + next_sector += 1; + } + reader.flush().unwrap(); + + reader.wait_completions(&mut buf).unwrap(); + for &slot in &buf { + let sector = slot_to_sector[slot]; + assert_sector_data(&reader, slot, sector); + sectors_verified[sector] = true; + reader.release_slot(slot); + } + } + + assert!(sectors_verified.iter().all(|&v| v), "not all sectors verified"); + } +} From 7ff0962c110fc28b82c40794ccc0b6eb13220189 Mon Sep 17 00:00:00 2001 From: Philip Adams Date: Thu, 12 Feb 2026 20:34:25 -0800 Subject: [PATCH 46/46] fmt --- .../src/backend/disk_index/search.rs | 4 ++-- diskann-disk/src/storage/pipelined_reader.rs | 20 ++++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index d5aa1d5c2..7f8f7b9e2 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -12,9 +12,9 @@ use opentelemetry_sdk::trace::SdkTracerProvider; use diskann::{utils::VectorRepr, ANNResult}; use diskann_benchmark_runner::{files::InputFile, utils::MicroSeconds}; #[cfg(target_os = "linux")] -use diskann_disk::storage::PipelinedReaderConfig; -#[cfg(target_os = "linux")] use diskann_disk::search::provider::pipelined_accessor::PipelinedConfig; +#[cfg(target_os = "linux")] +use diskann_disk::storage::PipelinedReaderConfig; use diskann_disk::{ data_model::CachingStrategy, search::provider::{ diff --git a/diskann-disk/src/storage/pipelined_reader.rs b/diskann-disk/src/storage/pipelined_reader.rs index c502bfb6a..b27c127e0 100644 --- a/diskann-disk/src/storage/pipelined_reader.rs +++ b/diskann-disk/src/storage/pipelined_reader.rs @@ -269,9 +269,9 @@ impl PipelinedReader { self.slot_states[slot_id] = SlotState::Free; self.free_slots.push_back(slot_id); if first_error.is_none() { - first_error = Some(ANNError::log_io_error( - std::io::Error::from_raw_os_error(-cqe.result()), - )); + first_error = Some(ANNError::log_io_error(std::io::Error::from_raw_os_error( + -cqe.result(), + ))); } continue; } @@ -322,7 +322,9 @@ impl PipelinedReader { // SAFETY: The slot is Completed — the kernel has finished writing. 
// `buf_base` was derived from a valid, aligned allocation that outlives
         // `self`. The slice covers exactly `slot_size` bytes within bounds.
-        unsafe { std::slice::from_raw_parts(self.buf_base.add(slot_id * self.slot_size), self.slot_size) }
+        unsafe {
+            std::slice::from_raw_parts(self.buf_base.add(slot_id * self.slot_size), self.slot_size)
+        }
     }
 
     /// Release a completed slot back to the free-list for reuse.
@@ -705,8 +707,9 @@ mod tests {
         let (_f, mut reader) = make_reader(max_slots, max_slots);
 
         for cycle in 0..100 {
-            let sectors: Vec<usize> =
-                (0..max_slots).map(|i| (cycle * max_slots + i) % max_slots).collect();
+            let sectors: Vec<usize> = (0..max_slots)
+                .map(|i| (cycle * max_slots + i) % max_slots)
+                .collect();
             let slots = enqueue_flush_wait(&mut reader, sectors.iter().copied());
             assert!(reader.enqueue_read(0).is_err());
 
@@ -776,6 +779,9 @@ mod tests {
         }
     }
 
-        assert!(sectors_verified.iter().all(|&v| v), "not all sectors verified");
+        assert!(
+            sectors_verified.iter().all(|&v| v),
+            "not all sectors verified"
+        );
     }
 }