Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions jvector-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<!-- Re-enable tests for this module (a parent presumably sets skip=true) and run them
     with the repository root as the working directory - NOTE(review): looks like this is
     so tests can resolve dataset/fixture files via relative paths; confirm. -->
<skip>false</skip>
<workingDirectory>${project.parent.basedir}</workingDirectory>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ public static void main(String[] args) throws IOException {
try {
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
).getDataSet();
logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size());

String normalizedDatasetName = datasetName;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ private static void execute(Pattern pattern, boolean enableIndexCache, List<Func
for (var datasetName : datasetNames) {
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
).getDataSet();
Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ public static void main(String[] args) throws IOException {
String datasetName = config.dataset;
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Could not load dataset:" + datasetName)
);
).getDataSet();
// Register dataset info the first time we actually load the dataset for benchmarking
artifacts.registerDataset(datasetName, ds);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ public static void main(String[] args) throws IOException {

// Load dataset
var ds = new DataSetLoaderMFD().loadDataSet(datasetName)
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"));
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"))
.getDataSet();

// Run artifacts + selections (sys_info/dataset_info/experiments.csv)
RunArtifacts artifacts = RunArtifacts.open(runCfg, List.of(config));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
 * Copyright DataStax, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.github.jbellis.jvector.example.benchmarks.datasets;

import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Supplier;

/// A lightweight, lazy handle that separates *identifying* a dataset from *loading* its data.
///
/// Metadata such as the dataset name and similarity function are available immediately
/// without any I/O, while the expensive work of reading vectors, deduplicating, scrubbing
/// zero vectors, and normalizing is deferred until the first call to {@link #getDataSet()}.
///
/// This design allows callers to enumerate or filter available datasets cheaply, and
/// ensures that the full load-and-scrub pipeline runs at most once per handle thanks to
/// thread-safe caching.
///
/// Instances are created by {@link DataSetLoader} implementations; callers obtain them
/// through {@link DataSets#loadDataSet(String)}.
///
/// ### Typical usage
/// ```java
/// DataSetInfo info = DataSets.loadDataSet("ada002-100k").orElseThrow();
///
/// // Cheap — no vectors loaded yet
/// System.out.println(info.getName());
/// System.out.println(info.similarityFunction());
///
/// // First call triggers full load; subsequent calls return the cached DataSet
/// DataSet ds = info.getDataSet();
/// ```
///
/// @see DataSet
/// @see DataSetLoader
/// @see DataSets
public class DataSetInfo implements DataSetProperties {
    // Deferred load pipeline; invoked at most once (guarded by getDataSet's locking).
    private final Supplier<DataSet> loader;
    // Cheap, always-available metadata delegate (name, similarity function, scrub flags).
    private final DataSetProperties baseProperties;
    // volatile is required for safe publication under double-checked locking.
    private volatile DataSet cached;

    /// Creates a new dataset info handle.
    ///
    /// The supplied {@code loader} will not be invoked until {@link #getDataSet()} is called.
    /// It should perform the full load-and-scrub pipeline (read vectors, remove duplicates /
    /// zero vectors, filter queries, normalize) and return a ready-to-use {@link DataSet}.
    ///
    /// @param baseProperties the dataset properties (name, similarity function, etc.); must not be null
    /// @param loader a supplier that performs the deferred load; invoked at most once; must not be null
    public DataSetInfo(DataSetProperties baseProperties, Supplier<DataSet> loader) {
        this.baseProperties = Objects.requireNonNull(baseProperties, "baseProperties");
        this.loader = Objects.requireNonNull(loader, "loader");
    }

    /// {@inheritDoc}
    @Override
    public Optional<VectorSimilarityFunction> similarityFunction() {
        return baseProperties.similarityFunction();
    }

    /// {@inheritDoc}
    @Override
    public int numVectors() {
        return baseProperties.numVectors();
    }

    /// {@inheritDoc}
    @Override
    public String getName() {
        return baseProperties.getName();
    }

    /// {@inheritDoc}
    @Override
    public boolean isNormalized() {
        return baseProperties.isNormalized();
    }

    /// {@inheritDoc}
    @Override
    public boolean isZeroVectorFree() {
        return baseProperties.isZeroVectorFree();
    }

    /// {@inheritDoc}
    @Override
    public boolean isDuplicateVectorFree() {
        return baseProperties.isDuplicateVectorFree();
    }

    /// Returns the fully loaded and scrubbed {@link DataSet}.
    ///
    /// On the first invocation this triggers the deferred load pipeline, which may involve
    /// reading large vector files from disk, deduplication, zero-vector removal, and
    /// normalization. The result is cached so that subsequent calls return immediately.
    ///
    /// This method is thread-safe: concurrent callers will block until the first load
    /// completes, after which all callers share the same cached instance.
    ///
    /// @return the ready-to-use {@link DataSet}
    public DataSet getDataSet() {
        // Double-checked locking with a local variable (Effective Java, Item 83):
        // the common already-initialized path performs a single volatile read.
        DataSet result = cached;
        if (result == null) {
            synchronized (this) {
                result = cached;
                if (result == null) {
                    result = loader.get();
                    cached = result;
                }
            }
        }
        return result;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,28 @@
*/
public interface DataSetLoader {
/**
* Implementations of this method <EM>MUST NOT</EM> throw exceptions related to the presence or absence of a
* Looks up a dataset by name and returns a lightweight {@link DataSetInfo} handle.
*
* <p>The returned handle provides the dataset name and similarity function immediately,
* without loading vector data into memory. The full {@link DataSet} (vectors, ground truth,
* etc.) is loaded lazily on the first call to {@link DataSetInfo#getDataSet()}.
*
* <p>Implementations <em>MUST NOT</em> throw exceptions related to the presence or absence of a
* requested dataset. Instead, {@link Optional} should be used. Other errors should still be indicated with
* exceptions as usual, including any errors loading a dataset which has been found. Implementors should reliably
* return from this method, avoiding any {@link System#exit(int)} or similar calls.
* exceptions as usual, including any errors downloading or preparing a dataset which has been found.
* Implementors should reliably return from this method, avoiding any {@link System#exit(int)} or similar calls.
*
* <p>Implementations may perform file downloads or other preparation work before returning the handle,
* but should defer the expensive parsing and scrubbing of vector data to the {@link DataSetInfo} supplier.
*
* <HR/>
*
* Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
* <p>Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
* not found, and info level for when datasets are found and loaded. This can assist users troubleshooting
* diverse data sources.
*
* @param dataSetName
* @return a {@link DataSet}, if found
* @param dataSetName the logical dataset name (not a filename; do not include extensions like {@code .hdf5})
* @return a {@link DataSetInfo} handle for the dataset, if found
*/
Optional<DataSet> loadDataSet(String dataSetName);
Optional<DataSetInfo> loadDataSet(String dataSetName);
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,30 +34,42 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.IntStream;

/**
* This dataset loader will get and load hdf5 files from <a href="https://ann-benchmarks.com/">ann-benchmarks</a>.
*
* <p>The vector similarity function is first inferred from the filename (e.g. {@code -angular},
* {@code -euclidean}). If the filename does not contain a recognized suffix, the loader falls
* back to looking up the dataset in {@code dataset_metadata.yml} via {@link DataSetMetadataReader}.
* If neither source provides a similarity function, an error is thrown.
*/
public class DataSetLoaderHDF5 implements DataSetLoader {
public static final Path HDF5_DIR = Path.of("hdf5");
private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
public static final String HDF5_EXTN = ".hdf5";
private static final DataSetMetadataReader metadata = DataSetMetadataReader.load();

/**
* {@inheritDoc}
*/
public Optional<DataSet> loadDataSet(String datasetName) {
return maybeDownloadHdf5(datasetName).map(this::readHdf5Data);
public Optional<DataSetInfo> loadDataSet(String datasetName) {
return maybeDownloadHdf5(datasetName).map(path -> {
var props = getProperties(datasetName, path);
var similarity = props.similarityFunction()
.orElseThrow(() -> new IllegalArgumentException(
"No similarity function found for HDF5 dataset: " + datasetName
+ ". Either include -angular, -dot, or -euclidean in the filename,"
+ " or add an entry in dataset_metadata.yml"));
return new DataSetInfo(props, () -> readHdf5Data(path, similarity));
});
}

private DataSet readHdf5Data(Path path) {

// infer the similarity
VectorSimilarityFunction similarityFunction = getVectorSimilarityFunction(path);

// read the data
/// Reads base vectors, query vectors, and ground truth from an HDF5 file
/// and returns a scrubbed {@link DataSet}.
private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunction) {
VectorFloat<?>[] baseVectors;
VectorFloat<?>[] queryVectors;
var gtSets = new ArrayList<List<Integer>>();
Expand Down Expand Up @@ -94,27 +106,43 @@ private DataSet readHdf5Data(Path path) {
return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets);
}

/**
* Derive the similarity function from the dataset name.
* @param filename filename of the dataset AKA "name"
* @return The matching similarity function, or throw an error
*/
private static VectorSimilarityFunction getVectorSimilarityFunction(Path filename) {
VectorSimilarityFunction similarityFunction;
if (filename.toString().contains("-angular") || filename.toString().contains("-dot")) {
similarityFunction = VectorSimilarityFunction.COSINE;
/// Derives dataset properties from the filename, falling back to {@link DataSetMetadataReader}.
///
/// The filename is checked first for known suffixes ({@code -angular}, {@code -dot},
/// {@code -euclidean}) to infer the similarity function. If none match, the dataset name
/// is looked up in {@code dataset_metadata.yml}. If neither source provides properties,
/// a minimal {@link DataSetProperties} with an empty similarity function is returned
/// so that the caller can produce a clear error.
///
/// @param datasetName the logical dataset name (without {@code .hdf5} extension)
/// @param filename the resolved file path including the {@code .hdf5} extension
/// @return the dataset properties
private static DataSetProperties getProperties(String datasetName, Path filename) {
String filenameStr = filename.toString();
VectorSimilarityFunction inferred = null;
if (filenameStr.contains("-angular") || filenameStr.contains("-dot")) {
inferred = VectorSimilarityFunction.COSINE;
} else if (filenameStr.contains("-euclidean")) {
inferred = VectorSimilarityFunction.EUCLIDEAN;
}
else if (filename.toString().contains("-euclidean")) {
similarityFunction = VectorSimilarityFunction.EUCLIDEAN;
}
else {
throw new IllegalArgumentException("Unknown similarity function -- expected angular or euclidean for " + filename);

// If filename inference succeeded, build properties with just the SF
if (inferred != null) {
return new DataSetProperties.PropertyMap(Map.of(
DataSetProperties.KEY_NAME, datasetName,
DataSetProperties.KEY_SIMILARITY_FUNCTION, inferred));
}
return similarityFunction;

// Fall back to metadata YAML
return metadata.getProperties(datasetName)
.orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName)));
}

/// Downloads the HDF5 file for the given dataset if it is not already present locally.
///
/// @param datasetName the logical dataset name (without {@code .hdf5} extension)
/// @return the local path to the HDF5 file, or empty if the remote file was not found
private Optional<Path> maybeDownloadHdf5(String datasetName) {

var dsFilePath = HDF5_DIR.resolve(datasetName+HDF5_EXTN);

if (Files.exists(dsFilePath)) {
Expand All @@ -123,7 +151,6 @@ private Optional<Path> maybeDownloadHdf5(String datasetName) {

// Download from https://ann-benchmarks.com/datasetName
var url = "https://ann-benchmarks.com/" + datasetName + HDF5_EXTN;
System.out.println("Downloading: " + url);

HttpURLConnection connection;
while (true) {
Expand All @@ -148,6 +175,7 @@ private Optional<Path> maybeDownloadHdf5(String datasetName) {

try (InputStream in = connection.getInputStream()) {
Files.createDirectories(dsFilePath.getParent());
System.out.println("Downloading: " + url);
Files.copy(in, dsFilePath, StandardCopyOption.REPLACE_EXISTING);
} catch (IOException e) {
throw new RuntimeException("Error downloading data:" + e.getMessage(),e);
Expand Down
Loading
Loading