From 37e6dcb09d86293c17862555293af931b70ff726 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Fri, 20 Feb 2026 18:44:04 +0000 Subject: [PATCH 1/9] add DataSetInfo --- .../jvector/example/AutoBenchYAML.java | 2 +- .../github/jbellis/jvector/example/Bench.java | 2 +- .../jbellis/jvector/example/BenchYAML.java | 2 +- .../jvector/example/HelloVectorWorld.java | 3 +- .../benchmarks/datasets/DataSetInfo.java | 64 +++++++++++++++++++ .../benchmarks/datasets/DataSetLoader.java | 4 +- .../datasets/DataSetLoaderHDF5.java | 7 +- .../benchmarks/datasets/DataSetLoaderMFD.java | 55 +++++++++++++--- .../example/benchmarks/datasets/DataSets.java | 6 +- .../jvector/example/tutorial/DiskIntro.java | 2 +- .../example/tutorial/LargerThanMemory.java | 2 +- .../jvector/example/util/SiftLoader.java | 7 +- .../graph/disk/ParallelWriteExample.java | 2 +- .../jvector/microbench/GraphBuildBench.java | 2 +- 14 files changed, 134 insertions(+), 26 deletions(-) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java index 7922dd201..882608fbb 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java @@ -132,7 +132,7 @@ public static void main(String[] args) throws IOException { try { DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow( () -> new RuntimeException("Dataset " + datasetName + " not found") - ); + ).getDataSet(); logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size()); String normalizedDatasetName = datasetName; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java 
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 559d665fc..78a85e1fc 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -93,7 +93,7 @@ private static void execute(Pattern pattern, boolean enableIndexCache, List new RuntimeException("Dataset " + datasetName + " not found") - ); + ).getDataSet(); Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java index 710301054..343fcbd95 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java @@ -120,7 +120,7 @@ public static void main(String[] args) throws IOException { String datasetName = config.dataset; DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow( () -> new RuntimeException("Could not load dataset:" + datasetName) - ); + ).getDataSet(); // Register dataset info the first time we actually load the dataset for benchmarking artifacts.registerDataset(datasetName, ds); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java index f118d7695..032ea2f6c 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java @@ -38,7 +38,8 @@ public static void main(String[] args) throws IOException { // Load dataset var ds = new DataSetLoaderMFD().loadDataSet(datasetName) - 
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found")); + .orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found")) + .getDataSet(); // Run artifacts + selections (sys_info/dataset_info/experiments.csv) RunArtifacts artifacts = RunArtifacts.open(runCfg, List.of(config)); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java new file mode 100644 index 000000000..83dce8086 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java @@ -0,0 +1,64 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.example.benchmarks.datasets; + +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; + +import java.util.function.Supplier; + +/// A lightweight handle that identifies a dataset without eagerly loading its data. +/// The name and similarity function are available immediately, while the full +/// {@link DataSet} is loaded lazily on the first call to {@link #getDataSet()}. +public class DataSetInfo { + private final String name; + private final VectorSimilarityFunction similarityFunction; + private final Supplier loader; + private volatile DataSet cached; + + /// Creates a new dataset info handle. 
+ /// + /// @param name the dataset name + /// @param similarityFunction the similarity function used by this dataset + /// @param loader a supplier that loads the full dataset on demand + public DataSetInfo(String name, VectorSimilarityFunction similarityFunction, Supplier loader) { + this.name = name; + this.similarityFunction = similarityFunction; + this.loader = loader; + } + + /// Returns the dataset name, available without loading data. + public String getName() { + return name; + } + + /// Returns the similarity function, available without loading data. + public VectorSimilarityFunction getSimilarityFunction() { + return similarityFunction; + } + + /// Returns the full {@link DataSet}, loading it on first access and caching for subsequent calls. + public DataSet getDataSet() { + if (cached == null) { + synchronized (this) { + if (cached == null) { + cached = loader.get(); + } + } + } + return cached; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java index d280fbf91..e7c5b0954 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java @@ -35,7 +35,7 @@ public interface DataSetLoader { * diverse data sources. 
* * @param dataSetName - * @return a {@link DataSet}, if found + * @return a {@link DataSetInfo} handle for the dataset, if found */ - Optional loadDataSet(String dataSetName); + Optional loadDataSet(String dataSetName); } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java index 072a9b764..d531234a9 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java @@ -48,8 +48,11 @@ public class DataSetLoaderHDF5 implements DataSetLoader { /** * {@inheritDoc} */ - public Optional loadDataSet(String datasetName) { - return maybeDownloadHdf5(datasetName).map(this::readHdf5Data); + public Optional loadDataSet(String datasetName) { + return maybeDownloadHdf5(datasetName).map(path -> { + VectorSimilarityFunction similarity = getVectorSimilarityFunction(path); + return new DataSetInfo(datasetName, similarity, () -> readHdf5Data(path)); + }); } private DataSet readHdf5Data(Path path) { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java index 7381f0c35..37e9f8a84 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java @@ -31,6 +31,9 @@ import software.amazon.awssdk.transfer.s3.model.FileDownload; import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener; +import java.io.BufferedInputStream; +import java.io.DataInputStream; +import java.io.FileInputStream; import 
java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -54,8 +57,9 @@ public class DataSetLoaderMFD implements DataSetLoader { /** * {@inheritDoc} */ - public Optional loadDataSet(String fileName) { - return maybeDownloadFvecs(fileName).map(MultiFileDatasource::load); + public Optional loadDataSet(String fileName) { + return maybeDownloadFvecs(fileName).map(mfd -> + new DataSetInfo(mfd.name, VectorSimilarityFunction.COSINE, mfd::load)); } private Optional maybeDownloadFvecs(String name) { @@ -95,19 +99,39 @@ private Optional maybeDownloadFvecs(String name) { .build(); // 3 retries + boolean downloaded = false; for (int i = 0; i < 3; i++) { - FileDownload downloadFile = tm.downloadFile(downloadFileRequest); - CompletedFileDownload downloadResult = downloadFile.completionFuture().join(); - long downloadedSize = Files.size(localPath); + try { + FileDownload downloadFile = tm.downloadFile(downloadFileRequest); + CompletedFileDownload downloadResult = downloadFile.completionFuture().join(); + long downloadedSize = Files.size(localPath); + + // Check if downloaded file size matches the expected size + if (downloadedSize != downloadResult.response().contentLength()) { + logger.error("Incomplete download (got {} of {} bytes). Retrying...", + downloadedSize, downloadResult.response().contentLength()); + Files.deleteIfExists(localPath); + continue; + } + + // Validate the file header to catch corrupt downloads + if (!validateVecFileHeader(localPath)) { + logger.error("Downloaded file {} has an invalid header; deleting and retrying", urlPath); + Files.deleteIfExists(localPath); + continue; + } - // Check if downloaded file size matches the expected size - if (downloadedSize == downloadResult.response().contentLength()) { logger.info("Downloaded file of length " + downloadedSize); - break; // Successfully downloaded - } else { - logger.error("Incomplete download. 
Retrying..."); + downloaded = true; + break; + } catch (Exception e) { + logger.error("Download attempt {} failed for {}: {}", i + 1, urlPath, e.getMessage()); + Files.deleteIfExists(localPath); } } + if (!downloaded) { + throw new IOException("Failed to download " + urlPath + " after 3 attempts"); + } } tm.close(); } catch (Exception e) { @@ -117,6 +141,17 @@ private Optional maybeDownloadFvecs(String name) { return Optional.of(mfd); } + /// Reads the first 4 bytes of a vec file (fvecs or ivecs) and checks that the + /// little-endian int32 dimension/count value is positive and reasonable. + private static boolean validateVecFileHeader(Path path) { + try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))) { + int dimension = Integer.reverseBytes(dis.readInt()); + return dimension > 0 && dimension <= 100_000; + } catch (IOException e) { + return false; + } + } + private static S3AsyncClientBuilder s3AsyncClientBuilder() { return S3AsyncClient.builder() .region(Region.US_EAST_1) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java index da27c8f2c..5f502f79b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java @@ -33,11 +33,11 @@ public class DataSets { add(new DataSetLoaderMFD()); }}; - public static Optional loadDataSet(String dataSetName) { + public static Optional loadDataSet(String dataSetName) { return loadDataSet(dataSetName, defaultLoaders); } - public static Optional loadDataSet(String dataSetName, Collection loaders) { + public static Optional loadDataSet(String dataSetName, Collection loaders) { logger.info("loading dataset [{}]", dataSetName); if (dataSetName.endsWith(".hdf5")) { throw new 
InvalidParameterException("DataSet names are not meant to be file names. Did you mean " + dataSetName.replace(".hdf5", "") + "? "); @@ -45,7 +45,7 @@ public static Optional loadDataSet(String dataSetName, Collection dataSetLoaded = loader.loadDataSet(dataSetName); + Optional dataSetLoaded = loader.loadDataSet(dataSetName); if (dataSetLoaded.isPresent()) { logger.info("dataset [{}] found with loader [{}]", dataSetName, loader.getClass().getSimpleName()); return dataSetLoaded; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java index e38c5c5b8..cfb70da09 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java @@ -52,7 +52,7 @@ public static void main(String[] args) throws IOException { // This is a preconfigured dataset that will be downloaded automatically. DataSet dataset = DataSets.loadDataSet("ada002-100k").orElseThrow(() -> new RuntimeException("Dataset doesn't exist or wasn't configured correctly") - ); + ).getDataSet(); // The loaded DataSet provides a RAVV over the base vectors RandomAccessVectorValues ravv = dataset.getBaseRavv(); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java index 4e16cda5d..5f22b1cbc 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java @@ -62,7 +62,7 @@ public static void main(String[] args) throws IOException { // the base vectors in-memory. 
DataSet dataset = DataSets.loadDataSet("e5-small-v2-100k").orElseThrow(() -> new RuntimeException("Dataset doesn't exist or wasn't configured correctly") - ); + ).getDataSet(); // Remember that RAVVs need not be in-memory in the general case. // We will sample from this RAVV to compute the PQ codebooks. diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java index dd130e04b..a491d0c9e 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java @@ -39,7 +39,12 @@ public static List> readFvecs(String filePath) { try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) { while (dis.available() > 0) { var dimension = Integer.reverseBytes(dis.readInt()); - assert dimension > 0 : dimension; + if (dimension <= 0) { + throw new IOException("Corrupt fvecs file: negative or zero dimension " + dimension + " (possible file corruption or wrong format)"); + } + if (dimension > 100_000) { + throw new IOException("Unreasonable dimension " + dimension + " in fvecs file (possible file corruption or wrong format)"); + } var buffer = new byte[dimension * Float.BYTES]; dis.readFully(buffer); var byteBuffer = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java index 04f766f38..f3728234c 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java @@ -304,7 +304,7 @@ public static void main(String[] args) throws IOException { 
System.out.println("Loading dataset: " + datasetName); DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow( () -> new RuntimeException("Dataset " + datasetName + " not found") - ); + ).getDataSet(); System.out.printf("Loaded %d vectors of dimension %d%n", ds.getBaseVectors().size(), ds.getDimension()); var floatVectors = ds.getBaseRavv(); diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java index 559048dcb..8e9cc712f 100644 --- a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java +++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java @@ -46,7 +46,7 @@ public static class Parameters { public Parameters() { this.ds = new DataSetLoaderHDF5().loadDataSet("hdf5/glove-100-angular.hdf5").orElseThrow( () -> new RuntimeException("Unable to load dataset: hdf5/glove-100-angular.hdf5" ) - ); + ).getDataSet(); this.ravv = new ListRandomAccessVectorValues(ds.getBaseVectors(), ds.getBaseVectors().get(0).length()); } } From 90cfe18806bd319d505ffe63294cb12003a76a2d Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Fri, 20 Feb 2026 20:42:27 +0000 Subject: [PATCH 2/9] add some javadoc --- .../benchmarks/datasets/DataSetInfo.java | 63 ++++++++++++++++--- .../benchmarks/datasets/DataSetLoader.java | 19 ++++-- .../example/benchmarks/datasets/DataSets.java | 17 +++++ 3 files changed, 85 insertions(+), 14 deletions(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java index 83dce8086..0824410ec 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java 
@@ -20,9 +20,34 @@ import java.util.function.Supplier; -/// A lightweight handle that identifies a dataset without eagerly loading its data. -/// The name and similarity function are available immediately, while the full -/// {@link DataSet} is loaded lazily on the first call to {@link #getDataSet()}. +/// A lightweight, lazy handle that separates *identifying* a dataset from *loading* its data. +/// +/// Metadata such as the dataset name and similarity function are available immediately +/// without any I/O, while the expensive work of reading vectors, deduplicating, scrubbing +/// zero vectors, and normalizing is deferred until the first call to {@link #getDataSet()}. +/// +/// This design allows callers to enumerate or filter available datasets cheaply, and +/// ensures that the full load-and-scrub pipeline runs at most once per handle thanks to +/// thread-safe caching. +/// +/// Instances are created by {@link DataSetLoader} implementations; callers obtain them +/// through {@link DataSets#loadDataSet(String)}. +/// +/// ### Typical usage +/// ```java +/// DataSetInfo info = DataSets.loadDataSet("ada002-100k").orElseThrow(); +/// +/// // Cheap — no vectors loaded yet +/// System.out.println(info.getName()); +/// System.out.println(info.getSimilarityFunction()); +/// +/// // First call triggers full load; subsequent calls return the cached DataSet +/// DataSet ds = info.getDataSet(); +/// ``` +/// +/// @see DataSet +/// @see DataSetLoader +/// @see DataSets public class DataSetInfo { private final String name; private final VectorSimilarityFunction similarityFunction; @@ -31,26 +56,46 @@ public class DataSetInfo { /// Creates a new dataset info handle. /// - /// @param name the dataset name - /// @param similarityFunction the similarity function used by this dataset - /// @param loader a supplier that loads the full dataset on demand + /// The supplied {@code loader} will not be invoked until {@link #getDataSet()} is called. 
+ /// It should perform the full load-and-scrub pipeline (read vectors, remove duplicates / + /// zero vectors, filter queries, normalize) and return a ready-to-use {@link DataSet}. + /// + /// @param name the dataset name, used for display and lookup + /// @param similarityFunction the vector similarity function for this dataset + /// (e.g. {@link VectorSimilarityFunction#COSINE}) + /// @param loader a supplier that performs the deferred load; invoked at most once public DataSetInfo(String name, VectorSimilarityFunction similarityFunction, Supplier loader) { this.name = name; this.similarityFunction = similarityFunction; this.loader = loader; } - /// Returns the dataset name, available without loading data. + /// Returns the dataset name. + /// + /// This is always available without triggering a data load. public String getName() { return name; } - /// Returns the similarity function, available without loading data. + /// Returns the similarity function for this dataset. + /// + /// This is always available without triggering a data load. + /// For MFD datasets this is always {@link VectorSimilarityFunction#COSINE}; + /// for HDF5 datasets it is inferred from the filename (e.g. {@code -angular} or {@code -euclidean}). public VectorSimilarityFunction getSimilarityFunction() { return similarityFunction; } - /// Returns the full {@link DataSet}, loading it on first access and caching for subsequent calls. + /// Returns the fully loaded and scrubbed {@link DataSet}. + /// + /// On the first invocation this triggers the deferred load pipeline, which may involve + /// reading large vector files from disk, deduplication, zero-vector removal, and + /// normalization. The result is cached so that subsequent calls return immediately. + /// + /// This method is thread-safe: concurrent callers will block until the first load + /// completes, after which all callers share the same cached instance. 
+ /// + /// @return the ready-to-use {@link DataSet} public DataSet getDataSet() { if (cached == null) { synchronized (this) { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java index e7c5b0954..932ea2dc7 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java @@ -23,18 +23,27 @@ */ public interface DataSetLoader { /** - * Implementations of this method MUST NOT throw exceptions related to the presence or absence of a + * Looks up a dataset by name and returns a lightweight {@link DataSetInfo} handle. + * + *

<p>The returned handle provides the dataset name and similarity function immediately, + * without loading vector data into memory. The full {@link DataSet} (vectors, ground truth, + * etc.) is loaded lazily on the first call to {@link DataSetInfo#getDataSet()}. + * + *

<p>Implementations MUST NOT throw exceptions related to the presence or absence of a * requested dataset. Instead, {@link Optional} should be used. Other errors should still be indicated with - * exceptions as usual, including any errors loading a dataset which has been found. Implementors should reliably - * return from this method, avoiding any {@link System#exit(int)} or similar calls. + * exceptions as usual, including any errors downloading or preparing a dataset which has been found. + * Implementors should reliably return from this method, avoiding any {@link System#exit(int)} or similar calls. + * + *

<p>Implementations may perform file downloads or other preparation work before returning the handle, + * but should defer the expensive parsing and scrubbing of vector data to the {@link DataSetInfo} supplier. * *


* - * Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are + *

<p>Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are * not found, and info level for when datasets are found and loaded. This can assist users troubleshooting * diverse data sources. * - * @param dataSetName + * @param dataSetName the logical dataset name (not a filename; do not include extensions like {@code .hdf5}) * @return a {@link DataSetInfo} handle for the dataset, if found */ Optional<DataSetInfo> loadDataSet(String dataSetName); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java index 5f502f79b..449ff4fc6 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java @@ -25,6 +25,14 @@ import java.util.List; import java.util.Optional; +/// Facade for locating datasets across multiple {@link DataSetLoader} implementations. +/// +/// Returns a {@link DataSetInfo} handle whose vector data is loaded lazily on the first +/// call to {@link DataSetInfo#getDataSet()}, allowing callers to inspect dataset metadata +/// (name, similarity function) without incurring the cost of reading vectors into memory. +/// +/// @see DataSetInfo +/// @see DataSetLoader public class DataSets { private static final Logger logger = LoggerFactory.getLogger(DataSets.class); @@ -33,10 +41,19 @@ public class DataSets { add(new DataSetLoaderMFD()); }}; + /// Loads a dataset by name using the {@link #defaultLoaders}. + /// + /// @param dataSetName the logical dataset name (e.g.
{@code "ada002-100k"}) + /// @return a lazy {@link DataSetInfo} handle, or empty if no loader recognises the name public static Optional loadDataSet(String dataSetName) { return loadDataSet(dataSetName, defaultLoaders); } + /// Loads a dataset by name, trying each loader in order until one matches. + /// + /// @param dataSetName the logical dataset name (e.g. {@code "ada002-100k"}) + /// @param loaders the loaders to try, in priority order + /// @return a lazy {@link DataSetInfo} handle, or empty if no loader recognises the name public static Optional loadDataSet(String dataSetName, Collection loaders) { logger.info("loading dataset [{}]", dataSetName); if (dataSetName.endsWith(".hdf5")) { From d76dcdd4f93bb200ed3139ba22e52c25219cf1a0 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Fri, 20 Feb 2026 18:44:04 +0000 Subject: [PATCH 3/9] add DataSetInfo --- .../jvector/example/AutoBenchYAML.java | 2 +- .../github/jbellis/jvector/example/Bench.java | 2 +- .../jbellis/jvector/example/BenchYAML.java | 2 +- .../jvector/example/HelloVectorWorld.java | 3 +- .../benchmarks/datasets/DataSetInfo.java | 64 +++++++++++++++++++ .../benchmarks/datasets/DataSetLoader.java | 4 +- .../datasets/DataSetLoaderHDF5.java | 7 +- .../benchmarks/datasets/DataSetLoaderMFD.java | 55 +++++++++++++--- .../example/benchmarks/datasets/DataSets.java | 6 +- .../jvector/example/tutorial/DiskIntro.java | 2 +- .../example/tutorial/LargerThanMemory.java | 2 +- .../jvector/example/util/SiftLoader.java | 7 +- .../graph/disk/ParallelWriteExample.java | 2 +- .../jvector/microbench/GraphBuildBench.java | 2 +- 14 files changed, 134 insertions(+), 26 deletions(-) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java index 7922dd201..882608fbb 100644 --- 
a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java @@ -132,7 +132,7 @@ public static void main(String[] args) throws IOException { try { DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow( () -> new RuntimeException("Dataset " + datasetName + " not found") - ); + ).getDataSet(); logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size()); String normalizedDatasetName = datasetName; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 559d665fc..78a85e1fc 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -93,7 +93,7 @@ private static void execute(Pattern pattern, boolean enableIndexCache, List new RuntimeException("Dataset " + datasetName + " not found") - ); + ).getDataSet(); Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java index 710301054..343fcbd95 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java @@ -120,7 +120,7 @@ public static void main(String[] args) throws IOException { String datasetName = config.dataset; DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow( () -> new RuntimeException("Could not load dataset:" + datasetName) - ); + ).getDataSet(); // Register dataset info the first time we actually 
load the dataset for benchmarking artifacts.registerDataset(datasetName, ds); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java index f118d7695..032ea2f6c 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java @@ -38,7 +38,8 @@ public static void main(String[] args) throws IOException { // Load dataset var ds = new DataSetLoaderMFD().loadDataSet(datasetName) - .orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found")); + .orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found")) + .getDataSet(); // Run artifacts + selections (sys_info/dataset_info/experiments.csv) RunArtifacts artifacts = RunArtifacts.open(runCfg, List.of(config)); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java new file mode 100644 index 000000000..83dce8086 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java @@ -0,0 +1,64 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.github.jbellis.jvector.example.benchmarks.datasets; + +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; + +import java.util.function.Supplier; + +/// A lightweight handle that identifies a dataset without eagerly loading its data. +/// The name and similarity function are available immediately, while the full +/// {@link DataSet} is loaded lazily on the first call to {@link #getDataSet()}. +public class DataSetInfo { + private final String name; + private final VectorSimilarityFunction similarityFunction; + private final Supplier loader; + private volatile DataSet cached; + + /// Creates a new dataset info handle. + /// + /// @param name the dataset name + /// @param similarityFunction the similarity function used by this dataset + /// @param loader a supplier that loads the full dataset on demand + public DataSetInfo(String name, VectorSimilarityFunction similarityFunction, Supplier loader) { + this.name = name; + this.similarityFunction = similarityFunction; + this.loader = loader; + } + + /// Returns the dataset name, available without loading data. + public String getName() { + return name; + } + + /// Returns the similarity function, available without loading data. + public VectorSimilarityFunction getSimilarityFunction() { + return similarityFunction; + } + + /// Returns the full {@link DataSet}, loading it on first access and caching for subsequent calls. 
+ public DataSet getDataSet() { + if (cached == null) { + synchronized (this) { + if (cached == null) { + cached = loader.get(); + } + } + } + return cached; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java index d280fbf91..e7c5b0954 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java @@ -35,7 +35,7 @@ public interface DataSetLoader { * diverse data sources. * * @param dataSetName - * @return a {@link DataSet}, if found + * @return a {@link DataSetInfo} handle for the dataset, if found */ - Optional loadDataSet(String dataSetName); + Optional loadDataSet(String dataSetName); } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java index 072a9b764..d531234a9 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java @@ -48,8 +48,11 @@ public class DataSetLoaderHDF5 implements DataSetLoader { /** * {@inheritDoc} */ - public Optional loadDataSet(String datasetName) { - return maybeDownloadHdf5(datasetName).map(this::readHdf5Data); + public Optional loadDataSet(String datasetName) { + return maybeDownloadHdf5(datasetName).map(path -> { + VectorSimilarityFunction similarity = getVectorSimilarityFunction(path); + return new DataSetInfo(datasetName, similarity, () -> readHdf5Data(path)); + }); } private DataSet readHdf5Data(Path path) { diff --git 
a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java index 7381f0c35..37e9f8a84 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java @@ -31,6 +31,9 @@ import software.amazon.awssdk.transfer.s3.model.FileDownload; import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener; +import java.io.BufferedInputStream; +import java.io.DataInputStream; +import java.io.FileInputStream; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -54,8 +57,9 @@ public class DataSetLoaderMFD implements DataSetLoader { /** * {@inheritDoc} */ - public Optional loadDataSet(String fileName) { - return maybeDownloadFvecs(fileName).map(MultiFileDatasource::load); + public Optional loadDataSet(String fileName) { + return maybeDownloadFvecs(fileName).map(mfd -> + new DataSetInfo(mfd.name, VectorSimilarityFunction.COSINE, mfd::load)); } private Optional maybeDownloadFvecs(String name) { @@ -95,19 +99,39 @@ private Optional maybeDownloadFvecs(String name) { .build(); // 3 retries + boolean downloaded = false; for (int i = 0; i < 3; i++) { - FileDownload downloadFile = tm.downloadFile(downloadFileRequest); - CompletedFileDownload downloadResult = downloadFile.completionFuture().join(); - long downloadedSize = Files.size(localPath); + try { + FileDownload downloadFile = tm.downloadFile(downloadFileRequest); + CompletedFileDownload downloadResult = downloadFile.completionFuture().join(); + long downloadedSize = Files.size(localPath); + + // Check if downloaded file size matches the expected size + if (downloadedSize != downloadResult.response().contentLength()) { + logger.error("Incomplete download (got {} of {} 
bytes). Retrying...", + downloadedSize, downloadResult.response().contentLength()); + Files.deleteIfExists(localPath); + continue; + } + + // Validate the file header to catch corrupt downloads + if (!validateVecFileHeader(localPath)) { + logger.error("Downloaded file {} has an invalid header; deleting and retrying", urlPath); + Files.deleteIfExists(localPath); + continue; + } - // Check if downloaded file size matches the expected size - if (downloadedSize == downloadResult.response().contentLength()) { logger.info("Downloaded file of length " + downloadedSize); - break; // Successfully downloaded - } else { - logger.error("Incomplete download. Retrying..."); + downloaded = true; + break; + } catch (Exception e) { + logger.error("Download attempt {} failed for {}: {}", i + 1, urlPath, e.getMessage()); + Files.deleteIfExists(localPath); } } + if (!downloaded) { + throw new IOException("Failed to download " + urlPath + " after 3 attempts"); + } } tm.close(); } catch (Exception e) { @@ -117,6 +141,17 @@ private Optional maybeDownloadFvecs(String name) { return Optional.of(mfd); } + /// Reads the first 4 bytes of a vec file (fvecs or ivecs) and checks that the + /// little-endian int32 dimension/count value is positive and reasonable. 
+ private static boolean validateVecFileHeader(Path path) { + try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))) { + int dimension = Integer.reverseBytes(dis.readInt()); + return dimension > 0 && dimension <= 100_000; + } catch (IOException e) { + return false; + } + } + private static S3AsyncClientBuilder s3AsyncClientBuilder() { return S3AsyncClient.builder() .region(Region.US_EAST_1) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java index da27c8f2c..5f502f79b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java @@ -33,11 +33,11 @@ public class DataSets { add(new DataSetLoaderMFD()); }}; - public static Optional loadDataSet(String dataSetName) { + public static Optional loadDataSet(String dataSetName) { return loadDataSet(dataSetName, defaultLoaders); } - public static Optional loadDataSet(String dataSetName, Collection loaders) { + public static Optional loadDataSet(String dataSetName, Collection loaders) { logger.info("loading dataset [{}]", dataSetName); if (dataSetName.endsWith(".hdf5")) { throw new InvalidParameterException("DataSet names are not meant to be file names. Did you mean " + dataSetName.replace(".hdf5", "") + "? 
"); @@ -45,7 +45,7 @@ public static Optional loadDataSet(String dataSetName, Collection dataSetLoaded = loader.loadDataSet(dataSetName); + Optional dataSetLoaded = loader.loadDataSet(dataSetName); if (dataSetLoaded.isPresent()) { logger.info("dataset [{}] found with loader [{}]", dataSetName, loader.getClass().getSimpleName()); return dataSetLoaded; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java index e38c5c5b8..cfb70da09 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java @@ -52,7 +52,7 @@ public static void main(String[] args) throws IOException { // This is a preconfigured dataset that will be downloaded automatically. DataSet dataset = DataSets.loadDataSet("ada002-100k").orElseThrow(() -> new RuntimeException("Dataset doesn't exist or wasn't configured correctly") - ); + ).getDataSet(); // The loaded DataSet provides a RAVV over the base vectors RandomAccessVectorValues ravv = dataset.getBaseRavv(); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java index 4e16cda5d..5f22b1cbc 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java @@ -62,7 +62,7 @@ public static void main(String[] args) throws IOException { // the base vectors in-memory. 
DataSet dataset = DataSets.loadDataSet("e5-small-v2-100k").orElseThrow(() -> new RuntimeException("Dataset doesn't exist or wasn't configured correctly") - ); + ).getDataSet(); // Remember that RAVVs need not be in-memory in the general case. // We will sample from this RAVV to compute the PQ codebooks. diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java index dd130e04b..a491d0c9e 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java @@ -39,7 +39,12 @@ public static List> readFvecs(String filePath) { try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) { while (dis.available() > 0) { var dimension = Integer.reverseBytes(dis.readInt()); - assert dimension > 0 : dimension; + if (dimension <= 0) { + throw new IOException("Corrupt fvecs file: negative or zero dimension " + dimension + " (possible file corruption or wrong format)"); + } + if (dimension > 100_000) { + throw new IOException("Unreasonable dimension " + dimension + " in fvecs file (possible file corruption or wrong format)"); + } var buffer = new byte[dimension * Float.BYTES]; dis.readFully(buffer); var byteBuffer = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java index 04f766f38..f3728234c 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java @@ -304,7 +304,7 @@ public static void main(String[] args) throws IOException { 
System.out.println("Loading dataset: " + datasetName); DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow( () -> new RuntimeException("Dataset " + datasetName + " not found") - ); + ).getDataSet(); System.out.printf("Loaded %d vectors of dimension %d%n", ds.getBaseVectors().size(), ds.getDimension()); var floatVectors = ds.getBaseRavv(); diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java index 559048dcb..8e9cc712f 100644 --- a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java +++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java @@ -46,7 +46,7 @@ public static class Parameters { public Parameters() { this.ds = new DataSetLoaderHDF5().loadDataSet("hdf5/glove-100-angular.hdf5").orElseThrow( () -> new RuntimeException("Unable to load dataset: hdf5/glove-100-angular.hdf5" ) - ); + ).getDataSet(); this.ravv = new ListRandomAccessVectorValues(ds.getBaseVectors(), ds.getBaseVectors().get(0).length()); } } From 0c74def7383a178e0e2d522573d152053c76cce5 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Fri, 20 Feb 2026 20:42:27 +0000 Subject: [PATCH 4/9] add some javadoc --- .../benchmarks/datasets/DataSetInfo.java | 63 ++++++++++++++++--- .../benchmarks/datasets/DataSetLoader.java | 19 ++++-- .../example/benchmarks/datasets/DataSets.java | 17 +++++ 3 files changed, 85 insertions(+), 14 deletions(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java index 83dce8086..0824410ec 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java 
@@ -20,9 +20,34 @@ import java.util.function.Supplier; -/// A lightweight handle that identifies a dataset without eagerly loading its data. -/// The name and similarity function are available immediately, while the full -/// {@link DataSet} is loaded lazily on the first call to {@link #getDataSet()}. +/// A lightweight, lazy handle that separates *identifying* a dataset from *loading* its data. +/// +/// Metadata such as the dataset name and similarity function are available immediately +/// without any I/O, while the expensive work of reading vectors, deduplicating, scrubbing +/// zero vectors, and normalizing is deferred until the first call to {@link #getDataSet()}. +/// +/// This design allows callers to enumerate or filter available datasets cheaply, and +/// ensures that the full load-and-scrub pipeline runs at most once per handle thanks to +/// thread-safe caching. +/// +/// Instances are created by {@link DataSetLoader} implementations; callers obtain them +/// through {@link DataSets#loadDataSet(String)}. +/// +/// ### Typical usage +/// ```java +/// DataSetInfo info = DataSets.loadDataSet("ada002-100k").orElseThrow(); +/// +/// // Cheap — no vectors loaded yet +/// System.out.println(info.getName()); +/// System.out.println(info.getSimilarityFunction()); +/// +/// // First call triggers full load; subsequent calls return the cached DataSet +/// DataSet ds = info.getDataSet(); +/// ``` +/// +/// @see DataSet +/// @see DataSetLoader +/// @see DataSets public class DataSetInfo { private final String name; private final VectorSimilarityFunction similarityFunction; @@ -31,26 +56,46 @@ public class DataSetInfo { /// Creates a new dataset info handle. /// - /// @param name the dataset name - /// @param similarityFunction the similarity function used by this dataset - /// @param loader a supplier that loads the full dataset on demand + /// The supplied {@code loader} will not be invoked until {@link #getDataSet()} is called. 
+ /// It should perform the full load-and-scrub pipeline (read vectors, remove duplicates / + /// zero vectors, filter queries, normalize) and return a ready-to-use {@link DataSet}. + /// + /// @param name the dataset name, used for display and lookup + /// @param similarityFunction the vector similarity function for this dataset + /// (e.g. {@link VectorSimilarityFunction#COSINE}) + /// @param loader a supplier that performs the deferred load; invoked at most once public DataSetInfo(String name, VectorSimilarityFunction similarityFunction, Supplier loader) { this.name = name; this.similarityFunction = similarityFunction; this.loader = loader; } - /// Returns the dataset name, available without loading data. + /// Returns the dataset name. + /// + /// This is always available without triggering a data load. public String getName() { return name; } - /// Returns the similarity function, available without loading data. + /// Returns the similarity function for this dataset. + /// + /// This is always available without triggering a data load. + /// For MFD datasets this is always {@link VectorSimilarityFunction#COSINE}; + /// for HDF5 datasets it is inferred from the filename (e.g. {@code -angular} or {@code -euclidean}). public VectorSimilarityFunction getSimilarityFunction() { return similarityFunction; } - /// Returns the full {@link DataSet}, loading it on first access and caching for subsequent calls. + /// Returns the fully loaded and scrubbed {@link DataSet}. + /// + /// On the first invocation this triggers the deferred load pipeline, which may involve + /// reading large vector files from disk, deduplication, zero-vector removal, and + /// normalization. The result is cached so that subsequent calls return immediately. + /// + /// This method is thread-safe: concurrent callers will block until the first load + /// completes, after which all callers share the same cached instance. 
+ /// + /// @return the ready-to-use {@link DataSet} public DataSet getDataSet() { if (cached == null) { synchronized (this) { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java index e7c5b0954..932ea2dc7 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java @@ -23,18 +23,27 @@ */ public interface DataSetLoader { /** - * Implementations of this method MUST NOT throw exceptions related to the presence or absence of a + * Looks up a dataset by name and returns a lightweight {@link DataSetInfo} handle. + * + *

The returned handle provides the dataset name and similarity function immediately, + * without loading vector data into memory. The full {@link DataSet} (vectors, ground truth, + * etc.) is loaded lazily on the first call to {@link DataSetInfo#getDataSet()}. + * + *

Implementations MUST NOT throw exceptions related to the presence or absence of a * requested dataset. Instead, {@link Optional} should be used. Other errors should still be indicated with - * exceptions as usual, including any errors loading a dataset which has been found. Implementors should reliably - * return from this method, avoiding any {@link System#exit(int)} or similar calls. + * exceptions as usual, including any errors downloading or preparing a dataset which has been found. + * Implementors should reliably return from this method, avoiding any {@link System#exit(int)} or similar calls. + * + *

Implementations may perform file downloads or other preparation work before returning the handle, + * but should defer the expensive parsing and scrubbing of vector data to the {@link DataSetInfo} supplier. * *


* - * Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are + *

Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are * not found, and info level for when datasets are found and loaded. This can assist users troubleshooting * diverse data sources. * - * @param dataSetName + * @param dataSetName the logical dataset name (not a filename; do not include extensions like {@code .hdf5}) * @return a {@link DataSetInfo} handle for the dataset, if found */ Optional loadDataSet(String dataSetName); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java index 5f502f79b..449ff4fc6 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java @@ -25,6 +25,14 @@ import java.util.List; import java.util.Optional; +/// Facade for locating datasets across multiple {@link DataSetLoader} implementations. +/// +/// Returns a {@link DataSetInfo} handle whose vector data is loaded lazily on the first +/// call to {@link DataSetInfo#getDataSet()}, allowing callers to inspect dataset metadata +/// (name, similarity function) without incurring the cost of reading vectors into memory. +/// +/// @see DataSetInfo +/// @see DataSetLoader public class DataSets { private static final Logger logger = LoggerFactory.getLogger(DataSets.class); @@ -33,10 +41,19 @@ public class DataSets { add(new DataSetLoaderMFD()); }}; + /// Loads a dataset by name using the {@link #defaultLoaders}. + /// + /// @param dataSetName the logical dataset name (e.g. 
{@code "ada002-100k"}) + /// @return a lazy {@link DataSetInfo} handle, or empty if no loader recognises the name public static Optional loadDataSet(String dataSetName) { return loadDataSet(dataSetName, defaultLoaders); } + /// Loads a dataset by name, trying each loader in order until one matches. + /// + /// @param dataSetName the logical dataset name (e.g. {@code "ada002-100k"}) + /// @param loaders the loaders to try, in priority order + /// @return a lazy {@link DataSetInfo} handle, or empty if no loader recognises the name public static Optional loadDataSet(String dataSetName, Collection loaders) { logger.info("loading dataset [{}]", dataSetName); if (dataSetName.endsWith(".hdf5")) { From 045c252cb41ebab02f41bd0ede280b7e5e17309f Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 19 Mar 2026 17:35:50 +0000 Subject: [PATCH 5/9] expand type-safe metadata support for datasets --- jvector-examples/pom.xml | 8 + .../benchmarks/datasets/DataSetInfo.java | 72 +++- .../datasets/DataSetLoaderHDF5.java | 75 ++-- .../benchmarks/datasets/DataSetLoaderMFD.java | 33 +- .../datasets/DataSetMetadataReader.java | 103 +++++ .../datasets/DataSetProperties.java | 210 +++++++++ .../datasets/DataSetPropertiesTest.java | 404 ++++++++++++++++++ .../example/util/BenchmarkSummarizerTest.java | 5 +- .../example/benchmarks/datasets/empty.yml | 1 + .../benchmarks/datasets/flat_entry.yml | 5 + .../benchmarks/datasets/multi_entry.yml | 14 + .../benchmarks/datasets/scalar_entry.yml | 1 + .../yaml-configs/dataset_metadata.yml | 34 ++ jvector-examples/yaml-configs/datasets.yml | 3 - 14 files changed, 912 insertions(+), 56 deletions(-) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java create mode 100644 
jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java create mode 100644 jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/empty.yml create mode 100644 jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/flat_entry.yml create mode 100644 jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/multi_entry.yml create mode 100644 jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/scalar_entry.yml create mode 100644 jvector-examples/yaml-configs/dataset_metadata.yml diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml index ae8c77d6d..9daf7b8cf 100644 --- a/jvector-examples/pom.xml +++ b/jvector-examples/pom.xml @@ -16,6 +16,14 @@ + + org.apache.maven.plugins + maven-surefire-plugin + + false + ${project.parent.basedir} + + org.codehaus.mojo exec-maven-plugin diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java index 0824410ec..44b68638e 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java @@ -17,7 +17,7 @@ package io.github.jbellis.jvector.example.benchmarks.datasets; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; - +import java.util.Optional; import java.util.function.Supplier; /// A lightweight, lazy handle that separates *identifying* a dataset from *loading* its data. 
@@ -39,7 +39,7 @@ /// /// // Cheap — no vectors loaded yet /// System.out.println(info.getName()); -/// System.out.println(info.getSimilarityFunction()); +/// System.out.println(info.similarityFunction()); /// /// // First call triggers full load; subsequent calls return the cached DataSet /// DataSet ds = info.getDataSet(); @@ -48,10 +48,9 @@ /// @see DataSet /// @see DataSetLoader /// @see DataSets -public class DataSetInfo { - private final String name; - private final VectorSimilarityFunction similarityFunction; +public class DataSetInfo implements DataSetProperties { private final Supplier loader; + private final DataSetProperties baseProperties; private volatile DataSet cached; /// Creates a new dataset info handle. @@ -60,30 +59,59 @@ public class DataSetInfo { /// It should perform the full load-and-scrub pipeline (read vectors, remove duplicates / /// zero vectors, filter queries, normalize) and return a ready-to-use {@link DataSet}. /// - /// @param name the dataset name, used for display and lookup - /// @param similarityFunction the vector similarity function for this dataset - /// (e.g. {@link VectorSimilarityFunction#COSINE}) + /// @param baseProperties the dataset properties (name, similarity function, etc.) /// @param loader a supplier that performs the deferred load; invoked at most once - public DataSetInfo(String name, VectorSimilarityFunction similarityFunction, Supplier loader) { - this.name = name; - this.similarityFunction = similarityFunction; + public DataSetInfo(DataSetProperties baseProperties, Supplier loader) { + this.baseProperties = baseProperties; this.loader = loader; } - /// Returns the dataset name. - /// - /// This is always available without triggering a data load. 
+ /** + * @inheritDoc + */ + @Override + public Optional similarityFunction() { + return baseProperties.similarityFunction(); + } + + /** + * @inheritDoc + */ + @Override + public int numVectors() { + return this.baseProperties.numVectors(); + } + + /** + * @inheritDoc + */ + @Override public String getName() { - return name; + return baseProperties.getName(); } - /// Returns the similarity function for this dataset. - /// - /// This is always available without triggering a data load. - /// For MFD datasets this is always {@link VectorSimilarityFunction#COSINE}; - /// for HDF5 datasets it is inferred from the filename (e.g. {@code -angular} or {@code -euclidean}). - public VectorSimilarityFunction getSimilarityFunction() { - return similarityFunction; + /** + * @inheritDoc + */ + @Override + public boolean isNormalized() { + return baseProperties.isNormalized(); + } + + /** + * @inheritDoc + */ + @Override + public boolean isZeroVectorFree() { + return baseProperties.isZeroVectorFree(); + } + + /** + * @inheritDoc + */ + @Override + public boolean isDuplicateVectorFree() { + return baseProperties.isDuplicateVectorFree(); } /// Returns the fully loaded and scrubbed {@link DataSet}. diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java index d531234a9..aed5d99e7 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java @@ -34,33 +34,42 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.stream.IntStream; /** * This dataset loader will get and load hdf5 files from ann-benchmarks. + * + *

The vector similarity function is first inferred from the filename (e.g. {@code -angular}, + * {@code -euclidean}). If the filename does not contain a recognized suffix, the loader falls + * back to looking up the dataset in {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. + * If neither source provides a similarity function, an error is thrown. */ public class DataSetLoaderHDF5 implements DataSetLoader { public static final Path HDF5_DIR = Path.of("hdf5"); private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport(); public static final String HDF5_EXTN = ".hdf5"; + private static final DataSetMetadataReader metadata = DataSetMetadataReader.load(); /** * {@inheritDoc} */ public Optional loadDataSet(String datasetName) { return maybeDownloadHdf5(datasetName).map(path -> { - VectorSimilarityFunction similarity = getVectorSimilarityFunction(path); - return new DataSetInfo(datasetName, similarity, () -> readHdf5Data(path)); + var props = getProperties(datasetName, path); + var similarity = props.similarityFunction() + .orElseThrow(() -> new IllegalArgumentException( + "No similarity function found for HDF5 dataset: " + datasetName + + ". Either include -angular, -dot, or -euclidean in the filename," + + " or add an entry in dataset_metadata.yml")); + return new DataSetInfo(props, () -> readHdf5Data(path, similarity)); }); } - private DataSet readHdf5Data(Path path) { - - // infer the similarity - VectorSimilarityFunction similarityFunction = getVectorSimilarityFunction(path); - - // read the data + /// Reads base vectors, query vectors, and ground truth from an HDF5 file + /// and returns a scrubbed {@link DataSet}. 
+ private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunction) { VectorFloat[] baseVectors; VectorFloat[] queryVectors; var gtSets = new ArrayList>(); @@ -97,27 +106,43 @@ private DataSet readHdf5Data(Path path) { return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets); } - /** - * Derive the similarity function from the dataset name. - * @param filename filename of the dataset AKA "name" - * @return The matching similarity function, or throw an error - */ - private static VectorSimilarityFunction getVectorSimilarityFunction(Path filename) { - VectorSimilarityFunction similarityFunction; - if (filename.toString().contains("-angular") || filename.toString().contains("-dot")) { - similarityFunction = VectorSimilarityFunction.COSINE; + /// Derives dataset properties from the filename, falling back to {@link DataSetMetadataReader}. + /// + /// The filename is checked first for known suffixes ({@code -angular}, {@code -dot}, + /// {@code -euclidean}) to infer the similarity function. If none match, the dataset name + /// is looked up in {@code dataset_metadata.yml}. If neither source provides properties, + /// a minimal {@link DataSetProperties} with an empty similarity function is returned + /// so that the caller can produce a clear error. 
+ /// + /// @param datasetName the logical dataset name (without {@code .hdf5} extension) + /// @param filename the resolved file path including the {@code .hdf5} extension + /// @return the dataset properties + private static DataSetProperties getProperties(String datasetName, Path filename) { + String filenameStr = filename.toString(); + VectorSimilarityFunction inferred = null; + if (filenameStr.contains("-angular") || filenameStr.contains("-dot")) { + inferred = VectorSimilarityFunction.COSINE; + } else if (filenameStr.contains("-euclidean")) { + inferred = VectorSimilarityFunction.EUCLIDEAN; } - else if (filename.toString().contains("-euclidean")) { - similarityFunction = VectorSimilarityFunction.EUCLIDEAN; - } - else { - throw new IllegalArgumentException("Unknown similarity function -- expected angular or euclidean for " + filename); + + // If filename inference succeeded, build properties with just the SF + if (inferred != null) { + return new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_NAME, datasetName, + DataSetProperties.KEY_SIMILARITY_FUNCTION, inferred)); } - return similarityFunction; + + // Fall back to metadata YAML + return metadata.getProperties(datasetName) + .orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName))); } + /// Downloads the HDF5 file for the given dataset if it is not already present locally. 
+ /// + /// @param datasetName the logical dataset name (without {@code .hdf5} extension) + /// @return the local path to the HDF5 file, or empty if the remote file was not found private Optional maybeDownloadHdf5(String datasetName) { - var dsFilePath = HDF5_DIR.resolve(datasetName+HDF5_EXTN); if (Files.exists(dsFilePath)) { @@ -126,7 +151,6 @@ private Optional maybeDownloadHdf5(String datasetName) { // Download from https://ann-benchmarks.com/datasetName var url = "https://ann-benchmarks.com/" + datasetName + HDF5_EXTN; - System.out.println("Downloading: " + url); HttpURLConnection connection; while (true) { @@ -151,6 +175,7 @@ private Optional maybeDownloadHdf5(String datasetName) { try (InputStream in = connection.getInputStream()) { Files.createDirectories(dsFilePath.getParent()); + System.out.println("Downloading: " + url); Files.copy(in, dsFilePath, StandardCopyOption.REPLACE_EXISTING); } catch (IOException e) { throw new RuntimeException("Error downloading data:" + e.getMessage(),e); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java index 37e9f8a84..16dedbb82 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java @@ -43,6 +43,10 @@ /** * This dataset loader supports multi-file datasets which are comprised of several files as defined in * {@link DataSetLoaderMFD.MultiFileDatasource}. + * + *

The vector similarity function is determined by looking up the dataset name in + * {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. If no entry is found, + * an error is thrown. */ public class DataSetLoaderMFD implements DataSetLoader { @@ -53,15 +57,27 @@ public class DataSetLoaderMFD implements DataSetLoader { private static final String fvecDir = "fvec"; private static final String bucketName = "astra-vector"; private static final List bucketNames = List.of(bucketName, infraBucketName); + private static final DataSetMetadataReader metadata = DataSetMetadataReader.load(); /** * {@inheritDoc} */ public Optional loadDataSet(String fileName) { - return maybeDownloadFvecs(fileName).map(mfd -> - new DataSetInfo(mfd.name, VectorSimilarityFunction.COSINE, mfd::load)); + return maybeDownloadFvecs(fileName).map(mfd -> { + var props = metadata.getProperties(mfd.name) + .orElseThrow(() -> new IllegalArgumentException( + "No metadata configured in dataset_metadata.yml for MFD dataset: " + mfd.name)); + var vsf = props.similarityFunction() + .orElseThrow(() -> new IllegalArgumentException( + "No similarity_function configured in dataset_metadata.yml for MFD dataset: " + mfd.name)); + return new DataSetInfo(props, () -> mfd.load(vsf)); + }); } + /// Downloads the fvec/ivec files for the named dataset from S3 if not already present locally. + /// + /// @param name the logical dataset name + /// @return the datasource descriptor, or empty if the name is not a known multi-file dataset private Optional maybeDownloadFvecs(String name) { String bucket = infraDatasets.contains(name) ? infraBucketName : bucketName; var mfd = MultiFileDatasource.byName.get(name); @@ -152,6 +168,7 @@ private static boolean validateVecFileHeader(Path path) { } } + /// Creates an S3 async client builder configured for anonymous access to US-EAST-1. 
private static S3AsyncClientBuilder s3AsyncClientBuilder() { return S3AsyncClient.builder() .region(Region.US_EAST_1) @@ -161,6 +178,8 @@ private static S3AsyncClientBuilder s3AsyncClientBuilder() { .credentialsProvider(AnonymousCredentialsProvider.create()); } + /// Describes a dataset stored as three separate fvec/ivec files (base vectors, query + /// vectors, and ground truth) in an S3 bucket. Known datasets are registered in {@link #byName}. public static class MultiFileDatasource { public final String name; public final Path basePath; @@ -175,19 +194,25 @@ public MultiFileDatasource(String name, String basePath, String queriesPath, Str this.groundTruthPath = Paths.get(groundTruthPath); } + /// Returns the parent directory of the base vectors file. public Path directory() { return basePath.getParent(); } + /// Returns the three file paths (base, queries, ground truth) that comprise this dataset. public Iterable paths() { return List.of(basePath, queriesPath, groundTruthPath); } - public DataSet load() { + /// Reads the fvec/ivec files from disk and returns a scrubbed {@link DataSet}. 
+ /// + /// @param similarityFunction the similarity function to associate with the dataset + /// @return the loaded and scrubbed dataset + public DataSet load(VectorSimilarityFunction similarityFunction) { var baseVectors = SiftLoader.readFvecs("fvec/" + basePath); var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath); var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath); - return DataSetUtils.getScrubbedDataSet(name, VectorSimilarityFunction.COSINE, baseVectors, queryVectors, gtVectors); + return DataSetUtils.getScrubbedDataSet(name, similarityFunction, baseVectors, queryVectors, gtVectors); } public static Map byName = new HashMap<>() {{ diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java new file mode 100644 index 000000000..e8305a3ce --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java @@ -0,0 +1,103 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.github.jbellis.jvector.example.benchmarks.datasets; + +import org.yaml.snakeyaml.Yaml; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +/// Reads dataset metadata from a multi-entry YAML file and provides keyed lookups +/// for {@link DataSetProperties}. +/// +/// This is used by loaders such as {@link DataSetLoaderMFD} and {@link DataSetLoaderHDF5} +/// that do not have an intrinsic way to determine the similarity function from the dataset +/// name or file format alone. +/// +/// The YAML file maps dataset keys to their metadata properties using the same key names +/// as the {@code KEY_*} constants on {@link DataSetProperties}: +/// ```yaml +/// ada002-100k: +/// similarity_function: COSINE +/// is_normalized: true +/// ``` +/// +/// For single-entry lookups without caching, use +/// {@link DataSetProperties.PropertyMap#PropertyMap(String, String)} directly. +/// This class is useful when the same file is queried repeatedly for different keys. +/// +/// Keys may or may not include file extensions (e.g. {@code .hdf5}). The lookup tries +/// the exact key first, then falls back to the key with {@code .hdf5} appended. +public class DataSetMetadataReader { + + private static final String DEFAULT_FILE = "jvector-examples/yaml-configs/dataset_metadata.yml"; + + private final Map> metadata; + + private DataSetMetadataReader(Map> metadata) { + this.metadata = metadata; + } + + /// Loads dataset metadata from the default file ({@code jvector-examples/yaml-configs/dataset_metadata.yml}). + /// + /// @return the loaded metadata + /// @throws RuntimeException if the file cannot be read + public static DataSetMetadataReader load() { + return load(DEFAULT_FILE); + } + + /// Loads dataset metadata from the specified file. 
+ /// + /// @param file path to the YAML metadata file + /// @return the loaded metadata + /// @throws RuntimeException if the file cannot be read + @SuppressWarnings("unchecked") + public static DataSetMetadataReader load(String file) { + try (InputStream inputStream = new FileInputStream(file)) { + Yaml yaml = new Yaml(); + Map> data = yaml.load(inputStream); + return new DataSetMetadataReader(data); + } catch (IOException e) { + throw new RuntimeException("Failed to load dataset metadata from " + file, e); + } + } + + /// Looks up the {@link DataSetProperties} for a dataset by key. + /// + /// The lookup tries the exact key first, then the key with {@code .hdf5} appended. + /// The YAML entry is wrapped in a {@link DataSetProperties.PropertyMap} with the dataset + /// name injected. Properties not present in the YAML default to empty/false/zero. + /// + /// @param datasetKey the dataset name or filename to look up + /// @return the dataset properties if an entry exists, or empty if no entry is found + public Optional getProperties(String datasetKey) { + Map entry = metadata.get(datasetKey); + if (entry == null) { + entry = metadata.get(datasetKey + ".hdf5"); + } + if (entry == null) { + return Optional.empty(); + } + var props = new HashMap<>(entry); + props.putIfAbsent(DataSetProperties.KEY_NAME, datasetKey); + return Optional.of(new DataSetProperties.PropertyMap(props)); + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java new file mode 100644 index 000000000..7b6012bb1 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java @@ -0,0 +1,210 @@ +package io.github.jbellis.jvector.example.benchmarks.datasets; + +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import org.yaml.snakeyaml.Yaml; + 
+import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +/** + * The essential properties of a vector dataset which make it valid for indexing, querying, and testing in general. + * These properties describe the base facet of a whole dataset, and are not defined for query vectors, distances, etc. + * For facet-by-facet (base, query, distance) properties, a different interface will be provided. + */ +public interface DataSetProperties { + + /// Canonical key for the dataset name ({@link String}). + String KEY_NAME = "name"; + + /// Canonical key for the similarity function ({@link VectorSimilarityFunction} or its name as a {@link String}). + String KEY_SIMILARITY_FUNCTION = "similarity_function"; + + /// Canonical key for the number of base vectors ({@link Integer}). + String KEY_NUM_VECTORS = "num_vectors"; + + /// Canonical key for whether the dataset is normalized ({@link Boolean}). + String KEY_IS_NORMALIZED = "is_normalized"; + + /// Canonical key for whether the dataset is free of zero vectors ({@link Boolean}). + String KEY_IS_ZERO_VECTOR_FREE = "is_zero_vector_free"; + + /// Canonical key for whether the dataset is free of duplicate vectors ({@link Boolean}). + String KEY_IS_DUPLICATE_VECTOR_FREE = "is_duplicate_vector_free"; + + /** + * Returns the similarity function for this dataset. + * + * @return the similarity function, or empty if not configured + */ + Optional similarityFunction(); + + /** + * Get the number of (base) vectors in this dataset. + * @return the number of base vectors in this dataset. + */ + public int numVectors(); + + /** + * Get the name of the dataset + * @return the name of the dataset + */ + public String getName(); + + /** + * Has this dataset been normalized? + * It is an error for this to return true when the vectors are not normalized. 
+ * It is acceptable to return false when the dataset is not known to be normalized. + * @return true if this dataset has been normalized, false otherwise. + */ + public boolean isNormalized(); + + /** + * Has this dataset been verified or corrected to contain no zero-vectors, i.e., vectors with all components set to zero? + * Vectors with some zero components are deemed valid, but vectors with all components set to zero are not. + * It is an error for this to return true when there is a single zero-vector. + * It is acceptable for it to return false when the dataset is not known to be zero vector free. + * @return true if the dataset is known to have no zero vectors + */ + public boolean isZeroVectorFree(); + + /** + * Vectors in this dataset must be distinct or this method should return false. + * Datasets which have duplicate values are assumed to have other issues with embedding, process controls, etc. + * Further, graph construction algorithms are sensitive to vector identity being stable between ordinals and values. + * It is an error for this to return true when there are duplicate vectors. + * It is acceptable for it to return false when the dataset is not known to be duplicate vector free. + * @return true, if all vectors in this dataset are distinct. + */ + public boolean isDuplicateVectorFree(); + + /** + * A convenience method to capture the notion of a valid dataset. + * As any additional qualifiers are added to this data carrier, this method should be updated accordingly. + * @return true, if the dataset is known to be valid for indexing and querying. + */ + default boolean isValid() { + return isZeroVectorFree() && isDuplicateVectorFree(); + } + + /// A {@link DataSetProperties} implementation backed by a {@code Map}. + /// + /// Property keys use the {@code KEY_*} constants defined on {@link DataSetProperties}. + /// Missing or null values fall back to safe defaults (empty optional, zero, or false). 
+ /// + /// A {@code PropertyMap} can be constructed directly from a map, or loaded from a YAML + /// file with an optional document key to select a named top-level entry. + /// + /// ### Examples + /// ```java + /// // From a map + /// var props = new DataSetProperties.PropertyMap(Map.of( + /// DataSetProperties.KEY_NAME, "ada002-100k", + /// DataSetProperties.KEY_SIMILARITY_FUNCTION, VectorSimilarityFunction.COSINE + /// )); + /// + /// // From a YAML file, selecting a named entry + /// var props = new DataSetProperties.PropertyMap("dataset_metadata.yml", "ada002-100k"); + /// + /// // From a flat YAML file (no top-level key) + /// var props = new DataSetProperties.PropertyMap("my_dataset.yml", null); + /// ``` + class PropertyMap implements DataSetProperties { + + private final Map properties; + + /// Creates a new instance backed by the given map. + /// + /// @param properties the property map; keys should use the {@code KEY_*} constants + /// from {@link DataSetProperties} + public PropertyMap(Map properties) { + this.properties = properties; + } + + /// Loads properties from a YAML file. + /// + /// If {@code documentKey} is non-null and non-empty, the YAML document is expected + /// to be a map of maps, and the entry at that key is used as the properties. The + /// document key is also set as the {@link DataSetProperties#KEY_NAME} if no explicit + /// name is present. + /// + /// If {@code documentKey} is null or empty, the entire YAML document is treated as + /// the property map. 
+ /// + /// @param yamlFile path to a {@code .yml} or {@code .yaml} file + /// @param documentKey the top-level key to select, or null/empty to use the whole document + /// @throws IllegalArgumentException if the file does not end in {@code .yml} or {@code .yaml} + /// @throws RuntimeException if the file cannot be read or parsed + @SuppressWarnings("unchecked") + public PropertyMap(String yamlFile, String documentKey) { + if (!yamlFile.endsWith(".yml") && !yamlFile.endsWith(".yaml")) { + throw new IllegalArgumentException("Expected a .yml or .yaml file, got: " + yamlFile); + } + Map loaded; + try (InputStream in = new FileInputStream(yamlFile)) { + loaded = new Yaml().load(in); + } catch (IOException e) { + throw new RuntimeException("Failed to load YAML from " + yamlFile, e); + } + if (documentKey != null && !documentKey.isEmpty()) { + Object entry = loaded.get(documentKey); + if (entry == null) { + throw new IllegalArgumentException("No entry found for key '" + documentKey + "' in " + yamlFile); + } + if (!(entry instanceof Map)) { + throw new IllegalArgumentException("Entry for key '" + documentKey + "' in " + yamlFile + " is not a map"); + } + var props = new HashMap<>((Map) entry); + props.putIfAbsent(KEY_NAME, documentKey); + this.properties = props; + } else { + this.properties = loaded != null ? 
loaded : Map.of(); + } + } + + @Override + public Optional similarityFunction() { + var value = properties.get(KEY_SIMILARITY_FUNCTION); + if (value instanceof VectorSimilarityFunction) { + return Optional.of((VectorSimilarityFunction) value); + } + if (value instanceof String) { + return Optional.of(VectorSimilarityFunction.valueOf((String) value)); + } + return Optional.empty(); + } + + @Override + public int numVectors() { + var value = properties.get(KEY_NUM_VECTORS); + if (value instanceof Number) { + return ((Number) value).intValue(); + } + return 0; + } + + @Override + public String getName() { + var value = properties.get(KEY_NAME); + return value != null ? value.toString() : ""; + } + + @Override + public boolean isNormalized() { + return Boolean.TRUE.equals(properties.get(KEY_IS_NORMALIZED)); + } + + @Override + public boolean isZeroVectorFree() { + return Boolean.TRUE.equals(properties.get(KEY_IS_ZERO_VECTOR_FREE)); + } + + @Override + public boolean isDuplicateVectorFree() { + return Boolean.TRUE.equals(properties.get(KEY_IS_DUPLICATE_VECTOR_FREE)); + } + } +} diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java new file mode 100644 index 000000000..8ed60e5bb --- /dev/null +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java @@ -0,0 +1,404 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.jbellis.jvector.example.benchmarks.datasets; + +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import org.junit.Test; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import static org.junit.jupiter.api.Assertions.*; + +public class DataSetPropertiesTest { + + private static String testResource(String name) { + return Objects.requireNonNull( + DataSetPropertiesTest.class.getResource(name), + "Test resource not found: " + name + ).getPath(); + } + + // ======================================================================== + // PropertyMap from Map — happy paths + // ======================================================================== + + @Test + public void propertyMapFromFullMap() { + var props = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_NAME, "test-ds", + DataSetProperties.KEY_SIMILARITY_FUNCTION, VectorSimilarityFunction.COSINE, + DataSetProperties.KEY_NUM_VECTORS, 42000, + DataSetProperties.KEY_IS_NORMALIZED, true, + DataSetProperties.KEY_IS_ZERO_VECTOR_FREE, true, + DataSetProperties.KEY_IS_DUPLICATE_VECTOR_FREE, true + )); + assertEquals("test-ds", props.getName()); + assertEquals(VectorSimilarityFunction.COSINE, props.similarityFunction().orElse(null)); + assertEquals(42000, props.numVectors()); + assertTrue(props.isNormalized()); + 
assertTrue(props.isZeroVectorFree()); + assertTrue(props.isDuplicateVectorFree()); + assertTrue(props.isValid()); + } + + @Test + public void propertyMapSimilarityFunctionFromString() { + var props = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_SIMILARITY_FUNCTION, "EUCLIDEAN" + )); + assertEquals(VectorSimilarityFunction.EUCLIDEAN, props.similarityFunction().orElse(null)); + } + + @Test + public void propertyMapSimilarityFunctionFromEnum() { + var props = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_SIMILARITY_FUNCTION, VectorSimilarityFunction.DOT_PRODUCT + )); + assertEquals(VectorSimilarityFunction.DOT_PRODUCT, props.similarityFunction().orElse(null)); + } + + @Test + public void propertyMapDefaults() { + var props = new DataSetProperties.PropertyMap(Map.of()); + assertEquals("", props.getName()); + assertTrue(props.similarityFunction().isEmpty()); + assertEquals(0, props.numVectors()); + assertFalse(props.isNormalized()); + assertFalse(props.isZeroVectorFree()); + assertFalse(props.isDuplicateVectorFree()); + assertFalse(props.isValid()); + } + + @Test + public void propertyMapNumVectorsFromLong() { + var props = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_NUM_VECTORS, 99999L + )); + assertEquals(99999, props.numVectors()); + } + + @Test + public void propertyMapIsValidRequiresBothFlags() { + var onlyZeroFree = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_IS_ZERO_VECTOR_FREE, true + )); + assertFalse(onlyZeroFree.isValid()); + + var onlyDedupe = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_IS_DUPLICATE_VECTOR_FREE, true + )); + assertFalse(onlyDedupe.isValid()); + + var both = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_IS_ZERO_VECTOR_FREE, true, + DataSetProperties.KEY_IS_DUPLICATE_VECTOR_FREE, true + )); + assertTrue(both.isValid()); + } + + // ======================================================================== + // PropertyMap 
from Map — adversarial inputs + // ======================================================================== + + @Test + public void propertyMapInvalidSimilarityFunctionString() { + var props = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_SIMILARITY_FUNCTION, "NOT_A_REAL_FUNCTION" + )); + assertThrows(IllegalArgumentException.class, props::similarityFunction); + } + + @Test + public void propertyMapSimilarityFunctionWrongType() { + // An Integer is neither String nor VectorSimilarityFunction + var props = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_SIMILARITY_FUNCTION, 12345 + )); + assertTrue(props.similarityFunction().isEmpty()); + } + + @Test + public void propertyMapNumVectorsWrongType() { + var props = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_NUM_VECTORS, "not-a-number" + )); + assertEquals(0, props.numVectors()); + } + + @Test + public void propertyMapBooleanFieldsIgnoreNonBooleans() { + // String "true" is not Boolean.TRUE + var map = new HashMap(); + map.put(DataSetProperties.KEY_IS_NORMALIZED, "true"); + map.put(DataSetProperties.KEY_IS_ZERO_VECTOR_FREE, 1); + map.put(DataSetProperties.KEY_IS_DUPLICATE_VECTOR_FREE, "yes"); + var props = new DataSetProperties.PropertyMap(map); + assertFalse(props.isNormalized()); + assertFalse(props.isZeroVectorFree()); + assertFalse(props.isDuplicateVectorFree()); + } + + @Test + public void propertyMapNullValuesInMap() { + var map = new HashMap(); + map.put(DataSetProperties.KEY_NAME, null); + map.put(DataSetProperties.KEY_SIMILARITY_FUNCTION, null); + map.put(DataSetProperties.KEY_NUM_VECTORS, null); + map.put(DataSetProperties.KEY_IS_NORMALIZED, null); + var props = new DataSetProperties.PropertyMap(map); + assertEquals("", props.getName()); + assertTrue(props.similarityFunction().isEmpty()); + assertEquals(0, props.numVectors()); + assertFalse(props.isNormalized()); + } + + // ======================================================================== + // 
PropertyMap from YAML — keyed entry + // ======================================================================== + + @Test + public void yamlKeyedEntryFullProperties() { + var props = new DataSetProperties.PropertyMap(testResource("multi_entry.yml"), "ada002-100k"); + assertEquals("ada002-100k", props.getName()); + assertEquals(VectorSimilarityFunction.COSINE, props.similarityFunction().orElse(null)); + assertEquals(100000, props.numVectors()); + assertTrue(props.isNormalized()); + assertTrue(props.isZeroVectorFree()); + assertTrue(props.isDuplicateVectorFree()); + assertTrue(props.isValid()); + } + + @Test + public void yamlKeyedEntryMinimalProperties() { + var props = new DataSetProperties.PropertyMap(testResource("multi_entry.yml"), "minimal-entry"); + assertEquals("minimal-entry", props.getName()); + assertEquals(VectorSimilarityFunction.DOT_PRODUCT, props.similarityFunction().orElse(null)); + assertEquals(0, props.numVectors()); + assertFalse(props.isNormalized()); + assertFalse(props.isValid()); + } + + @Test + public void yamlKeyedEntryExplicitNameOverridesDocumentKey() { + var props = new DataSetProperties.PropertyMap(testResource("multi_entry.yml"), "has-explicit-name"); + assertEquals("custom-name", props.getName(), "Explicit name in YAML should take precedence over document key"); + } + + @Test + public void yamlKeyedEntryNameDefaultsToDocumentKey() { + var props = new DataSetProperties.PropertyMap(testResource("multi_entry.yml"), "sift-128-euclidean"); + assertEquals("sift-128-euclidean", props.getName()); + } + + // ======================================================================== + // PropertyMap from YAML — flat document (null/empty key) + // ======================================================================== + + @Test + public void yamlFlatDocumentNullKey() { + var props = new DataSetProperties.PropertyMap(testResource("flat_entry.yml"), null); + assertEquals("flat-dataset", props.getName()); + 
assertEquals(VectorSimilarityFunction.EUCLIDEAN, props.similarityFunction().orElse(null)); + assertEquals(50000, props.numVectors()); + assertFalse(props.isNormalized()); + assertTrue(props.isZeroVectorFree()); + } + + @Test + public void yamlFlatDocumentEmptyKey() { + var props = new DataSetProperties.PropertyMap(testResource("flat_entry.yml"), ""); + assertEquals("flat-dataset", props.getName()); + assertEquals(VectorSimilarityFunction.EUCLIDEAN, props.similarityFunction().orElse(null)); + } + + // ======================================================================== + // PropertyMap from YAML — adversarial / error cases + // ======================================================================== + + @Test + public void yamlRejectsNonYamlExtension() { + assertThrows(IllegalArgumentException.class, () -> + new DataSetProperties.PropertyMap("data.json", null)); + assertThrows(IllegalArgumentException.class, () -> + new DataSetProperties.PropertyMap("data.txt", "key")); + assertThrows(IllegalArgumentException.class, () -> + new DataSetProperties.PropertyMap("data.yml.bak", null)); + } + + @Test + public void yamlAcceptsBothExtensions() { + // .yml is tested above; verify .yaml also passes the extension check + // (will fail at file-not-found, not at extension check) + var ex = assertThrows(RuntimeException.class, () -> + new DataSetProperties.PropertyMap("/nonexistent/path.yaml", null)); + assertTrue(ex.getMessage().contains("Failed to load YAML"), ex.getMessage()); + } + + @Test + public void yamlNonexistentFile() { + assertThrows(RuntimeException.class, () -> + new DataSetProperties.PropertyMap("/no/such/file.yml", null)); + } + + @Test + public void yamlMissingDocumentKey() { + var ex = assertThrows(IllegalArgumentException.class, () -> + new DataSetProperties.PropertyMap(testResource("multi_entry.yml"), "no-such-dataset")); + assertTrue(ex.getMessage().contains("no-such-dataset"), ex.getMessage()); + } + + @Test + public void yamlDocumentKeyPointsToNonMap() 
{ + var ex = assertThrows(IllegalArgumentException.class, () -> + new DataSetProperties.PropertyMap(testResource("scalar_entry.yml"), "some_key")); + assertTrue(ex.getMessage().contains("not a map"), ex.getMessage()); + } + + @Test + public void yamlEmptyDocument() { + var props = new DataSetProperties.PropertyMap(testResource("empty.yml"), null); + assertEquals("", props.getName()); + assertTrue(props.similarityFunction().isEmpty()); + } + + // ======================================================================== + // DataSetMetadataReader + // ======================================================================== + + @Test + public void metadataReaderLooksUpExactKey() { + var reader = DataSetMetadataReader.load(testResource("multi_entry.yml")); + var props = reader.getProperties("ada002-100k"); + assertTrue(props.isPresent()); + assertEquals("ada002-100k", props.get().getName()); + assertEquals(VectorSimilarityFunction.COSINE, props.get().similarityFunction().orElse(null)); + } + + @Test + public void metadataReaderReturnsEmptyForUnknownKey() { + var reader = DataSetMetadataReader.load(testResource("multi_entry.yml")); + assertTrue(reader.getProperties("does-not-exist").isEmpty()); + } + + @Test + public void metadataReaderFallsBackToHdf5Suffix() { + // multi_entry.yml has "sift-128-euclidean" as a key (no .hdf5 suffix) + // The reader should NOT find "sift-128-euclidean" via hdf5 fallback since the key exists directly + var reader = DataSetMetadataReader.load(testResource("multi_entry.yml")); + var props = reader.getProperties("sift-128-euclidean"); + assertTrue(props.isPresent()); + assertEquals(VectorSimilarityFunction.EUCLIDEAN, props.get().similarityFunction().orElse(null)); + } + + @Test + public void metadataReaderNonexistentFile() { + assertThrows(RuntimeException.class, () -> + DataSetMetadataReader.load("/no/such/file.yml")); + } + + // ======================================================================== + // DataSetInfo delegation + // 
======================================================================== + + @Test + public void dataSetInfoDelegatesToBaseProperties() { + var base = new DataSetProperties.PropertyMap(Map.of( + DataSetProperties.KEY_NAME, "delegate-test", + DataSetProperties.KEY_SIMILARITY_FUNCTION, VectorSimilarityFunction.EUCLIDEAN, + DataSetProperties.KEY_NUM_VECTORS, 7777, + DataSetProperties.KEY_IS_NORMALIZED, true, + DataSetProperties.KEY_IS_ZERO_VECTOR_FREE, true, + DataSetProperties.KEY_IS_DUPLICATE_VECTOR_FREE, true + )); + var info = new DataSetInfo(base, () -> { throw new AssertionError("loader should not be called"); }); + + assertEquals("delegate-test", info.getName()); + assertEquals(VectorSimilarityFunction.EUCLIDEAN, info.similarityFunction().orElse(null)); + assertEquals(7777, info.numVectors()); + assertTrue(info.isNormalized()); + assertTrue(info.isZeroVectorFree()); + assertTrue(info.isDuplicateVectorFree()); + assertTrue(info.isValid()); + } + + @Test + public void dataSetInfoLazyLoading() { + var callCount = new int[]{0}; + var base = new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, "lazy")); + // Return a dummy non-null sentinel so the cache works (null would defeat the null-check) + var sentinel = new DataSet() { + public int getDimension() { return 0; } + public RandomAccessVectorValues getBaseRavv() { return null; } + public String getName() { return "sentinel"; } + public VectorSimilarityFunction getSimilarityFunction() { return VectorSimilarityFunction.COSINE; } + public List> getBaseVectors() { return Collections.emptyList(); } + public List> getQueryVectors() { return Collections.emptyList(); } + public List> getGroundTruth() { return Collections.emptyList(); } + }; + var info = new DataSetInfo(base, () -> { + callCount[0]++; + return sentinel; + }); + + // Accessing properties should not trigger the loader + info.getName(); + info.similarityFunction(); + info.numVectors(); + assertEquals(0, callCount[0], "Loader should not have 
been called for metadata access"); + + // getDataSet triggers it + info.getDataSet(); + assertEquals(1, callCount[0]); + + // Second call should use cache + info.getDataSet(); + assertEquals(1, callCount[0], "Loader should only be called once"); + } + + // ======================================================================== + // Integration: DataSetMetadataReader loaded from production metadata file + // ======================================================================== + + @Test + public void productionMetadataFileLoadsSuccessfully() { + // This validates the actual dataset_metadata.yml is well-formed + var reader = DataSetMetadataReader.load(); + var props = reader.getProperties("ada002-100k"); + assertTrue(props.isPresent(), "ada002-100k should be in the production metadata file"); + assertEquals(VectorSimilarityFunction.COSINE, props.get().similarityFunction().orElse(null)); + } + + @Test + public void productionMetadataAllEntriesHaveSimilarityFunction() { + var reader = DataSetMetadataReader.load(); + // All entries in the production metadata should have a similarity function + for (var name : new String[]{"cohere-english-v3-100k", "ada002-100k", "openai-v3-small-100k", + "gecko-100k", "openai-v3-large-3072-100k", "openai-v3-large-1536-100k", + "e5-small-v2-100k", "e5-base-v2-100k", "e5-large-v2-100k", + "ada002-1M", "colbert-1M"}) { + var props = reader.getProperties(name); + assertTrue(props.isPresent(), "Missing metadata for " + name); + assertTrue(props.get().similarityFunction().isPresent(), + "Missing similarity_function for " + name); + } + } +} diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java index b36088c8a..668f573ac 100644 --- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java +++ 
b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java @@ -123,8 +123,9 @@ public void testSummaryStatsToString() { " Average Recall@k: %.4f%n" + " Average QPS: %.2f (± %.2f)%n" + " Average Latency: %.2f ms%n" + - " Index Construction Time: %.2f", - 4, 0.85, 1200.0, 0.0, 5.2, 1000000.00); + " Index Construction Time: %.2f%n" + + " Average Nodes Visited: %.2f", + 4, 0.85, 1200.0, 0.2, 5.2, 1000000.00, 100.00); assertEquals(expected, stats.toString()); } diff --git a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/empty.yml b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/empty.yml new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/empty.yml @@ -0,0 +1 @@ + diff --git a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/flat_entry.yml b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/flat_entry.yml new file mode 100644 index 000000000..0f0878ae3 --- /dev/null +++ b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/flat_entry.yml @@ -0,0 +1,5 @@ +name: flat-dataset +similarity_function: EUCLIDEAN +num_vectors: 50000 +is_normalized: false +is_zero_vector_free: true diff --git a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/multi_entry.yml b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/multi_entry.yml new file mode 100644 index 000000000..d4523e1bd --- /dev/null +++ b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/multi_entry.yml @@ -0,0 +1,14 @@ +ada002-100k: + similarity_function: COSINE + is_normalized: true + is_zero_vector_free: true + is_duplicate_vector_free: true + num_vectors: 
100000 +sift-128-euclidean: + similarity_function: EUCLIDEAN + num_vectors: 128000 +minimal-entry: + similarity_function: DOT_PRODUCT +has-explicit-name: + name: custom-name + similarity_function: COSINE diff --git a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/scalar_entry.yml b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/scalar_entry.yml new file mode 100644 index 000000000..f34be08ec --- /dev/null +++ b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/scalar_entry.yml @@ -0,0 +1 @@ +some_key: just_a_string diff --git a/jvector-examples/yaml-configs/dataset_metadata.yml b/jvector-examples/yaml-configs/dataset_metadata.yml new file mode 100644 index 000000000..61b4af679 --- /dev/null +++ b/jvector-examples/yaml-configs/dataset_metadata.yml @@ -0,0 +1,34 @@ +# This file contains the metadata for the datasets (formats) which do not have a dedicated +# metadata facility. The MFD and hdf5 loaders use this file to determine the similarity function, among other things. +# (HDF5 metadata support is moot for us since the runtime support fall short in other ways) +# +# You can put additional metadata here, but it will not be type-safe and reified properly unless there is an accompanying +# change in the DataSetProperties interface and associated implementations. 
+ +cohere-english-v3-100k: + similarity_function: COSINE + # examples of supported properties + # If not present, presumed to be false + # is_normalized: false + # is_zero_vector_free: false + # is_duplicate_vector_free: false +ada002-100k: + similarity_function: COSINE +openai-v3-small-100k: + similarity_function: COSINE +gecko-100k: + similarity_function: COSINE +openai-v3-large-3072-100k: + similarity_function: COSINE +openai-v3-large-1536-100k: + similarity_function: COSINE +e5-small-v2-100k: + similarity_function: COSINE +e5-base-v2-100k: + similarity_function: COSINE +e5-large-v2-100k: + similarity_function: COSINE +ada002-1M: + similarity_function: COSINE +colbert-1M: + similarity_function: COSINE diff --git a/jvector-examples/yaml-configs/datasets.yml b/jvector-examples/yaml-configs/datasets.yml index a35555704..29c8c3f52 100644 --- a/jvector-examples/yaml-configs/datasets.yml +++ b/jvector-examples/yaml-configs/datasets.yml @@ -1,4 +1,3 @@ -neighborhood-watch-100k: - cohere-english-v3-100k - ada002-100k - openai-v3-small-100k @@ -8,10 +7,8 @@ neighborhood-watch-100k: - e5-small-v2-100k - e5-base-v2-100k - e5-large-v2-100k -neighborhood-watch-1M: - ada002-1M - colbert-1M -ann-benchmarks: - glove-25-angular.hdf5 - glove-50-angular.hdf5 - lastfm-64-dot.hdf5 From 7e9185716b55331c7b460c30f736c344d8e04a6b Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 19 Mar 2026 17:39:52 +0000 Subject: [PATCH 6/9] add docs to dataset_metadata --- jvector-examples/yaml-configs/dataset_metadata.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/jvector-examples/yaml-configs/dataset_metadata.yml b/jvector-examples/yaml-configs/dataset_metadata.yml index 61b4af679..6aade63cc 100644 --- a/jvector-examples/yaml-configs/dataset_metadata.yml +++ b/jvector-examples/yaml-configs/dataset_metadata.yml @@ -2,6 +2,11 @@ # metadata facility. The MFD and hdf5 loaders use this file to determine the similarity function, among other things.
# (HDF5 metadata support is moot for us since the runtime support fall short in other ways) # +# Ideally, this metadata is part of the format and access layer for a given dataset format. This file exists because +# the dataset names herein are in a form which does _not_ support proper bundled configuration data with the raw data. +# When possible, these datasets should be provided with another mechanism which fully handles this aspect of dataset +# management so that we don't have to maintain separate parts in different places. +# # You can put additional metadata here, but it will not be type-safe and reified properly unless there is an accompanying # change in the DataSetProperties interface and associated implementations. From dc23e8e53949971877c67221fcb894d63ff16567 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 19 Mar 2026 18:38:49 +0000 Subject: [PATCH 7/9] fix inheritDoc directive --- .../example/benchmarks/datasets/DataSetInfo.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java index 44b68638e..94eb9f011 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java @@ -67,7 +67,7 @@ public DataSetInfo(DataSetProperties baseProperties, Supplier loader) { } /** - * @inheritDoc + * {@inheritDoc} */ @Override public Optional similarityFunction() { @@ -75,7 +75,7 @@ public Optional similarityFunction() { } /** - * @inheritDoc + * {@inheritDoc} */ @Override public int numVectors() { @@ -83,7 +83,7 @@ public int numVectors() { } /** - * @inheritDoc + * {@inheritDoc} */ @Override public String getName() { @@ -91,7 +91,7 @@ public String getName() { } /** - * 
@inheritDoc + * {@inheritDoc} */ @Override public boolean isNormalized() { @@ -99,7 +99,7 @@ public boolean isNormalized() { } /** - * @inheritDoc + * {@inheritDoc} */ @Override public boolean isZeroVectorFree() { @@ -107,7 +107,7 @@ public boolean isZeroVectorFree() { } /** - * @inheritDoc + * {@inheritDoc} */ @Override public boolean isDuplicateVectorFree() { From 04056f11f9a6a23f7b3ef42dd8416c6cce3ff30b Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 19 Mar 2026 18:46:31 +0000 Subject: [PATCH 8/9] update rat settings --- .../benchmarks/datasets/DataSetProperties.java | 16 ++++++++++++++++ .../example/benchmarks/datasets/empty.yml | 14 +++++++++++++- .../example/benchmarks/datasets/flat_entry.yml | 14 ++++++++++++++ .../example/benchmarks/datasets/multi_entry.yml | 14 ++++++++++++++ .../example/benchmarks/datasets/scalar_entry.yml | 14 ++++++++++++++ 5 files changed, 71 insertions(+), 1 deletion(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java index 7b6012bb1..5f02ba790 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java @@ -1,3 +1,19 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io.github.jbellis.jvector.example.benchmarks.datasets; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; diff --git a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/empty.yml b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/empty.yml index 8b1378917..2c9ca172f 100644 --- a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/empty.yml +++ b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/empty.yml @@ -1 +1,13 @@ - +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/flat_entry.yml b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/flat_entry.yml index 0f0878ae3..c84013ddd 100644 --- a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/flat_entry.yml +++ b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/flat_entry.yml @@ -1,3 +1,17 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: flat-dataset similarity_function: EUCLIDEAN num_vectors: 50000 diff --git a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/multi_entry.yml b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/multi_entry.yml index d4523e1bd..def5f019b 100644 --- a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/multi_entry.yml +++ b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/multi_entry.yml @@ -1,3 +1,17 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ ada002-100k: similarity_function: COSINE is_normalized: true diff --git a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/scalar_entry.yml b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/scalar_entry.yml index f34be08ec..8cd162d69 100644 --- a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/scalar_entry.yml +++ b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/scalar_entry.yml @@ -1 +1,15 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + some_key: just_a_string From db4f7d4012de07ab3032b596b6175434e301e5a2 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 19 Mar 2026 19:18:41 +0000 Subject: [PATCH 9/9] restore correct dataset layering --- jvector-examples/yaml-configs/datasets.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jvector-examples/yaml-configs/datasets.yml b/jvector-examples/yaml-configs/datasets.yml index 29c8c3f52..a35555704 100644 --- a/jvector-examples/yaml-configs/datasets.yml +++ b/jvector-examples/yaml-configs/datasets.yml @@ -1,3 +1,4 @@ +neighborhood-watch-100k: - cohere-english-v3-100k - ada002-100k - openai-v3-small-100k @@ -7,8 +8,10 @@ - e5-small-v2-100k - e5-base-v2-100k - e5-large-v2-100k +neighborhood-watch-1M: - ada002-1M - colbert-1M +ann-benchmarks: - glove-25-angular.hdf5 - glove-50-angular.hdf5 - lastfm-64-dot.hdf5