Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions jvector-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<!-- Re-enable tests for this module (a parent presumably sets skip=true) and run them
     with the repository root as the working directory - NOTE(review): looks like this is
     so tests can resolve dataset/fixture files via relative paths; confirm. -->
<skip>false</skip>
<workingDirectory>${project.parent.basedir}</workingDirectory>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ public static void main(String[] args) throws IOException {
try {
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
).getDataSet();
logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size());

String normalizedDatasetName = datasetName;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ private static void execute(Pattern pattern, boolean enableIndexCache, List<Func
for (var datasetName : datasetNames) {
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
).getDataSet();
Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ public static void main(String[] args) throws IOException {
String datasetName = config.dataset;
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Could not load dataset:" + datasetName)
);
).getDataSet();
// Register dataset info the first time we actually load the dataset for benchmarking
artifacts.registerDataset(datasetName, ds);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ public static void main(String[] args) throws IOException {

// Load dataset
var ds = new DataSetLoaderMFD().loadDataSet(datasetName)
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"));
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"))
.getDataSet();

// Run artifacts + selections (sys_info/dataset_info/experiments.csv)
RunArtifacts artifacts = RunArtifacts.open(runCfg, List.of(config));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
 * Copyright DataStax, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.github.jbellis.jvector.example.benchmarks.datasets;

import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Supplier;

/// A lightweight, lazy handle that separates *identifying* a dataset from *loading* its data.
///
/// Metadata such as the dataset name and similarity function are available immediately
/// without any I/O, while the expensive work of reading vectors, deduplicating, scrubbing
/// zero vectors, and normalizing is deferred until the first call to {@link #getDataSet()}.
///
/// This design allows callers to enumerate or filter available datasets cheaply, and
/// ensures that the full load-and-scrub pipeline runs at most once per handle thanks to
/// thread-safe caching.
///
/// Instances are created by {@link DataSetLoader} implementations; callers obtain them
/// through {@link DataSets#loadDataSet(String)}.
///
/// ### Typical usage
/// ```java
/// DataSetInfo info = DataSets.loadDataSet("ada002-100k").orElseThrow();
///
/// // Cheap — no vectors loaded yet
/// System.out.println(info.getName());
/// System.out.println(info.similarityFunction());
///
/// // First call triggers full load; subsequent calls return the cached DataSet
/// DataSet ds = info.getDataSet();
/// ```
///
/// @see DataSet
/// @see DataSetLoader
/// @see DataSets
public class DataSetInfo implements DataSetProperties {
    // Deferred load pipeline; invoked at most once (guarded by getDataSet's locking).
    private final Supplier<DataSet> loader;
    // Cheap, always-available metadata delegate (name, similarity function, scrub flags).
    private final DataSetProperties baseProperties;
    // volatile is required for safe publication under double-checked locking.
    private volatile DataSet cached;

    /// Creates a new dataset info handle.
    ///
    /// The supplied {@code loader} will not be invoked until {@link #getDataSet()} is called.
    /// It should perform the full load-and-scrub pipeline (read vectors, remove duplicates /
    /// zero vectors, filter queries, normalize) and return a ready-to-use {@link DataSet}.
    ///
    /// @param baseProperties the dataset properties (name, similarity function, etc.); must not be null
    /// @param loader a supplier that performs the deferred load; invoked at most once; must not be null
    public DataSetInfo(DataSetProperties baseProperties, Supplier<DataSet> loader) {
        this.baseProperties = Objects.requireNonNull(baseProperties, "baseProperties");
        this.loader = Objects.requireNonNull(loader, "loader");
    }

    /// {@inheritDoc}
    @Override
    public Optional<VectorSimilarityFunction> similarityFunction() {
        return baseProperties.similarityFunction();
    }

    /// {@inheritDoc}
    @Override
    public int numVectors() {
        return baseProperties.numVectors();
    }

    /// {@inheritDoc}
    @Override
    public String getName() {
        return baseProperties.getName();
    }

    /// {@inheritDoc}
    @Override
    public boolean isNormalized() {
        return baseProperties.isNormalized();
    }

    /// {@inheritDoc}
    @Override
    public boolean isZeroVectorFree() {
        return baseProperties.isZeroVectorFree();
    }

    /// {@inheritDoc}
    @Override
    public boolean isDuplicateVectorFree() {
        return baseProperties.isDuplicateVectorFree();
    }

    /// Returns the fully loaded and scrubbed {@link DataSet}.
    ///
    /// On the first invocation this triggers the deferred load pipeline, which may involve
    /// reading large vector files from disk, deduplication, zero-vector removal, and
    /// normalization. The result is cached so that subsequent calls return immediately.
    ///
    /// This method is thread-safe: concurrent callers will block until the first load
    /// completes, after which all callers share the same cached instance.
    ///
    /// @return the ready-to-use {@link DataSet}
    public DataSet getDataSet() {
        // Double-checked locking with a local variable (Effective Java, Item 83):
        // the common already-initialized path performs a single volatile read.
        DataSet result = cached;
        if (result == null) {
            synchronized (this) {
                result = cached;
                if (result == null) {
                    result = loader.get();
                    cached = result;
                }
            }
        }
        return result;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,28 @@
*/
public interface DataSetLoader {
/**
* Implementations of this method <EM>MUST NOT</EM> throw exceptions related to the presence or absence of a
* Looks up a dataset by name and returns a lightweight {@link DataSetInfo} handle.
*
* <p>The returned handle provides the dataset name and similarity function immediately,
* without loading vector data into memory. The full {@link DataSet} (vectors, ground truth,
* etc.) is loaded lazily on the first call to {@link DataSetInfo#getDataSet()}.
*
* <p>Implementations <em>MUST NOT</em> throw exceptions related to the presence or absence of a
* requested dataset. Instead, {@link Optional} should be used. Other errors should still be indicated with
* exceptions as usual, including any errors loading a dataset which has been found. Implementors should reliably
* return from this method, avoiding any {@link System#exit(int)} or similar calls.
* exceptions as usual, including any errors downloading or preparing a dataset which has been found.
* Implementors should reliably return from this method, avoiding any {@link System#exit(int)} or similar calls.
*
* <p>Implementations may perform file downloads or other preparation work before returning the handle,
* but should defer the expensive parsing and scrubbing of vector data to the {@link DataSetInfo} supplier.
*
* <HR/>
*
* Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
* <p>Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
* not found, and info level for when datasets are found and loaded. This can assist users troubleshooting
* diverse data sources.
*
* @param dataSetName
* @return a {@link DataSet}, if found
* @param dataSetName the logical dataset name (not a filename; do not include extensions like {@code .hdf5})
* @return a {@link DataSetInfo} handle for the dataset, if found
*/
Optional<DataSet> loadDataSet(String dataSetName);
Optional<DataSetInfo> loadDataSet(String dataSetName);
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,30 +34,42 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.IntStream;

/**
* This dataset loader will get and load hdf5 files from <a href="https://ann-benchmarks.com/">ann-benchmarks</a>.
*
* <p>The vector similarity function is first inferred from the filename (e.g. {@code -angular},
* {@code -euclidean}). If the filename does not contain a recognized suffix, the loader falls
* back to looking up the dataset in {@code dataset_metadata.yml} via {@link DataSetMetadataReader}.
* If neither source provides a similarity function, an error is thrown.
*/
public class DataSetLoaderHDF5 implements DataSetLoader {
public static final Path HDF5_DIR = Path.of("hdf5");
private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
public static final String HDF5_EXTN = ".hdf5";
private static final DataSetMetadataReader metadata = DataSetMetadataReader.load();

/**
* {@inheritDoc}
*/
public Optional<DataSet> loadDataSet(String datasetName) {
return maybeDownloadHdf5(datasetName).map(this::readHdf5Data);
public Optional<DataSetInfo> loadDataSet(String datasetName) {
return maybeDownloadHdf5(datasetName).map(path -> {
var props = getProperties(datasetName, path);
var similarity = props.similarityFunction()
.orElseThrow(() -> new IllegalArgumentException(
"No similarity function found for HDF5 dataset: " + datasetName
+ ". Either include -angular, -dot, or -euclidean in the filename,"
+ " or add an entry in dataset_metadata.yml"));
return new DataSetInfo(props, () -> readHdf5Data(path, similarity));
});
}

private DataSet readHdf5Data(Path path) {

// infer the similarity
VectorSimilarityFunction similarityFunction = getVectorSimilarityFunction(path);

// read the data
/// Reads base vectors, query vectors, and ground truth from an HDF5 file
/// and returns a scrubbed {@link DataSet}.
private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunction) {
VectorFloat<?>[] baseVectors;
VectorFloat<?>[] queryVectors;
var gtSets = new ArrayList<List<Integer>>();
Expand Down Expand Up @@ -94,27 +106,43 @@ private DataSet readHdf5Data(Path path) {
return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets);
}

/**
* Derive the similarity function from the dataset name.
* @param filename filename of the dataset AKA "name"
* @return The matching similarity function, or throw an error
*/
private static VectorSimilarityFunction getVectorSimilarityFunction(Path filename) {
VectorSimilarityFunction similarityFunction;
if (filename.toString().contains("-angular") || filename.toString().contains("-dot")) {
similarityFunction = VectorSimilarityFunction.COSINE;
/// Derives dataset properties from the filename, falling back to {@link DataSetMetadataReader}.
///
/// The filename is checked first for known suffixes ({@code -angular}, {@code -dot},
/// {@code -euclidean}) to infer the similarity function. If none match, the dataset name
/// is looked up in {@code dataset_metadata.yml}. If neither source provides properties,
/// a minimal {@link DataSetProperties} with an empty similarity function is returned
/// so that the caller can produce a clear error.
///
/// @param datasetName the logical dataset name (without {@code .hdf5} extension)
/// @param filename the resolved file path including the {@code .hdf5} extension
/// @return the dataset properties
private static DataSetProperties getProperties(String datasetName, Path filename) {
String filenameStr = filename.toString();
VectorSimilarityFunction inferred = null;
if (filenameStr.contains("-angular") || filenameStr.contains("-dot")) {
inferred = VectorSimilarityFunction.COSINE;
} else if (filenameStr.contains("-euclidean")) {
inferred = VectorSimilarityFunction.EUCLIDEAN;
}
else if (filename.toString().contains("-euclidean")) {
similarityFunction = VectorSimilarityFunction.EUCLIDEAN;
}
else {
throw new IllegalArgumentException("Unknown similarity function -- expected angular or euclidean for " + filename);

// If filename inference succeeded, build properties with just the SF
if (inferred != null) {
return new DataSetProperties.PropertyMap(Map.of(
DataSetProperties.KEY_NAME, datasetName,
DataSetProperties.KEY_SIMILARITY_FUNCTION, inferred));
}
return similarityFunction;

// Fall back to metadata YAML
return metadata.getProperties(datasetName)
.orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName)));
}

/// Downloads the HDF5 file for the given dataset if it is not already present locally.
///
/// @param datasetName the logical dataset name (without {@code .hdf5} extension)
/// @return the local path to the HDF5 file, or empty if the remote file was not found
private Optional<Path> maybeDownloadHdf5(String datasetName) {

var dsFilePath = HDF5_DIR.resolve(datasetName+HDF5_EXTN);

if (Files.exists(dsFilePath)) {
Expand All @@ -123,7 +151,6 @@ private Optional<Path> maybeDownloadHdf5(String datasetName) {

// Download from https://ann-benchmarks.com/datasetName
var url = "https://ann-benchmarks.com/" + datasetName + HDF5_EXTN;
System.out.println("Downloading: " + url);

HttpURLConnection connection;
while (true) {
Expand All @@ -148,6 +175,7 @@ private Optional<Path> maybeDownloadHdf5(String datasetName) {

try (InputStream in = connection.getInputStream()) {
Files.createDirectories(dsFilePath.getParent());
System.out.println("Downloading: " + url);
Files.copy(in, dsFilePath, StandardCopyOption.REPLACE_EXISTING);
} catch (IOException e) {
throw new RuntimeException("Error downloading data:" + e.getMessage(),e);
Expand Down
Loading
Loading