diff --git a/.github/workflows/auto-jdk-matrix.yml b/.github/workflows/auto-jdk-matrix.yml index f5ba78463..0d387c944 100644 --- a/.github/workflows/auto-jdk-matrix.yml +++ b/.github/workflows/auto-jdk-matrix.yml @@ -20,14 +20,14 @@ jobs: strategy: fail-fast: false matrix: - jdk: [ 24 ] + jdk: [ 25 ] env: JDK_VERSION: ${{ matrix.jdk }} steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: persist-credentials: false @@ -43,7 +43,7 @@ jobs: restore-keys: build-${{ runner.os }}-maven- - name: Install Matrix JDK - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: ${{ matrix.jdk }} distribution: 'temurin' @@ -67,7 +67,7 @@ jobs: -Dgpg.skip=true # Architecture options: x86, x64, armv7, aarch64, ppc64le -# setup-java@v4 has a "with cache" option +# setup-java@v5 has a "with cache" option # Lifecycle: validate, compile, test, package, verify, install, deploy # -B batch mode, never stops for user input # -V show Version without stopping diff --git a/.github/workflows/auto-os-matrix.yml b/.github/workflows/auto-os-matrix.yml index abb9baec3..df3c7135f 100644 --- a/.github/workflows/auto-os-matrix.yml +++ b/.github/workflows/auto-os-matrix.yml @@ -21,7 +21,7 @@ jobs: fail-fast: false matrix: - jdk: [ 24 ] + jdk: [ 25 ] os: [ windows-latest, ubuntu-latest, macos-latest ] include: - os: windows-latest @@ -41,7 +41,7 @@ jobs: steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: persist-credentials: false @@ -53,7 +53,7 @@ jobs: restore-keys: build-${{ runner.os }}-maven- - name: Install Matrix JDK - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: ${{ matrix.jdk }} distribution: 'temurin' diff --git a/.github/workflows/check_cpp_files.yml b/.github/workflows/check_cpp_files.yml index e433dcb87..694aa139d 100644 --- a/.github/workflows/check_cpp_files.yml +++ 
b/.github/workflows/check_cpp_files.yml @@ -16,18 +16,18 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Checkout C++ - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: repository: apache/datasketches-cpp path: cpp - name: Setup Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: - java-version: '24' + java-version: '25' distribution: 'temurin' - name: Configure C++ build diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index d8114578a..0a6de05d9 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -28,14 +28,14 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Setup Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: distribution: 'temurin' cache: 'maven' - java-version: '24' + java-version: '25' - name: Initialize CodeQL uses: github/codeql-action/init@v3 diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index 2fef93616..66bab896a 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -14,12 +14,12 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Setup Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: - java-version: '24' + java-version: '25' distribution: 'temurin' - name: Echo Java Version diff --git a/pom.xml b/pom.xml index 972d21449..e130f2694 100644 --- a/pom.xml +++ b/pom.xml @@ -66,8 +66,6 @@ under the License. - GitHub https://github.com/apache/${project.artifactId}/issues @@ -83,8 +81,6 @@ under the License. - 6.1.0-SNAPSHOT - 7.11.0 @@ -94,11 +90,10 @@ under the License. 
check_cpp_historical_files - 3.9.10 - 24 - -Xmx4g -Duser.language=en -Duser.country=US -Dfile.encoding=UTF-8 - ${java.version} - ${java.version} + 3.9.11 + 25 + + -Xmx4g UTF-8 ${charset.encoding} ${charset.encoding} @@ -108,11 +103,12 @@ under the License. 3.7.1 3.14.0 + 3.8.1 3.1.4 - 3.6.0 + 3.6.1 3.2.8 3.4.2 - 3.11.2 + 3.11.3 3.1.1 3.3.1 @@ -148,6 +144,7 @@ under the License. + org.apache.maven.plugins maven-assembly-plugin ${maven-assembly-plugin.version} @@ -157,12 +154,20 @@ under the License. maven-compiler-plugin ${maven-compiler-plugin.version} + true + ${java.version} - + -J${jvm.options} + + org.apache.maven.plugins + maven-dependency-plugin + ${maven-dependency-plugin.version} + + @@ -184,11 +189,12 @@ under the License. - [24,) + [25,) - [${maven.version},4.0.0) + [${maven.version},) + @@ -232,6 +238,10 @@ under the License. true public all,-missing + ${java.version} + + -J${jvm.options} + @@ -276,7 +286,9 @@ under the License. maven-surefire-plugin ${maven-surefire-failsafe-plugins.version} - ${jvm-arguments} + 1 + true + ${argLine} ${jvm.options} false false true @@ -383,6 +395,18 @@ under the License. + + org.apache.maven.plugins + maven-assembly-plugin + + + org.apache.maven.plugins + maven-compiler-plugin + + + org.apache.maven.plugins + maven-dependency-plugin + org.apache.maven.plugins maven-deploy-plugin @@ -585,25 +609,7 @@ under the License. - + generate-java-files diff --git a/src/main/java/org/apache/datasketches/common/MemorySegmentRequest.java b/src/main/java/org/apache/datasketches/common/MemorySegmentRequest.java index 5bf3253cd..2fe018335 100644 --- a/src/main/java/org/apache/datasketches/common/MemorySegmentRequest.java +++ b/src/main/java/org/apache/datasketches/common/MemorySegmentRequest.java @@ -29,15 +29,13 @@ public interface MemorySegmentRequest { /** - * Request a new MemorySegment with the given newByteSize. + * Request a new heap MemorySegment with the given newByteSize. 
* Because we do not have a reference to an Arena, the default here is to * allocate a new MemorySegment on the heap. It is up to the user to override this as appropriate. - * @param prevSeg the previous MemorySegment to be possibly closed here or by using the separate - * {@link #requestClose requestClose} method. This is included for convenience, it may be null. * @param newByteSize The new byteSize being requested. * @return new MemorySegment with the requested byteSize. */ - default MemorySegment request(final MemorySegment prevSeg, final long newByteSize) { + default MemorySegment request(final long newByteSize) { if (newByteSize > Integer.MAX_VALUE) { throw new SketchesArgumentException("Requested size in bytes exceeds Integer.MAX_VALUE."); } diff --git a/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java b/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java new file mode 100644 index 000000000..650ce4223 --- /dev/null +++ b/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.common; + +import java.lang.foreign.Arena; + +import java.lang.foreign.MemorySegment; +import java.util.Enumeration; +import java.util.concurrent.ConcurrentHashMap; +import java.util.Objects; + +/** + * This is an example of a possible implementation of the MemorySegmentRequest interface + * where all requested segments are allocated off-heap. A local ConcurrentHashMap tracks a newly created + * confined Arena for every new MemorySegment allocated off-heap. This allows individual segments to be freed + * immediately upon receiving the {@link #requestClose(MemorySegment) requestClose(MemorySegment)} call. + */ +public final class MemorySegmentRequestExample implements MemorySegmentRequest { + private final ConcurrentHashMap map = new ConcurrentHashMap<>(); + + /** + * Request a new off-heap MemorySegment with the given newByteSize. + * An internal confined Arena is created to exclusively manage the new segment and it is associated + * with the new segment with a ConcurrentHashMap. + */ + @Override + public synchronized MemorySegment request(final long newByteSize) { + final Arena arena = Arena.ofConfined(); + final MemorySegment seg = arena.allocate(newByteSize); + map.put(seg, arena); + return seg; + + } + + @Override + public synchronized void requestClose(final MemorySegment segKey) { + Objects.requireNonNull(segKey, "MemorySegment segKey must not be null"); + final Arena arena = map.get(segKey); + if (arena != null) { + if (arena.scope().isAlive()) { arena.close(); } + map.remove(segKey); + } else { + //ignore or + //throw new SketchesArgumentException("Given MemorySegment key is not mapped to an Arena!"); + } + } + + /** + * This closes any unclosed, off-heap MemorySegments and removes all mappings from the map. 
+ */ + public synchronized void cleanup() { + for (final Enumeration e = map.elements(); e.hasMoreElements(); ) { + final Arena arena = e.nextElement(); + if (arena.scope().isAlive()) { arena.close(); } + } + map.clear(); + } + +} diff --git a/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExtension.java b/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExtension.java deleted file mode 100644 index d6d1c4371..000000000 --- a/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExtension.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.common; - -import java.lang.foreign.Arena; -import java.lang.foreign.MemorySegment; -import java.util.Enumeration; -import java.util.Hashtable; - -/** - * This is just an example of a possible extension of the MemorySegmentRequest interface. - * You may want to enable the println statements to track the state of the Hashtable. 
- */ -public final class MemorySegmentRequestExtension implements MemorySegmentRequest { - private final Hashtable table = new Hashtable<>(); - - @Override - public synchronized MemorySegment request(final MemorySegment prevSeg, final long newByteSize) { - if (prevSeg.isNative()) { - final Arena arena = Arena.ofConfined(); - final MemorySegment seg = arena.allocate(newByteSize); - table.put(seg, arena); //System.out.println("Add"); - return seg; - } else { - if (newByteSize > Integer.MAX_VALUE) { - throw new SketchesArgumentException("Requested byteSize is greater than Integer.MAX_VALUE."); - } - return MemorySegment.ofArray(new byte[(int)newByteSize]); - } - } - - @Override - public synchronized void requestClose(final MemorySegment prevSeg) { - final Arena arena = table.get(prevSeg); - if ((arena != null) && arena.scope().isAlive()) { - arena.close(); - table.remove(prevSeg); //System.out.println("Remove"); - } //else ignore - } - - /** - * This cleans up any unclosed off-heap MemorySegments. - */ - public synchronized void cleanup() { - for (final Enumeration e = table.elements(); e.hasMoreElements();) { - final Arena arena = e.nextElement(); - if (arena.scope().isAlive()) { - arena.close(); //System.out.println("Closed a remaining Arena in the Hashtable"); - } - } - } - -} diff --git a/src/main/java/org/apache/datasketches/common/MemorySegmentStatus.java b/src/main/java/org/apache/datasketches/common/MemorySegmentStatus.java index 6a4bde853..7e4ebcd38 100644 --- a/src/main/java/org/apache/datasketches/common/MemorySegmentStatus.java +++ b/src/main/java/org/apache/datasketches/common/MemorySegmentStatus.java @@ -20,6 +20,7 @@ package org.apache.datasketches.common; import java.lang.foreign.MemorySegment; +import java.util.Objects; import java.util.Optional; /** @@ -66,6 +67,8 @@ public interface MemorySegmentStatus { * @return true if the two given MemorySegments have to the same backing resource. 
*/ static boolean isSameResource(final MemorySegment seg1, final MemorySegment seg2) { + Objects.requireNonNull(seg1, "MemorySegment seg1 must be non-null."); + Objects.requireNonNull(seg2, "MemorySegment seg2 must be non-null."); final Optional opt = seg1.asOverlappingSlice(seg2); return opt.isPresent(); } diff --git a/src/main/java/org/apache/datasketches/cpc/CpcSketch.java b/src/main/java/org/apache/datasketches/cpc/CpcSketch.java index bd154eb5f..212670c50 100644 --- a/src/main/java/org/apache/datasketches/cpc/CpcSketch.java +++ b/src/main/java/org/apache/datasketches/cpc/CpcSketch.java @@ -19,9 +19,9 @@ package org.apache.datasketches.cpc; -import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.lang.Math.log; import static java.lang.Math.sqrt; +import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.datasketches.common.Util.LS; import static org.apache.datasketches.common.Util.invPow2; @@ -81,7 +81,7 @@ public final class CpcSketch { byte[] slidingWindow; //either null or size K bytes PairTable pairTable; //for sparse and surprising values, either null or variable size - //The following variables are only valid in HIP varients + //The following variables are only valid in HIP variants double kxp; //used with HIP double hipEstAccum; //used with HIP diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/package-info.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/package-info.java index 5cc6f28fe..4823fcca7 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/package-info.java +++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/package-info.java @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ + /** * BloomFilter package */ diff --git a/src/main/java/org/apache/datasketches/hll/HllSketch.java b/src/main/java/org/apache/datasketches/hll/HllSketch.java index 9709ad69d..35d782a27 100644 --- a/src/main/java/org/apache/datasketches/hll/HllSketch.java +++ b/src/main/java/org/apache/datasketches/hll/HllSketch.java @@ -238,7 +238,7 @@ static final HllSketch heapify(final MemorySegment srcSeg, final boolean checkRe *

The given dstSeg is checked for the required capacity as determined by * {@link #getMaxUpdatableSerializationBytes(int, TgtHllType)}. * @param srcWseg an writable image of a valid source sketch with data. - * @return an HllSketch where the sketch data is in the given dstSeg. + * @return an HllSketch where the sketch data is in the given srcWseg. */ public static final HllSketch writableWrap(final MemorySegment srcWseg) { if (srcWseg.isReadOnly()) { return wrap(srcWseg); } @@ -251,7 +251,7 @@ static final HllSketch writableWrap( final MemorySegment srcWseg, final boolean checkBounds(0, 8, srcWseg.byteSize()); //need min 8 bytes if (extractCompactFlag(srcWseg)) { throw new SketchesArgumentException( - "Cannot perform a writableWrap of a writable sketch image that is in compact form. " + "Cannot perform a writableWrap of a sketch image that is in compact form. " + "Compact sketches are by definition immutable."); } final int lgConfigK = extractLgK(srcWseg); diff --git a/src/main/java/org/apache/datasketches/hll/TgtHllType.java b/src/main/java/org/apache/datasketches/hll/TgtHllType.java index b7f8d45ad..78aaafd1b 100644 --- a/src/main/java/org/apache/datasketches/hll/TgtHllType.java +++ b/src/main/java/org/apache/datasketches/hll/TgtHllType.java @@ -60,7 +60,7 @@ public enum TgtHllType { */ HLL_6, /** - * An Hll Sketch with a bin size of 8 bits + * An HLL Sketch with a bin size of 8 bits */ HLL_8; diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index 45a6ae8bb..0e4b48794 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -144,9 +144,7 @@ public static KllDoublesSketch heapify(final MemorySegment srcSeg) { * @return an instance of this sketch that wraps the given MemorySegment. 
*/ public static KllDoublesSketch wrap(final MemorySegment srcSeg) { - Objects.requireNonNull(srcSeg, "Parameter 'srcSeg' must not be null"); - final KllMemorySegmentValidate segVal = new KllMemorySegmentValidate(srcSeg, DOUBLES_SKETCH); - return new KllDirectDoublesSketch(srcSeg, segVal, null); + return wrap(srcSeg, null); } /** @@ -386,6 +384,8 @@ else if (weight < levelsArr[0]) { /** * Vector update. Updates this sketch with the given array (vector) of items, starting at the items * offset for a length number of items. This is not supported for direct sketches. + *

Note: a single occurrence of a NaN in the array will force this method to use the conventional update path + * rather than the fast update path.

* @param items the vector of items * @param offset the starting index of the items[] array * @param length the number of items diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java index 35c8711ed..d41abb891 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java @@ -144,9 +144,7 @@ public static KllFloatsSketch heapify(final MemorySegment srcSeg) { * @return an instance of this sketch that wraps the given MemorySegment. */ public static KllFloatsSketch wrap(final MemorySegment srcSeg) { - Objects.requireNonNull(srcSeg, "Parameter 'srcSeg' must not be null"); - final KllMemorySegmentValidate segVal = new KllMemorySegmentValidate(srcSeg, FLOATS_SKETCH); - return new KllDirectFloatsSketch(srcSeg, segVal, null); + return wrap(srcSeg, null); } /** @@ -386,6 +384,8 @@ else if (weight < levelsArr[0]) { /** * Vector update. Updates this sketch with the given array (vector) of items, starting at the items * offset for a length number of items. This is not supported for direct sketches. + *

Note: a single occurrence of a NaN in the array will force this method to use the conventional update path + * rather than the fast update path.

* @param items the vector of items * @param offset the starting index of the items[] array * @param length the number of items diff --git a/src/main/java/org/apache/datasketches/kll/KllHelper.java b/src/main/java/org/apache/datasketches/kll/KllHelper.java index 3d784972f..73bfb5283 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllHelper.java @@ -357,7 +357,7 @@ static MemorySegment memorySegmentSpaceMgmt( if (mSegReq == null) { mSegReq = MemorySegmentRequest.DEFAULT; } - final MemorySegment newSeg = mSegReq.request(oldWseg, requiredSketchBytes); + final MemorySegment newSeg = mSegReq.request(requiredSketchBytes); MemorySegment.copy(oldWseg, 0, newSeg, 0, DATA_START_ADR); //copy preamble (first 20 bytes) mSegReq.requestClose(oldWseg); return newSeg; diff --git a/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java b/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java index c5ada70b4..6acf93799 100644 --- a/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java @@ -144,9 +144,7 @@ public static KllLongsSketch heapify(final MemorySegment srcSeg) { * @return an instance of this sketch that wraps the given MemorySegment. */ public static KllLongsSketch wrap(final MemorySegment srcSeg) { - Objects.requireNonNull(srcSeg, "Parameter 'srcSeg' must not be null"); - final KllMemorySegmentValidate segVal = new KllMemorySegmentValidate(srcSeg, LONGS_SKETCH); - return new KllDirectLongsSketch(srcSeg, segVal, null); + return wrap(srcSeg, null); } /** @@ -364,7 +362,7 @@ final void updateMinMax(final long item) { /** * Weighted update. Updates this sketch with the given item the number of times specified by the given integer weight. - * @param item the item to be repeated. NaNs are ignored. + * @param item the item to be repeated. * @param weight the number of times the update of item is to be repeated. 
It must be ≥ one. */ public void update(final long item, final long weight) { @@ -409,6 +407,8 @@ public void update(final long[] items, final int offset, final int length) { + + */ private void updateLong(final long[] srcItems, final int srcOffset, final int length) { if (isEmpty()) { diff --git a/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java b/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java index 4976039d5..168b80b16 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java @@ -358,7 +358,7 @@ private MemorySegment growCombinedSegBuffer(final int itemSpaceNeeded) { mSegReq_ = (mSegReq_ == null) ? MemorySegmentRequest.DEFAULT : mSegReq_; - final MemorySegment newSeg = mSegReq_.request(seg_, needBytes); + final MemorySegment newSeg = mSegReq_.request(needBytes); MemorySegment.copy(seg_, 0, newSeg, 0, segBytes); mSegReq_.requestClose(seg_); return newSeg; diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java index f4991d658..41355f63d 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java @@ -44,26 +44,16 @@ abstract class DoublesSketchAccessor extends DoublesBufferAccessor { final DoublesSketch ds, final boolean forceSize, final int level) { - this(checkLvl(level), ds, forceSize, level); - //SpotBugs CT_CONSTRUCTOR_THROW is false positive. 
- //this construction scheme is compliant with SEI CERT Oracle Coding Standard for Java / OBJ11-J - } - - private DoublesSketchAccessor( - @SuppressWarnings("unused") final boolean secure, //required part of Finalizer Attack prevention - final DoublesSketch ds, - final boolean forceSize, - final int level) { + checkLvl(level); ds_ = ds; forceSize_ = forceSize; setLevel(level); } - private static final boolean checkLvl(final int level) { + private static final void checkLvl(final int level) { if ((level != BB_LVL_IDX) && (level < 0)) { throw new SketchesArgumentException("Parameter level is < 0."); } - return true; } /** diff --git a/src/main/java/org/apache/datasketches/theta/BitPacking.java b/src/main/java/org/apache/datasketches/theta/BitPacking.java index cd7dfe1c9..fb8aa0619 100644 --- a/src/main/java/org/apache/datasketches/theta/BitPacking.java +++ b/src/main/java/org/apache/datasketches/theta/BitPacking.java @@ -24,7 +24,7 @@ /** * Used as part of Theta compression. */ -public final class BitPacking { +final class BitPacking { private BitPacking() { } @@ -36,7 +36,7 @@ private BitPacking() { } * @param bufOffset the byte offset in the buffer * @param bitOffset the bit offset */ - public static void packBits(final long value, int bits, final byte[] buffer, int bufOffset, final int bitOffset) { + static void packBits(final long value, int bits, final byte[] buffer, int bufOffset, final int bitOffset) { if (bitOffset > 0) { final int chunkBits = 8 - bitOffset; final int mask = (1 << chunkBits) - 1; @@ -65,7 +65,7 @@ public static void packBits(final long value, int bits, final byte[] buffer, int * @param bufOffset the buffer offset * @param bitOffset the bit offset */ - public static void unpackBits(final long[] value, final int index, int bits, final byte[] buffer, + static void unpackBits(final long[] value, final int index, int bits, final byte[] buffer, int bufOffset,final int bitOffset) { final int availBits = 8 - bitOffset; final int chunkBits = availBits 
<= bits ? availBits : bits; diff --git a/src/main/java/org/apache/datasketches/theta/CompactOperations.java b/src/main/java/org/apache/datasketches/theta/CompactOperations.java index 926600638..9fb917b24 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactOperations.java +++ b/src/main/java/org/apache/datasketches/theta/CompactOperations.java @@ -29,11 +29,11 @@ import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.SER_VER; import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; @@ -122,7 +122,7 @@ static CompactSketch segmentToCompact( final MemorySegment dstWSeg) { //extract Pre0 fields and Flags from srcMem - final int srcPreLongs = extractPreLongs(srcSeg); + final int srcPreLongs = checkSegPreambleCap(srcSeg); final int srcSerVer = extractSerVer(srcSeg); //not used final int srcFamId = extractFamilyID(srcSeg); final int srcLgArrLongs = extractLgArrLongs(srcSeg); @@ -137,7 +137,7 @@ static CompactSketch segmentToCompact( final boolean srcSingleFlag = (srcFlags & SINGLEITEM_FLAG_MASK) > 0; final boolean single = srcSingleFlag - || SingleItemSketch.otherCheckForSingleItem(srcPreLongs, srcSerVer, srcFamId, srcFlags); + || SingleItemSketch.checkForSingleItem(srcPreLongs, srcSerVer, srcFamId, 
srcFlags); //extract pre1 and pre2 fields final int curCount = single ? 1 : (srcPreLongs > 1) ? extractCurCount(srcSeg) : 0; @@ -319,12 +319,12 @@ static long[] compactCache(final long[] srcCache, final int curCount, * This is checked in all compacting operations. * 7 <1.0 !0 F OK This corresponds to a sketch in estimation mode * - * #4 is handled by correctThetaOnCompat(boolean, int) (below). + * #4 is handled by correctThetaOnCompact(boolean, int) (below). * #2 & #6 handled by checkIllegalCurCountAndEmpty(boolean, int) */ /** - * This corrects a temporary anomalous condition where compact() is called on an UpdateSketch + * This corrects a temporary anomalous condition where compact() or toByteArray() is called on an UpdateSketch * that was initialized with p < 1.0 and update() was never called. In this case Theta < 1.0, * curCount = 0, and empty = true. The correction is to change Theta to 1.0, which makes the * returning sketch empty. This should only be used in the compaction or serialization of an @@ -347,8 +347,8 @@ static long correctThetaOnCompact(final boolean empty, final int curCount, * @param curCount the given current count */ //This handles #2 and #6 above static void checkIllegalCurCountAndEmpty(final boolean empty, final int curCount) { - if (empty && (curCount != 0)) { //this handles #2 and #6 above - throw new SketchesStateException("Illegal State: Empty=true and Current Count != 0."); + if (empty && curCount != 0) { //this handles #2 and #6 above + throw new SketchesStateException("Possible corruption. 
Illegal State: Empty=true and Current Count != 0."); } } diff --git a/src/main/java/org/apache/datasketches/theta/CompactSketch.java b/src/main/java/org/apache/datasketches/theta/CompactSketch.java index edd55165c..aaa751af0 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/CompactSketch.java @@ -22,7 +22,6 @@ import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static java.lang.foreign.ValueLayout.JAVA_SHORT_UNALIGNED; -import static org.apache.datasketches.common.ByteArrayUtil.getShortLE; import static org.apache.datasketches.common.Family.idToFamily; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; @@ -30,17 +29,15 @@ import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.SEED_HASH_SHORT; import static org.apache.datasketches.theta.PreambleUtil.extractEntryBitsV4; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractNumEntriesBytesV4; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLongV4; import static org.apache.datasketches.theta.PreambleUtil.wholeBytesToHoldBits; -import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem; +import static 
org.apache.datasketches.theta.SingleItemSketch.checkForSingleItem; import java.lang.foreign.MemorySegment; @@ -69,16 +66,13 @@ public abstract class CompactSketch extends Sketch { *

The resulting sketch will not retain any link to the source MemorySegment and all of its data will be * copied to the heap CompactSketch.

* - *

This method assumes that the sketch image was created with the correct hash seed, so it is not checked. - * The resulting on-heap CompactSketch will be given the seedHash derived from the given sketch image. - * However, Serial Version 1 sketch images do not have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the DEFAULT_UPDATE_SEED.

+ *

The {@link Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} is assumed.

* * @param srcSeg an image of a CompactSketch. * @return a CompactSketch on the heap. */ public static CompactSketch heapify(final MemorySegment srcSeg) { - return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED, false); + return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -87,9 +81,7 @@ public static CompactSketch heapify(final MemorySegment srcSeg) { *

The resulting sketch will not retain any link to the source MemorySegment and all of its data will be * copied to the heap CompactSketch.

* - *

This method checks if the given expectedSeed was used to create the source MemorySegment image. - * However, SerialVersion 1 sketch images cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.

+ *

This method checks if the given expectedSeed was used to create the source MemorySegment image.

* * @param srcSeg an image of a CompactSketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. @@ -97,10 +89,6 @@ public static CompactSketch heapify(final MemorySegment srcSeg) { * @return a CompactSketch on the heap. */ public static CompactSketch heapify(final MemorySegment srcSeg, final long expectedSeed) { - return heapify(srcSeg, expectedSeed, true); - } - - private static CompactSketch heapify(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { final int serVer = extractSerVer(srcSeg); final int familyID = extractFamilyID(srcSeg); final Family family = idToFamily(familyID); @@ -108,25 +96,18 @@ private static CompactSketch heapify(final MemorySegment srcSeg, final long seed throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } if (serVer == 4) { - return heapifyV4(srcSeg, seed, enforceSeed); + return heapifyV4(srcSeg, expectedSeed); } if (serVer == 3) { final int flags = extractFlags(srcSeg); final boolean srcOrdered = (flags & ORDERED_FLAG_MASK) != 0; final boolean empty = (flags & EMPTY_FLAG_MASK) != 0; - if (enforceSeed && !empty) { PreambleUtil.checkSegmentSeedHash(srcSeg, seed); } + if (!empty) { PreambleUtil.checkSegmentSeedHash(srcSeg, expectedSeed); } return CompactOperations.segmentToCompact(srcSeg, srcOrdered, null); } - //not SerVer 3, assume compact stored form - final short seedHash = Util.computeSeedHash(seed); - if (serVer == 1) { - return ForwardCompatibility.heapify1to3(srcSeg, seedHash); - } - if (serVer == 2) { - return ForwardCompatibility.heapify2to3(srcSeg, - enforceSeed ? 
seedHash : (short) extractSeedHash(srcSeg)); - } - throw new SketchesArgumentException("Unknown Serialization Version: " + serVer); + //not SerVer 3 or 4 + throw new SketchesArgumentException( + "Corrupted: Serialization Version " + serVer + " not recognized."); } /** @@ -134,24 +115,17 @@ private static CompactSketch heapify(final MemorySegment srcSeg, final long seed * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".

- * *

Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

* - *

This method assumes that the sketch image was created with the correct hash seed, so it is not checked. - * However, Serial Version 1 sketch images do not have a seedHash field, - * so the resulting on-heap CompactSketch will be given the hash of the DEFAULT_UPDATE_SEED.

+ *

The {@link Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} is used as the expected seed.

* * @param srcSeg an image of a Sketch. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given MemorySegment. */ public static CompactSketch wrap(final MemorySegment srcSeg) { - return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED, false); + return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -159,47 +133,33 @@ public static CompactSketch wrap(final MemorySegment srcSeg) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".

- * *

Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

* - *

This method checks if the given expectedSeed was used to create the source MemorySegment image. - * However, SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.

+ *

This method checks if the given expectedSeed was used to create the source MemorySegment image.

* * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given MemorySegment. */ public static CompactSketch wrap(final MemorySegment srcSeg, final long expectedSeed) { - return wrap(srcSeg, expectedSeed, true); - } - - private static CompactSketch wrap(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { final int serVer = extractSerVer(srcSeg); final int familyID = extractFamilyID(srcSeg); final Family family = Family.idToFamily(familyID); if (family != Family.COMPACT) { throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } - final short seedHash = Util.computeSeedHash(seed); + final short seedHash = Util.computeSeedHash(expectedSeed); - if (serVer == 4) { - return DirectCompactCompressedSketch.wrapInstance(srcSeg, - enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); - } - else if (serVer == 3) { + + if (serVer == 3) { if (PreambleUtil.isEmptyFlag(srcSeg)) { return EmptyCompactSketch.getHeapInstance(srcSeg); } - if (otherCheckForSingleItem(srcSeg)) { - return SingleItemSketch.heapify(srcSeg, enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); + if (checkForSingleItem(srcSeg)) { + return SingleItemSketch.heapify(srcSeg, seedHash); } //not empty & not singleItem final int flags = extractFlags(srcSeg); @@ -213,91 +173,72 @@ else if (serVer == 3) { throw new SketchesArgumentException( "Corrupted: COMPACT family sketch image must have Read-Only flag set"); } - return DirectCompactSketch.wrapInstance(srcSeg, - enforceSeed ? 
seedHash : (short) extractSeedHash(srcSeg)); - } //end of serVer 3 - else if (serVer == 1) { - return ForwardCompatibility.heapify1to3(srcSeg, seedHash); + return DirectCompactSketch.wrapInstance(srcSeg, seedHash); } - else if (serVer == 2) { - return ForwardCompatibility.heapify2to3(srcSeg, - enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); + if (serVer == 4) { + return DirectCompactCompressedSketch.wrapInstance(srcSeg, seedHash); } + //not SerVer 3 or 4 throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); + "Corrupted: Serialization Version " + serVer + " not recognized."); } /** - * Wrap takes the sketch image in the given MemorySegment and refers to it directly. + * Wrap takes the sketch image in the given byte array and refers to it directly. * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".

+ *

Only sketches that have been explicitly stored as direct sketches can be wrapped.

* *

Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

* - *

This method checks if the DEFAULT_UPDATE_SEED was used to create the source MemorySegment image. - * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of DEFAULT_UPDATE_SEED.

+ *

This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image.

* * @param bytes a byte array image of a Sketch that was created using the DEFAULT_UPDATE_SEED. * - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes) { - return wrap(bytes, Util.DEFAULT_UPDATE_SEED, false); + return wrap(bytes, Util.DEFAULT_UPDATE_SEED); } /** - * Wrap takes the sketch image in the given MemorySegment and refers to it directly. + * Wrap takes the sketch image in the given byte array and refers to it directly. * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".

+ *

Only sketches that have been explicitly stored as direct sketches can be wrapped.

* *

Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

* - *

This method checks if the given expectedSeed was used to create the source MemorySegment image. - * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.

+ *

This method checks if the given expectedSeed was used to create the source byte array image.

* * @param bytes a byte array image of a Sketch that was created using the given expectedSeed. - * @param expectedSeed the seed used to validate the given MemorySegment image. + * @param expectedSeed the seed used to validate the given byte array image. * See Update Hash Seed. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes, final long expectedSeed) { - return wrap(bytes, expectedSeed, true); - } - - private static CompactSketch wrap(final byte[] bytes, final long seed, final boolean enforceSeed) { final int serVer = bytes[PreambleUtil.SER_VER_BYTE]; final int familyId = bytes[PreambleUtil.FAMILY_BYTE]; final Family family = Family.idToFamily(familyId); if (family != Family.COMPACT) { throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } - final short seedHash = Util.computeSeedHash(seed); - if (serVer == 4) { - return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); - } else if (serVer == 3) { + final short seedHash = Util.computeSeedHash(expectedSeed); + + if (serVer == 3) { final int flags = bytes[FLAGS_BYTE]; if ((flags & EMPTY_FLAG_MASK) > 0) { return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); } final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; - if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { - return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? 
seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + if (checkForSingleItem(preLongs, serVer, familyId, flags)) { + return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), seedHash); } //not empty & not singleItem final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; @@ -310,16 +251,14 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo throw new SketchesArgumentException( "Corrupted: COMPACT family sketch image must have Read-Only flag set"); } - return WrappedCompactSketch.wrapInstance(bytes, - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); - } else if (serVer == 1) { - return ForwardCompatibility.heapify1to3(MemorySegment.ofArray(bytes), seedHash); - } else if (serVer == 2) { - return ForwardCompatibility.heapify2to3(MemorySegment.ofArray(bytes), - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + return WrappedCompactSketch.wrapInstance(bytes, seedHash); } + if (serVer ==4) { + return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); + } + //not SerVer 3 or 4 throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); + "Corrupted: Serialization Version " + serVer + " not recognized."); } //Sketch Overrides @@ -446,12 +385,12 @@ private byte[] toByteArrayV4() { return bytes; } - private static CompactSketch heapifyV4(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { - final int preLongs = extractPreLongs(srcSeg); + private static CompactSketch heapifyV4(final MemorySegment srcSeg, final long seed) { + final int preLongs = Sketch.getPreambleLongs(srcSeg); final int entryBits = extractEntryBitsV4(srcSeg); final int numEntriesBytes = extractNumEntriesBytesV4(srcSeg); final short seedHash = (short) extractSeedHash(srcSeg); - if (enforceSeed) { PreambleUtil.checkSegmentSeedHash(srcSeg, seed); } + PreambleUtil.checkSegmentSeedHash(srcSeg, seed); int offsetBytes = 8; long theta = Long.MAX_VALUE; if (preLongs > 1) { 
diff --git a/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java index 6e7cad3c5..b70fdda36 100644 --- a/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java @@ -75,7 +75,9 @@ final class ConcurrentDirectQuickSelectSketch extends DirectQuickSelectSketch final double maxConcurrencyError, final MemorySegment dstSeg) { super(lgNomLongs, seed, 1.0F, //p ResizeFactor.X1, //rf, - dstSeg, false); //unionGadget + dstSeg, + null, + false); //unionGadget volatileThetaLong_ = Long.MAX_VALUE; volatileEstimate_ = 0; @@ -91,6 +93,7 @@ final class ConcurrentDirectQuickSelectSketch extends DirectQuickSelectSketch super(sketch.getLgNomLongs(), seed, 1.0F, //p ResizeFactor.X1, //rf, dstSeg, + null, false); //unionGadget exactLimit_ = ConcurrentSharedThetaSketch.computeExactLimit(1L << getLgNomLongs(), @@ -115,7 +118,7 @@ public double getEstimate() { @Override public boolean isEstimationMode() { - return (getRetainedEntries(false) > exactLimit_) || super.isEstimationMode(); + return getRetainedEntries(false) > exactLimit_ || super.isEstimationMode(); } @Override @@ -164,7 +167,7 @@ public long getExactLimit() { @Override public boolean startEagerPropagation() { while (!sharedPropagationInProgress_.compareAndSet(false, true)) { /* busy wait till free */ } - return (!isEstimationMode());// no eager propagation is allowed in estimation mode + return !isEstimationMode();// no eager propagation is allowed in estimation mode } @Override @@ -206,8 +209,8 @@ public void initBgPropagationService() { public boolean propagate(final AtomicBoolean localPropagationInProgress, final Sketch sketchIn, final long singleHash) { final long epoch = epoch_; - if ((singleHash != NOT_SINGLE_HASH) // namely, is a single hash and - && (getRetainedEntries(false) < exactLimit_)) 
{ // a small sketch then propagate myself (blocking) + if (singleHash != NOT_SINGLE_HASH // namely, is a single hash and + && getRetainedEntries(false) < exactLimit_) { // a small sketch then propagate myself (blocking) if (!startEagerPropagation()) { endPropagation(localPropagationInProgress, true); return false; diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java index 2bf154215..4a3b80839 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java @@ -22,7 +22,6 @@ import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static org.apache.datasketches.theta.PreambleUtil.extractEntryBitsV4; import static org.apache.datasketches.theta.PreambleUtil.extractNumEntriesBytesV4; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLongV4; import static org.apache.datasketches.theta.PreambleUtil.wholeBytesToHoldBits; @@ -70,12 +69,12 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstSe MemorySegment.copy(seg_, 0, dstSeg, 0, getCurrentBytes()); return new DirectCompactSketch(dstSeg); } - return CompactSketch.heapify(seg_); + return CompactSketch.heapify(seg_, Util.DEFAULT_UPDATE_SEED); } @Override public int getCurrentBytes() { - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); final int entryBits = extractEntryBitsV4(seg_); final int numEntriesBytes = extractNumEntriesBytesV4(seg_); return preLongs * Long.BYTES + numEntriesBytes + wholeBytesToHoldBits(getRetainedEntries() * entryBits); @@ -85,11 +84,11 @@ public int getCurrentBytes() { private static final int 
START_PACKED_DATA_ESTIMATION_MODE = 16; @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch // number of entries is stored using variable length encoding // most significant bytes with all zeros are not stored // one byte in the preamble has the number of non-zero bytes used - final int preLongs = extractPreLongs(seg_); // if > 1 then the second long has theta + final int preLongs = Sketch.getPreambleLongs(seg_); // if > 1 then the second long has theta final int numEntriesBytes = extractNumEntriesBytesV4(seg_); int offsetBytes = preLongs > 1 ? START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE; int numEntries = 0; @@ -101,7 +100,7 @@ public int getRetainedEntries(final boolean valid) { //compact is always valid @Override public long getThetaLong() { - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs > 1) ? extractThetaLongV4(seg_) : Long.MAX_VALUE; } @@ -119,7 +118,7 @@ public boolean isOrdered() { public HashIterator iterator() { return new MemorySegmentCompactCompressedHashIterator( seg_, - (extractPreLongs(seg_) > 1 ? START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE) + (Sketch.getPreambleLongs(seg_) > 1 ? 
START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE) + extractNumEntriesBytesV4(seg_), extractEntryBitsV4(seg_), getRetainedEntries() diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java index 2fcbf08d6..f393dc5b8 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java @@ -26,10 +26,9 @@ import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; -import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem; +import static org.apache.datasketches.theta.SingleItemSketch.checkForSingleItem; import java.lang.foreign.MemorySegment; @@ -53,7 +52,7 @@ class DirectCompactSketch extends CompactSketch { /** * Construct this sketch with the given MemorySegment. - * @param seg Read-only MemorySegment object with the order bit properly set. + * @param seg (optional) Read-only MemorySegment object. */ DirectCompactSketch(final MemorySegment seg) { seg_ = seg; @@ -81,22 +80,22 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstSe @Override public int getCurrentBytes() { - if (otherCheckForSingleItem(seg_)) { return 16; } - final int preLongs = extractPreLongs(seg_); + if (checkForSingleItem(seg_)) { return 16; } + final int preLongs = Sketch.getPreambleLongs(seg_); final int curCount = (preLongs == 1) ? 
0 : extractCurCount(seg_); return (preLongs + curCount) << 3; } @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid - if (otherCheckForSingleItem(seg_)) { return 1; } - final int preLongs = extractPreLongs(seg_); + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch + if (checkForSingleItem(seg_)) { return 1; } + final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs == 1) ? 0 : extractCurCount(seg_); } @Override public long getThetaLong() { - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs > 2) ? extractThetaLong(seg_) : Long.MAX_VALUE; } @@ -147,8 +146,8 @@ public byte[] toByteArray() { @Override long[] getCache() { - if (otherCheckForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; } - final int preLongs = extractPreLongs(seg_); + if (checkForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; } + final int preLongs = Sketch.getPreambleLongs(seg_); final int curCount = (preLongs == 1) ? 
0 : extractCurCount(seg_); if (curCount > 0) { final long[] cache = new long[curCount]; @@ -160,12 +159,12 @@ long[] getCache() { @Override int getCompactPreambleLongs() { - return extractPreLongs(seg_); + return Sketch.getPreambleLongs(seg_); } @Override int getCurrentPreambleLongs() { - return extractPreLongs(seg_); + return Sketch.getPreambleLongs(seg_); } @Override diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java index 45b01edba..723b6cc75 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java @@ -33,8 +33,7 @@ import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; -import static org.apache.datasketches.theta.PreambleUtil.getSegBytes; +import static org.apache.datasketches.theta.PreambleUtil.getUpdatableSegBytes; import static org.apache.datasketches.theta.PreambleUtil.insertCurCount; import static org.apache.datasketches.theta.PreambleUtil.insertFamilyID; import static org.apache.datasketches.theta.PreambleUtil.insertFlags; @@ -60,8 +59,10 @@ import java.lang.foreign.MemorySegment; import org.apache.datasketches.common.Family; +import org.apache.datasketches.common.MemorySegmentRequest; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.common.SuppressFBWarnings; import org.apache.datasketches.common.Util; import org.apache.datasketches.thetacommon.HashOperations; import org.apache.datasketches.thetacommon.ThetaUtil; @@ -78,15 +79,27 @@ * @author Kevin Lang */ class DirectQuickSelectSketch extends 
DirectQuickSelectSketchR { + private static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space + int hashTableThreshold_; //computed and mutable, kept only on heap, never serialized. + private final MemorySegmentRequest mSegReq; + /** + * Construct this sketch as a result of a wrap operation where the given MemorySegment already has an updatable sketch image. + * @param wseg the given MemorySegment that has an updatable sketch image. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. + * @param seed See Update Hash Seed. + */ private DirectQuickSelectSketch( - final long seed, - final MemorySegment wseg) { - super(seed, wseg); + final MemorySegment wseg, + final MemorySegmentRequest mSegReq, + final long seed) { + this.mSegReq = mSegReq == null ? MemorySegmentRequest.DEFAULT : mSegReq; + super(wseg, seed); } /** * Construct a new sketch instance and initialize the given MemorySegment as its backing store. + * This is only called internally by other theta sketch classes. * * @param lgNomLongs See lgNomLongs. * @param seed See Update Hash Seed. @@ -96,6 +109,7 @@ private DirectQuickSelectSketch( * See Resize Factor * @param dstSeg the given MemorySegment object destination. It cannot be null. * It will be cleared prior to use. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. * @param unionGadget true if this sketch is implementing the Union gadget function. * Otherwise, it is behaving as a normal QuickSelectSketch. */ @@ -105,43 +119,24 @@ private DirectQuickSelectSketch( final float p, final ResizeFactor rf, final MemorySegment dstSeg, + final MemorySegmentRequest mSegReq, final boolean unionGadget) { - this( - checkSegSize(lgNomLongs, rf, dstSeg, unionGadget), - //SpotBugs CT_CONSTRUCTOR_THROW is false positive. 
- //this construction scheme is compliant with SEI CERT Oracle Coding Standard for Java / OBJ11-J - lgNomLongs, - seed, - p, - rf, - dstSeg, - unionGadget); - } - private DirectQuickSelectSketch( - @SuppressWarnings("unused") final boolean secure, //required part of Finalizer Attack prevention - final int lgNomLongs, - final long seed, - final float p, - final ResizeFactor rf, - final MemorySegment dstSeg, - final boolean unionGadget) { - super(seed, dstSeg); //Choose family, preambleLongs - final Family family; - final int preambleLongs; - if (unionGadget) { - preambleLongs = Family.UNION.getMinPreLongs(); - family = Family.UNION; - } - else { - preambleLongs = Family.QUICKSELECT.getMinPreLongs(); - family = Family.QUICKSELECT; - } + final Family family = unionGadget ? Family.UNION : Family.QUICKSELECT; + final int preambleLongs = unionGadget ? Family.UNION.getMinPreLongs() : Family.QUICKSELECT.getMinPreLongs(); - //Choose RF, minReqBytes, lgArrLongs. + //Set RF, lgArrLongs. final int lgRF = rf.lg(); - final int lgArrLongs = (lgRF == 0) ? lgNomLongs + 1 : ThetaUtil.MIN_LG_ARR_LONGS; + final int lgArrLongs = lgRF == 0 ? 
lgNomLongs + 1 : ThetaUtil.MIN_LG_ARR_LONGS; + + //check Segment capacity + final int minReqBytes = getUpdatableSegBytes(lgArrLongs, preambleLongs); + final long curSegCapBytes = dstSeg.byteSize(); + if (curSegCapBytes < minReqBytes) { + throw new SketchesArgumentException( + "MemorySegment capacity is less than minimum required: " + curSegCapBytes + " < " + minReqBytes); + } //@formatter:off //Build preamble @@ -157,44 +152,34 @@ private DirectQuickSelectSketch( insertP(dstSeg, p); //bytes 12-15 final long thetaLong = (long)(p * LONG_MAX_VALUE_AS_DOUBLE); insertThetaLong(dstSeg, thetaLong); //bytes 16-23 - if (unionGadget) { - insertUnionThetaLong(dstSeg, thetaLong); - } //@formatter:on + if (unionGadget) { insertUnionThetaLong(dstSeg, thetaLong); } + //clear hash table area dstSeg.asSlice(preambleLongs << 3, Long.BYTES << lgArrLongs).fill((byte)0); - hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - } - - private static final boolean checkSegSize( - final int lgNomLongs, final ResizeFactor rf, final MemorySegment dstSeg, final boolean unionGadget) { - final int preambleLongs = (unionGadget) ? Family.UNION.getMinPreLongs() : Family.QUICKSELECT.getMinPreLongs(); - final int lgRF = rf.lg(); - final int lgArrLongs = (lgRF == 0) ? lgNomLongs + 1 : ThetaUtil.MIN_LG_ARR_LONGS; - final int minReqBytes = getSegBytes(lgArrLongs, preambleLongs); - final long curSegCapBytes = dstSeg.byteSize(); - if (curSegCapBytes < minReqBytes) { - throw new SketchesArgumentException( - "MemorySegment capacity is too small: " + curSegCapBytes + " < " + minReqBytes); - } - return true; + this.mSegReq = mSegReq == null ? MemorySegmentRequest.DEFAULT : mSegReq; + super(dstSeg, seed); } /** - * Wrap a sketch around the given source MemorySegment containing sketch data that originated from - * this sketch. + * Wrap a sketch around the given source MemorySegment containing sketch data that originated from this sketch. 
* @param srcSeg The given MemorySegment object must be in hash table form and not read only. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. * @param seed See Update Hash Seed * @return instance of this sketch */ - static DirectQuickSelectSketch writableWrap(final MemorySegment srcSeg, final long seed) { - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + //called from UnionImpl and UpdateSketch + static DirectQuickSelectSketch writableWrap( + final MemorySegment srcSeg, + final MemorySegmentRequest mSegReq, + final long seed) { + final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); + UpdateSketch.checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); if (isResizeFactorIncorrect(srcSeg, lgNomLongs, lgArrLongs)) { @@ -202,8 +187,7 @@ static DirectQuickSelectSketch writableWrap(final MemorySegment srcSeg, final lo insertLgResizeFactor(srcSeg, ResizeFactor.X2.lg()); } - final DirectQuickSelectSketch dqss = - new DirectQuickSelectSketch(seed, srcSeg); + final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(srcSeg, mSegReq, seed); dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -212,15 +196,19 @@ static DirectQuickSelectSketch writableWrap(final MemorySegment srcSeg, final lo * Fast-wrap a sketch around the given source MemorySegment containing sketch data that originated from * this sketch. This does NO validity checking of the given MemorySegment. * @param srcSeg The given MemorySegment must be in hash table form and not read only. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. 
* @param seed See Update Hash Seed * @return instance of this sketch */ - static DirectQuickSelectSketch fastWritableWrap(final MemorySegment srcSeg, final long seed) { + //called from UnionImpl <- Union + static DirectQuickSelectSketch fastWritableWrap( + final MemorySegment srcSeg, + final MemorySegmentRequest mSegReq, + final long seed) { final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - final DirectQuickSelectSketch dqss = - new DirectQuickSelectSketch(seed, srcSeg); + final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(srcSeg, mSegReq, seed); dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -233,7 +221,7 @@ static DirectQuickSelectSketch fastWritableWrap(final MemorySegment srcSeg, fina public UpdateSketch rebuild() { final int lgNomLongs = getLgNomLongs(); final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - if (getRetainedEntries(true) > (1 << lgNomLongs)) { + if (getRetainedEntries(true) > 1 << lgNomLongs) { quickSelectAndRebuild(wseg_, preambleLongs, lgNomLongs); } return this; @@ -274,20 +262,17 @@ UpdateReturnState hashUpdate(final long hash) { final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //The duplicate test - final int index = - HashOperations.hashSearchOrInsertMemorySegment(wseg_, lgArrLongs, hash, preambleLongs << 3); - if (index >= 0) { - return RejectedDuplicate; //Duplicate, not inserted - } + final int index = HashOperations.hashSearchOrInsertMemorySegment(wseg_, lgArrLongs, hash, preambleLongs << 3); + if (index >= 0) { return RejectedDuplicate; } //Duplicate, not inserted + //insertion occurred, increment curCount final int curCount = getRetainedEntries(true) + 1; wseg_.set(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT, curCount); //update curCount if (isOutOfSpace(curCount)) { //we need to do something, we are out of space - if (lgArrLongs > lgNomLongs) { //at 
full size, rebuild - //Assumes no dirty values, changes thetaLong, curCount_ - assert (lgArrLongs == (lgNomLongs + 1)) : "lgArr: " + lgArrLongs + ", lgNom: " + lgNomLongs; + if (lgArrLongs > lgNomLongs) { //at full size, rebuild, assumes no dirty values, changes thetaLong, curCount_ + assert lgArrLongs == lgNomLongs + 1 : "lgArr: " + lgArrLongs + ", lgNom: " + lgNomLongs; //rebuild, refresh curCount based on # values in the hashtable. quickSelectAndRebuild(wseg_, preambleLongs, lgNomLongs); return InsertedCountIncrementedRebuilt; @@ -305,23 +290,45 @@ UpdateReturnState hashUpdate(final long hash) { return InsertedCountIncrementedResized; } //end of Expand in current MemorySegment, exit. - else { - //Request more space, then resize. lgArrLongs will change; thetaLong, curCount will not + else { //Request larger segment, then resize. lgArrLongs will change; thetaLong, curCount will not final int preBytes = preambleLongs << 3; tgtLgArrLongs = Math.min(lgArrLongs + lgRF, lgNomLongs + 1); final int tgtArrBytes = 8 << tgtLgArrLongs; final int reqBytes = tgtArrBytes + preBytes; - final MemorySegment newDstSeg = MemorySegment.ofArray(new byte[reqBytes]); + + final MemorySegment newDstSeg = mSegReq.request(reqBytes); moveAndResize(wseg_, preambleLongs, lgArrLongs, newDstSeg, tgtLgArrLongs, thetaLong); + final MemorySegment oldSeg = wseg_; wseg_ = newDstSeg; + mSegReq.requestClose(oldSeg); hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, tgtLgArrLongs); return InsertedCountIncrementedResized; - } //end of Request more space to resize + } //end of request new segment & resize } //end of resize } //end of isOutOfSpace return InsertedCountIncremented; } + @Override + boolean isOutOfSpace(final int numEntries) { + return numEntries > hashTableThreshold_; + } + + /** + * Returns the cardinality limit given the current size of the hash table array. + * + * @param lgNomLongs See lgNomLongs. + * @param lgArrLongs See lgArrLongs. 
+ * @return the hash table threshold + */ + @SuppressFBWarnings(value = "DB_DUPLICATE_BRANCHES", justification = "False Positive, see the code comments") + protected static final int getOffHeapHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { + //SpotBugs may complain (DB_DUPLICATE_BRANCHES) if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, + //but this allows us to tune these constants for different sketches. + final double fraction = lgArrLongs <= lgNomLongs ? DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; + return (int) (fraction * (1 << lgArrLongs)); + } + } diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index 0a81f4887..f78fbced4 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -28,7 +28,6 @@ import static org.apache.datasketches.theta.CompactOperations.correctThetaOnCompact; import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; import static org.apache.datasketches.theta.PreambleUtil.LG_ARR_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.LG_NOM_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.LG_RESIZE_FACTOR_BIT; import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.P_FLOAT; @@ -37,22 +36,20 @@ import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static 
org.apache.datasketches.theta.PreambleUtil.insertThetaLong; import java.lang.foreign.MemorySegment; +import java.util.Objects; import org.apache.datasketches.common.Family; import org.apache.datasketches.common.MemorySegmentStatus; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesReadOnlyException; -import org.apache.datasketches.common.SuppressFBWarnings; -import org.apache.datasketches.thetacommon.ThetaUtil; /** - * The default Theta Sketch using the QuickSelect algorithm. - * This is the read-only implementation with non-functional methods, which affect the state. + * The read-only Theta Sketch. * *

This implementation uses data in a given MemorySegment that is owned and managed by the caller. * This MemorySegment can be off-heap, which if managed properly will greatly reduce the need for @@ -62,53 +59,67 @@ * @author Kevin Lang */ class DirectQuickSelectSketchR extends UpdateSketch { - static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space - final long seed_; //provided, kept only on heap, never serialized. - int hashTableThreshold_; //computed, kept only on heap, never serialized. - MemorySegment wseg_; //A MemorySegment for child class, but no write methods here - - //only called by DirectQuickSelectSketch and below - DirectQuickSelectSketchR(final long seed, final MemorySegment wseg) { - seed_ = seed; + + /** + * This MemorySegment reference is also used by the writable child DirectQuickSelectSketch. + * + *

When this class is constructed with the writable constructor, called by the writable child DirectQuickSelectSketch, + * this reference can be changed and its contents can be modified.

+ * + *

When this class is constructed with the read-only constructor, called from local factories, this MemorySegment will + * be placed in read-only mode.

+ */ + MemorySegment wseg_; // + + /** + * This writable constructor is only called by the writable child DirectQuickSelectSketch and then this class provides the + * read-only methods for the DirectQuickSelectSketch class. + * @param wseg the writable MemorySegment used by the writable child DirectQuickSelectSketch. + * @param seed the seed for the update function for the writable child DirectQuickSelectSketch. + */ + DirectQuickSelectSketchR(final MemorySegment wseg, final long seed) { + Objects.requireNonNull(wseg, "MemorySegment wseg must not be null"); + super(seed); wseg_ = wseg; } /** - * Wrap a sketch around the given source MemorySegment containing sketch data that originated from - * this sketch. + * This read-only constructor is only called by local factory methods which use this class as a read-only direct sketch. + * @param seed the seed used to validate the internal hashes of the given source MemorySegment. + * @param srcSeg the read-only MemorySegment used by this class in read-only mode. + */ + private DirectQuickSelectSketchR(final long seed, final MemorySegment srcSeg) { + Objects.requireNonNull(srcSeg, "MemorySegment srcSeg must not be null"); + super(seed); + wseg_ = srcSeg.asReadOnly(); + } + + /** + * Wrap a sketch around the given source MemorySegment containing sketch data that originated from this sketch. * @param srcSeg the source MemorySegment. * The given MemorySegment object must be in hash table form and not read only. 
* @param seed See Update Hash Seed * @return instance of this sketch */ static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final long seed) { - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + final int preambleLongs = checkSegPreambleCap(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - - UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); - - final DirectQuickSelectSketchR dqssr = - new DirectQuickSelectSketchR(seed, srcSeg); - dqssr.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - return dqssr; + UpdateSketch.checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); + return new DirectQuickSelectSketchR(seed, srcSeg); } /** * Fast-wrap a sketch around the given source MemorySegment containing sketch data that originated from * this sketch. This does NO validity checking of the given MemorySegment. + * Caller must ensure segment contents are a valid sketch image. * @param srcSeg The given MemorySegment object must be in hash table form and not read only. 
* @param seed See Update Hash Seed * @return instance of this sketch */ static DirectQuickSelectSketchR fastReadOnlyWrap(final MemorySegment srcSeg, final long seed) { - final int lgNomLongs = srcSeg.get(JAVA_BYTE, LG_NOM_LONGS_BYTE) & 0XFF; - final int lgArrLongs = srcSeg.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; - - final DirectQuickSelectSketchR dqss = new DirectQuickSelectSketchR(seed, srcSeg); - dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - return dqss; + return new DirectQuickSelectSketchR(seed, srcSeg); } //Sketch @@ -116,9 +127,9 @@ static DirectQuickSelectSketchR fastReadOnlyWrap(final MemorySegment srcSeg, fin @Override public int getCurrentBytes() { //not compact - final byte lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE); - final int preLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - return (preLongs + (1 << lgArrLongs)) << 3; + final int lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte + final int preLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits + return preLongs + (1 << lgArrLongs) << 3; } @Override @@ -130,12 +141,12 @@ public double getEstimate() { @Override public Family getFamily() { - final int familyID = wseg_.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; + final int familyID = wseg_.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; //mask to byte return Family.idToFamily(familyID); } @Override - public int getRetainedEntries(final boolean valid) { //always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return wseg_.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); } @@ -146,7 +157,7 @@ public long getThetaLong() { @Override public boolean hasMemorySegment() { - return (wseg_ != null) && wseg_.scope().isAlive(); + return wseg_ != null && wseg_.scope().isAlive(); } @Override @@ -161,7 +172,7 @@ public boolean isEmpty() { @Override public boolean isSameResource(final MemorySegment that) { - return 
hasMemorySegment() && MemorySegmentStatus.isSameResource(wseg_, that); + return hasMemorySegment() && MemorySegmentStatus.isSameResource(wseg_, that); //null checks done here } @Override @@ -171,14 +182,14 @@ public HashIterator iterator() { @Override public byte[] toByteArray() { //MY_FAMILY is stored in wseg_ - checkIllegalCurCountAndEmpty(isEmpty(), extractCurCount(wseg_)); + final int curCount = extractCurCount(wseg_); + checkIllegalCurCountAndEmpty(isEmpty(), curCount); final int lengthBytes = getCurrentBytes(); final byte[] byteArray = new byte[lengthBytes]; final MemorySegment seg = MemorySegment.ofArray(byteArray); MemorySegment.copy(wseg_, 0, seg, 0, lengthBytes); - final long thetaLong = - correctThetaOnCompact(isEmpty(), extractCurCount(wseg_), extractThetaLong(wseg_)); - insertThetaLong(wseg_, thetaLong); + final long thetaLong = correctThetaOnCompact(isEmpty(), curCount, extractThetaLong(wseg_)); + insertThetaLong(seg, thetaLong); return byteArray; } @@ -199,11 +210,6 @@ public ResizeFactor getResizeFactor() { return ResizeFactor.getRF(getLgRF()); } - @Override - long getSeed() { - return seed_; - } - @Override public UpdateSketch rebuild() { throw new SketchesReadOnlyException(); @@ -218,8 +224,8 @@ public void reset() { @Override long[] getCache() { - final long lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; - final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; + final long lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte + final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits final long[] cacheArr = new long[1 << lgArrLongs]; MemorySegment.copy(wseg_, JAVA_LONG_UNALIGNED, preambleLongs << 3, cacheArr, 0, 1 << lgArrLongs); return cacheArr; @@ -232,7 +238,7 @@ int getCompactPreambleLongs() { @Override int getCurrentPreambleLongs() { - return PreambleUtil.extractPreLongs(wseg_); + return Sketch.getPreambleLongs(wseg_); } @Override @@ -251,17 +257,17 
@@ boolean isDirty() { } @Override - boolean isOutOfSpace(final int numEntries) { - return numEntries > hashTableThreshold_; + boolean isOutOfSpace(final int numEntries) { //overridden by writable DirectQuickSelectSketch + return false; } @Override int getLgArrLongs() { - return wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; + return wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte } int getLgRF() { //only Direct needs this - return (wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT) & 0X3; + return wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT & 0X3; //mask to 2 bits } @Override @@ -269,19 +275,4 @@ UpdateReturnState hashUpdate(final long hash) { throw new SketchesReadOnlyException(); } - /** - * Returns the cardinality limit given the current size of the hash table array. - * - * @param lgNomLongs See lgNomLongs. - * @param lgArrLongs See lgArrLongs. - * @return the hash table threshold - */ - @SuppressFBWarnings(value = "DB_DUPLICATE_BRANCHES", justification = "False Positive, see the code comments") - protected static final int getOffHeapHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { - //SpotBugs may complain (DB_DUPLICATE_BRANCHES) if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, - //but this allows us to tune these constants for different sketches. - final double fraction = (lgArrLongs <= lgNomLongs) ? 
DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; - return (int) (fraction * (1 << lgArrLongs)); - } - } diff --git a/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java b/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java index 45a17d40d..793ce1763 100644 --- a/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java @@ -86,7 +86,7 @@ public int getCurrentBytes() { public double getEstimate() { return 0; } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return 0; } diff --git a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java b/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java deleted file mode 100644 index 723a8b651..000000000 --- a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.theta; - -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; -import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; -import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; - -import java.lang.foreign.MemorySegment; - -import org.apache.datasketches.common.SketchesArgumentException; - -/** - * Used to convert older serialization versions 1 and 2 to version 3. The Serialization - * Version is the version of the sketch binary image format and should not be confused with the - * version number of the Open Source DataSketches Library. - * - * @author Lee Rhodes - */ -final class ForwardCompatibility { - - private ForwardCompatibility() { } - - /** - * Convert a serialization version (SerVer) 1 sketch (~Feb 2014) to a SerVer 3 sketch. - * Note: SerVer 1 sketches always have (metadata) preamble-longs of 3 and are always stored - * in a compact ordered form, but with 3 different sketch types. All SerVer 1 sketches will - * be converted to a SerVer 3 sketches. There is no concept of p-sampling, no empty bit. - * - * @param srcSeg the image of a SerVer 1 sketch - * - * @param seedHash See Seed Hash. - * The seedHash that matches the seedHash of the original seed used to construct the sketch. - * Note: SerVer 1 sketches do not have the concept of the SeedHash, so the seedHash provided here - * MUST be derived from the actual seed that was used when the SerVer 1 sketches were built. - * @return a SerVer 3 {@link CompactSketch}. 
- */ - static final CompactSketch heapify1to3(final MemorySegment srcSeg, final short seedHash) { - final int segCap = (int) srcSeg.byteSize(); - final int preLongs = extractPreLongs(srcSeg); //always 3 for serVer 1 - if (preLongs != 3) { - throw new SketchesArgumentException("PreLongs must be 3 for SerVer 1: " + preLongs); - } - final int familyId = extractFamilyID(srcSeg); //1,2,3 - if ((familyId < 1) || (familyId > 3)) { - throw new SketchesArgumentException("Family ID (Sketch Type) must be 1 to 3: " + familyId); - } - final int curCount = extractCurCount(srcSeg); - final long thetaLong = extractThetaLong(srcSeg); - final boolean empty = (curCount == 0) && (thetaLong == Long.MAX_VALUE); - - if (empty || (segCap <= 24)) { //return empty - return EmptyCompactSketch.getInstance(); - } - - final int reqCap = (curCount + preLongs) << 3; - validateInputSize(reqCap, segCap); - - if ((thetaLong == Long.MAX_VALUE) && (curCount == 1)) { - final long hash = srcSeg.get(JAVA_LONG_UNALIGNED, preLongs << 3); - return new SingleItemSketch(hash, seedHash); - } - //theta < 1.0 and/or curCount > 1 - - final long[] compactOrderedCache = new long[curCount]; - MemorySegment.copy(srcSeg, JAVA_LONG_UNALIGNED, preLongs << 3, compactOrderedCache, 0, curCount); - return new HeapCompactSketch(compactOrderedCache, false, seedHash, curCount, thetaLong, true); - } - - /** - * Convert a serialization version (SerVer) 2 sketch to a SerVer 3 HeapCompactOrderedSketch. - * Note: SerVer 2 sketches can have metadata-longs of 1,2 or 3 and are always stored - * in a compact ordered form (not as a hash table), but with 4 different sketch types. - * @param srcSeg the image of a SerVer 2 sketch - * @param seedHash See Seed Hash. 
- * The seed used for building the sketch image in srcMem - * @return a SerVer 3 HeapCompactOrderedSketch - */ - static final CompactSketch heapify2to3(final MemorySegment srcSeg, final short seedHash) { - final int segCap = (int) srcSeg.byteSize(); - final int preLongs = extractPreLongs(srcSeg); //1,2 or 3 - final int familyId = extractFamilyID(srcSeg); //1,2,3,4 - if ((familyId < 1) || (familyId > 4)) { - throw new SketchesArgumentException("Family (Sketch Type) must be 1 to 4: " + familyId); - } - int reqBytesIn = 8; - int curCount = 0; - long thetaLong = Long.MAX_VALUE; - if (preLongs == 1) { - reqBytesIn = 8; - validateInputSize(reqBytesIn, segCap); - return EmptyCompactSketch.getInstance(); - } - if (preLongs == 2) { //includes pre0 + count, no theta (== 1.0) - reqBytesIn = preLongs << 3; - validateInputSize(reqBytesIn, segCap); - curCount = extractCurCount(srcSeg); - if (curCount == 0) { - return EmptyCompactSketch.getInstance(); - } - if (curCount == 1) { - reqBytesIn = (preLongs + 1) << 3; - validateInputSize(reqBytesIn, segCap); - final long hash = srcSeg.get(JAVA_LONG_UNALIGNED, preLongs << 3); - return new SingleItemSketch(hash, seedHash); - } - //curCount > 1 - reqBytesIn = (curCount + preLongs) << 3; - validateInputSize(reqBytesIn, segCap); - final long[] compactOrderedCache = new long[curCount]; - MemorySegment.copy(srcSeg, JAVA_LONG_UNALIGNED, preLongs << 3, compactOrderedCache, 0, curCount); - return new HeapCompactSketch(compactOrderedCache, false, seedHash, curCount, thetaLong,true); - } - if (preLongs == 3) { //pre0 + count + theta - reqBytesIn = (preLongs) << 3; // - validateInputSize(reqBytesIn, segCap); - curCount = extractCurCount(srcSeg); - thetaLong = extractThetaLong(srcSeg); - if ((curCount == 0) && (thetaLong == Long.MAX_VALUE)) { - return EmptyCompactSketch.getInstance(); - } - if ((curCount == 1) && (thetaLong == Long.MAX_VALUE)) { - reqBytesIn = (preLongs + 1) << 3; - validateInputSize(reqBytesIn, segCap); - final long hash = 
srcSeg.get(JAVA_LONG_UNALIGNED, preLongs << 3); - return new SingleItemSketch(hash, seedHash); - } - //curCount > 1 and/or theta < 1.0 - reqBytesIn = (curCount + preLongs) << 3; - validateInputSize(reqBytesIn, segCap); - final long[] compactOrderedCache = new long[curCount]; - //srcSeg.getLongArray(preLongs << 3, compactOrderedCache, 0, curCount); - MemorySegment.copy(srcSeg, JAVA_LONG_UNALIGNED, preLongs << 3, compactOrderedCache, 0, curCount); - return new HeapCompactSketch(compactOrderedCache, false, seedHash, curCount, thetaLong, true); - } - throw new SketchesArgumentException("PreLongs must be 1,2, or 3: " + preLongs); - } - - private static final void validateInputSize(final int reqBytesIn, final int segCap) { - if (reqBytesIn > segCap) { - throw new SketchesArgumentException( - "Input MemorySegment or byte[] size is too small: Required Bytes: " + reqBytesIn - + ", bytesIn: " + segCap); - } - } - -} diff --git a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java index 8a35631ab..5a5c16f00 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java @@ -23,6 +23,7 @@ import static java.lang.Math.min; import static java.lang.Math.sqrt; import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; +import static org.apache.datasketches.common.Util.DEFAULT_UPDATE_SEED; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; import static org.apache.datasketches.common.Util.checkBounds; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; @@ -31,7 +32,6 @@ import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor; import static org.apache.datasketches.theta.PreambleUtil.extractP; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; 
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncremented; import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountNotIncremented; @@ -112,6 +112,17 @@ static HeapAlphaSketch newHeapInstance(final int lgNomLongs, final long seed, fi return has; } + /** + * Heapify a sketch from a MemorySegment object containing sketch data. + * @param srcSeg The source MemorySegment object. + * It must have a size of at least 24 bytes. + * The assumed seed is {@link org.apache.datasketches.common.Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} + * @return instance of this sketch + */ + static HeapAlphaSketch heapifyInstance(final MemorySegment srcSeg) { + return heapifyInstance(srcSeg, DEFAULT_UPDATE_SEED); + } + /** * Heapify a sketch from a MemorySegment object containing sketch data. * @param srcSeg The source MemorySegment object. @@ -123,7 +134,7 @@ static HeapAlphaSketch newHeapInstance(final int lgNomLongs, final long seed, fi static HeapAlphaSketch heapifyInstance(final MemorySegment srcSeg, final long expectedSeed) { Objects.requireNonNull(srcSeg, "Source MemorySegment must not be null"); checkBounds(0, 24, srcSeg.byteSize()); - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 @@ -197,7 +208,7 @@ public double getLowerBound(final int numStdDev) { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch if (curCount_ > 0) { if (valid && isDirty()) { return HashOperations.countPart(getCache(), getLgArrLongs(), getThetaLong()); @@ -234,14 +245,14 @@ public boolean isEmpty() { *
    * Long || Start Byte Adr:
    * Adr:
-   *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |        0           |
-   *  0   ||    Seed Hash    | Flags  |  LgArr | LgNom  | FamID  | SerVer | lgRF | PreLongs=3  |
+   *      ||    7   |    6   |    5   |    4   |    3   |    2     |    1     |        0           |
+   *  0   ||    Seed Hash    | Flags  |  LgArr | LgNom  | FamID=1  | SerVer=3 | lgRF | PreLongs=3  |
    *
-   *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
-   *  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
+   *      ||   15   |   14   |   13   |   12   |   11   |   10     |    9     |     8              |
+   *  1   ||-----------------p-----------------|----------Retained Entries Count-------------------|
    *
-   *      ||   23   |   22   |   21    |  20   |   19   |   18   |   17   |    16              |
-   *  2   ||---------------------------------Theta---------------------------------------------|
+   *      ||   23   |   22   |   21   |   20   |   19   |   18     |   17     |    16              |
+   *  2   ||---------------------------------Theta-------------------------------------------------|
    * 
*/ diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java index fdd2860ce..69eebff5f 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java @@ -57,6 +57,7 @@ final class HeapCompactSketch extends CompactSketch { * @param curCount correct value * @param thetaLong The correct * thetaLong. + * @param ordered true if cache is ordered. */ HeapCompactSketch(final long[] cache, final boolean empty, final short seedHash, final int curCount, final long thetaLong, final boolean ordered) { @@ -87,7 +88,7 @@ public int getCurrentBytes() { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return curCount_; } diff --git a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java index 5d8af6bfb..c23deebf1 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java @@ -30,7 +30,6 @@ import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor; import static org.apache.datasketches.theta.PreambleUtil.extractP; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncremented; import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncrementedRebuilt; @@ -108,11 +107,11 @@ private HeapQuickSelectSketch(final int lgNomLongs, final long seed, final float * @return instance of this sketch */ static 
HeapQuickSelectSketch heapifyInstance(final MemorySegment srcSeg, final long seed) { - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); + checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); final float p = extractP(srcSeg); //bytes 12-15 @@ -150,7 +149,7 @@ public Family getFamily() { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return curCount_; } diff --git a/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java b/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java index 87e1892b8..56175a019 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java @@ -49,13 +49,12 @@ */ abstract class HeapUpdateSketch extends UpdateSketch { final int lgNomLongs_; - private final long seed_; private final float p_; private final ResizeFactor rf_; HeapUpdateSketch(final int lgNomLongs, final long seed, final float p, final ResizeFactor rf) { + super(seed); lgNomLongs_ = Math.max(lgNomLongs, ThetaUtil.MIN_LG_NOM_LONGS); - seed_ = seed; p_ = p; rf_ = rf; } @@ -66,7 +65,7 @@ abstract class HeapUpdateSketch extends UpdateSketch { public int getCurrentBytes() { final int preLongs = getCurrentPreambleLongs(); final int dataLongs = getCurrentDataLongs(); - return (preLongs + dataLongs) << 3; + return preLongs + dataLongs << 3; } //UpdateSketch @@ -86,11 +85,6 @@ public ResizeFactor getResizeFactor() { return rf_; } - @Override - long getSeed() { - return seed_; - } - //restricted methods @Override @@ 
-102,14 +96,14 @@ short getSeedHash() { byte[] toByteArray(final int preLongs, final byte familyID) { if (isDirty()) { rebuild(); } checkIllegalCurCountAndEmpty(isEmpty(), getRetainedEntries(true)); - final int preBytes = (preLongs << 3) & 0X3F; //24 bytes + final int preBytes = (preLongs << 3) & 0X3F; //24 bytes; mask to 6 bits final int dataBytes = getCurrentDataLongs() << 3; final byte[] byteArrOut = new byte[preBytes + dataBytes]; final MemorySegment segOut = MemorySegment.ofArray(byteArrOut); //preamble first 8 bytes. Note: only compact can be reduced to 8 bytes. - final int lgRf = getResizeFactor().lg() & 0x3; + final int lgRf = getResizeFactor().lg() & 0x3; //mask to 2 bits insertPreLongs(segOut, preLongs); //byte 0 low 6 bits insertLgResizeFactor(segOut, lgRf); //byte 0 high 2 bits insertSerVer(segOut, SER_VER); //byte 1 diff --git a/src/main/java/org/apache/datasketches/theta/Intersection.java b/src/main/java/org/apache/datasketches/theta/Intersection.java index a31dc3ef9..134c49ff6 100644 --- a/src/main/java/org/apache/datasketches/theta/Intersection.java +++ b/src/main/java/org/apache/datasketches/theta/Intersection.java @@ -20,23 +20,13 @@ package org.apache.datasketches.theta; import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static org.apache.datasketches.common.Util.floorPowerOf2; -import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER; import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; -import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; -import static org.apache.datasketches.theta.PreambleUtil.extractFlags; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import java.lang.foreign.MemorySegment; -import java.util.Arrays; import 
org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.thetacommon.ThetaUtil; /** * The API for intersection operations @@ -164,84 +154,4 @@ public static Intersection wrap(final MemorySegment srcSeg, final long expectedS return IntersectionImpl.wrapInstance(srcSeg, expectedSeed, srcSeg.isReadOnly() ); } - // Restricted - - /** - * Returns the maximum lgArrLongs given the capacity of the MemorySegment. - * @param dstSeg the given MemorySegment - * @return the maximum lgArrLongs given the capacity of the MemorySegment - */ - protected static int getMaxLgArrLongs(final MemorySegment dstSeg) { - final int preBytes = CONST_PREAMBLE_LONGS << 3; - final long cap = dstSeg.byteSize(); - return Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3); - } - - protected static void checkMinSizeMemorySegment(final MemorySegment seg) { - final int minBytes = (CONST_PREAMBLE_LONGS << 3) + (8 << ThetaUtil.MIN_LG_ARR_LONGS);//280 - final long cap = seg.byteSize(); - if (cap < minBytes) { - throw new SketchesArgumentException( - "MemorySegment must be at least " + minBytes + " bytes. Actual capacity: " + cap); - } - } - - /** - * Compact first 2^lgArrLongs of given array - * @param srcCache anything - * @param lgArrLongs The correct - * lgArrLongs. - * @param curCount must be correct - * @param thetaLong The correct - * thetaLong. 
- * @param dstOrdered true if output array must be sorted - * @return the compacted array - */ //Only used in IntersectionImpl & Test - static final long[] compactCachePart(final long[] srcCache, final int lgArrLongs, - final int curCount, final long thetaLong, final boolean dstOrdered) { - if (curCount == 0) { - return new long[0]; - } - final long[] cacheOut = new long[curCount]; - final int len = 1 << lgArrLongs; - int j = 0; - for (int i = 0; i < len; i++) { - final long v = srcCache[i]; - if (v <= 0L || v >= thetaLong ) { continue; } - cacheOut[j++] = v; - } - assert curCount == j; - if (dstOrdered) { - Arrays.sort(cacheOut); - } - return cacheOut; - } - - protected static void segChecks(final MemorySegment srcSeg) { - //Get Preamble - //Note: Intersection does not use lgNomLongs (or k), per se. - //seedHash loaded and checked in private constructor - final int preLongs = extractPreLongs(srcSeg); - final int serVer = extractSerVer(srcSeg); - final int famID = extractFamilyID(srcSeg); - final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) > 0; - final int curCount = extractCurCount(srcSeg); - //Checks - if (preLongs != CONST_PREAMBLE_LONGS) { - throw new SketchesArgumentException( - "MemorySegment PreambleLongs must equal " + CONST_PREAMBLE_LONGS + ": " + preLongs); - } - if (serVer != SER_VER) { - throw new SketchesArgumentException("Serialization Version must equal " + SER_VER); - } - Family.INTERSECTION.checkFamilyID(famID); - if (empty) { - if (curCount != 0) { - throw new SketchesArgumentException( - "srcSeg empty state inconsistent with curCount: " + empty + "," + curCount); - } - //empty = true AND curCount_ = 0: OK - } //else empty = false, curCount could be anything - } - } diff --git a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java index ebb4a6215..6819524b1 100644 --- a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java +++ 
b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java @@ -26,6 +26,7 @@ import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static java.lang.foreign.ValueLayout.JAVA_SHORT_UNALIGNED; import static org.apache.datasketches.common.Util.clearBits; +import static org.apache.datasketches.common.Util.floorPowerOf2; import static org.apache.datasketches.common.Util.setBits; import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; @@ -41,8 +42,10 @@ import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; import static org.apache.datasketches.theta.PreambleUtil.clearEmpty; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; +import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; +import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.theta.PreambleUtil.insertCurCount; import static org.apache.datasketches.theta.PreambleUtil.insertFamilyID; @@ -81,17 +84,17 @@ * @author Kevin Lang */ final class IntersectionImpl extends Intersection { - protected final short seedHash_; - protected final boolean readOnly_; //True if this sketch is to be treated as read only - protected final MemorySegment wseg_; - protected final int maxLgArrLongs_; //only used with MemorySegment, not serialized + private final short seedHash_; + private final boolean readOnly_; //True if this sketch is to be treated as read only + private final MemorySegment wseg_; + private final int maxLgArrLongs_; //only used with MemorySegment, not serialized //Note: Intersection does not use lgNomLongs or k, per se. 
- protected int lgArrLongs_; //current size of hash table - protected int curCount_; //curCount of HT, if < 0 means Universal Set (US) is true - protected long thetaLong_; - protected boolean empty_; //A virgin intersection represents the Universal Set, so empty is FALSE! - protected long[] hashTable_; //retained entries of the intersection, on-heap only. + private int lgArrLongs_; //current size of hash table + private int curCount_; //curCount of HT, if < 0 means Universal Set (US) is true + private long thetaLong_; + private boolean empty_; //A virgin intersection represents the Universal Set, so empty is FALSE! + private long[] hashTable_; //retained entries of the intersection, on-heap only. /** * Constructor: Sets the class finals and computes, sets and checks the seedHash. @@ -100,14 +103,14 @@ final class IntersectionImpl extends Intersection { * @param dstSegFlag The given MemorySegment is a Destination (new offHeap) MemorySegment. * @param readOnly True if MemorySegment is to be treated as read only. */ - protected IntersectionImpl(final MemorySegment wseg, final long seed, final boolean dstSegFlag, + private IntersectionImpl(final MemorySegment wseg, final long seed, final boolean dstSegFlag, final boolean readOnly) { readOnly_ = readOnly; if (wseg != null) { wseg_ = wseg; if (dstSegFlag) { //DstSeg: compute & store seedHash, no seedHash checking - checkMinSizeMemorySegment(wseg); - maxLgArrLongs_ = !readOnly ? getMaxLgArrLongs(wseg) : 0; //Only Off Heap + IntersectionImpl.checkMinSizeMemorySegment(wseg); + maxLgArrLongs_ = !readOnly ? 
IntersectionImpl.getMaxLgArrLongs(wseg) : 0; //Only Off Heap seedHash_ = Util.computeSeedHash(seed); wseg_.set(JAVA_SHORT_UNALIGNED, SEED_HASH_SHORT, seedHash_); } else { //SrcSeg:gets and stores the seedHash, checks seg_seedHash against the seed @@ -179,8 +182,8 @@ static IntersectionImpl initNewDirectInstance(final long seed, final MemorySegme static IntersectionImpl heapifyInstance(final MemorySegment srcSeg, final long seed) { final boolean dstSegFlag = false; final boolean readOnly = false; + IntersectionImpl.segChecks(srcSeg); final IntersectionImpl impl = new IntersectionImpl(null, seed, dstSegFlag, readOnly); - segChecks(srcSeg); //Initialize impl.lgArrLongs_ = extractLgArrLongs(srcSeg); @@ -207,8 +210,8 @@ static IntersectionImpl wrapInstance( final long seed, final boolean readOnly) { final boolean dstSegFlag = false; + IntersectionImpl.segChecks(srcSeg); final IntersectionImpl impl = new IntersectionImpl(srcSeg, seed, dstSegFlag, readOnly); - segChecks(srcSeg); impl.lgArrLongs_ = extractLgArrLongs(srcSeg); impl.curCount_ = extractCurCount(srcSeg); impl.thetaLong_ = extractThetaLong(srcSeg); @@ -333,7 +336,7 @@ public CompactSketch getResult(final boolean dstOrdered, final MemorySegment dst } else { hashTable = hashTable_; } - compactCache = compactCachePart(hashTable, lgArrLongs_, curCount_, thetaLong_, dstOrdered); + compactCache = IntersectionImpl.compactCachePart(hashTable, lgArrLongs_, curCount_, thetaLong_, dstOrdered); srcCompact = true; srcOrdered = dstOrdered; return CompactOperations.componentsToCompact( @@ -561,4 +564,83 @@ private void resetCommon() { thetaLong_ = Long.MAX_VALUE; hashTable_ = null; } + + /** + * Compact first 2^lgArrLongs of given array + * @param srcCache anything + * @param lgArrLongs The correct + * lgArrLongs. + * @param curCount must be correct + * @param thetaLong The correct + * thetaLong. 
+ * @param dstOrdered true if output array must be sorted + * @return the compacted array + */ //used in Test + static final long[] compactCachePart(final long[] srcCache, final int lgArrLongs, + final int curCount, final long thetaLong, final boolean dstOrdered) { + if (curCount == 0) { + return new long[0]; + } + final long[] cacheOut = new long[curCount]; + final int len = 1 << lgArrLongs; + int j = 0; + for (int i = 0; i < len; i++) { + final long v = srcCache[i]; + if (v <= 0L || v >= thetaLong ) { continue; } + cacheOut[j++] = v; + } + assert curCount == j; + if (dstOrdered) { + Arrays.sort(cacheOut); + } + return cacheOut; + } + + private static void checkMinSizeMemorySegment(final MemorySegment seg) { + final int minBytes = (CONST_PREAMBLE_LONGS << 3) + (8 << ThetaUtil.MIN_LG_ARR_LONGS);//280 + final long cap = seg.byteSize(); + if (cap < minBytes) { + throw new SketchesArgumentException( + "MemorySegment must be at least " + minBytes + " bytes. Actual capacity: " + cap); + } + } + + /** + * Returns the maximum lgArrLongs given the capacity of the MemorySegment. + * @param dstSeg the given MemorySegment + * @return the maximum lgArrLongs given the capacity of the MemorySegment + */ + private static int getMaxLgArrLongs(final MemorySegment dstSeg) { + final int preBytes = CONST_PREAMBLE_LONGS << 3; + final long cap = dstSeg.byteSize(); + return Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3); + } + + private static void segChecks(final MemorySegment srcSeg) { + //Get Preamble + //Note: Intersection does not use lgNomLongs (or k), per se. 
+ //seedHash loaded and checked in private constructor + final int preLongs = Sketch.getPreambleLongs(srcSeg); + final int serVer = extractSerVer(srcSeg); + final int famID = extractFamilyID(srcSeg); + final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) > 0; + final int curCount = extractCurCount(srcSeg); + //Checks + if (preLongs != CONST_PREAMBLE_LONGS) { + throw new SketchesArgumentException( + "MemorySegment PreambleLongs must equal " + CONST_PREAMBLE_LONGS + ": " + preLongs); + } + if (serVer != SER_VER) { + throw new SketchesArgumentException("Serialization Version must equal " + SER_VER); + } + Family.INTERSECTION.checkFamilyID(famID); + if (empty) { + if (curCount != 0) { + throw new SketchesArgumentException( + "srcSeg empty state inconsistent with curCount: " + empty + "," + curCount); + } + //empty = true AND curCount_ = 0: OK + } //else empty = false, curCount could be anything + } + } diff --git a/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java b/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java index 548c79ef3..53344c8d6 100644 --- a/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java +++ b/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java @@ -38,7 +38,7 @@ final class MemorySegmentHashIterator implements HashIterator { this.seg = srcSeg; this.arrLongs = arrLongs; this.thetaLong = thetaLong; - offsetBytes = PreambleUtil.extractPreLongs(srcSeg) << 3; + offsetBytes = Sketch.getPreambleLongs(srcSeg) << 3; index = -1; hash = 0; } diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java index b3451fcd1..4dd993eb3 100644 --- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java @@ -126,17 +126,17 @@ *
  * Long || Start Byte Adr:
  * Adr:
- *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
- *  0   ||    Seed Hash    | Flags  | numEB  | entBits| FamID  | SerVer |     PreLongs = 3   |
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1     |   0              |
+ *  0   ||    Seed Hash    | Flags  | numEB  | entBits| FamID  | SerVer=4 |   PreLongs = 3   |
  *
- *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
- *  1   ||------------------------------THETA_LONG-------------------------------------------|
+ *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9     |   8              |
+ *  1   ||------------------------------THETA_LONG-------------------------------------------| (only if estimating)
  *
- *      ||        |        |        |  (20)  |  (19)  |  (18)  |  (17)  |    16              |
- *  2   ||----------------Retained Entries stored as 1 to 4 bytes----------------------------|
+ *      ||        |        |        |   20   |  (19)  |  (18)  |  (17)    |  16              |
+ *  2   ||--------Retained Entries stored as 1 to 4 bytes in bytes 16-19---------------------|
  *
- *      ||        |        |        |        |        |        |        |                    |
- *  3   ||------------------Delta encoded compressed byte array------------------------------|
+ *      ||        |        |        |        |        |        |          |                  |
+ *  3   ||--------Delta encoded compressed byte array starts at bytes 17-20------------------|
  *  
* *

The UpdateSketch and AlphaSketch require 24 bytes of preamble followed by a non-compact @@ -190,10 +190,10 @@ private PreambleUtil() {} // ###### DO NOT MESS WITH THIS FROM HERE ... // Preamble byte Addresses - static final int PREAMBLE_LONGS_BYTE = 0; //lower 6 bits in byte. - static final int LG_RESIZE_FACTOR_BIT = 6; //upper 2 bits in byte. Not used by compact, direct + static final int PREAMBLE_LONGS_BYTE = 0; //lower 6 bits in byte 0. + static final int LG_RESIZE_FACTOR_BIT = 6; //upper 2 bits in byte 0. Used by Update, Alpha, not used by compact, direct static final int SER_VER_BYTE = 1; - static final int FAMILY_BYTE = 2; //SerVer1,2 was SKETCH_TYPE_BYTE + static final int FAMILY_BYTE = 2; static final int LG_NOM_LONGS_BYTE = 3; //not used by compact static final int LG_ARR_LONGS_BYTE = 4; //not used by compact static final int FLAGS_BYTE = 5; @@ -203,28 +203,23 @@ private PreambleUtil() {} static final int THETA_LONG = 16; //8-byte aligned static final int UNION_THETA_LONG = 24; //8-byte aligned, only used by Union - // flag bit masks - static final int RESERVED_FLAG_MASK = 1; //SerVer 1, 2, 3. Now Reserved, no longer used. - static final int READ_ONLY_FLAG_MASK = 2; //Set but not read. Reserved. SerVer 1, 2, 3 - static final int EMPTY_FLAG_MASK = 4; //SerVer 2, 3 - static final int COMPACT_FLAG_MASK = 8; //SerVer 2 was NO_REBUILD_FLAG_MASK, 3 - static final int ORDERED_FLAG_MASK = 16;//SerVer 2 was UNORDERED_FLAG_MASK, 3 - static final int SINGLEITEM_FLAG_MASK = 32;//SerVer 3 - //The last 2 bits of the flags byte are reserved and assumed to be zero, for now. 
- - //Backward compatibility: SerVer1 preamble always 3 longs, SerVer2 preamble: 1, 2, 3 longs - // SKETCH_TYPE_BYTE 2 //SerVer1, SerVer2 - // V1, V2 types: Alpha = 1, QuickSelect = 2, SetSketch = 3; V3 only: Buffered QS = 4 - static final int LG_RESIZE_RATIO_BYTE_V1 = 5; //used by SerVer 1 - static final int FLAGS_BYTE_V1 = 6; //used by SerVer 1 + // flag byte bit masks + static final int RESERVED_FLAG_MASK = 1; //Bit 0: Reserved, no longer used. Was BigEndian + static final int READ_ONLY_FLAG_MASK = 2; //Bit 1: Reserved, Set but not read. + static final int EMPTY_FLAG_MASK = 4; //Bit 2: + static final int COMPACT_FLAG_MASK = 8; //Bit 3: + static final int ORDERED_FLAG_MASK = 16;//Bit 4: + static final int SINGLEITEM_FLAG_MASK = 32;//Bit 5: + //The last 2 bits (Bit 6,7) of the flags byte are reserved and assumed to be zero. //Other constants static final int SER_VER = 3; + static final int SER_VER_COMPRESSED = 4; // serial version 4 compressed ordered sketch, not empty, not single item - static final int ENTRY_BITS_BYTE_V4 = 3; // number of bits packed in deltas between hashes - static final int NUM_ENTRIES_BYTES_BYTE_V4 = 4; // number of bytes used for the number of entries - static final int THETA_LONG_V4 = 8; //8-byte aligned + static final int ENTRY_BITS_BYTE_V4 = 3; // number of bits packed in deltas between hashes + static final int NUM_ENTRIES_BYTES_BYTE_V4 = 4; // number of bytes used for the number of entries + static final int THETA_LONG_V4 = 8; //8-byte aligned /** * Computes the number of bytes required for an updatable sketch using a hash-table cache. 
@@ -233,7 +228,7 @@ private PreambleUtil() {} * @param preambleLongs current preamble size * @return the size in bytes */ - static int getSegBytes(final int lgArrLongs, final int preambleLongs) { + static int getUpdatableSegBytes(final int lgArrLongs, final int preambleLongs) { return (8 << lgArrLongs) + (preambleLongs << 3); } @@ -261,7 +256,7 @@ static String preambleToString(final byte[] byteArr) { * @return the summary preamble string. */ static String preambleToString(final MemorySegment seg) { - final int preLongs = getAndCheckPreLongs(seg); + final int preLongs = checkSegPreambleCap(seg); final int rfId = extractLgResizeFactor(seg); final ResizeFactor rf = ResizeFactor.getRF(rfId); final int serVer = extractSerVer(seg); @@ -272,7 +267,7 @@ static String preambleToString(final MemorySegment seg) { //Flags final int flags = extractFlags(seg); - final String flagsStr = (flags) + ", 0x" + (Integer.toHexString(flags)) + ", " + final String flagsStr = flags + ", 0x" + Integer.toHexString(flags) + ", " + zeroPad(Integer.toBinaryString(flags), 8); final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; final boolean empty = (flags & EMPTY_FLAG_MASK) > 0; @@ -318,7 +313,7 @@ else if (preLongs == 4) { //Union sb.append("Byte 0: ResizeFactor : ").append(rfId + ", " + rf.toString()).append(LS); sb.append("Byte 1: Serialization Version: ").append(serVer).append(LS); sb.append("Byte 2: Family : ").append(familyId + ", " + family.toString()).append(LS); - sb.append("Byte 3: LgNomLongs : ").append(lgNomLongs).append(LS); + sb.append("Byte 3: LgNomLongs, LgK : ").append(lgNomLongs).append(LS); sb.append("Byte 4: LgArrLongs : ").append(lgArrLongs).append(LS); sb.append("Byte 5: Flags Field : ").append(flagsStr).append(LS); sb.append(" Bit Flag Name : State:").append(LS); @@ -351,8 +346,13 @@ else if (preLongs == 3) { sb.append("Bytes 16-23: Theta (double) : ").append(thetaDbl).append(LS); sb.append(" Theta (long) : ").append(thetaLong).append(LS); sb.append(" Theta 
(long,hex) : ").append(thetaHex).append(LS); + if (serVer == 4) { + sb.append( "TOTAL Storage Bytes : ").append(seg.byteSize()).append(LS); + sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS); + return sb.toString(); + } } - else { //preLongs == 4 + else { //preLongs == 4 (Union) sb.append("Bytes 8-11 : CurrentCount : ").append(curCount).append(LS); sb.append("Bytes 12-15: P : ").append(p).append(LS); sb.append("Bytes 16-23: Theta (double) : ").append(thetaDbl).append(LS); @@ -363,9 +363,8 @@ else if (preLongs == 3) { sb.append(" ThetaU (long,hex): ").append(thetaUHex).append(LS); } sb.append( "Preamble Bytes : ").append(preLongs * 8).append(LS); - sb.append( "Data Bytes : ").append(curCount * 8).append(LS); - sb.append( "TOTAL Sketch Bytes : ").append((preLongs + curCount) * 8).append(LS); - sb.append( "TOTAL Capacity Bytes : ").append(seg.byteSize()).append(LS); + sb.append( "Retained Data Bytes : ").append(curCount * 8).append(LS); + sb.append( "TOTAL Storage Bytes : ").append(seg.byteSize()).append(LS); sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS); return sb.toString(); } @@ -377,11 +376,7 @@ static int extractPreLongs(final MemorySegment seg) { } static int extractLgResizeFactor(final MemorySegment seg) { - return (seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT) & 0X3; - } - - static int extractLgResizeRatioV1(final MemorySegment seg) { - return seg.get(JAVA_BYTE, LG_RESIZE_RATIO_BYTE_V1) & 0X3; + return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT & 0X3; } static int extractSerVer(final MemorySegment seg) { @@ -404,10 +399,6 @@ static int extractFlags(final MemorySegment seg) { return seg.get(JAVA_BYTE, FLAGS_BYTE) & 0XFF; } - static int extractFlagsV1(final MemorySegment seg) { - return seg.get(JAVA_BYTE, FLAGS_BYTE_V1) & 0XFF; - } - static int extractSeedHash(final MemorySegment seg) { return seg.get(JAVA_SHORT_UNALIGNED, SEED_HASH_SHORT) & 0XFFFF; } @@ -516,7 +507,7 @@ static void clearEmpty(final 
MemorySegment seg) { } static boolean isEmptyFlag(final MemorySegment seg) { - return ((extractFlags(seg) & EMPTY_FLAG_MASK) > 0); + return (extractFlags(seg) & EMPTY_FLAG_MASK) > 0; } /** @@ -524,17 +515,16 @@ static boolean isEmptyFlag(final MemorySegment seg) { * @param seg the given MemorySegment * @return the extracted prelongs value. */ - static int getAndCheckPreLongs(final MemorySegment seg) { - final long cap = seg.byteSize(); - if (cap < 8) { - throwNotBigEnough(cap, 8); - } - final int preLongs = extractPreLongs(seg); - final int required = Math.max(preLongs << 3, 8); - if (cap < required) { - throwNotBigEnough(cap, required); + static int checkSegPreambleCap(final MemorySegment seg) { + try { + final int preLongs = extractPreLongs(seg); + final int required = Math.max(preLongs << 3, 8); + final long cap = seg.byteSize(); + if (cap < required) { throwNotBigEnough(cap, required); } + return preLongs; + } catch (IndexOutOfBoundsException e) { //thrown by MemorySegment + throw new SketchesArgumentException("Possible Corruption: Given MemorySegment is empty."); } - return preLongs; } static short checkSegmentSeedHash(final MemorySegment seg, final long seed) { @@ -543,10 +533,10 @@ static short checkSegmentSeedHash(final MemorySegment seg, final long seed) { return seedHashSeg; } - private static void throwNotBigEnough(final long cap, final int required) { + private static void throwNotBigEnough(final long cap, final long required) { throw new SketchesArgumentException( - "Possible Corruption: Size of byte array or MemorySegment not large enough: Size: " + cap - + ", Required: " + required); + "Possible Corruption: Size of MemorySegment not large enough: Size: " + cap + + " < Required: " + required); } static int wholeBytesToHoldBits(final int bits) { diff --git a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java index abf8df391..766e1850d 100644 --- 
a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java +++ b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java @@ -24,9 +24,9 @@ import static org.apache.datasketches.common.ByteArrayUtil.putLongLE; import static org.apache.datasketches.hash.MurmurHash3.hash; import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; @@ -45,13 +45,13 @@ final class SingleItemSketch extends CompactSketch { private static final long DEFAULT_SEED_HASH = Util.computeSeedHash(Util.DEFAULT_UPDATE_SEED) & 0xFFFFL; // For backward compatibility, a candidate pre0_ long must have: - // Flags (byte 5): Ordered, Compact, NOT Empty, Read Only, LittleEndian = 11010 = 0x1A. + // Flags (byte 5): Ordered, Compact, NOT Empty, Read Only, NOT BigEndian = 11010 = 0x1A. (without SI flag) // Flags mask will be 0x1F. // SingleItem flag may not be set due to a historical bug, so we can't depend on it for now. // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3, // and the hash seed matches, it is virtually guaranteed that we have a SingleItem Sketch. 
- private static final long PRE0_LO6_SI = 0X00_00_3A_00_00_03_03_01L; //with SI flag + private static final long PRE0_LO6_SI = 0X00_00_3A_00_00_03_03_01L; //low 6 bytes, with SI flag private long pre0_ = 0; private long hash_ = 0; @@ -84,7 +84,7 @@ private SingleItemSketch(final long hash) { */ //does not override Sketch static SingleItemSketch heapify(final MemorySegment srcSeg, final short expectedSeedHash) { Util.checkSeedHashes((short) extractSeedHash(srcSeg), expectedSeedHash); - final boolean singleItem = otherCheckForSingleItem(srcSeg); + final boolean singleItem = checkForSingleItem(srcSeg); if (singleItem) { return new SingleItemSketch(srcSeg.get(JAVA_LONG_UNALIGNED, 8), expectedSeedHash); } throw new SketchesArgumentException("Input MemorySegment is not a SingleItemSketch."); } @@ -330,7 +330,7 @@ public double getLowerBound(final int numStdDev) { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return 1; } @@ -384,25 +384,28 @@ short getSeedHash() { return (short) (pre0_ >>> 48); } - static boolean otherCheckForSingleItem(final MemorySegment seg) { - return otherCheckForSingleItem(extractPreLongs(seg), extractSerVer(seg), - extractFamilyID(seg), extractFlags(seg) ); + static boolean checkForSingleItem(final MemorySegment seg) { + final int preLongs = checkSegPreambleCap(seg); + return checkForSingleItem(preLongs, extractSerVer(seg), extractFamilyID(seg), extractFlags(seg) ); } - static boolean otherCheckForSingleItem(final int preLongs, final int serVer, - final int famId, final int flags) { - // Flags byte: SI=X, Ordered=T, Compact=T, Empty=F, ReadOnly=T, Reserved=F = X11010 = 0x1A. + static boolean checkForSingleItem( + final int preLongs, + final int serVer, + final int famId, + final int flags) { + // Flags byte: SI=X, Ordered=T, Compact=T, Empty=F, ReadOnly=T, Reserved(BE)=F = X11010 = 0x1A. // Flags mask will be 0x1F. 
// SingleItem flag may not be set due to a historical bug, so we can't depend on it for now. // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3, // and the hash seed matches (not done here), it is virtually guaranteed that we have a // SingleItem Sketch. - final boolean numPreLongs = preLongs == 1; - final boolean numSerVer = serVer >= 3; - final boolean numFamId = famId == Family.COMPACT.getID(); - final boolean numFlags = (flags & 0x1F) == 0x1A; //no SI, yet - final boolean singleFlag = (flags & SINGLEITEM_FLAG_MASK) > 0; - return (numPreLongs && numSerVer && numFamId && numFlags) || singleFlag; + final boolean preLongsOK = preLongs == 1; + final boolean serVerOK = serVer >= 3; + final boolean famIdOK = famId == Family.COMPACT.getID(); + final boolean flagsOK = (flags & 0x1F) == 0x1A; //no SI, yet + final boolean singleFlagOK = (flags & SINGLEITEM_FLAG_MASK) > 0; + return (preLongsOK && serVerOK && famIdOK && flagsOK) || singleFlagOK; } } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 6310d82c4..d14519062 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -19,17 +19,19 @@ package org.apache.datasketches.theta; -import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static org.apache.datasketches.common.Family.idToFamily; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; import static org.apache.datasketches.common.Util.LS; import static org.apache.datasketches.common.Util.ceilingPowerOf2; import static org.apache.datasketches.common.Util.zeroPad; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; -import static 
org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; +import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; +import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; +import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; +import static org.apache.datasketches.theta.PreambleUtil.extractFlags; +import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; +import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.thetacommon.HashOperations.count; import java.lang.foreign.MemorySegment; @@ -62,20 +64,12 @@ public abstract class Sketch implements MemorySegmentStatus { * Default Update Seed

* was used to create the source MemorySegment image. * - *

For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked.

- * * @param srcSeg an image of a Sketch. * * @return a Sketch on the heap. */ public static Sketch heapify(final MemorySegment srcSeg) { - final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - final Family family = idToFamily(familyID); - if (family == Family.COMPACT) { - return CompactSketch.heapify(srcSeg); - } - return heapifyUpdateFromMemorySegment(srcSeg, Util.DEFAULT_UPDATE_SEED); + return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -83,8 +77,8 @@ public static Sketch heapify(final MemorySegment srcSeg) { * *

The resulting sketch will not retain any link to the source MemorySegment.

* - *

For Update and Compact Sketches this method checks if the given expectedSeed was used to - * create the source MemorySegment image. However, SerialVersion 1 sketches cannot be checked.

+ *

For Update Sketches this method checks if the expectedSeed + * was used to create the source MemorySegment image.

* * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. @@ -93,12 +87,12 @@ public static Sketch heapify(final MemorySegment srcSeg) { * @return a Sketch on the heap. */ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed) { - final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - final Family family = idToFamily(familyID); - if (family == Family.COMPACT) { + checkSegPreambleCap(srcSeg); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.COMPACT.getID()) { return CompactSketch.heapify(srcSeg, expectedSeed); } - return heapifyUpdateFromMemorySegment(srcSeg, expectedSeed); + return heapifyUpdateSketchFromMemorySegment(srcSeg, expectedSeed); } /** @@ -106,44 +100,21 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to - * "wrap".

+ *

Only sketches that have been explicitly stored as direct sketches can be wrapped.

* *

Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

* - *

For Update Sketches this method checks if the - * Default Update Seed

- * was used to create the source MemorySegment image. - * - *

For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked.

+ *

This method checks if the + * Default Update Seed + * was used to create the source MemorySegment image.

* - * @param srcSeg an image of a Sketch. - * @return a Sketch backed by the given MemorySegment + * @param srcSeg a MemorySegment with an image of a Sketch. + * @return a read-only Sketch backed by the given MemorySegment */ public static Sketch wrap(final MemorySegment srcSeg) { - final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; - final int familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; - final Family family = Family.idToFamily(familyID); - if (family == Family.QUICKSELECT) { - if (serVer == 3 && preLongs == 3) { - return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, Util.DEFAULT_UPDATE_SEED); - } else { - throw new SketchesArgumentException( - "Corrupted: " + family + " family image: must have SerVer = 3 and preLongs = 3"); - } - } - if (family == Family.COMPACT) { - return CompactSketch.wrap(srcSeg); - } - throw new SketchesArgumentException( - "Cannot wrap family: " + family + " as a Sketch"); + return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -151,40 +122,30 @@ public static Sketch wrap(final MemorySegment srcSeg) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to - * "wrap".

+ *

Only sketches that have been explicitly stored as direct sketches can be wrapped.

* *

Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

* - *

For Update and Compact Sketches this method checks if the given expectedSeed was used to - * create the source MemorySegment image. However, SerialVersion 1 sketches cannot be checked.

+ *

This method checks if the given expectedSeed + * was used to create the source MemorySegment image.

* * @param srcSeg a MemorySegment with an image of a Sketch. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. - * @return a UpdateSketch backed by the given MemorySegment except as above. + * @return a read-only Sketch backed by the given MemorySegment. */ public static Sketch wrap(final MemorySegment srcSeg, final long expectedSeed) { - final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; - final int familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; - final Family family = Family.idToFamily(familyID); - if (family == Family.QUICKSELECT) { - if (serVer == 3 && preLongs == 3) { - return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed); - } else { - throw new SketchesArgumentException( - "Corrupted: " + family + " family image: must have SerVer = 3 and preLongs = 3"); - } + checkSegPreambleCap(srcSeg); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.QUICKSELECT.getID()) { + return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed); } - if (family == Family.COMPACT) { + if (familyID == Family.COMPACT.getID()) { return CompactSketch.wrap(srcSeg, expectedSeed); } + final Family family = Family.idToFamily(familyID); throw new SketchesArgumentException( "Cannot wrap family: " + family + " as a Sketch"); } @@ -203,7 +164,7 @@ public static Sketch wrap(final MemorySegment srcSeg, final long expectedSeed) { * @return this sketch as an ordered CompactSketch. */ public CompactSketch compact() { - return (this.isCompact()) ? (CompactSketch)this : compact(true, null); + return isCompact() ? 
(CompactSketch)this : compact(true, null); } /** @@ -269,6 +230,21 @@ public int getCountLessThanThetaLong(final long thetaLong) { */ public abstract double getEstimate(); + /** + * Gets the estimate from the given MemorySegment + * @param srcSeg the given MemorySegment + * @return the result estimate + */ + public static double getEstimate(final MemorySegment srcSeg) { + checkSegPreambleCap(srcSeg); + final int familyId = extractFamilyID(srcSeg); + if (!isValidSketchID(familyId)) { + throw new SketchesArgumentException("Source MemorySegment not a valid Sketch Family: " + + Family.idToFamily(familyId).toString()); + } + return Sketch.estimate(extractThetaLong(srcSeg), getRetainedEntries(srcSeg)); + } + /** * Returns the Family that this sketch belongs to * @return the Family that this sketch belongs to @@ -327,9 +303,20 @@ public static int getMaxUpdateSketchBytes(final int nomEntries) { return (nomEnt << 4) + (Family.QUICKSELECT.getMaxPreLongs() << 3); } + /** + * Returns the maximum number of storage bytes required for an UpdateSketch with the given + * log_base2 of the nominal entries. + * @param lgNomEntries log_base2 of Nominal Entries + * @return the maximum number of storage bytes required for a UpdateSketch with the given lgNomEntries + */ + public static int getUpdateSketchMaxBytes(final int lgNomEntries) { + return (16 << lgNomEntries) + (Family.QUICKSELECT.getMaxPreLongs() << 3); + } + /** * Returns the number of valid entries that have been retained by the sketch. - * @return the number of valid retained entries + * For the Alpha Sketch this returns only valid entries. + * @return the number of valid retained entries. */ public int getRetainedEntries() { return getRetainedEntries(true); @@ -337,13 +324,24 @@ public int getRetainedEntries() { /** * Returns the number of entries that have been retained by the sketch. - * @param valid if true, returns the number of valid entries, which are less than theta and used - * for estimation. 
- * Otherwise, return the number of all entries, valid or not, that are currently in the internal - * sketch cache. + * @param valid This parameter is only relevant for the Alpha Sketch. + * if true, returns the number of valid entries, which are less than theta and used + * for estimation. Otherwise, return the number of all entries, valid or not, that are currently in the + * internal sketch cache. * @return the number of retained entries */ - public abstract int getRetainedEntries(boolean valid); + public abstract int getRetainedEntries(final boolean valid); + + /** + * Returns the number of valid entries that have been retained by the sketch from the given MemorySegment + * @param srcSeg the given MemorySegment that has an image of a Sketch + * @return the number of valid retained entries + */ + public static int getRetainedEntries(final MemorySegment srcSeg) { + final int preLongs = checkSegPreambleCap(srcSeg); + final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) != 0; + return (preLongs == 1) ? (empty ? 0 : 1) : extractCurCount(srcSeg); + } /** * Returns the serialization version from the given MemorySegment @@ -351,7 +349,8 @@ public int getRetainedEntries() { * @return the serialization version from the MemorySegment */ public static int getSerializationVersion(final MemorySegment seg) { - return seg.get(JAVA_BYTE, SER_VER_BYTE); + checkSegPreambleCap(seg); + return extractSerVer(seg); } /** @@ -400,7 +399,7 @@ public double getUpperBound(final int numStdDev) { * @return true if the sketch is in estimation mode. */ public boolean isEstimationMode() { - return estMode(getThetaLong(), isEmpty()); + return getThetaLong() < Long.MAX_VALUE && !isEmpty(); } /** @@ -445,7 +444,10 @@ public String toString() { * @param hexMode If true, hashes will be output in hex. * @return The result string, which can be very long. 
*/ - public String toString(final boolean sketchSummary, final boolean dataDetail, final int width, + public String toString( + final boolean sketchSummary, + final boolean dataDetail, + final int width, final boolean hexMode) { final StringBuilder sb = new StringBuilder(); @@ -548,6 +550,9 @@ public static String toString(final MemorySegment seg) { /** * Gets the internal cache array. For on-heap sketches this will return a reference to the actual * cache array. For MemorySegment-based sketches this returns a copy. + * + *

This can be an expensive operation and is intended for diagnostic & test applications. + * Use {@link #iterator() iterator()} instead.

* @return the internal cache array. */ abstract long[] getCache(); @@ -584,6 +589,24 @@ public static String toString(final MemorySegment seg) { */ abstract short getSeedHash(); + static boolean getEmpty(final MemorySegment srcSeg) { + checkSegPreambleCap(srcSeg); + final int serVer = extractSerVer(srcSeg); + if (serVer == 1) { + return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0; + } + return (extractFlags(srcSeg) & EMPTY_FLAG_MASK) != 0; + } + + static int getPreambleLongs(final MemorySegment srcSeg) { + return checkSegPreambleCap(srcSeg); + } + + static long getThetaLong(final MemorySegment srcSeg) { + final int preLongs = checkSegPreambleCap(srcSeg); + return preLongs < 3 ? Long.MAX_VALUE : extractThetaLong(srcSeg); + } + /** * Returns true if given Family id is one of the theta sketches * @param id the given Family id @@ -595,66 +618,65 @@ static final boolean isValidSketchID(final int id) { || id == Family.COMPACT.getID(); } - /** - * Checks Ordered and Compact flags for integrity between sketch and a MemorySegment - * @param sketch the given sketch - */ - static final void checkSketchAndMemorySegmentFlags(final Sketch sketch) { - final MemorySegment seg = sketch.getMemorySegment(); - if (seg == null) { return; } - final int flags = PreambleUtil.extractFlags(seg); - if ((flags & COMPACT_FLAG_MASK) > 0 ^ sketch.isCompact()) { - throw new SketchesArgumentException("Possible corruption: " - + "MemorySegment Compact Flag inconsistent with Sketch"); - } - if ((flags & ORDERED_FLAG_MASK) > 0 ^ sketch.isOrdered()) { - throw new SketchesArgumentException("Possible corruption: " - + "MemorySegment Ordered Flag inconsistent with Sketch"); - } - } - static final double estimate(final long thetaLong, final int curCount) { return curCount * (LONG_MAX_VALUE_AS_DOUBLE / thetaLong); } - static final double lowerBound(final int curCount, final long thetaLong, final int numStdDev, - final boolean empty) { + /** + * Gets the approximate lower error 
bound from a valid MemorySegment image of a Sketch + * given the specified number of Standard Deviations. + * This will return getEstimate() if isEmpty() is true. + * + * @param numStdDev + * See Number of Standard Deviations + * @param srcSeg the source MemorySegment + * @return the lower bound. + */ + public static double getLowerBound(final int numStdDev, final MemorySegment srcSeg) { + return lowerBound(getRetainedEntries(srcSeg), Sketch.getThetaLong(srcSeg), numStdDev, Sketch.getEmpty(srcSeg)); + } + + static final double lowerBound(final int curCount, final long thetaLong, final int numStdDev, final boolean empty) { final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; return BinomialBoundsN.getLowerBound(curCount, theta, numStdDev, empty); } + /** + * Gets the approximate upper error bound from a valid MemorySegment image of a Sketch + * given the specified number of Standard Deviations. + * This will return getEstimate() if isEmpty() is true. + * + * @param numStdDev + * See Number of Standard Deviations + * @param srcSeg the source MemorySegment + * @return the upper bound. + */ + public static double getUpperBound(final int numStdDev, final MemorySegment srcSeg) { + return upperBound(getRetainedEntries(srcSeg), Sketch.getThetaLong(srcSeg), numStdDev, Sketch.getEmpty(srcSeg)); + } + static final double upperBound(final int curCount, final long thetaLong, final int numStdDev, final boolean empty) { final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; return BinomialBoundsN.getUpperBound(curCount, theta, numStdDev, empty); } - private static final boolean estMode(final long thetaLong, final boolean empty) { - return thetaLong < Long.MAX_VALUE && !empty; - } - /** - * Instantiates a Heap Update Sketch from MemorySegment. Only SerVer3. SerVer 1 & 2 already handled. + * Instantiates a Heap Update Sketch from MemorySegment. * @param srcSeg the source MemorySegment * @param expectedSeed the seed used to validate the given MemorySegment image. 
* See Update Hash Seed. * @return a Sketch */ - private static final Sketch heapifyUpdateFromMemorySegment(final MemorySegment srcSeg, final long expectedSeed) { - final long cap = srcSeg.byteSize(); - if (cap < 8) { - throw new SketchesArgumentException( - "Corrupted: valid sketch must be at least 8 bytes."); - } - final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - final Family family = idToFamily(familyID); + private static final Sketch heapifyUpdateSketchFromMemorySegment(final MemorySegment srcSeg, final long expectedSeed) { + final Family family = idToFamily(extractFamilyID(srcSeg)); if (family == Family.ALPHA) { - final int flags = PreambleUtil.extractFlags(srcSeg); + final int flags = extractFlags(srcSeg); final boolean compactFlag = (flags & COMPACT_FLAG_MASK) != 0; if (compactFlag) { throw new SketchesArgumentException( - "Corrupted: ALPHA family image: cannot be compact"); + "Corrupted: An ALPHA family image cannot be compact"); } return HeapAlphaSketch.heapifyInstance(srcSeg, expectedSeed); } diff --git a/src/main/java/org/apache/datasketches/theta/Sketches.java b/src/main/java/org/apache/datasketches/theta/Sketches.java deleted file mode 100644 index 40c7ccf86..000000000 --- a/src/main/java/org/apache/datasketches/theta/Sketches.java +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.theta; - -import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED; -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; -import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; - -import java.lang.foreign.MemorySegment; - -import org.apache.datasketches.common.Family; -import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.common.Util; - -/** - * This class brings together the common sketch and set operation creation methods and - * the public static methods into one place. - * - * @author Lee Rhodes - */ -public final class Sketches { - - private Sketches() {} - - /** - * Gets the unique count estimate from a valid MemorySegment image of a Sketch - * @param srcSeg the source MemorySegment - * @return the sketch's best estimate of the cardinality of the input stream. 
- */ - public static double getEstimate(final MemorySegment srcSeg) { - checkIfValidThetaSketch(srcSeg); - return Sketch.estimate(getThetaLong(srcSeg), getRetainedEntries(srcSeg)); - } - - /** - * Gets the approximate lower error bound from a valid MemorySegment image of a Sketch - * given the specified number of Standard Deviations. - * This will return getEstimate() if isEmpty() is true. - * - * @param numStdDev - * See Number of Standard Deviations - * @param srcSeg the source MemorySegment - * @return the lower bound. - */ - public static double getLowerBound(final int numStdDev, final MemorySegment srcSeg) { - return Sketch.lowerBound(getRetainedEntries(srcSeg), getThetaLong(srcSeg), numStdDev, getEmpty(srcSeg)); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxAnotBResultBytes(int)}. - * Returns the maximum number of bytes for the returned CompactSketch, given the maximum - * value of nomEntries of the first sketch A of AnotB. - * @param maxNomEntries the given value - * @return the maximum number of bytes. - */ - public static int getMaxAnotBResultBytes(final int maxNomEntries) { - return SetOperation.getMaxAnotBResultBytes(maxNomEntries); - } - - /** - * Returns the maximum number of storage bytes required for a CompactSketch with the given - * number of actual entries. - * @param numberOfEntries the actual number of retained entries stored in the sketch. - * @return the maximum number of storage bytes required for a CompactSketch with the given number - * of retained entries. - */ - public static int getMaxCompactSketchBytes(final int numberOfEntries) { - return Sketch.getMaxCompactSketchBytes(numberOfEntries); - } - - /** - * Returns the maximum number of storage bytes required for a CompactSketch given the configured - * log_base2 of the number of nominal entries, which is a power of 2. - * @param lgNomEntries Nominal Entries - * @return the maximum number of storage bytes required for a CompactSketch with the given - * lgNomEntries. 
- * @see Sketch#getCompactSketchMaxBytes(int) - */ - public static int getCompactSketchMaxBytes(final int lgNomEntries) { - return Sketch.getCompactSketchMaxBytes(lgNomEntries); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxIntersectionBytes(int)} - * @param nomEntries Ref: {@link SetOperation#getMaxIntersectionBytes(int)}, {@code nomEntries} - * @return Ref: {@link SetOperation#getMaxIntersectionBytes(int)} - */ - public static int getMaxIntersectionBytes(final int nomEntries) { - return SetOperation.getMaxIntersectionBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxUnionBytes(int)} - * @param nomEntries Ref: {@link SetOperation#getMaxUnionBytes(int)}, {@code nomEntries} - * @return Ref: {@link SetOperation#getMaxUnionBytes(int)} - */ - public static int getMaxUnionBytes(final int nomEntries) { - return SetOperation.getMaxUnionBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link Sketch#getMaxUpdateSketchBytes(int)} - * @param nomEntries Ref: {@link Sketch#getMaxUpdateSketchBytes(int)}, {@code nomEntries} - * @return Ref: {@link Sketch#getMaxUpdateSketchBytes(int)} - */ - public static int getMaxUpdateSketchBytes(final int nomEntries) { - return Sketch.getMaxUpdateSketchBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link Sketch#getSerializationVersion(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#getSerializationVersion(MemorySegment)}, {@code srcSeg} - * @return Ref: {@link Sketch#getSerializationVersion(MemorySegment)} - */ - public static int getSerializationVersion(final MemorySegment srcSeg) { - return Sketch.getSerializationVersion(srcSeg); - } - - /** - * Gets the approximate upper error bound from a valid MemorySegment image of a Sketch - * given the specified number of Standard Deviations. - * This will return getEstimate() if isEmpty() is true. 
- * - * @param numStdDev - * See Number of Standard Deviations - * @param srcSeg the source MemorySegment - * @return the upper bound. - */ - public static double getUpperBound(final int numStdDev, final MemorySegment srcSeg) { - return Sketch.upperBound(getRetainedEntries(srcSeg), getThetaLong(srcSeg), numStdDev, getEmpty(srcSeg)); - } - - //Heapify Operations - - /** - * Convenience method, ref: {@link CompactSketch#heapify(MemorySegment) CompactSketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link CompactSketch#heapify(MemorySegment) CompactSketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch heapifyCompactSketch(final MemorySegment srcSeg) { - return CompactSketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed Ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch heapifyCompactSketch(final MemorySegment srcSeg, final long expectedSeed) { - return CompactSketch.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link CompactSketch#wrap(MemorySegment) CompactSketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link CompactSketch#wrap(MemorySegment) CompactSketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch wrapCompactSketch(final MemorySegment srcSeg) { - return CompactSketch.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link 
CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed Ref: {@link CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch wrapCompactSketch(final MemorySegment srcSeg, final long expectedSeed) { - return CompactSketch.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link SetOperation#heapify(MemorySegment) SetOperation.heapify(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#heapify(MemorySegment) SetOperation.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation heapifySetOperation(final MemorySegment srcSeg) { - return SetOperation.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)}, - * {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation heapifySetOperation(final MemorySegment srcSeg, final long expectedSeed) { - return SetOperation.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link Sketch#heapify(MemorySegment) Sketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#heapify(MemorySegment) Sketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link Sketch Sketch} - */ - public static Sketch heapifySketch(final MemorySegment srcSeg) { - return Sketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)}, {@code expectedSeed} - * @return {@link Sketch Sketch} - */ - public static Sketch heapifySketch(final MemorySegment srcSeg, final long expectedSeed) { - return Sketch.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link UpdateSketch#heapify(MemorySegment) UpdateSketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link UpdateSketch#heapify(MemorySegment) UpdateSketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch heapifyUpdateSketch(final MemorySegment srcSeg) { - return UpdateSketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)}, - * {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch heapifyUpdateSketch(final MemorySegment srcSeg, final long expectedSeed) { - return UpdateSketch.heapify(srcSeg, expectedSeed); - } - - //Builders - - /** - * Ref: {@link SetOperationBuilder SetOperationBuilder} - * @return {@link SetOperationBuilder SetOperationBuilder} - */ - public static SetOperationBuilder setOperationBuilder() { - return new SetOperationBuilder(); - } - - /** - * Ref: {@link UpdateSketchBuilder UpdateSketchBuilder} - * @return {@link UpdateSketchBuilder UpdateSketchBuilder} - */ - public static UpdateSketchBuilder updateSketchBuilder() { - return new UpdateSketchBuilder(); - } - - //Wrap operations - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment)}, {@code srcSeg} - * @return a Intersection backed by the given MemorySegment - */ - public static Intersection wrapIntersection(final MemorySegment srcSeg) { - return (Intersection) SetOperation.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment) SetOperation.wrap(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment) SetOperation.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation wrapSetOperation(final MemorySegment srcSeg) { - return wrapSetOperation(srcSeg, Util.DEFAULT_UPDATE_SEED); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation wrapSetOperation(final MemorySegment srcSeg, final long expectedSeed) { - return SetOperation.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link Sketch#wrap(MemorySegment) Sketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#wrap(MemorySegment) Sketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link Sketch Sketch} - */ - public static Sketch wrapSketch(final MemorySegment srcSeg) { - return Sketch.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the expectedSeed used to validate the given MemorySegment image. - * Ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link Sketch Sketch} - */ - public static Sketch wrapSketch(final MemorySegment srcSeg, final long expectedSeed) { - return Sketch.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment)} and casts the result to a Union - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment)}, {@code srcSeg} - * @return a Union backed by the given MemorySegment. 
- */ - public static Union wrapUnion(final MemorySegment srcSeg) { - return (Union) SetOperation.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link UpdateSketch#wrap(MemorySegment) UpdateSketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link UpdateSketch#wrap(MemorySegment) UpdateSketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch wrapUpdateSketch(final MemorySegment srcSeg) { - return wrapUpdateSketch(srcSeg, Util.DEFAULT_UPDATE_SEED); - } - - /** - * Convenience method, ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. - * Ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch wrapUpdateSketch(final MemorySegment srcSeg, final long expectedSeed) { - return UpdateSketch.wrap(srcSeg, expectedSeed); - } - - //Restricted static methods - - static void checkIfValidThetaSketch(final MemorySegment srcSeg) { - final int fam = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - if (!Sketch.isValidSketchID(fam)) { - throw new SketchesArgumentException("Source MemorySegment not a valid Sketch. 
Family: " - + Family.idToFamily(fam).toString()); - } - } - - static boolean getEmpty(final MemorySegment srcSeg) { - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); - if (serVer == 1) { - return ((getThetaLong(srcSeg) == Long.MAX_VALUE) && (getRetainedEntries(srcSeg) == 0)); - } - return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 - } - - static int getPreambleLongs(final MemorySegment srcSeg) { - return srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //for SerVer 1,2,3 - } - - static int getRetainedEntries(final MemorySegment srcSeg) { - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); - if (serVer == 1) { - final int entries = srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); - if ((getThetaLong(srcSeg) == Long.MAX_VALUE) && (entries == 0)) { - return 0; - } - return entries; - } - //SerVer 2 or 3 - final int preLongs = getPreambleLongs(srcSeg); - final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 - if (preLongs == 1) { - return empty ? 0 : 1; - } - //preLongs > 1 - return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); //for SerVer 1,2,3 - } - - static long getThetaLong(final MemorySegment srcSeg) { - final int preLongs = getPreambleLongs(srcSeg); - return (preLongs < 3) ? 
Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); //for SerVer 1,2,3 - } -} diff --git a/src/main/java/org/apache/datasketches/theta/UnionImpl.java b/src/main/java/org/apache/datasketches/theta/UnionImpl.java index ed0178c8c..bbefd958c 100644 --- a/src/main/java/org/apache/datasketches/theta/UnionImpl.java +++ b/src/main/java/org/apache/datasketches/theta/UnionImpl.java @@ -22,6 +22,8 @@ import static java.lang.Math.min; import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static org.apache.datasketches.common.QuickSelect.selectExcludingZeros; +import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.UNION_THETA_LONG; import static org.apache.datasketches.theta.PreambleUtil.clearEmpty; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; @@ -34,6 +36,7 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.ResizeFactor; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; import org.apache.datasketches.thetacommon.HashOperations; @@ -105,7 +108,7 @@ static UnionImpl initNewDirectInstance( final ResizeFactor rf, final MemorySegment dstSeg) { final UpdateSketch gadget = //create with UNION family - new DirectQuickSelectSketch(lgNomLongs, seed, p, rf, dstSeg, true); + new DirectQuickSelectSketch(lgNomLongs, seed, p, rf, dstSeg, null, true); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = gadget.getThetaLong(); unionImpl.unionEmpty_ = gadget.isEmpty(); @@ -142,7 +145,7 @@ static UnionImpl fastWrapInstance(final MemorySegment srcSeg, final long expecte Family.UNION.checkFamilyID(extractFamilyID(srcSeg)); final UpdateSketch gadget = srcSeg.isReadOnly() ? 
DirectQuickSelectSketchR.fastReadOnlyWrap(srcSeg, expectedSeed) - : DirectQuickSelectSketch.fastWritableWrap(srcSeg, expectedSeed); + : DirectQuickSelectSketch.fastWritableWrap(srcSeg, null, expectedSeed); final UnionImpl unionImpl = new UnionImpl(gadget, expectedSeed); unionImpl.unionThetaLong_ = extractUnionThetaLong(srcSeg); unionImpl.unionEmpty_ = PreambleUtil.isEmptyFlag(srcSeg); @@ -151,17 +154,17 @@ static UnionImpl fastWrapInstance(final MemorySegment srcSeg, final long expecte /** * Wrap a Union object around a Union MemorySegment object containing data. - * Called by SetOperation. * @param srcSeg The source MemorySegment object. * @param expectedSeed the seed used to validate the given MemorySegment image. * See seed * @return this class */ + //Called by SetOperation and Union static UnionImpl wrapInstance(final MemorySegment srcSeg, final long expectedSeed) { Family.UNION.checkFamilyID(extractFamilyID(srcSeg)); final UpdateSketch gadget = srcSeg.isReadOnly() ? DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed) - : DirectQuickSelectSketch.writableWrap(srcSeg, expectedSeed); + : DirectQuickSelectSketch.writableWrap(srcSeg, null, expectedSeed); final UnionImpl unionImpl = new UnionImpl(gadget, expectedSeed); unionImpl.unionThetaLong_ = extractUnionThetaLong(srcSeg); unionImpl.unionEmpty_ = PreambleUtil.isEmptyFlag(srcSeg); @@ -269,7 +272,7 @@ public CompactSketch union(final Sketch sketchA, final Sketch sketchB, final boo public void union(final Sketch sketchIn) { //UNION Empty Rule: AND the empty states. - if ((sketchIn == null) || sketchIn.isEmpty()) { + if (sketchIn == null || sketchIn.isEmpty()) { //null and empty is interpreted as (Theta = 1.0, count = 0, empty = T). 
Nothing changes return; } @@ -279,7 +282,7 @@ public void union(final Sketch sketchIn) { gadget_.hashUpdate(sketchIn.getCache()[0]); return; } - Sketch.checkSketchAndMemorySegmentFlags(sketchIn); + UnionImpl.checkSketchAndMemorySegmentFlags(sketchIn); unionThetaLong_ = min(min(unionThetaLong_, sketchIn.getThetaLong()), gadget_.getThetaLong()); //Theta rule unionEmpty_ = false; @@ -287,7 +290,7 @@ public void union(final Sketch sketchIn) { final HashIterator it = sketchIn.iterator(); while (it.next()) { final long hash = it.get(); - if ((hash < unionThetaLong_) && (hash < gadget_.getThetaLong())) { + if (hash < unionThetaLong_ && hash < gadget_.getThetaLong()) { gadget_.hashUpdate(hash); // backdoor update, hash function is bypassed } else if (isOrdered) { break; } } @@ -372,4 +375,22 @@ boolean isEmpty() { return gadget_.isEmpty() && unionEmpty_; } + /** + * Checks Ordered and Compact flags for integrity between sketch and its MemorySegment + * @param sketch the given sketch + */ + private static final void checkSketchAndMemorySegmentFlags(final Sketch sketch) { + final MemorySegment seg = sketch.getMemorySegment(); + if (seg == null) { return; } + final int flags = PreambleUtil.extractFlags(seg); + if ((flags & COMPACT_FLAG_MASK) > 0 ^ sketch.isCompact()) { + throw new SketchesArgumentException("Possible corruption: " + + "MemorySegment Compact Flag inconsistent with Sketch"); + } + if ((flags & ORDERED_FLAG_MASK) > 0 ^ sketch.isOrdered()) { + throw new SketchesArgumentException("Possible corruption: " + + "MemorySegment Ordered Flag inconsistent with Sketch"); + } + } + } diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java index 7db1988e9..723d57a96 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java @@ -19,19 +19,15 @@ package org.apache.datasketches.theta; -import static 
java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; -import static org.apache.datasketches.common.Util.checkBounds; import static org.apache.datasketches.hash.MurmurHash3.hash; import static org.apache.datasketches.theta.CompactOperations.componentsToCompact; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.SER_VER; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.checkSegmentSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; @@ -39,7 +35,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractP; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; -import static org.apache.datasketches.theta.PreambleUtil.getSegBytes; +import static org.apache.datasketches.theta.PreambleUtil.getUpdatableSegBytes; import static org.apache.datasketches.theta.UpdateReturnState.RejectedNullOrEmpty; import java.lang.foreign.MemorySegment; @@ -47,6 +43,7 @@ import java.util.Objects; import org.apache.datasketches.common.Family; +import org.apache.datasketches.common.MemorySegmentRequest; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; import 
org.apache.datasketches.common.Util; @@ -60,8 +57,11 @@ * @author Lee Rhodes */ public abstract class UpdateSketch extends Sketch { + private final long seed_; - UpdateSketch() {} + UpdateSketch(final long seed) { + seed_ = seed; //kept only on heap, never serialized. Hoisted here for performance. + } /** * Wrap takes the writable sketch image in MemorySegment and refers to it directly. There is no data copying onto @@ -72,9 +72,12 @@ public abstract class UpdateSketch extends Sketch { * @param srcWSeg an image of a writable sketch where the image seed hash matches the default seed hash. * It must have a size of at least 24 bytes. * @return an UpdateSketch backed by the given MemorySegment + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch wrap(final MemorySegment srcWSeg) { - return wrap(srcWSeg, Util.DEFAULT_UPDATE_SEED); + return wrap(srcWSeg, null, Util.DEFAULT_UPDATE_SEED); } /** @@ -85,24 +88,30 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg) { * Java Heap version of the sketch where all data will be copied to the heap. * @param srcWSeg an image of a writable sketch where the image seed hash matches the given seed hash. * It must have a size of at least 24 bytes. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. * Compact sketches store a 16-bit hash of the seed, but not the seed itself. * @return a UpdateSketch backed by the given MemorySegment + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. 
*/ - public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expectedSeed) { - Objects.requireNonNull(srcWSeg, "Source MemorySeg e t must not be null"); - checkBounds(0, 24, srcWSeg.byteSize()); //need min 24 bytes - final int preLongs = srcWSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - final int serVer = srcWSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; - final int familyID = srcWSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; - final Family family = Family.idToFamily(familyID); - if (family != Family.QUICKSELECT) { + public static UpdateSketch wrap( + final MemorySegment srcWSeg, + final MemorySegmentRequest mSegReq, + final long expectedSeed) { + Objects.requireNonNull(srcWSeg, "Source MemorySegment must not be null"); + final int preLongs = checkSegPreambleCap(srcWSeg) & 0X3F; //mask to 6 bits; + final int serVer = extractSerVer(srcWSeg); + final int familyID = extractFamilyID(srcWSeg); + if (familyID != Family.QUICKSELECT.getID()) { + final Family family = Family.idToFamily(familyID); throw new SketchesArgumentException( "A " + family + " sketch cannot be wrapped as an UpdateSketch."); } - if ((serVer == 3) && (preLongs == 3)) { - return DirectQuickSelectSketch.writableWrap(srcWSeg, expectedSeed); + if (serVer == 3 && preLongs == 3) { + return DirectQuickSelectSketch.writableWrap(srcWSeg, mSegReq, expectedSeed); } else { throw new SketchesArgumentException( "Corrupted: An UpdateSketch image must have SerVer = 3 and preLongs = 3"); @@ -115,6 +124,9 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expected * @param srcSeg the given MemorySegment with a sketch image. * It must have a size of at least 24 bytes. * @return an UpdateSketch + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. 
*/ public static UpdateSketch heapify(final MemorySegment srcSeg) { return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); @@ -127,12 +139,15 @@ public static UpdateSketch heapify(final MemorySegment srcSeg) { * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. * @return an UpdateSketch + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch heapify(final MemorySegment srcSeg, final long expectedSeed) { Objects.requireNonNull(srcSeg, "Source MemorySegment must not be null"); - checkBounds(0, 24, srcSeg.byteSize()); //need min 24 bytes - final Family family = Family.idToFamily(srcSeg.get(JAVA_BYTE, FAMILY_BYTE)); - if (family.equals(Family.ALPHA)) { + checkSegPreambleCap(srcSeg); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.ALPHA.getID()) { return HeapAlphaSketch.heapifyInstance(srcSeg, expectedSeed); } return HeapQuickSelectSketch.heapifyInstance(srcSeg, expectedSeed); @@ -142,8 +157,16 @@ public static UpdateSketch heapify(final MemorySegment srcSeg, final long expect @Override public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstWSeg) { - return componentsToCompact(getThetaLong(), getRetainedEntries(true), getSeedHash(), isEmpty(), - false, false, dstOrdered, dstWSeg, getCache()); + return componentsToCompact( + getThetaLong(), + getRetainedEntries(true), + getSeedHash(), + isEmpty(), + false, //is src compact + false, //is src ordered + dstOrdered, + dstWSeg, + getCache()); } @Override @@ -160,7 +183,7 @@ int getCurrentDataLongs() { @Override public boolean hasMemorySegment() { - return ((this instanceof DirectQuickSelectSketchR) && ((DirectQuickSelectSketchR)this).hasMemorySegment()); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.hasMemorySegment(); } @Override @@ -170,7 
+193,7 @@ public boolean isCompact() { @Override public boolean isOffHeap() { - return ((this instanceof DirectQuickSelectSketchR) && ((DirectQuickSelectSketchR)this).isOffHeap()); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.isOffHeap(); } @Override @@ -180,7 +203,7 @@ public boolean isOrdered() { @Override public boolean isSameResource(final MemorySegment that) { - return (this instanceof final DirectQuickSelectSketchR dqssr) && dqssr.isSameResource(that); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.isSameResource(that); } //UpdateSketch interface @@ -210,7 +233,7 @@ public static final UpdateSketchBuilder builder() { * Gets the configured seed * @return the configured seed */ - abstract long getSeed(); + public long getSeed() { return seed_; } /** * Resets this sketch back to a virgin empty state. @@ -232,8 +255,7 @@ public static final UpdateSketchBuilder builder() { * See Update Return State */ public UpdateReturnState update(final long datum) { - final long[] data = { datum }; - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(datum, seed_)[0] >>> 1); } /** @@ -248,9 +270,9 @@ public UpdateReturnState update(final long datum) { * See Update Return State */ public UpdateReturnState update(final double datum) { - final double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0 - final long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN & +/- infinity forms - return hashUpdate(hash(data, getSeed())[0] >>> 1); + final double d = datum == 0.0 ? 
0.0 : datum; // canonicalize -0.0, 0.0 + final long data = Double.doubleToLongBits(d);// canonicalize all NaN & +/- infinity forms + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -267,11 +289,11 @@ public UpdateReturnState update(final double datum) { * See Update Return State */ public UpdateReturnState update(final String datum) { - if ((datum == null) || datum.isEmpty()) { + if (datum == null || datum.isEmpty()) { return RejectedNullOrEmpty; } final byte[] data = datum.getBytes(UTF_8); - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -283,10 +305,10 @@ public UpdateReturnState update(final String datum) { * See Update Return State */ public UpdateReturnState update(final byte[] data) { - if ((data == null) || (data.length == 0)) { + if (data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -298,10 +320,10 @@ public UpdateReturnState update(final byte[] data) { * See Update Return State */ public UpdateReturnState update(final ByteBuffer buffer) { - if ((buffer == null) || !buffer.hasRemaining()) { + if (buffer == null || !buffer.hasRemaining()) { return RejectedNullOrEmpty; } - return hashUpdate(hash(buffer, getSeed())[0] >>> 1); + return hashUpdate(hash(buffer, seed_)[0] >>> 1); } /** @@ -316,10 +338,10 @@ public UpdateReturnState update(final ByteBuffer buffer) { * See Update Return State */ public UpdateReturnState update(final char[] data) { - if ((data == null) || (data.length == 0)) { + if (data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -331,10 +353,10 @@ public UpdateReturnState update(final char[] data) { * See Update Return State */ public UpdateReturnState update(final int[] data) { - if ((data == null) || (data.length == 0)) { + if 
(data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -346,10 +368,10 @@ public UpdateReturnState update(final int[] data) { * See Update Return State */ public UpdateReturnState update(final long[] data) { - if ((data == null) || (data.length == 0)) { + if (data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } //restricted methods @@ -391,23 +413,23 @@ public UpdateReturnState update(final long[] data) { */ abstract boolean isOutOfSpace(int numEntries); - static void checkUnionQuickSelectFamily(final MemorySegment seg, final int preambleLongs, - final int lgNomLongs) { + static void checkUnionAndQuickSelectFamily(final MemorySegment seg, final int preambleLongs, final int lgNomLongs) { + //Check Family final int familyID = extractFamilyID(seg); //byte 2 - final Family family = Family.idToFamily(familyID); - if (family.equals(Family.UNION)) { + if (familyID == Family.UNION.getID()) { if (preambleLongs != Family.UNION.getMinPreLongs()) { throw new SketchesArgumentException( "Possible corruption: Invalid PreambleLongs value for UNION: " + preambleLongs); } } - else if (family.equals(Family.QUICKSELECT)) { + else if (familyID == Family.QUICKSELECT.getID()) { if (preambleLongs != Family.QUICKSELECT.getMinPreLongs()) { throw new SketchesArgumentException( "Possible corruption: Invalid PreambleLongs value for QUICKSELECT: " + preambleLongs); } } else { + final Family family = Family.idToFamily(familyID); throw new SketchesArgumentException( "Possible corruption: Invalid Family: " + family.toString()); } @@ -444,7 +466,7 @@ static void checkSegIntegrity(final MemorySegment srcSeg, final long expectedSee //Check seg capacity, lgArrLongs final long curCapBytes = srcSeg.byteSize(); - final int minReqBytes = getSegBytes(lgArrLongs, 
preambleLongs); + final int minReqBytes = getUpdatableSegBytes(lgArrLongs, preambleLongs); if (curCapBytes < minReqBytes) { throw new SketchesArgumentException( "Possible corruption: Current MemorySegment size < min required size: " @@ -455,7 +477,7 @@ static void checkSegIntegrity(final MemorySegment srcSeg, final long expectedSee final long thetaLong = extractThetaLong(srcSeg); //bytes 16-23 final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; //if (lgArrLongs <= lgNomLongs) the sketch is still resizing, thus theta cannot be < p. - if ((lgArrLongs <= lgNomLongs) && (theta < p) ) { + if (lgArrLongs <= lgNomLongs && theta < p ) { throw new SketchesArgumentException( "Possible corruption: Theta cannot be < p and lgArrLongs <= lgNomLongs. " + lgArrLongs + " <= " + lgNomLongs + ", Theta: " + theta + ", p: " + p); @@ -477,7 +499,7 @@ static boolean isResizeFactorIncorrect(final MemorySegment srcSeg, final int lgN final int lgA = lgArrLongs; final int lgR = extractLgResizeFactor(srcSeg); if (lgR == 0) { return lgA != lgT; } - return (((lgT - lgA) % lgR) != 0); + return (lgT - lgA) % lgR != 0; } } diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java b/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java index 834778f87..d91d654b6 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java @@ -26,6 +26,7 @@ import java.lang.foreign.MemorySegment; import org.apache.datasketches.common.Family; +import org.apache.datasketches.common.MemorySegmentRequest; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesStateException; @@ -44,10 +45,11 @@ public final class UpdateSketchBuilder { private ResizeFactor bRF; private Family bFam; private float bP; + private MemorySegmentRequest bMemorySegmentRequest; //Fields for concurrent 
theta sketch private int bNumPoolThreads; - private int bLocalLgNomLongs; + private int bConCurLgNomLongs; private boolean bPropagateOrderedCompact; private double bMaxConcurrencyError; private int bMaxNumLocalThreads; @@ -57,11 +59,12 @@ public final class UpdateSketchBuilder { *
    *
  • Nominal Entries: {@value org.apache.datasketches.thetacommon.ThetaUtil#DEFAULT_NOMINAL_ENTRIES}
  • *
  • Seed: {@value org.apache.datasketches.common.Util#DEFAULT_UPDATE_SEED}
  • - *
  • Input Sampling Probability: 1.0
  • - *
  • Family: {@link org.apache.datasketches.common.Family#QUICKSELECT}
  • *
  • Resize Factor: The default for sketches on the Java heap is {@link ResizeFactor#X8}. * For direct sketches, which are targeted for off-heap, this value will * be fixed at either {@link ResizeFactor#X1} or {@link ResizeFactor#X2}.
  • + *
  • Family: {@link org.apache.datasketches.common.Family#QUICKSELECT}
  • + *
  • Input Sampling Probability, p: 1.0
  • + *
  • MemorySegmentRequest implementation: null
  • *
* Parameters unique to the concurrent sketches only: *
    @@ -75,19 +78,21 @@ public final class UpdateSketchBuilder { public UpdateSketchBuilder() { bLgNomLongs = Integer.numberOfTrailingZeros(ThetaUtil.DEFAULT_NOMINAL_ENTRIES); bSeed = Util.DEFAULT_UPDATE_SEED; - bP = (float) 1.0; bRF = ResizeFactor.X8; bFam = Family.QUICKSELECT; + bP = (float) 1.0; + bMemorySegmentRequest = null; + // Default values for concurrent sketch bNumPoolThreads = ConcurrentPropagationService.NUM_POOL_THREADS; - bLocalLgNomLongs = 4; //default is smallest legal QS sketch + bConCurLgNomLongs = 4; //default is smallest legal QS sketch bPropagateOrderedCompact = true; bMaxConcurrencyError = 0; bMaxNumLocalThreads = 1; } /** - * Sets the Nominal Entries for this sketch. + * Sets the local Nominal Entries for this builder. * This value is also used for building a shared concurrent sketch. * The minimum value is 16 (2^4) and the maximum value is 67,108,864 (2^26). * Be aware that sketches as large as this maximum value may not have been @@ -103,7 +108,7 @@ public UpdateSketchBuilder setNominalEntries(final int nomEntries) { } /** - * Alternative method of setting the Nominal Entries for this sketch from the log_base2 value. + * Alternative method of setting the local Nominal Entries for this builder from the log_base2 value. * This value is also used for building a shared concurrent sketch. * The minimum value is 4 and the maximum value is 26. * Be aware that sketches as large as this maximum value may not have been @@ -118,7 +123,22 @@ public UpdateSketchBuilder setLogNominalEntries(final int lgNomEntries) { } /** - * Returns Log-base 2 Nominal Entries + * Alternative method of setting the Nominal Entries for this builder from the log_base2 value, commonly called LgK. + * This value is also used for building a shared concurrent sketch. + * The minimum value is 4 and the maximum value is 26. + * Be aware that sketches as large as 26 may not have been + * thoroughly characterized for performance. + * + * @param lgK the Log Nominal Entries. 
Also for the concurrent shared sketch. + * @return this UpdateSketchBuilder + */ + public UpdateSketchBuilder setLgK(final int lgK) { + bLgNomLongs = ThetaUtil.checkNomLongs(1 << lgK); + return this; + } + + /** + * Returns the local Log-base 2 Nominal Entries * @return Log-base 2 Nominal Entries */ public int getLgNominalEntries() { @@ -126,7 +146,7 @@ public int getLgNominalEntries() { } /** - * Sets the Nominal Entries for the concurrent local sketch. The minimum value is 16 and the + * Sets the local (default) Concurrent Nominal Entries for the concurrent local sketch. The minimum value is 16 and the * maximum value is 67,108,864, which is 2^26. * Be aware that sketches as large as this maximum * value have not been thoroughly tested or characterized for performance. @@ -135,9 +155,9 @@ public int getLgNominalEntries() { * This will become the ceiling power of 2 if it is not. * @return this UpdateSketchBuilder */ - public UpdateSketchBuilder setLocalNominalEntries(final int nomEntries) { - bLocalLgNomLongs = Integer.numberOfTrailingZeros(ceilingPowerOf2(nomEntries)); - if ((bLocalLgNomLongs > ThetaUtil.MAX_LG_NOM_LONGS) || (bLocalLgNomLongs < ThetaUtil.MIN_LG_NOM_LONGS)) { + public UpdateSketchBuilder setConCurNominalEntries(final int nomEntries) { + bConCurLgNomLongs = Integer.numberOfTrailingZeros(ceilingPowerOf2(nomEntries)); + if (bConCurLgNomLongs > ThetaUtil.MAX_LG_NOM_LONGS || bConCurLgNomLongs < ThetaUtil.MIN_LG_NOM_LONGS) { throw new SketchesArgumentException( "Nominal Entries must be >= 16 and <= 67108864: " + nomEntries); } @@ -145,8 +165,7 @@ public UpdateSketchBuilder setLocalNominalEntries(final int nomEntries) { } /** - * Alternative method of setting the Nominal Entries for a local concurrent sketch from the - * log_base2 value. + * Alternative method of setting the local (default) Nominal Entries for a local concurrent sketch from the log_base2 value. * The minimum value is 4 and the maximum value is 26. 
* Be aware that sketches as large as this maximum * value have not been thoroughly tested or characterized for performance. @@ -154,9 +173,9 @@ public UpdateSketchBuilder setLocalNominalEntries(final int nomEntries) { * @param lgNomEntries the Log Nominal Entries for a concurrent local sketch * @return this UpdateSketchBuilder */ - public UpdateSketchBuilder setLocalLogNominalEntries(final int lgNomEntries) { - bLocalLgNomLongs = lgNomEntries; - if ((bLocalLgNomLongs > ThetaUtil.MAX_LG_NOM_LONGS) || (bLocalLgNomLongs < ThetaUtil.MIN_LG_NOM_LONGS)) { + public UpdateSketchBuilder setConCurLogNominalEntries(final int lgNomEntries) { + bConCurLgNomLongs = lgNomEntries; + if (bConCurLgNomLongs > ThetaUtil.MAX_LG_NOM_LONGS || bConCurLgNomLongs < ThetaUtil.MIN_LG_NOM_LONGS) { throw new SketchesArgumentException( "Log Nominal Entries must be >= 4 and <= 26: " + lgNomEntries); } @@ -164,15 +183,15 @@ public UpdateSketchBuilder setLocalLogNominalEntries(final int lgNomEntries) { } /** - * Returns Log-base 2 Nominal Entries for the concurrent local sketch + * Returns local Log-base 2 Nominal Entries for the concurrent local sketch * @return Log-base 2 Nominal Entries for the concurrent local sketch */ - public int getLocalLgNominalEntries() { - return bLocalLgNomLongs; + public int getConCurLgNominalEntries() { + return bConCurLgNomLongs; } /** - * Sets the long seed value that is required by the hashing function. + * Sets the local long seed value that is required by the hashing function. * @param seed See seed * @return this UpdateSketchBuilder */ @@ -182,7 +201,7 @@ public UpdateSketchBuilder setSeed(final long seed) { } /** - * Returns the seed + * Returns the local long seed value that is required by the hashing function. 
* @return the seed */ public long getSeed() { @@ -190,12 +209,12 @@ public long getSeed() { } /** - * Sets the upfront uniform sampling probability, p + * Sets the local upfront uniform pre-sampling probability, p * @param p See Sampling Probability, p * @return this UpdateSketchBuilder */ public UpdateSketchBuilder setP(final float p) { - if ((p <= 0.0) || (p > 1.0)) { + if (p <= 0.0 || p > 1.0) { throw new SketchesArgumentException("p must be > 0 and <= 1.0: " + p); } bP = p; @@ -203,7 +222,7 @@ public UpdateSketchBuilder setP(final float p) { } /** - * Returns the pre-sampling probability p + * Returns the local upfront uniform pre-sampling probability p * @return the pre-sampling probability p */ public float getP() { @@ -211,7 +230,7 @@ public float getP() { } /** - * Sets the cache Resize Factor. + * Sets the local cache Resize Factor. * @param rf See Resize Factor * @return this UpdateSketchBuilder */ @@ -221,7 +240,7 @@ public UpdateSketchBuilder setResizeFactor(final ResizeFactor rf) { } /** - * Returns the Resize Factor + * Returns the local Resize Factor * @return the Resize Factor */ public ResizeFactor getResizeFactor() { @@ -229,7 +248,7 @@ public ResizeFactor getResizeFactor() { } /** - * Set the Family. + * Set the local Family. Choose either Family.ALPHA or Family.QUICKSELECT. * @param family the family for this builder * @return this UpdateSketchBuilder */ @@ -239,7 +258,7 @@ public UpdateSketchBuilder setFamily(final Family family) { } /** - * Returns the Family + * Returns the local Family * @return the Family */ public Family getFamily() { @@ -247,7 +266,27 @@ public Family getFamily() { } /** - * Sets the number of pool threads used for background propagation in the concurrent sketches. 
+ * Sets the local MemorySegmentRequest + * @param mSegReq the given MemorySegmentRequest + * @return this UpdateSketchBuilder + */ + public UpdateSketchBuilder setMemorySegmentRequest(final MemorySegmentRequest mSegReq) { + bMemorySegmentRequest = mSegReq; + return this; + } + + /** + * Returns the local MemorySegmentRequest + * @return the local MemorySegmentRequest + */ + public MemorySegmentRequest getMemorySegmentRequest() { + return bMemorySegmentRequest; + } + + //Concurrent related + + /** + * Sets the local number of pool threads used for background propagation in the concurrent sketches. * @param numPoolThreads the given number of pool threads */ public void setNumPoolThreads(final int numPoolThreads) { @@ -255,7 +294,7 @@ public void setNumPoolThreads(final int numPoolThreads) { } /** - * Gets the number of background pool threads used for propagation in the concurrent sketches. + * Gets the local number of background pool threads used for propagation in the concurrent sketches. * @return the number of background pool threads */ public int getNumPoolThreads() { @@ -263,7 +302,7 @@ public int getNumPoolThreads() { } /** - * Sets the Propagate Ordered Compact flag to the given value. Used with concurrent sketches. + * Sets the local Propagate Ordered Compact flag to the given value. Used with concurrent sketches. * * @param prop the given value * @return this UpdateSketchBuilder @@ -274,7 +313,7 @@ public UpdateSketchBuilder setPropagateOrderedCompact(final boolean prop) { } /** - * Gets the Propagate Ordered Compact flag used with concurrent sketches. + * Gets the local Propagate Ordered Compact flag used with concurrent sketches. * @return the Propagate Ordered Compact flag */ public boolean getPropagateOrderedCompact() { @@ -282,7 +321,7 @@ public boolean getPropagateOrderedCompact() { } /** - * Sets the Maximum Concurrency Error. + * Sets the local Maximum Concurrency Error. * @param maxConcurrencyError the given Maximum Concurrency Error. 
*/ public void setMaxConcurrencyError(final double maxConcurrencyError) { @@ -290,7 +329,7 @@ public void setMaxConcurrencyError(final double maxConcurrencyError) { } /** - * Gets the Maximum Concurrency Error + * Gets the local Maximum Concurrency Error * @return the Maximum Concurrency Error */ public double getMaxConcurrencyError() { @@ -298,7 +337,7 @@ public double getMaxConcurrencyError() { } /** - * Sets the Maximum Number of Local Threads. + * Sets the local Maximum Number of Local Threads. * This is used to set the size of the local concurrent buffers. * @param maxNumLocalThreads the given Maximum Number of Local Threads */ @@ -307,7 +346,7 @@ public void setMaxNumLocalThreads(final int maxNumLocalThreads) { } /** - * Gets the Maximum Number of Local Threads. + * Gets the local Maximum Number of Local Threads. * @return the Maximum Number of Local Threads. */ public int getMaxNumLocalThreads() { @@ -327,12 +366,14 @@ public UpdateSketch build() { /** * Returns an UpdateSketch with the current configuration of this Builder * with the specified backing destination MemorySegment store. - * Note: this cannot be used with the Alpha Family of sketches. + * Note: this can only be used with the QUICKSELECT Family of sketches + * and cannot be used with the Alpha Family of sketches. * @param dstSeg The destination MemorySegment. 
* @return an UpdateSketch */ public UpdateSketch build(final MemorySegment dstSeg) { UpdateSketch sketch = null; + final boolean unionGadget = false; switch (bFam) { case ALPHA: { if (dstSeg == null) { @@ -345,11 +386,10 @@ public UpdateSketch build(final MemorySegment dstSeg) { } case QUICKSELECT: { if (dstSeg == null) { - sketch = new HeapQuickSelectSketch(bLgNomLongs, bSeed, bP, bRF, false); + sketch = new HeapQuickSelectSketch(bLgNomLongs, bSeed, bP, bRF, unionGadget); } else { - sketch = new DirectQuickSelectSketch( - bLgNomLongs, bSeed, bP, bRF, dstSeg, false); + sketch = new DirectQuickSelectSketch(bLgNomLongs, bSeed, bP, bRF, dstSeg, bMemorySegmentRequest, unionGadget); } break; } @@ -464,10 +504,10 @@ public UpdateSketch buildSharedFromSketch(final UpdateSketch sketch, final Memor * @return an UpdateSketch to be used as a per-thread local buffer. */ public UpdateSketch buildLocal(final UpdateSketch shared) { - if ((shared == null) || !(shared instanceof ConcurrentSharedThetaSketch)) { + if (shared == null || !(shared instanceof ConcurrentSharedThetaSketch)) { throw new SketchesStateException("The concurrent shared sketch must be built first."); } - return new ConcurrentHeapThetaBuffer(bLocalLgNomLongs, bSeed, + return new ConcurrentHeapThetaBuffer(bConCurLgNomLongs, bSeed, (ConcurrentSharedThetaSketch) shared, bPropagateOrderedCompact, bMaxNumLocalThreads); } @@ -477,8 +517,8 @@ public String toString() { sb.append("UpdateSketchBuilder configuration:").append(LS); sb.append("LgK:").append(TAB).append(bLgNomLongs).append(LS); sb.append("K:").append(TAB).append(1 << bLgNomLongs).append(LS); - sb.append("LgLocalK:").append(TAB).append(bLocalLgNomLongs).append(LS); - sb.append("LocalK:").append(TAB).append(1 << bLocalLgNomLongs).append(LS); + sb.append("LgLocalK:").append(TAB).append(bConCurLgNomLongs).append(LS); + sb.append("LocalK:").append(TAB).append(1 << bConCurLgNomLongs).append(LS); sb.append("Seed:").append(TAB).append(bSeed).append(LS); 
sb.append("p:").append(TAB).append(bP).append(LS); sb.append("ResizeFactor:").append(TAB).append(bRF).append(LS); diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java index e9a952ab4..584338469 100644 --- a/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java +++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java @@ -28,7 +28,8 @@ import org.apache.datasketches.common.Util; /** - * Wrapper around a serialized compact compressed read-only sketch. It is not empty, not a single item. + * A wrapper around a serialized compact compressed read-only sketch in the form of a byte array. + * It is not an empty nor a single item sketch. * *

    This sketch can only be associated with a Serialization Version 4 format binary image.

    */ @@ -68,7 +69,7 @@ public int getCurrentBytes() { private static final int START_PACKED_DATA_ESTIMATION_MODE = 16; @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch // number of entries is stored using variable length encoding // most significant bytes with all zeros are not stored // one byte in the preamble has the number of non-zero bytes used diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java index a5b67363f..1f3f3ab9e 100644 --- a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java @@ -37,9 +37,10 @@ import org.apache.datasketches.common.Util; /** - * Wrapper around a serialized compact read-only sketch. It is not empty, not a single item. + * A wrapper around a serialized compact read-only sketch in the form of a byte array. + * It is not an empty nor a single item sketch. * - *

    This sketch can only be associated with a Serialization Version 3 format binary image.

    + *

    This sketch can only be associated with a Serialization Version 3 binary image format.

    */ class WrappedCompactSketch extends CompactSketch { final byte[] bytes_; @@ -79,7 +80,7 @@ public int getCurrentBytes() { } @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch final int preLongs = bytes_[PREAMBLE_LONGS_BYTE]; return (preLongs == 1) ? 0 : getIntLE(bytes_, RETAINED_ENTRIES_INT); } diff --git a/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java b/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java index 4012cb412..778dc02f2 100644 --- a/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java +++ b/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java @@ -71,8 +71,7 @@ private ThetaUtil() {} * @param lgMin Log2 of the minimum allowed starting size * @return The Log2 of the starting size */ - public static int startingSubMultiple(final int lgTarget, final int lgRF, - final int lgMin) { + public static int startingSubMultiple(final int lgTarget, final int lgRF, final int lgMin) { return lgTarget <= lgMin ? lgMin : lgRF == 0 ? 
lgTarget : (lgTarget - lgMin) % lgRF + lgMin; } diff --git a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java index c264366a4..92355a77b 100644 --- a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java @@ -135,8 +135,9 @@ private QuickSelectSketch( final float samplingProbability, final SummaryFactory summaryFactory, final int startingSize) { + final long thetaLong = (long) (Long.MAX_VALUE * (double) samplingProbability); super( - (long) (Long.MAX_VALUE * (double) samplingProbability), + thetaLong, true, summaryFactory); nomEntries_ = ceilingPowerOf2(nomEntries); @@ -182,23 +183,9 @@ private QuickSelectSketch( final MemorySegment seg, final SummaryDeserializer deserializer, final SummaryFactory summaryFactory) { - this(new Validate<>(), seg, deserializer, summaryFactory); - } - - /* - * This private constructor is used to protect against "Finalizer attacks". - * The private static inner class Validate performs validation and deserialization - * from the input MemorySegment and may throw exceptions. In order to protect against the attack, we must - * perform this validation prior to the constructor's super reaches the Object class. - * Making QuickSelectSketch final won't work here because UpdatableSketch is a subclass. - * Using an empty final finalizer() is not recommended and is deprecated as of Java9. 
- */ - private QuickSelectSketch( - final Validate val, - final MemorySegment seg, - final SummaryDeserializer deserializer, - final SummaryFactory summaryFactory) { - super(val.validate(seg, deserializer), val.myEmpty, summaryFactory); + //this(new Validate<>(), seg, deserializer, summaryFactory); + final Validate val = new Validate<>(); + final long thetaLong = val.validate(seg, deserializer); nomEntries_ = val.myNomEntries; lgResizeFactor_ = val.myLgResizeFactor; samplingProbability_ = val.mySamplingProbability; @@ -207,6 +194,7 @@ private QuickSelectSketch( rebuildThreshold_ = val.myRebuildThreshold; hashTable_ = val.myHashTable; summaryTable_ = val.mySummaryTable; + super(thetaLong, val.myEmpty, summaryFactory); } private static final class Validate { diff --git a/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java index 344df137c..f32571d0b 100644 --- a/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java @@ -79,25 +79,7 @@ class DirectArrayOfDoublesQuickSelectSketch extends ArrayOfDoublesQuickSelectSke final int numValues, final long seed, final MemorySegment dstSeg) { - this(checkMemorySegment(nomEntries, lgResizeFactor, numValues, dstSeg), - //SpotBugs CT_CONSTRUCTOR_THROW is false positive. 
- //this construction scheme is compliant with SEI CERT Oracle Coding Standard for Java / OBJ11-J - nomEntries, - lgResizeFactor, - samplingProbability, - numValues, - seed, - dstSeg); - } - - private DirectArrayOfDoublesQuickSelectSketch( - @SuppressWarnings("unused") final boolean secure, //required part of Finalizer Attack prevention - final int nomEntries, - final int lgResizeFactor, - final float samplingProbability, - final int numValues, - final long seed, - final MemorySegment dstSeg) { + checkMemorySegment(nomEntries, lgResizeFactor, numValues, dstSeg); super(numValues, seed); seg_ = dstSeg; final int startingCapacity = Util.getStartingCapacity(nomEntries, lgResizeFactor); @@ -126,14 +108,13 @@ private DirectArrayOfDoublesQuickSelectSketch( setRebuildThreshold(); } - private static final boolean checkMemorySegment( + private static final void checkMemorySegment( final int nomEntries, final int lgResizeFactor, final int numValues, final MemorySegment dstSeg) { final int startingCapacity = Util.getStartingCapacity(nomEntries, lgResizeFactor); checkMemorySegmentSize(dstSeg, startingCapacity, numValues); - return true; } /** @@ -144,15 +125,7 @@ private static final boolean checkMemorySegment( DirectArrayOfDoublesQuickSelectSketch( final MemorySegment seg, final long seed) { - this(checkSerVer(seg), seg, seed); - //SpotBugs CT_CONSTRUCTOR_THROW is false positive. 
- //this construction scheme is compliant with SEI CERT Oracle Coding Standard for Java / OBJ11-J - } - - private DirectArrayOfDoublesQuickSelectSketch( - @SuppressWarnings("unused") final boolean secure, //required part of Finalizer Attack prevention - final MemorySegment seg, - final long seed) { + checkSerVer(seg); super(seg.get(JAVA_BYTE, NUM_VALUES_BYTE), seed); seg_ = seg; SerializerDeserializer.validateFamily(seg.get(JAVA_BYTE, FAMILY_ID_BYTE), @@ -170,13 +143,12 @@ private DirectArrayOfDoublesQuickSelectSketch( setRebuildThreshold(); } - private static final boolean checkSerVer(final MemorySegment seg) { + private static final void checkSerVer(final MemorySegment seg) { final byte version = seg.get(JAVA_BYTE, SERIAL_VERSION_BYTE); if (version != serialVersionUID) { throw new SketchesArgumentException("Serial version mismatch. Expected: " + serialVersionUID + ", actual: " + version); } - return true; } @Override diff --git a/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java b/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java index 7d875a7e6..f0ed246ae 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java +++ b/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java @@ -19,18 +19,15 @@ package org.apache.datasketches.kll; -import static org.apache.datasketches.kll.KllSketch.getMaxSerializedSizeBytes; -import static org.apache.datasketches.kll.KllSketch.SketchType.LONGS_SKETCH; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; -import org.apache.datasketches.common.MemorySegmentRequestExtension; +import org.apache.datasketches.common.MemorySegmentRequestExample; import org.testng.annotations.Test; - public class KllMemorySegmentRequestApp { @Test @@ -39,23 +36,24 @@ public class KllMemorySegmentRequestApp { 
* This demonstrates one example of how to manage a growing off-heap KLL sketch where the * expanded MemorySegments are also off-heap. */ - public void checkMemorySegmentRequestExtension() { + public void checkMemorySegmentRequestExample() { final int k = 200; + final int itemsIn = 10 * k; //will force requests for more space + + //Use the custom MemorySegmentRequestExample to do the allocations. + final MemorySegmentRequestExample mSegReqEx = new MemorySegmentRequestExample(); //The allocation of the original off-heap MemorySegment for the KllLongsSketch //Note that this targets the size to only handle k values, which is quite small. - final int numBytes = getMaxSerializedSizeBytes(k, k, LONGS_SKETCH, true); - final Arena arena = Arena.ofConfined(); - final MemorySegment seg = arena.allocate(numBytes); + final int numBytes = KllSketch.getMaxSerializedSizeBytes(k, k, KllSketch.SketchType.LONGS_SKETCH, true); - //Use the custom extension of the MemorySegmentRequest interface. - final MemorySegmentRequestExtension mSegReqExt = new MemorySegmentRequestExtension(); + final MemorySegment seg = mSegReqEx.request(numBytes); - //Create a new KllLongsSketch and pass the custom extension - final KllLongsSketch sk = KllLongsSketch.newDirectInstance(k, seg, mSegReqExt); + //Create a new KllLongsSketch and pass the mSegReqEx + final KllLongsSketch sk = KllLongsSketch.newDirectInstance(k, seg, mSegReqEx); //Update the sketch with way more data than the original MemorySegment can handle, forcing it to request larger MemorySegments. - for (int n = 1; n <= (10 * k); n++) { sk.update(n); } + for (int n = 1; n <= itemsIn; n++) { sk.update(n); } //Check to make sure the sketch got all the data: assertEquals(sk.getMaxItem(), 10 * k); @@ -63,13 +61,10 @@ public void checkMemorySegmentRequestExtension() { assertEquals(sk.getN(), 10 * k); //Confirm that the last MemorySegment used by the sketch is, in fact, not the same as the original one that was allocated. 
- assertFalse(sk.getMemorySegment().equals(seg)); - - //All done with the sketch. Cleanup any unclosed off-heap MemorySegments. - mSegReqExt.cleanup(); + assertTrue(sk.getMemorySegment() != seg); - //Close the original off-heap allocated MemorySegment. - arena.close(); + //All done with the sketch. Cleanup any unclosed off-heap MemorySegments including the original allocation. + mSegReqEx.cleanup(); } } diff --git a/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java b/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java index 7556d2be9..cfd4f61e1 100644 --- a/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java +++ b/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java @@ -20,12 +20,12 @@ package org.apache.datasketches.quantiles; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; -import org.apache.datasketches.common.MemorySegmentRequestExtension; +import org.apache.datasketches.common.MemorySegmentRequestExample; import org.testng.annotations.Test; public class ClassicQuantilesMemorySegmentRequestApp { @@ -36,27 +36,24 @@ public class ClassicQuantilesMemorySegmentRequestApp { * This demonstrates one example of how to manage a growing off-heap DoublesSketch where the * expanded MemorySegments are also off-heap. */ - public void checkMemorySegmentRequestExtension() { + public void checkMemorySegmentRequestExample() { final int k = 128; //The default is 128 final int itemsIn = 40 * k; //will force requests for more space + //Use the custom MemorySegmentRequestExample to do the allocations. 
+ final MemorySegmentRequestExample mSegReqEx = new MemorySegmentRequestExample(); + //The allocation of the original off-heap MemorySegment for the DoublesSketch //Note that this targets the size to only handle 2k values, which is quite small. final int initalBytes = DoublesSketch.getUpdatableStorageBytes(k, 2 * k); - final Arena arena = Arena.ofConfined(); - final MemorySegment seg = arena.allocate(initalBytes); - //Use the custom extension of the MemorySegmentRequest interface. - final MemorySegmentRequestExtension mSegReqExt = new MemorySegmentRequestExtension(); + final MemorySegment seg = mSegReqEx.request(initalBytes); - //Create a new KllLongsSketch and pass the custom extension - final DoublesSketchBuilder bldr = DoublesSketch.builder().setK(k); - final DoublesSketch sk = bldr.build(seg, mSegReqExt); + //Create a new KllLongsSketch and pass the mSegReqEx + final DoublesSketch sk = DoublesSketch.builder().setK(k).build(seg, mSegReqEx); //Update the sketch with way more data than the original MemorySegment can handle, forcing it to request larger MemorySegments. - for (int n = 1; n <= itemsIn; n++) { - sk.update(n); - } + for (int n = 1; n <= itemsIn; n++) { sk.update(n); } //Check to make sure the sketch got all the data: assertEquals(sk.getMaxItem(), itemsIn); @@ -64,15 +61,10 @@ public void checkMemorySegmentRequestExtension() { assertEquals(sk.getN(), itemsIn); //Confirm that the last MemorySegment used by the sketch is, in fact, not the same as the original one that was allocated. - assertFalse(sk.getMemorySegment().equals(seg)); + assertTrue(sk.getMemorySegment() != seg); //All done with the sketch. Cleanup any unclosed off-heap MemorySegments. - mSegReqExt.cleanup(); - - //Close the original off-heap allocated MemorySegment. 
- arena.close(); + mSegReqEx.cleanup(); } - static void println(final Object o) { System.out.println(o.toString()); } - } diff --git a/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java b/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java index df11869eb..738b46bd0 100644 --- a/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java +++ b/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java @@ -45,10 +45,6 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesException; import org.apache.datasketches.common.SketchesStateException; -import org.apache.datasketches.sampling.PreambleUtil; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.datasketches.sampling.ReservoirSize; -import org.apache.datasketches.sampling.SampleSubsetSummary; import org.testng.annotations.Test; public class ReservoirItemsSketchTest { @@ -534,61 +530,68 @@ public void checkForceIncrement() { @Test public void checkEstimateSubsetSum() { final int k = 10; - final ReservoirItemsSketch sketch = ReservoirItemsSketch.newInstance(k); - - // empty sketch -- all zeros - SampleSubsetSummary ss = sketch.estimateSubsetSum(item -> true); - assertEquals(ss.getEstimate(), 0.0); - assertEquals(ss.getTotalSketchWeight(), 0.0); - - // add items, keeping in exact mode - double itemCount = 0.0; - for (long i = 1; i <= (k - 1); ++i) { - sketch.update(i); - itemCount += 1.0; - } - - ss = sketch.estimateSubsetSum(item -> true); - assertEquals(ss.getEstimate(), itemCount); - assertEquals(ss.getLowerBound(), itemCount); - assertEquals(ss.getUpperBound(), itemCount); - assertEquals(ss.getTotalSketchWeight(), itemCount); - - // add a few more items, pushing to sampling mode - for (long i = k; i <= (k + 1); ++i) { - sketch.update(i); - itemCount += 1.0; - } - - // predicate always true so estimate == upper bound - ss = 
sketch.estimateSubsetSum(item -> true); - assertEquals(ss.getEstimate(), itemCount); - assertEquals(ss.getUpperBound(), itemCount); - assertTrue(ss.getLowerBound() < itemCount); - assertEquals(ss.getTotalSketchWeight(), itemCount); - - // predicate always false so estimate == lower bound == 0.0 - ss = sketch.estimateSubsetSum(item -> false); - assertEquals(ss.getEstimate(), 0.0); - assertEquals(ss.getLowerBound(), 0.0); - assertTrue(ss.getUpperBound() > 0.0); - assertEquals(ss.getTotalSketchWeight(), itemCount); - - // finally, a non-degenerate predicate - // insert negative items with identical weights, filter for negative weights only - for (long i = 1; i <= (k + 1); ++i) { - sketch.update(-i); - itemCount += 1.0; - } - - ss = sketch.estimateSubsetSum(item -> item < 0); - assertTrue(ss.getEstimate() >= ss.getLowerBound()); - assertTrue(ss.getEstimate() <= ss.getUpperBound()); - - // allow pretty generous bounds when testing - assertTrue(ss.getLowerBound() < (itemCount / 1.4)); - assertTrue(ss.getUpperBound() > (itemCount / 2.6)); - assertEquals(ss.getTotalSketchWeight(), itemCount); + + //trial loop for probabilistic testing + int passLB = 0; + int passUB = 0; + for (int t = 0; t < 3; t++) { + final ReservoirItemsSketch sketch = ReservoirItemsSketch.newInstance(k); + + // empty sketch -- all zeros + SampleSubsetSummary ss = sketch.estimateSubsetSum(item -> true); + assertEquals(ss.getEstimate(), 0.0); + assertEquals(ss.getTotalSketchWeight(), 0.0); + + // add items, keeping in exact mode + double itemCount = 0.0; + for (long i = 1; i <= (k - 1); ++i) { + sketch.update(i); + itemCount += 1.0; + } + + ss = sketch.estimateSubsetSum(item -> true); + assertEquals(ss.getEstimate(), itemCount); + assertEquals(ss.getLowerBound(), itemCount); + assertEquals(ss.getUpperBound(), itemCount); + assertEquals(ss.getTotalSketchWeight(), itemCount); + + // add a few more items, pushing to sampling mode + for (long i = k; i <= (k + 1); ++i) { + sketch.update(i); + itemCount += 
1.0; + } + + // predicate always true so estimate == upper bound + ss = sketch.estimateSubsetSum(item -> true); + assertEquals(ss.getEstimate(), itemCount); + assertEquals(ss.getUpperBound(), itemCount); + assertTrue(ss.getLowerBound() < itemCount); + assertEquals(ss.getTotalSketchWeight(), itemCount); + + // predicate always false so estimate == lower bound == 0.0 + ss = sketch.estimateSubsetSum(item -> false); + assertEquals(ss.getEstimate(), 0.0); + assertEquals(ss.getLowerBound(), 0.0); + assertTrue(ss.getUpperBound() > 0.0); + assertEquals(ss.getTotalSketchWeight(), itemCount); + + // finally, a non-degenerate predicate + // insert negative items with identical weights, filter for negative weights only + for (long i = 1; i <= (k + 1); ++i) { + sketch.update(-i); + itemCount += 1.0; + } + + ss = sketch.estimateSubsetSum(item -> item < 0); + assertTrue(ss.getEstimate() >= ss.getLowerBound()); + assertTrue(ss.getEstimate() <= ss.getUpperBound()); + + // allow pretty generous bounds when testing + if(ss.getLowerBound() < (itemCount / 1.4)) { passLB++; } + if(ss.getUpperBound() > (itemCount / 2.6)) { passUB++; } + assertEquals(ss.getTotalSketchWeight(), itemCount); + } //End trial loop + assertTrue(passLB >= 2 && passUB >= 2); //2 out of 3 must pass for LB and UB } private static MemorySegment getBasicSerializedLongsRIS() { diff --git a/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java b/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java index e400dd1fa..6482712e8 100644 --- a/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java +++ b/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java @@ -25,16 +25,10 @@ import static org.testng.Assert.fail; import java.lang.foreign.MemorySegment; + import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.AnotB; -import 
org.apache.datasketches.theta.AnotBimpl; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.SetOperation; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; import org.testng.annotations.Test; /** @@ -295,19 +289,18 @@ public void checkAnotBnotC_sameMemorySegment() { @Test public void checkAnotBsimple() { - final UpdateSketch skA = Sketches.updateSketchBuilder().build(); - final UpdateSketch skB = Sketches.updateSketchBuilder().build(); - final AnotB aNotB = Sketches.setOperationBuilder().buildANotB(); + final UpdateSketch skA = UpdateSketch.builder().build(); + final UpdateSketch skB =UpdateSketch.builder().build(); + final AnotB aNotB = SetOperation.builder().buildANotB(); final CompactSketch csk = aNotB.aNotB(skA, skB); assertEquals(csk.getCurrentBytes(), 8); } @Test public void checkGetResult() { - final UpdateSketch skA = Sketches.updateSketchBuilder().build(); - final UpdateSketch skB = Sketches.updateSketchBuilder().build(); - - final AnotB aNotB = Sketches.setOperationBuilder().buildANotB(); + final UpdateSketch skA = UpdateSketch.builder().build(); + final UpdateSketch skB = UpdateSketch.builder().build(); + final AnotB aNotB = SetOperation.builder().buildANotB(); final CompactSketch csk = aNotB.aNotB(skA, skB); assertEquals(csk.getCurrentBytes(), 8); } @@ -321,7 +314,7 @@ public void checkGetFamily() { @Test public void checkGetMaxBytes() { - final int bytes = Sketches.getMaxAnotBResultBytes(10); + final int bytes = SetOperation.getMaxAnotBResultBytes(10); assertEquals(bytes, 16 * 15 + 24); } diff --git a/src/test/java/org/apache/datasketches/theta/BackwardConversions.java b/src/test/java/org/apache/datasketches/theta/BackwardConversions.java deleted file mode 100644 index a0688cbba..000000000 --- a/src/test/java/org/apache/datasketches/theta/BackwardConversions.java +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Licensed to the 
Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.theta; - -import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED; -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; -import static java.lang.foreign.ValueLayout.JAVA_SHORT_UNALIGNED; - -import java.lang.foreign.MemorySegment; -import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.EmptyCompactSketch; -import org.apache.datasketches.theta.SingleItemSketch; - -/** - * This class converts current compact sketches into prior SerVer 1 and SerVer 2 format for testing. - * - * @author Lee Rhodes - */ -public class BackwardConversions { - - /** - * Converts a SerVer3 ordered, heap CompactSketch to a SerVer1 ordered, SetSketch in MemorySegment. - * This is exclusively for testing purposes. - * - *

    V1 dates from roughly Aug 2014 to about May 2015. - * The library at that time had an early Theta sketch with set operations based on ByteBuffer, - * the Alpha sketch, and an early HLL sketch. It also had an early adaptor for Pig. - * It also had code for the even earlier CountUniqueSketch (for backward compatibility), - * which was the bucket sketch based on Giroire. - * - *

    Serialization Version 1:

    - *
    -   * Long || Start Byte Adr:
    -   * Adr:
    -   *      ||  7 |   6   |     5    |   4   |   3   |    2   |    1   |     0    |
    -   *  0   ||    | Flags | LgResize | LgArr | lgNom | SkType | SerVer | MD_LONGS |
    -   *
    -   *      || 15 |  14   |    13    |  12   |  11   |   10   |    9   |     8    |
    -   *  1   ||                               | ------------CurCount-------------- |
    -   *
    -   *      || 23 |  22   |    21    |  20   |  19   |   18   |   17   |    16    |
    -   *  2   || --------------------------THETA_LONG------------------------------ |
    -   *
    -   *      ||                                                         |    24    |
    -   *  3   || ----------------------Start of Long Array------------------------  |
    -   * 
    - * - *
      - *
    • The serialization for V1 was always to a compact form (no hash table spaces).
    • - *
    • MD_LONGS (Metadata Longs, now Preamble Longs) was always 3.
    • - *
    • SerVer is always 1.
    • - *
    • The SkType had three values: 1,2,3 for Alpha, QuickSelect, and SetSketch, - * respectively.
    • - *
    • Bytes lgNom and lgArr were only used by the QS and Alpha sketches.
    • - *
    • V1 LgResize (2 bits) was only relevant to the Alpha and QS sketches.
    • - *
    • The flags byte is in byte 6 (moved to 5 in V2).
    • - *
    • The only flag bits are BE(bit0)=0, and Read-Only(bit1)=1. Read-only was only set for the - * SetSketch.
    • - *
    • There is no seedHash.
    • - *
    • There is no concept of p-sampling so bytes 12-15 of Pre1 are empty.
    • - *
    • The determination of empty is when both curCount=0 and thetaLong = Long.MAX_VALUE.
    • - *
    - * - * @param skV3 a SerVer3, ordered CompactSketch - * @return a SerVer1 SetSketch as MemorySegment object. - */ - public static MemorySegment convertSerVer3toSerVer1(final CompactSketch skV3) { - //Check input sketch - final boolean validIn = skV3.isCompact() && skV3.isOrdered() && !skV3.hasMemorySegment(); - if (!validIn) { - throw new SketchesArgumentException("Invalid input sketch."); - } - - //Build V1 SetSketch in MemorySegment - final int curCount = skV3.getRetainedEntries(true); - final int bytes = (3 + curCount) << 3; - final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]);//Util.newHeapSegment(bytes); - //Pre0 - wseg.set(JAVA_BYTE, 0, (byte) 3); //preLongs - wseg.set(JAVA_BYTE, 1, (byte) 1); //SerVer - wseg.set(JAVA_BYTE, 2, (byte) 3); //Compact (SetSketch) - wseg.set(JAVA_BYTE, 6, (byte) 2); //Flags ReadOnly, LittleEndian - //Pre1 - wseg.set(JAVA_INT_UNALIGNED, 8, curCount); - //Pre2 - wseg.set(JAVA_LONG_UNALIGNED, 16, skV3.getThetaLong()); - //Data - if (curCount > 0) { - MemorySegment.copy(skV3.getCache(), 0, wseg, JAVA_LONG_UNALIGNED, 24, curCount); - } - return wseg; - } - - /** - * Converts a SerVer3 ordered, heap CompactSketch to a SerVer2 ordered, SetSketch in MemorySegment. - * This is exclusively for testing purposes. - * - *

    V2 is short-lived and dates from roughly Mid May 2015 to about June 1st, 2015. - * (V3 was created about June 15th in preparation for OpenSource in July.) - * The Theta sketch had evolved but still based on ByteBuffer. There was an UpdateSketch, - * the Alpha sketch, and the early HLL sketch. It also had an early adaptor for Pig. - * - * - *

    Serialization Version 2:

    - *
    -   * Long || Start Byte Adr:
    -   * Adr:
    -   *      ||  7 |   6   |     5    |   4   |   3   |    2   |    1   |     0         |
    -   *  0   || Seed Hash  |  Flags   | lgArr | lgNom | SkType | SerVer | MD_LONGS + RR |
    -   *
    -   *      || 15 |  14   |    13    |  12   |  11   |   10   |    9   |     8         |
    -   *  1   || --------------p-------------- | ---------Retained Entries Count-------- |
    -   *
    -   *      || 23 |  22   |    21    |  20   |  19   |   18   |   17   |    16         |
    -   *  2   || --------------------------THETA_LONG----------------------------------- |
    -   *
    -   *      ||                                                         |    24         |
    -   *  3   || ----------Start of Long Array, could be at 2 or 3 --------------------  |
    -   *  
    - * - *
      - *
    • The serialization for V2 was always to a compact form (no hash table spaces).
    • - *
    • MD_LONGS low 6 bits: 1 (Empty), 2 (Exact), 3 (Estimating).
    • - *
    • SerVer is always 2.
    • - *
    • The SkType had 4 values: 1,2,3,4; see below.
    • - *
    • Bytes lgNom and lgArr were only used by the QS and Alpha sketches.
    • - *
    • V2 LgResize top 2 bits if byte 0. Only relevant to the Alpha and QS sketches.
    • - *
    • The flags byte is in byte 5.
    • - *
    • The flag bits are specified below.
    • - *
    • There is a seedHash in bytes 6-7.
    • - *
    • p-sampling is bytes 12-15 of Pre1.
    • - *
    • The determination of empty based on the sketch field empty_.
    • - *
    - *
    -   *   // Metadata byte Addresses
    -   *   private static final int METADATA_LONGS_BYTE        = 0; //low 6 bits
    -   *   private static final int LG_RESIZE_RATIO_BYTE       = 0; //upper 2 bits
    -   *   private static final int SER_VER_BYTE               = 1;
    -   *   private static final int SKETCH_TYPE_BYTE           = 2;
    -   *   private static final int LG_NOM_LONGS_BYTE          = 3;
    -   *   private static final int LG_ARR_LONGS_BYTE          = 4;
    -   *   private static final int FLAGS_BYTE                 = 5;
    -   *   private static final int SEED_HASH_SHORT            = 6;  //byte 6,7
    -   *   private static final int RETAINED_ENTRIES_COUNT_INT = 8;  //4 byte aligned
    -   *   private static final int P_FLOAT                    = 12; //4 byte aligned
    -   *   private static final int THETA_LONG                 = 16; //8-byte aligned
    -   *   //Backward compatibility
    -   *   private static final int FLAGS_BYTE_V1              = 6;
    -   *   private static final int LG_RESIZE_RATIO_BYTE_V1    = 5;
    -   *
    -   *   // Constant Values
    -   *   static final int SER_VER                        = 2;
    -   *   static final int ALPHA_SKETCH                   = 1; //SKETCH_TYPE_BYTE
    -   *   static final int QUICK_SELECT_SKETCH            = 2;
    -   *   static final int SET_SKETCH                     = 3;
    -   *   static final int BUFFERED_QUICK_SELECT_SKETCH   = 4;
    -   *   static final String[] SKETCH_TYPE_STR     =
    -   *       { "None", "AlphaSketch", "QuickSelectSketch", "SetSketch", "BufferedQuickSelectSketch" };
    -   *
    -   *   // flag bit masks
    -   *   static final int BIG_ENDIAN_FLAG_MASK     = 1;
    -   *   static final int READ_ONLY_FLAG_MASK      = 2;
    -   *   static final int EMPTY_FLAG_MASK          = 4;
    -   *   static final int NO_REBUILD_FLAG_MASK     = 8;
    -   *   static final int UNORDERED_FLAG_MASK     = 16;
    -   * 
    - * - * @param skV3 a SerVer3, ordered CompactSketch - * @param seed used for checking the seed hash (if one exists). - * @return a SerVer2 SetSketch as MemorySegment object. - */ - public static MemorySegment convertSerVer3toSerVer2(final CompactSketch skV3, final long seed) { - final short seedHash = Util.computeSeedHash(seed); - MemorySegment wseg = null; - - if (skV3 instanceof EmptyCompactSketch) { - wseg = MemorySegment.ofArray(new long[1]); - wseg.set(JAVA_BYTE, 0, (byte) 1); //preLongs - wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer - wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch - final byte flags = (byte) 0xE; //NoRebuild, Empty, ReadOnly, LE - wseg.set(JAVA_BYTE, 5, flags); - wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash); - return wseg; - } - if (skV3 instanceof SingleItemSketch) { - final SingleItemSketch sis = (SingleItemSketch) skV3; - wseg = MemorySegment.ofArray(new long[3]); - wseg.set(JAVA_BYTE, 0, (byte) 2); //preLongs - wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer - wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch - final byte flags = (byte) 0xA; //NoRebuild, notEmpty, ReadOnly, LE - wseg.set(JAVA_BYTE, 5, flags); - wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash); - wseg.set(JAVA_INT_UNALIGNED, 8, 1); - final long[] arr = sis.getCache(); - wseg.set(JAVA_LONG_UNALIGNED, 16, arr[0]); - return wseg; - } - //General CompactSketch - final int preLongs = skV3.getCompactPreambleLongs(); - final int entries = skV3.getRetainedEntries(true); - final boolean unordered = !(skV3.isOrdered()); - final byte flags = (byte) (0xA | (unordered ? 
16 : 0)); //Unordered, NoRebuild, notEmpty, ReadOnly, LE - wseg = MemorySegment.ofArray(new byte[(preLongs + entries) << 3]); - wseg.set(JAVA_BYTE, 0, (byte) preLongs); //preLongs - wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer - wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch - - wseg.set(JAVA_BYTE, 5, flags); - wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash); - wseg.set(JAVA_INT_UNALIGNED, 8, entries); - if (preLongs == 3) { - wseg.set(JAVA_LONG_UNALIGNED, 16, skV3.getThetaLong()); - } - final long[] arr = skV3.getCache(); - MemorySegment.copy(arr, 0, wseg, JAVA_LONG_UNALIGNED, preLongs << 3, entries); - return wseg; - } -} diff --git a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java index 6d9c173a0..fc35891b3 100644 --- a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java @@ -26,25 +26,13 @@ import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; +import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; + import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.DirectCompactSketch; -import org.apache.datasketches.theta.EmptyCompactSketch; -import org.apache.datasketches.theta.HashIterator; -import org.apache.datasketches.theta.HeapCompactSketch; -import org.apache.datasketches.theta.Intersection; -import org.apache.datasketches.theta.SingleItemSketch; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; -import org.apache.datasketches.theta.WrappedCompactCompressedSketch; -import org.apache.datasketches.theta.WrappedCompactSketch; import org.testng.annotations.Test; -import java.lang.foreign.Arena; - /** * @author Lee 
Rhodes */ @@ -186,7 +174,7 @@ private static void checkOtherCompactSketch(final Sketch testSk, final Sketch re @Test public void checkDirectSingleItemSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); final int bytes = sk.getCompactBytes(); final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); @@ -230,7 +218,7 @@ public void checkSegTooSmallOrdered() { @Test public void checkCompactCachePart() { //phony values except for curCount = 0. - final long[] result = Intersection.compactCachePart(null, 4, 0, 0L, false); + final long[] result = IntersectionImpl.compactCachePart(null, 4, 0, 0L, false); assertEquals(result.length, 0); } @@ -250,7 +238,7 @@ public void checkCompactCachePart() { * Empty, segment-based Compact sketches are always ordered */ public void checkEmptyMemorySegmentCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final MemorySegment wseg1 = MemorySegment.ofArray(new byte[16]); final CompactSketch csk1 = sk.compact(false, wseg1); //the first parameter is ignored when empty @@ -290,7 +278,7 @@ public void checkEmptyMemorySegmentCompactSketch() { * Single-Item, segment-based Compact sketches are always ordered: */ public void checkSingleItemMemorySegmentCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); final MemorySegment wseg1 = MemorySegment.ofArray(new byte[16]); @@ -321,7 +309,7 @@ public void checkSingleItemMemorySegmentCompactSketch() { @Test public void checkMultipleItemMemorySegmentCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); //This sequence is naturally out-of-order by the hash values. 
sk.update(1); sk.update(2); @@ -360,7 +348,7 @@ public void checkMultipleItemMemorySegmentCompactSketch() { * All empty, heap-based, compact sketches point to the same static, final constant of 8 bytes. */ public void checkEmptyHeapCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final CompactSketch csk1 = sk.compact(false, null); //the first parameter is ignored when empty final State state1 = new State("EmptyCompactSketch", 0, 8, COMPACT, EMPTY, !DIRECT, !SEGMENT, ORDERED, !ESTIMATION); @@ -390,7 +378,7 @@ public void checkEmptyHeapCompactSketch() { * Single-Item, heap-based Compact sketches are always ordered. */ public void checkSingleItemHeapCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); final CompactSketch csk1 = sk.compact(false, null); //the first parameter is ignored when single item @@ -418,7 +406,7 @@ public void checkSingleItemHeapCompactSketch() { @Test public void checkMultipleItemHeapCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); //This sequence is naturally out-of-order by the hash values. 
sk.update(1); sk.update(2); @@ -453,9 +441,9 @@ public void checkMultipleItemHeapCompactSketch() { @Test public void checkHeapifySingleItemSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); - final int bytes = Sketches.getMaxCompactSketchBytes(2); //1 more than needed + final int bytes = Sketch.getMaxCompactSketchBytes(2); //1 more than needed final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); sk.compact(false, wseg); final Sketch csk = Sketch.heapify(wseg); @@ -464,7 +452,7 @@ public void checkHeapifySingleItemSketch() { @Test public void checkHeapifyEmptySketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final MemorySegment wseg = MemorySegment.ofArray(new byte[16]); //empty, but extra bytes final CompactSketch csk = sk.compact(false, wseg); //ignores order because it is empty assertTrue(csk instanceof DirectCompactSketch); @@ -474,7 +462,7 @@ public void checkHeapifyEmptySketch() { @Test public void checkGetCache() { - final UpdateSketch sk = Sketches.updateSketchBuilder().setP((float).5).build(); + final UpdateSketch sk = UpdateSketch.builder().setP((float).5).build(); sk.update(7); final int bytes = sk.getCompactBytes(); final CompactSketch csk = sk.compact(true, MemorySegment.ofArray(new byte[bytes])); @@ -484,7 +472,7 @@ public void checkGetCache() { @Test public void checkHeapCompactSketchCompact() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); sk.update(2); final CompactSketch csk = sk.compact(); @@ -506,7 +494,7 @@ public void checkDirectCompactSketchCompact() { final int lgK = 6; //empty - final UpdateSketch sk = Sketches.updateSketchBuilder().setLogNominalEntries(lgK).build(); + final UpdateSketch sk = UpdateSketch.builder().setLogNominalEntries(lgK).build(); bytes = 
sk.getCompactBytes(); //empty, 8 bytes wseg1 = MemorySegment.ofArray(new byte[bytes]); wseg2 = MemorySegment.ofArray(new byte[bytes]); @@ -566,7 +554,7 @@ public void checkDirectCompactSketchCompact() { @Test public void serializeDeserializeHeapV4() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -582,8 +570,8 @@ public void serializeDeserializeHeapV4() { } @Test - public void serializeDeserializeDirectV4() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + public void serializeDeserializeDirectV4_segment() { + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -598,9 +586,27 @@ public void serializeDeserializeDirectV4() { } } + @Test + public void serializeDeserializeDirectV4_bytes() { + final UpdateSketch sk = UpdateSketch.builder().build(); + for (int i = 0; i < 10000; i++) { + sk.update(i); + } + final CompactSketch cs1 = sk.compact(true, MemorySegment.ofArray(new byte[sk.getCompactBytes()])); + final byte[] bytes = cs1.toByteArrayCompressed(); + final CompactSketch cs2 = CompactSketch.wrap(bytes); + assertEquals(cs1.getRetainedEntries(), cs2.getRetainedEntries()); + final HashIterator it1 = cs1.iterator(); + final HashIterator it2 = cs2.iterator(); + while (it1.next() && it2.next()) { + assertEquals(it2.get(), it2.get()); + } + } + + @Test public void serializeWrapBytesV3() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -618,7 +624,7 @@ public void serializeWrapBytesV3() { @Test public void serializeWrapBytesV4() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } diff --git 
a/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java index 4a59edb97..e0816b0e5 100644 --- a/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java @@ -33,15 +33,6 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.ConcurrentDirectQuickSelectSketch; -import org.apache.datasketches.theta.ConcurrentHeapThetaBuffer; -import org.apache.datasketches.theta.ConcurrentSharedThetaSketch; -import org.apache.datasketches.theta.DirectQuickSelectSketch; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; -import org.apache.datasketches.theta.UpdateSketchBuilder; import org.apache.datasketches.theta.ConcurrentHeapQuickSelectSketchTest.SharedLocal; import org.apache.datasketches.thetacommon.HashOperations; import org.testng.annotations.Test; @@ -79,7 +70,7 @@ public void checkHeapifyMemorySegmentEstimating() { assertEquals(local.getClass().getSimpleName(), "ConcurrentHeapThetaBuffer"); //This sharedHeap is not linked to the concurrent local buffer - final UpdateSketch sharedHeap = Sketches.heapifyUpdateSketch(sl.wseg); + final UpdateSketch sharedHeap = UpdateSketch.heapify(sl.wseg); assertEquals(sharedHeap.getClass().getSimpleName(), "HeapQuickSelectSketch"); checkMemorySegmentDirectProxyMethods(local, shared); @@ -242,7 +233,7 @@ public void checkDQStoCompactForms() { assertEquals(csk.getClass().getSimpleName(), "HeapCompactSketch"); final int bytes = shared.getCompactBytes(); - assertEquals(bytes, (k*8) + (Family.COMPACT.getMaxPreLongs() << 3)); 
+ assertEquals(bytes, k*8 + (Family.COMPACT.getMaxPreLongs() << 3)); final byte[] segArr2 = new byte[bytes]; final MemorySegment seg2 = MemorySegment.ofArray(segArr2); @@ -462,7 +453,7 @@ public void checkEstModeMemorySegmentArr() { waitForBgPropagationToComplete(shared); final double est = local.getEstimate(); - assertTrue((est < (u * 1.05)) && (est > (u * 0.95))); + assertTrue(est < u * 1.05 && est > u * 0.95); assertTrue(shared.getRetainedEntries(false) >= k); } @@ -480,7 +471,7 @@ public void checkEstModeNativeMemorySegment() { for (int i = 0; i< u; i++) { local.update(i); } waitForBgPropagationToComplete(shared); final double est = local.getEstimate(); - assertTrue((est < (u * 1.05)) && (est > (u * 0.95))); + assertTrue(est < u * 1.05 && est > u * 0.95); assertTrue(shared.getRetainedEntries(false) >= k); } @@ -501,7 +492,7 @@ public void checkConstructReconstructFromMemorySegment() { final double est1 = local.getEstimate(); final int count1 = shared.getRetainedEntries(false); - assertTrue((est1 < (u * 1.05)) && (est1 > (u * 0.95))); + assertTrue(est1 < u * 1.05 && est1 > u * 0.95); assertTrue(count1 >= k); byte[] serArr; @@ -509,7 +500,7 @@ public void checkConstructReconstructFromMemorySegment() { serArr = shared.toByteArray(); final MemorySegment seg = MemorySegment.ofArray(serArr); - final UpdateSketch recoveredShared = Sketches.wrapUpdateSketch(seg); + final UpdateSketch recoveredShared = UpdateSketch.wrap(seg); //reconstruct to Native/Direct final int bytes = Sketch.getMaxUpdateSketchBytes(k); @@ -576,7 +567,7 @@ public void checkBadLgNomLongs() { final boolean useSeg = true; final SharedLocal sl = new SharedLocal(lgK, lgK, useSeg); sl.wseg.set(JAVA_BYTE, LG_NOM_LONGS_BYTE, (byte) 3); //Corrupt LgNomLongs byte - DirectQuickSelectSketch.writableWrap(sl.wseg, Util.DEFAULT_UPDATE_SEED); + DirectQuickSelectSketch.writableWrap(sl.wseg, null, Util.DEFAULT_UPDATE_SEED); } @Test @@ -607,7 +598,7 @@ public void checkBackgroundPropagation() { final long theta2 = 
((ConcurrentSharedThetaSketch)shared).getVolatileTheta(); final int entries = shared.getRetainedEntries(false); - assertTrue((entries > k) || (theta2 < theta1), + assertTrue(entries > k || theta2 < theta1, "entries="+entries+" k="+k+" theta1="+theta1+" theta2="+theta2); shared.rebuild(); @@ -658,7 +649,7 @@ public void checkWrapIllegalFamilyID_direct() { sl.wseg.set(JAVA_BYTE, FAMILY_BYTE, (byte) 0); //corrupt the Sketch ID byte //try to wrap the corrupted seg - DirectQuickSelectSketch.writableWrap(sl.wseg, Util.DEFAULT_UPDATE_SEED); + DirectQuickSelectSketch.writableWrap(sl.wseg, null, Util.DEFAULT_UPDATE_SEED); } @Test(expectedExceptions = SketchesArgumentException.class) diff --git a/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java index 565ef50ed..c354fd344 100644 --- a/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java @@ -34,17 +34,6 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.ConcurrentHeapQuickSelectSketch; -import org.apache.datasketches.theta.ConcurrentHeapThetaBuffer; -import org.apache.datasketches.theta.ConcurrentPropagationService; -import org.apache.datasketches.theta.ConcurrentSharedThetaSketch; -import org.apache.datasketches.theta.HeapQuickSelectSketch; -import org.apache.datasketches.theta.PreambleUtil; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; -import org.apache.datasketches.theta.UpdateSketchBuilder; import org.testng.annotations.Test; /** @@ -90,7 +79,7 @@ public void checkPropagationNotOrdered() 
{ final SharedLocal sl = new SharedLocal(lgK, 4, false, false); final UpdateSketch shared = sl.shared; final UpdateSketch local = sl.local; - assertEquals((sl.bldr.getLocalLgNominalEntries()), 4); + assertEquals((sl.bldr.getConCurLgNominalEntries()), 4); assertTrue(local.isEmpty()); for (int i = 0; i < u; i++) { @@ -173,7 +162,7 @@ public void checkHeapifyByteArrayExact() { final byte[] serArr = shared.toByteArray(); final MemorySegment srcSeg = MemorySegment.ofArray(serArr).asReadOnly(); - final Sketch recoveredShared = Sketches.heapifyUpdateSketch(srcSeg); + final Sketch recoveredShared = UpdateSketch.heapify(srcSeg); //reconstruct to Native/Direct final int bytes = Sketch.getMaxUpdateSketchBytes(k); @@ -494,7 +483,7 @@ public void checkRebuild() { public void checkBuilder() { final int lgK = 4; final SharedLocal sl = new SharedLocal(lgK); - assertEquals(sl.bldr.getLocalLgNominalEntries(), lgK); + assertEquals(sl.bldr.getConCurLgNominalEntries(), lgK); assertEquals(sl.bldr.getLgNominalEntries(), lgK); println(sl.bldr.toString()); } @@ -652,11 +641,11 @@ public void checkBuilderExceptions() { fail(); } catch (final SketchesArgumentException e) { } try { - bldr.setLocalNominalEntries(8); + bldr.setConCurNominalEntries(8); fail(); } catch (final SketchesArgumentException e) { } try { - bldr.setLocalLogNominalEntries(3); + bldr.setConCurLogNominalEntries(3); fail(); } catch (final SketchesArgumentException e) { } bldr.setNumPoolThreads(4); @@ -731,7 +720,7 @@ static class SharedLocal { wseg = null; } bldr.setLogNominalEntries(sharedLgK); - bldr.setLocalLogNominalEntries(localLgK); + bldr.setConCurLogNominalEntries(localLgK); bldr.setPropagateOrderedCompact(ordered); bldr.setSeed(this.seed); shared = bldr.buildShared(wseg); diff --git a/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java b/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java index e812ab8f2..59b6396b7 100644 --- 
a/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java +++ b/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java @@ -31,19 +31,12 @@ import static org.testng.Assert.fail; import java.lang.foreign.MemorySegment; + import org.apache.datasketches.common.Family; -import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.Intersection; -import org.apache.datasketches.theta.IntersectionImpl; -import org.apache.datasketches.theta.PreambleUtil; -import org.apache.datasketches.theta.SetOperation; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.Union; -import org.apache.datasketches.theta.UpdateSketch; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesReadOnlyException; import org.apache.datasketches.common.SketchesStateException; +import org.apache.datasketches.common.Util; import org.testng.annotations.Test; /** @@ -471,7 +464,7 @@ public void checkWrapVirginEmpty() { MemorySegment iSeg = MemorySegment.ofArray(new byte[segBytes]); inter1 = SetOperation.builder().buildIntersection(iSeg); //virgin off-heap - inter2 = Sketches.wrapIntersection(iSeg); //virgin off-heap, identical to inter1 + inter2 = Intersection.wrap(iSeg); //virgin off-heap, identical to inter1 //both in virgin state, empty = false //note: both inter1 and inter2 are tied to the same MemorySegment, // so an intersect to one also affects the other. Don't do what I do! 
@@ -493,7 +486,7 @@ public void checkWrapVirginEmpty() { //test the path via toByteArray, now in a different state iSeg = MemorySegment.ofArray(inter1.toByteArray()); - inter2 = Sketches.wrapIntersection(iSeg); + inter2 = Intersection.wrap(iSeg); assertTrue(inter2.hasResult()); //still true //test the compaction path @@ -514,7 +507,7 @@ public void checkWrapNullEmpty2() { final MemorySegment iSeg = MemorySegment.ofArray(segArr); inter1 = SetOperation.builder().buildIntersection(iSeg); //virgin - inter2 = Sketches.wrapIntersection(iSeg); + inter2 = Intersection.wrap(iSeg); //both in virgin state, empty = false assertFalse(inter1.hasResult()); assertFalse(inter2.hasResult()); @@ -525,7 +518,7 @@ public void checkWrapNullEmpty2() { //remains empty = false. inter1.intersect(sk1); - inter2 = Sketches.wrapIntersection(iSeg); + inter2 = Intersection.wrap(iSeg); assertTrue(inter1.hasResult()); assertTrue(inter2.hasResult()); final CompactSketch comp = inter2.getResult(true, null); @@ -579,7 +572,7 @@ public void checkBadPreambleLongs() { final MemorySegment seg = MemorySegment.ofArray(byteArray); //corrupt: seg.set(JAVA_BYTE, PREAMBLE_LONGS_BYTE, (byte) 2);//RF not used = 0 - Sketches.wrapIntersection(seg); + Intersection.wrap(seg); } @Test(expectedExceptions = SketchesArgumentException.class) @@ -596,18 +589,19 @@ public void checkBadSerVer() { final MemorySegment seg = MemorySegment.ofArray(byteArray); //corrupt: seg.set(JAVA_BYTE, SER_VER_BYTE, (byte) 2); - Sketches.wrapIntersection(seg); //throws in SetOperations + Intersection.wrap(seg); //throws in SetOperations } - @Test(expectedExceptions = ClassCastException.class) - public void checkFamilyID() { + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkIncorrectWrap() { final int k = 32; Union union; union = SetOperation.builder().setNominalEntries(k).buildUnion(); final byte[] byteArray = union.toByteArray(); final MemorySegment seg = MemorySegment.ofArray(byteArray); - 
Sketches.wrapIntersection(seg); + Intersection.wrap(seg); //wrong sketch Family + //Sketches.wrapIntersection(seg); } @Test @@ -637,7 +631,7 @@ public void checkWrap() { final byte[] segArr2 = inter.toByteArray(); final MemorySegment srcSeg = MemorySegment.ofArray(segArr2); - inter2 = Sketches.wrapIntersection(srcSeg); + inter2 = Intersection.wrap(srcSeg); //2nd call = valid intersecting sk2 = UpdateSketch.builder().setNominalEntries(k).build(); @@ -656,7 +650,7 @@ public void checkWrap() { final byte[] segArr3 = inter2.toByteArray(); final MemorySegment srcSeg2 = MemorySegment.ofArray(segArr3); - inter3 = Sketches.wrapIntersection(srcSeg2); + inter3 = Intersection.wrap(srcSeg2); resultComp2 = inter3.getResult(false, null); est2 = resultComp2.getEstimate(); println("Est2: "+est2); @@ -683,13 +677,13 @@ public void checkExceptionMinSize() { @Test public void checkGetResult() { final int k = 1024; - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final int segBytes = getMaxIntersectionBytes(k); final byte[] segArr = new byte[segBytes]; final MemorySegment iSeg = MemorySegment.ofArray(segArr); - final Intersection inter = Sketches.setOperationBuilder().buildIntersection(iSeg); + final Intersection inter = SetOperation.builder().buildIntersection(iSeg); inter.intersect(sk); final CompactSketch csk = inter.getResult(); assertEquals(csk.getCompactBytes(), 8); @@ -732,8 +726,8 @@ public void checkExceptions2() { public void checkOverlappedDirect() { final int k = 1 << 4; final int segBytes = 2*k*16 +PREBYTES; //plenty of room - final UpdateSketch sk1 = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - final UpdateSketch sk2 = Sketches.updateSketchBuilder().setNominalEntries(k).build(); + final UpdateSketch sk1 = UpdateSketch.builder().setNominalEntries(k).build(); + final UpdateSketch sk2 = UpdateSketch.builder().setNominalEntries(k).build(); for (int i=0; i>> 1); //corrupt theta and 
seg1.set(JAVA_BYTE, LG_ARR_LONGS_BYTE, (byte) 10); //corrupt lgArrLongs try { - usk2 = DirectQuickSelectSketch.writableWrap(seg1, Util.DEFAULT_UPDATE_SEED); + usk2 = DirectQuickSelectSketch.writableWrap(seg1, null, Util.DEFAULT_UPDATE_SEED); fail("Expected SketchesArgumentException"); } catch (final SketchesArgumentException e) { //pass @@ -803,7 +803,7 @@ public void checkConstructorSrcSegCorruptions() { final byte badFlags = (byte) (COMPACT_FLAG_MASK | READ_ONLY_FLAG_MASK | ORDERED_FLAG_MASK); seg1.set(JAVA_BYTE, FLAGS_BYTE, badFlags); try { - usk2 = DirectQuickSelectSketch.writableWrap(seg1, Util.DEFAULT_UPDATE_SEED); + usk2 = DirectQuickSelectSketch.writableWrap(seg1, null, Util.DEFAULT_UPDATE_SEED); fail("Expected SketchesArgumentException"); } catch (final SketchesArgumentException e) { //pass @@ -812,7 +812,7 @@ public void checkConstructorSrcSegCorruptions() { final byte[] arr2 = Arrays.copyOfRange(arr1, 0, bytes-1); //corrupt length final MemorySegment seg2 = MemorySegment.ofArray(arr2); try { - usk2 = DirectQuickSelectSketch.writableWrap(seg2, Util.DEFAULT_UPDATE_SEED); + usk2 = DirectQuickSelectSketch.writableWrap(seg2, null, Util.DEFAULT_UPDATE_SEED); fail("Expected SketchesArgumentException"); } catch (final SketchesArgumentException e) { //pass @@ -823,7 +823,7 @@ public void checkConstructorSrcSegCorruptions() { public void checkCorruptRFWithInsufficientArray() { final int k = 1024; //lgNomLongs = 10 - final int bytes = Sketches.getMaxUpdateSketchBytes(k); + final int bytes = Sketch.getMaxUpdateSketchBytes(k); final byte[] arr = new byte[bytes]; final MemorySegment seg = MemorySegment.ofArray(arr); final ResizeFactor rf = ResizeFactor.X8; // 3 @@ -831,15 +831,15 @@ public void checkCorruptRFWithInsufficientArray() { usk.update(0); insertLgResizeFactor(seg, 0); // corrupt RF: X1 - final UpdateSketch dqss = DirectQuickSelectSketch.writableWrap(seg, Util.DEFAULT_UPDATE_SEED); + final UpdateSketch dqss = DirectQuickSelectSketch.writableWrap(seg, null, 
Util.DEFAULT_UPDATE_SEED); assertEquals(dqss.getResizeFactor(), ResizeFactor.X2); // force-promote to X2 } @Test public void checkFamilyAndRF() { final int k = 16; - final MemorySegment seg = MemorySegment.ofArray(new byte[(k*16) + 24]); - final UpdateSketch sketch = Sketches.updateSketchBuilder().setNominalEntries(k).build(seg); + final MemorySegment seg = MemorySegment.ofArray(new byte[k*16 + 24]); + final UpdateSketch sketch = UpdateSketch.builder().setNominalEntries(k).build(seg); assertEquals(sketch.getFamily(), Family.QUICKSELECT); assertEquals(sketch.getResizeFactor(), ResizeFactor.X8); } @@ -849,29 +849,29 @@ public void checkFamilyAndRF() { public void checkResizeInBigSeg() { final int k = 1 << 14; final int u = 1 << 20; - final MemorySegment seg = MemorySegment.ofArray(new byte[(8*k*16) +24]); - final UpdateSketch sketch = Sketches.updateSketchBuilder().setNominalEntries(k).build(seg); + final MemorySegment seg = MemorySegment.ofArray(new byte[8*k*16 +24]); + final UpdateSketch sketch = UpdateSketch.builder().setNominalEntries(k).build(seg); for (int i=0; i previous); + previous = it.get(); + } + } + } + + @Test(groups = {CHECK_CPP_FILES}) + public void deserializeFromCppCompressedSegment() throws IOException { final int[] nArr = {10, 100, 1000, 10000, 100000, 1000000}; - for (int n: nArr) { + for (final int n: nArr) { final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_compressed_n" + n + "_cpp.sk")); final CompactSketch sketch = CompactSketch.wrap(MemorySegment.ofArray(bytes)); assertTrue(n == 0 ? 
sketch.isEmpty() : !sketch.isEmpty()); @@ -114,11 +133,38 @@ public void deserializeFromCppCompressed() throws IOException { } @Test(groups = {CHECK_CPP_FILES}) - public void deserializeFromCppNonEmptyNoEntries() throws IOException { + public void deserializeFromCppCompressedBytes() throws IOException { + final int[] nArr = {10, 100, 1000, 10000, 100000, 1000000}; + for (final int n: nArr) { + final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_compressed_n" + n + "_cpp.sk")); + final CompactSketch sketch = CompactSketch.wrap(bytes); + assertTrue(n == 0 ? sketch.isEmpty() : !sketch.isEmpty()); + assertEquals(sketch.getEstimate(), n, n * 0.03); + assertTrue(sketch.isOrdered()); + final HashIterator it = sketch.iterator(); + long previous = 0; + while (it.next()) { + assertTrue(it.get() < sketch.getThetaLong()); + assertTrue(it.get() > previous); + previous = it.get(); + } + } + } + + @Test(groups = {CHECK_CPP_FILES}) + public void deserializeFromCppNonEmptyNoEntriesSegment() throws IOException { final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_non_empty_no_entries_cpp.sk")); final CompactSketch sketch = CompactSketch.wrap(MemorySegment.ofArray(bytes)); assertFalse(sketch.isEmpty()); assertEquals(sketch.getRetainedEntries(), 0); } + @Test(groups = {CHECK_CPP_FILES}) + public void deserializeFromCppNonEmptyNoEntriesBytes() throws IOException { + final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_non_empty_no_entries_cpp.sk")); + final CompactSketch sketch = CompactSketch.wrap(bytes); + assertFalse(sketch.isEmpty()); + assertEquals(sketch.getRetainedEntries(), 0); + } + } diff --git a/src/test/java/org/apache/datasketches/theta/UnionImplTest.java b/src/test/java/org/apache/datasketches/theta/UnionImplTest.java index f212b6547..1cc7c76e7 100644 --- a/src/test/java/org/apache/datasketches/theta/UnionImplTest.java +++ b/src/test/java/org/apache/datasketches/theta/UnionImplTest.java @@ -20,8 +20,6 @@ package 
org.apache.datasketches.theta; import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer1; -import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer2; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; @@ -39,7 +37,7 @@ public class UnionImplTest { @Test public void checkGetCurrentAndMaxBytes() { final int lgK = 10; - final Union union = Sketches.setOperationBuilder().setLogNominalEntries(lgK).buildUnion(); + final Union union = SetOperation.builder().setLogNominalEntries(lgK).buildUnion(); assertEquals(union.getCurrentBytes(), 288); assertEquals(union.getMaxUnionBytes(), 16416); } @@ -49,13 +47,13 @@ public void checkUpdateWithSketch() { final int k = 16; final MemorySegment seg = MemorySegment.ofArray(new byte[(k*8) + 24]); final MemorySegment seg2 = MemorySegment.ofArray(new byte[(k*8) + 24]); - final UpdateSketch sketch = Sketches.updateSketchBuilder().setNominalEntries(k).build(); + final UpdateSketch sketch = UpdateSketch.builder().setNominalEntries(k).build(); for (int i=0; i 0) && !(((lgT - lgA) % lgR) == 0); - boolean rf0 = (lgR == 0) && (lgA != lgT); - assertTrue((lgRbad == rf0) || (lgRbad == rf123)); + final boolean lgRbad = isResizeFactorIncorrect(wseg, lgK, lgA); + final boolean rf123 = lgR > 0 && (lgT - lgA) % lgR != 0; + final boolean rf0 = lgR == 0 && lgA != lgT; + assertTrue(lgRbad == rf0 || lgRbad == rf123); } } } @@ -211,12 +210,12 @@ public void checkIsResizeFactorIncorrect() { public void checkCompactOpsMemorySegmentToCompact() { MemorySegment skwseg, cskwseg1, cskwseg2, cskwseg3; CompactSketch csk1, csk2, csk3; - int lgK = 6; - UpdateSketch sk = Sketches.updateSketchBuilder().setLogNominalEntries(lgK).build(); - int n = 1 << (lgK + 1); + final int lgK = 6; + final UpdateSketch sk = UpdateSketch.builder().setLogNominalEntries(lgK).build(); + final 
int n = 1 << (lgK + 1); for (int i = 2; i < n; i++) { sk.update(i); } - int cbytes = sk.getCompactBytes(); - byte[] byteArr = sk.toByteArray(); + final int cbytes = sk.getCompactBytes(); + final byte[] byteArr = sk.toByteArray(); skwseg = MemorySegment.ofArray(byteArr); cskwseg1 = MemorySegment.ofArray(new byte[cbytes]); cskwseg2 = MemorySegment.ofArray(new byte[cbytes]); @@ -236,7 +235,7 @@ public void printlnTest() { /** * @param s value to print */ - static void println(String s) { + static void println(final String s) { //System.out.println(s); //disable here } } diff --git a/src/test/java/org/apache/datasketches/thetacommon/BoundsOnRatiosInThetaSketchedSetsTest.java b/src/test/java/org/apache/datasketches/thetacommon/BoundsOnRatiosInThetaSketchedSetsTest.java index ed8833f7a..c2fe86b03 100644 --- a/src/test/java/org/apache/datasketches/thetacommon/BoundsOnRatiosInThetaSketchedSetsTest.java +++ b/src/test/java/org/apache/datasketches/thetacommon/BoundsOnRatiosInThetaSketchedSetsTest.java @@ -25,22 +25,21 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.theta.CompactSketch; import org.apache.datasketches.theta.Intersection; -import org.apache.datasketches.theta.Sketches; +import org.apache.datasketches.theta.SetOperation; import org.apache.datasketches.theta.UpdateSketch; -import org.apache.datasketches.thetacommon.BoundsOnRatiosInThetaSketchedSets; import org.testng.annotations.Test; public class BoundsOnRatiosInThetaSketchedSetsTest { @Test public void checkNormalReturns() { - final UpdateSketch skA = Sketches.updateSketchBuilder().build(); //4K - final UpdateSketch skC = Sketches.updateSketchBuilder().build(); + final UpdateSketch skA = UpdateSketch.builder().build(); //4K + final UpdateSketch skC = UpdateSketch.builder().build(); final int uA = 10000; final int uC = 100000; for (int i = 0; i < uA; i++) { skA.update(i); } for (int i = 0; i < uC; i++) { skC.update(i + (uA / 2)); } - final Intersection inter = 
Sketches.setOperationBuilder().buildIntersection(); + final Intersection inter = SetOperation.builder().buildIntersection();//SetOperation.builder().buildIntersection(); inter.intersect(skA); inter.intersect(skC); final CompactSketch skB = inter.getResult(); @@ -72,8 +71,8 @@ public void checkNormalReturns() { @Test(expectedExceptions = SketchesArgumentException.class) public void checkAbnormalReturns() { - final UpdateSketch skA = Sketches.updateSketchBuilder().build(); //4K - final UpdateSketch skC = Sketches.updateSketchBuilder().build(); + final UpdateSketch skA = UpdateSketch.builder().build(); // 4K + final UpdateSketch skC = UpdateSketch.builder().build(); final int uA = 100000; final int uC = 10000; for (int i = 0; i < uA; i++) { skA.update(i); }