The given dstSeg is checked for the required capacity as determined by * {@link #getMaxUpdatableSerializationBytes(int, TgtHllType)}. * @param srcWseg an writable image of a valid source sketch with data. - * @return an HllSketch where the sketch data is in the given dstSeg. + * @return an HllSketch where the sketch data is in the given srcWseg. */ public static final HllSketch writableWrap(final MemorySegment srcWseg) { if (srcWseg.isReadOnly()) { return wrap(srcWseg); } @@ -251,7 +251,7 @@ static final HllSketch writableWrap( final MemorySegment srcWseg, final boolean checkBounds(0, 8, srcWseg.byteSize()); //need min 8 bytes if (extractCompactFlag(srcWseg)) { throw new SketchesArgumentException( - "Cannot perform a writableWrap of a writable sketch image that is in compact form. " + "Cannot perform a writableWrap of a sketch image that is in compact form. " + "Compact sketches are by definition immutable."); } final int lgConfigK = extractLgK(srcWseg); diff --git a/src/main/java/org/apache/datasketches/hll/TgtHllType.java b/src/main/java/org/apache/datasketches/hll/TgtHllType.java index b7f8d45ad..78aaafd1b 100644 --- a/src/main/java/org/apache/datasketches/hll/TgtHllType.java +++ b/src/main/java/org/apache/datasketches/hll/TgtHllType.java @@ -60,7 +60,7 @@ public enum TgtHllType { */ HLL_6, /** - * An Hll Sketch with a bin size of 8 bits + * An HLL Sketch with a bin size of 8 bits */ HLL_8; diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index 45a6ae8bb..0e4b48794 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -144,9 +144,7 @@ public static KllDoublesSketch heapify(final MemorySegment srcSeg) { * @return an instance of this sketch that wraps the given MemorySegment. 
*/ public static KllDoublesSketch wrap(final MemorySegment srcSeg) { - Objects.requireNonNull(srcSeg, "Parameter 'srcSeg' must not be null"); - final KllMemorySegmentValidate segVal = new KllMemorySegmentValidate(srcSeg, DOUBLES_SKETCH); - return new KllDirectDoublesSketch(srcSeg, segVal, null); + return wrap(srcSeg, null); } /** @@ -386,6 +384,8 @@ else if (weight < levelsArr[0]) { /** * Vector update. Updates this sketch with the given array (vector) of items, starting at the items * offset for a length number of items. This is not supported for direct sketches. + *
Note: a single occurrence of a NaN in the array will force this method to use the conventional update path + * rather than the fast update path.
* @param items the vector of items * @param offset the starting index of the items[] array * @param length the number of items diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java index 35c8711ed..d41abb891 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java @@ -144,9 +144,7 @@ public static KllFloatsSketch heapify(final MemorySegment srcSeg) { * @return an instance of this sketch that wraps the given MemorySegment. */ public static KllFloatsSketch wrap(final MemorySegment srcSeg) { - Objects.requireNonNull(srcSeg, "Parameter 'srcSeg' must not be null"); - final KllMemorySegmentValidate segVal = new KllMemorySegmentValidate(srcSeg, FLOATS_SKETCH); - return new KllDirectFloatsSketch(srcSeg, segVal, null); + return wrap(srcSeg, null); } /** @@ -386,6 +384,8 @@ else if (weight < levelsArr[0]) { /** * Vector update. Updates this sketch with the given array (vector) of items, starting at the items * offset for a length number of items. This is not supported for direct sketches. + *Note: a single occurrence of a NaN in the array will force this method to use the conventional update path + * rather than the fast update path.
* @param items the vector of items * @param offset the starting index of the items[] array * @param length the number of items diff --git a/src/main/java/org/apache/datasketches/kll/KllHelper.java b/src/main/java/org/apache/datasketches/kll/KllHelper.java index 3d784972f..73bfb5283 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllHelper.java @@ -357,7 +357,7 @@ static MemorySegment memorySegmentSpaceMgmt( if (mSegReq == null) { mSegReq = MemorySegmentRequest.DEFAULT; } - final MemorySegment newSeg = mSegReq.request(oldWseg, requiredSketchBytes); + final MemorySegment newSeg = mSegReq.request(requiredSketchBytes); MemorySegment.copy(oldWseg, 0, newSeg, 0, DATA_START_ADR); //copy preamble (first 20 bytes) mSegReq.requestClose(oldWseg); return newSeg; diff --git a/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java b/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java index c5ada70b4..6acf93799 100644 --- a/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java @@ -144,9 +144,7 @@ public static KllLongsSketch heapify(final MemorySegment srcSeg) { * @return an instance of this sketch that wraps the given MemorySegment. */ public static KllLongsSketch wrap(final MemorySegment srcSeg) { - Objects.requireNonNull(srcSeg, "Parameter 'srcSeg' must not be null"); - final KllMemorySegmentValidate segVal = new KllMemorySegmentValidate(srcSeg, LONGS_SKETCH); - return new KllDirectLongsSketch(srcSeg, segVal, null); + return wrap(srcSeg, null); } /** @@ -364,7 +362,7 @@ final void updateMinMax(final long item) { /** * Weighted update. Updates this sketch with the given item the number of times specified by the given integer weight. - * @param item the item to be repeated. NaNs are ignored. + * @param item the item to be repeated. * @param weight the number of times the update of item is to be repeated. 
It must be ≥ one. */ public void update(final long item, final long weight) { @@ -409,6 +407,8 @@ public void update(final long[] items, final int offset, final int length) { + + */ private void updateLong(final long[] srcItems, final int srcOffset, final int length) { if (isEmpty()) { diff --git a/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java b/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java index 4976039d5..168b80b16 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java @@ -358,7 +358,7 @@ private MemorySegment growCombinedSegBuffer(final int itemSpaceNeeded) { mSegReq_ = (mSegReq_ == null) ? MemorySegmentRequest.DEFAULT : mSegReq_; - final MemorySegment newSeg = mSegReq_.request(seg_, needBytes); + final MemorySegment newSeg = mSegReq_.request(needBytes); MemorySegment.copy(seg_, 0, newSeg, 0, segBytes); mSegReq_.requestClose(seg_); return newSeg; diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java index f4991d658..41355f63d 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java @@ -44,26 +44,16 @@ abstract class DoublesSketchAccessor extends DoublesBufferAccessor { final DoublesSketch ds, final boolean forceSize, final int level) { - this(checkLvl(level), ds, forceSize, level); - //SpotBugs CT_CONSTRUCTOR_THROW is false positive. 
- //this construction scheme is compliant with SEI CERT Oracle Coding Standard for Java / OBJ11-J - } - - private DoublesSketchAccessor( - @SuppressWarnings("unused") final boolean secure, //required part of Finalizer Attack prevention - final DoublesSketch ds, - final boolean forceSize, - final int level) { + checkLvl(level); ds_ = ds; forceSize_ = forceSize; setLevel(level); } - private static final boolean checkLvl(final int level) { + private static final void checkLvl(final int level) { if ((level != BB_LVL_IDX) && (level < 0)) { throw new SketchesArgumentException("Parameter level is < 0."); } - return true; } /** diff --git a/src/main/java/org/apache/datasketches/theta/BitPacking.java b/src/main/java/org/apache/datasketches/theta/BitPacking.java index cd7dfe1c9..fb8aa0619 100644 --- a/src/main/java/org/apache/datasketches/theta/BitPacking.java +++ b/src/main/java/org/apache/datasketches/theta/BitPacking.java @@ -24,7 +24,7 @@ /** * Used as part of Theta compression. */ -public final class BitPacking { +final class BitPacking { private BitPacking() { } @@ -36,7 +36,7 @@ private BitPacking() { } * @param bufOffset the byte offset in the buffer * @param bitOffset the bit offset */ - public static void packBits(final long value, int bits, final byte[] buffer, int bufOffset, final int bitOffset) { + static void packBits(final long value, int bits, final byte[] buffer, int bufOffset, final int bitOffset) { if (bitOffset > 0) { final int chunkBits = 8 - bitOffset; final int mask = (1 << chunkBits) - 1; @@ -65,7 +65,7 @@ public static void packBits(final long value, int bits, final byte[] buffer, int * @param bufOffset the buffer offset * @param bitOffset the bit offset */ - public static void unpackBits(final long[] value, final int index, int bits, final byte[] buffer, + static void unpackBits(final long[] value, final int index, int bits, final byte[] buffer, int bufOffset,final int bitOffset) { final int availBits = 8 - bitOffset; final int chunkBits = availBits 
<= bits ? availBits : bits; diff --git a/src/main/java/org/apache/datasketches/theta/CompactOperations.java b/src/main/java/org/apache/datasketches/theta/CompactOperations.java index 926600638..9fb917b24 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactOperations.java +++ b/src/main/java/org/apache/datasketches/theta/CompactOperations.java @@ -29,11 +29,11 @@ import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.SER_VER; import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; @@ -122,7 +122,7 @@ static CompactSketch segmentToCompact( final MemorySegment dstWSeg) { //extract Pre0 fields and Flags from srcMem - final int srcPreLongs = extractPreLongs(srcSeg); + final int srcPreLongs = checkSegPreambleCap(srcSeg); final int srcSerVer = extractSerVer(srcSeg); //not used final int srcFamId = extractFamilyID(srcSeg); final int srcLgArrLongs = extractLgArrLongs(srcSeg); @@ -137,7 +137,7 @@ static CompactSketch segmentToCompact( final boolean srcSingleFlag = (srcFlags & SINGLEITEM_FLAG_MASK) > 0; final boolean single = srcSingleFlag - || SingleItemSketch.otherCheckForSingleItem(srcPreLongs, srcSerVer, srcFamId, srcFlags); + || SingleItemSketch.checkForSingleItem(srcPreLongs, srcSerVer, srcFamId, 
srcFlags); //extract pre1 and pre2 fields final int curCount = single ? 1 : (srcPreLongs > 1) ? extractCurCount(srcSeg) : 0; @@ -319,12 +319,12 @@ static long[] compactCache(final long[] srcCache, final int curCount, * This is checked in all compacting operations. * 7 <1.0 !0 F OK This corresponds to a sketch in estimation mode * - * #4 is handled by correctThetaOnCompat(boolean, int) (below). + * #4 is handled by correctThetaOnCompact(boolean, int) (below). * #2 & #6 handled by checkIllegalCurCountAndEmpty(boolean, int) */ /** - * This corrects a temporary anomalous condition where compact() is called on an UpdateSketch + * This corrects a temporary anomalous condition where compact() or toByteArray() is called on an UpdateSketch * that was initialized with p < 1.0 and update() was never called. In this case Theta < 1.0, * curCount = 0, and empty = true. The correction is to change Theta to 1.0, which makes the * returning sketch empty. This should only be used in the compaction or serialization of an @@ -347,8 +347,8 @@ static long correctThetaOnCompact(final boolean empty, final int curCount, * @param curCount the given current count */ //This handles #2 and #6 above static void checkIllegalCurCountAndEmpty(final boolean empty, final int curCount) { - if (empty && (curCount != 0)) { //this handles #2 and #6 above - throw new SketchesStateException("Illegal State: Empty=true and Current Count != 0."); + if (empty && curCount != 0) { //this handles #2 and #6 above + throw new SketchesStateException("Possible corruption. 
Illegal State: Empty=true and Current Count != 0."); } } diff --git a/src/main/java/org/apache/datasketches/theta/CompactSketch.java b/src/main/java/org/apache/datasketches/theta/CompactSketch.java index edd55165c..aaa751af0 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/CompactSketch.java @@ -22,7 +22,6 @@ import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static java.lang.foreign.ValueLayout.JAVA_SHORT_UNALIGNED; -import static org.apache.datasketches.common.ByteArrayUtil.getShortLE; import static org.apache.datasketches.common.Family.idToFamily; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; @@ -30,17 +29,15 @@ import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.SEED_HASH_SHORT; import static org.apache.datasketches.theta.PreambleUtil.extractEntryBitsV4; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractNumEntriesBytesV4; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLongV4; import static org.apache.datasketches.theta.PreambleUtil.wholeBytesToHoldBits; -import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem; +import static 
org.apache.datasketches.theta.SingleItemSketch.checkForSingleItem; import java.lang.foreign.MemorySegment; @@ -69,16 +66,13 @@ public abstract class CompactSketch extends Sketch { *The resulting sketch will not retain any link to the source MemorySegment and all of its data will be * copied to the heap CompactSketch.
* - *This method assumes that the sketch image was created with the correct hash seed, so it is not checked. - * The resulting on-heap CompactSketch will be given the seedHash derived from the given sketch image. - * However, Serial Version 1 sketch images do not have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the DEFAULT_UPDATE_SEED.
 + * <p>This method checks if the {@link Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} was used to create the source MemorySegment image.</p>
* * @param srcSeg an image of a CompactSketch. * @return a CompactSketch on the heap. */ public static CompactSketch heapify(final MemorySegment srcSeg) { - return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED, false); + return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -87,9 +81,7 @@ public static CompactSketch heapify(final MemorySegment srcSeg) { *The resulting sketch will not retain any link to the source MemorySegment and all of its data will be * copied to the heap CompactSketch.
* - *This method checks if the given expectedSeed was used to create the source MemorySegment image. - * However, SerialVersion 1 sketch images cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.
 + * <p>This method checks if the given expectedSeed was used to create the source MemorySegment image.</p>
* * @param srcSeg an image of a CompactSketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. @@ -97,10 +89,6 @@ public static CompactSketch heapify(final MemorySegment srcSeg) { * @return a CompactSketch on the heap. */ public static CompactSketch heapify(final MemorySegment srcSeg, final long expectedSeed) { - return heapify(srcSeg, expectedSeed, true); - } - - private static CompactSketch heapify(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { final int serVer = extractSerVer(srcSeg); final int familyID = extractFamilyID(srcSeg); final Family family = idToFamily(familyID); @@ -108,25 +96,18 @@ private static CompactSketch heapify(final MemorySegment srcSeg, final long seed throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } if (serVer == 4) { - return heapifyV4(srcSeg, seed, enforceSeed); + return heapifyV4(srcSeg, expectedSeed); } if (serVer == 3) { final int flags = extractFlags(srcSeg); final boolean srcOrdered = (flags & ORDERED_FLAG_MASK) != 0; final boolean empty = (flags & EMPTY_FLAG_MASK) != 0; - if (enforceSeed && !empty) { PreambleUtil.checkSegmentSeedHash(srcSeg, seed); } + if (!empty) { PreambleUtil.checkSegmentSeedHash(srcSeg, expectedSeed); } return CompactOperations.segmentToCompact(srcSeg, srcOrdered, null); } - //not SerVer 3, assume compact stored form - final short seedHash = Util.computeSeedHash(seed); - if (serVer == 1) { - return ForwardCompatibility.heapify1to3(srcSeg, seedHash); - } - if (serVer == 2) { - return ForwardCompatibility.heapify2to3(srcSeg, - enforceSeed ? 
seedHash : (short) extractSeedHash(srcSeg)); - } - throw new SketchesArgumentException("Unknown Serialization Version: " + serVer); + //not SerVer 3 or 4 + throw new SketchesArgumentException( + "Corrupted: Serialization Version " + serVer + " not recognized."); } /** @@ -134,24 +115,17 @@ private static CompactSketch heapify(final MemorySegment srcSeg, final long seed * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".
- * *Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *This method assumes that the sketch image was created with the correct hash seed, so it is not checked. - * However, Serial Version 1 sketch images do not have a seedHash field, - * so the resulting on-heap CompactSketch will be given the hash of the DEFAULT_UPDATE_SEED.
 + * <p>This method checks if the {@link Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} was used to create the source MemorySegment image.</p>
* * @param srcSeg an image of a Sketch. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given MemorySegment. */ public static CompactSketch wrap(final MemorySegment srcSeg) { - return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED, false); + return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -159,47 +133,33 @@ public static CompactSketch wrap(final MemorySegment srcSeg) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".
- * *Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *This method checks if the given expectedSeed was used to create the source MemorySegment image. - * However, SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.
 + * <p>This method checks if the given expectedSeed was used to create the source MemorySegment image.</p>
* * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given MemorySegment. */ public static CompactSketch wrap(final MemorySegment srcSeg, final long expectedSeed) { - return wrap(srcSeg, expectedSeed, true); - } - - private static CompactSketch wrap(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { final int serVer = extractSerVer(srcSeg); final int familyID = extractFamilyID(srcSeg); final Family family = Family.idToFamily(familyID); if (family != Family.COMPACT) { throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } - final short seedHash = Util.computeSeedHash(seed); + final short seedHash = Util.computeSeedHash(expectedSeed); - if (serVer == 4) { - return DirectCompactCompressedSketch.wrapInstance(srcSeg, - enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); - } - else if (serVer == 3) { + + if (serVer == 3) { if (PreambleUtil.isEmptyFlag(srcSeg)) { return EmptyCompactSketch.getHeapInstance(srcSeg); } - if (otherCheckForSingleItem(srcSeg)) { - return SingleItemSketch.heapify(srcSeg, enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); + if (checkForSingleItem(srcSeg)) { + return SingleItemSketch.heapify(srcSeg, seedHash); } //not empty & not singleItem final int flags = extractFlags(srcSeg); @@ -213,91 +173,72 @@ else if (serVer == 3) { throw new SketchesArgumentException( "Corrupted: COMPACT family sketch image must have Read-Only flag set"); } - return DirectCompactSketch.wrapInstance(srcSeg, - enforceSeed ? 
seedHash : (short) extractSeedHash(srcSeg)); - } //end of serVer 3 - else if (serVer == 1) { - return ForwardCompatibility.heapify1to3(srcSeg, seedHash); + return DirectCompactSketch.wrapInstance(srcSeg, seedHash); } - else if (serVer == 2) { - return ForwardCompatibility.heapify2to3(srcSeg, - enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); + if (serVer == 4) { + return DirectCompactCompressedSketch.wrapInstance(srcSeg, seedHash); } + //not SerVer 3 or 4 throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); + "Corrupted: Serialization Version " + serVer + " not recognized."); } /** - * Wrap takes the sketch image in the given MemorySegment and refers to it directly. + * Wrap takes the sketch image in the given byte array and refers to it directly. * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".
 + * <p>Only sketches that have been explicitly stored as direct sketches can be wrapped.</p>
* *Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *This method checks if the DEFAULT_UPDATE_SEED was used to create the source MemorySegment image. - * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of DEFAULT_UPDATE_SEED.
 + * <p>This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image.</p>
* * @param bytes a byte array image of a Sketch that was created using the DEFAULT_UPDATE_SEED. * - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes) { - return wrap(bytes, Util.DEFAULT_UPDATE_SEED, false); + return wrap(bytes, Util.DEFAULT_UPDATE_SEED); } /** - * Wrap takes the sketch image in the given MemorySegment and refers to it directly. + * Wrap takes the sketch image in the given byte array and refers to it directly. * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".
 + * <p>Only sketches that have been explicitly stored as direct sketches can be wrapped.</p>
* *Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *This method checks if the given expectedSeed was used to create the source MemorySegment image. - * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.
 + * <p>This method checks if the given expectedSeed was used to create the source byte array image.</p>
* * @param bytes a byte array image of a Sketch that was created using the given expectedSeed. - * @param expectedSeed the seed used to validate the given MemorySegment image. + * @param expectedSeed the seed used to validate the given byte array image. * See Update Hash Seed. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes, final long expectedSeed) { - return wrap(bytes, expectedSeed, true); - } - - private static CompactSketch wrap(final byte[] bytes, final long seed, final boolean enforceSeed) { final int serVer = bytes[PreambleUtil.SER_VER_BYTE]; final int familyId = bytes[PreambleUtil.FAMILY_BYTE]; final Family family = Family.idToFamily(familyId); if (family != Family.COMPACT) { throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } - final short seedHash = Util.computeSeedHash(seed); - if (serVer == 4) { - return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); - } else if (serVer == 3) { + final short seedHash = Util.computeSeedHash(expectedSeed); + + if (serVer == 3) { final int flags = bytes[FLAGS_BYTE]; if ((flags & EMPTY_FLAG_MASK) > 0) { return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); } final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; - if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { - return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? 
seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + if (checkForSingleItem(preLongs, serVer, familyId, flags)) { + return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), seedHash); } //not empty & not singleItem final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; @@ -310,16 +251,14 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo throw new SketchesArgumentException( "Corrupted: COMPACT family sketch image must have Read-Only flag set"); } - return WrappedCompactSketch.wrapInstance(bytes, - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); - } else if (serVer == 1) { - return ForwardCompatibility.heapify1to3(MemorySegment.ofArray(bytes), seedHash); - } else if (serVer == 2) { - return ForwardCompatibility.heapify2to3(MemorySegment.ofArray(bytes), - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + return WrappedCompactSketch.wrapInstance(bytes, seedHash); } + if (serVer ==4) { + return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); + } + //not SerVer 3 or 4 throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); + "Corrupted: Serialization Version " + serVer + " not recognized."); } //Sketch Overrides @@ -446,12 +385,12 @@ private byte[] toByteArrayV4() { return bytes; } - private static CompactSketch heapifyV4(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { - final int preLongs = extractPreLongs(srcSeg); + private static CompactSketch heapifyV4(final MemorySegment srcSeg, final long seed) { + final int preLongs = Sketch.getPreambleLongs(srcSeg); final int entryBits = extractEntryBitsV4(srcSeg); final int numEntriesBytes = extractNumEntriesBytesV4(srcSeg); final short seedHash = (short) extractSeedHash(srcSeg); - if (enforceSeed) { PreambleUtil.checkSegmentSeedHash(srcSeg, seed); } + PreambleUtil.checkSegmentSeedHash(srcSeg, seed); int offsetBytes = 8; long theta = Long.MAX_VALUE; if (preLongs > 1) { 
diff --git a/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java index 6e7cad3c5..b70fdda36 100644 --- a/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java @@ -75,7 +75,9 @@ final class ConcurrentDirectQuickSelectSketch extends DirectQuickSelectSketch final double maxConcurrencyError, final MemorySegment dstSeg) { super(lgNomLongs, seed, 1.0F, //p ResizeFactor.X1, //rf, - dstSeg, false); //unionGadget + dstSeg, + null, + false); //unionGadget volatileThetaLong_ = Long.MAX_VALUE; volatileEstimate_ = 0; @@ -91,6 +93,7 @@ final class ConcurrentDirectQuickSelectSketch extends DirectQuickSelectSketch super(sketch.getLgNomLongs(), seed, 1.0F, //p ResizeFactor.X1, //rf, dstSeg, + null, false); //unionGadget exactLimit_ = ConcurrentSharedThetaSketch.computeExactLimit(1L << getLgNomLongs(), @@ -115,7 +118,7 @@ public double getEstimate() { @Override public boolean isEstimationMode() { - return (getRetainedEntries(false) > exactLimit_) || super.isEstimationMode(); + return getRetainedEntries(false) > exactLimit_ || super.isEstimationMode(); } @Override @@ -164,7 +167,7 @@ public long getExactLimit() { @Override public boolean startEagerPropagation() { while (!sharedPropagationInProgress_.compareAndSet(false, true)) { /* busy wait till free */ } - return (!isEstimationMode());// no eager propagation is allowed in estimation mode + return !isEstimationMode();// no eager propagation is allowed in estimation mode } @Override @@ -206,8 +209,8 @@ public void initBgPropagationService() { public boolean propagate(final AtomicBoolean localPropagationInProgress, final Sketch sketchIn, final long singleHash) { final long epoch = epoch_; - if ((singleHash != NOT_SINGLE_HASH) // namely, is a single hash and - && (getRetainedEntries(false) < exactLimit_)) 
{ // a small sketch then propagate myself (blocking) + if (singleHash != NOT_SINGLE_HASH // namely, is a single hash and + && getRetainedEntries(false) < exactLimit_) { // a small sketch then propagate myself (blocking) if (!startEagerPropagation()) { endPropagation(localPropagationInProgress, true); return false; diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java index 2bf154215..4a3b80839 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java @@ -22,7 +22,6 @@ import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static org.apache.datasketches.theta.PreambleUtil.extractEntryBitsV4; import static org.apache.datasketches.theta.PreambleUtil.extractNumEntriesBytesV4; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLongV4; import static org.apache.datasketches.theta.PreambleUtil.wholeBytesToHoldBits; @@ -70,12 +69,12 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstSe MemorySegment.copy(seg_, 0, dstSeg, 0, getCurrentBytes()); return new DirectCompactSketch(dstSeg); } - return CompactSketch.heapify(seg_); + return CompactSketch.heapify(seg_, Util.DEFAULT_UPDATE_SEED); } @Override public int getCurrentBytes() { - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); final int entryBits = extractEntryBitsV4(seg_); final int numEntriesBytes = extractNumEntriesBytesV4(seg_); return preLongs * Long.BYTES + numEntriesBytes + wholeBytesToHoldBits(getRetainedEntries() * entryBits); @@ -85,11 +84,11 @@ public int getCurrentBytes() { private static final int 
START_PACKED_DATA_ESTIMATION_MODE = 16; @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch // number of entries is stored using variable length encoding // most significant bytes with all zeros are not stored // one byte in the preamble has the number of non-zero bytes used - final int preLongs = extractPreLongs(seg_); // if > 1 then the second long has theta + final int preLongs = Sketch.getPreambleLongs(seg_); // if > 1 then the second long has theta final int numEntriesBytes = extractNumEntriesBytesV4(seg_); int offsetBytes = preLongs > 1 ? START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE; int numEntries = 0; @@ -101,7 +100,7 @@ public int getRetainedEntries(final boolean valid) { //compact is always valid @Override public long getThetaLong() { - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs > 1) ? extractThetaLongV4(seg_) : Long.MAX_VALUE; } @@ -119,7 +118,7 @@ public boolean isOrdered() { public HashIterator iterator() { return new MemorySegmentCompactCompressedHashIterator( seg_, - (extractPreLongs(seg_) > 1 ? START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE) + (Sketch.getPreambleLongs(seg_) > 1 ? 
START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE) + extractNumEntriesBytesV4(seg_), extractEntryBitsV4(seg_), getRetainedEntries() diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java index 2fcbf08d6..f393dc5b8 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java @@ -26,10 +26,9 @@ import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; -import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem; +import static org.apache.datasketches.theta.SingleItemSketch.checkForSingleItem; import java.lang.foreign.MemorySegment; @@ -53,7 +52,7 @@ class DirectCompactSketch extends CompactSketch { /** * Construct this sketch with the given MemorySegment. - * @param seg Read-only MemorySegment object with the order bit properly set. + * @param seg (optional) Read-only MemorySegment object. */ DirectCompactSketch(final MemorySegment seg) { seg_ = seg; @@ -81,22 +80,22 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstSe @Override public int getCurrentBytes() { - if (otherCheckForSingleItem(seg_)) { return 16; } - final int preLongs = extractPreLongs(seg_); + if (checkForSingleItem(seg_)) { return 16; } + final int preLongs = Sketch.getPreambleLongs(seg_); final int curCount = (preLongs == 1) ? 
0 : extractCurCount(seg_); return (preLongs + curCount) << 3; } @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid - if (otherCheckForSingleItem(seg_)) { return 1; } - final int preLongs = extractPreLongs(seg_); + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch + if (checkForSingleItem(seg_)) { return 1; } + final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs == 1) ? 0 : extractCurCount(seg_); } @Override public long getThetaLong() { - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs > 2) ? extractThetaLong(seg_) : Long.MAX_VALUE; } @@ -147,8 +146,8 @@ public byte[] toByteArray() { @Override long[] getCache() { - if (otherCheckForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; } - final int preLongs = extractPreLongs(seg_); + if (checkForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; } + final int preLongs = Sketch.getPreambleLongs(seg_); final int curCount = (preLongs == 1) ? 
0 : extractCurCount(seg_); if (curCount > 0) { final long[] cache = new long[curCount]; @@ -160,12 +159,12 @@ long[] getCache() { @Override int getCompactPreambleLongs() { - return extractPreLongs(seg_); + return Sketch.getPreambleLongs(seg_); } @Override int getCurrentPreambleLongs() { - return extractPreLongs(seg_); + return Sketch.getPreambleLongs(seg_); } @Override diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java index 45b01edba..723b6cc75 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java @@ -33,8 +33,7 @@ import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; -import static org.apache.datasketches.theta.PreambleUtil.getSegBytes; +import static org.apache.datasketches.theta.PreambleUtil.getUpdatableSegBytes; import static org.apache.datasketches.theta.PreambleUtil.insertCurCount; import static org.apache.datasketches.theta.PreambleUtil.insertFamilyID; import static org.apache.datasketches.theta.PreambleUtil.insertFlags; @@ -60,8 +59,10 @@ import java.lang.foreign.MemorySegment; import org.apache.datasketches.common.Family; +import org.apache.datasketches.common.MemorySegmentRequest; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.common.SuppressFBWarnings; import org.apache.datasketches.common.Util; import org.apache.datasketches.thetacommon.HashOperations; import org.apache.datasketches.thetacommon.ThetaUtil; @@ -78,15 +79,27 @@ * @author Kevin Lang */ class DirectQuickSelectSketch extends 
DirectQuickSelectSketchR { + private static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space + int hashTableThreshold_; //computed and mutable, kept only on heap, never serialized. + private final MemorySegmentRequest mSegReq; + /** + * Construct this sketch as a result of a wrap operation where the given MemorySegment already has an updatable sketch image. + * @param wseg the given MemorySegment that has an updatable sketch image. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. + * @param seed See Update Hash Seed. + */ private DirectQuickSelectSketch( - final long seed, - final MemorySegment wseg) { - super(seed, wseg); + final MemorySegment wseg, + final MemorySegmentRequest mSegReq, + final long seed) { + this.mSegReq = mSegReq == null ? MemorySegmentRequest.DEFAULT : mSegReq; + super(wseg, seed); } /** * Construct a new sketch instance and initialize the given MemorySegment as its backing store. + * This is only called internally by other theta sketch classes. * * @param lgNomLongs See lgNomLongs. * @param seed See Update Hash Seed. @@ -96,6 +109,7 @@ private DirectQuickSelectSketch( * See Resize Factor * @param dstSeg the given MemorySegment object destination. It cannot be null. * It will be cleared prior to use. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. * @param unionGadget true if this sketch is implementing the Union gadget function. * Otherwise, it is behaving as a normal QuickSelectSketch. */ @@ -105,43 +119,24 @@ private DirectQuickSelectSketch( final float p, final ResizeFactor rf, final MemorySegment dstSeg, + final MemorySegmentRequest mSegReq, final boolean unionGadget) { - this( - checkSegSize(lgNomLongs, rf, dstSeg, unionGadget), - //SpotBugs CT_CONSTRUCTOR_THROW is false positive. 
- //this construction scheme is compliant with SEI CERT Oracle Coding Standard for Java / OBJ11-J - lgNomLongs, - seed, - p, - rf, - dstSeg, - unionGadget); - } - private DirectQuickSelectSketch( - @SuppressWarnings("unused") final boolean secure, //required part of Finalizer Attack prevention - final int lgNomLongs, - final long seed, - final float p, - final ResizeFactor rf, - final MemorySegment dstSeg, - final boolean unionGadget) { - super(seed, dstSeg); //Choose family, preambleLongs - final Family family; - final int preambleLongs; - if (unionGadget) { - preambleLongs = Family.UNION.getMinPreLongs(); - family = Family.UNION; - } - else { - preambleLongs = Family.QUICKSELECT.getMinPreLongs(); - family = Family.QUICKSELECT; - } + final Family family = unionGadget ? Family.UNION : Family.QUICKSELECT; + final int preambleLongs = unionGadget ? Family.UNION.getMinPreLongs() : Family.QUICKSELECT.getMinPreLongs(); - //Choose RF, minReqBytes, lgArrLongs. + //Set RF, lgArrLongs. final int lgRF = rf.lg(); - final int lgArrLongs = (lgRF == 0) ? lgNomLongs + 1 : ThetaUtil.MIN_LG_ARR_LONGS; + final int lgArrLongs = lgRF == 0 ? 
lgNomLongs + 1 : ThetaUtil.MIN_LG_ARR_LONGS; + + //check Segment capacity + final int minReqBytes = getUpdatableSegBytes(lgArrLongs, preambleLongs); + final long curSegCapBytes = dstSeg.byteSize(); + if (curSegCapBytes < minReqBytes) { + throw new SketchesArgumentException( + "MemorySegment capacity is less than minimum required: " + curSegCapBytes + " < " + minReqBytes); + } //@formatter:off //Build preamble @@ -157,44 +152,34 @@ private DirectQuickSelectSketch( insertP(dstSeg, p); //bytes 12-15 final long thetaLong = (long)(p * LONG_MAX_VALUE_AS_DOUBLE); insertThetaLong(dstSeg, thetaLong); //bytes 16-23 - if (unionGadget) { - insertUnionThetaLong(dstSeg, thetaLong); - } //@formatter:on + if (unionGadget) { insertUnionThetaLong(dstSeg, thetaLong); } + //clear hash table area dstSeg.asSlice(preambleLongs << 3, Long.BYTES << lgArrLongs).fill((byte)0); - hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - } - - private static final boolean checkSegSize( - final int lgNomLongs, final ResizeFactor rf, final MemorySegment dstSeg, final boolean unionGadget) { - final int preambleLongs = (unionGadget) ? Family.UNION.getMinPreLongs() : Family.QUICKSELECT.getMinPreLongs(); - final int lgRF = rf.lg(); - final int lgArrLongs = (lgRF == 0) ? lgNomLongs + 1 : ThetaUtil.MIN_LG_ARR_LONGS; - final int minReqBytes = getSegBytes(lgArrLongs, preambleLongs); - final long curSegCapBytes = dstSeg.byteSize(); - if (curSegCapBytes < minReqBytes) { - throw new SketchesArgumentException( - "MemorySegment capacity is too small: " + curSegCapBytes + " < " + minReqBytes); - } - return true; + this.mSegReq = mSegReq == null ? MemorySegmentRequest.DEFAULT : mSegReq; + super(dstSeg, seed); } /** - * Wrap a sketch around the given source MemorySegment containing sketch data that originated from - * this sketch. + * Wrap a sketch around the given source MemorySegment containing sketch data that originated from this sketch. 
* @param srcSeg The given MemorySegment object must be in hash table form and not read only. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. * @param seed See Update Hash Seed * @return instance of this sketch */ - static DirectQuickSelectSketch writableWrap(final MemorySegment srcSeg, final long seed) { - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + //called from UnionImpl and UpdateSketch + static DirectQuickSelectSketch writableWrap( + final MemorySegment srcSeg, + final MemorySegmentRequest mSegReq, + final long seed) { + final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); + UpdateSketch.checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); if (isResizeFactorIncorrect(srcSeg, lgNomLongs, lgArrLongs)) { @@ -202,8 +187,7 @@ static DirectQuickSelectSketch writableWrap(final MemorySegment srcSeg, final lo insertLgResizeFactor(srcSeg, ResizeFactor.X2.lg()); } - final DirectQuickSelectSketch dqss = - new DirectQuickSelectSketch(seed, srcSeg); + final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(srcSeg, mSegReq, seed); dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -212,15 +196,19 @@ static DirectQuickSelectSketch writableWrap(final MemorySegment srcSeg, final lo * Fast-wrap a sketch around the given source MemorySegment containing sketch data that originated from * this sketch. This does NO validity checking of the given MemorySegment. * @param srcSeg The given MemorySegment must be in hash table form and not read only. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. 
* @param seed See Update Hash Seed * @return instance of this sketch */ - static DirectQuickSelectSketch fastWritableWrap(final MemorySegment srcSeg, final long seed) { + //called from UnionImpl <- Union + static DirectQuickSelectSketch fastWritableWrap( + final MemorySegment srcSeg, + final MemorySegmentRequest mSegReq, + final long seed) { final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - final DirectQuickSelectSketch dqss = - new DirectQuickSelectSketch(seed, srcSeg); + final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(srcSeg, mSegReq, seed); dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -233,7 +221,7 @@ static DirectQuickSelectSketch fastWritableWrap(final MemorySegment srcSeg, fina public UpdateSketch rebuild() { final int lgNomLongs = getLgNomLongs(); final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - if (getRetainedEntries(true) > (1 << lgNomLongs)) { + if (getRetainedEntries(true) > 1 << lgNomLongs) { quickSelectAndRebuild(wseg_, preambleLongs, lgNomLongs); } return this; @@ -274,20 +262,17 @@ UpdateReturnState hashUpdate(final long hash) { final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //The duplicate test - final int index = - HashOperations.hashSearchOrInsertMemorySegment(wseg_, lgArrLongs, hash, preambleLongs << 3); - if (index >= 0) { - return RejectedDuplicate; //Duplicate, not inserted - } + final int index = HashOperations.hashSearchOrInsertMemorySegment(wseg_, lgArrLongs, hash, preambleLongs << 3); + if (index >= 0) { return RejectedDuplicate; } //Duplicate, not inserted + //insertion occurred, increment curCount final int curCount = getRetainedEntries(true) + 1; wseg_.set(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT, curCount); //update curCount if (isOutOfSpace(curCount)) { //we need to do something, we are out of space - if (lgArrLongs > lgNomLongs) { //at 
full size, rebuild - //Assumes no dirty values, changes thetaLong, curCount_ - assert (lgArrLongs == (lgNomLongs + 1)) : "lgArr: " + lgArrLongs + ", lgNom: " + lgNomLongs; + if (lgArrLongs > lgNomLongs) { //at full size, rebuild, assumes no dirty values, changes thetaLong, curCount_ + assert lgArrLongs == lgNomLongs + 1 : "lgArr: " + lgArrLongs + ", lgNom: " + lgNomLongs; //rebuild, refresh curCount based on # values in the hashtable. quickSelectAndRebuild(wseg_, preambleLongs, lgNomLongs); return InsertedCountIncrementedRebuilt; @@ -305,23 +290,45 @@ UpdateReturnState hashUpdate(final long hash) { return InsertedCountIncrementedResized; } //end of Expand in current MemorySegment, exit. - else { - //Request more space, then resize. lgArrLongs will change; thetaLong, curCount will not + else { //Request larger segment, then resize. lgArrLongs will change; thetaLong, curCount will not final int preBytes = preambleLongs << 3; tgtLgArrLongs = Math.min(lgArrLongs + lgRF, lgNomLongs + 1); final int tgtArrBytes = 8 << tgtLgArrLongs; final int reqBytes = tgtArrBytes + preBytes; - final MemorySegment newDstSeg = MemorySegment.ofArray(new byte[reqBytes]); + + final MemorySegment newDstSeg = mSegReq.request(reqBytes); moveAndResize(wseg_, preambleLongs, lgArrLongs, newDstSeg, tgtLgArrLongs, thetaLong); + final MemorySegment oldSeg = wseg_; wseg_ = newDstSeg; + mSegReq.requestClose(oldSeg); hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, tgtLgArrLongs); return InsertedCountIncrementedResized; - } //end of Request more space to resize + } //end of request new segment & resize } //end of resize } //end of isOutOfSpace return InsertedCountIncremented; } + @Override + boolean isOutOfSpace(final int numEntries) { + return numEntries > hashTableThreshold_; + } + + /** + * Returns the cardinality limit given the current size of the hash table array. + * + * @param lgNomLongs See lgNomLongs. + * @param lgArrLongs See lgArrLongs. 
+ * @return the hash table threshold + */ + @SuppressFBWarnings(value = "DB_DUPLICATE_BRANCHES", justification = "False Positive, see the code comments") + protected static final int getOffHeapHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { + //SpotBugs may complain (DB_DUPLICATE_BRANCHES) if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, + //but this allows us to tune these constants for different sketches. + final double fraction = lgArrLongs <= lgNomLongs ? DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; + return (int) (fraction * (1 << lgArrLongs)); + } + } diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index 0a81f4887..f78fbced4 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -28,7 +28,6 @@ import static org.apache.datasketches.theta.CompactOperations.correctThetaOnCompact; import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; import static org.apache.datasketches.theta.PreambleUtil.LG_ARR_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.LG_NOM_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.LG_RESIZE_FACTOR_BIT; import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.P_FLOAT; @@ -37,22 +36,20 @@ import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static 
org.apache.datasketches.theta.PreambleUtil.insertThetaLong; import java.lang.foreign.MemorySegment; +import java.util.Objects; import org.apache.datasketches.common.Family; import org.apache.datasketches.common.MemorySegmentStatus; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesReadOnlyException; -import org.apache.datasketches.common.SuppressFBWarnings; -import org.apache.datasketches.thetacommon.ThetaUtil; /** - * The default Theta Sketch using the QuickSelect algorithm. - * This is the read-only implementation with non-functional methods, which affect the state. + * The read-only Theta Sketch. * *This implementation uses data in a given MemorySegment that is owned and managed by the caller. * This MemorySegment can be off-heap, which if managed properly will greatly reduce the need for @@ -62,53 +59,67 @@ * @author Kevin Lang */ class DirectQuickSelectSketchR extends UpdateSketch { - static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space - final long seed_; //provided, kept only on heap, never serialized. - int hashTableThreshold_; //computed, kept only on heap, never serialized. - MemorySegment wseg_; //A MemorySegment for child class, but no write methods here - - //only called by DirectQuickSelectSketch and below - DirectQuickSelectSketchR(final long seed, final MemorySegment wseg) { - seed_ = seed; + + /** + * This MemorySegment reference is also used by the writable child DirectQuickSelectSketch. + * + *
When this class is constructed with the writable constructor, called by the writable child DirectQuickSelectSketch, + * this reference can be changed, its contents can be modified.
+ * + *When this class is constructed with the read-only constructor, called from local factories, this MemorySegment will + * be placed in read-only mode.
+ */ + MemorySegment wseg_; // + + /** + * This writable constructor is only called by the writable child DirectQuickSelectSketch and then this class provides the + * read-only methods for the DirectQuickSelectSketch class. + * @param wseg the writable MemorySegment used by the writable child DirectQuickSelectSketch. + * @param seed the seed for the update function for the writable child DirectQuickSelectSketch. + */ + DirectQuickSelectSketchR(final MemorySegment wseg, final long seed) { + Objects.requireNonNull(wseg, "MemorySegment wseg must not be null"); + super(seed); wseg_ = wseg; } /** - * Wrap a sketch around the given source MemorySegment containing sketch data that originated from - * this sketch. + * This read-only constructor is only called by local factory methods which use this class as a read-only direct sketch. + * @param seed the seed used to validate the internal hashes of the given source MemorySegment. + * @param srcSeg the read-only MemorySegment used by this class in read-only mode. + */ + private DirectQuickSelectSketchR(final long seed, final MemorySegment srcSeg) { + Objects.requireNonNull(srcSeg, "MemorySegment srcSeg must not be null"); + super(seed); + wseg_ = srcSeg.asReadOnly(); + } + + /** + * Wrap a sketch around the given source MemorySegment containing sketch data that originated from this sketch. * @param srcSeg the source MemorySegment. * The given MemorySegment object must be in hash table form and not read only. 
* @param seed See Update Hash Seed * @return instance of this sketch */ static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final long seed) { - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + final int preambleLongs = checkSegPreambleCap(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - - UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); - - final DirectQuickSelectSketchR dqssr = - new DirectQuickSelectSketchR(seed, srcSeg); - dqssr.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - return dqssr; + UpdateSketch.checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); + return new DirectQuickSelectSketchR(seed, srcSeg); } /** * Fast-wrap a sketch around the given source MemorySegment containing sketch data that originated from * this sketch. This does NO validity checking of the given MemorySegment. + * Caller must ensure segment contents are a valid sketch image. * @param srcSeg The given MemorySegment object must be in hash table form and not read only. 
* @param seed See Update Hash Seed * @return instance of this sketch */ static DirectQuickSelectSketchR fastReadOnlyWrap(final MemorySegment srcSeg, final long seed) { - final int lgNomLongs = srcSeg.get(JAVA_BYTE, LG_NOM_LONGS_BYTE) & 0XFF; - final int lgArrLongs = srcSeg.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; - - final DirectQuickSelectSketchR dqss = new DirectQuickSelectSketchR(seed, srcSeg); - dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - return dqss; + return new DirectQuickSelectSketchR(seed, srcSeg); } //Sketch @@ -116,9 +127,9 @@ static DirectQuickSelectSketchR fastReadOnlyWrap(final MemorySegment srcSeg, fin @Override public int getCurrentBytes() { //not compact - final byte lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE); - final int preLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - return (preLongs + (1 << lgArrLongs)) << 3; + final int lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte + final int preLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits + return preLongs + (1 << lgArrLongs) << 3; } @Override @@ -130,12 +141,12 @@ public double getEstimate() { @Override public Family getFamily() { - final int familyID = wseg_.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; + final int familyID = wseg_.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; //mask to byte return Family.idToFamily(familyID); } @Override - public int getRetainedEntries(final boolean valid) { //always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return wseg_.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); } @@ -146,7 +157,7 @@ public long getThetaLong() { @Override public boolean hasMemorySegment() { - return (wseg_ != null) && wseg_.scope().isAlive(); + return wseg_ != null && wseg_.scope().isAlive(); } @Override @@ -161,7 +172,7 @@ public boolean isEmpty() { @Override public boolean isSameResource(final MemorySegment that) { - return 
hasMemorySegment() && MemorySegmentStatus.isSameResource(wseg_, that); + return hasMemorySegment() && MemorySegmentStatus.isSameResource(wseg_, that); //null checks done here } @Override @@ -171,14 +182,14 @@ public HashIterator iterator() { @Override public byte[] toByteArray() { //MY_FAMILY is stored in wseg_ - checkIllegalCurCountAndEmpty(isEmpty(), extractCurCount(wseg_)); + final int curCount = extractCurCount(wseg_); + checkIllegalCurCountAndEmpty(isEmpty(), curCount); final int lengthBytes = getCurrentBytes(); final byte[] byteArray = new byte[lengthBytes]; final MemorySegment seg = MemorySegment.ofArray(byteArray); MemorySegment.copy(wseg_, 0, seg, 0, lengthBytes); - final long thetaLong = - correctThetaOnCompact(isEmpty(), extractCurCount(wseg_), extractThetaLong(wseg_)); - insertThetaLong(wseg_, thetaLong); + final long thetaLong = correctThetaOnCompact(isEmpty(), curCount, extractThetaLong(wseg_)); + insertThetaLong(seg, thetaLong); return byteArray; } @@ -199,11 +210,6 @@ public ResizeFactor getResizeFactor() { return ResizeFactor.getRF(getLgRF()); } - @Override - long getSeed() { - return seed_; - } - @Override public UpdateSketch rebuild() { throw new SketchesReadOnlyException(); @@ -218,8 +224,8 @@ public void reset() { @Override long[] getCache() { - final long lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; - final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; + final long lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte + final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits final long[] cacheArr = new long[1 << lgArrLongs]; MemorySegment.copy(wseg_, JAVA_LONG_UNALIGNED, preambleLongs << 3, cacheArr, 0, 1 << lgArrLongs); return cacheArr; @@ -232,7 +238,7 @@ int getCompactPreambleLongs() { @Override int getCurrentPreambleLongs() { - return PreambleUtil.extractPreLongs(wseg_); + return Sketch.getPreambleLongs(wseg_); } @Override @@ -251,17 +257,17 
@@ boolean isDirty() { } @Override - boolean isOutOfSpace(final int numEntries) { - return numEntries > hashTableThreshold_; + boolean isOutOfSpace(final int numEntries) { //overridden by writable DirectQuickSelectSketch + return false; } @Override int getLgArrLongs() { - return wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; + return wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte } int getLgRF() { //only Direct needs this - return (wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT) & 0X3; + return wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT & 0X3; //mask to 2 bits } @Override @@ -269,19 +275,4 @@ UpdateReturnState hashUpdate(final long hash) { throw new SketchesReadOnlyException(); } - /** - * Returns the cardinality limit given the current size of the hash table array. - * - * @param lgNomLongs See lgNomLongs. - * @param lgArrLongs See lgArrLongs. - * @return the hash table threshold - */ - @SuppressFBWarnings(value = "DB_DUPLICATE_BRANCHES", justification = "False Positive, see the code comments") - protected static final int getOffHeapHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { - //SpotBugs may complain (DB_DUPLICATE_BRANCHES) if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, - //but this allows us to tune these constants for different sketches. - final double fraction = (lgArrLongs <= lgNomLongs) ? 
DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; - return (int) (fraction * (1 << lgArrLongs)); - } - } diff --git a/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java b/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java index 45a17d40d..793ce1763 100644 --- a/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java @@ -86,7 +86,7 @@ public int getCurrentBytes() { public double getEstimate() { return 0; } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return 0; } diff --git a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java b/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java deleted file mode 100644 index 723a8b651..000000000 --- a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.theta; - -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; -import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; -import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; - -import java.lang.foreign.MemorySegment; - -import org.apache.datasketches.common.SketchesArgumentException; - -/** - * Used to convert older serialization versions 1 and 2 to version 3. The Serialization - * Version is the version of the sketch binary image format and should not be confused with the - * version number of the Open Source DataSketches Library. - * - * @author Lee Rhodes - */ -final class ForwardCompatibility { - - private ForwardCompatibility() { } - - /** - * Convert a serialization version (SerVer) 1 sketch (~Feb 2014) to a SerVer 3 sketch. - * Note: SerVer 1 sketches always have (metadata) preamble-longs of 3 and are always stored - * in a compact ordered form, but with 3 different sketch types. All SerVer 1 sketches will - * be converted to a SerVer 3 sketches. There is no concept of p-sampling, no empty bit. - * - * @param srcSeg the image of a SerVer 1 sketch - * - * @param seedHash See Seed Hash. - * The seedHash that matches the seedHash of the original seed used to construct the sketch. - * Note: SerVer 1 sketches do not have the concept of the SeedHash, so the seedHash provided here - * MUST be derived from the actual seed that was used when the SerVer 1 sketches were built. - * @return a SerVer 3 {@link CompactSketch}. 
- */ - static final CompactSketch heapify1to3(final MemorySegment srcSeg, final short seedHash) { - final int segCap = (int) srcSeg.byteSize(); - final int preLongs = extractPreLongs(srcSeg); //always 3 for serVer 1 - if (preLongs != 3) { - throw new SketchesArgumentException("PreLongs must be 3 for SerVer 1: " + preLongs); - } - final int familyId = extractFamilyID(srcSeg); //1,2,3 - if ((familyId < 1) || (familyId > 3)) { - throw new SketchesArgumentException("Family ID (Sketch Type) must be 1 to 3: " + familyId); - } - final int curCount = extractCurCount(srcSeg); - final long thetaLong = extractThetaLong(srcSeg); - final boolean empty = (curCount == 0) && (thetaLong == Long.MAX_VALUE); - - if (empty || (segCap <= 24)) { //return empty - return EmptyCompactSketch.getInstance(); - } - - final int reqCap = (curCount + preLongs) << 3; - validateInputSize(reqCap, segCap); - - if ((thetaLong == Long.MAX_VALUE) && (curCount == 1)) { - final long hash = srcSeg.get(JAVA_LONG_UNALIGNED, preLongs << 3); - return new SingleItemSketch(hash, seedHash); - } - //theta < 1.0 and/or curCount > 1 - - final long[] compactOrderedCache = new long[curCount]; - MemorySegment.copy(srcSeg, JAVA_LONG_UNALIGNED, preLongs << 3, compactOrderedCache, 0, curCount); - return new HeapCompactSketch(compactOrderedCache, false, seedHash, curCount, thetaLong, true); - } - - /** - * Convert a serialization version (SerVer) 2 sketch to a SerVer 3 HeapCompactOrderedSketch. - * Note: SerVer 2 sketches can have metadata-longs of 1,2 or 3 and are always stored - * in a compact ordered form (not as a hash table), but with 4 different sketch types. - * @param srcSeg the image of a SerVer 2 sketch - * @param seedHash See Seed Hash. 
- * The seed used for building the sketch image in srcMem - * @return a SerVer 3 HeapCompactOrderedSketch - */ - static final CompactSketch heapify2to3(final MemorySegment srcSeg, final short seedHash) { - final int segCap = (int) srcSeg.byteSize(); - final int preLongs = extractPreLongs(srcSeg); //1,2 or 3 - final int familyId = extractFamilyID(srcSeg); //1,2,3,4 - if ((familyId < 1) || (familyId > 4)) { - throw new SketchesArgumentException("Family (Sketch Type) must be 1 to 4: " + familyId); - } - int reqBytesIn = 8; - int curCount = 0; - long thetaLong = Long.MAX_VALUE; - if (preLongs == 1) { - reqBytesIn = 8; - validateInputSize(reqBytesIn, segCap); - return EmptyCompactSketch.getInstance(); - } - if (preLongs == 2) { //includes pre0 + count, no theta (== 1.0) - reqBytesIn = preLongs << 3; - validateInputSize(reqBytesIn, segCap); - curCount = extractCurCount(srcSeg); - if (curCount == 0) { - return EmptyCompactSketch.getInstance(); - } - if (curCount == 1) { - reqBytesIn = (preLongs + 1) << 3; - validateInputSize(reqBytesIn, segCap); - final long hash = srcSeg.get(JAVA_LONG_UNALIGNED, preLongs << 3); - return new SingleItemSketch(hash, seedHash); - } - //curCount > 1 - reqBytesIn = (curCount + preLongs) << 3; - validateInputSize(reqBytesIn, segCap); - final long[] compactOrderedCache = new long[curCount]; - MemorySegment.copy(srcSeg, JAVA_LONG_UNALIGNED, preLongs << 3, compactOrderedCache, 0, curCount); - return new HeapCompactSketch(compactOrderedCache, false, seedHash, curCount, thetaLong,true); - } - if (preLongs == 3) { //pre0 + count + theta - reqBytesIn = (preLongs) << 3; // - validateInputSize(reqBytesIn, segCap); - curCount = extractCurCount(srcSeg); - thetaLong = extractThetaLong(srcSeg); - if ((curCount == 0) && (thetaLong == Long.MAX_VALUE)) { - return EmptyCompactSketch.getInstance(); - } - if ((curCount == 1) && (thetaLong == Long.MAX_VALUE)) { - reqBytesIn = (preLongs + 1) << 3; - validateInputSize(reqBytesIn, segCap); - final long hash = 
srcSeg.get(JAVA_LONG_UNALIGNED, preLongs << 3); - return new SingleItemSketch(hash, seedHash); - } - //curCount > 1 and/or theta < 1.0 - reqBytesIn = (curCount + preLongs) << 3; - validateInputSize(reqBytesIn, segCap); - final long[] compactOrderedCache = new long[curCount]; - //srcSeg.getLongArray(preLongs << 3, compactOrderedCache, 0, curCount); - MemorySegment.copy(srcSeg, JAVA_LONG_UNALIGNED, preLongs << 3, compactOrderedCache, 0, curCount); - return new HeapCompactSketch(compactOrderedCache, false, seedHash, curCount, thetaLong, true); - } - throw new SketchesArgumentException("PreLongs must be 1,2, or 3: " + preLongs); - } - - private static final void validateInputSize(final int reqBytesIn, final int segCap) { - if (reqBytesIn > segCap) { - throw new SketchesArgumentException( - "Input MemorySegment or byte[] size is too small: Required Bytes: " + reqBytesIn - + ", bytesIn: " + segCap); - } - } - -} diff --git a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java index 8a35631ab..5a5c16f00 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java @@ -23,6 +23,7 @@ import static java.lang.Math.min; import static java.lang.Math.sqrt; import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; +import static org.apache.datasketches.common.Util.DEFAULT_UPDATE_SEED; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; import static org.apache.datasketches.common.Util.checkBounds; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; @@ -31,7 +32,6 @@ import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor; import static org.apache.datasketches.theta.PreambleUtil.extractP; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; 
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncremented; import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountNotIncremented; @@ -112,6 +112,17 @@ static HeapAlphaSketch newHeapInstance(final int lgNomLongs, final long seed, fi return has; } + /** + * Heapify a sketch from a MemorySegment object containing sketch data. + * @param srcSeg The source MemorySegment object. + * It must have a size of at least 24 bytes. + * The assumed seed is {@link org.apache.datasketches.common.Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} + * @return instance of this sketch + */ + static HeapAlphaSketch heapifyInstance(final MemorySegment srcSeg) { + return heapifyInstance(srcSeg, DEFAULT_UPDATE_SEED); + } + /** * Heapify a sketch from a MemorySegment object containing sketch data. * @param srcSeg The source MemorySegment object. @@ -123,7 +134,7 @@ static HeapAlphaSketch newHeapInstance(final int lgNomLongs, final long seed, fi static HeapAlphaSketch heapifyInstance(final MemorySegment srcSeg, final long expectedSeed) { Objects.requireNonNull(srcSeg, "Source MemorySegment must not be null"); checkBounds(0, 24, srcSeg.byteSize()); - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 @@ -197,7 +208,7 @@ public double getLowerBound(final int numStdDev) { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch if (curCount_ > 0) { if (valid && isDirty()) { return HashOperations.countPart(getCache(), getLgArrLongs(), getThetaLong()); @@ -234,14 +245,14 @@ public boolean isEmpty() { *
* Long || Start Byte Adr:
* Adr:
- * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
- * 0 || Seed Hash | Flags | LgArr | LgNom | FamID | SerVer | lgRF | PreLongs=3 |
+ * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
+ * 0 || Seed Hash | Flags | LgArr | LgNom | FamID=1 | SerVer=3 | lgRF | PreLongs=3 |
*
- * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 |
- * 1 ||-----------------p-----------------|----------Retained Entries Count---------------|
+ * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 |
+ * 1 ||-----------------p-----------------|----------Retained Entries Count-------------------|
*
- * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 |
- * 2 ||---------------------------------Theta---------------------------------------------|
+ * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 |
+ * 2 ||---------------------------------Theta-------------------------------------------------|
*
*/
diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java
index fdd2860ce..69eebff5f 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java
@@ -57,6 +57,7 @@ final class HeapCompactSketch extends CompactSketch {
* @param curCount correct value
* @param thetaLong The correct
* thetaLong.
+ * @param ordered true if cache is ordered.
*/
HeapCompactSketch(final long[] cache, final boolean empty, final short seedHash,
final int curCount, final long thetaLong, final boolean ordered) {
@@ -87,7 +88,7 @@ public int getCurrentBytes() {
}
@Override
- public int getRetainedEntries(final boolean valid) {
+ public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
return curCount_;
}
diff --git a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
index 5d8af6bfb..c23deebf1 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
@@ -30,7 +30,6 @@
import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs;
import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor;
import static org.apache.datasketches.theta.PreambleUtil.extractP;
-import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncremented;
import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncrementedRebuilt;
@@ -108,11 +107,11 @@ private HeapQuickSelectSketch(final int lgNomLongs, final long seed, final float
* @return instance of this sketch
*/
static HeapQuickSelectSketch heapifyInstance(final MemorySegment srcSeg, final long seed) {
- final int preambleLongs = extractPreLongs(srcSeg); //byte 0
+ final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0
final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3
final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4
- checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs);
+ checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs);
checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs);
final float p = extractP(srcSeg); //bytes 12-15
@@ -150,7 +149,7 @@ public Family getFamily() {
}
@Override
- public int getRetainedEntries(final boolean valid) {
+ public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
return curCount_;
}
diff --git a/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java b/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java
index 87e1892b8..56175a019 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java
@@ -49,13 +49,12 @@
*/
abstract class HeapUpdateSketch extends UpdateSketch {
final int lgNomLongs_;
- private final long seed_;
private final float p_;
private final ResizeFactor rf_;
HeapUpdateSketch(final int lgNomLongs, final long seed, final float p, final ResizeFactor rf) {
+ super(seed);
lgNomLongs_ = Math.max(lgNomLongs, ThetaUtil.MIN_LG_NOM_LONGS);
- seed_ = seed;
p_ = p;
rf_ = rf;
}
@@ -66,7 +65,7 @@ abstract class HeapUpdateSketch extends UpdateSketch {
public int getCurrentBytes() {
final int preLongs = getCurrentPreambleLongs();
final int dataLongs = getCurrentDataLongs();
- return (preLongs + dataLongs) << 3;
+ return preLongs + dataLongs << 3;
}
//UpdateSketch
@@ -86,11 +85,6 @@ public ResizeFactor getResizeFactor() {
return rf_;
}
- @Override
- long getSeed() {
- return seed_;
- }
-
//restricted methods
@Override
@@ -102,14 +96,14 @@ short getSeedHash() {
byte[] toByteArray(final int preLongs, final byte familyID) {
if (isDirty()) { rebuild(); }
checkIllegalCurCountAndEmpty(isEmpty(), getRetainedEntries(true));
- final int preBytes = (preLongs << 3) & 0X3F; //24 bytes
+ final int preBytes = (preLongs << 3) & 0X3F; //24 bytes; mask to 6 bits
final int dataBytes = getCurrentDataLongs() << 3;
final byte[] byteArrOut = new byte[preBytes + dataBytes];
final MemorySegment segOut = MemorySegment.ofArray(byteArrOut);
//preamble first 8 bytes. Note: only compact can be reduced to 8 bytes.
- final int lgRf = getResizeFactor().lg() & 0x3;
+ final int lgRf = getResizeFactor().lg() & 0x3; //mask to 2 bits
insertPreLongs(segOut, preLongs); //byte 0 low 6 bits
insertLgResizeFactor(segOut, lgRf); //byte 0 high 2 bits
insertSerVer(segOut, SER_VER); //byte 1
diff --git a/src/main/java/org/apache/datasketches/theta/Intersection.java b/src/main/java/org/apache/datasketches/theta/Intersection.java
index a31dc3ef9..134c49ff6 100644
--- a/src/main/java/org/apache/datasketches/theta/Intersection.java
+++ b/src/main/java/org/apache/datasketches/theta/Intersection.java
@@ -20,23 +20,13 @@
package org.apache.datasketches.theta;
import static java.lang.foreign.ValueLayout.JAVA_BYTE;
-import static org.apache.datasketches.common.Util.floorPowerOf2;
-import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.SER_VER;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
-import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
-import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
-import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
-import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
-import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
import java.lang.foreign.MemorySegment;
-import java.util.Arrays;
import org.apache.datasketches.common.Family;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.common.Util;
-import org.apache.datasketches.thetacommon.ThetaUtil;
/**
* The API for intersection operations
@@ -164,84 +154,4 @@ public static Intersection wrap(final MemorySegment srcSeg, final long expectedS
return IntersectionImpl.wrapInstance(srcSeg, expectedSeed, srcSeg.isReadOnly() );
}
- // Restricted
-
- /**
- * Returns the maximum lgArrLongs given the capacity of the MemorySegment.
- * @param dstSeg the given MemorySegment
- * @return the maximum lgArrLongs given the capacity of the MemorySegment
- */
- protected static int getMaxLgArrLongs(final MemorySegment dstSeg) {
- final int preBytes = CONST_PREAMBLE_LONGS << 3;
- final long cap = dstSeg.byteSize();
- return Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3);
- }
-
- protected static void checkMinSizeMemorySegment(final MemorySegment seg) {
- final int minBytes = (CONST_PREAMBLE_LONGS << 3) + (8 << ThetaUtil.MIN_LG_ARR_LONGS);//280
- final long cap = seg.byteSize();
- if (cap < minBytes) {
- throw new SketchesArgumentException(
- "MemorySegment must be at least " + minBytes + " bytes. Actual capacity: " + cap);
- }
- }
-
- /**
- * Compact first 2^lgArrLongs of given array
- * @param srcCache anything
- * @param lgArrLongs The correct
- * lgArrLongs.
- * @param curCount must be correct
- * @param thetaLong The correct
- * thetaLong.
- * @param dstOrdered true if output array must be sorted
- * @return the compacted array
- */ //Only used in IntersectionImpl & Test
- static final long[] compactCachePart(final long[] srcCache, final int lgArrLongs,
- final int curCount, final long thetaLong, final boolean dstOrdered) {
- if (curCount == 0) {
- return new long[0];
- }
- final long[] cacheOut = new long[curCount];
- final int len = 1 << lgArrLongs;
- int j = 0;
- for (int i = 0; i < len; i++) {
- final long v = srcCache[i];
- if (v <= 0L || v >= thetaLong ) { continue; }
- cacheOut[j++] = v;
- }
- assert curCount == j;
- if (dstOrdered) {
- Arrays.sort(cacheOut);
- }
- return cacheOut;
- }
-
- protected static void segChecks(final MemorySegment srcSeg) {
- //Get Preamble
- //Note: Intersection does not use lgNomLongs (or k), per se.
- //seedHash loaded and checked in private constructor
- final int preLongs = extractPreLongs(srcSeg);
- final int serVer = extractSerVer(srcSeg);
- final int famID = extractFamilyID(srcSeg);
- final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) > 0;
- final int curCount = extractCurCount(srcSeg);
- //Checks
- if (preLongs != CONST_PREAMBLE_LONGS) {
- throw new SketchesArgumentException(
- "MemorySegment PreambleLongs must equal " + CONST_PREAMBLE_LONGS + ": " + preLongs);
- }
- if (serVer != SER_VER) {
- throw new SketchesArgumentException("Serialization Version must equal " + SER_VER);
- }
- Family.INTERSECTION.checkFamilyID(famID);
- if (empty) {
- if (curCount != 0) {
- throw new SketchesArgumentException(
- "srcSeg empty state inconsistent with curCount: " + empty + "," + curCount);
- }
- //empty = true AND curCount_ = 0: OK
- } //else empty = false, curCount could be anything
- }
-
}
diff --git a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
index ebb4a6215..6819524b1 100644
--- a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
+++ b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
@@ -26,6 +26,7 @@
import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED;
import static java.lang.foreign.ValueLayout.JAVA_SHORT_UNALIGNED;
import static org.apache.datasketches.common.Util.clearBits;
+import static org.apache.datasketches.common.Util.floorPowerOf2;
import static org.apache.datasketches.common.Util.setBits;
import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
@@ -41,8 +42,10 @@
import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG;
import static org.apache.datasketches.theta.PreambleUtil.clearEmpty;
import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
+import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs;
+import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import static org.apache.datasketches.theta.PreambleUtil.insertCurCount;
import static org.apache.datasketches.theta.PreambleUtil.insertFamilyID;
@@ -81,17 +84,17 @@
* @author Kevin Lang
*/
final class IntersectionImpl extends Intersection {
- protected final short seedHash_;
- protected final boolean readOnly_; //True if this sketch is to be treated as read only
- protected final MemorySegment wseg_;
- protected final int maxLgArrLongs_; //only used with MemorySegment, not serialized
+ private final short seedHash_;
+ private final boolean readOnly_; //True if this sketch is to be treated as read only
+ private final MemorySegment wseg_;
+ private final int maxLgArrLongs_; //only used with MemorySegment, not serialized
//Note: Intersection does not use lgNomLongs or k, per se.
- protected int lgArrLongs_; //current size of hash table
- protected int curCount_; //curCount of HT, if < 0 means Universal Set (US) is true
- protected long thetaLong_;
- protected boolean empty_; //A virgin intersection represents the Universal Set, so empty is FALSE!
- protected long[] hashTable_; //retained entries of the intersection, on-heap only.
+ private int lgArrLongs_; //current size of hash table
+ private int curCount_; //curCount of HT, if < 0 means Universal Set (US) is true
+ private long thetaLong_;
+ private boolean empty_; //A virgin intersection represents the Universal Set, so empty is FALSE!
+ private long[] hashTable_; //retained entries of the intersection, on-heap only.
/**
* Constructor: Sets the class finals and computes, sets and checks the seedHash.
@@ -100,14 +103,14 @@ final class IntersectionImpl extends Intersection {
* @param dstSegFlag The given MemorySegment is a Destination (new offHeap) MemorySegment.
* @param readOnly True if MemorySegment is to be treated as read only.
*/
- protected IntersectionImpl(final MemorySegment wseg, final long seed, final boolean dstSegFlag,
+ private IntersectionImpl(final MemorySegment wseg, final long seed, final boolean dstSegFlag,
final boolean readOnly) {
readOnly_ = readOnly;
if (wseg != null) {
wseg_ = wseg;
if (dstSegFlag) { //DstSeg: compute & store seedHash, no seedHash checking
- checkMinSizeMemorySegment(wseg);
- maxLgArrLongs_ = !readOnly ? getMaxLgArrLongs(wseg) : 0; //Only Off Heap
+ IntersectionImpl.checkMinSizeMemorySegment(wseg);
+ maxLgArrLongs_ = !readOnly ? IntersectionImpl.getMaxLgArrLongs(wseg) : 0; //Only Off Heap
seedHash_ = Util.computeSeedHash(seed);
wseg_.set(JAVA_SHORT_UNALIGNED, SEED_HASH_SHORT, seedHash_);
} else { //SrcSeg:gets and stores the seedHash, checks seg_seedHash against the seed
@@ -179,8 +182,8 @@ static IntersectionImpl initNewDirectInstance(final long seed, final MemorySegme
static IntersectionImpl heapifyInstance(final MemorySegment srcSeg, final long seed) {
final boolean dstSegFlag = false;
final boolean readOnly = false;
+ IntersectionImpl.segChecks(srcSeg);
final IntersectionImpl impl = new IntersectionImpl(null, seed, dstSegFlag, readOnly);
- segChecks(srcSeg);
//Initialize
impl.lgArrLongs_ = extractLgArrLongs(srcSeg);
@@ -207,8 +210,8 @@ static IntersectionImpl wrapInstance(
final long seed,
final boolean readOnly) {
final boolean dstSegFlag = false;
+ IntersectionImpl.segChecks(srcSeg);
final IntersectionImpl impl = new IntersectionImpl(srcSeg, seed, dstSegFlag, readOnly);
- segChecks(srcSeg);
impl.lgArrLongs_ = extractLgArrLongs(srcSeg);
impl.curCount_ = extractCurCount(srcSeg);
impl.thetaLong_ = extractThetaLong(srcSeg);
@@ -333,7 +336,7 @@ public CompactSketch getResult(final boolean dstOrdered, final MemorySegment dst
} else {
hashTable = hashTable_;
}
- compactCache = compactCachePart(hashTable, lgArrLongs_, curCount_, thetaLong_, dstOrdered);
+ compactCache = IntersectionImpl.compactCachePart(hashTable, lgArrLongs_, curCount_, thetaLong_, dstOrdered);
srcCompact = true;
srcOrdered = dstOrdered;
return CompactOperations.componentsToCompact(
@@ -561,4 +564,83 @@ private void resetCommon() {
thetaLong_ = Long.MAX_VALUE;
hashTable_ = null;
}
+
+ /**
+ * Compact first 2^lgArrLongs of given array
+ * @param srcCache anything
+ * @param lgArrLongs The correct
+ * lgArrLongs.
+ * @param curCount must be correct
+ * @param thetaLong The correct
+ * thetaLong.
+ * @param dstOrdered true if output array must be sorted
+ * @return the compacted array
+ */ //used in Test
+ static final long[] compactCachePart(final long[] srcCache, final int lgArrLongs,
+ final int curCount, final long thetaLong, final boolean dstOrdered) {
+ if (curCount == 0) {
+ return new long[0];
+ }
+ final long[] cacheOut = new long[curCount];
+ final int len = 1 << lgArrLongs;
+ int j = 0;
+ for (int i = 0; i < len; i++) {
+ final long v = srcCache[i];
+ if (v <= 0L || v >= thetaLong ) { continue; }
+ cacheOut[j++] = v;
+ }
+ assert curCount == j;
+ if (dstOrdered) {
+ Arrays.sort(cacheOut);
+ }
+ return cacheOut;
+ }
+
+ private static void checkMinSizeMemorySegment(final MemorySegment seg) {
+ final int minBytes = (CONST_PREAMBLE_LONGS << 3) + (8 << ThetaUtil.MIN_LG_ARR_LONGS);//280
+ final long cap = seg.byteSize();
+ if (cap < minBytes) {
+ throw new SketchesArgumentException(
+ "MemorySegment must be at least " + minBytes + " bytes. Actual capacity: " + cap);
+ }
+ }
+
+ /**
+ * Returns the maximum lgArrLongs given the capacity of the MemorySegment.
+ * @param dstSeg the given MemorySegment
+ * @return the maximum lgArrLongs given the capacity of the MemorySegment
+ */
+ private static int getMaxLgArrLongs(final MemorySegment dstSeg) {
+ final int preBytes = CONST_PREAMBLE_LONGS << 3;
+ final long cap = dstSeg.byteSize();
+ return Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3);
+ }
+
+ private static void segChecks(final MemorySegment srcSeg) {
+ //Get Preamble
+ //Note: Intersection does not use lgNomLongs (or k), per se.
+ //seedHash loaded and checked in private constructor
+ final int preLongs = Sketch.getPreambleLongs(srcSeg);
+ final int serVer = extractSerVer(srcSeg);
+ final int famID = extractFamilyID(srcSeg);
+ final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) > 0;
+ final int curCount = extractCurCount(srcSeg);
+ //Checks
+ if (preLongs != CONST_PREAMBLE_LONGS) {
+ throw new SketchesArgumentException(
+ "MemorySegment PreambleLongs must equal " + CONST_PREAMBLE_LONGS + ": " + preLongs);
+ }
+ if (serVer != SER_VER) {
+ throw new SketchesArgumentException("Serialization Version must equal " + SER_VER);
+ }
+ Family.INTERSECTION.checkFamilyID(famID);
+ if (empty) {
+ if (curCount != 0) {
+ throw new SketchesArgumentException(
+ "srcSeg empty state inconsistent with curCount: " + empty + "," + curCount);
+ }
+ //empty = true AND curCount_ = 0: OK
+ } //else empty = false, curCount could be anything
+ }
+
}
diff --git a/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java b/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java
index 548c79ef3..53344c8d6 100644
--- a/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java
+++ b/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java
@@ -38,7 +38,7 @@ final class MemorySegmentHashIterator implements HashIterator {
this.seg = srcSeg;
this.arrLongs = arrLongs;
this.thetaLong = thetaLong;
- offsetBytes = PreambleUtil.extractPreLongs(srcSeg) << 3;
+ offsetBytes = Sketch.getPreambleLongs(srcSeg) << 3;
index = -1;
hash = 0;
}
diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
index b3451fcd1..4dd993eb3 100644
--- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
+++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
@@ -126,17 +126,17 @@
* * Long || Start Byte Adr: * Adr: - * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || Seed Hash | Flags | numEB | entBits| FamID | SerVer | PreLongs = 3 | + * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | + * 0 || Seed Hash | Flags | numEB | entBits| FamID | SerVer=4 | PreLongs = 3 | * - * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | - * 1 ||------------------------------THETA_LONG-------------------------------------------| + * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | + * 1 ||------------------------------THETA_LONG-------------------------------------------| (only if estimating) * - * || | | | (20) | (19) | (18) | (17) | 16 | - * 2 ||----------------Retained Entries stored as 1 to 4 bytes----------------------------| + * || | | | 20 | (19) | (18) | (17) | 16 | + * 2 ||--------Retained Entries stored as 1 to 4 bytes in bytes 16-19---------------------| * - * || | | | | | | | | - * 3 ||------------------Delta encoded compressed byte array------------------------------| + * || | | | | | | | | + * 3 ||--------Delta encoded compressed byte array starts at bytes 17-20------------------| ** *
The UpdateSketch and AlphaSketch require 24 bytes of preamble followed by a non-compact @@ -190,10 +190,10 @@ private PreambleUtil() {} // ###### DO NOT MESS WITH THIS FROM HERE ... // Preamble byte Addresses - static final int PREAMBLE_LONGS_BYTE = 0; //lower 6 bits in byte. - static final int LG_RESIZE_FACTOR_BIT = 6; //upper 2 bits in byte. Not used by compact, direct + static final int PREAMBLE_LONGS_BYTE = 0; //lower 6 bits in byte 0. + static final int LG_RESIZE_FACTOR_BIT = 6; //upper 2 bits in byte 0. Used by Update, Alpha, not used by compact, direct static final int SER_VER_BYTE = 1; - static final int FAMILY_BYTE = 2; //SerVer1,2 was SKETCH_TYPE_BYTE + static final int FAMILY_BYTE = 2; static final int LG_NOM_LONGS_BYTE = 3; //not used by compact static final int LG_ARR_LONGS_BYTE = 4; //not used by compact static final int FLAGS_BYTE = 5; @@ -203,28 +203,23 @@ private PreambleUtil() {} static final int THETA_LONG = 16; //8-byte aligned static final int UNION_THETA_LONG = 24; //8-byte aligned, only used by Union - // flag bit masks - static final int RESERVED_FLAG_MASK = 1; //SerVer 1, 2, 3. Now Reserved, no longer used. - static final int READ_ONLY_FLAG_MASK = 2; //Set but not read. Reserved. SerVer 1, 2, 3 - static final int EMPTY_FLAG_MASK = 4; //SerVer 2, 3 - static final int COMPACT_FLAG_MASK = 8; //SerVer 2 was NO_REBUILD_FLAG_MASK, 3 - static final int ORDERED_FLAG_MASK = 16;//SerVer 2 was UNORDERED_FLAG_MASK, 3 - static final int SINGLEITEM_FLAG_MASK = 32;//SerVer 3 - //The last 2 bits of the flags byte are reserved and assumed to be zero, for now. 
- - //Backward compatibility: SerVer1 preamble always 3 longs, SerVer2 preamble: 1, 2, 3 longs - // SKETCH_TYPE_BYTE 2 //SerVer1, SerVer2 - // V1, V2 types: Alpha = 1, QuickSelect = 2, SetSketch = 3; V3 only: Buffered QS = 4 - static final int LG_RESIZE_RATIO_BYTE_V1 = 5; //used by SerVer 1 - static final int FLAGS_BYTE_V1 = 6; //used by SerVer 1 + // flag byte bit masks + static final int RESERVED_FLAG_MASK = 1; //Bit 0: Reserved, no longer used. Was BigEndian + static final int READ_ONLY_FLAG_MASK = 2; //Bit 1: Reserved, Set but not read. + static final int EMPTY_FLAG_MASK = 4; //Bit 2: + static final int COMPACT_FLAG_MASK = 8; //Bit 3: + static final int ORDERED_FLAG_MASK = 16;//Bit 4: + static final int SINGLEITEM_FLAG_MASK = 32;//Bit 5: + //The last 2 bits (Bit 6,7) of the flags byte are reserved and assumed to be zero. //Other constants static final int SER_VER = 3; + static final int SER_VER_COMPRESSED = 4; // serial version 4 compressed ordered sketch, not empty, not single item - static final int ENTRY_BITS_BYTE_V4 = 3; // number of bits packed in deltas between hashes - static final int NUM_ENTRIES_BYTES_BYTE_V4 = 4; // number of bytes used for the number of entries - static final int THETA_LONG_V4 = 8; //8-byte aligned + static final int ENTRY_BITS_BYTE_V4 = 3; // number of bits packed in deltas between hashes + static final int NUM_ENTRIES_BYTES_BYTE_V4 = 4; // number of bytes used for the number of entries + static final int THETA_LONG_V4 = 8; //8-byte aligned /** * Computes the number of bytes required for an updatable sketch using a hash-table cache. 
@@ -233,7 +228,7 @@ private PreambleUtil() {} * @param preambleLongs current preamble size * @return the size in bytes */ - static int getSegBytes(final int lgArrLongs, final int preambleLongs) { + static int getUpdatableSegBytes(final int lgArrLongs, final int preambleLongs) { return (8 << lgArrLongs) + (preambleLongs << 3); } @@ -261,7 +256,7 @@ static String preambleToString(final byte[] byteArr) { * @return the summary preamble string. */ static String preambleToString(final MemorySegment seg) { - final int preLongs = getAndCheckPreLongs(seg); + final int preLongs = checkSegPreambleCap(seg); final int rfId = extractLgResizeFactor(seg); final ResizeFactor rf = ResizeFactor.getRF(rfId); final int serVer = extractSerVer(seg); @@ -272,7 +267,7 @@ static String preambleToString(final MemorySegment seg) { //Flags final int flags = extractFlags(seg); - final String flagsStr = (flags) + ", 0x" + (Integer.toHexString(flags)) + ", " + final String flagsStr = flags + ", 0x" + Integer.toHexString(flags) + ", " + zeroPad(Integer.toBinaryString(flags), 8); final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; final boolean empty = (flags & EMPTY_FLAG_MASK) > 0; @@ -318,7 +313,7 @@ else if (preLongs == 4) { //Union sb.append("Byte 0: ResizeFactor : ").append(rfId + ", " + rf.toString()).append(LS); sb.append("Byte 1: Serialization Version: ").append(serVer).append(LS); sb.append("Byte 2: Family : ").append(familyId + ", " + family.toString()).append(LS); - sb.append("Byte 3: LgNomLongs : ").append(lgNomLongs).append(LS); + sb.append("Byte 3: LgNomLongs, LgK : ").append(lgNomLongs).append(LS); sb.append("Byte 4: LgArrLongs : ").append(lgArrLongs).append(LS); sb.append("Byte 5: Flags Field : ").append(flagsStr).append(LS); sb.append(" Bit Flag Name : State:").append(LS); @@ -351,8 +346,13 @@ else if (preLongs == 3) { sb.append("Bytes 16-23: Theta (double) : ").append(thetaDbl).append(LS); sb.append(" Theta (long) : ").append(thetaLong).append(LS); sb.append(" Theta 
(long,hex) : ").append(thetaHex).append(LS); + if (serVer == 4) { + sb.append( "TOTAL Storage Bytes : ").append(seg.byteSize()).append(LS); + sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS); + return sb.toString(); + } } - else { //preLongs == 4 + else { //preLongs == 4 (Union) sb.append("Bytes 8-11 : CurrentCount : ").append(curCount).append(LS); sb.append("Bytes 12-15: P : ").append(p).append(LS); sb.append("Bytes 16-23: Theta (double) : ").append(thetaDbl).append(LS); @@ -363,9 +363,8 @@ else if (preLongs == 3) { sb.append(" ThetaU (long,hex): ").append(thetaUHex).append(LS); } sb.append( "Preamble Bytes : ").append(preLongs * 8).append(LS); - sb.append( "Data Bytes : ").append(curCount * 8).append(LS); - sb.append( "TOTAL Sketch Bytes : ").append((preLongs + curCount) * 8).append(LS); - sb.append( "TOTAL Capacity Bytes : ").append(seg.byteSize()).append(LS); + sb.append( "Retained Data Bytes : ").append(curCount * 8).append(LS); + sb.append( "TOTAL Storage Bytes : ").append(seg.byteSize()).append(LS); sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS); return sb.toString(); } @@ -377,11 +376,7 @@ static int extractPreLongs(final MemorySegment seg) { } static int extractLgResizeFactor(final MemorySegment seg) { - return (seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT) & 0X3; - } - - static int extractLgResizeRatioV1(final MemorySegment seg) { - return seg.get(JAVA_BYTE, LG_RESIZE_RATIO_BYTE_V1) & 0X3; + return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT & 0X3; } static int extractSerVer(final MemorySegment seg) { @@ -404,10 +399,6 @@ static int extractFlags(final MemorySegment seg) { return seg.get(JAVA_BYTE, FLAGS_BYTE) & 0XFF; } - static int extractFlagsV1(final MemorySegment seg) { - return seg.get(JAVA_BYTE, FLAGS_BYTE_V1) & 0XFF; - } - static int extractSeedHash(final MemorySegment seg) { return seg.get(JAVA_SHORT_UNALIGNED, SEED_HASH_SHORT) & 0XFFFF; } @@ -516,7 +507,7 @@ static void clearEmpty(final 
MemorySegment seg) { } static boolean isEmptyFlag(final MemorySegment seg) { - return ((extractFlags(seg) & EMPTY_FLAG_MASK) > 0); + return (extractFlags(seg) & EMPTY_FLAG_MASK) > 0; } /** @@ -524,17 +515,16 @@ static boolean isEmptyFlag(final MemorySegment seg) { * @param seg the given MemorySegment * @return the extracted prelongs value. */ - static int getAndCheckPreLongs(final MemorySegment seg) { - final long cap = seg.byteSize(); - if (cap < 8) { - throwNotBigEnough(cap, 8); - } - final int preLongs = extractPreLongs(seg); - final int required = Math.max(preLongs << 3, 8); - if (cap < required) { - throwNotBigEnough(cap, required); + static int checkSegPreambleCap(final MemorySegment seg) { + try { + final int preLongs = extractPreLongs(seg); + final int required = Math.max(preLongs << 3, 8); + final long cap = seg.byteSize(); + if (cap < required) { throwNotBigEnough(cap, required); } + return preLongs; + } catch (IndexOutOfBoundsException e) { //thrown by MemorySegment + throw new SketchesArgumentException("Possible Corruption: Given MemorySegment is empty."); } - return preLongs; } static short checkSegmentSeedHash(final MemorySegment seg, final long seed) { @@ -543,10 +533,10 @@ static short checkSegmentSeedHash(final MemorySegment seg, final long seed) { return seedHashSeg; } - private static void throwNotBigEnough(final long cap, final int required) { + private static void throwNotBigEnough(final long cap, final long required) { throw new SketchesArgumentException( - "Possible Corruption: Size of byte array or MemorySegment not large enough: Size: " + cap - + ", Required: " + required); + "Possible Corruption: Size of MemorySegment not large enough: Size: " + cap + + " < Required: " + required); } static int wholeBytesToHoldBits(final int bits) { diff --git a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java index abf8df391..766e1850d 100644 --- 
a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java +++ b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java @@ -24,9 +24,9 @@ import static org.apache.datasketches.common.ByteArrayUtil.putLongLE; import static org.apache.datasketches.hash.MurmurHash3.hash; import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; @@ -45,13 +45,13 @@ final class SingleItemSketch extends CompactSketch { private static final long DEFAULT_SEED_HASH = Util.computeSeedHash(Util.DEFAULT_UPDATE_SEED) & 0xFFFFL; // For backward compatibility, a candidate pre0_ long must have: - // Flags (byte 5): Ordered, Compact, NOT Empty, Read Only, LittleEndian = 11010 = 0x1A. + // Flags (byte 5): Ordered, Compact, NOT Empty, Read Only, NOT BigEndian = 11010 = 0x1A. (without SI flag) // Flags mask will be 0x1F. // SingleItem flag may not be set due to a historical bug, so we can't depend on it for now. // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3, // and the hash seed matches, it is virtually guaranteed that we have a SingleItem Sketch. 
- private static final long PRE0_LO6_SI = 0X00_00_3A_00_00_03_03_01L; //with SI flag + private static final long PRE0_LO6_SI = 0X00_00_3A_00_00_03_03_01L; //low 6 bytes, with SI flag private long pre0_ = 0; private long hash_ = 0; @@ -84,7 +84,7 @@ private SingleItemSketch(final long hash) { */ //does not override Sketch static SingleItemSketch heapify(final MemorySegment srcSeg, final short expectedSeedHash) { Util.checkSeedHashes((short) extractSeedHash(srcSeg), expectedSeedHash); - final boolean singleItem = otherCheckForSingleItem(srcSeg); + final boolean singleItem = checkForSingleItem(srcSeg); if (singleItem) { return new SingleItemSketch(srcSeg.get(JAVA_LONG_UNALIGNED, 8), expectedSeedHash); } throw new SketchesArgumentException("Input MemorySegment is not a SingleItemSketch."); } @@ -330,7 +330,7 @@ public double getLowerBound(final int numStdDev) { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return 1; } @@ -384,25 +384,28 @@ short getSeedHash() { return (short) (pre0_ >>> 48); } - static boolean otherCheckForSingleItem(final MemorySegment seg) { - return otherCheckForSingleItem(extractPreLongs(seg), extractSerVer(seg), - extractFamilyID(seg), extractFlags(seg) ); + static boolean checkForSingleItem(final MemorySegment seg) { + final int preLongs = checkSegPreambleCap(seg); + return checkForSingleItem(preLongs, extractSerVer(seg), extractFamilyID(seg), extractFlags(seg) ); } - static boolean otherCheckForSingleItem(final int preLongs, final int serVer, - final int famId, final int flags) { - // Flags byte: SI=X, Ordered=T, Compact=T, Empty=F, ReadOnly=T, Reserved=F = X11010 = 0x1A. + static boolean checkForSingleItem( + final int preLongs, + final int serVer, + final int famId, + final int flags) { + // Flags byte: SI=X, Ordered=T, Compact=T, Empty=F, ReadOnly=T, Reserved(BE)=F = X11010 = 0x1A. // Flags mask will be 0x1F. 
// SingleItem flag may not be set due to a historical bug, so we can't depend on it for now. // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3, // and the hash seed matches (not done here), it is virtually guaranteed that we have a // SingleItem Sketch. - final boolean numPreLongs = preLongs == 1; - final boolean numSerVer = serVer >= 3; - final boolean numFamId = famId == Family.COMPACT.getID(); - final boolean numFlags = (flags & 0x1F) == 0x1A; //no SI, yet - final boolean singleFlag = (flags & SINGLEITEM_FLAG_MASK) > 0; - return (numPreLongs && numSerVer && numFamId && numFlags) || singleFlag; + final boolean preLongsOK = preLongs == 1; + final boolean serVerOK = serVer >= 3; + final boolean famIdOK = famId == Family.COMPACT.getID(); + final boolean flagsOK = (flags & 0x1F) == 0x1A; //no SI, yet + final boolean singleFlagOK = (flags & SINGLEITEM_FLAG_MASK) > 0; + return (preLongsOK && serVerOK && famIdOK && flagsOK) || singleFlagOK; } } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 6310d82c4..d14519062 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -19,17 +19,19 @@ package org.apache.datasketches.theta; -import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static org.apache.datasketches.common.Family.idToFamily; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; import static org.apache.datasketches.common.Util.LS; import static org.apache.datasketches.common.Util.ceilingPowerOf2; import static org.apache.datasketches.common.Util.zeroPad; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; -import static 
org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; +import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; +import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; +import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; +import static org.apache.datasketches.theta.PreambleUtil.extractFlags; +import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; +import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.thetacommon.HashOperations.count; import java.lang.foreign.MemorySegment; @@ -62,20 +64,12 @@ public abstract class Sketch implements MemorySegmentStatus { * Default Update Seed
* was used to create the source MemorySegment image. * - *For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked.
- * * @param srcSeg an image of a Sketch. * * @return a Sketch on the heap. */ public static Sketch heapify(final MemorySegment srcSeg) { - final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - final Family family = idToFamily(familyID); - if (family == Family.COMPACT) { - return CompactSketch.heapify(srcSeg); - } - return heapifyUpdateFromMemorySegment(srcSeg, Util.DEFAULT_UPDATE_SEED); + return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -83,8 +77,8 @@ public static Sketch heapify(final MemorySegment srcSeg) { * *The resulting sketch will not retain any link to the source MemorySegment.
* - *For Update and Compact Sketches this method checks if the given expectedSeed was used to - * create the source MemorySegment image. However, SerialVersion 1 sketches cannot be checked.
+ *For Update Sketches this method checks if the expectedSeed + * was used to create the source MemorySegment image.
* * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. @@ -93,12 +87,12 @@ public static Sketch heapify(final MemorySegment srcSeg) { * @return a Sketch on the heap. */ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed) { - final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - final Family family = idToFamily(familyID); - if (family == Family.COMPACT) { + checkSegPreambleCap(srcSeg); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.COMPACT.getID()) { return CompactSketch.heapify(srcSeg, expectedSeed); } - return heapifyUpdateFromMemorySegment(srcSeg, expectedSeed); + return heapifyUpdateSketchFromMemorySegment(srcSeg, expectedSeed); } /** @@ -106,44 +100,21 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to - * "wrap".
+ *Only sketches that have been explicitly stored as direct sketches can be wrapped.
* *Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *For Update Sketches this method checks if the - * Default Update Seed
- * was used to create the source MemorySegment image. - * - *For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked.
+ *This method checks if the + * Default Update Seed + * was used to create the source MemorySegment image.
* - * @param srcSeg an image of a Sketch. - * @return a Sketch backed by the given MemorySegment + * @param srcSeg a MemorySegment with an image of a Sketch. + * @return a read-only Sketch backed by the given MemorySegment */ public static Sketch wrap(final MemorySegment srcSeg) { - final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; - final int familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; - final Family family = Family.idToFamily(familyID); - if (family == Family.QUICKSELECT) { - if (serVer == 3 && preLongs == 3) { - return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, Util.DEFAULT_UPDATE_SEED); - } else { - throw new SketchesArgumentException( - "Corrupted: " + family + " family image: must have SerVer = 3 and preLongs = 3"); - } - } - if (family == Family.COMPACT) { - return CompactSketch.wrap(srcSeg); - } - throw new SketchesArgumentException( - "Cannot wrap family: " + family + " as a Sketch"); + return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -151,40 +122,30 @@ public static Sketch wrap(final MemorySegment srcSeg) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to - * "wrap".
+ *Only sketches that have been explicitly stored as direct sketches can be wrapped.
* *Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *For Update and Compact Sketches this method checks if the given expectedSeed was used to - * create the source MemorySegment image. However, SerialVersion 1 sketches cannot be checked.
+ *This method checks if the given expectedSeed + * was used to create the source MemorySegment image.
* * @param srcSeg a MemorySegment with an image of a Sketch. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. - * @return a UpdateSketch backed by the given MemorySegment except as above. + * @return a read-only Sketch backed by the given MemorySegment. */ public static Sketch wrap(final MemorySegment srcSeg, final long expectedSeed) { - final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; - final int familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; - final Family family = Family.idToFamily(familyID); - if (family == Family.QUICKSELECT) { - if (serVer == 3 && preLongs == 3) { - return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed); - } else { - throw new SketchesArgumentException( - "Corrupted: " + family + " family image: must have SerVer = 3 and preLongs = 3"); - } + checkSegPreambleCap(srcSeg); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.QUICKSELECT.getID()) { + return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed); } - if (family == Family.COMPACT) { + if (familyID == Family.COMPACT.getID()) { return CompactSketch.wrap(srcSeg, expectedSeed); } + final Family family = Family.idToFamily(familyID); throw new SketchesArgumentException( "Cannot wrap family: " + family + " as a Sketch"); } @@ -203,7 +164,7 @@ public static Sketch wrap(final MemorySegment srcSeg, final long expectedSeed) { * @return this sketch as an ordered CompactSketch. */ public CompactSketch compact() { - return (this.isCompact()) ? (CompactSketch)this : compact(true, null); + return isCompact() ? 
(CompactSketch)this : compact(true, null); } /** @@ -269,6 +230,21 @@ public int getCountLessThanThetaLong(final long thetaLong) { */ public abstract double getEstimate(); + /** + * Gets the estimate from the given MemorySegment + * @param srcSeg the given MemorySegment + * @return the result estimate + */ + public static double getEstimate(final MemorySegment srcSeg) { + checkSegPreambleCap(srcSeg); + final int familyId = extractFamilyID(srcSeg); + if (!isValidSketchID(familyId)) { + throw new SketchesArgumentException("Source MemorySegment not a valid Sketch Family: " + + Family.idToFamily(familyId).toString()); + } + return Sketch.estimate(extractThetaLong(srcSeg), getRetainedEntries(srcSeg)); + } + /** * Returns the Family that this sketch belongs to * @return the Family that this sketch belongs to @@ -327,9 +303,20 @@ public static int getMaxUpdateSketchBytes(final int nomEntries) { return (nomEnt << 4) + (Family.QUICKSELECT.getMaxPreLongs() << 3); } + /** + * Returns the maximum number of storage bytes required for an UpdateSketch with the given + * log_base2 of the nominal entries. + * @param lgNomEntries log_base2 of Nominal Entries + * @return the maximum number of storage bytes required for a UpdateSketch with the given lgNomEntries + */ + public static int getUpdateSketchMaxBytes(final int lgNomEntries) { + return (16 << lgNomEntries) + (Family.QUICKSELECT.getMaxPreLongs() << 3); + } + /** * Returns the number of valid entries that have been retained by the sketch. - * @return the number of valid retained entries + * For the Alpha Sketch this returns only valid entries. + * @return the number of valid retained entries. */ public int getRetainedEntries() { return getRetainedEntries(true); @@ -337,13 +324,24 @@ public int getRetainedEntries() { /** * Returns the number of entries that have been retained by the sketch. - * @param valid if true, returns the number of valid entries, which are less than theta and used - * for estimation. 
- * Otherwise, return the number of all entries, valid or not, that are currently in the internal - * sketch cache. + * @param valid This parameter is only relevant for the Alpha Sketch. + * if true, returns the number of valid entries, which are less than theta and used + * for estimation. Otherwise, return the number of all entries, valid or not, that are currently in the + * internal sketch cache. * @return the number of retained entries */ - public abstract int getRetainedEntries(boolean valid); + public abstract int getRetainedEntries(final boolean valid); + + /** + * Returns the number of valid entries that have been retained by the sketch from the given MemorySegment + * @param srcSeg the given MemorySegment that has an image of a Sketch + * @return the number of valid retained entries + */ + public static int getRetainedEntries(final MemorySegment srcSeg) { + final int preLongs = checkSegPreambleCap(srcSeg); + final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) != 0; + return (preLongs == 1) ? (empty ? 0 : 1) : extractCurCount(srcSeg); + } /** * Returns the serialization version from the given MemorySegment @@ -351,7 +349,8 @@ public int getRetainedEntries() { * @return the serialization version from the MemorySegment */ public static int getSerializationVersion(final MemorySegment seg) { - return seg.get(JAVA_BYTE, SER_VER_BYTE); + checkSegPreambleCap(seg); + return extractSerVer(seg); } /** @@ -400,7 +399,7 @@ public double getUpperBound(final int numStdDev) { * @return true if the sketch is in estimation mode. */ public boolean isEstimationMode() { - return estMode(getThetaLong(), isEmpty()); + return getThetaLong() < Long.MAX_VALUE && !isEmpty(); } /** @@ -445,7 +444,10 @@ public String toString() { * @param hexMode If true, hashes will be output in hex. * @return The result string, which can be very long. 
*/ - public String toString(final boolean sketchSummary, final boolean dataDetail, final int width, + public String toString( + final boolean sketchSummary, + final boolean dataDetail, + final int width, final boolean hexMode) { final StringBuilder sb = new StringBuilder(); @@ -548,6 +550,9 @@ public static String toString(final MemorySegment seg) { /** * Gets the internal cache array. For on-heap sketches this will return a reference to the actual * cache array. For MemorySegment-based sketches this returns a copy. + * + *This can be an expensive operation and is intended for diagnostic & test applications. + * Use {@link #iterator() iterator()} instead.
* @return the internal cache array. */ abstract long[] getCache(); @@ -584,6 +589,24 @@ public static String toString(final MemorySegment seg) { */ abstract short getSeedHash(); + static boolean getEmpty(final MemorySegment srcSeg) { + checkSegPreambleCap(srcSeg); + final int serVer = extractSerVer(srcSeg); + if (serVer == 1) { + return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0; + } + return (extractFlags(srcSeg) & EMPTY_FLAG_MASK) != 0; + } + + static int getPreambleLongs(final MemorySegment srcSeg) { + return checkSegPreambleCap(srcSeg); + } + + static long getThetaLong(final MemorySegment srcSeg) { + final int preLongs = checkSegPreambleCap(srcSeg); + return preLongs < 3 ? Long.MAX_VALUE : extractThetaLong(srcSeg); + } + /** * Returns true if given Family id is one of the theta sketches * @param id the given Family id @@ -595,66 +618,65 @@ static final boolean isValidSketchID(final int id) { || id == Family.COMPACT.getID(); } - /** - * Checks Ordered and Compact flags for integrity between sketch and a MemorySegment - * @param sketch the given sketch - */ - static final void checkSketchAndMemorySegmentFlags(final Sketch sketch) { - final MemorySegment seg = sketch.getMemorySegment(); - if (seg == null) { return; } - final int flags = PreambleUtil.extractFlags(seg); - if ((flags & COMPACT_FLAG_MASK) > 0 ^ sketch.isCompact()) { - throw new SketchesArgumentException("Possible corruption: " - + "MemorySegment Compact Flag inconsistent with Sketch"); - } - if ((flags & ORDERED_FLAG_MASK) > 0 ^ sketch.isOrdered()) { - throw new SketchesArgumentException("Possible corruption: " - + "MemorySegment Ordered Flag inconsistent with Sketch"); - } - } - static final double estimate(final long thetaLong, final int curCount) { return curCount * (LONG_MAX_VALUE_AS_DOUBLE / thetaLong); } - static final double lowerBound(final int curCount, final long thetaLong, final int numStdDev, - final boolean empty) { + /** + * Gets the approximate lower error 
bound from a valid MemorySegment image of a Sketch + * given the specified number of Standard Deviations. + * This will return getEstimate() if isEmpty() is true. + * + * @param numStdDev + * See Number of Standard Deviations + * @param srcSeg the source MemorySegment + * @return the lower bound. + */ + public static double getLowerBound(final int numStdDev, final MemorySegment srcSeg) { + return lowerBound(getRetainedEntries(srcSeg), Sketch.getThetaLong(srcSeg), numStdDev, Sketch.getEmpty(srcSeg)); + } + + static final double lowerBound(final int curCount, final long thetaLong, final int numStdDev, final boolean empty) { final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; return BinomialBoundsN.getLowerBound(curCount, theta, numStdDev, empty); } + /** + * Gets the approximate upper error bound from a valid MemorySegment image of a Sketch + * given the specified number of Standard Deviations. + * This will return getEstimate() if isEmpty() is true. + * + * @param numStdDev + * See Number of Standard Deviations + * @param srcSeg the source MemorySegment + * @return the upper bound. + */ + public static double getUpperBound(final int numStdDev, final MemorySegment srcSeg) { + return upperBound(getRetainedEntries(srcSeg), Sketch.getThetaLong(srcSeg), numStdDev, Sketch.getEmpty(srcSeg)); + } + static final double upperBound(final int curCount, final long thetaLong, final int numStdDev, final boolean empty) { final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; return BinomialBoundsN.getUpperBound(curCount, theta, numStdDev, empty); } - private static final boolean estMode(final long thetaLong, final boolean empty) { - return thetaLong < Long.MAX_VALUE && !empty; - } - /** - * Instantiates a Heap Update Sketch from MemorySegment. Only SerVer3. SerVer 1 & 2 already handled. + * Instantiates a Heap Update Sketch from MemorySegment. * @param srcSeg the source MemorySegment * @param expectedSeed the seed used to validate the given MemorySegment image. 
* See Update Hash Seed. * @return a Sketch */ - private static final Sketch heapifyUpdateFromMemorySegment(final MemorySegment srcSeg, final long expectedSeed) { - final long cap = srcSeg.byteSize(); - if (cap < 8) { - throw new SketchesArgumentException( - "Corrupted: valid sketch must be at least 8 bytes."); - } - final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - final Family family = idToFamily(familyID); + private static final Sketch heapifyUpdateSketchFromMemorySegment(final MemorySegment srcSeg, final long expectedSeed) { + final Family family = idToFamily(extractFamilyID(srcSeg)); if (family == Family.ALPHA) { - final int flags = PreambleUtil.extractFlags(srcSeg); + final int flags = extractFlags(srcSeg); final boolean compactFlag = (flags & COMPACT_FLAG_MASK) != 0; if (compactFlag) { throw new SketchesArgumentException( - "Corrupted: ALPHA family image: cannot be compact"); + "Corrupted: An ALPHA family image cannot be compact"); } return HeapAlphaSketch.heapifyInstance(srcSeg, expectedSeed); } diff --git a/src/main/java/org/apache/datasketches/theta/Sketches.java b/src/main/java/org/apache/datasketches/theta/Sketches.java deleted file mode 100644 index 40c7ccf86..000000000 --- a/src/main/java/org/apache/datasketches/theta/Sketches.java +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.theta; - -import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED; -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; -import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; - -import java.lang.foreign.MemorySegment; - -import org.apache.datasketches.common.Family; -import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.common.Util; - -/** - * This class brings together the common sketch and set operation creation methods and - * the public static methods into one place. - * - * @author Lee Rhodes - */ -public final class Sketches { - - private Sketches() {} - - /** - * Gets the unique count estimate from a valid MemorySegment image of a Sketch - * @param srcSeg the source MemorySegment - * @return the sketch's best estimate of the cardinality of the input stream. 
- */ - public static double getEstimate(final MemorySegment srcSeg) { - checkIfValidThetaSketch(srcSeg); - return Sketch.estimate(getThetaLong(srcSeg), getRetainedEntries(srcSeg)); - } - - /** - * Gets the approximate lower error bound from a valid MemorySegment image of a Sketch - * given the specified number of Standard Deviations. - * This will return getEstimate() if isEmpty() is true. - * - * @param numStdDev - * See Number of Standard Deviations - * @param srcSeg the source MemorySegment - * @return the lower bound. - */ - public static double getLowerBound(final int numStdDev, final MemorySegment srcSeg) { - return Sketch.lowerBound(getRetainedEntries(srcSeg), getThetaLong(srcSeg), numStdDev, getEmpty(srcSeg)); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxAnotBResultBytes(int)}. - * Returns the maximum number of bytes for the returned CompactSketch, given the maximum - * value of nomEntries of the first sketch A of AnotB. - * @param maxNomEntries the given value - * @return the maximum number of bytes. - */ - public static int getMaxAnotBResultBytes(final int maxNomEntries) { - return SetOperation.getMaxAnotBResultBytes(maxNomEntries); - } - - /** - * Returns the maximum number of storage bytes required for a CompactSketch with the given - * number of actual entries. - * @param numberOfEntries the actual number of retained entries stored in the sketch. - * @return the maximum number of storage bytes required for a CompactSketch with the given number - * of retained entries. - */ - public static int getMaxCompactSketchBytes(final int numberOfEntries) { - return Sketch.getMaxCompactSketchBytes(numberOfEntries); - } - - /** - * Returns the maximum number of storage bytes required for a CompactSketch given the configured - * log_base2 of the number of nominal entries, which is a power of 2. - * @param lgNomEntries Nominal Entries - * @return the maximum number of storage bytes required for a CompactSketch with the given - * lgNomEntries. 
- * @see Sketch#getCompactSketchMaxBytes(int) - */ - public static int getCompactSketchMaxBytes(final int lgNomEntries) { - return Sketch.getCompactSketchMaxBytes(lgNomEntries); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxIntersectionBytes(int)} - * @param nomEntries Ref: {@link SetOperation#getMaxIntersectionBytes(int)}, {@code nomEntries} - * @return Ref: {@link SetOperation#getMaxIntersectionBytes(int)} - */ - public static int getMaxIntersectionBytes(final int nomEntries) { - return SetOperation.getMaxIntersectionBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxUnionBytes(int)} - * @param nomEntries Ref: {@link SetOperation#getMaxUnionBytes(int)}, {@code nomEntries} - * @return Ref: {@link SetOperation#getMaxUnionBytes(int)} - */ - public static int getMaxUnionBytes(final int nomEntries) { - return SetOperation.getMaxUnionBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link Sketch#getMaxUpdateSketchBytes(int)} - * @param nomEntries Ref: {@link Sketch#getMaxUpdateSketchBytes(int)}, {@code nomEntries} - * @return Ref: {@link Sketch#getMaxUpdateSketchBytes(int)} - */ - public static int getMaxUpdateSketchBytes(final int nomEntries) { - return Sketch.getMaxUpdateSketchBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link Sketch#getSerializationVersion(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#getSerializationVersion(MemorySegment)}, {@code srcSeg} - * @return Ref: {@link Sketch#getSerializationVersion(MemorySegment)} - */ - public static int getSerializationVersion(final MemorySegment srcSeg) { - return Sketch.getSerializationVersion(srcSeg); - } - - /** - * Gets the approximate upper error bound from a valid MemorySegment image of a Sketch - * given the specified number of Standard Deviations. - * This will return getEstimate() if isEmpty() is true. 
- * - * @param numStdDev - * See Number of Standard Deviations - * @param srcSeg the source MemorySegment - * @return the upper bound. - */ - public static double getUpperBound(final int numStdDev, final MemorySegment srcSeg) { - return Sketch.upperBound(getRetainedEntries(srcSeg), getThetaLong(srcSeg), numStdDev, getEmpty(srcSeg)); - } - - //Heapify Operations - - /** - * Convenience method, ref: {@link CompactSketch#heapify(MemorySegment) CompactSketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link CompactSketch#heapify(MemorySegment) CompactSketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch heapifyCompactSketch(final MemorySegment srcSeg) { - return CompactSketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed Ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch heapifyCompactSketch(final MemorySegment srcSeg, final long expectedSeed) { - return CompactSketch.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link CompactSketch#wrap(MemorySegment) CompactSketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link CompactSketch#wrap(MemorySegment) CompactSketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch wrapCompactSketch(final MemorySegment srcSeg) { - return CompactSketch.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link 
CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed Ref: {@link CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch wrapCompactSketch(final MemorySegment srcSeg, final long expectedSeed) { - return CompactSketch.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link SetOperation#heapify(MemorySegment) SetOperation.heapify(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#heapify(MemorySegment) SetOperation.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation heapifySetOperation(final MemorySegment srcSeg) { - return SetOperation.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)}, - * {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation heapifySetOperation(final MemorySegment srcSeg, final long expectedSeed) { - return SetOperation.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link Sketch#heapify(MemorySegment) Sketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#heapify(MemorySegment) Sketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link Sketch Sketch} - */ - public static Sketch heapifySketch(final MemorySegment srcSeg) { - return Sketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)}, {@code expectedSeed} - * @return {@link Sketch Sketch} - */ - public static Sketch heapifySketch(final MemorySegment srcSeg, final long expectedSeed) { - return Sketch.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link UpdateSketch#heapify(MemorySegment) UpdateSketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link UpdateSketch#heapify(MemorySegment) UpdateSketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch heapifyUpdateSketch(final MemorySegment srcSeg) { - return UpdateSketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)}, - * {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch heapifyUpdateSketch(final MemorySegment srcSeg, final long expectedSeed) { - return UpdateSketch.heapify(srcSeg, expectedSeed); - } - - //Builders - - /** - * Ref: {@link SetOperationBuilder SetOperationBuilder} - * @return {@link SetOperationBuilder SetOperationBuilder} - */ - public static SetOperationBuilder setOperationBuilder() { - return new SetOperationBuilder(); - } - - /** - * Ref: {@link UpdateSketchBuilder UpdateSketchBuilder} - * @return {@link UpdateSketchBuilder UpdateSketchBuilder} - */ - public static UpdateSketchBuilder updateSketchBuilder() { - return new UpdateSketchBuilder(); - } - - //Wrap operations - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment)}, {@code srcSeg} - * @return a Intersection backed by the given MemorySegment - */ - public static Intersection wrapIntersection(final MemorySegment srcSeg) { - return (Intersection) SetOperation.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment) SetOperation.wrap(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment) SetOperation.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation wrapSetOperation(final MemorySegment srcSeg) { - return wrapSetOperation(srcSeg, Util.DEFAULT_UPDATE_SEED); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation wrapSetOperation(final MemorySegment srcSeg, final long expectedSeed) { - return SetOperation.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link Sketch#wrap(MemorySegment) Sketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#wrap(MemorySegment) Sketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link Sketch Sketch} - */ - public static Sketch wrapSketch(final MemorySegment srcSeg) { - return Sketch.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the expectedSeed used to validate the given MemorySegment image. - * Ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link Sketch Sketch} - */ - public static Sketch wrapSketch(final MemorySegment srcSeg, final long expectedSeed) { - return Sketch.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment)} and casts the result to a Union - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment)}, {@code srcSeg} - * @return a Union backed by the given MemorySegment. 
- */ - public static Union wrapUnion(final MemorySegment srcSeg) { - return (Union) SetOperation.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link UpdateSketch#wrap(MemorySegment) UpdateSketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link UpdateSketch#wrap(MemorySegment) UpdateSketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch wrapUpdateSketch(final MemorySegment srcSeg) { - return wrapUpdateSketch(srcSeg, Util.DEFAULT_UPDATE_SEED); - } - - /** - * Convenience method, ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. - * Ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch wrapUpdateSketch(final MemorySegment srcSeg, final long expectedSeed) { - return UpdateSketch.wrap(srcSeg, expectedSeed); - } - - //Restricted static methods - - static void checkIfValidThetaSketch(final MemorySegment srcSeg) { - final int fam = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - if (!Sketch.isValidSketchID(fam)) { - throw new SketchesArgumentException("Source MemorySegment not a valid Sketch. 
Family: " - + Family.idToFamily(fam).toString()); - } - } - - static boolean getEmpty(final MemorySegment srcSeg) { - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); - if (serVer == 1) { - return ((getThetaLong(srcSeg) == Long.MAX_VALUE) && (getRetainedEntries(srcSeg) == 0)); - } - return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 - } - - static int getPreambleLongs(final MemorySegment srcSeg) { - return srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //for SerVer 1,2,3 - } - - static int getRetainedEntries(final MemorySegment srcSeg) { - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); - if (serVer == 1) { - final int entries = srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); - if ((getThetaLong(srcSeg) == Long.MAX_VALUE) && (entries == 0)) { - return 0; - } - return entries; - } - //SerVer 2 or 3 - final int preLongs = getPreambleLongs(srcSeg); - final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 - if (preLongs == 1) { - return empty ? 0 : 1; - } - //preLongs > 1 - return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); //for SerVer 1,2,3 - } - - static long getThetaLong(final MemorySegment srcSeg) { - final int preLongs = getPreambleLongs(srcSeg); - return (preLongs < 3) ? 
Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); //for SerVer 1,2,3 - } -} diff --git a/src/main/java/org/apache/datasketches/theta/UnionImpl.java b/src/main/java/org/apache/datasketches/theta/UnionImpl.java index ed0178c8c..bbefd958c 100644 --- a/src/main/java/org/apache/datasketches/theta/UnionImpl.java +++ b/src/main/java/org/apache/datasketches/theta/UnionImpl.java @@ -22,6 +22,8 @@ import static java.lang.Math.min; import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static org.apache.datasketches.common.QuickSelect.selectExcludingZeros; +import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.UNION_THETA_LONG; import static org.apache.datasketches.theta.PreambleUtil.clearEmpty; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; @@ -34,6 +36,7 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.ResizeFactor; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; import org.apache.datasketches.thetacommon.HashOperations; @@ -105,7 +108,7 @@ static UnionImpl initNewDirectInstance( final ResizeFactor rf, final MemorySegment dstSeg) { final UpdateSketch gadget = //create with UNION family - new DirectQuickSelectSketch(lgNomLongs, seed, p, rf, dstSeg, true); + new DirectQuickSelectSketch(lgNomLongs, seed, p, rf, dstSeg, null, true); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = gadget.getThetaLong(); unionImpl.unionEmpty_ = gadget.isEmpty(); @@ -142,7 +145,7 @@ static UnionImpl fastWrapInstance(final MemorySegment srcSeg, final long expecte Family.UNION.checkFamilyID(extractFamilyID(srcSeg)); final UpdateSketch gadget = srcSeg.isReadOnly() ? 
DirectQuickSelectSketchR.fastReadOnlyWrap(srcSeg, expectedSeed) - : DirectQuickSelectSketch.fastWritableWrap(srcSeg, expectedSeed); + : DirectQuickSelectSketch.fastWritableWrap(srcSeg, null, expectedSeed); final UnionImpl unionImpl = new UnionImpl(gadget, expectedSeed); unionImpl.unionThetaLong_ = extractUnionThetaLong(srcSeg); unionImpl.unionEmpty_ = PreambleUtil.isEmptyFlag(srcSeg); @@ -151,17 +154,17 @@ static UnionImpl fastWrapInstance(final MemorySegment srcSeg, final long expecte /** * Wrap a Union object around a Union MemorySegment object containing data. - * Called by SetOperation. * @param srcSeg The source MemorySegment object. * @param expectedSeed the seed used to validate the given MemorySegment image. * See seed * @return this class */ + //Called by SetOperation and Union static UnionImpl wrapInstance(final MemorySegment srcSeg, final long expectedSeed) { Family.UNION.checkFamilyID(extractFamilyID(srcSeg)); final UpdateSketch gadget = srcSeg.isReadOnly() ? DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed) - : DirectQuickSelectSketch.writableWrap(srcSeg, expectedSeed); + : DirectQuickSelectSketch.writableWrap(srcSeg, null, expectedSeed); final UnionImpl unionImpl = new UnionImpl(gadget, expectedSeed); unionImpl.unionThetaLong_ = extractUnionThetaLong(srcSeg); unionImpl.unionEmpty_ = PreambleUtil.isEmptyFlag(srcSeg); @@ -269,7 +272,7 @@ public CompactSketch union(final Sketch sketchA, final Sketch sketchB, final boo public void union(final Sketch sketchIn) { //UNION Empty Rule: AND the empty states. - if ((sketchIn == null) || sketchIn.isEmpty()) { + if (sketchIn == null || sketchIn.isEmpty()) { //null and empty is interpreted as (Theta = 1.0, count = 0, empty = T). 
Nothing changes return; } @@ -279,7 +282,7 @@ public void union(final Sketch sketchIn) { gadget_.hashUpdate(sketchIn.getCache()[0]); return; } - Sketch.checkSketchAndMemorySegmentFlags(sketchIn); + UnionImpl.checkSketchAndMemorySegmentFlags(sketchIn); unionThetaLong_ = min(min(unionThetaLong_, sketchIn.getThetaLong()), gadget_.getThetaLong()); //Theta rule unionEmpty_ = false; @@ -287,7 +290,7 @@ public void union(final Sketch sketchIn) { final HashIterator it = sketchIn.iterator(); while (it.next()) { final long hash = it.get(); - if ((hash < unionThetaLong_) && (hash < gadget_.getThetaLong())) { + if (hash < unionThetaLong_ && hash < gadget_.getThetaLong()) { gadget_.hashUpdate(hash); // backdoor update, hash function is bypassed } else if (isOrdered) { break; } } @@ -372,4 +375,22 @@ boolean isEmpty() { return gadget_.isEmpty() && unionEmpty_; } + /** + * Checks Ordered and Compact flags for integrity between sketch and its MemorySegment + * @param sketch the given sketch + */ + private static final void checkSketchAndMemorySegmentFlags(final Sketch sketch) { + final MemorySegment seg = sketch.getMemorySegment(); + if (seg == null) { return; } + final int flags = PreambleUtil.extractFlags(seg); + if ((flags & COMPACT_FLAG_MASK) > 0 ^ sketch.isCompact()) { + throw new SketchesArgumentException("Possible corruption: " + + "MemorySegment Compact Flag inconsistent with Sketch"); + } + if ((flags & ORDERED_FLAG_MASK) > 0 ^ sketch.isOrdered()) { + throw new SketchesArgumentException("Possible corruption: " + + "MemorySegment Ordered Flag inconsistent with Sketch"); + } + } + } diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java index 7db1988e9..723d57a96 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java @@ -19,19 +19,15 @@ package org.apache.datasketches.theta; -import static 
java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; -import static org.apache.datasketches.common.Util.checkBounds; import static org.apache.datasketches.hash.MurmurHash3.hash; import static org.apache.datasketches.theta.CompactOperations.componentsToCompact; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.SER_VER; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.checkSegmentSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; @@ -39,7 +35,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractP; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; -import static org.apache.datasketches.theta.PreambleUtil.getSegBytes; +import static org.apache.datasketches.theta.PreambleUtil.getUpdatableSegBytes; import static org.apache.datasketches.theta.UpdateReturnState.RejectedNullOrEmpty; import java.lang.foreign.MemorySegment; @@ -47,6 +43,7 @@ import java.util.Objects; import org.apache.datasketches.common.Family; +import org.apache.datasketches.common.MemorySegmentRequest; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; import 
org.apache.datasketches.common.Util; @@ -60,8 +57,11 @@ * @author Lee Rhodes */ public abstract class UpdateSketch extends Sketch { + private final long seed_; - UpdateSketch() {} + UpdateSketch(final long seed) { + seed_ = seed; //kept only on heap, never serialized. Hoisted here for performance. + } /** * Wrap takes the writable sketch image in MemorySegment and refers to it directly. There is no data copying onto @@ -72,9 +72,12 @@ public abstract class UpdateSketch extends Sketch { * @param srcWSeg an image of a writable sketch where the image seed hash matches the default seed hash. * It must have a size of at least 24 bytes. * @return an UpdateSketch backed by the given MemorySegment + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch wrap(final MemorySegment srcWSeg) { - return wrap(srcWSeg, Util.DEFAULT_UPDATE_SEED); + return wrap(srcWSeg, null, Util.DEFAULT_UPDATE_SEED); } /** @@ -85,24 +88,30 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg) { * Java Heap version of the sketch where all data will be copied to the heap. * @param srcWSeg an image of a writable sketch where the image seed hash matches the given seed hash. * It must have a size of at least 24 bytes. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. * Compact sketches store a 16-bit hash of the seed, but not the seed itself. * @return a UpdateSketch backed by the given MemorySegment + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. 
*/ - public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expectedSeed) { - Objects.requireNonNull(srcWSeg, "Source MemorySeg e t must not be null"); - checkBounds(0, 24, srcWSeg.byteSize()); //need min 24 bytes - final int preLongs = srcWSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - final int serVer = srcWSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; - final int familyID = srcWSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; - final Family family = Family.idToFamily(familyID); - if (family != Family.QUICKSELECT) { + public static UpdateSketch wrap( + final MemorySegment srcWSeg, + final MemorySegmentRequest mSegReq, + final long expectedSeed) { + Objects.requireNonNull(srcWSeg, "Source MemorySegment must not be null"); + final int preLongs = checkSegPreambleCap(srcWSeg) & 0X3F; //mask to 6 bits; + final int serVer = extractSerVer(srcWSeg); + final int familyID = extractFamilyID(srcWSeg); + if (familyID != Family.QUICKSELECT.getID()) { + final Family family = Family.idToFamily(familyID); throw new SketchesArgumentException( "A " + family + " sketch cannot be wrapped as an UpdateSketch."); } - if ((serVer == 3) && (preLongs == 3)) { - return DirectQuickSelectSketch.writableWrap(srcWSeg, expectedSeed); + if (serVer == 3 && preLongs == 3) { + return DirectQuickSelectSketch.writableWrap(srcWSeg, mSegReq, expectedSeed); } else { throw new SketchesArgumentException( "Corrupted: An UpdateSketch image must have SerVer = 3 and preLongs = 3"); @@ -115,6 +124,9 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expected * @param srcSeg the given MemorySegment with a sketch image. * It must have a size of at least 24 bytes. * @return an UpdateSketch + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. 
*/ public static UpdateSketch heapify(final MemorySegment srcSeg) { return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); @@ -127,12 +139,15 @@ public static UpdateSketch heapify(final MemorySegment srcSeg) { * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. * @return an UpdateSketch + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch heapify(final MemorySegment srcSeg, final long expectedSeed) { Objects.requireNonNull(srcSeg, "Source MemorySegment must not be null"); - checkBounds(0, 24, srcSeg.byteSize()); //need min 24 bytes - final Family family = Family.idToFamily(srcSeg.get(JAVA_BYTE, FAMILY_BYTE)); - if (family.equals(Family.ALPHA)) { + checkSegPreambleCap(srcSeg); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.ALPHA.getID()) { return HeapAlphaSketch.heapifyInstance(srcSeg, expectedSeed); } return HeapQuickSelectSketch.heapifyInstance(srcSeg, expectedSeed); @@ -142,8 +157,16 @@ public static UpdateSketch heapify(final MemorySegment srcSeg, final long expect @Override public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstWSeg) { - return componentsToCompact(getThetaLong(), getRetainedEntries(true), getSeedHash(), isEmpty(), - false, false, dstOrdered, dstWSeg, getCache()); + return componentsToCompact( + getThetaLong(), + getRetainedEntries(true), + getSeedHash(), + isEmpty(), + false, //is src compact + false, //is src ordered + dstOrdered, + dstWSeg, + getCache()); } @Override @@ -160,7 +183,7 @@ int getCurrentDataLongs() { @Override public boolean hasMemorySegment() { - return ((this instanceof DirectQuickSelectSketchR) && ((DirectQuickSelectSketchR)this).hasMemorySegment()); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.hasMemorySegment(); } @Override @@ -170,7 
+193,7 @@ public boolean isCompact() { @Override public boolean isOffHeap() { - return ((this instanceof DirectQuickSelectSketchR) && ((DirectQuickSelectSketchR)this).isOffHeap()); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.isOffHeap(); } @Override @@ -180,7 +203,7 @@ public boolean isOrdered() { @Override public boolean isSameResource(final MemorySegment that) { - return (this instanceof final DirectQuickSelectSketchR dqssr) && dqssr.isSameResource(that); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.isSameResource(that); } //UpdateSketch interface @@ -210,7 +233,7 @@ public static final UpdateSketchBuilder builder() { * Gets the configured seed * @return the configured seed */ - abstract long getSeed(); + public long getSeed() { return seed_; } /** * Resets this sketch back to a virgin empty state. @@ -232,8 +255,7 @@ public static final UpdateSketchBuilder builder() { * See Update Return State */ public UpdateReturnState update(final long datum) { - final long[] data = { datum }; - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(datum, seed_)[0] >>> 1); } /** @@ -248,9 +270,9 @@ public UpdateReturnState update(final long datum) { * See Update Return State */ public UpdateReturnState update(final double datum) { - final double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0 - final long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN & +/- infinity forms - return hashUpdate(hash(data, getSeed())[0] >>> 1); + final double d = datum == 0.0 ? 
0.0 : datum; // canonicalize -0.0, 0.0 + final long data = Double.doubleToLongBits(d);// canonicalize all NaN & +/- infinity forms + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -267,11 +289,11 @@ public UpdateReturnState update(final double datum) { * See Update Return State */ public UpdateReturnState update(final String datum) { - if ((datum == null) || datum.isEmpty()) { + if (datum == null || datum.isEmpty()) { return RejectedNullOrEmpty; } final byte[] data = datum.getBytes(UTF_8); - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -283,10 +305,10 @@ public UpdateReturnState update(final String datum) { * See Update Return State */ public UpdateReturnState update(final byte[] data) { - if ((data == null) || (data.length == 0)) { + if (data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -298,10 +320,10 @@ public UpdateReturnState update(final byte[] data) { * See Update Return State */ public UpdateReturnState update(final ByteBuffer buffer) { - if ((buffer == null) || !buffer.hasRemaining()) { + if (buffer == null || !buffer.hasRemaining()) { return RejectedNullOrEmpty; } - return hashUpdate(hash(buffer, getSeed())[0] >>> 1); + return hashUpdate(hash(buffer, seed_)[0] >>> 1); } /** @@ -316,10 +338,10 @@ public UpdateReturnState update(final ByteBuffer buffer) { * See Update Return State */ public UpdateReturnState update(final char[] data) { - if ((data == null) || (data.length == 0)) { + if (data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -331,10 +353,10 @@ public UpdateReturnState update(final char[] data) { * See Update Return State */ public UpdateReturnState update(final int[] data) { - if ((data == null) || (data.length == 0)) { + if 
(data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -346,10 +368,10 @@ public UpdateReturnState update(final int[] data) { * See Update Return State */ public UpdateReturnState update(final long[] data) { - if ((data == null) || (data.length == 0)) { + if (data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } //restricted methods @@ -391,23 +413,23 @@ public UpdateReturnState update(final long[] data) { */ abstract boolean isOutOfSpace(int numEntries); - static void checkUnionQuickSelectFamily(final MemorySegment seg, final int preambleLongs, - final int lgNomLongs) { + static void checkUnionAndQuickSelectFamily(final MemorySegment seg, final int preambleLongs, final int lgNomLongs) { + //Check Family final int familyID = extractFamilyID(seg); //byte 2 - final Family family = Family.idToFamily(familyID); - if (family.equals(Family.UNION)) { + if (familyID == Family.UNION.getID()) { if (preambleLongs != Family.UNION.getMinPreLongs()) { throw new SketchesArgumentException( "Possible corruption: Invalid PreambleLongs value for UNION: " + preambleLongs); } } - else if (family.equals(Family.QUICKSELECT)) { + else if (familyID == Family.QUICKSELECT.getID()) { if (preambleLongs != Family.QUICKSELECT.getMinPreLongs()) { throw new SketchesArgumentException( "Possible corruption: Invalid PreambleLongs value for QUICKSELECT: " + preambleLongs); } } else { + final Family family = Family.idToFamily(familyID); throw new SketchesArgumentException( "Possible corruption: Invalid Family: " + family.toString()); } @@ -444,7 +466,7 @@ static void checkSegIntegrity(final MemorySegment srcSeg, final long expectedSee //Check seg capacity, lgArrLongs final long curCapBytes = srcSeg.byteSize(); - final int minReqBytes = getSegBytes(lgArrLongs, 
preambleLongs); + final int minReqBytes = getUpdatableSegBytes(lgArrLongs, preambleLongs); if (curCapBytes < minReqBytes) { throw new SketchesArgumentException( "Possible corruption: Current MemorySegment size < min required size: " @@ -455,7 +477,7 @@ static void checkSegIntegrity(final MemorySegment srcSeg, final long expectedSee final long thetaLong = extractThetaLong(srcSeg); //bytes 16-23 final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; //if (lgArrLongs <= lgNomLongs) the sketch is still resizing, thus theta cannot be < p. - if ((lgArrLongs <= lgNomLongs) && (theta < p) ) { + if (lgArrLongs <= lgNomLongs && theta < p ) { throw new SketchesArgumentException( "Possible corruption: Theta cannot be < p and lgArrLongs <= lgNomLongs. " + lgArrLongs + " <= " + lgNomLongs + ", Theta: " + theta + ", p: " + p); @@ -477,7 +499,7 @@ static boolean isResizeFactorIncorrect(final MemorySegment srcSeg, final int lgN final int lgA = lgArrLongs; final int lgR = extractLgResizeFactor(srcSeg); if (lgR == 0) { return lgA != lgT; } - return (((lgT - lgA) % lgR) != 0); + return (lgT - lgA) % lgR != 0; } } diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java b/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java index 834778f87..d91d654b6 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java @@ -26,6 +26,7 @@ import java.lang.foreign.MemorySegment; import org.apache.datasketches.common.Family; +import org.apache.datasketches.common.MemorySegmentRequest; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesStateException; @@ -44,10 +45,11 @@ public final class UpdateSketchBuilder { private ResizeFactor bRF; private Family bFam; private float bP; + private MemorySegmentRequest bMemorySegmentRequest; //Fields for concurrent 
theta sketch private int bNumPoolThreads; - private int bLocalLgNomLongs; + private int bConCurLgNomLongs; private boolean bPropagateOrderedCompact; private double bMaxConcurrencyError; private int bMaxNumLocalThreads; @@ -57,11 +59,12 @@ public final class UpdateSketchBuilder { *This sketch can only be associated with a Serialization Version 4 format binary image.
*/ @@ -68,7 +69,7 @@ public int getCurrentBytes() { private static final int START_PACKED_DATA_ESTIMATION_MODE = 16; @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch // number of entries is stored using variable length encoding // most significant bytes with all zeros are not stored // one byte in the preamble has the number of non-zero bytes used diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java index a5b67363f..1f3f3ab9e 100644 --- a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java @@ -37,9 +37,10 @@ import org.apache.datasketches.common.Util; /** - * Wrapper around a serialized compact read-only sketch. It is not empty, not a single item. + * A wrapper around a serialized compact read-only sketch in the form of a byte array. + * It is not an empty nor a single item sketch. * - *This sketch can only be associated with a Serialization Version 3 format binary image.
+ *This sketch can only be associated with a Serialization Version 3 binary image format.
*/ class WrappedCompactSketch extends CompactSketch { final byte[] bytes_; @@ -79,7 +80,7 @@ public int getCurrentBytes() { } @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch final int preLongs = bytes_[PREAMBLE_LONGS_BYTE]; return (preLongs == 1) ? 0 : getIntLE(bytes_, RETAINED_ENTRIES_INT); } diff --git a/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java b/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java index 4012cb412..778dc02f2 100644 --- a/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java +++ b/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java @@ -71,8 +71,7 @@ private ThetaUtil() {} * @param lgMin Log2 of the minimum allowed starting size * @return The Log2 of the starting size */ - public static int startingSubMultiple(final int lgTarget, final int lgRF, - final int lgMin) { + public static int startingSubMultiple(final int lgTarget, final int lgRF, final int lgMin) { return lgTarget <= lgMin ? lgMin : lgRF == 0 ? lgTarget : (lgTarget - lgMin) % lgRF + lgMin; } diff --git a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java index c264366a4..92355a77b 100644 --- a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java @@ -135,8 +135,9 @@ private QuickSelectSketch( final float samplingProbability, final SummaryFactoryV1 dates from roughly Aug 2014 to about May 2015. - * The library at that time had an early Theta sketch with set operations based on ByteBuffer, - * the Alpha sketch, and an early HLL sketch. It also had an early adaptor for Pig. - * It also had code for the even earlier CountUniqueSketch (for backward compatibility), - * which was the bucket sketch based on Giroire. - * - *
Serialization Version 1:
- *- * Long || Start Byte Adr: - * Adr: - * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || | Flags | LgResize | LgArr | lgNom | SkType | SerVer | MD_LONGS | - * - * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | - * 1 || | ------------CurCount-------------- | - * - * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | - * 2 || --------------------------THETA_LONG------------------------------ | - * - * || | 24 | - * 3 || ----------------------Start of Long Array------------------------ | - *- * - *
V2 is short-lived and dates from roughly Mid May 2015 to about June 1st, 2015. - * (V3 was created about June 15th in preparation for OpenSource in July.) - * The Theta sketch had evolved but still based on ByteBuffer. There was an UpdateSketch, - * the Alpha sketch, and the early HLL sketch. It also had an early adaptor for Pig. - * - * - *
Serialization Version 2:
- *- * Long || Start Byte Adr: - * Adr: - * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || Seed Hash | Flags | lgArr | lgNom | SkType | SerVer | MD_LONGS + RR | - * - * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | - * 1 || --------------p-------------- | ---------Retained Entries Count-------- | - * - * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | - * 2 || --------------------------THETA_LONG----------------------------------- | - * - * || | 24 | - * 3 || ----------Start of Long Array, could be at 2 or 3 -------------------- | - *- * - *
- * // Metadata byte Addresses
- * private static final int METADATA_LONGS_BYTE = 0; //low 6 bits
- * private static final int LG_RESIZE_RATIO_BYTE = 0; //upper 2 bits
- * private static final int SER_VER_BYTE = 1;
- * private static final int SKETCH_TYPE_BYTE = 2;
- * private static final int LG_NOM_LONGS_BYTE = 3;
- * private static final int LG_ARR_LONGS_BYTE = 4;
- * private static final int FLAGS_BYTE = 5;
- * private static final int SEED_HASH_SHORT = 6; //byte 6,7
- * private static final int RETAINED_ENTRIES_COUNT_INT = 8; //4 byte aligned
- * private static final int P_FLOAT = 12; //4 byte aligned
- * private static final int THETA_LONG = 16; //8-byte aligned
- * //Backward compatibility
- * private static final int FLAGS_BYTE_V1 = 6;
- * private static final int LG_RESIZE_RATIO_BYTE_V1 = 5;
- *
- * // Constant Values
- * static final int SER_VER = 2;
- * static final int ALPHA_SKETCH = 1; //SKETCH_TYPE_BYTE
- * static final int QUICK_SELECT_SKETCH = 2;
- * static final int SET_SKETCH = 3;
- * static final int BUFFERED_QUICK_SELECT_SKETCH = 4;
- * static final String[] SKETCH_TYPE_STR =
- * { "None", "AlphaSketch", "QuickSelectSketch", "SetSketch", "BufferedQuickSelectSketch" };
- *
- * // flag bit masks
- * static final int BIG_ENDIAN_FLAG_MASK = 1;
- * static final int READ_ONLY_FLAG_MASK = 2;
- * static final int EMPTY_FLAG_MASK = 4;
- * static final int NO_REBUILD_FLAG_MASK = 8;
- * static final int UNORDERED_FLAG_MASK = 16;
- *
- *
- * @param skV3 a SerVer3, ordered CompactSketch
- * @param seed used for checking the seed hash (if one exists).
- * @return a SerVer2 SetSketch as MemorySegment object.
- */
- public static MemorySegment convertSerVer3toSerVer2(final CompactSketch skV3, final long seed) {
- final short seedHash = Util.computeSeedHash(seed);
- MemorySegment wseg = null;
-
- if (skV3 instanceof EmptyCompactSketch) {
- wseg = MemorySegment.ofArray(new long[1]);
- wseg.set(JAVA_BYTE, 0, (byte) 1); //preLongs
- wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer
- wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch
- final byte flags = (byte) 0xE; //NoRebuild, Empty, ReadOnly, LE
- wseg.set(JAVA_BYTE, 5, flags);
- wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash);
- return wseg;
- }
- if (skV3 instanceof SingleItemSketch) {
- final SingleItemSketch sis = (SingleItemSketch) skV3;
- wseg = MemorySegment.ofArray(new long[3]);
- wseg.set(JAVA_BYTE, 0, (byte) 2); //preLongs
- wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer
- wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch
- final byte flags = (byte) 0xA; //NoRebuild, notEmpty, ReadOnly, LE
- wseg.set(JAVA_BYTE, 5, flags);
- wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash);
- wseg.set(JAVA_INT_UNALIGNED, 8, 1);
- final long[] arr = sis.getCache();
- wseg.set(JAVA_LONG_UNALIGNED, 16, arr[0]);
- return wseg;
- }
- //General CompactSketch
- final int preLongs = skV3.getCompactPreambleLongs();
- final int entries = skV3.getRetainedEntries(true);
- final boolean unordered = !(skV3.isOrdered());
- final byte flags = (byte) (0xA | (unordered ? 16 : 0)); //Unordered, NoRebuild, notEmpty, ReadOnly, LE
- wseg = MemorySegment.ofArray(new byte[(preLongs + entries) << 3]);
- wseg.set(JAVA_BYTE, 0, (byte) preLongs); //preLongs
- wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer
- wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch
-
- wseg.set(JAVA_BYTE, 5, flags);
- wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash);
- wseg.set(JAVA_INT_UNALIGNED, 8, entries);
- if (preLongs == 3) {
- wseg.set(JAVA_LONG_UNALIGNED, 16, skV3.getThetaLong());
- }
- final long[] arr = skV3.getCache();
- MemorySegment.copy(arr, 0, wseg, JAVA_LONG_UNALIGNED, preLongs << 3, entries);
- return wseg;
- }
-}
diff --git a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java
index 6d9c173a0..fc35891b3 100644
--- a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java
+++ b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java
@@ -26,25 +26,13 @@
import static org.testng.Assert.assertNull;
import static org.testng.Assert.assertTrue;
+import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
+
import org.apache.datasketches.common.Family;
import org.apache.datasketches.common.SketchesArgumentException;
-import org.apache.datasketches.theta.CompactSketch;
-import org.apache.datasketches.theta.DirectCompactSketch;
-import org.apache.datasketches.theta.EmptyCompactSketch;
-import org.apache.datasketches.theta.HashIterator;
-import org.apache.datasketches.theta.HeapCompactSketch;
-import org.apache.datasketches.theta.Intersection;
-import org.apache.datasketches.theta.SingleItemSketch;
-import org.apache.datasketches.theta.Sketch;
-import org.apache.datasketches.theta.Sketches;
-import org.apache.datasketches.theta.UpdateSketch;
-import org.apache.datasketches.theta.WrappedCompactCompressedSketch;
-import org.apache.datasketches.theta.WrappedCompactSketch;
import org.testng.annotations.Test;
-import java.lang.foreign.Arena;
-
/**
* @author Lee Rhodes
*/
@@ -186,7 +174,7 @@ private static void checkOtherCompactSketch(final Sketch testSk, final Sketch re
@Test
public void checkDirectSingleItemSketch() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
sk.update(1);
final int bytes = sk.getCompactBytes();
final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]);
@@ -230,7 +218,7 @@ public void checkSegTooSmallOrdered() {
@Test
public void checkCompactCachePart() {
//phony values except for curCount = 0.
- final long[] result = Intersection.compactCachePart(null, 4, 0, 0L, false);
+ final long[] result = IntersectionImpl.compactCachePart(null, 4, 0, 0L, false);
assertEquals(result.length, 0);
}
@@ -250,7 +238,7 @@ public void checkCompactCachePart() {
* Empty, segment-based Compact sketches are always ordered
*/
public void checkEmptyMemorySegmentCompactSketch() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
final MemorySegment wseg1 = MemorySegment.ofArray(new byte[16]);
final CompactSketch csk1 = sk.compact(false, wseg1); //the first parameter is ignored when empty
@@ -290,7 +278,7 @@ public void checkEmptyMemorySegmentCompactSketch() {
* Single-Item, segment-based Compact sketches are always ordered:
*/
public void checkSingleItemMemorySegmentCompactSketch() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
sk.update(1);
final MemorySegment wseg1 = MemorySegment.ofArray(new byte[16]);
@@ -321,7 +309,7 @@ public void checkSingleItemMemorySegmentCompactSketch() {
@Test
public void checkMultipleItemMemorySegmentCompactSketch() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
//This sequence is naturally out-of-order by the hash values.
sk.update(1);
sk.update(2);
@@ -360,7 +348,7 @@ public void checkMultipleItemMemorySegmentCompactSketch() {
* All empty, heap-based, compact sketches point to the same static, final constant of 8 bytes.
*/
public void checkEmptyHeapCompactSketch() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
final CompactSketch csk1 = sk.compact(false, null); //the first parameter is ignored when empty
final State state1 = new State("EmptyCompactSketch", 0, 8, COMPACT, EMPTY, !DIRECT, !SEGMENT, ORDERED, !ESTIMATION);
@@ -390,7 +378,7 @@ public void checkEmptyHeapCompactSketch() {
* Single-Item, heap-based Compact sketches are always ordered.
*/
public void checkSingleItemHeapCompactSketch() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
sk.update(1);
final CompactSketch csk1 = sk.compact(false, null); //the first parameter is ignored when single item
@@ -418,7 +406,7 @@ public void checkSingleItemHeapCompactSketch() {
@Test
public void checkMultipleItemHeapCompactSketch() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
//This sequence is naturally out-of-order by the hash values.
sk.update(1);
sk.update(2);
@@ -453,9 +441,9 @@ public void checkMultipleItemHeapCompactSketch() {
@Test
public void checkHeapifySingleItemSketch() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
sk.update(1);
- final int bytes = Sketches.getMaxCompactSketchBytes(2); //1 more than needed
+ final int bytes = Sketch.getMaxCompactSketchBytes(2); //1 more than needed
final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]);
sk.compact(false, wseg);
final Sketch csk = Sketch.heapify(wseg);
@@ -464,7 +452,7 @@ public void checkHeapifySingleItemSketch() {
@Test
public void checkHeapifyEmptySketch() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
final MemorySegment wseg = MemorySegment.ofArray(new byte[16]); //empty, but extra bytes
final CompactSketch csk = sk.compact(false, wseg); //ignores order because it is empty
assertTrue(csk instanceof DirectCompactSketch);
@@ -474,7 +462,7 @@ public void checkHeapifyEmptySketch() {
@Test
public void checkGetCache() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().setP((float).5).build();
+ final UpdateSketch sk = UpdateSketch.builder().setP((float).5).build();
sk.update(7);
final int bytes = sk.getCompactBytes();
final CompactSketch csk = sk.compact(true, MemorySegment.ofArray(new byte[bytes]));
@@ -484,7 +472,7 @@ public void checkGetCache() {
@Test
public void checkHeapCompactSketchCompact() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
sk.update(1);
sk.update(2);
final CompactSketch csk = sk.compact();
@@ -506,7 +494,7 @@ public void checkDirectCompactSketchCompact() {
final int lgK = 6;
//empty
- final UpdateSketch sk = Sketches.updateSketchBuilder().setLogNominalEntries(lgK).build();
+ final UpdateSketch sk = UpdateSketch.builder().setLogNominalEntries(lgK).build();
bytes = sk.getCompactBytes(); //empty, 8 bytes
wseg1 = MemorySegment.ofArray(new byte[bytes]);
wseg2 = MemorySegment.ofArray(new byte[bytes]);
@@ -566,7 +554,7 @@ public void checkDirectCompactSketchCompact() {
@Test
public void serializeDeserializeHeapV4() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
for (int i = 0; i < 10000; i++) {
sk.update(i);
}
@@ -582,8 +570,8 @@ public void serializeDeserializeHeapV4() {
}
@Test
- public void serializeDeserializeDirectV4() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ public void serializeDeserializeDirectV4_segment() {
+ final UpdateSketch sk = UpdateSketch.builder().build();
for (int i = 0; i < 10000; i++) {
sk.update(i);
}
@@ -598,9 +586,27 @@ public void serializeDeserializeDirectV4() {
}
}
+ @Test
+ public void serializeDeserializeDirectV4_bytes() { //round trip: compact -> compressed byte[] -> wrap(byte[])
+ final UpdateSketch sk = UpdateSketch.builder().build();
+ for (int i = 0; i < 10000; i++) {
+ sk.update(i);
+ }
+ final CompactSketch cs1 = sk.compact(true, MemorySegment.ofArray(new byte[sk.getCompactBytes()])); //ordered, segment-backed
+ final byte[] bytes = cs1.toByteArrayCompressed();
+ final CompactSketch cs2 = CompactSketch.wrap(bytes); //wraps the compressed image directly from the byte array
+ assertEquals(cs1.getRetainedEntries(), cs2.getRetainedEntries());
+ final HashIterator it1 = cs1.iterator();
+ final HashIterator it2 = cs2.iterator();
+ while (it1.next() && it2.next()) { //walk both sketches in lock step
+ assertEquals(it1.get(), it2.get()); //compare source against wrapped copy (was it2 vs. it2, which always passed)
+ }
+ }
+
+
@Test
public void serializeWrapBytesV3() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
for (int i = 0; i < 10000; i++) {
sk.update(i);
}
@@ -618,7 +624,7 @@ public void serializeWrapBytesV3() {
@Test
public void serializeWrapBytesV4() {
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
for (int i = 0; i < 10000; i++) {
sk.update(i);
}
diff --git a/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java
index 4a59edb97..e0816b0e5 100644
--- a/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java
+++ b/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java
@@ -33,15 +33,6 @@
import org.apache.datasketches.common.Family;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.common.Util;
-import org.apache.datasketches.theta.CompactSketch;
-import org.apache.datasketches.theta.ConcurrentDirectQuickSelectSketch;
-import org.apache.datasketches.theta.ConcurrentHeapThetaBuffer;
-import org.apache.datasketches.theta.ConcurrentSharedThetaSketch;
-import org.apache.datasketches.theta.DirectQuickSelectSketch;
-import org.apache.datasketches.theta.Sketch;
-import org.apache.datasketches.theta.Sketches;
-import org.apache.datasketches.theta.UpdateSketch;
-import org.apache.datasketches.theta.UpdateSketchBuilder;
import org.apache.datasketches.theta.ConcurrentHeapQuickSelectSketchTest.SharedLocal;
import org.apache.datasketches.thetacommon.HashOperations;
import org.testng.annotations.Test;
@@ -79,7 +70,7 @@ public void checkHeapifyMemorySegmentEstimating() {
assertEquals(local.getClass().getSimpleName(), "ConcurrentHeapThetaBuffer");
//This sharedHeap is not linked to the concurrent local buffer
- final UpdateSketch sharedHeap = Sketches.heapifyUpdateSketch(sl.wseg);
+ final UpdateSketch sharedHeap = UpdateSketch.heapify(sl.wseg);
assertEquals(sharedHeap.getClass().getSimpleName(), "HeapQuickSelectSketch");
checkMemorySegmentDirectProxyMethods(local, shared);
@@ -242,7 +233,7 @@ public void checkDQStoCompactForms() {
assertEquals(csk.getClass().getSimpleName(), "HeapCompactSketch");
final int bytes = shared.getCompactBytes();
- assertEquals(bytes, (k*8) + (Family.COMPACT.getMaxPreLongs() << 3));
+ assertEquals(bytes, k*8 + (Family.COMPACT.getMaxPreLongs() << 3));
final byte[] segArr2 = new byte[bytes];
final MemorySegment seg2 = MemorySegment.ofArray(segArr2);
@@ -462,7 +453,7 @@ public void checkEstModeMemorySegmentArr() {
waitForBgPropagationToComplete(shared);
final double est = local.getEstimate();
- assertTrue((est < (u * 1.05)) && (est > (u * 0.95)));
+ assertTrue(est < u * 1.05 && est > u * 0.95);
assertTrue(shared.getRetainedEntries(false) >= k);
}
@@ -480,7 +471,7 @@ public void checkEstModeNativeMemorySegment() {
for (int i = 0; i< u; i++) { local.update(i); }
waitForBgPropagationToComplete(shared);
final double est = local.getEstimate();
- assertTrue((est < (u * 1.05)) && (est > (u * 0.95)));
+ assertTrue(est < u * 1.05 && est > u * 0.95);
assertTrue(shared.getRetainedEntries(false) >= k);
}
@@ -501,7 +492,7 @@ public void checkConstructReconstructFromMemorySegment() {
final double est1 = local.getEstimate();
final int count1 = shared.getRetainedEntries(false);
- assertTrue((est1 < (u * 1.05)) && (est1 > (u * 0.95)));
+ assertTrue(est1 < u * 1.05 && est1 > u * 0.95);
assertTrue(count1 >= k);
byte[] serArr;
@@ -509,7 +500,7 @@ public void checkConstructReconstructFromMemorySegment() {
serArr = shared.toByteArray();
final MemorySegment seg = MemorySegment.ofArray(serArr);
- final UpdateSketch recoveredShared = Sketches.wrapUpdateSketch(seg);
+ final UpdateSketch recoveredShared = UpdateSketch.wrap(seg);
//reconstruct to Native/Direct
final int bytes = Sketch.getMaxUpdateSketchBytes(k);
@@ -576,7 +567,7 @@ public void checkBadLgNomLongs() {
final boolean useSeg = true;
final SharedLocal sl = new SharedLocal(lgK, lgK, useSeg);
sl.wseg.set(JAVA_BYTE, LG_NOM_LONGS_BYTE, (byte) 3); //Corrupt LgNomLongs byte
- DirectQuickSelectSketch.writableWrap(sl.wseg, Util.DEFAULT_UPDATE_SEED);
+ DirectQuickSelectSketch.writableWrap(sl.wseg, null, Util.DEFAULT_UPDATE_SEED);
}
@Test
@@ -607,7 +598,7 @@ public void checkBackgroundPropagation() {
final long theta2 = ((ConcurrentSharedThetaSketch)shared).getVolatileTheta();
final int entries = shared.getRetainedEntries(false);
- assertTrue((entries > k) || (theta2 < theta1),
+ assertTrue(entries > k || theta2 < theta1,
"entries="+entries+" k="+k+" theta1="+theta1+" theta2="+theta2);
shared.rebuild();
@@ -658,7 +649,7 @@ public void checkWrapIllegalFamilyID_direct() {
sl.wseg.set(JAVA_BYTE, FAMILY_BYTE, (byte) 0); //corrupt the Sketch ID byte
//try to wrap the corrupted seg
- DirectQuickSelectSketch.writableWrap(sl.wseg, Util.DEFAULT_UPDATE_SEED);
+ DirectQuickSelectSketch.writableWrap(sl.wseg, null, Util.DEFAULT_UPDATE_SEED);
}
@Test(expectedExceptions = SketchesArgumentException.class)
diff --git a/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java
index 565ef50ed..c354fd344 100644
--- a/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java
+++ b/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java
@@ -34,17 +34,6 @@
import org.apache.datasketches.common.Family;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.common.Util;
-import org.apache.datasketches.theta.CompactSketch;
-import org.apache.datasketches.theta.ConcurrentHeapQuickSelectSketch;
-import org.apache.datasketches.theta.ConcurrentHeapThetaBuffer;
-import org.apache.datasketches.theta.ConcurrentPropagationService;
-import org.apache.datasketches.theta.ConcurrentSharedThetaSketch;
-import org.apache.datasketches.theta.HeapQuickSelectSketch;
-import org.apache.datasketches.theta.PreambleUtil;
-import org.apache.datasketches.theta.Sketch;
-import org.apache.datasketches.theta.Sketches;
-import org.apache.datasketches.theta.UpdateSketch;
-import org.apache.datasketches.theta.UpdateSketchBuilder;
import org.testng.annotations.Test;
/**
@@ -90,7 +79,7 @@ public void checkPropagationNotOrdered() {
final SharedLocal sl = new SharedLocal(lgK, 4, false, false);
final UpdateSketch shared = sl.shared;
final UpdateSketch local = sl.local;
- assertEquals((sl.bldr.getLocalLgNominalEntries()), 4);
+ assertEquals((sl.bldr.getConCurLgNominalEntries()), 4);
assertTrue(local.isEmpty());
for (int i = 0; i < u; i++) {
@@ -173,7 +162,7 @@ public void checkHeapifyByteArrayExact() {
final byte[] serArr = shared.toByteArray();
final MemorySegment srcSeg = MemorySegment.ofArray(serArr).asReadOnly();
- final Sketch recoveredShared = Sketches.heapifyUpdateSketch(srcSeg);
+ final Sketch recoveredShared = UpdateSketch.heapify(srcSeg);
//reconstruct to Native/Direct
final int bytes = Sketch.getMaxUpdateSketchBytes(k);
@@ -494,7 +483,7 @@ public void checkRebuild() {
public void checkBuilder() {
final int lgK = 4;
final SharedLocal sl = new SharedLocal(lgK);
- assertEquals(sl.bldr.getLocalLgNominalEntries(), lgK);
+ assertEquals(sl.bldr.getConCurLgNominalEntries(), lgK);
assertEquals(sl.bldr.getLgNominalEntries(), lgK);
println(sl.bldr.toString());
}
@@ -652,11 +641,11 @@ public void checkBuilderExceptions() {
fail();
} catch (final SketchesArgumentException e) { }
try {
- bldr.setLocalNominalEntries(8);
+ bldr.setConCurNominalEntries(8);
fail();
} catch (final SketchesArgumentException e) { }
try {
- bldr.setLocalLogNominalEntries(3);
+ bldr.setConCurLogNominalEntries(3);
fail();
} catch (final SketchesArgumentException e) { }
bldr.setNumPoolThreads(4);
@@ -731,7 +720,7 @@ static class SharedLocal {
wseg = null;
}
bldr.setLogNominalEntries(sharedLgK);
- bldr.setLocalLogNominalEntries(localLgK);
+ bldr.setConCurLogNominalEntries(localLgK);
bldr.setPropagateOrderedCompact(ordered);
bldr.setSeed(this.seed);
shared = bldr.buildShared(wseg);
diff --git a/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java b/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java
index e812ab8f2..59b6396b7 100644
--- a/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java
+++ b/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java
@@ -31,19 +31,12 @@
import static org.testng.Assert.fail;
import java.lang.foreign.MemorySegment;
+
import org.apache.datasketches.common.Family;
-import org.apache.datasketches.common.Util;
-import org.apache.datasketches.theta.CompactSketch;
-import org.apache.datasketches.theta.Intersection;
-import org.apache.datasketches.theta.IntersectionImpl;
-import org.apache.datasketches.theta.PreambleUtil;
-import org.apache.datasketches.theta.SetOperation;
-import org.apache.datasketches.theta.Sketches;
-import org.apache.datasketches.theta.Union;
-import org.apache.datasketches.theta.UpdateSketch;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.common.SketchesReadOnlyException;
import org.apache.datasketches.common.SketchesStateException;
+import org.apache.datasketches.common.Util;
import org.testng.annotations.Test;
/**
@@ -471,7 +464,7 @@ public void checkWrapVirginEmpty() {
MemorySegment iSeg = MemorySegment.ofArray(new byte[segBytes]);
inter1 = SetOperation.builder().buildIntersection(iSeg); //virgin off-heap
- inter2 = Sketches.wrapIntersection(iSeg); //virgin off-heap, identical to inter1
+ inter2 = Intersection.wrap(iSeg); //virgin off-heap, identical to inter1
//both in virgin state, empty = false
//note: both inter1 and inter2 are tied to the same MemorySegment,
// so an intersect to one also affects the other. Don't do what I do!
@@ -493,7 +486,7 @@ public void checkWrapVirginEmpty() {
//test the path via toByteArray, now in a different state
iSeg = MemorySegment.ofArray(inter1.toByteArray());
- inter2 = Sketches.wrapIntersection(iSeg);
+ inter2 = Intersection.wrap(iSeg);
assertTrue(inter2.hasResult()); //still true
//test the compaction path
@@ -514,7 +507,7 @@ public void checkWrapNullEmpty2() {
final MemorySegment iSeg = MemorySegment.ofArray(segArr);
inter1 = SetOperation.builder().buildIntersection(iSeg); //virgin
- inter2 = Sketches.wrapIntersection(iSeg);
+ inter2 = Intersection.wrap(iSeg);
//both in virgin state, empty = false
assertFalse(inter1.hasResult());
assertFalse(inter2.hasResult());
@@ -525,7 +518,7 @@ public void checkWrapNullEmpty2() {
//remains empty = false.
inter1.intersect(sk1);
- inter2 = Sketches.wrapIntersection(iSeg);
+ inter2 = Intersection.wrap(iSeg);
assertTrue(inter1.hasResult());
assertTrue(inter2.hasResult());
final CompactSketch comp = inter2.getResult(true, null);
@@ -579,7 +572,7 @@ public void checkBadPreambleLongs() {
final MemorySegment seg = MemorySegment.ofArray(byteArray);
//corrupt:
seg.set(JAVA_BYTE, PREAMBLE_LONGS_BYTE, (byte) 2);//RF not used = 0
- Sketches.wrapIntersection(seg);
+ Intersection.wrap(seg);
}
@Test(expectedExceptions = SketchesArgumentException.class)
@@ -596,18 +589,19 @@ public void checkBadSerVer() {
final MemorySegment seg = MemorySegment.ofArray(byteArray);
//corrupt:
seg.set(JAVA_BYTE, SER_VER_BYTE, (byte) 2);
- Sketches.wrapIntersection(seg); //throws in SetOperations
+ Intersection.wrap(seg); //throws in SetOperations
}
- @Test(expectedExceptions = ClassCastException.class)
- public void checkFamilyID() {
+ @Test(expectedExceptions = SketchesArgumentException.class)
+ public void checkIncorrectWrap() {
final int k = 32;
Union union;
union = SetOperation.builder().setNominalEntries(k).buildUnion();
final byte[] byteArray = union.toByteArray();
final MemorySegment seg = MemorySegment.ofArray(byteArray);
- Sketches.wrapIntersection(seg);
+ Intersection.wrap(seg); //wrong sketch Family
+ //replaces Sketches.wrapIntersection(seg), which this test formerly expected to throw ClassCastException
}
@Test
@@ -637,7 +631,7 @@ public void checkWrap() {
final byte[] segArr2 = inter.toByteArray();
final MemorySegment srcSeg = MemorySegment.ofArray(segArr2);
- inter2 = Sketches.wrapIntersection(srcSeg);
+ inter2 = Intersection.wrap(srcSeg);
//2nd call = valid intersecting
sk2 = UpdateSketch.builder().setNominalEntries(k).build();
@@ -656,7 +650,7 @@ public void checkWrap() {
final byte[] segArr3 = inter2.toByteArray();
final MemorySegment srcSeg2 = MemorySegment.ofArray(segArr3);
- inter3 = Sketches.wrapIntersection(srcSeg2);
+ inter3 = Intersection.wrap(srcSeg2);
resultComp2 = inter3.getResult(false, null);
est2 = resultComp2.getEstimate();
println("Est2: "+est2);
@@ -683,13 +677,13 @@ public void checkExceptionMinSize() {
@Test
public void checkGetResult() {
final int k = 1024;
- final UpdateSketch sk = Sketches.updateSketchBuilder().build();
+ final UpdateSketch sk = UpdateSketch.builder().build();
final int segBytes = getMaxIntersectionBytes(k);
final byte[] segArr = new byte[segBytes];
final MemorySegment iSeg = MemorySegment.ofArray(segArr);
- final Intersection inter = Sketches.setOperationBuilder().buildIntersection(iSeg);
+ final Intersection inter = SetOperation.builder().buildIntersection(iSeg);
inter.intersect(sk);
final CompactSketch csk = inter.getResult();
assertEquals(csk.getCompactBytes(), 8);
@@ -732,8 +726,8 @@ public void checkExceptions2() {
public void checkOverlappedDirect() {
final int k = 1 << 4;
final int segBytes = 2*k*16 +PREBYTES; //plenty of room
- final UpdateSketch sk1 = Sketches.updateSketchBuilder().setNominalEntries(k).build();
- final UpdateSketch sk2 = Sketches.updateSketchBuilder().setNominalEntries(k).build();
+ final UpdateSketch sk1 = UpdateSketch.builder().setNominalEntries(k).build();
+ final UpdateSketch sk2 = UpdateSketch.builder().setNominalEntries(k).build();
for (int i=0; i