mutations) throws IOException {
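+ // Tracks whether this row should be logged and skipped because it exceeds the configured
+ // row or row-key size filters.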
+ boolean logAndSkipIncompatibleRows = false;
+
+ Put put = null;
+ int cellCount = 0;
+ long totalByteSize = 0L;
+
+ // create mutations
+ for (Cell cell : cells) {
+ totalByteSize += cell.heapSize();
+
+ // handle large cells
+ if (filterLargeCells && cell.getValueLength() > filterLargeCellThresholdBytes) {
+ // TODO add config name in log
+ LOG.warn(
+ "Dropping mutation, cell value length, "
+ + cell.getValueLength()
+ + ", exceeds filter length, "
+ + filterLargeCellThresholdBytes
+ + ", cell: "
+ + cell
+ + ", row key: "
+ + Bytes.toStringBinary(rowKey));
+ continue;
+ }
+
+ // Split the row into multiple mutations if the cell count exceeds the per-request threshold
+ if (cellCount % maxMutationsPerRequestThreshold == 0) {
+ cellCount = 0;
+ put = new Put(rowKey);
+ mutations.add(put);
+ }
+ put.add(cell);
+ cellCount++;
+ }
+
+ // TODO add config name in log
+ if (filterLargeRows && totalByteSize > filterLargeRowThresholdBytes) {
+ logAndSkipIncompatibleRows = true;
+ LOG.warn(
+ "Dropping row, row length, "
+ + totalByteSize
+ + ", exceeds filter length threshold, "
+ + filterLargeRowThresholdBytes
+ + ", row key: "
+ + Bytes.toStringBinary(rowKey));
+ }
+
+ // TODO add config name in log
+ if (filterLargeRowKeys && rowKey.length > filterLargeRowKeysThresholdBytes) {
+ logAndSkipIncompatibleRows = true;
+ LOG.warn(
+ "Dropping row, row key length, "
+ + rowKey.length
+ + ", exceeds filter length threshold, "
+ + filterLargeRowKeysThresholdBytes
+ + ", row key: "
+ + Bytes.toStringBinary(rowKey));
+ }
+
+ return logAndSkipIncompatibleRows;
+ }
+ }
+
+ /**
+ * A workalike for {@link org.apache.hadoop.hbase.client.ClientSideRegionScanner}.
+ *
+ * It serves the same purpose, but skips block and mobFile cache initialization. Those caches
+ * don't appear to be useful for the import job and they leak threads on shutdown.
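+ *
+ * <p>A minimal usage sketch (the configuration, filesystem, restore root dir, table descriptor
+ * and region info are assumed to be prepared by the caller):
+ *
+ * <pre>{@code
+ * try (HBaseRegionScanner regionScanner =
+ *     new HBaseRegionScanner(conf, fs, restoreRootDir, tableDescriptor, regionInfo, new Scan())) {
+ *   for (Result result = regionScanner.next(); result != null; result = regionScanner.next()) {
+ *     // convert each HBase Result into Bigtable mutations
+ *   }
+ * }
+ * }</pre>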
+ */
+ static class HBaseRegionScanner implements AutoCloseable {
+ private static final Logger LOG = LoggerFactory.getLogger(HBaseRegionScanner.class);
+
+ private HRegion region;
+ private RegionScanner scanner;
+ private final List<Cell> values;
+ boolean hasMore = true;
+
+ public HBaseRegionScanner(
+ Configuration conf,
+ FileSystem fs,
+ Path rootDir,
+ TableDescriptor htd,
+ RegionInfo hri,
+ Scan scan)
+ throws IOException {
+ scan.setIsolationLevel(IsolationLevel.READ_UNCOMMITTED);
+ htd = TableDescriptorBuilder.newBuilder(htd).setReadOnly(true).build();
+ this.region =
+ HRegion.newHRegion(
+ CommonFSUtils.getTableDir(rootDir, htd.getTableName()),
+ (WAL) null,
+ fs,
+ conf,
+ hri,
+ htd,
+ (RegionServerServices) null);
+ this.region.setRestoredRegion(true);
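+ // Keep only a small, index-only on-heap block cache and disable any configured bucket
+ // cache; per the class javadoc, the full caches add no value to the import job.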
+ conf.set("hfile.block.cache.policy", "IndexOnlyLRU");
+ conf.setIfUnset("hfile.onheap.block.cache.fixed.size", String.valueOf(33554432L));
+ conf.unset("hbase.bucketcache.ioengine");
+
+ this.region.initialize();
+ this.scanner = this.region.getScanner(scan);
+ this.values = new ArrayList<>();
+
+ this.region.startRegionOperation();
+ }
+
+ public void close() {
+ if (this.scanner != null) {
+ try {
+ this.scanner.close();
+ this.scanner = null;
+ } catch (IOException var3) {
+ LOG.warn("Exception while closing scanner", var3);
+ }
+ }
+
+ if (this.region != null) {
+ try {
+ this.region.closeRegionOperation();
+ this.region.close(true);
+ this.region = null;
+ } catch (IOException var2) {
+ LOG.warn("Exception while closing region", var2);
+ }
+ }
+ }
+
+ public Result next() throws IOException {
+ do {
+ if (!this.hasMore) {
+ return null;
+ }
+
+ this.values.clear();
+ this.hasMore = this.scanner.nextRaw(this.values);
+ } while (this.values.isEmpty());
+
+ Result result = Result.create(this.values);
+
+ return result;
+ }
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/transforms/package-info.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/transforms/package-info.java
new file mode 100644
index 0000000000..17f27d4046
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/hbasesnapshots/transforms/package-info.java
@@ -0,0 +1,17 @@
+/*
+ * Copyright 2026 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/** Package contains all the {@link org.apache.beam.sdk.transforms.PTransform} implementations. */
+package com.google.cloud.bigtable.beam.hbasesnapshots.transforms;
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java
index e1219f88be..cdff1a2ae1 100644
--- a/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/main/java/com/google/cloud/bigtable/beam/validation/BufferedHadoopHashTableSource.java
@@ -50,7 +50,6 @@ class BufferedHadoopHashTableSource extends BoundedSource<KV<String, List<RangeHash>>> {
private static final Coder<KV<String, List<RangeHash>>> CODER =
KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of()));
- ;
// Max number of RangeHashes to buffer.
private final int maxBufferSize;
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java
index 1f3758dc21..fac4866838 100644
--- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/EndToEndIT.java
@@ -19,6 +19,8 @@
import com.google.api.services.storage.model.StorageObject;
import com.google.bigtable.repackaged.com.google.gson.Gson;
import com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot.ImportOptions;
+import com.google.cloud.bigtable.beam.hbasesnapshots.conf.HBaseSnapshotInputConfigBuilder;
+import com.google.cloud.bigtable.beam.hbasesnapshots.dofn.CleanupHBaseSnapshotRestoreFiles;
import com.google.cloud.bigtable.beam.sequencefiles.HBaseResultToMutationFn;
import com.google.cloud.bigtable.beam.test_env.EnvSetup;
import com.google.cloud.bigtable.beam.test_env.TestProperties;
@@ -314,8 +316,8 @@ public void testHBaseSnapshotImport() throws Exception {
// The restore directory is stored relative to the snapshot directory and contains the job name
String bucket = GcsPath.fromUri(hbaseSnapshotDir).getBucket();
String restorePathPrefix =
- CleanupHBaseSnapshotRestoreFilesFn.getListPrefix(
- HBaseSnapshotInputConfigBuilder.RESTORE_DIR);
+ CleanupHBaseSnapshotRestoreFiles.getListPrefix(
+ HBaseSnapshotInputConfigBuilder.RESTORE_DIR + importOpts.getJobName());
List<StorageObject> allObjects = new ArrayList<>();
String nextToken;
do {
@@ -427,8 +429,8 @@ public void testSnappyCompressedHBaseSnapshotImport() throws Exception {
// The restore directory is stored relative to the snapshot directory and contains the job name
String bucket = GcsPath.fromUri(hbaseSnapshotDir).getBucket();
String restorePathPrefix =
- CleanupHBaseSnapshotRestoreFilesFn.getListPrefix(
- HBaseSnapshotInputConfigBuilder.RESTORE_DIR);
+ CleanupHBaseSnapshotRestoreFiles.getListPrefix(
+ HBaseSnapshotInputConfigBuilder.RESTORE_DIR + importOpts.getJobName());
List<StorageObject> allObjects = new ArrayList<>();
String nextToken;
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java
index fb5346f72a..039bd5b3e2 100644
--- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/HBaseSnapshotInputConfigBuilderTest.java
@@ -18,6 +18,7 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
+import com.google.cloud.bigtable.beam.hbasesnapshots.conf.HBaseSnapshotInputConfigBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormat;
import org.apache.hadoop.mapreduce.InputFormat;
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshotTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshotTest.java
new file mode 100644
index 0000000000..74b0253703
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/ImportJobFromHbaseSnapshotTest.java
@@ -0,0 +1,176 @@
+/*
+ * Copyright 2026 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.hbasesnapshots;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.notNullValue;
+
+import com.google.api.services.storage.model.Objects;
+import com.google.api.services.storage.model.StorageObject;
+import com.google.cloud.bigtable.beam.hbasesnapshots.conf.ImportConfig;
+import java.io.File;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
+import org.apache.beam.sdk.extensions.gcp.util.GcsUtil;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.junit.MockitoJUnit;
+import org.mockito.junit.MockitoRule;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Test cases for the {@link ImportJobFromHbaseSnapshot} class. */
+@RunWith(JUnit4.class)
+public class ImportJobFromHbaseSnapshotTest {
+ private static final Logger LOG = LoggerFactory.getLogger(ImportJobFromHbaseSnapshotTest.class);
+
+ @ClassRule public static TemporaryFolder tempFolder = new TemporaryFolder();
+ @Rule public final ExpectedException expectedException = ExpectedException.none();
+
+ @Rule public final MockitoRule mockito = MockitoJUnit.rule();
+ @Mock GcsOptions gcsOptions;
+ @Mock GcsUtil gcsUtilMock;
+ @Mock Objects gcsObjects;
+
+ @Test
+ public void testBuildImportConfigWithMissingSourcePathThrowsException() throws Exception {
+ ImportJobFromHbaseSnapshot.ImportOptions options =
+ SnapshotTestHelper.getPipelineOptions(
+ new String[] {
+ "--snapshots='bookmark-2099:bookmark,malwarescanstate-9087:malwarescan'"
+ });
+
+ expectedException.expect(IllegalArgumentException.class);
+ expectedException.expectMessage(ImportJobFromHbaseSnapshot.MISSING_SNAPSHOT_SOURCEPATH);
+ ImportJobFromHbaseSnapshot.buildImportConfigFromPipelineOptions(options, gcsOptions);
+ }
+
+ @Test
+ public void testBuildImportConfigWithMissingSnapshotsThrowsException() throws Exception {
+ ImportJobFromHbaseSnapshot.ImportOptions options =
+ SnapshotTestHelper.getPipelineOptions(
+ new String[] {"--hbaseSnapshotSourceDir=gs://bucket/data/"});
+
+ expectedException.expect(IllegalArgumentException.class);
+ expectedException.expectMessage(ImportJobFromHbaseSnapshot.MISSING_SNAPSHOT_NAMES);
+ ImportJobFromHbaseSnapshot.buildImportConfigFromPipelineOptions(options, gcsOptions);
+ }
+
+ @Test
+ public void testBuildImportConfigFromSnapshotsString() throws Exception {
+ String sourcePath = "gs://bucket/data/";
+ ImportJobFromHbaseSnapshot.ImportOptions options =
+ SnapshotTestHelper.getPipelineOptions(
+ new String[] {
+ "--hbaseSnapshotSourceDir=" + sourcePath,
+ "--snapshots='bookmark-2099:bookmark,malwarescanstate-9087:malwarescan'"
+ });
+
+ ImportConfig importConfig =
+ ImportJobFromHbaseSnapshot.buildImportConfigFromPipelineOptions(options, gcsOptions);
+ assertThat(importConfig.getSourcepath(), is(sourcePath));
+ assertThat(importConfig.getRestorepath(), notNullValue());
+ assertThat(importConfig.getSnapshots().size(), is(2));
+ }
+
+ private void setUpGcsObjectMocks(List<StorageObject> fakeStorageObjects) throws Exception {
+ Mockito.when(gcsObjects.getItems()).thenReturn(fakeStorageObjects);
+ Mockito.when(gcsUtilMock.listObjects(Mockito.anyString(), Mockito.anyString(), Mockito.any()))
+ .thenReturn(gcsObjects);
+ }
+
+ @Test
+ public void testBuildImportConfigForAllSnapshots() throws Exception {
+ String baseObjectPath = "snapshots/20220309230526";
+ String importSnapshotpath = String.format("gs://sym-bucket/%s", baseObjectPath);
+ ImportJobFromHbaseSnapshot.ImportOptions options =
+ SnapshotTestHelper.getPipelineOptions(
+ new String[] {"--hbaseSnapshotSourceDir=" + importSnapshotpath, "--snapshots=*"});
+ Mockito.when(gcsOptions.getGcsUtil()).thenReturn(gcsUtilMock);
+
+ List<String> snapshotList = Arrays.asList("audit-events", "dlpInfo", "ce-metrics-manifest");
+ List<StorageObject> fakeStorageObjects =
+ SnapshotTestHelper.createFakeStorageObjects(baseObjectPath, snapshotList);
+ setUpGcsObjectMocks(fakeStorageObjects);
+
+ ImportConfig importConfig =
+ ImportJobFromHbaseSnapshot.buildImportConfigFromPipelineOptions(options, gcsOptions);
+ assertThat(importConfig.getSourcepath(), is(importSnapshotpath));
+ assertThat(importConfig.getRestorepath(), notNullValue());
+ assertThat(importConfig.getSnapshots().size(), is(snapshotList.size()));
+ }
+
+ @Test
+ public void testBuildImportConfigFromJsonFileWithMissingPathThrowsException() throws Exception {
+ String config =
+ "{\n"
+ + " \"snapshots\": {\n"
+ + " \"snap_demo1\": \"snap_demo1\",\n"
+ + " \"snap_demo2\": \"snap_demo2\"\n"
+ + " }\n"
+ + "}";
+ File file = tempFolder.newFile();
+ SnapshotTestHelper.writeToFile(file.getAbsolutePath(), config);
+ ImportJobFromHbaseSnapshot.ImportOptions options =
+ SnapshotTestHelper.getPipelineOptions(
+ new String[] {"--importConfigFilePath=" + file.getAbsolutePath()});
+
+ expectedException.expect(NullPointerException.class);
+ expectedException.expectMessage(ImportJobFromHbaseSnapshot.MISSING_SNAPSHOT_SOURCEPATH);
+
+ ImportConfig importConfig =
+ ImportJobFromHbaseSnapshot.buildImportConfigFromConfigFile(
+ options.getImportConfigFilePath());
+ }
+
+ @Test
+ public void testBuildImportConfigFromJsonFile() throws Exception {
+ String importSnapshotpath = "gs://sym-datastore/snapshots/data/snap_demo";
+ String restoreSnapshotpath = "gs://sym-datastore/snapshots/data/restore";
+ String config =
+ String.format(
+ "{\n"
+ + " \"sourcepath\": \"%s\",\n"
+ + " \"restorepath\": \"%s\",\n"
+ + " \"snapshots\": {\n"
+ + " \"snap_demo1\": \"demo1\",\n"
+ + " \"snap_demo2\": \"demo2\"\n"
+ + " }\n"
+ + "}",
+ importSnapshotpath, restoreSnapshotpath);
+
+ File file = tempFolder.newFile();
+ SnapshotTestHelper.writeToFile(file.getAbsolutePath(), config);
+ ImportJobFromHbaseSnapshot.ImportOptions options =
+ SnapshotTestHelper.getPipelineOptions(
+ new String[] {"--importConfigFilePath=" + file.getAbsolutePath()});
+ ImportConfig importConfig =
+ ImportJobFromHbaseSnapshot.buildImportConfigFromConfigFile(
+ options.getImportConfigFilePath());
+ assertThat(importConfig.getSourcepath(), is(importSnapshotpath));
+ assertThat(importConfig.getRestorepath().startsWith(restoreSnapshotpath), is(true));
+ assertThat(importConfig.getSnapshots().get(0).getbigtableTableName(), is("demo1"));
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/SnapshotTestHelper.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/SnapshotTestHelper.java
new file mode 100644
index 0000000000..584db64822
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/SnapshotTestHelper.java
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2026 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.hbasesnapshots;
+
+import com.google.api.services.storage.model.StorageObject;
+import com.google.cloud.bigtable.beam.hbasesnapshots.conf.SnapshotConfig;
+import com.google.common.base.Joiner;
+import com.google.common.io.ByteStreams;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.channels.Channels;
+import java.nio.channels.ReadableByteChannel;
+import java.nio.channels.WritableByteChannel;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.sdk.io.FileSystems;
+import org.apache.beam.sdk.io.fs.ResourceId;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.util.MimeTypes;
+
+/** Contains various helper methods to handle different tasks while executing tests. */
+public class SnapshotTestHelper {
+ private SnapshotTestHelper() {}
+
+ /**
+ * Helper to generate files for testing.
+ *
+ * @param filePath The path to the file to write.
+ * @param fileContents The content to write.
+ * @throws IOException If an error occurs while creating or writing the file.
+ */
+ static void writeToFile(String filePath, String fileContents) throws IOException {
+
+ ResourceId resourceId = FileSystems.matchNewResource(filePath, false);
+
+ // Write the file contents to the channel and close.
+ try (ReadableByteChannel readChannel =
+ Channels.newChannel(new ByteArrayInputStream(fileContents.getBytes()))) {
+ try (WritableByteChannel writeChannel = FileSystems.create(resourceId, MimeTypes.TEXT)) {
+ ByteStreams.copy(readChannel, writeChannel);
+ }
+ }
+ }
+
+ /**
+ * @param restorePath - Path to which snapshots will be restored temporarily
+ * @return SnapshotConfig - Returns the snapshot config
+ */
+ public static SnapshotConfig newSnapshotConfig(String restorePath) {
+ return newSnapshotConfig("testsourcepath", restorePath);
+ }
+
+ public static SnapshotConfig newSnapshotConfig(String sourcePath, String restorePath) {
+ return SnapshotConfig.builder()
+ .setProjectId("testproject")
+ .setSourceLocation(sourcePath)
+ .setRestoreLocation(restorePath)
+ .setSnapshotName("testsnapshot")
+ .setTableName("testtable")
+ .setConfigurationDetails(new HashMap<>())
+ .build();
+ }
+
+ /**
+ * Helper method providing pipeline options.
+ *
+ * @param args list of pipeline arguments.
+ */
+ static ImportJobFromHbaseSnapshot.ImportOptions getPipelineOptions(String[] args) {
+ return PipelineOptionsFactory.fromArgs(args).as(ImportJobFromHbaseSnapshot.ImportOptions.class);
+ }
+
+ /**
+ * Creates fake storage objects.
+ *
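+ * <p>For example, basePath "snapshots/20220309230526" with object name "audit-events" yields
+ * the id "snapshots/20220309230526/.hbase-snapshot/audit-events/.snapshotinfo".
+ *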
+ * @param basePath File System base path
+ * @param objectNames List of object names
+ * @return List of matching Storage objects
+ */
+ static List<StorageObject> createFakeStorageObjects(String basePath, List<String> objectNames) {
+ if (objectNames == null) return null;
+
+ List<StorageObject> storageObjects = new ArrayList<>();
+ objectNames.forEach(
+ name -> {
+ StorageObject object = new StorageObject();
+ object.setId(Joiner.on("/").join(basePath, ".hbase-snapshot", name, ".snapshotinfo"));
+ storageObjects.add(object);
+ });
+
+ return storageObjects;
+ }
+
+ static Map<String, String> buildMapFromList(String[] values) {
+ if (values.length % 2 != 0)
+ throw new IllegalArgumentException(
+ "Input should contain an even number of values to represent both"
+ + " key and value for the map.");
+ Map<String, String> data = new HashMap<>();
+ for (int i = 0; i < values.length; i += 2) data.put(values[i], values[i + 1]);
+ return data;
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/SnapshotUtilsTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/SnapshotUtilsTest.java
new file mode 100644
index 0000000000..1af5aef156
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/SnapshotUtilsTest.java
@@ -0,0 +1,237 @@
+/*
+ * Copyright 2026 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.hbasesnapshots;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.*;
+
+import com.google.api.services.storage.model.Objects;
+import com.google.api.services.storage.model.StorageObject;
+import com.google.cloud.bigtable.beam.hbasesnapshots.conf.ImportConfig;
+import com.google.cloud.bigtable.beam.hbasesnapshots.conf.SnapshotConfig;
+import java.io.IOException;
+import java.time.Instant;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.beam.sdk.extensions.gcp.util.GcsUtil;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.junit.MockitoJUnit;
+import org.mockito.junit.MockitoRule;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Test cases for the {@link SnapshotUtils} class. */
+@RunWith(JUnit4.class)
+public class SnapshotUtilsTest {
+ private static final Logger LOG = LoggerFactory.getLogger(SnapshotUtilsTest.class);
+
+ @ClassRule public static TemporaryFolder tempFolder = new TemporaryFolder();
+ // Preferred way to instantiate mocks in JUnit4 is via the JUnit rule MockitoJUnit
+ @Rule public final MockitoRule mockito = MockitoJUnit.rule();
+ @Mock GcsUtil gcsUtilMock;
+ @Mock Objects gcsObjects;
+
+ @org.junit.Before
+ public void setup() throws Exception {
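+ // Reset the cached static hbaseConfiguration via reflection so every test starts from a
+ // clean slate instead of reusing state left over from a previous test.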
+ java.lang.reflect.Field field = SnapshotUtils.class.getDeclaredField("hbaseConfiguration");
+ field.setAccessible(true);
+ field.set(null, null);
+ }
+
+ @Test
+ public void testRemoveSuffixSlashIfExists() {
+ String path = "gs://bucket/prefix";
+
+ assertThat(SnapshotUtils.removeSuffixSlashIfExists(path), is(path));
+ assertThat(SnapshotUtils.removeSuffixSlashIfExists(path + "/"), is(path));
+ }
+
+ @Test
+ public void testAppendCurrentTimestamp() {
+ String path = "gs://bucket/prefix";
+ DateTimeFormatter formatter =
+ DateTimeFormatter.ofPattern("yyyyMMddHHmm").withZone(ZoneId.of("UTC"));
+ long currentTime = Long.parseLong(formatter.format(Instant.now()));
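+ // appendCurrentTimestamp is expected to append a "yyyyMMddHHmm-..." style suffix; strip
+ // the base path and compare the minute-resolution prefix against the current time.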
+ String returnVal = SnapshotUtils.appendCurrentTimestamp(path).replace(path + "/", "");
+ long returnTime = Long.parseLong(returnVal.split("-")[0]);
+ assertThat((returnTime - currentTime), lessThan(2L));
+ }
+
+ @Test
+ public void testgetNamedDirectory() {
+ String path = "gs://bucket/subdir1";
+ String subFolder = "subdir2";
+ String expectedPath = "gs://bucket/subdir2";
+ String retValue = SnapshotUtils.getNamedDirectory(path, subFolder);
+ assertThat(retValue.startsWith(expectedPath), is(true));
+ }
+
+ @Test
+ public void testGetConfigurationWithDataflowRunner() {
+ String projectId = "testproject";
+ Map<String, String> configurations =
+ SnapshotUtils.getConfiguration("DataflowRunner", projectId, "/path/to/sourcedir", null);
+ assertThat(configurations.get("fs.gs.project.id"), is(projectId));
+ assertThat(configurations.get("fs.gs.auth.type"), nullValue());
+ }
+
+ @Test
+ public void testGetConfigurationWithDirectRunner() {
+ Map<String, String> hbaseConfiguration =
+ SnapshotTestHelper.buildMapFromList(
+ new String[] {"fs.AbstractFileSystem.gs.impl", "org.apache.hadoop.fs.hdfs"});
+ Map<String, String> configurations =
+ SnapshotUtils.getConfiguration(
+ "DirectRunner", "testproject", "/path/to/sourcedir", hbaseConfiguration);
+ assertThat(
+ configurations.get("fs.AbstractFileSystem.gs.impl"),
+ is(hbaseConfiguration.get("fs.AbstractFileSystem.gs.impl")));
+ assertThat(configurations.get("fs.gs.auth.type"), is("APPLICATION_DEFAULT"));
+ }
+
+ @Test
+ public void testGetHbaseConfiguration() {
+ Map<String, String> configurations =
+ SnapshotTestHelper.buildMapFromList(
+ new String[] {"throttling.enable", "true", "throttling.threshold.ms", "200"});
+ Configuration hbaseConfiguration = SnapshotUtils.getHBaseConfiguration(configurations);
+ assertThat(hbaseConfiguration.getBoolean("throttling.enable", false), is(true));
+ assertThat(hbaseConfiguration.get("throttling.threshold.ms"), is("200"));
+ }
+
+ @Test
+ public void testBuildSnapshotConfigs() {
+ String projectId = "testproject";
+ String sourcePath = "/path/to/sourcedir";
+ String restorePath = "/path/to/restoredir";
+ List<ImportConfig.SnapshotInfo> snapshotInfoList =
+ Arrays.asList(
+ new ImportConfig.SnapshotInfo("snapdemo", "btdemo"),
+ new ImportConfig.SnapshotInfo("bookcontent-9087", "bookcontent"));
+
+ Map<String, String> configuration =
+ SnapshotTestHelper.buildMapFromList(
+ new String[] {"bigtable.row.size", "100", "bigtable.auth.type", "private"});
+
+ List<SnapshotConfig> snapshotConfigs =
+ SnapshotUtils.buildSnapshotConfigs(
+ snapshotInfoList, new HashMap<>(), projectId, sourcePath, restorePath);
+
+ assertThat(snapshotConfigs.size(), is(2));
+ assertThat(snapshotConfigs.get(0).getProjectId(), is(projectId));
+ assertThat(snapshotConfigs.get(0).getSnapshotName(), is("snapdemo"));
+ assertThat(snapshotConfigs.get(1).getSourceLocation(), is(sourcePath));
+ assertThat(snapshotConfigs.get(1).getTableName(), is("bookcontent"));
+ }
+
+ @Test
+ public void testGetSnapshotsFromStringReturnsSameTableName() {
+ String snapshotsWithBigtableTableName = "bookmark-2099";
+ Map<String, String> snapshots =
+ SnapshotUtils.getSnapshotsFromString(snapshotsWithBigtableTableName);
+ assertThat(snapshots.size(), is(equalTo(1)));
+ assertThat(snapshots.get("bookmark-2099"), is("bookmark-2099"));
+ }
+
+ @Test
+ public void testGetSnapshotsFromStringReturnsMultipleTables() {
+ String snapshotsWithBigtableTableName = "snapshot1,snapshot2,snapshot3:mytable3,snapshot4";
+ Map<String, String> snapshots =
+ SnapshotUtils.getSnapshotsFromString(snapshotsWithBigtableTableName);
+ assertThat(snapshots.size(), is(equalTo(4)));
+ assertThat(snapshots.get("snapshot1"), is("snapshot1"));
+ assertThat(snapshots.get("snapshot2"), is("snapshot2"));
+ assertThat(snapshots.get("snapshot3"), is("mytable3"));
+ assertThat(snapshots.get("snapshot4"), is("snapshot4"));
+ }
+
+ @Test
+ public void testGetSnapshotsFromStringReturnsParsedValues() {
+ String snapshotsWithBigtableTableName =
+ "bookmark-2099:bookmark,malwarescanstate-9087:malwarescan";
+ Map<String, String> snapshots =
+ SnapshotUtils.getSnapshotsFromString(snapshotsWithBigtableTableName);
+ assertThat(snapshots.size(), is(equalTo(2)));
+ assertThat(snapshots.get("malwarescanstate-9087"), is("malwarescan"));
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetSnapshotsFromStringThrowsException() {
+ String snapshotsWithBigtableTableName =
+ "bookmark-2099:bookmark,malwarescanstate-9087:malwarescan:snapdemo1";
+ Map<String, String> snapshots =
+ SnapshotUtils.getSnapshotsFromString(snapshotsWithBigtableTableName);
+ }
+
+ private void setUpGcsObjectMocks(List<StorageObject> fakeStorageObjects) throws IOException {
+ Mockito.when(gcsObjects.getItems()).thenReturn(fakeStorageObjects);
+ Mockito.when(gcsUtilMock.listObjects(Mockito.anyString(), Mockito.anyString(), Mockito.any()))
+ .thenReturn(gcsObjects);
+ }
+
+ private Map<String, String> getMatchingSnapshotsFromSnapshotPath(
+ List<String> snapshotList, String prefix) throws IOException {
+ String baseObjectPath = "snapshots/20220309230526";
+ String importSnapshotpath = String.format("gs://sym-bucket/%s", baseObjectPath);
+ List<StorageObject> fakeStorageObjects =
+ SnapshotTestHelper.createFakeStorageObjects(baseObjectPath, snapshotList);
+ setUpGcsObjectMocks(fakeStorageObjects);
+ return SnapshotUtils.getSnapshotsFromSnapshotPath(importSnapshotpath, gcsUtilMock, prefix);
+ }
+
+ @Test
+ public void testgetAllSnapshotsFromSnapshotPath() throws IOException {
+ List<String> snapshotList = Arrays.asList("audit-events", "dlpInfo", "ce-metrics-manifest");
+ Map<String, String> snapshots = getMatchingSnapshotsFromSnapshotPath(snapshotList, "*");
+ assertThat(snapshots.size(), is(equalTo(3)));
+ assertThat(snapshots.keySet(), containsInAnyOrder(snapshotList.toArray(new String[0])));
+ }
+
+ @Test
+ public void testgetSubSetSnapshotsFromSnapshotPath() throws IOException {
+ List<String> snapshotList =
+ Arrays.asList(
+ "snapshot-audit-events",
+ "snapshot-attachments",
+ "snapshot-ce-metrics-manifest",
+ "snapshot-attachments-streams");
+ Map<String, String> snapshots =
+ getMatchingSnapshotsFromSnapshotPath(snapshotList, ".*attachments.*");
+ List<String> expectedResult =
+ snapshotList.stream().filter(e -> e.contains("attachments")).collect(Collectors.toList());
+ assertThat(snapshots.size(), is(equalTo(expectedResult.size())));
+ assertThat(snapshots.keySet(), containsInAnyOrder(expectedResult.toArray(new String[0])));
+ }
+
+ @Test(expected = IllegalStateException.class)
+ public void testgetSubSetSnapshotsFromSnapshotPathThrowsException() throws IOException {
+ Map<String, String> snapshots = getMatchingSnapshotsFromSnapshotPath(null, "*");
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/dofn/CleanUpRestoredSnapshotsTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/dofn/CleanUpRestoredSnapshotsTest.java
new file mode 100644
index 0000000000..b647ea404a
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/dofn/CleanUpRestoredSnapshotsTest.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2026 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.hbasesnapshots.dofn;
+
+import com.google.cloud.bigtable.beam.hbasesnapshots.SnapshotTestHelper;
+import com.google.cloud.bigtable.beam.hbasesnapshots.conf.SnapshotConfig;
+import java.io.File;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.junit.Assert;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Tests the {@link CleanupRestoredSnapshots} functionality. */
+@RunWith(JUnit4.class)
+public class CleanUpRestoredSnapshotsTest {
+ private static final Logger LOG = LoggerFactory.getLogger(CleanUpRestoredSnapshotsTest.class);
+
+ @Rule public final transient TestPipeline pipeline = TestPipeline.create();
+ @ClassRule public static TemporaryFolder tempFolder = new TemporaryFolder();
+
+ @Test
+ public void testDeleteRestoredSnapshot() throws Exception {
+ File restoreDir = tempFolder.newFolder();
+ if (restoreDir.exists()) {
+ LOG.info("Created temp folder: {}", restoreDir.getAbsolutePath());
+ SnapshotConfig snapshotConfig =
+ SnapshotTestHelper.newSnapshotConfig(restoreDir.getAbsolutePath());
+ new CleanupRestoredSnapshots().cleanupSnapshot(snapshotConfig);
+ Assert.assertFalse(restoreDir.exists());
+ } else {
+ LOG.warn(
+ "Skipping CleanUpRestoredSnapshotsTest since temporary file was unable to be created in restore path: {}",
+ restoreDir.getAbsolutePath());
+ }
+ }
+
+ /**
+ * Tests {@link CleanupRestoredSnapshots} with an invalid path to verify that the exception is
+ * handled internally.
+ */
+ @Test
+ public void testDeleteRestoredSnapshotWithInvalidPath() throws Exception {
+ pipeline
+ .apply("CreateInput", Create.of(SnapshotTestHelper.newSnapshotConfig("invalid_path")))
+ .apply("DeleteSnapshot", ParDo.of(new CleanupRestoredSnapshots()));
+ pipeline.run();
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/dofn/CleanupHBaseSnapshotRestoreFilesTest.java
similarity index 72%
rename from bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java
rename to bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/dofn/CleanupHBaseSnapshotRestoreFilesTest.java
index 0183f856f1..bfe46f5191 100644
--- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/CleanupHBaseSnapshotRestoreFilesFnTest.java
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/dofn/CleanupHBaseSnapshotRestoreFilesTest.java
@@ -13,15 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.google.cloud.bigtable.beam.hbasesnapshots;
+package com.google.cloud.bigtable.beam.hbasesnapshots.dofn;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThrows;
+import com.google.cloud.bigtable.beam.hbasesnapshots.conf.HBaseSnapshotInputConfigBuilder;
import java.util.UUID;
import org.junit.Test;
-public class CleanupHBaseSnapshotRestoreFilesFnTest {
+public class CleanupHBaseSnapshotRestoreFilesTest {
private static final String TEST_BUCKET_NAME = "test-bucket";
private static final String TEST_SNAPSHOT_PATH = "gs://" + TEST_BUCKET_NAME + "/hbase-export";
private static final String TEST_RESTORE_PATH =
@@ -32,24 +33,24 @@ public class CleanupHBaseSnapshotRestoreFilesFnTest {
public void testGetWorkingBucketName() {
assertEquals(
TEST_BUCKET_NAME,
- CleanupHBaseSnapshotRestoreFilesFn.getWorkingBucketName(TEST_SNAPSHOT_PATH));
+ CleanupHBaseSnapshotRestoreFiles.getWorkingBucketName(TEST_SNAPSHOT_PATH));
assertThrows(
IllegalArgumentException.class,
() -> {
- CleanupHBaseSnapshotRestoreFilesFn.getWorkingBucketName(TEST_BUCKET_NAME);
+ CleanupHBaseSnapshotRestoreFiles.getWorkingBucketName(TEST_BUCKET_NAME);
});
}
@Test
public void testGetListPrefix() {
assertEquals(
- TEST_RESTORE_PREFIX, CleanupHBaseSnapshotRestoreFilesFn.getListPrefix(TEST_RESTORE_PATH));
+ TEST_RESTORE_PREFIX, CleanupHBaseSnapshotRestoreFiles.getListPrefix(TEST_RESTORE_PATH));
assertThrows(
IllegalArgumentException.class,
() -> {
- CleanupHBaseSnapshotRestoreFilesFn.getWorkingBucketName(TEST_RESTORE_PREFIX);
+ CleanupHBaseSnapshotRestoreFiles.getWorkingBucketName(TEST_RESTORE_PREFIX);
});
}
}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/dofn/RestoreSnapshotTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/dofn/RestoreSnapshotTest.java
new file mode 100644
index 0000000000..386f6a9b2d
--- /dev/null
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/hbasesnapshots/dofn/RestoreSnapshotTest.java
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2026 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.cloud.bigtable.beam.hbasesnapshots.dofn;
+
+import com.google.cloud.bigtable.beam.hbasesnapshots.SnapshotTestHelper;
+import com.google.cloud.bigtable.beam.hbasesnapshots.conf.SnapshotConfig;
+import org.apache.hadoop.hbase.snapshot.RestoreSnapshotHelper;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.mockito.MockedStatic;
+import org.mockito.Mockito;
+
+/** Tests the {@link RestoreSnapshot} functionality. */
+@RunWith(JUnit4.class)
+public class RestoreSnapshotTest {
+
+ @ClassRule public static TemporaryFolder tempFolder = new TemporaryFolder();
+
+ @Test
+ public void testRestoreSnapshot() throws Exception {
+
+ SnapshotConfig snapshotConfig =
+ SnapshotTestHelper.newSnapshotConfig(
+ tempFolder.newFolder().getAbsolutePath(), tempFolder.newFolder().getAbsolutePath());
+
+ try (MockedStatic<RestoreSnapshotHelper> restoreSnapshotHelper =
+ Mockito.mockStatic(RestoreSnapshotHelper.class)) {
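+ // Stub the static restore helper so the test only verifies that RestoreSnapshot
+ // delegates with the expected arguments, without touching any real snapshot files.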
+ restoreSnapshotHelper
+ .when(
+ () ->
+ RestoreSnapshotHelper.copySnapshotForScanner(
+ snapshotConfig.getConfiguration(),
+ null,
+ snapshotConfig.getSourcePath(),
+ snapshotConfig.getRestorePath(),
+ snapshotConfig.getSnapshotName()))
+ .thenReturn(null);
+
+ new RestoreSnapshot().restoreSnapshot(snapshotConfig);
+ }
+ }
+}
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/SequenceFileSourceTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/SequenceFileSourceTest.java
index 5d1715ccab..87f6a4c085 100644
--- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/SequenceFileSourceTest.java
+++ b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/sequencefiles/SequenceFileSourceTest.java
@@ -16,12 +16,8 @@
package com.google.cloud.bigtable.beam.sequencefiles;
import static org.hamcrest.MatcherAssert.assertThat;
-import static org.hamcrest.Matchers.containsInAnyOrder;
-import static org.hamcrest.Matchers.hasSize;
-import static org.hamcrest.Matchers.instanceOf;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotEquals;
-import static org.junit.Assert.assertTrue;
+import static org.hamcrest.Matchers.*;
+import static org.junit.Assert.*;
import com.google.common.collect.Lists;
import java.io.File;
diff --git a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java b/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java
deleted file mode 100644
index 64e7c6959e..0000000000
--- a/bigtable-dataflow-parent/bigtable-beam-import/src/test/java/com/google/cloud/bigtable/beam/validation/ComputeAndValidateHashFromBigtableDoFnTest.java
+++ /dev/null
@@ -1,474 +0,0 @@
-/*
- * Copyright 2021 Google LLC
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.google.cloud.bigtable.beam.validation;
-
-import static com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.models.GCRules.GCRULES;
-
-import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.BigtableTableAdminClient;
-import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.BigtableTableAdminSettings;
-import com.google.bigtable.repackaged.com.google.cloud.bigtable.admin.v2.models.CreateTableRequest;
-import com.google.cloud.bigtable.beam.CloudBigtableTableConfiguration;
-import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
-import com.google.cloud.bigtable.hbase.BigtableConfiguration;
-import com.google.cloud.bigtable.hbase.BigtableOptionsFactory;
-import com.google.cloud.bigtable.test.helper.BigtableEmulatorRule;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
-import java.util.stream.StreamSupport;
-import org.apache.beam.sdk.PipelineResult;
-import org.apache.beam.sdk.metrics.MetricQueryResults;
-import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
-import org.apache.beam.sdk.testing.PAssert;
-import org.apache.beam.sdk.testing.TestPipeline;
-import org.apache.beam.sdk.transforms.Create;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.hadoop.hbase.HConstants;
-import org.apache.hadoop.hbase.TableName;
-import org.apache.hadoop.hbase.client.Connection;
-import org.apache.hadoop.hbase.client.Delete;
-import org.apache.hadoop.hbase.client.Put;
-import org.apache.hadoop.hbase.client.Result;
-import org.apache.hadoop.hbase.client.Scan;
-import org.apache.hadoop.hbase.client.Table;
-import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
-import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-@RunWith(JUnit4.class)
-public class ComputeAndValidateHashFromBigtableDoFnTest {
-
- private static final byte[] EMPTY_ROW_KEY = HConstants.EMPTY_BYTE_ARRAY;
- protected final Logger LOG = LoggerFactory.getLogger(getClass());
-
- public static final String FAKE_TABLE = "fake-table";
- private static final String ROW_KEY_PREFIX = "row-";
- private static final String VALUE_PREFIX = "value-";
- private static final byte[] EXTRA_VALUE = "add".getBytes();
- private static final byte[] CF = "cf".getBytes();
- private static final byte[] CF2 = "cf".getBytes();
- private static final byte[] COL = "col".getBytes();
- private static final long TS = 1000l;
- private static final int FIRST_ROW_INDEX = 20;
- private static final int LAST_ROW_INDEX = 31;
-
- @Rule public final BigtableEmulatorRule bigtableEmulator = new BigtableEmulatorRule();
-
- @Rule public final transient TestPipeline p = TestPipeline.create();
-
- private ComputeAndValidateHashFromBigtableDoFn doFn;
-
- // Clients that will be connected to the emulator
- private BigtableTableAdminClient tableAdminClient;
-
- private Connection connection;
- private Table table;
- // Fake a TableHashWrapper.
- private FakeTableHashWrapper fakeTableHashWrapper;
-
- private List<RangeHash> hashes;
-
- @Before
- public void setUp() throws IOException {
- hashes = new ArrayList<>();
- // Initialize the clients to connect to the emulator
- tableAdminClient =
- BigtableTableAdminClient.create(
- BigtableTableAdminSettings.newBuilderForEmulator(bigtableEmulator.getPort())
- .setProjectId("fake-project")
- .setInstanceId("fake-instance")
- .build());
-
- CloudBigtableTableConfiguration config =
- new CloudBigtableTableConfiguration.Builder()
- .withProjectId("fake-project")
- .withInstanceId("fake-instance")
- .withTableId(FAKE_TABLE)
- .withConfiguration(
- BigtableOptionsFactory.BIGTABLE_EMULATOR_HOST_KEY,
- "localhost:" + bigtableEmulator.getPort())
- .build();
-
- connection = BigtableConfiguration.connect(config.toHBaseConfig());
- table = connection.getTable(TableName.valueOf(FAKE_TABLE));
- fakeTableHashWrapper = new FakeTableHashWrapper();
- // Scan all the cells for the column, HBase scan fetches 1 cell/column by default
- fakeTableHashWrapper.scan = new Scan().setMaxVersions();
-
- FakeTableHashWrapperFactory fakeFactory = new FakeTableHashWrapperFactory(fakeTableHashWrapper);
-
- doFn =
- new ComputeAndValidateHashFromBigtableDoFn(
- config,
- StaticValueProvider.of(FAKE_TABLE),
- StaticValueProvider.of("proj"),
- StaticValueProvider.of("hash"),
- fakeFactory);
-
- // Create a test table that can be used in tests
- tableAdminClient.createTable(
- CreateTableRequest.of(FAKE_TABLE)
- .addFamily(new String(CF), GCRULES.maxVersions(100))
- .addFamily(new String(CF2), GCRULES.maxVersions(100)));
-
- p.getCoderRegistry().registerCoderForClass(RangeHash.class, new RangeHashCoder());
-
- // Fill CBT table with data.
- writeDataToTable();
- }
-
- @After
- public void tearDown() throws IOException {
- doFn.cleanupConnection();
- // TODO should we delete the table for each test?
- tableAdminClient.deleteTable(FAKE_TABLE);
- tableAdminClient.close();
- connection.close();
- }
-
- private byte[] getRowKey(int i) {
- return (ROW_KEY_PREFIX + i).getBytes();
- }
-
- private byte[] getValue(int rowIndex, int cellIndex) {
- return (VALUE_PREFIX + rowIndex + "-" + cellIndex).getBytes();
- }
-
- private void writeDataToTable() throws IOException {
- List<Put> puts = new ArrayList<>();
- // Tests use the rows 21-30. Setup some extra data simulate the real world scenario where
- // there will be other workitems working parallely on the table.
- for (int i = 20; i < 32; i++) {
- for (int j = 0; j < 2; j++) {
- // Insert rows with 2 cells each
- Put put = new Put(getRowKey(i));
- put.addColumn(CF, COL, TS + j, getValue(i, j));
- puts.add(put);
- }
- }
- table.put(puts);
- }
-
- /** Deletes the row range [startIndex, stopIndex) */
- private void deleteRange(int startIndex, int stopIndex) throws IOException {
- for (int i = startIndex; i < stopIndex; i++) {
- table.delete(new Delete(getRowKey(i)));
- }
- }
-
- // Creates a RangeHash for range [startRow, stopRow).
- private RangeHash createHash(byte[] startRow, byte[] stopRow) throws IOException {
- LOG.debug("Creating hash for rows " + startRow + " to " + stopRow);
- BigtableResultHasher hasher = new BigtableResultHasher();
- hasher.startBatch(new ImmutableBytesWritable(startRow));
-
- // Scan all the cells for a column.
- Scan scan = new Scan().setMaxVersions().withStartRow(startRow).withStopRow(stopRow, false);
-
- // Read the rows from Bigtable and compute the expected hash.
- for (Result result : table.getScanner(scan)) {
- LOG.debug("Adding result to hash: " + result);
- hasher.hashResult(result);
- }
- hasher.finishBatch();
- return RangeHash.of(
- new ImmutableBytesWritable(startRow),
- new ImmutableBytesWritable(stopRow),
- hasher.getBatchHash());
- }
-
- private void validateCounters(
- PipelineResult result, Long expectedMatches, Long expectedMismatches) {
- MetricQueryResults metrics = result.metrics().allMetrics();
- Map<String, Long> counters =
- StreamSupport.stream(metrics.getCounters().spliterator(), false)
- .collect(Collectors.toMap((m) -> m.getName().getName(), (m) -> m.getAttempted()));
- Assert.assertEquals(expectedMatches, counters.get("ranges_matched"));
- Assert.assertEquals(expectedMismatches, counters.get("ranges_not_matched"));
- }
-
- ////////// Happy case tests for various setups//////////////////////
- @Test
- public void testHashMatchesForMultipleRange() throws Exception {
- hashes.add(createHash(getRowKey(21), getRowKey(24)));
- hashes.add(createHash(getRowKey(24), getRowKey(28)));
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output).empty();
- PipelineResult result = p.run();
- validateCounters(result, 2L, 0L);
- }
-
- @Test
- public void testHashMatchesForSingleRange() throws Exception {
- hashes.add(createHash(getRowKey(21), getRowKey(24)));
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output).containsInAnyOrder();
- PipelineResult result = p.run();
- validateCounters(result, 1L, 0L);
- }
-
- @Test
- public void testHashMatchesForFullTableScanWithMultipleRange() throws Exception {
- hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(24)));
- hashes.add(createHash(getRowKey(24), EMPTY_ROW_KEY));
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output).empty();
- PipelineResult result = p.run();
- validateCounters(result, 2L, 0L);
- }
-
- @Test
- public void testHashMatchesForMultipleSingleRowRange() throws Exception {
- hashes.add(createHash(getRowKey(22), getRowKey(23)));
- hashes.add(createHash(getRowKey(23), getRowKey(24)));
- hashes.add(createHash(getRowKey(24), getRowKey(25)));
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(getRowKey(22)), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output).empty();
- PipelineResult result = p.run();
- validateCounters(result, 3L, 0L);
- }
-
- ///////////////// Test mismatches when Bigtable has extra rows ////////////////////
- @Test
- public void testAdditionalCellInMiddle() throws Exception {
- hashes.add(createHash(getRowKey(21), getRowKey(24)));
- hashes.add(createHash(getRowKey(24), getRowKey(27)));
- hashes.add(createHash(getRowKey(27), getRowKey(30)));
-
- // Add an extra cell in the table
- table.put(new Put(getRowKey(25)).addColumn(CF, COL, EXTRA_VALUE));
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output).containsInAnyOrder(hashes.get(1));
- PipelineResult result = p.run();
- validateCounters(result, 2L, 1L);
- }
-
- @Test
- public void testAdditionalRowsAtEnds() throws Exception {
- hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(24)));
- hashes.add(createHash(getRowKey(24), getRowKey(27)));
- hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY));
-
- // Add an extra row in the beginning
- table.put(new Put(getRowKey(1)).addColumn(CF, COL, EXTRA_VALUE));
-
- // Add an extra row at the end.
- table.put(new Put(getRowKey(5)).addColumn(CF, COL, EXTRA_VALUE));
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2));
- PipelineResult result = p.run();
- validateCounters(result, 1L, 2L);
- }
-
- ///////////////////// Test different values ///////////////////////////
- @Test
- public void testDifferentValues() throws Exception {
- hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21)));
- hashes.add(createHash(getRowKey(21), getRowKey(23)));
- hashes.add(createHash(getRowKey(23), getRowKey(25)));
- hashes.add(createHash(getRowKey(25), getRowKey(27)));
- hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY));
-
- // Modify the CF
- table.delete(new Delete(getRowKey(20)).addColumns(CF, COL, TS));
- table.put(new Put(getRowKey(1)).addColumn(CF2, COL, TS, getValue(20, 0)));
-
- // Modify the qualifier
- table.delete(new Delete(getRowKey(22)).addColumns(CF, COL, TS));
- table.put(new Put(getRowKey(22)).addColumn(CF, "random-col".getBytes(), TS, getValue(22, 0)));
-
- // Modify the timestamp
- table.delete(new Delete(getRowKey(24)).addColumns(CF, COL, TS));
- table.put(new Put(getRowKey(24)).addColumn(CF, COL, 1, getValue(24, 0)));
-
- // Modify the value
- table.delete(new Delete(getRowKey(26)).addColumns(CF, COL, TS));
- table.put(new Put(getRowKey(26)).addColumn(CF, COL, getValue(26, 0)));
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output)
- .containsInAnyOrder(hashes.get(0), hashes.get(1), hashes.get(2), hashes.get(3));
- PipelineResult result = p.run();
- validateCounters(result, 1L, 4L);
- }
-
- ////////////////// Tests with CBT missing data //////////////////////////////
- @Test
- public void testMissingRows() throws Exception {
- hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21)));
- hashes.add(createHash(getRowKey(21), getRowKey(23)));
- hashes.add(createHash(getRowKey(23), getRowKey(25)));
- hashes.add(createHash(getRowKey(25), getRowKey(27)));
- hashes.add(createHash(getRowKey(27), EMPTY_ROW_KEY));
-
- // Delete a row at the beginning
- table.delete(new Delete(getRowKey(FIRST_ROW_INDEX)));
-
- // Delete a row at the middle
- table.delete(new Delete(getRowKey(24)));
-
- // Delete a row at the end
- table.delete(new Delete(getRowKey(LAST_ROW_INDEX)));
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output).containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4));
- PipelineResult result = p.run();
- validateCounters(result, 2L, 3L);
- }
-
- @Test
- public void testMissingRanges() throws Exception {
- hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21)));
- hashes.add(createHash(getRowKey(21), getRowKey(23)));
- hashes.add(createHash(getRowKey(23), getRowKey(25)));
- hashes.add(createHash(getRowKey(25), getRowKey(27)));
- hashes.add(createHash(getRowKey(27), getRowKey(29)));
- hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY));
-
- // Delete a range at the beginning
- deleteRange(FIRST_ROW_INDEX, 21);
-
- // Delete a range in middle
- deleteRange(23, 25);
-
- // Delete row ranges at the end, bigtable scanner will finish with multiple row-ranges to
- // process.
- deleteRange(27, LAST_ROW_INDEX + 1);
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output)
- .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5));
- PipelineResult result = p.run();
- validateCounters(result, 2L, 4L);
- }
-
- @Test
- public void testCbtEmpty() throws Exception {
- hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(25)));
- hashes.add(createHash(getRowKey(25), getRowKey(29)));
- hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY));
-
- // Delete all data from bigtable
- deleteRange(FIRST_ROW_INDEX, LAST_ROW_INDEX);
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output).containsInAnyOrder(hashes);
- PipelineResult result = p.run();
- validateCounters(result, 0L, 3L);
- }
-
- ////////////////////// Test that scan is used from TableHash.////////////////////////
- @Test
- public void testScanFromTableHash() throws Exception {
- hashes.add(createHash(getRowKey(21), getRowKey(24)));
- hashes.add(createHash(getRowKey(24), getRowKey(27)));
- hashes.add(createHash(getRowKey(27), getRowKey(30)));
-
- // Update the TableHashWrapper Scan to default. Scan from HashTable.TableHash determines the
- // cells used to compute hash. CBT has to use the same cells for validation.
- fakeTableHashWrapper.scan = new Scan();
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(getRowKey(21)), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output).containsInAnyOrder(hashes);
- PipelineResult result = p.run();
- validateCounters(result, 0L, 3L);
- }
-
- ////////////////////// Combination of different cases //////////////////////////////////
- @Test
- public void testMismatchesComprehensive() throws Exception {
- hashes.add(createHash(EMPTY_ROW_KEY, getRowKey(21)));
- hashes.add(createHash(getRowKey(21), getRowKey(23)));
- hashes.add(createHash(getRowKey(23), getRowKey(25)));
- hashes.add(createHash(getRowKey(25), getRowKey(27)));
- hashes.add(createHash(getRowKey(27), getRowKey(29)));
- hashes.add(createHash(getRowKey(29), EMPTY_ROW_KEY));
-
- // Delete a range at the beginning from CBT
- deleteRange(FIRST_ROW_INDEX, 21);
-
- // Delete a row in middle from CBT
- table.delete(new Delete(getRowKey(23)));
-
- // Update a value in CBT
- table.delete(new Delete(getRowKey(27)).addColumns(CF, COL, TS));
- table.put(new Put(getRowKey(27)).addColumn(CF, COL, getValue(27, 0)));
-
- // Add an extra row at the end.
- table.put(new Put(getRowKey(5)).addColumn(CF, COL, EXTRA_VALUE));
-
- PCollection<KV<String, Iterable<List<RangeHash>>>> input =
- p.apply(Create.of(KV.of(new String(EMPTY_ROW_KEY), Arrays.asList(hashes))));
-
- PCollection<RangeHash> output = input.apply(ParDo.of(doFn));
- PAssert.that(output)
- .containsInAnyOrder(hashes.get(0), hashes.get(2), hashes.get(4), hashes.get(5));
- PipelineResult result = p.run();
- validateCounters(result, 2L, 4L);
- }
-}