diff --git a/docs/content/primary-key-table/pk-clustering-override.md b/docs/content/primary-key-table/pk-clustering-override.md index 3348bc0482bb..91b3fa14e884 100644 --- a/docs/content/primary-key-table/pk-clustering-override.md +++ b/docs/content/primary-key-table/pk-clustering-override.md @@ -50,6 +50,23 @@ CREATE TABLE my_table ( ); ``` +For the `first-row` merge engine, deletion vectors are built in, so you do not need to enable them explicitly: + +```sql +CREATE TABLE my_table ( + id BIGINT, + dt STRING, + city STRING, + amount DOUBLE, + PRIMARY KEY (id) NOT ENFORCED +) WITH ( + 'pk-clustering-override' = 'true', + 'clustering.columns' = 'city', + 'merge-engine' = 'first-row', + 'bucket' = '4' +); +``` + After this, data files within each bucket will be physically sorted by `city` instead of `id`. Queries like `SELECT * FROM my_table WHERE city = 'Beijing'` can skip irrelevant data files by checking their min/max statistics on the clustering column. @@ -60,7 +77,7 @@ on the clustering column. 
|--------|-------------| | `pk-clustering-override` | `true` | | `clustering.columns` | Must be set (one or more non-primary-key columns) | -| `deletion-vectors.enabled` | Must be `true` | +| `deletion-vectors.enabled` | Must be `true` (not required for the `first-row` merge engine) | | `merge-engine` | `deduplicate` (default) or `first-row` only | ## When to Use diff --git a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java index 2ff1080c4a84..271709c47ef5 100644 --- a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java +++ b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java @@ -522,7 +522,7 @@ private static void validateForDeletionVectors(CoreOptions options) { || options.changelogProducer() == ChangelogProducer.LOOKUP, "Deletion vectors mode is only supported for NONE/INPUT/LOOKUP changelog producer now."); - // pk-clustering-override mode requires deletion vectors even for first-row + // pk-clustering-override mode allows deletion vectors for first-row if (!options.pkClusteringOverride()) { checkArgument( !options.mergeEngine().equals(MergeEngine.FIRST_ROW), @@ -847,7 +847,8 @@ public static void validatePkClusteringOverride(CoreOptions options) { throw new IllegalArgumentException( "Cannot support 'pk-clustering-override' mode without 'clustering.columns'."); } - if (!options.deletionVectorsEnabled()) { + if (!options.deletionVectorsEnabled() + && options.mergeEngine() != CoreOptions.MergeEngine.FIRST_ROW) { throw new UnsupportedOperationException( "Cannot support deletion-vectors disabled in 'pk-clustering-override' mode."); } diff --git a/paimon-core/src/test/java/org/apache/paimon/separated/ClusteringTableTest.java b/paimon-core/src/test/java/org/apache/paimon/separated/ClusteringTableTest.java index e9ddc9e000eb..3314c1d8062a 100644 --- a/paimon-core/src/test/java/org/apache/paimon/separated/ClusteringTableTest.java +++ 
b/paimon-core/src/test/java/org/apache/paimon/separated/ClusteringTableTest.java @@ -519,6 +519,22 @@ public void testFirstRowBasic() throws Exception { .containsExactlyInAnyOrder(GenericRow.of(1, 100), GenericRow.of(2, 200)); } + /** Test first-row mode without explicit deletion-vectors enabled. */ + @Test + public void testFirstRowWithoutDeletionVectors() throws Exception { + Table firstRowTable = createFirstRowTableWithoutDv(); + + // Write initial data + writeRows(firstRowTable, Arrays.asList(GenericRow.of(1, 100), GenericRow.of(2, 200))); + + // Write same keys with different values - should be ignored (first-row keeps first) + writeRows(firstRowTable, Arrays.asList(GenericRow.of(1, 999), GenericRow.of(2, 888))); + + // Should still see the first values + assertThat(readRows(firstRowTable)) + .containsExactlyInAnyOrder(GenericRow.of(1, 100), GenericRow.of(2, 200)); + } + /** Test first-row mode with multiple commits. */ @Test public void testFirstRowMultipleCommits() throws Exception { @@ -915,6 +931,22 @@ private Table createFirstRowTable() throws Exception { return catalog.getTable(identifier); } + private Table createFirstRowTableWithoutDv() throws Exception { + Identifier identifier = Identifier.create("default", "first_row_no_dv_table"); + Schema schema = + Schema.newBuilder() + .column("a", DataTypes.INT()) + .column("b", DataTypes.INT()) + .primaryKey("a") + .option(BUCKET.key(), "1") + .option(CLUSTERING_COLUMNS.key(), "b") + .option(PK_CLUSTERING_OVERRIDE.key(), "true") + .option(MERGE_ENGINE.key(), "first-row") + .build(); + catalog.createTable(identifier, schema, false); + return catalog.getTable(identifier); + } + private void writeRows(List rows) throws Exception { writeRows(table, rows); }