From 327a9141026f501eb7cb8ab23a75660a12e2cac7 Mon Sep 17 00:00:00 2001 From: Madhavendra Rathore Date: Wed, 22 Apr 2026 16:51:06 +0530 Subject: [PATCH 1/2] Fix DML misclassification for statements containing UNION/INTERSECT/EXCEPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `Statement.execute()` was incorrectly returning `true` (and `getUpdateCount()` returning `-1`) for `INSERT` / `UPDATE` / `DELETE` / `MERGE` statements whose subqueries or CTEs contained `UNION`, `INTERSECT`, or `EXCEPT`. The `UNION_PATTERN`, `INTERSECT_PATTERN`, and `EXCEPT_PATTERN` regexes in `DatabricksJdbcConstants` are non-anchored (`\s+UNION\s+` etc.) and were matched via `find()` inside `shouldReturnResultSet`, so the keyword was picked up anywhere in the SQL — even deep inside a subquery of an outer DML, or inside the Databricks column-exclusion form `SELECT * EXCEPT (col)`. `executeUpdate()` then threw `DatabricksSQLException`, `getUpdateCount()` lost the affected-row count, and frameworks like Slick that use `!execute()` as the DML detector crashed. This also regressed behavior from the Simba `2.7.5` driver, which returned `false` for these inputs. Short-circuit `shouldReturnResultSet` to `false` when the trimmed query starts with a DML keyword, so the non-anchored set-operator patterns can't fire on subquery content. A separate `DML_PREFIX_PATTERN` is added so `INSERT_PATTERN` (shared with the batching parser and requiring `INSERT INTO`) is untouched — this lets the guard also cover `INSERT OVERWRITE ...`. The existing `NonRowcountQueryPrefixes` opt-in still wins: it is evaluated before the new short-circuit. Fixes https://github.com/databricks/databricks-jdbc/issues/1418 Signed-off-by: Madhavendra Rathore --- NEXT_CHANGELOG.md | 1 + .../jdbc/api/impl/DatabricksStatement.java | 9 +++ .../jdbc/common/DatabricksJdbcConstants.java | 8 ++ .../api/impl/DatabricksStatementTest.java | 78 +++++++++++++++++++ 4 files changed, 96 insertions(+) diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index bc2e82ab7..99d129f15 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -25,6 +25,7 @@ - Fixed `getColumnClassName()` returning null for VARIANT columns in SEA mode by adding VARIANT to the type system. - Fixed `getColumns()` returning `DATA_TYPE=0` (NULL) for GEOMETRY/GEOGRAPHY columns in Thrift mode. Now returns `Types.VARCHAR` (12) when geospatial is disabled and `Types.OTHER` (1111) when enabled, consistent with SEA mode. - Fixed `getCrossReference()` returning 0 rows when parent args are passed in uppercase. The client-side filter used case-sensitive comparison against server-returned lowercase names. +- Fixed `Statement.execute()` incorrectly returning `true` and `getUpdateCount()` returning `-1` for DML statements (`INSERT` / `UPDATE` / `DELETE` / `MERGE`, including `INSERT OVERWRITE`) whose subqueries or CTEs contain `UNION`, `INTERSECT`, or `EXCEPT`. Also fixes the same mis-classification for the Databricks column-exclusion form `SELECT * EXCEPT (col)` when used inside a DML. DML statements are now short-circuited to update-count mode before the non-anchored set-operator regexes are evaluated, matching the JDBC spec and restoring compatibility with frameworks (e.g. Slick) that use `!execute()` to detect DML. ([#1418](https://github.com/databricks/databricks-jdbc/issues/1418)) --- *Note: When making changes, please add your change under the appropriate section diff --git a/src/main/java/com/databricks/jdbc/api/impl/DatabricksStatement.java b/src/main/java/com/databricks/jdbc/api/impl/DatabricksStatement.java index c9b6733e7..ebd142ef2 100644 --- a/src/main/java/com/databricks/jdbc/api/impl/DatabricksStatement.java +++ b/src/main/java/com/databricks/jdbc/api/impl/DatabricksStatement.java @@ -770,6 +770,15 @@ static boolean shouldReturnResultSet(String query, List nonRowcountQuery return true; } + // DML statements (INSERT / UPDATE / DELETE / MERGE) always return an update count per the + // JDBC spec, even when their subqueries or CTEs contain UNION / INTERSECT / EXCEPT. Without + // this short-circuit the non-anchored UNION_PATTERN et al. below match anywhere in the SQL + // and mis-classify DML — including the column-exclusion form `SELECT * EXCEPT (col)`. See + // https://github.com/databricks/databricks-jdbc/issues/1418. + if (DML_PREFIX_PATTERN.matcher(trimmedQuery).find()) { + return false; + } + // Check if the query matches any of the patterns that return a ResultSet return SELECT_PATTERN.matcher(trimmedQuery).find() || SHOW_PATTERN.matcher(trimmedQuery).find() diff --git a/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java b/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java index 7849411f5..568277c42 100644 --- a/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java +++ b/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java @@ -177,6 +177,14 @@ public enum FakeServiceType { public static final Pattern CALL_PATTERN = Pattern.compile("^(\\s*\\()*\\s*CALL", Pattern.CASE_INSENSITIVE); + /** + * Matches statements whose leading keyword is a DML (INSERT / UPDATE / DELETE / MERGE). Unlike + * {@link #INSERT_PATTERN} this also accepts {@code INSERT OVERWRITE ...} (not just {@code INSERT + * INTO ...}) and does not share the batching parser's expectations, so it is safe to broaden. + */ + public static final Pattern DML_PREFIX_PATTERN = + Pattern.compile("^(\\s*\\()*\\s*(INSERT|UPDATE|DELETE|MERGE)\\s+", Pattern.CASE_INSENSITIVE); + /** Maximum number of parameters allowed in a single Databricks query */ public static final int MAX_QUERY_PARAMETERS = 256; diff --git a/src/test/java/com/databricks/jdbc/api/impl/DatabricksStatementTest.java b/src/test/java/com/databricks/jdbc/api/impl/DatabricksStatementTest.java index ff5d9e7ec..3b9d416f2 100644 --- a/src/test/java/com/databricks/jdbc/api/impl/DatabricksStatementTest.java +++ b/src/test/java/com/databricks/jdbc/api/impl/DatabricksStatementTest.java @@ -581,6 +581,84 @@ public void testShouldReturnResultSet_ExceptQuery() { assertTrue(DatabricksStatement.shouldReturnResultSet(query, Collections.emptyList())); } + // https://github.com/databricks/databricks-jdbc/issues/1418 — DML statements whose subqueries + // or CTEs contain UNION / INTERSECT / EXCEPT were being mis-classified as ResultSet-producing. + @Test + public void testShouldReturnResultSet_InsertWithUnionInSubquery() { + String query = + "INSERT INTO my_catalog.my_schema.target_table " + + "SELECT * FROM ( " + + " SELECT col1, col2 FROM src WHERE 1 = 0 " + + " UNION ALL " + + " SELECT col1, col2 FROM src WHERE 1 = 0 " + + ") t"; + assertFalse(DatabricksStatement.shouldReturnResultSet(query, Collections.emptyList())); + } + + @Test + public void testShouldReturnResultSet_InsertWithIntersectInSubquery() { + String query = "INSERT INTO t SELECT x FROM (SELECT x FROM a INTERSECT SELECT x FROM b) s"; + assertFalse(DatabricksStatement.shouldReturnResultSet(query, Collections.emptyList())); + } + + @Test + public void testShouldReturnResultSet_InsertWithExceptInSubquery() { + String query = "INSERT INTO t SELECT x FROM (SELECT x FROM a EXCEPT SELECT x FROM b) s"; + assertFalse(DatabricksStatement.shouldReturnResultSet(query, Collections.emptyList())); + } + + @Test + public void testShouldReturnResultSet_InsertWithSelectStarExceptColumnExclusion() { + // `SELECT * EXCEPT (col)` is Databricks column-exclusion syntax, not a set operator. + String query = "INSERT INTO t SELECT * EXCEPT (secret_col) FROM source"; + assertFalse(DatabricksStatement.shouldReturnResultSet(query, Collections.emptyList())); + } + + @Test + public void testShouldReturnResultSet_InsertOverwriteDirectoryWithIntersect() { + String query = + "INSERT OVERWRITE DIRECTORY 's3://bucket/path' USING CSV " + + "SELECT x FROM a INTERSECT SELECT x FROM b"; + assertFalse(DatabricksStatement.shouldReturnResultSet(query, Collections.emptyList())); + } + + @Test + public void testShouldReturnResultSet_UpdateWithUnionInSubquery() { + String query = "UPDATE t SET col = (SELECT x FROM a UNION SELECT x FROM b LIMIT 1)"; + assertFalse(DatabricksStatement.shouldReturnResultSet(query, Collections.emptyList())); + } + + @Test + public void testShouldReturnResultSet_DeleteWithExceptInSubquery() { + String query = "DELETE FROM t WHERE id IN (SELECT id FROM a EXCEPT SELECT id FROM b)"; + assertFalse(DatabricksStatement.shouldReturnResultSet(query, Collections.emptyList())); + } + + @Test + public void testShouldReturnResultSet_MergeWithUnionInSource() { + String query = + "MERGE INTO target t USING (SELECT id FROM a UNION SELECT id FROM b) s " + + "ON t.id = s.id WHEN MATCHED THEN DELETE"; + assertFalse(DatabricksStatement.shouldReturnResultSet(query, Collections.emptyList())); + } + + @Test + public void testShouldReturnResultSet_DmlPrefixOverriddenByNonRowcountConfig() { + // The NonRowcountQueryPrefixes opt-in path must still win over the DML short-circuit. + String query = "INSERT INTO t VALUES (1)"; + assertTrue( + DatabricksStatement.shouldReturnResultSet(query, Arrays.asList("INSERT")), + "NonRowcountQueryPrefixes=INSERT should force ResultSet mode"); + } + + @Test + public void testShouldReturnResultSet_TopLevelParenthesizedUnionStillMatches() { + // Regression guard: top-level set operations starting with `(` still classify as ResultSet + // via SELECT_PATTERN's `^(\s*\()*\s*SELECT` prefix. + String query = "(SELECT a FROM t1) UNION (SELECT a FROM t2)"; + assertTrue(DatabricksStatement.shouldReturnResultSet(query, Collections.emptyList())); + } + @Test public void testShouldReturnResultSet_DeclareQuery() { String query = "DECLARE @var INT;"; From 4112b9cf6d633ca9efcfaa55363527f51e511d0f Mon Sep 17 00:00:00 2001 From: Madhavendra Rathore Date: Wed, 22 Apr 2026 16:52:45 +0530 Subject: [PATCH 2/2] Remove NEXT_CHANGELOG.md entry for now Will rely on the PR's NO_CHANGELOG=true marker until a changelog entry is ready. Signed-off-by: Madhavendra Rathore --- NEXT_CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 99d129f15..bc2e82ab7 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -25,7 +25,6 @@ - Fixed `getColumnClassName()` returning null for VARIANT columns in SEA mode by adding VARIANT to the type system. - Fixed `getColumns()` returning `DATA_TYPE=0` (NULL) for GEOMETRY/GEOGRAPHY columns in Thrift mode. Now returns `Types.VARCHAR` (12) when geospatial is disabled and `Types.OTHER` (1111) when enabled, consistent with SEA mode. - Fixed `getCrossReference()` returning 0 rows when parent args are passed in uppercase. The client-side filter used case-sensitive comparison against server-returned lowercase names. -- Fixed `Statement.execute()` incorrectly returning `true` and `getUpdateCount()` returning `-1` for DML statements (`INSERT` / `UPDATE` / `DELETE` / `MERGE`, including `INSERT OVERWRITE`) whose subqueries or CTEs contain `UNION`, `INTERSECT`, or `EXCEPT`. Also fixes the same mis-classification for the Databricks column-exclusion form `SELECT * EXCEPT (col)` when used inside a DML. DML statements are now short-circuited to update-count mode before the non-anchored set-operator regexes are evaluated, matching the JDBC spec and restoring compatibility with frameworks (e.g. Slick) that use `!execute()` to detect DML. ([#1418](https://github.com/databricks/databricks-jdbc/issues/1418)) --- *Note: When making changes, please add your change under the appropriate section