From 4fee96bc8299de382217278fc198cc6df76a2615 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Sat, 16 May 2026 18:55:32 +0200
Subject: [PATCH] fix: translate SIMILAR TO pattern to PostgreSQL-compatible regex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`SIMILAR TO` was lowered directly to a regex match without translating
SQL wildcards or anchoring the pattern, so `'abc' SIMILAR TO 'a%'`
returned false instead of true and `.`/`^`/`$` were treated as regex
metacharacters instead of literals.

The planner now translates literal `SIMILAR TO` patterns into an
equivalent POSIX regex (wrapped as `^(?s:...)$`, `%`→`.*`, `_`→`.`,
literal `.`/`^`/`$` escaped, backslash escape and bracket expressions
preserved) before lowering to the existing regex-match operator. The
group keeps a top-level `|` inside the anchors (a bare `^a|b$` would
match any string starting with `a` or ending with `b`), and the `s`
flag lets `%` and `_` match newline characters. NULL patterns flow
through as a typed Utf8 null. Non-literal patterns now return a clear
`not_impl` error rather than a silently wrong result, and the SQL-layer
pattern-type check is widened to accept `LargeUtf8` and `Utf8View`
literals.

Closes #22263.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
Reviewer note: the `index` blob ids below were carried over from the
previous revision of this patch and no longer describe the post-image of
binary.rs and strings.slt; `git apply` and `git am` (without `-3`) do not
verify them.

 .../physical-expr/src/expressions/binary.rs   | 111 ++++++++++++++++++
 .../physical-expr/src/expressions/mod.rs      |   2 +-
 datafusion/physical-expr/src/planner.rs       |  39 ++++++-
 datafusion/sql/src/expr/mod.rs                |   5 +-
 .../sqllogictest/test_files/strings.slt       |  71 ++++++++++-
 5 files changed, 222 insertions(+), 6 deletions(-)

diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index b92668fe9bd0d..7ffd29b70880f 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -995,6 +995,81 @@ pub fn similar_to(
     Ok(Arc::new(BinaryExpr::new(expr, binary_op, pattern)))
 }
 
+/// Translate a SQL `SIMILAR TO` pattern into an equivalent POSIX regex.
+///
+/// PostgreSQL `SIMILAR TO` mixes SQL LIKE wildcards with POSIX-style regex
+/// metacharacters and requires the pattern to match the entire string.
+/// In particular:
+///
+/// * `%` matches any sequence of zero or more characters (like LIKE).
+/// * `_` matches exactly one character (like LIKE).
+/// * `|`, `*`, `+`, `?`, `()`, `{m[,n]}`, `[...]` keep their POSIX regex meaning.
+/// * `.`, `^`, `$` are *literal* characters (not regex metacharacters).
+/// * `\` is the default escape character; `\X` is emitted as the regex escape
+///   `\X`, which makes a wildcard or metacharacter `X` literal.
+///
+/// The translated regex is wrapped as `^(?s:...)$`:
+///
+/// * the anchors make the regex engine enforce a full-string match;
+/// * the group keeps a top-level `|` inside the anchors: `a|b` becomes
+///   `^(?s:a|b)$`, whereas a bare `^a|b$` also matches `ax` and `xb`;
+/// * the `s` flag lets `%` and `_` match newline characters, which `.` does
+///   not match by default in the `regex` crate.
+pub fn translate_similar_to_pattern(pattern: &str) -> String {
+    // The `^(?s:` prefix and `)$` suffix add 7 bytes around the pattern.
+    let mut out = String::with_capacity(pattern.len() + 7);
+    out.push_str("^(?s:");
+    let mut chars = pattern.chars().peekable();
+    while let Some(c) = chars.next() {
+        match c {
+            '%' => out.push_str(".*"),
+            '_' => out.push('.'),
+            '\\' => {
+                // Backslash escapes the next character in SIMILAR TO. Re-emit
+                // the pair so the regex engine sees the same escape.
+                match chars.next() {
+                    Some(next) => {
+                        out.push('\\');
+                        out.push(next);
+                    }
+                    None => out.push_str("\\\\"),
+                }
+            }
+            '[' => {
+                // Pass through a POSIX bracket expression verbatim. Inside a
+                // bracket expression `%`/`_` are literal and most other
+                // metacharacters lose their special meaning, so we copy until
+                // the matching `]`.
+                out.push('[');
+                // The first character after `[` (or `[^`) is always literal
+                // even if it is `]`.
+                if matches!(chars.peek(), Some('^')) {
+                    out.push(chars.next().unwrap());
+                }
+                if matches!(chars.peek(), Some(']')) {
+                    out.push(chars.next().unwrap());
+                }
+                for b in chars.by_ref() {
+                    out.push(b);
+                    if b == ']' {
+                        break;
+                    }
+                }
+            }
+            // SIMILAR TO metacharacters that map 1:1 to regex.
+            '|' | '*' | '+' | '?' | '(' | ')' | '{' | '}' => out.push(c),
+            // Regex metacharacters that SIMILAR TO treats as literals.
+            '.' | '^' | '$' => {
+                out.push('\\');
+                out.push(c);
+            }
+            _ => out.push(c),
+        }
+    }
+    out.push_str(")$");
+    out
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -5410,4 +5485,40 @@ mod tests {
             BooleanArray::from_iter(vec![Some(true), Some(true), Some(true), Some(true)]);
         assert_eq!(eq_result.into_array(4).unwrap().as_boolean(), &expected);
     }
+
+    #[test]
+    fn similar_to_pattern_translation() {
+        let cases = [
+            // Empty pattern matches only the empty string.
+            ("", "^(?s:)$"),
+            // SQL wildcards expand to their POSIX regex equivalents.
+            ("a%", "^(?s:a.*)$"),
+            ("a_b", "^(?s:a.b)$"),
+            // POSIX metacharacters borrowed by SIMILAR TO pass through unchanged.
+            ("p[12]%", "^(?s:p[12].*)$"),
+            ("(foo|bar)+", "^(?s:(foo|bar)+)$"),
+            ("a{2,3}", "^(?s:a{2,3})$"),
+            // Top-level alternation stays inside the anchoring group.
+            ("a|b", "^(?s:a|b)$"),
+            // `.`, `^`, `$` are literal in SIMILAR TO and must be escaped for regex.
+            ("a.b", "^(?s:a\\.b)$"),
+            ("^a$", "^(?s:\\^a\\$)$"),
+            // Backslash escapes the SQL wildcards.
+            ("100\\%", "^(?s:100\\%)$"),
+            ("a\\_b", "^(?s:a\\_b)$"),
+            // Bracket expressions are passed through verbatim, including a
+            // leading literal `]` and `%`/`_` inside the class.
+            ("[%_]", "^(?s:[%_])$"),
+            ("[]abc]", "^(?s:[]abc])$"),
+            ("[^abc]", "^(?s:[^abc])$"),
+            ("[^]abc]", "^(?s:[^]abc])$"),
+        ];
+        for (input, expected) in cases {
+            assert_eq!(
+                translate_similar_to_pattern(input),
+                expected,
+                "pattern: {input:?}"
+            );
+        }
+    }
 }
diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs
index 7cf874c448ea0..2f7a7a2823482 100644
--- a/datafusion/physical-expr/src/expressions/mod.rs
+++ b/datafusion/physical-expr/src/expressions/mod.rs
@@ -40,7 +40,7 @@ pub use crate::PhysicalSortExpr;
 /// Module with some convenient methods used in expression building
 pub use crate::aggregate::stats::StatsType;
 
-pub use binary::{BinaryExpr, binary, similar_to};
+pub use binary::{BinaryExpr, binary, similar_to, translate_similar_to_pattern};
 pub use case::{CaseExpr, case};
 pub use cast::{CastExpr, cast};
 pub use column::{Column, col, with_new_schema};
diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs
index d0d0508a106a5..d25298b4a8a12 100644
--- a/datafusion/physical-expr/src/planner.rs
+++ b/datafusion/physical-expr/src/planner.rs
@@ -21,7 +21,9 @@ use crate::scalar_subquery::ScalarSubqueryExpr;
 use crate::{HigherOrderFunctionExpr, ScalarFunctionExpr};
 use crate::{
     PhysicalExpr,
-    expressions::{self, Column, Literal, binary, like, similar_to},
+    expressions::{
+        self, Column, Literal, binary, like, similar_to, translate_similar_to_pattern,
+    },
 };
 
 use arrow::datatypes::Schema;
@@ -253,8 +255,41 @@ pub fn create_physical_expr(
             }
             let physical_expr =
                 create_physical_expr(expr, input_dfschema, execution_props)?;
+            // SIMILAR TO uses SQL wildcards (`%`, `_`) layered on POSIX regex and
+            // requires a whole-string match. Translate literal patterns to an
+            // equivalent regex so the existing regex-match operator returns
+            // PostgreSQL-compatible results.
+            let translated_pattern = match pattern.as_ref() {
+                Expr::Literal(ScalarValue::Utf8(Some(s)), m) => Expr::Literal(
+                    ScalarValue::Utf8(Some(translate_similar_to_pattern(s))),
+                    m.clone(),
+                ),
+                Expr::Literal(ScalarValue::LargeUtf8(Some(s)), m) => Expr::Literal(
+                    ScalarValue::LargeUtf8(Some(translate_similar_to_pattern(s))),
+                    m.clone(),
+                ),
+                Expr::Literal(ScalarValue::Utf8View(Some(s)), m) => Expr::Literal(
+                    ScalarValue::Utf8View(Some(translate_similar_to_pattern(s))),
+                    m.clone(),
+                ),
+                // NULL pattern: regex match against NULL returns NULL. Use a
+                // typed Utf8 null so the regex kernel can handle it.
+                Expr::Literal(
+                    ScalarValue::Utf8(None)
+                    | ScalarValue::LargeUtf8(None)
+                    | ScalarValue::Utf8View(None)
+                    | ScalarValue::Null,
+                    m,
+                ) => Expr::Literal(ScalarValue::Utf8(None), m.clone()),
+                _ => {
+                    return not_impl_err!(
+                        "SIMILAR TO with a non-literal pattern is not yet supported"
+                    );
+                }
+            };
+            let pattern_expr = &translated_pattern;
             let physical_pattern =
-                create_physical_expr(pattern, input_dfschema, execution_props)?;
+                create_physical_expr(pattern_expr, input_dfschema, execution_props)?;
             similar_to(*negated, *case_insensitive, physical_expr, physical_pattern)
         }
         Expr::Case(case) => {
diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs
index daf092ecd4cf9..7d55874d0eca6 100644
--- a/datafusion/sql/src/expr/mod.rs
+++ b/datafusion/sql/src/expr/mod.rs
@@ -941,7 +941,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     ) -> Result<Expr> {
         let pattern = self.sql_expr_to_logical_expr(pattern, schema, planner_context)?;
         let pattern_type = pattern.get_type(schema)?;
-        if pattern_type != DataType::Utf8 && pattern_type != DataType::Null {
+        if !matches!(
+            pattern_type,
+            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View | DataType::Null
+        ) {
             return plan_err!("Invalid pattern in SIMILAR TO expression");
         }
         let escape_char = match escape_char.map(|v| v.value) {
diff --git a/datafusion/sqllogictest/test_files/strings.slt b/datafusion/sqllogictest/test_files/strings.slt
index 9fa453fa02523..13fa9074b290f 100644
--- a/datafusion/sqllogictest/test_files/strings.slt
+++ b/datafusion/sqllogictest/test_files/strings.slt
@@ -72,8 +72,9 @@ p2e1
 p2m1e1
 
 # SIMILAR TO
+# Uses SQL wildcards (`%`) layered on POSIX regex with whole-string matching.
 query T rowsort
-SELECT s FROM test WHERE s SIMILAR TO 'p[12].*';
+SELECT s FROM test WHERE s SIMILAR TO 'p[12]%';
 ----
 p1
 p1e1
@@ -84,13 +85,79 @@ p2m1e1
 
 # NOT SIMILAR TO
 query T rowsort
-SELECT s FROM test WHERE s NOT SIMILAR TO 'p[12].*';
+SELECT s FROM test WHERE s NOT SIMILAR TO 'p[12]%';
 ----
 P1
 P1e1
 P1m1e1
 e1
 
+# Regression for https://github.com/apache/datafusion/issues/22263:
+# `%` must be treated as the SQL wildcard, not a literal.
+query B
+SELECT 'abc' SIMILAR TO 'a%';
+----
+true
+
+# `_` matches exactly one character.
+query BB
+SELECT 'abc' SIMILAR TO 'a_c', 'abc' SIMILAR TO 'a_';
+----
+true false
+
+# SIMILAR TO is anchored to the whole string, unlike regex `~`.
+query BB
+SELECT 'abc' SIMILAR TO 'b', 'abc' SIMILAR TO 'abc';
+----
+false true
+
+# Top-level alternation is anchored as a whole, not per branch.
+query BBB
+SELECT 'a' SIMILAR TO 'a|b',
+       'abc' SIMILAR TO 'a|c',
+       'abc' SIMILAR TO 'a|abc';
+----
+true false true
+
+# `%` and `_` also match newline characters.
+query BB
+SELECT ('a' || chr(10) || 'b') SIMILAR TO 'a%',
+       ('a' || chr(10) || 'b') SIMILAR TO 'a_b';
+----
+true true
+
+# `.`, `^`, `$` are literal in SIMILAR TO, not regex metacharacters.
+query BBB
+SELECT 'a.b' SIMILAR TO 'a.b',
+       'axb' SIMILAR TO 'a.b',
+       '^abc$' SIMILAR TO '^abc$';
+----
+true false true
+
+# POSIX regex metacharacters keep their meaning.
+query BBB
+SELECT 'foo' SIMILAR TO '(foo|bar)',
+       'aaaa' SIMILAR TO 'a{2,4}',
+       'abc' SIMILAR TO 'ab+c';
+----
+true true true
+
+# Backslash escapes the SQL wildcards.
+query BB
+SELECT '100%' SIMILAR TO '100\%', 'a_b' SIMILAR TO 'a\_b';
+----
+true true
+
+# NULL pattern yields NULL.
+query B
+SELECT 'abc' SIMILAR TO NULL;
+----
+NULL
+
+# Non-literal patterns are not yet supported (better an error than a wrong answer).
+statement error SIMILAR TO with a non-literal pattern is not yet supported
+SELECT s FROM test WHERE s SIMILAR TO s;
+
 # NOT LIKE
 query T rowsort
 SELECT s FROM test WHERE s NOT LIKE 'p1%';