Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions datafusion/physical-expr/src/expressions/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -995,6 +995,74 @@ pub fn similar_to(
Ok(Arc::new(BinaryExpr::new(expr, binary_op, pattern)))
}

/// Translate a SQL `SIMILAR TO` pattern into an equivalent regex.
///
/// PostgreSQL `SIMILAR TO` mixes SQL LIKE wildcards with POSIX-style regex
/// metacharacters and requires the pattern to match the entire string.
/// The translation rules are:
///
/// * `%` becomes `.*` (any sequence of zero or more characters, like LIKE).
/// * `_` becomes `.` (exactly one character, like LIKE).
/// * `|`, `*`, `+`, `?`, `()`, `{m[,n]}` are copied through unchanged.
/// * `[...]` bracket expressions are copied through verbatim, including any
///   nested POSIX character class such as `[:alpha:]`; `%` and `_` inside a
///   bracket expression are therefore left as literals.
/// * `.`, `^`, `$` are *literal* characters in `SIMILAR TO`, so they are
///   backslash-escaped in the output.
/// * `\` is the default escape character, so `\X` means a literal `X`.
///   Escaped ASCII punctuation keeps its backslash (`\%` stays `\%`), while
///   any other escaped character is emitted bare (`\d` becomes `d`) so that
///   the output never contains a backslash followed by a letter, digit,
///   `<`, `>` or non-ASCII character.
/// * A trailing, unpaired `\` is emitted as an escaped literal backslash.
///
/// The translated regex is wrapped with `^...$` so the regex engine enforces
/// a full-string match.
///
/// # Examples
///
/// * `a%` becomes `^a.*$`
/// * `a.b` becomes `^a\.b$`
/// * `[[:digit:]_]%` becomes `^[[:digit:]_].*$`
pub fn translate_similar_to_pattern(pattern: &str) -> String {
    // +2 accounts for the `^` / `$` anchors; escapes may grow it further.
    let mut out = String::with_capacity(pattern.len() + 2);
    out.push('^');
    let mut chars = pattern.chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            '%' => out.push_str(".*"),
            '_' => out.push('.'),
            '\\' => match chars.next() {
                // Backslash + ASCII punctuation is kept as an escaped
                // literal. `<` and `>` are excluded: they need no escaping,
                // and NOTE(review): recent `regex` releases appear to treat
                // `\<` / `\>` as word-boundary assertions -- confirm.
                Some(next)
                    if next.is_ascii_punctuation() && !matches!(next, '<' | '>') =>
                {
                    out.push('\\');
                    out.push(next);
                }
                // Letters, digits, whitespace and non-ASCII characters are
                // not regex metacharacters, so the literal is the character
                // itself. Keeping the backslash here would turn `\d`, `\w`,
                // `\b`, ... into regex classes/assertions instead of literals.
                Some(next) => out.push(next),
                // Dangling escape at end of pattern: emit a literal backslash.
                None => out.push_str("\\\\"),
            },
            '[' => copy_bracket_expression(&mut chars, &mut out),
            // SIMILAR TO metacharacters that map 1:1 to regex.
            '|' | '*' | '+' | '?' | '(' | ')' | '{' | '}' => out.push(c),
            // Regex metacharacters that SIMILAR TO treats as literals.
            '.' | '^' | '$' => {
                out.push('\\');
                out.push(c);
            }
            _ => out.push(c),
        }
    }
    out.push('$');
    out
}

/// Copy one bracket expression verbatim into `out`.
///
/// The opening `[` has already been consumed from `chars`. Everything up to
/// and including the closing `]` is copied unchanged; if the pattern ends
/// before a closing `]` is found, the remainder of the pattern is copied.
fn copy_bracket_expression(
    chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
    out: &mut String,
) {
    out.push('[');
    // An optional leading `^` negates the set.
    if let Some(negation) = chars.next_if_eq(&'^') {
        out.push(negation);
    }
    // A `]` in first position (after the optional `^`) is a literal member
    // of the set rather than the terminator.
    if let Some(literal_close) = chars.next_if_eq(&']') {
        out.push(literal_close);
    }
    while let Some(b) = chars.next() {
        out.push(b);
        match b {
            ']' => break,
            '[' => {
                // A nested `[:name:]` class carries its own `]`, which must
                // not be mistaken for the end of the bracket expression.
                if let Some(len) = posix_class_len(chars.clone()) {
                    out.extend(chars.by_ref().take(len));
                }
            }
            _ => {}
        }
    }
}

/// Return the number of characters making up `:name:]` at the start of
/// `lookahead`, or `None` if the input does not start with such a sequence.
///
/// `name` must consist of one or more ASCII letters. The iterator is taken
/// by value, so the caller's position is not advanced.
fn posix_class_len(
    mut lookahead: std::iter::Peekable<std::str::Chars<'_>>,
) -> Option<usize> {
    if lookahead.next() != Some(':') {
        return None;
    }
    let mut len = 1;
    let mut saw_name = false;
    loop {
        match lookahead.next()? {
            // Closing `:` must be immediately followed by `]`.
            ':' if saw_name => return (lookahead.next() == Some(']')).then_some(len + 2),
            ch if ch.is_ascii_alphabetic() => {
                saw_name = true;
                len += 1;
            }
            _ => return None,
        }
    }
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -5410,4 +5478,38 @@ mod tests {
BooleanArray::from_iter(vec![Some(true), Some(true), Some(true), Some(true)]);
assert_eq!(eq_result.into_array(4).unwrap().as_boolean(), &expected);
}

/// Table-driven check that `translate_similar_to_pattern` produces the
/// expected anchored regex for each supported `SIMILAR TO` construct.
#[test]
fn similar_to_pattern_translation() {
    // Each entry is `(SIMILAR TO pattern, expected translated regex)`.
    let expectations: &[(&str, &str)] = &[
        // An empty pattern translates to a regex matching only "".
        ("", "^$"),
        // `%` and `_` become `.*` and `.` respectively.
        ("a%", "^a.*$"),
        ("a_b", "^a.b$"),
        // Metacharacters shared with POSIX regex are copied as-is.
        ("p[12]%", "^p[12].*$"),
        ("(foo|bar)+", "^(foo|bar)+$"),
        ("a{2,3}", "^a{2,3}$"),
        // `.`, `^`, `$` are plain characters in SIMILAR TO, so the regex
        // form needs them escaped.
        ("a.b", "^a\\.b$"),
        ("^a$", "^\\^a\\$$"),
        // A backslash turns the SQL wildcards into literals.
        ("100\\%", "^100\\%$"),
        ("a\\_b", "^a\\_b$"),
        // Bracket expressions survive untouched: wildcards inside stay
        // literal, and a leading `]` (optionally after `^`) is a member.
        ("[%_]", "^[%_]$"),
        ("[]abc]", "^[]abc]$"),
        ("[^abc]", "^[^abc]$"),
        ("[^]abc]", "^[^]abc]$"),
    ];
    expectations.iter().for_each(|&(input, expected)| {
        let translated = translate_similar_to_pattern(input);
        assert_eq!(translated, expected, "pattern: {input:?}");
    });
}
}
2 changes: 1 addition & 1 deletion datafusion/physical-expr/src/expressions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ pub use crate::PhysicalSortExpr;
/// Module with some convenient methods used in expression building
pub use crate::aggregate::stats::StatsType;

pub use binary::{BinaryExpr, binary, similar_to};
pub use binary::{BinaryExpr, binary, similar_to, translate_similar_to_pattern};
pub use case::{CaseExpr, case};
pub use cast::{CastExpr, cast};
pub use column::{Column, col, with_new_schema};
Expand Down
39 changes: 37 additions & 2 deletions datafusion/physical-expr/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ use crate::scalar_subquery::ScalarSubqueryExpr;
use crate::{HigherOrderFunctionExpr, ScalarFunctionExpr};
use crate::{
PhysicalExpr,
expressions::{self, Column, Literal, binary, like, similar_to},
expressions::{
self, Column, Literal, binary, like, similar_to, translate_similar_to_pattern,
},
};

use arrow::datatypes::Schema;
Expand Down Expand Up @@ -253,8 +255,41 @@ pub fn create_physical_expr(
}
let physical_expr =
create_physical_expr(expr, input_dfschema, execution_props)?;
// SIMILAR TO uses SQL wildcards (`%`, `_`) layered on POSIX regex and
// requires a whole-string match. Translate literal patterns to an
// equivalent regex so the existing regex-match operator returns
// PostgreSQL-compatible results.
let translated_pattern = match pattern.as_ref() {
Expr::Literal(ScalarValue::Utf8(Some(s)), m) => Expr::Literal(
ScalarValue::Utf8(Some(translate_similar_to_pattern(s))),
m.clone(),
),
Expr::Literal(ScalarValue::LargeUtf8(Some(s)), m) => Expr::Literal(
ScalarValue::LargeUtf8(Some(translate_similar_to_pattern(s))),
m.clone(),
),
Expr::Literal(ScalarValue::Utf8View(Some(s)), m) => Expr::Literal(
ScalarValue::Utf8View(Some(translate_similar_to_pattern(s))),
m.clone(),
),
// NULL pattern: regex match against NULL returns NULL. Use a
// typed Utf8 null so the regex kernel can handle it.
Expr::Literal(
ScalarValue::Utf8(None)
| ScalarValue::LargeUtf8(None)
| ScalarValue::Utf8View(None)
| ScalarValue::Null,
m,
) => Expr::Literal(ScalarValue::Utf8(None), m.clone()),
_ => {
return not_impl_err!(
"SIMILAR TO with a non-literal pattern is not yet supported"
);
}
};
let pattern_expr = &translated_pattern;
let physical_pattern =
create_physical_expr(pattern, input_dfschema, execution_props)?;
create_physical_expr(pattern_expr, input_dfschema, execution_props)?;
similar_to(*negated, *case_insensitive, physical_expr, physical_pattern)
}
Expr::Case(case) => {
Expand Down
5 changes: 4 additions & 1 deletion datafusion/sql/src/expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -941,7 +941,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
) -> Result<Expr> {
let pattern = self.sql_expr_to_logical_expr(pattern, schema, planner_context)?;
let pattern_type = pattern.get_type(schema)?;
if pattern_type != DataType::Utf8 && pattern_type != DataType::Null {
if !matches!(
pattern_type,
DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View | DataType::Null
) {
return plan_err!("Invalid pattern in SIMILAR TO expression");
}
let escape_char = match escape_char.map(|v| v.value) {
Expand Down
56 changes: 54 additions & 2 deletions datafusion/sqllogictest/test_files/strings.slt
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ p2e1
p2m1e1

# SIMILAR TO
# Uses SQL wildcards (`%`) layered on POSIX regex with whole-string matching.
query T rowsort
SELECT s FROM test WHERE s SIMILAR TO 'p[12].*';
SELECT s FROM test WHERE s SIMILAR TO 'p[12]%';
----
p1
p1e1
Expand All @@ -84,13 +85,64 @@ p2m1e1

# NOT SIMILAR TO
query T rowsort
SELECT s FROM test WHERE s NOT SIMILAR TO 'p[12].*';
SELECT s FROM test WHERE s NOT SIMILAR TO 'p[12]%';
----
P1
P1e1
P1m1e1
e1

# Regression for https://github.com/apache/datafusion/issues/22263:
# `%` must be treated as the SQL wildcard, not a literal.
query B
SELECT 'abc' SIMILAR TO 'a%';
----
true

# `_` matches exactly one character.
query BB
SELECT 'abc' SIMILAR TO 'a_c', 'abc' SIMILAR TO 'a_';
----
true false

# SIMILAR TO is anchored to the whole string, unlike regex `~`.
query BB
SELECT 'abc' SIMILAR TO 'b', 'abc' SIMILAR TO 'abc';
----
false true

# `.`, `^`, `$` are literal in SIMILAR TO, not regex metacharacters.
query BBB
SELECT 'a.b' SIMILAR TO 'a.b',
'axb' SIMILAR TO 'a.b',
'^abc$' SIMILAR TO '^abc$';
----
true false true

# POSIX regex metacharacters keep their meaning.
query BBB
SELECT 'foo' SIMILAR TO '(foo|bar)',
'aaaa' SIMILAR TO 'a{2,4}',
'abc' SIMILAR TO 'ab+c';
----
true true true

# Backslash escapes the SQL wildcards.
query BB
SELECT '100%' SIMILAR TO '100\%', 'a_b' SIMILAR TO 'a\_b';
----
true true

# NULL pattern yields NULL.
query B
SELECT 'abc' SIMILAR TO NULL;
----
NULL

# Non-literal patterns are not yet supported (better an error than a wrong answer).
statement error SIMILAR TO with a non-literal pattern is not yet supported
SELECT s FROM test WHERE s SIMILAR TO s;

# NOT LIKE
query T rowsort
SELECT s FROM test WHERE s NOT LIKE 'p1%';
Expand Down
Loading