From 658cabed1ce50bd214d4775bcc91abcbd6b79272 Mon Sep 17 00:00:00 2001 From: Ioannis Rosuochatzakis Date: Thu, 19 Feb 2026 01:51:00 +0100 Subject: [PATCH 1/3] TEDEFO-4912 Add portable regex subset validation for EFX like patterns --- .../efx/exceptions/InvalidUsageException.java | 8 +- .../efx/sdk2/EfxExpressionTranslatorV2.java | 2 + .../ted/efx/util/EfxRegexValidator.java | 241 ++++++++++++++++ .../ted/efx/util/EfxRegexValidatorTest.java | 265 ++++++++++++++++++ 4 files changed, 515 insertions(+), 1 deletion(-) create mode 100644 src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java create mode 100644 src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java diff --git a/src/main/java/eu/europa/ted/efx/exceptions/InvalidUsageException.java b/src/main/java/eu/europa/ted/efx/exceptions/InvalidUsageException.java index d91bb818..8f62a23d 100644 --- a/src/main/java/eu/europa/ted/efx/exceptions/InvalidUsageException.java +++ b/src/main/java/eu/europa/ted/efx/exceptions/InvalidUsageException.java @@ -29,7 +29,8 @@ public enum ErrorCode { INVALID_NOTICE_SUBTYPE_RANGE_ORDER, INVALID_NOTICE_SUBTYPE_TOKEN, FIELD_NOT_WITHHOLDABLE, - TEMPLATE_ONLY_FUNCTION + TEMPLATE_ONLY_FUNCTION, + UNSUPPORTED_REGEX_CONSTRUCT } private static final String SHORTHAND_REQUIRES_CODE_OR_INDICATOR = "Indirect label reference shorthand #{%1$s}, requires a field of type 'code' or 'indicator'. Field %1$s is of type %2$s."; @@ -38,6 +39,7 @@ public enum ErrorCode { private static final String INVALID_NOTICE_SUBTYPE_TOKEN = "Invalid notice subtype token '%s'. 
Expected format: 'X' or 'X-Y'."; private static final String FIELD_NOT_WITHHOLDABLE = "Field '%s' is always published and cannot be withheld from publication."; private static final String TEMPLATE_ONLY_FUNCTION = "Function '%s' can only be used in templates, not in expressions or validation rules."; + private static final String UNSUPPORTED_REGEX_CONSTRUCT = "Invalid regex pattern %s at position %d: %s"; private final ErrorCode errorCode; @@ -73,4 +75,8 @@ public static InvalidUsageException fieldNotWithholdable(String fieldId) { public static InvalidUsageException templateOnlyFunction(String functionName) { return new InvalidUsageException(ErrorCode.TEMPLATE_ONLY_FUNCTION, String.format(TEMPLATE_ONLY_FUNCTION, functionName)); } + + public static InvalidUsageException unsupportedRegexConstruct(String pattern, int position, String reason) { + return new InvalidUsageException(ErrorCode.UNSUPPORTED_REGEX_CONSTRUCT, String.format(UNSUPPORTED_REGEX_CONSTRUCT, pattern, position, reason)); + } } diff --git a/src/main/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2.java b/src/main/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2.java index acf7f24c..f0b31e7f 100644 --- a/src/main/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2.java +++ b/src/main/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2.java @@ -43,6 +43,7 @@ import eu.europa.ted.efx.exceptions.SymbolResolutionException; import eu.europa.ted.efx.exceptions.TypeMismatchException; import eu.europa.ted.efx.exceptions.ConsistencyCheckException; +import eu.europa.ted.efx.util.EfxRegexValidator; import eu.europa.ted.efx.interfaces.EfxExpressionTranslator; import eu.europa.ted.efx.interfaces.ScriptGenerator; import eu.europa.ted.efx.interfaces.SymbolResolver; @@ -637,6 +638,7 @@ private void exitSequenceDistinctCondition( @Override public void exitLikePatternCondition(LikePatternConditionContext ctx) { + EfxRegexValidator.validate(ctx.pattern.getText()); StringExpression expression = 
this.stack.pop(StringExpression.class); BooleanExpression condition = this.script.composePatternMatchCondition(expression, ctx.pattern.getText()); if (ctx.modifier != null && ctx.modifier.getText().equals(NOT_MODIFIER)) { diff --git a/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java b/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java new file mode 100644 index 00000000..e7e09497 --- /dev/null +++ b/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java @@ -0,0 +1,241 @@ +/* + * Copyright 2025 European Union + * + * Licensed under the EUPL, Version 1.2 or – as soon they will be approved by the European + * Commission – subsequent versions of the EUPL (the "Licence"); You may not use this work except in + * compliance with the Licence. You may obtain a copy of the Licence at: + * https://joinup.ec.europa.eu/software/page/eupl + * + * Unless required by applicable law or agreed to in writing, software distributed under the Licence + * is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the Licence for the specific language governing permissions and limitations under + * the Licence. + */ +package eu.europa.ted.efx.util; + +import java.util.Set; + +import eu.europa.ted.efx.exceptions.InvalidUsageException; + +/** + * Validates that a regex pattern used in an EFX {@code like} expression only uses constructs + * from the portable EFX regex subset. This subset is designed to work identically across all + * EFX target languages (XPath, Java, JavaScript, Python, C#, Swift). + * + *

Allowed constructs:

+ * + * + *

Shorthand character class semantics (ASCII):

+ * + */ +public final class EfxRegexValidator { + + private static final Set<Character> SHORTHAND_CLASSES = Set.of('d', 'D', 'w', 'W', 's', 'S'); + + private static final Set<Character> ESCAPABLE_METACHARACTERS = + Set.of('.', '\\', '(', ')', '[', ']', '{', '}', '*', '+', '?', '|', '^', '$'); + + private EfxRegexValidator() {} + + /** + * Validates that the given EFX regex pattern only uses portable constructs. + * + * @param rawPattern the raw token text of the pattern including delimiters (e.g. {@code '[0-9]+'}) + * @throws InvalidUsageException if the pattern uses unsupported constructs + */ + public static void validate(String rawPattern) { + if (rawPattern == null || rawPattern.length() < 2) { + return; + } + + char delimiter = rawPattern.charAt(0); + String content = rawPattern.substring(1, rawPattern.length() - 1); + int groupDepth = 0; + + for (int i = 0; i < content.length(); i++) { + char c = content.charAt(i); + + if (c == '\\') { + i = validateEscape(rawPattern, content, i); + } else if (c == '[') { + i = validateCharacterClass(rawPattern, content, i); + } else if (c == '(') { + i = validateGroupOpen(rawPattern, content, i); + groupDepth++; + } else if (c == ')') { + if (groupDepth <= 0) { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, i + 1, "unmatched closing parenthesis"); + } + groupDepth--; + } else if (c == '{') { + i = validateRepetition(rawPattern, content, i); + } + // '.', '|', '^', '$', and literal characters are allowed as-is + } + + if (groupDepth > 0) { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, content.length(), "unclosed group — missing ')'"); + } + } + + private static int validateEscape(String rawPattern, String content, int pos) { + if (pos + 1 >= content.length()) { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, "trailing backslash"); + } + + char next = content.charAt(pos + 1); + + if (SHORTHAND_CLASSES.contains(next) || ESCAPABLE_METACHARACTERS.contains(next)) { + return pos + 
1; + } + + // EFX quote escapes (\' and \") — these represent literal quote characters + if (next == '\'' || next == '"') { + return pos + 1; + } + + if (next == 'b' || next == 'B') { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, + "word boundary '\\" + next + "' is not allowed in EFX regex — use '^' and '$' anchors instead"); + } + + if (next >= '1' && next <= '9') { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, + "backreference '\\" + next + "' is not allowed in EFX regex"); + } + + if (next == 'p' || next == 'P') { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, + "Unicode property escape '\\" + next + "{...}' is not allowed in EFX regex — use character classes like '[a-z]' instead"); + } + + if (next == '0' || next == 'x' || next == 'u') { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, + "numeric character escape '\\" + next + "' is not allowed in EFX regex — use the literal character instead"); + } + + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, + "escape sequence '\\" + next + "' is not allowed in EFX regex"); + } + + private static int validateCharacterClass(String rawPattern, String content, int startPos) { + int i = startPos + 1; + + // Allow ^ for negation at the start + if (i < content.length() && content.charAt(i) == '^') { + i++; + } + + // Allow ] as a literal if it's the first character in the class (or after ^) + if (i < content.length() && content.charAt(i) == ']') { + i++; + } + + while (i < content.length()) { + char c = content.charAt(i); + + if (c == ']') { + return i; // end of character class + } + + if (c == '\\') { + if (i + 1 >= content.length()) { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, i + 1, "trailing backslash in character class"); + } + char next = content.charAt(i + 1); + if (SHORTHAND_CLASSES.contains(next) || ESCAPABLE_METACHARACTERS.contains(next) + || 
next == '\'' || next == '"' || next == '-') { + i += 2; + continue; + } + // Reject the same unsupported escapes as outside a character class + validateEscape(rawPattern, content, i); + i += 2; + continue; + } + + // Regular characters and ranges (a-z) are allowed + i++; + } + + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, startPos + 1, "unclosed character class — missing ']'"); + } + + private static int validateGroupOpen(String rawPattern, String content, int pos) { + if (pos + 1 < content.length() && content.charAt(pos + 1) == '?') { + String message; + if (pos + 2 < content.length()) { + char modifier = content.charAt(pos + 2); + if (modifier == '=') { + message = "lookahead '(?=...)' is not allowed in EFX regex"; + } else if (modifier == '!') { + message = "negative lookahead '(?!...)' is not allowed in EFX regex"; + } else if (modifier == '<') { + message = "lookbehind '(?<...)' is not allowed in EFX regex"; + } else if (modifier == ':') { + message = "non-capturing group '(?:...)' is not allowed in EFX regex — use plain '(...)' instead"; + } else { + message = "extended group syntax '(?" 
+ modifier + "...)' is not allowed in EFX regex"; + } + } else { + message = "extended group syntax '(?...)' is not allowed in EFX regex"; + } + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, message); + } + return pos; + } + + private static int validateRepetition(String rawPattern, String content, int startPos) { + int i = startPos + 1; + + // Expect at least one digit + if (i >= content.length() || !Character.isDigit(content.charAt(i))) { + // Treat { as a literal character (some regex engines allow this) + return startPos; + } + + // Parse first number + while (i < content.length() && Character.isDigit(content.charAt(i))) { + i++; + } + + if (i >= content.length()) { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, startPos + 1, "unclosed repetition quantifier — missing '}'"); + } + + if (content.charAt(i) == '}') { + return i; + } + + if (content.charAt(i) == ',') { + i++; + // Optional second number + while (i < content.length() && Character.isDigit(content.charAt(i))) { + i++; + } + if (i >= content.length() || content.charAt(i) != '}') { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, startPos + 1, "unclosed repetition quantifier — missing '}'"); + } + return i; + } + + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, startPos + 1, "invalid repetition quantifier"); + } +} diff --git a/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java b/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java new file mode 100644 index 00000000..beeacbfc --- /dev/null +++ b/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java @@ -0,0 +1,265 @@ +/* + * Copyright 2025 European Union + * + * Licensed under the EUPL, Version 1.2 or – as soon they will be approved by the European + * Commission – subsequent versions of the EUPL (the "Licence"); You may not use this work except in + * compliance with the Licence. 
You may obtain a copy of the Licence at: + * https://joinup.ec.europa.eu/software/page/eupl + * + * Unless required by applicable law or agreed to in writing, software distributed under the Licence + * is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the Licence for the specific language governing permissions and limitations under + * the Licence. + */ +package eu.europa.ted.efx.util; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +import eu.europa.ted.efx.exceptions.InvalidUsageException; + +class EfxRegexValidatorTest { + + @Nested + class AllowedConstructs { + + @Test + void testLiteralCharacters() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'abc123'")); + } + + @Test + void testDot() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a.b'")); + } + + @Test + void testQuantifiers() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a*b+c?'")); + } + + @Test + void testRepetition_Exact() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a{3}'")); + } + + @Test + void testRepetition_AtLeast() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a{3,}'")); + } + + @Test + void testRepetition_Range() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a{3,5}'")); + } + + @Test + void testNonGreedyQuantifiers() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a*?'")); + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a+?'")); + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a??'")); + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a{2,3}?'")); + } + + @Test + void testCharacterClass() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'[abc]'")); + } + + @Test + void testCharacterClassWithRange() { + 
assertDoesNotThrow(() -> EfxRegexValidator.validate("'[a-z0-9]'")); + } + + @Test + void testNegatedCharacterClass() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'[^abc]'")); + } + + @Test + void testCharacterClassWithClosingBracketAsFirst() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'[]abc]'")); + } + + @Test + void testGrouping() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'(abc)'")); + } + + @Test + void testNestedGroups() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'((a)(b))'")); + } + + @Test + void testAlternation() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a|b|c'")); + } + + @Test + void testAnchors() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'^abc$'")); + } + + @Test + void testShorthandClasses() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'\\d\\D\\w\\W\\s\\S'")); + } + + @Test + void testEscapedMetacharacters() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'\\.\\\\\\(\\)\\[\\]\\{\\}\\*\\+\\?\\|\\^\\$'")); + } + + @Test + void testEscapedQuotes() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a\\'b'")); + assertDoesNotThrow(() -> EfxRegexValidator.validate("'a\\\"b'")); + } + + @Test + void testShorthandClassesInsideCharacterClass() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'[\\d\\w]'")); + } + + @Test + void testComplexPattern() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'^[a-zA-Z]\\d{2,4}(\\.[0-9]+)?$'")); + } + + @Test + void testEmptyPattern() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("''")); + } + + @Test + void testDoubleQuotedPattern() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("\"[0-9]+\"")); + } + + @Test + void testLiteralBraceWhenNotQuantifier() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'{abc'")); + } + } + + @Nested + class DisallowedConstructs { + + @Test + void testWordBoundary() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () 
-> EfxRegexValidator.validate("'\\b'")); + assertTrue(ex.getMessage().contains("word boundary")); + } + + @Test + void testNonWordBoundary() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\B'")); + assertTrue(ex.getMessage().contains("word boundary")); + } + + @Test + void testBackreference() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'(a)\\1'")); + assertTrue(ex.getMessage().contains("backreference")); + } + + @Test + void testUnicodePropertyEscape() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\p'")); + assertTrue(ex.getMessage().contains("Unicode property")); + } + + @Test + void testNumericEscape_Hex() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\x41'")); + assertTrue(ex.getMessage().contains("numeric character")); + } + + @Test + void testNumericEscape_Unicode() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\u0041'")); + assertTrue(ex.getMessage().contains("numeric character")); + } + + @Test + void testNumericEscape_Null() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\0'")); + assertTrue(ex.getMessage().contains("numeric character")); + } + + @Test + void testLookahead() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'a(?=b)'")); + assertTrue(ex.getMessage().contains("lookahead")); + } + + @Test + void testNegativeLookahead() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'a(?!b)'")); + assertTrue(ex.getMessage().contains("negative lookahead")); + } + + @Test + void testLookbehind() { + InvalidUsageException ex = 
assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'(?<!a)b'")); + assertTrue(ex.getMessage().contains("lookbehind")); + } + + @Test + void testNonCapturingGroup() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'(?:abc)'")); + assertTrue(ex.getMessage().contains("non-capturing group")); + } + + + @Test + void testUnsupportedEscapeSequence() { + assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\a'")); + } + + @Test + void testUnclosedCharacterClass() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'[abc'")); + assertTrue(ex.getMessage().contains("unclosed character class")); + } + + @Test + void testUnclosedGroup() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'(abc'")); + assertTrue(ex.getMessage().contains("unclosed group")); + } + + @Test + void testUnmatchedClosingParenthesis() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'abc)'")); + assertTrue(ex.getMessage().contains("unmatched closing parenthesis")); + } + + @Test + void testTrailingBackslash() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'abc\\'")); + assertTrue(ex.getMessage().contains("trailing backslash")); + } + } +} From 8687c3d6f21eab76bd9dab1f7e7246db1f55b8a2 Mon Sep 17 00:00:00 2001 From: Ioannis Rosuochatzakis Date: Fri, 20 Feb 2026 13:19:41 +0100 Subject: [PATCH 2/3] TEDEFO-4912 Disallow shorthand classes; validate replace-regex literal patterns only --- .../efx/sdk2/EfxExpressionTranslatorV2.java | 3 + .../ted/efx/util/EfxRegexValidator.java | 55 +++++++++------- .../sdk2/EfxExpressionTranslatorV2Test.java | 23 +++++-- .../ted/efx/util/EfxRegexValidatorTest.java | 64 ++++++++++++++++--- 4 files changed, 108 insertions(+), 37 deletions(-) diff --git a/src/main/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2.java b/src/main/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2.java index d3b10682..335465b1 100644 
--- a/src/main/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2.java +++ b/src/main/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2.java @@ -2346,6 +2346,9 @@ public void exitReplaceFunction(ReplaceFunctionContext ctx) { @Override public void exitReplaceRegexFunction(ReplaceRegexFunctionContext ctx) { + if (ctx.pattern instanceof StringLiteralExpressionContext) { + EfxRegexValidator.validate(ctx.pattern.getText()); + } final StringExpression replacement = this.stack.pop(StringExpression.class); final StringExpression pattern = this.stack.pop(StringExpression.class); final StringExpression text = this.stack.pop(StringExpression.class); diff --git a/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java b/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java index e7e09497..fcf57c8b 100644 --- a/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java +++ b/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java @@ -22,28 +22,25 @@ * from the portable EFX regex subset. This subset is designed to work identically across all * EFX target languages (XPath, Java, JavaScript, Python, C#, Swift). * - *

Allowed constructs:

- *
    - *
  • Literal characters
  • - *
  • {@code .} (any character)
  • - *
  • {@code *}, {@code +}, {@code ?} (quantifiers, greedy or non-greedy)
  • - *
  • {@code {n}}, {@code {n,}}, {@code {n,m}} (repetition, greedy or non-greedy)
  • - *
  • {@code [...]}, {@code [^...]} (character classes with ranges)
  • - *
  • {@code (...)} (grouping), {@code |} (alternation)
  • - *
  • {@code ^}, {@code $} (anchors)
  • - *
  • {@code \d}, {@code \D}, {@code \w}, {@code \W}, {@code \s}, {@code \S} - * (shorthand character classes with ASCII semantics)
  • - *
  • Escaped metacharacters: {@code \.}, {@code \\}, {@code \(}, {@code \)}, - * {@code \[}, {@code \]}, {@code \{}, {@code \}}, {@code \*}, {@code \+}, - * {@code \?}, {@code \|}, {@code \^}, {@code \$}
  • - *
+ * Allowed constructs: * - *

Shorthand character class semantics (ASCII):

- *
    - *
  • {@code \d} = {@code [0-9]}
  • - *
  • {@code \w} = {@code [a-zA-Z0-9_]}
  • - *
  • {@code \s} = {@code [ \t\r\n\f]}
  • - *
+ * Literal characters + * {@code .} (any character) + * {@code *}, {@code +}, {@code ?} (quantifiers, greedy or non-greedy) + * {@code {n}}, {@code {n,}}, {@code {n,m}} (repetition, greedy or non-greedy) + * {@code [...]}, {@code [^...]} (character classes with ranges) + * {@code (...)} (grouping), {@code |} (alternation) + * {@code ^}, {@code $} (anchors) + * Escaped metacharacters: {@code \.} {@code \\} {@code \(} {@code \)} + * {@code \[} {@code \]} {@code \{} {@code \}} {@code \*} {@code \+} + * {@code \?} {@code \|} {@code \^} {@code \$} + * + * Disallowed constructs (not portable): + * + * {@code \d}, {@code \D}, {@code \w}, {@code \W}, {@code \s}, {@code \S} + * — shorthand character classes have inconsistent semantics across target languages + * (ASCII in XPath/JavaScript, Unicode in Python/C#/Swift). Use explicit character + * classes instead, e.g. {@code [0-9]}, {@code [a-zA-Z0-9_]}, {@code [ \t\r\n]}. */ public final class EfxRegexValidator { @@ -102,7 +99,12 @@ private static int validateEscape(String rawPattern, String content, int pos) { char next = content.charAt(pos + 1); - if (SHORTHAND_CLASSES.contains(next) || ESCAPABLE_METACHARACTERS.contains(next)) { + if (ESCAPABLE_METACHARACTERS.contains(next)) { + return pos + 1; + } + + // Portable whitespace escapes — same meaning in all target languages + if (next == 't' || next == 'r' || next == 'n' || next == 'f') { return pos + 1; } @@ -111,6 +113,12 @@ private static int validateEscape(String rawPattern, String content, int pos) { return pos + 1; } + if (SHORTHAND_CLASSES.contains(next)) { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, + "shorthand class '\\" + next + "' is not allowed in EFX regex — its semantics differ across target languages. " + + "Use an explicit character class instead (e.g. 
[0-9], [a-zA-Z0-9_], [ \\t\\r\\n])"); + } + if (next == 'b' || next == 'B') { throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, "word boundary '\\" + next + "' is not allowed in EFX regex — use '^' and '$' anchors instead"); @@ -160,7 +168,8 @@ private static int validateCharacterClass(String rawPattern, String content, int throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, i + 1, "trailing backslash in character class"); } char next = content.charAt(i + 1); - if (SHORTHAND_CLASSES.contains(next) || ESCAPABLE_METACHARACTERS.contains(next) + if (ESCAPABLE_METACHARACTERS.contains(next) + || next == 't' || next == 'r' || next == 'n' || next == 'f' || next == '\'' || next == '"' || next == '-') { i += 2; continue; diff --git a/src/test/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2Test.java b/src/test/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2Test.java index 35976fab..35b168d1 100644 --- a/src/test/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2Test.java +++ b/src/test/java/eu/europa/ted/efx/sdk2/EfxExpressionTranslatorV2Test.java @@ -137,8 +137,8 @@ void testLikePatternCondition() { @Test void testLikePatternCondition_WithEscapedDot() { testExpressionTranslationWithContext( - "fn:matches(normalize-space('12.3'), '\\d+\\.\\d+')", - "BT-00-Text", "'12.3' like '\\d+\\.\\d+'"); + "fn:matches(normalize-space('12.3'), '[0-9]+\\.[0-9]+')", + "BT-00-Text", "'12.3' like '[0-9]+\\.[0-9]+'"); } @Test @@ -1951,8 +1951,23 @@ void testReplaceRegexFunction() { @Test void testReplaceRegexFunction_WithFieldReference() { testExpressionTranslation( - "replace(PathNode/TextField/normalize-space(text()), '\\s+', ' ')", - "{ND-Root} ${replace-regex(BT-00-Text, '\\s+', ' ')}"); + "replace(PathNode/TextField/normalize-space(text()), '[ \\t]+', ' ')", + "{ND-Root} ${replace-regex(BT-00-Text, '[ \\t]+', ' ')}"); + } + + @Test + void testReplaceRegexFunction_WithShorthandPattern_ThrowsError() { + 
assertThrows(InvalidUsageException.class, () -> + testExpressionTranslationWithContext( + "", "ND-Root", "replace-regex('hello', '\\w+', 'x')")); + } + + @Test + void testReplaceRegexFunction_WithDynamicPattern_DoesNotThrow() { + // Pattern is a field reference (non-literal) — static regex validation is skipped + testExpressionTranslation( + "replace('hello', PathNode/TextField/normalize-space(text()), 'x')", + "{ND-Root} ${replace-regex('hello', BT-00-Text, 'x')}"); } @Test diff --git a/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java b/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java index beeacbfc..e8c8aeaa 100644 --- a/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java +++ b/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java @@ -106,13 +106,13 @@ void testAnchors() { } @Test - void testShorthandClasses() { - assertDoesNotThrow(() -> EfxRegexValidator.validate("'\\d\\D\\w\\W\\s\\S'")); + void testEscapedMetacharacters() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'\\.\\\\\\(\\)\\[\\]\\{\\}\\*\\+\\?\\|\\^\\$'")); } @Test - void testEscapedMetacharacters() { - assertDoesNotThrow(() -> EfxRegexValidator.validate("'\\.\\\\\\(\\)\\[\\]\\{\\}\\*\\+\\?\\|\\^\\$'")); + void testWhitespaceEscapes() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'[ \\t\\r\\n\\f]+'")); } @Test @@ -121,14 +121,9 @@ void testEscapedQuotes() { assertDoesNotThrow(() -> EfxRegexValidator.validate("'a\\\"b'")); } - @Test - void testShorthandClassesInsideCharacterClass() { - assertDoesNotThrow(() -> EfxRegexValidator.validate("'[\\d\\w]'")); - } - @Test void testComplexPattern() { - assertDoesNotThrow(() -> EfxRegexValidator.validate("'^[a-zA-Z]\\d{2,4}(\\.[0-9]+)?$'")); + assertDoesNotThrow(() -> EfxRegexValidator.validate("'^[a-zA-Z][0-9]{2,4}(\\.[0-9]+)?$'")); } @Test @@ -150,6 +145,55 @@ void testLiteralBraceWhenNotQuantifier() { @Nested class DisallowedConstructs { + @Test + void testShorthandClass_d() { + 
InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\d'")); + assertTrue(ex.getMessage().contains("shorthand class")); + } + + @Test + void testShorthandClass_w() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\w'")); + assertTrue(ex.getMessage().contains("shorthand class")); + } + + @Test + void testShorthandClass_s() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\s'")); + assertTrue(ex.getMessage().contains("shorthand class")); + } + + @Test + void testShorthandClass_D() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\D'")); + assertTrue(ex.getMessage().contains("shorthand class")); + } + + @Test + void testShorthandClass_W() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\W'")); + assertTrue(ex.getMessage().contains("shorthand class")); + } + + @Test + void testShorthandClass_S() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\S'")); + assertTrue(ex.getMessage().contains("shorthand class")); + } + + @Test + void testShorthandClassInsideCharacterClass() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'[\\d\\w]'")); + assertTrue(ex.getMessage().contains("shorthand class")); + } + @Test void testWordBoundary() { InvalidUsageException ex = assertThrows(InvalidUsageException.class, From 005f97effb4c28d5131a41b387833847c0c16563 Mon Sep 17 00:00:00 2001 From: Ioannis Rosuochatzakis Date: Fri, 20 Feb 2026 14:26:06 +0100 Subject: [PATCH 3/3] TEDEFO-4912 Allow Unicode property escapes; validate replace-regex literal patterns only --- .../ted/efx/util/EfxRegexValidator.java | 23 +++++++++---- .../ted/efx/util/EfxRegexValidatorTest.java | 32 
+++++++++++++++++-- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java b/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java index fcf57c8b..f5878f79 100644 --- a/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java +++ b/src/main/java/eu/europa/ted/efx/util/EfxRegexValidator.java @@ -34,6 +34,10 @@ * Escaped metacharacters: {@code \.} {@code \\} {@code \(} {@code \)} * {@code \[} {@code \]} {@code \{} {@code \}} {@code \*} {@code \+} * {@code \?} {@code \|} {@code \^} {@code \$} + * {@code \p{Category}}, {@code \P{Category}} — Unicode property escapes, + * e.g. {@code \p{L}} (any Unicode letter). All EFX target languages support these + * (JavaScript requires the {@code u} flag; Python requires the {@code regex} module + * instead of {@code re} — both are handled transparently by the EFX translator). * * Disallowed constructs (not portable): * @@ -62,7 +66,6 @@ public static void validate(String rawPattern) { return; } - char delimiter = rawPattern.charAt(0); String content = rawPattern.substring(1, rawPattern.length() - 1); int groupDepth = 0; @@ -130,8 +133,17 @@ private static int validateEscape(String rawPattern, String content, int pos) { } if (next == 'p' || next == 'P') { - throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, - "Unicode property escape '\\" + next + "{...}' is not allowed in EFX regex — use character classes like '[a-z]' instead"); + // Unicode property escape \p{Category} or \P{Category} — allowed + if (pos + 2 >= content.length() || content.charAt(pos + 2) != '{') { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, + "Unicode property escape '\\" + next + "' must be followed by '{Category}' (e.g. 
\\p{L})"); + } + int closeIdx = content.indexOf('}', pos + 3); + if (closeIdx < 0) { + throw InvalidUsageException.unsupportedRegexConstruct(rawPattern, pos + 1, + "Unicode property escape '\\" + next + "{...}' is missing closing '}'"); + } + return closeIdx; } if (next == '0' || next == 'x' || next == 'u') { @@ -174,9 +186,8 @@ private static int validateCharacterClass(String rawPattern, String content, int i += 2; continue; } - // Reject the same unsupported escapes as outside a character class - validateEscape(rawPattern, content, i); - i += 2; + // Validate using the same rules as outside a character class + i = validateEscape(rawPattern, content, i) + 1; continue; } diff --git a/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java b/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java index e8c8aeaa..1270499f 100644 --- a/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java +++ b/src/test/java/eu/europa/ted/efx/util/EfxRegexValidatorTest.java @@ -115,6 +115,13 @@ void testWhitespaceEscapes() { assertDoesNotThrow(() -> EfxRegexValidator.validate("'[ \\t\\r\\n\\f]+'")); } + @Test + void testUnicodePropertyEscape() { + assertDoesNotThrow(() -> EfxRegexValidator.validate("'\\p{L}+'")); + assertDoesNotThrow(() -> EfxRegexValidator.validate("'\\P{Z}+'")); + assertDoesNotThrow(() -> EfxRegexValidator.validate("'[\\p{L}\\p{N}]+'")); + } + @Test void testEscapedQuotes() { assertDoesNotThrow(() -> EfxRegexValidator.validate("'a\\'b'")); @@ -216,10 +223,31 @@ void testBackreference() { } @Test - void testUnicodePropertyEscape() { + void testUnicodePropertyEscape_MissingBraces() { InvalidUsageException ex = assertThrows(InvalidUsageException.class, () -> EfxRegexValidator.validate("'\\p'")); - assertTrue(ex.getMessage().contains("Unicode property")); + assertTrue(ex.getMessage().contains("\\p")); + } + + @Test + void testUnicodePropertyEscape_UpperCase_MissingBraces() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, 
+ () -> EfxRegexValidator.validate("'\\P'")); + assertTrue(ex.getMessage().contains("\\P")); + } + + @Test + void testUnicodePropertyEscape_MissingClosingBrace() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\p{L'")); + assertTrue(ex.getMessage().contains("\\p")); + } + + @Test + void testUnicodePropertyEscape_UpperCase_MissingClosingBrace() { + InvalidUsageException ex = assertThrows(InvalidUsageException.class, + () -> EfxRegexValidator.validate("'\\P{Z'")); + assertTrue(ex.getMessage().contains("\\P")); } @Test