diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 78d529b7baee72..2f98dcf982cf8f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2953,8 +2953,13 @@ RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or /// consumed. true is only valid when used as part of a search to determine where to try a full match, not as part of /// actual matching logic. /// + /// + /// Defaults to false. When true, Capture and Atomic nodes are transparently unwrapped so the string inside such a group + /// can be extracted. This must only be set to true for prefix analysis, not for the compiler/source generator, + /// as the compiler must not skip Capture nodes (they have side effects that need to be emitted). + /// /// true if a sequence was found; otherwise, false. - public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString, bool consumeZeroWidthNodes = false) + public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString, bool consumeZeroWidthNodes = false, bool unwrapCaptures = false) { Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}"); @@ -2969,6 +2974,19 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil { RegexNode child = Child(i); + // When used for prefix analysis (unwrapCaptures is true), unwrap capture + // groups and atomic groups so their contents can be examined. 
Capture unwrapping + // must not be done when used by the compiler/source generator, as it would cause + // capture side effects to be skipped. Atomic groups only affect backtracking, not + // what text is matched, so they are safe to unwrap for prefix analysis as well. + if (unwrapCaptures) + { + while (child.Kind is RegexNodeKind.Capture or RegexNodeKind.Atomic) + { + child = child.Child(0); + } + } + if (child.Kind is RegexNodeKind.One) { // We only want to include ASCII characters, and only if they don't participate in case conversion @@ -3006,6 +3024,26 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil vsb.Append((char)(twoChars[0] | 0x20), child.Kind is RegexNodeKind.Set ? 1 : child.M); } + else if (child.Kind is RegexNodeKind.Concatenate) + { + // This can occur after unwrapping a Capture or Atomic whose child is a Concatenate. + // Recurse to extract any case-insensitive string from the inner concatenation. + if (!StackHelper.TryEnsureSufficientExecutionStack() || + !child.TryGetOrdinalCaseInsensitiveString(0, child.ChildCount(), out int innerNodesConsumed, out string? innerStr, consumeZeroWidthNodes, unwrapCaptures)) + { + break; + } + + vsb.Append(innerStr); + + // If the inner concatenation wasn't fully consumed, we can't continue past it + // as subsequent siblings aren't guaranteed to immediately follow the extracted prefix. + if (innerNodesConsumed < child.ChildCount()) + { + i++; + break; + } + } else if (child.Kind is RegexNodeKind.Empty) { // Skip over empty nodes, as they're pure nops. 
They would ideally have been optimized away, diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 136929aa18eef8..4a3747455c5846 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -495,7 +495,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) continue; case RegexNodeKind.Concatenate: - node.TryGetOrdinalCaseInsensitiveString(0, node.ChildCount(), out _, out string? caseInsensitiveString, consumeZeroWidthNodes: true); + node.TryGetOrdinalCaseInsensitiveString(0, node.ChildCount(), out _, out string? caseInsensitiveString, consumeZeroWidthNodes: true, unwrapCaptures: true); return caseInsensitiveString; default: diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs index ca75e1abd4404f..b08a8bbe46aa5e 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs @@ -120,6 +120,29 @@ public void TrailingAnchor(string pattern, int options, int expectedMode, int ex [InlineData(@"(?<=cd)ab", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")] [InlineData(@"\bab(?=\w)(?!=\d)c\b", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")] [InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + // Capture groups should be transparent to ordinal case-insensitive prefix 
extraction + [InlineData(@"(abc)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + [InlineData(@"\b(in)\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "in")] + [InlineData(@"\b(from).+(to)\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "from")] + // Partial capture: inner Concatenate not fully consumed, followed by non-letter One('(') + [InlineData(@"(abcde|abcfg)\(", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + // Partial capture: inner Concatenate not fully consumed, followed by letter Set([Ee]) + [InlineData(@"(abc|abd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] + // Adjacent captures: both fully consumed via inner Concatenate recursion, extraction continues across capture boundaries + [InlineData(@"(ab)(cd)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")] + // Non-capture content before capture: tests Set processing then Capture unwrap in same Concatenate iteration + [InlineData(@"ab(cd)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")] + // Single-char capture unwraps to Set (not Concatenate), exercises direct Set handling after Capture unwrap + [InlineData(@"a(b)c", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")] + // Empty capture unwraps to Empty node, which is skipped; extraction continues with subsequent content + [InlineData(@"()ab", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")] + // Atomic groups inside a Concatenate are unwrapped 
like Capture (atomicity only affects backtracking, not what's matched) + [InlineData(@"ab(?>cd)ef", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcdef")] + // Capture wrapping Atomic (and vice versa): while loop peels multiple wrapper layers + [InlineData(@"a((?>bc))d", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")] + [InlineData(@"a(?>(bc))d", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")] + // Capture containing fixed-count repeater: Setloop with M==N is extractable + [InlineData(@"(ab{3}c)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abbbc")] public void LeadingPrefix(string pattern, int options, int expectedMode, string expectedPrefix) { RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options); @@ -127,6 +150,23 @@ public void LeadingPrefix(string pattern, int options, int expectedMode, string Assert.Equal(expectedPrefix, opts.LeadingPrefix); } + [Fact] + [OuterLoop("Stress test for deep nesting")] + public void LeadingPrefix_DeepCaptureNesting_DoesNotStackOverflow() + { + // Deeply nested captures like (((((...))))) with IgnoreCase exercise the recursive + // Capture-unwrapping path in TryGetOrdinalCaseInsensitiveString. Verify it doesn't SO. + const int Depth = 2000; + string pattern = new string('(', Depth) + "ab" + new string(')', Depth); + RegexFindOptimizations opts = ComputeOptimizations(pattern, RegexOptions.IgnoreCase); + // The prefix may or may not be extracted depending on stack limits, but it must not crash. + // If extraction succeeds, it should find "ab". 
+ if (opts.FindMode == FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight) + { + Assert.Equal("ab", opts.LeadingPrefix); + } + } + [Theory] [InlineData(@"[ab]", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "ab")] [InlineData(@"[Aa]", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "Aa")]