diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index 78d529b7baee72..2f98dcf982cf8f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -2953,8 +2953,13 @@ RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or
/// consumed. true is only valid when used as part of a search to determine where to try a full match, not as part of
/// actual matching logic.
///
+ ///
+ /// Defaults to false. When true, Capture and Atomic nodes are transparently unwrapped so the string inside such a group
+ /// can be extracted. This must only be set to true for prefix analysis, not for the compiler/source generator,
+ /// as the compiler must not skip Capture nodes (they have side effects that need to be emitted).
+ ///
/// true if a sequence was found; otherwise, false.
- public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString, bool consumeZeroWidthNodes = false)
+ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString, bool consumeZeroWidthNodes = false, bool unwrapCaptures = false)
{
Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}");
@@ -2969,6 +2974,19 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
{
RegexNode child = Child(i);
+ // When used for prefix analysis (unwrapCaptures is true), unwrap capture
+ // groups and atomic groups so their contents can be examined. Capture unwrapping
+ // must not be done when used by the compiler/source generator, as it would cause
+ // capture side effects to be skipped. Atomic groups only affect backtracking, not
+ // what text is matched, so they are safe to unwrap for prefix analysis as well.
+ if (unwrapCaptures)
+ {
+ while (child.Kind is RegexNodeKind.Capture or RegexNodeKind.Atomic)
+ {
+ child = child.Child(0);
+ }
+ }
+
if (child.Kind is RegexNodeKind.One)
{
// We only want to include ASCII characters, and only if they don't participate in case conversion
@@ -3006,6 +3024,26 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
vsb.Append((char)(twoChars[0] | 0x20), child.Kind is RegexNodeKind.Set ? 1 : child.M);
}
+ else if (child.Kind is RegexNodeKind.Concatenate)
+ {
+ // This can occur after unwrapping a Capture or Atomic node whose child is a Concatenate.
+ // Recurse to extract any case-insensitive string from the inner concatenation.
+ if (!StackHelper.TryEnsureSufficientExecutionStack() ||
+ !child.TryGetOrdinalCaseInsensitiveString(0, child.ChildCount(), out int innerNodesConsumed, out string? innerStr, consumeZeroWidthNodes, unwrapCaptures))
+ {
+ break;
+ }
+
+ vsb.Append(innerStr);
+
+ // If the inner concatenation wasn't fully consumed, we can't continue past it
+ // as subsequent siblings aren't guaranteed to immediately follow the extracted prefix.
+ if (innerNodesConsumed < child.ChildCount())
+ {
+ i++;
+ break;
+ }
+ }
else if (child.Kind is RegexNodeKind.Empty)
{
// Skip over empty nodes, as they're pure nops. They would ideally have been optimized away,
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
index 136929aa18eef8..4a3747455c5846 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -495,7 +495,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
continue;
case RegexNodeKind.Concatenate:
- node.TryGetOrdinalCaseInsensitiveString(0, node.ChildCount(), out _, out string? caseInsensitiveString, consumeZeroWidthNodes: true);
+ node.TryGetOrdinalCaseInsensitiveString(0, node.ChildCount(), out _, out string? caseInsensitiveString, consumeZeroWidthNodes: true, unwrapCaptures: true);
return caseInsensitiveString;
default:
diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
index ca75e1abd4404f..b08a8bbe46aa5e 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs
@@ -120,6 +120,29 @@ public void TrailingAnchor(string pattern, int options, int expectedMode, int ex
[InlineData(@"(?<=cd)ab", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
[InlineData(@"\bab(?=\w)(?!=\d)c\b", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")]
[InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
+ // Capture groups should be transparent to ordinal case-insensitive prefix extraction
+ [InlineData(@"(abc)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
+ [InlineData(@"\b(in)\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "in")]
+ [InlineData(@"\b(from).+(to)\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "from")]
+ // Partial capture: inner Concatenate not fully consumed, followed by non-letter One('(')
+ [InlineData(@"(abcde|abcfg)\(", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
+ // Partial capture: inner Concatenate not fully consumed, followed by letter Set([Ee])
+ [InlineData(@"(abc|abd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")]
+ // Adjacent captures: both fully consumed via inner Concatenate recursion, extraction continues across capture boundaries
+ [InlineData(@"(ab)(cd)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")]
+ // Non-capture content before capture: tests Set processing then Capture unwrap in same Concatenate iteration
+ [InlineData(@"ab(cd)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")]
+ // Single-char capture unwraps to Set (not Concatenate), exercises direct Set handling after Capture unwrap
+ [InlineData(@"a(b)c", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
+ // Empty capture unwraps to Empty node, which is skipped; extraction continues with subsequent content
+ [InlineData(@"()ab", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")]
+ // Atomic groups inside a Concatenate are unwrapped like Capture (atomicity only affects backtracking, not what's matched)
+ [InlineData(@"ab(?>cd)ef", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcdef")]
+ // Capture wrapping Atomic (and vice versa): while loop peels multiple wrapper layers
+ [InlineData(@"a((?>bc))d", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")]
+ [InlineData(@"a(?>(bc))d", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")]
+ // Capture containing fixed-count repeater: Setloop with M==N is extractable
+ [InlineData(@"(ab{3}c)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abbbc")]
public void LeadingPrefix(string pattern, int options, int expectedMode, string expectedPrefix)
{
RegexFindOptimizations opts = ComputeOptimizations(pattern, (RegexOptions)options);
@@ -127,6 +150,23 @@ public void LeadingPrefix(string pattern, int options, int expectedMode, string
Assert.Equal(expectedPrefix, opts.LeadingPrefix);
}
+ [Fact]
+ [OuterLoop("Stress test for deep nesting")]
+ public void LeadingPrefix_DeepCaptureNesting_DoesNotStackOverflow()
+ {
+ // Deeply nested captures like (((((...))))) with IgnoreCase are meant to stress the capture handling done
+ // during ordinal case-insensitive prefix analysis. Verify that computing the optimizations doesn't stack overflow.
+ const int Depth = 2000;
+ string pattern = new string('(', Depth) + "ab" + new string(')', Depth);
+ RegexFindOptimizations opts = ComputeOptimizations(pattern, RegexOptions.IgnoreCase);
+ // The prefix may or may not be extracted depending on stack limits, but it must not crash.
+ // If extraction succeeds, it should find "ab".
+ if (opts.FindMode == FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight)
+ {
+ Assert.Equal("ab", opts.LeadingPrefix);
+ }
+ }
+
[Theory]
[InlineData(@"[ab]", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "ab")]
[InlineData(@"[Aa]", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "Aa")]