Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2953,8 +2953,13 @@ RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or
/// consumed. true is only valid when used as part of a search to determine where to try a full match, not as part of
/// actual matching logic.
/// </param>
/// <param name="unwrapCaptures">
/// Defaults to false. When true, Capture and Atomic nodes are transparently unwrapped so the string inside a
/// capture group or atomic group can be extracted. This must only be set to true for prefix analysis, not for the
/// compiler/source generator, as the compiler must not skip Capture nodes (they have side effects that need to be emitted).
/// </param>
/// <returns>true if a sequence was found; otherwise, false.</returns>
public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString, bool consumeZeroWidthNodes = false)
public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString, bool consumeZeroWidthNodes = false, bool unwrapCaptures = false)
{
Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}");

Expand All @@ -2969,6 +2974,19 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
{
RegexNode child = Child(i);

// When used for prefix analysis (unwrapCaptures is true), unwrap capture
// groups and atomic groups so their contents can be examined. Capture unwrapping
// must not be done when used by the compiler/source generator, as it would cause
// capture side effects to be skipped. Atomic groups only affect backtracking, not
// what text is matched, so they are safe to unwrap for prefix analysis as well.
if (unwrapCaptures)
{
while (child.Kind is RegexNodeKind.Capture or RegexNodeKind.Atomic)
{
child = child.Child(0);
}
}

if (child.Kind is RegexNodeKind.One)
{
// We only want to include ASCII characters, and only if they don't participate in case conversion
Expand Down Expand Up @@ -3006,6 +3024,26 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil

vsb.Append((char)(twoChars[0] | 0x20), child.Kind is RegexNodeKind.Set ? 1 : child.M);
}
else if (child.Kind is RegexNodeKind.Concatenate)
{
// This can occur after unwrapping a Capture or Atomic whose child is a Concatenate.
// Recurse to extract any case-insensitive string from the inner concatenation.
if (!StackHelper.TryEnsureSufficientExecutionStack() ||
!child.TryGetOrdinalCaseInsensitiveString(0, child.ChildCount(), out int innerNodesConsumed, out string? innerStr, consumeZeroWidthNodes, unwrapCaptures))
{
break;
}

vsb.Append(innerStr);

// If the inner concatenation wasn't fully consumed, we can't continue past it
// as subsequent siblings aren't guaranteed to immediately follow the extracted prefix.
if (innerNodesConsumed < child.ChildCount())
{
i++;
break;
}
}
else if (child.Kind is RegexNodeKind.Empty)
{
// Skip over empty nodes, as they're pure nops. They would ideally have been optimized away,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
continue;

case RegexNodeKind.Concatenate:
node.TryGetOrdinalCaseInsensitiveString(0, node.ChildCount(), out _, out string? caseInsensitiveString, consumeZeroWidthNodes: true);
node.TryGetOrdinalCaseInsensitiveString(0, node.ChildCount(), out _, out string? caseInsensitiveString, consumeZeroWidthNodes: true, unwrapCaptures: true);
return caseInsensitiveString;

default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,53 @@ public void TrailingAnchor(string pattern, int options, int expectedMode, int ex
[InlineData(@"(?<=cd)ab", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft, "ab")]
[InlineData(@"\bab(?=\w)(?!=\d)c\b", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight, "abc")]
[InlineData(@"\bab(?=\w)(?!=\d)c\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
// Capture groups should be transparent to ordinal case-insensitive prefix extraction
[InlineData(@"(abc)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
[InlineData(@"\b(in)\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "in")]
[InlineData(@"\b(from).+(to)\b", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "from")]
// Partial capture: inner Concatenate not fully consumed, followed by non-letter One('(')
[InlineData(@"(abcde|abcfg)\(", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
// Partial capture: inner Concatenate not fully consumed, followed by letter Set([Ee])
[InlineData(@"(abc|abd)e", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")]
// Adjacent captures: both fully consumed via inner Concatenate recursion, extraction continues across capture boundaries
[InlineData(@"(ab)(cd)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")]
// Non-capture content before capture: tests Set processing then Capture unwrap in same Concatenate iteration
[InlineData(@"ab(cd)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")]
// Single-char capture unwraps to Set (not Concatenate), exercises direct Set handling after Capture unwrap
[InlineData(@"a(b)c", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abc")]
// Empty capture unwraps to Empty node, which is skipped; extraction continues with subsequent content
[InlineData(@"()ab", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "ab")]
// Atomic groups inside a Concatenate are unwrapped like Capture (atomicity only affects backtracking, not what's matched)
[InlineData(@"ab(?>cd)ef", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcdef")]
// Capture wrapping Atomic (and vice versa): while loop peels multiple wrapper layers
[InlineData(@"a((?>bc))d", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")]
[InlineData(@"a(?>(bc))d", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abcd")]
// Capture containing fixed-count repeater: Setloop with M==N is extractable
[InlineData(@"(ab{3}c)", (int)RegexOptions.IgnoreCase, (int)FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight, "abbbc")]
public void LeadingPrefix(string pattern, int options, int expectedMode, string expectedPrefix)
{
    // The theory data passes enum values as ints; convert them back to their enum types up front.
    var regexOptions = (RegexOptions)options;
    var expectedFindMode = (FindNextStartingPositionMode)expectedMode;

    RegexFindOptimizations optimizations = ComputeOptimizations(pattern, regexOptions);

    // Check the selected find mode first, then the prefix reported for the pattern.
    Assert.Equal(expectedFindMode, optimizations.FindMode);
    Assert.Equal(expectedPrefix, optimizations.LeadingPrefix);
}

[Fact]
[OuterLoop("Stress test for deep nesting")]
public void LeadingPrefix_DeepCaptureNesting_DoesNotStackOverflow()
{
    // Build a pattern of the form (((((...ab...))))) so that, with IgnoreCase, the recursive
    // Capture-unwrapping path in TryGetOrdinalCaseInsensitiveString is exercised at great depth.
    // The point of the test is that computing the optimizations must not overflow the stack.
    const int NestingDepth = 2000;
    string openingParens = new string('(', NestingDepth);
    string closingParens = new string(')', NestingDepth);
    string pattern = openingParens + "ab" + closingParens;

    RegexFindOptimizations optimizations = ComputeOptimizations(pattern, RegexOptions.IgnoreCase);

    // Whether a prefix is extracted at all may depend on stack limits, so a different
    // find mode is acceptable; there is nothing further to verify in that case.
    if (optimizations.FindMode != FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight)
    {
        return;
    }

    // When extraction did succeed, the prefix must be exactly the innermost literal.
    Assert.Equal("ab", optimizations.LeadingPrefix);
}

[Theory]
[InlineData(@"[ab]", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "ab")]
[InlineData(@"[Aa]", 0, (int)FindNextStartingPositionMode.LeadingSet_LeftToRight, "Aa")]
Expand Down
Loading