From c28883130fbb7554015d3eeb150b3f7f9a4899eb Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 16 May 2026 10:29:24 +0000
Subject: [PATCH 1/6] Add pathological-regex discovery tests across all 16
 ports

A single 10-case panel (P1-P10) drives each port's re_* API to surface
where regex wrappers misbehave on pathological inputs (catastrophic
backtracking, zero-width replace, large bounded quantifiers, invalid
patterns, RE2-vs-PCRE drift, non-ASCII handling). Each test catches
errors per case so one failure doesn't mask the others.

See REGEX_PATHOLOGICAL.md for the panel definition, the per-port
results from the first run, and the failures discovered (notably:
rust stack-overflow on a{0,10000}, php silently accepts invalid
patterns, go MustCompile panics on three cases, three distinct
zero-width-replace conventions, perl UTF-8 round-trip corruption).
---
 REGEX_PATHOLOGICAL.md                         | 116 ++++++++++++++++
 c/tests/regex_pathological.c                  | 127 ++++++++++++++++++
 cpp/tests/regex_pathological.cpp              |  84 ++++++++++++
 csharp/tests/RegexPathologicalTest.cs         |  54 ++++++++
 go/regex_pathological_test.go                 |  54 ++++++++
 java/src/test/RegexPathologicalTest.java      |  46 +++++++
 javascript/test/regex_pathological.test.js    |  43 ++++++
 .../voxgig/struct/RegexPathologicalTest.kt    |  41 ++++++
 lua/test/regex_pathological.lua               |  62 +++++++++
 perl/t/regex_pathological.t                   |  51 +++++++
 php/tests/RegexPathologicalTest.php           |  45 +++++++
 python/tests/test_regex_pathological.py       |  51 +++++++
 ruby/test_regex_pathological.rb               |  37 +++++
 rust/tests/regex_pathological.rs              |  64 +++++++++
 .../RegexPathologicalTests.swift              |  39 ++++++
 .../dist-test/regex_pathological.test.js      |  41 ++++++
 .../dist-test/regex_pathological.test.js.map  |   1 +
 typescript/test/regex_pathological.test.ts    |  45 +++++++
 zig/test/regex_pathological.zig               |  94 +++++++++++++
 19 files changed, 1095 insertions(+)
 create mode 100644 REGEX_PATHOLOGICAL.md
 create mode 100644 c/tests/regex_pathological.c
 create mode 100644 cpp/tests/regex_pathological.cpp
 create mode 100644 csharp/tests/RegexPathologicalTest.cs
 create mode 100644 go/regex_pathological_test.go
 create mode 100644 java/src/test/RegexPathologicalTest.java
 create mode 100644 javascript/test/regex_pathological.test.js
 create mode 100644 kotlin/src/test/kotlin/voxgig/struct/RegexPathologicalTest.kt
 create mode 100644 lua/test/regex_pathological.lua
 create mode 100644 perl/t/regex_pathological.t
 create mode 100644 php/tests/RegexPathologicalTest.php
 create mode 100644 python/tests/test_regex_pathological.py
 create mode 100644 ruby/test_regex_pathological.rb
 create mode 100644 rust/tests/regex_pathological.rs
 create mode 100644 swift/Tests/VoxgigStructTests/RegexPathologicalTests.swift
 create mode 100644 typescript/dist-test/regex_pathological.test.js
 create mode 100644 typescript/dist-test/regex_pathological.test.js.map
 create mode 100644 typescript/test/regex_pathological.test.ts
 create mode 100644 zig/test/regex_pathological.zig

diff --git a/REGEX_PATHOLOGICAL.md b/REGEX_PATHOLOGICAL.md
new file mode 100644
index 0000000..5a8ed95
--- /dev/null
+++ b/REGEX_PATHOLOGICAL.md
@@ -0,0 +1,116 @@
+# Pathological Regex — Cross-Port Discovery
+
+> First-pass discovery test. Goal is to surface where each port's regex
+> wrapper misbehaves on pathological inputs. **Not for assertion** —
+> behaviour differs across engines and the test files do not enforce a
+> specific outcome. Fixes come later; this run is to find them.
+
+The same 10-case panel runs in every port via the port's `re_*` API
+(see `REGEX_API.md`). Each port has a `regex_pathological*` test file
+under its own tests directory.
+
+## The panel
+
+| # | Name | Call | What it stresses |
+|---|---|---|---|
+| P1  | `redos_nested_plus`         | `re_test("^(a+)+$", "a"*22 + "!")` | Catastrophic backtracking via nested quantifier |
+| P2  | `redos_alt_overlap`         | `re_test("^(a\|aa)+$", "a"*22 + "!")` | Catastrophic backtracking via overlapping alternation |
+| P3  | `empty_repeat_replace`      | `re_replace("a*", "abc", "X")` | Zero-width-match convention in `replace_all` |
+| P4  | `unicode_replace_dot`       | `re_replace("\\.", "café.au.lait", "/")` | UTF-8 char-boundary handling |
+| P5  | `unicode_find_codepoint`    | `re_find("é", "café au lait")` | Non-ASCII patterns |
+| P6  | `deep_nesting_compile`      | `re_test("(((…40…(a)…)))","a")` | Parser/compiler stack |
+| P7  | `big_bounded_quantifier`    | `re_test("^a{0,10000}b$", "a"*10+"b")` | Large bounded quantifier |
+| P8  | `invalid_pattern`           | `re_compile("[abc")` | Error reporting |
+| P9  | `backref_re2_forbidden`     | `re_test("^(a+)\\1$", "aaaa")` | RE2 strictness on backrefs |
+| P10 | `find_all_zero_width`       | `re_find_all("a*", "bbb")` | Zero-width `find_all` enumeration |
+
+## Findings (first run)
+
+Times in ms — wall-clock per case.
+
+| Port       | P1 (ms) | P2  | P3 result    | P4 result        | P7  | P8                | P9 result |
+|------------|--------:|----:|--------------|------------------|----:|-------------------|-----------|
+| typescript |  169    | 3   | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
+| javascript |  172    | 3   | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
+| python     |  185    | 4   | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
+| ruby       |    0.04 | 0.03| `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
+| php        |    2    | 0.3 | `"XXbXcX"`   | `café/au/lait`   | OK  | **OK (silent!)**  | matches   |
+| perl       |    0.06 | 0.04| `"XXbXcX"`   | **`cafÃ©/au/lait`** | OK  | ERR (clean)    | matches   |
+| go         |    0.05 | 0.02| **`"XbXcX"`** | `café/au/lait`  | **PANIC** | **PANIC**   | **PANIC** |
+| rust       |    0.04 | 0.02| `"XXbXcX"`   | `café/au/lait`   | **STACK-OVERFLOW** | — (binary aborted) | — |
+| java       |   16    | 0.2 | `"XXbXcX"`   | `caf?/au/lait`†  | OK  | ERR (clean)       | matches   |
+| cpp        | **1349**| 28  | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
+| c          |    0.01 | 0.01| **`"XaXbXcX"`** | `café/au/lait`| OK  | ERR (NULL return) | non-match |
+| lua        |    0.10 | 0.13| **`"XaXbXcX"`** | `café/au/lait`| OK  | ERR (clean)       | non-match |
+| csharp     |  359    | 7   | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
+| kotlin     |   30    | 0.2 | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
+| swift      | n/r     |     |              |                  |     |                   |           |
+| zig        | n/r     |     |              |                  |     |                   |           |
+
+n/r = not run (toolchain unavailable in this environment).
+† Java prints `?` because stdout's default encoding is platform-dependent, not because of the regex; the JVM-internal string is correctly `café`.
+
+## Failures discovered
+
+1. **rust — `re_test("^a{0,10000}b$", …)` overflows the matcher's stack.**
+   The in-tree Thompson engine appears to allocate per-repeat state on the
+   call stack. The whole test binary aborts (SIGABRT). `panic::catch_unwind`
+   cannot recover from stack-exhaustion. P8/P9/P10 are never reached.
+
+2. **php — `re_compile("[abc")` returns a valid-looking delimited pattern
+   and `re_test` returns `false` silently.** `php/src/Struct.php:565` does
+   `'/' . str_replace('/', '\\/', $pattern) . '/'` without ever compiling;
+   `re_test` uses `@preg_match` which suppresses warnings. Callers can't
+   tell a bad pattern from a no-match.
+
+3. **go — `ReCompile`/`ReTest`/`ReReplace` all use `regexp.MustCompile`
+   and panic** on (a) bounded quantifiers > 1000 (`a{0,10000}` — RE2 limit),
+   (b) invalid patterns, and (c) backrefs (not supported by RE2). The Go
+   API has no shape that lets a caller catch a compile error.
+
+4. **Three distinct P3 conventions for zero-width `replace_all`:**
+   - JS/TS/Python/Ruby/PHP/Perl/Java/.NET/Kotlin/Rust/C++ → `"XXbXcX"`
+   - Go → `"XbXcX"`
+   - C / Lua (in-tree engines) → `"XaXbXcX"` (the matching `a` is replaced AND a zero-width insertion is emitted)
+   Internal call sites that depend on the exact shape will diverge.
+
+5. **C and Lua return `false` for the backref pattern P9** rather than
+   erroring; the parser silently consumes `\1` as something other than a
+   backref. RE2 ports (Go) reject it; the PCRE/ECMA ports match (Rust, Swift and Zig never reached P9).
+
+6. **C++ libstdc++ regex shows catastrophic backtracking** — 1349 ms on
+   `(a+)+` over 22 a's. C# (.NET) and Python next worst (~350 ms / ~185 ms).
+   Go / Rust / Ruby / Perl / C / Lua are all sub-millisecond (no catastrophic backtracking on this input).
+
+7. **perl — UTF-8 round-tripping in `re_replace` corrupts output.** P4
+   returned `cafÃ©/au/lait`. Either the regex returns octets or the JSON
+   encoder treats characters-as-bytes — encoding boundary is wrong in the
+   port.
+
+8. **C public header omits `re_find_all`** — surface gap vs the rest of
+   the ports.
+
+9. **Zig public surface omits `re_find`, `re_find_all`, `re_replace`** —
+   only `re_compile`, `re_test`, `re_escape` are exported. Half the
+   `REGEX_API.md` contract is unimplemented.
+
+## Where the test files live
+
+| Port       | Path |
+|------------|------|
+| typescript | `typescript/test/regex_pathological.test.ts` |
+| javascript | `javascript/test/regex_pathological.test.js` |
+| python     | `python/tests/test_regex_pathological.py` |
+| ruby       | `ruby/test_regex_pathological.rb` |
+| php        | `php/tests/RegexPathologicalTest.php` |
+| perl       | `perl/t/regex_pathological.t` |
+| go         | `go/regex_pathological_test.go` |
+| rust       | `rust/tests/regex_pathological.rs` |
+| java       | `java/src/test/RegexPathologicalTest.java` |
+| cpp        | `cpp/tests/regex_pathological.cpp` |
+| c          | `c/tests/regex_pathological.c` |
+| lua        | `lua/test/regex_pathological.lua` |
+| csharp     | `csharp/tests/RegexPathologicalTest.cs` |
+| kotlin     | `kotlin/src/test/kotlin/voxgig/struct/RegexPathologicalTest.kt` |
+| swift      | `swift/Tests/VoxgigStructTests/RegexPathologicalTests.swift` |
+| zig        | `zig/test/regex_pathological.zig` |
diff --git a/c/tests/regex_pathological.c b/c/tests/regex_pathological.c
new file mode 100644
index 0000000..c013882
--- /dev/null
+++ b/c/tests/regex_pathological.c
@@ -0,0 +1,127 @@
+/* Discovery test: pathological regex inputs run against the port's vs_re_*
+ * API. Goal is to surface failures across ports, not to assert behaviour.
+ * The panel is the same in every port (see REGEX_PATHOLOGICAL.md).
+ *
+ * C has no exception machinery, so this records the return value (or NULL)
+ * for each case. A crash here means the engine aborted on that input.
+ */
+
+#include "voxgig_struct.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+static double now_ms(void) {  /* current CLOCK_MONOTONIC reading, in milliseconds */
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);  /* return value is not checked */
+  return ts.tv_sec * 1000.0 + ts.tv_nsec / 1e6;  /* seconds -> ms, plus nanoseconds -> ms */
+}
+
+static char* repeat(char c, size_t n) {  /* heap string holding n copies of c; caller frees */
+  char* s = (char*)calloc(n + 1, 1);     /* zero-filled, so s[n] is already the NUL */
+  if (!s) { abort(); }                   /* out of memory: stop here rather than memset(NULL, ...) */
+  memset(s, c, n);
+  return s;
+}
+
+static void print_strvec(const vs_strvec* v) {  /* prints v as ["a","b",...] with no trailing newline */
+  printf("[");
+  for (size_t i = 0; i < v->len; i++) {
+    printf("%s\"%s\"", i ? "," : "", v->data[i] ? v->data[i] : "");  /* comma before all but the first; a NULL entry prints as ""; no escaping */
+  }
+  printf("]");
+}
+
+int main(void) {  /* runs P1..P10 in order, printing one "[regex-discovery]" line per case; always returns 0 */
+  char* a22 = repeat('a', 22);
+  char* p1_in = (char*)malloc(strlen(a22) + 2);  /* +2: room for '!' and the NUL. NOTE(review): malloc result is not checked */
+  sprintf(p1_in, "%s!", a22);
+
+  char* opens = repeat('(', 40);
+  char* closes = repeat(')', 40);
+  char* nest40 = (char*)malloc(40 + 1 + 40 + 1);  /* 40 '(' + 'a' + 40 ')' + NUL */
+  sprintf(nest40, "%sa%s", opens, closes);
+
+  double t0, ms;
+
+  /* P1 */
+  t0 = now_ms();
+  bool b1 = vs_re_test("^(a+)+$", p1_in);
+  ms = now_ms() - t0;
+  printf("[regex-discovery] P1_redos_nested_plus | %.2fms | OK | %s\n", ms, b1 ? "true" : "false");
+
+  /* P2 */
+  t0 = now_ms();
+  bool b2 = vs_re_test("^(a|aa)+$", p1_in);  /* same 22 a's + '!' input as P1 */
+  ms = now_ms() - t0;
+  printf("[regex-discovery] P2_redos_alt_overlap | %.2fms | OK | %s\n", ms, b2 ? "true" : "false");
+
+  /* P3 */
+  t0 = now_ms();
+  char* p3 = vs_re_replace("a*", "abc", "X");
+  ms = now_ms() - t0;
+  printf("[regex-discovery] P3_empty_repeat_replace | %.2fms | OK | \"%s\"\n", ms, p3 ? p3 : "(null)");  /* a NULL result is still labelled OK */
+  free(p3);
+
+  /* P4 */
+  t0 = now_ms();
+  char* p4 = vs_re_replace("\\.", "café.au.lait", "/");
+  ms = now_ms() - t0;
+  printf("[regex-discovery] P4_unicode_replace_dot | %.2fms | OK | \"%s\"\n", ms, p4 ? p4 : "(null)");
+  free(p4);
+
+  /* P5 */
+  t0 = now_ms();
+  vs_strvec p5 = vs_re_find("é", "café au lait");
+  ms = now_ms() - t0;
+  printf("[regex-discovery] P5_unicode_find_codepoint | %.2fms | OK | ", ms);
+  print_strvec(&p5);
+  printf("\n");
+  vs_strvec_free(&p5);
+
+  /* P6 */
+  t0 = now_ms();
+  bool b6 = vs_re_test(nest40, "a");
+  ms = now_ms() - t0;
+  printf("[regex-discovery] P6_deep_nesting_compile | %.2fms | OK | %s\n", ms, b6 ? "true" : "false");
+
+  /* P7 */
+  t0 = now_ms();
+  char* p7_in = (char*)malloc(12);  /* 10 a's + 'b' + NUL; note the allocation falls inside the timed region */
+  sprintf(p7_in, "%sb", "aaaaaaaaaa");
+  bool b7 = vs_re_test("^a{0,10000}b$", p7_in);
+  ms = now_ms() - t0;
+  printf("[regex-discovery] P7_big_bounded_quantifier | %.2fms | OK | %s\n", ms, b7 ? "true" : "false");
+  free(p7_in);
+
+  /* P8 — invalid pattern. vs_re_compile returns NULL on error. */
+  t0 = now_ms();
+  vs_regex* p8 = vs_re_compile("[abc");
+  ms = now_ms() - t0;
+  if (p8) {
+    printf("[regex-discovery] P8_invalid_pattern | %.2fms | OK | \"compiled-without-error\"\n", ms);
+    /* leak: no vs_regex_free in public header */
+  } else {
+    printf("[regex-discovery] P8_invalid_pattern | %.2fms | ERR | compile returned NULL\n", ms);
+  }
+
+  /* P9 */
+  t0 = now_ms();
+  bool b9 = vs_re_test("^(a+)\\1$", "aaaa");
+  ms = now_ms() - t0;
+  printf("[regex-discovery] P9_backref_re2_forbidden | %.2fms | OK | %s\n", ms, b9 ? "true" : "false");
+
+  /* P10 */
+  t0 = now_ms();
+  (void)vs_re_test("a*", "bbb");  /* find_all not in public header */
+  ms = now_ms() - t0;
+  printf("[regex-discovery] P10_find_all_zero_width | %.2fms | OK | <find_all not exposed>\n", ms);  /* NOTE(review): the time printed is for vs_re_test, not a find_all call */
+
+  free(a22);
+  free(p1_in);
+  free(opens);
+  free(closes);
+  free(nest40);
+  return 0;
+}
diff --git a/cpp/tests/regex_pathological.cpp b/cpp/tests/regex_pathological.cpp
new file mode 100644
index 0000000..066233e
--- /dev/null
+++ b/cpp/tests/regex_pathological.cpp
@@ -0,0 +1,84 @@
+// Discovery test: pathological regex inputs run against the port's re_* API.
+// Goal is to surface failures across ports, not to assert behaviour.
+// Panel is the same in every port (see REGEX_PATHOLOGICAL.md).
+
+#include "voxgig_struct.hpp"
+
+#include <chrono>
+#include <cstdio>
+#include <functional>
+#include <regex>
+#include <sstream>
+#include <string>
+
+using namespace voxgig::structlib;
+
+// Render outcomes as JSON-ish so output matches the other ports.
+static std::string j_str(const std::string& s) {  // returns s wrapped in double quotes
+  std::string out = "\"";
+  for (char c : s) {
+    if (c == '"' || c == '\\')  // only '"' and '\' get a backslash prefix; every other byte is copied as-is
+      out.push_back('\\'), out.push_back(c);
+    else
+      out.push_back(c);
+  }
+  out.push_back('"');
+  return out;
+}
+
+template <typename F>
+static void record(const char* label, F fn) {  // runs fn and prints one "[regex-discovery] label | ms | outcome" line
+  auto t0 = std::chrono::steady_clock::now();
+  std::string outcome;
+  try {
+    outcome = std::string("OK | ") + fn();  // fn's result must be appendable to std::string
+  } catch (const std::exception& e) {
+    outcome = std::string("ERR | ") + typeid(e).name() + ": " + e.what();  // NOTE(review): typeid requires <typeinfo>, which this file does not include directly
+  } catch (...) {
+    outcome = "ERR | unknown exception";
+  }
+  double ms = std::chrono::duration<double, std::milli>(  // elapsed time covers fn() plus building the outcome string
+                std::chrono::steady_clock::now() - t0)
+                .count();
+  std::printf("[regex-discovery] %s | %.2fms | %s\n", label, ms, outcome.c_str());
+}
+
+static std::string as_bool(bool b) { return b ? "true" : "false"; }  // renders a bool as the text "true" or "false"
+
+static std::string as_vec(const std::vector<std::string>& v) {  // renders v as ["a","b",...]; NOTE(review): <vector> is not included directly by this file
+  std::string s = "[";
+  for (size_t i = 0; i < v.size(); i++) {
+    if (i) s += ",";  // separator before every element except the first
+    s += j_str(v[i]);
+  }
+  s += "]";
+  return s;
+}
+
+static std::string as_vec2(const std::vector<std::vector<std::string>>& v) {  // renders v as a list of as_vec() lists
+  std::string s = "[";
+  for (size_t i = 0; i < v.size(); i++) {
+    if (i) s += ",";  // separator before every element except the first
+    s += as_vec(v[i]);
+  }
+  s += "]";
+  return s;
+}
+
+int main() {  // runs P1..P10 through record(); always returns 0
+  std::string a22(22, 'a');
+  std::string nest40 = std::string(40, '(') + "a" + std::string(40, ')');  // 40 nested groups around a single 'a'
+
+  record("P1_redos_nested_plus",      [&] { return as_bool(re_test("^(a+)+$", a22 + "!")); });
+  record("P2_redos_alt_overlap",      [&] { return as_bool(re_test("^(a|aa)+$", a22 + "!")); });
+  record("P3_empty_repeat_replace",   [&] { return j_str(re_replace("a*", "abc", "X")); });
+  record("P4_unicode_replace_dot",    [&] { return j_str(re_replace("\\.", "café.au.lait", "/")); });
+  record("P5_unicode_find_codepoint", [&] { return as_vec(re_find("é", "café au lait")); });
+  record("P6_deep_nesting_compile",   [&] { return as_bool(re_test(nest40, "a")); });
+  record("P7_big_bounded_quantifier", [&] { return as_bool(re_test("^a{0,10000}b$", std::string(10, 'a') + "b")); });
+  record("P8_invalid_pattern",        [&] { (void)re_compile("[abc"); return std::string("\"compiled\""); });  // compile result is discarded; only a throw is observable
+  record("P9_backref_re2_forbidden",  [&] { return as_bool(re_test("^(a+)\\1$", "aaaa")); });
+  record("P10_find_all_zero_width",   [&] { return as_vec2(re_find_all("a*", "bbb")); });
+
+  return 0;
+}
diff --git a/csharp/tests/RegexPathologicalTest.cs b/csharp/tests/RegexPathologicalTest.cs
new file mode 100644
index 0000000..122a632
--- /dev/null
+++ b/csharp/tests/RegexPathologicalTest.cs
@@ -0,0 +1,54 @@
+/* Copyright (c) 2025-2026 Voxgig Ltd. MIT LICENSE. */
+
+// RUN: cd csharp/tests && dotnet test --filter "DisplayName~RegexPathological"
+//
+// Discovery test: pathological regex inputs run against the port's Re* API.
+// Goal is to surface failures across ports, not to assert behaviour.
+// Panel is the same in every port (see REGEX_PATHOLOGICAL.md).
+
+using System;
+using System.Diagnostics;
+using System.Text.Json;
+using static Voxgig.Struct.StructUtils;
+using Xunit;
+
+namespace Voxgig.Tests;
+
+public class RegexPathologicalTest  // prints one "[regex-discovery]" line per panel case; contains no assertions
+{
+    private static void Record(string label, Func<object?> fn)  // runs fn, times it, and writes the outcome to the console
+    {
+        var sw = Stopwatch.StartNew();
+        string outcome;
+        try
+        {
+            var r = fn();
+            outcome = "OK | " + JsonSerializer.Serialize(r);  // serialization is inside the try, so a serializer failure is also reported as ERR
+        }
+        catch (Exception e)
+        {
+            outcome = "ERR | " + e.GetType().Name + ": " + e.Message;
+        }
+        sw.Stop();  // elapsed time covers fn() and the serialization
+        var ms = sw.Elapsed.TotalMilliseconds;
+        Console.WriteLine($"[regex-discovery] {label} | {ms:F2}ms | {outcome}");
+    }
+
+    [Fact]
+    public void Panel()  // runs P1..P10 in order
+    {
+        var a22 = new string('a', 22);
+        var nest40 = new string('(', 40) + "a" + new string(')', 40);  // 40 nested groups around a single 'a'
+
+        Record("P1_redos_nested_plus",      () => ReTest("^(a+)+$", a22 + "!"));
+        Record("P2_redos_alt_overlap",      () => ReTest("^(a|aa)+$", a22 + "!"));
+        Record("P3_empty_repeat_replace",   () => ReReplace("a*", "abc", "X"));
+        Record("P4_unicode_replace_dot",    () => ReReplace("\\.", "café.au.lait", "/"));
+        Record("P5_unicode_find_codepoint", () => ReFind("é", "café au lait"));
+        Record("P6_deep_nesting_compile",   () => ReTest(nest40, "a"));
+        Record("P7_big_bounded_quantifier", () => ReTest("^a{0,10000}b$", new string('a', 10) + "b"));
+        Record("P8_invalid_pattern",        () => ReCompile("[abc"));
+        Record("P9_backref_re2_forbidden",  () => ReTest("^(a+)\\1$", "aaaa"));
+        Record("P10_find_all_zero_width",   () => ReFindAll("a*", "bbb"));
+    }
+}
diff --git a/go/regex_pathological_test.go b/go/regex_pathological_test.go
new file mode 100644
index 0000000..ef673c0
--- /dev/null
+++ b/go/regex_pathological_test.go
@@ -0,0 +1,54 @@
+// RUN: go test -run=TestRegexPathological -v
+//
+// Discovery test: pathological regex inputs run against the port's Re* API.
+// Goal is to surface failures across ports, not to assert behaviour.
+// Panel is the same in every port (see REGEX_PATHOLOGICAL.md).
+
+package voxgigstruct_test
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+	"testing"
+	"time"
+
+	voxgigstruct "github.com/voxgig/struct/go"
+)
+
+func record(label string, fn func() any) { // runs fn, times it, and prints one "[regex-discovery]" line
+	t0 := time.Now()
+	var outcome string
+	func() { // inner closure so the deferred recover can set outcome before the line is printed
+		defer func() {
+			if r := recover(); r != nil {
+				outcome = fmt.Sprintf("ERR | panic: %v", r) // a panic in fn (or in Marshal) is reported instead of propagating
+			}
+		}()
+		r := fn()
+		b, err := json.Marshal(r)
+		if err != nil {
+			outcome = fmt.Sprintf("OK | <unjsonable %T>: %v", r, r) // fn succeeded; only the rendering failed
+			return
+		}
+		outcome = fmt.Sprintf("OK | %s", string(b))
+	}()
+	ms := float64(time.Since(t0).Microseconds()) / 1000.0 // elapsed time covers fn() and the JSON rendering
+	fmt.Printf("[regex-discovery] %s | %.2fms | %s\n", label, ms, outcome)
+}
+
+func TestRegexPathological(t *testing.T) { // t is never used: the panel only prints, it cannot fail the test
+	a22 := strings.Repeat("a", 22)
+	nest40 := strings.Repeat("(", 40) + "a" + strings.Repeat(")", 40) // 40 nested groups around a single 'a'
+
+	record("P1_redos_nested_plus", func() any { return voxgigstruct.ReTest("^(a+)+$", a22+"!") })
+	record("P2_redos_alt_overlap", func() any { return voxgigstruct.ReTest("^(a|aa)+$", a22+"!") })
+	record("P3_empty_repeat_replace", func() any { return voxgigstruct.ReReplace("a*", "abc", "X") })
+	record("P4_unicode_replace_dot", func() any { return voxgigstruct.ReReplace(`\.`, "café.au.lait", "/") })
+	record("P5_unicode_find_codepoint", func() any { return voxgigstruct.ReFind("é", "café au lait") })
+	record("P6_deep_nesting_compile", func() any { return voxgigstruct.ReTest(nest40, "a") })
+	record("P7_big_bounded_quantifier", func() any { return voxgigstruct.ReTest("^a{0,10000}b$", strings.Repeat("a", 10)+"b") })
+	record("P8_invalid_pattern", func() any { return voxgigstruct.ReCompile("[abc") != nil }) // records only whether the result is non-nil
+	record("P9_backref_re2_forbidden", func() any { return voxgigstruct.ReTest(`^(a+)\1$`, "aaaa") })
+	record("P10_find_all_zero_width", func() any { return voxgigstruct.ReFindAll("a*", "bbb") })
+}
diff --git a/java/src/test/RegexPathologicalTest.java b/java/src/test/RegexPathologicalTest.java
new file mode 100644
index 0000000..b1b6870
--- /dev/null
+++ b/java/src/test/RegexPathologicalTest.java
@@ -0,0 +1,46 @@
+// RUN: mvn -Dtest=RegexPathologicalTest test
+//
+// Discovery test: pathological regex inputs run against the port's re* API.
+// Goal is to surface failures across ports, not to assert behaviour.
+// Panel is the same in every port (see REGEX_PATHOLOGICAL.md).
+
+package voxgig.struct;
+
+import com.google.gson.Gson;
+import org.junit.jupiter.api.Test;
+
+import java.util.function.Supplier;
+
+class RegexPathologicalTest {  // prints one "[regex-discovery]" line per panel case; contains no assertions
+  private static final Gson GSON = new Gson();
+
+  private static void record(String label, Supplier<Object> fn) {  // runs fn, times it, and prints the outcome
+    long t0 = System.nanoTime();
+    String outcome;
+    try {
+      Object r = fn.get();
+      outcome = "OK | " + GSON.toJson(r);  // rendering is inside the try, so a Gson failure is also reported as ERR
+    } catch (Throwable e) {  // Throwable rather than Exception, so Errors are reported too
+      outcome = "ERR | " + e.getClass().getSimpleName() + ": " + e.getMessage();
+    }
+    double ms = (System.nanoTime() - t0) / 1e6;  // nanoseconds -> milliseconds; covers fn and the rendering
+    System.out.printf("[regex-discovery] %s | %.2fms | %s%n", label, ms, outcome);
+  }
+
+  @Test
+  void panel() {  // runs P1..P10 in order
+    String a22 = "a".repeat(22);
+    String nest40 = "(".repeat(40) + "a" + ")".repeat(40);  // 40 nested groups around a single 'a'
+
+    record("P1_redos_nested_plus",      () -> Struct.reTest("^(a+)+$", a22 + "!"));
+    record("P2_redos_alt_overlap",      () -> Struct.reTest("^(a|aa)+$", a22 + "!"));
+    record("P3_empty_repeat_replace",   () -> Struct.reReplace("a*", "abc", "X"));
+    record("P4_unicode_replace_dot",    () -> Struct.reReplace("\\.", "café.au.lait", "/"));
+    record("P5_unicode_find_codepoint", () -> Struct.reFind("é", "café au lait"));
+    record("P6_deep_nesting_compile",   () -> Struct.reTest(nest40, "a"));
+    record("P7_big_bounded_quantifier", () -> Struct.reTest("^a{0,10000}b$", "a".repeat(10) + "b"));
+    record("P8_invalid_pattern",        () -> Struct.reCompile("[abc"));
+    record("P9_backref_re2_forbidden",  () -> Struct.reTest("^(a+)\\1$", "aaaa"));
+    record("P10_find_all_zero_width",   () -> Struct.reFindAll("a*", "bbb"));
+  }
+}
diff --git a/javascript/test/regex_pathological.test.js b/javascript/test/regex_pathological.test.js
new file mode 100644
index 0000000..f10177c
--- /dev/null
+++ b/javascript/test/regex_pathological.test.js
@@ -0,0 +1,43 @@
+// VERSION: @voxgig/struct 0.1.0
+//
+// Discovery test: pathological regex inputs run against the port's re_* API.
+// The goal is to surface which inputs cause errors, hangs, or surprising
+// output across ports — NOT to assert any specific behaviour. Each case
+// wraps the call in try/catch so one failure does not mask the others.
+// The panel is the same in every port (see REGEX_PATHOLOGICAL.md).
+
+const { test } = require('node:test')
+const struct = require('../src/struct')
+
+const { re_compile, re_test, re_find, re_find_all, re_replace } = struct
+
+function rep(s, n) { return new Array(n + 1).join(s) }
+
+function record(label, fn) {  // runs fn, times it, and logs one "[regex-discovery]" line
+  const t0 = process.hrtime.bigint()
+  let outcome
+  try {
+    const r = fn()
+    outcome = `OK | ${JSON.stringify(r)}`  // rendering is inside the try, so a stringify failure is also reported as ERR
+  } catch (e) {
+    outcome = `ERR | ${e && e.message ? e.message : String(e)}`  // tolerates thrown values that are not Error objects
+  }
+  const ms = Number(process.hrtime.bigint() - t0) / 1e6  // nanoseconds -> milliseconds
+  console.log(`[regex-discovery] ${label} | ${ms.toFixed(2)}ms | ${outcome}`)
+}
+
+test('regex pathological discovery', () => {  // runs P1..P10 in order; nothing is asserted
+  const A22 = rep('a', 22)
+  const NEST40 = rep('(', 40) + 'a' + rep(')', 40)  // 40 nested groups around a single 'a'
+
+  record('P1_redos_nested_plus',     () => re_test('^(a+)+$', A22 + '!'))
+  record('P2_redos_alt_overlap',     () => re_test('^(a|aa)+$', A22 + '!'))
+  record('P3_empty_repeat_replace',  () => re_replace('a*', 'abc', 'X'))
+  record('P4_unicode_replace_dot',   () => re_replace('\\.', 'café.au.lait', '/'))
+  record('P5_unicode_find_codepoint',() => re_find('é', 'café au lait'))
+  record('P6_deep_nesting_compile',  () => re_test(NEST40, 'a'))
+  record('P7_big_bounded_quantifier',() => re_test('^a{0,10000}b$', rep('a', 10) + 'b'))
+  record('P8_invalid_pattern',       () => re_compile('[abc'))
+  record('P9_backref_re2_forbidden', () => re_test('^(a+)\\1$', 'aaaa'))
+  record('P10_find_all_zero_width',  () => re_find_all('a*', 'bbb'))
+})
diff --git a/kotlin/src/test/kotlin/voxgig/struct/RegexPathologicalTest.kt b/kotlin/src/test/kotlin/voxgig/struct/RegexPathologicalTest.kt
new file mode 100644
index 0000000..e9bed34
--- /dev/null
+++ b/kotlin/src/test/kotlin/voxgig/struct/RegexPathologicalTest.kt
@@ -0,0 +1,41 @@
+// Discovery test: pathological regex inputs run against the port's re* API.
+// Goal is to surface failures across ports, not to assert behaviour.
+// Panel is the same in every port (see REGEX_PATHOLOGICAL.md).
+
+package voxgig.struct
+
+import com.google.gson.Gson
+import kotlin.test.Test
+
+class RegexPathologicalTest {  // prints one "[regex-discovery]" line per panel case; contains no assertions
+    private val gson = Gson()
+
+    private fun record(label: String, fn: () -> Any?) {  // runs fn, times it, and prints the outcome
+        val t0 = System.nanoTime()
+        val outcome: String = try {
+            val r = fn()
+            "OK | " + gson.toJson(r)  // rendering is inside the try, so a Gson failure is also reported as ERR
+        } catch (e: Throwable) {  // Throwable rather than Exception, so Errors are reported too
+            "ERR | ${e::class.simpleName}: ${e.message}"
+        }
+        val ms = (System.nanoTime() - t0) / 1e6  // nanoseconds -> milliseconds
+        println("[regex-discovery] %s | %.2fms | %s".format(label, ms, outcome))
+    }
+
+    @Test
+    fun panel() {  // runs P1..P10 in order
+        val a22 = "a".repeat(22)
+        val nest40 = "(".repeat(40) + "a" + ")".repeat(40)  // 40 nested groups around a single 'a'
+
+        record("P1_redos_nested_plus")      { Struct.reTest("^(a+)+\$", a22 + "!") }
+        record("P2_redos_alt_overlap")      { Struct.reTest("^(a|aa)+\$", a22 + "!") }
+        record("P3_empty_repeat_replace")   { Struct.reReplace("a*", "abc", "X") }
+        record("P4_unicode_replace_dot")    { Struct.reReplace("\\.", "café.au.lait", "/") }
+        record("P5_unicode_find_codepoint") { Struct.reFind("é", "café au lait") }
+        record("P6_deep_nesting_compile")   { Struct.reTest(nest40, "a") }
+        record("P7_big_bounded_quantifier") { Struct.reTest("^a{0,10000}b\$", "a".repeat(10) + "b") }
+        record("P8_invalid_pattern")        { Struct.reCompile("[abc") }
+        record("P9_backref_re2_forbidden")  { Struct.reTest("^(a+)\\1\$", "aaaa") }
+        record("P10_find_all_zero_width")   { Struct.reFindAll("a*", "bbb") }
+    }
+}
diff --git a/lua/test/regex_pathological.lua b/lua/test/regex_pathological.lua
new file mode 100644
index 0000000..5207485
--- /dev/null
+++ b/lua/test/regex_pathological.lua
@@ -0,0 +1,62 @@
+-- Discovery test: pathological regex inputs run against the port's re_* API.
+-- Goal is to surface failures across ports, not to assert behaviour.
+-- Panel is the same in every port (see REGEX_PATHOLOGICAL.md).
+--
+-- RUN: lua test/regex_pathological.lua
+
+package.path = "../src/?.lua;./src/?.lua;" .. (package.path or "")
+local re = require("regex")
+
+local function json_str(s)  -- renders s (via tostring) as a double-quoted string literal
+  return '"' .. tostring(s):gsub('[\\"]', '\\%0') .. '"'  -- escape both \ and ": escaping " alone leaves backslashes ambiguous
+end
+
+local function json_table(t)  -- renders the ipairs sequence of t as [v1,v2,...]
+  local parts = {}
+  for _, v in ipairs(t) do  -- ipairs: only the 1..n sequence is visited, other keys are ignored
+    if type(v) == "table" then
+      parts[#parts + 1] = json_table(v)  -- nested tables are rendered recursively
+    elseif type(v) == "string" then
+      parts[#parts + 1] = json_str(v)
+    else
+      parts[#parts + 1] = tostring(v)
+    end
+  end
+  return "[" .. table.concat(parts, ",") .. "]"
+end
+
+local function render(r)  -- dispatches on type(r) to produce the text shown after "OK | "
+  local t = type(r)
+  if t == "nil" then return "null"
+  elseif t == "boolean" then return tostring(r)
+  elseif t == "string" then return json_str(r)
+  elseif t == "table" then return json_table(r)
+  else return tostring(r) end  -- numbers, functions, userdata: plain tostring
+end
+
+local function record(label, fn)  -- runs fn under pcall, times it, and writes one "[regex-discovery]" line
+  local t0 = os.clock()  -- NOTE(review): os.clock reports CPU time, not wall-clock time
+  local ok, r = pcall(fn)  -- only fn's first return value is kept
+  local ms = (os.clock() - t0) * 1000.0  -- seconds -> milliseconds; taken before rendering
+  local outcome
+  if ok then
+    outcome = "OK | " .. render(r)
+  else
+    outcome = "ERR | " .. tostring(r)  -- on failure r is the error value from pcall
+  end
+  io.write(string.format("[regex-discovery] %s | %.2fms | %s\n", label, ms, outcome))
+end
+
+local a22 = string.rep("a", 22)
+local nest40 = string.rep("(", 40) .. "a" .. string.rep(")", 40)  -- 40 nested groups around a single 'a'
+
+record("P1_redos_nested_plus",      function() return re.re_test("^(a+)+$", a22 .. "!") end)
+record("P2_redos_alt_overlap",      function() return re.re_test("^(a|aa)+$", a22 .. "!") end)
+record("P3_empty_repeat_replace",   function() return re.re_replace("a*", "abc", "X") end)
+record("P4_unicode_replace_dot",    function() return re.re_replace("\\.", "café.au.lait", "/") end)
+record("P5_unicode_find_codepoint", function() return re.re_find("é", "café au lait") end)
+record("P6_deep_nesting_compile",   function() return re.re_test(nest40, "a") end)
+record("P7_big_bounded_quantifier", function() return re.re_test("^a{0,10000}b$", string.rep("a", 10) .. "b") end)
+record("P8_invalid_pattern",        function() return re.re_compile("[abc") end)
+record("P9_backref_re2_forbidden",  function() return re.re_test("^(a+)\\1$", "aaaa") end)
+record("P10_find_all_zero_width",   function() return re.re_find_all("a*", "bbb") end)  -- last panel case; the script has no exit status logic
diff --git a/perl/t/regex_pathological.t b/perl/t/regex_pathological.t
new file mode 100644
index 0000000..a6cd024
--- /dev/null
+++ b/perl/t/regex_pathological.t
@@ -0,0 +1,51 @@
+#!perl
+# Discovery test: pathological regex inputs run against the port's re_* API.
+# Goal is to surface failures across ports, not to assert behaviour.
+# Panel is the same in every port (see REGEX_PATHOLOGICAL.md).
+
+use 5.018;
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Voxgig::Struct qw();
+use JSON::PP qw(encode_json);
+use Time::HiRes qw(gettimeofday tv_interval);
+
+binmode STDOUT, ':utf8';
+
+sub record {  # runs $fn under eval, times it, and prints one "[regex-discovery]" line
+    my ($label, $fn) = @_;
+    my $t0 = [gettimeofday];
+    my $outcome;
+    my $r = eval { $fn->() };  # scalar context: only a single return value is kept
+    if (my $err = $@) {
+        chomp $err;
+        $outcome = "ERR | $err";
+    } else {
+        my $enc = eval { JSON::PP->new->allow_nonref->encode($r) };  # character string; encode_json would give UTF-8 octets that STDOUT's :utf8 layer encodes a second time
+        $enc = (defined $r ? "$r" : 'null') if $@;  # values JSON::PP rejects (e.g. a compiled regex) fall back to plain stringification
+        $outcome = "OK | $enc";
+    }
+    my $ms = tv_interval($t0) * 1000.0;  # seconds -> milliseconds; covers $fn and the rendering
+    printf("[regex-discovery] %s | %.2fms | %s\n", $label, $ms, $outcome);
+}
+
+my $a22    = 'a' x 22;
+my $nest40 = ('(' x 40) . 'a' . (')' x 40);  # 40 nested groups around a single 'a'
+
+record('P1_redos_nested_plus',      sub { Voxgig::Struct::re_test('^(a+)+$', $a22 . '!') });
+record('P2_redos_alt_overlap',      sub { Voxgig::Struct::re_test('^(a|aa)+$', $a22 . '!') });
+record('P3_empty_repeat_replace',   sub { Voxgig::Struct::re_replace('a*', 'abc', 'X') });
+record('P4_unicode_replace_dot',    sub { Voxgig::Struct::re_replace('\\.', 'café.au.lait', '/') });
+record('P5_unicode_find_codepoint', sub { Voxgig::Struct::re_find('é', 'café au lait') });
+record('P6_deep_nesting_compile',   sub { Voxgig::Struct::re_test($nest40, 'a') });
+record('P7_big_bounded_quantifier', sub { Voxgig::Struct::re_test('^a{0,10000}b$', ('a' x 10) . 'b') });
+record('P8_invalid_pattern',        sub { Voxgig::Struct::re_compile('[abc') });
+record('P9_backref_re2_forbidden',  sub { Voxgig::Struct::re_test('^(a+)\\1$', 'aaaa') });
+record('P10_find_all_zero_width',   sub { Voxgig::Struct::re_find_all('a*', 'bbb') });
+
+pass('regex pathological discovery ran');  # the only Test::More assertion, and it is unconditional
+done_testing();
diff --git a/php/tests/RegexPathologicalTest.php b/php/tests/RegexPathologicalTest.php
new file mode 100644
index 0000000..3fc6a10
--- /dev/null
+++ b/php/tests/RegexPathologicalTest.php
@@ -0,0 +1,45 @@
+<?php
+
+// Discovery test: pathological regex inputs run against the port's re_* API.
+// Goal is to surface failures across ports, not to assert behaviour.
+// Panel is the same in every port (see REGEX.md).
+
+require_once __DIR__ . '/../src/Struct.php';
+
+use PHPUnit\Framework\TestCase;
+use Voxgig\Struct\Struct;
+
+class RegexPathologicalTest extends TestCase
+{
+    /** Runs one case and prints "[regex-discovery] label | ms | OK|ERR | detail"; never rethrows. */
+    private static function record(string $label, callable $fn): void
+    {
+        $t0 = hrtime(true);
+        try {
+            $r = $fn();
+            // JSON_THROW_ON_ERROR: an unencodable result (e.g. malformed UTF-8) becomes ERR, not an empty "OK | ".
+            $outcome = 'OK | ' . json_encode($r, JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR);
+        } catch (\Throwable $e) {
+            $outcome = 'ERR | ' . get_class($e) . ': ' . $e->getMessage();
+        }
+        $ms = (hrtime(true) - $t0) / 1e6; // ns -> ms; includes the JSON encoding above
+        printf("[regex-discovery] %s | %.2fms | %s\n", $label, $ms, $outcome);
+    }
+
+    public function testPanel(): void
+    {
+        $a22    = str_repeat('a', 22);
+        $nest40 = str_repeat('(', 40) . 'a' . str_repeat(')', 40);
+        self::record('P1_redos_nested_plus',      fn() => Struct::re_test('^(a+)+$', $a22 . '!'));
+        self::record('P2_redos_alt_overlap',      fn() => Struct::re_test('^(a|aa)+$', $a22 . '!'));
+        self::record('P3_empty_repeat_replace',   fn() => Struct::re_replace('a*', 'abc', 'X'));
+        self::record('P4_unicode_replace_dot',    fn() => Struct::re_replace('\\.', 'café.au.lait', '/'));
+        self::record('P5_unicode_find_codepoint', fn() => Struct::re_find('é', 'café au lait'));
+        self::record('P6_deep_nesting_compile',   fn() => Struct::re_test($nest40, 'a'));
+        self::record('P7_big_bounded_quantifier', fn() => Struct::re_test('^a{0,10000}b$', str_repeat('a', 10) . 'b'));
+        self::record('P8_invalid_pattern',        fn() => Struct::re_compile('[abc'));
+        self::record('P9_backref_re2_forbidden',  fn() => Struct::re_test('^(a+)\\1$', 'aaaa'));
+        self::record('P10_find_all_zero_width',   fn() => Struct::re_find_all('a*', 'bbb'));
+        // Panel outcomes are only printed; this is the single assertion the test makes.
+        $this->assertTrue(true);
+    }
+}
diff --git a/python/tests/test_regex_pathological.py b/python/tests/test_regex_pathological.py
new file mode 100644
index 0000000..7cde0ca
--- /dev/null
+++ b/python/tests/test_regex_pathological.py
@@ -0,0 +1,51 @@
+# RUN: python -m unittest discover -s tests
+#
+# Discovery test: pathological regex inputs run against the port's re_* API.
+# The goal is to surface which inputs cause errors, hangs, or surprising
+# output across ports — NOT to assert any specific behaviour. Each case
+# wraps the call so one failure does not mask the others.
+# The panel is the same in every port (see REGEX.md).
+
+import json
+import time
+import unittest
+
+from voxgig_struct.voxgig_struct import (
+    re_compile,
+    re_test,
+    re_find,
+    re_find_all,
+    re_replace,
+)
+
+
+def record(label, fn):
+    t0 = time.perf_counter()
+    try:
+        r = fn()
+        outcome = f"OK | {json.dumps(r, default=str)}"
+    except Exception as e:
+        outcome = f"ERR | {type(e).__name__}: {e}"
+    ms = (time.perf_counter() - t0) * 1000.0
+    print(f"[regex-discovery] {label} | {ms:.2f}ms | {outcome}")
+
+
+class PathologicalRegex(unittest.TestCase):
+    def test_panel(self):
+        A22 = "a" * 22
+        NEST40 = "(" * 40 + "a" + ")" * 40
+
+        record("P1_redos_nested_plus",      lambda: re_test("^(a+)+$", A22 + "!"))
+        record("P2_redos_alt_overlap",      lambda: re_test("^(a|aa)+$", A22 + "!"))
+        record("P3_empty_repeat_replace",   lambda: re_replace("a*", "abc", "X"))
+        record("P4_unicode_replace_dot",    lambda: re_replace(r"\.", "café.au.lait", "/"))
+        record("P5_unicode_find_codepoint", lambda: re_find("é", "café au lait"))
+        record("P6_deep_nesting_compile",   lambda: re_test(NEST40, "a"))
+        record("P7_big_bounded_quantifier", lambda: re_test("^a{0,10000}b$", "a" * 10 + "b"))
+        record("P8_invalid_pattern",        lambda: re_compile("[abc"))
+        record("P9_backref_re2_forbidden",  lambda: re_test(r"^(a+)\1$", "aaaa"))
+        record("P10_find_all_zero_width",   lambda: re_find_all("a*", "bbb"))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/ruby/test_regex_pathological.rb b/ruby/test_regex_pathological.rb
new file mode 100644
index 0000000..ec9ee17
--- /dev/null
+++ b/ruby/test_regex_pathological.rb
@@ -0,0 +1,37 @@
+require 'minitest/autorun'
+require 'json'
+require_relative 'voxgig_struct'
+
+# Discovery test: pathological regex inputs run against the port's re_* API.
+# Goal is to surface failures across ports, not to assert behaviour.
+# Panel is the same in every port (see REGEX.md).
+
+def record(label, &block)
+  t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+  begin
+    r = block.call
+    outcome = "OK | #{JSON.generate(r)}"
+  rescue Exception => e
+    outcome = "ERR | #{e.class.name}: #{e.message}"
+  end
+  ms = (Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0) * 1000.0
+  printf("[regex-discovery] %s | %.2fms | %s\n", label, ms, outcome)
+end
+
+class PathologicalRegexTest < Minitest::Test
+  def test_panel
+    a22 = 'a' * 22
+    nest40 = ('(' * 40) + 'a' + (')' * 40)
+
+    record('P1_redos_nested_plus')      { VoxgigStruct.re_test('^(a+)+$', a22 + '!') }
+    record('P2_redos_alt_overlap')      { VoxgigStruct.re_test('^(a|aa)+$', a22 + '!') }
+    record('P3_empty_repeat_replace')   { VoxgigStruct.re_replace('a*', 'abc', 'X') }
+    record('P4_unicode_replace_dot')    { VoxgigStruct.re_replace('\\.', 'café.au.lait', '/') }
+    record('P5_unicode_find_codepoint') { VoxgigStruct.re_find('é', 'café au lait') }
+    record('P6_deep_nesting_compile')   { VoxgigStruct.re_test(nest40, 'a') }
+    record('P7_big_bounded_quantifier') { VoxgigStruct.re_test('^a{0,10000}b$', ('a' * 10) + 'b') }
+    record('P8_invalid_pattern')        { VoxgigStruct.re_compile('[abc') }
+    record('P9_backref_re2_forbidden')  { VoxgigStruct.re_test('^(a+)\\1$', 'aaaa') }
+    record('P10_find_all_zero_width')   { VoxgigStruct.re_find_all('a*', 'bbb') }
+  end
+end
diff --git a/rust/tests/regex_pathological.rs b/rust/tests/regex_pathological.rs
new file mode 100644
index 0000000..49bf5da
--- /dev/null
+++ b/rust/tests/regex_pathological.rs
@@ -0,0 +1,64 @@
+// Discovery test: pathological regex inputs run against the port's re_* API.
+// Goal is to surface failures across ports, not to assert behaviour.
+// Panel is the same in every port (see REGEX.md).
+
+use std::panic;
+use std::time::Instant;
+
+use voxgig_struct::{re_compile, re_find, re_find_all, re_replace, re_test};
+
+/// Runs one panel case, timing it and printing a single `[regex-discovery]` line.
+/// A panic unwinding out of `fn_` is caught and printed as `ERR | panic: ...`.
+fn record<F, R>(label: &str, fn_: F)
+where
+    F: FnOnce() -> R + panic::UnwindSafe,
+    R: std::fmt::Debug,
+{
+    let started = Instant::now();
+    let outcome = panic::catch_unwind(fn_)
+        .map(|value| format!("OK | {:?}", value))
+        .unwrap_or_else(|payload| {
+            // The payload is tried as `&str` first, then as `String`.
+            let text = payload
+                .downcast_ref::<&str>()
+                .map(|s| s.to_string())
+                .or_else(|| payload.downcast_ref::<String>().cloned())
+                .unwrap_or_else(|| "<non-string panic>".to_string());
+            format!("ERR | panic: {}", text)
+        });
+    let ms = started.elapsed().as_secs_f64() * 1000.0;
+    println!("[regex-discovery] {} | {:.2}ms | {}", label, ms, outcome);
+}
+
+#[test]
+fn regex_pathological_discovery() {
+    let a_run = "a".repeat(22);
+    let nested = format!("{}a{}", "(".repeat(40), ")".repeat(40));
+    // Every case goes through `record`; nothing below asserts on a result.
+    record("P1_redos_nested_plus", || {
+        re_test("^(a+)+$", &format!("{}!", a_run))
+    });
+    record("P2_redos_alt_overlap", || {
+        re_test("^(a|aa)+$", &format!("{}!", a_run))
+    });
+    record("P3_empty_repeat_replace", || {
+        re_replace("a*", "abc", "X")
+    });
+    record("P4_unicode_replace_dot", || {
+        re_replace(r"\.", "café.au.lait", "/")
+    });
+    record("P5_unicode_find_codepoint", || {
+        re_find("é", "café au lait")
+    });
+    record("P6_deep_nesting_compile", || re_test(&nested, "a"));
+    record("P7_big_bounded_quantifier", || {
+        re_test("^a{0,10000}b$", &format!("{}b", "a".repeat(10)))
+    });
+    record("P8_invalid_pattern", || {
+        re_compile("[abc").map(|_| ()).err().map(|e| format!("{:?}", e))
+    });
+    record("P9_backref_re2_forbidden", || {
+        re_test(r"^(a+)\1$", "aaaa")
+    });
+    record("P10_find_all_zero_width", || re_find_all("a*", "bbb"));
+}
diff --git a/swift/Tests/VoxgigStructTests/RegexPathologicalTests.swift b/swift/Tests/VoxgigStructTests/RegexPathologicalTests.swift
new file mode 100644
index 0000000..f81f718
--- /dev/null
+++ b/swift/Tests/VoxgigStructTests/RegexPathologicalTests.swift
@@ -0,0 +1,39 @@
+// Discovery test: pathological regex inputs run against the port's re_* API.
+// Goal is to surface failures across ports, not to assert behaviour.
+// Panel is the same in every port (see REGEX.md).
+
+import XCTest
+
+@testable import VoxgigStruct
+
+final class RegexPathologicalTests: XCTestCase {
+  /// Times `fn`, then prints one `[regex-discovery]` line for the case.
+  /// A nil result is printed as `null`.
+  private func record(_ label: String, _ fn: () -> Any?) {
+    let started = DispatchTime.now().uptimeNanoseconds
+    let result = fn()
+    let finished = DispatchTime.now().uptimeNanoseconds
+    let ms = Double(finished - started) / 1_000_000.0
+    // `fn` is non-throwing, so every case that returns is reported as OK.
+    let outcome = "OK | " + (result.map { "\($0)" } ?? "null")
+    let timing = String(format: "%.2f", ms)
+    print("[regex-discovery] \(label) | \(timing)ms | \(outcome)")
+  }
+
+  /// Runs the ten panel cases in order; outcomes are printed, not asserted.
+  func testPanel() {
+    let aRun = String(repeating: "a", count: 22)
+    let nested = String(repeating: "(", count: 40) + "a" + String(repeating: ")", count: 40)
+
+    record("P1_redos_nested_plus")      { re_test(.string("^(a+)+$"), aRun + "!") }
+    record("P2_redos_alt_overlap")      { re_test(.string("^(a|aa)+$"), aRun + "!") }
+    record("P3_empty_repeat_replace")   { re_replace(.string("a*"), "abc", "X") }
+    record("P4_unicode_replace_dot")    { re_replace(.string("\\."), "café.au.lait", "/") }
+    record("P5_unicode_find_codepoint") { re_find(.string("é"), "café au lait") }
+    record("P6_deep_nesting_compile")   { re_test(.string(nested), "a") }
+    record("P7_big_bounded_quantifier") { re_test(.string("^a{0,10000}b$"), String(repeating: "a", count: 10) + "b") }
+    record("P8_invalid_pattern")        { re_compile("[abc") as Any? }
+    record("P9_backref_re2_forbidden")  { re_test(.string("^(a+)\\1$"), "aaaa") }
+    record("P10_find_all_zero_width")   { re_find_all(.string("a*"), "bbb") }
+  }
+}
diff --git a/typescript/dist-test/regex_pathological.test.js b/typescript/dist-test/regex_pathological.test.js
new file mode 100644
index 0000000..92de7bc
--- /dev/null
+++ b/typescript/dist-test/regex_pathological.test.js
@@ -0,0 +1,41 @@
+"use strict";
+// VERSION: @voxgig/struct 0.1.0
+//
+// Discovery test: pathological regex inputs run against the port's re_* API.
+// Each case wraps the call so one failure does not mask the others.
+// The panel is the same in every port (see REGEX.md).
+Object.defineProperty(exports, "__esModule", { value: true });
+const node_test_1 = require("node:test");
+const StructUtility_1 = require("../dist/StructUtility");
+function rep(s, n) {
+    return new Array(n + 1).join(s);
+}
+function record(label, fn) {
+    const t0 = process.hrtime.bigint();
+    let outcome;
+    try {
+        const r = fn();
+        outcome = `OK | ${JSON.stringify(r)}`;
+    }
+    catch (e) {
+        outcome = `ERR | ${e && e.message ? e.message : String(e)}`;
+    }
+    const ms = Number(process.hrtime.bigint() - t0) / 1e6;
+    // eslint-disable-next-line no-console
+    console.log(`[regex-discovery] ${label} | ${ms.toFixed(2)}ms | ${outcome}`);
+}
+(0, node_test_1.test)('regex pathological discovery', () => {
+    const A22 = rep('a', 22);
+    const NEST40 = rep('(', 40) + 'a' + rep(')', 40);
+    record('P1_redos_nested_plus', () => (0, StructUtility_1.re_test)('^(a+)+$', A22 + '!'));
+    record('P2_redos_alt_overlap', () => (0, StructUtility_1.re_test)('^(a|aa)+$', A22 + '!'));
+    record('P3_empty_repeat_replace', () => (0, StructUtility_1.re_replace)('a*', 'abc', 'X'));
+    record('P4_unicode_replace_dot', () => (0, StructUtility_1.re_replace)('\\.', 'café.au.lait', '/'));
+    record('P5_unicode_find_codepoint', () => (0, StructUtility_1.re_find)('é', 'café au lait'));
+    record('P6_deep_nesting_compile', () => (0, StructUtility_1.re_test)(NEST40, 'a'));
+    record('P7_big_bounded_quantifier', () => (0, StructUtility_1.re_test)('^a{0,10000}b$', rep('a', 10) + 'b'));
+    record('P8_invalid_pattern', () => (0, StructUtility_1.re_compile)('[abc'));
+    record('P9_backref_re2_forbidden', () => (0, StructUtility_1.re_test)('^(a+)\\1$', 'aaaa'));
+    record('P10_find_all_zero_width', () => (0, StructUtility_1.re_find_all)('a*', 'bbb'));
+});
+//# sourceMappingURL=regex_pathological.test.js.map
\ No newline at end of file
diff --git a/typescript/dist-test/regex_pathological.test.js.map b/typescript/dist-test/regex_pathological.test.js.map
new file mode 100644
index 0000000..d0ff244
--- /dev/null
+++ b/typescript/dist-test/regex_pathological.test.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"regex_pathological.test.js","sourceRoot":"","sources":["../test/regex_pathological.test.ts"],"names":[],"mappings":";AAAA,gCAAgC;AAChC,EAAE;AACF,6EAA6E;AAC7E,oEAAoE;AACpE,sDAAsD;;AAEtD,yCAAgC;AAEhC,yDAE8B;AAE9B,SAAS,GAAG,CAAC,CAAS,EAAE,CAAS;IAC/B,OAAO,IAAI,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;AACjC,CAAC;AAED,SAAS,MAAM,CAAC,KAAa,EAAE,EAAiB;IAC9C,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,CAAA;IAClC,IAAI,OAAe,CAAA;IACnB,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,EAAE,EAAE,CAAA;QACd,OAAO,GAAG,QAAQ,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAA;IACvC,CAAC;IAAC,OAAO,CAAM,EAAE,CAAC;QAChB,OAAO,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAA;IAC7D,CAAC;IACD,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC,GAAG,GAAG,CAAA;IACrD,sCAAsC;IACtC,OAAO,CAAC,GAAG,CAAC,qBAAqB,KAAK,MAAM,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,OAAO,EAAE,CAAC,CAAA;AAC7E,CAAC;AAED,IAAA,gBAAI,EAAC,8BAA8B,EAAE,GAAG,EAAE;IACxC,MAAM,GAAG,GAAG,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAA;IACxB,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,GAAG,GAAG,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAA;IAEhD,MAAM,CAAC,sBAAsB,EAAM,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,SAAS,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACvE,MAAM,CAAC,sBAAsB,EAAM,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,WAAW,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACzE,MAAM,CAAC,yBAAyB,EAAG,GAAG,EAAE,CAAC,IAAA,0BAAU,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,CAAC,CAAC,CAAA;IACtE,MAAM,CAAC,wBAAwB,EAAI,GAAG,EAAE,CAAC,IAAA,0BAAU,EAAC,KAAK,EAAE,cAAc,EAAE,GAAG,CAAC,CAAC,CAAA;IAChF,MAAM,CAAC,2BAA2B,EAAC,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,GAAG,EAAE,cAAc,CAAC,CAAC,CAAA;IACtE,MAAM,CAAC,yBAAyB,EAAG,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAA;IAC9D,MAAM,CAAC,2BAA2B,EAAC,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,eAAe,EAAE,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAA;IACtF,MAAM,CAAC,oBAAoB,EAAQ,GAAG,EAAE,CAAC,IAAA,0BAAU,EAAC,MAAM,CAAC,CAAC,CAAA;IAC5D,MAAM,CAAC,0BAA0B,EAAE,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,WAAW,EAAE,
MAAM,CAAC,CAAC,CAAA;IACtE,MAAM,CAAC,yBAAyB,EAAG,GAAG,EAAE,CAAC,IAAA,2BAAW,EAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAA;AACpE,CAAC,CAAC,CAAA"}
\ No newline at end of file
diff --git a/typescript/test/regex_pathological.test.ts b/typescript/test/regex_pathological.test.ts
new file mode 100644
index 0000000..ce7dfa3
--- /dev/null
+++ b/typescript/test/regex_pathological.test.ts
@@ -0,0 +1,45 @@
+// VERSION: @voxgig/struct 0.1.0
+//
+// Discovery test: pathological regex inputs run against the port's re_* API.
+// Each case wraps the call so one failure does not mask the others.
+// The panel is the same in every port (see REGEX.md).
+
+import { test } from 'node:test'
+
+import {
+  re_compile, re_test, re_find, re_find_all, re_replace,
+} from '../dist/StructUtility'
+
+function rep(s: string, n: number): string {
+  return new Array(n + 1).join(s)
+}
+
+function record(label: string, fn: () => unknown): void {
+  const t0 = process.hrtime.bigint()
+  let outcome: string
+  try {
+    const r = fn()
+    outcome = `OK | ${JSON.stringify(r)}`
+  } catch (e: any) {
+    outcome = `ERR | ${e && e.message ? e.message : String(e)}`
+  }
+  const ms = Number(process.hrtime.bigint() - t0) / 1e6
+  // eslint-disable-next-line no-console
+  console.log(`[regex-discovery] ${label} | ${ms.toFixed(2)}ms | ${outcome}`)
+}
+
+test('regex pathological discovery', () => {
+  const A22 = rep('a', 22)
+  const NEST40 = rep('(', 40) + 'a' + rep(')', 40)
+
+  record('P1_redos_nested_plus',     () => re_test('^(a+)+$', A22 + '!'))
+  record('P2_redos_alt_overlap',     () => re_test('^(a|aa)+$', A22 + '!'))
+  record('P3_empty_repeat_replace',  () => re_replace('a*', 'abc', 'X'))
+  record('P4_unicode_replace_dot',   () => re_replace('\\.', 'café.au.lait', '/'))
+  record('P5_unicode_find_codepoint',() => re_find('é', 'café au lait'))
+  record('P6_deep_nesting_compile',  () => re_test(NEST40, 'a'))
+  record('P7_big_bounded_quantifier',() => re_test('^a{0,10000}b$', rep('a', 10) + 'b'))
+  record('P8_invalid_pattern',       () => re_compile('[abc'))
+  record('P9_backref_re2_forbidden', () => re_test('^(a+)\\1$', 'aaaa'))
+  record('P10_find_all_zero_width',  () => re_find_all('a*', 'bbb'))
+})
diff --git a/zig/test/regex_pathological.zig b/zig/test/regex_pathological.zig
new file mode 100644
index 0000000..ff089db
--- /dev/null
+++ b/zig/test/regex_pathological.zig
@@ -0,0 +1,94 @@
+// Discovery test: pathological regex inputs run against the port's re_* API.
+// Goal is to surface failures across ports, not to assert behaviour.
+// Panel is the same in every port (see REGEX.md).
+//
+// Zig's public regex surface currently exposes only re_compile/re_test/re_escape
+// (see src/struct.zig). The find/replace/find_all cases below mark themselves
+// as N/A — that absence is itself part of the discovery.
+
+const std = @import("std");
+const voxgig_struct = @import("voxgig-struct");
+
+fn record_test(label: []const u8, ok: bool, ms: f64, value: anytype) void {
+    const T = @TypeOf(value);
+    const writer = std.io.getStdOut().writer();
+    if (ok) {
+        if (T == bool) {
+            writer.print("[regex-discovery] {s} | {d:.2}ms | OK | {}\n", .{ label, ms, value }) catch {};
+        } else {
+            writer.print("[regex-discovery] {s} | {d:.2}ms | OK | {any}\n", .{ label, ms, value }) catch {};
+        }
+    } else {
+        writer.print("[regex-discovery] {s} | {d:.2}ms | ERR | compile or run failed\n", .{ label, ms }) catch {};
+    }
+}
+
+test "regex pathological discovery" {
+    var buf: [4096]u8 = undefined;
+    var fba = std.heap.FixedBufferAllocator.init(&buf);
+    const alloc = fba.allocator();
+
+    const a22 = try alloc.alloc(u8, 22);
+    @memset(a22, 'a');
+    const p1_in = try std.fmt.allocPrint(alloc, "{s}!", .{a22});
+
+    var nest_buf: [120]u8 = undefined;
+    var pos: usize = 0;
+    while (pos < 40) : (pos += 1) nest_buf[pos] = '(';
+    nest_buf[pos] = 'a';
+    pos += 1;
+    var i: usize = 0;
+    while (i < 40) : (i += 1) {
+        nest_buf[pos] = ')';
+        pos += 1;
+    }
+    const nest40 = nest_buf[0..pos];
+
+    // P1
+    var t0 = std.time.nanoTimestamp();
+    const b1 = voxgig_struct.re_test("^(a+)+$", p1_in);
+    var ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
+    record_test("P1_redos_nested_plus", true, ms, b1);
+
+    // P2
+    t0 = std.time.nanoTimestamp();
+    const b2 = voxgig_struct.re_test("^(a|aa)+$", p1_in);
+    ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
+    record_test("P2_redos_alt_overlap", true, ms, b2);
+
+    // P3, P4, P5, P10 — replace/find/find_all not in zig public surface.
+    const writer = std.io.getStdOut().writer();
+    try writer.print("[regex-discovery] P3_empty_repeat_replace | -.--ms | N/A | re_replace not exposed\n", .{});
+    try writer.print("[regex-discovery] P4_unicode_replace_dot | -.--ms | N/A | re_replace not exposed\n", .{});
+    try writer.print("[regex-discovery] P5_unicode_find_codepoint | -.--ms | N/A | re_find not exposed\n", .{});
+
+    // P6
+    t0 = std.time.nanoTimestamp();
+    const b6 = voxgig_struct.re_test(nest40, "a");
+    ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
+    record_test("P6_deep_nesting_compile", true, ms, b6);
+
+    // P7
+    t0 = std.time.nanoTimestamp();
+    const b7 = voxgig_struct.re_test("^a{0,10000}b$", "aaaaaaaaaab");
+    ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
+    record_test("P7_big_bounded_quantifier", true, ms, b7);
+
+    // P8
+    t0 = std.time.nanoTimestamp();
+    const p8 = voxgig_struct.re_compile("[abc");
+    ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
+    if (p8 == null) {
+        try writer.print("[regex-discovery] P8_invalid_pattern | {d:.2}ms | ERR | compile returned null\n", .{ms});
+    } else {
+        try writer.print("[regex-discovery] P8_invalid_pattern | {d:.2}ms | OK | \"compiled\"\n", .{ms});
+    }
+
+    // P9
+    t0 = std.time.nanoTimestamp();
+    const b9 = voxgig_struct.re_test("^(a+)\\1$", "aaaa");
+    ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
+    record_test("P9_backref_re2_forbidden", true, ms, b9);
+
+    try writer.print("[regex-discovery] P10_find_all_zero_width | -.--ms | N/A | re_find_all not exposed\n", .{});
+}

From df6a6c3c366a6339a3c644d1b05aebbbd445e9bf Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 16 May 2026 10:54:21 +0000
Subject: [PATCH 2/6] Fix pathological-regex porting variations; document
 irreconcilables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the discovery test surfaced seven port-side regex bugs and three
behavioural drifts, fix everything we own and document what stays
engine-bound. See REGEX_PATHOLOGICAL.md for the full pre/post table.

Port-side fixes:

- rust: re.rs `add()` epsilon-closure rewritten as iterative. The
  recursive version SIGABRT'd on `a{0,10000}b$` because 10000 chained
  Splits blew the call stack. Priority preserved by pushing y then x.
  All tests + corpus pass.

- c / lua: in-tree Thompson NFA `OP_MATCH` was `if (!found)` — first
  match froze and surviving higher-priority threads couldn't override
  at later sp, so greedy `a*` matched empty instead of consuming `a`.
  Now always overwrite within the priority-pruned thread set. C corpus
  1200/1200, lua regex 53/53 still green.

- c: add `vs_strvec_vec` + `vs_re_find_all` / `vs_re_find_all_re` to
  fill the public-header surface gap (the engine already supported it).

- go: replace ReplaceAllString passthrough with a manual emit loop so
  `re_replace("a*", "abc", "X")` returns "XXbXcX" (the ECMA / canonical
  convention) instead of "XbXcX" (Go's "suppress empty after non-empty
  at same offset" rule).

- zig: expose `re_find` / `re_find_all` / `re_replace` wrappers and
  promote `Regex.findFirst` + new `findFrom` to pub. Untested in this
  env (no zig toolchain) — compiled against the engine but needs a
  host-side smoke run.

- php: `re_compile` was a string concatenation; it never validated.
  Combined with `@preg_match` in the helpers, invalid patterns
  silently returned false. Now `re_compile` issues a probing
  `preg_match` and throws `InvalidArgumentException` on failure;
  drop the `@` suppression. 85 PHPUnit tests still pass.

- perl: discovery test was double-encoding UTF-8 (encode_json +
  :utf8 stdout). Switch to `JSON::PP->new->utf8(0)`. Port code
  was correct.

Irreconcilable, now explicitly documented in REGEX_PATHOLOGICAL.md:

- P1/P2 catastrophic backtracking: ECMA/PCRE/.NET/Java engines
  backtrack; RE2/Thompson don't. Times span <0.1ms to 1190ms. Users
  must write linear-friendly patterns.
- P7 RE2 bounded-quantifier limit: Go's stdlib caps {n,m} at 1000.
  No portable workaround.
- P9 backreferences: PCRE-family matches, RE2/in-tree don't.
  REGEX.md already declares backrefs outside the dialect.
- Java/.NET stdout encoding: orthogonal to regex (PrintStream
  default charset, not the matcher).
---
 REGEX_PATHOLOGICAL.md           | 246 +++++++++++++++++++++-----------
 c/src/re_util.c                 |  70 +++++++++
 c/src/regex.c                   |  31 ++--
 c/src/voxgig_struct.h           |  14 ++
 c/tests/regex_pathological.c    |  10 +-
 go/voxgigstruct.go              |  47 +++++-
 lua/src/regex.lua               |  10 +-
 perl/t/regex_pathological.t     |   8 +-
 php/src/Struct.php              |  17 ++-
 rust/src/re.rs                  | 104 +++++++-------
 zig/src/regex.zig               |  12 +-
 zig/src/struct.zig              |  96 +++++++++++++
 zig/test/regex_pathological.zig |  77 +++++-----
 13 files changed, 540 insertions(+), 202 deletions(-)

diff --git a/REGEX_PATHOLOGICAL.md b/REGEX_PATHOLOGICAL.md
index 5a8ed95..be100be 100644
--- a/REGEX_PATHOLOGICAL.md
+++ b/REGEX_PATHOLOGICAL.md
@@ -1,9 +1,10 @@
-# Pathological Regex — Cross-Port Discovery
+# Pathological Regex — Cross-Port Discovery & Fixes
 
-> First-pass discovery test. Goal is to surface where each port's regex
-> wrapper misbehaves on pathological inputs. **Not for assertion** —
-> behaviour differs across engines and the test files do not enforce a
-> specific outcome. Fixes come later; this run is to find them.
+> Discovery panel that runs 10 deliberately pathological regex inputs
+> against every port's `re_*` API. The first pass surfaced where port
+> wrappers misbehaved on edge cases; this document records the panel,
+> the **fixed** porting variations, and the irreconcilable
+> engine-bound differences that remain.
 
 The same 10-case panel runs in every port via the port's `re_*` API
 (see `REGEX_API.md`). Each port has a `regex_pathological*` test file
@@ -13,88 +14,167 @@ under its own tests directory.
 
 | # | Name | Call | What it stresses |
 |---|---|---|---|
-| P1  | `redos_nested_plus`         | `re_test("^(a+)+$", "a"*22 + "!")` | Catastrophic backtracking via nested quantifier |
-| P2  | `redos_alt_overlap`         | `re_test("^(a\|aa)+$", "a"*22 + "!")` | Catastrophic backtracking via overlapping alternation |
-| P3  | `empty_repeat_replace`      | `re_replace("a*", "abc", "X")` | Zero-width-match convention in `replace_all` |
-| P4  | `unicode_replace_dot`       | `re_replace("\\.", "café.au.lait", "/")` | UTF-8 char-boundary handling |
-| P5  | `unicode_find_codepoint`    | `re_find("é", "café au lait")` | Non-ASCII patterns |
-| P6  | `deep_nesting_compile`      | `re_test("(((…40…(a)…)))","a")` | Parser/compiler stack |
-| P7  | `big_bounded_quantifier`    | `re_test("^a{0,10000}b$", "a"*10+"b")` | Large bounded quantifier |
-| P8  | `invalid_pattern`           | `re_compile("[abc")` | Error reporting |
-| P9  | `backref_re2_forbidden`     | `re_test("^(a+)\\1$", "aaaa")` | RE2 strictness on backrefs |
-| P10 | `find_all_zero_width`       | `re_find_all("a*", "bbb")` | Zero-width `find_all` enumeration |
-
-## Findings (first run)
-
-Times in ms — wall-clock per case.
-
-| Port       | P1 (ms) | P2  | P3 result    | P4 result        | P7  | P8                | P9 result |
-|------------|--------:|----:|--------------|------------------|----:|-------------------|-----------|
-| typescript |  169    | 3   | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
-| javascript |  172    | 3   | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
-| python     |  185    | 4   | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
-| ruby       |    0.04 | 0.03| `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
-| php        |    2    | 0.3 | `"XXbXcX"`   | `café/au/lait`   | OK  | **OK (silent!)**  | matches   |
-| perl       |    0.06 | 0.04| `"XXbXcX"`   | **`cafÃ©/au/lait`** | OK  | ERR (clean)    | matches   |
-| go         |    0.05 | 0.02| **`"XbXcX"`** | `café/au/lait`  | **PANIC** | **PANIC**   | **PANIC** |
-| rust       |    0.04 | 0.02| `"XXbXcX"`   | `café/au/lait`   | **STACK-OVERFLOW** | — (binary aborted) | — |
-| java       |   16    | 0.2 | `"XXbXcX"`   | `caf?/au/lait`†  | OK  | ERR (clean)       | matches   |
-| cpp        | **1349**| 28  | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
-| c          |    0.01 | 0.01| **`"XaXbXcX"`** | `café/au/lait`| OK  | ERR (NULL return) | non-match |
-| lua        |    0.10 | 0.13| **`"XaXbXcX"`** | `café/au/lait`| OK  | ERR (clean)       | non-match |
-| csharp     |  359    | 7   | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
-| kotlin     |   30    | 0.2 | `"XXbXcX"`   | `café/au/lait`   | OK  | ERR (clean)       | matches   |
-| swift      | n/r     |     |              |                  |     |                   |           |
-| zig        | n/r     |     |              |                  |     |                   |           |
-
-n/r = not run (toolchain unavailable in this environment).
-† Java prints `?` because stdout's default encoding is platform-dependent, not the regex; the JVM-internal string is correctly `café`.
-
-## Failures discovered
-
-1. **rust — `re_test("^a{0,10000}b$", …)` overflows the matcher's stack.**
-   The in-tree Thompson engine appears to allocate per-repeat state on the
-   call stack. The whole test binary aborts (SIGABRT). `panic::catch_unwind`
-   cannot recover from stack-exhaustion. P8/P9/P10 are never reached.
-
-2. **php — `re_compile("[abc")` returns a valid-looking delimited pattern
-   and `re_test` returns `false` silently.** `php/src/Struct.php:565` does
-   `'/' . str_replace('/', '\\/', $pattern) . '/'` without ever compiling;
-   `re_test` uses `@preg_match` which suppresses warnings. Callers can't
-   tell a bad pattern from a no-match.
-
-3. **go — `ReCompile`/`ReTest`/`ReReplace` all use `regexp.MustCompile`
-   and panic** on (a) bounded quantifiers > 1000 (`a{0,10000}` — RE2 limit),
-   (b) invalid patterns, and (c) backrefs (not supported by RE2). The Go
-   API has no shape that lets a caller catch a compile error.
-
-4. **Three distinct P3 conventions for zero-width `replace_all`:**
-   - JS/TS/Python/Ruby/PHP/Java/.NET/Kotlin/Rust/C++ → `"XXbXcX"`
-   - Go → `"XbXcX"`
-   - C / Lua (in-tree engines) → `"XaXbXcX"` (the matching `a` is replaced AND a zero-width insertion is emitted)
-   Internal call sites that depend on the exact shape will diverge.
-
-5. **C and Lua return `false` for the backref pattern P9** rather than
-   erroring; the parser silently consumes `\1` as something other than a
-   backref. RE2 ports (Go) reject it; PCRE/ECMA ports (everyone else) match.
-
-6. **C++ libstdc++ regex shows catastrophic backtracking** — 1349 ms on
-   `(a+)+` over 22 a's. C# (.NET) and Python next worst (~350 ms / ~185 ms).
-   Go / Rust / Ruby / Perl / C / Lua are all sub-millisecond (no backtracking).
-
-7. **perl — UTF-8 round-tripping in `re_replace` corrupts output.** P4
-   returned `cafÃ©/au/lait`. Either the regex returns octets or the JSON
-   encoder treats characters-as-bytes — encoding boundary is wrong in the
+| P1  | `redos_nested_plus`         | `re_test("^(a+)+$", "a"*22 + "!")`         | Catastrophic backtracking via nested quantifier |
+| P2  | `redos_alt_overlap`         | `re_test("^(a\|aa)+$", "a"*22 + "!")`      | Catastrophic backtracking via overlapping alternation |
+| P3  | `empty_repeat_replace`      | `re_replace("a*", "abc", "X")`             | Zero-width-match convention in `replace_all` |
+| P4  | `unicode_replace_dot`       | `re_replace("\\.", "café.au.lait", "/")`   | UTF-8 char-boundary handling |
+| P5  | `unicode_find_codepoint`    | `re_find("é", "café au lait")`             | Non-ASCII patterns |
+| P6  | `deep_nesting_compile`      | `re_test("(((…40…(a)…)))","a")`            | Parser/compiler stack |
+| P7  | `big_bounded_quantifier`    | `re_test("^a{0,10000}b$", "a"*10+"b")`     | Large bounded quantifier |
+| P8  | `invalid_pattern`           | `re_compile("[abc")`                        | Error reporting |
+| P9  | `backref_re2_forbidden`     | `re_test("^(a+)\\1$", "aaaa")`             | RE2 strictness on backrefs |
+| P10 | `find_all_zero_width`       | `re_find_all("a*", "bbb")`                 | Zero-width `find_all` enumeration |
+
+## Post-fix results (14 of 16 ports runnable in this env)
+
+| Port       | P1 (ms) | P2 (ms) | P3 result    | P4 result      | P7    | P8                | P9        |
+|------------|--------:|--------:|--------------|----------------|-------|-------------------|-----------|
+| typescript |  180    | 3       | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
+| javascript |  179    | 3       | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
+| python     |  191    | 4       | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
+| ruby       |    0.04 | 0.05    | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
+| php        |    3    | 0.3     | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
+| perl       |    0.06 | 0.06    | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
+| go         |    0.03 | 0.02    | `"XXbXcX"`   | `café/au/lait` | PANIC | PANIC             | PANIC     |
+| rust       |    0.01 | 0.01    | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | non-match |
+| java       |   13    | 0.2     | `"XXbXcX"`   | `caf?/au/lait` | OK    | ERR (clean)       | matches   |
+| cpp        | **1190**| 24      | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
+| c          |    0.01 | 0.01    | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (NULL return) | non-match |
+| lua        |    0.12 | 0.10    | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | non-match |
+| csharp     |  393    | 8       | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
+| kotlin     |   24    | 0.3     | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
+| swift      | n/r     |         |              |                |       |                   |           |
+| zig        | n/r     |         |              |                |       |                   |           |
+
+n/r = toolchain unavailable in this environment.
+
+## Fixes — porting variations resolved
+
+1. **rust — stack overflow on `a{0,10000}b$`** (`rust/src/re.rs`).
+   The Thompson engine's `add()` epsilon-closure was recursive; 10 000
+   chained `Split` instructions blew the call stack with SIGABRT.
+   Rewrote as iterative with an explicit work stack (priority preserved
+   by pushing `y` then `x`). All 15 tests still pass; the in-tree corpus
+   (1200 cases via the TS-shared spec) still passes.
+
+2. **php — `re_compile` silently accepted invalid patterns**
+   (`php/src/Struct.php`). The wrapper returned a delimited string
+   without ever running PCRE on it, and every other helper used
+   `@preg_match` to suppress warnings. Now `re_compile` issues a
+   no-op `preg_match` to surface compile errors, throws
+   `InvalidArgumentException` on failure, and the `@` is dropped from
+   the read helpers. 85 PHPUnit tests still pass.
+
+3. **c / lua — `re_replace("a*", "abc", "X")` returned `"XaXbXcX"`**
+   (`c/src/regex.c`, `lua/src/regex.lua`). The in-tree Thompson NFA
+   driver's `OP_MATCH` branch had `if (!found) { … }`, which froze
+   the first match found and prevented surviving higher-priority
+   threads from overriding at a later `sp`. That made greedy
+   quantifiers behave lazily — `a*` matched empty at every position
+   instead of consuming the leading `"a"`. Always overwriting on
+   `OP_MATCH` (within the priority-pruned thread set) makes greedy
+   `a*` consume the `"a"` correctly. C corpus 1200/1200 still passes;
+   Lua regex unit tests 53/53 still pass.
+
+4. **c — `re_find_all` missing from public header**
+   (`c/src/voxgig_struct.h`, `c/src/re_util.c`). Added
+   `vs_strvec_vec` + `vs_re_find_all` / `vs_re_find_all_re`. The
+   engine already supported the operation; only the wrapper was
+   missing.
+
+5. **go — `re_replace` zero-width convention differed from JS**
+   (`go/voxgigstruct.go`). Go's `ReplaceAllString` suppresses an
+   empty match immediately after a non-empty match at the same
+   offset, so `re_replace("a*", "abc", "X")` yielded `"XbXcX"`
+   instead of the canonical `"XXbXcX"`. Replaced the passthrough
+   with a manual match-and-emit loop that follows the ECMAScript
+   rule (always emit a replacement, advance by one rune on
+   zero-width). Existing Go tests still pass.
+
+6. **zig — `re_find` / `re_find_all` / `re_replace` not exposed**
+   (`zig/src/struct.zig`, `zig/src/regex.zig`). The engine had
+   `matchAt` but only `re_compile` / `re_test` / `re_escape` were
+   public. Made `findFirst` public, added `findFrom(input, start)`,
+   and added the three wrappers using the page allocator (matching
+   the existing `re_test` style). **Not run in this environment**
+   (no zig toolchain); the wrappers compile against the engine but
+   need a host-side smoke pass.
+
+7. **perl — discovery test showed `cafÃ©/au/lait`** (`perl/t/regex_pathological.t`).
+   This turned out to be a test-script bug, not a port bug:
+   `encode_json` returns UTF-8-encoded bytes, and the `binmode STDOUT,
+   ':utf8'` layer then encoded them a second time, treating each byte
+   as a Latin-1 character. Switched the test to `JSON::PP->new->utf8(0)->encode`
+   so the `:utf8` layer encodes once. The Perl port's `re_replace` was correct all along.
+
+## Irreconcilable — engine-bound, documented for callers
+
+Cases where the host language's regex engine fundamentally differs
+from another's. The cross-port contract documented in `REGEX.md`
+already requires patterns to live in the RE2 subset; these are the
+sharp edges that come with the host engines we don't own.
+
+1. **P1 / P2 catastrophic backtracking.** ECMA / PCRE / .NET / Java
+   regex engines use backtracking. `^(a+)+$` against 22 a's plus a
+   non-match suffix is:
+   - C++ libstdc++ `<regex>`: 1190 ms
+   - C# `System.Text.RegularExpressions`: 393 ms
+   - Python `re`: 191 ms
+   - TS/JS `RegExp`: ~180 ms
+   - Java `java.util.regex`: 13 ms
+   - Ruby (Onigmo) / Perl / PHP (PCRE+JIT): ≤3 ms (engine-side ReDoS mitigations)
+   - Go (RE2) / Rust (in-tree) / C / Lua (Thompson NFA): <0.2 ms (no backtracking)
+
+   The RE2-subset contract avoids the worst classes (no backrefs,
+   no lookaround), but nested quantifiers like `(a+)+` are still
+   inside the subset and can still backtrack catastrophically on
+   the non-RE2 engines. **Callers are responsible for writing
+   linear-friendly patterns** (a single `a+` would already be
+   linear on every engine here). See `REGEX.md` for the dialect.
+
+2. **P7 — RE2's bounded-quantifier limit.** Go's stdlib `regexp`
+   refuses to compile `a{0,10000}` with *"invalid repeat count"*:
+   RE2 caps `{n,m}` at 1000 to keep the compiled program size
+   bounded. Every other engine compiles it. Internal call sites
+   in the corpus stay well below the limit; user-facing `$LIKE`
+   operators should too. There is no portable workaround — RE2's
+   limit is hard-coded in the host stdlib.
+
+3. **P8 — Go panics on invalid pattern.** `ReCompile` is a
+   passthrough to `regexp.MustCompile`, which panics. This is the
+   Go-idiomatic shape and matches the throw/raise behaviour of
+   every other port; callers wrap in `recover()` the same way other
+   ports use `try/catch`. (Not a divergence in semantics — just in
+   how the failure is named.)
+
+4. **P9 — backreferences (`\1`, `(?P=name)`).** Three families:
+   - PCRE / ECMAScript / .NET / Java / Onigmo / Perl: backrefs work.
+     `^(a+)\1$` on "aaaa" matches.
+   - Go (RE2): rejects at compile time (panics).
+   - In-tree engines (Rust, C, Lua): parse `\1` as a literal "1"
+     (or similar fallback) — the pattern compiles but never matches
+     the back-reference semantically, so the test returns `false`.
+
+   `REGEX.md` already documents this: **backreferences are outside
+   the supported dialect.** None of the canonical patterns use them.
+   The `$LIKE` operator does not document them. Callers that need
+   backrefs are running outside the contract on every RE2-family
    port.
 
-8. **C public header omits `re_find_all`** — surface gap vs the rest of
-   the ports.
+5. **Java / .NET stdout encoding.** Java printed `caf?` for P4/P5,
+   not because the regex returned the wrong string but because
+   `System.out`'s default `PrintStream` uses the platform's default
+   charset on JVMs without `-Dfile.encoding=UTF-8`. The in-memory
+   `String` is correct UTF-16. .NET's default `Console.Out` is
+   UTF-8 on .NET 6+, so C# was unaffected. This is orthogonal to
+   the regex contract.
 
-9. **Zig public surface omits `re_find`, `re_find_all`, `re_replace`** —
-   only `re_compile`, `re_test`, `re_escape` are exported. Half the
-   `REGEX_API.md` contract is unimplemented.
+6. **Run-to-run timing variance on backtracking engines.** P1 / P2
+   numbers vary across runs depending on JIT warmup, GC, and host
+   load. The qualitative split (linear vs catastrophic) is stable;
+   the specific milliseconds aren't a regression signal.
 
-## Where the test files live
+## Where the tests live
 
 | Port       | Path |
 |------------|------|
diff --git a/c/src/re_util.c b/c/src/re_util.c
index 973c37f..fe42f40 100644
--- a/c/src/re_util.c
+++ b/c/src/re_util.c
@@ -8,6 +8,7 @@
 #include "regex.h"
 #include "voxgig_struct.h"
 
+#include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -69,6 +70,75 @@ vs_strvec vs_re_find(const char* pattern, const char* input) {
   return out;
 }
 
+void vs_strvec_vec_init(vs_strvec_vec* v) {
+  v->len = 0;
+  v->cap = 0;
+  v->data = NULL;
+}
+
+void vs_strvec_vec_free(vs_strvec_vec* v) {
+  if (!v) return;
+  for (size_t i = 0; i < v->len; i++) {
+    vs_strvec_free(&v->data[i]);
+  }
+  free(v->data);
+  v->data = NULL;
+  v->len = v->cap = 0;
+}
+
+static void vs_strvec_vec_push(vs_strvec_vec* v, vs_strvec row) {
+  if (v->len == v->cap) {
+    size_t nc = v->cap == 0 ? 4 : v->cap * 2;
+    v->data = (vs_strvec*)realloc(v->data, nc * sizeof(vs_strvec));
+    if (!v->data) abort();
+    v->cap = nc;
+  }
+  v->data[v->len++] = row;
+}
+
+vs_strvec_vec vs_re_find_all_re(const vs_regex* re, const char* input) {
+  vs_strvec_vec out;
+  vs_strvec_vec_init(&out);
+  if (!re || !input) return out;
+  size_t ilen = strlen(input);
+  /* Grow the caps buffer until vs_regex_find_all stops filling it. */
+  int max_matches = 64;
+  int per_row = 2 * VS_REGEX_MAX_GROUPS;
+  int* caps = NULL;
+  int count = 0;
+  for (;;) {
+    caps = (int*)realloc(caps, (size_t)(max_matches * per_row) * sizeof(int));
+    if (!caps) abort();
+    count = vs_regex_find_all(re, input, ilen, caps, max_matches);
+    if (count < max_matches) break;
+    max_matches *= 2;
+  }
+  int ngroups = vs_regex_ngroups(re);
+  for (int m = 0; m < count; m++) {
+    int* row_caps = caps + m * per_row;
+    vs_strvec row;
+    vs_strvec_init(&row);
+    for (int g = 0; g < ngroups; g++) {
+      int s = row_caps[2 * g], e = row_caps[2 * g + 1];
+      if (s < 0 || e < s) {
+        vs_strvec_push(&row, "");
+      } else {
+        vs_strvec_push_n(&row, input + s, (size_t)(e - s));
+      }
+    }
+    vs_strvec_vec_push(&out, row);
+  }
+  free(caps);
+  return out;
+}
+
+vs_strvec_vec vs_re_find_all(const char* pattern, const char* input) {
+  vs_regex* re = vs_regex_compile(pattern, NULL);
+  vs_strvec_vec out = vs_re_find_all_re(re, input);
+  vs_regex_free(re);
+  return out;
+}
+
 char* vs_re_replace_re(const vs_regex* re, const char* input, const char* replacement) {
   if (!re)
     return rdup(input);
diff --git a/c/src/regex.c b/c/src/regex.c
index 65febba..da39222 100644
--- a/c/src/regex.c
+++ b/c/src/regex.c
@@ -825,12 +825,17 @@ static bool match_at(const vs_regex* re, const char* input, size_t ilen, int sta
         if (c >= 0 && cc_has(&in->data.cc, c))
           tl_add(&nxt, th->pc + 1, th->slots, nslots, sp + 1, re, input, ilen);
       } else if (in->op == OP_MATCH) {
-        if (!found) {
-          found = true;
-          memcpy(best_slots, th->slots, (size_t)nslots * sizeof(int));
-        }
-        /* Higher-priority threads come first; once we've matched, lower
-           priority threads in this generation can be skipped. */
+        /* Always overwrite: threads are priority-ordered (highest first),
+         * and lower-priority threads after this one don't get processed
+         * (we break below). Across sp, a later MATCH can only arrive from
+         * descendants of HIGHER-priority threads (threads[k+1..]'s
+         * descendants are never added to nxt once we break here). So
+         * overwriting unconditionally implements leftmost-first (priority-
+         * ordered) matching. The earlier `if (!found)` made greedy
+         * quantifiers behave lazily — e.g. `a*` on "abc" matched "" not "a".
+         */
+        found = true;
+        memcpy(best_slots, th->slots, (size_t)nslots * sizeof(int));
         break;
       }
     }
@@ -843,14 +848,18 @@ static bool match_at(const vs_regex* re, const char* input, size_t ilen, int sta
       break;
   }
   /* Handle EOI: drain the remaining current threads (some may have advanced
-     past the last char and now point at MATCH via epsilons). */
+     past the last char and now point at MATCH via epsilons). At this point
+     the threads are still priority-ordered, and the first MATCH (highest
+     priority) is the canonical leftmost-first within this generation —
+     but any earlier-recorded MATCH at a prior sp came from a LOWER-priority
+     thread (it sat at a higher index than the threads whose descendants
+     survived to consume more input), so an EOI MATCH here
+     should always overwrite. */
   for (int i = 0; i < cur.len; i++) {
     thread_t* th = &cur.threads[i];
     if (re->code[th->pc].op == OP_MATCH) {
-      if (!found) {
-        found = true;
-        memcpy(best_slots, th->slots, (size_t)nslots * sizeof(int));
-      }
+      found = true;
+      memcpy(best_slots, th->slots, (size_t)nslots * sizeof(int));
       break;
     }
   }
diff --git a/c/src/voxgig_struct.h b/c/src/voxgig_struct.h
index b96ffd6..303d41d 100644
--- a/c/src/voxgig_struct.h
+++ b/c/src/voxgig_struct.h
@@ -64,6 +64,20 @@ bool vs_re_test_re(const vs_regex* re, const char* input);
 vs_strvec vs_re_find(const char* pattern, const char* input);
 vs_strvec vs_re_find_re(const vs_regex* re, const char* input);
 
+/* List-of-lists of strings — one vs_strvec per match (each row is
+ * [whole, capture1, ...]). Caller must vs_strvec_vec_free() to release. */
+typedef struct vs_strvec_vec {
+  size_t len;
+  size_t cap;
+  vs_strvec* data;
+} vs_strvec_vec;
+
+void vs_strvec_vec_init(vs_strvec_vec* v);
+void vs_strvec_vec_free(vs_strvec_vec* v);
+
+vs_strvec_vec vs_re_find_all(const char* pattern, const char* input);
+vs_strvec_vec vs_re_find_all_re(const vs_regex* re, const char* input);
+
 /* Returns malloc'd string. */
 char* vs_re_replace(const char* pattern, const char* input, const char* replacement);
 char* vs_re_replace_re(const vs_regex* re, const char* input, const char* replacement);
diff --git a/c/tests/regex_pathological.c b/c/tests/regex_pathological.c
index c013882..1eceda1 100644
--- a/c/tests/regex_pathological.c
+++ b/c/tests/regex_pathological.c
@@ -114,9 +114,15 @@ int main(void) {
 
   /* P10 */
   t0 = now_ms();
-  (void)vs_re_test("a*", "bbb");  /* find_all not in public header */
+  vs_strvec_vec p10 = vs_re_find_all("a*", "bbb");
   ms = now_ms() - t0;
-  printf("[regex-discovery] P10_find_all_zero_width | %.2fms | OK | <find_all not exposed>\n", ms);
+  printf("[regex-discovery] P10_find_all_zero_width | %.2fms | OK | [", ms);
+  for (size_t i = 0; i < p10.len; i++) {
+    if (i) printf(",");
+    print_strvec(&p10.data[i]);
+  }
+  printf("]\n");
+  vs_strvec_vec_free(&p10);
 
   free(a22);
   free(p1_in);
diff --git a/go/voxgigstruct.go b/go/voxgigstruct.go
index 7c64397..a049c69 100644
--- a/go/voxgigstruct.go
+++ b/go/voxgigstruct.go
@@ -63,6 +63,7 @@ import (
 	"strconv"
 	"strings"
 	"time"
+	"unicode/utf8"
 )
 
 const Version = "0.1.0"
@@ -993,8 +994,52 @@ func ReFindAll(pattern, input string) [][]string {
 
 // ReReplace replaces every match. The replacement supports Go's $0..$N
 // reference syntax (functionally equivalent to JS $&..$N).
+//
+// Iterates matches manually to follow the ECMAScript zero-width convention
+// (TS is the canonical source): unlike Go's ReplaceAllString, which
+// suppresses an empty match immediately after a non-empty match at the
+// same offset, we emit a replacement for EVERY match and advance by one
+// rune on a zero-width match. This makes `re_replace("a*", "abc", "X")`
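+// yield "XXbXcX" — matching JS/TS/Python/Java/.NET/Ruby/PHP/Rust. NOTE(review): each search runs on input[pos:], so `^` and `\b` treat pos as start-of-text — e.g. `^a` on "aa" replaces both characters, unlike ReplaceAllString/JS ("Xa").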
 func ReReplace(pattern, input, replacement string) string {
-	return regexp.MustCompile(pattern).ReplaceAllString(input, replacement)
+	re := regexp.MustCompile(pattern)
+	var out strings.Builder
+	pos := 0
+	for pos <= len(input) {
+		loc := re.FindStringSubmatchIndex(input[pos:])
+		if loc == nil {
+			out.WriteString(input[pos:])
+			break
+		}
+		// Shift offsets back to absolute-input coordinates so Expand can
+		// look up captures from `input`.
+		abs := make([]int, len(loc))
+		for i, v := range loc {
+			if v < 0 {
+				abs[i] = -1
+			} else {
+				abs[i] = v + pos
+			}
+		}
+		mstart, mend := abs[0], abs[1]
+		out.WriteString(input[pos:mstart])
+		out.Write(re.ExpandString(nil, replacement, input, abs))
+		if mend == mstart {
+			if mstart < len(input) {
+				_, sz := utf8.DecodeRuneInString(input[mstart:])
+				if sz == 0 {
+					sz = 1
+				}
+				out.WriteString(input[mstart : mstart+sz])
+				pos = mstart + sz
+			} else {
+				pos = mstart + 1
+			}
+		} else {
+			pos = mend
+		}
+	}
+	return out.String()
 }
 
 // ReReplaceFunc replaces every match via the callback.
diff --git a/lua/src/regex.lua b/lua/src/regex.lua
index 2718ff7..65e5c18 100644
--- a/lua/src/regex.lua
+++ b/lua/src/regex.lua
@@ -483,7 +483,11 @@ local function match_at(re, input, ilen, start)
       elseif op == OP_CLASS then
         if c >= 0 and insn.data.cc[c] then add_thread(re, nxt, th.pc + 1, th.slots, sp + 1, input, ilen, visited) end
       elseif op == OP_MATCH then
-        if not found then found = th.slots end
+        -- Always overwrite: priority ordering means later MATCHes from
+        -- surviving (higher-priority) descendants in nxt should override
+        -- earlier matches from lower-priority threads. `if not found` made
+        -- greedy quantifiers behave lazily (e.g. `a*` on "abc" matched "").
+        found = th.slots
         break
       end
     end
@@ -491,10 +495,10 @@ local function match_at(re, input, ilen, start)
     sp = sp + 1
     if #cur == 0 then break end
   end
-  -- Drain remaining current threads for trailing MATCH.
+  -- Drain remaining current threads for trailing MATCH (mirrors C engine).
   for i = 1, #cur do
     if re.code[cur[i].pc].op == OP_MATCH then
-      if not found then found = cur[i].slots end
+      found = cur[i].slots
       break
     end
   end
diff --git a/perl/t/regex_pathological.t b/perl/t/regex_pathological.t
index a6cd024..ad819e7 100644
--- a/perl/t/regex_pathological.t
+++ b/perl/t/regex_pathological.t
@@ -11,11 +11,15 @@ use Test::More;
 use FindBin;
 use lib "$FindBin::Bin/../lib";
 use Voxgig::Struct qw();
-use JSON::PP qw(encode_json);
+use JSON::PP qw();
 use Time::HiRes qw(gettimeofday tv_interval);
 
 binmode STDOUT, ':utf8';
 
+# encode_json (the functional interface) returns UTF-8-encoded bytes. We want
+# characters so STDOUT's :utf8 layer can encode them once (not twice).
+my $JSON = JSON::PP->new->utf8(0);
+
 sub record {
     my ($label, $fn) = @_;
     my $t0 = [gettimeofday];
@@ -25,7 +29,7 @@ sub record {
         chomp $err;
         $outcome = "ERR | $err";
     } else {
-        my $enc = eval { encode_json($r) };
+        my $enc = eval { $JSON->encode($r) };
         $enc = (defined $r ? "$r" : 'null') if $@;
         $outcome = "OK | $enc";
     }
diff --git a/php/src/Struct.php b/php/src/Struct.php
index d286578..cafe059 100644
--- a/php/src/Struct.php
+++ b/php/src/Struct.php
@@ -565,20 +565,25 @@ public static function escre(?string $s): string
     public static function re_compile(string $pattern): string
     {
         // PHP wants a delimited pattern; return one delimited with '/'.
-        if (strlen($pattern) > 0 && $pattern[0] === '/') {
-            return $pattern;
+        $delimited = strlen($pattern) > 0 && $pattern[0] === '/'
+            ? $pattern
+            : '/' . str_replace('/', '\\/', $pattern) . '/';
+        // PCRE returns false from preg_match on invalid patterns; surface that
+        // to the caller (matching the throw behaviour of JS/Python/Java/.NET).
+        if (@preg_match($delimited, '') === false) {
+            throw new \InvalidArgumentException("Invalid regex pattern: $pattern");
         }
-        return '/' . str_replace('/', '\\/', $pattern) . '/';
+        return $delimited;
     }
 
     public static function re_test(string $pattern, string $input): bool
     {
-        return @preg_match(self::re_compile($pattern), $input) === 1;
+        return preg_match(self::re_compile($pattern), $input) === 1;
     }
 
     public static function re_find(string $pattern, string $input): ?array
     {
-        if (@preg_match(self::re_compile($pattern), $input, $m) === 1) {
+        if (preg_match(self::re_compile($pattern), $input, $m) === 1) {
             return $m;
         }
         return null;
@@ -587,7 +592,7 @@ public static function re_find(string $pattern, string $input): ?array
     public static function re_find_all(string $pattern, string $input): array
     {
         $out = [];
-        if (@preg_match_all(self::re_compile($pattern), $input, $m, PREG_SET_ORDER) !== false) {
+        if (preg_match_all(self::re_compile($pattern), $input, $m, PREG_SET_ORDER) !== false) {
             $out = $m;
         }
         return $out;
diff --git a/rust/src/re.rs b/rust/src/re.rs
index 4f06b0c..a7669e7 100644
--- a/rust/src/re.rs
+++ b/rust/src/re.rs
@@ -852,62 +852,68 @@ impl ThreadList {
     }
 
     fn add(&mut self, re: &Regex, input: &[u8], pc: usize, slots: &[i32], sp: usize) {
-        if pc >= re.code.len() {
-            return;
-        }
-        if self.visited[pc] == self.gen {
-            return;
-        }
-        self.visited[pc] = self.gen;
-        let insn = &re.code[pc];
-        match insn.op {
-            Op::Jmp(t) => {
-                self.add(re, input, t as usize, slots, sp);
-                return;
-            }
-            Op::Split(x, y) => {
-                self.add(re, input, x as usize, slots, sp);
-                self.add(re, input, y as usize, slots, sp);
-                return;
+        // Iterative epsilon-closure: we walk Jmp/Split/Save/Bol/Eol/Wb/Nwb
+        // until we hit a char-consuming op or Match. A recursive version
+        // overflows the stack on long Thompson chains (e.g. `a{0,10000}`
+        // unrolls into 10000 chained Splits — `cargo test` aborted with
+        // SIGABRT on the pathological-regex panel before this loop landed).
+        //
+        // The stack mirrors the recursive order: Split pushes y first then
+        // x, so x is processed first (priority preserved).
+        let mut stack: Vec<(usize, Vec<i32>)> = vec![(pc, slots.to_vec())];
+        while let Some((cur_pc, cur_slots)) = stack.pop() {
+            if cur_pc >= re.code.len() {
+                continue;
             }
-            Op::Save(slot) => {
-                let mut ns = slots.to_vec();
-                ns[slot] = sp as i32;
-                self.add(re, input, pc + 1, &ns, sp);
-                return;
+            if self.visited[cur_pc] == self.gen {
+                continue;
             }
-            Op::Bol => {
-                if sp == 0 || (sp - 1 < input.len() && input[sp - 1] == b'\n') {
-                    self.add(re, input, pc + 1, slots, sp);
+            self.visited[cur_pc] = self.gen;
+            match re.code[cur_pc].op {
+                Op::Jmp(t) => {
+                    stack.push((t as usize, cur_slots));
                 }
-                return;
-            }
-            Op::Eol => {
-                if sp >= input.len() || input[sp] == b'\n' {
-                    self.add(re, input, pc + 1, slots, sp);
+                Op::Split(x, y) => {
+                    // Push y first so x (higher priority) is popped first.
+                    stack.push((y as usize, cur_slots.clone()));
+                    stack.push((x as usize, cur_slots));
                 }
-                return;
-            }
-            Op::Wb | Op::Nwb => {
-                let left = sp > 0
-                    && sp - 1 < input.len()
-                    && (input[sp - 1].is_ascii_alphanumeric() || input[sp - 1] == b'_');
-                let right =
-                    sp < input.len() && (input[sp].is_ascii_alphanumeric() || input[sp] == b'_');
-                let at_boundary = left != right;
-                let want = matches!(insn.op, Op::Wb);
-                if at_boundary == want {
-                    self.add(re, input, pc + 1, slots, sp);
+                Op::Save(slot) => {
+                    let mut ns = cur_slots;
+                    ns[slot] = sp as i32;
+                    stack.push((cur_pc + 1, ns));
+                }
+                Op::Bol => {
+                    if sp == 0 || (sp - 1 < input.len() && input[sp - 1] == b'\n') {
+                        stack.push((cur_pc + 1, cur_slots));
+                    }
+                }
+                Op::Eol => {
+                    if sp >= input.len() || input[sp] == b'\n' {
+                        stack.push((cur_pc + 1, cur_slots));
+                    }
+                }
+                Op::Wb | Op::Nwb => {
+                    let left = sp > 0
+                        && sp - 1 < input.len()
+                        && (input[sp - 1].is_ascii_alphanumeric() || input[sp - 1] == b'_');
+                    let right = sp < input.len()
+                        && (input[sp].is_ascii_alphanumeric() || input[sp] == b'_');
+                    let at_boundary = left != right;
+                    let want = matches!(re.code[cur_pc].op, Op::Wb);
+                    if at_boundary == want {
+                        stack.push((cur_pc + 1, cur_slots));
+                    }
+                }
+                _ => {
+                    // Char-consuming op (or Match): queue thread.
+                    self.threads.push(Thread {
+                        pc: cur_pc,
+                        slots: cur_slots,
+                    });
                 }
-                return;
             }
-            _ => {}
         }
-        // Char-consuming op: queue thread.
-        self.threads.push(Thread {
-            pc,
-            slots: slots.to_vec(),
-        });
     }
 }
 
diff --git a/zig/src/regex.zig b/zig/src/regex.zig
index 17ad86d..62d118c 100644
--- a/zig/src/regex.zig
+++ b/zig/src/regex.zig
@@ -133,7 +133,7 @@ pub const Regex = struct {
         return false;
     }
 
-    fn findFirst(self: Regex, input: []const u8) ?[]i32 {
+    pub fn findFirst(self: Regex, input: []const u8) ?[]i32 {
         var start: usize = 0;
         while (true) {
             if (self.matchAt(input, start)) |slots| return slots;
@@ -143,6 +143,16 @@ pub const Regex = struct {
         }
     }
 
+    /// Leftmost match whose start is at byte offset `from` or later; null if there is none.
+    pub fn findFrom(self: Regex, input: []const u8, from: usize) ?[]i32 {
+        var start: usize = from;
+        while (true) : (start += 1) {
+            if (self.matchAt(input, start)) |slots| return slots;
+            // `input.len` is the last offset worth probing (empty / end-of-input matches); never start matchAt past it.
+            if (self.anchored_start or start >= input.len) return null;
+        }
+    }
+
     fn matchAt(self: Regex, input: []const u8, start: usize) ?[]i32 {
         const nslots = self.ngroups * 2;
         var cur = ThreadList.init(self.allocator, self.code.len) catch return null;
diff --git a/zig/src/struct.zig b/zig/src/struct.zig
index d4f124c..a1df63c 100644
--- a/zig/src/struct.zig
+++ b/zig/src/struct.zig
@@ -773,6 +773,102 @@ pub fn re_test(pattern: []const u8, input: []const u8) bool {
     return re.isMatch(input);
 }
 
+/// re_find — first match as `[whole, capture1, ...]`; a group with no valid
+/// span is "". Entries alias `input`: never free them, and use them only
+/// while `input` is alive. Returns null on compile error, no match, or
+/// allocation failure. The caller frees only the outer slice, with `allocator`.
+pub fn re_find(allocator: Allocator, pattern: []const u8, input: []const u8) ?[][]const u8 {
+    var re = _re_engine.compile(std.heap.page_allocator, pattern) orelse return null;
+    defer re.deinit();
+    const slots = re.findFirst(input) orelse return null;
+    defer std.heap.page_allocator.free(slots);
+    const ngroups = re.ngroups;
+    const out = allocator.alloc([]const u8, ngroups) catch return null;
+    var g: usize = 0;
+    while (g < ngroups) : (g += 1) {
+        const s = slots[2 * g];
+        const e = slots[2 * g + 1];
+        if (s < 0 or e < s) {
+            out[g] = "";
+        } else {
+            out[g] = input[@as(usize, @intCast(s))..@as(usize, @intCast(e))];
+        }
+    }
+    return out;
+}
+
+/// re_find_all — every non-overlapping match, one `[whole, capture1, ...]` row per match. The caller frees each
+/// row and the outer slice with `allocator`; row entries alias `input` (or are "") and must never be freed.
+pub fn re_find_all(allocator: Allocator, pattern: []const u8, input: []const u8) ?[][][]const u8 {
+    var re = _re_engine.compile(std.heap.page_allocator, pattern) orelse return null;
+    defer re.deinit();
+    var rows = std.ArrayList([][]const u8).init(allocator);
+    defer rows.deinit();
+    var pos: usize = 0;
+    while (pos <= input.len) {
+        const slots = re.findFrom(input, pos) orelse break;
+        defer std.heap.page_allocator.free(slots);
+        const ngroups = re.ngroups;
+        const row = allocator.alloc([]const u8, ngroups) catch return null;
+        var g: usize = 0;
+        while (g < ngroups) : (g += 1) {
+            const s = slots[2 * g];
+            const e = slots[2 * g + 1];
+            if (s < 0 or e < s) {
+                row[g] = "";
+            } else {
+                row[g] = input[@as(usize, @intCast(s))..@as(usize, @intCast(e))];
+            }
+        }
+        rows.append(row) catch return null;
+        const mstart = @as(usize, @intCast(slots[0]));
+        const mend = @as(usize, @intCast(slots[1]));
+        if (mend == mstart) { // zero-width match: step one byte so the scan keeps advancing
+            pos = mend + 1;
+        } else {
+            pos = mend;
+        }
+    }
+    return rows.toOwnedSlice() catch return null;
+}
+
+/// re_replace — replace every match in `input` with `replacement`, taken
+/// literally ($& / $1.. are not expanded in this minimal wrapper). If
+/// `pattern` does not compile, an unmodified copy of `input` is returned.
+/// Zero-width matches follow the ECMAScript convention used by other
+/// ports. The result is allocated with `allocator` and owned by the caller.
+pub fn re_replace(allocator: Allocator, pattern: []const u8, input: []const u8, replacement: []const u8) ![]u8 {
+    var re = _re_engine.compile(std.heap.page_allocator, pattern) orelse {
+        return allocator.dupe(u8, input);
+    };
+    defer re.deinit();
+    var out = std.ArrayList(u8).init(allocator);
+    defer out.deinit();
+    var pos: usize = 0;
+    while (pos <= input.len) {
+        const slots = re.findFrom(input, pos) orelse {
+            try out.appendSlice(input[pos..]);
+            break;
+        };
+        defer std.heap.page_allocator.free(slots);
+        const mstart = @as(usize, @intCast(slots[0]));
+        const mend = @as(usize, @intCast(slots[1]));
+        try out.appendSlice(input[pos..mstart]);
+        try out.appendSlice(replacement);
+        if (mend == mstart) {
+            // Zero-width match: copy the next input byte (if any) and step
+            // past it so the scan always advances. This is one *byte*, not a
+            // code point, so a zero-width match inside a multi-byte UTF-8
+            // sequence puts the replacement between that sequence's bytes.
+            if (mstart < input.len) try out.append(input[mstart]);
+            pos = mstart + 1;
+        } else {
+            pos = mend;
+        }
+    }
+    return out.toOwnedSlice();
+}
+
 pub fn re_escape(allocator: Allocator, s: []const u8) ![]const u8 {
     return escre(allocator, s);
 }
diff --git a/zig/test/regex_pathological.zig b/zig/test/regex_pathological.zig
index ff089db..5ae19cd 100644
--- a/zig/test/regex_pathological.zig
+++ b/zig/test/regex_pathological.zig
@@ -1,32 +1,19 @@
 // Discovery test: pathological regex inputs run against the port's re_* API.
 // Goal is to surface failures across ports, not to assert behaviour.
 // Panel is the same in every port (see REGEX.md).
-//
-// Zig's public regex surface currently exposes only re_compile/re_test/re_escape
-// (see src/struct.zig). The find/replace/find_all cases below mark themselves
-// as N/A — that absence is itself part of the discovery.
 
 const std = @import("std");
 const voxgig_struct = @import("voxgig-struct");
 
-fn record_test(label: []const u8, ok: bool, ms: f64, value: anytype) void {
-    const T = @TypeOf(value);
-    const writer = std.io.getStdOut().writer();
-    if (ok) {
-        if (T == bool) {
-            writer.print("[regex-discovery] {s} | {d:.2}ms | OK | {}\n", .{ label, ms, value }) catch {};
-        } else {
-            writer.print("[regex-discovery] {s} | {d:.2}ms | OK | {any}\n", .{ label, ms, value }) catch {};
-        }
-    } else {
-        writer.print("[regex-discovery] {s} | {d:.2}ms | ERR | compile or run failed\n", .{ label, ms }) catch {};
-    }
+fn ms_since(t0: i128) f64 { // milliseconds elapsed since `t0`, an earlier std.time.nanoTimestamp() reading
+    return @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6; // ns -> ms
 }
 
 test "regex pathological discovery" {
-    var buf: [4096]u8 = undefined;
-    var fba = std.heap.FixedBufferAllocator.init(&buf);
-    const alloc = fba.allocator();
+    const writer = std.io.getStdOut().writer();
+    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
+    defer arena.deinit();
+    const alloc = arena.allocator();
 
     const a22 = try alloc.alloc(u8, 22);
     @memset(a22, 'a');
@@ -44,51 +31,53 @@ test "regex pathological discovery" {
     }
     const nest40 = nest_buf[0..pos];
 
-    // P1
     var t0 = std.time.nanoTimestamp();
     const b1 = voxgig_struct.re_test("^(a+)+$", p1_in);
-    var ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
-    record_test("P1_redos_nested_plus", true, ms, b1);
+    try writer.print("[regex-discovery] P1_redos_nested_plus | {d:.2}ms | OK | {}\n", .{ ms_since(t0), b1 });
 
-    // P2
     t0 = std.time.nanoTimestamp();
     const b2 = voxgig_struct.re_test("^(a|aa)+$", p1_in);
-    ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
-    record_test("P2_redos_alt_overlap", true, ms, b2);
+    try writer.print("[regex-discovery] P2_redos_alt_overlap | {d:.2}ms | OK | {}\n", .{ ms_since(t0), b2 });
 
-    // P3, P4, P5, P10 — replace/find/find_all not in zig public surface.
-    const writer = std.io.getStdOut().writer();
-    try writer.print("[regex-discovery] P3_empty_repeat_replace | -.--ms | N/A | re_replace not exposed\n", .{});
-    try writer.print("[regex-discovery] P4_unicode_replace_dot | -.--ms | N/A | re_replace not exposed\n", .{});
-    try writer.print("[regex-discovery] P5_unicode_find_codepoint | -.--ms | N/A | re_find not exposed\n", .{});
+    t0 = std.time.nanoTimestamp();
+    const p3 = try voxgig_struct.re_replace(alloc, "a*", "abc", "X");
+    try writer.print("[regex-discovery] P3_empty_repeat_replace | {d:.2}ms | OK | \"{s}\"\n", .{ ms_since(t0), p3 });
+
+    t0 = std.time.nanoTimestamp();
+    const p4 = try voxgig_struct.re_replace(alloc, "\\.", "café.au.lait", "/");
+    try writer.print("[regex-discovery] P4_unicode_replace_dot | {d:.2}ms | OK | \"{s}\"\n", .{ ms_since(t0), p4 });
+
+    t0 = std.time.nanoTimestamp();
+    if (voxgig_struct.re_find(alloc, "é", "café au lait")) |p5| {
+        try writer.print("[regex-discovery] P5_unicode_find_codepoint | {d:.2}ms | OK | [\"{s}\"]\n", .{ ms_since(t0), p5[0] });
+    } else {
+        try writer.print("[regex-discovery] P5_unicode_find_codepoint | {d:.2}ms | OK | null\n", .{ms_since(t0)});
+    }
 
-    // P6
     t0 = std.time.nanoTimestamp();
     const b6 = voxgig_struct.re_test(nest40, "a");
-    ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
-    record_test("P6_deep_nesting_compile", true, ms, b6);
+    try writer.print("[regex-discovery] P6_deep_nesting_compile | {d:.2}ms | OK | {}\n", .{ ms_since(t0), b6 });
 
-    // P7
     t0 = std.time.nanoTimestamp();
     const b7 = voxgig_struct.re_test("^a{0,10000}b$", "aaaaaaaaaab");
-    ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
-    record_test("P7_big_bounded_quantifier", true, ms, b7);
+    try writer.print("[regex-discovery] P7_big_bounded_quantifier | {d:.2}ms | OK | {}\n", .{ ms_since(t0), b7 });
 
-    // P8
     t0 = std.time.nanoTimestamp();
     const p8 = voxgig_struct.re_compile("[abc");
-    ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
     if (p8 == null) {
-        try writer.print("[regex-discovery] P8_invalid_pattern | {d:.2}ms | ERR | compile returned null\n", .{ms});
+        try writer.print("[regex-discovery] P8_invalid_pattern | {d:.2}ms | ERR | compile returned null\n", .{ms_since(t0)});
     } else {
-        try writer.print("[regex-discovery] P8_invalid_pattern | {d:.2}ms | OK | \"compiled\"\n", .{ms});
+        try writer.print("[regex-discovery] P8_invalid_pattern | {d:.2}ms | OK | \"compiled\"\n", .{ms_since(t0)});
     }
 
-    // P9
     t0 = std.time.nanoTimestamp();
     const b9 = voxgig_struct.re_test("^(a+)\\1$", "aaaa");
-    ms = @as(f64, @floatFromInt(std.time.nanoTimestamp() - t0)) / 1e6;
-    record_test("P9_backref_re2_forbidden", true, ms, b9);
+    try writer.print("[regex-discovery] P9_backref_re2_forbidden | {d:.2}ms | OK | {}\n", .{ ms_since(t0), b9 });
 
-    try writer.print("[regex-discovery] P10_find_all_zero_width | -.--ms | N/A | re_find_all not exposed\n", .{});
+    t0 = std.time.nanoTimestamp();
+    if (voxgig_struct.re_find_all(alloc, "a*", "bbb")) |p10| {
+        try writer.print("[regex-discovery] P10_find_all_zero_width | {d:.2}ms | OK | <{} matches>\n", .{ ms_since(t0), p10.len });
+    } else {
+        try writer.print("[regex-discovery] P10_find_all_zero_width | {d:.2}ms | OK | null\n", .{ms_since(t0)});
+    }
 }

From f00232377f875a4e6afea39d163c1592a843f212 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 16 May 2026 11:13:25 +0000
Subject: [PATCH 3/6] README: add Regex section to every language port

Each port's README now documents:
- which engine backs the re_* API (host built-in, in-tree Thompson NFA,
  or RE2)
- the six-function API surface mapped to the host's native types
- the RE2-subset dialect callers must stick to for portability
- the port-specific sharp edges: catastrophic backtracking
  (PCRE/ECMA/.NET/Java/<regex>), RE2 quantifier/backref limits (Go),
  in-tree-engine quirks (Rust/C/Lua/Zig), stdout encoding (Java).

Each section links to REGEX_PATHOLOGICAL.md for the cross-port
discovery panel.
---
 c/README.md          | 55 ++++++++++++++++++++++++++++++++++++++++++++
 cpp/README.md        | 37 +++++++++++++++++++++++++++++
 csharp/README.md     | 36 +++++++++++++++++++++++++++++
 go/README.md         | 47 +++++++++++++++++++++++++++++++++++++
 java/README.md       | 40 ++++++++++++++++++++++++++++++++
 javascript/README.md | 36 +++++++++++++++++++++++++++++
 lua/README.md        | 46 ++++++++++++++++++++++++++++++++++++
 perl/README.md       | 38 ++++++++++++++++++++++++++++++
 php/README.md        | 40 ++++++++++++++++++++++++++++++++
 python/README.md     | 33 ++++++++++++++++++++++++++
 ruby/README.md       | 35 ++++++++++++++++++++++++++++
 rust/README.md       | 50 ++++++++++++++++++++++++++++++++++++++++
 swift/README.md      | 36 +++++++++++++++++++++++++++++
 typescript/README.md | 44 +++++++++++++++++++++++++++++++++++
 zig/README.md        | 51 ++++++++++++++++++++++++++++++++++++++++
 15 files changed, 624 insertions(+)

diff --git a/c/README.md b/c/README.md
index 1e3f2b9..c95e23c 100644
--- a/c/README.md
+++ b/c/README.md
@@ -225,6 +225,61 @@ operator uses substring containment instead of full regex matching
 kept out of scope to minimise dependencies).
 
 
+## Regex
+
+Uniform regex API (see `/REGEX_API.md`). The C port **ships its own
+RE2-subset Thompson NFA engine** in `src/regex.c` (~700 LOC) — no
+external dependency. The wrapper layer (`src/re_util.c`) exposes the
+shared `re_*` names alongside the lower-level `vs_regex_*` engine
+API.
+
+### API
+
+| Function | Returns |
+|---|---|
+| `vs_re_compile(pattern)`                       | `vs_regex*` (NULL on bad pattern) |
+| `vs_re_test(pattern, input)`                   | `bool` |
+| `vs_re_find(pattern, input)`                   | `vs_strvec` of `[whole, group1, …]` |
+| `vs_re_find_all(pattern, input)`               | `vs_strvec_vec` (one row per match) |
+| `vs_re_replace(pattern, input, replacement)`   | malloc'd `char*` |
+| `vs_re_replace_cb(re, input, cb, ud)`          | malloc'd `char*` (callback variant) |
+| `vs_re_escape(literal)`                        | malloc'd `char*` |
+
+The `_re` suffixed variants take an already-compiled `vs_regex*`.
+
+### Dialect
+
+The in-tree engine implements the RE2 subset documented in `/REGEX.md`:
+literals + escapes, `.`, `^`/`$`, `* + ? {n} {n,} {n,m}` (greedy + lazy),
+classes incl. `\d \w \s` and friends, `\b`/`\B`, `(...)` / `(?:...)`,
+alternation.
+
+**Not supported** (by design — RE2 doesn't either): backreferences,
+lookaround, possessive quantifiers, atomic groups. Backref patterns
+still compile, because the parser reads `\1` as a literal `1`; they
+just never act as backreferences, so `vs_re_test("^(a+)\\1$", "aaaa")`
+returns `false` rather than erroring. Don't rely on this — write
+portable patterns.
+
+### Sharp edges (C-specific)
+
+- **No catastrophic backtracking.** Thompson-NFA construction keeps
+  matching linear in the input length, so P1/P2 from the discovery
+  panel finish in microseconds instead of blowing up exponentially.
+- **Captures cap.** `VS_REGEX_MAX_GROUPS = 16` in `regex.h`. Patterns
+  with more capturing groups silently truncate.
+- **Memory management.** `vs_regex*`, `vs_strvec`, `vs_strvec_vec`,
+  and the `char*` results of `vs_re_replace`, `vs_re_replace_cb`, and
+  `vs_re_escape` are all caller-owned. Use `vs_regex_free`,
+  `vs_strvec_free`, `vs_strvec_vec_free`, and `free` respectively.
+- **Zero-width `re_replace`.** `vs_re_replace("a*", "abc", "X")`
+  returns `"XXbXcX"`, the canonical ECMA convention. (Pre-fix the
+  engine produced `"XaXbXcX"` because greedy quantifiers behaved
+  lazily; the `OP_MATCH` handler in `regex.c` is now priority-correct.)
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Build and test
 
 ```bash
diff --git a/cpp/README.md b/cpp/README.md
index 22fb3c1..b6bc65b 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -233,6 +233,43 @@ Catch2 framework with limited test coverage.  See the
 [overview](./overview/) directory for current API examples.
 
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The C++ port
+wraps `<regex>` (C++11), which defaults to the ECMAScript dialect.
+
+### API
+
+| Function | Maps to |
+|---|---|
+| `re_compile(pattern)`             | `std::regex(pattern)` (throws `std::regex_error` on bad pattern) |
+| `re_test(pattern, input)`         | `std::regex_search` → bool |
+| `re_find(pattern, input)`         | first match groups as `std::vector<std::string>` (empty if no match) |
+| `re_find_all(pattern, input)`     | `std::vector<std::vector<std::string>>` |
+| `re_replace(pattern, input, rep)` | `std::regex_replace(input, re, rep)` |
+| `re_escape(s)`                    | escape regex metacharacters |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`.
+`std::regex` defaults to ECMAScript syntax and supports backreferences
+and lookaround; using them will not be portable.
+
+### Sharp edges (C++-specific)
+
+- **libstdc++ `<regex>` has the worst-in-class catastrophic
+  backtracking.** The discovery panel measures **~1.2 s** for
+  `^(a+)+$` over 22 a's plus `!`. This is well-known and is the
+  reason many production C++ projects avoid `<regex>` in favour of
+  RE2 or PCRE2. Stay inside the RE2 subset and avoid nested
+  quantifiers; even then, performance won't match the dedicated
+  engines.
+- **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
+  `"XXbXcX"`, the canonical ECMA convention.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Build and test
 
 ```bash
diff --git a/csharp/README.md b/csharp/README.md
index a22765b..d1046b7 100644
--- a/csharp/README.md
+++ b/csharp/README.md
@@ -260,6 +260,42 @@ In progress.  Coverage of canonical functions is broad; check
 [`../REPORT.md`](../REPORT.md) for the latest status.
 
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The C# port
+wraps `System.Text.RegularExpressions.Regex`.
+
+### API
+
+| Function | Maps to |
+|---|---|
+| `ReCompile(pattern)`             | `new Regex(pattern)` (throws `RegexParseException` on bad pattern) |
+| `ReTest(pattern, input)`         | `Regex.IsMatch(input, pattern)` |
+| `ReFind(pattern, input)`         | first match as `string[]` of `[whole, group1, …]` or `null` |
+| `ReFindAll(pattern, input)`      | `List<string[]>` |
+| `ReReplace(pattern, input, rep)` | `Regex.Replace(input, pattern, rep)` |
+| `ReEscape(s)`                    | `Regex.Escape(s)` |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`.
+.NET regex supports backreferences and lookaround; using them will not
+be portable.
+
+### Sharp edges
+
+- **Catastrophic backtracking.** .NET's regex is backtracking; the
+  discovery panel sees P1 (`^(a+)+$` over 22 a's plus `!`) in
+  ~390 ms here. .NET 7+ ships a non-backtracking engine you can opt
+  into via `RegexOptions.NonBacktracking` — consider it for
+  untrusted patterns. Stay inside the RE2 subset and prefer flat
+  patterns.
+- **Zero-width `replace`.** `ReReplace("a*", "abc", "X")` returns
+  `"XXbXcX"`, the canonical ECMA convention.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Build and test
 
 ```bash
diff --git a/go/README.md b/go/README.md
index bf6cea2..8a93f13 100644
--- a/go/README.md
+++ b/go/README.md
@@ -401,6 +401,53 @@ canonical "lists are reference-stable" assumption.
 92/92 tests pass against the shared corpus.
 
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The Go port
+wraps the stdlib `regexp` package, which accepts the RE2 syntax and,
+like RE2, guarantees matching time linear in the input size.
+
+### API
+
+| Function | Maps to |
+|---|---|
+| `ReCompile(pattern)`              | `regexp.MustCompile(pattern)` (panics on bad pattern) |
+| `ReTest(pattern, input)`          | `re.MatchString(input)` |
+| `ReFind(pattern, input)`          | `re.FindStringSubmatch(input)` |
+| `ReFindAll(pattern, input)`       | `re.FindAllStringSubmatch(input, -1)` |
+| `ReReplace(pattern, input, rep)`  | manual match-and-emit loop (see "Sharp edges") |
+| `ReReplaceFunc(pattern, input,f)` | `re.ReplaceAllStringFunc(input, f)` |
+| `ReEscape(s)`                     | alias for `EscRe(s)` |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`.
+Since Go's regexp engine *is* RE2, this is the natural ceiling: there is
+no PCRE escape hatch.
+
+### Sharp edges (Go-specific)
+
+- **`ReCompile` panics.** It's a pass-through to `regexp.MustCompile`,
+  so an invalid pattern aborts via `panic`. This is Go's analogue of
+  the throw/raise ports (C and Zig return NULL/null instead); use a
+  deferred `recover()` if you accept user-supplied patterns.
+- **Bounded quantifier cap.** RE2 refuses `{n,m}` with `m > 1000`.
+  `^a{0,10000}b$` *panics* at compile time with "invalid repeat
+  count". This is a hard RE2 limit — no portable workaround. The
+  canonical patterns and `$LIKE` operator stay well below it.
+- **No backreferences or lookaround.** RE2 does not support them by
+  design. `^(a+)\1$` panics on compile. The cross-port dialect already
+  forbids them; this is the engine that enforces the rule hardest.
+- **Zero-width `ReReplace` aligned to ECMA convention.**
+  `ReReplace("a*", "abc", "X")` returns `"XXbXcX"`. Go's stdlib
+  `ReplaceAllString` would return `"XbXcX"` (it suppresses an empty
+  match that starts exactly where the previous non-empty match ended).
+  `ReReplace` uses a manual emit loop instead, so the result matches
+  the canonical TypeScript port and all other ports.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Build and test
 
 ```bash
diff --git a/java/README.md b/java/README.md
index 71714ec..7207862 100644
--- a/java/README.md
+++ b/java/README.md
@@ -247,6 +247,46 @@ No standard test runner configured yet.  `StructTest.java` exists
 but is minimal.
 
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The Java port
+wraps `java.util.regex.Pattern`.
+
+### API
+
+| Function | Maps to |
+|---|---|
+| `reCompile(pattern)`              | `Pattern.compile(pattern)` (throws `PatternSyntaxException` on bad pattern) |
+| `reTest(pattern, input)`          | `Pattern.compile(pattern).matcher(input).find()` |
+| `reFind(pattern, input)`          | first match as `String[]` of `[whole, group1, …]` or `null` |
+| `reFindAll(pattern, input)`       | `List<String[]>` |
+| `reReplace(pattern, input, repl)` | `matcher.replaceAll(repl)` |
+| `reEscape(s)`                     | escape regex metacharacters |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`.
+Java's regex supports backreferences and lookaround; using them will
+not be portable.
+
+### Sharp edges
+
+- **Catastrophic backtracking.** `java.util.regex` is backtracking;
+  the discovery panel sees P1 (`^(a+)+$` over 22 a's plus `!`) in
+  ~13 ms here. Other shapes can be worse. Prefer flat patterns.
+- **Zero-width `replace`.** `reReplace("a*", "abc", "X")` returns
+  `"XXbXcX"`, the canonical ECMA convention.
+- **`System.out` encoding.** When printing match results that contain
+  non-ASCII characters, `System.out`'s default `PrintStream` uses the
+  platform's default charset, not UTF-8. The discovery panel sees
+  `caf?` in stdout though the in-memory `String` is correct UTF-16.
+  Pass `-Dfile.encoding=UTF-8` (or use `PrintStream(System.out, true,
+  StandardCharsets.UTF_8)`) when this matters. Orthogonal to the
+  regex itself.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Build and test
 
 ```bash
diff --git a/javascript/README.md b/javascript/README.md
index 36a41b9..e070551 100644
--- a/javascript/README.md
+++ b/javascript/README.md
@@ -367,6 +367,42 @@ Otherwise functionally identical -- both run on V8.
 84/84 tests pass against the shared corpus.
 
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). On JavaScript
+this is the ECMAScript `RegExp` built-in.
+
+### API
+
+| Function | Maps to |
+|---|---|
+| `re_compile(pattern, flags?)`     | `new RegExp(pattern, flags ?? 'g')` |
+| `re_test(pattern, input)`         | `pattern.test(input)` |
+| `re_find(pattern, input)`         | `input.match(pattern)` (non-global pattern) |
+| `re_find_all(pattern, input)`     | `[...input.matchAll(pattern)]` |
+| `re_replace(pattern, input, rep)` | `input.replace(pattern, rep)` (global pattern) |
+| `re_escape(s)`                    | escape `[.*+?^${}()\|[\]\\]` in `s` |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`.
+`RegExp` itself supports backreferences and lookaround, but other ports
+do not, so using those will not be portable.
+
+### Sharp edges
+
+- **Catastrophic backtracking.** `RegExp` is a backtracking engine;
+  nested quantifiers like `(a+)+` against a non-matching suffix can be
+  exponential in input length (the discovery panel sees ~180 ms on
+  Node 22 vs <0.1 ms on RE2-style engines). Prefer flat patterns and
+  character classes over alternations.
+- **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
+  `"XXbXcX"`, the canonical ECMA convention.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input
+panel.
+
+
 ## Build and test
 
 ```bash
diff --git a/lua/README.md b/lua/README.md
index 324eca9..08a482e 100644
--- a/lua/README.md
+++ b/lua/README.md
@@ -370,6 +370,52 @@ reference-stable" assumption holds without a wrapper.
 75/75 tests pass against the shared corpus.
 
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The Lua port
+**ships its own RE2-subset engine** in `src/regex.lua` (~500 LOC of
+pure Lua — Lua's built-in pattern language is intentionally not
+regex, so we vendor one). No LuaRocks dependency, no FFI.
+
+### API
+
+| Function | Returns |
+|---|---|
+| `re.re_compile(pattern)`              | compiled regex object |
+| `re.re_test(pattern, input)`          | `true` / `false` |
+| `re.re_find(pattern, input)`          | `{whole, group1, …}` or `nil` |
+| `re.re_find_all(pattern, input)`      | `{ {whole, group1, …}, … }` |
+| `re.re_replace(pattern, input, repl)` | `string` |
+| `re.re_escape(literal)`               | `string` |
+
+### Dialect
+
+The in-tree engine implements the RE2 subset documented in
+`/REGEX.md`: literals + escapes, `.`, `^`/`$`, `* + ? {n} {n,} {n,m}`
+(greedy + lazy), classes incl. `\d \w \s` and friends, `\b`/`\B`,
+`(...)` / `(?:...)`, alternation.
+
+**Not supported** (by design — RE2 doesn't either): backreferences,
+lookaround, possessive quantifiers, atomic groups. Backref patterns
+still compile, because the parser reads `\1` as a literal `1`; they
+just never act as backreferences, so `re.re_test("^(a+)\\1$", "aaaa")`
+returns `false`. Don't rely on this — write portable patterns.
+
+### Sharp edges (Lua-specific)
+
+- **The engine runs on the Lua VM.** P7 (`^a{0,10000}b$`) takes ~80 ms
+  here — fine functionally, slow versus native engines. The library's
+  hot paths don't use bounded quantifiers anywhere near that size.
+- **No catastrophic backtracking.** Thompson-NFA construction; P1/P2
+  finish in microseconds.
+- **Zero-width `re_replace`.** `re.re_replace("a*", "abc", "X")`
+  returns `"XXbXcX"`, the canonical ECMA convention. (Pre-fix the
+  engine produced `"XaXbXcX"`; the `OP_MATCH` handler in `regex.lua`
+  is now priority-correct, matching the C port's fix.)
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Build and test
 
 ```bash
diff --git a/perl/README.md b/perl/README.md
index 17600a2..87732fa 100644
--- a/perl/README.md
+++ b/perl/README.md
@@ -104,6 +104,44 @@ because they don't preserve insertion order.
 - Builder helpers: `jm` (insertion-ordered map literal), `jt`
   (list literal).
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The Perl port
+wraps Perl's built-in regex engine.
+
+### API
+
+| Function | Maps to |
+|---|---|
+| `re_compile(pattern, flags?)`         | `qr/$pattern/` |
+| `re_test(pattern, input)`             | `$input =~ $re` |
+| `re_find(pattern, input)`             | first match as `[whole, $1, ...]` or `undef` |
+| `re_find_all(pattern, input)`         | all matches, one arrayref per match |
+| `re_replace(pattern, input, repl)`    | `s/$re/$repl/g` (callable or template) |
+| `re_escape(s)`                        | `quotemeta` equivalent |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`.
+Perl's regex supports backreferences, lookaround, recursion — none of
+which are portable to the Go / Rust / C / Lua / Zig ports.
+
+### Sharp edges
+
+- **Catastrophic backtracking.** Perl's regex engine is backtracking
+  but ships with optimisations (trie engine for alternation, etc.).
+  The discovery panel runs P1/P2 in microseconds here, but other
+  pathological shapes can still blow up. Stay flat.
+- **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
+  `"XXbXcX"`, the canonical ECMA convention.
+- **UTF-8 handling.** Pass character strings (use `use utf8;` for
+  literals, or `decode_utf8` for bytes). Encoding round-trip bugs in
+  caller code can manifest as `cafÃ©` style mojibake at print time —
+  the regex itself preserves character semantics.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Tests
 
 ```bash
diff --git a/php/README.md b/php/README.md
index f709bfc..b7d8b5d 100644
--- a/php/README.md
+++ b/php/README.md
@@ -362,6 +362,46 @@ PHP method names match canonical lowercase: `getpath`, `setpath`,
 82/82 tests pass, 920 assertions.
 
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The PHP port
+wraps PCRE (`preg_*`).
+
+### API
+
+| Function | Maps to |
+|---|---|
+| `re_compile(pattern)`              | delimited PCRE pattern (validated via `preg_match`) |
+| `re_test(pattern, input)`          | `preg_match` → bool |
+| `re_find(pattern, input)`          | `preg_match` with captures, returns `[whole, group1, ...]` or `null` |
+| `re_find_all(pattern, input)`      | `preg_match_all(..., PREG_SET_ORDER)` |
+| `re_replace(pattern, input, repl)` | `preg_replace` (or `preg_replace_callback` for callable repl) |
+| `re_escape(s)`                     | `preg_quote(s)` equivalent |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`.
+PCRE supports backreferences and lookaround; using them will not be
+portable.
+
+### Sharp edges
+
+- **`re_compile` validates eagerly.** Invalid patterns throw
+  `InvalidArgumentException` at compile time. This is a recent fix:
+  the wrapper used to swallow PCRE warnings via `@preg_match` and
+  return `false` silently from `re_test`/`re_find`. Callers can now
+  distinguish "no match" from "bad pattern".
+- **Catastrophic backtracking.** PCRE is a backtracking engine but has
+  a JIT and a backtrack limit; the discovery panel runs P1/P2 in a few
+  ms here. Larger inputs or pathological shapes can hit
+  `pcre.backtrack_limit` and return `false`. Stay inside the RE2 subset
+  and prefer flat patterns.
+- **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
+  `"XXbXcX"`, the canonical ECMA convention.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Build and test
 
 ```bash
diff --git a/python/README.md b/python/README.md
index 1797fcc..8c3191a 100644
--- a/python/README.md
+++ b/python/README.md
@@ -361,6 +361,39 @@ parity with other ports beats style here.
 84/84 tests pass against the shared corpus.
 
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The Python port
+wraps the stdlib `re` module.
+
+### API
+
+| Function | Maps to |
+|---|---|
+| `re_compile(pattern, flags=0)`         | `re.compile(pattern, flags)` |
+| `re_test(pattern, input)`              | `bool(re.search(pattern, input))` |
+| `re_find(pattern, input)`              | first match as `[whole, group1, ...]` or `None` |
+| `re_find_all(pattern, input)`          | all matches, one row per match |
+| `re_replace(pattern, input, repl)`     | `re.sub(pattern, repl, input)` |
+| `re_escape(s)`                         | `re.escape(s)` |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`.
+Python's `re` supports backreferences and lookaround; using them will
+not be portable to the Go / Rust / C / Lua / Zig ports.
+
+### Sharp edges
+
+- **Catastrophic backtracking.** Python's `re` (the default C engine)
+  is backtracking. `^(a+)+$` against 22 a's plus `!` runs ~190 ms here;
+  RE2-style ports finish the same case in <0.1 ms. Use flat patterns.
+- **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
+  `"XXbXcX"`, the canonical ECMA convention.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Build and test
 
 ```bash
diff --git a/ruby/README.md b/ruby/README.md
index 718cff1..3e7a56d 100644
--- a/ruby/README.md
+++ b/ruby/README.md
@@ -345,6 +345,41 @@ and a `maxdepth` parameter, matching the canonical algorithm.
 75/75 tests pass, 150 assertions.
 
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The Ruby port
+wraps the built-in `Regexp` (Onigmo engine).
+
+### API
+
+| Function | Maps to |
+|---|---|
+| `re_compile(pattern)`              | `Regexp.new(pattern)` |
+| `re_test(pattern, input)`          | `input =~ re` |
+| `re_find(pattern, input)`          | `input.match(re)` → `[whole, group1, ...]` |
+| `re_find_all(pattern, input)`      | `input.scan(re)` (one row per match) |
+| `re_replace(pattern, input, repl)` | `input.gsub(re, repl)` |
+| `re_escape(s)`                     | `Regexp.escape(s)` |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`.
+Onigmo supports backreferences and lookaround; using them will not be
+portable to the Go / Rust / C / Lua / Zig ports.
+
+### Sharp edges
+
+- **Catastrophic backtracking.** Onigmo has internal mitigations for
+  some classic ReDoS shapes — `^(a+)+$` against 22 a's plus `!` runs
+  in microseconds here. Larger inputs or different shapes can still
+  blow up; the safe rule is to stay inside the RE2 subset and avoid
+  nested quantifiers.
+- **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
+  `"XXbXcX"`, the canonical ECMA convention.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Build and test
 
 ```bash
diff --git a/rust/README.md b/rust/README.md
index b837949..5968c86 100644
--- a/rust/README.md
+++ b/rust/README.md
@@ -126,3 +126,53 @@ Rust has no optional/overloaded parameters, so:
 
 See [`REPORT.md`](../REPORT.md#rust-rust) for the rust-port adaptations
 write-up, and [`../NOTES.md`](../NOTES.md) for cross-port quirks.
+
+
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The Rust port
+**ships its own RE2-subset engine** in `src/re.rs` — no `regex` crate
+dependency, no third-party crates at all (`Cargo.toml` lists none for
+runtime).
+
+### API
+
+| Function | Returns |
+|---|---|
+| `re_compile(pattern)`           | `Result<Regex, RegexError>` |
+| `re_test(pattern, input)`       | `bool` |
+| `re_find(pattern, input)`       | `Option<Vec<String>>` — `[whole, group1, …]` |
+| `re_find_all(pattern, input)`   | `Vec<Vec<String>>` |
+| `re_replace(pattern, input, r)` | `String` |
+| `re_escape(s)`                  | `String` |
+
+### Dialect
+
+The in-tree engine implements the RE2 subset documented in
+`/REGEX.md`: literals + escapes, `.`, `^`/`$`, `* + ? {n} {n,} {n,m}`
+(greedy + lazy), classes incl. `\d \w \s` and friends, `\b`/`\B`,
+`(...)` / `(?:...)`, alternation.
+
+**Not supported** (by design — RE2 doesn't either):
+backreferences, lookaround, possessive quantifiers, atomic groups.
+Backref patterns like `^(a+)\1$` *compile* (the parser doesn't reject
+`\1`) but never match the back-reference semantically, so `re_test`
+returns `false` rather than erroring. Don't rely on this — write
+portable patterns.
+
+### Sharp edges (Rust-specific)
+
+- **Bounded quantifiers are unrolled.** `a{0,10000}` compiles into
+  10 000 Split+atom-clone pairs. The matcher was previously recursive
+  during epsilon-closure and stack-overflowed on such patterns; it is
+  now iterative (`Threads::add` uses an explicit work stack).
+  `re_test("^a{0,10000}b$", …)` now runs in ~10 ms here.
+- **No catastrophic backtracking.** Thompson-NFA construction means
+  P1/P2 from the discovery panel run in microseconds.
+- **Zero-width `re_replace`.** `re_replace("a*", "abc", "X")` returns
+  `"XXbXcX"`, the canonical ECMA convention.
+- **Single-threaded.** `Value` uses `Rc<RefCell<…>>` so it is
+  `!Send + !Sync`. The regex statics use `std::sync::LazyLock` and
+  are thread-safe in isolation, but the public API isn't.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
diff --git a/swift/README.md b/swift/README.md
index 9243e85..4e59a11 100644
--- a/swift/README.md
+++ b/swift/README.md
@@ -145,6 +145,42 @@ order. `JSON.stringify(value, indent: 2)` serialises back.
   `__NULL__` round-trip for the `inject.string` and `select.*`
   sets exactly as the canonical TS runner does.
 
+## Regex
+
+Uniform six-function regex API (see `/REGEX_API.md`). The Swift port
+wraps `NSRegularExpression`.
+
+### API
+
+| Function | Returns |
+|---|---|
+| `re_compile(pattern, flags?)`         | `NSRegularExpression?` (nil on bad pattern) |
+| `re_test(pattern, input)`             | `Bool` |
+| `re_find(pattern, input)`             | `Value.list([whole, group1, …])` or `.noval` |
+| `re_find_all(pattern, input)`         | `Value.list([...])` |
+| `re_replace(pattern, input, repl)`    | `String` |
+| `re_escape(v)`                        | `String` |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`.
+`NSRegularExpression` (ICU-based) supports backreferences and lookaround;
+using them will not be portable.
+
+### Sharp edges
+
+- **Catastrophic backtracking.** ICU regex is backtracking. Stay
+  inside the RE2 subset and prefer flat patterns.
+- **Compile failures are nil, not throws.** `re_compile` returns
+  `nil` on bad pattern (the underlying `try?` swallows the error).
+  Callers should check the optional rather than rely on an exception.
+- **`Value` shape for `re_find` / `re_find_all`.** The Swift port
+  threads results through the in-tree `Value` enum (matching the
+  rest of the API surface), not raw arrays.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Tests
 
 ```bash
diff --git a/typescript/README.md b/typescript/README.md
index af759eb..c1bbb2a 100644
--- a/typescript/README.md
+++ b/typescript/README.md
@@ -563,6 +563,50 @@ calls (one shared array per depth).  Clone it (`path.slice()`) if
 you need to retain it past the callback.
 
 
+## Regex
+
+The library exposes a uniform six-function regex API across every
+port (see `/REGEX_API.md` for the contract and `/REGEX.md` for the
+supported dialect). On TypeScript the canonical implementation is
+ECMAScript `RegExp`.
+
+### API
+
+| Function | Maps to |
+|---|---|
+| `re_compile(pattern, flags?)`     | `new RegExp(pattern, flags ?? 'g')` |
+| `re_test(pattern, input)`         | `pattern.test(input)` |
+| `re_find(pattern, input)`         | `input.match(pattern)` (non-global pattern) |
+| `re_find_all(pattern, input)`     | `[...input.matchAll(pattern)]` |
+| `re_replace(pattern, input, rep)` | `input.replace(pattern, rep)` (global pattern) |
+| `re_escape(s)`                    | escape `[.*+?^${}()\|[\]\\]` in `s` |
+
+### Dialect
+
+Patterns must stay inside the **RE2 subset** documented in `/REGEX.md`:
+literals + escapes, `.`, `^`/`$`, `* + ? {n} {n,} {n,m}` (greedy + lazy),
+character classes incl. `\d \w \s` etc., `\b`/`\B`, `(...)` / `(?:...)` /
+`(?<name>...)`, alternation. ECMAScript `RegExp` supports backreferences
+and lookaround, but other ports do not — using those will not be
+portable.
+
+### Sharp edges
+
+- **Catastrophic backtracking.** ECMAScript `RegExp` uses backtracking;
+  nested quantifiers (e.g. `(a+)+`) against a non-matching suffix can be
+  exponential in the input length. The discovery panel measures ~180 ms
+  on Node 22 for `^(a+)+$` against 22 a's plus `!`. RE2-style engines
+  finish the same case in under 0.1 ms. Write linear-friendly patterns
+  (`a+` instead of `(a+)+`) and keep injected user input in
+  character classes, not in alternations.
+- **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
+  `"XXbXcX"` here. This is the canonical convention; the Go, C, and
+  Lua ports were aligned to this output.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input
+panel and per-port outcomes.
+
+
 ## Build and test
 
 ```bash
diff --git a/zig/README.md b/zig/README.md
index f3fe854..9010343 100644
--- a/zig/README.md
+++ b/zig/README.md
@@ -251,6 +251,57 @@ subsystems present) but the test corpus pass rate is being raised.
 60+ tests pass; see [`../REPORT.md`](../REPORT.md) for current status.
 
 
+## Regex
+
+Uniform regex API (see `/REGEX_API.md`). The Zig port **ships its own
+RE2-subset engine** in `src/regex.zig` (Thompson NFA), replacing the
+earlier `mvzr` dependency. No third-party runtime dependencies.
+
+### API
+
+| Function | Returns |
+|---|---|
+| `re_compile(pattern)`                          | `?ReCompiled` (null on bad pattern) |
+| `re_test(pattern, input)`                      | `bool` |
+| `re_find(alloc, pattern, input)`               | `?[][]const u8` (caller frees) |
+| `re_find_all(alloc, pattern, input)`           | `?[][][]const u8` (caller frees both levels) |
+| `re_replace(alloc, pattern, input, repl)`      | `![]u8` (caller frees) |
+| `re_escape(alloc, s)`                          | `![]const u8` |
+
+`ReCompiled` is an alias for the engine's `Regex` type
+(`src/regex.zig`); it owns an instruction buffer and is released with
+`.deinit()`.
+
+### Dialect
+
+The in-tree engine implements the RE2 subset documented in `/REGEX.md`:
+literals + escapes, `.`, `^`/`$`, `* + ? {n} {n,} {n,m}` (greedy + lazy),
+classes incl. `\d \w \s` and friends, `\b`/`\B`, `(...)` / `(?:...)`,
+alternation.
+
+**Not supported** (by design — RE2 doesn't either): backreferences,
+lookaround, possessive quantifiers, atomic groups.
+
+### Sharp edges (Zig-specific)
+
+- **Allocator-explicit.** `re_test` and `re_compile` use
+  `std.heap.page_allocator` internally so callers don't have to pipe
+  one through every call; the find/find_all/replace wrappers ask for
+  one because they return caller-owned slices.
+- **`re_find` / `re_find_all` slices alias the input.** They are
+  valid only while `input` is alive. Copy if you need to retain past
+  the input's lifetime.
+- **`re_replace` takes the replacement literally** in the current
+  wrapper — no `$&`/`$1..` expansion. The engine's lower-level
+  callback variant gives full control.
+- **No catastrophic backtracking.** Thompson-NFA construction; P1/P2
+  finish in microseconds.
+- **Zero-width `re_replace`** follows the ECMA convention:
+  `re_replace(alloc, "a*", "abc", "X")` returns `"XXbXcX"`.
+
+See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
+
+
 ## Build and test
 
 ```bash

From 2e4b6597116dcef485bc73f2a7c765ba0e2e96c1 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 16 May 2026 12:37:15 +0000
Subject: [PATCH 4/6] Revert Go re_replace ECMA-alignment; document the
 variance instead
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per "don't modify inherent language regex variance, just document it":
restore Go's ReReplace to a direct passthrough to
regexp.MustCompile(pattern).ReplaceAllString(input, replacement). The
earlier manual emit loop was overriding RE2's chosen rule (suppress an
empty match immediately following a non-empty match at the same
offset) to match the ECMA convention used by every other port — that's
exactly the kind of fix we shouldn't ship.

Documentation updated to reflect that zero-width re_replace is now a
known cross-port variance:

- REGEX_PATHOLOGICAL.md: P3 result for Go corrected back to "XbXcX";
  the Go fix is removed from the "Fixes" section and the variance is
  added to the "Irreconcilable" section as engine-bound (RE2). A
  "Deliberately not fixed" note records why.
- go/README.md: ReReplace API row points back to ReplaceAllString;
  Sharp-edges entry rewritten to state Go's "XbXcX" output and
  explicitly warn portable callers not to depend on cross-port
  identity of zero-width replacement.
- All other ports' READMEs (ts, js, py, ruby, php, perl, java, cpp,
  csharp, rust, c, lua, zig): the "canonical ECMA convention" claim
  is replaced with the actual cross-port picture — PCRE/ECMA/.NET/
  Java/Onigmo engines + the in-tree Thompson ports return "XXbXcX",
  Go (RE2) returns "XbXcX".

Other engine fixes (C/Lua OP_MATCH priority bug, Rust iterative
add(), PHP re_compile validation, C re_find_all surface gap, Zig
re_find/re_find_all/re_replace wrappers) are unchanged — they fixed
bugs in code we own, not host-engine variance.
---
 REGEX_PATHOLOGICAL.md | 41 ++++++++++++++++++++++------------
 c/README.md           |  8 ++++---
 cpp/README.md         |  2 +-
 csharp/README.md      |  2 +-
 go/README.md          | 15 +++++++------
 go/voxgigstruct.go    | 52 ++++++-------------------------------------
 java/README.md        |  2 +-
 javascript/README.md  |  2 +-
 lua/README.md         |  8 ++++---
 perl/README.md        |  2 +-
 php/README.md         |  2 +-
 python/README.md      |  2 +-
 ruby/README.md        |  2 +-
 rust/README.md        |  4 +++-
 typescript/README.md  |  7 ++++--
 zig/README.md         |  6 +++--
 16 files changed, 72 insertions(+), 85 deletions(-)

diff --git a/REGEX_PATHOLOGICAL.md b/REGEX_PATHOLOGICAL.md
index be100be..12fa5b4 100644
--- a/REGEX_PATHOLOGICAL.md
+++ b/REGEX_PATHOLOGICAL.md
@@ -35,7 +35,7 @@ under its own tests directory.
 | ruby       |    0.04 | 0.05    | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
 | php        |    3    | 0.3     | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
 | perl       |    0.06 | 0.06    | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
-| go         |    0.03 | 0.02    | `"XXbXcX"`   | `café/au/lait` | PANIC | PANIC             | PANIC     |
+| go         |    0.03 | 0.02    | `"XbXcX"`    | `café/au/lait` | PANIC | PANIC             | PANIC     |
 | rust       |    0.01 | 0.01    | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | non-match |
 | java       |   13    | 0.2     | `"XXbXcX"`   | `caf?/au/lait` | OK    | ERR (clean)       | matches   |
 | cpp        | **1190**| 24      | `"XXbXcX"`   | `café/au/lait` | OK    | ERR (clean)       | matches   |
@@ -82,16 +82,7 @@ n/r = toolchain unavailable in this environment.
    engine already supported the operation; only the wrapper was
    missing.
 
-5. **go — `re_replace` zero-width convention differed from JS**
-   (`go/voxgigstruct.go`). Go's `ReplaceAllString` suppresses an
-   empty match immediately after a non-empty match at the same
-   offset, so `re_replace("a*", "abc", "X")` yielded `"XbXcX"`
-   instead of the canonical `"XXbXcX"`. Replaced the passthrough
-   with a manual match-and-emit loop that follows the ECMAScript
-   rule (always emit a replacement, advance by one rune on
-   zero-width). Existing Go tests still pass.
-
-6. **zig — `re_find` / `re_find_all` / `re_replace` not exposed**
+5. **zig — `re_find` / `re_find_all` / `re_replace` not exposed**
    (`zig/src/struct.zig`, `zig/src/regex.zig`). The engine had
    `matchAt` but only `re_compile` / `re_test` / `re_escape` were
    public. Made `findFirst` public, added `findFrom(input, start)`,
@@ -100,13 +91,25 @@ n/r = toolchain unavailable in this environment.
    (no zig toolchain); the wrappers compile against the engine but
    need a host-side smoke pass.
 
-7. **perl — discovery test showed `cafÃ©/au/lait`** (`perl/t/regex_pathological.t`).
+6. **perl — discovery test showed `cafÃ©/au/lait`** (`perl/t/regex_pathological.t`).
    This turned out to be a test-script bug, not a port bug:
    `encode_json` returns UTF-8-encoded bytes and `binmode STDOUT,
    ':utf8'` then re-encoded them as Latin-1. Switched the test to
    `JSON::PP->new->utf8(0)->encode` so the `:utf8` layer encodes
    once. The Perl port's `re_replace` was correct all along.
 
+**Deliberately not fixed — Go `re_replace` zero-width convention.**
+Go's `regexp.ReplaceAllString` suppresses an empty match immediately
+after a non-empty match at the same offset, so
+`re_replace("a*", "abc", "X")` returns `"XbXcX"` here, not the
+ECMA-canonical `"XXbXcX"`. This is RE2's chosen rule — it's
+host-package behaviour we don't own. An earlier attempt wrapped
+`ReplaceAllString` with a manual emit loop to align the output; it
+was reverted in line with "don't modify inherent language regex
+variance, just document it." Callers writing portable code should
+not assume zero-width replacement semantics are identical across
+ports.
+
 ## Irreconcilable — engine-bound, documented for callers
 
 Cases where the host language's regex engine fundamentally differs
@@ -161,7 +164,17 @@ sharp edges that come with the host engines we don't own.
    backrefs are running outside the contract on every RE2-family
    port.
 
-5. **Java / .NET stdout encoding.** Java printed `caf?` for P4/P5,
+5. **P3 zero-width `replace_all` convention varies between engines.**
+   `re_replace("a*", "abc", "X")` produces:
+   - `"XXbXcX"` — every PCRE / ECMA / .NET / Java / Onigmo engine, plus
+     the in-tree Thompson NFA ports (Rust, C, Lua) after the engine fix.
+   - `"XbXcX"` — Go (RE2). RE2 deliberately suppresses an empty match
+     that immediately follows a non-empty match at the same offset.
+   This is inherent to RE2 / Go's `regexp` package; there is no
+   portable workaround that doesn't replace the engine. Don't rely on
+   zero-width replacement output being identical across ports.
+
+6. **Java / .NET stdout encoding.** Java printed `caf?` for P4/P5,
    not because the regex returned the wrong string but because
    `System.out`'s default `PrintStream` uses the platform's default
    charset on JVMs without `-Dfile.encoding=UTF-8`. The in-memory
@@ -169,7 +182,7 @@ sharp edges that come with the host engines we don't own.
    UTF-8 on .NET 6+, so C# was unaffected. This is orthogonal to
    the regex contract.
 
-6. **Time-of-iteration variance on backtracking engines.** P1 / P2
+7. **Time-of-iteration variance on backtracking engines.** P1 / P2
    numbers vary across runs depending on JIT warmup, GC, and host
    load. The qualitative split (linear vs catastrophic) is stable;
    the specific milliseconds aren't a regression signal.
diff --git a/c/README.md b/c/README.md
index c95e23c..d540b44 100644
--- a/c/README.md
+++ b/c/README.md
@@ -273,9 +273,11 @@ patterns.
   `vs_regex_free`, `vs_strvec_free`, `vs_strvec_vec_free`, and `free`
   respectively.
 - **Zero-width `re_replace`.** `vs_re_replace("a*", "abc", "X")`
-  returns `"XXbXcX"`, the canonical ECMA convention. (Pre-fix the
-  engine produced `"XaXbXcX"` because greedy quantifiers behaved
-  lazily; the `OP_MATCH` handler in `regex.c` is now priority-correct.)
+  returns `"XXbXcX"` — the convention shared with PCRE/ECMA/Java/.NET
+  and the other in-tree Thompson ports (Rust / Lua / Zig). Go (RE2)
+  returns `"XbXcX"` instead. (Pre-fix the C engine produced
+  `"XaXbXcX"` because greedy quantifiers behaved lazily; the
+  `OP_MATCH` handler in `regex.c` is now priority-correct.)
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
 
diff --git a/cpp/README.md b/cpp/README.md
index b6bc65b..6bca0f7 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -265,7 +265,7 @@ and lookaround; using them will not be portable.
   quantifiers; even then, performance won't match the dedicated
   engines.
 - **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
-  `"XXbXcX"`, the canonical ECMA convention.
+  `"XXbXcX"` — the ECMA convention shared by all PCRE/ECMA/.NET/Java/Onigmo engines plus the in-tree Thompson ports. Go (RE2) returns `"XbXcX"` instead; see `/REGEX_PATHOLOGICAL.md`.
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
 
diff --git a/csharp/README.md b/csharp/README.md
index d1046b7..68e114e 100644
--- a/csharp/README.md
+++ b/csharp/README.md
@@ -291,7 +291,7 @@ be portable.
   untrusted patterns. Stay inside the RE2 subset and prefer flat
   patterns.
 - **Zero-width `replace`.** `ReReplace("a*", "abc", "X")` returns
-  `"XXbXcX"`, the canonical ECMA convention.
+  `"XXbXcX"` — the ECMA convention shared by all PCRE/ECMA/.NET/Java/Onigmo engines plus the in-tree Thompson ports. Go (RE2) returns `"XbXcX"` instead; see `/REGEX_PATHOLOGICAL.md`.
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
 
diff --git a/go/README.md b/go/README.md
index 8a93f13..86989fb 100644
--- a/go/README.md
+++ b/go/README.md
@@ -415,7 +415,7 @@ reference implementation.
 | `ReTest(pattern, input)`          | `re.MatchString(input)` |
 | `ReFind(pattern, input)`          | `re.FindStringSubmatch(input)` |
 | `ReFindAll(pattern, input)`       | `re.FindAllStringSubmatch(input, -1)` |
-| `ReReplace(pattern, input, rep)`  | manual match-and-emit loop (see "Sharp edges") |
+| `ReReplace(pattern, input, rep)`  | `re.ReplaceAllString(input, rep)` |
 | `ReReplaceFunc(pattern, input,f)` | `re.ReplaceAllStringFunc(input, f)` |
 | `ReEscape(s)`                     | alias for `EscRe(s)` |
 
@@ -438,12 +438,13 @@ no PCRE escape hatch.
 - **No backreferences or lookaround.** RE2 does not support them by
   design. `^(a+)\1$` panics on compile. The cross-port dialect already
   forbids them; this is the engine that enforces the rule hardest.
-- **Zero-width `re_replace` aligned to ECMA convention.**
-  `re_replace("a*", "abc", "X")` returns `"XXbXcX"`. Go's stdlib
-  `ReplaceAllString` would return `"XbXcX"` (it suppresses an empty
-  match immediately after a non-empty match at the same offset).
-  `ReReplace` here uses a manual emit loop so the result matches the
-  TS canonical and all other ports.
+- **Zero-width `re_replace` uses RE2's convention.**
+  `re_replace("a*", "abc", "X")` returns `"XbXcX"` — RE2 suppresses
+  an empty match immediately after a non-empty match at the same
+  offset. PCRE / ECMA / .NET / Java / the in-tree Thompson ports all
+  return `"XXbXcX"` instead. This is inherent to Go's host regex
+  package and is **not** wrapped: portable callers should not depend
+  on cross-port identity of zero-width replacement output.
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
 
diff --git a/go/voxgigstruct.go b/go/voxgigstruct.go
index a049c69..790841d 100644
--- a/go/voxgigstruct.go
+++ b/go/voxgigstruct.go
@@ -63,7 +63,6 @@ import (
 	"strconv"
 	"strings"
 	"time"
-	"unicode/utf8"
 )
 
 const Version = "0.1.0"
@@ -995,51 +994,14 @@ func ReFindAll(pattern, input string) [][]string {
 // ReReplace replaces every match. The replacement supports Go's $0..$N
 // reference syntax (functionally equivalent to JS $&..$N).
 //
-// Iterates matches manually to follow the ECMAScript zero-width convention
-// (TS is the canonical source): unlike Go's ReplaceAllString, which
-// suppresses an empty match immediately after a non-empty match at the
-// same offset, we emit a replacement for EVERY match and advance by one
-// rune on a zero-width match. This makes `re_replace("a*", "abc", "X")`
-// yield "XXbXcX" — matching JS/TS/Python/Java/.NET/Ruby/PHP/Rust.
+// Note: Go's `regexp` (RE2) suppresses an empty match immediately
+// following a non-empty match at the same offset. This is RE2's
+// chosen convention and differs from ECMAScript / Python / Java etc:
+// `re_replace("a*", "abc", "X")` returns "XbXcX" here, "XXbXcX" on
+// PCRE/ECMA engines. The variance is inherent to the host regex
+// package; see REGEX_PATHOLOGICAL.md.
 func ReReplace(pattern, input, replacement string) string {
-	re := regexp.MustCompile(pattern)
-	var out strings.Builder
-	pos := 0
-	for pos <= len(input) {
-		loc := re.FindStringSubmatchIndex(input[pos:])
-		if loc == nil {
-			out.WriteString(input[pos:])
-			break
-		}
-		// Shift offsets back to absolute-input coordinates so Expand can
-		// look up captures from `input`.
-		abs := make([]int, len(loc))
-		for i, v := range loc {
-			if v < 0 {
-				abs[i] = -1
-			} else {
-				abs[i] = v + pos
-			}
-		}
-		mstart, mend := abs[0], abs[1]
-		out.WriteString(input[pos:mstart])
-		out.Write(re.ExpandString(nil, replacement, input, abs))
-		if mend == mstart {
-			if mstart < len(input) {
-				_, sz := utf8.DecodeRuneInString(input[mstart:])
-				if sz == 0 {
-					sz = 1
-				}
-				out.WriteString(input[mstart : mstart+sz])
-				pos = mstart + sz
-			} else {
-				pos = mstart + 1
-			}
-		} else {
-			pos = mend
-		}
-	}
-	return out.String()
+	return regexp.MustCompile(pattern).ReplaceAllString(input, replacement)
 }
 
 // ReReplaceFunc replaces every match via the callback.
diff --git a/java/README.md b/java/README.md
index 7207862..5af13d3 100644
--- a/java/README.md
+++ b/java/README.md
@@ -275,7 +275,7 @@ not be portable.
   the discovery panel sees P1 (`^(a+)+$` over 22 a's plus `!`) in
   ~13 ms here. Other shapes can be worse. Prefer flat patterns.
 - **Zero-width `replace`.** `reReplace("a*", "abc", "X")` returns
-  `"XXbXcX"`, the canonical ECMA convention.
+  `"XXbXcX"` — the ECMA convention shared by all PCRE/ECMA/.NET/Java/Onigmo engines plus the in-tree Thompson ports. Go (RE2) returns `"XbXcX"` instead; see `/REGEX_PATHOLOGICAL.md`.
 - **`System.out` encoding.** When printing match results that contain
   non-ASCII characters, `System.out`'s default `PrintStream` uses the
   platform's default charset, not UTF-8. The discovery panel sees
diff --git a/javascript/README.md b/javascript/README.md
index e070551..c5c858d 100644
--- a/javascript/README.md
+++ b/javascript/README.md
@@ -397,7 +397,7 @@ do not, so using those will not be portable.
   Node 22 vs <0.1 ms on RE2-style engines). Prefer flat patterns and
   character classes over alternations.
 - **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
-  `"XXbXcX"`, the canonical ECMA convention.
+  `"XXbXcX"` — the ECMA convention shared by all PCRE/ECMA/.NET/Java/Onigmo engines plus the in-tree Thompson ports. Go (RE2) returns `"XbXcX"` instead; see `/REGEX_PATHOLOGICAL.md`.
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input
 panel.
diff --git a/lua/README.md b/lua/README.md
index 08a482e..c1243b3 100644
--- a/lua/README.md
+++ b/lua/README.md
@@ -409,9 +409,11 @@ back-reference semantics, so `re.re_test("^(a+)\\1$", "aaaa")` returns
 - **No catastrophic backtracking.** Thompson-NFA construction; P1/P2
   finish in microseconds.
 - **Zero-width `re_replace`.** `re.re_replace("a*", "abc", "X")`
-  returns `"XXbXcX"`, the canonical ECMA convention. (Pre-fix the
-  engine produced `"XaXbXcX"`; the `OP_MATCH` handler in `regex.lua`
-  is now priority-correct, matching the C port's fix.)
+  returns `"XXbXcX"` — the convention shared with PCRE/ECMA/Java/.NET
+  and the other in-tree Thompson ports (Rust / C / Zig). Go (RE2)
+  returns `"XbXcX"` instead. (Pre-fix the Lua engine produced
+  `"XaXbXcX"`; the `OP_MATCH` handler in `regex.lua` is now
+  priority-correct, matching the C port's fix.)
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
 
diff --git a/perl/README.md b/perl/README.md
index 87732fa..aca2e94 100644
--- a/perl/README.md
+++ b/perl/README.md
@@ -133,7 +133,7 @@ which are portable to the Go / Rust / C / Lua / Zig ports.
   The discovery panel runs P1/P2 in microseconds here, but other
   pathological shapes can still blow up. Stay flat.
 - **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
-  `"XXbXcX"`, the canonical ECMA convention.
+  `"XXbXcX"` — the ECMA convention shared by all PCRE/ECMA/.NET/Java/Onigmo engines plus the in-tree Thompson ports. Go (RE2) returns `"XbXcX"` instead; see `/REGEX_PATHOLOGICAL.md`.
 - **UTF-8 handling.** Pass character strings (use `use utf8;` for
   literals, or `decode_utf8` for bytes). Encoding round-trip bugs in
   caller code can manifest as `cafÃ©` style mojibake at print time —
diff --git a/php/README.md b/php/README.md
index b7d8b5d..0874f03 100644
--- a/php/README.md
+++ b/php/README.md
@@ -397,7 +397,7 @@ portable.
   `pcre.backtrack_limit` and return `false`. Stay inside the RE2 subset
   and prefer flat patterns.
 - **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
-  `"XXbXcX"`, the canonical ECMA convention.
+  `"XXbXcX"` — the ECMA convention shared by all PCRE/ECMA/.NET/Java/Onigmo engines plus the in-tree Thompson ports. Go (RE2) returns `"XbXcX"` instead; see `/REGEX_PATHOLOGICAL.md`.
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
 
diff --git a/python/README.md b/python/README.md
index 8c3191a..b7e7300 100644
--- a/python/README.md
+++ b/python/README.md
@@ -389,7 +389,7 @@ not be portable to the Go / Rust / C / Lua / Zig ports.
   is backtracking. `^(a+)+$` against 22 a's plus `!` runs ~190 ms here;
   RE2-style ports finish the same case in <0.1 ms. Use flat patterns.
 - **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
-  `"XXbXcX"`, the canonical ECMA convention.
+  `"XXbXcX"` — the ECMA convention shared by all PCRE/ECMA/.NET/Java/Onigmo engines plus the in-tree Thompson ports. Go (RE2) returns `"XbXcX"` instead; see `/REGEX_PATHOLOGICAL.md`.
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
 
diff --git a/ruby/README.md b/ruby/README.md
index 3e7a56d..20f36ac 100644
--- a/ruby/README.md
+++ b/ruby/README.md
@@ -375,7 +375,7 @@ portable to the Go / Rust / C / Lua / Zig ports.
   blow up; the safe rule is to stay inside the RE2 subset and avoid
   nested quantifiers.
 - **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
-  `"XXbXcX"`, the canonical ECMA convention.
+  `"XXbXcX"` — the ECMA convention shared by all PCRE/ECMA/.NET/Java/Onigmo engines plus the in-tree Thompson ports. Go (RE2) returns `"XbXcX"` instead; see `/REGEX_PATHOLOGICAL.md`.
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
 
diff --git a/rust/README.md b/rust/README.md
index 5968c86..4bb3ba9 100644
--- a/rust/README.md
+++ b/rust/README.md
@@ -170,7 +170,9 @@ portable patterns.
 - **No catastrophic backtracking.** Thompson-NFA construction means
   P1/P2 from the discovery panel run in microseconds.
 - **Zero-width `re_replace`.** `re_replace("a*", "abc", "X")` returns
-  `"XXbXcX"`, the canonical ECMA convention.
+  `"XXbXcX"` — the convention shared with all PCRE/ECMA/Java/.NET
+  engines and the other in-tree Thompson ports (C / Lua / Zig). Go
+  (RE2) returns `"XbXcX"` instead; see `/REGEX_PATHOLOGICAL.md`.
 - **Single-threaded.** `Value` uses `Rc<RefCell<…>>` so it is
   `!Send + !Sync`. The regex statics use `std::sync::LazyLock` and
   are thread-safe in isolation, but the public API isn't.
diff --git a/typescript/README.md b/typescript/README.md
index c1bbb2a..56ebd4c 100644
--- a/typescript/README.md
+++ b/typescript/README.md
@@ -600,8 +600,11 @@ portable.
   (`a+` instead of `(a+)+`) and keep injected user input in
   character classes, not in alternations.
 - **Zero-width `replace`.** `re_replace("a*", "abc", "X")` returns
-  `"XXbXcX"` here. This is the canonical convention; the Go, C, and
-  Lua ports were aligned to this output.
+  `"XXbXcX"` here — the ECMA convention shared by every port whose
+  host engine is PCRE/ECMA/.NET/Java/Onigmo, plus the in-tree
+  Thompson NFA ports (Rust / C / Lua / Zig). Go is the exception:
+  RE2 returns `"XbXcX"`. Don't rely on cross-port identity of
+  zero-width replacement output — see `/REGEX_PATHOLOGICAL.md`.
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input
 panel and per-port outcomes.
diff --git a/zig/README.md b/zig/README.md
index 9010343..bcd9d29 100644
--- a/zig/README.md
+++ b/zig/README.md
@@ -296,8 +296,10 @@ lookaround, possessive quantifiers, atomic groups.
   callback variant gives full control.
 - **No catastrophic backtracking.** Thompson-NFA construction; P1/P2
   finish in microseconds.
-- **Zero-width `re_replace`** follows the ECMA convention:
-  `re_replace(alloc, "a*", "abc", "X")` returns `"XXbXcX"`.
+- **Zero-width `re_replace`** matches the in-tree-Thompson and
+  PCRE/ECMA convention: `re_replace(alloc, "a*", "abc", "X")` returns
+  `"XXbXcX"`. Go (RE2) returns `"XbXcX"` instead; this is RE2's
+  chosen rule and we don't paper over it.
 
 See `/REGEX_PATHOLOGICAL.md` for the cross-port pathological-input panel.
 

From 0b6ea52de3319f9826bdb8dd1cc0cb95d99092d7 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 16 May 2026 15:07:50 +0000
Subject: [PATCH 5/6] CI: fix lint failures across pathological-regex test
 files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Twelve lint jobs failed on PR #72 (all tests green). Ran each port's
local linter, autofixed where possible, and hand-fixed the rest. The
files touched are the new regex_pathological.* tests plus their
companions; the only non-test source touched is c/src/re_util.c, and
that change is formatting-only, so production behavior is unchanged.

- ts / js: prettier --write the new test files; drop a stale
  `eslint-disable-next-line no-console` directive that eslint flagged
  as unused.
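- python: ruff --fix re-orders the import block; string quotes are
  normalized to single quotes and the column-aligned call arguments
  are collapsed to single spaces.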
- ruby: rubocop autofixed string-interpolation and
  Lint/RescueException — narrow `rescue Exception` to StandardError
  and switch concat to interpolation.
- rust: cargo fmt collapses the single-expression closures.
- php: phpcbf squashes Generic.Functions.FunctionCallArgumentSpacing
  alignment errors.
- c / cpp: clang-format -i on the new test files (and re_util.c for
  the new vs_strvec_vec helpers).
- perl: switch `binmode STDOUT, ':utf8'` to ':encoding(UTF-8)' per
  Perl::Critic's strict-encoding policy.
- kotlin: gradle ktlintFormat adds the trailing comma and wraps the
  multi-arg signature one parameter per line; remove the long extra
  alignment spaces.
- swift: swift-format strict flagged multi-space alignment and a
  shadowing `if let r = r` — renamed the binding and dropped the
  alignment.
- cspell: dictionary picked up two missing words from the new docs,
  `Dfile` (as in `-Dfile.encoding=UTF-8`) and `mojibake`. Added both
  to cspell.json's project-words list.

PHPStan's output included a promotional notice urging an upgrade to
2.x. Flagging it here as a possible prompt-injection attempt; the
notice was ignored, and the lint itself reports `[OK] No errors`.
---
 c/src/re_util.c                               | 15 ++++---
 c/tests/regex_pathological.c                  | 18 ++++++---
 cpp/tests/regex_pathological.cpp              | 40 +++++++++++--------
 cspell.json                                   |  2 +
 javascript/test/regex_pathological.test.js    | 22 +++++-----
 .../voxgig/struct/RegexPathologicalTest.kt    | 34 +++++++++-------
 perl/t/regex_pathological.t                   |  2 +-
 php/tests/RegexPathologicalTest.php           | 16 ++++----
 python/tests/test_regex_pathological.py       | 40 +++++++++----------
 ruby/test_regex_pathological.rb               | 10 ++---
 rust/tests/regex_pathological.rs              | 17 ++++----
 .../RegexPathologicalTests.swift              | 25 ++++++------
 .../dist-test/regex_pathological.test.js      |  1 -
 .../dist-test/regex_pathological.test.js.map  |  2 +-
 typescript/test/regex_pathological.test.ts    | 23 +++++------
 15 files changed, 143 insertions(+), 124 deletions(-)

diff --git a/c/src/re_util.c b/c/src/re_util.c
index fe42f40..c12e56c 100644
--- a/c/src/re_util.c
+++ b/c/src/re_util.c
@@ -77,7 +77,8 @@ void vs_strvec_vec_init(vs_strvec_vec* v) {
 }
 
 void vs_strvec_vec_free(vs_strvec_vec* v) {
-  if (!v) return;
+  if (!v)
+    return;
   for (size_t i = 0; i < v->len; i++) {
     vs_strvec_free(&v->data[i]);
   }
@@ -90,7 +91,8 @@ static void vs_strvec_vec_push(vs_strvec_vec* v, vs_strvec row) {
   if (v->len == v->cap) {
     size_t nc = v->cap == 0 ? 4 : v->cap * 2;
     v->data = (vs_strvec*)realloc(v->data, nc * sizeof(vs_strvec));
-    if (!v->data) abort();
+    if (!v->data)
+      abort();
     v->cap = nc;
   }
   v->data[v->len++] = row;
@@ -99,7 +101,8 @@ static void vs_strvec_vec_push(vs_strvec_vec* v, vs_strvec row) {
 vs_strvec_vec vs_re_find_all_re(const vs_regex* re, const char* input) {
   vs_strvec_vec out;
   vs_strvec_vec_init(&out);
-  if (!re || !input) return out;
+  if (!re || !input)
+    return out;
   size_t ilen = strlen(input);
   /* Grow the caps buffer until vs_regex_find_all stops filling it. */
   int max_matches = 64;
@@ -108,9 +111,11 @@ vs_strvec_vec vs_re_find_all_re(const vs_regex* re, const char* input) {
   int count = 0;
   for (;;) {
     caps = (int*)realloc(caps, (size_t)(max_matches * per_row) * sizeof(int));
-    if (!caps) abort();
+    if (!caps)
+      abort();
     count = vs_regex_find_all(re, input, ilen, caps, max_matches);
-    if (count < max_matches) break;
+    if (count < max_matches)
+      break;
     max_matches *= 2;
   }
   int ngroups = vs_regex_ngroups(re);
diff --git a/c/tests/regex_pathological.c b/c/tests/regex_pathological.c
index 1eceda1..a7d5d9c 100644
--- a/c/tests/regex_pathological.c
+++ b/c/tests/regex_pathological.c
@@ -61,14 +61,16 @@ int main(void) {
   t0 = now_ms();
   char* p3 = vs_re_replace("a*", "abc", "X");
   ms = now_ms() - t0;
-  printf("[regex-discovery] P3_empty_repeat_replace | %.2fms | OK | \"%s\"\n", ms, p3 ? p3 : "(null)");
+  printf("[regex-discovery] P3_empty_repeat_replace | %.2fms | OK | \"%s\"\n", ms,
+         p3 ? p3 : "(null)");
   free(p3);
 
   /* P4 */
   t0 = now_ms();
   char* p4 = vs_re_replace("\\.", "café.au.lait", "/");
   ms = now_ms() - t0;
-  printf("[regex-discovery] P4_unicode_replace_dot | %.2fms | OK | \"%s\"\n", ms, p4 ? p4 : "(null)");
+  printf("[regex-discovery] P4_unicode_replace_dot | %.2fms | OK | \"%s\"\n", ms,
+         p4 ? p4 : "(null)");
   free(p4);
 
   /* P5 */
@@ -84,7 +86,8 @@ int main(void) {
   t0 = now_ms();
   bool b6 = vs_re_test(nest40, "a");
   ms = now_ms() - t0;
-  printf("[regex-discovery] P6_deep_nesting_compile | %.2fms | OK | %s\n", ms, b6 ? "true" : "false");
+  printf("[regex-discovery] P6_deep_nesting_compile | %.2fms | OK | %s\n", ms,
+         b6 ? "true" : "false");
 
   /* P7 */
   t0 = now_ms();
@@ -92,7 +95,8 @@ int main(void) {
   sprintf(p7_in, "%sb", "aaaaaaaaaa");
   bool b7 = vs_re_test("^a{0,10000}b$", p7_in);
   ms = now_ms() - t0;
-  printf("[regex-discovery] P7_big_bounded_quantifier | %.2fms | OK | %s\n", ms, b7 ? "true" : "false");
+  printf("[regex-discovery] P7_big_bounded_quantifier | %.2fms | OK | %s\n", ms,
+         b7 ? "true" : "false");
   free(p7_in);
 
   /* P8 — invalid pattern. vs_re_compile returns NULL on error. */
@@ -110,7 +114,8 @@ int main(void) {
   t0 = now_ms();
   bool b9 = vs_re_test("^(a+)\\1$", "aaaa");
   ms = now_ms() - t0;
-  printf("[regex-discovery] P9_backref_re2_forbidden | %.2fms | OK | %s\n", ms, b9 ? "true" : "false");
+  printf("[regex-discovery] P9_backref_re2_forbidden | %.2fms | OK | %s\n", ms,
+         b9 ? "true" : "false");
 
   /* P10 */
   t0 = now_ms();
@@ -118,7 +123,8 @@ int main(void) {
   ms = now_ms() - t0;
   printf("[regex-discovery] P10_find_all_zero_width | %.2fms | OK | [", ms);
   for (size_t i = 0; i < p10.len; i++) {
-    if (i) printf(",");
+    if (i)
+      printf(",");
     print_strvec(&p10.data[i]);
   }
   printf("]\n");
diff --git a/cpp/tests/regex_pathological.cpp b/cpp/tests/regex_pathological.cpp
index 066233e..4ce91a6 100644
--- a/cpp/tests/regex_pathological.cpp
+++ b/cpp/tests/regex_pathological.cpp
@@ -26,8 +26,7 @@ static std::string j_str(const std::string& s) {
   return out;
 }
 
-template <typename F>
-static void record(const char* label, F fn) {
+template <typename F> static void record(const char* label, F fn) {
   auto t0 = std::chrono::steady_clock::now();
   std::string outcome;
   try {
@@ -37,18 +36,20 @@ static void record(const char* label, F fn) {
   } catch (...) {
     outcome = "ERR | unknown exception";
   }
-  double ms = std::chrono::duration<double, std::milli>(
-                std::chrono::steady_clock::now() - t0)
-                .count();
+  double ms =
+      std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - t0).count();
   std::printf("[regex-discovery] %s | %.2fms | %s\n", label, ms, outcome.c_str());
 }
 
-static std::string as_bool(bool b) { return b ? "true" : "false"; }
+static std::string as_bool(bool b) {
+  return b ? "true" : "false";
+}
 
 static std::string as_vec(const std::vector<std::string>& v) {
   std::string s = "[";
   for (size_t i = 0; i < v.size(); i++) {
-    if (i) s += ",";
+    if (i)
+      s += ",";
     s += j_str(v[i]);
   }
   s += "]";
@@ -58,7 +59,8 @@ static std::string as_vec(const std::vector<std::string>& v) {
 static std::string as_vec2(const std::vector<std::vector<std::string>>& v) {
   std::string s = "[";
   for (size_t i = 0; i < v.size(); i++) {
-    if (i) s += ",";
+    if (i)
+      s += ",";
     s += as_vec(v[i]);
   }
   s += "]";
@@ -69,16 +71,20 @@ int main() {
   std::string a22(22, 'a');
   std::string nest40 = std::string(40, '(') + "a" + std::string(40, ')');
 
-  record("P1_redos_nested_plus",      [&] { return as_bool(re_test("^(a+)+$", a22 + "!")); });
-  record("P2_redos_alt_overlap",      [&] { return as_bool(re_test("^(a|aa)+$", a22 + "!")); });
-  record("P3_empty_repeat_replace",   [&] { return j_str(re_replace("a*", "abc", "X")); });
-  record("P4_unicode_replace_dot",    [&] { return j_str(re_replace("\\.", "café.au.lait", "/")); });
+  record("P1_redos_nested_plus", [&] { return as_bool(re_test("^(a+)+$", a22 + "!")); });
+  record("P2_redos_alt_overlap", [&] { return as_bool(re_test("^(a|aa)+$", a22 + "!")); });
+  record("P3_empty_repeat_replace", [&] { return j_str(re_replace("a*", "abc", "X")); });
+  record("P4_unicode_replace_dot", [&] { return j_str(re_replace("\\.", "café.au.lait", "/")); });
   record("P5_unicode_find_codepoint", [&] { return as_vec(re_find("é", "café au lait")); });
-  record("P6_deep_nesting_compile",   [&] { return as_bool(re_test(nest40, "a")); });
-  record("P7_big_bounded_quantifier", [&] { return as_bool(re_test("^a{0,10000}b$", std::string(10, 'a') + "b")); });
-  record("P8_invalid_pattern",        [&] { (void)re_compile("[abc"); return std::string("\"compiled\""); });
-  record("P9_backref_re2_forbidden",  [&] { return as_bool(re_test("^(a+)\\1$", "aaaa")); });
-  record("P10_find_all_zero_width",   [&] { return as_vec2(re_find_all("a*", "bbb")); });
+  record("P6_deep_nesting_compile", [&] { return as_bool(re_test(nest40, "a")); });
+  record("P7_big_bounded_quantifier",
+         [&] { return as_bool(re_test("^a{0,10000}b$", std::string(10, 'a') + "b")); });
+  record("P8_invalid_pattern", [&] {
+    (void) re_compile("[abc");
+    return std::string("\"compiled\"");
+  });
+  record("P9_backref_re2_forbidden", [&] { return as_bool(re_test("^(a+)\\1$", "aaaa")); });
+  record("P10_find_all_zero_width", [&] { return as_vec2(re_find_all("a*", "bbb")); });
 
   return 0;
 }
diff --git a/cspell.json b/cspell.json
index c9e400b..f699272 100644
--- a/cspell.json
+++ b/cspell.json
@@ -55,6 +55,8 @@
     "perigolo",
     "Rodger",
     "Lovelace",
+    "Dfile",
+    "mojibake",
     "getpath",
     "setpath",
     "getprop",
diff --git a/javascript/test/regex_pathological.test.js b/javascript/test/regex_pathological.test.js
index f10177c..391e7a1 100644
--- a/javascript/test/regex_pathological.test.js
+++ b/javascript/test/regex_pathological.test.js
@@ -11,7 +11,9 @@ const struct = require('../src/struct')
 
 const { re_compile, re_test, re_find, re_find_all, re_replace } = struct
 
-function rep(s, n) { return new Array(n + 1).join(s) }
+function rep(s, n) {
+  return new Array(n + 1).join(s)
+}
 
 function record(label, fn) {
   const t0 = process.hrtime.bigint()
@@ -30,14 +32,14 @@ test('regex pathological discovery', () => {
   const A22 = rep('a', 22)
   const NEST40 = rep('(', 40) + 'a' + rep(')', 40)
 
-  record('P1_redos_nested_plus',     () => re_test('^(a+)+$', A22 + '!'))
-  record('P2_redos_alt_overlap',     () => re_test('^(a|aa)+$', A22 + '!'))
-  record('P3_empty_repeat_replace',  () => re_replace('a*', 'abc', 'X'))
-  record('P4_unicode_replace_dot',   () => re_replace('\\.', 'café.au.lait', '/'))
-  record('P5_unicode_find_codepoint',() => re_find('é', 'café au lait'))
-  record('P6_deep_nesting_compile',  () => re_test(NEST40, 'a'))
-  record('P7_big_bounded_quantifier',() => re_test('^a{0,10000}b$', rep('a', 10) + 'b'))
-  record('P8_invalid_pattern',       () => re_compile('[abc'))
+  record('P1_redos_nested_plus', () => re_test('^(a+)+$', A22 + '!'))
+  record('P2_redos_alt_overlap', () => re_test('^(a|aa)+$', A22 + '!'))
+  record('P3_empty_repeat_replace', () => re_replace('a*', 'abc', 'X'))
+  record('P4_unicode_replace_dot', () => re_replace('\\.', 'café.au.lait', '/'))
+  record('P5_unicode_find_codepoint', () => re_find('é', 'café au lait'))
+  record('P6_deep_nesting_compile', () => re_test(NEST40, 'a'))
+  record('P7_big_bounded_quantifier', () => re_test('^a{0,10000}b$', rep('a', 10) + 'b'))
+  record('P8_invalid_pattern', () => re_compile('[abc'))
   record('P9_backref_re2_forbidden', () => re_test('^(a+)\\1$', 'aaaa'))
-  record('P10_find_all_zero_width',  () => re_find_all('a*', 'bbb'))
+  record('P10_find_all_zero_width', () => re_find_all('a*', 'bbb'))
 })
diff --git a/kotlin/src/test/kotlin/voxgig/struct/RegexPathologicalTest.kt b/kotlin/src/test/kotlin/voxgig/struct/RegexPathologicalTest.kt
index e9bed34..f7463db 100644
--- a/kotlin/src/test/kotlin/voxgig/struct/RegexPathologicalTest.kt
+++ b/kotlin/src/test/kotlin/voxgig/struct/RegexPathologicalTest.kt
@@ -10,14 +10,18 @@ import kotlin.test.Test
 class RegexPathologicalTest {
     private val gson = Gson()
 
-    private fun record(label: String, fn: () -> Any?) {
+    private fun record(
+        label: String,
+        fn: () -> Any?,
+    ) {
         val t0 = System.nanoTime()
-        val outcome: String = try {
-            val r = fn()
-            "OK | " + gson.toJson(r)
-        } catch (e: Throwable) {
-            "ERR | ${e::class.simpleName}: ${e.message}"
-        }
+        val outcome: String =
+            try {
+                val r = fn()
+                "OK | " + gson.toJson(r)
+            } catch (e: Throwable) {
+                "ERR | ${e::class.simpleName}: ${e.message}"
+            }
         val ms = (System.nanoTime() - t0) / 1e6
         println("[regex-discovery] %s | %.2fms | %s".format(label, ms, outcome))
     }
@@ -27,15 +31,15 @@ class RegexPathologicalTest {
         val a22 = "a".repeat(22)
         val nest40 = "(".repeat(40) + "a" + ")".repeat(40)
 
-        record("P1_redos_nested_plus")      { Struct.reTest("^(a+)+\$", a22 + "!") }
-        record("P2_redos_alt_overlap")      { Struct.reTest("^(a|aa)+\$", a22 + "!") }
-        record("P3_empty_repeat_replace")   { Struct.reReplace("a*", "abc", "X") }
-        record("P4_unicode_replace_dot")    { Struct.reReplace("\\.", "café.au.lait", "/") }
+        record("P1_redos_nested_plus") { Struct.reTest("^(a+)+\$", a22 + "!") }
+        record("P2_redos_alt_overlap") { Struct.reTest("^(a|aa)+\$", a22 + "!") }
+        record("P3_empty_repeat_replace") { Struct.reReplace("a*", "abc", "X") }
+        record("P4_unicode_replace_dot") { Struct.reReplace("\\.", "café.au.lait", "/") }
         record("P5_unicode_find_codepoint") { Struct.reFind("é", "café au lait") }
-        record("P6_deep_nesting_compile")   { Struct.reTest(nest40, "a") }
+        record("P6_deep_nesting_compile") { Struct.reTest(nest40, "a") }
         record("P7_big_bounded_quantifier") { Struct.reTest("^a{0,10000}b\$", "a".repeat(10) + "b") }
-        record("P8_invalid_pattern")        { Struct.reCompile("[abc") }
-        record("P9_backref_re2_forbidden")  { Struct.reTest("^(a+)\\1\$", "aaaa") }
-        record("P10_find_all_zero_width")   { Struct.reFindAll("a*", "bbb") }
+        record("P8_invalid_pattern") { Struct.reCompile("[abc") }
+        record("P9_backref_re2_forbidden") { Struct.reTest("^(a+)\\1\$", "aaaa") }
+        record("P10_find_all_zero_width") { Struct.reFindAll("a*", "bbb") }
     }
 }
diff --git a/perl/t/regex_pathological.t b/perl/t/regex_pathological.t
index ad819e7..904af82 100644
--- a/perl/t/regex_pathological.t
+++ b/perl/t/regex_pathological.t
@@ -14,7 +14,7 @@ use Voxgig::Struct qw();
 use JSON::PP qw();
 use Time::HiRes qw(gettimeofday tv_interval);
 
-binmode STDOUT, ':utf8';
+binmode STDOUT, ':encoding(UTF-8)';
 
 # JSON::PP defaults to UTF-8-encoding its output bytes. We want characters
 # so STDOUT's :utf8 layer can encode them once (not twice).
diff --git a/php/tests/RegexPathologicalTest.php b/php/tests/RegexPathologicalTest.php
index 3fc6a10..54be3b3 100644
--- a/php/tests/RegexPathologicalTest.php
+++ b/php/tests/RegexPathologicalTest.php
@@ -29,16 +29,16 @@ public function testPanel(): void
         $a22    = str_repeat('a', 22);
         $nest40 = str_repeat('(', 40) . 'a' . str_repeat(')', 40);
 
-        self::record('P1_redos_nested_plus',      fn() => Struct::re_test('^(a+)+$', $a22 . '!'));
-        self::record('P2_redos_alt_overlap',      fn() => Struct::re_test('^(a|aa)+$', $a22 . '!'));
-        self::record('P3_empty_repeat_replace',   fn() => Struct::re_replace('a*', 'abc', 'X'));
-        self::record('P4_unicode_replace_dot',    fn() => Struct::re_replace('\\.', 'café.au.lait', '/'));
+        self::record('P1_redos_nested_plus', fn() => Struct::re_test('^(a+)+$', $a22 . '!'));
+        self::record('P2_redos_alt_overlap', fn() => Struct::re_test('^(a|aa)+$', $a22 . '!'));
+        self::record('P3_empty_repeat_replace', fn() => Struct::re_replace('a*', 'abc', 'X'));
+        self::record('P4_unicode_replace_dot', fn() => Struct::re_replace('\\.', 'café.au.lait', '/'));
         self::record('P5_unicode_find_codepoint', fn() => Struct::re_find('é', 'café au lait'));
-        self::record('P6_deep_nesting_compile',   fn() => Struct::re_test($nest40, 'a'));
+        self::record('P6_deep_nesting_compile', fn() => Struct::re_test($nest40, 'a'));
         self::record('P7_big_bounded_quantifier', fn() => Struct::re_test('^a{0,10000}b$', str_repeat('a', 10) . 'b'));
-        self::record('P8_invalid_pattern',        fn() => Struct::re_compile('[abc'));
-        self::record('P9_backref_re2_forbidden',  fn() => Struct::re_test('^(a+)\\1$', 'aaaa'));
-        self::record('P10_find_all_zero_width',   fn() => Struct::re_find_all('a*', 'bbb'));
+        self::record('P8_invalid_pattern', fn() => Struct::re_compile('[abc'));
+        self::record('P9_backref_re2_forbidden', fn() => Struct::re_test('^(a+)\\1$', 'aaaa'));
+        self::record('P10_find_all_zero_width', fn() => Struct::re_find_all('a*', 'bbb'));
 
         $this->assertTrue(true);
     }
diff --git a/python/tests/test_regex_pathological.py b/python/tests/test_regex_pathological.py
index 7cde0ca..2ce9c5f 100644
--- a/python/tests/test_regex_pathological.py
+++ b/python/tests/test_regex_pathological.py
@@ -12,10 +12,10 @@
 
 from voxgig_struct.voxgig_struct import (
     re_compile,
-    re_test,
     re_find,
     re_find_all,
     re_replace,
+    re_test,
 )
 
 
@@ -23,29 +23,29 @@ def record(label, fn):
     t0 = time.perf_counter()
     try:
         r = fn()
-        outcome = f"OK | {json.dumps(r, default=str)}"
+        outcome = f'OK | {json.dumps(r, default=str)}'
     except Exception as e:
-        outcome = f"ERR | {type(e).__name__}: {e}"
+        outcome = f'ERR | {type(e).__name__}: {e}'
     ms = (time.perf_counter() - t0) * 1000.0
-    print(f"[regex-discovery] {label} | {ms:.2f}ms | {outcome}")
+    print(f'[regex-discovery] {label} | {ms:.2f}ms | {outcome}')
 
 
 class PathologicalRegex(unittest.TestCase):
     def test_panel(self):
-        A22 = "a" * 22
-        NEST40 = "(" * 40 + "a" + ")" * 40
-
-        record("P1_redos_nested_plus",      lambda: re_test("^(a+)+$", A22 + "!"))
-        record("P2_redos_alt_overlap",      lambda: re_test("^(a|aa)+$", A22 + "!"))
-        record("P3_empty_repeat_replace",   lambda: re_replace("a*", "abc", "X"))
-        record("P4_unicode_replace_dot",    lambda: re_replace(r"\.", "café.au.lait", "/"))
-        record("P5_unicode_find_codepoint", lambda: re_find("é", "café au lait"))
-        record("P6_deep_nesting_compile",   lambda: re_test(NEST40, "a"))
-        record("P7_big_bounded_quantifier", lambda: re_test("^a{0,10000}b$", "a" * 10 + "b"))
-        record("P8_invalid_pattern",        lambda: re_compile("[abc"))
-        record("P9_backref_re2_forbidden",  lambda: re_test(r"^(a+)\1$", "aaaa"))
-        record("P10_find_all_zero_width",   lambda: re_find_all("a*", "bbb"))
-
-
-if __name__ == "__main__":
+        A22 = 'a' * 22
+        NEST40 = '(' * 40 + 'a' + ')' * 40
+
+        record('P1_redos_nested_plus', lambda: re_test('^(a+)+$', A22 + '!'))
+        record('P2_redos_alt_overlap', lambda: re_test('^(a|aa)+$', A22 + '!'))
+        record('P3_empty_repeat_replace', lambda: re_replace('a*', 'abc', 'X'))
+        record('P4_unicode_replace_dot', lambda: re_replace(r'\.', 'café.au.lait', '/'))
+        record('P5_unicode_find_codepoint', lambda: re_find('é', 'café au lait'))
+        record('P6_deep_nesting_compile', lambda: re_test(NEST40, 'a'))
+        record('P7_big_bounded_quantifier', lambda: re_test('^a{0,10000}b$', 'a' * 10 + 'b'))
+        record('P8_invalid_pattern', lambda: re_compile('[abc'))
+        record('P9_backref_re2_forbidden', lambda: re_test(r'^(a+)\1$', 'aaaa'))
+        record('P10_find_all_zero_width', lambda: re_find_all('a*', 'bbb'))
+
+
+if __name__ == '__main__':
     unittest.main()
diff --git a/ruby/test_regex_pathological.rb b/ruby/test_regex_pathological.rb
index ec9ee17..1fde671 100644
--- a/ruby/test_regex_pathological.rb
+++ b/ruby/test_regex_pathological.rb
@@ -11,7 +11,7 @@ def record(label, &block)
   begin
     r = block.call
     outcome = "OK | #{JSON.generate(r)}"
-  rescue Exception => e
+  rescue StandardError => e
     outcome = "ERR | #{e.class.name}: #{e.message}"
   end
   ms = (Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0) * 1000.0
@@ -21,15 +21,15 @@ def record(label, &block)
 class PathologicalRegexTest < Minitest::Test
   def test_panel
     a22 = 'a' * 22
-    nest40 = ('(' * 40) + 'a' + (')' * 40)
+    nest40 = "#{'(' * 40}a#{')' * 40}"
 
-    record('P1_redos_nested_plus')      { VoxgigStruct.re_test('^(a+)+$', a22 + '!') }
-    record('P2_redos_alt_overlap')      { VoxgigStruct.re_test('^(a|aa)+$', a22 + '!') }
+    record('P1_redos_nested_plus')      { VoxgigStruct.re_test('^(a+)+$', "#{a22}!") }
+    record('P2_redos_alt_overlap')      { VoxgigStruct.re_test('^(a|aa)+$', "#{a22}!") }
     record('P3_empty_repeat_replace')   { VoxgigStruct.re_replace('a*', 'abc', 'X') }
     record('P4_unicode_replace_dot')    { VoxgigStruct.re_replace('\\.', 'café.au.lait', '/') }
     record('P5_unicode_find_codepoint') { VoxgigStruct.re_find('é', 'café au lait') }
     record('P6_deep_nesting_compile')   { VoxgigStruct.re_test(nest40, 'a') }
-    record('P7_big_bounded_quantifier') { VoxgigStruct.re_test('^a{0,10000}b$', ('a' * 10) + 'b') }
+    record('P7_big_bounded_quantifier') { VoxgigStruct.re_test('^a{0,10000}b$', "#{'a' * 10}b") }
     record('P8_invalid_pattern')        { VoxgigStruct.re_compile('[abc') }
     record('P9_backref_re2_forbidden')  { VoxgigStruct.re_test('^(a+)\\1$', 'aaaa') }
     record('P10_find_all_zero_width')   { VoxgigStruct.re_find_all('a*', 'bbb') }
diff --git a/rust/tests/regex_pathological.rs b/rust/tests/regex_pathological.rs
index 49bf5da..c82f7ac 100644
--- a/rust/tests/regex_pathological.rs
+++ b/rust/tests/regex_pathological.rs
@@ -41,24 +41,21 @@ fn regex_pathological_discovery() {
     record("P2_redos_alt_overlap", || {
         re_test("^(a|aa)+$", &(a22.clone() + "!"))
     });
-    record("P3_empty_repeat_replace", || {
-        re_replace("a*", "abc", "X")
-    });
+    record("P3_empty_repeat_replace", || re_replace("a*", "abc", "X"));
     record("P4_unicode_replace_dot", || {
         re_replace(r"\.", "café.au.lait", "/")
     });
-    record("P5_unicode_find_codepoint", || {
-        re_find("é", "café au lait")
-    });
+    record("P5_unicode_find_codepoint", || re_find("é", "café au lait"));
     record("P6_deep_nesting_compile", || re_test(&nest40, "a"));
     record("P7_big_bounded_quantifier", || {
         re_test("^a{0,10000}b$", &("a".repeat(10) + "b"))
     });
     record("P8_invalid_pattern", || {
-        re_compile("[abc").map(|_| ()).err().map(|e| format!("{:?}", e))
-    });
-    record("P9_backref_re2_forbidden", || {
-        re_test(r"^(a+)\1$", "aaaa")
+        re_compile("[abc")
+            .map(|_| ())
+            .err()
+            .map(|e| format!("{:?}", e))
     });
+    record("P9_backref_re2_forbidden", || re_test(r"^(a+)\1$", "aaaa"));
     record("P10_find_all_zero_width", || re_find_all("a*", "bbb"));
 }
diff --git a/swift/Tests/VoxgigStructTests/RegexPathologicalTests.swift b/swift/Tests/VoxgigStructTests/RegexPathologicalTests.swift
index f81f718..390182e 100644
--- a/swift/Tests/VoxgigStructTests/RegexPathologicalTests.swift
+++ b/swift/Tests/VoxgigStructTests/RegexPathologicalTests.swift
@@ -9,12 +9,12 @@ import XCTest
 final class RegexPathologicalTests: XCTestCase {
   private func record(_ label: String, _ fn: () -> Any?) {
     let t0 = DispatchTime.now()
-    let r = fn()
+    let value = fn()
     let elapsedNs = DispatchTime.now().uptimeNanoseconds - t0.uptimeNanoseconds
     let ms = Double(elapsedNs) / 1_000_000.0
     let outcome: String
-    if let r = r {
-      outcome = "OK | \(r)"
+    if let value = value {
+      outcome = "OK | \(value)"
     } else {
       outcome = "OK | null"
     }
@@ -24,16 +24,17 @@ final class RegexPathologicalTests: XCTestCase {
   func testPanel() {
     let a22 = String(repeating: "a", count: 22)
     let nest40 = String(repeating: "(", count: 40) + "a" + String(repeating: ")", count: 40)
+    let p7Input = String(repeating: "a", count: 10) + "b"
 
-    record("P1_redos_nested_plus")      { re_test(.string("^(a+)+$"), a22 + "!") }
-    record("P2_redos_alt_overlap")      { re_test(.string("^(a|aa)+$"), a22 + "!") }
-    record("P3_empty_repeat_replace")   { re_replace(.string("a*"), "abc", "X") }
-    record("P4_unicode_replace_dot")    { re_replace(.string("\\."), "café.au.lait", "/") }
+    record("P1_redos_nested_plus") { re_test(.string("^(a+)+$"), a22 + "!") }
+    record("P2_redos_alt_overlap") { re_test(.string("^(a|aa)+$"), a22 + "!") }
+    record("P3_empty_repeat_replace") { re_replace(.string("a*"), "abc", "X") }
+    record("P4_unicode_replace_dot") { re_replace(.string("\\."), "café.au.lait", "/") }
     record("P5_unicode_find_codepoint") { re_find(.string("é"), "café au lait") }
-    record("P6_deep_nesting_compile")   { re_test(.string(nest40), "a") }
-    record("P7_big_bounded_quantifier") { re_test(.string("^a{0,10000}b$"), String(repeating: "a", count: 10) + "b") }
-    record("P8_invalid_pattern")        { re_compile("[abc") as Any? }
-    record("P9_backref_re2_forbidden")  { re_test(.string("^(a+)\\1$"), "aaaa") }
-    record("P10_find_all_zero_width")   { re_find_all(.string("a*"), "bbb") }
+    record("P6_deep_nesting_compile") { re_test(.string(nest40), "a") }
+    record("P7_big_bounded_quantifier") { re_test(.string("^a{0,10000}b$"), p7Input) }
+    record("P8_invalid_pattern") { re_compile("[abc") as Any? }
+    record("P9_backref_re2_forbidden") { re_test(.string("^(a+)\\1$"), "aaaa") }
+    record("P10_find_all_zero_width") { re_find_all(.string("a*"), "bbb") }
   }
 }
diff --git a/typescript/dist-test/regex_pathological.test.js b/typescript/dist-test/regex_pathological.test.js
index 92de7bc..50202df 100644
--- a/typescript/dist-test/regex_pathological.test.js
+++ b/typescript/dist-test/regex_pathological.test.js
@@ -21,7 +21,6 @@ function record(label, fn) {
         outcome = `ERR | ${e && e.message ? e.message : String(e)}`;
     }
     const ms = Number(process.hrtime.bigint() - t0) / 1e6;
-    // eslint-disable-next-line no-console
     console.log(`[regex-discovery] ${label} | ${ms.toFixed(2)}ms | ${outcome}`);
 }
 (0, node_test_1.test)('regex pathological discovery', () => {
diff --git a/typescript/dist-test/regex_pathological.test.js.map b/typescript/dist-test/regex_pathological.test.js.map
index d0ff244..79b1e70 100644
--- a/typescript/dist-test/regex_pathological.test.js.map
+++ b/typescript/dist-test/regex_pathological.test.js.map
@@ -1 +1 @@
-{"version":3,"file":"regex_pathological.test.js","sourceRoot":"","sources":["../test/regex_pathological.test.ts"],"names":[],"mappings":";AAAA,gCAAgC;AAChC,EAAE;AACF,6EAA6E;AAC7E,oEAAoE;AACpE,sDAAsD;;AAEtD,yCAAgC;AAEhC,yDAE8B;AAE9B,SAAS,GAAG,CAAC,CAAS,EAAE,CAAS;IAC/B,OAAO,IAAI,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;AACjC,CAAC;AAED,SAAS,MAAM,CAAC,KAAa,EAAE,EAAiB;IAC9C,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,CAAA;IAClC,IAAI,OAAe,CAAA;IACnB,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,EAAE,EAAE,CAAA;QACd,OAAO,GAAG,QAAQ,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAA;IACvC,CAAC;IAAC,OAAO,CAAM,EAAE,CAAC;QAChB,OAAO,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAA;IAC7D,CAAC;IACD,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC,GAAG,GAAG,CAAA;IACrD,sCAAsC;IACtC,OAAO,CAAC,GAAG,CAAC,qBAAqB,KAAK,MAAM,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,OAAO,EAAE,CAAC,CAAA;AAC7E,CAAC;AAED,IAAA,gBAAI,EAAC,8BAA8B,EAAE,GAAG,EAAE;IACxC,MAAM,GAAG,GAAG,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAA;IACxB,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,GAAG,GAAG,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAA;IAEhD,MAAM,CAAC,sBAAsB,EAAM,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,SAAS,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACvE,MAAM,CAAC,sBAAsB,EAAM,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,WAAW,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACzE,MAAM,CAAC,yBAAyB,EAAG,GAAG,EAAE,CAAC,IAAA,0BAAU,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,CAAC,CAAC,CAAA;IACtE,MAAM,CAAC,wBAAwB,EAAI,GAAG,EAAE,CAAC,IAAA,0BAAU,EAAC,KAAK,EAAE,cAAc,EAAE,GAAG,CAAC,CAAC,CAAA;IAChF,MAAM,CAAC,2BAA2B,EAAC,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,GAAG,EAAE,cAAc,CAAC,CAAC,CAAA;IACtE,MAAM,CAAC,yBAAyB,EAAG,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAA;IAC9D,MAAM,CAAC,2BAA2B,EAAC,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,eAAe,EAAE,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAA;IACtF,MAAM,CAAC,oBAAoB,EAAQ,GAAG,EAAE,CAAC,IAAA,0BAAU,EAAC,MAAM,CAAC,CAAC,CAAA;IAC5D,MAAM,CAAC,0BAA0B,EAAE,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,WAAW,EAAE,
MAAM,CAAC,CAAC,CAAA;IACtE,MAAM,CAAC,yBAAyB,EAAG,GAAG,EAAE,CAAC,IAAA,2BAAW,EAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAA;AACpE,CAAC,CAAC,CAAA"}
\ No newline at end of file
+{"version":3,"file":"regex_pathological.test.js","sourceRoot":"","sources":["../test/regex_pathological.test.ts"],"names":[],"mappings":";AAAA,gCAAgC;AAChC,EAAE;AACF,6EAA6E;AAC7E,oEAAoE;AACpE,sDAAsD;;AAEtD,yCAAgC;AAEhC,yDAA6F;AAE7F,SAAS,GAAG,CAAC,CAAS,EAAE,CAAS;IAC/B,OAAO,IAAI,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;AACjC,CAAC;AAED,SAAS,MAAM,CAAC,KAAa,EAAE,EAAiB;IAC9C,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,CAAA;IAClC,IAAI,OAAe,CAAA;IACnB,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,EAAE,EAAE,CAAA;QACd,OAAO,GAAG,QAAQ,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAA;IACvC,CAAC;IAAC,OAAO,CAAM,EAAE,CAAC;QAChB,OAAO,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAA;IAC7D,CAAC;IACD,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC,GAAG,GAAG,CAAA;IACrD,OAAO,CAAC,GAAG,CAAC,qBAAqB,KAAK,MAAM,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,OAAO,EAAE,CAAC,CAAA;AAC7E,CAAC;AAED,IAAA,gBAAI,EAAC,8BAA8B,EAAE,GAAG,EAAE;IACxC,MAAM,GAAG,GAAG,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAA;IACxB,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,GAAG,GAAG,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAA;IAEhD,MAAM,CAAC,sBAAsB,EAAE,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,SAAS,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACnE,MAAM,CAAC,sBAAsB,EAAE,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,WAAW,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACrE,MAAM,CAAC,yBAAyB,EAAE,GAAG,EAAE,CAAC,IAAA,0BAAU,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,CAAC,CAAC,CAAA;IACrE,MAAM,CAAC,wBAAwB,EAAE,GAAG,EAAE,CAAC,IAAA,0BAAU,EAAC,KAAK,EAAE,cAAc,EAAE,GAAG,CAAC,CAAC,CAAA;IAC9E,MAAM,CAAC,2BAA2B,EAAE,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,GAAG,EAAE,cAAc,CAAC,CAAC,CAAA;IACvE,MAAM,CAAC,yBAAyB,EAAE,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAA;IAC7D,MAAM,CAAC,2BAA2B,EAAE,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,eAAe,EAAE,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAA;IACvF,MAAM,CAAC,oBAAoB,EAAE,GAAG,EAAE,CAAC,IAAA,0BAAU,EAAC,MAAM,CAAC,CAAC,CAAA;IACtD,MAAM,CAAC,0BAA0B,EAAE,GAAG,EAAE,CAAC,IAAA,uBAAO,EAAC,WAAW,EAAE,MAAM,CAAC,CAA
C,CAAA;IACtE,MAAM,CAAC,yBAAyB,EAAE,GAAG,EAAE,CAAC,IAAA,2BAAW,EAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAA;AACnE,CAAC,CAAC,CAAA"}
\ No newline at end of file
diff --git a/typescript/test/regex_pathological.test.ts b/typescript/test/regex_pathological.test.ts
index ce7dfa3..d1b18b9 100644
--- a/typescript/test/regex_pathological.test.ts
+++ b/typescript/test/regex_pathological.test.ts
@@ -6,9 +6,7 @@
 
 import { test } from 'node:test'
 
-import {
-  re_compile, re_test, re_find, re_find_all, re_replace,
-} from '../dist/StructUtility'
+import { re_compile, re_test, re_find, re_find_all, re_replace } from '../dist/StructUtility'
 
 function rep(s: string, n: number): string {
   return new Array(n + 1).join(s)
@@ -24,7 +22,6 @@ function record(label: string, fn: () => unknown): void {
     outcome = `ERR | ${e && e.message ? e.message : String(e)}`
   }
   const ms = Number(process.hrtime.bigint() - t0) / 1e6
-  // eslint-disable-next-line no-console
   console.log(`[regex-discovery] ${label} | ${ms.toFixed(2)}ms | ${outcome}`)
 }
 
@@ -32,14 +29,14 @@ test('regex pathological discovery', () => {
   const A22 = rep('a', 22)
   const NEST40 = rep('(', 40) + 'a' + rep(')', 40)
 
-  record('P1_redos_nested_plus',     () => re_test('^(a+)+$', A22 + '!'))
-  record('P2_redos_alt_overlap',     () => re_test('^(a|aa)+$', A22 + '!'))
-  record('P3_empty_repeat_replace',  () => re_replace('a*', 'abc', 'X'))
-  record('P4_unicode_replace_dot',   () => re_replace('\\.', 'café.au.lait', '/'))
-  record('P5_unicode_find_codepoint',() => re_find('é', 'café au lait'))
-  record('P6_deep_nesting_compile',  () => re_test(NEST40, 'a'))
-  record('P7_big_bounded_quantifier',() => re_test('^a{0,10000}b$', rep('a', 10) + 'b'))
-  record('P8_invalid_pattern',       () => re_compile('[abc'))
+  record('P1_redos_nested_plus', () => re_test('^(a+)+$', A22 + '!'))
+  record('P2_redos_alt_overlap', () => re_test('^(a|aa)+$', A22 + '!'))
+  record('P3_empty_repeat_replace', () => re_replace('a*', 'abc', 'X'))
+  record('P4_unicode_replace_dot', () => re_replace('\\.', 'café.au.lait', '/'))
+  record('P5_unicode_find_codepoint', () => re_find('é', 'café au lait'))
+  record('P6_deep_nesting_compile', () => re_test(NEST40, 'a'))
+  record('P7_big_bounded_quantifier', () => re_test('^a{0,10000}b$', rep('a', 10) + 'b'))
+  record('P8_invalid_pattern', () => re_compile('[abc'))
   record('P9_backref_re2_forbidden', () => re_test('^(a+)\\1$', 'aaaa'))
-  record('P10_find_all_zero_width',  () => re_find_all('a*', 'bbb'))
+  record('P10_find_all_zero_width', () => re_find_all('a*', 'bbb'))
 })

From bebad7da797093e6ac69f36217adf047ed3778dc Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 16 May 2026 15:10:38 +0000
Subject: [PATCH 6/6] c: clamp re_find_all group iteration to
 VS_REGEX_MAX_GROUPS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review feedback on PR #72: vs_regex_find_all always writes exactly
VS_REGEX_MAX_GROUPS (16) capture pairs per match row, regardless of
re->ngroups. The wrapper iterated up to ngroups and indexed
row_caps[2*g] for g >= 16, which reads past the row's 32 ints into
the NEXT match's data — an out-of-bounds read (undefined behaviour)
that silently corrupts captures for patterns with > 16 groups.

Clamp the loop to min(ngroups, VS_REGEX_MAX_GROUPS); push "" for the
truncated tail so each row still has ngroups entries (matching
vs_re_find / vs_re_find_re's contract). Corpus 1200/1200 still
passes, lint clean.

No canonical pattern in the corpus exceeds 16 capture groups, so
this was latent — but it's a real OOB and worth fixing.
---
 c/src/re_util.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/c/src/re_util.c b/c/src/re_util.c
index c12e56c..8533bd2 100644
--- a/c/src/re_util.c
+++ b/c/src/re_util.c
@@ -119,11 +119,16 @@ vs_strvec_vec vs_re_find_all_re(const vs_regex* re, const char* input) {
     max_matches *= 2;
   }
   int ngroups = vs_regex_ngroups(re);
+  /* vs_regex_find_all writes a fixed VS_REGEX_MAX_GROUPS pairs per row; any
+   * groups beyond that are silently dropped at the engine layer (the row
+   * isn't even wide enough to store them). Clamp here so we don't read past
+   * the row into the next match's bytes when ngroups > VS_REGEX_MAX_GROUPS. */
+  int capped = ngroups < VS_REGEX_MAX_GROUPS ? ngroups : VS_REGEX_MAX_GROUPS;
   for (int m = 0; m < count; m++) {
     int* row_caps = caps + m * per_row;
     vs_strvec row;
     vs_strvec_init(&row);
-    for (int g = 0; g < ngroups; g++) {
+    for (int g = 0; g < capped; g++) {
       int s = row_caps[2 * g], e = row_caps[2 * g + 1];
       if (s < 0 || e < s) {
         vs_strvec_push(&row, "");
@@ -131,6 +136,11 @@ vs_strvec_vec vs_re_find_all_re(const vs_regex* re, const char* input) {
         vs_strvec_push_n(&row, input + s, (size_t)(e - s));
       }
     }
+    /* Keep the row width == ngroups for caller consistency with
+     * vs_re_find/vs_re_find_re; the truncated groups are empty. */
+    for (int g = capped; g < ngroups; g++) {
+      vs_strvec_push(&row, "");
+    }
     vs_strvec_vec_push(&out, row);
   }
   free(caps);