diff --git a/.github/workflows/build-skill-gate.yml b/.github/workflows/build-skill-gate.yml
new file mode 100644
index 0000000000..e59477762b
--- /dev/null
+++ b/.github/workflows/build-skill-gate.yml
@@ -0,0 +1,79 @@
+name: Build Skill TDD Gate
+
+# Regenerates every host's skill docs, fails when the committed docs are out of
+# date, then runs the test:build-skill script. Runs on pull requests and pushes
+# to main that touch the paths listed below, and on manual dispatch.
+on:
+  pull_request:
+    branches: [main]
+    paths:
+      - "build/**"
+      - "bin/gstack-build"
+      - "scripts/gen-skill-docs.ts"
+      - "scripts/discover-skills.ts"
+      - "scripts/host-config.ts"
+      - "scripts/models.ts"
+      - "scripts/resolvers/**"
+      - "hosts/**"
+      - "test/gen-skill-docs.test.ts"
+      - "package.json"
+      - "bun.lock"
+      - ".github/workflows/build-skill-gate.yml"
+  push:
+    branches: [main]
+    paths:
+      - "build/**"
+      - "bin/gstack-build"
+      - "scripts/gen-skill-docs.ts"
+      - "scripts/discover-skills.ts"
+      - "scripts/host-config.ts"
+      - "scripts/models.ts"
+      - "scripts/resolvers/**"
+      - "hosts/**"
+      - "test/gen-skill-docs.test.ts"
+      - "package.json"
+      - "bun.lock"
+      - ".github/workflows/build-skill-gate.yml"
+  workflow_dispatch:
+
+# The steps below only check out the repo and run local commands: read-only token.
+permissions:
+  contents: read
+
+# One active run per ref; a newer run cancels the one still in progress.
+concurrency:
+  group: build-skill-gate-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-skill-tdd-gate:
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: latest
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      - name: Generate all host skill docs
+        run: bun run gen:skill-docs --host all
+
+      # The regeneration above must leave the working tree unchanged.
+      - name: Verify generated docs are fresh
+        run: |
+          # git diff only inspects tracked files, so also collect untracked,
+          # non-ignored files: a generated doc that was never committed is stale too.
+          untracked="$(git ls-files --others --exclude-standard)"
+          if ! git diff --exit-code || [ -n "$untracked" ]; then
+            [ -z "$untracked" ] || printf 'Untracked generated files:\n%s\n' "$untracked"
+            echo "Generated skill docs are stale. Run: bun run gen:skill-docs --host all"
+            exit 1
+          fi
+
+      - name: Run deterministic build skill gate
+        run: bun run test:build-skill
diff --git a/.github/workflows/version-gate.yml b/.github/workflows/version-gate.yml
index 262baf6ea4..8e1f35229c 100644
--- a/.github/workflows/version-gate.yml
+++ b/.github/workflows/version-gate.yml
@@ -34,7 +34,7 @@ jobs:
set -euo pipefail
PR_VERSION=$(cat VERSION | tr -d '[:space:]')
BASE_REF="${{ github.event.pull_request.base.ref }}"
- git fetch origin "$BASE_REF" --depth=1 --quiet || true
+ git fetch origin "$BASE_REF:refs/remotes/origin/$BASE_REF" --depth=1 --quiet || true
BASE_VERSION=$(git show "origin/$BASE_REF:VERSION" 2>/dev/null | tr -d '[:space:]' || echo "0.0.0.0")
{
echo "pr_version=$PR_VERSION"
@@ -48,6 +48,23 @@ jobs:
LEVEL=$(bun run scripts/detect-bump.ts "${{ steps.versions.outputs.base_version }}" "${{ steps.versions.outputs.pr_version }}")
echo "level=$LEVEL" >> "$GITHUB_OUTPUT"
+      # Runs scripts/detect-fork-version-repair.ts and publishes its stdout as the
+      # `is_repair` step output. Inputs are passed through env instead of inline
+      # expression expansion: pr_version is read from the PR's VERSION file, so
+      # splicing it into the script text would let a crafted value run as shell.
+      - name: Detect fork version repair
+        id: fork_repair
+        env:
+          BASE_REF: ${{ steps.versions.outputs.base_ref }}
+          BASE_VERSION: ${{ steps.versions.outputs.base_version }}
+          PR_VERSION: ${{ steps.versions.outputs.pr_version }}
+        run: |
+          IS_REPAIR=$(bun run scripts/detect-fork-version-repair.ts \
+            "$BASE_REF" \
+            "$BASE_VERSION" \
+            "$PR_VERSION")
+          echo "is_repair=$IS_REPAIR" >> "$GITHUB_OUTPUT"
+
- name: Query queue (util) — fail-open on error
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -70,5 +79,6 @@ jobs:
- name: Compare PR VERSION to next free slot
env:
PR_VERSION: ${{ steps.versions.outputs.pr_version }}
+ FORK_VERSION_REPAIR: ${{ steps.fork_repair.outputs.is_repair }}
run: |
bun run scripts/compare-pr-version.ts next.json "${{ github.event.pull_request.number }}"
diff --git a/.gitignore b/.gitignore
index 9e413bc56b..12030662cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ bin/gstack-global-discover
.claude/skills/
.claude/scheduled_tasks.lock
.claude/*.lock
+.claude/settings.local.json
.agents/
.factory/
.kiro/
@@ -26,6 +27,7 @@ extension/lib/xterm.js
extension/lib/xterm.css
extension/lib/xterm-addon-fit.js
.gstack-worktrees/
+.worktrees/
/tmp/
*.log
*.bun-build
@@ -37,3 +39,5 @@ supabase/.temp/
# Throughput analysis — local-only, regenerate via scripts/garry-output-comparison.ts
docs/throughput-*.json
+build/configure.cm
+.llm-tmp/
diff --git a/AGENTS.md b/AGENTS.md
index c1e5595fc5..e068829f13 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -18,6 +18,10 @@ Invoke them by name (e.g., `/office-hours`).
| `/plan-eng-review` | Lock architecture, data flow, edge cases, and tests. |
| `/plan-design-review` | Rate each design dimension 0-10, explain what a 10 looks like. |
| `/plan-devex-review` | DX-mode review: TTHW, magical moments, friction points, persona traces. |
+| `/plan-domain-review` | Domain-model review for bounded contexts, state, ownership, and events. |
+| `/plan-api-review` | API contract review for REST/gRPC/async interfaces and compatibility. |
+| `/plan-arch-review` | Second-pass software architecture review after eng review. |
+| `/plan-modernization-review` | Modernization review for modularization, migrations, and rollout hazards. |
| `/plan-tune` | Self-tune AskUserQuestion sensitivity per question. |
| `/autoplan` | One command runs CEO → design → eng → DX review. |
| `/design-consultation` | Build a complete design system from scratch. |
@@ -28,6 +32,7 @@ Invoke them by name (e.g., `/office-hours`).
|-------|-------------|
| `/review` | Pre-landing PR review. Finds bugs that pass CI but break in prod. |
| `/codex` | Second opinion via OpenAI Codex. Review, challenge, or consult modes. |
+| `/build` | Autonomous gstack execution loop for living implementation plans. |
| `/investigate` | Systematic root-cause debugging. No fixes without investigation. |
| `/design-review` | Live-site visual audit + fix loop with atomic commits. |
| `/design-shotgun` | Generate multiple AI design variants, comparison board, iterate. |
@@ -89,6 +94,7 @@ Invoke them by name (e.g., `/office-hours`).
```bash
bun install # install dependencies
bun test # run free tests (no API spend)
+bun run test:build-skill # focused verification for /build skill changes
bun run test:windows # curated Windows-safe subset (runs on windows-latest)
bun run build # generate docs + compile binaries
bun run gen:skill-docs # regenerate SKILL.md files from templates
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 937e67e37f..8efee1ce93 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -498,6 +498,28 @@ ongoing background sync; gbrain owns the daemon lifecycle.
- `/codex` adversarial review during `/ship` caught all three correctness bugs above (silent attach, preamble inconsistency, orphan leak) before merge. Find-cost: ~10 min CC. Production-bug-cost: stale code search results that "almost worked" — the worst kind to debug.
- gbrain CLI minimum version is now v0.30.0 (uses `sources attach`, which doesn't exist in v0.20.x). Run `cd ~/git/gbrain && git pull && bun install && bun link` to upgrade.
+## [1.28.0.0-fork] - 2026-05-09 (anbangr/gstack fork)
+
+## **The plan file now updates itself as your build runs. Two concurrent builds no longer crash each other.**
+
+Two runtime gaps closed in one release. First: the plan markdown was write-once at kickoff, then frozen while the build ran. Now `saveState` reconciles the plan file after every phase transition, flipping the matching checkboxes atomically via POSIX rename. Second: running two `/build` invocations on the same repo simultaneously caused both to crash at the `git checkout main` step. The fix replaces every local branch checkout with `git fetch origin` followed by branching directly from the remote tracking ref, which works correctly inside git linked worktrees.
+
+### Itemized changes
+
+#### Added
+- **Gate visibility reconciliation** in `build/orchestrator/cli.ts`: `phaseGateProjection`, `featureGateProjection`, `reconcileVisiblePlanState` wired into `saveState`.
+- **`setCheckboxState`** in `plan-mutator.ts`: bidirectional checkbox flip with optional marker verification.
+- **`setCheckboxStatusNote`** in `plan-mutator.ts`: append/replace/remove status note suffix atomically.
+- **`PhaseGate`, `FeatureGate`, `PlanGateState`** types + gate checkbox parsing in `parser.ts`.
+- 27 new orchestrator tests covering gate projection, reconcile, and worktree-safe git ops.
+
+#### Changed
+- **`syncLandedBase`**: removed `git checkout <base>` + `git pull`. Safe in linked worktrees.
+- **`ensureFeatureBranch`**: replaced checkout sequence with `git fetch origin <base>` + `git checkout -b <branch> origin/<base>`.
+- **`build/configure.cm`**: `primaryImpl`/`testFixer` → kimi. All timeouts → 1200000ms.
+
+---
+
## [1.28.0.0] - 2026-05-07
## **Browse handles real-world automation now: SOCKS5 with auth, container Xvfb, browser-native downloads. Plus a single-file `llms.txt` index agents can crawl in one read.**
@@ -862,6 +884,101 @@ end, just under the new "artifacts" terminology.
- **Preamble byte budget ratcheted from 35K to 36.5K** to honor the
remote-mode probe in `generate-brain-sync-block.ts`.
+## [1.26.7.0] - 2026-05-07
+
+## **`/build --dual-impl` is now model-agnostic instead of hardwired to Gemini versus Codex.**
+
+The build orchestrator now treats dual-implementation tournaments as configured primary and secondary roles. Implementors can be backed by Claude, Codex, Gemini, or Kimi, and the judge can use any supported provider while preserving isolated worktrees, recursive fix loops, judge hardening notes, and fail-closed resume behavior.
+
+### What you can now do
+
+- Configure primary, secondary, and judge roles independently for `--dual-impl` instead of being forced into Gemini primary, Codex secondary, and Claude judge.
+- Resume new dual-impl runs through generic `primary` / `secondary` state, worktree names, logs, and judge verdicts.
+- Keep old `--gemini-model`, `--codex-model`, and `--codex-review-model` flags working as compatibility aliases for primary, secondary, and review models.
+
+### What gets safer
+
+- Legacy persisted gemini/codex dual-impl state now fails with rerun guidance instead of being partially interpreted as the new state shape.
+- Judge output rejects stale `WINNER: gemini` and `WINNER: codex` values, requiring `WINNER: primary` or `WINNER: secondary`.
+- Sandboxed provider runs that can edit files but cannot write `.git` are recovered by the host, staging only summary-listed paths and cleaning generated cache noise before continuing.
+- The focused build-skill gate covers provider validation, state transitions, worktree setup, judge parsing, and generated docs.
+
+### Itemized changes
+
+#### Changed
+- `build/orchestrator/cli.ts` — routes dual implementors and judges through provider-aware dispatch, generic prompts, generic fix loops, and primary/secondary result handling.
+- `build/orchestrator/phase-runner.ts`, `types.ts`, and `worktree.ts` — replace gemini/codex dual state with candidate-keyed primary/secondary state.
+- `build/configure.cm` — updates default build routing for the configured model mix used by this branch.
+- `build/README.md`, `build/orchestrator/README.md`, and `build/SKILL.md.tmpl` — document model-agnostic dual-impl behavior and regenerated skill output.
+
+#### Added
+- `build/orchestrator/__tests__/cli.test.ts` — coverage for provider-agnostic dual-impl validation, prompts, and judge prompt formatting.
+- `build/orchestrator/__tests__/phase-runner.test.ts` — coverage for primary/secondary state transitions and legacy-state failure guidance.
+- `build/orchestrator/__tests__/sub-agents.test.ts` and `worktree.test.ts` — coverage for primary/secondary judge parsing and worktree naming.
+
+#### Fixed
+- `build/orchestrator/cli.ts` — recovers successful mutable agent runs when provider sandboxes block commits, using the agent summary as the allowlist for host-side staging.
+
+## [1.26.6.0] - 2026-05-07
+
+## **`/build` now catches dirty agent handoffs and classifies review timeouts more precisely.**
+
+The build orchestrator now treats a successful sub-agent exit as only one part of success. Implementor and review handoffs must leave useful output, commit when required, keep the child repo clean, and avoid mutating a parent workspace. This closes the class of failures where `/build` could continue after an agent claimed success while leaving scratch files, empty summaries, or changes in the wrong repo.
+
+### What you can now do
+
+- Run `/build` from nested workspaces with an explicit child project root, while workspace roots with immediate child repos are rejected unless `--allow-workspace-root` is set.
+- Let `/build` fail fast when implementors or review gates leave dirty repo state, miss required commits, or produce empty handoff summaries.
+- Run raw package `test` scripts through the detected package manager, including Bun-managed repos via `bun run test`.
+
+### What gets safer
+
+- Feature-review timeouts with pass evidence and no findings are preserved as tooling timeouts, while positive failure counts and explicit failure markers still stay conservative.
+- Test commands now run through the shell so quoted arguments survive.
+- Startup clean checks now include untracked files, preventing generated scratch files from slipping through the clean-worktree gate.
+
+### Itemized changes
+
+#### Added
+- `build/orchestrator/cli.ts` — post-agent hygiene snapshotting, parent-workspace mutation checks, and workspace-root selection validation.
+- `build/orchestrator/__tests__/cli.test.ts` — coverage for hygiene failures, parent workspace mutation detection, and `--allow-workspace-root`.
+- `build/orchestrator/__tests__/feature-review.test.ts` — timeout classification coverage for `0 failed`, positive failures, and explicit failure markers.
+
+#### Fixed
+- `build/orchestrator/sub-agents.ts` — maps raw package scripts to `bun run test`, `pnpm test`, `yarn test`, or `npm test` while preserving explicit test runner commands.
+- `build/orchestrator/feature-review.ts` — replaces broad `failed` timeout rejection with positive failure-count detection so `0 failed` can still count as pass evidence.
+- `build/orchestrator/phase-runner.ts` — surfaces hygiene failure messages directly in phase errors.
+
+## [1.26.5.0] - 2026-05-06
+
+## **`/build` survives transient Codex review transport drops without weakening sandbox policy.**
+
+Codex review, QA, and secondary review gates can now recover from the service disconnect path shown in the screenshot: `stream disconnected before completion`, TLS handshake EOFs, websocket connection failures, and Codex backend request-send failures. Those failures retry once inside `runCodexReview` with the same argv, cwd, model, prompt, and sandbox. Local sandbox blocks remain a separate path: only browser/socket/localhost permission failures can trigger the one-time `danger-full-access` gate retry.
+
+### What you can now do
+
+- **Resume `/build` review phases through transient Codex transport failures.** A dropped stream no longer fails the whole phase immediately; the Codex review runner retries once and writes the retry log as `phase----transport-retry.log`.
+- **Keep stale partial review output from poisoning retry verdicts.** The staged Codex output file is cleared before the retry, so a failed first attempt cannot leave an old `GATE FAIL` report that masks a clean retry.
+- **Keep sandbox escalation precise.** Codex service/network failures are not treated as workspace sandbox failures, and transport retries do not switch to `danger-full-access`.
+
+### What gets safer
+
+- **Review transport failure classification is now unit-tested.** The suite detects stream/TLS failures and websocket failures, while rejecting normal `GATE FAIL` reports and local sandbox permission failures.
+- **The live retry protocol is covered with a fake Codex binary.** The test proves the first invocation can fail after writing stale output, the retry starts with an empty output file, the final result passes, `retries === 1`, and the retry log path includes `transport-retry`.
+
+### Itemized changes
+
+#### Fixed
+- `build/orchestrator/sub-agents.ts` — adds Codex transport failure classification and one same-sandbox retry for non-zero Codex review exits caused by transient service/network errors.
+- `build/orchestrator/cli.ts` — keeps local sandbox-block retry classification separate from Codex service disconnects and routes explicit retry sandbox overrides through `runSlashCommand`.
+
+#### Added
+- `build/orchestrator/__tests__/sub-agents.test.ts` — classifier coverage plus a fake-binary `runCodexReview` retry test.
+- `build/orchestrator/__tests__/cli.test.ts` — sandbox retry classifier coverage, including the guard that transport disconnects are not sandbox failures.
+
+#### Changed
+- `build/README.md` and `build/orchestrator/README.md` — document the Codex review/QA sandbox override and the local verification sandbox retry behavior.
+
## [1.26.5.0] - 2026-05-06
## **The v1.26 memory feature now actually works on a fresh `/setup-gbrain` install, and `/sync-gbrain --full` actually registers github-hosted code sources.**
@@ -960,7 +1077,7 @@ Two functional gaps closed in one ship: the cwd repo wasn't actually being index
#### Changed
- `bin/gstack-gbrain-sync.ts` `runCodeImport` rewritten to use `gbrain sources add` + `gbrain sync --strategy code` (incremental) or `gbrain reindex-code --yes` (`--full`) instead of `gbrain import`. State file written via tmp+rename for atomicity.
- `setup-gbrain/SKILL.md.tmpl` Step 8 now writes both `## GBrain Configuration` AND `## GBrain Search Guidance` blocks, gated on Step 9 smoke test pass.
-- `scripts/resolvers/preamble/generate-brain-sync-block.ts` emits Variant A (4 lines, healthy) / Variant B (3 lines, empty corpus) / empty string (gbrain not configured). Reads cached cwd page_count from the state file (handles pretty + compact JSON via `tr -d '\n'` flatten).
+- `scripts/resolvers/preamble/generate-brain-sync-block.ts` emits Variant A (4 lines, healthy) / Variant B (3 lines, empty corpus) / empty string (gbrain not configured). Reads cached cwd page_count from the state file by matching the current repo `source_path`.
- `test/gen-skill-docs.test.ts` plan-review preamble byte budget bumped 33000 → 35000 to absorb the new context-load block.
- `test/gstack-gbrain-sync.test.ts` updated for native code surfaces (12 tests, was 8) — adds source-id derivation, dry-run no-lock, stale-lock takeover, fresh-lock blocking.
- `test/skill-e2e-memory-pipeline.test.ts` updated to assert `would: gbrain sources add` instead of `would: gbrain import`.
@@ -1202,6 +1319,87 @@ If you've been writing skill templates with `Recommendation: because 1` with `--dual-impl` until the
+ executor model can safely combine both workflows.
+
+## [1.25.1.1] - 2026-05-02
+
+## **Local Claude settings stay out of commits.**
+
+Host-local Claude settings are now ignored, so workspace-specific `.claude`
+configuration does not show up as accidental repository noise.
+
+### Fixed
+
+- `.claude/settings.local.json` is ignored as a local-only settings file.
+
+## [1.25.1.0] - 2026-05-02
+
+## **Build skills can launch the orchestrator even when spawned shells miss `PATH` setup.**
+
+The `/build` skill no longer assumes `gstack-build` is discoverable through the
+interactive shell's `PATH`. Before launch or resume, it now resolves an
+executable from `GSTACK_BUILD_CLI`, `command -v gstack-build`, host-specific
+Claude/Codex setup paths, or the current checkout's `bin/gstack-build`, then
+uses that resolved path for every background run.
+
+### Fixed
+
+- `/build` now launches and resumes through `_GSTACK_BUILD_CLI` instead of a bare
+ `gstack-build` command, fixing spawned-agent environments that could not find
+ the build CLI.
+- Generated Claude and Codex build skills get host-specific CLI candidates, so
+ Claude output does not contain Codex install paths and Codex output can use
+ `GSTACK_ROOT` when available.
+
+### Changed
+
+- Build documentation now describes the manual `PATH` requirement separately
+ from the `/build` skill's resolver order, including the explicit
+ `GSTACK_BUILD_CLI=/absolute/path/to/gstack-build` override.
+
+### Added
+
+- Regression coverage in `test/gen-skill-docs.test.ts` verifies generated build
+ skills use the resolver and do not regress to bare `gstack-build` launches.
+
+## [1.25.0.0] - 2026-05-02
+
+## **Fork customizations preserved while upgrading to upstream v1.25.0.0.**
+
+This fork keeps its custom `gstack-build` orchestration behavior while merging upstream releases. The upgrade path now treats the user's own gstack repository as the source of truth: fetch upstream, merge it into the local branch, resolve conflicts, regenerate skills, and push only to the user's fork.
+
+### Preserved local behavior
+
+- `gstack-build` recursive fix loops remain in place: review, reviewsecondary, and QA are expected to run fix-and-rerun loops until no issues remain.
+- Dual-implementor build hardening remains in place, including per-implementor test-fix iterations, judge hardening notes, resume SHA validation, and test-modification hygiene checks.
+- Build startup guardrails remain in place: dirty-tree checks, stale branch sweep, bounded branch processing, and restore-on-exit behavior.
+- `/gstack-upgrade` remains merge-based for customized installs. It must not hard-reset or replace the user's fork when upstream has a new release.
+
## [1.25.0.0] - 2026-05-01
## **Plan-mode skills surface every decision again, even when the host disallows AskUserQuestion.**
diff --git a/CLAUDE.md b/CLAUDE.md
index af3c58a02f..2ce43502ec 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -119,6 +119,9 @@ gstack/
├── codex/ # /codex skill (multi-AI second opinion via OpenAI Codex CLI)
├── land-and-deploy/ # /land-and-deploy skill (merge → deploy → canary verify)
├── office-hours/ # /office-hours skill (YC Office Hours — startup diagnostic + builder brainstorm)
+├── build/ # /build skill (autonomous plan executor: TDD loop, dual-impl, Codex review)
+│ ├── SKILL.md, SKILL.md.tmpl
+│ └── orchestrator/ # gstack-build CLI: cli.ts, phase-runner.ts, sub-agents.ts, worktree.ts, etc.
├── investigate/ # /investigate skill (systematic root-cause debugging)
├── retro/ # Retrospective skill (includes /retro global cross-project mode)
├── bin/ # CLI utilities (gstack-repo-mode, gstack-slug, gstack-config, etc.)
@@ -638,6 +641,17 @@ above, plus:
community PR, name the contributor with `Contributed by @username`. Contributors
did real work. Thank them publicly every time, no exceptions.
+## Fork versioning rule
+
+**Never bump the top-level `VERSION` file in this repo when working on fork-specific skills.**
+
+This repo (`anbangr/gstack`) is a personal fork of `garrytan/gstack`. The top-level `VERSION` file tracks the fork's release state relative to upstream. Bumping it creates divergence that makes `gstack-update-check` output confusing (`UPGRADE_AVAILABLE` with the local version higher than upstream).
+
+**The rule:**
+- Editing or building a custom skill (e.g. `build/SKILL.md.tmpl`)? Bump only the `version:` frontmatter field inside that skill file (e.g. `version: 1.19.0`). Do NOT touch `VERSION` or `package.json` version.
+- Merging upstream? Sync `VERSION` and `package.json` to upstream's version after the merge.
+- Only bump `VERSION` when merging or syncing with upstream, never for fork-local skill work.
+
## AI effort compression
When estimating or discussing effort, always show both human-team and CC+gstack time:
diff --git a/GSTACK_PLAYBOOK.md b/GSTACK_PLAYBOOK.md
new file mode 100644
index 0000000000..57460fcab7
--- /dev/null
+++ b/GSTACK_PLAYBOOK.md
@@ -0,0 +1,419 @@
+# GStack Playbook
+
+Practical guide for using gstack from idea to shipped product.
+
+If your host installs prefixed skills, replace `/skill-name` with `gstack-skill-name`.
+
+## Core Rule
+
+- `office-hours` decides what problem you are really solving.
+- `plan-ceo-review` decides what should be in scope.
+- `plan-eng-review` decides how to build it.
+- `review` checks the real diff.
+- `qa` checks the real app.
+- `ship` and `land-and-deploy` finish the job.
+
+## Default Workflow
+
+### 1. Start from zero
+
+Use when the idea is fuzzy or you want sharper framing.
+
+```text
+/office-hours I want to build an internal support copilot for our sales team.
+```
+
+Pass:
+- Idea or problem statement
+- Optional context: startup/business vs builder/hackathon
+
+Output:
+- Design doc in `~/.gstack/projects/...`
+
+### 2. Challenge scope
+
+Use if scope, ambition, or wedge is still uncertain.
+
+```text
+/plan-ceo-review hold scope on this plan
+```
+
+Pass:
+- The current plan or design doc
+- Optional mode:
+ - `scope expansion`
+ - `selective expansion`
+ - `hold scope`
+ - `scope reduction`
+
+Output:
+- Updated plan guidance
+- Review report in the plan file
+- Sometimes a separate CEO plan artifact
+
+### 3. Make it buildable
+
+Use after the direction is approved.
+
+```text
+/plan-eng-review break this into PR-sized migration phases with rollback points
+```
+
+Pass:
+- The approved plan
+- Optional focus:
+ - architecture
+ - migration phases
+ - tests
+ - performance
+ - failure modes
+ - rollout and rollback
+
+Output:
+- Buildable implementation plan
+- Test plan artifact for `/qa`
+
+### 4. Add specialist reviews only when needed
+
+For user-facing UI:
+
+```text
+/plan-design-review focus on onboarding, empty states, and mobile
+```
+
+For developer-facing products:
+
+```text
+/plan-devex-review dx polish for first-time API users
+```
+
+If you want the whole plan stack automatically:
+
+```text
+/autoplan
+```
+
+### 5. Build
+
+Implement from the reviewed plan file, not from scattered notes.
+
+```text
+/build
+```
+
+Recommended pattern:
+- Build in phases
+- Keep diffs small
+- Re-run `/review` after each meaningful phase (the `/build` skill can automate this loop)
+
+### 6. Debug when something breaks
+
+```text
+/investigate checkout sometimes double-submits on refresh
+```
+
+Use for:
+- bugs
+- regressions
+- 500s
+- confusing behavior
+
+### 7. Review the actual diff
+
+```text
+/review
+```
+
+Optional focus:
+
+```text
+/review focus on concurrency and trust boundaries
+```
+
+Use after code exists, before merge.
+
+### 8. QA the real app
+
+If you want testing plus fixes:
+
+```text
+/qa
+/qa https://staging.myapp.com
+```
+
+If you want report-only:
+
+```text
+/qa-only
+/qa-only https://staging.myapp.com
+```
+
+Useful modes:
+
+```text
+/qa --quick
+/qa --regression baseline.json
+```
+
+If authentication is needed:
+
+```text
+/setup-browser-cookies
+/setup-browser-cookies github.com
+```
+
+### 9. Run specialist post-build audits if needed
+
+Visual polish:
+
+```text
+/design-review https://myapp.com
+```
+
+Developer onboarding:
+
+```text
+/devex-review try the quickstart for this CLI
+```
+
+Performance:
+
+```text
+/benchmark https://myapp.com
+```
+
+Security:
+
+```text
+/cso
+/cso comprehensive
+```
+
+### 10. Ship
+
+Create or update the PR and do release prep:
+
+```text
+/ship
+```
+
+### 11. Merge and deploy
+
+One-time deploy setup:
+
+```text
+/setup-deploy
+```
+
+Then:
+
+```text
+/land-and-deploy
+```
+
+### 12. Watch production
+
+```text
+/canary https://myapp.com
+```
+
+### 13. Sync docs
+
+```text
+/document-release
+```
+
+### 14. Close the loop
+
+Project retro:
+
+```text
+/retro
+```
+
+Cross-project retro:
+
+```text
+/retro global
+```
+
+## Decision Tree
+
+### If the problem is still fuzzy
+
+- Run `/office-hours`
+
+### If scope is unclear
+
+- Add `/plan-ceo-review`
+
+### If you need a technical plan
+
+- Run `/plan-eng-review`
+
+### If UI/UX is central
+
+- Add `/plan-design-review`
+
+### If developers are the user
+
+- Add `/plan-devex-review`
+
+### If you want all plan reviews automatically
+
+- Run `/autoplan`
+
+### If code already exists and you want risk review
+
+- Run `/review`
+
+### If you want real browser testing
+
+- Run `/qa` or `/qa-only`
+
+### If something is broken and root cause is unclear
+
+- Run `/investigate`
+
+### If the branch is ready to land
+
+- Run `/ship`
+
+## Invocation Cheat Sheet
+
+| Skill | What to pass | Example |
+|-------|--------------|---------|
+| `/office-hours` | idea/problem statement | `/office-hours We want to simplify support handoffs.` |
+| `/plan-ceo-review` | plan + optional scope mode | `/plan-ceo-review scope reduction` |
+| `/plan-eng-review` | plan + optional technical focus | `/plan-eng-review focus on migration safety` |
+| `/plan-design-review` | plan + optional UI focus | `/plan-design-review focus on mobile and empty states` |
+| `/plan-devex-review` | plan + optional DX mode | `/plan-devex-review dx triage for this CLI` |
+| `/autoplan` | current plan | `/autoplan` |
+| `/build` | usually nothing | `/build` |
+| `/design-consultation` | product, audience, desired feel | `/design-consultation B2B analytics app, serious and high-trust` |
+| `/design-shotgun` | screen/page description | `/design-shotgun pricing page for a dev tools product` |
+| `/design-html` | approved design, mockup, or description | `/design-html build the approved dashboard design` |
+| `/investigate` | bug/error/symptom | `/investigate users get logged out after password reset` |
+| `/review` | usually nothing, optional focus | `/review` |
+| `/qa` | optional URL or mode | `/qa https://staging.myapp.com` |
+| `/qa-only` | optional URL | `/qa-only https://staging.myapp.com` |
+| `/design-review` | live URL | `/design-review https://myapp.com` |
+| `/devex-review` | onboarding or docs target | `/devex-review try the getting-started flow` |
+| `/benchmark` | usually URL | `/benchmark https://myapp.com` |
+| `/cso` | optional mode | `/cso daily` |
+| `/ship` | usually nothing | `/ship` |
+| `/setup-deploy` | usually nothing | `/setup-deploy` |
+| `/land-and-deploy` | usually nothing | `/land-and-deploy` |
+| `/canary` | production URL | `/canary https://myapp.com` |
+| `/document-release` | usually nothing | `/document-release` |
+| `/retro` | optional `global` | `/retro global` |
+| `/learn` | plain-English action | `/learn show project learnings` |
+| `/open-gstack-browser` | usually nothing | `/open-gstack-browser` |
+| `/setup-browser-cookies` | optional domain | `/setup-browser-cookies github.com` |
+| `/pair-agent` | target agent in plain English | `/pair-agent connect Codex to this browser session` |
+| `/careful` | nothing | `/careful` |
+| `/freeze` | directory path | `/freeze src/payments` |
+| `/guard` | usually a directory path | `/guard src/billing` |
+| `/unfreeze` | nothing | `/unfreeze` |
+| `/context-save` | optional note | `/context-save save release prep context` |
+| `/context-restore` | optional hint | `/context-restore resume payment refactor` |
+| `/plan-tune` | plain-English preference | `/plan-tune stop asking repeated scope questions` |
+| `/gstack-upgrade` | nothing | `/gstack-upgrade` |
+
+## Recommended Flows
+
+### New product
+
+```text
+/office-hours
+/plan-ceo-review
+/plan-eng-review
+/plan-design-review or /plan-devex-review if needed
+/build
+/review
+
+/qa
+/ship
+/land-and-deploy
+/document-release
+/retro
+```
+
+### Internal refactor
+
+```text
+/plan-eng-review
+/build
+/review after each phase
+/qa if behavior changed
+/ship
+```
+
+### UI-heavy feature
+
+```text
+/office-hours
+/plan-ceo-review
+/plan-design-review
+/plan-eng-review
+/build
+/design-review
+/qa
+/ship
+```
+
+### API, SDK, CLI, docs feature
+
+```text
+/office-hours
+/plan-ceo-review
+/plan-devex-review
+/plan-eng-review
+/build
+/devex-review
+/review
+/ship
+```
+
+## Utility Notes
+
+### `/browse`
+
+`/browse` is a browser toolbelt, not just a one-shot skill. After invoking it, use `$B ...` commands.
+
+Examples:
+
+```text
+$B goto https://myapp.com
+$B snapshot -i
+$B click @e3
+$B screenshot /tmp/homepage.png
+```
+
+### Safety defaults
+
+When work is risky:
+
+```text
+/careful
+/freeze src/payments
+```
+
+Or both:
+
+```text
+/guard src/payments
+```
+
+### Context management
+
+If work spans sessions:
+
+```text
+/context-save
+/context-restore
+```
+
+## One-line Summary
+
+Use `office-hours` to frame, `plan-ceo-review` to scope, `plan-eng-review` to plan the build, `review` to check the diff, `qa` to test the app, and `ship` plus `land-and-deploy` to finish the job.
diff --git a/README.md b/README.md
index 87f2d5ddd6..be8e71553b 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ Fork it. Improve it. Make it yours. And if you want to hate on free open source
Open Claude Code and paste this. Claude does the rest.
-> Install gstack: run **`git clone --single-branch --depth 1 https://github.com/garrytan/gstack.git ~/.claude/skills/gstack && cd ~/.claude/skills/gstack && ./setup`** then add a "gstack" section to CLAUDE.md that says to use the /browse skill from gstack for all web browsing, never use mcp\_\_claude-in-chrome\_\_\* tools, and lists the available skills: /office-hours, /plan-ceo-review, /plan-eng-review, /plan-design-review, /design-consultation, /design-shotgun, /design-html, /review, /ship, /land-and-deploy, /canary, /benchmark, /browse, /connect-chrome, /qa, /qa-only, /design-review, /setup-browser-cookies, /setup-deploy, /setup-gbrain, /retro, /investigate, /document-release, /codex, /cso, /autoplan, /plan-devex-review, /devex-review, /careful, /freeze, /guard, /unfreeze, /gstack-upgrade, /learn. Then ask the user if they also want to add gstack to the current project so teammates get it.
+> Install gstack: run **`git clone --single-branch --depth 1 https://github.com/garrytan/gstack.git ~/.claude/skills/gstack && cd ~/.claude/skills/gstack && ./setup`** then add a "gstack" section to CLAUDE.md that says to use the /browse skill from gstack for all web browsing, never use mcp\_\_claude-in-chrome\_\_\* tools, and lists the available skills: /office-hours, /plan-ceo-review, /plan-domain-review, /plan-api-review, /plan-modernization-review, /plan-eng-review, /plan-design-review, /design-consultation, /design-shotgun, /design-html, /review, /ship, /land-and-deploy, /canary, /benchmark, /browse, /connect-chrome, /qa, /qa-only, /design-review, /setup-browser-cookies, /setup-deploy, /setup-gbrain, /sync-gbrain, /retro, /investigate, /document-release, /codex, /cso, /autoplan, /plan-devex-review, /devex-review, /careful, /freeze, /guard, /unfreeze, /gstack-upgrade, /learn. Then ask the user if they also want to add gstack to the current project so teammates get it.
### Step 2: Team mode — auto-update for shared repos (recommended)
@@ -180,6 +180,9 @@ Each skill feeds into the next. `/office-hours` writes a design doc that `/plan-
|-------|----------------|--------------|
| `/office-hours` | **YC Office Hours** | Start here. Six forcing questions that reframe your product before you write code. Pushes back on your framing, challenges premises, generates implementation alternatives. Design doc feeds into every downstream skill. |
| `/plan-ceo-review` | **CEO / Founder** | Rethink the problem. Find the 10-star product hiding inside the request. Four modes: Expansion, Selective Expansion, Hold Scope, Reduction. |
+| `/plan-domain-review` | **Domain Architect** | Interactive domain-model pass for workflow-heavy plans. Clarifies glossary, bounded contexts, ownership seams, state transitions, and domain events without defaulting to CQRS. |
+| `/plan-api-review` | **API Designer** | Interactive contract pass for endpoints, services, webhooks, and event payloads. Locks in interface style, versioning, compatibility, error model, idempotency, and rate-limit expectations. |
+| `/plan-modernization-review` | **Modernization Lead** | Interactive migration pass for modularization, service extraction, and strangler-style rollouts. Clarifies current state, target state, phases, rollback points, and migration hazards. |
| `/plan-eng-review` | **Eng Manager** | Lock in architecture, data flow, diagrams, edge cases, and tests. Forces hidden assumptions into the open. |
| `/plan-design-review` | **Senior Designer** | Rates each design dimension 0-10, explains what a 10 looks like, then edits the plan to get there. AI Slop detection. Interactive — one AskUserQuestion per design choice. |
| `/plan-devex-review` | **Developer Experience Lead** | Interactive DX review: explores developer personas, benchmarks against competitors' TTHW, designs your magical moment, traces friction points step by step. Three modes: DX EXPANSION, DX POLISH, DX TRIAGE. 20-45 forcing questions. |
@@ -211,9 +214,15 @@ Each skill feeds into the next. `/office-hours` writes a design doc that `/plan-
|-----------------|--------------------------|----------------------------|
| **End users** (UI, web app, mobile) | `/plan-design-review` | `/design-review` |
| **Developers** (API, CLI, SDK, docs) | `/plan-devex-review` | `/devex-review` |
+| **Workflow-heavy business logic** | `/plan-domain-review` | — |
+| **Public or cross-service interfaces** | `/plan-api-review` | — |
+| **Migrations and decomposition** | `/plan-modernization-review` | — |
| **Architecture** (data flow, perf, tests) | `/plan-eng-review` | `/review` |
| **All of the above** | `/autoplan` (runs CEO → design → eng → DX, auto-detects which apply) | — |
+The three targeted architecture reviews are manual in v1. A good default sequence is:
+`/office-hours` → `/plan-ceo-review` → one or more of `/plan-domain-review`, `/plan-api-review`, `/plan-modernization-review` → `/plan-eng-review`.
+
### Power tools
| Skill | What it does |
@@ -464,10 +473,12 @@ Data is stored in [Supabase](https://supabase.com) (open source Firebase alterna
## gstack
Use /browse from gstack for all web browsing. Never use mcp__claude-in-chrome__* tools.
Available skills: /office-hours, /plan-ceo-review, /plan-eng-review, /plan-design-review,
+ /plan-domain-review, /plan-api-review, /plan-modernization-review, /plan-devex-review,
/design-consultation, /design-shotgun, /design-html, /review, /ship, /land-and-deploy,
/canary, /benchmark, /browse, /open-gstack-browser, /qa, /qa-only, /design-review,
-/setup-browser-cookies, /setup-deploy, /setup-gbrain, /sync-gbrain, /retro, /investigate, /document-release,
-/codex, /cso, /autoplan, /pair-agent, /careful, /freeze, /guard, /unfreeze, /gstack-upgrade, /learn.
+/devex-review, /setup-browser-cookies, /setup-deploy, /setup-gbrain, /sync-gbrain,
+/retro, /investigate, /document-release, /codex, /cso, /autoplan, /pair-agent,
+/careful, /freeze, /guard, /unfreeze, /gstack-upgrade, /learn.
```
## License
diff --git a/TODOS.md b/TODOS.md
index 0516f972e1..00573b0127 100644
--- a/TODOS.md
+++ b/TODOS.md
@@ -256,6 +256,7 @@ made opt-in. Lower priority than the gbrain-side perf issue above.
**Depends on:** v1.8.0.0 telemetry in production. P1 self-authoring commands.
---
+
## Sidebar Terminal (cc-pty-import follow-ups)
### v1.1: PTY session survives sidebar reload
@@ -375,6 +376,7 @@ scope of that PR; deliberately deferred to keep PTY-import small.
**Effort:** L (human: ~1-2 weeks / CC+gstack: ~2-3 hours for design doc + first-pass implementation).
**Priority:** P1 if interactive-skill volume is growing; P2 otherwise.
**Depends on / blocked by:** design doc — likely its own `docs/designs/STOP_ASK_ENFORCEMENT_V0.md`.
+
## Context skills
### `/context-save --lane` + `/context-restore --lane` for parallel workstreams
@@ -617,6 +619,7 @@ score SAFE 0.98+, attacks score INJECTION 0.99+). Pre-impl gate 3 (benign corpus
forced this pivot — see `~/.gstack/projects/garrytan-gstack/ceo-plans/2026-04-19-prompt-injection-guard.md`.
**What shipped in v1:**
+
- `browse/src/security.ts` — canary injection + check, verdict combiner (ensemble rule),
attack log with rotation, cross-process session state, status reporting
- `browse/src/security-classifier.ts` — TestSavantAI ONNX classifier + Haiku transcript
@@ -779,37 +782,40 @@ threshold (user-input default unchanged for SO-FP mitigation).
#### ~~Adversarial + integration + smoke-bench test suites (P1)~~ — SHIPPED
Four test files shipped this round:
- * `browse/test/security-adversarial.test.ts` (94a83c50) — 23 canary-channel
- + verdict-combiner attack-shape tests
- * `browse/test/security-integration.test.ts` (07745e04) — 10 layer-coexistence
- + defense-in-depth regression guards
- * `browse/test/security-live-playwright.test.ts` (b9677519) — 7 live-Chromium
- fixture tests (5 deterministic + 2 ML, skipped if model cache absent)
- * `browse/test/security-bench.test.ts` (afc6661f) — BrowseSafe-Bench 200-case
- smoke harness with hermetic dataset cache + v1 baseline metrics
+
+- `browse/test/security-adversarial.test.ts` (94a83c50) — 23 canary-channel +
+  verdict-combiner attack-shape tests
+- `browse/test/security-integration.test.ts` (07745e04) — 10 layer-coexistence +
+  defense-in-depth regression guards
+- `browse/test/security-live-playwright.test.ts` (b9677519) — 7 live-Chromium
+ fixture tests (5 deterministic + 2 ML, skipped if model cache absent)
+- `browse/test/security-bench.test.ts` (afc6661f) — BrowseSafe-Bench 200-case
+ smoke harness with hermetic dataset cache + v1 baseline metrics
#### Bun-native 5ms inference (P3 research) — SKELETON SHIPPED, forward pass open
Research skeleton landed this round (browse/src/security-bunnative.ts,
docs/designs/BUN_NATIVE_INFERENCE.md, browse/test/security-bunnative.test.ts):
- * Pure-TS WordPiece tokenizer — reads HF tokenizer.json directly, matches
- transformers.js output on fixture strings (correctness-tested in CI)
- * Stable `classify()` API that current callers can wire against today
- * Benchmark harness with p50/p95/p99 reporting — anchors v1 WASM baseline
- for future regressions
+- Pure-TS WordPiece tokenizer — reads HF tokenizer.json directly, matches
+ transformers.js output on fixture strings (correctness-tested in CI)
+- Stable `classify()` API that current callers can wire against today
+- Benchmark harness with p50/p95/p99 reporting — anchors v1 WASM baseline
+ for future regressions
Design doc captures the roadmap:
- * Approach A: pure-TS + Float32Array SIMD — ruled out (can't beat WASM)
- * Approach B: Bun FFI + Apple Accelerate cblas_sgemm — target ~3-6ms p50,
- macOS-only, ~1000 LOC
- * Approach C: Bun WebGPU — unexplored, worth a spike
+
+- Approach A: pure-TS + Float32Array SIMD — ruled out (can't beat WASM)
+- Approach B: Bun FFI + Apple Accelerate cblas_sgemm — target ~3-6ms p50,
+ macOS-only, ~1000 LOC
+- Approach C: Bun WebGPU — unexplored, worth a spike
Remaining work (XL, multi-week):
- * FFI proof-of-concept for cblas_sgemm
- * Single transformer layer implementation + correctness check vs onnxruntime
- * Full forward pass + weight loader + correctness regression fixtures
- * Production swap in security-bunnative.ts `classify()` body
+
+- FFI proof-of-concept for cblas_sgemm
+- Single transformer layer implementation + correctness check vs onnxruntime
+- Full forward pass + weight loader + correctness regression fixtures
+- Production swap in security-bunnative.ts `classify()` body
## Builder Ethos
@@ -836,6 +842,7 @@ Remaining work (XL, multi-week):
**Context:** Google shipped Chrome DevTools MCP in Chrome 146+ (June 2025). It provides screenshots, console messages, performance traces, Lighthouse audits, and full page interaction through the user's real browser. gstack should use it for real-session access while keeping Playwright for headless CI/testing workflows.
Potential new skills:
+
- `/debug-browser`: JS error tracing with source-mapped stack traces
- `/perf-debug`: performance traces, Core Web Vitals, network waterfall
@@ -1098,7 +1105,6 @@ Linux cookie import shipped in v0.11.11.0 (Wave 3). Supports Chrome, Chromium, B
**Priority:** P2
**Depends on:** None
-
### Visual verification with screenshots in PR body
**What:** /ship Step 7.5: screenshot key pages after push, embed in PR body.
@@ -1258,8 +1264,6 @@ Linux cookie import shipped in v0.11.11.0 (Wave 3). Supports Chrome, Chromium, B
**Priority:** P3
**Depends on:** Video recording
-
-
### Extend worktree isolation to Claude E2E tests
**What:** Add `useWorktree?: boolean` option to `runSkillTest()` so any Claude E2E test can opt into worktree mode for full repo context instead of tmpdir fixtures.
@@ -1410,7 +1414,6 @@ Shipped in v0.8.3. Step 8.5 added to `/ship` — after creating the PR, `/ship`
**Priority:** P3
**Depends on:** gstack-diff-scope (shipped)
-
## Codex
### Codex→Claude reverse buddy check skill
@@ -1462,6 +1465,7 @@ Shipped in v0.6.5. TemplateContext in gen-skill-docs.ts bakes skill name into pr
**Context:** All items are prose additions to `investigate/SKILL.md.tmpl`. No new scripts.
**Items:**
+
1. Stack trace auto-detection for freeze directory (parse deepest app frame)
2. Freeze boundary widening (ask to widen instead of hard-block when hitting boundary)
3. Post-fix auto-unfreeze + full test suite run
@@ -1643,8 +1647,36 @@ Shipped in v0.6.5. TemplateContext in gen-skill-docs.ts bakes skill name into pr
**Priority:** P2
**Depends on:** CDP patches proving the value of anti-bot stealth first
+---
+
+## Fork overlay follow-ons
+
+### Auto-discover and install new skills from fork repo
+
+**What:** When `fork_repo_path` is configured, Step 4.8 currently overlays only SKILL.md.tmpl files that already exist in `$INSTALL_DIR`. If the fork adds a brand-new skill (e.g., a `custom-build/SKILL.md.tmpl` that doesn't exist upstream), it is silently skipped — Step 4.9 only syncs dirs that already exist in the gemini/kimi host dirs.
+
+**Fix needed:**
+
+1. After the existing copy loop in Step 4.8, detect skill dirs present in `$_FORK_REPO` but absent from `$INSTALL_DIR`. For each missing dir, copy it to `$INSTALL_DIR` and report "new skill installed: `<skill-dir>`".
+2. Step 4.9 sync loop should create missing skill dirs in `.gemini/skills/gstack/` and `.kimi/skills/gstack/` rather than only updating existing ones.
+
+**Why deferred:** The current loop structure uses `git diff --name-only | grep '/SKILL\.md\.tmpl$'` which only surfaces CHANGED files — files absent from the base ref are not included in the diff. Detecting new skills requires comparing `$_FORK_REPO`'s skill dirs against `$INSTALL_DIR` directly (a `comm -23` or `find` approach), which is a separate code path.
+
+**Effort:** S (human: ~1 hour / CC: ~10 min)
+**Priority:** P2
+
## Completed
+### Dual Implementor foundation + fix loops + hardening notes (v1.15.0.0 – v1.23.0.0)
+
+- **Phase 1/2 (v1.15.0.0):** `worktree.ts` with `createWorktrees`/`applyWinner`/`teardownWorktrees`, 6 new `PhaseStatus` values, `DualImplState`/`DualImplTestResult` interfaces, `phase-runner.ts` with `RUN_DUAL_IMPL`/`RUN_DUAL_TESTS`/`RUN_JUDGE_OPUS`/`APPLY_WINNER` action types, full transition test coverage.
+- **Phase 5 (v1.15.0.0):** `README.md` dual-impl section, `integration.test.ts` dry-run test with `--dual-impl --dry-run`.
+- **Fix loops + hardening (v1.23.0.0):** `runDualImplFixLoop` recursive fix passes (up to `DEFAULT_MAX_TEST_ITERATIONS`), per-iteration `fixHistory` threaded to the Opus judge, `HARDENING:` block flowing into Codex review prompt, SHA validation on resume, test hygiene gate before auto-select.
+
+**Completed:** v1.23.0.0 (2026-04-29)
+
+---
+
### Slim preamble + real-PTY plan-mode E2E harness (v1.13.1.0)
- Compressed 18 preamble resolvers; total `SKILL.md` corpus dropped from 3.08 MB to 2.30 MB across 47 outputs (-25.5%, ~196K tokens saved).
@@ -1687,23 +1719,26 @@ Shipped in v0.6.5. TemplateContext in gen-skill-docs.ts bakes skill name into pr
---
### Overlay efficacy harness + Opus 4.7 fanout nudge removal (v1.10.1.0)
+
- Built `test/skill-e2e-overlay-harness.test.ts`, a parametric periodic-tier eval that drives `@anthropic-ai/claude-agent-sdk` and measures first-turn fanout rate (overlay-ON vs overlay-OFF) across registered fixtures
- Measured the original "Fan out explicitly" overlay nudge: baseline Opus 4.7 = 70% first-turn fanout on toy prompt, with our nudge = 10%, with Anthropic's own canonical `` text = 0%
- Removed the counterproductive nudge from `model-overlays/opus-4-7.md`
- Shipped 36-test free-tier unit suite for the SDK runner + strict fixture validator
- Registered `overlay-harness-opus-4-7-fanout-{toy,realistic}` in E2E_TOUCHFILES and E2E_TIERS
- Total investigation cost: ~$7 across 3 eval runs
-**Completed:** v1.10.1.0
+ **Completed:** v1.10.1.0
### CI eval pipeline (v0.9.9.0)
+
- GitHub Actions eval upload on Ubicloud runners ($0.006/run)
- Within-file test concurrency (test() → testConcurrentIfSelected())
- Eval artifact upload + PR comment with pass/fail + cost
- Baseline comparison via artifact download from main
- EVALS_CONCURRENCY=40 for ~6min wall clock (was ~18min)
-**Completed:** v0.9.9.0
+ **Completed:** v0.9.9.0
### Deploy pipeline (v0.9.8.0)
+
- /land-and-deploy — merge PR, wait for CI/deploy, canary verification
- /canary — post-deploy monitoring loop with anomaly detection
- /benchmark — performance regression detection with Core Web Vitals
@@ -1712,41 +1747,81 @@ Shipped in v0.6.5. TemplateContext in gen-skill-docs.ts bakes skill name into pr
- E2E model pinning (Sonnet default, Opus for quality tests)
- E2E timing telemetry (first_response_ms, max_inter_turn_ms, wall_clock_ms)
- test:e2e:fast tier, --retry 2 on all E2E scripts
-**Completed:** v0.9.8.0
+ **Completed:** v0.9.8.0
### Phase 1: Foundations (v0.2.0)
+
- Rename to gstack
- Restructure to monorepo layout
- Setup script for skill symlinks
- Snapshot command with ref-based element selection
- Snapshot tests
-**Completed:** v0.2.0
+ **Completed:** v0.2.0
### Phase 2: Enhanced Browser (v0.2.0)
+
- Annotated screenshots, snapshot diffing, dialog handling, file upload
- Cursor-interactive elements, element state checks
- CircularBuffer, async buffer flush, health check
- Playwright error wrapping, useragent fix
- 148 integration tests
-**Completed:** v0.2.0
+ **Completed:** v0.2.0
### Phase 3: QA Testing Agent (v0.3.0)
+
- /qa SKILL.md with 6-phase workflow, 3 modes (full/quick/regression)
- Issue taxonomy, severity classification, exploration checklist
- Report template, health score rubric, framework detection
- wait/console/cookie-import commands, find-browse binary
-**Completed:** v0.3.0
+ **Completed:** v0.3.0
### Phase 3.5: Browser Cookie Import (v0.3.x)
+
- cookie-import-browser command (Chromium cookie DB decryption)
- Cookie picker web UI, /setup-browser-cookies skill
- 18 unit tests, browser registry (Comet, Chrome, Arc, Brave, Edge)
-**Completed:** v0.3.1
+ **Completed:** v0.3.1
### E2E test cost tracking
+
- Track cumulative API spend, warn if over threshold
-**Completed:** v0.3.6
+ **Completed:** v0.3.6
### Auto-upgrade mode + smart update check
+
- Config CLI (`bin/gstack-config`), auto-upgrade via `~/.gstack/config.yaml`, 12h cache TTL, exponential snooze backoff (24h→48h→1wk), "never ask again" option, vendored copy sync on upgrade
-**Completed:** v0.3.8
+ **Completed:** v0.3.8
+
+---
+
+## P3: Build orchestrator gate reconciler — architectural follow-ups (v1.28.0.0 deferrals)
+
+Explicitly deferred from the v1.28.0.0 /plan-eng-review. Ship now; revisit when the gate system has been dogfooded across multiple plan shapes.
+
+### Batch plan-file reads in `reconcileVisiblePlanState`
+
+**What:** `setCheckboxState` reads + writes the full plan file once per gate flip. For a 10-phase plan with 5 gates each, a full reconcile does up to 50 sequential file reads/writes on one `saveState` call. Hoist the `readFileSync`/`split` into `reconcileVisiblePlanState` (or expose a `applyCheckboxStateToLines` helper), apply all mutations to the in-memory lines array in a single pass, then call `writePlanContentAtomic` once.
+
+**Why:** Correctness is fine — each write is atomic and the reconcile only runs once per phase transition (not in a tight loop). But on slow disks or NFS mounts the per-gate latency compounds. The batched design also simplifies reasoning about consistency: one read, one write, one atomic rename.
+
+**Effort:** S (human: ~half day / CC: ~20 min)
+**Priority:** P3
+
+### Extract gate markers and projection to `gate-reconciler.ts`
+
+**What:** Move `PHASE_GATE_MARKERS`, `FEATURE_GATE_MARKERS`, `phaseGateProjection`, `featureGateProjection`, `reconcilePhaseVisibleGates`, `reconcileFeatureVisibleGates`, and `reconcileVisiblePlanState` out of `cli.ts` into a new `build/orchestrator/gate-reconciler.ts`. Export `featureGateProjection` so it can be unit-tested directly alongside `phaseGateProjection`.
+
+**Why:** `cli.ts` is already large. The gate reconciler is a self-contained subsystem with clear inputs (phase/feature state + plan file path) and outputs (checkbox mutations). Separating it makes the module boundary explicit, reduces `cli.ts` size, and allows `featureGateProjection` to be tested in isolation rather than only through `reconcileVisiblePlanState`.
+
+**Effort:** S (human: ~2 hours / CC: ~15 min)
+**Priority:** P3
+
+### Thread `visiblePlanProjection` as a parameter
+
+**What:** Replace the module-level `let visiblePlanProjection: ... | null = null` singleton in `cli.ts` with an explicit parameter threaded through `saveState`. Or expose setter/getter functions (`setVisiblePlanProjection` / `clearVisiblePlanProjection`) to make the mutation surface explicit and testable.
+
+**Why:** The current singleton is set in one location (~line 5508) and mutated in another (~lines 6110-6112) with no clear boundary. This is hard to reason about and untestable in isolation. After `gate-reconciler.ts` extraction above, threading the projection as a param is straightforward.
+
+**Effort:** XS (human: ~1 hour / CC: ~10 min)
+**Priority:** P3
+**Depends on:** gate-reconciler.ts extraction above
diff --git a/autoplan/SKILL.md b/autoplan/SKILL.md
index c64e6e8bd9..75a5e6fb50 100644
--- a/autoplan/SKILL.md
+++ b/autoplan/SKILL.md
@@ -1719,7 +1719,7 @@ If Phase 3.5 ran (DX scope), also log:
SOURCE = "codex+subagent", "codex-only", "subagent-only", or "unavailable".
Replace N values with actual consensus counts from the tables.
-Suggest next step: `/ship` when ready to create the PR.
+Suggest next step: print the canonical build command with the absolute source-plan path, e.g. `/build /abs/path/to/source-plan.md`. If the approved plan came from the current conversation rather than a saved file, save it first and print the saved absolute path. Use `/ship` only after `/build` has implemented and committed the plan.
---
diff --git a/autoplan/SKILL.md.tmpl b/autoplan/SKILL.md.tmpl
index 6577a6725c..0242d675f6 100644
--- a/autoplan/SKILL.md.tmpl
+++ b/autoplan/SKILL.md.tmpl
@@ -889,7 +889,7 @@ If Phase 3.5 ran (DX scope), also log:
SOURCE = "codex+subagent", "codex-only", "subagent-only", or "unavailable".
Replace N values with actual consensus counts from the tables.
-Suggest next step: `/ship` when ready to create the PR.
+Suggest next step: print the canonical build command with the absolute source-plan path, e.g. `/build /abs/path/to/source-plan.md`. If the approved plan came from the current conversation rather than a saved file, save it first and print the saved absolute path. Use `/ship` only after `/build` has implemented and committed the plan.
---
diff --git a/bin/gstack-brain-reader b/bin/gstack-brain-reader
deleted file mode 120000
index 712ce87e69..0000000000
--- a/bin/gstack-brain-reader
+++ /dev/null
@@ -1 +0,0 @@
-gstack-brain-consumer
\ No newline at end of file
diff --git a/bin/gstack-brain-reader b/bin/gstack-brain-reader
new file mode 100755
index 0000000000..12403ae580
--- /dev/null
+++ b/bin/gstack-brain-reader
@@ -0,0 +1,201 @@
+#!/usr/bin/env bash
+# gstack-brain-consumer — manage the consumer (reader) registry.
+#
+# DEPRECATED in v1.17.0.0. This binary targets a gbrain HTTP /ingest-repo
+# endpoint that never shipped on the gbrain side. Live federation now uses
+# `gbrain sources` directly via bin/gstack-gbrain-source-wireup. This file
+# stays for one cycle to avoid breaking external scripts; removal in v1.18.0.0.
+#
+# Consumer = a reader that ingests the gstack-brain git repo as a source of
+# session memory. v1 primary consumer is GBrain; later versions can register
+# Codex, OpenClaw, or third-party readers.
+#
+# NOTE ON NAMING: internally this helper uses "consumer" (correct data-model
+# term). User-facing copy and the alias `gstack-brain-reader` use "reader"
+# (matches user mental model: "what's reading my brain?").
+#
+# Usage:
+#   gstack-brain-consumer add <name> --ingest-url <url> [--token <token>]
+#   gstack-brain-consumer list
+#   gstack-brain-consumer remove <name>
+#   gstack-brain-consumer test <name>
+#
+# Env:
+# GSTACK_HOME — override ~/.gstack
+
+set -euo pipefail
+
+GSTACK_HOME="${GSTACK_HOME:-$HOME/.gstack}"
+CONSUMERS_FILE="$GSTACK_HOME/consumers.json"
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+CONFIG_BIN="$SCRIPT_DIR/gstack-config"
+
+ensure_file() {
+ mkdir -p "$GSTACK_HOME"
+ if [ ! -f "$CONSUMERS_FILE" ]; then
+ echo '{"consumers": []}' > "$CONSUMERS_FILE"
+ fi
+}
+
+get_remote_url() {  # origin URL of the git repo at $GSTACK_HOME, or an empty line when git reports none
+  git -C "$GSTACK_HOME" remote get-url origin 2>/dev/null || printf '\n'
+}
+
+# sub_add <name> --ingest-url <url> [--token <token>]: upsert a consumer, store its token, then probe it.
+sub_add() {
+  local name="" url="" token=""
+  while [ $# -gt 0 ]; do
+    case "$1" in
+      # ${2?...} aborts with a clear message instead of set -u's "unbound variable" when the value is missing.
+      --ingest-url) url="${2?--ingest-url requires a value}"; shift 2 ;;
+      --token) token="${2?--token requires a value}"; shift 2 ;;
+      --) shift; break ;;
+      -*) echo "Unknown flag: $1" >&2; exit 1 ;;
+      *) name="$1"; shift ;;  # when several positionals are given, the last one wins
+    esac
+  done
+  if [ -z "$name" ] || [ -z "$url" ]; then
+    echo "Usage: gstack-brain-consumer add <name> --ingest-url <url> [--token <token>]" >&2
+    exit 1
+  fi
+  ensure_file
+  # Upsert in consumers.json, store token in gstack-config under `<name>_token`.
+  python3 - "$CONSUMERS_FILE" "$name" "$url" <<'PYEOF'
+import sys, json
+path, name, url = sys.argv[1:4]
+try:
+    with open(path) as f:
+        data = json.load(f)
+except Exception:
+    data = {"consumers": []}
+entry = {"name": name, "ingest_url": url, "status": "unknown", "token_ref": f"{name}_token"}
+cs = data.setdefault("consumers", [])
+for i, c in enumerate(cs):
+    if c.get("name") == name:
+        cs[i] = entry
+        break
+else:
+    cs.append(entry)
+with open(path, "w") as f:
+    json.dump(data, f, indent=2)
+    f.write("\n")
+print(f"registered consumer: {name}")
+PYEOF
+  if [ -n "$token" ]; then
+    "$CONFIG_BIN" set "${name}_token" "$token"
+    echo "token stored: gstack-config get ${name}_token to retrieve"
+  fi
+  # Attempt registration with remote (HTTP POST).
+  sub_test "$name"
+}
+
+sub_list() {  # print the registry JSON, or an empty registry when the file is absent
+  if [ -f "$CONSUMERS_FILE" ]; then
+    cat "$CONSUMERS_FILE"
+  else
+    echo '{"consumers": []}'
+  fi
+}
+
+sub_remove() {  # sub_remove <name>: delete every registry entry whose name matches, report the count
+  local name="${1:-}"
+  if [ -z "$name" ]; then
+    echo "Usage: gstack-brain-consumer remove <name>" >&2
+    exit 1
+  fi
+  ensure_file
+  python3 - "$CONSUMERS_FILE" "$name" <<'PYEOF'
+import sys, json
+path, name = sys.argv[1:3]
+try:
+    with open(path) as f:
+        data = json.load(f)
+except Exception:
+    data = {"consumers": []}
+before = len(data.get("consumers", []))
+data["consumers"] = [c for c in data.get("consumers", []) if c.get("name") != name]
+after = len(data["consumers"])
+with open(path, "w") as f:
+    json.dump(data, f, indent=2)
+    f.write("\n")
+print(f"removed: {before - after} entry(ies)")
+PYEOF
+}
+
+sub_test() {
+ local name="${1:-}"
+ if [ -z "$name" ]; then
+ echo "Usage: gstack-brain-consumer test " >&2
+ exit 1
+ fi
+ ensure_file
+ # Look up the consumer by name.
+ local info
+ info=$(python3 - "$CONSUMERS_FILE" "$name" <<'PYEOF'
+import sys, json
+path, name = sys.argv[1:3]
+try:
+ with open(path) as f:
+ data = json.load(f)
+except Exception:
+ data = {"consumers": []}
+for c in data.get("consumers", []):
+ if c.get("name") == name:
+ print(c.get("ingest_url", ""))
+ sys.exit(0)
+sys.exit(1)
+PYEOF
+ ) || { echo "No such consumer: $name" >&2; exit 1; }
+
+ local url="$info"
+ local token
+ token=$("$CONFIG_BIN" get "${name}_token" 2>/dev/null || echo "")
+ if [ -z "$url" ] || [ -z "$token" ]; then
+ echo "consumer '$name': url or token missing; cannot test"
+ return 0
+ fi
+ local repo_url
+ repo_url=$(get_remote_url)
+ echo "Testing $name at ${url%/}/ingest-repo ..."
+ local resp
+ resp=$(curl -sS -X POST "${url%/}/ingest-repo" \
+ -H "Authorization: Bearer $token" \
+ -H "Content-Type: application/json" \
+ --data "{\"repo_url\":\"$repo_url\"}" \
+ -w "\n%{http_code}" 2>&1 || echo -e "\ncurl-error")
+ local code
+ code=$(echo "$resp" | tail -1)
+ if [ "$code" = "200" ] || [ "$code" = "201" ] || [ "$code" = "204" ]; then
+ echo "ok (HTTP $code)"
+ # Update status in consumers.json.
+ python3 - "$CONSUMERS_FILE" "$name" "ok" <<'PYEOF'
+import sys, json
+path, name, status = sys.argv[1:4]
+with open(path) as f: data = json.load(f)
+for c in data.get("consumers", []):
+ if c.get("name") == name:
+ c["status"] = status
+with open(path, "w") as f: json.dump(data, f, indent=2); f.write("\n")
+PYEOF
+ else
+ echo "failed (HTTP $code)"
+ python3 - "$CONSUMERS_FILE" "$name" "error" <<'PYEOF'
+import sys, json
+path, name, status = sys.argv[1:4]
+with open(path) as f: data = json.load(f)
+for c in data.get("consumers", []):
+ if c.get("name") == name:
+ c["status"] = status
+with open(path, "w") as f: json.dump(data, f, indent=2); f.write("\n")
+PYEOF
+ fi
+}
+
+case "${1:-}" in
+  add) shift; sub_add "$@" ;;
+  list) sub_list ;;
+  remove) shift; sub_remove "$@" ;;
+  test) shift; sub_test "$@" ;;
+  --help|-h|"") sed -n '2,24p' "$0" | sed 's/^# \{0,1\}//' ;;  # header comment (script lines 2-24, through Env:) doubles as help text
+  *) echo "Unknown subcommand: $1" >&2; exit 1 ;;
+esac
diff --git a/bin/gstack-build b/bin/gstack-build
new file mode 100755
index 0000000000..dd3a044c8f
--- /dev/null
+++ b/bin/gstack-build
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# gstack-build — code-driven phase orchestrator for the /build skill.
+#
+# Thin wrapper around build/orchestrator/cli.ts. Matches the convention
+# used by every other bin/ script in this repo (gstack-config, gstack-slug,
+# gstack-update-check, etc.) — bash wrapper invoking the implementation.
+#
+# Compiled binaries via `bun build --compile` were tried and got SIGKILL'd
+# by macOS Gatekeeper in some environments; bash + bun run is reliable.
+#
+# Usage: gstack-build [flags] (-h for help)
+
+set -euo pipefail
+
+# Resolve the directory this script lives in, following symlinks.
+SCRIPT_PATH="${BASH_SOURCE[0]}"
+while [ -L "$SCRIPT_PATH" ]; do
+ SCRIPT_PATH=$(readlink "$SCRIPT_PATH")
+done
+SCRIPT_DIR=$(cd "$(dirname "$SCRIPT_PATH")" && pwd)
+GSTACK_ROOT=$(cd "$SCRIPT_DIR/.." && pwd)
+
+if ! command -v bun >/dev/null 2>&1; then
+ echo "gstack-build: bun is required but not on PATH" >&2
+ echo " install: curl -fsSL https://bun.sh/install | bash" >&2
+ exit 127
+fi
+
+exec bun run "$GSTACK_ROOT/build/orchestrator/cli.ts" "$@"
diff --git a/bin/gstack-build-phase-guardrail b/bin/gstack-build-phase-guardrail
new file mode 100755
index 0000000000..d4d81a86c4
--- /dev/null
+++ b/bin/gstack-build-phase-guardrail
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# gstack-build-phase-guardrail — verify a feature completed cleanly after ship
+#
+# Usage: gstack-build-phase-guardrail <living-plan-file> <feature-branch> <project-root>
+#
+# Outputs a single line:
+# GUARDRAIL: PASS
+#   GUARDRAIL: FAIL: <reason>
+#
+# Checks:
+# 1. PR for the feature branch is merged (not open) — uses gh pr view; fails closed on gh errors
+# 2. Feature branch is merged into origin/main — uses PR state to handle squash/rebase merges
+# 3. Local working tree has no staged/unstaged changes
+#
+# Note: broader feat/* branch hygiene (unmerged siblings from other devs) is
+# handled by the startup sweep gate (--skip-sweep bypasses it), not here.
+
+set -euo pipefail
+
+PLAN_FILE="${1:?living-plan-file required}"
+FEATURE_BRANCH="${2:?feature-branch required}"
+PROJECT_ROOT="${3:?project-root required}"
+
+fail() { printf 'GUARDRAIL: FAIL: %s\n' "$1"; exit 1; }
+
+# Require absolute path for PLAN_FILE so the cd below doesn't break resolution
+[[ "$PLAN_FILE" = /* ]] || fail "plan file must be an absolute path: $PLAN_FILE"
+
+cd "$PROJECT_ROOT" || fail "cannot cd to project root: $PROJECT_ROOT"
+
+[ -f "$PLAN_FILE" ] || fail "plan file not found: $PLAN_FILE"
+
+# 1. PR state check — fail closed on any gh error (auth, network, missing remote, etc.)
+#    gh pr view returns non-zero for branches with no PR; treat that as "not merged".
+pr_state=$(gh pr view "$FEATURE_BRANCH" --json state --jq '.state' 2>/dev/null) || {
+  # Distinguish "no PR found" from "gh error"; re-run to capture the message gh printed.
+  gh_err=$(gh pr view "$FEATURE_BRANCH" --json state 2>&1 || true)
+  # grep -E with a here-string: POSIX alternation (no GNU-only `\|`), and no `echo | grep -q`
+  # pipe whose early close could trip pipefail and misreport a real match as a miss.
+  if grep -qiE "no pull requests found|could not find" <<<"$gh_err"; then
+    fail "no PR found for branch $FEATURE_BRANCH"
+  else
+    fail "gh pr view failed (auth/network/config error?) — output: ${gh_err:0:200}"
+  fi
+}
+
+case "$pr_state" in
+  MERGED) ;;  # good — fall through to check 2
+  OPEN)
+    fail "PR for $FEATURE_BRANCH is still open"
+    ;;
+  CLOSED)
+    fail "PR for $FEATURE_BRANCH was closed without merging"
+    ;;
+  *)
+    fail "unexpected PR state '$pr_state' for $FEATURE_BRANCH"
+    ;;
+esac
+
+# 2. Refresh origin/main and sanity-check shared history.
+#    git branch -r --merged misses squash and rebase merges because those strategies
+#    do not create a merge commit, so the PR MERGED state (checked above) is the
+#    authoritative merge signal; this step only makes sure origin/main can be fetched.
+git fetch origin main 2>/dev/null || fail "git fetch origin main failed — check network/auth"
+
+# Sanity check: HEAD and the freshly fetched origin/main must share a merge base.
+# NOTE(review): this only proves a common ancestor exists — it does not verify that
+# origin/main contains the feature's changes; confirm whether a stronger check is wanted.
+merge_base=$(git merge-base HEAD origin/main 2>/dev/null || true)
+[ -n "$merge_base" ] || fail "could not compute merge base between HEAD and origin/main"
+
+# 3. Tracked files must be clean. Untracked entries ("??") are filtered out — .llm-tmp/ cleanup is best-effort.
+tracked_changes=$(git status --porcelain 2>/dev/null | grep -v "^??" || true)
+if [ -n "$tracked_changes" ]; then fail "working tree has staged/unstaged changes (run 'git status' to inspect)"; fi
+
+echo 'GUARDRAIL: PASS'
diff --git a/bin/gstack-config b/bin/gstack-config
index 0cec75b6a5..59630e409e 100755
--- a/bin/gstack-config
+++ b/bin/gstack-config
@@ -85,6 +85,16 @@ CONFIG_HEADER='# gstack configuration — edit freely, changes take effect on ne
# # Non-Conductor users can point this at any directory
# # that holds parallel worktrees of the same repo.
#
+# ─── Fork skill overlay ───────────────────────────────────────────────
+# fork_repo_path: # Absolute path to your local gstack fork repo.
+# # When set, /gstack-upgrade diffs SKILL.md.tmpl files
+# # from the fork against the installed gstack, copies any
+# # that differ, regenerates SKILL.md for all hosts
+# # (claude + codex), and syncs gemini/kimi skill dirs.
+# # Runs even when no upstream upgrade is available.
+# # Set with:
+# # gstack-config set fork_repo_path /path/to/your/gstack
+#
'
# DEFAULTS table — canonical default values for known keys.
@@ -104,6 +114,7 @@ lookup_default() {
gstack_contributor) echo "false" ;;
skip_eng_review) echo "false" ;;
workspace_root) echo "$HOME/conductor/workspaces" ;;
+ fork_repo_path) echo "" ;;
cross_project_learnings) echo "" ;; # intentionally empty → unset triggers first-time prompt
artifacts_sync_mode) echo "off" ;;
artifacts_sync_mode_prompted) echo "false" ;;
@@ -119,7 +130,9 @@ case "${1:-}" in
echo "Error: key must contain only alphanumeric characters and underscores" >&2
exit 1
fi
- VALUE=$(grep -E "^${KEY}:" "$CONFIG_FILE" 2>/dev/null | tail -1 | awk '{print $2}' | tr -d '[:space:]' || true)
+ # Take the last "KEY:" line and keep everything after the first colon.
+ # A '#' starts an inline comment only at the start of the value or after
+ # whitespace, so values that merely contain '#' (e.g. /path/with#hash) are
+ # kept intact. Trailing whitespace, including a stray CR, is trimmed.
+ VALUE=$(grep -E "^${KEY}:" "$CONFIG_FILE" 2>/dev/null | tail -1 \
+   | sed -e 's/^[^:]*:[[:space:]]*//' \
+         -e 's/^#.*$//' \
+         -e 's/[[:space:]][[:space:]]*#.*$//' \
+         -e 's/[[:space:]]*$//' || true)
if [ -z "$VALUE" ]; then
VALUE=$(lookup_default "$KEY")
fi
@@ -142,6 +155,17 @@ case "${1:-}" in
echo "Warning: artifacts_sync_mode '$VALUE' not recognized. Valid values: off, artifacts-only, full. Using off." >&2
VALUE="off"
fi
+ if [ "$KEY" = "fork_repo_path" ] && [ -n "$VALUE" ]; then
+   # A non-empty value must be absolute: stripping a leading "/" changes
+   # nothing only when the value does not start with one.
+   if [ "${VALUE#/}" = "$VALUE" ]; then
+     echo "Error: fork_repo_path must be an absolute path (got: $VALUE)" >&2
+     exit 1
+   fi
+   # The remaining checks only warn; the value is still accepted.
+   if [ -d "$VALUE" ]; then
+     [ -f "$VALUE/gstack-upgrade/SKILL.md.tmpl" ] \
+       || echo "Warning: $VALUE doesn't look like a gstack repo (missing gstack-upgrade/SKILL.md.tmpl)" >&2
+   else
+     echo "Warning: fork_repo_path directory does not exist: $VALUE" >&2
+   fi
+ fi
mkdir -p "$STATE_DIR"
# Write annotated header on first creation
if [ ! -f "$CONFIG_FILE" ]; then
@@ -170,9 +194,11 @@ case "${1:-}" in
echo "# ─── Active values (including defaults for unset keys) ───"
for KEY in proactive routing_declined telemetry auto_upgrade update_check \
skill_prefix checkpoint_mode checkpoint_push codex_reviews \
- gstack_contributor skip_eng_review workspace_root \
+ gstack_contributor skip_eng_review workspace_root fork_repo_path \
artifacts_sync_mode artifacts_sync_mode_prompted; do
- VALUE=$(grep -E "^${KEY}:" "$CONFIG_FILE" 2>/dev/null | tail -1 | awk '{print $2}' | tr -d '[:space:]' || true)
+ # Take the last "KEY:" line and keep everything after the first colon.
+ # A '#' starts an inline comment only at the start of the value or after
+ # whitespace, so values that merely contain '#' (e.g. /path/with#hash) are
+ # kept intact. Trailing whitespace, including a stray CR, is trimmed.
+ VALUE=$(grep -E "^${KEY}:" "$CONFIG_FILE" 2>/dev/null | tail -1 \
+   | sed -e 's/^[^:]*:[[:space:]]*//' \
+         -e 's/^#.*$//' \
+         -e 's/[[:space:]][[:space:]]*#.*$//' \
+         -e 's/[[:space:]]*$//' || true)
SOURCE="default"
if [ -n "$VALUE" ]; then
SOURCE="set"
@@ -186,7 +212,7 @@ case "${1:-}" in
echo "# gstack-config defaults"
for KEY in proactive routing_declined telemetry auto_upgrade update_check \
skill_prefix checkpoint_mode checkpoint_push codex_reviews \
- gstack_contributor skip_eng_review workspace_root \
+ gstack_contributor skip_eng_review workspace_root fork_repo_path \
artifacts_sync_mode artifacts_sync_mode_prompted; do
printf ' %-24s %s\n' "$KEY:" "$(lookup_default "$KEY")"
done
diff --git a/bin/gstack-update-check b/bin/gstack-update-check
index 31e9fdb6f8..a0d9f895b1 100755
--- a/bin/gstack-update-check
+++ b/bin/gstack-update-check
@@ -3,7 +3,7 @@
#
# Output (one line, or nothing):
# JUST_UPGRADED — marker found from recent upgrade
-# UPGRADE_AVAILABLE — remote VERSION differs from local
+# UPGRADE_AVAILABLE — remote VERSION is greater than local
# (nothing) — up to date, snoozed, disabled, or check skipped
#
# Env overrides (for testing):
@@ -99,6 +99,29 @@ check_snooze() {
return 1 # snooze expired
}
+# version_gt LEFT RIGHT
+# Returns 0 when dotted version LEFT is strictly greater than RIGHT, 1 otherwise
+# (including when they are equal). Components are compared numerically from
+# left to right; a missing, empty, or non-numeric component counts as 0, so
+# "1.0" and "1.0.0.0" compare equal. Every component of the longer version is
+# examined, not just the first four.
+version_gt() {
+  local left="$1"
+  local right="$2"
+  # Split on dots only; IFS is local so the caller's word splitting is untouched.
+  local IFS=.
+  local -a left_parts right_parts
+  read -r -a left_parts <<< "$left"
+  read -r -a right_parts <<< "$right"
+  # Walk as many positions as the longer of the two versions has.
+  local count="${#left_parts[@]}"
+  if [ "${#right_parts[@]}" -gt "$count" ]; then
+    count="${#right_parts[@]}"
+  fi
+  local i l r
+  for ((i = 0; i < count; i++)); do
+    l="${left_parts[$i]:-0}"
+    r="${right_parts[$i]:-0}"
+    # Anything that is not a plain run of digits is treated as 0 so the
+    # numeric tests below never see a malformed operand.
+    case "$l" in *[!0-9]*|'') l=0 ;; esac
+    case "$r" in *[!0-9]*|'') r=0 ;; esac
+    if [ "$l" -gt "$r" ]; then
+      return 0
+    fi
+    if [ "$l" -lt "$r" ]; then
+      return 1
+    fi
+  done
+  # All components equal: not strictly greater.
+  return 1
+}
+
# ─── Step 1: Read local version ──────────────────────────────
LOCAL=""
if [ -f "$VERSION_FILE" ]; then
@@ -144,6 +167,10 @@ if [ -f "$CACHE_FILE" ]; then
CACHED_OLD="$(echo "$CACHED" | awk '{print $2}')"
if [ "$CACHED_OLD" = "$LOCAL" ]; then
CACHED_NEW="$(echo "$CACHED" | awk '{print $3}')"
+ # A cached notice may name a version that is not newer than the local one;
+ # in that case overwrite the cache with UP_TO_DATE and exit without output.
+ if ! version_gt "$CACHED_NEW" "$LOCAL"; then
+ echo "UP_TO_DATE $LOCAL" > "$CACHE_FILE"
+ exit 0
+ fi
if check_snooze "$CACHED_NEW"; then
exit 0 # snoozed — stay quiet
fi
@@ -190,12 +217,12 @@ if ! echo "$REMOTE" | grep -qE '^[0-9]+\.[0-9.]+$'; then
exit 0
fi
-if [ "$LOCAL" = "$REMOTE" ]; then
+if ! version_gt "$REMOTE" "$LOCAL"; then
echo "UP_TO_DATE $LOCAL" > "$CACHE_FILE"
exit 0
fi
-# Versions differ — upgrade available
+# Remote is greater than local — upgrade available
echo "UPGRADE_AVAILABLE $LOCAL $REMOTE" > "$CACHE_FILE"
if check_snooze "$REMOTE"; then
exit 0 # snoozed — stay quiet
diff --git a/browse/test/gstack-update-check.test.ts b/browse/test/gstack-update-check.test.ts
index 47300f0a69..23073495fb 100644
--- a/browse/test/gstack-update-check.test.ts
+++ b/browse/test/gstack-update-check.test.ts
@@ -154,6 +154,17 @@ describe('gstack-update-check', () => {
expect(stdout).toBe('UPGRADE_AVAILABLE 0.3.3 0.4.0');
});
+ test('suppresses cached UPGRADE_AVAILABLE when cached remote is lower than local', () => {
+   // The local install is newer than the version named in the cached notice.
+   const cachePath = join(stateDir, 'last-update-check');
+   writeFileSync(join(gstackDir, 'VERSION'), '1.26.7.0\n');
+   writeFileSync(cachePath, 'UPGRADE_AVAILABLE 1.26.7.0 1.26.3.0');
+
+   const result = run();
+
+   expect(result.exitCode).toBe(0);
+   expect(result.stdout).toBe('');
+   // The stale notice must be rewritten in the cache, not merely silenced.
+   expect(readFileSync(cachePath, 'utf-8')).toContain('UP_TO_DATE 1.26.7.0');
+ });
+
// ─── Path D3: Fresh cache, but local version changed ────────
test('re-checks when local version does not match cached old version', () => {
writeFileSync(join(gstackDir, 'VERSION'), '0.4.0\n');
@@ -182,7 +193,7 @@ describe('gstack-update-check', () => {
});
// ─── Path F: Versions differ (remote fetch) ─────────────────
- test('outputs UPGRADE_AVAILABLE when versions differ', () => {
+ test('outputs UPGRADE_AVAILABLE when remote version is greater than local', () => {
writeFileSync(join(gstackDir, 'VERSION'), '0.3.3\n');
writeFileSync(join(gstackDir, 'REMOTE_VERSION'), '0.4.0\n');
@@ -193,6 +204,17 @@ describe('gstack-update-check', () => {
expect(cache).toContain('UPGRADE_AVAILABLE 0.3.3 0.4.0');
});
+ test('treats lower remote version as up to date', () => {
+   // The remote reports an older version than the one installed locally.
+   writeFileSync(join(gstackDir, 'VERSION'), '1.26.7.0\n');
+   writeFileSync(join(gstackDir, 'REMOTE_VERSION'), '1.26.3.0\n');
+
+   const result = run();
+
+   expect(result.exitCode).toBe(0);
+   expect(result.stdout).toBe('');
+   // No upgrade is advertised and the cache records the local version as current.
+   const cachePath = join(stateDir, 'last-update-check');
+   expect(readFileSync(cachePath, 'utf-8')).toContain('UP_TO_DATE 1.26.7.0');
+ });
+
// ─── Path G: Invalid remote response ────────────────────────
test('treats invalid remote response as up to date', () => {
writeFileSync(join(gstackDir, 'VERSION'), '0.3.3\n');
diff --git a/build/README.md b/build/README.md
new file mode 100644
index 0000000000..b6ec65ca34
--- /dev/null
+++ b/build/README.md
@@ -0,0 +1,471 @@
+# Build Skill Workflow
+
+The build skill turns an approved plan into shipped code. It has two components:
+
+- `/build`, the skill prompt in `build/SKILL.md.tmpl`, is the entry point. It
+ discovers the source plan, synthesizes a living plan via subagents, confirms
+ with the user, and hands off to the CLI for all execution.
+- `gstack-build`, the TypeScript orchestrator in `build/orchestrator/`, drives
+ the full TDD + review + ship loop. The skill always delegates to it — even for
+ single-phase plans — because the CLI survives context compaction, restarts, and
+ multi-hour sub-agent work where an LLM-driven loop cannot.
+
+## Entry Points
+
+`build/SKILL.md.tmpl` is the source of truth for the generated skill. Do not edit
+`build/SKILL.md` directly.
+
+The installed command is `bin/gstack-build`, a thin Bash wrapper that resolves
+the gstack checkout and runs:
+
+```bash
+bun run build/orchestrator/cli.ts [flags]
+```
+
+For manual use, install setup should put `gstack-build` on `PATH`. When the
+`/build` skill launches the CLI, it first resolves an executable from
+`GSTACK_BUILD_CLI`, `PATH`, host-specific setup paths, or this checkout's
+`bin/gstack-build`, so spawned Claude/Codex shells do not depend on inherited
+interactive shell configuration.
+
+Common commands:
+
+```bash
+gstack-build plans/example-impl-plan.md --print-only
+gstack-build plans/example-impl-plan.md --dry-run --skip-ship
+gstack-build plans/example-impl-plan.md --skip-ship
+gstack-build plans/example-impl-plan.md --dual-impl
+gstack-build plans/example-impl-plan.md --no-resume
+gstack-build merge --project-root /path/to/product-repo
+```
+
+## High-Level Flow
+
+1. Find or synthesize a living implementation plan organized into semantic feature blocks.
+2. Execute each feature block as a shipped unit of work, with phases inside it.
+3. Write failing tests first when the phase uses the TDD format.
+4. Implement until tests pass.
+5. Run recursive review gates until primary review, secondary review, and QA emit `GATE PASS`.
+ If a Codex review/QA gate fails with a known local sandbox-block signature
+ (browser, local socket, or localhost bind permission errors), retry that gate
+ once with `danger-full-access`.
+6. Flip the phase checkboxes in the plan.
+7. Persist state and continue to the next phase in the current feature.
+8. After a feature's phases are complete, run `/ship` and `/land-and-deploy`.
+9. Verify the landed feature against the origin plan, then continue to the next feature.
+10. After all features complete, verify no feature branches remain unmerged and archive the living/origin plans.
+
+The CLI owns the full durable loop. The skill prompt's role is plan discovery,
+synthesis, user confirmation, CLI launch, and post-feature monitoring.
+
+## Merge Mode
+
+`/build merge` launches `gstack-build merge`, a cleanup mode for leftover
+feature branches from previous build runs. It scans all unmerged local and
+remote `feat/*` branches, checks out each branch, runs configured `/review`,
+uses the configured `testFixer` role to fix review findings until the existing
+review cap is reached, then runs configured `/ship` and `/land-and-deploy`.
+The loop is fail-closed for direct merge runs: the first branch that cannot be
+reviewed clean, fixed, shipped, or landed stops the command with logs under
+`~/.gstack/build-state/build-merge-*/`.
+
+## Plan Format
+
+Living plans should regroup all source-plan weeks, milestones, blocks, and phases
+into deliverable feature sections. Legacy phase-only plans still run as one
+default feature.
+
+The preferred phase shape inside each feature is TDD-first. The durable
+markdown shape stays at three checkboxes, while the CLI enforces the full
+runtime lifecycle: Test Specification -> Verify Red -> Implementation -> Green
+tests -> Review/QA.
+
+```markdown
+## Feature 1: Parser workflow
+
+Origin trace: Week 1 / Phase 2
+Acceptance: Parser behavior satisfies the source plan.
+
+### Phase 1.1: Parser tests
+
+- [ ] **Test Specification (Gemini Sub-agent)**: Write failing tests covering the parser behavior.
+- [ ] **Implementation (Gemini Sub-agent)**: Make the tests pass with minimal code; the CLI runs the Green tests gate afterward.
+- [ ] **Review & QA (Codex Sub-agent)**: Run review and fix all findings.
+```
+
+Legacy two-checkbox phases are still supported:
+
+```markdown
+### Phase 1: Parser
+
+- [ ] **Implementation (Gemini Sub-agent)**: Implement the parser.
+- [ ] **Review & QA (Codex Sub-agent)**: Run review and fix all findings.
+```
+
+The parser accepts `## Feature N: Name`, `### Phase N: Name`, and decimal
+numbers like `### Phase 2.1: Name`. It records the exact checkbox line numbers
+so the plan mutator can flip only the intended lines. Checkbox-like text inside
+fenced code blocks is ignored.
+
+## Skill-Prompt Path
+
+Since v1.20.0, `/build` always routes every plan — including single-phase — to
+`gstack-build`. The LLM-driven execution loop is gone; the skill's role is now
+**plan discovery → living-plan synthesis → user confirmation → CLI handoff →
+monitoring**. The CLI handles all phase execution, TDD loops, review gates,
+ship, and land.
+
+The skill's startup sequence:
+
+1. Detect whether the current directory is a workspace root with immediate
+ child repos. If so, the root repo is orchestration-only by default; child repos
+ are implementation targets. Direct CLI execution against that root requires
+ `--allow-workspace-root`; single product repo invocation remains supported.
+2. Locate the workspace-level `*-gstack/inbox/` and
+ `*-gstack/inbox/living-plan/` directories. This chooses plan storage only; it
+ does not choose a plan file or target repo.
+3. Resolve plan status with `gstack-build plan-status`. The resolver reports
+ exact source-plan, living-plan, claim, manifest, and active-run candidates;
+ `/build` only auto-selects when exactly one safe source plan exists, unless
+ the user explicitly passes a plan path or `--all-inbox`.
+4. Select one or more target child repos. If a source plan spans multiple child
+ repos, split it into one living plan per target repo and write
+ `.llm-tmp/build-run-manifest.json`.
+5. Confirm the manifest with the user, then launch all manifest runs in private
+ git worktrees. The foreground CLI monitor owns polling, stale-run recovery,
+ and completion reporting.
+
+After `gstack-build` reports each feature complete:
+
+1. Spawn ship and land roles **only when `--skip-ship` was passed** to
+ `gstack-build`. Without `--skip-ship`, the CLI already ran `/ship` and
+ `/land-and-deploy` internally — re-spawning would double-ship and create
+ duplicate PRs.
+2. Delegate origin-plan coverage verification to a fresh Claude subagent (role:
+ `featureVerifier`) that reads only the relevant source-plan sections and
+ emits a `VERIFICATION: PASS | GAPS` result.
+3. Run `gstack-build-phase-guardrail` to confirm the feature PR merged, the
+ working tree is clean, and `origin/main` is up to date.
+4. After all features are complete, spawn a final-exam subagent (role:
+ `featureVerifier`) to compare the full source plan against the git log and
+ living plan. Archive plans on `EXAM: PASS`.
+
+## CLI Path
+
+For long plans, `/build` should launch `gstack-build` in the background and
+monitor `~/.gstack/build-state/<slug>.json` rather than blocking on the process.
+The CLI exists because code can reliably drive the phase loop after the current
+LLM context is gone.
+
+Startup sequence:
+
+1. Parse args and the plan file.
+2. Print the phase table and parser warnings.
+3. Resolve the project root from `--project-root`, the current git repo, or the plan location.
+4. Run startup gates unless `--dry-run` or `--skip-ship` is active.
+5. Acquire a per-plan lock.
+6. Load existing state or create fresh state.
+7. Drive phases until all are committed.
+8. Ship and verify, unless `--skip-ship` or `--dry-run` is active.
+9. Release the lock and append an analytics event.
+
+The state slug is `build-`.
+
+## Startup Gates
+
+The CLI has one preflight gate before phase execution:
+
+- Clean working tree check: tracked staged or modified files fail the run.
+ Untracked files are ignored. Use `--skip-clean-check` only when the dirty
+ state is intentional.
+
+This check is skipped by `--dry-run` and `--skip-ship`.
+
+## Phase State Machine
+
+`build/orchestrator/phase-runner.ts` is deliberately pure. It takes the current
+phase state and the previous action result, then returns the next action.
+
+Typical TDD phase:
+
+```text
+pending
+ -> RUN_GEMINI_TEST_SPEC
+test_spec_done
+ -> VERIFY_RED
+tests_red
+ -> RUN_GEMINI
+impl_done
+ -> RUN_TESTS
+tests_green
+ -> RUN_CODEX_REVIEW
+review_clean
+ -> MARK_COMPLETE
+committed
+ -> DONE
+```
+
+If tests pass during `VERIFY_RED`, the test specification is considered too
+weak and the test-writer role is asked to rewrite stricter tests, capped by
+`GSTACK_BUILD_RED_MAX_ITER`.
+
+If tests fail after implementation, the test-fixer role gets recursive fix passes, capped by
+`GSTACK_BUILD_TEST_MAX_ITER`.
+
+If any review gate emits `GATE FAIL`, the review loop runs again, capped by
+`GSTACK_BUILD_CODEX_MAX_ITER`. The phase cannot be marked complete until
+primary review, secondary review, and QA all produce `GATE PASS`.
+Codex review/QA gates normally use `workspace-write`; if that sandbox blocks
+local verification, the failed gate is retried once with `danger-full-access`.
+Set `GSTACK_BUILD_CODEX_REVIEW_SANDBOX` to choose an explicit sandbox and
+disable this automatic retry.
+
+## Dual-Implementor Mode
+
+`--dual-impl` replaces the single implementation pass with a tournament:
+
+1. Confirm or write failing tests.
+2. Create two temporary git worktrees.
+3. Run configured primary and secondary implementations in parallel.
+4. Run independent test-and-fix loops in each worktree.
+5. Choose a winner automatically when only one side passes.
+6. Otherwise ask the configured judge to review both diffs and test histories.
+7. Cherry-pick the winning commits back to the main working tree.
+8. Continue through the normal green-tests and review loop.
+
+Worktrees live under the OS temp directory with names like
+`gstack-dual--p-/`. Successful runs tear them down.
+Winner-apply failures preserve enough context for recovery.
+
+The judge must emit an anchored `WINNER: primary` or `WINNER: secondary` line. Missing
+or malformed verdicts fail closed.
+
+## State, Logs, and Resume
+
+Local state is canonical:
+
+```text
+~/.gstack/build-state/
+ <slug>.json
+ <slug>.lock
+ <slug>/
+ phase-1-gemini-testspec-1-input.md
+ phase-1-gemini-testspec-1-output.md
+ phase-1-gemini-testspec-1.log
+ phase-1-tests-1.log
+ phase-1-dual-primary-1-input.md
+ phase-1-dual-primary-1-output.md
+ phase-1-dual-primary-1.log
+ phase-1-dual-secondary-1-input.md
+ phase-1-dual-secondary-1-output.md
+ phase-1-dual-secondary-1.log
+ ship.log
+ land-and-deploy.log
+```
+
+State writes use temp-file plus rename. Plan checkbox writes do the same. If
+gbrain is available, state is mirrored there on a best-effort basis, but local
+JSON remains the source of truth.
+
+Resume is automatic. Re-running the same command loads the state file and
+continues from the first non-committed phase. Use `--no-resume` to discard
+existing state and start fresh.
+
+The lock file prevents two orchestrators from driving the same plan. A stale
+lock can be removed manually only after checking that no `gstack-build` process
+is still running.
+
+## Sub-Agent Roles
+
+- `testWriter` writes failing tests.
+- `primaryImpl` is the primary implementor.
+- `testFixer` fixes test failures.
+- `review` and `reviewSecondary` run the review gates.
+- `secondaryImpl` acts as the second implementor in `--dual-impl`.
+- `judge` judges dual-implementor tournaments.
+- `qa`, `ship`, and `land` run QA and release commands.
+
+Two additional roles are **template-only** — they are consumed by the skill
+prompt via `jq` and are intentionally absent from the CLI's `ROLE_DEFINITIONS`.
+They have no CLI flags or env var overrides:
+
+- `planSynthesizer` — synthesizes the living plan from the source plan.
+- `featureVerifier` — checks origin-plan coverage after each feature ships and
+ runs the final completion exam.
+
+`/context-save` is host-owned `/build` behavior, not a configured build role:
+Codex-running `/build` saves Codex context, and Claude-running `/build` saves
+Claude context.
+
+All role providers, models, reasoning levels, and commands are configured in
+`build/configure.cm`. If a role lookup returns empty (via `jq -r '... // empty'`),
+the skill halts with a STOP rather than silently using a wrong model — a
+misconfigured or missing `configure.cm` fails closed.
+
+The CLI talks to these tools through subprocess wrappers in
+`build/orchestrator/sub-agents.ts`. Codex stdin is explicitly closed because
+`codex exec` can otherwise hang.
+
+## Final Ship
+
+After every feature is committed, the CLI runs the existing release skills instead
+of using raw GitHub commands:
+
+```text
+/ship
+/land-and-deploy
+```
+
+**Double-ship prevention:** The skill's Step 3 spawns the ship and land roles
+only when `--skip-ship` was passed to `gstack-build`. Without `--skip-ship`, the
+CLI already ran them internally — the skill skips that step to avoid creating
+duplicate PRs.
+
+**Feature verification:** After shipping, the skill delegates origin-plan
+coverage checking to a fresh `featureVerifier` subagent. It reads only the
+source-plan sections named in the feature's "Origin trace:" line and emits
+`VERIFICATION: PASS` or `VERIFICATION: GAPS`. Gaps restart the implementation
+loop for that feature.
+
+**Phase guardrail:** After ship + land, the skill runs `gstack-build-phase-guardrail`
+to confirm three things:
+
+1. The feature PR state is `MERGED` (checked via `gh pr view --json state` —
+ fails closed on `gh` errors, auth failures, or missing PRs).
+2. `origin/main` is fetchable and up to date (hard-fails on network error).
+3. The working tree has no staged or unstaged changes.
+
+The guardrail uses `gh pr view --json state` rather than `git branch --merged`
+so squash and rebase merges are detected correctly.
+
+CLI-level post-ship checks run after all features are complete:
+
+- no unmerged remote `feat/*` branches remain
+- the working tree is clean
+- local `HEAD` matches `origin/main`
+
+The build is marked `completed` only after these guardrails pass.
+
+## Failure Handling
+
+Most failures are terminal for the current run but resumable after repair:
+
+- no executable phases in the plan
+- dirty tracked working tree at startup
+- lock contention
+- Gemini timeout or non-zero exit
+- tests fail after the maximum fix iterations
+- tests pass before implementation after the maximum red attempts
+- review gates cannot converge to `GATE PASS`
+- Codex output has no parseable gate verdict
+- plan checkbox line no longer matches the parsed marker
+- dual-implementor judge output is malformed
+- winner cherry-pick and patch fallback both fail
+- final ship or post-ship guardrail fails
+
+The logs under the phase directory are the first place to inspect. After fixing
+the root cause, re-run the same `gstack-build` command to resume.
+
+## Important Flags
+
+| Flag | Effect |
+| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------- |
+| `--print-only` | Parse the plan and print the phase table. |
+| `--dry-run` | Walk the state machine without spawning sub-agents or shipping. |
+| `--skip-ship` | Complete phases but skip final ship and deploy. |
+| `--no-resume` | Ignore existing state and start fresh. |
+| `--no-gbrain` | Use only local JSON state. |
+| `--dual-impl` | Run configured primary and secondary implementations in parallel worktrees. |
+| `--test-writer-model <model>` | Override failing-test writer model. |
+| `--primary-impl-model <model>` | Override primary implementor model. |
+| `--test-fixer-model <model>` | Override test-fixer model. |
+| `--secondary-impl-model <model>` | Override dual-impl secondary model. |
+| `--review-model <model>` | Override primary review model. |
+| `--review-secondary-model <model>` | Override secondary review model. |
+| `--qa-model <model>` | Override QA model. |
+| `--ship-model <model>` | Override ship model. |
+| `--land-model <model>` | Override land model. |
+| `--<role>-provider <provider>` | Override role provider (`claude`, `codex`, `gemini`, `kimi`) where supported. Dual-impl primary, secondary, and judge roles are model-agnostic. |
+| `--<role>-reasoning <level>` | Override role reasoning (`low`, `medium`, `high`, `xhigh`). |
+| `--<role>-command <command>` | Override review, QA, ship, or land command. |
+| `--test-cmd <cmd>` | Override automatic test command detection. |
+| `--origin-plan <path>` | Source plan to verify after each feature and archive after final completion. |
+| `--max-codex-iter N` | Override the review gate loop cap. |
+| `--skip-clean-check` | Bypass tracked dirty-file preflight. |
+
+## Environment Variables
+
+Default role routing, retry caps, and timeouts live in `build/configure.cm`.
+Edit that file when the built-in defaults change; use the env vars below for
+per-run overrides. Set `GSTACK_BUILD_CONFIG_FILE` to point at a different
+config file.
+
+| Variable | Purpose |
+| ----------------------------------- | ---------------------------------------------------------------------------------- |
+| `GEMINI_BIN` | Gemini CLI path. |
+| `CODEX_BIN` | Codex CLI path. |
+| `CLAUDE_BIN` | Claude CLI path. |
+| `GBRAIN_BIN` | Optional gbrain CLI path. |
+| `GSTACK_BUILD_CONFIG_FILE` | Alternate build config file. |
+| `GSTACK_BUILD_DEFAULTS_FILE` | Legacy alias for `GSTACK_BUILD_CONFIG_FILE`. |
+| `GSTACK_BUILD_<ROLE>_PROVIDER` | Role provider override where supported. |
+| `GSTACK_BUILD_<ROLE>_MODEL` | Role model override. |
+| `GSTACK_BUILD_<ROLE>_REASONING` | Role reasoning override. |
+| `GSTACK_BUILD_<ROLE>_COMMAND` | Command override for review, QA, ship, and land roles. |
+| `GSTACK_BUILD_GEMINI_TIMEOUT` | Gemini call timeout in milliseconds. |
+| `GSTACK_BUILD_CODEX_TIMEOUT` | Codex call timeout in milliseconds. |
+| `GSTACK_BUILD_SHIP_TIMEOUT` | Final ship/deploy timeout in milliseconds. |
+| `GSTACK_BUILD_CODEX_MAX_ITER` | Review gate loop cap. |
+| `GSTACK_BUILD_TEST_TIMEOUT` | Test command timeout in milliseconds. |
+| `GSTACK_BUILD_TEST_MAX_ITER` | Gemini test-fix loop cap. |
+| `GSTACK_BUILD_RED_MAX_ITER` | Test-spec rewrite cap when tests pass too early. |
+| `GSTACK_BUILD_JUDGE_TIMEOUT` | Dual-impl judge timeout in milliseconds. |
+| `GSTACK_BUILD_JUDGE_MODEL` | Claude model used for tournament judging. |
+| `GSTACK_BUILD_CODEX_IMPL_SANDBOX` | Codex implementor sandbox override. |
+| `GSTACK_BUILD_CODEX_REVIEW_SANDBOX` | Codex review/QA sandbox override; explicit values disable automatic sandbox retry. |
+
+Role env vars use `GSTACK_BUILD_<ROLE>_<FIELD>`, where role is
+`TEST_WRITER`, `PRIMARY_IMPL`, `TEST_FIXER`, `SECONDARY_IMPL`, `REVIEW`,
+`REVIEW_SECONDARY`, `QA`, `SHIP`, `LAND`, or `JUDGE`, and field is
+`PROVIDER`, `MODEL`, `REASONING`, or `COMMAND`. CLI flags override env vars;
+env vars override defaults.
+
+The template-only roles (`planSynthesizer`, `featureVerifier`) are read directly
+from `configure.cm` by the skill via `jq` and have no corresponding env var
+overrides. To change their models, edit `configure.cm`.
+
+## Module Map
+
+| File | Responsibility |
+| ---------------------------------- | ---------------------------------------------------------------------- |
+| `SKILL.md.tmpl` | Human-facing `/build` workflow and CLI-monitoring instructions. |
+| `configure.cm` | Role routing, retry caps, and timeouts (source of truth for defaults). |
+| `bin/gstack-build-phase-guardrail` | Post-feature guardrail: PR merged, origin/main up to date, tree clean. |
+| `orchestrator/cli.ts` | CLI args, startup gates, lock, main loop, ship guardrails. |
+| `orchestrator/parser.ts` | Markdown plan parser. |
+| `orchestrator/phase-runner.ts` | Pure phase state machine. |
+| `orchestrator/sub-agents.ts` | Gemini, Codex, Claude, test, verdict, and judge wrappers. |
+| `orchestrator/plan-mutator.ts` | Atomic checkbox updates in the plan file. |
+| `orchestrator/state.ts` | Local JSON state, gbrain mirror, lock files, log paths. |
+| `orchestrator/worktree.ts` | Dual-impl worktree creation, teardown, and winner apply. |
+| `orchestrator/ship.ts` | Final `/ship` plus `/land-and-deploy` delegation. |
+| `orchestrator/types.ts` | Shared phase and build state types. |
+
+## Testing
+
+Run the dedicated deterministic build-skill gate:
+
+```bash
+bun run test:build-skill
+```
+
+The gate runs the full orchestrator suite plus generated skill-doc contract
+tests. The matrix guard in `build/orchestrator/__tests__/coverage-matrix.test.ts`
+fails if a new build orchestrator module is added without explicit test
+ownership.
+
+After changing `build/SKILL.md.tmpl`, regenerate generated skill files:
+
+```bash
+bun run gen:skill-docs --host all
+```
diff --git a/build/SKILL.md b/build/SKILL.md
new file mode 100644
index 0000000000..49cd95bafb
--- /dev/null
+++ b/build/SKILL.md
@@ -0,0 +1,2068 @@
+---
+name: build
+preamble-tier: 4
+version: 1.22.0
+description: |
+ gstack autonomous execution skill. Reads the latest implementation plan and enters
+ a strict coding loop to build the feature in phases, running tests and reviews
+ automatically.
+ Use when asked to "build the feature", "build the plan", or "start coding".
+allowed-tools:
+ - Bash
+ - Read
+ - Edit
+ - Write
+ - Glob
+ - Grep
+ - Agent
+ - AskUserQuestion
+triggers:
+ - build the feature
+ - build the plan
+ - start coding
+ - build merge
+ - merge branches
+ - reexamine
+ - audit the plan
+---
+
+
+
+## Preamble (run first)
+
+```bash
+_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)  # preamble start: try the home install first, then a project-relative copy
+[ -n "$_UPD" ] && echo "$_UPD" || true  # print the update-check output only when there is any
+mkdir -p ~/.gstack/sessions
+touch ~/.gstack/sessions/"$PPID"  # marker file named after the parent process ID
+_SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr -d ' ')  # number of markers modified within the last 120 minutes
+find ~/.gstack/sessions -mmin +120 -type f -exec rm {} + 2>/dev/null || true  # prune markers older than 120 minutes
+_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true")  # "true" when the config lookup fails
+_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no")
+_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown")
+echo "BRANCH: $_BRANCH"
+_SKILL_PREFIX=$(~/.claude/skills/gstack/bin/gstack-config get skill_prefix 2>/dev/null || echo "false")
+echo "PROACTIVE: $_PROACTIVE"
+echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED"
+echo "SKILL_PREFIX: $_SKILL_PREFIX"
+source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true  # presumably defines REPO_MODE — confirm against gstack-repo-mode
+REPO_MODE=${REPO_MODE:-unknown}  # "unknown" when the sourced output left it unset or empty
+echo "REPO_MODE: $REPO_MODE"
+_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
+echo "LAKE_INTRO: $_LAKE_SEEN"
+_TEL=$(~/.claude/skills/gstack/bin/gstack-config get telemetry 2>/dev/null || true)  # stays empty when the config lookup fails
+_TEL_PROMPTED=$([ -f ~/.gstack/.telemetry-prompted ] && echo "yes" || echo "no")
+_TEL_START=$(date +%s)  # epoch seconds at preamble start
+_SESSION_ID="$$-$(date +%s)"  # shell PID plus epoch seconds
+echo "TELEMETRY: ${_TEL:-off}"  # an empty setting is reported as "off"
+echo "TEL_PROMPTED: $_TEL_PROMPTED"
+_EXPLAIN_LEVEL=$(~/.claude/skills/gstack/bin/gstack-config get explain_level 2>/dev/null || echo "default")
+if [ "$_EXPLAIN_LEVEL" != "default" ] && [ "$_EXPLAIN_LEVEL" != "terse" ]; then _EXPLAIN_LEVEL="default"; fi  # any value other than default/terse is reset to default
+echo "EXPLAIN_LEVEL: $_EXPLAIN_LEVEL"
+_QUESTION_TUNING=$(~/.claude/skills/gstack/bin/gstack-config get question_tuning 2>/dev/null || echo "false")
+echo "QUESTION_TUNING: $_QUESTION_TUNING"
+mkdir -p ~/.gstack/analytics
+if [ "$_TEL" != "off" ]; then  # also true when _TEL is empty, so only an explicit "off" skips the local usage log below
+echo '{"skill":"build","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basename "$(git rev-parse --show-toplevel 2>/dev/null)" 2>/dev/null || echo "unknown")'"}' >> ~/.gstack/analytics/skill-usage.jsonl 2>/dev/null || true
+fi
+for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do  # finalize a leftover .pending-* marker found in ~/.gstack/analytics
+  if [ -f "$_PF" ]; then
+    if [ "${_TEL:-off}" != "off" ] && [ -x "$HOME/.claude/skills/gstack/bin/gstack-telemetry-log" ]; then  # "$HOME" because a double-quoted "~" is not expanded (the test looked for a literal "~" directory); unset telemetry counts as off, matching the TELEMETRY echo
+      ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true
+    fi
+    rm -f "$_PF" 2>/dev/null || true  # the marker is removed whether or not an event was logged
+  fi
+  break  # at most one marker is handled per run
+done
+eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" 2>/dev/null || true  # presumably defines SLUG, which the next line reads — confirm against gstack-slug
+_LEARN_FILE="${GSTACK_HOME:-$HOME/.gstack}/projects/${SLUG:-unknown}/learnings.jsonl"  # defaults: $HOME/.gstack when GSTACK_HOME is unset, "unknown" when SLUG is unset
+if [ -f "$_LEARN_FILE" ]; then
+ _LEARN_COUNT=$(wc -l < "$_LEARN_FILE" 2>/dev/null | tr -d ' ')  # line count of the learnings file
+ echo "LEARNINGS: $_LEARN_COUNT entries loaded"
+ if [ "$_LEARN_COUNT" -gt 5 ] 2>/dev/null; then  # search only when there are more than 5 lines; 2>/dev/null hides the test error for a non-numeric count
+ ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 3 2>/dev/null || true
+ fi
+else
+ echo "LEARNINGS: 0"
+fi
+~/.claude/skills/gstack/bin/gstack-timeline-log '{"skill":"build","event":"started","branch":"'"$_BRANCH"'","session":"'"$_SESSION_ID"'"}' 2>/dev/null &  # backgrounded, so the preamble does not wait for it
+_HAS_ROUTING="no"
+if [ -f CLAUDE.md ] && grep -q "## Skill routing" CLAUDE.md 2>/dev/null; then  # "yes" only when CLAUDE.md in the current directory contains this heading text
+ _HAS_ROUTING="yes"
+fi
+_ROUTING_DECLINED=$(~/.claude/skills/gstack/bin/gstack-config get routing_declined 2>/dev/null || echo "false")
+echo "HAS_ROUTING: $_HAS_ROUTING"
+echo "ROUTING_DECLINED: $_ROUTING_DECLINED"
+_VENDORED="no"
+if [ -d ".claude/skills/gstack" ] && [ ! -L ".claude/skills/gstack" ]; then  # a real directory, not a symlink
+ if [ -f ".claude/skills/gstack/VERSION" ] || [ -d ".claude/skills/gstack/.git" ]; then  # and it carries a VERSION file or its own .git directory
+ _VENDORED="yes"
+ fi
+fi
+echo "VENDORED_GSTACK: $_VENDORED"
+echo "MODEL_OVERLAY: claude"  # fixed string, not read from config
+_CHECKPOINT_MODE=$(~/.claude/skills/gstack/bin/gstack-config get checkpoint_mode 2>/dev/null || echo "explicit")
+_CHECKPOINT_PUSH=$(~/.claude/skills/gstack/bin/gstack-config get checkpoint_push 2>/dev/null || echo "false")
+echo "CHECKPOINT_MODE: $_CHECKPOINT_MODE"
+echo "CHECKPOINT_PUSH: $_CHECKPOINT_PUSH"
+[ -n "$OPENCLAW_SESSION" ] && echo "SPAWNED_SESSION: true" || true  # printed only when OPENCLAW_SESSION is non-empty; otherwise nothing is emitted
+```
+
+## Plan Mode Safe Operations
+
+In plan mode, the following are allowed because they inform the plan: `$B`, `$D`, `codex exec`/`codex review`, writes to `~/.gstack/`, writes to the plan file, and `open` for generated artifacts.
+
+## Skill Invocation During Plan Mode
+
+If the user invokes a skill in plan mode, the skill takes precedence over generic plan mode behavior. **Treat the skill file as executable instructions, not reference.** Follow it step by step starting from Step 0; the first AskUserQuestion is the workflow entering plan mode, not a violation of it. AskUserQuestion (any variant — `mcp__*__AskUserQuestion` or native; see "AskUserQuestion Format → Tool resolution") satisfies plan mode's end-of-turn requirement. If no variant is callable, the skill is BLOCKED — stop and report `BLOCKED — AskUserQuestion unavailable` per the AskUserQuestion Format rule. At a STOP point, stop immediately. Do not continue the workflow or call ExitPlanMode there. Commands marked "PLAN MODE EXCEPTION — ALWAYS RUN" execute. Call ExitPlanMode only after the skill workflow completes, or if the user tells you to cancel the skill or leave plan mode.
+
+If `PROACTIVE` is `"false"`, do not auto-invoke or proactively suggest skills. If a skill seems useful, ask: "I think /skillname might help here — want me to run it?"
+
+If `SKILL_PREFIX` is `"true"`, suggest/invoke `/gstack-*` names. Disk paths stay `~/.claude/skills/gstack/[skill-name]/SKILL.md`.
+
+If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined).
+
+If output shows `JUST_UPGRADED <from> <to>`: print "Running gstack v{to} (just updated!)". If `SPAWNED_SESSION` is true, skip feature discovery.
+
+Feature discovery, max one prompt per session:
+- Missing `~/.claude/skills/gstack/.feature-prompted-continuous-checkpoint`: AskUserQuestion for Continuous checkpoint auto-commits. If accepted, run `~/.claude/skills/gstack/bin/gstack-config set checkpoint_mode continuous`. Always touch marker.
+- Missing `~/.claude/skills/gstack/.feature-prompted-model-overlay`: inform "Model overlays are active. MODEL_OVERLAY shows the patch." Always touch marker.
+
+After upgrade prompts, continue workflow.
+
+If `WRITING_STYLE_PENDING` is `yes`: ask once about writing style:
+
+> v1 prompts are simpler: first-use jargon glosses, outcome-framed questions, shorter prose. Keep default or restore terse?
+
+Options:
+- A) Keep the new default (recommended — good writing helps everyone)
+- B) Restore V0 prose — set `explain_level: terse`
+
+If A: leave `explain_level` unset (defaults to `default`).
+If B: run `~/.claude/skills/gstack/bin/gstack-config set explain_level terse`.
+
+Always run (regardless of choice):
+```bash
+rm -f ~/.gstack/.writing-style-prompt-pending  # presumably the flag behind WRITING_STYLE_PENDING — confirm where it is read
+touch ~/.gstack/.writing-style-prompted  # presumably records that the question was asked — confirm where it is read
+```
+
+Skip if `WRITING_STYLE_PENDING` is `no`.
+
+If `LAKE_INTRO` is `no`: say "gstack follows the **Boil the Lake** principle — do the complete thing when AI makes marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean" Offer to open:
+
+```bash
+open https://garryslist.org/posts/boil-the-ocean  # run only if the user accepted the offer
+touch ~/.gstack/.completeness-intro-seen  # the preamble reports LAKE_INTRO as "yes" once this file exists
+```
+
+Only run `open` if yes. Always run `touch`.
+
+If `TEL_PROMPTED` is `no` AND `LAKE_INTRO` is `yes`: ask about telemetry once via AskUserQuestion:
+
+> Help gstack get better. Share usage data only: skill, duration, crashes, stable device ID. No code, file paths, or repo names.
+
+Options:
+- A) Help gstack get better! (recommended)
+- B) No thanks
+
+If A: run `~/.claude/skills/gstack/bin/gstack-config set telemetry community`
+
+If B: ask follow-up:
+
+> Anonymous mode sends only aggregate usage, no unique ID.
+
+Options:
+- A) Sure, anonymous is fine
+- B) No thanks, fully off
+
+If B→A: run `~/.claude/skills/gstack/bin/gstack-config set telemetry anonymous`
+If B→B: run `~/.claude/skills/gstack/bin/gstack-config set telemetry off`
+
+Always run:
+```bash
+touch ~/.gstack/.telemetry-prompted  # the preamble reports TEL_PROMPTED as "yes" once this file exists
+```
+
+Skip if `TEL_PROMPTED` is `yes`.
+
+If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: ask once:
+
+> Let gstack proactively suggest skills, like /qa for "does this work?" or /investigate for bugs?
+
+Options:
+- A) Keep it on (recommended)
+- B) Turn it off — I'll type /commands myself
+
+If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true`
+If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false`
+
+Always run:
+```bash
+touch ~/.gstack/.proactive-prompted  # the preamble reports PROACTIVE_PROMPTED as "yes" once this file exists
+```
+
+Skip if `PROACTIVE_PROMPTED` is `yes`.
+
+If `HAS_ROUTING` is `no` AND `ROUTING_DECLINED` is `false` AND `PROACTIVE_PROMPTED` is `yes`:
+Check if a CLAUDE.md file exists in the project root. If it does not exist, create it.
+
+Use AskUserQuestion:
+
+> gstack works best when your project's CLAUDE.md includes skill routing rules.
+
+Options:
+- A) Add routing rules to CLAUDE.md (recommended)
+- B) No thanks, I'll invoke skills manually
+
+If A: Append this section to the end of CLAUDE.md:
+
+```markdown
+
+## Skill routing
+
+When the user's request matches an available skill, invoke it via the Skill tool. When in doubt, invoke the skill.
+
+Key routing rules:
+- Product ideas/brainstorming → invoke /office-hours
+- Strategy/scope → invoke /plan-ceo-review
+- Architecture → invoke /plan-eng-review
+- Design system/plan review → invoke /design-consultation or /plan-design-review
+- Full review pipeline → invoke /autoplan
+- Bugs/errors → invoke /investigate
+- QA/testing site behavior → invoke /qa or /qa-only
+- Code review/diff check → invoke /review
+- Visual polish → invoke /design-review
+- Ship/deploy/PR → invoke /ship or /land-and-deploy
+- Save progress → invoke /context-save
+- Resume context → invoke /context-restore
+```
+
+Then commit the change: `git add CLAUDE.md && git commit -m "chore: add gstack skill routing rules to CLAUDE.md"`
+
+If B: run `~/.claude/skills/gstack/bin/gstack-config set routing_declined true` and say they can re-enable with `gstack-config set routing_declined false`.
+
+This only happens once per project. Skip if `HAS_ROUTING` is `yes` or `ROUTING_DECLINED` is `true`.
+
+If `VENDORED_GSTACK` is `yes`, warn once via AskUserQuestion unless `~/.gstack/.vendoring-warned-$SLUG` exists:
+
+> This project has gstack vendored in `.claude/skills/gstack/`. Vendoring is deprecated.
+> Migrate to team mode?
+
+Options:
+- A) Yes, migrate to team mode now
+- B) No, I'll handle it myself
+
+If A:
+1. Run `git rm -r .claude/skills/gstack/`
+2. Run `echo '.claude/skills/gstack/' >> .gitignore`
+3. Run `~/.claude/skills/gstack/bin/gstack-team-init required` (or `optional`)
+4. Run `git add .claude/ .gitignore CLAUDE.md && git commit -m "chore: migrate gstack from vendored to team mode"`
+5. Tell the user: "Done. Each developer now runs: `cd ~/.claude/skills/gstack && ./setup --team`"
+
+If B: say "OK, you're on your own to keep the vendored copy up to date."
+
+Always run (regardless of choice):
+```bash
+eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" 2>/dev/null || true  # presumably defines SLUG for the marker name below — confirm against gstack-slug
+touch ~/.gstack/.vendoring-warned-${SLUG:-unknown}  # marker that suppresses this warning next time; "unknown" is used when SLUG is unset or empty
+```
+
+If marker exists, skip.
+
+If `SPAWNED_SESSION` is `"true"`, you are running inside a session spawned by an
+AI orchestrator (e.g., OpenClaw). In spawned sessions:
+- Do NOT use AskUserQuestion for interactive prompts. Auto-choose the recommended option.
+- Do NOT run upgrade checks, telemetry prompts, routing injection, or lake intro.
+- Focus on completing the task and reporting results via prose output.
+- End with a completion report: what shipped, decisions made, anything uncertain.
+
+## AskUserQuestion Format
+
+### Tool resolution (read first)
+
+"AskUserQuestion" can resolve to two tools at runtime: the **host MCP variant** (e.g. `mcp__conductor__AskUserQuestion` — appears in your tool list when the host registers it) or the **native** Claude Code tool.
+
+**Rule:** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
+
+**If no AskUserQuestion variant appears in your tool list, this skill is BLOCKED.** Stop, report `BLOCKED — AskUserQuestion unavailable`, and wait for the user. Do not write decisions to the plan file as a substitute, do not emit them as prose and stop, and do not silently auto-decide (only `/plan-tune` AUTO_DECIDE opt-ins authorize auto-picking).
+
+### Format
+
+Every AskUserQuestion is a decision brief and must be sent as tool_use, not prose.
+
+```
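+D<N> — <one-line question title>
+Project/branch/task: <1 short grounding sentence using _BRANCH>
+ELI10: <plain-English explanation of the decision>
+Stakes if we pick wrong: <what breaks or is lost>
+Recommendation: <choice> because <one-line reason>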
+Completeness: A=X/10, B=Y/10 (or: Note: options differ in kind, not coverage — no completeness score)
+Pros / cons:
+A)