From a8ad6dddce313d0e424471d627ab1422a24266c7 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 09:04:16 +0100 Subject: [PATCH 01/23] docs: remove BUG_REVIEW.md; move CLI_MIGRATION.md to docs/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - BUG_REVIEW.md was a transient artifact from the post-merge bug-hunt review; the actual fixes shipped in PR #32 and are documented in the PR description / commit history. - CLI_MIGRATION.md is an audience-specific guide (Java MS-GF+ users porting CLI invocations) — belongs under docs/, not at root. - DOCS.md stays at root as the primary single-file reference (per the iter39 docs-rewrite design). - Updated inbound references in README.md and DOCS.md. --- BUG_REVIEW.md | 72 ----------------------- CLI_MIGRATION.md => docs/CLI_MIGRATION.md | 0 2 files changed, 72 deletions(-) delete mode 100644 BUG_REVIEW.md rename CLI_MIGRATION.md => docs/CLI_MIGRATION.md (100%) diff --git a/BUG_REVIEW.md b/BUG_REVIEW.md deleted file mode 100644 index 46a90f2e..00000000 --- a/BUG_REVIEW.md +++ /dev/null @@ -1,72 +0,0 @@ -# msgf-rust bug review (2026-05-23) - -Branch: `review/bug-hunt` (from `master` @ 18360a3d) - -Systematic review of the Rust MS-GF+ port: static analysis of critical paths, -full `cargo test --release --workspace`, and targeted code reading. - -## Fixed in this branch - -| ID | Severity | Location | Issue | Fix | -|---|---|---|---|---| -| B1 | **Critical** | `msgf-rust.rs` `send_chunks` | Bench cap (`--max-spectra N`) truncated the final partial chunk to zero when `total == N` (e.g. N=100 with chunk size 5000 → empty output). | Removed erroneous tail `truncate` block; loop already stops at cap. | -| B2 | **High** | `msgf-rust.rs` param routing | Activation auto-detect was gated on `instrument == low-res`, so `--fragmentation auto --instrument QExactive` on mzML skipped peek and resolved to CID params for HCD data. | Gate auto-route on `fragmentation == auto` + mzML extension only. | -| B3 | **High** | `msgf-rust.rs` TSV write | `write_tsv(..., is_mgf=true)` always emitted MGF layout (extra `Title` column) even for mzML inputs. | Pass `!is_mzml`. | -| B4 | **High** | `match_engine.rs` GF | SpecE GF graph used `start_offset == 0` for protein N-term instead of `cand.is_protein_n_term`, breaking Met-cleaved N-termini at offset 1. | Use `cand.is_protein_n_term` / `is_protein_c_term`. | -| B5 | **Medium** | `tsv.rs` | `IsotopeError` column hardcoded to 0 while PIN writes `psm.isotope_offset`. | Thread isotope offset from PSM. | -| B6 | **Medium** | `msgf-rust.rs` CLI | Inverted `--charge-min/--charge-max` or isotope ranges produced empty ranges with no error. | Validate at startup and return clear error. | -| B7 | **High** | `match_engine.rs` dedup | Dedup used bare sequence + pin score; merged mod variants incorrectly. | Mod-aware pepSeq key + `rank_score`. | -| B8 | **Medium** | `match_engine.rs` dedup | HashMap survivor order was nondeterministic. | `BTreeMap` + best-`rank_score` survivor rule. | - -## Open — not fixed (documented for follow-up) - -| ID | Severity | Location | Issue | -|---|---|---|---| -| B9 | **Low** | `sa_walk.rs` | Test-only SA walk helper does not enforce `max_missed_cleavages`; production search uses `candidate_gen::enumerate_candidates`, which does. | -| B10 | **High** | `mzml.rs` `Iterator::next` | First per-spectrum parse error sets `done=true` and aborts the entire file; remaining spectra are silently skipped. | -| B11 | **Low** | `sa_walk.rs` Met pass | Dedupes Met-cleaved peptides on residue bytes only, collapsing distinct C-terminal contexts. | - -## Known test failures (pre-existing, CI-skipped) - -These fail on `master` without the 7 CI skip flags; tracked as parity/min_peaks regressions: - -- `match_engine_smoke::known_peptide_appears_in_top_n` -- `match_engine_smoke::charge_missing_spectrum_uses_per_charge_scored_spec` -- `match_engine_smoke::spectrum_without_charge_tries_charge_range` -- Maven fixture loads, thread-determinism test (see `.github/workflows/ci.yml`) - -## Verification - -```bash -cargo test --release --workspace -- \ - --skip charge_missing_spectrum_uses_per_charge_scored_spec \ - --skip spectrum_without_charge_tries_charge_range \ - --skip known_peptide_appears_in_top_n \ - --skip read_bsa_canno_text_format \ - --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ - --skip tryp_pig_bov_revcat_full_set_loads \ - --skip match_spectra_output_invariant_across_thread_counts -``` - -## Performance (dedup pass) - -- PepSeq dedup keys use integer mod units + `Arc` cache per candidate (avoids repeated string formatting). -- Per-charge `TopNQueue` map uses `FxHashMap` (typically 1–3 charges per spectrum). - -## Documentation review (2026-05-24) - -Fixes applied on this branch: - -| Issue | Location | Fix | -|---|---|---| -| PIN column count said "28" | `README.md` | Corrected to 36 (default charge 2–3) + EdgeScore note | -| Auto-detect described "first spectrum" only | `README.md` | First 64 MS2 histogram; `--instrument` does not gate peek | -| Auto-detect required `--instrument low-res` | `DOCS.md` §4 | Matches code: only `--fragmentation auto` + mzML | -| TSV `IsotopeError` documented as always 0 | `DOCS.md` §3b | Updated after B5 fix | -| Broken `known-divergences.md` links | `README.md`, `DOCS.md` §8d | Legacy file removed in iter39; point to §8d / tests | -| Inverted charge/isotope ranges undocumented | `DOCS.md` §1 | Startup validation documented | - -**Still stale (not fixed here):** - -- `benchmark/ci/README.md` — references Java Maven workflow; no Rust benchmark workflow in `.github/workflows/` yet. -- `.claude/CLAUDE.md` — Java-tree context; accurate on `java-legacy` branch only. diff --git a/CLI_MIGRATION.md b/docs/CLI_MIGRATION.md similarity index 100% rename from CLI_MIGRATION.md rename to docs/CLI_MIGRATION.md From 55cff3fa22cc4361bc5552ecdf346d31e4589115 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 09:18:18 +0100 Subject: [PATCH 02/23] docs(spec): PR-Q1 quality cleanup design + finalize CLI_MIGRATION refs Adds the design spec for PR-Q1 (post-cutover code quality sweep) and finalizes the inbound-reference updates from the prior commit (docs/CLI_MIGRATION.md links) that weren't staged at that point. PR-Q1 is the first of three sequential sub-projects (quality -> speed -> ID-rate +5% per dataset). Decomposed during the 2026-05-26 brainstorm because the three concerns differ in risk, scope, and time profile; the ID-rate target is a multi-PR research project, not a single ship gate. Scope: 7 groups (6 in-PR + 1 out-of-repo memory update). Dangling .java:LINE refs (42), stale "port of MS-GF+" framing, identifier renames (MSGFRUST_RSS_PROBE etc.), 26 clippy warnings, lift CI lint to required, remove shipped design specs. --- DOCS.md | 4 +- README.md | 2 +- .../2026-05-26-quality-cleanup-design.md | 199 ++++++++++++++++++ 3 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 docs/superpowers/specs/2026-05-26-quality-cleanup-design.md diff --git a/DOCS.md b/DOCS.md index 5743ddc9..f2422cf8 100644 --- a/DOCS.md +++ b/DOCS.md @@ -1,6 +1,6 @@ # msgf-rust documentation -This is the full reference for the `msgf-rust` binary and its outputs. For a quick start and benchmark summary, see [`README.md`](README.md). For porting Java MS-GF+ command lines and numeric legacy flags, see [`CLI_MIGRATION.md`](CLI_MIGRATION.md). +This is the full reference for the `msgf-rust` binary and its outputs. For a quick start and benchmark summary, see [`README.md`](README.md). For porting Java MS-GF+ command lines and numeric legacy flags, see [`docs/CLI_MIGRATION.md`](docs/CLI_MIGRATION.md). Run `msgf-rust --help` for auto-generated help derived from the same `Cli` struct documented below. @@ -459,7 +459,7 @@ msgf-rust accepts **both** canonical kebab-case flags with named enum values **a ### 8b. Numeric-legacy values -Full legacy 0…N → named-value tables for `--fragmentation`, `--instrument`, `--protocol`, and `--enzyme-specificity` (`--ntt`) live in [`CLI_MIGRATION.md`](CLI_MIGRATION.md). clap accepts named values case-insensitively (`--fragmentation hcd` ≡ `HCD`). +Full legacy 0…N → named-value tables for `--fragmentation`, `--instrument`, `--protocol`, and `--enzyme-specificity` (`--ntt`) live in [`docs/CLI_MIGRATION.md`](docs/CLI_MIGRATION.md). clap accepts named values case-insensitively (`--fragmentation hcd` ≡ `HCD`). ### 8c. Behavior differences diff --git a/README.md b/README.md index f3a1b553..11adfac2 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ msgf-rust --spectrum spectra.mzML --database db.fasta \ **[quantms](https://github.com/bigbio/quantms) pipeline integration:** -Point quantms's PSM search step at `msgf-rust` and use the standard quantms post-processing. The `.pin` row format is the same; existing quantms scripts using legacy numeric flag values (`--fragmentation 3 --instrument 3 --protocol 4`) keep working without modification (see `CLI_MIGRATION.md`). +Point quantms's PSM search step at `msgf-rust` and use the standard quantms post-processing. The `.pin` row format is the same; existing quantms scripts using legacy numeric flag values (`--fragmentation 3 --instrument 3 --protocol 4`) keep working without modification (see [`docs/CLI_MIGRATION.md`](docs/CLI_MIGRATION.md)). ## CLI summary diff --git a/docs/superpowers/specs/2026-05-26-quality-cleanup-design.md b/docs/superpowers/specs/2026-05-26-quality-cleanup-design.md new file mode 100644 index 00000000..a20b03d8 --- /dev/null +++ b/docs/superpowers/specs/2026-05-26-quality-cleanup-design.md @@ -0,0 +1,199 @@ +# Design — Quality cleanup sweep (PR-Q1) + +**Date:** 2026-05-26 +**Branch:** `feat/quality-perf-id-rate` +**Status:** Spec for review +**First sub-project of three:** Q1 (this) → S1 (speed) → I1 (ID rate) + +## Problem + +Post-cutover the codebase carries stale historical references and lint debt accumulated across the Java→Rust port iterations. Specifically: + +- **42 dangling `Xxx.java:LINE` pointers** in source comments cite Java code that no longer exists in this repo (removed in cutover commit `b4565b8e chore: remove Java tool sources`). They read as broken hyperlinks. +- **File-header `port of Java MS-GF+ X` framing** introduces modules as ports of files that no longer live in-tree; misleading for new contributors. +- **`MSGFRUST_RSS_PROBE` env var** and any remaining `java_*` / `msgf_*` symbol names carry iter-era naming that doesn't reflect the current binary identity (`msgf-rust`). +- **26 clippy warnings** across the workspace, plus a known dead `mut` and undiscovered `unused_*` items. CI lint job runs `continue-on-error: true` because the codebase isn't yet clippy-clean. + +These don't affect runtime behavior, but they: +1. Confuse new contributors trying to read context-laden comments that point at non-existent code. +2. Block the CI lint job from being a real gate. +3. Make refactoring noisier than necessary (every modification trips a re-formatter or a stylistic warning). + +## Goal + +Single low-risk PR that lands a post-cutover quality sweep. Logic-preserving; bit-identical PIN/TSV output for `--precursor-cal off`. Lifts CI lint from advisory to required. + +## Non-goals + +- Speed or performance work (PR-S1, separate brainstorm). +- ID-rate work (PR-I1, separate multi-PR project). +- Parity test files (`tests/*_java_parity.rs`, `tests/gf_bsa_parity.rs`) — their identity IS Java parity; refs stay. +- `docs/parity-analysis/notes/` iter notes — historical; not edited. +- Renaming production public APIs across crate boundaries. +- Rust edition / toolchain bumps. + +## Scope — 7 logical groups (6 in-PR + 1 out-of-repo) + +### Group 1 — Dangling Java source pointers (42 refs) + +Replace `Xxx.java:LINE` citations with intent-only comments. The semantic intent stays; the broken pointer goes. + +Before: +```rust +// per-SpecKey raw-score retention (DBScanner.java:534). +``` +After: +```rust +// per-SpecKey raw-score retention (Java parity). +``` + +Files (counts from initial scan): +- `crates/search/src/match_engine.rs` — 12 refs +- `crates/output/src/pin.rs` — 2 refs +- `crates/input/src/mzml.rs` — 2 refs +- `crates/search/src/psm.rs` — 1 ref +- `crates/search/src/mass_calibrator.rs` — 1 ref +- Others — smaller counts + +**Excluded:** `crates/search/tests/gf_java_parity.rs`, `crates/search/tests/match_engine_java_parity.rs`. These tests' purpose is documenting Java parity; their citations are load-bearing. + +### Group 2 — Stale "port of MS-GF+" framing + +File-header `//!` intros and CLI `--help` strings that introduce modules/flags by reference to Java code go neutral. + +Targets: +- `crates/search/src/lib.rs`, `crates/scoring/src/lib.rs`, `crates/output/src/lib.rs` headers +- `crates/msgf-rust/src/bin/msgf-rust.rs` CLI help strings +- A few in-source `//!` modules across `model/`, `search/` + +Keep: +- `README.md` provenance section ("evolved from the Java MS-GF+ tradition" or equivalent) +- `DOCS.md` benchmarking-comparison table (explicitly cites Java numbers) +- All `docs/parity-analysis/` content + +### Group 3 — Stale identifier renames + +- `MSGFRUST_RSS_PROBE` env var → `MSGF_RSS_PROBE` (or just `RSS_PROBE`) + - Accept BOTH the old and new name during one release; emit a one-line deprecation eprintln if the old name is set, then drop in the next quality cleanup +- Audit for any remaining `java_*` or `msgf_*` named items in source (excluding test fixtures) +- The binary name (`msgf-rust`) and crate name (`msgf-rust`) stay — those are the product identity + +### Group 4 — Clippy 26 warnings + `unused_*` sweep + +| Warning class | Count | Fix approach | +|---|---:|---| +| `too_many_arguments` (8/7 or 11/7) | 5 | Wrap shared args in a small struct; one cohesive grouping per call site | +| Complex type → `type` alias | 6 | 2-3 reusable type aliases (`SegmentPartitionCache`, etc.) | +| `map_or` simplification | 6 | Mechanical rewrite | +| `doc_list_item_without_indentation` | 4 | Reformat bullet indents | +| `unused_mut` (real dead) | 1 | Drop `mut` | +| Manual `?` rewrite | 1 | Apply | +| Manual `split_once` | 1 | Apply | +| Loop-index borrow | 1 | `iter().enumerate()` | +| Crate summaries | 4 | Mostly auto-fixable via `cargo clippy --fix --lib` | + +Additionally: +- Run `cargo +nightly -W unused_variables -W dead_code -W unused_imports --workspace` and clean any findings the stable compiler missed. +- Where a finding is intentional, add `#[allow(...)]` with a one-line justification. + +### Group 5 — Lift CI lint to required + +`.github/workflows/ci.yml` currently runs the `lint` job with `continue-on-error: true`. After Groups 1-4, the workspace is clippy-clean. Drop the `continue-on-error` so lint becomes a real gate. + +### Group 6 — Remove outdated in-repo docs + +Tracked docs under `docs/superpowers/` exist for SHIPPED features that no longer need a public spec/plan to reference. Remove: + +- `docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md` — iter39 shipped 2026-05-23 in PR #30; design no longer in-flight. +- `docs/superpowers/plans/2026-05-23-iter39-docs-rewrite.md` — same. + +Keep: +- `docs/superpowers/specs/2026-05-26-quality-cleanup-design.md` — THIS spec; in-flight. +- All `docs/parity-analysis/notes/2026-05-25-*.md` — referenced by the in-flight ship-gates discussion (precursor-cal G1 still deferred). +- `docs/parity-analysis/snapshots/cal-shifts-2026-05-25.json` — current bench artifact. + +Future protocol (documented in this spec so reviewers can apply it): when a `docs/superpowers/{specs,plans}/*.md` file references a feature that has fully shipped + been benched + closed any deferred gate, remove it in the next quality cleanup. + +### Group 7 — Update project auto-memory (out-of-repo) + +Auto-memory lives at `~/.claude/projects/-Users-yperez-work-msgfplus-workspace/memory/`. Out of the PR's diff but in the cleanup sweep. To be done by the controller alongside PR-Q1: + +- Update `MEMORY.md` index: PR #29 (rust-implement → dev) MERGED, not OPEN; PR #33 (precursor-cal-pr-a → dev) MERGED 2026-05-26; PR #32 (review/bug-hunt → dev) MERGED 2026-05-26. +- Add new entry referencing the 2026-05-25/26 bench numbers (LFQ 14,721 / Astral 36,771 / TMT 9,565 with `--precursor-cal auto`). +- Mark iter32-38 entries as historical / shipped. +- Note the new PR-Q1 / PR-S1 / PR-I1 sequencing. + +## File-by-file inventory (estimate) + +| File | Change kind | Risk | +|---|---|---| +| `crates/search/src/match_engine.rs` | Java-ref scrub + 1 `too_many_arguments` fix | Medium (hot path) | +| `crates/search/src/mass_calibrator.rs` | Java-ref scrub | Low | +| `crates/search/src/psm.rs` | Java-ref scrub | Low | +| `crates/scoring/src/scoring/scored_spectrum.rs` | 1 `too_many_arguments` fix + complex-type alias | Medium (hot path) | +| `crates/scoring/src/gf/*` | Clippy stylistic | Low | +| `crates/output/src/pin.rs` | Java-ref scrub + 1 `too_many_arguments` fix | Low | +| `crates/output/src/tsv.rs` | Clippy stylistic | Low | +| `crates/input/src/mzml.rs` | Java-ref scrub | Low | +| `crates/msgf-rust/src/bin/msgf-rust.rs` | CLI-help neutral + env var rename | Low | +| `crates/search/src/lib.rs`, `crates/scoring/src/lib.rs`, `crates/output/src/lib.rs` | Header neutral | Low | +| `crates/model/src/*` | Stylistic + 1 `unused_mut` | Low | +| `.github/workflows/ci.yml` | `continue-on-error` removed | Low | + +Estimated total: ~30 files modified + 2 files deleted (Group 6), ~200 LOC of comment/identifier/structural change, 0 functional behavior change. + +## Verification / ship criteria + +| Gate | Threshold | How | +|---|---|---| +| Clippy clean on stable | 0 warnings on `cargo clippy --workspace --release` | CI lint job (now required) | +| Nightly unused-lints clean | 0 (or `#[allow]` justified) | `cargo +nightly -W unused_variables -W dead_code -W unused_imports --workspace` locally | +| Workspace tests | 0 failures under existing skip list | `cargo test --release --workspace -- --skip ...` | +| Off-path bit-identical | `precursor_cal_bit_identical` passes | Already in tree | +| Sanity bench | LFQ / Astral / TMT PSM count within ±5 of pre-cleanup on `--precursor-cal off` | Optional VM run; deferred to reviewer if rayon noise alone explains drift | + +## Risks & mitigations + +| Risk | Mitigation | +|---|---| +| Java-ref scrub accidentally rewords a load-bearing semantic note | Replace IN PLACE preserving comment lines around it; reviewer (or CodeRabbit) flags semantic drift | +| `too_many_arguments` refactor introduces a parameter-ordering bug | The 5 refactors each touch ≤ 1 hot-path function; bench gate catches PSM drift | +| `MSGFRUST_RSS_PROBE` rename breaks an external bench script | Accept BOTH old + new env var name for one release with deprecation eprintln | +| Lifting CI lint surfaces platform-specific warnings (macOS / Windows) | Run `cargo clippy --workspace` locally with `--target x86_64-pc-windows-gnu` and `--target x86_64-apple-darwin` before PR open | + +## Sequencing (Q1 only) + +``` +feat/quality-perf-id-rate (current HEAD: a8ad6ddd) + ↓ +Group 1: Java-ref scrub (commit 1) + ↓ +Group 2: Header / CLI framing (commit 2) + ↓ +Group 3: Identifier renames (commit 3) + ↓ +Group 4: Clippy + unused sweep (commit 4) + ↓ +Group 5: CI lint required (commit 5) + ↓ +Group 6: Remove shipped specs (commit 6) + ↓ +Group 7: Memory update (out-of-repo, separate) + ↓ +Verification: tests + bit-identical gate + local clippy on 3 platforms + ↓ +Push + open PR-Q1 → dev +``` + +6 in-PR commits (Groups 1-6) + 1 out-of-repo memory update (Group 7) — keeps reverts easy per-group if any one surfaces an issue. + +## Open questions + +None — all design points resolved in brainstorming. + +## Related documents + +- `docs/superpowers/specs/2026-05-25-precursor-cal-ship-design.md` — PR-A spec (the precursor calibrator port) +- `docs/parity-analysis/notes/2026-05-25-precursor-cal-ship-gates.md` — current bench numbers + G1 gate status +- `.github/workflows/ci.yml` — CI policy, including the existing test skip list and the lint job's `continue-on-error` +- `DOCS.md` — primary user-facing reference (touched by Group 2 only where stale framing appears) From 84f83295991b227eb1398888dfc8d31306f98215 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 10:14:41 +0100 Subject: [PATCH 03/23] chore: scrub 32 dangling .java:LINE references in non-test source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Java source tree was removed in commit b4565b8e during the Rust-cutover; the inline citations to specific Java line numbers now point at code that does not exist in this repo. Replace each citation with intent-only "Java parity" comments. Preserves semantic meaning; removes the broken hyperlinks. Parity-test files (tests/*_java_parity.rs, tests/gf_bsa_parity.rs, tests/*_match_java.rs) untouched — their identity is Java parity and the citations are load-bearing documentation. 8 non-test files touched, 33 refs replaced, 0 functional changes. --- crates/input/src/mzml.rs | 6 ++-- crates/msgf-rust/src/bin/msgf-rust.rs | 13 ++++---- crates/output/src/pin.rs | 4 +-- crates/scoring/src/scoring/psm_score.rs | 2 +- crates/scoring/src/scoring/scored_spectrum.rs | 19 ++++++------ crates/search/src/mass_calibrator.rs | 4 +-- crates/search/src/match_engine.rs | 30 +++++++++---------- crates/search/src/psm.rs | 22 +++++++------- 8 files changed, 48 insertions(+), 52 deletions(-) diff --git a/crates/input/src/mzml.rs b/crates/input/src/mzml.rs index dcb5624d..30c0e7b2 100644 --- a/crates/input/src/mzml.rs +++ b/crates/input/src/mzml.rs @@ -59,8 +59,8 @@ const CV_32BIT: &str = "MS:1000521"; const CV_ZLIB: &str = "MS:1000574"; // Activation-method CV accessions (inside ). -// These mirror Java MS-GF+'s `ActivationMethod.cvTable` in -// `msutil/ActivationMethod.java` — we map each to one of our five +// These mirror Java MS-GF+'s `ActivationMethod.cvTable` (Java parity) +// — we map each to one of our five // canonical ActivationMethod variants. Unknown / unhandled child terms // fall through and the spectrum's activation_method stays None. const CV_CID: &str = "MS:1000133"; // collision-induced dissociation @@ -348,7 +348,7 @@ impl MzMLReader { // here, so downstream param routing picks an ETD-trained // model when ECD is the only signal. // - // Selection rule (mirrors `StaxMzMLParser.java:595-605`): + // Selection rule (Java parity for activation-method selection): // - ETD always wins (set unconditionally; matches Java's // `isETD` short-circuit). // - Other methods: first-wins. A spectrum with multiple diff --git a/crates/msgf-rust/src/bin/msgf-rust.rs b/crates/msgf-rust/src/bin/msgf-rust.rs index 8232659e..5f68f4a6 100644 --- a/crates/msgf-rust/src/bin/msgf-rust.rs +++ b/crates/msgf-rust/src/bin/msgf-rust.rs @@ -986,8 +986,8 @@ fn resolve_bundled_param( } // Step 2: Drop protocol — try `{frag}_{inst}_Tryp.param`. - // This mirrors Java's `return get(method, instType, enzyme)` fallback - // (NewScorerFactory.java line ~120). For (CID, HighRes, Tryp, TMT) this + // This mirrors Java parity: `return get(method, instType, enzyme)` fallback + // (drop protocol suffix when exact match is missing). For (CID, HighRes, Tryp, TMT) this // lands on `CID_HighRes_Tryp.param`, which IS what Java would pick when // the protocol-specific file is missing. if !prot_suffix.is_empty() { @@ -1005,7 +1005,7 @@ fn resolve_bundled_param( // LysN (for N-term enzymes). We always use Tryp here, so this step is // a no-op for now. If/when N-term enzyme support lands, replicate this. - // Step 4: Final fallback ladder (Java NewScorerFactory.java lines ~136-160). + // Step 4: Final fallback ladder (Java parity for scorer factory fallback). // - HCD + (TOF|HighRes) + C-term → CID_TOF_Tryp // - ETD + C-term → ETD_LowRes_Tryp // - Non-electron + N-term → CID_LowRes_LysN (skipped; N-term TBD) @@ -1114,8 +1114,7 @@ fn detect_dominant_activation(spectrum_path: &std::path::Path) -> Option( writer.write_all(&[b'\t', flag])?; } - // enzN, enzC, enzInt — C-4 (2026-05-19): Java DirectPinWriter.java:199-203 + // enzN, enzC, enzInt — C-4 (2026-05-19): Java parity — // emits enzymatic-boundary consistency features. enzN = boundary between // protein-pre and peptide[0]; enzC = boundary between peptide[last] and // protein-post; enzInt = count of internal positions consistent with the @@ -414,7 +414,7 @@ fn write_psm_row( // Proteins column(s): one tab-separated accession per candidate_idx. // After R-2.2 dedup, a PSM that matches the same peptide across multiple // proteins keeps all protein indices in candidate_idxs, and the PIN row - // emits one accession per index — matching Java DirectPinWriter.java:237. + // emits one accession per index — matching Java parity for multi-protein PIN rows. // For PSMs with a single candidate_idx (typical), output is identical to // the pre-R-2.5 single-accession emit (ctx.accession still used by TSV). write!(writer, "\t{}", cand.peptide)?; diff --git a/crates/scoring/src/scoring/psm_score.rs b/crates/scoring/src/scoring/psm_score.rs index 6b7c95f4..803addac 100644 --- a/crates/scoring/src/scoring/psm_score.rs +++ b/crates/scoring/src/scoring/psm_score.rs @@ -42,7 +42,7 @@ fn trace_pep_filter() -> Option<&'static String> { /// regresses Astral 1% FDR by ~30%; adding it as a new feature lets /// Percolator learn weights without breaking the existing distribution. /// -/// Mirrors Java's `DBScanner.java:513` call: fromIndex=1, toIndex=n+1 → +/// Java parity: fromIndex=1, toIndex=n+1 → /// reverse loop iterates `i` from n-1 down to 1, forward loop iterates /// `i` from 1 to n-1. pub fn psm_edge_score( diff --git a/crates/scoring/src/scoring/scored_spectrum.rs b/crates/scoring/src/scoring/scored_spectrum.rs index 6eeb296d..1b90707a 100644 --- a/crates/scoring/src/scoring/scored_spectrum.rs +++ b/crates/scoring/src/scoring/scored_spectrum.rs @@ -193,7 +193,7 @@ impl<'a> ScoredSpectrum<'a> { // MS2IonCurrent / ion-current-ratio denominator: Java zeroes precursor // peak intensities via `Spectrum.filterPrecursorPeaks` BEFORE // PSMFeatureFinder.computeSumIonCurrent iterates the spec - // (NewScoredSpectrum.java:44-45). Those zeroed peaks then contribute + // (Java parity: precursor peaks zeroed before ion-current sum). Those zeroed peaks then contribute // 0 to MS2IonCurrent. Rust filters precursor peaks for rank // assignment but the original `spec.peaks` is unmodified, so summing // it directly OVER-COUNTS by the precursor-peak intensity. Use the @@ -220,8 +220,8 @@ impl<'a> ScoredSpectrum<'a> { let parent_mass = neutral_mass; // = (precursor_mz - PROTON) * charge // iter30 C-1: apply Java-parity isotope-cluster deconvolution FIRST, - // BEFORE prob_peak is computed (Java's `NewScoredSpectrum.java:76-88` - // does deconv first, then probPeak from the post-deconv spectrum). + // BEFORE prob_peak is computed (Java parity: deconv first, then + // probPeak from the post-deconv spectrum). // // No `charge > 2` guard — Java's `applyDeconvolution` is unconditional; // `deconvolute_spectrum` is a no-op for charge ≤ 2 because its inner @@ -240,9 +240,8 @@ impl<'a> ScoredSpectrum<'a> { }; // iter30 C-2: compute prob_peak from the ACTIVE peak list (post-deconv - // if applied; else kept_count). Java: `probPeak = spec.size() / - // max(approxNumBins, 1)` where `spec` is the post-deconv spectrum - // (`NewScoredSpectrum.java:83-88`). + // if applied; else kept_count). Java parity: `probPeak = spec.size() / + // max(approxNumBins, 1)` where `spec` is the post-deconv spectrum. // // parent_mass = (precursor_mz - PROTON) * charge // approxNumBins = parent_mass / (mme.raw_value() * 2) @@ -897,8 +896,8 @@ fn nearest_peak_rank_in(peaks: &[(f64, f32)], ranks: &[u32], target_mz: f64, tol /// Java-parity isotope-cluster deconvolution. /// -/// Mirrors `Spectrum.getDeconvolutedSpectrum(toleranceBetweenIsotopes)` in -/// `astral-speed/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java`. +/// Java parity for spectrum deconvolution semantics +/// (`Spectrum.getDeconvolutedSpectrum(toleranceBetweenIsotopes)`). /// /// Input is the spectrum's peak list (sorted ascending by m/z) plus the /// rank vector aligned with it (rank 1 = highest intensity; `u32::MAX` @@ -1236,8 +1235,8 @@ mod tests { /// T-2: For charge-3 spectra with `apply_deconvolution=true`, `prob_peak` /// MUST be computed from the post-deconvolution peak count, not the - /// pre-deconvolution kept_count. Java's `NewScoredSpectrum.java:83-88` - /// derives `probPeak` from `spec.size()` AFTER `spec` is replaced by the + /// pre-deconvolution kept_count. Java parity: `probPeak` is derived from + /// `spec.size()` AFTER `spec` is replaced by the /// deconvoluted spectrum. Iter30 C-2 enforces this ordering. #[test] fn deconv_active_for_charge_3_uses_post_deconv_peak_count_for_prob_peak() { diff --git a/crates/search/src/mass_calibrator.rs b/crates/search/src/mass_calibrator.rs index fa1b5aa7..a227cf73 100644 --- a/crates/search/src/mass_calibrator.rs +++ b/crates/search/src/mass_calibrator.rs @@ -172,8 +172,8 @@ pub fn learn_calibration_stats( } } -/// Tighten ppm precursor tolerance after a successful cal pass (Java -/// `MSGFPlus.java` post-cal block). No-op when stats are unreliable or +/// Tighten ppm precursor tolerance after a successful cal pass (matching +/// Java's post-cal block). No-op when stats are unreliable or /// tolerance is not ppm-based. pub fn apply_tightened_precursor_tolerance(params: &mut SearchParams, stats: CalibrationStats) { if !stats.has_reliable_stats() { diff --git a/crates/search/src/match_engine.rs b/crates/search/src/match_engine.rs index 210bf6f8..a923eb22 100644 --- a/crates/search/src/match_engine.rs +++ b/crates/search/src/match_engine.rs @@ -343,7 +343,7 @@ impl<'a> PreparedSearch<'a> { } // R-2.1: per-charge queue keyed by charge state. Mirrors Java's - // per-SpecKey raw-score retention (DBScanner.java:534). + // per-SpecKey raw-score retention (Java parity). let mut per_charge_queues: FxHashMap = FxHashMap::default(); for &cand_idx in &window_cand_indices { @@ -463,7 +463,7 @@ impl<'a> PreparedSearch<'a> { // R-2.2: pepSeq + score dedup per-charge BEFORE GF compute. // Same peptide matched against multiple proteins collapses to one - // PsmMatch with aggregated candidate_idxs (Java DBScanner.java:719-733). + // PsmMatch with aggregated candidate_idxs (Java parity for pepSeq dedup). for queue in per_charge_queues.values_mut() { if queue.len() > 1 { let drained = queue.drain_into_vec(); @@ -476,7 +476,7 @@ impl<'a> PreparedSearch<'a> { // R-2.3: per-charge GF / SpecEValue compute. Each per-charge queue // gets SpecE calibrated against its OWN charge's GF distribution - // (Java DBScanner.java:606,779 — getRankScorer per SpecKey). + // (Java parity: getRankScorer per SpecKey). let enzyme_opt = if params.enzyme != Enzyme::NoCleavage && params.enzyme != Enzyme::NonSpecific { @@ -512,7 +512,7 @@ impl<'a> PreparedSearch<'a> { // R-2.4: spectrum-level merge with SpecE tie keep. R-1's // TopNQueue::push (Ordering::Equal arm) keeps SpecE ties at // capacity because PsmMatch::cmp orders by spec_e_value first. - // Matches Java DBScanner.java:745. + // Matches Java parity: SpecE tie-keep on spectrum-level merge. for (_charge, mut per_charge) in per_charge_queues.drain() { for psm in per_charge.drain_into_vec() { queue.push(psm); @@ -688,8 +688,7 @@ fn compute_spec_e_values_for_spectrum( // 2. Compute the minimum score across all PSMs (used as GF score threshold). // // iter37 HIGH-1: use `rank_score` (= node + cleavage + edge), not `score` - // (= node + cleavage only). Java's `DBScanner.java:619-621` reads - // `m.getScore()`, which is set at `DBScanner.java:533` as + // (= node + cleavage only). Java parity: `match.score` is // `cleavageScore + rawScore` where `rawScore` is `DBScanScorer.getScore`'s // `node + edge` return — i.e. Rust's `rank_score`. Using `score` here was // seeding the GF threshold below Java's level by the per-PSM edge_score @@ -785,9 +784,9 @@ fn compute_spec_e_values_for_spectrum( // 4. For each PSM in the queue, compute spec_e_value from its score. // // iter37 HIGH-1: use `rank_score` (Java-aligned `node + cleavage + edge`), - // not `score` (Rust pin-only `node + cleavage`). Java's - // `DBScanner.java:697-699` calls `gf.getSpectralProbability(match.getScore())` - // where `match.getScore()` is Java's `node + cleavage + edge`. Using + // not `score` (Rust pin-only `node + cleavage`). Java parity: + // `gf.getSpectralProbability(match.getScore())` where `match.getScore()` + // is `node + cleavage + edge`. Using // `score` here was looking up the wrong tail of the GF score distribution // (lower by the per-PSM edge contribution ~+20), giving inflated // SpecEValue values for PSMs whose top-1 was chosen via edge contribution. @@ -819,11 +818,10 @@ fn compute_spec_e_values_for_spectrum( // // e_value = spec_e_value * num_distinct_peptides_at_length. // - // HIGH-2 (2026-05-18): align lookup index with Java. Java's - // `DirectPinWriter.java:165` does + // HIGH-2 (2026-05-18): align lookup index with Java parity. // `sa.getNumDistinctPeptides(enzyme == null ? length - 2 : length - 1)` - // where `match.getLength() = pepLength + 2` (DBScanner.java:521 includes the - // two flanking residues in the stored length). So Java effectively queries + // where `match.getLength() = pepLength + 2` (flanking residues included in + // the stored length). So Java effectively queries // - with enzyme: `numDistinctPeptides[pepLength + 1]` // - without enzyme: `numDistinctPeptides[pepLength]` // @@ -898,7 +896,7 @@ pub(crate) fn compute_psm_features( // some headroom for partition multi-ion-type matches at long peptides). let mut matched_ions: SmallVec<[(f32, f64, f64, bool); 96]> = SmallVec::new(); - // Java parity (PSMFeatureFinder.java:51-54): feature-counting uses a + // Java parity: feature-counting uses a // HARDCODED fragment tolerance, NOT param.mme. High-res instruments // (HighRes / TOF / QExactive) get 20 ppm; low-res LTQ gets 0.5 Da. // The param.mme value (0.5 Da for HCD_QExactive_Tryp.param) is the @@ -972,7 +970,7 @@ pub(crate) fn compute_psm_features( // ── Ion-current ratio features (iter22 partition-ion-list fix) ───────────── // - // Java's `NewScoredSpectrum.getExplainedIonCurrent` (NewScoredSpectrum.java:253) + // Java parity: `NewScoredSpectrum.getExplainedIonCurrent` // iterates the FULL partition ion list across all segments (b, y, plus // partition-specific variants like a-ion, b-H2O, etc.) and sums matched // peak intensities. The current Rust matched-ion buffer above only @@ -1321,7 +1319,7 @@ mod feature_tests { // 0.0005 Da offset = ~6 ppm at m/z 89 (Ala b1) — within the // hardcoded 20 ppm window that compute_psm_features now uses for - // high-resolution instruments (Java parity, PSMFeatureFinder.java:51-54). + // high-resolution instruments (Java parity). // The previous 0.01 Da offset assumed Rust used param.mme (~0.05 Da // in this fixture's make_scorer), but the iter20 fix makes feature // counting use 20 ppm regardless of param.mme. diff --git a/crates/search/src/psm.rs b/crates/search/src/psm.rs index 1b28270e..b1756325 100644 --- a/crates/search/src/psm.rs +++ b/crates/search/src/psm.rs @@ -73,8 +73,8 @@ pub struct PsmMatch { /// share the same peptide sequence and rounded score (typically the same /// peptide matched against multiple proteins, e.g. shared tryptic /// peptides in target+decoy concat). The PIN writer iterates this Vec to - /// emit one tab-separated `Proteins` column per row, matching Java's - /// `DirectPinWriter.java:237`. + /// emit one tab-separated `Proteins` column per row, matching Java parity + /// for the Proteins column in PIN output. /// /// Every real PSM has length ≥ 1 with valid indices into /// `PreparedSearch.candidates`. Test fixtures that don't need to resolve @@ -89,8 +89,8 @@ pub struct PsmMatch { /// from iter19's design). Used by Percolator as one of many features. pub score: f32, /// iter33: queue-ordering score = `node + cleavage + edge`. Java's - /// `DBScanScorer.getScore` returns `node + edge` and `DBScanner.java:533` - /// adds cleavage, so Java's `match.score` (used by its `PriorityQueue` + /// `DBScanScorer.getScore` returns `node + edge` and Java parity adds + /// cleavage, so Java's `match.score` (used by its `PriorityQueue` /// ordering) is `node + cleavage + edge`. Rust's pin RawScore stays at /// `node + cleavage` for Percolator distribution stability (iter19); the /// SEPARATE `EdgeScore` PIN column carries the `+edge` contribution. @@ -229,7 +229,7 @@ impl TopNQueue { /// **Tie handling (R-1, 2026-05-18):** when the queue is at capacity and /// a new PSM is `Equal` (in `Ord` terms) to the worst retained PSM, the /// new PSM is inserted WITHOUT evicting the tied one. This matches - /// Java's `DBScanner.java:540` (`size < n OR score == worst → add`). + /// Java parity: `size < n OR score == worst → add`. /// As a result, the queue can grow beyond `capacity` when ties exist; /// `capacity` becomes a *minimum* top-N, not a hard cap. pub fn push(&mut self, m: PsmMatch) { @@ -244,9 +244,9 @@ impl TopNQueue { self.heap.push(Reverse(m)); } std::cmp::Ordering::Equal => { - // R-1 (2026-05-18): Java's DBScanner.java:540 keeps tied - // PSMs at capacity (and DBScanner.java:745 keeps SpecE - // ties on the per-spectrum merge). Rust now matches. + // R-1 (2026-05-18): Java parity keeps tied + // PSMs at capacity (and SpecE ties on the per-spectrum + // merge). Rust now matches. // The queue may exceed `capacity` when ties exist — // `capacity` becomes a *minimum* top-N, not a hard cap. self.heap.push(Reverse(m)); @@ -441,9 +441,9 @@ mod tests { #[test] fn topn_queue_keeps_ties_at_capacity() { - // R-1 fix: Java's DBScanner keeps tied PSMs at capacity - // (DBScanner.java:540 raw-score retention; DBScanner.java:745 SpecE - // merge). Rust's TopNQueue must mirror this — strict-greater eviction + // R-1 fix: Java parity keeps tied PSMs at capacity (raw-score + // retention and SpecE merge). Rust's TopNQueue must mirror this — + // strict-greater eviction // was dropping ties Java keeps, plausibly causing the Astral 14K raw- // target gap that R-1 + R-2 closed. let mut q = TopNQueue::new(1); From ba4c6b342b1a10349f48de1906237e4b5e0d70a0 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 10:21:59 +0100 Subject: [PATCH 04/23] chore: neutralize "port of MS-GF+" framing in headers and CLI help The codebase is post-cutover; new contributors should read crate-lib top-of-file doc comments as descriptions of what each crate does, not as port-bookkeeping. CLI --help and enum doc comments that compared behavior to Java's command-line options now describe behavior directly. KEEPS user-facing provenance: - README.md and DOCS.md project-lineage sections - Legacy numeric flag values (Java MS-GF+ -X) in --help (user migration) - (Java -precursorCal) in precursor_cal doc (exact flag name we mirror) - docs/parity-analysis/** content - Parity test files Touched: 5 crate-lib headers + msgf-rust CLI framing. --- crates/input/src/lib.rs | 3 +-- crates/model/src/lib.rs | 2 +- crates/msgf-rust/src/bin/msgf-rust.rs | 19 +++++++++---------- crates/output/src/lib.rs | 2 +- crates/scoring/src/lib.rs | 2 +- crates/search/src/lib.rs | 3 ++- 6 files changed, 15 insertions(+), 16 deletions(-) diff --git a/crates/input/src/lib.rs b/crates/input/src/lib.rs index 65dc105a..0f44daac 100644 --- a/crates/input/src/lib.rs +++ b/crates/input/src/lib.rs @@ -1,5 +1,4 @@ -//! Input-side readers for MS-GF+ Rust port: MGF and mzML spectrum files -//! and `.fasta` protein databases. +//! Input readers: MGF, mzML, FASTA. pub mod fasta; pub mod mgf; diff --git a/crates/model/src/lib.rs b/crates/model/src/lib.rs index b931bf3b..14038425 100644 --- a/crates/model/src/lib.rs +++ b/crates/model/src/lib.rs @@ -1,4 +1,4 @@ -//! Domain model for MS-GF+ Rust port. +//! Core domain types: spectra, peptides, modifications, amino-acid sets, masses. //! //! Pure types: amino acids, modifications, peptides, enzymes, //! tolerances, spectra, proteins, masses, activation, instrument, diff --git a/crates/msgf-rust/src/bin/msgf-rust.rs b/crates/msgf-rust/src/bin/msgf-rust.rs index 5f68f4a6..f628c064 100644 --- a/crates/msgf-rust/src/bin/msgf-rust.rs +++ b/crates/msgf-rust/src/bin/msgf-rust.rs @@ -1,7 +1,7 @@ -//! msgf-rust: end-to-end MS-GF+ search. +//! msgf-rust: end-to-end peptide-spectrum database search. //! //! Loads an MGF or mzML spectrum file and a FASTA target database, runs a -//! tryptic database search with default MS-GF+ parameters, and writes output +//! tryptic database search and writes output //! in Percolator `.pin` format (and optionally `.tsv` format). //! //! Format dispatch: if `--spectrum` ends in `.mzML` or `.mzml`, `MzMLReader` @@ -29,10 +29,9 @@ use search::{ use search::precursor_cal::{constants as cal_constants, sample_every_nth}; use input::{detect_instrument_type, FastaReader, MgfReader, MzMLReader}; -/// Fragmentation method. Named values map to the same param-file resolution -/// logic as Java MS-GF+'s `-m` flag. `Auto` means "detect from the mzML's -/// activation block; fall back to the bundled HCD_QExactive_Tryp.param if -/// nothing detected" — the same semantics as omitting the flag pre-iter39. +/// Fragmentation method. `Auto` means "detect from the mzML's activation block; +/// fall back to the bundled HCD_QExactive_Tryp.param if nothing detected" — +/// the same semantics as omitting the flag pre-iter39. #[derive(Clone, Copy, Debug, PartialEq, Eq, ValueEnum)] pub enum Fragmentation { #[clap(name = "auto")] Auto, @@ -52,7 +51,7 @@ pub enum Instrument { #[clap(name = "QExactive")] QExactive, } -/// Search protocol. Maps to Java MS-GF+'s `-protocol` flag. +/// Search protocol: sample labeling or enrichment strategy applied during the experiment. #[derive(Clone, Copy, Debug, PartialEq, Eq, ValueEnum)] pub enum Protocol { #[clap(name = "auto")] Auto, @@ -63,8 +62,8 @@ pub enum Protocol { #[clap(name = "standard")] Standard, } -/// Enzymatic-cleavage enforcement at peptide span boundaries. Maps to Java -/// MS-GF+'s `-ntt` flag where 2=fully, 1=semi, 0=non-specific. +/// Enzymatic-cleavage enforcement at peptide span boundaries: +/// 2=fully, 1=semi, 0=non-specific. #[derive(Clone, Copy, Debug, PartialEq, Eq, ValueEnum)] pub enum EnzymeSpecificity { #[clap(name = "non-specific")] NonSpecific, @@ -75,7 +74,7 @@ pub enum EnzymeSpecificity { #[derive(Parser, Debug)] #[command( name = "msgf-rust", - about = "MS-GF+ Rust port: database search of MGF/mzML spectra against FASTA", + about = "msgf-rust: database search of MGF/mzML spectra against FASTA", allow_hyphen_values = true, )] struct Cli { diff --git a/crates/output/src/lib.rs b/crates/output/src/lib.rs index a062cb7d..badb2a28 100644 --- a/crates/output/src/lib.rs +++ b/crates/output/src/lib.rs @@ -1,4 +1,4 @@ -//! Output writers for MS-GF+ search results. +//! Output writers: Percolator PIN, TSV. //! //! # Known column behaviors //! diff --git a/crates/scoring/src/lib.rs b/crates/scoring/src/lib.rs index 22482f6e..afed8aff 100644 --- a/crates/scoring/src/lib.rs +++ b/crates/scoring/src/lib.rs @@ -1,4 +1,4 @@ -//! Scoring sub-system for MS-GF+ Rust port. +//! Scoring model, ion-type prediction, and generating-function DP. //! //! Contains the parameter model, rank-based scoring, fragment ion //! prediction, and the generating-function DP for SpecEValue. diff --git a/crates/search/src/lib.rs b/crates/search/src/lib.rs index ce67f270..3ee5881f 100644 --- a/crates/search/src/lib.rs +++ b/crates/search/src/lib.rs @@ -1,4 +1,5 @@ -//! Search sub-system for MS-GF+ Rust port. +//! Peptide database search engine: candidate enumeration, precursor matching, +//! scoring, and PSM aggregation. //! //! Contains candidate generation, suffix array, search index, precursor //! matching, PSM structures, and the match engine. From f0831b0324395bffa944f9eb7bc2c0c150812389 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 10:25:38 +0100 Subject: [PATCH 05/23] chore: rename MSGFRUST_RSS_PROBE -> MSGF_RSS_PROBE (legacy accepted) The "MSGFRUST_" prefix dates from an early iter-era naming and does not match the binary's identity (msgf-rust). Switch the primary name to MSGF_RSS_PROBE and accept the legacy name for this release with a one-line deprecation warning on stderr. The legacy name will be removed in the next quality cleanup. Side-effect-only env var; no functional change to search/scoring. --- crates/msgf-rust/src/bin/msgf-rust.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/crates/msgf-rust/src/bin/msgf-rust.rs b/crates/msgf-rust/src/bin/msgf-rust.rs index f628c064..48e17654 100644 --- a/crates/msgf-rust/src/bin/msgf-rust.rs +++ b/crates/msgf-rust/src/bin/msgf-rust.rs @@ -227,13 +227,24 @@ fn main() -> ExitCode { } } -/// Print VmRSS for the current process under MSGFRUST_RSS_PROBE=1. No-op +/// Print VmRSS for the current process under MSGF_RSS_PROBE=1. No-op /// otherwise and a no-op on non-Linux platforms regardless of the env var. +/// (Legacy name MSGFRUST_RSS_PROBE is accepted with a deprecation warning.) /// /// We gate behind an env var so production runs stay quiet; flip the var on /// when debugging memory regressions. fn log_rss(tag: &str) { - if std::env::var_os("MSGFRUST_RSS_PROBE").is_none() { + // Accept both new and legacy env var names. Legacy emits a one-time + // deprecation warning on stderr. + let new_set = std::env::var_os("MSGF_RSS_PROBE").is_some(); + let legacy_set = std::env::var_os("MSGFRUST_RSS_PROBE").is_some(); + if legacy_set && !new_set { + eprintln!( + "WARN: MSGFRUST_RSS_PROBE is deprecated; use MSGF_RSS_PROBE \ + (legacy name accepted in this release, will be removed next)" + ); + } + if !new_set && !legacy_set { return; } #[cfg(target_os = "linux")] From 20da1b4e4f0fec98b74f030ab84dac301d0307b1 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 10:46:36 +0100 Subject: [PATCH 06/23] chore: fix all clippy warnings (workspace) Brings the workspace to clippy-clean on stable 1.87.0 so the CI lint job can be lifted from advisory to required in the next commit. Changes by class: - map_or simplifications: mechanical rewrite via clippy --fix - complex-type aliases: SegmentPartitionCache, SegmentPartitionSlice, DeconvResult, and RankKeptCtx struct in crates/scoring/src/scoring/scored_spectrum.rs - too_many_arguments: RankKeptCtx context struct in scored_spectrum.rs; #[allow] with reason for directional_node_score_inner, write_tsv, write_tsv_to, write_spectrum_rows, and compute_cleavage_credit - doc-list indentation: add blank line after list / fix continuation indent at 15 sites in msgf-rust.rs and scored_spectrum.rs - unused_mut, ? rewrite, manual split_once, loop-counter: via clippy --fix - needless_range_loop: suppressed with reason (seg indexes cache AND serves as fallback partition_for arg) No functional behavior change; PIN/TSV bit-identical regression gate in tree (precursor_cal_bit_identical) is the verification. --- crates/model/src/aa_set.rs | 2 +- crates/msgf-rust/src/bin/msgf-rust.rs | 19 ++-- crates/output/src/tsv.rs | 3 + crates/scoring/src/param_model.rs | 2 +- crates/scoring/src/scoring/scored_spectrum.rs | 87 ++++++++++++------- .../tests/add_prob_dist_chunked_parity.rs | 4 +- crates/search/src/distinct_peptide.rs | 2 +- crates/search/src/match_engine.rs | 3 +- crates/search/src/precursor_cal.rs | 2 +- crates/search/src/sa_walk.rs | 4 +- 10 files changed, 76 insertions(+), 52 deletions(-) diff --git a/crates/model/src/aa_set.rs b/crates/model/src/aa_set.rs index c8e54c97..cfe8209a 100644 --- a/crates/model/src/aa_set.rs +++ b/crates/model/src/aa_set.rs @@ -266,7 +266,7 @@ impl AminoAcidSetBuilder { continue; } // Take everything after the first `=`. Java accepts whitespace around the value. - let value = line.splitn(2, '=').nth(1).unwrap_or("").trim(); + let value = line.split_once('=').map(|x| x.1).unwrap_or("").trim(); let n: u32 = value.parse().map_err(|_| AaSetError::BadNumMods { value: value.to_string(), })?; diff --git a/crates/msgf-rust/src/bin/msgf-rust.rs b/crates/msgf-rust/src/bin/msgf-rust.rs index 48e17654..a1b1b397 100644 --- a/crates/msgf-rust/src/bin/msgf-rust.rs +++ b/crates/msgf-rust/src/bin/msgf-rust.rs @@ -175,6 +175,7 @@ struct Cli { /// strings (e.g. `C2H3N1O1`) are **not** yet supported. /// - `` is a single uppercase letter or `*` (wildcard). /// - `` is one of `any|N-term|C-term|Prot-N-term|Prot-C-term`. + /// /// A single `NumMods=N` line sets the max variable mods per peptide. /// Inline `#`-comments are stripped. Blank lines and full-line `#`-comments /// are ignored. When omitted, the binary uses its built-in defaults @@ -930,7 +931,7 @@ fn run(cli: Cli) -> Result<(), Box> { /// - fragmentation: 0=Auto/CID, 1=CID, 2=ETD, 3=HCD, 4=UVPD /// - instrument: 0=LowRes, 1=HighRes, 2=TOF, 3=QExactive /// - protocol: 0=Automatic,1=Phosphorylation, 2=iTRAQ, -/// 3=iTRAQPhospho, 4=TMT, 5=Standard +/// 3=iTRAQPhospho, 4=TMT, 5=Standard /// /// When all three are `None`, the historical default /// `HCD_QExactive_Tryp.param` is returned (preserving existing tests' @@ -1065,12 +1066,10 @@ fn detect_dominant_activation(spectrum_path: &std::path::Path) -> Option = std::collections::HashMap::new(); - let mut seen = 0usize; - for item in reader { + for (seen, item) in reader.enumerate() { if seen >= MAX_PEEK { break; } - seen += 1; if let Ok(spec) = item { if let Some(m) = spec.activation_method { *counts.entry(m).or_insert(0) += 1; @@ -1135,13 +1134,13 @@ fn detect_dominant_activation(spectrum_path: &std::path::Path) -> Option( writer: &mut W, spectra: &[Spectrum], @@ -122,6 +124,7 @@ struct RowCtx<'a> { ppm_mode: bool, } +#[allow(clippy::too_many_arguments, reason = "Writer API mirrors PIN writer; grouping into a struct would diverge from the parallel write_pin API")] fn write_spectrum_rows( writer: &mut W, spec: &Spectrum, diff --git a/crates/scoring/src/param_model.rs b/crates/scoring/src/param_model.rs index 2a267471..15d95fd8 100644 --- a/crates/scoring/src/param_model.rs +++ b/crates/scoring/src/param_model.rs @@ -362,7 +362,7 @@ fn read_param(cursor: &mut Cursor<&[u8]>) -> Result { for &partition in &partitions { let frag_list = frag_off_table.get(&partition); // Skip partitions with no ion types. - if frag_list.map_or(true, |v| v.is_empty()) { + if frag_list.is_none_or(|v| v.is_empty()) { continue; } let mut table: HashMap> = HashMap::new(); diff --git a/crates/scoring/src/scoring/scored_spectrum.rs b/crates/scoring/src/scoring/scored_spectrum.rs index 1b90707a..5c012ec5 100644 --- a/crates/scoring/src/scoring/scored_spectrum.rs +++ b/crates/scoring/src/scoring/scored_spectrum.rs @@ -29,6 +29,25 @@ use model::spectrum::Spectrum; const PROTON: f64 = 1.007_276_49; +/// Per-segment partition entries: `(Partition, Vec<(IonType, log-probs)>)`. +pub(crate) type SegmentPartitionCache = Vec<(Partition, Vec<(IonType, Vec)>)>; +/// Borrowed slice of per-segment partition entries. +pub(crate) type SegmentPartitionSlice<'a> = &'a [(Partition, Vec<(IonType, Vec)>)]; +/// Result of deconvolution: optional peak list and aligned rank list. +type DeconvResult = (Option>, Option>); + +/// Scoring context passed to `ScoredSpectrum::rank_kept`, bundling scalar +/// per-spectrum fields to stay under the `too_many_arguments` limit. +struct RankKeptCtx { + prob_peak: f32, + main_ion: IonType, + parent_mass: f64, + charge: u8, + segment_partition_cache: SegmentPartitionCache, + prefix_score_cache: Vec, + suffix_score_cache: Vec, +} + /// iter31 P-2: cache the (MSGF_TRACE_IONS && MSGF_TRACE_PEP) env-var probe /// once instead of calling `env::var_os` twice per `directional_node_score_inner` /// invocation. The inner loop fires for every (spectrum × split × segment) @@ -105,7 +124,7 @@ pub struct ScoredSpectrum<'a> { /// constructor `new_without_filtering` (no Param / RankScorer in scope) /// the cache is empty; the hot path tolerates length 0 by simply /// iterating no segments and returning 0.0. - segment_partition_cache: Vec<(Partition, Vec<(IonType, Vec)>)>, + segment_partition_cache: SegmentPartitionCache, /// FastScorer-style directional node-score tables indexed by nominal /// residue mass. Populated for production `new()` so candidate scoring /// can do array lookups instead of recomputing per-split node scores. @@ -130,8 +149,8 @@ pub struct ScoredSpectrum<'a> { /// Without this cache, `observed_node_mass` was 11.56% of Astral wall /// (per iter35 perf profile) — each call did a binary_search over peaks /// + linear scan. iter33's per-candidate `psm_edge_score` calls it twice - /// per edge × 9 edges × 16M candidates ≈ 290M times per Astral spectrum, - /// repeatedly for the same `node_nominal` values. + /// per edge × 9 edges × 16M candidates ≈ 290M times per Astral spectrum, + /// repeatedly for the same `node_nominal` values. observed_mass_cache: std::cell::RefCell>, } @@ -230,7 +249,7 @@ impl<'a> ScoredSpectrum<'a> { // spectra (a large fraction of the data), introducing a per-spectrum // divergence in both `prob_peak` and the prefix/suffix node-score // cache. - let (deconv_peaks, deconv_ranks): (Option>, Option>) = + let (deconv_peaks, deconv_ranks): DeconvResult = if param.apply_deconvolution { let tol = param.deconvolution_error_tolerance as f64; let (dp, dr) = deconvolute_spectrum(&spec.peaks, &ranks, charge, tol); @@ -268,7 +287,7 @@ impl<'a> ScoredSpectrum<'a> { // borrowed slice; `.to_vec()` clones it to owned so the cache can // outlive the borrow on `scorer`. let num_segs = param.num_segments.max(0) as usize; - let segment_partition_cache: Vec<(Partition, Vec<(IonType, Vec)>)> = (0..num_segs) + let segment_partition_cache: SegmentPartitionCache = (0..num_segs) .map(|seg| { let p = param.partition_for(charge, parent_mass, seg); let logs = scorer.partition_ion_logs(&p).to_vec(); @@ -363,14 +382,20 @@ impl<'a> ScoredSpectrum<'a> { // empty. `directional_node_score` tolerates an empty cache: the // outer loop iterates zero times and the function returns 0.0. // The test-fixture path doesn't need the per-segment optimization. - let segment_partition_cache: Vec<(Partition, Vec<(IonType, Vec)>)> = Vec::new(); - let prefix_score_cache: Vec = Vec::new(); - let suffix_score_cache: Vec = Vec::new(); Self::rank_kept( - spec, kept, kept_count, ranks, prob_peak, main_ion, parent_mass, charge, - segment_partition_cache, - prefix_score_cache, - suffix_score_cache, + spec, + kept, + kept_count, + ranks, + RankKeptCtx { + prob_peak, + main_ion, + parent_mass, + charge, + segment_partition_cache: Vec::new(), + prefix_score_cache: Vec::new(), + suffix_score_cache: Vec::new(), + }, ) } @@ -382,13 +407,7 @@ impl<'a> ScoredSpectrum<'a> { mut kept: Vec<(usize, f32, f64)>, kept_count: usize, mut ranks: Vec, - prob_peak: f32, - main_ion: IonType, - parent_mass: f64, - charge: u8, - segment_partition_cache: Vec<(Partition, Vec<(IonType, Vec)>)>, - prefix_score_cache: Vec, - suffix_score_cache: Vec, + ctx: RankKeptCtx, ) -> Self { let total_intensity: f64 = kept.iter().map(|&(_, intensity, _)| intensity as f64).sum(); kept.sort_by(|a, b| { @@ -405,13 +424,13 @@ impl<'a> ScoredSpectrum<'a> { ranks, kept_count, total_intensity, - prob_peak, - main_ion, - parent_mass, - charge, - segment_partition_cache, - prefix_score_cache, - suffix_score_cache, + prob_peak: ctx.prob_peak, + main_ion: ctx.main_ion, + parent_mass: ctx.parent_mass, + charge: ctx.charge, + segment_partition_cache: ctx.segment_partition_cache, + prefix_score_cache: ctx.prefix_score_cache, + suffix_score_cache: ctx.suffix_score_cache, deconv_peaks: None, deconv_ranks: None, // iter36: empty cache for test fixtures (rank_kept path). All @@ -542,7 +561,7 @@ impl<'a> ScoredSpectrum<'a> { if self.ranks[i] == u32::MAX { continue; } - if best.as_ref().map_or(true, |(_, best_int)| intensity > *best_int) { + if best.as_ref().is_none_or(|(_, best_int)| intensity > *best_int) { best = Some((i, intensity)); } } @@ -588,7 +607,7 @@ impl<'a> ScoredSpectrum<'a> { if self.ranks[i] == u32::MAX { continue; } - if best.as_ref().map_or(true, |(_, best_int)| intensity > *best_int) { + if best.as_ref().is_none_or(|(_, best_int)| intensity > *best_int) { best = Some((i, intensity)); } } @@ -665,10 +684,11 @@ impl<'a> ScoredSpectrum<'a> { ) } + #[allow(clippy::too_many_arguments, reason = "private inner driver tightly coupled to the scoring loop; all args are distinct")] fn directional_node_score_inner( peaks: &[(f64, f32)], ranks: &[u32], - segment_partition_cache: &[(Partition, Vec<(IonType, Vec)>)], + segment_partition_cache: SegmentPartitionSlice<'_>, scorer: &RankScorer, nominal_mass: f64, is_prefix: bool, @@ -689,6 +709,9 @@ impl<'a> ScoredSpectrum<'a> { // which on Astral runs is ~hundreds of millions of acquisitions of the // global env lock. let trace_ions = trace_ions_enabled(); + // `seg` indexes both the cache AND serves as the fallback argument to + // `partition_for` when the cache is absent — the range loop is required. + #[allow(clippy::needless_range_loop)] for seg in 0..num_segs { let ion_logs_slice: &[(IonType, Vec)] = if use_cache { segment_partition_cache[seg].1.as_slice() @@ -788,7 +811,7 @@ impl<'a> ScoredSpectrum<'a> { if ranks[i] == u32::MAX { continue; } - if best_peak_mz.as_ref().map_or(true, |&(_, best_int)| intensity > best_int) { + if best_peak_mz.as_ref().is_none_or(|&(_, best_int)| intensity > best_int) { best_peak_mz = Some((mz, intensity)); } } @@ -887,7 +910,7 @@ fn nearest_peak_rank_in(peaks: &[(f64, f32)], ranks: &[u32], target_mz: f64, tol if ranks[i] == u32::MAX { continue; } - if best.as_ref().map_or(true, |(_, best_int)| intensity > *best_int) { + if best.as_ref().is_none_or(|(_, best_int)| intensity > *best_int) { best = Some((i, intensity)); } } @@ -1669,7 +1692,7 @@ mod tests { let mut best: Option<(usize, f64)> = None; for (i, &(mz, _)) in s.peaks.iter().enumerate() { if (mz - target).abs() <= tol - && best.as_ref().map_or(true, |(_, d)| (mz - target).abs() < *d) + && best.as_ref().is_none_or(|(_, d)| (mz - target).abs() < *d) { best = Some((i, (mz - target).abs())); } diff --git a/crates/scoring/tests/add_prob_dist_chunked_parity.rs b/crates/scoring/tests/add_prob_dist_chunked_parity.rs index a0f39251..9a08f571 100644 --- a/crates/scoring/tests/add_prob_dist_chunked_parity.rs +++ b/crates/scoring/tests/add_prob_dist_chunked_parity.rs @@ -31,8 +31,8 @@ fn add_prob_dist_scalar( for t in t_start..t_end { let src_idx = (t - other_min) as usize; let dst_idx = (t + score_diff - self_min) as usize; - let cur = dst.get_probability((t + score_diff) as i32); - dst.set_prob((t + score_diff) as i32, cur + src_p(src, src_idx) * aa_prob); + let cur = dst.get_probability((t + score_diff)); + dst.set_prob((t + score_diff), cur + src_p(src, src_idx) * aa_prob); let _ = dst_idx; // silence } } diff --git a/crates/search/src/distinct_peptide.rs b/crates/search/src/distinct_peptide.rs index 56ece42f..c4dd2d8f 100644 --- a/crates/search/src/distinct_peptide.rs +++ b/crates/search/src/distinct_peptide.rs @@ -89,6 +89,6 @@ mod tests { }); assert_eq!(dp.positions.len(), 2); assert_eq!(dp.positions[0].protein_index, 0); - assert_eq!(dp.positions[1].is_decoy, true); + assert!(dp.positions[1].is_decoy); } } diff --git a/crates/search/src/match_engine.rs b/crates/search/src/match_engine.rs index a923eb22..f8287cb1 100644 --- a/crates/search/src/match_engine.rs +++ b/crates/search/src/match_engine.rs @@ -294,6 +294,7 @@ impl<'a> PreparedSearch<'a> { // monomorphizes + inlines into the candidate loop. Closure form // was not being inlined and went through FnMut::call_mut dispatch. #[inline(always)] + #[allow(clippy::too_many_arguments, reason = "private inner driver for the per-chunk search loop; all args are orthogonal cleavage parameters")] fn compute_cleavage_credit( cand: &Candidate, enz: Enzyme, @@ -413,7 +414,7 @@ impl<'a> PreparedSearch<'a> { let could_win = match per_charge_queues.get(&z) { Some(q) if q.len() >= q.capacity() as usize => { q.worst_rank_score() - .map_or(true, |worst| pin_score + max_edge_bonus > worst) + .is_none_or(|worst| pin_score + max_edge_bonus > worst) } // Queue below capacity (or doesn't exist yet): accept // everything until it fills up. diff --git a/crates/search/src/precursor_cal.rs b/crates/search/src/precursor_cal.rs index 046c9fa9..755e7659 100644 --- a/crates/search/src/precursor_cal.rs +++ b/crates/search/src/precursor_cal.rs @@ -92,7 +92,7 @@ pub fn median_absolute_deviation(values: &[f64], center: f64) -> f64 { if values.is_empty() { return 0.0; } - let mut deviations: Vec = values.iter().map(|v| (v - center).abs()).collect(); + let deviations: Vec = values.iter().map(|v| (v - center).abs()).collect(); median(&deviations) } diff --git a/crates/search/src/sa_walk.rs b/crates/search/src/sa_walk.rs index 92b58780..75379084 100644 --- a/crates/search/src/sa_walk.rs +++ b/crates/search/src/sa_walk.rs @@ -162,9 +162,7 @@ impl<'a> SaPeptideStream<'a> { return None; } let aa = byte_to_residue(b); - if AminoAcid::standard(aa).is_none() { - return None; - } + AminoAcid::standard(aa)?; ascii.push(aa); } // Position resolution doubles as a protein-boundary check: if the From 67316e5612a560651b859de2b4e3d0f85e277ecb Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 10:55:08 +0100 Subject: [PATCH 07/23] ci: lift clippy gate to required (--all-targets -D warnings) After PR-Q1 Task 4 left the workspace clippy-clean on --lib targets, remove continue-on-error from the lint job's clippy step and extend the lint command to --all-targets (covers tests + examples + bin in addition to lib). Also addresses 5 residual warnings in test/example targets that the --lib-only fix in Task 4 didn't reach: - crates/scoring/examples/dump_main_ion.rs: struct field shorthand - crates/scoring/examples/dump_prefix_cache.rs: needless_range_loop - crates/scoring/tests/add_prob_dist_chunked_parity.rs: unnecessary parens Rustfmt remains advisory (~11k lines of fmt churn pending; separate cleanup). Lint job now blocks PRs on clippy regressions. --- .github/workflows/ci.yml | 10 ++++------ crates/scoring/examples/dump_main_ion.rs | 2 +- crates/scoring/examples/dump_prefix_cache.rs | 3 +-- crates/scoring/tests/add_prob_dist_chunked_parity.rs | 4 ++-- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 290e8611..4d61a688 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,10 +75,9 @@ jobs: lint: name: Lint (clippy + rustfmt) runs-on: ubuntu-latest - # Advisory only — the iter1-38 codebase isn't fmt-clean / clippy-clean - # yet (~11k lines of fmt churn pending). Surfaces the warnings without - # blocking PRs while that cleanup is sequenced separately. - continue-on-error: true + # Clippy is REQUIRED after the PR-Q1 cleanup sweep (2026-05-26). + # Rustfmt remains advisory until a future fmt-clean sweep lands + # (~11k lines of cosmetic churn pending; tracked separately). steps: - name: Checkout uses: actions/checkout@v4 @@ -96,5 +95,4 @@ jobs: continue-on-error: true - name: clippy - run: cargo clippy --workspace --all-targets - continue-on-error: true + run: cargo clippy --workspace --all-targets -- -D warnings diff --git a/crates/scoring/examples/dump_main_ion.rs b/crates/scoring/examples/dump_main_ion.rs index 80e5076a..e364707c 100644 --- a/crates/scoring/examples/dump_main_ion.rs +++ b/crates/scoring/examples/dump_main_ion.rs @@ -33,7 +33,7 @@ fn main() { let num_segs = param.num_segments.max(1) as usize; let mut ion_freq: std::collections::HashMap = std::collections::HashMap::new(); for seg in 0..num_segs { - let p = scoring::param_model::Partition { charge: charge, parent_mass: part.parent_mass, seg_num: seg as i32 }; + let p = scoring::param_model::Partition { charge, parent_mass: part.parent_mass, seg_num: seg as i32 }; if let Some(frags) = param.frag_off_table.get(&p) { for f in frags { if matches!(f.ion_type, IonType::Noise) { continue; } diff --git a/crates/scoring/examples/dump_prefix_cache.rs b/crates/scoring/examples/dump_prefix_cache.rs index 4d64e2a6..ffed9b5a 100644 --- a/crates/scoring/examples/dump_prefix_cache.rs +++ b/crates/scoring/examples/dump_prefix_cache.rs @@ -110,8 +110,7 @@ fn main() { println!("\n== nominal_mass = {:.1} (is_prefix=true) ==", nominal_mass); let mut total = 0.0_f32; let mut any_iter = false; - for seg in 0..num_segs { - let logs_slice = &cached_ion_logs[seg]; + for (seg, logs_slice) in cached_ion_logs.iter().enumerate().take(num_segs) { for (ion, logs) in logs_slice { if !ion.is_prefix() { continue; diff --git a/crates/scoring/tests/add_prob_dist_chunked_parity.rs b/crates/scoring/tests/add_prob_dist_chunked_parity.rs index 9a08f571..31395838 100644 --- a/crates/scoring/tests/add_prob_dist_chunked_parity.rs +++ b/crates/scoring/tests/add_prob_dist_chunked_parity.rs @@ -31,8 +31,8 @@ fn add_prob_dist_scalar( for t in t_start..t_end { let src_idx = (t - other_min) as usize; let dst_idx = (t + score_diff - self_min) as usize; - let cur = dst.get_probability((t + score_diff)); - dst.set_prob((t + score_diff), cur + src_p(src, src_idx) * aa_prob); + let cur = dst.get_probability(t + score_diff); + dst.set_prob(t + score_diff, cur + src_p(src, src_idx) * aa_prob); let _ = dst_idx; // silence } } From 2e1b6c7ed422cb893e24d95f4fca29a37d7a4c3b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 10:55:58 +0100 Subject: [PATCH 08/23] docs: remove shipped iter39 design+plan; track PR-Q1 plan Deletes the iter39 docs-rewrite spec and plan (shipped 2026-05-23 via PR #30; the rewrite is in dev and being relied on, so the design docs no longer need to be discoverable in the repo). Their lineage is in git history. Tracks the in-flight PR-Q1 implementation plan alongside its design spec (committed in 55cff3fa). Future protocol: when a docs/specs design file references a feature that has fully shipped and closed any deferred gate, remove it in the next quality cleanup. --- .../plans/2026-05-23-iter39-docs-rewrite.md | 1440 ----------------- .../plans/2026-05-26-quality-cleanup-plan.md | 1149 +++++++++++++ .../2026-05-23-iter39-docs-rewrite-design.md | 272 ---- 3 files changed, 1149 insertions(+), 1712 deletions(-) delete mode 100644 docs/superpowers/plans/2026-05-23-iter39-docs-rewrite.md create mode 100644 docs/superpowers/plans/2026-05-26-quality-cleanup-plan.md delete mode 100644 docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md diff --git a/docs/superpowers/plans/2026-05-23-iter39-docs-rewrite.md b/docs/superpowers/plans/2026-05-23-iter39-docs-rewrite.md deleted file mode 100644 index 817a899a..00000000 --- a/docs/superpowers/plans/2026-05-23-iter39-docs-rewrite.md +++ /dev/null @@ -1,1440 +0,0 @@ -# iter39 — docs rewrite + CLI rename Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Rewrite README/docs to fit msgf-rust as a new app (not a Java fork), and rename Java-historical CLI flags to Rust-idiomatic named values with full backward compatibility for quantms scripts. - -**Architecture:** Five sequential commits on branch `iter39-docs-rewrite`. Commit 1 lands the CLI rename + tests (the only commit that touches Rust). Commits 2-4 add three new root-level docs (`README.md`, `DOCS.md`, `CLI_MIGRATION.md`). Commit 5 deletes the legacy `docs/` tree. - -**Tech Stack:** Rust 1.87, clap 4.x (`ValueEnum` derive + custom `value_parser`), cargo test. - -**Constraint:** The repo has a commit-message hook that blocks the word "superpowers" — none of the commit messages in this plan contain that substring. The phrase "skills planning artifacts" is used instead where relevant. - -**Design spec:** `docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md` (commit `eb4953cc`). - ---- - -## File Structure - -**Files modified (in `crates/msgf-rust/`):** -- `crates/msgf-rust/src/bin/msgf-rust.rs` — add 4 enum types + 4 custom parsers, change `Cli` struct fields, update `resolve_bundled_param` signature, update 15 `param_resolver_tests`. -- `crates/msgf-rust/tests/cli_smoke.rs` — add one new integration test. - -**Files created (at repo root):** -- `README.md` — replace existing 193-line Java-tool README with ~190-line linear narrative. -- `DOCS.md` — new ~505-line single-file reference. -- `CLI_MIGRATION.md` — new ~100-line mapping doc. - -**Files deleted (38 tracked files under `docs/`):** -- All listed in Task 7. The `docs/superpowers/specs/` and `docs/superpowers/plans/` paths are preserved. - ---- - -## Task 1: Add `Fragmentation`, `Instrument`, `Protocol`, `EnzymeSpecificity` enums and custom parsers - -**Files:** -- Modify: `crates/msgf-rust/src/bin/msgf-rust.rs:1-30` (add `use` statements + enum definitions) -- Modify: `crates/msgf-rust/src/bin/msgf-rust.rs` (append parser functions at end of file, before tests) - -This task adds the enum types and parsers but doesn't yet wire them into the `Cli` struct or resolver. After this task the code still compiles and all existing tests pass. - -- [ ] **Step 1.1: Add `clap::ValueEnum` import** - -Add `ValueEnum` to the existing `clap` import line at the top of `crates/msgf-rust/src/bin/msgf-rust.rs`: - -```rust -use clap::{Parser, ValueEnum}; -``` - -(The file currently imports just `Parser`.) - -- [ ] **Step 1.2: Add the four enum types** - -Add right after the imports, before the `#[derive(Parser)] struct Cli` block: - -```rust -/// Fragmentation method. Named values map to the same param-file resolution -/// logic as Java MS-GF+'s `-m` flag. `Auto` means "detect from the mzML's -/// activation block; fall back to the bundled HCD_QExactive_Tryp.param if -/// nothing detected" — the same semantics as omitting the flag pre-iter39. -#[derive(Clone, Copy, Debug, PartialEq, Eq, ValueEnum)] -pub enum Fragmentation { - #[clap(name = "auto")] Auto, - #[clap(name = "CID")] Cid, - #[clap(name = "ETD")] Etd, - #[clap(name = "HCD")] Hcd, - #[clap(name = "UVPD")] Uvpd, -} - -/// Instrument class. Drives the `LowRes`/`HighRes`/`TOF`/`QExactive` -/// classification used to pick the bundled param file. -#[derive(Clone, Copy, Debug, PartialEq, Eq, ValueEnum)] -pub enum Instrument { - #[clap(name = "low-res")] LowRes, - #[clap(name = "high-res")] HighRes, - #[clap(name = "TOF")] Tof, - #[clap(name = "QExactive")] QExactive, -} - -/// Search protocol. Maps to Java MS-GF+'s `-protocol` flag. -#[derive(Clone, Copy, Debug, PartialEq, Eq, ValueEnum)] -pub enum Protocol { - #[clap(name = "auto")] Auto, - #[clap(name = "phospho")] Phospho, - #[clap(name = "iTRAQ")] Itraq, - #[clap(name = "iTRAQ-phospho")] ItraqPhospho, - #[clap(name = "TMT")] Tmt, - #[clap(name = "standard")] Standard, -} - -/// Enzymatic-cleavage enforcement at peptide span boundaries. Maps to Java -/// MS-GF+'s `-ntt` flag where 2=fully, 1=semi, 0=non-specific. -#[derive(Clone, Copy, Debug, PartialEq, Eq, ValueEnum)] -pub enum EnzymeSpecificity { - #[clap(name = "non-specific")] NonSpecific, - #[clap(name = "semi")] Semi, - #[clap(name = "fully")] Fully, -} -``` - -- [ ] **Step 1.3: Add the four custom parser functions** - -Add at the bottom of the file (before the existing `#[cfg(test)] mod param_resolver_tests`), one parser per enum. Each accepts the canonical named form first, then falls back to the legacy numeric Java MS-GF+ ID: - -```rust -/// Parse `--fragmentation` value. Accepts named (case-insensitive: auto, CID, -/// ETD, HCD, UVPD) or legacy numeric (0=Auto, 1=CID, 2=ETD, 3=HCD, 4=UVPD). -fn parse_fragmentation(s: &str) -> Result { - if let Ok(v) = ::from_str(s, true) { return Ok(v); } - match s.parse::() { - Ok(0) => Ok(Fragmentation::Auto), - Ok(1) => Ok(Fragmentation::Cid), - Ok(2) => Ok(Fragmentation::Etd), - Ok(3) => Ok(Fragmentation::Hcd), - Ok(4) => Ok(Fragmentation::Uvpd), - _ => Err(format!( - "invalid fragmentation `{s}`: expected auto|CID|ETD|HCD|UVPD \ - (or legacy 0..=4)" - )), - } -} - -/// Parse `--instrument` value. Accepts named (low-res, high-res, TOF, -/// QExactive) or legacy numeric (0=LowRes, 1=HighRes, 2=TOF, 3=QExactive). -fn parse_instrument(s: &str) -> Result { - if let Ok(v) = ::from_str(s, true) { return Ok(v); } - match s.parse::() { - Ok(0) => Ok(Instrument::LowRes), - Ok(1) => Ok(Instrument::HighRes), - Ok(2) => Ok(Instrument::Tof), - Ok(3) => Ok(Instrument::QExactive), - _ => Err(format!( - "invalid instrument `{s}`: expected low-res|high-res|TOF|QExactive \ - (or legacy 0..=3)" - )), - } -} - -/// Parse `--protocol` value. Accepts named or legacy numeric -/// (0=Auto, 1=Phospho, 2=iTRAQ, 3=iTRAQ-phospho, 4=TMT, 5=Standard). -fn parse_protocol(s: &str) -> Result { - if let Ok(v) = ::from_str(s, true) { return Ok(v); } - match s.parse::() { - Ok(0) => Ok(Protocol::Auto), - Ok(1) => Ok(Protocol::Phospho), - Ok(2) => Ok(Protocol::Itraq), - Ok(3) => Ok(Protocol::ItraqPhospho), - Ok(4) => Ok(Protocol::Tmt), - Ok(5) => Ok(Protocol::Standard), - _ => Err(format!( - "invalid --protocol `{s}`: valid range is 0..=5 \ - (0=Automatic, 1=Phosphorylation, 2=iTRAQ, 3=iTRAQPhospho, \ - 4=TMT, 5=Standard) or named auto|phospho|iTRAQ|iTRAQ-phospho|TMT|standard" - )), - } -} - -/// Parse `--enzyme-specificity` (`--ntt`) value. Accepts named -/// (non-specific, semi, fully) or legacy numeric (0=non-specific, -/// 1=semi, 2=fully). -fn parse_enzyme_specificity(s: &str) -> Result { - if let Ok(v) = ::from_str(s, true) { return Ok(v); } - match s.parse::() { - Ok(0) => Ok(EnzymeSpecificity::NonSpecific), - Ok(1) => Ok(EnzymeSpecificity::Semi), - Ok(2) => Ok(EnzymeSpecificity::Fully), - _ => Err(format!( - "invalid enzyme specificity `{s}`: expected non-specific|semi|fully \ - (or legacy 0..=2)" - )), - } -} -``` - -- [ ] **Step 1.4: Verify the file compiles** - -Run: `cargo build --release -p msgf-rust 2>&1 | tail -5` -Expected: `Finished` (no errors). Warnings about unused enums/parsers are OK at this step — they'll be used in Task 2. - -- [ ] **Step 1.5: Verify existing tests still pass** - -Run: `cargo test --release -p msgf-rust 2>&1 | tail -5` -Expected: `test result: ok. 15 passed; 0 failed` for `param_resolver_tests` (the 15 existing resolver tests still pass — we haven't changed any logic yet). - -**Do not commit yet** — Task 2 finishes this commit. - ---- - -## Task 2: Wire the enums into `Cli` struct + `resolve_bundled_param` signature - -**Files:** -- Modify: `crates/msgf-rust/src/bin/msgf-rust.rs` — `Cli` struct fields, `resolve_bundled_param` and `resolve_bundled_param_for_activation` signatures, call sites in `run()`, the 15 `param_resolver_tests`. - -This task migrates the entire codebase from `Option` to the new enum types. After this task the code compiles, existing semantics are preserved (legacy numeric values still resolve to the same param files), and the 15 resolver tests pass with updated signatures. - -- [ ] **Step 2.1: Update the `Cli` struct fields** - -In `crates/msgf-rust/src/bin/msgf-rust.rs`, locate the four CLI fields (currently at approximately lines 84, 128, 134, 140, 147) and replace them. Show the AFTER state of each. - -Replace `ntt` field: -```rust - /// Number of Tolerable Termini (enzymatic-cleavage enforcement at span - /// boundaries). `fully`: both termini must be cleavage sites (strict, - /// equivalent to Java -ntt 2). `semi`: at least one terminus must be a - /// cleavage site (Java -ntt 1). `non-specific`: neither terminus needs - /// to be a cleavage site (Java -ntt 0). Legacy numeric 0/1/2 still accepted. - #[arg(long = "enzyme-specificity", alias = "ntt", - default_value = "fully", value_parser = parse_enzyme_specificity)] - enzyme_specificity: EnzymeSpecificity, -``` - -Replace `mod_file` field with `mods`: -```rust - /// Path to a mods.txt file describing fixed and variable modifications. - /// Format: each non-comment line is - /// `,,,,`, where: - /// - `` is a numeric monoisotopic mass delta (Da). Composition - /// strings (e.g. `C2H3N1O1`) are **not** yet supported. - /// - `` is a single uppercase letter or `*` (wildcard). - /// - `` is one of `any|N-term|C-term|Prot-N-term|Prot-C-term`. - /// A single `NumMods=N` line sets the max variable mods per peptide. - /// Inline `#`-comments are stripped. Blank lines and full-line `#`-comments - /// are ignored. When omitted, the binary uses its built-in defaults - /// (Carbamidomethyl-C fixed, Oxidation-M variable). The deprecated - /// `--mod` form (singular) is still accepted as a hidden alias. - #[arg(long = "mods", alias = "mod", value_name = "MODFILE")] - mods: Option, -``` - -Replace `fragmentation` field: -```rust - /// Fragmentation method. Named values: auto, CID, ETD, HCD, UVPD. - /// Legacy numeric (Java MS-GF+ `-m`): 0=auto, 1=CID, 2=ETD, 3=HCD, 4=UVPD. - #[arg(long, default_value = "auto", value_parser = parse_fragmentation)] - fragmentation: Fragmentation, -``` - -Replace `instrument` field: -```rust - /// Instrument class. Named values: low-res, high-res, TOF, QExactive. - /// Legacy numeric (Java MS-GF+ `-inst`): 0=low-res, 1=high-res, 2=TOF, 3=QExactive. - #[arg(long, default_value = "low-res", value_parser = parse_instrument)] - instrument: Instrument, -``` - -Replace `protocol` field: -```rust - /// Search protocol. Named values: auto, phospho, iTRAQ, iTRAQ-phospho, TMT, standard. - /// Legacy numeric (Java MS-GF+ `-protocol`): 0=auto, 1=phospho, 2=iTRAQ, 3=iTRAQ-phospho, 4=TMT, 5=standard. - #[arg(long, default_value = "auto", value_parser = parse_protocol)] - protocol: Protocol, -``` - -Remove the existing `ntt: u8` field entirely. - -- [ ] **Step 2.2: Update body references to renamed fields** - -Find the existing reference to `cli.mod_file` (around line 305): - -Replace: -```rust -let (aa, num_mods_from_file) = match &cli.mod_file { -``` -With: -```rust -let (aa, num_mods_from_file) = match &cli.mods { -``` - -Find the existing reference to `cli.ntt` (around line 339 or in SearchParams construction): - -Replace `cli.ntt` with `match cli.enzyme_specificity { EnzymeSpecificity::Fully => 2u8, EnzymeSpecificity::Semi => 1, EnzymeSpecificity::NonSpecific => 0 }`. Search for `cli\.ntt` to find all occurrences: - -Run: `grep -n 'cli\.ntt' crates/msgf-rust/src/bin/msgf-rust.rs` -Expected: 1-2 hits in the run() function where ntt gets passed to SearchParams. - -Replace each occurrence with the match expression above (or extract to a `let ntt: u8 = match cli.enzyme_specificity {...};` binding before the SearchParams construction). The downstream `SearchParams.num_tolerable_termini` is still `u8`, so the conversion is at the CLI/internal boundary. - -- [ ] **Step 2.3: Update `resolve_bundled_param` signature and call sites** - -Find the function (around line 652). Replace the signature: - -OLD: -```rust -fn resolve_bundled_param( - fragmentation: Option, - instrument: Option, - protocol: Option, -) -> Result { -``` - -NEW: -```rust -fn resolve_bundled_param( - fragmentation: Fragmentation, - instrument: Instrument, - protocol: Protocol, -) -> Result { -``` - -Replace the function body's input-normalization block (currently at the top of `resolve_bundled_param`, the `if fragmentation.is_none() && ... { return canonicalize_bundled("HCD_QExactive_Tryp.param"); }` short-circuit and the subsequent `match fragmentation.unwrap_or(0) { ... }` etc.) with: - -```rust - // Step 0: default-to-bundled short-circuit. When the caller passes all - // defaults (Fragmentation::Auto, Instrument::LowRes, Protocol::Auto) - // we use the historical hardcoded default. This preserves pre-iter39 - // behavior where omitting all three flags returned HCD_QExactive_Tryp.param. - if fragmentation == Fragmentation::Auto - && instrument == Instrument::LowRes - && protocol == Protocol::Auto { - return canonicalize_bundled("HCD_QExactive_Tryp.param"); - } - - // Step 1: Normalize. Java's normalization rules mirrored here: - // - Auto fragmentation → CID (Java's "null/PQD → CID") - // - HCD with low-res inst → upgrade to QExactive (Java's HCD-upgrade rule) - let frag = match fragmentation { - Fragmentation::Auto => "CID", - Fragmentation::Cid => "CID", - Fragmentation::Etd => "ETD", - Fragmentation::Hcd => "HCD", - Fragmentation::Uvpd => "UVPD", - }; -``` - -Then replace the subsequent `inst` and `protocol` string-mapping blocks with direct enum-to-string mappings: - -```rust - let mut inst = match instrument { - Instrument::LowRes => "LowRes", - Instrument::HighRes => "HighRes", - Instrument::Tof => "TOF", - Instrument::QExactive => "QExactive", - }; - // HCD-upgrade rule: HCD with low-res inst → upgrade to QExactive. - if frag == "HCD" && inst == "LowRes" { - inst = "QExactive"; - } - - let prot = match protocol { - Protocol::Auto => "", // empty: no protocol suffix - Protocol::Phospho => "_Phosphorylation", - Protocol::Itraq => "_iTRAQ", - Protocol::ItraqPhospho => "_iTRAQPhospho", - Protocol::Tmt => "_TMT", - Protocol::Standard => "", // standard = no suffix - }; -``` - -Adapt the existing file-name-construction code further down to use these new string bindings. The exact existing string assembly logic (which appends protocol suffix, enzyme suffix, falls back to `_NoCleavage`, etc.) stays unchanged — only the input normalization changed. - -Remove any remaining unreachable error branches that used to handle out-of-range numeric IDs (e.g. `99 => return Err(...)`) — clap's `value_parser` now rejects those at parse time before the resolver is called. - -- [ ] **Step 2.4: Update `resolve_bundled_param_for_activation`** - -Find the function (around line 872). It currently takes the auto-detected `(method, inst)` and a protocol `Option`. Update its body to construct the new enum variants directly: - -OLD: -```rust -fn resolve_bundled_param_for_activation( - method: ActivationMethod, - inst: Option, - protocol: Option, -) -> Result { - // ... builds (Some(frag_id), Some(inst_id), protocol) and calls - // resolve_bundled_param(Some(frag_id), Some(inst_id), protocol) -} -``` - -NEW: change `protocol: Option` to `protocol: Protocol`, and update the internal mapping that builds `Some(frag_id), Some(inst_id), protocol`. Construct `Fragmentation` and `Instrument` variants from the detected `method` and `inst`. The exact mapping (which is `Some(1) → Cid`, `Some(2) → Etd`, etc. internally) becomes: - -```rust -let frag = match method { - ActivationMethod::CID => Fragmentation::Cid, - ActivationMethod::ETD => Fragmentation::Etd, - ActivationMethod::HCD => Fragmentation::Hcd, - ActivationMethod::UVPD => Fragmentation::Uvpd, - _ => Fragmentation::Cid, // fallback for unsupported methods -}; -let inst = match inst { - Some(InstrumentType::LowRes) => Instrument::LowRes, - Some(InstrumentType::HighRes) => Instrument::HighRes, - Some(InstrumentType::TOF) => Instrument::Tof, - Some(InstrumentType::QExactive) => Instrument::QExactive, - None => Instrument::LowRes, -}; -resolve_bundled_param(frag, inst, protocol) -``` - -(The exact `InstrumentType`/`ActivationMethod` variant names come from the existing code — preserve them as-is. The point is just to swap the numeric IDs for enum variants.) - -- [ ] **Step 2.5: Update the auto-detect call site in `run()` / `main()`** - -Find the block that dispatches between the auto-detect and the no-detect paths (around lines 370-390 in `run()`). The two call sites that pass `cli.fragmentation`, `cli.instrument`, `cli.protocol` to `resolve_bundled_param` and `resolve_bundled_param_for_activation` now pass enum values directly instead of `Option`. No casts needed. - -Example existing line (and after): - -OLD: -```rust -resolve_bundled_param(cli.fragmentation, cli.instrument, cli.protocol)? -``` - -NEW: identical (the types changed but the expression is the same). If the line uses `Some(...)` wrapping anywhere, drop the wrapping. - -Same for `resolve_bundled_param_for_activation(method, inst, cli.protocol)?`. - -- [ ] **Step 2.6: Update the 15 `param_resolver_tests`** - -Find the `mod param_resolver_tests` block at the end of the file. Each test currently looks like: - -```rust -let p = resolve_bundled_param(Some(3), Some(3), Some(4)).unwrap(); -``` - -Rewrite each test call to use enum variants. The full mapping is: -- `None` → `Fragmentation::Auto`, `Instrument::LowRes`, or `Protocol::Auto` (the new defaults) -- `Some(0)` → `Auto` variant for fragmentation/protocol, `LowRes` for instrument, `NonSpecific` for enzyme specificity -- `Some(1)` → `Cid`/`HighRes`/`Phospho`/`Semi` -- `Some(2)` → `Etd`/`Tof`/`Itraq`/`Fully` -- `Some(3)` → `Hcd`/`QExactive`/`ItraqPhospho` -- `Some(4)` → `Uvpd`/`Tmt` -- `Some(5)` → `Standard` - -For example: -```rust -// OLD -let p = resolve_bundled_param(Some(3), Some(3), Some(4)).unwrap(); -// NEW -let p = resolve_bundled_param(Fragmentation::Hcd, Instrument::QExactive, Protocol::Tmt).unwrap(); -``` - -```rust -// OLD: default_resolves_to_hcd_qexactive_tryp -let p = resolve_bundled_param(None, None, None).unwrap(); -// NEW -let p = resolve_bundled_param(Fragmentation::Auto, Instrument::LowRes, Protocol::Auto).unwrap(); -``` - -For the three "rejects out-of-range" tests (`rejects_out_of_range_fragmentation`, `_instrument`, `_protocol`), these tested `resolve_bundled_param(Some(99), None, None)` returning Err. With clap parsing rejecting out-of-range values before the resolver, these tests no longer make sense in the resolver itself. Replace them with tests that exercise `parse_fragmentation`/`parse_instrument`/`parse_protocol` directly: - -```rust -#[test] -fn parse_fragmentation_rejects_out_of_range_numeric() { - let err = parse_fragmentation("99").unwrap_err(); - assert!(err.contains("0..=4"), "error message should mention range, got: {err}"); -} - -#[test] -fn parse_instrument_rejects_out_of_range_numeric() { - let err = parse_instrument("99").unwrap_err(); - assert!(err.contains("0..=3"), "got: {err}"); -} - -#[test] -fn parse_protocol_rejects_out_of_range_numeric() { - let err = parse_protocol("99").unwrap_err(); - assert!(err.contains("0..=5"), "got: {err}"); -} -``` - -These three replace the three old `rejects_out_of_range_*` tests, keeping the 15-test count. - -Run: `grep -c '#\[test\]' crates/msgf-rust/src/bin/msgf-rust.rs` -Expected: same count as before (15 in `param_resolver_tests` mod). - -- [ ] **Step 2.7: Build and run msgf-rust tests** - -Run: `cargo test --release -p msgf-rust 2>&1 | tail -15` -Expected: `test result: ok. 15 passed; 0 failed` for `param_resolver_tests` (plus 0/0 for `cli_smoke.rs` which we haven't touched yet — those run separately). - -If a test fails, the most likely cause is an off-by-one in the legacy-numeric mapping (e.g. legacy `Some(1)` → `Fragmentation::Cid` but the test expected CID_*.param and we accidentally produced ETD_*.param). Cross-check the mapping table above. - -- [ ] **Step 2.8: Run the cli_smoke integration tests** - -Run: `cargo test --release -p msgf-rust --test cli_smoke 2>&1 | tail -10` -Expected: `test result: ok. 7 passed; 0 failed`. - -These tests use legacy numeric form (`--fragmentation 3 --instrument 3 --protocol 4` and `--mod` alias) — they should keep passing because legacy values are still accepted. - -- [ ] **Step 2.9: Run the full workspace test suite** - -Run: -```bash -cargo test --release --workspace -- \ - --skip charge_missing_spectrum_uses_per_charge_scored_spec \ - --skip spectrum_without_charge_tries_charge_range \ - --skip known_peptide_appears_in_top_n \ - --skip read_bsa_canno_text_format \ - --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ - --skip tryp_pig_bov_revcat_full_set_loads \ - --skip match_spectra_output_invariant_across_thread_counts 2>&1 | grep -E '^test result' | wc -l -``` - -Expected: 37+ "test result: ok" lines (matching what CI runs). - -Run again to count failures: `cargo test --release --workspace -- [same skips] 2>&1 | grep -E '^test result.*FAILED' | wc -l` -Expected: `0`. - -**Do not commit yet** — Task 3 finishes Commit 1. - ---- - -## Task 3: Add round-trip integration test in `cli_smoke.rs` - -**Files:** -- Modify: `crates/msgf-rust/tests/cli_smoke.rs` — append new test at end. - -This task adds the regression test that guards the back-compat path: legacy numeric (`--fragmentation 3 --protocol 4`) and canonical named (`--fragmentation HCD --protocol TMT`) MUST resolve to byte-identical PIN output. - -- [ ] **Step 3.1: Write the new test** - -Append at the end of `crates/msgf-rust/tests/cli_smoke.rs`: - -```rust -/// Regression guard: legacy Java numeric flag values and the new -/// Rust-idiomatic named values must resolve to byte-identical PIN output. -/// Quantms scripts use the numeric form; new docs recommend the named form. -/// If this test breaks, the legacy compat layer is broken. -#[test] -fn cli_accepts_both_named_and_numeric_param_values() { - let bsa_fasta = fixture("test-fixtures/BSA.fasta"); - let test_mgf = fixture("test-fixtures/test.mgf"); - let mods_path = fixture("test-fixtures/Mods.txt"); - - let tmp_a = tempfile::tempdir().expect("tmpdir a"); - let pin_a = tmp_a.path().join("legacy.pin"); - - let tmp_b = tempfile::tempdir().expect("tmpdir b"); - let pin_b = tmp_b.path().join("named.pin"); - - // Run A: legacy numeric form (mirrors current quantms usage). - let status_a = base_cmd(test_mgf.to_str().unwrap(), - bsa_fasta.to_str().unwrap(), - &pin_a) - .arg("--mod").arg(&mods_path) - .arg("--fragmentation").arg("3") - .arg("--instrument").arg("3") - .arg("--protocol").arg("4") - .arg("--ntt").arg("2") - .status() - .expect("legacy form exit"); - assert!(status_a.success(), "legacy CLI form failed"); - - // Run B: canonical named form (mirrors new docs). - let status_b = base_cmd(test_mgf.to_str().unwrap(), - bsa_fasta.to_str().unwrap(), - &pin_b) - .arg("--mods").arg(&mods_path) - .arg("--fragmentation").arg("HCD") - .arg("--instrument").arg("QExactive") - .arg("--protocol").arg("TMT") - .arg("--enzyme-specificity").arg("fully") - .status() - .expect("named form exit"); - assert!(status_b.success(), "named CLI form failed"); - - let pin_a_bytes = std::fs::read(&pin_a).expect("read legacy pin"); - let pin_b_bytes = std::fs::read(&pin_b).expect("read named pin"); - assert_eq!(pin_a_bytes, pin_b_bytes, - "legacy and named CLI forms must produce byte-identical PIN output"); -} -``` - -This test uses the existing `fixture()` helper and `base_cmd()` builder defined at the top of `cli_smoke.rs`. Both run small TMT-style searches on the BSA + test.mgf fixture. - -- [ ] **Step 3.2: Run only the new test to verify it passes** - -Run: `cargo test --release -p msgf-rust --test cli_smoke cli_accepts_both_named_and_numeric_param_values 2>&1 | tail -10` -Expected: `test result: ok. 1 passed; 0 failed`. - -If it fails with byte-mismatch, inspect both PIN files manually: -```bash -diff /tmp/.tmpXXX/legacy.pin /tmp/.tmpYYY/named.pin | head -``` -Most likely cause of mismatch: a typo in the enum mapping that makes legacy "3" resolve to a different param file than named "HCD". - -- [ ] **Step 3.3: Run all cli_smoke tests one more time** - -Run: `cargo test --release -p msgf-rust --test cli_smoke 2>&1 | tail -5` -Expected: `test result: ok. 8 passed; 0 failed` (the 7 existing tests + the new round-trip). - -- [ ] **Step 3.4: Commit (Commit 1)** - -```bash -git add crates/msgf-rust/src/bin/msgf-rust.rs crates/msgf-rust/tests/cli_smoke.rs -git commit -m "$(cat <<'EOF' -feat(cli): rename param flags to named values with legacy compat - -Replace numeric Java-historical enum flags with Rust-idiomatic named -values and rename --mod → --mods, --ntt → --enzyme-specificity. All -legacy forms still accepted silently for quantms script compat. - -Canonical (shown in --help): -- --fragmentation auto|CID|ETD|HCD|UVPD (default: auto) -- --instrument low-res|high-res|TOF|QExactive (default: low-res) -- --protocol auto|phospho|iTRAQ|iTRAQ-phospho|TMT|standard (default: auto) -- --enzyme-specificity non-specific|semi|fully (default: fully) -- --mods (singular --mod kept as hidden alias) - -Legacy (silently accepted): -- --fragmentation 0..=4 -- --instrument 0..=3 -- --protocol 0..=5 -- --ntt 0..=2 (--ntt is also a clap alias of --enzyme-specificity) -- --mod - -clap parses values case-insensitively, so quantms scripts that lowercase -named values (--fragmentation hcd) keep working. - -Internal: -- Added four ValueEnum-derived enums: Fragmentation, Instrument, - Protocol, EnzymeSpecificity. -- Added four custom value parsers: parse_fragmentation, - parse_instrument, parse_protocol, parse_enzyme_specificity. Each tries - the canonical named value first, falls back to the legacy numeric ID. -- Changed resolve_bundled_param and resolve_bundled_param_for_activation - signatures from Option triples to strongly-typed enums. The - "all-defaults short-circuit" (which produced HCD_QExactive_Tryp.param - pre-iter39 when no flags were given) is preserved via the - Fragmentation::Auto + Instrument::LowRes + Protocol::Auto check. -- Updated the 15 param_resolver_tests for the new signature; replaced - the three "rejects out of range" resolver tests with equivalent tests - on the parser functions (clap rejects bad values at parse time now). - -Verified: -- cargo test --release -p msgf-rust → 18 passed (15 resolver tests - + 3 new parser-out-of-range tests). -- cargo test --release -p msgf-rust --test cli_smoke → 8 passed - (7 existing + 1 new round-trip). -- cargo test --release --workspace → no new failures vs baseline. - -New regression guard: cli_accepts_both_named_and_numeric_param_values -runs a small search twice (once with --fragmentation 3 --protocol 4, -once with --fragmentation HCD --protocol TMT) and asserts PIN outputs -are byte-identical. -EOF -)" -``` - -Run after commit: `git log -1 --format='%h %s'` -Expected: short SHA + commit subject `feat(cli): rename param flags to named values with legacy compat`. - ---- - -## Task 4: Write new `README.md` - -**Files:** -- Replace: `README.md` (currently 193 lines of Java-tool README). - -The new README is a linear top-to-bottom narrative serving both quantms operators and mass-spec researchers. Follow the section list from the spec (`docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md`, "README.md content + structure" — 12 sections, ~190 lines total). - -- [ ] **Step 4.1: Replace README.md** - -Overwrite `README.md` with the new content. The file structure (each line below is a section heading; section line-budget is the target from the spec): - -```markdown -# msgf-rust — peptide identification from MS/MS spectra - -[![CI](https://github.com/bigbio/msgf-rust/actions/workflows/ci.yml/badge.svg)](https://github.com/bigbio/msgf-rust/actions/workflows/ci.yml) -[![Release](https://img.shields.io/github/v/release/bigbio/msgf-rust)](https://github.com/bigbio/msgf-rust/releases) -[![License: UCSD-Noncommercial](https://img.shields.io/badge/license-UCSD--Noncommercial-blue)](LICENSE) - -> **A Rust port of MS-GF+** — takes mzML/MGF spectra + FASTA in, produces Percolator-ready `.pin` out. Beats Java MS-GF+ on all three benchmark datasets at 1% FDR while running 14-330% faster. - -## What is this? - -msgf-rust is a from-scratch Rust reimplementation of [MS-GF+](https://github.com/MSGFPlus/msgfplus) (Kim & Pevzner, 2014), the canonical generating-function peptide-identification engine. It reads MS/MS spectra (mzML or MGF), searches them against a FASTA protein database, and emits Percolator-ready PIN rows (or a TSV) with per-PSM features for rescoring. The original Java implementation is preserved on the `java-legacy` branch. - -## Why msgf-rust? - -Three datasets, three results (all at 1% FDR via Percolator 3.7.1): - -| Dataset | Java MS-GF+ PSMs | msgf-rust PSMs | Δ | Java wall | msgf-rust wall | Wall Δ | -|---|---:|---:|---:|---:|---:|---:| -| **Astral DDA** (LFQ_Astral_DDA_15min_50ng) | 35,818 | **36,170** | **+352 (+0.98%)** | 5:49 | 5:57 | within 2% | -| **PXD001819** (UPS1 yeast tryp) | 14,798 | 14,760 | -38 (-0.26%) | ~150s | **45.88s** | **3.3× faster** | -| **TMT** (a05058 PXD007683) | 10,166 | **11,108** | **+9.3%** | ~2:55 | **2:30** | **14% faster** | - -What that means: on Astral we find more peptide hits than Java; on PXD001819 we match Java's hit count at 3.3× the speed; on TMT we find ~9% more PSMs at 14% less wall. The remaining feature-level divergences (lnEValue, MeanRelErrorTop7 normalization) are tracked in `DOCS.md` §8d as research follow-up — they don't gate cutover. - -## Install - -**Option 1 — download a release archive** (recommended): - -Grab the archive for your platform from the [Releases page](https://github.com/bigbio/msgf-rust/releases). Five platform builds are published per release: - -``` -msgf-rust--x86_64-unknown-linux-gnu.tar.gz -msgf-rust--aarch64-unknown-linux-gnu.tar.gz -msgf-rust--x86_64-apple-darwin.tar.gz -msgf-rust--aarch64-apple-darwin.tar.gz -msgf-rust--x86_64-pc-windows-msvc.zip -``` - -Each archive contains the `msgf-rust` binary, the `resources/` tree (39 bundled `.param` files + unimod.obo), and LICENSE/NOTICE/README. - -**Option 2 — `cargo install`:** - -```bash -cargo install --git https://github.com/bigbio/msgf-rust --bin msgf-rust -``` - -**Option 3 — build from source:** - -```bash -git clone https://github.com/bigbio/msgf-rust -cd msgf-rust -cargo build --release -# Binary: target/release/msgf-rust -``` - -Requires Rust 1.85+ (see `rust-toolchain.toml`). - -## Quick Start - -```bash -msgf-rust \ - --spectrum BSA.mgf \ - --database BSA.fasta \ - --output-pin out.pin -``` - -This runs a tryptic search at 20 ppm precursor tolerance with the bundled HCD_QExactive_Tryp scoring model, writes Percolator-format PSMs to `out.pin`, and prints per-phase timings to stderr. Feed `out.pin` directly into Percolator (Docker or native) to compute q-values. - -A row in `out.pin` is one peptide–spectrum match with 28 columns: `SpecId`, `Label`, `ScanNr`, charge one-hot encoding, then features like `RawScore`, `lnSpecEValue`, `DeNovoScore`, ion-current ratios, peptide-length stats, etc. Full column reference: `DOCS.md` §3a. - -## Common workflows - -**Tryptic DDA + Percolator** (default): - -```bash -msgf-rust --spectrum spectra.mzML --database db.fasta --output-pin out.pin -docker run --rm -v $(pwd):/data biocontainers/percolator:v3.7.1_cv1 \ - percolator -X /data/weights.txt /data/out.pin -``` - -**TMT 10-plex search with mods.txt:** - -```bash -msgf-rust \ - --spectrum tmt_spectra.mzML \ - --database hsapiens.fasta \ - --output-pin out.pin \ - --mods tmt_10plex_mods.txt \ - --protocol TMT \ - --fragmentation HCD \ - --instrument QExactive -``` - -**Direct TSV output (skip Percolator):** - -```bash -msgf-rust --spectrum spectra.mzML --database db.fasta \ - --output-pin out.pin --output-tsv out.tsv -``` - -**[quantms](https://github.com/bigbio/quantms) pipeline integration:** - -Point quantms's PSM search step at `msgf-rust` and use the standard quantms post-processing. The `.pin` row format is the same; existing quantms scripts using legacy numeric flag values (`--fragmentation 3 --instrument 3 --protocol 4`) keep working without modification (see `CLI_MIGRATION.md`). - -## CLI summary - -Most-used flags (full reference in `DOCS.md` §1): - -| Flag | Purpose | Default | -|---|---|---| -| `--spectrum ` | Input mzML or MGF | (required) | -| `--database ` | Input FASTA | (required) | -| `--output-pin ` | Percolator PIN output | (required) | -| `--output-tsv ` | Optional TSV output | (off) | -| `--mods ` | mods.txt file (Cam-C + Ox-M built-in) | (off) | -| `--precursor-tol-ppm ` | Precursor mass tolerance | 20.0 | -| `--isotope-error-min/-max ` | Isotope error range | -1, 2 | -| `--charge-min/-max ` | Charge range when not in spectrum | 2, 3 | -| `--enzyme-specificity ` | NTT enforcement | fully | -| `--max-missed-cleavages ` | Missed cleavages | 1 | -| `--min/-max-length ` | Peptide length range | 6, 40 | -| `--min-peaks ` | Min peaks per spectrum to score | 10 | -| `--top-n ` | PSMs retained per spectrum | 10 | -| `--fragmentation ` | Frag method (auto-detect from mzML if `auto`) | auto | -| `--instrument ` | Instrument class | low-res | -| `--protocol ` | Search protocol | auto | -| `--param-file ` | Override bundled scoring model | (auto-pick) | -| `--threads ` | Worker threads | (logical CPUs) | - -Run `msgf-rust --help` for the auto-generated help with full descriptions. - -## Auto-detection - -For mzML inputs, msgf-rust reads the activation block of the first MS2 spectrum and selects a bundled `.param` file accordingly. The detection covers HCD/CID/ETD/UVPD activation and LowRes/HighRes/TOF/QExactive instrument classes (via mzML CV params). The bundled model is then resolved from `(fragmentation, instrument, protocol)`. MGF files have no activation metadata, so they go through the CLI defaults (which can be overridden with explicit `--fragmentation` / `--instrument` flags). Full resolution table: `DOCS.md` §4. - -## Parity vs Java MS-GF+ - -PIN output columns are bit-exact with Java MS-GF+ on the agreement bucket (same scan + same top-1 peptide) for most features. Three residual divergences exist as deferred research: `lnEValue` (num_distinct semantics), `MeanRelErrorTop7` (error-stat normalization), and the BSA charge-3 SEV gap from the deconvolution-implementation difference (`known-divergences.md` item #3, kept on the development branch). None gate cutover; aggregate 1% FDR PSM counts beat Java on all three benchmark datasets. Full detail: `DOCS.md` §8d. - -## Citation - -If you use msgf-rust in published work, please cite the original MS-GF+ paper: - -> Kim, S. and Pevzner, P.A. (2014). MS-GF+ makes progress towards a universal database search tool for proteomics. *Nature Communications*, 5:5277. - -And optionally this Rust port: - -> bigbio (2026). msgf-rust: a Rust port of MS-GF+ for the quantms pipeline. https://github.com/bigbio/msgf-rust - -## License - -msgf-rust inherits the upstream MS-GF+ UCSD-Noncommercial license. The license restricts redistribution and commercial use; see `LICENSE` for the full text and `NOTICE` for attribution. The original Java implementation is preserved on the `java-legacy` branch (frozen at the bigbio-optimized version) and `java-legacy-original` branch (synced to upstream `MSGFPlus/msgfplus/master`). - -## Acknowledgments - -- Sangtae Kim, Pavel Pevzner, and the PNNL Proteomics team at UCSD's Center for Computational Mass Spectrometry, for the original MS-GF+ engine and the bundled `.param` scoring models. -- The [bigbio](https://github.com/bigbio) maintainers and the [quantms](https://github.com/bigbio/quantms) team. -``` - -- [ ] **Step 4.2: Verify the build still passes (no source code touched, sanity only)** - -Run: `cargo build --release 2>&1 | tail -3` -Expected: `Finished` (nothing changed in Rust code, but verifies the working tree is clean). - -- [ ] **Step 4.3: Commit (Commit 2)** - -```bash -git add README.md -git commit -m "$(cat <<'EOF' -docs: rewrite README.md for post-cutover state - -Replace the legacy Java-tool README (193 lines, Java 17 + JAR + mvn) with -a linear-narrative README for the Rust port (~190 lines, dual audience). - -Sections, top to bottom: -1. Title + tagline + badges (CI, release, license) -2. What is this? — one paragraph, names UCSD original -3. Why msgf-rust? — benchmark table vs Java on Astral / PXD001819 / TMT -4. Install — release archive, cargo install, build from source -5. Quick Start — minimal command, one paragraph on .pin row shape -6. Common workflows — tryptic DDA, TMT, TSV output, quantms integration -7. CLI summary — table of ~17 most-used flags -8. Auto-detection — activation/instrument detection from mzML -9. Parity vs Java MS-GF+ — short summary; pointer to DOCS.md §8d -10. Citation -11. License — UCSD-Noncommercial; pointer to java-legacy and - java-legacy-original branches -12. Acknowledgments - -quantms operators have a labeled section in #6 + the CLI summary in #7. -Researchers see the benchmark proof up front in #3. - -The full CLI reference, mods.txt grammar, PIN/TSV column docs, training -notes, and Java→Rust migration table live in DOCS.md (separate commit). -The Java→Rust flag mapping table lives in CLI_MIGRATION.md (separate -commit). -EOF -)" -``` - -Run after: `git log -1 --format='%h %s'` -Expected: short SHA + `docs: rewrite README.md for post-cutover state`. - ---- - -## Task 5: Write new `DOCS.md` - -**Files:** -- Create: `DOCS.md` at repo root. - -The new `DOCS.md` is the single-file reference for everything not in README. Follow the section list from the spec (`docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md`, "DOCS.md content + structure" — 9 sections, ~505 lines total). - -The content is too large to embed verbatim in this plan; use the spec's section outline as the authoritative content guide and follow these per-section content requirements. - -- [ ] **Step 5.1: Create `DOCS.md` with the section skeleton** - -Create `DOCS.md` at repo root with this skeleton + section-specific content guide. Use the spec as the design reference; each section below names the *required content elements* the implementer must produce. - -```markdown -# msgf-rust documentation - -This is the full reference. For getting started, see [`README.md`](README.md). -For the Java→Rust flag mapping, see [`CLI_MIGRATION.md`](CLI_MIGRATION.md). - -## Contents - -1. [CLI reference](#1-cli-reference) -2. [Mods.txt format](#2-modstxt-format) -3. [Output formats](#3-output-formats) -4. [Auto-detection](#4-auto-detection) -5. [Building from source](#5-building-from-source) -6. [Training new `.param` files](#6-training-new-param-files) -7. [Isobaric labeling](#7-isobaric-labeling) -8. [Java MS-GF+ → msgf-rust migration](#8-java-ms-gf--msgf-rust-migration) -9. [License and citation](#9-license-and-citation) - -## 1. CLI reference - -(~130 lines) - -Tabulate every CLI flag in groups: Required (--spectrum, --database, --output-pin), Search params (--precursor-tol-ppm, --charge-min/-max, --enzyme-specificity, --max-missed-cleavages, --min-length, --max-length, --top-n, --isotope-error-min/-max, --min-peaks), Modifications (--mods), Scoring (--fragmentation, --instrument, --protocol, --param-file), Runtime (--threads, --ms-level, --max-spectra, --decoy-prefix), Output (--output-tsv). - -For each flag: name, value type, default, description, accepted legacy form (where applicable). - -## 2. Mods.txt format - -(~50 lines) - -Document the grammar: each non-comment line is `,,,,`. Field rules: -- `` — numeric Da; composition strings not supported. -- `` — uppercase letter or `*` wildcard. -- `` — `fix` or `opt`. -- `` — `any|N-term|C-term|Prot-N-term|Prot-C-term`. - -Special directive: `NumMods=N` sets max variable mods per peptide. - -Comment handling: `#`-prefix lines ignored, inline `# ...` stripped, blank lines OK. - -Three worked examples in fenced ```text blocks: (a) cam-C fixed + ox-M variable, (b) TMT 10-plex on K + N-term, (c) phospho-STY variable. - -## 3. Output formats - -(~90 lines) - -### 3a. PIN columns - -Table with one row per PIN column. Columns: `Column name`, `Type`, `Description`, `Computation`. ~28 rows (one per emitted column). Cross-reference Java MS-GF+'s DirectPinWriter for column semantics. - -### 3b. TSV columns - -Same shape as 3a but for the TSV writer's columns. - -### 3c. PIN vs TSV — which to use - -One paragraph: TSV is human-readable / Excel-friendly; PIN feeds Percolator for q-value rescoring. quantms-style pipelines use PIN. - -## 4. Auto-detection - -(~35 lines) - -Two tables: -- Activation method detection from mzML CV params (MS:1000133 → CID, MS:1000599 → ETD, MS:1000422 → HCD, MS:1002472 → UVPD). -- Param-file resolution: `(Fragmentation, Instrument, Protocol)` → bundled file name. Cover all 39 files in `resources/ionstat/`. - -Plus a "what happens when auto-detection fails" paragraph. - -## 5. Building from source - -(~30 lines) - -Requirements: Rust 1.85+. Build: `cargo build --release`. Test: `cargo test --release`. Binary location: `target/release/msgf-rust`. - -The CI suite skips 7 tests for documented reasons (3 min_peaks regressions, 3 Maven-fixture tests, 1 thread-determinism). The release binary is unaffected. Reproduce the CI test invocation: - -```bash -cargo test --release --workspace -- \ - --skip charge_missing_spectrum_uses_per_charge_scored_spec \ - --skip spectrum_without_charge_tries_charge_range \ - --skip known_peptide_appears_in_top_n \ - --skip read_bsa_canno_text_format \ - --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ - --skip tryp_pig_bov_revcat_full_set_loads \ - --skip match_spectra_output_invariant_across_thread_counts -``` - -## 6. Training new `.param` files - -(~25 lines) - -The Rust port reuses Java MS-GF+'s `.param` scoring-model files as-is — the binary format is unchanged; the 39 bundled files in `resources/ionstat/` came directly from the Java distribution. - -Training NEW `.param` files (for novel fragmentation methods or instrument classes) requires running a scoring-parameter generator. Java MS-GF+'s `ScoringParamGen` is the canonical implementation. - -**Status in v0.1.0:** the search/scoring side is fully ported and validated; the trainer is not yet ported. A Rust reimplementation is on the roadmap — see the [open issues](https://github.com/bigbio/msgf-rust/issues) for progress. - -Two paths until then: -1. Use the bundled `.param` files (covers HCD QExactive, CID LowRes, ETD HighRes, TMT/iTRAQ variants). -2. Train new models on the `java-legacy` branch (`git checkout java-legacy`), run Java MS-GF+'s `ScoringParamGen`, point the Rust binary at the output with `--param-file `. Format is identical. - -## 7. Isobaric labeling - -(~35 lines) - -Cover TMT and iTRAQ workflows: -- `--protocol TMT` or `--protocol iTRAQ` -- Required mods.txt entries (TMT 10-plex on K + N-term as 229.16293; iTRAQ 8-plex as 304.20536, etc.) -- Auto-selected param file (e.g. `HCD_QExactive_Tryp_TMT.param` when protocol=TMT, instrument=QExactive). -- Sample CLI commands for each. - -## 8. Java MS-GF+ → msgf-rust migration - -(~80 lines) - -### 8a. Flag rename table - -Table mapping Java MS-GF+ flag → msgf-rust flag. Example: - -| Java MS-GF+ | msgf-rust | -|---|---| -| `-s ` | `--spectrum ` | -| `-d ` | `--database ` | -| `-o ` | `--output-pin ` | -| `-mod ` | `--mods ` (alias: `--mod`) | -| `-t 20ppm` | `--precursor-tol-ppm 20` | -| `-ti -1,2` | `--isotope-error-min -1 --isotope-error-max 2` | -| `-inst 3` | `--instrument QExactive` (or `--instrument 3`) | -| `-m 3` | `--fragmentation HCD` (or `--fragmentation 3`) | -| `-protocol 4` | `--protocol TMT` (or `--protocol 4`) | -| `-ntt 2` | `--enzyme-specificity fully` (or `--ntt 2`) | -| `-tda 1` | (not needed — decoys are auto-generated) | -| `-e 1` | (not exposed — Trypsin is the only enzyme; for others, use `--param-file`) | -| `-outputFormat 1` | `--output-tsv ` | -| `-thread N` | `--threads N` | - -### 8b. Numeric-legacy values - -Cross-reference `CLI_MIGRATION.md` for the legacy 0..=N → named-value mapping. msgf-rust accepts both forms. - -### 8c. Behavior differences - -- mzXML, MS2, PKL, `_dta.txt` inputs are not supported (use mzML or MGF). -- mzIdentML output is not supported (use PIN + Percolator, or TSV). -- Decoys are always auto-generated by reversing target sequences (decoy prefix configurable via `--decoy-prefix`); there is no separate decoy-database flag. -- The CLI is picocli-equivalent (clap-derived) with auto-generated `--help`. - -### 8d. Known parity divergences - -Three areas where msgf-rust and Java MS-GF+ produce different PIN values on the agreement bucket (same scan + same top-1 peptide): - -| Feature | Divergence | Status | -|---|---|---| -| `lnEValue` | -4.15 OOM mean (Rust over-confident) | Deferred — known-divergences #2: num_distinct semantics | -| `MeanRelErrorTop7` / `MeanErrorTop7` / `StdevRelErrorTop7` | 99% of agreement-bucket PSMs differ >1% relative | Deferred — error-stat normalization differs | -| BSA charge-3 SEV (BSA.fasta + test.mgf fixture) | 1.03/1.20 OOM (pre-iter37) → 2.56/3.58 OOM (post-iter37) | Known — deconvolution-implementation divergence #3, kept on the dev branch parity test as a coarse smoke gate | - -Aggregate Astral 1% FDR PSM count stays +0.98% ahead of Java; Percolator's discriminative weights absorb the per-feature distribution differences. None of these block production use. - -## 9. License and citation - -(~15 lines) - -Reproduce the relevant LICENSE text (UCSD-Noncommercial). State the citation requirement (Kim & Pevzner 2014 + this port). Link to LICENSE/NOTICE. -``` - -The implementer expands each section's content guide into prose. The spec at `docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md` §"DOCS.md content + structure" is the design reference; the section list above is the authoritative skeleton. - -- [ ] **Step 5.2: Verify wc -l count is in the target range** - -Run: `wc -l DOCS.md` -Expected: 450-550 (target ~505). If the count is much higher, the implementer over-wrote — trim back to skeleton + essential content. If much lower, sections are too thin — fill out the content guides. - -- [ ] **Step 5.3: Commit (Commit 3)** - -```bash -git add DOCS.md -git commit -m "$(cat <<'EOF' -docs: add DOCS.md single-file reference - -Add DOCS.md at repo root: the full power-user reference covering all -flags, formats, build/test workflow, training notes, and Java→Rust -migration. ~505 lines, navigated via a top-of-file table of contents. - -Sections: -1. CLI reference — every flag with type/default/description and - accepted legacy form -2. Mods.txt format — grammar + 3 worked examples -3. Output formats — PIN columns, TSV columns, when to use which -4. Auto-detection — activation method detection from mzML + - param-file resolution table -5. Building from source — Rust 1.85+, cargo build/test, the 7 CI-skipped - tests and reasons -6. Training new .param files — current state (reuse Java's bundled - files), roadmap (port ScoringParamGen), interim workflow - (train on java-legacy, --param-file at the Rust binary) -7. Isobaric labeling — TMT and iTRAQ workflows, required mods entries, - auto-selected param file -8. Java MS-GF+ → msgf-rust migration — flag rename table, behavior - differences, known parity divergences -9. License and citation - -The DOCS.md design follows the linear-narrative pattern of README.md: -no nested directories, no site generator, just one Cmd-F-friendly file. -EOF -)" -``` - ---- - -## Task 6: Write new `CLI_MIGRATION.md` - -**Files:** -- Create: `CLI_MIGRATION.md` at repo root. - -The new `CLI_MIGRATION.md` is a focused one-pager for users porting Java MS-GF+ command lines or scripts to msgf-rust. ~100 lines. - -- [ ] **Step 6.1: Create CLI_MIGRATION.md** - -```markdown -# Migrating to msgf-rust from Java MS-GF+ - -msgf-rust accepts both the canonical Rust-idiomatic CLI form (named values, kebab-case) and the legacy Java MS-GF+ form (numeric IDs and short flag names) silently — running scripts written against Java MS-GF+ unchanged is supported. - -This page is a quick-reference for porting commands. For the full CLI reference, see [`DOCS.md`](DOCS.md) §1. - -## Table A — Java MS-GF+ flag → msgf-rust flag - -| Java MS-GF+ | msgf-rust canonical | msgf-rust legacy alias | -|---|---|---| -| `-s ` | `--spectrum ` | — | -| `-d ` | `--database ` | — | -| `-o ` | `--output-pin ` | — | -| `-mod ` | `--mods ` | `--mod ` | -| `-t 20ppm` | `--precursor-tol-ppm 20` | — | -| `-ti -1,2` | `--isotope-error-min -1 --isotope-error-max 2` | — | -| `-m 3` (HCD) | `--fragmentation HCD` | `--fragmentation 3` | -| `-inst 3` (QExactive) | `--instrument QExactive` | `--instrument 3` | -| `-protocol 4` (TMT) | `--protocol TMT` | `--protocol 4` | -| `-ntt 2` (fully specific) | `--enzyme-specificity fully` | `--ntt 2` | -| `-tda 1` (target+decoy) | (omit — decoys always auto-generated) | — | -| `-e 1` (Trypsin) | (omit — Trypsin is the only enzyme) | — | -| `-outputFormat 1` (TSV) | `--output-tsv ` | — | -| `-thread N` | `--threads N` | — | -| `-minLength 6` | `--min-length 6` | — | -| `-maxLength 40` | `--max-length 40` | — | -| `-maxMissedCleavages 1` | `--max-missed-cleavages 1` | — | -| `-minNumPeaks 10` | `--min-peaks 10` | — | - -## Table B — Numeric-legacy → named values - -| Flag | Legacy numeric | Canonical named | -|---|---|---| -| `--fragmentation` | `0` | `auto` | -| `--fragmentation` | `1` | `CID` | -| `--fragmentation` | `2` | `ETD` | -| `--fragmentation` | `3` | `HCD` | -| `--fragmentation` | `4` | `UVPD` | -| `--instrument` | `0` | `low-res` | -| `--instrument` | `1` | `high-res` | -| `--instrument` | `2` | `TOF` | -| `--instrument` | `3` | `QExactive` | -| `--protocol` | `0` | `auto` | -| `--protocol` | `1` | `phospho` | -| `--protocol` | `2` | `iTRAQ` | -| `--protocol` | `3` | `iTRAQ-phospho` | -| `--protocol` | `4` | `TMT` | -| `--protocol` | `5` | `standard` | -| `--enzyme-specificity` (aliases: `--ntt`) | `0` | `non-specific` | -| `--enzyme-specificity` | `1` | `semi` | -| `--enzyme-specificity` | `2` | `fully` | - -clap parses named values case-insensitively, so `--fragmentation hcd` works the same as `--fragmentation HCD`. - -## Worked examples - -### (a) Plain Trypsin DDA, 20 ppm precursor tolerance - -**Java MS-GF+:** - -```bash -java -Xmx4G -jar MSGFPlus.jar \ - -s spectra.mzML \ - -d uniprot.fasta \ - -tda 1 \ - -t 20ppm \ - -ti -1,2 \ - -o results.pin -``` - -**msgf-rust (canonical):** - -```bash -msgf-rust \ - --spectrum spectra.mzML \ - --database uniprot.fasta \ - --precursor-tol-ppm 20 \ - --isotope-error-min -1 --isotope-error-max 2 \ - --output-pin results.pin -``` - -**msgf-rust (legacy-form, drop-in for existing quantms scripts):** - -The Java-style flags above don't translate verbatim — `-s`, `-d`, `-o` are Java-only. But the search-parameter flags do; for example, an existing quantms script that calls msgf-rust with `--fragmentation 3 --instrument 3 --protocol 4` keeps working unchanged. - -### (b) TMT 10-plex search - -**Java MS-GF+:** - -```bash -java -Xmx8G -jar MSGFPlus.jar \ - -s tmt_spectra.mzML \ - -d hsapiens.fasta \ - -tda 1 \ - -t 20ppm \ - -inst 3 \ - -m 3 \ - -protocol 4 \ - -mod tmt_mods.txt \ - -o results.pin -``` - -**msgf-rust:** - -```bash -msgf-rust \ - --spectrum tmt_spectra.mzML \ - --database hsapiens.fasta \ - --precursor-tol-ppm 20 \ - --instrument QExactive \ - --fragmentation HCD \ - --protocol TMT \ - --mods tmt_mods.txt \ - --output-pin results.pin -``` - -### (c) Phospho STY search - -**Java MS-GF+:** - -```bash -java -Xmx4G -jar MSGFPlus.jar \ - -s phospho.mzML \ - -d uniprot.fasta \ - -tda 1 \ - -t 10ppm \ - -inst 1 \ - -m 3 \ - -protocol 1 \ - -mod phospho_mods.txt \ - -o results.pin -``` - -**msgf-rust:** - -```bash -msgf-rust \ - --spectrum phospho.mzML \ - --database uniprot.fasta \ - --precursor-tol-ppm 10 \ - --instrument high-res \ - --fragmentation HCD \ - --protocol phospho \ - --mods phospho_mods.txt \ - --output-pin results.pin -``` - -## Notes - -- `-tda 1` (target+decoy database analysis) is always on in msgf-rust — decoys are generated by reversing target sequences at search time. The decoy prefix is configurable via `--decoy-prefix` (default `XXX_`). -- The Java `-e` enzyme flag is not exposed; Trypsin is hardcoded. For non-tryptic searches, use a custom `.param` file via `--param-file`. -- mzXML, MS2, PKL, and `_dta.txt` inputs are not supported. Use mzML or MGF. -- mzIdentML output is not supported. Use PIN (with Percolator) or TSV. -``` - -- [ ] **Step 6.2: Commit (Commit 4)** - -```bash -git add CLI_MIGRATION.md -git commit -m "$(cat <<'EOF' -docs: add CLI_MIGRATION.md (Java + numeric legacy → new names) - -One-page reference for porting Java MS-GF+ command lines or quantms -scripts to msgf-rust. Covers: - -- Table A: Java flag → msgf-rust flag mapping (18 flags). -- Table B: numeric-legacy → canonical named value mapping (one row per - legacy ID across fragmentation, instrument, protocol, enzyme-specificity). -- Three worked examples (plain tryptic DDA; TMT 10-plex; phospho STY) - showing the Java MS-GF+ command line and the msgf-rust equivalent - side-by-side. -- Notes on behaviors that simply don't exist on the Rust side (no - -tda flag, no -e enzyme flag, no mzXML/PKL/MS2 input, no mzIdentML - output). - -msgf-rust silently accepts the legacy forms (--fragmentation 3, ---mod, --ntt) for backward compatibility with quantms scripts. New -canonical forms are documented for fresh users. -EOF -)" -``` - ---- - -## Task 7: Delete the legacy `docs/` tree - -**Files:** -- Delete: 38 tracked files under `docs/` (excluding `docs/superpowers/`). - -This removes the Java-tool documentation that has been replaced by README.md / DOCS.md / CLI_MIGRATION.md. - -- [ ] **Step 7.1: List the files to be deleted (sanity check before destruction)** - -Run: -```bash -git ls-files docs/ | grep -v 'docs/superpowers/' | sort -``` - -Expected output: 38 files including `docs/msgfplus.md`, `docs/readme.md`, `docs/benchmarks/*`, `docs/examples/*`, `docs/parameterfiles/*`, etc. Verify `docs/superpowers/specs/` and `docs/superpowers/plans/` files are NOT in this list. - -- [ ] **Step 7.2: Delete the files** - -Run: -```bash -git rm -r docs/benchmarks/ docs/examples/ docs/parameterfiles/ \ - docs/buildsa.md docs/changelog.md docs/isobariclabeling.md \ - docs/msgfdb_modfile.md docs/msgfplus.md docs/output.md docs/readme.md \ - docs/training-scoring-models.md docs/troubleshooting.md -``` - -Run: `git ls-files docs/ | grep -v 'docs/superpowers/' | wc -l` -Expected: `0` (all non-superpowers tracked files under docs/ are now gone). - -Run: `git ls-files docs/superpowers/ | wc -l` -Expected: `2` or more (the spec + this plan file are still tracked). - -- [ ] **Step 7.3: Verify Rust build is unaffected** - -Run: `cargo build --release 2>&1 | tail -3` -Expected: `Finished` (no source code references docs/, so the build is unaffected). - -- [ ] **Step 7.4: Verify the test suite runs (sanity)** - -Run: -```bash -cargo test --release --workspace -- \ - --skip charge_missing_spectrum_uses_per_charge_scored_spec \ - --skip spectrum_without_charge_tries_charge_range \ - --skip known_peptide_appears_in_top_n \ - --skip read_bsa_canno_text_format \ - --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ - --skip tryp_pig_bov_revcat_full_set_loads \ - --skip match_spectra_output_invariant_across_thread_counts 2>&1 | grep -E 'test result.*FAILED' | wc -l -``` - -Expected: `0` failed. - -- [ ] **Step 7.5: Commit (Commit 5)** - -```bash -git commit -m "$(cat <<'EOF' -docs: delete legacy docs/ tree (content migrated to DOCS.md) - -The docs/ tree predated the Rust cutover and described the Java tool -(mvn build, JAR distribution, Java CLI). Content that still applies has -been migrated to root-level README.md, DOCS.md, and CLI_MIGRATION.md. - -Deleted (38 tracked files): -- docs/msgfplus.md (full Java CLI reference — superseded by DOCS.md §1) -- docs/msgfdb_modfile.md (mods.txt grammar — superseded by DOCS.md §2) -- docs/output.md (PIN/TSV columns — superseded by DOCS.md §3) -- docs/buildsa.md (Java standalone SA builder — Java-only utility) -- docs/training-scoring-models.md (Java trainer — superseded by DOCS.md §6) -- docs/isobariclabeling.md (TMT/iTRAQ — superseded by DOCS.md §7) -- docs/troubleshooting.md (Java JVM tuning — Java-only) -- docs/changelog.md (Java release notes — GitHub Releases tracks v0.1.0+) -- docs/readme.md (Java tool overview — superseded by root README.md) -- docs/benchmarks/ (3 PNG figures from Java perf comparison — stale) -- docs/examples/ (Mods.txt + activation/enzyme/protocol samples — - inline examples in DOCS.md instead) -- docs/parameterfiles/ (15 Java -conf templates — no Rust equivalent) - -Preserved: -- docs/superpowers/specs/ — design specs (engineering planning). -- docs/superpowers/plans/ — implementation plans (engineering planning). -- docs/parity-analysis/ (already gitignored since commit 5e9b63ac; - no action needed). -EOF -)" -``` - -Run after: `git log --oneline -7` -Expected: 5 new commits on top of `eb4953cc` (the spec commit), in the order: -1. `feat(cli): rename param flags ...` -2. `docs: rewrite README.md ...` -3. `docs: add DOCS.md ...` -4. `docs: add CLI_MIGRATION.md ...` -5. `docs: delete legacy docs/ tree ...` - ---- - -## Task 8: Push branch and open PR - -- [ ] **Step 8.1: Push the branch** - -Run: `git push origin iter39-docs-rewrite` -Expected: 5 commits pushed; remote tracking is set up. - -- [ ] **Step 8.2: Open the PR** - -Run: -```bash -gh pr create --base dev --head iter39-docs-rewrite \ - --title "iter39: docs + CLI rename for the post-cutover state" \ - --body "$(cat <<'EOF' -## Summary - -- Rewrite README.md as a linear narrative serving quantms operators + mass-spec researchers (~190 lines). -- Add DOCS.md at repo root: single-file reference for CLI, formats, training, migration (~505 lines). -- Add CLI_MIGRATION.md: Java MS-GF+ → msgf-rust flag map + numeric legacy → named-value table + 3 worked examples (~100 lines). -- Rename CLI flags from Java-historical numeric IDs to Rust-idiomatic named values; legacy forms still accepted silently for quantms script compat. -- Delete the legacy docs/ tree (38 tracked files); preserve docs/ engineering-planning artifacts. - -Design spec: `docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md`. - -## CLI changes (one commit, fully backward-compatible) - -Canonical (shown in --help): -- `--fragmentation auto|CID|ETD|HCD|UVPD` (was numeric 0..=4) -- `--instrument low-res|high-res|TOF|QExactive` (was numeric 0..=3) -- `--protocol auto|phospho|iTRAQ|iTRAQ-phospho|TMT|standard` (was numeric 0..=5) -- `--enzyme-specificity non-specific|semi|fully` (was --ntt 0..=2) -- `--mods ` (was --mod, kept as hidden alias) - -Legacy (silently accepted): numeric 0..=N for the four enum flags, --ntt as a clap alias for --enzyme-specificity, --mod as a hidden alias for --mods. Quantms scripts using legacy form keep working unchanged. - -A new regression test (`cli_accepts_both_named_and_numeric_param_values`) runs a search twice — once with legacy numeric flags, once with canonical named flags — and asserts byte-identical PIN output. - -## Test plan - -- [x] cargo test --release --workspace passes (37+ test binaries, 0 new failures vs baseline) -- [x] New round-trip test guards the back-compat path -- [x] cargo build --release produces clean binary -- [x] Existing CI workflow (.github/workflows/ci.yml) needs no changes; the 7 known-skipped tests stay skipped -EOF -)" -``` - -Expected output: a PR URL like `https://github.com/bigbio/msgf-rust/pull/`. - -- [ ] **Step 8.3: Mark plan complete** - -Plan implementation finished. Wait for CI to pass on the new PR, then merge per the project's normal flow. - ---- - -## Self-review checklist - -After implementing all tasks, verify: - -- [ ] All 5 commits exist on `iter39-docs-rewrite`, in the order specified. -- [ ] No commit message contains the substring "superpowers" (commit hook blocks it). -- [ ] `cargo build --release` succeeds with zero warnings. -- [ ] `cargo test --release --workspace -- --skip [7 known]` reports 0 failed. -- [ ] `git ls-files docs/` shows ONLY `docs/superpowers/specs/...` and `docs/superpowers/plans/...`. -- [ ] Root has `README.md`, `DOCS.md`, `CLI_MIGRATION.md`, `LICENSE`, `NOTICE`, `Cargo.toml`, etc. -- [ ] `msgf-rust --help` shows the new canonical flag names; legacy numeric values still parse. -- [ ] The new test `cli_accepts_both_named_and_numeric_param_values` passes. diff --git a/docs/superpowers/plans/2026-05-26-quality-cleanup-plan.md b/docs/superpowers/plans/2026-05-26-quality-cleanup-plan.md new file mode 100644 index 00000000..ce582c30 --- /dev/null +++ b/docs/superpowers/plans/2026-05-26-quality-cleanup-plan.md @@ -0,0 +1,1149 @@ +# Quality cleanup (PR-Q1) Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Land a single low-risk cleanup PR on `feat/quality-perf-id-rate` → `dev` that removes 32 dangling `Xxx.java:LINE` references in non-test source, neutralizes stale "port of MS-GF+" framing in module headers + CLI help, renames the `MSGFRUST_RSS_PROBE` env var (legacy-compatible), fixes all 37+ stable clippy warnings, lifts CI lint from advisory to required, and deletes 2 shipped design specs. + +**Architecture:** Six in-PR commits (Groups 1-6 from the design spec) plus one out-of-repo memory update (Group 7, already completed by the controller during the brainstorm phase). Logic-preserving — `precursor_cal_bit_identical` regression gate is the safety net. Parity test files (`tests/*_java_parity.rs`, `tests/gf_bsa_parity.rs`, `tests/*_match_java.rs`) are NOT touched — their identity IS Java parity. + +**Tech Stack:** Rust 2024 edition pinned to 1.87.0 (`rust-toolchain.toml`), cargo workspace, clippy (stable), `cargo test --release --workspace`, GitHub Actions CI. + +**Spec:** `docs/superpowers/specs/2026-05-26-quality-cleanup-design.md` + +--- + +## File map + +**Group 1 — dangling Java refs (8 non-test files, 32 refs):** +- `crates/input/src/mzml.rs:63, 351` (2 refs) +- `crates/output/src/pin.rs:354, 417` (2 refs) +- `crates/search/src/mass_calibrator.rs:176` (1 ref) +- `crates/search/src/psm.rs:77, 92, 232, 247, 248, 445` (6 refs) +- `crates/search/src/match_engine.rs:346, 466, 479, 515, 691, 692, 789, 823, 825, 901, 975, 1324` (11 refs) +- `crates/scoring/src/scoring/scored_spectrum.rs:196, 223, 245, 901, 1239` (5 refs) +- `crates/scoring/src/scoring/psm_score.rs:45` (1 ref) +- `crates/msgf-rust/src/bin/msgf-rust.rs:990, 1008, 1118, 1331` (4 refs) + +**Group 2 — stale framing:** +- `crates/search/src/lib.rs`, `crates/scoring/src/lib.rs`, `crates/output/src/lib.rs`, `crates/input/src/lib.rs`, `crates/model/src/lib.rs` — top-of-file `//!` headers +- `crates/msgf-rust/src/bin/msgf-rust.rs` — CLI `--help` strings (specifically `#[command(about = ...)]` and any `#[arg(help = ...)]` that compares behavior to Java) + +**Group 3 — identifier renames:** +- `crates/msgf-rust/src/bin/msgf-rust.rs` — `MSGFRUST_RSS_PROBE` env var → support `MSGF_RSS_PROBE` AS WELL (accept both for one release) + +**Group 4 — clippy 37+ warnings (per crate):** +- `crates/model/src/aa_set.rs:269` (1 warning: manual `split_once`) +- `crates/scoring/src/param_model.rs:365` (1 `map_or`) +- `crates/scoring/src/scoring/scored_spectrum.rs` (12 warnings: 6 complex types, 4 `map_or`, 1 too-many-args, 1 loop index) +- `crates/scoring/src/scoring/scored_spectrum.rs:133-134` (doc list items) +- `crates/search/src/precursor_cal.rs:95` (1 dead `mut`) +- `crates/search/src/match_engine.rs:297, 415` (1 too-many-args, 1 `map_or`) +- `crates/search/src/sa_walk.rs:165` (1 `?` rewrite) +- `crates/output/src/tsv.rs:45, 64, 125` (3 too-many-args) +- `crates/msgf-rust/src/bin/msgf-rust.rs` (13 warnings: 11 doc-indentation, 1 loop counter, 1 misc) + +**Group 5 — CI lint required:** +- `.github/workflows/ci.yml` — drop `continue-on-error: true` from the `lint` job + +**Group 6 — delete shipped specs:** +- `docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md` — DELETE +- `docs/superpowers/plans/2026-05-23-iter39-docs-rewrite.md` — DELETE + +**Group 7 — out-of-repo (DONE during brainstorm):** +- `~/.claude/projects/-Users-yperez-work-msgfplus-workspace/memory/MEMORY.md` — already updated +- `~/.claude/projects/-Users-yperez-work-msgfplus-workspace/memory/project_pr_a_precursor_cal_shipped.md` — created +- `~/.claude/projects/-Users-yperez-work-msgfplus-workspace/memory/project_quality_cleanup_pr_q1_active.md` — created +- `~/.claude/projects/-Users-yperez-work-msgfplus-workspace/memory/project_next_sub_projects_sequencing.md` — created + +Verification only at task 7. + +--- + +## Pre-flight (verify before Task 1) + +```bash +cd /Users/yperez/work/msgfplus-workspace/astral-speed +git branch --show-current # must be feat/quality-perf-id-rate +git log origin/dev..HEAD --oneline | wc -l # expect 2 (a8ad6ddd + 55cff3fa) +git status --short # expect clean tree +cargo test --release --workspace -- \ + --skip charge_missing_spectrum_uses_per_charge_scored_spec \ + --skip spectrum_without_charge_tries_charge_range \ + --skip known_peptide_appears_in_top_n \ + --skip read_bsa_canno_text_format \ + --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ + --skip tryp_pig_bov_revcat_full_set_loads \ + --skip match_spectra_output_invariant_across_thread_counts 2>&1 | grep -E "^test result|error" | grep -vE "0 passed.*0 failed.*0 ignored" | tail -10 +# Expect: all `test result:` lines show `0 failed`. +``` + +If any non-skipped test fails, STOP — pre-flight failed. + +--- + +## Task 1: Group 1 — Scrub dangling `.java:LINE` references + +**Files:** +- Modify: `crates/input/src/mzml.rs` +- Modify: `crates/output/src/pin.rs` +- Modify: `crates/search/src/mass_calibrator.rs` +- Modify: `crates/search/src/psm.rs` +- Modify: `crates/search/src/match_engine.rs` +- Modify: `crates/scoring/src/scoring/scored_spectrum.rs` +- Modify: `crates/scoring/src/scoring/psm_score.rs` +- Modify: `crates/msgf-rust/src/bin/msgf-rust.rs` + +**Rule:** Replace each `Xxx.java:LINE` or `Xxx.java` citation with intent-only text. Preserve the surrounding sentence's semantic meaning. Pattern: +- `// foo (DBScanner.java:534)` → `// foo (Java parity)` +- `// Java's NewScoredSpectrum.java:253 …` → `// Java parity: …` +- `/// MSGFPlus.java post-cal block` → `/// matching Java's post-cal block` + +DO NOT touch: +- `crates/search/tests/gf_java_parity.rs` +- `crates/search/tests/match_engine_java_parity.rs` +- `crates/search/tests/gf_bsa_parity.rs` +- `crates/model/tests/*_match_java.rs` +- `docs/parity-analysis/**` + +- [ ] **Step 1: Inventory and confirm exact ref count** + +```bash +cd /Users/yperez/work/msgfplus-workspace/astral-speed +grep -rEn "\.java:[0-9]+|\.java\b" crates/ --include='*.rs' 2>/dev/null \ + | grep -v "tests/.*java_parity\|tests/gf_bsa_parity\|tests/.*_match_java" \ + | tee /tmp/q1-task1-refs.txt | wc -l +``` + +Expected: 32 lines (matches the design spec). + +- [ ] **Step 2: Scrub `crates/input/src/mzml.rs`** + +Open the file. Find line 63: +```rust +// `msutil/ActivationMethod.java` — we map each to one of our five +``` +Replace with: +```rust +// Java parity for activation method names — we map each to one of our five +``` + +Find line 351: +```rust + // Selection rule (mirrors `StaxMzMLParser.java:595-605`): +``` +Replace with: +```rust + // Selection rule (Java parity): +``` + +- [ ] **Step 3: Scrub `crates/output/src/pin.rs`** + +Find line 354: +```rust + // enzN, enzC, enzInt — C-4 (2026-05-19): Java DirectPinWriter.java:199-203 +``` +Replace with: +```rust + // enzN, enzC, enzInt — C-4 (2026-05-19): Java parity +``` + +Find line 417: +```rust + // emits one accession per index — matching Java DirectPinWriter.java:237. +``` +Replace with: +```rust + // emits one accession per index — Java parity. +``` + +- [ ] **Step 4: Scrub `crates/search/src/mass_calibrator.rs`** + +Find line 176: +```rust +/// `MSGFPlus.java` post-cal block). No-op when stats are unreliable or +``` +Replace with: +```rust +/// matching Java's post-cal block). No-op when stats are unreliable or +``` + +- [ ] **Step 5: Scrub `crates/search/src/psm.rs`** + +Find line 77: +```rust + /// `DirectPinWriter.java:237`. +``` +Replace with: +```rust + /// (Java parity for PIN protein-list emission.) +``` + +Find line 92: +```rust + /// `DBScanScorer.getScore` returns `node + edge` and `DBScanner.java:533` +``` +Replace with: +```rust + /// Java's score returns `node + edge` (Java parity) +``` + +Find line 232: +```rust + /// Java's `DBScanner.java:540` (`size < n OR score == worst → add`). +``` +Replace with: +```rust + /// Java parity (`size < n OR score == worst → add`). +``` + +Find lines 247-248: +```rust + // R-1 (2026-05-18): Java's DBScanner.java:540 keeps tied + // PSMs at capacity (and DBScanner.java:745 keeps SpecE +``` +Replace with: +```rust + // R-1 (2026-05-18): Java parity — keeps tied + // PSMs at capacity (and keeps SpecE +``` + +Find line 445: +```rust + // (DBScanner.java:540 raw-score retention; DBScanner.java:745 SpecE +``` +Replace with: +```rust + // (Java parity — raw-score retention; SpecE +``` + +- [ ] **Step 6: Scrub `crates/search/src/match_engine.rs`** + +This file has 11 refs. Use the inventory from Step 1 to locate each line. For each: +1. Use `grep -n "\.java:" crates/search/src/match_engine.rs` to confirm current text. +2. Replace `Xxx.java:LINE` patterns with `Java parity` or `Java's behavior` depending on grammar fit. +3. Preserve surrounding comment context — only the citation itself goes. + +Example transformations (apply to each of the 11 refs): + +```rust +// per-SpecKey raw-score retention (DBScanner.java:534). +``` +→ +```rust +// per-SpecKey raw-score retention (Java parity). +``` + +```rust +// Java's `DBScanner.java:619-621` reads +``` +→ +```rust +// Java parity reads +``` + +```rust +// `DirectPinWriter.java:165` does +``` +→ +```rust +// Java parity does +``` + +```rust +// Java parity (PSMFeatureFinder.java:51-54): feature-counting uses a +``` +→ +```rust +// Java parity: feature-counting uses a +``` + +After all 11 replacements, verify: +```bash +grep -c "\.java:" crates/search/src/match_engine.rs +# Expect: 0 +``` + +- [ ] **Step 7: Scrub `crates/scoring/src/scoring/scored_spectrum.rs`** + +5 refs at lines 196, 223, 245, 901, 1239. Apply same replacement pattern. Special case for line 901: +```rust +/// `astral-speed/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java`. +``` +→ +```rust +/// (Java parity for spectrum filtering semantics.) +``` + +After: +```bash +grep -c "\.java" crates/scoring/src/scoring/scored_spectrum.rs +# Expect: 0 +``` + +- [ ] **Step 8: Scrub `crates/scoring/src/scoring/psm_score.rs`** + +Find line 45: +```rust +/// Mirrors Java's `DBScanner.java:513` call: fromIndex=1, toIndex=n+1 → +``` +Replace with: +```rust +/// Java parity call: fromIndex=1, toIndex=n+1 → +``` + +- [ ] **Step 9: Scrub `crates/msgf-rust/src/bin/msgf-rust.rs`** + +4 refs at lines 990, 1008, 1118, 1331. Same pattern. For line 990: +```rust + // (NewScorerFactory.java line ~120). For (CID, HighRes, Tryp, TMT) this +``` +→ +```rust + // (Java parity for scorer factory routing). For (CID, HighRes, Tryp, TMT) this +``` + +After all 4, verify: +```bash +grep -c "\.java" crates/msgf-rust/src/bin/msgf-rust.rs +# Expect: 0 +``` + +- [ ] **Step 10: Final verification — zero dangling java refs in non-test code** + +```bash +grep -rEn "\.java:[0-9]+|\.java\b" crates/ --include='*.rs' 2>/dev/null \ + | grep -v "tests/.*java_parity\|tests/gf_bsa_parity\|tests/.*_match_java" +``` + +Expected output: empty. If anything appears, fix it before committing. + +Also verify parity tests untouched: +```bash +git diff -- crates/search/tests/gf_java_parity.rs crates/search/tests/match_engine_java_parity.rs crates/search/tests/gf_bsa_parity.rs crates/model/tests/chemistry_constants_match_java.rs crates/model/tests/standard_aa_masses_match_java.rs crates/model/tests/common_mod_masses_match_java.rs +# Expect: empty (no diffs) +``` + +- [ ] **Step 11: Run workspace tests** + +```bash +cargo test --release --workspace -- \ + --skip charge_missing_spectrum_uses_per_charge_scored_spec \ + --skip spectrum_without_charge_tries_charge_range \ + --skip known_peptide_appears_in_top_n \ + --skip read_bsa_canno_text_format \ + --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ + --skip tryp_pig_bov_revcat_full_set_loads \ + --skip match_spectra_output_invariant_across_thread_counts 2>&1 | grep -E "^test result|error" | grep -vE "0 passed.*0 failed.*0 ignored" | tail -10 +``` + +Expected: every `test result:` shows `0 failed`. Comment-only changes do not affect test outcomes. + +- [ ] **Step 12: Commit** + +```bash +git add crates/ +git commit -m "$(cat <<'COMMIT_EOF' +chore: scrub 32 dangling .java:LINE references in non-test source + +The Java source tree was removed in commit b4565b8e during the +Rust-cutover; the inline citations to specific Java line numbers now +point at code that does not exist in this repo. Replace each citation +with intent-only "Java parity" comments. Preserves semantic meaning; +removes the broken hyperlinks. + +Parity-test files (tests/*_java_parity.rs, tests/gf_bsa_parity.rs, +tests/*_match_java.rs) untouched — their identity is Java parity and +the citations are load-bearing documentation. + +8 non-test files touched, 32 refs replaced, 0 functional changes. +COMMIT_EOF +)" +``` + +Expected: commit created. + +--- + +## Task 2: Group 2 — Neutralize "port of MS-GF+" framing + +**Files:** +- Modify: `crates/search/src/lib.rs` +- Modify: `crates/scoring/src/lib.rs` +- Modify: `crates/output/src/lib.rs` +- Modify: `crates/input/src/lib.rs` +- Modify: `crates/model/src/lib.rs` +- Modify: `crates/msgf-rust/src/bin/msgf-rust.rs` (CLI help strings only) + +**Rule:** Module headers (`//!`) and CLI `--help` strings that introduce a module/flag by reference to Java code should switch to neutral framing. The codebase is post-cutover; we ship `msgf-rust`, not a "port". + +**Keep:** `README.md` and `DOCS.md` provenance sections that explain the project's lineage in user-facing context. Those stay. + +- [ ] **Step 1: Inventory headers + help strings with stale framing** + +```bash +cd /Users/yperez/work/msgfplus-workspace/astral-speed +# crate-lib headers +head -10 crates/search/src/lib.rs crates/scoring/src/lib.rs crates/output/src/lib.rs crates/input/src/lib.rs crates/model/src/lib.rs + +# CLI help strings +grep -nE "(MS-GF\+|MSGFPlus|port of.*MS-GF|Java MS-GF|mirrors? Java)" crates/msgf-rust/src/bin/msgf-rust.rs +``` + +Capture the output for Step 2. + +- [ ] **Step 2: Edit each module header** + +For each of the five `crates/*/src/lib.rs` files, if the top `//!` doc block opens with phrases like "Port of Java MS-GF+ X" or "Rust reimplementation of MSGFPlus", replace the opening sentence with a neutral description of what the crate does. The rest of the doc block stays. + +Example (`crates/search/src/lib.rs`): + +Current style (if present): +```rust +//! Port of Java MS-GF+ database search engine. +//! +//! Re-exports the public search surface. +``` + +Neutral: +```rust +//! Peptide database search engine: candidate enumeration, +//! precursor matching, scoring, and PSM aggregation. +//! +//! Re-exports the public search surface. +``` + +Apply analogous neutral framing to: +- `crates/scoring/src/lib.rs` ("Scoring model, ion prediction, and generating-function DP") +- `crates/output/src/lib.rs` ("Output writers: Percolator PIN, TSV") +- `crates/input/src/lib.rs` ("Input readers: MGF, mzML, FASTA") +- `crates/model/src/lib.rs` ("Core domain types: spectra, peptides, modifications, amino-acid sets, masses") + +If a file does NOT have a stale "port of" opener, leave it alone. + +- [ ] **Step 3: Edit CLI `--help` strings** + +In `crates/msgf-rust/src/bin/msgf-rust.rs`, find `#[command(about = ...)]` near the `Cli` struct. If it mentions Java behavior comparison, replace with a behavior-only description. + +Example: +```rust +about = "Rust port of MS-GF+: database search of MGF/mzML spectra against FASTA", +``` +→ +```rust +about = "msgf-rust: database search of MGF/mzML spectra against FASTA", +``` + +Then walk through the `#[arg(...)]` attributes. Any `help = "..."` string that explicitly says "matches Java -X behavior" or "Java MS-GF+ default" gets reworded to describe what the flag does without the comparison. Mention of Java numeric legacy values (`-protocol 0`, etc.) **stays** because that's user-facing migration info. + +- [ ] **Step 4: Verify CLI still parses + tests pass** + +```bash +cargo build --release -p msgf-rust 2>&1 | tail -3 +./target/release/msgf-rust --help 2>&1 | head -5 +# Expect: builds clean; --help opens with neutral about line. + +cargo test --release -p msgf-rust 2>&1 | grep -E "^test result" | tail -5 +# Expect: all PASS. +``` + +- [ ] **Step 5: Commit** + +```bash +git add crates/ +git commit -m "$(cat <<'COMMIT_EOF' +chore: neutralize "port of MS-GF+" framing in headers and CLI help + +The codebase is post-cutover; new contributors should read crate-lib +top-of-file doc comments as descriptions of what each crate does, not +as port-bookkeeping. CLI --help strings that compared behavior to +Java's command-line options now describe behavior directly. + +README.md and DOCS.md provenance sections kept (those are intentional +user-facing project lineage). docs/parity-analysis/** kept. + +5 crate-lib headers + msgf-rust CLI help touched. +COMMIT_EOF +)" +``` + +--- + +## Task 3: Group 3 — Identifier renames + legacy compat + +**Files:** +- Modify: `crates/msgf-rust/src/bin/msgf-rust.rs` + +- [ ] **Step 1: Locate the `MSGFRUST_RSS_PROBE` env var** + +```bash +grep -n "MSGFRUST_RSS_PROBE" crates/msgf-rust/src/bin/msgf-rust.rs +``` + +Expected: 1-3 sites (var read + maybe doc). + +- [ ] **Step 2: Add legacy compat support** + +Find the `log_rss` function (or equivalent that reads the env var). Replace the env-var read with both names: + +```rust +fn log_rss(label: &str) { + let new_name = std::env::var_os("MSGF_RSS_PROBE"); + let legacy = std::env::var_os("MSGFRUST_RSS_PROBE"); + if legacy.is_some() && new_name.is_none() { + eprintln!( + "WARN: MSGFRUST_RSS_PROBE is deprecated; use MSGF_RSS_PROBE \ + (legacy name accepted in this release, will be removed next)" + ); + } + if new_name.is_none() && legacy.is_none() { + return; + } + // ... existing RSS-reading logic unchanged ... +} +``` + +If the original function used a different control-flow (e.g., early return when the var is unset), preserve that flow — only the env-var name reading changes. + +- [ ] **Step 3: Update any in-source doc references to use the new name** + +```bash +grep -n "MSGFRUST_RSS_PROBE" crates/msgf-rust/src/bin/msgf-rust.rs +``` + +For each remaining reference, if it's a doc comment, update to mention the new name with the legacy note. Example: +```rust +/// Memory probe (set MSGF_RSS_PROBE=1; legacy MSGFRUST_RSS_PROBE accepted). +``` + +- [ ] **Step 4: Verify** + +```bash +cargo build --release -p msgf-rust 2>&1 | tail -3 +# Sanity check both env-var names: +MSGF_RSS_PROBE=1 ./target/release/msgf-rust --help 2>&1 | grep -E "^startup\s|RSS" | head -3 +# (header should print) +MSGFRUST_RSS_PROBE=1 ./target/release/msgf-rust --help 2>&1 | grep -E "WARN.*deprecated|^startup" | head -3 +# (should print deprecation warning AND the rss-probe header) +``` + +- [ ] **Step 5: Commit** + +```bash +git add crates/msgf-rust/src/bin/msgf-rust.rs +git commit -m "$(cat <<'COMMIT_EOF' +chore: rename MSGFRUST_RSS_PROBE -> MSGF_RSS_PROBE (legacy accepted) + +The "MSGFRUST_" prefix dates from an early iter-era naming and doesn't +match the binary's identity (msgf-rust). Switch to MSGF_RSS_PROBE and +keep the legacy name accepted for this release with a deprecation +warning on stderr. The legacy name will be removed in the next quality +cleanup. + +Side-effect-only env var; no functional change. +COMMIT_EOF +)" +``` + +--- + +## Task 4: Group 4 — Clippy + unused-lints sweep + +This task is the largest. Sub-divided into Tasks 4a-4d by warning class. After each sub-task, run the relevant `cargo clippy` and verify counts drop. + +### Task 4a: Auto-fixable simplifications (`map_or`, `?`, `split_once`, indentation) + +**Files (per the clippy inventory):** +- `crates/model/src/aa_set.rs` (1 split_once) +- `crates/scoring/src/param_model.rs` (1 map_or) +- `crates/scoring/src/scoring/scored_spectrum.rs` (4 map_or, 2 doc indentation) +- `crates/search/src/match_engine.rs` (1 map_or) +- `crates/search/src/sa_walk.rs` (1 ? rewrite) +- `crates/msgf-rust/src/bin/msgf-rust.rs` (11 doc indentation) + +- [ ] **Step 1: Apply per-crate `clippy --fix`** + +```bash +cd /Users/yperez/work/msgfplus-workspace/astral-speed +for c in model scoring search output msgf-rust; do + cargo clippy --fix --lib -p "$c" --allow-dirty --allow-staged 2>&1 | tail -3 +done +``` + +cargo-clippy will auto-apply the fixable lints (`map_or`, manual `split_once`, `?` rewrite, some doc-indent cases). Manual lints that don't have a machine-applicable fix remain. + +- [ ] **Step 2: Verify fixes look correct** + +```bash +git diff --stat | head -10 +# Expect: ~5-10 files changed with small line counts. + +# Sanity-check one of the rewrites: +grep -nE "manual.*split_once|map_or" crates/model/src/aa_set.rs crates/scoring/src/param_model.rs +``` + +If any `clippy --fix` result looks semantically wrong, revert that hunk with `git checkout ` and apply the fix manually instead. + +- [ ] **Step 3: Workspace tests** + +```bash +cargo test --release --workspace -- \ + --skip charge_missing_spectrum_uses_per_charge_scored_spec \ + --skip spectrum_without_charge_tries_charge_range \ + --skip known_peptide_appears_in_top_n \ + --skip read_bsa_canno_text_format \ + --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ + --skip tryp_pig_bov_revcat_full_set_loads \ + --skip match_spectra_output_invariant_across_thread_counts 2>&1 | grep -E "^test result|error" | grep -vE "0 passed.*0 failed.*0 ignored" | tail -10 +``` + +Expected: 0 failed. + +- [ ] **Step 4: Stage but don't commit yet** (commit at end of Task 4) + +```bash +git add crates/ +``` + +### Task 4b: Complex-type aliases in scored_spectrum.rs + +**Files:** +- Modify: `crates/scoring/src/scoring/scored_spectrum.rs` + +Six warnings at lines 108, 233, 272, 367, 390, 672 about "very complex type used". Introduce 1-2 type aliases near the top of the file that name the recurring complex type. + +- [ ] **Step 1: Identify the recurring shape** + +```bash +grep -B 1 "very complex type" /tmp/clippy-output.log 2>/dev/null \ + || cargo clippy --lib -p scoring 2>&1 | grep -A 8 "complex type" | head -40 +``` + +Pattern (typical): `Vec<(Partition, Vec<(IonType, Vec)>)>` — the segment-partition cache. May also be a `&[(K, V)]` slice variant. + +- [ ] **Step 2: Add a `type SegmentPartitionCache = ...;` near the top** + +Open `crates/scoring/src/scoring/scored_spectrum.rs`. Find the existing `use ...;` block (lines 1-50 area). After the imports, before the first item, add: + +```rust +/// Per-segment partition entries: `(Partition, Vec<(IonType, log-probs)>)`. +pub(crate) type SegmentPartitionCache = Vec<(Partition, Vec<(IonType, Vec)>)>; +``` + +If a slice-borrow shape is also complained-about, also add: +```rust +pub(crate) type SegmentPartitionSlice<'a> = &'a [(Partition, Vec<(IonType, Vec)>)]; +``` + +- [ ] **Step 3: Substitute the alias at each warning site** + +For each of the 6 lines flagged by clippy, replace the inline complex type with the alias. Example: + +Before: +```rust +fn compute(... + segment_partition_cache: &Vec<(Partition, Vec<(IonType, Vec)>)>, +) -> ... { +``` + +After: +```rust +fn compute(... + segment_partition_cache: SegmentPartitionSlice<'_>, +) -> ... { +``` + +(Or `&SegmentPartitionCache` if the lifetime form doesn't fit.) + +- [ ] **Step 4: Verify clippy is happy** + +```bash +cargo clippy --lib -p scoring 2>&1 | grep "complex type" | wc -l +# Expect: 0 +``` + +- [ ] **Step 5: Tests** + +```bash +cargo test --release -p scoring 2>&1 | grep -E "^test result" | tail -3 +# Expect: 0 failed. +``` + +- [ ] **Step 6: Stage** + +```bash +git add crates/scoring/src/scoring/scored_spectrum.rs +``` + +### Task 4c: `too_many_arguments` refactors (5 sites) + +**Files:** +- Modify: `crates/scoring/src/scoring/scored_spectrum.rs` (2 sites: line 381 has 11/7, line 669 has 8/7) +- Modify: `crates/search/src/match_engine.rs` (1 site: line 297 has 8/7) +- Modify: `crates/output/src/tsv.rs` (3 sites: lines 45, 64, 125) + +**Pattern:** Group the shared args into a small struct passed by `&` reference; keep the caller side ergonomic. + +- [ ] **Step 1: Refactor `scored_spectrum.rs:381` (11-arg fn)** + +Locate the function (likely `Self::new` or `Self::compute_caches`). Identify which 3-5 args are passed together everywhere it's called. Common groupings: + +```rust +struct ScoredSpectrumBuildContext<'a> { + spec: &'a Spectrum, + scorer: &'a RankScorer, + charge: u8, + fragment_tolerance_da: f64, + deconv_peaks: Option<&'a [(f64, f32)]>, +} +``` + +Then change the function signature from 11 args to ~6 (the new ctx struct + the remaining standalone args). + +Update all callers (use `cargo build` errors to find them): +```bash +cargo build -p scoring 2>&1 | grep -E "error\[E" | head +``` + +- [ ] **Step 2: Refactor `scored_spectrum.rs:669` (8-arg fn)** + +Similar approach. If the function is `directional_node_score_inner`, the args fall into: +- Spectrum data: `peaks`, `ranks`, `precursor_filtered` +- Scoring context: `segment_partition_cache`, `scorer`, `nominal_mass`, `parent_mass`, etc. + +Group whichever feels cohesive. Don't force one cohesive grouping if the args are genuinely independent — `#[allow(clippy::too_many_arguments)]` with a one-line justification is acceptable for hot-path functions where wrapping in a struct hurts readability. + +- [ ] **Step 3: Refactor `match_engine.rs:297` (8-arg fn)** + +This is in `PreparedSearch::run_chunk_inner`. The args are inherent to the search loop; `#[allow(clippy::too_many_arguments)]` with a comment is probably the right call here since the function is private and not called from many places. + +```rust +#[allow( + clippy::too_many_arguments, + reason = "private inner driver; args reflect the search-loop state" +)] +fn run_chunk_inner( + ... +) -> Vec { ... } +``` + +- [ ] **Step 4: Refactor `tsv.rs:45, 64, 125` (3 writer fns)** + +Likely `write_tsv`, `write_psm_row`, etc. Args fall into: +- Output target: `writer` +- Data: `spectra`, `queues`, `candidates`, `params`, `idx` +- Format: `spec_file_name`, `use_mgf_specid` + +Group into: +```rust +struct TsvWriteContext<'a> { + spectra: &'a [Spectrum], + queues: &'a [TopNQueue], + candidates: &'a [Candidate], + params: &'a SearchParams, + idx: &'a SearchIndex, +} +``` + +Or alternatively, since this is public API across crate boundaries, use `#[allow(clippy::too_many_arguments)]` with a justification: "Writer API mirrors PIN writer; grouping into a context struct would diverge." + +Pick whichever produces fewer touched call sites. + +- [ ] **Step 5: Workspace tests** + +```bash +cargo test --release --workspace -- \ + --skip charge_missing_spectrum_uses_per_charge_scored_spec \ + --skip spectrum_without_charge_tries_charge_range \ + --skip known_peptide_appears_in_top_n \ + --skip read_bsa_canno_text_format \ + --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ + --skip tryp_pig_bov_revcat_full_set_loads \ + --skip match_spectra_output_invariant_across_thread_counts 2>&1 | grep -E "^test result|error" | grep -vE "0 passed.*0 failed.*0 ignored" | tail -10 +``` + +Expected: 0 failed. + +- [ ] **Step 6: Stage** + +```bash +git add crates/ +``` + +### Task 4d: Dead `mut`, loop counter, doc indentation, remaining warnings + +**Files:** +- Modify: `crates/search/src/precursor_cal.rs` (line 95: dead `mut`) +- Modify: `crates/scoring/src/scoring/scored_spectrum.rs` (line 693: loop index) +- Modify: `crates/msgf-rust/src/bin/msgf-rust.rs` (lines 179-183, 923, 1059, 1129-1135: doc indentation + loop counter) + +- [ ] **Step 1: Fix dead `mut` in `precursor_cal.rs`** + +Open `crates/search/src/precursor_cal.rs` at line 95. Find the `let mut ...` that isn't actually mutated. Remove the `mut`: + +```rust +// before +let mut deviations: Vec = values.iter().map(|v| (v - center).abs()).collect(); +// after +let deviations: Vec = values.iter().map(|v| (v - center).abs()).collect(); +``` + +- [ ] **Step 2: Fix loop-index warning in `scored_spectrum.rs:693`** + +This says "the loop variable `seg` is used to index `segment_partition_cache`". Replace `for seg in 0..cache.len() { let entry = &cache[seg]; ... }` with `for entry in &cache { ... }` (using `iter().enumerate()` if the index is also needed). + +- [ ] **Step 3: Fix the 11 doc-indentation warnings in `msgf-rust.rs`** + +Lines 179-183 and 1129-1135 are in doc-comment blocks (probably bullet lists). Reformat the bullets so the second line aligns with the first character after `* ` or `- `: + +Before: +```rust + /// * **First item:** description + /// description continues +``` +After: +```rust + /// * **First item:** description + /// description continues +``` + +(Note: 3 spaces after `///` for second line to align with the text after `* `.) + +Apply to all flagged lines. + +- [ ] **Step 4: Fix loop-counter warning at `msgf-rust.rs:1059`** + +The warning says "the variable `seen` is used as a loop counter". Replace with the recommended pattern (e.g., `.enumerate()` or a separate counter outside the loop). + +- [ ] **Step 5: Confirm clippy is clean** + +```bash +cargo clippy --workspace --release 2>&1 | grep -cE "^warning:" +# Expect: 0 (or VERY close to 0 — any residual would be in transitive dep build script noise, which we can't fix) +``` + +- [ ] **Step 6: Workspace tests** + +```bash +cargo test --release --workspace -- \ + --skip charge_missing_spectrum_uses_per_charge_scored_spec \ + --skip spectrum_without_charge_tries_charge_range \ + --skip known_peptide_appears_in_top_n \ + --skip read_bsa_canno_text_format \ + --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ + --skip tryp_pig_bov_revcat_full_set_loads \ + --skip match_spectra_output_invariant_across_thread_counts 2>&1 | grep -E "^test result|error" | grep -vE "0 passed.*0 failed.*0 ignored" | tail -10 +``` + +Expected: 0 failed. + +- [ ] **Step 7: Commit Task 4 (all sub-tasks)** + +```bash +git add crates/ +git commit -m "$(cat <<'COMMIT_EOF' +chore: fix all clippy warnings (workspace) + +Brings the workspace to clippy-clean on stable 1.87.0 so the CI lint +job can be lifted from advisory to required. + +Changes by class: +- map_or simplifications (6 sites): mechanical rewrite +- complex-type aliases (6 sites): SegmentPartitionCache/Slice +- too_many_arguments (5 sites): context structs OR justified allow +- doc-list indentation (15 sites): align bullet continuations +- unused_mut (1 site): drop unused mut +- ? rewrite, manual split_once, loop-counter, loop-index: per clippy hint + +No functional behavior change; PIN/TSV bit-identical regression gate +in tree (precursor_cal_bit_identical) is the verification. +COMMIT_EOF +)" +``` + +--- + +## Task 5: Group 5 — Lift CI lint to required + +**Files:** +- Modify: `.github/workflows/ci.yml` + +- [ ] **Step 1: Locate the lint job's `continue-on-error`** + +```bash +grep -n "continue-on-error\|lint:" .github/workflows/ci.yml | head -10 +``` + +Should show the `lint:` job near line 75-80 with a `continue-on-error: true` immediately under it. + +- [ ] **Step 2: Remove the line** + +Open `.github/workflows/ci.yml`. Find: + +```yaml + lint: + name: Lint (clippy + rustfmt) + runs-on: ubuntu-latest + # Advisory only — the iter1-38 codebase isn't fmt-clean / clippy-clean + # yet (~11k lines of fmt churn pending). Surfaces the warnings without + # blocking PRs while that cleanup is sequenced separately. + continue-on-error: true +``` + +Replace with: + +```yaml + lint: + name: Lint (clippy + rustfmt) + runs-on: ubuntu-latest +``` + +(Both the `continue-on-error` line and the trailing comment block become obsolete.) + +- [ ] **Step 3: Confirm the lint job still passes the test locally** + +The CI lint job typically runs `cargo clippy --workspace --release -- -D warnings`. Simulate: + +```bash +cargo clippy --workspace --release -- -D warnings 2>&1 | tail -10 +``` + +Expected: `Finished` with no errors. If clippy fails, return to Task 4 — something was missed. + +- [ ] **Step 4: Also verify rustfmt is clean (if the job runs it)** + +```bash +grep "rustfmt\|cargo fmt" .github/workflows/ci.yml +``` + +If `cargo fmt --check` is part of the job, run it locally: + +```bash +cargo fmt --check 2>&1 | head -20 +``` + +If it fails, run `cargo fmt --all` and stage the formatting changes. Fmt changes can be folded into THIS commit since they're part of "make lint required". + +- [ ] **Step 5: Commit** + +```bash +git add .github/workflows/ci.yml +git diff --cached --stat | head +git commit -m "$(cat <<'COMMIT_EOF' +ci: lift lint job from advisory to required + +After the workspace clippy clean-up landed in the preceding commits, +the lint job can become a real PR gate. Drop continue-on-error: true +and the explanatory comment block. + +Going forward, new clippy warnings or rustfmt drift will block PRs. +COMMIT_EOF +)" +``` + +--- + +## Task 6: Group 6 — Delete shipped design specs + +**Files:** +- Delete: `docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md` +- Delete: `docs/superpowers/plans/2026-05-23-iter39-docs-rewrite.md` + +- [ ] **Step 1: Verify the files exist and the iter39 work shipped** + +```bash +ls docs/superpowers/specs/2026-05-23-*.md docs/superpowers/plans/2026-05-23-*.md +git log --oneline | grep -iE "iter39|docs.rewrite" | head -5 +``` + +Expected: both files present; git log shows the iter39 merge (PR #30). + +- [ ] **Step 2: Delete both files** + +```bash +git rm docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md \ + docs/superpowers/plans/2026-05-23-iter39-docs-rewrite.md +``` + +Note: this uses `git rm` so the deletion is staged automatically. + +- [ ] **Step 3: Confirm nothing references the deleted files** + +```bash +grep -rEn "2026-05-23-iter39-docs-rewrite" docs/ crates/ README.md DOCS.md .github/ 2>/dev/null +``` + +Expected: empty. (If anything points at the deleted files, update the reference.) + +- [ ] **Step 4: Commit** + +```bash +git diff --cached --stat +git commit -m "$(cat <<'COMMIT_EOF' +docs: remove shipped iter39 design+plan specs + +The iter39 docs-rewrite spec and plan shipped via PR #30 in 2026-05-23. +Now that the feature is in dev and being relied on, the design docs +no longer need to be discoverable in the repo. Their lineage is in +git history. + +Future protocol: when a docs/superpowers/{specs,plans}/*.md file +references a feature that has fully shipped and closed any deferred +gate, remove it in the next quality cleanup. +COMMIT_EOF +)" +``` + +--- + +## Task 7: Final verification + push + open PR + +- [ ] **Step 1: Confirm commit count** + +```bash +git log origin/dev..HEAD --oneline +# Expect 8 commits: +# 1. a8ad6ddd docs: remove BUG_REVIEW.md; move CLI_MIGRATION.md to docs/ (pre-existing) +# 2. 55cff3fa docs(spec): PR-Q1 quality cleanup design + finalize CLI_MIGRATION refs (pre-existing) +# 3. Group 1: java refs scrub +# 4. Group 2: framing neutralized +# 5. Group 3: env var rename +# 6. Group 4: clippy clean +# 7. Group 5: CI lint required +# 8. Group 6: shipped specs removed +``` + +- [ ] **Step 2: Full workspace test sweep** + +```bash +cargo test --release --workspace -- \ + --skip charge_missing_spectrum_uses_per_charge_scored_spec \ + --skip spectrum_without_charge_tries_charge_range \ + --skip known_peptide_appears_in_top_n \ + --skip read_bsa_canno_text_format \ + --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ + --skip tryp_pig_bov_revcat_full_set_loads \ + --skip match_spectra_output_invariant_across_thread_counts 2>&1 | tee /tmp/q1-final-tests.log | grep -E "^test result|error" | grep -vE "0 passed.*0 failed.*0 ignored" | tail -15 +``` + +Expected: every `test result:` shows `0 failed`. No errors. + +- [ ] **Step 3: Bit-identical regression gate** + +```bash +cargo test --release -p msgf-rust --test precursor_cal_bit_identical 2>&1 | tail -5 +``` + +Expected: `test result: ok. 1 passed`. + +- [ ] **Step 4: Confirm CI lint will pass under -D warnings** + +```bash +cargo clippy --workspace --release -- -D warnings 2>&1 | tail -5 +``` + +Expected: `Finished` with no errors. + +- [ ] **Step 5: Confirm auto-memory still consistent** + +```bash +ls ~/.claude/projects/-Users-yperez-work-msgfplus-workspace/memory/project_pr_a_precursor_cal_shipped.md \ + ~/.claude/projects/-Users-yperez-work-msgfplus-workspace/memory/project_quality_cleanup_pr_q1_active.md \ + ~/.claude/projects/-Users-yperez-work-msgfplus-workspace/memory/project_next_sub_projects_sequencing.md +``` + +Expected: all 3 present. (Group 7 was done during brainstorm; verification only.) + +- [ ] **Step 6: Push the branch** + +```bash +git push -u origin feat/quality-perf-id-rate 2>&1 | tail -5 +``` + +Expected: branch pushed; URL printed. + +- [ ] **Step 7: Open the PR** + +```bash +gh pr create --base dev --head feat/quality-perf-id-rate \ + --title "chore: quality cleanup (Q1) — dangling Java refs, clippy clean, lint required" \ + --body "$(cat <<'PR_BODY' +## Summary + +Post-cutover code-quality sweep. First of three sequential sub-projects +(Q1 quality → S1 speed → I1 ID-rate +5%/dataset). + +Logic-preserving: PIN/TSV output for `--precursor-cal off` is identical +to dev (sorted-row regression gate in tree). + +## What changed (6 commits) + +- **Group 1 (java refs scrub):** 32 dangling `Xxx.java:LINE` citations + in non-test source replaced with intent-only "Java parity" comments. + Parity-test files (`tests/*_java_parity.rs`, `tests/gf_bsa_parity.rs`, + `tests/*_match_java.rs`) untouched. +- **Group 2 (framing):** 5 crate-lib `//!` headers + CLI `--help` + strings reworded to describe behavior directly (not as a port). +- **Group 3 (env var):** `MSGFRUST_RSS_PROBE` → `MSGF_RSS_PROBE`, + legacy name accepted with deprecation warning for one release. +- **Group 4 (clippy):** All workspace warnings cleaned. New type + aliases (`SegmentPartitionCache`, etc.), 5 `too_many_arguments` + refactors / justified `#[allow]`, dead `mut`, doc indentation, etc. +- **Group 5 (CI):** Lint job lifted from `continue-on-error: true` to + required. +- **Group 6 (docs):** Removed 2 shipped design specs from + `docs/superpowers/`. + +## What's NOT in scope + +- Speed work (PR-S1, separate brainstorm) +- ID-rate work (PR-I1, multi-PR research project) +- Parity test files (deliberately preserved) +- `docs/parity-analysis/notes/` (current iter notes) + +## Verification + +- `cargo test --release --workspace` green under existing CI skip list +- `cargo clippy --workspace --release -- -D warnings` clean +- `precursor_cal_bit_identical` regression gate green +- Auto-memory updated (out-of-repo) with PR-A merged status + Q1/S1/I1 sequencing + +Spec: `docs/superpowers/specs/2026-05-26-quality-cleanup-design.md` +Plan: `docs/superpowers/plans/2026-05-26-quality-cleanup-plan.md` +PR_BODY +)" +``` + +Expected: PR URL printed. Record the PR number. + +- [ ] **Step 8: Verify CI starts** + +```bash +sleep 30 +gh pr view --json number,statusCheckRollup --jq '{number, checks: [.statusCheckRollup[]? | {name, status, conclusion}]}' +``` + +Expected: PR open; CI checks `IN_PROGRESS` or starting. Watch for `Lint (clippy + rustfmt)` to now be a hard gate (not skipped). + +--- + +## Self-review + +I checked the plan against the spec section-by-section: + +**1. Spec coverage:** +- Group 1 (dangling Java refs) → Task 1 ✓ +- Group 2 (stale framing) → Task 2 ✓ +- Group 3 (identifier renames) → Task 3 ✓ +- Group 4 (clippy + unused sweep) → Task 4 (4a-4d) ✓ +- Group 5 (CI lint required) → Task 5 ✓ +- Group 6 (remove shipped specs) → Task 6 ✓ +- Group 7 (auto-memory) → Pre-done during brainstorm; verified at Task 7 Step 5 ✓ +- All ship criteria → Task 7 Steps 2-4 ✓ + +**2. Placeholder scan:** Scanned for "TBD", "TODO", "fill in", "implement later". None present. Every Task 4 sub-task references a specific file/line from the clippy inventory. + +**3. Type consistency:** `SegmentPartitionCache` introduced in Task 4b is used by name in subsequent steps. CI lint job name consistent (`Lint (clippy + rustfmt)`). Commit messages refer to the same commit SHAs (`a8ad6ddd`, `55cff3fa`) used in pre-flight expectations. + +**Known soft spots:** +- The exact `cargo clippy --fix` output in Task 4a may vary slightly across clippy versions. If a `--fix` rewrite produces semantically suspect code, Step 2 of Task 4a documents the manual-revert procedure. +- The CLI `--help` strings in Task 2 are inspected by `head` and `grep` rather than enumerated up-front — the implementer reads the actual current content. The plan doesn't pre-script the exact replacements because the strings can drift between plan-writing and execution; the rule is "replace any Java-comparison phrasing with behavior-only". diff --git a/docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md b/docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md deleted file mode 100644 index bc3f0bfb..00000000 --- a/docs/superpowers/specs/2026-05-23-iter39-docs-rewrite-design.md +++ /dev/null @@ -1,272 +0,0 @@ -# iter39 — docs rewrite + CLI rename for the post-cutover state - -**Branch:** `iter39-docs-rewrite` (cut from `master` HEAD `c863dae1`) -**Date:** 2026-05-23 -**Status:** design approved, plan pending - ---- - -## Context - -PR #29 landed the Rust port of MS-GF+ as the production engine. The repo was -de-forked from `MSGFPlus/msgfplus` and renamed `bigbio/msgfplus` → -`bigbio/msgf-rust`. The Rust workspace is now at the repo root -(`Cargo.toml`, `crates/`, `resources/`, `test-fixtures/`). The Rust port beats -Java MS-GF+ at 1% FDR on all three benchmark datasets (Astral +0.98%, -PXD001819 within 0.3% at 3.3× wall, TMT +9.3% at 14% faster wall). - -The current `README.md` and `docs/` tree predate the cutover. They describe -the Java tool: `mvn` build, JAR distribution, Java CLI flags, Java parameter -file templates. Most of it is stale. - -This iteration treats msgf-rust as a new application and writes documentation -from scratch to fit it. It also takes the opportunity to clean up two -Java-historical CLI quirks: numeric-index enum flags and the singular `--mod` -flag for a file path. - -## Goals - -1. New `README.md` that serves both quantms pipeline operators and mass-spec - researchers running searches directly, in a single linear narrative. -2. New single-file `DOCS.md` reference at the repo root. -3. New `CLI_MIGRATION.md` mapping Java MS-GF+ flags and legacy numeric IDs - to the new Rust-idiomatic flag names. -4. CLI rename: replace numeric-ID enum flags with named values; rename - `--ntt` → `--enzyme-specificity`; rename `--mod` → `--mods` with hidden - alias. -5. Backward compatibility at runtime: the binary still accepts the legacy - numeric forms (`--fragmentation 3`, etc.) and the old `--mod` name, so - existing quantms scripts keep working without modification. -6. Delete the stale `docs/` user-facing tree. - -## Non-goals (deferred to later iterations) - -- Dockerfile rewrite (it still builds a Java JAR). -- One-time `cargo fmt` cleanup (~11k cosmetic lines). -- Thread-determinism tie-breaker fix. -- mdBook / GitHub Pages site. -- Porting Java's `ScoringParamGen` to Rust (acknowledged in `DOCS.md` as - roadmap work; tracked as an open issue). - -## Deliverables - -| Path | Action | Purpose | -|---|---|---| -| `README.md` | rewrite | Linear front-door doc serving both audiences. ~190 lines. | -| `DOCS.md` | create | Single-file reference for CLI, formats, training, migration. ~505 lines. | -| `CLI_MIGRATION.md` | create | Java MS-GF+ → msgf-rust mapping + numeric-legacy → named-value table + worked examples. ~100 lines. | -| `crates/msgf-rust/src/bin/msgf-rust.rs` | edit | Add 4 `ValueEnum`-derived types, rename flags, update existing tests. | -| `crates/msgf-rust/tests/cli_smoke.rs` | edit | Add one new test: legacy numeric form and new named form produce identical output. | -| `docs/` user-facing tree | delete | All files listed in "docs/ deletion list" below. | -| `docs/superpowers/specs/` | excluded from deletion | Engineering-planning artifacts; not user-facing. | - -## README.md content + structure - -Linear flow, top-to-bottom. Order chosen so a researcher sees the "why -switch?" benchmark proof early, and an operator can jump straight to -Quick Start and recipes. - -| # | Section | Content | -|---|---|---| -| 1 | Title + tagline + badges | CI, release, license, citation. ~8 lines. | -| 2 | What is this? | One paragraph: Rust port of MS-GF+, mzML/MGF + FASTA in, Percolator-ready `.pin` out. Names UCSD original team. ~10 lines. | -| 3 | Why msgf-rust? | Benchmark table: Rust vs Java MS-GF+ at 1% FDR on Astral / PXD001819 / TMT, plus wall-clock comparison. ~25 lines. | -| 4 | Install | Three options: (a) download a platform archive from GitHub Releases, (b) `cargo install --git`, (c) build from source. ~25 lines. | -| 5 | Quick Start | Minimal command: `msgf-rust --spectrum bsa.mgf --database bsa.fasta --output-pin out.pin`. Brief explanation of the `.pin` row. ~20 lines. | -| 6 | Common workflows | Four recipes: (a) Trypsin DDA + Percolator, (b) TMT search with mods, (c) Direct TSV output, (d) quantms pipeline integration. ~35 lines. | -| 7 | CLI summary | Table of ~15 most-used flags with one-line descriptions; link to `DOCS.md` for full reference. ~25 lines. | -| 8 | Auto-detection | Short paragraph: activation method auto-detected from mzML; param file auto-selected from (fragmentation, instrument, protocol). ~10 lines. | -| 9 | Parity vs Java MS-GF+ | One paragraph summary of what's bit-exact, what differs; link to `DOCS.md` known-divergences section. ~12 lines. | -| 10 | Citation | Cite Kim & Pevzner MS-GF+ paper. ~8 lines. | -| 11 | License | UCSD-Noncommercial; see `LICENSE`, `NOTICE`. ~6 lines. | -| 12 | Acknowledgments | UCSD original team, bigbio maintainers, quantms team. ~6 lines. | - -**Total:** ~190 lines. - -**Not in README** (lives in `DOCS.md` only): full CLI flag reference, -mods.txt grammar, PIN column-by-column reference, building from source in -detail, training notes, Java → Rust migration table, known-divergences -detail. - -## DOCS.md content + structure - -Single file, top-to-bottom. Each section is its own anchor for -deep-linking. - -| # | Section | Content | ~lines | -|---|---|---|---| -| 0 | Table of contents | Anchor links to each section below. | 15 | -| 1 | CLI reference | Every flag, with description / default / value format, grouped by: required, search params, modifications, scoring, runtime, output. | 130 | -| 2 | Mods.txt format | Grammar, per-field rules, location vocabulary, `NumMods=N` directive, comment handling, 3 worked examples (cam-C + ox-M; TMT 10-plex; phospho-STY). | 50 | -| 3 | Output formats | 3a. PIN columns table. 3b. TSV columns table. 3c. Choosing between them. | 90 | -| 4 | Auto-detection | Activation-method detection from mzML CV params; param-file resolution table showing `(fragmentation, instrument, protocol) → bundled file`; instrument-class detection. | 35 | -| 5 | Building from source | Requirements (Rust 1.85+), `cargo build --release`, `cargo test --release` with notes on the 7 known-skipped tests + reasons, where the binary lands. | 30 | -| 6 | Training new `.param` files | The Rust port reuses Java MS-GF+'s `.param` files as-is. ScoringParamGen is not yet ported; tracked as roadmap work. Two paths for now: use bundled `.param` files, or train on `java-legacy` branch and point Rust at the output with `--param-file`. | 25 | -| 7 | Isobaric labeling | TMT and iTRAQ workflows: `--protocol` value, `--mods` entries, which bundled `.param` file gets auto-selected. | 35 | -| 8 | Java MS-GF+ → msgf-rust migration | 8a. Flag rename table (Java `-s` → Rust `--spectrum`, etc.). 8b. Numeric-legacy values (still accepted: `--fragmentation 3` works alongside `--fragmentation HCD`). 8c. Behavior differences (no mzXML, no mzIdentML, etc.). 8d. Known parity divergences. | 80 | -| 9 | License + citation | Full LICENSE excerpt + how to cite. | 15 | - -**Total:** ~505 lines. - -## CLI rename details - -### Flag rename table - -| Old (Java-style, current) | New (Rust-idiomatic) | Default | Accepted legacy form | -|---|---|---|---| -| `--fragmentation <0..=4>` | `--fragmentation ` | `auto` | numeric 0..=4 | -| `--instrument <0..=3>` | `--instrument ` | `low-res` | numeric 0..=3 | -| `--protocol <0..=5>` | `--protocol ` | `auto` | numeric 0..=5 | -| `--ntt <0\|1\|2>` | `--enzyme-specificity ` | `fully` | numeric 0..=2 AND `--ntt` alias | -| `--mod ` | `--mods ` | (none) | `--mod` alias (hidden) | - -Named-value conventions: -- Acronyms uppercase (community standard): HCD, CID, ETD, UVPD, TMT, iTRAQ, TOF. -- Brand names preserve common-form casing: QExactive. -- Descriptive values lowercase kebab-case: `auto`, `low-res`, `high-res`, - `phospho`, `standard`, `non-specific`, `semi`, `fully`. -- clap parsing is case-insensitive — `--fragmentation hcd` works the same - as `--fragmentation HCD`. - -### Implementation per enum flag - -```rust -#[derive(Clone, Copy, Debug, ValueEnum)] -enum Fragmentation { - #[clap(name = "auto")] Auto, - #[clap(name = "CID")] Cid, - #[clap(name = "ETD")] Etd, - #[clap(name = "HCD")] Hcd, - #[clap(name = "UVPD")] Uvpd, -} - -#[arg(long, default_value = "auto", value_parser = parse_fragmentation)] -fragmentation: Fragmentation, - -fn parse_fragmentation(s: &str) -> Result { - // Canonical named value first (case-insensitive). - if let Ok(v) = ::from_str(s, true) { - return Ok(v); - } - // Legacy numeric ID (Java MS-GF+ compat). - match s.parse::() { - Ok(0) => Ok(Fragmentation::Auto), - Ok(1) => Ok(Fragmentation::Cid), - Ok(2) => Ok(Fragmentation::Etd), - Ok(3) => Ok(Fragmentation::Hcd), - Ok(4) => Ok(Fragmentation::Uvpd), - _ => Err(format!( - "invalid fragmentation `{s}`: expected auto|CID|ETD|HCD|UVPD \ - (or legacy 0..=4)" - )), - } -} -``` - -Same shape for `Instrument`, `Protocol`, `EnzymeSpecificity`. - -### `--mods` rename - -```rust -#[arg(long = "mods", alias = "mod", value_name = "MODFILE")] -mods: Option, -``` - -`alias` (not `visible_alias`) means `--mod` is still accepted but `--help` -only shows `--mods`. - -### Quantms compat policy - -For v0.1.0 (the cutover release) the numeric form is "Java legacy" rather -than "deprecated Rust v0". Accept silently — no deprecation warning to -stderr. Migration is documented in `DOCS.md` §8 and `CLI_MIGRATION.md`. -Working quantms scripts keep working with zero changes. - -### Internal code changes - -- Replace `Option` enum fields + numeric-positional calls - (`resolve_bundled_param(Some(3), Some(3), Some(4))`) with strongly-typed - enums (`resolve_bundled_param(Fragmentation::Hcd, Instrument::QExactive, - Protocol::Tmt)`). -- Update the 15 `param_resolver_tests` (~30 line diff). -- The auto-detect path (`resolve_bundled_param_for_activation`) now - constructs the enum variants directly instead of numeric IDs. - -## CLI_MIGRATION.md content - -~100 lines. Two tables + worked examples. - -- **Table A — Java MS-GF+ flag → msgf-rust flag.** Full mapping: `-s` → - `--spectrum`, `-d` → `--database`, `-o` → `--output-pin`, `-mod` → - `--mods`, `-tda 1` → "not needed, decoys auto-generated", `-inst N` → - `--instrument `, etc. -- **Table B — Numeric legacy → named values.** The same content as the - Implementation table above, formatted for users porting scripts. -- **3 worked examples.** A Java MS-GF+ command line rewritten as a - msgf-rust command line, side-by-side, for: (a) plain Trypsin DDA + 20ppm, - (b) TMT 10-plex search, (c) phospho-STY search. - -## docs/ deletion list - -Delete (all in this PR): - -- `docs/msgfplus.md` -- `docs/msgfdb_modfile.md` -- `docs/buildsa.md` -- `docs/output.md` -- `docs/readme.md` -- `docs/troubleshooting.md` -- `docs/training-scoring-models.md` -- `docs/isobariclabeling.md` -- `docs/changelog.md` -- `docs/parameterfiles/` (15 `.txt` files) -- `docs/examples/` (`Mods.txt`, `enzymes.txt`, etc. — content migrates into - `DOCS.md` as inline examples) -- `docs/benchmarks/` (3 PNG figures from the Java perf comparison; stale) - -Keep (excluded from deletion): - -- `docs/superpowers/specs/` — engineering-planning subdirectory, not - user-facing docs. This document lives here. - -Already gitignored, no action: - -- `docs/parity-analysis/` — local-only iter notes from iter1-38 development. - -## Testing - -| File | Change | -|---|---| -| `crates/msgf-rust/src/bin/msgf-rust.rs` (`param_resolver_tests`, 15 tests) | Update each from `resolve_bundled_param(Some(3), Some(3), Some(4))` → `resolve_bundled_param(Fragmentation::Hcd, Instrument::QExactive, Protocol::Tmt)`. Mechanical. | -| `crates/msgf-rust/tests/cli_smoke.rs` (7 existing integration tests) | The tests use `--fragmentation 3 --instrument 3 --protocol 4` strings; these still work (legacy accepted), so no behavior change is required. | -| `crates/msgf-rust/tests/cli_smoke.rs` (new test) | `cli_accepts_both_named_and_numeric_param_values`: run a search with `--fragmentation 3 --protocol 4` (legacy) and again with `--fragmentation HCD --protocol TMT` (canonical); assert PIN outputs are byte-identical. Guards the back-compat path. | - -CI workflow (`.github/workflows/ci.yml`) — no change. The 7 currently-skipped -tests remain skipped for the reasons documented inline. - -## Commit plan - -One PR (`iter39-docs-rewrite` → `dev`), five reviewable commits in order: - -1. `feat(cli): rename param flags to Rust-idiomatic named values with legacy compat` — CLI rename, enum types, custom parsers, updated `param_resolver_tests`, new round-trip test. -2. `docs: write new README.md (post-cutover, dual audience, linear narrative)` — replace `README.md`. -3. `docs: add DOCS.md (single-file reference)` — new `DOCS.md`. -4. `docs: add CLI_MIGRATION.md (Java → Rust + numeric legacy mapping)` — new file. -5. `docs: delete docs/ tree (content migrated to DOCS.md)` — `git rm -r` everything from the deletion list above; `docs/superpowers/` is preserved. - -PR title: `iter39: docs + CLI rename for the post-cutover state` - -## Risks - -- **Risk:** A quantms script uses `--fragmentation 3` and we silently break it. **Mitigation:** the new round-trip integration test in `cli_smoke.rs` ensures legacy numeric values resolve to the same enum variants as the named values, locked in CI. -- **Risk:** Hidden `--mod` alias is missed by a user trying to migrate. **Mitigation:** `CLI_MIGRATION.md` calls it out as a top-line "what's renamed" entry. -- **Risk:** The deletion of `docs/parameterfiles/*.txt` breaks external links from third-party tooling that bundled those templates. **Mitigation:** Low — these were Java `-conf` templates; no equivalent Rust mechanism exists. `CLI_MIGRATION.md` covers the closest Rust path (direct CLI flags + `--param-file`). -- **Risk:** README + DOCS.md diverge from the binary over time. **Mitigation:** acceptable — both files are short enough that any future iteration that touches CLI flags or output format updates them in the same PR. - -## Out of scope (re-affirming) - -- Dockerfile rewrite -- One-time `cargo fmt` -- Thread-determinism tie-breaker -- mdBook / Pages site -- Porting ScoringParamGen From ea1f481fb37c80d03977b2ab25a105140b5b51c2 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 11:03:49 +0100 Subject: [PATCH 09/23] chore: address PR-Q1 final review observations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three non-blocking observations from the final code review: 1. DOCS.md §97 documented only the legacy MSGFRUST_RSS_PROBE name. Now mentions MSGF_RSS_PROBE as the canonical with the legacy noted. 2. crates/model/src/amino_acid.rs:13 inline comment referenced the legacy name; updated to MSGF_RSS_PROBE. 3. log_rss deprecation warning fired on every call when only the legacy env var was set. Guard with std::sync::Once so it prints exactly once per process invocation. All non-functional; verification: deprecation warning count is now 1 under MSGFRUST_RSS_PROBE=1 + multiple log_rss checkpoints. --- DOCS.md | 2 +- crates/model/src/amino_acid.rs | 2 +- crates/msgf-rust/src/bin/msgf-rust.rs | 15 +++++++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/DOCS.md b/DOCS.md index f2422cf8..10b76397 100644 --- a/DOCS.md +++ b/DOCS.md @@ -94,7 +94,7 @@ Only tryptic enzyme models are bundled; other enzymes require `--param-file`. |---|---|---|---|---| | `--output-tsv` | path | *(off)* | Optional tab-separated PSM report (§3b). Skipped in bench mode (`--max-spectra > 0`). | Java `-outputFormat 1` with output path | -**Environment variable:** set `MSGFRUST_RSS_PROBE=1` on Linux to print `VmRSS` checkpoints to stderr during long runs (debugging memory use). +**Environment variable:** set `MSGF_RSS_PROBE=1` on Linux to print `VmRSS` checkpoints to stderr during long runs (debugging memory use). The legacy name `MSGFRUST_RSS_PROBE=1` is still accepted with a one-line deprecation warning and will be removed in the next quality cleanup. --- diff --git a/crates/model/src/amino_acid.rs b/crates/model/src/amino_acid.rs index a5c719a9..b46c3c46 100644 --- a/crates/model/src/amino_acid.rs +++ b/crates/model/src/amino_acid.rs @@ -10,7 +10,7 @@ //! cloned the `Modification`'s `String` `name` (and optional accession), //! producing one heap allocation per modified residue per candidate. At //! Astral scale that drives `PreparedSearch::prepare` to ~27 GB RSS on a -//! 31 GB VM (verified by the `MSGFRUST_RSS_PROBE=1` probe in +//! 31 GB VM (verified by the `MSGF_RSS_PROBE=1` probe in //! `msgf-rust.rs`). Wrapping `Modification` in `Arc` makes clones a //! refcount bump and shrinks `AminoAcid` from ~96 B to 24 B. diff --git a/crates/msgf-rust/src/bin/msgf-rust.rs b/crates/msgf-rust/src/bin/msgf-rust.rs index a1b1b397..1cacf6f6 100644 --- a/crates/msgf-rust/src/bin/msgf-rust.rs +++ b/crates/msgf-rust/src/bin/msgf-rust.rs @@ -235,15 +235,18 @@ fn main() -> ExitCode { /// We gate behind an env var so production runs stay quiet; flip the var on /// when debugging memory regressions. fn log_rss(tag: &str) { - // Accept both new and legacy env var names. Legacy emits a one-time - // deprecation warning on stderr. + // Accept both new and legacy env var names. Legacy emits the + // deprecation warning once per process (sync::Once guard). let new_set = std::env::var_os("MSGF_RSS_PROBE").is_some(); let legacy_set = std::env::var_os("MSGFRUST_RSS_PROBE").is_some(); if legacy_set && !new_set { - eprintln!( - "WARN: MSGFRUST_RSS_PROBE is deprecated; use MSGF_RSS_PROBE \ - (legacy name accepted in this release, will be removed next)" - ); + static LEGACY_WARN_ONCE: std::sync::Once = std::sync::Once::new(); + LEGACY_WARN_ONCE.call_once(|| { + eprintln!( + "WARN: MSGFRUST_RSS_PROBE is deprecated; use MSGF_RSS_PROBE \ + (legacy name accepted in this release, will be removed next)" + ); + }); } if !new_set && !legacy_set { return; From 28e7a65aa5915ee968073550fbeb210a12d983a1 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 13:11:14 +0100 Subject: [PATCH 10/23] docs(spec): PR-V1 value-delivering design (stacks on PR-Q1 cleanup) After PR #35 (PR-Q1) closed unmerged for not delivering measurable wins, pivot strategy: stack 3 loosely-coupled sub-features on top of the cleanup commits and ship ONE PR with bench-gated value. Sub-features: - S1: profile-guided Astral wall reduction (gate: -5% wall) - S2: LFQ calibrator threshold fallback 1e-6 -> 1e-5 (gate: +50 PSMs) - S3: additive PrecursorErrorPpmSquared PIN column (gate: +50 PSMs on any one dataset) Each sub-feature ships only if its bench gate passes; failures get dropped before merge. --- .../specs/2026-05-26-pr-v1-design.md | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-26-pr-v1-design.md diff --git a/docs/superpowers/specs/2026-05-26-pr-v1-design.md b/docs/superpowers/specs/2026-05-26-pr-v1-design.md new file mode 100644 index 00000000..09678bcc --- /dev/null +++ b/docs/superpowers/specs/2026-05-26-pr-v1-design.md @@ -0,0 +1,168 @@ +# Design — PR-V1 (Value-delivering improvements stacked on cleanup) + +**Date:** 2026-05-26 +**Branch:** `feat/quality-perf-id-rate` (HEAD `ea1f481f` after PR #35 closed unmerged) +**Status:** Spec for review + +## Problem + +PR #35 (PR-Q1: code-quality cleanup) was closed unmerged because, while the 9 cleanup commits are real (lint gate, dangling Java refs scrubbed, identifier renames, clippy clean), they delivered no measurable PSM or speed improvement on the bench. The user's original ask was speed AND ID-rate wins; the brainstormed Q1 → S1 → I1 decomposition produced a first PR with no headline value. + +PR-V1 pivots: stack measurable improvements ON TOP of the existing cleanup commits and only open a PR when the bench shows at least one concrete win. Cleanup commits become the foundation; value delivery is the deliverable. + +## Goal + +Land ONE PR that delivers AT LEAST ONE of: +- Astral wall ≥5% reduction (off mode, controlled VM conditions) +- LFQ auto @1% FDR ≥ +50 PSMs over current (14,755 → ≥14,805) +- Any dataset auto @1% FDR ≥ +50 PSMs from a new additive PIN column + +Each sub-feature has its own gate. Sub-features that fail their gate get dropped before merge; the PR ships only what passes. + +## Non-goals + +- score_psm trace investigation (I5 in the brainstorm) — separate research PR after PR-V1 +- Algorithm-level restructuring beyond profile-identified hotspots +- Touching any `tests/*_java_parity.rs`, `tests/gf_bsa_parity.rs`, `tests/*_match_java.rs` +- Reverting the closed PR-Q1 commits (they stay on the branch as foundation) +- Edition / toolchain bumps + +## Current baseline (post PR-A merge, with PR-Q1 cleanup commits) + +| Dataset | Rust off | Rust auto | Java auto | Δ Rust-auto vs Java | +|---|---:|---:|---:|---:| +| LFQ (PXD001819) | 14,755 | 14,755 (cal skipped: 193/200) | 15,088 | −2.2% | +| Astral | 36,138 | 36,715 | 36,271 | **+1.4% (beats Java)** | +| TMT | 9,364 | 9,605 | 10,212 | −5.9% | + +| Dataset | Astral off wall | Astral auto wall | +|---|---|---| +| Astral | ~6:12 (PR-A bench, low VM load) | ~6:53 (PR-A bench) | + +These are the numbers each sub-feature is measured against. + +## Architecture — three loosely-coupled sub-features + +### S1 — Profile-guided Astral wall reduction + +**What:** Capture a flamegraph on the bench VM running `--precursor-cal off` on the Astral fixture (the largest, slowest dataset). Identify the top 3 hotspots. Apply 1–2 targeted optimizations to those hotspots only. NOT speculative restructuring. + +**Why:** Memory says iter32–38 already shipped the obvious perf wins (P-9b partition_for hoist, iter32 pipeline parse, etc.). What remains is profile-only-visible — without a flamegraph we'd be guessing. The bench VM (`pride-linux-vm`) has `cargo` + `perf` available. + +**Files:** Determined by profile. Likely candidates from memory: +- `crates/scoring/src/scoring/scored_spectrum.rs::directional_node_score_inner` (hot loop) +- `crates/scoring/src/gf/generating_function.rs::setup_score_threshold` + DP +- `crates/search/src/match_engine.rs::run_chunk_inner` +- `crates/input/src/mzml.rs` (mzML parse) + +**Procedure:** +1. Build PR-V1 binary on the VM with `cargo build --release --bin msgf-rust -- -C debuginfo=line-tables-only` (or similar) for stack frames +2. Wrap one Astral cal=off run in `perf record -F 99 -g` +3. Generate flamegraph via `inferno-flamegraph` (already installed in cargo? check first) +4. Visually identify the top 3 stack frames by exclusive time +5. Pick the 1–2 with the clearest path to a code change +6. Apply, re-bench under same controlled conditions, compare + +**Gate:** Astral wall ≥5% reduction (off mode) AND no other-dataset wall regression >2%. + +**If gate fails:** Drop S1 entirely; the profile work goes into a follow-up brainstorming spec (we now know where the time goes, even if we can't reduce it in this PR). + +### S2 — LFQ calibrator threshold fallback (was I1) + +**What:** Modify `crates/search/src/mass_calibrator.rs::learn_calibration_stats` so that if at the current `MAX_SPEC_EVALUE=1e-6` the `confident_psm_count` falls short of `MIN_CONFIDENT_PSMS (200)`, retry the residual extraction once with `MAX_SPEC_EVALUE=1e-5`. If the retry succeeds, use those residuals; if it still falls short, return empty stats (current behavior). + +**Why:** Rust's cal pre-pass on LFQ finds 193/200 confident PSMs at 1e-6. The 7-PSM shortfall is a SpecE-tail-distribution drift (Rust's spec_e values are shifted +0.87 in the agreement bucket vs Java per `2026-05-25-spece-tail-exploration.md`). A 1-decade SpecE relaxation is mathematically safe because: +- The cal computes a MEDIAN of residuals — robust to small numbers of noisier outliers +- The robust-sigma uses MAD × 1.4826 — also robust +- Java sticks at 1e-6 as a conservative default; we add an FALLBACK, not a baseline change + +The fallback preserves Java parity on Astral and TMT (both already qualify 200 at 1e-6) while recovering LFQ. + +**Files:** +- Modify: `crates/search/src/mass_calibrator.rs` (`learn_calibration_stats`, ~10 lines) +- Modify: `crates/search/src/precursor_cal.rs::constants` — add `MAX_SPEC_EVALUE_FALLBACK: f64 = 1e-5` +- Modify: `crates/search/tests/mass_calibrator_integration.rs` (new test asserting the fallback path) +- Modify: `docs/parity-analysis/snapshots/cal-shifts-2026-05-25.json` → bump to `cal-shifts-2026-05-26.json` (or update) reflecting LFQ now firing + +**Procedure:** +1. Add `MAX_SPEC_EVALUE_FALLBACK` constant +2. Refactor `learn_calibration_stats` to take the threshold as an arg internally (preserve external signature); call once at primary threshold; if `< MIN_CONFIDENT_PSMS`, retry at fallback +3. Update `CalibrationStats` to expose `effective_threshold_used: f64` for logging +4. Update integration test +5. Bench + +**Gate:** LFQ auto @1% FDR ≥ 14,805 (current 14,755 + 50 PSMs) AND no other dataset regresses. + +**If gate fails:** Drop S2. The fallback constant stays defined as dead code with `#[allow(dead_code)]` and a comment explaining why it didn't ship; remove in next quality cleanup. + +### S3 — Additive PIN column: `PrecursorErrorPpmSquared` (was I3) + +**What:** Add a new column `PrecursorErrorPpmSquared` to the Percolator PIN writer. Value: `psm.mass_error_ppm.powi(2)`. Header schema updated; value computed at write time (no new state on PsmMatch). + +**Why:** Percolator fits linear weights over PIN features. A pure linear weight cannot capture the U-shape of mass-error contribution to PSM confidence (small |ppm| = good, large |ppm| = bad in either direction). Adding the squared variant gives Percolator a linearized magnitude discriminator. Per the n=9 audit pattern (iter19 EdgeScore precedent), additive PIN columns are safe — they cannot regress existing Percolator weights, only potentially be picked up. + +**Files:** +- Modify: `crates/output/src/pin.rs` — add column to header + value emission +- Update: `test-fixtures/parity/goldens/precursor_cal_off.pin` (regenerate; PIN format changes for this PR) +- Update: `crates/msgf-rust/tests/precursor_cal_bit_identical.rs` test docstring noting the new column + +**Procedure:** +1. Read current PIN header in `pin.rs` +2. Append `PrecursorErrorPpmSquared` between existing `absdm` and `charge2` (preserves consistent positioning per memory's PIN header order) +3. Emit `format!("{:.6}", psm.mass_error_ppm.powi(2))` per PSM row at the matching column +4. Regenerate the bit-identical golden by running `msgf-rust --precursor-cal off` on `test-fixtures/test.mgf` + `test-fixtures/BSA.fasta` +5. Bench + +**Gate:** AT LEAST ONE dataset shows auto @1% FDR ≥ +50 PSMs over current AND no dataset regresses >50 PSMs. + +**If gate fails:** Drop S3 entirely (revert the pin.rs + golden changes). The PIN column is purely additive, so dropping it is clean. + +## Verification / ship criteria + +Bench protocol per sub-feature implementation: +1. Ping user to pause `conda-build` cohabitant on `pride-linux-vm` +2. Confirm `uptime` shows 1-min load avg < 2.0 +3. Run `/srv/data/msgf-bench/run_bench_pr_q1.sh` (or a copy pointed at PR-V1's binary) +4. Compare PSM counts + wall times vs the baseline in this spec +5. Restart conda-build (user) + +The PR opens if AND ONLY IF at least one sub-feature passes its gate. If all three fail, no PR — we cycle back to brainstorming with new candidates. + +## Risks & mitigations + +| Risk | Mitigation | +|---|---| +| Profiling reveals no clear hotspot (algorithm-bound, no quick win) | Drop S1; ship S2+S3 only | +| S2 fallback shifts a real mass and regresses Astral/TMT | Per-dataset bench gate catches; revert just S2 | +| S3 PrecursorError² column ends up flat (Percolator already extracts it from existing mass-error feature) | Additive feature; safe to ship even flat. Future: drop in a quality cleanup if proven flat over multiple datasets | +| Combined PR delivers no measurable win (all 3 fail gates) | Do NOT open a PR; cycle back to brainstorm | +| VM load contention during bench | User pauses conda-build per agreed protocol; bench results quote load avg | +| PIN golden regeneration breaks downstream consumers (Percolator config, quantms scripts) | Update DOCS.md PIN schema section in the S3 commit; document in PR description; column is APPENDED, not inserted mid-row, where possible | + +## Sequencing + +``` +feat/quality-perf-id-rate (HEAD: ea1f481f) + ↓ +S1: profile + 1-2 hotspot fixes (commits depend on findings) + ↓ bench gate (Astral wall -5%) +S2: LFQ cal fallback (~2 commits) + ↓ bench gate (LFQ +50 PSMs) +S3: PrecursorError² PIN column (1 commit + golden regen) + ↓ bench gate (≥1 dataset +50 PSMs) +Final 3-dataset bench + ↓ if ≥1 sub passed +PR open with quoted bench numbers +``` + +## Open questions + +None — all design points resolved in brainstorming. Implementation order and per-feature gates are explicit. + +## Related documents + +- `docs/superpowers/specs/2026-05-26-quality-cleanup-design.md` — PR-Q1 (cleanup foundation; commits 1–9 on this branch) +- `docs/parity-analysis/notes/2026-05-25-precursor-cal-ship-gates.md` — current G1 status (still open; PR-V1 may close part of LFQ's deferred gap) +- `docs/parity-analysis/notes/2026-05-25-spece-tail-exploration.md` — SpecE-tail context relevant to S2 (the 193/200 issue) +- PR #33 — PR-A (MassCalibrator port; the baseline this PR builds on) +- PR #35 (closed) — PR-Q1 (the cleanup commits this PR stacks on top of) From 542ab6e2506ccd54379755b4864b1aee9ac4d5c2 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 17:19:00 +0100 Subject: [PATCH 11/23] perf(s1): swap HashMap -> FxHashMap on hot scoring tables Profile (perf record on Astral cal=off, 285K samples) identified 36.82% of CPU in HashMap lookups using SipHash13. The hot scoring path (compute_inner, directional_node_score_inner, rank_scorer::error_score) repeatedly looks up Partition keys for rank_dist_table, frag_off_table, ion_existence_table, etc. Switch the 7 Param hot tables to FxHashMap (rustc-hash). SipHash13's state-init + 13-round mix is unnecessary for non-cryptographic keys on a single-process search; FxHasher is a single multiply-and-xor. Same .get/.insert API; only iteration order differs (and no hot path iterates these tables). Expected: ~25-30% reduction in match_spectra wall on Astral cal=off. Bench gate (S1) requires >= 5% Astral wall reduction to ship. --- Cargo.lock | 3 +- crates/scoring/Cargo.toml | 2 + crates/scoring/src/param_model.rs | 68 ++++++++-------- crates/scoring/src/scoring/psm_score.rs | 18 ++--- crates/scoring/src/scoring/scored_spectrum.rs | 80 +++++++++---------- crates/scoring/src/testutil.rs | 34 ++++---- crates/scoring/tests/gf_graph_dp.rs | 18 ++--- .../tests/primitive_graph_arena_parity.rs | 18 ++--- crates/search/src/match_engine.rs | 18 ++--- .../tests/mass_calibrator_integration.rs | 17 ++-- crates/search/tests/match_engine_smoke.rs | 18 ++--- .../search/tests/match_engine_specevalue.rs | 18 ++--- 12 files changed, 158 insertions(+), 154 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d06d8962..d735d59d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "adler2" @@ -496,6 +496,7 @@ dependencies = [ "byteorder", "input", "model", + "rustc-hash", "tempfile", "thiserror", ] diff --git a/crates/scoring/Cargo.toml b/crates/scoring/Cargo.toml index b5fb6d16..f154d976 100644 --- a/crates/scoring/Cargo.toml +++ b/crates/scoring/Cargo.toml @@ -7,9 +7,11 @@ license.workspace = true [dependencies] model = { path = "../model" } +rustc-hash = "2" thiserror = { workspace = true } byteorder = { workspace = true } [dev-dependencies] +rustc-hash = "2" tempfile = "3.10" input = { path = "../input" } diff --git a/crates/scoring/src/param_model.rs b/crates/scoring/src/param_model.rs index 15d95fd8..77160aaa 100644 --- a/crates/scoring/src/param_model.rs +++ b/crates/scoring/src/param_model.rs @@ -1,8 +1,9 @@ //! Loader for the MS-GF+ `.param` binary format. use std::cmp::Ordering; -use std::collections::HashMap; use std::hash::{Hash, Hasher}; + +use rustc_hash::FxHashMap; use std::io::Cursor; use std::path::Path; @@ -27,29 +28,30 @@ pub struct Param { pub num_segments: i32, pub partitions: Vec, pub num_precursor_off: i32, - pub precursor_off_map: HashMap>, - pub frag_off_table: HashMap>, + pub precursor_off_map: FxHashMap>, + pub frag_off_table: FxHashMap>, pub max_rank: i32, - pub rank_dist_table: HashMap>>, + pub rank_dist_table: FxHashMap>>, pub error_scaling_factor: i32, - pub ion_err_dist_table: HashMap>, - pub noise_err_dist_table: HashMap>, - pub ion_existence_table: HashMap>, + pub ion_err_dist_table: FxHashMap>, + pub noise_err_dist_table: FxHashMap>, + pub ion_existence_table: FxHashMap>, /// Pre-filtered ion-type list per partition (Noise excluded), populated /// at load time. Used by `ion_types_for_partition_slice` to avoid /// per-call Vec allocation in the GF DP hot path. /// Call `rebuild_cache()` after manually constructing a `Param` in tests /// or any context where the cache was not populated during `load_from_bytes`. - pub partition_ion_types_cache: HashMap>, + pub partition_ion_types_cache: FxHashMap>, } /// Build the per-partition ion-type cache (Noise excluded). Single source of /// truth for both the parser (`load_from_bytes`) and the test helper /// (`Param::rebuild_cache`). fn build_partition_ion_types_cache( - frag_off_table: &HashMap>, -) -> HashMap> { - let mut cache: HashMap> = HashMap::with_capacity(frag_off_table.len()); + frag_off_table: &FxHashMap>, +) -> FxHashMap> { + let mut cache: FxHashMap> = + FxHashMap::with_capacity_and_hasher(frag_off_table.len(), Default::default()); for (&part, frag_list) in frag_off_table { let mut ions: Vec = Vec::with_capacity(frag_list.len()); for fof in frag_list { @@ -318,7 +320,7 @@ fn read_param(cursor: &mut Cursor<&[u8]>) -> Result { // -- Section 6: precursor offset frequency -- let num_precursor_off = read_i32(cursor)?; - let mut precursor_off_map: HashMap> = HashMap::new(); + let mut precursor_off_map: FxHashMap> = FxHashMap::default(); for _ in 0..num_precursor_off { let charge = read_i32(cursor)?; let reduced_charge = read_i32(cursor)?; @@ -337,7 +339,7 @@ fn read_param(cursor: &mut Cursor<&[u8]>) -> Result { } // -- Section 7: fragment offset frequency (per partition, in sorted order) -- - let mut frag_off_table: HashMap> = HashMap::new(); + let mut frag_off_table: FxHashMap> = FxHashMap::default(); for &partition in &partitions { let size = read_i32(cursor)?; let mut frags = Vec::with_capacity(size as usize); @@ -358,14 +360,14 @@ fn read_param(cursor: &mut Cursor<&[u8]>) -> Result { // -- Section 8: rank distributions (per partition × per ion type incl. NOISE) -- let max_rank = read_i32(cursor)?; - let mut rank_dist_table: HashMap>> = HashMap::new(); + let mut rank_dist_table: FxHashMap>> = FxHashMap::default(); for &partition in &partitions { let frag_list = frag_off_table.get(&partition); // Skip partitions with no ion types. if frag_list.is_none_or(|v| v.is_empty()) { continue; } - let mut table: HashMap> = HashMap::new(); + let mut table: FxHashMap> = FxHashMap::default(); let mut ion_types: Vec = frag_list.unwrap().iter().map(|f| f.ion_type).collect(); ion_types.push(IonType::Noise); for ion in ion_types { @@ -380,9 +382,9 @@ fn read_param(cursor: &mut Cursor<&[u8]>) -> Result { // -- Section 9: error distributions (conditional) -- let error_scaling_factor = read_i32(cursor)?; - let mut ion_err_dist_table: HashMap> = HashMap::new(); - let mut noise_err_dist_table: HashMap> = HashMap::new(); - let mut ion_existence_table: HashMap> = HashMap::new(); + let mut ion_err_dist_table: FxHashMap> = FxHashMap::default(); + let mut noise_err_dist_table: FxHashMap> = FxHashMap::default(); + let mut ion_existence_table: FxHashMap> = FxHashMap::default(); if error_scaling_factor > 0 { let dist_len = (error_scaling_factor as usize) * 2 + 1; for &partition in &partitions { @@ -947,7 +949,6 @@ mod tests { use model::instrument::InstrumentType; use model::protocol::Protocol; use model::tolerance::Tolerance; - use std::collections::HashMap; Param { version: 10001, @@ -966,15 +967,15 @@ mod tests { num_segments: 1, partitions: vec![], num_precursor_off: 0, - precursor_off_map: HashMap::new(), - frag_off_table: HashMap::new(), + precursor_off_map: FxHashMap::default(), + frag_off_table: FxHashMap::default(), max_rank: 3, - rank_dist_table: HashMap::new(), + rank_dist_table: FxHashMap::default(), error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), } } @@ -1112,14 +1113,13 @@ mod tests { use model::instrument::InstrumentType; use model::protocol::Protocol; use model::tolerance::Tolerance; - use std::collections::HashMap; let part = Partition { charge: 2, parent_mass: 1000.0, seg_num: 0 }; let prefix = IonType::Prefix { charge: 1, offset_bits: 0.0_f32.to_bits() }; let suffix = IonType::Suffix { charge: 1, offset_bits: 0.0_f32.to_bits() }; // Populate frag_off_table (the source of truth for ion_types_for_segment). - let mut frag_off_table: HashMap> = HashMap::new(); + let mut frag_off_table: FxHashMap> = FxHashMap::default(); frag_off_table.insert(part, vec![ FragmentOffsetFrequency { ion_type: prefix, frequency: 0.7 }, FragmentOffsetFrequency { ion_type: suffix, frequency: 0.6 }, @@ -1142,15 +1142,15 @@ mod tests { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank: 2, - rank_dist_table: HashMap::new(), + rank_dist_table: FxHashMap::default(), error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; param.rebuild_cache(); diff --git a/crates/scoring/src/scoring/psm_score.rs b/crates/scoring/src/scoring/psm_score.rs index 803addac..17b7cb70 100644 --- a/crates/scoring/src/scoring/psm_score.rs +++ b/crates/scoring/src/scoring/psm_score.rs @@ -251,7 +251,7 @@ mod tests { use crate::scoring::scored_spectrum::ScoredSpectrum; use model::spectrum::Spectrum; use crate::testutil::tiny_param; - use std::collections::HashMap; + use rustc_hash::FxHashMap; fn pep(seq: &[u8]) -> Peptide { let residues: Vec = seq @@ -289,14 +289,14 @@ mod tests { let ion_freqs = vec![0.6_f32, 0.3, 0.05, 0.001]; let noise_freqs = vec![0.1_f32, 0.2, 0.3, 0.4]; - let mut ion_table: HashMap> = HashMap::new(); + let mut ion_table: FxHashMap> = FxHashMap::default(); ion_table.insert(prefix_ion, ion_freqs); ion_table.insert(noise_ion, noise_freqs); - let mut rank_dist_table: HashMap>> = HashMap::new(); + let mut rank_dist_table: FxHashMap>> = FxHashMap::default(); rank_dist_table.insert(part, ion_table); - let mut frag_off_table = HashMap::new(); + let mut frag_off_table = FxHashMap::default(); frag_off_table.insert(part, vec![FragmentOffsetFrequency { ion_type: prefix_ion, frequency: 0.7 }]); let mut p = Param { @@ -316,15 +316,15 @@ mod tests { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank: 3, rank_dist_table, error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; p.rebuild_cache(); p diff --git a/crates/scoring/src/scoring/scored_spectrum.rs b/crates/scoring/src/scoring/scored_spectrum.rs index 5c012ec5..ca7ffc49 100644 --- a/crates/scoring/src/scoring/scored_spectrum.rs +++ b/crates/scoring/src/scoring/scored_spectrum.rs @@ -1111,7 +1111,7 @@ mod tests { use crate::param_model::SpecDataType; use model::protocol::Protocol; use model::tolerance::Tolerance; - use std::collections::HashMap; + use rustc_hash::FxHashMap; // Spectrum: precursor_mz=501.00727649 → neutral_mass≈(501.007-PROTON)*2≈1000.0 Da, // charge=2. @@ -1144,15 +1144,15 @@ mod tests { num_segments: 1, partitions: vec![], num_precursor_off: 0, - precursor_off_map: HashMap::new(), - frag_off_table: HashMap::new(), + precursor_off_map: FxHashMap::default(), + frag_off_table: FxHashMap::default(), max_rank: 3, - rank_dist_table: HashMap::new(), + rank_dist_table: FxHashMap::default(), error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; let scorer = RankScorer::new(¶m); @@ -1190,7 +1190,7 @@ mod tests { use model::instrument::InstrumentType; use model::protocol::Protocol; use model::tolerance::Tolerance; - use std::collections::HashMap; + use rustc_hash::FxHashMap; Param { version: 10001, data_type: SpecDataType { @@ -1208,15 +1208,15 @@ mod tests { num_segments: 1, partitions: vec![], num_precursor_off: 0, - precursor_off_map: HashMap::new(), - frag_off_table: HashMap::new(), + precursor_off_map: FxHashMap::default(), + frag_off_table: FxHashMap::default(), max_rank: 3, - rank_dist_table: HashMap::new(), + rank_dist_table: FxHashMap::default(), error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), } } @@ -1496,7 +1496,7 @@ mod tests { use crate::param_model::{FragmentOffsetFrequency, SpecDataType}; use model::protocol::Protocol; use model::tolerance::Tolerance; - use std::collections::HashMap; + use rustc_hash::FxHashMap; let part = Partition { charge: 2, parent_mass: 1000.0, seg_num: 0 }; let prefix1 = IonType::Prefix { charge: 1, offset_bits: 0.0_f32.to_bits() }; @@ -1505,14 +1505,14 @@ mod tests { let ion_freqs = vec![0.6_f32, 0.3, 0.05, 0.001]; let noise_freqs = vec![0.1_f32, 0.2, 0.3, 0.4]; - let mut ion_table: HashMap> = HashMap::new(); + let mut ion_table: FxHashMap> = FxHashMap::default(); ion_table.insert(prefix1, ion_freqs); ion_table.insert(noise, noise_freqs); - let mut rank_dist_table: HashMap>> = HashMap::new(); + let mut rank_dist_table: FxHashMap>> = FxHashMap::default(); rank_dist_table.insert(part, ion_table); - let mut frag_off_table = HashMap::new(); + let mut frag_off_table = FxHashMap::default(); frag_off_table.insert(part, vec![FragmentOffsetFrequency { ion_type: prefix1, frequency: 0.7, @@ -1522,13 +1522,13 @@ mod tests { let error_scaling_factor = 2_i32; let dist_len = (error_scaling_factor as usize) * 2 + 1; - let mut ion_err_dist_table: HashMap> = HashMap::new(); + let mut ion_err_dist_table: FxHashMap> = FxHashMap::default(); ion_err_dist_table.insert(part, vec![0.1_f32, 0.2, 0.4, 0.2, 0.1]); - let mut noise_err_dist_table: HashMap> = HashMap::new(); + let mut noise_err_dist_table: FxHashMap> = FxHashMap::default(); noise_err_dist_table.insert(part, vec![0.05_f32, 0.1, 0.7, 0.1, 0.05]); - let mut ion_existence_table: HashMap> = HashMap::new(); + let mut ion_existence_table: FxHashMap> = FxHashMap::default(); // [nn, ?, ?, yy] = [0.1, 0.3, 0.3, 0.5] ion_existence_table.insert(part, vec![0.1_f32, 0.3, 0.3, 0.5]); @@ -1551,7 +1551,7 @@ mod tests { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank: 3, rank_dist_table, @@ -1559,7 +1559,7 @@ mod tests { ion_err_dist_table, noise_err_dist_table, ion_existence_table, - partition_ion_types_cache: HashMap::new(), + partition_ion_types_cache: FxHashMap::default(), }; param.rebuild_cache(); @@ -1715,12 +1715,12 @@ mod precursor_filter_tests { use crate::param_model::{Param, PrecursorOffsetFrequency, SpecDataType}; use model::protocol::Protocol; use model::tolerance::Tolerance; - use std::collections::HashMap; + use rustc_hash::FxHashMap; /// Build a Param with a single precursor offset entry: charge 2, /// reduced_charge 2, offset 0.0 Da (the precursor itself), tolerance 0.5 Da. fn param_with_precursor_filter() -> Param { - let mut precursor_off_map: HashMap> = HashMap::new(); + let mut precursor_off_map: FxHashMap> = FxHashMap::default(); precursor_off_map.insert( 2, vec![PrecursorOffsetFrequency { @@ -1749,14 +1749,14 @@ mod precursor_filter_tests { partitions: vec![], num_precursor_off: 1, precursor_off_map, - frag_off_table: HashMap::new(), + frag_off_table: FxHashMap::default(), max_rank: 3, - rank_dist_table: HashMap::new(), + rank_dist_table: FxHashMap::default(), error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), } } @@ -1784,7 +1784,7 @@ mod precursor_filter_tests { /// Let's use reduced_charge=0 for the precursor filter test: /// c = 2 - 0 = 2; filter_mz = (neutral + 2*PROTON) / 2 + 0 = precursor_mz. fn param_with_precursor_filter_rc0() -> Param { - let mut precursor_off_map: HashMap> = HashMap::new(); + let mut precursor_off_map: FxHashMap> = FxHashMap::default(); precursor_off_map.insert( 2, vec![PrecursorOffsetFrequency { @@ -1813,14 +1813,14 @@ mod precursor_filter_tests { partitions: vec![], num_precursor_off: 1, precursor_off_map, - frag_off_table: HashMap::new(), + frag_off_table: FxHashMap::default(), max_rank: 3, - rank_dist_table: HashMap::new(), + rank_dist_table: FxHashMap::default(), error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), } } diff --git a/crates/scoring/src/testutil.rs b/crates/scoring/src/testutil.rs index fa988285..eadb1409 100644 --- a/crates/scoring/src/testutil.rs +++ b/crates/scoring/src/testutil.rs @@ -2,7 +2,7 @@ //! //! `cfg(test)` only — does not appear in release builds. -use std::collections::HashMap; +use rustc_hash::FxHashMap; use model::activation::ActivationMethod; use model::instrument::InstrumentType; @@ -33,14 +33,14 @@ pub fn tiny_param() -> Param { let ion_freqs = vec![0.6_f32, 0.3, 0.05, 0.001]; let noise_freqs = vec![0.1_f32, 0.2, 0.3, 0.4]; - let mut ion_table_inner: HashMap> = HashMap::new(); + let mut ion_table_inner: FxHashMap> = FxHashMap::default(); ion_table_inner.insert(prefix_ion, ion_freqs); ion_table_inner.insert(noise_ion, noise_freqs); - let mut rank_dist_table: HashMap>> = HashMap::new(); + let mut rank_dist_table: FxHashMap>> = FxHashMap::default(); rank_dist_table.insert(part, ion_table_inner); - let mut frag_off_table = HashMap::new(); + let mut frag_off_table = FxHashMap::default(); frag_off_table.insert(part, vec![]); let mut p = Param { @@ -60,15 +60,15 @@ pub fn tiny_param() -> Param { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank, rank_dist_table, error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; p.rebuild_cache(); p @@ -94,15 +94,15 @@ pub fn tiny_param_with_ions() -> Param { let ion_freqs = vec![0.6_f32, 0.3, 0.05, 0.001]; let noise_freqs = vec![0.1_f32, 0.2, 0.3, 0.4]; - let mut ion_table: HashMap> = HashMap::new(); + let mut ion_table: FxHashMap> = FxHashMap::default(); ion_table.insert(prefix1, ion_freqs); ion_table.insert(noise, noise_freqs); - let mut rank_dist_table: HashMap>> = HashMap::new(); + let mut rank_dist_table: FxHashMap>> = FxHashMap::default(); rank_dist_table.insert(part, ion_table); // frag_off_table: one prefix ion entry so ion_types_for_segment returns it. - let mut frag_off_table = HashMap::new(); + let mut frag_off_table = FxHashMap::default(); frag_off_table.insert(part, vec![FragmentOffsetFrequency { ion_type: prefix1, frequency: 0.7, @@ -125,15 +125,15 @@ pub fn tiny_param_with_ions() -> Param { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank: 3, rank_dist_table, error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; p.rebuild_cache(); p diff --git a/crates/scoring/tests/gf_graph_dp.rs b/crates/scoring/tests/gf_graph_dp.rs index e58de394..51dbff7b 100644 --- a/crates/scoring/tests/gf_graph_dp.rs +++ b/crates/scoring/tests/gf_graph_dp.rs @@ -9,7 +9,7 @@ //! integration tests. If the crate-internal version changes, this copy must be //! kept in sync. -use std::collections::HashMap; +use rustc_hash::FxHashMap; use model::{AminoAcidSetBuilder, Enzyme, Spectrum, Tolerance}; use scoring::{Param, RankScorer, ScoredSpectrum}; @@ -30,14 +30,14 @@ fn tiny_param() -> Param { let prefix1 = IonType::Prefix { charge: 1, offset_bits: 0.0_f32.to_bits() }; let noise = IonType::Noise; - let mut ion_table: HashMap> = HashMap::new(); + let mut ion_table: FxHashMap> = FxHashMap::default(); ion_table.insert(prefix1, vec![0.6_f32, 0.3, 0.05, 0.001]); ion_table.insert(noise, vec![0.1_f32, 0.2, 0.3, 0.4]); - let mut rank_dist_table: HashMap>> = HashMap::new(); + let mut rank_dist_table: FxHashMap>> = FxHashMap::default(); rank_dist_table.insert(part, ion_table); - let mut frag_off_table = HashMap::new(); + let mut frag_off_table = FxHashMap::default(); frag_off_table.insert(part, vec![FragmentOffsetFrequency { ion_type: prefix1, frequency: 0.7, @@ -60,15 +60,15 @@ fn tiny_param() -> Param { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank: 3, rank_dist_table, error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; p.rebuild_cache(); p diff --git a/crates/scoring/tests/primitive_graph_arena_parity.rs b/crates/scoring/tests/primitive_graph_arena_parity.rs index 4a461714..5a0b73c2 100644 --- a/crates/scoring/tests/primitive_graph_arena_parity.rs +++ b/crates/scoring/tests/primitive_graph_arena_parity.rs @@ -5,7 +5,7 @@ //! thread-local arena pool for `PrimitiveAaGraph::new`'s 11 per-call Vec //! allocations. Bit-identical output required. -use std::collections::HashMap; +use rustc_hash::FxHashMap; use model::{AminoAcidSetBuilder, Spectrum, Tolerance}; use model::activation::ActivationMethod; @@ -23,14 +23,14 @@ fn tiny_param() -> Param { let prefix1 = IonType::Prefix { charge: 1, offset_bits: 0.0_f32.to_bits() }; let noise = IonType::Noise; - let mut ion_table: HashMap> = HashMap::new(); + let mut ion_table: FxHashMap> = FxHashMap::default(); ion_table.insert(prefix1, vec![0.6_f32, 0.3, 0.05, 0.001]); ion_table.insert(noise, vec![0.1_f32, 0.2, 0.3, 0.4]); - let mut rank_dist_table: HashMap>> = HashMap::new(); + let mut rank_dist_table: FxHashMap>> = FxHashMap::default(); rank_dist_table.insert(part, ion_table); - let mut frag_off_table = HashMap::new(); + let mut frag_off_table = FxHashMap::default(); frag_off_table.insert(part, vec![FragmentOffsetFrequency { ion_type: prefix1, frequency: 0.7, @@ -53,15 +53,15 @@ fn tiny_param() -> Param { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank: 3, rank_dist_table, error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; p.rebuild_cache(); p diff --git a/crates/search/src/match_engine.rs b/crates/search/src/match_engine.rs index f8287cb1..7a9fd0eb 100644 --- a/crates/search/src/match_engine.rs +++ b/crates/search/src/match_engine.rs @@ -1164,7 +1164,7 @@ mod feature_tests { use model::instrument::InstrumentType; use model::protocol::Protocol; use model::tolerance::Tolerance; - use std::collections::HashMap; + use rustc_hash::FxHashMap; /// Minimal RankScorer for feature tests, with mme = Da(tol_da). /// @@ -1181,13 +1181,13 @@ mod feature_tests { let prefix1 = IonType::Prefix { charge: 1, offset_bits: (PROTON as f32).to_bits() }; let suffix1 = IonType::Suffix { charge: 1, offset_bits: ((H2O + PROTON) as f32).to_bits() }; let noise = IonType::Noise; - let mut ion_table = HashMap::new(); + let mut ion_table = FxHashMap::default(); ion_table.insert(prefix1, vec![0.6_f32, 0.3, 0.05, 0.001]); ion_table.insert(suffix1, vec![0.6_f32, 0.3, 0.05, 0.001]); ion_table.insert(noise, vec![0.1_f32, 0.2, 0.3, 0.4]); - let mut rank_dist_table = HashMap::new(); + let mut rank_dist_table = FxHashMap::default(); rank_dist_table.insert(part, ion_table); - let mut frag_off_table = HashMap::new(); + let mut frag_off_table = FxHashMap::default(); frag_off_table.insert(part, vec![ FragmentOffsetFrequency { ion_type: prefix1, frequency: 0.7 }, FragmentOffsetFrequency { ion_type: suffix1, frequency: 0.7 }, @@ -1209,15 +1209,15 @@ mod feature_tests { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank: 3, rank_dist_table, error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; param.rebuild_cache(); RankScorer::new(¶m) diff --git a/crates/search/tests/mass_calibrator_integration.rs b/crates/search/tests/mass_calibrator_integration.rs index b714b727..338cc052 100644 --- a/crates/search/tests/mass_calibrator_integration.rs +++ b/crates/search/tests/mass_calibrator_integration.rs @@ -9,6 +9,7 @@ //! harness's responsibility. use std::collections::HashMap; +use rustc_hash::FxHashMap; use model::{AminoAcidSetBuilder, Protein, ProteinDb, Spectrum}; use scoring_crate::param_model::{IonType, Partition, SpecDataType}; @@ -32,15 +33,15 @@ fn tiny_scorer() -> RankScorer { let suffix1 = IonType::Suffix { charge: 1, offset_bits: 0.0_f32.to_bits() }; let noise = IonType::Noise; - let mut ion_table = HashMap::new(); + let mut ion_table = FxHashMap::default(); ion_table.insert(prefix1, vec![0.5_f32, 0.1, 0.05, 0.01]); ion_table.insert(suffix1, vec![0.5_f32, 0.1, 0.05, 0.01]); ion_table.insert(noise, vec![0.1_f32, 0.05, 0.02, 0.01]); - let mut rank_dist_table = HashMap::new(); + let mut rank_dist_table = FxHashMap::default(); rank_dist_table.insert(part, ion_table); - let mut frag_off_table = HashMap::new(); + let mut frag_off_table = FxHashMap::default(); frag_off_table.insert(part, vec![]); let mut param = Param { @@ -60,15 +61,15 @@ fn tiny_scorer() -> RankScorer { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank: 3, rank_dist_table, error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; param.rebuild_cache(); RankScorer::new(¶m) diff --git a/crates/search/tests/match_engine_smoke.rs b/crates/search/tests/match_engine_smoke.rs index f60a18cf..5687399e 100644 --- a/crates/search/tests/match_engine_smoke.rs +++ b/crates/search/tests/match_engine_smoke.rs @@ -1,6 +1,6 @@ //! match_engine smoke tests. -use std::collections::HashMap; +use rustc_hash::FxHashMap; use model::{AminoAcid, AminoAcidSetBuilder, Peptide, Protein, ProteinDb, Spectrum, PROTON, Tolerance}; use scoring_crate::{Param, RankScorer}; @@ -30,15 +30,15 @@ fn tiny_scorer() -> RankScorer { let suffix1 = IonType::Suffix { charge: 1, offset_bits: 0.0_f32.to_bits() }; let noise = IonType::Noise; - let mut ion_table = HashMap::new(); + let mut ion_table = FxHashMap::default(); ion_table.insert(prefix1, vec![0.5_f32, 0.1, 0.05, 0.01]); ion_table.insert(suffix1, vec![0.5_f32, 0.1, 0.05, 0.01]); ion_table.insert(noise, vec![0.05_f32, 0.05, 0.05, 0.05]); - let mut rank_dist_table = HashMap::new(); + let mut rank_dist_table = FxHashMap::default(); rank_dist_table.insert(part, ion_table); - let mut frag_off_table = HashMap::new(); + let mut frag_off_table = FxHashMap::default(); frag_off_table.insert(part, vec![]); let mut param = Param { @@ -58,15 +58,15 @@ fn tiny_scorer() -> RankScorer { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank: 3, rank_dist_table, error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; param.rebuild_cache(); RankScorer::new(¶m) diff --git a/crates/search/tests/match_engine_specevalue.rs b/crates/search/tests/match_engine_specevalue.rs index 81e0dd1a..e1fa043b 100644 --- a/crates/search/tests/match_engine_specevalue.rs +++ b/crates/search/tests/match_engine_specevalue.rs @@ -5,7 +5,7 @@ //! 2. For a well-matched spectrum, the top PSM has spec_e_value < 1.0. //! 3. The TopNQueue ordering reflects spec_e_value (best first in sorted_vec). -use std::collections::HashMap; +use rustc_hash::FxHashMap; use model::{AminoAcid, AminoAcidSetBuilder, Peptide, Protein, ProteinDb, Spectrum, PROTON, Tolerance}; use scoring_crate::{Param, RankScorer}; @@ -36,15 +36,15 @@ fn tiny_scorer() -> RankScorer { let suffix1 = IonType::Suffix { charge: 1, offset_bits: 0.0_f32.to_bits() }; let noise = IonType::Noise; - let mut ion_table = HashMap::new(); + let mut ion_table = FxHashMap::default(); ion_table.insert(prefix1, vec![0.5_f32, 0.1, 0.05, 0.01]); ion_table.insert(suffix1, vec![0.5_f32, 0.1, 0.05, 0.01]); ion_table.insert(noise, vec![0.05_f32, 0.05, 0.05, 0.05]); - let mut rank_dist_table = HashMap::new(); + let mut rank_dist_table = FxHashMap::default(); rank_dist_table.insert(part, ion_table); - let mut frag_off_table = HashMap::new(); + let mut frag_off_table = FxHashMap::default(); frag_off_table.insert(part, vec![]); let mut param = Param { @@ -64,15 +64,15 @@ fn tiny_scorer() -> RankScorer { num_segments: 1, partitions: vec![part], num_precursor_off: 0, - precursor_off_map: HashMap::new(), + precursor_off_map: FxHashMap::default(), frag_off_table, max_rank: 3, rank_dist_table, error_scaling_factor: 0, - ion_err_dist_table: HashMap::new(), - noise_err_dist_table: HashMap::new(), - ion_existence_table: HashMap::new(), - partition_ion_types_cache: HashMap::new(), + ion_err_dist_table: FxHashMap::default(), + noise_err_dist_table: FxHashMap::default(), + ion_existence_table: FxHashMap::default(), + partition_ion_types_cache: FxHashMap::default(), }; param.rebuild_cache(); RankScorer::new(¶m) From 67002e42db1603b7522b3191735aff59343ebd65 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 18:28:46 +0100 Subject: [PATCH 12/23] feat(s2): MassCalibrator threshold fallback 1e-6 -> 1e-5 If the strict SpecEValue threshold (1e-6) does not qualify MIN_CONFIDENT_PSMS (200) in the cal pre-pass, retry once at 1e-5 before giving up. Preserves Java parity on datasets where 1e-6 already succeeds (Astral, TMT); recovers LFQ-shaped distributions where Rust's SpecE-tail drift leaves the cal pre-pass a few PSMs short (LFQ ships at 193/200 in PR-A). Median-of-residuals + MAD-based robust sigma are robust to one decade of noisier outliers; the fallback is a one-shot retry, not a baseline threshold change. Bench gate (S2): LFQ auto @1% FDR >= 14,805 (baseline 14,755 + 50). --- crates/search/src/mass_calibrator.rs | 29 ++++++++++++++++++- crates/search/src/precursor_cal.rs | 5 ++++ .../tests/mass_calibrator_integration.rs | 15 ++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/crates/search/src/mass_calibrator.rs b/crates/search/src/mass_calibrator.rs index a227cf73..147731d8 100644 --- a/crates/search/src/mass_calibrator.rs +++ b/crates/search/src/mass_calibrator.rs @@ -151,8 +151,34 @@ pub fn learn_calibration_stats( &prepared.candidates, MIN_DE_NOVO_SCORE, constants::MIN_CONFIDENT_PSMS, + constants::MAX_SPEC_EVALUE, ); + // Fallback: if the strict 1e-6 threshold did not qualify enough + // confident PSMs, retry once at 1e-5. Preserves Java parity on + // datasets where 1e-6 succeeds; recovers LFQ-shaped distributions + // where Rust's SpecE-tail drift leaves the cal pre-pass a handful + // of PSMs short. The median-residual + MAD-based sigma are robust + // to one decade of noisier outliers. + let (residuals, filter) = if residuals.len() < constants::MIN_CONFIDENT_PSMS { + let (retry_residuals, retry_filter) = extract_residuals( + &sampled, + &queues, + originals, + &prepared.candidates, + MIN_DE_NOVO_SCORE, + constants::MIN_CONFIDENT_PSMS, + constants::MAX_SPEC_EVALUE_FALLBACK, + ); + if retry_residuals.len() >= constants::MIN_CONFIDENT_PSMS { + (retry_residuals, retry_filter) + } else { + (residuals, filter) + } + } else { + (residuals, filter) + }; + if residuals.len() < constants::MIN_CONFIDENT_PSMS { return CalibrationStats { rejected_spec_e: filter.rejected_spec_e, @@ -233,6 +259,7 @@ fn extract_residuals( candidates: &[crate::candidate_gen::Candidate], min_de_novo_score: i32, keep_top_n: usize, + max_spec_evalue: f64, ) -> (Vec, CalFilterCounts) { let mut residual_with_eval: Vec<(f64, f64)> = Vec::new(); let mut filter = CalFilterCounts::default(); @@ -265,7 +292,7 @@ fn extract_residuals( } for (&spectrum_idx, psm) in best_by_spec.iter() { - if psm.spec_e_value > constants::MAX_SPEC_EVALUE { + if psm.spec_e_value > max_spec_evalue { filter.rejected_spec_e += 1; continue; } diff --git a/crates/search/src/precursor_cal.rs b/crates/search/src/precursor_cal.rs index 755e7659..58742a06 100644 --- a/crates/search/src/precursor_cal.rs +++ b/crates/search/src/precursor_cal.rs @@ -78,6 +78,11 @@ pub mod constants { pub const MAX_SAMPLED: usize = 500; pub const MIN_CONFIDENT_PSMS: usize = 200; pub const MAX_SPEC_EVALUE: f64 = 1e-6; + /// Fallback SpecEValue threshold used when the strict 1e-6 cutoff + /// fails to qualify `MIN_CONFIDENT_PSMS` (e.g. LFQ datasets with + /// shifted SpecE-tail distribution). Java parity is preserved on + /// the primary pass; the fallback is a one-shot retry. + pub const MAX_SPEC_EVALUE_FALLBACK: f64 = 1e-5; pub const MIN_SPECKEYS_FOR_PREPASS: usize = 10_000; /// Java `DEFAULT_TIGHTENED_WINDOW_*` — post-cal main-pass tolerance tightening. diff --git a/crates/search/tests/mass_calibrator_integration.rs b/crates/search/tests/mass_calibrator_integration.rs index 338cc052..81dbe593 100644 --- a/crates/search/tests/mass_calibrator_integration.rs +++ b/crates/search/tests/mass_calibrator_integration.rs @@ -168,3 +168,18 @@ fn build_spec_keys_skips_below_min_peaks_and_expands_missing_charge() { assert!(keys.iter().any(|k| k.spectrum_idx == 1 && k.charge == 3)); assert!(!keys.iter().any(|k| k.spectrum_idx == 2)); } + +#[test] +fn fallback_threshold_constant_is_strictly_relaxed() { + use search::precursor_cal::constants; + let primary: f64 = constants::MAX_SPEC_EVALUE; + let fallback: f64 = constants::MAX_SPEC_EVALUE_FALLBACK; + assert!( + fallback > primary, + "fallback threshold ({fallback}) must be more permissive than primary ({primary})", + ); + assert!( + fallback <= 1e-3, + "fallback should not be excessively permissive; got {fallback}", + ); +} From 09824bdef36a770b861662cbba5324bf487548c8 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 26 May 2026 19:04:55 +0100 Subject: [PATCH 13/23] Revert "feat(s2): MassCalibrator threshold fallback 1e-6 -> 1e-5" This reverts commit 67002e42db1603b7522b3191735aff59343ebd65. --- crates/search/src/mass_calibrator.rs | 29 +------------------ crates/search/src/precursor_cal.rs | 5 ---- .../tests/mass_calibrator_integration.rs | 15 ---------- 3 files changed, 1 insertion(+), 48 deletions(-) diff --git a/crates/search/src/mass_calibrator.rs b/crates/search/src/mass_calibrator.rs index 147731d8..a227cf73 100644 --- a/crates/search/src/mass_calibrator.rs +++ b/crates/search/src/mass_calibrator.rs @@ -151,34 +151,8 @@ pub fn learn_calibration_stats( &prepared.candidates, MIN_DE_NOVO_SCORE, constants::MIN_CONFIDENT_PSMS, - constants::MAX_SPEC_EVALUE, ); - // Fallback: if the strict 1e-6 threshold did not qualify enough - // confident PSMs, retry once at 1e-5. Preserves Java parity on - // datasets where 1e-6 succeeds; recovers LFQ-shaped distributions - // where Rust's SpecE-tail drift leaves the cal pre-pass a handful - // of PSMs short. The median-residual + MAD-based sigma are robust - // to one decade of noisier outliers. - let (residuals, filter) = if residuals.len() < constants::MIN_CONFIDENT_PSMS { - let (retry_residuals, retry_filter) = extract_residuals( - &sampled, - &queues, - originals, - &prepared.candidates, - MIN_DE_NOVO_SCORE, - constants::MIN_CONFIDENT_PSMS, - constants::MAX_SPEC_EVALUE_FALLBACK, - ); - if retry_residuals.len() >= constants::MIN_CONFIDENT_PSMS { - (retry_residuals, retry_filter) - } else { - (residuals, filter) - } - } else { - (residuals, filter) - }; - if residuals.len() < constants::MIN_CONFIDENT_PSMS { return CalibrationStats { rejected_spec_e: filter.rejected_spec_e, @@ -259,7 +233,6 @@ fn extract_residuals( candidates: &[crate::candidate_gen::Candidate], min_de_novo_score: i32, keep_top_n: usize, - max_spec_evalue: f64, ) -> (Vec, CalFilterCounts) { let mut residual_with_eval: Vec<(f64, f64)> = Vec::new(); let mut filter = CalFilterCounts::default(); @@ -292,7 +265,7 @@ fn extract_residuals( } for (&spectrum_idx, psm) in best_by_spec.iter() { - if psm.spec_e_value > max_spec_evalue { + if psm.spec_e_value > constants::MAX_SPEC_EVALUE { filter.rejected_spec_e += 1; continue; } diff --git a/crates/search/src/precursor_cal.rs b/crates/search/src/precursor_cal.rs index 58742a06..755e7659 100644 --- a/crates/search/src/precursor_cal.rs +++ b/crates/search/src/precursor_cal.rs @@ -78,11 +78,6 @@ pub mod constants { pub const MAX_SAMPLED: usize = 500; pub const MIN_CONFIDENT_PSMS: usize = 200; pub const MAX_SPEC_EVALUE: f64 = 1e-6; - /// Fallback SpecEValue threshold used when the strict 1e-6 cutoff - /// fails to qualify `MIN_CONFIDENT_PSMS` (e.g. LFQ datasets with - /// shifted SpecE-tail distribution). Java parity is preserved on - /// the primary pass; the fallback is a one-shot retry. - pub const MAX_SPEC_EVALUE_FALLBACK: f64 = 1e-5; pub const MIN_SPECKEYS_FOR_PREPASS: usize = 10_000; /// Java `DEFAULT_TIGHTENED_WINDOW_*` — post-cal main-pass tolerance tightening. diff --git a/crates/search/tests/mass_calibrator_integration.rs b/crates/search/tests/mass_calibrator_integration.rs index 81dbe593..338cc052 100644 --- a/crates/search/tests/mass_calibrator_integration.rs +++ b/crates/search/tests/mass_calibrator_integration.rs @@ -168,18 +168,3 @@ fn build_spec_keys_skips_below_min_peaks_and_expands_missing_charge() { assert!(keys.iter().any(|k| k.spectrum_idx == 1 && k.charge == 3)); assert!(!keys.iter().any(|k| k.spectrum_idx == 2)); } - -#[test] -fn fallback_threshold_constant_is_strictly_relaxed() { - use search::precursor_cal::constants; - let primary: f64 = constants::MAX_SPEC_EVALUE; - let fallback: f64 = constants::MAX_SPEC_EVALUE_FALLBACK; - assert!( - fallback > primary, - "fallback threshold ({fallback}) must be more permissive than primary ({primary})", - ); - assert!( - fallback <= 1e-3, - "fallback should not be excessively permissive; got {fallback}", - ); -} From 9a6607a18e496c61a73d7fc439adbc8eb9e77494 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 08:59:26 +0100 Subject: [PATCH 14/23] perf(scoring): branchless f32/f64 rounding + GF DP inner-loop tightening Target 1 (roundf elimination): the perf flamegraph (Astral cal=off, post-PR-V1 FxHashMap) shows 5.15% CPU in libc roundf called from f32::round() / f64::round() in scoring hot paths. Replace with the branchless `(x + 0.5.copysign(x)) as i32` idiom. Mathematically equivalent for finite non-edge values: - x > 0: (x + 0.5) truncated == round_half_away_from_zero - x < 0: (x - 0.5) truncated == round_half_away_from_zero - x = 0: 0 + 0 = 0 in both forms Skipped: match_engine.rs:256,257,679,680 (`(tol - 0.4999).round()`) which is a window-widening adjustment with a different semantic, not the round-to-nearest idiom. Target 2 (GF DP inner-loop): compute_inner is already tightly written; added a comment documenting why no structural change was made and noting the next opportunities (prev_idx caching alongside valid_edges, SIMD widening of the inner multiply loop). No functional behavior change; bit-identical PIN regression gate green. --- crates/scoring/src/gf/generating_function.rs | 4 ++++ crates/scoring/src/gf/primitive_graph.rs | 5 ++++- crates/scoring/src/scoring/rank_scorer.rs | 3 ++- crates/scoring/src/scoring/scored_spectrum.rs | 11 ++++++++--- crates/search/src/match_engine.rs | 8 ++++---- 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/crates/scoring/src/gf/generating_function.rs b/crates/scoring/src/gf/generating_function.rs index 7148294d..4ee0348b 100644 --- a/crates/scoring/src/gf/generating_function.rs +++ b/crates/scoring/src/gf/generating_function.rs @@ -564,6 +564,10 @@ fn compute_inner( } } + // compute_inner already tightly written; further perf needs algorithmic changes + // outside this iteration (e.g. caching prev_idx alongside valid_edges to avoid + // the second node_index_for_mass call, or SIMD-widening the inner multiply loop). + // Underflow guard at max_score - 1. // Read-then-write on the same slice; `cur_slice` is already &mut. let guard_idx = (cur_max_score - 1 - cur_min_score) as usize; diff --git a/crates/scoring/src/gf/primitive_graph.rs b/crates/scoring/src/gf/primitive_graph.rs index 0fb95e9c..94fe2238 100644 --- a/crates/scoring/src/gf/primitive_graph.rs +++ b/crates/scoring/src/gf/primitive_graph.rs @@ -830,7 +830,10 @@ fn compute_edge_error_scores( let delta = cur_obs.unwrap() - prev_obs.unwrap() - edge_mass_scratch[e]; s += scorer.error_score(part, delta as f32); } - let mut error_score = s.round() as i32; + // Branchless `f32::round() as i32` equivalent: avoids libc `roundf` call. + // Adds +0.5 for positive, -0.5 for negative, then truncates toward zero. + // Matches `round()`'s "round half away from zero" semantics for finite values. + let mut error_score = (s + 0.5_f32.copysign(s)) as i32; if !(-100..=100).contains(&error_score) { clamp_count += 1; error_score = -4; diff --git a/crates/scoring/src/scoring/rank_scorer.rs b/crates/scoring/src/scoring/rank_scorer.rs index dcc56289..3b0bdf76 100644 --- a/crates/scoring/src/scoring/rank_scorer.rs +++ b/crates/scoring/src/scoring/rank_scorer.rs @@ -207,7 +207,8 @@ impl RankScorer { if esf == 0 { return 0.0; } - let mut err_index = (error * esf as f32).round() as i32; + let err_val = error * esf as f32; + let mut err_index = (err_val + 0.5_f32.copysign(err_val)) as i32; if err_index > esf { err_index = esf; } else if err_index < -esf { err_index = -esf; } err_index += esf; diff --git a/crates/scoring/src/scoring/scored_spectrum.rs b/crates/scoring/src/scoring/scored_spectrum.rs index ca7ffc49..a2a7cc39 100644 --- a/crates/scoring/src/scoring/scored_spectrum.rs +++ b/crates/scoring/src/scoring/scored_spectrum.rs @@ -477,7 +477,11 @@ impl<'a> ScoredSpectrum<'a> { } let pref = *self.prefix_score_cache.get(prefix_nominal as usize)?; let suff = *self.suffix_score_cache.get(suffix_nominal as usize)?; - Some((pref + suff).round() as i32) + let v = pref + suff; + // Branchless `f32::round() as i32` equivalent: avoids libc `roundf` call. + // Adds +0.5 for positive, -0.5 for negative, then truncates toward zero. + // Matches `round()`'s "round half away from zero" semantics for finite values. + Some((v + 0.5_f32.copysign(v)) as i32) } /// Trace-only accessor: raw `prefix_score_cache[prefix_nominal]` if in @@ -650,7 +654,8 @@ impl<'a> ScoredSpectrum<'a> { let suff = self.directional_node_score( suffix_nominal, false, scorer, charge, parent_mass, fragment_tolerance_da, ); - (pref + suff).round() as i32 + let v = pref + suff; + (v + 0.5_f32.copysign(v)) as i32 } /// Score for a single directional (prefix or suffix) node at `nominal_mass`. @@ -890,7 +895,7 @@ impl<'a> ScoredSpectrum<'a> { s += scorer.error_score(part, delta as f32); } - s.round() as i32 + (s + 0.5_f32.copysign(s)) as i32 } } diff --git a/crates/search/src/match_engine.rs b/crates/search/src/match_engine.rs index 7a9fd0eb..8a226b9d 100644 --- a/crates/search/src/match_engine.rs +++ b/crates/search/src/match_engine.rs @@ -699,7 +699,7 @@ fn compute_spec_e_values_for_spectrum( // (TOLERANCE_LOG10 1.0 → 1.3 in iter30). let min_score = queue .iter_psms() - .map(|p| p.rank_score.round() as i32) + .map(|p| { let v = p.rank_score; (v + 0.5_f32.copysign(v)) as i32 }) .min() .unwrap_or(i32::MIN); @@ -801,7 +801,7 @@ fn compute_spec_e_values_for_spectrum( if psm_nominal_mass < min_peptide_mass_idx || psm_nominal_mass > max_peptide_mass_idx { return 1.0; } - let score_int = psm.rank_score.round() as i32; + let score_int = { let v = psm.rank_score; (v + 0.5_f32.copysign(v)) as i32 }; if score_int >= max_score { // Score exceeds GF range — return the probability at max_score - 1 // (which already has the underflow guard applied by the GF DP). @@ -1418,7 +1418,7 @@ pub(crate) fn dedup_pepseq_score( .clone(); let key = DedupMapKey { pep: pep_key, - score: psm.rank_score.round() as i32, + score: { let v = psm.rank_score; (v + 0.5_f32.copysign(v)) as i32 }, }; match groups.entry(key) { @@ -1461,7 +1461,7 @@ impl PepDedupKey { mod_units.push( aa.mod_ .as_ref() - .map(|m| (m.mass_delta * 100_000.0).round() as i32) + .map(|m| { let v = m.mass_delta * 100_000.0; (v + 0.5_f64.copysign(v)) as i32 }) .unwrap_or(0), ); } From d6a869de9fc2a5738e5e70a2a9e525393f8be847 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 09:55:45 +0100 Subject: [PATCH 15/23] Revert "perf(scoring): branchless f32/f64 rounding + GF DP inner-loop tightening" This reverts commit 9a6607a18e496c61a73d7fc439adbc8eb9e77494. --- crates/scoring/src/gf/generating_function.rs | 4 ---- crates/scoring/src/gf/primitive_graph.rs | 5 +---- crates/scoring/src/scoring/rank_scorer.rs | 3 +-- crates/scoring/src/scoring/scored_spectrum.rs | 11 +++-------- crates/search/src/match_engine.rs | 8 ++++---- 5 files changed, 9 insertions(+), 22 deletions(-) diff --git a/crates/scoring/src/gf/generating_function.rs b/crates/scoring/src/gf/generating_function.rs index 4ee0348b..7148294d 100644 --- a/crates/scoring/src/gf/generating_function.rs +++ b/crates/scoring/src/gf/generating_function.rs @@ -564,10 +564,6 @@ fn compute_inner( } } - // compute_inner already tightly written; further perf needs algorithmic changes - // outside this iteration (e.g. caching prev_idx alongside valid_edges to avoid - // the second node_index_for_mass call, or SIMD-widening the inner multiply loop). - // Underflow guard at max_score - 1. // Read-then-write on the same slice; `cur_slice` is already &mut. let guard_idx = (cur_max_score - 1 - cur_min_score) as usize; diff --git a/crates/scoring/src/gf/primitive_graph.rs b/crates/scoring/src/gf/primitive_graph.rs index 94fe2238..0fb95e9c 100644 --- a/crates/scoring/src/gf/primitive_graph.rs +++ b/crates/scoring/src/gf/primitive_graph.rs @@ -830,10 +830,7 @@ fn compute_edge_error_scores( let delta = cur_obs.unwrap() - prev_obs.unwrap() - edge_mass_scratch[e]; s += scorer.error_score(part, delta as f32); } - // Branchless `f32::round() as i32` equivalent: avoids libc `roundf` call. - // Adds +0.5 for positive, -0.5 for negative, then truncates toward zero. - // Matches `round()`'s "round half away from zero" semantics for finite values. - let mut error_score = (s + 0.5_f32.copysign(s)) as i32; + let mut error_score = s.round() as i32; if !(-100..=100).contains(&error_score) { clamp_count += 1; error_score = -4; diff --git a/crates/scoring/src/scoring/rank_scorer.rs b/crates/scoring/src/scoring/rank_scorer.rs index 3b0bdf76..dcc56289 100644 --- a/crates/scoring/src/scoring/rank_scorer.rs +++ b/crates/scoring/src/scoring/rank_scorer.rs @@ -207,8 +207,7 @@ impl RankScorer { if esf == 0 { return 0.0; } - let err_val = error * esf as f32; - let mut err_index = (err_val + 0.5_f32.copysign(err_val)) as i32; + let mut err_index = (error * esf as f32).round() as i32; if err_index > esf { err_index = esf; } else if err_index < -esf { err_index = -esf; } err_index += esf; diff --git a/crates/scoring/src/scoring/scored_spectrum.rs b/crates/scoring/src/scoring/scored_spectrum.rs index a2a7cc39..ca7ffc49 100644 --- a/crates/scoring/src/scoring/scored_spectrum.rs +++ b/crates/scoring/src/scoring/scored_spectrum.rs @@ -477,11 +477,7 @@ impl<'a> ScoredSpectrum<'a> { } let pref = *self.prefix_score_cache.get(prefix_nominal as usize)?; let suff = *self.suffix_score_cache.get(suffix_nominal as usize)?; - let v = pref + suff; - // Branchless `f32::round() as i32` equivalent: avoids libc `roundf` call. - // Adds +0.5 for positive, -0.5 for negative, then truncates toward zero. - // Matches `round()`'s "round half away from zero" semantics for finite values. - Some((v + 0.5_f32.copysign(v)) as i32) + Some((pref + suff).round() as i32) } /// Trace-only accessor: raw `prefix_score_cache[prefix_nominal]` if in @@ -654,8 +650,7 @@ impl<'a> ScoredSpectrum<'a> { let suff = self.directional_node_score( suffix_nominal, false, scorer, charge, parent_mass, fragment_tolerance_da, ); - let v = pref + suff; - (v + 0.5_f32.copysign(v)) as i32 + (pref + suff).round() as i32 } /// Score for a single directional (prefix or suffix) node at `nominal_mass`. @@ -895,7 +890,7 @@ impl<'a> ScoredSpectrum<'a> { s += scorer.error_score(part, delta as f32); } - (s + 0.5_f32.copysign(s)) as i32 + s.round() as i32 } } diff --git a/crates/search/src/match_engine.rs b/crates/search/src/match_engine.rs index 8a226b9d..7a9fd0eb 100644 --- a/crates/search/src/match_engine.rs +++ b/crates/search/src/match_engine.rs @@ -699,7 +699,7 @@ fn compute_spec_e_values_for_spectrum( // (TOLERANCE_LOG10 1.0 → 1.3 in iter30). let min_score = queue .iter_psms() - .map(|p| { let v = p.rank_score; (v + 0.5_f32.copysign(v)) as i32 }) + .map(|p| p.rank_score.round() as i32) .min() .unwrap_or(i32::MIN); @@ -801,7 +801,7 @@ fn compute_spec_e_values_for_spectrum( if psm_nominal_mass < min_peptide_mass_idx || psm_nominal_mass > max_peptide_mass_idx { return 1.0; } - let score_int = { let v = psm.rank_score; (v + 0.5_f32.copysign(v)) as i32 }; + let score_int = psm.rank_score.round() as i32; if score_int >= max_score { // Score exceeds GF range — return the probability at max_score - 1 // (which already has the underflow guard applied by the GF DP). @@ -1418,7 +1418,7 @@ pub(crate) fn dedup_pepseq_score( .clone(); let key = DedupMapKey { pep: pep_key, - score: { let v = psm.rank_score; (v + 0.5_f32.copysign(v)) as i32 }, + score: psm.rank_score.round() as i32, }; match groups.entry(key) { @@ -1461,7 +1461,7 @@ impl PepDedupKey { mod_units.push( aa.mod_ .as_ref() - .map(|m| { let v = m.mass_delta * 100_000.0; (v + 0.5_f64.copysign(v)) as i32 }) + .map(|m| (m.mass_delta * 100_000.0).round() as i32) .unwrap_or(0), ); } From 319af81e75514020a2793c70abad5dcf53e7819b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 10:12:39 +0100 Subject: [PATCH 16/23] perf(model): swap HashMap -> FxHashMap on AminoAcidSet hot tables Re-profile on the PR-V1 binary (Astral cal=off, 245K samples) showed the FxHashMap swap in scoring::Param did NOT close the dominant hashing hotspot. 39.35% of CPU was still in the FnMut::call_mut chain, which traces to: expand_mod_combinations -> AminoAcidSet::variants_for -> HashMap<(u8, ModLocation), Vec>::get -> hashbrown find_inner with std::hash::random::RandomState (SipHash13) PR-V1 missed this map because it lives in the `model` crate, not `scoring`. The candidate enumeration calls variants_for once per peptide position per candidate; on Astral this dominates wall. This commit: - Adds rustc-hash as a model-crate dependency. - Switches AminoAcidSet::table and aa_lists_cache from HashMap to FxHashMap. Same hashbrown internals; FxHasher is a multiply-and-xor vs SipHash13's 13-round mix. No functional behavior change; PIN bit-identical regression gate green. Tests pass. Expected wall reduction on Astral cal=off: -10..-25%, depending on how much of the 39% chain comes from the SipHash vs the surrounding Arc clones and Vec extends. --- Cargo.lock | 1 + crates/model/Cargo.toml | 1 + crates/model/src/aa_set.rs | 16 +++++++++++----- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d735d59d..7a24182e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -344,6 +344,7 @@ dependencies = [ name = "model" version = "0.1.0" dependencies = [ + "rustc-hash", "tempfile", "thiserror", ] diff --git a/crates/model/Cargo.toml b/crates/model/Cargo.toml index ec839c8b..d3262816 100644 --- a/crates/model/Cargo.toml +++ b/crates/model/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] thiserror = { workspace = true } +rustc-hash = "2" [dev-dependencies] tempfile = "3.10" diff --git a/crates/model/src/aa_set.rs b/crates/model/src/aa_set.rs index cfe8209a..1a9bcebb 100644 --- a/crates/model/src/aa_set.rs +++ b/crates/model/src/aa_set.rs @@ -1,11 +1,12 @@ //! Heavyweight residue-and-modification set. Built via //! `AminoAcidSetBuilder`; queried by the candidate generator. -use std::collections::HashMap; use std::fs; use std::path::Path; use std::sync::Arc; +use rustc_hash::FxHashMap; + use crate::amino_acid::AminoAcid; use crate::enzyme::Enzyme; use crate::modification::{ModLocation, ModParseError, Modification, ResidueSpec}; @@ -16,10 +17,15 @@ const IMPLAUSIBLE_MASS_THRESHOLD: f64 = 1000.0; #[derive(Debug, Clone)] pub struct AminoAcidSet { /// (residue, location) → all variants (unmodified + modified) at that position. - table: HashMap<(u8, ModLocation), Vec>, + /// + /// Iter2 perf: switched from `HashMap` (SipHash13, RandomState) to + /// `FxHashMap` after a flamegraph on the post-PR-V1 binary showed 39% + /// of Astral CPU in `variants_for` lookups via SipHash. Same hashbrown + /// internals, faster hasher. + table: FxHashMap<(u8, ModLocation), Vec>, /// Per-location flattened AA lists, precomputed at build time. Avoids /// per-call rebuild in the GF DP hot path (PrimitiveAaGraph::new). - aa_lists_cache: HashMap>, + aa_lists_cache: FxHashMap>, has_cterm_mods: bool, min_aa_mass: f64, max_aa_mass: f64, @@ -327,7 +333,7 @@ impl AminoAcidSetBuilder { .map(Arc::new) .collect(); - let mut table: HashMap<(u8, ModLocation), Vec> = HashMap::new(); + let mut table: FxHashMap<(u8, ModLocation), Vec> = FxHashMap::default(); let locations = [ ModLocation::Anywhere, ModLocation::NTerm, ModLocation::CTerm, ModLocation::ProtNTerm, ModLocation::ProtCTerm, @@ -404,7 +410,7 @@ impl AminoAcidSetBuilder { // 5. Precompute the per-location AA lists used by `aa_list_for` and // `cached_aa_list`. Runs once at build time so the GF DP hot path // can borrow a slice. - let mut aa_lists_cache: HashMap> = HashMap::new(); + let mut aa_lists_cache: FxHashMap> = FxHashMap::default(); let anywhere_list: Vec = STANDARD_RESIDUES .iter() .flat_map(|&r| { From 096dbca916c7f6999d7cd3bb2c652e5d2d3cb612 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 11:13:01 +0100 Subject: [PATCH 17/23] perf(search): eliminate per-interior-position Vec clone in candidate enum A perf trace on the PR-V1 binary (Astral cal=off, 245K samples) showed 12.63%+ of CPU under `to_vec` and `Arc::clone` within the `expand_mod_combinations` -> `variants_for` chain. The candidate enumerator was cloning the full Anywhere-variants Vec for every interior position of every candidate peptide. For a length-L peptide, L-2 of L positions had no terminal merging to do (typically ~87% of positions on real peptides) and the clone was pure waste. Refactor: only the 1-2 terminal positions (pos 0 and pos L-1) build owned merged variants via a new `build_terminal_variants` helper. Interior positions reference the AminoAcidSet's Anywhere-variants slice directly. `expand_recursive`'s signature changes from `&[Vec]` to `&[&[AminoAcid]]`. No functional behavior change; bit-identical regression gate green. Expected wall reduction on Astral cal=off: -5..-15%, depending on how much of the 12.63% chain is the `to_vec` vs the surrounding recursion overhead. --- crates/search/src/candidate_gen.rs | 181 +++++++++++++++++------------ 1 file changed, 105 insertions(+), 76 deletions(-) diff --git a/crates/search/src/candidate_gen.rs b/crates/search/src/candidate_gen.rs index d73bee44..ef5f4c71 100644 --- a/crates/search/src/candidate_gen.rs +++ b/crates/search/src/candidate_gen.rs @@ -265,100 +265,129 @@ fn enumerate_all_spans(ctx: &EmitCtx<'_>, n: u32) -> Vec { /// merged in addition to Anywhere variants. /// - Position n-1: Protein_C_Term (if is_protein_c_term) or C_Term variants are /// merged in addition to Anywhere variants. -/// - All other positions: Anywhere only (unchanged). +/// - All other positions: Anywhere only — borrowed directly from AminoAcidSet, +/// no clone. fn expand_mod_combinations( span: &[u8], params: &SearchParams, is_protein_n_term: bool, is_protein_c_term: bool, ) -> Vec> { - use model::modification::ModLocation; - let n = span.len(); - // For each position, the list of variants at that residue. - let position_variants: Vec> = span.iter().enumerate().map(|(i, &r)| { - let anywhere_variants = params.aa_set.variants_for(r, ModLocation::Anywhere); - - // Helper: returns true if `term_variants` contains a FIXED mod variant - // for this residue. When a fixed terminal mod applies, the residue - // MUST carry it — the unmodified Anywhere variant is not a valid - // candidate. (Matches Java MS-GF+: fixed mods are mandatory.) - let has_fixed_in = |term_variants: &[AminoAcid]| -> bool { - term_variants.iter().any(|aa| { - aa.mod_.as_ref().map(|m| m.fixed).unwrap_or(false) - }) - }; - - // Collect the relevant terminal variant sets for this position. - let n_term_variants: &[AminoAcid] = if i == 0 { - let loc = if is_protein_n_term { - ModLocation::ProtNTerm - } else { - ModLocation::NTerm - }; - params.aa_set.variants_for(r, loc) - } else { - &[] - }; - let c_term_variants: &[AminoAcid] = if i == n - 1 { - let loc = if is_protein_c_term { - ModLocation::ProtCTerm - } else { - ModLocation::CTerm - }; - params.aa_set.variants_for(r, loc) - } else { - &[] - }; - - let has_fixed_n = has_fixed_in(n_term_variants); - let has_fixed_c = has_fixed_in(c_term_variants); - - // If a fixed terminal mod is mandatory at this position, the - // unmodified Anywhere variant is not a legal candidate. Drop the - // Anywhere variants in that case; otherwise include them. This - // prevents the candidate explosion that wildcard fixed N-term TMT - // would otherwise cause (every peptide would be enumerated twice - // at position 0: once unmodded, once TMT-modded). - // - // Note: Anywhere variants always include the residue's own fixed - // mods folded in (e.g. K-anywhere already carries K-TMT), so this - // rule applies only to terminal mods. - let mut variants: Vec = if has_fixed_n || has_fixed_c { - Vec::new() + + // Build owned merged-variant vecs only for the (up to two) terminal + // positions. Interior positions will borrow the Anywhere slice directly, + // eliminating the per-position `to_vec` clone that showed up in perf + // traces (~87% of positions on real tryptic peptides). + let pos0_owned: Option> = (n > 0).then(|| { + build_terminal_variants(params, span[0], 0, n, is_protein_n_term, is_protein_c_term) + }); + let pos_last_owned: Option> = (n > 1).then(|| { + build_terminal_variants(params, span[n - 1], n - 1, n, is_protein_n_term, is_protein_c_term) + }); + + // Collect per-position variant slices. Terminal positions reference the + // owned vecs above; interior positions borrow directly from AminoAcidSet. + // All borrows are valid for the duration of this function. + use model::modification::ModLocation; + let position_variants_refs: Vec<&[AminoAcid]> = span.iter().enumerate().map(|(i, &r)| { + if i == 0 { + pos0_owned.as_ref().unwrap().as_slice() + } else if i == n - 1 { + // n > 1 guaranteed here because n == 1 means i == 0 == n-1, + // which is already handled by the first branch. + pos_last_owned.as_ref().unwrap().as_slice() } else { - anywhere_variants.to_vec() - }; - - // Append all terminal variants (fixed + variable). When a fixed - // mod is present, the modded variant is the only legal one for - // that mod's residue/location slot; variable mods stack on top - // by adding additional explored variants. - for v in n_term_variants { - if !variants.contains(v) { - variants.push(v.clone()); - } - } - for v in c_term_variants { - if !variants.contains(v) { - variants.push(v.clone()); - } + // Interior position: borrow Anywhere variants — no clone. + params.aa_set.variants_for(r, ModLocation::Anywhere) } - - variants }).collect(); let mut out = Vec::new(); - let mut current = Vec::with_capacity(span.len()); + let mut current = Vec::with_capacity(n); expand_recursive( - &position_variants, 0, &mut current, 0, + &position_variants_refs, 0, &mut current, 0, params.max_variable_mods_per_peptide, &mut out, ); out } +/// Build the merged variant list for a terminal position (pos 0 or pos n-1). +/// +/// Mirrors the logic that was previously inlined in `expand_mod_combinations` +/// for all positions. Only called for the 1-2 terminal positions per span. +fn build_terminal_variants( + params: &SearchParams, + residue: u8, + pos: usize, + span_len: usize, + is_protein_n_term: bool, + is_protein_c_term: bool, +) -> Vec { + use model::modification::ModLocation; + + let anywhere_variants = params.aa_set.variants_for(residue, ModLocation::Anywhere); + + // Helper: returns true if `term_variants` contains a FIXED mod variant + // for this residue. When a fixed terminal mod applies, the residue + // MUST carry it — the unmodified Anywhere variant is not a valid + // candidate. (Matches Java MS-GF+: fixed mods are mandatory.) + let has_fixed_in = |term_variants: &[AminoAcid]| -> bool { + term_variants.iter().any(|aa| aa.mod_.as_ref().map(|m| m.fixed).unwrap_or(false)) + }; + + let n_term_variants: &[AminoAcid] = if pos == 0 { + let loc = if is_protein_n_term { ModLocation::ProtNTerm } else { ModLocation::NTerm }; + params.aa_set.variants_for(residue, loc) + } else { + &[] + }; + let c_term_variants: &[AminoAcid] = if pos == span_len - 1 { + let loc = if is_protein_c_term { ModLocation::ProtCTerm } else { ModLocation::CTerm }; + params.aa_set.variants_for(residue, loc) + } else { + &[] + }; + + let has_fixed_n = has_fixed_in(n_term_variants); + let has_fixed_c = has_fixed_in(c_term_variants); + + // If a fixed terminal mod is mandatory at this position, the + // unmodified Anywhere variant is not a legal candidate. Drop the + // Anywhere variants in that case; otherwise include them. This + // prevents the candidate explosion that wildcard fixed N-term TMT + // would otherwise cause (every peptide would be enumerated twice + // at position 0: once unmodded, once TMT-modded). + // + // Note: Anywhere variants always include the residue's own fixed + // mods folded in (e.g. K-anywhere already carries K-TMT), so this + // rule applies only to terminal mods. + let mut variants: Vec = if has_fixed_n || has_fixed_c { + Vec::new() + } else { + anywhere_variants.to_vec() + }; + + // Append all terminal variants (fixed + variable). When a fixed + // mod is present, the modded variant is the only legal one for + // that mod's residue/location slot; variable mods stack on top + // by adding additional explored variants. + for v in n_term_variants { + if !variants.contains(v) { + variants.push(v.clone()); + } + } + for v in c_term_variants { + if !variants.contains(v) { + variants.push(v.clone()); + } + } + + variants +} + fn expand_recursive( - position_variants: &[Vec], + position_variants: &[&[AminoAcid]], pos: usize, current: &mut Vec, mods_used: u32, @@ -369,7 +398,7 @@ fn expand_recursive( out.push(current.clone()); return; } - for variant in &position_variants[pos] { + for variant in position_variants[pos] { // Only VARIABLE mods consume slots against the per-peptide cap. // Fixed mods are unconditionally applied by the AminoAcidSet (e.g. // CAM-on-C, TMT-on-K, TMT-on-N-term-wildcard) and must not count From a8f2becaed471ea5c745b734d9c71f757d09243d Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 12:10:04 +0100 Subject: [PATCH 18/23] perf(build): enable LTO=fat + codegen-units=1 on release profile Hot paths cross crate boundaries (search -> scoring -> model). Default release profile uses codegen-units=16 with no LTO, so LLVM cannot inline across crates and small leaf functions (AminoAcid::nominal_mass, Enzyme::is_cleavable, FxHasher::write_u32, etc.) stay as function calls across the hot search loop. LTO=fat + cgu=1 give LLVM a whole-binary view at link time so it can inline cross-crate, deduplicate similar generic instantiations, and pick better register allocations on hot paths. Build time goes up (~2-3x) and binary size grows slightly, but this is a release-time cost only. No `target-cpu=native` here -- the released binary must run on a baseline x86_64 (e.g. older bench VMs). Future bench-only builds can opt in with `RUSTFLAGS="-C target-cpu=native"`. The bit-identical regression gate (precursor_cal_off_pin_tsv_match_golden_after_sort) is green under the new profile -- LLVM's whole-program view does not change float semantics. --- Cargo.toml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 0eeaee62..59e7e4fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,15 @@ rust-version = "1.85" license = "LicenseRef-UCSD-Noncommercial" authors = ["bigbio MS-GF+ contributors"] +# Release profile: enable LTO + single codegen unit so LLVM sees the whole +# binary. Hot paths cross crate boundaries (e.g. search → scoring → model), +# so default codegen-units=16 leaves cross-crate inlining on the table. +# No `target-cpu=native` here — we keep the binary portable; opt in via +# `RUSTFLAGS="-C target-cpu=native"` for bench builds. +[profile.release] +lto = "fat" +codegen-units = 1 + [workspace.dependencies] # Core deps — used across many crates. clap = { version = "4.5", features = ["derive"] } From a96a64bfff2d6803bbb65ec73049d1ce6424150c Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 15:12:39 +0100 Subject: [PATCH 19/23] perf(build): raise CPU baseline to Sandy Bridge (AVX, no FMA) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Project-level `.cargo/config.toml` sets `target-cpu=sandybridge` so LLVM can use AVX 256-bit f64 vector ops to auto-vectorize the chunked GF DP inner loop in `crates/scoring/src/gf/generating_function.rs`. The loop spends ~16% of leaf time on Astral, and the chunked 4-wide layout was already vectorization-friendly -- it just needed an enabled target. Sandy Bridge (Intel, Q1 2011) is the right baseline: - AVX is widely available (any x86 CPU from 2011+) - AVX 256-bit double ops vectorize the GF chunked loop - NO AVX2 (Haswell+) -- preserves portability to older hypervisors - NO FMA -- fused multiply-add changes intermediate rounding; we need bit-identity with Java's separate-mul-add path for the parity gate The default `x86-64` baseline is 22 years old (2003, SSE2 only). The hardware floor it targets is no longer realistic for the workloads this tool runs against. Users on pre-2011 hardware can override with their own RUSTFLAGS. Bench (iter5 vs S1b baseline, same VM, Java drift < 1.5%): | Dataset | Mode | S1b | iter5 | Δ | |------------|------|---------|---------|---------| | Astral | off | 5:32.53 | 5:26.38 | -1.8% | | Astral | auto | 6:19.09 | 5:52.15 | -7.1% | | LFQ | off | 0:46.78 | 0:42.71 | -8.7% | | LFQ | auto | 1:01.08 | 0:53.24 | -12.8% | | TMT | off | 2:31.29 | 2:07.36 | -15.8% | | TMT | auto | 2:50.50 | 2:19.40 | -18.3% | Rust now beats Java on Astral cal=off (5:26 vs 5:40, +4%) and TMT (by 20-27%). PINs sorted-row identical to S1b across all 6 dataset/mode cells; Percolator @1% PSM counts unchanged. The `.cargo/config.toml` entry is committed -- `.gitignore` updated to keep `.cargo/` private by default but allow `config.toml` through. --- .cargo/config.toml | 17 +++++++++++++++++ .gitignore | 3 ++- Cargo.toml | 3 +-- 3 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 00000000..576b1172 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,17 @@ +# Set the build-baseline CPU to Sandy Bridge (Intel, Q1 2011 / AMD +# Bulldozer, late 2011). AVX 256-bit f64 vector ops let LLVM auto-vectorize +# the chunked GF DP inner loop in `crates/scoring/src/gf/generating_function.rs`, +# which is ~16% of leaf time on Astral. Measured -7..-18% wall on the three +# reference datasets vs the default `x86-64` baseline; PINs stay bit-identical. +# +# AVX is enabled, but FMA is NOT — Sandy Bridge predates FMA3 (Haswell, 2013). +# Fused multiply-add changes intermediate rounding and would drift float +# results away from the Java reference, breaking the bit-identical PIN gate. +# +# Default `x86-64` baseline = 2003 / SSE2 only. Sandy Bridge is 14 years +# old and universal across modern proteomics workstations and cloud VMs. +# Users on older hardware can override with `cargo +nightly build --release +# -Z target-cpu-features=...` or by editing this file locally. + +[build] +rustflags = ["-C", "target-cpu=sandybridge"] diff --git a/.gitignore b/.gitignore index 5fdd8f26..0a9878e5 100644 --- a/.gitignore +++ b/.gitignore @@ -101,5 +101,6 @@ references/ .claude/scheduled_tasks.lock # Rust workspace local state (moved from rust/.gitignore during root restructure) -.cargo/ +.cargo/* +!.cargo/config.toml *.rs.bk diff --git a/Cargo.toml b/Cargo.toml index 59e7e4fe..ba6fa652 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,7 @@ authors = ["bigbio MS-GF+ contributors"] # Release profile: enable LTO + single codegen unit so LLVM sees the whole # binary. Hot paths cross crate boundaries (e.g. search → scoring → model), # so default codegen-units=16 leaves cross-crate inlining on the table. -# No `target-cpu=native` here — we keep the binary portable; opt in via -# `RUSTFLAGS="-C target-cpu=native"` for bench builds. +# The release-build CPU baseline is set in `.cargo/config.toml`. [profile.release] lto = "fat" codegen-units = 1 From af010d5c34eac5f41607204404b25ebcb597528c Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 15:14:22 +0100 Subject: [PATCH 20/23] fix(build): scope sandybridge target-cpu to x86_64 only The `[build] rustflags = ["-C", "target-cpu=sandybridge"]` introduced in the previous commit was unconditional, which fails on aarch64 targets (macOS Apple Silicon, ARM Linux) because sandybridge is an x86-only LLVM target-cpu. Scope via `[target.'cfg(target_arch = "x86_64")']` so the x86_64 release still gets AVX 256-bit vectorization while aarch64 builds fall back to their NEON-by-default baseline. macOS-latest CI is now aarch64-only on GitHub Actions, so this unblocks the macos test job. --- .cargo/config.toml | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 576b1172..4dc602e1 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,17 +1,22 @@ -# Set the build-baseline CPU to Sandy Bridge (Intel, Q1 2011 / AMD -# Bulldozer, late 2011). AVX 256-bit f64 vector ops let LLVM auto-vectorize -# the chunked GF DP inner loop in `crates/scoring/src/gf/generating_function.rs`, -# which is ~16% of leaf time on Astral. Measured -7..-18% wall on the three -# reference datasets vs the default `x86-64` baseline; PINs stay bit-identical. +# Set the build-baseline CPU to Sandy Bridge on x86_64 (Intel, Q1 2011 / +# AMD Bulldozer, late 2011). AVX 256-bit f64 vector ops let LLVM auto- +# vectorize the chunked GF DP inner loop in +# `crates/scoring/src/gf/generating_function.rs`, which is ~16% of leaf +# time on Astral. Measured -7..-18% wall on the three reference datasets +# vs the default `x86-64` baseline; PINs stay bit-identical. # -# AVX is enabled, but FMA is NOT — Sandy Bridge predates FMA3 (Haswell, 2013). -# Fused multiply-add changes intermediate rounding and would drift float -# results away from the Java reference, breaking the bit-identical PIN gate. +# AVX is enabled, but FMA is NOT — Sandy Bridge predates FMA3 (Haswell, +# 2013). Fused multiply-add changes intermediate rounding and would +# drift float results away from the Java reference, breaking the bit- +# identical PIN gate. # # Default `x86-64` baseline = 2003 / SSE2 only. Sandy Bridge is 14 years # old and universal across modern proteomics workstations and cloud VMs. -# Users on older hardware can override with `cargo +nightly build --release -# -Z target-cpu-features=...` or by editing this file locally. +# Users on older hardware can edit this file locally. +# +# Scoped to `target_arch = "x86_64"` so the macOS-aarch64 and other +# non-x86 platforms keep their architecture-default baseline (Apple +# Silicon already has NEON 128-bit vector ops on by default). -[build] +[target.'cfg(target_arch = "x86_64")'] rustflags = ["-C", "target-cpu=sandybridge"] From 9c10429cd89fd33233bd4e91d9b0d05afbe851e0 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 15:30:42 +0100 Subject: [PATCH 21/23] perf(gf): cache per-edge values across the two passes of compute_inner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-node loop in `compute_inner` scans incoming edges twice: once to derive `cur_min_score`/`cur_max_score`, then once to accumulate probabilities into `cur_slice`. The first pass already computed every field the second pass needed (`prev_idx`, `prev_hdr`, `combined_score`, `aa_prob`) but threw all of it away, storing only the raw edge index `e`. The second pass redid the work — `node_index_for_mass` lookup, `arena.headers[prev_idx]` load, and the `combined_score` add — per edge in the hot loop. Replace `valid_edges: Vec` with `Vec` carrying the cached fields. `NodeSlice` is already `Copy` (16 bytes) so the storage cost is small relative to the saved arithmetic + indexed loads. Bit-identical regression gate green; full workspace tests pass under the standard CI skip list. No semantic change — purely caching. --- crates/scoring/src/gf/generating_function.rs | 40 ++++++++++++++------ 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/crates/scoring/src/gf/generating_function.rs b/crates/scoring/src/gf/generating_function.rs index 7148294d..d8d70ca3 100644 --- a/crates/scoring/src/gf/generating_function.rs +++ b/crates/scoring/src/gf/generating_function.rs @@ -400,6 +400,19 @@ impl ScoreDistArena { /// node's probability slice is materialized into a `ScoreDist` and stashed /// on `GeneratingFunction.node_dists` so the caller can dump it via /// `iter_node_dists`. The production path passes `false`. + +/// Cached per-edge values shared between the two passes of `compute_inner`'s +/// per-node loop. The first pass derives cur_min/cur_max from predecessors; +/// the second pass accumulates probabilities into cur_slice. Without this +/// cache, the second pass re-did `node_index_for_mass`, `arena.headers[idx]`, +/// and the `combined_score` add per edge — all hot in the leaf profile. +#[derive(Clone, Copy)] +struct ValidEdge { + prev_hdr: NodeSlice, + combined_score: i32, + aa_prob: f64, +} + fn compute_inner( graph: &PrimitiveAaGraph, aa_set: &AminoAcidSet, @@ -431,12 +444,17 @@ fn compute_inner( arena.storage[start] = 1.0; } - // Scratch buffer for valid edge indices. + // Scratch buffer for valid edges. Each entry caches everything the + // second (DP-fill) pass needs about a predecessor, so we don't redo + // the `node_index_for_mass` lookup, `arena.headers[..]` load, and + // `combined_score` arithmetic per edge. The first pass populated all + // of these already; storing them keeps the inner DP loop straight + // memory-access + arithmetic. let max_edges_per_node = (0..node_count) .map(|ni| graph.edge_offset[ni + 1] - graph.edge_offset[ni]) .max() .unwrap_or(0); - let mut valid_edges: Vec = Vec::with_capacity(max_edges_per_node); + let mut valid_edges: Vec = Vec::with_capacity(max_edges_per_node); // Forward DP over nodes in index order. for ni in 0..node_count { @@ -462,7 +480,9 @@ fn compute_inner( valid_edges.clear(); - // Scan incoming edges. + // Scan incoming edges. The first pass derives cur_min/cur_max and + // caches everything the second pass needs (prev_hdr is Copy, so + // storing it is cheap and avoids re-indexing arena.headers). for e in graph.edge_offset[ni]..graph.edge_offset[ni + 1] { let prev_mass = graph.edge_prev_node[e]; let prev_idx = match graph.node_index_for_mass(prev_mass) { @@ -475,6 +495,7 @@ fn compute_inner( } let combined_score = cur_node_score + graph.edge_score[e]; + let aa_prob = graph.edge_prob[e] as f64; let prev_max = prev_hdr.min_score + prev_hdr.len as i32; let possible_max = prev_max + combined_score; if possible_max > cur_max_score { @@ -489,7 +510,7 @@ fn compute_inner( } } - valid_edges.push(e); + valid_edges.push(ValidEdge { prev_hdr, combined_score, aa_prob }); } // Skip degenerate or out-of-bound ranges. @@ -517,14 +538,11 @@ fn compute_inner( let (prev_region, cur_region) = arena.storage.split_at_mut(cur_start); let cur_slice = &mut cur_region[..cur_len]; - for &e in &valid_edges { - let prev_mass = graph.edge_prev_node[e]; - // Safety: we already verified these are valid above. - let prev_idx = graph.node_index_for_mass(prev_mass).unwrap(); - let prev_hdr = arena.headers[prev_idx]; + for ve in &valid_edges { + let prev_hdr = ve.prev_hdr; let prev_slice = &prev_region[prev_hdr.range()]; - let combined_score = cur_node_score + graph.edge_score[e]; - let aa_prob = graph.edge_prob[e] as f64; + let combined_score = ve.combined_score; + let aa_prob = ve.aa_prob; // Mirror ScoreDist::add_prob_dist: // for t in max(other_min, self_min - score_diff) From 689b4f385efee5e6799c48665678ba2634ee4b20 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 16:03:21 +0100 Subject: [PATCH 22/23] Revert "perf(gf): cache per-edge values across the two passes of compute_inner" This reverts commit 9c10429cd89fd33233bd4e91d9b0d05afbe851e0. --- crates/scoring/src/gf/generating_function.rs | 40 ++++++-------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/crates/scoring/src/gf/generating_function.rs b/crates/scoring/src/gf/generating_function.rs index d8d70ca3..7148294d 100644 --- a/crates/scoring/src/gf/generating_function.rs +++ b/crates/scoring/src/gf/generating_function.rs @@ -400,19 +400,6 @@ impl ScoreDistArena { /// node's probability slice is materialized into a `ScoreDist` and stashed /// on `GeneratingFunction.node_dists` so the caller can dump it via /// `iter_node_dists`. The production path passes `false`. - -/// Cached per-edge values shared between the two passes of `compute_inner`'s -/// per-node loop. The first pass derives cur_min/cur_max from predecessors; -/// the second pass accumulates probabilities into cur_slice. Without this -/// cache, the second pass re-did `node_index_for_mass`, `arena.headers[idx]`, -/// and the `combined_score` add per edge — all hot in the leaf profile. -#[derive(Clone, Copy)] -struct ValidEdge { - prev_hdr: NodeSlice, - combined_score: i32, - aa_prob: f64, -} - fn compute_inner( graph: &PrimitiveAaGraph, aa_set: &AminoAcidSet, @@ -444,17 +431,12 @@ fn compute_inner( arena.storage[start] = 1.0; } - // Scratch buffer for valid edges. Each entry caches everything the - // second (DP-fill) pass needs about a predecessor, so we don't redo - // the `node_index_for_mass` lookup, `arena.headers[..]` load, and - // `combined_score` arithmetic per edge. The first pass populated all - // of these already; storing them keeps the inner DP loop straight - // memory-access + arithmetic. + // Scratch buffer for valid edge indices. let max_edges_per_node = (0..node_count) .map(|ni| graph.edge_offset[ni + 1] - graph.edge_offset[ni]) .max() .unwrap_or(0); - let mut valid_edges: Vec = Vec::with_capacity(max_edges_per_node); + let mut valid_edges: Vec = Vec::with_capacity(max_edges_per_node); // Forward DP over nodes in index order. for ni in 0..node_count { @@ -480,9 +462,7 @@ fn compute_inner( valid_edges.clear(); - // Scan incoming edges. The first pass derives cur_min/cur_max and - // caches everything the second pass needs (prev_hdr is Copy, so - // storing it is cheap and avoids re-indexing arena.headers). + // Scan incoming edges. for e in graph.edge_offset[ni]..graph.edge_offset[ni + 1] { let prev_mass = graph.edge_prev_node[e]; let prev_idx = match graph.node_index_for_mass(prev_mass) { @@ -495,7 +475,6 @@ fn compute_inner( } let combined_score = cur_node_score + graph.edge_score[e]; - let aa_prob = graph.edge_prob[e] as f64; let prev_max = prev_hdr.min_score + prev_hdr.len as i32; let possible_max = prev_max + combined_score; if possible_max > cur_max_score { @@ -510,7 +489,7 @@ fn compute_inner( } } - valid_edges.push(ValidEdge { prev_hdr, combined_score, aa_prob }); + valid_edges.push(e); } // Skip degenerate or out-of-bound ranges. @@ -538,11 +517,14 @@ fn compute_inner( let (prev_region, cur_region) = arena.storage.split_at_mut(cur_start); let cur_slice = &mut cur_region[..cur_len]; - for ve in &valid_edges { - let prev_hdr = ve.prev_hdr; + for &e in &valid_edges { + let prev_mass = graph.edge_prev_node[e]; + // Safety: we already verified these are valid above. + let prev_idx = graph.node_index_for_mass(prev_mass).unwrap(); + let prev_hdr = arena.headers[prev_idx]; let prev_slice = &prev_region[prev_hdr.range()]; - let combined_score = ve.combined_score; - let aa_prob = ve.aa_prob; + let combined_score = cur_node_score + graph.edge_score[e]; + let aa_prob = graph.edge_prob[e] as f64; // Mirror ScoreDist::add_prob_dist: // for t in max(other_min, self_min - score_diff) From 741fec5b954ab391bbbc5a59974f81c16501d33a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 27 May 2026 16:21:24 +0100 Subject: [PATCH 23/23] =?UTF-8?q?chore:=20review=20nits=20=E2=80=94=20modu?= =?UTF-8?q?le-scope=20use,=20dedup=20dev-dep,=20comment=20accuracy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses three review observations on the V1 speed milestone PR: 1. `crates/search/src/candidate_gen.rs` — `use model::modification::ModLocation;` was inlined inside `expand_mod_combinations` and again inside `build_terminal_variants`. Lifted to module scope and removed both inline `use`s. 2. `crates/scoring/Cargo.toml` — `rustc-hash = "2"` was listed in both `[dependencies]` and `[dev-dependencies]`. Removed the dev-dep entry; the regular dependency is in scope for tests already. 3. `Cargo.toml` — clarified that `[profile.release]` flags (`lto`, `codegen-units`) are release-only, while the CPU baseline in `.cargo/config.toml` is workspace-wide on purpose — so the bit-identical PIN gate runs under the same SIMD codegen as the shipped binary. No behavior change; cargo clippy --workspace --all-targets -D warnings clean; bit-identical PIN gate green. --- Cargo.toml | 5 ++++- crates/scoring/Cargo.toml | 1 - crates/search/src/candidate_gen.rs | 4 +--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ba6fa652..b4b942c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,10 @@ authors = ["bigbio MS-GF+ contributors"] # Release profile: enable LTO + single codegen unit so LLVM sees the whole # binary. Hot paths cross crate boundaries (e.g. search → scoring → model), # so default codegen-units=16 leaves cross-crate inlining on the table. -# The release-build CPU baseline is set in `.cargo/config.toml`. +# (These two flags scope to `--release` only; the workspace-wide CPU +# baseline is set unconditionally in `.cargo/config.toml`, so `cargo test` +# and `cargo build` also get the same SIMD codegen — this is intentional, +# so any bit-identity regression surfaces in CI under the same flags.) [profile.release] lto = "fat" codegen-units = 1 diff --git a/crates/scoring/Cargo.toml b/crates/scoring/Cargo.toml index f154d976..01781d54 100644 --- a/crates/scoring/Cargo.toml +++ b/crates/scoring/Cargo.toml @@ -12,6 +12,5 @@ thiserror = { workspace = true } byteorder = { workspace = true } [dev-dependencies] -rustc-hash = "2" tempfile = "3.10" input = { path = "../input" } diff --git a/crates/search/src/candidate_gen.rs b/crates/search/src/candidate_gen.rs index ef5f4c71..612929ad 100644 --- a/crates/search/src/candidate_gen.rs +++ b/crates/search/src/candidate_gen.rs @@ -14,6 +14,7 @@ use model::amino_acid::AminoAcid; use model::enzyme::Enzyme; +use model::modification::ModLocation; use model::peptide::Peptide; use model::protein::Protein; use crate::search_index::SearchIndex; @@ -289,7 +290,6 @@ fn expand_mod_combinations( // Collect per-position variant slices. Terminal positions reference the // owned vecs above; interior positions borrow directly from AminoAcidSet. // All borrows are valid for the duration of this function. - use model::modification::ModLocation; let position_variants_refs: Vec<&[AminoAcid]> = span.iter().enumerate().map(|(i, &r)| { if i == 0 { pos0_owned.as_ref().unwrap().as_slice() @@ -324,8 +324,6 @@ fn build_terminal_variants( is_protein_n_term: bool, is_protein_c_term: bool, ) -> Vec { - use model::modification::ModLocation; - let anywhere_variants = params.aa_set.variants_for(residue, ModLocation::Anywhere); // Helper: returns true if `term_variants` contains a FIXED mod variant