From 33b72b16a2df8784cd26c7ad59888994522b4510 Mon Sep 17 00:00:00 2001 From: Peter Lord Date: Fri, 22 May 2026 11:28:12 -0700 Subject: [PATCH] Encounter stats: store map_point_history on Mongo run docs + backfill script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /api/runs/encounter-stats endpoint shipped in #336 returned zero rows because the aggregation walks $map_point_history but that field was never stored on the run doc — only denormalized fields (killed_by, deck, relics, card_choices) were. The raw history lives on disk at data/runs/{hash}.json (which the share-run page already reads). Two changes: 1. submit_run now includes map_point_history in the inserted doc. Going forward, every new submission populates the field so the aggregation has data to walk. 2. tools/backfill_run_encounters_mongo.py reads every disk JSON whose corresponding Mongo doc is missing the field and $sets it. Idempotent — safe to re-run, processes ~5K docs/sec. Operator run after deploy: ssh prod 'cd /var/www/spire-codex && docker compose -f docker-compose.prod.yml exec backend python3 -m tools.backfill_run_encounters_mongo' (Beta box runs the same script against the beta compose file.) --- backend/app/services/runs_db_mongo.py | 8 ++ tools/backfill_run_encounters_mongo.py | 151 +++++++++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 tools/backfill_run_encounters_mongo.py diff --git a/backend/app/services/runs_db_mongo.py b/backend/app/services/runs_db_mongo.py index c798ee7f..a0605fd4 100644 --- a/backend/app/services/runs_db_mongo.py +++ b/backend/app/services/runs_db_mongo.py @@ -328,6 +328,14 @@ def _submit_player_run( "relics": relics, "potions": potions, "card_choices": card_choices, + # Per-room history needed for the encounter-stats aggregation + # (`/api/runs/encounter-stats`). Stored as the original 2D + # array (acts → rooms) so the agg can `$unwind` it without a + # reshape. 
Each room dict carries at minimum model_id, + # room_type, damage_taken, turns_taken — the full submitted + # JSON has more fields we don't need at aggregation time, so + # we keep the projection narrow to bound doc size. + "map_point_history": data.get("map_point_history", []), } coll = _get_collection() diff --git a/tools/backfill_run_encounters_mongo.py b/tools/backfill_run_encounters_mongo.py new file mode 100644 index 00000000..bb47c36e --- /dev/null +++ b/tools/backfill_run_encounters_mongo.py @@ -0,0 +1,151 @@ +"""Backfill `map_point_history` onto existing Mongo run docs. + +The runs_db_mongo schema used to store only denormalized fields (deck, +relics, killed_by, card_choices, etc.). The encounter-stats aggregation +at /api/runs/encounter-stats walks `map_point_history` per run to +compute per-encounter sample counts, fatal counts, average damage taken, +and average turns. Without that field on the doc, the aggregation +returns zero rows. + +The raw run JSON is preserved on disk at `data/runs/{hash}.json` (the +share-run page already reads from there). This script walks those files, +parses each, and `$set`s `map_point_history` on the corresponding Mongo +doc when the field is missing. + +Idempotent: docs that already have a non-empty `map_point_history` are +skipped. Safe to re-run after a partial pass or to catch up on submissions +made between deploy and full backfill. + +Usage on the prod box: + + cd /var/www/spire-codex + docker compose -f docker-compose.prod.yml exec backend \ + python3 -m tools.backfill_run_encounters_mongo + +Add `--dry-run` to see what would change without writing. `--limit N` +caps the number of docs processed (useful for spot-checking on a slice). +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import time +from pathlib import Path + +# Allow running as a module from anywhere under /var/www/spire-codex. 
HERE = Path(__file__).resolve().parent
ROOT = HERE.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from backend.app.services.runs_db_mongo import _get_collection  # noqa: E402


def _runs_dir() -> Path:
    """Locate the on-disk runs/ directory.

    Candidates are tried in order: ``$DATA_DIR/runs`` (only when ``DATA_DIR``
    is set to a non-empty value), ``ROOT/data/runs``, then ``/data/runs``.

    Returns:
        The first candidate that is an existing directory.

    Raises:
        SystemExit: if none of the candidates is a directory.
    """
    candidates = []
    data_dir = os.environ.get("DATA_DIR", "")
    if data_dir:
        # Only join when DATA_DIR is actually set: Path("") / "runs" is the
        # *relative* path "runs", which would silently match a stray runs/
        # directory in the current working directory.
        candidates.append(Path(data_dir) / "runs")
    candidates.append(ROOT / "data" / "runs")
    candidates.append(Path("/data/runs"))

    for c in candidates:
        if c.is_dir():
            return c
    raise SystemExit(f"runs/ dir not found in any of: {[str(c) for c in candidates]}")


def _read_history(run_file: Path) -> object:
    """Return the `map_point_history` value stored in *run_file*.

    Returns None when the file's top-level JSON value is not an object or
    the key is absent.

    Raises:
        OSError: the file is missing or cannot be read.
        ValueError: the content is not valid UTF-8 or not valid JSON
            (covers both UnicodeDecodeError and json.JSONDecodeError).
    """
    with open(run_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        return None
    return data.get("map_point_history")


def main() -> None:
    """Copy `map_point_history` from disk run JSON onto Mongo run docs.

    Queries Mongo for run docs whose `map_point_history` is missing or an
    empty array, loads `<runs dir>/<_id>.json` for each, and `$set`s the
    field from the file. With `--dry-run` nothing is written; with
    `--limit N` (N > 0) at most N docs are examined.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument(
        "--dry-run", action="store_true", help="print actions, write nothing"
    )
    p.add_argument(
        "--limit", type=int, default=0, help="cap docs processed (0 = no cap)"
    )
    args = p.parse_args()
    # Zero or a negative value both mean "no cap"; a negative value must not
    # terminate the loop before the first doc.
    limit = args.limit if args.limit > 0 else 0

    coll = _get_collection()
    runs_dir = _runs_dir()
    print(f"reading runs from: {runs_dir}", flush=True)

    # Pull the ids that need backfill in one round trip, rather than checking
    # every disk file against Mongo individually.
    needs_backfill = [
        d["_id"]
        for d in coll.find(
            {
                "$or": [
                    {"map_point_history": {"$exists": False}},
                    {"map_point_history": []},
                ]
            },
            {"_id": 1},
        )
    ]
    total_to_check = len(needs_backfill)
    print(f"docs needing backfill: {total_to_check}", flush=True)

    seen = 0
    updated = 0
    skipped_no_file = 0
    skipped_unreadable = 0
    skipped_no_history = 0
    started = time.time()

    for hash_id in needs_backfill:
        # The cap counts docs examined, matching the --limit help text.
        if limit and seen >= limit:
            break
        seen += 1
        run_file = runs_dir / f"{hash_id}.json"
        try:
            history = _read_history(run_file)
        except FileNotFoundError:
            skipped_no_file += 1
            continue
        except (OSError, ValueError) as e:
            # One corrupt or undecodable file must not abort the whole pass.
            skipped_unreadable += 1
            print(f" ✗ {hash_id[:12]}: {e}", flush=True)
            continue

        if not history:
            skipped_no_history += 1
            continue

        if args.dry_run:
            updated += 1
            # Show only the first few would-be writes to keep output short.
            if updated <= 5:
                act_count = len(history)
                room_count = sum(len(a) for a in history if isinstance(a, list))
                print(
                    f" would set {hash_id[:12]}: {act_count} acts, {room_count} rooms",
                    flush=True,
                )
            continue

        coll.update_one({"_id": hash_id}, {"$set": {"map_point_history": history}})
        updated += 1
        if updated % 500 == 0:
            elapsed = time.time() - started
            print(
                f" ... {updated:>5d}/{total_to_check} written in {elapsed:.1f}s",
                flush=True,
            )

    elapsed = time.time() - started
    print()
    print(f"checked : {seen}")
    print(f"updated : {updated}")
    print(f"no file : {skipped_no_file}")
    print(f"bad file : {skipped_unreadable}")
    print(f"no history: {skipped_no_history}")
    print(f"elapsed : {elapsed:.1f}s")
    if args.dry_run:
        print("(dry-run — nothing written)")


if __name__ == "__main__":
    main()