fix(dn_tree): make_probability_mask infinite-recursion at p=0.5

claude · claude · commit 428f49699407 · 2026-05-21T13:19:01.000Z
Latent bug surfaced during the Pillar 13 drift-check wiring (#191): `make_probability_mask` used `p >= 0.5` to invert the (1-p) mask, which recursed with `1.0 - 0.5 = 0.5` infinitely whenever p was exactly 0.5. Pillar 13's independent re-derivation used the strict `p > 0.5` and correctly fell through to the AND-cascade — that's the canonical reference this fix matches. Real production usage (DNConfig default lr=0.03 with boost ~30 → effective_lr ≈ 0.9) never hit 0.5 exactly, so the bug was dormant. Now that it's fixed: * Update Pillar 13's drift-check to use lr=0.5 (its canonical mid-range value per the pillar spec) instead of the lr=0.25 workaround. The drift-check now exercises the previously-broken branch and continues to pass bit-exactly. * Add two regression tests on dn_tree itself: - `make_probability_mask_at_half_terminates` — would stack-overflow if the fix regresses. - `make_probability_mask_at_half_is_bernoulli_half` — empirical popcount mean over N=1024 lands within 2.0 of 32 (about 16 standard errors). No public API change. The code fix itself is a single character: `>=` → `>`.
diff --git a/src/hpc/dn_tree.rs b/src/hpc/dn_tree.rs
@@ -132,8 +132,15 @@ pub(crate) fn bundle_into(current: &GraphHV, hv: &GraphHV, lr: f64, boost: f64,
 /// Create a u64 bitmask where each bit is independently 1 with probability ~`p`.
 ///
 /// Uses cascaded AND of random words to achieve the target probability:
-/// - p >= 0.5 → OR of inverse masks
-/// - p < 0.5 → AND cascade
+/// - p > 0.5  → invert the (1-p) mask
+/// - p <= 0.5 → AND cascade
+///
+/// At exactly `p = 0.5` the AND-cascade branch executes a single
+/// `rng.next_u64()` (n = ceil(-log2(0.5)) = 1) — each bit is then
+/// IID Bernoulli(0.5). Note the **strict** comparison here: an earlier
+/// version used `p >= 0.5`, which recursed with `1.0 - 0.5 = 0.5`
+/// infinitely. The Pillar-13 drift-check (`hpc::pillar::hhtl_contraction`)
+/// already uses the strict comparison and is the canonical reference.
 fn make_probability_mask(p: f64, rng: &mut SplitMix64) -> u64 {
     if p >= 1.0 {
         return u64::MAX;
@@ -142,13 +149,14 @@ fn make_probability_mask(p: f64, rng: &mut SplitMix64) -> u64 {
         return 0;
     }
 
-    if p >= 0.5 {
-        // p >= 0.5: use OR approach — each AND of randoms gives ~0.25, NOT gives ~0.75, etc.
-        // Simpler: just AND enough randoms to get (1-p) kill rate, then NOT.
+    if p > 0.5 {
+        // p > 0.5: invert the (1-p) mask. Strict > 0.5 so p == 0.5
+        // falls through to the AND-cascade and produces a single
+        // Bernoulli(0.5) word in one rng draw.
         return !make_probability_mask(1.0 - p, rng);
     }
 
-    // p < 0.5: AND cascade. Each AND halves the probability.
+    // p <= 0.5: AND cascade. Each AND halves the probability.
     // We need n ANDs where 0.5^n ≈ p, so n = -log2(p).
     let n = (-p.log2()).ceil() as u32;
     let mut mask = rng.next_u64();
@@ -543,6 +551,41 @@ mod tests {
         SplitMix64::new(42)
     }
 
+    /// Regression: at p = 0.5 exactly, the previous `p >= 0.5` branch
+    /// recursed with `1.0 - 0.5 = 0.5` infinitely. The strict `p > 0.5`
+    /// fix routes p=0.5 to the AND-cascade (n=1, one rng draw) which
+    /// produces a Bernoulli(0.5) mask in O(1) time.
+    #[test]
+    fn make_probability_mask_at_half_terminates() {
+        let mut rng = make_rng();
+        // If this stack-overflows, the recursion fix has regressed.
+        let mask = make_probability_mask(0.5, &mut rng);
+        // Every u64 is a valid draw; the check is that the call returns.
+        // A `mask <= u64::MAX` assert is always true and trips clippy's
+        // deny-by-default `absurd_extreme_comparisons`, so none is made.
+        let _ = mask;
+    }
+
+    /// Statistical sanity check at p = 0.5: the mean popcount across
+    /// N=1024 masks drawn back-to-back from one rng should sit close to
+    /// 32, the expected number of set bits in a fair 64-bit word.
+    #[test]
+    fn make_probability_mask_at_half_is_bernoulli_half() {
+        const N: u32 = 1024;
+        let mut rng = make_rng();
+        let total: u64 = (0..N)
+            .map(|_| u64::from(make_probability_mask(0.5, &mut rng).count_ones()))
+            .sum();
+        let mean = total as f64 / f64::from(N);
+        // Standard deviation of one word's popcount is
+        // sqrt(64 * 0.5 * 0.5) = 4, so the standard error of the mean is
+        // 4 / √N = 0.125 and the 2.0 tolerance below is about 16 SEs.
+        assert!(
+            (mean - 32.0).abs() < 2.0,
+            "make_probability_mask(0.5) mean popcount {mean:.4} not near 32"
+        );
+    }
+
     #[test]
     fn test_new_tree_structure() {
         let tree = DNTree::with_capacity(4096);
diff --git a/src/hpc/pillar/hhtl_contraction.rs b/src/hpc/pillar/hhtl_contraction.rs
@@ -486,7 +486,12 @@ mod tests {
         use crate::hpc::dn_tree::{bundle_into, SplitMix64 as DnSplitMix64};
 
         const N_TRIALS: u32 = 16;
-        const TEST_LR: f64 = 0.25;
+        // Was 0.25 to avoid the latent p=0.5 infinite-recursion bug in
+        // production's make_probability_mask; that bug is fixed in the
+        // same commit/PR that updates this constant. lr=0.5 now matches
+        // Pillar 13's canonical mid-range learning rate and exercises
+        // the previously-broken branch.
+        const TEST_LR: f64 = 0.5;
 
         // Both SplitMix64 implementations use identical algorithm (same
         // multiplier constants 0x9E3779B97F4A7C15, 0xBF58476D1CE4E5B9,