Bytecode parity - direct loop backedges

youknowone · youknowone · commit 35f56425f0b9 · 2026-04-10T13:24:11.000+09:00
diff --git a/crates/codegen/src/compile.rs b/crates/codegen/src/compile.rs
@@ -11148,6 +11148,46 @@ def f(base, cls, state):
         assert_eq!(return_count, 2);
     }
 
+    /// Regression check: once the nested `if` stores into `arg2value`, the
+    /// very next real instruction must be the loop's backward jump itself.
+    #[test]
+    fn test_loop_store_subscr_threads_direct_backedge() {
+        let source = "\
+def f(kwonlyargs, kwonlydefaults, arg2value):
+    missing = 0
+    for kwarg in kwonlyargs:
+        if kwarg not in arg2value:
+            if kwonlydefaults and kwarg in kwonlydefaults:
+                arg2value[kwarg] = kwonlydefaults[kwarg]
+            else:
+                missing += 1
+    return missing
+";
+        let code = compile_exec(source);
+        let f = find_code(&code, "f").expect("missing function code");
+
+        // Drop cache slots so that neighbouring indices are real instructions.
+        let mut ops = Vec::new();
+        for unit in f.instructions.iter() {
+            if !matches!(unit.op, Instruction::Cache) {
+                ops.push(unit.op);
+            }
+        }
+
+        let found = ops.iter().position(|op| matches!(op, Instruction::StoreSubscr));
+        let store_subscr = found.expect("missing STORE_SUBSCR");
+        let next_op = ops.get(store_subscr + 1).expect("missing jump after STORE_SUBSCR");
+        // Context for the failure message: up to 3 ops before and 4 after the store.
+        let window_end = ops.len().min(store_subscr + 5);
+        let window = &ops[store_subscr.saturating_sub(3)..window_end];
+
+        let is_backedge = matches!(next_op, Instruction::JumpBackward { .. });
+        assert!(
+            is_backedge,
+            "expected direct loop backedge after STORE_SUBSCR, got {next_op:?}; ops={window:?}"
+        );
+    }
+
     #[test]
     fn test_assert_without_message_raises_class_directly() {
         let code = compile_exec(
diff --git a/crates/codegen/src/ir.rs b/crates/codegen/src/ir.rs
@@ -241,6 +241,12 @@ impl CodeInfo {
         self.dce(); // truncate after terminal in blocks that got return duplicated
         self.eliminate_unreachable_blocks(); // remove now-unreachable last block
         remove_redundant_nops_and_jumps(&mut self.blocks);
+        // Some jump-only blocks only appear after late CFG cleanup. Thread them
+        // once more so loop backedges stay direct instead of becoming
+        // JUMP_FORWARD -> JUMP_BACKWARD chains.
+        jump_threading(&mut self.blocks);
+        self.eliminate_unreachable_blocks();
+        remove_redundant_nops_and_jumps(&mut self.blocks);
         self.add_checks_for_loads_of_uninitialized_variables();
         // optimize_load_fast: after normalize_jumps
         self.optimize_load_fast_borrow();
@@ -3199,6 +3205,13 @@ fn is_exit_without_lineno(block: &Block) -> bool {
     !instruction_has_lineno(first) && last.instr.is_scope_exit()
 }
 
+/// Whether `block` consists of exactly one instruction: an unconditional jump
+/// whose target is not `BlockIdx::NULL`.
+fn is_jump_only_block(block: &Block) -> bool {
+    let body = block.instructions.as_slice();
+    matches!(body, [ins] if ins.instr.is_unconditional_jump() && ins.target != BlockIdx::NULL)
+}
+
 fn maybe_propagate_location(
     instr: &mut InstructionInfo,
     location: SourceLocation,
@@ -3321,6 +3334,45 @@ fn duplicate_exits_without_lineno(blocks: &mut Vec<Block>, predecessors: &mut Ve
     }
 }
 
+/// For each block ending in an unconditional jump to a jump-only block that has
+/// several predecessors, clones that target, stamps the clone with the jump's
+/// location, and retargets the jump to it, keeping `predecessors` in sync.
+fn duplicate_jump_targets_without_lineno(blocks: &mut Vec<Block>, predecessors: &mut Vec<u32>) {
+    let mut current = BlockIdx(0);
+    while current != BlockIdx::NULL {
+        let old_next = blocks[current.idx()].next;
+        let last = match blocks[current.idx()].instructions.last() {
+            Some(ins) if ins.instr.is_unconditional_jump() && ins.target != BlockIdx::NULL => *ins,
+            _ => {
+                current = old_next;
+                continue;
+            }
+        };
+        let target = next_nonempty_block(blocks, last.target);
+        let shared_jump_only = target != BlockIdx::NULL
+            && is_jump_only_block(&blocks[target.idx()])
+            && predecessors[target.idx()] > 1;
+        if !shared_jump_only {
+            current = old_next;
+            continue;
+        }
+        let new_idx = BlockIdx(blocks.len() as u32);
+        let mut new_block = blocks[target.idx()].clone();
+        propagate_locations_in_block(&mut new_block, last.location, last.end_location);
+        new_block.next = old_next;
+        let onward = next_nonempty_block(blocks, new_block.instructions[0].target);
+        if onward != BlockIdx::NULL {
+            predecessors[onward.idx()] += 1; // the clone is a new edge into its own target
+        }
+        blocks.push(new_block);
+        blocks[current.idx()].next = new_idx;
+        blocks[current.idx()].instructions.last_mut().unwrap().target = new_idx;
+        predecessors[target.idx()] -= 1;
+        predecessors.push(1);
+        current = old_next; // the clone itself needs no further processing
+    }
+}
+
 fn propagate_line_numbers(blocks: &mut [Block], predecessors: &[u32]) {
     let mut current = BlockIdx(0);
     while current != BlockIdx::NULL {
@@ -3371,6 +3423,7 @@ fn propagate_line_numbers(blocks: &mut [Block], predecessors: &[u32]) {
 fn resolve_line_numbers(blocks: &mut Vec<Block>) {
     let mut predecessors = compute_predecessors(blocks);
     duplicate_exits_without_lineno(blocks, &mut predecessors);
+    duplicate_jump_targets_without_lineno(blocks, &mut predecessors); // may append cloned blocks
     propagate_line_numbers(blocks, &predecessors);
 }
 
diff --git a/scripts/dis_dump.py b/scripts/dis_dump.py
@@ -33,6 +33,15 @@
     "STORE_FAST_LOAD_FAST_BORROW": "STORE_FAST_LOAD_FAST",
 }
 
+# Superinstruction decomposition: maps an opname (as looked up after
+# _OPNAME_NORMALIZE) to its constituent ops, so jump target indices are
+# computed against the same logical instruction stream on both interpreters.
+_SUPER_DECOMPOSE = {
+    "STORE_FAST_LOAD_FAST": ("STORE_FAST", "LOAD_FAST"),
+    "STORE_FAST_STORE_FAST": ("STORE_FAST", "STORE_FAST"),
+    "LOAD_FAST_LOAD_FAST": ("LOAD_FAST", "LOAD_FAST"),
+}
+
 # Jump instruction names (fallback when hasjrel/hasjabs is incomplete)
 _JUMP_OPNAMES = frozenset(
     {
@@ -188,13 +197,18 @@ def _extract_instructions(code):
     except Exception as e:
         return [["ERROR", str(e)]]
 
-    # Build filtered list and offset-to-index mapping
+    # Build filtered list and offset-to-index mapping for the normalized stream.
+    # Indices must be post-decomposition: every superinstruction that expands
+    # into k logical ops would otherwise leave later jump targets short by k - 1.
     filtered = []
     offset_to_idx = {}
+    normalized_idx = 0
     for inst in raw:
         if inst.opname in SKIP_OPS:
             continue
-        offset_to_idx[inst.offset] = len(filtered)
+        opname = _OPNAME_NORMALIZE.get(inst.opname, inst.opname)
+        offset_to_idx[inst.offset] = normalized_idx
+        normalized_idx += len(_SUPER_DECOMPOSE.get(opname, (opname,)))
         filtered.append(inst)
 
     # Map offsets that land on CACHE slots to the next real instruction
@@ -205,14 +219,6 @@ def _extract_instructions(code):
                     offset_to_idx[inst.offset] = fi
                     break
 
-    # Superinstruction decomposition: split into constituent parts
-    # so we compare individual operations regardless of combining.
-    _SUPER_DECOMPOSE = {
-        "STORE_FAST_LOAD_FAST": ("STORE_FAST", "LOAD_FAST"),
-        "STORE_FAST_STORE_FAST": ("STORE_FAST", "STORE_FAST"),
-        "LOAD_FAST_LOAD_FAST": ("LOAD_FAST", "LOAD_FAST"),
-    }
-
     result = []
     for inst in filtered:
         opname = _OPNAME_NORMALIZE.get(inst.opname, inst.opname)