From 916cfaf856bac3e107f17a83068acdb746c5590e Mon Sep 17 00:00:00 2001 From: Michael D'Angelo Date: Fri, 10 Apr 2026 10:41:03 -0700 Subject: [PATCH 1/2] fix: preserve picklescan stack state --- CHANGELOG.md | 1 + .../modelaudit_picklescan/engine/scanner.py | 18 ++++++++++ .../modelaudit-picklescan/tests/test_api.py | 35 +++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc1c12a9..c77e7fec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Bug Fixes - avoid CoreML nested parse failures on bounded-read truncation +- preserve standalone picklescan stack state across memo and container mutation opcodes - flag TensorFlow `LoadLibrary` and `LoadLibraryV2` graph ops as dangerous native-library loading - detect split CNTK native-user-function and native-library references - detect Linux/macOS native-library members in Keras archives and uppercase native-library members in PyTorch ZIPs diff --git a/packages/modelaudit-picklescan/src/modelaudit_picklescan/engine/scanner.py b/packages/modelaudit-picklescan/src/modelaudit_picklescan/engine/scanner.py index 73ce497f..ca43acb7 100644 --- a/packages/modelaudit-picklescan/src/modelaudit_picklescan/engine/scanner.py +++ b/packages/modelaudit-picklescan/src/modelaudit_picklescan/engine/scanner.py @@ -571,6 +571,11 @@ def _handle_opcode(self, op_name: str, arg: Any, position: int) -> None: self.stack.pop() return + if op_name == "DUP": + if self.stack: + self.stack.append(self.stack[-1]) + return + if op_name == "POP_MARK": self._pop_to_mark() return @@ -595,9 +600,22 @@ def _handle_opcode(self, op_name: str, arg: Any, position: int) -> None: self._collapse_top_n(3) return + if op_name in {"APPEND", "SETITEM"}: + if self.stack: + self.stack.pop() + if op_name == "SETITEM" and self.stack: + self.stack.pop() + return + + if op_name in {"APPENDS", "SETITEMS", "ADDITEMS"}: + self._pop_to_mark() + return + if op_name in _MEMO_WRITE_OPCODES: if self.stack: self.memo[arg] = self.stack[-1] + if isinstance(arg, int): + self.next_memo_index = max(self.next_memo_index, arg + 1) return if op_name == "MEMOIZE": diff --git a/packages/modelaudit-picklescan/tests/test_api.py b/packages/modelaudit-picklescan/tests/test_api.py index 5247799f..3b1a5c3b 100644 --- a/packages/modelaudit-picklescan/tests/test_api.py +++ b/packages/modelaudit-picklescan/tests/test_api.py @@ -96,6 +96,41 @@ def test_scan_bytes_attributes_reduce_calls_to_the_callable_operand_not_nested_a ) +@pytest.mark.parametrize( + ("payload", "source"), + [ + (b"cbuiltins\nlen\n}cos\nsystem\nK\x01s\x85R.", "setitem-args.pkl"), + (b"cbuiltins\nlen\n}(cos\nsystem\nK\x01u\x85R.", "setitems-args.pkl"), + ], +) +def test_scan_bytes_dict_mutation_operands_do_not_become_reduce_call_targets(payload: bytes, source: str) -> None: + report = scan_bytes(payload, source=source) + + assert report.status == ScanStatus.COMPLETE + assert report.verdict == SafetyVerdict.MALICIOUS + assert any( + finding.rule_code == "DANGEROUS_GLOBAL" and finding.details.get("import_reference") in SYSTEM_GLOBALS + for finding in report.findings + ) + assert not any( + finding.rule_code == "DANGEROUS_CALL" and finding.details.get("import_reference") in SYSTEM_GLOBALS + for finding in report.findings + ) + + +def test_scan_bytes_memoize_index_advances_after_explicit_memo_write() -> None: + payload = b"\x80\x04cbuiltins\nlen\nq\x00cos\nsystem\n\x94h\x01\x8c\x04echo\x85R." + + report = scan_bytes(payload, source="memoize-after-put.pkl") + + assert report.status == ScanStatus.COMPLETE + assert report.verdict == SafetyVerdict.MALICIOUS + assert any( + finding.rule_code == "DANGEROUS_CALL" and finding.details.get("import_reference") in SYSTEM_GLOBALS + for finding in report.findings + ) + + def test_scan_stream_uses_explicit_source_and_does_not_leak_prior_scan_state() -> None: scanner = PickleScanner() From fd935af1468d22d8ab1036b1d5038ba34cc6b9cd Mon Sep 17 00:00:00 2001 From: Michael D'Angelo Date: Fri, 10 Apr 2026 11:00:47 -0700 Subject: [PATCH 2/2] fix: use memo size for picklescan memoize --- .../src/modelaudit_picklescan/engine/scanner.py | 9 ++------- packages/modelaudit-picklescan/tests/test_api.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/packages/modelaudit-picklescan/src/modelaudit_picklescan/engine/scanner.py b/packages/modelaudit-picklescan/src/modelaudit_picklescan/engine/scanner.py index ca43acb7..ba7facb2 100644 --- a/packages/modelaudit-picklescan/src/modelaudit_picklescan/engine/scanner.py +++ b/packages/modelaudit-picklescan/src/modelaudit_picklescan/engine/scanner.py @@ -313,7 +313,6 @@ def __init__( self.deadline = time.monotonic() + options.timeout_s self.stack: list[Any] = [] self.memo: dict[int | str, Any] = {} - self.next_memo_index = 0 self.findings: list[Finding] = [] self.notices: list[Notice] = [] self.errors: list[ScanError] = [] @@ -361,7 +360,6 @@ def run(self) -> None: self.first_pickle_end_pos = self.position_offset + self.stream.tell() self.stack.clear() self.memo.clear() - self.next_memo_index = 0 break if not parsed_opcode: @@ -612,16 +610,13 @@ def _handle_opcode(self, op_name: str, arg: Any, position: int) -> None: return if op_name in _MEMO_WRITE_OPCODES: - if self.stack: + if self.stack and isinstance(arg, int): self.memo[arg] = self.stack[-1] - if isinstance(arg, int): - self.next_memo_index = max(self.next_memo_index, arg + 1) return if op_name == "MEMOIZE": if self.stack: - self.memo[self.next_memo_index] = self.stack[-1] - self.next_memo_index += 1 + self.memo[len(self.memo)] = self.stack[-1] return if op_name in _MEMO_READ_OPCODES: diff --git a/packages/modelaudit-picklescan/tests/test_api.py b/packages/modelaudit-picklescan/tests/test_api.py index 3b1a5c3b..0d40b701 100644 --- a/packages/modelaudit-picklescan/tests/test_api.py +++ b/packages/modelaudit-picklescan/tests/test_api.py @@ -118,10 +118,18 @@ def test_scan_bytes_dict_mutation_operands_do_not_become_reduce_call_targets(pay ) -def test_scan_bytes_memoize_index_advances_after_explicit_memo_write() -> None: - payload = b"\x80\x04cbuiltins\nlen\nq\x00cos\nsystem\n\x94h\x01\x8c\x04echo\x85R." - - report = scan_bytes(payload, source="memoize-after-put.pkl") +@pytest.mark.parametrize( + ("payload", "source"), + [ + (b"\x80\x04cbuiltins\nlen\nq\x00cos\nsystem\n\x94h\x01\x8c\x04echo\x85R.", "memoize-after-put.pkl"), + (b"\x80\x04cbuiltins\nlen\nqdcos\nsystem\n\x94h\x01\x8c\x04echo\x85R.", "memoize-after-sparse-put.pkl"), + ], +) +def test_scan_bytes_memoize_index_uses_runtime_memo_size_after_explicit_memo_write( + payload: bytes, + source: str, +) -> None: + report = scan_bytes(payload, source=source) assert report.status == ScanStatus.COMPLETE assert report.verdict == SafetyVerdict.MALICIOUS