From 8e24b1935dffd9bd447a8ce49f8b6177d77fb612 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 12:27:54 -0400 Subject: [PATCH 1/2] Use safe YAML loaders for parameters --- .../secure-yaml-and-breakdowns.fixed.md | 1 + policyengine_core/parameters/config.py | 4 +- .../operations/homogenize_parameters.py | 54 ++++++++++++++++++- policyengine_core/tools/test_runner.py | 9 +++- tests/core/test_parameter_security.py | 52 ++++++++++++++++++ .../tools/test_runner/test_yaml_runner.py | 24 +++++++++ 6 files changed, 138 insertions(+), 6 deletions(-) create mode 100644 changelog.d/secure-yaml-and-breakdowns.fixed.md create mode 100644 tests/core/test_parameter_security.py diff --git a/changelog.d/secure-yaml-and-breakdowns.fixed.md b/changelog.d/secure-yaml-and-breakdowns.fixed.md new file mode 100644 index 000000000..586241d33 --- /dev/null +++ b/changelog.d/secure-yaml-and-breakdowns.fixed.md @@ -0,0 +1 @@ +Use safe YAML loaders and remove dynamic eval from parameter breakdown handling. diff --git a/policyengine_core/parameters/config.py b/policyengine_core/parameters/config.py index 73982d8a3..d2dbee20a 100644 --- a/policyengine_core/parameters/config.py +++ b/policyengine_core/parameters/config.py @@ -7,7 +7,7 @@ from policyengine_core.warnings import LibYAMLWarning try: - from yaml import CLoader as Loader + from yaml import CSafeLoader as Loader except ImportError: message = [ "libyaml is not installed in your environment.", @@ -17,7 +17,7 @@ ] warnings.warn(" ".join(message), LibYAMLWarning) from yaml import ( - Loader, + SafeLoader as Loader, ) # type: ignore # (see https://github.com/python/mypy/issues/1153#issuecomment-455802270) ALLOWED_PARAM_TYPES = (float, int, bool, type(None), typing.List) diff --git a/policyengine_core/parameters/operations/homogenize_parameters.py b/policyengine_core/parameters/operations/homogenize_parameters.py index 04888300c..cd310a9e5 100644 --- a/policyengine_core/parameters/operations/homogenize_parameters.py +++ b/policyengine_core/parameters/operations/homogenize_parameters.py @@ -1,3 +1,4 @@ +import ast import logging from typing import Any, Dict, List, Type @@ -71,8 +72,7 @@ def homogenize_parameter_node( elif dtype == bool: possible_values = [True, False] else: - # Try to execute the breakdown as Python code - possible_values = list(eval(first_breakdown)) + possible_values = evaluate_dynamic_breakdown(first_breakdown) if not hasattr(node, "children"): node = ParameterNode( node.name, @@ -119,3 +119,53 @@ def homogenize_parameter_node( node.children[child], breakdown[1:], variables, default_value ) return node + + +def evaluate_dynamic_breakdown(expression: str) -> List[Any]: + """Safely evaluate a dynamic breakdown expression. + + The parameter metadata only needs literal collections and the documented + ``range(...)`` / ``list(range(...))`` forms. Anything else is rejected. + """ + + parsed = ast.parse(expression, mode="eval") + evaluated = evaluate_dynamic_breakdown_node(parsed.body) + if isinstance(evaluated, range): + return list(evaluated) + if isinstance(evaluated, (list, tuple)): + return list(evaluated) + if isinstance(evaluated, set): + return list(evaluated) + raise ValueError( + f"Invalid dynamic breakdown expression '{expression}'. " + "Only literal collections and range() calls are allowed." + ) + + +def evaluate_dynamic_breakdown_node(node: ast.AST) -> Any: + if isinstance(node, ast.Constant): + return node.value + if isinstance(node, ast.List): + return [evaluate_dynamic_breakdown_node(element) for element in node.elts] + if isinstance(node, ast.Tuple): + return tuple(evaluate_dynamic_breakdown_node(element) for element in node.elts) + if isinstance(node, ast.Set): + return {evaluate_dynamic_breakdown_node(element) for element in node.elts} + if isinstance(node, ast.UnaryOp) and isinstance( + node.op, (ast.UAdd, ast.USub) + ): + operand = evaluate_dynamic_breakdown_node(node.operand) + return operand if isinstance(node.op, ast.UAdd) else -operand + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name): + if node.func.id == "range": + args = [evaluate_dynamic_breakdown_node(arg) for arg in node.args] + if node.keywords: + raise ValueError("range() keyword arguments are not allowed") + return range(*args) + if node.func.id == "list": + if len(node.args) != 1 or node.keywords: + raise ValueError("list() must contain a single positional argument") + return list(evaluate_dynamic_breakdown_node(node.args[0])) + raise ValueError( + f"Unsupported dynamic breakdown expression: {ast.unparse(node) if hasattr(ast, 'unparse') else type(node).__name__}" + ) diff --git a/policyengine_core/tools/test_runner.py b/policyengine_core/tools/test_runner.py index 03276236b..94edcdde8 100644 --- a/policyengine_core/tools/test_runner.py +++ b/policyengine_core/tools/test_runner.py @@ -31,7 +31,7 @@ def import_yaml(): import yaml try: - from yaml import CLoader as Loader + from yaml import CSafeLoader as Loader except ImportError: log.warning( " " @@ -126,7 +126,12 @@ def __init__(self, *, tax_benefit_system, options, **kwargs): def collect(self): try: tests = yaml.load(self.path.open(), Loader=Loader) - except (yaml.scanner.ScannerError, yaml.parser.ParserError, TypeError): + except ( + yaml.scanner.ScannerError, + yaml.parser.ParserError, + yaml.constructor.ConstructorError, + TypeError, + ): message = os.linesep.join( [ traceback.format_exc(), diff --git a/tests/core/test_parameter_security.py b/tests/core/test_parameter_security.py new file mode 100644 index 000000000..add0659ed --- /dev/null +++ b/tests/core/test_parameter_security.py @@ -0,0 +1,52 @@ +import pytest + +from policyengine_core.errors import ParameterParsingError +from policyengine_core.parameters import ParameterNode, homogenize_parameter_structures +from policyengine_core.parameters.helpers import _load_yaml_file + + +def test_parameter_yaml_loader_rejects_python_object_tags(tmp_path, monkeypatch): + calls = [] + + monkeypatch.setattr( + "os.system", + lambda command: calls.append(command) or 0, + ) + + yaml_path = tmp_path / "malicious.yaml" + yaml_path.write_text( + '!!python/object/apply:os.system ["echo pwned"]\n', + encoding="utf-8", + ) + + with pytest.raises(ParameterParsingError): + _load_yaml_file(str(yaml_path)) + + assert calls == [] + + +def test_homogenize_parameter_structures_rejects_dynamic_breakdown_code( + monkeypatch, +): + eval_calls = [] + + monkeypatch.setattr( + "builtins.eval", + lambda expression, globals=None, locals=None: eval_calls.append(expression) + or range(1, 4), + ) + + root = ParameterNode( + data={ + "value_by_category": { + "metadata": { + "breakdown": ['__import__("os").system("echo pwned")'], + }, + } + } + ) + + with pytest.raises(ValueError, match="breakdown"): + homogenize_parameter_structures(root, {}, default_value=0) + + assert eval_calls == [] diff --git a/tests/core/tools/test_runner/test_yaml_runner.py b/tests/core/tools/test_runner/test_yaml_runner.py index 72e3faadb..e15d191f1 100644 --- a/tests/core/tools/test_runner/test_yaml_runner.py +++ b/tests/core/tools/test_runner/test_yaml_runner.py @@ -203,6 +203,30 @@ def test_performance_tables_option_output(): clean_performance_files(paths) +def test_yaml_runner_rejects_python_object_tags(tmp_path, monkeypatch): + calls = [] + yaml_path = tmp_path / "malicious.yaml" + yaml_path.write_text( + '!!python/object/apply:os.system ["echo pwned"]\n', + encoding="utf-8", + ) + + monkeypatch.setattr( + "os.system", + lambda command: calls.append(command) or 0, + ) + + malicious_yaml_file = object.__new__(YamlFile) + malicious_yaml_file.path = yaml_path + malicious_yaml_file.options = {} + malicious_yaml_file.tax_benefit_system = TaxBenefitSystem() + + with pytest.raises(ValueError): + list(malicious_yaml_file.collect()) + + assert calls == [] + + def clean_performance_files(paths: List[str]): for path in paths: if os.path.isfile(path): From c94a7452ae9181d4040a12a6ff9eacb0f5ec9c1b Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 12:35:27 -0400 Subject: [PATCH 2/2] Format YAML security hardening changes --- .../parameter_node_metadata.py | 4 +- policyengine_core/parameters/config.py | 29 +++++-- .../operations/homogenize_parameters.py | 48 +++++++++-- policyengine_core/tools/test_runner.py | 22 +++-- tests/core/test_parameter_security.py | 82 +++++++++++++++++- .../tools/test_runner/test_yaml_runner.py | 83 +++++++++++++++++++ 6 files changed, 245 insertions(+), 23 deletions(-) diff --git a/policyengine_core/data_structures/parameter_node_metadata.py b/policyengine_core/data_structures/parameter_node_metadata.py index 0483d9cb0..e91ba2600 100644 --- a/policyengine_core/data_structures/parameter_node_metadata.py +++ b/policyengine_core/data_structures/parameter_node_metadata.py @@ -37,8 +37,8 @@ class ParameterNodeMetadata: metadata: breakdown: - region # If `region` is an Enum-type variable with possible values `[ENGLAND, WALES]` then these children will be added. - - range(1, 7) # This code is `eval`uated to produce the list `[1, 2, 3, 4, 5, 6]` which are then added as grandchildren. - - [True, False] # This list is added as great-grandchildren (using the same `eval` method as above). + - range(1, 7) # Safe dynamic form producing `[1, 2, 3, 4, 5, 6]`, which are then added as grandchildren. + - [True, False] # Literal collection form added as great-grandchildren. """ diff --git a/policyengine_core/parameters/config.py b/policyengine_core/parameters/config.py index d2dbee20a..7adec9d25 100644 --- a/policyengine_core/parameters/config.py +++ b/policyengine_core/parameters/config.py @@ -33,15 +33,30 @@ def date_constructor(_loader, node): def dict_no_duplicate_constructor(loader, node, deep=False): - keys = [key.value for key, value in node.value] + explicit_keys = {} + for key_node, _value_node in node.value: + if key_node.tag == "tag:yaml.org,2002:merge": + continue + key = loader.construct_object(key_node, deep=deep) + try: + if key in explicit_keys: + raise yaml.parser.ParserError( + "", node.start_mark, f"Found duplicate key '{key}'" + ) + except TypeError as exc: + raise yaml.constructor.ConstructorError( + "", node.start_mark, f"Found unhashable key '{key}'" + ) from exc + explicit_keys[key] = True - if len(keys) != len(set(keys)): - duplicate = next((key for key in keys if keys.count(key) > 1)) - raise yaml.parser.ParserError( - "", node.start_mark, f"Found duplicate key '{duplicate}'" - ) + loader.flatten_mapping(node) + pairs = loader.construct_pairs(node, deep=deep) + mapping = {} - return loader.construct_mapping(node, deep) + for key, value in pairs: + mapping[key] = value + + return mapping yaml.add_constructor( diff --git a/policyengine_core/parameters/operations/homogenize_parameters.py b/policyengine_core/parameters/operations/homogenize_parameters.py index cd310a9e5..39164b721 100644 --- a/policyengine_core/parameters/operations/homogenize_parameters.py +++ b/policyengine_core/parameters/operations/homogenize_parameters.py @@ -7,6 +7,8 @@ from policyengine_core.parameters.parameter_node import ParameterNode from policyengine_core.variables import Variable +MAX_DYNAMIC_BREAKDOWN_VALUES = 10_000 + def homogenize_parameter_structures( root: ParameterNode, variables: Dict[str, Variable], default_value: Any = 0 @@ -44,6 +46,11 @@ def get_breakdown_variables(node: ParameterNode) -> List[str]: f"Invalid breakdown metadata for parameter {node.name}: {type(breakdown)}" ) return None + if len(breakdown) == 0: + logging.warning( + f"Invalid breakdown metadata for parameter {node.name}: empty list" + ) + return None return breakdown else: return None @@ -131,10 +138,13 @@ def evaluate_dynamic_breakdown(expression: str) -> List[Any]: parsed = ast.parse(expression, mode="eval") evaluated = evaluate_dynamic_breakdown_node(parsed.body) if isinstance(evaluated, range): + validate_dynamic_breakdown_range_cardinality(evaluated, expression) return list(evaluated) if isinstance(evaluated, (list, tuple)): + validate_dynamic_breakdown_cardinality(len(evaluated), expression) return list(evaluated) if isinstance(evaluated, set): + validate_dynamic_breakdown_cardinality(len(evaluated), expression) return list(evaluated) raise ValueError( f"Invalid dynamic breakdown expression '{expression}'. " @@ -142,18 +152,39 @@ def evaluate_dynamic_breakdown(expression: str) -> List[Any]: ) +def validate_dynamic_breakdown_cardinality(count: int, expression: str) -> None: + if count > MAX_DYNAMIC_BREAKDOWN_VALUES: + raise ValueError( + f"Dynamic breakdown expression '{expression}' produces {count} values, " + f"which exceeds the maximum of {MAX_DYNAMIC_BREAKDOWN_VALUES}." + ) + + +def validate_dynamic_breakdown_range_cardinality( + values: range, expression: str +) -> None: + try: + count = len(values) + except OverflowError as exc: + raise ValueError( + f"Dynamic breakdown expression '{expression}' produces too many values." + ) from exc + validate_dynamic_breakdown_cardinality(count, expression) + + def evaluate_dynamic_breakdown_node(node: ast.AST) -> Any: if isinstance(node, ast.Constant): return node.value if isinstance(node, ast.List): + validate_dynamic_breakdown_cardinality(len(node.elts), ast.unparse(node)) return [evaluate_dynamic_breakdown_node(element) for element in node.elts] if isinstance(node, ast.Tuple): + validate_dynamic_breakdown_cardinality(len(node.elts), ast.unparse(node)) return tuple(evaluate_dynamic_breakdown_node(element) for element in node.elts) if isinstance(node, ast.Set): + validate_dynamic_breakdown_cardinality(len(node.elts), ast.unparse(node)) return {evaluate_dynamic_breakdown_node(element) for element in node.elts} - if isinstance(node, ast.UnaryOp) and isinstance( - node.op, (ast.UAdd, ast.USub) - ): + if isinstance(node, ast.UnaryOp) and isinstance(node.op, (ast.UAdd, ast.USub)): operand = evaluate_dynamic_breakdown_node(node.operand) return operand if isinstance(node.op, ast.UAdd) else -operand if isinstance(node, ast.Call) and isinstance(node.func, ast.Name): @@ -161,11 +192,18 @@ def evaluate_dynamic_breakdown_node(node: ast.AST) -> Any: args = [evaluate_dynamic_breakdown_node(arg) for arg in node.args] if node.keywords: raise ValueError("range() keyword arguments are not allowed") - return range(*args) + result = range(*args) + validate_dynamic_breakdown_range_cardinality(result, ast.unparse(node)) + return result if node.func.id == "list": if len(node.args) != 1 or node.keywords: raise ValueError("list() must contain a single positional argument") - return list(evaluate_dynamic_breakdown_node(node.args[0])) + evaluated = evaluate_dynamic_breakdown_node(node.args[0]) + if isinstance(evaluated, (range, list, tuple, set)): + return evaluated + raise ValueError( + "list() only supports range() and literal collection expressions" + ) raise ValueError( f"Unsupported dynamic breakdown expression: {ast.unparse(node) if hasattr(ast, 'unparse') else type(node).__name__}" ) diff --git a/policyengine_core/tools/test_runner.py b/policyengine_core/tools/test_runner.py index 94edcdde8..6b8d4aefc 100644 --- a/policyengine_core/tools/test_runner.py +++ b/policyengine_core/tools/test_runner.py @@ -126,12 +126,7 @@ def __init__(self, *, tax_benefit_system, options, **kwargs): def collect(self): try: tests = yaml.load(self.path.open(), Loader=Loader) - except ( - yaml.scanner.ScannerError, - yaml.parser.ParserError, - yaml.constructor.ConstructorError, - TypeError, - ): + except (yaml.YAMLError, TypeError): message = os.linesep.join( [ traceback.format_exc(), @@ -144,6 +139,11 @@ def collect(self): tests: List[Dict] = [tests] for test in tests: + if not isinstance(test, dict): + raise ValueError( + f"'{self.path}' is not a valid YAML test file. " + "Expected a mapping or a list of mappings." + ) if not self.should_ignore(test): yield YamlItem.from_parent( self, @@ -155,11 +155,19 @@ def collect(self): def should_ignore(self, test): name_filter = self.options.get("name_filter") + keywords = test.get("keywords", []) + if keywords is None: + keywords = [] + if not isinstance(keywords, list): + raise ValueError( + f"'{self.path}' is not a valid YAML test file. " + "'keywords' must be a list." + ) return ( name_filter is not None and name_filter not in os.path.splitext(self.fspath.basename)[0] and name_filter not in test.get("name", "") - and name_filter not in test.get("keywords", []) + and name_filter not in keywords ) diff --git a/tests/core/test_parameter_security.py b/tests/core/test_parameter_security.py index add0659ed..a503ead9a 100644 --- a/tests/core/test_parameter_security.py +++ b/tests/core/test_parameter_security.py @@ -3,6 +3,10 @@ from policyengine_core.errors import ParameterParsingError from policyengine_core.parameters import ParameterNode, homogenize_parameter_structures from policyengine_core.parameters.helpers import _load_yaml_file +from policyengine_core.parameters.operations.homogenize_parameters import ( + MAX_DYNAMIC_BREAKDOWN_VALUES, + evaluate_dynamic_breakdown, +) def test_parameter_yaml_loader_rejects_python_object_tags(tmp_path, monkeypatch): @@ -32,8 +36,9 @@ def test_homogenize_parameter_structures_rejects_dynamic_breakdown_code( monkeypatch.setattr( "builtins.eval", - lambda expression, globals=None, locals=None: eval_calls.append(expression) - or range(1, 4), + lambda expression, globals=None, locals=None: ( + eval_calls.append(expression) or range(1, 4) + ), ) root = ParameterNode( @@ -50,3 +55,76 @@ def test_homogenize_parameter_structures_rejects_dynamic_breakdown_code( homogenize_parameter_structures(root, {}, default_value=0) assert eval_calls == [] + + +def test_homogenize_parameter_structures_rejects_oversized_dynamic_breakdown(): + root = ParameterNode( + data={ + "value_by_category": { + "metadata": { + "breakdown": [f"list(range({MAX_DYNAMIC_BREAKDOWN_VALUES + 1}))"], + }, + } + } + ) + + with pytest.raises(ValueError, match="exceeds the maximum"): + homogenize_parameter_structures(root, {}, default_value=0) + + +def test_homogenize_parameter_structures_rejects_overflowing_dynamic_breakdown(): + huge_stop = "1" + ("0" * 100) + root = ParameterNode( + data={ + "value_by_category": { + "metadata": { + "breakdown": [f"range(0, {huge_stop})"], + }, + } + } + ) + + with pytest.raises(ValueError, match="too many values"): + homogenize_parameter_structures(root, {}, default_value=0) + + +def test_parameter_yaml_loader_rejects_implicit_duplicate_keys(tmp_path): + yaml_path = tmp_path / "duplicate-bools.yaml" + yaml_path.write_text("true: 1\nTrue: 2\n", encoding="utf-8") + + with pytest.raises(ParameterParsingError, match="duplicate key"): + _load_yaml_file(str(yaml_path)) + + +def test_parameter_yaml_loader_allows_merge_key_overrides(tmp_path): + yaml_path = tmp_path / "merge-override.yaml" + yaml_path.write_text( + "defaults: &defaults\n value: 1\nmerged:\n <<: *defaults\n value: 2\n", + encoding="utf-8", + ) + + result = _load_yaml_file(str(yaml_path)) + + assert result["merged"]["value"] == 2 + + +def test_evaluate_dynamic_breakdown_allows_documented_safe_forms(): + assert evaluate_dynamic_breakdown("list(range(1, 4))") == [1, 2, 3] + assert evaluate_dynamic_breakdown("[1, 2, -3]") == [1, 2, -3] + assert evaluate_dynamic_breakdown('("a", "b")') == ["a", "b"] + + +def test_homogenize_parameter_structures_ignores_empty_breakdown_lists(): + root = ParameterNode( + data={ + "value_by_category": { + "metadata": { + "breakdown": [], + }, + } + } + ) + + result = homogenize_parameter_structures(root, {}, default_value=0) + + assert result is root diff --git a/tests/core/tools/test_runner/test_yaml_runner.py b/tests/core/tools/test_runner/test_yaml_runner.py index e15d191f1..9bd6f36f6 100644 --- a/tests/core/tools/test_runner/test_yaml_runner.py +++ b/tests/core/tools/test_runner/test_yaml_runner.py @@ -227,6 +227,89 @@ def test_yaml_runner_rejects_python_object_tags(tmp_path, monkeypatch): assert calls == [] +def test_yaml_runner_wraps_composer_errors(tmp_path): + yaml_path = tmp_path / "invalid-anchor.yaml" + yaml_path.write_text("value: *missing_anchor\n", encoding="utf-8") + + invalid_yaml_file = object.__new__(YamlFile) + invalid_yaml_file.path = yaml_path + invalid_yaml_file.options = {} + invalid_yaml_file.tax_benefit_system = TaxBenefitSystem() + + with pytest.raises(ValueError, match="not a valid YAML file"): + list(invalid_yaml_file.collect()) + + +def test_yaml_runner_rejects_scalar_roots(tmp_path): + yaml_path = tmp_path / "scalar.yaml" + yaml_path.write_text("foo\n", encoding="utf-8") + + scalar_yaml_file = object.__new__(YamlFile) + scalar_yaml_file.path = yaml_path + scalar_yaml_file.options = {} + scalar_yaml_file.tax_benefit_system = TaxBenefitSystem() + + with pytest.raises(ValueError, match="list of mappings"): + list(scalar_yaml_file.collect()) + + +def test_yaml_runner_rejects_scalar_keywords(tmp_path): + yaml_path = tmp_path / "invalid-keywords.yaml" + yaml_path.write_text( + "name: Example\nkeywords: 0\noutput: {}\n", + encoding="utf-8", + ) + + invalid_yaml_file = object.__new__(YamlFile) + invalid_yaml_file.path = yaml_path + invalid_yaml_file.options = {"name_filter": "missing"} + invalid_yaml_file.tax_benefit_system = TaxBenefitSystem() + + with pytest.raises(ValueError, match="'keywords' must be a list"): + list(invalid_yaml_file.collect()) + + +def test_yaml_runner_allows_yaml_merge_anchors(tmp_path): + yaml_path = tmp_path / "anchors.yaml" + yaml_path.write_text( + """ +- name: define anchor + input: + persons: &persons + Alicia: + salary: 4000 + households: + household: + parents: [Alicia] + output: + salary: 4000 + +- name: merge anchor + input: + persons: + <<: *persons + households: + household: + parents: [Alicia] + output: + salary: 4000 +""".strip(), + encoding="utf-8", + ) + + yaml_file = object.__new__(YamlFile) + yaml_file.config = None + yaml_file.session = None + yaml_file._nodeid = "anchors" + yaml_file.path = yaml_path + yaml_file.options = {} + yaml_file.tax_benefit_system = TaxBenefitSystem() + + collected = list(yaml_file.collect()) + + assert len(collected) == 2 + + def clean_performance_files(paths: List[str]): for path in paths: if os.path.isfile(path):