From 639da1f97f916d5b2b127c95efe8322675d7d530 Mon Sep 17 00:00:00 2001 From: HiKaliber <217629972+HiKaliber@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:51:45 -0600 Subject: [PATCH] feature/json-extraction-addtl-transform-data-types Adds a feature to specify additional return value data types directly within the JsonCssExtractionStrategy schema. Int, float, and bool data types can be specified using the "transform" key of the extraction schema (e.g., {"transform": "int" | "float" | "bool"}). Int rounds half away from zero (common rounding, e.g., 1.5 -> 2 and -1.5 -> -2). Bool accepts "true"/"false"/"1"/"0" (case-insensitive, surrounding whitespace ignored) and defaults to False for any other value. Changes: - Adds import of Decimal and ROUND_HALF_UP from the decimal module -- used to ensure accurate common rounding for int - Adds implementation to extraction_strategy - Updates website documentation with guidance/examples - Adds unit and regression tests (pytest) --- crawl4ai/extraction_strategy.py | 16 +++++++- docs/md_v2/api/strategies.md | 16 +++++--- tests/test_pr_1932.py | 71 +++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 7 deletions(-) create mode 100644 tests/test_pr_1932.py diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index ed039890e..89029fd87 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -5,6 +5,7 @@ import json import time from enum import IntFlag, auto +from decimal import Decimal, ROUND_HALF_UP from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA from .config import ( @@ -1270,16 +1271,20 @@ def _apply_transform(self, value, transform): Apply a transformation to a value. How it works: - 1. Checks the transformation type (e.g., `lowercase`, `strip`). + 1. Checks the transformation type (e.g., `lowercase`, `strip`, `int`). 2. Applies the transformation to the value. 3. Returns the transformed value. 
+ Special Handling: + Values for `int` round away from zero using common rounding (e.g., 1.5 -> 2). + Values for `bool` default to False if not matched. + Args: value (str): The value to transform. transform (str): The type of transformation to apply. Returns: - str: The transformed value. + str | int | float | bool: The transformed value. """ if transform == "lowercase": @@ -1288,6 +1293,13 @@ def _apply_transform(self, value, transform): return value.upper() elif transform == "strip": return value.strip() + elif transform == "int": + return int(Decimal(value).quantize(Decimal("0."), rounding=ROUND_HALF_UP)) + elif transform == "float": + return float(value) + elif transform == "bool": + bool_map = {"true": True, "false" : False, "1": True, "0": False} + return bool_map.get(value.strip().lower(), False) return value def _compute_field(self, item, field): diff --git a/docs/md_v2/api/strategies.md b/docs/md_v2/api/strategies.md index 8a40812ab..fdc893dcf 100644 --- a/docs/md_v2/api/strategies.md +++ b/docs/md_v2/api/strategies.md @@ -116,12 +116,12 @@ schema = { { "name": str, # Field name "selector": str, # CSS selector - "type": str, # Field type: "text", "attribute", "html", "regex" + "type": str, # Field type: "text", "attribute", "html", "regex" "attribute": str, # For type="attribute" - "pattern": str, # For type="regex" - "transform": str, # Optional: "lowercase", "uppercase", "strip" + "pattern": str, # For type="regex" + "transform": str, # Optional: "lowercase", "uppercase", "strip", "int", "float", "bool" "default": Any, # Default value if extraction fails - "source": str, # Optional: navigate to sibling first, e.g. "+ tr" + "source": str, # Optional: navigate to sibling first, e.g. 
"+ tr" } ] } @@ -245,7 +245,7 @@ async with AsyncWebCrawler() as crawler: print(f"{item['label']}: {item['value']}") ``` -### CSS Extraction +### JSON/CSS Extraction ```python from crawl4ai import JsonCssExtractionStrategy @@ -266,6 +266,12 @@ schema = { "type": "text", "transform": "strip" }, + { + "name": "quantity", + "selector": ".quantity", + "type": "text", + "transform": "int" + }, { "name": "image", "selector": "img", diff --git a/tests/test_pr_1932.py b/tests/test_pr_1932.py new file mode 100644 index 000000000..2c79346c6 --- /dev/null +++ b/tests/test_pr_1932.py @@ -0,0 +1,71 @@ +""" +Tests for PR #1932 - feature/ Add additional return data types to JsonCssExtractionStrategy using transform + +Adds a feature to specify additional return value data types directly within the JsonCssExtractionStrategy schema. +Int, float, and bool data types can be specified using the "transform" key-pair of the extraction Schema. + +""" + +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +def transform_result(value, transform) -> str | int | float | bool: + + strategy = JsonCssExtractionStrategy(schema={"key": "value"}) + + return strategy._apply_transform(value, transform) + + +class TestExistingTransforms: + """Regression tests for existing JsonElementExtractionStrategy transform operations.""" + + def test_uppercase(self): + assert transform_result("test", "uppercase") == "TEST" + + def test_lowercase(self): + assert transform_result("TEST", "lowercase") == "test" + + def test_strip(self): + assert transform_result(" test ", "strip") == "test" + + +class TestAddedTransforms: + """Unit tests for added JsonElementExtractionStrategy transform operations.""" + + def test_int(self): + assert transform_result("0", "int") == 0 + + def test_int_pos_round_up(self): + assert transform_result("1.5", "int") == 2 + + def test_int_pos_round_down(self): + assert transform_result("1.4", "int") == 1 + + def test_int_neg_round_up(self): + assert 
transform_result("-1.5", "int") == -2 + + def test_int_neg_round_down(self): + assert transform_result("-1.4", "int") == -1 + + def test_float(self): + assert transform_result("3.1416", "float") == 3.1416 + + def test_bool_true_upper(self): + assert transform_result("TRUE", "bool") == True + + def test_bool_false_upper(self): + assert transform_result("FALSE", "bool") == False + + def test_bool_true_lower(self): + assert transform_result("true", "bool") == True + + def test_bool_false_lower(self): + assert transform_result("false", "bool") == False + + def test_bool_one(self): + assert transform_result("1", "bool") == True + + def test_bool_zero(self): + assert transform_result("0", "bool") == False + + def test_bool_default(self): + assert transform_result("Other", "bool") == False \ No newline at end of file