Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions crawl4ai/extraction_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import json
import time
from enum import IntFlag, auto
from decimal import Decimal, ROUND_HALF_UP

from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
from .config import (
Expand Down Expand Up @@ -1270,16 +1271,20 @@ def _apply_transform(self, value, transform):
Apply a transformation to a value.

How it works:
1. Checks the transformation type (e.g., `lowercase`, `strip`).
1. Checks the transformation type (e.g., `lowercase`, `strip`, `int`).
2. Applies the transformation to the value.
3. Returns the transformed value.

Special Handling:
Values for `int` are rounded to the nearest integer, with ties rounding away from zero (e.g., 1.5 -> 2, -1.5 -> -2).
Values for `bool` default to False if not matched.

Args:
value (str): The value to transform.
transform (str): The type of transformation to apply.

Returns:
str: The transformed value.
str | int | float | bool: The transformed value.
"""

if transform == "lowercase":
Expand All @@ -1288,6 +1293,13 @@ def _apply_transform(self, value, transform):
return value.upper()
elif transform == "strip":
return value.strip()
elif transform == "int":
return int(Decimal(value).quantize(Decimal("0."), rounding=ROUND_HALF_UP))
elif transform == "float":
return float(value)
elif transform == "bool":
bool_map = {"true": True, "false" : False, "1": True, "0": False}
return bool_map.get(value.strip().lower(), False)
return value

def _compute_field(self, item, field):
Expand Down
16 changes: 11 additions & 5 deletions docs/md_v2/api/strategies.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,12 @@ schema = {
{
"name": str, # Field name
"selector": str, # CSS selector
"type": str, # Field type: "text", "attribute", "html", "regex"
"type": str, # Field type: "text", "attribute", "html", "regex"
"attribute": str, # For type="attribute"
"pattern": str, # For type="regex"
"transform": str, # Optional: "lowercase", "uppercase", "strip"
"pattern": str, # For type="regex"
"transform": str, # Optional: "lowercase", "uppercase", "strip", "int", "float", "bool"
"default": Any, # Default value if extraction fails
"source": str, # Optional: navigate to sibling first, e.g. "+ tr"
"source": str, # Optional: navigate to sibling first, e.g. "+ tr"
}
]
}
Expand Down Expand Up @@ -245,7 +245,7 @@ async with AsyncWebCrawler() as crawler:
print(f"{item['label']}: {item['value']}")
```

### CSS Extraction
### JSON/CSS Extraction

```python
from crawl4ai import JsonCssExtractionStrategy
Expand All @@ -266,6 +266,12 @@ schema = {
"type": "text",
"transform": "strip"
},
{
"name": "quantity",
"selector": ".quantity",
"type": "text",
"transform": "int"
},
{
"name": "image",
"selector": "img",
Expand Down
71 changes: 71 additions & 0 deletions tests/test_pr_1932.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
Tests for PR #1932 - feature/ Add additional return data types to JsonCssExtractionStrategy using transform

Adds a feature to specify additional return value data types directly within the JsonCssExtractionStrategy schema.
The int, float, and bool data types can be requested through the "transform" key of the extraction schema.

"""

from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

def transform_result(value, transform) -> str | int | float | bool:
    """Apply ``transform`` to ``value`` through a freshly built strategy.

    A ``JsonCssExtractionStrategy`` is created with a placeholder schema and
    its ``_apply_transform`` method performs the conversion.

    Args:
        value: The raw value handed to the transform.
        transform: Name of the transform to apply (e.g. ``"int"``).

    Returns:
        Whatever ``_apply_transform`` returns for the given inputs.
    """
    placeholder_schema = {"key": "value"}
    extractor = JsonCssExtractionStrategy(schema=placeholder_schema)
    transformed = extractor._apply_transform(value, transform)
    return transformed


class TestExistingTransforms:
    """Regression coverage for the pre-existing string transforms.

    Guards the `uppercase`, `lowercase` and `strip` operations against
    accidental changes.
    """

    def test_uppercase(self):
        outcome = transform_result("test", "uppercase")
        assert outcome == "TEST"

    def test_lowercase(self):
        outcome = transform_result("TEST", "lowercase")
        assert outcome == "test"

    def test_strip(self):
        outcome = transform_result(" test ", "strip")
        assert outcome == "test"


class TestAddedTransforms:
    """Unit tests for added JsonElementExtractionStrategy transform operations.

    The `int`, `float` and `bool` transforms exist to change the *type* of
    the extracted value, so every test pins the returned type as well as the
    value. Plain equality is not enough: `2.0 == 2` and `1 == True` both
    hold in Python.
    """

    @staticmethod
    def _int_result(raw):
        """Run the `int` transform on ``raw`` and require an exact ``int``."""
        result = transform_result(raw, "int")
        # Exact type check: bool is a subclass of int, so isinstance() would
        # also accept True/False.
        assert type(result) is int
        return result

    def test_int(self):
        assert self._int_result("0") == 0

    def test_int_pos_round_up(self):
        assert self._int_result("1.5") == 2

    def test_int_pos_round_down(self):
        assert self._int_result("1.4") == 1

    def test_int_neg_round_up(self):
        assert self._int_result("-1.5") == -2

    def test_int_neg_round_down(self):
        assert self._int_result("-1.4") == -1

    def test_float(self):
        result = transform_result("3.1416", "float")
        assert isinstance(result, float)
        assert result == 3.1416

    # Identity checks (`is True` / `is False`) ensure a real bool is
    # returned, not merely a truthy/falsy value such as 1 or 0.

    def test_bool_true_upper(self):
        assert transform_result("TRUE", "bool") is True

    def test_bool_false_upper(self):
        assert transform_result("FALSE", "bool") is False

    def test_bool_true_lower(self):
        assert transform_result("true", "bool") is True

    def test_bool_false_lower(self):
        assert transform_result("false", "bool") is False

    def test_bool_one(self):
        assert transform_result("1", "bool") is True

    def test_bool_zero(self):
        assert transform_result("0", "bool") is False

    def test_bool_default(self):
        # Unrecognised input falls back to False.
        assert transform_result("Other", "bool") is False