Commit e525fac

Added integration test markers:

- Added the integration marker to pytest configuration in tox.ini
- Applied the marker to both new Spark tests and existing text service integration tests

Created Spark integration tests:

- Implemented tests that run Spark in local mode
- Created a shared SparkService fixture to avoid multiple context issues
- Properly handled PySpark Row objects in assertions

Fixed SparkService for testing:

- Updated SparkService to handle Java security manager issues
- Added configuration for running in local mode
- Improved error handling and dependency management

Added tox integration environment:

- Created a dedicated tox environment for running integration tests
- Ensured PySpark is properly installed in the test environment
1 parent 26f2ae1 commit e525fac

File tree: 5 files changed (+190 −20 lines)

datafog/services/spark_service.py

Lines changed: 58 additions & 19 deletions
```diff
@@ -7,6 +7,7 @@
 
 import importlib
 import json
+import os
 import subprocess
 import sys
 from typing import Any, List
@@ -20,25 +21,55 @@ class SparkService:
     data reading and package installation.
     """
 
-    def __init__(self):
-        # First import necessary modules
-        from pyspark.sql import DataFrame, SparkSession
-        from pyspark.sql.functions import udf
-        from pyspark.sql.types import ArrayType, StringType
-
-        # Assign fields
-        self.SparkSession = SparkSession
-        self.DataFrame = DataFrame
-        self.udf = udf
-        self.ArrayType = ArrayType
-        self.StringType = StringType
-
-        # Now create spark session and ensure pyspark is installed
+    def __init__(self, master=None):
+        self.master = master
+
+        # Ensure pyspark is installed first
         self.ensure_installed("pyspark")
-        self.spark = self.create_spark_session()
+
+        # Now import necessary modules after ensuring pyspark is installed
+        try:
+            from pyspark.sql import DataFrame, SparkSession
+            from pyspark.sql.functions import udf
+            from pyspark.sql.types import ArrayType, StringType
+
+            # Assign fields
+            self.SparkSession = SparkSession
+            self.DataFrame = DataFrame
+            self.udf = udf
+            self.ArrayType = ArrayType
+            self.StringType = StringType
+
+            # Create the spark session
+            self.spark = self.create_spark_session()
+        except ImportError as e:
+            raise ImportError(f"Failed to import PySpark modules: {e}. "
+                              f"Make sure PySpark is installed correctly.")
 
     def create_spark_session(self):
-        return self.SparkSession.builder.appName("datafog").getOrCreate()
+        # Check if we're running in a test environment
+        in_test_env = 'PYTEST_CURRENT_TEST' in os.environ or 'TOX_ENV_NAME' in os.environ
+
+        # Set Java system properties to handle security manager issues
+        # This is needed for newer Java versions
+        os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.driver.allowMultipleContexts=true pyspark-shell'
+
+        # Create a builder with the app name
+        builder = self.SparkSession.builder.appName("datafog")
+
+        # Add configuration to work around security manager issues
+        builder = builder.config("spark.driver.allowMultipleContexts", "true")
+        builder = builder.config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow")
+
+        # If master is specified, use it
+        if self.master:
+            builder = builder.master(self.master)
+        # Otherwise, if we're in a test environment, use local mode
+        elif in_test_env:
+            builder = builder.master("local[1]")
+
+        # Create and return the session
+        return builder.getOrCreate()
 
     def read_json(self, path: str) -> List[dict]:
         return self.spark.read.json(path).collect()
@@ -47,6 +78,14 @@ def ensure_installed(self, package_name):
         try:
             importlib.import_module(package_name)
         except ImportError:
-            subprocess.check_call(
-                [sys.executable, "-m", "pip", "install", package_name]
-            )
+            print(f"Installing {package_name}...")
+            try:
+                subprocess.check_call(
+                    [sys.executable, "-m", "pip", "install", package_name]
+                )
+                print(f"{package_name} installed successfully.")
+            except subprocess.CalledProcessError as e:
+                print(f"Failed to install {package_name}: {e}")
+                raise ImportError(f"Could not install {package_name}. "
+                                  f"Please install it manually with 'pip install {package_name}'.")
+
```
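
For orientation, a minimal usage sketch of the reworked service in local mode (assuming PySpark is installed and a JSON-lines input like the one the tests below generate; the file path and field name here are illustrative, not part of this commit):

```python
from datafog.services.spark_service import SparkService

# Explicit local master, exactly as the integration tests below construct it;
# without it, local mode is only chosen when a test environment is detected.
service = SparkService(master="local[1]")

# read_json collects the file into a list of PySpark Row objects.
rows = service.read_json("people.jsonl")  # illustrative path
names = [row.name for row in rows]        # assumes records carry a "name" field

service.spark.stop()
```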

notes/story-1.7-tkt.md

Lines changed: 27 additions & 0 deletions
New file:

````markdown
**Story 1.7: Integration tests (no mocks)**

- [x] Run pytest with `-m "integration"` to run Spark in local mode.
- [ ] Smoke test the CLI with a tmp file.
- [ ] OCR path behind `PYTEST_DONUT=yes` flag.

## Implementation Notes

### Spark Integration Tests

1. Added integration marker to pytest configuration in tox.ini
2. Created test_spark_integration.py with tests for SparkService in local mode
3. Updated SparkService to support local mode for integration testing
4. Added integration markers to existing text_service_integration.py tests
5. Added a dedicated tox environment for running integration tests

To run the integration tests:

```bash
tox -e integration
```

Or directly with pytest:

```bash
pytest -m "integration"
```
````
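
For reference, opting a test into this suite is just a matter of carrying the marker; a minimal sketch (the real tests added by this commit follow below, and the `spark_service` fixture is defined there):

```python
import pytest


@pytest.mark.integration
def test_touches_real_spark(spark_service):
    # Collected only when pytest is invoked with -m "integration".
    assert spark_service.spark is not None
```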

tests/test_spark_integration.py

Lines changed: 80 additions & 0 deletions
New file:

```python
"""Integration tests for SparkService in local mode."""

import json
import os
import tempfile

import pytest

from datafog.services.spark_service import SparkService


@pytest.fixture(scope="module")
def spark_service():
    """Create a shared SparkService instance for all tests."""
    # Initialize SparkService with explicit local mode
    service = SparkService(master="local[1]")

    yield service

    # Clean up after all tests
    if hasattr(service, 'spark') and service.spark is not None:
        service.spark.stop()


@pytest.fixture
def sample_json_data():
    """Create a temporary JSON file with sample data for testing."""
    data = [
        {"name": "John Doe", "email": "john.doe@example.com", "age": 30},
        {"name": "Jane Smith", "email": "jane.smith@example.com", "age": 25},
        {"name": "Bob Johnson", "email": "bob.johnson@example.com", "age": 40},
    ]

    # Create a temporary file
    with tempfile.NamedTemporaryFile(suffix=".json", delete=False, mode="w") as f:
        for item in data:
            f.write(json.dumps(item) + "\n")
        temp_file = f.name

    yield temp_file

    # Clean up the temporary file after the test
    if os.path.exists(temp_file):
        os.remove(temp_file)


@pytest.mark.integration
def test_spark_service_initialization(spark_service):
    """Test that SparkService can be initialized in local mode."""
    # Verify that the Spark session was created successfully
    assert spark_service.spark is not None
    assert spark_service.spark.sparkContext.appName == "datafog"
    assert spark_service.spark.sparkContext.master.startswith("local")

    # Verify that the necessary Spark classes are available
    assert spark_service.DataFrame is not None
    assert spark_service.SparkSession is not None
    assert spark_service.udf is not None


@pytest.mark.integration
def test_spark_read_json(spark_service, sample_json_data):
    """Test that SparkService can read JSON data in local mode."""
    # Read the JSON data
    result = spark_service.read_json(sample_json_data)

    # Verify the result
    assert len(result) == 3, f"Expected 3 rows, got {len(result)}"

    # PySpark Row objects have a __contains__ method and can be accessed like
    # dictionaries, but they're not actually dictionaries
    assert all(hasattr(item, "name") for item in result), "Missing 'name' field"
    assert all(hasattr(item, "email") for item in result), "Missing 'email' field"
    assert all(hasattr(item, "age") for item in result), "Missing 'age' field"

    # Verify specific values
    names = [item.name for item in result]
    assert "John Doe" in names, f"Expected 'John Doe' in {names}"
    assert "Jane Smith" in names, f"Expected 'Jane Smith' in {names}"
    assert "Bob Johnson" in names, f"Expected 'Bob Johnson' in {names}"
```

tests/test_text_service_integration.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -17,6 +17,7 @@ def real_text_service():
     return TextService(text_chunk_length=1000)  # Larger chunk to avoid multiple calls
 
 
+@pytest.mark.integration
 def test_engine_regex_detects_simple_entities():
     """Test that regex engine correctly detects simple entities like emails and phones."""
     # Sample text with patterns that regex should easily detect
@@ -36,6 +37,7 @@ def test_engine_regex_detects_simple_entities():
     assert "123-45-6789" in result.get("SSN", [])
 
 
+@pytest.mark.integration
 def test_engine_auto_fallbacks_to_spacy():
     """Test that auto mode works correctly with entity detection."""
     # We need to test the auto mode in a more controlled way
@@ -64,6 +66,7 @@ def test_engine_auto_fallbacks_to_spacy():
     assert any("john.smith@example.com" in email for email in auto_result["EMAIL"])
 
 
+@pytest.mark.integration
 def test_engine_spacy_only():
     """Test that spaCy engine is always used regardless of regex potential hits."""
     # Sample text with both regex-detectable and spaCy-detectable entities
@@ -89,6 +92,7 @@ def test_engine_spacy_only():
     assert "EMAIL" not in spacy_result or not spacy_result["EMAIL"]
 
 
+@pytest.mark.integration
 def test_structured_annotation_output():
     """Test that structured=True returns list of Span objects."""
     text = "John Smith's email is john.smith@example.com"
@@ -131,6 +135,7 @@ def test_structured_annotation_output():
     # which we've already verified above
 
 
+@pytest.mark.integration
 def test_debug_entity_types():
     """Debug test to print the actual entity types returned by spaCy."""
     # Sample text with named entities
```

tox.ini

Lines changed: 20 additions & 1 deletion
```diff
@@ -2,6 +2,23 @@
 envlist = py310,py311,py312
 isolated_build = True
 
+[testenv:integration]
+deps =
+    pytest==7.4.0
+    pytest-asyncio==0.21.0
+    pytest-cov
+    pyspark>=3.0.0
+    -r requirements-dev.txt
+extras = all
+allowlist_externals =
+    tesseract
+    pip
+    python
+commands =
+    pip install --no-cache-dir -r requirements-dev.txt
+    pip install --no-cache-dir pyspark>=3.0.0
+    python run_tests.py -m integration
+
 [testenv]
 deps =
     pytest==7.4.0
@@ -36,4 +53,6 @@ commands =
     mypy datafog tests
 
 [pytest]
-asyncio_mode = auto
+asyncio_mode = auto
+markers =
+    integration: marks tests as integration tests that may require external dependencies
```
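
The new `[pytest]` markers entry registers the marker so pytest does not warn about it; selection then works through standard `-m` expressions. The `run_tests.py` script invoked by the tox environment is not part of this commit; a hypothetical minimal wrapper of that shape would simply forward its arguments to pytest:

```python
# run_tests.py -- hypothetical sketch; the real script is not shown in this commit.
import sys

import pytest

if __name__ == "__main__":
    # Forward CLI arguments (e.g. "-m integration") straight to pytest.
    sys.exit(pytest.main(sys.argv[1:]))
```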
