from datafog.main import DataFog
from datafog.models.annotator import AnnotationResult
from datafog.models.anonymizer import AnonymizerType, HashType

# Try to import optional dependencies. If any of these imports fails, the
# names are bound to None and HAS_FULL_DEPS is False so that fixtures and
# tests relying on them can skip instead of erroring at collection time.
try:
    from datafog.processing.text_processing.spacy_pii_annotator import (
        SpacyPIIAnnotator as TextPIIAnnotator,
    )
    from datafog.services.image_service import ImageService
    from datafog.services.text_service import TextService

    HAS_FULL_DEPS = True
except ImportError:
    HAS_FULL_DEPS = False
    TextPIIAnnotator = None
    ImageService = None
    TextService = None

# Try to import the full-featured DataFog for integration tests. When the
# module is missing, FullDataFog is None and HAS_ORIGINAL_MAIN is False.
try:
    from datafog.main_original import DataFog as FullDataFog

    HAS_ORIGINAL_MAIN = True
except ImportError:
    HAS_ORIGINAL_MAIN = False
    FullDataFog = None
1736
1837
@pytest.fixture
def mock_image_service():
    """Yield a mocked image-service instance with an async ``ocr_extract``.

    The requesting test is skipped when the optional full dependencies
    could not be imported (``HAS_FULL_DEPS`` is false).
    """
    if not HAS_FULL_DEPS:
        pytest.skip("Full dependencies not available")
    with patch("datafog.services.image_service.ImageService") as mock:
        # The instance returned by the patched class is what tests receive.
        mock.return_value.ocr_extract = AsyncMock()
        yield mock.return_value
2445
2546
@pytest.fixture
def mock_text_service():
    """Yield a mocked text-service instance with async and sync annotators.

    ``batch_annotate_text_async`` is an ``AsyncMock``; the sync variant
    returns a one-element list of annotations by default. The requesting
    test is skipped when ``HAS_FULL_DEPS`` is false.
    """
    if not HAS_FULL_DEPS:
        pytest.skip("Full dependencies not available")
    with patch("datafog.services.text_service.TextService") as mock:
        mock.return_value.batch_annotate_text_async = AsyncMock()
        mock.return_value.batch_annotate_text_sync.return_value = [
            {"PERSON": ["Test Person"]}
        ]
        yield mock.return_value
3157
3258
@pytest.fixture
def text_annotator():
    """Return ``TextPIIAnnotator.create()``; skip when full deps are missing."""
    if not HAS_FULL_DEPS:
        pytest.skip("Full dependencies not available")
    return TextPIIAnnotator.create()
3664
3765
@@ -46,6 +74,7 @@ def image_url():
4674 return json .load (f )["executive_email" ]
4775
4876
77+ @pytest .mark .skipif (not HAS_FULL_DEPS , reason = "Full dependencies not available" )
4978def test_text_pii_annotator (text_annotator ):
5079 text = "Travis Kalanick lives at 1234 Elm St, Springfield."
5180 annotated_text = text_annotator .annotate (text )
@@ -84,7 +113,21 @@ def assert_file_output(annotated_text):
84113
85114
def test_datafog_init():
    """Test the lean DataFog initialization."""
    datafog = DataFog()
    # Test lean version attributes
    assert hasattr(datafog, "regex_annotator")
    assert hasattr(datafog, "operations")
    assert hasattr(datafog, "anonymizer")
    # With no arguments the only configured operation is SCAN.
    assert datafog.operations == [OperationType.SCAN]
123+
124+
125+ @pytest .mark .skipif (
126+ not HAS_FULL_DEPS or not HAS_ORIGINAL_MAIN , reason = "Full dependencies not available"
127+ )
128+ def test_full_datafog_init ():
129+ """Test the full-featured DataFog initialization when dependencies are available."""
130+ datafog = FullDataFog ()
88131 assert isinstance (datafog .image_service , ImageService )
89132 assert isinstance (datafog .text_service , TextService )
90133 assert datafog .spark_service is None
@@ -94,7 +137,7 @@ def test_datafog_init():
94137 custom_text_service = TextService ()
95138 custom_operations = [OperationType .SCAN , OperationType .REDACT ]
96139
97- datafog_custom = DataFog (
140+ datafog_custom = FullDataFog (
98141 image_service = custom_image_service ,
99142 text_service = custom_text_service ,
100143 operations = custom_operations ,
@@ -105,9 +148,14 @@ def test_datafog_init():
105148 assert datafog_custom .operations == custom_operations
106149
107150
151+ @pytest .mark .skipif (
152+ not HAS_FULL_DEPS or not HAS_ORIGINAL_MAIN , reason = "Full dependencies not available"
153+ )
108154@pytest .mark .asyncio
109155async def test_run_ocr_pipeline (mock_image_service , mock_text_service ):
110- datafog = DataFog (image_service = mock_image_service , text_service = mock_text_service )
156+ datafog = FullDataFog (
157+ image_service = mock_image_service , text_service = mock_text_service
158+ )
111159
112160 mock_image_service .ocr_extract .return_value = ["Extracted text" ]
113161 mock_text_service .batch_annotate_text_async .return_value = {
@@ -123,9 +171,12 @@ async def test_run_ocr_pipeline(mock_image_service, mock_text_service):
123171 assert result == {"PERSON" : ["Satya Nadella" ]}
124172
125173
174+ @pytest .mark .skipif (
175+ not HAS_FULL_DEPS or not HAS_ORIGINAL_MAIN , reason = "Full dependencies not available"
176+ )
126177@pytest .mark .asyncio
127178async def test_run_text_pipeline (mock_text_service ):
128- datafog = DataFog (text_service = mock_text_service )
179+ datafog = FullDataFog (text_service = mock_text_service )
129180
130181 mock_text_service .batch_annotate_text_async .return_value = {"PERSON" : ["Elon Musk" ]}
131182
@@ -139,36 +190,94 @@ async def test_run_text_pipeline(mock_text_service):
139190 assert result == {"PERSON" : ["Elon Musk" ]}
140191
141192
@pytest.mark.skipif(not HAS_ORIGINAL_MAIN, reason="Full main module not available")
@pytest.mark.asyncio
async def test_run_text_pipeline_no_annotation():
    """With no operations, the async text pipeline returns its input unchanged."""
    datafog = FullDataFog(operations=[])

    result = await datafog.run_text_pipeline(["Sample text"])

    assert result == ["Sample text"]
149201
150202
def test_run_text_pipeline_sync():
    """Test lean DataFog run_text_pipeline_sync with regex annotator."""
    datafog = DataFog()

    # Test with sample text containing PII
    test_text = "Contact john@example.com or call (555) 123-4567"
    result = datafog.run_text_pipeline_sync([test_text])

    # Should return annotations (dict format) since default is scan only
    assert isinstance(result, list)
    assert len(result) == 1
    assert isinstance(result[0], dict)
216+
def test_run_text_pipeline_sync_no_annotation():
    """Test lean DataFog with no annotation operations."""
    datafog = DataFog(operations=[])

    result = datafog.run_text_pipeline_sync(["Sample text"])

    # Nothing to run, so the input list comes back untouched.
    assert result == ["Sample text"]
224+
225+
@pytest.mark.skipif(
    not HAS_FULL_DEPS or not HAS_ORIGINAL_MAIN, reason="Full dependencies not available"
)
def test_full_run_text_pipeline_sync(mock_text_service):
    """Test full DataFog run_text_pipeline_sync with mocked text service."""
    datafog = FullDataFog(text_service=mock_text_service)

    # Set the return value explicitly so the expected result is visible here.
    mock_text_service.batch_annotate_text_sync.return_value = [
        {"PERSON": ["Jeff Bezos"]}
    ]

    result = datafog.run_text_pipeline_sync(["Jeff Bezos steps down as Amazon CEO"])

    mock_text_service.batch_annotate_text_sync.assert_called_once_with(
        ["Jeff Bezos steps down as Amazon CEO"]
    )
    assert result == [{"PERSON": ["Jeff Bezos"]}]
162243
163244
def test_lean_datafog_detect():
    """Test lean DataFog detect method."""
    datafog = DataFog()

    test_text = "Contact john@example.com or call (555) 123-4567"
    result = datafog.detect(test_text)

    assert isinstance(result, dict)
    # Should detect email and phone
    assert "EMAIL" in result
    assert "PHONE" in result
256+
257+
def test_lean_datafog_process():
    """Test lean DataFog process method."""
    datafog = DataFog()

    test_text = "Contact john@example.com or call (555) 123-4567"

    # Test without anonymization
    result = datafog.process(test_text, anonymize=False)
    assert result["original"] == test_text
    assert "findings" in result
    assert "anonymized" not in result

    # Test with anonymization
    result = datafog.process(test_text, anonymize=True, method="redact")
    assert result["original"] == test_text
    assert "findings" in result
    assert "anonymized" in result
    # The redacted output must differ from the input text.
    assert result["anonymized"] != test_text
171276
277+
278+ @pytest .mark .skipif (
279+ not HAS_FULL_DEPS or not HAS_ORIGINAL_MAIN , reason = "Full dependencies not available"
280+ )
172281@pytest .mark .parametrize (
173282 "operation, hash_type, expected_pattern" ,
174283 [
@@ -199,11 +308,12 @@ def test_run_text_pipeline_sync_no_annotation():
199308 ),
200309 ],
201310)
202- def test_run_text_pipeline_anonymization (
311+ def test_full_run_text_pipeline_anonymization (
203312 mock_text_service , operation , hash_type , expected_pattern
204313):
314+ """Test full DataFog anonymization with mocked services."""
205315 logging .basicConfig (level = logging .INFO )
206- datafog = DataFog (
316+ datafog = FullDataFog (
207317 text_service = mock_text_service ,
208318 operations = [OperationType .SCAN , operation ],
209319 hash_type = hash_type ,
0 commit comments