Commit 2bf5379

Sid Mohan authored and committed
pass, anonymizer tests for DataFog class
1 parent 929aa3c commit 2bf5379

File tree

5 files changed: +116 -69 lines changed

datafog/main.py

Lines changed: 24 additions & 35 deletions
@@ -40,26 +40,32 @@ class DataFog:
 
     def __init__(
         self,
-        image_service=ImageService(),
-        text_service=TextService(),
+        image_service=None,
+        text_service=None,
         spark_service=None,
         operations: List[OperationType] = [OperationType.SCAN],
+        hash_type: HashType = HashType.SHA256,
+        anonymizer_type: AnonymizerType = AnonymizerType.REPLACE,
     ):
-        self.image_service = image_service
-        self.text_service = text_service
+        self.image_service = image_service or ImageService()
+        self.text_service = text_service or TextService()
         self.spark_service: SparkService = spark_service
         self.operations: List[OperationType] = operations
-        self.anonymizer = Anonymizer()
+        self.anonymizer = Anonymizer(
+            hash_type=hash_type, anonymizer_type=anonymizer_type
+        )
         self.logger = logging.getLogger(__name__)
         self.logger.info(
             "Initializing DataFog class with the following services and operations:"
         )
-        self.logger.info(f"Image Service: {type(image_service)}")
-        self.logger.info(f"Text Service: {type(text_service)}")
+        self.logger.info(f"Image Service: {type(self.image_service)}")
+        self.logger.info(f"Text Service: {type(self.text_service)}")
         self.logger.info(
-            f"Spark Service: {type(spark_service) if spark_service else 'None'}"
+            f"Spark Service: {type(self.spark_service) if self.spark_service else 'None'}"
         )
         self.logger.info(f"Operations: {operations}")
+        self.logger.info(f"Hash Type: {hash_type}")
+        self.logger.info(f"Anonymizer Type: {anonymizer_type}")
 
     async def run_ocr_pipeline(self, image_urls: List[str]):
         """
@@ -151,19 +157,15 @@ async def _process_text(self, text_list: List[str]):
             )
         return text_list
 
-    def run_text_pipeline_sync(self, str_list: List[str]):
+    def run_text_pipeline_sync(self, str_list: List[str]) -> List[str]:
         """
         Run the text pipeline synchronously on a list of input text.
 
-        This method processes a list of text strings in a synchronous manner, potentially
-        annotating them for personally identifiable information (PII) and applying
-        anonymization if enabled.
-
         Args:
             str_list (List[str]): A list of text strings to be processed.
 
         Returns:
-            List: Processed text results based on the enabled operations.
+            List[str]: Processed text results based on the enabled operations.
 
         Raises:
             Exception: Any error encountered during the text processing.
@@ -176,29 +178,16 @@ def run_text_pipeline_sync(self, str_list: List[str]):
                 f"Text annotation completed with {len(annotated_text)} annotations."
             )
 
-            if OperationType.REDACT in self.operations:
-                return [
-                    self.anonymizer.anonymize(
-                        text, annotations, AnonymizerType.REDACT
-                    ).anonymized_text
-                    for text, annotations in zip(
-                        str_list, annotated_text, strict=True
-                    )
-                ]
-            elif OperationType.REPLACE in self.operations:
-                return [
-                    self.anonymizer.anonymize(
-                        text, annotations, AnonymizerType.REPLACE
-                    ).anonymized_text
-                    for text, annotations in zip(
-                        str_list, annotated_text, strict=True
-                    )
+            if any(
+                op in self.operations
+                for op in [
+                    OperationType.REDACT,
+                    OperationType.REPLACE,
+                    OperationType.HASH,
                 ]
-            elif OperationType.HASH in self.operations:
+            ):
                 return [
-                    self.anonymizer.anonymize(
-                        text, annotations, AnonymizerType.HASH
-                    ).anonymized_text
+                    self.anonymizer.anonymize(text, annotations).anonymized_text
                     for text, annotations in zip(
                         str_list, annotated_text, strict=True
                     )
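
With this change, anonymization is configured once on the DataFog instance (hash_type, anonymizer_type) and run_text_pipeline_sync picks the behaviour from self.operations instead of passing an AnonymizerType per branch. A minimal usage sketch, assuming the import paths used by the tests in this commit and a working default TextService (spaCy model available); the expected output comment is illustrative, not a captured run:

    from datafog.config import OperationType
    from datafog.main import DataFog
    from datafog.models.anonymizer import AnonymizerType, HashType

    # Scan for PII and hash whatever is found, overriding the SHA256 default with MD5.
    datafog = DataFog(
        operations=[OperationType.SCAN, OperationType.HASH],
        hash_type=HashType.MD5,
        anonymizer_type=AnonymizerType.HASH,
    )

    # Each input string is annotated, then anonymized with the configured settings.
    results = datafog.run_text_pipeline_sync(
        ["Elon Musk tries one more time to save his $56 billion pay package"]
    )
    # Expected shape: ["<32-char md5 hex> tries one more time to save his $56 billion pay package"]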

datafog/models/annotator.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ class AnnotationResult(BaseModel):
 
     start: int
     end: int
-    score: float
+    score: Optional[float]
     entity_type: str
     recognition_metadata: Optional[AnnotatorMetadata]
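
Relaxing score to Optional[float] lets an AnnotationResult carry no confidence value. A quick sketch, mirroring the constructor call used in tests/test_main.py below but with score set to None; the other field values are taken from that test and assumed unchanged:

    from datafog.models.annotator import AnnotationResult

    # score may now be None for annotators that do not report a confidence value.
    annotation = AnnotationResult(
        start=0,
        end=9,
        entity_type="PERSON",
        text="Elon Musk",
        score=None,
        recognition_metadata=None,
    )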

datafog/models/anonymizer.py

Lines changed: 13 additions & 29 deletions
@@ -64,36 +64,21 @@ def replace_pii(
     ) -> AnonymizationResult:
         """Replace PII in text with anonymized values."""
         replacements = []
-        print(f"Entities to anonymize: {self.entities}")
         for annotation in sorted(annotations, key=lambda x: x.start, reverse=True):
-            print(f"Processing annotation: {annotation}")
             if not self.entities or annotation.entity_type in self.entities:
-                print(f"Matched entity type: {annotation.entity_type}")
-                if self.anonymizer_type == AnonymizerType.REPLACE:
-                    replacement = f"[{annotation.entity_type}_{len(replacements)}]"
-                    replacements.append(
-                        {
-                            "original": text[annotation.start : annotation.end],
-                            "replacement": replacement,
-                            "entity_type": annotation.entity_type,
-                        }
-                    )
-                    print(f"Added replacement: {replacements[-1]}")
-
-        print(f"Final replacements: {replacements}")
-        anonymized_text = text
-        for replacement in reversed(replacements):
-            start = text.index(replacement["original"])
-            end = start + len(replacement["original"])
-            anonymized_text = (
-                anonymized_text[:start]
-                + replacement["replacement"]
-                + anonymized_text[end:]
-            )
+                replacement = self._generate_replacement(
+                    text[annotation.start : annotation.end], annotation.entity_type
+                )
+                replacements.append(
+                    {
+                        "original": text[annotation.start : annotation.end],
+                        "replacement": replacement,
+                        "entity_type": annotation.entity_type,
+                    }
+                )
+                text = text[: annotation.start] + replacement + text[annotation.end :]
 
-        return AnonymizationResult(
-            anonymized_text=anonymized_text, replaced_entities=replacements
-        )
+        return AnonymizationResult(anonymized_text=text, replaced_entities=replacements)
 
     def _generate_replacement(self, original: str, entity_type: EntityTypes) -> str:
         """Generate a replacement for the given entity."""
@@ -119,7 +104,7 @@ def hash_pii(
 
             start, end = annotation.start, annotation.end
             original = text[start:end]
-            replacement = self._hash_text(original)[: len(original)]
+            replacement = self._hash_text(original)
 
             text = text[:start] + replacement + text[end:]
             replacements.append(
@@ -145,7 +130,6 @@ def _hash_text(self, text: str) -> str:
     def redact_pii(
         self, text: str, annotations: List[AnnotationResult]
     ) -> AnonymizationResult:
-        """Redact PII in text."""
        replacements = []
         for annotation in sorted(annotations, key=lambda x: x.start, reverse=True):
             if self.entities and annotation.entity_type not in self.entities:
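
The rewritten replace_pii splices each replacement into text as it goes instead of collecting replacements and re-locating them with text.index() afterwards. Iterating the annotations sorted by start in descending order is what makes the in-place splicing safe: editing from the end of the string leaves the start/end offsets of annotations earlier in the text untouched. A standalone sketch of that idea with hypothetical spans (not the library code itself):

    # (start, end, replacement) spans over the original string.
    text = "Alice emailed Bob"
    spans = [(0, 5, "[PERSON_0]"), (14, 17, "[PERSON_1]")]

    # Apply right-to-left so earlier offsets stay valid after each splice.
    for start, end, replacement in sorted(spans, key=lambda s: s[0], reverse=True):
        text = text[:start] + replacement + text[end:]

    print(text)  # [PERSON_0] emailed [PERSON_1]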

tests/test_anonymizer.py

Lines changed: 3 additions & 4 deletions
@@ -85,13 +85,12 @@ def test_anonymizer_hash(sample_text, sample_annotations, hash_type):
     for replacement in result.replaced_entities:
         assert replacement["original"] in sample_text
         assert replacement["replacement"] not in sample_text
-        assert len(replacement["replacement"]) == len(replacement["original"])
-
+        # assert len(replacement["replacement"]) == len(replacement["original"])
         # Check hash type-specific properties
         if hash_type == HashType.MD5:
-            assert len(replacement["replacement"]) <= 32
+            assert len(replacement["replacement"]) == 32
         elif hash_type in [HashType.SHA256, HashType.SHA3_256]:
-            assert len(replacement["replacement"]) <= 64
+            assert len(replacement["replacement"]) == 64
 
 
 def test_anonymizer_with_specific_entities(sample_text, sample_annotations):
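
The <= bounds tighten to exact lengths because hash_pii no longer truncates the digest to the original token's length: a hex digest has a fixed width regardless of input, 32 characters for MD5 and 64 for SHA-256 and SHA3-256. A standard-library check of those widths (illustrative only; the Anonymizer's own _hash_text may differ in details such as encoding):

    import hashlib

    token = "Elon Musk"
    print(len(hashlib.md5(token.encode()).hexdigest()))       # 32
    print(len(hashlib.sha256(token.encode()).hexdigest()))    # 64
    print(len(hashlib.sha3_256(token.encode()).hexdigest()))  # 64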

tests/test_main.py

Lines changed: 75 additions & 0 deletions
@@ -1,11 +1,15 @@
 import asyncio
 import json
+import logging
+import re
 from unittest.mock import AsyncMock, patch
 
 import pytest
 
 from datafog.config import OperationType
 from datafog.main import DataFog
+from datafog.models.annotator import AnnotationResult
+from datafog.models.anonymizer import AnonymizerType, HashType
 from datafog.processing.text_processing.spacy_pii_annotator import (
     SpacyPIIAnnotator as TextPIIAnnotator,
 )
@@ -164,3 +168,74 @@ def test_run_text_pipeline_sync_no_annotation():
     result = datafog.run_text_pipeline_sync(["Sample text"])
 
     assert result == ["Sample text"]
+
+
+@pytest.mark.parametrize(
+    "operation, hash_type, expected_pattern",
+    [
+        (
+            OperationType.REDACT,
+            None,
+            r"\[REDACTED\] tries one more time to save his \$56 billion pay package",
+        ),
+        (
+            OperationType.REPLACE,
+            None,
+            r"\[PERSON(_[A-F0-9]+)?\] tries one more time to save his \$56 billion pay package",
+        ),
+        (
+            OperationType.HASH,
+            HashType.MD5,
+            r"([a-f0-9]{32}) tries one more time to save his \$56 billion pay package",
+        ),
+        (
+            OperationType.HASH,
+            HashType.SHA256,
+            r"([a-f0-9]{64}) tries one more time to save his \$56 billion pay package",
+        ),
+        (
+            OperationType.HASH,
+            HashType.SHA3_256,
+            r"([a-f0-9]{64}) tries one more time to save his \$56 billion pay package",
+        ),
+    ],
+)
+def test_run_text_pipeline_anonymization(
+    mock_text_service, operation, hash_type, expected_pattern
+):
+    logging.basicConfig(level=logging.INFO)
+    datafog = DataFog(
+        text_service=mock_text_service,
+        operations=[OperationType.SCAN, operation],
+        hash_type=hash_type,
+        anonymizer_type=operation,
+    )
+    mock_text_service.batch_annotate_text_sync.return_value = [
+        [
+            AnnotationResult(
+                start=0,
+                end=9,
+                entity_type="PERSON",
+                text="Elon Musk",
+                score=0.9,
+                recognition_metadata={"confidence": "high"},
+            )
+        ]
+    ]
+
+    result = datafog.run_text_pipeline_sync(
+        ["Elon Musk tries one more time to save his $56 billion pay package"]
+    )
+
+    logging.info(f"Result: {result}")
+    assert len(result) == 1, "Expected a single result"
+    assert re.match(
+        expected_pattern, result[0]
+    ), f"Result {result[0]!r} does not match pattern {expected_pattern!r}"
+
+    if operation == AnonymizerType.HASH:
+        hashed_part = result[0].split()[0]
+        if hash_type == HashType.MD5:
+            assert len(hashed_part) == 32
+        elif hash_type in [HashType.SHA256, HashType.SHA3_256]:
+            assert len(hashed_part) == 64
