|
| 1 | +""" |
| 2 | +DataFog: Lightning-fast PII detection and anonymization library. |
| 3 | +
|
| 4 | +Core package provides regex-based PII detection with 190x performance advantage. |
| 5 | +Optional extras available for advanced features: |
| 6 | +- pip install datafog[nlp] - for spaCy integration |
| 7 | +- pip install datafog[ocr] - for image/OCR processing |
| 8 | +- pip install datafog[all] - for all features |
| 9 | +""" |
| 10 | + |
1 | 11 | from .__about__ import __version__ |
2 | | -from .client import app |
3 | | -from .config import OperationType, get_config |
4 | | -from .main import DataFog, TextPIIAnnotator |
5 | | -from .models.annotator import ( |
6 | | - AnalysisExplanation, |
7 | | - AnnotationResult, |
8 | | - AnnotationResultWithAnaysisExplanation, |
9 | | - AnnotatorRequest, |
10 | | -) |
| 12 | + |
| 13 | +# Core imports - always available |
| 14 | +from .models.annotator import AnnotationResult, AnnotatorRequest |
11 | 15 | from .models.anonymizer import ( |
12 | 16 | AnonymizationResult, |
13 | 17 | Anonymizer, |
14 | 18 | AnonymizerRequest, |
15 | 19 | AnonymizerType, |
16 | 20 | ) |
17 | | -from .models.common import AnnotatorMetadata, EntityTypes, Pattern, PatternRecognizer |
18 | | -from .models.spacy_nlp import SpacyAnnotator |
19 | | -from .processing.image_processing.donut_processor import DonutProcessor |
20 | | -from .processing.image_processing.image_downloader import ImageDownloader |
21 | | -from .processing.image_processing.pytesseract_processor import PytesseractProcessor |
22 | | -from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator |
23 | | -from .services.image_service import ImageService |
24 | | -from .services.spark_service import SparkService |
25 | | -from .services.text_service import TextService |
| 21 | +from .models.common import EntityTypes |
| 22 | +from .processing.text_processing.regex_annotator import RegexAnnotator |
26 | 23 |
|
27 | | -__all__ = [ |
28 | | - "DonutProcessor", |
29 | | - "DataFog", |
30 | | - "ImageService", |
31 | | - "OperationType", |
32 | | - "SparkService", |
33 | | - "TextPIIAnnotator", |
34 | | - "TextService", |
35 | | - "SpacyPIIAnnotator", |
36 | | - "ImageDownloader", |
| 24 | +# Optional imports with graceful fallback |
| 25 | +try: |
| 26 | + from .client import app |
| 27 | +except ImportError: |
| 28 | + app = None |
| 29 | + |
| 30 | +try: |
| 31 | + from .main import DataFog, TextPIIAnnotator |
| 32 | +except ImportError: |
| 33 | + DataFog = None |
| 34 | + TextPIIAnnotator = None |
| 35 | + |
| 36 | +try: |
| 37 | + from .services.text_service import TextService |
| 38 | +except ImportError: |
| 39 | + TextService = None |
| 40 | + |
| 41 | + |
| 42 | +# Optional heavy features - only import if dependencies available |
| 43 | +def _optional_import(name, module_path, extra_name): |
| 44 | + """Helper to import optional modules with helpful error messages.""" |
| 45 | + try: |
| 46 | + module = __import__(module_path, fromlist=[name]) |
| 47 | + return getattr(module, name) |
| 48 | + except ImportError: |
| 49 | + |
| 50 | + def _missing_dependency(*args, **kwargs): |
| 51 | + raise ImportError( |
| 52 | + f"{name} requires additional dependencies. " |
| 53 | + f"Install with: pip install datafog[{extra_name}]" |
| 54 | + ) |
| 55 | + |
| 56 | + return _missing_dependency |
| 57 | + |
| 58 | + |
# Image/OCR classes - install hint points at the 'ocr' extra
DonutProcessor = _optional_import(
    name="DonutProcessor",
    module_path="datafog.processing.image_processing.donut_processor",
    extra_name="ocr",
)
PytesseractProcessor = _optional_import(
    name="PytesseractProcessor",
    module_path="datafog.processing.image_processing.pytesseract_processor",
    extra_name="ocr",
)
ImageService = _optional_import(
    name="ImageService",
    module_path="datafog.services.image_service",
    extra_name="ocr",
)

# spaCy-based annotator - install hint points at the 'nlp' extra
SpacyPIIAnnotator = _optional_import(
    name="SpacyPIIAnnotator",
    module_path="datafog.processing.text_processing.spacy_pii_annotator",
    extra_name="nlp",
)

# Spark service - install hint points at the 'distributed' extra
SparkService = _optional_import(
    name="SparkService",
    module_path="datafog.services.spark_service",
    extra_name="distributed",
)
| 79 | + |
| 80 | + |
| 81 | +# Simple API for core functionality |
def detect(text: str) -> list:
    """
    Scan text for PII with the regex annotator and return plain dicts.

    Args:
        text: Input text to scan for PII

    Returns:
        One dict per reported span, with keys 'type', 'value', 'start' and
        'end'. Spans whose text is empty or whitespace-only are dropped.

    Example:
        >>> from datafog import detect
        >>> detect("Contact john@example.com")
        [{'type': 'EMAIL', 'value': 'john@example.com', 'start': 8, 'end': 24}]
    """
    # The structured result carries character offsets; the first element of
    # the returned pair is not needed here.
    _, annotated = RegexAnnotator().annotate_with_spans(text)

    return [
        {
            "type": hit.label,
            "value": hit.text,
            "start": hit.start,
            "end": hit.end,
        }
        for hit in annotated.spans
        if hit.text.strip()  # skip empty / whitespace-only matches
    ]
| 115 | + |
| 116 | + |
def process(text: str, anonymize: bool = False, method: str = "redact") -> dict:
    """
    Process text to detect and optionally anonymize PII.

    Args:
        text: Input text to process
        anonymize: Whether to anonymize detected PII
        method: Anonymization method ('redact', 'replace', 'hash'). Any other
            value falls back to a bare ``[TYPE]`` placeholder.

    Returns:
        Dictionary with 'original' (the input text) and 'findings' (the
        output of ``detect``), plus 'anonymized' when ``anonymize`` is true.

    Example:
        >>> from datafog import process
        >>> process("Contact john@example.com", anonymize=True)
        {
            'original': 'Contact john@example.com',
            'anonymized': 'Contact [EMAIL_REDACTED]',
            'findings': [{'type': 'EMAIL', 'value': 'john@example.com', ...}]
        }
    """
    import hashlib

    findings = detect(text)
    result = {"original": text, "findings": findings}

    if not anonymize:
        return result

    def _placeholder(finding):
        """Build the replacement token for one finding under ``method``."""
        entity_type = finding["type"]
        if method == "redact":
            return f"[{entity_type}_REDACTED]"
        if method == "replace":
            return f"[{entity_type}_XXXXX]"
        if method == "hash":
            # NOTE(review): MD5 is kept so tokens stay identical to earlier
            # output; it only labels the value and is not a security measure.
            digest = hashlib.md5(finding["value"].encode()).hexdigest()[:8]
            return f"[{entity_type}_{digest}]"
        return f"[{entity_type}]"

    # Assemble the output in one left-to-right pass over the ORIGINAL text.
    # Splicing replacements into an already-modified string breaks when
    # findings overlap or nest: the earlier replacement shifts the offsets, so
    # a later splice can leave part of a detected value in the output or eat
    # surrounding text. Offsets here always refer to the original string.
    pieces = []
    cursor = 0  # end of the region already emitted or replaced
    # Longest span first among equal starts, so nested spans are seen second.
    for finding in sorted(findings, key=lambda f: (f["start"], -f["end"])):
        start, end = finding["start"], finding["end"]
        if end <= cursor:
            # Entirely inside a span that was already replaced.
            continue
        if start > cursor:
            pieces.append(text[cursor:start])
        # For a partial overlap (start < cursor < end) the uncovered tail is
        # still part of a detected value, so it gets its own placeholder.
        pieces.append(_placeholder(finding))
        cursor = end
    pieces.append(text[cursor:])

    result["anonymized"] = "".join(pieces)
    return result
| 165 | + |
| 166 | + |
| 167 | +# Core exports |
| 168 | +__all__ = [ |
38 | 169 | "__version__", |
39 | | - "app", |
40 | | - "AnalysisExplanation", |
| 170 | + "detect", |
| 171 | + "process", |
41 | 172 | "AnnotationResult", |
42 | | - "AnnotationResultWithAnaysisExplanation", |
43 | 173 | "AnnotatorRequest", |
44 | | - "AnnotatorMetadata", |
45 | | - "EntityTypes", |
46 | | - "Pattern", |
47 | | - "PatternRecognizer", |
48 | | - "get_config", |
49 | | - "SpacyAnnotator", |
50 | | - "AnonymizerType", |
51 | | - "AnonymizerRequest", |
52 | 174 | "AnonymizationResult", |
53 | 175 | "Anonymizer", |
| 176 | + "AnonymizerRequest", |
| 177 | + "AnonymizerType", |
| 178 | + "EntityTypes", |
| 179 | + "RegexAnnotator", |
| 180 | + # Optional exports (None, or a stub that raises ImportError when called, if dependencies are missing) |
| 181 | + "DataFog", |
| 182 | + "TextPIIAnnotator", |
| 183 | + "TextService", |
| 184 | + "app", |
| 185 | + "DonutProcessor", |
| 186 | + "PytesseractProcessor", |
| 187 | + "ImageService", |
| 188 | + "SpacyPIIAnnotator", |
| 189 | + "SparkService", |
54 | 190 | ] |
0 commit comments