File size: 8,587 Bytes
57594ac
 
 
 
 
 
 
 
 
 
ea3f5eb
57594ac
 
 
 
ea3f5eb
 
 
 
 
57594ac
 
ea3f5eb
57594ac
 
 
 
 
 
 
 
 
 
 
ea3f5eb
 
 
 
 
 
 
 
57594ac
ea3f5eb
57594ac
 
 
ea3f5eb
57594ac
 
 
 
 
 
ea3f5eb
57594ac
 
 
ea3f5eb
57594ac
 
 
 
 
 
ea3f5eb
 
 
 
 
 
 
 
57594ac
 
 
 
 
 
 
 
 
ea3f5eb
 
 
 
 
 
 
 
 
 
 
57594ac
 
ea3f5eb
 
57594ac
 
 
ea3f5eb
 
57594ac
 
ea3f5eb
 
57594ac
 
ea3f5eb
57594ac
 
ea3f5eb
 
 
57594ac
 
 
ea3f5eb
 
57594ac
 
 
 
 
 
 
 
 
 
ea3f5eb
57594ac
 
 
 
 
 
ea3f5eb
 
 
 
 
 
 
 
57594ac
 
 
 
 
 
 
 
ea3f5eb
 
 
 
 
57594ac
 
 
 
ea3f5eb
 
 
 
 
57594ac
 
ea3f5eb
57594ac
ea3f5eb
 
57594ac
 
 
 
 
 
 
 
 
 
 
 
 
 
ea3f5eb
 
57594ac
 
 
 
 
ea3f5eb
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import os
from typing import List, Optional
import logging
import dotenv
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
from presidio_analyzer.nlp_engine import NlpArtifacts

# Module-level logger, registered under the "presidio-streamlit" logger name.
logger = logging.getLogger("presidio-streamlit")


class AzureAIServiceWrapper(EntityRecognizer):
    """
    Presidio recognizer that delegates PII detection to Azure Text Analytics.

    Text is sent to ``TextAnalyticsClient.recognize_pii_entities`` and every
    returned entity whose category is supported by this recognizer is
    converted into a Presidio ``RecognizerResult``.
    """
    # NOTE(review): this imports from the SDK's private ``_models`` module;
    # confirm whether the public package exports the same enum.
    from azure.ai.textanalytics._models import PiiEntityCategory

    # All category values of the SDK's PiiEntityCategory enum; used as the
    # default set of supported entities.
    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional[TextAnalyticsClient] = None,
        ta_key: Optional[str] = None,
        ta_endpoint: Optional[str] = None,
    ):
        """
        Create the recognizer and make sure a Text Analytics client is available.

        :param supported_entities: Entity categories to report. ``None`` or an
            empty list falls back to ``TA_SUPPORTED_ENTITIES``.
        :param supported_language: Language code passed to the service on
            every request (defaults to ``"en"``).
        :param ta_client: Ready-to-use ``TextAnalyticsClient``. When omitted,
            one is built from ``ta_key`` and ``ta_endpoint``.
        :param ta_key: Key used to build the client when ``ta_client`` is not
            given.
        :param ta_endpoint: Endpoint used to build the client when
            ``ta_client`` is not given.
        """
        if not supported_entities:
            supported_entities = self.TA_SUPPORTED_ENTITIES

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure AI Language PII",
        )

        self.ta_key = ta_key
        self.ta_endpoint = ta_endpoint

        # Only build a client when the caller did not inject one.
        if not ta_client:
            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
        self.ta_client = ta_client

    @staticmethod
    def __authenticate_client(key: str, endpoint: str):
        """
        Build a ``TextAnalyticsClient`` from a key and an endpoint.

        :param key: Key wrapped in an ``AzureKeyCredential``.
        :param endpoint: Endpoint passed to the client.
        :return: The constructed ``TextAnalyticsClient``.
        """
        ta_credential = AzureKeyCredential(key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, credential=ta_credential
        )
        return text_analytics_client

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Detect PII in ``text`` using the Text Analytics client.

        :param text: Text to analyze; sent as a single document.
        :param entities: Entity categories the caller is interested in. When
            given and non-empty, only these categories are returned;
            ``None`` or an empty list returns every supported category.
        :param nlp_artifacts: Accepted for interface compatibility; not used.
        :return: One ``RecognizerResult`` per reported entity, carrying the
            category, the character span and the service's confidence score.
        """
        # Exact-match filter on requested categories; None means "no filter".
        requested = set(entities) if entities else None

        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        recognizer_results = []
        for doc in response:
            if doc.is_error:
                # Errored documents carry no entities; skip them, but leave a
                # trace instead of dropping them silently.
                logger.warning(
                    "Text Analytics returned an error document: %s",
                    getattr(doc, "error", doc),
                )
                continue

            for entity in doc.entities:
                category = entity.category
                if category not in self.supported_entities:
                    continue
                if requested is not None and category not in requested:
                    continue

                analysis_explanation = self._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=category,
                        start=entity.offset,
                        end=entity.offset + len(entity.text),
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """
        Build the ``AnalysisExplanation`` attached to a result.

        :param original_score: Confidence score reported for the entity.
        :param entity_type: Category of the detected entity.
        :return: Explanation naming this recognizer class, the score and a
            short textual reason.
        """
        explanation = AnalysisExplanation(
            # Use the class's own name; ``__class__.__name__`` on a class
            # object would yield the metaclass name instead.
            recognizer=AzureAIServiceWrapper.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Text Analytics",
        )
        return explanation

    def load(self) -> None:
        """No-op: the client is prepared in ``__init__``, nothing else to load."""
        pass


if __name__ == "__main__":
    # Demo: build an analyzer configured for Azure Text Analytics, run it on a
    # sample text and print each finding.
    import presidio_helpers

    # Load variables from a .env file so TA_KEY / TA_ENDPOINT need not be
    # hard-coded here.
    dotenv.load_dotenv()

    # Sample text containing several kinds of PII.
    text = """
    Here are a few example sentences we currently support:

    Hello, my name is David Johnson and I live in Maine.
    My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
    
    On September 18 I visited microsoft.com and sent an email to test@presidio.site,  from the IP 192.168.0.1.
    
    My passport: 191280342 and my phone number: (212) 555-1234.
    
    This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
    
    Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.
    """

    # Both variables are required; a missing one raises KeyError.
    analyzer = presidio_helpers.analyzer_engine(
        model_path="Azure Text Analytics PII",
        ta_key=os.environ["TA_KEY"],
        ta_endpoint=os.environ["TA_ENDPOINT"],
    )

    # Show the findings instead of discarding them.
    results = analyzer.analyze(text=text, language="en")
    for result in results:
        print(result)