# NOTE: This file was captured from a Hugging Face Space file viewer
# (Space status: "Running on CPU Upgrade"; file size: 8,587 bytes).
# The viewer's per-line blame hashes (57594ac / ea3f5eb) and its line-number
# gutter (1-196) were page residue, not source code, and are summarized here.
import os
from typing import List, Optional
import logging
import dotenv
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
from presidio_analyzer.nlp_engine import NlpArtifacts
# Module-level logger. The logger name is the fixed string "presidio-streamlit"
# (not derived from __name__).
logger = logging.getLogger("presidio-streamlit")
class AzureAIServiceWrapper(EntityRecognizer):
    """
    Presidio ``EntityRecognizer`` backed by Azure AI Language PII detection.

    Each call to :meth:`analyze` sends the text to the Text Analytics
    ``recognize_pii_entities`` operation and converts every returned entity
    into a Presidio ``RecognizerResult``.
    """

    # Imported inside the class body (as in the original), so the enum also
    # stays reachable as the class attribute ``PiiEntityCategory``.
    from azure.ai.textanalytics._models import PiiEntityCategory

    # Every PII category value exposed by the installed Azure SDK enum.
    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional[TextAnalyticsClient] = None,
        ta_key: Optional[str] = None,
        ta_endpoint: Optional[str] = None,
    ):
        """
        Create the recognizer and, if needed, the Azure client behind it.

        :param supported_entities: Azure PII categories this recognizer may
            report. ``None`` or an empty list selects every value in
            ``TA_SUPPORTED_ENTITIES``.
        :param supported_language: Language code sent to Azure with each
            request (default ``"en"``).
        :param ta_client: Ready-to-use ``TextAnalyticsClient``. When omitted,
            one is built from ``ta_key`` and ``ta_endpoint``.
        :param ta_key: Azure API key; used only when ``ta_client`` is omitted.
        :param ta_endpoint: Azure endpoint URL; used only when ``ta_client``
            is omitted.
        """
        if not supported_entities:
            # Copy so instances never share (and cannot mutate) the
            # class-level list.
            supported_entities = list(self.TA_SUPPORTED_ENTITIES)

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure AI Language PII",
        )

        self.ta_key = ta_key
        self.ta_endpoint = ta_endpoint

        if not ta_client:
            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
        self.ta_client = ta_client

    @staticmethod
    def __authenticate_client(key: str, endpoint: str):
        """
        Build a ``TextAnalyticsClient`` from an API key and an endpoint.

        :param key: Azure API key, wrapped in an ``AzureKeyCredential``.
        :param endpoint: Azure endpoint URL passed to the client.
        :return: The constructed ``TextAnalyticsClient``.
        """
        ta_credential = AzureKeyCredential(key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, credential=ta_credential
        )
        return text_analytics_client

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Detect PII in ``text`` using the Azure client.

        :param text: The text to analyze; sent to Azure as a single document.
        :param entities: Optional subset of categories to return. ``None`` or
            an empty list applies no restriction beyond
            ``self.supported_entities``.
        :param nlp_artifacts: Accepted for interface compatibility; not used.
        :return: One ``RecognizerResult`` per accepted entity, carrying the
            category, character span and Azure confidence score.
        """
        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        # Honor the caller's filter; previously the argument was ignored.
        requested = set(entities) if entities else None

        recognizer_results = []
        for doc in response:
            if doc.is_error:
                # Error documents carry no entities; record them instead of
                # dropping them silently, then move on.
                logger.warning(
                    "Azure AI Language returned an error document: %s",
                    getattr(doc, "error", doc),
                )
                continue

            for entity in doc.entities:
                category = entity.category
                if category not in self.supported_entities:
                    continue
                if requested is not None and category not in requested:
                    continue

                analysis_explanation = self._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=category,
                        start=entity.offset,
                        end=entity.offset + len(entity.text),
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """
        Build the explanation attached to a detected entity.

        :param original_score: Confidence score reported for the entity.
        :param entity_type: Category of the detected entity.
        :return: An ``AnalysisExplanation`` naming this recognizer class.
        """
        explanation = AnalysisExplanation(
            # ``__name__`` of the class itself; ``__class__.__name__`` on a
            # class object would yield the metaclass name instead.
            recognizer=AzureAIServiceWrapper.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Text Analytics",
        )
        return explanation

    def load(self) -> None:
        """No-op: this recognizer loads nothing; the client is set up in ``__init__``."""
        pass
if __name__ == "__main__":
    # Demo entry point: runs the Azure-backed analyzer over a sample text and
    # prints every finding. Requires TA_KEY and TA_ENDPOINT in the environment
    # (optionally supplied through a .env file).
    import presidio_helpers

    # Load TA_KEY / TA_ENDPOINT from a .env file when one is present.
    dotenv.load_dotenv()

    # Sample text containing several kinds of PII.
    text = """
Here are a few example sentences we currently support:
Hello, my name is David Johnson and I live in Maine.
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
My passport: 191280342 and my phone number: (212) 555-1234.
This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
"""

    # Build the analyzer through the project helper, selecting the Azure model.
    analyzer = presidio_helpers.analyzer_engine(
        model_path="Azure Text Analytics PII",
        ta_key=os.environ["TA_KEY"],
        ta_endpoint=os.environ["TA_ENDPOINT"],
    )

    # Show the findings; the original call discarded them.
    for finding in analyzer.analyze(text=text, language="en"):
        print(finding)