Spaces:

hackerbyhobby
/

SMS_scam_detection

Running

App Files Files Community

hackerbyhobby commited on Jan 24

Commit

ee7ace1

unverified ·

1 Parent(s): c624cbc

updated requirements and added apt.txt

Browse files

Files changed (1) hide show

app.py +96 -44

app.py CHANGED Viewed

@@ -4,39 +4,75 @@ from PIL import Image
 from transformers import pipeline
 import re
-# 1. Load keywords from separate files
 with open("smishing_keywords.txt", "r", encoding="utf-8") as f:
     SMISHING_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
 with open("other_scam_keywords.txt", "r", encoding="utf-8") as f:
     OTHER_SCAM_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
-# 2. Load the zero-shot classification pipeline
 model_name = "joeddav/xlm-roberta-large-xnli"
 classifier = pipeline("zero-shot-classification", model=model_name)
-# We will classify among these three labels
 CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
-def boost_probabilities(probabilities: dict, text: str) -> dict:
     """
-    Increases SMiShing probability if 'smishing_keywords' or URLs are found.
-    Increases Other Scam probability if 'other_scam_keywords' are found.
-    Reduces Legitimate by the total amount of these boosts.
-    Then clamps negative probabilities to 0 and re-normalizes.
     """
     lower_text = text.lower()
-    # Count smishing keywords
-    smishing_keyword_count = sum(1 for kw in SMISHING_KEYWORDS if kw in lower_text)
-    # Count other scam keywords
-    other_scam_keyword_count = sum(1 for kw in OTHER_SCAM_KEYWORDS if kw in lower_text)
-    # Base boosts
-    smishing_boost = 0.30 * smishing_keyword_count
-    other_scam_boost = 0.30 * other_scam_keyword_count
-    # Check URLs => +0.20 only to Smishing
     found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
     if found_urls:
         smishing_boost += 0.35
@@ -50,7 +86,7 @@ def boost_probabilities(probabilities: dict, text: str) -> dict:
     p_smishing += smishing_boost
     p_other_scam += other_scam_boost
-    # Subtract total boost from Legitimate
     total_boost = smishing_boost + other_scam_boost
     p_legit -= total_boost
@@ -62,28 +98,30 @@ def boost_probabilities(probabilities: dict, text: str) -> dict:
     if p_legit < 0:
         p_legit = 0.0
-    # Re-normalize so sum=1
     total = p_smishing + p_other_scam + p_legit
     if total > 0:
         p_smishing /= total
         p_other_scam /= total
         p_legit /= total
     else:
-        # fallback if everything is zero
         p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
     return {
         "SMiShing": p_smishing,
         "Other Scam": p_other_scam,
-        "Legitimate": p_legit
     }
 def smishing_detector(text, image):
     """
-    1. OCR if image provided.
     2. Zero-shot classify => base probabilities.
-    3. Boost probabilities based on keywords + URL logic.
-    4. Return final classification + confidence.
     """
     combined_text = text or ""
     if image is not None:
@@ -96,12 +134,11 @@ def smishing_detector(text, image):
             "text_used_for_classification": "(none)",
             "label": "No text provided",
             "confidence": 0.0,
-            "smishing_keywords_found": [],
-            "other_scam_keywords_found": [],
             "urls_found": []
         }
-    # Perform zero-shot classification
     result = classifier(
         sequences=combined_text,
         candidate_labels=CANDIDATE_LABELS,
@@ -109,29 +146,47 @@ def smishing_detector(text, image):
     )
     original_probs = dict(zip(result["labels"], result["scores"]))
-    # Apply boosts
-    boosted_probs = boost_probabilities(original_probs, combined_text)
-    final_label = max(boosted_probs, key=boosted_probs.get)
-    final_confidence = round(boosted_probs[final_label], 3)
-    # For display: which keywords + URLs
     lower_text = combined_text.lower()
-    smishing_found = [kw for kw in SMISHING_KEYWORDS if kw in lower_text]
-    other_scam_found = [kw for kw in OTHER_SCAM_KEYWORDS if kw in lower_text]
     found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
     return {
         "text_used_for_classification": combined_text,
         "original_probabilities": {
             k: round(v, 3) for k, v in original_probs.items()
         },
         "boosted_probabilities": {
-            k: round(v, 3) for k, v in boosted_probs.items()
         },
         "label": final_label,
         "confidence": final_confidence,
-        "smishing_keywords_found": smishing_found,
-        "other_scam_keywords_found": other_scam_found,
         "urls_found": found_urls,
     }
@@ -149,15 +204,12 @@ demo = gr.Interface(
         )
     ],
     outputs="json",
-    title="SMiShing & Scam Detector (Separate Keywords + URL → SMiShing)",
     description="""
 This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
-(joeddav/xlm-roberta-large-xnli).
-- 'smishing_keywords.txt' boosts SMiShing specifically.
-- 'other_scam_keywords.txt' boosts Other Scam specifically.
-- Any URL found further boosts ONLY Smishing.
-- The total boost is subtracted from Legitimate.
-Supports English & Spanish text (OCR included).
 """,
     allow_flagging="never"
 )

 from transformers import pipeline
 import re
+# Language detection & translation
+from langdetect import detect
+from googletrans import Translator
+translator = Translator()
+# 1. Load separate keywords for SMiShing and Other Scam (assumed in English)
 with open("smishing_keywords.txt", "r", encoding="utf-8") as f:
     SMISHING_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
 with open("other_scam_keywords.txt", "r", encoding="utf-8") as f:
     OTHER_SCAM_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
+# 2. Zero-Shot Classification Pipeline
 model_name = "joeddav/xlm-roberta-large-xnli"
 classifier = pipeline("zero-shot-classification", model=model_name)
 CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
+def get_keywords_by_language(text: str):
+    """
+    1. Detect language (using `langdetect`).
+    2. If Spanish ('es'), translate each English-based keyword to Spanish using googletrans.
+    3. If English (or anything else), just use the original English lists.
+    """
+    # Attempt to detect language from a snippet (to reduce overhead on very large text)
+    snippet = text[:200]  # up to 200 chars for detection
+    try:
+        detected_lang = detect(snippet)
+    except:
+        detected_lang = "en"  # fallback if detection fails
+    if detected_lang == "es":
+        # Translate all SMiShing and Other Scam keywords to Spanish
+        smishing_in_spanish = [
+            translator.translate(kw, src="en", dest="es").text.lower()
+            for kw in SMISHING_KEYWORDS
+        ]
+        other_scam_in_spanish = [
+            translator.translate(kw, src="en", dest="es").text.lower()
+            for kw in OTHER_SCAM_KEYWORDS
+        ]
+        return smishing_in_spanish, other_scam_in_spanish, "es"
+    else:
+        # Default to English keywords
+        return SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en"
+def boost_probabilities(probabilities: dict, text: str):
     """
+    1. Load the appropriate keyword lists (English or Spanish).
+    2. Count matches for SMiShing vs. Other Scam.
+    3. If a URL is found, add an extra boost only to SMiShing.
+    4. Subtract total boost from 'Legitimate'.
+    5. Clamp negative probabilities to 0, re-normalize.
     """
     lower_text = text.lower()
+    # Grab the correct keyword lists based on language
+    smishing_keywords, other_scam_keywords, detected_lang = get_keywords_by_language(text)
+    # Count SMiShing keyword matches
+    smishing_count = sum(1 for kw in smishing_keywords if kw in lower_text)
+    # Count Other Scam keyword matches
+    other_scam_count = sum(1 for kw in other_scam_keywords if kw in lower_text)
+    # Base boost amounts
+    smishing_boost = 0.30 * smishing_count
+    other_scam_boost = 0.30 * other_scam_count
+    # Check for URLs => +0.35 only to SMiShing
     found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
     if found_urls:
         smishing_boost += 0.35
     p_smishing += smishing_boost
     p_other_scam += other_scam_boost
+    # Subtract total boost from 'Legitimate'
     total_boost = smishing_boost + other_scam_boost
     p_legit -= total_boost
     if p_legit < 0:
         p_legit = 0.0
+    # Re-normalize
     total = p_smishing + p_other_scam + p_legit
     if total > 0:
         p_smishing /= total
         p_other_scam /= total
         p_legit /= total
     else:
+        # fallback if everything is 0
         p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
     return {
         "SMiShing": p_smishing,
         "Other Scam": p_other_scam,
+        "Legitimate": p_legit,
+        "detected_lang": detected_lang
     }
 def smishing_detector(text, image):
     """
+    Main function called by Gradio.
+    1. Combine user text + OCR text (if an image is provided).
     2. Zero-shot classify => base probabilities.
+    3. Apply language detection & translation if needed, then boost logic.
+    4. Return final classification.
     """
     combined_text = text or ""
     if image is not None:
             "text_used_for_classification": "(none)",
             "label": "No text provided",
             "confidence": 0.0,
+            "keywords_found": [],
             "urls_found": []
         }
+    # 1. Zero-shot classification
     result = classifier(
         sequences=combined_text,
         candidate_labels=CANDIDATE_LABELS,
     )
     original_probs = dict(zip(result["labels"], result["scores"]))
+    # 2. Boost logic (including language detection + translation)
+    boosted = boost_probabilities(original_probs, combined_text)
+    final_label = max(boosted, key=boosted.get) if not isinstance(boosted.get("detected_lang"), float) else "Legitimate"
+    # to avoid conflict, let's store the detected language separately:
+    detected_lang = boosted.pop("detected_lang", "en")
+    # We have p_smishing, p_other_scam, p_legit left in boosted
+    final_label = max(boosted, key=boosted.get)
+    final_confidence = round(boosted[final_label], 3)
+    # 3. Identify which keywords & URLs we found
     lower_text = combined_text.lower()
+    # If we detected Spanish, we used the translated keywords to do matching. But let's also show them:
+    # For demonstration, let's just show the "English or Spanish" keywords. The code to show them in output
+    # can be the same as before, or you can do a second pass with the same logic from boost_probabilities.
     found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
+    # We'll do a quick second pass on actual matched keywords so user sees them
+    # - If language is es => we used translated Spanish keywords, let's do the same for display
+    # - If language is en => we used the original English lists
+    if detected_lang == "es":
+        smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
+    else:
+        smishing_keys, scam_keys, _ = (SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en")
+    found_smishing = [kw for kw in smishing_keys if kw in lower_text]
+    found_other_scam = [kw for kw in scam_keys if kw in lower_text]
     return {
+        "detected_language": detected_lang,
         "text_used_for_classification": combined_text,
         "original_probabilities": {
             k: round(v, 3) for k, v in original_probs.items()
         },
         "boosted_probabilities": {
+            k: round(v, 3) for k, v in boosted.items()
         },
         "label": final_label,
         "confidence": final_confidence,
+        "smishing_keywords_found": found_smishing,
+        "other_scam_keywords_found": found_other_scam,
         "urls_found": found_urls,
     }
         )
     ],
     outputs="json",
+    title="SMiShing & Scam Detector (Language Detection + Keyword Translation)",
     description="""
 This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
+(joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
+If Spanish, it translates the English-based keyword lists to Spanish before boosting the scores.
+Any URL found further boosts SMiShing specifically.
 """,
     allow_flagging="never"
 )