File size: 8,015 Bytes
90d7edb
8edfc45
 
 
 
90d7edb
ee7ace1
 
 
 
 
 
 
def395e
 
90d7edb
def395e
 
 
ee7ace1
8edfc45
 
 
 
ee7ace1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8edfc45
ee7ace1
 
 
 
 
8edfc45
 
 
ee7ace1
 
def395e
ee7ace1
 
 
 
8edfc45
ee7ace1
 
 
 
 
8edfc45
 
c624cbc
def395e
 
 
 
 
 
 
 
 
 
ee7ace1
def395e
 
 
 
 
 
 
 
 
 
 
ee7ace1
def395e
8edfc45
def395e
 
 
8edfc45
ee7ace1
def395e
8edfc45
 
def395e
 
ee7ace1
 
8edfc45
 
 
 
ee7ace1
 
def395e
ee7ace1
 
8edfc45
def395e
8edfc45
 
 
 
def395e
8edfc45
 
 
 
 
ee7ace1
8edfc45
 
 
ee7ace1
8edfc45
 
 
 
 
 
 
ee7ace1
 
 
 
 
 
 
 
 
8edfc45
ee7ace1
8edfc45
ee7ace1
 
 
8edfc45
 
ee7ace1
 
 
 
 
 
 
 
 
 
 
8edfc45
ee7ace1
8edfc45
def395e
 
 
 
ee7ace1
def395e
8edfc45
 
ee7ace1
 
8edfc45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee7ace1
8edfc45
 
ee7ace1
 
 
8edfc45
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import gradio as gr
import pytesseract
from PIL import Image
from transformers import pipeline
import re

# Language detection & translation
from langdetect import detect
from googletrans import Translator

# Shared googletrans client used for keyword translation.
translator = Translator()


def _read_keyword_file(path):
    """Return the non-blank lines of *path*, stripped and lower-cased."""
    keywords = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                keywords.append(cleaned.lower())
    return keywords


# 1. Keyword lists, one keyword per line (assumed to be written in English).
SMISHING_KEYWORDS = _read_keyword_file("smishing_keywords.txt")
OTHER_SCAM_KEYWORDS = _read_keyword_file("other_scam_keywords.txt")

# 2. Zero-shot classification pipeline and the labels it chooses between.
model_name = "joeddav/xlm-roberta-large-xnli"
classifier = pipeline("zero-shot-classification", model=model_name)
CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]

# Per-keyword cache of English -> Spanish translations, so the translation
# service is queried at most once per keyword for the life of the process
# instead of once per keyword on every Spanish request.
_SPANISH_KEYWORD_CACHE = {}


def _to_spanish(keyword: str) -> str:
    """Return the lower-cased Spanish translation of an English keyword.

    Successful translations are memoized in ``_SPANISH_KEYWORD_CACHE``.
    If the translation call fails, the untranslated keyword is returned
    (and not cached, so a later call retries) rather than aborting the
    whole request.
    """
    cached = _SPANISH_KEYWORD_CACHE.get(keyword)
    if cached is not None:
        return cached
    try:
        translated = translator.translate(keyword, src="en", dest="es").text.lower()
    except Exception:
        # Best effort: one failed lookup should not take down classification.
        import logging

        logging.getLogger(__name__).warning(
            "Could not translate keyword %r to Spanish; using it untranslated",
            keyword,
        )
        return keyword
    _SPANISH_KEYWORD_CACHE[keyword] = translated
    return translated


def get_keywords_by_language(text: str) -> tuple:
    """
    Pick the keyword lists that match the language of ``text``.

    1. Detect the language (using `langdetect`) from the first 200 characters.
    2. If Spanish ('es'), translate each English-based keyword to Spanish
       using googletrans (translations are cached per keyword).
    3. If English (or anything else), just use the original English lists.

    Returns a tuple ``(smishing_keywords, other_scam_keywords, lang)`` where
    ``lang`` is ``"es"`` or ``"en"``. Any language other than Spanish, and any
    detection failure, is reported as ``"en"``.
    """
    # Only a snippet is inspected, to reduce overhead on very large text.
    snippet = text[:200]
    try:
        detected_lang = detect(snippet)
    except Exception:
        # Detection failed (e.g. no usable characters): fall back to English.
        # `Exception` instead of a bare `except` so that KeyboardInterrupt and
        # SystemExit still propagate.
        detected_lang = "en"

    if detected_lang != "es":
        # Default to the English keywords.
        return SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en"

    smishing_in_spanish = [_to_spanish(kw) for kw in SMISHING_KEYWORDS]
    other_scam_in_spanish = [_to_spanish(kw) for kw in OTHER_SCAM_KEYWORDS]
    return smishing_in_spanish, other_scam_in_spanish, "es"

def boost_probabilities(probabilities: dict, text: str):
    """
    Adjust the zero-shot scores with keyword and URL heuristics.

    - Fetch the keyword lists that match the language of ``text``.
    - Each SMiShing / Other Scam keyword contained in the lower-cased text
      adds 0.30 to the corresponding class.
    - The presence of an http(s) URL adds a further 0.35 to SMiShing only.
    - The combined boost is taken away from 'Legitimate'.
    - Negative scores are clamped to 0 and the three scores are rescaled to
      sum to 1 (falling back to fully 'Legitimate' if they are all 0).

    Returns a dict with the keys "SMiShing", "Other Scam", "Legitimate" and
    "detected_lang" (the language code reported by the keyword lookup).
    """
    haystack = text.lower()

    # Keyword lists (English or Spanish) plus the language they were chosen for
    smishing_terms, scam_terms, language = get_keywords_by_language(text)

    # Number of distinct list entries that occur as substrings of the text
    smishing_hits = len([term for term in smishing_terms if term in haystack])
    scam_hits = len([term for term in scam_terms if term in haystack])

    smishing_bonus = 0.30 * smishing_hits
    scam_bonus = 0.30 * scam_hits

    # Any URL => +0.35, credited to SMiShing only
    if re.search(r"(https?://[^\s]+)", haystack) is not None:
        smishing_bonus += 0.35

    # Boost the two scam classes and charge the total to 'Legitimate'
    raised_smishing = probabilities["SMiShing"] + smishing_bonus
    raised_scam = probabilities["Other Scam"] + scam_bonus
    lowered_legit = probabilities["Legitimate"] - (smishing_bonus + scam_bonus)

    # Clamp anything that went below zero
    scores = [
        0.0 if score < 0 else score
        for score in (raised_smishing, raised_scam, lowered_legit)
    ]

    # Rescale so the three scores sum to 1
    norm = scores[0] + scores[1] + scores[2]
    if norm > 0:
        scores = [score / norm for score in scores]
    else:
        # Nothing left to normalize: treat the message as legitimate
        scores = [0.0, 0.0, 1.0]

    return {
        "SMiShing": scores[0],
        "Other Scam": scores[1],
        "Legitimate": scores[2],
        "detected_lang": language
    }

def smishing_detector(text, image):
    """
    Main function called by Gradio.

    1. Combine user text + OCR text (if an image is provided).
    2. Zero-shot classify => base probabilities.
    3. Apply language detection & translation if needed, then boost logic.
    4. Return final classification.

    Parameters:
        text: message text typed by the user; may be None or empty.
        image: optional image passed to pytesseract for OCR (Spanish +
            English); None when no screenshot was supplied.

    Returns:
        A dict. When no text is available at all it contains a
        "No text provided" label with confidence 0.0. Otherwise it contains
        the detected language, the text that was classified, the original
        and boosted probabilities (rounded to 3 decimals), the winning label
        and its confidence, the matched keywords per category and the URLs
        found.
    """
    combined_text = text or ""
    if image is not None:
        ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
        combined_text += " " + ocr_text
    combined_text = combined_text.strip()

    if not combined_text:
        return {
            "text_used_for_classification": "(none)",
            "label": "No text provided",
            "confidence": 0.0,
            "keywords_found": [],
            "urls_found": []
        }

    # 1. Zero-shot classification
    result = classifier(
        sequences=combined_text,
        candidate_labels=CANDIDATE_LABELS,
        hypothesis_template="This message is {}."
    )
    original_probs = dict(zip(result["labels"], result["scores"]))

    # 2. Boost logic (including language detection + translation)
    boosted = boost_probabilities(original_probs, combined_text)

    # The language code must be removed BEFORE taking the max: while it is
    # still in the dict, max() would compare its string value against the
    # float scores and raise TypeError.
    detected_lang = boosted.pop("detected_lang", "en")

    # Only the three class scores are left in `boosted` now.
    final_label = max(boosted, key=boosted.get)
    final_confidence = round(boosted[final_label], 3)

    # 3. Identify which keywords & URLs we found
    lower_text = combined_text.lower()

    # Match the scheme case-insensitively on the original text so the URLs
    # are reported exactly as they were written (URL paths are case-sensitive).
    found_urls = re.findall(r"(https?://[^\s]+)", combined_text, flags=re.IGNORECASE)

    # Second pass over the keywords so the user can see what matched:
    # - Spanish => the translated keyword lists
    # - anything else => the original English lists
    if detected_lang == "es":
        smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
    else:
        smishing_keys, scam_keys = SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS

    found_smishing = [kw for kw in smishing_keys if kw in lower_text]
    found_other_scam = [kw for kw in scam_keys if kw in lower_text]

    return {
        "detected_language": detected_lang,
        "text_used_for_classification": combined_text,
        "original_probabilities": {
            k: round(v, 3) for k, v in original_probs.items()
        },
        "boosted_probabilities": {
            k: round(v, 3) for k, v in boosted.items()
        },
        "label": final_label,
        "confidence": final_confidence,
        "smishing_keywords_found": found_smishing,
        "other_scam_keywords_found": found_other_scam,
        "urls_found": found_urls,
    }

# Input widgets: free-form message text plus an optional screenshot for OCR.
sms_textbox = gr.Textbox(
    lines=3,
    label="Paste Suspicious SMS Text (English/Spanish)",
    placeholder="Type or paste the message here..."
)
screenshot_upload = gr.Image(
    type="pil",
    label="Or Upload a Screenshot (Optional)"
)

# Text shown under the title of the web page.
app_description = """
This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
(joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
If Spanish, it translates the English-based keyword lists to Spanish before boosting the scores.
Any URL found further boosts SMiShing specifically.
"""

# Gradio app: feeds (text, image) to smishing_detector and renders its dict as JSON.
demo = gr.Interface(
    fn=smishing_detector,
    inputs=[sms_textbox, screenshot_upload],
    outputs="json",
    title="SMiShing & Scam Detector (Language Detection + Keyword Translation)",
    description=app_description,
    allow_flagging="never"
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()