hackerbyhobby committed on
Commit
2a76f84
·
unverified ·
1 Parent(s): 7169f21

rollback to the best

Browse files
Files changed (2) hide show
  1. app.py +208 -125
  2. app.py.bestoftues +0 -380
app.py CHANGED
@@ -7,16 +7,10 @@ from langdetect import detect
7
  from deep_translator import GoogleTranslator
8
  import openai
9
  import os
10
- import requests
11
- import json
12
 
13
  # Set your OpenAI API key
14
  openai.api_key = os.getenv("OPENAI_API_KEY")
15
 
16
- # Retrieve Google Safe Browsing API key from environment
17
- SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
18
- SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
19
-
20
  # Translator instance
21
  translator = GoogleTranslator(source="auto", target="es")
22
 
@@ -32,80 +26,6 @@ model_name = "joeddav/xlm-roberta-large-xnli"
32
  classifier = pipeline("zero-shot-classification", model=model_name)
33
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
34
 
35
- def check_urls_with_google_safebrowsing(urls):
36
- """
37
- Debug-enabled version of Google Safe Browsing check:
38
- - Prints payload and response to help troubleshoot issues.
39
- Returns a dict {url: bool is_malicious}.
40
- If the API key is missing or error occurs, returns {url: False}.
41
- """
42
- result = {}
43
- if not SAFE_BROWSING_API_KEY:
44
- print("No GOOGLE_SAFE_BROWSING_API_KEY found. Returning all URLs as safe.")
45
- for u in urls:
46
- result[u] = False
47
- return result
48
-
49
- threat_entries = [{"url": u} for u in urls]
50
- payload = {
51
- "client": {
52
- "clientId": "my-smishing-detector",
53
- "clientVersion": "1.0"
54
- },
55
- "threatInfo": {
56
- "threatTypes": [
57
- "MALWARE",
58
- "SOCIAL_ENGINEERING",
59
- "UNWANTED_SOFTWARE",
60
- "POTENTIALLY_HARMFUL_APPLICATION"
61
- ],
62
- "platformTypes": ["ANY_PLATFORM"],
63
- "threatEntryTypes": ["URL"],
64
- "threatEntries": threat_entries
65
- }
66
- }
67
-
68
- print("---- Safe Browsing Debug ----")
69
- print("REQUEST Endpoint:", SAFE_BROWSING_URL)
70
- print("API Key:", SAFE_BROWSING_API_KEY)
71
- print("REQUEST Payload (JSON):")
72
- print(json.dumps(payload, indent=2))
73
-
74
- try:
75
- resp = requests.post(
76
- SAFE_BROWSING_URL,
77
- params={"key": SAFE_BROWSING_API_KEY},
78
- json=payload,
79
- timeout=10
80
- )
81
- print("RESPONSE Status Code:", resp.status_code)
82
- try:
83
- data = resp.json()
84
- print("RESPONSE JSON:")
85
- print(json.dumps(data, indent=2))
86
- except Exception as parse_err:
87
- print("Error parsing response as JSON:", parse_err)
88
- data = {}
89
-
90
- malicious_urls = set()
91
- if "matches" in data:
92
- for match in data["matches"]:
93
- threat_url = match.get("threat", {}).get("url")
94
- if threat_url:
95
- malicious_urls.add(threat_url)
96
-
97
- for u in urls:
98
- result[u] = (u in malicious_urls)
99
-
100
- except Exception as e:
101
- print(f"Error contacting Safe Browsing API: {e}")
102
- for u in urls:
103
- result[u] = False
104
-
105
- print("RESULTS (url -> malicious):", result)
106
- print("---- End Debug ----\n")
107
- return result
108
-
109
  def get_keywords_by_language(text: str):
110
  """
111
  Detect language using langdetect and translate keywords if needed.
@@ -129,8 +49,7 @@ def get_keywords_by_language(text: str):
129
 
130
  def boost_probabilities(probabilities: dict, text: str):
131
  """
132
- Boost probabilities based on keyword matches, presence of URLs,
133
- and Google Safe Browsing checks.
134
  """
135
  lower_text = text.lower()
136
  smishing_keywords, other_scam_keywords, detected_lang = get_keywords_by_language(text)
@@ -141,11 +60,7 @@ def boost_probabilities(probabilities: dict, text: str):
141
  smishing_boost = 0.30 * smishing_count
142
  other_scam_boost = 0.30 * other_scam_count
143
 
144
- # More robust URL pattern
145
- found_urls = re.findall(
146
- r"(https?://[^\s]+|\b[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk)\b)",
147
- lower_text
148
- )
149
  if found_urls:
150
  smishing_boost += 0.35
151
 
@@ -157,11 +72,12 @@ def boost_probabilities(probabilities: dict, text: str):
157
  p_other_scam += other_scam_boost
158
  p_legit -= (smishing_boost + other_scam_boost)
159
 
160
- # Preliminary clamp & normalization
161
  p_smishing = max(p_smishing, 0.0)
162
  p_other_scam = max(p_other_scam, 0.0)
163
  p_legit = max(p_legit, 0.0)
164
 
 
165
  total = p_smishing + p_other_scam + p_legit
166
  if total > 0:
167
  p_smishing /= total
@@ -170,29 +86,147 @@ def boost_probabilities(probabilities: dict, text: str):
170
  else:
171
  p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
172
 
173
- # **Now** check Safe Browsing (with debug prints)
174
- sb_results = {}
175
- if found_urls:
176
- sb_results = check_urls_with_google_safebrowsing(found_urls)
177
- # If any malicious => set p_smishing=1.0
178
- if any(sb_results[u] for u in sb_results):
179
- p_smishing = 1.0
180
- p_other_scam = 0.0
181
- p_legit = 0.0
182
-
183
  return {
184
  "SMiShing": p_smishing,
185
  "Other Scam": p_other_scam,
186
  "Legitimate": p_legit,
187
- "detected_lang": detected_lang,
188
- "found_urls": found_urls,
189
- "safe_browsing_results": sb_results
190
  }
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  def smishing_detector(input_type, text, image):
193
  """
194
  Main detection function combining text (if 'Text') & OCR (if 'Screenshot'),
195
- and debugging logs for Safe Browsing calls.
 
 
196
  """
197
  if input_type == "Text":
198
  combined_text = text.strip() if text else ""
@@ -208,7 +242,9 @@ def smishing_detector(input_type, text, image):
208
  "confidence": 0.0,
209
  "keywords_found": [],
210
  "urls_found": [],
211
- "safe_browsing_results": {},
 
 
212
  }
213
 
214
  # 1. Local zero-shot classification
@@ -219,45 +255,90 @@ def smishing_detector(input_type, text, image):
219
  )
220
  original_probs = {k: float(v) for k, v in zip(local_result["labels"], local_result["scores"])}
221
 
222
- # 2. Boost with keywords, URLs, and Safe Browsing checks
223
- boosted_dict = boost_probabilities(original_probs, combined_text)
224
- detected_lang = boosted_dict.pop("detected_lang", "en")
225
- sb_results = boosted_dict.pop("safe_browsing_results", {})
226
- found_urls = boosted_dict.pop("found_urls", [])
227
-
228
- for k in boosted_dict:
229
- boosted_dict[k] = float(boosted_dict[k])
230
-
231
- final_label = max(boosted_dict, key=boosted_dict.get)
232
- final_confidence = round(boosted_dict[final_label], 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
  return {
235
  "detected_language": detected_lang,
236
  "text_used_for_classification": combined_text,
237
  "original_probabilities": {k: round(v, 3) for k, v in original_probs.items()},
238
- "boosted_probabilities": {k: round(v, 3) for k, v in boosted_dict.items()},
 
 
 
239
  "label": final_label,
240
  "confidence": final_confidence,
 
 
241
  "urls_found": found_urls,
242
- "safe_browsing_results": sb_results
243
  }
244
 
245
  #
246
  # Gradio interface with dynamic visibility
247
  #
248
  def toggle_inputs(choice):
 
 
 
249
  if choice == "Text":
 
250
  return gr.update(visible=True), gr.update(visible=False)
251
  else:
 
 
252
  return gr.update(visible=False), gr.update(visible=True)
253
 
254
  with gr.Blocks() as demo:
255
- gr.Markdown("## SMiShing & Scam Detector with Debug-Enabled Safe Browsing")
256
-
257
  with gr.Row():
258
  input_type = gr.Radio(
259
- choices=["Text", "Screenshot"],
260
- value="Text",
261
  label="Choose Input Type"
262
  )
263
 
@@ -265,14 +346,16 @@ with gr.Blocks() as demo:
265
  lines=3,
266
  label="Paste Suspicious SMS Text",
267
  placeholder="Type or paste the message here...",
268
- visible=True
269
  )
 
270
  image_input = gr.Image(
271
  type="pil",
272
  label="Upload Screenshot",
273
- visible=False
274
  )
275
 
 
276
  input_type.change(
277
  fn=toggle_inputs,
278
  inputs=input_type,
@@ -280,9 +363,11 @@ with gr.Blocks() as demo:
280
  queue=False
281
  )
282
 
 
283
  analyze_btn = gr.Button("Classify")
284
  output_json = gr.JSON(label="Result")
285
 
 
286
  analyze_btn.click(
287
  fn=smishing_detector,
288
  inputs=[input_type, text_input, image_input],
@@ -291,7 +376,5 @@ with gr.Blocks() as demo:
291
 
292
  if __name__ == "__main__":
293
  if not openai.api_key:
294
- print("WARNING: OPENAI_API_KEY not set. LLM calls may fail.")
295
- if not SAFE_BROWSING_API_KEY:
296
- print("WARNING: GOOGLE_SAFE_BROWSING_API_KEY not set. All URLs returned as safe.")
297
  demo.launch()
 
7
  from deep_translator import GoogleTranslator
8
  import openai
9
  import os
 
 
10
 
11
  # Set your OpenAI API key
12
  openai.api_key = os.getenv("OPENAI_API_KEY")
13
 
 
 
 
 
14
  # Translator instance
15
  translator = GoogleTranslator(source="auto", target="es")
16
 
 
26
  classifier = pipeline("zero-shot-classification", model=model_name)
27
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def get_keywords_by_language(text: str):
30
  """
31
  Detect language using langdetect and translate keywords if needed.
 
49
 
50
  def boost_probabilities(probabilities: dict, text: str):
51
  """
52
+ Boost probabilities based on keyword matches and presence of URLs.
 
53
  """
54
  lower_text = text.lower()
55
  smishing_keywords, other_scam_keywords, detected_lang = get_keywords_by_language(text)
 
60
  smishing_boost = 0.30 * smishing_count
61
  other_scam_boost = 0.30 * other_scam_count
62
 
63
+ found_urls = re.findall(r"(https?://[^\s]+|\b(?:[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk))\b)", lower_text)
 
 
 
 
64
  if found_urls:
65
  smishing_boost += 0.35
66
 
 
72
  p_other_scam += other_scam_boost
73
  p_legit -= (smishing_boost + other_scam_boost)
74
 
75
+ # Clamp
76
  p_smishing = max(p_smishing, 0.0)
77
  p_other_scam = max(p_other_scam, 0.0)
78
  p_legit = max(p_legit, 0.0)
79
 
80
+ # Re-normalize
81
  total = p_smishing + p_other_scam + p_legit
82
  if total > 0:
83
  p_smishing /= total
 
86
  else:
87
  p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
88
 
 
 
 
 
 
 
 
 
 
 
89
  return {
90
  "SMiShing": p_smishing,
91
  "Other Scam": p_other_scam,
92
  "Legitimate": p_legit,
93
+ "detected_lang": detected_lang
 
 
94
  }
95
 
96
def query_llm_for_classification(raw_message: str) -> dict:
    """
    First LLM call: ask the chat model to classify the message.

    The model is prompted as a cybersecurity expert and asked to answer with
    JSON holding ``label`` (SMiShing, Other Scam, or Legitimate) and
    ``reason``.

    Args:
        raw_message: Message text to classify. ``None`` or blank text is
            answered locally without calling the API.

    Returns:
        A dict with the keys ``label`` and ``reason``. ``label`` is "Unknown"
        when there is no message, the API call fails, or the reply is not a
        JSON object containing both keys. Errors are reported through the
        returned dict rather than raised.
    """
    # Guard None as well as blank text: calling .strip() on None would raise
    # before the try block below could turn the failure into a result.
    if not raw_message or not raw_message.strip():
        return {"label": "Unknown", "reason": "No message provided to the LLM."}

    system_prompt = (
        "You are a cybersecurity expert. You will classify the user's message "
        "as one of: SMiShing, Other Scam, or Legitimate. Provide a short reason. "
        "Return only JSON with keys: label, reason."
    )
    user_prompt = f"Message: {raw_message}\nClassify it as SMiShing, Other Scam, or Legitimate."

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2
        )
        raw_reply = response["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Boundary with an external service: any failure becomes an
        # "Unknown" result so the rest of the pipeline can still run.
        return {"label": "Unknown", "reason": f"LLM error: {e}"}

    return _parse_llm_classification(raw_reply)


def _parse_llm_classification(raw_reply: str) -> dict:
    """Turn the model's raw reply text into a ``{"label", "reason"}`` dict."""
    import json  # local import keeps this block independent of file-level imports

    # Chat models often wrap the JSON in Markdown fences or add prose around
    # it, so parse only the outermost {...} span when one is present.
    start, end = raw_reply.find("{"), raw_reply.rfind("}")
    candidate = raw_reply[start:end + 1] if 0 <= start < end else raw_reply

    try:
        parsed = json.loads(candidate)
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        return {"label": "Unknown", "reason": f"LLM error: {e}"}

    # A JSON list or string would pass an "in" check but break callers that
    # use .get(), so insist on an object with both keys.
    if not isinstance(parsed, dict) or "label" not in parsed or "reason" not in parsed:
        return {"label": "Unknown", "reason": f"Unexpected format: {raw_reply}"}

    # Map case/spacing variants ("smishing", " Other scam ") onto the
    # canonical label names; unrecognized labels are passed through as-is.
    canonical = {name.casefold(): name for name in ("SMiShing", "Other Scam", "Legitimate")}
    label = parsed["label"]
    if isinstance(label, str):
        parsed["label"] = canonical.get(label.strip().casefold(), label)

    return parsed
131
+
132
def incorporate_llm_label(boosted: dict, llm_label: str) -> dict:
    """
    Blend the LLM's classification into the locally boosted probabilities.

    Adds 0.2 to the score of the label the LLM chose, clamps negative scores
    to 0.0, and re-normalizes so the scores sum to 1.0.

    Args:
        boosted: Mapping of label name to score. It is not modified.
        llm_label: Label returned by the LLM. Matching against "SMiShing",
            "Other Scam" and "Legitimate" ignores case and surrounding
            whitespace; any other value (e.g. "Unknown") adds no bonus.

    Returns:
        A new dict with the adjusted, normalized scores. If every score is
        zero, the result falls back to Legitimate = 1.0 and the other two
        labels = 0.0.
    """
    by_folded_name = {
        name.casefold(): name for name in ("SMiShing", "Other Scam", "Legitimate")
    }

    # Work on a copy so the caller's pre-LLM scores are left untouched.
    adjusted = dict(boosted)

    chosen = None
    if isinstance(llm_label, str):
        chosen = by_folded_name.get(llm_label.strip().casefold())
    if chosen is not None:
        # .get() tolerates a score dict that lacks the chosen label.
        adjusted[chosen] = adjusted.get(chosen, 0.0) + 0.2
    # else: unrecognized label ("Unknown") => no bonus

    # Clamp
    adjusted = {name: max(score, 0.0) for name, score in adjusted.items()}

    # Re-normalize
    total = sum(adjusted.values())
    if total > 0:
        return {name: score / total for name, score in adjusted.items()}

    # Fallback when there is nothing to normalize
    adjusted.update({"Legitimate": 1.0, "SMiShing": 0.0, "Other Scam": 0.0})
    return adjusted
161
+
162
def query_llm_for_explanation(
    text: str,
    final_label: str,
    final_conf: float,
    local_label: str,
    local_conf: float,
    llm_label: str,
    llm_reason: str,
    found_smishing: list,
    found_other_scam: list,
    found_urls: list,
    detected_lang: str
) -> str:
    """
    Second LLM call: ask the chat model for a short, user-facing summary.

    The summary is requested in Spanish when ``detected_lang`` is "es" and in
    English for every other value. The prompt carries the message text, the
    local and LLM classifications, the final label with its confidence, and
    the matched keywords and URLs.

    Returns the model's reply with surrounding whitespace stripped, or a
    "Could not generate final explanation..." message if the request fails.
    """
    spanish_prompt = (
        "Eres un experto en ciberseguridad. Proporciona una explicación final al usuario en español. "
        "Combina la clasificación local, la clasificación LLM y la etiqueta final en una sola explicación breve. "
        "No reveles el código interno ni el JSON bruto; simplemente da una breve explicación fácil de entender. "
        "Termina con la etiqueta final. "
    )
    english_prompt = (
        "You are a cybersecurity expert providing a final explanation to the user in English. "
        "Combine the local classification, the LLM classification, and the final label "
        "into one concise explanation. Do not reveal internal code or raw JSON. "
        "End with a final statement of the final label."
    )
    # Anything other than Spanish falls back to the English instructions.
    system_prompt = spanish_prompt if detected_lang == "es" else english_prompt

    user_context = f"""
User Message:
{text}

Local Classification => Label: {local_label}, Confidence: {local_conf}
LLM Classification => Label: {llm_label}, Reason: {llm_reason}
Final Overall Label => {final_label} (confidence {final_conf})

Suspicious SMiShing Keywords => {found_smishing}
Suspicious Other Scam Keywords => {found_other_scam}
URLs => {found_urls}
"""

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_context}
    ]

    try:
        reply = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
            temperature=0.2
        )
        return reply["choices"][0]["message"]["content"].strip()
    except Exception as err:
        # Report the failure as the explanation text instead of raising.
        return f"Could not generate final explanation due to error: {err}"
223
+
224
  def smishing_detector(input_type, text, image):
225
  """
226
  Main detection function combining text (if 'Text') & OCR (if 'Screenshot'),
227
+ plus two LLM calls:
228
+ 1) classification to adjust final probabilities,
229
+ 2) a final explanation summarizing the outcome in the detected language.
230
  """
231
  if input_type == "Text":
232
  combined_text = text.strip() if text else ""
 
242
  "confidence": 0.0,
243
  "keywords_found": [],
244
  "urls_found": [],
245
+ "llm_label": "Unknown",
246
+ "llm_reason": "No text to analyze",
247
+ "final_explanation": "No text provided"
248
  }
249
 
250
  # 1. Local zero-shot classification
 
255
  )
256
  original_probs = {k: float(v) for k, v in zip(local_result["labels"], local_result["scores"])}
257
 
258
+ # 2. Basic boosting from keywords & URLs
259
+ boosted = boost_probabilities(original_probs, combined_text)
260
+ detected_lang = boosted.pop("detected_lang", "en")
261
+
262
+ # Convert to float only
263
+ for k in boosted:
264
+ boosted[k] = float(boosted[k])
265
+
266
+ local_label = max(boosted, key=boosted.get)
267
+ local_conf = round(boosted[local_label], 3)
268
+
269
+ # 3. LLM Classification
270
+ llm_classification = query_llm_for_classification(combined_text)
271
+ llm_label = llm_classification.get("label", "Unknown")
272
+ llm_reason = llm_classification.get("reason", "No reason provided")
273
+
274
+ # 4. Incorporate LLM’s label into final probabilities
275
+ boosted = incorporate_llm_label(boosted, llm_label)
276
+
277
+ # Now we have updated probabilities
278
+ final_label = max(boosted, key=boosted.get)
279
+ final_confidence = round(boosted[final_label], 3)
280
+
281
+ # 5. Gather found keywords & URLs
282
+ lower_text = combined_text.lower()
283
+ smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
284
+
285
+ found_urls = re.findall(r"(https?://[^\s]+|\b(?:[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk))\b)", lower_text)
286
+ found_smishing = [kw for kw in smishing_keys if kw in lower_text]
287
+ found_other_scam = [kw for kw in scam_keys if kw in lower_text]
288
+
289
+ # 6. Final LLM explanation (in detected_lang)
290
+ final_explanation = query_llm_for_explanation(
291
+ text=combined_text,
292
+ final_label=final_label,
293
+ final_conf=final_confidence,
294
+ local_label=local_label,
295
+ local_conf=local_conf,
296
+ llm_label=llm_label,
297
+ llm_reason=llm_reason,
298
+ found_smishing=found_smishing,
299
+ found_other_scam=found_other_scam,
300
+ found_urls=found_urls,
301
+ detected_lang=detected_lang
302
+ )
303
 
304
  return {
305
  "detected_language": detected_lang,
306
  "text_used_for_classification": combined_text,
307
  "original_probabilities": {k: round(v, 3) for k, v in original_probs.items()},
308
+ "boosted_probabilities_before_llm": {local_label: local_conf},
309
+ "llm_label": llm_label,
310
+ "llm_reason": llm_reason,
311
+ "boosted_probabilities_after_llm": {k: round(v, 3) for k, v in boosted.items()},
312
  "label": final_label,
313
  "confidence": final_confidence,
314
+ "smishing_keywords_found": found_smishing,
315
+ "other_scam_keywords_found": found_other_scam,
316
  "urls_found": found_urls,
317
+ "final_explanation": final_explanation,
318
  }
319
 
320
  #
321
  # Gradio interface with dynamic visibility
322
  #
323
  def toggle_inputs(choice):
324
+ """
325
+ Return updates for (text_input, image_input) based on the radio selection.
326
+ """
327
  if choice == "Text":
328
+ # Show text input, hide image
329
  return gr.update(visible=True), gr.update(visible=False)
330
  else:
331
+ # choice == "Screenshot"
332
+ # Hide text input, show image
333
  return gr.update(visible=False), gr.update(visible=True)
334
 
335
  with gr.Blocks() as demo:
336
+ gr.Markdown("## SMiShing & Scam Detector with LLM-Enhanced Logic (Multilingual Explanation)")
337
+
338
  with gr.Row():
339
  input_type = gr.Radio(
340
+ choices=["Text", "Screenshot"],
341
+ value="Text",
342
  label="Choose Input Type"
343
  )
344
 
 
346
  lines=3,
347
  label="Paste Suspicious SMS Text",
348
  placeholder="Type or paste the message here...",
349
+ visible=True # default
350
  )
351
+
352
  image_input = gr.Image(
353
  type="pil",
354
  label="Upload Screenshot",
355
+ visible=False # hidden by default
356
  )
357
 
358
+ # Whenever input_type changes, toggle which input is visible
359
  input_type.change(
360
  fn=toggle_inputs,
361
  inputs=input_type,
 
363
  queue=False
364
  )
365
 
366
+ # Button to run classification
367
  analyze_btn = gr.Button("Classify")
368
  output_json = gr.JSON(label="Result")
369
 
370
+ # On button click, call the smishing_detector
371
  analyze_btn.click(
372
  fn=smishing_detector,
373
  inputs=[input_type, text_input, image_input],
 
376
 
377
  if __name__ == "__main__":
378
  if not openai.api_key:
379
+ print("WARNING: OPENAI_API_KEY not set. LLM calls will fail or be skipped.")
 
 
380
  demo.launch()
app.py.bestoftues DELETED
@@ -1,380 +0,0 @@
1
- import gradio as gr
2
- import pytesseract
3
- from PIL import Image
4
- from transformers import pipeline
5
- import re
6
- from langdetect import detect
7
- from deep_translator import GoogleTranslator
8
- import openai
9
- import os
10
-
11
- # Set your OpenAI API key
12
- openai.api_key = os.getenv("OPENAI_API_KEY")
13
-
14
- # Translator instance
15
- translator = GoogleTranslator(source="auto", target="es")
16
-
17
- # 1. Load separate keywords for SMiShing and Other Scam (assumed in English)
18
- with open("smishing_keywords.txt", "r", encoding="utf-8") as f:
19
- SMISHING_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
20
-
21
- with open("other_scam_keywords.txt", "r", encoding="utf-8") as f:
22
- OTHER_SCAM_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
23
-
24
- # 2. Zero-Shot Classification Pipeline
25
- model_name = "joeddav/xlm-roberta-large-xnli"
26
- classifier = pipeline("zero-shot-classification", model=model_name)
27
- CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
28
-
29
- def get_keywords_by_language(text: str):
30
- """
31
- Detect language using langdetect and translate keywords if needed.
32
- """
33
- snippet = text[:200]
34
- try:
35
- detected_lang = detect(snippet)
36
- except Exception:
37
- detected_lang = "en"
38
-
39
- if detected_lang == "es":
40
- smishing_in_spanish = [
41
- translator.translate(kw).lower() for kw in SMISHING_KEYWORDS
42
- ]
43
- other_scam_in_spanish = [
44
- translator.translate(kw).lower() for kw in OTHER_SCAM_KEYWORDS
45
- ]
46
- return smishing_in_spanish, other_scam_in_spanish, "es"
47
- else:
48
- return SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en"
49
-
50
- def boost_probabilities(probabilities: dict, text: str):
51
- """
52
- Boost probabilities based on keyword matches and presence of URLs.
53
- """
54
- lower_text = text.lower()
55
- smishing_keywords, other_scam_keywords, detected_lang = get_keywords_by_language(text)
56
-
57
- smishing_count = sum(1 for kw in smishing_keywords if kw in lower_text)
58
- other_scam_count = sum(1 for kw in other_scam_keywords if kw in lower_text)
59
-
60
- smishing_boost = 0.30 * smishing_count
61
- other_scam_boost = 0.30 * other_scam_count
62
-
63
- found_urls = re.findall(r"(https?://[^\s]+|\b(?:[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk))\b)", lower_text)
64
- if found_urls:
65
- smishing_boost += 0.35
66
-
67
- p_smishing = probabilities.get("SMiShing", 0.0)
68
- p_other_scam = probabilities.get("Other Scam", 0.0)
69
- p_legit = probabilities.get("Legitimate", 1.0)
70
-
71
- p_smishing += smishing_boost
72
- p_other_scam += other_scam_boost
73
- p_legit -= (smishing_boost + other_scam_boost)
74
-
75
- # Clamp
76
- p_smishing = max(p_smishing, 0.0)
77
- p_other_scam = max(p_other_scam, 0.0)
78
- p_legit = max(p_legit, 0.0)
79
-
80
- # Re-normalize
81
- total = p_smishing + p_other_scam + p_legit
82
- if total > 0:
83
- p_smishing /= total
84
- p_other_scam /= total
85
- p_legit /= total
86
- else:
87
- p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
88
-
89
- return {
90
- "SMiShing": p_smishing,
91
- "Other Scam": p_other_scam,
92
- "Legitimate": p_legit,
93
- "detected_lang": detected_lang
94
- }
95
-
96
- def query_llm_for_classification(raw_message: str) -> dict:
97
- """
98
- First LLM call: asks for a classification (SMiShing, Other Scam, or Legitimate)
99
- acting as a cybersecurity expert. Returns label and short reason.
100
- """
101
- if not raw_message.strip():
102
- return {"label": "Unknown", "reason": "No message provided to the LLM."}
103
-
104
- system_prompt = (
105
- "You are a cybersecurity expert. You will classify the user's message "
106
- "as one of: SMiShing, Other Scam, or Legitimate. Provide a short reason. "
107
- "Return only JSON with keys: label, reason."
108
- )
109
- user_prompt = f"Message: {raw_message}\nClassify it as SMiShing, Other Scam, or Legitimate."
110
-
111
- try:
112
- response = openai.ChatCompletion.create(
113
- model="gpt-3.5-turbo",
114
- messages=[
115
- {"role": "system", "content": system_prompt},
116
- {"role": "user", "content": user_prompt}
117
- ],
118
- temperature=0.2
119
- )
120
- raw_reply = response["choices"][0]["message"]["content"].strip()
121
-
122
- import json
123
- llm_result = json.loads(raw_reply)
124
- if "label" not in llm_result or "reason" not in llm_result:
125
- return {"label": "Unknown", "reason": f"Unexpected format: {raw_reply}"}
126
-
127
- return llm_result
128
-
129
- except Exception as e:
130
- return {"label": "Unknown", "reason": f"LLM error: {e}"}
131
-
132
- def incorporate_llm_label(boosted: dict, llm_label: str) -> dict:
133
- """
134
- Adjust the final probabilities based on the LLM's classification.
135
- If LLM says SMiShing, add +0.2 to SMiShing, etc. Then clamp & re-normalize.
136
- """
137
- if llm_label == "SMiShing":
138
- boosted["SMiShing"] += 0.2
139
- elif llm_label == "Other Scam":
140
- boosted["Other Scam"] += 0.2
141
- elif llm_label == "Legitimate":
142
- boosted["Legitimate"] += 0.2
143
- # else "Unknown" => do nothing
144
-
145
- # clamp
146
- for k in boosted:
147
- if boosted[k] < 0:
148
- boosted[k] = 0.0
149
-
150
- total = sum(boosted.values())
151
- if total > 0:
152
- for k in boosted:
153
- boosted[k] /= total
154
- else:
155
- # fallback
156
- boosted["Legitimate"] = 1.0
157
- boosted["SMiShing"] = 0.0
158
- boosted["Other Scam"] = 0.0
159
-
160
- return boosted
161
-
162
- def query_llm_for_explanation(
163
- text: str,
164
- final_label: str,
165
- final_conf: float,
166
- local_label: str,
167
- local_conf: float,
168
- llm_label: str,
169
- llm_reason: str,
170
- found_smishing: list,
171
- found_other_scam: list,
172
- found_urls: list,
173
- detected_lang: str
174
- ) -> str:
175
- """
176
- Second LLM call: provides a holistic explanation of the final classification
177
- in the same language as detected_lang (English or Spanish).
178
- """
179
- # Decide the language for final explanation
180
- if detected_lang == "es":
181
- # Spanish
182
- system_prompt = (
183
- "Eres un experto en ciberseguridad. Proporciona una explicación final al usuario en español. "
184
- "Combina la clasificación local, la clasificación LLM y la etiqueta final en una sola explicación breve. "
185
- "No reveles el código interno ni el JSON bruto; simplemente da una breve explicación fácil de entender. "
186
- "Termina con la etiqueta final. "
187
- )
188
- else:
189
- # Default to English
190
- system_prompt = (
191
- "You are a cybersecurity expert providing a final explanation to the user in English. "
192
- "Combine the local classification, the LLM classification, and the final label "
193
- "into one concise explanation. Do not reveal internal code or raw JSON. "
194
- "End with a final statement of the final label."
195
- )
196
-
197
- user_context = f"""
198
- User Message:
199
- {text}
200
-
201
- Local Classification => Label: {local_label}, Confidence: {local_conf}
202
- LLM Classification => Label: {llm_label}, Reason: {llm_reason}
203
- Final Overall Label => {final_label} (confidence {final_conf})
204
-
205
- Suspicious SMiShing Keywords => {found_smishing}
206
- Suspicious Other Scam Keywords => {found_other_scam}
207
- URLs => {found_urls}
208
- """
209
-
210
- try:
211
- response = openai.ChatCompletion.create(
212
- model="gpt-3.5-turbo",
213
- messages=[
214
- {"role": "system", "content": system_prompt},
215
- {"role": "user", "content": user_context}
216
- ],
217
- temperature=0.2
218
- )
219
- final_explanation = response["choices"][0]["message"]["content"].strip()
220
- return final_explanation
221
- except Exception as e:
222
- return f"Could not generate final explanation due to error: {e}"
223
-
224
- def smishing_detector(input_type, text, image):
225
- """
226
- Main detection function combining text (if 'Text') & OCR (if 'Screenshot'),
227
- plus two LLM calls:
228
- 1) classification to adjust final probabilities,
229
- 2) a final explanation summarizing the outcome in the detected language.
230
- """
231
- if input_type == "Text":
232
- combined_text = text.strip() if text else ""
233
- else:
234
- combined_text = ""
235
- if image is not None:
236
- combined_text = pytesseract.image_to_string(image, lang="spa+eng").strip()
237
-
238
- if not combined_text:
239
- return {
240
- "text_used_for_classification": "(none)",
241
- "label": "No text provided",
242
- "confidence": 0.0,
243
- "keywords_found": [],
244
- "urls_found": [],
245
- "llm_label": "Unknown",
246
- "llm_reason": "No text to analyze",
247
- "final_explanation": "No text provided"
248
- }
249
-
250
- # 1. Local zero-shot classification
251
- local_result = classifier(
252
- sequences=combined_text,
253
- candidate_labels=CANDIDATE_LABELS,
254
- hypothesis_template="This message is {}."
255
- )
256
- original_probs = {k: float(v) for k, v in zip(local_result["labels"], local_result["scores"])}
257
-
258
- # 2. Basic boosting from keywords & URLs
259
- boosted = boost_probabilities(original_probs, combined_text)
260
- detected_lang = boosted.pop("detected_lang", "en")
261
-
262
- # Convert to float only
263
- for k in boosted:
264
- boosted[k] = float(boosted[k])
265
-
266
- local_label = max(boosted, key=boosted.get)
267
- local_conf = round(boosted[local_label], 3)
268
-
269
- # 3. LLM Classification
270
- llm_classification = query_llm_for_classification(combined_text)
271
- llm_label = llm_classification.get("label", "Unknown")
272
- llm_reason = llm_classification.get("reason", "No reason provided")
273
-
274
- # 4. Incorporate LLM’s label into final probabilities
275
- boosted = incorporate_llm_label(boosted, llm_label)
276
-
277
- # Now we have updated probabilities
278
- final_label = max(boosted, key=boosted.get)
279
- final_confidence = round(boosted[final_label], 3)
280
-
281
- # 5. Gather found keywords & URLs
282
- lower_text = combined_text.lower()
283
- smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
284
-
285
- found_urls = re.findall(r"(https?://[^\s]+|\b(?:[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk))\b)", lower_text)
286
- found_smishing = [kw for kw in smishing_keys if kw in lower_text]
287
- found_other_scam = [kw for kw in scam_keys if kw in lower_text]
288
-
289
- # 6. Final LLM explanation (in detected_lang)
290
- final_explanation = query_llm_for_explanation(
291
- text=combined_text,
292
- final_label=final_label,
293
- final_conf=final_confidence,
294
- local_label=local_label,
295
- local_conf=local_conf,
296
- llm_label=llm_label,
297
- llm_reason=llm_reason,
298
- found_smishing=found_smishing,
299
- found_other_scam=found_other_scam,
300
- found_urls=found_urls,
301
- detected_lang=detected_lang
302
- )
303
-
304
- return {
305
- "detected_language": detected_lang,
306
- "text_used_for_classification": combined_text,
307
- "original_probabilities": {k: round(v, 3) for k, v in original_probs.items()},
308
- "boosted_probabilities_before_llm": {local_label: local_conf},
309
- "llm_label": llm_label,
310
- "llm_reason": llm_reason,
311
- "boosted_probabilities_after_llm": {k: round(v, 3) for k, v in boosted.items()},
312
- "label": final_label,
313
- "confidence": final_confidence,
314
- "smishing_keywords_found": found_smishing,
315
- "other_scam_keywords_found": found_other_scam,
316
- "urls_found": found_urls,
317
- "final_explanation": final_explanation,
318
- }
319
-
320
- #
321
- # Gradio interface with dynamic visibility
322
- #
323
- def toggle_inputs(choice):
324
- """
325
- Return updates for (text_input, image_input) based on the radio selection.
326
- """
327
- if choice == "Text":
328
- # Show text input, hide image
329
- return gr.update(visible=True), gr.update(visible=False)
330
- else:
331
- # choice == "Screenshot"
332
- # Hide text input, show image
333
- return gr.update(visible=False), gr.update(visible=True)
334
-
335
- with gr.Blocks() as demo:
336
- gr.Markdown("## SMiShing & Scam Detector with LLM-Enhanced Logic (Multilingual Explanation)")
337
-
338
- with gr.Row():
339
- input_type = gr.Radio(
340
- choices=["Text", "Screenshot"],
341
- value="Text",
342
- label="Choose Input Type"
343
- )
344
-
345
- text_input = gr.Textbox(
346
- lines=3,
347
- label="Paste Suspicious SMS Text",
348
- placeholder="Type or paste the message here...",
349
- visible=True # default
350
- )
351
-
352
- image_input = gr.Image(
353
- type="pil",
354
- label="Upload Screenshot",
355
- visible=False # hidden by default
356
- )
357
-
358
- # Whenever input_type changes, toggle which input is visible
359
- input_type.change(
360
- fn=toggle_inputs,
361
- inputs=input_type,
362
- outputs=[text_input, image_input],
363
- queue=False
364
- )
365
-
366
- # Button to run classification
367
- analyze_btn = gr.Button("Classify")
368
- output_json = gr.JSON(label="Result")
369
-
370
- # On button click, call the smishing_detector
371
- analyze_btn.click(
372
- fn=smishing_detector,
373
- inputs=[input_type, text_input, image_input],
374
- outputs=output_json
375
- )
376
-
377
- if __name__ == "__main__":
378
- if not openai.api_key:
379
- print("WARNING: OPENAI_API_KEY not set. LLM calls will fail or be skipped.")
380
- demo.launch()