hackerbyhobby committed
Commit 07e1348 (unverified)
1 parent: f7e3939

updated to improve google safebrowsing scoring

Files changed (2):
  1. app.py +96 -14
  2. app.py.bestoftues +31 -20
app.py CHANGED
@@ -7,10 +7,16 @@ from langdetect import detect
 from deep_translator import GoogleTranslator
 import openai
 import os
+import requests
+import json

 # Set your OpenAI API key
 openai.api_key = os.getenv("OPENAI_API_KEY")

+# Retrieve Google Safe Browsing API key from environment
+SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
+SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
+
 # Translator instance
 translator = GoogleTranslator(source="auto", target="es")

@@ -26,6 +32,68 @@ model_name = "joeddav/xlm-roberta-large-xnli"
 classifier = pipeline("zero-shot-classification", model=model_name)
 CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]

+def check_urls_with_google_safebrowsing(urls):
+    """
+    Queries the Google Safe Browsing API to see if any URLs are malicious.
+    Returns a dict {url: bool is_malicious}.
+    If the API key is missing or an error occurs, returns {url: False} for all.
+    """
+    result = {}
+    if not SAFE_BROWSING_API_KEY:
+        # No key: we can't check. Return all as safe.
+        for u in urls:
+            result[u] = False
+        return result
+
+    # Build threatEntries for each URL
+    threat_entries = [{"url": u} for u in urls]
+
+    payload = {
+        "client": {
+            "clientId": "my-smishing-detector",
+            "clientVersion": "1.0"
+        },
+        "threatInfo": {
+            "threatTypes": [
+                "MALWARE",
+                "SOCIAL_ENGINEERING",
+                "UNWANTED_SOFTWARE",
+                "POTENTIALLY_HARMFUL_APPLICATION"
+            ],
+            "platformTypes": ["ANY_PLATFORM"],
+            "threatEntryTypes": ["URL"],
+            "threatEntries": threat_entries
+        }
+    }
+
+    try:
+        resp = requests.post(
+            SAFE_BROWSING_URL,
+            params={"key": SAFE_BROWSING_API_KEY},
+            json=payload,
+            timeout=10
+        )
+        data = resp.json()
+        # If "matches" is present, some URL is flagged
+        # Each match has "threat": {"url": "..."}
+        malicious_urls = set()
+        if "matches" in data:
+            for match in data["matches"]:
+                threat_url = match.get("threat", {}).get("url")
+                if threat_url:
+                    malicious_urls.add(threat_url)
+
+        for u in urls:
+            result[u] = (u in malicious_urls)
+
+    except Exception as e:
+        print(f"Error contacting Safe Browsing API: {e}")
+        # default: everything safe if error
+        for u in urls:
+            result[u] = False
+
+    return result
+
 def get_keywords_by_language(text: str):
     """
     Detect language using langdetect and translate keywords if needed.
@@ -49,7 +117,8 @@ def get_keywords_by_language(text: str):

 def boost_probabilities(probabilities: dict, text: str):
     """
-    Boost probabilities based on keyword matches and presence of URLs.
+    Boost probabilities based on keyword matches, presence of URLs,
+    and Google Safe Browsing checks.
     """
     lower_text = text.lower()
     smishing_keywords, other_scam_keywords, detected_lang = get_keywords_by_language(text)
@@ -60,7 +129,12 @@ def boost_probabilities(probabilities: dict, text: str):
     smishing_boost = 0.30 * smishing_count
     other_scam_boost = 0.30 * other_scam_count

-    found_urls = re.findall(r"(https?://[^\s]+|\b(?:[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk))\b)", lower_text)
+    # More robust URL pattern
+    found_urls = re.findall(
+        r"(https?://[^\s]+|\b[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk)\b)",
+        lower_text
+    )
+    # If any URL is found, add 0.35 to smishing
     if found_urls:
         smishing_boost += 0.35

@@ -72,12 +146,11 @@ def boost_probabilities(probabilities: dict, text: str):
     p_other_scam += other_scam_boost
     p_legit -= (smishing_boost + other_scam_boost)

-    # Clamp
+    # Preliminary clamp & normalization
     p_smishing = max(p_smishing, 0.0)
     p_other_scam = max(p_other_scam, 0.0)
     p_legit = max(p_legit, 0.0)

-    # Re-normalize
     total = p_smishing + p_other_scam + p_legit
     if total > 0:
         p_smishing /= total
@@ -86,6 +159,16 @@ def boost_probabilities(probabilities: dict, text: str):
     else:
         p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0

+    # **Now** check Safe Browsing. If any URL is malicious => p_smishing=1.0
+    if found_urls:
+        malicious_results = check_urls_with_google_safebrowsing(found_urls)
+        # If any malicious => set p_smishing=1.0
+        if any(malicious_results[u] for u in malicious_results):
+            # Bump SMiShing to max
+            p_smishing = 1.0
+            p_other_scam = 0.0
+            p_legit = 0.0
+
     return {
         "SMiShing": p_smishing,
         "Other Scam": p_other_scam,
@@ -119,7 +202,6 @@ def query_llm_for_classification(raw_message: str) -> dict:
     )
     raw_reply = response["choices"][0]["message"]["content"].strip()

-    import json
     llm_result = json.loads(raw_reply)
     if "label" not in llm_result or "reason" not in llm_result:
         return {"label": "Unknown", "reason": f"Unexpected format: {raw_reply}"}
@@ -176,17 +258,14 @@ def query_llm_for_explanation(
     Second LLM call: provides a holistic explanation of the final classification
     in the same language as detected_lang (English or Spanish).
     """
-    # Decide the language for final explanation
     if detected_lang == "es":
-        # Spanish
         system_prompt = (
             "Eres un experto en ciberseguridad. Proporciona una explicación final al usuario en español. "
             "Combina la clasificación local, la clasificación LLM y la etiqueta final en una sola explicación breve. "
             "No reveles el código interno ni el JSON bruto; simplemente da una breve explicación fácil de entender. "
-            "Termina con la etiqueta final. "
+            "Termina con la etiqueta final."
         )
     else:
-        # Default to English
         system_prompt = (
             "You are a cybersecurity expert providing a final explanation to the user in English. "
             "Combine the local classification, the LLM classification, and the final label "
@@ -259,7 +338,6 @@ def smishing_detector(input_type, text, image):
     boosted = boost_probabilities(original_probs, combined_text)
     detected_lang = boosted.pop("detected_lang", "en")

-    # Convert to float only
     for k in boosted:
         boosted[k] = float(boosted[k])

@@ -274,7 +352,6 @@ def smishing_detector(input_type, text, image):
     # 4. Incorporate LLM's label into final probabilities
     boosted = incorporate_llm_label(boosted, llm_label)

-    # Now we have updated probabilities
     final_label = max(boosted, key=boosted.get)
     final_confidence = round(boosted[final_label], 3)

@@ -282,11 +359,14 @@ def smishing_detector(input_type, text, image):
     lower_text = combined_text.lower()
     smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)

-    found_urls = re.findall(r"(https?://[^\s]+|\b(?:[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk))\b)", lower_text)
+    found_urls = re.findall(
+        r"(https?://[^\s]+|\b[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk)\b)",
+        lower_text
+    )
     found_smishing = [kw for kw in smishing_keys if kw in lower_text]
     found_other_scam = [kw for kw in scam_keys if kw in lower_text]

-    # 6. Final LLM explanation (in detected_lang)
+    # 6. Final explanation in user's language
     final_explanation = query_llm_for_explanation(
         text=combined_text,
         final_label=final_label,
@@ -333,7 +413,7 @@ def toggle_inputs(choice):
     return gr.update(visible=False), gr.update(visible=True)

 with gr.Blocks() as demo:
-    gr.Markdown("## SMiShing & Scam Detector with LLM-Enhanced Logic (Multilingual Explanation)")
+    gr.Markdown("## SMiShing & Scam Detector with Google Safe Browsing + LLM")

     with gr.Row():
         input_type = gr.Radio(
@@ -377,4 +457,6 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
     if not openai.api_key:
         print("WARNING: OPENAI_API_KEY not set. LLM calls will fail or be skipped.")
+    if not SAFE_BROWSING_API_KEY:
+        print("WARNING: GOOGLE_SAFE_BROWSING_API_KEY not set. URL checks will be skipped.")
     demo.launch()
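
A quick way to sanity-check the new Safe Browsing path is to call the helper directly, outside Gradio. The following is a minimal sketch, not part of the commit: it assumes GOOGLE_SAFE_BROWSING_API_KEY is exported and that app.py is importable from the working directory (importing it will also load the zero-shot model, so it is slow). The first URL is Google's public Safe Browsing test page, used only to illustrate the expected {url: bool} result shape.

# Hypothetical smoke test for check_urls_with_google_safebrowsing() (not in the repo).
from app import check_urls_with_google_safebrowsing

urls = [
    "http://testsafebrowsing.appspot.com/s/phishing.html",  # Google's Safe Browsing test page
    "https://example.com",                                   # expected to come back clean
]
for url, is_malicious in check_urls_with_google_safebrowsing(urls).items():
    print(f"{url} -> {'MALICIOUS' if is_malicious else 'clean'}")
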
app.py.bestoftues CHANGED
@@ -8,7 +8,7 @@ from deep_translator import GoogleTranslator
 import openai
 import os

-# Set up your OpenAI API key
+# Set your OpenAI API key
 openai.api_key = os.getenv("OPENAI_API_KEY")

 # Translator instance
@@ -60,7 +60,7 @@ def boost_probabilities(probabilities: dict, text: str):
     smishing_boost = 0.30 * smishing_count
     other_scam_boost = 0.30 * other_scam_count

-    found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
+    found_urls = re.findall(r"(https?://[^\s]+|\b(?:[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk))\b)", lower_text)
     if found_urls:
         smishing_boost += 0.35

@@ -120,7 +120,6 @@ def query_llm_for_classification(raw_message: str) -> dict:
     raw_reply = response["choices"][0]["message"]["content"].strip()

     import json
-    # Expect something like {"label": "...", "reason": "..."}
     llm_result = json.loads(raw_reply)
     if "label" not in llm_result or "reason" not in llm_result:
         return {"label": "Unknown", "reason": f"Unexpected format: {raw_reply}"}
@@ -170,19 +169,31 @@ def query_llm_for_explanation(
     llm_reason: str,
     found_smishing: list,
     found_other_scam: list,
-    found_urls: list
+    found_urls: list,
+    detected_lang: str
 ) -> str:
     """
-    Second LLM call: provides a holistic explanation of the final classification.
-    We include the local classification info, the LLM's own classification, and
-    relevant details (keywords, URLs).
+    Second LLM call: provides a holistic explanation of the final classification
+    in the same language as detected_lang (English or Spanish).
     """
-    system_prompt = (
-        "You are a cybersecurity expert providing a final explanation to the user. "
-        "Combine the local classification, the LLM classification, and the final label "
-        "into one concise explanation. Do not reveal internal code or raw JSON; just give "
-        "a short, user-friendly explanation. End with a final statement of the final label."
-    )
+    # Decide the language for final explanation
+    if detected_lang == "es":
+        # Spanish
+        system_prompt = (
+            "Eres un experto en ciberseguridad. Proporciona una explicación final al usuario en español. "
+            "Combina la clasificación local, la clasificación LLM y la etiqueta final en una sola explicación breve. "
+            "No reveles el código interno ni el JSON bruto; simplemente da una breve explicación fácil de entender. "
+            "Termina con la etiqueta final. "
+        )
+    else:
+        # Default to English
+        system_prompt = (
+            "You are a cybersecurity expert providing a final explanation to the user in English. "
+            "Combine the local classification, the LLM classification, and the final label "
+            "into one concise explanation. Do not reveal internal code or raw JSON. "
+            "End with a final statement of the final label."
+        )
+
     user_context = f"""
 User Message:
 {text}
@@ -195,7 +206,7 @@ Suspicious SMiShing Keywords => {found_smishing}
 Suspicious Other Scam Keywords => {found_other_scam}
 URLs => {found_urls}
 """
-    # The LLM can combine these facts into a short paragraph.
+
     try:
         response = openai.ChatCompletion.create(
             model="gpt-3.5-turbo",
@@ -215,7 +226,7 @@ def smishing_detector(input_type, text, image):
     Main detection function combining text (if 'Text') & OCR (if 'Screenshot'),
     plus two LLM calls:
       1) classification to adjust final probabilities,
-      2) a final explanation summarizing the outcome.
+      2) a final explanation summarizing the outcome in the detected language.
     """
     if input_type == "Text":
         combined_text = text.strip() if text else ""
@@ -271,11 +282,11 @@ def smishing_detector(input_type, text, image):
     lower_text = combined_text.lower()
     smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)

-    found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
+    found_urls = re.findall(r"(https?://[^\s]+|\b(?:[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk))\b)", lower_text)
     found_smishing = [kw for kw in smishing_keys if kw in lower_text]
     found_other_scam = [kw for kw in scam_keys if kw in lower_text]

-    # 6. Final LLM explanation
+    # 6. Final LLM explanation (in detected_lang)
     final_explanation = query_llm_for_explanation(
         text=combined_text,
         final_label=final_label,
@@ -286,7 +297,8 @@ def smishing_detector(input_type, text, image):
         llm_reason=llm_reason,
         found_smishing=found_smishing,
         found_other_scam=found_other_scam,
-        found_urls=found_urls
+        found_urls=found_urls,
+        detected_lang=detected_lang
     )

     return {
@@ -321,7 +333,7 @@ def toggle_inputs(choice):
     return gr.update(visible=False), gr.update(visible=True)

 with gr.Blocks() as demo:
-    gr.Markdown("## SMiShing & Scam Detector with LLM-Enhanced Logic")
+    gr.Markdown("## SMiShing & Scam Detector with LLM-Enhanced Logic (Multilingual Explanation)")

     with gr.Row():
         input_type = gr.Radio(
@@ -363,7 +375,6 @@ with gr.Blocks() as demo:
     )

 if __name__ == "__main__":
-    # Warn if openai.api_key not set
     if not openai.api_key:
         print("WARNING: OPENAI_API_KEY not set. LLM calls will fail or be skipped.")
     demo.launch()
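
For reference, the scoring this commit changes can be replayed without loading the model. The sketch below is not part of either file: the 0.30 keyword weight and 0.35 URL weight come from boost_probabilities() above, while the base probabilities are invented for illustration. The Safe Browsing override at the end exists only in the new app.py, not in app.py.bestoftues.

# Sketch of the boost arithmetic in boost_probabilities(), with made-up base probabilities.
base = {"SMiShing": 0.20, "Other Scam": 0.30, "Legitimate": 0.50}

smishing_boost = 0.30 * 1      # one SMiShing keyword matched
smishing_boost += 0.35         # at least one URL found in the message
other_scam_boost = 0.30 * 0    # no "Other Scam" keywords

p_smishing = max(base["SMiShing"] + smishing_boost, 0.0)                      # 0.85
p_other_scam = max(base["Other Scam"] + other_scam_boost, 0.0)                # 0.30
p_legit = max(base["Legitimate"] - (smishing_boost + other_scam_boost), 0.0)  # 0.00

total = p_smishing + p_other_scam + p_legit
probs = {"SMiShing": p_smishing / total,
         "Other Scam": p_other_scam / total,
         "Legitimate": p_legit / total}
print(probs)  # roughly {'SMiShing': 0.74, 'Other Scam': 0.26, 'Legitimate': 0.0}

# New in this commit (app.py only): a Google Safe Browsing match overrides the blend.
url_flagged = True  # pretend check_urls_with_google_safebrowsing() reported a match
if url_flagged:
    probs = {"SMiShing": 1.0, "Other Scam": 0.0, "Legitimate": 0.0}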