hackerbyhobby committed on
Commit
7169f21
·
unverified ·
1 Parent(s): d5030c9

debugging safebrowsing

Browse files
Files changed (1) hide show
  1. app.py +225 -13
app.py CHANGED
@@ -1,15 +1,43 @@
 
 
 
 
 
 
 
 
 
1
  import requests
2
  import json
3
- import os
4
 
 
 
 
 
5
  SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
6
  SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def check_urls_with_google_safebrowsing(urls):
9
  """
10
- Debug-enabled version:
11
- - Prints payload and raw response to help troubleshoot Safe Browsing issues.
12
  Returns a dict {url: bool is_malicious}.
 
13
  """
14
  result = {}
15
  if not SAFE_BROWSING_API_KEY:
@@ -18,9 +46,7 @@ def check_urls_with_google_safebrowsing(urls):
18
  result[u] = False
19
  return result
20
 
21
- # Build threatEntries for each URL
22
  threat_entries = [{"url": u} for u in urls]
23
-
24
  payload = {
25
  "client": {
26
  "clientId": "my-smishing-detector",
@@ -40,10 +66,10 @@ def check_urls_with_google_safebrowsing(urls):
40
  }
41
 
42
  print("---- Safe Browsing Debug ----")
 
 
43
  print("REQUEST Payload (JSON):")
44
  print(json.dumps(payload, indent=2))
45
- print("REQUEST Endpoint:", SAFE_BROWSING_URL, "Key:", SAFE_BROWSING_API_KEY)
46
- print("URLs being checked:", urls)
47
 
48
  try:
49
  resp = requests.post(
@@ -52,17 +78,15 @@ def check_urls_with_google_safebrowsing(urls):
52
  json=payload,
53
  timeout=10
54
  )
55
-
56
  print("RESPONSE Status Code:", resp.status_code)
57
  try:
58
  data = resp.json()
59
  print("RESPONSE JSON:")
60
  print(json.dumps(data, indent=2))
61
- except Exception as parse_error:
62
- print("Error parsing response as JSON:", parse_error)
63
  data = {}
64
 
65
- # If "matches" is present, some URL is flagged
66
  malicious_urls = set()
67
  if "matches" in data:
68
  for match in data["matches"]:
@@ -75,11 +99,199 @@ def check_urls_with_google_safebrowsing(urls):
75
 
76
  except Exception as e:
77
  print(f"Error contacting Safe Browsing API: {e}")
78
- # default: everything safe if error
79
  for u in urls:
80
  result[u] = False
81
 
82
  print("RESULTS (url -> malicious):", result)
83
  print("---- End Debug ----\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pytesseract
3
+ from PIL import Image
4
+ from transformers import pipeline
5
+ import re
6
+ from langdetect import detect
7
+ from deep_translator import GoogleTranslator
8
+ import openai
9
+ import os
10
  import requests
11
  import json
 
12
 
13
# OpenAI API key, read from the OPENAI_API_KEY environment variable
# (os.getenv yields None when the variable is unset).
# NOTE(review): no OpenAI call is visible in this part of the file - confirm
# the key is still needed.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Google Safe Browsing API key (from the environment) and the endpoint that
# check_urls_with_google_safebrowsing posts its lookup request to.
SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

# Shared translator instance: auto-detected source language, Spanish target.
# Used to translate the English keyword lists when a message is in Spanish.
translator = GoogleTranslator(source="auto", target="es")

# 1. Load separate keywords for SMiShing and Other Scam (assumed in English).
#    One keyword per line; blank lines are skipped and entries are lowercased.
#    Both files are opened at import time, relative to the working directory.
with open("smishing_keywords.txt", "r", encoding="utf-8") as f:
    SMISHING_KEYWORDS = [line.strip().lower() for line in f if line.strip()]

with open("other_scam_keywords.txt", "r", encoding="utf-8") as f:
    OTHER_SCAM_KEYWORDS = [line.strip().lower() for line in f if line.strip()]

# 2. Zero-Shot Classification Pipeline, built once at import time.
model_name = "joeddav/xlm-roberta-large-xnli"
classifier = pipeline("zero-shot-classification", model=model_name)
# Candidate labels the classifier scores each message against.
CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
34
+
35
  def check_urls_with_google_safebrowsing(urls):
36
  """
37
+ Debug-enabled version of Google Safe Browsing check:
38
+ - Prints payload and response to help troubleshoot issues.
39
  Returns a dict {url: bool is_malicious}.
40
+ If the API key is missing or error occurs, returns {url: False}.
41
  """
42
  result = {}
43
  if not SAFE_BROWSING_API_KEY:
 
46
  result[u] = False
47
  return result
48
 
 
49
  threat_entries = [{"url": u} for u in urls]
 
50
  payload = {
51
  "client": {
52
  "clientId": "my-smishing-detector",
 
66
  }
67
 
68
  print("---- Safe Browsing Debug ----")
69
+ print("REQUEST Endpoint:", SAFE_BROWSING_URL)
70
+ print("API Key:", SAFE_BROWSING_API_KEY)
71
  print("REQUEST Payload (JSON):")
72
  print(json.dumps(payload, indent=2))
 
 
73
 
74
  try:
75
  resp = requests.post(
 
78
  json=payload,
79
  timeout=10
80
  )
 
81
  print("RESPONSE Status Code:", resp.status_code)
82
  try:
83
  data = resp.json()
84
  print("RESPONSE JSON:")
85
  print(json.dumps(data, indent=2))
86
+ except Exception as parse_err:
87
+ print("Error parsing response as JSON:", parse_err)
88
  data = {}
89
 
 
90
  malicious_urls = set()
91
  if "matches" in data:
92
  for match in data["matches"]:
 
99
 
100
  except Exception as e:
101
  print(f"Error contacting Safe Browsing API: {e}")
 
102
  for u in urls:
103
  result[u] = False
104
 
105
  print("RESULTS (url -> malicious):", result)
106
  print("---- End Debug ----\n")
107
+ return result
108
+
109
# Spanish translations already obtained, keyed by the English keyword, so the
# translation service is contacted at most once per keyword per process.
_SPANISH_KEYWORD_CACHE = {}


def _translate_keywords_to_spanish(keywords):
    """Return *keywords* translated via the module-level ``translator``, lowercased.

    Successful translations are cached. If a translation raises or comes back
    empty, the original keyword is used instead so one bad keyword cannot
    abort the whole classification; failures are not cached, so they are
    retried on the next call.
    """
    translated = []
    for kw in keywords:
        cached = _SPANISH_KEYWORD_CACHE.get(kw)
        if cached is None:
            try:
                cached = (translator.translate(kw) or kw).lower()
            except Exception as err:  # best-effort: keep the English keyword
                print(f"Keyword translation failed for {kw!r}: {err}")
                translated.append(kw)
                continue
            _SPANISH_KEYWORD_CACHE[kw] = cached
        translated.append(cached)
    return translated


def get_keywords_by_language(text: str):
    """Pick the keyword lists that match the language of *text*.

    The language is detected with ``langdetect.detect`` on the first 200
    characters of *text*; if detection raises, English is assumed.

    Args:
        text: The message being analysed.

    Returns:
        A tuple ``(smishing_keywords, other_scam_keywords, lang)``. When the
        detected language is ``"es"`` the two lists are Spanish translations
        of ``SMISHING_KEYWORDS`` / ``OTHER_SCAM_KEYWORDS`` and ``lang`` is
        ``"es"``. For every other detected language the untranslated lists
        are returned and ``lang`` is ``"en"``.
    """
    snippet = text[:200]
    try:
        detected_lang = detect(snippet)
    except Exception:
        # Detection failed on this snippet; fall back to English.
        detected_lang = "en"

    if detected_lang == "es":
        return (
            _translate_keywords_to_spanish(SMISHING_KEYWORDS),
            _translate_keywords_to_spanish(OTHER_SCAM_KEYWORDS),
            "es",
        )
    return SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en"
129
+
130
# Matches explicit http(s) URLs, or bare domains ending in one of the listed
# TLDs. Case-insensitive so it can run on the original (non-lowercased) text.
_URL_PATTERN = re.compile(
    r"(https?://[^\s]+|\b[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk)\b)",
    re.IGNORECASE,
)

# Sentence punctuation that the greedy "https?://[^\s]+" branch can swallow
# at the end of a URL.
_URL_TRAILING_PUNCTUATION = ".,;:!?)]}>'\""


def _extract_urls(text):
    """Return the distinct URLs / bare domains found in *text*, in order.

    The original character case is preserved, because lowercasing a URL can
    change its path (e.g. short-link identifiers) and make a reputation
    lookup miss. Trailing sentence punctuation is stripped.
    """
    cleaned = (
        match.rstrip(_URL_TRAILING_PUNCTUATION)
        for match in _URL_PATTERN.findall(text)
    )
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(url for url in cleaned if url))


def boost_probabilities(probabilities: dict, text: str):
    """Adjust classifier probabilities using keywords, URLs and Safe Browsing.

    Steps:
      1. Count keyword hits (substring match against the lowercased text) and
         add 0.30 per hit to the matching class.
      2. Add 0.35 to "SMiShing" when the text contains at least one URL.
      3. Subtract the total boost from "Legitimate", clamp all three values at
         0.0 and normalise them to sum to 1 (falling back to fully
         "Legitimate" when everything clamps to zero).
      4. Check the extracted URLs with ``check_urls_with_google_safebrowsing``;
         if any is flagged, force "SMiShing" to 1.0 and the others to 0.0.

    Args:
        probabilities: Mapping with "SMiShing", "Other Scam" and "Legitimate"
            scores. Missing entries default to 0.0, 0.0 and 1.0 respectively.
        text: The message being analysed.

    Returns:
        A dict with the three adjusted probabilities plus ``"detected_lang"``,
        ``"found_urls"`` (de-duplicated, original case) and
        ``"safe_browsing_results"`` (empty when no URL was found).
    """
    lower_text = text.lower()
    smishing_keywords, other_scam_keywords, detected_lang = get_keywords_by_language(text)

    smishing_count = sum(1 for kw in smishing_keywords if kw in lower_text)
    other_scam_count = sum(1 for kw in other_scam_keywords if kw in lower_text)

    smishing_boost = 0.30 * smishing_count
    other_scam_boost = 0.30 * other_scam_count

    # Extract from the original text, not lower_text, so that the URLs sent
    # to Safe Browsing keep their case-sensitive paths.
    found_urls = _extract_urls(text)
    if found_urls:
        smishing_boost += 0.35

    p_smishing = probabilities.get("SMiShing", 0.0)
    p_other_scam = probabilities.get("Other Scam", 0.0)
    p_legit = probabilities.get("Legitimate", 1.0)

    p_smishing += smishing_boost
    p_other_scam += other_scam_boost
    p_legit -= (smishing_boost + other_scam_boost)

    # Preliminary clamp & normalization
    p_smishing = max(p_smishing, 0.0)
    p_other_scam = max(p_other_scam, 0.0)
    p_legit = max(p_legit, 0.0)

    total = p_smishing + p_other_scam + p_legit
    if total > 0:
        p_smishing /= total
        p_other_scam /= total
        p_legit /= total
    else:
        p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0

    # Safe Browsing runs last so that a flagged URL overrides the
    # keyword-based scores entirely.
    sb_results = {}
    if found_urls:
        sb_results = check_urls_with_google_safebrowsing(found_urls)
        if any(sb_results.values()):
            p_smishing = 1.0
            p_other_scam = 0.0
            p_legit = 0.0

    return {
        "SMiShing": p_smishing,
        "Other Scam": p_other_scam,
        "Legitimate": p_legit,
        "detected_lang": detected_lang,
        "found_urls": found_urls,
        "safe_browsing_results": sb_results
    }
191
+
192
def smishing_detector(input_type, text, image):
    """Classify a message supplied as plain text or as a screenshot.

    For ``input_type == "Text"`` the stripped *text* is analysed; for any
    other input type the text is OCR-extracted from *image* with pytesseract
    (Spanish + English). The text is scored by the zero-shot ``classifier``
    and the scores are then adjusted by ``boost_probabilities``.

    Returns:
        A dict for the JSON output. When there is no text to analyse it holds
        a "No text provided" label with confidence 0.0; otherwise it holds
        the detected language, the analysed text, the original and boosted
        probabilities (rounded to 3 decimals), the winning label with its
        confidence, the URLs found and the Safe Browsing results.
    """
    if input_type == "Text":
        message = (text or "").strip()
    elif image is not None:
        message = pytesseract.image_to_string(image, lang="spa+eng").strip()
    else:
        message = ""

    # Guard clause: nothing to classify.
    if not message:
        return {
            "text_used_for_classification": "(none)",
            "label": "No text provided",
            "confidence": 0.0,
            "keywords_found": [],
            "urls_found": [],
            "safe_browsing_results": {},
        }

    # 1. Local zero-shot classification
    zero_shot = classifier(
        sequences=message,
        candidate_labels=CANDIDATE_LABELS,
        hypothesis_template="This message is {}."
    )
    original_probs = {
        label: float(score)
        for label, score in zip(zero_shot["labels"], zero_shot["scores"])
    }

    # 2. Boost with keywords, URLs, and Safe Browsing checks
    boosted = boost_probabilities(original_probs, message)
    language = boosted.pop("detected_lang", "en")
    safe_browsing = boosted.pop("safe_browsing_results", {})
    urls = boosted.pop("found_urls", [])

    # Only the per-label scores remain after the metadata has been popped.
    scores = {label: float(value) for label, value in boosted.items()}

    winner = max(scores, key=scores.get)
    winner_confidence = round(scores[winner], 3)

    return {
        "detected_language": language,
        "text_used_for_classification": message,
        "original_probabilities": {k: round(v, 3) for k, v in original_probs.items()},
        "boosted_probabilities": {k: round(v, 3) for k, v in scores.items()},
        "label": winner,
        "confidence": winner_confidence,
        "urls_found": urls,
        "safe_browsing_results": safe_browsing
    }
244
+
245
+ #
246
+ # Gradio interface with dynamic visibility
247
+ #
248
def toggle_inputs(choice):
    """Return visibility updates for the (text, image) input pair.

    The first update is visible only when *choice* is "Text"; the second is
    visible for every other choice.
    """
    show_text = choice == "Text"
    return gr.update(visible=show_text), gr.update(visible=not show_text)
253
+
254
# Gradio UI: an input-type selector, a text box and an image upload (only one
# of which is visible at a time), a "Classify" button and a JSON result pane.
with gr.Blocks() as demo:
    gr.Markdown("## SMiShing & Scam Detector with Debug-Enabled Safe Browsing")

    # NOTE(review): the source formatting was flattened; the Row is assumed to
    # contain only the radio selector - confirm against the running app.
    with gr.Row():
        input_type = gr.Radio(
            choices=["Text", "Screenshot"],
            value="Text",
            label="Choose Input Type"
        )

    # Shown initially, matching the radio's default value "Text".
    text_input = gr.Textbox(
        lines=3,
        label="Paste Suspicious SMS Text",
        placeholder="Type or paste the message here...",
        visible=True
    )
    # Hidden until "Screenshot" is selected.
    image_input = gr.Image(
        type="pil",
        label="Upload Screenshot",
        visible=False
    )

    # Swap which input is visible whenever the radio selection changes;
    # toggle_inputs returns one update per component listed in `outputs`.
    input_type.change(
        fn=toggle_inputs,
        inputs=input_type,
        outputs=[text_input, image_input],
        queue=False
    )

    analyze_btn = gr.Button("Classify")
    output_json = gr.JSON(label="Result")

    # Run the detector on click; its returned dict is displayed in the JSON pane.
    analyze_btn.click(
        fn=smishing_detector,
        inputs=[input_type, text_input, image_input],
        outputs=output_json
    )
291
+
292
if __name__ == "__main__":
    # Warn about each missing credential before starting the UI.
    startup_checks = (
        (openai.api_key, "WARNING: OPENAI_API_KEY not set. LLM calls may fail."),
        (SAFE_BROWSING_API_KEY, "WARNING: GOOGLE_SAFE_BROWSING_API_KEY not set. All URLs returned as safe."),
    )
    for configured_value, warning in startup_checks:
        if not configured_value:
            print(warning)
    demo.launch()