hackerbyhobby
committed: updated to improve Google Safe Browsing scoring

Files changed:
- app.py +96 -14
- app.py.bestoftues +31 -20
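In short, boost_probabilities in app.py now folds a Google Safe Browsing lookup into the keyword/URL scoring. A minimal standalone sketch of the new flow (the helper name and example numbers are illustrative; they mirror the diff below rather than the app's exact code):

def sketch_boost(probs, smishing_hits, scam_hits, found_urls, any_url_malicious):
    # Start from the zero-shot classifier's probabilities.
    p_smish, p_scam, p_legit = probs["SMiShing"], probs["Other Scam"], probs["Legitimate"]
    smish_boost = 0.30 * smishing_hits            # 0.30 per matched SMiShing keyword
    scam_boost = 0.30 * scam_hits                 # 0.30 per matched scam keyword
    if found_urls:
        smish_boost += 0.35                       # any URL at all nudges toward SMiShing
    p_smish += smish_boost
    p_scam += scam_boost
    p_legit -= (smish_boost + scam_boost)
    # Clamp negatives and re-normalize so the three values sum to 1.
    p_smish, p_scam, p_legit = max(p_smish, 0.0), max(p_scam, 0.0), max(p_legit, 0.0)
    total = p_smish + p_scam + p_legit
    if total > 0:
        p_smish, p_scam, p_legit = p_smish / total, p_scam / total, p_legit / total
    # New in this commit: a Safe Browsing hit overrides everything.
    if any_url_malicious:
        p_smish, p_scam, p_legit = 1.0, 0.0, 0.0
    return {"SMiShing": p_smish, "Other Scam": p_scam, "Legitimate": p_legit}

# Example: one keyword hit plus a flagged URL forces the SMiShing label.
# sketch_boost({"SMiShing": 0.2, "Other Scam": 0.2, "Legitimate": 0.6},
#              smishing_hits=1, scam_hits=0, found_urls=["bit.ly/x"], any_url_malicious=True)
# -> {"SMiShing": 1.0, "Other Scam": 0.0, "Legitimate": 0.0}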
app.py
CHANGED
@@ -7,10 +7,16 @@ from langdetect import detect
 from deep_translator import GoogleTranslator
 import openai
 import os
+import requests
+import json

 # Set your OpenAI API key
 openai.api_key = os.getenv("OPENAI_API_KEY")

+# Retrieve Google Safe Browsing API key from environment
+SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
+SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
+
 # Translator instance
 translator = GoogleTranslator(source="auto", target="es")

@@ -26,6 +32,68 @@ model_name = "joeddav/xlm-roberta-large-xnli"
 classifier = pipeline("zero-shot-classification", model=model_name)
 CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]

+def check_urls_with_google_safebrowsing(urls):
+    """
+    Queries the Google Safe Browsing API to see if any URLs are malicious.
+    Returns a dict {url: bool is_malicious}.
+    If the API key is missing or an error occurs, returns {url: False} for all.
+    """
+    result = {}
+    if not SAFE_BROWSING_API_KEY:
+        # No key: we can't check. Return all as safe.
+        for u in urls:
+            result[u] = False
+        return result
+
+    # Build threatEntries for each URL
+    threat_entries = [{"url": u} for u in urls]
+
+    payload = {
+        "client": {
+            "clientId": "my-smishing-detector",
+            "clientVersion": "1.0"
+        },
+        "threatInfo": {
+            "threatTypes": [
+                "MALWARE",
+                "SOCIAL_ENGINEERING",
+                "UNWANTED_SOFTWARE",
+                "POTENTIALLY_HARMFUL_APPLICATION"
+            ],
+            "platformTypes": ["ANY_PLATFORM"],
+            "threatEntryTypes": ["URL"],
+            "threatEntries": threat_entries
+        }
+    }
+
+    try:
+        resp = requests.post(
+            SAFE_BROWSING_URL,
+            params={"key": SAFE_BROWSING_API_KEY},
+            json=payload,
+            timeout=10
+        )
+        data = resp.json()
+        # If "matches" is present, some URL is flagged.
+        # Each match has "threat": {"url": "..."}
+        malicious_urls = set()
+        if "matches" in data:
+            for match in data["matches"]:
+                threat_url = match.get("threat", {}).get("url")
+                if threat_url:
+                    malicious_urls.add(threat_url)
+
+        for u in urls:
+            result[u] = (u in malicious_urls)
+
+    except Exception as e:
+        print(f"Error contacting Safe Browsing API: {e}")
+        # default: everything safe if error
+        for u in urls:
+            result[u] = False
+
+    return result
+
 def get_keywords_by_language(text: str):
     """
     Detect language using langdetect and translate keywords if needed.
@@ -49,7 +117,8 @@ def get_keywords_by_language(text: str):

 def boost_probabilities(probabilities: dict, text: str):
     """
-    Boost probabilities based on keyword matches
+    Boost probabilities based on keyword matches, presence of URLs,
+    and Google Safe Browsing checks.
     """
     lower_text = text.lower()
     smishing_keywords, other_scam_keywords, detected_lang = get_keywords_by_language(text)
@@ -60,7 +129,12 @@ def boost_probabilities(probabilities: dict, text: str):
     smishing_boost = 0.30 * smishing_count
     other_scam_boost = 0.30 * other_scam_count

-
+    # More robust URL pattern
+    found_urls = re.findall(
+        r"(https?://[^\s]+|\b[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk)\b)",
+        lower_text
+    )
+    # If any URL is found, add 0.35 to smishing
     if found_urls:
         smishing_boost += 0.35

@@ -72,12 +146,11 @@
     p_other_scam += other_scam_boost
     p_legit -= (smishing_boost + other_scam_boost)

-    #
+    # Preliminary clamp & normalization
     p_smishing = max(p_smishing, 0.0)
     p_other_scam = max(p_other_scam, 0.0)
     p_legit = max(p_legit, 0.0)

-    # Re-normalize
     total = p_smishing + p_other_scam + p_legit
     if total > 0:
         p_smishing /= total
@@ -86,6 +159,16 @@
     else:
         p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0

+    # **Now** check Safe Browsing. If any URL is malicious => p_smishing=1.0
+    if found_urls:
+        malicious_results = check_urls_with_google_safebrowsing(found_urls)
+        # If any malicious => set p_smishing=1.0
+        if any(malicious_results[u] for u in malicious_results):
+            # Bump SMiShing to max
+            p_smishing = 1.0
+            p_other_scam = 0.0
+            p_legit = 0.0
+
     return {
         "SMiShing": p_smishing,
         "Other Scam": p_other_scam,
@@ -119,7 +202,6 @@ def query_llm_for_classification(raw_message: str) -> dict:
         )
         raw_reply = response["choices"][0]["message"]["content"].strip()

-        import json
         llm_result = json.loads(raw_reply)
         if "label" not in llm_result or "reason" not in llm_result:
             return {"label": "Unknown", "reason": f"Unexpected format: {raw_reply}"}
@@ -176,17 +258,14 @@ def query_llm_for_explanation(
     Second LLM call: provides a holistic explanation of the final classification
     in the same language as detected_lang (English or Spanish).
     """
-    # Decide the language for final explanation
     if detected_lang == "es":
-        # Spanish
         system_prompt = (
             "Eres un experto en ciberseguridad. Proporciona una explicación final al usuario en español. "
             "Combina la clasificación local, la clasificación LLM y la etiqueta final en una sola explicación breve. "
             "No reveles el código interno ni el JSON bruto; simplemente da una breve explicación fácil de entender. "
-            "Termina con la etiqueta final.
+            "Termina con la etiqueta final."
         )
     else:
-        # Default to English
         system_prompt = (
             "You are a cybersecurity expert providing a final explanation to the user in English. "
             "Combine the local classification, the LLM classification, and the final label "
@@ -259,7 +338,6 @@ def smishing_detector(input_type, text, image):
     boosted = boost_probabilities(original_probs, combined_text)
     detected_lang = boosted.pop("detected_lang", "en")

-    # Convert to float only
     for k in boosted:
         boosted[k] = float(boosted[k])

@@ -274,7 +352,6 @@
     # 4. Incorporate LLM's label into final probabilities
     boosted = incorporate_llm_label(boosted, llm_label)

-    # Now we have updated probabilities
     final_label = max(boosted, key=boosted.get)
     final_confidence = round(boosted[final_label], 3)

@@ -282,11 +359,14 @@
     lower_text = combined_text.lower()
     smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)

-    found_urls = re.findall(
+    found_urls = re.findall(
+        r"(https?://[^\s]+|\b[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk)\b)",
+        lower_text
+    )
     found_smishing = [kw for kw in smishing_keys if kw in lower_text]
     found_other_scam = [kw for kw in scam_keys if kw in lower_text]

-    # 6. Final
+    # 6. Final explanation in user's language
     final_explanation = query_llm_for_explanation(
         text=combined_text,
         final_label=final_label,
@@ -333,7 +413,7 @@ def toggle_inputs(choice):
     return gr.update(visible=False), gr.update(visible=True)

 with gr.Blocks() as demo:
-    gr.Markdown("## SMiShing & Scam Detector with
+    gr.Markdown("## SMiShing & Scam Detector with Google Safe Browsing + LLM")

     with gr.Row():
         input_type = gr.Radio(
@@ -377,4 +457,6 @@
 if __name__ == "__main__":
     if not openai.api_key:
         print("WARNING: OPENAI_API_KEY not set. LLM calls will fail or be skipped.")
+    if not SAFE_BROWSING_API_KEY:
+        print("WARNING: GOOGLE_SAFE_BROWSING_API_KEY not set. URL checks will be skipped.")
     demo.launch()
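For a quick local sanity check of the new helper, something like the following works; the API key value and the second URL are placeholders, and importing app will also load the zero-shot model:

import os
os.environ.setdefault("GOOGLE_SAFE_BROWSING_API_KEY", "YOUR_KEY_HERE")  # placeholder; set before importing app

from app import check_urls_with_google_safebrowsing  # run from the repo root

urls = ["https://example.com", "http://phishy-looking.example/login"]  # second URL is purely illustrative
print(check_urls_with_google_safebrowsing(urls))
# Expected shape: {"https://example.com": False, "http://phishy-looking.example/login": False}
# Any True value means Safe Browsing returned a match, which boost_probabilities
# now treats as SMiShing with probability 1.0.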
app.py.bestoftues
CHANGED
@@ -8,7 +8,7 @@ from deep_translator import GoogleTranslator
 import openai
 import os

-# Set
+# Set your OpenAI API key
 openai.api_key = os.getenv("OPENAI_API_KEY")

 # Translator instance
@@ -60,7 +60,7 @@ def boost_probabilities(probabilities: dict, text: str):
     smishing_boost = 0.30 * smishing_count
     other_scam_boost = 0.30 * other_scam_count

-    found_urls = re.findall(r"(https?://[^\s]
+    found_urls = re.findall(r"(https?://[^\s]+|\b(?:[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk))\b)", lower_text)
     if found_urls:
         smishing_boost += 0.35

@@ -120,7 +120,6 @@ def query_llm_for_classification(raw_message: str) -> dict:
         raw_reply = response["choices"][0]["message"]["content"].strip()

         import json
-        # Expect something like {"label": "...", "reason": "..."}
         llm_result = json.loads(raw_reply)
         if "label" not in llm_result or "reason" not in llm_result:
             return {"label": "Unknown", "reason": f"Unexpected format: {raw_reply}"}
@@ -170,19 +169,31 @@ def query_llm_for_explanation(
     llm_reason: str,
     found_smishing: list,
     found_other_scam: list,
-    found_urls: list
+    found_urls: list,
+    detected_lang: str
 ) -> str:
     """
-    Second LLM call: provides a holistic explanation of the final classification
-
-    relevant details (keywords, URLs).
+    Second LLM call: provides a holistic explanation of the final classification
+    in the same language as detected_lang (English or Spanish).
     """
-
-
-
-
-
-
+    # Decide the language for final explanation
+    if detected_lang == "es":
+        # Spanish
+        system_prompt = (
+            "Eres un experto en ciberseguridad. Proporciona una explicación final al usuario en español. "
+            "Combina la clasificación local, la clasificación LLM y la etiqueta final en una sola explicación breve. "
+            "No reveles el código interno ni el JSON bruto; simplemente da una breve explicación fácil de entender. "
+            "Termina con la etiqueta final. "
+        )
+    else:
+        # Default to English
+        system_prompt = (
+            "You are a cybersecurity expert providing a final explanation to the user in English. "
+            "Combine the local classification, the LLM classification, and the final label "
+            "into one concise explanation. Do not reveal internal code or raw JSON. "
+            "End with a final statement of the final label."
+        )
+
     user_context = f"""
 User Message:
 {text}
@@ -195,7 +206,7 @@ Suspicious SMiShing Keywords => {found_smishing}
 Suspicious Other Scam Keywords => {found_other_scam}
 URLs => {found_urls}
 """
-
+
     try:
         response = openai.ChatCompletion.create(
             model="gpt-3.5-turbo",
@@ -215,7 +226,7 @@ def smishing_detector(input_type, text, image):
     Main detection function combining text (if 'Text') & OCR (if 'Screenshot'),
     plus two LLM calls:
       1) classification to adjust final probabilities,
-      2) a final explanation summarizing the outcome.
+      2) a final explanation summarizing the outcome in the detected language.
     """
     if input_type == "Text":
         combined_text = text.strip() if text else ""
@@ -271,11 +282,11 @@
     lower_text = combined_text.lower()
     smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)

-    found_urls = re.findall(r"(https?://[^\s]
+    found_urls = re.findall(r"(https?://[^\s]+|\b(?:[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk))\b)", lower_text)
     found_smishing = [kw for kw in smishing_keys if kw in lower_text]
     found_other_scam = [kw for kw in scam_keys if kw in lower_text]

-    # 6. Final LLM explanation
+    # 6. Final LLM explanation (in detected_lang)
     final_explanation = query_llm_for_explanation(
         text=combined_text,
         final_label=final_label,
@@ -286,7 +297,8 @@
         llm_reason=llm_reason,
         found_smishing=found_smishing,
         found_other_scam=found_other_scam,
-        found_urls=found_urls
+        found_urls=found_urls,
+        detected_lang=detected_lang
     )

     return {
@@ -321,7 +333,7 @@ def toggle_inputs(choice):
     return gr.update(visible=False), gr.update(visible=True)

 with gr.Blocks() as demo:
-    gr.Markdown("## SMiShing & Scam Detector with LLM-Enhanced Logic")
+    gr.Markdown("## SMiShing & Scam Detector with LLM-Enhanced Logic (Multilingual Explanation)")

     with gr.Row():
         input_type = gr.Radio(
@@ -363,7 +375,6 @@
     )

 if __name__ == "__main__":
-    # Warn if openai.api_key not set
     if not openai.api_key:
         print("WARNING: OPENAI_API_KEY not set. LLM calls will fail or be skipped.")
     demo.launch()
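The detected_lang value that both files now thread into query_llm_for_explanation ultimately comes from langdetect (see get_keywords_by_language). A rough sketch of the prompt selection, with abbreviated prompt strings and a hypothetical helper name:

from langdetect import detect

def pick_explanation_prompt(message: str) -> str:
    # langdetect returns ISO codes such as "en" or "es"; anything that is not
    # Spanish falls back to the English prompt, mirroring the else branch above.
    if detect(message) == "es":
        return "Eres un experto en ciberseguridad. ... Termina con la etiqueta final."
    return "You are a cybersecurity expert providing a final explanation to the user in English. ..."

# pick_explanation_prompt("Su paquete está retenido, haga clic en el enlace")  -> Spanish prompt
# pick_explanation_prompt("Your package is on hold, click the link")           -> English prompt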