Spaces:

hackerbyhobby
/

SMS_scam_detection

Running

App Files Community

hackerbyhobby committed on Jan 29

Commit

535ff02

unverified ·

1 Parent(s): 07e1348

Updated to improve Google Safe Browsing scoring

Browse files

Files changed (1) hide show

app.py +27 -29

app.py CHANGED Viewed

@@ -159,12 +159,12 @@ def boost_probabilities(probabilities: dict, text: str):
     else:
         p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
-    # **Now** check Safe Browsing. If any URL is malicious => p_smishing=1.0
     if found_urls:
-        malicious_results = check_urls_with_google_safebrowsing(found_urls)
         # If any malicious => set p_smishing=1.0
-        if any(malicious_results[u] for u in malicious_results):
-            # Bump SMiShing to max
             p_smishing = 1.0
             p_other_scam = 0.0
             p_legit = 0.0
@@ -173,7 +173,9 @@ def boost_probabilities(probabilities: dict, text: str):
         "SMiShing": p_smishing,
         "Other Scam": p_other_scam,
         "Legitimate": p_legit,
-        "detected_lang": detected_lang
     }
 def query_llm_for_classification(raw_message: str) -> dict:
@@ -334,15 +336,19 @@ def smishing_detector(input_type, text, image):
     )
     original_probs = {k: float(v) for k, v in zip(local_result["labels"], local_result["scores"])}
-    # 2. Basic boosting from keywords & URLs
-    boosted = boost_probabilities(original_probs, combined_text)
-    detected_lang = boosted.pop("detected_lang", "en")
-    for k in boosted:
-        boosted[k] = float(boosted[k])
-    local_label = max(boosted, key=boosted.get)
-    local_conf = round(boosted[local_label], 3)
     # 3. LLM Classification
     llm_classification = query_llm_for_classification(combined_text)
@@ -350,19 +356,15 @@ def smishing_detector(input_type, text, image):
     llm_reason = llm_classification.get("reason", "No reason provided")
     # 4. Incorporate LLM’s label into final probabilities
-    boosted = incorporate_llm_label(boosted, llm_label)
-    final_label = max(boosted, key=boosted.get)
-    final_confidence = round(boosted[final_label], 3)
-    # 5. Gather found keywords & URLs
     lower_text = combined_text.lower()
     smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
-    found_urls = re.findall(
-        r"(https?://[^\s]+|\b[a-zA-Z0-9.-]+\.(?:com|net|org|edu|gov|mil|io|ai|co|info|biz|us|uk|de|fr|es|ru|jp|cn|in|au|ca|br|mx|it|nl|se|no|fi|ch|pl|kr|vn|id|tw|sg|hk)\b)",
-        lower_text
-    )
     found_smishing = [kw for kw in smishing_keys if kw in lower_text]
     found_other_scam = [kw for kw in scam_keys if kw in lower_text]
@@ -381,6 +383,7 @@ def smishing_detector(input_type, text, image):
         detected_lang=detected_lang
     )
     return {
         "detected_language": detected_lang,
         "text_used_for_classification": combined_text,
@@ -388,12 +391,13 @@ def smishing_detector(input_type, text, image):
         "boosted_probabilities_before_llm": {local_label: local_conf},
         "llm_label": llm_label,
         "llm_reason": llm_reason,
-        "boosted_probabilities_after_llm": {k: round(v, 3) for k, v in boosted.items()},
         "label": final_label,
         "confidence": final_confidence,
         "smishing_keywords_found": found_smishing,
         "other_scam_keywords_found": found_other_scam,
         "urls_found": found_urls,
         "final_explanation": final_explanation,
     }
@@ -405,11 +409,8 @@ def toggle_inputs(choice):
     Return updates for (text_input, image_input) based on the radio selection.
     """
     if choice == "Text":
-        # Show text input, hide image
         return gr.update(visible=True), gr.update(visible=False)
     else:
-        # choice == "Screenshot"
-        # Hide text input, show image
         return gr.update(visible=False), gr.update(visible=True)
 with gr.Blocks() as demo:
@@ -426,16 +427,15 @@ with gr.Blocks() as demo:
         lines=3,
         label="Paste Suspicious SMS Text",
         placeholder="Type or paste the message here...",
-        visible=True  # default
     )
     image_input = gr.Image(
         type="pil",
         label="Upload Screenshot",
-        visible=False  # hidden by default
     )
-    # Whenever input_type changes, toggle which input is visible
     input_type.change(
         fn=toggle_inputs,
         inputs=input_type,
@@ -443,11 +443,9 @@ with gr.Blocks() as demo:
         queue=False
     )
-    # Button to run classification
     analyze_btn = gr.Button("Classify")
     output_json = gr.JSON(label="Result")
-    # On button click, call the smishing_detector
     analyze_btn.click(
         fn=smishing_detector,
         inputs=[input_type, text_input, image_input],

     else:
         p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
+    # **Now** check Safe Browsing
+    sb_results = {}
     if found_urls:
+        sb_results = check_urls_with_google_safebrowsing(found_urls)
         # If any malicious => set p_smishing=1.0
+        if any(sb_results[u] for u in sb_results):
             p_smishing = 1.0
             p_other_scam = 0.0
             p_legit = 0.0
         "SMiShing": p_smishing,
         "Other Scam": p_other_scam,
         "Legitimate": p_legit,
+        "detected_lang": detected_lang,
+        "found_urls": found_urls,
+        "safe_browsing_results": sb_results  # <--- show which are malicious
     }
 def query_llm_for_classification(raw_message: str) -> dict:
     )
     original_probs = {k: float(v) for k, v in zip(local_result["labels"], local_result["scores"])}
+    # 2. Basic boosting from keywords & URLs (plus Safe Browsing)
+    boosted_dict = boost_probabilities(original_probs, combined_text)
+    detected_lang = boosted_dict.pop("detected_lang", "en")
+    # Also retrieve the safe_browsing_results
+    sb_results = boosted_dict.pop("safe_browsing_results", {})
+    found_urls = boosted_dict.pop("found_urls", [])
+    for k in boosted_dict:
+        boosted_dict[k] = float(boosted_dict[k])
+    local_label = max(boosted_dict, key=boosted_dict.get)
+    local_conf = round(boosted_dict[local_label], 3)
     # 3. LLM Classification
     llm_classification = query_llm_for_classification(combined_text)
     llm_reason = llm_classification.get("reason", "No reason provided")
     # 4. Incorporate LLM’s label into final probabilities
+    boosted_dict = incorporate_llm_label(boosted_dict, llm_label)
+    final_label = max(boosted_dict, key=boosted_dict.get)
+    final_confidence = round(boosted_dict[final_label], 3)
+    # 5. Gather found keywords & URLs for the final explanation
     lower_text = combined_text.lower()
     smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
     found_smishing = [kw for kw in smishing_keys if kw in lower_text]
     found_other_scam = [kw for kw in scam_keys if kw in lower_text]
         detected_lang=detected_lang
     )
+    # Combine results in final output
     return {
         "detected_language": detected_lang,
         "text_used_for_classification": combined_text,
         "boosted_probabilities_before_llm": {local_label: local_conf},
         "llm_label": llm_label,
         "llm_reason": llm_reason,
+        "boosted_probabilities_after_llm": {k: round(v, 3) for k, v in boosted_dict.items()},
         "label": final_label,
         "confidence": final_confidence,
         "smishing_keywords_found": found_smishing,
         "other_scam_keywords_found": found_other_scam,
         "urls_found": found_urls,
+        "safe_browsing_results": sb_results,  # <--- show if malicious
         "final_explanation": final_explanation,
     }
     Return updates for (text_input, image_input) based on the radio selection.
     """
     if choice == "Text":
         return gr.update(visible=True), gr.update(visible=False)
     else:
         return gr.update(visible=False), gr.update(visible=True)
 with gr.Blocks() as demo:
         lines=3,
         label="Paste Suspicious SMS Text",
         placeholder="Type or paste the message here...",
+        visible=True
     )
     image_input = gr.Image(
         type="pil",
         label="Upload Screenshot",
+        visible=False
     )
     input_type.change(
         fn=toggle_inputs,
         inputs=input_type,
         queue=False
     )
     analyze_btn = gr.Button("Classify")
     output_json = gr.JSON(label="Result")
     analyze_btn.click(
         fn=smishing_detector,
         inputs=[input_type, text_input, image_input],