hackerbyhobby committed on
Commit
ee7ace1
·
unverified ·
1 Parent(s): c624cbc

updated requirements and added apt.txt

Browse files
Files changed (1) hide show
  1. app.py +96 -44
app.py CHANGED
@@ -4,39 +4,75 @@ from PIL import Image
4
  from transformers import pipeline
5
  import re
6
 
7
- # 1. Load keywords from separate files
 
 
 
 
 
 
8
  with open("smishing_keywords.txt", "r", encoding="utf-8") as f:
9
  SMISHING_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
10
 
11
  with open("other_scam_keywords.txt", "r", encoding="utf-8") as f:
12
  OTHER_SCAM_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
13
 
14
- # 2. Load the zero-shot classification pipeline
15
  model_name = "joeddav/xlm-roberta-large-xnli"
16
  classifier = pipeline("zero-shot-classification", model=model_name)
17
-
18
- # We will classify among these three labels
19
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
20
 
21
- def boost_probabilities(probabilities: dict, text: str) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  """
23
- Increases SMiShing probability if 'smishing_keywords' or URLs are found.
24
- Increases Other Scam probability if 'other_scam_keywords' are found.
25
- Reduces Legitimate by the total amount of these boosts.
26
- Then clamps negative probabilities to 0 and re-normalizes.
 
27
  """
28
  lower_text = text.lower()
29
 
30
- # Count smishing keywords
31
- smishing_keyword_count = sum(1 for kw in SMISHING_KEYWORDS if kw in lower_text)
32
- # Count other scam keywords
33
- other_scam_keyword_count = sum(1 for kw in OTHER_SCAM_KEYWORDS if kw in lower_text)
34
 
35
- # Base boosts
36
- smishing_boost = 0.30 * smishing_keyword_count
37
- other_scam_boost = 0.30 * other_scam_keyword_count
 
38
 
39
- # Check URLs => +0.20 only to Smishing
 
 
 
 
40
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
41
  if found_urls:
42
  smishing_boost += 0.35
@@ -50,7 +86,7 @@ def boost_probabilities(probabilities: dict, text: str) -> dict:
50
  p_smishing += smishing_boost
51
  p_other_scam += other_scam_boost
52
 
53
- # Subtract total boost from Legitimate
54
  total_boost = smishing_boost + other_scam_boost
55
  p_legit -= total_boost
56
 
@@ -62,28 +98,30 @@ def boost_probabilities(probabilities: dict, text: str) -> dict:
62
  if p_legit < 0:
63
  p_legit = 0.0
64
 
65
- # Re-normalize so sum=1
66
  total = p_smishing + p_other_scam + p_legit
67
  if total > 0:
68
  p_smishing /= total
69
  p_other_scam /= total
70
  p_legit /= total
71
  else:
72
- # fallback if everything is zero
73
  p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
74
 
75
  return {
76
  "SMiShing": p_smishing,
77
  "Other Scam": p_other_scam,
78
- "Legitimate": p_legit
 
79
  }
80
 
81
  def smishing_detector(text, image):
82
  """
83
- 1. OCR if image provided.
 
84
  2. Zero-shot classify => base probabilities.
85
- 3. Boost probabilities based on keywords + URL logic.
86
- 4. Return final classification + confidence.
87
  """
88
  combined_text = text or ""
89
  if image is not None:
@@ -96,12 +134,11 @@ def smishing_detector(text, image):
96
  "text_used_for_classification": "(none)",
97
  "label": "No text provided",
98
  "confidence": 0.0,
99
- "smishing_keywords_found": [],
100
- "other_scam_keywords_found": [],
101
  "urls_found": []
102
  }
103
 
104
- # Perform zero-shot classification
105
  result = classifier(
106
  sequences=combined_text,
107
  candidate_labels=CANDIDATE_LABELS,
@@ -109,29 +146,47 @@ def smishing_detector(text, image):
109
  )
110
  original_probs = dict(zip(result["labels"], result["scores"]))
111
 
112
- # Apply boosts
113
- boosted_probs = boost_probabilities(original_probs, combined_text)
114
- final_label = max(boosted_probs, key=boosted_probs.get)
115
- final_confidence = round(boosted_probs[final_label], 3)
 
 
 
 
 
116
 
117
- # For display: which keywords + URLs
118
  lower_text = combined_text.lower()
119
- smishing_found = [kw for kw in SMISHING_KEYWORDS if kw in lower_text]
120
- other_scam_found = [kw for kw in OTHER_SCAM_KEYWORDS if kw in lower_text]
 
121
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
122
 
 
 
 
 
 
 
 
 
 
 
 
123
  return {
 
124
  "text_used_for_classification": combined_text,
125
  "original_probabilities": {
126
  k: round(v, 3) for k, v in original_probs.items()
127
  },
128
  "boosted_probabilities": {
129
- k: round(v, 3) for k, v in boosted_probs.items()
130
  },
131
  "label": final_label,
132
  "confidence": final_confidence,
133
- "smishing_keywords_found": smishing_found,
134
- "other_scam_keywords_found": other_scam_found,
135
  "urls_found": found_urls,
136
  }
137
 
@@ -149,15 +204,12 @@ demo = gr.Interface(
149
  )
150
  ],
151
  outputs="json",
152
- title="SMiShing & Scam Detector (Separate Keywords + URL → SMiShing)",
153
  description="""
154
  This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
155
- (joeddav/xlm-roberta-large-xnli).
156
- - 'smishing_keywords.txt' boosts SMiShing specifically.
157
- - 'other_scam_keywords.txt' boosts Other Scam specifically.
158
- - Any URL found further boosts ONLY Smishing.
159
- - The total boost is subtracted from Legitimate.
160
- Supports English & Spanish text (OCR included).
161
  """,
162
  allow_flagging="never"
163
  )
 
4
  from transformers import pipeline
5
  import re
6
 
7
+ # Language detection & translation
8
+ from langdetect import detect
9
+ from googletrans import Translator
10
+
11
+ translator = Translator()
12
+
13
+ # 1. Load separate keywords for SMiShing and Other Scam (assumed in English)
14
  with open("smishing_keywords.txt", "r", encoding="utf-8") as f:
15
  SMISHING_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
16
 
17
  with open("other_scam_keywords.txt", "r", encoding="utf-8") as f:
18
  OTHER_SCAM_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
19
 
20
+ # 2. Zero-Shot Classification Pipeline
21
  model_name = "joeddav/xlm-roberta-large-xnli"
22
  classifier = pipeline("zero-shot-classification", model=model_name)
 
 
23
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
24
 
25
# Translated keyword lists, keyed by target language code. The source keyword
# lists are loaded once at import time, so their translations can be computed
# once and reused instead of calling the translator for every keyword on
# every request.
_TRANSLATED_KEYWORDS = {}


def _translate_keywords(keywords, dest):
    """Return the lower-cased translation of each English keyword into *dest*."""
    return [
        translator.translate(kw, src="en", dest=dest).text.lower()
        for kw in keywords
    ]


def get_keywords_by_language(text: str):
    """
    Pick the keyword lists that match the language of *text*.

    1. Detect the language with `langdetect`, using only the first 200
       characters to keep detection cheap on long inputs.
    2. If Spanish ('es'), return the English keyword lists translated to
       Spanish via googletrans (translated once, then cached).
    3. For English or any other language, return the original English lists.

    Returns a tuple ``(smishing_keywords, other_scam_keywords, lang)`` where
    ``lang`` is ``"es"`` or ``"en"``.

    Any exception raised by the translator propagates to the caller; nothing
    is cached in that case, so the next call retries the translation.
    """
    snippet = text[:200]
    try:
        detected_lang = detect(snippet)
    except Exception:
        # Detection fails on empty / feature-less input; treat it as English.
        # `Exception` (not a bare except) so Ctrl-C and SystemExit still work.
        detected_lang = "en"

    if detected_lang != "es":
        # Default to the English keywords for every non-Spanish language.
        return SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en"

    if "es" not in _TRANSLATED_KEYWORDS:
        # Store only after both lists translated successfully, so a failed
        # translation never leaves a partial entry in the cache.
        _TRANSLATED_KEYWORDS["es"] = (
            _translate_keywords(SMISHING_KEYWORDS, "es"),
            _translate_keywords(OTHER_SCAM_KEYWORDS, "es"),
        )

    smishing_in_spanish, other_scam_in_spanish = _TRANSLATED_KEYWORDS["es"]
    return smishing_in_spanish, other_scam_in_spanish, "es"
52
+
53
+ def boost_probabilities(probabilities: dict, text: str):
54
  """
55
+ 1. Load the appropriate keyword lists (English or Spanish).
56
+ 2. Count matches for SMiShing vs. Other Scam.
57
+ 3. If a URL is found, add an extra boost only to SMiShing.
58
+ 4. Subtract total boost from 'Legitimate'.
59
+ 5. Clamp negative probabilities to 0, re-normalize.
60
  """
61
  lower_text = text.lower()
62
 
63
+ # Grab the correct keyword lists based on language
64
+ smishing_keywords, other_scam_keywords, detected_lang = get_keywords_by_language(text)
 
 
65
 
66
+ # Count SMiShing keyword matches
67
+ smishing_count = sum(1 for kw in smishing_keywords if kw in lower_text)
68
+ # Count Other Scam keyword matches
69
+ other_scam_count = sum(1 for kw in other_scam_keywords if kw in lower_text)
70
 
71
+ # Base boost amounts
72
+ smishing_boost = 0.30 * smishing_count
73
+ other_scam_boost = 0.30 * other_scam_count
74
+
75
+ # Check for URLs => +0.35 only to SMiShing
76
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
77
  if found_urls:
78
  smishing_boost += 0.35
 
86
  p_smishing += smishing_boost
87
  p_other_scam += other_scam_boost
88
 
89
+ # Subtract total boost from 'Legitimate'
90
  total_boost = smishing_boost + other_scam_boost
91
  p_legit -= total_boost
92
 
 
98
  if p_legit < 0:
99
  p_legit = 0.0
100
 
101
+ # Re-normalize
102
  total = p_smishing + p_other_scam + p_legit
103
  if total > 0:
104
  p_smishing /= total
105
  p_other_scam /= total
106
  p_legit /= total
107
  else:
108
+ # fallback if everything is 0
109
  p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0
110
 
111
  return {
112
  "SMiShing": p_smishing,
113
  "Other Scam": p_other_scam,
114
+ "Legitimate": p_legit,
115
+ "detected_lang": detected_lang
116
  }
117
 
118
  def smishing_detector(text, image):
119
  """
120
+ Main function called by Gradio.
121
+ 1. Combine user text + OCR text (if an image is provided).
122
  2. Zero-shot classify => base probabilities.
123
+ 3. Apply language detection & translation if needed, then boost logic.
124
+ 4. Return final classification.
125
  """
126
  combined_text = text or ""
127
  if image is not None:
 
134
  "text_used_for_classification": "(none)",
135
  "label": "No text provided",
136
  "confidence": 0.0,
137
+ "keywords_found": [],
 
138
  "urls_found": []
139
  }
140
 
141
+ # 1. Zero-shot classification
142
  result = classifier(
143
  sequences=combined_text,
144
  candidate_labels=CANDIDATE_LABELS,
 
146
  )
147
  original_probs = dict(zip(result["labels"], result["scores"]))
148
 
149
+ # 2. Boost logic (including language detection + translation)
150
+ boosted = boost_probabilities(original_probs, combined_text)
151
+ final_label = max(boosted, key=boosted.get) if not isinstance(boosted.get("detected_lang"), float) else "Legitimate"
152
+ # to avoid conflict, let's store the detected language separately:
153
+ detected_lang = boosted.pop("detected_lang", "en")
154
+
155
+ # We have p_smishing, p_other_scam, p_legit left in boosted
156
+ final_label = max(boosted, key=boosted.get)
157
+ final_confidence = round(boosted[final_label], 3)
158
 
159
+ # 3. Identify which keywords & URLs we found
160
  lower_text = combined_text.lower()
161
+ # If we detected Spanish, we used the translated keywords to do matching. But let's also show them:
162
+ # For demonstration, let's just show the "English or Spanish" keywords. The code to show them in output
163
+ # can be the same as before, or you can do a second pass with the same logic from boost_probabilities.
164
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
165
 
166
+ # We'll do a quick second pass on actual matched keywords so user sees them
167
+ # - If language is es => we used translated Spanish keywords, let's do the same for display
168
+ # - If language is en => we used the original English lists
169
+ if detected_lang == "es":
170
+ smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
171
+ else:
172
+ smishing_keys, scam_keys, _ = (SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en")
173
+
174
+ found_smishing = [kw for kw in smishing_keys if kw in lower_text]
175
+ found_other_scam = [kw for kw in scam_keys if kw in lower_text]
176
+
177
  return {
178
+ "detected_language": detected_lang,
179
  "text_used_for_classification": combined_text,
180
  "original_probabilities": {
181
  k: round(v, 3) for k, v in original_probs.items()
182
  },
183
  "boosted_probabilities": {
184
+ k: round(v, 3) for k, v in boosted.items()
185
  },
186
  "label": final_label,
187
  "confidence": final_confidence,
188
+ "smishing_keywords_found": found_smishing,
189
+ "other_scam_keywords_found": found_other_scam,
190
  "urls_found": found_urls,
191
  }
192
 
 
204
  )
205
  ],
206
  outputs="json",
207
+ title="SMiShing & Scam Detector (Language Detection + Keyword Translation)",
208
  description="""
209
  This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
210
+ (joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
211
+ If Spanish, it translates the English-based keyword lists to Spanish before boosting the scores.
212
+ Any URL found further boosts SMiShing specifically.
 
 
 
213
  """,
214
  allow_flagging="never"
215
  )