ApsidalSolid4 committed on
Commit
e3d20ad
·
verified ·
1 Parent(s): d475cd9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -39
app.py CHANGED
@@ -13,18 +13,16 @@ from functools import partial
13
  import time
14
  from datetime import datetime
15
 
16
-
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
-
21
  MAX_LENGTH = 512
22
  MODEL_NAME = "microsoft/deberta-v3-small"
23
  WINDOW_SIZE = 6
24
  WINDOW_OVERLAP = 2
25
  CONFIDENCE_THRESHOLD = 0.65
26
- BATCH_SIZE = 8
27
- MAX_WORKERS = 4
28
 
29
  class TextWindowProcessor:
30
  def __init__(self):
@@ -41,7 +39,6 @@ class TextWindowProcessor:
41
  disabled_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != 'sentencizer']
42
  self.nlp.disable_pipes(*disabled_pipes)
43
 
44
-
45
  self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
46
 
47
  def split_into_sentences(self, text: str) -> List[str]:
@@ -64,12 +61,10 @@ class TextWindowProcessor:
64
  window_sentence_indices = []
65
 
66
  for i in range(len(sentences)):
67
-
68
  half_window = window_size // 2
69
  start_idx = max(0, i - half_window)
70
  end_idx = min(len(sentences), i + half_window + 1)
71
 
72
-
73
  window = sentences[start_idx:end_idx]
74
  windows.append(" ".join(window))
75
  window_sentence_indices.append(list(range(start_idx, end_idx)))
@@ -78,7 +73,6 @@ class TextWindowProcessor:
78
 
79
  class TextClassifier:
80
  def __init__(self):
81
-
82
  if not torch.cuda.is_available():
83
  torch.set_num_threads(MAX_WORKERS)
84
  torch.set_num_interop_threads(MAX_WORKERS)
@@ -91,7 +85,6 @@ class TextClassifier:
91
  self.initialize_model()
92
 
93
  def initialize_model(self):
94
- """Initialize the model and tokenizer."""
95
  logger.info("Initializing model and tokenizer...")
96
 
97
  from transformers import DebertaV2TokenizerFast
@@ -130,7 +123,6 @@ class TextClassifier:
130
 
131
  predictions = []
132
 
133
-
134
  for i in range(0, len(windows), BATCH_SIZE):
135
  batch_windows = windows[i:i + BATCH_SIZE]
136
 
@@ -155,7 +147,6 @@ class TextClassifier:
155
  }
156
  predictions.append(prediction)
157
 
158
-
159
  del inputs, outputs, probs
160
  if torch.cuda.is_available():
161
  torch.cuda.empty_cache()
@@ -177,7 +168,6 @@ class TextClassifier:
177
  }
178
 
179
  def detailed_scan(self, text: str) -> Dict:
180
-
181
  text = text.rstrip()
182
 
183
  if not text.strip():
@@ -196,14 +186,11 @@ class TextClassifier:
196
  if not sentences:
197
  return {}
198
 
199
-
200
  windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
201
 
202
-
203
  sentence_appearances = {i: 0 for i in range(len(sentences))}
204
  sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
205
 
206
-
207
  for i in range(0, len(windows), BATCH_SIZE):
208
  batch_windows = windows[i:i + BATCH_SIZE]
209
  batch_indices = window_sentence_indices[i:i + BATCH_SIZE]
@@ -220,45 +207,38 @@ class TextClassifier:
220
  outputs = self.model(**inputs)
221
  probs = F.softmax(outputs.logits, dim=-1)
222
 
223
-
224
  for window_idx, indices in enumerate(batch_indices):
225
  center_idx = len(indices) // 2
226
- center_weight = 0.7
227
- edge_weight = 0.3 / (len(indices) - 1)
228
 
229
  for pos, sent_idx in enumerate(indices):
230
-
231
  weight = center_weight if pos == center_idx else edge_weight
232
  sentence_appearances[sent_idx] += weight
233
  sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
234
  sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
235
 
236
-
237
  del inputs, outputs, probs
238
  if torch.cuda.is_available():
239
  torch.cuda.empty_cache()
240
 
241
-
242
  sentence_predictions = []
243
  for i in range(len(sentences)):
244
  if sentence_appearances[i] > 0:
245
  human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
246
  ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
247
 
248
-
249
  if i > 0 and i < len(sentences) - 1:
250
  prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
251
  prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
252
  next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
253
  next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
254
 
255
-
256
  current_pred = 'human' if human_prob > ai_prob else 'ai'
257
  prev_pred = 'human' if prev_human > prev_ai else 'ai'
258
  next_pred = 'human' if next_human > next_ai else 'ai'
259
 
260
  if current_pred != prev_pred or current_pred != next_pred:
261
-
262
  smooth_factor = 0.1
263
  human_prob = (human_prob * (1 - smooth_factor) +
264
  (prev_human + next_human) * smooth_factor / 2)
@@ -289,14 +269,14 @@ class TextClassifier:
289
 
290
  if confidence >= CONFIDENCE_THRESHOLD:
291
  if pred['prediction'] == 'human':
292
- color = "
293
  else:
294
- color = "
295
  else:
296
  if pred['prediction'] == 'human':
297
- color = "
298
  else:
299
- color = "
300
 
301
  html_parts.append(f'<span style="background-color: {color};">{sentence}</span>')
302
 
@@ -324,13 +304,10 @@ class TextClassifier:
324
  }
325
 
326
  def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
327
-
328
  start_time = time.time()
329
 
330
-
331
  word_count = len(text.split())
332
 
333
-
334
  original_mode = mode
335
  if word_count < 200 and mode == "detailed":
336
  mode = "quick"
@@ -344,15 +321,13 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
344
  Windows analyzed: {result['num_windows']}
345
  """
346
 
347
-
348
  if original_mode == "detailed":
349
  quick_analysis += f"\n\nNote: Switched to quick mode because text contains only {word_count} words. Minimum 200 words required for detailed analysis."
350
 
351
-
352
  execution_time = (time.time() - start_time) * 1000
353
 
354
  return (
355
- text,
356
  "Quick scan mode - no sentence-level analysis available",
357
  quick_analysis
358
  )
@@ -374,7 +349,6 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
374
  Number of sentences analyzed: {final_pred['num_sentences']}
375
  """
376
 
377
-
378
  execution_time = (time.time() - start_time) * 1000
379
 
380
  return (
@@ -383,10 +357,8 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
383
  overall_result
384
  )
385
 
386
-
387
  classifier = TextClassifier()
388
 
389
-
390
  demo = gr.Interface(
391
  fn=lambda text, mode: analyze_text(text, mode, classifier),
392
  inputs=[
@@ -413,12 +385,11 @@ demo = gr.Interface(
413
  flagging_mode="never"
414
  )
415
 
416
-
417
  app = demo.app
418
 
419
  app.add_middleware(
420
  CORSMiddleware,
421
- allow_origins=["*"],
422
  allow_credentials=True,
423
  allow_methods=["GET", "POST", "OPTIONS"],
424
  allow_headers=["*"],
 
13
  import time
14
  from datetime import datetime
15
 
 
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
 
19
  MAX_LENGTH = 512
20
  MODEL_NAME = "microsoft/deberta-v3-small"
21
  WINDOW_SIZE = 6
22
  WINDOW_OVERLAP = 2
23
  CONFIDENCE_THRESHOLD = 0.65
24
+ BATCH_SIZE = 8
25
+ MAX_WORKERS = 4
26
 
27
  class TextWindowProcessor:
28
  def __init__(self):
 
39
  disabled_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != 'sentencizer']
40
  self.nlp.disable_pipes(*disabled_pipes)
41
 
 
42
  self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
43
 
44
  def split_into_sentences(self, text: str) -> List[str]:
 
61
  window_sentence_indices = []
62
 
63
  for i in range(len(sentences)):
 
64
  half_window = window_size // 2
65
  start_idx = max(0, i - half_window)
66
  end_idx = min(len(sentences), i + half_window + 1)
67
 
 
68
  window = sentences[start_idx:end_idx]
69
  windows.append(" ".join(window))
70
  window_sentence_indices.append(list(range(start_idx, end_idx)))
 
73
 
74
  class TextClassifier:
75
  def __init__(self):
 
76
  if not torch.cuda.is_available():
77
  torch.set_num_threads(MAX_WORKERS)
78
  torch.set_num_interop_threads(MAX_WORKERS)
 
85
  self.initialize_model()
86
 
87
  def initialize_model(self):
 
88
  logger.info("Initializing model and tokenizer...")
89
 
90
  from transformers import DebertaV2TokenizerFast
 
123
 
124
  predictions = []
125
 
 
126
  for i in range(0, len(windows), BATCH_SIZE):
127
  batch_windows = windows[i:i + BATCH_SIZE]
128
 
 
147
  }
148
  predictions.append(prediction)
149
 
 
150
  del inputs, outputs, probs
151
  if torch.cuda.is_available():
152
  torch.cuda.empty_cache()
 
168
  }
169
 
170
  def detailed_scan(self, text: str) -> Dict:
 
171
  text = text.rstrip()
172
 
173
  if not text.strip():
 
186
  if not sentences:
187
  return {}
188
 
 
189
  windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
190
 
 
191
  sentence_appearances = {i: 0 for i in range(len(sentences))}
192
  sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
193
 
 
194
  for i in range(0, len(windows), BATCH_SIZE):
195
  batch_windows = windows[i:i + BATCH_SIZE]
196
  batch_indices = window_sentence_indices[i:i + BATCH_SIZE]
 
207
  outputs = self.model(**inputs)
208
  probs = F.softmax(outputs.logits, dim=-1)
209
 
 
210
  for window_idx, indices in enumerate(batch_indices):
211
  center_idx = len(indices) // 2
212
+ center_weight = 0.7
213
+ edge_weight = 0.3 / (len(indices) - 1)
214
 
215
  for pos, sent_idx in enumerate(indices):
 
216
  weight = center_weight if pos == center_idx else edge_weight
217
  sentence_appearances[sent_idx] += weight
218
  sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
219
  sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
220
 
 
221
  del inputs, outputs, probs
222
  if torch.cuda.is_available():
223
  torch.cuda.empty_cache()
224
 
 
225
  sentence_predictions = []
226
  for i in range(len(sentences)):
227
  if sentence_appearances[i] > 0:
228
  human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
229
  ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
230
 
 
231
  if i > 0 and i < len(sentences) - 1:
232
  prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
233
  prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
234
  next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
235
  next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
236
 
 
237
  current_pred = 'human' if human_prob > ai_prob else 'ai'
238
  prev_pred = 'human' if prev_human > prev_ai else 'ai'
239
  next_pred = 'human' if next_human > next_ai else 'ai'
240
 
241
  if current_pred != prev_pred or current_pred != next_pred:
 
242
  smooth_factor = 0.1
243
  human_prob = (human_prob * (1 - smooth_factor) +
244
  (prev_human + next_human) * smooth_factor / 2)
 
269
 
270
  if confidence >= CONFIDENCE_THRESHOLD:
271
  if pred['prediction'] == 'human':
272
+ color = "#90EE90"
273
  else:
274
+ color = "#FFB6C6"
275
  else:
276
  if pred['prediction'] == 'human':
277
+ color = "#E8F5E9"
278
  else:
279
+ color = "#FFEBEE"
280
 
281
  html_parts.append(f'<span style="background-color: {color};">{sentence}</span>')
282
 
 
304
  }
305
 
306
  def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
 
307
  start_time = time.time()
308
 
 
309
  word_count = len(text.split())
310
 
 
311
  original_mode = mode
312
  if word_count < 200 and mode == "detailed":
313
  mode = "quick"
 
321
  Windows analyzed: {result['num_windows']}
322
  """
323
 
 
324
  if original_mode == "detailed":
325
  quick_analysis += f"\n\nNote: Switched to quick mode because text contains only {word_count} words. Minimum 200 words required for detailed analysis."
326
 
 
327
  execution_time = (time.time() - start_time) * 1000
328
 
329
  return (
330
+ text,
331
  "Quick scan mode - no sentence-level analysis available",
332
  quick_analysis
333
  )
 
349
  Number of sentences analyzed: {final_pred['num_sentences']}
350
  """
351
 
 
352
  execution_time = (time.time() - start_time) * 1000
353
 
354
  return (
 
357
  overall_result
358
  )
359
 
 
360
  classifier = TextClassifier()
361
 
 
362
  demo = gr.Interface(
363
  fn=lambda text, mode: analyze_text(text, mode, classifier),
364
  inputs=[
 
385
  flagging_mode="never"
386
  )
387
 
 
388
  app = demo.app
389
 
390
  app.add_middleware(
391
  CORSMiddleware,
392
+ allow_origins=["*"],
393
  allow_credentials=True,
394
  allow_methods=["GET", "POST", "OPTIONS"],
395
  allow_headers=["*"],