ApsidalSolid4 committed on
Commit
444f8bc
·
verified ·
1 Parent(s): 33fd63d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -25
app.py CHANGED
@@ -178,9 +178,100 @@ class TextClassifier:
178
 
179
  def detailed_scan(self, text: str) -> Dict:
180
  """Original prediction method with modified window handling"""
181
- # Clean up trailing whitespace
182
- text = text.rstrip()
183
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  if not text.strip():
185
  return {
186
  'sentence_predictions': [],
@@ -192,25 +283,23 @@ class TextClassifier:
192
  'num_sentences': 0
193
  }
194
  }
195
-
196
- self.model.eval()
197
  sentences = self.processor.split_into_sentences(text)
198
  if not sentences:
199
  return {}
200
-
201
  # Create centered windows for each sentence
202
  windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
203
-
204
  # Track scores for each sentence
205
  sentence_appearances = {i: 0 for i in range(len(sentences))}
206
  sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
207
-
208
  # Process windows in batches
209
- batch_size = 16
210
- for i in range(0, len(windows), batch_size):
211
- batch_windows = windows[i:i + batch_size]
212
- batch_indices = window_sentence_indices[i:i + batch_size]
213
-
214
  inputs = self.tokenizer(
215
  batch_windows,
216
  truncation=True,
@@ -218,48 +307,48 @@ class TextClassifier:
218
  max_length=MAX_LENGTH,
219
  return_tensors="pt"
220
  ).to(self.device)
221
-
222
  with torch.no_grad():
223
  outputs = self.model(**inputs)
224
  probs = F.softmax(outputs.logits, dim=-1)
225
-
226
  # Attribute predictions with weighted scoring
227
  for window_idx, indices in enumerate(batch_indices):
228
  center_idx = len(indices) // 2
229
  center_weight = 0.7 # Higher weight for center sentence
230
  edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
231
-
232
  for pos, sent_idx in enumerate(indices):
233
  # Apply higher weight to center sentence
234
  weight = center_weight if pos == center_idx else edge_weight
235
  sentence_appearances[sent_idx] += weight
236
  sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
237
  sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
238
-
239
  # Clean up memory
240
  del inputs, outputs, probs
241
  if torch.cuda.is_available():
242
  torch.cuda.empty_cache()
243
-
244
  # Calculate final predictions with boundary smoothing
245
  sentence_predictions = []
246
  for i in range(len(sentences)):
247
  if sentence_appearances[i] > 0:
248
  human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
249
  ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
250
-
251
- # Only apply minimal smoothing at prediction boundaries
252
  if i > 0 and i < len(sentences) - 1:
253
  prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
254
  prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
255
  next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
256
  next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
257
-
258
  # Check if we're at a prediction boundary
259
  current_pred = 'human' if human_prob > ai_prob else 'ai'
260
  prev_pred = 'human' if prev_human > prev_ai else 'ai'
261
  next_pred = 'human' if next_human > next_ai else 'ai'
262
-
263
  if current_pred != prev_pred or current_pred != next_pred:
264
  # Small adjustment at boundaries
265
  smooth_factor = 0.1
@@ -267,7 +356,7 @@ class TextClassifier:
267
  (prev_human + next_human) * smooth_factor / 2)
268
  ai_prob = (ai_prob * (1 - smooth_factor) +
269
  (prev_ai + next_ai) * smooth_factor / 2)
270
-
271
  sentence_predictions.append({
272
  'sentence': sentences[i],
273
  'human_prob': human_prob,
@@ -275,7 +364,7 @@ class TextClassifier:
275
  'prediction': 'human' if human_prob > ai_prob else 'ai',
276
  'confidence': max(human_prob, ai_prob)
277
  })
278
-
279
  return {
280
  'sentence_predictions': sentence_predictions,
281
  'highlighted_text': self.format_predictions_html(sentence_predictions),
@@ -283,7 +372,6 @@ class TextClassifier:
283
  'overall_prediction': self.aggregate_predictions(sentence_predictions)
284
  }
285
 
286
-
287
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
288
  """Format predictions as HTML with color-coding."""
289
  html_parts = []
 
178
 
179
  def detailed_scan(self, text: str) -> Dict:
180
  """Original prediction method with modified window handling"""
181
+ if self.model is None or self.tokenizer is None:
182
+ self.load_model()
183
+
184
+ self.model.eval()
185
+ sentences = self.processor.split_into_sentences(text)
186
+ if not sentences:
187
+ return {}
188
+
189
+ # Create centered windows for each sentence
190
+ windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
191
+
192
+ # Track scores for each sentence
193
+ sentence_appearances = {i: 0 for i in range(len(sentences))}
194
+ sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
195
+
196
+ # Process windows in batches
197
+ batch_size = 16
198
+ for i in range(0, len(windows), batch_size):
199
+ batch_windows = windows[i:i + batch_size]
200
+ batch_indices = window_sentence_indices[i:i + batch_size]
201
+
202
+ inputs = self.tokenizer(
203
+ batch_windows,
204
+ truncation=True,
205
+ padding=True,
206
+ max_length=MAX_LENGTH,
207
+ return_tensors="pt"
208
+ ).to(self.device)
209
+
210
+ with torch.no_grad():
211
+ outputs = self.model(**inputs)
212
+ probs = F.softmax(outputs.logits, dim=-1)
213
+
214
+ # Attribute predictions more carefully
215
+ for window_idx, indices in enumerate(batch_indices):
216
+ center_idx = len(indices) // 2
217
+ center_weight = 0.7 # Higher weight for center sentence
218
+ edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
219
+
220
+ for pos, sent_idx in enumerate(indices):
221
+ # Apply higher weight to center sentence
222
+ weight = center_weight if pos == center_idx else edge_weight
223
+ sentence_appearances[sent_idx] += weight
224
+ sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
225
+ sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
226
+
227
+ del inputs, outputs, probs
228
+ if torch.cuda.is_available():
229
+ torch.cuda.empty_cache()
230
+
231
+ # Calculate final predictions
232
+ sentence_predictions = []
233
+ for i in range(len(sentences)):
234
+ if sentence_appearances[i] > 0:
235
+ human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
236
+ ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
237
+
238
+ # Only apply minimal smoothing at prediction boundaries
239
+ if i > 0 and i < len(sentences) - 1:
240
+ prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
241
+ prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
242
+ next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
243
+ next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
244
+
245
+ # Check if we're at a prediction boundary
246
+ current_pred = 'human' if human_prob > ai_prob else 'ai'
247
+ prev_pred = 'human' if prev_human > prev_ai else 'ai'
248
+ next_pred = 'human' if next_human > next_ai else 'ai'
249
+
250
+ if current_pred != prev_pred or current_pred != next_pred:
251
+ # Small adjustment at boundaries
252
+ smooth_factor = 0.1
253
+ human_prob = (human_prob * (1 - smooth_factor) +
254
+ (prev_human + next_human) * smooth_factor / 2)
255
+ ai_prob = (ai_prob * (1 - smooth_factor) +
256
+ (prev_ai + next_ai) * smooth_factor / 2)
257
+
258
+ sentence_predictions.append({
259
+ 'sentence': sentences[i],
260
+ 'human_prob': human_prob,
261
+ 'ai_prob': ai_prob,
262
+ 'prediction': 'human' if human_prob > ai_prob else 'ai',
263
+ 'confidence': max(human_prob, ai_prob)
264
+ })
265
+
266
+ return {
267
+ 'sentence_predictions': sentence_predictions,
268
+ 'highlighted_text': self.format_predictions_html(sentence_predictions),
269
+ 'full_text': text,
270
+ 'overall_prediction': self.aggregate_predictions(sentence_predictions)
271
+ }
272
+
273
+ def detailed_scan(self, text: str) -> Dict:
274
+ """Perform a detailed scan with improved sentence-level analysis."""
275
  if not text.strip():
276
  return {
277
  'sentence_predictions': [],
 
283
  'num_sentences': 0
284
  }
285
  }
286
+
 
287
  sentences = self.processor.split_into_sentences(text)
288
  if not sentences:
289
  return {}
290
+
291
  # Create centered windows for each sentence
292
  windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
293
+
294
  # Track scores for each sentence
295
  sentence_appearances = {i: 0 for i in range(len(sentences))}
296
  sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
297
+
298
  # Process windows in batches
299
+ for i in range(0, len(windows), BATCH_SIZE):
300
+ batch_windows = windows[i:i + BATCH_SIZE]
301
+ batch_indices = window_sentence_indices[i:i + BATCH_SIZE]
302
+
 
303
  inputs = self.tokenizer(
304
  batch_windows,
305
  truncation=True,
 
307
  max_length=MAX_LENGTH,
308
  return_tensors="pt"
309
  ).to(self.device)
310
+
311
  with torch.no_grad():
312
  outputs = self.model(**inputs)
313
  probs = F.softmax(outputs.logits, dim=-1)
314
+
315
  # Attribute predictions with weighted scoring
316
  for window_idx, indices in enumerate(batch_indices):
317
  center_idx = len(indices) // 2
318
  center_weight = 0.7 # Higher weight for center sentence
319
  edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
320
+
321
  for pos, sent_idx in enumerate(indices):
322
  # Apply higher weight to center sentence
323
  weight = center_weight if pos == center_idx else edge_weight
324
  sentence_appearances[sent_idx] += weight
325
  sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
326
  sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
327
+
328
  # Clean up memory
329
  del inputs, outputs, probs
330
  if torch.cuda.is_available():
331
  torch.cuda.empty_cache()
332
+
333
  # Calculate final predictions with boundary smoothing
334
  sentence_predictions = []
335
  for i in range(len(sentences)):
336
  if sentence_appearances[i] > 0:
337
  human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
338
  ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
339
+
340
+ # Apply minimal smoothing at prediction boundaries
341
  if i > 0 and i < len(sentences) - 1:
342
  prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
343
  prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
344
  next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
345
  next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
346
+
347
  # Check if we're at a prediction boundary
348
  current_pred = 'human' if human_prob > ai_prob else 'ai'
349
  prev_pred = 'human' if prev_human > prev_ai else 'ai'
350
  next_pred = 'human' if next_human > next_ai else 'ai'
351
+
352
  if current_pred != prev_pred or current_pred != next_pred:
353
  # Small adjustment at boundaries
354
  smooth_factor = 0.1
 
356
  (prev_human + next_human) * smooth_factor / 2)
357
  ai_prob = (ai_prob * (1 - smooth_factor) +
358
  (prev_ai + next_ai) * smooth_factor / 2)
359
+
360
  sentence_predictions.append({
361
  'sentence': sentences[i],
362
  'human_prob': human_prob,
 
364
  'prediction': 'human' if human_prob > ai_prob else 'ai',
365
  'confidence': max(human_prob, ai_prob)
366
  })
367
+
368
  return {
369
  'sentence_predictions': sentence_predictions,
370
  'highlighted_text': self.format_predictions_html(sentence_predictions),
 
372
  'overall_prediction': self.aggregate_predictions(sentence_predictions)
373
  }
374
 
 
375
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
376
  """Format predictions as HTML with color-coding."""
377
  html_parts = []