ApsidalSolid4 committed on
Commit
9a1a827
·
verified ·
1 Parent(s): eb0611f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -192
app.py CHANGED
@@ -176,201 +176,113 @@ class TextClassifier:
176
  'num_windows': len(predictions)
177
  }
178
 
179
- def detailed_scan(self, text: str) -> Dict:
180
- """Original prediction method with modified window handling"""
181
- if self.model is None or self.tokenizer is None:
182
- self.load_model()
183
-
184
- self.model.eval()
185
- sentences = self.processor.split_into_sentences(text)
186
- if not sentences:
187
- return {}
188
-
189
- # Create centered windows for each sentence
190
- windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
191
-
192
- # Track scores for each sentence
193
- sentence_appearances = {i: 0 for i in range(len(sentences))}
194
- sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
195
-
196
- # Process windows in batches
197
- batch_size = 16
198
- for i in range(0, len(windows), batch_size):
199
- batch_windows = windows[i:i + batch_size]
200
- batch_indices = window_sentence_indices[i:i + batch_size]
201
-
202
- inputs = self.tokenizer(
203
- batch_windows,
204
- truncation=True,
205
- padding=True,
206
- max_length=MAX_LENGTH,
207
- return_tensors="pt"
208
- ).to(self.device)
209
-
210
- with torch.no_grad():
211
- outputs = self.model(**inputs)
212
- probs = F.softmax(outputs.logits, dim=-1)
213
-
214
- # Attribute predictions more carefully
215
- for window_idx, indices in enumerate(batch_indices):
216
- center_idx = len(indices) // 2
217
- center_weight = 0.7 # Higher weight for center sentence
218
- edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
219
-
220
- for pos, sent_idx in enumerate(indices):
221
- # Apply higher weight to center sentence
222
- weight = center_weight if pos == center_idx else edge_weight
223
- sentence_appearances[sent_idx] += weight
224
- sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
225
- sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
226
-
227
- del inputs, outputs, probs
228
- if torch.cuda.is_available():
229
- torch.cuda.empty_cache()
230
-
231
- # Calculate final predictions
232
- sentence_predictions = []
233
- for i in range(len(sentences)):
234
- if sentence_appearances[i] > 0:
235
- human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
236
- ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
237
-
238
- # Only apply minimal smoothing at prediction boundaries
239
- if i > 0 and i < len(sentences) - 1:
240
- prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
241
- prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
242
- next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
243
- next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
244
-
245
- # Check if we're at a prediction boundary
246
- current_pred = 'human' if human_prob > ai_prob else 'ai'
247
- prev_pred = 'human' if prev_human > prev_ai else 'ai'
248
- next_pred = 'human' if next_human > next_ai else 'ai'
249
-
250
- if current_pred != prev_pred or current_pred != next_pred:
251
- # Small adjustment at boundaries
252
- smooth_factor = 0.1
253
- human_prob = (human_prob * (1 - smooth_factor) +
254
- (prev_human + next_human) * smooth_factor / 2)
255
- ai_prob = (ai_prob * (1 - smooth_factor) +
256
- (prev_ai + next_ai) * smooth_factor / 2)
257
-
258
- sentence_predictions.append({
259
- 'sentence': sentences[i],
260
- 'human_prob': human_prob,
261
- 'ai_prob': ai_prob,
262
- 'prediction': 'human' if human_prob > ai_prob else 'ai',
263
- 'confidence': max(human_prob, ai_prob)
264
- })
265
-
266
- return {
267
- 'sentence_predictions': sentence_predictions,
268
- 'highlighted_text': self.format_predictions_html(sentence_predictions),
269
- 'full_text': text,
270
- 'overall_prediction': self.aggregate_predictions(sentence_predictions)
271
- }
272
-
273
- def detailed_scan(self, text: str) -> Dict:
274
- """Perform a detailed scan with improved sentence-level analysis."""
275
- if not text.strip():
276
- return {
277
- 'sentence_predictions': [],
278
- 'highlighted_text': '',
279
- 'full_text': '',
280
- 'overall_prediction': {
281
- 'prediction': 'unknown',
282
- 'confidence': 0.0,
283
- 'num_sentences': 0
284
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  }
286
 
287
- sentences = self.processor.split_into_sentences(text)
288
- if not sentences:
289
- return {}
290
-
291
- # Create centered windows for each sentence
292
- windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
293
-
294
- # Track scores for each sentence
295
- sentence_appearances = {i: 0 for i in range(len(sentences))}
296
- sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
297
-
298
- # Process windows in batches
299
- for i in range(0, len(windows), BATCH_SIZE):
300
- batch_windows = windows[i:i + BATCH_SIZE]
301
- batch_indices = window_sentence_indices[i:i + BATCH_SIZE]
302
-
303
- inputs = self.tokenizer(
304
- batch_windows,
305
- truncation=True,
306
- padding=True,
307
- max_length=MAX_LENGTH,
308
- return_tensors="pt"
309
- ).to(self.device)
310
-
311
- with torch.no_grad():
312
- outputs = self.model(**inputs)
313
- probs = F.softmax(outputs.logits, dim=-1)
314
-
315
- # Attribute predictions with weighted scoring
316
- for window_idx, indices in enumerate(batch_indices):
317
- center_idx = len(indices) // 2
318
- center_weight = 0.7 # Higher weight for center sentence
319
- edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
320
-
321
- for pos, sent_idx in enumerate(indices):
322
- # Apply higher weight to center sentence
323
- weight = center_weight if pos == center_idx else edge_weight
324
- sentence_appearances[sent_idx] += weight
325
- sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
326
- sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
327
-
328
- # Clean up memory
329
- del inputs, outputs, probs
330
- if torch.cuda.is_available():
331
- torch.cuda.empty_cache()
332
-
333
- # Calculate final predictions with boundary smoothing
334
- sentence_predictions = []
335
- for i in range(len(sentences)):
336
- if sentence_appearances[i] > 0:
337
- human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
338
- ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
339
-
340
- # Apply minimal smoothing at prediction boundaries
341
- if i > 0 and i < len(sentences) - 1:
342
- prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
343
- prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
344
- next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
345
- next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
346
-
347
- # Check if we're at a prediction boundary
348
- current_pred = 'human' if human_prob > ai_prob else 'ai'
349
- prev_pred = 'human' if prev_human > prev_ai else 'ai'
350
- next_pred = 'human' if next_human > next_ai else 'ai'
351
-
352
- if current_pred != prev_pred or current_pred != next_pred:
353
- # Small adjustment at boundaries
354
- smooth_factor = 0.1
355
- human_prob = (human_prob * (1 - smooth_factor) +
356
- (prev_human + next_human) * smooth_factor / 2)
357
- ai_prob = (ai_prob * (1 - smooth_factor) +
358
- (prev_ai + next_ai) * smooth_factor / 2)
359
-
360
- sentence_predictions.append({
361
- 'sentence': sentences[i],
362
- 'human_prob': human_prob,
363
- 'ai_prob': ai_prob,
364
- 'prediction': 'human' if human_prob > ai_prob else 'ai',
365
- 'confidence': max(human_prob, ai_prob)
366
- })
367
-
368
- return {
369
- 'sentence_predictions': sentence_predictions,
370
- 'highlighted_text': self.format_predictions_html(sentence_predictions),
371
- 'full_text': text,
372
- 'overall_prediction': self.aggregate_predictions(sentence_predictions)
373
- }
374
 
375
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
376
  """Format predictions as HTML with color-coding."""
 
176
  'num_windows': len(predictions)
177
  }
178
 
179
+ def detailed_scan(self, text: str) -> Dict:
180
+ """Original prediction method with modified window handling"""
181
+ # Clean up trailing whitespace
182
+ text = text.rstrip()
183
+
184
+ if not text.strip():
185
+ return {
186
+ 'sentence_predictions': [],
187
+ 'highlighted_text': '',
188
+ 'full_text': '',
189
+ 'overall_prediction': {
190
+ 'prediction': 'unknown',
191
+ 'confidence': 0.0,
192
+ 'num_sentences': 0
193
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  }
195
+
196
+ self.model.eval()
197
+ sentences = self.processor.split_into_sentences(text)
198
+ if not sentences:
199
+ return {}
200
+
201
+ # Create centered windows for each sentence
202
+ windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
203
+
204
+ # Track scores for each sentence
205
+ sentence_appearances = {i: 0 for i in range(len(sentences))}
206
+ sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
207
+
208
+ # Process windows in batches
209
+ batch_size = 16
210
+ for i in range(0, len(windows), batch_size):
211
+ batch_windows = windows[i:i + batch_size]
212
+ batch_indices = window_sentence_indices[i:i + batch_size]
213
+
214
+ inputs = self.tokenizer(
215
+ batch_windows,
216
+ truncation=True,
217
+ padding=True,
218
+ max_length=MAX_LENGTH,
219
+ return_tensors="pt"
220
+ ).to(self.device)
221
+
222
+ with torch.no_grad():
223
+ outputs = self.model(**inputs)
224
+ probs = F.softmax(outputs.logits, dim=-1)
225
+
226
+ # Attribute predictions with weighted scoring
227
+ for window_idx, indices in enumerate(batch_indices):
228
+ center_idx = len(indices) // 2
229
+ center_weight = 0.7 # Higher weight for center sentence
230
+ edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
231
+
232
+ for pos, sent_idx in enumerate(indices):
233
+ # Apply higher weight to center sentence
234
+ weight = center_weight if pos == center_idx else edge_weight
235
+ sentence_appearances[sent_idx] += weight
236
+ sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
237
+ sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
238
+
239
+ # Clean up memory
240
+ del inputs, outputs, probs
241
+ if torch.cuda.is_available():
242
+ torch.cuda.empty_cache()
243
+
244
+ # Calculate final predictions with boundary smoothing
245
+ sentence_predictions = []
246
+ for i in range(len(sentences)):
247
+ if sentence_appearances[i] > 0:
248
+ human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
249
+ ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
250
+
251
+ # Only apply minimal smoothing at prediction boundaries
252
+ if i > 0 and i < len(sentences) - 1:
253
+ prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
254
+ prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
255
+ next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
256
+ next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
257
+
258
+ # Check if we're at a prediction boundary
259
+ current_pred = 'human' if human_prob > ai_prob else 'ai'
260
+ prev_pred = 'human' if prev_human > prev_ai else 'ai'
261
+ next_pred = 'human' if next_human > next_ai else 'ai'
262
+
263
+ if current_pred != prev_pred or current_pred != next_pred:
264
+ # Small adjustment at boundaries
265
+ smooth_factor = 0.1
266
+ human_prob = (human_prob * (1 - smooth_factor) +
267
+ (prev_human + next_human) * smooth_factor / 2)
268
+ ai_prob = (ai_prob * (1 - smooth_factor) +
269
+ (prev_ai + next_ai) * smooth_factor / 2)
270
+
271
+ sentence_predictions.append({
272
+ 'sentence': sentences[i],
273
+ 'human_prob': human_prob,
274
+ 'ai_prob': ai_prob,
275
+ 'prediction': 'human' if human_prob > ai_prob else 'ai',
276
+ 'confidence': max(human_prob, ai_prob)
277
+ })
278
+
279
+ return {
280
+ 'sentence_predictions': sentence_predictions,
281
+ 'highlighted_text': self.format_predictions_html(sentence_predictions),
282
+ 'full_text': text,
283
+ 'overall_prediction': self.aggregate_predictions(sentence_predictions)
284
  }
285
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
288
  """Format predictions as HTML with color-coding."""