ApsidalSolid4 committed on
Commit
2a56a00
·
verified ·
1 Parent(s): 8373deb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -34
app.py CHANGED
@@ -163,7 +163,7 @@ class TextClassifier:
163
  }
164
 
165
  def detailed_scan(self, text: str) -> Dict:
166
- """Optimized detailed scan with sentence-level analysis."""
167
  if not text.strip():
168
  return {
169
  'sentence_predictions': [],
@@ -180,22 +180,21 @@ class TextClassifier:
180
  if not sentences:
181
  return {}
182
 
183
- # Pre-calculate window information
184
  windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
185
- sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0, 'appearances': 0} for i in range(len(sentences))}
186
 
187
- # Calculate weights once
188
- center_weight = 0.7
189
- edge_weight = 0.3 / (WINDOW_SIZE - 1) if WINDOW_SIZE > 1 else 0.3
190
 
191
- # Process all windows in larger batches
192
- batch_size = min(32, len(windows)) # Increased batch size
193
  for i in range(0, len(windows), batch_size):
194
  batch_end = min(i + batch_size, len(windows))
195
  batch_windows = windows[i:batch_end]
196
  batch_indices = window_sentence_indices[i:batch_end]
197
 
198
- # Process batch
199
  inputs = self.tokenizer(
200
  batch_windows,
201
  truncation=True,
@@ -208,46 +207,54 @@ class TextClassifier:
208
  outputs = self.model(**inputs)
209
  probs = F.softmax(outputs.logits, dim=-1)
210
 
211
- # Process each window in the batch
212
  for window_idx, indices in enumerate(batch_indices):
213
  center_idx = len(indices) // 2
 
 
 
 
214
  window_human_prob = probs[window_idx][1].item()
215
  window_ai_prob = probs[window_idx][0].item()
216
 
217
- # Update scores for all sentences in this window
218
  for pos, sent_idx in enumerate(indices):
 
219
  weight = center_weight if pos == center_idx else edge_weight
 
220
  sentence_scores[sent_idx]['human_prob'] += weight * window_human_prob
221
  sentence_scores[sent_idx]['ai_prob'] += weight * window_ai_prob
222
- sentence_scores[sent_idx]['appearances'] += weight
223
 
 
224
  del inputs, outputs, probs
225
  if torch.cuda.is_available():
226
  torch.cuda.empty_cache()
227
 
228
- # Calculate final predictions
229
  sentence_predictions = []
230
- prev_pred = None
231
  for i in range(len(sentences)):
232
- scores = sentence_scores[i]
233
- if scores['appearances'] > 0:
234
- # Calculate base probabilities
235
- human_prob = scores['human_prob'] / scores['appearances']
236
- ai_prob = scores['ai_prob'] / scores['appearances']
237
- current_pred = 'human' if human_prob > ai_prob else 'ai'
 
 
 
 
238
 
239
- # Only apply smoothing at actual prediction boundaries
240
- if i > 0 and prev_pred and current_pred != prev_pred:
241
- # Simple smoothing only at boundaries
242
- smooth_factor = 0.1
243
- if i < len(sentences) - 1:
244
- next_scores = sentence_scores[i + 1]
245
- next_human = next_scores['human_prob'] / next_scores['appearances']
246
- next_ai = next_scores['ai_prob'] / next_scores['appearances']
247
-
248
- # Apply minimal smoothing
249
- human_prob = human_prob * (1 - smooth_factor) + next_human * smooth_factor
250
- ai_prob = ai_prob * (1 - smooth_factor) + next_ai * smooth_factor
251
 
252
  sentence_predictions.append({
253
  'sentence': sentences[i],
@@ -256,7 +263,6 @@ class TextClassifier:
256
  'prediction': 'human' if human_prob > ai_prob else 'ai',
257
  'confidence': max(human_prob, ai_prob)
258
  })
259
- prev_pred = current_pred
260
 
261
  return {
262
  'sentence_predictions': sentence_predictions,
@@ -264,7 +270,6 @@ class TextClassifier:
264
  'full_text': text,
265
  'overall_prediction': self.aggregate_predictions(sentence_predictions)
266
  }
267
-
268
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
269
  """Format predictions as HTML with color-coding."""
270
  html_parts = []
 
163
  }
164
 
165
  def detailed_scan(self, text: str) -> Dict:
166
+ """Perform a detailed scan with sentence-level analysis and improved boundary handling."""
167
  if not text.strip():
168
  return {
169
  'sentence_predictions': [],
 
180
  if not sentences:
181
  return {}
182
 
183
+ # Create centered windows for each sentence
184
  windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
 
185
 
186
+ # Track scores for each sentence
187
+ sentence_appearances = {i: 0 for i in range(len(sentences))}
188
+ sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
189
 
190
+ # Increased batch size and process windows more efficiently
191
+ batch_size = 32 # Increased from 16 to 32
192
  for i in range(0, len(windows), batch_size):
193
  batch_end = min(i + batch_size, len(windows))
194
  batch_windows = windows[i:batch_end]
195
  batch_indices = window_sentence_indices[i:batch_end]
196
 
197
+ # Process batch more efficiently
198
  inputs = self.tokenizer(
199
  batch_windows,
200
  truncation=True,
 
207
  outputs = self.model(**inputs)
208
  probs = F.softmax(outputs.logits, dim=-1)
209
 
210
+ # Attribute predictions with center-weighted approach
211
  for window_idx, indices in enumerate(batch_indices):
212
  center_idx = len(indices) // 2
213
+ center_weight = 0.7 # Higher weight for center sentence
214
+ edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
215
+
216
+ # Process probabilities once per window
217
  window_human_prob = probs[window_idx][1].item()
218
  window_ai_prob = probs[window_idx][0].item()
219
 
 
220
  for pos, sent_idx in enumerate(indices):
221
+ # Apply higher weight to center sentence
222
  weight = center_weight if pos == center_idx else edge_weight
223
+ sentence_appearances[sent_idx] += weight
224
  sentence_scores[sent_idx]['human_prob'] += weight * window_human_prob
225
  sentence_scores[sent_idx]['ai_prob'] += weight * window_ai_prob
 
226
 
227
+ # Clean up GPU memory more aggressively
228
  del inputs, outputs, probs
229
  if torch.cuda.is_available():
230
  torch.cuda.empty_cache()
231
 
232
+ # Calculate final predictions with boundary smoothing
233
  sentence_predictions = []
 
234
  for i in range(len(sentences)):
235
+ if sentence_appearances[i] > 0:
236
+ human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
237
+ ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
238
+
239
+ # Apply minimal smoothing at prediction boundaries
240
+ if i > 0 and i < len(sentences) - 1:
241
+ prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
242
+ prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
243
+ next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
244
+ next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
245
 
246
+ # Check if we're at a prediction boundary
247
+ current_pred = 'human' if human_prob > ai_prob else 'ai'
248
+ prev_pred = 'human' if prev_human > prev_ai else 'ai'
249
+ next_pred = 'human' if next_human > next_ai else 'ai'
250
+
251
+ if current_pred != prev_pred or current_pred != next_pred:
252
+ # Small adjustment at boundaries
253
+ smooth_factor = 0.1
254
+ human_prob = (human_prob * (1 - smooth_factor) +
255
+ (prev_human + next_human) * smooth_factor / 2)
256
+ ai_prob = (ai_prob * (1 - smooth_factor) +
257
+ (prev_ai + next_ai) * smooth_factor / 2)
258
 
259
  sentence_predictions.append({
260
  'sentence': sentences[i],
 
263
  'prediction': 'human' if human_prob > ai_prob else 'ai',
264
  'confidence': max(human_prob, ai_prob)
265
  })
 
266
 
267
  return {
268
  'sentence_predictions': sentence_predictions,
 
270
  'full_text': text,
271
  'overall_prediction': self.aggregate_predictions(sentence_predictions)
272
  }
 
273
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
274
  """Format predictions as HTML with color-coding."""
275
  html_parts = []