ApsidalSolid4 committed on
Commit
99608c9
·
verified ·
1 Parent(s): 444f8bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -93
app.py CHANGED
@@ -176,102 +176,105 @@ class TextClassifier:
176
  'num_windows': len(predictions)
177
  }
178
 
179
- def detailed_scan(self, text: str) -> Dict:
180
- """Original prediction method with modified window handling"""
181
- if self.model is None or self.tokenizer is None:
182
- self.load_model()
183
-
184
- self.model.eval()
185
- sentences = self.processor.split_into_sentences(text)
186
- if not sentences:
187
- return {}
188
-
189
- # Create centered windows for each sentence
190
- windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
191
-
192
- # Track scores for each sentence
193
- sentence_appearances = {i: 0 for i in range(len(sentences))}
194
- sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
195
-
196
- # Process windows in batches
197
- batch_size = 16
198
- for i in range(0, len(windows), batch_size):
199
- batch_windows = windows[i:i + batch_size]
200
- batch_indices = window_sentence_indices[i:i + batch_size]
201
-
202
- inputs = self.tokenizer(
203
- batch_windows,
204
- truncation=True,
205
- padding=True,
206
- max_length=MAX_LENGTH,
207
- return_tensors="pt"
208
- ).to(self.device)
209
-
210
- with torch.no_grad():
211
- outputs = self.model(**inputs)
212
- probs = F.softmax(outputs.logits, dim=-1)
213
-
214
- # Attribute predictions more carefully
215
- for window_idx, indices in enumerate(batch_indices):
216
- center_idx = len(indices) // 2
217
- center_weight = 0.7 # Higher weight for center sentence
218
- edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
219
-
220
- for pos, sent_idx in enumerate(indices):
221
- # Apply higher weight to center sentence
222
- weight = center_weight if pos == center_idx else edge_weight
223
- sentence_appearances[sent_idx] += weight
224
- sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
225
- sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
226
-
227
- del inputs, outputs, probs
228
- if torch.cuda.is_available():
229
- torch.cuda.empty_cache()
230
-
231
- # Calculate final predictions
232
- sentence_predictions = []
233
- for i in range(len(sentences)):
234
- if sentence_appearances[i] > 0:
235
- human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
236
- ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
237
-
238
- # Only apply minimal smoothing at prediction boundaries
239
- if i > 0 and i < len(sentences) - 1:
240
- prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
241
- prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
242
- next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
243
- next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
244
-
245
- # Check if we're at a prediction boundary
246
- current_pred = 'human' if human_prob > ai_prob else 'ai'
247
- prev_pred = 'human' if prev_human > prev_ai else 'ai'
248
- next_pred = 'human' if next_human > next_ai else 'ai'
249
-
250
- if current_pred != prev_pred or current_pred != next_pred:
251
- # Small adjustment at boundaries
252
- smooth_factor = 0.1
253
- human_prob = (human_prob * (1 - smooth_factor) +
254
- (prev_human + next_human) * smooth_factor / 2)
255
- ai_prob = (ai_prob * (1 - smooth_factor) +
256
- (prev_ai + next_ai) * smooth_factor / 2)
257
-
258
- sentence_predictions.append({
259
- 'sentence': sentences[i],
260
- 'human_prob': human_prob,
261
- 'ai_prob': ai_prob,
262
- 'prediction': 'human' if human_prob > ai_prob else 'ai',
263
- 'confidence': max(human_prob, ai_prob)
264
- })
265
-
266
- return {
267
- 'sentence_predictions': sentence_predictions,
268
- 'highlighted_text': self.format_predictions_html(sentence_predictions),
269
- 'full_text': text,
270
- 'overall_prediction': self.aggregate_predictions(sentence_predictions)
271
- }
272
 
273
  def detailed_scan(self, text: str) -> Dict:
274
  """Perform a detailed scan with improved sentence-level analysis."""
 
 
 
275
  if not text.strip():
276
  return {
277
  'sentence_predictions': [],
 
176
  'num_windows': len(predictions)
177
  }
178
 
179
+ # def detailed_scan(self, text: str) -> Dict:
180
+ # """Original prediction method with modified window handling"""
181
+ # if self.model is None or self.tokenizer is None:
182
+ # self.load_model()
183
+
184
+ # self.model.eval()
185
+ # sentences = self.processor.split_into_sentences(text)
186
+ # if not sentences:
187
+ # return {}
188
+
189
+ # # Create centered windows for each sentence
190
+ # windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
191
+
192
+ # # Track scores for each sentence
193
+ # sentence_appearances = {i: 0 for i in range(len(sentences))}
194
+ # sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
195
+
196
+ # # Process windows in batches
197
+ # batch_size = 16
198
+ # for i in range(0, len(windows), batch_size):
199
+ # batch_windows = windows[i:i + batch_size]
200
+ # batch_indices = window_sentence_indices[i:i + batch_size]
201
+
202
+ # inputs = self.tokenizer(
203
+ # batch_windows,
204
+ # truncation=True,
205
+ # padding=True,
206
+ # max_length=MAX_LENGTH,
207
+ # return_tensors="pt"
208
+ # ).to(self.device)
209
+
210
+ # with torch.no_grad():
211
+ # outputs = self.model(**inputs)
212
+ # probs = F.softmax(outputs.logits, dim=-1)
213
+
214
+ # # Attribute predictions more carefully
215
+ # for window_idx, indices in enumerate(batch_indices):
216
+ # center_idx = len(indices) // 2
217
+ # center_weight = 0.7 # Higher weight for center sentence
218
+ # edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
219
+
220
+ # for pos, sent_idx in enumerate(indices):
221
+ # # Apply higher weight to center sentence
222
+ # weight = center_weight if pos == center_idx else edge_weight
223
+ # sentence_appearances[sent_idx] += weight
224
+ # sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
225
+ # sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
226
+
227
+ # del inputs, outputs, probs
228
+ # if torch.cuda.is_available():
229
+ # torch.cuda.empty_cache()
230
+
231
+ # # Calculate final predictions
232
+ # sentence_predictions = []
233
+ # for i in range(len(sentences)):
234
+ # if sentence_appearances[i] > 0:
235
+ # human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
236
+ # ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
237
+
238
+ # # Only apply minimal smoothing at prediction boundaries
239
+ # if i > 0 and i < len(sentences) - 1:
240
+ # prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
241
+ # prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
242
+ # next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
243
+ # next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
244
+
245
+ # # Check if we're at a prediction boundary
246
+ # current_pred = 'human' if human_prob > ai_prob else 'ai'
247
+ # prev_pred = 'human' if prev_human > prev_ai else 'ai'
248
+ # next_pred = 'human' if next_human > next_ai else 'ai'
249
+
250
+ # if current_pred != prev_pred or current_pred != next_pred:
251
+ # # Small adjustment at boundaries
252
+ # smooth_factor = 0.1
253
+ # human_prob = (human_prob * (1 - smooth_factor) +
254
+ # (prev_human + next_human) * smooth_factor / 2)
255
+ # ai_prob = (ai_prob * (1 - smooth_factor) +
256
+ # (prev_ai + next_ai) * smooth_factor / 2)
257
+
258
+ # sentence_predictions.append({
259
+ # 'sentence': sentences[i],
260
+ # 'human_prob': human_prob,
261
+ # 'ai_prob': ai_prob,
262
+ # 'prediction': 'human' if human_prob > ai_prob else 'ai',
263
+ # 'confidence': max(human_prob, ai_prob)
264
+ # })
265
+
266
+ # return {
267
+ # 'sentence_predictions': sentence_predictions,
268
+ # 'highlighted_text': self.format_predictions_html(sentence_predictions),
269
+ # 'full_text': text,
270
+ # 'overall_prediction': self.aggregate_predictions(sentence_predictions)
271
+ # }
272
 
273
  def detailed_scan(self, text: str) -> Dict:
274
  """Perform a detailed scan with improved sentence-level analysis."""
275
+ # Clean up trailing whitespace
276
+ text = text.rstrip()
277
+
278
  if not text.strip():
279
  return {
280
  'sentence_predictions': [],