ApsidalSolid4 committed on
Commit
1bb7d9d
·
verified ·
1 Parent(s): 2a56a00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -67
app.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  import torch.nn.functional as F
5
  import spacy
6
- from typing import List, Dict, Tuple
7
  import logging
8
  import os
9
  import gradio as gr
@@ -51,22 +51,25 @@ class TextWindowProcessor:
51
  windows.append(" ".join(window))
52
  return windows
53
 
54
- def create_centered_windows(self, sentences: List[str], window_size: int) -> Tuple[List[str], List[List[int]]]:
55
- """Create windows with better boundary handling"""
56
  windows = []
57
  window_sentence_indices = []
58
-
59
  for i in range(len(sentences)):
60
- # Calculate window boundaries centered on current sentence
61
  half_window = window_size // 2
62
  start_idx = max(0, i - half_window)
63
  end_idx = min(len(sentences), i + half_window + 1)
64
-
65
- # Create the window
 
 
 
 
66
  window = sentences[start_idx:end_idx]
67
  windows.append(" ".join(window))
68
  window_sentence_indices.append(list(range(start_idx, end_idx)))
69
-
70
  return windows, window_sentence_indices
71
 
72
  class TextClassifier:
@@ -163,7 +166,7 @@ class TextClassifier:
163
  }
164
 
165
  def detailed_scan(self, text: str) -> Dict:
166
- """Perform a detailed scan with sentence-level analysis and improved boundary handling."""
167
  if not text.strip():
168
  return {
169
  'sentence_predictions': [],
@@ -175,26 +178,23 @@ class TextClassifier:
175
  'num_sentences': 0
176
  }
177
  }
178
-
179
  sentences = self.processor.split_into_sentences(text)
180
  if not sentences:
181
  return {}
182
-
183
  # Create centered windows for each sentence
184
  windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
185
-
186
  # Track scores for each sentence
187
  sentence_appearances = {i: 0 for i in range(len(sentences))}
188
  sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
189
-
190
- # Increased batch size and process windows more efficiently
191
- batch_size = 32 # Increased from 16 to 32
192
- for i in range(0, len(windows), batch_size):
193
- batch_end = min(i + batch_size, len(windows))
194
- batch_windows = windows[i:batch_end]
195
- batch_indices = window_sentence_indices[i:batch_end]
196
-
197
- # Process batch more efficiently
198
  inputs = self.tokenizer(
199
  batch_windows,
200
  truncation=True,
@@ -202,60 +202,23 @@ class TextClassifier:
202
  max_length=MAX_LENGTH,
203
  return_tensors="pt"
204
  ).to(self.device)
205
-
206
  with torch.no_grad():
207
  outputs = self.model(**inputs)
208
  probs = F.softmax(outputs.logits, dim=-1)
209
-
210
- # Attribute predictions with center-weighted approach
211
  for window_idx, indices in enumerate(batch_indices):
212
- center_idx = len(indices) // 2
213
- center_weight = 0.7 # Higher weight for center sentence
214
- edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
215
-
216
- # Process probabilities once per window
217
- window_human_prob = probs[window_idx][1].item()
218
- window_ai_prob = probs[window_idx][0].item()
219
-
220
- for pos, sent_idx in enumerate(indices):
221
- # Apply higher weight to center sentence
222
- weight = center_weight if pos == center_idx else edge_weight
223
- sentence_appearances[sent_idx] += weight
224
- sentence_scores[sent_idx]['human_prob'] += weight * window_human_prob
225
- sentence_scores[sent_idx]['ai_prob'] += weight * window_ai_prob
226
-
227
- # Clean up GPU memory more aggressively
228
- del inputs, outputs, probs
229
- if torch.cuda.is_available():
230
- torch.cuda.empty_cache()
231
-
232
- # Calculate final predictions with boundary smoothing
233
  sentence_predictions = []
234
  for i in range(len(sentences)):
235
  if sentence_appearances[i] > 0:
236
  human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
237
  ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
238
-
239
- # Apply minimal smoothing at prediction boundaries
240
- if i > 0 and i < len(sentences) - 1:
241
- prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
242
- prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
243
- next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
244
- next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
245
-
246
- # Check if we're at a prediction boundary
247
- current_pred = 'human' if human_prob > ai_prob else 'ai'
248
- prev_pred = 'human' if prev_human > prev_ai else 'ai'
249
- next_pred = 'human' if next_human > next_ai else 'ai'
250
-
251
- if current_pred != prev_pred or current_pred != next_pred:
252
- # Small adjustment at boundaries
253
- smooth_factor = 0.1
254
- human_prob = (human_prob * (1 - smooth_factor) +
255
- (prev_human + next_human) * smooth_factor / 2)
256
- ai_prob = (ai_prob * (1 - smooth_factor) +
257
- (prev_ai + next_ai) * smooth_factor / 2)
258
-
259
  sentence_predictions.append({
260
  'sentence': sentences[i],
261
  'human_prob': human_prob,
@@ -263,13 +226,14 @@ class TextClassifier:
263
  'prediction': 'human' if human_prob > ai_prob else 'ai',
264
  'confidence': max(human_prob, ai_prob)
265
  })
266
-
267
  return {
268
  'sentence_predictions': sentence_predictions,
269
  'highlighted_text': self.format_predictions_html(sentence_predictions),
270
  'full_text': text,
271
  'overall_prediction': self.aggregate_predictions(sentence_predictions)
272
  }
 
273
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
274
  """Format predictions as HTML with color-coding."""
275
  html_parts = []
 
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  import torch.nn.functional as F
5
  import spacy
6
+ from typing import List, Dict
7
  import logging
8
  import os
9
  import gradio as gr
 
51
  windows.append(" ".join(window))
52
  return windows
53
 
54
+ def create_centered_windows(self, sentences: List[str], window_size: int) -> tuple[List[str], List[List[int]]]:
55
+ """Create centered windows for detailed analysis mode."""
56
  windows = []
57
  window_sentence_indices = []
58
+
59
  for i in range(len(sentences)):
 
60
  half_window = window_size // 2
61
  start_idx = max(0, i - half_window)
62
  end_idx = min(len(sentences), i + half_window + 1)
63
+
64
+ if start_idx == 0:
65
+ end_idx = min(len(sentences), window_size)
66
+ elif end_idx == len(sentences):
67
+ start_idx = max(0, len(sentences) - window_size)
68
+
69
  window = sentences[start_idx:end_idx]
70
  windows.append(" ".join(window))
71
  window_sentence_indices.append(list(range(start_idx, end_idx)))
72
+
73
  return windows, window_sentence_indices
74
 
75
  class TextClassifier:
 
166
  }
167
 
168
  def detailed_scan(self, text: str) -> Dict:
169
+ """Perform a detailed scan with sentence-level analysis."""
170
  if not text.strip():
171
  return {
172
  'sentence_predictions': [],
 
178
  'num_sentences': 0
179
  }
180
  }
181
+
182
  sentences = self.processor.split_into_sentences(text)
183
  if not sentences:
184
  return {}
185
+
186
  # Create centered windows for each sentence
187
  windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
188
+
189
  # Track scores for each sentence
190
  sentence_appearances = {i: 0 for i in range(len(sentences))}
191
  sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
192
+
193
+ # Process windows in batches
194
+ for i in range(0, len(windows), BATCH_SIZE):
195
+ batch_windows = windows[i:i + BATCH_SIZE]
196
+ batch_indices = window_sentence_indices[i:i + BATCH_SIZE]
197
+
 
 
 
198
  inputs = self.tokenizer(
199
  batch_windows,
200
  truncation=True,
 
202
  max_length=MAX_LENGTH,
203
  return_tensors="pt"
204
  ).to(self.device)
205
+
206
  with torch.no_grad():
207
  outputs = self.model(**inputs)
208
  probs = F.softmax(outputs.logits, dim=-1)
209
+
 
210
  for window_idx, indices in enumerate(batch_indices):
211
+ for sent_idx in indices:
212
+ sentence_appearances[sent_idx] += 1
213
+ sentence_scores[sent_idx]['human_prob'] += probs[window_idx][1].item()
214
+ sentence_scores[sent_idx]['ai_prob'] += probs[window_idx][0].item()
215
+
216
+ # Average the scores and create final sentence-level predictions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  sentence_predictions = []
218
  for i in range(len(sentences)):
219
  if sentence_appearances[i] > 0:
220
  human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
221
  ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  sentence_predictions.append({
223
  'sentence': sentences[i],
224
  'human_prob': human_prob,
 
226
  'prediction': 'human' if human_prob > ai_prob else 'ai',
227
  'confidence': max(human_prob, ai_prob)
228
  })
229
+
230
  return {
231
  'sentence_predictions': sentence_predictions,
232
  'highlighted_text': self.format_predictions_html(sentence_predictions),
233
  'full_text': text,
234
  'overall_prediction': self.aggregate_predictions(sentence_predictions)
235
  }
236
+
237
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
238
  """Format predictions as HTML with color-coding."""
239
  html_parts = []