ApsidalSolid4 committed on
Commit
6ca0d72
·
verified ·
1 Parent(s): e3d20ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -31
app.py CHANGED
@@ -1,53 +1,67 @@
 
 
 
1
  import torch
2
  import numpy as np
3
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  import torch.nn.functional as F
5
- import spacy
6
  from typing import List, Dict, Tuple
7
  import logging
8
  import os
9
- import gradio as gr
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from concurrent.futures import ThreadPoolExecutor
12
  from functools import partial
13
  import time
14
  from datetime import datetime
15
 
 
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
19
- MAX_LENGTH = 512
20
- MODEL_NAME = "microsoft/deberta-v3-small"
21
- WINDOW_SIZE = 6
22
- WINDOW_OVERLAP = 2
23
- CONFIDENCE_THRESHOLD = 0.65
24
- BATCH_SIZE = 8
25
- MAX_WORKERS = 4
26
-
 
 
 
27
  class TextWindowProcessor:
28
  def __init__(self):
 
29
  try:
30
  self.nlp = spacy.load("en_core_web_sm")
31
  except OSError:
 
32
  logger.info("Downloading spacy model...")
33
  spacy.cli.download("en_core_web_sm")
34
  self.nlp = spacy.load("en_core_web_sm")
35
 
 
36
  if 'sentencizer' not in self.nlp.pipe_names:
37
  self.nlp.add_pipe('sentencizer')
38
 
 
39
  disabled_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != 'sentencizer']
40
  self.nlp.disable_pipes(*disabled_pipes)
41
 
 
42
  self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
43
 
 
44
  def split_into_sentences(self, text: str) -> List[str]:
45
  doc = self.nlp(text)
46
  return [str(sent).strip() for sent in doc.sents]
47
 
 
48
  def create_windows(self, sentences: List[str], window_size: int, overlap: int) -> List[str]:
49
  if len(sentences) < window_size:
50
- return [" ".join(sentences)]
51
 
52
  windows = []
53
  stride = window_size - overlap
@@ -56,6 +70,8 @@ class TextWindowProcessor:
56
  windows.append(" ".join(window))
57
  return windows
58
 
 
 
59
  def create_centered_windows(self, sentences: List[str], window_size: int) -> Tuple[List[str], List[List[int]]]:
60
  windows = []
61
  window_sentence_indices = []
@@ -71,12 +87,16 @@ class TextWindowProcessor:
71
 
72
  return windows, window_sentence_indices
73
 
 
 
74
  class TextClassifier:
75
  def __init__(self):
 
76
  if not torch.cuda.is_available():
77
  torch.set_num_threads(MAX_WORKERS)
78
  torch.set_num_interop_threads(MAX_WORKERS)
79
 
 
80
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
81
  self.model_name = MODEL_NAME
82
  self.tokenizer = None
@@ -84,22 +104,26 @@ class TextClassifier:
84
  self.processor = TextWindowProcessor()
85
  self.initialize_model()
86
 
 
87
  def initialize_model(self):
88
  logger.info("Initializing model and tokenizer...")
89
 
 
90
  from transformers import DebertaV2TokenizerFast
91
 
92
  self.tokenizer = DebertaV2TokenizerFast.from_pretrained(
93
  self.model_name,
94
  model_max_length=MAX_LENGTH,
95
- use_fast=True
96
  )
97
 
 
98
  self.model = AutoModelForSequenceClassification.from_pretrained(
99
  self.model_name,
100
  num_labels=2
101
  ).to(self.device)
102
 
 
103
  model_path = "model_20250209_184929_acc1.0000.pt"
104
  if os.path.exists(model_path):
105
  logger.info(f"Loading custom model from {model_path}")
@@ -108,8 +132,11 @@ class TextClassifier:
108
  else:
109
  logger.warning("Custom model file not found. Using base model.")
110
 
 
111
  self.model.eval()
112
 
 
 
113
  def quick_scan(self, text: str) -> Dict:
114
  if not text.strip():
115
  return {
@@ -118,14 +145,17 @@ class TextClassifier:
118
  'num_windows': 0
119
  }
120
 
 
121
  sentences = self.processor.split_into_sentences(text)
122
  windows = self.processor.create_windows(sentences, WINDOW_SIZE, WINDOW_OVERLAP)
123
 
124
  predictions = []
125
 
 
126
  for i in range(0, len(windows), BATCH_SIZE):
127
  batch_windows = windows[i:i + BATCH_SIZE]
128
 
 
129
  inputs = self.tokenizer(
130
  batch_windows,
131
  truncation=True,
@@ -134,10 +164,12 @@ class TextClassifier:
134
  return_tensors="pt"
135
  ).to(self.device)
136
 
 
137
  with torch.no_grad():
138
  outputs = self.model(**inputs)
139
  probs = F.softmax(outputs.logits, dim=-1)
140
 
 
141
  for idx, window in enumerate(batch_windows):
142
  prediction = {
143
  'window': window,
@@ -147,6 +179,7 @@ class TextClassifier:
147
  }
148
  predictions.append(prediction)
149
 
 
150
  del inputs, outputs, probs
151
  if torch.cuda.is_available():
152
  torch.cuda.empty_cache()
@@ -158,6 +191,7 @@ class TextClassifier:
158
  'num_windows': 0
159
  }
160
 
 
161
  avg_human_prob = sum(p['human_prob'] for p in predictions) / len(predictions)
162
  avg_ai_prob = sum(p['ai_prob'] for p in predictions) / len(predictions)
163
 
@@ -167,6 +201,8 @@ class TextClassifier:
167
  'num_windows': len(predictions)
168
  }
169
 
 
 
170
  def detailed_scan(self, text: str) -> Dict:
171
  text = text.rstrip()
172
 
@@ -182,19 +218,24 @@ class TextClassifier:
182
  }
183
  }
184
 
 
185
  sentences = self.processor.split_into_sentences(text)
186
  if not sentences:
187
  return {}
188
 
 
189
  windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
190
 
 
191
  sentence_appearances = {i: 0 for i in range(len(sentences))}
192
  sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
193
 
 
194
  for i in range(0, len(windows), BATCH_SIZE):
195
  batch_windows = windows[i:i + BATCH_SIZE]
196
  batch_indices = window_sentence_indices[i:i + BATCH_SIZE]
197
 
 
198
  inputs = self.tokenizer(
199
  batch_windows,
200
  truncation=True,
@@ -203,31 +244,37 @@ class TextClassifier:
203
  return_tensors="pt"
204
  ).to(self.device)
205
 
 
206
  with torch.no_grad():
207
  outputs = self.model(**inputs)
208
  probs = F.softmax(outputs.logits, dim=-1)
209
 
 
210
  for window_idx, indices in enumerate(batch_indices):
211
  center_idx = len(indices) // 2
212
- center_weight = 0.7
213
- edge_weight = 0.3 / (len(indices) - 1)
214
 
 
215
  for pos, sent_idx in enumerate(indices):
216
  weight = center_weight if pos == center_idx else edge_weight
217
  sentence_appearances[sent_idx] += weight
218
  sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
219
  sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
220
 
 
221
  del inputs, outputs, probs
222
  if torch.cuda.is_available():
223
  torch.cuda.empty_cache()
224
 
 
225
  sentence_predictions = []
226
  for i in range(len(sentences)):
227
  if sentence_appearances[i] > 0:
228
  human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
229
  ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
230
 
 
231
  if i > 0 and i < len(sentences) - 1:
232
  prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
233
  prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
@@ -238,8 +285,9 @@ class TextClassifier:
238
  prev_pred = 'human' if prev_human > prev_ai else 'ai'
239
  next_pred = 'human' if next_human > next_ai else 'ai'
240
 
 
241
  if current_pred != prev_pred or current_pred != next_pred:
242
- smooth_factor = 0.1
243
  human_prob = (human_prob * (1 - smooth_factor) +
244
  (prev_human + next_human) * smooth_factor / 2)
245
  ai_prob = (ai_prob * (1 - smooth_factor) +
@@ -253,6 +301,7 @@ class TextClassifier:
253
  'confidence': max(human_prob, ai_prob)
254
  })
255
 
 
256
  return {
257
  'sentence_predictions': sentence_predictions,
258
  'highlighted_text': self.format_predictions_html(sentence_predictions),
@@ -260,6 +309,7 @@ class TextClassifier:
260
  'overall_prediction': self.aggregate_predictions(sentence_predictions)
261
  }
262
 
 
263
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
264
  html_parts = []
265
 
@@ -267,21 +317,23 @@ class TextClassifier:
267
  sentence = pred['sentence']
268
  confidence = pred['confidence']
269
 
 
270
  if confidence >= CONFIDENCE_THRESHOLD:
271
  if pred['prediction'] == 'human':
272
- color = "#90EE90"
273
  else:
274
- color = "#FFB6C6"
275
  else:
276
  if pred['prediction'] == 'human':
277
- color = "#E8F5E9"
278
  else:
279
- color = "#FFEBEE"
280
 
281
  html_parts.append(f'<span style="background-color: {color};">{sentence}</span>')
282
 
283
  return " ".join(html_parts)
284
 
 
285
  def aggregate_predictions(self, predictions: List[Dict]) -> Dict:
286
  if not predictions:
287
  return {
@@ -290,6 +342,7 @@ class TextClassifier:
290
  'num_sentences': 0
291
  }
292
 
 
293
  total_human_prob = sum(p['human_prob'] for p in predictions)
294
  total_ai_prob = sum(p['ai_prob'] for p in predictions)
295
  num_sentences = len(predictions)
@@ -303,16 +356,20 @@ class TextClassifier:
303
  'num_sentences': num_sentences
304
  }
305
 
 
 
306
  def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
307
  start_time = time.time()
308
 
309
  word_count = len(text.split())
310
 
 
311
  original_mode = mode
312
  if word_count < 200 and mode == "detailed":
313
  mode = "quick"
314
 
315
  if mode == "quick":
 
316
  result = classifier.quick_scan(text)
317
 
318
  quick_analysis = f"""
@@ -321,19 +378,22 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
321
  Windows analyzed: {result['num_windows']}
322
  """
323
 
 
324
  if original_mode == "detailed":
325
  quick_analysis += f"\n\nNote: Switched to quick mode because text contains only {word_count} words. Minimum 200 words required for detailed analysis."
326
 
327
  execution_time = (time.time() - start_time) * 1000
328
 
329
  return (
330
- text,
331
  "Quick scan mode - no sentence-level analysis available",
332
  quick_analysis
333
  )
334
  else:
 
335
  analysis = classifier.detailed_scan(text)
336
 
 
337
  detailed_analysis = []
338
  for pred in analysis['sentence_predictions']:
339
  confidence = pred['confidence'] * 100
@@ -342,6 +402,7 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
342
  detailed_analysis.append(f"Confidence: {confidence:.1f}%")
343
  detailed_analysis.append("-" * 50)
344
 
 
345
  final_pred = analysis['overall_prediction']
346
  overall_result = f"""
347
  FINAL PREDICTION: {final_pred['prediction'].upper()}
@@ -352,13 +413,15 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
352
  execution_time = (time.time() - start_time) * 1000
353
 
354
  return (
355
- analysis['highlighted_text'],
356
- "\n".join(detailed_analysis),
357
- overall_result
358
  )
359
 
 
360
  classifier = TextClassifier()
361
 
 
362
  demo = gr.Interface(
363
  fn=lambda text, mode: analyze_text(text, mode, classifier),
364
  inputs=[
@@ -375,9 +438,9 @@ demo = gr.Interface(
375
  )
376
  ],
377
  outputs=[
378
- gr.HTML(label="Highlighted Analysis"),
379
- gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10),
380
- gr.Textbox(label="Overall Result", lines=4)
381
  ],
382
  title="AI Text Detector",
383
  description="Analyze text to detect if it was written by a human or AI. Choose between quick scan and detailed sentence-level analysis. 200+ words suggested for accurate predictions.",
@@ -385,8 +448,10 @@ demo = gr.Interface(
385
  flagging_mode="never"
386
  )
387
 
 
388
  app = demo.app
389
 
 
390
  app.add_middleware(
391
  CORSMiddleware,
392
  allow_origins=["*"],
@@ -395,10 +460,11 @@ app.add_middleware(
395
  allow_headers=["*"],
396
  )
397
 
 
398
  if __name__ == "__main__":
399
- demo.queue()
400
  demo.launch(
401
- server_name="0.0.0.0",
402
- server_port=7860,
403
- share=True
404
  )
 
1
+ # AI Text Detector Code Analysis
2
+
3
+ # IMPORTS AND CONFIGURATION
4
  import torch
5
  import numpy as np
6
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification # HuggingFace transformers for NLP models
7
  import torch.nn.functional as F
8
+ import spacy # Used for sentence splitting
9
  from typing import List, Dict, Tuple
10
  import logging
11
  import os
12
+ import gradio as gr # Used for creating the web UI
13
  from fastapi.middleware.cors import CORSMiddleware
14
  from concurrent.futures import ThreadPoolExecutor
15
  from functools import partial
16
  import time
17
  from datetime import datetime
18
 
19
+ # Basic logging setup
20
  logging.basicConfig(level=logging.INFO)
21
  logger = logging.getLogger(__name__)
22
 
23
+ # GLOBAL PARAMETERS
24
+ MAX_LENGTH = 512 # Maximum token length for the model input
25
+ MODEL_NAME = "microsoft/deberta-v3-small" # Using Microsoft's DeBERTa v3 small model as the base
26
+ WINDOW_SIZE = 6 # Number of sentences in each analysis window
27
+ WINDOW_OVERLAP = 2 # Number of sentences that overlap between adjacent windows
28
+ CONFIDENCE_THRESHOLD = 0.65 # Threshold for highlighting predictions with stronger colors
29
+ BATCH_SIZE = 8 # Number of windows to process in a single batch for efficiency
30
+ MAX_WORKERS = 4 # Maximum number of worker threads for parallel processing
31
+
32
+ # TEXT WINDOW PROCESSOR
33
+ # This class handles sentence splitting and window creation for text analysis
34
  class TextWindowProcessor:
35
  def __init__(self):
36
+ # Initialize SpaCy with minimal pipeline for sentence splitting
37
  try:
38
  self.nlp = spacy.load("en_core_web_sm")
39
  except OSError:
40
+ # Auto-download SpaCy model if not available
41
  logger.info("Downloading spacy model...")
42
  spacy.cli.download("en_core_web_sm")
43
  self.nlp = spacy.load("en_core_web_sm")
44
 
45
+ # Add sentencizer if not already present
46
  if 'sentencizer' not in self.nlp.pipe_names:
47
  self.nlp.add_pipe('sentencizer')
48
 
49
+ # Disable unnecessary components for better performance
50
  disabled_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != 'sentencizer']
51
  self.nlp.disable_pipes(*disabled_pipes)
52
 
53
+ # Setup ThreadPoolExecutor for parallel processing
54
  self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
55
 
56
+ # Split text into individual sentences using SpaCy
57
  def split_into_sentences(self, text: str) -> List[str]:
58
  doc = self.nlp(text)
59
  return [str(sent).strip() for sent in doc.sents]
60
 
61
+ # Create overlapping windows of fixed size (for quick scan)
62
  def create_windows(self, sentences: List[str], window_size: int, overlap: int) -> List[str]:
63
  if len(sentences) < window_size:
64
+ return [" ".join(sentences)] # Return single window if not enough sentences
65
 
66
  windows = []
67
  stride = window_size - overlap
 
70
  windows.append(" ".join(window))
71
  return windows
72
 
73
+ # Create windows centered around each sentence (for detailed scan)
74
+ # This provides better analysis of individual sentences with proper context
75
  def create_centered_windows(self, sentences: List[str], window_size: int) -> Tuple[List[str], List[List[int]]]:
76
  windows = []
77
  window_sentence_indices = []
 
87
 
88
  return windows, window_sentence_indices
89
 
90
+ # TEXT CLASSIFIER
91
+ # This class handles the actual AI/Human classification using a pre-trained model
92
  class TextClassifier:
93
  def __init__(self):
94
+ # Configure CPU threading if CUDA not available
95
  if not torch.cuda.is_available():
96
  torch.set_num_threads(MAX_WORKERS)
97
  torch.set_num_interop_threads(MAX_WORKERS)
98
 
99
+ # Set device (GPU if available, otherwise CPU)
100
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
101
  self.model_name = MODEL_NAME
102
  self.tokenizer = None
 
104
  self.processor = TextWindowProcessor()
105
  self.initialize_model()
106
 
107
+ # Initialize the model and tokenizer
108
  def initialize_model(self):
109
  logger.info("Initializing model and tokenizer...")
110
 
111
+ # Using DeBERTa tokenizer specifically for better compatibility
112
  from transformers import DebertaV2TokenizerFast
113
 
114
  self.tokenizer = DebertaV2TokenizerFast.from_pretrained(
115
  self.model_name,
116
  model_max_length=MAX_LENGTH,
117
+ use_fast=True # Use fast tokenizer for better performance
118
  )
119
 
120
+ # Load classification model with 2 labels (AI and Human)
121
  self.model = AutoModelForSequenceClassification.from_pretrained(
122
  self.model_name,
123
  num_labels=2
124
  ).to(self.device)
125
 
126
+ # Try to load custom fine-tuned model weights if available
127
  model_path = "model_20250209_184929_acc1.0000.pt"
128
  if os.path.exists(model_path):
129
  logger.info(f"Loading custom model from {model_path}")
 
132
  else:
133
  logger.warning("Custom model file not found. Using base model.")
134
 
135
+ # Set model to evaluation mode
136
  self.model.eval()
137
 
138
+ # Quick scan analysis - faster but less detailed
139
+ # Uses fixed-size windows with overlap
140
  def quick_scan(self, text: str) -> Dict:
141
  if not text.strip():
142
  return {
 
145
  'num_windows': 0
146
  }
147
 
148
+ # Split text into sentences and then into windows
149
  sentences = self.processor.split_into_sentences(text)
150
  windows = self.processor.create_windows(sentences, WINDOW_SIZE, WINDOW_OVERLAP)
151
 
152
  predictions = []
153
 
154
+ # Process windows in batches for efficiency
155
  for i in range(0, len(windows), BATCH_SIZE):
156
  batch_windows = windows[i:i + BATCH_SIZE]
157
 
158
+ # Tokenize and prepare input for the model
159
  inputs = self.tokenizer(
160
  batch_windows,
161
  truncation=True,
 
164
  return_tensors="pt"
165
  ).to(self.device)
166
 
167
+ # Run inference with no gradient calculation
168
  with torch.no_grad():
169
  outputs = self.model(**inputs)
170
  probs = F.softmax(outputs.logits, dim=-1)
171
 
172
+ # Process predictions for each window
173
  for idx, window in enumerate(batch_windows):
174
  prediction = {
175
  'window': window,
 
179
  }
180
  predictions.append(prediction)
181
 
182
+ # Clean up to free memory
183
  del inputs, outputs, probs
184
  if torch.cuda.is_available():
185
  torch.cuda.empty_cache()
 
191
  'num_windows': 0
192
  }
193
 
194
+ # Average probabilities across all windows for final prediction
195
  avg_human_prob = sum(p['human_prob'] for p in predictions) / len(predictions)
196
  avg_ai_prob = sum(p['ai_prob'] for p in predictions) / len(predictions)
197
 
 
201
  'num_windows': len(predictions)
202
  }
203
 
204
+ # Detailed scan analysis - slower but provides sentence-level insights
205
+ # Uses windows centered around each sentence for more precise analysis
206
  def detailed_scan(self, text: str) -> Dict:
207
  text = text.rstrip()
208
 
 
218
  }
219
  }
220
 
221
+ # Split text into sentences
222
  sentences = self.processor.split_into_sentences(text)
223
  if not sentences:
224
  return {}
225
 
226
+ # Create a window centered on each sentence
227
  windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
228
 
229
+ # Track appearances and scores for each sentence
230
  sentence_appearances = {i: 0 for i in range(len(sentences))}
231
  sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
232
 
233
+ # Process windows in batches
234
  for i in range(0, len(windows), BATCH_SIZE):
235
  batch_windows = windows[i:i + BATCH_SIZE]
236
  batch_indices = window_sentence_indices[i:i + BATCH_SIZE]
237
 
238
+ # Tokenize and prepare input
239
  inputs = self.tokenizer(
240
  batch_windows,
241
  truncation=True,
 
244
  return_tensors="pt"
245
  ).to(self.device)
246
 
247
+ # Run inference
248
  with torch.no_grad():
249
  outputs = self.model(**inputs)
250
  probs = F.softmax(outputs.logits, dim=-1)
251
 
252
+ # Process each window's predictions
253
  for window_idx, indices in enumerate(batch_indices):
254
  center_idx = len(indices) // 2
255
+ center_weight = 0.7 # Center sentence gets 70% weight
256
+ edge_weight = 0.3 / (len(indices) - 1) # Other sentences share 30%
257
 
258
+ # Apply weighted prediction to each sentence in window
259
  for pos, sent_idx in enumerate(indices):
260
  weight = center_weight if pos == center_idx else edge_weight
261
  sentence_appearances[sent_idx] += weight
262
  sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
263
  sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
264
 
265
+ # Clean up memory
266
  del inputs, outputs, probs
267
  if torch.cuda.is_available():
268
  torch.cuda.empty_cache()
269
 
270
+ # Calculate final predictions for each sentence with smoothing between adjacent sentences
271
  sentence_predictions = []
272
  for i in range(len(sentences)):
273
  if sentence_appearances[i] > 0:
274
  human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
275
  ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
276
 
277
+ # Apply smoothing for sentences not at boundaries
278
  if i > 0 and i < len(sentences) - 1:
279
  prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
280
  prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
 
285
  prev_pred = 'human' if prev_human > prev_ai else 'ai'
286
  next_pred = 'human' if next_human > next_ai else 'ai'
287
 
288
+ # Only smooth if current sentence prediction differs from neighbors
289
  if current_pred != prev_pred or current_pred != next_pred:
290
+ smooth_factor = 0.1 # 10% smoothing factor
291
  human_prob = (human_prob * (1 - smooth_factor) +
292
  (prev_human + next_human) * smooth_factor / 2)
293
  ai_prob = (ai_prob * (1 - smooth_factor) +
 
301
  'confidence': max(human_prob, ai_prob)
302
  })
303
 
304
+ # Return detailed results
305
  return {
306
  'sentence_predictions': sentence_predictions,
307
  'highlighted_text': self.format_predictions_html(sentence_predictions),
 
309
  'overall_prediction': self.aggregate_predictions(sentence_predictions)
310
  }
311
 
312
+ # Format predictions with color highlighting for visual assessment
313
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
314
  html_parts = []
315
 
 
317
  sentence = pred['sentence']
318
  confidence = pred['confidence']
319
 
320
+ # Color coding: stronger colors for high confidence, lighter for low confidence
321
  if confidence >= CONFIDENCE_THRESHOLD:
322
  if pred['prediction'] == 'human':
323
+ color = "#90EE90" # Green for human (high confidence)
324
  else:
325
+ color = "#FFB6C6" # Pink for AI (high confidence)
326
  else:
327
  if pred['prediction'] == 'human':
328
+ color = "#E8F5E9" # Light green for human (low confidence)
329
  else:
330
+ color = "#FFEBEE" # Light pink for AI (low confidence)
331
 
332
  html_parts.append(f'<span style="background-color: {color};">{sentence}</span>')
333
 
334
  return " ".join(html_parts)
335
 
336
+ # Aggregate individual sentence predictions into an overall result
337
  def aggregate_predictions(self, predictions: List[Dict]) -> Dict:
338
  if not predictions:
339
  return {
 
342
  'num_sentences': 0
343
  }
344
 
345
+ # Calculate average probabilities across all sentences
346
  total_human_prob = sum(p['human_prob'] for p in predictions)
347
  total_ai_prob = sum(p['ai_prob'] for p in predictions)
348
  num_sentences = len(predictions)
 
356
  'num_sentences': num_sentences
357
  }
358
 
359
+ # MAIN ANALYSIS FUNCTION
360
+ # Brings everything together to analyze text based on selected mode
361
  def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
362
  start_time = time.time()
363
 
364
  word_count = len(text.split())
365
 
366
+ # Auto-switch to quick mode for short texts
367
  original_mode = mode
368
  if word_count < 200 and mode == "detailed":
369
  mode = "quick"
370
 
371
  if mode == "quick":
372
+ # Perform quick analysis
373
  result = classifier.quick_scan(text)
374
 
375
  quick_analysis = f"""
 
378
  Windows analyzed: {result['num_windows']}
379
  """
380
 
381
+ # Notify if automatically switched from detailed to quick mode
382
  if original_mode == "detailed":
383
  quick_analysis += f"\n\nNote: Switched to quick mode because text contains only {word_count} words. Minimum 200 words required for detailed analysis."
384
 
385
  execution_time = (time.time() - start_time) * 1000
386
 
387
  return (
388
+ text, # Original text (no highlighting)
389
  "Quick scan mode - no sentence-level analysis available",
390
  quick_analysis
391
  )
392
  else:
393
+ # Perform detailed analysis
394
  analysis = classifier.detailed_scan(text)
395
 
396
+ # Format sentence-by-sentence analysis text
397
  detailed_analysis = []
398
  for pred in analysis['sentence_predictions']:
399
  confidence = pred['confidence'] * 100
 
402
  detailed_analysis.append(f"Confidence: {confidence:.1f}%")
403
  detailed_analysis.append("-" * 50)
404
 
405
+ # Format overall result summary
406
  final_pred = analysis['overall_prediction']
407
  overall_result = f"""
408
  FINAL PREDICTION: {final_pred['prediction'].upper()}
 
413
  execution_time = (time.time() - start_time) * 1000
414
 
415
  return (
416
+ analysis['highlighted_text'], # HTML-highlighted text
417
+ "\n".join(detailed_analysis), # Detailed sentence analysis
418
+ overall_result # Overall summary
419
  )
420
 
421
+ # Initialize the classifier
422
  classifier = TextClassifier()
423
 
424
+ # GRADIO USER INTERFACE
425
  demo = gr.Interface(
426
  fn=lambda text, mode: analyze_text(text, mode, classifier),
427
  inputs=[
 
438
  )
439
  ],
440
  outputs=[
441
+ gr.HTML(label="Highlighted Analysis"), # Shows color-coded result
442
+ gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10), # Detailed breakdown
443
+ gr.Textbox(label="Overall Result", lines=4) # Summary results
444
  ],
445
  title="AI Text Detector",
446
  description="Analyze text to detect if it was written by a human or AI. Choose between quick scan and detailed sentence-level analysis. 200+ words suggested for accurate predictions.",
 
448
  flagging_mode="never"
449
  )
450
 
451
+ # FastAPI configuration
452
  app = demo.app
453
 
454
+ # Add CORS middleware to allow cross-origin requests
455
  app.add_middleware(
456
  CORSMiddleware,
457
  allow_origins=["*"],
 
460
  allow_headers=["*"],
461
  )
462
 
463
+ # Start the server when run directly
464
  if __name__ == "__main__":
465
+ demo.queue() # Enable request queuing
466
  demo.launch(
467
+ server_name="0.0.0.0", # Listen on all interfaces
468
+ server_port=7860, # Default Gradio port
469
+ share=True # Generate public URL
470
  )