ApsidalSolid4 committed on
Commit
a167831
·
verified ·
1 Parent(s): aec2abb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +211 -103
app.py CHANGED
@@ -10,6 +10,12 @@ import gradio as gr
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from concurrent.futures import ThreadPoolExecutor
12
  from functools import partial
 
 
 
 
 
 
13
 
14
  # Configure logging
15
  logging.basicConfig(level=logging.INFO)
@@ -24,6 +30,153 @@ CONFIDENCE_THRESHOLD = 0.65
24
  BATCH_SIZE = 8 # Reduced batch size for CPU
25
  MAX_WORKERS = 4 # Number of worker threads for processing
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  class TextWindowProcessor:
28
  def __init__(self):
29
  try:
@@ -176,100 +329,6 @@ class TextClassifier:
176
  'num_windows': len(predictions)
177
  }
178
 
179
- # def detailed_scan(self, text: str) -> Dict:
180
- # """Original prediction method with modified window handling"""
181
- # if self.model is None or self.tokenizer is None:
182
- # self.load_model()
183
-
184
- # self.model.eval()
185
- # sentences = self.processor.split_into_sentences(text)
186
- # if not sentences:
187
- # return {}
188
-
189
- # # Create centered windows for each sentence
190
- # windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
191
-
192
- # # Track scores for each sentence
193
- # sentence_appearances = {i: 0 for i in range(len(sentences))}
194
- # sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
195
-
196
- # # Process windows in batches
197
- # batch_size = 16
198
- # for i in range(0, len(windows), batch_size):
199
- # batch_windows = windows[i:i + batch_size]
200
- # batch_indices = window_sentence_indices[i:i + batch_size]
201
-
202
- # inputs = self.tokenizer(
203
- # batch_windows,
204
- # truncation=True,
205
- # padding=True,
206
- # max_length=MAX_LENGTH,
207
- # return_tensors="pt"
208
- # ).to(self.device)
209
-
210
- # with torch.no_grad():
211
- # outputs = self.model(**inputs)
212
- # probs = F.softmax(outputs.logits, dim=-1)
213
-
214
- # # Attribute predictions more carefully
215
- # for window_idx, indices in enumerate(batch_indices):
216
- # center_idx = len(indices) // 2
217
- # center_weight = 0.7 # Higher weight for center sentence
218
- # edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
219
-
220
- # for pos, sent_idx in enumerate(indices):
221
- # # Apply higher weight to center sentence
222
- # weight = center_weight if pos == center_idx else edge_weight
223
- # sentence_appearances[sent_idx] += weight
224
- # sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
225
- # sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
226
-
227
- # del inputs, outputs, probs
228
- # if torch.cuda.is_available():
229
- # torch.cuda.empty_cache()
230
-
231
- # # Calculate final predictions
232
- # sentence_predictions = []
233
- # for i in range(len(sentences)):
234
- # if sentence_appearances[i] > 0:
235
- # human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
236
- # ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
237
-
238
- # # Only apply minimal smoothing at prediction boundaries
239
- # if i > 0 and i < len(sentences) - 1:
240
- # prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
241
- # prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
242
- # next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
243
- # next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
244
-
245
- # # Check if we're at a prediction boundary
246
- # current_pred = 'human' if human_prob > ai_prob else 'ai'
247
- # prev_pred = 'human' if prev_human > prev_ai else 'ai'
248
- # next_pred = 'human' if next_human > next_ai else 'ai'
249
-
250
- # if current_pred != prev_pred or current_pred != next_pred:
251
- # # Small adjustment at boundaries
252
- # smooth_factor = 0.1
253
- # human_prob = (human_prob * (1 - smooth_factor) +
254
- # (prev_human + next_human) * smooth_factor / 2)
255
- # ai_prob = (ai_prob * (1 - smooth_factor) +
256
- # (prev_ai + next_ai) * smooth_factor / 2)
257
-
258
- # sentence_predictions.append({
259
- # 'sentence': sentences[i],
260
- # 'human_prob': human_prob,
261
- # 'ai_prob': ai_prob,
262
- # 'prediction': 'human' if human_prob > ai_prob else 'ai',
263
- # 'confidence': max(human_prob, ai_prob)
264
- # })
265
-
266
- # return {
267
- # 'sentence_predictions': sentence_predictions,
268
- # 'highlighted_text': self.format_predictions_html(sentence_predictions),
269
- # 'full_text': text,
270
- # 'overall_prediction': self.aggregate_predictions(sentence_predictions)
271
- # }
272
-
273
  def detailed_scan(self, text: str) -> Dict:
274
  """Perform a detailed scan with improved sentence-level analysis."""
275
  # Clean up trailing whitespace
@@ -420,8 +479,14 @@ class TextClassifier:
420
  'num_sentences': num_sentences
421
  }
422
 
 
 
 
423
  def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
424
  """Analyze text using specified mode and return formatted results."""
 
 
 
425
  # Count words in the text
426
  word_count = len(text.split())
427
 
@@ -432,31 +497,55 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
432
 
433
  if mode == "quick":
434
  result = classifier.quick_scan(text)
 
 
 
435
 
436
  quick_analysis = f"""
437
- PREDICTION: {result['prediction'].upper()}
438
- Confidence: {result['confidence']*100:.1f}%
439
- Windows analyzed: {result['num_windows']}
440
  """
441
 
442
  # Add note if mode was switched
443
  if original_mode == "detailed":
444
  quick_analysis += f"\n\nNote: Switched to quick mode because text contains only {word_count} words. Minimum 200 words required for detailed analysis."
445
 
446
- return (
447
  text, # No highlighting in quick mode
448
  "Quick scan mode - no sentence-level analysis available",
449
  quick_analysis
450
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  else:
452
  analysis = classifier.detailed_scan(text)
 
 
 
453
 
454
  detailed_analysis = []
455
  for pred in analysis['sentence_predictions']:
456
- confidence = pred['confidence'] * 100
457
  detailed_analysis.append(f"Sentence: {pred['sentence']}")
458
  detailed_analysis.append(f"Prediction: {pred['prediction'].upper()}")
459
- detailed_analysis.append(f"Confidence: {confidence:.1f}%")
460
  detailed_analysis.append("-" * 50)
461
 
462
  final_pred = analysis['overall_prediction']
@@ -466,16 +555,35 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
466
  Number of sentences analyzed: {final_pred['num_sentences']}
467
  """
468
 
469
- return (
470
  analysis['highlighted_text'],
471
  "\n".join(detailed_analysis),
472
  overall_result
473
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
 
475
  # Initialize the classifier globally
476
  classifier = TextClassifier()
477
 
478
- # Create Gradio interface
479
  demo = gr.Interface(
480
  fn=lambda text, mode: analyze_text(text, mode, classifier),
481
  inputs=[
@@ -497,7 +605,7 @@ demo = gr.Interface(
497
  gr.Textbox(label="Overall Result", lines=4)
498
  ],
499
  title="AI Text Detector",
500
- description="Analyze text to detect if it was written by a human or AI. Choose between quick scan and detailed sentence-level analysis. 200+ words suggested for accurate predictions.",
501
  api_name="predict",
502
  flagging_mode="never"
503
  )
 
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from concurrent.futures import ThreadPoolExecutor
12
  from functools import partial
13
+ import time
14
+ import pandas as pd
15
+ from datetime import datetime
16
+ import threading
17
+ import random
18
+ from openpyxl import load_workbook
19
 
20
  # Configure logging
21
  logging.basicConfig(level=logging.INFO)
 
30
  BATCH_SIZE = 8 # Reduced batch size for CPU
31
  MAX_WORKERS = 4 # Number of worker threads for processing
32
 
33
class ExcelLogger:
    """Append prediction records to an Excel workbook from background threads.

    The workbook holds two sheets: ``Metrics`` (one row of numbers/labels per
    prediction) and ``TextData`` (the analysed text, keyed by ``entry_id``).
    Every write re-reads the sheets with pandas, appends one row and writes
    the sheets back, all while holding ``file_lock``. If the workbook cannot
    be written after several attempts, the record is appended to dated CSV
    backup files in ``log_dir`` instead.
    """

    def __init__(self, log_dir="logs", excel_file=None):
        """Initialize the Excel logger.

        Args:
            log_dir: Directory to store log files (created if missing).
            excel_file: Specific Excel file name (defaults to
                predictions_YYYY-MM.xlsx, using the month at construction
                time).
        """
        self.log_dir = log_dir
        os.makedirs(log_dir, exist_ok=True)

        # Use monthly Excel files by default
        if excel_file is None:
            current_month = datetime.now().strftime('%Y-%m')
            excel_file = f"predictions_{current_month}.xlsx"

        self.excel_path = os.path.join(log_dir, excel_file)

        # Create the lock before touching the file so that every later
        # file access can rely on it being present.
        self.file_lock = threading.Lock()

        # Create excel file with headers if it doesn't exist
        if not os.path.exists(self.excel_path):
            self._create_excel_file()

    def _create_excel_file(self):
        """Create a new Excel file with appropriate sheets and headers."""
        # 'entry_id' is part of every logged metrics row (see log_prediction),
        # so it belongs in the header of a freshly created sheet as well.
        metrics_df = pd.DataFrame(columns=[
            'timestamp', 'word_count', 'mode', 'prediction',
            'confidence', 'prediction_time_seconds', 'num_sentences',
            'entry_id'
        ])

        text_df = pd.DataFrame(columns=[
            'entry_id', 'timestamp', 'text'
        ])

        with pd.ExcelWriter(self.excel_path, engine='openpyxl') as writer:
            metrics_df.to_excel(writer, sheet_name='Metrics', index=False)
            text_df.to_excel(writer, sheet_name='TextData', index=False)

        logger.info(f"Created new Excel log file: {self.excel_path}")

    @staticmethod
    def _split_record(prediction_data, store_text):
        """Split a raw record into (metrics dict, text) without mutating it.

        The ``text`` key is always removed from the metrics so the full text
        never ends up in the Metrics sheet; the text itself is returned only
        when ``store_text`` is true. A missing ``timestamp`` is filled in with
        the current time on the copy.
        """
        metrics = dict(prediction_data)
        text = metrics.pop('text', None)
        if not store_text:
            text = None
        if 'timestamp' not in metrics:
            metrics['timestamp'] = datetime.now().isoformat()
        return metrics, text

    def log_prediction(self, prediction_data, store_text=True):
        """Log prediction data to the Excel file.

        The write happens on a daemon thread, so this call returns without
        waiting for the file to be updated. ``prediction_data`` is copied and
        left unchanged.

        Args:
            prediction_data: Dictionary containing prediction metrics and,
                optionally, the analysed text under the key ``'text'``.
            store_text: Whether to store the full text.
        """
        # Generate an entry ID (second-resolution timestamp + random suffix)
        entry_id = f"{datetime.now().strftime('%Y%m%d%H%M%S')}_{random.randint(1000, 9999)}"

        metrics_data, text = self._split_record(prediction_data, store_text)
        metrics_data['entry_id'] = entry_id

        # Start a thread to write data to Excel
        thread = threading.Thread(
            target=self._write_to_excel,
            args=(metrics_data, text, entry_id, store_text),
            daemon=True,
        )
        thread.start()

    def _write_to_excel(self, metrics_data, text, entry_id, store_text):
        """Write one record to the workbook, retrying and then falling back.

        Args:
            metrics_data: Row for the Metrics sheet (must contain 'timestamp').
            text: Text for the TextData sheet, or None.
            entry_id: Identifier linking the two sheets.
            store_text: Whether the text should be stored at all.
        """
        max_retries = 5
        retry_delay = 0.5
        keep_text = bool(store_text and text)

        for attempt in range(max_retries):
            try:
                with self.file_lock:
                    # Recreate the workbook if it disappeared since start-up;
                    # otherwise every attempt would fail on the read below.
                    if not os.path.exists(self.excel_path):
                        self._create_excel_file()

                    # Load existing data and append the new metrics row
                    metrics_df = pd.read_excel(self.excel_path, sheet_name='Metrics')
                    metrics_df = pd.concat(
                        [metrics_df, pd.DataFrame([metrics_data])],
                        ignore_index=True,
                    )

                    text_df = None
                    if keep_text:
                        new_text = pd.DataFrame([{
                            'entry_id': entry_id,
                            'timestamp': metrics_data['timestamp'],
                            'text': text
                        }])
                        try:
                            existing_text = pd.read_excel(self.excel_path, sheet_name='TextData')
                        except (KeyError, ValueError):
                            # Only a missing TextData sheet starts a fresh one.
                            # Any other read error is left to the retry logic,
                            # because replacing the sheet would drop its rows.
                            text_df = new_text
                        else:
                            text_df = pd.concat([existing_text, new_text], ignore_index=True)

                    # Write back to Excel, replacing only the sheets we touched
                    with pd.ExcelWriter(self.excel_path, engine='openpyxl', mode='a',
                                        if_sheet_exists='replace') as writer:
                        metrics_df.to_excel(writer, sheet_name='Metrics', index=False)
                        if text_df is not None:
                            text_df.to_excel(writer, sheet_name='TextData', index=False)

                # Successfully wrote to file
                return

            except Exception as e:
                logger.warning(f"Error writing to Excel (attempt {attempt+1}/{max_retries}): {e}")
                # Progressive backoff; no point waiting after the last attempt
                if attempt < max_retries - 1:
                    time.sleep(retry_delay * (attempt + 1))

        # All retries failed: log to backup file
        logger.error(f"Failed to write to Excel after {max_retries} attempts, logging to backup file")
        try:
            # Hold the lock so concurrent fallbacks do not race on the
            # "does the CSV exist yet" header check.
            with self.file_lock:
                self._write_to_backup(metrics_data, text, entry_id, store_text)
        except Exception:
            # This runs on a background thread; report instead of dying silently.
            logger.exception("Failed to write prediction to backup CSV")

    def _write_to_backup(self, metrics_data, text, entry_id, store_text):
        """Write to backup CSV files if Excel writing fails.

        Rows are appended to per-day files in ``log_dir``; the header row is
        written only when the file does not exist yet.
        """
        timestamp = datetime.now().strftime('%Y%m%d')

        # Log metrics to CSV
        metrics_csv = os.path.join(self.log_dir, f"metrics_backup_{timestamp}.csv")
        pd.DataFrame([metrics_data]).to_csv(
            metrics_csv, mode='a', header=not os.path.exists(metrics_csv), index=False
        )

        # Log text to separate CSV if needed
        if store_text and text:
            text_csv = os.path.join(self.log_dir, f"text_backup_{timestamp}.csv")
            text_data = {
                'entry_id': entry_id,
                'timestamp': metrics_data['timestamp'],
                'text': text
            }
            pd.DataFrame([text_data]).to_csv(
                text_csv, mode='a', header=not os.path.exists(text_csv), index=False
            )
178
+
179
+
180
  class TextWindowProcessor:
181
  def __init__(self):
182
  try:
 
329
  'num_windows': len(predictions)
330
  }
331
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  def detailed_scan(self, text: str) -> Dict:
333
  """Perform a detailed scan with improved sentence-level analysis."""
334
  # Clean up trailing whitespace
 
479
  'num_sentences': num_sentences
480
  }
481
 
482
+ # Initialize the logger
483
+ excel_logger = ExcelLogger(log_dir="prediction_logs")
484
+
485
  def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
486
  """Analyze text using specified mode and return formatted results."""
487
+ # Start timing the prediction
488
+ start_time = time.time()
489
+
490
  # Count words in the text
491
  word_count = len(text.split())
492
 
 
497
 
498
  if mode == "quick":
499
  result = classifier.quick_scan(text)
500
+ prediction = result['prediction']
501
+ confidence = result['confidence']
502
+ num_windows = result['num_windows']
503
 
504
  quick_analysis = f"""
505
+ PREDICTION: {prediction.upper()}
506
+ Confidence: {confidence*100:.1f}%
507
+ Windows analyzed: {num_windows}
508
  """
509
 
510
  # Add note if mode was switched
511
  if original_mode == "detailed":
512
  quick_analysis += f"\n\nNote: Switched to quick mode because text contains only {word_count} words. Minimum 200 words required for detailed analysis."
513
 
514
+ output = (
515
  text, # No highlighting in quick mode
516
  "Quick scan mode - no sentence-level analysis available",
517
  quick_analysis
518
  )
519
+
520
+ # End timing
521
+ end_time = time.time()
522
+ prediction_time = end_time - start_time
523
+
524
+ # Log the data
525
+ log_data = {
526
+ "timestamp": datetime.now().isoformat(),
527
+ "word_count": word_count,
528
+ "mode": mode,
529
+ "prediction": prediction,
530
+ "confidence": confidence,
531
+ "prediction_time_seconds": prediction_time,
532
+ "num_sentences": 0, # No sentence analysis in quick mode
533
+ "text": text
534
+ }
535
+ excel_logger.log_prediction(log_data)
536
+
537
  else:
538
  analysis = classifier.detailed_scan(text)
539
+ prediction = analysis['overall_prediction']['prediction']
540
+ confidence = analysis['overall_prediction']['confidence']
541
+ num_sentences = analysis['overall_prediction']['num_sentences']
542
 
543
  detailed_analysis = []
544
  for pred in analysis['sentence_predictions']:
545
+ pred_confidence = pred['confidence'] * 100
546
  detailed_analysis.append(f"Sentence: {pred['sentence']}")
547
  detailed_analysis.append(f"Prediction: {pred['prediction'].upper()}")
548
+ detailed_analysis.append(f"Confidence: {pred_confidence:.1f}%")
549
  detailed_analysis.append("-" * 50)
550
 
551
  final_pred = analysis['overall_prediction']
 
555
  Number of sentences analyzed: {final_pred['num_sentences']}
556
  """
557
 
558
+ output = (
559
  analysis['highlighted_text'],
560
  "\n".join(detailed_analysis),
561
  overall_result
562
  )
563
+
564
+ # End timing
565
+ end_time = time.time()
566
+ prediction_time = end_time - start_time
567
+
568
+ # Log the data
569
+ log_data = {
570
+ "timestamp": datetime.now().isoformat(),
571
+ "word_count": word_count,
572
+ "mode": mode,
573
+ "prediction": prediction,
574
+ "confidence": confidence,
575
+ "prediction_time_seconds": prediction_time,
576
+ "num_sentences": num_sentences,
577
+ "text": text
578
+ }
579
+ excel_logger.log_prediction(log_data)
580
+
581
+ return output
582
 
583
  # Initialize the classifier globally
584
  classifier = TextClassifier()
585
 
586
+ # Create Gradio interface with added information about data collection
587
  demo = gr.Interface(
588
  fn=lambda text, mode: analyze_text(text, mode, classifier),
589
  inputs=[
 
605
  gr.Textbox(label="Overall Result", lines=4)
606
  ],
607
  title="AI Text Detector",
608
+ description="Analyze text to detect if it was written by a human or AI. Choose between quick scan and detailed sentence-level analysis. 200+ words suggested for accurate predictions. Note: For testing purposes, text and analysis data will be recorded.",
609
  api_name="predict",
610
  flagging_mode="never"
611
  )