Spaces:

ApsidalSolid4
/

CITProjectAIDetector

Running

App Files Files Community

ApsidalSolid4 committed on Mar 13

Commit

a167831

verified ·

1 Parent(s): aec2abb

Update app.py

Browse files

Files changed (1) hide show

app.py +211 -103

app.py CHANGED Viewed

@@ -10,6 +10,12 @@ import gradio as gr
 from fastapi.middleware.cors import CORSMiddleware
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -24,6 +30,153 @@ CONFIDENCE_THRESHOLD = 0.65
 BATCH_SIZE = 8  # Reduced batch size for CPU
 MAX_WORKERS = 4  # Number of worker threads for processing
 class TextWindowProcessor:
     def __init__(self):
         try:
@@ -176,100 +329,6 @@ class TextClassifier:
             'num_windows': len(predictions)
         }
-    # def detailed_scan(self, text: str) -> Dict:
-    #     """Original prediction method with modified window handling"""
-    #     if self.model is None or self.tokenizer is None:
-    #         self.load_model()
-    #     self.model.eval()
-    #     sentences = self.processor.split_into_sentences(text)
-    #     if not sentences:
-    #         return {}
-    #     # Create centered windows for each sentence
-    #     windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
-    #     # Track scores for each sentence
-    #     sentence_appearances = {i: 0 for i in range(len(sentences))}
-    #     sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
-    #     # Process windows in batches
-    #     batch_size = 16
-    #     for i in range(0, len(windows), batch_size):
-    #         batch_windows = windows[i:i + batch_size]
-    #         batch_indices = window_sentence_indices[i:i + batch_size]
-    #         inputs = self.tokenizer(
-    #             batch_windows,
-    #             truncation=True,
-    #             padding=True,
-    #             max_length=MAX_LENGTH,
-    #             return_tensors="pt"
-    #         ).to(self.device)
-    #         with torch.no_grad():
-    #             outputs = self.model(**inputs)
-    #             probs = F.softmax(outputs.logits, dim=-1)
-    #             # Attribute predictions more carefully
-    #             for window_idx, indices in enumerate(batch_indices):
-    #                 center_idx = len(indices) // 2
-    #                 center_weight = 0.7  # Higher weight for center sentence
-    #                 edge_weight = 0.3 / (len(indices) - 1)  # Distribute remaining weight
-    #                 for pos, sent_idx in enumerate(indices):
-    #                     # Apply higher weight to center sentence
-    #                     weight = center_weight if pos == center_idx else edge_weight
-    #                     sentence_appearances[sent_idx] += weight
-    #                     sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
-    #                     sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
-    #         del inputs, outputs, probs
-    #         if torch.cuda.is_available():
-    #             torch.cuda.empty_cache()
-    #     # Calculate final predictions
-    #     sentence_predictions = []
-    #     for i in range(len(sentences)):
-    #         if sentence_appearances[i] > 0:
-    #             human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
-    #             ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
-    #             # Only apply minimal smoothing at prediction boundaries
-    #             if i > 0 and i < len(sentences) - 1:
-    #                 prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
-    #                 prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
-    #                 next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
-    #                 next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
-    #                 # Check if we're at a prediction boundary
-    #                 current_pred = 'human' if human_prob > ai_prob else 'ai'
-    #                 prev_pred = 'human' if prev_human > prev_ai else 'ai'
-    #                 next_pred = 'human' if next_human > next_ai else 'ai'
-    #                 if current_pred != prev_pred or current_pred != next_pred:
-    #                     # Small adjustment at boundaries
-    #                     smooth_factor = 0.1
-    #                     human_prob = (human_prob * (1 - smooth_factor) +
-    #                                 (prev_human + next_human) * smooth_factor / 2)
-    #                     ai_prob = (ai_prob * (1 - smooth_factor) +
-    #                             (prev_ai + next_ai) * smooth_factor / 2)
-    #             sentence_predictions.append({
-    #                 'sentence': sentences[i],
-    #                 'human_prob': human_prob,
-    #                 'ai_prob': ai_prob,
-    #                 'prediction': 'human' if human_prob > ai_prob else 'ai',
-    #                 'confidence': max(human_prob, ai_prob)
-    #             })
-    #     return {
-    #         'sentence_predictions': sentence_predictions,
-    #         'highlighted_text': self.format_predictions_html(sentence_predictions),
-    #         'full_text': text,
-    #         'overall_prediction': self.aggregate_predictions(sentence_predictions)
-    #     }
     def detailed_scan(self, text: str) -> Dict:
         """Perform a detailed scan with improved sentence-level analysis."""
         # Clean up trailing whitespace
@@ -420,8 +479,14 @@ class TextClassifier:
             'num_sentences': num_sentences
         }
 def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     """Analyze text using specified mode and return formatted results."""
     # Count words in the text
     word_count = len(text.split())
@@ -432,31 +497,55 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     if mode == "quick":
         result = classifier.quick_scan(text)
         quick_analysis = f"""
-        PREDICTION: {result['prediction'].upper()}
-        Confidence: {result['confidence']*100:.1f}%
-        Windows analyzed: {result['num_windows']}
         """
         # Add note if mode was switched
         if original_mode == "detailed":
             quick_analysis += f"\n\nNote: Switched to quick mode because text contains only {word_count} words. Minimum 200 words required for detailed analysis."
-        return (
             text,  # No highlighting in quick mode
             "Quick scan mode - no sentence-level analysis available",
             quick_analysis
         )
     else:
         analysis = classifier.detailed_scan(text)
         detailed_analysis = []
         for pred in analysis['sentence_predictions']:
-            confidence = pred['confidence'] * 100
             detailed_analysis.append(f"Sentence: {pred['sentence']}")
             detailed_analysis.append(f"Prediction: {pred['prediction'].upper()}")
-            detailed_analysis.append(f"Confidence: {confidence:.1f}%")
             detailed_analysis.append("-" * 50)
         final_pred = analysis['overall_prediction']
@@ -466,16 +555,35 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
         Number of sentences analyzed: {final_pred['num_sentences']}
         """
-        return (
             analysis['highlighted_text'],
             "\n".join(detailed_analysis),
             overall_result
         )
 # Initialize the classifier globally
 classifier = TextClassifier()
-# Create Gradio interface
 demo = gr.Interface(
     fn=lambda text, mode: analyze_text(text, mode, classifier),
     inputs=[
@@ -497,7 +605,7 @@ demo = gr.Interface(
         gr.Textbox(label="Overall Result", lines=4)
     ],
     title="AI Text Detector",
-    description="Analyze text to detect if it was written by a human or AI. Choose between quick scan and detailed sentence-level analysis. 200+ words suggested for accurate predictions.",
     api_name="predict",
     flagging_mode="never"
 )

 from fastapi.middleware.cors import CORSMiddleware
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
+import time
+import pandas as pd
+from datetime import datetime
+import threading
+import random
+from openpyxl import load_workbook
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 BATCH_SIZE = 8  # Reduced batch size for CPU
 MAX_WORKERS = 4  # Number of worker threads for processing
class ExcelLogger:
    """Append prediction metrics, and optionally the analysed text, to an Excel workbook.

    The workbook has two sheets: ``Metrics`` (one row per prediction) and
    ``TextData`` (the raw text, keyed by ``entry_id``).  Writes happen on a
    background daemon thread; if the workbook cannot be updated after several
    attempts the record is appended to dated CSV backup files instead.
    """

    # Column layout of the two sheets.  ``entry_id`` is part of the metrics
    # header because every metrics row carries it (it links to TextData).
    METRICS_COLUMNS = [
        'timestamp', 'word_count', 'mode', 'prediction',
        'confidence', 'prediction_time_seconds', 'num_sentences',
        'entry_id',
    ]
    TEXT_COLUMNS = ['entry_id', 'timestamp', 'text']

    def __init__(self, log_dir="logs", excel_file=None):
        """Initialize the Excel logger.

        Args:
            log_dir: Directory to store log files (created if missing).
            excel_file: Specific Excel file name (defaults to
                predictions_YYYY-MM.xlsx for the month in which the logger
                is constructed).
        """
        self.log_dir = log_dir
        os.makedirs(log_dir, exist_ok=True)

        # Use monthly Excel files by default.
        # NOTE: the name is computed once here, so a long-running process
        # keeps writing to the file of the month it was started in.
        if excel_file is None:
            current_month = datetime.now().strftime('%Y-%m')
            excel_file = f"predictions_{current_month}.xlsx"
        self.excel_path = os.path.join(log_dir, excel_file)

        # Serialises all workbook/backup writes issued by this instance.
        self.file_lock = threading.Lock()

        # Create excel file with headers if it doesn't exist
        if not os.path.exists(self.excel_path):
            self._create_excel_file()

    def _create_excel_file(self):
        """Create a new Excel file with the Metrics and TextData sheets (headers only)."""
        metrics_df = pd.DataFrame(columns=self.METRICS_COLUMNS)
        text_df = pd.DataFrame(columns=self.TEXT_COLUMNS)
        with pd.ExcelWriter(self.excel_path, engine='openpyxl') as writer:
            metrics_df.to_excel(writer, sheet_name='Metrics', index=False)
            text_df.to_excel(writer, sheet_name='TextData', index=False)
        logger.info(f"Created new Excel log file: {self.excel_path}")

    @staticmethod
    def _prepare_entry(prediction_data, store_text):
        """Split a prediction record into its metrics row, text and entry id.

        The caller's dictionary is never modified.  The ``'text'`` key is
        always removed from the metrics row so the raw text can only end up
        in the TextData sheet, and only when ``store_text`` is true.

        Args:
            prediction_data: Dictionary containing prediction metrics and,
                optionally, a ``'text'`` key.
            store_text: Whether the text should be kept for storage.

        Returns:
            Tuple ``(metrics_data, text, entry_id)`` where ``text`` is
            ``None`` when it is absent or must not be stored.
        """
        metrics_data = dict(prediction_data)
        text = metrics_data.pop('text', None)
        if not store_text:
            text = None

        # Ensure timestamp is present
        if 'timestamp' not in metrics_data:
            metrics_data['timestamp'] = datetime.now().isoformat()

        # Second-resolution timestamp plus a random 4-digit suffix.
        entry_id = f"{datetime.now().strftime('%Y%m%d%H%M%S')}_{random.randint(1000, 9999)}"
        metrics_data['entry_id'] = entry_id
        return metrics_data, text, entry_id

    def log_prediction(self, prediction_data, store_text=True):
        """Log prediction data to the Excel file without blocking the caller.

        Args:
            prediction_data: Dictionary containing prediction metrics; it is
                copied, not modified.
            store_text: Whether to store the full text (taken from the
                ``'text'`` key) in the TextData sheet.
        """
        metrics_data, text, entry_id = self._prepare_entry(prediction_data, store_text)

        # Write on a daemon thread so a slow workbook rewrite never delays
        # the response and never keeps the process alive at shutdown.
        thread = threading.Thread(
            target=self._write_to_excel,
            args=(metrics_data, text, entry_id, store_text),
            daemon=True,
        )
        thread.start()

    def _append_to_workbook(self, metrics_data, text, entry_id, keep_text):
        """Read both sheets, append the new rows and write them back.

        Must be called with ``self.file_lock`` held.  Any failure propagates
        to the caller so the retry loop can handle it.
        """
        # Append mode needs an existing workbook; recreate it if it vanished.
        if not os.path.exists(self.excel_path):
            self._create_excel_file()

        metrics_df = pd.read_excel(self.excel_path, sheet_name='Metrics')
        metrics_df = pd.concat(
            [metrics_df, pd.DataFrame([metrics_data])], ignore_index=True
        )

        text_df = None
        if keep_text:
            new_text = pd.DataFrame([{
                'entry_id': entry_id,
                'timestamp': metrics_data['timestamp'],
                'text': text
            }])
            try:
                existing_text = pd.read_excel(self.excel_path, sheet_name='TextData')
            except (ValueError, KeyError):
                # Only a missing TextData sheet starts a fresh one.  Other
                # read errors propagate, so existing rows are never
                # overwritten because of a transient failure.
                text_df = new_text
            else:
                text_df = pd.concat([existing_text, new_text], ignore_index=True)

        # Write back to Excel, replacing the sheets that were rebuilt.
        with pd.ExcelWriter(self.excel_path, engine='openpyxl', mode='a',
                            if_sheet_exists='replace') as writer:
            metrics_df.to_excel(writer, sheet_name='Metrics', index=False)
            if text_df is not None:
                text_df.to_excel(writer, sheet_name='TextData', index=False)

    def _write_to_excel(self, metrics_data, text, entry_id, store_text):
        """Write data to Excel file with retry mechanism for concurrent access.

        Falls back to the CSV backup files when every attempt fails.
        """
        max_retries = 5
        retry_delay = 0.5
        keep_text = bool(store_text and text)

        for attempt in range(max_retries):
            try:
                with self.file_lock:
                    self._append_to_workbook(metrics_data, text, entry_id, keep_text)
                return
            except Exception as e:
                # If error occurs (likely due to concurrent access), retry after delay
                logger.warning(f"Error writing to Excel (attempt {attempt+1}/{max_retries}): {e}")
                if attempt + 1 < max_retries:
                    time.sleep(retry_delay * (attempt + 1))  # Progressive backoff

        # If all retries fail, log to backup file
        logger.error(f"Failed to write to Excel after {max_retries} attempts, logging to backup file")
        try:
            with self.file_lock:
                self._write_to_backup(metrics_data, text, entry_id, store_text)
        except Exception:
            # Last line of defence on a background thread: record the failure
            # instead of letting the thread die silently.
            logger.exception("Failed to write prediction to backup CSV files")

    def _write_to_backup(self, metrics_data, text, entry_id, store_text):
        """Write to backup CSV files if Excel writing fails.

        Rows are appended to ``metrics_backup_YYYYMMDD.csv`` and, when text
        is to be stored, ``text_backup_YYYYMMDD.csv`` inside ``log_dir``.
        The header row is written only when the file is first created.
        """
        timestamp = datetime.now().strftime('%Y%m%d')

        # Log metrics to CSV
        metrics_csv = os.path.join(self.log_dir, f"metrics_backup_{timestamp}.csv")
        pd.DataFrame([metrics_data]).to_csv(
            metrics_csv, mode='a', header=not os.path.exists(metrics_csv), index=False
        )

        # Log text to separate CSV if needed
        if store_text and text:
            text_csv = os.path.join(self.log_dir, f"text_backup_{timestamp}.csv")
            text_data = {
                'entry_id': entry_id,
                'timestamp': metrics_data['timestamp'],
                'text': text
            }
            pd.DataFrame([text_data]).to_csv(
                text_csv, mode='a', header=not os.path.exists(text_csv), index=False
            )
 class TextWindowProcessor:
     def __init__(self):
         try:
             'num_windows': len(predictions)
         }
     def detailed_scan(self, text: str) -> Dict:
         """Perform a detailed scan with improved sentence-level analysis."""
         # Clean up trailing whitespace
             'num_sentences': num_sentences
         }
+# Initialize the logger
+excel_logger = ExcelLogger(log_dir="prediction_logs")
 def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     """Analyze text using specified mode and return formatted results."""
+    # Start timing the prediction
+    start_time = time.time()
     # Count words in the text
     word_count = len(text.split())
     if mode == "quick":
         result = classifier.quick_scan(text)
+        prediction = result['prediction']
+        confidence = result['confidence']
+        num_windows = result['num_windows']
         quick_analysis = f"""
+        PREDICTION: {prediction.upper()}
+        Confidence: {confidence*100:.1f}%
+        Windows analyzed: {num_windows}
         """
         # Add note if mode was switched
         if original_mode == "detailed":
             quick_analysis += f"\n\nNote: Switched to quick mode because text contains only {word_count} words. Minimum 200 words required for detailed analysis."
+        output = (
             text,  # No highlighting in quick mode
             "Quick scan mode - no sentence-level analysis available",
             quick_analysis
         )
+        # End timing
+        end_time = time.time()
+        prediction_time = end_time - start_time
+        # Log the data
+        log_data = {
+            "timestamp": datetime.now().isoformat(),
+            "word_count": word_count,
+            "mode": mode,
+            "prediction": prediction,
+            "confidence": confidence,
+            "prediction_time_seconds": prediction_time,
+            "num_sentences": 0,  # No sentence analysis in quick mode
+            "text": text
+        }
+        excel_logger.log_prediction(log_data)
     else:
         analysis = classifier.detailed_scan(text)
+        prediction = analysis['overall_prediction']['prediction']
+        confidence = analysis['overall_prediction']['confidence']
+        num_sentences = analysis['overall_prediction']['num_sentences']
         detailed_analysis = []
         for pred in analysis['sentence_predictions']:
+            pred_confidence = pred['confidence'] * 100
             detailed_analysis.append(f"Sentence: {pred['sentence']}")
             detailed_analysis.append(f"Prediction: {pred['prediction'].upper()}")
+            detailed_analysis.append(f"Confidence: {pred_confidence:.1f}%")
             detailed_analysis.append("-" * 50)
         final_pred = analysis['overall_prediction']
         Number of sentences analyzed: {final_pred['num_sentences']}
         """
+        output = (
             analysis['highlighted_text'],
             "\n".join(detailed_analysis),
             overall_result
         )
+        # End timing
+        end_time = time.time()
+        prediction_time = end_time - start_time
+        # Log the data
+        log_data = {
+            "timestamp": datetime.now().isoformat(),
+            "word_count": word_count,
+            "mode": mode,
+            "prediction": prediction,
+            "confidence": confidence,
+            "prediction_time_seconds": prediction_time,
+            "num_sentences": num_sentences,
+            "text": text
+        }
+        excel_logger.log_prediction(log_data)
+    return output
 # Initialize the classifier globally
 classifier = TextClassifier()
+# Create Gradio interface with added information about data collection
 demo = gr.Interface(
     fn=lambda text, mode: analyze_text(text, mode, classifier),
     inputs=[
         gr.Textbox(label="Overall Result", lines=4)
     ],
     title="AI Text Detector",
+    description="Analyze text to detect if it was written by a human or AI. Choose between quick scan and detailed sentence-level analysis. 200+ words suggested for accurate predictions. Note: For testing purposes, text and analysis data will be recorded.",
     api_name="predict",
     flagging_mode="never"
 )