# AI Text Detector Code Analysis

```python
# IMPORTS AND CONFIGURATION
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification  # HuggingFace transformers for NLP models
import torch.nn.functional as F
import spacy  # Used for sentence splitting
from typing import List, Dict, Tuple
import logging
import os
import gradio as gr  # Used for creating the web UI
from fastapi.middleware.cors import CORSMiddleware
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import time
from datetime import datetime

# Basic logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# GLOBAL PARAMETERS
MAX_LENGTH = 512               # Maximum token length for the model input
MODEL_NAME = "microsoft/deberta-v3-small"  # Using Microsoft's DeBERTa v3 small model as the base
WINDOW_SIZE = 6                # Number of sentences in each analysis window
WINDOW_OVERLAP = 2             # Number of sentences that overlap between adjacent windows
CONFIDENCE_THRESHOLD = 0.65    # Threshold for highlighting predictions with stronger colors
BATCH_SIZE = 8                 # Number of windows to process in a single batch for efficiency
MAX_WORKERS = 4                # Maximum number of worker threads for parallel processing


# TEXT WINDOW PROCESSOR
# This class handles sentence splitting and window creation for text analysis
class TextWindowProcessor:
    def __init__(self):
        # Initialize SpaCy with a minimal pipeline for sentence splitting
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            # Auto-download the SpaCy model if it is not available
            logger.info("Downloading spacy model...")
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load("en_core_web_sm")

        # Add a sentencizer if one is not already present
        if 'sentencizer' not in self.nlp.pipe_names:
            self.nlp.add_pipe('sentencizer')

        # Disable unnecessary components for better performance
        disabled_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != 'sentencizer']
        self.nlp.disable_pipes(*disabled_pipes)

        # Set up a ThreadPoolExecutor for parallel processing
        self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

    # Split text into individual sentences using SpaCy
    def split_into_sentences(self, text: str) -> List[str]:
        doc = self.nlp(text)
        return [str(sent).strip() for sent in doc.sents]

    # Create overlapping windows of fixed size (for quick scan)
    def create_windows(self, sentences: List[str], window_size: int, overlap: int) -> List[str]:
        if len(sentences) < window_size:
            return [" ".join(sentences)]  # Return a single window if there are not enough sentences

        windows = []
        stride = window_size - overlap
        for i in range(0, len(sentences) - window_size + 1, stride):
            window = sentences[i:i + window_size]
            windows.append(" ".join(window))
        return windows

    # Create windows centered around each sentence (for detailed scan)
    # This provides better analysis of individual sentences with proper context
    def create_centered_windows(self, sentences: List[str], window_size: int) -> Tuple[List[str], List[List[int]]]:
        windows = []
        window_sentence_indices = []
        for i in range(len(sentences)):
            half_window = window_size // 2
            start_idx = max(0, i - half_window)
            end_idx = min(len(sentences), i + half_window + 1)
            window = sentences[start_idx:end_idx]
            windows.append(" ".join(window))
            window_sentence_indices.append(list(range(start_idx, end_idx)))
        return windows, window_sentence_indices
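
# --- Illustrative note (not part of the original listing) ---
# With the defaults above (WINDOW_SIZE=6, WINDOW_OVERLAP=2), the quick-scan
# stride is 4, so a 10-sentence text produces exactly two windows:
# sentences [0..5] and [4..9]. A commented-out sanity check, assuming the
# SpaCy model is installed:
#
#   processor = TextWindowProcessor()
#   sents = [f"Sentence {n}." for n in range(10)]
#   assert processor.create_windows(sents, WINDOW_SIZE, WINDOW_OVERLAP) == [
#       " ".join(sents[0:6]), " ".join(sents[4:10])]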

# TEXT CLASSIFIER
# This class handles the actual AI/Human classification using a pre-trained model
class TextClassifier:
    def __init__(self):
        # Configure CPU threading if CUDA is not available
        if not torch.cuda.is_available():
            torch.set_num_threads(MAX_WORKERS)
            torch.set_num_interop_threads(MAX_WORKERS)

        # Set the device (GPU if available, otherwise CPU)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_name = MODEL_NAME
        self.tokenizer = None
        self.model = None
        self.processor = TextWindowProcessor()
        self.initialize_model()

    # Initialize the model and tokenizer
    def initialize_model(self):
        logger.info("Initializing model and tokenizer...")
        # Using the DeBERTa tokenizer specifically for better compatibility
        from transformers import DebertaV2TokenizerFast
        self.tokenizer = DebertaV2TokenizerFast.from_pretrained(
            self.model_name,
            model_max_length=MAX_LENGTH,
            use_fast=True  # Use the fast tokenizer for better performance
        )

        # Load a classification model with 2 labels (AI and Human)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2
        ).to(self.device)

        # Try to load custom fine-tuned model weights if available
        model_path = "model_20250209_184929_acc1.0000.pt"
        if os.path.exists(model_path):
            logger.info(f"Loading custom model from {model_path}")
            checkpoint = torch.load(model_path, map_location=self.device)
            self.model.load_state_dict(checkpoint['model_state_dict'])
        else:
            logger.warning("Custom model file not found. Using base model.")

        # Set the model to evaluation mode
        self.model.eval()

    # Quick scan analysis - faster but less detailed
    # Uses fixed-size windows with overlap
    def quick_scan(self, text: str) -> Dict:
        if not text.strip():
            return {
                'prediction': 'unknown',
                'confidence': 0.0,
                'num_windows': 0
            }

        # Split the text into sentences and then into windows
        sentences = self.processor.split_into_sentences(text)
        windows = self.processor.create_windows(sentences, WINDOW_SIZE, WINDOW_OVERLAP)

        predictions = []
        # Process windows in batches for efficiency
        for i in range(0, len(windows), BATCH_SIZE):
            batch_windows = windows[i:i + BATCH_SIZE]

            # Tokenize and prepare input for the model
            inputs = self.tokenizer(
                batch_windows,
                truncation=True,
                padding=True,
                max_length=MAX_LENGTH,
                return_tensors="pt"
            ).to(self.device)

            # Run inference with no gradient calculation
            with torch.no_grad():
                outputs = self.model(**inputs)
                probs = F.softmax(outputs.logits, dim=-1)

            # Process the predictions for each window
            for idx, window in enumerate(batch_windows):
                prediction = {
                    'window': window,
                    'human_prob': probs[idx][1].item(),
                    'ai_prob': probs[idx][0].item(),
                    'prediction': 'human' if probs[idx][1] > probs[idx][0] else 'ai'
                }
                predictions.append(prediction)

            # Clean up to free memory
            del inputs, outputs, probs
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        if not predictions:
            return {
                'prediction': 'unknown',
                'confidence': 0.0,
                'num_windows': 0
            }

        # Average probabilities across all windows for the final prediction
        avg_human_prob = sum(p['human_prob'] for p in predictions) / len(predictions)
        avg_ai_prob = sum(p['ai_prob'] for p in predictions) / len(predictions)

        return {
            'prediction': 'human' if avg_human_prob > avg_ai_prob else 'ai',
            'confidence': max(avg_human_prob, avg_ai_prob),
            'num_windows': len(predictions)
        }
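
    # --- Illustrative usage (hypothetical text; not part of the original listing) ---
    # Assuming the base model downloads successfully, a quick scan looks like:
    #
    #   clf = TextClassifier()
    #   result = clf.quick_scan("Some passage of text to check...")
    #   # e.g. {'prediction': 'human', 'confidence': 0.82, 'num_windows': 1}
    #
    # Texts shorter than WINDOW_SIZE sentences collapse into one window, so
    # num_windows is 1 for short inputs.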

    # Detailed scan analysis - slower but provides sentence-level insights
    # Uses windows centered around each sentence for more precise analysis
    def detailed_scan(self, text: str) -> Dict:
        text = text.rstrip()
        if not text.strip():
            return {
                'sentence_predictions': [],
                'highlighted_text': '',
                'full_text': '',
                'overall_prediction': {
                    'prediction': 'unknown',
                    'confidence': 0.0,
                    'num_sentences': 0
                }
            }

        # Split the text into sentences
        sentences = self.processor.split_into_sentences(text)
        if not sentences:
            # Return the same empty structure as for blank input so that
            # callers can index the usual keys without a KeyError
            return {
                'sentence_predictions': [],
                'highlighted_text': '',
                'full_text': text,
                'overall_prediction': {
                    'prediction': 'unknown',
                    'confidence': 0.0,
                    'num_sentences': 0
                }
            }

        # Create a window centered on each sentence
        windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)

        # Track appearances and scores for each sentence
        sentence_appearances = {i: 0 for i in range(len(sentences))}
        sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}

        # Process windows in batches
        for i in range(0, len(windows), BATCH_SIZE):
            batch_windows = windows[i:i + BATCH_SIZE]
            batch_indices = window_sentence_indices[i:i + BATCH_SIZE]

            # Tokenize and prepare input
            inputs = self.tokenizer(
                batch_windows,
                truncation=True,
                padding=True,
                max_length=MAX_LENGTH,
                return_tensors="pt"
            ).to(self.device)

            # Run inference
            with torch.no_grad():
                outputs = self.model(**inputs)
                probs = F.softmax(outputs.logits, dim=-1)

            # Process each window's predictions
            for window_idx, indices in enumerate(batch_indices):
                center_idx = len(indices) // 2
                center_weight = 0.7  # The center sentence gets 70% of the weight
                # The other sentences share the remaining 30%; guard against
                # single-sentence windows to avoid a division by zero
                edge_weight = 0.3 / (len(indices) - 1) if len(indices) > 1 else 0.0

                # Apply the weighted prediction to each sentence in the window
                for pos, sent_idx in enumerate(indices):
                    weight = center_weight if pos == center_idx else edge_weight
                    sentence_appearances[sent_idx] += weight
                    sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
                    sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()

            # Clean up memory
            del inputs, outputs, probs
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Calculate final predictions for each sentence, with smoothing between adjacent sentences
        sentence_predictions = []
        for i in range(len(sentences)):
            if sentence_appearances[i] > 0:
                human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
                ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]

                # Apply smoothing for sentences not at the boundaries
                if i > 0 and i < len(sentences) - 1:
                    prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
                    prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
                    next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
                    next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]

                    current_pred = 'human' if human_prob > ai_prob else 'ai'
                    prev_pred = 'human' if prev_human > prev_ai else 'ai'
                    next_pred = 'human' if next_human > next_ai else 'ai'

                    # Only smooth if the current sentence's prediction differs from its neighbors'
                    if current_pred != prev_pred or current_pred != next_pred:
                        smooth_factor = 0.1  # 10% smoothing factor
                        human_prob = (human_prob * (1 - smooth_factor) +
                                      (prev_human + next_human) * smooth_factor / 2)
                        ai_prob = (ai_prob * (1 - smooth_factor) +
                                   (prev_ai + next_ai) * smooth_factor / 2)

                sentence_predictions.append({
                    'sentence': sentences[i],
                    'human_prob': human_prob,
                    'ai_prob': ai_prob,
                    'prediction': 'human' if human_prob > ai_prob else 'ai',
                    'confidence': max(human_prob, ai_prob)
                })

        # Return the detailed results
        return {
            'sentence_predictions': sentence_predictions,
            'highlighted_text': self.format_predictions_html(sentence_predictions),
            'full_text': text,
            'overall_prediction': self.aggregate_predictions(sentence_predictions)
        }
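
    # --- Illustrative note (not part of the original listing) ---
    # With WINDOW_SIZE=6, an interior sentence's "centered window" actually
    # spans 7 sentences (i-3 .. i+3), so the weights sum to 1.0 per window:
    # the center gets 0.7 and each of the 6 neighbors gets 0.3 / 6 = 0.05.
    # For truncated windows at the start or end of the text, len(indices) // 2
    # no longer points at the anchor sentence, so boundary sentences receive
    # only the edge weight even in their own window.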
color = "#90EE90" # Green for human (high confidence) else: color = "#FFB6C6" # Pink for AI (high confidence) else: if pred['prediction'] == 'human': color = "#E8F5E9" # Light green for human (low confidence) else: color = "#FFEBEE" # Light pink for AI (low confidence) html_parts.append(f'{sentence}') return " ".join(html_parts) # Aggregate individual sentence predictions into an overall result def aggregate_predictions(self, predictions: List[Dict]) -> Dict: if not predictions: return { 'prediction': 'unknown', 'confidence': 0.0, 'num_sentences': 0 } # Calculate average probabilities across all sentences total_human_prob = sum(p['human_prob'] for p in predictions) total_ai_prob = sum(p['ai_prob'] for p in predictions) num_sentences = len(predictions) avg_human_prob = total_human_prob / num_sentences avg_ai_prob = total_ai_prob / num_sentences return { 'prediction': 'human' if avg_human_prob > avg_ai_prob else 'ai', 'confidence': max(avg_human_prob, avg_ai_prob), 'num_sentences': num_sentences } # MAIN ANALYSIS FUNCTION # Brings everything together to analyze text based on selected mode def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple: start_time = time.time() word_count = len(text.split()) # Auto-switch to quick mode for short texts original_mode = mode if word_count < 200 and mode == "detailed": mode = "quick" if mode == "quick": # Perform quick analysis result = classifier.quick_scan(text) quick_analysis = f""" PREDICTION: {result['prediction'].upper()} Confidence: {result['confidence']*100:.1f}% Windows analyzed: {result['num_windows']} """ # Notify if automatically switched from detailed to quick mode if original_mode == "detailed": quick_analysis += f"\n\nNote: Switched to quick mode because text contains only {word_count} words. Minimum 200 words required for detailed analysis." 

# MAIN ANALYSIS FUNCTION
# Brings everything together to analyze text based on the selected mode
def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
    start_time = time.time()
    word_count = len(text.split())

    # Auto-switch to quick mode for short texts
    original_mode = mode
    if word_count < 200 and mode == "detailed":
        mode = "quick"

    if mode == "quick":
        # Perform the quick analysis
        result = classifier.quick_scan(text)
        quick_analysis = f"""
PREDICTION: {result['prediction'].upper()}
Confidence: {result['confidence']*100:.1f}%
Windows analyzed: {result['num_windows']}
"""

        # Notify the user if we automatically switched from detailed to quick mode
        if original_mode == "detailed":
            quick_analysis += f"\n\nNote: Switched to quick mode because the text contains only {word_count} words. A minimum of 200 words is required for detailed analysis."

        execution_time = (time.time() - start_time) * 1000  # Elapsed time in ms (computed but not currently displayed)

        return (
            text,  # Original text (no highlighting)
            "Quick scan mode - no sentence-level analysis available",
            quick_analysis
        )
    else:
        # Perform the detailed analysis
        analysis = classifier.detailed_scan(text)

        # Format the sentence-by-sentence analysis text
        detailed_analysis = []
        for pred in analysis['sentence_predictions']:
            confidence = pred['confidence'] * 100
            detailed_analysis.append(f"Sentence: {pred['sentence']}")
            detailed_analysis.append(f"Prediction: {pred['prediction'].upper()}")
            detailed_analysis.append(f"Confidence: {confidence:.1f}%")
            detailed_analysis.append("-" * 50)

        # Format the overall result summary
        final_pred = analysis['overall_prediction']
        overall_result = f"""
FINAL PREDICTION: {final_pred['prediction'].upper()}
Overall confidence: {final_pred['confidence']*100:.1f}%
Number of sentences analyzed: {final_pred['num_sentences']}
"""

        execution_time = (time.time() - start_time) * 1000  # Elapsed time in ms (computed but not currently displayed)

        return (
            analysis['highlighted_text'],  # HTML-highlighted text
            "\n".join(detailed_analysis),  # Detailed sentence analysis
            overall_result                 # Overall summary
        )


# Initialize the classifier
classifier = TextClassifier()

# GRADIO USER INTERFACE
demo = gr.Interface(
    fn=lambda text, mode: analyze_text(text, mode, classifier),
    inputs=[
        gr.Textbox(
            lines=8,
            placeholder="Enter text to analyze...",
            label="Input Text"
        ),
        gr.Radio(
            choices=["quick", "detailed"],
            value="quick",
            label="Analysis Mode",
            info="Quick mode for faster analysis, Detailed mode for sentence-level analysis"
        )
    ],
    outputs=[
        gr.HTML(label="Highlighted Analysis"),                         # Shows the color-coded result
        gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10),   # Detailed breakdown
        gr.Textbox(label="Overall Result", lines=4)                    # Summary results
    ],
    title="AI Text Detector",
    description="Analyze text to detect if it was written by a human or AI. Choose between quick scan and detailed sentence-level analysis. 200+ words are suggested for accurate predictions.",
    api_name="predict",
    flagging_mode="never"
)

# FastAPI configuration
app = demo.app

# Add CORS middleware to allow cross-origin requests
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
)

# Start the server when run directly
if __name__ == "__main__":
    demo.queue()  # Enable request queuing
    demo.launch(
        server_name="0.0.0.0",  # Listen on all interfaces
        server_port=7860,       # Default Gradio port
        share=True              # Generate a public URL
    )
```
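
Because the interface registers a named `predict` endpoint, the running app can also be called programmatically. Below is a minimal client sketch using the `gradio_client` package, assuming the server above is running locally on port 7860; the sample text and printed output are illustrative, not taken from the source.

```python
# Minimal client sketch (assumes the server is up on localhost:7860 and that
# gradio_client is installed: pip install gradio_client)
from gradio_client import Client

client = Client("http://localhost:7860/")

# Positional arguments match the Interface inputs: (text, mode).
# The call returns one value per output component: (highlighted HTML,
# sentence-by-sentence analysis, overall summary).
highlighted, sentence_analysis, overall = client.predict(
    "Paste a passage of 200+ words here to exercise the detailed mode...",
    "quick",
    api_name="/predict",
)
print(overall)  # e.g. "PREDICTION: HUMAN ... Windows analyzed: 3"
```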