import os
import time
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import threading
import queue
import multiprocessing
from functools import lru_cache

# First check if GPU is available for maximum speed
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
print(f"GPU available: {has_gpu} - {gpu_name}")


# Download model files
def get_model_path(repo_id, filename):
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)


base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF",
    "articulate-V1-q8_0.gguf"
)

# Optimize environment variables for CPU performance
os.environ["LLAMA_CUBLAS"] = "0"   # Disable CUDA since we're CPU only
os.environ["LLAMA_CLBLAST"] = "0"  # Disable OpenCL
os.environ["LLAMA_AVX"] = "1"      # Enable AVX
os.environ["LLAMA_AVX2"] = "1"     # Enable AVX2
os.environ["LLAMA_F16"] = "1"      # Use FP16 where available

# Import the right module
from llama_cpp import Llama

print("Using CPU-optimized llama-cpp-python")

# Cache for translations
translation_cache = {}
MAX_CACHE_SIZE = 5000  # Increased cache size

# Common phrases for pre-loading
COMMON_PHRASES = {
    "English to Spanish": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Spanish to English": [
        "Hola", "Gracias", "Buenos días", "¿Cómo estás?", "¿Cómo te llamas?",
        "No entiendo", "Por favor", "Lo siento", "Sí", "No", "Dónde está"
    ],
    "English to Korean": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Korean to English": [
        "안녕하세요", "감사합니다", "좋은 아침입니다", "어떻게 지내세요?", "이름이 뭐예요?",
        "이해가 안 돼요", "제발", "죄송합니다", "네", "아니요", "어디에 있어요"
    ]
}


# Implement LRU cache for better performance
@lru_cache(maxsize=100)
def get_cached_translation(direction, text):
    """LRU-cached lookup into the shared translation cache (None on a miss; callers fall back to translation_cache)."""
    return translation_cache.get(f"{direction}:{text}")


# Create a worker pool for parallel translation
class ModelWorkerPool:
    def __init__(self, num_workers=1):
        self.num_workers = num_workers
        self.request_queue = queue.Queue()
        self.response_queue = queue.Queue()
        self.workers = []
        self.initialized = False

        # Create shared model instance with optimized settings
        print("Initializing model with CPU optimizations...")
        start_time = time.time()

        # CPU optimization settings - use fewer threads for Q8 model
        cpu_count = multiprocessing.cpu_count()
        optimal_threads = max(1, min(4, cpu_count - 1))  # Use fewer threads for better performance

        # Create a smaller context size for faster inference
        self.model = Llama(
            model_path=base_model_path,
            lora_path=adapter_path,
            n_ctx=256,                  # Reduced context for faster processing
            n_threads=optimal_threads,  # Optimized thread count
            n_batch=512,                # Reduced batch size for CPU
            use_mmap=True,              # Efficient memory mapping
            n_gpu_layers=0,             # CPU only
            seed=42,                    # Consistent results
            verbose=False,              # Reduce overhead
            rope_freq_base=10000,       # Default attention parameters
            rope_freq_scale=1.0,
        )
        print(f"Model loaded in {time.time() - start_time:.2f} seconds")

        # Start worker threads
        for i in range(num_workers):
            worker = threading.Thread(target=self._worker_loop, daemon=True)
            worker.start()
            self.workers.append(worker)

        self.initialized = True

        # Pre-warm in background thread to not block startup
        warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
        warming_thread.start()

    def _worker_loop(self):
        """Worker thread that processes translation requests"""
        while True:
            request = self.request_queue.get()
            if request is None:  # Shutdown signal
                break
            direction, text, callback_id = request
            try:
                # Check LRU cache first
                cached = get_cached_translation(direction, text)
                if cached is not None:
                    self.response_queue.put((callback_id, cached))
                    continue

                # Check regular cache
                cache_key = f"{direction}:{text}"
                if cache_key in translation_cache:
                    result = translation_cache[cache_key]
                else:
                    # Process new translation
                    result = self._process_translation(direction, text)

                    # Store in regular cache, evicting the oldest entry when full
                    if len(translation_cache) >= MAX_CACHE_SIZE:
                        translation_cache.pop(next(iter(translation_cache)))
                    translation_cache[cache_key] = result

                self.response_queue.put((callback_id, result))
            except Exception as e:
                print(f"Error in worker thread: {e}")
                self.response_queue.put((callback_id, f"Error: {str(e)}"))
            finally:
                self.request_queue.task_done()

    def _prewarm_model(self):
        """Pre-compute common translations to warm up the model - minimal to save time"""
        print("Pre-warming model with essential phrases (truncated for speed)...")
        start = time.time()

        # Just warm up with one phrase per direction to speed up startup
        for direction, phrases in COMMON_PHRASES.items():
            # Only do the most common phrase to save startup time
            self._process_translation(direction, phrases[0])

        print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")

    def _process_translation(self, direction, text):
        """Optimized translation function"""
        # Skip empty inputs
        if not text or not text.strip():
            return ""

        # Start timing for performance tracking
        start_time = time.time()

        # Map language directions
        lang_map = {
            "English to Spanish": ("ENGLISH", "SPANISH"),
            "Spanish to English": ("SPANISH", "ENGLISH"),
            "Korean to English": ("KOREAN", "ENGLISH"),
            "English to Korean": ("ENGLISH", "KOREAN")
        }

        if direction not in lang_map:
            return "Invalid direction"

        source_lang, target_lang = lang_map[direction]

        # Truncate long inputs for faster processing
        max_input_length = 100  # Limit input length
        if len(text) > max_input_length:
            text = text[:max_input_length] + "..."

        # Efficient prompt format
        prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"

        # Reduce max tokens for faster inference
        input_tokens = len(text.split())
        max_tokens = min(50, max(20, int(input_tokens * 1.2)))

        # Generate translation with aggressive performance optimizations
        try:
            response = self.model.create_completion(
                prompt,
                max_tokens=max_tokens,
                temperature=0.0,      # Deterministic for faster inference
                top_k=1,              # Only consider most likely token
                top_p=1.0,            # No sampling
                repeat_penalty=1.0,   # No repeat penalty
                stream=False,         # Get complete response at once
                stop=["[/", "\n\n"],  # Stop early if possible
            )

            translation = response['choices'][0]['text'].strip()

            # Log performance
            inference_time = time.time() - start_time
            tokens_per_second = (input_tokens + len(translation.split())) / inference_time
            print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")

            return translation
        except Exception as e:
            print(f"Translation error: {e}")
            return "Error: Could not translate text. Try shorter input."
    def request_translation(self, direction, text, callback_id):
        """Queue a translation request"""
        self.request_queue.put((direction, text, callback_id))


# Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
worker_pool = ModelWorkerPool(num_workers=1)

# Counter for request IDs
next_request_id = 0


# Fast similarity check function for finding close matches in cache
def find_similar_cached(direction, text, threshold=0.8):
    """Find similar translations in cache based on prefix matching"""
    if len(text) < 5:  # For very short inputs, look for exact matches only
        return None

    text_lower = text.lower()
    best_match = None
    best_score = 0

    for cached_key in list(translation_cache.keys()):
        cached_dir, cached_text = cached_key.split(":", 1)
        if cached_dir != direction:
            continue

        # Simple similarity - prefix matching
        if cached_text.lower().startswith(text_lower[:5]):
            similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
            if similarity > best_score and similarity > threshold:
                best_score = similarity
                best_match = translation_cache[cached_key]

    return best_match


# Gradio interface functions
def translate(direction, text, progress=gr.Progress()):
    """Queue translation request and wait for result - optimized version"""
    global next_request_id

    # Trim whitespace for better cache hits
    text = text.strip()

    # Skip empty inputs
    if not text:
        return ""

    # Check LRU cache first
    cached = get_cached_translation(direction, text)
    if cached is not None:
        return cached

    # Check main cache
    cache_key = f"{direction}:{text}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]

    # For short inputs, try to find a similar cached translation
    if len(text) < 20:
        similar = find_similar_cached(direction, text)
        if similar:
            return similar

    # Generate unique request ID
    request_id = next_request_id
    next_request_id += 1

    # Queue the request
    worker_pool.request_translation(direction, text, request_id)

    # Wait for the response with a reasonable timeout
    progress(0, desc="Translating...")
    max_wait = 20  # Reduced maximum wait time
    start_time = time.time()

    # Show progress while waiting
    while time.time() - start_time < max_wait:
        progress((time.time() - start_time) / max_wait)

        # Check for our response
        try:
            while not worker_pool.response_queue.empty():
                resp_id, result = worker_pool.response_queue.get_nowait()
                if resp_id == request_id:
                    # The worker has already stored the result in translation_cache
                    progress(1.0)
                    return result
        except queue.Empty:
            pass

        # Small sleep to prevent CPU hogging - reduced for faster response
        time.sleep(0.01)

    progress(1.0)
    return "Translation timed out. Please try a shorter text."

# Create Gradio interface with simplified UI for performance
with gr.Blocks(title="Fast CPU Translation App") as iface:
    gr.Markdown(f"""
    ## Fast CPU Translation App

    Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only - Optimized'}

    **For best performance, use short sentences or phrases.**
    """)

    with gr.Row():
        direction = gr.Radio(
            choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
            label="Translation Direction",
            value="English to Spanish"
        )

    with gr.Row():
        input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
        output_text = gr.Textbox(lines=3, label="Translation")

    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(
        fn=translate,
        inputs=[direction, input_text],
        outputs=output_text
    )

    # Add examples with common short phrases for quick results
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello"],
            ["Spanish to English", "Hola"],
            ["English to Korean", "Thank you"],
            ["Korean to English", "감사합니다"]
        ],
        inputs=[direction, input_text],
        fn=translate,
        outputs=output_text
    )

    # Add performance tips
    gr.Markdown("""
    ### Performance Tips

    - Keep text under 50 characters for fastest results
    - Common phrases are pre-cached
    - First translation may be slow, subsequent ones faster
    - Frequently used phrases use an LRU cache for speed
    """)

# Launch with optimized settings
if __name__ == "__main__":
    iface.launch(
        debug=False,
        show_error=True,
        share=False,
        quiet=True,
        server_name="0.0.0.0",
        server_port=7860
    )