johnpaulbin committed on
Commit 3ca3b8c · verified · 1 Parent(s): e73b223

Update app.py

Files changed (1)
  1. app.py +350 -233
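The diff below replaces the old in-process ModelWorkerPool with a single ModelWorker that owns the llama.cpp model in a background thread and talks to the Gradio handler through a request/response queue pair. As a reading aid, here is a minimal, self-contained sketch of that queue-based pattern; the fake_translate stand-in is illustrative only and is not part of the commit.

import queue
import threading
import time

def fake_translate(direction, text):
    # Stand-in for the llama.cpp call made inside ModelWorker._process_translation.
    time.sleep(0.1)
    return f"[{direction}] {text}"

request_q, response_q = queue.Queue(), queue.Queue()

def worker_loop():
    # Background thread owns the expensive resource and serves queued requests.
    while True:
        item = request_q.get()
        if item is None:  # shutdown signal
            break
        direction, text, req_id = item
        response_q.put((req_id, fake_translate(direction, text)))

threading.Thread(target=worker_loop, daemon=True).start()

# Caller side: enqueue a request, then poll for the matching id with a timeout,
# which is what the Gradio translate() handler in the diff does.
request_q.put(("English to Spanish", "Hello", 0))
deadline = time.time() + 5
while time.time() < deadline:
    try:
        req_id, result = response_q.get(timeout=0.05)
    except queue.Empty:
        continue
    if req_id == 0:
        print(result)
        break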
app.py CHANGED
@@ -1,23 +1,72 @@
1
  import os
2
  import time
3
- import torch
4
- import gradio as gr
5
- from huggingface_hub import hf_hub_download
6
  import threading
7
  import queue
8
  import multiprocessing
9
- from functools import lru_cache
10
 
11
- # First check if GPU is available for maximum speed
12
  has_gpu = torch.cuda.is_available()
13
  gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
14
  print(f"GPU available: {has_gpu} - {gpu_name}")
15
 
16
  # Download model files
17
  def get_model_path(repo_id, filename):
18
  print(f"Obtaining {filename}...")
19
- return hf_hub_download(repo_id=repo_id, filename=filename)
20
 
 
21
  base_model_path = get_model_path(
22
  "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
23
  "articulate-11-expspanish-base-merged-q8_0.gguf"
@@ -27,146 +76,120 @@ adapter_path = get_model_path(
27
  "articulate-V1-q8_0.gguf"
28
  )
29
 
30
- # Optimize environment variables for CPU performance
31
- os.environ["LLAMA_CUBLAS"] = "0" # Disable CUDA since we're CPU only
32
- os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
33
- os.environ["LLAMA_AVX"] = "1" # Enable AVX
34
- os.environ["LLAMA_AVX2"] = "1" # Enable AVX2
35
- os.environ["LLAMA_F16"] = "1" # Use FP16 where available
36
 
37
- # Import the right module
38
  from llama_cpp import Llama
39
- print("Using CPU-optimized llama-cpp-python")
40
 
41
- # Cache for translations
42
  translation_cache = {}
43
- MAX_CACHE_SIZE = 5000 # Increased cache size
44
-
45
- # Common phrases for pre-loading
46
- COMMON_PHRASES = {
47
- "English to Spanish": [
48
- "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
49
- "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
50
- ],
51
- "Spanish to English": [
52
- "Hola", "Gracias", "Buenos dรญas", "ยฟCรณmo estรกs?", "ยฟCรณmo te llamas?",
53
- "No entiendo", "Por favor", "Lo siento", "Sรญ", "No", "Dรณnde estรก"
54
- ],
55
- "English to Korean": [
56
- "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
57
- "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
58
- ],
59
- "Korean to English": [
60
- "์•ˆ๋…•ํ•˜์„ธ์š”", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค", "์ข‹์€ ์•„์นจ์ž…๋‹ˆ๋‹ค", "์–ด๋–ป๊ฒŒ ์ง€๋‚ด์„ธ์š”?", "์ด๋ฆ„์ด ๋ญ์˜ˆ์š”?",
61
- "์ดํ•ด๊ฐ€ ์•ˆ ๋ผ์š”", "์ œ๋ฐœ", "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค", "๋„ค", "์•„๋‹ˆ์š”", "์–ด๋””์— ์žˆ์–ด์š”"
62
- ]
63
- }
64
-
65
- # Implement LRU cache for better performance
66
- @lru_cache(maxsize=100)
67
- def get_cached_translation(direction, text):
68
- """LRU cache for translations"""
69
- return None # This gets bypassed when there's a cache hit
70
 
71
- # Create a worker pool for parallel translation
72
- class ModelWorkerPool:
73
- def __init__(self, num_workers=1):
74
- self.num_workers = num_workers
75
  self.request_queue = queue.Queue()
76
  self.response_queue = queue.Queue()
77
- self.workers = []
78
- self.initialized = False
79
-
80
- # Create shared model instance with optimized settings
81
- print("Initializing model with CPU optimizations...")
82
- start_time = time.time()
83
-
84
- # CPU optimization settings - use fewer threads for Q8 model
85
- cpu_count = multiprocessing.cpu_count()
86
- optimal_threads = max(1, min(4, cpu_count - 1)) # Use fewer threads for better performance
87
-
88
- # Create a smaller context size for faster inference
89
- self.model = Llama(
90
- model_path=base_model_path,
91
- lora_path=adapter_path,
92
- n_ctx=256, # Reduced context for faster processing
93
- n_threads=optimal_threads, # Optimized thread count
94
- n_batch=512, # Reduced batch size for CPU
95
- use_mmap=True, # Efficient memory mapping
96
- n_gpu_layers=0, # CPU only
97
- seed=42, # Consistent results
98
- verbose=False, # Reduce overhead
99
- rope_freq_base=10000, # Default attention parameters
100
- rope_freq_scale=1.0,
101
- )
102
- print(f"Model loaded in {time.time() - start_time:.2f} seconds")
103
-
104
- # Start worker threads
105
- for i in range(num_workers):
106
- worker = threading.Thread(target=self._worker_loop, daemon=True)
107
- worker.start()
108
- self.workers.append(worker)
109
-
110
- self.initialized = True
111
-
112
- # Pre-warm in background thread to not block startup
113
- warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
114
- warming_thread.start()
115
 
116
- def _worker_loop(self):
117
- """Worker thread that processes translation requests"""
118
  while True:
119
  try:
 
120
  request = self.request_queue.get()
121
- if request is None: # Shutdown signal
122
  break
123
-
124
- direction, text, callback_id = request
125
 
126
- # Check LRU cache first
127
- cached = get_cached_translation(direction, text)
128
- if cached is not None:
129
- self.response_queue.put((callback_id, cached))
130
- self.request_queue.task_done()
131
- continue
132
 
133
- # Check regular cache
134
- cache_key = f"{direction}:{text}"
135
- if cache_key in translation_cache:
136
- result = translation_cache[cache_key]
137
- else:
138
- # Process new translation
139
- result = self._process_translation(direction, text)
140
- # Store in regular cache
141
- if len(translation_cache) >= MAX_CACHE_SIZE:
142
- translation_cache.pop(next(iter(translation_cache)))
143
- translation_cache[cache_key] = result
144
 
145
- self.response_queue.put((callback_id, result))
146
- self.request_queue.task_done()
147
  except Exception as e:
148
- print(f"Error in worker thread: {e}")
149
- self.response_queue.put((callback_id, f"Error: {str(e)}"))
150
- self.request_queue.task_done()
151
 
152
- def _prewarm_model(self):
153
- """Pre-compute common translations to warm up the model - minimal to save time"""
154
- print("Pre-warming model with essential phrases (truncated for speed)...")
155
- start = time.time()
156
-
157
- # Just warm up with one phrase per direction to speed up startup
158
- for direction, phrases in COMMON_PHRASES.items():
159
- self._process_translation(direction, phrases[0])
160
- # Only do the most common phrase to save startup time
161
 
162
- print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")
163
 
164
  def _process_translation(self, direction, text):
165
- """Optimized translation function"""
166
- # Skip empty inputs
167
  if not text or not text.strip():
168
  return ""
169
 
 
170
  # Start timing for performance tracking
171
  start_time = time.time()
172
 
@@ -183,185 +206,279 @@ class ModelWorkerPool:
183
 
184
  source_lang, target_lang = lang_map[direction]
185
 
186
- # Truncate long inputs for faster processing
187
- max_input_length = 100 # Limit input length
188
- if len(text) > max_input_length:
189
- text = text[:max_input_length] + "..."
190
-
191
  # Efficient prompt format
192
  prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
193
 
194
- # Reduce max tokens for faster inference
195
- input_tokens = len(text.split())
196
- max_tokens = min(50, max(20, int(input_tokens * 1.2)))
197
 
198
- # Generate translation with aggressive performance optimizations
199
- try:
200
- response = self.model.create_completion(
201
- prompt,
202
- max_tokens=max_tokens,
203
- temperature=0.0, # Deterministic for faster inference
204
- top_k=1, # Only consider most likely token
205
- top_p=1.0, # No sampling
206
- repeat_penalty=1.0, # No repeat penalty
207
- stream=False, # Get complete response at once
208
- stop=["[/", "\n\n"], # Stop early if possible
209
- )
210
-
211
- translation = response['choices'][0]['text'].strip()
212
-
213
- # Log performance
214
- inference_time = time.time() - start_time
215
- tokens_per_second = (input_tokens + len(translation.split())) / inference_time
216
- print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
217
-
218
- return translation
219
- except Exception as e:
220
- print(f"Translation error: {e}")
221
- return f"Error: Could not translate text. Try shorter input."
 
222
 
223
  def request_translation(self, direction, text, callback_id):
224
  """Queue a translation request"""
225
  self.request_queue.put((direction, text, callback_id))
226
 
227
- # Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
228
- worker_pool = ModelWorkerPool(num_workers=1)
229
 
230
  # Counter for request IDs
231
  next_request_id = 0
232
 
233
- # Fast similarity check function for finding close matches in cache
234
- def find_similar_cached(direction, text, threshold=0.8):
235
- """Find similar translations in cache based on prefix matching"""
236
- if len(text) < 5: # For very short inputs, look for exact matches
237
- return None
238
-
239
- text_lower = text.lower()
240
- best_match = None
241
- best_score = 0
242
 
243
- for cached_key in list(translation_cache.keys()):
244
- cached_dir, cached_text = cached_key.split(":", 1)
245
- if cached_dir != direction:
246
  continue
247
 
248
- # Simple similarity - prefix matching
249
- if cached_text.lower().startswith(text_lower[:5]):
250
- similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
251
- if similarity > best_score and similarity > threshold:
252
- best_score = similarity
253
- best_match = translation_cache[cached_key]
 
 
254
 
255
- return best_match
256
 
257
  # Gradio interface functions
258
  def translate(direction, text, progress=gr.Progress()):
259
- """Queue translation request and wait for result - optimized version"""
260
  global next_request_id
261
 
262
- # Trim whitespace for better cache hits
263
- text = text.strip()
264
-
265
  # Skip empty inputs
266
- if not text:
267
  return ""
268
 
269
- # Check LRU cache first
270
- cached = get_cached_translation(direction, text)
271
- if cached is not None:
272
- return cached
273
-
274
- # Check main cache
275
  cache_key = f"{direction}:{text}"
276
  if cache_key in translation_cache:
277
  return translation_cache[cache_key]
278
 
279
- # For short inputs, try to find similar cached
280
- if len(text) < 20:
281
- similar = find_similar_cached(direction, text)
282
- if similar:
283
- return similar
284
 
285
- # Generate unique request ID
286
  request_id = next_request_id
287
  next_request_id += 1
288
 
289
  # Queue the request
290
- worker_pool.request_translation(direction, text, request_id)
291
 
292
- # Wait for the response with reasonable timeout
293
- progress(0, desc="Translating...")
294
- max_wait = 20 # Reduced maximum wait time
295
  start_time = time.time()
 
296
 
297
- # Show progress while waiting
298
  while time.time() - start_time < max_wait:
299
- progress((time.time() - start_time) / max_wait)
300
 
301
  # Check for our response
302
  try:
303
- while not worker_pool.response_queue.empty():
304
- resp_id, result = worker_pool.response_queue.get_nowait()
305
  if resp_id == request_id:
306
- # Update LRU cache
307
- get_cached_translation.__wrapped__.__defaults__ = (result,)
308
  progress(1.0)
309
  return result
310
  except queue.Empty:
311
  pass
312
 
313
- # Small sleep to prevent CPU hogging - reduced for faster response
314
- time.sleep(0.01)
315
 
316
  progress(1.0)
317
- return "Translation timed out. Please try a shorter text."
318
 
319
- # Create Gradio interface with simplified UI for performance
320
- with gr.Blocks(title="Fast CPU Translation App") as iface:
321
  gr.Markdown(f"""
322
- ## Fast CPU Translation App
323
- Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only - Optimized'}
324
- **For best performance, use short sentences or phrases.**
325
  """)
326
 
327
  with gr.Row():
328
- direction = gr.Radio(
329
- choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
330
  label="Translation Direction",
331
  value="English to Spanish"
332
  )
333
 
334
  with gr.Row():
335
- input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
336
- output_text = gr.Textbox(lines=3, label="Translation")
337
 
338
  # Add translate button
339
  translate_btn = gr.Button("Translate")
340
  translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
341
 
342
- # Add examples with common short phrases for quick results
343
  gr.Examples(
344
  examples=[
345
- ["English to Spanish", "Hello"],
346
- ["Spanish to English", "Hola"],
347
- ["English to Korean", "Thank you"],
348
- ["Korean to English", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค"]
349
  ],
350
  inputs=[direction, input_text],
351
  fn=translate,
352
  outputs=output_text
353
  )
354
-
355
- # Add performance tips
356
- gr.Markdown("""
357
- ### Performance Tips
358
- - Keep text under 50 characters for fastest results
359
- - Common phrases are pre-cached
360
- - First translation may be slow, subsequent ones faster
361
- - Frequently used phrases use an LRU cache for speed
362
- """)
363
-
364
 
365
- iface.launch(
366
- show_error=True,
367
- )
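Both the removed worker code above and the new _process_translation in the rewritten file cap the cache with translation_cache.pop(next(iter(translation_cache))). A small sketch of why that idiom works: dicts preserve insertion order in Python 3.7+, so popping the first key is FIFO eviction of the oldest entry (not true LRU, since lookups do not refresh an entry's position).

cache = {}
MAX_CACHE_SIZE = 3
for key in ["a", "b", "c", "d"]:
    if len(cache) >= MAX_CACHE_SIZE:
        cache.pop(next(iter(cache)))  # evicts "a", the oldest inserted key
    cache[key] = key.upper()
print(list(cache))  # ['b', 'c', 'd']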
 
1
  import os
2
  import time
 
 
 
3
  import threading
4
  import queue
5
  import multiprocessing
6
+ from pathlib import Path
7
+ import torch
8
+ import gradio as gr
9
+ from huggingface_hub import hf_hub_download
10
+ import numpy as np
11
+
12
+ # Set up environment variables for CPU optimization
13
+ os.environ["OMP_NUM_THREADS"] = str(max(1, multiprocessing.cpu_count() - 1)) # Optimal OpenMP threads
14
+ os.environ["MKL_NUM_THREADS"] = str(max(1, multiprocessing.cpu_count() - 1)) # Optimal MKL threads
15
+ os.environ["LLAMA_AVX"] = "1"
16
+ os.environ["LLAMA_AVX2"] = "1"
17
+ os.environ["LLAMA_F16"] = "1"
18
+
19
+ # Cache directories
20
+ CACHE_DIR = Path.home() / ".cache" / "fast_translate"
21
+ MODEL_CACHE = CACHE_DIR / "models"
22
+ QUANTIZED_CACHE = CACHE_DIR / "quantized"
23
+ os.makedirs(MODEL_CACHE, exist_ok=True)
24
+ os.makedirs(QUANTIZED_CACHE, exist_ok=True)
25
 
26
+ # Check if we're running on CPU
27
  has_gpu = torch.cuda.is_available()
28
  gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
29
  print(f"GPU available: {has_gpu} - {gpu_name}")
30
 
31
+ # Configure CPU settings
32
+ cpu_count = multiprocessing.cpu_count()
33
+ optimal_threads = max(4, cpu_count - 1) # Leave one core free
34
+ print(f"Using {optimal_threads} of {cpu_count} CPU cores")
35
+
36
  # Download model files
37
  def get_model_path(repo_id, filename):
38
  print(f"Obtaining {filename}...")
39
+ # Download to our custom cache location
40
+ return hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=MODEL_CACHE)
41
+
42
+ # Function to quantize model to int4 or int8
43
+ def quantize_model(input_model_path, output_model_path, quantization_type="q4_0"):
44
+ """Quantize model to lower precision for faster inference on CPU"""
45
+ try:
46
+ from llama_cpp import llama_model_quantize
47
+
48
+ # Check if quantized model already exists
49
+ if os.path.exists(output_model_path):
50
+ print(f"Using existing quantized model: {output_model_path}")
51
+ return output_model_path
52
+
53
+ print(f"Quantizing model to {quantization_type}...")
54
+ start_time = time.time()
55
+
56
+ # Quantize using llama-cpp-python built-in quantization
57
+ llama_model_quantize(
58
+ input_model_path,
59
+ output_model_path,
60
+ quantization_type
61
+ )
62
+
63
+ print(f"Quantization completed in {time.time() - start_time:.2f}s")
64
+ return output_model_path
65
+ except Exception as e:
66
+ print(f"Quantization failed: {e}, using original model")
67
+ return input_model_path
68
 
69
+ # Download models
70
  base_model_path = get_model_path(
71
  "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
72
  "articulate-11-expspanish-base-merged-q8_0.gguf"
 
76
  "articulate-V1-q8_0.gguf"
77
  )
78
 
79
+ # Quantize models (creates int4 versions for faster CPU inference)
80
+ quantized_base_path = str(QUANTIZED_CACHE / "articulate-base-q4_0.gguf")
81
+ quantized_adapter_path = str(QUANTIZED_CACHE / "articulate-adapter-q4_0.gguf")
82
+ base_model_path = quantize_model(base_model_path, quantized_base_path, "q4_0")
83
+ adapter_path = quantize_model(adapter_path, quantized_adapter_path, "q4_0")
 
84
 
85
+ # Import after setting environment variables
86
  from llama_cpp import Llama
 
87
 
88
+ # Translation cache
89
  translation_cache = {}
90
+ MAX_CACHE_SIZE = 1000
91
 
92
+ # Model worker with batching support
93
+ class ModelWorker:
94
+ def __init__(self):
95
+ self.model = None
96
  self.request_queue = queue.Queue()
97
  self.response_queue = queue.Queue()
98
+ self.batch_queue = []
99
+ self.batch_event = threading.Event()
100
+ self.batch_size = 4 # Process up to 4 requests at once
101
+ self.batch_timeout = 0.1 # Wait 100ms max to collect batch
102
+ self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
103
+ self.batch_thread = threading.Thread(target=self._batch_loop, daemon=True)
104
+ self.worker_thread.start()
105
+ self.batch_thread.start()
106
 
107
+ def _batch_loop(self):
108
+ """Collect requests into batches for more efficient processing"""
109
  while True:
110
  try:
111
+ # Get a request
112
  request = self.request_queue.get()
113
+ if request is None:
114
  break
 
 
115
 
116
+ # Add to batch
117
+ self.batch_queue.append(request)
118
+
119
+ # Try to collect more requests for the batch
120
+ batch_start = time.time()
121
+ while (len(self.batch_queue) < self.batch_size and
122
+ time.time() - batch_start < self.batch_timeout):
123
+ try:
124
+ req = self.request_queue.get_nowait()
125
+ if req is None:
126
+ break
127
+ self.batch_queue.append(req)
128
+ except queue.Empty:
129
+ time.sleep(0.01)
130
 
131
+ # Signal worker to process the batch
132
+ current_batch = self.batch_queue.copy()
133
+ self.batch_queue = []
134
+ for req in current_batch:
135
+ self._process_request(req)
136
 
 
 
137
  except Exception as e:
138
+ print(f"Error in batch thread: {e}")
 
 
139
 
140
+ def _worker_loop(self):
141
+ """Initialize model and process requests"""
142
+ try:
143
+ # Initialize model with optimized settings
144
+ print("Initializing model in background thread...")
145
+ start_time = time.time()
146
+
147
+ # Create model context with very optimized settings for CPU
148
+ self.model = Llama(
149
+ model_path=base_model_path,
150
+ lora_path=adapter_path,
151
+ n_ctx=256, # Smaller context for speed
152
+ n_threads=optimal_threads, # Use all but one CPU core
153
+ n_batch=512, # Smaller batch for CPU
154
+ use_mmap=True, # Memory mapping (more efficient)
155
+ n_gpu_layers=0, # Force CPU only
156
+ seed=42, # Consistent results
157
+ rope_freq_base=10000, # Default RoPE settings
158
+ rope_freq_scale=1.0,
159
+ verbose=False # Reduce overhead
160
+ )
161
 
162
+ print(f"Model loaded in {time.time() - start_time:.2f} seconds")
163
+
164
+ # Pre-warm the model with common phrases by running a simple inference
165
+ print("Pre-warming model...")
166
+ self.model.create_completion("[ENGLISH]hello[SPANISH]", max_tokens=8)
167
+ print("Model ready for translation")
168
+
169
+ except Exception as e:
170
+ print(f"Failed to initialize model: {e}")
171
+
172
+ def _process_request(self, request):
173
+ """Process a single translation request"""
174
+ try:
175
+ direction, text, callback_id = request
176
+ result = self._process_translation(direction, text)
177
+ self.response_queue.put((callback_id, result))
178
+ except Exception as e:
179
+ print(f"Error processing request: {e}")
180
+ self.response_queue.put((callback_id, f"Error: {str(e)}"))
181
 
182
  def _process_translation(self, direction, text):
183
+ """Translate text with optimized settings"""
 
184
  if not text or not text.strip():
185
  return ""
186
 
187
+ # Check cache first for faster response
188
+ cache_key = f"{direction}:{text}"
189
+ if cache_key in translation_cache:
190
+ print("Cache hit!")
191
+ return translation_cache[cache_key]
192
+
193
  # Start timing for performance tracking
194
  start_time = time.time()
195
 
 
206
 
207
  source_lang, target_lang = lang_map[direction]
208
 
209
  # Efficient prompt format
210
  prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
211
 
212
+ # Estimate appropriate token length based on input
213
+ input_tokens = min(100, max(10, len(text.split())))
214
+ max_tokens = min(100, max(25, int(input_tokens * 1.3)))
215
 
216
+ # Generate translation with aggressively optimized settings for speed
217
+ response = self.model.create_completion(
218
+ prompt,
219
+ max_tokens=max_tokens,
220
+ temperature=0.0, # Deterministic
221
+ top_k=1, # Most likely token
222
+ top_p=1.0, # No sampling
223
+ repeat_penalty=1.0, # No penalty
224
+ stream=False # Get complete response
225
+ )
226
+
227
+ translation = response['choices'][0]['text'].strip()
228
+
229
+ # Cache result
230
+ if len(translation_cache) >= MAX_CACHE_SIZE:
231
+ # Remove oldest entry (first key)
232
+ translation_cache.pop(next(iter(translation_cache)))
233
+ translation_cache[cache_key] = translation
234
+
235
+ # Log performance
236
+ inference_time = time.time() - start_time
237
+ tokens_per_second = (input_tokens + len(translation.split())) / inference_time
238
+ print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
239
+
240
+ return translation
241
 
242
  def request_translation(self, direction, text, callback_id):
243
  """Queue a translation request"""
244
  self.request_queue.put((direction, text, callback_id))
245
 
246
+ # Model preloading thread that preloads and pre-computes common translations
247
+ def preload_common_phrases(worker):
248
+ # Dictionary of common phrases that will benefit from caching
249
+ common_phrases = {
250
+ "English to Spanish": [
251
+ "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
252
+ "I don't understand", "Please", "Sorry", "Yes", "No", "Where is",
253
+ "How much does it cost?", "What time is it?", "I don't speak Spanish",
254
+ "Where is the bathroom?", "I need help", "Can you help me?"
255
+ ],
256
+ "Spanish to English": [
257
+ "Hola", "Gracias", "Buenos dรญas", "ยฟCรณmo estรกs?", "ยฟCรณmo te llamas?",
258
+ "No entiendo", "Por favor", "Lo siento", "Sรญ", "No", "Dรณnde estรก",
259
+ "ยฟCuรกnto cuesta?", "ยฟQuรฉ hora es?", "No hablo espaรฑol", "ยฟDรณnde estรก el baรฑo?",
260
+ "Necesito ayuda", "ยฟPuedes ayudarme?"
261
+ ],
262
+ "English to Korean": [
263
+ "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
264
+ "I don't understand", "Please", "Sorry", "Yes", "No", "Where is",
265
+ "How much is this?", "What time is it?", "I don't speak Korean"
266
+ ],
267
+ "Korean to English": [
268
+ "์•ˆ๋…•ํ•˜์„ธ์š”", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค", "์ข‹์€ ์•„์นจ์ž…๋‹ˆ๋‹ค", "์–ด๋–ป๊ฒŒ ์ง€๋‚ด์„ธ์š”?", "์ด๋ฆ„์ด ๋ญ์˜ˆ์š”?",
269
+ "์ดํ•ด๊ฐ€ ์•ˆ ๋ผ์š”", "์ œ๋ฐœ", "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค", "๋„ค", "์•„๋‹ˆ์š”", "์–ด๋””์— ์žˆ์–ด์š”",
270
+ "์ด๊ฑฐ ์–ผ๋งˆ์˜ˆ์š”?", "์ง€๊ธˆ ๋ช‡ ์‹œ์˜ˆ์š”?", "ํ•œ๊ตญ์–ด๋ฅผ ๋ชปํ•ด์š”"
271
+ ]
272
+ }
273
+
274
+ preload_requests = []
275
+ for direction, phrases in common_phrases.items():
276
+ for phrase in phrases:
277
+ preload_requests.append((direction, phrase, f"preload_{len(preload_requests)}"))
278
+
279
+ # Process preloading in a separate thread
280
+ def preloader():
281
+ print(f"Preloading {len(preload_requests)} common phrases in background...")
282
+ for request in preload_requests:
283
+ worker.request_translation(*request)
284
+ # Small sleep to avoid overwhelming the queue
285
+ time.sleep(0.1)
286
+ print("Preloading complete")
287
+
288
+ thread = threading.Thread(target=preloader, daemon=True)
289
+ thread.start()
290
+ return thread
291
+
292
+ # Create worker instance
293
+ worker = ModelWorker()
294
+
295
+ # Start preloading common phrases in background
296
+ preload_thread = preload_common_phrases(worker)
297
 
298
  # Counter for request IDs
299
  next_request_id = 0
300
 
301
+ # Implementation of a faster sentence splitter for batching
302
+ def split_sentences(text, max_length=50):
303
+ """Split text into manageable chunks for faster translation"""
304
+ if len(text) <= max_length:
305
306
 
307
+ # Split on natural boundaries
308
+ delimiters = ['. ', '! ', '? ', '.\n', '!\n', '?\n', '\n\n']
309
+ chunks = []
310
+ current_chunk = ""
311
+
312
+ lines = text.split('\n')
313
+ for line in lines:
314
+ if not line.strip():
315
+ if current_chunk:
316
+ chunks.append(current_chunk)
317
+ current_chunk = ""
318
  continue
319
 
320
+ words = line.split(' ')
321
+ for word in words:
322
+ test_chunk = f"{current_chunk} {word}".strip()
323
+ if len(test_chunk) > max_length:
324
+ chunks.append(current_chunk)
325
+ current_chunk = word
326
+ else:
327
+ current_chunk = test_chunk
328
 
329
+ # Check for natural breaks
330
+ for delimiter in delimiters:
331
+ if delimiter in current_chunk[-len(delimiter):]:
332
+ chunks.append(current_chunk)
333
+ current_chunk = ""
334
+ break
335
+
336
+ if current_chunk:
337
+ chunks.append(current_chunk)
338
+
339
+ return chunks
340
 
341
  # Gradio interface functions
342
  def translate(direction, text, progress=gr.Progress()):
343
+ """Fast translation with batching and caching"""
344
  global next_request_id
345
346
  # Skip empty inputs
347
+ if not text or not text.strip():
348
  return ""
349
 
350
+ # Check exact cache hit
351
  cache_key = f"{direction}:{text}"
352
  if cache_key in translation_cache:
353
  return translation_cache[cache_key]
354
 
355
+ # For longer texts, split into sentences for faster processing
356
+ if len(text) > 50:
357
+ progress(0.1, desc="Processing text...")
358
+ chunks = split_sentences(text)
359
+ if len(chunks) > 1:
360
+ results = []
361
+ for i, chunk in enumerate(chunks):
362
+ # Check if this chunk is in cache
363
+ chunk_key = f"{direction}:{chunk}"
364
+ if chunk_key in translation_cache:
365
+ results.append(translation_cache[chunk_key])
366
+ continue
367
+
368
+ # Request translation for this chunk
369
+ chunk_id = next_request_id
370
+ next_request_id += 1
371
+ worker.request_translation(direction, chunk, chunk_id)
372
+
373
+ # Wait for response
374
+ chunk_start = time.time()
375
+ while time.time() - chunk_start < 10: # 10 second timeout per chunk
376
+ progress((i + 0.5) / len(chunks), desc=f"Translating part {i+1}/{len(chunks)}")
377
+
378
+ try:
379
+ while not worker.response_queue.empty():
380
+ resp_id, result = worker.response_queue.get_nowait()
381
+ if resp_id == chunk_id:
382
+ results.append(result)
383
+ chunk_found = True
384
+ break
385
+ except queue.Empty:
386
+ pass
387
+
388
+ time.sleep(0.05)
389
+
390
+ if len(results) != i + 1:
391
+ results.append(f"[Translation failed for part {i+1}]")
392
+
393
+ combined = " ".join(results)
394
+ translation_cache[cache_key] = combined
395
+ progress(1.0)
396
+ return combined
397
 
398
+ # For single sentences
399
  request_id = next_request_id
400
  next_request_id += 1
401
 
402
  # Queue the request
403
+ worker.request_translation(direction, text, request_id)
404
 
405
+ # Wait for the response
406
+ progress(0.2, desc="Translating...")
 
407
  start_time = time.time()
408
+ max_wait = 20 # Maximum wait time in seconds
409
 
 
410
  while time.time() - start_time < max_wait:
411
+ progress(0.2 + 0.8 * ((time.time() - start_time) / max_wait), desc="Translating...")
412
 
413
  # Check for our response
414
  try:
415
+ while not worker.response_queue.empty():
416
+ resp_id, result = worker.response_queue.get_nowait()
417
  if resp_id == request_id:
 
 
418
  progress(1.0)
419
  return result
420
  except queue.Empty:
421
  pass
422
 
423
+ # Small sleep to prevent CPU hogging
424
+ time.sleep(0.05)
425
 
426
  progress(1.0)
427
+ return "Translation timed out. Please try again with a shorter text."
428
 
429
+ # Create Gradio interface
430
+ with gr.Blocks(title="Ultra-Fast Translation App (CPU Optimized)") as iface:
431
  gr.Markdown(f"""
432
+ ## Ultra-Fast Translation App (CPU Optimized)
433
+ Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU optimized with int4 quantization'}
 
434
  """)
435
 
436
  with gr.Row():
437
+ direction = gr.Dropdown(
438
+ choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
439
  label="Translation Direction",
440
  value="English to Spanish"
441
  )
442
 
443
  with gr.Row():
444
+ input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...")
445
+ output_text = gr.Textbox(lines=5, label="Translation")
446
 
447
  # Add translate button
448
  translate_btn = gr.Button("Translate")
449
  translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
450
 
451
+ # Optimization options
452
+ with gr.Accordion("Performance Tips", open=True):
453
+ gr.Markdown("""
454
+ ### Speed Optimization Tips
455
+ - ✅ The model has been quantized to int4 for faster CPU execution
456
+ - ✅ Common phrases are pre-cached for instant results
457
+ - ✅ Long text is automatically split into smaller chunks
458
+ - ✅ First translation will be slower as the model warms up
459
+ - ✅ Short sentences (< 50 chars) translate much faster
460
+ """)
461
+
462
+ # Add examples with preloaded common phrases
463
  gr.Examples(
464
  examples=[
465
+ ["English to Spanish", "Hello, how are you today?"],
466
+ ["Spanish to English", "Hola, ยฟcรณmo estรกs hoy?"],
467
+ ["English to Korean", "The weather is nice today."],
468
+ ["Korean to English", "์•ˆ๋…•ํ•˜์„ธ์š”, ๋งŒ๋‚˜์„œ ๋ฐ˜๊ฐ‘์Šต๋‹ˆ๋‹ค."]
469
  ],
470
  inputs=[direction, input_text],
471
  fn=translate,
472
  outputs=output_text
473
  )
474
 
475
+ # Launch with optimized settings
476
+ if __name__ == "__main__":
477
+ iface.launch(
478
+ debug=False,
479
+ show_error=True,
480
+ share=False,
481
+ quiet=True,
482
+ server_name="0.0.0.0",
483
+ server_port=7860
484
+ )