import os
import time
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import threading
import queue
import multiprocessing
from functools import lru_cache

# First check if GPU is available for maximum speed
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
print(f"GPU available: {has_gpu} - {gpu_name}")


# Download model files
def get_model_path(repo_id, filename):
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)


base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF",
    "articulate-V1-q8_0.gguf"
)

# Optimize environment variables for CPU performance
os.environ["LLAMA_CUBLAS"] = "0"   # Disable CUDA since we're CPU only
os.environ["LLAMA_CLBLAST"] = "0"  # Disable OpenCL
os.environ["LLAMA_AVX"] = "1"      # Enable AVX
os.environ["LLAMA_AVX2"] = "1"     # Enable AVX2
os.environ["LLAMA_F16"] = "1"      # Use FP16 where available

# Import the right module
from llama_cpp import Llama

print("Using CPU-optimized llama-cpp-python")

# Cache for translations
translation_cache = {}
MAX_CACHE_SIZE = 5000  # Increased cache size

# Common phrases for pre-loading
COMMON_PHRASES = {
    "English to Spanish": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Spanish to English": [
        "Hola", "Gracias", "Buenos días", "¿Cómo estás?", "¿Cómo te llamas?",
        "No entiendo", "Por favor", "Lo siento", "Sí", "No", "Dónde está"
    ],
    "English to Korean": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Korean to English": [
        "안녕하세요", "감사합니다", "좋은 아침입니다", "어떻게 지내세요?", "이름이 뭐예요?",
        "이해가 안 돼요", "제발", "죄송합니다", "네", "아니요", "어디에 있어요"
    ]
}


# Implement LRU cache for better performance
@lru_cache(maxsize=100)
def get_cached_translation(direction, text):
    """LRU-cached lookup into the shared translation cache (None on a miss; callers fall back to translation_cache)."""
    return translation_cache.get(f"{direction}:{text}")


# Create a worker pool for parallel translation
class ModelWorkerPool:
    def __init__(self, num_workers=1):
        self.num_workers = num_workers
        self.request_queue = queue.Queue()
        self.response_queue = queue.Queue()
        self.workers = []
        self.initialized = False

        # Create shared model instance with optimized settings
        print("Initializing model with CPU optimizations...")
        start_time = time.time()

        # CPU optimization settings - use fewer threads for Q8 model
        cpu_count = multiprocessing.cpu_count()
        optimal_threads = max(1, min(4, cpu_count - 1))  # Use fewer threads for better performance

        # Create a smaller context size for faster inference
        self.model = Llama(
            model_path=base_model_path,
            lora_path=adapter_path,
            n_ctx=256,                  # Reduced context for faster processing
            n_threads=optimal_threads,  # Optimized thread count
            n_batch=512,                # Reduced batch size for CPU
            use_mmap=True,              # Efficient memory mapping
            n_gpu_layers=0,             # CPU only
            seed=42,                    # Consistent results
            verbose=False,              # Reduce overhead
            rope_freq_base=10000,       # Default attention parameters
            rope_freq_scale=1.0,
        )
        print(f"Model loaded in {time.time() - start_time:.2f} seconds")

        # Start worker threads
        for i in range(num_workers):
            worker = threading.Thread(target=self._worker_loop, daemon=True)
            worker.start()
            self.workers.append(worker)

        self.initialized = True

        # Pre-warm in background thread to not block startup
        warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
        warming_thread.start()

    def _worker_loop(self):
        """Worker thread that processes translation requests"""
        while True:
            request = self.request_queue.get()
            if request is None:  # Shutdown signal
                break
            direction, text, callback_id = request
            try:
                # Check LRU cache first
                cached = get_cached_translation(direction, text)
                if cached is not None:
                    self.response_queue.put((callback_id, cached))
                    continue

                # Check regular cache
                cache_key = f"{direction}:{text}"
                if cache_key in translation_cache:
                    result = translation_cache[cache_key]
                else:
                    # Process new translation
                    result = self._process_translation(direction, text)

                    # Store in regular cache, evicting the oldest entry when full
                    if len(translation_cache) >= MAX_CACHE_SIZE:
                        translation_cache.pop(next(iter(translation_cache)))
                    translation_cache[cache_key] = result

                self.response_queue.put((callback_id, result))
            except Exception as e:
                print(f"Error in worker thread: {e}")
                self.response_queue.put((callback_id, f"Error: {str(e)}"))
            finally:
                self.request_queue.task_done()

    def _prewarm_model(self):
        """Pre-compute common translations to warm up the model - minimal to save time"""
        print("Pre-warming model with essential phrases (truncated for speed)...")
        start = time.time()

        # Just warm up with one phrase per direction to speed up startup
        for direction, phrases in COMMON_PHRASES.items():
            # Only do the most common phrase to save startup time
            self._process_translation(direction, phrases[0])

        print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")

    def _process_translation(self, direction, text):
        """Optimized translation function"""
        # Skip empty inputs
        if not text or not text.strip():
            return ""

        # Start timing for performance tracking
        start_time = time.time()

        # Map language directions
        lang_map = {
            "English to Spanish": ("ENGLISH", "SPANISH"),
            "Spanish to English": ("SPANISH", "ENGLISH"),
            "Korean to English": ("KOREAN", "ENGLISH"),
            "English to Korean": ("ENGLISH", "KOREAN")
        }

        if direction not in lang_map:
            return "Invalid direction"

        source_lang, target_lang = lang_map[direction]

        # Truncate long inputs for faster processing
        max_input_length = 100  # Limit input length
        if len(text) > max_input_length:
            text = text[:max_input_length] + "..."

        # Efficient prompt format
        prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"

        # Reduce max tokens for faster inference
        input_tokens = len(text.split())
        max_tokens = min(50, max(20, int(input_tokens * 1.2)))

        # Generate translation with aggressive performance optimizations
        try:
            response = self.model.create_completion(
                prompt,
                max_tokens=max_tokens,
                temperature=0.0,      # Deterministic for faster inference
                top_k=1,              # Only consider most likely token
                top_p=1.0,            # No sampling
                repeat_penalty=1.0,   # No repeat penalty
                stream=False,         # Get complete response at once
                stop=["[/", "\n\n"],  # Stop early if possible
            )

            translation = response['choices'][0]['text'].strip()

            # Log performance
            inference_time = time.time() - start_time
            tokens_per_second = (input_tokens + len(translation.split())) / inference_time
            print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")

            return translation
        except Exception as e:
            print(f"Translation error: {e}")
            return "Error: Could not translate text. Try shorter input."
    def request_translation(self, direction, text, callback_id):
        """Queue a translation request"""
        self.request_queue.put((direction, text, callback_id))


# Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
worker_pool = ModelWorkerPool(num_workers=1)

# Counter for request IDs
next_request_id = 0


# Fast similarity check function for finding close matches in cache
def find_similar_cached(direction, text, threshold=0.8):
    """Find similar translations in cache based on prefix matching"""
    if len(text) < 5:  # For very short inputs, look for exact matches only
        return None

    text_lower = text.lower()
    best_match = None
    best_score = 0

    for cached_key in list(translation_cache.keys()):
        cached_dir, cached_text = cached_key.split(":", 1)
        if cached_dir != direction:
            continue

        # Simple similarity - prefix matching
        if cached_text.lower().startswith(text_lower[:5]):
            similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
            if similarity > best_score and similarity > threshold:
                best_score = similarity
                best_match = translation_cache[cached_key]

    return best_match


# Gradio interface functions
def translate(direction, text, progress=gr.Progress()):
    """Queue translation request and wait for result - optimized version"""
    global next_request_id

    # Trim whitespace for better cache hits
    text = text.strip()

    # Skip empty inputs
    if not text:
        return ""

    # Check LRU cache first
    cached = get_cached_translation(direction, text)
    if cached is not None:
        return cached

    # Check main cache
    cache_key = f"{direction}:{text}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]

    # For short inputs, try to find a similar cached translation
    if len(text) < 20:
        similar = find_similar_cached(direction, text)
        if similar:
            return similar

    # Generate unique request ID
    request_id = next_request_id
    next_request_id += 1

    # Queue the request
    worker_pool.request_translation(direction, text, request_id)

    # Wait for the response with a reasonable timeout
    progress(0, desc="Translating...")
    max_wait = 20  # Reduced maximum wait time
    start_time = time.time()

    # Show progress while waiting
    while time.time() - start_time < max_wait:
        progress((time.time() - start_time) / max_wait)

        # Check for our response
        try:
            while not worker_pool.response_queue.empty():
                resp_id, result = worker_pool.response_queue.get_nowait()
                if resp_id == request_id:
                    # The worker has already stored the result in translation_cache
                    progress(1.0)
                    return result
        except queue.Empty:
            pass

        # Small sleep to prevent CPU hogging - reduced for faster response
        time.sleep(0.01)

    progress(1.0)
    return "Translation timed out. Please try a shorter text."

# Create Gradio interface with simplified UI for performance
with gr.Blocks(title="Fast CPU Translation App") as iface:
    gr.Markdown(f"""
    ## Fast CPU Translation App

    Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only - Optimized'}

    **For best performance, use short sentences or phrases.**
    """)

    with gr.Row():
        direction = gr.Radio(
            choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
            label="Translation Direction",
            value="English to Spanish"
        )

    with gr.Row():
        input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
        output_text = gr.Textbox(lines=3, label="Translation")

    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(
        fn=translate,
        inputs=[direction, input_text],
        outputs=output_text
    )

    # Add examples with common short phrases for quick results
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello"],
            ["Spanish to English", "Hola"],
            ["English to Korean", "Thank you"],
            ["Korean to English", "감사합니다"]
        ],
        inputs=[direction, input_text],
        fn=translate,
        outputs=output_text
    )

    # Add performance tips
    gr.Markdown("""
    ### Performance Tips

    - Keep text under 50 characters for fastest results
    - Common phrases are pre-cached
    - First translation may be slow, subsequent ones faster
    - Frequently used phrases use an LRU cache for speed
    """)

# Launch with optimized settings
if __name__ == "__main__":
    iface.launch(
        debug=False,
        show_error=True,
        share=False,
        quiet=True,
        server_name="0.0.0.0",
        server_port=7860
    )