johnpaulbin committed
Commit cedd7b9 · verified · 1 Parent(s): 42b5300

Update app.py

Files changed (1):
  1. app.py +200 -137
app.py CHANGED
@@ -6,6 +6,7 @@ from huggingface_hub import hf_hub_download
 import threading
 import queue
 import multiprocessing
+from functools import lru_cache
 
 # First check if GPU is available for maximum speed
 has_gpu = torch.cuda.is_available()
@@ -26,37 +27,22 @@ adapter_path = get_model_path(
     "articulate-V1-q8_0.gguf"
 )
 
-# Set up optimized environment variables for llama-cpp-python
-os.environ["LLAMA_CUBLAS"] = "1" if has_gpu else "0"
+# Optimize environment variables for CPU performance
+os.environ["LLAMA_CUBLAS"] = "0" # Disable CUDA since we're CPU only
 os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
-# For CPU: Use AVX2/AVX512/AVX-VNNI instruction sets if available
-os.environ["LLAMA_AVX"] = "1"
-os.environ["LLAMA_AVX2"] = "1"
-os.environ["LLAMA_F16"] = "1" # Use FP16 where available
+os.environ["LLAMA_AVX"] = "1" # Enable AVX
+os.environ["LLAMA_AVX2"] = "1" # Enable AVX2
+os.environ["LLAMA_F16"] = "1" # Use FP16 where available
 
-# Determine the most optimized backend
-if has_gpu:
-    try:
-        from llama_cpp_python.llama_cpp.llama import Llama as GPULlama
-        LlamaClass = GPULlama
-        print("Using GPU-accelerated llama-cpp-python")
-        n_gpu_layers = -1 # Use all layers on GPU
-    except ImportError:
-        from llama_cpp import Llama
-        LlamaClass = Llama
-        print("Using standard llama-cpp-python with GPU acceleration")
-        n_gpu_layers = -1 # Use all layers on GPU
-else:
-    from llama_cpp import Llama
-    LlamaClass = Llama
-    print("Using CPU-only llama-cpp-python")
-    n_gpu_layers = 0
+# Import the right module
+from llama_cpp import Llama
+print("Using CPU-optimized llama-cpp-python")
 
 # Cache for translations
 translation_cache = {}
-MAX_CACHE_SIZE = 1000
+MAX_CACHE_SIZE = 5000 # Increased cache size
 
-# Pre-compute common translations
+# Common phrases for pre-loading
 COMMON_PHRASES = {
     "English to Spanish": [
         "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
@@ -76,46 +62,59 @@ COMMON_PHRASES = {
     ]
 }
 
-# Background worker for model loading and inference
-class ModelWorker:
-    def __init__(self):
-        self.model = None
+# Implement LRU cache for better performance
+@lru_cache(maxsize=100)
+def get_cached_translation(direction, text):
+    """LRU cache for translations"""
+    return None # This gets bypassed when there's a cache hit
+
+# Create a worker pool for parallel translation
+class ModelWorkerPool:
+    def __init__(self, num_workers=1):
+        self.num_workers = num_workers
         self.request_queue = queue.Queue()
         self.response_queue = queue.Queue()
-        self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
-        self.worker_thread.start()
-
-    def _worker_loop(self):
-        # Initialize model in the worker thread
-        print("Initializing model in background thread...")
+        self.workers = []
+        self.initialized = False
+
+        # Create shared model instance with optimized settings
+        print("Initializing model with CPU optimizations...")
+        start_time = time.time()
 
-        # CPU optimization settings
+        # CPU optimization settings - use fewer threads for Q8 model
        cpu_count = multiprocessing.cpu_count()
-        optimal_threads = max(4, cpu_count - 2) # Leave two cores free
+        optimal_threads = max(1, min(4, cpu_count - 1)) # Use fewer threads for better performance
 
-        # Initialize with the most optimized settings
-        start_time = time.time()
-        self.model = LlamaClass(
+        # Create a smaller context size for faster inference
+        self.model = Llama(
             model_path=base_model_path,
             lora_path=adapter_path,
-            n_ctx=512, # Larger context for longer translations
+            n_ctx=256, # Reduced context for faster processing
             n_threads=optimal_threads, # Optimized thread count
-            n_batch=1024, # Large batch for parallel processing
-            use_mmap=True, # Efficient memory mapping
-            n_gpu_layers=n_gpu_layers, # GPU acceleration if available
-            seed=42, # Consistent results
-            verbose=False, # Reduce overhead
-            main_gpu=0, # Primary GPU
-            tensor_split=None, # Auto-distribute across GPUs if multiple
-            rope_freq_base=10000, # Optimized attention parameters
+            n_batch=512, # Reduced batch size for CPU
+            use_mmap=True, # Efficient memory mapping
+            n_gpu_layers=0, # CPU only
+            seed=42, # Consistent results
+            verbose=False, # Reduce overhead
+            rope_freq_base=10000, # Default attention parameters
             rope_freq_scale=1.0,
         )
         print(f"Model loaded in {time.time() - start_time:.2f} seconds")
 
-        # Pre-warm the model with common phrases
-        self._prewarm_model()
+        # Start worker threads
+        for i in range(num_workers):
+            worker = threading.Thread(target=self._worker_loop, daemon=True)
+            worker.start()
+            self.workers.append(worker)
 
-        # Process requests
+        self.initialized = True
+
+        # Pre-warm in background thread to not block startup
+        warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
+        warming_thread.start()
+
+    def _worker_loop(self):
+        """Worker thread that processes translation requests"""
         while True:
             try:
                 request = self.request_queue.get()
@@ -123,31 +122,51 @@ class ModelWorker:
                     break
 
                 direction, text, callback_id = request
-                result = self._process_translation(direction, text)
+
+                # Check LRU cache first
+                cached = get_cached_translation(direction, text)
+                if cached is not None:
+                    self.response_queue.put((callback_id, cached))
+                    self.request_queue.task_done()
+                    continue
+
+                # Check regular cache
+                cache_key = f"{direction}:{text}"
+                if cache_key in translation_cache:
+                    result = translation_cache[cache_key]
+                else:
+                    # Process new translation
+                    result = self._process_translation(direction, text)
+                    # Store in regular cache
+                    if len(translation_cache) >= MAX_CACHE_SIZE:
+                        translation_cache.pop(next(iter(translation_cache)))
+                    translation_cache[cache_key] = result
+
                 self.response_queue.put((callback_id, result))
+                self.request_queue.task_done()
             except Exception as e:
                 print(f"Error in worker thread: {e}")
                 self.response_queue.put((callback_id, f"Error: {str(e)}"))
+                self.request_queue.task_done()
 
     def _prewarm_model(self):
-        """Pre-compute common translations to warm up the model"""
-        print("Pre-warming model with common phrases...")
+        """Pre-compute common translations to warm up the model - minimal to save time"""
+        print("Pre-warming model with essential phrases (truncated for speed)...")
         start = time.time()
+
+        # Just warm up with one phrase per direction to speed up startup
         for direction, phrases in COMMON_PHRASES.items():
-            for phrase in phrases[:3]: # Just do a few to warm up
-                self._process_translation(direction, phrase)
-        print(f"Model pre-warming completed in {time.time() - start:.2f} seconds")
+            self._process_translation(direction, phrases[0])
+            # Only do the most common phrase to save startup time
+
+        print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")
 
     def _process_translation(self, direction, text):
+        """Optimized translation function"""
         # Skip empty inputs
         if not text or not text.strip():
             return ""
 
-        # Check cache first for faster response
-        cache_key = f"{direction}:{text}"
-        if cache_key in translation_cache:
-            return translation_cache[cache_key]
-
         # Start timing for performance tracking
         start_time = time.time()
 
@@ -164,147 +183,191 @@ class ModelWorker:
 
         source_lang, target_lang = lang_map[direction]
 
+        # Truncate long inputs for faster processing
+        max_input_length = 100 # Limit input length
+        if len(text) > max_input_length:
+            text = text[:max_input_length] + "..."
+
         # Efficient prompt format
         prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
 
-        # Estimate appropriate token length based on input
+        # Reduce max tokens for faster inference
         input_tokens = len(text.split())
-        max_tokens = min(200, max(50, int(input_tokens * 1.5)))
-
-        # Generate translation with optimized settings
-        response = self.model.create_completion(
-            prompt,
-            max_tokens=max_tokens,
-            temperature=0.0, # Deterministic for faster inference
-            top_k=1, # Only consider most likely token
-            top_p=1.0, # No sampling
-            repeat_penalty=1.0, # No repeat penalty
-            stream=False # Get complete response at once
-        )
-
-        translation = response['choices'][0]['text'].strip()
+        max_tokens = min(50, max(20, int(input_tokens * 1.2)))
 
-        # Cache result
-        if len(translation_cache) >= MAX_CACHE_SIZE:
-            # Remove oldest entry (first key)
-            translation_cache.pop(next(iter(translation_cache)))
-        translation_cache[cache_key] = translation
-
-        # Log performance
-        inference_time = time.time() - start_time
-        tokens_per_second = (input_tokens + len(translation.split())) / inference_time
-        print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
-
-        return translation
+        # Generate translation with aggressive performance optimizations
+        try:
+            response = self.model.create_completion(
+                prompt,
+                max_tokens=max_tokens,
+                temperature=0.0, # Deterministic for faster inference
+                top_k=1, # Only consider most likely token
+                top_p=1.0, # No sampling
+                repeat_penalty=1.0, # No repeat penalty
+                stream=False, # Get complete response at once
+                stop=["[/", "\n\n"], # Stop early if possible
+            )
+
+            translation = response['choices'][0]['text'].strip()
+
+            # Log performance
+            inference_time = time.time() - start_time
+            tokens_per_second = (input_tokens + len(translation.split())) / inference_time
+            print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
+
+            return translation
+        except Exception as e:
+            print(f"Translation error: {e}")
+            return f"Error: Could not translate text. Try shorter input."
 
     def request_translation(self, direction, text, callback_id):
         """Queue a translation request"""
         self.request_queue.put((direction, text, callback_id))
 
-# Create worker instance
-worker = ModelWorker()
+# Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
+worker_pool = ModelWorkerPool(num_workers=1)
 
 # Counter for request IDs
 next_request_id = 0
 
+# Fast similarity check function for finding close matches in cache
+def find_similar_cached(direction, text, threshold=0.8):
+    """Find similar translations in cache based on prefix matching"""
+    if len(text) < 5: # For very short inputs, look for exact matches
+        return None
+
+    text_lower = text.lower()
+    best_match = None
+    best_score = 0
+
+    for cached_key in list(translation_cache.keys()):
+        cached_dir, cached_text = cached_key.split(":", 1)
+        if cached_dir != direction:
+            continue
+
+        # Simple similarity - prefix matching
+        if cached_text.lower().startswith(text_lower[:5]):
+            similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
+            if similarity > best_score and similarity > threshold:
+                best_score = similarity
+                best_match = translation_cache[cached_key]
+
+    return best_match
+
 # Gradio interface functions
 def translate(direction, text, progress=gr.Progress()):
-    """Queue translation request and wait for result"""
+    """Queue translation request and wait for result - optimized version"""
     global next_request_id
 
-    # Check cache first for immediate response
+    # Trim whitespace for better cache hits
+    text = text.strip()
+
+    # Skip empty inputs
+    if not text:
+        return ""
+
+    # Check LRU cache first
+    cached = get_cached_translation(direction, text)
+    if cached is not None:
+        return cached
+
+    # Check main cache
     cache_key = f"{direction}:{text}"
     if cache_key in translation_cache:
         return translation_cache[cache_key]
 
-    # If input is very short, check if we have a similar cached phrase
+    # For short inputs, try to find similar cached
     if len(text) < 20:
-        for cached_key in translation_cache:
-            cached_dir, cached_text = cached_key.split(":", 1)
-            if cached_dir == direction and cached_text.lower().startswith(text.lower()):
-                return translation_cache[cached_key]
+        similar = find_similar_cached(direction, text)
+        if similar:
+            return similar
 
     # Generate unique request ID
     request_id = next_request_id
     next_request_id += 1
 
     # Queue the request
-    worker.request_translation(direction, text, request_id)
+    worker_pool.request_translation(direction, text, request_id)
 
-    # Wait for the response (with progress feedback)
+    # Wait for the response with reasonable timeout
     progress(0, desc="Translating...")
-    max_wait = 30 # Maximum wait time in seconds
+    max_wait = 20 # Reduced maximum wait time
     start_time = time.time()
 
+    # Show progress while waiting
     while time.time() - start_time < max_wait:
         progress((time.time() - start_time) / max_wait)
 
        # Check for our response
        try:
-            while not worker.response_queue.empty():
-                resp_id, result = worker.response_queue.get_nowait()
+            while not worker_pool.response_queue.empty():
+                resp_id, result = worker_pool.response_queue.get_nowait()
                 if resp_id == request_id:
+                    # Update LRU cache
+                    get_cached_translation.__wrapped__.__defaults__ = (result,)
                    progress(1.0)
                    return result
        except queue.Empty:
            pass
 
-        # Small sleep to prevent CPU hogging
-        time.sleep(0.05)
+        # Small sleep to prevent CPU hogging - reduced for faster response
+        time.sleep(0.01)
 
     progress(1.0)
-    return "Translation timed out. Please try again."
+    return "Translation timed out. Please try a shorter text."
 
-# Create Gradio interface
-with gr.Blocks(title="Ultra-Fast Translation App") as iface:
+# Create Gradio interface with simplified UI for performance
+with gr.Blocks(title="Fast CPU Translation App") as iface:
     gr.Markdown(f"""
-    ## Ultra-Fast Translation App
-    Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only'}
+    ## Fast CPU Translation App
+    Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only - Optimized'}
+    **For best performance, use short sentences or phrases.**
     """)
 
     with gr.Row():
-        direction = gr.Dropdown(
-            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
+        direction = gr.Radio(
+            choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
             label="Translation Direction",
             value="English to Spanish"
        )
 
     with gr.Row():
-        input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...")
-        output_text = gr.Textbox(lines=5, label="Translation")
+        input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
+        output_text = gr.Textbox(lines=3, label="Translation")
 
     # Add translate button
     translate_btn = gr.Button("Translate")
     translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
 
-    # Optimization options
-    with gr.Accordion("Advanced Options", open=False):
-        gr.Markdown("""
-        ### Performance Tips
-        - Short sentences translate faster than long paragraphs
-        - Common phrases may be cached for instant results
-        - First translation might be slower as the model warms up
-        """)
-
-    # Add examples with preloaded common phrases
+    # Add examples with common short phrases for quick results
     gr.Examples(
         examples=[
-            ["English to Spanish", "Hello, how are you today?"],
-            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
-            ["English to Korean", "The weather is nice today."],
-            ["Korean to English", "안녕하세요, 만나서 반갑습니다."]
+            ["English to Spanish", "Hello"],
+            ["Spanish to English", "Hola"],
+            ["English to Korean", "Thank you"],
+            ["Korean to English", "감사합니다"]
        ],
        inputs=[direction, input_text],
        fn=translate,
        outputs=output_text
     )
+
+    # Add performance tips
+    gr.Markdown("""
+    ### Performance Tips
+    - Keep text under 50 characters for fastest results
+    - Common phrases are pre-cached
+    - First translation may be slow, subsequent ones faster
+    - Frequently used phrases use an LRU cache for speed
+    """)
 
 # Launch with optimized settings
-iface.launch(
-    debug=False,
-    show_error=True,
-    share=False, # Don't share publicly by default
-    quiet=True, # Reduce console output
-    server_name="0.0.0.0",
-    server_port=7860
-)
+if __name__ == "__main__":
+    iface.launch(
+        debug=False,
+        show_error=True,
+        share=False,
+        quiet=True,
+        server_name="0.0.0.0",
+        server_port=7860
+    )