johnpaulbin committed
Commit f06b197 · verified · 1 Parent(s): c2b521a

Update app.py

Files changed (1)
  1. app.py +247 -78
app.py CHANGED
@@ -1,16 +1,22 @@
- from huggingface_hub import hf_hub_download
- from llama_cpp import Llama
  import gradio as gr
  import multiprocessing
- import time
- import os

- # Model paths
  def get_model_path(repo_id, filename):
      print(f"Obtaining {filename}...")
      return hf_hub_download(repo_id=repo_id, filename=filename)

- # Get models
  base_model_path = get_model_path(
      "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
      "articulate-11-expspanish-base-merged-q8_0.gguf"
@@ -20,95 +26,241 @@ adapter_path = get_model_path(
      "articulate-V1-q8_0.gguf"
  )

- # Conservative CPU settings to avoid memory corruption
- cpu_count = multiprocessing.cpu_count()
- optimal_threads = max(1, min(8, cpu_count // 2)) # More conservative thread count
- batch_size = 128 # Reduced batch size to prevent memory issues

- print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")
-
- # Initialize model with safer parameters
- start_time = time.time()
- llm = Llama(
-     model_path=base_model_path,
-     lora_path=adapter_path,
-     n_ctx=512,
-     n_threads=optimal_threads,
-     n_batch=batch_size, # Smaller batch size for stability
-     use_mmap=True,
-     n_gpu_layers=0,
-     verbose=False
- )
- print(f"Model loaded in {time.time() - start_time:.2f} seconds")

- # Simple translation cache (limited size)
  translation_cache = {}
- MAX_CACHE_SIZE = 50 # Reduced cache size

- def translate(direction, text):
-     # Validate input
-     if not text or not text.strip():
-         return ""
-
-     text = text.strip()
-
-     # Simple cache lookup
-     cache_key = f"{direction}:{text}"
-     if cache_key in translation_cache:
-         return translation_cache[cache_key]
-
-     # Start timing
-     start_time = time.time()
-
-     # Language mapping
-     lang_map = {
-         "English to Spanish": ("ENGLISH", "SPANISH"),
-         "Spanish to English": ("SPANISH", "ENGLISH"),
-         "Korean to English": ("KOREAN", "ENGLISH"),
-         "English to Korean": ("ENGLISH", "KOREAN")
-     }
-
-     if direction not in lang_map:
-         return "Invalid direction"

-     source_lang, target_lang = lang_map[direction]

-     # Create prompt
-     prompt = f"[{source_lang}]{text}[{target_lang}]"

-     try:
-         # Generate translation with conservative settings
-         response = llm.create_completion(
              prompt,
-             max_tokens=128, # Conservative token limit
-             temperature=0.0, # Deterministic
-             top_k=1, # Most likely token only
-             top_p=1.0, # No sampling
-             repeat_penalty=1.0,
-             stream=False
          )

          translation = response['choices'][0]['text'].strip()

-         # Manage cache size
          if len(translation_cache) >= MAX_CACHE_SIZE:
-             # Remove oldest entry
              translation_cache.pop(next(iter(translation_cache)))
          translation_cache[cache_key] = translation

          # Log performance
          inference_time = time.time() - start_time
-         print(f"Translation completed in {inference_time:.3f}s")

          return translation

-     except Exception as e:
-         print(f"Translation error: {e}")
-         return f"Error during translation: {str(e)}"

  # Create Gradio interface
- with gr.Blocks(title="Translation App") as iface:
-     gr.Markdown("## Fast Translation App")

      with gr.Row():
          direction = gr.Dropdown(
@@ -118,24 +270,41 @@ with gr.Blocks(title="Translation App") as iface:
          )

      with gr.Row():
-         input_text = gr.Textbox(lines=5, label="Input Text")
          output_text = gr.Textbox(lines=5, label="Translation")

      # Add translate button
      translate_btn = gr.Button("Translate")
      translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)

-     # Examples WITHOUT caching (to avoid memory issues)
      gr.Examples(
          examples=[
              ["English to Spanish", "Hello, how are you today?"],
              ["Spanish to English", "Hola, ¿cómo estás hoy?"],
              ["English to Korean", "The weather is nice today."],
-             ["Korean to English", "오늘 날씨가 좋습니다."]
          ],
          inputs=[direction, input_text],
-         cache_examples=False # Disabled caching to prevent memory issues
      )

- # Launch with safer settings
- iface.launch(debug=False, show_error=True)

+ import os
+ import time
+ import torch
  import gradio as gr
+ from huggingface_hub import hf_hub_download
+ import threading
+ import queue
  import multiprocessing

+ # First check if GPU is available for maximum speed
+ has_gpu = torch.cuda.is_available()
+ gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
+ print(f"GPU available: {has_gpu} - {gpu_name}")
+
+ # Download model files
  def get_model_path(repo_id, filename):
      print(f"Obtaining {filename}...")
      return hf_hub_download(repo_id=repo_id, filename=filename)

  base_model_path = get_model_path(
      "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
      "articulate-11-expspanish-base-merged-q8_0.gguf"

      "articulate-V1-q8_0.gguf"
  )

+ # Set up optimized environment variables for llama-cpp-python
+ os.environ["LLAMA_CUBLAS"] = "1" if has_gpu else "0"
+ os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
+ # For CPU: Use AVX2/AVX512/AVX-VNNI instruction sets if available
+ os.environ["LLAMA_AVX"] = "1"
+ os.environ["LLAMA_AVX2"] = "1"
+ os.environ["LLAMA_F16"] = "1" # Use FP16 where available

+ # Determine the most optimized backend
+ if has_gpu:
+     try:
+         from llama_cpp_python.llama_cpp.llama import Llama as GPULlama
+         LlamaClass = GPULlama
+         print("Using GPU-accelerated llama-cpp-python")
+         n_gpu_layers = -1 # Use all layers on GPU
+     except ImportError:
+         from llama_cpp import Llama
+         LlamaClass = Llama
+         print("Using standard llama-cpp-python with GPU acceleration")
+         n_gpu_layers = -1 # Use all layers on GPU
+ else:
+     from llama_cpp import Llama
+     LlamaClass = Llama
+     print("Using CPU-only llama-cpp-python")
+     n_gpu_layers = 0

+ # Cache for translations
  translation_cache = {}
+ MAX_CACHE_SIZE = 1000

+ # Pre-compute common translations
+ COMMON_PHRASES = {
+     "English to Spanish": [
+         "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
+         "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
+     ],
+     "Spanish to English": [
+         "Hola", "Gracias", "Buenos días", "¿Cómo estás?", "¿Cómo te llamas?",
+         "No entiendo", "Por favor", "Lo siento", "Sí", "No", "Dónde está"
+     ],
+     "English to Korean": [
+         "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
+         "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
+     ],
+     "Korean to English": [
+         "안녕하세요", "감사합니다", "좋은 아침입니다", "어떻게 지내세요?", "이름이 뭐예요?",
+         "이해가 돼요", "제발", "죄송합니다", "네", "아니요", "어디에 있어요"
+     ]
+ }
+
+ # Background worker for model loading and inference
+ class ModelWorker:
+     def __init__(self):
+         self.model = None
+         self.request_queue = queue.Queue()
+         self.response_queue = queue.Queue()
+         self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
+         self.worker_thread.start()
+
+     def _worker_loop(self):
+         # Initialize model in the worker thread
+         print("Initializing model in background thread...")
+
+         # CPU optimization settings
+         cpu_count = multiprocessing.cpu_count()
+         optimal_threads = max(4, cpu_count - 2) # Leave two cores free
+
+         # Initialize with the most optimized settings
+         start_time = time.time()
+         self.model = LlamaClass(
+             model_path=base_model_path,
+             lora_path=adapter_path,
+             n_ctx=512, # Larger context for longer translations
+             n_threads=optimal_threads, # Optimized thread count
+             n_batch=1024, # Large batch for parallel processing
+             use_mmap=True, # Efficient memory mapping
+             n_gpu_layers=n_gpu_layers, # GPU acceleration if available
+             seed=42, # Consistent results
+             verbose=False, # Reduce overhead
+             main_gpu=0, # Primary GPU
+             tensor_split=None, # Auto-distribute across GPUs if multiple
+             rope_freq_base=10000, # Optimized attention parameters
+             rope_freq_scale=1.0,
+         )
+         print(f"Model loaded in {time.time() - start_time:.2f} seconds")
+
+         # Pre-warm the model with common phrases
+         self._prewarm_model()
+
+         # Process requests
+         while True:
+             try:
+                 request = self.request_queue.get()
+                 if request is None: # Shutdown signal
+                     break
+
+                 direction, text, callback_id = request
+                 result = self._process_translation(direction, text)
+                 self.response_queue.put((callback_id, result))
+             except Exception as e:
+                 print(f"Error in worker thread: {e}")
+                 self.response_queue.put((callback_id, f"Error: {str(e)}"))
+
+     def _prewarm_model(self):
+         """Pre-compute common translations to warm up the model"""
+         print("Pre-warming model with common phrases...")
+         start = time.time()
+         for direction, phrases in COMMON_PHRASES.items():
+             for phrase in phrases[:3]: # Just do a few to warm up
+                 self._process_translation(direction, phrase)
+         print(f"Model pre-warming completed in {time.time() - start:.2f} seconds")
+
+     def _process_translation(self, direction, text):
+         # Skip empty inputs
+         if not text or not text.strip():
+             return ""
+
+         # Check cache first for faster response
+         cache_key = f"{direction}:{text}"
+         if cache_key in translation_cache:
+             return translation_cache[cache_key]
+
+         # Start timing for performance tracking
+         start_time = time.time()
+
+         # Map language directions
+         lang_map = {
+             "English to Spanish": ("ENGLISH", "SPANISH"),
+             "Spanish to English": ("SPANISH", "ENGLISH"),
+             "Korean to English": ("KOREAN", "ENGLISH"),
+             "English to Korean": ("ENGLISH", "KOREAN")
+         }
+
+         if direction not in lang_map:
+             return "Invalid direction"
+
+         source_lang, target_lang = lang_map[direction]
+
+         # Efficient prompt format
+         prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
+
+         # Estimate appropriate token length based on input
+         input_tokens = len(text.split())
+         max_tokens = min(200, max(50, int(input_tokens * 1.5)))
+
+         # Generate translation with optimized settings
+         response = self.model.create_completion(
              prompt,
+             max_tokens=max_tokens,
+             temperature=0.0, # Deterministic for faster inference
+             top_k=1, # Only consider most likely token
+             top_p=1.0, # No sampling
+             repeat_penalty=1.0, # No repeat penalty
+             stream=False # Get complete response at once
          )

          translation = response['choices'][0]['text'].strip()

+         # Cache result
          if len(translation_cache) >= MAX_CACHE_SIZE:
+             # Remove oldest entry (first key)
              translation_cache.pop(next(iter(translation_cache)))
          translation_cache[cache_key] = translation

          # Log performance
          inference_time = time.time() - start_time
+         tokens_per_second = (input_tokens + len(translation.split())) / inference_time
+         print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")

          return translation
+
+     def request_translation(self, direction, text, callback_id):
+         """Queue a translation request"""
+         self.request_queue.put((direction, text, callback_id))
+
+ # Create worker instance
+ worker = ModelWorker()
+
+ # Counter for request IDs
+ next_request_id = 0
+
+ # Gradio interface functions
+ def translate(direction, text, progress=gr.Progress()):
+     """Queue translation request and wait for result"""
+     global next_request_id
+
+     # Check cache first for immediate response
+     cache_key = f"{direction}:{text}"
+     if cache_key in translation_cache:
+         return translation_cache[cache_key]
+
+     # If input is very short, check if we have a similar cached phrase
+     if len(text) < 20:
+         for cached_key in translation_cache:
+             cached_dir, cached_text = cached_key.split(":", 1)
+             if cached_dir == direction and cached_text.lower().startswith(text.lower()):
+                 return translation_cache[cached_key]
+
+     # Generate unique request ID
+     request_id = next_request_id
+     next_request_id += 1
+
+     # Queue the request
+     worker.request_translation(direction, text, request_id)
+
+     # Wait for the response (with progress feedback)
+     progress(0, desc="Translating...")
+     max_wait = 30 # Maximum wait time in seconds
+     start_time = time.time()
+
+     while time.time() - start_time < max_wait:
+         progress((time.time() - start_time) / max_wait)

+         # Check for our response
+         try:
+             while not worker.response_queue.empty():
+                 resp_id, result = worker.response_queue.get_nowait()
+                 if resp_id == request_id:
+                     progress(1.0)
+                     return result
+         except queue.Empty:
+             pass
+
+         # Small sleep to prevent CPU hogging
+         time.sleep(0.05)
+
+     progress(1.0)
+     return "Translation timed out. Please try again."

  # Create Gradio interface
+ with gr.Blocks(title="Ultra-Fast Translation App") as iface:
+     gr.Markdown(f"""
+     ## Ultra-Fast Translation App
+     Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only'}
+     """)

      with gr.Row():
          direction = gr.Dropdown(

          )

      with gr.Row():
+         input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...")
          output_text = gr.Textbox(lines=5, label="Translation")

      # Add translate button
      translate_btn = gr.Button("Translate")
      translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)

+     # Optimization options
+     with gr.Accordion("Advanced Options", open=False):
+         gr.Markdown("""
+         ### Performance Tips
+         - Short sentences translate faster than long paragraphs
+         - Common phrases may be cached for instant results
+         - First translation might be slower as the model warms up
+         """)
+
+     # Add examples with preloaded common phrases
      gr.Examples(
          examples=[
              ["English to Spanish", "Hello, how are you today?"],
              ["Spanish to English", "Hola, ¿cómo estás hoy?"],
              ["English to Korean", "The weather is nice today."],
+             ["Korean to English", "안녕하세요, 만나서 반갑습니다."]
          ],
          inputs=[direction, input_text],
+         fn=translate,
+         outputs=output_text
      )

+ # Launch with optimized settings
+ iface.launch(
+     debug=False,
+     show_error=True,
+     share=False, # Don't share publicly by default
+     quiet=True, # Reduce console output
+     server_name="0.0.0.0",
+     server_port=7860
+ )
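
Note on the pattern introduced above: the rewritten app.py keeps the llama.cpp model on a single background thread (ModelWorker) and hands work back and forth through a request queue and a response queue, with the Gradio handler polling for a matching request ID. The snippet below is a minimal, self-contained sketch of that queue hand-off only; it is not part of the commit, and fake_translate is a hypothetical stand-in for the model's create_completion call.

import queue
import threading
import time

request_queue = queue.Queue()
response_queue = queue.Queue()

def fake_translate(text):
    # Hypothetical stand-in for the llama.cpp call owned by the worker thread.
    time.sleep(0.1)
    return text.upper()

def worker_loop():
    # Single worker thread: all inference requests are serialized here.
    while True:
        request_id, text = request_queue.get()
        response_queue.put((request_id, fake_translate(text)))

threading.Thread(target=worker_loop, daemon=True).start()

def translate(text, request_id, max_wait=5.0):
    # Queue the request, then poll until the response with our ID arrives.
    request_queue.put((request_id, text))
    deadline = time.time() + max_wait
    while time.time() < deadline:
        try:
            resp_id, result = response_queue.get_nowait()
            if resp_id == request_id:
                return result
        except queue.Empty:
            time.sleep(0.05)
    return "Translation timed out."

print(translate("hello, how are you today?", request_id=0))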