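"""Fast CPU translation demo.

Loads a GGUF base model plus a LoRA adapter with llama-cpp-python and serves
English/Spanish/Korean translation through a Gradio UI, caching results so that
repeated requests return quickly.
"""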
import os
import time
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import threading
import queue
import multiprocessing

# Check whether a GPU is present (informational only; inference runs on the CPU)
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
print(f"GPU available: {has_gpu} - {gpu_name}")

# Download model files
def get_model_path(repo_id, filename):
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)

base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF", 
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF", 
    "articulate-V1-q8_0.gguf"
)

# Optimize environment variables for CPU performance
os.environ["LLAMA_CUBLAS"] = "0"  # Disable CUDA since we're CPU only
os.environ["LLAMA_CLBLAST"] = "0"  # Disable OpenCL
os.environ["LLAMA_AVX"] = "1"      # Enable AVX
os.environ["LLAMA_AVX2"] = "1"     # Enable AVX2
os.environ["LLAMA_F16"] = "1"      # Use FP16 where available

# Import the right module
from llama_cpp import Llama
print("Using CPU-optimized llama-cpp-python")

# Cache for translations
translation_cache = {}
MAX_CACHE_SIZE = 5000  # Increased cache size

# Common phrases for pre-loading
COMMON_PHRASES = {
    "English to Spanish": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Spanish to English": [
        "Hola", "Gracias", "Buenos dรญas", "ยฟCรณmo estรกs?", "ยฟCรณmo te llamas?",
        "No entiendo", "Por favor", "Lo siento", "Sรญ", "No", "Dรณnde estรก"
    ],
    "English to Korean": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Korean to English": [
        "์•ˆ๋…•ํ•˜์„ธ์š”", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค", "์ข‹์€ ์•„์นจ์ž…๋‹ˆ๋‹ค", "์–ด๋–ป๊ฒŒ ์ง€๋‚ด์„ธ์š”?", "์ด๋ฆ„์ด ๋ญ์˜ˆ์š”?",
        "์ดํ•ด๊ฐ€ ์•ˆ ๋ผ์š”", "์ œ๋ฐœ", "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค", "๋„ค", "์•„๋‹ˆ์š”", "์–ด๋””์— ์žˆ์–ด์š”"
    ]
}

# Shared-cache lookup helper
def get_cached_translation(direction, text):
    """Return a previously computed translation, or None on a cache miss."""
    return translation_cache.get(f"{direction}:{text}")

# Create a worker pool for parallel translation
class ModelWorkerPool:
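    """Single shared Llama instance served by a small pool of worker threads.

    Translation requests go onto request_queue as (direction, text, callback_id)
    tuples; results come back on response_queue as (callback_id, translation).
    """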
    def __init__(self, num_workers=1):
        self.num_workers = num_workers
        self.request_queue = queue.Queue()
        self.response_queue = queue.Queue()
        self.workers = []
        self.initialized = False
        
        # Create shared model instance with optimized settings
        print("Initializing model with CPU optimizations...")
        start_time = time.time()
        
        # CPU optimization settings - use fewer threads for Q8 model
        cpu_count = multiprocessing.cpu_count()
        optimal_threads = max(1, min(4, cpu_count - 1))  # Use fewer threads for better performance
        
        # Create a smaller context size for faster inference
        self.model = Llama(
            model_path=base_model_path,
            lora_path=adapter_path,
            n_ctx=256,                # Reduced context for faster processing
            n_threads=optimal_threads, # Optimized thread count
            n_batch=512,              # Reduced batch size for CPU
            use_mmap=True,            # Efficient memory mapping
            n_gpu_layers=0,           # CPU only
            seed=42,                  # Consistent results
            verbose=False,            # Reduce overhead
            rope_freq_base=10000,     # Default attention parameters
            rope_freq_scale=1.0,
        )
        print(f"Model loaded in {time.time() - start_time:.2f} seconds")
        
        # Start worker threads
        for i in range(num_workers):
            worker = threading.Thread(target=self._worker_loop, daemon=True)
            worker.start()
            self.workers.append(worker)
        
        self.initialized = True
        
        # Pre-warm in background thread to not block startup
        warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
        warming_thread.start()
    
    def _worker_loop(self):
        """Worker thread that processes translation requests"""
        while True:
            request = self.request_queue.get()
            if request is None:  # Shutdown signal
                break

            direction, text, callback_id = request
            try:
                # Serve from the shared cache when possible
                cache_key = f"{direction}:{text}"
                if cache_key in translation_cache:
                    result = translation_cache[cache_key]
                else:
                    # Process new translation
                    result = self._process_translation(direction, text)
                    # Store in cache, evicting the oldest entry when full
                    if len(translation_cache) >= MAX_CACHE_SIZE:
                        translation_cache.pop(next(iter(translation_cache)))
                    translation_cache[cache_key] = result

                self.response_queue.put((callback_id, result))
            except Exception as e:
                print(f"Error in worker thread: {e}")
                self.response_queue.put((callback_id, f"Error: {str(e)}"))
            finally:
                self.request_queue.task_done()
    
    def _prewarm_model(self):
        """Pre-compute common translations to warm up the model - minimal to save time"""
        print("Pre-warming model with essential phrases (truncated for speed)...")
        start = time.time()
        
        # Just warm up with one phrase per direction to speed up startup
        for direction, phrases in COMMON_PHRASES.items():
            self._process_translation(direction, phrases[0])
            # Only do the most common phrase to save startup time
            
        print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")
    
    def _process_translation(self, direction, text):
        """Optimized translation function"""
        # Skip empty inputs
        if not text or not text.strip():
            return ""
            
        # Start timing for performance tracking
        start_time = time.time()
        
        # Map language directions
        lang_map = {
            "English to Spanish": ("ENGLISH", "SPANISH"),
            "Spanish to English": ("SPANISH", "ENGLISH"),
            "Korean to English": ("KOREAN", "ENGLISH"),
            "English to Korean": ("ENGLISH", "KOREAN")
        }
        
        if direction not in lang_map:
            return "Invalid direction"
        
        source_lang, target_lang = lang_map[direction]
        
        # Truncate long inputs for faster processing
        max_input_length = 100  # Limit input length
        if len(text) > max_input_length:
            text = text[:max_input_length] + "..."
        
        # Build the translation prompt, e.g. [ENGLISH]Hello[SPANISH]
        prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
        
        # Reduce max tokens for faster inference
        input_tokens = len(text.split())
        max_tokens = min(50, max(20, int(input_tokens * 1.2)))
        
        # Generate translation with aggressive performance optimizations
        try:
            response = self.model.create_completion(
                prompt,
                max_tokens=max_tokens,
                temperature=0.0,      # Greedy, deterministic decoding
                top_k=1,              # Only consider most likely token
                top_p=1.0,            # No sampling
                repeat_penalty=1.0,   # No repeat penalty
                stream=False,         # Get complete response at once
                stop=["[/", "\n\n"],  # Stop early if possible
            )
            
            translation = response['choices'][0]['text'].strip()
            
            # Log performance
            inference_time = time.time() - start_time
            tokens_per_second = (input_tokens + len(translation.split())) / inference_time
            print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
            
            return translation
        except Exception as e:
            print(f"Translation error: {e}")
            return f"Error: Could not translate text. Try shorter input."
    
    def request_translation(self, direction, text, callback_id):
        """Queue a translation request"""
        self.request_queue.put((direction, text, callback_id))

# Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
worker_pool = ModelWorkerPool(num_workers=1)

# Counter for request IDs
next_request_id = 0
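# (Gradio can run handlers on several threads; wrapping this counter in a
#  threading.Lock would make request-ID allocation strictly race-free.)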

# Fast similarity check function for finding close matches in cache
def find_similar_cached(direction, text, threshold=0.8):
    """Find similar translations in cache based on prefix matching"""
    if len(text) < 5:  # For very short inputs, look for exact matches
        return None
        
    text_lower = text.lower()
    best_match = None
    best_score = 0
    
    for cached_key in list(translation_cache.keys()):
        cached_dir, cached_text = cached_key.split(":", 1)
        if cached_dir != direction:
            continue
            
        # Simple similarity - prefix matching
        if cached_text.lower().startswith(text_lower[:5]):
            similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
            if similarity > best_score and similarity > threshold:
                best_score = similarity
                best_match = translation_cache[cached_key]
                
    return best_match

# Gradio interface functions
def translate(direction, text, progress=gr.Progress()):
    """Queue translation request and wait for result - optimized version"""
    global next_request_id
    
    # Trim whitespace for better cache hits
    text = text.strip()
    
    # Skip empty inputs
    if not text:
        return ""
    
    # Serve from the cache when possible
    cached = get_cached_translation(direction, text)
    if cached is not None:
        return cached
    
    # For short inputs, try to find similar cached
    if len(text) < 20:
        similar = find_similar_cached(direction, text)
        if similar:
            return similar
    
    # Generate unique request ID
    request_id = next_request_id
    next_request_id += 1
    
    # Queue the request
    worker_pool.request_translation(direction, text, request_id)
    
    # Wait for the response with reasonable timeout
    progress(0, desc="Translating...")
    max_wait = 20  # Reduced maximum wait time
    start_time = time.time()
    
    # Show progress while waiting
    while time.time() - start_time < max_wait:
        progress((time.time() - start_time) / max_wait)
        
        # Check for our response (responses for other request IDs are discarded;
        # this is acceptable for this single-worker, per-request polling setup)
        try:
            while not worker_pool.response_queue.empty():
                resp_id, result = worker_pool.response_queue.get_nowait()
                if resp_id == request_id:
                    # The worker has already stored this result in translation_cache
                    progress(1.0)
                    return result
        except queue.Empty:
            pass
        
        # Small sleep to prevent CPU hogging - reduced for faster response
        time.sleep(0.01)
    
    progress(1.0)
    return "Translation timed out. Please try a shorter text."

# Create Gradio interface with simplified UI for performance
with gr.Blocks(title="Fast CPU Translation App") as iface:
    gr.Markdown(f"""
    ## Fast CPU Translation App
    Running on: CPU (optimized){'; GPU detected but unused: ' + gpu_name if has_gpu else ''}  
    **For best performance, use short sentences or phrases.**
    """)
    
    with gr.Row():
        direction = gr.Radio(
            choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
            label="Translation Direction",
            value="English to Spanish"
        )
    
    with gr.Row():
        input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
        output_text = gr.Textbox(lines=3, label="Translation")
    
    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
    
    # Add examples with common short phrases for quick results
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello"],
            ["Spanish to English", "Hola"],
            ["English to Korean", "Thank you"],
            ["Korean to English", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค"]
        ],
        inputs=[direction, input_text],
        fn=translate,
        outputs=output_text
    )
    
    # Add performance tips
    gr.Markdown("""
    ### Performance Tips
    - Keep text under 50 characters for fastest results
    - A few phrases are used to pre-warm the model at startup
    - The first translation may be slow; subsequent ones are faster
    - Repeated phrases are served from an in-memory cache
    """)

# Launch with optimized settings
if __name__ == "__main__":
    iface.launch(
        debug=False,
        show_error=True,
        share=False,
        quiet=True,
        server_name="0.0.0.0", 
        server_port=7860
    )