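"""Fast CPU translation app.

Gradio demo that translates between English, Spanish, and Korean using a GGUF
base model plus a LoRA adapter loaded through llama-cpp-python, with a background
worker thread and an in-memory cache of recent translations.
"""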
import os
import time
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import threading
import queue
import multiprocessing
# Check whether a GPU is available (reported for information only; inference below runs on CPU)
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
print(f"GPU available: {has_gpu} - {gpu_name}")
# Download model files
def get_model_path(repo_id, filename):
print(f"Obtaining {filename}...")
return hf_hub_download(repo_id=repo_id, filename=filename)
base_model_path = get_model_path(
"johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
"articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
"johnpaulbin/articulate-V1-Q8_0-GGUF",
"articulate-V1-q8_0.gguf"
)
# Environment hints for llama.cpp CPU performance (note: these are typically build-time
# flags for llama-cpp-python, so they may have no effect on a prebuilt wheel)
os.environ["LLAMA_CUBLAS"] = "0" # Disable CUDA since we're CPU only
os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
os.environ["LLAMA_AVX"] = "1" # Enable AVX
os.environ["LLAMA_AVX2"] = "1" # Enable AVX2
os.environ["LLAMA_F16"] = "1" # Use FP16 where available
# Import the llama.cpp Python bindings
from llama_cpp import Llama
print("Using CPU-optimized llama-cpp-python")
# Cache for translations
translation_cache = {}
MAX_CACHE_SIZE = 5000  # Maximum number of cached translations
# Common phrases for pre-loading
COMMON_PHRASES = {
"English to Spanish": [
"Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
"I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
],
"Spanish to English": [
"Hola", "Gracias", "Buenos dรญas", "ยฟCรณmo estรกs?", "ยฟCรณmo te llamas?",
"No entiendo", "Por favor", "Lo siento", "Sรญ", "No", "Dรณnde estรก"
],
"English to Korean": [
"Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
"I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
],
"Korean to English": [
"์•ˆ๋…•ํ•˜์„ธ์š”", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค", "์ข‹์€ ์•„์นจ์ž…๋‹ˆ๋‹ค", "์–ด๋–ป๊ฒŒ ์ง€๋‚ด์„ธ์š”?", "์ด๋ฆ„์ด ๋ญ์˜ˆ์š”?",
"์ดํ•ด๊ฐ€ ์•ˆ ๋ผ์š”", "์ œ๋ฐœ", "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค", "๋„ค", "์•„๋‹ˆ์š”", "์–ด๋””์— ์žˆ์–ด์š”"
]
}
# Fast lookup into the shared translation cache
def get_cached_translation(direction, text):
    """Return a previously computed translation, or None on a cache miss."""
    return translation_cache.get(f"{direction}:{text}")
# Create a worker pool for parallel translation
class ModelWorkerPool:
def __init__(self, num_workers=1):
self.num_workers = num_workers
self.request_queue = queue.Queue()
self.response_queue = queue.Queue()
self.workers = []
self.initialized = False
# Create shared model instance with optimized settings
print("Initializing model with CPU optimizations...")
start_time = time.time()
# CPU optimization settings - use fewer threads for Q8 model
cpu_count = multiprocessing.cpu_count()
optimal_threads = max(1, min(4, cpu_count - 1)) # Use fewer threads for better performance
# Create a smaller context size for faster inference
self.model = Llama(
model_path=base_model_path,
lora_path=adapter_path,
n_ctx=256, # Reduced context for faster processing
n_threads=optimal_threads, # Optimized thread count
n_batch=512, # Reduced batch size for CPU
use_mmap=True, # Efficient memory mapping
n_gpu_layers=0, # CPU only
seed=42, # Consistent results
verbose=False, # Reduce overhead
rope_freq_base=10000, # Default attention parameters
rope_freq_scale=1.0,
)
print(f"Model loaded in {time.time() - start_time:.2f} seconds")
# Start worker threads
for i in range(num_workers):
worker = threading.Thread(target=self._worker_loop, daemon=True)
worker.start()
self.workers.append(worker)
self.initialized = True
# Pre-warm in background thread to not block startup
warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
warming_thread.start()
def _worker_loop(self):
"""Worker thread that processes translation requests"""
while True:
try:
request = self.request_queue.get()
if request is None: # Shutdown signal
break
direction, text, callback_id = request
                # Check the shared translation cache first
                cache_key = f"{direction}:{text}"
                cached = get_cached_translation(direction, text)
                if cached is not None:
                    result = cached
                else:
                    # Process a new translation
                    result = self._process_translation(direction, text)
                    # Store in the shared cache, evicting the oldest entry when full
                    if len(translation_cache) >= MAX_CACHE_SIZE:
                        translation_cache.pop(next(iter(translation_cache)))
                    translation_cache[cache_key] = result
self.response_queue.put((callback_id, result))
self.request_queue.task_done()
except Exception as e:
print(f"Error in worker thread: {e}")
self.response_queue.put((callback_id, f"Error: {str(e)}"))
self.request_queue.task_done()
def _prewarm_model(self):
"""Pre-compute common translations to warm up the model - minimal to save time"""
print("Pre-warming model with essential phrases (truncated for speed)...")
start = time.time()
# Just warm up with one phrase per direction to speed up startup
for direction, phrases in COMMON_PHRASES.items():
self._process_translation(direction, phrases[0])
# Only do the most common phrase to save startup time
print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")
def _process_translation(self, direction, text):
"""Optimized translation function"""
# Skip empty inputs
if not text or not text.strip():
return ""
# Start timing for performance tracking
start_time = time.time()
# Map language directions
lang_map = {
"English to Spanish": ("ENGLISH", "SPANISH"),
"Spanish to English": ("SPANISH", "ENGLISH"),
"Korean to English": ("KOREAN", "ENGLISH"),
"English to Korean": ("ENGLISH", "KOREAN")
}
if direction not in lang_map:
return "Invalid direction"
source_lang, target_lang = lang_map[direction]
# Truncate long inputs for faster processing
max_input_length = 100 # Limit input length
if len(text) > max_input_length:
text = text[:max_input_length] + "..."
# Efficient prompt format
prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
# Reduce max tokens for faster inference
input_tokens = len(text.split())
max_tokens = min(50, max(20, int(input_tokens * 1.2)))
# Generate translation with aggressive performance optimizations
try:
response = self.model.create_completion(
prompt,
max_tokens=max_tokens,
temperature=0.0, # Deterministic for faster inference
top_k=1, # Only consider most likely token
top_p=1.0, # No sampling
repeat_penalty=1.0, # No repeat penalty
stream=False, # Get complete response at once
stop=["[/", "\n\n"], # Stop early if possible
)
translation = response['choices'][0]['text'].strip()
# Log performance
inference_time = time.time() - start_time
tokens_per_second = (input_tokens + len(translation.split())) / inference_time
print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
return translation
except Exception as e:
print(f"Translation error: {e}")
return f"Error: Could not translate text. Try shorter input."
def request_translation(self, direction, text, callback_id):
"""Queue a translation request"""
self.request_queue.put((direction, text, callback_id))
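# Flow: request_translation() enqueues (direction, text, callback_id); a worker
# thread translates (or serves from cache) and puts (callback_id, result) on
# response_queue, which translate() below polls until it sees its own callback_id.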
# Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
worker_pool = ModelWorkerPool(num_workers=1)
# Counter for request IDs
next_request_id = 0
# Fast similarity check function for finding close matches in cache
def find_similar_cached(direction, text, threshold=0.8):
"""Find similar translations in cache based on prefix matching"""
if len(text) < 5: # For very short inputs, look for exact matches
return None
text_lower = text.lower()
best_match = None
best_score = 0
for cached_key in list(translation_cache.keys()):
cached_dir, cached_text = cached_key.split(":", 1)
if cached_dir != direction:
continue
# Simple similarity - prefix matching
if cached_text.lower().startswith(text_lower[:5]):
similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
if similarity > best_score and similarity > threshold:
best_score = similarity
best_match = translation_cache[cached_key]
return best_match
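# Illustrative example (hypothetical cache state): if "English to Spanish:Hello" is
# already cached, then find_similar_cached("English to Spanish", "Hello!") matches on
# the "hello" prefix, scores min(1.0, 6/5) = 1.0 > 0.8, and reuses the cached translation.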
# Gradio interface functions
def translate(direction, text, progress=gr.Progress()):
"""Queue translation request and wait for result - optimized version"""
global next_request_id
# Trim whitespace for better cache hits
text = text.strip()
# Skip empty inputs
if not text:
return ""
    # Check the shared translation cache first
    cached = get_cached_translation(direction, text)
    if cached is not None:
        return cached
# For short inputs, try to find similar cached
if len(text) < 20:
similar = find_similar_cached(direction, text)
if similar:
return similar
# Generate unique request ID
request_id = next_request_id
next_request_id += 1
# Queue the request
worker_pool.request_translation(direction, text, request_id)
# Wait for the response with reasonable timeout
progress(0, desc="Translating...")
max_wait = 20 # Reduced maximum wait time
start_time = time.time()
# Show progress while waiting
while time.time() - start_time < max_wait:
progress((time.time() - start_time) / max_wait)
        # Check for our response; keep any results that belong to other requests
        pending = []
        found = None
        while True:
            try:
                resp_id, result = worker_pool.response_queue.get_nowait()
            except queue.Empty:
                break
            if resp_id == request_id:
                found = result
            else:
                pending.append((resp_id, result))
        # Put back responses meant for other callers
        for item in pending:
            worker_pool.response_queue.put(item)
        if found is not None:
            # The worker thread has already stored this result in translation_cache
            progress(1.0)
            return found
# Small sleep to prevent CPU hogging - reduced for faster response
time.sleep(0.01)
progress(1.0)
return "Translation timed out. Please try a shorter text."
# Create Gradio interface with simplified UI for performance
with gr.Blocks(title="Fast CPU Translation App") as iface:
gr.Markdown(f"""
## Fast CPU Translation App
Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only - Optimized'}
**For best performance, use short sentences or phrases.**
""")
with gr.Row():
direction = gr.Radio(
choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
label="Translation Direction",
value="English to Spanish"
)
with gr.Row():
input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
output_text = gr.Textbox(lines=3, label="Translation")
# Add translate button
translate_btn = gr.Button("Translate")
translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
# Add examples with common short phrases for quick results
gr.Examples(
examples=[
["English to Spanish", "Hello"],
["Spanish to English", "Hola"],
["English to Korean", "Thank you"],
["Korean to English", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค"]
],
inputs=[direction, input_text],
fn=translate,
outputs=output_text
)
# Add performance tips
gr.Markdown("""
### Performance Tips
- Keep text under 50 characters for fastest results
- Common phrases are pre-cached
    - The first translation may be slow; subsequent ones are faster
    - Repeated phrases are served from an in-memory cache
""")
# Launch with optimized settings
if __name__ == "__main__":
iface.launch(
debug=False,
show_error=True,
share=False,
quiet=True,
server_name="0.0.0.0",
server_port=7860
)
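    # To run locally (assuming torch, gradio, huggingface_hub and llama-cpp-python
    # are installed): `python app.py`, then open http://localhost:7860 in a browser.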