from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr
import multiprocessing
import time
import os
# Model paths - download models if not already cached
def get_model_path(repo_id, filename):
print(f"Obtaining {filename}...")
return hf_hub_download(repo_id=repo_id, filename=filename)
# Get models
base_model_path = get_model_path(
"johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
"articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
"johnpaulbin/articulate-V1-Q8_0-GGUF",
"articulate-V1-q8_0.gguf"
)
# CPU optimization settings
cpu_count = multiprocessing.cpu_count()
physical_cores = max(1, cpu_count // 2) # Estimate physical cores
optimal_threads = max(4, physical_cores - 1) # Leave one core free for system
batch_size = int(os.environ.get("BATCH_SIZE", "512")) # Configurable batch size
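# For example, launching with `BATCH_SIZE=256 python app.py` overrides the default
# (the file name app.py is an assumption; use whatever this script is saved as).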
print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")
# Initialize model with optimized parameters
start_time = time.time()
llm = Llama(
    model_path=base_model_path,
    lora_path=adapter_path,
    n_ctx=512,                  # Context length
    n_threads=optimal_threads,  # Optimized thread count
    n_batch=batch_size,         # Process more tokens in parallel
    use_mmap=True,              # More efficient memory usage
    n_gpu_layers=0,             # CPU only
    seed=42,                    # Consistent results
    verbose=False               # Reduce logging overhead
)
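# Note: n_ctx=512 bounds prompt + completion tokens per request; longer inputs
# would need a larger context window at the cost of more memory.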
print(f"Model loaded in {time.time() - start_time:.2f} seconds")
# Translation cache
translation_cache = {}
MAX_CACHE_SIZE = 100 # Limit cache size
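# translation_cache is used as a simple FIFO cache: dict insertion order is
# guaranteed in Python 3.7+, so popping the first key evicts the oldest entry.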
def translate(direction, text):
    # Skip empty inputs
    if not text or not text.strip():
        return ""
    # Check cache first for faster response
    cache_key = f"{direction}:{text}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]
    # Start timing for performance tracking
    start_time = time.time()
    # Map language directions
    lang_map = {
        "English to Spanish": ("ENGLISH", "SPANISH"),
        "Spanish to English": ("SPANISH", "ENGLISH"),
        "Korean to English": ("KOREAN", "ENGLISH"),
        "English to Korean": ("ENGLISH", "KOREAN")
    }
    if direction not in lang_map:
        return "Invalid direction"
    source_lang, target_lang = lang_map[direction]
    # Efficient prompt format
    prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
    # Estimate an appropriate output length (input word count as a rough token proxy)
    input_tokens = len(text.split())
    max_tokens = min(200, max(50, int(input_tokens * 1.5)))
    # Generate translation with optimized settings
    response = llm.create_completion(
        prompt,
        max_tokens=max_tokens,
        temperature=0.0,     # Deterministic for faster inference
        top_k=1,             # Only consider most likely token
        top_p=1.0,           # No sampling
        repeat_penalty=1.0,  # No repeat penalty processing
        stream=False         # Get complete response at once (faster)
    )
    translation = response['choices'][0]['text'].strip()
    # Cache result
    if len(translation_cache) >= MAX_CACHE_SIZE:
        # Remove oldest entry (first key)
        translation_cache.pop(next(iter(translation_cache)))
    translation_cache[cache_key] = translation
    # Log performance
    inference_time = time.time() - start_time
    tokens_per_second = (input_tokens + len(translation.split())) / inference_time
    print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
    return translation
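# Optional sanity check (not part of the app flow): translate() can be called
# directly before launching the UI, e.g.
#   print(translate("English to Spanish", "Good morning!"))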
# Create Gradio interface with minimal overhead
with gr.Blocks(title="Fast Translation App") as iface:
    gr.Markdown("## Translation App")
    with gr.Row():
        direction = gr.Dropdown(
            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
            label="Translation Direction",
            value="English to Spanish"
        )
    with gr.Row():
        input_text = gr.Textbox(lines=5, label="Input Text")
        output_text = gr.Textbox(lines=5, label="Translation")
    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
    # Add examples for convenience
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello, how are you today?"],
            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
            ["English to Korean", "The weather is nice today."],
            ["Korean to English", "오늘 날씨가 좋습니다."]
        ],
        inputs=[direction, input_text],
        outputs=output_text,
        fn=translate,        # Required so examples can be pre-computed
        cache_examples=True  # Pre-compute examples
    )
# Launch with optimized settings
iface.launch(debug=False, show_error=True)
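# Rough local-run sketch (assumed dependency set, versions not pinned):
#   pip install llama-cpp-python huggingface_hub gradio
#   python app.py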