from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr
import multiprocessing
import time
import os

# Model paths
def get_model_path(repo_id, filename):
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)

# Get models
base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF",
    "articulate-V1-q8_0.gguf"
)

# Conservative CPU settings to avoid memory corruption
cpu_count = multiprocessing.cpu_count()
optimal_threads = max(1, min(8, cpu_count // 2))  # More conservative thread count
batch_size = 128  # Reduced batch size to prevent memory issues

print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")

# Initialize model with safer parameters
start_time = time.time()
llm = Llama(
    model_path=base_model_path,
    lora_path=adapter_path,
    n_ctx=512,
    n_threads=optimal_threads,
    n_batch=batch_size,  # Smaller batch size for stability
    use_mmap=True,
    n_gpu_layers=0,
    verbose=False
)
print(f"Model loaded in {time.time() - start_time:.2f} seconds")

# Simple translation cache (limited size)
translation_cache = {}
MAX_CACHE_SIZE = 50  # Reduced cache size

def translate(direction, text):
    # Validate input
    if not text or not text.strip():
        return ""
    text = text.strip()

    # Simple cache lookup
    cache_key = f"{direction}:{text}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]

    # Start timing
    start_time = time.time()

    # Language mapping
    lang_map = {
        "English to Spanish": ("ENGLISH", "SPANISH"),
        "Spanish to English": ("SPANISH", "ENGLISH"),
        "Korean to English": ("KOREAN", "ENGLISH"),
        "English to Korean": ("ENGLISH", "KOREAN")
    }
    if direction not in lang_map:
        return "Invalid direction"
    source_lang, target_lang = lang_map[direction]

    # Create prompt
    prompt = f"[{source_lang}]{text}[{target_lang}]"

    try:
        # Generate translation with conservative settings
        response = llm.create_completion(
            prompt,
            max_tokens=128,      # Conservative token limit
            temperature=0.0,     # Deterministic
            top_k=1,             # Most likely token only
            top_p=1.0,           # No sampling
            repeat_penalty=1.0,
            stream=False
        )
        translation = response['choices'][0]['text'].strip()

        # Manage cache size
        if len(translation_cache) >= MAX_CACHE_SIZE:
            # Remove oldest entry
            translation_cache.pop(next(iter(translation_cache)))
        translation_cache[cache_key] = translation

        # Log performance
        inference_time = time.time() - start_time
        print(f"Translation completed in {inference_time:.3f}s")

        return translation
    except Exception as e:
        print(f"Translation error: {e}")
        return f"Error during translation: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="Translation App") as iface:
    gr.Markdown("## Fast Translation App")

    with gr.Row():
        direction = gr.Dropdown(
            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
            label="Translation Direction",
            value="English to Spanish"
        )

    with gr.Row():
        input_text = gr.Textbox(lines=5, label="Input Text")
        output_text = gr.Textbox(lines=5, label="Translation")

    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)

    # Examples WITHOUT caching (to avoid memory issues)
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello, how are you today?"],
            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
            ["English to Korean", "The weather is nice today."],
            ["Korean to English", "오늘 날씨가 좋습니다."]
        ],
        inputs=[direction, input_text],
        cache_examples=False  # Disabled caching to prevent memory issues
    )

# Launch with safer settings
iface.launch(debug=False, show_error=True)