Spaces:

johnpaulbin
/

googoo

Sleeping

App Files Files Community

johnpaulbin committed on Feb 25

Commit

c2b521a

verified ·

1 Parent(s): 590c26b

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -59

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import multiprocessing
 import time
 import os
-# Model paths - download models if not already cached
 def get_model_path(repo_id, filename):
     print(f"Obtaining {filename}...")
     return hf_hub_download(repo_id=repo_id, filename=filename)
@@ -20,47 +20,47 @@ adapter_path = get_model_path(
     "articulate-V1-q8_0.gguf"
 )
-# CPU optimization settings
 cpu_count = multiprocessing.cpu_count()
-physical_cores = max(1, cpu_count // 2)  # Estimate physical cores
-optimal_threads = max(4, physical_cores - 1)  # Leave one core free for system
-batch_size = int(os.environ.get("BATCH_SIZE", "512"))  # Configurable batch size
 print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")
-# Initialize model with optimized parameters
 start_time = time.time()
 llm = Llama(
     model_path=base_model_path,
     lora_path=adapter_path,
-    n_ctx=512,                # Context length
-    n_threads=optimal_threads, # Optimized thread count
-    n_batch=batch_size,       # Process more tokens in parallel
-    use_mmap=True,            # More efficient memory usage
-    n_gpu_layers=0,           # CPU only
-    seed=42,                  # Consistent results
-    verbose=False             # Reduce logging overhead
 )
 print(f"Model loaded in {time.time() - start_time:.2f} seconds")
-# Translation cache
 translation_cache = {}
-MAX_CACHE_SIZE = 100  # Limit cache size
 def translate(direction, text):
-    # Skip empty inputs
     if not text or not text.strip():
         return ""
-    # Check cache first for faster response
     cache_key = f"{direction}:{text}"
     if cache_key in translation_cache:
         return translation_cache[cache_key]
-    # Start timing for performance tracking
     start_time = time.time()
-    # Map language directions
     lang_map = {
         "English to Spanish": ("ENGLISH", "SPANISH"),
         "Spanish to English": ("SPANISH", "ENGLISH"),
@@ -73,42 +73,42 @@ def translate(direction, text):
     source_lang, target_lang = lang_map[direction]
-    # Efficient prompt format
-    prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
-    # Estimate appropriate token length based on input
-    input_tokens = len(text.split())
-    max_tokens = min(200, max(50, int(input_tokens * 1.5)))
-    # Generate translation with optimized settings
-    response = llm.create_completion(
-        prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,      # Deterministic for faster inference
-        top_k=1,              # Only consider most likely token
-        top_p=1.0,            # No sampling
-        repeat_penalty=1.0,   # No repeat penalty processing
-        stream=False          # Get complete response at once (faster)
-    )
-    translation = response['choices'][0]['text'].strip()
-    # Cache result
-    if len(translation_cache) >= MAX_CACHE_SIZE:
-        # Remove oldest entry (first key)
-        translation_cache.pop(next(iter(translation_cache)))
-    translation_cache[cache_key] = translation
-    # Log performance
-    inference_time = time.time() - start_time
-    tokens_per_second = (input_tokens + len(translation.split())) / inference_time
-    print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
-    return translation
-# Create Gradio interface with minimal overhead
-with gr.Blocks(title="Fast Translation App") as iface:
-    gr.Markdown("## Translation App")
     with gr.Row():
         direction = gr.Dropdown(
@@ -125,7 +125,7 @@ with gr.Blocks(title="Fast Translation App") as iface:
     translate_btn = gr.Button("Translate")
     translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
-    # Add examples - FIXED VERSION
     gr.Examples(
         examples=[
             ["English to Spanish", "Hello, how are you today?"],
@@ -134,10 +134,8 @@ with gr.Blocks(title="Fast Translation App") as iface:
             ["Korean to English", "오늘 날씨가 좋습니다."]
         ],
         inputs=[direction, input_text],
-        fn=translate,  # Added the missing function parameter
-        outputs=output_text,
-        cache_examples=True
     )
-# Launch with optimized settings
 iface.launch(debug=False, show_error=True)

 import time
 import os
+# Model paths
 def get_model_path(repo_id, filename):
     print(f"Obtaining {filename}...")
     return hf_hub_download(repo_id=repo_id, filename=filename)
     "articulate-V1-q8_0.gguf"
 )
+# Conservative CPU settings to avoid memory corruption
 cpu_count = multiprocessing.cpu_count()
+optimal_threads = max(1, min(8, cpu_count // 2))  # More conservative thread count
+batch_size = 128  # Reduced batch size to prevent memory issues
 print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")
+# Initialize model with safer parameters
 start_time = time.time()
 llm = Llama(
     model_path=base_model_path,
     lora_path=adapter_path,
+    n_ctx=512,
+    n_threads=optimal_threads,
+    n_batch=batch_size,      # Smaller batch size for stability
+    use_mmap=True,
+    n_gpu_layers=0,
+    verbose=False
 )
 print(f"Model loaded in {time.time() - start_time:.2f} seconds")
+# Simple translation cache (limited size)
 translation_cache = {}
+MAX_CACHE_SIZE = 50  # Reduced cache size
 def translate(direction, text):
+    # Validate input
     if not text or not text.strip():
         return ""
+    text = text.strip()
+    # Simple cache lookup
     cache_key = f"{direction}:{text}"
     if cache_key in translation_cache:
         return translation_cache[cache_key]
+    # Start timing
     start_time = time.time()
+    # Language mapping
     lang_map = {
         "English to Spanish": ("ENGLISH", "SPANISH"),
         "Spanish to English": ("SPANISH", "ENGLISH"),
     source_lang, target_lang = lang_map[direction]
+    # Create prompt
+    prompt = f"[{source_lang}]{text}[{target_lang}]"
+    try:
+        # Generate translation with conservative settings
+        response = llm.create_completion(
+            prompt,
+            max_tokens=128,      # Conservative token limit
+            temperature=0.0,     # Deterministic
+            top_k=1,             # Most likely token only
+            top_p=1.0,           # No sampling
+            repeat_penalty=1.0,
+            stream=False
+        )
+        translation = response['choices'][0]['text'].strip()
+        # Manage cache size
+        if len(translation_cache) >= MAX_CACHE_SIZE:
+            # Remove oldest entry
+            translation_cache.pop(next(iter(translation_cache)))
+        translation_cache[cache_key] = translation
+        # Log performance
+        inference_time = time.time() - start_time
+        print(f"Translation completed in {inference_time:.3f}s")
+        return translation
+    except Exception as e:
+        print(f"Translation error: {e}")
+        return f"Error during translation: {str(e)}"
+# Create Gradio interface
+with gr.Blocks(title="Translation App") as iface:
+    gr.Markdown("## Fast Translation App")
     with gr.Row():
         direction = gr.Dropdown(
     translate_btn = gr.Button("Translate")
     translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
+    # Examples WITHOUT caching (to avoid memory issues)
     gr.Examples(
         examples=[
             ["English to Spanish", "Hello, how are you today?"],
             ["Korean to English", "오늘 날씨가 좋습니다."]
         ],
         inputs=[direction, input_text],
+        cache_examples=False  # Disabled caching to prevent memory issues
     )
+# Launch with safer settings
 iface.launch(debug=False, show_error=True)