from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr
import multiprocessing
import time
import os

# Model paths
def get_model_path(repo_id, filename):
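    """Download a file from the Hugging Face Hub, reusing the local cache when available."""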
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)

# Get models
base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF", 
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF", 
    "articulate-V1-q8_0.gguf"
)

# Conservative CPU settings to keep memory usage and thread contention low
cpu_count = multiprocessing.cpu_count()
optimal_threads = max(1, min(8, cpu_count // 2))  # Use at most half the cores, capped at 8
batch_size = 128  # Reduced batch size to prevent memory issues

print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")

# Initialize model with safer parameters
start_time = time.time()
llm = Llama(
    model_path=base_model_path,
    lora_path=adapter_path,
    n_ctx=512,                # Small context window to keep memory usage low
    n_threads=optimal_threads,
    n_batch=batch_size,       # Smaller batch size for stability
    use_mmap=True,            # Memory-map the model file instead of loading it all into RAM
    n_gpu_layers=0,           # CPU-only inference
    verbose=False             # Suppress llama.cpp logging
)
print(f"Model loaded in {time.time() - start_time:.2f} seconds")

# Simple translation cache (limited size)
translation_cache = {}
MAX_CACHE_SIZE = 50  # Reduced cache size

def translate(direction, text):
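    """Translate `text` in the given `direction`, using a small in-memory cache to skip repeat inference."""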
    # Validate input
    if not text or not text.strip():
        return ""
    
    text = text.strip()
    
    # Simple cache lookup
    cache_key = f"{direction}:{text}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]
    
    # Start timing
    start_time = time.time()
    
    # Language mapping
    lang_map = {
        "English to Spanish": ("ENGLISH", "SPANISH"),
        "Spanish to English": ("SPANISH", "ENGLISH"),
        "Korean to English": ("KOREAN", "ENGLISH"),
        "English to Korean": ("ENGLISH", "KOREAN")
    }
    
    if direction not in lang_map:
        return "Invalid direction"
    
    source_lang, target_lang = lang_map[direction]
    
    # Build the prompt in the format the model expects, e.g. "[ENGLISH]Hello[SPANISH]"
    prompt = f"[{source_lang}]{text}[{target_lang}]"
    
    try:
        # Generate translation with conservative settings
        response = llm.create_completion(
            prompt,
            max_tokens=128,      # Conservative token limit
            temperature=0.0,     # Deterministic output
            top_k=1,             # Greedy decoding: most likely token only
            top_p=1.0,           # No nucleus sampling
            repeat_penalty=1.0,
            stream=False         # Return the whole completion at once
        )
        
        translation = response['choices'][0]['text'].strip()
        
        # Manage cache size
        if len(translation_cache) >= MAX_CACHE_SIZE:
            # Remove oldest entry
            translation_cache.pop(next(iter(translation_cache)))
        translation_cache[cache_key] = translation
        
        # Log performance
        inference_time = time.time() - start_time
        print(f"Translation completed in {inference_time:.3f}s")
        
        return translation
        
    except Exception as e:
        print(f"Translation error: {e}")
        return f"Error during translation: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="Translation App") as iface:
    gr.Markdown("## Fast Translation App")
    
    with gr.Row():
        direction = gr.Dropdown(
            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
            label="Translation Direction",
            value="English to Spanish"
        )
    
    with gr.Row():
        input_text = gr.Textbox(lines=5, label="Input Text")
        output_text = gr.Textbox(lines=5, label="Translation")
    
    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
    
    # Examples WITHOUT caching (to avoid memory issues)
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello, how are you today?"],
            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
            ["English to Korean", "The weather is nice today."],
            ["Korean to English", "오늘 날씨가 좋습니다."]
        ],
        inputs=[direction, input_text],
        cache_examples=False  # Disabled caching to prevent memory issues
    )

# Launch with safer settings
iface.launch(debug=False, show_error=True)