johnpaulbin committed
Commit c2b521a · verified · 1 Parent(s): 590c26b

Update app.py

Files changed (1)
  1. app.py +57 -59
app.py CHANGED
@@ -5,7 +5,7 @@ import multiprocessing
 import time
 import os
 
-# Model paths - download models if not already cached
+# Model paths
 def get_model_path(repo_id, filename):
     print(f"Obtaining {filename}...")
     return hf_hub_download(repo_id=repo_id, filename=filename)
@@ -20,47 +20,47 @@ adapter_path = get_model_path(
     "articulate-V1-q8_0.gguf"
 )
 
-# CPU optimization settings
+# Conservative CPU settings to avoid memory corruption
 cpu_count = multiprocessing.cpu_count()
-physical_cores = max(1, cpu_count // 2)  # Estimate physical cores
-optimal_threads = max(4, physical_cores - 1)  # Leave one core free for system
-batch_size = int(os.environ.get("BATCH_SIZE", "512"))  # Configurable batch size
+optimal_threads = max(1, min(8, cpu_count // 2))  # More conservative thread count
+batch_size = 128  # Reduced batch size to prevent memory issues
 
 print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")
 
-# Initialize model with optimized parameters
+# Initialize model with safer parameters
 start_time = time.time()
 llm = Llama(
     model_path=base_model_path,
     lora_path=adapter_path,
-    n_ctx=512,  # Context length
-    n_threads=optimal_threads,  # Optimized thread count
-    n_batch=batch_size,  # Process more tokens in parallel
-    use_mmap=True,  # More efficient memory usage
-    n_gpu_layers=0,  # CPU only
-    seed=42,  # Consistent results
-    verbose=False  # Reduce logging overhead
+    n_ctx=512,
+    n_threads=optimal_threads,
+    n_batch=batch_size,  # Smaller batch size for stability
+    use_mmap=True,
+    n_gpu_layers=0,
+    verbose=False
 )
 print(f"Model loaded in {time.time() - start_time:.2f} seconds")
 
-# Translation cache
+# Simple translation cache (limited size)
 translation_cache = {}
-MAX_CACHE_SIZE = 100  # Limit cache size
+MAX_CACHE_SIZE = 50  # Reduced cache size
 
 def translate(direction, text):
-    # Skip empty inputs
+    # Validate input
     if not text or not text.strip():
         return ""
-
-    # Check cache first for faster response
+
+    text = text.strip()
+
+    # Simple cache lookup
     cache_key = f"{direction}:{text}"
     if cache_key in translation_cache:
         return translation_cache[cache_key]
 
-    # Start timing for performance tracking
+    # Start timing
     start_time = time.time()
 
-    # Map language directions
+    # Language mapping
     lang_map = {
         "English to Spanish": ("ENGLISH", "SPANISH"),
         "Spanish to English": ("SPANISH", "ENGLISH"),
@@ -73,42 +73,42 @@ def translate(direction, text):
 
     source_lang, target_lang = lang_map[direction]
 
-    # Efficient prompt format
-    prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
-
-    # Estimate appropriate token length based on input
-    input_tokens = len(text.split())
-    max_tokens = min(200, max(50, int(input_tokens * 1.5)))
-
-    # Generate translation with optimized settings
-    response = llm.create_completion(
-        prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,  # Deterministic for faster inference
-        top_k=1,  # Only consider most likely token
-        top_p=1.0,  # No sampling
-        repeat_penalty=1.0,  # No repeat penalty processing
-        stream=False  # Get complete response at once (faster)
-    )
-
-    translation = response['choices'][0]['text'].strip()
-
-    # Cache result
-    if len(translation_cache) >= MAX_CACHE_SIZE:
-        # Remove oldest entry (first key)
-        translation_cache.pop(next(iter(translation_cache)))
-    translation_cache[cache_key] = translation
-
-    # Log performance
-    inference_time = time.time() - start_time
-    tokens_per_second = (input_tokens + len(translation.split())) / inference_time
-    print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
-
-    return translation
+    # Create prompt
+    prompt = f"[{source_lang}]{text}[{target_lang}]"
+
+    try:
+        # Generate translation with conservative settings
+        response = llm.create_completion(
+            prompt,
+            max_tokens=128,  # Conservative token limit
+            temperature=0.0,  # Deterministic
+            top_k=1,  # Most likely token only
+            top_p=1.0,  # No sampling
+            repeat_penalty=1.0,
+            stream=False
+        )
+
+        translation = response['choices'][0]['text'].strip()
+
+        # Manage cache size
+        if len(translation_cache) >= MAX_CACHE_SIZE:
+            # Remove oldest entry
+            translation_cache.pop(next(iter(translation_cache)))
+        translation_cache[cache_key] = translation
+
+        # Log performance
+        inference_time = time.time() - start_time
+        print(f"Translation completed in {inference_time:.3f}s")
+
+        return translation
+
+    except Exception as e:
+        print(f"Translation error: {e}")
+        return f"Error during translation: {str(e)}"
 
-# Create Gradio interface with minimal overhead
-with gr.Blocks(title="Fast Translation App") as iface:
-    gr.Markdown("## Translation App")
+# Create Gradio interface
+with gr.Blocks(title="Translation App") as iface:
+    gr.Markdown("## Fast Translation App")
 
     with gr.Row():
         direction = gr.Dropdown(
@@ -125,7 +125,7 @@ with gr.Blocks(title="Fast Translation App") as iface:
     translate_btn = gr.Button("Translate")
     translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
 
-    # Add examples - FIXED VERSION
+    # Examples WITHOUT caching (to avoid memory issues)
    gr.Examples(
         examples=[
             ["English to Spanish", "Hello, how are you today?"],
@@ -134,10 +134,8 @@ with gr.Blocks(title="Fast Translation App") as iface:
             ["Korean to English", "오늘 날씨가 좋습니다."]
         ],
         inputs=[direction, input_text],
-        fn=translate,  # Added the missing function parameter
-        outputs=output_text,
-        cache_examples=True
+        cache_examples=False  # Disabled caching to prevent memory issues
     )
 
-# Launch with optimized settings
+# Launch with safer settings
 iface.launch(debug=False, show_error=True)
 
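A note on the caching pattern retained in this commit: translation_cache.pop(next(iter(translation_cache))) evicts the oldest entry because plain dicts preserve insertion order in Python 3.7+, so the cache behaves as FIFO (a cache hit does not refresh an entry's position). A minimal standalone sketch of the same pattern, using hypothetical keys and a deliberately small limit chosen for illustration:

    # FIFO cache sketch: dicts iterate in insertion order (Python 3.7+),
    # so next(iter(cache)) is always the oldest surviving key.
    MAX_CACHE_SIZE = 3  # illustrative; app.py uses 50
    cache = {}

    def cache_put(key, value):
        if len(cache) >= MAX_CACHE_SIZE:
            cache.pop(next(iter(cache)))  # evict the oldest insertion
        cache[key] = value

    for i in range(5):
        cache_put(f"k{i}", i)

    print(list(cache))  # ['k2', 'k3', 'k4'] -- k0 and k1 were evicted

If least-recently-used eviction were wanted instead, collections.OrderedDict with move_to_end() on each hit is the standard variant.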
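On the new thread setting: max(1, min(8, cpu_count // 2)) uses half the logical cores, clamped to the range 1-8, so 2 cores give 1 thread, 8 cores give 4, and 32 cores give 8. A small sketch of the clamp, with hypothetical core counts:

    import multiprocessing

    def conservative_threads(cpu_count):
        # Half the logical cores, but never fewer than 1 or more than 8.
        return max(1, min(8, cpu_count // 2))

    for cores in (1, 2, 8, 32):
        print(cores, "->", conservative_threads(cores))  # 1, 1, 4, 8

    # On the current machine:
    print(conservative_threads(multiprocessing.cpu_count()))

Note that batch_size is now hard-coded to 128; the previous version read it from the BATCH_SIZE environment variable.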