johnpaulbin committed on
Commit
9c9d112
·
verified ·
1 Parent(s): 253566e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -1
app.py CHANGED
# Fetch the LoRA adapter (GGUF quantized weights) from the Hugging Face Hub.
adapter_repo = "johnpaulbin/articulate-V1-Q8_0-GGUF"
adapter_file = "articulate-V1-q8_0.gguf"
adapter_path = hf_hub_download(repo_id=adapter_repo, filename=adapter_file)

# Build the Llama instance from the base model combined with the adapter.
llm = Llama(
    model_path=base_model_path,
    lora_path=adapter_path,
    n_ctx=512,       # context length; set manually since the adapter lacks it
    n_threads=2,     # fixed thread count; adjust based on your system
    n_gpu_layers=0,  # set to >0 if GPU acceleration is desired and supported
)
 
# Fetch the LoRA adapter (GGUF quantized weights) from the Hugging Face Hub.
adapter_repo = "johnpaulbin/articulate-V1-Q8_0-GGUF"
adapter_file = "articulate-V1-q8_0.gguf"
adapter_path = hf_hub_download(repo_id=adapter_repo, filename=adapter_file)

import multiprocessing

# Size the thread pool from the available CPU cores: half the logical cores
# (llama.cpp typically scales with physical cores, and cpu_count() reports
# logical ones), never fewer than 2.
try:
    cpu_count = multiprocessing.cpu_count()
except NotImplementedError:
    # cpu_count() can raise on platforms where the count is undeterminable;
    # fall back to the previous fixed default rather than crash at startup.
    cpu_count = 2
optimal_threads = max(2, cpu_count // 2)
print(f"Initializing model with {optimal_threads} threads...")

# Initialize the Llama model with base model and adapter
llm = Llama(
    model_path=base_model_path,
    lora_path=adapter_path,
    n_ctx=512,                   # Context length, set manually since adapter lacks it
    n_threads=optimal_threads,   # computed above from the CPU core count
    use_mmap=True,               # memory-map the model file to cut load time/RAM
    n_gpu_layers=0               # Set to >0 if GPU acceleration is desired and supported
)