drvikasgaur committed
Commit 713b966 · verified · 1 Parent(s): d266723

Update app.py

Files changed (1)
  1. app.py +8 -16
app.py CHANGED
@@ -5,15 +5,15 @@ import torch
  import os
 
  # ---- LOAD LLM ----
- model_name = "meta-llama/Llama-3.2-3B-Instruct"
+ model_name = "Qwen/Qwen1.5-0.5B"
 
- # Read token from environment variable (set in HF Space Secrets)
+ # No need for token usually; Qwen is public, but keeping it flexible
  hf_token = os.getenv("HF_TOKEN")
 
  tokenizer = AutoTokenizer.from_pretrained(
      model_name,
-     token=hf_token,         # Secure token
-     trust_remote_code=True  # Required for llama3 models
+     token=hf_token,         # can be None if not set
+     trust_remote_code=True  # required for Qwen
  )
 
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -22,19 +22,14 @@ model = AutoModelForCausalLM.from_pretrained(
      model_name,
      token=hf_token,
      trust_remote_code=True,
-     torch_dtype=torch.float16 if device=="cuda" else torch.float32,
-     device_map="auto" if device=="cuda" else None
+     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+     device_map="auto" if device == "cuda" else None
  ).to(device)
 
-
-
-
  # --- Define llm generation function ---
  def llm(prompt, max_new_tokens=1000, temperature=0.3, do_sample=True):
-     # Wrap the prompt into proper Llama 3 chat format
-     system_prompt = "[INST] " + prompt + " [/INST]"
-
-     inputs = tokenizer(system_prompt, return_tensors="pt").to(model.device)
+     # Qwen does not require special prompt wrapping like [INST] ... [/INST]
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
      output = model.generate(
          **inputs,
@@ -48,9 +43,6 @@ def llm(prompt, max_new_tokens=1000, temperature=0.3, do_sample=True):
 
      generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
 
-     # Remove prompt part (optional cleanup)
-     generated_text = generated_text.replace(system_prompt, "").strip()
-
      return [{"generated_text": generated_text}]
 
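
For quick verification after this change, a minimal usage sketch of the updated llm() helper follows. It assumes the rest of app.py (the transformers imports, the loaded tokenizer and model, and any Gradio wiring) stays as in the diff above; the prompt string and the smaller max_new_tokens value are illustrative only, not part of the commit.

# Illustrative call of llm() as defined after this commit (not part of app.py itself).
# Assumes torch, transformers, and the Qwen/Qwen1.5-0.5B weights are available.
result = llm(
    "Write a one-sentence greeting for the Space UI.",
    max_new_tokens=64,   # shorter than the 1000-token default, just for a smoke test
    temperature=0.3,
    do_sample=True,
)
print(result[0]["generated_text"])  # llm() returns [{"generated_text": ...}]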