import os
import json # For debug printing
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
try:
    # Try importing llama-cpp-python for GGUF support
    from llama_cpp import Llama
    LLAMA_CPP_AVAILABLE = True
except ImportError:
    print("⚠️ WARNING: llama-cpp-python library not found. Local GGUF execution will not be available.")
    print("   To enable local GGUF, run: pip install llama-cpp-python")
    Llama = None  # Define as None if import fails
    LLAMA_CPP_AVAILABLE = False
# --- Configuration ---
# HF Repo ID for the standard model (used in Space and for tokenizer)
HF_CHECKPOINT = "ibm-granite/granite-3.3-2b-instruct"
# GGUF Settings for Local Execution (Using llama-cpp-python)
GGUF_REPO_ID = "ibm-granite/granite-3.3-2b-instruct-gguf" # Official IBM v3.3 GGUF repo
GGUF_FILENAME = "granite-3.3-2b-instruct-Q2_K.gguf" # Smallest Q2_K quantization
# GGUF_FILENAME = "granite-3.3-2b-instruct-Q4_K_M.gguf" # Fallback if Q2_K fails
# Template Filename (Use v3.3 template for both paths now)
TEMPLATE_FILENAME = "granite3.3_2b_chat_template.jinja"
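# NOTE: apply_template_from_file() resolves this filename relative to the directory
# containing this script, so the .jinja file must sit next to it.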
# --- End Configuration ---
# Detect Space environment
env = os.environ
is_space = env.get("SPACE_ID") is not None
print(f"RUNNING IN SPACE? {is_space}")
# Device setup (primarily for HF model in Space)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# --- Load model function ---
def load_model():
    primary_checkpoint = HF_CHECKPOINT
    model_name_display = primary_checkpoint

    # --- Function to load and apply template ---
    def apply_template_from_file(tokenizer, template_filename):
        applied_template = False
        try:
            print(f"Attempting to load chat template from: {template_filename}")
            script_dir = os.path.dirname(os.path.abspath(__file__))
            template_path = os.path.join(script_dir, template_filename)
            if not os.path.exists(template_path):
                print(f"⚠️ WARNING: Template file not found at: {template_path}")
                return False
            with open(template_path, "r", encoding="utf-8") as f:
                custom_chat_template_content = f.read()
            if hasattr(tokenizer, 'chat_template'):
                tokenizer.chat_template = custom_chat_template_content
                applied_template = True
                print(f"✅ Loaded and applied chat template from: {template_filename}")
            else:
                print("⚠️ WARNING: Tokenizer object does not support setting 'chat_template'.")
        except Exception as e:
            print(f"❌ ERROR reading or applying template file '{template_filename}': {e}")
        if not applied_template:
            print("Falling back to tokenizer's default built-in template (if any).")
        print("--- Final Chat Template Being Used (by HF Tokenizer) ---")
        print(tokenizer.chat_template if hasattr(tokenizer, 'chat_template') and tokenizer.chat_template else "No template found or template empty/default.")
        print("-------------------------------------------------------")
        return applied_template
    # --- End function ---
    # --- Load Tokenizer (Common for both paths now) ---
    try:
        print(f"Loading HF Tokenizer: {primary_checkpoint}")
        tokenizer = AutoTokenizer.from_pretrained(primary_checkpoint, use_fast=True)
        print("✅ Loaded HF Tokenizer.")
        # Apply the v3.3 template UNCONDITIONALLY
        apply_template_from_file(tokenizer, TEMPLATE_FILENAME)
    except Exception as e:
        print(f"❌ Failed to load tokenizer {primary_checkpoint}: {e}")
        raise RuntimeError("Failed to load the necessary tokenizer.") from e
    # --- End Tokenizer Loading ---
    if is_space:
        print(f"🚀 Running in Space. Loading HF model: {primary_checkpoint}")
        try:
            # Load HF Model for Space
            model = AutoModelForCausalLM.from_pretrained(
                primary_checkpoint,
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
                device_map="auto"
            )
            print(f"✅ Loaded HF {primary_checkpoint}")
            model_name_display = primary_checkpoint
            # Tokenizer already loaded and template applied
            return tokenizer, model, model_name_display
        except Exception as e:
            print(f"❌ HF Primary load failed: {e}")
            raise RuntimeError(f"Failed to load primary HF model {primary_checkpoint} in Space.") from e
    else:  # Running Locally - Load GGUF using llama-cpp-python
        print("💻 Running Locally. Attempting GGUF setup via llama-cpp-python.")
        if not LLAMA_CPP_AVAILABLE:
            raise RuntimeError("llama-cpp-python library is required but not installed/found.")
        print(f"   GGUF Repo ID: {GGUF_REPO_ID}")
        print(f"   GGUF Filename: {GGUF_FILENAME}")
        try:
            # Load GGUF Model using llama-cpp-python
            print("Attempting to load GGUF model using Llama.from_pretrained...")
            model = Llama.from_pretrained(
                repo_id=GGUF_REPO_ID,
                filename=GGUF_FILENAME,
                n_gpu_layers=0,  # Force CPU execution
                verbose=True,
                n_ctx=4096  # Increased context window
            )
            print(f"✅ Loaded GGUF model {GGUF_FILENAME} using llama-cpp-python")
            model_name_display = f"GGUF (llama-cpp): {GGUF_FILENAME}"
            # Return tokenizer loaded earlier and the Llama model object
            return tokenizer, model, model_name_display
        except Exception as e:
            print(f"❌ Local GGUF load failed using llama-cpp-python: {e}")
            if "Not Found" in str(e) or "404" in str(e):
                print(f"   File not found. Please ensure Repo ID '{GGUF_REPO_ID}' and Filename '{GGUF_FILENAME}' are correct and the file exists on Hugging Face Hub.")
            elif "invalid GGUF file" in str(e) or "failed to load model" in str(e):
                print(f"   Model loading failed. The GGUF file '{GGUF_FILENAME}' might be corrupted, incompatible with this version of llama-cpp-python, or the quantization level is unsupported.")
                print("   Consider trying a different quantization like 'Q4_K_M'.")
            # Add other potential error checks based on llama-cpp-python exceptions
            raise RuntimeError(f"Failed to load local GGUF model '{GGUF_FILENAME}' using llama-cpp-python.") from e
# --- Call load_model ---
try:
    # Tokenizer should now be loaded for both paths
    tokenizer, model, model_name = load_model()
    if tokenizer is None:  # Should not happen now
        raise RuntimeError("Tokenizer failed to load.")
except Exception as load_err:
    print(f"🚨 CRITICAL ERROR DURING MODEL LOADING: {load_err}")
    # For UI testing, you might want to create dummy objects instead of raising
    # tokenizer = None
    # model = None
    # model_name = "LOAD FAILED"
    raise  # Re-raise for now
# --- Load hotel docs function ---
def load_hotel_docs(hotel_id):
    knowledge_dir = "knowledge"
    path = os.path.join(knowledge_dir, f"{hotel_id}.txt")
    if not os.path.exists(path):
        print(f"⚠️ Knowledge file not found: {path}")
        return []
    try:
        with open(path, encoding="utf-8") as f:
            content = f.read().strip()
        print(f"DEBUG [load_hotel_docs]: Read {len(content)} chars from {path}.")
        if not content:
            print(f"⚠️ WARNING [load_hotel_docs]: File {path} is empty.")
            return []
        return [(hotel_id, content)]  # Return list with tuple: [(id, content)]
    except Exception as e:
        print(f"❌ Error reading knowledge file {path}: {e}")
        return []
# --- Dynamic Hotel ID Detection ---
available_hotels = []
knowledge_dir = "knowledge"
if os.path.exists(knowledge_dir):
    print("🔍 Scanning for available hotels...")
    files = os.listdir(knowledge_dir)
    potential_ids = set()
    for f in files:
        if f.endswith(".txt") and not f.endswith("-system.txt"):
            potential_ids.add(f[:-4])  # Add ID without .txt
    for hotel_id in sorted(list(potential_ids)):
        doc_file = os.path.join(knowledge_dir, f"{hotel_id}.txt")
        sys_file = os.path.join(knowledge_dir, f"{hotel_id}-system.txt")
        if os.path.exists(doc_file) and os.path.exists(sys_file):
            available_hotels.append(hotel_id)
            print(f"   ✅ Found valid hotel pair: {hotel_id}")
        else:
            print(f"   ⚠️ Skipping '{hotel_id}': Missing either '{hotel_id}.txt' or '{hotel_id}-system.txt'")
    print("Hotel scan complete.")
else:
    print(f"⚠️ Knowledge directory '{knowledge_dir}' not found. No hotels loaded.")
# --- End Hotel Scanning ---
# --- Chat function ---
def chat(message, history, hotel_id):
    if history is None: history = []
    # Convert Gradio history
    history_hf_format = []
    for user_msg, assistant_msg in history:
        if user_msg: history_hf_format.append({"role": "user", "content": user_msg})
        if assistant_msg: history_hf_format.append({"role": "assistant", "content": assistant_msg})
    current_turn = {"role": "user", "content": message}
    ui_history = history + [[message, None]]
    yield ui_history, ""
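    # chat() is a generator: this first yield pushes the user's message into the
    # Chatbot immediately (with a placeholder None reply); the final yield at the
    # end of the function fills in the assistant's response.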
    response = "Sorry, an error occurred."
    input_text = ""  # Initialize input_text
    try:
        # --- System Prompt Loading ---
        default_system_prompt = "You are a helpful hotel assistant."
        system_prompt_filename = f"{hotel_id}-system.txt"
        system_prompt_path = os.path.join("knowledge", system_prompt_filename)
        system_prompt_content = default_system_prompt
        if os.path.exists(system_prompt_path):
            try:
                with open(system_prompt_path, "r", encoding="utf-8") as f: loaded_prompt = f.read().strip()
                if loaded_prompt: system_prompt_content = loaded_prompt
                else: print(f"⚠️ System prompt file '{system_prompt_path}' is empty. Using default.")
            except Exception as e: print(f"❌ Error reading system prompt file '{system_prompt_path}': {e}. Using default.")
        else: print(f"⚠️ System prompt file not found: '{system_prompt_path}'. Using default.")
        # --- Document Loading ---
        hotel_docs_list = load_hotel_docs(hotel_id)
        # --- Message List Construction (Base: System, History, User) ---
        messages = [{"role": "system", "content": system_prompt_content}]
        messages.extend(history_hf_format)
        messages.append(current_turn)
        print(f"DEBUG [chat]: Base messages list:\n{json.dumps(messages, indent=2)}")
        # --- Prepare documents kwarg (Used by apply_chat_template in BOTH paths) ---
        documents_for_kwarg = []
        if hotel_docs_list:
            # Use 'doc_id' and 'text' keys for v3.3 template
            documents_for_kwarg = [{"doc_id": doc_id, "text": doc_content} for doc_id, doc_content in hotel_docs_list]
        print(f"DEBUG [chat]: Preparing documents kwarg: {len(documents_for_kwarg)} docs")
        # --- Template Application (Now UNCONDITIONAL - uses tokenizer) ---
        input_text = tokenizer.apply_chat_template(
            messages,
            documents=documents_for_kwarg,  # Use kwarg for v3.3 template
            tokenize=False,
            add_generation_prompt=True
        )
        # --- Debug print of the final prompt string ---
        print("\n" + "="*40 + " FINAL PROMPT STRING " + "="*40)
        print(input_text)
        print("="*99 + "\n")
        # --- End debug print ---
    except Exception as e:
        print(f"❌ Error during prompt preparation: {e}")
        ui_history[-1][1] = "Sorry, an error occurred while preparing the prompt."
        yield ui_history, ""
        return
    # --- Generation ---
    try:
        if is_space:
            # --- HF Space Generation (model.generate) ---
            print("🚀 Generating response using HF model...")
            inputs = tokenizer(input_text, return_tensors="pt").to(device)
            input_length = inputs.input_ids.shape[1]
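            # input_length is used after generation to slice off the prompt tokens,
            # so only the newly generated tokens are decoded into the response.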
            with torch.no_grad():
                outputs = model.generate(
                    inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_new_tokens=1024,
                    do_sample=False,
                    eos_token_id=tokenizer.eos_token_id
                )
            new_token_ids = outputs[0][input_length:]
            response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
            print("✅ HF Generation complete.")
        else:  # Local GGUF Generation using llama-cpp-python's lower-level call
            print("💻 Generating response using GGUF model (llama-cpp-python)...")
            # --- Use model(prompt_string, ...) ---
            output = model(  # Call the Llama object directly with the formatted string
                input_text,
                max_tokens=512,  # Max tokens to generate
                stop=["<|end_of_text|>"],  # Use model's stop token(s)
                temperature=0.1,
                # echo=False  # Usually default, don't echo the prompt
            )
            # Extract response content
            if output and 'choices' in output and output['choices'] and 'text' in output['choices'][0]:
                response = output['choices'][0]['text'].strip()
            else:
                print(f"⚠️ Unexpected output format from model call: {output}")
                response = "Sorry, received an unexpected response structure."
            # --- End model(prompt_string, ...) ---
            print("✅ GGUF Generation complete (llama-cpp-python).")
        if not response:
            response = "Sorry, I encountered an issue generating a response (empty)."
    except Exception as e:
        print(f"❌ Error during model generation or processing: {e}")
        response = f"Sorry, an error occurred: {e}"
    print(f"DEBUG: Final response variable before UI append = {repr(response)}")
    ui_history[-1][1] = response
    yield ui_history, ""
# --- Gradio UI ---
with gr.Blocks() as demo:
    with gr.Column(variant="panel"):
        gr.Markdown("### 🏨 Multi-Hotel Chatbot Demo")
        gr.Markdown(f"**Running:** {model_name}")  # Displays HF name or GGUF info
        hotel_selector = gr.Dropdown(
            choices=available_hotels,
            label="Hotel",
            value=available_hotels[0] if available_hotels else None,
            interactive=bool(available_hotels)
        )
        with gr.Row():
            chatbot = gr.Chatbot(label="Chat History", height=500)
        msg = gr.Textbox(
            show_label=False,
            placeholder="Ask about the hotel..."
        )
        clear_btn = gr.Button("Clear")
        clear_btn.click(lambda: ([], ""), None, [chatbot, msg])
        msg.submit(
            fn=chat,
            inputs=[msg, chatbot, hotel_selector],
            outputs=[chatbot, msg]
        )
        if is_space:
            gr.Markdown("⚠️ Pause the Space when done to avoid charges.")
# Enable streaming queue
demo.queue(default_concurrency_limit=2, max_size=32)
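# Note: queuing is what lets Gradio stream each yield from the chat() generator
# to the Chatbot component as it arrives, instead of waiting for the function to finish.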
if __name__ == "__main__":
    print("Launching Gradio Interface...")
    demo.launch()
    print("Gradio Interface closed.")