import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Directory where your fine-tuned Phi-2 model and associated files are stored.
model_dir = "./phi2-qlora-finetuned"

# Directory to store offloaded model parts (for large models).
offload_dir = "./offload"

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the base model on CPU in float32.
# NOTE: to let the library place layers on the available devices instead
# (GPU/CPU), drop `torch_dtype` and use `device_map="auto"` together with the
# same `offload_folder`.
base_model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="cpu",  # Force CPU usage.
    torch_dtype=torch.float32,  # Use float32 for CPU.
    trust_remote_code=True,
    offload_folder=offload_dir,  # Directory to offload layers (for larger models).
)

# Load the adapter (PEFT) weights on top of the base model.
model = PeftModel.from_pretrained(base_model, model_dir)


def generate_response(prompt, max_new_tokens=200, temperature=0.7):
    """Generate a response from the fine-tuned Phi-2 model given a prompt.

    Args:
        prompt: Text to feed to the model. An empty or whitespace-only
            prompt yields an empty string without calling the model.
        max_new_tokens: Upper bound on the number of generated tokens.
            Coerced to an integer of at least 1, because UI sliders may
            deliver floats.
        temperature: Sampling temperature. Values greater than zero enable
            sampling; zero (or a negative value / None) falls back to greedy
            decoding, since sampling requires a strictly positive temperature.

    Returns:
        The decoded output sequence with special tokens removed. The
        sequence returned by ``model.generate`` is decoded as-is.
    """
    # Nothing to tokenize: avoid handing an empty tensor to generate().
    if not prompt or not prompt.strip():
        return ""

    generation_kwargs = {"max_new_tokens": max(1, int(max_new_tokens))}
    if temperature is not None and temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = float(temperature)
    else:
        # The temperature slider allows 0.0, which is invalid for sampling;
        # treat it as a request for deterministic (greedy) decoding.
        generation_kwargs["do_sample"] = False

    # Tokenize the prompt and move tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(**inputs, **generation_kwargs)

    # Decode the generated tokens and return the response.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Create a Gradio interface with example prompts.
# Input widgets, in the order of generate_response's parameters.
prompt_box = gr.Textbox(lines=4, label="Input Prompt")
token_slider = gr.Slider(minimum=50, maximum=500, value=200, label="Max New Tokens")
temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")

# Each example row is [prompt, max new tokens, temperature].
example_rows = [
    ["Hello, how are you today?", 150, 0.7],
    ["Translate this sentence from English to French: I love programming.", 200, 0.8],
    ["Tell me a joke about artificial intelligence.", 180, 0.6],
    ["what is value of 2 + 2: ", 150, 0.9],
    ["Explain what about economics and how does it impact the individuals financial sector: ", 250, 0.7],
    ["Who is Randy orton?", 200, 0.8],
]

# Web UI that wires the widgets above to generate_response.
demo = gr.Interface(
    fn=generate_response,
    inputs=[prompt_box, token_slider, temperature_slider],
    outputs=gr.Textbox(label="Response"),
    title="Phi-2 Fine-tuned Chat",
    description="A Hugging Face Space app serving the fine-tuned Phi-2 model trained on OpenAssistant/oasst1 data.",
    examples=example_rows,
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()