import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
# Directory where your fine-tuned Phi-2 model and associated files are stored.
model_dir = "./phi2-qlora-finetuned"
# Directory to store offloaded model parts (for large models).
offload_dir = "./offload"
# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
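# Phi-2's tokenizer ships without a pad token (an assumption based on the public
# microsoft/phi-2 checkpoint); reusing EOS keeps generate() from warning about it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token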
# Load the base model with offloading support.
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_dir,
#     device_map="auto",           # Automatically use available devices (GPU/CPU).
#     offload_folder=offload_dir,  # Directory to offload layers (for larger models).
# )
# CPU-only loading (the active configuration).
base_model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="cpu",           # Force CPU usage.
    torch_dtype=torch.float32,  # Use float32 for CPU inference.
    trust_remote_code=True,
    offload_folder=offload_dir  # Directory to offload layers (for larger models).
)
# Load the adapter (PEFT) weights.
model = PeftModel.from_pretrained(base_model, model_dir)
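# Put the model in inference mode.
model.eval()

# Optional speed-up, assuming the adapter really is a LoRA/QLoRA adapter (as the
# directory name suggests): merging it into the base weights removes the per-layer
# adapter overhead during inference.
# model = model.merge_and_unload()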
def generate_response(prompt, max_new_tokens=200, temperature=0.7):
    """
    Generate a response from the fine-tuned Phi-2 model given a prompt.
    """
    # Tokenize the prompt and move tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Generate output text using sampling. Gradio sliders pass floats, so cast
    # the token budget back to an int before handing it to generate().
    outputs = model.generate(
        **inputs,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=temperature,
    )
    # Decode the generated tokens and return the response.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
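# Optional sanity check before the UI starts (commented out because a single
# Phi-2 forward pass on CPU can take a noticeable amount of time):
# print(generate_response("Hello, who are you?", max_new_tokens=30))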
# Create a Gradio interface with example prompts.
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=4, label="Input Prompt"),
        gr.Slider(50, 500, value=200, step=1, label="Max New Tokens"),
        # Minimum of 0.1 rather than 0.0: do_sample=True requires a strictly
        # positive temperature, and 0.0 would crash generation.
        gr.Slider(0.1, 1.0, value=0.7, label="Temperature"),
    ],
    outputs=gr.Textbox(label="Response"),
    title="Phi-2 Fine-tuned Chat",
    description="A Hugging Face Space app serving the fine-tuned Phi-2 model trained on OpenAssistant/oasst1 data.",
    examples=[
        ["Hello, how are you today?", 150, 0.7],
        ["Translate this sentence from English to French: I love programming.", 200, 0.8],
        ["Tell me a joke about artificial intelligence.", 180, 0.6],
        ["What is the value of 2 + 2?", 150, 0.9],
        ["Explain what economics is and how it affects an individual's finances.", 250, 0.7],
        ["Who is Randy Orton?", 200, 0.8],
    ],
)
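# Queue requests so long CPU generations don't time out in the browser. On
# Gradio 4.x queuing is enabled by default, so this line is a no-op there; on
# 3.x it is needed (an assumption about which Gradio version the Space pins).
demo.queue()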
if __name__ == "__main__":
    demo.launch()