# hb-llm / app.py
import time  # measures per-request response time

import ollama
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

# Pull the quantized GGUF build from the Hugging Face Hub at startup so the
# first chat request does not block on a model download.
# model_name = 'llama3.2'
model_name = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M'
pull_status = ollama.pull(model_name)  # returns a pull-status object, not the model itself
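
# If download progress is wanted at startup, ollama.pull can stream progress
# events instead (a sketch, not part of the original app):
#   for part in ollama.pull(model_name, stream=True):
#       print(part.get('status'), part.get('completed'), part.get('total'))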

class ChatRequest(BaseModel):
    message: str
    system_message: str = "You are a friendly Chatbot."
    max_tokens: int = 512  # forwarded to Ollama as num_predict
    temperature: float = 0.7
    top_p: float = 0.95

class ChatResponse(BaseModel):
    model_status: str     # status string from the startup pull, e.g. "success"
    response: str         # the assistant's reply text
    response_time: float  # wall-clock seconds spent handling the request
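
# Illustrative request/response shapes for the endpoint below (values are
# examples, not taken from the original file):
#   POST /chat  {"message": "Tell me a joke.", "max_tokens": 128}
#   -> {"model_status": "success", "response": "...", "response_time": 0.42}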

@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    try:
        start_time = time.time()
        messages = [
            {'role': 'system', 'content': request.system_message},
            {'role': 'user', 'content': request.message},
        ]
        # Forward the request's sampling parameters to Ollama; num_predict is
        # Ollama's name for the maximum number of new tokens.
        response = ollama.chat(
            model=model_name,
            messages=messages,
            options={
                'num_predict': request.max_tokens,
                'temperature': request.temperature,
                'top_p': request.top_p,
            },
        )
        # Return only the assistant's text, not str() of the whole response object.
        reply = response['message']['content']
        response_time = time.time() - start_time
        return {"model_status": pull_status.status, "response": reply, "response_time": response_time}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
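
# A minimal local entry point (a sketch, not in the original file; it assumes
# uvicorn is installed and uses port 7860, the Hugging Face Spaces default,
# though the Space may instead launch the app via its container command):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request against a running instance (illustrative):
#   curl -X POST http://localhost:7860/chat \
#     -H "Content-Type: application/json" \
#     -d '{"message": "Tell me a joke."}'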