from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import ollama
import time  # Used to measure per-request response time

app = FastAPI()

# model_name = 'llama3.2'
model_name = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M'

# Pull the model once at startup. Note: ollama.pull() returns a pull-status
# object (e.g. status='success'), not the model itself, so name it accordingly.
pull_status = ollama.pull(model_name)


class ChatRequest(BaseModel):
    message: str
    system_message: str = "You are a friendly Chatbot."
    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.95


class ChatResponse(BaseModel):
    model_status: str
    response: str
    response_time: float


@app.post("/chat", response_model=ChatResponse)
def chat(request: ChatRequest):
    # Plain 'def' (not 'async def'): ollama.chat() is a blocking call, and
    # FastAPI runs sync endpoints in a threadpool instead of stalling the
    # event loop.
    try:
        start_time = time.time()
        messages = [
            {'role': 'system', 'content': request.system_message},
            {'role': 'user', 'content': request.message},
        ]
        # Forward the sampling parameters from the request; Ollama exposes
        # the max-token limit as 'num_predict'.
        response = ollama.chat(
            model=model_name,
            messages=messages,
            options={
                'num_predict': request.max_tokens,
                'temperature': request.temperature,
                'top_p': request.top_p,
            },
        )
        response_time = time.time() - start_time
        return {
            "model_status": pull_status.status,
            "response_time": response_time,
            # Return only the assistant's text, not the stringified
            # response object with all its metadata.
            "response": response['message']['content'],
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
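
# --- Example client (run as a separate script) ---
# A minimal smoke test, assuming the app above is saved as main.py and served
# with `uvicorn main:app` on the default http://127.0.0.1:8000 — both are
# assumptions, since neither the filename nor the run command appears above.

import requests  # third-party HTTP client (pip install requests)

resp = requests.post(
    "http://127.0.0.1:8000/chat",
    json={
        "message": "What is the capital of France?",
        "temperature": 0.2,  # unset fields fall back to the ChatRequest defaults
    },
    timeout=120,  # the first call can be slow while Ollama loads the model
)
resp.raise_for_status()
body = resp.json()
print(body["model_status"], f"{body['response_time']:.2f}s")
print(body["response"])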