File size: 1,288 Bytes
7e27297
 
 
 
4283479
f49dbfe
7e27297
 
 
93a80e2
5221913
9be17b1
48bb8bf
3bfa766
7e27297
 
 
 
 
 
 
 
a4bbc50
873eb03
e412a0c
7e27297
 
 
 
e412a0c
f49dbfe
f1b182f
9be17b1
f1b182f
 
 
 
873eb03
abcebe6
e412a0c
 
7e27297
873eb03
1080421
7e27297
f49dbfe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import InferenceClient
import os
import ollama
import time  # Import time module for measuring response time

app = FastAPI()

# model_name = 'llama3.2'
model_name = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M'

model = ollama.pull(model_name)

class ChatRequest(BaseModel):
    message: str
    system_message: str = "You are a friendly Chatbot."
    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.95

class ChatResponse(BaseModel):
    model_status: str
    response: str
    response_time: float

@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    try:
        start_time = time.time()

        messages = [
            {'role': 'system', 'content': request.system_message},
            {'role': 'user', 'content': request.message}
        ]

        response = ollama.chat(model=model_name, messages=messages)
        response = str(response)

        end_time = time.time()
        response_time = end_time - start_time

        return {"model_status": model.status, "response_time": response_time, "response": response}
        
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))