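"""FastAPI backend for a retrieval-augmented chat service.

Transcripts from the Lhumpal/youtube-hunting-beast-transcripts dataset are chunked,
embedded with sentence-transformers/all-MiniLM-L6-v2, and indexed in a FAISS vector
store. Each /chat request is rephrased for retrieval, grounded with the retrieved
chunks, and answered by Gemini (or by a Hugging Face Inference model when
model_choice is "HF").
"""
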
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import InferenceClient
import os
import textwrap
from google import genai
from google.genai.types import GenerateContentConfig
from datasets import load_dataset
from huggingface_hub import login
from typing import List, Dict, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import numpy as np

app = FastAPI()

# API credentials come from environment variables (e.g. Space secrets)
hf_token = os.environ.get("HF_TOKEN")
google_api_key = os.environ.get("GOOGLE_API_KEY")

# Only log in to the Hugging Face Hub if a token is configured
if hf_token:
    login(token=hf_token)

def chunk_text(text, chunk_size=250, chunk_overlap=0):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n\n", "\n", "."]
    )
    chunks = splitter.split_text(text)
    return chunks

# Embedding model used for both indexing and querying
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")

# Build a FAISS index over the text chunks
def build_faiss_vectorstore(chunks):
    vectorstore = FAISS.from_texts(chunks, embedding_model)
    num_documents = len(vectorstore.index_to_docstore_id)
    print(f"Total number of documents: {num_documents}")
    return vectorstore

# Function to retrieve similar text
def retrieve(query, vectorstore, top_k=8):
    docs_and_scores = vectorstore.similarity_search_with_score(query=query, k=top_k)

    # similarity_search_with_score returns L2 distances (lower = more similar),
    # so keep only chunks within the distance threshold
    filtered_docs_and_scores = [(doc.page_content, float(score)) for doc, score in docs_and_scores if float(score) <= 0.7]

    # Separate docs from the (doc, score) tuples
    docs_content = [doc for doc, _ in filtered_docs_and_scores]

    return docs_content, filtered_docs_and_scores
    
class ChatRequest(BaseModel):
    message: str = ""
    system_message: str = ""
    temperature: float = 1.5
    max_output_tokens: int = 200
    top_p: float = 0.95  # nucleus-sampling parameter used by the Hugging Face chat path
    chat_history: List[Dict[str, Any]] = []
    model_choice: str = "google"
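    # Notes: chat_history entries are expected in the Gemini "contents" format,
    # e.g. {"role": "user", "parts": [{"text": "..."}]}; model_choice selects the
    # "google" (Gemini) path or the "HF" (Hugging Face Inference) path below.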

# grab dataset
dataset = load_dataset("Lhumpal/youtube-hunting-beast-transcripts", data_files={"concise": "concise/*", "raw": "raw/*", "facts": "facts/*"})
text = dataset["facts"]["text"]
text_string = "".join(text)

# Chunk and index the documents
chunks = chunk_text(text_string, chunk_size=500)
# Build the vectorstore
vectorstore = build_faiss_vectorstore(chunks)

@app.post("/chat")
async def chat(request: ChatRequest):
    try:
        if request.model_choice == "google":
            if not google_api_key:
                raise ValueError("GOOGLE_API_KEY environment variable not set. Please add it as a secret in your Hugging Face Space.")
            client = genai.Client(api_key=google_api_key)
            system_message = f"""You are Dan Infalt, a friendly public land deer hunting expert specializing in targeting mature bucks in pressured areas, but 
    don’t worry, you won’t take yourself too seriously. You respond in a conversational manner but stay direct, and you mix in dry humor every once in a while.
    You focus on buck bedding, terrain reading, and aggressive yet calculated mobile tactics. Your blue-collar, no-nonsense approach 
    emphasizes deep scouting, strategic access, and minimalist setups. Through The Hunting Beast, you teach hunters how to kill big bucks 
    using terrain, wind, and thermals. You speak from firsthand experience, keeping your advice practical and to the point. Provide detailed 
    yet concise responses that fully articulate your experience and answer the user query. Please keep your responses under {request.max_output_tokens} words."""

            # ------------ summarize chat history ------------
            summary_thresh = 10
            if len(request.chat_history) > summary_thresh:
                summarize_prompt = f"""Please summarize the following chat history concisely, focusing on the key points and main topics discussed. Avoid 
                unnecessary details and provide a clear, straightforward summary. {request.chat_history[:-summary_thresh]}""" # summarize everything except the last summary_thresh messages
                summary_response = client.models.generate_content(
                    model="gemini-2.0-flash",
                    contents=summarize_prompt,
                    config=GenerateContentConfig(
                        system_instruction=["You are a helpful assistant who is an expert at summarization."],
                        max_output_tokens=250,
                        temperature=0.5
                    ),
                )
                request.chat_history = request.chat_history[-(summary_thresh+2):] # keep only the most recent messages (threshold plus a two-message buffer)
                request.chat_history.insert(1, 
                    {"role": "user", 
                     "parts": [{"text": f"Here is a summary of this conversation so far: {summary_response.text}"}]})

            # ------------ rephrase user question ------------
            rephrase_prompt = f"""Given the user question and the chat history, rewrite the user question to improve clarity, specificity, and retrieval accuracy while 
            maintaining its original intent within the given chat.

            - Read the chat history 
            - Expand only where necessary to remove vagueness.  
            - Keep the question natural and concise.  
            - Avoid adding excessive detail or unrelated context.  
            - Ensure the enhanced question remains true to what the user is asking.  
            
            Example Enhancements:  
            
            User: "Does camo really matter?"  
            Refined: "How important is camouflage for a hunter’s success, and how does it compare to other factors like movement and scent control?"  
            
            User: "How does attitude affect success?"  
            Refined: "How do mindset factors like patience, confidence, and adaptability influence a hunter’s success?"  
            
            User: "What does it mean if I see does while buck hunting?"  
            Refined: "If I see does while hunting for a buck, what does that indicate about deer movement and buck activity?"  

            Chat history:
            {request.chat_history}
            
            Now, given the chat history, refine the following user question to improve clarity, specificity, and retrieval accuracy while maintaining its original 
            intent within the given chat:  
            {request.message}
            """
            
            rephrase_response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=rephrase_prompt,
                config=GenerateContentConfig(
                    system_instruction=["You are a public land deer hunting expert specializing in targeting mature bucks in pressured areas. Your job is to use your deer hunting knowledge to enhance user questions for better retrieval in a Retrieval-Augmented Generation (RAG) system."],
                    max_output_tokens=250,
                    temperature=0.5
                ),
            )
            
            #  ------------ Retrieve relevant text ------------
            rephrase_response = rephrase_response.text
            docs, filtered_docs_and_scores = retrieve(rephrase_response, vectorstore, top_k=10)
            docs = "\n\n".join(docs)

            # ------------ Retrieval-Augmented Generation ------------
            rag_prompt = f"""Use the following information to answer the user's query. You do not have to use all the information, just the pieces that directly 
            help answer the query most accurately. Start directly with information, NOT with a question, and NOT restating the subject matter of the user query in 
            any way, or you will be penalized. Respond in a conversational manner. 
            
            Here is an example of the desired style and tone, showing both a good response and a bad response. Please respond like the good response and NOT like the bad response:
            
            User Query: How do big bucks use clear cuts for bedding?

            Bad Response: Alright, so you want to know how big bucks use clear cuts for bedding, eh? Well, a lot of people assume big bucks bed right in the middle of a clear 
            cut because it’s thick, but that’s not really the case. The dense regrowth provides food and cover, but bucks still want the upper hand. 
            
            Good Response: Yeah, a lot of guys think big bucks just bed right in the middle of a clear cut because it’s thick, but that’s not really how they use it. The 
            thick regrowth is great for food and cover, but those bucks still want an advantage. Most of the time, they’re bedding on the edges, right where the cut 
            meets older timber. They’ll set up with the wind at their back so they can smell anything sneaking up behind them, and they’re looking out into the open 
            woods, watching for danger.
            
            You have access to the following relevant information retrieved based on the user's query:

            {docs}

            Using the information above, answer the user's query as accurately as possible in the tone and style of the Good Response:

            User Query: {request.message}
            """
            
            # remove the unformatted user message
            del request.chat_history[-1]
            # add the user message with RAG data
            rag_prompt = textwrap.dedent(rag_prompt)
            request.chat_history.append({"role": "user", "parts": [{"text": rag_prompt}]})

            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=request.chat_history,
                config=GenerateContentConfig(
                    system_instruction=[system_message],
                    max_output_tokens=request.max_output_tokens,
                    temperature=request.temperature
                ),
            )

            # delete the prompt and put back the unformatted user message
            del request.chat_history[-1]
            request.chat_history.append({"role": "user", "parts": [{"text": request.message}]})

            return {"response": response.text, "dataset_str": text_string, "rephrase_response": rephrase_response, "docs": docs, "filtered_docs_and_scores": filtered_docs_and_scores, "history": request.chat_history, "RAG_prompt": rag_prompt, "chunks": chunks}

        if request.model_choice == "HF":
            if hf_token:
                client = InferenceClient("meta-llama/Llama-3.2-3B-Instruct", token=hf_token)
            else:
                raise ValueError("HF_TOKEN environment variable not set. Please add it as a secret in your Hugging Face Space.")
                
            messages = [
                {"role": "system", "content": request.system_message},
                {"role": "user", "content": request.message},
            ]
    
            response = client.chat_completion(
                messages=messages,
                max_tokens=request.max_output_tokens,
                temperature=request.temperature,
                top_p=request.top_p,
            )
    
            return {"response": response.choices[0].message.content}
            
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
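
# Example request (illustrative; adjust host/port to your deployment):
# curl -X POST http://localhost:8000/chat \
#   -H "Content-Type: application/json" \
#   -d '{"message": "How do big bucks use clear cuts?",
#        "model_choice": "google",
#        "max_output_tokens": 200,
#        "chat_history": [{"role": "user", "parts": [{"text": "How do big bucks use clear cuts?"}]}]}'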