Commit 96d818b
Parent(s): b4066c5

Updated Dockerfile so it actually works. Modified config file to include/exclude models and change relevant options as needed. Fixed thumbs up/down feedback.
Files changed:
- Dockerfile +57 -20
- app.py +33 -22
- chatfuncs/aws_functions.py +1 -3
- chatfuncs/chatfuncs.py +27 -21
- chatfuncs/config.py +19 -4
- chatfuncs/model_load.py +3 -3
- requirements.txt +3 -3
- requirements_aws.txt +27 -0
- requirements_gpu.txt +3 -3
Dockerfile
CHANGED
@@ -1,44 +1,81 @@
-FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
+FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
 
-RUN apt-get update \
+RUN apt-get update && \
+    apt-get install -y \
     g++ \
     make \
     cmake \
+    pkg-config \
     unzip \
-    libcurl4-openssl-dev \
+    libcurl4-openssl-dev \
+    build-essential \
+    libopenblas-dev \
+    git && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Upgrade pip
+RUN python3 -m pip install --upgrade pip
+
+# Optional: CMake args for BLAS for llama-cpp-python installation
+ENV CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
 
 WORKDIR /src
 
-COPY
+COPY requirements_aws.txt .
 
-RUN pip install --
+RUN pip install torch==2.5.1+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
+    && pip install --no-cache-dir --target=/install sentence-transformers==4.1.0 --no-deps \
+    && pip install --no-cache-dir --target=/install span-marker==1.7.0 --no-deps \
+    && pip install --no-cache-dir --target=/install langchain-huggingface==0.1.2 --no-deps \
+    && pip install --no-cache-dir --target=/install keybert==0.9.0 --no-deps \
+    && pip install --no-cache-dir --target=/install -r requirements_aws.txt
+
+# Stage 2: Final runtime image
+FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
+
+RUN apt-get update && \
+    apt-get install -y \
+    libopenblas0 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
+
+# Create required directories
+RUN mkdir -p /home/user/app/{output,input,tld,logs,usage,feedback,config} \
+    && chown -R user:user /home/user/app
+
+# Copy installed packages from builder stage
+COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
+
 # Switch to the "user" user
 USER user
+
 # Set home to the user's home directory
-ENV
+ENV APP_HOME=/home/user
+
+ENV PATH=$APP_HOME/.local/bin:$PATH \
+    PYTHONPATH=$APP_HOME/app \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=7860 \
+    GRADIO_ANALYTICS_ENABLED=False \
+    TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
+    SYSTEM=spaces \
     LLAMA_CUBLAS=0
 
 # Set the working directory to the user's home directory
-WORKDIR $
+WORKDIR $APP_HOME/app
 
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
-COPY --chown=user . $
+COPY --chown=user . $APP_HOME/app
+
+# Ensure permissions are really user:user again after copying
+RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app
 
 CMD ["python", "app.py"]
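The builder stage installs the Python packages with pip's --target=/install, and the runtime stage copies that directory straight into site-packages, so nothing is compiled in the final image. A quick way to confirm the copy worked is an import smoke test run inside the final container. This is a hypothetical helper, not part of the commit, and the module list is an assumption based on the requirements files.

# check_imports.py - hypothetical smoke test (not part of this commit).
# Run inside the final image to confirm that packages installed in the
# builder stage with --target=/install resolve from site-packages.
import importlib

modules = ["torch", "llama_cpp", "sentence_transformers", "gradio", "faiss"]

for name in modules:
    try:
        mod = importlib.import_module(name)
        print("OK  ", name, getattr(mod, "__version__", ""))
    except ImportError as err:
        print("FAIL", name, err)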
app.py
CHANGED
@@ -4,7 +4,7 @@ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 import gradio as gr
 import pandas as pd
-from torch import float16
+from torch import float16, float32
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
@@ -15,7 +15,7 @@ from chatfuncs.ingest import embed_faiss_save_to_zip
 from chatfuncs.helper_functions import get_connection_params, reveal_feedback_buttons, wipe_logs
 from chatfuncs.aws_functions import upload_file_to_s3
 from chatfuncs.auth import authenticate_user
-from chatfuncs.config import FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, HOST_NAME, COGNITO_AUTH, INPUT_FOLDER, OUTPUT_FOLDER, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, DEFAULT_EMBEDDINGS_LOCATION, EMBEDDINGS_MODEL_NAME, DEFAULT_DATA_SOURCE, HF_TOKEN, LARGE_MODEL_REPO_ID, LARGE_MODEL_GGUF_FILE, LARGE_MODEL_NAME, SMALL_MODEL_NAME, SMALL_MODEL_REPO_ID, DEFAULT_DATA_SOURCE_NAME, DEFAULT_EXAMPLES, DEFAULT_MODEL_CHOICES
+from chatfuncs.config import FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, HOST_NAME, COGNITO_AUTH, INPUT_FOLDER, OUTPUT_FOLDER, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, DEFAULT_EMBEDDINGS_LOCATION, EMBEDDINGS_MODEL_NAME, DEFAULT_DATA_SOURCE, HF_TOKEN, LARGE_MODEL_REPO_ID, LARGE_MODEL_GGUF_FILE, LARGE_MODEL_NAME, SMALL_MODEL_NAME, SMALL_MODEL_REPO_ID, DEFAULT_DATA_SOURCE_NAME, DEFAULT_EXAMPLES, DEFAULT_MODEL_CHOICES, RUN_GEMINI_MODELS, LOAD_LARGE_MODEL
 from chatfuncs.model_load import torch_device, gpu_config, cpu_config, context_length
 import chatfuncs.chatfuncs as chatf
 import chatfuncs.ingest as ing
@@ -94,17 +94,17 @@ def create_hf_model(model_name:str, hf_token=HF_TOKEN):
             model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
         else:
             if hf_token:
-                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=hf_token) # , torch_dtype=float16
+                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=hf_token, torch_dtype=float32) # , torch_dtype=float16 - not compatible with CPU and Gemma 3
             else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") # , torch_dtype=float16
+                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=float32) # , torch_dtype=float16
     else:
         if "flan" in model_name:
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)#, torch_dtype=torch.float16)
         else:
             if hf_token:
-                model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token) # , torch_dtype=float16
+                model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token, torch_dtype=float32) # , torch_dtype=float16
             else:
-                model = AutoModelForCausalLM.from_pretrained(model_name) # , torch_dtype=float16
+                model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=float32) # , torch_dtype=float16
 
     if hf_token:
         tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = context_length, token=hf_token)
@@ -212,6 +212,7 @@ with app:
 
     session_hash_textbox = gr.Textbox(value="", visible=False)
     s3_logs_output_textbox = gr.Textbox(label="S3 logs", visible=False)
+    latest_user_rating_data_path = gr.Textbox(label="output_ratings_textbox", visible=False)
 
     access_logs_state = gr.State(access_logs_data_folder + 'dataset1.csv')
     access_s3_logs_loc_state = gr.State(access_logs_data_folder)
@@ -222,9 +223,8 @@ with app:
 
     gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
 
-    gr.Markdown(f"""Chat with PDF, web page or (new) csv/Excel documents. The default is a small model ({SMALL_MODEL_NAME}), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative ({LARGE_MODEL_NAME}), can reason a little better, but is much slower (See Advanced settings tab).\n\nBy default '[{DEFAULT_DATA_SOURCE_NAME}]({DEFAULT_DATA_SOURCE})' is loaded.If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.""")
-
-
+    gr.Markdown(f"""Chat with PDF, web page or (new) csv/Excel documents. The default is a small model ({SMALL_MODEL_NAME}), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative ({LARGE_MODEL_NAME}, if available), can reason a little better, but is much slower (See Advanced settings tab).\n\nBy default '[{DEFAULT_DATA_SOURCE_NAME}]({DEFAULT_DATA_SOURCE})' is loaded.If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.""")
+
     with gr.Row():
         current_source = gr.Textbox(label="Current data source(s)", value=DEFAULT_DATA_SOURCE, scale = 10)
         current_model = gr.Textbox(label="Current model", value=model_type, scale = 3)
@@ -233,10 +233,11 @@ with app:
 
     with gr.Row():
         #chat_height = 500
-        chatbot = gr.Chatbot(value=None, avatar_images=('user.jfif', 'bot.jpg'), scale = 1, resizable=True, show_copy_all_button=True, show_copy_button=True, show_share_button=
-        with gr.Accordion("
-        sources = gr.HTML(value = "
+        chatbot = gr.Chatbot(value=None, avatar_images=('user.jfif', 'bot.jpg'), scale = 1, resizable=True, show_copy_all_button=True, show_copy_button=True, show_share_button=None, type='messages', max_height=500)
+        with gr.Accordion("Source paragraphs with the most relevant text will appear here", open = True):
+            sources = gr.HTML(value = "No relevant source paragraphs currently loaded", max_height=500) # , height=chat_height
 
+    gr.Markdown("Make sure that your questions are as specific as possible to allow the search engine to find the most relevant text to your query.")
     with gr.Row():
         message = gr.Textbox(
             label="Enter your question here",
@@ -247,12 +248,11 @@ with app:
         clear = gr.Button(value="Clear chat", variant="secondary", scale=1)
         stop = gr.Button(value="Stop generating", variant="stop", scale=1)
 
-    examples_set = gr.Radio(label="Example questions",
-                            choices=default_examples_set)
+    examples_set = gr.Radio(label="Example questions", choices=default_examples_set)
 
     current_topic = gr.Textbox(label="Feature currently disabled - Keywords related to current conversation topic.", placeholder="Keywords related to the conversation topic will appear here", visible=False)
 
-    with gr.Tab("Load in a different file
+    with gr.Tab("Load in a different file/webpage"):
         with gr.Accordion("PDF file", open = False):
             in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
             load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
@@ -272,15 +272,25 @@ with app:
         ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
         file_out_box = gr.File(file_count='single', file_types=['.zip'])
 
-    with gr.Tab("Advanced
+    with gr.Tab("Advanced settings - change model/model options"):
         out_passages = gr.Slider(minimum=1, value = 2, maximum=10, step=1, label="Choose number of passages to retrieve from the document. Numbers greater than 2 may lead to increased hallucinations or input text being truncated.")
         temp_slide = gr.Slider(minimum=0.1, value = 0.5, maximum=1, step=0.1, label="Choose temperature setting for response generation.")
         with gr.Row():
+            with gr.Column(scale=3):
+                model_choice = gr.Radio(label="Choose a chat model", value=SMALL_MODEL_NAME, choices = default_model_choices)
+                if RUN_GEMINI_MODELS == "1":
+                    in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password",interactive=True, visible=True)
+                else:
+                    in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password",interactive=True, visible=False)
+            with gr.Column(scale=1):
+                change_model_button = gr.Button(value="Load model")
+
+        if LOAD_LARGE_MODEL == "1":
+            with gr.Accordion("Choose number of model layers to send to GPU (WARNING: please don't modify unless you are sure you have a GPU).", open = False, visible=True):
+                gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU.", value=0, minimum=0, maximum=100, step = 1, visible=True)
+        else:
+            with gr.Accordion("Choose number of model layers to send to GPU (WARNING: please don't modify unless you are sure you have a GPU).", open = False, visible=False):
+                gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU.", value=0, minimum=0, maximum=100, step = 1, visible=False)
 
         load_text = gr.Text(label="Load status")
 
@@ -318,7 +328,8 @@ with app:
     clear.click(lambda: None, None, chatbot, queue=False)
 
     # Thumbs up or thumbs down voting function
-    chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state],
+    chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], [latest_user_rating_data_path]).\
+        success(fn = upload_file_to_s3, inputs=[latest_user_rating_data_path, latest_user_rating_data_path], outputs=[s3_logs_output_textbox])
 
 
 ###
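The feedback fix above hinges on Gradio's event chaining: chatbot.like() runs chatf.vote, which now returns the path of the CSV it wrote, and .success() only fires the S3 upload once that step completes. Below is a minimal, self-contained sketch of the same pattern with stand-in handlers; it is not the project's real vote or upload_file_to_s3 code.

# Sketch of the .like(...).success(...) chaining used above (stand-in handlers).
import gradio as gr

def record_vote(data: gr.LikeData):
    # gr.LikeData carries .liked (True/False) and .value (the rated message).
    print("liked:", data.liked, "value:", data.value)
    return "feedback/thumbs_up_down_data.csv"  # path a real handler might return

def push_to_s3(path):
    # Stand-in for upload_file_to_s3; just reports what would be uploaded.
    return f"would upload {path}"

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    rating_path = gr.Textbox(visible=False)
    s3_status = gr.Textbox(label="S3 logs", visible=False)
    chatbot.like(record_vote, None, [rating_path]).success(
        push_to_s3, inputs=[rating_path], outputs=[s3_status])

# demo.launch()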
chatfuncs/aws_functions.py
CHANGED
@@ -1,9 +1,7 @@
 from typing import Type, List
 import pandas as pd
 import boto3
-import tempfile
 import os
-from chatfuncs.helper_functions import get_or_create_env_var
 from chatfuncs.config import AWS_REGION, RUN_AWS_FUNCTIONS, QA_CHATBOT_BUCKET
 
 PandasDataFrame = Type[pd.DataFrame]
@@ -17,7 +15,7 @@ if RUN_AWS_FUNCTIONS == "1":
         bucket_name = os.environ['']
         session = boto3.Session() # profile_name="default"
     except Exception as e:
-        print(e)
+        print("Failed to start boto3 session due to:", e)
 
 def get_assumed_role_info():
     sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
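upload_file_to_s3 itself is untouched by this commit and its body is not shown in the diff; for context, the core of such a helper is usually a single boto3 call. A hedged sketch follows, with placeholder bucket and key names that are not values from the repo.

# Hypothetical sketch of an S3 upload helper in the style of upload_file_to_s3;
# the real implementation lives in chatfuncs/aws_functions.py and is not shown here.
import boto3

def upload_file_to_s3_sketch(local_path: str, s3_key: str, bucket: str) -> str:
    s3 = boto3.client("s3")
    s3.upload_file(Filename=local_path, Bucket=bucket, Key=s3_key)
    return f"Uploaded {local_path} to s3://{bucket}/{s3_key}"

# Example with placeholder values:
# upload_file_to_s3_sketch("feedback/thumbs_up_down_data.csv",
#                          "feedback/thumbs_up_down_data.csv", "my-example-bucket")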
chatfuncs/chatfuncs.py
CHANGED
@@ -34,7 +34,7 @@ from langchain.docstore.document import Document
 
 from chatfuncs.prompts import instruction_prompt_template_alpaca, instruction_prompt_mistral_orca, instruction_prompt_phi3, instruction_prompt_llama3, instruction_prompt_qwen, instruction_prompt_template_orca, instruction_prompt_gemma
 from chatfuncs.model_load import temperature, max_new_tokens, sample, repetition_penalty, top_p, top_k, torch_device, CtransGenGenerationConfig, max_tokens
-from chatfuncs.config import GEMINI_API_KEY, AWS_DEFAULT_REGION, LARGE_MODEL_NAME, SMALL_MODEL_NAME, RUN_AWS_FUNCTIONS
+from chatfuncs.config import GEMINI_API_KEY, AWS_DEFAULT_REGION, LARGE_MODEL_NAME, SMALL_MODEL_NAME, RUN_AWS_FUNCTIONS, FEEDBACK_LOGS_FOLDER
 
 model_object = [] # Define empty list for model functions to run
 tokenizer = [] # Define empty list for model functions to run
@@ -1187,35 +1187,41 @@ def hide_block():
 
 # Vote function
 
-def vote(data: gr.LikeData, chat_history, instruction_prompt_out, model_type):
-    import os
-    import pandas as pd
+def vote(data: gr.LikeData, chat_history:list[dict], instruction_prompt_out:str, model_type:str, feedback_folder:str=FEEDBACK_LOGS_FOLDER):
 
+    query_text = next(
+        (entry['content'] for entry in reversed(chat_history) if entry.get('role') == 'user'),
+        "")
+
+    response_text = next(
+        (entry['content'] for entry in reversed(chat_history) if entry.get('role') == 'assistant'),
+        "")
+
+    chat_history_latest = str(query_text + " - " + response_text)
+
+    if isinstance(data.value, list): chosen_response = data.value[-1]
+    else: chosen_response = data.value
 
     response_df = pd.DataFrame(data={"thumbs_up":data.liked,
-                                     "chosen_response":
+                                     "chosen_response":chosen_response,
                                      "input_prompt":instruction_prompt_out,
-                                     "chat_history":
+                                     "chat_history":chat_history_latest,
                                      "model_type": model_type,
                                      "date_time": pd.Timestamp.now()}, index=[0])
 
     if data.liked:
-        print("You upvoted this response:
+        print("You upvoted this response:", chosen_response)
+
+    else:
+        print("You downvoted this response:", chosen_response)
 
-            existing_thumbs_up_df = pd.read_csv("thumbs_up_data.csv")
-            thumbs_up_df_concat = pd.concat([existing_thumbs_up_df, response_df], ignore_index=True).drop("Unnamed: 0",axis=1, errors="ignore")
-            thumbs_up_df_concat.to_csv("thumbs_up_data.csv")
-        else:
-            response_df.to_csv("thumbs_up_data.csv")
-    else:
+    output_data_path = feedback_folder + "thumbs_up_down_data.csv"
 
-            existing_thumbs_down_df = pd.read_csv("thumbs_down_data.csv")
-            thumbs_down_df_concat = pd.concat([existing_thumbs_down_df, response_df], ignore_index=True).drop("Unnamed: 0",axis=1, errors="ignore")
-            thumbs_down_df_concat.to_csv("thumbs_down_data.csv")
-        else:
-            response_df.to_csv("thumbs_down_data.csv")
+    if os.path.isfile(output_data_path):
+        existing_thumbs_down_df = pd.read_csv(output_data_path)
+        thumbs_down_df_concat = pd.concat([existing_thumbs_down_df, response_df], ignore_index=True).drop("Unnamed: 0",axis=1, errors="ignore")
+        thumbs_down_df_concat.to_csv(output_data_path)
+    else:
+        response_df.to_csv(output_data_path)
 
+    return output_data_path
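The rewritten vote() assumes the chat history arrives in Gradio's 'messages' format (a list of role/content dicts, matching type='messages' on the Chatbot in app.py) and pulls the latest user and assistant turns with next() over the reversed list. A standalone sketch of that extraction pattern, using invented history for illustration:

# Sketch of the history-extraction pattern used by the new vote() handler.
# The chat_history contents here are made up for illustration only.
chat_history = [
    {"role": "user", "content": "What does the document say about fees?"},
    {"role": "assistant", "content": "It describes a flat annual fee."},
]

query_text = next(
    (entry["content"] for entry in reversed(chat_history) if entry.get("role") == "user"), "")
response_text = next(
    (entry["content"] for entry in reversed(chat_history) if entry.get("role") == "assistant"), "")

print(query_text + " - " + response_text)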
chatfuncs/config.py
CHANGED
@@ -163,8 +163,12 @@ DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS',
 
 ###
 # RUN CONFIG
+RUN_GEMINI_MODELS = get_or_create_env_var('RUN_GEMINI_MODELS', '1')
+
 GEMINI_API_KEY = get_or_create_env_var('GEMINI_API_KEY', '')
 
+# NOTE THAT THIS IS REQUIRED
+
 HF_TOKEN = get_or_create_env_var('HF_TOKEN', '')
 
@@ -181,17 +185,28 @@ SMALL_MODEL_NAME = get_or_create_env_var("SMALL_MODEL_NAME", "Gemma 3 1B (small,
 
 SMALL_MODEL_REPO_ID = get_or_create_env_var("SMALL_MODEL_REPO_ID", 'google/gemma-3-1b-it') #'Qwen/Qwen2-0.5B-Instruct')
 
+LOAD_LARGE_MODEL = get_or_create_env_var("LOAD_LARGE_MODEL", '0')
+
 LARGE_MODEL_NAME = get_or_create_env_var("LARGE_MODEL_NAME", "Phi 3.5 Mini (larger, slow)")
 
 LARGE_MODEL_REPO_ID = get_or_create_env_var("LARGE_MODEL_REPO_ID", "QuantFactory/Phi-3.5-mini-instruct-GGUF") # "QuantFactory/Phi-3-mini-128k-instruct-GGUF"), # "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"), #"microsoft/Phi-3-mini-4k-instruct-gguf"),#"TheBloke/Mistral-7B-OpenOrca-GGUF"),
+
 LARGE_MODEL_GGUF_FILE = get_or_create_env_var("LARGE_MODEL_GGUF_FILE", "Phi-3.5-mini-instruct.Q4_K_M.gguf") #"Phi-3-mini-128k-instruct.Q4_K_M.gguf") #"Meta-Llama-3-8B-Instruct-v2.Q6_K.gguf") #"Phi-3-mini-4k-instruct-q4.gguf")#"mistral-7b-openorca.Q4_K_M.gguf"),
 
+# Build up options for models
+default_model_choices = [SMALL_MODEL_NAME]
+
+if LOAD_LARGE_MODEL == "1":
+    default_model_choices.append(LARGE_MODEL_NAME)
+
 if RUN_AWS_FUNCTIONS == "1":
-    default_model_choices
-
-
+    default_model_choices.extend(["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"])
+
+if RUN_GEMINI_MODELS == "1":
+    default_model_choices.extend(["gemini-2.0-flash-001", "gemini-2.5-flash-preview-04-17", "models/gemini-2.5-pro-exp-03-25"])
 
-DEFAULT_MODEL_CHOICES = get_or_create_env_var("DEFAULT_MODEL_CHOICES", default_model_choices)
+DEFAULT_MODEL_CHOICES = get_or_create_env_var("DEFAULT_MODEL_CHOICES", str(default_model_choices))
 
 EMBEDDINGS_MODEL_NAME = get_or_create_env_var('EMBEDDINGS_MODEL_NAME', "BAAI/bge-base-en-v1.5") #"mixedbread-ai/mxbai-embed-xsmall-v1"
 
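Every setting in config.py goes through get_or_create_env_var, which is defined elsewhere in the repo and not shown in this diff. A minimal sketch of the pattern it implies follows; this is an assumption about its behaviour, not the actual implementation, shown with the two flags added by this commit.

# Assumed behaviour of get_or_create_env_var (actual helper not in this diff):
# return the environment variable if set, otherwise register the default.
import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# The flags introduced here are plain "0"/"1" strings.
RUN_GEMINI_MODELS = get_or_create_env_var("RUN_GEMINI_MODELS", "1")
LOAD_LARGE_MODEL = get_or_create_env_var("LOAD_LARGE_MODEL", "0")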
chatfuncs/model_load.py
CHANGED
@@ -17,15 +17,15 @@ temperature: float = 0.1
 top_k: int = 3
 top_p: float = 1
 repetition_penalty: float = 1.15
-flan_alpaca_repetition_penalty: float = 1.3
+#flan_alpaca_repetition_penalty: float = 1.3
 last_n_tokens: int = 64
 max_new_tokens: int = 1024
 seed: int = 42
 reset: bool = False
 stream: bool = True
 threads: int = threads
-batch_size:int =
-context_length:int =
+batch_size:int = 128
+context_length:int = 4096
 sample = True
 
 # Bedrock parameters
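batch_size and context_length now have concrete defaults (128 and 4096). In llama-cpp-python these would typically map to the n_batch and n_ctx arguments when the GGUF model is constructed; the sketch below shows that mapping and is an assumption, since the actual model-loading code is outside this diff.

# Hedged sketch: how the new defaults would typically reach llama-cpp-python.
# The parameter mapping is an assumption; the real loading code is not shown in this commit.
from llama_cpp import Llama

context_length = 4096
batch_size = 128

llm = Llama(
    model_path="Phi-3.5-mini-instruct.Q4_K_M.gguf",  # LARGE_MODEL_GGUF_FILE default
    n_ctx=context_length,   # maximum context window
    n_batch=batch_size,     # prompt-processing batch size
    n_gpu_layers=0,         # CPU-only, consistent with LLAMA_CUBLAS=0 in the Dockerfile
)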
requirements.txt
CHANGED
@@ -5,9 +5,9 @@ beautifulsoup4==4.13.4
 google-generativeai==0.8.5
 pandas==2.2.3
 transformers==4.51.3
-# For Windows https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.2/llama_cpp_python-0.3.2-cp311
-llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-
+# For Windows https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.2/llama_cpp_python-0.3.2-cp311-#cp311-win_amd64.whl -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
+#llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu # Older version based on wheel if the below line doesn't work
+llama-cpp-python==0.3.8 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
 torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu
 sentence_transformers==4.1.0
 faiss-cpu==1.10.0
requirements_aws.txt
ADDED
@@ -0,0 +1,27 @@
+#langchain==0.3.24
+#langchain-huggingface==0.1.2 # Loaded in Dockerfile
+boto3==1.38.0
+python-dotenv==1.1.0
+langchain-community==0.3.22
+beautifulsoup4==4.13.4
+google-generativeai==0.8.5
+pandas==2.2.3
+transformers==4.51.3
+# For Windows https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.2/llama_cpp_python-0.3.2-cp311-#cp311-win_amd64.whl -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
+#llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu # For linux
+llama-cpp-python==0.3.8 -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
+#torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cpu # Loaded in Dockerfile
+#sentence_transformers==4.1.0 # Loaded in Dockerfile
+faiss-cpu==1.10.0
+pypdf==5.4.0
+python-docx==1.1.2
+#keybert==0.9.0 # Loaded in Dockerfile
+#span-marker==1.7.0 # Loaded in Dockerfile
+gradio==5.25.2
+nltk==3.9.1
+bm25s==0.2.12
+PyStemmer==2.2.0.3
+scikit-learn==1.6.1
+scipy==1.15.2
+numpy==1.26.4
+
requirements_gpu.txt
CHANGED
@@ -1,4 +1,4 @@
-langchain==0.3.24
+#langchain==0.3.24
 langchain-community==0.3.22
 langchain-huggingface==0.1.2
 beautifulsoup4==4.13.4
@@ -6,8 +6,8 @@ google-generativeai==0.8.5
 pandas==2.2.3
 transformers==4.51.3
 torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cu121
-llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
-
+#llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+llama-cpp-python==0.3.8 -C cmake.args="-DGGML_CUDA=on"
 sentence_transformers==4.1.0
 faiss-cpu==1.10.0
 pypdf==5.4.0