File size: 9,433 Bytes
0bfd27d
 
 
9ca2091
0bfd27d
ddcc450
 
 
0bfd27d
 
 
42866ce
 
0bfd27d
 
 
 
 
 
 
42866ce
0bfd27d
 
 
 
 
ddcc450
0bfd27d
 
ddcc450
 
0bfd27d
 
 
42866ce
 
 
 
ddcc450
0bfd27d
 
 
9ca2091
6f98b16
 
 
0bfd27d
6f98b16
42866ce
0bfd27d
6f98b16
0bfd27d
 
6f98b16
0bfd27d
9ca2091
 
 
ddcc450
9ca2091
0bfd27d
 
 
9ca2091
 
0bfd27d
 
42866ce
 
0bfd27d
 
42866ce
0bfd27d
 
42866ce
0bfd27d
 
 
42866ce
9ca2091
 
 
 
 
42866ce
 
9ca2091
42866ce
 
 
 
9ca2091
 
42866ce
9ca2091
 
42866ce
9ca2091
 
 
 
 
 
42866ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ca2091
42866ce
0bfd27d
9ca2091
6f98b16
 
9ca2091
 
42866ce
9ca2091
 
 
6f98b16
 
 
 
9ca2091
6f98b16
 
 
 
 
 
9ca2091
42866ce
9ca2091
42866ce
9ca2091
 
 
 
 
 
 
42866ce
9ca2091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42866ce
9ca2091
42866ce
9ca2091
 
 
42866ce
 
9ca2091
 
 
42866ce
9ca2091
 
 
 
 
 
 
42866ce
9ca2091
42866ce
9ca2091
 
 
 
42866ce
9ca2091
42866ce
 
 
0bfd27d
ddcc450
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import os
import logging
import gradio as gr
import asyncio
from dotenv import load_dotenv
from langchain_community.document_loaders import ArxivLoader  # Updated import
from langchain_community.vectorstores import Chroma  # Updated import
from langchain_huggingface import HuggingFaceEmbeddings  # Updated import
from langchain_groq import ChatGroq
from PyPDF2 import PdfReader
from huggingface_hub import login
from groq import AsyncGroq
from langchain.docstore.document import Document

# Load environment variables from a local .env file before reading keys.
load_dotenv()
HUGGING_API_KEY = os.getenv("HUGGING_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Fail fast at startup rather than at first API call.
if not HUGGING_API_KEY or not GROQ_API_KEY:
    raise ValueError("API keys for HuggingFace or Groq are missing.")

# Configure Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Authenticate with Hugging Face (for model downloads)
# NOTE: must happen before HuggingFaceEmbeddings below, which may download
# the sentence-transformers model on first use.
login(HUGGING_API_KEY)

# Load models and embeddings with a local embedding model.
# - embedding_model: local sentence-transformers encoder used by Chroma.
# - llm: LangChain Groq chat model (currently unused directly; kept for parity).
# - client: raw async Groq client used by the chat coroutines below.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
llm = ChatGroq(temperature=0, model_name="llama3-70b-8192", api_key=GROQ_API_KEY)
client = AsyncGroq(api_key=GROQ_API_KEY)

# Global state for PDF vector store (single-user/session assumption —
# all users share the most recently uploaded PDF).
pdf_vector_store = None
current_pdf_path = None

# General Chat
async def chat_with_replit(message, history):
    """Answer a general chat message via the Groq chat API.

    Args:
        message: The new user message to answer.
        history: Prior turns, either as [user, assistant] pairs (this app's
            own State format) or as {"role": ..., "content": ...} dicts
            (Gradio "messages" format) — both are accepted.

    Returns:
        The assistant's reply text, or a fixed error string on failure.
    """
    try:
        messages = [{"role": "system", "content": "You are an assistant answering user questions."}]
        for chat in history or []:
            if isinstance(chat, dict):
                # Messages-format entry: forward role/content as-is.
                messages.append({"role": chat.get("role", "user"), "content": chat.get("content", "")})
                continue
            user_msg, assistant_msg = chat
            messages.append({"role": "user", "content": user_msg})
            # Skip the "" placeholder of a still-pending turn — don't send
            # an empty assistant message to the model.
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        messages.append({"role": "user", "content": message})
        response = await client.chat.completions.create(
            messages=messages, model="llama3-70b-8192", temperature=0, max_tokens=1024, top_p=1, stream=False
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"Chat error: {e}")
        return "Error in chat response."

def chat_with_replit_sync(message, history):
    """Synchronous wrapper so Gradio callbacks can call the async chat."""
    reply = asyncio.run(chat_with_replit(message, history))
    return reply

# ArXiv Chat
async def chat_with_replit_arxiv(message, history, doi_num):
    """Answer a question about an arXiv paper using retrieval over its text.

    Loads the paper via ArxivLoader, embeds its chunks into an in-memory
    Chroma store, retrieves the 3 chunks most similar to the question, and
    asks the Groq model to answer from that context.

    Args:
        message: The user's question.
        history: Prior [user, assistant] turns; included in the prompt
            (previously this parameter was silently ignored).
        doi_num: The arXiv identifier, e.g. "2502.02523".

    Returns:
        The assistant's reply text, or an explanatory/error string.
    """
    try:
        query = str(doi_num).strip()
        if not query:
            return "No documents found for the provided arXiv number."
        loader = ArxivLoader(query=query, load_max_docs=10)
        documents = loader.load_and_split()
        if not documents:
            return "No documents found for the provided arXiv number."
        metadata = documents[0].metadata
        vector_store = Chroma.from_documents(documents, embedding_model)
        results = vector_store.similarity_search(message, k=3)
        relevant_content = "\n\n".join(doc.page_content for doc in results)
        # System context first (conventional ordering), then prior turns,
        # then the new question.
        messages = [
            {"role": "system", "content": f"Answer based on this arXiv paper {doi_num}.\nMetadata: {metadata}.\nRelevant Content: {relevant_content}"}
        ]
        for user_msg, assistant_msg in history or []:
            messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        messages.append({"role": "user", "content": message})
        response = await client.chat.completions.create(
            messages=messages, model="llama3-70b-8192", temperature=0, max_tokens=1024, top_p=1, stream=False
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"Error in chat with ArXiv PDF: {e}")
        return "Error processing chat with arXiv paper."

def chat_with_replit_arxiv_sync(message, history, doi_num):
    """Synchronous wrapper around the async arXiv chat for Gradio callbacks."""
    reply = asyncio.run(chat_with_replit_arxiv(message, history, doi_num))
    return reply

# Local PDF Chat
async def chat_with_replit_local_pdf(message, vector_store):
    """Answer a question grounded in the uploaded PDF's vector store.

    Retrieves the 3 chunks most similar to the question and asks the Groq
    model to answer from that context. Returns an instruction string when
    no vector store has been built yet.
    """
    if not vector_store:
        return "Please upload a PDF first and wait for processing to complete."
    try:
        matches = vector_store.similarity_search(message, k=3)
        snippets = [doc.page_content for doc in matches]
        context_text = "\n\n".join(snippets)
        prompt = [
            {"role": "user", "content": message},
            {"role": "system", "content": f"Answer based on the uploaded PDF.\nRelevant Content: {context_text}"},
        ]
        completion = await client.chat.completions.create(
            messages=prompt,
            model="llama3-70b-8192",
            temperature=0,
            max_tokens=1024,
            top_p=1,
            stream=False,
        )
        return completion.choices[0].message.content
    except Exception as err:
        logger.error(f"Error in chat with local PDF: {err}")
        return "Error processing chat with local PDF."

def process_pdf(pdf_file):
    """Extract text from an uploaded PDF and (re)build the global vector store.

    Skips reprocessing when the same path is uploaded again. Clearing the
    file in the UI fires the change event with None — handle that by
    resetting state instead of crashing inside PdfReader.

    Args:
        pdf_file: Filesystem path of the uploaded PDF (or None when cleared).

    Returns:
        A human-readable status string for the UI.
    """
    global pdf_vector_store, current_pdf_path
    try:
        if pdf_file is None:
            # File was removed from the upload widget; drop stale state so
            # the chat handler asks for a new upload.
            pdf_vector_store = None
            current_pdf_path = None
            return "Please upload a PDF file."
        if pdf_file != current_pdf_path:
            logger.info("Extracting text from PDF...")
            reader = PdfReader(pdf_file)
            # extract_text() may return None for image-only pages.
            text = "\n".join(page.extract_text() or "" for page in reader.pages)
            if not text.strip():
                return "Could not extract text from PDF."
            documents = [Document(page_content=text, metadata={"source": pdf_file})]
            logger.info("Creating vector store...")
            pdf_vector_store = Chroma.from_documents(documents, embedding_model)
            current_pdf_path = pdf_file
            return "PDF processed successfully. You can now ask questions."
        return "PDF already processed. Ask away!"
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        return f"Error processing PDF: {str(e)}"

# Gradio UI
def _format_chat(history):
    """Render [[user, assistant], ...] pairs as a Markdown transcript."""
    return "\n\n".join(f"**User:** {u}\n\n**Assistant:** {a}" for u, a in history)


def _queue_user_message(user_message, history):
    """Append the new user message with an empty pending reply.

    Returns the updated history for both the State and the Markdown output,
    so the user's message appears immediately while the model is working.
    """
    history = history or []
    history.append([user_message, ""])
    return history, _format_chat(history)


with gr.Blocks() as app:
    with gr.Tab(label="General Chat"):
        gr.Markdown("### Chat with the Assistant")
        with gr.Row():
            general_chat_input = gr.Textbox(placeholder="Type your message here...", label="Your Message")
            general_send_button = gr.Button("Send")
        general_chat_output = gr.Markdown(label="Chat Output")
        general_chat_history = gr.State([])

        def general_respond(history):
            """Fill in the assistant reply for the pending last turn."""
            user_message = history[-1][0]
            history[-1][1] = chat_with_replit_sync(user_message, history[:-1])
            return history, _format_chat(history)

        # Chain the two steps with .then(): two independent .click() events
        # on the same button have no ordering guarantee, so the responder
        # could previously run before the message was appended.
        general_send_button.click(
            _queue_user_message,
            inputs=[general_chat_input, general_chat_history],
            outputs=[general_chat_history, general_chat_output],
        ).then(
            general_respond,
            inputs=general_chat_history,
            outputs=[general_chat_history, general_chat_output],
        )

    with gr.Tab(label="Chat with ArXiv Paper"):
        gr.Markdown("### Ask Questions About an ArXiv Paper")
        with gr.Row():
            arxiv_input = gr.Textbox(placeholder="Enter your question here...", label="Your Question")
            arxiv_doi = gr.Textbox(placeholder="Enter arXiv number, e.g. 2502.02523", label="ArXiv Number")
            arxiv_send_button = gr.Button("Send")
        arxiv_chat_output = gr.Markdown(label="Chat Output")
        arxiv_chat_history = gr.State([])

        def arxiv_respond(history, doi_num):
            """Fill in the assistant reply for the pending last turn."""
            user_message = history[-1][0]
            history[-1][1] = chat_with_replit_arxiv_sync(user_message, history[:-1], doi_num)
            return history, _format_chat(history)

        arxiv_send_button.click(
            _queue_user_message,
            inputs=[arxiv_input, arxiv_chat_history],
            outputs=[arxiv_chat_history, arxiv_chat_output],
        ).then(
            arxiv_respond,
            inputs=[arxiv_chat_history, arxiv_doi],
            outputs=[arxiv_chat_history, arxiv_chat_output],
        )

    with gr.Tab(label="Chat with Local PDF"):
        gr.Markdown("### Ask Questions About an Uploaded PDF")
        pdf_file_input = gr.File(label="Upload PDF file", file_types=[".pdf"])
        pdf_status = gr.Textbox(label="PDF Processing Status", interactive=False)
        with gr.Row():
            pdf_chat_input = gr.Textbox(placeholder="Enter your question here...", label="Your Question")
            pdf_send_button = gr.Button("Send")
        pdf_chat_output = gr.Markdown(label="Chat Output")
        pdf_chat_history = gr.State([])

        def pdf_respond(history):
            """Fill in the assistant reply for the pending last turn."""
            user_message = history[-1][0]
            history[-1][1] = asyncio.run(chat_with_replit_local_pdf(user_message, pdf_vector_store))
            return history, _format_chat(history)

        pdf_file_input.change(process_pdf, inputs=pdf_file_input, outputs=pdf_status)
        pdf_send_button.click(
            _queue_user_message,
            inputs=[pdf_chat_input, pdf_chat_history],
            outputs=[pdf_chat_history, pdf_chat_output],
        ).then(
            pdf_respond,
            inputs=pdf_chat_history,
            outputs=[pdf_chat_history, pdf_chat_output],
        )

# Launch only when executed as a script, so the module can be imported
# (e.g. for testing) without starting a server.
if __name__ == "__main__":
    app.launch()