Upload 5 files
- 23NYCRR500_0.pdf +0 -0
- app.py +196 -0
- htmlTemplates.py +44 -0
- readme.md +74 -0
- requirements.txt +11 -0
23NYCRR500_0.pdf
ADDED
Binary file (566 kB).
app.py
ADDED
@@ -0,0 +1,196 @@
import streamlit as st

# Legacy LangChain imports, kept for the commented-out conversation chain below:
# from langchain.text_splitter import CharacterTextSplitter
# from langchain.embeddings import OllamaEmbeddings
# from langchain.vectorstores import FAISS
# from langchain.callbacks.manager import CallbackManager
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# from langchain.chat_models import ChatOllama
# from langchain.memory import ConversationBufferMemory
# from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from functools import wraps
import time

from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from haystack.dataclasses import Document

from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import ComponentDevice
from haystack.components.generators import HuggingFaceLocalGenerator
from haystack.components.builders import PromptBuilder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever


# Decorator for measuring execution time
def timeit(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        total_time = end_time - start_time
        print(f"\nFunction {func.__name__} took {total_time:.4f} seconds")
        return result

    return timeit_wrapper


@timeit
def load_chunk_data():
    # Load data from websites
    urls = ['https://csrc.nist.gov/projects/olir/informative-reference-catalog/details?referenceId=99#/',
            'https://attack.mitre.org/',
            'https://cloudsecurityalliance.org/',
            'https://www.ftc.gov/business-guidance/small-businesses/cybersecurity/basics',
            'https://www.pcisecuritystandards.org/',
            'https://www.google.com/url?q=https://gdpr.eu/&sa=U&sqi=2&ved=2ahUKEwjJ8Ib2_6WFAxUxhYkEHQcPDYkQFnoECBoQAQ&usg=AOvVaw0wq2V0DbVTnZS1IzbdX0Os']

    # Split the loaded data into ~1000-character chunks with a 40-character overlap
    text_splitter = CharacterTextSplitter(separator='\n',
                                          chunk_size=1000,
                                          chunk_overlap=40)

    docs = []
    for url in urls:
        loader = WebBaseLoader(url)
        data = loader.load()
        docs.extend(text_splitter.split_documents(data))

    # Load data from the bundled PDF
    loader = PyPDFLoader("23NYCRR500_0.pdf")
    pages = loader.load_and_split()
    docs.extend(text_splitter.split_documents(pages))

    # Convert LangChain documents into Haystack documents
    raw_docs = []
    for doc in docs:
        raw_docs.append(Document(content=doc.page_content, meta=doc.metadata))
    return raw_docs


@timeit
def indexing_pipeline(raw_docs):
    # Clean -> split -> embed -> write into an in-memory document store
    document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
    indexing = Pipeline()
    indexing.add_component("cleaner", DocumentCleaner())
    indexing.add_component("splitter", DocumentSplitter(split_by='sentence', split_length=2))
    indexing.add_component("doc_embedder", SentenceTransformersDocumentEmbedder(model="thenlper/gte-large",
                                                                                device=ComponentDevice.from_str("cpu"),
                                                                                meta_fields_to_embed=["title"]))
    indexing.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE))

    indexing.connect("cleaner", "splitter")
    indexing.connect("splitter", "doc_embedder")
    indexing.connect("doc_embedder", "writer")
    indexing.run({"cleaner": {"documents": raw_docs}})
    return document_store


@timeit
def rag_pipeline(document_store):
    generator = HuggingFaceLocalGenerator("HuggingFaceH4/zephyr-7b-beta",
                                          generation_kwargs={"max_new_tokens": 1000})
    generator.warm_up()
    # LangChain loaders store the origin URL/path under the 'source' metadata key,
    # so the template reads doc.meta['source'] (the original 'url' key was never set).
    prompt_template = """<|system|>Using the information contained in the context, give a comprehensive answer to the question.
If the answer is contained in the context, also report the source URL.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{% for doc in documents %}
{{ doc.content }} URL:{{ doc.meta.get('source', '') }}
{% endfor %};
Question: {{query}}
</s>
<|assistant|>
"""
    prompt_builder = PromptBuilder(template=prompt_template)
    rag = Pipeline()
    rag.add_component("text_embedder", SentenceTransformersTextEmbedder(model="thenlper/gte-large",
                                                                        device=ComponentDevice.from_str("cpu")))
    rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=5))
    rag.add_component("prompt_builder", prompt_builder)
    rag.add_component("llm", generator)

    rag.connect("text_embedder", "retriever")
    rag.connect("retriever.documents", "prompt_builder.documents")
    rag.connect("prompt_builder.prompt", "llm.prompt")
    return rag


@timeit
def get_generative_answer(query, rag):
    # The query is embedded for retrieval and also passed verbatim to the prompt builder
    results = rag.run({
        "text_embedder": {"text": query},
        "prompt_builder": {"query": query}
    })

    answer = results["llm"]["replies"][0]
    return answer


# Handle user input and render the generated response
@timeit
def handle_userinput(user_question, rag):
    # Echo the question with the user template, then render the generated answer
    st.write(user_template.replace("{{MSG}}", user_question), unsafe_allow_html=True)
    answer = get_generative_answer(user_question, rag)
    st.write(bot_template.replace("{{MSG}}", answer), unsafe_allow_html=True)


# Legacy LangChain conversation chain, kept for reference:
# @timeit
# def get_conversation_chain(vectorstore):
#     llm = ChatOllama(
#         model="llama2:70b-chat",
#         callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
#         # num_gpu=2
#     )
#     # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
#     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
#     conversation_chain = ConversationalRetrievalChain.from_llm(
#         llm=llm, retriever=vectorstore.as_retriever(), memory=memory
#     )
#     return conversation_chain


# Main function
def main():
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialize session state variables
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    # Streamlit app layout
    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        # Load and index data only once per session
        if "document_store" not in st.session_state:
            raw_docs = load_chunk_data()
            document_store = indexing_pipeline(raw_docs)
            st.session_state.document_store = document_store
            st.session_state.rag = rag_pipeline(document_store)
        handle_userinput(user_question, st.session_state.rag)


if __name__ == "__main__":
    main()
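Outside Streamlit, the same pipeline can be exercised from a plain script. A minimal sketch, assuming `app.py` is importable from the working directory and the URLs/PDF above are reachable; the question is an invented example:

```python
# Sketch: build the index once, then query the RAG pipeline directly.
from app import load_chunk_data, indexing_pipeline, rag_pipeline, get_generative_answer

raw_docs = load_chunk_data()          # scrape the URLs and split the bundled PDF
store = indexing_pipeline(raw_docs)   # embed the chunks into the in-memory store
rag = rag_pipeline(store)             # embedder -> retriever -> prompt builder -> LLM

question = "What does 23 NYCRR 500 say about multi-factor authentication?"
print(get_generative_answer(question, rag))
```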
htmlTemplates.py
ADDED
@@ -0,0 +1,44 @@
css = """
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
    width: 20%;
}
.chat-message .avatar img {
    max-width: 78px;
    max-height: 78px;
    border-radius: 50%;
    object-fit: cover;
}
.chat-message .message {
    width: 80%;
    padding: 0 1.5rem;
    color: #fff;
}
</style>
"""

bot_template = """
<div class="chat-message bot">
    <div class="avatar">
        <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
"""

user_template = """
<div class="chat-message user">
    <div class="avatar">
        <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
    </div>
    <div class="message">{{MSG}}</div>
</div>
"""
readme.md
ADDED
@@ -0,0 +1,74 @@
# MultiPDF Chat App

## Introduction
------------
The MultiPDF Chat App is a Python application that allows you to chat with multiple PDF documents. You can ask questions about the PDFs in natural language, and the application provides relevant responses based on the content of the documents, using a language model to generate answers to your queries. Please note that the app only responds to questions related to the loaded PDFs.

## How It Works
------------

![MultiPDF Chat App Diagram](./docs/HLD.png)

The application follows these steps to provide responses to your questions (a short sketch of the similarity-matching step follows the list):

1. PDF Loading: The app reads multiple PDF documents and extracts their text content.

2. Text Chunking: The extracted text is divided into smaller chunks that can be processed effectively.

3. Language Model: The application uses an embedding language model to generate vector representations (embeddings) of the text chunks.

4. Similarity Matching: When you ask a question, the app embeds it and identifies the text chunks that are most semantically similar to it.

5. Response Generation: The selected chunks are passed to the language model, which generates a response based on the relevant content of the PDFs.
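The similarity-matching step (4) is the easiest to see in isolation. Below is a minimal, illustrative sketch: it uses the same `thenlper/gte-large` embedding model that `app.py` configures, but the chunks and question are invented examples.

```python
# Illustrative only: cosine similarity between a question and candidate chunks.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("thenlper/gte-large")
chunks = [
    "Covered entities must use multi-factor authentication for remote access.",
    "The standard describes annual penetration-testing requirements.",
]
query = "Is multi-factor authentication required?"

# The highest-scoring chunks are the ones handed to the generator.
scores = util.cos_sim(model.encode(query), model.encode(chunks))
print(scores)
```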
## Dependencies and Installation
----------------------------
To install the MultiPDF Chat App, please follow these steps:
1. (Only needed for the legacy LangChain/Ollama chain that is commented out in `app.py`; the current app runs a local Hugging Face model.) Download the [Ollama library](https://github.com/jmorganca/ollama):
```
curl https://ollama.ai/install.sh | sh
```
2. (Legacy, as above) Pull the chat model, [LLAMA2](https://ollama.ai/library/llama2):
```
ollama pull llama2:70b-chat
```
3. Create a new environment with Python 3.9 and activate it; in this case we use conda:
```
conda create -n chat-with-pdf python=3.9
```
```
conda activate chat-with-pdf
```

4. Clone the repository to your local machine:
```
git clone https://github.com/jorge-armando-navarro-flores/chat-with-multiple-PDFs-LLAMA2.git
```
```
cd chat-with-multiple-PDFs-LLAMA2
```

5. Install the required dependencies by running the following command (a quick sanity check follows this step):
```
pip install -r requirements.txt
```
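As a quick way to confirm the installation worked (a hypothetical snippet, not part of the repo), check that the core imports resolve:

```python
# Hypothetical post-install check: these cover the app's main dependencies.
import haystack
import langchain_community
import sentence_transformers
import streamlit

print("All core imports succeeded")
```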
## Usage
-----
To use the MultiPDF Chat App, follow these steps:

1. Run the `app.py` file using the Streamlit CLI. Execute the following command:
```
streamlit run app.py
```

2. The application will launch in your default web browser, displaying the user interface.
![MultiPDF Chat App Diagram](./docs/1.png)

3. Load multiple PDF documents into the app by following the provided instructions.
![MultiPDF Chat App Diagram](./docs/2.png)

4. Ask questions in natural language about the loaded PDFs using the chat interface.
![MultiPDF Chat App Diagram](./docs/3.png)
requirements.txt
ADDED
@@ -0,0 +1,11 @@
haystack-ai
transformers
bitsandbytes==0.39.0
accelerate
sentence_transformers
langchain_community
langchain
pypdf
streamlit
IPython
beautifulsoup4