Nechba commited on
Commit
5d311f4
·
verified ·
1 Parent(s): 2bc53ef

Upload 5 files

Browse files
Files changed (5) hide show
  1. 23NYCRR500_0.pdf +0 -0
  2. app.py +196 -0
  3. htmlTemplates.py +44 -0
  4. readme.md +74 -0
  5. requirements.txt +11 -0
23NYCRR500_0.pdf ADDED
Binary file (566 kB). View file
 
app.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # from langchain.text_splitter import CharacterTextSplitter
4
+ # from langchain.embeddings import OllamaEmbeddings
5
+ # from langchain.vectorstores import FAISS
6
+ # from langchain.callbacks.manager import CallbackManager
7
+ # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
8
+ # from langchain.chat_models import ChatOllama
9
+ # from langchain.memory import ConversationBufferMemory
10
+ # from langchain.chains import ConversationalRetrievalChain
11
+ from htmlTemplates import css, bot_template, user_template
12
+ from functools import wraps
13
+ # -------
14
+ import time
15
+ from IPython.display import Image
16
+ from pprint import pprint
17
+ import torch
18
+ import rich
19
+ import random
20
+ from langchain_community.document_loaders import WebBaseLoader
21
+ from langchain.text_splitter import CharacterTextSplitter
22
+ from langchain_community.document_loaders import PyPDFLoader
23
+ from haystack.dataclasses import Document
24
+
25
+ from haystack import Pipeline
26
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
27
+ from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
28
+ from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
29
+ from haystack.components.writers import DocumentWriter
30
+ from haystack.document_stores.types import DuplicatePolicy
31
+ from haystack.utils import ComponentDevice
32
+ from haystack.components.generators import HuggingFaceLocalGenerator
33
+ from haystack.components.builders import PromptBuilder
34
+ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
35
+
36
+ # Decorator for measuring execution time
37
def timeit(func):
    """Wrap ``func`` so that every call prints its wall-clock duration.

    The wrapper forwards all positional and keyword arguments unchanged,
    prints one line with the wrapped function's name and the elapsed time
    (measured with ``time.perf_counter``) and returns whatever ``func``
    returned. Nothing is printed if ``func`` raises.
    """

    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        started = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print(f"\nFunction {func.__name__} Took {elapsed:.4f} seconds")
        return outcome

    return timeit_wrapper
48
+
49
+
50
@timeit
def load_chunk_data():
    """Load the reference web pages and the bundled PDF as Haystack documents.

    Every source is read with its LangChain loader, split on newlines by a
    ``CharacterTextSplitter`` (``chunk_size=1000``, ``chunk_overlap=40``) and
    converted to a Haystack ``Document`` that keeps the loader's metadata.

    Returns:
        list: one Haystack ``Document`` per text chunk, web chunks first,
        then the PDF chunks.
    """
    # Load data from websites.
    urls = [
        'https://csrc.nist.gov/projects/olir/informative-reference-catalog/details?referenceId=99#/',
        'https://attack.mitre.org/',
        'https://cloudsecurityalliance.org/',
        'https://www.ftc.gov/business-guidance/small-businesses/cybersecurity/basics',
        'https://www.pcisecuritystandards.org/',
        # Fetch the site directly: the previous entry was a Google
        # search-result redirect link, which depends on Google's tracking
        # endpoint rather than on the GDPR site itself.
        'https://gdpr.eu/',
    ]

    # A single splitter serves every source. It is created before the loop so
    # that it is not rebuilt per URL and is always defined for the PDF step.
    text_splitter = CharacterTextSplitter(separator='\n',
                                          chunk_size=1000,
                                          chunk_overlap=40)

    docs = []
    for url in urls:
        data = WebBaseLoader(url).load()
        docs.extend(text_splitter.split_documents(data))

    # Load data from the PDF shipped next to the app.
    pages = PyPDFLoader("23NYCRR500_0.pdf").load_and_split()
    docs.extend(text_splitter.split_documents(pages))

    # Convert LangChain documents into Haystack documents.
    return [Document(content=doc.page_content, meta=doc.metadata) for doc in docs]
84
@timeit
def indexing_pipeline(raw_docs):
    """Clean, split, embed and store ``raw_docs``; return the filled store.

    The Haystack pipeline runs four stages in order: ``cleaner`` ->
    ``splitter`` (two sentences per chunk) -> ``doc_embedder``
    (``thenlper/gte-large`` on CPU, embedding the ``title`` meta field
    together with the text) -> ``writer`` (overwrites duplicates).

    Args:
        raw_docs: Haystack documents fed to the ``cleaner`` stage.

    Returns:
        The ``InMemoryDocumentStore`` (cosine similarity) the writer filled.
    """
    store = InMemoryDocumentStore(embedding_similarity_function="cosine")

    embedder = SentenceTransformersDocumentEmbedder(
        model="thenlper/gte-large",
        device=ComponentDevice.from_str("cpu"),
        meta_fields_to_embed=["title"],
    )
    stages = [
        ("cleaner", DocumentCleaner()),
        ("splitter", DocumentSplitter(split_by='sentence', split_length=2)),
        ("doc_embedder", embedder),
        ("writer", DocumentWriter(document_store=store, policy=DuplicatePolicy.OVERWRITE)),
    ]

    pipeline = Pipeline()
    for stage_name, component in stages:
        pipeline.add_component(stage_name, component)

    # Chain each stage to the one that follows it.
    stage_names = [stage_name for stage_name, _ in stages]
    for upstream, downstream in zip(stage_names, stage_names[1:]):
        pipeline.connect(upstream, downstream)

    pipeline.run({"cleaner": {"documents": raw_docs}})
    return store
101
@timeit
def rag_pipeline(document_store):
    """Build the retrieval-augmented generation pipeline over ``document_store``.

    Stages: ``text_embedder`` (``thenlper/gte-large`` on CPU) -> ``retriever``
    (top 5 documents from the store) -> ``prompt_builder`` -> ``llm``
    (``HuggingFaceH4/zephyr-7b-beta``, at most 1000 new tokens). The generator
    is warmed up here, before the pipeline is assembled.

    Args:
        document_store: the store that the embedding retriever searches.

    Returns:
        The assembled Haystack ``Pipeline``; run it with a ``text`` input for
        ``text_embedder`` and a ``query`` input for ``prompt_builder``.
    """
    generator = HuggingFaceLocalGenerator("HuggingFaceH4/zephyr-7b-beta",
                                          generation_kwargs={"max_new_tokens": 1000})
    generator.warm_up()

    # The documents are created from the loaders' metadata, which records the
    # origin under "source"; the template therefore falls back to that key so
    # the URL the model is asked to report is not rendered empty.
    prompt_template = """<|system|>Using the information contained in the context, give a comprehensive answer to the question.
    If the answer is contained in the context, also report the source URL.
    If the answer cannot be deduced from the context, do not give an answer.</s>
    <|user|>
    Context:
    {% for doc in documents %}
    {{ doc.content }} URL:{{ doc.meta['url'] or doc.meta['source'] }}
    {% endfor %};
    Question: {{query}}
    </s>
    <|assistant|>
    """
    prompt_builder = PromptBuilder(template=prompt_template)

    rag = Pipeline()
    rag.add_component("text_embedder", SentenceTransformersTextEmbedder(model="thenlper/gte-large",
                                                                        device=ComponentDevice.from_str("cpu")))
    rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=5))
    rag.add_component("prompt_builder", prompt_builder)
    rag.add_component("llm", generator)

    rag.connect("text_embedder", "retriever")
    rag.connect("retriever.documents", "prompt_builder.documents")
    rag.connect("prompt_builder.prompt", "llm.prompt")
    return rag
131
@timeit
def get_generative_answer(query, rag):
    """Run ``query`` through the RAG pipeline and return the first reply.

    Args:
        query: the question text; sent both to the text embedder (for
            retrieval) and to the prompt builder (for the prompt).
        rag: a pipeline object whose ``run`` result exposes
            ``["llm"]["replies"]``.

    Returns:
        The first element of the generator's ``replies`` list.
    """
    pipeline_inputs = {
        "text_embedder": {"text": query},
        "prompt_builder": {"query": query},
    }
    replies = rag.run(pipeline_inputs)["llm"]["replies"]
    return replies[0]
142
+
143
+ # Function to handle user input and generate responses
144
@timeit
def handle_userinput(user_question, rag):
    """Answer ``user_question`` with ``rag`` and render it as a bot message.

    Args:
        user_question: the text the user typed.
        rag: the RAG pipeline passed on to ``get_generative_answer``.
    """
    import html  # local import: only needed to escape the generated text

    answer = get_generative_answer(user_question, rag)
    # The template is rendered with unsafe_allow_html=True, so the generated
    # text (derived from scraped web pages) is escaped before insertion to
    # keep stray markup in it from being interpreted as HTML.
    message = bot_template.replace("{{MSG}}", html.escape(answer))
    st.write(message, unsafe_allow_html=True)
148
+ # Function to create a conversation chain
149
+ # @timeit
150
+ # def get_conversation_chain(vectorstore):
151
+ # llm = ChatOllama(
152
+ # model="llama2:70b-chat",
153
+ # callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
154
+ # # num_gpu=2
155
+ # )
156
+ # # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
157
+
158
+ # memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
159
+ # conversation_chain = ConversationalRetrievalChain.from_llm(
160
+ # llm=llm, retriever=vectorstore.as_retriever(), memory=memory
161
+ # )
162
+ # return conversation_chain
163
+
164
+
165
+ # Function to handle user input and generate responses
166
+
167
+
168
+
169
+ # Main function
170
def main():
    """Render the Streamlit chat page and answer the submitted question.

    The document store and the RAG pipeline are built the first time a
    question is submitted and are then kept in ``st.session_state`` so that
    later reruns of the script reuse them.
    """
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialize session state variables
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    # Streamlit app layout
    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        # Load and index data only once. The guard is keyed on "rag", the
        # last object to be stored: if building the pipeline fails after the
        # document store was saved, the next rerun retries the pipeline
        # instead of reading a missing ``st.session_state.rag``.
        if "rag" not in st.session_state:
            if "document_store" not in st.session_state:
                raw_docs = load_chunk_data()
                st.session_state.document_store = indexing_pipeline(raw_docs)
            st.session_state.rag = rag_pipeline(st.session_state.document_store)
        print(user_question)
        handle_userinput(user_question, st.session_state.rag)


if __name__ == "__main__":
    main()
htmlTemplates.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# HTML/CSS snippets used by the Streamlit chat UI.
#
# ``css`` is written to the page once; ``bot_template`` and ``user_template``
# are message bubbles whose ``{{MSG}}`` placeholder is replaced by the caller.

# Stylesheet for the chat bubbles. The <style> element is explicitly closed so
# that it cannot swallow markup that follows it.
css = """
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
  width: 20%;
}
.chat-message .avatar img {
  max-width: 78px;
  max-height: 78px;
  border-radius: 50%;
  object-fit: cover;
}
.chat-message .message {
  width: 80%;
  padding: 0 1.5rem;
  color: #fff;
}
</style>
"""

# Bubble for answers produced by the assistant.
bot_template = """
<div class="chat-message bot">
    <div class="avatar">
        <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
"""

# Bubble for messages typed by the user.
user_template = """
<div class="chat-message user">
    <div class="avatar">
        <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
    </div>
    <div class="message">{{MSG}}</div>
</div>
"""
readme.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MultiPDF Chat App
2
+
3
+ ## Introduction
4
+ ------------
5
+ The MultiPDF Chat App is a Python application that allows you to chat with multiple PDF documents. You can ask questions about the PDFs using natural language, and the application will provide relevant responses based on the content of the documents. This app utilizes a language model to generate accurate answers to your queries. Please note that the app will only respond to questions related to the loaded PDFs.
6
+
7
+ ## How It Works
8
+ ------------
9
+
10
+ ![MultiPDF Chat App Diagram](./docs/PDF-LangChain.jpg)
11
+
12
+ The application follows these steps to provide responses to your questions:
13
+
14
+ 1. PDF Loading: The app reads multiple PDF documents and extracts their text content.
15
+
16
+ 2. Text Chunking: The extracted text is divided into smaller chunks that can be processed effectively.
17
+
18
+ 3. Language Model: The application utilizes a language model to generate vector representations (embeddings) of the text chunks.
19
+
20
+ 4. Similarity Matching: When you ask a question, the app compares it with the text chunks and identifies the most semantically similar ones.
21
+
22
+ 5. Response Generation: The selected chunks are passed to the language model, which generates a response based on the relevant content of the PDFs.
23
+
24
+ ## Dependencies and Installation
25
+ ----------------------------
26
+ To install the MultiPDF Chat App, please follow these steps:
27
+ 1. Download [Ollama library](https://github.com/jmorganca/ollama)
28
+ ```
29
+ curl https://ollama.ai/install.sh | sh
30
+ ```
31
+ 2. Pull the chat model we will use, [LLAMA2](https://ollama.ai/library/llama2):
32
+ ```
33
+ ollama pull llama2:70b-chat
34
+ ```
35
+ 3. Create a new environment with Python 3.9 and activate it; in this case we will use conda:
36
+ ```
37
+ conda create -n chat-with-pdf python=3.9
38
+ ```
39
+ ```
40
+ conda activate chat-with-pdf
41
+ ```
42
+
43
+
44
+ 4. Clone the repository to your local machine.
45
+ ```
46
+ git clone https://github.com/jorge-armando-navarro-flores/chat-with-multiple-PDFs-LLAMA2.git
47
+ ```
48
+ ```
49
+ cd chat-with-multiple-PDFs-LLAMA2
50
+ ```
51
+
52
+ 5. Install the required dependencies by running the following command:
53
+ ```
54
+ pip install -r requirements.txt
55
+ ```
56
+
57
+
58
+ ## Usage
59
+ -----
60
+ To use the MultiPDF Chat App, follow these steps:
61
+
62
+ 1. Run the `app.py` file using the Streamlit CLI. Execute the following command:
63
+ ```
64
+ streamlit run app.py
65
+ ```
66
+
67
+ 2. The application will launch in your default web browser, displaying the user interface.
68
+ ![Upload PDF](./docs/interface.png)
69
+
70
+ 3. Load multiple PDF documents into the app by following the provided instructions.
71
+ ![Upload PDF](./docs/upload_PDF.png)
72
+
73
+ 4. Ask questions in natural language about the loaded PDFs using the chat interface.
74
+ ![Ask to PDF](./docs/ask_to_PDF.png)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ haystack-ai
2
+ transformers
3
+ bitsandbytes==0.39.0
4
+ accelerate
5
+ sentence_transformers
6
+ langchain_community
7
+ langchain
8
+ pypdf
9
+ streamlit
10
+ IPython
11
+ beautifulsoup4