Update app.py
app.py CHANGED
@@ -1,11 +1,27 @@
 import sys
 import os
 import re
+import shutil
 import time
-import tempfile
 import streamlit as st
 import nltk
+import tempfile

+# Set up temporary directory for NLTK resources
+nltk_data_path = os.path.join(tempfile.gettempdir(), "nltk_data")
+os.makedirs(nltk_data_path, exist_ok=True)
+nltk.data.path = [nltk_data_path]  # Force NLTK to use only the temp directory
+
+# Force clean download of 'punkt'
+try:
+    print("Ensuring NLTK 'punkt' resource is downloaded...")
+    if not os.path.exists(os.path.join(nltk_data_path, "tokenizers/punkt")):
+        nltk.download("punkt", download_dir=nltk_data_path)
+except Exception as e:
+    print(f"Error downloading NLTK 'punkt': {e}")
+    raise e
+
+sys.path.append(os.path.abspath("."))
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
 from langchain.llms import OpenAI
@@ -15,52 +31,62 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import NLTKTextSplitter
 from patent_downloader import PatentDownloader

-
-nltk.download("punkt", quiet=True)
+PERSISTED_DIRECTORY = tempfile.mkdtemp()

-#
+# Fetch API key securely from the environment
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
-    st.error("Critical Error: OpenAI API key not found in environment variables. Please configure it.")
+    st.error("Critical Error: OpenAI API key not found in the environment variables. Please configure it.")
     st.stop()

+def check_poppler_installed():
+    if not shutil.which("pdfinfo"):
+        raise EnvironmentError(
+            "Poppler is not installed or not in PATH. Install 'poppler-utils' for PDF processing."
+        )

-def extract_patent_number(url):
-    """Extracts patent number from a Google patent link."""
-    pattern = r"/patent/([A-Z]{2}\d+)"
-    match = re.search(pattern, url)
-    return match.group(1) if match else None
-
-
-def download_pdf(patent_number):
-    """Downloads patent PDF using a temporary directory."""
-    try:
-        with tempfile.TemporaryDirectory() as temp_dir:
-            patent_downloader = PatentDownloader(verbose=True)
-            output_path = patent_downloader.download(patents=patent_number, output_path=temp_dir)
-            return output_path[0]
-    except Exception as e:
-        st.error(f"Failed to download patent PDF: {e}")
-        return None
-
+check_poppler_installed()

 def load_docs(document_path):
-    """Loads and splits PDF documents into chunks."""
     try:
-        loader = UnstructuredPDFLoader(
+        loader = UnstructuredPDFLoader(
+            document_path,
+            mode="elements",
+            strategy="fast",
+            ocr_languages=None
+        )
         documents = loader.load()
         text_splitter = NLTKTextSplitter(chunk_size=1000)
         return text_splitter.split_documents(documents)
     except Exception as e:
-        st.error(f"Failed to process PDF: {e}")
+        st.error(f"Failed to load and process PDF: {e}")
+        st.stop()
+
+def already_indexed(vectordb, file_name):
+    indexed_sources = set(
+        x["source"] for x in vectordb.get(include=["metadatas"])["metadatas"]
+    )
+    return file_name in indexed_sources

+def load_chain(file_name=None):
+    loaded_patent = st.session_state.get("LOADED_PATENT")

-def load_chain(docs):
-    vectordb = Chroma.from_documents(
-        docs, HuggingFaceEmbeddings(), persist_directory=None
+    vectordb = Chroma(
+        persist_directory=PERSISTED_DIRECTORY,
+        embedding_function=HuggingFaceEmbeddings(),
     )
+    if loaded_patent == file_name or already_indexed(vectordb, file_name):
+        st.write("✅ Already indexed.")
+    else:
+        vectordb.delete_collection()
+        docs = load_docs(file_name)
+        st.write("Number of Documents: ", len(docs))
+
+        vectordb = Chroma.from_documents(
+            docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
+        )
+        vectordb.persist()
+        st.session_state["LOADED_PATENT"] = file_name

     memory = ConversationBufferMemory(
         memory_key="chat_history",
@@ -68,7 +94,6 @@ def load_chain(docs):
         input_key="question",
         output_key="answer",
     )
-
     return ConversationalRetrievalChain.from_llm(
         OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
         vectordb.as_retriever(search_kwargs={"k": 3}),
@@ -76,8 +101,20 @@ def load_chain(docs):
         memory=memory,
     )

+def extract_patent_number(url):
+    pattern = r"/patent/([A-Z]{2}\d+)"
+    match = re.search(pattern, url)
+    return match.group(1) if match else None
+
+def download_pdf(patent_number):
+    try:
+        patent_downloader = PatentDownloader(verbose=True)
+        output_path = patent_downloader.download(patents=patent_number, output_path=tempfile.gettempdir())
+        return output_path[0]
+    except Exception as e:
+        st.error(f"Failed to download patent PDF: {e}")
+        st.stop()

-# Streamlit UI
 if __name__ == "__main__":
     st.set_page_config(
         page_title="Patent Chat: Google Patents Chat Demo",
@@ -85,10 +122,8 @@ if __name__ == "__main__":
         layout="wide",
         initial_sidebar_state="expanded",
     )
-
     st.header("Patent Chat: Google Patents Chat Demo")

-    # Input for Google Patent Link
     patent_link = st.text_input("Enter Google Patent Link:", key="PATENT_LINK")

     if not patent_link:
@@ -100,64 +135,48 @@ if __name__ == "__main__":
         st.error("Invalid patent link format. Please provide a valid Google patent link.")
         st.stop()

-    st.write(f"
+    st.write(f"Patent number: **{patent_number}**")

+    pdf_path = os.path.join(tempfile.gettempdir(), f"{patent_number}.pdf")
+    if os.path.isfile(pdf_path):
+        st.write("✅ File already downloaded.")
+    else:
+        st.write("Downloading patent file...")
         pdf_path = download_pdf(patent_number)
-        st.error("Automatic download failed. Please upload the PDF manually below.")
-
-    if not pdf_path:
-        uploaded_file = st.file_uploader("Upload the patent PDF file:", type="pdf")
-        if uploaded_file:
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
-                tmp_file.write(uploaded_file.read())
-                pdf_path = tmp_file.name
-            st.success("✅ PDF successfully uploaded.")
-        else:
-            st.stop()
-
-    # Load and Process PDF
-    st.write("Processing document...")
-    docs = load_docs(pdf_path)
-
-    if not docs:
-        st.error("No content found in the PDF. Exiting...")
-        st.stop()
+        st.write(f"✅ File downloaded: {pdf_path}")

-    chain = load_chain(docs)
+    st.write("Loading document into the system...")
+    chain = load_chain(pdf_path)
     st.success("Document successfully loaded! You can now start asking questions.")

-    # Initialize chat history
     if "messages" not in st.session_state:
         st.session_state["messages"] = [
             {"role": "assistant", "content": "Hello! How can I assist you with this patent?"}
         ]

-    # Display chat history
     for message in st.session_state.messages:
         with st.chat_message(message["role"]):
             st.markdown(message["content"])

-    # Handle User Input
     if user_input := st.chat_input("What is your question?"):
         st.session_state.messages.append({"role": "user", "content": user_input})
-
         with st.chat_message("user"):
             st.markdown(user_input)

         with st.chat_message("assistant"):
             message_placeholder = st.empty()
+            full_response = ""
+
+            with st.spinner("Generating response..."):
+                try:
+                    assistant_response = chain({"question": user_input})
+                    for chunk in assistant_response["answer"].split():
+                        full_response += chunk + " "
+                        time.sleep(0.05)
+                        message_placeholder.markdown(full_response + "▌")
+                except Exception as e:
+                    full_response = f"An error occurred: {e}"
+                finally:
+                    message_placeholder.markdown(full_response)

         st.session_state.messages.append({"role": "assistant", "content": full_response})
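
Note: a quick way to sanity-check the new extract_patent_number() helper and the cached-PDF path outside of Streamlit. This is a minimal sketch; the sample URL and patent number are made up for illustration only.

    import os
    import re
    import tempfile

    def extract_patent_number(url):
        pattern = r"/patent/([A-Z]{2}\d+)"
        match = re.search(pattern, url)
        return match.group(1) if match else None

    # Hypothetical Google Patents link used only as a test input
    url = "https://patents.google.com/patent/US9876543B2/en"
    patent_number = extract_patent_number(url)
    print(patent_number)  # "US9876543" -- the regex stops before the kind code ("B2")

    # Same cache location the app builds before deciding whether to re-download
    pdf_path = os.path.join(tempfile.gettempdir(), f"{patent_number}.pdf")
    print(pdf_path, os.path.isfile(pdf_path))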
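Note: check_poppler_installed() only verifies that the pdfinfo binary from Poppler is on PATH; it does not install anything. Assuming this Space uses the usual packages.txt mechanism for apt dependencies, that means declaring poppler-utils there, and the check the app performs can be reproduced locally with:

    import shutil

    # None means check_poppler_installed() in app.py would raise EnvironmentError
    print(shutil.which("pdfinfo"))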
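Note: the NLTK setup pins nltk.data.path to a folder under the system temp directory, so 'punkt' is fetched again whenever that folder is cleared. A minimal check that the resource landed where NLTKTextSplitter expects, under the same path assumptions as app.py:

    import os
    import tempfile
    import nltk

    nltk_data_path = os.path.join(tempfile.gettempdir(), "nltk_data")
    nltk.data.path = [nltk_data_path]
    print(os.path.isdir(os.path.join(nltk_data_path, "tokenizers", "punkt")))
    nltk.data.find("tokenizers/punkt")  # raises LookupError if the download did not succeed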