Spaces:

Oxbridge-Economics
/

Mailbox

Running

App Files Community

gavinzli committed on Mar 24

Commit

c529966

1 Parent(s): 593e9ef

Add initial binary files and update environment loading in main.py

Browse files

Files changed (9) hide show

.gitignore +2 -0
app.py +0 -1
chain/__init__.py +0 -50
collection_data.csv +0 -0
controllers/mail.py +46 -14
main.py +2 -1
models/mails/__init__.py +8 -7
requirements.txt +149 -0
token.pickle +0 -0

.gitignore CHANGED Viewed

@@ -172,3 +172,5 @@ cython_debug/
 # PyPI configuration file
 .pypirc

 # PyPI configuration file
 .pypirc
+attachments

app.py CHANGED Viewed

@@ -17,7 +17,6 @@ with st.sidebar:
         result = mail.collect()
         with st.chat_message("assistant"):
             response_content = st.markdown(result)
-            # st.session_state.messages.append({"role": "assistant", "content": result})
 if 'chat_id' not in st.session_state:
     st.session_state.chat_id = str(uuid.uuid4())

         result = mail.collect()
         with st.chat_message("assistant"):
             response_content = st.markdown(result)
 if 'chat_id' not in st.session_state:
     st.session_state.chat_id = str(uuid.uuid4())

chain/__init__.py CHANGED Viewed

@@ -6,15 +6,12 @@ from venv import logger
 from pymongo import errors
 from langchain_core.runnables.history import RunnableWithMessageHistory
-# from langchain_core.output_parsers import PydanticOutputParser
 from langchain_core.messages import BaseMessage, message_to_dict
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.chains.retrieval import create_retrieval_chain
 from langchain.prompts.chat import ChatPromptTemplate, MessagesPlaceholder
 from langchain_mongodb import MongoDBChatMessageHistory
-# from schema import FollowUpQ
 from models.llm import GPTModel
 llm = GPTModel()
@@ -107,50 +104,3 @@ class RAGChain(RunnableWithMessageHistory):
             history_messages_key="chat_history",
             output_messages_key="answer"
         )
-# class FollowUpChain():
-#     """
-#     FollowUpQChain is a class to generate follow-up questions based on contexts and initial query.
-#     Attributes:
-#         parser (PydanticOutputParser): An instance of PydanticOutputParser to parse the output.
-#         chain (Chain): A chain of prompts and models to generate follow-up questions.
-#     Methods:
-#         __init__():
-#             Initializes the FollowUpQChain with a parser and a prompt chain.
-#         invoke(contexts, query):
-#             Invokes the chain with the provided contexts and query to generate follow-up questions.
-#                 contexts (str): The contexts to be used for generating follow-up questions.
-#                 query (str): The initial query to be used for generating follow-up questions.
-#     """
-#     def __init__(self):
-#         self.parser = PydanticOutputParser(pydantic_object=FollowUpQ)
-#         prompt = ChatPromptTemplate.from_messages([
-#                     ("system", "You are a professional commentator on current events.Your task\
-#                       is to provide 3 follow-up questions based on contexts and initial query."),
-#                     ("system", "contexts: {contexts}"),
-#                     ("system", "initial query: {query}"),
-#                     ("human", "Format instructions: {format_instructions}"),
-#                     ("placeholder", "{agent_scratchpad}"),
-#                 ])
-#         self.chain = prompt | llm | self.parser
-#     def invoke(self, query, contexts):
-#         """
-#         Invokes the chain with the provided content and additional parameters.
-#         Args:
-#             content (str): The article content to be processed.
-#         Returns:
-#             The result of the chain invocation.
-#         """
-#         result = self.chain.invoke({
-#             'contexts': contexts,
-#             'format_instructions': self.parser.get_format_instructions(),
-#             'query': query
-#         })
-#         return result.questions

 from pymongo import errors
 from langchain_core.runnables.history import RunnableWithMessageHistory
 from langchain_core.messages import BaseMessage, message_to_dict
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.chains.retrieval import create_retrieval_chain
 from langchain.prompts.chat import ChatPromptTemplate, MessagesPlaceholder
 from langchain_mongodb import MongoDBChatMessageHistory
 from models.llm import GPTModel
 llm = GPTModel()
             history_messages_key="chat_history",
             output_messages_key="answer"
         )

collection_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

controllers/mail.py CHANGED Viewed

@@ -1,16 +1,25 @@
 """Module to search and list emails from Gmail."""
 import base64
 from datetime import datetime, timedelta
 import pandas as pd
 from langchain_core.documents import Document
-from venv import logger
-from models.mails import build_gmail_service
 from models.chroma import vectorstore
 SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
 EMAIL_PATTERN = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
 service = build_gmail_service()
 def search_emails(query):
@@ -28,10 +37,10 @@ def search_emails(query):
     return messages
 def list_emails(messages):
-    """List emails from the search results."""
     ids = []
     documents = []
-    for message in messages[:50]:
         msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
         metadata = {}
         for header in msg['payload']['headers']:
@@ -45,19 +54,42 @@ def list_emails(messages):
                 metadata['cc'] = header['value']
         metadata['date'] = datetime.fromtimestamp(
             int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
         if 'parts' in msg['payload']:
-            body = ''.join(
-                part['body']['data'] for part in msg['payload']['parts'] if 'data' in part['body']
-            )
-            body = base64.urlsafe_b64decode(body).decode('utf-8')
         else:
             body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
-        ids.append(msg['id'])
-        documents.append(Document(
-            page_content=body,
-            metadata=metadata
-        ))
-    return vectorstore.add_documents(documents= documents, ids = ids)
 def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
     """

 """Module to search and list emails from Gmail."""
+import os
+import re
 import base64
 from datetime import datetime, timedelta
+from venv import logger
 import pandas as pd
 from langchain_core.documents import Document
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders.image import UnstructuredImageLoader
+from langchain_community.document_loaders.csv_loader import CSVLoader
 from models.chroma import vectorstore
+from models.mails import build_gmail_service
 SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
 EMAIL_PATTERN = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
+ATTACHMENTS_DIR = "attachments"
+os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
 service = build_gmail_service()
 def search_emails(query):
     return messages
 def list_emails(messages):
+    """List emails from the search results and download attachments."""
     ids = []
     documents = []
+    for message in messages[:100]:
         msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
         metadata = {}
         for header in msg['payload']['headers']:
                 metadata['cc'] = header['value']
         metadata['date'] = datetime.fromtimestamp(
             int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
+        body = ""
         if 'parts' in msg['payload']:
+            for part in msg['payload']['parts']:
+                if part['filename']:
+                    attachment_id = part['body']['attachmentId']
+                    logger.info("Downloading attachment: %s", part['filename'])
+                    attachment = service.users().messages().attachments().get(
+                        userId='me', messageId=message['id'], id=attachment_id).execute()
+                    file_data = base64.urlsafe_b64decode(attachment['data'].encode('UTF-8'))
+                    path = os.path.join(".", ATTACHMENTS_DIR, part['filename'])
+                    with open(path, 'wb') as f:
+                        f.write(file_data)
+                    if part['filename'].endswith('.pdf'):
+                        attachment_documents = PyPDFLoader(path).load()
+                        documents = documents + attachment_documents
+                    if part['filename'].endswith('.png'):
+                        attachment_documents = UnstructuredImageLoader(path).load()
+                        documents = documents + attachment_documents
+                    if part['filename'].endswith('.csv'):
+                        attachment_documents = CSVLoader(path).load()
+                        documents = documents + attachment_documents
+            for index, document in enumerate(documents):
+                _id = f"{msg['id']}_{index}"
+                if 'source' in document.metadata:
+                    document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
+                document.metadata.update(metadata)
+                ids.append(_id)
         else:
             body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
+            body = re.sub(r'<[^>]+>', '', body)  # Remove HTML tags
+            documents.append(Document(
+                page_content=body,
+                metadata=metadata
+            ))
+            ids.append(msg['id'])
+    return vectorstore.add_documents(documents=documents, ids=ids)
 def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
     """

main.py CHANGED Viewed

@@ -1,10 +1,11 @@
 """Module to run the mail collection process."""
 from dotenv import load_dotenv
 from controllers import mail
 from chain import RAGChain
 from retriever import DocRetriever
-# load_dotenv()
 if __name__ == "__main__":
     mail.collect()

 """Module to run the mail collection process."""
 from dotenv import load_dotenv
 from controllers import mail
 from chain import RAGChain
 from retriever import DocRetriever
+load_dotenv()
 if __name__ == "__main__":
     mail.collect()

models/mails/__init__.py CHANGED Viewed

@@ -1,11 +1,13 @@
 import os.path
 import pickle
 from google.auth.transport.requests import Request
-from google.oauth2.credentials import Credentials
 from google_auth_oauthlib.flow import InstalledAppFlow
 from googleapiclient.discovery import build
 SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
@@ -46,12 +48,11 @@ def build_gmail_service():
             flow = InstalledAppFlow.from_client_config(client_config, SCOPES)
             # flow = InstalledAppFlow.from_client_secrets_file("./credentials.json", SCOPES)
             creds = flow.run_local_server(port=0)
-            print(creds.to_json(), type(creds))
-        # with open("token.pickle", "wb") as token:
-        #     pickle.dump(creds, token)
-        with open("token.json", "wb") as token:
-            token.write(creds.to_json().encode())
-        creds = Credentials.from_authorized_user_file("token.json")
     service = build("gmail", "v1", credentials=creds)
     return service

+"""Module to build and return a Gmail API service instance."""
 import os.path
 import pickle
 from google.auth.transport.requests import Request
+# from google.oauth2.credentials import Credentials
 from google_auth_oauthlib.flow import InstalledAppFlow
 from googleapiclient.discovery import build
 SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
             flow = InstalledAppFlow.from_client_config(client_config, SCOPES)
             # flow = InstalledAppFlow.from_client_secrets_file("./credentials.json", SCOPES)
             creds = flow.run_local_server(port=0)
+        with open("token.pickle", "wb") as token:
+            pickle.dump(creds, token)
+        # with open("token.json", "wb") as token:
+        #     token.write(creds.to_json().encode())
+        # creds = Credentials.from_authorized_user_file("token.json")
     service = build("gmail", "v1", credentials=creds)
     return service

requirements.txt ADDED Viewed

	@@ -0,0 +1,149 @@

+altair==5.5.0
+annotated-types==0.7.0
+anyio==4.8.0
+asgiref==3.8.1
+attrs==25.3.0
+backoff==2.2.1
+bcrypt==4.3.0
+beautifulsoup4==4.13.3
+blinker==1.9.0
+build==1.2.2.post1
+cachetools==5.5.2
+certifi==2025.1.31
+charset-normalizer==3.4.1
+chroma-hnswlib==0.7.6
+chromadb==0.6.3
+click==8.1.8
+coloredlogs==15.0.1
+Deprecated==1.2.18
+distro==1.9.0
+dnspython==2.7.0
+durationpy==0.9
+fastapi==0.115.11
+filelock==3.17.0
+flatbuffers==25.2.10
+fsspec==2025.2.0
+gitdb==4.0.12
+GitPython==3.1.44
+google-api-core==2.24.1
+google-api-python-client==2.163.0
+google-auth==2.38.0
+google-auth-httplib2==0.2.0
+google-auth-oauthlib==1.2.1
+googleapis-common-protos==1.69.0
+grpcio==1.70.0
+h11==0.14.0
+httpcore==1.0.7
+httplib2==0.22.0
+httptools==0.6.4
+httpx==0.28.1
+huggingface-hub==0.29.2
+humanfriendly==10.0
+idna==3.10
+importlib_metadata==8.5.0
+importlib_resources==6.5.2
+Jinja2==3.1.6
+jiter==0.8.2
+joblib==1.4.2
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+kubernetes==32.0.1
+langchain==0.3.20
+langchain-chroma==0.2.2
+langchain-core==0.3.41
+langchain-mongodb==0.5.0
+langchain-openai==0.3.7
+langchain-text-splitters==0.3.6
+langsmith==0.3.11
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mmh3==5.1.0
+monotonic==1.6
+mpmath==1.3.0
+narwhals==1.31.0
+networkx
+numpy==1.26.4
+oauthlib==3.2.2
+onnxruntime
+openai==1.65.4
+opentelemetry-api==1.30.0
+opentelemetry-exporter-otlp-proto-common==1.30.0
+opentelemetry-exporter-otlp-proto-grpc==1.30.0
+opentelemetry-instrumentation==0.51b0
+opentelemetry-instrumentation-asgi==0.51b0
+opentelemetry-instrumentation-fastapi==0.51b0
+opentelemetry-proto==1.30.0
+opentelemetry-sdk==1.30.0
+opentelemetry-semantic-conventions==0.51b0
+opentelemetry-util-http==0.51b0
+orjson==3.10.15
+overrides==7.7.0
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+posthog==3.18.1
+proto-plus==1.26.0
+protobuf==5.29.3
+pyarrow==19.0.1
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pydantic==2.10.6
+pydantic_core==2.27.2
+pydeck==0.9.1
+Pygments==2.19.1
+pymongo==4.11.2
+pyparsing==3.2.1
+PyPika==0.48.9
+pyproject_hooks==1.2.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2025.1
+PyYAML==6.0.2
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+rich==13.9.4
+rpds-py==0.23.1
+rsa==4.9
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy
+sentence-transformers==3.4.1
+setuptools==75.8.2
+shellingham==1.5.4
+six==1.17.0
+smmap==5.0.2
+sniffio==1.3.1
+soupsieve==2.6
+SQLAlchemy==2.0.39
+starlette==0.46.0
+streamlit==1.43.2
+sympy==1.13.1
+tenacity==9.0.0
+threadpoolctl==3.5.0
+tiktoken==0.9.0
+tokenizers==0.21.0
+toml==0.10.2
+torch==2.6.0
+tornado==6.4.2
+tqdm==4.67.1
+transformers==4.49.0
+typer==0.15.2
+typing_extensions==4.12.2
+tzdata==2025.1
+uritemplate==4.1.1
+urllib3==2.3.0
+uvicorn==0.34.0
+uvloop==0.21.0
+watchdog==6.0.0
+watchfiles==1.0.4
+websocket-client==1.8.0
+websockets==15.0.1
+wrapt==1.17.2
+zipp==3.21.0
+zstandard==0.23.0

token.pickle ADDED Viewed

Binary file (1.01 kB). View file