gavinzli committed on
Commit
c529966
·
1 Parent(s): 593e9ef

Add initial binary files and update environment loading in main.py

Browse files
.gitignore CHANGED
@@ -172,3 +172,5 @@ cython_debug/
172
 
173
  # PyPI configuration file
174
  .pypirc
 
 
 
172
 
173
  # PyPI configuration file
174
  .pypirc
175
+
176
+ attachments
app.py CHANGED
@@ -17,7 +17,6 @@ with st.sidebar:
17
  result = mail.collect()
18
  with st.chat_message("assistant"):
19
  response_content = st.markdown(result)
20
- # st.session_state.messages.append({"role": "assistant", "content": result})
21
 
22
  if 'chat_id' not in st.session_state:
23
  st.session_state.chat_id = str(uuid.uuid4())
 
17
  result = mail.collect()
18
  with st.chat_message("assistant"):
19
  response_content = st.markdown(result)
 
20
 
21
  if 'chat_id' not in st.session_state:
22
  st.session_state.chat_id = str(uuid.uuid4())
chain/__init__.py CHANGED
@@ -6,15 +6,12 @@ from venv import logger
6
 
7
  from pymongo import errors
8
  from langchain_core.runnables.history import RunnableWithMessageHistory
9
- # from langchain_core.output_parsers import PydanticOutputParser
10
  from langchain_core.messages import BaseMessage, message_to_dict
11
  from langchain.chains.combine_documents import create_stuff_documents_chain
12
  from langchain.chains.retrieval import create_retrieval_chain
13
  from langchain.prompts.chat import ChatPromptTemplate, MessagesPlaceholder
14
  from langchain_mongodb import MongoDBChatMessageHistory
15
 
16
-
17
- # from schema import FollowUpQ
18
  from models.llm import GPTModel
19
 
20
  llm = GPTModel()
@@ -107,50 +104,3 @@ class RAGChain(RunnableWithMessageHistory):
107
  history_messages_key="chat_history",
108
  output_messages_key="answer"
109
  )
110
-
111
- # class FollowUpChain():
112
- # """
113
- # FollowUpQChain is a class to generate follow-up questions based on contexts and initial query.
114
-
115
- # Attributes:
116
- # parser (PydanticOutputParser): An instance of PydanticOutputParser to parse the output.
117
- # chain (Chain): A chain of prompts and models to generate follow-up questions.
118
-
119
- # Methods:
120
- # __init__():
121
- # Initializes the FollowUpQChain with a parser and a prompt chain.
122
-
123
- # invoke(contexts, query):
124
- # Invokes the chain with the provided contexts and query to generate follow-up questions.
125
-
126
- # contexts (str): The contexts to be used for generating follow-up questions.
127
- # query (str): The initial query to be used for generating follow-up questions.
128
- # """
129
- # def __init__(self):
130
- # self.parser = PydanticOutputParser(pydantic_object=FollowUpQ)
131
- # prompt = ChatPromptTemplate.from_messages([
132
- # ("system", "You are a professional commentator on current events.Your task\
133
- # is to provide 3 follow-up questions based on contexts and initial query."),
134
- # ("system", "contexts: {contexts}"),
135
- # ("system", "initial query: {query}"),
136
- # ("human", "Format instructions: {format_instructions}"),
137
- # ("placeholder", "{agent_scratchpad}"),
138
- # ])
139
- # self.chain = prompt | llm | self.parser
140
-
141
- # def invoke(self, query, contexts):
142
- # """
143
- # Invokes the chain with the provided content and additional parameters.
144
-
145
- # Args:
146
- # content (str): The article content to be processed.
147
-
148
- # Returns:
149
- # The result of the chain invocation.
150
- # """
151
- # result = self.chain.invoke({
152
- # 'contexts': contexts,
153
- # 'format_instructions': self.parser.get_format_instructions(),
154
- # 'query': query
155
- # })
156
- # return result.questions
 
6
 
7
  from pymongo import errors
8
  from langchain_core.runnables.history import RunnableWithMessageHistory
 
9
  from langchain_core.messages import BaseMessage, message_to_dict
10
  from langchain.chains.combine_documents import create_stuff_documents_chain
11
  from langchain.chains.retrieval import create_retrieval_chain
12
  from langchain.prompts.chat import ChatPromptTemplate, MessagesPlaceholder
13
  from langchain_mongodb import MongoDBChatMessageHistory
14
 
 
 
15
  from models.llm import GPTModel
16
 
17
  llm = GPTModel()
 
104
  history_messages_key="chat_history",
105
  output_messages_key="answer"
106
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
collection_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
controllers/mail.py CHANGED
@@ -1,16 +1,25 @@
1
  """Module to search and list emails from Gmail."""
 
 
2
  import base64
3
  from datetime import datetime, timedelta
 
 
4
  import pandas as pd
5
  from langchain_core.documents import Document
 
 
 
6
 
7
- from venv import logger
8
- from models.mails import build_gmail_service
9
  from models.chroma import vectorstore
 
10
 
11
  SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
12
  EMAIL_PATTERN = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
13
 
 
 
 
14
  service = build_gmail_service()
15
 
16
  def search_emails(query):
@@ -28,10 +37,10 @@ def search_emails(query):
28
  return messages
29
 
30
  def list_emails(messages):
31
- """List emails from the search results."""
32
  ids = []
33
  documents = []
34
- for message in messages[:50]:
35
  msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
36
  metadata = {}
37
  for header in msg['payload']['headers']:
@@ -45,19 +54,42 @@ def list_emails(messages):
45
  metadata['cc'] = header['value']
46
  metadata['date'] = datetime.fromtimestamp(
47
  int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
 
48
  if 'parts' in msg['payload']:
49
- body = ''.join(
50
- part['body']['data'] for part in msg['payload']['parts'] if 'data' in part['body']
51
- )
52
- body = base64.urlsafe_b64decode(body).decode('utf-8')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  else:
54
  body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
55
- ids.append(msg['id'])
56
- documents.append(Document(
57
- page_content=body,
58
- metadata=metadata
59
- ))
60
- return vectorstore.add_documents(documents= documents, ids = ids)
 
61
 
62
  def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
63
  """
 
1
  """Module to search and list emails from Gmail."""
2
+ import os
3
+ import re
4
  import base64
5
  from datetime import datetime, timedelta
6
+ from venv import logger
7
+
8
  import pandas as pd
9
  from langchain_core.documents import Document
10
+ from langchain_community.document_loaders import PyPDFLoader
11
+ from langchain_community.document_loaders.image import UnstructuredImageLoader
12
+ from langchain_community.document_loaders.csv_loader import CSVLoader
13
 
 
 
14
  from models.chroma import vectorstore
15
+ from models.mails import build_gmail_service
16
 
17
  SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
18
  EMAIL_PATTERN = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
19
 
20
+ ATTACHMENTS_DIR = "attachments"
21
+ os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
22
+
23
  service = build_gmail_service()
24
 
25
  def search_emails(query):
 
37
  return messages
38
 
39
  def list_emails(messages):
40
+ """List emails from the search results and download attachments."""
41
  ids = []
42
  documents = []
43
+ for message in messages[:100]:
44
  msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
45
  metadata = {}
46
  for header in msg['payload']['headers']:
 
54
  metadata['cc'] = header['value']
55
  metadata['date'] = datetime.fromtimestamp(
56
  int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
57
+ body = ""
58
  if 'parts' in msg['payload']:
59
+ for part in msg['payload']['parts']:
60
+ if part['filename']:
61
+ attachment_id = part['body']['attachmentId']
62
+ logger.info("Downloading attachment: %s", part['filename'])
63
+ attachment = service.users().messages().attachments().get(
64
+ userId='me', messageId=message['id'], id=attachment_id).execute()
65
+ file_data = base64.urlsafe_b64decode(attachment['data'].encode('UTF-8'))
66
+ path = os.path.join(".", ATTACHMENTS_DIR, part['filename'])
67
+ with open(path, 'wb') as f:
68
+ f.write(file_data)
69
+ if part['filename'].endswith('.pdf'):
70
+ attachment_documents = PyPDFLoader(path).load()
71
+ documents = documents + attachment_documents
72
+ if part['filename'].endswith('.png'):
73
+ attachment_documents = UnstructuredImageLoader(path).load()
74
+ documents = documents + attachment_documents
75
+ if part['filename'].endswith('.csv'):
76
+ attachment_documents = CSVLoader(path).load()
77
+ documents = documents + attachment_documents
78
+ for index, document in enumerate(documents):
79
+ _id = f"{msg['id']}_{index}"
80
+ if 'source' in document.metadata:
81
+ document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
82
+ document.metadata.update(metadata)
83
+ ids.append(_id)
84
  else:
85
  body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
86
+ body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
87
+ documents.append(Document(
88
+ page_content=body,
89
+ metadata=metadata
90
+ ))
91
+ ids.append(msg['id'])
92
+ return vectorstore.add_documents(documents=documents, ids=ids)
93
 
94
  def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
95
  """
main.py CHANGED
@@ -1,10 +1,11 @@
1
  """Module to run the mail collection process."""
2
  from dotenv import load_dotenv
 
3
  from controllers import mail
4
  from chain import RAGChain
5
  from retriever import DocRetriever
6
 
7
- # load_dotenv()
8
 
9
  if __name__ == "__main__":
10
  mail.collect()
 
1
  """Module to run the mail collection process."""
2
  from dotenv import load_dotenv
3
+
4
  from controllers import mail
5
  from chain import RAGChain
6
  from retriever import DocRetriever
7
 
8
+ load_dotenv()
9
 
10
  if __name__ == "__main__":
11
  mail.collect()
models/mails/__init__.py CHANGED
@@ -1,11 +1,13 @@
 
1
  import os.path
2
  import pickle
3
 
4
  from google.auth.transport.requests import Request
5
- from google.oauth2.credentials import Credentials
6
  from google_auth_oauthlib.flow import InstalledAppFlow
7
  from googleapiclient.discovery import build
8
 
 
9
  SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
10
 
11
 
@@ -46,12 +48,11 @@ def build_gmail_service():
46
  flow = InstalledAppFlow.from_client_config(client_config, SCOPES)
47
  # flow = InstalledAppFlow.from_client_secrets_file("./credentials.json", SCOPES)
48
  creds = flow.run_local_server(port=0)
49
- print(creds.to_json(), type(creds))
50
 
51
- # with open("token.pickle", "wb") as token:
52
- # pickle.dump(creds, token)
53
- with open("token.json", "wb") as token:
54
- token.write(creds.to_json().encode())
55
- creds = Credentials.from_authorized_user_file("token.json")
56
  service = build("gmail", "v1", credentials=creds)
57
  return service
 
1
+ """Module to build and return a Gmail API service instance."""
2
  import os.path
3
  import pickle
4
 
5
  from google.auth.transport.requests import Request
6
+ # from google.oauth2.credentials import Credentials
7
  from google_auth_oauthlib.flow import InstalledAppFlow
8
  from googleapiclient.discovery import build
9
 
10
+
11
  SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
12
 
13
 
 
48
  flow = InstalledAppFlow.from_client_config(client_config, SCOPES)
49
  # flow = InstalledAppFlow.from_client_secrets_file("./credentials.json", SCOPES)
50
  creds = flow.run_local_server(port=0)
 
51
 
52
+ with open("token.pickle", "wb") as token:
53
+ pickle.dump(creds, token)
54
+ # with open("token.json", "wb") as token:
55
+ # token.write(creds.to_json().encode())
56
+ # creds = Credentials.from_authorized_user_file("token.json")
57
  service = build("gmail", "v1", credentials=creds)
58
  return service
requirements.txt ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.5.0
2
+ annotated-types==0.7.0
3
+ anyio==4.8.0
4
+ asgiref==3.8.1
5
+ attrs==25.3.0
6
+ backoff==2.2.1
7
+ bcrypt==4.3.0
8
+ beautifulsoup4==4.13.3
9
+ blinker==1.9.0
10
+ build==1.2.2.post1
11
+ cachetools==5.5.2
12
+ certifi==2025.1.31
13
+ charset-normalizer==3.4.1
14
+ chroma-hnswlib==0.7.6
15
+ chromadb==0.6.3
16
+ click==8.1.8
17
+ coloredlogs==15.0.1
18
+ Deprecated==1.2.18
19
+ distro==1.9.0
20
+ dnspython==2.7.0
21
+ durationpy==0.9
22
+ fastapi==0.115.11
23
+ filelock==3.17.0
24
+ flatbuffers==25.2.10
25
+ fsspec==2025.2.0
26
+ gitdb==4.0.12
27
+ GitPython==3.1.44
28
+ google-api-core==2.24.1
29
+ google-api-python-client==2.163.0
30
+ google-auth==2.38.0
31
+ google-auth-httplib2==0.2.0
32
+ google-auth-oauthlib==1.2.1
33
+ googleapis-common-protos==1.69.0
34
+ grpcio==1.70.0
35
+ h11==0.14.0
36
+ httpcore==1.0.7
37
+ httplib2==0.22.0
38
+ httptools==0.6.4
39
+ httpx==0.28.1
40
+ huggingface-hub==0.29.2
41
+ humanfriendly==10.0
42
+ idna==3.10
43
+ importlib_metadata==8.5.0
44
+ importlib_resources==6.5.2
45
+ Jinja2==3.1.6
46
+ jiter==0.8.2
47
+ joblib==1.4.2
48
+ jsonpatch==1.33
49
+ jsonpointer==3.0.0
50
+ jsonschema==4.23.0
51
+ jsonschema-specifications==2024.10.1
52
+ kubernetes==32.0.1
53
+ langchain==0.3.20
54
+ langchain-chroma==0.2.2
55
+ langchain-core==0.3.41
56
+ langchain-mongodb==0.5.0
57
+ langchain-openai==0.3.7
58
+ langchain-text-splitters==0.3.6
59
+ langsmith==0.3.11
60
+ markdown-it-py==3.0.0
61
+ MarkupSafe==3.0.2
62
+ mdurl==0.1.2
63
+ mmh3==5.1.0
64
+ monotonic==1.6
65
+ mpmath==1.3.0
66
+ narwhals==1.31.0
67
+ networkx
68
+ numpy==1.26.4
69
+ oauthlib==3.2.2
70
+ onnxruntime
71
+ openai==1.65.4
72
+ opentelemetry-api==1.30.0
73
+ opentelemetry-exporter-otlp-proto-common==1.30.0
74
+ opentelemetry-exporter-otlp-proto-grpc==1.30.0
75
+ opentelemetry-instrumentation==0.51b0
76
+ opentelemetry-instrumentation-asgi==0.51b0
77
+ opentelemetry-instrumentation-fastapi==0.51b0
78
+ opentelemetry-proto==1.30.0
79
+ opentelemetry-sdk==1.30.0
80
+ opentelemetry-semantic-conventions==0.51b0
81
+ opentelemetry-util-http==0.51b0
82
+ orjson==3.10.15
83
+ overrides==7.7.0
84
+ packaging==24.2
85
+ pandas==2.2.3
86
+ pillow==11.1.0
87
+ posthog==3.18.1
88
+ proto-plus==1.26.0
89
+ protobuf==5.29.3
90
+ pyarrow==19.0.1
91
+ pyasn1==0.6.1
92
+ pyasn1_modules==0.4.1
93
+ pydantic==2.10.6
94
+ pydantic_core==2.27.2
95
+ pydeck==0.9.1
96
+ Pygments==2.19.1
97
+ pymongo==4.11.2
98
+ pyparsing==3.2.1
99
+ PyPika==0.48.9
100
+ pyproject_hooks==1.2.0
101
+ python-dateutil==2.9.0.post0
102
+ python-dotenv==1.0.1
103
+ pytz==2025.1
104
+ PyYAML==6.0.2
105
+ referencing==0.36.2
106
+ regex==2024.11.6
107
+ requests==2.32.3
108
+ requests-oauthlib==2.0.0
109
+ requests-toolbelt==1.0.0
110
+ rich==13.9.4
111
+ rpds-py==0.23.1
112
+ rsa==4.9
113
+ safetensors==0.5.3
114
+ scikit-learn==1.6.1
115
+ scipy
116
+ sentence-transformers==3.4.1
117
+ setuptools==75.8.2
118
+ shellingham==1.5.4
119
+ six==1.17.0
120
+ smmap==5.0.2
121
+ sniffio==1.3.1
122
+ soupsieve==2.6
123
+ SQLAlchemy==2.0.39
124
+ starlette==0.46.0
125
+ streamlit==1.43.2
126
+ sympy==1.13.1
127
+ tenacity==9.0.0
128
+ threadpoolctl==3.5.0
129
+ tiktoken==0.9.0
130
+ tokenizers==0.21.0
131
+ toml==0.10.2
132
+ torch==2.6.0
133
+ tornado==6.4.2
134
+ tqdm==4.67.1
135
+ transformers==4.49.0
136
+ typer==0.15.2
137
+ typing_extensions==4.12.2
138
+ tzdata==2025.1
139
+ uritemplate==4.1.1
140
+ urllib3==2.3.0
141
+ uvicorn==0.34.0
142
+ uvloop==0.21.0
143
+ watchdog==6.0.0
144
+ watchfiles==1.0.4
145
+ websocket-client==1.8.0
146
+ websockets==15.0.1
147
+ wrapt==1.17.2
148
+ zipp==3.21.0
149
+ zstandard==0.23.0
token.pickle ADDED
Binary file (1.01 kB). View file