gavinzli committed on
Commit
0d660bd
·
1 Parent(s): b5a92b5

No code changes made; skipping commit.

Browse files
Files changed (1) hide show
  1. controllers/mail.py +130 -117
controllers/mail.py CHANGED
@@ -8,42 +8,47 @@ from ics import Calendar
8
 
9
  import pandas as pd
10
  from langchain_core.documents import Document
11
- from langchain_community.document_loaders import PyPDFLoader
12
- from langchain_community.document_loaders.image import UnstructuredImageLoader
13
- from langchain_community.document_loaders import UnstructuredExcelLoader
14
- from langchain_community.document_loaders.csv_loader import CSVLoader
 
 
15
 
16
  from models.chroma import vectorstore
17
  from models.mails import build_gmail_service
18
 
19
- SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
20
- EMAIL_PATTERN = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
21
 
22
  ATTACHMENTS_DIR = "attachments"
23
  os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
24
 
25
  service = build_gmail_service()
26
 
 
27
  def search_emails(query):
28
  """Search emails based on a query."""
29
- result = service.users().messages().list(userId='me', q=query).execute()
30
  messages = []
31
- if 'messages' in result:
32
- messages.extend(result['messages'])
33
- while 'nextPageToken' in result:
34
- page_token = result['nextPageToken']
35
- result = service.users().messages().list(
36
- userId='me', q=query, pageToken=page_token).execute()
37
- if 'messages' in result:
38
- messages.extend(result['messages'])
 
39
  return messages
40
 
 
41
  def list_emails(messages):
42
  """
43
  Processes a list of email messages, extracts metadata, decodes content, and handles attachments.
44
 
45
  Args:
46
- messages (list): A list of email message dictionaries, where each dictionary contains
47
  at least an 'id' key representing the email's unique identifier.
48
 
49
  Returns:
@@ -73,117 +78,127 @@ def list_emails(messages):
73
  ids = []
74
  documents = []
75
  for message in messages:
76
- msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
77
  metadata = {}
78
- for header in msg['payload']['headers']:
79
- if header['name'] == 'From':
80
- metadata['from'] = header['value']
81
- elif header['name'] == 'To':
82
- metadata['to'] = header['value']
83
- elif header['name'] == 'Subject':
84
- metadata['subject'] = header['value']
85
- elif header['name'] == 'Cc':
86
- metadata['cc'] = header['value']
87
- metadata['date'] = datetime.fromtimestamp(
88
- int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
89
- metadata['msg_id'] = msg['id']
90
- print(metadata, msg['payload']['mimeType'])
 
91
  ids = []
92
  documents = []
93
  mimeType = []
94
- if msg['payload']['mimeType'] in ['multipart/alternative', 'multipart/related', 'multipart/mixed']:
 
 
 
 
95
  mimeType = []
96
  attach_docs = []
97
- for part in msg['payload']['parts']:
98
- print("mimeType: ", part['mimeType'])
99
- mimeType.append(part['mimeType'])
100
- if part['mimeType'] == 'text/plain' and 'text/html' not in mimeType:
101
- body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
102
- body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
103
- metadata['mimeType'] = part['mimeType']
104
- documents.append(Document(
105
- page_content=body,
106
- metadata=metadata
107
- ))
108
- ids.append(msg['id'])
109
- elif part['mimeType'] == 'text/html' and 'text/plain' not in mimeType:
110
- body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
111
- body = re.sub(r'<[^>]+>', '', body)
112
- metadata['mimeType'] = part['mimeType']
113
- documents.append(Document(
114
- page_content=body,
115
- metadata=metadata
116
- ))
117
- ids.append(msg['id'])
118
- if part['filename']:
119
- attachment_id = part['body']['attachmentId']
120
- logger.info("Downloading attachment: %s", part['filename'])
121
- attachment = service.users().messages().attachments().get(
122
- userId='me', messageId=message['id'], id=attachment_id).execute()
123
- file_data = base64.urlsafe_b64decode(attachment['data'].encode('UTF-8'))
124
- path = os.path.join(".", ATTACHMENTS_DIR, part['filename'])
125
- with open(path, 'wb') as f:
126
  f.write(file_data)
127
- if part['mimeType'] == 'application/pdf':
128
  attach_docs = PyPDFLoader(path).load()
129
- elif part['mimeType'] == 'image/png' or part['mimeType'] == 'image/jpeg':
130
  attach_docs = UnstructuredImageLoader(path).load()
131
- elif part['filename'].endswith('.csv'):
132
  attach_docs = CSVLoader(path).load()
133
- elif part['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
 
 
 
134
  attach_docs = UnstructuredExcelLoader(path).load()
135
- elif part['mimeType'] == 'application/ics':
136
- with open(path, 'r', encoding='utf-8') as f:
137
  calendar = Calendar(f.read())
138
  for event in calendar.events:
139
- documents.append(Document(
140
- page_content = f"Event: {event.name}\nDescription: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
141
- metadata = {
142
- "attachment": part['filename'],
143
- "mimeType": part['mimeType'],
144
- "location": event.location,
145
- "created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
146
- "last_modified": event.last_modified.strftime("%d/%m/%Y %H:%M:%S"),
147
- "start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
148
- "end": event.end.strftime("%d/%m/%Y %H:%M:%S")
149
- }
150
- ))
 
 
 
 
151
  ids.append(f"{msg['id']}_{attachment_id}")
152
  if os.path.exists(path):
153
  os.remove(path)
154
  for index, document in enumerate(attach_docs or []):
155
- document.metadata['mimeType'] = part['mimeType']
156
- if 'page_label' in document.metadata:
157
- document.metadata['page'] = document.metadata['page_label']
158
- document.metadata['attachment'] = part['filename']
159
- document.metadata = {key: value for key, value in document.metadata.items() if key in ['attachment', 'page']}
 
 
 
 
160
  document.metadata.update(metadata)
161
  documents.append(document)
162
  ids.append(f"{msg['id']}_{attachment_id}_{index}")
163
- elif msg['payload']['mimeType'] == 'text/plain' and 'data' in msg['payload']['body']:
164
- body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
165
- body = re.sub(r'<[^>]+>', '', body)
166
- metadata['mimeType'] = msg['payload']['mimeType']
167
- documents.append(Document(
168
- page_content=body,
169
- metadata=metadata
170
- ))
171
- ids.append(msg['id'])
172
- elif msg['payload']['mimeType'] == 'text/html' and 'data' in msg['payload']['body']:
173
- body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
174
- body = re.sub(r'<[^>]+>', '', body)
175
- metadata['mimeType'] = msg['payload']['mimeType']
176
- documents.append(Document(
177
- page_content=body,
178
- metadata=metadata
179
- ))
180
- ids.append(msg['id'])
181
- if 'multipart/alternative' in mimeType and len(mimeType) == 1:
182
  print("Only multipart/alternative found in the email.")
183
  else:
184
  vectorstore.add_documents(documents=documents, ids=ids)
185
 
186
- def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
 
187
  """
188
  Main function to search and list emails from Gmail.
189
 
@@ -205,6 +220,7 @@ def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%
205
  else:
206
  logger.info("No emails found after two weeks ago.")
207
 
 
208
  def get_documents():
209
  """
210
  Main function to list emails from the database.
@@ -215,15 +231,14 @@ def get_documents():
215
  None
216
  """
217
  data = vectorstore.get()
218
- df = pd.DataFrame({
219
- 'ids': data['ids'],
220
- 'documents': data['documents'],
221
- 'metadatas': data['metadatas']
222
- })
223
- df.to_excel('collection_data.xlsx', index=False)
224
  df = pd.concat(
225
- [df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
226
- axis=1).to_excel('collection_data_expand.xlsx', index=False)
 
227
 
228
  def get():
229
  """
@@ -235,9 +250,7 @@ def get():
235
  None
236
  """
237
  data = vectorstore.get()
238
- df = pd.DataFrame({
239
- 'id': data['ids'],
240
- 'documents': data['documents'],
241
- 'metadatas': data['metadatas']
242
- })
243
- return df.to_dict(orient='records')
 
8
 
9
  import pandas as pd
10
  from langchain_core.documents import Document
11
+ from langchain_community.document_loaders import (
12
+ PyPDFLoader,
13
+ UnstructuredExcelLoader,
14
+ CSVLoader,
15
+ UnstructuredImageLoader,
16
+ )
17
 
18
  from models.chroma import vectorstore
19
  from models.mails import build_gmail_service
20
 
21
+ SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
22
+ EMAIL_PATTERN = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
23
 
24
  ATTACHMENTS_DIR = "attachments"
25
  os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
26
 
27
  service = build_gmail_service()
28
 
29
+
30
  def search_emails(query):
31
  """Search emails based on a query."""
32
+ result = service.users().messages().list(userId="me", q=query).execute()
33
  messages = []
34
+ if "messages" in result:
35
+ messages.extend(result["messages"])
36
+ while "nextPageToken" in result:
37
+ page_token = result["nextPageToken"]
38
+ result = (
39
+ service.users().messages().list(userId="me", q=query, pageToken=page_token).execute()
40
+ )
41
+ if "messages" in result:
42
+ messages.extend(result["messages"])
43
  return messages
44
 
45
+
46
  def list_emails(messages):
47
  """
48
  Processes a list of email messages, extracts metadata, decodes content, and handles attachments.
49
 
50
  Args:
51
+ messages (list): A list of email message dictionaries, where each dictionary contains
52
  at least an 'id' key representing the email's unique identifier.
53
 
54
  Returns:
 
78
  ids = []
79
  documents = []
80
  for message in messages:
81
+ msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
82
  metadata = {}
83
+ for header in msg["payload"]["headers"]:
84
+ if header["name"] == "From":
85
+ metadata["from"] = header["value"]
86
+ elif header["name"] == "To":
87
+ metadata["to"] = header["value"]
88
+ elif header["name"] == "Subject":
89
+ metadata["subject"] = header["value"]
90
+ elif header["name"] == "Cc":
91
+ metadata["cc"] = header["value"]
92
+ metadata["date"] = datetime.fromtimestamp(int(msg["internalDate"]) / 1000).strftime(
93
+ "%d/%m/%Y %H:%M:%S"
94
+ )
95
+ metadata["msg_id"] = msg["id"]
96
+ print(metadata, msg["payload"]["mimeType"])
97
  ids = []
98
  documents = []
99
  mimeType = []
100
+ if msg["payload"]["mimeType"] in [
101
+ "multipart/alternative",
102
+ "multipart/related",
103
+ "multipart/mixed",
104
+ ]:
105
  mimeType = []
106
  attach_docs = []
107
+ for part in msg["payload"]["parts"]:
108
+ print("mimeType: ", part["mimeType"])
109
+ mimeType.append(part["mimeType"])
110
+ if part["mimeType"] == "text/plain" and "text/html" not in mimeType:
111
+ body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
112
+ body = re.sub(r"<[^>]+>", "", body) # Remove HTML tags
113
+ metadata["mimeType"] = part["mimeType"]
114
+ documents.append(Document(page_content=body, metadata=metadata))
115
+ ids.append(msg["id"])
116
+ elif part["mimeType"] == "text/html" and "text/plain" not in mimeType:
117
+ body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
118
+ body = re.sub(r"<[^>]+>", "", body)
119
+ metadata["mimeType"] = part["mimeType"]
120
+ documents.append(Document(page_content=body, metadata=metadata))
121
+ ids.append(msg["id"])
122
+ if part["filename"]:
123
+ attachment_id = part["body"]["attachmentId"]
124
+ logger.info("Downloading attachment: %s", part["filename"])
125
+ attachment = (
126
+ service.users()
127
+ .messages()
128
+ .attachments()
129
+ .get(userId="me", messageId=message["id"], id=attachment_id)
130
+ .execute()
131
+ )
132
+ file_data = base64.urlsafe_b64decode(attachment["data"].encode("UTF-8"))
133
+ path = os.path.join(".", ATTACHMENTS_DIR, part["filename"])
134
+ with open(path, "wb") as f:
 
135
  f.write(file_data)
136
+ if part["mimeType"] == "application/pdf":
137
  attach_docs = PyPDFLoader(path).load()
138
+ elif part["mimeType"] == "image/png" or part["mimeType"] == "image/jpeg":
139
  attach_docs = UnstructuredImageLoader(path).load()
140
+ elif part["filename"].endswith(".csv"):
141
  attach_docs = CSVLoader(path).load()
142
+ elif (
143
+ part["mimeType"]
144
+ == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
145
+ ):
146
  attach_docs = UnstructuredExcelLoader(path).load()
147
+ elif part["mimeType"] == "application/ics":
148
+ with open(path, "r", encoding="utf-8") as f:
149
  calendar = Calendar(f.read())
150
  for event in calendar.events:
151
+ documents.append(
152
+ Document(
153
+ page_content=f"Event: {event.name}\nDescription: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
154
+ metadata={
155
+ "attachment": part["filename"],
156
+ "mimeType": part["mimeType"],
157
+ "location": event.location,
158
+ "created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
159
+ "last_modified": event.last_modified.strftime(
160
+ "%d/%m/%Y %H:%M:%S"
161
+ ),
162
+ "start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
163
+ "end": event.end.strftime("%d/%m/%Y %H:%M:%S"),
164
+ },
165
+ )
166
+ )
167
  ids.append(f"{msg['id']}_{attachment_id}")
168
  if os.path.exists(path):
169
  os.remove(path)
170
  for index, document in enumerate(attach_docs or []):
171
+ document.metadata["mimeType"] = part["mimeType"]
172
+ if "page_label" in document.metadata:
173
+ document.metadata["page"] = document.metadata["page_label"]
174
+ document.metadata["attachment"] = part["filename"]
175
+ document.metadata = {
176
+ key: value
177
+ for key, value in document.metadata.items()
178
+ if key in ["attachment", "page"]
179
+ }
180
  document.metadata.update(metadata)
181
  documents.append(document)
182
  ids.append(f"{msg['id']}_{attachment_id}_{index}")
183
+ elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
184
+ body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
185
+ body = re.sub(r"<[^>]+>", "", body)
186
+ metadata["mimeType"] = msg["payload"]["mimeType"]
187
+ documents.append(Document(page_content=body, metadata=metadata))
188
+ ids.append(msg["id"])
189
+ elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
190
+ body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
191
+ body = re.sub(r"<[^>]+>", "", body)
192
+ metadata["mimeType"] = msg["payload"]["mimeType"]
193
+ documents.append(Document(page_content=body, metadata=metadata))
194
+ ids.append(msg["id"])
195
+ if "multipart/alternative" in mimeType and len(mimeType) == 1:
 
 
 
 
 
 
196
  print("Only multipart/alternative found in the email.")
197
  else:
198
  vectorstore.add_documents(documents=documents, ids=ids)
199
 
200
+
201
+ def collect(query=(datetime.today() - timedelta(days=21)).strftime("after:%Y/%m/%d")):
202
  """
203
  Main function to search and list emails from Gmail.
204
 
 
220
  else:
221
  logger.info("No emails found after two weeks ago.")
222
 
223
+
224
  def get_documents():
225
  """
226
  Main function to list emails from the database.
 
231
  None
232
  """
233
  data = vectorstore.get()
234
+ df = pd.DataFrame(
235
+ {"ids": data["ids"], "documents": data["documents"], "metadatas": data["metadatas"]}
236
+ )
237
+ df.to_excel("collection_data.xlsx", index=False)
 
 
238
  df = pd.concat(
239
+ [df.drop("metadatas", axis=1), df["metadatas"].apply(pd.Series)], axis=1
240
+ ).to_excel("collection_data_expand.xlsx", index=False)
241
+
242
 
243
  def get():
244
  """
 
250
  None
251
  """
252
  data = vectorstore.get()
253
+ df = pd.DataFrame(
254
+ {"id": data["ids"], "documents": data["documents"], "metadatas": data["metadatas"]}
255
+ )
256
+ return df.to_dict(orient="records")