Spaces:
Running
Running
No code changes made; skipping commit.
Browse files- controllers/mail.py +130 -117
controllers/mail.py
CHANGED
@@ -8,42 +8,47 @@ from ics import Calendar
|
|
8 |
|
9 |
import pandas as pd
|
10 |
from langchain_core.documents import Document
|
11 |
-
from langchain_community.document_loaders import
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
15 |
|
16 |
from models.chroma import vectorstore
|
17 |
from models.mails import build_gmail_service
|
18 |
|
19 |
-
SCOPES = [
|
20 |
-
EMAIL_PATTERN = r
|
21 |
|
22 |
ATTACHMENTS_DIR = "attachments"
|
23 |
os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
|
24 |
|
25 |
service = build_gmail_service()
|
26 |
|
|
|
27 |
def search_emails(query):
|
28 |
"""Search emails based on a query."""
|
29 |
-
result = service.users().messages().list(userId=
|
30 |
messages = []
|
31 |
-
if
|
32 |
-
messages.extend(result[
|
33 |
-
while
|
34 |
-
page_token = result[
|
35 |
-
result =
|
36 |
-
userId=
|
37 |
-
|
38 |
-
|
|
|
39 |
return messages
|
40 |
|
|
|
41 |
def list_emails(messages):
|
42 |
"""
|
43 |
Processes a list of email messages, extracts metadata, decodes content, and handles attachments.
|
44 |
|
45 |
Args:
|
46 |
-
messages (list): A list of email message dictionaries, where each dictionary contains
|
47 |
at least an 'id' key representing the email's unique identifier.
|
48 |
|
49 |
Returns:
|
@@ -73,117 +78,127 @@ def list_emails(messages):
|
|
73 |
ids = []
|
74 |
documents = []
|
75 |
for message in messages:
|
76 |
-
msg = service.users().messages().get(userId=
|
77 |
metadata = {}
|
78 |
-
for header in msg[
|
79 |
-
if header[
|
80 |
-
metadata[
|
81 |
-
elif header[
|
82 |
-
metadata[
|
83 |
-
elif header[
|
84 |
-
metadata[
|
85 |
-
elif header[
|
86 |
-
metadata[
|
87 |
-
metadata[
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
91 |
ids = []
|
92 |
documents = []
|
93 |
mimeType = []
|
94 |
-
if msg[
|
|
|
|
|
|
|
|
|
95 |
mimeType = []
|
96 |
attach_docs = []
|
97 |
-
for part in msg[
|
98 |
-
print("mimeType: ", part[
|
99 |
-
mimeType.append(part[
|
100 |
-
if part[
|
101 |
-
body = base64.urlsafe_b64decode(part[
|
102 |
-
body = re.sub(r
|
103 |
-
metadata[
|
104 |
-
documents.append(Document(
|
105 |
-
|
106 |
-
|
107 |
-
))
|
108 |
-
|
109 |
-
|
110 |
-
body =
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
path
|
125 |
-
with open(path, 'wb') as f:
|
126 |
f.write(file_data)
|
127 |
-
if part[
|
128 |
attach_docs = PyPDFLoader(path).load()
|
129 |
-
elif part[
|
130 |
attach_docs = UnstructuredImageLoader(path).load()
|
131 |
-
elif part[
|
132 |
attach_docs = CSVLoader(path).load()
|
133 |
-
elif
|
|
|
|
|
|
|
134 |
attach_docs = UnstructuredExcelLoader(path).load()
|
135 |
-
elif part[
|
136 |
-
with open(path,
|
137 |
calendar = Calendar(f.read())
|
138 |
for event in calendar.events:
|
139 |
-
documents.append(
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
151 |
ids.append(f"{msg['id']}_{attachment_id}")
|
152 |
if os.path.exists(path):
|
153 |
os.remove(path)
|
154 |
for index, document in enumerate(attach_docs or []):
|
155 |
-
document.metadata[
|
156 |
-
if
|
157 |
-
document.metadata[
|
158 |
-
document.metadata[
|
159 |
-
document.metadata = {
|
|
|
|
|
|
|
|
|
160 |
document.metadata.update(metadata)
|
161 |
documents.append(document)
|
162 |
ids.append(f"{msg['id']}_{attachment_id}_{index}")
|
163 |
-
elif msg[
|
164 |
-
body = base64.urlsafe_b64decode(msg[
|
165 |
-
body = re.sub(r
|
166 |
-
metadata[
|
167 |
-
documents.append(Document(
|
168 |
-
|
169 |
-
|
170 |
-
))
|
171 |
-
|
172 |
-
|
173 |
-
body =
|
174 |
-
|
175 |
-
|
176 |
-
documents.append(Document(
|
177 |
-
page_content=body,
|
178 |
-
metadata=metadata
|
179 |
-
))
|
180 |
-
ids.append(msg['id'])
|
181 |
-
if 'multipart/alternative' in mimeType and len(mimeType) == 1:
|
182 |
print("Only multipart/alternative found in the email.")
|
183 |
else:
|
184 |
vectorstore.add_documents(documents=documents, ids=ids)
|
185 |
|
186 |
-
|
|
|
187 |
"""
|
188 |
Main function to search and list emails from Gmail.
|
189 |
|
@@ -205,6 +220,7 @@ def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%
|
|
205 |
else:
|
206 |
logger.info("No emails found after two weeks ago.")
|
207 |
|
|
|
208 |
def get_documents():
|
209 |
"""
|
210 |
Main function to list emails from the database.
|
@@ -215,15 +231,14 @@ def get_documents():
|
|
215 |
None
|
216 |
"""
|
217 |
data = vectorstore.get()
|
218 |
-
df = pd.DataFrame(
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
})
|
223 |
-
df.to_excel('collection_data.xlsx', index=False)
|
224 |
df = pd.concat(
|
225 |
-
[df.drop(
|
226 |
-
|
|
|
227 |
|
228 |
def get():
|
229 |
"""
|
@@ -235,9 +250,7 @@ def get():
|
|
235 |
None
|
236 |
"""
|
237 |
data = vectorstore.get()
|
238 |
-
df = pd.DataFrame(
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
})
|
243 |
-
return df.to_dict(orient='records')
|
|
|
8 |
|
9 |
import pandas as pd
|
10 |
from langchain_core.documents import Document
|
11 |
+
from langchain_community.document_loaders import (
|
12 |
+
PyPDFLoader,
|
13 |
+
UnstructuredExcelLoader,
|
14 |
+
CSVLoader,
|
15 |
+
UnstructuredImageLoader,
|
16 |
+
)
|
17 |
|
18 |
from models.chroma import vectorstore
|
19 |
from models.mails import build_gmail_service
|
20 |
|
21 |
+
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
|
22 |
+
EMAIL_PATTERN = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
|
23 |
|
24 |
ATTACHMENTS_DIR = "attachments"
|
25 |
os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
|
26 |
|
27 |
service = build_gmail_service()
|
28 |
|
29 |
+
|
30 |
def search_emails(query):
    """Collect every Gmail message entry that matches ``query``.

    Issues ``users().messages().list`` requests against the module-level
    ``service`` for the ``"me"`` user and keeps requesting pages for as long
    as the response carries a ``"nextPageToken"``.

    Args:
        query: Search string passed to the API as the ``q`` parameter.

    Returns:
        list: The concatenated ``"messages"`` entries of all response pages,
        in the order the pages were returned. Empty if no page contained a
        ``"messages"`` key.
    """
    found = []
    # The first request carries no pageToken; later requests add it.
    request_args = {"userId": "me", "q": query}
    while True:
        page = service.users().messages().list(**request_args).execute()
        if "messages" in page:
            found.extend(page["messages"])
        if "nextPageToken" not in page:
            return found
        request_args["pageToken"] = page["nextPageToken"]
|
44 |
|
45 |
+
|
46 |
def list_emails(messages):
|
47 |
"""
|
48 |
Processes a list of email messages, extracts metadata, decodes content, and handles attachments.
|
49 |
|
50 |
Args:
|
51 |
+
messages (list): A list of email message dictionaries, where each dictionary contains
|
52 |
at least an 'id' key representing the email's unique identifier.
|
53 |
|
54 |
Returns:
|
|
|
78 |
ids = []
|
79 |
documents = []
|
80 |
for message in messages:
|
81 |
+
msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
|
82 |
metadata = {}
|
83 |
+
for header in msg["payload"]["headers"]:
|
84 |
+
if header["name"] == "From":
|
85 |
+
metadata["from"] = header["value"]
|
86 |
+
elif header["name"] == "To":
|
87 |
+
metadata["to"] = header["value"]
|
88 |
+
elif header["name"] == "Subject":
|
89 |
+
metadata["subject"] = header["value"]
|
90 |
+
elif header["name"] == "Cc":
|
91 |
+
metadata["cc"] = header["value"]
|
92 |
+
metadata["date"] = datetime.fromtimestamp(int(msg["internalDate"]) / 1000).strftime(
|
93 |
+
"%d/%m/%Y %H:%M:%S"
|
94 |
+
)
|
95 |
+
metadata["msg_id"] = msg["id"]
|
96 |
+
print(metadata, msg["payload"]["mimeType"])
|
97 |
ids = []
|
98 |
documents = []
|
99 |
mimeType = []
|
100 |
+
if msg["payload"]["mimeType"] in [
|
101 |
+
"multipart/alternative",
|
102 |
+
"multipart/related",
|
103 |
+
"multipart/mixed",
|
104 |
+
]:
|
105 |
mimeType = []
|
106 |
attach_docs = []
|
107 |
+
for part in msg["payload"]["parts"]:
|
108 |
+
print("mimeType: ", part["mimeType"])
|
109 |
+
mimeType.append(part["mimeType"])
|
110 |
+
if part["mimeType"] == "text/plain" and "text/html" not in mimeType:
|
111 |
+
body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
|
112 |
+
body = re.sub(r"<[^>]+>", "", body) # Remove HTML tags
|
113 |
+
metadata["mimeType"] = part["mimeType"]
|
114 |
+
documents.append(Document(page_content=body, metadata=metadata))
|
115 |
+
ids.append(msg["id"])
|
116 |
+
elif part["mimeType"] == "text/html" and "text/plain" not in mimeType:
|
117 |
+
body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
|
118 |
+
body = re.sub(r"<[^>]+>", "", body)
|
119 |
+
metadata["mimeType"] = part["mimeType"]
|
120 |
+
documents.append(Document(page_content=body, metadata=metadata))
|
121 |
+
ids.append(msg["id"])
|
122 |
+
if part["filename"]:
|
123 |
+
attachment_id = part["body"]["attachmentId"]
|
124 |
+
logger.info("Downloading attachment: %s", part["filename"])
|
125 |
+
attachment = (
|
126 |
+
service.users()
|
127 |
+
.messages()
|
128 |
+
.attachments()
|
129 |
+
.get(userId="me", messageId=message["id"], id=attachment_id)
|
130 |
+
.execute()
|
131 |
+
)
|
132 |
+
file_data = base64.urlsafe_b64decode(attachment["data"].encode("UTF-8"))
|
133 |
+
path = os.path.join(".", ATTACHMENTS_DIR, part["filename"])
|
134 |
+
with open(path, "wb") as f:
|
|
|
135 |
f.write(file_data)
|
136 |
+
if part["mimeType"] == "application/pdf":
|
137 |
attach_docs = PyPDFLoader(path).load()
|
138 |
+
elif part["mimeType"] == "image/png" or part["mimeType"] == "image/jpeg":
|
139 |
attach_docs = UnstructuredImageLoader(path).load()
|
140 |
+
elif part["filename"].endswith(".csv"):
|
141 |
attach_docs = CSVLoader(path).load()
|
142 |
+
elif (
|
143 |
+
part["mimeType"]
|
144 |
+
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
145 |
+
):
|
146 |
attach_docs = UnstructuredExcelLoader(path).load()
|
147 |
+
elif part["mimeType"] == "application/ics":
|
148 |
+
with open(path, "r", encoding="utf-8") as f:
|
149 |
calendar = Calendar(f.read())
|
150 |
for event in calendar.events:
|
151 |
+
documents.append(
|
152 |
+
Document(
|
153 |
+
page_content=f"Event: {event.name}\nDescription: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
|
154 |
+
metadata={
|
155 |
+
"attachment": part["filename"],
|
156 |
+
"mimeType": part["mimeType"],
|
157 |
+
"location": event.location,
|
158 |
+
"created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
|
159 |
+
"last_modified": event.last_modified.strftime(
|
160 |
+
"%d/%m/%Y %H:%M:%S"
|
161 |
+
),
|
162 |
+
"start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
|
163 |
+
"end": event.end.strftime("%d/%m/%Y %H:%M:%S"),
|
164 |
+
},
|
165 |
+
)
|
166 |
+
)
|
167 |
ids.append(f"{msg['id']}_{attachment_id}")
|
168 |
if os.path.exists(path):
|
169 |
os.remove(path)
|
170 |
for index, document in enumerate(attach_docs or []):
|
171 |
+
document.metadata["mimeType"] = part["mimeType"]
|
172 |
+
if "page_label" in document.metadata:
|
173 |
+
document.metadata["page"] = document.metadata["page_label"]
|
174 |
+
document.metadata["attachment"] = part["filename"]
|
175 |
+
document.metadata = {
|
176 |
+
key: value
|
177 |
+
for key, value in document.metadata.items()
|
178 |
+
if key in ["attachment", "page"]
|
179 |
+
}
|
180 |
document.metadata.update(metadata)
|
181 |
documents.append(document)
|
182 |
ids.append(f"{msg['id']}_{attachment_id}_{index}")
|
183 |
+
elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
|
184 |
+
body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
|
185 |
+
body = re.sub(r"<[^>]+>", "", body)
|
186 |
+
metadata["mimeType"] = msg["payload"]["mimeType"]
|
187 |
+
documents.append(Document(page_content=body, metadata=metadata))
|
188 |
+
ids.append(msg["id"])
|
189 |
+
elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
|
190 |
+
body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
|
191 |
+
body = re.sub(r"<[^>]+>", "", body)
|
192 |
+
metadata["mimeType"] = msg["payload"]["mimeType"]
|
193 |
+
documents.append(Document(page_content=body, metadata=metadata))
|
194 |
+
ids.append(msg["id"])
|
195 |
+
if "multipart/alternative" in mimeType and len(mimeType) == 1:
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
print("Only multipart/alternative found in the email.")
|
197 |
else:
|
198 |
vectorstore.add_documents(documents=documents, ids=ids)
|
199 |
|
200 |
+
|
201 |
+
def collect(query=(datetime.today() - timedelta(days=21)).strftime("after:%Y/%m/%d")):
|
202 |
"""
|
203 |
Main function to search and list emails from Gmail.
|
204 |
|
|
|
220 |
else:
|
221 |
logger.info("No emails found after two weeks ago.")
|
222 |
|
223 |
+
|
224 |
def get_documents():
|
225 |
"""
|
226 |
Main function to list emails from the database.
|
|
|
231 |
None
|
232 |
"""
|
233 |
data = vectorstore.get()
|
234 |
+
df = pd.DataFrame(
|
235 |
+
{"ids": data["ids"], "documents": data["documents"], "metadatas": data["metadatas"]}
|
236 |
+
)
|
237 |
+
df.to_excel("collection_data.xlsx", index=False)
|
|
|
|
|
238 |
df = pd.concat(
|
239 |
+
[df.drop("metadatas", axis=1), df["metadatas"].apply(pd.Series)], axis=1
|
240 |
+
).to_excel("collection_data_expand.xlsx", index=False)
|
241 |
+
|
242 |
|
243 |
def get():
|
244 |
"""
|
|
|
250 |
None
|
251 |
"""
|
252 |
data = vectorstore.get()
|
253 |
+
df = pd.DataFrame(
|
254 |
+
{"id": data["ids"], "documents": data["documents"], "metadatas": data["metadatas"]}
|
255 |
+
)
|
256 |
+
return df.to_dict(orient="records")
|
|
|
|