ai: Advanced the file extraction process
Browse files- jarvis.py +116 -25
- requirements.txt +2 -1
jarvis.py
CHANGED
@@ -16,10 +16,13 @@ import random
|
|
16 |
import requests
|
17 |
import threading
|
18 |
import uuid
|
|
|
|
|
19 |
|
20 |
from PIL import Image
|
21 |
from pathlib import Path
|
22 |
from pptx import Presentation
|
|
|
23 |
|
24 |
os.system("apt-get update -q -y && apt-get install -q -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind libleptonica-dev libtesseract-dev")
|
25 |
|
@@ -79,33 +82,121 @@ def marked_item(item, marked, attempts):
|
|
79 |
def get_model_key(display):
|
80 |
return next((k for k, v in MODEL_MAPPING.items() if v == display), DEFAULT_MODEL_KEY)
|
81 |
|
82 |
-
def
|
83 |
-
|
84 |
-
c = ""
|
85 |
try:
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
except Exception as e:
|
107 |
-
|
108 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
async def fetch_response_async(host, key, model, msgs, cfg, sid):
|
111 |
for t in [60, 80, 120, 240]:
|
|
|
16 |
import requests
|
17 |
import threading
|
18 |
import uuid
|
19 |
+
import zipfile
|
20 |
+
import io
|
21 |
|
22 |
from PIL import Image
|
23 |
from pathlib import Path
|
24 |
from pptx import Presentation
|
25 |
+
from openpyxl import load_workbook
|
26 |
|
27 |
# Runs at import time: refreshes the apt index and installs the Tesseract OCR
# engine, its English ("eng") and Indonesian ("ind") language packs, and the
# Leptonica/Tesseract development packages. The exit status returned by
# os.system is not checked, so a failed install is not reported here.
# NOTE(review): presumably these packages back the pytesseract calls used by
# the extraction helpers in this file - confirm.
os.system("apt-get update -q -y && apt-get install -q -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind libleptonica-dev libtesseract-dev")
|
28 |
|
|
|
82 |
def get_model_key(display):
    """Return the first MODEL_MAPPING key whose value equals ``display``.

    Falls back to DEFAULT_MODEL_KEY when no value in the mapping matches.
    """
    for key, label in MODEL_MAPPING.items():
        if label == display:
            return key
    return DEFAULT_MODEL_KEY
|
84 |
|
85 |
+
def extract_pdf_content(fp):
    """Extract text, OCR'd image text and table rows from a PDF.

    For every page, in order: the page's text layer, the OCR output of each
    embedded image, then every table row as tab-separated cells (cells that
    are ``None`` are dropped from the row).

    Args:
        fp: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The collected text with surrounding whitespace stripped. If anything
        raises, ``"{fp}: {error}"`` is appended to whatever was collected
        before the failure, so the caller always receives a string.
    """
    resolution = 300
    parts = []
    try:
        with pdfplumber.open(fp) as pdf:
            for page in pdf.pages:
                parts.append((page.extract_text() or "") + "\n")

                if page.images:
                    rendered = page.to_image(resolution=resolution).original
                    # Image boxes are expressed in PDF points relative to the
                    # page, while the rendered bitmap is in pixels. Convert
                    # before cropping, otherwise OCR runs on the wrong region.
                    origin_x = float(page.bbox[0])
                    origin_top = float(page.bbox[1])
                    scale_x = rendered.width / float(page.width)
                    scale_y = rendered.height / float(page.height)
                    for img in page.images:
                        left = int((float(img["x0"]) - origin_x) * scale_x)
                        upper = int((float(img["top"]) - origin_top) * scale_y)
                        right = int(round((float(img["x1"]) - origin_x) * scale_x))
                        lower = int(round((float(img["bottom"]) - origin_top) * scale_y))
                        # Keep the box inside the bitmap; images may overhang
                        # the page edge.
                        left = max(0, left)
                        upper = max(0, upper)
                        right = min(rendered.width, right)
                        lower = min(rendered.height, lower)
                        if right <= left or lower <= upper:
                            continue  # nothing visible to OCR
                        cropped = rendered.crop((left, upper, right, lower))
                        ocr_text = pytesseract.image_to_string(cropped)
                        if ocr_text.strip():
                            parts.append(ocr_text + "\n")

                for table in page.extract_tables():
                    for row in table:
                        cells = [str(cell) for cell in row if cell is not None]
                        if cells:
                            parts.append("\t".join(cells) + "\n")
    except Exception as e:
        # Best effort: keep what was extracted and report the failure inline.
        parts.append(f"{fp}: {e}")
    return "".join(parts).strip()
|
109 |
+
|
110 |
+
def extract_docx_content(fp):
    """Extract paragraph text, table rows and OCR'd image text from a .docx.

    Output order: every paragraph (one per line), every table row as
    tab-separated cell text, then the OCR output of each archive member under
    ``word/media/``. Media entries that cannot be opened or OCR'd are skipped.

    Args:
        fp: Path of the document; opened with ``docx.Document`` and again as
            a zip archive to reach the embedded media.

    Returns:
        The collected text, stripped. On any other failure ``"{fp}: {error}"``
        is appended to what was gathered so far.
    """
    chunks = []
    try:
        document = docx.Document(fp)
        for paragraph in document.paragraphs:
            chunks.append(paragraph.text + "\n")
        for tbl in document.tables:
            for tbl_row in tbl.rows:
                row_text = "\t".join([c.text for c in tbl_row.cells])
                chunks.append(row_text + "\n")
        with zipfile.ZipFile(fp) as archive:
            for member in archive.namelist():
                if not member.startswith("word/media/"):
                    continue
                raw = archive.read(member)
                try:
                    picture = Image.open(io.BytesIO(raw))
                    recognized = pytesseract.image_to_string(picture)
                    if recognized.strip():
                        chunks.append(recognized + "\n")
                except Exception:
                    # Non-image or unreadable media: deliberately ignored.
                    pass
    except Exception as err:
        chunks.append(f"{fp}: {err}")
    return "".join(chunks).strip()
|
134 |
+
|
135 |
+
def extract_excel_content(fp):
    """Extract sheet data and OCR'd image text from an Excel workbook.

    Every sheet is emitted as a ``Sheet: <name>`` header followed by its
    contents as CSV. For OOXML workbooks the images embedded in each
    worksheet are then OCR'd and any recognised text is appended.

    Args:
        fp: Path of the workbook, read with ``pandas.read_excel`` and, unless
            it is a legacy ``.xls`` file, with ``openpyxl.load_workbook``.

    Returns:
        The collected text, stripped. If anything outside the per-image OCR
        raises, ``"{fp}: {error}"`` is appended to what was gathered so far.
    """
    parts = []
    try:
        sheets = pd.read_excel(fp, sheet_name=None)
        for name, df in sheets.items():
            parts.append(f"Sheet: {name}\n")
            parts.append(df.to_csv(index=False) + "\n")

        # openpyxl cannot open legacy .xls files; attempting it would only
        # append a spurious error after the sheet data read above.
        if Path(fp).suffix.lower() != ".xls":
            wb = load_workbook(fp, data_only=True)
            # Images are stored per worksheet, not on the workbook object.
            for ws in wb.worksheets:
                for image in getattr(ws, "_images", []):
                    try:
                        ref = image.ref
                        # A loaded image keeps its payload as a file-like
                        # object; raw bytes are wrapped so both forms work.
                        if isinstance(ref, (bytes, bytearray)):
                            source = io.BytesIO(ref)
                        else:
                            source = ref
                        if hasattr(source, "seek"):
                            source.seek(0)
                        pil_img = Image.open(source)
                        ocr_text = pytesseract.image_to_string(pil_img)
                        if ocr_text.strip():
                            parts.append(ocr_text + "\n")
                    except Exception:
                        # Best effort: an unreadable image must not discard
                        # the sheet data already extracted.
                        pass
    except Exception as e:
        parts.append(f"{fp}: {e}")
    return "".join(parts).strip()
|
157 |
+
|
158 |
+
def extract_pptx_content(fp):
    """Extract shape text, OCR'd picture text and table rows from a .pptx.

    Each slide is processed in two passes over its shapes: first the text of
    every shape plus OCR output for shapes whose ``shape_type`` equals 13 and
    that carry an image, then the rows of every table shape as tab-separated
    cell text. Pictures that cannot be opened or OCR'd are skipped.

    Args:
        fp: Path of the presentation, passed to ``Presentation``.

    Returns:
        The collected text, stripped. On any other failure ``"{fp}: {error}"``
        is appended to what was gathered so far.
    """
    pieces = []
    try:
        deck = Presentation(fp)
        for slide in deck.slides:
            # Pass 1: free text and pictures.
            for shp in slide.shapes:
                if hasattr(shp, "text") and shp.text:
                    pieces.append(shp.text + "\n")
                # NOTE(review): 13 presumably is MSO_SHAPE_TYPE.PICTURE - confirm.
                if not (shp.shape_type == 13 and hasattr(shp, "image") and shp.image):
                    continue
                try:
                    picture = Image.open(io.BytesIO(shp.image.blob))
                    recognized = pytesseract.image_to_string(picture)
                    if recognized.strip():
                        pieces.append(recognized + "\n")
                except Exception:
                    # Unreadable picture: deliberately ignored.
                    pass
            # Pass 2: tables, emitted after all of the slide's text.
            for shp in slide.shapes:
                if not shp.has_table:
                    continue
                for tbl_row in shp.table.rows:
                    row_text = "\t".join([c.text for c in tbl_row.cells])
                    pieces.append(row_text + "\n")
    except Exception as err:
        pieces.append(f"{fp}: {err}")
    return "".join(pieces).strip()
|
184 |
+
|
185 |
+
def extract_file_content(fp):
    """Return the textual content of ``fp``, choosing an extractor by suffix.

    The lower-cased file suffix selects the PDF, Word, Excel or PowerPoint
    extractor. Any other suffix is read as UTF-8 text and stripped; if that
    read fails, ``"{fp}: {error}"`` is returned instead.
    """
    suffix = Path(fp).suffix.lower()
    handlers = {
        ".pdf": extract_pdf_content,
        ".doc": extract_docx_content,
        ".docx": extract_docx_content,
        ".xlsx": extract_excel_content,
        ".xls": extract_excel_content,
        ".ppt": extract_pptx_content,
        ".pptx": extract_pptx_content,
    }
    handler = handlers.get(suffix)
    if handler is not None:
        return handler(fp)
    try:
        return Path(fp).read_text(encoding="utf-8").strip()
    except Exception as err:
        return f"{fp}: {err}"
|
200 |
|
201 |
async def fetch_response_async(host, key, model, msgs, cfg, sid):
|
202 |
for t in [60, 80, 120, 240]:
|
requirements.txt
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
httpx
|
|
|
2 |
pandas
|
3 |
pdfplumber
|
4 |
-
|
5 |
python-docx
|
6 |
python-pptx
|
7 |
pytesseract
|
|
|
1 |
httpx
|
2 |
+
openpyxl
|
3 |
pandas
|
4 |
pdfplumber
|
5 |
+
pillow
|
6 |
python-docx
|
7 |
python-pptx
|
8 |
pytesseract
|