hadadrjt committed on
Commit
fee4a44
·
1 Parent(s): 933e48c

ai: Advanced the file extraction process

Browse files
Files changed (2) hide show
  1. jarvis.py +116 -25
  2. requirements.txt +2 -1
jarvis.py CHANGED
@@ -16,10 +16,13 @@ import random
16
  import requests
17
  import threading
18
  import uuid
 
 
19
 
20
  from PIL import Image
21
  from pathlib import Path
22
  from pptx import Presentation
 
23
 
24
  os.system("apt-get update -q -y && apt-get install -q -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind libleptonica-dev libtesseract-dev")
25
 
@@ -79,33 +82,121 @@ def marked_item(item, marked, attempts):
79
  def get_model_key(display):
80
  return next((k for k, v in MODEL_MAPPING.items() if v == display), DEFAULT_MODEL_KEY)
81
 
82
def extract_file_content(fp):
    """Return the text content of the file at ``fp``, chosen by extension.

    PDF pages, Word paragraphs and PowerPoint shape texts are each emitted
    one per line; an Excel file is rendered as CSV via pandas; any other
    extension is read as UTF-8 text.

    Returns:
        The extracted text with surrounding whitespace stripped. If any
        exception is raised, the result is "<fp>: <error>" and whatever had
        been extracted so far is discarded.
    """
    suffix = Path(fp).suffix.lower()
    try:
        if suffix == ".pdf":
            with pdfplumber.open(fp) as pdf:
                text = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )
        elif suffix in (".doc", ".docx"):
            document = docx.Document(fp)
            text = "".join(
                paragraph.text + "\n" for paragraph in document.paragraphs
            )
        elif suffix in (".xlsx", ".xls"):
            text = pd.read_excel(fp).to_csv(index=False)
        elif suffix in (".ppt", ".pptx"):
            deck = Presentation(fp)
            lines = []
            for slide in deck.slides:
                for shape in slide.shapes:
                    # Only shapes that expose non-empty text contribute.
                    if hasattr(shape, "text") and shape.text:
                        lines.append(shape.text + "\n")
            text = "".join(lines)
        else:
            text = Path(fp).read_text(encoding="utf-8")
    except Exception as err:
        # The error message replaces, rather than extends, the result.
        text = f"{fp}: {err}"
    return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  async def fetch_response_async(host, key, model, msgs, cfg, sid):
111
  for t in [60, 80, 120, 240]:
 
16
  import requests
17
  import threading
18
  import uuid
19
+ import zipfile
20
+ import io
21
 
22
  from PIL import Image
23
  from pathlib import Path
24
  from pptx import Presentation
25
+ from openpyxl import load_workbook
26
 
27
  os.system("apt-get update -q -y && apt-get install -q -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind libleptonica-dev libtesseract-dev")
28
 
 
82
  def get_model_key(display):
83
  return next((k for k, v in MODEL_MAPPING.items() if v == display), DEFAULT_MODEL_KEY)
84
 
85
def extract_pdf_content(fp):
    """Extract text, OCR'd image text and table rows from a PDF.

    For every page, in order, the output contains the page's text layer,
    the OCR text of each embedded image region (when non-blank), and each
    table row as tab-separated cells (``None`` cells are dropped).

    Args:
        fp: Path of the PDF file.

    Returns:
        The collected text, stripped. If an exception is raised at any
        point, "<fp>: <error>" is appended to whatever was collected so far.
    """
    parts = []
    try:
        with pdfplumber.open(fp) as pdf:
            for page in pdf.pages:
                parts.append((page.extract_text() or "") + "\n")
                if page.images:
                    rendered = page.to_image(resolution=300).original
                    # Image boxes are reported in PDF points, while the
                    # rendered bitmap is in pixels; cropping with the raw
                    # box would OCR the wrong region, so convert first.
                    x_scale = rendered.width / float(page.width)
                    y_scale = rendered.height / float(page.height)
                    # NOTE(review): assumes the render covers page.bbox,
                    # so its origin is subtracted — confirm for cropped pages.
                    origin_x = float(page.bbox[0])
                    origin_top = float(page.bbox[1])
                    for img in page.images:
                        bbox = (
                            int(round((float(img["x0"]) - origin_x) * x_scale)),
                            int(round((float(img["top"]) - origin_top) * y_scale)),
                            int(round((float(img["x1"]) - origin_x) * x_scale)),
                            int(round((float(img["bottom"]) - origin_top) * y_scale)),
                        )
                        cropped = rendered.crop(bbox)
                        ocr_text = pytesseract.image_to_string(cropped)
                        if ocr_text.strip():
                            parts.append(ocr_text + "\n")
                for table in page.extract_tables():
                    for row in table:
                        cells = [str(cell) for cell in row if cell is not None]
                        if cells:
                            parts.append("\t".join(cells) + "\n")
    except Exception as e:
        # Best-effort: keep what was extracted and record the failure.
        parts.append(f"{fp}: {e}")
    return "".join(parts).strip()
109
+
110
def extract_docx_content(fp):
    """Extract paragraphs, table rows and OCR'd image text from a .docx file.

    Output order: every paragraph (one per line), every table row as
    tab-separated cell texts, then the OCR text of each archive member under
    ``word/media/`` that yields non-blank text.

    Returns:
        The collected text, stripped. If an exception escapes, "<fp>: <error>"
        is appended to whatever was collected so far.
    """
    parts = []
    try:
        document = docx.Document(fp)
        for paragraph in document.paragraphs:
            parts.append(paragraph.text + "\n")
        for tbl in document.tables:
            for tbl_row in tbl.rows:
                row_texts = [cell.text for cell in tbl_row.cells]
                parts.append("\t".join(row_texts) + "\n")
        # The file is opened a second time as a zip archive to reach the
        # embedded media members directly.
        with zipfile.ZipFile(fp) as archive:
            for member in archive.namelist():
                if not member.startswith("word/media/"):
                    continue
                raw = archive.read(member)
                try:
                    picture = Image.open(io.BytesIO(raw))
                    recognised = pytesseract.image_to_string(picture)
                    if recognised.strip():
                        parts.append(recognised + "\n")
                except Exception:
                    # Members that cannot be decoded or OCR'd are skipped.
                    continue
    except Exception as err:
        parts.append(f"{fp}: {err}")
    return "".join(parts).strip()
134
+
135
def extract_excel_content(fp):
    """Extract every sheet as CSV, plus OCR'd text of embedded images.

    Each sheet is emitted as a "Sheet: <name>" line followed by its CSV
    rendering. Embedded images are then read from the ``xl/media/`` members
    of the workbook archive and OCR'd; non-blank results are appended.

    Images are read from the archive rather than through
    ``Workbook._images``: openpyxl stores images per worksheet, so the
    workbook-level attribute lookup raised and appended a spurious error to
    every otherwise successful extraction.

    Args:
        fp: Path of the Excel file.

    Returns:
        The collected text, stripped. If an exception escapes, "<fp>: <error>"
        is appended to whatever was collected so far.
    """
    parts = []
    try:
        sheets = pd.read_excel(fp, sheet_name=None)
        for name, df in sheets.items():
            parts.append(f"Sheet: {name}\n")
            parts.append(df.to_csv(index=False) + "\n")
        # Only zip-based workbooks carry media members; legacy .xls files
        # are not zip archives and are skipped here instead of failing.
        if zipfile.is_zipfile(fp):
            with zipfile.ZipFile(fp) as archive:
                for member in archive.namelist():
                    if not member.startswith("xl/media/"):
                        continue
                    data = archive.read(member)
                    try:
                        pil_img = Image.open(io.BytesIO(data))
                        ocr_text = pytesseract.image_to_string(pil_img)
                        if ocr_text.strip():
                            parts.append(ocr_text + "\n")
                    except Exception:
                        # Best-effort: members that cannot be decoded or
                        # OCR'd are skipped.
                        continue
    except Exception as e:
        parts.append(f"{fp}: {e}")
    return "".join(parts).strip()
157
+
158
def extract_pptx_content(fp):
    """Extract shape text, OCR'd picture text and table rows from a .pptx file.

    Each slide is walked twice: the first pass emits shape text and the OCR
    text of picture shapes, the second pass emits table rows as tab-separated
    cell texts, so a slide's tables follow its other content.

    Returns:
        The collected text, stripped. If an exception escapes, "<fp>: <error>"
        is appended to whatever was collected so far.
    """
    parts = []
    try:
        deck = Presentation(fp)
        for slide in deck.slides:
            for shp in slide.shapes:
                shape_text = getattr(shp, "text", None)
                if shape_text:
                    parts.append(shape_text + "\n")
                # 13 corresponds to MSO_SHAPE_TYPE.PICTURE in python-pptx.
                if shp.shape_type == 13 and hasattr(shp, "image") and shp.image:
                    try:
                        picture = Image.open(io.BytesIO(shp.image.blob))
                        recognised = pytesseract.image_to_string(picture)
                        if recognised.strip():
                            parts.append(recognised + "\n")
                    except Exception:
                        # Pictures that cannot be decoded or OCR'd are skipped.
                        pass
            for shp in slide.shapes:
                if not shp.has_table:
                    continue
                for tbl_row in shp.table.rows:
                    row_texts = [cell.text for cell in tbl_row.cells]
                    parts.append("\t".join(row_texts) + "\n")
    except Exception as err:
        parts.append(f"{fp}: {err}")
    return "".join(parts).strip()
184
+
185
def extract_file_content(fp):
    """Return the text content of ``fp`` using the extractor for its extension.

    ``.pdf``, ``.doc``/``.docx``, ``.xlsx``/``.xls`` and ``.ppt``/``.pptx``
    are delegated to the matching ``extract_*_content`` helper. Any other
    extension is read as UTF-8 text and stripped; if that read fails, the
    result is "<fp>: <error>".
    """
    suffix = Path(fp).suffix.lower()
    if suffix == ".pdf":
        return extract_pdf_content(fp)
    if suffix in (".doc", ".docx"):
        return extract_docx_content(fp)
    if suffix in (".xlsx", ".xls"):
        return extract_excel_content(fp)
    if suffix in (".ppt", ".pptx"):
        return extract_pptx_content(fp)
    # Fallback: treat everything else as plain text.
    try:
        raw_text = Path(fp).read_text(encoding="utf-8")
        return raw_text.strip()
    except Exception as err:
        return f"{fp}: {err}"
200
 
201
  async def fetch_response_async(host, key, model, msgs, cfg, sid):
202
  for t in [60, 80, 120, 240]:
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
  httpx
 
2
  pandas
3
  pdfplumber
4
- Pillow
5
  python-docx
6
  python-pptx
7
  pytesseract
 
1
  httpx
2
+ openpyxl
3
  pandas
4
  pdfplumber
5
+ pillow
6
  python-docx
7
  python-pptx
8
  pytesseract