Spaces:

hadadrjt
/

ai

Running

App Files Community

hadadrjt committed 20 days ago

Commit

fee4a44

1 Parent(s): 933e48c

ai: Advanced the file extraction process

Browse files

Files changed (2) hide show

jarvis.py +116 -25
requirements.txt +2 -1

jarvis.py CHANGED Viewed

@@ -16,10 +16,13 @@ import random
 import requests
 import threading
 import uuid
 from PIL import Image
 from pathlib import Path
 from pptx import Presentation
 os.system("apt-get update -q -y && apt-get install -q -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind libleptonica-dev libtesseract-dev")
@@ -79,33 +82,121 @@ def marked_item(item, marked, attempts):
 def get_model_key(display):
     return next((k for k, v in MODEL_MAPPING.items() if v == display), DEFAULT_MODEL_KEY)
-def extract_file_content(fp):
-    ext = Path(fp).suffix.lower()
-    c = ""
     try:
-        if ext == ".pdf":
-            with pdfplumber.open(fp) as pdf:
-                for p in pdf.pages:
-                    t = p.extract_text() or ""
-                    c += t + "\n"
-        elif ext in [".doc", ".docx"]:
-            d = docx.Document(fp)
-            for para in d.paragraphs:
-                c += para.text + "\n"
-        elif ext in [".xlsx", ".xls"]:
-            df = pd.read_excel(fp)
-            c += df.to_csv(index=False)
-        elif ext in [".ppt", ".pptx"]:
-            prs = Presentation(fp)
-            for s in prs.slides:
-                for sh in s.shapes:
-                    if hasattr(sh, "text") and sh.text:
-                        c += sh.text + "\n"
-        else:
-            c = Path(fp).read_text(encoding="utf-8")
     except Exception as e:
-        c = f"{fp}: {e}"
-    return c.strip()
 async def fetch_response_async(host, key, model, msgs, cfg, sid):
     for t in [60, 80, 120, 240]:

 import requests
 import threading
 import uuid
+import zipfile
+import io
 from PIL import Image
 from pathlib import Path
 from pptx import Presentation
+from openpyxl import load_workbook
 os.system("apt-get update -q -y && apt-get install -q -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind libleptonica-dev libtesseract-dev")
 def get_model_key(display):
     return next((k for k, v in MODEL_MAPPING.items() if v == display), DEFAULT_MODEL_KEY)
+def extract_pdf_content(fp):
+    content = ""
     try:
+        with pdfplumber.open(fp) as pdf:
+            for page in pdf.pages:
+                text = page.extract_text() or ""
+                content += text + "\n"
+                if page.images:
+                    img_obj = page.to_image(resolution=300)
+                    for img in page.images:
+                        bbox = (img["x0"], img["top"], img["x1"], img["bottom"])
+                        cropped = img_obj.original.crop(bbox)
+                        ocr_text = pytesseract.image_to_string(cropped)
+                        if ocr_text.strip():
+                            content += ocr_text + "\n"
+                tables = page.extract_tables()
+                for table in tables:
+                    for row in table:
+                        cells = [str(cell) for cell in row if cell is not None]
+                        if cells:
+                            content += "\t".join(cells) + "\n"
+    except Exception as e:
+        content += f"{fp}: {e}"
+    return content.strip()
+def extract_docx_content(fp):
+    content = ""
+    try:
+        doc = docx.Document(fp)
+        for para in doc.paragraphs:
+            content += para.text + "\n"
+        for table in doc.tables:
+            for row in table.rows:
+                cells = [cell.text for cell in row.cells]
+                content += "\t".join(cells) + "\n"
+        with zipfile.ZipFile(fp) as z:
+            for file in z.namelist():
+                if file.startswith("word/media/"):
+                    data = z.read(file)
+                    try:
+                        img = Image.open(io.BytesIO(data))
+                        ocr_text = pytesseract.image_to_string(img)
+                        if ocr_text.strip():
+                            content += ocr_text + "\n"
+                    except Exception:
+                        pass
+    except Exception as e:
+        content += f"{fp}: {e}"
+    return content.strip()
+def extract_excel_content(fp):
+    content = ""
+    try:
+        sheets = pd.read_excel(fp, sheet_name=None)
+        for name, df in sheets.items():
+            content += f"Sheet: {name}\n"
+            content += df.to_csv(index=False) + "\n"
+        wb = load_workbook(fp, data_only=True)
+        if wb._images:
+            for image in wb._images:
+                img = image.ref
+                if isinstance(img, bytes):
+                    try:
+                        pil_img = Image.open(io.BytesIO(img))
+                        ocr_text = pytesseract.image_to_string(pil_img)
+                        if ocr_text.strip():
+                            content += ocr_text + "\n"
+                    except Exception:
+                        pass
     except Exception as e:
+        content += f"{fp}: {e}"
+    return content.strip()
+def extract_pptx_content(fp):
+    content = ""
+    try:
+        prs = Presentation(fp)
+        for slide in prs.slides:
+            for shape in slide.shapes:
+                if hasattr(shape, "text") and shape.text:
+                    content += shape.text + "\n"
+                if shape.shape_type == 13 and hasattr(shape, "image") and shape.image:
+                    try:
+                        img = Image.open(io.BytesIO(shape.image.blob))
+                        ocr_text = pytesseract.image_to_string(img)
+                        if ocr_text.strip():
+                            content += ocr_text + "\n"
+                    except Exception:
+                        pass
+            if slide.shapes:
+                for shape in slide.shapes:
+                    if shape.has_table:
+                        table = shape.table
+                        for row in table.rows:
+                            cells = [cell.text for cell in row.cells]
+                            content += "\t".join(cells) + "\n"
+    except Exception as e:
+        content += f"{fp}: {e}"
+    return content.strip()
+def extract_file_content(fp):
+    ext = Path(fp).suffix.lower()
+    if ext == ".pdf":
+        return extract_pdf_content(fp)
+    elif ext in [".doc", ".docx"]:
+        return extract_docx_content(fp)
+    elif ext in [".xlsx", ".xls"]:
+        return extract_excel_content(fp)
+    elif ext in [".ppt", ".pptx"]:
+        return extract_pptx_content(fp)
+    else:
+        try:
+            return Path(fp).read_text(encoding="utf-8").strip()
+        except Exception as e:
+            return f"{fp}: {e}"
 async def fetch_response_async(host, key, model, msgs, cfg, sid):
     for t in [60, 80, 120, 240]:

requirements.txt CHANGED Viewed

@@ -1,7 +1,8 @@
 httpx
 pandas
 pdfplumber
-Pillow
 python-docx
 python-pptx
 pytesseract

 httpx
+openpyxl
 pandas
 pdfplumber
+pillow
 python-docx
 python-pptx
 pytesseract