Spaces:

Copain22
/

Cafe-Chatbot

Running on Zero

Copain22 committed 1 day ago

Commit

f3f8525

verified ·

1 Parent(s): 5179dc5

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -16,6 +16,7 @@ from transformers import (
 from sentence_transformers import SentenceTransformer
 import gradio as gr
 import spaces
 from pathlib import Path
 from PyPDF2 import PdfReader
@@ -50,31 +51,29 @@ def load_pdfs(folder_path="."):
     docs = []
     current_section = None
     for pdf_file in Path(folder_path).glob("*.pdf"):
-        reader = PdfReader(str(pdf_file))
-        for page in reader.pages:
-            text = page.extract_text()
-            if text:
-                lines = text.split("\n")
-                for line in lines:
-                    line = line.strip()
-                    if not line:
-                        continue
-                    # New smarter heading detection:
-                    # If the line is mostly UPPERCASE and not too long
-                    if line.isupper() and len(line.split()) <= 6:
-                        if current_section:
-                            docs.append(current_section)
-                        current_section = line
-                    else:
-                        if current_section:
-                            current_section += f" | {line}"
-                        else:
                             current_section = line
-                if current_section:
-                    docs.append(current_section)
-                    current_section = None
     return docs

 from sentence_transformers import SentenceTransformer
 import gradio as gr
 import spaces
+import pdfplumber
 from pathlib import Path
 from PyPDF2 import PdfReader
     docs = []
     current_section = None
     for pdf_file in Path(folder_path).glob("*.pdf"):
+        with pdfplumber.open(str(pdf_file)) as pdf:
+            for page in pdf.pages:
+                text = page.extract_text()
+                if text:
+                    lines = text.split("\n")
+                    for line in lines:
+                        line = line.strip()
+                        if not line:
+                            continue
+                        if line.isupper() and len(line.split()) <= 6:
+                            if current_section:
+                                docs.append(current_section)
                             current_section = line
+                        else:
+                            if current_section:
+                                current_section += f" | {line}"
+                            else:
+                                current_section = line
+            if current_section:
+                docs.append(current_section)
+                current_section = None
     return docs