Copain22 committed on
Commit
f3f8525
·
verified ·
1 Parent(s): 5179dc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -24
app.py CHANGED
@@ -16,6 +16,7 @@ from transformers import (
16
  from sentence_transformers import SentenceTransformer
17
  import gradio as gr
18
  import spaces
 
19
 
20
  from pathlib import Path
21
  from PyPDF2 import PdfReader
@@ -50,31 +51,29 @@ def load_pdfs(folder_path="."):
50
  docs = []
51
  current_section = None
52
  for pdf_file in Path(folder_path).glob("*.pdf"):
53
- reader = PdfReader(str(pdf_file))
54
- for page in reader.pages:
55
- text = page.extract_text()
56
- if text:
57
- lines = text.split("\n")
58
- for line in lines:
59
- line = line.strip()
60
- if not line:
61
- continue
62
-
63
- # New smarter heading detection:
64
- # If the line is mostly UPPERCASE and not too long
65
- if line.isupper() and len(line.split()) <= 6:
66
- if current_section:
67
- docs.append(current_section)
68
- current_section = line
69
- else:
70
- if current_section:
71
- current_section += f" | {line}"
72
- else:
73
  current_section = line
74
-
75
- if current_section:
76
- docs.append(current_section)
77
- current_section = None
 
 
 
 
 
78
 
79
  return docs
80
 
 
16
  from sentence_transformers import SentenceTransformer
17
  import gradio as gr
18
  import spaces
19
+ import pdfplumber
20
 
21
  from pathlib import Path
22
  from PyPDF2 import PdfReader
 
51
  docs = []
52
  current_section = None
53
  for pdf_file in Path(folder_path).glob("*.pdf"):
54
+ with pdfplumber.open(str(pdf_file)) as pdf:
55
+ for page in pdf.pages:
56
+ text = page.extract_text()
57
+ if text:
58
+ lines = text.split("\n")
59
+ for line in lines:
60
+ line = line.strip()
61
+ if not line:
62
+ continue
63
+
64
+ if line.isupper() and len(line.split()) <= 6:
65
+ if current_section:
66
+ docs.append(current_section)
 
 
 
 
 
 
 
67
  current_section = line
68
+ else:
69
+ if current_section:
70
+ current_section += f" | {line}"
71
+ else:
72
+ current_section = line
73
+
74
+ if current_section:
75
+ docs.append(current_section)
76
+ current_section = None
77
 
78
  return docs
79