Omarrran committed on
Commit
e657d8c
·
verified ·
1 Parent(s): 6f14fd9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -65
app.py CHANGED
@@ -8,7 +8,6 @@ import gradio as gr
8
  import google.generativeai as genai
9
  from PyPDF2 import PdfReader
10
  from tika import parser
11
- from unstructured.partition.pdf import partition_pdf
12
 
13
  # Configure logging
14
  tmp_log = "pdf_processor_log.txt"
@@ -22,6 +21,14 @@ logging.basicConfig(
22
  )
23
  logger = logging.getLogger("pdf_processor")
24
 
 
 
 
 
 
 
 
 
25
  # Load API key from environment
26
  API_KEY = os.getenv("GOOGLE_API_KEY", None)
27
  if not API_KEY:
@@ -116,70 +123,13 @@ def process_pdf(pdf_file, progress=gr.Progress()):
116
  tmp = tempfile.gettempdir()
117
  path = os.path.join(tmp, pdf_file.name)
118
  with open(path, 'wb') as f: f.write(pdf_file.read())
119
- methods = [("unstructured", extract_text_with_unstructured),
120
- ("pypdf", extract_text_with_pypdf),
121
- ("tika", extract_text_with_tika)]
122
- for name, fn in methods:
123
- try:
124
- secs = fn(path)
125
- if secs:
126
- EXTRACTION_METHOD = name
127
- PDF_SECTIONS = secs
128
- break
129
- except:
130
- continue
131
- if not PDF_SECTIONS:
132
- return None, None, "❌ Extraction failed.", ""
133
- combined, struct = "", ""
134
- for i,sec in enumerate(PDF_SECTIONS,1):
135
- struct += f"{i}. {sec['title']}\n"
136
- block = f"## {sec['title']}\n{sec['content']}\n\n"
137
- combined += block if len(combined+block)<30000 else f"## {sec['title']}\n[Truncated]\n\n"
138
- EXTRACTED_TEXT = combined
139
- summary, err = generate_greg_brockman_summary(combined)
140
- if err:
141
- return None, struct, f"❌ {err}", combined
142
- return summary, struct, "✅ Done", f"Used {EXTRACTION_METHOD}, {len(PDF_SECTIONS)} sections"
143
-
144
- def ask_question(question):
145
- if not API_KEY: return "❌ Set GOOGLE_API_KEY."
146
- if not EXTRACTED_TEXT: return "❌ Process a PDF first."
147
- if not question.strip(): return "❌ Enter a question."
148
- ans, err = answer_question_about_pdf(EXTRACTED_TEXT, question)
149
- return ans if not err else f"❌ {err}"
150
-
151
- def view_log():
152
- try:
153
- return open(tmp_log).read()
154
- except:
155
- return "Error reading log."
156
-
157
- def save_summary(summary):
158
- if not summary: return "❌ No summary."
159
- fn = f"summary_{datetime.now():%Y%m%d_%H%M%S}.txt"
160
- open(fn, 'w', encoding='utf-8').write(summary)
161
- return f"✅ Saved to {fn}"
162
-
163
- def save_qa(question, answer):
164
- if not question or not answer: return "❌ Incomplete Q&A."
165
- fn = f"qa_{datetime.now():%Y%m%d_%H%M%S}.txt"
166
- with open(fn,'w',encoding='utf-8') as f:
167
- f.write(f"Q: {question}\n\nA: {answer}")
168
- return f"✅ Saved to {fn}"
169
-
170
- # --- Gradio UI ---
171
- with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
172
- gr.Markdown("# 📄 PDF Analyzer with Gemini API")
173
- gr.Markdown("Upload a PDF, get a summary, ask questions.")
174
- with gr.Tab("PDF Processing"):
175
- pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"], type="binary")
176
- process_btn = gr.Button("Process PDF")
177
- summary_out = gr.Textbox(label="Summary", lines=15)
178
- struct_out = gr.Textbox(label="Structure", lines=8)
179
- status = gr.Markdown("")
180
- log_out = gr.Textbox(label="Log", lines=8)
181
- process_btn.click(process_pdf, inputs=[pdf_file],
182
- outputs=[summary_out, struct_out, status, log_out])
183
  with gr.Tab("Ask Questions"):
184
  question = gr.Textbox(label="Question", lines=2)
185
  ask_btn = gr.Button("Ask")
 
8
  import google.generativeai as genai
9
  from PyPDF2 import PdfReader
10
  from tika import parser
 
11
 
12
  # Configure logging
13
  tmp_log = "pdf_processor_log.txt"
 
21
  )
22
  logger = logging.getLogger("pdf_processor")
23
 
24
+ # Attempt to import Unstructured.io partitioning
25
+ try:
26
+ from unstructured.partition.pdf import partition_pdf
27
+ UNSTRUCTURED_AVAILABLE = True
28
+ except ImportError:
29
+ UNSTRUCTURED_AVAILABLE = False
30
+ logger.warning("unstructured.partition.pdf not available; skipping that extraction method")
31
+
32
  # Load API key from environment
33
  API_KEY = os.getenv("GOOGLE_API_KEY", None)
34
  if not API_KEY:
 
123
  tmp = tempfile.gettempdir()
124
  path = os.path.join(tmp, pdf_file.name)
125
  with open(path, 'wb') as f: f.write(pdf_file.read())
126
+ methods = []
127
+ if UNSTRUCTURED_AVAILABLE:
128
+ methods.append(("unstructured", extract_text_with_unstructured))
129
+ methods.extend([
130
+ ("pypdf", extract_text_with_pypdf),
131
+ ("tika", extract_text_with_tika)
132
+ ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  with gr.Tab("Ask Questions"):
134
  question = gr.Textbox(label="Question", lines=2)
135
  ask_btn = gr.Button("Ask")