Omarrran committed on
Commit
c72b167
·
verified ·
1 Parent(s): 5c06b65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -70
app.py CHANGED
@@ -22,22 +22,21 @@ logging.basicConfig(
22
  )
23
  logger = logging.getLogger("pdf_processor")
24
 
25
- # Attempt to import Unstructured.io partitioning
26
  try:
27
  from unstructured.partition.pdf import partition_pdf
28
  UNSTRUCTURED_AVAILABLE = True
29
  except ImportError:
30
  UNSTRUCTURED_AVAILABLE = False
31
- logger.warning("unstructured.partition.pdf not available; skipping that extraction method")
32
 
33
- # Load API key from environment (set this in your Space's Secrets as GOOGLE_API_KEY)
34
  API_KEY = os.getenv("GOOGLE_API_KEY")
35
  if API_KEY:
36
  genai.configure(api_key=API_KEY)
37
  else:
38
  logger.warning("GOOGLE_API_KEY not set in environment.")
39
 
40
- # Globals to store state
41
  EXTRACTED_TEXT = ""
42
  PDF_SECTIONS = []
43
  EXTRACTION_METHOD = ""
@@ -45,21 +44,25 @@ EXTRACTION_METHOD = ""
45
 
46
  # --- Extraction Functions ---
47
  def extract_text_with_unstructured(pdf_path):
48
- logger.info("Extracting via Unstructured.io...")
49
- elements = partition_pdf(filename=pdf_path, extract_images_in_pdf=False)
50
- sections, current = [], {"title": "Introduction", "content": ""}
51
- for e in elements:
52
- if hasattr(e, "text") and (t := e.text.strip()):
53
- # Section header heuristic
54
- if len(t) < 80 and (t.isupper() or t.endswith(":") or re.match(r"^[0-9]+\.?\s+", t)):
55
- if current["content"]:
56
- sections.append(current)
57
- current = {"title": t, "content": ""}
58
- else:
59
- current["content"] += t + "\n\n"
60
- if current["content"]:
61
- sections.append(current)
62
- return sections
 
 
 
 
63
 
64
 
65
  def extract_text_with_pypdf(pdf_path):
@@ -76,7 +79,6 @@ def extract_text_with_pypdf(pdf_path):
76
  {"title": parts[i].strip(), "content": parts[i + 1].strip()}
77
  for i in range(1, len(parts), 2)
78
  ]
79
- # fallback single section
80
  return [{"title": "Document", "content": full_text}]
81
 
82
 
@@ -100,7 +102,7 @@ def extract_text_with_tika(pdf_path):
100
  return sections
101
 
102
 
103
- # --- Gemini API calls ---
104
  def generate_greg_brockman_summary(content):
105
  model = genai.GenerativeModel("gemini-1.5-pro")
106
  prompt = f"""
@@ -110,14 +112,14 @@ You are an expert document analyst specializing in proposal evaluation.
110
  1. GOAL: ...
111
  ... (rest of template) ...
112
 
113
- CONTENT TO ANALYZE:
114
  {content}
115
  """
116
  try:
117
  resp = model.generate_content(prompt)
118
  return resp.text, None
119
  except Exception as e:
120
- logger.error(f"Summary generation error: {e}")
121
  return None, str(e)
122
 
123
 
@@ -135,11 +137,11 @@ QUESTION: {question}
135
  resp = model.generate_content(prompt)
136
  return resp.text, None
137
  except Exception as e:
138
- logger.error(f"Q&A generation error: {e}")
139
  return None, str(e)
140
 
141
 
142
- # --- Processing & Q&A Handlers ---
143
  def process_pdf(pdf_file, progress=gr.Progress()):
144
  global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
145
 
@@ -148,13 +150,25 @@ def process_pdf(pdf_file, progress=gr.Progress()):
148
  if pdf_file is None:
149
  return None, None, "❌ No file uploaded.", ""
150
 
151
- # Save to temp
152
  tmp_dir = tempfile.gettempdir()
153
- path = os.path.join(tmp_dir, pdf_file.name)
154
- with open(path, "wb") as f:
155
- f.write(pdf_file.read())
156
-
157
- # Choose methods
 
 
 
 
 
 
 
 
 
 
 
 
158
  methods = []
159
  if UNSTRUCTURED_AVAILABLE:
160
  methods.append(("unstructured", extract_text_with_unstructured))
@@ -164,6 +178,7 @@ def process_pdf(pdf_file, progress=gr.Progress()):
164
  ]
165
 
166
  sections = None
 
167
  for name, fn in methods:
168
  try:
169
  secs = fn(path)
@@ -172,45 +187,37 @@ def process_pdf(pdf_file, progress=gr.Progress()):
172
  EXTRACTION_METHOD = name
173
  break
174
  except Exception as e:
175
- logger.warning(f"{name} failed: {e}")
 
 
176
  if not sections:
177
- return None, None, "❌ Extraction failed.", ""
178
 
179
- # Combine & store
180
- combined = ""
181
- structure = ""
182
- for idx, sec in enumerate(sections, start=1):
183
- structure += f"{idx}. {sec['title']}\n"
184
  chunk = f"## {sec['title']}\n{sec['content']}\n\n"
185
- if len(combined) + len(chunk) < 30000:
186
- combined += chunk
187
- else:
188
- combined += f"## {sec['title']}\n[Truncated]\n\n"
189
- structure += " [Content truncated]\n"
190
  EXTRACTED_TEXT = combined
191
  PDF_SECTIONS = sections
192
 
193
- # Generate summary
194
  summary, err = generate_greg_brockman_summary(combined)
195
  if err:
196
  return None, structure, f"❌ {err}", combined
197
 
198
- return summary, structure, "✅ PDF processed successfully", f"Used {EXTRACTION_METHOD}."
199
-
200
 
201
  def ask_question(question):
202
  if not API_KEY:
203
  return "❌ Set GOOGLE_API_KEY in Secrets."
204
  if not EXTRACTED_TEXT:
205
- return "❌ Please upload & process a PDF first."
206
  if not question.strip():
207
  return "❌ Enter a question."
208
 
209
- answer, err = answer_question_about_pdf(EXTRACTED_TEXT, question)
210
- if err:
211
- return f"❌ {err}"
212
- return answer
213
-
214
 
215
  def view_log():
216
  try:
@@ -218,7 +225,6 @@ def view_log():
218
  except Exception as e:
219
  return f"Error reading log: {e}"
220
 
221
-
222
  def save_summary(summary):
223
  if not summary:
224
  return "❌ No summary to save."
@@ -227,7 +233,6 @@ def save_summary(summary):
227
  f.write(summary)
228
  return f"✅ Saved to {fn}"
229
 
230
-
231
  def save_qa(question, answer):
232
  if not question or not answer:
233
  return "❌ Nothing to save."
@@ -243,28 +248,16 @@ with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
243
  gr.Markdown("Upload a PDF, get a Greg Brockman style summary, and ask questions.")
244
 
245
  with gr.Tab("Setup"):
246
- with gr.Row():
247
- api_key_input = gr.Textbox(
248
- label="Google Gemini API Key",
249
- type="password",
250
- placeholder="Set in Secrets (GOOGLE_API_KEY)"
251
- )
252
- api_button = gr.Button("Configure API")
253
- api_status = gr.Markdown("⚠️ Using environment GOOGLE_API_KEY")
254
- api_button.click(
255
- fn=lambda key: (genai.configure(api_key=key) or "✅ API configured", None),
256
- inputs=[api_key_input],
257
- outputs=[api_status, gr.State()]
258
- )
259
 
260
  with gr.Tab("PDF Processing"):
261
  with gr.Row():
262
  pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
263
  proc_btn = gr.Button("Process PDF", variant="primary")
264
- status = gr.Markdown("Awaiting upload...")
265
  summary_out = gr.Textbox(label="Summary", lines=15)
266
  structure_out = gr.Textbox(label="Structure", lines=8)
267
- log_info = gr.Textbox(label="Internal Log", lines=5)
268
  proc_btn.click(
269
  fn=process_pdf,
270
  inputs=[pdf_file],
@@ -289,5 +282,4 @@ with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
289
  refresh_btn.click(view_log, inputs=None, outputs=[sys_log])
290
 
291
  if __name__ == "__main__":
292
- # On Hugging Face Spaces, share=True isn't needed; server_name="0.0.0.0" ensures external access
293
  app.launch(server_name="0.0.0.0")
 
22
  )
23
  logger = logging.getLogger("pdf_processor")
24
 
25
+ # Try Unstructured.io
26
  try:
27
  from unstructured.partition.pdf import partition_pdf
28
  UNSTRUCTURED_AVAILABLE = True
29
  except ImportError:
30
  UNSTRUCTURED_AVAILABLE = False
31
+ logger.warning("unstructured.partition.pdf not available; skipping that method")
32
 
33
+ # Load Gemini API key from env (set in your Space Secrets)
34
  API_KEY = os.getenv("GOOGLE_API_KEY")
35
  if API_KEY:
36
  genai.configure(api_key=API_KEY)
37
  else:
38
  logger.warning("GOOGLE_API_KEY not set in environment.")
39
 
 
40
  EXTRACTED_TEXT = ""
41
  PDF_SECTIONS = []
42
  EXTRACTION_METHOD = ""
 
44
 
45
  # --- Extraction Functions ---
46
  def extract_text_with_unstructured(pdf_path):
47
+ try:
48
+ logger.info("Extracting via Unstructured.io...")
49
+ elements = partition_pdf(filename=pdf_path, extract_images_in_pdf=False)
50
+ sections, current = [], {"title": "Introduction", "content": ""}
51
+ for e in elements:
52
+ if hasattr(e, "text") and (t := e.text.strip()):
53
+ if len(t) < 80 and (t.isupper() or t.endswith(":") or re.match(r"^[0-9]+\.?\s+", t)):
54
+ if current["content"]:
55
+ sections.append(current)
56
+ current = {"title": t, "content": ""}
57
+ else:
58
+ current["content"] += t + "\n\n"
59
+ if current["content"]:
60
+ sections.append(current)
61
+ return sections
62
+ except Exception as e:
63
+ # Bubble up so process_pdf can catch & log
64
+ logger.error(f"Unstructured extraction error: {e}", exc_info=True)
65
+ raise
66
 
67
 
68
  def extract_text_with_pypdf(pdf_path):
 
79
  {"title": parts[i].strip(), "content": parts[i + 1].strip()}
80
  for i in range(1, len(parts), 2)
81
  ]
 
82
  return [{"title": "Document", "content": full_text}]
83
 
84
 
 
102
  return sections
103
 
104
 
105
+ # --- Gemini calls ---
106
  def generate_greg_brockman_summary(content):
107
  model = genai.GenerativeModel("gemini-1.5-pro")
108
  prompt = f"""
 
112
  1. GOAL: ...
113
  ... (rest of template) ...
114
 
115
+ CONTENT:
116
  {content}
117
  """
118
  try:
119
  resp = model.generate_content(prompt)
120
  return resp.text, None
121
  except Exception as e:
122
+ logger.error(f"Summary error: {e}")
123
  return None, str(e)
124
 
125
 
 
137
  resp = model.generate_content(prompt)
138
  return resp.text, None
139
  except Exception as e:
140
+ logger.error(f"Q&A error: {e}")
141
  return None, str(e)
142
 
143
 
144
+ # --- Handlers ---
145
  def process_pdf(pdf_file, progress=gr.Progress()):
146
  global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
147
 
 
150
  if pdf_file is None:
151
  return None, None, "❌ No file uploaded.", ""
152
 
153
+ # Determine path & write bytes if needed
154
  tmp_dir = tempfile.gettempdir()
155
+ # Case 1: NamedString (in‐memory) with .name & .data
156
+ if hasattr(pdf_file, "name") and hasattr(pdf_file, "data"):
157
+ path = os.path.join(tmp_dir, pdf_file.name)
158
+ with open(path, "wb") as f:
159
+ f.write(pdf_file.data)
160
+ # Case 2: direct filepath (str)
161
+ elif isinstance(pdf_file, str):
162
+ path = pdf_file
163
+ # Case 3: file‐like with .read()
164
+ elif hasattr(pdf_file, "read"):
165
+ path = os.path.join(tmp_dir, getattr(pdf_file, "name", "uploaded.pdf"))
166
+ with open(path, "wb") as f:
167
+ f.write(pdf_file.read())
168
+ else:
169
+ return None, None, "❌ Unrecognized upload type", ""
170
+
171
+ # Try methods in order
172
  methods = []
173
  if UNSTRUCTURED_AVAILABLE:
174
  methods.append(("unstructured", extract_text_with_unstructured))
 
178
  ]
179
 
180
  sections = None
181
+ last_err = ""
182
  for name, fn in methods:
183
  try:
184
  secs = fn(path)
 
187
  EXTRACTION_METHOD = name
188
  break
189
  except Exception as e:
190
+ last_err = f"{name} failed: {e}"
191
+ logger.warning(last_err)
192
+
193
  if not sections:
194
+ return None, None, "❌ Extraction failed", last_err
195
 
196
+ # Combine & summarize
197
+ combined, structure = "", ""
198
+ for i, sec in enumerate(sections, 1):
199
+ structure += f"{i}. {sec['title']}\n"
 
200
  chunk = f"## {sec['title']}\n{sec['content']}\n\n"
201
+ combined += chunk if len(combined + chunk) < 30000 else f"## {sec['title']}\n[Truncated]\n\n"
 
 
 
 
202
  EXTRACTED_TEXT = combined
203
  PDF_SECTIONS = sections
204
 
 
205
  summary, err = generate_greg_brockman_summary(combined)
206
  if err:
207
  return None, structure, f"❌ {err}", combined
208
 
209
+ return summary, structure, "✅ PDF processed", f"Used {EXTRACTION_METHOD}"
 
210
 
211
  def ask_question(question):
212
  if not API_KEY:
213
  return "❌ Set GOOGLE_API_KEY in Secrets."
214
  if not EXTRACTED_TEXT:
215
+ return "❌ Process a PDF first."
216
  if not question.strip():
217
  return "❌ Enter a question."
218
 
219
+ ans, err = answer_question_about_pdf(EXTRACTED_TEXT, question)
220
+ return ans if not err else f"❌ {err}"
 
 
 
221
 
222
  def view_log():
223
  try:
 
225
  except Exception as e:
226
  return f"Error reading log: {e}"
227
 
 
228
  def save_summary(summary):
229
  if not summary:
230
  return "❌ No summary to save."
 
233
  f.write(summary)
234
  return f"✅ Saved to {fn}"
235
 
 
236
  def save_qa(question, answer):
237
  if not question or not answer:
238
  return "❌ Nothing to save."
 
248
  gr.Markdown("Upload a PDF, get a Greg Brockman style summary, and ask questions.")
249
 
250
  with gr.Tab("Setup"):
251
+ gr.Markdown("⚠️ Make sure `GOOGLE_API_KEY` is set in your Space's Secrets.")
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  with gr.Tab("PDF Processing"):
254
  with gr.Row():
255
  pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
256
  proc_btn = gr.Button("Process PDF", variant="primary")
257
+ status = gr.Markdown("Awaiting upload")
258
  summary_out = gr.Textbox(label="Summary", lines=15)
259
  structure_out = gr.Textbox(label="Structure", lines=8)
260
+ log_info = gr.Textbox(label="Internal Log", lines=5)
261
  proc_btn.click(
262
  fn=process_pdf,
263
  inputs=[pdf_file],
 
282
  refresh_btn.click(view_log, inputs=None, outputs=[sys_log])
283
 
284
  if __name__ == "__main__":
 
285
  app.launch(server_name="0.0.0.0")