Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -22,22 +22,21 @@ logging.basicConfig(
|
|
22 |
)
|
23 |
logger = logging.getLogger("pdf_processor")
|
24 |
|
25 |
-
#
|
26 |
try:
|
27 |
from unstructured.partition.pdf import partition_pdf
|
28 |
UNSTRUCTURED_AVAILABLE = True
|
29 |
except ImportError:
|
30 |
UNSTRUCTURED_AVAILABLE = False
|
31 |
-
logger.warning("unstructured.partition.pdf not available; skipping that
|
32 |
|
33 |
-
# Load API key from
|
34 |
API_KEY = os.getenv("GOOGLE_API_KEY")
|
35 |
if API_KEY:
|
36 |
genai.configure(api_key=API_KEY)
|
37 |
else:
|
38 |
logger.warning("GOOGLE_API_KEY not set in environment.")
|
39 |
|
40 |
-
# Globals to store state
|
41 |
EXTRACTED_TEXT = ""
|
42 |
PDF_SECTIONS = []
|
43 |
EXTRACTION_METHOD = ""
|
@@ -45,21 +44,25 @@ EXTRACTION_METHOD = ""
|
|
45 |
|
46 |
# --- Extraction Functions ---
|
47 |
def extract_text_with_unstructured(pdf_path):
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
|
|
63 |
|
64 |
|
65 |
def extract_text_with_pypdf(pdf_path):
|
@@ -76,7 +79,6 @@ def extract_text_with_pypdf(pdf_path):
|
|
76 |
{"title": parts[i].strip(), "content": parts[i + 1].strip()}
|
77 |
for i in range(1, len(parts), 2)
|
78 |
]
|
79 |
-
# fallback single section
|
80 |
return [{"title": "Document", "content": full_text}]
|
81 |
|
82 |
|
@@ -100,7 +102,7 @@ def extract_text_with_tika(pdf_path):
|
|
100 |
return sections
|
101 |
|
102 |
|
103 |
-
# --- Gemini
|
104 |
def generate_greg_brockman_summary(content):
|
105 |
model = genai.GenerativeModel("gemini-1.5-pro")
|
106 |
prompt = f"""
|
@@ -110,14 +112,14 @@ You are an expert document analyst specializing in proposal evaluation.
|
|
110 |
1. GOAL: ...
|
111 |
... (rest of template) ...
|
112 |
|
113 |
-
CONTENT
|
114 |
{content}
|
115 |
"""
|
116 |
try:
|
117 |
resp = model.generate_content(prompt)
|
118 |
return resp.text, None
|
119 |
except Exception as e:
|
120 |
-
logger.error(f"Summary
|
121 |
return None, str(e)
|
122 |
|
123 |
|
@@ -135,11 +137,11 @@ QUESTION: {question}
|
|
135 |
resp = model.generate_content(prompt)
|
136 |
return resp.text, None
|
137 |
except Exception as e:
|
138 |
-
logger.error(f"Q&A
|
139 |
return None, str(e)
|
140 |
|
141 |
|
142 |
-
# ---
|
143 |
def process_pdf(pdf_file, progress=gr.Progress()):
|
144 |
global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
|
145 |
|
@@ -148,13 +150,25 @@ def process_pdf(pdf_file, progress=gr.Progress()):
|
|
148 |
if pdf_file is None:
|
149 |
return None, None, "❌ No file uploaded.", ""
|
150 |
|
151 |
-
#
|
152 |
tmp_dir = tempfile.gettempdir()
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
methods = []
|
159 |
if UNSTRUCTURED_AVAILABLE:
|
160 |
methods.append(("unstructured", extract_text_with_unstructured))
|
@@ -164,6 +178,7 @@ def process_pdf(pdf_file, progress=gr.Progress()):
|
|
164 |
]
|
165 |
|
166 |
sections = None
|
|
|
167 |
for name, fn in methods:
|
168 |
try:
|
169 |
secs = fn(path)
|
@@ -172,45 +187,37 @@ def process_pdf(pdf_file, progress=gr.Progress()):
|
|
172 |
EXTRACTION_METHOD = name
|
173 |
break
|
174 |
except Exception as e:
|
175 |
-
|
|
|
|
|
176 |
if not sections:
|
177 |
-
return None, None, "❌ Extraction failed
|
178 |
|
179 |
-
# Combine &
|
180 |
-
combined = ""
|
181 |
-
|
182 |
-
|
183 |
-
structure += f"{idx}. {sec['title']}\n"
|
184 |
chunk = f"## {sec['title']}\n{sec['content']}\n\n"
|
185 |
-
if len(combined
|
186 |
-
combined += chunk
|
187 |
-
else:
|
188 |
-
combined += f"## {sec['title']}\n[Truncated]\n\n"
|
189 |
-
structure += " [Content truncated]\n"
|
190 |
EXTRACTED_TEXT = combined
|
191 |
PDF_SECTIONS = sections
|
192 |
|
193 |
-
# Generate summary
|
194 |
summary, err = generate_greg_brockman_summary(combined)
|
195 |
if err:
|
196 |
return None, structure, f"❌ {err}", combined
|
197 |
|
198 |
-
return summary, structure, "✅ PDF processed
|
199 |
-
|
200 |
|
201 |
def ask_question(question):
|
202 |
if not API_KEY:
|
203 |
return "❌ Set GOOGLE_API_KEY in Secrets."
|
204 |
if not EXTRACTED_TEXT:
|
205 |
-
return "❌
|
206 |
if not question.strip():
|
207 |
return "❌ Enter a question."
|
208 |
|
209 |
-
|
210 |
-
if err
|
211 |
-
return f"❌ {err}"
|
212 |
-
return answer
|
213 |
-
|
214 |
|
215 |
def view_log():
|
216 |
try:
|
@@ -218,7 +225,6 @@ def view_log():
|
|
218 |
except Exception as e:
|
219 |
return f"Error reading log: {e}"
|
220 |
|
221 |
-
|
222 |
def save_summary(summary):
|
223 |
if not summary:
|
224 |
return "❌ No summary to save."
|
@@ -227,7 +233,6 @@ def save_summary(summary):
|
|
227 |
f.write(summary)
|
228 |
return f"✅ Saved to {fn}"
|
229 |
|
230 |
-
|
231 |
def save_qa(question, answer):
|
232 |
if not question or not answer:
|
233 |
return "❌ Nothing to save."
|
@@ -243,28 +248,16 @@ with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
|
|
243 |
gr.Markdown("Upload a PDF, get a Greg Brockman style summary, and ask questions.")
|
244 |
|
245 |
with gr.Tab("Setup"):
|
246 |
-
|
247 |
-
api_key_input = gr.Textbox(
|
248 |
-
label="Google Gemini API Key",
|
249 |
-
type="password",
|
250 |
-
placeholder="Set in Secrets (GOOGLE_API_KEY)"
|
251 |
-
)
|
252 |
-
api_button = gr.Button("Configure API")
|
253 |
-
api_status = gr.Markdown("⚠️ Using environment GOOGLE_API_KEY")
|
254 |
-
api_button.click(
|
255 |
-
fn=lambda key: (genai.configure(api_key=key) or "✅ API configured", None),
|
256 |
-
inputs=[api_key_input],
|
257 |
-
outputs=[api_status, gr.State()]
|
258 |
-
)
|
259 |
|
260 |
with gr.Tab("PDF Processing"):
|
261 |
with gr.Row():
|
262 |
pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
|
263 |
proc_btn = gr.Button("Process PDF", variant="primary")
|
264 |
-
status = gr.Markdown("Awaiting upload
|
265 |
summary_out = gr.Textbox(label="Summary", lines=15)
|
266 |
structure_out = gr.Textbox(label="Structure", lines=8)
|
267 |
-
log_info
|
268 |
proc_btn.click(
|
269 |
fn=process_pdf,
|
270 |
inputs=[pdf_file],
|
@@ -289,5 +282,4 @@ with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
|
|
289 |
refresh_btn.click(view_log, inputs=None, outputs=[sys_log])
|
290 |
|
291 |
if __name__ == "__main__":
|
292 |
-
# On Hugging Face Spaces, share=True isn't needed; server_name="0.0.0.0" ensures external access
|
293 |
app.launch(server_name="0.0.0.0")
|
|
|
22 |
)
|
23 |
logger = logging.getLogger("pdf_processor")
|
24 |
|
25 |
+
# Optional dependency: the Unstructured.io PDF partitioner. When the import
# fails, the flag below lets callers skip that extraction method.
try:
    from unstructured.partition.pdf import partition_pdf
except ImportError:
    UNSTRUCTURED_AVAILABLE = False
    logger.warning("unstructured.partition.pdf not available; skipping that method")
else:
    UNSTRUCTURED_AVAILABLE = True

# The Gemini API key is read from the GOOGLE_API_KEY environment variable;
# the client is configured only when a non-empty value is present.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    logger.warning("GOOGLE_API_KEY not set in environment.")
else:
    genai.configure(api_key=API_KEY)

# Module-level state, initialised empty.
EXTRACTED_TEXT = ""
PDF_SECTIONS = []
EXTRACTION_METHOD = ""
|
|
|
44 |
|
45 |
# --- Extraction Functions ---
|
46 |
def extract_text_with_unstructured(pdf_path):
    """Split a PDF into titled sections using ``partition_pdf``.

    Each element's stripped text is classified as either a heading or body
    text. A heading is a string shorter than 80 characters that is all
    upper-case, ends with ":", or starts with digits (optionally followed by
    a dot) and whitespace. Body text is accumulated under the most recent
    heading; text seen before any heading goes under "Introduction".
    A heading that collects no body text before the next heading (or the end
    of the document) produces no section.

    Args:
        pdf_path: Path of the PDF file, passed to ``partition_pdf`` as
            ``filename``.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts in document order.

    Raises:
        Exception: Any error raised during extraction is logged with its
            traceback and re-raised unchanged.
    """

    def _looks_like_heading(line):
        # Short lines only; long lines are always treated as body text.
        if len(line) >= 80:
            return False
        return bool(
            line.isupper()
            or line.endswith(":")
            or re.match(r"^[0-9]+\.?\s+", line)
        )

    try:
        logger.info("Extracting via Unstructured.io...")
        elements = partition_pdf(filename=pdf_path, extract_images_in_pdf=False)

        sections = []
        title, body = "Introduction", ""
        for element in elements:
            if not hasattr(element, "text"):
                continue
            text = element.text.strip()
            if not text:
                continue

            if _looks_like_heading(text):
                # Close the running section (if it has any body) before
                # starting a new one under this heading.
                if body:
                    sections.append({"title": title, "content": body})
                title, body = text, ""
            else:
                body += text + "\n\n"

        # Flush whatever was collected under the final heading.
        if body:
            sections.append({"title": title, "content": body})
        return sections
    except Exception as e:
        # Log here, then let the caller decide how to handle the failure.
        logger.error(f"Unstructured extraction error: {e}", exc_info=True)
        raise
|
66 |
|
67 |
|
68 |
def extract_text_with_pypdf(pdf_path):
|
|
|
79 |
{"title": parts[i].strip(), "content": parts[i + 1].strip()}
|
80 |
for i in range(1, len(parts), 2)
|
81 |
]
|
|
|
82 |
return [{"title": "Document", "content": full_text}]
|
83 |
|
84 |
|
|
|
102 |
return sections
|
103 |
|
104 |
|
105 |
+
# --- Gemini calls ---
|
106 |
def generate_greg_brockman_summary(content):
|
107 |
model = genai.GenerativeModel("gemini-1.5-pro")
|
108 |
prompt = f"""
|
|
|
112 |
1. GOAL: ...
|
113 |
... (rest of template) ...
|
114 |
|
115 |
+
CONTENT:
|
116 |
{content}
|
117 |
"""
|
118 |
try:
|
119 |
resp = model.generate_content(prompt)
|
120 |
return resp.text, None
|
121 |
except Exception as e:
|
122 |
+
logger.error(f"Summary error: {e}")
|
123 |
return None, str(e)
|
124 |
|
125 |
|
|
|
137 |
resp = model.generate_content(prompt)
|
138 |
return resp.text, None
|
139 |
except Exception as e:
|
140 |
+
logger.error(f"Q&A error: {e}")
|
141 |
return None, str(e)
|
142 |
|
143 |
|
144 |
+
# --- Handlers ---
|
145 |
def process_pdf(pdf_file, progress=gr.Progress()):
|
146 |
global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
|
147 |
|
|
|
150 |
if pdf_file is None:
|
151 |
return None, None, "❌ No file uploaded.", ""
|
152 |
|
153 |
+
# Determine path & write bytes if needed
|
154 |
tmp_dir = tempfile.gettempdir()
|
155 |
+
# Case 1: NamedString (in‐memory) with .name & .data
|
156 |
+
if hasattr(pdf_file, "name") and hasattr(pdf_file, "data"):
|
157 |
+
path = os.path.join(tmp_dir, pdf_file.name)
|
158 |
+
with open(path, "wb") as f:
|
159 |
+
f.write(pdf_file.data)
|
160 |
+
# Case 2: direct filepath (str)
|
161 |
+
elif isinstance(pdf_file, str):
|
162 |
+
path = pdf_file
|
163 |
+
# Case 3: file‐like with .read()
|
164 |
+
elif hasattr(pdf_file, "read"):
|
165 |
+
path = os.path.join(tmp_dir, getattr(pdf_file, "name", "uploaded.pdf"))
|
166 |
+
with open(path, "wb") as f:
|
167 |
+
f.write(pdf_file.read())
|
168 |
+
else:
|
169 |
+
return None, None, "❌ Unrecognized upload type", ""
|
170 |
+
|
171 |
+
# Try methods in order
|
172 |
methods = []
|
173 |
if UNSTRUCTURED_AVAILABLE:
|
174 |
methods.append(("unstructured", extract_text_with_unstructured))
|
|
|
178 |
]
|
179 |
|
180 |
sections = None
|
181 |
+
last_err = ""
|
182 |
for name, fn in methods:
|
183 |
try:
|
184 |
secs = fn(path)
|
|
|
187 |
EXTRACTION_METHOD = name
|
188 |
break
|
189 |
except Exception as e:
|
190 |
+
last_err = f"{name} failed: {e}"
|
191 |
+
logger.warning(last_err)
|
192 |
+
|
193 |
if not sections:
|
194 |
+
return None, None, "❌ Extraction failed", last_err
|
195 |
|
196 |
+
# Combine & summarize
|
197 |
+
combined, structure = "", ""
|
198 |
+
for i, sec in enumerate(sections, 1):
|
199 |
+
structure += f"{i}. {sec['title']}\n"
|
|
|
200 |
chunk = f"## {sec['title']}\n{sec['content']}\n\n"
|
201 |
+
combined += chunk if len(combined + chunk) < 30000 else f"## {sec['title']}\n[Truncated]\n\n"
|
|
|
|
|
|
|
|
|
202 |
EXTRACTED_TEXT = combined
|
203 |
PDF_SECTIONS = sections
|
204 |
|
|
|
205 |
summary, err = generate_greg_brockman_summary(combined)
|
206 |
if err:
|
207 |
return None, structure, f"❌ {err}", combined
|
208 |
|
209 |
+
return summary, structure, "✅ PDF processed", f"Used {EXTRACTION_METHOD}"
|
|
|
210 |
|
211 |
def ask_question(question):
    """Answer a question about the text held in ``EXTRACTED_TEXT``.

    Checks, in order, that ``API_KEY`` is set, that ``EXTRACTED_TEXT`` is
    non-empty, and that a non-blank question was supplied. It then passes the
    text and question to ``answer_question_about_pdf``.

    Args:
        question: The question text. ``None`` is treated like an empty
            string instead of raising.

    Returns:
        The answer on success; otherwise a message prefixed with "❌" that
        describes the unmet precondition or the reported error.
    """
    if not API_KEY:
        return "❌ Set GOOGLE_API_KEY in Secrets."
    if not EXTRACTED_TEXT:
        return "❌ Process a PDF first."
    # `not question` covers None, which would otherwise raise on `.strip()`.
    if not question or not question.strip():
        return "❌ Enter a question."

    ans, err = answer_question_about_pdf(EXTRACTED_TEXT, question)
    if err:
        return f"❌ {err}"
    return ans
|
|
|
|
|
|
|
221 |
|
222 |
def view_log():
|
223 |
try:
|
|
|
225 |
except Exception as e:
|
226 |
return f"Error reading log: {e}"
|
227 |
|
|
|
228 |
def save_summary(summary):
|
229 |
if not summary:
|
230 |
return "❌ No summary to save."
|
|
|
233 |
f.write(summary)
|
234 |
return f"✅ Saved to {fn}"
|
235 |
|
|
|
236 |
def save_qa(question, answer):
|
237 |
if not question or not answer:
|
238 |
return "❌ Nothing to save."
|
|
|
248 |
gr.Markdown("Upload a PDF, get a Greg Brockman style summary, and ask questions.")
|
249 |
|
250 |
with gr.Tab("Setup"):
|
251 |
+
gr.Markdown("⚠️ Make sure `GOOGLE_API_KEY` is set in your Space's Secrets.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
252 |
|
253 |
with gr.Tab("PDF Processing"):
|
254 |
with gr.Row():
|
255 |
pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
|
256 |
proc_btn = gr.Button("Process PDF", variant="primary")
|
257 |
+
status = gr.Markdown("Awaiting upload…")
|
258 |
summary_out = gr.Textbox(label="Summary", lines=15)
|
259 |
structure_out = gr.Textbox(label="Structure", lines=8)
|
260 |
+
log_info = gr.Textbox(label="Internal Log", lines=5)
|
261 |
proc_btn.click(
|
262 |
fn=process_pdf,
|
263 |
inputs=[pdf_file],
|
|
|
282 |
refresh_btn.click(view_log, inputs=None, outputs=[sys_log])
|
283 |
|
284 |
if __name__ == "__main__":
|
|
|
285 |
app.launch(server_name="0.0.0.0")
|