ProfessorLeVesseur committed
Commit 39b1f14 · verified · 1 Parent(s): cce5718

Update app.py

Files changed (1):
  1. app.py +91 -286
app.py CHANGED
@@ -14,15 +14,26 @@ from docling_core.types.doc import DoclingDocument
  from docling_core.types.doc.document import DocTagsDocument
  import torch
 
- # import logging
- # logging.basicConfig(level=logging.INFO)
 
- if 'pdf_processed' not in st.session_state:
- st.session_state['pdf_processed'] = False
- if 'markdown_texts' not in st.session_state:
- st.session_state['markdown_texts'] = []
- if 'df' not in st.session_state:
- st.session_state['df'] = pd.DataFrame()
 
  # ---------------------------------------------------------------------------------------
  # API Configuration
@@ -37,351 +48,145 @@ headers = {
  # Survey Analysis Class
  # ---------------------------------------------------------------------------------------
  class SurveyAnalysis:
- def __init__(self, api_key=None):
- self.api_key = api_key
-
  def prepare_llm_input(self, survey_response, topics):
- # Create topic description string from user input
- topic_descriptions = "\n".join([f"- **{topic}**: {description}" for topic, description in topics.items()])
-
- llm_input = f"""
- Your task is to review PDF docling and extract information related to the provided topics. Here are the topic descriptions:
-
  {topic_descriptions}
 
- **Instructions:**
- - Extract and summarize the PDF focusing only on the provided topics.
- - If a topic is not mentioned in the notes, it should not be included in the Topic_Summary.
- - Use **exact quotes** from the original text for each point in your Topic_Summary.
- - Exclude erroneous content.
- - Do not add additional explanations or instructions.
 
- **Format your response as follows:**
  [Topic]
  - "Exact quote"
- - "Exact quote"
- - "Exact quote"
 
- **Meeting Notes:**
  {survey_response}
  """
- return llm_input
 
  def query_api(self, payload):
- response = requests.post(API_URL, headers=headers, json=payload)
- return response.json()
 
  def extract_meeting_notes(self, response):
- output = response.get('outputs', {}).get('out-0', '')
- return output
 
  def process_dataframe(self, df, topics):
  results = []
  for _, row in df.iterrows():
  llm_input = self.prepare_llm_input(row['Document_Text'], topics)
- payload = {
- "user_id": "<USER or Conversation ID>",
- "in-0": llm_input
- }
  response = self.query_api(payload)
- meeting_notes = self.extract_meeting_notes(response)
- results.append({
- 'Document_Text': row['Document_Text'],
- 'Topic_Summary': meeting_notes
- })
-
- result_df = pd.DataFrame(results)
- df = df.reset_index(drop=True)
- return pd.concat([df, result_df[['Topic_Summary']]], axis=1)
 
  # ---------------------------------------------------------------------------------------
- # Function to Extract Excerpts
  # ---------------------------------------------------------------------------------------
- def extract_excerpts(processed_df):
- new_rows = []
-
- for _, row in processed_df.iterrows():
- Topic_Summary = row['Topic_Summary']
-
- # Split the Topic_Summary by topic
- sections = re.split(r'\n(?=\[)', Topic_Summary)
-
- for section in sections:
- # Extract the topic
- topic_match = re.match(r'\[([^\]]+)\]', section)
- if topic_match:
- topic = topic_match.group(1)
-
- # Extract all excerpts within the section
- excerpts = re.findall(r'- "([^"]+)"', section)
-
- for excerpt in excerpts:
- new_rows.append({
- 'Document_Text': row['Document_Text'],
- 'Topic_Summary': row['Topic_Summary'],
- 'Excerpt': excerpt,
- 'Topic': topic
- })
-
- return pd.DataFrame(new_rows)
-
- #------------------------------------------------------------------------
- # Streamlit Configuration
- #------------------------------------------------------------------------
-
- # Set page configuration
- st.set_page_config(
- page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
- page_icon=":bar_chart:",
- layout="centered",
- initial_sidebar_state="auto",
- menu_items={
- 'Get Help': 'mailto:[email protected]',
- 'About': "This app is built to support PDF analysis"
- }
- )
-
- #------------------------------------------------------------------------
- # Sidebar
- #------------------------------------------------------------------------
-
- # Sidebar with image
- with st.sidebar:
- # Set the desired width in pixels
- image_width = 300
- # Define the path to the image
- # image_path = "steelcase_small.png"
- image_path = "mtss.ai_small.png"
- # Display the image
- st.image(image_path, width=image_width)
-
- # Additional sidebar content
-
- with st.expander("**MTSS.ai**", expanded=True):
- st.write("""
- - **Support**: Cheyne LeVesseur PhD
- - **Email**: [email protected]
- """)
- st.divider()
- st.subheader('Instructions')
-
- Instructions = """
- - **Step 1**: Upload your PDF file.
- - **Step 2**: Review the processed text.
- - **Step 3**: Add your topics and descriptions of interest.
- - **Step 4**: Review the extracted excerpts and classifications, and topic distribution and frequency.
- - **Step 5**: Review bar charts of topics.
- - **Step 6**: Download the processed data as a CSV file.
- """
- st.markdown(Instructions)
-
- # Load SmolDocling model using transformers
  @st.cache_resource
  def load_smol_docling():
  device = "cuda" if torch.cuda.is_available() else "cpu"
  processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
  model = AutoModelForVision2Seq.from_pretrained(
- "ds4sd/SmolDocling-256M-preview",
- torch_dtype=torch.float32
  ).to(device)
  return model, processor
 
  model, processor = load_smol_docling()
 
- # # Convert PDF to images
- # def convert_pdf_to_images(pdf_file):
- # images = []
- # doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
- # for page_number in range(len(doc)):
- # page = doc.load_page(page_number)
- # pix = page.get_pixmap(dpi=300) # Higher DPI for clarity
- # img_data = pix.tobytes("png")
- # image = Image.open(io.BytesIO(img_data))
- # images.append(image)
- # return images
-
- # Improved PDF to image conversion
  def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600):
  images = []
  doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
- for page_number in range(len(doc)):
- page = doc.load_page(page_number)
  pix = page.get_pixmap(dpi=dpi)
- img_data = pix.tobytes("png")
- image = Image.open(io.BytesIO(img_data)).convert("RGB")
- # Resize image to max dimension
- image.thumbnail((max_size, max_size), Image.LANCZOS)
- images.append(image)
  return images
 
- # Extract structured markdown text using SmolDocling (transformers)
- # def extract_markdown_from_image(image):
- # prompt_text = "Convert this page to docling."
- # device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # # Prepare inputs
- # messages = [
- # {
- # "role": "user",
- # "content": [
- # {"type": "image"},
- # {"type": "text", "text": prompt_text}
- # ]
- # }
- # ]
- # prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
- # inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
-
- # # Generate outputs
- # generated_ids = model.generate(**inputs, max_new_tokens=1024)
- # prompt_length = inputs.input_ids.shape[1]
- # trimmed_generated_ids = generated_ids[:, prompt_length:]
- # doctags = processor.batch_decode(trimmed_generated_ids, skip_special_tokens=False)[0].lstrip()
-
- # # Clean the output
- # doctags = doctags.replace("<end_of_utterance>", "").strip()
-
- # # Populate document
- # doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
-
- # # Create a docling document
- # doc = DoclingDocument(name="ExtractedDocument")
- # doc.load_from_doctags(doctags_doc)
-
- # # Export as markdown
- # markdown_text = doc.export_to_markdown()
- # return markdown_text
-
  def extract_markdown_from_image(image):
- # start_time = time.time()
- prompt_text = "Convert this page to docling."
  device = "cuda" if torch.cuda.is_available() else "cpu"
-
- messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}]
- prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
  inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
-
- with torch.no_grad(): # <-- Crucial for speed
  generated_ids = model.generate(**inputs, max_new_tokens=1024)
-
- prompt_length = inputs.input_ids.shape[1]
- trimmed_generated_ids = generated_ids[:, prompt_length:]
- doctags = processor.batch_decode(trimmed_generated_ids, skip_special_tokens=False)[0].lstrip()
- doctags = doctags.replace("<end_of_utterance>", "").strip()
-
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
  doc = DoclingDocument(name="ExtractedDocument")
  doc.load_from_doctags(doctags_doc)
- markdown_text = doc.export_to_markdown()
- # processing_time = time.time() - start_time
- # logging.info(f"Inference took {processing_time:.2f} seconds")
- return markdown_text
 
  # Streamlit UI
  st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
 
  uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
 
- if uploaded_file:
- if not st.session_state['pdf_processed']:
- with st.spinner("Processing PDF..."):
- images = convert_pdf_to_images(uploaded_file)
-
- markdown_texts = []
- for idx, image in enumerate(images):
- markdown_text = extract_markdown_from_image(image)
- markdown_texts.append(markdown_text)
-
- df = pd.DataFrame({'Document_Text': markdown_texts})
 
- # Save results into session state
- st.session_state['markdown_texts'] = markdown_texts
- st.session_state['df'] = df
- st.session_state['pdf_processed'] = True
 
- st.success("PDF processed successfully!")
- else:
- st.success("PDF already processed. Using cached results.")
-
- # Use cached dataframe for further processing
- df = st.session_state['df']
-
- if df.empty or df['Document_Text'].isnull().all():
- st.error("No meaningful text extracted from the PDF.")
- st.stop()
-
- st.markdown("### Extracted Markdown Preview")
- st.write(df.head())
-
- if st.button("Reset / Upload New PDF"):
- st.session_state['pdf_processed'] = False
- st.session_state['markdown_texts'] = []
- st.session_state['df'] = pd.DataFrame()
- st.experimental_rerun()
-
- # ---------------------------------------------------------------------------------------
- # User Input for Topics
- # ---------------------------------------------------------------------------------------
  st.markdown("### Enter Topics and Descriptions")
- num_topics = st.number_input("Number of topics", min_value=1, max_value=10, value=1, step=1)
-
  topics = {}
  for i in range(num_topics):
  topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
- description = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
- if topic and description:
- topics[topic] = description
 
- # Add a button to execute the analysis
  if st.button("Run Analysis"):
  if not topics:
  st.warning("Please enter at least one topic and description.")
  st.stop()
 
- # ---------------------------------------------------------------------------------------
- # Your existing SurveyAnalysis and extract_excerpts functions remain unchanged here:
- # ---------------------------------------------------------------------------------------
  analyzer = SurveyAnalysis()
- processed_df = analyzer.process_dataframe(df, topics)
- df_VIP_extracted = extract_excerpts(processed_df)
-
- required_columns = ['Document_Text', 'Topic_Summary', 'Excerpt', 'Topic']
- missing_columns = [col for col in required_columns if col not in df_VIP_extracted.columns]
 
- if missing_columns:
- st.error(f"Missing columns after processing: {missing_columns}")
- st.stop()
-
- df_VIP_extracted = df_VIP_extracted[required_columns]
-
- st.markdown("### Processed Meeting Notes")
- st.dataframe(df_VIP_extracted)
-
- st.write(f"**Number of meeting notes analyzed:** {len(df)}")
- st.write(f"**Number of excerpts extracted:** {len(df_VIP_extracted)}")
-
- # CSV download
- csv = df_VIP_extracted.to_csv(index=False)
- st.download_button(
- "Download data as CSV",
- data=csv,
- file_name='extracted_meeting_notes.csv',
- mime='text/csv'
- )
 
- # Topic distribution visualization
- topic_counts = df_VIP_extracted['Topic'].value_counts()
- frequency_table = pd.DataFrame({'Topic': topic_counts.index, 'Count': topic_counts.values})
- frequency_table['Percentage'] = (frequency_table['Count'] / frequency_table['Count'].sum() * 100).round(0)
 
- st.markdown("### Topic Distribution")
- st.dataframe(frequency_table)
-
- fig, ax = plt.subplots(figsize=(10, 5))
- ax.bar(frequency_table['Topic'], frequency_table['Count'], color='#3d9aa1')
- ax.set_ylabel('Count')
- ax.set_title('Frequency of Topics')
  st.pyplot(fig)
 
- else:
  st.info("Please upload a PDF file to begin.")

@@ -14,15 +14,26 @@ from docling_core.types.doc import DoclingDocument
  from docling_core.types.doc.document import DocTagsDocument
  import torch
 
+ # ---------------------------------------------------------------------------------------
+ # Streamlit Page Configuration
+ # ---------------------------------------------------------------------------------------
+ st.set_page_config(
+ page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
+ page_icon=":bar_chart:",
+ layout="centered",
+ initial_sidebar_state="auto",
+ menu_items={
+ 'Get Help': 'mailto:[email protected]',
+ 'About': "This app is built to support PDF analysis"
+ }
+ )
 
+ # ---------------------------------------------------------------------------------------
+ # Session State Initialization
+ # ---------------------------------------------------------------------------------------
+ for key in ['pdf_processed', 'markdown_texts', 'df']:
+ if key not in st.session_state:
+ st.session_state[key] = False if key == 'pdf_processed' else []
 
  # ---------------------------------------------------------------------------------------
  # API Configuration
 
@@ -37,351 +48,145 @@ headers = {
  # Survey Analysis Class
  # ---------------------------------------------------------------------------------------
  class SurveyAnalysis:
  def prepare_llm_input(self, survey_response, topics):
+ topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
+ return f"""Extract and summarize PDF notes based on topics:
  {topic_descriptions}
 
+ Instructions:
+ - Extract exact quotes per topic.
+ - Ignore irrelevant topics.
 
+ Format:
  [Topic]
  - "Exact quote"
 
+ Meeting Notes:
  {survey_response}
  """
 
  def query_api(self, payload):
+ try:
+ res = requests.post(API_URL, headers=headers, json=payload, timeout=60)
+ res.raise_for_status()
+ return res.json()
+ except requests.exceptions.RequestException as e:
+ st.error(f"API request failed: {e}")
+ return {'outputs': {'out-0': ''}}
 
  def extract_meeting_notes(self, response):
+ return response.get('outputs', {}).get('out-0', '')
 
  def process_dataframe(self, df, topics):
  results = []
  for _, row in df.iterrows():
  llm_input = self.prepare_llm_input(row['Document_Text'], topics)
+ payload = {"user_id": "user", "in-0": llm_input}
  response = self.query_api(payload)
+ notes = self.extract_meeting_notes(response)
+ results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
+ return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
 
  # ---------------------------------------------------------------------------------------
+ # Helper Functions
  # ---------------------------------------------------------------------------------------
  @st.cache_resource
  def load_smol_docling():
  device = "cuda" if torch.cuda.is_available() else "cpu"
  processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
  model = AutoModelForVision2Seq.from_pretrained(
+ "ds4sd/SmolDocling-256M-preview", torch_dtype=torch.float32
  ).to(device)
  return model, processor
 
  model, processor = load_smol_docling()
 
  def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600):
  images = []
  doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+ for page in doc:
  pix = page.get_pixmap(dpi=dpi)
+ img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
+ img.thumbnail((max_size, max_size), Image.LANCZOS)
+ images.append(img)
  return images
 
  def extract_markdown_from_image(image):
  device = "cuda" if torch.cuda.is_available() else "cpu"
+ prompt = processor.apply_chat_template([{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]}], add_generation_prompt=True)
  inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
+ with torch.no_grad():
  generated_ids = model.generate(**inputs, max_new_tokens=1024)
+ doctags = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=False)[0].replace("<end_of_utterance>", "").strip()
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
  doc = DoclingDocument(name="ExtractedDocument")
  doc.load_from_doctags(doctags_doc)
+ return doc.export_to_markdown()
 
+ def extract_excerpts(processed_df):
+ rows = []
+ for _, r in processed_df.iterrows():
+ for sec in re.split(r'\n(?=\[)', r['Topic_Summary']):
+ topic_match = re.match(r'\[([^\]]+)\]', sec)
+ if topic_match:
+ topic = topic_match.group(1)
+ excerpts = re.findall(r'- "([^"]+)"', sec)
+ for excerpt in excerpts:
+ rows.append({'Document_Text': r['Document_Text'], 'Topic_Summary': r['Topic_Summary'], 'Excerpt': excerpt, 'Topic': topic})
+ return pd.DataFrame(rows)
+
+ # ---------------------------------------------------------------------------------------
  # Streamlit UI
+ # ---------------------------------------------------------------------------------------
  st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
 
  uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
 
+ if uploaded_file and not st.session_state['pdf_processed']:
+ with st.spinner("Processing PDF..."):
+ images = convert_pdf_to_images(uploaded_file)
+ markdown_texts = [extract_markdown_from_image(img) for img in images]
+ st.session_state['df'] = pd.DataFrame({'Document_Text': markdown_texts})
+ st.session_state['pdf_processed'] = True
+ st.success("PDF processed successfully!")
 
+ if st.session_state['pdf_processed']:
+ st.markdown("### Extracted Text Preview")
+ st.write(st.session_state['df'].head())
 
  st.markdown("### Enter Topics and Descriptions")
+ num_topics = st.number_input("Number of topics", 1, 10, 1)
  topics = {}
  for i in range(num_topics):
  topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
+ desc = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
+ if topic and desc:
+ topics[topic] = desc
 
  if st.button("Run Analysis"):
  if not topics:
  st.warning("Please enter at least one topic and description.")
  st.stop()
 
  analyzer = SurveyAnalysis()
+ processed_df = analyzer.process_dataframe(st.session_state['df'], topics)
+ extracted_df = extract_excerpts(processed_df)
 
+ st.markdown("### Extracted Excerpts")
+ st.dataframe(extracted_df)
 
+ csv = extracted_df.to_csv(index=False)
+ st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")
 
+ topic_counts = extracted_df['Topic'].value_counts()
+ fig, ax = plt.subplots()
+ topic_counts.plot.bar(ax=ax, color='#3d9aa1')
  st.pyplot(fig)
 
+ if st.button("Reset / Upload New PDF"):
+ for key in ['pdf_processed', 'markdown_texts', 'df']:
+ st.session_state[key] = False if key == 'pdf_processed' else []
+ st.experimental_rerun()
+
+ if not uploaded_file:
  st.info("Please upload a PDF file to begin.")