ProfessorLeVesseur committed
Commit d03227a · verified · 1 Parent(s): d6d185b

Update app.py

Files changed (1)
  1. app.py +65 -329
app.py CHANGED
@@ -1,309 +1,3 @@
- # # ---------------------------------------------------------------------------------------
- # # Imports and Options
- # # ---------------------------------------------------------------------------------------
- # import streamlit as st
- # import pandas as pd
- # import requests
- # import re
- # import fitz  # PyMuPDF
- # import io
- # import matplotlib.pyplot as plt
- # from PIL import Image
- # from mlx_vlm import load, generate
- # from mlx_vlm.prompt_utils import apply_chat_template
- # from mlx_vlm.utils import load_config, stream_generate
- # from docling_core.types.doc.document import DocTagsDocument, DoclingDocument
-
- # # Set Streamlit to wide mode
- # # st.set_page_config(layout="wide")
-
- # # ---------------------------------------------------------------------------------------
- # # API Configuration
- # # ---------------------------------------------------------------------------------------
- # API_URL = "https://api.stack-ai.com/inference/v0/run/2df89a6c-a4af-4576-880e-27058e498f02/67acad8b0603ba4631db38e7"
- # headers = {
- #     'Authorization': 'Bearer a9e4979e-cdbe-49ea-a193-53562a784805',
- #     'Content-Type': 'application/json'
- # }
-
- # # ---------------------------------------------------------------------------------------
- # # Survey Analysis Class
- # # ---------------------------------------------------------------------------------------
- # class SurveyAnalysis:
- #     def __init__(self, api_key=None):
- #         self.api_key = api_key
-
- #     def prepare_llm_input(self, survey_response, topics):
- #         # Create topic description string from user input
- #         topic_descriptions = "\n".join([f"- **{topic}**: {description}" for topic, description in topics.items()])
-
- #         llm_input = f"""
- #         Your task is to review PDF docling and extract information related to the provided topics. Here are the topic descriptions:
-
- #         {topic_descriptions}
-
- #         **Instructions:**
- #         - Extract and summarize the PDF focusing only on the provided topics.
- #         - If a topic is not mentioned in the notes, it should not be included in the Topic_Summary.
- #         - Use **exact quotes** from the original text for each point in your Topic_Summary.
- #         - Exclude erroneous content.
- #         - Do not add additional explanations or instructions.
-
- #         **Format your response as follows:**
- #         [Topic]
- #         - "Exact quote"
- #         - "Exact quote"
- #         - "Exact quote"
-
- #         **Meeting Notes:**
- #         {survey_response}
- #         """
- #         return llm_input
-
- #     def query_api(self, payload):
- #         response = requests.post(API_URL, headers=headers, json=payload)
- #         return response.json()
-
- #     def extract_meeting_notes(self, response):
- #         output = response.get('outputs', {}).get('out-0', '')
- #         return output
-
- #     def process_dataframe(self, df, topics):
- #         results = []
- #         for _, row in df.iterrows():
- #             llm_input = self.prepare_llm_input(row['Document_Text'], topics)
- #             payload = {
- #                 "user_id": "<USER or Conversation ID>",
- #                 "in-0": llm_input
- #             }
- #             response = self.query_api(payload)
- #             meeting_notes = self.extract_meeting_notes(response)
- #             results.append({
- #                 'Document_Text': row['Document_Text'],
- #                 'Topic_Summary': meeting_notes
- #             })
-
- #         result_df = pd.DataFrame(results)
- #         df = df.reset_index(drop=True)
- #         return pd.concat([df, result_df[['Topic_Summary']]], axis=1)
-
- # # ---------------------------------------------------------------------------------------
- # # Function to Extract Excerpts
- # # ---------------------------------------------------------------------------------------
- # def extract_excerpts(processed_df):
- #     new_rows = []
-
- #     for _, row in processed_df.iterrows():
- #         Topic_Summary = row['Topic_Summary']
-
- #         # Split the Topic_Summary by topic
- #         sections = re.split(r'\n(?=\[)', Topic_Summary)
-
- #         for section in sections:
- #             # Extract the topic
- #             topic_match = re.match(r'\[([^\]]+)\]', section)
- #             if topic_match:
- #                 topic = topic_match.group(1)
-
- #                 # Extract all excerpts within the section
- #                 excerpts = re.findall(r'- "([^"]+)"', section)
-
- #                 for excerpt in excerpts:
- #                     new_rows.append({
- #                         'Document_Text': row['Document_Text'],
- #                         'Topic_Summary': row['Topic_Summary'],
- #                         'Excerpt': excerpt,
- #                         'Topic': topic
- #                     })
-
- #     return pd.DataFrame(new_rows)
-
- # #------------------------------------------------------------------------
- # # Streamlit Configuration
- # #------------------------------------------------------------------------
-
- # # Set page configuration
- # st.set_page_config(
- #     page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
- #     page_icon=":bar_chart:",
- #     layout="centered",
- #     initial_sidebar_state="auto",
- #     menu_items={
- #         'Get Help': 'mailto:[email protected]',
- #         'About': "This app is built to support PDF analysis"
- #     }
- # )
-
- # #------------------------------------------------------------------------
- # # Sidebar
- # #------------------------------------------------------------------------
-
- # # Sidebar with image
- # with st.sidebar:
- #     # Set the desired width in pixels
- #     image_width = 300
- #     # Define the path to the image
- #     # image_path = "steelcase_small.png"
- #     image_path = "mtss.ai_small.png"
- #     # Display the image
- #     st.image(image_path, width=image_width)
-
- #     # Additional sidebar content
-
- #     with st.expander("**MTSS.ai**", expanded=True):
- #         st.write("""
- #         - **Support**: Cheyne LeVesseur PhD
- #         - **Email**: [email protected]
- #         """)
- #     st.divider()
- #     st.subheader('Instructions')
-
- #     Instructions = """
- #     - **Step 1**: Upload your PDF file.
- #     - **Step 2**: Review the processed text.
- #     - **Step 3**: Add your topics and descriptions of interest.
- #     - **Step 4**: Review the extracted excerpts and classifications, and topic distribution and frequency.
- #     - **Step 5**: Review bar charts of topics.
- #     - **Step 6**: Download the processed data as a CSV file.
- #     """
- #     st.markdown(Instructions)
-
- # # Load SmolDocling model ()
- # @st.cache_resource
- # def load_smol_docling():
- #     model_path = "ds4sd/SmolDocling-256M-preview"
- #     model, processor = load(model_path)
- #     config = load_config(model_path)
- #     return model, processor, config
-
- # model, processor, config = load_smol_docling()
-
- # # Convert PDF to images
- # def convert_pdf_to_images(pdf_file):
- #     images = []
- #     doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
- #     for page_number in range(len(doc)):
- #         page = doc.load_page(page_number)
- #         pix = page.get_pixmap(dpi=300)  # Higher DPI for clarity
- #         img_data = pix.tobytes("png")
- #         image = Image.open(io.BytesIO(img_data))
- #         images.append(image)
- #     return images
-
- # # Extract structured markdown text using SmolDocling (mlx_vlm)
- # def extract_markdown_from_image(image):
- #     prompt = "Convert this page to docling."
- #     formatted_prompt = apply_chat_template(processor, config, prompt, num_images=1)
- #     output = ""
-
- #     for token in stream_generate(
- #             model, processor, formatted_prompt, [image], max_tokens=4096, verbose=False):
- #         output += token.text
- #         if "</doctag>" in token.text:
- #             break
-
- #     # Convert DocTags to Markdown
- #     doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([output], [image])
- #     doc = DoclingDocument(name="ExtractedDocument")
- #     doc.load_from_doctags(doctags_doc)
- #     markdown_text = doc.export_to_markdown()
- #     return markdown_text
-
- # # Streamlit UI
- # st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
-
- # uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
-
- # if uploaded_file:
- #     with st.spinner("Processing PDF..."):
- #         images = convert_pdf_to_images(uploaded_file)
-
- #         markdown_texts = []
- #         for idx, image in enumerate(images):
- #             markdown_text = extract_markdown_from_image(image)
- #             markdown_texts.append(markdown_text)
-
- #         df = pd.DataFrame({'Document_Text': markdown_texts})
-
- #     st.success("PDF processed successfully!")
-
- #     # Check if extraction was successful
- #     if df.empty or df['Document_Text'].isnull().all():
- #         st.error("No meaningful text extracted from the PDF.")
- #         st.stop()
-
- #     st.markdown("### Extracted Markdown Preview")
- #     st.write(df.head())
-
- #     # ---------------------------------------------------------------------------------------
- #     # User Input for Topics
- #     # ---------------------------------------------------------------------------------------
- #     st.markdown("### Enter Topics and Descriptions")
- #     num_topics = st.number_input("Number of topics", min_value=1, max_value=10, value=1, step=1)
-
- #     topics = {}
- #     for i in range(num_topics):
- #         topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
- #         description = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
- #         if topic and description:
- #             topics[topic] = description
-
- #     # Add a button to execute the analysis
- #     if st.button("Run Analysis"):
- #         if not topics:
- #             st.warning("Please enter at least one topic and description.")
- #             st.stop()
-
- #         # ---------------------------------------------------------------------------------------
- #         # Your existing SurveyAnalysis and extract_excerpts functions remain unchanged here:
- #         # ---------------------------------------------------------------------------------------
- #         analyzer = SurveyAnalysis()
- #         processed_df = analyzer.process_dataframe(df, topics)
- #         df_VIP_extracted = extract_excerpts(processed_df)
-
- #         required_columns = ['Document_Text', 'Topic_Summary', 'Excerpt', 'Topic']
- #         missing_columns = [col for col in required_columns if col not in df_VIP_extracted.columns]
-
- #         if missing_columns:
- #             st.error(f"Missing columns after processing: {missing_columns}")
- #             st.stop()
-
- #         df_VIP_extracted = df_VIP_extracted[required_columns]
-
- #         st.markdown("### Processed Meeting Notes")
- #         st.dataframe(df_VIP_extracted)
-
- #         st.write(f"**Number of meeting notes analyzed:** {len(df)}")
- #         st.write(f"**Number of excerpts extracted:** {len(df_VIP_extracted)}")
-
- #         # CSV download
- #         csv = df_VIP_extracted.to_csv(index=False)
- #         st.download_button(
- #             "Download data as CSV",
- #             data=csv,
- #             file_name='extracted_meeting_notes.csv',
- #             mime='text/csv'
- #         )
-
- #         # Topic distribution visualization
- #         topic_counts = df_VIP_extracted['Topic'].value_counts()
- #         frequency_table = pd.DataFrame({'Topic': topic_counts.index, 'Count': topic_counts.values})
- #         frequency_table['Percentage'] = (frequency_table['Count'] / frequency_table['Count'].sum() * 100).round(0)
-
- #         st.markdown("### Topic Distribution")
- #         st.dataframe(frequency_table)
-
- #         fig, ax = plt.subplots(figsize=(10, 5))
- #         ax.bar(frequency_table['Topic'], frequency_table['Count'], color='#3d9aa1')
- #         ax.set_ylabel('Count')
- #         ax.set_title('Frequency of Topics')
- #         st.pyplot(fig)
-
- # else:
- #     st.info("Please upload a PDF file to begin.")
-
-
-
  # ---------------------------------------------------------------------------------------
  # Imports and Options
  # ---------------------------------------------------------------------------------------
@@ -320,6 +14,9 @@ from docling_core.types.doc import DoclingDocument
  from docling_core.types.doc.document import DocTagsDocument
  import torch

+ import logging
+ logging.basicConfig(level=logging.INFO)
+
  # ---------------------------------------------------------------------------------------
  # API Configuration
  # ---------------------------------------------------------------------------------------
@@ -484,54 +181,93 @@ def load_smol_docling():

  model, processor = load_smol_docling()

- # Convert PDF to images
- def convert_pdf_to_images(pdf_file):
+ # # Convert PDF to images
+ # def convert_pdf_to_images(pdf_file):
+ #     images = []
+ #     doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+ #     for page_number in range(len(doc)):
+ #         page = doc.load_page(page_number)
+ #         pix = page.get_pixmap(dpi=300)  # Higher DPI for clarity
+ #         img_data = pix.tobytes("png")
+ #         image = Image.open(io.BytesIO(img_data))
+ #         images.append(image)
+ #     return images
+
+ # Improved PDF to image conversion
+ def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600):
      images = []
      doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
      for page_number in range(len(doc)):
          page = doc.load_page(page_number)
-         pix = page.get_pixmap(dpi=300)  # Higher DPI for clarity
+         pix = page.get_pixmap(dpi=dpi)
          img_data = pix.tobytes("png")
-         image = Image.open(io.BytesIO(img_data))
+         image = Image.open(io.BytesIO(img_data)).convert("RGB")
+         # Resize image to max dimension
+         image.thumbnail((max_size, max_size), Image.LANCZOS)
          images.append(image)
      return images

  # Extract structured markdown text using SmolDocling (transformers)
+ # def extract_markdown_from_image(image):
+ #     prompt_text = "Convert this page to docling."
+ #     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ #     # Prepare inputs
+ #     messages = [
+ #         {
+ #             "role": "user",
+ #             "content": [
+ #                 {"type": "image"},
+ #                 {"type": "text", "text": prompt_text}
+ #             ]
+ #         }
+ #     ]
+ #     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+ #     inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
+
+ #     # Generate outputs
+ #     generated_ids = model.generate(**inputs, max_new_tokens=1024)
+ #     prompt_length = inputs.input_ids.shape[1]
+ #     trimmed_generated_ids = generated_ids[:, prompt_length:]
+ #     doctags = processor.batch_decode(trimmed_generated_ids, skip_special_tokens=False)[0].lstrip()
+
+ #     # Clean the output
+ #     doctags = doctags.replace("<end_of_utterance>", "").strip()
+
+ #     # Populate document
+ #     doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
+
+ #     # Create a docling document
+ #     doc = DoclingDocument(name="ExtractedDocument")
+ #     doc.load_from_doctags(doctags_doc)
+
+ #     # Export as markdown
+ #     markdown_text = doc.export_to_markdown()
+ #     return markdown_text
+
  def extract_markdown_from_image(image):
+     start_time = time.time()
      prompt_text = "Convert this page to docling."
      device = "cuda" if torch.cuda.is_available() else "cpu"

-     # Prepare inputs
-     messages = [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image"},
-                 {"type": "text", "text": prompt_text}
-             ]
-         }
-     ]
+     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}]
      prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
      inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)

-     # Generate outputs
-     generated_ids = model.generate(**inputs, max_new_tokens=1024)
+     with torch.no_grad():  # <-- Crucial for speed
+         generated_ids = model.generate(**inputs, max_new_tokens=1024)
+
      prompt_length = inputs.input_ids.shape[1]
      trimmed_generated_ids = generated_ids[:, prompt_length:]
      doctags = processor.batch_decode(trimmed_generated_ids, skip_special_tokens=False)[0].lstrip()
-
-     # Clean the output
      doctags = doctags.replace("<end_of_utterance>", "").strip()

-     # Populate document
      doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
-
-     # Create a docling document
      doc = DoclingDocument(name="ExtractedDocument")
      doc.load_from_doctags(doctags_doc)
-
-     # Export as markdown
      markdown_text = doc.export_to_markdown()
+     processing_time = time.time() - start_time
+     logging.info(f"Inference took {processing_time:.2f} seconds")
      return markdown_text

  # Streamlit UI
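
For orientation, here is a minimal sketch of how the two updated helpers fit together. pdf_to_dataframe is a hypothetical wrapper written for illustration, not part of this commit; it assumes the convert_pdf_to_images and extract_markdown_from_image definitions above, plus an import time somewhere in the unshown parts of app.py, since the new timing code calls time.time() but no such import appears in the visible hunks.

import pandas as pd

# Hypothetical glue code, for illustration only.
def pdf_to_dataframe(pdf_file):
    # dpi=150 plus the 1600 px thumbnail cap trades a little image fidelity
    # for much smaller inputs to the 256M SmolDocling model.
    images = convert_pdf_to_images(pdf_file, dpi=150, max_size=1600)
    # Each page image becomes DocTags, then markdown; per-page inference
    # time is logged via logging.info inside extract_markdown_from_image.
    markdown_texts = [extract_markdown_from_image(img) for img in images]
    return pd.DataFrame({'Document_Text': markdown_texts})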