ProfessorLeVesseur committed on
Commit
ca854bd
·
verified ·
1 Parent(s): 8275a49

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -273
app.py CHANGED
@@ -1,264 +1,3 @@
1
- # # ---------------------------------------------------------------------------------------
2
- # # Imports and Options
3
- # # ---------------------------------------------------------------------------------------
4
- # import streamlit as st
5
- # import pandas as pd
6
- # import requests
7
- # import re
8
- # import fitz # PyMuPDF
9
- # import io
10
- # import matplotlib.pyplot as plt
11
- # from PIL import Image
12
- # from transformers import AutoProcessor, AutoModelForVision2Seq
13
- # from docling_core.types.doc import DoclingDocument
14
- # from docling_core.types.doc.document import DocTagsDocument
15
- # import torch
16
- # import os
17
- # from huggingface_hub import InferenceClient
18
-
19
- # # ---------------------------------------------------------------------------------------
20
- # # Streamlit Page Configuration
21
- # # ---------------------------------------------------------------------------------------
22
- # st.set_page_config(
23
- # page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
24
- # page_icon=":bar_chart:",
25
- # layout="centered",
26
- # initial_sidebar_state="auto",
27
- # menu_items={
28
- # 'Get Help': 'mailto:[email protected]',
29
- # 'About': "This app is built to support PDF analysis"
30
- # }
31
- # )
32
-
33
- # # ---------------------------------------------------------------------------------------
34
- # # Session State Initialization
35
- # # ---------------------------------------------------------------------------------------
36
- # for key in ['pdf_processed', 'markdown_texts', 'df']:
37
- # if key not in st.session_state:
38
- # st.session_state[key] = False if key == 'pdf_processed' else []
39
-
40
- # # ---------------------------------------------------------------------------------------
41
- # # API Configuration
42
- # # ---------------------------------------------------------------------------------------
43
- # # API_URL = "https://api.stack-ai.com/inference/v0/run/2df89a6c-a4af-4576-880e-27058e498f02/67acad8b0603ba4631db38e7"
44
- # # headers = {
45
- # # 'Authorization': 'Bearer a9e4979e-cdbe-49ea-a193-53562a784805',
46
- # # 'Content-Type': 'application/json'
47
- # # }
48
-
49
- # # Retrieve Hugging Face API key from environment variables
50
- # hf_api_key = os.getenv('HF_API_KEY')
51
- # if not hf_api_key:
52
- # raise ValueError("HF_API_KEY not set in environment variables")
53
-
54
- # # Create the Hugging Face inference client
55
- # client = InferenceClient(api_key=hf_api_key)
56
-
57
- # # # ---------------------------------------------------------------------------------------
58
- # # # Survey Analysis Class
59
- # # # ---------------------------------------------------------------------------------------
60
- # # class SurveyAnalysis:
61
- # # def prepare_llm_input(self, survey_response, topics):
62
- # # topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
63
- # # return f"""Extract and summarize PDF notes based on topics:
64
- # # {topic_descriptions}
65
-
66
- # # Instructions:
67
- # # - Extract exact quotes per topic.
68
- # # - Ignore irrelevant topics.
69
-
70
- # # Format:
71
- # # [Topic]
72
- # # - "Exact quote"
73
-
74
- # # Meeting Notes:
75
- # # {survey_response}
76
- # # """
77
-
78
- # # def query_api(self, payload):
79
- # # try:
80
- # # res = requests.post(API_URL, headers=headers, json=payload, timeout=60)
81
- # # res.raise_for_status()
82
- # # return res.json()
83
- # # except requests.exceptions.RequestException as e:
84
- # # st.error(f"API request failed: {e}")
85
- # # return {'outputs': {'out-0': ''}}
86
-
87
- # # def extract_meeting_notes(self, response):
88
- # # return response.get('outputs', {}).get('out-0', '')
89
-
90
- # # def process_dataframe(self, df, topics):
91
- # # results = []
92
- # # for _, row in df.iterrows():
93
- # # llm_input = self.prepare_llm_input(row['Document_Text'], topics)
94
- # # payload = {"user_id": "user", "in-0": llm_input}
95
- # # response = self.query_api(payload)
96
- # # notes = self.extract_meeting_notes(response)
97
- # # results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
98
- # # return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
99
-
100
- # # ---------------------------------------------------------------------------------------
101
- # # Survey Analysis Class
102
- # # ---------------------------------------------------------------------------------------
103
- # class SurveyAnalysis:
104
- # def prepare_llm_input(self, survey_response, topics):
105
- # topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
106
- # return f"""Extract and summarize PDF notes based on topics:
107
- # {topic_descriptions}
108
-
109
- # Instructions:
110
- # - Extract exact quotes per topic.
111
- # - Ignore irrelevant topics.
112
-
113
- # Format:
114
- # [Topic]
115
- # - "Exact quote"
116
-
117
- # Meeting Notes:
118
- # {survey_response}
119
- # """
120
-
121
- # def prompt_response_from_hf_llm(self, llm_input):
122
- # # Define a system prompt to guide the model's responses
123
- # system_prompt = """
124
- # <Persona> An expert Implementation Specialist at Michigan's Multi-Tiered System of Support Technical Assistance Center (MiMTSS TA Center) with deep expertise in SWPBIS, SEL, Structured Literacy, Science of Reading, and family engagement practices.</Persona>
125
- # <Task> Analyze educational data and provide evidence-based recommendations for improving student outcomes across multiple tiers of support, drawing from established frameworks in behavioral interventions, literacy instruction, and family engagement.</Task>
126
- # <Context> Operating within Michigan's educational system to support schools in implementing multi-tiered support systems, with access to student metrics data and knowledge of state-specific educational requirements and MTSS frameworks. </Context>
127
- # <Format> Deliver insights through clear, actionable recommendations supported by data analysis, incorporating technical expertise while maintaining accessibility for educators and administrators at various levels of MTSS implementation.</Format>
128
- # """
129
-
130
- # # Generate the refined prompt using Hugging Face API
131
- # response = client.chat.completions.create(
132
- # model="meta-llama/Llama-3.1-70B-Instruct",
133
- # messages=[
134
- # {"role": "system", "content": system_prompt}, # Add system prompt here
135
- # {"role": "user", "content": llm_input}
136
- # ],
137
- # stream=True,
138
- # temperature=0.5,
139
- # max_tokens=1024,
140
- # top_p=0.7
141
- # )
142
-
143
- # # Combine messages if response is streamed
144
- # response_content = ""
145
- # for message in response:
146
- # response_content += message.choices[0].delta.content
147
-
148
- # return response_content.strip()
149
-
150
- # def extract_text(self, response):
151
- # return response
152
-
153
- # def process_dataframe(self, df, topics):
154
- # results = []
155
- # for _, row in df.iterrows():
156
- # llm_input = self.prepare_llm_input(row['Document_Text'], topics)
157
- # response = self.prompt_response_from_hf_llm(llm_input)
158
- # notes = self.extract_text(response)
159
- # results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
160
- # return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
161
-
162
- # # ---------------------------------------------------------------------------------------
163
- # # Helper Functions
164
- # # ---------------------------------------------------------------------------------------
165
- # @st.cache_resource
166
- # def load_smol_docling():
167
- # device = "cuda" if torch.cuda.is_available() else "cpu"
168
- # processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
169
- # model = AutoModelForVision2Seq.from_pretrained(
170
- # "ds4sd/SmolDocling-256M-preview", torch_dtype=torch.float32
171
- # ).to(device)
172
- # return model, processor
173
-
174
- # model, processor = load_smol_docling()
175
-
176
- # def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600):
177
- # images = []
178
- # doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
179
- # for page in doc:
180
- # pix = page.get_pixmap(dpi=dpi)
181
- # img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
182
- # img.thumbnail((max_size, max_size), Image.LANCZOS)
183
- # images.append(img)
184
- # return images
185
-
186
- # def extract_markdown_from_image(image):
187
- # device = "cuda" if torch.cuda.is_available() else "cpu"
188
- # prompt = processor.apply_chat_template([{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]}], add_generation_prompt=True)
189
- # inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
190
- # with torch.no_grad():
191
- # generated_ids = model.generate(**inputs, max_new_tokens=1024)
192
- # doctags = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=False)[0].replace("<end_of_utterance>", "").strip()
193
- # doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
194
- # doc = DoclingDocument(name="ExtractedDocument")
195
- # doc.load_from_doctags(doctags_doc)
196
- # return doc.export_to_markdown()
197
-
198
- # def extract_excerpts(processed_df):
199
- # rows = []
200
- # for _, r in processed_df.iterrows():
201
- # for sec in re.split(r'\n(?=\[)', r['Topic_Summary']):
202
- # topic_match = re.match(r'\[([^\]]+)\]', sec)
203
- # if topic_match:
204
- # topic = topic_match.group(1)
205
- # excerpts = re.findall(r'- "([^"]+)"', sec)
206
- # for excerpt in excerpts:
207
- # rows.append({'Document_Text': r['Document_Text'], 'Topic_Summary': r['Topic_Summary'], 'Excerpt': excerpt, 'Topic': topic})
208
- # return pd.DataFrame(rows)
209
-
210
- # # ---------------------------------------------------------------------------------------
211
- # # Streamlit UI
212
- # # ---------------------------------------------------------------------------------------
213
- # st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
214
-
215
- # uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
216
-
217
- # if uploaded_file and not st.session_state['pdf_processed']:
218
- # with st.spinner("Processing PDF..."):
219
- # images = convert_pdf_to_images(uploaded_file)
220
- # markdown_texts = [extract_markdown_from_image(img) for img in images]
221
- # st.session_state['df'] = pd.DataFrame({'Document_Text': markdown_texts})
222
- # st.session_state['pdf_processed'] = True
223
- # st.success("PDF processed successfully!")
224
-
225
- # if st.session_state['pdf_processed']:
226
- # st.markdown("### Extracted Text Preview")
227
- # st.write(st.session_state['df'].head())
228
-
229
- # st.markdown("### Enter Topics and Descriptions")
230
- # num_topics = st.number_input("Number of topics", 1, 10, 1)
231
- # topics = {}
232
- # for i in range(num_topics):
233
- # topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
234
- # desc = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
235
- # if topic and desc:
236
- # topics[topic] = desc
237
-
238
- # if st.button("Run Analysis"):
239
- # if not topics:
240
- # st.warning("Please enter at least one topic and description.")
241
- # st.stop()
242
-
243
- # analyzer = SurveyAnalysis()
244
- # processed_df = analyzer.process_dataframe(st.session_state['df'], topics)
245
- # extracted_df = extract_excerpts(processed_df)
246
-
247
- # st.markdown("### Extracted Excerpts")
248
- # st.dataframe(extracted_df)
249
-
250
- # csv = extracted_df.to_csv(index=False)
251
- # st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")
252
-
253
- # topic_counts = extracted_df['Topic'].value_counts()
254
- # fig, ax = plt.subplots()
255
- # topic_counts.plot.bar(ax=ax, color='#3d9aa1')
256
- # st.pyplot(fig)
257
-
258
- # if not uploaded_file:
259
- # st.info("Please upload a PDF file to begin.")
260
-
261
-
262
  # ---------------------------------------------------------------------------------------
263
  # Imports and Options
264
  # ---------------------------------------------------------------------------------------
@@ -291,6 +30,39 @@ st.set_page_config(
291
  }
292
  )
293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  # ---------------------------------------------------------------------------------------
295
  # Session State Initialization
296
  # ---------------------------------------------------------------------------------------
@@ -314,7 +86,7 @@ class AIAnalysis:
314
  def __init__(self, client):
315
  self.client = client
316
 
317
- def prepare_llm_input(self, survey_response, topics):
318
  topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
319
  return f"""Extract and summarize PDF notes based on topics:
320
  {topic_descriptions}
@@ -327,8 +99,8 @@ Instructions:
327
  [Topic]
328
  - "Exact quote"
329
 
330
- Meeting Notes:
331
- {survey_response}
332
  """
333
 
334
  def prompt_response_from_hf_llm(self, llm_input):
@@ -376,15 +148,6 @@ Meeting Notes:
376
  results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
377
  return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
378
 
379
- def process_dataframe(self, df, topics):
380
- results = []
381
- for _, row in df.iterrows():
382
- llm_input = self.prepare_llm_input(row['Document_Text'], topics)
383
- response = self.prompt_response_from_hf_llm(llm_input)
384
- notes = self.extract_text(response)
385
- results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
386
- return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
387
-
388
  # ---------------------------------------------------------------------------------------
389
  # Helper Functions
390
  # ---------------------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # ---------------------------------------------------------------------------------------
2
  # Imports and Options
3
  # ---------------------------------------------------------------------------------------
 
30
  }
31
  )
32
 
33
+ # ---------------------------------------------------------------------------------------
34
+ # Streamlit Sidebar
35
+ # ---------------------------------------------------------------------------------------
36
+
37
+ st.sidebar.title("📌 About This App")
38
+
39
+ st.sidebar.markdown("""
40
+ #### ⚠️ **Important Note on Processing Time**
41
+
42
+ This app uses the **SmolDocling** model (`ds4sd/SmolDocling-256M-preview`) to convert PDF pages into markdown text. Currently, the model is running on a CPU-based environment (**CPU basic | 2 vCPU - 16 GB RAM**), and therefore processing each page can take a significant amount of time (approximately **6 minutes per page**).
43
+
44
+ This setup is suitable for testing and demonstration purposes, but **not efficient for real-world usage**.
45
+
46
+ For faster processing, consider running the optimized version `ds4sd/SmolDocling-256M-preview-mlx-bf16` locally on a MacBook, where it performs significantly faster.
47
+
48
+ ---
49
+
50
+ #### 🛠️ **How This App Works**
51
+
52
+ Here's a quick overview of the workflow:
53
+
54
+ 1. **Upload PDF**: You upload a PDF document using the uploader provided.
55
+ 2. **Convert PDF to Images**: The PDF is converted into individual images (one per page).
56
+ 3. **Extract Markdown from Images**: Each image is processed by the SmolDocling model to extract markdown-formatted text.
57
+ 4. **Enter Topics and Descriptions**: You provide specific topics and their descriptions you'd like to extract from the document.
58
+ 5. **Extract Excerpts**: The app uses the **meta-llama/Llama-3.1-70B-Instruct** model to extract exact quotes relevant to your provided topics.
59
+ 6. **Results in a DataFrame**: All extracted quotes and their topics are compiled into a structured DataFrame that you can preview and download.
60
+
61
+ ---
62
+
63
+ Please proceed by uploading your PDF file to begin the analysis.
64
+ """)
65
+
66
  # ---------------------------------------------------------------------------------------
67
  # Session State Initialization
68
  # ---------------------------------------------------------------------------------------
 
86
  def __init__(self, client):
87
  self.client = client
88
 
89
+ def prepare_llm_input(self, document_content, topics):
90
  topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
91
  return f"""Extract and summarize PDF notes based on topics:
92
  {topic_descriptions}
 
99
  [Topic]
100
  - "Exact quote"
101
 
102
+ Document Content:
103
+ {document_content}
104
  """
105
 
106
  def prompt_response_from_hf_llm(self, llm_input):
 
148
  results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
149
  return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
150
 
 
 
 
 
 
 
 
 
 
151
  # ---------------------------------------------------------------------------------------
152
  # Helper Functions
153
  # ---------------------------------------------------------------------------------------