ProfessorLeVesseur committed on
Commit
c13044d
·
verified ·
1 Parent(s): 00f1e2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +268 -53
app.py CHANGED
@@ -1,3 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # ---------------------------------------------------------------------------------------
2
  # Imports and Options
3
  # ---------------------------------------------------------------------------------------
@@ -40,12 +301,6 @@ for key in ['pdf_processed', 'markdown_texts', 'df']:
40
  # ---------------------------------------------------------------------------------------
41
  # API Configuration
42
  # ---------------------------------------------------------------------------------------
43
- # API_URL = "https://api.stack-ai.com/inference/v0/run/2df89a6c-a4af-4576-880e-27058e498f02/67acad8b0603ba4631db38e7"
44
- # headers = {
45
- # 'Authorization': 'Bearer a9e4979e-cdbe-49ea-a193-53562a784805',
46
- # 'Content-Type': 'application/json'
47
- # }
48
-
49
  # Retrieve Hugging Face API key from environment variables
50
  hf_api_key = os.getenv('HF_API_KEY')
51
  if not hf_api_key:
@@ -54,49 +309,6 @@ if not hf_api_key:
54
  # Create the Hugging Face inference client
55
  client = InferenceClient(api_key=hf_api_key)
56
 
57
- # # ---------------------------------------------------------------------------------------
58
- # # Survey Analysis Class
59
- # # ---------------------------------------------------------------------------------------
60
- # class SurveyAnalysis:
61
- # def prepare_llm_input(self, survey_response, topics):
62
- # topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
63
- # return f"""Extract and summarize PDF notes based on topics:
64
- # {topic_descriptions}
65
-
66
- # Instructions:
67
- # - Extract exact quotes per topic.
68
- # - Ignore irrelevant topics.
69
-
70
- # Format:
71
- # [Topic]
72
- # - "Exact quote"
73
-
74
- # Meeting Notes:
75
- # {survey_response}
76
- # """
77
-
78
- # def query_api(self, payload):
79
- # try:
80
- # res = requests.post(API_URL, headers=headers, json=payload, timeout=60)
81
- # res.raise_for_status()
82
- # return res.json()
83
- # except requests.exceptions.RequestException as e:
84
- # st.error(f"API request failed: {e}")
85
- # return {'outputs': {'out-0': ''}}
86
-
87
- # def extract_meeting_notes(self, response):
88
- # return response.get('outputs', {}).get('out-0', '')
89
-
90
- # def process_dataframe(self, df, topics):
91
- # results = []
92
- # for _, row in df.iterrows():
93
- # llm_input = self.prepare_llm_input(row['Document_Text'], topics)
94
- # payload = {"user_id": "user", "in-0": llm_input}
95
- # response = self.query_api(payload)
96
- # notes = self.extract_meeting_notes(response)
97
- # results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
98
- # return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
99
-
100
  # ---------------------------------------------------------------------------------------
101
  # Survey Analysis Class
102
  # ---------------------------------------------------------------------------------------
@@ -250,10 +462,13 @@ if st.session_state['pdf_processed']:
250
  csv = extracted_df.to_csv(index=False)
251
  st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")
252
 
253
- topic_counts = extracted_df['Topic'].value_counts()
254
- fig, ax = plt.subplots()
255
- topic_counts.plot.bar(ax=ax, color='#3d9aa1')
256
- st.pyplot(fig)
 
 
 
257
 
258
  if not uploaded_file:
259
  st.info("Please upload a PDF file to begin.")
 
1
+ # # ---------------------------------------------------------------------------------------
2
+ # # Imports and Options
3
+ # # ---------------------------------------------------------------------------------------
4
+ # import streamlit as st
5
+ # import pandas as pd
6
+ # import requests
7
+ # import re
8
+ # import fitz # PyMuPDF
9
+ # import io
10
+ # import matplotlib.pyplot as plt
11
+ # from PIL import Image
12
+ # from transformers import AutoProcessor, AutoModelForVision2Seq
13
+ # from docling_core.types.doc import DoclingDocument
14
+ # from docling_core.types.doc.document import DocTagsDocument
15
+ # import torch
16
+ # import os
17
+ # from huggingface_hub import InferenceClient
18
+
19
+ # # ---------------------------------------------------------------------------------------
20
+ # # Streamlit Page Configuration
21
+ # # ---------------------------------------------------------------------------------------
22
+ # st.set_page_config(
23
+ # page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
24
+ # page_icon=":bar_chart:",
25
+ # layout="centered",
26
+ # initial_sidebar_state="auto",
27
+ # menu_items={
28
+ # 'Get Help': 'mailto:[email protected]',
29
+ # 'About': "This app is built to support PDF analysis"
30
+ # }
31
+ # )
32
+
33
+ # # ---------------------------------------------------------------------------------------
34
+ # # Session State Initialization
35
+ # # ---------------------------------------------------------------------------------------
36
+ # for key in ['pdf_processed', 'markdown_texts', 'df']:
37
+ # if key not in st.session_state:
38
+ # st.session_state[key] = False if key == 'pdf_processed' else []
39
+
40
+ # # ---------------------------------------------------------------------------------------
41
+ # # API Configuration
42
+ # # ---------------------------------------------------------------------------------------
43
+ # # API_URL = "https://api.stack-ai.com/inference/v0/run/2df89a6c-a4af-4576-880e-27058e498f02/67acad8b0603ba4631db38e7"
44
+ # # headers = {
45
+ # # 'Authorization': 'Bearer a9e4979e-cdbe-49ea-a193-53562a784805',
46
+ # # 'Content-Type': 'application/json'
47
+ # # }
48
+
49
+ # # Retrieve Hugging Face API key from environment variables
50
+ # hf_api_key = os.getenv('HF_API_KEY')
51
+ # if not hf_api_key:
52
+ # raise ValueError("HF_API_KEY not set in environment variables")
53
+
54
+ # # Create the Hugging Face inference client
55
+ # client = InferenceClient(api_key=hf_api_key)
56
+
57
+ # # # ---------------------------------------------------------------------------------------
58
+ # # # Survey Analysis Class
59
+ # # # ---------------------------------------------------------------------------------------
60
+ # # class SurveyAnalysis:
61
+ # # def prepare_llm_input(self, survey_response, topics):
62
+ # # topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
63
+ # # return f"""Extract and summarize PDF notes based on topics:
64
+ # # {topic_descriptions}
65
+
66
+ # # Instructions:
67
+ # # - Extract exact quotes per topic.
68
+ # # - Ignore irrelevant topics.
69
+
70
+ # # Format:
71
+ # # [Topic]
72
+ # # - "Exact quote"
73
+
74
+ # # Meeting Notes:
75
+ # # {survey_response}
76
+ # # """
77
+
78
+ # # def query_api(self, payload):
79
+ # # try:
80
+ # # res = requests.post(API_URL, headers=headers, json=payload, timeout=60)
81
+ # # res.raise_for_status()
82
+ # # return res.json()
83
+ # # except requests.exceptions.RequestException as e:
84
+ # # st.error(f"API request failed: {e}")
85
+ # # return {'outputs': {'out-0': ''}}
86
+
87
+ # # def extract_meeting_notes(self, response):
88
+ # # return response.get('outputs', {}).get('out-0', '')
89
+
90
+ # # def process_dataframe(self, df, topics):
91
+ # # results = []
92
+ # # for _, row in df.iterrows():
93
+ # # llm_input = self.prepare_llm_input(row['Document_Text'], topics)
94
+ # # payload = {"user_id": "user", "in-0": llm_input}
95
+ # # response = self.query_api(payload)
96
+ # # notes = self.extract_meeting_notes(response)
97
+ # # results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
98
+ # # return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
99
+
100
+ # # ---------------------------------------------------------------------------------------
101
+ # # Survey Analysis Class
102
+ # # ---------------------------------------------------------------------------------------
103
+ # class SurveyAnalysis:
104
+ # def prepare_llm_input(self, survey_response, topics):
105
+ # topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
106
+ # return f"""Extract and summarize PDF notes based on topics:
107
+ # {topic_descriptions}
108
+
109
+ # Instructions:
110
+ # - Extract exact quotes per topic.
111
+ # - Ignore irrelevant topics.
112
+
113
+ # Format:
114
+ # [Topic]
115
+ # - "Exact quote"
116
+
117
+ # Meeting Notes:
118
+ # {survey_response}
119
+ # """
120
+
121
+ # def prompt_response_from_hf_llm(self, llm_input):
122
+ # # Define a system prompt to guide the model's responses
123
+ # system_prompt = """
124
+ # <Persona> An expert Implementation Specialist at Michigan's Multi-Tiered System of Support Technical Assistance Center (MiMTSS TA Center) with deep expertise in SWPBIS, SEL, Structured Literacy, Science of Reading, and family engagement practices.</Persona>
125
+ # <Task> Analyze educational data and provide evidence-based recommendations for improving student outcomes across multiple tiers of support, drawing from established frameworks in behavioral interventions, literacy instruction, and family engagement.</Task>
126
+ # <Context> Operating within Michigan's educational system to support schools in implementing multi-tiered support systems, with access to student metrics data and knowledge of state-specific educational requirements and MTSS frameworks. </Context>
127
+ # <Format> Deliver insights through clear, actionable recommendations supported by data analysis, incorporating technical expertise while maintaining accessibility for educators and administrators at various levels of MTSS implementation.</Format>
128
+ # """
129
+
130
+ # # Generate the refined prompt using Hugging Face API
131
+ # response = client.chat.completions.create(
132
+ # model="meta-llama/Llama-3.1-70B-Instruct",
133
+ # messages=[
134
+ # {"role": "system", "content": system_prompt}, # Add system prompt here
135
+ # {"role": "user", "content": llm_input}
136
+ # ],
137
+ # stream=True,
138
+ # temperature=0.5,
139
+ # max_tokens=1024,
140
+ # top_p=0.7
141
+ # )
142
+
143
+ # # Combine messages if response is streamed
144
+ # response_content = ""
145
+ # for message in response:
146
+ # response_content += message.choices[0].delta.content
147
+
148
+ # return response_content.strip()
149
+
150
+ # def extract_text(self, response):
151
+ # return response
152
+
153
+ # def process_dataframe(self, df, topics):
154
+ # results = []
155
+ # for _, row in df.iterrows():
156
+ # llm_input = self.prepare_llm_input(row['Document_Text'], topics)
157
+ # response = self.prompt_response_from_hf_llm(llm_input)
158
+ # notes = self.extract_text(response)
159
+ # results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
160
+ # return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
161
+
162
+ # # ---------------------------------------------------------------------------------------
163
+ # # Helper Functions
164
+ # # ---------------------------------------------------------------------------------------
165
+ # @st.cache_resource
166
+ # def load_smol_docling():
167
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
168
+ # processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
169
+ # model = AutoModelForVision2Seq.from_pretrained(
170
+ # "ds4sd/SmolDocling-256M-preview", torch_dtype=torch.float32
171
+ # ).to(device)
172
+ # return model, processor
173
+
174
+ # model, processor = load_smol_docling()
175
+
176
+ # def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600):
177
+ # images = []
178
+ # doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
179
+ # for page in doc:
180
+ # pix = page.get_pixmap(dpi=dpi)
181
+ # img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
182
+ # img.thumbnail((max_size, max_size), Image.LANCZOS)
183
+ # images.append(img)
184
+ # return images
185
+
186
+ # def extract_markdown_from_image(image):
187
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
188
+ # prompt = processor.apply_chat_template([{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]}], add_generation_prompt=True)
189
+ # inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
190
+ # with torch.no_grad():
191
+ # generated_ids = model.generate(**inputs, max_new_tokens=1024)
192
+ # doctags = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=False)[0].replace("<end_of_utterance>", "").strip()
193
+ # doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
194
+ # doc = DoclingDocument(name="ExtractedDocument")
195
+ # doc.load_from_doctags(doctags_doc)
196
+ # return doc.export_to_markdown()
197
+
198
+ # def extract_excerpts(processed_df):
199
+ # rows = []
200
+ # for _, r in processed_df.iterrows():
201
+ # for sec in re.split(r'\n(?=\[)', r['Topic_Summary']):
202
+ # topic_match = re.match(r'\[([^\]]+)\]', sec)
203
+ # if topic_match:
204
+ # topic = topic_match.group(1)
205
+ # excerpts = re.findall(r'- "([^"]+)"', sec)
206
+ # for excerpt in excerpts:
207
+ # rows.append({'Document_Text': r['Document_Text'], 'Topic_Summary': r['Topic_Summary'], 'Excerpt': excerpt, 'Topic': topic})
208
+ # return pd.DataFrame(rows)
209
+
210
+ # # ---------------------------------------------------------------------------------------
211
+ # # Streamlit UI
212
+ # # ---------------------------------------------------------------------------------------
213
+ # st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
214
+
215
+ # uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
216
+
217
+ # if uploaded_file and not st.session_state['pdf_processed']:
218
+ # with st.spinner("Processing PDF..."):
219
+ # images = convert_pdf_to_images(uploaded_file)
220
+ # markdown_texts = [extract_markdown_from_image(img) for img in images]
221
+ # st.session_state['df'] = pd.DataFrame({'Document_Text': markdown_texts})
222
+ # st.session_state['pdf_processed'] = True
223
+ # st.success("PDF processed successfully!")
224
+
225
+ # if st.session_state['pdf_processed']:
226
+ # st.markdown("### Extracted Text Preview")
227
+ # st.write(st.session_state['df'].head())
228
+
229
+ # st.markdown("### Enter Topics and Descriptions")
230
+ # num_topics = st.number_input("Number of topics", 1, 10, 1)
231
+ # topics = {}
232
+ # for i in range(num_topics):
233
+ # topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
234
+ # desc = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
235
+ # if topic and desc:
236
+ # topics[topic] = desc
237
+
238
+ # if st.button("Run Analysis"):
239
+ # if not topics:
240
+ # st.warning("Please enter at least one topic and description.")
241
+ # st.stop()
242
+
243
+ # analyzer = SurveyAnalysis()
244
+ # processed_df = analyzer.process_dataframe(st.session_state['df'], topics)
245
+ # extracted_df = extract_excerpts(processed_df)
246
+
247
+ # st.markdown("### Extracted Excerpts")
248
+ # st.dataframe(extracted_df)
249
+
250
+ # csv = extracted_df.to_csv(index=False)
251
+ # st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")
252
+
253
+ # topic_counts = extracted_df['Topic'].value_counts()
254
+ # fig, ax = plt.subplots()
255
+ # topic_counts.plot.bar(ax=ax, color='#3d9aa1')
256
+ # st.pyplot(fig)
257
+
258
+ # if not uploaded_file:
259
+ # st.info("Please upload a PDF file to begin.")
260
+
261
+
262
  # ---------------------------------------------------------------------------------------
263
  # Imports and Options
264
  # ---------------------------------------------------------------------------------------
 
301
  # ---------------------------------------------------------------------------------------
302
  # API Configuration
303
  # ---------------------------------------------------------------------------------------
 
 
 
 
 
 
304
  # Retrieve Hugging Face API key from environment variables
305
  hf_api_key = os.getenv('HF_API_KEY')
306
  if not hf_api_key:
 
309
  # Create the Hugging Face inference client
310
  client = InferenceClient(api_key=hf_api_key)
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  # ---------------------------------------------------------------------------------------
313
  # Survey Analysis Class
314
  # ---------------------------------------------------------------------------------------
 
462
  csv = extracted_df.to_csv(index=False)
463
  st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")
464
 
465
+ if not extracted_df.empty:
466
+ topic_counts = extracted_df['Topic'].value_counts()
467
+ fig, ax = plt.subplots()
468
+ topic_counts.plot.bar(ax=ax, color='#3d9aa1')
469
+ st.pyplot(fig)
470
+ else:
471
+ st.warning("No topics were extracted. Please check the input data and topics.")
472
 
473
  if not uploaded_file:
474
  st.info("Please upload a PDF file to begin.")