Spaces:

ProfessorLeVesseur
/

PDF_Topic_Extraction_Analysis_App

Running

App Files Community

ProfessorLeVesseur committed 21 days ago

Commit

c13044d

verified ·

1 Parent(s): 00f1e2e

Update app.py

Browse files

Files changed (1) hide show

app.py +268 -53

app.py CHANGED Viewed

@@ -1,3 +1,264 @@
 # ---------------------------------------------------------------------------------------
 # Imports and Options
 # ---------------------------------------------------------------------------------------
@@ -40,12 +301,6 @@ for key in ['pdf_processed', 'markdown_texts', 'df']:
 # ---------------------------------------------------------------------------------------
 # API Configuration
 # ---------------------------------------------------------------------------------------
-# API_URL = "https://api.stack-ai.com/inference/v0/run/2df89a6c-a4af-4576-880e-27058e498f02/67acad8b0603ba4631db38e7"
-# headers = {
-#     'Authorization': 'Bearer a9e4979e-cdbe-49ea-a193-53562a784805',
-#     'Content-Type': 'application/json'
-# }
 # Retrieve Hugging Face API key from environment variables
 hf_api_key = os.getenv('HF_API_KEY')
 if not hf_api_key:
@@ -54,49 +309,6 @@ if not hf_api_key:
 # Create the Hugging Face inference client
 client = InferenceClient(api_key=hf_api_key)
-# # ---------------------------------------------------------------------------------------
-# # Survey Analysis Class
-# # ---------------------------------------------------------------------------------------
-# class SurveyAnalysis:
-#     def prepare_llm_input(self, survey_response, topics):
-#         topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
-#         return f"""Extract and summarize PDF notes based on topics:
-# {topic_descriptions}
-# Instructions:
-# - Extract exact quotes per topic.
-# - Ignore irrelevant topics.
-# Format:
-# [Topic]
-# - "Exact quote"
-# Meeting Notes:
-# {survey_response}
-# """
-#     def query_api(self, payload):
-#         try:
-#             res = requests.post(API_URL, headers=headers, json=payload, timeout=60)
-#             res.raise_for_status()
-#             return res.json()
-#         except requests.exceptions.RequestException as e:
-#             st.error(f"API request failed: {e}")
-#             return {'outputs': {'out-0': ''}}
-#     def extract_meeting_notes(self, response):
-#         return response.get('outputs', {}).get('out-0', '')
-#     def process_dataframe(self, df, topics):
-#         results = []
-#         for _, row in df.iterrows():
-#             llm_input = self.prepare_llm_input(row['Document_Text'], topics)
-#             payload = {"user_id": "user", "in-0": llm_input}
-#             response = self.query_api(payload)
-#             notes = self.extract_meeting_notes(response)
-#             results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
-#         return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
 # ---------------------------------------------------------------------------------------
 # Survey Analysis Class
 # ---------------------------------------------------------------------------------------
@@ -250,10 +462,13 @@ if st.session_state['pdf_processed']:
         csv = extracted_df.to_csv(index=False)
         st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")
-        topic_counts = extracted_df['Topic'].value_counts()
-        fig, ax = plt.subplots()
-        topic_counts.plot.bar(ax=ax, color='#3d9aa1')
-        st.pyplot(fig)
 if not uploaded_file:
     st.info("Please upload a PDF file to begin.")

+# # ---------------------------------------------------------------------------------------
+# # Imports and Options
+# # ---------------------------------------------------------------------------------------
+# import streamlit as st
+# import pandas as pd
+# import requests
+# import re
+# import fitz  # PyMuPDF
+# import io
+# import matplotlib.pyplot as plt
+# from PIL import Image
+# from transformers import AutoProcessor, AutoModelForVision2Seq
+# from docling_core.types.doc import DoclingDocument
+# from docling_core.types.doc.document import DocTagsDocument
+# import torch
+# import os
+# from huggingface_hub import InferenceClient
+# # ---------------------------------------------------------------------------------------
+# # Streamlit Page Configuration
+# # ---------------------------------------------------------------------------------------
+# st.set_page_config(
+#     page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
+#     page_icon=":bar_chart:",
+#     layout="centered",
+#     initial_sidebar_state="auto",
+#     menu_items={
+#         'Get Help': 'mailto:[email protected]',
+#         'About': "This app is built to support PDF analysis"
+#     }
+# )
+# # ---------------------------------------------------------------------------------------
+# # Session State Initialization
+# # ---------------------------------------------------------------------------------------
+# for key in ['pdf_processed', 'markdown_texts', 'df']:
+#     if key not in st.session_state:
+#         st.session_state[key] = False if key == 'pdf_processed' else []
+# # ---------------------------------------------------------------------------------------
+# # API Configuration
+# # ---------------------------------------------------------------------------------------
+# # API_URL = "https://api.stack-ai.com/inference/v0/run/2df89a6c-a4af-4576-880e-27058e498f02/67acad8b0603ba4631db38e7"
+# # headers = {
+# #     'Authorization': 'Bearer a9e4979e-cdbe-49ea-a193-53562a784805',
+# #     'Content-Type': 'application/json'
+# # }
+# # Retrieve Hugging Face API key from environment variables
+# hf_api_key = os.getenv('HF_API_KEY')
+# if not hf_api_key:
+#     raise ValueError("HF_API_KEY not set in environment variables")
+# # Create the Hugging Face inference client
+# client = InferenceClient(api_key=hf_api_key)
+# # # ---------------------------------------------------------------------------------------
+# # # Survey Analysis Class
+# # # ---------------------------------------------------------------------------------------
+# # class SurveyAnalysis:
+# #     def prepare_llm_input(self, survey_response, topics):
+# #         topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
+# #         return f"""Extract and summarize PDF notes based on topics:
+# # {topic_descriptions}
+# # Instructions:
+# # - Extract exact quotes per topic.
+# # - Ignore irrelevant topics.
+# # Format:
+# # [Topic]
+# # - "Exact quote"
+# # Meeting Notes:
+# # {survey_response}
+# # """
+# #     def query_api(self, payload):
+# #         try:
+# #             res = requests.post(API_URL, headers=headers, json=payload, timeout=60)
+# #             res.raise_for_status()
+# #             return res.json()
+# #         except requests.exceptions.RequestException as e:
+# #             st.error(f"API request failed: {e}")
+# #             return {'outputs': {'out-0': ''}}
+# #     def extract_meeting_notes(self, response):
+# #         return response.get('outputs', {}).get('out-0', '')
+# #     def process_dataframe(self, df, topics):
+# #         results = []
+# #         for _, row in df.iterrows():
+# #             llm_input = self.prepare_llm_input(row['Document_Text'], topics)
+# #             payload = {"user_id": "user", "in-0": llm_input}
+# #             response = self.query_api(payload)
+# #             notes = self.extract_meeting_notes(response)
+# #             results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
+# #         return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
+# # ---------------------------------------------------------------------------------------
+# # Survey Analysis Class
+# # ---------------------------------------------------------------------------------------
+# class SurveyAnalysis:
+#     def prepare_llm_input(self, survey_response, topics):
+#         topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
+#         return f"""Extract and summarize PDF notes based on topics:
+# {topic_descriptions}
+# Instructions:
+# - Extract exact quotes per topic.
+# - Ignore irrelevant topics.
+# Format:
+# [Topic]
+# - "Exact quote"
+# Meeting Notes:
+# {survey_response}
+# """
+#     def prompt_response_from_hf_llm(self, llm_input):
+#         # Define a system prompt to guide the model's responses
+#         system_prompt = """
+#         <Persona> An expert Implementation Specialist at Michigan's Multi-Tiered System of Support Technical Assistance Center (MiMTSS TA Center) with deep expertise in SWPBIS, SEL, Structured Literacy, Science of Reading, and family engagement practices.</Persona>
+#         <Task> Analyze educational data and provide evidence-based recommendations for improving student outcomes across multiple tiers of support, drawing from established frameworks in behavioral interventions, literacy instruction, and family engagement.</Task>
+#         <Context> Operating within Michigan's educational system to support schools in implementing multi-tiered support systems, with access to student metrics data and knowledge of state-specific educational requirements and MTSS frameworks. </Context>
+#         <Format> Deliver insights through clear, actionable recommendations supported by data analysis, incorporating technical expertise while maintaining accessibility for educators and administrators at various levels of MTSS implementation.</Format>
+#         """
+#         # Generate the refined prompt using Hugging Face API
+#         response = client.chat.completions.create(
+#             model="meta-llama/Llama-3.1-70B-Instruct",
+#             messages=[
+#                 {"role": "system", "content": system_prompt},  # Add system prompt here
+#                 {"role": "user", "content": llm_input}
+#             ],
+#             stream=True,
+#             temperature=0.5,
+#             max_tokens=1024,
+#             top_p=0.7
+#         )
+#         # Combine messages if response is streamed
+#         response_content = ""
+#         for message in response:
+#             response_content += message.choices[0].delta.content
+#         return response_content.strip()
+#     def extract_text(self, response):
+#         return response
+#     def process_dataframe(self, df, topics):
+#         results = []
+#         for _, row in df.iterrows():
+#             llm_input = self.prepare_llm_input(row['Document_Text'], topics)
+#             response = self.prompt_response_from_hf_llm(llm_input)
+#             notes = self.extract_text(response)
+#             results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
+#         return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
+# # ---------------------------------------------------------------------------------------
+# # Helper Functions
+# # ---------------------------------------------------------------------------------------
+# @st.cache_resource
+# def load_smol_docling():
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#     processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
+#     model = AutoModelForVision2Seq.from_pretrained(
+#         "ds4sd/SmolDocling-256M-preview", torch_dtype=torch.float32
+#     ).to(device)
+#     return model, processor
+# model, processor = load_smol_docling()
+# def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600):
+#     images = []
+#     doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+#     for page in doc:
+#         pix = page.get_pixmap(dpi=dpi)
+#         img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
+#         img.thumbnail((max_size, max_size), Image.LANCZOS)
+#         images.append(img)
+#     return images
+# def extract_markdown_from_image(image):
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#     prompt = processor.apply_chat_template([{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]}], add_generation_prompt=True)
+#     inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
+#     with torch.no_grad():
+#         generated_ids = model.generate(**inputs, max_new_tokens=1024)
+#     doctags = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=False)[0].replace("<end_of_utterance>", "").strip()
+#     doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
+#     doc = DoclingDocument(name="ExtractedDocument")
+#     doc.load_from_doctags(doctags_doc)
+#     return doc.export_to_markdown()
+# def extract_excerpts(processed_df):
+#     rows = []
+#     for _, r in processed_df.iterrows():
+#         for sec in re.split(r'\n(?=\[)', r['Topic_Summary']):
+#             topic_match = re.match(r'\[([^\]]+)\]', sec)
+#             if topic_match:
+#                 topic = topic_match.group(1)
+#                 excerpts = re.findall(r'- "([^"]+)"', sec)
+#                 for excerpt in excerpts:
+#                     rows.append({'Document_Text': r['Document_Text'], 'Topic_Summary': r['Topic_Summary'], 'Excerpt': excerpt, 'Topic': topic})
+#     return pd.DataFrame(rows)
+# # ---------------------------------------------------------------------------------------
+# # Streamlit UI
+# # ---------------------------------------------------------------------------------------
+# st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
+# uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
+# if uploaded_file and not st.session_state['pdf_processed']:
+#     with st.spinner("Processing PDF..."):
+#         images = convert_pdf_to_images(uploaded_file)
+#         markdown_texts = [extract_markdown_from_image(img) for img in images]
+#         st.session_state['df'] = pd.DataFrame({'Document_Text': markdown_texts})
+#         st.session_state['pdf_processed'] = True
+#     st.success("PDF processed successfully!")
+# if st.session_state['pdf_processed']:
+#     st.markdown("### Extracted Text Preview")
+#     st.write(st.session_state['df'].head())
+#     st.markdown("### Enter Topics and Descriptions")
+#     num_topics = st.number_input("Number of topics", 1, 10, 1)
+#     topics = {}
+#     for i in range(num_topics):
+#         topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
+#         desc = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
+#         if topic and desc:
+#             topics[topic] = desc
+#     if st.button("Run Analysis"):
+#         if not topics:
+#             st.warning("Please enter at least one topic and description.")
+#             st.stop()
+#         analyzer = SurveyAnalysis()
+#         processed_df = analyzer.process_dataframe(st.session_state['df'], topics)
+#         extracted_df = extract_excerpts(processed_df)
+#         st.markdown("### Extracted Excerpts")
+#         st.dataframe(extracted_df)
+#         csv = extracted_df.to_csv(index=False)
+#         st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")
+#         topic_counts = extracted_df['Topic'].value_counts()
+#         fig, ax = plt.subplots()
+#         topic_counts.plot.bar(ax=ax, color='#3d9aa1')
+#         st.pyplot(fig)
+# if not uploaded_file:
+#     st.info("Please upload a PDF file to begin.")
 # ---------------------------------------------------------------------------------------
 # Imports and Options
 # ---------------------------------------------------------------------------------------
 # ---------------------------------------------------------------------------------------
 # API Configuration
 # ---------------------------------------------------------------------------------------
 # Retrieve Hugging Face API key from environment variables
 hf_api_key = os.getenv('HF_API_KEY')
 if not hf_api_key:
 # Create the Hugging Face inference client
 client = InferenceClient(api_key=hf_api_key)
 # ---------------------------------------------------------------------------------------
 # Survey Analysis Class
 # ---------------------------------------------------------------------------------------
         csv = extracted_df.to_csv(index=False)
         st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")
+        if not extracted_df.empty:
+            topic_counts = extracted_df['Topic'].value_counts()
+            fig, ax = plt.subplots()
+            topic_counts.plot.bar(ax=ax, color='#3d9aa1')
+            st.pyplot(fig)
+        else:
+            st.warning("No topics were extracted. Please check the input data and topics.")
 if not uploaded_file:
     st.info("Please upload a PDF file to begin.")