Spaces:

ProfessorLeVesseur
/

PDF_Topic_Extraction_Analysis_App

Running

App Files Community

ProfessorLeVesseur committed 20 days ago

Commit

ca854bd

verified ·

1 Parent(s): 8275a49

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -273

app.py CHANGED Viewed

@@ -1,264 +1,3 @@
-# # ---------------------------------------------------------------------------------------
-# # Imports and Options
-# # ---------------------------------------------------------------------------------------
-# import streamlit as st
-# import pandas as pd
-# import requests
-# import re
-# import fitz  # PyMuPDF
-# import io
-# import matplotlib.pyplot as plt
-# from PIL import Image
-# from transformers import AutoProcessor, AutoModelForVision2Seq
-# from docling_core.types.doc import DoclingDocument
-# from docling_core.types.doc.document import DocTagsDocument
-# import torch
-# import os
-# from huggingface_hub import InferenceClient
-# # ---------------------------------------------------------------------------------------
-# # Streamlit Page Configuration
-# # ---------------------------------------------------------------------------------------
-# st.set_page_config(
-#     page_title="Choose Your Own Adventure (Topic Extraction) PDF Analysis App",
-#     page_icon=":bar_chart:",
-#     layout="centered",
-#     initial_sidebar_state="auto",
-#     menu_items={
-#         'Get Help': 'mailto:[email protected]',
-#         'About': "This app is built to support PDF analysis"
-#     }
-# )
-# # ---------------------------------------------------------------------------------------
-# # Session State Initialization
-# # ---------------------------------------------------------------------------------------
-# for key in ['pdf_processed', 'markdown_texts', 'df']:
-#     if key not in st.session_state:
-#         st.session_state[key] = False if key == 'pdf_processed' else []
-# # ---------------------------------------------------------------------------------------
-# # API Configuration
-# # ---------------------------------------------------------------------------------------
-# # API_URL = "https://api.stack-ai.com/inference/v0/run/2df89a6c-a4af-4576-880e-27058e498f02/67acad8b0603ba4631db38e7"
-# # headers = {
-# #     'Authorization': 'Bearer a9e4979e-cdbe-49ea-a193-53562a784805',
-# #     'Content-Type': 'application/json'
-# # }
-# # Retrieve Hugging Face API key from environment variables
-# hf_api_key = os.getenv('HF_API_KEY')
-# if not hf_api_key:
-#     raise ValueError("HF_API_KEY not set in environment variables")
-# # Create the Hugging Face inference client
-# client = InferenceClient(api_key=hf_api_key)
-# # # ---------------------------------------------------------------------------------------
-# # # Survey Analysis Class
-# # # ---------------------------------------------------------------------------------------
-# # class SurveyAnalysis:
-# #     def prepare_llm_input(self, survey_response, topics):
-# #         topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
-# #         return f"""Extract and summarize PDF notes based on topics:
-# # {topic_descriptions}
-# # Instructions:
-# # - Extract exact quotes per topic.
-# # - Ignore irrelevant topics.
-# # Format:
-# # [Topic]
-# # - "Exact quote"
-# # Meeting Notes:
-# # {survey_response}
-# # """
-# #     def query_api(self, payload):
-# #         try:
-# #             res = requests.post(API_URL, headers=headers, json=payload, timeout=60)
-# #             res.raise_for_status()
-# #             return res.json()
-# #         except requests.exceptions.RequestException as e:
-# #             st.error(f"API request failed: {e}")
-# #             return {'outputs': {'out-0': ''}}
-# #     def extract_meeting_notes(self, response):
-# #         return response.get('outputs', {}).get('out-0', '')
-# #     def process_dataframe(self, df, topics):
-# #         results = []
-# #         for _, row in df.iterrows():
-# #             llm_input = self.prepare_llm_input(row['Document_Text'], topics)
-# #             payload = {"user_id": "user", "in-0": llm_input}
-# #             response = self.query_api(payload)
-# #             notes = self.extract_meeting_notes(response)
-# #             results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
-# #         return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
-# # ---------------------------------------------------------------------------------------
-# # Survey Analysis Class
-# # ---------------------------------------------------------------------------------------
-# class SurveyAnalysis:
-#     def prepare_llm_input(self, survey_response, topics):
-#         topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
-#         return f"""Extract and summarize PDF notes based on topics:
-# {topic_descriptions}
-# Instructions:
-# - Extract exact quotes per topic.
-# - Ignore irrelevant topics.
-# Format:
-# [Topic]
-# - "Exact quote"
-# Meeting Notes:
-# {survey_response}
-# """
-#     def prompt_response_from_hf_llm(self, llm_input):
-#         # Define a system prompt to guide the model's responses
-#         system_prompt = """
-#         <Persona> An expert Implementation Specialist at Michigan's Multi-Tiered System of Support Technical Assistance Center (MiMTSS TA Center) with deep expertise in SWPBIS, SEL, Structured Literacy, Science of Reading, and family engagement practices.</Persona>
-#         <Task> Analyze educational data and provide evidence-based recommendations for improving student outcomes across multiple tiers of support, drawing from established frameworks in behavioral interventions, literacy instruction, and family engagement.</Task>
-#         <Context> Operating within Michigan's educational system to support schools in implementing multi-tiered support systems, with access to student metrics data and knowledge of state-specific educational requirements and MTSS frameworks. </Context>
-#         <Format> Deliver insights through clear, actionable recommendations supported by data analysis, incorporating technical expertise while maintaining accessibility for educators and administrators at various levels of MTSS implementation.</Format>
-#         """
-#         # Generate the refined prompt using Hugging Face API
-#         response = client.chat.completions.create(
-#             model="meta-llama/Llama-3.1-70B-Instruct",
-#             messages=[
-#                 {"role": "system", "content": system_prompt},  # Add system prompt here
-#                 {"role": "user", "content": llm_input}
-#             ],
-#             stream=True,
-#             temperature=0.5,
-#             max_tokens=1024,
-#             top_p=0.7
-#         )
-#         # Combine messages if response is streamed
-#         response_content = ""
-#         for message in response:
-#             response_content += message.choices[0].delta.content
-#         return response_content.strip()
-#     def extract_text(self, response):
-#         return response
-#     def process_dataframe(self, df, topics):
-#         results = []
-#         for _, row in df.iterrows():
-#             llm_input = self.prepare_llm_input(row['Document_Text'], topics)
-#             response = self.prompt_response_from_hf_llm(llm_input)
-#             notes = self.extract_text(response)
-#             results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
-#         return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
-# # ---------------------------------------------------------------------------------------
-# # Helper Functions
-# # ---------------------------------------------------------------------------------------
-# @st.cache_resource
-# def load_smol_docling():
-#     device = "cuda" if torch.cuda.is_available() else "cpu"
-#     processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
-#     model = AutoModelForVision2Seq.from_pretrained(
-#         "ds4sd/SmolDocling-256M-preview", torch_dtype=torch.float32
-#     ).to(device)
-#     return model, processor
-# model, processor = load_smol_docling()
-# def convert_pdf_to_images(pdf_file, dpi=150, max_size=1600):
-#     images = []
-#     doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
-#     for page in doc:
-#         pix = page.get_pixmap(dpi=dpi)
-#         img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
-#         img.thumbnail((max_size, max_size), Image.LANCZOS)
-#         images.append(img)
-#     return images
-# def extract_markdown_from_image(image):
-#     device = "cuda" if torch.cuda.is_available() else "cpu"
-#     prompt = processor.apply_chat_template([{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]}], add_generation_prompt=True)
-#     inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)
-#     with torch.no_grad():
-#         generated_ids = model.generate(**inputs, max_new_tokens=1024)
-#     doctags = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=False)[0].replace("<end_of_utterance>", "").strip()
-#     doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
-#     doc = DoclingDocument(name="ExtractedDocument")
-#     doc.load_from_doctags(doctags_doc)
-#     return doc.export_to_markdown()
-# def extract_excerpts(processed_df):
-#     rows = []
-#     for _, r in processed_df.iterrows():
-#         for sec in re.split(r'\n(?=\[)', r['Topic_Summary']):
-#             topic_match = re.match(r'\[([^\]]+)\]', sec)
-#             if topic_match:
-#                 topic = topic_match.group(1)
-#                 excerpts = re.findall(r'- "([^"]+)"', sec)
-#                 for excerpt in excerpts:
-#                     rows.append({'Document_Text': r['Document_Text'], 'Topic_Summary': r['Topic_Summary'], 'Excerpt': excerpt, 'Topic': topic})
-#     return pd.DataFrame(rows)
-# # ---------------------------------------------------------------------------------------
-# # Streamlit UI
-# # ---------------------------------------------------------------------------------------
-# st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
-# uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
-# if uploaded_file and not st.session_state['pdf_processed']:
-#     with st.spinner("Processing PDF..."):
-#         images = convert_pdf_to_images(uploaded_file)
-#         markdown_texts = [extract_markdown_from_image(img) for img in images]
-#         st.session_state['df'] = pd.DataFrame({'Document_Text': markdown_texts})
-#         st.session_state['pdf_processed'] = True
-#     st.success("PDF processed successfully!")
-# if st.session_state['pdf_processed']:
-#     st.markdown("### Extracted Text Preview")
-#     st.write(st.session_state['df'].head())
-#     st.markdown("### Enter Topics and Descriptions")
-#     num_topics = st.number_input("Number of topics", 1, 10, 1)
-#     topics = {}
-#     for i in range(num_topics):
-#         topic = st.text_input(f"Topic {i+1} Name", key=f"topic_{i}")
-#         desc = st.text_area(f"Topic {i+1} Description", key=f"description_{i}")
-#         if topic and desc:
-#             topics[topic] = desc
-#     if st.button("Run Analysis"):
-#         if not topics:
-#             st.warning("Please enter at least one topic and description.")
-#             st.stop()
-#         analyzer = SurveyAnalysis()
-#         processed_df = analyzer.process_dataframe(st.session_state['df'], topics)
-#         extracted_df = extract_excerpts(processed_df)
-#         st.markdown("### Extracted Excerpts")
-#         st.dataframe(extracted_df)
-#         csv = extracted_df.to_csv(index=False)
-#         st.download_button("Download CSV", csv, "extracted_notes.csv", "text/csv")
-#         topic_counts = extracted_df['Topic'].value_counts()
-#         fig, ax = plt.subplots()
-#         topic_counts.plot.bar(ax=ax, color='#3d9aa1')
-#         st.pyplot(fig)
-# if not uploaded_file:
-#     st.info("Please upload a PDF file to begin.")
 # ---------------------------------------------------------------------------------------
 # Imports and Options
 # ---------------------------------------------------------------------------------------
@@ -291,6 +30,39 @@ st.set_page_config(
     }
 )
 # ---------------------------------------------------------------------------------------
 # Session State Initialization
 # ---------------------------------------------------------------------------------------
@@ -314,7 +86,7 @@ class AIAnalysis:
     def __init__(self, client):
         self.client = client
-    def prepare_llm_input(self, survey_response, topics):
         topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
         return f"""Extract and summarize PDF notes based on topics:
 {topic_descriptions}
@@ -327,8 +99,8 @@ Instructions:
 [Topic]
 - "Exact quote"
-Meeting Notes:
-{survey_response}
 """
     def prompt_response_from_hf_llm(self, llm_input):
@@ -376,15 +148,6 @@ Meeting Notes:
             results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
         return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
-    def process_dataframe(self, df, topics):
-        results = []
-        for _, row in df.iterrows():
-            llm_input = self.prepare_llm_input(row['Document_Text'], topics)
-            response = self.prompt_response_from_hf_llm(llm_input)
-            notes = self.extract_text(response)
-            results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
-        return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
 # ---------------------------------------------------------------------------------------
 # Helper Functions
 # ---------------------------------------------------------------------------------------

 # ---------------------------------------------------------------------------------------
 # Imports and Options
 # ---------------------------------------------------------------------------------------
     }
 )
+# ---------------------------------------------------------------------------------------
+# Streamlit Sidebar
+# ---------------------------------------------------------------------------------------
+st.sidebar.title("📌 About This App")
+st.sidebar.markdown("""
+#### ⚠️ **Important Note on Processing Time**
+This app uses the **SmolDocling** model (`ds4sd/SmolDocling-256M-preview`) to convert PDF pages into markdown text. Currently, the model is running on a CPU-based environment (**CPU basic | 2 vCPU - 16 GB RAM**), and therefore processing each page can take a significant amount of time (approximately **6 minutes per page**).
+This setup is suitable for testing and demonstration purposes, but **not efficient for real-world usage**.
+For faster processing, consider running the optimized version `ds4sd/SmolDocling-256M-preview-mlx-bf16` locally on a MacBook, where it performs significantly faster.
+---
+#### 🛠️ **How This App Works**
+Here's a quick overview of the workflow:
+1. **Upload PDF**: You upload a PDF document using the uploader provided.
+2. **Convert PDF to Images**: The PDF is converted into individual images (one per page).
+3. **Extract Markdown from Images**: Each image is processed by the SmolDocling model to extract markdown-formatted text.
+4. **Enter Topics and Descriptions**: You provide specific topics and their descriptions you'd like to extract from the document.
+5. **Extract Excerpts**: The app uses the **meta-llama/Llama-3.1-70B-Instruct** model to extract exact quotes relevant to your provided topics.
+6. **Results in a DataFrame**: All extracted quotes and their topics are compiled into a structured DataFrame that you can preview and download.
+---
+Please proceed by uploading your PDF file to begin the analysis.
+""")
 # ---------------------------------------------------------------------------------------
 # Session State Initialization
 # ---------------------------------------------------------------------------------------
     def __init__(self, client):
         self.client = client
+    def prepare_llm_input(self, document_content, topics):
         topic_descriptions = "\n".join([f"- **{t}**: {d}" for t, d in topics.items()])
         return f"""Extract and summarize PDF notes based on topics:
 {topic_descriptions}
 [Topic]
 - "Exact quote"
+Document Content:
+{document_content}
 """
     def prompt_response_from_hf_llm(self, llm_input):
             results.append({'Document_Text': row['Document_Text'], 'Topic_Summary': notes})
         return pd.concat([df.reset_index(drop=True), pd.DataFrame(results)['Topic_Summary']], axis=1)
 # ---------------------------------------------------------------------------------------
 # Helper Functions
 # ---------------------------------------------------------------------------------------