# Spaces: Runtime error  (page-banner residue captured with the code; not part of the script)
import io
import os
import tempfile

import PyPDF2
import pyttsx3
import streamlit as st
from huggingface_hub import InferenceClient
# --- Page chrome -----------------------------------------------------------
# Raw CSS injected into the page to give the app a grey gradient background.
page_bg_img = """
<style>
.stApp {
background: linear-gradient( #eee 38%, #ccc 68%);
}
</style>
"""
st.markdown(page_bg_img, unsafe_allow_html=True)
st.title("Summarize & Listen to your Academic Materials on the Fly.")

# --- Input -----------------------------------------------------------------
# Only PDF uploads are accepted; the value stays None until a file is chosen.
uploaded_pdf = st.file_uploader("Upload a research Paper", type=["pdf"])

# Filled in by the extraction step once a PDF has been uploaded.
full_text = None

# --- Summarization backend -------------------------------------------------
MODEL_NAME = "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
client = InferenceClient(model=MODEL_NAME)

# Instruction text placed in front of the extracted document text when the
# summarization request is built.
# NOTE(review): the <INST>...</INST> wrapper may not match the chat template
# this model expects -- confirm against the model card.
DETAILED_SUMMARIZATION_PROMPT = """
<INST>You are a very powerful summarization engine for summarizing academic contents,
now you are to summarize the following text you are going to be provided which is from a document, make sure to understand
all improperly parsed text and actually parse them properly , also make sure that your final summarization is very coherent and understandable by a student and is under 4000 words ,
also the length of the summarized text should be less than the original provided text,
if you are provided with a text that includes unnecessary items that do not contribute value to the book like preface about the author, do not include them in the summarization
Your summary should be concise and should accurately and objectively communicate the key points of the paper.
You should not include any personal opinions or interpretations in your summary but rather focus on
objectively presenting the information from the paper. Your summary should be written in your own words
and should not include any direct quotes from the paper. Please ensure that your summary is clear,
concise, and accurately reflects the content of the original paper.
do not go out of context of the words provided.
Now here is your provided text :
</INST>
"""
# Upper bound on how many characters of extracted text are kept for the
# summarization request.
MAX_TEXT_CHARS = 100000

with st.spinner("Extracting Text..."):
    if uploaded_pdf is not None:
        # Parse the uploaded stream directly. Copying it into a named
        # temporary file first left an unflushed, never-deleted file behind
        # and reopened it by name while the writer handle was still open.
        pdf_reader = PyPDF2.PdfReader(uploaded_pdf)

        # Collect the text of every page in one pass. A page that yields no
        # text (None or empty) contributes an empty string instead of
        # breaking the concatenation.
        full_text = "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

        # Keep at most MAX_TEXT_CHARS characters; slicing is a no-op for
        # shorter documents.
        full_text = full_text[:MAX_TEXT_CHARS]

        if full_text:
            st.success("Text Extracted Successfully!!!")
        else:
            # Nothing to summarize; tell the user instead of claiming success.
            st.warning("No extractable text was found in this PDF.")
################################################################################### | |
def synthesize_text_to_audio(text):
    """Render ``text`` to speech with pyttsx3 and return the audio data.

    The speech engine can only save to a file path, so the audio is written
    to a temporary file, read back into memory, and the file is removed
    before returning.

    Args:
        text: The text to be spoken.

    Returns:
        An ``io.BytesIO`` positioned at the start of the synthesized audio.
        It is a readable binary stream, so it can be handed to ``st.audio``
        or read with ``.read()``; no open file handle or temporary file is
        left behind for the caller to clean up.

    NOTE(review): the temporary file is named ``.mp3``, but the container
    actually written is decided by the pyttsx3 driver -- confirm the format
    if players reject the audio.
    """
    engine = pyttsx3.init()

    # Reserve a unique path, then close our handle before the engine writes
    # to it so the file is not held open by two writers at once.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        temp_file_path = temp_file.name

    try:
        engine.save_to_file(text, temp_file_path)  # queue the synthesis job
        engine.runAndWait()  # block until the queued job has been processed
        with open(temp_file_path, "rb") as sound_file:
            audio_bytes = sound_file.read()
    finally:
        # Best-effort cleanup: failing to delete the scratch file must not
        # hide a synthesis error or discard audio that was read successfully.
        try:
            os.remove(temp_file_path)
        except OSError:
            pass

    return io.BytesIO(audio_bytes)
# Summarize the extracted text, then turn the summary into playable audio.
summarized_text = None

if full_text:
    # The instruction prompt is followed directly by the document text.
    summarization_request = DETAILED_SUMMARIZATION_PROMPT + full_text
    generation_options = {
        "max_new_tokens": 4096,
        "temperature": 0.2,
        "top_p": 0.8,
    }
    with st.spinner("Summarizing Text Content..."):
        summarized_text = client.text_generation(
            summarization_request, **generation_options
        )
    # Echo the summary to the process output (it is not rendered on the page).
    print(summarized_text)

    # An empty summary produces no audio player.
    if summarized_text:
        with st.spinner('Synthesizing to Audio...'):
            audio_source = synthesize_text_to_audio(summarized_text)
            st.audio(audio_source)