Spaces:

mike23415
/

playwebit-t5-api

Sleeping

App Files Files Community

mike23415 committed about 1 month ago

Commit

92d0377

verified ·

1 Parent(s): c4ebcd4

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -121

app.py CHANGED Viewed

@@ -1,34 +1,19 @@
 import os
 import io
-import re
 from flask import Flask, request, jsonify
-from flask_cors import CORS
 from werkzeug.utils import secure_filename
 from PyPDF2 import PdfReader
 from docx import Document
 from pptx import Presentation
 import nltk
-import string
 from nltk.corpus import stopwords
-from nltk.tokenize import sent_tokenize, word_tokenize
-from nltk.probability import FreqDist
-from heapq import nlargest
-from collections import defaultdict
 app = Flask(__name__)
-CORS(app)  # Enable CORS for all routes
-# Set NLTK data path to a directory included in the project
-nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
-os.makedirs(nltk_data_dir, exist_ok=True)
-nltk.data.path.append(nltk_data_dir)
-# Ensure NLTK data is available (pre-downloaded)
-try:
-    stopwords.words('english')  # Test if stopwords are accessible
-except LookupError:
-    print("NLTK data not found. Please ensure 'punkt' and 'stopwords' are pre-downloaded in 'nltk_data'.")
-    # Fallback will be used if this fails
 # Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
@@ -36,6 +21,48 @@ ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
 def allowed_file(filename):
     return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
 @app.route("/", methods=["GET"])
 def index():
     return "Document Summarizer API is running! Use /summarize endpoint for POST requests."
@@ -57,127 +84,47 @@ def summarize():
     file_content = file.read()
     # Process file based on type
-    text = None
     file_ext = filename.rsplit(".", 1)[1].lower()
     try:
         if file_ext == "pdf":
-            text = extract_text_from_pdf(file_content)
         elif file_ext == "docx":
-            text = extract_text_from_docx(file_content)
         elif file_ext == "pptx":
-            text = extract_text_from_pptx(file_content)
         elif file_ext == "txt":
-            text = extract_text_from_txt(file_content)
-        # Generate a summary of the text
-        try:
-            summary = generate_summary(text)
-        except LookupError as e:
-            print(f"NLTK summarization failed: {e}. Using fallback.")
-            summary = simple_summarize(text)
-        except Exception as e:
-            print(f"Summarization error: {e}")
-            summary = text[:1000] + "..." if len(text) > 1000 else text
-        # Include metadata
-        word_count = len(text.split())
-        return jsonify({
-            "filename": filename,
-            "summary": summary,
-            "original_word_count": word_count,
-            "summary_word_count": len(summary.split()) if summary else 0
-        })
     except Exception as e:
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500
-# Text extraction functions
-def extract_text_from_pdf(file_content):
     reader = PdfReader(io.BytesIO(file_content))
-    text = ""
-    for page in reader.pages:
-        page_text = page.extract_text()
-        if page_text:
-            text += page_text + "\n\n"
-    return clean_text(text)
-def extract_text_from_docx(file_content):
     doc = Document(io.BytesIO(file_content))
-    text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-    return clean_text(text)
-def extract_text_from_pptx(file_content):
     ppt = Presentation(io.BytesIO(file_content))
     text = []
     for slide in ppt.slides:
         for shape in slide.shapes:
-            if hasattr(shape, "text") and shape.text.strip():
                 text.append(shape.text)
-    return clean_text("\n".join(text))
-def extract_text_from_txt(file_content):
-    text = file_content.decode("utf-8", errors="ignore")
-    return clean_text(text)
-def clean_text(text):
-    text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'[^\w\s\.\,\!\?\:\;]', '', text)
-    return text.strip()
-def generate_summary(text, sentence_count=5):
-    if len(text.split()) < 100:
-        return text
-    sentences = sent_tokenize(text)
-    if len(sentences) <= sentence_count:
-        return text
-    clean_sentences = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sentences]
-    stop_words = set(stopwords.words('english'))
-    word_frequencies = defaultdict(int)
-    for sentence in clean_sentences:
-        for word in word_tokenize(sentence):
-            if word not in stop_words:
-                word_frequencies[word] += 1
-    max_frequency = max(word_frequencies.values()) if word_frequencies else 1
-    for word in word_frequencies:
-        word_frequencies[word] = word_frequencies[word] / max_frequency
-    sentence_scores = defaultdict(int)
-    for i, sentence in enumerate(clean_sentences):
-        for word in word_tokenize(sentence):
-            if word in word_frequencies:
-                sentence_scores[i] += word_frequencies[word]
-    top_indices = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
-    top_indices.sort()
-    return ' '.join([sentences[i] for i in top_indices])
-def simple_summarize(text, max_chars=1000):
-    paragraphs = text.split('\n\n')
-    base_summary = ' '.join(paragraphs[:3])
-    if len(text) <= max_chars:
-        return text
-    if len(base_summary) < max_chars:
-        remaining_text = ' '.join(paragraphs[3:])
-        sentences = re.split(r'(?<=[.!?])\s+', remaining_text)
-        for sentence in sentences:
-            if len(base_summary) + len(sentence) + 1 <= max_chars:
-                base_summary += ' ' + sentence
-            else:
-                break
-    if len(base_summary) > max_chars:
-        base_summary = base_summary[:max_chars] + "..."
-    return base_summary
 if __name__ == "__main__":
-    # For local testing only
-    app.run(host="0.0.0.0", port=7860)

 import os
 import io
 from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
 from PyPDF2 import PdfReader
 from docx import Document
 from pptx import Presentation
 import nltk
 from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize, sent_tokenize
 app = Flask(__name__)
 # Download NLTK data when the app starts, so the first request does not hit a LookupError.
 # 'punkt' covers older NLTK releases; recent releases (3.9 and later) load 'punkt_tab'
 # instead when sent_tokenize/word_tokenize run, so both are requested.
 # NOTE(review): with default arguments nltk.download() reports an unknown or unreachable
 # resource and returns False instead of raising - confirm against the pinned NLTK version.
 nltk.download('punkt', quiet=True)
 nltk.download('punkt_tab', quiet=True)
 nltk.download('stopwords', quiet=True)
 # Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
 def allowed_file(filename):
     """Return True when the text after the last dot of filename is in ALLOWED_EXTENSIONS."""
     # rpartition yields an empty separator when the name contains no dot at all.
     _, dot, extension = filename.rpartition(".")
     return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
+# Extractive summarization function
+def extractive_summary(text, num_sentences=5):
+    """
+    Summarizes the given text by selecting the top N most important sentences.
+    Args:
+        text (str): The text to summarize.
+        num_sentences (int): Number of sentences to include in the summary (default: 5).
+    Returns:
+        str: The summarized text.
+    """
+    # Get stop words (e.g., "the", "is") to ignore them
+    stop_words = set(stopwords.words('english'))
+    # Tokenize text into words and sentences
+    words = word_tokenize(text)
+    sentences = sent_tokenize(text)
+    # If the text has fewer sentences than requested, return the full text
+    if len(sentences) <= num_sentences:
+        return text
+    # Calculate word frequencies, excluding stop words and non-alphanumeric characters
+    freq_table = {}
+    for word in words:
+        word = word.lower()
+        if word not in stop_words and word.isalnum():
+            freq_table[word] = freq_table.get(word, 0) + 1
+    # Score sentences based on the frequency of their words
+    sentence_scores = {}
+    for sentence in sentences:
+        for word, freq in freq_table.items():
+            if word in sentence.lower():
+                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq
+    # Select the top N sentences with the highest scores
+    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
+    summary = ' '.join(summary_sentences)
+    return summary
 @app.route("/", methods=["GET"])
 def index():
     """Answer GET / with a plain-text notice that the API is up and where to POST."""
     status_message = "Document Summarizer API is running! Use /summarize endpoint for POST requests."
     return status_message
     file_content = file.read()
     # Process file based on type
+    summary = None
     file_ext = filename.rsplit(".", 1)[1].lower()
     try:
         if file_ext == "pdf":
+            summary = summarize_pdf(file_content)
         elif file_ext == "docx":
+            summary = summarize_docx(file_content)
         elif file_ext == "pptx":
+            summary = summarize_pptx(file_content)
         elif file_ext == "txt":
+            summary = summarize_txt(file_content)
+        return jsonify({"filename": filename, "summary": summary})
     except Exception as e:
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500
+# Summarization functions
+def summarize_pdf(file_content):
     reader = PdfReader(io.BytesIO(file_content))
+    text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
+    return extractive_summary(text, num_sentences=5)
+def summarize_docx(file_content):
     doc = Document(io.BytesIO(file_content))
+    text = "\n".join([para.text for para in doc.paragraphs])
+    return extractive_summary(text, num_sentences=5)
+def summarize_pptx(file_content):
     ppt = Presentation(io.BytesIO(file_content))
     text = []
     for slide in ppt.slides:
         for shape in slide.shapes:
+            if hasattr(shape, "text"):
                 text.append(shape.text)
+    full_text = "\n".join(text)
+    return extractive_summary(full_text, num_sentences=5)
+def summarize_txt(file_content):
+    text = file_content.decode("utf-8")
+    return extractive_summary(text, num_sentences=5)
 if __name__ == "__main__":
     # The server listens on every interface (0.0.0.0), so debug mode must not be hard-coded:
     # the Werkzeug interactive debugger lets anyone who can reach the port run arbitrary code.
     # Debugging is therefore opt-in through the FLASK_DEBUG environment variable.
     debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true"}
     app.run(host="0.0.0.0", port=7860, debug=debug_enabled)