Spaces:

mike23415
/

playwebit-t5-api

Sleeping

App Files Files Community

mike23415 committed on Mar 31

Commit

b7db40a

verified ·

1 Parent(s): 3e65e21

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -21

app.py CHANGED Viewed

@@ -1,12 +1,30 @@
 import os
 import io
-from flask import Flask, request, jsonify, render_template
 from werkzeug.utils import secure_filename
 from PyPDF2 import PdfReader
 from docx import Document
 from pptx import Presentation
 # Flask application object for this module.
 app = Flask(__name__)
 # Allowed file extensions (lowercase, without the leading dot)
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
@@ -35,46 +53,114 @@ def summarize():
     file_content = file.read()
     # Process file based on type
-    summary = None
     file_ext = filename.rsplit(".", 1)[1].lower()
     try:
         if file_ext == "pdf":
-            summary = summarize_pdf(file_content)
         elif file_ext == "docx":
-            summary = summarize_docx(file_content)
         elif file_ext == "pptx":
-            summary = summarize_pptx(file_content)
         elif file_ext == "txt":
-            summary = summarize_txt(file_content)
-        return jsonify({"filename": filename, "summary": summary})
     except Exception as e:
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500
-# Summarization functions
def summarize_pdf(file_content):
    """Return the first 500 characters of the text extracted from a PDF.

    Args:
        file_content: Raw bytes of the PDF file.

    Returns:
        The text of all pages that yield any text, joined with newlines
        and truncated to 500 characters.
    """
    reader = PdfReader(io.BytesIO(file_content))
    # Run the extraction once per page and filter on its result, instead of
    # calling extract_text() once for the check and again for the value.
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(page_text for page_text in page_texts if page_text)
    return text[:500]  # Short summary: first 500 characters
def summarize_docx(file_content):
    """Return the first 500 characters of a .docx file's paragraph text.

    Args:
        file_content: Raw bytes of the .docx file.

    Returns:
        The text of every paragraph joined with newlines, truncated to
        500 characters.
    """
    document = Document(io.BytesIO(file_content))
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    joined = "\n".join(paragraph_texts)
    return joined[:500]
def summarize_pptx(file_content):
    """Return the first 500 characters of the text found on a deck's slides.

    Args:
        file_content: Raw bytes of the .pptx file.

    Returns:
        The text of every shape that exposes a ``text`` attribute, in slide
        order, joined with newlines and truncated to 500 characters.
    """
    deck = Presentation(io.BytesIO(file_content))
    shape_texts = [
        shape.text
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")  # not every shape exposes a text attribute
    ]
    joined = "\n".join(shape_texts)
    return joined[:500]
def summarize_txt(file_content):
    """Return the first 500 characters of a UTF-8 encoded text file.

    Args:
        file_content: Raw bytes of the text file.

    Returns:
        The decoded text truncated to 500 characters.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    decoded = file_content.decode("utf-8")
    preview = decoded[:500]
    return preview
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=True)

 import os
 import io
+import re
+from flask import Flask, request, jsonify
+from flask_cors import CORS
 from werkzeug.utils import secure_filename
 from PyPDF2 import PdfReader
 from docx import Document
 from pptx import Presentation
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.probability import FreqDist
+from heapq import nlargest
+from collections import defaultdict
+import string
 app = Flask(__name__)
+CORS(app)  # Enable CORS for all routes
+# Download necessary NLTK data
+try:
+    nltk.data.find('tokenizers/punkt')
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('punkt')
+    nltk.download('stopwords')
 # Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
     file_content = file.read()
     # Process file based on type
+    text = None
     file_ext = filename.rsplit(".", 1)[1].lower()
     try:
         if file_ext == "pdf":
+            text = extract_text_from_pdf(file_content)
         elif file_ext == "docx":
+            text = extract_text_from_docx(file_content)
         elif file_ext == "pptx":
+            text = extract_text_from_pptx(file_content)
         elif file_ext == "txt":
+            text = extract_text_from_txt(file_content)
+        # Generate a summary of the text
+        summary = generate_summary(text)
+        # Include metadata
+        word_count = len(text.split())
+        return jsonify({
+            "filename": filename,
+            "summary": summary,
+            "original_word_count": word_count,
+            "summary_word_count": len(summary.split())
+        })
     except Exception as e:
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500
+# Improved text extraction functions
def extract_text_from_pdf(file_content):
    """Extract and clean the text of every page of a PDF.

    Args:
        file_content: Raw bytes of the PDF file.

    Returns:
        The text of all pages that yield any text, each followed by a blank
        line, passed through ``clean_text``.
    """
    pdf = PdfReader(io.BytesIO(file_content))
    chunks = []
    for page in pdf.pages:
        extracted = page.extract_text()
        if extracted:
            chunks.append(extracted + "\n\n")
    return clean_text("".join(chunks))
def extract_text_from_docx(file_content):
    """Extract and clean the paragraph text of a .docx file.

    Args:
        file_content: Raw bytes of the .docx file.

    Returns:
        The text of all non-blank paragraphs joined with newlines, passed
        through ``clean_text``.
    """
    document = Document(io.BytesIO(file_content))
    lines = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():  # skip empty / whitespace-only paragraphs
            lines.append(content)
    return clean_text("\n".join(lines))
def extract_text_from_pptx(file_content):
    """Extract and clean the text found on the slides of a .pptx file.

    Args:
        file_content: Raw bytes of the .pptx file.

    Returns:
        The text of every shape that has a non-blank ``text`` attribute, in
        slide order, joined with newlines and passed through ``clean_text``.
    """
    deck = Presentation(io.BytesIO(file_content))
    shape_texts = [
        shape.text
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text") and shape.text.strip()
    ]
    return clean_text("\n".join(shape_texts))
def extract_text_from_txt(file_content):
    """Decode a plain-text upload as UTF-8 and clean it.

    Args:
        file_content: Raw bytes of the text file.

    Returns:
        The decoded text passed through ``clean_text``. Bytes that are not
        valid UTF-8 are dropped rather than raising.
    """
    decoded = file_content.decode("utf-8", "ignore")
    return clean_text(decoded)
def clean_text(text):
    """Normalize extracted text for summarization.

    Every character that is not a word character, whitespace, or one of the
    punctuation marks ``. , ! ? : ;`` is removed, then each run of whitespace
    is collapsed to a single space and the ends are trimmed. Apostrophes and
    hyphens are among the removed characters, so "don't" becomes "dont".

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    # Strip special characters first: removing a symbol that sits between two
    # spaces ("a - b") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s\.\,\!\?\:\;]', '', text)
    # Collapse the remaining whitespace, including newlines and tabs
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def generate_summary(text, sentence_count=5):
    """Build an extractive summary from the highest-scoring sentences.

    Each sentence is scored by summing the normalized frequencies of its
    non-stop-word tokens; the ``sentence_count`` best sentences are returned
    in their original order.

    Args:
        text: The text to summarize.
        sentence_count: Maximum number of sentences in the summary.

    Returns:
        The summary string. ``text`` is returned unchanged when it has fewer
        than 100 whitespace-separated words or no more than
        ``sentence_count`` sentences.
    """
    # If text is very short, return it as is
    if len(text.split()) < 100:
        return text

    sentences = sent_tokenize(text)
    # If too few sentences, return all
    if len(sentences) <= sentence_count:
        return text

    # Build the punctuation-stripping table once instead of once per sentence.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    # Tokenize each normalized sentence a single time; the tokens are reused
    # for both the frequency count and the sentence scoring below.
    tokenized_sentences = [
        word_tokenize(sentence.translate(strip_punctuation).lower())
        for sentence in sentences
    ]

    # Count word frequencies, excluding stop words
    word_frequencies = defaultdict(int)
    for tokens in tokenized_sentences:
        for word in tokens:
            if word not in stop_words:
                word_frequencies[word] += 1

    # Normalize frequencies relative to the most frequent word
    max_frequency = max(word_frequencies.values()) if word_frequencies else 1
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_frequency

    # Score each sentence; sentences without any counted word get no entry
    # and therefore can never be selected.
    sentence_scores = defaultdict(int)
    for index, tokens in enumerate(tokenized_sentences):
        for word in tokens:
            if word in word_frequencies:
                sentence_scores[index] += word_frequencies[word]

    # Pick the best sentences, then restore document order
    top_indices = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
    top_indices.sort()
    return ' '.join(sentences[i] for i in top_indices)
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=True)