Update app.py
app.py CHANGED

@@ -1,5 +1,6 @@
 import os
 import io
+import logging
 from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
 from PyPDF2 import PdfReader
@@ -9,84 +10,68 @@ import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize

+# Configure logging for debugging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 app = Flask(__name__)

-#
-nltk.
-
+# Use the NLTK data path set by the Dockerfile
+nltk.data.path.append(os.getenv("NLTK_DATA", "/app/nltk_data"))
+
+# Verify NLTK data is accessible (optional, for debugging)
+try:
+    nltk.data.find('tokenizers/punkt')
+    nltk.data.find('corpora/stopwords')
+    logger.info("NLTK data loaded successfully.")
+except LookupError:
+    logger.error("NLTK data not found. Check Dockerfile setup.")

-# Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}

 def allowed_file(filename):
     return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS

-# Extractive summarization function
 def extractive_summary(text, num_sentences=5):
-    """
-    Summarizes the given text by selecting the top N most important sentences.
-
-    Args:
-        text (str): The text to summarize.
-        num_sentences (int): Number of sentences to include in the summary (default: 5).
-
-    Returns:
-        str: The summarized text.
-    """
-    # Get stop words (e.g., "the", "is") to ignore them
     stop_words = set(stopwords.words('english'))
-
-    # Tokenize text into words and sentences
     words = word_tokenize(text)
     sentences = sent_tokenize(text)
-
-    # If the text has fewer sentences than requested, return the full text
     if len(sentences) <= num_sentences:
         return text
-
-    # Calculate word frequencies, excluding stop words and non-alphanumeric characters
     freq_table = {}
     for word in words:
         word = word.lower()
         if word not in stop_words and word.isalnum():
             freq_table[word] = freq_table.get(word, 0) + 1
-
-    # Score sentences based on the frequency of their words
     sentence_scores = {}
     for sentence in sentences:
         for word, freq in freq_table.items():
             if word in sentence.lower():
                 sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq
-
-    # Select the top N sentences with the highest scores
     summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
-
-    return summary
+    return ' '.join(summary_sentences)

 @app.route("/", methods=["GET"])
 def index():
+    logger.info("Root endpoint accessed.")
     return "Document Summarizer API is running! Use /summarize endpoint for POST requests."

 @app.route("/summarize", methods=["POST"])
 def summarize():
+    logger.info("Summarize endpoint called.")
     if "file" not in request.files:
+        logger.error("No file uploaded.")
         return jsonify({"error": "No file uploaded"}), 400
-
     file = request.files["file"]
-
     if file.filename == "":
+        logger.error("No file selected.")
         return jsonify({"error": "No selected file"}), 400
-
     if not allowed_file(file.filename):
+        logger.error(f"Unsupported file format: {file.filename}")
         return jsonify({"error": "Unsupported file format"}), 400
-
     filename = secure_filename(file.filename)
     file_content = file.read()
-
-    # Process file based on type
-    summary = None
     file_ext = filename.rsplit(".", 1)[1].lower()
-
     try:
         if file_ext == "pdf":
             summary = summarize_pdf(file_content)
@@ -96,12 +81,12 @@ def summarize():
             summary = summarize_pptx(file_content)
         elif file_ext == "txt":
             summary = summarize_txt(file_content)
-
+        logger.info(f"File {filename} summarized successfully.")
         return jsonify({"filename": filename, "summary": summary})
     except Exception as e:
+        logger.error(f"Error processing file {filename}: {str(e)}")
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500

-# Summarization functions
 def summarize_pdf(file_content):
     reader = PdfReader(io.BytesIO(file_content))
     text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])