Update app.py
app.py CHANGED

@@ -1,5 +1,6 @@
 import os
 import io
+import logging
 from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
 from PyPDF2 import PdfReader
@@ -9,84 +10,68 @@ import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize

+# Configure logging for debugging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 app = Flask(__name__)

-#
-nltk.
-
+# Use the NLTK data path set by the Dockerfile
+nltk.data.path.append(os.getenv("NLTK_DATA", "/app/nltk_data"))
+
+# Verify NLTK data is accessible (optional, for debugging)
+try:
+    nltk.data.find('tokenizers/punkt')
+    nltk.data.find('corpora/stopwords')
+    logger.info("NLTK data loaded successfully.")
+except LookupError:
+    logger.error("NLTK data not found. Check Dockerfile setup.")

-# Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}

 def allowed_file(filename):
     return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS

-# Extractive summarization function
 def extractive_summary(text, num_sentences=5):
-    """
-    Summarizes the given text by selecting the top N most important sentences.
-
-    Args:
-        text (str): The text to summarize.
-        num_sentences (int): Number of sentences to include in the summary (default: 5).
-
-    Returns:
-        str: The summarized text.
-    """
-    # Get stop words (e.g., "the", "is") to ignore them
     stop_words = set(stopwords.words('english'))
-
-    # Tokenize text into words and sentences
     words = word_tokenize(text)
     sentences = sent_tokenize(text)
-
-    # If the text has fewer sentences than requested, return the full text
     if len(sentences) <= num_sentences:
         return text
-
-    # Calculate word frequencies, excluding stop words and non-alphanumeric characters
     freq_table = {}
     for word in words:
         word = word.lower()
         if word not in stop_words and word.isalnum():
             freq_table[word] = freq_table.get(word, 0) + 1
-
-    # Score sentences based on the frequency of their words
     sentence_scores = {}
     for sentence in sentences:
         for word, freq in freq_table.items():
             if word in sentence.lower():
                 sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq
-
-    # Select the top N sentences with the highest scores
     summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
-
-    return summary
+    return ' '.join(summary_sentences)

 @app.route("/", methods=["GET"])
 def index():
+    logger.info("Root endpoint accessed.")
     return "Document Summarizer API is running! Use /summarize endpoint for POST requests."

 @app.route("/summarize", methods=["POST"])
 def summarize():
+    logger.info("Summarize endpoint called.")
     if "file" not in request.files:
+        logger.error("No file uploaded.")
         return jsonify({"error": "No file uploaded"}), 400
-
     file = request.files["file"]
-
     if file.filename == "":
+        logger.error("No file selected.")
         return jsonify({"error": "No selected file"}), 400
-
     if not allowed_file(file.filename):
+        logger.error(f"Unsupported file format: {file.filename}")
         return jsonify({"error": "Unsupported file format"}), 400
-
     filename = secure_filename(file.filename)
     file_content = file.read()
-
-    # Process file based on type
-    summary = None
     file_ext = filename.rsplit(".", 1)[1].lower()
-
     try:
         if file_ext == "pdf":
             summary = summarize_pdf(file_content)
@@ -96,12 +81,12 @@ def summarize():
             summary = summarize_pptx(file_content)
         elif file_ext == "txt":
             summary = summarize_txt(file_content)
-
+        logger.info(f"File {filename} summarized successfully.")
         return jsonify({"filename": filename, "summary": summary})
     except Exception as e:
+        logger.error(f"Error processing file {filename}: {str(e)}")
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500

-# Summarization functions
 def summarize_pdf(file_content):
     reader = PdfReader(io.BytesIO(file_content))
     text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])