mike23415 committed on
Commit
80d0b8a
·
verified ·
1 Parent(s): 0ad9cc2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -38
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import io
 
3
  from flask import Flask, request, jsonify
4
  from werkzeug.utils import secure_filename
5
  from PyPDF2 import PdfReader
@@ -9,84 +10,68 @@ import nltk
9
  from nltk.corpus import stopwords
10
  from nltk.tokenize import word_tokenize, sent_tokenize
11
 
 
 
 
 
12
  app = Flask(__name__)
13
 
14
- # Download NLTK data when the app starts
15
- nltk.download('punkt', quiet=True)
16
- nltk.download('stopwords', quiet=True)
 
 
 
 
 
 
 
17
 
18
- # Allowed file extensions
19
  ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
20
 
21
  def allowed_file(filename):
22
  return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
23
 
24
- # Extractive summarization function
25
  def extractive_summary(text, num_sentences=5):
26
- """
27
- Summarizes the given text by selecting the top N most important sentences.
28
-
29
- Args:
30
- text (str): The text to summarize.
31
- num_sentences (int): Number of sentences to include in the summary (default: 5).
32
-
33
- Returns:
34
- str: The summarized text.
35
- """
36
- # Get stop words (e.g., "the", "is") to ignore them
37
  stop_words = set(stopwords.words('english'))
38
-
39
- # Tokenize text into words and sentences
40
  words = word_tokenize(text)
41
  sentences = sent_tokenize(text)
42
-
43
- # If the text has fewer sentences than requested, return the full text
44
  if len(sentences) <= num_sentences:
45
  return text
46
-
47
- # Calculate word frequencies, excluding stop words and non-alphanumeric characters
48
  freq_table = {}
49
  for word in words:
50
  word = word.lower()
51
  if word not in stop_words and word.isalnum():
52
  freq_table[word] = freq_table.get(word, 0) + 1
53
-
54
- # Score sentences based on the frequency of their words
55
  sentence_scores = {}
56
  for sentence in sentences:
57
  for word, freq in freq_table.items():
58
  if word in sentence.lower():
59
  sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq
60
-
61
- # Select the top N sentences with the highest scores
62
  summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
63
- summary = ' '.join(summary_sentences)
64
- return summary
65
 
66
  @app.route("/", methods=["GET"])
67
  def index():
 
68
  return "Document Summarizer API is running! Use /summarize endpoint for POST requests."
69
 
70
  @app.route("/summarize", methods=["POST"])
71
  def summarize():
 
72
  if "file" not in request.files:
 
73
  return jsonify({"error": "No file uploaded"}), 400
74
-
75
  file = request.files["file"]
76
-
77
  if file.filename == "":
 
78
  return jsonify({"error": "No selected file"}), 400
79
-
80
  if not allowed_file(file.filename):
 
81
  return jsonify({"error": "Unsupported file format"}), 400
82
-
83
  filename = secure_filename(file.filename)
84
  file_content = file.read()
85
-
86
- # Process file based on type
87
- summary = None
88
  file_ext = filename.rsplit(".", 1)[1].lower()
89
-
90
  try:
91
  if file_ext == "pdf":
92
  summary = summarize_pdf(file_content)
@@ -96,12 +81,12 @@ def summarize():
96
  summary = summarize_pptx(file_content)
97
  elif file_ext == "txt":
98
  summary = summarize_txt(file_content)
99
-
100
  return jsonify({"filename": filename, "summary": summary})
101
  except Exception as e:
 
102
  return jsonify({"error": f"Error processing file: {str(e)}"}), 500
103
 
104
- # Summarization functions
105
  def summarize_pdf(file_content):
106
  reader = PdfReader(io.BytesIO(file_content))
107
  text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
 
1
  import os
2
  import io
3
+ import logging
4
  from flask import Flask, request, jsonify
5
  from werkzeug.utils import secure_filename
6
  from PyPDF2 import PdfReader
 
10
  from nltk.corpus import stopwords
11
  from nltk.tokenize import word_tokenize, sent_tokenize
12
 
# Module-wide logging: INFO level, with a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Add the NLTK data directory to the search path; the NLTK_DATA environment
# variable wins, otherwise fall back to /app/nltk_data.
nltk_data_dir = os.environ.get("NLTK_DATA", "/app/nltk_data")
nltk.data.path.append(nltk_data_dir)

# Startup sanity check (diagnostic only): look up each required NLTK resource
# and log the outcome. A missing resource is logged, not raised.
try:
    for required_resource in ('tokenizers/punkt', 'corpora/stopwords'):
        nltk.data.find(required_resource)
except LookupError:
    logger.error("NLTK data not found. Check Dockerfile setup.")
else:
    logger.info("NLTK data loaded successfully.")
29
 
 
# Upload extensions (lower-case, without the dot) accepted by the API.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}


def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    The extension is whatever follows the last "." in the name and is
    compared case-insensitively. Names without any "." are rejected.
    """
    _, dot, extension = filename.rpartition(".")
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
34
 
 
def extractive_summary(text, num_sentences=5):
    """Summarize *text* by keeping its highest-scoring sentences.

    Each sentence is scored by summing the document-wide frequency of every
    distinct non-stopword, alphanumeric token it contains. The top
    ``num_sentences`` sentences are returned in their original document
    order, joined by single spaces.

    Args:
        text (str): The text to summarize.
        num_sentences (int): Maximum number of sentences in the summary
            (default: 5).

    Returns:
        str: ``text`` unchanged when it has no more than ``num_sentences``
        sentences; an empty string when ``num_sentences`` is not positive
        and the text is longer than that; otherwise the summary.
    """
    sentences = sent_tokenize(text)

    # Short-circuit before loading stopwords or tokenizing words: nothing
    # needs scoring when the whole text already fits.
    if len(sentences) <= num_sentences:
        return text
    if num_sentences <= 0:
        return ""

    stop_words = set(stopwords.words('english'))

    # Tokenize each sentence exactly once; the same tokens feed both the
    # frequency table and the per-sentence scores.
    sentence_tokens = [
        [token.lower() for token in word_tokenize(sentence)]
        for sentence in sentences
    ]

    freq_table = {}
    for tokens in sentence_tokens:
        for token in tokens:
            if token not in stop_words and token.isalnum():
                freq_table[token] = freq_table.get(token, 0) + 1

    # Score by position rather than by sentence text so repeated sentences do
    # not pool their scores, and match whole tokens rather than substrings
    # (so "art" no longer scores inside "started").
    scored = []
    for position, tokens in enumerate(sentence_tokens):
        score = sum(freq_table.get(token, 0) for token in set(tokens))
        if score:  # sentences with no scoring words are never selected
            scored.append((score, position))

    # Highest score first; ties go to the earlier sentence.
    scored.sort(key=lambda item: (-item[0], item[1]))

    # Emit the chosen sentences in document order so the summary reads
    # coherently.
    chosen_positions = sorted(position for _, position in scored[:num_sentences])
    return ' '.join(sentences[position] for position in chosen_positions)
 
53
 
@app.route("/", methods=["GET"])
def index():
    """Handle GET /: log the access and return a plain-text status message."""
    status_message = "Document Summarizer API is running! Use /summarize endpoint for POST requests."
    logger.info("Root endpoint accessed.")
    return status_message
58
 
59
  @app.route("/summarize", methods=["POST"])
60
  def summarize():
61
+ logger.info("Summarize endpoint called.")
62
  if "file" not in request.files:
63
+ logger.error("No file uploaded.")
64
  return jsonify({"error": "No file uploaded"}), 400
 
65
  file = request.files["file"]
 
66
  if file.filename == "":
67
+ logger.error("No file selected.")
68
  return jsonify({"error": "No selected file"}), 400
 
69
  if not allowed_file(file.filename):
70
+ logger.error(f"Unsupported file format: {file.filename}")
71
  return jsonify({"error": "Unsupported file format"}), 400
 
72
  filename = secure_filename(file.filename)
73
  file_content = file.read()
 
 
 
74
  file_ext = filename.rsplit(".", 1)[1].lower()
 
75
  try:
76
  if file_ext == "pdf":
77
  summary = summarize_pdf(file_content)
 
81
  summary = summarize_pptx(file_content)
82
  elif file_ext == "txt":
83
  summary = summarize_txt(file_content)
84
+ logger.info(f"File {filename} summarized successfully.")
85
  return jsonify({"filename": filename, "summary": summary})
86
  except Exception as e:
87
+ logger.error(f"Error processing file {filename}: {str(e)}")
88
  return jsonify({"error": f"Error processing file: {str(e)}"}), 500
89
 
 
90
  def summarize_pdf(file_content):
91
  reader = PdfReader(io.BytesIO(file_content))
92
  text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])