mike23415 committed
Commit 92d0377 · verified · 1 Parent(s): c4ebcd4

Update app.py

Files changed (1)
  1. app.py +68 -121
app.py CHANGED
@@ -1,34 +1,19 @@
 import os
 import io
-import re
 from flask import Flask, request, jsonify
-from flask_cors import CORS
 from werkzeug.utils import secure_filename
 from PyPDF2 import PdfReader
 from docx import Document
 from pptx import Presentation
 import nltk
-import string
 from nltk.corpus import stopwords
-from nltk.tokenize import sent_tokenize, word_tokenize
-from nltk.probability import FreqDist
-from heapq import nlargest
-from collections import defaultdict
+from nltk.tokenize import word_tokenize, sent_tokenize
 
 app = Flask(__name__)
-CORS(app)  # Enable CORS for all routes
 
-# Set NLTK data path to a directory included in the project
-nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
-os.makedirs(nltk_data_dir, exist_ok=True)
-nltk.data.path.append(nltk_data_dir)
-
-# Ensure NLTK data is available (pre-downloaded)
-try:
-    stopwords.words('english')  # Test if stopwords are accessible
-except LookupError:
-    print("NLTK data not found. Please ensure 'punkt' and 'stopwords' are pre-downloaded in 'nltk_data'.")
-    # Fallback will be used if this fails
+# Download NLTK data when the app starts
+nltk.download('punkt', quiet=True)
+nltk.download('stopwords', quiet=True)
 
 # Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
@@ -36,6 +21,48 @@ ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
 def allowed_file(filename):
     return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
 
+# Extractive summarization function
+def extractive_summary(text, num_sentences=5):
+    """
+    Summarizes the given text by selecting the top N most important sentences.
+
+    Args:
+        text (str): The text to summarize.
+        num_sentences (int): Number of sentences to include in the summary (default: 5).
+
+    Returns:
+        str: The summarized text.
+    """
+    # Get stop words (e.g., "the", "is") to ignore them
+    stop_words = set(stopwords.words('english'))
+
+    # Tokenize text into words and sentences
+    words = word_tokenize(text)
+    sentences = sent_tokenize(text)
+
+    # If the text has fewer sentences than requested, return the full text
+    if len(sentences) <= num_sentences:
+        return text
+
+    # Calculate word frequencies, excluding stop words and non-alphanumeric characters
+    freq_table = {}
+    for word in words:
+        word = word.lower()
+        if word not in stop_words and word.isalnum():
+            freq_table[word] = freq_table.get(word, 0) + 1
+
+    # Score sentences based on the frequency of their words
+    sentence_scores = {}
+    for sentence in sentences:
+        for word, freq in freq_table.items():
+            if word in sentence.lower():
+                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq
+
+    # Select the top N sentences with the highest scores
+    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
+    summary = ' '.join(summary_sentences)
+    return summary
+
 @app.route("/", methods=["GET"])
 def index():
     return "Document Summarizer API is running! Use /summarize endpoint for POST requests."
@@ -57,127 +84,47 @@ def summarize():
     file_content = file.read()
 
     # Process file based on type
-    text = None
+    summary = None
     file_ext = filename.rsplit(".", 1)[1].lower()
 
     try:
         if file_ext == "pdf":
-            text = extract_text_from_pdf(file_content)
+            summary = summarize_pdf(file_content)
         elif file_ext == "docx":
-            text = extract_text_from_docx(file_content)
+            summary = summarize_docx(file_content)
         elif file_ext == "pptx":
-            text = extract_text_from_pptx(file_content)
+            summary = summarize_pptx(file_content)
         elif file_ext == "txt":
-            text = extract_text_from_txt(file_content)
-
-        # Generate a summary of the text
-        try:
-            summary = generate_summary(text)
-        except LookupError as e:
-            print(f"NLTK summarization failed: {e}. Using fallback.")
-            summary = simple_summarize(text)
-        except Exception as e:
-            print(f"Summarization error: {e}")
-            summary = text[:1000] + "..." if len(text) > 1000 else text
-
-        # Include metadata
-        word_count = len(text.split())
-
-        return jsonify({
-            "filename": filename,
-            "summary": summary,
-            "original_word_count": word_count,
-            "summary_word_count": len(summary.split()) if summary else 0
-        })
+            summary = summarize_txt(file_content)
+
+        return jsonify({"filename": filename, "summary": summary})
     except Exception as e:
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500
 
-# Text extraction functions
-def extract_text_from_pdf(file_content):
+# Summarization functions
+def summarize_pdf(file_content):
     reader = PdfReader(io.BytesIO(file_content))
-    text = ""
-    for page in reader.pages:
-        page_text = page.extract_text()
-        if page_text:
-            text += page_text + "\n\n"
-    return clean_text(text)
+    text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
+    return extractive_summary(text, num_sentences=5)
 
-def extract_text_from_docx(file_content):
+def summarize_docx(file_content):
     doc = Document(io.BytesIO(file_content))
-    text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-    return clean_text(text)
+    text = "\n".join([para.text for para in doc.paragraphs])
+    return extractive_summary(text, num_sentences=5)
 
-def extract_text_from_pptx(file_content):
+def summarize_pptx(file_content):
     ppt = Presentation(io.BytesIO(file_content))
     text = []
     for slide in ppt.slides:
         for shape in slide.shapes:
-            if hasattr(shape, "text") and shape.text.strip():
+            if hasattr(shape, "text"):
                 text.append(shape.text)
-    return clean_text("\n".join(text))
-
-def extract_text_from_txt(file_content):
-    text = file_content.decode("utf-8", errors="ignore")
-    return clean_text(text)
-
-def clean_text(text):
-    text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'[^\w\s\.\,\!\?\:\;]', '', text)
-    return text.strip()
+    full_text = "\n".join(text)
+    return extractive_summary(full_text, num_sentences=5)
 
-def generate_summary(text, sentence_count=5):
-    if len(text.split()) < 100:
-        return text
-
-    sentences = sent_tokenize(text)
-    if len(sentences) <= sentence_count:
-        return text
-
-    clean_sentences = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sentences]
-    stop_words = set(stopwords.words('english'))
-
-    word_frequencies = defaultdict(int)
-    for sentence in clean_sentences:
-        for word in word_tokenize(sentence):
-            if word not in stop_words:
-                word_frequencies[word] += 1
-
-    max_frequency = max(word_frequencies.values()) if word_frequencies else 1
-    for word in word_frequencies:
-        word_frequencies[word] = word_frequencies[word] / max_frequency
-
-    sentence_scores = defaultdict(int)
-    for i, sentence in enumerate(clean_sentences):
-        for word in word_tokenize(sentence):
-            if word in word_frequencies:
-                sentence_scores[i] += word_frequencies[word]
-
-    top_indices = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
-    top_indices.sort()
-
-    return ' '.join([sentences[i] for i in top_indices])
-
-def simple_summarize(text, max_chars=1000):
-    paragraphs = text.split('\n\n')
-    base_summary = ' '.join(paragraphs[:3])
-
-    if len(text) <= max_chars:
-        return text
-
-    if len(base_summary) < max_chars:
-        remaining_text = ' '.join(paragraphs[3:])
-        sentences = re.split(r'(?<=[.!?])\s+', remaining_text)
-        for sentence in sentences:
-            if len(base_summary) + len(sentence) + 1 <= max_chars:
-                base_summary += ' ' + sentence
-            else:
-                break
-
-    if len(base_summary) > max_chars:
-        base_summary = base_summary[:max_chars] + "..."
-
-    return base_summary
+def summarize_txt(file_content):
+    text = file_content.decode("utf-8")
+    return extractive_summary(text, num_sentences=5)
 
 if __name__ == "__main__":
-    # For local testing only
-    app.run(host="0.0.0.0", port=7860)
+    app.run(host="0.0.0.0", port=7860, debug=True)
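
The committed extractive_summary can be sanity-checked on its own, outside Flask. A minimal sketch, assuming app.py is importable as a module named `app` and that the punkt/stopwords downloads at import time succeed; the sample text is made up for illustration:

    # Standalone check of the frequency-based scorer from this commit.
    # The module name "app" and the sample text are assumptions for illustration.
    from app import extractive_summary

    sample = (
        "Flask is a lightweight web framework for Python. "
        "It is often used to build small JSON APIs. "
        "NLTK supplies the tokenizers and the stopword list. "
        "The summarizer counts how often each non-stopword appears. "
        "Sentences containing frequent words score highest. "
        "The top-scoring sentences become the summary."
    )
    print(extractive_summary(sample, num_sentences=2))

Because the final sort orders sentences by score rather than by position, the printed sentences may not appear in their original document order.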
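To exercise the updated /summarize route end to end, a client along these lines should work; note that the multipart field name "file" is an assumption, since the upload-handling lines of summarize() fall outside the visible hunks:

    # Hypothetical client for the /summarize endpoint (field name "file" assumed).
    import requests

    with open("report.pdf", "rb") as fh:
        resp = requests.post("http://localhost:7860/summarize", files={"file": fh})

    # Per this commit, a success response carries only these two keys:
    # {"filename": ..., "summary": ...}
    print(resp.json())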
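One caveat worth knowing when testing: the scoring loop uses `if word in sentence.lower()`, which is a substring test, so a frequent short token like "art" also credits sentences containing "start" or "particle". A token-exact variant, sketched here as an alternative rather than what the commit ships:

    # Alternative scoring that matches whole tokens instead of substrings.
    # Sketch only; the committed code uses the substring test above.
    from nltk.tokenize import word_tokenize

    def score_sentences(sentences, freq_table):
        scores = {}
        for sentence in sentences:
            for token in word_tokenize(sentence.lower()):
                if token in freq_table:
                    scores[sentence] = scores.get(sentence, 0) + freq_table[token]
        return scores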