mike23415 committed
Commit b7db40a · verified · 1 Parent(s): 3e65e21

Update app.py

Files changed (1)
  1. app.py +107 -21
app.py CHANGED
@@ -1,12 +1,30 @@
 import os
 import io
-from flask import Flask, request, jsonify, render_template
+import re
+from flask import Flask, request, jsonify
+from flask_cors import CORS
 from werkzeug.utils import secure_filename
 from PyPDF2 import PdfReader
 from docx import Document
 from pptx import Presentation
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.probability import FreqDist
+from heapq import nlargest
+from collections import defaultdict
+import string

 app = Flask(__name__)
+CORS(app) # Enable CORS for all routes
+
+# Download necessary NLTK data
+try:
+    nltk.data.find('tokenizers/punkt')
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('punkt')
+    nltk.download('stopwords')

 # Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
@@ -35,46 +53,114 @@ def summarize():
     file_content = file.read()

     # Process file based on type
-    summary = None
+    text = None
     file_ext = filename.rsplit(".", 1)[1].lower()

     try:
         if file_ext == "pdf":
-            summary = summarize_pdf(file_content)
+            text = extract_text_from_pdf(file_content)
         elif file_ext == "docx":
-            summary = summarize_docx(file_content)
+            text = extract_text_from_docx(file_content)
         elif file_ext == "pptx":
-            summary = summarize_pptx(file_content)
+            text = extract_text_from_pptx(file_content)
         elif file_ext == "txt":
-            summary = summarize_txt(file_content)
-
-        return jsonify({"filename": filename, "summary": summary})
+            text = extract_text_from_txt(file_content)
+
+        # Generate a summary of the text
+        summary = generate_summary(text)
+
+        # Include metadata
+        word_count = len(text.split())
+
+        return jsonify({
+            "filename": filename,
+            "summary": summary,
+            "original_word_count": word_count,
+            "summary_word_count": len(summary.split())
+        })
     except Exception as e:
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500

-# Summarization functions
-def summarize_pdf(file_content):
+# Improved text extraction functions
+def extract_text_from_pdf(file_content):
     reader = PdfReader(io.BytesIO(file_content))
-    text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
-    return text[:500] # Returning a short summary (first 500 chars)
+    text = ""
+    for page in reader.pages:
+        page_text = page.extract_text()
+        if page_text:
+            text += page_text + "\n\n"
+    return clean_text(text)

-def summarize_docx(file_content):
+def extract_text_from_docx(file_content):
     doc = Document(io.BytesIO(file_content))
-    text = "\n".join([para.text for para in doc.paragraphs])
-    return text[:500]
+    text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
+    return clean_text(text)

-def summarize_pptx(file_content):
+def extract_text_from_pptx(file_content):
     ppt = Presentation(io.BytesIO(file_content))
     text = []
     for slide in ppt.slides:
         for shape in slide.shapes:
-            if hasattr(shape, "text"):
+            if hasattr(shape, "text") and shape.text.strip():
                 text.append(shape.text)
-    return "\n".join(text)[:500]
+    return clean_text("\n".join(text))

-def summarize_txt(file_content):
-    text = file_content.decode("utf-8")
-    return text[:500]
+def extract_text_from_txt(file_content):
+    text = file_content.decode("utf-8", errors="ignore")
+    return clean_text(text)
+
+def clean_text(text):
+    # Remove excess whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Remove special characters but keep sentence punctuation
+    text = re.sub(r'[^\w\s\.\,\!\?\:\;]', '', text)
+    return text.strip()
+
+def generate_summary(text, sentence_count=5):
+    # If text is very short, return it as is
+    if len(text.split()) < 100:
+        return text
+
+    # Tokenize the text into sentences
+    sentences = sent_tokenize(text)
+
+    # If too few sentences, return all
+    if len(sentences) <= sentence_count:
+        return text
+
+    # Remove punctuation and convert to lowercase for processing
+    clean_sentences = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sentences]
+
+    # Get stop words
+    stop_words = set(stopwords.words('english'))
+
+    # Calculate word frequencies excluding stop words
+    word_frequencies = defaultdict(int)
+    for sentence in clean_sentences:
+        for word in word_tokenize(sentence):
+            if word not in stop_words:
+                word_frequencies[word] += 1
+
+    # Normalize frequencies
+    max_frequency = max(word_frequencies.values()) if word_frequencies else 1
+    for word in word_frequencies:
+        word_frequencies[word] = word_frequencies[word] / max_frequency
+
+    # Calculate sentence scores based on word frequencies
+    sentence_scores = defaultdict(int)
+    for i, sentence in enumerate(clean_sentences):
+        for word in word_tokenize(sentence):
+            if word in word_frequencies:
+                sentence_scores[i] += word_frequencies[word]
+
+    # Get top sentences
+    top_indices = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
+    top_indices.sort() # Sort to maintain original order
+
+    # Combine top sentences to form summary
+    summary = ' '.join([sentences[i] for i in top_indices])
+
+    return summary

 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=True)
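For context, a minimal client-side sketch of how the updated JSON response might be consumed. The /summarize route path and the "file" form field name are assumptions here (the route decorator sits outside these hunks); the port comes from app.run(...) in the diff.

# Hypothetical client; "/summarize" and the "file" field name are assumed,
# since the @app.route decorator is not part of this diff.
import requests

with open("sample.txt", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/summarize",  # port 7860 from app.run(...) above
        files={"file": ("sample.txt", f, "text/plain")},
    )

data = resp.json()
if resp.ok:
    # Fields added by this commit: summary plus word-count metadata
    print(data["summary"])
    print(data["original_word_count"], "->", data["summary_word_count"], "words")
else:
    print("Error:", data.get("error"))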
 
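Likewise, a quick local check of the new frequency-based generate_summary, assuming the file above is importable as app. Depending on the installed NLTK version, sent_tokenize may also need the punkt_tab resource in addition to the punkt and stopwords downloads in the diff.

# Hedged sketch: exercise generate_summary directly; the sample text and the
# extra punkt_tab download are illustrative assumptions, not part of the commit.
import nltk

try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab")  # needed by sent_tokenize on newer NLTK releases

from app import generate_summary

steps = [
    "Flask receives the uploaded file",
    "PyPDF2, python-docx, and python-pptx extract the raw text",
    "clean_text normalizes whitespace and punctuation",
    "NLTK tokenizes the text into sentences and words",
    "stop words are dropped and word frequencies are normalized",
    "each sentence is scored by the frequencies of its words",
    "the top sentences are returned in their original order",
]
# Build a >100-word, multi-sentence sample so the scoring branch is exercised
# (shorter inputs are returned unchanged by design).
sample = " ".join(
    f"In this pipeline, {s}, which is step {i} of the summarizer."
    for i, s in enumerate(steps, 1)
)

print(generate_summary(sample, sentence_count=3))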