mike23415 committed on
Commit
b4aa0e4
·
verified ·
1 Parent(s): 24aa083

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -27
app.py CHANGED
@@ -1,12 +1,14 @@
1
  import os
2
  import io
3
  import logging
 
4
  from flask import Flask, request, jsonify
5
  from werkzeug.utils import secure_filename
6
  from PyPDF2 import PdfReader
7
  from docx import Document
8
  from pptx import Presentation
9
  from transformers import T5Tokenizer, T5ForConditionalGeneration
 
10
 
11
  # Configure logging
12
  logging.basicConfig(level=logging.INFO)
@@ -14,15 +16,18 @@ logger = logging.getLogger(__name__)
14
 
15
  # Initialize Flask app
16
  app = Flask(__name__)
 
17
 
18
- # Set Hugging Face cache directory
19
- os.environ["HF_HOME"] = "/app/hf_cache"
 
 
20
 
21
  # Load T5 model and tokenizer
22
  logger.info("Loading T5-Base model...")
23
  try:
24
- tokenizer = T5Tokenizer.from_pretrained("t5-base")
25
- model = T5ForConditionalGeneration.from_pretrained("t5-base")
26
  logger.info("T5-Base model loaded successfully.")
27
  except Exception as e:
28
  logger.error(f"Failed to load T5-Base: {str(e)}")
@@ -40,7 +45,8 @@ def summarize_text(text, max_length=150, min_length=30):
40
  if not text.strip():
41
  return "No text found in the document to summarize."
42
 
43
- input_text = "summarize: " + text
 
44
  inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
45
  summary_ids = model.generate(
46
  inputs["input_ids"],
@@ -54,7 +60,7 @@ def summarize_text(text, max_length=150, min_length=30):
54
  return summary
55
  except Exception as e:
56
  logger.error(f"Error in T5 summarization: {str(e)}")
57
- return "Error summarizing text."
58
 
59
  @app.route("/", methods=["GET"])
60
  def index():
@@ -65,23 +71,28 @@ def index():
65
  @app.route("/summarize", methods=["POST"])
66
  def summarize():
67
  logger.info("Summarize endpoint called.")
68
-
 
 
 
 
 
69
  # Check if a file is in the request
70
  if "file" not in request.files:
71
- logger.error("No file uploaded.")
72
- return jsonify({"error": "No file uploaded"}), 400
73
 
74
  file = request.files["file"]
75
 
76
  # Check if file is empty
77
  if file.filename == "":
78
- logger.error("No file selected.")
79
  return jsonify({"error": "No selected file"}), 400
80
 
81
  # Check if file has an allowed extension
82
  if not allowed_file(file.filename):
83
  logger.error(f"Unsupported file format: {file.filename}")
84
- return jsonify({"error": "Unsupported file format"}), 400
85
 
86
  # Process the file
87
  filename = secure_filename(file.filename)
@@ -102,10 +113,15 @@ def summarize():
102
  return jsonify({"error": "Unsupported file format"}), 400
103
 
104
  # Generate summary
 
105
  summary = summarize_text(text)
106
 
107
  logger.info(f"File {filename} summarized successfully.")
108
- return jsonify({"filename": filename, "summary": summary})
 
 
 
 
109
 
110
  except Exception as e:
111
  logger.error(f"Error processing file {filename}: {str(e)}")
@@ -113,29 +129,49 @@ def summarize():
113
 
114
  def summarize_pdf(file_content):
115
  """Extract text from PDF."""
116
- reader = PdfReader(io.BytesIO(file_content))
117
- text = "\n".join([page.extract_text() or "" for page in reader.pages])
118
- return text.strip()
 
 
 
 
119
 
120
  def summarize_docx(file_content):
121
  """Extract text from DOCX."""
122
- doc = Document(io.BytesIO(file_content))
123
- text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
124
- return text.strip()
 
 
 
 
125
 
126
  def summarize_pptx(file_content):
127
  """Extract text from PPTX."""
128
- ppt = Presentation(io.BytesIO(file_content))
129
- text = []
130
- for slide in ppt.slides:
131
- for shape in slide.shapes:
132
- if hasattr(shape, "text") and shape.text.strip():
133
- text.append(shape.text.strip())
134
- return "\n".join(text).strip()
 
 
 
 
135
 
136
  def summarize_txt(file_content):
137
  """Extract text from TXT file."""
138
- return file_content.decode("utf-8").strip()
 
 
 
 
 
 
 
 
139
 
140
  if __name__ == "__main__":
141
- app.run(host="0.0.0.0", port=7860, debug=True)
 
1
  import os
2
  import io
3
  import logging
4
+ import tempfile
5
  from flask import Flask, request, jsonify
6
  from werkzeug.utils import secure_filename
7
  from PyPDF2 import PdfReader
8
  from docx import Document
9
  from pptx import Presentation
10
  from transformers import T5Tokenizer, T5ForConditionalGeneration
11
+ from flask_cors import CORS # Import CORS for cross-origin requests
12
 
13
  # Configure logging
14
  logging.basicConfig(level=logging.INFO)
 
16
 
17
  # Initialize Flask app
18
  app = Flask(__name__)
19
+ CORS(app) # Enable CORS for all routes
20
 
21
# Use a stable, writable directory for the Hugging Face cache.
# tempfile.mkdtemp() would create a brand-new directory on every import,
# forcing a full model re-download on each start (and again in the child
# process spawned by Flask's reloader) while never removing the old ones.
cache_dir = os.path.join(tempfile.gettempdir(), "hf_cache")
os.makedirs(cache_dir, exist_ok=True)
os.environ["HF_HOME"] = cache_dir
os.environ["TRANSFORMERS_CACHE"] = cache_dir
25
 
26
  # Load T5 model and tokenizer
27
  logger.info("Loading T5-Base model...")
28
  try:
29
+ tokenizer = T5Tokenizer.from_pretrained("t5-base", cache_dir=cache_dir)
30
+ model = T5ForConditionalGeneration.from_pretrained("t5-base", cache_dir=cache_dir)
31
  logger.info("T5-Base model loaded successfully.")
32
  except Exception as e:
33
  logger.error(f"Failed to load T5-Base: {str(e)}")
 
45
  if not text.strip():
46
  return "No text found in the document to summarize."
47
 
48
+ # Limit text length to prevent tokenizer errors
49
+ input_text = "summarize: " + text[:10000] # Limiting to 10K chars to be safe
50
  inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
51
  summary_ids = model.generate(
52
  inputs["input_ids"],
 
60
  return summary
61
  except Exception as e:
62
  logger.error(f"Error in T5 summarization: {str(e)}")
63
+ return f"Error summarizing text: {str(e)}"
64
 
65
  @app.route("/", methods=["GET"])
66
  def index():
 
71
  @app.route("/summarize", methods=["POST"])
72
  def summarize():
73
  logger.info("Summarize endpoint called.")
74
+
75
+ # Debug the incoming request
76
+ logger.info(f"Request headers: {request.headers}")
77
+ logger.info(f"Request files: {request.files}")
78
+ logger.info(f"Request form: {request.form}")
79
+
80
  # Check if a file is in the request
81
  if "file" not in request.files:
82
+ logger.error("No file found in request.files")
83
+ return jsonify({"error": "No file uploaded. Make sure to use 'file' as the form field name."}), 400
84
 
85
  file = request.files["file"]
86
 
87
  # Check if file is empty
88
  if file.filename == "":
89
+ logger.error("File has no filename")
90
  return jsonify({"error": "No selected file"}), 400
91
 
92
  # Check if file has an allowed extension
93
  if not allowed_file(file.filename):
94
  logger.error(f"Unsupported file format: {file.filename}")
95
+ return jsonify({"error": f"Unsupported file format. Allowed types are: {', '.join(ALLOWED_EXTENSIONS)}"}), 400
96
 
97
  # Process the file
98
  filename = secure_filename(file.filename)
 
113
  return jsonify({"error": "Unsupported file format"}), 400
114
 
115
  # Generate summary
116
+ logger.info(f"Generating summary for {filename} with text length {len(text)}")
117
  summary = summarize_text(text)
118
 
119
  logger.info(f"File {filename} summarized successfully.")
120
+ return jsonify({
121
+ "filename": filename,
122
+ "summary": summary,
123
+ "textLength": len(text)
124
+ })
125
 
126
  except Exception as e:
127
  logger.error(f"Error processing file {filename}: {str(e)}")
 
129
 
130
def summarize_pdf(file_content):
    """Extract text from PDF.

    Args:
        file_content: Raw bytes of the PDF file (wrapped in ``io.BytesIO``).

    Returns:
        The text of every page joined with newlines and stripped of
        surrounding whitespace. A page whose extraction yields nothing
        contributes an empty string.

    Raises:
        Exception: If the PDF cannot be read; the underlying error is
            chained as ``__cause__``.
    """
    try:
        reader = PdfReader(io.BytesIO(file_content))
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
        return text.strip()
    except Exception as e:
        # logger.exception keeps the traceback in the log, and "from e"
        # preserves the original error for whoever handles the re-raise.
        logger.exception("Error extracting text from PDF: %s", e)
        raise Exception(f"Failed to extract text from PDF: {e}") from e
139
 
140
def summarize_docx(file_content):
    """Extract text from DOCX.

    Args:
        file_content: Raw bytes of the DOCX file (wrapped in ``io.BytesIO``).

    Returns:
        The non-blank paragraphs of the document joined with newlines and
        stripped of surrounding whitespace.

    Raises:
        Exception: If the document cannot be read; the underlying error is
            chained as ``__cause__``.
    """
    try:
        doc = Document(io.BytesIO(file_content))
        # Skip whitespace-only paragraphs so they do not add blank lines.
        text = "\n".join(
            para.text for para in doc.paragraphs if para.text.strip()
        )
        return text.strip()
    except Exception as e:
        # logger.exception keeps the traceback in the log, and "from e"
        # preserves the original error for whoever handles the re-raise.
        logger.exception("Error extracting text from DOCX: %s", e)
        raise Exception(f"Failed to extract text from DOCX: {e}") from e
149
 
150
def summarize_pptx(file_content):
    """Extract text from PPTX.

    Args:
        file_content: Raw bytes of the PPTX file (wrapped in ``io.BytesIO``).

    Returns:
        The stripped text of every shape that exposes non-blank ``text``,
        in slide order, joined with newlines.

    Raises:
        Exception: If the presentation cannot be read; the underlying error
            is chained as ``__cause__``.
    """
    try:
        ppt = Presentation(io.BytesIO(file_content))
        # Not every shape carries text, hence the hasattr guard.
        text = [
            shape.text.strip()
            for slide in ppt.slides
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        ]
        return "\n".join(text).strip()
    except Exception as e:
        # logger.exception keeps the traceback in the log, and "from e"
        # preserves the original error for whoever handles the re-raise.
        logger.exception("Error extracting text from PPTX: %s", e)
        raise Exception(f"Failed to extract text from PPTX: {e}") from e
163
 
164
def summarize_txt(file_content):
    """Extract text from TXT file.

    Args:
        file_content: Raw bytes of the text file.

    Returns:
        The decoded text with surrounding whitespace removed. UTF-8 is
        tried first (a leading byte-order mark is dropped); bytes that are
        not valid UTF-8 are decoded as Latin-1 instead.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 as well, and additionally removes
        # a leading BOM that would otherwise end up in the model input.
        return file_content.decode("utf-8-sig").strip()
    except UnicodeDecodeError:
        # Latin-1 maps every possible byte to a code point, so this
        # fallback cannot raise and needs no error handling of its own.
        return file_content.decode("latin-1").strip()
175
 
176
if __name__ == "__main__":
    # The server binds to all interfaces, so the Werkzeug interactive
    # debugger (which lets a client execute arbitrary code) must never be
    # on by default. Enable it explicitly with FLASK_DEBUG=1 for local work.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)