Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,12 +1,14 @@
|
|
1 |
import os
|
2 |
import io
|
3 |
import logging
|
|
|
4 |
from flask import Flask, request, jsonify
|
5 |
from werkzeug.utils import secure_filename
|
6 |
from PyPDF2 import PdfReader
|
7 |
from docx import Document
|
8 |
from pptx import Presentation
|
9 |
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
|
|
10 |
|
11 |
# Configure logging
|
12 |
logging.basicConfig(level=logging.INFO)
|
@@ -14,15 +16,18 @@ logger = logging.getLogger(__name__)
|
|
14 |
|
15 |
# Initialize Flask app
|
16 |
app = Flask(__name__)
|
|
|
17 |
|
18 |
-
# Set Hugging Face cache
|
19 |
-
|
|
|
|
|
20 |
|
21 |
# Load T5 model and tokenizer
|
22 |
logger.info("Loading T5-Base model...")
|
23 |
try:
|
24 |
-
tokenizer = T5Tokenizer.from_pretrained("t5-base")
|
25 |
-
model = T5ForConditionalGeneration.from_pretrained("t5-base")
|
26 |
logger.info("T5-Base model loaded successfully.")
|
27 |
except Exception as e:
|
28 |
logger.error(f"Failed to load T5-Base: {str(e)}")
|
@@ -40,7 +45,8 @@ def summarize_text(text, max_length=150, min_length=30):
|
|
40 |
if not text.strip():
|
41 |
return "No text found in the document to summarize."
|
42 |
|
43 |
-
|
|
|
44 |
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
|
45 |
summary_ids = model.generate(
|
46 |
inputs["input_ids"],
|
@@ -54,7 +60,7 @@ def summarize_text(text, max_length=150, min_length=30):
|
|
54 |
return summary
|
55 |
except Exception as e:
|
56 |
logger.error(f"Error in T5 summarization: {str(e)}")
|
57 |
-
return "Error summarizing text
|
58 |
|
59 |
@app.route("/", methods=["GET"])
|
60 |
def index():
|
@@ -65,23 +71,28 @@ def index():
|
|
65 |
@app.route("/summarize", methods=["POST"])
|
66 |
def summarize():
|
67 |
logger.info("Summarize endpoint called.")
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
69 |
# Check if a file is in the request
|
70 |
if "file" not in request.files:
|
71 |
-
logger.error("No file
|
72 |
-
return jsonify({"error": "No file uploaded"}), 400
|
73 |
|
74 |
file = request.files["file"]
|
75 |
|
76 |
# Check if file is empty
|
77 |
if file.filename == "":
|
78 |
-
logger.error("
|
79 |
return jsonify({"error": "No selected file"}), 400
|
80 |
|
81 |
# Check if file has an allowed extension
|
82 |
if not allowed_file(file.filename):
|
83 |
logger.error(f"Unsupported file format: {file.filename}")
|
84 |
-
return jsonify({"error": "Unsupported file format"}), 400
|
85 |
|
86 |
# Process the file
|
87 |
filename = secure_filename(file.filename)
|
@@ -102,10 +113,15 @@ def summarize():
|
|
102 |
return jsonify({"error": "Unsupported file format"}), 400
|
103 |
|
104 |
# Generate summary
|
|
|
105 |
summary = summarize_text(text)
|
106 |
|
107 |
logger.info(f"File {filename} summarized successfully.")
|
108 |
-
return jsonify({
|
|
|
|
|
|
|
|
|
109 |
|
110 |
except Exception as e:
|
111 |
logger.error(f"Error processing file {filename}: {str(e)}")
|
@@ -113,29 +129,49 @@ def summarize():
|
|
113 |
|
114 |
def summarize_pdf(file_content):
|
115 |
"""Extract text from PDF."""
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
119 |
|
120 |
def summarize_docx(file_content):
|
121 |
"""Extract text from DOCX."""
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
|
|
125 |
|
126 |
def summarize_pptx(file_content):
|
127 |
"""Extract text from PPTX."""
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
for
|
132 |
-
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
135 |
|
136 |
def summarize_txt(file_content):
|
137 |
"""Extract text from TXT file."""
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
if __name__ == "__main__":
|
141 |
-
app.run(host="0.0.0.0", port=7860, debug=True)
|
|
|
1 |
import os
|
2 |
import io
|
3 |
import logging
|
4 |
+
import tempfile
|
5 |
from flask import Flask, request, jsonify
|
6 |
from werkzeug.utils import secure_filename
|
7 |
from PyPDF2 import PdfReader
|
8 |
from docx import Document
|
9 |
from pptx import Presentation
|
10 |
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
11 |
+
from flask_cors import CORS # Import CORS for cross-origin requests
|
12 |
|
13 |
# Configure logging
|
14 |
logging.basicConfig(level=logging.INFO)
|
|
|
16 |
|
17 |
# Initialize Flask app
|
18 |
app = Flask(__name__)
|
19 |
+
CORS(app) # Enable CORS for all routes
|
20 |
|
21 |
+
# Set up a temporary directory for Hugging Face cache
|
22 |
+
cache_dir = tempfile.mkdtemp()
|
23 |
+
os.environ["HF_HOME"] = cache_dir
|
24 |
+
os.environ["TRANSFORMERS_CACHE"] = cache_dir
|
25 |
|
26 |
# Load T5 model and tokenizer
|
27 |
logger.info("Loading T5-Base model...")
|
28 |
try:
|
29 |
+
tokenizer = T5Tokenizer.from_pretrained("t5-base", cache_dir=cache_dir)
|
30 |
+
model = T5ForConditionalGeneration.from_pretrained("t5-base", cache_dir=cache_dir)
|
31 |
logger.info("T5-Base model loaded successfully.")
|
32 |
except Exception as e:
|
33 |
logger.error(f"Failed to load T5-Base: {str(e)}")
|
|
|
45 |
if not text.strip():
|
46 |
return "No text found in the document to summarize."
|
47 |
|
48 |
+
# Limit text length to prevent tokenizer errors
|
49 |
+
input_text = "summarize: " + text[:10000] # Limiting to 10K chars to be safe
|
50 |
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
|
51 |
summary_ids = model.generate(
|
52 |
inputs["input_ids"],
|
|
|
60 |
return summary
|
61 |
except Exception as e:
|
62 |
logger.error(f"Error in T5 summarization: {str(e)}")
|
63 |
+
return f"Error summarizing text: {str(e)}"
|
64 |
|
65 |
@app.route("/", methods=["GET"])
|
66 |
def index():
|
|
|
71 |
@app.route("/summarize", methods=["POST"])
|
72 |
def summarize():
|
73 |
logger.info("Summarize endpoint called.")
|
74 |
+
|
75 |
+
# Debug the incoming request
|
76 |
+
logger.info(f"Request headers: {request.headers}")
|
77 |
+
logger.info(f"Request files: {request.files}")
|
78 |
+
logger.info(f"Request form: {request.form}")
|
79 |
+
|
80 |
# Check if a file is in the request
|
81 |
if "file" not in request.files:
|
82 |
+
logger.error("No file found in request.files")
|
83 |
+
return jsonify({"error": "No file uploaded. Make sure to use 'file' as the form field name."}), 400
|
84 |
|
85 |
file = request.files["file"]
|
86 |
|
87 |
# Check if file is empty
|
88 |
if file.filename == "":
|
89 |
+
logger.error("File has no filename")
|
90 |
return jsonify({"error": "No selected file"}), 400
|
91 |
|
92 |
# Check if file has an allowed extension
|
93 |
if not allowed_file(file.filename):
|
94 |
logger.error(f"Unsupported file format: {file.filename}")
|
95 |
+
return jsonify({"error": f"Unsupported file format. Allowed types are: {', '.join(ALLOWED_EXTENSIONS)}"}), 400
|
96 |
|
97 |
# Process the file
|
98 |
filename = secure_filename(file.filename)
|
|
|
113 |
return jsonify({"error": "Unsupported file format"}), 400
|
114 |
|
115 |
# Generate summary
|
116 |
+
logger.info(f"Generating summary for {filename} with text length {len(text)}")
|
117 |
summary = summarize_text(text)
|
118 |
|
119 |
logger.info(f"File {filename} summarized successfully.")
|
120 |
+
return jsonify({
|
121 |
+
"filename": filename,
|
122 |
+
"summary": summary,
|
123 |
+
"textLength": len(text)
|
124 |
+
})
|
125 |
|
126 |
except Exception as e:
|
127 |
logger.error(f"Error processing file {filename}: {str(e)}")
|
|
|
129 |
|
130 |
def summarize_pdf(file_content):
    """Extract text from PDF.

    Args:
        file_content: Raw bytes of the PDF file.

    Returns:
        The text of every page joined with newlines, with leading and
        trailing whitespace stripped. A page whose ``extract_text()``
        result is falsy contributes an empty string.

    Raises:
        RuntimeError: If the PDF cannot be opened or read. The underlying
            error is attached as ``__cause__``. ``RuntimeError`` is a
            subclass of ``Exception``, so ``except Exception`` handlers
            keep working.
    """
    try:
        reader = PdfReader(io.BytesIO(file_content))
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
        return text.strip()
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Error extracting text from PDF: %s", e)
        raise RuntimeError(f"Failed to extract text from PDF: {e}") from e
|
139 |
|
140 |
def summarize_docx(file_content):
    """Extract text from DOCX.

    Args:
        file_content: Raw bytes of the DOCX file.

    Returns:
        The text of every non-blank paragraph joined with newlines, with
        leading and trailing whitespace stripped.

    Raises:
        RuntimeError: If the document cannot be opened or read. The
            underlying error is attached as ``__cause__``. ``RuntimeError``
            is a subclass of ``Exception``, so ``except Exception``
            handlers keep working.
    """
    try:
        doc = Document(io.BytesIO(file_content))
        # Skip paragraphs that contain only whitespace.
        text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())
        return text.strip()
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Error extracting text from DOCX: %s", e)
        raise RuntimeError(f"Failed to extract text from DOCX: {e}") from e
|
149 |
|
150 |
def summarize_pptx(file_content):
    """Extract text from PPTX.

    Args:
        file_content: Raw bytes of the PPTX file.

    Returns:
        The stripped text of every shape that exposes a non-blank ``text``
        attribute, in slide order, joined with newlines.

    Raises:
        RuntimeError: If the presentation cannot be opened or read. The
            underlying error is attached as ``__cause__``. ``RuntimeError``
            is a subclass of ``Exception``, so ``except Exception``
            handlers keep working.
    """
    try:
        ppt = Presentation(io.BytesIO(file_content))
        text = [
            shape.text.strip()
            for slide in ppt.slides
            for shape in slide.shapes
            # Not every shape carries text, so only read it when present.
            if hasattr(shape, "text") and shape.text.strip()
        ]
        return "\n".join(text).strip()
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Error extracting text from PPTX: %s", e)
        raise RuntimeError(f"Failed to extract text from PPTX: {e}") from e
|
163 |
|
164 |
def summarize_txt(file_content):
    """Extract text from TXT file.

    Args:
        file_content: Raw bytes of the text file.

    Returns:
        The decoded text with leading and trailing whitespace stripped.
        Decoding tries UTF-8 first (dropping a leading byte-order mark if
        present) and falls back to latin-1 when the bytes are not valid
        UTF-8.
    """
    try:
        # "utf-8-sig" decodes exactly like "utf-8" but also removes a leading
        # BOM, which would otherwise survive .strip() as "\ufeff".
        return file_content.decode("utf-8-sig").strip()
    except UnicodeDecodeError:
        # latin-1 maps every possible byte to a code point, so this fallback
        # cannot fail and needs no error handling of its own.
        return file_content.decode("latin-1").strip()
|
175 |
|
176 |
if __name__ == "__main__":
    # The server binds to all interfaces, so the Werkzeug interactive debugger
    # must not be on by default: it lets anyone who can reach the port execute
    # code. Debug mode is opt-in through the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true", "yes"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
|