Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -14,13 +14,13 @@ logger = logging.getLogger(__name__)
|
|
14 |
|
15 |
app = Flask(__name__)
|
16 |
|
17 |
-
# Set Hugging Face cache
|
18 |
-
os.environ["HF_HOME"] = "/
|
19 |
|
20 |
# Load T5 model and tokenizer
|
21 |
logger.info("Loading T5-Base model...")
|
22 |
try:
|
23 |
-
tokenizer = T5Tokenizer.from_pretrained("t5-base")
|
24 |
model = T5ForConditionalGeneration.from_pretrained("t5-base")
|
25 |
logger.info("T5-Base model loaded successfully.")
|
26 |
except Exception as e:
|
@@ -30,10 +30,14 @@ except Exception as e:
|
|
30 |
# Upload extensions accepted by allowed_file(); entries are lower-case
# because allowed_file() lower-cases the extension before the lookup.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
|
31 |
|
32 |
def allowed_file(filename):
    """Check if the uploaded file has an allowed extension.

    Args:
        filename: Name of the uploaded file as supplied by the client;
            may be empty or None.

    Returns:
        True if the text after the last "." (compared case-insensitively)
        is in ALLOWED_EXTENSIONS, otherwise False.
    """
    # A missing name must be rejected, not crash: `"." in None` raises
    # TypeError.
    if not filename or "." not in filename:
        return False
    extension = filename.rsplit(".", 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
|
34 |
|
35 |
def summarize_text(text, max_length=150, min_length=30):
|
36 |
"""Summarize text using T5-Base."""
|
|
|
|
|
|
|
37 |
try:
|
38 |
input_text = "summarize: " + text
|
39 |
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
|
@@ -49,29 +53,37 @@ def summarize_text(text, max_length=150, min_length=30):
|
|
49 |
return summary
|
50 |
except Exception as e:
|
51 |
logger.error(f"Error in T5 summarization: {str(e)}")
|
52 |
-
|
53 |
|
54 |
@app.route("/", methods=["GET"])
|
55 |
def index():
|
|
|
56 |
logger.info("Root endpoint accessed.")
|
57 |
-
return "Document Summarizer API with T5-Base is running! Use /summarize
|
58 |
|
59 |
@app.route("/summarize", methods=["POST"])
|
60 |
def summarize():
|
|
|
61 |
logger.info("Summarize endpoint called.")
|
|
|
62 |
if "file" not in request.files:
|
63 |
logger.error("No file uploaded.")
|
64 |
return jsonify({"error": "No file uploaded"}), 400
|
|
|
65 |
file = request.files["file"]
|
|
|
66 |
if file.filename == "":
|
67 |
logger.error("No file selected.")
|
68 |
return jsonify({"error": "No selected file"}), 400
|
|
|
69 |
if not allowed_file(file.filename):
|
70 |
logger.error(f"Unsupported file format: {file.filename}")
|
71 |
return jsonify({"error": "Unsupported file format"}), 400
|
|
|
72 |
filename = secure_filename(file.filename)
|
73 |
file_content = file.read()
|
74 |
file_ext = filename.rsplit(".", 1)[1].lower()
|
|
|
75 |
try:
|
76 |
if file_ext == "pdf":
|
77 |
text = summarize_pdf(file_content)
|
@@ -81,34 +93,56 @@ def summarize():
|
|
81 |
text = summarize_pptx(file_content)
|
82 |
elif file_ext == "txt":
|
83 |
text = summarize_txt(file_content)
|
|
|
84 |
summary = summarize_text(text)
|
85 |
-
|
|
|
86 |
return jsonify({"filename": filename, "summary": summary})
|
|
|
87 |
except Exception as e:
|
88 |
logger.error(f"Error processing file {filename}: {str(e)}")
|
89 |
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
|
90 |
|
91 |
def summarize_pdf(file_content):
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
def summarize_docx(file_content):
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
def summarize_pptx(file_content):
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
def summarize_txt(file_content):
|
111 |
-
|
|
|
|
|
|
|
|
|
112 |
|
113 |
if __name__ == "__main__":
    # The Werkzeug interactive debugger lets anyone who can reach the server
    # execute arbitrary Python, so it must not be on by default while the app
    # listens on all interfaces. Opt in with FLASK_DEBUG=1 for local work.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
|
|
|
14 |
|
15 |
app = Flask(__name__)
|
16 |
|
17 |
+
# Set Hugging Face cache to a writable directory
|
18 |
+
os.environ["HF_HOME"] = "/tmp/huggingface_cache"
|
19 |
|
20 |
# Load T5 model and tokenizer
|
21 |
logger.info("Loading T5-Base model...")
|
22 |
try:
|
23 |
+
tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
|
24 |
model = T5ForConditionalGeneration.from_pretrained("t5-base")
|
25 |
logger.info("T5-Base model loaded successfully.")
|
26 |
except Exception as e:
|
|
|
30 |
# Upload extensions accepted by allowed_file(); entries are lower-case
# because allowed_file() lower-cases the extension before the lookup.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
|
31 |
|
32 |
def allowed_file(filename):
    """Check if the uploaded file has an allowed extension.

    Args:
        filename: Name of the uploaded file as supplied by the client;
            may be empty or None.

    Returns:
        True if the text after the last "." (compared case-insensitively)
        is in ALLOWED_EXTENSIONS, otherwise False.
    """
    # A missing name must be rejected, not crash: `"." in None` raises
    # TypeError.
    if not filename or "." not in filename:
        return False
    extension = filename.rsplit(".", 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
|
35 |
|
36 |
def summarize_text(text, max_length=150, min_length=30):
|
37 |
"""Summarize text using T5-Base."""
|
38 |
+
if not text.strip():
|
39 |
+
return "No meaningful text found in the document."
|
40 |
+
|
41 |
try:
|
42 |
input_text = "summarize: " + text
|
43 |
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
|
|
|
53 |
return summary
|
54 |
except Exception as e:
|
55 |
logger.error(f"Error in T5 summarization: {str(e)}")
|
56 |
+
return "Error in summarization process."
|
57 |
|
58 |
@app.route("/", methods=["GET"])
def index():
    """Report that the API is up and point callers at /summarize."""
    status_message = (
        "Document Summarizer API with T5-Base is running! "
        "Use /summarize for POST requests."
    )
    logger.info("Root endpoint accessed.")
    return status_message
|
63 |
|
64 |
@app.route("/summarize", methods=["POST"])
|
65 |
def summarize():
|
66 |
+
"""Handle file uploads and summarize the content."""
|
67 |
logger.info("Summarize endpoint called.")
|
68 |
+
|
69 |
if "file" not in request.files:
|
70 |
logger.error("No file uploaded.")
|
71 |
return jsonify({"error": "No file uploaded"}), 400
|
72 |
+
|
73 |
file = request.files["file"]
|
74 |
+
|
75 |
if file.filename == "":
|
76 |
logger.error("No file selected.")
|
77 |
return jsonify({"error": "No selected file"}), 400
|
78 |
+
|
79 |
if not allowed_file(file.filename):
|
80 |
logger.error(f"Unsupported file format: {file.filename}")
|
81 |
return jsonify({"error": "Unsupported file format"}), 400
|
82 |
+
|
83 |
filename = secure_filename(file.filename)
|
84 |
file_content = file.read()
|
85 |
file_ext = filename.rsplit(".", 1)[1].lower()
|
86 |
+
|
87 |
try:
|
88 |
if file_ext == "pdf":
|
89 |
text = summarize_pdf(file_content)
|
|
|
93 |
text = summarize_pptx(file_content)
|
94 |
elif file_ext == "txt":
|
95 |
text = summarize_txt(file_content)
|
96 |
+
|
97 |
summary = summarize_text(text)
|
98 |
+
|
99 |
+
logger.info(f"File {filename} summarized successfully.")
|
100 |
return jsonify({"filename": filename, "summary": summary})
|
101 |
+
|
102 |
except Exception as e:
|
103 |
logger.error(f"Error processing file {filename}: {str(e)}")
|
104 |
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
|
105 |
|
106 |
def summarize_pdf(file_content):
    """Pull the text out of a PDF supplied as raw bytes.

    Args:
        file_content: The PDF file's bytes.

    Returns:
        The page texts joined by newlines and stripped; a fixed placeholder
        sentence when nothing is extracted; or a fixed error sentence when
        reading fails (the failure is logged, not raised).
    """
    try:
        pdf = PdfReader(io.BytesIO(file_content))
        page_texts = []
        for page in pdf.pages:
            # A falsy extract_text() result counts as an empty page.
            page_texts.append(page.extract_text() or "")
        extracted = "\n".join(page_texts).strip()
        if extracted:
            return extracted
        return "No extractable text found in PDF."
    except Exception as exc:
        logger.error(f"Error reading PDF: {str(exc)}")
        return "Error extracting text from PDF."
|
115 |
|
116 |
def summarize_docx(file_content):
    """Pull the paragraph text out of a DOCX supplied as raw bytes.

    Args:
        file_content: The DOCX file's bytes.

    Returns:
        The paragraph texts joined by newlines and stripped; a fixed
        placeholder sentence when nothing is extracted; or a fixed error
        sentence when reading fails (the failure is logged, not raised).
    """
    try:
        document = Document(io.BytesIO(file_content))
        paragraph_texts = []
        for paragraph in document.paragraphs:
            paragraph_texts.append(paragraph.text)
        extracted = "\n".join(paragraph_texts).strip()
        if extracted:
            return extracted
        return "No extractable text found in DOCX."
    except Exception as exc:
        logger.error(f"Error reading DOCX: {str(exc)}")
        return "Error extracting text from DOCX."
|
125 |
|
126 |
def summarize_pptx(file_content):
    """Pull the shape text out of a PPTX supplied as raw bytes.

    Args:
        file_content: The PPTX file's bytes.

    Returns:
        The text of every shape exposing a ``text`` attribute, in slide
        order, joined by newlines and stripped; a fixed placeholder sentence
        when nothing is extracted; or a fixed error sentence when reading
        fails (the failure is logged, not raised).
    """
    try:
        presentation = Presentation(io.BytesIO(file_content))
        # Shapes without a ``text`` attribute are skipped.
        shape_texts = [
            shape.text
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        extracted = "\n".join(shape_texts).strip()
        if extracted:
            return extracted
        return "No extractable text found in PPTX."
    except Exception as exc:
        logger.error(f"Error reading PPTX: {str(exc)}")
        return "Error extracting text from PPTX."
|
139 |
|
140 |
def summarize_txt(file_content):
    """Extract text from a TXT file with safe decoding.

    Args:
        file_content: The text file's raw bytes.

    Returns:
        The decoded text with surrounding whitespace stripped, or a fixed
        placeholder sentence when the decoded text is empty.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
        # mark; with plain "utf-8" the BOM survives as "\ufeff", which
        # str.strip() does not remove, so a BOM-only file looked non-empty.
        text = file_content.decode("utf-8-sig")
    except UnicodeDecodeError:
        # latin-1 maps every byte to a code point, so this cannot fail.
        text = file_content.decode("latin-1")
    return text.strip() or "No extractable text found in TXT."
|
146 |
|
147 |
if __name__ == "__main__":
    # The Werkzeug interactive debugger lets anyone who can reach the server
    # execute arbitrary Python, so it must not be on by default while the app
    # listens on all interfaces. Opt in with FLASK_DEBUG=1 for local work.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
|