Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,15 +12,16 @@ from transformers import T5Tokenizer, T5ForConditionalGeneration
|
|
12 |
logging.basicConfig(level=logging.INFO)
|
13 |
logger = logging.getLogger(__name__)
|
14 |
|
|
|
15 |
app = Flask(__name__)
|
16 |
|
17 |
-
# Set Hugging Face cache
|
18 |
-
os.environ["HF_HOME"] = "/
|
19 |
|
20 |
# Load T5 model and tokenizer
|
21 |
logger.info("Loading T5-Base model...")
|
22 |
try:
|
23 |
-
tokenizer = T5Tokenizer.from_pretrained("t5-base"
|
24 |
model = T5ForConditionalGeneration.from_pretrained("t5-base")
|
25 |
logger.info("T5-Base model loaded successfully.")
|
26 |
except Exception as e:
|
@@ -35,10 +36,10 @@ def allowed_file(filename):
|
|
35 |
|
36 |
def summarize_text(text, max_length=150, min_length=30):
|
37 |
"""Summarize text using T5-Base."""
|
38 |
-
if not text.strip():
|
39 |
-
return "No meaningful text found in the document."
|
40 |
-
|
41 |
try:
|
|
|
|
|
|
|
42 |
input_text = "summarize: " + text
|
43 |
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
|
44 |
summary_ids = model.generate(
|
@@ -53,29 +54,28 @@ def summarize_text(text, max_length=150, min_length=30):
|
|
53 |
return summary
|
54 |
except Exception as e:
|
55 |
logger.error(f"Error in T5 summarization: {str(e)}")
|
56 |
-
return "Error
|
57 |
|
58 |
@app.route("/", methods=["GET"])
|
59 |
def index():
|
60 |
-
"""Root endpoint
|
61 |
logger.info("Root endpoint accessed.")
|
62 |
-
return "Document Summarizer API with T5-Base is running! Use /summarize for POST requests."
|
63 |
|
64 |
@app.route("/summarize", methods=["POST"])
|
65 |
def summarize():
|
66 |
-
"""Handle file uploads and
|
67 |
logger.info("Summarize endpoint called.")
|
68 |
-
|
69 |
if "file" not in request.files:
|
70 |
logger.error("No file uploaded.")
|
71 |
return jsonify({"error": "No file uploaded"}), 400
|
72 |
-
|
73 |
-
file = request.files["file"]
|
74 |
|
|
|
75 |
if file.filename == "":
|
76 |
logger.error("No file selected.")
|
77 |
return jsonify({"error": "No selected file"}), 400
|
78 |
-
|
79 |
if not allowed_file(file.filename):
|
80 |
logger.error(f"Unsupported file format: {file.filename}")
|
81 |
return jsonify({"error": "Unsupported file format"}), 400
|
@@ -83,7 +83,7 @@ def summarize():
|
|
83 |
filename = secure_filename(file.filename)
|
84 |
file_content = file.read()
|
85 |
file_ext = filename.rsplit(".", 1)[1].lower()
|
86 |
-
|
87 |
try:
|
88 |
if file_ext == "pdf":
|
89 |
text = summarize_pdf(file_content)
|
@@ -93,56 +93,45 @@ def summarize():
|
|
93 |
text = summarize_pptx(file_content)
|
94 |
elif file_ext == "txt":
|
95 |
text = summarize_txt(file_content)
|
|
|
|
|
96 |
|
97 |
-
|
|
|
98 |
|
|
|
99 |
logger.info(f"File {filename} summarized successfully.")
|
100 |
return jsonify({"filename": filename, "summary": summary})
|
101 |
-
|
102 |
except Exception as e:
|
103 |
logger.error(f"Error processing file {filename}: {str(e)}")
|
104 |
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
|
105 |
|
106 |
def summarize_pdf(file_content):
|
107 |
-
"""Extract text from
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
return text.strip() or "No extractable text found in PDF."
|
112 |
-
except Exception as e:
|
113 |
-
logger.error(f"Error reading PDF: {str(e)}")
|
114 |
-
return "Error extracting text from PDF."
|
115 |
|
116 |
def summarize_docx(file_content):
|
117 |
-
"""Extract text from
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
return text.strip() or "No extractable text found in DOCX."
|
122 |
-
except Exception as e:
|
123 |
-
logger.error(f"Error reading DOCX: {str(e)}")
|
124 |
-
return "Error extracting text from DOCX."
|
125 |
|
126 |
def summarize_pptx(file_content):
|
127 |
-
"""Extract text from
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
for
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
return "\n".join(text).strip() or "No extractable text found in PPTX."
|
136 |
-
except Exception as e:
|
137 |
-
logger.error(f"Error reading PPTX: {str(e)}")
|
138 |
-
return "Error extracting text from PPTX."
|
139 |
|
140 |
def summarize_txt(file_content):
|
141 |
-
"""Extract text from
|
142 |
-
|
143 |
-
return file_content.decode("utf-8").strip() or "No extractable text found in TXT."
|
144 |
-
except UnicodeDecodeError:
|
145 |
-
return file_content.decode("latin-1").strip() or "No extractable text found in TXT."
|
146 |
|
147 |
if __name__ == "__main__":
|
148 |
app.run(host="0.0.0.0", port=7860, debug=True)
|
|
|
12 |
logging.basicConfig(level=logging.INFO)
|
13 |
logger = logging.getLogger(__name__)
|
14 |
|
15 |
+
# Initialize Flask app
|
16 |
app = Flask(__name__)
|
17 |
|
18 |
+
# Set Hugging Face cache directory
|
19 |
+
os.environ["HF_HOME"] = "/app/hf_cache"
|
20 |
|
21 |
# Load T5 model and tokenizer
|
22 |
logger.info("Loading T5-Base model...")
|
23 |
try:
|
24 |
+
tokenizer = T5Tokenizer.from_pretrained("t5-base")
|
25 |
model = T5ForConditionalGeneration.from_pretrained("t5-base")
|
26 |
logger.info("T5-Base model loaded successfully.")
|
27 |
except Exception as e:
|
|
|
36 |
|
37 |
def summarize_text(text, max_length=150, min_length=30):
|
38 |
"""Summarize text using T5-Base."""
|
|
|
|
|
|
|
39 |
try:
|
40 |
+
if not text.strip():
|
41 |
+
return "No text found in the document to summarize."
|
42 |
+
|
43 |
input_text = "summarize: " + text
|
44 |
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
|
45 |
summary_ids = model.generate(
|
|
|
54 |
return summary
|
55 |
except Exception as e:
|
56 |
logger.error(f"Error in T5 summarization: {str(e)}")
|
57 |
+
return "Error summarizing text."
|
58 |
|
59 |
@app.route("/", methods=["GET"])
def index():
    """Confirm the service is up and point callers at the /summarize route."""
    logger.info("Root endpoint accessed.")
    status_message = (
        "Document Summarizer API with T5-Base is running! "
        "Use /summarize endpoint for POST requests."
    )
    return status_message
|
64 |
|
65 |
@app.route("/summarize", methods=["POST"])
|
66 |
def summarize():
|
67 |
+
"""Handle file uploads and summarization."""
|
68 |
logger.info("Summarize endpoint called.")
|
69 |
+
|
70 |
if "file" not in request.files:
|
71 |
logger.error("No file uploaded.")
|
72 |
return jsonify({"error": "No file uploaded"}), 400
|
|
|
|
|
73 |
|
74 |
+
file = request.files["file"]
|
75 |
if file.filename == "":
|
76 |
logger.error("No file selected.")
|
77 |
return jsonify({"error": "No selected file"}), 400
|
78 |
+
|
79 |
if not allowed_file(file.filename):
|
80 |
logger.error(f"Unsupported file format: {file.filename}")
|
81 |
return jsonify({"error": "Unsupported file format"}), 400
|
|
|
83 |
filename = secure_filename(file.filename)
|
84 |
file_content = file.read()
|
85 |
file_ext = filename.rsplit(".", 1)[1].lower()
|
86 |
+
|
87 |
try:
|
88 |
if file_ext == "pdf":
|
89 |
text = summarize_pdf(file_content)
|
|
|
93 |
text = summarize_pptx(file_content)
|
94 |
elif file_ext == "txt":
|
95 |
text = summarize_txt(file_content)
|
96 |
+
else:
|
97 |
+
return jsonify({"error": "Unsupported file format"}), 400
|
98 |
|
99 |
+
if not text.strip():
|
100 |
+
return jsonify({"error": "No extractable text found in the document"}), 400
|
101 |
|
102 |
+
summary = summarize_text(text)
|
103 |
logger.info(f"File {filename} summarized successfully.")
|
104 |
return jsonify({"filename": filename, "summary": summary})
|
105 |
+
|
106 |
except Exception as e:
|
107 |
logger.error(f"Error processing file {filename}: {str(e)}")
|
108 |
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
|
109 |
|
110 |
def summarize_pdf(file_content):
    """Return the text of all pages of a PDF, joined by newlines.

    ``file_content`` is the raw bytes of the uploaded file. A page whose
    ``extract_text()`` result is falsy contributes an empty string, and the
    joined result is stripped of leading and trailing whitespace.
    """
    pdf = PdfReader(io.BytesIO(file_content))
    page_texts = []
    for page in pdf.pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts).strip()
|
|
|
|
|
|
|
|
|
115 |
|
116 |
def summarize_docx(file_content):
    """Return the non-blank paragraphs of a DOCX file, joined by newlines.

    ``file_content`` is the raw bytes of the uploaded file. Paragraphs whose
    text is empty or whitespace-only are skipped; the joined result is
    stripped of leading and trailing whitespace.
    """
    document = Document(io.BytesIO(file_content))
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept).strip()
|
|
|
|
|
|
|
|
|
121 |
|
122 |
def summarize_pptx(file_content):
    """Return the text of every text-bearing shape in a PPTX, one per line.

    ``file_content`` is the raw bytes of the uploaded file. Shapes without a
    ``text`` attribute, or whose text is blank, are skipped; each kept
    fragment is stripped before being joined.
    """
    deck = Presentation(io.BytesIO(file_content))
    fragments = []
    for slide in deck.slides:
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            stripped = shape.text.strip()
            if stripped:
                fragments.append(stripped)
    return "\n".join(fragments).strip()
|
|
|
|
|
|
|
|
|
131 |
|
132 |
def summarize_txt(file_content):
    """Decode the bytes of an uploaded TXT file into stripped text.

    Args:
        file_content: Raw bytes read from the uploaded file.

    Returns:
        The decoded text with leading and trailing whitespace removed.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
        # which str.strip() would otherwise leave in the text.
        decoded = file_content.decode("utf-8-sig")
    except UnicodeDecodeError:
        # latin-1 maps every byte to a code point, so this fallback cannot
        # fail; without it a non-UTF-8 upload raises out of this function.
        decoded = file_content.decode("latin-1")
    return decoded.strip()
|
|
|
|
|
|
|
135 |
|
136 |
if __name__ == "__main__":
    # Debug mode turns on the Werkzeug interactive debugger, which permits
    # arbitrary code execution. Because the server binds to 0.0.0.0, keep
    # debug off unless it is explicitly requested through FLASK_DEBUG.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
|