Spaces:

mike23415
/

playwebit-t5-api

Sleeping

App Files Files Community

mike23415 committed on Mar 31

Commit

798ae00

verified ·

1 Parent(s): f8e5cca

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -21

app.py CHANGED Viewed

@@ -14,13 +14,13 @@ logger = logging.getLogger(__name__)
 app = Flask(__name__)
-# Set Hugging Face cache directory (matches Dockerfile)
-os.environ["HF_HOME"] = "/app/hf_cache"
 # Load T5 model and tokenizer
 logger.info("Loading T5-Base model...")
 try:
-    tokenizer = T5Tokenizer.from_pretrained("t5-base")
     model = T5ForConditionalGeneration.from_pretrained("t5-base")
     logger.info("T5-Base model loaded successfully.")
 except Exception as e:
@@ -30,10 +30,14 @@ except Exception as e:
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
 def allowed_file(filename):
     return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
 def summarize_text(text, max_length=150, min_length=30):
     """Summarize text using T5-Base."""
     try:
         input_text = "summarize: " + text
         inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
@@ -49,29 +53,37 @@ def summarize_text(text, max_length=150, min_length=30):
         return summary
     except Exception as e:
         logger.error(f"Error in T5 summarization: {str(e)}")
-        raise
 @app.route("/", methods=["GET"])
 def index():
     logger.info("Root endpoint accessed.")
-    return "Document Summarizer API with T5-Base is running! Use /summarize endpoint for POST requests."
 @app.route("/summarize", methods=["POST"])
 def summarize():
     logger.info("Summarize endpoint called.")
     if "file" not in request.files:
         logger.error("No file uploaded.")
         return jsonify({"error": "No file uploaded"}), 400
     file = request.files["file"]
     if file.filename == "":
         logger.error("No file selected.")
         return jsonify({"error": "No selected file"}), 400
     if not allowed_file(file.filename):
         logger.error(f"Unsupported file format: {file.filename}")
         return jsonify({"error": "Unsupported file format"}), 400
     filename = secure_filename(file.filename)
     file_content = file.read()
     file_ext = filename.rsplit(".", 1)[1].lower()
     try:
         if file_ext == "pdf":
             text = summarize_pdf(file_content)
@@ -81,34 +93,56 @@ def summarize():
             text = summarize_pptx(file_content)
         elif file_ext == "txt":
             text = summarize_txt(file_content)
         summary = summarize_text(text)
-        logger.info(f"File {filename} summarized successfully with T5.")
         return jsonify({"filename": filename, "summary": summary})
     except Exception as e:
         logger.error(f"Error processing file {filename}: {str(e)}")
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500
 def summarize_pdf(file_content):
-    reader = PdfReader(io.BytesIO(file_content))
-    text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
-    return text
 def summarize_docx(file_content):
-    doc = Document(io.BytesIO(file_content))
-    text = "\n".join([para.text for para in doc.paragraphs])
-    return text
 def summarize_pptx(file_content):
-    ppt = Presentation(io.BytesIO(file_content))
-    text = []
-    for slide in ppt.slides:
-        for shape in slide.shapes:
-            if hasattr(shape, "text"):
-                text.append(shape.text)
-    return "\n".join(text)
 def summarize_txt(file_content):
-    return file_content.decode("utf-8")
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860, debug=True)

 app = Flask(__name__)
+# Set Hugging Face cache to a writable directory
+os.environ["HF_HOME"] = "/tmp/huggingface_cache"
 # Load T5 model and tokenizer
 logger.info("Loading T5-Base model...")
 try:
+    tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
     model = T5ForConditionalGeneration.from_pretrained("t5-base")
     logger.info("T5-Base model loaded successfully.")
 except Exception as e:
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
 def allowed_file(filename):
+    """Return True if filename has an extension listed in ALLOWED_EXTENSIONS.
+
+    The extension is the text after the last "." and is compared
+    case-insensitively; a name containing no "." is rejected.
+    """
     return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
 def summarize_text(text, max_length=150, min_length=30):
     """Summarize text using T5-Base."""
+    if not text.strip():
+        return "No meaningful text found in the document."
     try:
         input_text = "summarize: " + text
         inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
         return summary
     except Exception as e:
         logger.error(f"Error in T5 summarization: {str(e)}")
+        return "Error in summarization process."
 @app.route("/", methods=["GET"])
 def index():
+    """Handle GET /: log the access and return a short status string."""
     logger.info("Root endpoint accessed.")
+    return "Document Summarizer API with T5-Base is running! Use /summarize for POST requests."
 @app.route("/summarize", methods=["POST"])
 def summarize():
+    """Handle file uploads and summarize the content."""
     logger.info("Summarize endpoint called.")
     if "file" not in request.files:
         logger.error("No file uploaded.")
         return jsonify({"error": "No file uploaded"}), 400
     file = request.files["file"]
     if file.filename == "":
         logger.error("No file selected.")
         return jsonify({"error": "No selected file"}), 400
     if not allowed_file(file.filename):
         logger.error(f"Unsupported file format: {file.filename}")
         return jsonify({"error": "Unsupported file format"}), 400
     filename = secure_filename(file.filename)
     file_content = file.read()
     file_ext = filename.rsplit(".", 1)[1].lower()
     try:
         if file_ext == "pdf":
             text = summarize_pdf(file_content)
             text = summarize_pptx(file_content)
         elif file_ext == "txt":
             text = summarize_txt(file_content)
         summary = summarize_text(text)
+        logger.info(f"File {filename} summarized successfully.")
         return jsonify({"filename": filename, "summary": summary})
     except Exception as e:
         logger.error(f"Error processing file {filename}: {str(e)}")
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500
 def summarize_pdf(file_content):
+    """Extract the text of a PDF supplied as raw bytes.
+
+    Returns the text of all pages joined by newlines and stripped. The
+    result is an empty string when no page yields text, which lets
+    summarize_text report that no meaningful text was found instead of
+    summarizing a placeholder sentence.
+
+    Raises whatever the PDF parser raises (after logging it), so that the
+    /summarize handler answers with an error instead of summarizing an
+    error message as if it were the document.
+    """
+    try:
+        reader = PdfReader(io.BytesIO(file_content))
+        # `or ""` guards against a falsy extract_text() result for a page.
+        return "\n".join(page.extract_text() or "" for page in reader.pages).strip()
+    except Exception as e:
+        logger.error(f"Error reading PDF: {str(e)}")
+        raise
 def summarize_docx(file_content):
+    """Extract the paragraph text of a DOCX supplied as raw bytes.
+
+    Returns the text of doc.paragraphs joined by newlines and stripped. The
+    result is an empty string when the paragraphs hold no text, which lets
+    summarize_text report that no meaningful text was found instead of
+    summarizing a placeholder sentence.
+
+    Raises whatever the DOCX parser raises (after logging it), so that the
+    /summarize handler answers with an error instead of summarizing an
+    error message as if it were the document.
+    """
+    try:
+        doc = Document(io.BytesIO(file_content))
+        return "\n".join(para.text for para in doc.paragraphs).strip()
+    except Exception as e:
+        logger.error(f"Error reading DOCX: {str(e)}")
+        raise
 def summarize_pptx(file_content):
+    """Extract the shape text of a PPTX supplied as raw bytes.
+
+    Collects the `text` of every shape that has such an attribute, slide by
+    slide, and returns it joined by newlines and stripped. The result is an
+    empty string when no shape holds text, which lets summarize_text report
+    that no meaningful text was found instead of summarizing a placeholder
+    sentence.
+
+    Raises whatever the PPTX parser raises (after logging it), so that the
+    /summarize handler answers with an error instead of summarizing an
+    error message as if it were the document.
+    """
+    try:
+        ppt = Presentation(io.BytesIO(file_content))
+        parts = [
+            shape.text
+            for slide in ppt.slides
+            for shape in slide.shapes
+            if hasattr(shape, "text")  # not every shape exposes text
+        ]
+        return "\n".join(parts).strip()
+    except Exception as e:
+        logger.error(f"Error reading PPTX: {str(e)}")
+        raise
 def summarize_txt(file_content):
+    """Decode a TXT upload (raw bytes) into stripped text.
+
+    UTF-8 is tried first; bytes that are not valid UTF-8 are decoded as
+    latin-1 instead. Returns an empty string for empty or whitespace-only
+    content, which lets summarize_text report that no meaningful text was
+    found instead of summarizing a placeholder sentence.
+    """
+    try:
+        text = file_content.decode("utf-8")
+    except UnicodeDecodeError:
+        # latin-1 maps every byte to a character, so this fallback cannot
+        # raise; log it because the decoded text may not be what was meant.
+        logger.warning("TXT upload is not valid UTF-8; decoding as latin-1.")
+        text = file_content.decode("latin-1")
+    return text.strip()
 if __name__ == "__main__":
+    # The server listens on all interfaces, so debug mode must stay off:
+    # Flask's debug mode enables the Werkzeug interactive debugger, which
+    # allows arbitrary code execution by anyone who can reach the port.
+    app.run(host="0.0.0.0", port=7860, debug=False)