Spaces:

mike23415
/

playwebit-t5-api

Sleeping

App Files Files Community

mike23415 committed about 1 month ago

Commit

b4aa0e4

verified ·

1 Parent(s): 24aa083

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -27

app.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import os
 import io
 import logging
 from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
 from PyPDF2 import PdfReader
 from docx import Document
 from pptx import Presentation
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -14,15 +16,18 @@ logger = logging.getLogger(__name__)
 # Initialize Flask app
 app = Flask(__name__)
-# Set Hugging Face cache directory
-os.environ["HF_HOME"] = "/app/hf_cache"
 # Load T5 model and tokenizer
 logger.info("Loading T5-Base model...")
 try:
-    tokenizer = T5Tokenizer.from_pretrained("t5-base")
-    model = T5ForConditionalGeneration.from_pretrained("t5-base")
     logger.info("T5-Base model loaded successfully.")
 except Exception as e:
     logger.error(f"Failed to load T5-Base: {str(e)}")
@@ -40,7 +45,8 @@ def summarize_text(text, max_length=150, min_length=30):
         if not text.strip():
             return "No text found in the document to summarize."
-        input_text = "summarize: " + text
         inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
         summary_ids = model.generate(
             inputs["input_ids"],
@@ -54,7 +60,7 @@ def summarize_text(text, max_length=150, min_length=30):
         return summary
     except Exception as e:
         logger.error(f"Error in T5 summarization: {str(e)}")
-        return "Error summarizing text."
 @app.route("/", methods=["GET"])
 def index():
@@ -65,23 +71,28 @@ def index():
 @app.route("/summarize", methods=["POST"])
 def summarize():
     logger.info("Summarize endpoint called.")
     # Check if a file is in the request
     if "file" not in request.files:
-        logger.error("No file uploaded.")
-        return jsonify({"error": "No file uploaded"}), 400
     file = request.files["file"]
     # Check if file is empty
     if file.filename == "":
-        logger.error("No file selected.")
         return jsonify({"error": "No selected file"}), 400
     # Check if file has an allowed extension
     if not allowed_file(file.filename):
         logger.error(f"Unsupported file format: {file.filename}")
-        return jsonify({"error": "Unsupported file format"}), 400
     # Process the file
     filename = secure_filename(file.filename)
@@ -102,10 +113,15 @@ def summarize():
             return jsonify({"error": "Unsupported file format"}), 400
         # Generate summary
         summary = summarize_text(text)
         logger.info(f"File {filename} summarized successfully.")
-        return jsonify({"filename": filename, "summary": summary})
     except Exception as e:
         logger.error(f"Error processing file {filename}: {str(e)}")
@@ -113,29 +129,49 @@ def summarize():
 def summarize_pdf(file_content):
     """Extract text from PDF."""
-    reader = PdfReader(io.BytesIO(file_content))
-    text = "\n".join([page.extract_text() or "" for page in reader.pages])
-    return text.strip()
 def summarize_docx(file_content):
     """Extract text from DOCX."""
-    doc = Document(io.BytesIO(file_content))
-    text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-    return text.strip()
 def summarize_pptx(file_content):
     """Extract text from PPTX."""
-    ppt = Presentation(io.BytesIO(file_content))
-    text = []
-    for slide in ppt.slides:
-        for shape in slide.shapes:
-            if hasattr(shape, "text") and shape.text.strip():
-                text.append(shape.text.strip())
-    return "\n".join(text).strip()
 def summarize_txt(file_content):
     """Extract text from TXT file."""
-    return file_content.decode("utf-8").strip()
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860, debug=True)

 import os
 import io
 import logging
+import tempfile
 from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
 from PyPDF2 import PdfReader
 from docx import Document
 from pptx import Presentation
 from transformers import T5Tokenizer, T5ForConditionalGeneration
+from flask_cors import CORS  # Import CORS for cross-origin requests
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 # Initialize Flask app
 app = Flask(__name__)
+CORS(app)  # Enable CORS for all routes
+# Set up a temporary directory for Hugging Face cache
+cache_dir = tempfile.mkdtemp()
+os.environ["HF_HOME"] = cache_dir
+os.environ["TRANSFORMERS_CACHE"] = cache_dir
 # Load T5 model and tokenizer
 logger.info("Loading T5-Base model...")
 try:
+    tokenizer = T5Tokenizer.from_pretrained("t5-base", cache_dir=cache_dir)
+    model = T5ForConditionalGeneration.from_pretrained("t5-base", cache_dir=cache_dir)
     logger.info("T5-Base model loaded successfully.")
 except Exception as e:
     logger.error(f"Failed to load T5-Base: {str(e)}")
         if not text.strip():
             return "No text found in the document to summarize."
+        # Limit text length to prevent tokenizer errors
+        input_text = "summarize: " + text[:10000]  # Limiting to 10K chars to be safe
         inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
         summary_ids = model.generate(
             inputs["input_ids"],
         return summary
     except Exception as e:
         logger.error(f"Error in T5 summarization: {str(e)}")
+        return f"Error summarizing text: {str(e)}"
 @app.route("/", methods=["GET"])
 def index():
 @app.route("/summarize", methods=["POST"])
 def summarize():
     logger.info("Summarize endpoint called.")
+    # Debug the incoming request
+    logger.info(f"Request headers: {request.headers}")
+    logger.info(f"Request files: {request.files}")
+    logger.info(f"Request form: {request.form}")
     # Check if a file is in the request
     if "file" not in request.files:
+        logger.error("No file found in request.files")
+        return jsonify({"error": "No file uploaded. Make sure to use 'file' as the form field name."}), 400
     file = request.files["file"]
     # Check if file is empty
     if file.filename == "":
+        logger.error("File has no filename")
         return jsonify({"error": "No selected file"}), 400
     # Check if file has an allowed extension
     if not allowed_file(file.filename):
         logger.error(f"Unsupported file format: {file.filename}")
+        return jsonify({"error": f"Unsupported file format. Allowed types are: {', '.join(ALLOWED_EXTENSIONS)}"}), 400
     # Process the file
     filename = secure_filename(file.filename)
             return jsonify({"error": "Unsupported file format"}), 400
         # Generate summary
+        logger.info(f"Generating summary for {filename} with text length {len(text)}")
         summary = summarize_text(text)
         logger.info(f"File {filename} summarized successfully.")
+        return jsonify({
+            "filename": filename,
+            "summary": summary,
+            "textLength": len(text)
+        })
     except Exception as e:
         logger.error(f"Error processing file {filename}: {str(e)}")
 def summarize_pdf(file_content):
     """Extract text from PDF."""
+    try:
+        reader = PdfReader(io.BytesIO(file_content))
+        text = "\n".join([page.extract_text() or "" for page in reader.pages])
+        return text.strip()
+    except Exception as e:
+        logger.error(f"Error extracting text from PDF: {str(e)}")
+        raise Exception(f"Failed to extract text from PDF: {str(e)}")
 def summarize_docx(file_content):
     """Extract text from DOCX."""
+    try:
+        doc = Document(io.BytesIO(file_content))
+        text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
+        return text.strip()
+    except Exception as e:
+        logger.error(f"Error extracting text from DOCX: {str(e)}")
+        raise Exception(f"Failed to extract text from DOCX: {str(e)}")
 def summarize_pptx(file_content):
     """Extract text from PPTX."""
+    try:
+        ppt = Presentation(io.BytesIO(file_content))
+        text = []
+        for slide in ppt.slides:
+            for shape in slide.shapes:
+                if hasattr(shape, "text") and shape.text.strip():
+                    text.append(shape.text.strip())
+        return "\n".join(text).strip()
+    except Exception as e:
+        logger.error(f"Error extracting text from PPTX: {str(e)}")
+        raise Exception(f"Failed to extract text from PPTX: {str(e)}")
 def summarize_txt(file_content):
     """Extract text from TXT file."""
+    try:
+        return file_content.decode("utf-8").strip()
+    except UnicodeDecodeError:
+        # Try different encodings if UTF-8 fails
+        try:
+            return file_content.decode("latin-1").strip()
+        except Exception as e:
+            logger.error(f"Error decoding text file: {str(e)}")
+            raise Exception(f"Failed to decode text file: {str(e)}")
 if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=7860, debug=True)