Spaces:

mike23415
/

playwebit-t5-api

Sleeping

App Files Files Community

mike23415 committed about 1 month ago

Commit

2a3fae3

verified ·

1 Parent(s): 764d4f7

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -64

app.py CHANGED Viewed

@@ -1,89 +1,69 @@
 import os
-import fitz  # PyMuPDF for PDF
-import pytesseract
-from PIL import Image
 from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
-from transformers import T5Tokenizer, T5ForConditionalGeneration
-from pptx import Presentation
 from docx import Document
 app = Flask(__name__)
-app.config["UPLOAD_FOLDER"] = "uploads"
-os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
-# Load T5 model
-model_name = "t5-base"
-tokenizer = T5Tokenizer.from_pretrained(model_name)
-model = T5ForConditionalGeneration.from_pretrained(model_name)
-# Function to extract text from PDFs
-def extract_text_from_pdf(pdf_path):
-    doc = fitz.open(pdf_path)
-    text = "\n".join([page.get_text("text") for page in doc])
-    return text.strip()
-# Function to extract text from PowerPoint files
-def extract_text_from_pptx(pptx_path):
-    presentation = Presentation(pptx_path)
-    text = "\n".join([shape.text for slide in presentation.slides for shape in slide.shapes if hasattr(shape, "text")])
-    return text.strip()
-# Function to extract text from Word documents
-def extract_text_from_docx(docx_path):
-    document = Document(docx_path)
-    text = "\n".join([paragraph.text for paragraph in document.paragraphs])
-    return text.strip()
-# Function to extract text from images using OCR
-def extract_text_from_image(image_path):
-    img = Image.open(image_path)
-    text = pytesseract.image_to_string(img)
-    return text.strip()
-# Summarization function
-def summarize_text(input_text):
-    input_ids = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)
-    output_ids = model.generate(input_ids, max_length=100, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
-    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
-# API for file upload and summarization
 @app.route("/summarize", methods=["POST"])
-def summarize_file():
     if "file" not in request.files:
         return jsonify({"error": "No file uploaded"}), 400
     file = request.files["file"]
     if file.filename == "":
         return jsonify({"error": "No selected file"}), 400
     filename = secure_filename(file.filename)
-    file_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
-    file.save(file_path)
-    try:
-        # Determine file type and extract text
-        if filename.endswith(".pdf"):
-            text = extract_text_from_pdf(file_path)
-        elif filename.endswith(".pptx"):
-            text = extract_text_from_pptx(file_path)
-        elif filename.endswith(".docx"):
-            text = extract_text_from_docx(file_path)
-        elif filename.lower().endswith((".png", ".jpg", ".jpeg")):
-            text = extract_text_from_image(file_path)
-        else:
-            return jsonify({"error": "Unsupported file type"}), 400
-        if not text:
-            return jsonify({"error": "No text found in the file"}), 400
-        summary = summarize_text(text)
-        return jsonify({"summary": summary})
-    except Exception as e:
-        return jsonify({"error": str(e)}), 500
-    finally:
-        os.remove(file_path)  # Clean up
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)

 import os
+import io
 from flask import Flask, request, jsonify
 from werkzeug.utils import secure_filename
+from PyPDF2 import PdfReader
 from docx import Document
+from pptx import Presentation
 app = Flask(__name__)
# Allowed file extensions (lower-case, without the leading dot)
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}


def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    The extension is whatever follows the last "." in the name and is
    compared case-insensitively. Names without a ".", empty names and
    ``None`` are rejected instead of raising.
    """
    if not filename:
        return False
    # rpartition yields an empty separator when the name contains no ".".
    _stem, dot, extension = filename.rpartition(".")
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
 @app.route("/summarize", methods=["POST"])
 def summarize():
     """Summarize an uploaded document.

     Expects a multipart POST carrying the document in the ``file`` field.
     Responds with JSON ``{"filename": ..., "summary": ...}`` on success, or
     ``{"error": ...}`` with HTTP 400 when the upload is missing, unnamed,
     of an unsupported type, or cannot be parsed.
     """
     if "file" not in request.files:
         return jsonify({"error": "No file uploaded"}), 400
     upload = request.files["file"]
     if not upload.filename:
         return jsonify({"error": "No selected file"}), 400
     if not allowed_file(upload.filename):
         return jsonify({"error": "Unsupported file format"}), 400

     # Read the extension from the name allowed_file() just accepted, not from
     # the sanitized one: secure_filename() strips leading dots and non-ASCII
     # characters, so ".pdf" becomes "pdf" and splitting that would raise
     # IndexError.
     file_ext = upload.filename.rsplit(".", 1)[1].lower()
     filename = secure_filename(upload.filename)
     file_content = upload.read()

     # Built per call: the summarize_* helpers are defined later in the module,
     # so their names are only resolvable once a request is being served.
     handlers = {
         "pdf": summarize_pdf,
         "docx": summarize_docx,
         "pptx": summarize_pptx,
         "txt": summarize_txt,
     }
     handler = handlers.get(file_ext)
     summary = None
     if handler is not None:
         try:
             summary = handler(file_content)
         except Exception:
             # Request boundary: a corrupt or mislabelled upload should not
             # surface as an unhandled 500. Log the traceback, answer in JSON.
             app.logger.exception("Could not summarize upload %r", filename)
             return jsonify({"error": "Could not read the uploaded file"}), 400
     return jsonify({"filename": filename, "summary": summary})
+# Summarization functions
def summarize_pdf(file_content):
    """Return the first 500 characters of the text extracted from a PDF.

    file_content holds the raw bytes of the PDF. Pages that yield no text
    are skipped; the remaining page texts are joined with newlines before
    truncation.
    """
    reader = PdfReader(io.BytesIO(file_content))
    # Extract each page a single time, then drop the pages that came back empty.
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(chunk for chunk in page_texts if chunk)
    return text[:500]  # Returning a short summary (first 500 chars)
def summarize_docx(file_content):
    """Return the first 500 characters of a .docx file's paragraph text.

    file_content holds the raw bytes of the document. The text of every
    paragraph is joined with newlines before truncation.
    """
    stream = io.BytesIO(file_content)
    paragraphs = Document(stream).paragraphs
    joined = "\n".join(paragraph.text for paragraph in paragraphs)
    return joined[:500]
def summarize_pptx(file_content):
    """Return the first 500 characters of the slide titles in a .pptx file.

    file_content holds the raw bytes of the presentation. Slides whose
    title shape is missing are skipped; the remaining titles are joined
    with newlines before truncation.
    """
    deck = Presentation(io.BytesIO(file_content))
    titles = []
    for slide in deck.slides:
        title_shape = slide.shapes.title
        if title_shape:
            titles.append(title_shape.text)
    return "\n".join(titles)[:500]
def summarize_txt(file_content):
    """Return the first 500 characters of a plain-text upload.

    file_content holds the raw bytes of the file. The bytes are decoded as
    UTF-8; a leading byte-order mark is dropped and undecodable bytes become
    U+FFFD, so a file saved in another encoding still produces a summary
    instead of raising UnicodeDecodeError.
    """
    text = file_content.decode("utf-8-sig", errors="replace")
    return text[:500]
 # Script entry point: start Flask's built-in server when run directly.
 if __name__ == "__main__":
     # Listen on every interface, on port 7860.
     bind_host, bind_port = "0.0.0.0", 7860
     app.run(host=bind_host, port=bind_port)