Spaces:

mike23415
/

playwebit-t5-api

Sleeping

App Files Files Community

mike23415 committed on Mar 31

Commit

764d4f7

verified ·

1 Parent(s): 8020602

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -49

app.py CHANGED Viewed

@@ -1,71 +1,89 @@
-import torch
-import pdfplumber
 import pytesseract
 from PIL import Image
-from docx import Document
-from pptx import Presentation
-from transformers import T5Tokenizer, T5ForConditionalGeneration
 from flask import Flask, request, jsonify
-# CPU-only inference: limit the threads torch may use and keep all tensors on the CPU device
-torch.set_num_threads(4)  # Hard-coded thread count; adjust based on the CPU cores available
-device = torch.device("cpu")  # Reused below for the model and for each tokenized input
-# Load the pretrained T5-Base tokenizer and model once at import time so every request reuses them
 model_name = "t5-base"
 tokenizer = T5Tokenizer.from_pretrained(model_name)
-model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
-# Flask application object; the /summarize route below is registered on it
-app = Flask(__name__)
-# Function to extract text from files
-def extract_text(file):
-    """Return the text found in an uploaded file, chosen by its extension.
-
-    Args:
-        file: Uploaded file object exposing ``filename``; it is handed
-            directly to pdfplumber, python-docx, python-pptx or PIL.
-
-    Returns:
-        The extracted text (PDF/DOCX/PPTX parts joined with single spaces,
-        images passed through Tesseract OCR), or None when the extension is
-        not one of .pdf, .docx, .pptx, .png, .jpg, .jpeg.
-    """
-    # A missing filename is treated as "no extension" instead of crashing on None.lower()
-    filename = (file.filename or "").lower()
-    if filename.endswith(".pdf"):
-        with pdfplumber.open(file) as pdf:
-            # Extract each page exactly once; pages that yield no text are skipped
-            page_texts = (page.extract_text() for page in pdf.pages)
-            return " ".join(page_text for page_text in page_texts if page_text)
-    elif filename.endswith(".docx"):
-        doc = Document(file)
-        return " ".join(para.text for para in doc.paragraphs)
-    elif filename.endswith(".pptx"):
-        prs = Presentation(file)
-        # Only shapes that expose a ``text`` attribute contribute
-        return " ".join(shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text"))
-    elif filename.endswith((".png", ".jpg", ".jpeg")):
-        image = Image.open(file)
-        return pytesseract.image_to_string(image)
-    return None
 @app.route("/summarize", methods=["POST"])
-def summarize():
-    """Handle POST /summarize: extract text from the uploaded "file" and return a T5 summary as JSON.
-
-    Responds with {"error": ...} and status 400 when no file was uploaded
-    or no text could be extracted from it.
-    """
-    uploaded = request.files.get("file")
-    if not uploaded:
         return jsonify({"error": "No file uploaded"}), 400
-    extracted = extract_text(uploaded)
-    if not extracted:
-        return jsonify({"error": "No text found in file"}), 400
-    # Format text for T5: prepend the summarization prefix to the trimmed text
-    prompt = "summarize: " + extracted.strip()
-    # Tokenize (truncated to 512 tokens) and move the ids to the inference device
-    encoded = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)
-    encoded = encoded.to(device)
-    generation_options = {"max_length": 150, "min_length": 50, "length_penalty": 2.0, "num_beams": 4}
-    # Generate the summary without tracking gradients
-    with torch.no_grad():
-        generated = model.generate(encoded, **generation_options)
-    return jsonify({"summary": tokenizer.decode(generated[0], skip_special_tokens=True)})
 if __name__ == "__main__":
-    print("🚀 API is running on port 7860")  # Startup banner, printed before the server is started
     app.run(host="0.0.0.0", port=7860)

+import os
+import fitz  # PyMuPDF for PDF
 import pytesseract
 from PIL import Image
 from flask import Flask, request, jsonify
+from werkzeug.utils import secure_filename
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+from pptx import Presentation
+from docx import Document
+app = Flask(__name__)
+app.config["UPLOAD_FOLDER"] = "uploads"  # Relative directory where uploads are written before text extraction
+os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)  # exist_ok=True: no error if the directory already exists
+# Load the pretrained T5 tokenizer and model once at import time; summarize_text() reuses them
 model_name = "t5-base"
 tokenizer = T5Tokenizer.from_pretrained(model_name)
+model = T5ForConditionalGeneration.from_pretrained(model_name)
+# Function to extract text from PDFs
+def extract_text_from_pdf(pdf_path):
+    """Return the plain text of every page of a PDF, joined by newlines.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        The concatenated page text with leading/trailing whitespace removed
+        (an empty string when no page contains text).
+    """
+    # Close the document as soon as the text is read so its file handle is
+    # released; the /summarize view deletes the file right afterwards.
+    with fitz.open(pdf_path) as doc:
+        text = "\n".join(page.get_text("text") for page in doc)
+    return text.strip()
+# Function to extract text from PowerPoint files
+def extract_text_from_pptx(pptx_path):
+    """Collect the text of every shape exposing a ``text`` attribute, slide by slide.
+
+    Returns the pieces joined by newlines, with surrounding whitespace stripped.
+    """
+    deck = Presentation(pptx_path)
+    shape_texts = []
+    for slide in deck.slides:
+        for shape in slide.shapes:
+            # Shapes without a ``text`` attribute are skipped
+            if hasattr(shape, "text"):
+                shape_texts.append(shape.text)
+    return "\n".join(shape_texts).strip()
+# Function to extract text from Word documents
+def extract_text_from_docx(docx_path):
+    """Return the text of all paragraphs of a .docx file, one per line, stripped at both ends."""
+    paragraphs = Document(docx_path).paragraphs
+    lines = (paragraph.text for paragraph in paragraphs)
+    return "\n".join(lines).strip()
+# Function to extract text from images using OCR
+def extract_text_from_image(image_path):
+    """Run Tesseract OCR on an image file and return the recognized text.
+
+    Args:
+        image_path: Path of the image file to read.
+
+    Returns:
+        The OCR output with leading/trailing whitespace removed
+        (an empty string when nothing is recognized).
+    """
+    # The context manager closes the image file even if OCR raises, so the
+    # handle is released before the caller removes the file.
+    with Image.open(image_path) as img:
+        text = pytesseract.image_to_string(img)
+    return text.strip()
+# Summarization function
+def summarize_text(input_text):
+    """Summarize ``input_text`` with the module-level T5 tokenizer and model.
+
+    The text is prefixed with "summarize: ", truncated to 512 tokens, and
+    decoded from a 4-beam search producing between 30 and 100 tokens.
+    """
+    prompt = "summarize: " + input_text
+    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
+    generation_options = {
+        "max_length": 100,
+        "min_length": 30,
+        "length_penalty": 2.0,
+        "num_beams": 4,
+        "early_stopping": True,
+    }
+    generated = model.generate(encoded_prompt, **generation_options)
+    first_sequence = generated[0]
+    return tokenizer.decode(first_sequence, skip_special_tokens=True)
+# API for file upload and summarization
 @app.route("/summarize", methods=["POST"])
+def summarize_file():
+    """Summarize the text of an uploaded PDF, PPTX, DOCX or image file.
+
+    Expects a multipart upload in the "file" form field. The upload is
+    written to a uniquely named file in UPLOAD_FOLDER, its text is extracted
+    according to the (case-insensitive) file extension, the text is
+    summarized, and the stored file is removed again.
+
+    Returns:
+        A JSON response: {"summary": ...} on success; {"error": ...} with
+        status 400 for a missing file, an unsupported extension or a file
+        without text; {"error": ...} with status 500 if saving, extraction
+        or summarization raises.
+    """
+    import uuid  # Local import: only this view needs it
+    if "file" not in request.files:
         return jsonify({"error": "No file uploaded"}), 400
+    file = request.files["file"]
+    if not file.filename:
+        return jsonify({"error": "No selected file"}), 400
+    extractors = {
+        ".pdf": extract_text_from_pdf,
+        ".pptx": extract_text_from_pptx,
+        ".docx": extract_text_from_docx,
+        ".png": extract_text_from_image,
+        ".jpg": extract_text_from_image,
+        ".jpeg": extract_text_from_image,
+    }
+    # Lower-case the extension so "REPORT.PDF" is handled like "report.pdf",
+    # and reject unsupported types before anything is written to disk.
+    extension = os.path.splitext(file.filename)[1].lower()
+    extractor = extractors.get(extension)
+    if extractor is None:
+        return jsonify({"error": "Unsupported file type"}), 400
+    # The stored name is generated, never taken from the client: it cannot
+    # escape UPLOAD_FOLDER, concurrent uploads of the same name cannot
+    # overwrite or delete each other, and names that sanitizing would reduce
+    # to an empty string still work.
+    file_path = os.path.join(app.config["UPLOAD_FOLDER"], uuid.uuid4().hex + extension)
+    try:
+        file.save(file_path)
+        text = extractor(file_path)
+        if not text:
+            return jsonify({"error": "No text found in the file"}), 400
+        summary = summarize_text(text)
+        return jsonify({"summary": summary})
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+    finally:
+        # Clean up; the file does not exist if saving itself failed
+        try:
+            os.remove(file_path)
+        except FileNotFoundError:
+            pass
+# When run as a script, start the Flask server on all interfaces, port 7860
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)