import torch
import pdfplumber
import pytesseract
from PIL import Image
from docx import Document
from pptx import Presentation
from transformers import T5Tokenizer, T5ForConditionalGeneration
from flask import Flask, request, jsonify
# --- CPU inference setup ---
# Cap PyTorch intra-op parallelism; tune to the host's physical core count.
torch.set_num_threads(4) # Adjust based on CPU cores
device = torch.device("cpu")

# --- Model: T5-Base summarizer ---
# Loaded once at import time; downloads weights on first run.
# NOTE(review): loading here means server startup blocks on the download —
# acceptable for a single-process demo service.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# --- Flask application ---
app = Flask(__name__)
# Function to extract text from files
def extract_text(file):
    """Extract plain text from an uploaded document.

    Dispatches on the file extension: PDF via pdfplumber, DOCX via
    python-docx, PPTX via python-pptx, and PNG/JPG/JPEG via pytesseract
    OCR.

    Args:
        file: A file-like upload object exposing ``filename`` (e.g. a
            werkzeug ``FileStorage``); the stream itself is handed to the
            matching parser.

    Returns:
        All extracted text joined with single spaces, or ``None`` when
        the extension is not supported.
    """
    filename = file.filename.lower()
    if filename.endswith(".pdf"):
        with pdfplumber.open(file) as pdf:
            # Bind the result with a walrus so extract_text() runs once
            # per page (previously it ran twice: filter, then join).
            return " ".join(
                text for page in pdf.pages if (text := page.extract_text())
            )
    elif filename.endswith(".docx"):
        doc = Document(file)
        return " ".join(para.text for para in doc.paragraphs)
    elif filename.endswith(".pptx"):
        prs = Presentation(file)
        return " ".join(
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
    elif filename.endswith((".png", ".jpg", ".jpeg")):
        # Context manager releases the underlying image file handle
        # (the original left it open until garbage collection).
        with Image.open(file) as image:
            return pytesseract.image_to_string(image)
    # Unsupported extension: signal "no text" to the caller.
    return None
@app.route("/summarize", methods=["POST"])
def summarize():
    """Summarize an uploaded document with T5-Base.

    Expects a multipart/form-data POST with the document under the
    ``file`` key. Responds with JSON: ``{"summary": ...}`` on success,
    or ``{"error": ...}`` with a 4xx status on failure.
    """
    file = request.files.get("file")
    if not file:
        return jsonify({"error": "No file uploaded"}), 400
    try:
        text = extract_text(file)
    except Exception:
        # A corrupt/unreadable upload previously escaped as an HTML 500;
        # return a JSON error so API clients can parse the failure.
        return jsonify({"error": "Could not read file"}), 422
    # Strip BEFORE the emptiness check: a whitespace-only document used to
    # pass `if not text` and be summarized as an empty prompt.
    text = (text or "").strip()
    if not text:
        return jsonify({"error": "No text found in file"}), 400
    # T5 is a text-to-text model; the task is selected by this prefix.
    input_text = "summarize: " + text
    # Truncate to the model's 512-token context window.
    inputs = tokenizer.encode(
        input_text, return_tensors="pt", truncation=True, max_length=512
    ).to(device)
    # Inference only — skip autograd bookkeeping.
    with torch.no_grad():
        summary_ids = model.generate(
            inputs,
            max_length=150,
            min_length=50,
            length_penalty=2.0,
            num_beams=4,
        )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return jsonify({"summary": summary})
if __name__ == "__main__":
    # Bind on all interfaces, port 7860 — presumably for Hugging Face
    # Spaces hosting (its conventional app port); confirm for other hosts.
    print("🚀 API is running on port 7860")
    app.run(host="0.0.0.0", port=7860)