File size: 2,178 Bytes
d2d0219
 
 
 
 
 
 
3b4df89
524f780
d2d0219
 
 
17c487a
d2d0219
 
 
 
e116825
d2d0219
3b4df89
 
d2d0219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b4df89
 
d2d0219
 
3b4df89
d2d0219
 
 
 
3b4df89
d2d0219
 
 
 
 
 
 
3b4df89
d2d0219
 
 
 
 
 
 
3b4df89
9fd7d89
3b4df89
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import torch
import pdfplumber
import pytesseract
from PIL import Image
from docx import Document
from pptx import Presentation
from transformers import T5Tokenizer, T5ForConditionalGeneration
from flask import Flask, request, jsonify

# Run inference on the CPU only. The torch thread count is hard-coded to 4;
# adjust it to match the number of CPU cores available on the host.
torch.set_num_threads(4)
device = torch.device("cpu")

# Load the "t5-base" tokenizer and model once at import time, so the same
# instances are shared by every request, and move the model to the CPU device.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Flask application object on which the HTTP routes are registered.
app = Flask(__name__)

# Function to extract text from files
def extract_text(file):
    """Extract the plain text contained in an uploaded document or image.

    The parser is selected from the case-insensitive extension of
    ``file.filename``: ``.pdf``, ``.docx``, ``.pptx`` and
    ``.png``/``.jpg``/``.jpeg`` (OCR) are handled.

    Args:
        file: Uploaded file object exposing a ``filename`` attribute and
            readable by the parser chosen for its extension.

    Returns:
        The extracted text joined into one string (possibly empty), or
        ``None`` when the filename is missing or the extension is not one
        of the supported formats.
    """
    # An upload can arrive without a filename; treat that as "unsupported"
    # rather than failing on ``None.lower()``.
    filename = (file.filename or "").lower()

    if filename.endswith(".pdf"):
        with pdfplumber.open(file) as pdf:
            # Extract each page exactly once; pages that yield no text
            # (None or "") are skipped.
            page_texts = (page.extract_text() for page in pdf.pages)
            return " ".join(text for text in page_texts if text)

    if filename.endswith(".docx"):
        doc = Document(file)
        return " ".join(para.text for para in doc.paragraphs)

    if filename.endswith(".pptx"):
        prs = Presentation(file)
        # Not every shape carries text, so only shapes exposing ``text``
        # contribute to the result.
        return " ".join(
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )

    if filename.endswith((".png", ".jpg", ".jpeg")):
        # Close the image once OCR has finished instead of leaving it open.
        with Image.open(file) as image:
            return pytesseract.image_to_string(image)

    return None

@app.route("/summarize", methods=["POST"])
def summarize():
    """Summarize the text of an uploaded file with the T5 model.

    Expects a POST request carrying the document under the ``file`` form
    field.

    Returns:
        A JSON response ``{"summary": <text>}`` on success, or a JSON body
        ``{"error": <message>}`` with HTTP status 400 when no file was
        uploaded, the file could not be parsed, or it contained no text.
    """
    file = request.files.get("file")
    if not file:
        return jsonify({"error": "No file uploaded"}), 400

    # The document parsers raise on corrupt or mislabeled uploads. This is
    # the request boundary, so log the failure and answer with the same JSON
    # error shape as the other failure paths instead of an unhandled 500.
    try:
        text = extract_text(file)
    except Exception:
        app.logger.exception("Failed to extract text from %r", file.filename)
        return jsonify({"error": "Could not read file"}), 400

    # Strip before the emptiness check: whitespace-only output (common from
    # OCR) would otherwise be sent to the model as an empty prompt.
    text = (text or "").strip()
    if not text:
        return jsonify({"error": "No text found in file"}), 400

    # T5 selects the task from a textual prefix on the input.
    input_text = "summarize: " + text

    # Input is truncated to 512 tokens, so only the beginning of a long
    # document contributes to the summary.
    inputs = tokenizer.encode(
        input_text, return_tensors="pt", truncation=True, max_length=512
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        summary_ids = model.generate(
            inputs,
            max_length=150,
            min_length=50,
            length_penalty=2.0,
            num_beams=4,
        )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return jsonify({"summary": summary})

if __name__ == "__main__":
    # Announce the port, then start Flask's built-in server bound to all
    # network interfaces.
    print("🚀 API is running on port 7860")
    listen_options = {"host": "0.0.0.0", "port": 7860}
    app.run(**listen_options)