Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,89 +1,69 @@
|
|
1 |
import os
|
2 |
-
import
|
3 |
-
import pytesseract
|
4 |
-
from PIL import Image
|
5 |
from flask import Flask, request, jsonify
|
6 |
from werkzeug.utils import secure_filename
|
7 |
-
from
|
8 |
-
from pptx import Presentation
|
9 |
from docx import Document
|
|
|
10 |
|
11 |
app = Flask(__name__)
|
12 |
-
app.config["UPLOAD_FOLDER"] = "uploads"
|
13 |
-
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
|
14 |
-
|
15 |
-
# Load T5 model
|
16 |
-
model_name = "t5-base"
|
17 |
-
tokenizer = T5Tokenizer.from_pretrained(model_name)
|
18 |
-
model = T5ForConditionalGeneration.from_pretrained(model_name)
|
19 |
-
|
20 |
-
# Function to extract text from PDFs
|
21 |
-
def extract_text_from_pdf(pdf_path):
|
22 |
-
doc = fitz.open(pdf_path)
|
23 |
-
text = "\n".join([page.get_text("text") for page in doc])
|
24 |
-
return text.strip()
|
25 |
-
|
26 |
-
# Function to extract text from PowerPoint files
|
27 |
-
def extract_text_from_pptx(pptx_path):
|
28 |
-
presentation = Presentation(pptx_path)
|
29 |
-
text = "\n".join([shape.text for slide in presentation.slides for shape in slide.shapes if hasattr(shape, "text")])
|
30 |
-
return text.strip()
|
31 |
|
32 |
-
#
|
33 |
-
|
34 |
-
document = Document(docx_path)
|
35 |
-
text = "\n".join([paragraph.text for paragraph in document.paragraphs])
|
36 |
-
return text.strip()
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
img = Image.open(image_path)
|
41 |
-
text = pytesseract.image_to_string(img)
|
42 |
-
return text.strip()
|
43 |
|
44 |
-
# Summarization function
|
45 |
-
def summarize_text(input_text):
|
46 |
-
input_ids = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)
|
47 |
-
output_ids = model.generate(input_ids, max_length=100, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
|
48 |
-
return tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
49 |
-
|
50 |
-
# API for file upload and summarization
|
51 |
@app.route("/summarize", methods=["POST"])
|
52 |
-
def
|
53 |
if "file" not in request.files:
|
54 |
return jsonify({"error": "No file uploaded"}), 400
|
55 |
|
56 |
file = request.files["file"]
|
|
|
57 |
if file.filename == "":
|
58 |
return jsonify({"error": "No selected file"}), 400
|
59 |
|
|
|
|
|
|
|
60 |
filename = secure_filename(file.filename)
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
text = extract_text_from_pptx(file_path)
|
70 |
-
elif filename.endswith(".docx"):
|
71 |
-
text = extract_text_from_docx(file_path)
|
72 |
-
elif filename.lower().endswith((".png", ".jpg", ".jpeg")):
|
73 |
-
text = extract_text_from_image(file_path)
|
74 |
-
else:
|
75 |
-
return jsonify({"error": "Unsupported file type"}), 400
|
76 |
|
77 |
-
|
78 |
-
|
|
|
|
|
79 |
|
80 |
-
|
81 |
-
|
|
|
|
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
os.remove(file_path) # Clean up
|
87 |
|
88 |
if __name__ == "__main__":
|
89 |
app.run(host="0.0.0.0", port=7860)
|
|
|
1 |
import os
|
2 |
+
import io
|
|
|
|
|
3 |
from flask import Flask, request, jsonify
|
4 |
from werkzeug.utils import secure_filename
|
5 |
+
from PyPDF2 import PdfReader
|
|
|
6 |
from docx import Document
|
7 |
+
from pptx import Presentation
|
8 |
|
9 |
app = Flask(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
+
# File extensions (lower-case, without the leading dot) that the upload
# endpoint is willing to process.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}


def allowed_file(filename):
    """Return True if *filename* ends in one of ``ALLOWED_EXTENSIONS``.

    The comparison is case-insensitive and only looks at the text after the
    last dot, so ``"archive.tar.gz"`` is judged by ``"gz"``.

    Args:
        filename: The client-supplied file name. Anything that is not a
            string (for example ``None``) is treated as not allowed instead
            of raising ``TypeError``.

    Returns:
        bool: True when the name contains a dot and its final extension is
        in ``ALLOWED_EXTENSIONS``; False otherwise.
    """
    if not isinstance(filename, str):
        return False

    # rpartition yields an empty separator when there is no dot at all.
    _stem, dot, extension = filename.rpartition(".")
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
|
|
|
|
|
|
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
@app.route("/summarize", methods=["POST"])
def summarize():
    """Accept an uploaded document and return a short text preview as JSON.

    The upload is read from the ``file`` field of the multipart request.
    The handler is chosen by file extension and receives the raw bytes of
    the upload; nothing is written to disk.

    Returns:
        On success, a JSON object with ``filename`` (the sanitized name) and
        ``summary``. On failure, a JSON object with an ``error`` message and
        HTTP status 400 (missing file, empty name, unsupported extension, or
        a document that could not be parsed).
    """
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]

    # Covers both an empty string and a missing (None) filename.
    if not file.filename:
        return jsonify({"error": "No selected file"}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": "Unsupported file format"}), 400

    # Take the extension from the raw name that was just validated.
    # secure_filename() may strip characters such that the dot disappears
    # (e.g. a non-ASCII stem leaves just "pdf"), which would make
    # rsplit(".", 1)[1] on the sanitized name raise IndexError.
    file_ext = file.filename.rsplit(".", 1)[1].lower()
    filename = secure_filename(file.filename)

    # Dispatch table: extension -> function taking the raw file bytes.
    handlers = {
        "pdf": summarize_pdf,
        "docx": summarize_docx,
        "pptx": summarize_pptx,
        "txt": summarize_txt,
    }
    handler = handlers.get(file_ext)
    if handler is None:
        # An extension was allowed but has no handler wired up.
        return jsonify({"error": "Unsupported file format"}), 400

    try:
        summary = handler(file.read())
    except Exception:
        # Request boundary: a corrupt or mislabeled upload must not surface
        # as an unhandled 500. Log the traceback and report a client error.
        app.logger.exception("Failed to process upload %r", filename)
        return jsonify({"error": "Could not read file contents"}), 400

    return jsonify({"filename": filename, "summary": summary})
|
47 |
|
48 |
+
# Text-extraction helpers: each returns the first 500 characters of the extracted text as a stand-in summary
|
49 |
+
def summarize_pdf(file_content):
    """Return the first 500 characters of the text extracted from a PDF.

    Args:
        file_content: Raw bytes of the PDF file.

    Returns:
        str: The text of all pages that yielded any text, joined with
        newlines and truncated to 500 characters. Empty if no page
        produced text.
    """
    reader = PdfReader(io.BytesIO(file_content))

    # Extract each page exactly once; calling extract_text() in both the
    # filter and the value position would parse every page twice.
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(chunk for chunk in page_texts if chunk)

    return text[:500]  # Short preview: first 500 characters only
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
+
def summarize_docx(file_content):
    """Return the first 500 characters of a Word document's paragraph text.

    Args:
        file_content: Raw bytes of the .docx file.

    Returns:
        str: The text of every paragraph joined with newlines, truncated to
        500 characters.
    """
    stream = io.BytesIO(file_content)
    paragraphs = Document(stream).paragraphs
    joined = "\n".join(paragraph.text for paragraph in paragraphs)
    return joined[:500]
|
58 |
|
59 |
+
def summarize_pptx(file_content):
    """Return the first 500 characters of a presentation's slide titles.

    Only slides whose ``shapes.title`` is truthy contribute; other text on
    the slides is not read.

    Args:
        file_content: Raw bytes of the .pptx file.

    Returns:
        str: The slide titles joined with newlines, truncated to 500
        characters.
    """
    deck = Presentation(io.BytesIO(file_content))

    titles = []
    for slide in deck.slides:
        title_shape = slide.shapes.title
        if title_shape:
            titles.append(title_shape.text)

    return "\n".join(titles)[:500]
|
63 |
|
64 |
+
def summarize_txt(file_content):
    """Return the first 500 characters of a plain-text upload.

    Args:
        file_content: Raw bytes of the text file, decoded as UTF-8.

    Returns:
        str: The decoded text truncated to 500 characters. Bytes that are
        not valid UTF-8 are replaced with U+FFFD instead of raising
        ``UnicodeDecodeError``, so a mis-encoded upload still yields a
        preview.
    """
    text = file_content.decode("utf-8", errors="replace")
    return text[:500]
|
|
|
67 |
|
68 |
def main():
    """Start the Flask server on all interfaces, port 7860."""
    app.run(host="0.0.0.0", port=7860)


if __name__ == "__main__":
    main()
|