mike23415 committed on
Commit
2a3fae3
·
verified ·
1 Parent(s): 764d4f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -64
app.py CHANGED
@@ -1,89 +1,69 @@
1
  import os
2
- import fitz # PyMuPDF for PDF
3
- import pytesseract
4
- from PIL import Image
5
  from flask import Flask, request, jsonify
6
  from werkzeug.utils import secure_filename
7
- from transformers import T5Tokenizer, T5ForConditionalGeneration
8
- from pptx import Presentation
9
  from docx import Document
 
10
 
11
  app = Flask(__name__)
12
# Directory that receives uploaded files; created up front if it is missing.
app.config.update(UPLOAD_FOLDER="uploads")
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# Load the pretrained T5 tokenizer and model once, at import time.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
19
-
20
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    The per-page texts are joined with newlines and the result is stripped
    of leading and trailing whitespace.
    """
    # Use the document as a context manager so its file handle is released
    # as soon as the text has been read.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text.strip()
25
-
26
def extract_text_from_pptx(pptx_path):
    """Return the text of all shapes in the PowerPoint file at ``pptx_path``.

    Shapes without a ``text`` attribute are skipped. The collected texts are
    joined with newlines and the result is stripped.
    """
    deck = Presentation(pptx_path)
    pieces = []
    for slide in deck.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                pieces.append(shape.text)
    return "\n".join(pieces).strip()
31
 
32
def extract_text_from_docx(docx_path):
    """Return the paragraph text of the Word document at ``docx_path``.

    Paragraph texts are joined with newlines and the result is stripped.
    """
    lines = []
    for para in Document(docx_path).paragraphs:
        lines.append(para.text)
    return "\n".join(lines).strip()
37
 
38
def extract_text_from_image(image_path):
    """Return the OCR text of the image at ``image_path``, stripped."""
    # Open the image in a ``with`` block so the underlying file is closed
    # once OCR has finished.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    return text.strip()
43
 
44
def summarize_text(input_text):
    """Summarize ``input_text`` with the module-level T5 model.

    The text is prefixed with ``"summarize: "`` and encoded with truncation
    at ``max_length=512``. Generation uses ``num_beams=4`` with output
    length limits of 30 and 100, and the first generated sequence is decoded
    without special tokens.
    """
    prompt = "summarize: " + input_text
    input_ids = tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generation_options = dict(
        max_length=100,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    output_ids = model.generate(input_ids, **generation_options)
    best_sequence = output_ids[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)
49
-
50
@app.route("/summarize", methods=["POST"])
def summarize_file():
    """Handle an uploaded file: extract its text and return a summary.

    Expects a multipart upload under the ``file`` field. Supported types are
    PDF, PPTX, DOCX and PNG/JPG/JPEG images; the extension is matched
    case-insensitively.

    Returns:
        ``{"summary": ...}`` on success; ``{"error": ...}`` with status 400
        for a missing, unnamed, unsupported or text-less file; and
        ``{"error": ...}`` with status 500 if saving, extraction or
        summarization raises.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "No selected file"}), 400

    filename = secure_filename(file.filename)
    if not filename:
        # secure_filename() can reduce a name to an empty string; without
        # this guard the save path would be the upload folder itself.
        return jsonify({"error": "No selected file"}), 400

    # Pick the extractor before touching the disk so unsupported uploads are
    # never written. Matching is on the lower-cased name for every type.
    lowered = filename.lower()
    extractors = (
        ((".pdf",), extract_text_from_pdf),
        ((".pptx",), extract_text_from_pptx),
        ((".docx",), extract_text_from_docx),
        ((".png", ".jpg", ".jpeg"), extract_text_from_image),
    )
    extract = next(
        (func for suffixes, func in extractors if lowered.endswith(suffixes)),
        None,
    )
    if extract is None:
        return jsonify({"error": "Unsupported file type"}), 400

    file_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)

    try:
        # Saving happens inside the try so a failed write is reported as a
        # JSON error like every other processing failure.
        file.save(file_path)
        text = extract(file_path)

        if not text:
            return jsonify({"error": "No text found in the file"}), 400

        summary = summarize_text(text)
        return jsonify({"summary": summary})

    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # Clean up, but only if the file was actually created; otherwise
        # os.remove() would raise and mask the real response.
        if os.path.exists(file_path):
            os.remove(file_path)
87
 
if __name__ == "__main__":
    # Start Flask's built-in server on all interfaces, port 7860, when this
    # file is executed as a script.
    app.run(port=7860, host="0.0.0.0")
 
1
  import os
2
+ import io
 
 
3
  from flask import Flask, request, jsonify
4
  from werkzeug.utils import secure_filename
5
+ from PyPDF2 import PdfReader
 
6
  from docx import Document
7
+ from pptx import Presentation
8
 
9
  app = Flask(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
# Accepted upload extensions, lower-case and without the leading dot.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}


def allowed_file(filename):
    """Return True if ``filename`` has a dot and an allowed extension.

    The extension is the text after the last dot and is compared
    case-insensitively against ``ALLOWED_EXTENSIONS``.
    """
    _stem, dot, extension = filename.rpartition(".")
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
 
 
 
16
 
 
 
 
 
 
 
 
@app.route("/summarize", methods=["POST"])
def summarize():
    """Handle an uploaded file and return a short text summary of it.

    Expects a multipart upload under the ``file`` field whose name passes
    ``allowed_file``. The file is read into memory and passed to the
    ``summarize_*`` function for its extension.

    Returns:
        ``{"filename": ..., "summary": ...}`` on success; ``{"error": ...}``
        with status 400 for a missing, unnamed or unsupported file; and
        ``{"error": ...}`` with status 500 if the file cannot be parsed.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]

    if file.filename == "":
        return jsonify({"error": "No selected file"}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": "Unsupported file format"}), 400

    # Take the extension from the raw name that allowed_file() validated.
    # secure_filename() can strip the dot (e.g. ".pdf" or a non-ASCII stem
    # becomes "pdf"), so splitting its result could raise IndexError.
    file_ext = file.filename.rsplit(".", 1)[1].lower()
    filename = secure_filename(file.filename)
    file_content = file.read()

    # Built here, not at module level, because the summarize_* functions
    # are defined further down the file.
    handlers = {
        "pdf": summarize_pdf,
        "docx": summarize_docx,
        "pptx": summarize_pptx,
        "txt": summarize_txt,
    }

    try:
        summary = handlers[file_ext](file_content)
    except Exception as e:
        # Request boundary: report a corrupt or unreadable upload as a JSON
        # error rather than an unhandled exception.
        return jsonify({"error": str(e)}), 500

    return jsonify({"filename": filename, "summary": summary})
47
 
48
+ # Summarization functions
49
def summarize_pdf(file_content):
    """Return the first 500 characters of text extracted from a PDF.

    Args:
        file_content: The raw bytes of the PDF file.

    Pages that yield no text are skipped; the remaining page texts are
    joined with newlines before truncation.
    """
    reader = PdfReader(io.BytesIO(file_content))
    # Extract each page exactly once; the text is then filtered and joined
    # without running the extraction a second time.
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(page_text for page_text in page_texts if page_text)
    return text[:500]  # Returning a short summary (first 500 chars)
 
 
 
 
 
 
 
53
 
54
def summarize_docx(file_content):
    """Return the first 500 characters of a Word document's paragraph text.

    ``file_content`` is the raw bytes of the .docx file; paragraph texts are
    joined with newlines before truncation.
    """
    stream = io.BytesIO(file_content)
    paragraphs = Document(stream).paragraphs
    joined = "\n".join(paragraph.text for paragraph in paragraphs)
    return joined[:500]
58
 
59
def summarize_pptx(file_content):
    """Return the first 500 characters of a presentation's slide titles.

    ``file_content`` is the raw bytes of the .pptx file. Only slides whose
    ``shapes.title`` is truthy contribute; titles are joined with newlines.
    """
    deck = Presentation(io.BytesIO(file_content))
    titles = []
    for slide in deck.slides:
        if slide.shapes.title:
            titles.append(slide.shapes.title.text)
    return "\n".join(titles)[:500]
63
 
64
def summarize_txt(file_content):
    """Return the first 500 characters of a plain-text upload.

    Args:
        file_content: The raw bytes of the text file.

    The bytes are decoded as UTF-8. Invalid byte sequences are replaced with
    U+FFFD instead of raising, so a file in another encoding still yields a
    response rather than an unhandled ``UnicodeDecodeError``.
    """
    text = file_content.decode("utf-8", errors="replace")
    return text[:500]
 
67
 
if __name__ == "__main__":
    # Start Flask's built-in server on all interfaces, port 7860, when this
    # file is executed as a script.
    app.run(port=7860, host="0.0.0.0")