mike23415 committed on
Commit
764d4f7
·
verified ·
1 Parent(s): 8020602

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -49
app.py CHANGED
@@ -1,71 +1,89 @@
1
- import torch
2
- import pdfplumber
3
  import pytesseract
4
  from PIL import Image
5
- from docx import Document
6
- from pptx import Presentation
7
- from transformers import T5Tokenizer, T5ForConditionalGeneration
8
  from flask import Flask, request, jsonify
 
 
 
 
9
 
10
- # Optimize for CPU
11
- torch.set_num_threads(4) # Adjust based on CPU cores
12
- device = torch.device("cpu")
13
 
14
- # Load T5-Base model
15
  model_name = "t5-base"
16
  tokenizer = T5Tokenizer.from_pretrained(model_name)
17
- model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
18
-
19
- # Flask App
20
- app = Flask(__name__)
21
 
22
- # Function to extract text from files
23
- def extract_text(file):
24
- filename = file.filename.lower()
25
-
26
- if filename.endswith(".pdf"):
27
- with pdfplumber.open(file) as pdf:
28
- return " ".join([page.extract_text() for page in pdf.pages if page.extract_text()])
29
 
30
- elif filename.endswith(".docx"):
31
- doc = Document(file)
32
- return " ".join([para.text for para in doc.paragraphs])
 
 
33
 
34
- elif filename.endswith(".pptx"):
35
- prs = Presentation(file)
36
- return " ".join([shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")])
 
 
37
 
38
- elif filename.endswith((".png", ".jpg", ".jpeg")):
39
- image = Image.open(file)
40
- return pytesseract.image_to_string(image)
 
 
41
 
42
- return None
 
 
 
 
43
 
 
44
  @app.route("/summarize", methods=["POST"])
45
- def summarize():
46
- file = request.files.get("file")
47
-
48
- if not file:
49
  return jsonify({"error": "No file uploaded"}), 400
50
 
51
- text = extract_text(file)
52
- if not text:
53
- return jsonify({"error": "No text found in file"}), 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- # Format text for T5
56
- input_text = "summarize: " + text.strip()
57
 
58
- # Tokenize input
59
- inputs = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
60
 
61
- # Generate summary
62
- with torch.no_grad():
63
- summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4)
64
-
65
- summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
66
-
67
- return jsonify({"summary": summary})
68
 
69
  if __name__ == "__main__":
70
- print("🚀 API is running on port 7860")
71
  app.run(host="0.0.0.0", port=7860)
 
1
+ import os
2
+ import fitz # PyMuPDF for PDF
3
  import pytesseract
4
  from PIL import Image
 
 
 
5
  from flask import Flask, request, jsonify
6
+ from werkzeug.utils import secure_filename
7
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
8
+ from pptx import Presentation
9
+ from docx import Document
10
 
11
+ app = Flask(__name__)
12
+ app.config["UPLOAD_FOLDER"] = "uploads"
13
+ os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
14
 
15
+ # Load T5 model
16
  model_name = "t5-base"
17
  tokenizer = T5Tokenizer.from_pretrained(model_name)
18
+ model = T5ForConditionalGeneration.from_pretrained(model_name)
 
 
 
19
 
20
+ # Function to extract text from PDFs
21
+ def extract_text_from_pdf(pdf_path):
22
+ doc = fitz.open(pdf_path)
23
+ text = "\n".join([page.get_text("text") for page in doc])
24
+ return text.strip()
 
 
25
 
26
+ # Function to extract text from PowerPoint files
27
+ def extract_text_from_pptx(pptx_path):
28
+ presentation = Presentation(pptx_path)
29
+ text = "\n".join([shape.text for slide in presentation.slides for shape in slide.shapes if hasattr(shape, "text")])
30
+ return text.strip()
31
 
32
+ # Function to extract text from Word documents
33
+ def extract_text_from_docx(docx_path):
34
+ document = Document(docx_path)
35
+ text = "\n".join([paragraph.text for paragraph in document.paragraphs])
36
+ return text.strip()
37
 
38
+ # Function to extract text from images using OCR
39
+ def extract_text_from_image(image_path):
40
+ img = Image.open(image_path)
41
+ text = pytesseract.image_to_string(img)
42
+ return text.strip()
43
 
44
+ # Summarization function
45
+ def summarize_text(input_text):
46
+ input_ids = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)
47
+ output_ids = model.generate(input_ids, max_length=100, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
48
+ return tokenizer.decode(output_ids[0], skip_special_tokens=True)
49
 
50
+ # API for file upload and summarization
51
  @app.route("/summarize", methods=["POST"])
52
+ def summarize_file():
53
+ if "file" not in request.files:
 
 
54
  return jsonify({"error": "No file uploaded"}), 400
55
 
56
+ file = request.files["file"]
57
+ if file.filename == "":
58
+ return jsonify({"error": "No selected file"}), 400
59
+
60
+ filename = secure_filename(file.filename)
61
+ file_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
62
+ file.save(file_path)
63
+
64
+ try:
65
+ # Determine file type and extract text
66
+ if filename.endswith(".pdf"):
67
+ text = extract_text_from_pdf(file_path)
68
+ elif filename.endswith(".pptx"):
69
+ text = extract_text_from_pptx(file_path)
70
+ elif filename.endswith(".docx"):
71
+ text = extract_text_from_docx(file_path)
72
+ elif filename.lower().endswith((".png", ".jpg", ".jpeg")):
73
+ text = extract_text_from_image(file_path)
74
+ else:
75
+ return jsonify({"error": "Unsupported file type"}), 400
76
 
77
+ if not text:
78
+ return jsonify({"error": "No text found in the file"}), 400
79
 
80
+ summary = summarize_text(text)
81
+ return jsonify({"summary": summary})
82
 
83
+ except Exception as e:
84
+ return jsonify({"error": str(e)}), 500
85
+ finally:
86
+ os.remove(file_path) # Clean up
 
 
 
87
 
88
  if __name__ == "__main__":
 
89
  app.run(host="0.0.0.0", port=7860)