mike23415 committed on
Commit
dc17435
·
verified ·
1 Parent(s): 916aef5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -50
app.py CHANGED
@@ -12,15 +12,16 @@ from transformers import T5Tokenizer, T5ForConditionalGeneration
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
 
15
  app = Flask(__name__)
16
 
17
- # Set Hugging Face cache to a writable directory
18
- os.environ["HF_HOME"] = "/tmp/huggingface_cache"
19
 
20
  # Load T5 model and tokenizer
21
  logger.info("Loading T5-Base model...")
22
  try:
23
- tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
24
  model = T5ForConditionalGeneration.from_pretrained("t5-base")
25
  logger.info("T5-Base model loaded successfully.")
26
  except Exception as e:
@@ -35,10 +36,10 @@ def allowed_file(filename):
35
 
36
  def summarize_text(text, max_length=150, min_length=30):
37
  """Summarize text using T5-Base."""
38
- if not text.strip():
39
- return "No meaningful text found in the document."
40
-
41
  try:
 
 
 
42
  input_text = "summarize: " + text
43
  inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
44
  summary_ids = model.generate(
@@ -53,29 +54,28 @@ def summarize_text(text, max_length=150, min_length=30):
53
  return summary
54
  except Exception as e:
55
  logger.error(f"Error in T5 summarization: {str(e)}")
56
- return "Error in summarization process."
57
 
58
  @app.route("/", methods=["GET"])
59
  def index():
60
- """Root endpoint for API status check."""
61
  logger.info("Root endpoint accessed.")
62
- return "Document Summarizer API with T5-Base is running! Use /summarize for POST requests."
63
 
64
  @app.route("/summarize", methods=["POST"])
65
  def summarize():
66
- """Handle file uploads and summarize the content."""
67
  logger.info("Summarize endpoint called.")
68
-
69
  if "file" not in request.files:
70
  logger.error("No file uploaded.")
71
  return jsonify({"error": "No file uploaded"}), 400
72
-
73
- file = request.files["file"]
74
 
 
75
  if file.filename == "":
76
  logger.error("No file selected.")
77
  return jsonify({"error": "No selected file"}), 400
78
-
79
  if not allowed_file(file.filename):
80
  logger.error(f"Unsupported file format: {file.filename}")
81
  return jsonify({"error": "Unsupported file format"}), 400
@@ -83,7 +83,7 @@ def summarize():
83
  filename = secure_filename(file.filename)
84
  file_content = file.read()
85
  file_ext = filename.rsplit(".", 1)[1].lower()
86
-
87
  try:
88
  if file_ext == "pdf":
89
  text = summarize_pdf(file_content)
@@ -93,56 +93,45 @@ def summarize():
93
  text = summarize_pptx(file_content)
94
  elif file_ext == "txt":
95
  text = summarize_txt(file_content)
 
 
96
 
97
- summary = summarize_text(text)
 
98
 
 
99
  logger.info(f"File {filename} summarized successfully.")
100
  return jsonify({"filename": filename, "summary": summary})
101
-
102
  except Exception as e:
103
  logger.error(f"Error processing file {filename}: {str(e)}")
104
  return jsonify({"error": f"Error processing file: {str(e)}"}), 500
105
 
106
  def summarize_pdf(file_content):
107
- """Extract text from a PDF file."""
108
- try:
109
- reader = PdfReader(io.BytesIO(file_content))
110
- text = "\n".join([page.extract_text() or "" for page in reader.pages])
111
- return text.strip() or "No extractable text found in PDF."
112
- except Exception as e:
113
- logger.error(f"Error reading PDF: {str(e)}")
114
- return "Error extracting text from PDF."
115
 
116
  def summarize_docx(file_content):
117
- """Extract text from a DOCX file."""
118
- try:
119
- doc = Document(io.BytesIO(file_content))
120
- text = "\n".join([para.text for para in doc.paragraphs])
121
- return text.strip() or "No extractable text found in DOCX."
122
- except Exception as e:
123
- logger.error(f"Error reading DOCX: {str(e)}")
124
- return "Error extracting text from DOCX."
125
 
126
  def summarize_pptx(file_content):
127
- """Extract text from a PPTX file."""
128
- try:
129
- ppt = Presentation(io.BytesIO(file_content))
130
- text = []
131
- for slide in ppt.slides:
132
- for shape in slide.shapes:
133
- if hasattr(shape, "text"):
134
- text.append(shape.text)
135
- return "\n".join(text).strip() or "No extractable text found in PPTX."
136
- except Exception as e:
137
- logger.error(f"Error reading PPTX: {str(e)}")
138
- return "Error extracting text from PPTX."
139
 
140
  def summarize_txt(file_content):
141
- """Extract text from a TXT file with safe decoding."""
142
- try:
143
- return file_content.decode("utf-8").strip() or "No extractable text found in TXT."
144
- except UnicodeDecodeError:
145
- return file_content.decode("latin-1").strip() or "No extractable text found in TXT."
146
 
147
  if __name__ == "__main__":
148
  app.run(host="0.0.0.0", port=7860, debug=True)
 
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
+ # Initialize Flask app
16
  app = Flask(__name__)
17
 
18
+ # Set Hugging Face cache directory
19
+ os.environ["HF_HOME"] = "/app/hf_cache"
20
 
21
  # Load T5 model and tokenizer
22
  logger.info("Loading T5-Base model...")
23
  try:
24
+ tokenizer = T5Tokenizer.from_pretrained("t5-base")
25
  model = T5ForConditionalGeneration.from_pretrained("t5-base")
26
  logger.info("T5-Base model loaded successfully.")
27
  except Exception as e:
 
36
 
37
  def summarize_text(text, max_length=150, min_length=30):
38
  """Summarize text using T5-Base."""
 
 
 
39
  try:
40
+ if not text.strip():
41
+ return "No text found in the document to summarize."
42
+
43
  input_text = "summarize: " + text
44
  inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
45
  summary_ids = model.generate(
 
54
  return summary
55
  except Exception as e:
56
  logger.error(f"Error in T5 summarization: {str(e)}")
57
+ return "Error summarizing text."
58
 
@app.route("/", methods=["GET"])
def index():
    """Confirm the API is up and point callers at the /summarize endpoint."""
    logger.info("Root endpoint accessed.")
    status_message = (
        "Document Summarizer API with T5-Base is running! "
        "Use /summarize endpoint for POST requests."
    )
    return status_message
64
 
65
  @app.route("/summarize", methods=["POST"])
66
  def summarize():
67
+ """Handle file uploads and summarization."""
68
  logger.info("Summarize endpoint called.")
69
+
70
  if "file" not in request.files:
71
  logger.error("No file uploaded.")
72
  return jsonify({"error": "No file uploaded"}), 400
 
 
73
 
74
+ file = request.files["file"]
75
  if file.filename == "":
76
  logger.error("No file selected.")
77
  return jsonify({"error": "No selected file"}), 400
78
+
79
  if not allowed_file(file.filename):
80
  logger.error(f"Unsupported file format: {file.filename}")
81
  return jsonify({"error": "Unsupported file format"}), 400
 
83
  filename = secure_filename(file.filename)
84
  file_content = file.read()
85
  file_ext = filename.rsplit(".", 1)[1].lower()
86
+
87
  try:
88
  if file_ext == "pdf":
89
  text = summarize_pdf(file_content)
 
93
  text = summarize_pptx(file_content)
94
  elif file_ext == "txt":
95
  text = summarize_txt(file_content)
96
+ else:
97
+ return jsonify({"error": "Unsupported file format"}), 400
98
 
99
+ if not text.strip():
100
+ return jsonify({"error": "No extractable text found in the document"}), 400
101
 
102
+ summary = summarize_text(text)
103
  logger.info(f"File {filename} summarized successfully.")
104
  return jsonify({"filename": filename, "summary": summary})
105
+
106
  except Exception as e:
107
  logger.error(f"Error processing file {filename}: {str(e)}")
108
  return jsonify({"error": f"Error processing file: {str(e)}"}), 500
109
 
def summarize_pdf(file_content):
    """Pull the text out of a PDF supplied as raw bytes.

    The text of every page is joined with newlines; a page whose
    ``extract_text()`` result is falsy contributes an empty string.
    Surrounding whitespace is stripped from the joined result.
    """
    pdf_stream = io.BytesIO(file_content)
    page_texts = []
    for page in PdfReader(pdf_stream).pages:
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts).strip()
 
 
 
 
115
 
def summarize_docx(file_content):
    """Pull the text out of a DOCX supplied as raw bytes.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraph texts are joined with newlines and the result is
    stripped of surrounding whitespace.
    """
    document = Document(io.BytesIO(file_content))
    kept_paragraphs = []
    for para in document.paragraphs:
        if para.text.strip():
            kept_paragraphs.append(para.text)
    return "\n".join(kept_paragraphs).strip()
 
 
 
 
121
 
def summarize_pptx(file_content):
    """Pull the text out of a PPTX supplied as raw bytes.

    Walks every shape on every slide, keeps the stripped text of shapes
    that expose a non-blank ``text`` attribute, and joins the pieces with
    newlines.
    """
    deck = Presentation(io.BytesIO(file_content))
    fragments = [
        shape.text.strip()
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text") and shape.text.strip()
    ]
    return "\n".join(fragments).strip()
 
 
 
 
131
 
def summarize_txt(file_content):
    """Decode the raw bytes of a TXT upload into stripped text.

    UTF-8 is tried first, and a leading byte-order mark is dropped so it
    does not end up in the text handed to the summarizer. Bytes that are
    not valid UTF-8 fall back to Latin-1, which maps every byte to a
    character and therefore cannot raise, so a legacy-encoded upload is
    still processed instead of failing the request.

    Args:
        file_content: The uploaded file's contents as ``bytes``.

    Returns:
        The decoded text with surrounding whitespace removed; an empty
        string when the file holds no non-whitespace characters.
    """
    try:
        text = file_content.decode("utf-8-sig")
    except UnicodeDecodeError:
        text = file_content.decode("latin-1")
    return text.strip()
 
 
 
135
 
if __name__ == "__main__":
    # The server listens on all interfaces, so the interactive debugger must
    # stay off unless explicitly requested: with debug on, anyone who can
    # reach the port can run arbitrary code through the debugger console.
    # Set FLASK_DEBUG=1 to opt in during local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "0") == "1"
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)