mike23415 committed on
Commit
798ae00
·
verified ·
1 Parent(s): f8e5cca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -21
app.py CHANGED
@@ -14,13 +14,13 @@ logger = logging.getLogger(__name__)
14
 
15
  app = Flask(__name__)
16
 
17
- # Set Hugging Face cache directory (matches Dockerfile)
18
- os.environ["HF_HOME"] = "/app/hf_cache"
19
 
20
  # Load T5 model and tokenizer
21
  logger.info("Loading T5-Base model...")
22
  try:
23
- tokenizer = T5Tokenizer.from_pretrained("t5-base")
24
  model = T5ForConditionalGeneration.from_pretrained("t5-base")
25
  logger.info("T5-Base model loaded successfully.")
26
  except Exception as e:
@@ -30,10 +30,14 @@ except Exception as e:
30
  ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
31
 
32
  def allowed_file(filename):
 
33
  return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
34
 
35
  def summarize_text(text, max_length=150, min_length=30):
36
  """Summarize text using T5-Base."""
 
 
 
37
  try:
38
  input_text = "summarize: " + text
39
  inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
@@ -49,29 +53,37 @@ def summarize_text(text, max_length=150, min_length=30):
49
  return summary
50
  except Exception as e:
51
  logger.error(f"Error in T5 summarization: {str(e)}")
52
- raise
53
 
54
  @app.route("/", methods=["GET"])
55
  def index():
 
56
  logger.info("Root endpoint accessed.")
57
- return "Document Summarizer API with T5-Base is running! Use /summarize endpoint for POST requests."
58
 
59
  @app.route("/summarize", methods=["POST"])
60
  def summarize():
 
61
  logger.info("Summarize endpoint called.")
 
62
  if "file" not in request.files:
63
  logger.error("No file uploaded.")
64
  return jsonify({"error": "No file uploaded"}), 400
 
65
  file = request.files["file"]
 
66
  if file.filename == "":
67
  logger.error("No file selected.")
68
  return jsonify({"error": "No selected file"}), 400
 
69
  if not allowed_file(file.filename):
70
  logger.error(f"Unsupported file format: {file.filename}")
71
  return jsonify({"error": "Unsupported file format"}), 400
 
72
  filename = secure_filename(file.filename)
73
  file_content = file.read()
74
  file_ext = filename.rsplit(".", 1)[1].lower()
 
75
  try:
76
  if file_ext == "pdf":
77
  text = summarize_pdf(file_content)
@@ -81,34 +93,56 @@ def summarize():
81
  text = summarize_pptx(file_content)
82
  elif file_ext == "txt":
83
  text = summarize_txt(file_content)
 
84
  summary = summarize_text(text)
85
- logger.info(f"File {filename} summarized successfully with T5.")
 
86
  return jsonify({"filename": filename, "summary": summary})
 
87
  except Exception as e:
88
  logger.error(f"Error processing file {filename}: {str(e)}")
89
  return jsonify({"error": f"Error processing file: {str(e)}"}), 500
90
 
91
  def summarize_pdf(file_content):
92
- reader = PdfReader(io.BytesIO(file_content))
93
- text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
94
- return text
 
 
 
 
 
95
 
96
  def summarize_docx(file_content):
97
- doc = Document(io.BytesIO(file_content))
98
- text = "\n".join([para.text for para in doc.paragraphs])
99
- return text
 
 
 
 
 
100
 
101
  def summarize_pptx(file_content):
102
- ppt = Presentation(io.BytesIO(file_content))
103
- text = []
104
- for slide in ppt.slides:
105
- for shape in slide.shapes:
106
- if hasattr(shape, "text"):
107
- text.append(shape.text)
108
- return "\n".join(text)
 
 
 
 
 
109
 
110
  def summarize_txt(file_content):
111
- return file_content.decode("utf-8")
 
 
 
 
112
 
113
  if __name__ == "__main__":
114
- app.run(host="0.0.0.0", port=7860, debug=True)
 
14
 
15
  app = Flask(__name__)
16
 
17
+ # Set Hugging Face cache to a writable directory
18
+ os.environ["HF_HOME"] = "/tmp/huggingface_cache"
19
 
20
  # Load T5 model and tokenizer
21
  logger.info("Loading T5-Base model...")
22
  try:
23
+ tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
24
  model = T5ForConditionalGeneration.from_pretrained("t5-base")
25
  logger.info("T5-Base model loaded successfully.")
26
  except Exception as e:
 
30
  ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
31
 
32
  def allowed_file(filename):
33
+ """Check if the uploaded file has an allowed extension."""
34
  return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
35
 
36
  def summarize_text(text, max_length=150, min_length=30):
37
  """Summarize text using T5-Base."""
38
+ if not text.strip():
39
+ return "No meaningful text found in the document."
40
+
41
  try:
42
  input_text = "summarize: " + text
43
  inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
 
53
  return summary
54
  except Exception as e:
55
  logger.error(f"Error in T5 summarization: {str(e)}")
56
+ return "Error in summarization process."
57
 
58
  @app.route("/", methods=["GET"])
59
  def index():
60
+ """Root endpoint for API status check."""
61
  logger.info("Root endpoint accessed.")
62
+ return "Document Summarizer API with T5-Base is running! Use /summarize for POST requests."
63
 
64
  @app.route("/summarize", methods=["POST"])
65
  def summarize():
66
+ """Handle file uploads and summarize the content."""
67
  logger.info("Summarize endpoint called.")
68
+
69
  if "file" not in request.files:
70
  logger.error("No file uploaded.")
71
  return jsonify({"error": "No file uploaded"}), 400
72
+
73
  file = request.files["file"]
74
+
75
  if file.filename == "":
76
  logger.error("No file selected.")
77
  return jsonify({"error": "No selected file"}), 400
78
+
79
  if not allowed_file(file.filename):
80
  logger.error(f"Unsupported file format: {file.filename}")
81
  return jsonify({"error": "Unsupported file format"}), 400
82
+
83
  filename = secure_filename(file.filename)
84
  file_content = file.read()
85
  file_ext = filename.rsplit(".", 1)[1].lower()
86
+
87
  try:
88
  if file_ext == "pdf":
89
  text = summarize_pdf(file_content)
 
93
  text = summarize_pptx(file_content)
94
  elif file_ext == "txt":
95
  text = summarize_txt(file_content)
96
+
97
  summary = summarize_text(text)
98
+
99
+ logger.info(f"File {filename} summarized successfully.")
100
  return jsonify({"filename": filename, "summary": summary})
101
+
102
  except Exception as e:
103
  logger.error(f"Error processing file {filename}: {str(e)}")
104
  return jsonify({"error": f"Error processing file: {str(e)}"}), 500
105
 
106
  def summarize_pdf(file_content):
107
+ """Extract text from a PDF file."""
108
+ try:
109
+ reader = PdfReader(io.BytesIO(file_content))
110
+ text = "\n".join([page.extract_text() or "" for page in reader.pages])
111
+ return text.strip() or "No extractable text found in PDF."
112
+ except Exception as e:
113
+ logger.error(f"Error reading PDF: {str(e)}")
114
+ return "Error extracting text from PDF."
115
 
116
  def summarize_docx(file_content):
117
+ """Extract text from a DOCX file."""
118
+ try:
119
+ doc = Document(io.BytesIO(file_content))
120
+ text = "\n".join([para.text for para in doc.paragraphs])
121
+ return text.strip() or "No extractable text found in DOCX."
122
+ except Exception as e:
123
+ logger.error(f"Error reading DOCX: {str(e)}")
124
+ return "Error extracting text from DOCX."
125
 
126
  def summarize_pptx(file_content):
127
+ """Extract text from a PPTX file."""
128
+ try:
129
+ ppt = Presentation(io.BytesIO(file_content))
130
+ text = []
131
+ for slide in ppt.slides:
132
+ for shape in slide.shapes:
133
+ if hasattr(shape, "text"):
134
+ text.append(shape.text)
135
+ return "\n".join(text).strip() or "No extractable text found in PPTX."
136
+ except Exception as e:
137
+ logger.error(f"Error reading PPTX: {str(e)}")
138
+ return "Error extracting text from PPTX."
139
 
140
  def summarize_txt(file_content):
141
+ """Extract text from a TXT file with safe decoding."""
142
+ try:
143
+ return file_content.decode("utf-8").strip() or "No extractable text found in TXT."
144
+ except UnicodeDecodeError:
145
+ return file_content.decode("latin-1").strip() or "No extractable text found in TXT."
146
 
147
  if __name__ == "__main__":
148
+ app.run(host="0.0.0.0", port=7860, debug=True)