ApsidalSolid4 committed on
Commit
83846c5
·
verified ·
1 Parent(s): b02cae9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -104
app.py CHANGED
@@ -11,11 +11,10 @@ from fastapi.middleware.cors import CORSMiddleware
11
  from concurrent.futures import ThreadPoolExecutor
12
  from functools import partial
13
  import time
14
- import pandas as pd
15
  from datetime import datetime
16
  import threading
17
  import random
18
- from openpyxl import load_workbook
19
 
20
  # Configure logging
21
  logging.basicConfig(level=logging.INFO)
@@ -30,53 +29,54 @@ CONFIDENCE_THRESHOLD = 0.65
30
  BATCH_SIZE = 8 # Reduced batch size for CPU
31
  MAX_WORKERS = 4 # Number of worker threads for processing
32
 
33
- class ExcelLogger:
34
def __init__(self, log_dir="logs", excel_file=None):
    """Initialize the Excel logger.

    Args:
        log_dir: Directory to store log files.
        excel_file: Specific Excel file name; defaults to a monthly
            predictions_YYYY-MM.xlsx file.
    """
    self.log_dir = log_dir
    os.makedirs(log_dir, exist_ok=True)

    # Default to one workbook per calendar month.
    workbook_name = excel_file
    if workbook_name is None:
        workbook_name = f"predictions_{datetime.now().strftime('%Y-%m')}.xlsx"

    self.excel_path = os.path.join(log_dir, workbook_name)

    # Build the workbook (with sheet headers) only on first use.
    if not os.path.exists(self.excel_path):
        self._create_excel_file()

    # Serializes workbook access across writer threads.
    self.file_lock = threading.Lock()
57
-
58
def _create_excel_file(self):
    """Create a new Excel workbook with empty Metrics and TextData sheets."""
    # One header-only DataFrame per sheet, keyed by sheet name.
    sheet_columns = {
        'Metrics': [
            'timestamp', 'word_count', 'mode', 'prediction',
            'confidence', 'prediction_time_seconds', 'num_sentences'
        ],
        'TextData': ['entry_id', 'timestamp', 'text'],
    }

    with pd.ExcelWriter(self.excel_path, engine='openpyxl') as writer:
        for sheet_name, columns in sheet_columns.items():
            pd.DataFrame(columns=columns).to_excel(
                writer, sheet_name=sheet_name, index=False)

    logger.info(f"Created new Excel log file: {self.excel_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  def log_prediction(self, prediction_data, store_text=True):
79
- """Log prediction data to the Excel file.
80
 
81
  Args:
82
  prediction_data: Dictionary containing prediction metrics
@@ -92,89 +92,95 @@ class ExcelLogger:
92
  if 'timestamp' not in prediction_data:
93
  prediction_data['timestamp'] = datetime.now().isoformat()
94
 
95
- # Add entry_id to the metrics
96
  metrics_data = prediction_data.copy()
97
  metrics_data['entry_id'] = entry_id
98
 
99
- # Start a thread to write data to Excel
100
  thread = threading.Thread(
101
- target=self._write_to_excel,
102
  args=(metrics_data, text, entry_id, store_text)
103
  )
104
  thread.daemon = True
105
  thread.start()
106
 
107
def _write_to_excel(self, metrics_data, text, entry_id, store_text):
    """Write data to the Excel file with a retry mechanism for concurrent access.

    Reads the existing sheets, appends one row each, and rewrites the
    workbook under the file lock. After ``max_retries`` failed attempts the
    data is diverted to CSV backup files so nothing is silently dropped.

    Args:
        metrics_data: Dict of prediction metrics (already includes entry_id).
        text: The analyzed text, or None/empty when not stored.
        entry_id: Unique identifier linking metrics and text rows.
        store_text: Whether to also persist the raw text to the TextData sheet.
    """
    max_retries = 5
    retry_delay = 0.5

    for attempt in range(max_retries):
        try:
            with self.file_lock:
                # Load existing metrics and append the new row.
                metrics_df = pd.read_excel(self.excel_path, sheet_name='Metrics')
                new_metrics = pd.DataFrame([metrics_data])
                metrics_df = pd.concat([metrics_df, new_metrics], ignore_index=True)

                if store_text and text:
                    new_text = pd.DataFrame([{
                        'entry_id': entry_id,
                        'timestamp': metrics_data['timestamp'],
                        'text': text
                    }])
                    # FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit. Only missing/unreadable
                    # sheets should fall back to a fresh DataFrame.
                    try:
                        text_df = pd.read_excel(self.excel_path, sheet_name='TextData')
                        text_df = pd.concat([text_df, new_text], ignore_index=True)
                    except Exception:
                        # TextData sheet doesn't exist or can't be read.
                        text_df = new_text

                # Rewrite both sheets in place.
                with pd.ExcelWriter(self.excel_path, engine='openpyxl', mode='a',
                                    if_sheet_exists='replace') as writer:
                    metrics_df.to_excel(writer, sheet_name='Metrics', index=False)
                    if store_text and text:
                        text_df.to_excel(writer, sheet_name='TextData', index=False)

            # Successfully wrote to file.
            break
        except Exception as e:
            # Likely concurrent access; retry with progressive backoff.
            logger.warning(f"Error writing to Excel (attempt {attempt+1}/{max_retries}): {e}")
            time.sleep(retry_delay * (attempt + 1))
    else:
        # All retries failed — divert to the CSV backup path.
        logger.error(f"Failed to write to Excel after {max_retries} attempts, logging to backup file")
        self._write_to_backup(metrics_data, text, entry_id, store_text)
160
-
161
- def _write_to_backup(self, metrics_data, text, entry_id, store_text):
162
- """Write to backup CSV files if Excel writing fails."""
163
- timestamp = datetime.now().strftime('%Y%m%d')
164
-
165
- # Log metrics to CSV
166
- metrics_csv = os.path.join(self.log_dir, f"metrics_backup_{timestamp}.csv")
167
- pd.DataFrame([metrics_data]).to_csv(metrics_csv, mode='a', header=not os.path.exists(metrics_csv), index=False)
 
 
 
 
 
 
 
 
 
168
 
169
- # Log text to separate CSV if needed
170
  if store_text and text:
171
- text_csv = os.path.join(self.log_dir, f"text_backup_{timestamp}.csv")
172
- text_data = {
173
- 'entry_id': entry_id,
174
- 'timestamp': metrics_data['timestamp'],
175
- 'text': text
176
- }
177
- pd.DataFrame([text_data]).to_csv(text_csv, mode='a', header=not os.path.exists(text_csv), index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
 
180
  class TextWindowProcessor:
@@ -480,7 +486,16 @@ class TextClassifier:
480
  }
481
 
482
  # Initialize the logger
483
- excel_logger = ExcelLogger(log_dir=".")
 
 
 
 
 
 
 
 
 
484
 
485
  def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
486
  """Analyze text using specified mode and return formatted results."""
@@ -532,7 +547,10 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
532
  "num_sentences": 0, # No sentence analysis in quick mode
533
  "text": text
534
  }
535
- excel_logger.log_prediction(log_data)
 
 
 
536
 
537
  else:
538
  analysis = classifier.detailed_scan(text)
@@ -576,14 +594,17 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
576
  "num_sentences": num_sentences,
577
  "text": text
578
  }
579
- excel_logger.log_prediction(log_data)
 
 
 
580
 
581
  return output
582
 
583
  # Initialize the classifier globally
584
  classifier = TextClassifier()
585
 
586
- # Create Gradio interface with added information about data collection
587
  demo = gr.Interface(
588
  fn=lambda text, mode: analyze_text(text, mode, classifier),
589
  inputs=[
@@ -619,8 +640,26 @@ app.add_middleware(
619
  allow_headers=["*"],
620
  )
621
 
 
 
 
 
 
622
  # Ensure CORS is applied before launching
623
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
624
  demo.queue()
625
  demo.launch(
626
  server_name="0.0.0.0",
 
11
  from concurrent.futures import ThreadPoolExecutor
12
  from functools import partial
13
  import time
14
+ import csv
15
  from datetime import datetime
16
  import threading
17
  import random
 
18
 
19
  # Configure logging
20
  logging.basicConfig(level=logging.INFO)
 
29
  BATCH_SIZE = 8 # Reduced batch size for CPU
30
  MAX_WORKERS = 4 # Number of worker threads for processing
31
 
32
+ class CSVLogger:
33
def __init__(self, log_dir="."):
    """Initialize the CSV logger.

    Creates (or reuses) one metrics CSV and one text-data CSV per calendar
    month inside *log_dir*, writing header rows on first use.

    Args:
        log_dir: Directory to store CSV log files.
    """
    self.log_dir = log_dir
    os.makedirs(log_dir, exist_ok=True)

    # Monthly log files so no single CSV grows without bound.
    current_month = datetime.now().strftime('%Y-%m')
    self.metrics_path = os.path.join(log_dir, f"metrics_{current_month}.csv")
    self.text_path = os.path.join(log_dir, f"text_data_{current_month}.csv")

    # Column order used for every row written to each file.
    self.metrics_headers = [
        'entry_id', 'timestamp', 'word_count', 'mode', 'prediction',
        'confidence', 'prediction_time_seconds', 'num_sentences'
    ]
    self.text_headers = ['entry_id', 'timestamp', 'text']

    # One lock per file so metrics and text writes don't block each other.
    # FIX: create the locks *before* touching the filesystem so the object
    # is fully constructed even if file initialization raises.
    self.metrics_lock = threading.Lock()
    self.text_lock = threading.Lock()

    # Create the files with header rows if they don't exist yet.
    self._initialize_files()

    # FIX: the module configures `logging`; use it instead of print().
    logging.getLogger(__name__).info(
        "CSV logger initialized with files at: %s",
        os.path.abspath(self.metrics_path))
63
+
64
+ def _initialize_files(self):
65
+ """Create the CSV files with headers if they don't exist."""
66
+ # Initialize metrics file
67
+ if not os.path.exists(self.metrics_path):
68
+ with open(self.metrics_path, 'w', newline='') as f:
69
+ writer = csv.writer(f)
70
+ writer.writerow(self.metrics_headers)
71
+
72
+ # Initialize text data file
73
+ if not os.path.exists(self.text_path):
74
+ with open(self.text_path, 'w', newline='') as f:
75
+ writer = csv.writer(f)
76
+ writer.writerow(self.text_headers)
77
 
78
  def log_prediction(self, prediction_data, store_text=True):
79
+ """Log prediction data to CSV files.
80
 
81
  Args:
82
  prediction_data: Dictionary containing prediction metrics
 
92
  if 'timestamp' not in prediction_data:
93
  prediction_data['timestamp'] = datetime.now().isoformat()
94
 
95
+ # Add entry_id to metrics data
96
  metrics_data = prediction_data.copy()
97
  metrics_data['entry_id'] = entry_id
98
 
99
+ # Start a thread to write data
100
  thread = threading.Thread(
101
+ target=self._write_to_csv,
102
  args=(metrics_data, text, entry_id, store_text)
103
  )
104
  thread.daemon = True
105
  thread.start()
106
 
107
+ def _write_to_csv(self, metrics_data, text, entry_id, store_text):
108
+ """Write data to CSV files with retry mechanism."""
109
  max_retries = 5
110
  retry_delay = 0.5
111
 
112
+ # Write metrics data
113
  for attempt in range(max_retries):
114
  try:
115
+ with self.metrics_lock:
116
+ with open(self.metrics_path, 'a', newline='') as f:
117
+ writer = csv.writer(f)
118
+ # Prepare row in the correct order based on headers
119
+ row = [
120
+ metrics_data.get('entry_id', ''),
121
+ metrics_data.get('timestamp', ''),
122
+ metrics_data.get('word_count', 0),
123
+ metrics_data.get('mode', ''),
124
+ metrics_data.get('prediction', ''),
125
+ metrics_data.get('confidence', 0.0),
126
+ metrics_data.get('prediction_time_seconds', 0.0),
127
+ metrics_data.get('num_sentences', 0)
128
+ ]
129
+ writer.writerow(row)
130
+ print(f"Successfully wrote metrics to CSV, entry_id: {entry_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  break
 
132
  except Exception as e:
133
+ print(f"Error writing metrics to CSV (attempt {attempt+1}/{max_retries}): {e}")
134
+ time.sleep(retry_delay * (attempt + 1))
 
135
  else:
136
+ # If all retries fail, write to backup file
137
+ backup_path = os.path.join(self.log_dir, f"metrics_backup_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv")
138
+ try:
139
+ with open(backup_path, 'w', newline='') as f:
140
+ writer = csv.writer(f)
141
+ writer.writerow(self.metrics_headers)
142
+ row = [
143
+ metrics_data.get('entry_id', ''),
144
+ metrics_data.get('timestamp', ''),
145
+ metrics_data.get('word_count', 0),
146
+ metrics_data.get('mode', ''),
147
+ metrics_data.get('prediction', ''),
148
+ metrics_data.get('confidence', 0.0),
149
+ metrics_data.get('prediction_time_seconds', 0.0),
150
+ metrics_data.get('num_sentences', 0)
151
+ ]
152
+ writer.writerow(row)
153
+ print(f"Wrote metrics backup to {backup_path}")
154
+ except Exception as e:
155
+ print(f"Error writing metrics backup: {e}")
156
 
157
+ # Write text data if requested
158
  if store_text and text:
159
+ for attempt in range(max_retries):
160
+ try:
161
+ with self.text_lock:
162
+ with open(self.text_path, 'a', newline='') as f:
163
+ writer = csv.writer(f)
164
+ # Handle potential newlines in text by replacing them
165
+ safe_text = text.replace('\n', ' ').replace('\r', ' ') if text else ''
166
+ writer.writerow([entry_id, metrics_data.get('timestamp', ''), safe_text])
167
+ print(f"Successfully wrote text data to CSV, entry_id: {entry_id}")
168
+ break
169
+ except Exception as e:
170
+ print(f"Error writing text data to CSV (attempt {attempt+1}/{max_retries}): {e}")
171
+ time.sleep(retry_delay * (attempt + 1))
172
+ else:
173
+ # If all retries fail, write to backup file
174
+ backup_path = os.path.join(self.log_dir, f"text_backup_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv")
175
+ try:
176
+ with open(backup_path, 'w', newline='') as f:
177
+ writer = csv.writer(f)
178
+ writer.writerow(self.text_headers)
179
+ safe_text = text.replace('\n', ' ').replace('\r', ' ') if text else ''
180
+ writer.writerow([entry_id, metrics_data.get('timestamp', ''), safe_text])
181
+ print(f"Wrote text data backup to {backup_path}")
182
+ except Exception as e:
183
+ print(f"Error writing text data backup: {e}")
184
 
185
 
186
  class TextWindowProcessor:
 
486
  }
487
 
488
  # Initialize the logger
489
+ csv_logger = CSVLogger(log_dir=".")
490
+
491
+ # Add file listing endpoint for debugging
492
def list_files():
    """Return the path of every file under the current directory, recursively."""
    return [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk('.')
        for filename in filenames
    ]
499
 
500
  def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
501
  """Analyze text using specified mode and return formatted results."""
 
547
  "num_sentences": 0, # No sentence analysis in quick mode
548
  "text": text
549
  }
550
+
551
+ # Log to CSV
552
+ print(f"Logging prediction data: word_count={word_count}, mode={mode}, prediction={prediction}")
553
+ csv_logger.log_prediction(log_data)
554
 
555
  else:
556
  analysis = classifier.detailed_scan(text)
 
594
  "num_sentences": num_sentences,
595
  "text": text
596
  }
597
+
598
+ # Log to CSV
599
+ print(f"Logging prediction data: word_count={word_count}, mode={mode}, prediction={prediction}")
600
+ csv_logger.log_prediction(log_data)
601
 
602
  return output
603
 
604
  # Initialize the classifier globally
605
  classifier = TextClassifier()
606
 
607
+ # Create Gradio interface
608
  demo = gr.Interface(
609
  fn=lambda text, mode: analyze_text(text, mode, classifier),
610
  inputs=[
 
640
  allow_headers=["*"],
641
  )
642
 
643
+ # Add file listing endpoint for debugging
644
+ @app.get("/list_files")
645
+ async def get_files():
646
+ return {"files": list_files()}
647
+
648
  # Ensure CORS is applied before launching
649
  if __name__ == "__main__":
650
+ # Create empty CSV files if they don't exist
651
+ current_month = datetime.now().strftime('%Y-%m')
652
+ metrics_path = f"metrics_{current_month}.csv"
653
+ text_path = f"text_data_{current_month}.csv"
654
+
655
+ print(f"Current directory: {os.getcwd()}")
656
+ print(f"Looking for CSV files: {metrics_path}, {text_path}")
657
+
658
+ if not os.path.exists(metrics_path):
659
+ print(f"Creating metrics CSV file: {metrics_path}")
660
+ if not os.path.exists(text_path):
661
+ print(f"Creating text data CSV file: {text_path}")
662
+
663
  demo.queue()
664
  demo.launch(
665
  server_name="0.0.0.0",