Update app.py
Browse files
app.py
CHANGED
@@ -11,11 +11,10 @@ from fastapi.middleware.cors import CORSMiddleware
|
|
11 |
from concurrent.futures import ThreadPoolExecutor
|
12 |
from functools import partial
|
13 |
import time
|
14 |
-
import
|
15 |
from datetime import datetime
|
16 |
import threading
|
17 |
import random
|
18 |
-
from openpyxl import load_workbook
|
19 |
|
20 |
# Configure logging
|
21 |
logging.basicConfig(level=logging.INFO)
|
@@ -30,53 +29,54 @@ CONFIDENCE_THRESHOLD = 0.65
|
|
30 |
BATCH_SIZE = 8 # Reduced batch size for CPU
|
31 |
MAX_WORKERS = 4 # Number of worker threads for processing
|
32 |
|
33 |
-
class
|
34 |
-
def __init__(self, log_dir="
|
35 |
-
"""Initialize the
|
36 |
|
37 |
Args:
|
38 |
-
log_dir: Directory to store log files
|
39 |
-
excel_file: Specific Excel file name (defaults to predictions_YYYY-MM.xlsx)
|
40 |
"""
|
41 |
self.log_dir = log_dir
|
42 |
os.makedirs(log_dir, exist_ok=True)
|
43 |
|
44 |
-
#
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
if not os.path.exists(self.excel_path):
|
53 |
-
self._create_excel_file()
|
54 |
-
|
55 |
-
# Create a lock for thread safety
|
56 |
-
self.file_lock = threading.Lock()
|
57 |
-
|
58 |
-
def _create_excel_file(self):
|
59 |
-
"""Create a new Excel file with appropriate sheets and headers."""
|
60 |
-
# Create DataFrame for metrics
|
61 |
-
metrics_df = pd.DataFrame(columns=[
|
62 |
-
'timestamp', 'word_count', 'mode', 'prediction',
|
63 |
'confidence', 'prediction_time_seconds', 'num_sentences'
|
64 |
-
]
|
65 |
|
66 |
-
|
67 |
-
text_df = pd.DataFrame(columns=[
|
68 |
-
'entry_id', 'timestamp', 'text'
|
69 |
-
])
|
70 |
|
71 |
-
#
|
72 |
-
|
73 |
-
metrics_df.to_excel(writer, sheet_name='Metrics', index=False)
|
74 |
-
text_df.to_excel(writer, sheet_name='TextData', index=False)
|
75 |
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
def log_prediction(self, prediction_data, store_text=True):
|
79 |
-
"""Log prediction data to
|
80 |
|
81 |
Args:
|
82 |
prediction_data: Dictionary containing prediction metrics
|
@@ -92,89 +92,95 @@ class ExcelLogger:
|
|
92 |
if 'timestamp' not in prediction_data:
|
93 |
prediction_data['timestamp'] = datetime.now().isoformat()
|
94 |
|
95 |
-
# Add entry_id to
|
96 |
metrics_data = prediction_data.copy()
|
97 |
metrics_data['entry_id'] = entry_id
|
98 |
|
99 |
-
# Start a thread to write data
|
100 |
thread = threading.Thread(
|
101 |
-
target=self.
|
102 |
args=(metrics_data, text, entry_id, store_text)
|
103 |
)
|
104 |
thread.daemon = True
|
105 |
thread.start()
|
106 |
|
107 |
-
def
|
108 |
-
"""Write data to
|
109 |
max_retries = 5
|
110 |
retry_delay = 0.5
|
111 |
|
|
|
112 |
for attempt in range(max_retries):
|
113 |
try:
|
114 |
-
with self.
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
'timestamp': metrics_data['timestamp'],
|
131 |
-
'text': text
|
132 |
-
}])
|
133 |
-
text_df = pd.concat([text_df, new_text], ignore_index=True)
|
134 |
-
except:
|
135 |
-
# If TextData sheet doesn't exist or can't be read
|
136 |
-
text_df = pd.DataFrame([{
|
137 |
-
'entry_id': entry_id,
|
138 |
-
'timestamp': metrics_data['timestamp'],
|
139 |
-
'text': text
|
140 |
-
}])
|
141 |
-
|
142 |
-
# Write back to Excel
|
143 |
-
with pd.ExcelWriter(self.excel_path, engine='openpyxl', mode='a',
|
144 |
-
if_sheet_exists='replace') as writer:
|
145 |
-
metrics_df.to_excel(writer, sheet_name='Metrics', index=False)
|
146 |
-
if store_text and text:
|
147 |
-
text_df.to_excel(writer, sheet_name='TextData', index=False)
|
148 |
-
|
149 |
-
# Successfully wrote to file
|
150 |
break
|
151 |
-
|
152 |
except Exception as e:
|
153 |
-
|
154 |
-
|
155 |
-
time.sleep(retry_delay * (attempt + 1)) # Progressive backoff
|
156 |
else:
|
157 |
-
# If all retries fail,
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
-
#
|
170 |
if store_text and text:
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
|
179 |
|
180 |
class TextWindowProcessor:
|
@@ -480,7 +486,16 @@ class TextClassifier:
|
|
480 |
}
|
481 |
|
482 |
# Initialize the logger
|
483 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
484 |
|
485 |
def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
|
486 |
"""Analyze text using specified mode and return formatted results."""
|
@@ -532,7 +547,10 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
|
|
532 |
"num_sentences": 0, # No sentence analysis in quick mode
|
533 |
"text": text
|
534 |
}
|
535 |
-
|
|
|
|
|
|
|
536 |
|
537 |
else:
|
538 |
analysis = classifier.detailed_scan(text)
|
@@ -576,14 +594,17 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
|
|
576 |
"num_sentences": num_sentences,
|
577 |
"text": text
|
578 |
}
|
579 |
-
|
|
|
|
|
|
|
580 |
|
581 |
return output
|
582 |
|
583 |
# Initialize the classifier globally
|
584 |
classifier = TextClassifier()
|
585 |
|
586 |
-
# Create Gradio interface
|
587 |
demo = gr.Interface(
|
588 |
fn=lambda text, mode: analyze_text(text, mode, classifier),
|
589 |
inputs=[
|
@@ -619,8 +640,26 @@ app.add_middleware(
|
|
619 |
allow_headers=["*"],
|
620 |
)
|
621 |
|
|
|
|
|
|
|
|
|
|
|
622 |
# Ensure CORS is applied before launching
|
623 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
624 |
demo.queue()
|
625 |
demo.launch(
|
626 |
server_name="0.0.0.0",
|
|
|
11 |
from concurrent.futures import ThreadPoolExecutor
|
12 |
from functools import partial
|
13 |
import time
|
14 |
+
import csv
|
15 |
from datetime import datetime
|
16 |
import threading
|
17 |
import random
|
|
|
18 |
|
19 |
# Configure logging
|
20 |
logging.basicConfig(level=logging.INFO)
|
|
|
29 |
BATCH_SIZE = 8 # Reduced batch size for CPU
|
30 |
MAX_WORKERS = 4 # Number of worker threads for processing
|
31 |
|
32 |
+
class CSVLogger:
|
33 |
+
def __init__(self, log_dir="."):
|
34 |
+
"""Initialize the CSV logger.
|
35 |
|
36 |
Args:
|
37 |
+
log_dir: Directory to store CSV log files
|
|
|
38 |
"""
|
39 |
self.log_dir = log_dir
|
40 |
os.makedirs(log_dir, exist_ok=True)
|
41 |
|
42 |
+
# Create monthly CSV files
|
43 |
+
current_month = datetime.now().strftime('%Y-%m')
|
44 |
+
self.metrics_path = os.path.join(log_dir, f"metrics_{current_month}.csv")
|
45 |
+
self.text_path = os.path.join(log_dir, f"text_data_{current_month}.csv")
|
46 |
|
47 |
+
# Define headers
|
48 |
+
self.metrics_headers = [
|
49 |
+
'entry_id', 'timestamp', 'word_count', 'mode', 'prediction',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
'confidence', 'prediction_time_seconds', 'num_sentences'
|
51 |
+
]
|
52 |
|
53 |
+
self.text_headers = ['entry_id', 'timestamp', 'text']
|
|
|
|
|
|
|
54 |
|
55 |
+
# Initialize the files if they don't exist
|
56 |
+
self._initialize_files()
|
|
|
|
|
57 |
|
58 |
+
# Create locks for thread safety
|
59 |
+
self.metrics_lock = threading.Lock()
|
60 |
+
self.text_lock = threading.Lock()
|
61 |
+
|
62 |
+
print(f"CSV logger initialized with files at: {os.path.abspath(self.metrics_path)}")
|
63 |
+
|
64 |
+
def _initialize_files(self):
|
65 |
+
"""Create the CSV files with headers if they don't exist."""
|
66 |
+
# Initialize metrics file
|
67 |
+
if not os.path.exists(self.metrics_path):
|
68 |
+
with open(self.metrics_path, 'w', newline='') as f:
|
69 |
+
writer = csv.writer(f)
|
70 |
+
writer.writerow(self.metrics_headers)
|
71 |
+
|
72 |
+
# Initialize text data file
|
73 |
+
if not os.path.exists(self.text_path):
|
74 |
+
with open(self.text_path, 'w', newline='') as f:
|
75 |
+
writer = csv.writer(f)
|
76 |
+
writer.writerow(self.text_headers)
|
77 |
|
78 |
def log_prediction(self, prediction_data, store_text=True):
|
79 |
+
"""Log prediction data to CSV files.
|
80 |
|
81 |
Args:
|
82 |
prediction_data: Dictionary containing prediction metrics
|
|
|
92 |
if 'timestamp' not in prediction_data:
|
93 |
prediction_data['timestamp'] = datetime.now().isoformat()
|
94 |
|
95 |
+
# Add entry_id to metrics data
|
96 |
metrics_data = prediction_data.copy()
|
97 |
metrics_data['entry_id'] = entry_id
|
98 |
|
99 |
+
# Start a thread to write data
|
100 |
thread = threading.Thread(
|
101 |
+
target=self._write_to_csv,
|
102 |
args=(metrics_data, text, entry_id, store_text)
|
103 |
)
|
104 |
thread.daemon = True
|
105 |
thread.start()
|
106 |
|
107 |
+
def _write_to_csv(self, metrics_data, text, entry_id, store_text):
|
108 |
+
"""Write data to CSV files with retry mechanism."""
|
109 |
max_retries = 5
|
110 |
retry_delay = 0.5
|
111 |
|
112 |
+
# Write metrics data
|
113 |
for attempt in range(max_retries):
|
114 |
try:
|
115 |
+
with self.metrics_lock:
|
116 |
+
with open(self.metrics_path, 'a', newline='') as f:
|
117 |
+
writer = csv.writer(f)
|
118 |
+
# Prepare row in the correct order based on headers
|
119 |
+
row = [
|
120 |
+
metrics_data.get('entry_id', ''),
|
121 |
+
metrics_data.get('timestamp', ''),
|
122 |
+
metrics_data.get('word_count', 0),
|
123 |
+
metrics_data.get('mode', ''),
|
124 |
+
metrics_data.get('prediction', ''),
|
125 |
+
metrics_data.get('confidence', 0.0),
|
126 |
+
metrics_data.get('prediction_time_seconds', 0.0),
|
127 |
+
metrics_data.get('num_sentences', 0)
|
128 |
+
]
|
129 |
+
writer.writerow(row)
|
130 |
+
print(f"Successfully wrote metrics to CSV, entry_id: {entry_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
break
|
|
|
132 |
except Exception as e:
|
133 |
+
print(f"Error writing metrics to CSV (attempt {attempt+1}/{max_retries}): {e}")
|
134 |
+
time.sleep(retry_delay * (attempt + 1))
|
|
|
135 |
else:
|
136 |
+
# If all retries fail, write to backup file
|
137 |
+
backup_path = os.path.join(self.log_dir, f"metrics_backup_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv")
|
138 |
+
try:
|
139 |
+
with open(backup_path, 'w', newline='') as f:
|
140 |
+
writer = csv.writer(f)
|
141 |
+
writer.writerow(self.metrics_headers)
|
142 |
+
row = [
|
143 |
+
metrics_data.get('entry_id', ''),
|
144 |
+
metrics_data.get('timestamp', ''),
|
145 |
+
metrics_data.get('word_count', 0),
|
146 |
+
metrics_data.get('mode', ''),
|
147 |
+
metrics_data.get('prediction', ''),
|
148 |
+
metrics_data.get('confidence', 0.0),
|
149 |
+
metrics_data.get('prediction_time_seconds', 0.0),
|
150 |
+
metrics_data.get('num_sentences', 0)
|
151 |
+
]
|
152 |
+
writer.writerow(row)
|
153 |
+
print(f"Wrote metrics backup to {backup_path}")
|
154 |
+
except Exception as e:
|
155 |
+
print(f"Error writing metrics backup: {e}")
|
156 |
|
157 |
+
# Write text data if requested
|
158 |
if store_text and text:
|
159 |
+
for attempt in range(max_retries):
|
160 |
+
try:
|
161 |
+
with self.text_lock:
|
162 |
+
with open(self.text_path, 'a', newline='') as f:
|
163 |
+
writer = csv.writer(f)
|
164 |
+
# Handle potential newlines in text by replacing them
|
165 |
+
safe_text = text.replace('\n', ' ').replace('\r', ' ') if text else ''
|
166 |
+
writer.writerow([entry_id, metrics_data.get('timestamp', ''), safe_text])
|
167 |
+
print(f"Successfully wrote text data to CSV, entry_id: {entry_id}")
|
168 |
+
break
|
169 |
+
except Exception as e:
|
170 |
+
print(f"Error writing text data to CSV (attempt {attempt+1}/{max_retries}): {e}")
|
171 |
+
time.sleep(retry_delay * (attempt + 1))
|
172 |
+
else:
|
173 |
+
# If all retries fail, write to backup file
|
174 |
+
backup_path = os.path.join(self.log_dir, f"text_backup_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv")
|
175 |
+
try:
|
176 |
+
with open(backup_path, 'w', newline='') as f:
|
177 |
+
writer = csv.writer(f)
|
178 |
+
writer.writerow(self.text_headers)
|
179 |
+
safe_text = text.replace('\n', ' ').replace('\r', ' ') if text else ''
|
180 |
+
writer.writerow([entry_id, metrics_data.get('timestamp', ''), safe_text])
|
181 |
+
print(f"Wrote text data backup to {backup_path}")
|
182 |
+
except Exception as e:
|
183 |
+
print(f"Error writing text data backup: {e}")
|
184 |
|
185 |
|
186 |
class TextWindowProcessor:
|
|
|
486 |
}
|
487 |
|
488 |
# Initialize the logger
|
489 |
+
csv_logger = CSVLogger(log_dir=".")
|
490 |
+
|
491 |
+
# Add file listing endpoint for debugging
|
492 |
+
def list_files():
    """List all files in the current directory and subdirectories."""
    return [os.path.join(root, name)
            for root, _dirs, names in os.walk('.')
            for name in names]
499 |
|
500 |
def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
|
501 |
"""Analyze text using specified mode and return formatted results."""
|
|
|
547 |
"num_sentences": 0, # No sentence analysis in quick mode
|
548 |
"text": text
|
549 |
}
|
550 |
+
|
551 |
+
# Log to CSV
|
552 |
+
print(f"Logging prediction data: word_count={word_count}, mode={mode}, prediction={prediction}")
|
553 |
+
csv_logger.log_prediction(log_data)
|
554 |
|
555 |
else:
|
556 |
analysis = classifier.detailed_scan(text)
|
|
|
594 |
"num_sentences": num_sentences,
|
595 |
"text": text
|
596 |
}
|
597 |
+
|
598 |
+
# Log to CSV
|
599 |
+
print(f"Logging prediction data: word_count={word_count}, mode={mode}, prediction={prediction}")
|
600 |
+
csv_logger.log_prediction(log_data)
|
601 |
|
602 |
return output
|
603 |
|
604 |
# Initialize the classifier globally
|
605 |
classifier = TextClassifier()
|
606 |
|
607 |
+
# Create Gradio interface
|
608 |
demo = gr.Interface(
|
609 |
fn=lambda text, mode: analyze_text(text, mode, classifier),
|
610 |
inputs=[
|
|
|
640 |
allow_headers=["*"],
|
641 |
)
|
642 |
|
643 |
+
# Add file listing endpoint for debugging
|
644 |
+
@app.get("/list_files")
async def get_files():
    """Debug endpoint: return every file under the app's working directory.

    NOTE(review): this exposes the full server file tree to any caller —
    confirm it is intended to stay enabled outside of debugging.
    """
    return {"files": list_files()}
|
647 |
+
|
648 |
# Ensure CORS is applied before launching
|
649 |
if __name__ == "__main__":
|
650 |
+
# Create empty CSV files if they don't exist
|
651 |
+
current_month = datetime.now().strftime('%Y-%m')
|
652 |
+
metrics_path = f"metrics_{current_month}.csv"
|
653 |
+
text_path = f"text_data_{current_month}.csv"
|
654 |
+
|
655 |
+
print(f"Current directory: {os.getcwd()}")
|
656 |
+
print(f"Looking for CSV files: {metrics_path}, {text_path}")
|
657 |
+
|
658 |
+
if not os.path.exists(metrics_path):
|
659 |
+
print(f"Creating metrics CSV file: {metrics_path}")
|
660 |
+
if not os.path.exists(text_path):
|
661 |
+
print(f"Creating text data CSV file: {text_path}")
|
662 |
+
|
663 |
demo.queue()
|
664 |
demo.launch(
|
665 |
server_name="0.0.0.0",
|