import pandas as pd
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

# Use a sentiment-specific model (replace with TinyBERT if fine-tuned)
MODEL = "tabularisai/multilingual-sentiment-analysis"  # Pre-trained for positive/negative sentiment

print("Loading model and tokenizer...")
start_load = time.time()

# Check for MPS (Metal) availability on the M2 chip, fall back to CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Load tokenizer, model, and config once (no redundant reloads)
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
config = AutoConfig.from_pretrained(MODEL)

load_time = time.time() - start_load
print(f"Model and tokenizer loaded in {load_time:.2f} seconds\n")
# Preprocessing: mask user handles and URLs (common for tweet-style text)
def preprocess(text):
    if not isinstance(text, str):
        text = str(text) if not pd.isna(text) else ""
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
# Batch prediction function (batched inference for throughput)
def predict_sentiment_batch(texts: list, batch_size: int = 16) -> list:
    if not isinstance(texts, list):
        raise TypeError(f"Expected list of texts, got {type(texts)}")
    # Keep one slot per input so the output stays aligned with the input;
    # invalid entries (NaN, non-strings, empty strings) become "".
    cleaned = [str(t) if isinstance(t, str) and t.strip() else "" for t in texts]
    if not any(cleaned):
        return []  # Nothing usable to predict on
    print(f"Processing {len(cleaned)} samples...")
    processed_texts = [preprocess(t) for t in cleaned]
    predictions = []
    start_pred = time.time()
    for i in range(0, len(processed_texts), batch_size):
        batch = processed_texts[i:i + batch_size]
        try:
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=64  # Short limit is enough for tweet-length texts and speeds up inference
            ).to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
            predictions.extend([config.id2label[int(p)] for p in batch_preds])
        except Exception as e:
            print(f"Error processing batch {i // batch_size}: {e}")
            predictions.extend(["neutral"] * len(batch))  # Fallback label; consider logging instead
    print(f"Predictions for {len(cleaned)} samples generated in {time.time() - start_pred:.2f} seconds")
    # Normalize labels, e.g. "Very Positive" -> "positive"
    predictions = [prediction.lower().replace("very ", "") for prediction in predictions]
    # print(predictions)  # Debug output; enable if needed
    return predictions
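# Quick smoke test (illustrative; uncomment to run). The texts are made up and the
# exact labels depend on the checkpoint's id2label mapping.
# sample_texts = ["I love this product!", "This is the worst service ever.", "It arrived on time."]
# print(predict_sentiment_batch(sample_texts, batch_size=2))
# # Expected: a list of three lowercase labels, e.g. ["positive", "negative", "neutral"]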
# # Example usage with a CSV dataset (uncomment and adjust paths)
# test_data = pd.read_csv("/Users/caasidev/development/AI/last try/Whatssap-project/srcs/tweets.csv")
# print(f"Processing {len(test_data)} samples...")
# start_prediction = time.time()
# text_samples = test_data['text'].tolist()
# test_data['predicted_sentiment'] = predict_sentiment_batch(text_samples)
# prediction_time = time.time() - start_prediction
# time_per_sample = prediction_time / len(test_data)
# # Print runtime statistics
# print("\nRuntime Statistics:")
# print(f"- Model loading time: {load_time:.2f} seconds")
# print(f"- Total prediction time for {len(test_data)} samples: {prediction_time:.2f} seconds")
# print(f"- Average time per sample: {time_per_sample:.4f} seconds")
# print(f"- Estimated time for 1000 samples: {(time_per_sample * 1000):.2f} seconds")
# print(f"- Estimated time for 20000 samples: {(time_per_sample * 20000 / 60):.2f} minutes")
# # Print a sample of predictions
# print("\nPredicted Sentiments (first 5 samples):")
# print(test_data[['text', 'predicted_sentiment']].head())