# SocialMediaFoci / preprocessor.py
# Last commit: "added the datetime import (#3)" — 1bd5399 (hansche)
import re
import pandas as pd
import spacy
from langdetect import detect_langs
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import streamlit as st
from datetime import datetime
# Multilingual Twitter sentiment model — the lighter base variant.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# Cache model loading with fallback for quantization
@st.cache_resource
def load_model():
    """Load the sentiment tokenizer/model once per Streamlit session.

    Returns:
        (tokenizer, model, config, device): the Hugging Face tokenizer, the
        classification model (moved to `device`), the model config (for
        `id2label`), and the device string ("cuda" or "cpu").

    On CPU, linear layers are dynamically quantized to int8 when the backend
    supports it; a quantization failure falls back to the full-precision model.
    """
    import platform  # local import: only needed to pick the quantization engine

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
    # Dynamic quantization is CPU-only, so skip it entirely on CUDA.
    if device == "cpu":
        try:
            # BUG FIX: the quantization engine depends on the CPU architecture
            # (fbgemm for x86, qnnpack for ARM), but the original keyed it off
            # torch.cuda.is_available(), which picked qnnpack on x86 CPU hosts
            # and made quantization fail every time.
            is_arm = platform.machine().lower() in ("arm64", "aarch64", "armv7l")
            torch.backends.quantized.engine = 'qnnpack' if is_arm else 'fbgemm'
            model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
            print("Model quantized successfully.")
        except RuntimeError as e:
            print(f"Quantization failed: {e}. Using non-quantized model.")
    config = AutoConfig.from_pretrained(MODEL)
    return tokenizer, model, config, device
# Module-level singletons shared by the functions below.
tokenizer, model, config, device = load_model()
# spaCy pipelines for French and English lemmatization.
nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")
# English + French stop words merged for the topic-model vectorizer.
custom_stop_words = list(ENGLISH_STOP_WORDS | FRENCH_STOP_WORDS)
def preprocess(text):
    """Mask mentions and URLs the way the Cardiff NLP Twitter models expect.

    Tokens starting with '@' (longer than a bare '@') become '@user' and
    tokens starting with 'http' become 'http'. `None`, or a value that cannot
    be converted to `str`, yields "".

    NOTE(review): this function is shadowed by the later `preprocess(data)`
    chat-export parser defined below, so module-level callers never reach it
    by this name — rename one of the two.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception:  # BUG FIX: was a bare `except:` (also caught SystemExit etc.)
            return ""
    masked = []
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            masked.append('@user')
        elif token.startswith('http'):
            masked.append('http')
        else:
            masked.append(token)
    return " ".join(masked)
def clean_message(text):
    """Normalize a chat message for topic modeling.

    Lowercases the text, strips WhatsApp placeholder strings, removes URLs,
    and drops every character that is not a Latin letter (including accented
    À-ÿ), digit, or whitespace. Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # WhatsApp export artifacts that carry no content.
    for junk in ("<media omitted>", "this message was deleted", "null"):
        lowered = lowered.replace(junk, "")
    no_links = re.sub(r"http\S+|www\S+|https\S+", "", lowered, flags=re.MULTILINE)
    letters_only = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", no_links)
    return letters_only.strip()
def lemmatize_text(text, lang):
    """Lemmatize `text` with the spaCy pipeline matching `lang`.

    Uses the French pipeline when `lang == 'fr'`, the English one for any
    other value, and returns the space-joined lemmas with punctuation
    tokens removed.
    """
    pipeline = nlp_fr if lang == 'fr' else nlp_en
    return " ".join(token.lemma_ for token in pipeline(text) if not token.is_punct)
def preprocess(data):
    """Parse a raw WhatsApp chat export and derive topic-model features.

    Steps: parse ``date, time - sender: message`` lines (dropping "system"
    messages), normalize timestamps, clean and lemmatize each message, then
    fit a 5-topic LDA model over the lemmatized text.

    Args:
        data: the full chat export as one string, one entry per line.

    Returns:
        (df, topics): `df` carries date/user/message/unfiltered_messages/
        lemmatized_message plus year/month/day/hour/day_of_week and the
        dominant `topic` index per message; `topics` lists the top-10 words
        of each LDA topic.

    NOTE(review): this definition shadows the earlier tweet-level
    `preprocess(text)` helper defined above.
    """
    pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
    filtered_messages, valid_dates = [], []
    for line in data.strip().split("\n"):
        match = re.match(pattern, line)
        if not match:
            continue
        entry = match.groupdict()
        sender = entry.get("Sender")
        if sender and sender.strip().lower() != "system":
            filtered_messages.append(f"{sender.strip()}: {entry['Message']}")
            # WhatsApp inserts a narrow no-break space (U+202F) before AM/PM;
            # normalize to a plain space so the parse format below matches.
            time_str = entry['Time'].replace('\u202f', ' ')
            valid_dates.append(f"{entry['Date']}, {time_str}")

    def convert_to_target_format(date_str):
        """Re-render 'DD/MM/YYYY, HH:MM' (24h) as 'M/D/YY, H:MM AM/PM'."""
        try:
            dt = datetime.strptime(date_str, '%d/%m/%Y, %H:%M')
        except ValueError:
            # Already 12-hour (or a different layout): pass through unchanged.
            return date_str
        hour_12 = dt.hour % 12 or 12  # map hour 0 to 12 for 12 AM/PM
        # BUG FIX: use a plain ASCII space before AM/PM. The original emitted
        # U+202F here while the pd.to_datetime format below used a plain
        # space, so every converted date could coerce to NaT.
        return (f"{dt.month}/{dt.day}/{dt.strftime('%y')}, "
                f"{hour_12}:{dt.minute:02d} {dt.strftime('%p')}")

    converted_dates = [convert_to_target_format(d) for d in valid_dates]
    df = pd.DataFrame({'user_message': filtered_messages, 'message_date': converted_dates})
    df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
    df.rename(columns={'message_date': 'date'}, inplace=True)

    # Split "sender: message"; lines that don't match are group notifications.
    users, messages = [], []
    msg_pattern = r"^(.*?):\s(.*)$"
    for message in df["user_message"]:
        m = re.match(msg_pattern, message)
        if m:
            users.append(m.group(1))
            messages.append(m.group(2))
        else:
            users.append("group_notification")
            messages.append(message)
    df["user"] = users
    df["message"] = messages
    df = df[df["user"] != "group_notification"].reset_index(drop=True)
    df["unfiltered_messages"] = df["message"]
    df["message"] = df["message"].apply(clean_message)

    # Time-based features derived from the parsed timestamp.
    df['year'] = pd.to_numeric(df['date'].dt.year, downcast='integer')
    df['month'] = df['date'].dt.month_name()
    df['day'] = pd.to_numeric(df['date'].dt.day, downcast='integer')
    df['hour'] = pd.to_numeric(df['date'].dt.hour, downcast='integer')
    df['day_of_week'] = df['date'].dt.day_name()

    # Lemmatize messages for topic modeling.
    lemmatized_messages = []
    for message in df["message"]:
        try:
            # BUG FIX: detect_langs returns a list of Language objects; the
            # original passed the whole list to lemmatize_text, so the
            # `lang == 'fr'` check never matched and every message was
            # lemmatized with the English pipeline.
            lang = detect_langs(message)[0].lang
            lemmatized_messages.append(lemmatize_text(message, lang))
        except Exception:
            # langdetect raises on empty/ambiguous text; treat as no content.
            lemmatized_messages.append("")
    df["lemmatized_message"] = lemmatized_messages
    df = df[df["message"].notnull() & (df["message"] != "")].copy()
    df.drop(columns=["user_message"], inplace=True)

    # Topic modeling: bag-of-words + LDA with 5 topics.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
    dtm = vectorizer.fit_transform(df['lemmatized_message'])
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)
    topic_results = lda.transform(dtm)
    df = df.iloc[:topic_results.shape[0]].copy()
    df['topic'] = topic_results.argmax(axis=1)

    # Top-10 words per topic, for visualization.
    feature_names = vectorizer.get_feature_names_out()
    topics = [[feature_names[i] for i in comp.argsort()[-10:]] for comp in lda.components_]
    return df, topics
def preprocess_for_clustering(df, n_clusters=5):
    """Cluster lemmatized messages with TF-IDF + KMeans and embed via t-SNE.

    Args:
        df: DataFrame with a 'lemmatized_message' column.
        n_clusters: number of KMeans clusters.

    Returns:
        (df, reduced_features, cluster_centers): the filtered DataFrame with a
        new 'cluster' column, 2-D t-SNE coordinates per message, and the
        KMeans centroids in TF-IDF space.

    Raises:
        ValueError: if fewer than two non-empty messages remain.
    """
    has_text = df["lemmatized_message"].notnull() & (df["lemmatized_message"].str.strip() != "")
    df = df[has_text].reset_index(drop=True)
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
    if tfidf_matrix.shape[0] < 2:
        raise ValueError("Not enough messages for clustering.")
    df = df.iloc[:tfidf_matrix.shape[0]].copy()
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['cluster'] = kmeans.fit_predict(tfidf_matrix)
    reduced_features = TSNE(n_components=2, random_state=42).fit_transform(tfidf_matrix.toarray())
    return df, reduced_features, kmeans.cluster_centers_
def _mask_tokens_for_sentiment(text):
    """Tweet-style masking (mentions -> '@user', URLs -> 'http'); '' for bad input."""
    if text is None:
        return ""
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception:
            return ""
    masked = []
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            masked.append('@user')
        elif token.startswith('http'):
            masked.append('http')
        else:
            masked.append(token)
    return " ".join(masked)


def predict_sentiment_batch(texts: list, batch_size: int = 32) -> list:
    """Predict sentiment labels for a list of texts in mini-batches.

    Args:
        texts: raw message strings.
        batch_size: number of texts tokenized and scored per forward pass.

    Returns:
        A list of label strings (from `config.id2label`) aligned with `texts`.

    Raises:
        TypeError: if `texts` is not a list.
    """
    if not isinstance(texts, list):
        raise TypeError(f"Expected list of texts, got {type(texts)}")
    # BUG FIX: the original called the module-level `preprocess`, which is
    # shadowed by the chat-export parser of the same name (it expects a whole
    # chat export and returns a DataFrame/topics tuple, not a string). Use a
    # dedicated masking helper so sentiment inputs are preprocessed correctly.
    processed_texts = [_mask_tokens_for_sentiment(t) for t in texts]
    predictions = []
    for start in range(0, len(processed_texts), batch_size):
        batch = processed_texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=128
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
        predictions.extend([config.id2label[p] for p in batch_preds])
    return predictions