Spaces:
Sleeping
Sleeping
import numpy as np | |
import pandas as pd | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.cluster import KMeans | |
from sklearn.metrics.pairwise import cosine_similarity | |
import random | |
import json | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
from typing import List, Dict, Any, Optional | |
from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT | |
from scipy.sparse import csr_matrix | |
from .base import BaseClassifier | |
class TFIDFClassifier(BaseClassifier): | |
"""Classifier using TF-IDF and clustering for fast classification""" | |
def __init__(self) -> None:
    """Create an unfitted TF-IDF vectorizer and clear all clustering state."""
    super().__init__()

    # Vectorizer settings: English stop words removed, unigrams and bigrams
    # as terms, vocabulary capped at 1000 features.
    self.vectorizer: TfidfVectorizer = TfidfVectorizer(
        max_features=1000,
        stop_words="english",
        ngram_range=(1, 2),
    )

    # Everything below stays None until classify() assigns it.
    self.model: Optional[KMeans] = None
    self.feature_names: Optional[np.ndarray] = None
    self.categories: Optional[List[str]] = None
    self.centroids: Optional[np.ndarray] = None
def classify(
    self, texts: List[str], categories: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """Classify texts by clustering their TF-IDF vectors with K-Means.

    The vectorizer is re-fitted on ``texts`` on every call. The resulting
    vectors are clustered, and each text is labelled with the entry of the
    category list whose position equals the text's cluster index.

    NOTE: cluster indices come straight from ``KMeans.fit_predict``, so the
    pairing between caller-supplied labels and clusters is purely
    positional.

    Args:
        texts: Documents to classify.
        categories: Optional category labels. When omitted or empty,
            labels are requested from ``self._generate_default_categories``
            for ``min(5, len(texts))`` clusters.

    Returns:
        One dict per input text, in input order, with the keys
        ``"category"``, ``"confidence"`` and ``"explanation"``. An empty
        ``texts`` yields an empty list and leaves the instance untouched.
    """
    # Nothing to vectorize or cluster. len() is used instead of truthiness
    # so array-like inputs do not raise on the emptiness check.
    if len(texts) == 0:
        return []

    # Vectorize the texts
    X: csr_matrix = self.vectorizer.fit_transform(texts)
    self.feature_names = self.vectorizer.get_feature_names_out()

    if not categories:
        # Auto-detect categories; never ask for more clusters than texts.
        num_clusters: int = min(5, len(texts))
        self.categories = self._generate_default_categories(texts, num_clusters)
    else:
        self.categories = categories
        # K-Means needs at least as many samples as clusters, so cap the
        # cluster count; every cluster index still maps to a valid label.
        num_clusters = min(len(categories), len(texts))

    # Cluster the texts (fixed seed keeps the assignment reproducible)
    self.model = KMeans(n_clusters=num_clusters, random_state=42)
    clusters: np.ndarray = self.model.fit_predict(X)
    self.centroids = self.model.cluster_centers_

    # Row i holds the distances from text i to every centroid.
    distances: np.ndarray = self._calculate_distances(X)

    results: List[Dict[str, Any]] = []
    for row, (label, row_distances) in enumerate(zip(clusters, distances)):
        # Plain int so the index is not a NumPy scalar.
        cluster_idx: int = int(label)
        results.append(
            {
                "category": self.categories[cluster_idx],
                "confidence": self._calculate_confidence(row_distances),
                "explanation": self._generate_explanation(X[row], cluster_idx),
            }
        )
    return results
def _calculate_distances(self, X: csr_matrix) -> np.ndarray:
    """Return the Euclidean distance from every row of ``X`` to every centroid.

    Args:
        X: Sparse matrix with one row per text. Its column count must match
            the column count of ``self.centroids``.

    Returns:
        Array of shape ``(X.shape[0], self.centroids.shape[0])`` where
        entry ``[i, j]`` is the distance from row ``i`` to centroid ``j``.
    """
    # Imported here so this method carries its own dependency.
    from scipy.spatial.distance import cdist

    # cdist computes the pairwise distances directly. A broadcast
    # subtraction would first allocate a (rows x centroids x features)
    # array, which grows quickly with the number of texts.
    return cdist(X.toarray(), self.centroids, metric="euclidean")
def _calculate_confidence(self, distances: np.ndarray) -> float:
    """Convert one text's centroid distances into a confidence score.

    The score compares the nearest centroid with the farthest one: the
    smaller the nearest distance is relative to the farthest, the higher
    the confidence.

    Args:
        distances: Non-negative distances from a single text to each
            centroid.

    Returns:
        ``70.0`` when all distances are equal (including the single-centroid
        case), otherwise a value in the range ``(50, 100]`` rounded to one
        decimal place.
    """
    min_dist: float = float(np.min(distances))
    max_dist: float = float(np.max(distances))

    # No centroid stands out, so fall back to a fixed mid-range value.
    if max_dist == min_dist:
        return 70.0

    # Min-max normalising the distances and then taking the minimum always
    # gives 0, which would pin the score at 100. Use the ratio of nearest
    # to farthest distance instead. max_dist > min_dist >= 0 guarantees a
    # non-zero divisor and a ratio in [0, 1).
    relative_nearest: float = min_dist / max_dist

    # Invert and scale: ratio 0 -> 100, ratio approaching 1 -> 50.
    confidence: float = 100 - (relative_nearest * 50)
    return float(round(confidence, 1))
def _generate_explanation(self, text_vector: csr_matrix, cluster_idx: int) -> str:
    """Describe a classification by listing the text's strongest terms.

    Args:
        text_vector: Single-row sparse TF-IDF vector of the text.
        cluster_idx: Index of the assigned cluster. Accepted for interface
            compatibility; the explanation is built from the text's own
            term weights only.

    Returns:
        A sentence naming up to five terms with the highest non-zero
        weights, strongest first, or a fallback sentence when the vector
        has no positive weight.
    """
    weights: np.ndarray = text_vector.toarray()[0]

    # Positions of the (at most) five largest weights, largest first.
    strongest: np.ndarray = weights.argsort()[-5:][::-1]

    # Zero-weight positions are padding from argsort, not real terms.
    key_terms: List[str] = [
        self.feature_names[i] for i in strongest if weights[i] > 0
    ]
    if not key_terms:
        return "No significant features identified for this classification."
    return f"Classification based on key terms: {', '.join(key_terms)}"