import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Any, Dict, List, Optional

from .base import BaseClassifier


class TFIDFClassifier(BaseClassifier):
    """Classifier using TF-IDF and clustering for fast classification"""

    def __init__(self) -> None:
        super().__init__()
        self.vectorizer: TfidfVectorizer = TfidfVectorizer(
            max_features=1000, stop_words="english", ngram_range=(1, 2)
        )
        self.model: Optional[KMeans] = None
        self.feature_names: Optional[np.ndarray] = None
        self.categories: Optional[List[str]] = None
        self.centroids: Optional[np.ndarray] = None

    def classify(self, texts: List[str], categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Classify texts using TF-IDF and clustering"""
        # Vectorize the texts
        X: csr_matrix = self.vectorizer.fit_transform(texts)
        self.feature_names = self.vectorizer.get_feature_names_out()

        # Auto-detect categories if not provided
        if not categories:
            num_clusters: int = min(5, len(texts))  # Don't create more clusters than texts
            self.categories = self._generate_default_categories(texts, num_clusters)
        else:
            # KMeans labels carry no semantic meaning, so cluster i is mapped
            # to categories[i] purely by position
            self.categories = categories
            num_clusters = len(categories)

        # Cluster the texts (explicit n_init pins behavior across scikit-learn
        # versions where the default changed)
        self.model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
        clusters: np.ndarray = self.model.fit_predict(X)
        self.centroids = self.model.cluster_centers_

        # Calculate distances to centroids for confidence
        distances: np.ndarray = self._calculate_distances(X)

        # Prepare results
        results: List[Dict[str, Any]] = []
        for i in range(len(texts)):
            cluster_idx: int = int(clusters[i])

            # Calculate confidence (inverse of distance, normalized)
            confidence: float = self._calculate_confidence(distances[i])

            # Create explanation
            explanation: str = self._generate_explanation(X[i], cluster_idx)

            results.append(
                {
                    "category": self.categories[cluster_idx],
                    "confidence": confidence,
                    "explanation": explanation,
                }
            )

        return results
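
    # Illustrative `classify` output (hypothetical values, not from a real run):
    #   [{"category": "sports", "confidence": 82.5,
    #     "explanation": "Classification based on key terms: team, championship"}]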

    def _calculate_distances(self, X: csr_matrix) -> np.ndarray:
        """Euclidean distance from each sample to each centroid,
        shape (n_samples, n_clusters)"""
        # Densifying is acceptable here: max_features caps dimensionality at 1000
        return np.sqrt(
            ((X.toarray()[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]) ** 2).sum(
                axis=2
            )
        )
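
    # Sparse-friendly alternative (a sketch): scikit-learn's KMeans.transform
    # returns the same Euclidean distances without densifying the input:
    #     distances = self.model.transform(X)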

    def _calculate_confidence(self, distances: np.ndarray) -> float:
        """Convert a sample's centroid distances to a confidence score (50-100)"""
        min_dist: float = float(np.min(distances))
        max_dist: float = float(np.max(distances))

        # All centroids equidistant: no basis to prefer one, return mid-range
        if max_dist == min_dist:
            return 70.0

        # Normalize the nearest distance against the farthest and invert
        # (smaller distance = higher confidence)
        normalized_min: float = min_dist / max_dist

        # Scale into the 50-100 band; TF-IDF is never 100% confident
        confidence: float = 100 - normalized_min * 50
        return round(confidence, 1)
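
    # Worked example with hypothetical distances [0.4, 0.8, 1.0]:
    #   min_dist = 0.4, max_dist = 1.0 -> normalized_min = 0.4
    #   confidence = 100 - 0.4 * 50 = 80.0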

    def _generate_explanation(self, text_vector: csr_matrix, cluster_idx: int) -> str:
        """Generate an explanation for the classification"""
        # cluster_idx is kept for interface symmetry; the explanation is built
        # from the text's own strongest TF-IDF terms
        text_array: np.ndarray = text_vector.toarray()[0]
        top_indices: np.ndarray = text_array.argsort()[-5:][::-1]

        # Map indices back to vocabulary terms, dropping zero-weight entries
        top_features: List[str] = [
            self.feature_names[i] for i in top_indices if text_array[i] > 0
        ]

        if not top_features:
            return "No significant features identified for this classification."

        return f"Classification based on key terms: {', '.join(top_features)}"
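

# Minimal usage sketch; because of the relative import above, run it as a
# module (e.g. `python -m <package>.tfidf` -- the module path, sample texts,
# and category names are all illustrative assumptions):
if __name__ == "__main__":
    sample_texts = [
        "The quarterly earnings beat analyst expectations",
        "The home team won the championship in overtime",
        "The new GPU doubles inference throughput",
    ]
    clf = TFIDFClassifier()
    for res in clf.classify(sample_texts, categories=["finance", "sports", "tech"]):
        print(f"{res['category']:>8} ({res['confidence']:5.1f}): {res['explanation']}")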