File size: 1,641 Bytes
0f1938f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535a3a5
0f1938f
 
535a3a5
0f1938f
 
 
 
 
 
 
 
 
 
 
 
535a3a5
0f1938f
 
 
 
 
 
 
 
 
 
 
535a3a5
0f1938f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import random
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any, Optional
from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT


class BaseClassifier:
    """Common interface for text classifiers.

    Concrete classifiers derive from this class and override ``classify``.
    The base class itself holds no state.
    """

    def __init__(self) -> None:
        """Create the classifier; the base class has nothing to set up."""

    def classify(
        self,
        texts: List[str],
        categories: Optional[List[str]] = None,
    ) -> List[Dict[str, Any]]:
        """
        Assign each text to a category (abstract; must be overridden).

        Args:
            texts (list): Text strings to classify
            categories (list, optional): Candidate category names. When None,
                implementations are expected to detect categories themselves

        Returns:
            list: One result per classification, carrying the category,
                a confidence score, and an explanation

        Raises:
            NotImplementedError: Always, when called on the base class
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _generate_default_categories(
        self,
        texts: List[str],
        num_clusters: int = 5,
    ) -> List[str]:
        """
        Produce placeholder category names numbered from 1.

        Args:
            texts (list): Text strings (not inspected by this implementation)
            num_clusters (int): How many category names to produce

        Returns:
            list: Names of the form "Category 1" ... "Category <num_clusters>"
        """
        # Placeholder naming only: no clustering is performed here, the
        # labels are simply numbered in order.
        labels: List[str] = []
        for slot in range(num_clusters):
            labels.append(f"Category {slot + 1}")
        return labels