simondh committed
Commit 0f1938f · 1 Parent(s): 442b8d8

clean classifier

classifiers/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .tfidf import TFIDFClassifier
+ from .llm import LLMClassifier
+
+ __all__ = ['TFIDFClassifier', 'LLMClassifier']
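
With both classifiers re-exported here, the package surface can be smoke-tested with a minimal sketch (assuming the repository root is on the import path):

from classifiers import TFIDFClassifier, LLMClassifier

print(TFIDFClassifier.__name__, LLMClassifier.__name__)  # both names resolve via the package root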
classifiers/base.py ADDED
@@ -0,0 +1,33 @@
+ class BaseClassifier:
+     """Base class for text classifiers"""
+
+     def __init__(self):
+         pass
+
+     def classify(self, texts, categories=None):
+         """
+         Classify a list of texts into categories
+
+         Args:
+             texts (list): List of text strings to classify
+             categories (list, optional): List of category names. If None, categories will be auto-detected
+
+         Returns:
+             list: List of classification results with categories, confidence scores, and explanations
+         """
+         raise NotImplementedError("Subclasses must implement this method")
+
+     def _generate_default_categories(self, texts, num_clusters=5):
+         """
+         Generate default categories based on text clustering
+
+         Args:
+             texts (list): List of text strings
+             num_clusters (int): Number of clusters to generate
+
+         Returns:
+             list: List of category names
+         """
+         # Simple implementation - in a real system this would be more sophisticated
+         default_categories = [f"Category {i+1}" for i in range(num_clusters)]
+         return default_categories
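
For reference, a minimal sketch of how a new backend would plug into this base class; the KeywordClassifier name and its keyword map are hypothetical, not part of this commit:

class KeywordClassifier(BaseClassifier):
    """Hypothetical example backend: classify by keyword lookup."""

    def __init__(self, keyword_map):
        super().__init__()
        self.keyword_map = keyword_map  # e.g. {"billing": "Finance", "crash": "Bug report"}

    def classify(self, texts, categories=None):
        categories = categories or sorted(set(self.keyword_map.values()))
        results = []
        for text in texts:
            lowered = text.lower()
            # Pick the first category whose keyword appears in the text
            category = next(
                (cat for kw, cat in self.keyword_map.items() if kw in lowered),
                categories[0],
            )
            # Return the same result shape the base contract documents
            results.append(
                {
                    "category": category,
                    "confidence": 60,
                    "explanation": "Keyword match (illustrative only)",
                }
            )
        return results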
classifiers/llm.py ADDED
@@ -0,0 +1,129 @@
+ import json
+ import random
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import List, Dict, Any, Optional
+
+ from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
+
+ from .base import BaseClassifier
+
+
+ class LLMClassifier(BaseClassifier):
+     """Classifier using a Large Language Model for more accurate but slower classification"""
+
+     def __init__(self, client, model="gpt-3.5-turbo"):
+         super().__init__()
+         self.client = client
+         self.model = model
+
+     def classify(
+         self, texts: List[str], categories: Optional[List[str]] = None
+     ) -> List[Dict[str, Any]]:
+         """Classify texts using an LLM with parallel processing"""
+         if not categories:
+             # First, use the LLM to generate appropriate categories
+             categories = self._suggest_categories(texts)
+
+         # Process texts in parallel
+         with ThreadPoolExecutor(max_workers=10) as executor:
+             # Submit all tasks with their original indices
+             future_to_index = {
+                 executor.submit(self._classify_text, text, categories): idx
+                 for idx, text in enumerate(texts)
+             }
+
+             # Initialize results list with None values
+             results = [None] * len(texts)
+
+             # Collect results as they complete
+             for future in as_completed(future_to_index):
+                 original_idx = future_to_index[future]
+                 try:
+                     result = future.result()
+                     results[original_idx] = result
+                 except Exception as e:
+                     print(f"Error processing text: {str(e)}")
+                     results[original_idx] = {
+                         "category": categories[0],
+                         "confidence": 50,
+                         "explanation": f"Error during classification: {str(e)}",
+                     }
+
+         return results
+
+     def _suggest_categories(self, texts: List[str], sample_size: int = 20) -> List[str]:
+         """Use LLM to suggest appropriate categories for the dataset"""
+         # Take a sample of texts to avoid token limitations
+         if len(texts) > sample_size:
+             sample_texts = random.sample(texts, sample_size)
+         else:
+             sample_texts = texts
+
+         prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
+
+         try:
+             response = self.client.chat.completions.create(
+                 model=self.model,
+                 messages=[{"role": "user", "content": prompt}],
+                 temperature=0.2,
+                 max_tokens=100,
+             )
+
+             # Parse the comma-separated response into category names
+             categories_text = response.choices[0].message.content.strip()
+             categories = [cat.strip() for cat in categories_text.split(",")]
+
+             return categories
+         except Exception as e:
+             # Fall back to default categories on error
+             print(f"Error suggesting categories: {str(e)}")
+             return self._generate_default_categories(texts)
+
+     def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
+         """Use LLM to classify a single text"""
+         prompt = TEXT_CLASSIFICATION_PROMPT.format(
+             categories=", ".join(categories), text=text
+         )
+
+         try:
+             response = self.client.chat.completions.create(
+                 model=self.model,
+                 messages=[{"role": "user", "content": prompt}],
+                 temperature=0,
+                 max_tokens=200,
+             )
+
+             # Parse JSON response
+             response_text = response.choices[0].message.content.strip()
+
+             result = json.loads(response_text)
+             # Ensure all required fields are present
+             if not all(k in result for k in ["category", "confidence", "explanation"]):
+                 raise ValueError("Missing required fields in LLM response")
+
+             # Validate that the category is in the list; default to the first if not
+             if result["category"] not in categories:
+                 result["category"] = categories[0]
+
+             # Validate that confidence is a number between 0 and 100
+             try:
+                 result["confidence"] = float(result["confidence"])
+                 if not 0 <= result["confidence"] <= 100:
+                     result["confidence"] = 50
+             except (TypeError, ValueError):
+                 result["confidence"] = 50
+
+             return result
+         except json.JSONDecodeError:
+             # Fall back to simple keyword matching if JSON parsing fails
+             category = categories[0]  # Default
+             for cat in categories:
+                 if cat.lower() in response_text.lower():
+                     category = cat
+                     break
+
+             return {
+                 "category": category,
+                 "confidence": 50,
+                 "explanation": "Classification based on language model analysis. (Note: structured response parsing failed)",
+             }
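
The client passed to LLMClassifier only needs to expose client.chat.completions.create(...), which matches the OpenAI v1 SDK. A minimal usage sketch, assuming openai>=1.0 is installed and OPENAI_API_KEY is set; the sample texts are illustrative:

from openai import OpenAI

from classifiers import LLMClassifier

client = OpenAI()  # reads OPENAI_API_KEY from the environment
clf = LLMClassifier(client, model="gpt-3.5-turbo")

texts = ["The login page crashes on submit", "Please update my billing address"]
for item in clf.classify(texts):  # categories are auto-suggested when omitted
    print(item["category"], item["confidence"], item["explanation"])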
classifiers/tfidf.py ADDED
@@ -0,0 +1,99 @@
+ import numpy as np
+ from sklearn.cluster import KMeans
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+ from .base import BaseClassifier
+
+
+ class TFIDFClassifier(BaseClassifier):
+     """Classifier using TF-IDF and clustering for fast classification"""
+
+     def __init__(self):
+         super().__init__()
+         self.vectorizer = TfidfVectorizer(
+             max_features=1000, stop_words="english", ngram_range=(1, 2)
+         )
+         self.model = None
+         self.feature_names = None
+         self.categories = None
+         self.centroids = None
+
+     def classify(self, texts, categories=None):
+         """Classify texts using TF-IDF and clustering"""
+         # Vectorize the texts
+         X = self.vectorizer.fit_transform(texts)
+         self.feature_names = self.vectorizer.get_feature_names_out()
+
+         # Auto-detect categories if not provided
+         if not categories:
+             num_clusters = min(5, len(texts))  # Don't create more clusters than texts
+             self.categories = self._generate_default_categories(texts, num_clusters)
+         else:
+             self.categories = categories
+             num_clusters = len(categories)
+
+         # Cluster the texts
+         self.model = KMeans(n_clusters=num_clusters, random_state=42)
+         clusters = self.model.fit_predict(X)
+         self.centroids = self.model.cluster_centers_
+
+         # Calculate distances to centroids for confidence
+         distances = self._calculate_distances(X)
+
+         # Prepare results
+         results = []
+         for i, text in enumerate(texts):
+             cluster_idx = clusters[i]
+
+             # Calculate confidence (inverse of distance, normalized)
+             confidence = self._calculate_confidence(distances[i])
+
+             # Create explanation
+             explanation = self._generate_explanation(X[i], cluster_idx)
+
+             results.append(
+                 {
+                     "category": self.categories[cluster_idx],
+                     "confidence": confidence,
+                     "explanation": explanation,
+                 }
+             )
+
+         return results
+
+     def _calculate_distances(self, X):
+         """Calculate Euclidean distances from each point to each centroid"""
+         return np.sqrt(
+             (
+                 (X.toarray()[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]) ** 2
+             ).sum(axis=2)
+         )
+
+     def _calculate_confidence(self, distances):
+         """Convert distances to confidence scores (0-100)"""
+         min_dist = np.min(distances)
+         max_dist = np.max(distances)
+
+         # Smaller distance to the assigned centroid = higher confidence
+         if max_dist == min_dist:
+             return 70  # Default mid-range confidence when all distances are equal
+
+         # Invert the normalized distance and scale to the 50-100 range
+         # (TF-IDF is never 100% confident)
+         confidence = 100 - (min_dist / max_dist) * 50
+         return round(confidence, 1)
+
+     def _generate_explanation(self, text_vector, cluster_idx):
+         """Generate an explanation for the classification"""
+         # Get indices of the top features for this text
+         text_array = text_vector.toarray()[0]
+         top_indices = text_array.argsort()[-5:][::-1]
+
+         # Map those indices to feature names
+         top_features = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
+
+         if not top_features:
+             return "No significant features identified for this classification."
+
+         explanation = f"Classification based on key terms: {', '.join(top_features)}"
+         return explanation
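
A minimal usage sketch for the TF-IDF path; the sample texts are illustrative. Note that classify() fits and predicts in one call, and that when categories are supplied the cluster-to-category mapping is positional (cluster i gets the i-th name), not semantic:

from classifiers import TFIDFClassifier

texts = [
    "The app crashes when I upload a photo",
    "How do I change my subscription plan?",
    "I was charged twice this month",
    "Dark mode would be a great addition",
]

clf = TFIDFClassifier()
results = clf.classify(texts)  # auto-generates "Category 1".."Category 4" labels
for text, res in zip(texts, results):
    print(f"{res['category']:>12} ({res['confidence']}%) <- {text}")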