simondh committed
Commit 1bc76b5 · 0 Parent(s)

first commit

Files changed (6)
  1. .gitignore +2 -0
  2. app.py +586 -0
  3. classifiers.py +256 -0
  4. examples/sample_reviews.csv +11 -0
  5. requirements.txt +9 -0
  6. utils.py +188 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ *.pyc
app.py ADDED
@@ -0,0 +1,586 @@
+ import os
+ import json
+ import time
+ import traceback
+ import logging
+
+ import gradio as gr
+ from litellm import OpenAI
+
+ # Import local modules
+ from classifiers import TFIDFClassifier, LLMClassifier
+ from utils import load_data, visualize_results, validate_results
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+ )
+
+ # Initialize the API key from the environment variable
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
+
+ # Only initialize the client if an API key is available
+ client = None
+ if OPENAI_API_KEY:
+     try:
+         client = OpenAI(api_key=OPENAI_API_KEY)
+         logging.info("OpenAI client initialized successfully")
+     except Exception as e:
+         logging.error(f"Failed to initialize OpenAI client: {str(e)}")
+
+
+ def update_api_key(api_key):
+     """Update the OpenAI API key"""
+     global OPENAI_API_KEY, client
+
+     if not api_key:
+         return "API Key cannot be empty"
+
+     OPENAI_API_KEY = api_key
+
+     try:
+         client = OpenAI(api_key=api_key)
+         # Test the connection with a simple request
+         client.chat.completions.create(
+             model="gpt-3.5-turbo",
+             messages=[{"role": "user", "content": "test"}],
+             max_tokens=5,
+         )
+         return "API Key updated and verified successfully"
+     except Exception as e:
+         error_msg = str(e)
+         logging.error(f"API key update failed: {error_msg}")
+         return f"Failed to update API Key: {error_msg}"
+
+
+ def process_file(file, text_columns, categories, classifier_type, show_explanations):
+     """Process the uploaded file and classify text data"""
+     try:
+         # Load data from the file
+         if isinstance(file, str):
+             df = load_data(file)
+         else:
+             df = load_data(file.name)
+
+         if not text_columns:
+             return None, "Please select at least one text column"
+
+         # Check that all selected columns exist
+         missing_columns = [col for col in text_columns if col not in df.columns]
+         if missing_columns:
+             return None, (
+                 f"Columns not found in the file: {', '.join(missing_columns)}. "
+                 f"Available columns: {', '.join(df.columns)}"
+             )
+
+         # Combine text from the selected columns
+         texts = []
+         for _, row in df.iterrows():
+             combined_text = " ".join(str(row[col]) for col in text_columns)
+             texts.append(combined_text)
+
+         # Parse categories if provided
+         category_list = []
+         if categories:
+             category_list = [cat.strip() for cat in categories.split(",")]
+
+         # Select the classifier based on data size and user choice
+         num_texts = len(texts)
+
+         # If no specific model is chosen, select the most appropriate one
+         if classifier_type == "auto":
+             if num_texts <= 500:
+                 classifier_type = "gpt4"
+             elif num_texts <= 1000:
+                 classifier_type = "gpt35"
+             elif num_texts <= 5000:
+                 classifier_type = "hybrid"
+             else:
+                 classifier_type = "tfidf"
+
+         # All LLM-backed modes need an initialized client
+         if classifier_type in ("gpt35", "gpt4", "hybrid") and client is None:
+             return None, "Error: The API client is not initialized. Please configure a valid API key in the 'Setup' tab."
+
+         # Initialize the appropriate classifier
+         if classifier_type == "tfidf":
+             classifier = TFIDFClassifier()
+             results = classifier.classify(texts, category_list)
+         elif classifier_type == "gpt35":
+             classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
+             results = classifier.classify(texts, category_list)
+         elif classifier_type == "gpt4":
+             classifier = LLMClassifier(client=client, model="gpt-4")
+             results = classifier.classify(texts, category_list)
+         else:  # hybrid
+             # First pass with TF-IDF
+             tfidf_classifier = TFIDFClassifier()
+             tfidf_results = tfidf_classifier.classify(texts, category_list)
+
+             # Second pass with the LLM for low-confidence results
+             llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
+             results = []
+             for text, tfidf_result in zip(texts, tfidf_results):
+                 if tfidf_result["confidence"] < 70:  # Confidence below 70%
+                     llm_result = llm_classifier.classify([text], category_list)[0]
+                     results.append(llm_result)
+                 else:
+                     results.append(tfidf_result)
+
+         # Create the results dataframe
+         result_df = df.copy()
+         result_df["Category"] = [r["category"] for r in results]
+         result_df["Confidence"] = [r["confidence"] for r in results]
+
+         if show_explanations:
+             result_df["Explanation"] = [r["explanation"] for r in results]
+
+         # Validate results using the LLM
+         validation_report = validate_results(result_df, text_columns, client)
+
+         return result_df, validation_report
+
+     except Exception as e:
+         error_traceback = traceback.format_exc()
+         return None, f"Error: {str(e)}\n{error_traceback}"
+
+
+ def export_results(df, format_type):
+     """Export results to a file and return the file path for download"""
+     if df is None:
+         return None
+
+     # Create a temporary export directory if it doesn't exist
+     temp_dir = "temp_exports"
+     os.makedirs(temp_dir, exist_ok=True)
+
+     # Generate a unique filename
+     timestamp = time.strftime("%Y%m%d-%H%M%S")
+     filename = f"classification_results_{timestamp}"
+
+     if format_type == "excel":
+         file_path = os.path.join(temp_dir, f"{filename}.xlsx")
+         df.to_excel(file_path, index=False)
+     else:
+         file_path = os.path.join(temp_dir, f"{filename}.csv")
+         df.to_csv(file_path, index=False)
+
+     return file_path
+
+
+ # Create the Gradio interface
+ with gr.Blocks(title="Text Classification System") as demo:
+     gr.Markdown("# Text Classification System")
+     gr.Markdown("Upload your data file (Excel/CSV) and classify text using AI")
+
+     with gr.Tab("Setup"):
+         api_key_input = gr.Textbox(
+             label="OpenAI API Key",
+             placeholder="Enter your API key here",
+             type="password",
+             value=OPENAI_API_KEY,
+         )
+         api_key_button = gr.Button("Update API Key")
+         api_key_message = gr.Textbox(label="Status", interactive=False)
+
+         # Display the current API status
+         api_status = "API Key is set" if OPENAI_API_KEY else "No API Key found. Please set one."
+         gr.Markdown(f"**Current API Status**: {api_status}")
+
+         api_key_button.click(update_api_key, inputs=[api_key_input], outputs=[api_key_message])
+
+     with gr.Tab("Classify Data"):
+         with gr.Column():
+             file_input = gr.File(label="Upload Excel/CSV File")
+
+             # State holding the available columns
+             available_columns = gr.State([])
+
+             # Button to load the file and suggest categories
+             load_categories_button = gr.Button("Load File")
+
+             # Display the original dataframe
+             original_df = gr.Dataframe(
+                 label="Original Data",
+                 interactive=False,
+                 visible=False,
+             )
+
+             with gr.Row():
+                 with gr.Column():
+                     suggested_categories = gr.CheckboxGroup(
+                         label="Suggested Categories",
+                         choices=[],
+                         value=[],
+                         interactive=True,
+                         visible=False,
+                     )
+
+                     new_category = gr.Textbox(
+                         label="Add New Category",
+                         placeholder="Enter a new category name",
+                         visible=False,
+                     )
+                     with gr.Row():
+                         add_category_button = gr.Button("Add Category", visible=False)
+                         suggest_category_button = gr.Button("Suggest Category", visible=False)
+
+                     # Original categories input (hidden)
+                     categories = gr.Textbox(visible=False)
+
+                 with gr.Column():
+                     text_column = gr.CheckboxGroup(
+                         label="Select Text Columns",
+                         choices=[],
+                         interactive=True,
+                         visible=False,
+                     )
+
+                     classifier_type = gr.Dropdown(
+                         choices=[
+                             ("TF-IDF (fast, <1,000 rows)", "tfidf"),
+                             ("LLM GPT-3.5 (reliable, <1,000 rows)", "gpt35"),
+                             ("LLM GPT-4 (very reliable, <500 rows)", "gpt4"),
+                             ("TF-IDF + LLM (hybrid, >1,000 rows)", "hybrid"),
+                         ],
+                         label="Classification model",
+                         value="tfidf",
+                         visible=False,
+                     )
+                     show_explanations = gr.Checkbox(label="Show Explanations", value=True, visible=False)
+
+                     process_button = gr.Button("Process and Classify", visible=False)
+
+             results_df = gr.Dataframe(interactive=True, visible=False)
+
+             # Containers for the visualization and the validation report
+             with gr.Row(visible=False) as results_row:
+                 with gr.Column():
+                     visualization = gr.Plot(label="Classification Distribution")
+                     with gr.Row():
+                         csv_download = gr.File(label="Download CSV", visible=False)
+                         excel_download = gr.File(label="Download Excel", visible=False)
+                 with gr.Column():
+                     validation_output = gr.Textbox(label="Validation Report", interactive=False)
+                     improve_button = gr.Button("Improve Classification with Report", visible=False)
+
+     # Load the file and suggest categories
+     def load_file_and_suggest_categories(file):
+         # Components reset to their hidden state, returned when loading fails
+         hidden = (
+             [],
+             gr.CheckboxGroup(choices=[]),
+             gr.CheckboxGroup(choices=[], visible=False),
+             gr.Textbox(visible=False),
+             gr.Button(visible=False),
+             gr.Button(visible=False),
+             gr.CheckboxGroup(choices=[], visible=False),
+             gr.Dropdown(visible=False),
+             gr.Checkbox(visible=False),
+             gr.Button(visible=False),
+             gr.Dataframe(visible=False),
+         )
+         if not file:
+             return hidden
+         try:
+             df = load_data(file.name)
+             columns = list(df.columns)
+
+             # Analyze columns to suggest text columns
+             suggested_text_columns = []
+             for col in columns:
+                 # Only consider string columns
+                 if df[col].dtype == "object":
+                     # Check that the column contains mostly text (not just numbers or dates)
+                     sample = df[col].head(100).dropna()
+                     if len(sample) > 0:
+                         # Values containing spaces are a reasonable proxy for free text
+                         text_ratio = sum(" " in str(val) for val in sample) / len(sample)
+                         if text_ratio > 0.3:  # More than 30% of values contain spaces
+                             suggested_text_columns.append(col)
+
+             # If no columns were suggested, fall back to all object columns
+             if not suggested_text_columns:
+                 suggested_text_columns = [col for col in columns if df[col].dtype == "object"]
+
+             # Get a sample of text for category suggestion
+             sample_texts = []
+             for col in suggested_text_columns:
+                 sample_texts.extend(df[col].head(5).tolist())
+
+             # Use the LLM to suggest categories
+             if client:
+                 prompt = f"""
+                 Based on these example texts, suggest 5 appropriate categories for classification:
+
+                 {sample_texts[:5]}
+
+                 Return your answer as a comma-separated list of category names only.
+                 """
+                 try:
+                     response = client.chat.completions.create(
+                         model="gpt-3.5-turbo",
+                         messages=[{"role": "user", "content": prompt}],
+                         temperature=0.2,
+                         max_tokens=100,
+                     )
+                     suggested_cats = [cat.strip() for cat in response.choices[0].message.content.strip().split(",")]
+                 except Exception:
+                     suggested_cats = ["Positive", "Negative", "Neutral", "Mixed", "Other"]
+             else:
+                 suggested_cats = ["Positive", "Negative", "Neutral", "Mixed", "Other"]
+
+             return (
+                 columns,
+                 gr.CheckboxGroup(choices=columns, value=suggested_text_columns),
+                 gr.CheckboxGroup(choices=suggested_cats, value=suggested_cats, visible=True),
+                 gr.Textbox(visible=True),
+                 gr.Button(visible=True),
+                 gr.Button(visible=True),
+                 gr.CheckboxGroup(choices=columns, value=suggested_text_columns, visible=True),
+                 gr.Dropdown(visible=True),
+                 gr.Checkbox(visible=True),
+                 gr.Button(visible=True),
+                 gr.Dataframe(value=df, visible=True),
+             )
+         except Exception:
+             return hidden
+
+     # Add a new category to the checkbox group
+     def add_new_category(current_categories, new_category):
+         if not new_category or new_category.strip() == "":
+             return current_categories
+         new_categories = current_categories + [new_category.strip()]
+         return gr.CheckboxGroup(choices=new_categories, value=new_categories)
+
+     # Keep the hidden categories textbox in sync with the checkbox group
+     def update_categories_textbox(selected_categories):
+         return ", ".join(selected_categories)
+
+     # Show results after processing
+     def show_results(df, validation_report):
+         if df is None:
+             return gr.Row(visible=False), gr.File(visible=False), gr.File(visible=False), gr.Dataframe(visible=False)
+
+         # Export to both formats
+         csv_path = export_results(df, "csv")
+         excel_path = export_results(df, "excel")
+
+         return (
+             gr.Row(visible=True),
+             gr.File(value=csv_path, visible=True),
+             gr.File(value=excel_path, visible=True),
+             gr.Dataframe(value=df, visible=True),
+         )
+
+     # Suggest one additional category
+     def suggest_new_category(file, current_categories, text_columns):
+         if not file or not text_columns:
+             return gr.CheckboxGroup(choices=current_categories, value=current_categories)
+
+         try:
+             df = load_data(file.name)
+
+             # Get sample texts from the selected columns
+             sample_texts = []
+             for col in text_columns:
+                 sample_texts.extend(df[col].head(5).tolist())
+
+             if client:
+                 prompt = f"""
+                 Based on these example texts and the existing categories ({', '.join(current_categories)}),
+                 suggest one additional appropriate category for classification.
+
+                 Example texts:
+                 {sample_texts[:5]}
+
+                 Return only the suggested category name, nothing else.
+                 """
+                 try:
+                     response = client.chat.completions.create(
+                         model="gpt-3.5-turbo",
+                         messages=[{"role": "user", "content": prompt}],
+                         temperature=0.2,
+                         max_tokens=50,
+                     )
+                     new_cat = response.choices[0].message.content.strip()
+                     if new_cat and new_cat not in current_categories:
+                         current_categories.append(new_cat)
+                 except Exception:
+                     pass
+
+             return gr.CheckboxGroup(choices=current_categories, value=current_categories)
+         except Exception:
+             return gr.CheckboxGroup(choices=current_categories, value=current_categories)
+
+     # Export and reveal the download button (currently not wired to the UI)
+     def handle_export(df, format_type):
+         if df is None:
+             return gr.File(visible=False)
+         file_path = export_results(df, format_type)
+         return gr.File(value=file_path, visible=True)
+
+     # Improve classification based on the validation report
+     def improve_classification(df, validation_report, text_columns, categories, classifier_type, show_explanations, file):
+         """Improve classification based on the validation report"""
+         if df is None or not validation_report:
+             return df, validation_report, gr.Button(visible=False), gr.CheckboxGroup(choices=[], value=[])
+
+         # Parse the current categories up front so error paths can reuse them
+         current_categories = [cat.strip() for cat in categories.split(",")] if categories else []
+
+         try:
+             # Extract insights from the validation report
+             if client:
+                 prompt = f"""
+                 Based on this validation report, analyze the current classification and suggest improvements:
+
+                 {validation_report}
+
+                 Return your answer in JSON format with these fields:
+                 - suggested_categories: list of improved category names (must be different from current categories: {categories})
+                 - confidence_threshold: a number between 0 and 100 for minimum confidence
+                 - focus_areas: list of specific aspects to focus on during classification
+                 - analysis: a brief analysis of what needs improvement
+                 - new_categories_needed: boolean indicating if new categories should be added
+
+                 JSON response:
+                 """
+                 try:
+                     response = client.chat.completions.create(
+                         model="gpt-4",
+                         messages=[{"role": "user", "content": prompt}],
+                         temperature=0.2,
+                         max_tokens=300,
+                     )
+                     improvements = json.loads(response.choices[0].message.content.strip())
+
+                     # If new categories are needed, suggest them based on the data
+                     if improvements.get("new_categories_needed", False):
+                         # Get sample texts for category suggestion
+                         temp_df = load_data(file) if isinstance(file, str) else load_data(file.name)
+                         sample_texts = []
+                         for col in text_columns:
+                             sample_texts.extend(temp_df[col].head(5).tolist())
+
+                         category_prompt = f"""
+                         Based on these example texts and the current categories ({', '.join(current_categories)}),
+                         suggest new categories that would improve the classification. The validation report indicates:
+                         {improvements.get('analysis', '')}
+
+                         Example texts:
+                         {sample_texts[:5]}
+
+                         Return your answer as a comma-separated list of new category names only.
+                         """
+
+                         category_response = client.chat.completions.create(
+                             model="gpt-4",
+                             messages=[{"role": "user", "content": category_prompt}],
+                             temperature=0.2,
+                             max_tokens=100,
+                         )
+
+                         new_categories = [cat.strip() for cat in category_response.choices[0].message.content.strip().split(",")]
+                         # Combine the current and new categories
+                         current_categories = current_categories + new_categories
+                         categories = ",".join(current_categories)
+
+                     # Re-process with the improved parameters
+                     improved_df, new_validation = process_file(
+                         file,
+                         text_columns,
+                         categories,
+                         classifier_type,
+                         show_explanations,
+                     )
+
+                     return improved_df, new_validation, gr.Button(visible=True), gr.CheckboxGroup(choices=current_categories, value=current_categories)
+                 except Exception as e:
+                     print(f"Error in improvement process: {str(e)}")
+                     return df, validation_report, gr.Button(visible=True), gr.CheckboxGroup(choices=current_categories, value=current_categories)
+             else:
+                 return df, validation_report, gr.Button(visible=True), gr.CheckboxGroup(choices=current_categories, value=current_categories)
+         except Exception as e:
+             print(f"Error in improvement process: {str(e)}")
+             return df, validation_report, gr.Button(visible=True), gr.CheckboxGroup(choices=current_categories, value=current_categories)
+
+     # Connect functions
+     load_categories_button.click(
+         load_file_and_suggest_categories,
+         inputs=[file_input],
+         outputs=[
+             available_columns,
+             text_column,
+             suggested_categories,
+             new_category,
+             add_category_button,
+             suggest_category_button,
+             text_column,
+             classifier_type,
+             show_explanations,
+             process_button,
+             original_df,
+         ],
+     )
+
+     add_category_button.click(
+         add_new_category,
+         inputs=[suggested_categories, new_category],
+         outputs=[suggested_categories],
+     )
+
+     suggested_categories.change(
+         update_categories_textbox,
+         inputs=[suggested_categories],
+         outputs=[categories],
+     )
+
+     suggest_category_button.click(
+         suggest_new_category,
+         inputs=[file_input, suggested_categories, text_column],
+         outputs=[suggested_categories],
+     )
+
+     process_button.click(
+         process_file,
+         inputs=[file_input, text_column, categories, classifier_type, show_explanations],
+         outputs=[results_df, validation_output],
+     ).then(
+         show_results,
+         inputs=[results_df, validation_output],
+         outputs=[results_row, csv_download, excel_download, results_df],
+     ).then(
+         visualize_results,
+         inputs=[results_df, text_column],
+         outputs=[visualization],
+     ).then(
+         lambda: gr.Button(visible=True),
+         inputs=[],
+         outputs=[improve_button],
+     )
+
+     improve_button.click(
+         improve_classification,
+         inputs=[results_df, validation_output, text_column, categories, classifier_type, show_explanations, file_input],
+         outputs=[results_df, validation_output, improve_button, suggested_categories],
+     ).then(
+         show_results,
+         inputs=[results_df, validation_output],
+         outputs=[results_row, csv_download, excel_download, results_df],
+     ).then(
+         visualize_results,
+         inputs=[results_df, text_column],
+         outputs=[visualization],
+     )
+
+
+ def create_example_data():
+     """Create example data for demonstration"""
+     from utils import create_example_file
+
+     example_path = create_example_file()
+     return f"Example file created at: {example_path}"
+
+
+ if __name__ == "__main__":
+     # Create the examples directory and sample file if they don't exist
+     if not os.path.exists("examples"):
+         create_example_data()
+
+     # Launch the Gradio app
+     demo.launch()
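
For a quick check without the UI, process_file can be called directly on the bundled example data. A minimal sketch (not part of this commit), using the "tfidf" mode so no API key is needed; with no client configured, the validation step simply returns a failure string:

from app import process_file
from utils import create_example_file

path = create_example_file()  # writes examples/sample_reviews.csv
result_df, report = process_file(
    file=path,  # process_file also accepts a plain path
    text_columns=["text"],
    categories="Positive, Negative, Neutral",
    classifier_type="tfidf",  # runs locally, no API key required
    show_explanations=True,
)
print(result_df[["text", "Category", "Confidence"]])
print(report)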
classifiers.py ADDED
@@ -0,0 +1,256 @@
+ import json
+ import random
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+ class BaseClassifier:
+     """Base class for text classifiers"""
+
+     def __init__(self):
+         pass
+
+     def classify(self, texts, categories=None):
+         """
+         Classify a list of texts into categories
+
+         Args:
+             texts (list): List of text strings to classify
+             categories (list, optional): List of category names. If None, categories are auto-detected
+
+         Returns:
+             list: Classification results with categories, confidence scores, and explanations
+         """
+         raise NotImplementedError("Subclasses must implement this method")
+
+     def _generate_default_categories(self, texts, num_clusters=5):
+         """
+         Generate default categories based on text clustering
+
+         Args:
+             texts (list): List of text strings
+             num_clusters (int): Number of clusters to generate
+
+         Returns:
+             list: List of category names
+         """
+         # Simple implementation - a real system would name clusters from their contents
+         default_categories = [f"Category {i + 1}" for i in range(num_clusters)]
+         return default_categories
+
+
+ class TFIDFClassifier(BaseClassifier):
+     """Classifier using TF-IDF and clustering for fast classification"""
+
+     def __init__(self):
+         super().__init__()
+         self.vectorizer = TfidfVectorizer(
+             max_features=1000,
+             stop_words="english",
+             ngram_range=(1, 2),
+         )
+         self.model = None
+         self.feature_names = None
+         self.categories = None
+         self.centroids = None
+
+     def classify(self, texts, categories=None):
+         """Classify texts using TF-IDF and clustering"""
+         # Vectorize the texts
+         X = self.vectorizer.fit_transform(texts)
+         self.feature_names = self.vectorizer.get_feature_names_out()
+
+         # Auto-detect categories if not provided
+         if not categories:
+             num_clusters = min(5, len(texts))  # Don't create more clusters than texts
+             self.categories = self._generate_default_categories(texts, num_clusters)
+         else:
+             self.categories = categories
+             num_clusters = len(categories)
+
+         # Cluster the texts
+         self.model = KMeans(n_clusters=num_clusters, random_state=42)
+         clusters = self.model.fit_predict(X)
+         self.centroids = self.model.cluster_centers_
+
+         # Calculate distances to centroids for confidence
+         distances = self._calculate_distances(X)
+
+         # Prepare results
+         results = []
+         for i, text in enumerate(texts):
+             cluster_idx = clusters[i]
+
+             # Calculate confidence (inverse of distance, normalized)
+             confidence = self._calculate_confidence(distances[i])
+
+             # Create an explanation
+             explanation = self._generate_explanation(X[i], cluster_idx)
+
+             results.append({
+                 "category": self.categories[cluster_idx],
+                 "confidence": confidence,
+                 "explanation": explanation,
+             })
+
+         return results
+
+     def _calculate_distances(self, X):
+         """Calculate the Euclidean distance from each point to each centroid"""
+         return np.sqrt(((X.toarray()[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]) ** 2).sum(axis=2))
+
+     def _calculate_confidence(self, distances):
+         """Convert a row of centroid distances to a confidence score (50-100)"""
+         min_dist = np.min(distances)
+         max_dist = np.max(distances)
+
+         if max_dist == min_dist:
+             return 70  # Default mid-range confidence when all distances are equal
+
+         # Ratio of the nearest to the farthest centroid distance, in [0, 1)
+         normalized_min = min_dist / max_dist
+
+         # Invert and scale to the 50-100 range (TF-IDF is never 100% confident)
+         confidence = 100 - (normalized_min * 50)
+         return round(confidence, 1)
+
+     def _generate_explanation(self, text_vector, cluster_idx):
+         """Generate an explanation for the classification"""
+         # Get indices of the top features for this text
+         text_array = text_vector.toarray()[0]
+         top_indices = text_array.argsort()[-5:][::-1]
+
+         # Get the feature names for these indices
+         top_features = [self.feature_names[i] for i in top_indices if text_array[i] > 0]
+
+         if not top_features:
+             return "No significant features identified for this classification."
+
+         return f"Classification based on key terms: {', '.join(top_features)}"
+
+
+ class LLMClassifier(BaseClassifier):
+     """Classifier using a Large Language Model for more accurate but slower classification"""
+
+     def __init__(self, client, model="gpt-3.5-turbo"):
+         super().__init__()
+         self.client = client
+         self.model = model
+
+     def classify(self, texts, categories=None):
+         """Classify texts using an LLM"""
+         if not categories:
+             # First, use the LLM to generate appropriate categories
+             categories = self._suggest_categories(texts)
+
+         results = []
+         for text in texts:
+             # Classify each text individually
+             result = self._classify_text(text, categories)
+             results.append(result)
+
+         return results
+
+     def _suggest_categories(self, texts, sample_size=20):
+         """Use the LLM to suggest appropriate categories for the dataset"""
+         # Take a sample of texts to avoid token limitations
+         if len(texts) > sample_size:
+             sample_texts = random.sample(texts, sample_size)
+         else:
+             sample_texts = texts
+
+         prompt = """
+         I have a collection of texts that I need to classify into categories. Here are some examples:
+
+         {}
+
+         Based on these examples, suggest 2 to 5 appropriate categories for classification.
+         Return your answer as a comma-separated list of category names only.
+         """.format("\n---\n".join(sample_texts))
+
+         try:
+             response = self.client.chat.completions.create(
+                 model=self.model,
+                 messages=[{"role": "user", "content": prompt}],
+                 temperature=0.2,
+                 max_tokens=100,
+             )
+
+             # Parse the response to get categories
+             categories_text = response.choices[0].message.content.strip()
+             categories = [cat.strip() for cat in categories_text.split(",")]
+
+             return categories
+         except Exception as e:
+             # Fall back to default categories on error
+             print(f"Error suggesting categories: {str(e)}")
+             return self._generate_default_categories(texts)
+
+     def _classify_text(self, text, categories):
+         """Use the LLM to classify a single text"""
+         categories_str = ", ".join(categories)
+
+         prompt = f"""
+         Classify the following text into one of these categories: {categories_str}
+
+         Text: {text}
+
+         Return your answer in JSON format with these fields:
+         - category: the chosen category from the list
+         - confidence: a value between 0 and 100 indicating your confidence in this classification (as a percentage)
+         - explanation: a brief explanation of why this category was chosen (1-2 sentences)
+
+         JSON response:
+         """
+
+         response = self.client.chat.completions.create(
+             model=self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0,
+             max_tokens=200,
+         )
+         response_text = response.choices[0].message.content.strip()
+
+         try:
+             result = json.loads(response_text)
+
+             # Ensure all required fields are present
+             if not all(k in result for k in ["category", "confidence", "explanation"]):
+                 raise ValueError("Missing required fields in LLM response")
+
+             # Validate that the category is in the list
+             if result["category"] not in categories:
+                 result["category"] = categories[0]  # Default to the first category if invalid
+
+             # Validate that the confidence is a number between 0 and 100
+             try:
+                 result["confidence"] = float(result["confidence"])
+                 if not 0 <= result["confidence"] <= 100:
+                     result["confidence"] = 50
+             except (TypeError, ValueError):
+                 result["confidence"] = 50
+
+             return result
+         except (json.JSONDecodeError, ValueError):
+             # Fall back to simple keyword matching if structured parsing fails
+             category = categories[0]  # Default
+             for cat in categories:
+                 if cat.lower() in response_text.lower():
+                     category = cat
+                     break
+
+             return {
+                 "category": category,
+                 "confidence": 50,
+                 "explanation": "Classification based on language model analysis. (Note: structured response parsing failed)",
+             }
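
The classifiers are also usable on their own. A minimal sketch of TFIDFClassifier (no API key required); note that with TF-IDF the category names are assigned to clusters positionally rather than semantically, so the labels are only meaningful when the categories roughly match the natural clusters in the data:

from classifiers import TFIDFClassifier

texts = [
    "Great product, fast shipping",
    "Awful support, very rude staff",
    "Average quality for the price",
]

clf = TFIDFClassifier()
for result in clf.classify(texts, categories=["Positive", "Negative", "Neutral"]):
    print(result["category"], result["confidence"], "-", result["explanation"])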
examples/sample_reviews.csv ADDED
@@ -0,0 +1,11 @@
+ text
+ "I absolutely love this product! It exceeded all my expectations."
+ "The service was terrible and the staff was rude."
+ "The product arrived on time but was slightly damaged."
+ "I have mixed feelings about this. Some features are great, others not so much."
+ "This is a complete waste of money. Do not buy!"
+ "The customer service team was very helpful in resolving my issue."
+ "It's okay, nothing special but gets the job done."
+ "I'm extremely disappointed with the quality of this product."
+ "This is the best purchase I've made all year!"
+ "It's reasonably priced and works as expected."
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio>=4.0.0
+ litellm>=1.10.0
+ pandas>=2.0.0
+ numpy>=1.24.0
+ scikit-learn>=1.2.0
+ openpyxl>=3.1.0
+ torch>=2.0.0
+ transformers>=4.30.0
+ matplotlib>=3.7.0
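
Setup note: `pip install -r requirements.txt` installs everything above; the app reads the key from the OPENAI_API_KEY environment variable (or via the Setup tab) and starts with `python app.py`. torch and transformers are listed but not yet imported by any module in this commit.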
utils.py ADDED
@@ -0,0 +1,188 @@
+ import os
+
+ import matplotlib.pyplot as plt
+ import pandas as pd
+
+
+ def load_data(file_path):
+     """
+     Load data from an Excel or CSV file
+
+     Args:
+         file_path (str): Path to the file
+
+     Returns:
+         pd.DataFrame: Loaded data
+     """
+     file_ext = os.path.splitext(file_path)[1].lower()
+
+     if file_ext in ('.xlsx', '.xls'):
+         return pd.read_excel(file_path)
+     elif file_ext == '.csv':
+         return pd.read_csv(file_path)
+     else:
+         raise ValueError(f"Unsupported file format: {file_ext}. Please upload an Excel or CSV file.")
+
+
+ def export_data(df, file_name, format_type="excel"):
+     """
+     Export a dataframe to a file
+
+     Args:
+         df (pd.DataFrame): Dataframe to export
+         file_name (str): Name of the output file
+         format_type (str): "excel" or "csv"
+
+     Returns:
+         str: Path to the exported file
+     """
+     # Create the export directory if it doesn't exist
+     export_dir = "exports"
+     os.makedirs(export_dir, exist_ok=True)
+
+     # Full path for the export file
+     export_path = os.path.join(export_dir, file_name)
+
+     # Export based on the format type
+     if format_type == "excel":
+         df.to_excel(export_path, index=False)
+     else:
+         df.to_csv(export_path, index=False)
+
+     return export_path
+
+
+ def visualize_results(df, text_column, category_column="Category"):
+     """
+     Create a visualization of the classification results
+
+     Args:
+         df (pd.DataFrame): Dataframe with classification results
+         text_column (str): Name of the column containing text data
+         category_column (str): Name of the column containing categories
+
+     Returns:
+         matplotlib.figure.Figure: Visualization figure
+     """
+     # Get the categories and their counts
+     category_counts = df[category_column].value_counts()
+
+     # Create a new figure
+     fig, ax = plt.subplots(figsize=(10, 6))
+
+     # Create the bar chart
+     bars = ax.bar(category_counts.index, category_counts.values)
+
+     # Add value labels on top of each bar
+     for bar in bars:
+         height = bar.get_height()
+         ax.text(bar.get_x() + bar.get_width() / 2., height,
+                 f'{int(height)}',
+                 ha='center', va='bottom')
+
+     # Customize the plot
+     ax.set_xlabel('Categories')
+     ax.set_ylabel('Number of Texts')
+     ax.set_title('Distribution of Classified Texts')
+
+     # Rotate x-axis labels in case they are long
+     plt.xticks(rotation=45, ha='right')
+
+     # Add a grid
+     ax.grid(True, linestyle='--', alpha=0.7)
+
+     plt.tight_layout()
+
+     return fig
+
+
+ def validate_results(df, text_columns, client):
+     """
+     Use an LLM to validate the classification results
+
+     Args:
+         df (pd.DataFrame): Dataframe with classification results
+         text_columns (list): List of column names containing text data
+         client: OpenAI-compatible client (from litellm)
+
+     Returns:
+         str: Validation report
+     """
+     try:
+         # Sample a few rows for validation
+         sample_size = min(5, len(df))
+         sample_df = df.sample(n=sample_size, random_state=42)
+
+         # Build the validation prompt
+         validation_prompts = []
+         for _, row in sample_df.iterrows():
+             # Combine text from all selected columns
+             text = " ".join(str(row[col]) for col in text_columns)
+             assigned_category = row["Category"]
+             confidence = row["Confidence"]
+
+             validation_prompts.append(
+                 f"Text: {text}\nAssigned Category: {assigned_category}\nConfidence: {confidence}\n"
+             )
+
+         prompt = """
+         As a validation expert, review the following text classifications and provide feedback.
+         For each text, assess whether the assigned category seems appropriate:
+
+         {}
+
+         Provide a brief validation report with:
+         1. Overall accuracy assessment (0-100%)
+         2. Any potential misclassifications identified
+         3. Suggestions for improvement
+
+         Keep your response under 300 words.
+         """.format("\n---\n".join(validation_prompts))
+
+         # Call the LLM API
+         response = client.chat.completions.create(
+             model="gpt-3.5-turbo",
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0.3,
+             max_tokens=400,
+         )
+
+         return response.choices[0].message.content.strip()
+
+     except Exception as e:
+         return f"Validation failed: {str(e)}"
+
+
+ def create_example_file():
+     """
+     Create an example CSV file for testing
+
+     Returns:
+         str: Path to the created file
+     """
+     # Example data
+     data = {
+         "text": [
+             "I absolutely love this product! It exceeded all my expectations.",
+             "The service was terrible and the staff was rude.",
+             "The product arrived on time but was slightly damaged.",
+             "I have mixed feelings about this. Some features are great, others not so much.",
+             "This is a complete waste of money. Do not buy!",
+             "The customer service team was very helpful in resolving my issue.",
+             "It's okay, nothing special but gets the job done.",
+             "I'm extremely disappointed with the quality of this product.",
+             "This is the best purchase I've made all year!",
+             "It's reasonably priced and works as expected."
+         ]
+     }
+
+     # Create the dataframe
+     df = pd.DataFrame(data)
+
+     # Save to a CSV file
+     example_dir = "examples"
+     os.makedirs(example_dir, exist_ok=True)
+     file_path = os.path.join(example_dir, "sample_reviews.csv")
+     df.to_csv(file_path, index=False)
+
+     return file_path
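
A minimal sketch of the helpers above, with a hypothetical "Category" column filled in by hand since the plot normally runs on classifier output:

from utils import create_example_file, load_data, visualize_results

path = create_example_file()
df = load_data(path)
df["Category"] = ["Positive", "Negative"] * 5  # hypothetical labels for illustration
fig = visualize_results(df, text_column="text")
fig.savefig("distribution.png")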