import logging
import os
import time
import traceback

from litellm import OpenAI

from classifiers import TFIDFClassifier, LLMClassifier
from utils import load_data, validate_results

# Module-level state referenced by the functions below; set via update_api_key().
OPENAI_API_KEY = None
client = None


def update_api_key(api_key):
    """Update the OpenAI API key and verify it with a test request."""
    global OPENAI_API_KEY, client

    if not api_key:
        return "API Key cannot be empty"

    OPENAI_API_KEY = api_key

    try:
        client = OpenAI(api_key=api_key)
        # Test the connection with a minimal request
        client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "test"}],
            max_tokens=5,
        )
        return "API Key updated and verified successfully"
    except Exception as e:
        error_msg = str(e)
        logging.error(f"API key update failed: {error_msg}")
        return f"Failed to update API Key: {error_msg}"


def process_file(file, text_columns, categories, classifier_type, show_explanations):
    """Process the uploaded file and classify its text data."""
    # Initialize result_df and validation_report
    result_df = None
    validation_report = None

    try:
        # Load data from file (file may be a path string or a file-like object)
        if isinstance(file, str):
            df = load_data(file)
        else:
            df = load_data(file.name)

        if not text_columns:
            return None, "Please select at least one text column"

        # Check that all selected columns exist
        missing_columns = [col for col in text_columns if col not in df.columns]
        if missing_columns:
            return (
                None,
                f"Columns not found in the file: {', '.join(missing_columns)}. "
                f"Available columns: {', '.join(df.columns)}",
            )

        # Combine text from the selected columns
        texts = []
        for _, row in df.iterrows():
            combined_text = " ".join(str(row[col]) for col in text_columns)
            texts.append(combined_text)

        # Parse categories if provided
        category_list = []
        if categories:
            category_list = [cat.strip() for cat in categories.split(",")]

        # Select classifier based on data size and user choice
        num_texts = len(texts)

        # If no specific model is chosen, select the most appropriate one
        if classifier_type == "auto":
            if num_texts <= 500:
                classifier_type = "gpt4"
            elif num_texts <= 1000:
                classifier_type = "gpt35"
            elif num_texts <= 5000:
                classifier_type = "hybrid"
            else:
                classifier_type = "tfidf"

        # Initialize the appropriate classifier
        if classifier_type == "tfidf":
            classifier = TFIDFClassifier()
            results = classifier.classify(texts, category_list)
        elif classifier_type in ["gpt35", "gpt4"]:
            if client is None:
                return (
                    None,
                    "Error: the API client is not initialized. "
                    "Please configure a valid API key in the 'Setup' tab.",
                )
            model = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
            classifier = LLMClassifier(client=client, model=model)
            results = classifier.classify(texts, category_list)
        else:  # hybrid
            if client is None:
                return (
                    None,
                    "Error: the API client is not initialized. "
                    "Please configure a valid API key in the 'Setup' tab.",
                )
            # First pass with TF-IDF
            tfidf_classifier = TFIDFClassifier()
            tfidf_results = tfidf_classifier.classify(texts, category_list)

            # Second pass with the LLM for low-confidence results
            llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
            results = []
            low_confidence_texts = []
            low_confidence_indices = []

            for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
                if tfidf_result["confidence"] < 70:  # Confidence below 70%
                    low_confidence_texts.append(text)
                    low_confidence_indices.append(i)
                    results.append(None)  # Placeholder
                else:
                    results.append(tfidf_result)

            if low_confidence_texts:
                llm_results = llm_classifier.classify(
                    low_confidence_texts, category_list
                )
                for idx, llm_result in zip(low_confidence_indices, llm_results):
                    results[idx] = llm_result

        # Create the results dataframe
        result_df = df.copy()
        result_df["Category"] = [r["category"] for r in results]
        result_df["Confidence"] = [r["confidence"] for r in results]

        if show_explanations:
            result_df["Explanation"] = [r["explanation"] for r in results]

        # Validate results using the LLM
        validation_report = validate_results(result_df, text_columns, client)

        return result_df, validation_report

    except Exception as e:
        error_traceback = traceback.format_exc()
        return None, f"Error: {str(e)}\n{error_traceback}"


def export_results(df, format_type):
    """Export results to a file and return the file path for download."""
    if df is None:
        return None

    # Create the export directory if it doesn't exist
    temp_dir = "temp_exports"
    os.makedirs(temp_dir, exist_ok=True)

    # Generate a unique, timestamped filename
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    filename = f"classification_results_{timestamp}"

    if format_type == "excel":
        file_path = os.path.join(temp_dir, f"{filename}.xlsx")
        df.to_excel(file_path, index=False)
    else:
        file_path = os.path.join(temp_dir, f"{filename}.csv")
        df.to_csv(file_path, index=False)

    return file_path
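

# --- Usage sketch ----------------------------------------------------------
# A minimal, illustrative driver showing how the three functions above fit
# together; it is not part of the module's API. The file name "reviews.csv",
# the "text" column, and the OPENAI_API_KEY environment variable are all
# hypothetical assumptions for this example.
if __name__ == "__main__":
    status = update_api_key(os.environ.get("OPENAI_API_KEY", ""))
    print(status)

    # Classify a hypothetical CSV; "auto" lets the size-based heuristic pick
    # between GPT-4, GPT-3.5, the hybrid pipeline, and pure TF-IDF.
    result_df, report = process_file(
        "reviews.csv",
        text_columns=["text"],
        categories="positive, negative, neutral",
        classifier_type="auto",
        show_explanations=True,
    )

    if result_df is not None:
        print(report)
        print("Exported to:", export_results(result_df, "csv"))
    else:
        # On failure, process_file returns (None, <error message>).
        print(report)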