Spaces:
Sleeping
Sleeping
add parallelization
Browse files- app.py +17 -11
- classifiers.py +27 -9
- utils.py +10 -0
app.py
CHANGED
@@ -12,6 +12,7 @@ import time
|
|
12 |
import torch
|
13 |
import traceback
|
14 |
import logging
|
|
|
15 |
|
16 |
# Import local modules
|
17 |
from classifiers import TFIDFClassifier, LLMClassifier
|
@@ -106,15 +107,11 @@ def process_file(file, text_columns, categories, classifier_type, show_explanati
|
|
106 |
if classifier_type == "tfidf":
|
107 |
classifier = TFIDFClassifier()
|
108 |
results = classifier.classify(texts, category_list)
|
109 |
-
elif classifier_type
|
110 |
if client is None:
|
111 |
return None, "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'."
|
112 |
-
|
113 |
-
|
114 |
-
elif classifier_type == "gpt4":
|
115 |
-
if client is None:
|
116 |
-
return None, "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'."
|
117 |
-
classifier = LLMClassifier(client=client, model="gpt-4")
|
118 |
results = classifier.classify(texts, category_list)
|
119 |
else: # hybrid
|
120 |
if client is None:
|
@@ -126,12 +123,21 @@ def process_file(file, text_columns, categories, classifier_type, show_explanati
|
|
126 |
# Second pass with LLM for low confidence results
|
127 |
llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
|
128 |
results = []
|
|
|
|
|
|
|
129 |
for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
|
130 |
if tfidf_result["confidence"] < 70: # If confidence is below 70%
|
131 |
-
|
132 |
-
|
|
|
133 |
else:
|
134 |
results.append(tfidf_result)
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
# Create results dataframe
|
137 |
result_df = df.copy()
|
@@ -364,7 +370,7 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
364 |
def show_results(df, validation_report):
|
365 |
"""Show the results after processing"""
|
366 |
if df is None:
|
367 |
-
return gr.Row(visible=False), gr.File(visible=False), gr.File(visible=False), gr.Dataframe(visible=False)
|
368 |
|
369 |
# Sort by category if it exists
|
370 |
if "Category" in df.columns:
|
@@ -374,7 +380,7 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
374 |
csv_path = export_results(df, "csv")
|
375 |
excel_path = export_results(df, "excel")
|
376 |
|
377 |
-
return gr.Row(visible=True), gr.File(value=csv_path, visible=True), gr.File(value=excel_path, visible=True), gr.Dataframe(value=df, visible=True)
|
378 |
|
379 |
# Function to suggest a new category
|
380 |
def suggest_new_category(file, current_categories, text_columns):
|
|
|
12 |
import torch
|
13 |
import traceback
|
14 |
import logging
|
15 |
+
import asyncio
|
16 |
|
17 |
# Import local modules
|
18 |
from classifiers import TFIDFClassifier, LLMClassifier
|
|
|
107 |
if classifier_type == "tfidf":
|
108 |
classifier = TFIDFClassifier()
|
109 |
results = classifier.classify(texts, category_list)
|
110 |
+
elif classifier_type in ["gpt35", "gpt4"]:
|
111 |
if client is None:
|
112 |
return None, "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'."
|
113 |
+
model = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
|
114 |
+
classifier = LLMClassifier(client=client, model=model)
|
|
|
|
|
|
|
|
|
115 |
results = classifier.classify(texts, category_list)
|
116 |
else: # hybrid
|
117 |
if client is None:
|
|
|
123 |
# Second pass with LLM for low confidence results
|
124 |
llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
|
125 |
results = []
|
126 |
+
low_confidence_texts = []
|
127 |
+
low_confidence_indices = []
|
128 |
+
|
129 |
for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
|
130 |
if tfidf_result["confidence"] < 70: # If confidence is below 70%
|
131 |
+
low_confidence_texts.append(text)
|
132 |
+
low_confidence_indices.append(i)
|
133 |
+
results.append(None) # Placeholder
|
134 |
else:
|
135 |
results.append(tfidf_result)
|
136 |
+
|
137 |
+
if low_confidence_texts:
|
138 |
+
llm_results = llm_classifier.classify(low_confidence_texts, category_list)
|
139 |
+
for idx, llm_result in zip(low_confidence_indices, llm_results):
|
140 |
+
results[idx] = llm_result
|
141 |
|
142 |
# Create results dataframe
|
143 |
result_df = df.copy()
|
|
|
370 |
def show_results(df, validation_report):
|
371 |
"""Show the results after processing"""
|
372 |
if df is None:
|
373 |
+
return gr.Row(visible=False), gr.File(visible=False), gr.File(visible=False), gr.Dataframe(visible=False)
|
374 |
|
375 |
# Sort by category if it exists
|
376 |
if "Category" in df.columns:
|
|
|
380 |
csv_path = export_results(df, "csv")
|
381 |
excel_path = export_results(df, "excel")
|
382 |
|
383 |
+
return gr.Row(visible=True), gr.File(value=csv_path, visible=True), gr.File(value=excel_path, visible=True), gr.Dataframe(value=df, visible=True)
|
384 |
|
385 |
# Function to suggest a new category
|
386 |
def suggest_new_category(file, current_categories, text_columns):
|
classifiers.py
CHANGED
@@ -5,6 +5,8 @@ from sklearn.cluster import KMeans
|
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
import random
|
7 |
import json
|
|
|
|
|
8 |
|
9 |
class BaseClassifier:
|
10 |
"""Base class for text classifiers"""
|
@@ -143,21 +145,37 @@ class LLMClassifier(BaseClassifier):
|
|
143 |
self.client = client
|
144 |
self.model = model
|
145 |
|
146 |
-
def classify(self, texts, categories=None):
|
147 |
-
"""Classify texts using an LLM"""
|
148 |
if not categories:
|
149 |
# First, use LLM to generate appropriate categories
|
150 |
categories = self._suggest_categories(texts)
|
151 |
|
152 |
-
|
153 |
-
|
154 |
-
#
|
155 |
-
|
156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
return results
|
159 |
|
160 |
-
def _suggest_categories(self, texts, sample_size=20):
|
161 |
"""Use LLM to suggest appropriate categories for the dataset"""
|
162 |
# Take a sample of texts to avoid token limitations
|
163 |
if len(texts) > sample_size:
|
@@ -192,7 +210,7 @@ class LLMClassifier(BaseClassifier):
|
|
192 |
print(f"Error suggesting categories: {str(e)}")
|
193 |
return self._generate_default_categories(texts)
|
194 |
|
195 |
-
def _classify_text(self, text, categories):
|
196 |
"""Use LLM to classify a single text"""
|
197 |
categories_str = ", ".join(categories)
|
198 |
|
|
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
import random
|
7 |
import json
|
8 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
9 |
+
from typing import List, Dict, Any, Optional
|
10 |
|
11 |
class BaseClassifier:
|
12 |
"""Base class for text classifiers"""
|
|
|
145 |
self.client = client
|
146 |
self.model = model
|
147 |
|
148 |
+
def classify(self, texts: List[str], categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Classify texts using an LLM with parallel processing.

    Each text is handed to ``self._classify_text`` on a thread pool. The
    returned list is positionally aligned with ``texts``: ``results[i]`` is
    always the classification of ``texts[i]``, regardless of which worker
    finishes first. Callers that pair results with inputs by index rely on
    this.

    Args:
        texts: Texts to classify.
        categories: Candidate categories. When empty or ``None``, they are
            obtained from ``self._suggest_categories(texts)``.

    Returns:
        One result per input text, in input order. If classifying a text
        raises, its slot holds a fallback dict with the first category, a
        confidence of 50 and the error message as the explanation.
    """
    if not categories:
        # First, use LLM to generate appropriate categories
        categories = self._suggest_categories(texts)

    # Pre-size the output so every worker result can be written to the slot
    # of the text it belongs to; as_completed() yields futures in completion
    # order, which is generally NOT the submission order.
    results: List[Any] = [None] * len(texts)

    # Process texts in parallel
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all tasks, remembering the input position of each one
        future_to_index = {
            executor.submit(self._classify_text, text, categories): index
            for index, text in enumerate(texts)
        }

        # Collect results as they complete
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                results[index] = future.result()
            except Exception as e:
                # Best-effort: one failing text must not abort the batch.
                print(f"Error processing text: {str(e)}")
                results[index] = {
                    "category": categories[0],
                    "confidence": 50,
                    "explanation": f"Error during classification: {str(e)}"
                }

    return results
|
177 |
|
178 |
+
def _suggest_categories(self, texts: List[str], sample_size: int = 20) -> List[str]:
|
179 |
"""Use LLM to suggest appropriate categories for the dataset"""
|
180 |
# Take a sample of texts to avoid token limitations
|
181 |
if len(texts) > sample_size:
|
|
|
210 |
print(f"Error suggesting categories: {str(e)}")
|
211 |
return self._generate_default_categories(texts)
|
212 |
|
213 |
+
def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
|
214 |
"""Use LLM to classify a single text"""
|
215 |
categories_str = ", ".join(categories)
|
216 |
|
utils.py
CHANGED
@@ -64,6 +64,16 @@ def visualize_results(df, text_column, category_column="Category"):
|
|
64 |
Returns:
|
65 |
matplotlib.figure.Figure: Visualization figure
|
66 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
# Get categories and their counts
|
68 |
category_counts = df[category_column].value_counts()
|
69 |
|
|
|
64 |
Returns:
|
65 |
matplotlib.figure.Figure: Visualization figure
|
66 |
"""
|
67 |
+
# Check if category column exists
|
68 |
+
if category_column not in df.columns:
|
69 |
+
# Create a simple figure with a message
|
70 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
71 |
+
ax.text(0.5, 0.5, "No categories to display",
|
72 |
+
ha='center', va='center', fontsize=12)
|
73 |
+
ax.set_title('No Classification Results Available')
|
74 |
+
plt.tight_layout()
|
75 |
+
return fig
|
76 |
+
|
77 |
# Get categories and their counts
|
78 |
category_counts = df[category_column].value_counts()
|
79 |
|