Commit ca09c52 by simondh · 1 Parent(s): 53ce8ef

add parallelization

Files changed (3):
  1. app.py +17 -11
  2. classifiers.py +27 -9
  3. utils.py +10 -0
app.py CHANGED
@@ -12,6 +12,7 @@ import time
 import torch
 import traceback
 import logging
+import asyncio
 
 # Import local modules
 from classifiers import TFIDFClassifier, LLMClassifier
@@ -106,15 +107,11 @@ def process_file(file, text_columns, categories, classifier_type, show_explanati
     if classifier_type == "tfidf":
         classifier = TFIDFClassifier()
         results = classifier.classify(texts, category_list)
-    elif classifier_type == "gpt35":
+    elif classifier_type in ["gpt35", "gpt4"]:
         if client is None:
             return None, "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'."
-        classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
-        results = classifier.classify(texts, category_list)
-    elif classifier_type == "gpt4":
-        if client is None:
-            return None, "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'."
-        classifier = LLMClassifier(client=client, model="gpt-4")
+        model = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
+        classifier = LLMClassifier(client=client, model=model)
         results = classifier.classify(texts, category_list)
     else:  # hybrid
         if client is None:
@@ -126,12 +123,21 @@ def process_file(file, text_columns, categories, classifier_type, show_explanati
         # Second pass with LLM for low confidence results
         llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
         results = []
+        low_confidence_texts = []
+        low_confidence_indices = []
+
         for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
             if tfidf_result["confidence"] < 70:  # If confidence is below 70%
-                llm_result = llm_classifier.classify([text], category_list)[0]
-                results.append(llm_result)
+                low_confidence_texts.append(text)
+                low_confidence_indices.append(i)
+                results.append(None)  # Placeholder
             else:
                 results.append(tfidf_result)
+
+        if low_confidence_texts:
+            llm_results = llm_classifier.classify(low_confidence_texts, category_list)
+            for idx, llm_result in zip(low_confidence_indices, llm_results):
+                results[idx] = llm_result
 
     # Create results dataframe
     result_df = df.copy()
@@ -364,7 +370,7 @@ with gr.Blocks(title="Text Classification System") as demo:
     def show_results(df, validation_report):
         """Show the results after processing"""
         if df is None:
-            return gr.Row(visible=False), gr.File(visible=False), gr.File(visible=False), gr.Dataframe(visible=False), gr.Dataframe(visible=False)
+            return gr.Row(visible=False), gr.File(visible=False), gr.File(visible=False), gr.Dataframe(visible=False)
 
         # Sort by category if it exists
         if "Category" in df.columns:
@@ -374,7 +380,7 @@ with gr.Blocks(title="Text Classification System") as demo:
         csv_path = export_results(df, "csv")
         excel_path = export_results(df, "excel")
 
-        return gr.Row(visible=True), gr.File(value=csv_path, visible=True), gr.File(value=excel_path, visible=True), gr.Dataframe(value=df, visible=True), gr.Dataframe(value=df, visible=True)
+        return gr.Row(visible=True), gr.File(value=csv_path, visible=True), gr.File(value=excel_path, visible=True), gr.Dataframe(value=df, visible=True)
 
     # Function to suggest a new category
     def suggest_new_category(file, current_categories, text_columns):
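
The hybrid branch above now collects low-confidence rows, classifies them in one batched call, and writes the LLM results back by index. Below is a minimal standalone sketch of that batch-and-backfill pattern; fake_classify is a hypothetical stand-in for LLMClassifier.classify and is assumed to return one result per input text, in input order.

# Minimal standalone sketch of the batch-and-backfill pattern (assumptions noted above).
def fake_classify(texts, categories):
    # Hypothetical stand-in: one result per text, in input order
    return [{"category": categories[0], "confidence": 90} for _ in texts]

texts = ["t0", "t1", "t2"]
categories = ["A", "B"]
tfidf_results = [
    {"category": "A", "confidence": 95},
    {"category": "B", "confidence": 40},  # low confidence -> re-checked by the LLM
    {"category": "A", "confidence": 80},
]

results = []
low_confidence_texts, low_confidence_indices = [], []
for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
    if tfidf_result["confidence"] < 70:
        low_confidence_texts.append(text)
        low_confidence_indices.append(i)
        results.append(None)  # placeholder, filled in below
    else:
        results.append(tfidf_result)

if low_confidence_texts:
    llm_results = fake_classify(low_confidence_texts, categories)
    for idx, llm_result in zip(low_confidence_indices, llm_results):
        results[idx] = llm_result

print(results)  # only index 1 is replaced by the (fake) LLM result

Only the placeholder slots are overwritten, so high-confidence TF-IDF results are kept untouched.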
classifiers.py CHANGED
@@ -5,6 +5,8 @@ from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import cosine_similarity
 import random
 import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Dict, Any, Optional
 
 class BaseClassifier:
     """Base class for text classifiers"""
@@ -143,21 +145,37 @@ class LLMClassifier(BaseClassifier):
         self.client = client
         self.model = model
 
-    def classify(self, texts, categories=None):
-        """Classify texts using an LLM"""
+    def classify(self, texts: List[str], categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        """Classify texts using an LLM with parallel processing"""
         if not categories:
             # First, use LLM to generate appropriate categories
             categories = self._suggest_categories(texts)
 
-        results = []
-        for text in texts:
-            # Classify each text individually
-            result = self._classify_text(text, categories)
-            results.append(result)
+        # Process texts in parallel
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            # Submit all tasks
+            future_to_text = {
+                executor.submit(self._classify_text, text, categories): text
+                for text in texts
+            }
+
+            # Collect results as they complete
+            results = []
+            for future in as_completed(future_to_text):
+                try:
+                    result = future.result()
+                    results.append(result)
+                except Exception as e:
+                    print(f"Error processing text: {str(e)}")
+                    results.append({
+                        "category": categories[0],
+                        "confidence": 50,
+                        "explanation": f"Error during classification: {str(e)}"
+                    })
 
         return results
 
-    def _suggest_categories(self, texts, sample_size=20):
+    def _suggest_categories(self, texts: List[str], sample_size: int = 20) -> List[str]:
         """Use LLM to suggest appropriate categories for the dataset"""
         # Take a sample of texts to avoid token limitations
         if len(texts) > sample_size:
@@ -192,7 +210,7 @@ class LLMClassifier(BaseClassifier):
             print(f"Error suggesting categories: {str(e)}")
             return self._generate_default_categories(texts)
 
-    def _classify_text(self, text, categories):
+    def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
         """Use LLM to classify a single text"""
         categories_str = ", ".join(categories)
 
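
One caveat with the new parallel loop: as_completed yields futures in completion order, so the list built inside classify is not guaranteed to line up with the input order of texts, while callers such as the hybrid branch in app.py back-fill results by input index. Below is a hedged sketch of an order-preserving variant that maps each future to the index it was submitted for; classify_one stands in for _classify_text, and the function name and worker count are illustrative, not taken from the repository.

# Hedged sketch of an order-preserving variant of the parallel classify loop.
from concurrent.futures import ThreadPoolExecutor, as_completed

def classify_in_order(texts, categories, classify_one, max_workers=10):
    results = [None] * len(texts)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to the index of the text it was submitted for.
        future_to_index = {
            executor.submit(classify_one, text, categories): i
            for i, text in enumerate(texts)
        }
        for future in as_completed(future_to_index):
            i = future_to_index[future]
            try:
                results[i] = future.result()
            except Exception as e:
                # Same fallback shape as the commit's error handler.
                results[i] = {
                    "category": categories[0],
                    "confidence": 50,
                    "explanation": f"Error during classification: {e}",
                }
    return results

Writing into a pre-sized list keeps the output aligned with the input regardless of which worker finishes first.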
 
utils.py CHANGED
@@ -64,6 +64,16 @@ def visualize_results(df, text_column, category_column="Category"):
     Returns:
         matplotlib.figure.Figure: Visualization figure
     """
+    # Check if category column exists
+    if category_column not in df.columns:
+        # Create a simple figure with a message
+        fig, ax = plt.subplots(figsize=(10, 6))
+        ax.text(0.5, 0.5, "No categories to display",
+                ha='center', va='center', fontsize=12)
+        ax.set_title('No Classification Results Available')
+        plt.tight_layout()
+        return fig
+
     # Get categories and their counts
     category_counts = df[category_column].value_counts()
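
A quick usage sketch of the new guard, assuming the visualize_results signature shown in the hunk header and that utils.py is importable as-is: a DataFrame with no "Category" column now yields a placeholder figure instead of raising a KeyError on df[category_column].

# Hypothetical usage of the new guard in visualize_results.
import pandas as pd
from utils import visualize_results

df = pd.DataFrame({"text": ["first row", "second row"]})
fig = visualize_results(df, text_column="text")  # category_column defaults to "Category"
fig.savefig("empty_results.png")  # saves the "No Classification Results Available" placeholder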