pentarosarium committed
Commit 7384066 · Parent: a197ca6

1.60 +decluster

Files changed (1): app.py (+94 -14)
app.py CHANGED
@@ -71,12 +71,15 @@ class ProcessControl:
 class EventDetector:
     def __init__(self):
         try:
-            # Initialize models
             device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Initializing models on device: {device}")
 
             # Initialize all models
-            self.initialize_models(device)  # Move initialization to separate method
+            self.initialize_models(device)
+
+            # Initialize transformer for declusterization
+            self.tokenizer_cluster = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
+            self.model_cluster = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2').to(device)
 
             self.device = device
             self.initialized = True
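The two added lines assume AutoTokenizer and AutoModel are already imported at the top of app.py; the import itself is not part of this diff, so if it were missing it would look like:

    # Assumed import, not shown in this diff
    from transformers import AutoTokenizer, AutoModel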
@@ -85,7 +88,85 @@ class EventDetector:
         except Exception as e:
             logger.error(f"Error in EventDetector initialization: {str(e)}")
             raise
+
+    def mean_pooling(self, model_output, attention_mask):
+        # Attention-mask-weighted mean over the token embeddings
+        token_embeddings = model_output[0]
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+    def encode_text(self, text):
+        # One L2-normalized sentence embedding per text
+        if pd.isna(text):
+            text = ""
+        text = str(text)
+
+        encoded_input = self.tokenizer_cluster(text, padding=True, truncation=True, max_length=512, return_tensors='pt').to(self.device)
+        with torch.no_grad():
+            model_output = self.model_cluster(**encoded_input)
+        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
+        return torch.nn.functional.normalize(sentence_embeddings[0], p=2, dim=0)
+
+    @spaces.GPU(duration=20)
+    def decluster_texts(self, df, text_column, similarity_threshold=0.75, time_threshold=24):
+        try:
+            if df.empty:
+                return df
+
+            # Sort by datetime if available
+            if 'datetime' in df.columns:
+                df = df.sort_values('datetime')
+
+            clusters = []
+            processed = set()
+
+            # Process each text
+            for idx in df.index:
+                if idx in processed:
+                    continue
+
+                row1 = df.loc[idx]
+                cluster = [idx]
+                processed.add(idx)
+
+                if not pd.isna(row1[text_column]):
+                    text1_embedding = self.encode_text(row1[text_column])
+
+                    for other_idx in df.index:
+                        if other_idx in processed:
+                            continue
+
+                        row2 = df.loc[other_idx]
+                        if pd.isna(row2[text_column]):
+                            continue
+
+                        # Check time difference if datetime available
+                        if 'datetime' in df.columns:
+                            time_diff = pd.to_datetime(row1['datetime']) - pd.to_datetime(row2['datetime'])
+                            if abs(time_diff.total_seconds() / 3600) > time_threshold:
+                                continue
+
+                        text2_embedding = self.encode_text(row2[text_column])
+                        similarity = torch.dot(text1_embedding, text2_embedding).item()
+
+                        if similarity >= similarity_threshold:
+                            cluster.append(other_idx)
+                            processed.add(other_idx)
+
+                # Record the finished cluster so the pruning pass below sees it
+                clusters.append(cluster)
+
+            # Process clusters
+            indices_to_delete = set()
+            for cluster_indices in clusters:
+                if len(cluster_indices) > 1:
+                    # Keep the longest text
+                    text_lengths = df.loc[cluster_indices, text_column].str.len()
+                    longest_text_idx = text_lengths.idxmax()
+                    indices_to_delete.update(set(cluster_indices) - {longest_text_idx})
+
+            # Return declusterized DataFrame
+            return df.drop(index=list(indices_to_delete))
 
+        except Exception as e:
+            logger.error(f"Declusterization error: {str(e)}")
+            return df
+
     @spaces.GPU(duration=30)
     def initialize_models(self, device):
         """Initialize all models with GPU support"""
@@ -650,7 +731,7 @@ def create_interface():
         # Create state for file data
         current_file = gr.State(None)
 
-        gr.Markdown("# AI-анализ мониторинга новостей v.1.58")
+        gr.Markdown("# AI-анализ мониторинга новостей v.1.60 + добавка")
 
         with gr.Row():
             file_input = gr.File(
@@ -716,14 +797,7 @@ def create_interface():
         def process_and_download(file_bytes):
             if file_bytes is None:
                 gr.Warning("Пожалуйста, загрузите файл")
-                return (
-                    pd.DataFrame(),
-                    None,
-                    None,
-                    None,
-                    "Ожидание файла...",
-                    ""
-                )
+                return (pd.DataFrame(), None, None, None, "Ожидание файла...", "")
 
             try:
                 file_obj = io.BytesIO(file_bytes)
@@ -735,9 +809,15 @@ def create_interface():
                 df = pd.read_excel(file_obj, sheet_name='Публикации')
                 original_count = len(df)
                 df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
-                removed_count = original_count - len(df)
-                dedup_message = f"Удалено {removed_count} дубликатов из {original_count} записей"
-                logger.info(f"Removed {removed_count} duplicate entries")
+                removed_dupes = original_count - len(df)
+                dedup_message = f"Удалено {removed_dupes} дубликатов из {original_count} записей"
+
+                # Decluster the deduplicated data
+                decluster_count = len(df)
+                df = detector.decluster_texts(df, 'Выдержки из текста')
+                removed_clusters = decluster_count - len(df)
+                dedup_message += f"\nУдалено {removed_clusters} похожих текстов"
+                logger.info(f"Removed {removed_clusters} similar texts after declusterization")
 
                 processed_rows = []
                 total = len(df)
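End to end, the handler now deduplicates with fuzzy_deduplicate and then declusters with the new method, each stage shrinking the frame. A toy run of the decluster stage (the rows, timestamps, and expected count are invented for illustration; EventDetector is the class defined in this file):

    import pandas as pd

    detector = EventDetector()  # defined earlier in app.py; loads the models
    df = pd.DataFrame({
        'Выдержки из текста': [
            'Компания X объявила о выпуске облигаций',
            'Компания X объявила о выпуске облигаций на 5 млрд рублей',
            'Суд начал разбирательство по иску к компании Y',
        ],
        'datetime': pd.to_datetime(['2024-01-01 10:00', '2024-01-01 12:00', '2024-01-02 09:00']),
    })
    out = detector.decluster_texts(df, 'Выдержки из текста', similarity_threshold=0.75, time_threshold=24)
    print(len(out))  # likely 2: the longer of the two similar rows survives, the unrelated row stays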