pentarosarium committed
Commit 7384066 · Parent: a197ca6

1.60 +decluster

Files changed (1): app.py (+94 -14)
app.py CHANGED
@@ -71,12 +71,15 @@ class ProcessControl:
 class EventDetector:
     def __init__(self):
         try:
-            # Initialize models
             device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Initializing models on device: {device}")
 
             # Initialize all models
-            self.initialize_models(device)  # Move initialization to separate method
+            self.initialize_models(device)
+
+            # Initialize transformer for declusterization
+            self.tokenizer_cluster = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
+            self.model_cluster = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2').to(device)
 
             self.device = device
             self.initialized = True
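The two added lines assume AutoTokenizer and AutoModel are already imported at the top of app.py; the import itself is not part of this diff, so if it were missing it would look like:

    # Assumed import, not shown in this diff
    from transformers import AutoTokenizer, AutoModel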
@@ -85,7 +88,85 @@ class EventDetector:
         except Exception as e:
             logger.error(f"Error in EventDetector initialization: {str(e)}")
             raise
+
+    def mean_pooling(self, model_output, attention_mask):
+        # Attention-mask-weighted mean over the token embeddings
+        token_embeddings = model_output[0]
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+    def encode_text(self, text):
+        # One L2-normalized sentence embedding per text
+        if pd.isna(text):
+            text = ""
+        text = str(text)
+
+        encoded_input = self.tokenizer_cluster(text, padding=True, truncation=True, max_length=512, return_tensors='pt').to(self.device)
+        with torch.no_grad():
+            model_output = self.model_cluster(**encoded_input)
+        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
+        return torch.nn.functional.normalize(sentence_embeddings[0], p=2, dim=0)
+
+    @spaces.GPU(duration=20)
+    def decluster_texts(self, df, text_column, similarity_threshold=0.75, time_threshold=24):
+        try:
+            if df.empty:
+                return df
+
+            # Sort by datetime if available
+            if 'datetime' in df.columns:
+                df = df.sort_values('datetime')
+
+            clusters = []
+            processed = set()
+
+            # Process each text
+            for idx in df.index:
+                if idx in processed:
+                    continue
+
+                row1 = df.loc[idx]
+                cluster = [idx]
+                processed.add(idx)
+
+                if not pd.isna(row1[text_column]):
+                    text1_embedding = self.encode_text(row1[text_column])
+
+                    for other_idx in df.index:
+                        if other_idx in processed:
+                            continue
+
+                        row2 = df.loc[other_idx]
+                        if pd.isna(row2[text_column]):
+                            continue
+
+                        # Check time difference if datetime available
+                        if 'datetime' in df.columns:
+                            time_diff = pd.to_datetime(row1['datetime']) - pd.to_datetime(row2['datetime'])
+                            if abs(time_diff.total_seconds() / 3600) > time_threshold:
+                                continue
+
+                        text2_embedding = self.encode_text(row2[text_column])
+                        similarity = torch.dot(text1_embedding, text2_embedding).item()
+
+                        if similarity >= similarity_threshold:
+                            cluster.append(other_idx)
+                            processed.add(other_idx)
+
+                # Record the finished cluster so the pruning pass below sees it
+                clusters.append(cluster)
+
+            # Process clusters
+            indices_to_delete = set()
+            for cluster_indices in clusters:
+                if len(cluster_indices) > 1:
+                    # Keep the longest text
+                    text_lengths = df.loc[cluster_indices, text_column].str.len()
+                    longest_text_idx = text_lengths.idxmax()
+                    indices_to_delete.update(set(cluster_indices) - {longest_text_idx})
+
+            # Return declusterized DataFrame
+            return df.drop(index=list(indices_to_delete))
 
+        except Exception as e:
+            logger.error(f"Declusterization error: {str(e)}")
+            return df
+
     @spaces.GPU(duration=30)
     def initialize_models(self, device):
         """Initialize all models with GPU support"""
@@ -650,7 +731,7 @@ def create_interface():
         # Create state for file data
         current_file = gr.State(None)
 
-        gr.Markdown("# AI-анализ мониторинга новостей v.1.58")
+        gr.Markdown("# AI-анализ мониторинга новостей v.1.60 + добавка")
 
         with gr.Row():
             file_input = gr.File(
@@ -716,14 +797,7 @@ def create_interface():
         def process_and_download(file_bytes):
             if file_bytes is None:
                 gr.Warning("Пожалуйста, загрузите файл")
-                return (
-                    pd.DataFrame(),
-                    None,
-                    None,
-                    None,
-                    "Ожидание файла...",
-                    ""
-                )
+                return (pd.DataFrame(), None, None, None, "Ожидание файла...", "")
 
             try:
                 file_obj = io.BytesIO(file_bytes)
@@ -735,9 +809,15 @@ def create_interface():
                 df = pd.read_excel(file_obj, sheet_name='Публикации')
                 original_count = len(df)
                 df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
-                removed_count = original_count - len(df)
-                dedup_message = f"Удалено {removed_count} дубликатов из {original_count} записей"
-                logger.info(f"Removed {removed_count} duplicate entries")
+                removed_dupes = original_count - len(df)
+                dedup_message = f"Удалено {removed_dupes} дубликатов из {original_count} записей"
+
+                # Decluster the deduplicated data
+                decluster_count = len(df)
+                df = detector.decluster_texts(df, 'Выдержки из текста')
+                removed_clusters = decluster_count - len(df)
+                dedup_message += f"\nУдалено {removed_clusters} похожих текстов"
+                logger.info(f"Removed {removed_clusters} similar texts after declusterization")
 
                 processed_rows = []
                 total = len(df)
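End to end, the handler now deduplicates with fuzzy_deduplicate and then declusters with the new method, each stage shrinking the frame. A toy run of the decluster stage (the rows, timestamps, and expected count are invented for illustration; EventDetector is the class defined in this file):

    import pandas as pd

    detector = EventDetector()  # defined earlier in app.py; loads the models
    df = pd.DataFrame({
        'Выдержки из текста': [
            'Компания X объявила о выпуске облигаций',
            'Компания X объявила о выпуске облигаций на 5 млрд рублей',
            'Суд начал разбирательство по иску к компании Y',
        ],
        'datetime': pd.to_datetime(['2024-01-01 10:00', '2024-01-01 12:00', '2024-01-02 09:00']),
    })
    out = detector.decluster_texts(df, 'Выдержки из текста', similarity_threshold=0.75, time_threshold=24)
    print(len(out))  # likely 2: the longer of the two similar rows survives, the unrelated row stays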