pentarosarium committed on
Commit
f036fc2
·
1 Parent(s): e1603e5

revert dedup

Browse files
Files changed (1) hide show
  1. app.py +11 -0
app.py CHANGED
@@ -331,6 +331,17 @@ def process_file(uploaded_file, model_choice):
331
  if missing_columns:
332
  st.error(f"Error: The following required columns are missing: {', '.join(missing_columns)}")
333
  return df if df is not None else None
 
 
 
 
 
 
 
 
 
 
 
334
 
335
  # Initialize progress tracking
336
  progress_bar = st.progress(0)
 
331
  if missing_columns:
332
  st.error(f"Error: The following required columns are missing: {', '.join(missing_columns)}")
333
  return df if df is not None else None
334
+
335
+ # Deduplication
336
+ original_news_count = len(df)
337
+ df = df.groupby('Объект', group_keys=False).apply(
338
+ lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
339
+ ).reset_index(drop=True)
340
+
341
+ remaining_news_count = len(df)
342
+ duplicates_removed = original_news_count - remaining_news_count
343
+ st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
344
+
345
 
346
  # Initialize progress tracking
347
  progress_bar = st.progress(0)