Spaces:
Running
Running
Commit
·
f036fc2
1
Parent(s):
e1603e5
revert dedup
Browse files
app.py
CHANGED
@@ -331,6 +331,17 @@ def process_file(uploaded_file, model_choice):
|
|
331 |
if missing_columns:
|
332 |
st.error(f"Error: The following required columns are missing: {', '.join(missing_columns)}")
|
333 |
return df if df is not None else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
334 |
|
335 |
# Initialize progress tracking
|
336 |
progress_bar = st.progress(0)
|
|
|
331 |
if missing_columns:
|
332 |
st.error(f"Error: The following required columns are missing: {', '.join(missing_columns)}")
|
333 |
return df if df is not None else None
|
334 |
+
|
335 |
+
# Deduplication
|
336 |
+
original_news_count = len(df)
|
337 |
+
df = df.groupby('Объект', group_keys=False).apply(
|
338 |
+
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
|
339 |
+
).reset_index(drop=True)
|
340 |
+
|
341 |
+
remaining_news_count = len(df)
|
342 |
+
duplicates_removed = original_news_count - remaining_news_count
|
343 |
+
st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
|
344 |
+
|
345 |
|
346 |
# Initialize progress tracking
|
347 |
progress_bar = st.progress(0)
|