Spaces:
Sleeping
Sleeping
Commit
·
7384066
1
Parent(s):
a197ca6
1.60 +decluster
Browse files
app.py
CHANGED
@@ -71,12 +71,15 @@ class ProcessControl:
|
|
71 |
class EventDetector:
|
72 |
def __init__(self):
|
73 |
try:
|
74 |
-
# Initialize models
|
75 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
76 |
logger.info(f"Initializing models on device: {device}")
|
77 |
|
78 |
# Initialize all models
|
79 |
-
self.initialize_models(device)
|
|
|
|
|
|
|
|
|
80 |
|
81 |
self.device = device
|
82 |
self.initialized = True
|
@@ -85,7 +88,85 @@ class EventDetector:
|
|
85 |
except Exception as e:
|
86 |
logger.error(f"Error in EventDetector initialization: {str(e)}")
|
87 |
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
|
|
|
|
|
|
|
|
89 |
@spaces.GPU(duration=30)
|
90 |
def initialize_models(self, device):
|
91 |
"""Initialize all models with GPU support"""
|
@@ -650,7 +731,7 @@ def create_interface():
|
|
650 |
# Create state for file data
|
651 |
current_file = gr.State(None)
|
652 |
|
653 |
-
gr.Markdown("# AI-анализ мониторинга новостей v.1.
|
654 |
|
655 |
with gr.Row():
|
656 |
file_input = gr.File(
|
@@ -716,14 +797,7 @@ def create_interface():
|
|
716 |
def process_and_download(file_bytes):
|
717 |
if file_bytes is None:
|
718 |
gr.Warning("Пожалуйста, загрузите файл")
|
719 |
-
return (
|
720 |
-
pd.DataFrame(),
|
721 |
-
None,
|
722 |
-
None,
|
723 |
-
None,
|
724 |
-
"Ожидание файла...",
|
725 |
-
""
|
726 |
-
)
|
727 |
|
728 |
try:
|
729 |
file_obj = io.BytesIO(file_bytes)
|
@@ -735,9 +809,15 @@ def create_interface():
|
|
735 |
df = pd.read_excel(file_obj, sheet_name='Публикации')
|
736 |
original_count = len(df)
|
737 |
df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
|
738 |
-
|
739 |
-
dedup_message = f"Удалено {
|
740 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
741 |
|
742 |
processed_rows = []
|
743 |
total = len(df)
|
|
|
71 |
class EventDetector:
|
72 |
def __init__(self):
|
73 |
try:
|
|
|
74 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
75 |
logger.info(f"Initializing models on device: {device}")
|
76 |
|
77 |
# Initialize all models
|
78 |
+
self.initialize_models(device)
|
79 |
+
|
80 |
+
# Initialize transformer for declusterization
|
81 |
+
self.tokenizer_cluster = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
|
82 |
+
self.model_cluster = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2').to(device)
|
83 |
|
84 |
self.device = device
|
85 |
self.initialized = True
|
|
|
88 |
except Exception as e:
|
89 |
logger.error(f"Error in EventDetector initialization: {str(e)}")
|
90 |
raise
|
91 |
+
|
92 |
+
def mean_pooling(self, model_output, attention_mask):
    """Average the token embeddings, counting only positions enabled by the mask.

    Args:
        model_output: Indexable model result; element 0 is used as the
            token embeddings.
        attention_mask: Mask tensor with one fewer trailing dimension than
            the token embeddings; it is broadcast over the last dimension.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask count.
        The count is clamped to at least 1e-9 so a fully masked row yields
        zeros rather than a division by zero.
    """
    hidden_states = model_output[0]
    # Broadcast the mask to the embedding shape so masked tokens contribute 0.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts
|
96 |
+
|
97 |
+
def encode_text(self, text):
    """Encode one text into a unit-length embedding vector.

    A missing value (as judged by ``pd.isna``) is encoded as the empty
    string; any other value is converted with ``str()`` first.

    Args:
        text: The value to encode.

    Returns:
        A 1-D tensor: the mean-pooled output of ``self.model_cluster`` for
        the single input, L2-normalised.
    """
    cleaned = "" if pd.isna(text) else str(text)

    batch = self.tokenizer_cluster(
        cleaned,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    ).to(self.device)

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        outputs = self.model_cluster(**batch)

    pooled = self.mean_pooling(outputs, batch['attention_mask'])
    # Only one text was passed in, so take row 0 and normalise that vector.
    return torch.nn.functional.normalize(pooled[0], p=2, dim=0)
|
107 |
+
|
108 |
+
@spaces.GPU(duration=20)
|
109 |
+
def decluster_texts(self, df, text_column, similarity_threshold=0.75, time_threshold=24):
    """Drop near-duplicate rows, keeping the longest text of each cluster.

    Rows are visited in order (sorted by ``'datetime'`` when that column
    exists). Each not-yet-clustered row with a non-missing text becomes an
    anchor; every later unclustered row whose embedding has a dot product
    with the anchor's of at least ``similarity_threshold`` joins its
    cluster. When a ``'datetime'`` column exists, rows more than
    ``time_threshold`` hours away from the anchor are not compared.

    Rows with a missing text are never clustered and are always kept.

    NOTE(review): row lookups use ``df.loc`` with index labels, so this
    assumes the index labels are unique - confirm against the caller.

    Args:
        df: Input DataFrame.
        text_column: Name of the column holding the texts to compare.
        similarity_threshold: Minimum dot-product similarity for two rows
            to be treated as the same cluster.
        time_threshold: Maximum distance in hours between an anchor and a
            candidate row; only used when ``df`` has a ``'datetime'`` column.

    Returns:
        ``df`` without the shorter members of each cluster. On any error
        the failure is logged and the (possibly sorted) frame is returned
        with no rows removed.
    """
    try:
        if df.empty:
            return df

        # Sort by datetime if available
        has_datetime = 'datetime' in df.columns
        if has_datetime:
            df = df.sort_values('datetime')

        # Encode each text at most once; the original re-encoded every
        # candidate for every anchor, i.e. O(n^2) model forward passes.
        embeddings = {}

        def embedding_for(label):
            if label not in embeddings:
                embeddings[label] = self.encode_text(df.loc[label, text_column])
            return embeddings[label]

        clusters = []
        processed = set()

        for idx in df.index:
            if idx in processed:
                continue
            processed.add(idx)

            row1 = df.loc[idx]
            if pd.isna(row1[text_column]):
                # A row without text has no embedding of its own, so it
                # must not be compared using a stale anchor embedding.
                continue

            cluster = [idx]
            text1_embedding = embedding_for(idx)

            for other_idx in df.index:
                if other_idx in processed:
                    continue

                row2 = df.loc[other_idx]
                if pd.isna(row2[text_column]):
                    continue

                # Check time difference if datetime available
                if has_datetime:
                    time_diff = pd.to_datetime(row1['datetime']) - pd.to_datetime(row2['datetime'])
                    if abs(time_diff.total_seconds() / 3600) > time_threshold:
                        continue

                similarity = torch.dot(text1_embedding, embedding_for(other_idx)).item()
                if similarity >= similarity_threshold:
                    cluster.append(other_idx)
                    processed.add(other_idx)

            # Record the cluster; without this step nothing is ever removed.
            if len(cluster) > 1:
                clusters.append(cluster)

        # Keep the longest text of every cluster and drop the rest.
        indices_to_delete = set()
        for cluster_indices in clusters:
            # astype(str) mirrors encode_text, which also stringifies values,
            # so non-string cells do not break the length comparison.
            text_lengths = df.loc[cluster_indices, text_column].astype(str).str.len()
            longest_text_idx = text_lengths.idxmax()
            indices_to_delete.update(set(cluster_indices) - {longest_text_idx})

        # Return declusterized DataFrame
        return df.drop(index=list(indices_to_delete))

    except Exception as e:
        # Best effort: declustering is optional, so fall back to the input.
        logger.error(f"Declusterization error: {str(e)}")
        return df
|
169 |
+
|
170 |
@spaces.GPU(duration=30)
|
171 |
def initialize_models(self, device):
|
172 |
"""Initialize all models with GPU support"""
|
|
|
731 |
# Create state for file data
|
732 |
current_file = gr.State(None)
|
733 |
|
734 |
+
gr.Markdown("# AI-анализ мониторинга новостей v.1.60 + добавка")
|
735 |
|
736 |
with gr.Row():
|
737 |
file_input = gr.File(
|
|
|
797 |
def process_and_download(file_bytes):
|
798 |
if file_bytes is None:
|
799 |
gr.Warning("Пожалуйста, загрузите файл")
|
800 |
+
return (pd.DataFrame(), None, None, None, "Ожидание файла...", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
801 |
|
802 |
try:
|
803 |
file_obj = io.BytesIO(file_bytes)
|
|
|
809 |
df = pd.read_excel(file_obj, sheet_name='Публикации')
|
810 |
original_count = len(df)
|
811 |
df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
|
812 |
+
removed_dupes = original_count - len(df)
|
813 |
+
dedup_message = f"Удалено {removed_dupes} дубликатов из {original_count} записей"
|
814 |
+
|
815 |
+
# Decluster the deduplicated data
|
816 |
+
decluster_count = len(df)
|
817 |
+
df = detector.decluster_texts(df, 'Выдержки из текста')
|
818 |
+
removed_clusters = decluster_count - len(df)
|
819 |
+
dedup_message += f"\nУдалено {removed_clusters} похожих текстов"
|
820 |
+
logger.info(f"Removed {removed_clusters} similar texts after declusterization")
|
821 |
|
822 |
processed_rows = []
|
823 |
total = len(df)
|