Commit 6036a45 (parent d007853): 3.53

Files changed:
- app.py: +405 -302
- requirements.txt: +1 -3
app.py
CHANGED
@@ -29,6 +29,40 @@ from transformers import (
    AutoModelForCausalLM  # 4 Qwen
)

class FallbackLLMSystem:
    def __init__(self):
        """Initialize fallback models for event detection and reasoning"""
@@ -249,98 +283,197 @@ class QwenSystem:
        raise


class TranslationSystem:
-    def __init__(self
-        """
-        Initialize translation system using Helsinki NLP model.
-        """
        try:
-            self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
-
        except Exception as e:
-            st.error(f"Error initializing
            raise

    def translate_text(self, text):
-        """
-        Translate single text using Helsinki NLP model with chunking for long texts.
-        """
        if pd.isna(text) or not isinstance(text, str) or not text.strip():
-            return text

        text = str(text).strip()
        if not text:
-            return

        try:
-
-            max_chunk_size = 512  # Standard transformer length
-
-            if len(text.split()) <= max_chunk_size:
-                # Direct translation for short texts
-                result = self.translator(text, max_length=512)
-                return result[0]['translation_text']
-
-            # Split long text into chunks by sentences
            chunks = self._split_into_chunks(text, max_chunk_size)
            translated_chunks = []

            for chunk in chunks:
-
-
-

            return ' '.join(translated_chunks)

        except Exception as e:
-            st.warning(f"Translation error: {str(e)}
            return text
-
-    def _split_into_chunks(self, text, max_length):
-        """
-        Split text into chunks by sentences, respecting max length.
-        """
-        # Simple sentence splitting by common punctuation
-        sentences = [s.strip() for s in text.replace('!', '.').replace('?', '.').split('.') if s.strip()]
-
-        chunks = []
-        current_chunk = []
-        current_length = 0
-
-        for sentence in sentences:
-            sentence_length = len(sentence.split())
-
-            if current_length + sentence_length > max_length:
-                if current_chunk:
-                    chunks.append(' '.join(current_chunk))
-                current_chunk = [sentence]
-                current_length = sentence_length
-            else:
-                current_chunk.append(sentence)
-                current_length += sentence_length

-        if current_chunk:
-            chunks.append(' '.join(current_chunk))
-
-        return chunks
-


def process_file(uploaded_file, model_choice, translation_method=None):
    df = None
    try:
        df = pd.read_excel(uploaded_file, sheet_name='Публикации')
        llm = init_langchain_llm(model_choice)
-        # Add fallback initialization here
-        fallback_llm = FallbackLLMSystem() if model_choice != "Local-MT5" else llm
-        translator = TranslationSystem(batch_size=5)

-        #
        groq_llm = ensure_groq_llm()
        if groq_llm is None:
            st.warning("Failed to initialize Groq LLM for impact estimation. Using fallback model.")

-        #
        df['Translated'] = ''
        df['Sentiment'] = ''
        df['Impact'] = ''
@@ -348,104 +481,104 @@ def process_file(uploaded_file, model_choice, translation_method=None):
        df['Event_Type'] = ''
        df['Event_Summary'] = ''

-        # Validate required columns
-        required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
-        missing_columns = [col for col in required_columns if col not in df.columns]
-        if missing_columns:
-            st.error(f"Error: The following required columns are missing: {', '.join(missing_columns)}")
-            return None
-
        # Deduplication
-
        df = df.groupby('Объект', group_keys=False).apply(
            lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
        ).reset_index(drop=True)
-
-        remaining_news_count = len(df)
-        duplicates_removed = original_news_count - remaining_news_count
-        st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
-
-        # Initialize progress tracking
-        progress_bar = st.progress(0)
-        status_text = st.empty()

-        # Process
-
-
-
-
-
-
-
-
-
-
-
-
-
-
            try:
-
-
-
                    row['Выдержки из текста'],
                    row['Объект']
                )
            except Exception as e:
                if 'rate limit' in str(e).lower():
-                    st.warning("Rate limit reached. Using fallback
-                    event_type, event_summary = fallback_llm.detect_events(
-                        row['Выдержки из текста'],
-                        row['Объект']
-                    )
-
-                df.at[idx, 'Event_Type'] = event_type
-                df.at[idx, 'Event_Summary'] = event_summary


-
-
-
-
-
-
-
-
-
-
-
-                    if 'rate limit' in str(e).lower():
-                        st.warning("Groq rate limit reached. Waiting before retry...")
-                        time.sleep(240)  # Wait 4 minutes
-                        continue
-
-                df.at[idx, 'Impact'] = impact
-                df.at[idx, 'Reasoning'] = reasoning
-
-                # Update progress
-                progress = (idx + 1) / len(df)
-                progress_bar.progress(progress)
-                status_text.text(f"Проанализировано {idx + 1} из {len(df)} новостей")
-
-            except Exception as e:
-                if 'rate limit' in str(e).lower():
-                    wait_time = 240  # 4 minutes wait for rate limit
-                    st.warning(f"Rate limit reached. Waiting {wait_time} seconds...")
-                    time.sleep(wait_time)
-                    continue
-                st.warning(f"Ошибка при обработке новости {idx + 1}: {str(e)}")
-                continue

-
-

-
-

        return df

    except Exception as e:
-        st.error(f"
        return None

def translate_reasoning_to_russian(llm, text):
@@ -539,81 +672,33 @@ def get_mapped_sentiment(result):


def analyze_sentiment(text):
-    finbert_result = get_mapped_sentiment(finbert(text, truncation=True, max_length=512)[0])
-    roberta_result = get_mapped_sentiment(roberta(text, truncation=True, max_length=512)[0])
-    finbert_tone_result = get_mapped_sentiment(finbert_tone(text, truncation=True, max_length=512)[0])
-
-    # Consider sentiment negative if any model says it's negative
-    if any(result == "Negative" for result in [finbert_result, roberta_result, finbert_tone_result]):
-        return "Negative"
-    elif all(result == "Positive" for result in [finbert_result, roberta_result, finbert_tone_result]):
-        return "Positive"
-    return "Neutral"
-
-def analyze_sentiment(text):
-    finbert_result = get_mapped_sentiment(finbert(text, truncation=True, max_length=512)[0])
-    roberta_result = get_mapped_sentiment(roberta(text, truncation=True, max_length=512)[0])
-    finbert_tone_result = get_mapped_sentiment(finbert_tone(text, truncation=True, max_length=512)[0])
-
-    # Count occurrences of each sentiment
-    sentiments = [finbert_result, roberta_result, finbert_tone_result]
-    sentiment_counts = {s: sentiments.count(s) for s in set(sentiments)}
-
-    # Return sentiment if at least two models agree, otherwise return Neutral
-    for sentiment, count in sentiment_counts.items():
-        if count >= 2:
-            return sentiment
-    return "Neutral"
-
-
-def detect_events(llm, text, entity):
-    """
-    Detect events in news text. This function works with both API-based LLMs and local models.
-    """
-    # Initialize default return values
-    event_type = "Нет"
-    summary = ""
-
    try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        response_text = response.content if hasattr(response, 'content') else str(response)
-
-        if "Тип:" in response_text and "Краткое описание:" in response_text:
-            type_part, summary_part = response_text.split("Краткое описание:")
-            event_type_temp = type_part.split("Тип:")[1].strip()
-            # Validate event type
-            valid_types = ["Отчетность", "РЦБ", "Суд", "Нет"]
-            if event_type_temp in valid_types:
-                event_type = event_type_temp
-                summary = summary_part.strip()

-        #
-
-        # Assuming llm is FallbackLLMSystem instance
-        event_type, summary = llm.detect_events(text, entity)
-
-    except Exception as e:
-        st.warning(f"Ошибка при анализе событий: {str(e)}")

-


def fuzzy_deduplicate(df, column, threshold=50):
    seen_texts = []
@@ -852,12 +937,13 @@ def create_output_file(df, uploaded_file, llm):
    wb.save(output)
    output.seek(0)
    return output
def main():
    with st.sidebar:
-        st.title("::: AI-анализ мониторинга новостей (v.3.
-        st.subheader("по материалам СКАН-ИНТЕРФАКС
-
-

        model_choice = st.radio(
            "Выберите модель для анализа:",
@@ -865,53 +951,75 @@ def main():
            key="model_selector",
            help="Выберите модель для анализа новостей"
        )

        st.markdown(
-
-
-
-
-
-
-
-
-        with st.expander("ℹ️ Инструкция"):
-            st.markdown("""
-            1. Выберите модель для анализа
-            2. Выберите метод перевода
-            3. Загрузите Excel файл с новостями
-            4. Дождитесь завершения анализа
-            5. Скачайте результаты анализа в формате Excel
-            """, unsafe_allow_html=True)
-
-
-        st.markdown(
-            """
-            <style>
-            .signature {
-                position: fixed;
-                right: 12px;
-                up: 12px;
-                font-size: 14px;
-                color: #FF0000;
-                opacity: 0.9;
-                z-index: 999;
-            }
-            </style>
-            <div class="signature">denis.pokrovsky.npff</div>
-            """,
-            unsafe_allow_html=True
        )

    st.title("Анализ мониторинга новостей")

    if 'processed_df' not in st.session_state:
        st.session_state.processed_df = None

-
-

    if uploaded_file is not None and st.session_state.processed_df is None:
-        start_time = time.time()
        try:
            st.session_state.processed_df = process_file(
                uploaded_file,
@@ -920,63 +1028,58 @@ def main():
            )

            if st.session_state.processed_df is not None:
-
-
-                preview_columns = ['Объект', 'Заголовок']
-                if 'Sentiment' in st.session_state.processed_df.columns:
-                    preview_columns.append('Sentiment')
-                if 'Impact' in st.session_state.processed_df.columns:
-                    preview_columns.append('Impact')
-
-                preview_df = st.session_state.processed_df[preview_columns].head()
-                st.dataframe(preview_df)

-                # Show
-                st.subheader("
-
-
-
-
-
-
-
-                st.
-
-
-
-
-
-
-

-
-                st.

        except Exception as e:
-            st.error(f"
            st.session_state.processed_df = None

-
-
-
-        output = create_output_file(
-            st.session_state.processed_df,
-            uploaded_file,
-            init_langchain_llm(model_choice)  # Initialize new LLM instance
-        )
-
-
-        end_time = time.time()
-        elapsed_time = end_time - start_time
-        formatted_time = format_elapsed_time(elapsed_time)
-        st.success(f"Обработка и анализ завершены за {formatted_time}.")
-
-        st.download_button(
-            label="Скачать результат анализа",
-            data=output,
-            file_name="результат_анализа.xlsx",
-            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-        )

if __name__ == "__main__":
    main()
    AutoModelForCausalLM  # 4 Qwen
)

+from threading import Event
+import threading
+from queue import Queue
+
+class ProcessControl:
+    def __init__(self):
+        self.pause_event = Event()
+        self.stop_event = Event()
+        self.pause_event.set()  # Start in non-paused state
+
+    def pause(self):
+        self.pause_event.clear()
+
+    def resume(self):
+        self.pause_event.set()
+
+    def stop(self):
+        self.stop_event.set()
+        self.pause_event.set()  # Ensure not stuck in pause
+
+    def reset(self):
+        self.stop_event.clear()
+        self.pause_event.set()
+
+    def is_paused(self):
+        return not self.pause_event.is_set()
+
+    def is_stopped(self):
+        return self.stop_event.is_set()
+
+    def wait_if_paused(self):
+        self.pause_event.wait()
+
+
class FallbackLLMSystem:
    def __init__(self):
        """Initialize fallback models for event detection and reasoning"""
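The new ProcessControl class packages the pause/stop signalling that the UI buttons drive: pause() clears an Event that wait_if_paused() blocks on, and stop() sets a separate flag the processing loop polls. A minimal standalone sketch of the same pattern; the worker function and item list here are made up for illustration and are not part of this commit:

from threading import Event

pause_event = Event()
stop_event = Event()
pause_event.set()  # start unpaused, as ProcessControl.__init__ does

def run(items):
    processed = []
    for item in items:
        if stop_event.is_set():   # Stop pressed: abandon the rest of the loop
            break
        pause_event.wait()        # Pause cleared the event; block here until resumed
        processed.append(item.upper())  # stand-in for the translate/classify work
    return processed

print(run(["a", "b", "c"]))  # ['A', 'B', 'C'] when never paused or stopped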
        raise


+class ProcessingUI:
+    def __init__(self):
+        if 'control' not in st.session_state:
+            st.session_state.control = ProcessControl()
+        if 'negative_container' not in st.session_state:
+            st.session_state.negative_container = st.empty()
+        if 'events_container' not in st.session_state:
+            st.session_state.events_container = st.empty()
+
+        # Create control buttons
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("⏸️ Pause/Resume" if not st.session_state.control.is_paused() else "▶️ Resume", key="pause_button"):
+                if st.session_state.control.is_paused():
+                    st.session_state.control.resume()
+                else:
+                    st.session_state.control.pause()
+
+        with col2:
+            if st.button("⏹️ Stop", key="stop_button"):
+                st.session_state.control.stop()
+
+        self.progress_bar = st.progress(0)
+        self.status = st.empty()
+
+    def update_progress(self, current, total):
+        progress = current / total
+        self.progress_bar.progress(progress)
+        self.status.text(f"Processing {current} of {total} items...")
+
+    def show_negative(self, entity, headline, analysis, impact=None):
+        with st.session_state.negative_container:
+            st.markdown(f"""
+                <div style='background-color: #ffebee; padding: 10px; border-radius: 5px; margin: 5px 0;'>
+                    <strong style='color: #d32f2f;'>⚠️ Negative Alert:</strong><br>
+                    <strong>Entity:</strong> {entity}<br>
+                    <strong>News:</strong> {headline}<br>
+                    <strong>Analysis:</strong> {analysis}<br>
+                    {f"<strong>Impact:</strong> {impact}<br>" if impact else ""}
+                </div>
+                """, unsafe_allow_html=True)
+
+    def show_event(self, entity, event_type, headline):
+        with st.session_state.events_container:
+            st.markdown(f"""
+                <div style='background-color: #e3f2fd; padding: 10px; border-radius: 5px; margin: 5px 0;'>
+                    <strong style='color: #1976d2;'>🔔 Event Detected:</strong><br>
+                    <strong>Entity:</strong> {entity}<br>
+                    <strong>Type:</strong> {event_type}<br>
+                    <strong>News:</strong> {headline}
+                </div>
+                """, unsafe_allow_html=True)
+
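ProcessingUI keeps the ProcessControl instance and the two st.empty() placeholders in st.session_state so the same objects are reused rather than recreated, and each alert rewrites its placeholder instead of stacking new elements. A minimal sketch of that single-slot placeholder pattern; this is a standalone Streamlit snippet, not this app's code:

import streamlit as st

# st.empty() is a single-element container: writing to it again replaces what was there.
if 'alert_box' not in st.session_state:
    st.session_state.alert_box = st.empty()

def show_alert(message):
    st.session_state.alert_box.markdown(f"**Alert:** {message}")

show_alert("first message")
show_alert("second message")  # the viewer sees only this one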
+class EventDetectionSystem:
+    def __init__(self):
+        try:
+            # Initialize models with specific labels
+            self.finbert = pipeline(
+                "text-classification",
+                model="ProsusAI/finbert",
+                return_all_scores=True
+            )
+            self.business_classifier = pipeline(
+                "text-classification",
+                model="yiyanghkust/finbert-tone",
+                return_all_scores=True
+            )
+            st.success("BERT models initialized for event detection")
+        except Exception as e:
+            st.error(f"Error initializing BERT models: {str(e)}")
+            raise
+
+    def detect_event_type(self, text, entity):
+        event_type = "Нет"
+        summary = ""
+
+        try:
+            # Ensure text is properly formatted
+            text = str(text).strip()
+            if not text:
+                return "Нет", "Empty text"
+
+            # Get predictions
+            finbert_scores = self.finbert(
+                text,
+                truncation=True,
+                max_length=512
+            )
+            business_scores = self.business_classifier(
+                text,
+                truncation=True,
+                max_length=512
+            )
+
+            # Get highest scoring predictions
+            finbert_pred = max(finbert_scores[0], key=lambda x: x['score'])
+            business_pred = max(business_scores[0], key=lambda x: x['score'])
+
+            # Map to event types with confidence threshold
+            confidence_threshold = 0.6
+            max_confidence = max(finbert_pred['score'], business_pred['score'])
+
+            if max_confidence >= confidence_threshold:
+                if any(term in text.lower() for term in ['отчет', 'выручка', 'прибыль', 'ebitda']):
+                    event_type = "Отчетность"
+                    summary = f"Финансовая отчетность (confidence: {max_confidence:.2f})"
+                elif any(term in text.lower() for term in ['облигаци', 'купон', 'дефолт', 'реструктуризац']):
+                    event_type = "РЦБ"
+                    summary = f"Событие РЦБ (confidence: {max_confidence:.2f})"
+                elif any(term in text.lower() for term in ['суд', 'иск', 'арбитраж']):
+                    event_type = "Суд"
+                    summary = f"Судебное разбирательство (confidence: {max_confidence:.2f})"
+
+            if event_type != "Нет":
+                summary += f"\nКомпания: {entity}"
+
+            return event_type, summary
+
+        except Exception as e:
+            st.warning(f"Event detection error: {str(e)}")
+            return "Нет", "Error in event detection"
+
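detect_event_type only assigns one of the three event labels when the stronger of the two FinBERT scores clears 0.6 and one of the Russian keyword groups appears in the text: the classifier score gates the decision, the keyword decides the type. The gate in isolation, with the confidence passed in as a plain number instead of a model score:

def classify_event(text, confidence):
    # Simplified mirror of the keyword mapping in detect_event_type above.
    if confidence < 0.6:
        return "Нет"
    lowered = text.lower()
    if any(term in lowered for term in ['отчет', 'выручка', 'прибыль', 'ebitda']):
        return "Отчетность"
    if any(term in lowered for term in ['облигаци', 'купон', 'дефолт', 'реструктуризац']):
        return "РЦБ"
    if any(term in lowered for term in ['суд', 'иск', 'арбитраж']):
        return "Суд"
    return "Нет"

print(classify_event("Компания раскрыла отчетность, выручка выросла", 0.85))  # Отчетность
print(classify_event("Суд принял иск к эмитенту", 0.40))                      # Нет (score below threshold)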
class TranslationSystem:
+    def __init__(self):
+        """Initialize translation system using Helsinki NLP model"""
        try:
+            self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
+            st.success("Translation system initialized")
        except Exception as e:
+            st.error(f"Error initializing translator: {str(e)}")
            raise

    def translate_text(self, text):
        if pd.isna(text) or not isinstance(text, str) or not text.strip():
+            return str(text) if pd.notna(text) else ""

        text = str(text).strip()
        if not text:
+            return ""

        try:
+            max_chunk_size = 450
            chunks = self._split_into_chunks(text, max_chunk_size)
            translated_chunks = []

            for chunk in chunks:
+                if not chunk.strip():
+                    continue
+
+                try:
+                    result = self.translator(chunk, max_length=512)
+                    if result and isinstance(result, list) and len(result) > 0:
+                        translated_chunks.append(result[0].get('translation_text', chunk))
+                    else:
+                        translated_chunks.append(chunk)
+                except Exception as e:
+                    st.warning(f"Chunk translation error: {str(e)}")
+                    translated_chunks.append(chunk)
+                time.sleep(0.1)

            return ' '.join(translated_chunks)

        except Exception as e:
+            st.warning(f"Translation error: {str(e)}")
            return text


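translate_text still calls self._split_into_chunks with max_chunk_size = 450, while the sentence-based implementation of that helper is deleted by the removals above, so an equivalent helper presumably remains elsewhere in the file. For reference, a minimal chunker in the spirit of the removed version; this is an assumption, not code from this commit:

def split_into_chunks(text, max_length=450):
    # Rough sentence split on common punctuation, then greedy packing by word count,
    # mirroring the removed _split_into_chunks helper.
    sentences = [s.strip() for s in text.replace('!', '.').replace('?', '.').split('.') if s.strip()]
    chunks, current, current_len = [], [], 0
    for sentence in sentences:
        n_words = len(sentence.split())
        if current and current_len + n_words > max_length:
            chunks.append(' '.join(current))
            current, current_len = [], 0
        current.append(sentence)
        current_len += n_words
    if current:
        chunks.append(' '.join(current))
    return chunks

print(split_into_chunks("Первое предложение. Второе предложение! Третье?", max_length=4))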
def process_file(uploaded_file, model_choice, translation_method=None):
    df = None
    try:
+        # Initialize UI and control systems
+        ui = ProcessingUI()
+        translator = TranslationSystem()
+        event_detector = EventDetectionSystem()
+
+        # Load and prepare data
        df = pd.read_excel(uploaded_file, sheet_name='Публикации')
        llm = init_langchain_llm(model_choice)

+        # Initialize Groq for impact estimation
        groq_llm = ensure_groq_llm()
        if groq_llm is None:
            st.warning("Failed to initialize Groq LLM for impact estimation. Using fallback model.")

+        # Prepare dataframe
+        text_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
+        for col in text_columns:
+            df[col] = df[col].fillna('').astype(str).apply(lambda x: x.strip())
+
+        # Initialize required columns
        df['Translated'] = ''
        df['Sentiment'] = ''
        df['Impact'] = ''
        df['Event_Type'] = ''
        df['Event_Summary'] = ''

        # Deduplication
+        original_count = len(df)
        df = df.groupby('Объект', group_keys=False).apply(
            lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
        ).reset_index(drop=True)
+        st.write(f"Removed {original_count - len(df)} duplicates.")

+        # Process rows
+        total_rows = len(df)
+        processed_rows = 0
+
+        for idx, row in df.iterrows():
+            # Check for stop/pause
+            if st.session_state.control.is_stopped():
+                st.warning("Processing stopped by user")
+                break
+
+            st.session_state.control.wait_if_paused()
+            if st.session_state.control.is_paused():
+                st.info("Processing paused... Click Resume to continue")
+                continue
+
+            try:
+                # Translation
+                translated_text = translator.translate_text(row['Выдержки из текста'])
+                df.at[idx, 'Translated'] = translated_text
+
+                # Sentiment analysis
+                sentiment = analyze_sentiment(translated_text)
+                df.at[idx, 'Sentiment'] = sentiment
+
+                # Event detection using BERT
+                event_type, event_summary = event_detector.detect_event_type(
+                    translated_text,
+                    row['Объект']
+                )
+                df.at[idx, 'Event_Type'] = event_type
+                df.at[idx, 'Event_Summary'] = event_summary
+
+                # Show events in real-time
+                if event_type != "Нет":
+                    ui.show_event(
+                        row['Объект'],
+                        event_type,
+                        row['Заголовок']
+                    )
+
+                # Handle negative sentiment
+                if sentiment == "Negative":
                    try:
+                        impact, reasoning = estimate_impact(
+                            groq_llm if groq_llm is not None else llm,
+                            translated_text,
                            row['Объект']
                        )
                    except Exception as e:
+                        impact = "Неопределенный эффект"
+                        reasoning = "Error in impact estimation"
                        if 'rate limit' in str(e).lower():
+                            st.warning("Rate limit reached. Using fallback values.")

+                    df.at[idx, 'Impact'] = impact
+                    df.at[idx, 'Reasoning'] = reasoning

+                    # Show negative alert in real-time
+                    ui.show_negative(
+                        row['Объект'],
+                        row['Заголовок'],
+                        reasoning,
+                        impact
+                    )
+
+                # Update progress
+                processed_rows += 1
+                ui.update_progress(processed_rows, total_rows)

+            except Exception as e:
+                st.warning(f"Error processing row {idx + 1}: {str(e)}")
+                continue

+            time.sleep(0.1)
+
+        # Handle stopped processing
+        if st.session_state.control.is_stopped() and len(df) > 0:
+            st.warning("Processing was stopped. Showing partial results.")
+            if st.button("Download Partial Results"):
+                output = create_output_file(df, uploaded_file, llm)
+                st.download_button(
+                    label="📊 Download Partial Results",
+                    data=output,
+                    file_name="partial_analysis.xlsx",
+                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                )

        return df

    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
        return None

def translate_reasoning_to_russian(llm, text):


def analyze_sentiment(text):
    try:
+        finbert_result = get_mapped_sentiment(
+            finbert(text, truncation=True, max_length=512)[0]
+        )
+        roberta_result = get_mapped_sentiment(
+            roberta(text, truncation=True, max_length=512)[0]
+        )
+        finbert_tone_result = get_mapped_sentiment(
+            finbert_tone(text, truncation=True, max_length=512)[0]
+        )
+
+        # Count occurrences of each sentiment
+        sentiments = [finbert_result, roberta_result, finbert_tone_result]
+        sentiment_counts = {s: sentiments.count(s) for s in set(sentiments)}
+
+        # Return sentiment if at least two models agree
+        for sentiment, count in sentiment_counts.items():
+            if count >= 2:
+                return sentiment

+        # Default to Neutral if no agreement
+        return "Neutral"

+    except Exception as e:
+        st.warning(f"Sentiment analysis error: {str(e)}")
+        return "Neutral"
+
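The rewritten analyze_sentiment keeps a 2-of-3 vote: whichever label at least two of FinBERT, RoBERTa and FinBERT-tone return wins, otherwise Neutral. The counting logic in isolation, with the model calls replaced by fixed labels for illustration:

def vote(labels):
    # Same majority rule as analyze_sentiment: a label returned by at least two models wins.
    counts = {label: labels.count(label) for label in set(labels)}
    for label, count in counts.items():
        if count >= 2:
            return label
    return "Neutral"

print(vote(["Negative", "Negative", "Neutral"]))  # Negative
print(vote(["Positive", "Negative", "Neutral"]))  # Neutral (no agreement)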
def fuzzy_deduplicate(df, column, threshold=50):
    seen_texts = []


    wb.save(output)
    output.seek(0)
    return output
+
def main():
+    st.set_page_config(layout="wide")
+
    with st.sidebar:
+        st.title("::: AI-анализ мониторинга новостей (v.3.54):::")
+        st.subheader("по материалам СКАН-ИНТЕРФАКС")

        model_choice = st.radio(
            "Выберите модель для анализа:",
            key="model_selector",
            help="Выберите модель для анализа новостей"
        )
+
+        uploaded_file = st.file_uploader(
+            "Выбирайте Excel-файл",
+            type="xlsx",
+            key="file_uploader"
+        )
+
        st.markdown(
+            """
+            Использованы технологии:
+            - Анализ естественного языка с помощью предтренированных нейросетей **BERT**
+            - Дополнительная обработка при помощи больших языковых моделей (**LLM**)
+            - Фреймворк **LangChain** для оркестрации
+            """,
+            unsafe_allow_html=True
        )

+    # Main content area
    st.title("Анализ мониторинга новостей")

+    # Initialize session state
    if 'processed_df' not in st.session_state:
        st.session_state.processed_df = None
+
+    # Create display areas
+    col1, col2 = st.columns([2, 1])

+    with col1:
+        # Area for real-time updates
+        st.subheader("Live Updates")
+        st.markdown("""
+            <style>
+            .stProgress .st-bo {
+                background-color: #f0f2f6;
+            }
+            .negative-alert {
+                background-color: #ffebee;
+                border-left: 5px solid #f44336;
+                padding: 10px;
+                margin: 5px 0;
+            }
+            .event-alert {
+                background-color: #e3f2fd;
+                border-left: 5px solid #2196f3;
+                padding: 10px;
+                margin: 5px 0;
+            }
+            </style>
+            """, unsafe_allow_html=True)
+
+    with col2:
+        # Area for statistics
+        st.subheader("Statistics")
+        if st.session_state.processed_df is not None:
+            st.metric("Total Items", len(st.session_state.processed_df))
+            st.metric("Negative Items",
+                len(st.session_state.processed_df[
+                    st.session_state.processed_df['Sentiment'] == 'Negative'
+                ])
+            )
+            st.metric("Events Detected",
+                len(st.session_state.processed_df[
+                    st.session_state.processed_df['Event_Type'] != 'Нет'
+                ])
+            )

    if uploaded_file is not None and st.session_state.processed_df is None:
+        start_time = time.time()
+
        try:
            st.session_state.processed_df = process_file(
                uploaded_file,
            )

            if st.session_state.processed_df is not None:
+                end_time = time.time()
+                elapsed_time = format_elapsed_time(end_time - start_time)

+                # Show results
+                st.subheader("Results Summary")
+
+                # Display statistics
+                stats_cols = st.columns(4)
+                with stats_cols[0]:
+                    st.metric("Total Processed", len(st.session_state.processed_df))
+                with stats_cols[1]:
+                    st.metric("Negative Items",
+                        len(st.session_state.processed_df[
+                            st.session_state.processed_df['Sentiment'] == 'Negative'
+                        ])
+                    )
+                with stats_cols[2]:
+                    st.metric("Events Detected",
+                        len(st.session_state.processed_df[
+                            st.session_state.processed_df['Event_Type'] != 'Нет'
+                        ])
+                    )
+                with stats_cols[3]:
+                    st.metric("Processing Time", elapsed_time)

+                # Show data previews
+                with st.expander("📊 Data Preview", expanded=True):
+                    preview_cols = ['Объект', 'Заголовок', 'Sentiment', 'Event_Type']
+                    st.dataframe(
+                        st.session_state.processed_df[preview_cols],
+                        use_container_width=True
+                    )
+
+                # Create downloadable report
+                output = create_output_file(
+                    st.session_state.processed_df,
+                    uploaded_file,
+                    init_langchain_llm(model_choice)
+                )
+
+                st.download_button(
+                    label="📥 Download Full Report",
+                    data=output,
+                    file_name="analysis_report.xlsx",
+                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    key='download_button'
+                )

        except Exception as e:
+            st.error(f"Error processing file: {str(e)}")
            st.session_state.processed_df = None


if __name__ == "__main__":
    main()
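main() now begins with st.set_page_config(layout="wide"). Streamlit accepts this call only once and only before any other st.* command in a script run, which holds here because main() is the script's entry point. A minimal standalone illustration of that constraint:

import streamlit as st

def main():
    # Must be the first Streamlit call of the script run.
    st.set_page_config(layout="wide")
    st.title("Анализ мониторинга новостей")

if __name__ == "__main__":
    main()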
requirements.txt
CHANGED
@@ -1,6 +1,5 @@
streamlit
pandas
-vaderSentiment
transformers>=4.30.0
torch
tqdm
@@ -20,5 +19,4 @@ pdfkit
Jinja2==3.1.2
langchain_openai
optimum
-
-deep_translator
streamlit
pandas
transformers>=4.30.0
torch
tqdm

Jinja2==3.1.2
langchain_openai
optimum
+sentencepiece
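requirements.txt drops vaderSentiment and deep_translator and adds sentencepiece: the Helsinki-NLP/opus-mt-ru-en pipeline used by TranslationSystem is a MarianMT model whose tokenizer is SentencePiece-based, and transformers cannot load it when the sentencepiece package is missing. A quick check that the dependency is in place, assuming the packages above are installed and the model can be downloaded:

from transformers import AutoTokenizer

# Fails with an error mentioning sentencepiece if the package is absent.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
print(tokenizer.tokenize("Пример текста для перевода"))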