pentarosarium committed on
Commit 6036a45 · 1 Parent(s): d007853
Files changed (2)
  1. app.py +405 -302
  2. requirements.txt +1 -3
app.py CHANGED
@@ -29,6 +29,40 @@ from transformers import (
     AutoModelForCausalLM  # 4 Qwen
 )
 
+from threading import Event
+import threading
+from queue import Queue
+
+class ProcessControl:
+    def __init__(self):
+        self.pause_event = Event()
+        self.stop_event = Event()
+        self.pause_event.set()  # Start in non-paused state
+
+    def pause(self):
+        self.pause_event.clear()
+
+    def resume(self):
+        self.pause_event.set()
+
+    def stop(self):
+        self.stop_event.set()
+        self.pause_event.set()  # Ensure not stuck in pause
+
+    def reset(self):
+        self.stop_event.clear()
+        self.pause_event.set()
+
+    def is_paused(self):
+        return not self.pause_event.is_set()
+
+    def is_stopped(self):
+        return self.stop_event.is_set()
+
+    def wait_if_paused(self):
+        self.pause_event.wait()
+
+
 class FallbackLLMSystem:
     def __init__(self):
         """Initialize fallback models for event detection and reasoning"""
@@ -249,98 +283,197 @@ class QwenSystem:
             raise
 
 
+class ProcessingUI:
+    def __init__(self):
+        if 'control' not in st.session_state:
+            st.session_state.control = ProcessControl()
+        if 'negative_container' not in st.session_state:
+            st.session_state.negative_container = st.empty()
+        if 'events_container' not in st.session_state:
+            st.session_state.events_container = st.empty()
+
+        # Create control buttons
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("⏸️ Pause/Resume" if not st.session_state.control.is_paused() else "▶️ Resume", key="pause_button"):
+                if st.session_state.control.is_paused():
+                    st.session_state.control.resume()
+                else:
+                    st.session_state.control.pause()
+
+        with col2:
+            if st.button("⏹️ Stop", key="stop_button"):
+                st.session_state.control.stop()
+
+        self.progress_bar = st.progress(0)
+        self.status = st.empty()
+
+    def update_progress(self, current, total):
+        progress = current / total
+        self.progress_bar.progress(progress)
+        self.status.text(f"Processing {current} of {total} items...")
+
+    def show_negative(self, entity, headline, analysis, impact=None):
+        with st.session_state.negative_container:
+            st.markdown(f"""
+            <div style='background-color: #ffebee; padding: 10px; border-radius: 5px; margin: 5px 0;'>
+                <strong style='color: #d32f2f;'>⚠️ Negative Alert:</strong><br>
+                <strong>Entity:</strong> {entity}<br>
+                <strong>News:</strong> {headline}<br>
+                <strong>Analysis:</strong> {analysis}<br>
+                {f"<strong>Impact:</strong> {impact}<br>" if impact else ""}
+            </div>
+            """, unsafe_allow_html=True)
+
+    def show_event(self, entity, event_type, headline):
+        with st.session_state.events_container:
+            st.markdown(f"""
+            <div style='background-color: #e3f2fd; padding: 10px; border-radius: 5px; margin: 5px 0;'>
+                <strong style='color: #1976d2;'>🔔 Event Detected:</strong><br>
+                <strong>Entity:</strong> {entity}<br>
+                <strong>Type:</strong> {event_type}<br>
+                <strong>News:</strong> {headline}
+            </div>
+            """, unsafe_allow_html=True)
+
+class EventDetectionSystem:
+    def __init__(self):
+        try:
+            # Initialize models with specific labels
+            self.finbert = pipeline(
+                "text-classification",
+                model="ProsusAI/finbert",
+                return_all_scores=True
+            )
+            self.business_classifier = pipeline(
+                "text-classification",
+                model="yiyanghkust/finbert-tone",
+                return_all_scores=True
+            )
+            st.success("BERT models initialized for event detection")
+        except Exception as e:
+            st.error(f"Error initializing BERT models: {str(e)}")
+            raise
+
+    def detect_event_type(self, text, entity):
+        event_type = "Нет"
+        summary = ""
+
+        try:
+            # Ensure text is properly formatted
+            text = str(text).strip()
+            if not text:
+                return "Нет", "Empty text"
+
+            # Get predictions
+            finbert_scores = self.finbert(
+                text,
+                truncation=True,
+                max_length=512
+            )
+            business_scores = self.business_classifier(
+                text,
+                truncation=True,
+                max_length=512
+            )
+
+            # Get highest scoring predictions
+            finbert_pred = max(finbert_scores[0], key=lambda x: x['score'])
+            business_pred = max(business_scores[0], key=lambda x: x['score'])
+
+            # Map to event types with confidence threshold
+            confidence_threshold = 0.6
+            max_confidence = max(finbert_pred['score'], business_pred['score'])
+
+            if max_confidence >= confidence_threshold:
+                if any(term in text.lower() for term in ['отчет', 'выручка', 'прибыль', 'ebitda']):
+                    event_type = "Отчетность"
+                    summary = f"Финансовая отчетность (confidence: {max_confidence:.2f})"
+                elif any(term in text.lower() for term in ['облигаци', 'купон', 'дефолт', 'реструктуризац']):
+                    event_type = "РЦБ"
+                    summary = f"Событие РЦБ (confidence: {max_confidence:.2f})"
+                elif any(term in text.lower() for term in ['суд', 'иск', 'арбитраж']):
+                    event_type = "Суд"
+                    summary = f"Судебное разбирательство (confidence: {max_confidence:.2f})"
+
+            if event_type != "Нет":
+                summary += f"\nКомпания: {entity}"
+
+            return event_type, summary
+
+        except Exception as e:
+            st.warning(f"Event detection error: {str(e)}")
+            return "Нет", "Error in event detection"
+
 class TranslationSystem:
-    def __init__(self, batch_size=5):
-        """
-        Initialize translation system using Helsinki NLP model.
-        """
+    def __init__(self):
+        """Initialize translation system using Helsinki NLP model"""
         try:
-            self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")  # Note: ru-en for Russian to English
-            self.batch_size = batch_size
+            self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
+            st.success("Translation system initialized")
         except Exception as e:
-            st.error(f"Error initializing Helsinki NLP translator: {str(e)}")
+            st.error(f"Error initializing translator: {str(e)}")
             raise
 
     def translate_text(self, text):
-        """
-        Translate single text using Helsinki NLP model with chunking for long texts.
-        """
         if pd.isna(text) or not isinstance(text, str) or not text.strip():
-            return text
+            return str(text) if pd.notna(text) else ""
 
         text = str(text).strip()
         if not text:
-            return text
+            return ""
 
         try:
-            # Helsinki NLP model typically has a max length limit
-            max_chunk_size = 512  # Standard transformer length
-
-            if len(text.split()) <= max_chunk_size:
-                # Direct translation for short texts
-                result = self.translator(text, max_length=512)
-                return result[0]['translation_text']
-
-            # Split long text into chunks by sentences
+            max_chunk_size = 450
             chunks = self._split_into_chunks(text, max_chunk_size)
             translated_chunks = []
 
             for chunk in chunks:
-                result = self.translator(chunk, max_length=512)
-                translated_chunks.append(result[0]['translation_text'])
-                time.sleep(0.1)  # Small delay between chunks
+                if not chunk.strip():
+                    continue
+
+                try:
+                    result = self.translator(chunk, max_length=512)
+                    if result and isinstance(result, list) and len(result) > 0:
+                        translated_chunks.append(result[0].get('translation_text', chunk))
+                    else:
+                        translated_chunks.append(chunk)
+                except Exception as e:
+                    st.warning(f"Chunk translation error: {str(e)}")
+                    translated_chunks.append(chunk)
+                time.sleep(0.1)
 
             return ' '.join(translated_chunks)
 
         except Exception as e:
-            st.warning(f"Translation error: {str(e)}. Using original text.")
+            st.warning(f"Translation error: {str(e)}")
             return text
-
-    def _split_into_chunks(self, text, max_length):
-        """
-        Split text into chunks by sentences, respecting max length.
-        """
-        # Simple sentence splitting by common punctuation
-        sentences = [s.strip() for s in text.replace('!', '.').replace('?', '.').split('.') if s.strip()]
-
-        chunks = []
-        current_chunk = []
-        current_length = 0
-
-        for sentence in sentences:
-            sentence_length = len(sentence.split())
-
-            if current_length + sentence_length > max_length:
-                if current_chunk:
-                    chunks.append(' '.join(current_chunk))
-                current_chunk = [sentence]
-                current_length = sentence_length
-            else:
-                current_chunk.append(sentence)
-                current_length += sentence_length
 
-        if current_chunk:
-            chunks.append(' '.join(current_chunk))
-
-        return chunks
-
 
 
 def process_file(uploaded_file, model_choice, translation_method=None):
     df = None
     try:
+        # Initialize UI and control systems
+        ui = ProcessingUI()
+        translator = TranslationSystem()
+        event_detector = EventDetectionSystem()
+
+        # Load and prepare data
         df = pd.read_excel(uploaded_file, sheet_name='Публикации')
         llm = init_langchain_llm(model_choice)
-        # Add fallback initialization here
-        fallback_llm = FallbackLLMSystem() if model_choice != "Local-MT5" else llm
-        translator = TranslationSystem(batch_size=5)
 
-        # Pre-initialize Groq for impact estimation
+        # Initialize Groq for impact estimation
        groq_llm = ensure_groq_llm()
         if groq_llm is None:
             st.warning("Failed to initialize Groq LLM for impact estimation. Using fallback model.")
 
-        # Initialize all required columns first
+        # Prepare dataframe
+        text_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
+        for col in text_columns:
+            df[col] = df[col].fillna('').astype(str).apply(lambda x: x.strip())
+
+        # Initialize required columns
         df['Translated'] = ''
         df['Sentiment'] = ''
         df['Impact'] = ''
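Note: the hunk above replaces LLM-based event detection with a keyword-plus-confidence scheme on two BERT classifiers, and slims TranslationSystem down to a plain Helsinki-NLP pipeline. For reference, a minimal standalone sketch of the call TranslationSystem wraps (model name taken from the diff; the sample sentence is illustrative, and the first run downloads the model):

```python
from transformers import pipeline

# Same task and model the new TranslationSystem uses for Russian-to-English.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")

result = translator("Компания опубликовала отчетность за квартал.", max_length=512)
print(result[0]["translation_text"])  # list of dicts with a 'translation_text' key
```

One thing to watch: this commit deletes the `_split_into_chunks` helper, yet the new `translate_text` still calls `self._split_into_chunks(...)` (it survives as an unchanged context line), so any non-empty input would raise AttributeError unless the helper still exists elsewhere in the file.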
@@ -348,104 +481,104 @@ def process_file(uploaded_file, model_choice, translation_method=None):
         df['Event_Type'] = ''
         df['Event_Summary'] = ''
 
-        # Validate required columns
-        required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
-        missing_columns = [col for col in required_columns if col not in df.columns]
-        if missing_columns:
-            st.error(f"Error: The following required columns are missing: {', '.join(missing_columns)}")
-            return None
-
         # Deduplication
-        original_news_count = len(df)
+        original_count = len(df)
         df = df.groupby('Объект', group_keys=False).apply(
             lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
         ).reset_index(drop=True)
-
-        remaining_news_count = len(df)
-        duplicates_removed = original_news_count - remaining_news_count
-        st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
-
-        # Initialize progress tracking
-        progress_bar = st.progress(0)
-        status_text = st.empty()
+        st.write(f"Removed {original_count - len(df)} duplicates.")
 
-        # Process in batches
-        batch_size = 5
-        for i in range(0, len(df), batch_size):
-            batch_df = df.iloc[i:i+batch_size]
-
-            for idx, row in batch_df.iterrows():
-                try:
-                    # Translation with Helsinki NLP
-                    translated_text = translator.translate_text(row['Выдержки из текста'])
-                    df.at[idx, 'Translated'] = translated_text
-
-                    # Sentiment analysis
-                    sentiment = analyze_sentiment(translated_text)
-                    df.at[idx, 'Sentiment'] = sentiment
-
+        # Process rows
+        total_rows = len(df)
+        processed_rows = 0
+
+        for idx, row in df.iterrows():
+            # Check for stop/pause
+            if st.session_state.control.is_stopped():
+                st.warning("Processing stopped by user")
+                break
+
+            st.session_state.control.wait_if_paused()
+            if st.session_state.control.is_paused():
+                st.info("Processing paused... Click Resume to continue")
+                continue
+
+            try:
+                # Translation
+                translated_text = translator.translate_text(row['Выдержки из текста'])
+                df.at[idx, 'Translated'] = translated_text
+
+                # Sentiment analysis
+                sentiment = analyze_sentiment(translated_text)
+                df.at[idx, 'Sentiment'] = sentiment
+
+                # Event detection using BERT
+                event_type, event_summary = event_detector.detect_event_type(
+                    translated_text,
+                    row['Объект']
+                )
+                df.at[idx, 'Event_Type'] = event_type
+                df.at[idx, 'Event_Summary'] = event_summary
+
+                # Show events in real-time
+                if event_type != "Нет":
+                    ui.show_event(
+                        row['Объект'],
+                        event_type,
+                        row['Заголовок']
+                    )
+
+                # Handle negative sentiment
+                if sentiment == "Negative":
                     try:
-                        # Try with primary LLM
-                        event_type, event_summary = detect_events(
-                            llm,
-                            row['Выдержки из текста'],
+                        impact, reasoning = estimate_impact(
+                            groq_llm if groq_llm is not None else llm,
+                            translated_text,
                             row['Объект']
                         )
                     except Exception as e:
+                        impact = "Неопределенный эффект"
+                        reasoning = "Error in impact estimation"
                         if 'rate limit' in str(e).lower():
-                            st.warning("Rate limit reached. Using fallback model for event detection.")
-                            event_type, event_summary = fallback_llm.detect_events(
-                                row['Выдержки из текста'],
-                                row['Объект']
-                            )
-
-                    df.at[idx, 'Event_Type'] = event_type
-                    df.at[idx, 'Event_Summary'] = event_summary
+                            st.warning("Rate limit reached. Using fallback values.")
 
-                    # Similar for impact estimation
-                    if sentiment == "Negative":
-                        try:
-                            impact, reasoning = estimate_impact(
-                                groq_llm if groq_llm is not None else llm,
-                                translated_text,
-                                row['Объект']
-                            )
-                            df.at[idx, 'Impact'] = impact
-                            df.at[idx, 'Reasoning'] = reasoning
-                        except Exception as e:
-                            if 'rate limit' in str(e).lower():
-                                st.warning("Groq rate limit reached. Waiting before retry...")
-                                time.sleep(240)  # Wait 4 minutes
-                                continue
-
-                    df.at[idx, 'Impact'] = impact
-                    df.at[idx, 'Reasoning'] = reasoning
+                    df.at[idx, 'Impact'] = impact
+                    df.at[idx, 'Reasoning'] = reasoning
+
+                    # Show negative alert in real-time
+                    ui.show_negative(
+                        row['Объект'],
+                        row['Заголовок'],
+                        reasoning,
+                        impact
+                    )
 
-                    # Update progress
-                    progress = (idx + 1) / len(df)
-                    progress_bar.progress(progress)
-                    status_text.text(f"Проанализировано {idx + 1} из {len(df)} новостей")
+                # Update progress
+                processed_rows += 1
+                ui.update_progress(processed_rows, total_rows)
 
-                except Exception as e:
-                    if 'rate limit' in str(e).lower():
-                        wait_time = 240  # 4 minutes wait for rate limit
-                        st.warning(f"Rate limit reached. Waiting {wait_time} seconds...")
-                        time.sleep(wait_time)
-                        continue
-                    st.warning(f"Ошибка при обработке новости {idx + 1}: {str(e)}")
-                    continue
+            except Exception as e:
+                st.warning(f"Error processing row {idx + 1}: {str(e)}")
+                continue
 
-                # Small delay between items
-                time.sleep(0.5)
+            time.sleep(0.1)
 
-            # Delay between batches
-            time.sleep(2)
+        # Handle stopped processing
+        if st.session_state.control.is_stopped() and len(df) > 0:
+            st.warning("Processing was stopped. Showing partial results.")
+            if st.button("Download Partial Results"):
+                output = create_output_file(df, uploaded_file, llm)
+                st.download_button(
+                    label="📊 Download Partial Results",
+                    data=output,
+                    file_name="partial_analysis.xlsx",
+                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                )
 
         return df
 
     except Exception as e:
-        st.error(f"Ошибка при обработке файла: {str(e)}")
+        st.error(f"Error processing file: {str(e)}")
         return None
 
 def translate_reasoning_to_russian(llm, text):
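Note: the rewritten loop replaces the old fixed-size batching with per-row processing gated by the new ProcessControl from the first hunk. A condensed, runnable sketch of that stop/pause pattern, assuming a ProcessControl-like object is shared with the UI thread (the worker, item count, and sleep are illustrative stand-ins, not part of this commit):

```python
import threading
import time
from threading import Event

class ProcessControl:
    # Same event logic as the class added in this commit (abridged).
    def __init__(self):
        self.pause_event = Event()
        self.stop_event = Event()
        self.pause_event.set()  # start unpaused

    def pause(self):
        self.pause_event.clear()

    def resume(self):
        self.pause_event.set()

    def stop(self):
        self.stop_event.set()
        self.pause_event.set()  # never leave a paused worker stuck

    def is_stopped(self):
        return self.stop_event.is_set()

    def wait_if_paused(self):
        self.pause_event.wait()

control = ProcessControl()

def worker(items):
    for item in items:
        if control.is_stopped():   # Stop pressed: abort the loop
            break
        control.wait_if_paused()   # blocks here while paused
        time.sleep(0.05)           # stand-in for real per-row work

t = threading.Thread(target=worker, args=(range(100),))
t.start()
control.pause()    # worker blocks at wait_if_paused()
control.resume()   # worker continues
control.stop()     # worker exits at its next is_stopped() check
t.join()
```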
@@ -539,81 +672,33 @@ def get_mapped_sentiment(result):
 
 
 def analyze_sentiment(text):
-    finbert_result = get_mapped_sentiment(finbert(text, truncation=True, max_length=512)[0])
-    roberta_result = get_mapped_sentiment(roberta(text, truncation=True, max_length=512)[0])
-    finbert_tone_result = get_mapped_sentiment(finbert_tone(text, truncation=True, max_length=512)[0])
-
-    # Consider sentiment negative if any model says it's negative
-    if any(result == "Negative" for result in [finbert_result, roberta_result, finbert_tone_result]):
-        return "Negative"
-    elif all(result == "Positive" for result in [finbert_result, roberta_result, finbert_tone_result]):
-        return "Positive"
-    return "Neutral"
-
-def analyze_sentiment(text):
-    finbert_result = get_mapped_sentiment(finbert(text, truncation=True, max_length=512)[0])
-    roberta_result = get_mapped_sentiment(roberta(text, truncation=True, max_length=512)[0])
-    finbert_tone_result = get_mapped_sentiment(finbert_tone(text, truncation=True, max_length=512)[0])
-
-    # Count occurrences of each sentiment
-    sentiments = [finbert_result, roberta_result, finbert_tone_result]
-    sentiment_counts = {s: sentiments.count(s) for s in set(sentiments)}
-
-    # Return sentiment if at least two models agree, otherwise return Neutral
-    for sentiment, count in sentiment_counts.items():
-        if count >= 2:
-            return sentiment
-    return "Neutral"
-
-
-def detect_events(llm, text, entity):
-    """
-    Detect events in news text. This function works with both API-based LLMs and local models.
-    """
-    # Initialize default return values
-    event_type = "Нет"
-    summary = ""
-
     try:
-        # Handle API-based LLMs (Groq, GPT-4, Qwen)
-        if hasattr(llm, 'invoke'):
-            template = """
-            Проанализируйте следующую новость о компании "{entity}" и определите наличие следующих событий:
-            1. Публикация отчетности и ключевые показатели (выручка, прибыль, EBITDA)
-            2. События на рынке ценных бумаг (погашение облигаций, выплата/невыплата купона, дефолт, реструктуризация)
-            3. Судебные иски или юридические действия против компании, акционеров, менеджеров
-
-            Новость: {text}
-
-            Ответьте в следующем формате:
-            Тип: ["Отчетность" или "РЦБ" или "Суд" или "Нет"]
-            Краткое описание: [краткое описание события на русском языке, не более 2 предложений]
-            """
-
-            prompt = PromptTemplate(template=template, input_variables=["entity", "text"])
-            chain = prompt | llm
-            response = chain.invoke({"entity": entity, "text": text})
-
-            response_text = response.content if hasattr(response, 'content') else str(response)
-
-            if "Тип:" in response_text and "Краткое описание:" in response_text:
-                type_part, summary_part = response_text.split("Краткое описание:")
-                event_type_temp = type_part.split("Тип:")[1].strip()
-                # Validate event type
-                valid_types = ["Отчетность", "РЦБ", "Суд", "Нет"]
-                if event_type_temp in valid_types:
-                    event_type = event_type_temp
-                    summary = summary_part.strip()
+        finbert_result = get_mapped_sentiment(
+            finbert(text, truncation=True, max_length=512)[0]
+        )
+        roberta_result = get_mapped_sentiment(
+            roberta(text, truncation=True, max_length=512)[0]
+        )
+        finbert_tone_result = get_mapped_sentiment(
+            finbert_tone(text, truncation=True, max_length=512)[0]
+        )
+
+        # Count occurrences of each sentiment
+        sentiments = [finbert_result, roberta_result, finbert_tone_result]
+        sentiment_counts = {s: sentiments.count(s) for s in set(sentiments)}
+
+        # Return sentiment if at least two models agree
+        for sentiment, count in sentiment_counts.items():
+            if count >= 2:
+                return sentiment
 
-        # Handle local MT5 model
-        else:
-            # Assuming llm is FallbackLLMSystem instance
-            event_type, summary = llm.detect_events(text, entity)
-
-    except Exception as e:
-        st.warning(f"Ошибка при анализе событий: {str(e)}")
+        # Default to Neutral if no agreement
+        return "Neutral"
 
-    return event_type, summary
+    except Exception as e:
+        st.warning(f"Sentiment analysis error: {str(e)}")
+        return "Neutral"
 
 
 def fuzzy_deduplicate(df, column, threshold=50):
     seen_texts = []
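Note: the old file defined analyze_sentiment twice (the second definition silently shadowed the first); this hunk keeps a single version with the two-of-three vote and wraps it in a try/except. The voting rule in isolation, as a pure-function sketch with mocked labels (the real pipelines need model downloads):

```python
from collections import Counter

def vote(labels, quorum=2):
    # Return the first label at least `quorum` of the three models agree on, else Neutral.
    label, count = Counter(labels).most_common(1)[0]
    return label if count >= quorum else "Neutral"

assert vote(["Negative", "Negative", "Neutral"]) == "Negative"
assert vote(["Positive", "Negative", "Neutral"]) == "Neutral"  # all disagree
```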
@@ -852,12 +937,13 @@ def create_output_file(df, uploaded_file, llm):
     wb.save(output)
     output.seek(0)
     return output
+
 def main():
+    st.set_page_config(layout="wide")
+
     with st.sidebar:
-        st.title("::: AI-анализ мониторинга новостей (v.3.51):::")
-        st.subheader("по материалам СКАН-ИНТЕРФАКС ")
-
-
+        st.title("::: AI-анализ мониторинга новостей (v.3.54):::")
+        st.subheader("по материалам СКАН-ИНТЕРФАКС")
 
         model_choice = st.radio(
             "Выберите модель для анализа:",
@@ -865,53 +951,75 @@ def main():
             key="model_selector",
             help="Выберите модель для анализа новостей"
         )
+
+        uploaded_file = st.file_uploader(
+            "Выбирайте Excel-файл",
+            type="xlsx",
+            key="file_uploader"
+        )
+
         st.markdown(
-            """
-            Использованы технологии:
-            - Анализ естественного языка с помощью предтренированных нейросетей **BERT**,<br/>
-            - Дополнительная обработка при помощи больших языковых моделей (**LLM**),<br/>
-            - объединенные при помощи фреймворка **LangChain**.<br>
-            """,
-            unsafe_allow_html=True)
-
-        with st.expander("ℹ️ Инструкция"):
-            st.markdown("""
-            1. Выберите модель для анализа
-            2. Выберите метод перевода
-            3. Загрузите Excel файл с новостями
-            4. Дождитесь завершения анализа
-            5. Скачайте результаты анализа в формате Excel
-            """, unsafe_allow_html=True)
-
-
-        st.markdown(
-            """
-            <style>
-            .signature {
-                position: fixed;
-                right: 12px;
-                up: 12px;
-                font-size: 14px;
-                color: #FF0000;
-                opacity: 0.9;
-                z-index: 999;
-            }
-            </style>
-            <div class="signature">denis.pokrovsky.npff</div>
-            """,
-            unsafe_allow_html=True
+            """
+            Использованы технологии:
+            - Анализ естественного языка с помощью предтренированных нейросетей **BERT**
+            - Дополнительная обработка при помощи больших языковых моделей (**LLM**)
+            - Фреймворк **LangChain** для оркестрации
+            """,
+            unsafe_allow_html=True
         )
 
+    # Main content area
     st.title("Анализ мониторинга новостей")
 
+    # Initialize session state
     if 'processed_df' not in st.session_state:
         st.session_state.processed_df = None
+
+    # Create display areas
+    col1, col2 = st.columns([2, 1])
 
-    # Single file uploader with unique key
-    uploaded_file = st.sidebar.file_uploader("Выбирайте Excel-файл", type="xlsx", key="unique_file_uploader")
+    with col1:
+        # Area for real-time updates
+        st.subheader("Live Updates")
+        st.markdown("""
+        <style>
+        .stProgress .st-bo {
+            background-color: #f0f2f6;
+        }
+        .negative-alert {
+            background-color: #ffebee;
+            border-left: 5px solid #f44336;
+            padding: 10px;
+            margin: 5px 0;
+        }
+        .event-alert {
+            background-color: #e3f2fd;
+            border-left: 5px solid #2196f3;
+            padding: 10px;
+            margin: 5px 0;
+        }
+        </style>
+        """, unsafe_allow_html=True)
+
+    with col2:
+        # Area for statistics
+        st.subheader("Statistics")
+        if st.session_state.processed_df is not None:
+            st.metric("Total Items", len(st.session_state.processed_df))
+            st.metric("Negative Items",
+                len(st.session_state.processed_df[
+                    st.session_state.processed_df['Sentiment'] == 'Negative'
+                ])
+            )
+            st.metric("Events Detected",
+                len(st.session_state.processed_df[
+                    st.session_state.processed_df['Event_Type'] != 'Нет'
+                ])
+            )
 
     if uploaded_file is not None and st.session_state.processed_df is None:
-        start_time = time.time()
+        start_time = time.time()
+
         try:
             st.session_state.processed_df = process_file(
                 uploaded_file,
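Note: the new two-column layout in the hunk above leans on Streamlit's rerun model: st.set_page_config must be the first st.* call of a script run, and st.session_state is what carries data across the reruns that widget clicks trigger. A minimal sketch of the pattern used here (runnable with `streamlit run`, assuming no other st.* call executes earlier at import time):

```python
import streamlit as st

st.set_page_config(layout="wide")  # must precede every other st.* call

# session_state persists across reruns, so results survive button clicks.
if 'processed_df' not in st.session_state:
    st.session_state.processed_df = None

col1, col2 = st.columns([2, 1])  # 2:1 width ratio, as in the diff
with col1:
    st.subheader("Live Updates")
with col2:
    st.subheader("Statistics")
```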
@@ -920,63 +1028,58 @@ def main():
             )
 
             if st.session_state.processed_df is not None:
-                # Show preview with safe column access
-                st.subheader("Предпросмотр данных")
-                preview_columns = ['Объект', 'Заголовок']
-                if 'Sentiment' in st.session_state.processed_df.columns:
-                    preview_columns.append('Sentiment')
-                if 'Impact' in st.session_state.processed_df.columns:
-                    preview_columns.append('Impact')
-
-                preview_df = st.session_state.processed_df[preview_columns].head()
-                st.dataframe(preview_df)
+                end_time = time.time()
+                elapsed_time = format_elapsed_time(end_time - start_time)
 
-                # Show monitoring results
-                st.subheader("Предпросмотр мониторинга событий и риск-факторов эмитентов")
-                if 'Event_Type' in st.session_state.processed_df.columns:
-                    monitoring_df = st.session_state.processed_df[
-                        (st.session_state.processed_df['Event_Type'] != 'Нет') &
-                        (st.session_state.processed_df['Event_Type'].notna())
-                    ][['Объект', 'Заголовок', 'Event_Type', 'Event_Summary']].head()
-
-                    if len(monitoring_df) > 0:
-                        st.dataframe(monitoring_df)
-                    else:
-                        st.info("Не обнаружено значимых событий для мониторинга")
-
-                # Create analysis data
-                analysis_df = create_analysis_data(st.session_state.processed_df)
-                st.subheader("Анализ")
-                st.dataframe(analysis_df)
+                # Show results
+                st.subheader("Results Summary")
+
+                # Display statistics
+                stats_cols = st.columns(4)
+                with stats_cols[0]:
+                    st.metric("Total Processed", len(st.session_state.processed_df))
+                with stats_cols[1]:
+                    st.metric("Negative Items",
+                        len(st.session_state.processed_df[
+                            st.session_state.processed_df['Sentiment'] == 'Negative'
+                        ])
+                    )
+                with stats_cols[2]:
+                    st.metric("Events Detected",
+                        len(st.session_state.processed_df[
+                            st.session_state.processed_df['Event_Type'] != 'Нет'
+                        ])
+                    )
+                with stats_cols[3]:
+                    st.metric("Processing Time", elapsed_time)
 
-            else:
-                st.error("Ошибка при обработке файла")
+                # Show data previews
+                with st.expander("📊 Data Preview", expanded=True):
+                    preview_cols = ['Объект', 'Заголовок', 'Sentiment', 'Event_Type']
+                    st.dataframe(
+                        st.session_state.processed_df[preview_cols],
+                        use_container_width=True
+                    )
+
+                # Create downloadable report
+                output = create_output_file(
+                    st.session_state.processed_df,
+                    uploaded_file,
+                    init_langchain_llm(model_choice)
+                )
+
+                st.download_button(
+                    label="📥 Download Full Report",
+                    data=output,
+                    file_name="analysis_report.xlsx",
+                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    key='download_button'
+                )
 
         except Exception as e:
-            st.error(f"Ошибка при обработке файла: {str(e)}")
+            st.error(f"Error processing file: {str(e)}")
             st.session_state.processed_df = None
 
-
-
-
-        output = create_output_file(
-            st.session_state.processed_df,
-            uploaded_file,
-            init_langchain_llm(model_choice)  # Initialize new LLM instance
-        )
-
-
-        end_time = time.time()
-        elapsed_time = end_time - start_time
-        formatted_time = format_elapsed_time(elapsed_time)
-        st.success(f"Обработка и анализ завершены за {formatted_time}.")
-
-        st.download_button(
-            label="Скачать результат анализа",
-            data=output,
-            file_name="результат_анализа.xlsx",
-            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-        )
 
 if __name__ == "__main__":
     main()
requirements.txt CHANGED
@@ -1,6 +1,5 @@
 streamlit
 pandas
-vaderSentiment
 transformers>=4.30.0
 torch
 tqdm
@@ -20,5 +19,4 @@ pdfkit
 Jinja2==3.1.2
 langchain_openai
 optimum
-googletrans
-deep_translator
+sentencepiece
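Note: dropping googletrans and deep_translator while adding sentencepiece lines up with the switch to the local Helsinki-NLP model: the Marian tokenizer behind Helsinki-NLP/opus-mt-ru-en is sentencepiece-backed and fails to load without it. A quick sanity check (first run downloads the tokenizer files):

```python
# Confirm the Marian tokenizer's sentencepiece backend is available.
import sentencepiece  # the dependency this commit adds to requirements.txt
from transformers import MarianTokenizer

tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
print(tok.tokenize("Компания опубликовала отчетность."))
```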