pentarosarium committed on
Commit 0d683d7 · 1 Parent(s): f70decf

progress more 69

Files changed (2):
  1. app.py +70 -121
  2. requirements.txt +2 -1
app.py CHANGED
@@ -8,7 +8,6 @@ from pymystem3 import Mystem
 import io
 from rapidfuzz import fuzz
 from tqdm.auto import tqdm
-import time
 import torch
 from openpyxl import load_workbook
 from openpyxl import Workbook
@@ -26,7 +25,8 @@ import os
 import openai
 from transformers import MarianMTModel, MarianTokenizer
 from langchain_community.chat_models import ChatOpenAI
-
+from wordcloud import WordCloud
+from collections import Counter
 
 class TranslationModel:
     def __init__(self, model_name="Helsinki-NLP/opus-mt-ru-en"):
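The body of TranslationModel is not part of this diff. For readers unfamiliar with the Helsinki-NLP/opus-mt-ru-en checkpoint it wraps, a minimal, self-contained sketch of MarianMT ru→en translation; the helper name translate_ru_en is illustrative, not from app.py:

from transformers import MarianMTModel, MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-ru-en")

def translate_ru_en(text: str) -> str:
    # Tokenize, generate and decode; truncation guards against over-long excerpts
    batch = tokenizer([text], return_tensors="pt", truncation=True, max_length=512)
    generated = model.generate(**batch)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

print(translate_ru_en("Компания увеличила прибыль"))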
@@ -151,42 +151,7 @@ def estimate_impact(llm, news_text, entity):
 
     return impact, reasoning
 
-def process_file_with_llm(df, llm):
-    df['LLM_Impact'] = ''
-    df['LLM_Reasoning'] = ''
-
-    # Create a progress bar
-    progress_bar = st.progress(0)
-    status_text = st.empty()
-
-    total_rows = len(df)
-    rows_to_process = df[df[['FinBERT', 'RoBERTa', 'FinBERT-Tone']].isin(['Negative', 'Positive']).any(axis=1)]
-
-
-    for index, row in df.iterrows():
-        if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
-            impact, reasoning = estimate_impact(llm, row['Translated'], row['Объект'])  # Use translated text
-            df.at[index, 'LLM_Impact'] = impact
-            df.at[index, 'LLM_Reasoning'] = reasoning
-            # Display each LLM response
-            st.write(f"Объект: {row['Объект']}")
-            st.write(f"Новость: {row['Заголовок']}")
-            st.write(f"Эффект: {impact}")
-            st.write(f"Обоснование: {reasoning}")
-            st.write("---")  # Add a separator between responses
-
 
-        # Update progress
-        progress = (index + 1) / total_rows
-        progress_bar.progress(progress)
-        status_text.text(f"Проанализировано {index + 1} из {total_rows} новостей")
-
-    # Clear the progress bar and status text
-    progress_bar.empty()
-    status_text.empty()
-
-
-    return df
 
 def create_output_file_with_llm(df, uploaded_file, analysis_df):
     wb = load_workbook("sample_file.xlsx")
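estimate_impact(llm, news_text, entity) is only referenced in this hunk, not shown. A hypothetical sketch of such a helper built on the imported ChatOpenAI; the prompt wording, the parsing, and the name estimate_impact_sketch are assumptions, not the repository's code:

def estimate_impact_sketch(llm, news_text, entity):
    # llm is expected to be a LangChain chat model (e.g. ChatOpenAI)
    prompt = (
        f"Assess the impact of the following news on {entity}. "
        f"Reply as 'Impact: Positive/Negative/Neutral' and 'Reasoning: <one sentence>'.\n\n{news_text}"
    )
    reply = llm.invoke(prompt).content
    impact, reasoning = "Unknown", reply
    for line in reply.splitlines():
        if line.startswith("Impact:"):
            impact = line.split(":", 1)[1].strip()
        elif line.startswith("Reasoning:"):
            reasoning = line.split(":", 1)[1].strip()
    return impact, reasoning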
@@ -378,51 +343,57 @@ def process_file(uploaded_file):
 
     st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
 
-    # Translate texts
-    translated_texts = []
-    lemmatized_texts = []
-    progress_bar = st.progress(0)
-    progress_text = st.empty()
-    total_news = len(df)
-
-    st.write("Начинаем предобработку текстов...")
+    st.write("Начинаем предобработку и анализ текстов...")
 
     texts = df['Выдержки из текста'].tolist()
-    # Data validation
     texts = [str(text) if not pd.isna(text) else "" for text in texts]
 
-    for text in df['Выдержки из текста']:
-        lemmatized_texts.append(lemmatize_text(text))
-
-    #for i, text in enumerate(lemmatized_texts):
-    #    translated_text = translate(str(text))
-    #    translated_texts.append(translated_text)
-    #    progress_bar.progress((i + 1) / len(df))
-    #    progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
-
+    lemmatized_texts = [lemmatize_text(text) for text in texts]
     translated_texts = batch_translate(lemmatized_texts)
     df['Translated'] = translated_texts
 
-
     # Perform sentiment analysis
-    rubert2_results = [get_rubert2_sentiment(text) for text in texts]
-    finbert_results = [get_finbert_sentiment(text) for text in translated_texts]
-    roberta_results = [get_roberta_sentiment(text) for text in translated_texts]
-    finbert_tone_results = [get_finbert_tone_sentiment(text) for text in translated_texts]
-
-    # Create a new DataFrame with processed data
-    processed_df = pd.DataFrame({
-        'Объект': df['Объект'],
-        'Заголовок': df['Заголовок'],  # Preserve original 'Заголовок'
-        'ruBERT2': rubert2_results,
-        'FinBERT': finbert_results,
-        'RoBERTa': roberta_results,
-        'FinBERT-Tone': finbert_tone_results,
-        'Выдержки из текста': df['Выдержки из текста'],
-        'Translated': translated_texts
-    })
-
-    return processed_df
+    df['ruBERT2'] = [get_rubert2_sentiment(text) for text in texts]
+    df['FinBERT'] = [get_finbert_sentiment(text) for text in translated_texts]
+    df['RoBERTa'] = [get_roberta_sentiment(text) for text in translated_texts]
+    df['FinBERT-Tone'] = [get_finbert_tone_sentiment(text) for text in translated_texts]
+
+    # Initialize LLM
+    llm = init_langchain_llm()
+    if not llm:
+        st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
+        st.stop()
+
+    # Perform LLM analysis
+    df['LLM_Impact'] = ''
+    df['LLM_Reasoning'] = ''
+
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+
+    for index, row in df.iterrows():
+        if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
+            impact, reasoning = estimate_impact(llm, row['Translated'], row['Объект'])
+            df.at[index, 'LLM_Impact'] = impact
+            df.at[index, 'LLM_Reasoning'] = reasoning
+
+            st.write(f"Объект: {row['Объект']}")
+            st.write(f"Новость: {row['Заголовок']}")
+            st.write(f"Эффект: {impact}")
+            st.write(f"Обоснование: {reasoning}")
+            st.write("---")
+
+        progress = (index + 1) / len(df)
+        progress_bar.progress(progress)
+        status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
+
+    progress_bar.empty()
+    status_text.empty()
+
+    word_cloud_plot = generate_word_cloud(df)
+    st.pyplot(word_cloud_plot)
+
+    return df
 
 def create_output_file(df, uploaded_file, analysis_df):
     # Load the sample file to use as a template
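lemmatize_text is defined elsewhere in app.py; given the pymystem3 import at the top of the file, it presumably reduces to something like this sketch (the name lemmatize_text_sketch and the exact normalization are assumptions):

from pymystem3 import Mystem

mystem = Mystem()  # expensive to construct, so reuse a single instance

def lemmatize_text_sketch(text) -> str:
    # Mystem.lemmatize returns lemmas interleaved with whitespace tokens; join them back
    return "".join(mystem.lemmatize(str(text))).strip()

lemmatize_text_sketch("Компания увеличила прибыль")  # roughly: "компания увеличивать прибыль"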
@@ -493,16 +464,31 @@ def create_output_file(df, uploaded_file, analysis_df):
 
     return output
 
+def generate_word_cloud(df):
+    # Filter for negative sentiments
+    negative_df = df[df[['FinBERT', 'RoBERTa', 'FinBERT-Tone']].eq('Negative').any(axis=1)]
+
+    # Combine entity names with their frequency of negative mentions
+    entity_counts = Counter(negative_df['Объект'])
+
+    # Create and generate a word cloud image
+    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(entity_counts)
+
+    # Display the generated image
+    plt.figure(figsize=(10, 5))
+    plt.imshow(wordcloud, interpolation='bilinear')
+    plt.axis('off')
+    plt.title('Облако слов: Объекты с негативными упоминаниями')
+    return plt
+
+
 def main():
-    st.title("... приступим к анализу... версия 67")
+    st.title("... приступим к анализу... версия 69")
 
     # Initialize session state
     if 'processed_df' not in st.session_state:
        st.session_state.processed_df = None
-    if 'analysis_df' not in st.session_state:
-        st.session_state.analysis_df = None
-    if 'llm_analyzed' not in st.session_state:
-        st.session_state.llm_analyzed = False
+
 
     uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
@@ -510,61 +496,24 @@ def main():
         start_time = time.time()
 
         st.session_state.processed_df = process_file(uploaded_file)
-        st.session_state.analysis_df = create_analysis_data(st.session_state.processed_df)
 
         st.subheader("Предпросмотр данных")
         st.write(st.session_state.processed_df.head())
-
-        st.subheader("Распределение окраски")
-        fig, axs = plt.subplots(2, 2, figsize=(12, 8))
-        fig.suptitle("Распределение окраски по моделям")
-
-        models = ['ruBERT2', 'FinBERT', 'RoBERTa', 'FinBERT-Tone']
-        for i, model in enumerate(models):
-            ax = axs[i // 2, i % 2]
-            sentiment_counts = st.session_state.processed_df[model].value_counts()
-            sentiment_counts.plot(kind='bar', ax=ax)
-            ax.set_title(f"{model} Sentiment")
-            ax.set_xlabel("Sentiment")
-            ax.set_ylabel("Count")
-
-        plt.tight_layout()
-        st.pyplot(fig)
-
+
+        analysis_df = create_analysis_data(st.session_state.processed_df)
+
         st.subheader("Анализ")
-        st.dataframe(st.session_state.analysis_df)
+        st.dataframe(analysis_df)
 
-        output = create_output_file(st.session_state.processed_df, uploaded_file, st.session_state.analysis_df)
+        output = create_output_file_with_llm(st.session_state.processed_df, uploaded_file, analysis_df)
 
         end_time = time.time()
         elapsed_time = end_time - start_time
         formatted_time = format_elapsed_time(elapsed_time)
-        st.success(f"Обработка завершена за {formatted_time}.")
+        st.success(f"Обработка и анализ завершены за {formatted_time}.")
 
         st.download_button(
-            label="Скачать результат анализа новостей",
+            label="Скачать результат анализа новостей с оценкой нейросети",
             data=output,
-            file_name="результат_анализа_новостей.xlsx",
-            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-        )
-
-    if st.session_state.processed_df is not None and not st.session_state.llm_analyzed:
-        if st.button("Что скажет нейросеть?"):
-            st.info("Анализ нейросетью начался. Это может занять некоторое время...")
-            llm = init_langchain_llm()
-            if llm:
-                df_with_llm = process_file_with_llm(st.session_state.processed_df, llm)
-                output_with_llm = create_output_file_with_llm(df_with_llm, uploaded_file, st.session_state.analysis_df)
-                st.success("Анализ нейросетью завершен!")
-                st.session_state.llm_analyzed = True
-                st.session_state.output_with_llm = output_with_llm
-            else:
-                st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
-
-    if st.session_state.llm_analyzed:
-        st.download_button(
-            label="Скачать результат анализа с оценкой нейросети",
-            data=st.session_state.output_with_llm,
             file_name="результат_анализа_с_нейросетью.xlsx",
             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
         )
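init_langchain_llm is called from process_file but is not part of this diff; a hypothetical sketch of what it needs to return for the flow above to work. The model name, the environment-variable lookup and the name init_langchain_llm_sketch are assumptions, not the repository's code:

import os

def init_langchain_llm_sketch():
    # The real init_langchain_llm may read the key from st.secrets or use another model
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        return None  # a falsy return makes process_file show an error and call st.stop()
    return ChatOpenAI(model_name="gpt-4o-mini", temperature=0.0, openai_api_key=api_key)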
 
requirements.txt CHANGED
@@ -14,4 +14,5 @@ langchain
 langchain-community
 huggingface_hub
 accelerate>=0.26.0
-openai
+openai
+wordcloud
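With this change, pip install -r requirements.txt also pulls in wordcloud; openai stays in the list. Locally, the Streamlit app can still be started with streamlit run app.py.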