Spaces:
Running
Running
Commit
·
0d683d7
1
Parent(s):
f70decf
progress more 69
Browse files- app.py +70 -121
- requirements.txt +2 -1
app.py
CHANGED
@@ -8,7 +8,6 @@ from pymystem3 import Mystem
|
|
8 |
import io
|
9 |
from rapidfuzz import fuzz
|
10 |
from tqdm.auto import tqdm
|
11 |
-
import time
|
12 |
import torch
|
13 |
from openpyxl import load_workbook
|
14 |
from openpyxl import Workbook
|
@@ -26,7 +25,8 @@ import os
|
|
26 |
import openai
|
27 |
from transformers import MarianMTModel, MarianTokenizer
|
28 |
from langchain_community.chat_models import ChatOpenAI
|
29 |
-
|
|
|
30 |
|
31 |
class TranslationModel:
|
32 |
def __init__(self, model_name="Helsinki-NLP/opus-mt-ru-en"):
|
@@ -151,42 +151,7 @@ def estimate_impact(llm, news_text, entity):
|
|
151 |
|
152 |
return impact, reasoning
|
153 |
|
154 |
-
def process_file_with_llm(df, llm):
    """Add LLM impact estimates to the news rows, showing progress in Streamlit.

    A row is sent to the LLM only when at least one of the 'FinBERT',
    'RoBERTa' or 'FinBERT-Tone' columns is 'Negative' or 'Positive'.
    For those rows ``estimate_impact`` is called with the translated text
    and the entity name, and its ``(impact, reasoning)`` result is stored
    in the 'LLM_Impact' and 'LLM_Reasoning' columns and echoed to the page.
    All other rows keep empty strings in both columns.

    Args:
        df: DataFrame with the three sentiment columns plus 'Translated',
            'Объект' and 'Заголовок'. It is modified in place.
        llm: LLM handle passed through to ``estimate_impact``.

    Returns:
        The same DataFrame, with 'LLM_Impact' and 'LLM_Reasoning' filled in.
    """
    sentiment_models = ['FinBERT', 'RoBERTa', 'FinBERT-Tone']

    df['LLM_Impact'] = ''
    df['LLM_Reasoning'] = ''

    # Create a progress bar
    progress_bar = st.progress(0)
    status_text = st.empty()

    total_rows = len(df)

    try:
        # Progress is driven by the row position, not by the index label:
        # the label need not run 0..n-1 (e.g. after rows were dropped), and
        # a fraction above 1.0 is rejected by st.progress.
        for position, (index, row) in enumerate(df.iterrows(), start=1):
            if any(row[model] in ('Negative', 'Positive') for model in sentiment_models):
                # Use translated text
                impact, reasoning = estimate_impact(llm, row['Translated'], row['Объект'])
                df.at[index, 'LLM_Impact'] = impact
                df.at[index, 'LLM_Reasoning'] = reasoning

                # Display each LLM response
                st.write(f"Объект: {row['Объект']}")
                st.write(f"Новость: {row['Заголовок']}")
                st.write(f"Эффект: {impact}")
                st.write(f"Обоснование: {reasoning}")
                st.write("---")  # Add a separator between responses

            # Update progress
            progress_bar.progress(position / total_rows)
            status_text.text(f"Проанализировано {position} из {total_rows} новостей")
    finally:
        # Clear the progress bar and status text even if the LLM call fails
        progress_bar.empty()
        status_text.empty()

    return df
|
190 |
|
191 |
def create_output_file_with_llm(df, uploaded_file, analysis_df):
|
192 |
wb = load_workbook("sample_file.xlsx")
|
@@ -378,51 +343,57 @@ def process_file(uploaded_file):
|
|
378 |
|
379 |
st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
|
380 |
|
381 |
-
|
382 |
-
translated_texts = []
|
383 |
-
lemmatized_texts = []
|
384 |
-
progress_bar = st.progress(0)
|
385 |
-
progress_text = st.empty()
|
386 |
-
total_news = len(df)
|
387 |
-
|
388 |
-
st.write("Начинаем предобработку текстов...")
|
389 |
|
390 |
texts = df['Выдержки из текста'].tolist()
|
391 |
-
# Data validation
|
392 |
texts = [str(text) if not pd.isna(text) else "" for text in texts]
|
393 |
|
394 |
-
for text in
|
395 |
-
lemmatized_texts.append(lemmatize_text(text))
|
396 |
-
|
397 |
-
#for i, text in enumerate(lemmatized_texts):
|
398 |
-
# translated_text = translate(str(text))
|
399 |
-
# translated_texts.append(translated_text)
|
400 |
-
# progress_bar.progress((i + 1) / len(df))
|
401 |
-
# progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
|
402 |
-
|
403 |
translated_texts = batch_translate(lemmatized_texts)
|
404 |
df['Translated'] = translated_texts
|
405 |
|
406 |
-
|
407 |
# Perform sentiment analysis
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
#
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
|
|
424 |
|
425 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
426 |
|
427 |
def create_output_file(df, uploaded_file, analysis_df):
|
428 |
# Load the sample file to use as a template
|
@@ -493,16 +464,31 @@ def create_output_file(df, uploaded_file, analysis_df):
|
|
493 |
|
494 |
return output
|
495 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
496 |
def main():
|
497 |
-
st.title("... приступим к анализу... версия
|
498 |
|
499 |
# Initialize session state
|
500 |
if 'processed_df' not in st.session_state:
|
501 |
st.session_state.processed_df = None
|
502 |
-
|
503 |
-
st.session_state.analysis_df = None
|
504 |
-
if 'llm_analyzed' not in st.session_state:
|
505 |
-
st.session_state.llm_analyzed = False
|
506 |
|
507 |
uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
|
508 |
|
@@ -510,61 +496,24 @@ def main():
|
|
510 |
start_time = time.time()
|
511 |
|
512 |
st.session_state.processed_df = process_file(uploaded_file)
|
513 |
-
st.session_state.analysis_df = create_analysis_data(st.session_state.processed_df)
|
514 |
|
515 |
st.subheader("Предпросмотр данных")
|
516 |
st.write(st.session_state.processed_df.head())
|
517 |
-
|
518 |
-
st.
|
519 |
-
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
|
520 |
-
fig.suptitle("Распределение окраски по моделям")
|
521 |
-
|
522 |
-
models = ['ruBERT2','FinBERT', 'RoBERTa', 'FinBERT-Tone']
|
523 |
-
for i, model in enumerate(models):
|
524 |
-
ax = axs[i // 2, i % 2]
|
525 |
-
sentiment_counts = st.session_state.processed_df[model].value_counts()
|
526 |
-
sentiment_counts.plot(kind='bar', ax=ax)
|
527 |
-
ax.set_title(f"{model} Sentiment")
|
528 |
-
ax.set_xlabel("Sentiment")
|
529 |
-
ax.set_ylabel("Count")
|
530 |
-
|
531 |
-
plt.tight_layout()
|
532 |
-
st.pyplot(fig)
|
533 |
-
|
534 |
st.subheader("Анализ")
|
535 |
-
st.dataframe(
|
536 |
|
537 |
-
output =
|
538 |
|
539 |
end_time = time.time()
|
540 |
elapsed_time = end_time - start_time
|
541 |
formatted_time = format_elapsed_time(elapsed_time)
|
542 |
-
st.success(f"Обработка
|
543 |
|
544 |
st.download_button(
|
545 |
-
label="Скачать результат анализа новостей",
|
546 |
data=output,
|
547 |
-
file_name="результат_анализа_новостей.xlsx",
|
548 |
-
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
549 |
-
)
|
550 |
-
|
551 |
-
if st.session_state.processed_df is not None and not st.session_state.llm_analyzed:
|
552 |
-
if st.button("Что скажет нейросеть?"):
|
553 |
-
st.info("Анализ нейросетью начался. Это может занять некоторое время...")
|
554 |
-
llm = init_langchain_llm()
|
555 |
-
if llm:
|
556 |
-
df_with_llm = process_file_with_llm(st.session_state.processed_df, llm)
|
557 |
-
output_with_llm = create_output_file_with_llm(df_with_llm, uploaded_file, st.session_state.analysis_df)
|
558 |
-
st.success("Анализ нейросетью завершен!")
|
559 |
-
st.session_state.llm_analyzed = True
|
560 |
-
st.session_state.output_with_llm = output_with_llm
|
561 |
-
else:
|
562 |
-
st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
|
563 |
-
|
564 |
-
if st.session_state.llm_analyzed:
|
565 |
-
st.download_button(
|
566 |
-
label="Скачать результат анализа с оценкой нейросети",
|
567 |
-
data=st.session_state.output_with_llm,
|
568 |
file_name="результат_анализа_с_нейросетью.xlsx",
|
569 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
570 |
)
|
|
|
8 |
import io
|
9 |
from rapidfuzz import fuzz
|
10 |
from tqdm.auto import tqdm
|
|
|
11 |
import torch
|
12 |
from openpyxl import load_workbook
|
13 |
from openpyxl import Workbook
|
|
|
25 |
import openai
|
26 |
from transformers import MarianMTModel, MarianTokenizer
|
27 |
from langchain_community.chat_models import ChatOpenAI
|
28 |
+
from wordcloud import WordCloud
|
29 |
+
from collections import Counter
|
30 |
|
31 |
class TranslationModel:
|
32 |
def __init__(self, model_name="Helsinki-NLP/opus-mt-ru-en"):
|
|
|
151 |
|
152 |
return impact, reasoning
|
153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
|
156 |
def create_output_file_with_llm(df, uploaded_file, analysis_df):
|
157 |
wb = load_workbook("sample_file.xlsx")
|
|
|
343 |
|
344 |
st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
|
345 |
|
346 |
+
st.write("Начинаем предобработку и анализ текстов...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
347 |
|
348 |
texts = df['Выдержки из текста'].tolist()
|
|
|
349 |
texts = [str(text) if not pd.isna(text) else "" for text in texts]
|
350 |
|
351 |
+
lemmatized_texts = [lemmatize_text(text) for text in texts]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
352 |
translated_texts = batch_translate(lemmatized_texts)
|
353 |
df['Translated'] = translated_texts
|
354 |
|
|
|
355 |
# Perform sentiment analysis
|
356 |
+
df['ruBERT2'] = [get_rubert2_sentiment(text) for text in texts]
|
357 |
+
df['FinBERT'] = [get_finbert_sentiment(text) for text in translated_texts]
|
358 |
+
df['RoBERTa'] = [get_roberta_sentiment(text) for text in translated_texts]
|
359 |
+
df['FinBERT-Tone'] = [get_finbert_tone_sentiment(text) for text in translated_texts]
|
360 |
+
|
361 |
+
# Initialize LLM
|
362 |
+
llm = init_langchain_llm()
|
363 |
+
if not llm:
|
364 |
+
st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
|
365 |
+
st.stop()
|
366 |
+
|
367 |
+
# Perform LLM analysis
|
368 |
+
df['LLM_Impact'] = ''
|
369 |
+
df['LLM_Reasoning'] = ''
|
370 |
+
|
371 |
+
progress_bar = st.progress(0)
|
372 |
+
status_text = st.empty()
|
373 |
|
374 |
+
for index, row in df.iterrows():
|
375 |
+
if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
|
376 |
+
impact, reasoning = estimate_impact(llm, row['Translated'], row['Объект'])
|
377 |
+
df.at[index, 'LLM_Impact'] = impact
|
378 |
+
df.at[index, 'LLM_Reasoning'] = reasoning
|
379 |
+
|
380 |
+
st.write(f"Объект: {row['Объект']}")
|
381 |
+
st.write(f"Новость: {row['Заголовок']}")
|
382 |
+
st.write(f"Эффект: {impact}")
|
383 |
+
st.write(f"Обоснование: {reasoning}")
|
384 |
+
st.write("---")
|
385 |
+
|
386 |
+
progress = (index + 1) / len(df)
|
387 |
+
progress_bar.progress(progress)
|
388 |
+
status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
|
389 |
+
|
390 |
+
progress_bar.empty()
|
391 |
+
status_text.empty()
|
392 |
+
|
393 |
+
word_cloud_plot = generate_word_cloud(df)
|
394 |
+
st.pyplot(word_cloud_plot)
|
395 |
+
|
396 |
+
return df
|
397 |
|
398 |
def create_output_file(df, uploaded_file, analysis_df):
|
399 |
# Load the sample file to use as a template
|
|
|
464 |
|
465 |
return output
|
466 |
|
467 |
+
def generate_word_cloud(df):
    """Render a word cloud of the entities that have negative mentions.

    A row counts as a negative mention when at least one of the 'FinBERT',
    'RoBERTa' or 'FinBERT-Tone' columns equals 'Negative'. Each value of
    the 'Объект' column is weighted by the number of such rows it occurs in.

    Args:
        df: DataFrame containing the three sentiment columns and 'Объект'.

    Returns:
        A matplotlib Figure with the rendered cloud, suitable for
        ``st.pyplot``. When there are no negative mentions the figure shows
        a short notice instead of a cloud.
    """
    sentiment_columns = ['FinBERT', 'RoBERTa', 'FinBERT-Tone']

    # Filter for negative sentiments
    is_negative = df[sentiment_columns].eq('Negative').any(axis=1)

    # Count negative mentions per entity. Missing entities are dropped and
    # keys coerced to str (assumes WordCloud needs string words - TODO confirm).
    entities = df.loc[is_negative, 'Объект'].dropna().astype(str)
    entity_counts = Counter(entities)

    # Draw on an explicit Figure/Axes rather than pyplot's implicit
    # "current figure", and hand that Figure back to the caller.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.set_title('Облако слов: Объекты с негативными упоминаниями')

    if not entity_counts:
        # generate_from_frequencies raises ValueError on an empty mapping,
        # so show a placeholder instead of crashing the whole run.
        ax.text(
            0.5,
            0.5,
            'Негативных упоминаний не найдено',
            ha='center',
            va='center',
            transform=ax.transAxes,
        )
        ax.axis('off')
        return fig

    # Create and generate a word cloud image
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
    ).generate_from_frequencies(entity_counts)

    # Display the generated image
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    return fig
|
483 |
+
|
484 |
+
|
485 |
def main():
|
486 |
+
st.title("... приступим к анализу... версия 69")
|
487 |
|
488 |
# Initialize session state
|
489 |
if 'processed_df' not in st.session_state:
|
490 |
st.session_state.processed_df = None
|
491 |
+
|
|
|
|
|
|
|
492 |
|
493 |
uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
|
494 |
|
|
|
496 |
start_time = time.time()
|
497 |
|
498 |
st.session_state.processed_df = process_file(uploaded_file)
|
|
|
499 |
|
500 |
st.subheader("Предпросмотр данных")
|
501 |
st.write(st.session_state.processed_df.head())
|
502 |
+
|
503 |
+
analysis_df = create_analysis_data(st.session_state.processed_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
504 |
st.subheader("Анализ")
|
505 |
+
st.dataframe(analysis_df)
|
506 |
|
507 |
+
output = create_output_file_with_llm(st.session_state.processed_df, uploaded_file, analysis_df)
|
508 |
|
509 |
end_time = time.time()
|
510 |
elapsed_time = end_time - start_time
|
511 |
formatted_time = format_elapsed_time(elapsed_time)
|
512 |
+
st.success(f"Обработка и анализ завершены за {formatted_time}.")
|
513 |
|
514 |
st.download_button(
|
515 |
+
label="Скачать результат анализа новостей с оценкой нейросети",
|
516 |
data=output,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
517 |
file_name="результат_анализа_с_нейросетью.xlsx",
|
518 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
519 |
)
|
requirements.txt
CHANGED
@@ -14,4 +14,5 @@ langchain
|
|
14 |
langchain-community
|
15 |
huggingface_hub
|
16 |
accelerate>=0.26.0
|
17 |
-
openai
|
|
|
|
14 |
langchain-community
|
15 |
huggingface_hub
|
16 |
accelerate>=0.26.0
|
17 |
+
openai
|
18 |
+
wordcloud
|