pentarosarium committed on
Commit 03eddb7 · 1 Parent(s): bc222e3

progress more 73

Files changed (1)
  1. app.py +83 -367
app.py CHANGED
@@ -1,84 +1,46 @@
 import streamlit as st
 import pandas as pd
 import time
-from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import matplotlib.pyplot as plt
-from pymystem3 import Mystem
 import io
 from rapidfuzz import fuzz
+import os
-from tqdm.auto import tqdm
-import torch
 from openpyxl import load_workbook
+from langchain_community.chat_models import ChatOpenAI
-from openpyxl import Workbook
-from openpyxl.utils.dataframe import dataframe_to_rows
-from sentiment_decorators import sentiment_analysis_decorator
-import transformers
-from langchain_community.llms import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
-from langchain.chains import LLMChain
 from langchain_core.runnables import RunnablePassthrough
-from huggingface_hub import login
-from accelerate import init_empty_weights
-import logging
-import os
-import openai
-from transformers import MarianMTModel, MarianTokenizer
-from langchain_community.chat_models import ChatOpenAI
-from wordcloud import WordCloud
-from collections import Counter
-
-class TranslationModel:
-    def __init__(self, model_name="Helsinki-NLP/opus-mt-ru-en"):
-        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
-        self.model = MarianMTModel.from_pretrained(model_name)
-        if torch.cuda.is_available():
-            self.model = self.model.to('cuda')
-
-    def translate(self, text):
-        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-        if torch.cuda.is_available():
-            inputs = {k: v.to('cuda') for k, v in inputs.items()}
-
-        with torch.no_grad():
-            translated = self.model.generate(**inputs)
-
-        return self.tokenizer.decode(translated[0], skip_special_tokens=True)
-
-
-def batch_translate(texts, batch_size=32):
-    translator = TranslationModel()
-    translated_texts = []
-
-    for i in range(0, len(texts), batch_size):
-        batch = texts[i:i+batch_size]
-        translations = [translator.translate(text) for text in batch]
-        translated_texts.extend(translations)
-
-        # Update progress
-        progress = (i + len(batch)) / len(texts)
-        st.progress(progress)
-        st.text(f"Предобработано {i + len(batch)} из {len(texts)} текстов")
-
-    return translated_texts
-
-
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 
-
-# Initialize pymystem3 for lemmatization
-mystem = Mystem()
-
-# Set up the sentiment analyzers
-
-finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")
-roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
-finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
-rubert1 = pipeline("sentiment-analysis", model = "DeepPavlov/rubert-base-cased")
-rubert2 = pipeline("sentiment-analysis", model = "blanchefort/rubert-base-cased-sentiment")
-
+def fuzzy_deduplicate(df, column, threshold=65):
+    seen_texts = []
+    indices_to_keep = []
+    for i, text in enumerate(df[column]):
+        if pd.isna(text):
+            indices_to_keep.append(i)
+            continue
+        text = str(text)
+        if not seen_texts or all(fuzz.ratio(text, seen) < threshold for seen in seen_texts):
+            seen_texts.append(text)
+            indices_to_keep.append(i)
+    return df.iloc[indices_to_keep]
+
+def init_langchain_llm():
+    try:
+        if 'groq_key' in st.secrets:
+            groq_api_key = st.secrets['groq_key']
+        else:
+            st.error("Groq API key not found in Hugging Face secrets. Please add it with the key 'groq_key'.")
+            st.stop()
+
+        llm = ChatOpenAI(
+            base_url="https://api.groq.com/openai/v1",
+            model="llama-3.1-70b-versatile",
+            api_key=groq_api_key,
+            temperature=0.0
+        )
+        return llm
+    except Exception as e:
+        st.error(f"Error initializing the Groq LLM: {str(e)}")
+        st.stop()
 
 def estimate_sentiment_and_impact(llm, news_text, entity):
     template = """
@@ -106,14 +68,12 @@ def estimate_sentiment_and_impact(llm, news_text, entity):
     chain = prompt | llm | RunnablePassthrough()
     response = chain.invoke({"entity": entity, "news": news_text})
 
-    # Parse the response
     sentiment = "Neutral"
     impact = "Неопределенный эффект"
     reasoning = "Не удалось получить обоснование"
 
     if isinstance(response, str):
         try:
-            # Extract sentiment
             if "Sentiment:" in response:
                 sentiment_part = response.split("Sentiment:")[1].split("\n")[0].strip().lower()
                 if "positive" in sentiment_part:
@@ -121,7 +81,6 @@ def estimate_sentiment_and_impact(llm, news_text, entity):
                 elif "negative" in sentiment_part:
                     sentiment = "Negative"
 
-            # Extract impact and reasoning
             if "Impact:" in response and "Reasoning:" in response:
                 impact_part, reasoning_part = response.split("Reasoning:")
                 impact = impact_part.split("Impact:")[1].strip()
@@ -130,245 +89,6 @@ def estimate_sentiment_and_impact(llm, news_text, entity):
             st.error(f"Error parsing LLM response: {str(e)}")
 
     return sentiment, impact, reasoning
-
-@st.cache_resource
-def load_model(model_id):
-    tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
-    model = transformers.AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch.float16,
-        device_map="cpu",
-        low_cpu_mem_usage=True
-    )
-    return tokenizer, model
-
-
-
-def init_langchain_llm():
-    try:
-        # Try to get the Groq API key from Hugging Face secrets
-        if 'groq_key' in st.secrets:
-            groq_api_key = st.secrets['groq_key']
-        else:
-            st.error("Groq API key not found in Hugging Face secrets. Please add it with the key 'groq_key'.")
-            st.stop()
-
-        llm = ChatOpenAI(
-            base_url="https://api.groq.com/openai/v1",
-            model="llama-3.1-70b-versatile",
-            api_key=groq_api_key,
-            temperature=0.0
-        )
-        return llm
-    except Exception as e:
-        st.error(f"Error initializing the Groq LLM: {str(e)}")
-        st.stop()
-
-
-def estimate_impact(llm, news_text, entity):
-    template = """
-    Analyze the following news piece about the entity "{entity}" and estimate its monetary impact in Russian rubles for this entity in the next 6 months. You should estimate the risk of loss or probability of profit.
-
-    If a precise monetary estimate is not possible, categorize the impact as one of the following:
-    1. "Значительный риск убытков" (Significant risk of loss)
-    2. "Умеренный риск убытков" (Moderate risk of loss)
-    3. "Незначительный риск убытков" (Minor risk of loss)
-    4. "Вероятность прибыли" (Probability of profit)
-    5. "Неопределенный эффект" (Uncertain effect)
-
-    Also provide a short reasoning (max 100 words) for your assessment.
-
-    Entity: {entity}
-    News: {news}
-
-    Your response should be in the following format:
-    Estimated Impact: [Your estimate or category]
-    Reasoning: [Your reasoning]
-    """
-    prompt = PromptTemplate(template=template, input_variables=["entity", "news"])
-    chain = prompt | llm | RunnablePassthrough()
-    response = chain.invoke({"entity": entity, "news": news_text})
-
-    # Parse the response
-    impact = "Неопределенный эффект"
-    reasoning = "Не удалось получить обоснование"
-
-    if isinstance(response, str) and "Estimated Impact:" in response and "Reasoning:" in response:
-        impact_part, reasoning_part = response.split("Reasoning:")
-        impact = impact_part.split("Estimated Impact:")[1].strip()
-        reasoning = reasoning_part.strip()
-
-    return impact, reasoning
-
-
-
-def create_output_file_with_llm(df, uploaded_file, analysis_df):
-    wb = load_workbook("sample_file.xlsx")
-
-    # Update 'Сводка' sheet
-    summary_df = pd.DataFrame({
-        'Объект': df['Объект'].unique(),
-        'Всего новостей': df.groupby('Объект').size(),
-        'Отрицательные': df[df[['FinBERT', 'RoBERTa', 'FinBERT-Tone']].eq('Negative').any(axis=1)].groupby('Объект').size(),
-        'Положительные': df[df[['FinBERT', 'RoBERTa', 'FinBERT-Tone']].eq('Positive').any(axis=1)].groupby('Объект').size(),
-        'Impact': df.groupby('Объект')['LLM_Impact'].agg(lambda x: x.value_counts().index[0] if x.any() else 'Неопределенный')
-    })
-    ws = wb['Сводка']
-    for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=False), start=4):
-        for c_idx, value in enumerate(row, start=5):
-            ws.cell(row=r_idx, column=c_idx, value=value)
-
-    # Update 'Значимые' sheet
-    significant_data = []
-    for _, row in df.iterrows():
-        if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
-            sentiment = 'Negative' if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']) else 'Positive'
-            significant_data.append([row['Объект'], 'релевантен', sentiment, row['LLM_Impact'], row['Заголовок'], row['Выдержки из текста']])
-
-    ws = wb['Значимые']
-    for r_idx, row in enumerate(significant_data, start=3):
-        for c_idx, value in enumerate(row, start=3):
-            ws.cell(row=r_idx, column=c_idx, value=value)
-
-    # Update 'Анализ' sheet
-    analysis_df['LLM_Reasoning'] = df['LLM_Reasoning']
-    ws = wb['Анализ']
-    for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=False), start=4):
-        for c_idx, value in enumerate(row, start=5):
-            ws.cell(row=r_idx, column=c_idx, value=value)
-
-    # Copy 'Публикации' sheet from original uploaded file
-    original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
-    ws = wb['Публикации']
-    for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
-        for c_idx, value in enumerate(row, start=1):
-            ws.cell(row=r_idx, column=c_idx, value=value)
-
-    # Add 'Тех.приложение' sheet with processed data
-    if 'Тех.приложение' not in wb.sheetnames:
-        wb.create_sheet('Тех.приложение')
-    ws = wb['Тех.приложение']
-    for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
-        for c_idx, value in enumerate(row, start=1):
-            ws.cell(row=r_idx, column=c_idx, value=value)
-
-    output = io.BytesIO()
-    wb.save(output)
-    output.seek(0)
-    return output
-
-def create_analysis_data(df):
-    analysis_data = []
-    for _, row in df.iterrows():
-        if row['Sentiment'] == 'Negative':
-            analysis_data.append([
-                row['Объект'],
-                row['Заголовок'],
-                'РИСК УБЫТКА',
-                row['Impact'],  # Now using LLM's impact assessment
-                row['Reasoning'],  # Adding LLM's reasoning
-                row['Выдержки из текста']
-            ])
-    return pd.DataFrame(analysis_data, columns=[
-        'Объект',
-        'Заголовок',
-        'Признак',
-        'Оценка влияния',
-        'Обоснование',
-        'Текст сообщения'
-    ])
-
-# Function for lemmatizing Russian text
-def lemmatize_text(text):
-    if pd.isna(text):
-        return ""
-
-    if not isinstance(text, str):
-        text = str(text)
-
-    words = text.split()
-    lemmatized_words = []
-    for word in tqdm(words, desc="Lemmatizing", unit="word"):
-        lemmatized_word = ''.join(mystem.lemmatize(word))
-        lemmatized_words.append(lemmatized_word)
-    return ' '.join(lemmatized_words)
-
-# Translation model for Russian to English
-model_name = "Helsinki-NLP/opus-mt-ru-en"
-translation_tokenizer = AutoTokenizer.from_pretrained(model_name)
-translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-
-translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
-
-
-def translate(text):
-    # Tokenize the input text
-    inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
-
-    # Calculate max_length based on input length
-    input_length = inputs.input_ids.shape[1]
-    max_length = max(input_length + 10, int(input_length * 1.5))  # Ensure at least 10 new tokens
-
-    # Generate translation
-    translated_tokens = translation_model.generate(
-        **inputs,
-        max_new_tokens=max_length,  # Use max_new_tokens instead of max_length
-        num_beams=5,
-        no_repeat_ngram_size=2,
-        early_stopping=True
-    )
-
-    # Decode the translated tokens
-    translated_text = translation_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
-    return translated_text
-
-# Functions for FinBERT, RoBERTa, and FinBERT-Tone with label mapping
-def get_mapped_sentiment(result):
-    label = result['label'].lower()
-    if label in ["positive", "label_2", "pos", "pos_label"]:
-        return "Positive"
-    elif label in ["negative", "label_0", "neg", "neg_label"]:
-        return "Negative"
-    return "Neutral"
-
-@sentiment_analysis_decorator
-def get_rubert1_sentiment(text):
-    result = rubert1(text, truncation=True, max_length=512)[0]
-    return get_mapped_sentiment(result)
-
-@sentiment_analysis_decorator
-def get_rubert2_sentiment(text):
-    result = rubert2(text, truncation=True, max_length=512)[0]
-    return get_mapped_sentiment(result)
-
-@sentiment_analysis_decorator
-def get_finbert_sentiment(text):
-    result = finbert(text, truncation=True, max_length=512)[0]
-    return get_mapped_sentiment(result)
-
-@sentiment_analysis_decorator
-def get_roberta_sentiment(text):
-    result = roberta(text, truncation=True, max_length=512)[0]
-    return get_mapped_sentiment(result)
-
-@sentiment_analysis_decorator
-def get_finbert_tone_sentiment(text):
-    result = finbert_tone(text, truncation=True, max_length=512)[0]
-    return get_mapped_sentiment(result)
-
-#Fuzzy filter out similar news for the same NER
-def fuzzy_deduplicate(df, column, threshold=65):
-    seen_texts = []
-    indices_to_keep = []
-    for i, text in enumerate(df[column]):
-        if pd.isna(text):
-            indices_to_keep.append(i)
-            continue
-        text = str(text)
-        if not seen_texts or all(fuzz.ratio(text, seen) < threshold for seen in seen_texts):
-            seen_texts.append(text)
-            indices_to_keep.append(i)
-    return df.iloc[indices_to_keep]
 
 def format_elapsed_time(seconds):
     hours, remainder = divmod(int(seconds), 3600)
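
The parsing in estimate_sentiment_and_impact() is plain string splitting, so it can be exercised without any LLM call. Note that `prompt | llm | RunnablePassthrough()` with a chat model typically yields a message object rather than a str, so the `isinstance(response, str)` guard may silently skip parsing unless `response.content` is unwrapped first. A sketch of the expected response format (not part of the commit):

    sample = ("Sentiment: Negative\n"
              "Impact: Умеренный риск убытков\n"
              "Reasoning: Новость указывает на возможные штрафы.")

    sentiment_part = sample.split("Sentiment:")[1].split("\n")[0].strip().lower()  # 'negative'
    impact_part, reasoning_part = sample.split("Reasoning:")
    impact = impact_part.split("Impact:")[1].strip()  # 'Умеренный риск убытков'
    reasoning = reasoning_part.strip()                # 'Новость указывает на возможные штрафы.'
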
@@ -379,11 +99,30 @@ def format_elapsed_time(seconds):
         time_parts.append(f"{hours} час{'ов' if hours != 1 else ''}")
     if minutes > 0:
         time_parts.append(f"{minutes} минут{'' if minutes == 1 else 'ы' if 2 <= minutes <= 4 else ''}")
-    if seconds > 0 or not time_parts: # always show seconds if it's the only non-zero value
+    if seconds > 0 or not time_parts:
         time_parts.append(f"{seconds} секунд{'а' if seconds == 1 else 'ы' if 2 <= seconds <= 4 else ''}")
 
     return " ".join(time_parts)
 
+def generate_sentiment_visualization(df):
+    negative_df = df[df['Sentiment'] == 'Negative']
+
+    if negative_df.empty:
+        st.warning("Не обнаружено негативных упоминаний. Отображаем общую статистику по объектам.")
+        entity_counts = df['Объект'].value_counts()
+    else:
+        entity_counts = negative_df['Объект'].value_counts()
+
+    if len(entity_counts) == 0:
+        st.warning("Нет данных для визуализации.")
+        return None
+
+    fig, ax = plt.subplots(figsize=(12, max(6, len(entity_counts) * 0.5)))
+    entity_counts.plot(kind='barh', ax=ax)
+    ax.set_title('Количество негативных упоминаний по объектам')
+    ax.set_xlabel('Количество упоминаний')
+    plt.tight_layout()
+    return fig
 
 def process_file(uploaded_file):
     df = pd.read_excel(uploaded_file, sheet_name='Публикации')
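
format_elapsed_time() builds a human-readable Russian duration. Assuming the elided middle of the function computes `minutes, seconds = divmod(remainder, 60)` and appends hours only when positive, it behaves like this (a sketch, not part of the commit):

    print(format_elapsed_time(5))     # "5 секунд"
    print(format_elapsed_time(125))   # "2 минуты 5 секунд"
    print(format_elapsed_time(3725))  # "1 час 2 минуты 5 секунд"
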
@@ -395,24 +134,19 @@ def process_file(uploaded_file):
         st.stop()
 
     original_news_count = len(df)
-
-    # Apply fuzzy deduplication
     df = df.groupby('Объект').apply(
        lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
     ).reset_index(drop=True)
 
     remaining_news_count = len(df)
     duplicates_removed = original_news_count - remaining_news_count
-
     st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
 
-    # Initialize LLM
     llm = init_langchain_llm()
     if not llm:
         st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
         st.stop()
 
-    # Initialize columns for results
     df['Sentiment'] = ''
     df['Impact'] = ''
     df['Reasoning'] = ''
@@ -420,7 +154,6 @@ def process_file(uploaded_file):
     progress_bar = st.progress(0)
     status_text = st.empty()
 
-    # Process each news item
     for index, row in df.iterrows():
         sentiment, impact, reasoning = estimate_sentiment_and_impact(
             llm,
@@ -432,12 +165,10 @@ def process_file(uploaded_file):
         df.at[index, 'Impact'] = impact
         df.at[index, 'Reasoning'] = reasoning
 
-        # Display progress
         progress = (index + 1) / len(df)
         progress_bar.progress(progress)
         status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
 
-        # Display each analysis result
         st.write(f"Объект: {row['Объект']}")
         st.write(f"Новость: {row['Заголовок']}")
         st.write(f"Тональность: {sentiment}")
@@ -448,18 +179,36 @@ def process_file(uploaded_file):
     progress_bar.empty()
     status_text.empty()
 
-    # Generate visualization after processing
     visualization = generate_sentiment_visualization(df)
     if visualization:
         st.pyplot(visualization)
 
     return df
 
+def create_analysis_data(df):
+    analysis_data = []
+    for _, row in df.iterrows():
+        if row['Sentiment'] == 'Negative':
+            analysis_data.append([
+                row['Объект'],
+                row['Заголовок'],
+                'РИСК УБЫТКА',
+                row['Impact'],
+                row['Reasoning'],
+                row['Выдержки из текста']
+            ])
+    return pd.DataFrame(analysis_data, columns=[
+        'Объект',
+        'Заголовок',
+        'Признак',
+        'Оценка влияния',
+        'Обоснование',
+        'Текст сообщения'
+    ])
 
 def create_output_file(df, uploaded_file):
     wb = load_workbook("sample_file.xlsx")
 
-    # Update 'Сводка' sheet
     summary_df = pd.DataFrame({
         'Объект': df['Объект'].unique(),
         'Всего новостей': df.groupby('Объект').size(),
@@ -470,16 +219,13 @@ def create_output_file(df, uploaded_file):
         )
     })
 
-    # Sort by number of negative mentions
     summary_df = summary_df.sort_values('Негативные', ascending=False)
 
-    # Write 'Сводка' sheet
     ws = wb['Сводка']
     for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=True), start=4):
         for c_idx, value in enumerate(row, start=5):
             ws.cell(row=r_idx, column=c_idx, value=value)
 
-    # Update 'Значимые' sheet
     significant_data = []
     for _, row in df.iterrows():
         if row['Sentiment'] in ['Negative', 'Positive']:
@@ -497,21 +243,18 @@ def create_output_file(df, uploaded_file):
         for c_idx, value in enumerate(row, start=3):
             ws.cell(row=r_idx, column=c_idx, value=value)
 
-    # Update 'Анализ' sheet
     analysis_df = create_analysis_data(df)
     ws = wb['Анализ']
     for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=True), start=4):
         for c_idx, value in enumerate(row, start=5):
             ws.cell(row=r_idx, column=c_idx, value=value)
 
-    # Copy 'Публикации' sheet from original uploaded file
     original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
     ws = wb['Публикации']
     for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
         for c_idx, value in enumerate(row, start=1):
             ws.cell(row=r_idx, column=c_idx, value=value)
 
-    # Add 'Тех.приложение' sheet with processed data
     if 'Тех.приложение' not in wb.sheetnames:
         wb.create_sheet('Тех.приложение')
     ws = wb['Тех.приложение']
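
create_output_file() writes each DataFrame into the sample_file.xlsx template cell by cell at an offset (e.g. row 4, column 5) so the template's own headers and formatting survive. A standalone sketch of the same openpyxl pattern (not part of the commit); note that version 73 drops the `from openpyxl.utils.dataframe import dataframe_to_rows` import while create_output_file() still calls it, so that import most likely needs restoring:

    import pandas as pd
    from openpyxl import Workbook
    from openpyxl.utils.dataframe import dataframe_to_rows

    wb = Workbook()
    ws = wb.active
    df = pd.DataFrame({'Объект': ['A', 'B'], 'Всего новостей': [3, 1]})
    # dataframe_to_rows() yields plain row tuples; offsets keep template headers intact
    for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=4):
        for c_idx, value in enumerate(row, start=5):
            ws.cell(row=r_idx, column=c_idx, value=value)
    wb.save("sketch.xlsx")
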
@@ -524,43 +267,17 @@ def create_output_file(df, uploaded_file):
     output.seek(0)
     return output
 
-
-
-def generate_sentiment_visualization(df):
-    # Filter for negative sentiments
-    negative_df = df[df['Sentiment'] == 'Negative']
-
-    if negative_df.empty:
-        st.warning("Не обнаружено негативных упоминаний. Отображаем общую статистику по объектам.")
-        entity_counts = df['Объект'].value_counts()
-    else:
-        entity_counts = negative_df['Объект'].value_counts()
-
-    if len(entity_counts) == 0:
-        st.warning("Нет данных для визуализации.")
-        return None
-
-    # Create a horizontal bar chart showing entity risk levels
-    fig, ax = plt.subplots(figsize=(12, max(6, len(entity_counts) * 0.5)))
-    entity_counts.plot(kind='barh', ax=ax)
-    ax.set_title('Количество негативных упоминаний по объектам')
-    ax.set_xlabel('Количество упоминаний')
-    plt.tight_layout()
-    return fig
-
-
 def main():
-    # Add custom CSS for the signature
     st.markdown(
         """
         <style>
         .signature {
            position: fixed;
-            right: 10px;
-            bottom: 10px;
-            font-size: 12px;
-            color: #666;
-            opacity: 0.7;
+            right: 12px;
+            bottom: 12px;
+            font-size: 14px;
+            color: #FF0000;
+            opacity: 0.9;
             z-index: 999;
         }
         </style>
@@ -569,12 +286,10 @@ def main():
         unsafe_allow_html=True
     )
 
-    st.title("... приступим к анализу... версия 72")
+    st.title("... приступим к анализу... версия 73")
 
-    # Initialize session state
     if 'processed_df' not in st.session_state:
         st.session_state.processed_df = None
-
 
     uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
 
@@ -584,13 +299,14 @@ def main():
             st.session_state.processed_df = process_file(uploaded_file)
 
         st.subheader("Предпросмотр данных")
-        st.write(st.session_state.processed_df.head())
-
+        preview_df = st.session_state.processed_df[['Объект', 'Заголовок', 'Sentiment', 'Impact']].head()
+        st.dataframe(preview_df)
+
         analysis_df = create_analysis_data(st.session_state.processed_df)
         st.subheader("Анализ")
         st.dataframe(analysis_df)
 
-        output = create_output_file_with_llm(st.session_state.processed_df, uploaded_file, analysis_df)
+        output = create_output_file(st.session_state.processed_df, uploaded_file)
 
         end_time = time.time()
         elapsed_time = end_time - start_time
@@ -598,9 +314,9 @@ def main():
         st.success(f"Обработка и анализ завершены за {formatted_time}.")
 
         st.download_button(
-            label="Скачать результат анализа новостей с оценкой нейросети",
+            label="Скачать результат анализа",
             data=output,
-            file_name="результат_анализа_с_нейросетью.xlsx",
+            file_name="результат_анализа.xlsx",
             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
         )
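
The download flow at the end of main() never touches disk: the workbook is saved into an io.BytesIO buffer, rewound with seek(0), and handed to st.download_button. A minimal sketch of that pattern (not part of the commit):

    import io
    import streamlit as st
    from openpyxl import Workbook

    wb = Workbook()
    output = io.BytesIO()
    wb.save(output)    # write the .xlsx bytes into memory
    output.seek(0)     # rewind so the buffer is read from the start
    st.download_button(
        label="Скачать результат анализа",
        data=output,
        file_name="результат_анализа.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )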