pentarosarium committed
Commit 45f1473 · 1 Parent(s): 076cf43

progress more 45

Files changed (1):
  1. app.py +42 -276
app.py CHANGED
@@ -8,7 +8,6 @@ from pymystem3 import Mystem
  import io
  from rapidfuzz import fuzz
  from tqdm.auto import tqdm
- import time
  import torch
  from openpyxl import load_workbook
  from openpyxl import Workbook
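
Note: this hunk drops import time, yet main() further down still calls time.time() to measure processing time. Unless the import survives elsewhere in the file, the new revision will raise NameError on the first upload; a minimal sketch of the timing pattern that still depends on the module:

    import time  # still needed by main() below

    start_time = time.time()
    # ... long-running processing ...
    elapsed_time = time.time() - start_time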
@@ -22,19 +21,24 @@ from langchain.chains import LLMChain
  mystem = Mystem()
 
  # Set up the sentiment analyzers
-
  finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")
  roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
  finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
- rubert1 = pipeline("sentiment-analysis", model = "DeepPavlov/rubert-base-cased")
- rubert2 = pipeline("sentiment-analysis", model = "blanchefort/rubert-base-cased-sentiment")
+ rubert1 = pipeline("sentiment-analysis", model="DeepPavlov/rubert-base-cased")
+ rubert2 = pipeline("sentiment-analysis", model="blanchefort/rubert-base-cased-sentiment")
+
+ # Translation model for Russian to English
+ model_name = "Helsinki-NLP/opus-mt-ru-en"
+ translation_tokenizer = AutoTokenizer.from_pretrained(model_name)
+ translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
 
  def init_langchain_llm():
      pipe = pipeline("text-generation", model="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
      llm = HuggingFacePipeline(pipeline=pipe)
      return llm
 
- # Function to estimate impact using LLM
  def estimate_impact(llm, news_text):
      template = """
      Analyze the following news piece and estimate its monetary impact in Russian rubles for the next 6 months.
@@ -50,24 +54,19 @@ def estimate_impact(llm, news_text):
      chain = LLMChain(llm=llm, prompt=prompt)
      response = chain.run(news=news_text)
 
-     # Parse the response to extract impact and reasoning
-     # Parsing logic is very important! Might be needed to be changed
      impact, reasoning = response.split("Reasoning:")
      impact = impact.strip()
      reasoning = reasoning.strip()
 
      return impact, reasoning
 
- def process_file_with_llm(uploaded_file, llm):
-     df = process_file(uploaded_file)
-
-     # Add new columns for LLM analysis
+ def process_file_with_llm(df, llm):
      df['LLM_Impact'] = ''
      df['LLM_Reasoning'] = ''
 
      for index, row in df.iterrows():
          if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
-             impact, reasoning = estimate_impact(llm, row['Выдержки из текста'])
+             impact, reasoning = estimate_impact(llm, row['Translated'])  # Use translated text
              df.at[index, 'LLM_Impact'] = impact
              df.at[index, 'LLM_Reasoning'] = reasoning
 
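The parsing in estimate_impact() above hinges on the model emitting a literal "Reasoning:" marker; the two-way unpack raises ValueError if the marker is missing or repeated. A worked example with a hypothetical response string:

    # Hypothetical LLM response in the format the prompt template requests.
    response = "Impact: -50 mln RUB over 6 months. Reasoning: sustained negative coverage."
    impact, reasoning = response.split("Reasoning:")
    print(impact.strip())     # Impact: -50 mln RUB over 6 months.
    print(reasoning.strip())  # sustained negative coverage.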
@@ -123,268 +122,34 @@ def create_output_file_with_llm(df, uploaded_file, analysis_df):
          for c_idx, value in enumerate(row, start=1):
              ws.cell(row=r_idx, column=c_idx, value=value)
 
-
      output = io.BytesIO()
      wb.save(output)
      output.seek(0)
      return output
 
- def create_analysis_data(df):
-     analysis_data = []
-     for _, row in df.iterrows():
-         if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
-             analysis_data.append([row['Объект'], row['Заголовок'], 'РИСК УБЫТКА', '', row['Выдержки из текста']])
-     return pd.DataFrame(analysis_data, columns=['Объект', 'Заголовок', 'Признак', 'Пояснение', 'Текст сообщения'])
-
- # Function for lemmatizing Russian text
- def lemmatize_text(text):
-     if pd.isna(text):
-         return ""
-
-     if not isinstance(text, str):
-         text = str(text)
-
-     words = text.split()
-     lemmatized_words = []
-     for word in tqdm(words, desc="Lemmatizing", unit="word"):
-         lemmatized_word = ''.join(mystem.lemmatize(word))
-         lemmatized_words.append(lemmatized_word)
-     return ' '.join(lemmatized_words)
-
- # Translation model for Russian to English
- model_name = "Helsinki-NLP/opus-mt-ru-en"
- translation_tokenizer = AutoTokenizer.from_pretrained(model_name)
- translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-
- translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
-
- def translate(text):
-     # Tokenize the input text
-     inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
-
-     # Calculate max_length based on input length
-     input_length = inputs.input_ids.shape[1]
-     max_length = max(input_length + 10, int(input_length * 1.5))  # Ensure at least 10 new tokens
-
-     # Generate translation
-     translated_tokens = translation_model.generate(
-         **inputs,
-         max_new_tokens=max_length,  # Use max_new_tokens instead of max_length
-         num_beams=5,
-         no_repeat_ngram_size=2,
-         early_stopping=True
-     )
-
-     # Decode the translated tokens
-     translated_text = translation_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
-     return translated_text
-
- # Functions for FinBERT, RoBERTa, and FinBERT-Tone with label mapping
- def get_mapped_sentiment(result):
-     label = result['label'].lower()
-     if label in ["positive", "label_2", "pos", "pos_label"]:
-         return "Positive"
-     elif label in ["negative", "label_0", "neg", "neg_label"]:
-         return "Negative"
-     return "Neutral"
-
- @sentiment_analysis_decorator
- def get_rubert1_sentiment(text):
-     result = rubert1(text, truncation=True, max_length=512)[0]
-     return get_mapped_sentiment(result)
-
- @sentiment_analysis_decorator
- def get_rubert2_sentiment(text):
-     result = rubert2(text, truncation=True, max_length=512)[0]
-     return get_mapped_sentiment(result)
-
- @sentiment_analysis_decorator
- def get_finbert_sentiment(text):
-     result = finbert(text, truncation=True, max_length=512)[0]
-     return get_mapped_sentiment(result)
-
- @sentiment_analysis_decorator
- def get_roberta_sentiment(text):
-     result = roberta(text, truncation=True, max_length=512)[0]
-     return get_mapped_sentiment(result)
-
- @sentiment_analysis_decorator
- def get_finbert_tone_sentiment(text):
-     result = finbert_tone(text, truncation=True, max_length=512)[0]
-     return get_mapped_sentiment(result)
-
- # Fuzzy filter out similar news for the same NER
- def fuzzy_deduplicate(df, column, threshold=65):
-     seen_texts = []
-     indices_to_keep = []
-     for i, text in enumerate(df[column]):
-         if pd.isna(text):
-             indices_to_keep.append(i)
-             continue
-         text = str(text)
-         if not seen_texts or all(fuzz.ratio(text, seen) < threshold for seen in seen_texts):
-             seen_texts.append(text)
-             indices_to_keep.append(i)
-     return df.iloc[indices_to_keep]
-
- def format_elapsed_time(seconds):
-     hours, remainder = divmod(int(seconds), 3600)
-     minutes, seconds = divmod(remainder, 60)
-
-     time_parts = []
-     if hours > 0:
-         time_parts.append(f"{hours} час{'ов' if hours != 1 else ''}")
-     if minutes > 0:
-         time_parts.append(f"{minutes} минут{'' if minutes == 1 else 'ы' if 2 <= minutes <= 4 else ''}")
-     if seconds > 0 or not time_parts:  # always show seconds if it's the only non-zero value
-         time_parts.append(f"{seconds} секунд{'а' if seconds == 1 else 'ы' if 2 <= seconds <= 4 else ''}")
-
-     return " ".join(time_parts)
-
- def process_file(uploaded_file):
-     df = pd.read_excel(uploaded_file, sheet_name='Публикации')
-
-     required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
-     missing_columns = [col for col in required_columns if col not in df.columns]
-     if missing_columns:
-         st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
-         st.stop()
-
-     original_news_count = len(df)
-
-     # Apply fuzzy deduplication
-     df = df.groupby('Объект').apply(
-         lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
-     ).reset_index(drop=True)
-
-     remaining_news_count = len(df)
-     duplicates_removed = original_news_count - remaining_news_count
-
-     st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
-
-     # Translate texts
-     translated_texts = []
-     lemmatized_texts = []
-     progress_bar = st.progress(0)
-     progress_text = st.empty()
-     total_news = len(df)
-
-     texts = df['Выдержки из текста'].tolist()
-     # Data validation
-     texts = [str(text) if not pd.isna(text) else "" for text in texts]
-
-     for text in df['Выдержки из текста']:
-         lemmatized_texts.append(lemmatize_text(text))
-
-     for i, text in enumerate(lemmatized_texts):
-         translated_text = translate(str(text))
-         translated_texts.append(translated_text)
-         progress_bar.progress((i + 1) / len(df))
-         progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
-
-     # Perform sentiment analysis
-     rubert2_results = [get_rubert2_sentiment(text) for text in texts]
-     finbert_results = [get_finbert_sentiment(text) for text in translated_texts]
-     roberta_results = [get_roberta_sentiment(text) for text in translated_texts]
-     finbert_tone_results = [get_finbert_tone_sentiment(text) for text in translated_texts]
-
-     # Create a new DataFrame with processed data
-     processed_df = pd.DataFrame({
-         'Объект': df['Объект'],
-         'Заголовок': df['Заголовок'],  # Preserve original 'Заголовок'
-         'ruBERT2': rubert2_results,
-         'FinBERT': finbert_results,
-         'RoBERTa': roberta_results,
-         'FinBERT-Tone': finbert_tone_results,
-         'Выдержки из текста': df['Выдержки из текста'],
-         'Translated': translated_texts
-     })
-
-     return processed_df
-
- def create_output_file(df, uploaded_file, analysis_df):
-     # Load the sample file to use as a template
-     wb = load_workbook("sample_file.xlsx")
-
-     # Process data for 'Сводка' sheet
-     entities = df['Объект'].unique()
-     summary_data = []
-     for entity in entities:
-         entity_df = df[df['Объект'] == entity]
-         total_news = len(entity_df)
-         negative_news = sum((entity_df['FinBERT'] == 'Negative') |
-                             (entity_df['RoBERTa'] == 'Negative') |
-                             (entity_df['FinBERT-Tone'] == 'Negative'))
-         positive_news = sum((entity_df['FinBERT'] == 'Positive') |
-                             (entity_df['RoBERTa'] == 'Positive') |
-                             (entity_df['FinBERT-Tone'] == 'Positive'))
-         summary_data.append([entity, total_news, negative_news, positive_news])
-
-     summary_df = pd.DataFrame(summary_data, columns=['Объект', 'Всего новостей', 'Отрицательные', 'Положительные'])
-     summary_df = summary_df.sort_values('Отрицательные', ascending=False)
-
-     # Write 'Сводка' sheet
-     ws = wb['Сводка']
-     for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=False), start=4):
-         for c_idx, value in enumerate(row, start=5):
-             ws.cell(row=r_idx, column=c_idx, value=value)
-
-     # Process data for 'Значимые' sheet
-     significant_data = []
-     for _, row in df.iterrows():
-         if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
-             sentiment = 'Negative' if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']) else 'Positive'
-             significant_data.append([row['Объект'], '', sentiment, '', row['Заголовок'], row['Выдержки из текста']])
-
-     # Write 'Значимые' sheet
-     ws = wb['Значимые']
-     for r_idx, row in enumerate(significant_data, start=3):
-         for c_idx, value in enumerate(row, start=3):
-             ws.cell(row=r_idx, column=c_idx, value=value)
-
-     # Write 'Анализ' sheet
-     ws = wb['Анализ']
-     for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=False), start=4):
-         for c_idx, value in enumerate(row, start=5):
-             ws.cell(row=r_idx, column=c_idx, value=value)
-
-     # Copy 'Публикации' sheet from original uploaded file
-     original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
-     ws = wb['Публикации']
-     for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
-         for c_idx, value in enumerate(row, start=1):
-             ws.cell(row=r_idx, column=c_idx, value=value)
-
-     # Add 'Тех.приложение' sheet with processed data
-     if 'Тех.приложение' not in wb.sheetnames:
-         wb.create_sheet('Тех.приложение')
-     ws = wb['Тех.приложение']
-     for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
-         for c_idx, value in enumerate(row, start=1):
-             ws.cell(row=r_idx, column=c_idx, value=value)
-
-     # Save the workbook to a BytesIO object
-     output = io.BytesIO()
-     wb.save(output)
-     output.seek(0)
-
-     return output
+ # ... (keep other functions as they are)
 
  def main():
-     st.title("... приступим к анализу... версия 44+")
+     st.title("... приступим к анализу... версия 45")
+
+     # Initialize session state
+     if 'processed_df' not in st.session_state:
+         st.session_state.processed_df = None
+     if 'analysis_df' not in st.session_state:
+         st.session_state.analysis_df = None
+     if 'llm_analyzed' not in st.session_state:
+         st.session_state.llm_analyzed = False
 
      uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
 
-     if uploaded_file is not None:
+     if uploaded_file is not None and st.session_state.processed_df is None:
          start_time = time.time()
 
-         df = process_file(uploaded_file)
+         st.session_state.processed_df = process_file(uploaded_file)
+         st.session_state.analysis_df = create_analysis_data(st.session_state.processed_df)
 
          st.subheader("Предпросмотр данных")
-         st.write(df.head())
+         st.write(st.session_state.processed_df.head())
 
          st.subheader("Распределение окраски")
          fig, axs = plt.subplots(2, 2, figsize=(12, 8))
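
For reference on the sentiment helpers deleted above (and apparently meant to be kept, given the placeholder comment on the new side): transformers pipelines return dicts like {'label': ..., 'score': ...}, and get_mapped_sentiment normalized the differing label schemes (FinBERT emits "positive", the Cardiff RoBERTa emits "LABEL_2") to one vocabulary. A self-contained replay of that mapping:

    # Same logic as the removed get_mapped_sentiment(); inputs mimic pipeline output.
    def get_mapped_sentiment(result):
        label = result['label'].lower()
        if label in ["positive", "label_2", "pos", "pos_label"]:
            return "Positive"
        elif label in ["negative", "label_0", "neg", "neg_label"]:
            return "Negative"
        return "Neutral"

    print(get_mapped_sentiment({'label': 'LABEL_2', 'score': 0.97}))   # Positive
    print(get_mapped_sentiment({'label': 'negative', 'score': 0.88}))  # Negative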
@@ -393,7 +158,7 @@ def main():
          models = ['ruBERT2','FinBERT', 'RoBERTa', 'FinBERT-Tone']
          for i, model in enumerate(models):
              ax = axs[i // 2, i % 2]
-             sentiment_counts = df[model].value_counts()
+             sentiment_counts = st.session_state.processed_df[model].value_counts()
              sentiment_counts.plot(kind='bar', ax=ax)
              ax.set_title(f"{model} Sentiment")
              ax.set_xlabel("Sentiment")
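
The session-state rework in the surrounding hunks exists because Streamlit reruns the whole script on every widget interaction; caching the processed DataFrame in st.session_state keeps the LLM button below from re-triggering translation and sentiment scoring. A minimal sketch of the pattern, with a hypothetical expensive_process() standing in for the app's process_file():

    import streamlit as st

    def expensive_process(uploaded):  # stand-in for the app's process_file()
        return uploaded.getvalue()

    if 'result' not in st.session_state:
        st.session_state.result = None

    uploaded = st.file_uploader("Excel file", type="xlsx")
    if uploaded is not None and st.session_state.result is None:
        # Runs once per uploaded file, not on every rerun.
        st.session_state.result = expensive_process(uploaded)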
@@ -401,19 +166,17 @@ def main():
 
          plt.tight_layout()
          st.pyplot(fig)
-         analysis_df = create_analysis_data(df)
+
          st.subheader("Анализ")
-         st.dataframe(analysis_df)
-         output = create_output_file(df, uploaded_file, analysis_df)
+         st.dataframe(st.session_state.analysis_df)
+
+         output = create_output_file(st.session_state.processed_df, uploaded_file, st.session_state.analysis_df)
 
-         # Calculate elapsed time
          end_time = time.time()
          elapsed_time = end_time - start_time
          formatted_time = format_elapsed_time(elapsed_time)
          st.success(f"Обработка завершена за {formatted_time}.")
 
-         # Offer download of results
-
          st.download_button(
              label="Скачать результат анализа новостей",
              data=output,
@@ -421,20 +184,23 @@ def main():
              mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
          )
 
-         # Add button for LLM analysis
+     if st.session_state.processed_df is not None and not st.session_state.llm_analyzed:
          if st.button("Что скажет нейросеть?"):
              st.info("Анализ нейросетью начался. Это может занять некоторое время...")
              llm = init_langchain_llm()
-             df_with_llm = process_file_with_llm(uploaded_file, llm)
-             output_with_llm = create_output_file_with_llm(df_with_llm, uploaded_file, analysis_df)
+             df_with_llm = process_file_with_llm(st.session_state.processed_df, llm)
+             output_with_llm = create_output_file_with_llm(df_with_llm, uploaded_file, st.session_state.analysis_df)
              st.success("Анализ нейросетью завершен!")
-             st.download_button(
-                 label="Скачать результат анализа с оценкой нейросети",
-                 data=output_with_llm,
-                 file_name="результат_анализа_с_нейросетью.xlsx",
-                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-             )
+             st.session_state.llm_analyzed = True
+             st.session_state.output_with_llm = output_with_llm
+
+     if st.session_state.llm_analyzed:
+         st.download_button(
+             label="Скачать результат анализа с оценкой нейросети",
+             data=st.session_state.output_with_llm,
+             file_name="результат_анализа_с_нейросетью.xlsx",
+             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+         )
 
  if __name__ == "__main__":
      main()
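
For context on the deduplication step that process_file() relies on (one of the helpers the placeholder comment above is supposed to keep): fuzzy_deduplicate keeps a text only if its fuzz.ratio similarity to every already-kept text stays below the threshold. A small self-contained example at the app's threshold of 65:

    from rapidfuzz import fuzz

    texts = [
        "Компания X сообщила о росте выручки",
        "Компания X сообщила о росте выручки за квартал",  # near-duplicate
        "Банк Y получил штраф от регулятора",
    ]
    kept = []
    for text in texts:
        if all(fuzz.ratio(text, seen) < 65 for seen in kept):
            kept.append(text)
    print(kept)  # the near-duplicate second headline is dropped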