pentarosarium committed
Commit 9e97a7c · 1 Parent(s): 45f1473

progress more 46

Files changed (1)
  1. app.py +252 -10
app.py CHANGED
@@ -8,6 +8,7 @@ from pymystem3 import Mystem
 import io
 from rapidfuzz import fuzz
 from tqdm.auto import tqdm
+import time
 import torch
 from openpyxl import load_workbook
 from openpyxl import Workbook
@@ -21,18 +22,17 @@ from langchain.chains import LLMChain
 mystem = Mystem()
 
 # Set up the sentiment analyzers
+
 finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")
 roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
 finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
-rubert1 = pipeline("sentiment-analysis", model="DeepPavlov/rubert-base-cased")
-rubert2 = pipeline("sentiment-analysis", model="blanchefort/rubert-base-cased-sentiment")
-
-# Translation model for Russian to English
-model_name = "Helsinki-NLP/opus-mt-ru-en"
-translation_tokenizer = AutoTokenizer.from_pretrained(model_name)
-translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+rubert1 = pipeline("sentiment-analysis", model = "DeepPavlov/rubert-base-cased")
+rubert2 = pipeline("sentiment-analysis", model = "blanchefort/rubert-base-cased-sentiment")
 
-translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
+def init_langchain_llm():
+    pipe = pipeline("text-generation", model="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
+    llm = HuggingFacePipeline(pipeline=pipe)
+    return llm
 
 def init_langchain_llm():
     pipe = pipeline("text-generation", model="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
@@ -127,10 +127,252 @@ def create_output_file_with_llm(df, uploaded_file, analysis_df):
     output.seek(0)
     return output
 
-# ... (keep other functions as they are)
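+# Rows for the 'Анализ' sheet: a news item is flagged 'РИСК УБЫТКА' (risk of
+# loss) as soon as any of FinBERT, RoBERTa or FinBERT-Tone scores it Negative.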
+def create_analysis_data(df):
+    analysis_data = []
+    for _, row in df.iterrows():
+        if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
+            analysis_data.append([row['Объект'], row['Заголовок'], 'РИСК УБЫТКА', '', row['Выдержки из текста']])
+    return pd.DataFrame(analysis_data, columns=['Объект', 'Заголовок', 'Признак', 'Пояснение', 'Текст сообщения'])
+
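+# Note: lemmatization below calls mystem.lemmatize() once per word; pymystem3
+# can also lemmatize whole strings in one call, which is typically much faster.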
+# Function for lemmatizing Russian text
+def lemmatize_text(text):
+    if pd.isna(text):
+        return ""
+
+    if not isinstance(text, str):
+        text = str(text)
+
+    words = text.split()
+    lemmatized_words = []
+    for word in tqdm(words, desc="Lemmatizing", unit="word"):
+        lemmatized_word = ''.join(mystem.lemmatize(word))
+        lemmatized_words.append(lemmatized_word)
+    return ' '.join(lemmatized_words)
+
+# Translation model for Russian to English
+model_name = "Helsinki-NLP/opus-mt-ru-en"
+translation_tokenizer = AutoTokenizer.from_pretrained(model_name)
+translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
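+# NB: this `translator` pipeline is not referenced elsewhere in this diff;
+# translate() below drives the tokenizer/model pair directly to control length.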
+
+
+def translate(text):
+    # Tokenize the input text
+    inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
+
+    # Calculate max_length based on input length
+    input_length = inputs.input_ids.shape[1]
+    max_length = max(input_length + 10, int(input_length * 1.5))  # Ensure at least 10 new tokens
+
+    # Generate translation
+    translated_tokens = translation_model.generate(
+        **inputs,
+        max_new_tokens=max_length,  # Use max_new_tokens instead of max_length
+        num_beams=5,
+        no_repeat_ngram_size=2,
+        early_stopping=True
+    )
+
+    # Decode the translated tokens
+    translated_text = translation_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
+    return translated_text
+
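+# "label_0"/"label_2" match the LABEL_0 (negative) / LABEL_2 (positive) ids
+# emitted by cardiffnlp's RoBERTa; unrecognized labels fall through to Neutral.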
+# Functions for FinBERT, RoBERTa, and FinBERT-Tone with label mapping
+def get_mapped_sentiment(result):
+    label = result['label'].lower()
+    if label in ["positive", "label_2", "pos", "pos_label"]:
+        return "Positive"
+    elif label in ["negative", "label_0", "neg", "neg_label"]:
+        return "Negative"
+    return "Neutral"
+
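+# Each wrapper truncates its input to the models' 512-token limit and maps the
+# raw pipeline label to Positive/Negative/Neutral via get_mapped_sentiment().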
+@sentiment_analysis_decorator
+def get_rubert1_sentiment(text):
+    result = rubert1(text, truncation=True, max_length=512)[0]
+    return get_mapped_sentiment(result)
+
+@sentiment_analysis_decorator
+def get_rubert2_sentiment(text):
+    result = rubert2(text, truncation=True, max_length=512)[0]
+    return get_mapped_sentiment(result)
+
+@sentiment_analysis_decorator
+def get_finbert_sentiment(text):
+    result = finbert(text, truncation=True, max_length=512)[0]
+    return get_mapped_sentiment(result)
+
+@sentiment_analysis_decorator
+def get_roberta_sentiment(text):
+    result = roberta(text, truncation=True, max_length=512)[0]
+    return get_mapped_sentiment(result)
+
+@sentiment_analysis_decorator
+def get_finbert_tone_sentiment(text):
+    result = finbert_tone(text, truncation=True, max_length=512)[0]
+    return get_mapped_sentiment(result)
+
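+# Pairwise fuzz.ratio() makes this O(n²) per entity group; with threshold=65,
+# a text at least 65% similar to an already-kept one is dropped as a duplicate.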
+# Fuzzy-filter out similar news for the same NER
+def fuzzy_deduplicate(df, column, threshold=65):
+    seen_texts = []
+    indices_to_keep = []
+    for i, text in enumerate(df[column]):
+        if pd.isna(text):
+            indices_to_keep.append(i)
+            continue
+        text = str(text)
+        if not seen_texts or all(fuzz.ratio(text, seen) < threshold for seen in seen_texts):
+            seen_texts.append(text)
+            indices_to_keep.append(i)
+    return df.iloc[indices_to_keep]
+
+def format_elapsed_time(seconds):
+    hours, remainder = divmod(int(seconds), 3600)
+    minutes, seconds = divmod(remainder, 60)
+
+    # Russian plural endings: 1 час/минута/секунда, 2-4 часа/минуты/секунды, 5+ часов/минут/секунд
+    time_parts = []
+    if hours > 0:
+        time_parts.append(f"{hours} час{'' if hours == 1 else 'а' if 2 <= hours <= 4 else 'ов'}")
+    if minutes > 0:
+        time_parts.append(f"{minutes} минут{'а' if minutes == 1 else 'ы' if 2 <= minutes <= 4 else ''}")
+    if seconds > 0 or not time_parts:  # always show seconds if it's the only non-zero value
+        time_parts.append(f"{seconds} секунд{'а' if seconds == 1 else 'ы' if 2 <= seconds <= 4 else ''}")
+
+    return " ".join(time_parts)
+
+
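+# End-to-end pass over the 'Публикации' sheet: deduplicate per entity,
+# lemmatize, translate to English, then score sentiment (ruBERT2 sees the
+# original Russian, the English-language models see the translations).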
+def process_file(uploaded_file):
+    df = pd.read_excel(uploaded_file, sheet_name='Публикации')
+
+    required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
+    missing_columns = [col for col in required_columns if col not in df.columns]
+    if missing_columns:
+        st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
+        st.stop()
+
+    original_news_count = len(df)
+
+    # Apply fuzzy deduplication
+    df = df.groupby('Объект').apply(
+        lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
+    ).reset_index(drop=True)
+
+    remaining_news_count = len(df)
+    duplicates_removed = original_news_count - remaining_news_count
+
+    st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
+
+    # Translate texts
+    translated_texts = []
+    lemmatized_texts = []
+    progress_bar = st.progress(0)
+    progress_text = st.empty()
+    total_news = len(df)
+
+    texts = df['Выдержки из текста'].tolist()
+    # Data validation
+    texts = [str(text) if not pd.isna(text) else "" for text in texts]
+
+    for text in df['Выдержки из текста']:
+        lemmatized_texts.append(lemmatize_text(text))
+
+    for i, text in enumerate(lemmatized_texts):
+        translated_text = translate(str(text))
+        translated_texts.append(translated_text)
+        progress_bar.progress((i + 1) / len(df))
+        progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
+
+    # Perform sentiment analysis
+    rubert2_results = [get_rubert2_sentiment(text) for text in texts]
+    finbert_results = [get_finbert_sentiment(text) for text in translated_texts]
+    roberta_results = [get_roberta_sentiment(text) for text in translated_texts]
+    finbert_tone_results = [get_finbert_tone_sentiment(text) for text in translated_texts]
+
+    # Create a new DataFrame with processed data
+    processed_df = pd.DataFrame({
+        'Объект': df['Объект'],
+        'Заголовок': df['Заголовок'],  # Preserve original 'Заголовок'
+        'ruBERT2': rubert2_results,
+        'FinBERT': finbert_results,
+        'RoBERTa': roberta_results,
+        'FinBERT-Tone': finbert_tone_results,
+        'Выдержки из текста': df['Выдержки из текста'],
+        'Translated': translated_texts
+    })
+
+    return processed_df
+
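+# Fills the sample_file.xlsx template; the row/column offsets below (start=4,
+# start=5, ...) presumably match each sheet's data region in that template.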
+def create_output_file(df, uploaded_file, analysis_df):
+    # Load the sample file to use as a template
+    wb = load_workbook("sample_file.xlsx")
+
+    # Process data for 'Сводка' sheet
+    entities = df['Объект'].unique()
+    summary_data = []
+    for entity in entities:
+        entity_df = df[df['Объект'] == entity]
+        total_news = len(entity_df)
+        negative_news = sum((entity_df['FinBERT'] == 'Negative') |
+                            (entity_df['RoBERTa'] == 'Negative') |
+                            (entity_df['FinBERT-Tone'] == 'Negative'))
+        positive_news = sum((entity_df['FinBERT'] == 'Positive') |
+                            (entity_df['RoBERTa'] == 'Positive') |
+                            (entity_df['FinBERT-Tone'] == 'Positive'))
+        summary_data.append([entity, total_news, negative_news, positive_news])
+
+    summary_df = pd.DataFrame(summary_data, columns=['Объект', 'Всего новостей', 'Отрицательные', 'Положительные'])
+    summary_df = summary_df.sort_values('Отрицательные', ascending=False)
+
+    # Write 'Сводка' sheet
+    ws = wb['Сводка']
+    for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=False), start=4):
+        for c_idx, value in enumerate(row, start=5):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Process data for 'Значимые' sheet
+    significant_data = []
+    for _, row in df.iterrows():
+        if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
+            sentiment = 'Negative' if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']) else 'Positive'
+            significant_data.append([row['Объект'], '', sentiment, '', row['Заголовок'], row['Выдержки из текста']])
+
+    # Write 'Значимые' sheet
+    ws = wb['Значимые']
+    for r_idx, row in enumerate(significant_data, start=3):
+        for c_idx, value in enumerate(row, start=3):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Write 'Анализ' sheet
+    ws = wb['Анализ']
+    for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=False), start=4):
+        for c_idx, value in enumerate(row, start=5):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Copy 'Публикации' sheet from original uploaded file
+    original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
+    ws = wb['Публикации']
+    for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
+        for c_idx, value in enumerate(row, start=1):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Add 'Тех.приложение' sheet with processed data
+    if 'Тех.приложение' not in wb.sheetnames:
+        wb.create_sheet('Тех.приложение')
+    ws = wb['Тех.приложение']
+    for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
+        for c_idx, value in enumerate(row, start=1):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Save the workbook to a BytesIO object
+    output = io.BytesIO()
+    wb.save(output)
+    output.seek(0)
+
+    return output
 
 def main():
-    st.title("... приступим к анализу... версия 45")
+    st.title("... приступим к анализу... версия 46")
 
     # Initialize session state
     if 'processed_df' not in st.session_state: