pentarosarium committed
Commit 261f952 · 1 Parent(s): c58ea62

progress more 44+

Files changed (1)
app.py +136 -3
app.py CHANGED
@@ -14,6 +14,9 @@ from openpyxl import load_workbook
 from openpyxl import Workbook
 from openpyxl.utils.dataframe import dataframe_to_rows
 from sentiment_decorators import sentiment_analysis_decorator
+from langchain.llms import HuggingFacePipeline
+from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
 
 # Initialize pymystem3 for lemmatization
 mystem = Mystem()
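Note on the new imports: these paths are version-dependent. On LangChain 0.1+ the Hugging Face integration moved into the community package, so an equivalent set of imports (assuming langchain-community is installed) would be:

from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain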
 
@@ -26,12 +29,112 @@ finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
 rubert1 = pipeline("sentiment-analysis", model = "DeepPavlov/rubert-base-cased")
 rubert2 = pipeline("sentiment-analysis", model = "blanchefort/rubert-base-cased-sentiment")
 
+def init_langchain_llm():
+    pipe = pipeline("text-generation", model="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
+    llm = HuggingFacePipeline(pipeline=pipe)
+    return llm
+
+# Function to estimate impact using LLM
+def estimate_impact(llm, news_text):
+    template = """
+    Analyze the following news piece and estimate its monetary impact in Russian rubles for the next 6 months.
+    If a monetary estimate is not possible, categorize the impact as "Значительный", "Незначительный", or "Неопределенный".
+    Also provide a short reasoning (max 100 words) for your assessment.
+
+    News: {news}
+
+    Estimated Impact:
+    Reasoning:
+    """
+    prompt = PromptTemplate(template=template, input_variables=["news"])
+    chain = LLMChain(llm=llm, prompt=prompt)
+    response = chain.run(news=news_text)
+
+    # Parse the response to extract impact and reasoning.
+    # Note: this parsing is fragile and may need to change if the model's output format differs.
+    impact, reasoning = response.split("Reasoning:")
+    impact = impact.strip()
+    reasoning = reasoning.strip()
+
+    return impact, reasoning
+
+def process_file_with_llm(uploaded_file, llm):
+    df = process_file(uploaded_file)
+
+    # Add new columns for LLM analysis
+    df['LLM_Impact'] = ''
+    df['LLM_Reasoning'] = ''
+
+    for index, row in df.iterrows():
+        if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
+            impact, reasoning = estimate_impact(llm, row['Выдержки из текста'])
+            df.at[index, 'LLM_Impact'] = impact
+            df.at[index, 'LLM_Reasoning'] = reasoning
+
+    return df
+
+def create_output_file_with_llm(df, uploaded_file, analysis_df):
+    wb = load_workbook("sample_file.xlsx")
+
+    # Update 'Сводка' sheet
+    summary_df = pd.DataFrame({
+        'Объект': df['Объект'].unique(),
+        'Всего новостей': df.groupby('Объект').size(),
+        'Отрицательные': df[df[['FinBERT', 'RoBERTa', 'FinBERT-Tone']].eq('Negative').any(axis=1)].groupby('Объект').size(),
+        'Положительные': df[df[['FinBERT', 'RoBERTa', 'FinBERT-Tone']].eq('Positive').any(axis=1)].groupby('Объект').size(),
+        'Impact': df.groupby('Объект')['LLM_Impact'].agg(lambda x: x.value_counts().index[0] if x.any() else 'Неопределенный')
+    })
+    ws = wb['Сводка']
+    for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=False), start=4):
+        for c_idx, value in enumerate(row, start=5):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Update 'Значимые' sheet
+    significant_data = []
+    for _, row in df.iterrows():
+        if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
+            sentiment = 'Negative' if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']) else 'Positive'
+            significant_data.append([row['Объект'], 'релевантен', sentiment, row['LLM_Impact'], row['Заголовок'], row['Выдержки из текста']])
+
+    ws = wb['Значимые']
+    for r_idx, row in enumerate(significant_data, start=3):
+        for c_idx, value in enumerate(row, start=3):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Update 'Анализ' sheet
+    analysis_df['LLM_Reasoning'] = df['LLM_Reasoning']
+    ws = wb['Анализ']
+    for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=False), start=4):
+        for c_idx, value in enumerate(row, start=5):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Copy 'Публикации' sheet from original uploaded file
+    original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
+    ws = wb['Публикации']
+    for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
+        for c_idx, value in enumerate(row, start=1):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Add 'Тех.приложение' sheet with processed data
+    if 'Тех.приложение' not in wb.sheetnames:
+        wb.create_sheet('Тех.приложение')
+    ws = wb['Тех.приложение']
+    for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
+        for c_idx, value in enumerate(row, start=1):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+
+    output = io.BytesIO()
+    wb.save(output)
+    output.seek(0)
+    return output
+
 def create_analysis_data(df):
     analysis_data = []
     for _, row in df.iterrows():
         if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
             analysis_data.append([row['Объект'], row['Заголовок'], 'РИСК УБЫТКА', '', row['Выдержки из текста']])
-    return pd.DataFrame(analysis_data, columns=['Объект', 'Заголовок', 'Признак', 'Материальность', 'Текст сообщения'])
+    return pd.DataFrame(analysis_data, columns=['Объект', 'Заголовок', 'Признак', 'Пояснение', 'Текст сообщения'])
 
 # Function for lemmatizing Russian text
 def lemmatize_text(text):
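Note on estimate_impact(): the two-way unpack on response.split("Reasoning:") assumes the label occurs exactly once, but a text-generation pipeline may echo the prompt (which already contains "Reasoning:"), yielding three parts and a ValueError. A more defensive parse is sketched below; the helper name and the fallback value are illustrative assumptions, not part of this commit.

def parse_impact_response(response):
    # Hypothetical helper: look only at the text after the LAST "Estimated Impact:"
    # marker so an echoed prompt does not break the split, and never raise.
    tail = response.rsplit("Estimated Impact:", 1)[-1]
    if "Reasoning:" in tail:
        impact, reasoning = (part.strip() for part in tail.split("Reasoning:", 1))
    else:
        impact, reasoning = tail.strip(), ""
    return impact or "Неопределенный", reasoning

estimate_impact() could then return parse_impact_response(response) instead of splitting directly.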
 
@@ -125,6 +228,20 @@ def fuzzy_deduplicate(df, column, threshold=65):
             indices_to_keep.append(i)
     return df.iloc[indices_to_keep]
 
+def format_elapsed_time(seconds):
+    hours, remainder = divmod(int(seconds), 3600)
+    minutes, seconds = divmod(remainder, 60)
+
+    time_parts = []
+    if hours > 0:
+        time_parts.append(f"{hours} час{'' if hours == 1 else 'а' if 2 <= hours <= 4 else 'ов'}")
+    if minutes > 0:
+        time_parts.append(f"{minutes} минут{'а' if minutes == 1 else 'ы' if 2 <= minutes <= 4 else ''}")
+    if seconds > 0 or not time_parts:  # show seconds whenever non-zero, or when nothing else was added
+        time_parts.append(f"{seconds} секунд{'а' if seconds == 1 else 'ы' if 2 <= seconds <= 4 else ''}")
+
+    return " ".join(time_parts)
+
 
 def process_file(uploaded_file):
     df = pd.read_excel(uploaded_file, sheet_name='Публикации')
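For reference, with the suffix handling above, format_elapsed_time() produces strings like the following (worked examples, not output captured from the app):

format_elapsed_time(3725)  # "1 час 2 минуты 5 секунд"
format_elapsed_time(61)    # "1 минута 1 секунда"
format_elapsed_time(0)     # "0 секунд"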
 
@@ -257,7 +374,7 @@ def create_output_file(df, uploaded_file, analysis_df):
     return output
 
 def main():
-    st.title("... приступим к анализу... версия 43+")
+    st.title("... приступим к анализу... версия 44+")
 
     uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
 
 
@@ -292,7 +409,8 @@ def main():
         # Calculate elapsed time
         end_time = time.time()
         elapsed_time = end_time - start_time
-        st.success(f"Обработка завершена за {elapsed_time:.2f} секунд.")
+        formatted_time = format_elapsed_time(elapsed_time)
+        st.success(f"Обработка завершена за {formatted_time}.")
 
         # Offer download of results
 
 
@@ -302,5 +420,20 @@
             file_name="результат_анализа_новостей.xlsx",
             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
         )
+
+        # Add button for LLM analysis
+        if st.button("Что скажет нейросеть?"):
+            st.info("Анализ нейросетью начался. Это может занять некоторое время...")
+            llm = init_langchain_llm()
+            df_with_llm = process_file_with_llm(uploaded_file, llm)
+            output_with_llm = create_output_file_with_llm(df_with_llm, uploaded_file, analysis_df)
+            st.success("Анализ нейросетью завершен!")
+            st.download_button(
+                label="Скачать результат анализа с оценкой нейросети",
+                data=output_with_llm,
+                file_name="результат_анализа_с_нейросетью.xlsx",
+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+            )
+
 if __name__ == "__main__":
     main()
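Note on the new button handler: init_langchain_llm() constructs the nvidia/Llama-3.1-Nemotron-70B-Instruct-HF pipeline inside the click handler, so the model is reloaded on every Streamlit rerun. Assuming a Streamlit version that provides st.cache_resource (1.18+), one way to keep a single instance is a cached wrapper; the wrapper name is an assumption, not part of this commit.

import streamlit as st

@st.cache_resource
def get_cached_llm():
    # Hypothetical wrapper: build the LangChain LLM once per server process
    # and reuse it across reruns/clicks instead of reloading the model each time.
    return init_langchain_llm()

The handler would then call llm = get_cached_llm() in place of init_langchain_llm().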