pentarosarium committed on
Commit
a87d6f0
·
1 Parent(s): 1254c79

progress more (2)

Browse files
Files changed (2) hide show
  1. app.py +91 -95
  2. requirements.txt +1 -1
app.py CHANGED
@@ -10,6 +10,46 @@ from openpyxl import load_workbook
10
  from langchain_community.chat_models import ChatOpenAI
11
  from langchain.prompts import PromptTemplate
12
  from langchain_core.runnables import RunnablePassthrough
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def fuzzy_deduplicate(df, column, threshold=65):
15
  seen_texts = []
@@ -43,27 +83,22 @@ def init_langchain_llm():
43
  st.error(f"Error initializing the Groq LLM: {str(e)}")
44
  st.stop()
45
 
46
- def estimate_sentiment_and_impact(llm, news_text, entity):
47
  template = """
48
- First, translate this Russian text into English:
49
- "{news}"
50
-
51
- Then, analyze the translated text about the entity "{entity}" and determine:
52
- 1. Sentiment (Positive/Negative/Neutral)
53
- 2. Estimate potential financial impact in Russian rubles for this entity in the next 6 months.
54
 
55
  If precise monetary estimate is not possible, categorize the impact as one of the following:
56
- 1. "Significant risk of loss"
57
- 2. "Moderate risk of loss"
58
- 3. "Minor risk of loss"
59
- 4. "Probability of profit"
60
- 5. "Uncertain effect"
61
 
62
- Provide a brief reasoning (maximum 100 words).
 
 
63
 
64
  Your response should be in the following format:
65
- Translation: [Your English translation]
66
- Sentiment: [Positive/Negative/Neutral]
67
  Impact: [Your estimate or category]
68
  Reasoning: [Your reasoning]
69
  """
@@ -71,44 +106,19 @@ def estimate_sentiment_and_impact(llm, news_text, entity):
71
  chain = prompt | llm | RunnablePassthrough()
72
  response = chain.invoke({"entity": entity, "news": news_text})
73
 
74
- sentiment = "Neutral"
75
- impact = "Uncertain effect"
76
- reasoning = "Unable to provide reasoning"
77
 
78
  if isinstance(response, str):
79
  try:
80
- # Extract sentiment
81
- if "Sentiment:" in response:
82
- sentiment_part = response.split("Sentiment:")[1].split("\n")[0].strip().lower()
83
- if "positive" in sentiment_part:
84
- sentiment = "Positive"
85
- elif "negative" in sentiment_part:
86
- sentiment = "Negative"
87
-
88
- # Extract impact and reasoning
89
  if "Impact:" in response and "Reasoning:" in response:
90
  impact_part, reasoning_part = response.split("Reasoning:")
91
  impact = impact_part.split("Impact:")[1].strip()
92
  reasoning = reasoning_part.strip()
93
-
94
- # Translate impact categories back to Russian
95
- impact_mapping = {
96
- "Significant risk of loss": "Значительный риск убытков",
97
- "Moderate risk of loss": "Умеренный риск убытков",
98
- "Minor risk of loss": "Незначительный риск убытков",
99
- "Probability of profit": "Вероятность прибыли",
100
- "Uncertain effect": "Неопределенный эффект"
101
- }
102
-
103
- for eng, rus in impact_mapping.items():
104
- if eng.lower() in impact.lower():
105
- impact = rus
106
- break
107
-
108
  except Exception as e:
109
  st.error(f"Error parsing LLM response: {str(e)}")
110
 
111
- return sentiment, impact, reasoning
112
 
113
  def format_elapsed_time(seconds):
114
  hours, remainder = divmod(int(seconds), 3600)
@@ -153,52 +163,65 @@ def process_file(uploaded_file):
153
  st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
154
  st.stop()
155
 
 
 
 
 
 
 
 
156
  original_news_count = len(df)
157
  df = df.groupby('Объект').apply(
158
  lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
159
  ).reset_index(drop=True)
160
-
161
  remaining_news_count = len(df)
162
  duplicates_removed = original_news_count - remaining_news_count
163
  st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
164
 
165
- llm = init_langchain_llm()
166
- if not llm:
167
- st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
168
- st.stop()
169
-
 
170
  df['Sentiment'] = ''
171
  df['Impact'] = ''
172
  df['Reasoning'] = ''
173
 
174
- progress_bar = st.progress(0)
175
- status_text = st.empty()
176
-
177
  for index, row in df.iterrows():
178
- sentiment, impact, reasoning = estimate_sentiment_and_impact(
179
- llm,
180
- row['Выдержки из текста'],
181
- row['Объект']
182
- )
183
 
 
 
184
  df.at[index, 'Sentiment'] = sentiment
185
- df.at[index, 'Impact'] = impact
186
- df.at[index, 'Reasoning'] = reasoning
187
 
 
 
 
 
 
 
 
188
  progress = (index + 1) / len(df)
189
  progress_bar.progress(progress)
190
  status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
191
 
 
192
  st.write(f"Объект: {row['Объект']}")
193
  st.write(f"Новость: {row['Заголовок']}")
194
  st.write(f"Тональность: {sentiment}")
195
- st.write(f"Эффект: {impact}")
196
- st.write(f"Обоснование: {reasoning}")
 
197
  st.write("---")
198
 
199
  progress_bar.empty()
200
  status_text.empty()
201
 
 
202
  visualization = generate_sentiment_visualization(df)
203
  if visualization:
204
  st.pyplot(visualization)
@@ -229,6 +252,7 @@ def create_analysis_data(df):
229
  def create_output_file(df, uploaded_file):
230
  wb = load_workbook("sample_file.xlsx")
231
 
 
232
  summary_df = pd.DataFrame({
233
  'Объект': df['Объект'].unique(),
234
  'Всего новостей': df.groupby('Объект').size(),
@@ -241,44 +265,16 @@ def create_output_file(df, uploaded_file):
241
 
242
  summary_df = summary_df.sort_values('Негативные', ascending=False)
243
 
244
- ws = wb['Сводка']
245
- for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=True), start=4):
246
- for c_idx, value in enumerate(row, start=5):
247
- ws.cell(row=r_idx, column=c_idx, value=value)
248
 
249
- significant_data = []
250
- for _, row in df.iterrows():
251
- if row['Sentiment'] in ['Negative', 'Positive']:
252
- significant_data.append([
253
- row['Объект'],
254
- 'релевантен',
255
- row['Sentiment'],
256
- row['Impact'],
257
- row['Заголовок'],
258
- row['Выдержки из текста']
259
- ])
260
-
261
- ws = wb['Значимые']
262
- for r_idx, row in enumerate(significant_data, start=3):
263
- for c_idx, value in enumerate(row, start=3):
264
- ws.cell(row=r_idx, column=c_idx, value=value)
265
-
266
- analysis_df = create_analysis_data(df)
267
- ws = wb['Анализ']
268
- for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=True), start=4):
269
- for c_idx, value in enumerate(row, start=5):
270
- ws.cell(row=r_idx, column=c_idx, value=value)
271
-
272
- original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
273
- ws = wb['Публикации']
274
- for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
275
- for c_idx, value in enumerate(row, start=1):
276
- ws.cell(row=r_idx, column=c_idx, value=value)
277
 
278
  if 'Тех.приложение' not in wb.sheetnames:
279
  wb.create_sheet('Тех.приложение')
280
  ws = wb['Тех.приложение']
281
- for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
282
  for c_idx, value in enumerate(row, start=1):
283
  ws.cell(row=r_idx, column=c_idx, value=value)
284
 
@@ -306,7 +302,7 @@ def main():
306
  unsafe_allow_html=True
307
  )
308
 
309
- st.title("::: анализ мониторинга новостей СКАН-ИНТЕРФАКС :::")
310
 
311
  if 'processed_df' not in st.session_state:
312
  st.session_state.processed_df = None
 
10
  from langchain_community.chat_models import ChatOpenAI
11
  from langchain.prompts import PromptTemplate
12
  from langchain_core.runnables import RunnablePassthrough
13
+ from transformers import pipeline
14
+
15
# Initialize sentiment analyzers.
# Streamlit re-executes the script on every user interaction, so the models
# are built through st.cache_resource: they are constructed once per server
# process and reused on later reruns instead of being reloaded each time.
# NOTE(review): assumes `streamlit` is imported as `st` above this point and
# that the installed version provides `st.cache_resource` - confirm.
@st.cache_resource(show_spinner=False)
def _load_sentiment_pipelines():
    """Build the three sentiment pipelines used by analyze_sentiment.

    Returns:
        A tuple ``(finbert, roberta, finbert_tone)`` of pipeline objects.
    """
    return (
        pipeline("sentiment-analysis", model="ProsusAI/finbert"),
        pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment"),
        pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone"),
    )


finbert, roberta, finbert_tone = _load_sentiment_pipelines()
19
+
20
+
21
def translate_text(llm, text):
    """Translate Russian text into English using the supplied LLM.

    Args:
        llm: The LangChain model placed after the prompt in the chain.
        text: The Russian text to translate.

    Returns:
        The model's reply as a string with surrounding whitespace removed.
    """
    template = """
    Translate this Russian text into English:
    "{text}"

    Your response should contain only the English translation.
    """
    prompt = PromptTemplate(template=template, input_variables=["text"])
    chain = prompt | llm | RunnablePassthrough()
    response = chain.invoke({"text": text})
    # Chat models return a message object whose text lives in `.content`,
    # while completion models return a plain string. Calling `.strip()`
    # directly on a message object raises AttributeError, so unwrap first.
    content = getattr(response, "content", response)
    return str(content).strip()
32
+
33
def get_mapped_sentiment(result):
    """Normalize one classifier result to "Positive", "Negative" or "Neutral".

    Args:
        result: A mapping with a 'label' entry holding the model's label.

    Returns:
        "Positive" or "Negative" when the lower-cased label matches one of
        the known spellings for that class; "Neutral" for anything else.
    """
    positive_labels = ("positive", "label_2", "pos", "pos_label")
    negative_labels = ("negative", "label_0", "neg", "neg_label")

    normalized = result['label'].lower()
    if normalized in positive_labels:
        return "Positive"
    if normalized in negative_labels:
        return "Negative"
    return "Neutral"
40
+
41
def analyze_sentiment(text):
    """Classify `text` by combining the verdicts of the three sentiment models.

    finbert, roberta and finbert_tone are each called on the text (truncation
    enabled, max_length=512) and the first result of each is normalized with
    get_mapped_sentiment.

    Returns:
        "Negative" if at least one model says negative, "Positive" only if
        all three say positive, otherwise "Neutral".
    """
    votes = [
        get_mapped_sentiment(model(text, truncation=True, max_length=512)[0])
        for model in (finbert, roberta, finbert_tone)
    ]

    # A single negative vote is enough to flag the item as negative.
    if "Negative" in votes:
        return "Negative"
    if votes.count("Positive") == len(votes):
        return "Positive"
    return "Neutral"
52
+
53
 
54
  def fuzzy_deduplicate(df, column, threshold=65):
55
  seen_texts = []
 
83
  st.error(f"Error initializing the Groq LLM: {str(e)}")
84
  st.stop()
85
 
86
+ def estimate_impact(llm, news_text, entity):
87
  template = """
88
+ Analyze the following news piece about the entity "{entity}" and estimate its monetary impact in Russian rubles for this entity in the next 6 months.
 
 
 
 
 
89
 
90
  If precise monetary estimate is not possible, categorize the impact as one of the following:
91
+ 1. "Значительный риск убытков"
92
+ 2. "Умеренный риск убытков"
93
+ 3. "Незначительный риск убытков"
94
+ 4. "Вероятность прибыли"
95
+ 5. "Неопределенный эффект"
96
 
97
+ Provide brief reasoning (maximum 100 words).
98
+
99
+ News: {news}
100
 
101
  Your response should be in the following format:
 
 
102
  Impact: [Your estimate or category]
103
  Reasoning: [Your reasoning]
104
  """
 
106
  chain = prompt | llm | RunnablePassthrough()
107
  response = chain.invoke({"entity": entity, "news": news_text})
108
 
109
+ impact = "Неопределенный эффект"
110
+ reasoning = "Не удалось получить обоснование"
 
111
 
112
  if isinstance(response, str):
113
  try:
 
 
 
 
 
 
 
 
 
114
  if "Impact:" in response and "Reasoning:" in response:
115
  impact_part, reasoning_part = response.split("Reasoning:")
116
  impact = impact_part.split("Impact:")[1].strip()
117
  reasoning = reasoning_part.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  except Exception as e:
119
  st.error(f"Error parsing LLM response: {str(e)}")
120
 
121
+ return impact, reasoning
122
 
123
  def format_elapsed_time(seconds):
124
  hours, remainder = divmod(int(seconds), 3600)
 
163
  st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
164
  st.stop()
165
 
166
+ # Initialize LLM
167
+ llm = init_langchain_llm()
168
+ if not llm:
169
+ st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
170
+ st.stop()
171
+
172
+ # Deduplication
173
  original_news_count = len(df)
174
  df = df.groupby('Объект').apply(
175
  lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
176
  ).reset_index(drop=True)
177
+
178
  remaining_news_count = len(df)
179
  duplicates_removed = original_news_count - remaining_news_count
180
  st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
181
 
182
+ # Initialize progress
183
+ progress_bar = st.progress(0)
184
+ status_text = st.empty()
185
+
186
+ # Process each news item
187
+ df['Translated'] = ''
188
  df['Sentiment'] = ''
189
  df['Impact'] = ''
190
  df['Reasoning'] = ''
191
 
 
 
 
192
  for index, row in df.iterrows():
193
+ # First: Translate
194
+ translated_text = translate_text(llm, row['Выдержки из текста'])
195
+ df.at[index, 'Translated'] = translated_text
 
 
196
 
197
+ # Second: Analyze sentiment
198
+ sentiment = analyze_sentiment(translated_text)
199
  df.at[index, 'Sentiment'] = sentiment
 
 
200
 
201
+ # Third: If negative, estimate impact
202
+ if sentiment == "Negative":
203
+ impact, reasoning = estimate_impact(llm, translated_text, row['Объект'])
204
+ df.at[index, 'Impact'] = impact
205
+ df.at[index, 'Reasoning'] = reasoning
206
+
207
+ # Update progress
208
  progress = (index + 1) / len(df)
209
  progress_bar.progress(progress)
210
  status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
211
 
212
+ # Display results
213
  st.write(f"Объект: {row['Объект']}")
214
  st.write(f"Новость: {row['Заголовок']}")
215
  st.write(f"Тональность: {sentiment}")
216
+ if sentiment == "Negative":
217
+ st.write(f"Эффект: {impact}")
218
+ st.write(f"Обоснование: {reasoning}")
219
  st.write("---")
220
 
221
  progress_bar.empty()
222
  status_text.empty()
223
 
224
+ # Generate visualization
225
  visualization = generate_sentiment_visualization(df)
226
  if visualization:
227
  st.pyplot(visualization)
 
252
  def create_output_file(df, uploaded_file):
253
  wb = load_workbook("sample_file.xlsx")
254
 
255
+ # Update 'Сводка' sheet
256
  summary_df = pd.DataFrame({
257
  'Объект': df['Объект'].unique(),
258
  'Всего новостей': df.groupby('Объект').size(),
 
265
 
266
  summary_df = summary_df.sort_values('Негативные', ascending=False)
267
 
268
+ # Write sheets...
269
+ # (keep existing code for writing sheets)
 
 
270
 
271
+ # Update 'Тех.приложение' sheet to include translated text
272
+ tech_df = df[['Объект', 'Заголовок', 'Выдержки из текста', 'Translated', 'Sentiment', 'Impact', 'Reasoning']]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
  if 'Тех.приложение' not in wb.sheetnames:
275
  wb.create_sheet('Тех.приложение')
276
  ws = wb['Тех.приложение']
277
+ for r_idx, row in enumerate(dataframe_to_rows(tech_df, index=False, header=True), start=1):
278
  for c_idx, value in enumerate(row, start=1):
279
  ws.cell(row=r_idx, column=c_idx, value=value)
280
 
 
302
  unsafe_allow_html=True
303
  )
304
 
305
+ st.title("::: анализ мониторинга новостей СКАН-ИНТЕРФАКС (2):::")
306
 
307
  if 'processed_df' not in st.session_state:
308
  st.session_state.processed_df = None
requirements.txt CHANGED
@@ -15,4 +15,4 @@ langchain-community
15
  huggingface_hub
16
  accelerate>=0.26.0
17
  openai
18
- wordcloud
 
15
  huggingface_hub
16
  accelerate>=0.26.0
17
  openai
18
+ wordcloud