pentarosarium commited on
Commit
f602aaf
·
1 Parent(s): 0a20075
Files changed (1) hide show
  1. app.py +232 -134
app.py CHANGED
@@ -71,41 +71,35 @@ class FallbackLLMSystem:
71
  # Initialize MT5 model (multilingual T5)
72
  self.model_name = "google/mt5-small"
73
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
74
- self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
75
 
76
  # Set device
77
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
78
  self.model = self.model.to(self.device)
79
 
80
- st.success(f"Запустил MT5-модель на {self.device}")
81
 
82
  except Exception as e:
83
  st.error(f"Error initializing MT5: {str(e)}")
84
  raise
85
 
86
- def detect_events(self, text, entity):
87
- """Detect events using MT5"""
88
- # Initialize default return values
89
- event_type = "Нет"
90
- summary = ""
91
-
92
  try:
93
- prompt = f"""<s>Analyze news about company {entity}:
94
-
95
- {text}
96
-
97
- Classify event type as one of:
98
- - Отчетность (financial reports)
99
- - РЦБ (securities market events)
100
- - Суд (legal actions)
101
- - Нет (no significant events)
 
102
 
103
- Format response as:
104
- Тип: [type]
105
- Краткое описание: [summary]</s>"""
106
-
107
  inputs = self.tokenizer(
108
- prompt,
109
  return_tensors="pt",
110
  padding=True,
111
  truncation=True,
@@ -122,25 +116,174 @@ class FallbackLLMSystem:
122
 
123
  response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
124
 
125
- # Parse response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  if "Тип:" in response and "Краткое описание:" in response:
127
- parts = response.split("Краткое описание:")
128
- type_part = parts[0]
129
- if "Тип:" in type_part:
130
- event_type = type_part.split("Тип:")[1].strip()
131
- # Validate event type
 
132
  valid_types = ["Отчетность", "РЦБ", "Суд", "Нет"]
133
- if event_type not in valid_types:
134
- event_type = "Нет"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
- if len(parts) > 1:
137
- summary = parts[1].strip()
138
-
139
  return event_type, summary
140
-
141
  except Exception as e:
142
  st.warning(f"Event detection error: {str(e)}")
143
- return "Нет", "Ошибка анализа"
 
 
 
 
 
 
 
 
 
144
 
145
  def ensure_groq_llm():
146
  """Initialize Groq LLM for impact estimation"""
@@ -351,7 +494,7 @@ class EventDetectionSystem:
351
  model="yiyanghkust/finbert-tone",
352
  return_all_scores=True
353
  )
354
- st.success("BERT-модели запущены для детекции новостей")
355
  except Exception as e:
356
  st.error(f"Ошибка запуска BERT: {str(e)}")
357
  raise
@@ -414,7 +557,7 @@ class TranslationSystem:
414
  # Initialize fallback translator
415
  self.fallback_translator = GoogleTranslator(source='ru', target='en')
416
  self.legacy_translator = LegacyTranslator()
417
- st.success("Запустил систему перевода")
418
  except Exception as e:
419
  st.error(f"Ошибка запуска перевода: {str(e)}")
420
  raise
@@ -641,24 +784,7 @@ def process_file(uploaded_file, model_choice, translation_method=None):
641
  st.error(f"Ошибка в обработке файла: {str(e)}")
642
  return None
643
 
644
- def translate_reasoning_to_russian(llm, text):
645
- template = """
646
- Translate this English explanation to Russian, maintaining a formal business style:
647
- "{text}"
648
-
649
- Your response should contain only the Russian translation.
650
- """
651
- prompt = PromptTemplate(template=template, input_variables=["text"])
652
- chain = prompt | llm | RunnablePassthrough()
653
- response = chain.invoke({"text": text})
654
-
655
- # Handle different response types
656
- if hasattr(response, 'content'):
657
- return response.content.strip()
658
- elif isinstance(response, str):
659
- return response.strip()
660
- else:
661
- return str(response).strip()
662
 
663
 
664
  def create_download_section(excel_data, pdf_data):
@@ -905,104 +1031,76 @@ def create_analysis_data(df):
905
  'Текст сообщения'
906
  ])
907
 
908
- def create_output_file(df, uploaded_file, llm):
909
- wb = load_workbook("sample_file.xlsx")
910
-
911
- try:
912
- # Update 'Мониторинг' sheet with events
913
- ws = wb['Мониторинг']
914
- row_idx = 4
915
- for _, row in df.iterrows():
916
- if row['Event_Type'] != 'Нет':
917
- ws.cell(row=row_idx, column=5, value=row['Объект']) # Column E
918
- ws.cell(row=row_idx, column=6, value=row['Заголовок']) # Column F
919
- ws.cell(row=row_idx, column=7, value=row['Event_Type']) # Column G
920
- ws.cell(row=row_idx, column=8, value=row['Event_Summary']) # Column H
921
- ws.cell(row=row_idx, column=9, value=row['Выдержки из текста']) # Column I
922
- row_idx += 1
923
-
924
- # Sort entities by number of negative publications
925
- entity_stats = pd.DataFrame({
926
- 'Объект': df['Объект'].unique(),
927
- 'Всего': df.groupby('Объект').size(),
928
- 'Негативные': df[df['Sentiment'] == 'Negative'].groupby('Объект').size().fillna(0).astype(int),
929
- 'Позитивные': df[df['Sentiment'] == 'Positive'].groupby('Объект').size().fillna(0).astype(int)
930
- }).sort_values('Негативные', ascending=False)
931
-
932
- # Calculate most negative impact for each entity
933
- entity_impacts = {}
934
- for entity in df['Объект'].unique():
935
- entity_df = df[df['Объект'] == entity]
936
- negative_impacts = entity_df[entity_df['Sentiment'] == 'Negative']['Impact']
937
- entity_impacts[entity] = negative_impacts.iloc[0] if len(negative_impacts) > 0 else 'Неопределенный эффект'
938
 
939
- # Update 'Сводка' sheet
940
- ws = wb['Сводка']
941
- for idx, (entity, row) in enumerate(entity_stats.iterrows(), start=4):
942
- ws.cell(row=idx, column=5, value=entity) # Column E
943
- ws.cell(row=idx, column=6, value=row['Всего']) # Column F
944
- ws.cell(row=idx, column=7, value=row['Негативные']) # Column G
945
- ws.cell(row=idx, column=8, value=row['Позитивные']) # Column H
946
- ws.cell(row=idx, column=9, value=entity_impacts[entity]) # Column I
947
 
948
- # Update 'Значимые' sheet
949
- ws = wb['Значимые']
950
- row_idx = 3
951
- for _, row in df.iterrows():
952
- if row['Sentiment'] in ['Negative', 'Positive']:
953
- ws.cell(row=row_idx, column=3, value=row['Объект']) # Column C
954
- ws.cell(row=row_idx, column=4, value='релевантно') # Column D
955
- ws.cell(row=row_idx, column=5, value=row['Sentiment']) # Column E
956
- ws.cell(row=row_idx, column=6, value=row['Impact']) # Column F
957
- ws.cell(row=row_idx, column=7, value=row['Заголовок']) # Column G
958
- ws.cell(row=row_idx, column=8, value=row['Выдержки из текста']) # Column H
959
- row_idx += 1
960
 
961
- # Copy 'Публикации' sheet
962
- original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
963
- ws = wb['Публикации']
964
- for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
965
- for c_idx, value in enumerate(row, start=1):
966
- ws.cell(row=r_idx, column=c_idx, value=value)
967
 
968
- # Update 'Анализ' sheet
969
  ws = wb['Анализ']
970
  row_idx = 4
971
  for _, row in df[df['Sentiment'] == 'Negative'].iterrows():
972
- ws.cell(row=row_idx, column=5, value=row['Объект']) # Column E
973
- ws.cell(row=row_idx, column=6, value=row['Заголовок']) # Column F
974
- ws.cell(row=row_idx, column=7, value="Риск убытка") # Column G
975
 
976
- # Translate reasoning if it exists
977
  if pd.notna(row['Reasoning']):
978
- translated_reasoning = translate_reasoning_to_russian(llm, row['Reasoning'])
979
- ws.cell(row=row_idx, column=8, value=translated_reasoning) # Column H
 
 
 
 
980
 
981
- ws.cell(row=row_idx, column=9, value=row['Выдержки из текста']) # Column I
982
  row_idx += 1
983
 
984
- # Update 'Тех.приложение' sheet
985
- tech_df = df[['Объект', 'Заголовок', 'Выдержки из текста', 'Translated', 'Sentiment', 'Impact', 'Reasoning']]
986
- if 'Тех.приложение' not in wb.sheetnames:
987
- wb.create_sheet('Тех.приложение')
988
- ws = wb['Тех.приложение']
989
- for r_idx, row in enumerate(dataframe_to_rows(tech_df, index=False, header=True), start=1):
990
- for c_idx, value in enumerate(row, start=1):
991
- ws.cell(row=r_idx, column=c_idx, value=value)
992
-
993
  except Exception as e:
994
  st.warning(f"Ошибка при создании выходного файла: {str(e)}")
995
-
996
- output = io.BytesIO()
997
- wb.save(output)
998
- output.seek(0)
999
- return output
1000
 
1001
  def main():
1002
  st.set_page_config(layout="wide")
1003
 
1004
  with st.sidebar:
1005
- st.title("::: AI-анализ мониторинга новостей (v.3.59*):::")
1006
  st.subheader("по материалам СКАН-ИНТЕРФАКС")
1007
 
1008
  model_choice = st.radio(
 
71
  # Initialize MT5 model (multilingual T5)
72
  self.model_name = "google/mt5-small"
73
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
74
+ self.model = AutoModelForSeq2SeqM.from_pretrained(self.model_name)
75
 
76
  # Set device
77
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
78
  self.model = self.model.to(self.device)
79
 
80
+ st.success(f"Successfully initialized MT5 model on {self.device}")
81
 
82
  except Exception as e:
83
  st.error(f"Error initializing MT5: {str(e)}")
84
  raise
85
 
86
+ def invoke(self, prompt_args):
87
+ """Make the class compatible with LangChain by implementing invoke"""
 
 
 
 
88
  try:
89
+ if isinstance(prompt_args, dict):
90
+ # Extract the prompt template result
91
+ template_result = prompt_args.get('template_result', '')
92
+ if not template_result:
93
+ # Try to construct from entity and news if available
94
+ entity = prompt_args.get('entity', '')
95
+ news = prompt_args.get('news', '')
96
+ template_result = f"Analyze news about {entity}: {news}"
97
+ else:
98
+ template_result = str(prompt_args)
99
 
100
+ # Process with MT5
 
 
 
101
  inputs = self.tokenizer(
102
+ template_result,
103
  return_tensors="pt",
104
  padding=True,
105
  truncation=True,
 
116
 
117
  response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
118
 
119
+ # Return in a format compatible with LangChain
120
+ return type('Response', (), {'content': response})()
121
+
122
+ except Exception as e:
123
+ st.warning(f"MT5 generation error: {str(e)}")
124
+ # Return a default response on error
125
+ return type('Response', (), {
126
+ 'content': 'Impact: Неопределенный эффект\nReasoning: Ошибка анализа'
127
+ })()
128
+
129
+ def __or__(self, other):
130
+ """Implement the | operator for chain compatibility"""
131
+ if callable(other):
132
+ return lambda x: other(self(x))
133
+ return NotImplemented
134
+
135
+ def __rrshift__(self, other):
136
+ """Implement the >> operator for chain compatibility"""
137
+ return self.__or__(other)
138
+
139
+ def __call__(self, prompt_args):
140
+ """Make the class callable for chain compatibility"""
141
+ return self.invoke(prompt_args)
142
+
143
+ def detect_events(self, text: str, entity: str) -> tuple[str, str]:
144
+ """
145
+ Detect events using MT5 with improved error handling and response parsing
146
+
147
+ Args:
148
+ text (str): The news text to analyze
149
+ entity (str): The company/entity name
150
+
151
+ Returns:
152
+ tuple[str, str]: (event_type, summary)
153
+ """
154
+ # Initialize default return values
155
+ event_type = "Нет"
156
+ summary = ""
157
+
158
+ # Input validation
159
+ if not text or not entity or not isinstance(text, str) or not isinstance(entity, str):
160
+ return event_type, "Invalid input"
161
+
162
+ try:
163
+ # Clean and prepare input text
164
+ text = text.strip()
165
+ entity = entity.strip()
166
+
167
+ # Construct prompt with better formatting
168
+ prompt = f"""<s>Analyze the following news about {entity}:
169
+
170
+ Text: {text}
171
+
172
+ Task: Identify the main event type and provide a brief summary.
173
+
174
+ Event types:
175
+ 1. Отчетность - Events related to financial reports, earnings, revenue, EBITDA
176
+ 2. РЦБ - Events related to securities, bonds, stock market, defaults, restructuring
177
+ 3. Суд - Events related to legal proceedings, lawsuits, arbitration
178
+ 4. Нет - No significant events detected
179
+
180
+ Required output format:
181
+ Тип: [event type]
182
+ Краткое описание: [1-2 sentence summary]</s>"""
183
+
184
+ # Process with MT5
185
+ try:
186
+ inputs = self.tokenizer(
187
+ prompt,
188
+ return_tensors="pt",
189
+ padding=True,
190
+ truncation=True,
191
+ max_length=512
192
+ ).to(self.device)
193
+
194
+ outputs = self.model.generate(
195
+ **inputs,
196
+ max_length=300, # Increased for better summaries
197
+ num_return_sequences=1,
198
+ do_sample=False,
199
+ pad_token_id=self.tokenizer.pad_token_id,
200
+ eos_token_id=self.tokenizer.eos_token_id,
201
+ no_repeat_ngram_size=3 # Prevent repetition
202
+ )
203
+
204
+ response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
205
+
206
+ except torch.cuda.OutOfMemoryError:
207
+ st.warning("GPU memory exceeded, falling back to CPU")
208
+ self.model = self.model.to('cpu')
209
+ inputs = inputs.to('cpu')
210
+ outputs = self.model.generate(
211
+ **inputs,
212
+ max_length=300,
213
+ num_return_sequences=1,
214
+ do_sample=False,
215
+ pad_token_id=self.tokenizer.pad_token_id
216
+ )
217
+ response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
218
+ self.model = self.model.to(self.device) # Move back to GPU
219
+
220
+ # Enhanced response parsing
221
  if "Тип:" in response and "Краткое описание:" in response:
222
+ try:
223
+ # Split and clean parts
224
+ parts = response.split("Краткое описание:")
225
+ type_part = parts[0].split("Тип:")[1].strip()
226
+
227
+ # Validate event type with fuzzy matching
228
  valid_types = ["Отчетность", "РЦБ", "Суд", "Нет"]
229
+
230
+ # Check for exact matches first
231
+ if type_part in valid_types:
232
+ event_type = type_part
233
+ else:
234
+ # Check keywords for each type
235
+ keywords = {
236
+ "Отчетность": ["отчет", "выручка", "прибыль", "ebitda", "финанс"],
237
+ "РЦБ": ["облигаци", "купон", "дефолт", "реструктуризац", "ценные бумаги"],
238
+ "Суд": ["суд", "иск", "арбитраж", "разбирательств"]
239
+ }
240
+
241
+ # Look for keywords in both type and summary
242
+ full_text = response.lower()
243
+ for event_category, category_keywords in keywords.items():
244
+ if any(keyword in full_text for keyword in category_keywords):
245
+ event_type = event_category
246
+ break
247
+
248
+ # Extract and clean summary
249
+ if len(parts) > 1:
250
+ summary = parts[1].strip()
251
+ # Ensure summary isn't too long
252
+ if len(summary) > 200:
253
+ summary = summary[:197] + "..."
254
+
255
+ # Add entity reference if missing
256
+ if entity.lower() not in summary.lower():
257
+ summary = f"Компания {entity}: {summary}"
258
+
259
+ except IndexError:
260
+ st.warning("Error parsing model response format")
261
+ return "Нет", "Error parsing response"
262
+
263
+ # Additional validation
264
+ if not summary or len(summary) < 5:
265
+ keywords = {
266
+ "Отчетность": "Обнаружена информация о финансовой отчетности",
267
+ "РЦБ": "Обнаружена информация о ценных бумагах",
268
+ "Суд": "Обнаружена информация о судебном разбирательстве",
269
+ "Нет": "Значимых событий не обнаружено"
270
+ }
271
+ summary = f"{keywords.get(event_type, 'Требуется дополнительный анализ')} ({entity})"
272
 
 
 
 
273
  return event_type, summary
274
+
275
  except Exception as e:
276
  st.warning(f"Event detection error: {str(e)}")
277
+ # Try to provide more specific error information
278
+ if "CUDA" in str(e):
279
+ return "Нет", "GPU error - falling back to CPU needed"
280
+ elif "tokenizer" in str(e):
281
+ return "Нет", "Text processing error"
282
+ elif "model" in str(e):
283
+ return "Нет", "Model inference error"
284
+ else:
285
+ return "Нет", "Ошибка анализа"
286
+
287
 
288
  def ensure_groq_llm():
289
  """Initialize Groq LLM for impact estimation"""
 
494
  model="yiyanghkust/finbert-tone",
495
  return_all_scores=True
496
  )
497
+ st.success("служебное сообщение: BERT-модели запущены для детекции новостей")
498
  except Exception as e:
499
  st.error(f"Ошибка запуска BERT: {str(e)}")
500
  raise
 
557
  # Initialize fallback translator
558
  self.fallback_translator = GoogleTranslator(source='ru', target='en')
559
  self.legacy_translator = LegacyTranslator()
560
+ st.success("служебное сообщение: запустил систему перевода")
561
  except Exception as e:
562
  st.error(f"Ошибка запуска перевода: {str(e)}")
563
  raise
 
784
  st.error(f"Ошибка в обработке файла: {str(e)}")
785
  return None
786
 
787
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
788
 
789
 
790
  def create_download_section(excel_data, pdf_data):
 
1031
  'Текст сообщения'
1032
  ])
1033
 
1034
+ def translate_reasoning_to_russian(llm, text):
1035
+ """Modified to handle both standard LLMs and FallbackLLMSystem"""
1036
+ if isinstance(llm, FallbackLLMSystem):
1037
+ # Direct translation using MT5
1038
+ response = llm.invoke({
1039
+ 'template_result': f"Translate to Russian: {text}"
1040
+ })
1041
+ return response.content.strip()
1042
+ else:
1043
+ # Original LangChain approach
1044
+ template = """
1045
+ Translate this English explanation to Russian, maintaining a formal business style:
1046
+ "{text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1047
 
1048
+ Your response should contain only the Russian translation.
1049
+ """
1050
+ prompt = PromptTemplate(template=template, input_variables=["text"])
1051
+ chain = prompt | llm
1052
+ response = chain.invoke({"text": text})
 
 
 
1053
 
1054
+ # Handle different response types
1055
+ if hasattr(response, 'content'):
1056
+ return response.content.strip()
1057
+ elif isinstance(response, str):
1058
+ return response.strip()
1059
+ else:
1060
+ return str(response).strip()
1061
+
1062
+ def create_output_file(df, uploaded_file, llm):
1063
+ try:
1064
+ wb = load_workbook("sample_file.xlsx")
 
1065
 
1066
+ # Rest of the code remains the same until the 'Анализ' sheet processing
 
 
 
 
 
1067
 
1068
+ # Update 'Анализ' sheet with modified translation handling
1069
  ws = wb['Анализ']
1070
  row_idx = 4
1071
  for _, row in df[df['Sentiment'] == 'Negative'].iterrows():
1072
+ ws.cell(row=row_idx, column=5, value=row['Объект'])
1073
+ ws.cell(row=row_idx, column=6, value=row['Заголовок'])
1074
+ ws.cell(row=row_idx, column=7, value="Риск убытка")
1075
 
1076
+ # Enhanced translation handling
1077
  if pd.notna(row['Reasoning']):
1078
+ try:
1079
+ translated_reasoning = translate_reasoning_to_russian(llm, row['Reasoning'])
1080
+ ws.cell(row=row_idx, column=8, value=translated_reasoning)
1081
+ except Exception as e:
1082
+ st.warning(f"Translation error for row {row_idx}: {str(e)}")
1083
+ ws.cell(row=row_idx, column=8, value=row['Reasoning']) # Use original text as fallback
1084
 
1085
+ ws.cell(row=row_idx, column=9, value=row['Выдержки из текста'])
1086
  row_idx += 1
1087
 
1088
+ # Continue with the rest of the function...
1089
+
1090
+ output = io.BytesIO()
1091
+ wb.save(output)
1092
+ output.seek(0)
1093
+ return output
1094
+
 
 
1095
  except Exception as e:
1096
  st.warning(f"Ошибка при создании выходного файла: {str(e)}")
1097
+ return None
 
 
 
 
1098
 
1099
  def main():
1100
  st.set_page_config(layout="wide")
1101
 
1102
  with st.sidebar:
1103
+ st.title("::: AI-анализ мониторинга новостей (v.3.60):::")
1104
  st.subheader("по материалам СКАН-ИНТЕРФАКС")
1105
 
1106
  model_choice = st.radio(