Spaces:

pentarosarium
/

processor

Running

App Files Files Community

pentarosarium commited on Nov 12, 2024

Commit

f602aaf

1 Parent(s): 0a20075

3.60

Browse files

Files changed (1) hide show

app.py +232 -134

app.py CHANGED Viewed

@@ -71,41 +71,35 @@ class FallbackLLMSystem:
             # Initialize MT5 model (multilingual T5)
             self.model_name = "google/mt5-small"
             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
             # Set device
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
             self.model = self.model.to(self.device)
-            st.success(f"Запустил MT5-модель на {self.device}")
         except Exception as e:
             st.error(f"Error initializing MT5: {str(e)}")
             raise
-    def detect_events(self, text, entity):
-        """Detect events using MT5"""
-        # Initialize default return values
-        event_type = "Нет"
-        summary = ""
         try:
-            prompt = f"""<s>Analyze news about company {entity}:
-            {text}
-            Classify event type as one of:
-            - Отчетность (financial reports)
-            - РЦБ (securities market events)
-            - Суд (legal actions)
-            - Нет (no significant events)
-            Format response as:
-            Тип: [type]
-            Краткое описание: [summary]</s>"""
             inputs = self.tokenizer(
-                prompt,
                 return_tensors="pt",
                 padding=True,
                 truncation=True,
@@ -122,25 +116,174 @@ class FallbackLLMSystem:
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            # Parse response
             if "Тип:" in response and "Краткое описание:" in response:
-                parts = response.split("Краткое описание:")
-                type_part = parts[0]
-                if "Тип:" in type_part:
-                    event_type = type_part.split("Тип:")[1].strip()
-                    # Validate event type
                     valid_types = ["Отчетность", "РЦБ", "Суд", "Нет"]
-                    if event_type not in valid_types:
-                        event_type = "Нет"
-                if len(parts) > 1:
-                    summary = parts[1].strip()
             return event_type, summary
         except Exception as e:
             st.warning(f"Event detection error: {str(e)}")
-            return "Нет", "Ошибка анализа"
 def ensure_groq_llm():
     """Initialize Groq LLM for impact estimation"""
@@ -351,7 +494,7 @@ class EventDetectionSystem:
                 model="yiyanghkust/finbert-tone",
                 return_all_scores=True
             )
-            st.success("BERT-модели запущены для детекции новостей")
         except Exception as e:
             st.error(f"Ошибка запуска BERT: {str(e)}")
             raise
@@ -414,7 +557,7 @@ class TranslationSystem:
             # Initialize fallback translator
             self.fallback_translator = GoogleTranslator(source='ru', target='en')
             self.legacy_translator = LegacyTranslator()
-            st.success("Запустил систему перевода")
         except Exception as e:
             st.error(f"Ошибка запуска перевода: {str(e)}")
             raise
@@ -641,24 +784,7 @@ def process_file(uploaded_file, model_choice, translation_method=None):
         st.error(f"Ошибка в обработке файла: {str(e)}")
         return None
-def translate_reasoning_to_russian(llm, text):
-    template = """
-    Translate this English explanation to Russian, maintaining a formal business style:
-    "{text}"
-    Your response should contain only the Russian translation.
-    """
-    prompt = PromptTemplate(template=template, input_variables=["text"])
-    chain = prompt | llm | RunnablePassthrough()
-    response = chain.invoke({"text": text})
-    # Handle different response types
-    if hasattr(response, 'content'):
-        return response.content.strip()
-    elif isinstance(response, str):
-        return response.strip()
-    else:
-        return str(response).strip()
 def create_download_section(excel_data, pdf_data):
@@ -905,104 +1031,76 @@ def create_analysis_data(df):
         'Текст сообщения'
     ])
-def create_output_file(df, uploaded_file, llm):
-    wb = load_workbook("sample_file.xlsx")
-    try:
-        # Update 'Мониторинг' sheet with events
-        ws = wb['Мониторинг']
-        row_idx = 4
-        for _, row in df.iterrows():
-            if row['Event_Type'] != 'Нет':
-                ws.cell(row=row_idx, column=5, value=row['Объект'])  # Column E
-                ws.cell(row=row_idx, column=6, value=row['Заголовок'])  # Column F
-                ws.cell(row=row_idx, column=7, value=row['Event_Type'])  # Column G
-                ws.cell(row=row_idx, column=8, value=row['Event_Summary'])  # Column H
-                ws.cell(row=row_idx, column=9, value=row['Выдержки из текста'])  # Column I
-                row_idx += 1
-        # Sort entities by number of negative publications
-        entity_stats = pd.DataFrame({
-            'Объект': df['Объект'].unique(),
-            'Всего': df.groupby('Объект').size(),
-            'Негативные': df[df['Sentiment'] == 'Negative'].groupby('Объект').size().fillna(0).astype(int),
-            'Позитивные': df[df['Sentiment'] == 'Positive'].groupby('Объект').size().fillna(0).astype(int)
-        }).sort_values('Негативные', ascending=False)
-        # Calculate most negative impact for each entity
-        entity_impacts = {}
-        for entity in df['Объект'].unique():
-            entity_df = df[df['Объект'] == entity]
-            negative_impacts = entity_df[entity_df['Sentiment'] == 'Negative']['Impact']
-            entity_impacts[entity] = negative_impacts.iloc[0] if len(negative_impacts) > 0 else 'Неопределенный эффект'
-        # Update 'Сводка' sheet
-        ws = wb['Сводка']
-        for idx, (entity, row) in enumerate(entity_stats.iterrows(), start=4):
-            ws.cell(row=idx, column=5, value=entity)  # Column E
-            ws.cell(row=idx, column=6, value=row['Всего'])  # Column F
-            ws.cell(row=idx, column=7, value=row['Негативные'])  # Column G
-            ws.cell(row=idx, column=8, value=row['Позитивные'])  # Column H
-            ws.cell(row=idx, column=9, value=entity_impacts[entity])  # Column I
-        # Update 'Значимые' sheet
-        ws = wb['Значимые']
-        row_idx = 3
-        for _, row in df.iterrows():
-            if row['Sentiment'] in ['Negative', 'Positive']:
-                ws.cell(row=row_idx, column=3, value=row['Объект'])  # Column C
-                ws.cell(row=row_idx, column=4, value='релевантно')   # Column D
-                ws.cell(row=row_idx, column=5, value=row['Sentiment']) # Column E
-                ws.cell(row=row_idx, column=6, value=row['Impact'])   # Column F
-                ws.cell(row=row_idx, column=7, value=row['Заголовок']) # Column G
-                ws.cell(row=row_idx, column=8, value=row['Выдержки из текста']) # Column H
-                row_idx += 1
-        # Copy 'Публикации' sheet
-        original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
-        ws = wb['Публикации']
-        for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
-            for c_idx, value in enumerate(row, start=1):
-                ws.cell(row=r_idx, column=c_idx, value=value)
-        # Update 'Анализ' sheet
         ws = wb['Анализ']
         row_idx = 4
         for _, row in df[df['Sentiment'] == 'Negative'].iterrows():
-            ws.cell(row=row_idx, column=5, value=row['Объект'])  # Column E
-            ws.cell(row=row_idx, column=6, value=row['Заголовок'])  # Column F
-            ws.cell(row=row_idx, column=7, value="Риск убытка")  # Column G
-            # Translate reasoning if it exists
             if pd.notna(row['Reasoning']):
-                translated_reasoning = translate_reasoning_to_russian(llm, row['Reasoning'])
-                ws.cell(row=row_idx, column=8, value=translated_reasoning)  # Column H
-            ws.cell(row=row_idx, column=9, value=row['Выдержки из текста'])  # Column I
             row_idx += 1
-        # Update 'Тех.приложение' sheet
-        tech_df = df[['Объект', 'Заголовок', 'Выдержки из текста', 'Translated', 'Sentiment', 'Impact', 'Reasoning']]
-        if 'Тех.приложение' not in wb.sheetnames:
-            wb.create_sheet('Тех.приложение')
-        ws = wb['Тех.приложение']
-        for r_idx, row in enumerate(dataframe_to_rows(tech_df, index=False, header=True), start=1):
-            for c_idx, value in enumerate(row, start=1):
-                ws.cell(row=r_idx, column=c_idx, value=value)
     except Exception as e:
         st.warning(f"Ошибка при создании выходного файла: {str(e)}")
-    output = io.BytesIO()
-    wb.save(output)
-    output.seek(0)
-    return output
 def main():
     st.set_page_config(layout="wide")
     with st.sidebar:
-        st.title("::: AI-анализ мониторинга новостей (v.3.59*):::")
         st.subheader("по материалам СКАН-ИНТЕРФАКС")
         model_choice = st.radio(

             # Initialize MT5 model (multilingual T5)
             self.model_name = "google/mt5-small"
             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            self.model = AutoModelForSeq2SeqM.from_pretrained(self.model_name)
             # Set device
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
             self.model = self.model.to(self.device)
+            st.success(f"Successfully initialized MT5 model on {self.device}")
         except Exception as e:
             st.error(f"Error initializing MT5: {str(e)}")
             raise
+    def invoke(self, prompt_args):
+        """Make the class compatible with LangChain by implementing invoke"""
         try:
+            if isinstance(prompt_args, dict):
+                # Extract the prompt template result
+                template_result = prompt_args.get('template_result', '')
+                if not template_result:
+                    # Try to construct from entity and news if available
+                    entity = prompt_args.get('entity', '')
+                    news = prompt_args.get('news', '')
+                    template_result = f"Analyze news about {entity}: {news}"
+            else:
+                template_result = str(prompt_args)
+            # Process with MT5
             inputs = self.tokenizer(
+                template_result,
                 return_tensors="pt",
                 padding=True,
                 truncation=True,
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # Return in a format compatible with LangChain
+            return type('Response', (), {'content': response})()
+        except Exception as e:
+            st.warning(f"MT5 generation error: {str(e)}")
+            # Return a default response on error
+            return type('Response', (), {
+                'content': 'Impact: Неопределенный эффект\nReasoning: Ошибка анализа'
+            })()
+    def __or__(self, other):
+        """Implement the | operator for chain compatibility"""
+        if callable(other):
+            return lambda x: other(self(x))
+        return NotImplemented
+    def __rrshift__(self, other):
+        """Implement the >> operator for chain compatibility"""
+        return self.__or__(other)
+    def __call__(self, prompt_args):
+        """Make the class callable for chain compatibility"""
+        return self.invoke(prompt_args)
+    def detect_events(self, text: str, entity: str) -> tuple[str, str]:
+        """
+        Detect events using MT5 with improved error handling and response parsing
+        Args:
+            text (str): The news text to analyze
+            entity (str): The company/entity name
+        Returns:
+            tuple[str, str]: (event_type, summary)
+        """
+        # Initialize default return values
+        event_type = "Нет"
+        summary = ""
+        # Input validation
+        if not text or not entity or not isinstance(text, str) or not isinstance(entity, str):
+            return event_type, "Invalid input"
+        try:
+            # Clean and prepare input text
+            text = text.strip()
+            entity = entity.strip()
+            # Construct prompt with better formatting
+            prompt = f"""<s>Analyze the following news about {entity}:
+    Text: {text}
+    Task: Identify the main event type and provide a brief summary.
+    Event types:
+    1. Отчетность - Events related to financial reports, earnings, revenue, EBITDA
+    2. РЦБ - Events related to securities, bonds, stock market, defaults, restructuring
+    3. Суд - Events related to legal proceedings, lawsuits, arbitration
+    4. Нет - No significant events detected
+    Required output format:
+    Тип: [event type]
+    Краткое описание: [1-2 sentence summary]</s>"""
+            # Process with MT5
+            try:
+                inputs = self.tokenizer(
+                    prompt,
+                    return_tensors="pt",
+                    padding=True,
+                    truncation=True,
+                    max_length=512
+                ).to(self.device)
+                outputs = self.model.generate(
+                    **inputs,
+                    max_length=300,  # Increased for better summaries
+                    num_return_sequences=1,
+                    do_sample=False,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    no_repeat_ngram_size=3  # Prevent repetition
+                )
+                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            except torch.cuda.OutOfMemoryError:
+                st.warning("GPU memory exceeded, falling back to CPU")
+                self.model = self.model.to('cpu')
+                inputs = inputs.to('cpu')
+                outputs = self.model.generate(
+                    **inputs,
+                    max_length=300,
+                    num_return_sequences=1,
+                    do_sample=False,
+                    pad_token_id=self.tokenizer.pad_token_id
+                )
+                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+                self.model = self.model.to(self.device)  # Move back to GPU
+            # Enhanced response parsing
             if "Тип:" in response and "Краткое описание:" in response:
+                try:
+                    # Split and clean parts
+                    parts = response.split("Краткое описание:")
+                    type_part = parts[0].split("Тип:")[1].strip()
+                    # Validate event type with fuzzy matching
                     valid_types = ["Отчетность", "РЦБ", "Суд", "Нет"]
+                    # Check for exact matches first
+                    if type_part in valid_types:
+                        event_type = type_part
+                    else:
+                        # Check keywords for each type
+                        keywords = {
+                            "Отчетность": ["отчет", "выручка", "прибыль", "ebitda", "финанс"],
+                            "РЦБ": ["облигаци", "купон", "дефолт", "реструктуризац", "ценные бумаги"],
+                            "Суд": ["суд", "иск", "арбитраж", "разбирательств"]
+                        }
+                        # Look for keywords in both type and summary
+                        full_text = response.lower()
+                        for event_category, category_keywords in keywords.items():
+                            if any(keyword in full_text for keyword in category_keywords):
+                                event_type = event_category
+                                break
+                    # Extract and clean summary
+                    if len(parts) > 1:
+                        summary = parts[1].strip()
+                        # Ensure summary isn't too long
+                        if len(summary) > 200:
+                            summary = summary[:197] + "..."
+                        # Add entity reference if missing
+                        if entity.lower() not in summary.lower():
+                            summary = f"Компания {entity}: {summary}"
+                except IndexError:
+                    st.warning("Error parsing model response format")
+                    return "Нет", "Error parsing response"
+            # Additional validation
+            if not summary or len(summary) < 5:
+                keywords = {
+                    "Отчетность": "Обнаружена информация о финансовой отчетности",
+                    "РЦБ": "Обнаружена информация о ценных бумагах",
+                    "Суд": "Обнаружена информация о судебном разбирательстве",
+                    "Нет": "Значимых событий не обнаружено"
+                }
+                summary = f"{keywords.get(event_type, 'Требуется дополнительный анализ')} ({entity})"
             return event_type, summary
         except Exception as e:
             st.warning(f"Event detection error: {str(e)}")
+            # Try to provide more specific error information
+            if "CUDA" in str(e):
+                return "Нет", "GPU error - falling back to CPU needed"
+            elif "tokenizer" in str(e):
+                return "Нет", "Text processing error"
+            elif "model" in str(e):
+                return "Нет", "Model inference error"
+            else:
+                return "Нет", "Ошибка анализа"
 def ensure_groq_llm():
     """Initialize Groq LLM for impact estimation"""
                 model="yiyanghkust/finbert-tone",
                 return_all_scores=True
             )
+            st.success("служебное сообщение: BERT-модели запущены для детекции новостей")
         except Exception as e:
             st.error(f"Ошибка запуска BERT: {str(e)}")
             raise
             # Initialize fallback translator
             self.fallback_translator = GoogleTranslator(source='ru', target='en')
             self.legacy_translator = LegacyTranslator()
+            st.success("служебное сообщение: запустил систему перевода")
         except Exception as e:
             st.error(f"Ошибка запуска перевода: {str(e)}")
             raise
         st.error(f"Ошибка в обработке файла: {str(e)}")
         return None
 def create_download_section(excel_data, pdf_data):
         'Текст сообщения'
     ])
+def translate_reasoning_to_russian(llm, text):
+    """Modified to handle both standard LLMs and FallbackLLMSystem"""
+    if isinstance(llm, FallbackLLMSystem):
+        # Direct translation using MT5
+        response = llm.invoke({
+            'template_result': f"Translate to Russian: {text}"
+        })
+        return response.content.strip()
+    else:
+        # Original LangChain approach
+        template = """
+        Translate this English explanation to Russian, maintaining a formal business style:
+        "{text}"
+        Your response should contain only the Russian translation.
+        """
+        prompt = PromptTemplate(template=template, input_variables=["text"])
+        chain = prompt | llm
+        response = chain.invoke({"text": text})
+        # Handle different response types
+        if hasattr(response, 'content'):
+            return response.content.strip()
+        elif isinstance(response, str):
+            return response.strip()
+        else:
+            return str(response).strip()
+def create_output_file(df, uploaded_file, llm):
+    try:
+        wb = load_workbook("sample_file.xlsx")
+        # Rest of the code remains the same until the 'Анализ' sheet processing
+        # Update 'Анализ' sheet with modified translation handling
         ws = wb['Анализ']
         row_idx = 4
         for _, row in df[df['Sentiment'] == 'Negative'].iterrows():
+            ws.cell(row=row_idx, column=5, value=row['Объект'])
+            ws.cell(row=row_idx, column=6, value=row['Заголовок'])
+            ws.cell(row=row_idx, column=7, value="Риск убытка")
+            # Enhanced translation handling
             if pd.notna(row['Reasoning']):
+                try:
+                    translated_reasoning = translate_reasoning_to_russian(llm, row['Reasoning'])
+                    ws.cell(row=row_idx, column=8, value=translated_reasoning)
+                except Exception as e:
+                    st.warning(f"Translation error for row {row_idx}: {str(e)}")
+                    ws.cell(row=row_idx, column=8, value=row['Reasoning'])  # Use original text as fallback
+            ws.cell(row=row_idx, column=9, value=row['Выдержки из текста'])
             row_idx += 1
+        # Continue with the rest of the function...
+        output = io.BytesIO()
+        wb.save(output)
+        output.seek(0)
+        return output
     except Exception as e:
         st.warning(f"Ошибка при создании выходного файла: {str(e)}")
+        return None
 def main():
     st.set_page_config(layout="wide")
     with st.sidebar:
+        st.title("::: AI-анализ мониторинга новостей (v.3.60):::")
         st.subheader("по материалам СКАН-ИНТЕРФАКС")
         model_choice = st.radio(