pentarosarium committed
Commit 45f1473 · 1 Parent(s): 076cf43

progress more 45

Files changed (1):
  1. app.py +42 -276
app.py CHANGED
@@ -8,7 +8,6 @@ from pymystem3 import Mystem
  import io
  from rapidfuzz import fuzz
  from tqdm.auto import tqdm
- import time
  import torch
  from openpyxl import load_workbook
  from openpyxl import Workbook
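
Note: this hunk drops import time, yet main() further down still calls time.time() to measure processing time. Unless the import survives elsewhere in the file, the new revision will raise NameError on the first upload; a minimal sketch of the timing pattern that still depends on the module:

    import time  # still needed by main() below

    start_time = time.time()
    # ... long-running processing ...
    elapsed_time = time.time() - start_time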
@@ -22,19 +21,24 @@ from langchain.chains import LLMChain
  mystem = Mystem()
 
  # Set up the sentiment analyzers
-
  finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")
  roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
  finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
- rubert1 = pipeline("sentiment-analysis", model = "DeepPavlov/rubert-base-cased")
- rubert2 = pipeline("sentiment-analysis", model = "blanchefort/rubert-base-cased-sentiment")
+ rubert1 = pipeline("sentiment-analysis", model="DeepPavlov/rubert-base-cased")
+ rubert2 = pipeline("sentiment-analysis", model="blanchefort/rubert-base-cased-sentiment")
+
+ # Translation model for Russian to English
+ model_name = "Helsinki-NLP/opus-mt-ru-en"
+ translation_tokenizer = AutoTokenizer.from_pretrained(model_name)
+ translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
 
  def init_langchain_llm():
      pipe = pipeline("text-generation", model="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
      llm = HuggingFacePipeline(pipeline=pipe)
      return llm
 
- # Function to estimate impact using LLM
  def estimate_impact(llm, news_text):
      template = """
      Analyze the following news piece and estimate its monetary impact in Russian rubles for the next 6 months.
@@ -50,24 +54,19 @@ def estimate_impact(llm, news_text):
      chain = LLMChain(llm=llm, prompt=prompt)
      response = chain.run(news=news_text)
 
-     # Parse the response to extract impact and reasoning
-     # Parsing logic is very important! Might be needed to be changed
      impact, reasoning = response.split("Reasoning:")
      impact = impact.strip()
      reasoning = reasoning.strip()
 
      return impact, reasoning
 
- def process_file_with_llm(uploaded_file, llm):
-     df = process_file(uploaded_file)
-
-     # Add new columns for LLM analysis
+ def process_file_with_llm(df, llm):
      df['LLM_Impact'] = ''
      df['LLM_Reasoning'] = ''
 
      for index, row in df.iterrows():
          if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
-             impact, reasoning = estimate_impact(llm, row['Выдержки из текста'])
+             impact, reasoning = estimate_impact(llm, row['Translated'])  # Use translated text
              df.at[index, 'LLM_Impact'] = impact
              df.at[index, 'LLM_Reasoning'] = reasoning
 
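The parsing in estimate_impact() above hinges on the model emitting a literal "Reasoning:" marker; the two-way unpack raises ValueError if the marker is missing or repeated. A worked example with a hypothetical response string:

    # Hypothetical LLM response in the format the prompt template requests.
    response = "Impact: -50 mln RUB over 6 months. Reasoning: sustained negative coverage."
    impact, reasoning = response.split("Reasoning:")
    print(impact.strip())     # Impact: -50 mln RUB over 6 months.
    print(reasoning.strip())  # sustained negative coverage.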
@@ -123,268 +122,34 @@ def create_output_file_with_llm(df, uploaded_file, analysis_df):
          for c_idx, value in enumerate(row, start=1):
              ws.cell(row=r_idx, column=c_idx, value=value)
 
-
      output = io.BytesIO()
      wb.save(output)
      output.seek(0)
      return output
 
- def create_analysis_data(df):
-     analysis_data = []
-     for _, row in df.iterrows():
-         if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
-             analysis_data.append([row['Объект'], row['Заголовок'], 'РИСК УБЫТКА', '', row['Выдержки из текста']])
-     return pd.DataFrame(analysis_data, columns=['Объект', 'Заголовок', 'Признак', 'Пояснение', 'Текст сообщения'])
-
- # Function for lemmatizing Russian text
- def lemmatize_text(text):
-     if pd.isna(text):
-         return ""
-
-     if not isinstance(text, str):
-         text = str(text)
-
-     words = text.split()
-     lemmatized_words = []
-     for word in tqdm(words, desc="Lemmatizing", unit="word"):
-         lemmatized_word = ''.join(mystem.lemmatize(word))
-         lemmatized_words.append(lemmatized_word)
-     return ' '.join(lemmatized_words)
-
- # Translation model for Russian to English
- model_name = "Helsinki-NLP/opus-mt-ru-en"
- translation_tokenizer = AutoTokenizer.from_pretrained(model_name)
- translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-
- translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
-
- def translate(text):
-     # Tokenize the input text
-     inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
-
-     # Calculate max_length based on input length
-     input_length = inputs.input_ids.shape[1]
-     max_length = max(input_length + 10, int(input_length * 1.5))  # Ensure at least 10 new tokens
-
-     # Generate translation
-     translated_tokens = translation_model.generate(
-         **inputs,
-         max_new_tokens=max_length,  # Use max_new_tokens instead of max_length
-         num_beams=5,
-         no_repeat_ngram_size=2,
-         early_stopping=True
-     )
-
-     # Decode the translated tokens
-     translated_text = translation_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
-     return translated_text
-
- # Functions for FinBERT, RoBERTa, and FinBERT-Tone with label mapping
- def get_mapped_sentiment(result):
-     label = result['label'].lower()
-     if label in ["positive", "label_2", "pos", "pos_label"]:
-         return "Positive"
-     elif label in ["negative", "label_0", "neg", "neg_label"]:
-         return "Negative"
-     return "Neutral"
-
- @sentiment_analysis_decorator
- def get_rubert1_sentiment(text):
-     result = rubert1(text, truncation=True, max_length=512)[0]
-     return get_mapped_sentiment(result)
-
- @sentiment_analysis_decorator
- def get_rubert2_sentiment(text):
-     result = rubert2(text, truncation=True, max_length=512)[0]
-     return get_mapped_sentiment(result)
-
- @sentiment_analysis_decorator
- def get_finbert_sentiment(text):
-     result = finbert(text, truncation=True, max_length=512)[0]
-     return get_mapped_sentiment(result)
-
- @sentiment_analysis_decorator
- def get_roberta_sentiment(text):
-     result = roberta(text, truncation=True, max_length=512)[0]
-     return get_mapped_sentiment(result)
-
- @sentiment_analysis_decorator
- def get_finbert_tone_sentiment(text):
-     result = finbert_tone(text, truncation=True, max_length=512)[0]
-     return get_mapped_sentiment(result)
-
- # Fuzzy filter out similar news for the same NER
- def fuzzy_deduplicate(df, column, threshold=65):
-     seen_texts = []
-     indices_to_keep = []
-     for i, text in enumerate(df[column]):
-         if pd.isna(text):
-             indices_to_keep.append(i)
-             continue
-         text = str(text)
-         if not seen_texts or all(fuzz.ratio(text, seen) < threshold for seen in seen_texts):
-             seen_texts.append(text)
-             indices_to_keep.append(i)
-     return df.iloc[indices_to_keep]
-
- def format_elapsed_time(seconds):
-     hours, remainder = divmod(int(seconds), 3600)
-     minutes, seconds = divmod(remainder, 60)
-
-     time_parts = []
-     if hours > 0:
-         time_parts.append(f"{hours} час{'ов' if hours != 1 else ''}")
-     if minutes > 0:
-         time_parts.append(f"{minutes} минут{'' if minutes == 1 else 'ы' if 2 <= minutes <= 4 else ''}")
-     if seconds > 0 or not time_parts:  # always show seconds if it's the only non-zero value
-         time_parts.append(f"{seconds} секунд{'а' if seconds == 1 else 'ы' if 2 <= seconds <= 4 else ''}")
-
-     return " ".join(time_parts)
-
- def process_file(uploaded_file):
-     df = pd.read_excel(uploaded_file, sheet_name='Публикации')
-
-     required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
-     missing_columns = [col for col in required_columns if col not in df.columns]
-     if missing_columns:
-         st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
-         st.stop()
-
-     original_news_count = len(df)
-
-     # Apply fuzzy deduplication
-     df = df.groupby('Объект').apply(
-         lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
-     ).reset_index(drop=True)
-
-     remaining_news_count = len(df)
-     duplicates_removed = original_news_count - remaining_news_count
-
-     st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
-
-     # Translate texts
-     translated_texts = []
-     lemmatized_texts = []
-     progress_bar = st.progress(0)
-     progress_text = st.empty()
-     total_news = len(df)
-
-     texts = df['Выдержки из текста'].tolist()
-     # Data validation
-     texts = [str(text) if not pd.isna(text) else "" for text in texts]
-
-     for text in df['Выдержки из текста']:
-         lemmatized_texts.append(lemmatize_text(text))
-
-     for i, text in enumerate(lemmatized_texts):
-         translated_text = translate(str(text))
-         translated_texts.append(translated_text)
-         progress_bar.progress((i + 1) / len(df))
-         progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
-
-     # Perform sentiment analysis
-     rubert2_results = [get_rubert2_sentiment(text) for text in texts]
-     finbert_results = [get_finbert_sentiment(text) for text in translated_texts]
-     roberta_results = [get_roberta_sentiment(text) for text in translated_texts]
-     finbert_tone_results = [get_finbert_tone_sentiment(text) for text in translated_texts]
-
-     # Create a new DataFrame with processed data
-     processed_df = pd.DataFrame({
-         'Объект': df['Объект'],
-         'Заголовок': df['Заголовок'],  # Preserve original 'Заголовок'
-         'ruBERT2': rubert2_results,
-         'FinBERT': finbert_results,
-         'RoBERTa': roberta_results,
-         'FinBERT-Tone': finbert_tone_results,
-         'Выдержки из текста': df['Выдержки из текста'],
-         'Translated': translated_texts
-     })
-
-     return processed_df
-
- def create_output_file(df, uploaded_file, analysis_df):
-     # Load the sample file to use as a template
-     wb = load_workbook("sample_file.xlsx")
-
-     # Process data for 'Сводка' sheet
-     entities = df['Объект'].unique()
-     summary_data = []
-     for entity in entities:
-         entity_df = df[df['Объект'] == entity]
-         total_news = len(entity_df)
-         negative_news = sum((entity_df['FinBERT'] == 'Negative') |
-                             (entity_df['RoBERTa'] == 'Negative') |
-                             (entity_df['FinBERT-Tone'] == 'Negative'))
-         positive_news = sum((entity_df['FinBERT'] == 'Positive') |
-                             (entity_df['RoBERTa'] == 'Positive') |
-                             (entity_df['FinBERT-Tone'] == 'Positive'))
-         summary_data.append([entity, total_news, negative_news, positive_news])
-
-     summary_df = pd.DataFrame(summary_data, columns=['Объект', 'Всего новостей', 'Отрицательные', 'Положительные'])
-     summary_df = summary_df.sort_values('Отрицательные', ascending=False)
-
-     # Write 'Сводка' sheet
-     ws = wb['Сводка']
-     for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=False), start=4):
-         for c_idx, value in enumerate(row, start=5):
-             ws.cell(row=r_idx, column=c_idx, value=value)
-
-     # Process data for 'Значимые' sheet
-     significant_data = []
-     for _, row in df.iterrows():
-         if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
-             sentiment = 'Negative' if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']) else 'Positive'
-             significant_data.append([row['Объект'], '', sentiment, '', row['Заголовок'], row['Выдержки из текста']])
-
-     # Write 'Значимые' sheet
-     ws = wb['Значимые']
-     for r_idx, row in enumerate(significant_data, start=3):
-         for c_idx, value in enumerate(row, start=3):
-             ws.cell(row=r_idx, column=c_idx, value=value)
-
-     # Write 'Анализ' sheet
-     ws = wb['Анализ']
-     for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=False), start=4):
-         for c_idx, value in enumerate(row, start=5):
-             ws.cell(row=r_idx, column=c_idx, value=value)
-
-     # Copy 'Публикации' sheet from original uploaded file
-     original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
-     ws = wb['Публикации']
-     for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
-         for c_idx, value in enumerate(row, start=1):
-             ws.cell(row=r_idx, column=c_idx, value=value)
-
-     # Add 'Тех.приложение' sheet with processed data
-     if 'Тех.приложение' not in wb.sheetnames:
-         wb.create_sheet('Тех.приложение')
-     ws = wb['Тех.приложение']
-     for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
-         for c_idx, value in enumerate(row, start=1):
-             ws.cell(row=r_idx, column=c_idx, value=value)
-
-     # Save the workbook to a BytesIO object
-     output = io.BytesIO()
-     wb.save(output)
-     output.seek(0)
-
-     return output
+ # ... (keep other functions as they are)
 
  def main():
-     st.title("... приступим к анализу... версия 44+")
+     st.title("... приступим к анализу... версия 45")
+
+     # Initialize session state
+     if 'processed_df' not in st.session_state:
+         st.session_state.processed_df = None
+     if 'analysis_df' not in st.session_state:
+         st.session_state.analysis_df = None
+     if 'llm_analyzed' not in st.session_state:
+         st.session_state.llm_analyzed = False
 
      uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
 
-     if uploaded_file is not None:
+     if uploaded_file is not None and st.session_state.processed_df is None:
          start_time = time.time()
 
-         df = process_file(uploaded_file)
+         st.session_state.processed_df = process_file(uploaded_file)
+         st.session_state.analysis_df = create_analysis_data(st.session_state.processed_df)
 
          st.subheader("Предпросмотр данных")
-         st.write(df.head())
+         st.write(st.session_state.processed_df.head())
 
          st.subheader("Распределение окраски")
          fig, axs = plt.subplots(2, 2, figsize=(12, 8))
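
For reference on the sentiment helpers deleted above (and apparently meant to be kept, given the placeholder comment on the new side): transformers pipelines return dicts like {'label': ..., 'score': ...}, and get_mapped_sentiment normalized the differing label schemes (FinBERT emits "positive", the Cardiff RoBERTa emits "LABEL_2") to one vocabulary. A self-contained replay of that mapping:

    # Same logic as the removed get_mapped_sentiment(); inputs mimic pipeline output.
    def get_mapped_sentiment(result):
        label = result['label'].lower()
        if label in ["positive", "label_2", "pos", "pos_label"]:
            return "Positive"
        elif label in ["negative", "label_0", "neg", "neg_label"]:
            return "Negative"
        return "Neutral"

    print(get_mapped_sentiment({'label': 'LABEL_2', 'score': 0.97}))   # Positive
    print(get_mapped_sentiment({'label': 'negative', 'score': 0.88}))  # Negative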
@@ -393,7 +158,7 @@ def main():
          models = ['ruBERT2','FinBERT', 'RoBERTa', 'FinBERT-Tone']
          for i, model in enumerate(models):
              ax = axs[i // 2, i % 2]
-             sentiment_counts = df[model].value_counts()
+             sentiment_counts = st.session_state.processed_df[model].value_counts()
              sentiment_counts.plot(kind='bar', ax=ax)
              ax.set_title(f"{model} Sentiment")
              ax.set_xlabel("Sentiment")
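
The session-state rework in the surrounding hunks exists because Streamlit reruns the whole script on every widget interaction; caching the processed DataFrame in st.session_state keeps the LLM button below from re-triggering translation and sentiment scoring. A minimal sketch of the pattern, with a hypothetical expensive_process() standing in for the app's process_file():

    import streamlit as st

    def expensive_process(uploaded):  # stand-in for the app's process_file()
        return uploaded.getvalue()

    if 'result' not in st.session_state:
        st.session_state.result = None

    uploaded = st.file_uploader("Excel file", type="xlsx")
    if uploaded is not None and st.session_state.result is None:
        # Runs once per uploaded file, not on every rerun.
        st.session_state.result = expensive_process(uploaded)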
@@ -401,19 +166,17 @@ def main():
 
          plt.tight_layout()
          st.pyplot(fig)
-         analysis_df = create_analysis_data(df)
+
          st.subheader("Анализ")
-         st.dataframe(analysis_df)
-         output = create_output_file(df, uploaded_file, analysis_df)
+         st.dataframe(st.session_state.analysis_df)
+
+         output = create_output_file(st.session_state.processed_df, uploaded_file, st.session_state.analysis_df)
 
-         # Calculate elapsed time
          end_time = time.time()
          elapsed_time = end_time - start_time
          formatted_time = format_elapsed_time(elapsed_time)
          st.success(f"Обработка завершена за {formatted_time}.")
 
-         # Offer download of results
-
          st.download_button(
              label="Скачать результат анализа новостей",
              data=output,
@@ -421,20 +184,23 @@ def main():
              mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
          )
 
-         # Add button for LLM analysis
+     if st.session_state.processed_df is not None and not st.session_state.llm_analyzed:
          if st.button("Что скажет нейросеть?"):
              st.info("Анализ нейросетью начался. Это может занять некоторое время...")
              llm = init_langchain_llm()
-             df_with_llm = process_file_with_llm(uploaded_file, llm)
-             output_with_llm = create_output_file_with_llm(df_with_llm, uploaded_file, analysis_df)
+             df_with_llm = process_file_with_llm(st.session_state.processed_df, llm)
+             output_with_llm = create_output_file_with_llm(df_with_llm, uploaded_file, st.session_state.analysis_df)
              st.success("Анализ нейросетью завершен!")
-             st.download_button(
-                 label="Скачать результат анализа с оценкой нейросети",
-                 data=output_with_llm,
-                 file_name="результат_анализа_с_нейросетью.xlsx",
-                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-             )
+             st.session_state.llm_analyzed = True
+             st.session_state.output_with_llm = output_with_llm
+
+     if st.session_state.llm_analyzed:
+         st.download_button(
+             label="Скачать результат анализа с оценкой нейросети",
+             data=st.session_state.output_with_llm,
+             file_name="результат_анализа_с_нейросетью.xlsx",
+             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+         )
 
  if __name__ == "__main__":
      main()
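
For context on the deduplication step that process_file() relies on (one of the helpers the placeholder comment above is supposed to keep): fuzzy_deduplicate keeps a text only if its fuzz.ratio similarity to every already-kept text stays below the threshold. A small self-contained example at the app's threshold of 65:

    from rapidfuzz import fuzz

    texts = [
        "Компания X сообщила о росте выручки",
        "Компания X сообщила о росте выручки за квартал",  # near-duplicate
        "Банк Y получил штраф от регулятора",
    ]
    kept = []
    for text in texts:
        if all(fuzz.ratio(text, seen) < 65 for seen in kept):
            kept.append(text)
    print(kept)  # the near-duplicate second headline is dropped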