Commit 261f952 · Parent(s): c58ea62
progress more 44+
app.py
CHANGED
@@ -14,6 +14,9 @@ from openpyxl import load_workbook
 from openpyxl import Workbook
 from openpyxl.utils.dataframe import dataframe_to_rows
 from sentiment_decorators import sentiment_analysis_decorator
+from langchain.llms import HuggingFacePipeline
+from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
 
 # Initialize pymystem3 for lemmatization
 mystem = Mystem()
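Note on the three added imports: they follow the classic langchain.* layout. On newer LangChain releases the Hugging Face wrapper has moved to a separate package, so the Space may need a pinned langchain version or a fallback import. A minimal, hedged sketch (the alternative package name is an assumption about the installed version, not part of the commit):

    # Hedged sketch: prefer the classic import path, fall back to the community
    # package used by newer LangChain releases for the same class.
    try:
        from langchain.llms import HuggingFacePipeline
    except ImportError:
        from langchain_community.llms import HuggingFacePipeline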
@@ -26,12 +29,112 @@ finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
 rubert1 = pipeline("sentiment-analysis", model = "DeepPavlov/rubert-base-cased")
 rubert2 = pipeline("sentiment-analysis", model = "blanchefort/rubert-base-cased-sentiment")
 
+def init_langchain_llm():
+    pipe = pipeline("text-generation", model="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
+    llm = HuggingFacePipeline(pipeline=pipe)
+    return llm
+
+# Function to estimate impact using LLM
+def estimate_impact(llm, news_text):
+    template = """
+    Analyze the following news piece and estimate its monetary impact in Russian rubles for the next 6 months.
+    If a monetary estimate is not possible, categorize the impact as "Значительный", "Незначительный", or "Неопределенный".
+    Also provide a short reasoning (max 100 words) for your assessment.
+
+    News: {news}
+
+    Estimated Impact:
+    Reasoning:
+    """
+    prompt = PromptTemplate(template=template, input_variables=["news"])
+    chain = LLMChain(llm=llm, prompt=prompt)
+    response = chain.run(news=news_text)
+
+    # Parse the response to extract impact and reasoning
+    # Parsing logic is very important! Might be needed to be changed
+    impact, reasoning = response.split("Reasoning:")
+    impact = impact.strip()
+    reasoning = reasoning.strip()
+
+    return impact, reasoning
+
+def process_file_with_llm(uploaded_file, llm):
+    df = process_file(uploaded_file)
+
+    # Add new columns for LLM analysis
+    df['LLM_Impact'] = ''
+    df['LLM_Reasoning'] = ''
+
+    for index, row in df.iterrows():
+        if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
+            impact, reasoning = estimate_impact(llm, row['Выдержки из текста'])
+            df.at[index, 'LLM_Impact'] = impact
+            df.at[index, 'LLM_Reasoning'] = reasoning
+
+    return df
+
+def create_output_file_with_llm(df, uploaded_file, analysis_df):
+    wb = load_workbook("sample_file.xlsx")
+
+    # Update 'Сводка' sheet
+    summary_df = pd.DataFrame({
+        'Объект': df['Объект'].unique(),
+        'Всего новостей': df.groupby('Объект').size(),
+        'Отрицательные': df[df[['FinBERT', 'RoBERTa', 'FinBERT-Tone']].eq('Negative').any(axis=1)].groupby('Объект').size(),
+        'Положительные': df[df[['FinBERT', 'RoBERTa', 'FinBERT-Tone']].eq('Positive').any(axis=1)].groupby('Объект').size(),
+        'Impact': df.groupby('Объект')['LLM_Impact'].agg(lambda x: x.value_counts().index[0] if x.any() else 'Неопределенный')
+    })
+    ws = wb['Сводка']
+    for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=False), start=4):
+        for c_idx, value in enumerate(row, start=5):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Update 'Значимые' sheet
+    significant_data = []
+    for _, row in df.iterrows():
+        if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
+            sentiment = 'Negative' if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']) else 'Positive'
+            significant_data.append([row['Объект'], 'релевантен', sentiment, row['LLM_Impact'], row['Заголовок'], row['Выдержки из текста']])
+
+    ws = wb['Значимые']
+    for r_idx, row in enumerate(significant_data, start=3):
+        for c_idx, value in enumerate(row, start=3):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Update 'Анализ' sheet
+    analysis_df['LLM_Reasoning'] = df['LLM_Reasoning']
+    ws = wb['Анализ']
+    for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=False), start=4):
+        for c_idx, value in enumerate(row, start=5):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Copy 'Публикации' sheet from original uploaded file
+    original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
+    ws = wb['Публикации']
+    for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
+        for c_idx, value in enumerate(row, start=1):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Add 'Тех.приложение' sheet with processed data
+    if 'Тех.приложение' not in wb.sheetnames:
+        wb.create_sheet('Тех.приложение')
+    ws = wb['Тех.приложение']
+    for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
+        for c_idx, value in enumerate(row, start=1):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+
+    output = io.BytesIO()
+    wb.save(output)
+    output.seek(0)
+    return output
+
 def create_analysis_data(df):
     analysis_data = []
     for _, row in df.iterrows():
         if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
             analysis_data.append([row['Объект'], row['Заголовок'], 'РИСК УБЫТКА', '', row['Выдержки из текста']])
-    return pd.DataFrame(analysis_data, columns=['Объект', 'Заголовок', 'Признак', '
+    return pd.DataFrame(analysis_data, columns=['Объект', 'Заголовок', 'Признак', 'Пояснение', 'Текст сообщения'])
 
 # Function for lemmatizing Russian text
 def lemmatize_text(text):
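The in-code comment already flags the parsing step of estimate_impact as fragile: response.split("Reasoning:") raises a ValueError whenever the model omits the "Reasoning:" marker or repeats it, which would abort the whole per-row loop in process_file_with_llm. A minimal defensive variant is sketched below; it is not part of the commit, and the fallback label is an assumption:

    def parse_impact_response(response):
        # Split on the first "Reasoning:" marker only; tolerate a missing marker.
        impact, sep, reasoning = response.partition("Reasoning:")
        if not sep:
            # Marker absent: keep the raw text as reasoning, mark the impact as undetermined.
            return "Неопределенный", response.strip()
        # Drop an echoed "Estimated Impact:" prefix if the model repeated the template.
        impact = impact.replace("Estimated Impact:", "").strip()
        return impact or "Неопределенный", reasoning.strip()

estimate_impact could call a helper like this instead of the bare split, so one malformed completion does not stop the whole batch.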
@@ -125,6 +228,20 @@ def fuzzy_deduplicate(df, column, threshold=65):
         indices_to_keep.append(i)
     return df.iloc[indices_to_keep]
 
+def format_elapsed_time(seconds):
+    hours, remainder = divmod(int(seconds), 3600)
+    minutes, seconds = divmod(remainder, 60)
+
+    time_parts = []
+    if hours > 0:
+        time_parts.append(f"{hours} час{'ов' if hours != 1 else ''}")
+    if minutes > 0:
+        time_parts.append(f"{minutes} минут{'' if minutes == 1 else 'ы' if 2 <= minutes <= 4 else ''}")
+    if seconds > 0 or not time_parts:  # always show seconds if it's the only non-zero value
+        time_parts.append(f"{seconds} секунд{'а' if seconds == 1 else 'ы' if 2 <= seconds <= 4 else ''}")
+
+    return " ".join(time_parts)
+
 
 def process_file(uploaded_file):
     df = pd.read_excel(uploaded_file, sheet_name='Публикации')
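For orientation, a few hand-computed outputs of format_elapsed_time under the committed logic (illustrative, not from the commit): the suffix expressions only special-case 1 and 2–4, so forms such as "1 минут" or "2 часов" slip through.

    print(format_elapsed_time(45))    # "45 секунд"
    print(format_elapsed_time(3675))  # "1 час 1 минут 15 секунд" (note the bare "минут")
    print(format_elapsed_time(7322))  # "2 часов 2 минуты 2 секунды" ("часов" rather than "часа")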
@@ -257,7 +374,7 @@ def create_output_file(df, uploaded_file, analysis_df):
     return output
 
 def main():
-    st.title("... приступим к анализу... версия
+    st.title("... приступим к анализу... версия 44+")
 
     uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
 
@@ -292,7 +409,8 @@ def main():
         # Calculate elapsed time
         end_time = time.time()
         elapsed_time = end_time - start_time
-
+        formatted_time = format_elapsed_time(elapsed_time)
+        st.success(f"Обработка завершена за {formatted_time}.")
 
         # Offer download of results
 
@@ -302,5 +420,20 @@ def main():
             file_name="результат_анализа_новостей.xlsx",
             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
         )
+
+        # Add button for LLM analysis
+        if st.button("Что скажет нейросеть?"):
+            st.info("Анализ нейросетью начался. Это может занять некоторое время...")
+            llm = init_langchain_llm()
+            df_with_llm = process_file_with_llm(uploaded_file, llm)
+            output_with_llm = create_output_file_with_llm(df_with_llm, uploaded_file, analysis_df)
+            st.success("Анализ нейросетью завершен!")
+            st.download_button(
+                label="Скачать результат анализа с оценкой нейросети",
+                data=output_with_llm,
+                file_name="результат_анализа_с_нейросетью.xlsx",
+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+            )
+
 if __name__ == "__main__":
     main()
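One practical caveat about the new button, outside the scope of the commit itself: Streamlit re-runs the whole script on every click, so analysis_df has to be recomputed or cached before create_output_file_with_llm can use it, and the sentiment pass runs again inside process_file_with_llm. A hedged sketch of caching the first pass with st.session_state (the key names are illustrative, not part of the commit):

    # Keep the first-pass results in st.session_state so the button's rerun can still see them.
    if uploaded_file is not None and "df" not in st.session_state:
        st.session_state.df = process_file(uploaded_file)
        st.session_state.analysis_df = create_analysis_data(st.session_state.df)

    if "df" in st.session_state and st.button("Что скажет нейросеть?"):
        llm = init_langchain_llm()
        df_with_llm = process_file_with_llm(uploaded_file, llm)  # still re-reads the upload, as committed
        output_with_llm = create_output_file_with_llm(df_with_llm, uploaded_file,
                                                      st.session_state.analysis_df)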
|