Spaces:
Running
Running
Commit
·
08fb3e7
1
Parent(s):
1fd794f
progress more (3.2)
Browse files
app.py
CHANGED
@@ -15,31 +15,49 @@ from reportlab.lib import colors
|
|
15 |
from reportlab.lib.pagesizes import letter
|
16 |
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
17 |
from reportlab.lib.styles import getSampleStyleSheet
|
18 |
-
from io import StringIO
|
|
|
19 |
import contextlib
|
20 |
|
21 |
|
22 |
-
|
23 |
-
def
|
24 |
-
|
25 |
-
output = StringIO()
|
26 |
-
with contextlib.redirect_stdout(output):
|
27 |
-
yield output
|
28 |
|
29 |
-
def
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
# Initialize sentiment analyzers
|
45 |
finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")
|
@@ -193,78 +211,93 @@ def generate_sentiment_visualization(df):
|
|
193 |
return fig
|
194 |
|
195 |
def process_file(uploaded_file):
|
196 |
-
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
197 |
|
198 |
-
required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
|
199 |
-
missing_columns = [col for col in required_columns if col not in df.columns]
|
200 |
-
if missing_columns:
|
201 |
-
st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
|
202 |
-
st.stop()
|
203 |
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
# Process each news item
|
225 |
-
df['Translated'] = ''
|
226 |
-
df['Sentiment'] = ''
|
227 |
-
df['Impact'] = ''
|
228 |
-
df['Reasoning'] = ''
|
229 |
-
|
230 |
-
for index, row in df.iterrows():
|
231 |
-
# First: Translate
|
232 |
-
translated_text = translate_text(llm, row['Выдержки из текста'])
|
233 |
-
df.at[index, 'Translated'] = translated_text
|
234 |
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
258 |
|
259 |
-
|
260 |
-
status_text.empty()
|
261 |
|
262 |
-
# Generate visualization
|
263 |
-
visualization = generate_sentiment_visualization(df)
|
264 |
-
if visualization:
|
265 |
-
st.pyplot(visualization)
|
266 |
|
267 |
-
|
|
|
|
|
|
|
|
|
|
|
268 |
|
269 |
def create_analysis_data(df):
|
270 |
analysis_data = []
|
@@ -359,27 +392,25 @@ def create_output_file(df, uploaded_file):
|
|
359 |
return output
|
360 |
|
361 |
def main():
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
st.title("::: анализ мониторинга новостей СКАН-ИНТЕРФАКС (v.3.1):::")
|
383 |
|
384 |
if 'processed_df' not in st.session_state:
|
385 |
st.session_state.processed_df = None
|
@@ -406,10 +437,6 @@ def main():
|
|
406 |
formatted_time = format_elapsed_time(elapsed_time)
|
407 |
st.success(f"Обработка и анализ завершены за {formatted_time}.")
|
408 |
|
409 |
-
if st.session_state.processed_df is not None:
|
410 |
-
save_to_pdf(output) # Save the captured output to PDF
|
411 |
-
|
412 |
-
|
413 |
st.download_button(
|
414 |
label="Скачать результат анализа",
|
415 |
data=output,
|
|
|
15 |
from reportlab.lib.pagesizes import letter
|
16 |
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
17 |
from reportlab.lib.styles import getSampleStyleSheet
|
18 |
+
from io import StringIO, BytesIO
|
19 |
+
import sys
|
20 |
import contextlib
|
21 |
|
22 |
|
23 |
+
class StreamlitOutputCapture:
|
24 |
+
def __init__(self):
|
25 |
+
self.output = []
|
|
|
|
|
|
|
26 |
|
27 |
+
def write(self, text):
|
28 |
+
self.output.append(text)
|
29 |
+
|
30 |
+
def getvalue(self):
|
31 |
+
return ''.join(self.output)
|
32 |
+
|
33 |
+
def flush(self):
|
34 |
+
pass
|
35 |
+
|
36 |
+
def save_to_pdf(captured_output):
|
37 |
+
try:
|
38 |
+
# Create PDF document
|
39 |
+
doc = SimpleDocTemplate("result.pdf", pagesize=letter)
|
40 |
+
styles = getSampleStyleSheet()
|
41 |
+
story = []
|
42 |
+
|
43 |
+
# Convert captured output to string and split into lines
|
44 |
+
output_text = captured_output.getvalue()
|
45 |
+
lines = output_text.split('\n')
|
46 |
+
|
47 |
+
# Add each line to the PDF
|
48 |
+
for line in lines:
|
49 |
+
if line.strip(): # Skip empty lines
|
50 |
+
# Clean the line and handle any encoding issues
|
51 |
+
cleaned_line = line.encode('utf-8', errors='ignore').decode('utf-8')
|
52 |
+
p = Paragraph(cleaned_line, styles['Normal'])
|
53 |
+
story.append(p)
|
54 |
+
story.append(Spacer(1, 12))
|
55 |
+
|
56 |
+
# Build the PDF
|
57 |
+
doc.build(story)
|
58 |
+
st.success("PDF файл 'result.pdf' успешно создан")
|
59 |
+
except Exception as e:
|
60 |
+
st.error(f"Ошибка при создании PDF: {str(e)}")
|
61 |
|
62 |
# Initialize sentiment analyzers
|
63 |
finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")
|
|
|
211 |
return fig
|
212 |
|
213 |
def process_file(uploaded_file):
|
|
|
214 |
|
|
|
|
|
|
|
|
|
|
|
215 |
|
216 |
+
output_capture = StreamlitOutputCapture()
|
217 |
+
old_stdout = sys.stdout
|
218 |
+
sys.stdout = output_capture
|
219 |
+
|
220 |
+
try:
|
221 |
+
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
222 |
+
|
223 |
+
required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
|
224 |
+
missing_columns = [col for col in required_columns if col not in df.columns]
|
225 |
+
if missing_columns:
|
226 |
+
st.error(f"Error: The following required columns are missing from the input file: {', '.join(missing_columns)}")
|
227 |
+
st.stop()
|
228 |
+
|
229 |
+
# Initialize LLM
|
230 |
+
llm = init_langchain_llm()
|
231 |
+
if not llm:
|
232 |
+
st.error("Не удалось инициализировать нейросеть. Пожалуйста, проверьте настройки и попробуйте снова.")
|
233 |
+
st.stop()
|
234 |
+
|
235 |
+
# Deduplication
|
236 |
+
original_news_count = len(df)
|
237 |
+
df = df.groupby('Объект').apply(
|
238 |
+
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
|
239 |
+
).reset_index(drop=True)
|
240 |
+
|
241 |
+
remaining_news_count = len(df)
|
242 |
+
duplicates_removed = original_news_count - remaining_news_count
|
243 |
+
st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
|
244 |
|
245 |
+
# Initialize progress
|
246 |
+
progress_bar = st.progress(0)
|
247 |
+
status_text = st.empty()
|
248 |
+
|
249 |
+
# Process each news item
|
250 |
+
df['Translated'] = ''
|
251 |
+
df['Sentiment'] = ''
|
252 |
+
df['Impact'] = ''
|
253 |
+
df['Reasoning'] = ''
|
254 |
+
|
255 |
+
for index, row in df.iterrows():
|
256 |
+
# First: Translate
|
257 |
+
translated_text = translate_text(llm, row['Выдержки из текста'])
|
258 |
+
df.at[index, 'Translated'] = translated_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
|
260 |
+
# Second: Analyze sentiment
|
261 |
+
sentiment = analyze_sentiment(translated_text)
|
262 |
+
df.at[index, 'Sentiment'] = sentiment
|
263 |
|
264 |
+
# Third: If negative, estimate impact
|
265 |
+
if sentiment == "Negative":
|
266 |
+
impact, reasoning = estimate_impact(llm, translated_text, row['Объект'])
|
267 |
+
df.at[index, 'Impact'] = impact
|
268 |
+
df.at[index, 'Reasoning'] = reasoning
|
269 |
|
270 |
+
# Update progress
|
271 |
+
progress = (index + 1) / len(df)
|
272 |
+
progress_bar.progress(progress)
|
273 |
+
status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
|
274 |
|
275 |
+
# Display results
|
276 |
+
st.write(f"Объект: {row['Объект']}")
|
277 |
+
st.write(f"Новость: {row['Заголовок']}")
|
278 |
+
st.write(f"Тональность: {sentiment}")
|
279 |
+
if sentiment == "Negative":
|
280 |
+
st.write(f"Эффект: {impact}")
|
281 |
+
st.write(f"Обоснование: {reasoning}")
|
282 |
+
st.write("---")
|
283 |
+
|
284 |
+
progress_bar.empty()
|
285 |
+
status_text.empty()
|
286 |
+
|
287 |
+
# Generate visualization
|
288 |
+
visualization = generate_sentiment_visualization(df)
|
289 |
+
if visualization:
|
290 |
+
st.pyplot(visualization)
|
291 |
|
292 |
+
save_to_pdf(output_capture)
|
|
|
293 |
|
|
|
|
|
|
|
|
|
294 |
|
295 |
+
return df
|
296 |
+
|
297 |
+
|
298 |
+
finally:
|
299 |
+
|
300 |
+
sys.stdout = old_stdout
|
301 |
|
302 |
def create_analysis_data(df):
|
303 |
analysis_data = []
|
|
|
392 |
return output
|
393 |
|
394 |
def main():
|
395 |
+
st.markdown(
|
396 |
+
"""
|
397 |
+
<style>
|
398 |
+
.signature {
|
399 |
+
position: fixed;
|
400 |
+
right: 12px;
|
401 |
+
bottom: 12px;
|
402 |
+
font-size: 14px;
|
403 |
+
color: #FF0000;
|
404 |
+
opacity: 0.9;
|
405 |
+
z-index: 999;
|
406 |
+
}
|
407 |
+
</style>
|
408 |
+
<div class="signature">denis.pokrovsky.npff</div>
|
409 |
+
""",
|
410 |
+
unsafe_allow_html=True
|
411 |
+
)
|
412 |
+
|
413 |
+
st.title("::: анализ мониторинга новостей СКАН-ИНТЕРФАКС (v.3.2):::")
|
|
|
|
|
414 |
|
415 |
if 'processed_df' not in st.session_state:
|
416 |
st.session_state.processed_df = None
|
|
|
437 |
formatted_time = format_elapsed_time(elapsed_time)
|
438 |
st.success(f"Обработка и анализ завершены за {formatted_time}.")
|
439 |
|
|
|
|
|
|
|
|
|
440 |
st.download_button(
|
441 |
label="Скачать результат анализа",
|
442 |
data=output,
|