pentarosarium committed on
Commit
d5eb93b
·
1 Parent(s): f036fc2

introduce googletrans

Browse files
Files changed (1) hide show
  1. app.py +171 -103
app.py CHANGED
@@ -16,6 +16,157 @@ import contextlib
16
  from langchain_openai import ChatOpenAI # Updated import
17
  import pdfkit
18
  from jinja2 import Template
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
  def translate_reasoning_to_russian(llm, text):
@@ -182,28 +333,6 @@ def fuzzy_deduplicate(df, column, threshold=50):
182
  return df.iloc[indices_to_keep]
183
 
184
 
185
- def translate_text(llm, text):
186
- try:
187
- # All models now use OpenAI-compatible API format
188
- messages = [
189
- {"role": "system", "content": "You are a translator. Translate the given Russian text to English accurately and concisely."},
190
- {"role": "user", "content": f"Translate this Russian text to English: {text}"}
191
- ]
192
- response = llm.invoke(messages)
193
-
194
- if hasattr(response, 'content'):
195
- return response.content.strip()
196
- elif isinstance(response, str):
197
- return response.strip()
198
- else:
199
- return str(response).strip()
200
-
201
- except Exception as e:
202
- st.error(f"Translation error: {str(e)}")
203
- return text
204
-
205
-
206
-
207
  def init_langchain_llm(model_choice):
208
  try:
209
  if model_choice == "Groq (llama-3.1-70b)":
@@ -319,77 +448,6 @@ def generate_sentiment_visualization(df):
319
  plt.tight_layout()
320
  return fig
321
 
322
- def process_file(uploaded_file, model_choice):
323
- df = None
324
- try:
325
- df = pd.read_excel(uploaded_file, sheet_name='Публикации')
326
- llm = init_langchain_llm(model_choice)
327
-
328
- # Validate required columns
329
- required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
330
- missing_columns = [col for col in required_columns if col not in df.columns]
331
- if missing_columns:
332
- st.error(f"Error: The following required columns are missing: {', '.join(missing_columns)}")
333
- return df if df is not None else None
334
-
335
- # Deduplication
336
- original_news_count = len(df)
337
- df = df.groupby('Объект', group_keys=False).apply(
338
- lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
339
- ).reset_index(drop=True)
340
-
341
- remaining_news_count = len(df)
342
- duplicates_removed = original_news_count - remaining_news_count
343
- st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")
344
-
345
-
346
- # Initialize progress tracking
347
- progress_bar = st.progress(0)
348
- status_text = st.empty()
349
-
350
- # Initialize new columns
351
- df['Translated'] = ''
352
- df['Sentiment'] = ''
353
- df['Impact'] = ''
354
- df['Reasoning'] = ''
355
- df['Event_Type'] = ''
356
- df['Event_Summary'] = ''
357
-
358
- # Process each news item
359
- for index, row in df.iterrows():
360
- try:
361
- # Translate and analyze sentiment
362
- translated_text = translate_text(llm, row['Выдержки из текста'])
363
- df.at[index, 'Translated'] = translated_text
364
-
365
- sentiment = analyze_sentiment(translated_text)
366
- df.at[index, 'Sentiment'] = sentiment
367
-
368
- # Detect events
369
- event_type, event_summary = detect_events(llm, row['Выдержки из текста'], row['Объект'])
370
- df.at[index, 'Event_Type'] = event_type
371
- df.at[index, 'Event_Summary'] = event_summary
372
-
373
- if sentiment == "Negative":
374
- impact, reasoning = estimate_impact(llm, translated_text, row['Объект'])
375
- df.at[index, 'Impact'] = impact
376
- df.at[index, 'Reasoning'] = reasoning
377
-
378
- # Update progress
379
- progress = (index + 1) / len(df)
380
- progress_bar.progress(progress)
381
- status_text.text(f"Проанализировано {index + 1} из {len(df)} новостей")
382
-
383
- except Exception as e:
384
- st.warning(f"Ошибка при обработке новости {index + 1}: {str(e)}")
385
- continue
386
-
387
- return df
388
-
389
- except Exception as e:
390
- st.error(f"❌ Ошибка при обработке файла: {str(e)}")
391
- return df if df is not None else None
392
-
393
  def create_analysis_data(df):
394
  analysis_data = []
395
  for _, row in df.iterrows():
@@ -506,7 +564,7 @@ def create_output_file(df, uploaded_file, llm):
506
 
507
  def main():
508
  with st.sidebar:
509
- st.title("::: AI-анализ мониторинга новостей (v.3.30):::")
510
  st.subheader("по материалам СКАН-ИНТЕРФАКС ")
511
 
512
  model_choice = st.radio(
@@ -515,6 +573,13 @@ def main():
515
  key="model_selector"
516
  )
517
 
 
 
 
 
 
 
 
518
  st.markdown(
519
  """
520
  Использованы технологии:
@@ -524,16 +589,16 @@ def main():
524
  """,
525
  unsafe_allow_html=True)
526
 
527
- # Model selection is now handled in init_langchain_llm()
528
-
529
  with st.expander("ℹ️ Инструкция"):
530
  st.markdown("""
531
  1. Выберите модель для анализа
532
- 2. Загрузите Excel файл с новостями <br/>
533
- 3. Дождитесь завершения анализа <br/>
534
- 4. Скачайте результаты анализа в формате Excel <br/>
 
535
  """, unsafe_allow_html=True)
536
-
 
537
  st.markdown(
538
  """
539
  <style>
@@ -563,12 +628,15 @@ def main():
563
  if uploaded_file is not None and st.session_state.processed_df is None:
564
  start_time = time.time()
565
 
566
-
567
  # Initialize LLM with selected model
568
  llm = init_langchain_llm(model_choice)
569
 
570
-
571
- st.session_state.processed_df = process_file(uploaded_file, model_choice)
 
 
 
 
572
 
573
  st.subheader("Предпросмотр данных")
574
  preview_df = st.session_state.processed_df[['Объект', 'Заголовок', 'Sentiment', 'Impact']].head()
 
16
  from langchain_openai import ChatOpenAI # Updated import
17
  import pdfkit
18
  from jinja2 import Template
19
+ from googletrans import Translator as GoogleTranslator
20
+ import time
21
+
22
class TranslationSystem:
    """Translate text through either googletrans or a LangChain chat model.

    The backend is chosen once, at construction time, via ``method``. Any
    value other than ``'googletrans'`` routes requests to the LLM backend.
    """

    # Human-readable language names used when building the LLM prompt.
    # Codes that are not listed here are inserted into the prompt verbatim.
    _LANGUAGE_NAMES = {
        'ru': 'Russian',
        'en': 'English',
        'de': 'German',
        'fr': 'French',
        'es': 'Spanish',
    }

    def __init__(self, method='googletrans', llm=None, request_delay=0.5):
        """
        Initialize translation system with specified method.

        Args:
            method (str): 'googletrans' or 'llm'
            llm: LangChain LLM instance (required if method is 'llm')
            request_delay (float): Seconds to pause before every googletrans
                request to stay under its rate limits. Use 0 to disable.
        """
        self.method = method
        self.llm = llm
        self.request_delay = request_delay
        # The googletrans client is only created when it will actually be used.
        self.google_translator = GoogleTranslator() if method == 'googletrans' else None

    def translate_text(self, text, src='ru', dest='en'):
        """
        Translate text using the selected translation method.

        Missing values (None / NaN) and blank strings are returned unchanged
        without contacting any backend. Non-string scalars are converted with
        ``str()`` first so that, for example, a numeric spreadsheet cell does
        not fail on ``.strip()``.

        Args:
            text (str): Text to translate
            src (str): Source language code
            dest (str): Destination language code

        Returns:
            str: Translated text, or the untranslated text if the backend
            fails (a Streamlit warning is shown in that case).
        """
        if text is None:
            return text
        if not isinstance(text, str):
            if pd.isna(text):
                return text
            text = str(text)
        if not text.strip():
            return text

        try:
            if self.method == 'googletrans':
                return self._translate_with_googletrans(text, src, dest)
            return self._translate_with_llm(text, src, dest)
        except Exception as e:
            # Best effort: report the problem and fall back to the input text
            # so that one bad item does not abort a whole batch.
            st.warning(f"Translation error: {str(e)}")
            return text

    def _translate_with_googletrans(self, text, src='ru', dest='en'):
        """
        Translate using googletrans library.

        Raises:
            RuntimeError: If the googletrans request fails for any reason;
                the original exception is chained as the cause.
        """
        try:
            # Add delay to avoid rate limits
            if self.request_delay:
                time.sleep(self.request_delay)
            result = self.google_translator.translate(text, src=src, dest=dest)
            return result.text
        except Exception as e:
            raise RuntimeError(f"Googletrans error: {str(e)}") from e

    def _translate_with_llm(self, text, src='ru', dest='en'):
        """
        Translate using LangChain LLM.

        The prompt names the source and destination languages derived from
        ``src`` and ``dest``; with the defaults it reads "Russian" and
        "English".

        Raises:
            RuntimeError: If no LLM was supplied, or if the LLM call fails
                (the original exception is chained as the cause).
        """
        if self.llm is None:
            raise RuntimeError("LLM not initialized for translation")

        src_name = self._LANGUAGE_NAMES.get(str(src).lower(), src)
        dest_name = self._LANGUAGE_NAMES.get(str(dest).lower(), dest)

        messages = [
            {
                "role": "system",
                "content": (
                    f"You are a translator. Translate the given {src_name} "
                    f"text to {dest_name} accurately and concisely."
                ),
            },
            {
                "role": "user",
                "content": f"Translate this {src_name} text to {dest_name}: {text}",
            },
        ]

        try:
            response = self.llm.invoke(messages)

            # Chat models return a message object with ``content``; plain
            # LLMs may return a bare string.
            if hasattr(response, 'content'):
                return response.content.strip()
            if isinstance(response, str):
                return response.strip()
            return str(response).strip()
        except Exception as e:
            raise RuntimeError(f"LLM translation error: {str(e)}") from e
94
+
95
def process_file(uploaded_file, model_choice, translation_method='googletrans'):
    """Read the uploaded news workbook and enrich every row with analysis.

    The 'Публикации' sheet is loaded, checked for the required columns,
    de-duplicated per 'Объект' with ``fuzzy_deduplicate`` (threshold 65 on
    'Выдержки из текста'), and then each remaining row is translated,
    sentiment-scored and scanned for events. Impact is estimated only for
    rows whose sentiment equals "Negative".

    Args:
        uploaded_file: Excel file (path or file-like object) passed to
            ``pd.read_excel``.
        model_choice: Model label forwarded to ``init_langchain_llm``.
        translation_method (str): 'googletrans' or 'llm'; forwarded to
            ``TranslationSystem``.

    Returns:
        The processed DataFrame with the added columns 'Translated',
        'Sentiment', 'Impact', 'Reasoning', 'Event_Type' and 'Event_Summary'.
        If required columns are missing, or processing fails after the sheet
        was read, the DataFrame in its current state is returned; if the
        sheet could not be read at all, None is returned. Errors are reported
        through Streamlit rather than raised.
    """
    df = None
    try:
        df = pd.read_excel(uploaded_file, sheet_name='Публикации')

        # Validate required columns first, so an unusable file is rejected
        # before any model or translator is initialised.
        required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            st.error(f"Error: The following required columns are missing: {', '.join(missing_columns)}")
            return df

        llm = init_langchain_llm(model_choice)

        # Initialize translation system with chosen method; the LLM is only
        # handed over when it is the translation backend.
        translator = TranslationSystem(
            method=translation_method,
            llm=llm if translation_method == 'llm' else None
        )

        # Deduplication
        original_news_count = len(df)
        df = df.groupby('Объект', group_keys=False).apply(
            lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
        ).reset_index(drop=True)

        remaining_news_count = len(df)
        duplicates_removed = original_news_count - remaining_news_count
        st.write(f"Из {original_news_count} новостных сообщений удалены {duplicates_removed} дублирующих. Осталось {remaining_news_count}.")

        # Initialize progress tracking
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Initialize new columns
        df['Translated'] = ''
        df['Sentiment'] = ''
        df['Impact'] = ''
        df['Reasoning'] = ''
        df['Event_Type'] = ''
        df['Event_Summary'] = ''

        # Process each news item
        total = remaining_news_count
        for position, (index, row) in enumerate(df.iterrows(), start=1):
            try:
                # Translate and analyze sentiment
                translated_text = translator.translate_text(row['Выдержки из текста'])
                df.at[index, 'Translated'] = translated_text

                sentiment = analyze_sentiment(translated_text)
                df.at[index, 'Sentiment'] = sentiment

                # Detect events (on the original, untranslated excerpt)
                event_type, event_summary = detect_events(llm, row['Выдержки из текста'], row['Объект'])
                df.at[index, 'Event_Type'] = event_type
                df.at[index, 'Event_Summary'] = event_summary

                if sentiment == "Negative":
                    impact, reasoning = estimate_impact(llm, translated_text, row['Объект'])
                    df.at[index, 'Impact'] = impact
                    df.at[index, 'Reasoning'] = reasoning

            except Exception as e:
                # A failing item is reported and skipped; the rest of the
                # batch is still processed.
                st.warning(f"Ошибка при обработке новости {index + 1}: {str(e)}")

            # Update progress for every item, including failed ones, so the
            # bar does not stall and always reaches 100% at the end.
            progress_bar.progress(position / total)
            status_text.text(f"Проанализировано {position} из {total} новостей")

        return df

    except Exception as e:
        st.error(f"❌ Ошибка при обработке файла: {str(e)}")
        return df
170
 
171
 
172
  def translate_reasoning_to_russian(llm, text):
 
333
  return df.iloc[indices_to_keep]
334
 
335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  def init_langchain_llm(model_choice):
337
  try:
338
  if model_choice == "Groq (llama-3.1-70b)":
 
448
  plt.tight_layout()
449
  return fig
450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  def create_analysis_data(df):
452
  analysis_data = []
453
  for _, row in df.iterrows():
 
564
 
565
  def main():
566
  with st.sidebar:
567
+ st.title("::: AI-анализ мониторинга новостей (v.3.32 ):::")
568
  st.subheader("по материалам СКАН-ИНТЕРФАКС ")
569
 
570
  model_choice = st.radio(
 
573
  key="model_selector"
574
  )
575
 
576
+ translation_method = st.radio(
577
+ "Выберите метод перевода:",
578
+ ["googletrans", "llm"],
579
+ key="translation_selector",
580
+ help="googletrans - быстрее, llm - качественнее, но медленнее"
581
+ )
582
+
583
  st.markdown(
584
  """
585
  Использованы технологии:
 
589
  """,
590
  unsafe_allow_html=True)
591
 
 
 
592
  with st.expander("ℹ️ Инструкция"):
593
  st.markdown("""
594
  1. Выберите модель для анализа
595
+ 2. Выберите метод перевода
596
+ 3. Загрузите Excel файл с новостями
597
+ 4. Дождитесь завершения анализа
598
+ 5. Скачайте результаты анализа в формате Excel
599
  """, unsafe_allow_html=True)
600
+
601
+
602
  st.markdown(
603
  """
604
  <style>
 
628
  if uploaded_file is not None and st.session_state.processed_df is None:
629
  start_time = time.time()
630
 
 
631
  # Initialize LLM with selected model
632
  llm = init_langchain_llm(model_choice)
633
 
634
+ # Process file with selected translation method
635
+ st.session_state.processed_df = process_file(
636
+ uploaded_file,
637
+ model_choice,
638
+ translation_method
639
+ )
640
 
641
  st.subheader("Предпросмотр данных")
642
  preview_df = st.session_state.processed_df[['Объект', 'Заголовок', 'Sentiment', 'Impact']].head()