pentarosarium committed on
Commit
05922ea
·
1 Parent(s): 7844008

3.41 translat helsinki

Browse files
Files changed (1) hide show
  1. app.py +63 -34
app.py CHANGED
@@ -27,14 +27,18 @@ from googletrans import Translator as LegacyTranslator
27
  class TranslationSystem:
28
  def __init__(self, batch_size=5):
29
  """
30
- Initialize translation system using only deep-translator.
31
  """
32
- self.batch_size = batch_size
33
- self.translator = GoogleTranslator(source='russian', target='english') # Using full language names
 
 
 
 
34
 
35
  def translate_text(self, text):
36
  """
37
- Translate single text using deep-translator with chunking for long texts.
38
  """
39
  if pd.isna(text) or not isinstance(text, str) or not text.strip():
40
  return text
@@ -44,33 +48,73 @@ class TranslationSystem:
44
  return text
45
 
46
  try:
47
- # deep-translator has a character limit, so we need to chunk long texts
48
- max_chunk_size = 4500 # Deep translator limit is 5000, using 4500 to be safe
49
 
50
- if len(text) <= max_chunk_size:
51
- return self.translator.translate(text=text)
 
 
52
 
53
- # Split long text into chunks
54
- chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
55
  translated_chunks = []
56
 
57
  for chunk in chunks:
58
- translated_chunk = self.translator.translate(text=chunk)
59
- translated_chunks.append(translated_chunk)
60
- time.sleep(0.5) # Small delay between chunks
61
 
62
  return ' '.join(translated_chunks)
63
 
64
  except Exception as e:
65
  st.warning(f"Translation error: {str(e)}. Using original text.")
66
  return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- def process_file(uploaded_file, model_choice, translation_method=None): # Added translation_method parameter with default None
69
  df = None
70
  try:
71
  df = pd.read_excel(uploaded_file, sheet_name='Публикации')
72
  llm = init_langchain_llm(model_choice)
73
- translator = TranslationSystem(batch_size=5) # We'll use deep-translator regardless of translation_method
 
 
 
 
 
 
 
 
74
 
75
  # Validate required columns
76
  required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
@@ -93,14 +137,6 @@ def process_file(uploaded_file, model_choice, translation_method=None): # Added
93
  progress_bar = st.progress(0)
94
  status_text = st.empty()
95
 
96
- # Initialize new columns
97
- df['Translated'] = ''
98
- df['Sentiment'] = ''
99
- df['Impact'] = ''
100
- df['Reasoning'] = ''
101
- df['Event_Type'] = ''
102
- df['Event_Summary'] = ''
103
-
104
  # Process in batches
105
  batch_size = 5
106
  for i in range(0, len(df), batch_size):
@@ -108,7 +144,7 @@ def process_file(uploaded_file, model_choice, translation_method=None): # Added
108
 
109
  for idx, row in batch_df.iterrows():
110
  try:
111
- # Translation
112
  translated_text = translator.translate_text(row['Выдержки из текста'])
113
  df.at[idx, 'Translated'] = translated_text
114
 
@@ -116,7 +152,7 @@ def process_file(uploaded_file, model_choice, translation_method=None): # Added
116
  sentiment = analyze_sentiment(translated_text)
117
  df.at[idx, 'Sentiment'] = sentiment
118
 
119
- # Event detection with rate limit handling
120
  event_type, event_summary = detect_events(
121
  llm,
122
  row['Выдержки из текста'],
@@ -554,7 +590,7 @@ def create_output_file(df, uploaded_file, llm):
554
  return output
555
  def main():
556
  with st.sidebar:
557
- st.title("::: AI-анализ мониторинга новостей (v.3.40 ):::")
558
  st.subheader("по материалам СКАН-ИНТЕРФАКС ")
559
 
560
  model_choice = st.radio(
@@ -563,14 +599,7 @@ def main():
563
  key="model_selector"
564
  )
565
 
566
- # We'll keep this for compatibility but it won't affect the translation method
567
- translation_method = st.radio(
568
- "Выберите метод перевода:",
569
- ["googletrans", "llm"],
570
- key="translation_selector",
571
- help="Используется deep-translator независимо от выбора"
572
- )
573
-
574
  st.markdown(
575
  """
576
  Использованы технологии:
 
27
  class TranslationSystem:
28
  def __init__(self, batch_size=5):
29
  """
30
+ Initialize translation system using Helsinki NLP model.
31
  """
32
+ try:
33
+ self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en") # Note: ru-en for Russian to English
34
+ self.batch_size = batch_size
35
+ except Exception as e:
36
+ st.error(f"Error initializing Helsinki NLP translator: {str(e)}")
37
+ raise
38
 
39
  def translate_text(self, text):
40
  """
41
+ Translate single text using Helsinki NLP model with chunking for long texts.
42
  """
43
  if pd.isna(text) or not isinstance(text, str) or not text.strip():
44
  return text
 
48
  return text
49
 
50
  try:
51
+ # Helsinki NLP model typically has a max length limit
52
+ max_chunk_size = 512 # Standard transformer length
53
 
54
+ if len(text.split()) <= max_chunk_size:
55
+ # Direct translation for short texts
56
+ result = self.translator(text, max_length=512)
57
+ return result[0]['translation_text']
58
 
59
+ # Split long text into chunks by sentences
60
+ chunks = self._split_into_chunks(text, max_chunk_size)
61
  translated_chunks = []
62
 
63
  for chunk in chunks:
64
+ result = self.translator(chunk, max_length=512)
65
+ translated_chunks.append(result[0]['translation_text'])
66
+ time.sleep(0.1) # Small delay between chunks
67
 
68
  return ' '.join(translated_chunks)
69
 
70
  except Exception as e:
71
  st.warning(f"Translation error: {str(e)}. Using original text.")
72
  return text
73
+
74
def _split_into_chunks(self, text, max_length):
    """
    Split text into chunks of at most ``max_length`` whitespace-separated words.

    Chunk boundaries are placed at sentence ends whenever possible. Sentence
    terminators ('.', '!', '?', '…') are kept in the output; the previous
    version stripped them, so the translator received text with no
    punctuation. A sentence only ends where a terminator is followed by
    whitespace, so tokens such as "3.41" stay intact.

    A single sentence longer than ``max_length`` words is cut on word
    boundaries instead of being emitted as one oversized chunk.

    Args:
        text: The text to split.
        max_length: Maximum number of words per chunk. Values below 1 are
            treated as 1.

    Returns:
        A list of chunk strings, in original order. Words inside a chunk are
        joined with single spaces. Empty or whitespace-only input yields an
        empty list.
    """
    # Local import: the file's top-level import block is not visible from here.
    import re

    # Guard against a non-positive limit, which would otherwise never drain
    # the word buffer below.
    max_length = max(1, int(max_length))

    # Split *after* a terminator that is followed by whitespace, so the
    # terminator stays attached to its sentence.
    sentences = [s for s in re.split(r'(?<=[.!?…])\s+', text.strip()) if s]

    chunks = []
    current_words = []

    for sentence in sentences:
        words = sentence.split()

        # Start a new chunk when this sentence would overflow the current one.
        if current_words and len(current_words) + len(words) > max_length:
            chunks.append(' '.join(current_words))
            current_words = []

        current_words.extend(words)

        # Only reachable when a single sentence exceeds the limit on its own.
        while len(current_words) > max_length:
            chunks.append(' '.join(current_words[:max_length]))
            current_words = current_words[max_length:]

    if current_words:
        chunks.append(' '.join(current_words))

    return chunks
101
+
102
+
103
 
104
+ def process_file(uploaded_file, model_choice, translation_method=None):
105
  df = None
106
  try:
107
  df = pd.read_excel(uploaded_file, sheet_name='Публикации')
108
  llm = init_langchain_llm(model_choice)
109
+ translator = TranslationSystem(batch_size=5)
110
+
111
+ # Initialize all required columns first
112
+ df['Translated'] = ''
113
+ df['Sentiment'] = ''
114
+ df['Impact'] = ''
115
+ df['Reasoning'] = ''
116
+ df['Event_Type'] = ''
117
+ df['Event_Summary'] = ''
118
 
119
  # Validate required columns
120
  required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
 
137
  progress_bar = st.progress(0)
138
  status_text = st.empty()
139
 
 
 
 
 
 
 
 
 
140
  # Process in batches
141
  batch_size = 5
142
  for i in range(0, len(df), batch_size):
 
144
 
145
  for idx, row in batch_df.iterrows():
146
  try:
147
+ # Translation with Helsinki NLP
148
  translated_text = translator.translate_text(row['Выдержки из текста'])
149
  df.at[idx, 'Translated'] = translated_text
150
 
 
152
  sentiment = analyze_sentiment(translated_text)
153
  df.at[idx, 'Sentiment'] = sentiment
154
 
155
+ # Event detection
156
  event_type, event_summary = detect_events(
157
  llm,
158
  row['Выдержки из текста'],
 
590
  return output
591
  def main():
592
  with st.sidebar:
593
+ st.title("::: AI-анализ мониторинга новостей (v.3.41 ):::")
594
  st.subheader("по материалам СКАН-ИНТЕРФАКС ")
595
 
596
  model_choice = st.radio(
 
599
  key="model_selector"
600
  )
601
 
602
+
 
 
 
 
 
 
 
603
  st.markdown(
604
  """
605
  Использованы технологии: