pentarosarium committed
Commit 5137985 · 1 Parent(s): 2ed5230

progress more 59

Files changed (1)
  1. app.py +65 -60
app.py CHANGED
@@ -22,6 +22,43 @@ from huggingface_hub import login
 from accelerate import init_empty_weights
 import logging
 import os
+from transformers import MarianMTModel, MarianTokenizer
+
+class TranslationModel:
+    def __init__(self, model_name="Helsinki-NLP/opus-mt-ru-en"):
+        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
+        self.model = MarianMTModel.from_pretrained(model_name)
+        if torch.cuda.is_available():
+            self.model = self.model.to('cuda')
+
+    def translate(self, text):
+        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+        if torch.cuda.is_available():
+            inputs = {k: v.to('cuda') for k, v in inputs.items()}
+
+        with torch.no_grad():
+            translated = self.model.generate(**inputs)
+
+        return self.tokenizer.decode(translated[0], skip_special_tokens=True)
+
+
+def batch_translate(texts, batch_size=32):
+    translator = TranslationModel()
+    translated_texts = []
+
+    for i in range(0, len(texts), batch_size):
+        batch = texts[i:i+batch_size]
+        translations = [translator.translate(text) for text in batch]
+        translated_texts.extend(translations)
+
+        # Update progress
+        progress = (i + len(batch)) / len(texts)
+        st.progress(progress)
+        st.text(f"Переведено {i + len(batch)} из {len(texts)} текстов")
+
+    return translated_texts
+
+
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
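
Note on the hunk above: batch_translate still calls translator.translate() once per text, so the batching changes the progress reporting but not the throughput, and st.progress()/st.text() inside the loop create a fresh widget on every iteration. A minimal sketch of a truly batched method (translate_batch is hypothetical, not in the commit; batch_decode is the standard tokenizer API) that could back it:

    def translate_batch(self, texts):
        # Encode the whole batch at once; padding aligns sequence lengths.
        inputs = self.tokenizer(texts, return_tensors="pt",
                                padding=True, truncation=True, max_length=512)
        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}
        with torch.no_grad():
            translated = self.model.generate(**inputs)
        # Decode every sequence in the batch, not just the first.
        return self.tokenizer.batch_decode(translated, skip_special_tokens=True)
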
@@ -69,62 +106,24 @@ def load_model(model_id):
 
 
 def init_langchain_llm():
-
-    if not authenticate_huggingface():
-        st.stop()
-
-    try:
-        model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-        tokenizer, model = load_model(model_id)
-
-    except Exception as e:
-        logger.error(f"Error loading model: {str(e)}", exc_info=True)
-        st.error(f"Failed to load model: {str(e)}")
-        st.stop()
-
-    # Authenticate using the token from Streamlit secrets
-    if 'hf_token' in st.secrets:
-        login(token=st.secrets['hf_token'])
-    else:
-        st.error("Hugging Face token not found in Streamlit secrets. Please add it to access the model.")
-        st.stop()
-
-    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-
-    try:
-        tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
-
-        # Use Accelerate for efficient model loading
-        with init_empty_weights():
-            config = transformers.AutoConfig.from_pretrained(model_id)
-            model = transformers.AutoModelForCausalLM.from_config(config)
-
-        model = model.from_pretrained(
-            model_id,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            low_cpu_mem_usage=True
-        )
-
-        pipeline = transformers.pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            torch_dtype=torch.float16,
-            device_map="auto",
-        )
-
-        def llama_wrapper(prompt):
-            result = pipeline(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
-            return result[0]['generated_text']
-
-        llm = HuggingFacePipeline(pipeline=llama_wrapper)
-        return llm
-
-    except Exception as e:
-        logger.error(f"Error loading model: {str(e)}", exc_info=True)
-        st.error(f"Failed to load model: {str(e)}")
-        st.stop()
+    model_id = "gpt2"  # Using the publicly available GPT-2 model
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+    model = transformers.AutoModelForCausalLM.from_pretrained(model_id)
+
+    pipeline = transformers.pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        torch_dtype=torch.float32,
+        device_map="auto",
+    )
+
+    def gpt2_wrapper(prompt):
+        result = pipeline(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
+        return result[0]['generated_text']
+
+    llm = HuggingFacePipeline(pipeline=gpt2_wrapper)
+    return llm
 
 
 def estimate_impact(llm, news_text, entity):
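
Note on the hunk above: LangChain's HuggingFacePipeline wraps a transformers pipeline object and reads its .task attribute during generation, so passing the plain gpt2_wrapper function is likely to fail at runtime. A sketch of the conventional wiring (moving the generation parameters into the pipeline constructor is an assumption, not part of this commit):

    pipe = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
    )
    llm = HuggingFacePipeline(pipeline=pipe)  # pass the pipeline object itself
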
@@ -395,6 +394,8 @@ def process_file(uploaded_file):
     progress_bar = st.progress(0)
     progress_text = st.empty()
     total_news = len(df)
+
+    st.write("Начинаем предобработку текстов...")
 
     texts = df['Выдержки из текста'].tolist()
     # Data validation
@@ -403,12 +404,16 @@ def process_file(uploaded_file):
     for text in df['Выдержки из текста']:
         lemmatized_texts.append(lemmatize_text(text))
 
-    for i, text in enumerate(lemmatized_texts):
-        translated_text = translate(str(text))
-        translated_texts.append(translated_text)
-        progress_bar.progress((i + 1) / len(df))
-        progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
+    #for i, text in enumerate(lemmatized_texts):
+    #    translated_text = translate(str(text))
+    #    translated_texts.append(translated_text)
+    #    progress_bar.progress((i + 1) / len(df))
+    #    progress_text.text(f"{i + 1} из {total_news} сообщений предобработано")
 
+    translated_texts = batch_translate(lemmatized_texts)
+    df['Translated'] = translated_texts
+
+
     # Perform sentiment analysis
     rubert2_results = [get_rubert2_sentiment(text) for text in texts]
     finbert_results = [get_finbert_sentiment(text) for text in translated_texts]
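
Note on the hunk above: progress_bar and progress_text created at the top of process_file are now unused, since batch_translate instantiates its own widgets each iteration. A hypothetical variant (the extra parameters are not in the commit) that updates the existing widgets in place:

    def batch_translate(texts, batch_size=32, progress_bar=None, progress_text=None):
        translator = TranslationModel()
        translated_texts = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            translated_texts.extend(translator.translate(t) for t in batch)
            done = min(i + batch_size, len(texts))
            if progress_bar is not None:
                progress_bar.progress(done / len(texts))  # update in place
            if progress_text is not None:
                progress_text.text(f"Translated {done} of {len(texts)} texts")
        return translated_texts

    # Call site inside process_file:
    translated_texts = batch_translate(lemmatized_texts,
                                       progress_bar=progress_bar,
                                       progress_text=progress_text)
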
@@ -499,7 +504,7 @@ def create_output_file(df, uploaded_file, analysis_df):
     return output
 
 def main():
-    st.title("... приступим к анализу... версия 58")
+    st.title("... приступим к анализу... версия 59")
 
     # Initialize session state
    if 'processed_df' not in st.session_state:
 