awacke1 committed · Commit e9cd015 · Parent(s): 5e5bd97

Update app.py

Files changed (1): app.py +40 -59
app.py CHANGED
@@ -151,67 +151,48 @@ def readitaloud(result):
     components.html(documentHTML5, width=800, height=300)
     #return result
 
-def chat_with_model(prompt, document_section, model_choice='Llama-2-7b-chat-hf'):
-    API_URL = 'https://qe55p8afio98s0u3.us-east-1.aws.endpoints.huggingface.cloud' # Dr Llama
-    #API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf" # HF model for Llama 7B
-    #API_KEY = os.getenv('API_KEY')
-    API_KEY = os.getenv('HF_KEY')
-    MODEL1="meta-llama/Llama-2-7b-chat-hf"
-    MODEL1URL="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf"
-    HF_KEY = os.getenv('HF_KEY')
-    headers = {
-        "Authorization": f"Bearer {HF_KEY}",
-        "Content-Type": "application/json"
-    }
-
-    model = model_choice
-    conversation = [{'role': 'system', 'content': 'You are a python script writer.'}]
-    conversation.append({'role': 'user', 'content': prompt})
-    if len(document_section)>0:
-        conversation.append({'role': 'assistant', 'content': document_section})
-
-    start_time = time.time()
-    st.write('starting at ', start_time)
-    report = []
-    res_box = st.empty()
-
-    collected_chunks = []
-    collected_messages = []
-
-    endpoint_url = API_URL
-    hf_token = API_KEY
-
-    client = InferenceClient(endpoint_url, token=hf_token)
-    gen_kwargs = dict(
-        max_new_tokens=512,
-        top_k=30,
-        top_p=0.9,
-        temperature=0.2,
-        repetition_penalty=1.02,
-        stop_sequences=["\nUser:", "<|endoftext|>", "</s>"] )
-
-    stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)
-    report=[]
-    res_box = st.empty()
-    collected_chunks=[]
-    collected_messages=[]
-    allresults=''
-    for r in stream:
-        if r.token.special:
-            continue
-        if r.token.text in gen_kwargs["stop_sequences"]:
-            break
-        collected_chunks.append(r.token.text)
-        chunk_message = r.token.text
-        collected_messages.append(chunk_message)
+def chat_with_model(prompt, document_section, model_choice='Llama-2-7b-chat-hf'):
     try:
-        report.append(r.token.text)
-        if len(r.token.text) > 0:
-            result="".join(report).strip()
-            res_box.markdown(f'*{result}*')
-
+        endpoint_url = 'https://qe55p8afio98s0u3.us-east-1.aws.endpoints.huggingface.cloud' # Dr Llama
+        hf_token = os.getenv('HF_KEY')
+        client = InferenceClient(endpoint_url, token=hf_token)
+        gen_kwargs = dict(
+            max_new_tokens=512,
+            top_k=30,
+            top_p=0.9,
+            temperature=0.2,
+            repetition_penalty=1.02,
+            stop_sequences=["\nUser:", "<|endoftext|>", "</s>"],
+        )
+
+        stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)
+        report=[]
+        res_box = st.empty()
+        collected_chunks=[]
+        collected_messages=[]
+        allresults=''
+
+        for r in stream:
+            if r.token.special:
+                continue
+            if r.token.text in gen_kwargs["stop_sequences"]:
+                break
+            collected_chunks.append(r.token.text)
+            chunk_message = r.token.text
+            collected_messages.append(chunk_message)
+            try:
+                report.append(r.token.text)
+                if len(r.token.text) > 0:
+                    result="".join(report).strip()
+                    res_box.markdown(f'*{result}*')
+
+            except:
+                st.write('Stream llm issue')
+        SpeechSynthesis(result)
+        return result
     except:
-        st.write('Stream llm issue')
+        st.write('Llama model is asleep. Starting up now on A10 - please give 5 minutes then retry as KEDA scales up from zero to activate running container(s).')
+
     full_reply_content = result
     st.write("Elapsed time:")
     st.write(time.time() - start_time)
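
The net effect of the commit: the old header and conversation bookkeeping is dropped, and the whole streaming call is wrapped in a try/except so that a scaled-to-zero endpoint produces a friendly "asleep, retry in 5 minutes" message instead of a traceback. Below is a minimal self-contained sketch of the same pattern, reusing the endpoint URL, the HF_KEY environment variable, and the gen_kwargs from app.py; the stream_llama helper, its return values, and the demo prompt are illustrative assumptions, and stop_sequences assumes a huggingface_hub version that still accepts that argument.

import os

from huggingface_hub import InferenceClient

ENDPOINT_URL = 'https://qe55p8afio98s0u3.us-east-1.aws.endpoints.huggingface.cloud'  # Dr Llama

def stream_llama(prompt: str) -> str:
    # Hypothetical helper mirroring chat_with_model's streaming loop.
    client = InferenceClient(ENDPOINT_URL, token=os.getenv('HF_KEY'))
    gen_kwargs = dict(
        max_new_tokens=512,
        top_k=30,
        top_p=0.9,
        temperature=0.2,
        repetition_penalty=1.02,
        stop_sequences=["\nUser:", "<|endoftext|>", "</s>"],
    )
    pieces = []
    try:
        # details=True yields per-token records exposing .token.special / .token.text
        for r in client.text_generation(prompt, stream=True, details=True, **gen_kwargs):
            if r.token.special:
                continue  # skip control tokens such as </s>
            if r.token.text in gen_kwargs["stop_sequences"]:
                break  # stop once a stop sequence appears as plain text
            pieces.append(r.token.text)
    except Exception:
        # While KEDA scales the endpoint up from zero, requests fail;
        # report that instead of crashing, as the commit does via st.write.
        return 'Model endpoint is waking up - please retry in a few minutes.'
    return ''.join(pieces).strip()

if __name__ == '__main__':
    print(stream_llama('Write a python script that prints the date.'))

Catching the broad exception at the top of the function is a deliberate UX trade-off: any failure during cold start is reported as the model being asleep rather than surfacing a stack trace in the Streamlit UI.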