awacke1 committed on
Commit 0bc666d · 1 Parent(s): 1dec5ae

Update app.py

Files changed (1)
  1. app.py +38 -38
app.py CHANGED
@@ -153,44 +153,44 @@ def StreamLLMChatResponse(prompt):
     API_URL = 'https://qe55p8afio98s0u3.us-east-1.aws.endpoints.huggingface.cloud' # Dr Llama
     API_KEY = os.getenv('API_KEY')

-    try:
-        endpoint_url = API_URL
-        hf_token = API_KEY
-        client = InferenceClient(endpoint_url, token=hf_token)
-        gen_kwargs = dict(
-            max_new_tokens=512,
-            top_k=30,
-            top_p=0.9,
-            temperature=0.2,
-            repetition_penalty=1.02,
-            stop_sequences=["\nUser:", "<|endoftext|>", "</s>"],
-        )
-        stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)
-        report=[]
-        res_box = st.empty()
-        collected_chunks=[]
-        collected_messages=[]
-        allresults=''
-        for r in stream:
-            if r.token.special:
-                continue
-            if r.token.text in gen_kwargs["stop_sequences"]:
-                break
-            collected_chunks.append(r.token.text)
-            chunk_message = r.token.text
-            collected_messages.append(chunk_message)
-            #try:
-            report.append(r.token.text)
-            if len(r.token.text) > 0:
-                result="".join(report).strip()
-                res_box.markdown(f'*{result}*')
-
-            #except:
-                #st.write('Stream llm issue')
-        SpeechSynthesis(result)
-        return result
-    except:
-        st.write('Llama model is asleep. Starting up now on A10 - please give 5 minutes then retry as KEDA scales up from zero to activate running container(s).')
+    #try:
+    endpoint_url = API_URL
+    hf_token = API_KEY
+    client = InferenceClient(endpoint_url, token=hf_token)
+    gen_kwargs = dict(
+        max_new_tokens=512,
+        top_k=30,
+        top_p=0.9,
+        temperature=0.2,
+        repetition_penalty=1.02,
+        stop_sequences=["\nUser:", "<|endoftext|>", "</s>"],
+    )
+    stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)
+    report=[]
+    res_box = st.empty()
+    collected_chunks=[]
+    collected_messages=[]
+    allresults=''
+    for r in stream:
+        if r.token.special:
+            continue
+        if r.token.text in gen_kwargs["stop_sequences"]:
+            break
+        collected_chunks.append(r.token.text)
+        chunk_message = r.token.text
+        collected_messages.append(chunk_message)
+        #try:
+        report.append(r.token.text)
+        if len(r.token.text) > 0:
+            result="".join(report).strip()
+            res_box.markdown(f'*{result}*')
+
+        #except:
+            #st.write('Stream llm issue')
+    SpeechSynthesis(result)
+    return result
+    #except:
+        #st.write('Llama model is asleep. Starting up now on A10 - please give 5 minutes then retry as KEDA scales up from zero to activate running container(s).')
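The commit comments out the outer try/except (and dedents the body accordingly), so the streaming path now runs unguarded. For reference, here is a minimal, self-contained sketch of the pattern this function uses: streaming text_generation from a Hugging Face Inference Endpoint into a Streamlit placeholder. The endpoint URL and HF_TOKEN variable are placeholders rather than values from this repo, and the app-specific SpeechSynthesis call is omitted:

# Sketch of the streaming pattern the diff edits, assuming huggingface_hub
# with InferenceClient.text_generation supporting stop_sequences (as used
# in app.py). ENDPOINT_URL and HF_TOKEN below are placeholder assumptions.
import os
import streamlit as st
from huggingface_hub import InferenceClient

def stream_llm_chat_response(prompt: str) -> str:
    client = InferenceClient(
        "https://<your-endpoint>.endpoints.huggingface.cloud",  # placeholder
        token=os.getenv("HF_TOKEN"),
    )
    gen_kwargs = dict(
        max_new_tokens=512,
        top_k=30,
        top_p=0.9,
        temperature=0.2,
        repetition_penalty=1.02,
        stop_sequences=["\nUser:", "<|endoftext|>", "</s>"],
    )
    # stream=True with details=True yields objects exposing a .token field
    stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)
    res_box = st.empty()  # placeholder re-rendered as tokens arrive
    report = []
    result = ""
    for r in stream:
        if r.token.special:  # skip control tokens such as </s>
            continue
        if r.token.text in gen_kwargs["stop_sequences"]:
            break
        report.append(r.token.text)
        result = "".join(report).strip()
        res_box.markdown(f"*{result}*")  # overwrite with accumulated text
    return result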
 
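The removed except branch covered the cold-start case: the endpoint is scaled to zero by KEDA, so a request against a sleeping endpoint fails while the A10 container spins up, and the old code caught this and asked the user to retry in about five minutes. With the except commented out, such failures now surface as Streamlit error tracebacks. A hedged sketch of one way to keep the graceful message without silently swallowing every error; the attempt count, wait time, and broad Exception catch are illustrative assumptions, not code from app.py:

# Hypothetical retry wrapper around the sketch above; values are assumptions.
import time
import streamlit as st

def stream_with_retry(prompt: str, attempts: int = 3, wait_s: int = 60):
    for attempt in range(1, attempts + 1):
        try:
            return stream_llm_chat_response(prompt)
        except Exception:
            # While KEDA scales the container up from zero, early requests
            # fail; tell the user what is happening and retry after a pause.
            st.write(f"Endpoint waking up (attempt {attempt}/{attempts}); "
                     f"retrying in {wait_s}s...")
            time.sleep(wait_s)
    st.write("Llama endpoint still unavailable - please retry in a few minutes.")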