awacke1 committed · Commit e9cd015 · Parent(s): 5e5bd97

Update app.py

Files changed (1): app.py +40 -59
app.py CHANGED
@@ -151,67 +151,48 @@ def readitaloud(result):
     components.html(documentHTML5, width=800, height=300)
     #return result
 
-def chat_with_model(prompt, document_section, model_choice='Llama-2-7b-chat-hf'):
-    API_URL = 'https://qe55p8afio98s0u3.us-east-1.aws.endpoints.huggingface.cloud' # Dr Llama
-    #API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf" # HF model for Llama 7B
-    #API_KEY = os.getenv('API_KEY')
-    API_KEY = os.getenv('HF_KEY')
-    MODEL1="meta-llama/Llama-2-7b-chat-hf"
-    MODEL1URL="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf"
-    HF_KEY = os.getenv('HF_KEY')
-    headers = {
-        "Authorization": f"Bearer {HF_KEY}",
-        "Content-Type": "application/json"
-    }
-
-    model = model_choice
-    conversation = [{'role': 'system', 'content': 'You are a python script writer.'}]
-    conversation.append({'role': 'user', 'content': prompt})
-    if len(document_section)>0:
-        conversation.append({'role': 'assistant', 'content': document_section})
-
-    start_time = time.time()
-    st.write('starting at ', start_time)
-    report = []
-    res_box = st.empty()
-
-    collected_chunks = []
-    collected_messages = []
-
-    endpoint_url = API_URL
-    hf_token = API_KEY
-
-    client = InferenceClient(endpoint_url, token=hf_token)
-    gen_kwargs = dict(
-        max_new_tokens=512,
-        top_k=30,
-        top_p=0.9,
-        temperature=0.2,
-        repetition_penalty=1.02,
-        stop_sequences=["\nUser:", "<|endoftext|>", "</s>"] )
-
-    stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)
-    report=[]
-    res_box = st.empty()
-    collected_chunks=[]
-    collected_messages=[]
-    allresults=''
-    for r in stream:
-        if r.token.special:
-            continue
-        if r.token.text in gen_kwargs["stop_sequences"]:
-            break
-        collected_chunks.append(r.token.text)
-        chunk_message = r.token.text
-        collected_messages.append(chunk_message)
+def chat_with_model(prompt, document_section, model_choice='Llama-2-7b-chat-hf'):
     try:
-        report.append(r.token.text)
-        if len(r.token.text) > 0:
-            result="".join(report).strip()
-            res_box.markdown(f'*{result}*')
-
+        endpoint_url = 'https://qe55p8afio98s0u3.us-east-1.aws.endpoints.huggingface.cloud' # Dr Llama
+        hf_token = os.getenv('HF_KEY')
+        client = InferenceClient(endpoint_url, token=hf_token)
+        gen_kwargs = dict(
+            max_new_tokens=512,
+            top_k=30,
+            top_p=0.9,
+            temperature=0.2,
+            repetition_penalty=1.02,
+            stop_sequences=["\nUser:", "<|endoftext|>", "</s>"],
+        )
+
+        stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)
+        report=[]
+        res_box = st.empty()
+        collected_chunks=[]
+        collected_messages=[]
+        allresults=''
+
+        for r in stream:
+            if r.token.special:
+                continue
+            if r.token.text in gen_kwargs["stop_sequences"]:
+                break
+            collected_chunks.append(r.token.text)
+            chunk_message = r.token.text
+            collected_messages.append(chunk_message)
+            try:
+                report.append(r.token.text)
+                if len(r.token.text) > 0:
+                    result="".join(report).strip()
+                    res_box.markdown(f'*{result}*')
+
+            except:
+                st.write('Stream llm issue')
+        SpeechSynthesis(result)
+        return result
     except:
-        st.write('Stream llm issue')
+        st.write('Llama model is asleep. Starting up now on A10 - please give 5 minutes then retry as KEDA scales up from zero to activate running container(s).')
+
     full_reply_content = result
     st.write("Elapsed time:")
     st.write(time.time() - start_time)
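
The net effect of the commit: the old header and conversation bookkeeping is dropped, and the whole streaming call is wrapped in a try/except so that a scaled-to-zero endpoint produces a friendly "asleep, retry in 5 minutes" message instead of a traceback. Below is a minimal self-contained sketch of the same pattern, reusing the endpoint URL, the HF_KEY environment variable, and the gen_kwargs from app.py; the stream_llama helper, its return values, and the demo prompt are illustrative assumptions, and stop_sequences assumes a huggingface_hub version that still accepts that argument.

import os

from huggingface_hub import InferenceClient

ENDPOINT_URL = 'https://qe55p8afio98s0u3.us-east-1.aws.endpoints.huggingface.cloud'  # Dr Llama

def stream_llama(prompt: str) -> str:
    # Hypothetical helper mirroring chat_with_model's streaming loop.
    client = InferenceClient(ENDPOINT_URL, token=os.getenv('HF_KEY'))
    gen_kwargs = dict(
        max_new_tokens=512,
        top_k=30,
        top_p=0.9,
        temperature=0.2,
        repetition_penalty=1.02,
        stop_sequences=["\nUser:", "<|endoftext|>", "</s>"],
    )
    pieces = []
    try:
        # details=True yields per-token records exposing .token.special / .token.text
        for r in client.text_generation(prompt, stream=True, details=True, **gen_kwargs):
            if r.token.special:
                continue  # skip control tokens such as </s>
            if r.token.text in gen_kwargs["stop_sequences"]:
                break  # stop once a stop sequence appears as plain text
            pieces.append(r.token.text)
    except Exception:
        # While KEDA scales the endpoint up from zero, requests fail;
        # report that instead of crashing, as the commit does via st.write.
        return 'Model endpoint is waking up - please retry in a few minutes.'
    return ''.join(pieces).strip()

if __name__ == '__main__':
    print(stream_llama('Write a python script that prints the date.'))

Catching the broad exception at the top of the function is a deliberate UX trade-off: any failure during cold start is reported as the model being asleep rather than surfacing a stack trace in the Streamlit UI.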