import os
import gradio as gr

# ------------------------------------------------------------------------------
# Environment and Model/Client Initialization
# ------------------------------------------------------------------------------
try:
    # Running inside Google Colab: load the model locally with PyTorch.
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Performance tweak: let cuDNN auto-tune kernels for the current hardware.
    torch.backends.cudnn.benchmark = True

    model_name = "HuggingFaceH4/zephyr-7b-beta"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=HF_TOKEN,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    if hasattr(torch, "compile"):
        model = torch.compile(model)
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
    inference_mode = "local"

except ImportError:
    # Not in Colab: use the Hugging Face InferenceClient.
    model_name = "HuggingFaceH4/zephyr-7b-beta"
    from huggingface_hub import InferenceClient
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    hf_token = os.getenv("HF_TOKEN", None)
    if hf_token:
        client = InferenceClient(model_name, token=hf_token)
    else:
        client = InferenceClient(model_name)
    inference_mode = "client"


# ------------------------------------------------------------------------------
# SYSTEM PROMPT (PATIENT ROLE)
# ------------------------------------------------------------------------------
nvc_prompt_template = """You are simulating a single patient (and only the patient) seeking support for personal and emotional challenges.
BEHAVIOR INSTRUCTIONS:
- When the conversation starts, respond to the doctor's questions, or raise a concern of your own based on the context below if none has been asked yet.
- You will respond ONLY as this patient.
- You will speak in the first person about your own situations, feelings, and worries.
- You will NOT provide counseling or solutions—your role is to share feelings, concerns, and perspectives.
- You are experiencing a range of ongoing issues that are causing you frustration, sadness, and uncertainty. These issues include:
    - Problems with noisy neighbors, causing you extreme annoyance.
    - Worries about your career, feeling you're not progressing and like a failure while others advance.
    - Arguments with your partner about finances, feeling your partner isn't contributing fairly.
    - Feeling left out and not valued at work, as your manager doesn't involve you in new initiatives.
    - Feeling unsafe, judged, and ignored in learning environments such as a classroom.
    - A recent positive experience of help from a stranger that made you feel supported and grateful.
    - Difficulties and sadness related to conflict among friends, seeing your friend group fall apart and wanting reconciliation but not knowing how.
    - An overall feeling of sadness and of not knowing what to do, unsure whether to act on these situations or let them go.
- When asked, elaborate on these issues and your feelings related to them. You can invent specific details and scenarios within these themes to make your experiences vivid and realistic.
- Continue to speak from this patient's perspective throughout the conversation.
- Keep your responses concise, aiming for a maximum of {max_response_words} words.
Begin by sharing your present feelings or challenges from a patient’s point of view. You may do so in one or two brief sentences."""

# ------------------------------------------------------------------------------
# Utility Functions
# ------------------------------------------------------------------------------
def build_prompt(history: list[tuple[str, str]], system_message: str, message: str, max_response_words: int) -> str:
    """
    Build a text prompt (for local inference) that starts with the system message,
    includes conversation history with "Doctor:" and "Patient:" labels,
    and ends with a new "Doctor:" line prompting the patient.
    """
    prompt = system_message.format(max_response_words=max_response_words) + "\n"
    for user_msg, assistant_msg in history:
        prompt += f"Doctor: {user_msg}\n"
        if assistant_msg:
            prompt += f"Patient: {assistant_msg}\n"
    prompt += f"Doctor: {message}\nPatient: "
    return prompt
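# Illustrative sketch (comments only, using a made-up exchange): for
# history=[("How have you been sleeping?", "Not well, I keep replaying arguments.")]
# and message="What worries you most right now?", build_prompt returns roughly:
#
#   <system message formatted with max_response_words>
#   Doctor: How have you been sleeping?
#   Patient: Not well, I keep replaying arguments.
#   Doctor: What worries you most right now?
#   Patient: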

def build_messages(history: list[tuple[str, str]], system_message: str, message: str, max_response_words: int):
    """
    Build a messages list (for InferenceClient) using OpenAI-style formatting.
    """
    formatted_system_message = system_message.format(max_response_words=max_response_words)
    messages = [{"role": "system", "content": formatted_system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": f"Doctor: {user_msg}"})
        if assistant_msg:
            messages.append({"role": "assistant", "content": f"Patient: {assistant_msg}"})
    messages.append({"role": "user", "content": f"Doctor: {message}\nPatient:"})
    return messages
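# Illustrative sketch (comments only): for the same made-up exchange, build_messages
# returns an OpenAI-style list along the lines of:
#   [{"role": "system", "content": "<formatted system message>"},
#    {"role": "user", "content": "Doctor: How have you been sleeping?"},
#    {"role": "assistant", "content": "Patient: Not well, I keep replaying arguments."},
#    {"role": "user", "content": "Doctor: What worries you most right now?\nPatient:"}]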

def truncate_response(text: str, max_words: int) -> str:
    """
    Truncate the response text to the specified maximum number of words.
    """
    words = text.split()
    if len(words) > max_words:
        return " ".join(words[:max_words]) + "..."
    return text
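# Example (word-based, not token-based):
#   truncate_response("one two three four five", 3) -> "one two three..."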

# ------------------------------------------------------------------------------
# Response Function
# ------------------------------------------------------------------------------
def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    max_response_words: int,
):
    """
    Generate a response. For local inference, use model.generate() on a prompt.
    For non-local inference, use client.chat_completion() with streaming tokens.
    """
    if inference_mode == "local":
        prompt = build_prompt(history, system_message, message, max_response_words)
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )
        # Decode only the newly generated tokens, i.e. everything after the prompt.
        generated_ids = output_ids[0][input_ids.shape[-1]:]
        generated_response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        final_response = truncate_response(generated_response, max_response_words)
        return final_response

    else:
        messages = build_messages(history, system_message, message, max_response_words)
        response = ""
        try:
            # Generate response using streaming chat_completion
            for chunk in client.chat_completion(
                messages,
                max_tokens=max_tokens,
                stream=True,
                temperature=temperature,
                top_p=top_p,
            ):
                # delta.content may be None on the final streamed chunk.
                token = chunk.choices[0].delta.content or ""
                response += token
            truncated_response = truncate_response(response, max_response_words)
            return truncated_response
        except Exception as e:
            print(f"An error occurred: {e}")
            return "I'm sorry, I encountered an error. Please try again."

# ------------------------------------------------------------------------------
# Optional Initial Message and Gradio Interface
# ------------------------------------------------------------------------------
# An example opening question from the doctor's side; kept for reference but not
# currently passed to the ChatInterface below.
initial_user_message = (
    "I’m sorry you’ve been feeling overwhelmed. Could you tell me more about your arguments with your partner and how that’s affecting you?"
)

demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value=nvc_prompt_template, label="System message", visible=True),
        gr.Slider(minimum=1, maximum=2048, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=10, maximum=200, value=100, step=10, label="Max response words"),
    ],
    title="Patient Interview Practice Chatbot",
    description="Simulate a patient interview. You (the user) act as the doctor, and the chatbot replies with the patient's perspective only.",
)
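# Note: gr.ChatInterface passes each additional input to `respond` positionally,
# after (message, history), in the order listed above:
# system_message, max_tokens, temperature, top_p, max_response_words.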

if __name__ == "__main__":
    demo.launch()