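"""Gradio chat demo: an instruction-tuned LLM (Qwen2.5-0.5B-Instruct via a transformers
pipeline) whose replies are spoken in the browser by Matcha-TTS on ONNX Runtime Web."""
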
import spaces
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr

text_generator = None
is_hugging_face = False  # set True when running as a ZeroGPU Space (pipeline is re-created inside @spaces.GPU calls)

# Shared between init() and generate_text().
model_id = None
tokenizer = None
huggingface_token = None
device = None
dtype = None

def init():
    global text_generator, model_id, tokenizer, huggingface_token, device, dtype
    huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
    if not huggingface_token:
        print("HUGGINGFACE_TOKEN is not set; add it as a secret if the model requires authentication")
        #raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

    #model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    #model_id = "google/gemma-2b"
    model_id = "Qwen/Qwen2.5-0.5B-Instruct"

    device = "auto"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.bfloat16

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)
    print(model_id, device, dtype)

    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
    )
    # The pipeline object has no .to(device); placement is handled by device_map.
    text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=dtype, device_map=device)

    if not is_hugging_face:
        if next(model.parameters()).is_cuda:
            print("The model is on a GPU")
        else:
            print("The model is on a CPU")

        if str(text_generator.device).startswith("cuda"):
            print("The pipeline is using a GPU")
        else:
            print("The pipeline is using a CPU")

    print("initialized")

@spaces.GPU
def generate_text(messages):
    global text_generator
    if is_hugging_face:  # ZeroGPU needs the model re-initialized inside the GPU-decorated call
        model = AutoModelForCausalLM.from_pretrained(
            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
        )
        text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=dtype, device_map=device)
    result = text_generator(messages, max_new_tokens=32, do_sample=True, temperature=0.7)

    generated_output = result[0]["generated_text"]
    if isinstance(generated_output, list):
        # Chat-format output: return the content of the last assistant turn.
        for message in reversed(generated_output):
            if message.get("role") == "assistant":
                return message.get("content", "No content found.")
        return "No assistant response found."
    else:
        return "Unexpected output format."



def call_generate_text(message, history):
    # Seed the conversation with a short system prompt on the first turn.
    if len(history) == 0:
        history.append({"role": "system", "content": "Respond in about 10 words."})
    print(message)
    print(history)

    messages = history + [{"role": "user", "content": message}]
    try:
        text = generate_text(messages)
        messages += [{"role": "assistant", "content": text}]
        return "", messages
    except RuntimeError as e:
        print(f"An unexpected error occurred: {e}")

    return "", history



head = '''
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.webgpu.min.js" ></script>
    
    <script type="module">
        
        import { MatchaTTSRaw } from "https://akjava.github.io/Matcha-TTS-Japanese/js-esm/matcha_tts_raw.js";
        import { webWavPlay } from "https://akjava.github.io/Matcha-TTS-Japanese/js-esm/web_wav_play.js";
        import { arpa_to_ipa } from "https://akjava.github.io/Matcha-TTS-Japanese/js-esm/arpa_to_ipa.js";
        import { loadCmudict } from "https://akjava.github.io/Matcha-TTS-Japanese/js-esm/cmudict_loader.js";
        import { env,textToArpa} from "https://akjava.github.io/Matcha-TTS-Japanese/js-esm/text_to_arpa.js";
        
        env.allowLocalModels = true;
        env.localModelPath = "https://akjava.github.io/Matcha-TTS-Japanese/models/";
        env.backends.onnx.logLevel = "fatal";
        
        let matcha_tts_raw;
        let cmudict ={};
        let speaking = false;
        async function main(text, speed=1.0, temperature=0.5, spk=0) {
            console.log(text)
            if (speaking){
                console.log("already speaking, skipping")
                return
            }
            speaking = true
            console.log("main called")
            if(!matcha_tts_raw){
                matcha_tts_raw = new MatchaTTSRaw()
                console.time("load model");
                await matcha_tts_raw.load_model('https://huggingface.co./spaces/Akjava/matcha-tts-onnx-benchmarks/resolve/main/models/matcha-tts/ljspeech_sim.onnx',{ executionProviders: ['webgpu','wasm'] });
                
                console.timeEnd("load model");
                
                let cmudictReady = loadCmudict(cmudict,'https://akjava.github.io/Matcha-TTS-Japanese/dictionaries/cmudict-0.7b')
                await cmudictReady
            }else{
                console.log("session exist skip load model")
            }
           
            const arpa_text = await textToArpa(cmudict,text)
            const ipa_text = arpa_to_ipa(arpa_text).replace(/\s/g, "");
            console.log(ipa_text)
            const spks = spk // speaker id; the LJSpeech model is single-speaker, so 0
            
            console.time("infer");
            const result = await matcha_tts_raw.infer(ipa_text, temperature, speed, spks);
            
            if (result!=null){
                console.timeEnd("infer");
                webWavPlay(result)
               
            }
    
            speaking = false
        }
        window.MatchaTTSEn = main
        console.log(MatchaTTSRaw)
</script>
'''
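# The `js` callback below runs in the browser after every chatbot update and calls
# window.MatchaTTSEn, defined by the <script> injected via `head`, to speak the
# newest message aloud.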
with gr.Blocks(title="LLM with TTS", head=head) as demo:
    gr.Markdown("## Please be patient: the first response may take up to 20 seconds while the models load.")
    gr.Markdown("**Qwen2.5-0.5B-Instruct / LJSpeech**. The LLM and TTS models may change without notice.")
    js = """
    function(chatbot){
        if (!chatbot || chatbot.length === 0){ return; }
        const text = chatbot[chatbot.length - 1]["content"];
        window.MatchaTTSEn(text);
    }
    """
    chatbot = gr.Chatbot(type="messages")
    chatbot.change(None, [chatbot], [], js=js)
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    msg.submit(call_generate_text, [msg, chatbot], [msg, chatbot])

if __name__ == "__main__":
    init()
    demo.launch(share=True)
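
# A minimal sketch for running this locally (assumes the file is saved as app.py and
# that the HF `spaces` package is installed even outside of Spaces; adjust as needed):
#   pip install torch transformers gradio spaces
#   HUGGINGFACE_TOKEN=<your token> python app.py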