import os

import gradio as gr
from huggingface_hub import InferenceClient

# Initialize the inference client.
# NOTE(review): this repo id points at GGUF weights; confirm the model is
# actually servable through the Hugging Face Inference API.
client = InferenceClient(
    "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF",
)


def generate_text(
    prompt: str,
    system_prompt: str = "",
    max_new_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> str:
    """Generate a completion for *prompt* using the module-level client.

    The prompt is wrapped in ChatML markers (``<|im_start|>`` /
    ``<|im_end|>``) before being sent to the text-generation endpoint.

    Args:
        prompt: The user message.
        system_prompt: Optional system message; when empty, no system
            turn is included in the formatted prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The value returned by ``client.text_generation`` (the generated
        text).
    """
    # Build the prompt in the ChatML layout the model expects.
    if system_prompt:
        formatted_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    else:
        formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

    # Call the inference API.
    response = client.text_generation(
        formatted_prompt,
        # UI sliders / JSON API callers may deliver a float; the endpoint
        # expects an integer token count.
        max_new_tokens=int(max_new_tokens),
        temperature=temperature,
        top_p=top_p,
        # `stopping_words` is not a text_generation() parameter and raised
        # TypeError on every call; `stop_sequences` is the supported name
        # (newer huggingface_hub releases also accept `stop`).
        stop_sequences=["<|im_end|>"],
    )

    return response


# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# OpenHermes-2.5-Mistral-7B API")

    with gr.Row():
        with gr.Column():
            system_prompt = gr.Textbox(label="System Prompt (optional)", lines=2)
            prompt = gr.Textbox(label="User Prompt", lines=4)

            with gr.Row():
                max_tokens = gr.Slider(minimum=64, maximum=2048, value=512, step=64, label="Max New Tokens")
                temp = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")

            submit_btn = gr.Button("Generate")

        with gr.Column():
            output = gr.Textbox(label="Generated Output", lines=10)

    # Input order must match generate_text's positional parameters.
    submit_btn.click(
        generate_text,
        inputs=[prompt, system_prompt, max_tokens, temp, top_p],
        outputs=output,
    )

# Enable the request queue and launch the app with a public share link.
demo.queue().launch(share=True)