import os
import gradio as gr
from subprocess import Popen, PIPE
import subprocess
import logging
import threading
import time
import queue

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def check_gpu_available():
    try:
        nvidia_smi = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        logger.info(f"nvidia-smi output: {nvidia_smi.stdout}")
        if nvidia_smi.returncode == 0:
            logger.info("GPU detected via nvidia-smi")
            return True
        if os.environ.get("SPACE_RUNTIME_ARCH", "") == "gpu":
            logger.info("GPU detected via environment variable")
            return True
        import torch
        has_gpu = torch.cuda.is_available()
        logger.info(f"GPU detected via PyTorch, device count: {torch.cuda.device_count()}" if has_gpu else "PyTorch did not detect a GPU")
        return has_gpu
    except Exception as e:
        logger.error(f"GPU detection failed: {str(e)}")
        return False


MODEL_NAME = os.environ.get("MODEL_NAME", "zhangchenxu/TinyV-1.5B")
API_PORT = int(os.environ.get("API_PORT", 8000))
GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))
API_KEY = os.environ.get("API_KEY", "token-abc123")  # default API key is token-abc123
USE_TRANSFORMERS_IMPL = os.environ.get("USE_TRANSFORMERS_IMPL", "true").lower() == "true"
ENFORCE_EAGER = os.environ.get("ENFORCE_EAGER", "true").lower() == "true"
HAS_GPU = check_gpu_available()
FORCE_GPU = os.environ.get("FORCE_GPU", "false").lower() == "true"
if FORCE_GPU:
    HAS_GPU = True

vllm_process = None
log_queue = queue.Queue(maxsize=1000)  # bounded queue holding server log lines
log_thread = None
stop_log_thread = False


def log_reader_thread(process):
    """Thread that asynchronously reads the vLLM process's logs."""
    global stop_log_thread
    while not stop_log_thread:
        # Read stderr
        if process.stderr:
            line = process.stderr.readline()
            if line:
                log_queue.put(f"[ERROR] {line.strip()}")
                continue
        # Read stdout
        if process.stdout:
            line = process.stdout.readline()
            if line:
                log_queue.put(line.strip())
                continue
        # Check whether the process is still running
        if process.poll() is not None:
            log_queue.put(f"Process exited with return code: {process.poll()}")
            break
        # Sleep briefly to reduce CPU usage
        time.sleep(0.1)


def start_vllm_server():
    global vllm_process, log_thread, stop_log_thread
    if vllm_process is not None:
        return "vLLM server is already running"
    os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
    cmd = [
        "vllm", "serve", MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", str(API_PORT),
        "--dtype", "auto",
        "--trust-remote-code",
        "--disable-async-output-proc",
        "--api-key", API_KEY,  # attach the API key
    ]
    if ENFORCE_EAGER:
        cmd.append("--enforce-eager")
    if USE_TRANSFORMERS_IMPL:
        cmd.extend(["--model-impl", "transformers"])
    if HAS_GPU:
        logger.info("Starting vLLM in GPU mode")
        cmd.extend(["--device", "cuda", "--max-model-len", "2048", "--gpu-memory-utilization", "0.9"])
    else:
        logger.info("Starting vLLM in CPU mode")
        cmd.extend(["--device", "cpu", "--max-model-len", "1024"])
    logger.info(f"Launch command: {' '.join(cmd)}")
    try:
        vllm_process = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True, bufsize=1)
        # Start the log reader thread
        stop_log_thread = False
        log_thread = threading.Thread(target=log_reader_thread, args=(vllm_process,))
        log_thread.daemon = True
        log_thread.start()
        return "vLLM server started! Please wait for the model to finish loading..."
    except Exception as e:
        logger.error(f"Error starting vLLM server: {str(e)}")
        return f"Error starting vLLM server: {str(e)}"


def stop_vllm_server():
    global vllm_process, stop_log_thread
    if vllm_process is None:
        return "vLLM server is not running"
    # Stop the log thread
    stop_log_thread = True
    if log_thread and log_thread.is_alive():
        log_thread.join(timeout=2)
    # Terminate the process
    vllm_process.terminate()
    try:
        vllm_process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        vllm_process.kill()
    vllm_process = None
    return "vLLM server stopped"


def check_server_status():
    if vllm_process is None:
        return "Not running"
    return_code = vllm_process.poll()
    return "Running" if return_code is None else f"Stopped (return code: {return_code})"


def get_server_logs():
    """Fetch logs without blocking the UI."""
    if vllm_process is None:
        return "Server is not running; no logs to display"
    # Drain log lines from the queue
    logs = []
    try:
        # Fetch at most 200 lines per refresh to avoid flooding the textbox
        for _ in range(200):
            if log_queue.empty():
                break
            logs.append(log_queue.get_nowait())
            log_queue.task_done()
    except queue.Empty:
        pass
    if logs:
        return "\n".join(logs)
    else:
        # Check the process state
        if vllm_process.poll() is not None:
            return f"Server stopped, return code: {vllm_process.poll()}"
        return "Server is running; no new logs yet"


def serve_test_ui():
    with gr.Blocks(title="vLLM OpenAI-Compatible API Service") as demo:
        with gr.Row():
            with gr.Column():
                gr.Markdown("# vLLM OpenAI-Compatible API Control Panel")
                # System information
                gpu_info = "detected" if HAS_GPU else "not detected"
                system_info = f"""
## System Info
- GPU: {gpu_info}
- Runtime: {'Hugging Face Space' if 'SPACE_ID' in os.environ else 'local environment'}
- Loaded model: `{MODEL_NAME}`
- API key: `{API_KEY}`
"""
                gr.Markdown(system_info)
                with gr.Row():
                    start_btn = gr.Button("Start server", variant="primary")
                    stop_btn = gr.Button("Stop server", variant="stop")
                status_text = gr.Textbox(label="Server status", value="Not running", interactive=False)
                refresh_btn = gr.Button("Refresh status")
                logs_text = gr.Textbox(label="Server logs", interactive=False, lines=15)
                logs_refresh_btn = gr.Button("Refresh logs")
                # Advanced options
                with gr.Accordion("Advanced options", open=False):
                    model_input = gr.Textbox(label="Model name", value=MODEL_NAME, placeholder="Enter a model name, e.g. zhangchenxu/TinyV-1.5B")
                    with gr.Row():
                        force_gpu = gr.Checkbox(label="Force GPU mode", value=FORCE_GPU, info="Check this if auto-detection fails but you are sure a GPU is available")
                        use_transformers = gr.Checkbox(label="Use Transformers implementation", value=USE_TRANSFORMERS_IMPL, info="Use the Transformers implementation instead of vLLM's native one; potentially more stable but slightly slower")
                        enforce_eager = gr.Checkbox(label="Enforce eager mode", value=ENFORCE_EAGER, info="Force PyTorch eager mode to avoid CUDA graph issues")
                    apply_btn = gr.Button("Apply settings", variant="primary")
                # API usage instructions
                gr.Markdown("## API Usage")
                api_info = gr.Markdown(f"""
### Endpoint
On a Hugging Face Space, the API base URL is your Space URL plus the `/v1` path:
```
https://YOUR_HF_SPACE_URL/v1
```
### Supported APIs
1. **Chat Completions API** (`/v1/chat/completions`)
   - Used for chat generation
   - Compatible with OpenAI's Chat API
2. **Completions API** (`/v1/completions`)
   - Used for text generation
   - Compatible with OpenAI's Completions API
### Python example
```python
from openai import OpenAI

# Create the client
client = OpenAI(
    base_url="https://YOUR_HF_SPACE_URL/v1",
    api_key="{API_KEY}",
)

# Chat completion example
chat_completion = client.chat.completions.create(
    model="{MODEL_NAME}",  # must match the model vLLM is serving
    messages=[
        {{"role": "user", "content": "Hello!"}}
    ]
)
print(chat_completion.choices[0].message.content)

# Text completion example
completion = client.completions.create(
    model="{MODEL_NAME}",
    prompt="Once upon a time",
    max_tokens=50
)
print(completion.choices[0].text)
```
### curl examples
```bash
# Chat completion
curl https://YOUR_HF_SPACE_URL/v1/chat/completions \\
  -H "Content-Type: application/json" \\
  -H "Authorization: Bearer {API_KEY}" \\
  -d '{{"model": "{MODEL_NAME}", "messages": [{{"role": "user", "content": "Hello!"}}]}}'

# Text completion
curl https://YOUR_HF_SPACE_URL/v1/completions \\
  -H "Content-Type: application/json" \\
  -H "Authorization: Bearer {API_KEY}" \\
  -d '{{"model": "{MODEL_NAME}", "prompt": "Once upon a time", "max_tokens": 50}}'
```
""")
                # Log auto-refresh
                gr.Markdown("## Log Auto-Refresh")
                auto_refresh = gr.Checkbox(label="Enable log auto-refresh", value=False)
                log_timer = gr.Timer(1, active=False)  # requires Gradio >= 4.40

        # Wire up event handlers
        start_btn.click(start_vllm_server, inputs=[], outputs=status_text)
        stop_btn.click(stop_vllm_server, inputs=[], outputs=status_text)
        refresh_btn.click(check_server_status, inputs=[], outputs=status_text)
        logs_refresh_btn.click(get_server_logs, inputs=[], outputs=logs_text)

        # Event handler for the advanced options
        def apply_settings(model_name, force_gpu_mode, use_transformers_impl, enforce_eager_mode):
            global MODEL_NAME, FORCE_GPU, HAS_GPU, USE_TRANSFORMERS_IMPL, ENFORCE_EAGER
            changed = []
            if model_name.strip() and model_name != MODEL_NAME:
                MODEL_NAME = model_name.strip()
                changed.append(f"Model changed to: {MODEL_NAME}")
            if force_gpu_mode != FORCE_GPU:
                FORCE_GPU = force_gpu_mode
                if FORCE_GPU:
                    HAS_GPU = True
                    changed.append("GPU mode force-enabled")
                else:
                    HAS_GPU = check_gpu_available()
                    changed.append(f"Auto-detection restored, GPU status: {'detected' if HAS_GPU else 'not detected'}")
            if use_transformers_impl != USE_TRANSFORMERS_IMPL:
                USE_TRANSFORMERS_IMPL = use_transformers_impl
                changed.append(f"Transformers implementation: {'enabled' if USE_TRANSFORMERS_IMPL else 'disabled'}")
            if enforce_eager_mode != ENFORCE_EAGER:
                ENFORCE_EAGER = enforce_eager_mode
                changed.append(f"Eager mode: {'enabled' if ENFORCE_EAGER else 'disabled'}")
            if changed:
                return "\n".join(changed) + "\n\nSettings applied. If the server is running, restart it for the changes to take effect."
            else:
                return "No settings were changed"

        def auto_refresh_logs(auto):
            # Toggle the polling timer. The original returned gr.update(every=...),
            # which does not enable polling on a Textbox, so a gr.Timer is used instead.
            return gr.Timer(active=auto)

        apply_btn.click(
            apply_settings,
            inputs=[model_input, force_gpu, use_transformers, enforce_eager],
            outputs=status_text
        )
        # Refresh the logs once per second while the checkbox is enabled
        log_timer.tick(get_server_logs, outputs=logs_text)
        auto_refresh.change(
            auto_refresh_logs,
            inputs=[auto_refresh],
            outputs=[log_timer]
        )

        # Report status automatically on page load
        demo.load(lambda: f"System ready. GPU status: {'detected' if HAS_GPU else 'not detected'}", inputs=[], outputs=status_text)
    return demo


if __name__ == "__main__":
    demo = serve_test_ui()
    demo.queue().launch(server_name="0.0.0.0", server_port=GRADIO_PORT, share=True)
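
# A minimal readiness probe, offered as a standalone sketch rather than wired
# into the UI above: vLLM's OpenAI-compatible server answers GET /v1/models
# once the model has loaded, so polling that endpoint is one way to know when
# "wait for the model to finish loading" is over. The `requests` dependency,
# the helper name, and the 120-second budget are assumptions, not part of the
# original app.
def wait_for_server_ready(timeout_seconds: int = 120) -> bool:
    import requests  # assumed to be installed; not used elsewhere in this app

    deadline = time.time() + timeout_seconds
    url = f"http://127.0.0.1:{API_PORT}/v1/models"
    headers = {"Authorization": f"Bearer {API_KEY}"}
    while time.time() < deadline:
        try:
            # 200 means the server is up and the model list is being served
            if requests.get(url, headers=headers, timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server not accepting connections yet
        time.sleep(2)
    return False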