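"""Control panel for an OpenAI-compatible vLLM API server on Hugging Face Spaces.

Launches a Gradio UI that starts and stops a `vllm serve` subprocess and
streams its logs into the page without blocking the interface.
"""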
import os
import gradio as gr
import subprocess
from subprocess import Popen, PIPE
import logging
import threading
import time
import queue

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
def check_gpu_available():
    """Detect a GPU via nvidia-smi, the Space runtime env var, or PyTorch."""
    try:
        nvidia_smi = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        logger.info(f"nvidia-smi output: {nvidia_smi.stdout}")
        if nvidia_smi.returncode == 0:
            logger.info("GPU detected via nvidia-smi")
            return True
        if os.environ.get("SPACE_RUNTIME_ARCH", "") == "gpu":
            logger.info("GPU detected via environment variable")
            return True
        import torch
        has_gpu = torch.cuda.is_available()
        logger.info(f"GPU detected via PyTorch, device count: {torch.cuda.device_count()}" if has_gpu else "PyTorch did not detect a GPU")
        return has_gpu
    except Exception as e:
        logger.error(f"GPU detection failed: {e}")
        return False
MODEL_NAME = os.environ.get("MODEL_NAME", "zhangchenxu/TinyV-1.5B")
API_PORT = int(os.environ.get("API_PORT", 8000))
GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))
API_KEY = os.environ.get("API_KEY", "token-abc123")  # default API key is token-abc123
USE_TRANSFORMERS_IMPL = os.environ.get("USE_TRANSFORMERS_IMPL", "true").lower() == "true"
ENFORCE_EAGER = os.environ.get("ENFORCE_EAGER", "true").lower() == "true"
HAS_GPU = check_gpu_available()
FORCE_GPU = os.environ.get("FORCE_GPU", "false").lower() == "true"
if FORCE_GPU:
    HAS_GPU = True

vllm_process = None
log_queue = queue.Queue(maxsize=1000)  # bounded queue holding server log lines
log_thread = None
stop_log_thread = False
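# Log-streaming design: a daemon thread drains the vLLM subprocess's output
# into the bounded queue above, so the Gradio callbacks only ever perform
# non-blocking reads and the UI stays responsive however chatty vLLM is.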
def log_reader_thread(process):
    """Background thread that drains the server's output pipe into log_queue."""
    global stop_log_thread
    while not stop_log_thread:
        # stderr is merged into stdout at Popen time, so a single blocking
        # readline() is safe here and cannot starve the other pipe.
        line = process.stdout.readline()
        if line:
            log_queue.put(line.rstrip())
            continue
        # readline() returned "" -> EOF; check whether the process exited
        if process.poll() is not None:
            log_queue.put(f"Process exited with return code {process.poll()}")
            break
        # Brief sleep to reduce CPU usage
        time.sleep(0.1)
def start_vllm_server():
    global vllm_process, log_thread, stop_log_thread
    if vllm_process is not None and vllm_process.poll() is None:
        return "vLLM server is already running"
    os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
    cmd = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", str(API_PORT),
        "--dtype", "auto",
        "--trust-remote-code",
        "--disable-async-output-proc",
        "--api-key", API_KEY,
    ]
    if ENFORCE_EAGER:
        cmd.append("--enforce-eager")
    if USE_TRANSFORMERS_IMPL:
        cmd.extend(["--model-impl", "transformers"])
    if HAS_GPU:
        logger.info("Starting vLLM in GPU mode")
        cmd.extend(["--device", "cuda", "--max-model-len", "2048", "--gpu-memory-utilization", "0.9"])
    else:
        logger.info("Starting vLLM in CPU mode")
        cmd.extend(["--device", "cpu", "--max-model-len", "1024"])
    logger.info(f"Launch command: {' '.join(cmd)}")
    try:
        # Merge stderr into stdout so the log reader only drains one pipe;
        # reading two pipes with blocking readline() from one thread can deadlock.
        vllm_process = Popen(cmd, stdout=PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1)
        # Start the log reader thread
        stop_log_thread = False
        log_thread = threading.Thread(target=log_reader_thread, args=(vllm_process,), daemon=True)
        log_thread.start()
        return "vLLM server started. Please wait for the model to finish loading..."
    except Exception as e:
        logger.error(f"Failed to start vLLM server: {e}")
        return f"Failed to start vLLM server: {e}"
def stop_vllm_server():
    global vllm_process, stop_log_thread
    if vllm_process is None:
        return "vLLM server is not running"
    # Terminate the process first so the log reader sees EOF and can exit,
    # escalating to kill if it does not shut down in time
    vllm_process.terminate()
    try:
        vllm_process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        vllm_process.kill()
    # Stop the log reader thread
    stop_log_thread = True
    if log_thread and log_thread.is_alive():
        log_thread.join(timeout=2)
    vllm_process = None
    return "vLLM server stopped"
def check_server_status():
    if vllm_process is None:
        return "Not running"
    return_code = vllm_process.poll()
    return "Running" if return_code is None else f"Stopped (return code: {return_code})"
def get_server_logs():
    """Fetch queued log lines without blocking the UI."""
    if vllm_process is None:
        return "Server is not running; no logs to display"
    # Pull log lines from the queue
    logs = []
    try:
        # Drain at most 200 lines per refresh to keep the response small
        for _ in range(200):
            if log_queue.empty():
                break
            logs.append(log_queue.get_nowait())
            log_queue.task_done()
    except queue.Empty:
        pass
    if logs:
        return "\n".join(logs)
    # No new logs: report the process state instead
    if vllm_process.poll() is not None:
        return f"Server has stopped, return code: {vllm_process.poll()}"
    return "Server is running; no new logs"
def serve_test_ui():
    with gr.Blocks(title="vLLM OpenAI-Compatible API Server") as demo:
        with gr.Row():
            with gr.Column():
                gr.Markdown("# vLLM OpenAI-Compatible API Control Panel")
                # System information
                gpu_info = "detected" if HAS_GPU else "not detected"
                system_info = f"""
                ## System Information
                - GPU: {gpu_info}
                - Environment: {'Hugging Face Space' if 'SPACE_ID' in os.environ else 'local'}
                - Loaded model: `{MODEL_NAME}`
                - API key: `{API_KEY}`
                """
                gr.Markdown(system_info)
                with gr.Row():
                    start_btn = gr.Button("Start server", variant="primary")
                    stop_btn = gr.Button("Stop server", variant="stop")
                status_text = gr.Textbox(label="Server status", value="Not running", interactive=False)
                refresh_btn = gr.Button("Refresh status")
                logs_text = gr.Textbox(label="Server logs", interactive=False, lines=15)
                logs_refresh_btn = gr.Button("Refresh logs")
                # Advanced options
                with gr.Accordion("Advanced options", open=False):
                    model_input = gr.Textbox(label="Model name", value=MODEL_NAME,
                                             placeholder="Enter a model name, e.g. zhangchenxu/TinyV-1.5B")
                    with gr.Row():
                        force_gpu = gr.Checkbox(label="Force GPU mode", value=FORCE_GPU,
                                                info="Check this if auto-detection fails but you are sure a GPU is available")
                        use_transformers = gr.Checkbox(label="Use Transformers implementation", value=USE_TRANSFORMERS_IMPL,
                                                       info="Use the Transformers backend instead of vLLM's native one; potentially more stable but somewhat slower")
                        enforce_eager = gr.Checkbox(label="Enforce eager mode", value=ENFORCE_EAGER,
                                                    info="Force PyTorch eager mode to avoid CUDA-graph-related issues")
                    apply_btn = gr.Button("Apply settings", variant="primary")
                # API usage guide
                gr.Markdown("## API Usage")
                api_info = gr.Markdown(f"""
                ### Endpoint
                On a Hugging Face Space, the endpoint is your Space URL plus the `/v1` path:
                ```
                https://YOUR_HF_SPACE_URL/v1
                ```
                ### Supported APIs
                1. **Chat Completions API** (`/v1/chat/completions`)
                   - Chat-style generation
                   - Compatible with OpenAI's Chat API
                2. **Completions API** (`/v1/completions`)
                   - Plain text generation
                   - Compatible with OpenAI's Completions API
                ### Python example
                ```python
                from openai import OpenAI

                # Create the client
                client = OpenAI(
                    base_url="https://YOUR_HF_SPACE_URL/v1",
                    api_key="{API_KEY}",
                )

                # Chat completion example
                chat_completion = client.chat.completions.create(
                    model="{MODEL_NAME}",  # must match the model served by vLLM
                    messages=[
                        {{"role": "user", "content": "Hello!"}}
                    ]
                )
                print(chat_completion.choices[0].message.content)

                # Text completion example
                completion = client.completions.create(
                    model="{MODEL_NAME}",
                    prompt="Once upon a time",
                    max_tokens=50
                )
                print(completion.choices[0].text)
                ```
                ### curl example
                ```bash
                # Chat completion
                curl https://YOUR_HF_SPACE_URL/v1/chat/completions \\
                    -H "Content-Type: application/json" \\
                    -H "Authorization: Bearer {API_KEY}" \\
                    -d '{{"model": "{MODEL_NAME}", "messages": [{{"role": "user", "content": "Hello!"}}]}}'

                # Text completion
                curl https://YOUR_HF_SPACE_URL/v1/completions \\
                    -H "Content-Type: application/json" \\
                    -H "Authorization: Bearer {API_KEY}" \\
                    -d '{{"model": "{MODEL_NAME}", "prompt": "Once upon a time", "max_tokens": 50}}'
                ```
                """)
                # Log auto-refresh
                gr.Markdown("## Log Auto-Refresh")
                auto_refresh = gr.Checkbox(label="Enable log auto-refresh", value=False)

        # Wire up the event handlers
        start_btn.click(start_vllm_server, inputs=[], outputs=status_text)
        stop_btn.click(stop_vllm_server, inputs=[], outputs=status_text)
        refresh_btn.click(check_server_status, inputs=[], outputs=status_text)
        logs_refresh_btn.click(get_server_logs, inputs=[], outputs=logs_text)
        # Handler for the advanced options
        def apply_settings(model_name, force_gpu_mode, use_transformers_impl, enforce_eager_mode):
            global MODEL_NAME, FORCE_GPU, HAS_GPU, USE_TRANSFORMERS_IMPL, ENFORCE_EAGER
            changed = []
            if model_name.strip() and model_name.strip() != MODEL_NAME:
                MODEL_NAME = model_name.strip()
                changed.append(f"Model changed to: {MODEL_NAME}")
            if force_gpu_mode != FORCE_GPU:
                FORCE_GPU = force_gpu_mode
                if FORCE_GPU:
                    HAS_GPU = True
                    changed.append("GPU mode force-enabled")
                else:
                    HAS_GPU = check_gpu_available()
                    changed.append(f"Reverted to auto-detection; GPU: {'detected' if HAS_GPU else 'not detected'}")
            if use_transformers_impl != USE_TRANSFORMERS_IMPL:
                USE_TRANSFORMERS_IMPL = use_transformers_impl
                changed.append(f"Transformers implementation: {'enabled' if USE_TRANSFORMERS_IMPL else 'disabled'}")
            if enforce_eager_mode != ENFORCE_EAGER:
                ENFORCE_EAGER = enforce_eager_mode
                changed.append(f"Eager mode: {'enabled' if ENFORCE_EAGER else 'disabled'}")
            if changed:
                return "\n".join(changed) + "\n\nSettings applied. If the server is running, restart it for the changes to take effect."
            return "No settings were changed"
        # Log auto-refresh wiring. The original gr.update(every=1) is not a valid
        # way to change a component's polling rate; instead a gr.Timer
        # (Gradio >= 4.40) re-fetches the logs on each tick, and the checkbox
        # toggles whether the timer is active.
        log_timer = gr.Timer(1, active=False)
        log_timer.tick(get_server_logs, inputs=[], outputs=logs_text)

        apply_btn.click(
            apply_settings,
            inputs=[model_input, force_gpu, use_transformers, enforce_eager],
            outputs=status_text
        )
        auto_refresh.change(
            lambda enabled: gr.Timer(active=enabled),
            inputs=[auto_refresh],
            outputs=[log_timer]
        )
        # Report readiness when the page loads
        demo.load(lambda: f"System ready. GPU: {'detected' if HAS_GPU else 'not detected'}",
                  inputs=[], outputs=status_text)
    return demo
if __name__ == "__main__":
    demo = serve_test_ui()
    # share=True is ignored on Hugging Face Spaces but gives local runs a public link
    demo.queue().launch(server_name="0.0.0.0", server_port=GRADIO_PORT, share=True)
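# Example invocation (hypothetical; assumes this file is the Space's app.py):
#   MODEL_NAME=zhangchenxu/TinyV-1.5B API_KEY=token-abc123 FORCE_GPU=true python app.py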