# NanoV / app.py
import os
import gradio as gr
from subprocess import Popen, PIPE
import subprocess
import logging
import threading
import time
import queue
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def check_gpu_available():
    try:
        nvidia_smi = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        logger.info(f"nvidia-smi output: {nvidia_smi.stdout}")
        if nvidia_smi.returncode == 0:
            logger.info("GPU detected via nvidia-smi")
            return True
        if os.environ.get("SPACE_RUNTIME_ARCH", "") == "gpu":
            logger.info("GPU detected via environment variable")
            return True
        import torch
        has_gpu = torch.cuda.is_available()
        logger.info(f"GPU detected via PyTorch, device count: {torch.cuda.device_count()}" if has_gpu else "PyTorch did not detect a GPU")
        return has_gpu
    except Exception as e:
        logger.error(f"GPU detection failed: {str(e)}")
        return False
MODEL_NAME = os.environ.get("MODEL_NAME", "zhangchenxu/TinyV-1.5B")
API_PORT = int(os.environ.get("API_PORT", 8000))
GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))
API_KEY = os.environ.get("API_KEY", "token-abc123")  # default API key is token-abc123
USE_TRANSFORMERS_IMPL = os.environ.get("USE_TRANSFORMERS_IMPL", "true").lower() == "true"
ENFORCE_EAGER = os.environ.get("ENFORCE_EAGER", "true").lower() == "true"
HAS_GPU = check_gpu_available()
FORCE_GPU = os.environ.get("FORCE_GPU", "false").lower() == "true"
if FORCE_GPU:
    HAS_GPU = True
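# Example launch with environment overrides (all of the variables above are optional):
#   MODEL_NAME=zhangchenxu/TinyV-1.5B API_KEY=my-secret FORCE_GPU=true python app.py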
vllm_process = None
log_queue = queue.Queue(maxsize=1000)  # queue buffering log lines from the vLLM process
log_thread = None
stop_log_thread = False
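# Design note: the vLLM subprocess writes logs to its stdout/stderr pipes; a
# background thread drains them into log_queue so the Gradio UI can poll for
# new lines without blocking on the process itself.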
def log_reader_thread(process):
    """Thread that asynchronously reads the subprocess's logs."""
    global stop_log_thread
    while not stop_log_thread:
        # Read stderr. Note: readline() can block if the pipe is open but quiet.
        if process.stderr:
            line = process.stderr.readline()
            if line:
                log_queue.put(f"[ERROR] {line.strip()}")
                continue
        # Read stdout
        if process.stdout:
            line = process.stdout.readline()
            if line:
                log_queue.put(line.strip())
                continue
        # Check whether the process is still running
        if process.poll() is not None:
            log_queue.put(f"Process exited with return code: {process.poll()}")
            break
        # Sleep briefly to reduce CPU usage
        time.sleep(0.1)
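# Because readline() blocks per call, the loop above can starve stdout while
# stderr stays quiet. A common alternative, sketched below but not wired into
# this app, is one daemon reader thread per pipe, each free to block on its
# own stream:
def pump_stream(stream, prefix=""):
    """Drain one pipe into log_queue; run one daemon thread per stream, e.g.
    threading.Thread(target=pump_stream, args=(proc.stderr, "[ERROR] "), daemon=True).start()
    """
    for line in iter(stream.readline, ""):
        log_queue.put(f"{prefix}{line.strip()}")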
def start_vllm_server():
    global vllm_process, log_thread, stop_log_thread
    if vllm_process is not None:
        return "vLLM server is already running"
    os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
    cmd = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", str(API_PORT),
        "--dtype", "auto",
        "--trust-remote-code",
        "--disable-async-output-proc",
        "--api-key", API_KEY,  # pass the API key
    ]
    if ENFORCE_EAGER:
        cmd.append("--enforce-eager")
    if USE_TRANSFORMERS_IMPL:
        cmd.extend(["--model-impl", "transformers"])
    if HAS_GPU:
        logger.info("Starting vLLM in GPU mode")
        cmd.extend(["--device", "cuda", "--max-model-len", "2048", "--gpu-memory-utilization", "0.9"])
    else:
        logger.info("Starting vLLM in CPU mode")
        cmd.extend(["--device", "cpu", "--max-model-len", "1024"])
    logger.info(f"Launch command: {' '.join(cmd)}")
    try:
        vllm_process = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True, bufsize=1)
        # Start the log reader thread
        stop_log_thread = False
        log_thread = threading.Thread(target=log_reader_thread, args=(vllm_process,))
        log_thread.daemon = True
        log_thread.start()
        return "vLLM server started! Please wait for the model to finish loading..."
    except Exception as e:
        logger.error(f"Error starting vLLM server: {str(e)}")
        return f"Error starting vLLM server: {str(e)}"
def stop_vllm_server():
    global vllm_process, stop_log_thread
    if vllm_process is None:
        return "vLLM server is not running"
    # Stop the log thread
    stop_log_thread = True
    if log_thread and log_thread.is_alive():
        log_thread.join(timeout=2)
    # Terminate the process
    vllm_process.terminate()
    try:
        vllm_process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        vllm_process.kill()
    vllm_process = None
    return "vLLM server stopped"
def check_server_status():
    if vllm_process is None:
        return "Not running"
    return_code = vllm_process.poll()
    return "Running" if return_code is None else f"Stopped (return code: {return_code})"
def get_server_logs():
    """Fetch logs without blocking the UI."""
    if vllm_process is None:
        return "Server not running; no logs to display"
    # Pull log lines off the queue
    logs = []
    try:
        # Fetch at most 200 lines to avoid flooding the UI
        for _ in range(200):
            if log_queue.empty():
                break
            logs.append(log_queue.get_nowait())
            log_queue.task_done()
    except queue.Empty:
        pass
    if logs:
        return "\n".join(logs)
    else:
        # Check the process status
        if vllm_process.poll() is not None:
            return f"Server stopped, return code: {vllm_process.poll()}"
        return "Server is running; no new logs yet"
def serve_test_ui():
    with gr.Blocks(title="vLLM OpenAI-Compatible API Server") as demo:
        with gr.Row():
            with gr.Column():
                gr.Markdown("# vLLM OpenAI-Compatible API Server Control Panel")
                # System info
                gpu_info = "detected" if HAS_GPU else "not detected"
                system_info = f"""
## System Info
- GPU: {gpu_info}
- Runtime: {'Hugging Face Space' if 'SPACE_ID' in os.environ else 'local environment'}
- Loaded model: `{MODEL_NAME}`
- API key: `{API_KEY}`
"""
                gr.Markdown(system_info)
                with gr.Row():
                    start_btn = gr.Button("Start Server", variant="primary")
                    stop_btn = gr.Button("Stop Server", variant="stop")
                status_text = gr.Textbox(label="Server Status", value="Not running", interactive=False)
                refresh_btn = gr.Button("Refresh Status")
                logs_text = gr.Textbox(label="Server Logs", interactive=False, lines=15)
                logs_refresh_btn = gr.Button("Refresh Logs")
                # Advanced options
                with gr.Accordion("Advanced Options", open=False):
                    model_input = gr.Textbox(label="Model Name", value=MODEL_NAME,
                                             placeholder="Enter a model name, e.g. zhangchenxu/TinyV-1.5B")
                    with gr.Row():
                        force_gpu = gr.Checkbox(label="Force GPU mode", value=FORCE_GPU,
                                                info="Check this if auto-detection fails but you are sure a GPU is available")
                        use_transformers = gr.Checkbox(label="Use Transformers implementation", value=USE_TRANSFORMERS_IMPL,
                                                       info="Use the Transformers implementation instead of vLLM's native one; possibly more stable but slightly slower")
                        enforce_eager = gr.Checkbox(label="Enforce eager mode", value=ENFORCE_EAGER,
                                                    info="Force PyTorch eager mode to avoid CUDA-graph-related issues")
                    apply_btn = gr.Button("Apply Settings", variant="primary")
                # API usage instructions
                gr.Markdown("## API Usage")
                api_info = gr.Markdown(f"""
### Endpoint
On a Hugging Face Space, the endpoint is your Space URL plus the `/v1` path:
```
https://YOUR_HF_SPACE_URL/v1
```
### Supported APIs
1. **Chat Completions API** (`/v1/chat/completions`)
   - For chat-style generation
   - Compatible with OpenAI's Chat API
2. **Completions API** (`/v1/completions`)
   - For plain text generation
   - Compatible with OpenAI's Completions API
### Python Example
```python
from openai import OpenAI
# Create the client
client = OpenAI(
    base_url="https://YOUR_HF_SPACE_URL/v1",
    api_key="{API_KEY}",
)
# Chat completion example
chat_completion = client.chat.completions.create(
    model="{MODEL_NAME}",  # must match the served model name
    messages=[
        {{"role": "user", "content": "Hello!"}}
    ]
)
print(chat_completion.choices[0].message.content)
# Text completion example
completion = client.completions.create(
    model="{MODEL_NAME}",
    prompt="Once upon a time",
    max_tokens=50
)
print(completion.choices[0].text)
```
### curl Example
```bash
# Chat completion
curl https://YOUR_HF_SPACE_URL/v1/chat/completions \\
  -H "Content-Type: application/json" \\
  -H "Authorization: Bearer {API_KEY}" \\
  -d '{{"model": "{MODEL_NAME}", "messages": [{{"role": "user", "content": "Hello!"}}]}}'
# Text completion
curl https://YOUR_HF_SPACE_URL/v1/completions \\
  -H "Content-Type: application/json" \\
  -H "Authorization: Bearer {API_KEY}" \\
  -d '{{"model": "{MODEL_NAME}", "prompt": "Once upon a time", "max_tokens": 50}}'
```
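### Streaming Example
The Chat Completions endpoint also supports streaming. A minimal sketch,
assuming the same client as above (vLLM serves OpenAI-style streaming by
default):
```python
stream = client.chat.completions.create(
    model="{MODEL_NAME}",
    messages=[{{"role": "user", "content": "Hello!"}}],
    stream=True,
)
# Each chunk carries an incremental delta of the reply
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```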
""")
                # Auto-refresh logs
                gr.Markdown("## Log Auto-Refresh")
                auto_refresh = gr.Checkbox(label="Enable log auto-refresh", value=False)
                # Timer driving the periodic log refresh; inactive until the checkbox is ticked
                log_timer = gr.Timer(1, active=False)
        # Wire up event handlers
        start_btn.click(start_vllm_server, inputs=[], outputs=status_text)
        stop_btn.click(stop_vllm_server, inputs=[], outputs=status_text)
        refresh_btn.click(check_server_status, inputs=[], outputs=status_text)
        logs_refresh_btn.click(get_server_logs, inputs=[], outputs=logs_text)
        # Event handler for the advanced options
        def apply_settings(model_name, force_gpu_mode, use_transformers_impl, enforce_eager_mode):
            global MODEL_NAME, FORCE_GPU, HAS_GPU, USE_TRANSFORMERS_IMPL, ENFORCE_EAGER
            changed = []
            if model_name.strip() and model_name != MODEL_NAME:
                MODEL_NAME = model_name.strip()
                changed.append(f"Model changed to: {MODEL_NAME}")
            if force_gpu_mode != FORCE_GPU:
                FORCE_GPU = force_gpu_mode
                if FORCE_GPU:
                    HAS_GPU = True
                    changed.append("GPU mode force-enabled")
                else:
                    HAS_GPU = check_gpu_available()
                    changed.append(f"Reverted to auto-detection; GPU status: {'detected' if HAS_GPU else 'not detected'}")
            if use_transformers_impl != USE_TRANSFORMERS_IMPL:
                USE_TRANSFORMERS_IMPL = use_transformers_impl
                changed.append(f"Transformers implementation: {'enabled' if USE_TRANSFORMERS_IMPL else 'disabled'}")
            if enforce_eager_mode != ENFORCE_EAGER:
                ENFORCE_EAGER = enforce_eager_mode
                changed.append(f"Eager mode: {'enabled' if ENFORCE_EAGER else 'disabled'}")
            if changed:
                return "\n".join(changed) + "\n\nSettings applied. If the server is running, restart it for the changes to take effect."
            else:
                return "No settings were changed"
        def toggle_auto_refresh(auto):
            # Activate or deactivate the log-refresh timer to match the checkbox
            return gr.Timer(active=auto)
        apply_btn.click(
            apply_settings,
            inputs=[model_input, force_gpu, use_transformers, enforce_eager],
            outputs=status_text
        )
        auto_refresh.change(
            toggle_auto_refresh,
            inputs=[auto_refresh],
            outputs=[log_timer]
        )
        # Refresh the log view on every timer tick
        log_timer.tick(get_server_logs, outputs=logs_text)
        # Run an initial status check when the page loads
        demo.load(lambda: f"System ready. GPU status: {'detected' if HAS_GPU else 'not detected'}",
                  inputs=[], outputs=status_text)
    return demo
if __name__ == "__main__":
    demo = serve_test_ui()
    demo.queue().launch(server_name="0.0.0.0", server_port=GRADIO_PORT, share=True)