import os
import gradio as gr
from subprocess import Popen, PIPE
import subprocess
import logging
import threading
import time
import queue
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def check_gpu_available():
    try:
        # Try nvidia-smi first; if the binary is missing, fall through to the
        # other checks instead of aborting.
        try:
            nvidia_smi = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            logger.info(f"nvidia-smi output: {nvidia_smi.stdout}")
            if nvidia_smi.returncode == 0:
                logger.info("GPU detected via nvidia-smi")
                return True
        except FileNotFoundError:
            logger.info("nvidia-smi not found, falling back to other checks")
        if os.environ.get("SPACE_RUNTIME_ARCH", "") == "gpu":
            logger.info("GPU detected via the SPACE_RUNTIME_ARCH environment variable")
            return True
        import torch
        has_gpu = torch.cuda.is_available()
        logger.info(f"GPU detected via PyTorch, device count: {torch.cuda.device_count()}" if has_gpu else "PyTorch did not detect a GPU")
        return has_gpu
    except Exception as e:
        logger.error(f"GPU detection failed: {str(e)}")
        return False
MODEL_NAME = os.environ.get("MODEL_NAME", "zhangchenxu/TinyV-1.5B")
API_PORT = int(os.environ.get("API_PORT", 8000))
GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))
API_KEY = os.environ.get("API_KEY", "token-abc123")  # default API key is token-abc123
USE_TRANSFORMERS_IMPL = os.environ.get("USE_TRANSFORMERS_IMPL", "true").lower() == "true"
ENFORCE_EAGER = os.environ.get("ENFORCE_EAGER", "true").lower() == "true"
HAS_GPU = check_gpu_available()
FORCE_GPU = os.environ.get("FORCE_GPU", "false").lower() == "true"
if FORCE_GPU:
    HAS_GPU = True
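# All configuration above is driven by environment variables; on a Space they
# can be set under Settings -> Variables and secrets. For example (hypothetical
# values, matching the defaults used in this file):
#   MODEL_NAME=zhangchenxu/TinyV-1.5B
#   API_KEY=token-abc123
#   FORCE_GPU=true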
vllm_process = None
log_queue = queue.Queue(maxsize=1000)  # queue buffering log lines for the UI
log_thread = None
stop_log_thread = False
def log_reader_thread(process):
    """Thread that reads the subprocess's logs asynchronously."""
    global stop_log_thread
    while not stop_log_thread:
        # Read stderr (note: readline() blocks until a line or EOF arrives)
        if process.stderr:
            line = process.stderr.readline()
            if line:
                log_queue.put(f"[ERROR] {line.strip()}")
                continue
        # Read stdout
        if process.stdout:
            line = process.stdout.readline()
            if line:
                log_queue.put(line.strip())
                continue
        # Check whether the process is still running
        if process.poll() is not None:
            log_queue.put(f"Process exited with return code: {process.poll()}")
            break
        # Sleep briefly to reduce CPU usage
        time.sleep(0.1)
def start_vllm_server():
    global vllm_process, log_thread, stop_log_thread
    if vllm_process is not None:
        return "The vLLM server is already running"
    os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
    cmd = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", str(API_PORT),
        "--dtype", "auto",
        "--trust-remote-code",
        "--disable-async-output-proc",
        "--api-key", API_KEY,  # attach the API key
    ]
    if ENFORCE_EAGER:
        cmd.append("--enforce-eager")
    if USE_TRANSFORMERS_IMPL:
        cmd.extend(["--model-impl", "transformers"])
    if HAS_GPU:
        logger.info("Starting vLLM in GPU mode")
        cmd.extend(["--device", "cuda", "--max-model-len", "2048", "--gpu-memory-utilization", "0.9"])
    else:
        logger.info("Starting vLLM in CPU mode")
        cmd.extend(["--device", "cpu", "--max-model-len", "1024"])
    logger.info(f"Launch command: {' '.join(cmd)}")
    try:
        vllm_process = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True, bufsize=1)
        # Start the log reader thread
        stop_log_thread = False
        log_thread = threading.Thread(target=log_reader_thread, args=(vllm_process,))
        log_thread.daemon = True
        log_thread.start()
        return "vLLM server started! Please wait for the model to finish loading..."
    except Exception as e:
        logger.error(f"Error starting the vLLM server: {str(e)}")
        return f"Error starting the vLLM server: {str(e)}"
def stop_vllm_server():
    global vllm_process, stop_log_thread
    if vllm_process is None:
        return "The vLLM server is not running"
    # Stop the log reader thread
    stop_log_thread = True
    if log_thread and log_thread.is_alive():
        log_thread.join(timeout=2)
    # Terminate the process
    vllm_process.terminate()
    try:
        vllm_process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        vllm_process.kill()
    vllm_process = None
    return "vLLM server stopped"
def check_server_status():
    if vllm_process is None:
        return "Not running"
    return_code = vllm_process.poll()
    return "Running" if return_code is None else f"Stopped (return code: {return_code})"
def get_server_logs():
    """Fetch logs without blocking the UI."""
    if vllm_process is None:
        return "Server not running, no logs to show"
    # Drain pending lines from the queue
    logs = []
    try:
        # Fetch at most 200 lines per refresh to keep the output manageable
        for _ in range(200):
            if log_queue.empty():
                break
            logs.append(log_queue.get_nowait())
            log_queue.task_done()
    except queue.Empty:
        pass
    if logs:
        return "\n".join(logs)
    else:
        # Check the process state
        if vllm_process.poll() is not None:
            return f"Server stopped, return code: {vllm_process.poll()}"
        return "Server is running, no new logs yet"
def serve_test_ui():
    with gr.Blocks(title="vLLM OpenAI-Compatible API Service") as demo:
        with gr.Row():
            with gr.Column():
                gr.Markdown("# vLLM OpenAI-Compatible API Control Panel")
                # System information
                gpu_info = "detected" if HAS_GPU else "not detected"
                system_info = f"""
## System Info
- GPU: {gpu_info}
- Environment: {'Hugging Face Space' if 'SPACE_ID' in os.environ else 'local'}
- Loaded model: `{MODEL_NAME}`
- API key: `{API_KEY}`
"""
                gr.Markdown(system_info)
                with gr.Row():
                    start_btn = gr.Button("Start server", variant="primary")
                    stop_btn = gr.Button("Stop server", variant="stop")
                status_text = gr.Textbox(label="Server status", value="Not running", interactive=False)
                refresh_btn = gr.Button("Refresh status")
                logs_text = gr.Textbox(label="Server logs", interactive=False, lines=15)
                logs_refresh_btn = gr.Button("Refresh logs")
                # Advanced options
                with gr.Accordion("Advanced options", open=False):
                    model_input = gr.Textbox(label="Model name", value=MODEL_NAME,
                                             placeholder="Enter a model name, e.g. zhangchenxu/TinyV-1.5B")
                    with gr.Row():
                        force_gpu = gr.Checkbox(label="Force GPU mode", value=FORCE_GPU,
                                                info="Check this if auto-detection fails but you are sure a GPU is present")
                        use_transformers = gr.Checkbox(label="Use Transformers implementation", value=USE_TRANSFORMERS_IMPL,
                                                       info="Use the Transformers implementation instead of vLLM's native one; possibly more stable but slightly slower")
                        enforce_eager = gr.Checkbox(label="Enforce eager mode", value=ENFORCE_EAGER,
                                                    info="Force PyTorch eager mode to avoid CUDA graph related issues")
                    apply_btn = gr.Button("Apply settings", variant="primary")
                # API usage instructions
                gr.Markdown("## API Usage")
                api_info = gr.Markdown(f"""
### Endpoint
On a Hugging Face Space, the endpoint is your Space URL plus the `/v1` path:
```
https://YOUR_HF_SPACE_URL/v1
```
### Supported APIs
1. **Chat Completions API** (`/v1/chat/completions`)
   - For chat-style generation
   - Compatible with OpenAI's Chat API
2. **Completions API** (`/v1/completions`)
   - For plain text generation
   - Compatible with OpenAI's Completions API
### Python example
```python
from openai import OpenAI
# Create the client
client = OpenAI(
    base_url="https://YOUR_HF_SPACE_URL/v1",
    api_key="{API_KEY}",
)
# Chat completion example
chat_completion = client.chat.completions.create(
    model="{MODEL_NAME}",  # must match the served model name
    messages=[
        {{"role": "user", "content": "Hello!"}}
    ]
)
print(chat_completion.choices[0].message.content)
# Text completion example
completion = client.completions.create(
    model="{MODEL_NAME}",
    prompt="Once upon a time",
    max_tokens=50
)
print(completion.choices[0].text)
```
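### Streaming example
The server also supports OpenAI-style streaming. A minimal sketch with the same client as above (the reply arrives as incremental deltas):
```python
stream = client.chat.completions.create(
    model="{MODEL_NAME}",
    messages=[{{"role": "user", "content": "Tell me a story."}}],
    stream=True,
)
for chunk in stream:
    # each chunk carries the next fragment of the generated text
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```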
### curl examples
```bash
# Chat completion
curl https://YOUR_HF_SPACE_URL/v1/chat/completions \\
    -H "Content-Type: application/json" \\
    -H "Authorization: Bearer {API_KEY}" \\
    -d '{{"model": "{MODEL_NAME}", "messages": [{{"role": "user", "content": "Hello!"}}]}}'
# Text completion
curl https://YOUR_HF_SPACE_URL/v1/completions \\
    -H "Content-Type: application/json" \\
    -H "Authorization: Bearer {API_KEY}" \\
    -d '{{"model": "{MODEL_NAME}", "prompt": "Once upon a time", "max_tokens": 50}}'
```
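### Verifying the server
vLLM's OpenAI-compatible server also exposes a model-listing endpoint, which doubles as a quick check that the server is up and the API key is accepted:
```bash
curl https://YOUR_HF_SPACE_URL/v1/models \\
    -H "Authorization: Bearer {API_KEY}"
```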
""")
                # Auto-refresh logs
                gr.Markdown("## Log Auto-Refresh")
                auto_refresh = gr.Checkbox(label="Enable log auto-refresh", value=False)
        # Wire up the event handlers
        start_btn.click(start_vllm_server, inputs=[], outputs=status_text)
        stop_btn.click(stop_vllm_server, inputs=[], outputs=status_text)
        refresh_btn.click(check_server_status, inputs=[], outputs=status_text)
        logs_refresh_btn.click(get_server_logs, inputs=[], outputs=logs_text)
        # Event handler for the advanced options
        def apply_settings(model_name, force_gpu_mode, use_transformers_impl, enforce_eager_mode):
            global MODEL_NAME, FORCE_GPU, HAS_GPU, USE_TRANSFORMERS_IMPL, ENFORCE_EAGER
            changed = []
            if model_name.strip() and model_name != MODEL_NAME:
                MODEL_NAME = model_name.strip()
                changed.append(f"Model changed to: {MODEL_NAME}")
            if force_gpu_mode != FORCE_GPU:
                FORCE_GPU = force_gpu_mode
                if FORCE_GPU:
                    HAS_GPU = True
                    changed.append("GPU mode force-enabled")
                else:
                    HAS_GPU = check_gpu_available()
                    changed.append(f"Auto-detection restored, GPU status: {'detected' if HAS_GPU else 'not detected'}")
            if use_transformers_impl != USE_TRANSFORMERS_IMPL:
                USE_TRANSFORMERS_IMPL = use_transformers_impl
                changed.append(f"Transformers implementation: {'enabled' if USE_TRANSFORMERS_IMPL else 'disabled'}")
            if enforce_eager_mode != ENFORCE_EAGER:
                ENFORCE_EAGER = enforce_eager_mode
                changed.append(f"Eager mode: {'enabled' if ENFORCE_EAGER else 'disabled'}")
            if changed:
                return "\n".join(changed) + "\n\nSettings applied. If the server is running, restart it for the changes to take effect."
            else:
                return "No settings were changed"
        # NOTE: toggling a component's polling interval via gr.update(every=...)
        # is not a supported update path; a gr.Timer (Gradio >= 4.40) whose
        # `active` flag is flipped by the checkbox does the same job.
        log_timer = gr.Timer(1, active=False)
        log_timer.tick(get_server_logs, outputs=logs_text)
        def auto_refresh_logs(auto):
            # Poll the logs once per second while auto-refresh is enabled
            return gr.Timer(active=auto)
        apply_btn.click(
            apply_settings,
            inputs=[model_input, force_gpu, use_transformers, enforce_eager],
            outputs=status_text
        )
        auto_refresh.change(
            auto_refresh_logs,
            inputs=[auto_refresh],
            outputs=[log_timer]
        )
        # Run a status check automatically when the page loads
        demo.load(lambda: f"System ready. GPU status: {'detected' if HAS_GPU else 'not detected'}",
                  inputs=[], outputs=status_text)
    return demo
if __name__ == "__main__":
    demo = serve_test_ui()
    demo.queue().launch(server_name="0.0.0.0", server_port=GRADIO_PORT, share=True)