import os
import gradio as gr
from subprocess import Popen, PIPE
import subprocess
import logging
import threading
import time
import queue

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def check_gpu_available():
    """Detect a usable GPU via nvidia-smi, an environment hint, then PyTorch."""
    try:
        nvidia_smi = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        logger.info(f"nvidia-smi output: {nvidia_smi.stdout}")
        if nvidia_smi.returncode == 0:
            logger.info("GPU detected via nvidia-smi")
            return True
    except FileNotFoundError:
        # nvidia-smi is absent on CPU-only hosts; fall through to the other checks.
        logger.info("nvidia-smi not found, falling back to other checks")
    try:
        if os.environ.get("SPACE_RUNTIME_ARCH", "") == "gpu":
            logger.info("GPU detected via environment variable")
            return True
        import torch
        has_gpu = torch.cuda.is_available()
        logger.info(f"GPU detected via PyTorch, device count: {torch.cuda.device_count()}" if has_gpu else "PyTorch did not detect a GPU")
        return has_gpu
    except Exception as e:
        logger.error(f"GPU detection failed: {str(e)}")
        return False

MODEL_NAME = os.environ.get("MODEL_NAME", "zhangchenxu/TinyV-1.5B")
API_PORT = int(os.environ.get("API_PORT", 8000))
GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))
API_KEY = os.environ.get("API_KEY", "token-abc123")  # Default API key is token-abc123
USE_TRANSFORMERS_IMPL = os.environ.get("USE_TRANSFORMERS_IMPL", "true").lower() == "true"
ENFORCE_EAGER = os.environ.get("ENFORCE_EAGER", "true").lower() == "true"
HAS_GPU = check_gpu_available()
FORCE_GPU = os.environ.get("FORCE_GPU", "false").lower() == "true"
if FORCE_GPU:
    HAS_GPU = True
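
# A minimal sketch of supplying these settings through environment variables
# (illustrative values; assumes this file is saved as app.py and run directly):
#
#   MODEL_NAME=zhangchenxu/TinyV-1.5B API_KEY=token-abc123 \
#   FORCE_GPU=true ENFORCE_EAGER=true python app.py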

vllm_process = None
log_queue = queue.Queue(maxsize=1000)  # Queue buffering log lines for the UI
log_thread = None
stop_log_thread = False

def log_reader_thread(process):
    """Background thread that drains the process's output into log_queue.

    Note: readline() blocks until a full line arrives, so a quiet stderr can
    delay stdout lines (and vice versa); acceptable for a status panel.
    """
    global stop_log_thread

    def enqueue(entry):
        # Never block the reader: drop the oldest entry when the queue is full.
        try:
            log_queue.put_nowait(entry)
        except queue.Full:
            try:
                log_queue.get_nowait()
            except queue.Empty:
                pass
            log_queue.put_nowait(entry)

    while not stop_log_thread:
        # Read stderr
        if process.stderr:
            line = process.stderr.readline()
            if line:
                enqueue(f"[ERROR] {line.strip()}")
                continue

        # Read stdout
        if process.stdout:
            line = process.stdout.readline()
            if line:
                enqueue(line.strip())
                continue

        # Check whether the process is still running
        if process.poll() is not None:
            enqueue(f"Process exited with return code: {process.poll()}")
            break

        # Sleep briefly to reduce CPU usage
        time.sleep(0.1)

def start_vllm_server():
    global vllm_process, log_thread, stop_log_thread
    if vllm_process is not None:
        return "vLLM server is already running"

    os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
    cmd = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", str(API_PORT),
        "--dtype", "auto",
        "--trust-remote-code",
        "--disable-async-output-proc",
        "--api-key", API_KEY,  # require the API key on every request
    ]

    if ENFORCE_EAGER:
        cmd.append("--enforce-eager")
    if USE_TRANSFORMERS_IMPL:
        cmd.extend(["--model-impl", "transformers"])
    if HAS_GPU:
        logger.info("Starting vLLM in GPU mode")
        cmd.extend(["--device", "cuda", "--max-model-len", "2048", "--gpu-memory-utilization", "0.9"])
    else:
        logger.info("Starting vLLM in CPU mode")
        cmd.extend(["--device", "cpu", "--max-model-len", "1024"])

    logger.info(f"Launch command: {' '.join(cmd)}")
    try:
        vllm_process = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True, bufsize=1)

        # Start the log reader thread
        stop_log_thread = False
        log_thread = threading.Thread(target=log_reader_thread, args=(vllm_process,))
        log_thread.daemon = True
        log_thread.start()

        return "vLLM server started! Please wait for the model to finish loading..."
    except Exception as e:
        logger.error(f"Error starting vLLM server: {str(e)}")
        return f"Error starting vLLM server: {str(e)}"

def stop_vllm_server():
    global vllm_process, stop_log_thread
    if vllm_process is None:
        return "vLLM server is not running"

    # Stop the log reader thread
    stop_log_thread = True
    if log_thread and log_thread.is_alive():
        log_thread.join(timeout=2)

    # Terminate the process, escalating to kill if it doesn't exit in time
    vllm_process.terminate()
    try:
        vllm_process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        vllm_process.kill()

    vllm_process = None
    return "vLLM server stopped"

def check_server_status():
    if vllm_process is None:
        return "Not running"
    return_code = vllm_process.poll()
    return "Running" if return_code is None else f"Stopped (return code: {return_code})"

def get_server_logs():
    """Fetch buffered logs without blocking the UI."""
    if vllm_process is None:
        return "Server not running, no logs to show"

    # Drain log lines from the queue
    logs = []
    try:
        # Cap at 200 lines per refresh to keep the output manageable
        for _ in range(200):
            if log_queue.empty():
                break
            logs.append(log_queue.get_nowait())
            log_queue.task_done()
    except queue.Empty:
        pass

    if logs:
        return "\n".join(logs)
    else:
        # Check the process state
        if vllm_process.poll() is not None:
            return f"Server stopped, return code: {vllm_process.poll()}"
        return "Server is running, no new logs yet"

def serve_test_ui():
    with gr.Blocks(title="vLLM OpenAI-Compatible API Server") as demo:
        with gr.Row():
            with gr.Column():
                gr.Markdown("# vLLM OpenAI-Compatible API Control Panel")

                # System information
                gpu_info = "detected" if HAS_GPU else "not detected"
                system_info = f"""
                ## System Information
                - GPU: {gpu_info}
                - Runtime: {'Hugging Face Space' if 'SPACE_ID' in os.environ else 'local environment'}
                - Loaded model: `{MODEL_NAME}`
                - API key: `{API_KEY}`
                """
                gr.Markdown(system_info)

                with gr.Row():
                    start_btn = gr.Button("Start Server", variant="primary")
                    stop_btn = gr.Button("Stop Server", variant="stop")

                status_text = gr.Textbox(label="Server Status", value="Not running", interactive=False)
                refresh_btn = gr.Button("Refresh Status")

                logs_text = gr.Textbox(label="Server Logs", interactive=False, lines=15)
                logs_refresh_btn = gr.Button("Refresh Logs")

                # Advanced options
                with gr.Accordion("Advanced Options", open=False):
                    model_input = gr.Textbox(label="Model Name", value=MODEL_NAME,
                                          placeholder="Enter a model name, e.g. zhangchenxu/TinyV-1.5B")

                    with gr.Row():
                        force_gpu = gr.Checkbox(label="Force GPU mode", value=FORCE_GPU,
                                             info="Check this if auto-detection fails but you are sure a GPU is available")
                        use_transformers = gr.Checkbox(label="Use Transformers implementation", value=USE_TRANSFORMERS_IMPL,
                                                    info="Use the Transformers implementation instead of vLLM's native one; potentially more stable but slightly slower")
                        enforce_eager = gr.Checkbox(label="Enforce eager mode", value=ENFORCE_EAGER,
                                                 info="Force PyTorch eager mode to avoid CUDA graph related issues")

                    apply_btn = gr.Button("Apply Settings", variant="primary")
                
                # API usage instructions
                gr.Markdown("## API Usage")

                api_info = gr.Markdown(f"""
                ### Endpoint

                On a Hugging Face Space, the endpoint is your Space URL with the `/v1` path appended:
                ```
                https://YOUR_HF_SPACE_URL/v1
                ```

                ### Supported APIs

                1. **Chat Completions API** (`/v1/chat/completions`)
                   - For chat-style generation
                   - Compatible with OpenAI's Chat API

                2. **Completions API** (`/v1/completions`)
                   - For plain text generation
                   - Compatible with OpenAI's Completions API

                ### Python Example
                
                ```python
                from openai import OpenAI
                
                # Create the client
                client = OpenAI(
                    base_url="https://YOUR_HF_SPACE_URL/v1",
                    api_key="{API_KEY}",
                )
                
                # Chat completion example
                chat_completion = client.chat.completions.create(
                  model="{MODEL_NAME}",  # must match the model name the server is serving
                  messages=[
                    {{"role": "user", "content": "Hello!"}}
                  ]
                )
                
                print(chat_completion.choices[0].message.content)
                
                # Text completion example
                completion = client.completions.create(
                  model="{MODEL_NAME}",
                  prompt="Once upon a time",
                  max_tokens=50
                )
                
                print(completion.choices[0].text)
                ```
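
                ### Streaming Example

                Streaming responses work through the same OpenAI-compatible endpoint; a
                minimal sketch using the client created above:

                ```python
                stream = client.chat.completions.create(
                  model="{MODEL_NAME}",
                  messages=[{{"role": "user", "content": "Hello!"}}],
                  stream=True,
                )
                for chunk in stream:
                    # Each chunk carries an incremental delta; content may be None
                    if chunk.choices and chunk.choices[0].delta.content:
                        print(chunk.choices[0].delta.content, end="")
                ```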
                
                ### curl Examples
                
                ```bash
                # Chat completion
                curl https://YOUR_HF_SPACE_URL/v1/chat/completions \\
                  -H "Content-Type: application/json" \\
                  -H "Authorization: Bearer {API_KEY}" \\
                  -d '{{"model": "{MODEL_NAME}", "messages": [{{"role": "user", "content": "Hello!"}}]}}'
                  
                # Text completion
                curl https://YOUR_HF_SPACE_URL/v1/completions \\
                  -H "Content-Type: application/json" \\
                  -H "Authorization: Bearer {API_KEY}" \\
                  -d '{{"model": "{MODEL_NAME}", "prompt": "Once upon a time", "max_tokens": 50}}'
                ```
                """)
                
                # Log auto-refresh
                gr.Markdown("## Log Auto-Refresh")
                auto_refresh = gr.Checkbox(label="Enable log auto-refresh", value=False)
                # Timer driving the periodic refresh (gr.Timer requires
                # Gradio >= 4.40); inactive until the checkbox enables it.
                log_timer = gr.Timer(1.0, active=False)

        # Wire up event handlers
        start_btn.click(start_vllm_server, inputs=[], outputs=status_text)
        stop_btn.click(stop_vllm_server, inputs=[], outputs=status_text)
        refresh_btn.click(check_server_status, inputs=[], outputs=status_text)
        logs_refresh_btn.click(get_server_logs, inputs=[], outputs=logs_text)
        
        # Event handling for the advanced options
        def apply_settings(model_name, force_gpu_mode, use_transformers_impl, enforce_eager_mode):
            global MODEL_NAME, FORCE_GPU, HAS_GPU, USE_TRANSFORMERS_IMPL, ENFORCE_EAGER

            changed = []

            if model_name.strip() and model_name != MODEL_NAME:
                MODEL_NAME = model_name.strip()
                changed.append(f"Model changed to: {MODEL_NAME}")

            if force_gpu_mode != FORCE_GPU:
                FORCE_GPU = force_gpu_mode
                if FORCE_GPU:
                    HAS_GPU = True
                    changed.append("GPU mode force-enabled")
                else:
                    HAS_GPU = check_gpu_available()
                    changed.append(f"Reverted to auto-detection, GPU status: {'detected' if HAS_GPU else 'not detected'}")

            if use_transformers_impl != USE_TRANSFORMERS_IMPL:
                USE_TRANSFORMERS_IMPL = use_transformers_impl
                changed.append(f"Transformers implementation: {'enabled' if USE_TRANSFORMERS_IMPL else 'disabled'}")

            if enforce_eager_mode != ENFORCE_EAGER:
                ENFORCE_EAGER = enforce_eager_mode
                changed.append(f"Eager mode: {'enabled' if ENFORCE_EAGER else 'disabled'}")

            if changed:
                return "\n".join(changed) + "\n\nSettings applied. If the server is running, restart it for the changes to take effect."
            else:
                return "No settings were changed"

        def auto_refresh_logs(enabled):
            # gr.update(every=...) cannot toggle periodic refresh at runtime,
            # so activate or deactivate the Timer component instead.
            return gr.Timer(active=enabled)
        
        apply_btn.click(
            apply_settings, 
            inputs=[model_input, force_gpu, use_transformers, enforce_eager], 
            outputs=status_text
        )
        
        auto_refresh.change(
            auto_refresh_logs,
            inputs=[auto_refresh],
            outputs=[log_timer]
        )
        log_timer.tick(get_server_logs, inputs=[], outputs=[logs_text])
        
        # Run an initial status check when the page loads
        demo.load(lambda: f"System ready. GPU status: {'detected' if HAS_GPU else 'not detected'}",
                  inputs=[], outputs=status_text)
    
    return demo

if __name__ == "__main__":
    demo = serve_test_ui()
    demo.queue().launch(server_name="0.0.0.0", server_port=GRADIO_PORT, share=True)