Damien Benveniste committed · Commit 10dd1af
1 Parent(s): 6490764
Commit message: modified

Files changed: entrypoint.sh (+20 -14)

entrypoint.sh CHANGED
@@ -37,20 +37,26 @@ for dir in /tmp/huggingface /tmp/cache /tmp/numba_cache /tmp/outlines_cache /.co
     ls -la "$dir"
 done
 
-# Construct the command
-CMD="vllm serve $MODEL \
-    --host 0.0.0.0 \
-    --port 8000 \
-    --dtype $DTYPE \
-    --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
-    --max-num-seqs $MAX_NUM_SEQS \
-    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
-    --max-model-len $MAX_MODEL_LEN"
-
-# Add enforce-eager only if it's set to true
-if [ "$ENFORCE_EAGER" = "true" ]; then
-    CMD="$CMD --enforce-eager"
-fi
+# # Construct the command
+# CMD="vllm serve $MODEL \
+#     --host 0.0.0.0 \
+#     --port 8000 \
+#     --dtype $DTYPE \
+#     --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
+#     --max-num-seqs $MAX_NUM_SEQS \
+#     --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
+#     --max-model-len $MAX_MODEL_LEN"
+
+# # Add enforce-eager only if it's set to true
+# if [ "$ENFORCE_EAGER" = "true" ]; then
+#     CMD="$CMD --enforce-eager"
+# fi
+
+
+CMD="python3 -m vllm.entrypoints.openai.api_server \
+    --model EleutherAI/pythia-70m \
+    --gpu-memory-utilization 0.9
+    --max-model-len 200"
 
 
 # Execute the command
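For orientation, here is a hedged sketch of how the tail of entrypoint.sh presumably consumes $CMD, plus a hypothetical smoke test of the OpenAI-compatible server that the new command starts. The echo/exec/curl lines, the localhost:8000 address, and the request payload are assumptions and not part of this commit; they only illustrate the usual pattern for a vLLM API-server entrypoint, whose default port is 8000.

# Assumed continuation of entrypoint.sh, after the "# Execute the command" comment:
echo "Starting vLLM with: $CMD"    # hypothetical log line
exec $CMD                          # unquoted expansion word-splits $CMD into arguments

# Hypothetical smoke test once the server is up:
curl -s http://localhost:8000/v1/models
curl -s http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "EleutherAI/pythia-70m", "prompt": "Hello", "max_tokens": 16}'

One detail worth noting in the new block: the --gpu-memory-utilization 0.9 line has no trailing backslash, so the newline before --max-model-len 200 is stored inside $CMD. The command still runs as long as the script expands $CMD unquoted (word splitting treats the newline as whitespace), but adding the backslash would keep the line-continuation style consistent.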