deploy-vllm2 / entrypoint.sh
#!/bin/bash
# Default values
MODEL=${MODEL:-"microsoft/Phi-3-mini-4k-instruct"}
# MODEL=${MODEL:-"EleutherAI/pythia-70m"}
DTYPE=${DTYPE:-"half"}
MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-512}
MAX_NUM_SEQS=${MAX_NUM_SEQS:-16}
GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.85}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-512}
ENFORCE_EAGER=${ENFORCE_EAGER:-true}
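# Typical override at container launch (illustrative; "my-vllm-image" is a
# placeholder image name, not part of this repo):
#   docker run --gpus all -p 8000:8000 \
#     -e MODEL="microsoft/Phi-3-mini-4k-instruct" -e MAX_MODEL_LEN=1024 my-vllm-image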
# Opt out of vLLM usage-stats collection (vLLM honors VLLM_NO_USAGE_STATS / DO_NOT_TRACK)
export VLLM_NO_USAGE_STATS=1
# Print the environment for debugging (note: this also echoes any secrets, e.g. HF_TOKEN)
echo "Environment variables:"
env
# Create and set permissions for the config directory (mkdir -p is a no-op if it exists)
CONFIG_DIR=${XDG_CONFIG_HOME:-"/tmp/config"}
mkdir -p "$CONFIG_DIR"
chmod -R 777 "$CONFIG_DIR"
echo "Permissions for $CONFIG_DIR:"
ls -la "$CONFIG_DIR"
# Create and set permissions for the cache directories
for dir in /tmp/huggingface /tmp/cache /tmp/numba_cache /tmp/outlines_cache /.config; do
    mkdir -p "$dir"
    chmod -R 777 "$dir"
    echo "Permissions for $dir:"
    ls -la "$dir"
done
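# If the image does not already point the caches at the writable /tmp paths created
# above, exports along these lines would do it (left commented out, since this is an
# assumption about the Dockerfile setup rather than something this script requires):
# export HF_HOME=/tmp/huggingface
# export NUMBA_CACHE_DIR=/tmp/numba_cache
# export OUTLINES_CACHE_DIR=/tmp/outlines_cache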
# Construct the command as an array so arguments survive word splitting intact
CMD=(vllm serve "$MODEL"
    --host 0.0.0.0
    --port 8000
    --dtype "$DTYPE"
    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS"
    --max-num-seqs "$MAX_NUM_SEQS"
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
    --max-model-len "$MAX_MODEL_LEN")
# Add --enforce-eager only if it's set to true
if [ "$ENFORCE_EAGER" = "true" ]; then
    CMD+=(--enforce-eager)
fi
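# Note: --enforce-eager disables CUDA graph capture, trading some steady-state
# throughput for lower memory use and faster startup (presumably why it defaults
# to true here).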
# Alternative invocation kept for reference: calling the OpenAI-compatible
# server module directly instead of the `vllm serve` CLI:
# python3 -m vllm.entrypoints.openai.api_server \
#     --model EleutherAI/pythia-70m \
#     --gpu-memory-utilization 0.9 \
#     --max-model-len 200
# Execute the command
echo "Running command: ${CMD[*]}"
exec "${CMD[@]}"
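
# Once the server is up, the OpenAI-compatible endpoint can be smoke-tested with a
# request like the following (the "model" field must match $MODEL):
#   curl http://localhost:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "microsoft/Phi-3-mini-4k-instruct", "prompt": "Hello", "max_tokens": 16}'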