From ed023bcdedd93407a8dd10eda94400270e054818 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau Date: Fri, 31 Jan 2025 10:53:33 +0100 Subject: [PATCH 1/8] Solved an issue where lighteval vllm would hang indefinitely in multi node settings --- src/lighteval/models/vllm/vllm_model.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 3398f7218..b8ddcf1e2 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -93,6 +93,7 @@ class VLLMModelConfig: ) pairwise_tokenization: bool = False # whether to tokenize the context and continuation separately or together. generation_parameters: GenerationParameters = None # sampling parameters to use for generation + enforce_eager: bool = False # whether or not to disable cuda graphs with vllm subfolder: Optional[str] = None @@ -136,13 +137,19 @@ def tokenizer(self): return self._tokenizer def cleanup(self): - destroy_model_parallel() + if ray is not None: + ray.get(ray.remote(destroy_model_parallel).remote()) + else: + destroy_model_parallel() if self.model is not None: del self.model.llm_engine.model_executor.driver_worker self.model = None gc.collect() ray.shutdown() - destroy_distributed_environment() + if ray is not None: + ray.get(ray.remote(destroy_distributed_environment).remote()) + else: + destroy_distributed_environment() torch.cuda.empty_cache() @property @@ -182,6 +189,7 @@ def _create_auto_model(self, config: VLLMModelConfig, env_config: EnvConfig) -> "max_model_len": self._max_length, "swap_space": 4, "seed": 1234, + "enforce_eager": config.enforce_eager, } if int(config.data_parallel_size) > 1: self.model_args["worker_use_ray"] = True From c66362831f985c9f9ccbd8accbbae49409f82741 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau Date: Fri, 31 Jan 2025 13:08:22 +0100 Subject: [PATCH 2/8] Allows using custom OpenAI endpoint (for instance with vLLM) --- src/lighteval/models/endpoints/openai_model.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/lighteval/models/endpoints/openai_model.py b/src/lighteval/models/endpoints/openai_model.py index 37b8ca347..c3ad8f98e 100644 --- a/src/lighteval/models/endpoints/openai_model.py +++ b/src/lighteval/models/endpoints/openai_model.py @@ -84,7 +84,7 @@ class OpenAIClient(LightevalModel): def __init__(self, config: OpenAIModelConfig, env_config) -> None: api_key = os.environ["OPENAI_API_KEY"] - self.client = OpenAI(api_key=api_key) + self.client = OpenAI(api_key=api_key, base_url=os.getenv("OPENAI_BASE_URL")) self.generation_parameters = config.generation_parameters self.sampling_params = self.generation_parameters.to_vllm_openai_dict() @@ -99,7 +99,19 @@ def __init__(self, config: OpenAIModelConfig, env_config) -> None: self.API_RETRY_MULTIPLIER = 2 self.CONCURENT_CALLS = 100 self.model = config.model - self._tokenizer = tiktoken.encoding_for_model(self.model) + try: + self._tokenizer = tiktoken.encoding_for_model(self.model) + except KeyError: + if "TOKENIZER_PATH" in os.environ: + from transformers import AutoTokenizer + + self._tokenizer = AutoTokenizer.from_pretrained(os.getenv("TOKENIZER_PATH")) + elif os.path.exists(self.model) and os.path.isdir(self.model): + from transformers import AutoTokenizer + + self._tokenizer = AutoTokenizer.from_pretrained(self.model) + else: + raise self.pairwise_tokenization = False def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, logit_bias): From 
b42ec3f8e5e0fc0acf2642366f4cf8ec3fd05cf0 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau Date: Fri, 31 Jan 2025 14:30:41 +0100 Subject: [PATCH 3/8] Added documentation for lighteval vllm and an example for the lost souls --- docs/source/use-vllm-as-backend.mdx | 49 ++++++++++++++++++++++++ examples/slurm/multi_node_vllm.slurm | 57 ++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 examples/slurm/multi_node_vllm.slurm diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx index 787848c36..42e421241 100644 --- a/docs/source/use-vllm-as-backend.mdx +++ b/docs/source/use-vllm-as-backend.mdx @@ -47,3 +47,52 @@ Available arguments for `vllm` can be found in the `VLLMModelConfig`: > [!WARNING] > In the case of OOM issues, you might need to reduce the context size of the > model as well as reduce the `gpu_memory_utilisation` parameter. + + +## Multi-node vLLM + +It is entirely possible to use vLLM in a multi-node setting. For this, we will use Ray. +In these examples, we will assume that we are on a Slurm cluster where nodes do not have internet access. +Those scripts are heavily inspired by [https://github.com/NERSC/slurm-ray-cluster/](https://github.com/NERSC/slurm-ray-cluster/). + +First you need to start the Ray cluster. It has one master, and you need to find an available +port on this node + +```bash +function find_available_port { + printf $(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()') +} + +PORT=$(find_available_port) +NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST)) +MASTER=${NODELIST[0]} # Name of master node +MASTER_IP=$(hostname --ip-address) # IP address of master node + +function set_VLLM_HOST_IP { + export VLLM_HOST_IP=$(hostname --ip-address); +} +export -f set_VLLM_HOST_IP; + +# Start the master +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) -w $MASTER bash -c "set_VLLM_HOST_IP; ray start --head --port=$PORT --block" & +sleep 5 + +# Start all other nodes :) +if [[ $SLURM_NNODES -gt 1 ]]; then + srun -N $(( SLURM_NNODES-1 )) --ntasks-per-node=1 -c $(( SLURM_CPUS_PER_TASK/2 )) -x $MASTER bash -c "set_VLLM_HOST_IP; ray start --address=$MASTER_IP:$PORT --block" & + sleep 5 +fi +``` + +Then, once the Ray cluster is running, you can launch vLLM through lighteval. + +```bash +set_VLLM_HOST_IP + +MODEL_ARGS="pretrained=$MODEL_DIRECTORY,gpu_memory_utilisation=0.5,trust_remote_code=False,dtype=bfloat16,max_model_length=16384,tensor_parallel_size=" +TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" + +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm "$MODEL_ARGS" "$TASK_ARGS" --custom-tasks $TASK_FILE +``` + +The full script is available here: [https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm](https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm) diff --git a/examples/slurm/multi_node_vllm.slurm b/examples/slurm/multi_node_vllm.slurm new file mode 100644 index 000000000..ddc193939 --- /dev/null +++ b/examples/slurm/multi_node_vllm.slurm @@ -0,0 +1,57 @@ +#! 
/bin/bash + +#SBATCH --job-name=EVALUATE_Llama-3.2-1B-Instruct +#SBATCH --account=brb@h100 +#SBATCH --output=evaluation.log +#SBATCH --error=evaluation.log +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=96 +#SBATCH --gres=gpu:4 +#SBATCH --hint=nomultithread +#SBATCH --constraint=h100 +#SBATCH --time=02:00:00 +#SBATCH --exclusive +#SBATCH --parsable + +set -e +set -x + +module purge +module load miniforge/24.9.0 +conda activate $WORKDIR/lighteval-h100 + +function find_available_port { + printf $(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()') +} + +PORT=$(find_available_port) +NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST)) +MASTER=${NODELIST[0]} +MASTER_IP=$(hostname --ip-address) + +export HF_HOME=$WORKDIR/HF_HOME +export MODEL_DIRECTORY=$WORKDIR/HuggingFace_Models/meta-llama/Llama-3.2-1B-Instruct +export TASK_FILE=$WORKDIR/community_tasks/french_eval.py +export HF_HUB_OFFLINE=1 +export VLLM_WORKER_MULTIPROC_METHOD=spawn + +function set_VLLM_HOST_IP { + export VLLM_HOST_IP=$(hostname --ip-address); +} +export -f set_VLLM_HOST_IP; + +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) -w $MASTER bash -c "set_VLLM_HOST_IP; ray start --head --port=$PORT --block" & +sleep 5 + +if [[ $SLURM_NNODES -gt 1 ]]; then + srun -N $(( SLURM_NNODES-1 )) --ntasks-per-node=1 -c $(( SLURM_CPUS_PER_TASK/2 )) -x $MASTER bash -c "set_VLLM_HOST_IP; ray start --address=$MASTER_IP:$PORT --block" & + sleep 5 +fi + +set_VLLM_HOST_IP + +MODEL_ARGS="pretrained=$MODEL_DIRECTORY,gpu_memory_utilisation=0.5,trust_remote_code=False,dtype=bfloat16,max_model_length=8192,tensor_parallel_size=8" +TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" + +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm "$MODEL_ARGS" "$TASK_ARGS" --custom-tasks $TASK_FILE From a5ad8b5f868f723f0e832fbd6e38a1baf4f0db54 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau Date: Fri, 31 Jan 2025 14:31:30 +0100 Subject: [PATCH 4/8] Added documentation for lighteval endpoint with vllm serve backend and an example for the lost souls --- docs/source/use-vllm-as-backend.mdx | 62 +++++++++++++++ examples/slurm/multi_node_vllm_serve.slurm | 90 ++++++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 examples/slurm/multi_node_vllm_serve.slurm diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx index 42e421241..92af929ad 100644 --- a/docs/source/use-vllm-as-backend.mdx +++ b/docs/source/use-vllm-as-backend.mdx @@ -96,3 +96,65 @@ srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm ``` The full script is available here: [https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm](https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm) + +## With vLLM Serve + +It is also possible to use the vLLM serve command to achieve a similar result. +It has the following benefits: can be queried by multiple jobs, can be launched only once when needing multiple evaluation, +has lower peak memory on rank 0. + +We also need to start the Ray cluster, the exact same way as before. However, now, +before calling lighteval, we need to start our vllm server. 
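+
+Before starting the server, it can also help to confirm that every Slurm node has
+actually joined the Ray cluster. The loop below is a minimal, optional sketch (it
+assumes the same Python environment that started the cluster and the standard
+`SLURM_NNODES` variable; `ray.init(address="auto")` simply attaches to the already
+running cluster):
+
+```bash
+# Optional sanity check (sketch): wait until every Slurm node is an alive Ray node.
+NUM_RAY_NODES=0
+until [[ "$NUM_RAY_NODES" -eq "$SLURM_NNODES" ]]; do
+    sleep 5
+    NUM_RAY_NODES=$(python -c 'import ray; ray.init(address="auto"); print(sum(n["Alive"] for n in ray.nodes()))' 2>/dev/null || echo 0)
+    echo "Ray nodes alive: $NUM_RAY_NODES / $SLURM_NNODES"
+done
+```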
+ +```bash +MODEL_NAME="Llama-3.2-1B-Instruct" +SERVER_PORT=$(find_available_port) +export OPENAI_API_KEY="I-love-vLLM" +export OPENAI_BASE_URL="http://localhost:$SERVER_PORT/v1" + +vllm serve $MODEL_DIRECTORY \ + --served-model-name $MODEL_NAME \ + --api-key $OPENAI_API_KEY \ + --enforce-eager \ + --port $SERVER_PORT \ + --tensor-parallel-size \ + --dtype bfloat16 \ + --max-model-len 16384 \ + --gpu-memory-utilization 0.8 \ + --disable-custom-all-reduce \ + 1>vllm.stdout 2>vllm.stderr & +``` + +In my case, I want the evaluation to be done within the same job, hence why vllm serve +is put in the background. Therefore, we need to wait until it is up & running before +launching lighteval + +```bash +ATTEMPT=0 +DELAY=5 +MAX_ATTEMPTS=60 # Might need to be increased in case of very large model +until curl -s -o /dev/null -w "%{http_code}" $OPENAI_BASE_URL/models -H "Authorization: Bearer $OPENAI_API_KEY" | grep -E "^2[0-9]{2}$"; do + ATTEMPT=$((ATTEMPT + 1)) + echo "$ATTEMPT attempts" + if [ "$ATTEMPT" -ge "$MAX_ATTEMPTS" ]; then + echo "Failed: the server did not respond any of the $MAX_ATTEMPTS requests." + exit 1 + fi + echo "vllm serve is not ready yet" + sleep $DELAY +done +``` + +Finally, the above script only finishes when vllm serve is ready, so we can launch +the evaluation. + +```bash +export TOKENIZER_PATH="$MODEL_DIRECTORY" + +TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" + +lighteval endpoint openai "$MODEL_NAME" "$TASK_ARGS" --custom-tasks $TASK_FILE +``` + +The full script is available here: +[https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm_serve.slurm](https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm_serve.slurm) diff --git a/examples/slurm/multi_node_vllm_serve.slurm b/examples/slurm/multi_node_vllm_serve.slurm new file mode 100644 index 000000000..2b4a12eb7 --- /dev/null +++ b/examples/slurm/multi_node_vllm_serve.slurm @@ -0,0 +1,90 @@ +#! 
/bin/bash + +#SBATCH --job-name=EVALUATE_Llama-3.2-1B-Instruct +#SBATCH --account=brb@h100 +#SBATCH --output=evaluation.log +#SBATCH --error=evaluation.log +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=96 +#SBATCH --gres=gpu:4 +#SBATCH --hint=nomultithread +#SBATCH --constraint=h100 +#SBATCH --time=02:00:00 +#SBATCH --exclusive +#SBATCH --parsable + +set -e +set -x + +module purge +module load miniforge/24.9.0 +conda activate $WORKDIR/lighteval-h100 + +function find_available_port { + printf $(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()') +} + +PORT=$(find_available_port) +NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST)) +MASTER=${NODELIST[0]} +MASTER_IP=$(hostname --ip-address) + +export HF_HOME=$WORKDIR/HF_HOME +export MODEL_DIRECTORY=$WORKDIR/HuggingFace_Models/meta-llama/Llama-3.2-1B-Instruct +export TASK_FILE=$WORKDIR/community_tasks/french_eval.py +export HF_HUB_OFFLINE=1 +export VLLM_WORKER_MULTIPROC_METHOD=spawn + +function set_VLLM_HOST_IP { + export VLLM_HOST_IP=$(hostname --ip-address); +} +export -f set_VLLM_HOST_IP; + +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) -w $MASTER bash -c "set_VLLM_HOST_IP; ray start --head --port=$PORT --block" & +sleep 5 + +if [[ $SLURM_NNODES -gt 1 ]]; then + srun -N $(( SLURM_NNODES-1 )) --ntasks-per-node=1 -c $(( SLURM_CPUS_PER_TASK/2 )) -x $MASTER bash -c "set_VLLM_HOST_IP; ray start --address=$MASTER_IP:$PORT --block" & + sleep 5 +fi + +set_VLLM_HOST_IP + +MODEL_NAME="Llama-3.2-1B-Instruct" +SERVER_PORT=$(find_available_port) +export OPENAI_API_KEY="I-love-vllm-serve" +export OPENAI_BASE_URL="http://localhost:$SERVER_PORT/v1" + +vllm serve $MODEL_DIRECTORY \ + --served-model-name $MODEL_NAME \ + --api-key $OPENAI_API_KEY \ + --enforce-eager \ + --port $SERVER_PORT \ + --tensor-parallel-size 8 \ + --dtype bfloat16 \ + --max-model-len 16384 \ + --gpu-memory-utilization 0.8 \ + --disable-custom-all-reduce \ + 1>vllm.stdout 2>vllm.stderr & + + +ATTEMPT=0 +DELAY=5 +MAX_ATTEMPTS=60 +until curl -s -o /dev/null -w "%{http_code}" $OPENAI_BASE_URL/models -H "Authorization: Bearer $OPENAI_API_KEY" | grep -E "^2[0-9]{2}$"; do + ATTEMPT=$((ATTEMPT + 1)) + echo "$ATTEMPT attempts" + if [ "$ATTEMPT" -ge "$MAX_ATTEMPTS" ]; then + echo "Failed: the server did not respond any of the $MAX_ATTEMPTS requests." 
+ exit 1 + fi + echo "vllm serve is not ready yet" + sleep $DELAY +done + +export TOKENIZER_PATH="$MODEL_DIRECTORY" + +TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" + +lighteval endpoint openai "$MODEL_NAME" "$TASK_ARGS" --custom-tasks $TASK_FILE From 0c41fad5c7aa0acab834fc066ee7e316a9d4875e Mon Sep 17 00:00:00 2001 From: Nathan Cassereau <84033440+ncassereau@users.noreply.github.com> Date: Mon, 17 Mar 2025 15:24:13 +0100 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- docs/source/use-vllm-as-backend.mdx | 5 ++--- examples/slurm/multi_node_vllm.slurm | 3 +-- examples/slurm/multi_node_vllm_serve.slurm | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx index db1f503cb..7933f14bb 100644 --- a/docs/source/use-vllm-as-backend.mdx +++ b/docs/source/use-vllm-as-backend.mdx @@ -135,7 +135,7 @@ TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm "$MODEL_ARGS" "$TASK_ARGS" --custom-tasks $TASK_FILE ``` -The full script is available here: [https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm](https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm) +The full script is available [here](https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm.slurm) ## With vLLM Serve @@ -196,6 +196,5 @@ TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" lighteval endpoint openai "$MODEL_NAME" "$TASK_ARGS" --custom-tasks $TASK_FILE ``` -The full script is available here: -[https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm_serve.slurm](https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm_serve.slurm) +The full script is available [here](https://github.com/huggingface/lighteval/blob/main/examples/slurm/multi_node_vllm_serve.slurm). diff --git a/examples/slurm/multi_node_vllm.slurm b/examples/slurm/multi_node_vllm.slurm index ddc193939..686b00f04 100644 --- a/examples/slurm/multi_node_vllm.slurm +++ b/examples/slurm/multi_node_vllm.slurm @@ -1,7 +1,6 @@ #! 
/bin/bash #SBATCH --job-name=EVALUATE_Llama-3.2-1B-Instruct -#SBATCH --account=brb@h100 #SBATCH --output=evaluation.log #SBATCH --error=evaluation.log #SBATCH --nodes=2 @@ -54,4 +53,4 @@ set_VLLM_HOST_IP MODEL_ARGS="pretrained=$MODEL_DIRECTORY,gpu_memory_utilisation=0.5,trust_remote_code=False,dtype=bfloat16,max_model_length=8192,tensor_parallel_size=8" TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" -srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm "$MODEL_ARGS" "$TASK_ARGS" --custom-tasks $TASK_FILE +srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm "$MODEL_ARGS" "$TASK_ARGS" --custom-tasks $TASK_FILE --use-chat-template diff --git a/examples/slurm/multi_node_vllm_serve.slurm b/examples/slurm/multi_node_vllm_serve.slurm index 2b4a12eb7..1ec5c5c30 100644 --- a/examples/slurm/multi_node_vllm_serve.slurm +++ b/examples/slurm/multi_node_vllm_serve.slurm @@ -87,4 +87,4 @@ export TOKENIZER_PATH="$MODEL_DIRECTORY" TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" -lighteval endpoint openai "$MODEL_NAME" "$TASK_ARGS" --custom-tasks $TASK_FILE +lighteval endpoint litellm "model_name=${MODEL_NAME},base_url=http://localhost:$SERVER_PORT/v1,provider=openai" "$TASK_ARGS" --custom-tasks $TASK_FILE --use-chat-template From b0a506064776d5fbfa24741c41e4942f7582aaa4 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau <84033440+ncassereau@users.noreply.github.com> Date: Mon, 17 Mar 2025 15:26:03 +0100 Subject: [PATCH 6/8] Remove useless env variable --- examples/slurm/multi_node_vllm_serve.slurm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/slurm/multi_node_vllm_serve.slurm b/examples/slurm/multi_node_vllm_serve.slurm index 1ec5c5c30..1f8927443 100644 --- a/examples/slurm/multi_node_vllm_serve.slurm +++ b/examples/slurm/multi_node_vllm_serve.slurm @@ -54,7 +54,6 @@ set_VLLM_HOST_IP MODEL_NAME="Llama-3.2-1B-Instruct" SERVER_PORT=$(find_available_port) export OPENAI_API_KEY="I-love-vllm-serve" -export OPENAI_BASE_URL="http://localhost:$SERVER_PORT/v1" vllm serve $MODEL_DIRECTORY \ --served-model-name $MODEL_NAME \ @@ -87,4 +86,4 @@ export TOKENIZER_PATH="$MODEL_DIRECTORY" TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" -lighteval endpoint litellm "model_name=${MODEL_NAME},base_url=http://localhost:$SERVER_PORT/v1,provider=openai" "$TASK_ARGS" --custom-tasks $TASK_FILE --use-chat-template +lighteval endpoint litellm "model_name=${MODEL_NAME},base_url=http://localhost:$SERVER_PORT/v1,provider=openai,api_key=${OPENAI_API_KEY}" "$TASK_ARGS" --custom-tasks $TASK_FILE --use-chat-template From eea55d93ce15b160aee9a73e42986925de955e63 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau <84033440+ncassereau@users.noreply.github.com> Date: Mon, 17 Mar 2025 15:26:52 +0100 Subject: [PATCH 7/8] Remove JZ acct --- examples/slurm/multi_node_vllm_serve.slurm | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/slurm/multi_node_vllm_serve.slurm b/examples/slurm/multi_node_vllm_serve.slurm index 1f8927443..10c0d5e0f 100644 --- a/examples/slurm/multi_node_vllm_serve.slurm +++ b/examples/slurm/multi_node_vllm_serve.slurm @@ -1,7 +1,6 @@ #! 
/bin/bash #SBATCH --job-name=EVALUATE_Llama-3.2-1B-Instruct -#SBATCH --account=brb@h100 #SBATCH --output=evaluation.log #SBATCH --error=evaluation.log #SBATCH --nodes=2 From 4ebcc85260cafefddda2f1a956d5243ebdb86478 Mon Sep 17 00:00:00 2001 From: Nathan Cassereau Date: Mon, 17 Mar 2025 15:30:11 +0100 Subject: [PATCH 8/8] use more official bench --- docs/source/use-vllm-as-backend.mdx | 4 ++-- examples/slurm/multi_node_vllm.slurm | 2 +- examples/slurm/multi_node_vllm_serve.slurm | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx index 7933f14bb..5dbf93709 100644 --- a/docs/source/use-vllm-as-backend.mdx +++ b/docs/source/use-vllm-as-backend.mdx @@ -130,7 +130,7 @@ Then, once the Ray cluster is running, you can launch vLLM through lighteval. set_VLLM_HOST_IP MODEL_ARGS="pretrained=$MODEL_DIRECTORY,gpu_memory_utilisation=0.5,trust_remote_code=False,dtype=bfloat16,max_model_length=16384,tensor_parallel_size=" -TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" +TASK_ARGS="lighteval|gsm8k|0|0" srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm "$MODEL_ARGS" "$TASK_ARGS" --custom-tasks $TASK_FILE ``` @@ -191,7 +191,7 @@ the evaluation. ```bash export TOKENIZER_PATH="$MODEL_DIRECTORY" -TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" +TASK_ARGS="lighteval|gsm8k|0|0" lighteval endpoint openai "$MODEL_NAME" "$TASK_ARGS" --custom-tasks $TASK_FILE ``` diff --git a/examples/slurm/multi_node_vllm.slurm b/examples/slurm/multi_node_vllm.slurm index 686b00f04..42a0feacf 100644 --- a/examples/slurm/multi_node_vllm.slurm +++ b/examples/slurm/multi_node_vllm.slurm @@ -51,6 +51,6 @@ fi set_VLLM_HOST_IP MODEL_ARGS="pretrained=$MODEL_DIRECTORY,gpu_memory_utilisation=0.5,trust_remote_code=False,dtype=bfloat16,max_model_length=8192,tensor_parallel_size=8" -TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" +TASK_ARGS="lighteval|gsm8k|0|0" srun -N1 -n1 -c $(( SLURM_CPUS_PER_TASK/2 )) --overlap -w $MASTER lighteval vllm "$MODEL_ARGS" "$TASK_ARGS" --custom-tasks $TASK_FILE --use-chat-template diff --git a/examples/slurm/multi_node_vllm_serve.slurm b/examples/slurm/multi_node_vllm_serve.slurm index 10c0d5e0f..8b435a81a 100644 --- a/examples/slurm/multi_node_vllm_serve.slurm +++ b/examples/slurm/multi_node_vllm_serve.slurm @@ -83,6 +83,6 @@ done export TOKENIZER_PATH="$MODEL_DIRECTORY" -TASK_ARGS="community|gpqa-fr|0|0,community|ifeval-fr|0|0" +TASK_ARGS="lighteval|gsm8k|0|0" lighteval endpoint litellm "model_name=${MODEL_NAME},base_url=http://localhost:$SERVER_PORT/v1,provider=openai,api_key=${OPENAI_API_KEY}" "$TASK_ARGS" --custom-tasks $TASK_FILE --use-chat-template
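
A closing note on the serve-based workflow documented in this series: if lighteval cannot reach the server, a single manual request against the OpenAI-compatible endpoint is usually the quickest way to isolate the problem. A minimal sketch, assuming the `SERVER_PORT`, `OPENAI_API_KEY`, and `MODEL_NAME` values exported by `multi_node_vllm_serve.slurm`:

```bash
# Sketch: one chat completion against the local vLLM server, to confirm the
# endpoint and API key work before launching the full evaluation.
curl -s "http://localhost:$SERVER_PORT/v1/chat/completions" \
    -H "Authorization: Bearer $OPENAI_API_KEY" \
    -H "Content-Type: application/json" \
    -d "{\"model\": \"$MODEL_NAME\", \"messages\": [{\"role\": \"user\", \"content\": \"ping\"}], \"max_tokens\": 8}"
```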