Skip to content

[AMD] [P/D] Compute num gpus for ROCm correctly in run_accuracy_test.sh #18568

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-2} # Default to 2
# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)

# Locate the GPU management CLI: prefer nvidia-smi, otherwise rocm-smi.
# `command -v` is a shell builtin, so this does not depend on an external
# `which` binary. SMI_BIN is left empty when neither tool is on PATH.
SMI_BIN=$(command -v nvidia-smi || command -v rocm-smi)

# Kill all running background jobs on Ctrl+C (SIGINT), SIGTERM, or script exit
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT

Expand Down Expand Up @@ -44,6 +46,13 @@ get_model_args() {
echo "$extra_args"
}

#######################################
# Print the number of GPUs reported by the SMI tool named in SMI_BIN.
# Globals:   SMI_BIN (read) - command used to query GPUs; may be empty/unset
# Arguments: none
# Outputs:   GPU count on stdout. The value is always >= 1 so that callers
#            can safely use it as a modulus when assigning GPU IDs.
# Returns:   0
#######################################
get_num_gpus() {
  local smi="${SMI_BIN:-}"
  local count=0

  if [[ "$smi" == *"nvidia"* ]]; then
    # One output line per GPU (names only, no CSV header).
    count=$("$smi" --query-gpu=name --format=csv,noheader | wc -l) || count=0
  elif [[ -n "$smi" ]]; then
    # Non-NVIDIA tool: count the lines of `-l` output that contain "GPU".
    # grep -c exits non-zero when nothing matches but still prints 0.
    count=$("$smi" -l | grep -c GPU) || true
  fi

  # Some wc implementations pad their output with spaces; strip them.
  count=${count//[[:space:]]/}

  # No tool, a failed query, or zero GPUs would otherwise yield 0 (or junk)
  # and make `x % $(get_num_gpus)` fail with a division by zero.
  if ! [[ "$count" =~ ^[0-9]+$ ]] || (( 10#$count < 1 )); then
    count=1
  fi

  printf '%s\n' "$count"
}

# Function to run tests for a specific model
run_tests_for_model() {
Expand All @@ -64,7 +73,7 @@ run_tests_for_model() {
# Start prefill instances
for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs
GPU_ID=$((i % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
GPU_ID=$((i % $(get_num_gpus)))
# Calculate port number (base port + instance number)
PORT=$((8100 + i))
# Calculate side channel port
Expand Down Expand Up @@ -96,7 +105,7 @@ run_tests_for_model() {
# Start decode instances
for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
# Calculate port number (base port + instance number)
PORT=$((8200 + i))
# Calculate side channel port
Expand Down