|
#!/bin/bash
#
# Prepares the working directory for deploying one cluster.
# Usage: <this script> <cluster-name>
#
# Clones the stability-hpc repository, installs/upgrades aws-parallelcluster
# for the current user, then sources "cluster_env" and "<cluster-name>.conf"
# so the variables they define are available to the rest of the script.

set -x

# The cluster name selects the .conf file; without it the script would try to
# source a file literally named ".conf".
if [[ -z "${1:-}" ]]; then
  echo "usage: $0 <cluster-name>" >&2
  exit 2
fi
CLUSTER_NAME=$1

# Always start from a fresh checkout.
rm -rf stability-hpc
if ! git clone "https://github.com/stability-ai/stability-hpc"; then
  # Later steps upload this checkout and read the cluster template from it.
  echo "error: could not clone stability-hpc" >&2
  exit 1
fi

python3 -m pip install "aws-parallelcluster" --user --upgrade --quiet

#source cluster profile
. cluster_env

#source cluster parameters
. "${CLUSTER_NAME}.conf"

#create the S3 bucket for the cluster

#######################################
# Builds the argument list for "aws s3api create-bucket".
# S3 rejects an explicit LocationConstraint of us-east-1, so the
# --create-bucket-configuration option is only added for other regions.
# Globals:   CREATE_BUCKET_ARGS (written, array)
# Arguments: $1 - bucket name
#            $2 - region name
#######################################
build_create_bucket_args() {
  local bucket=$1 region=$2
  CREATE_BUCKET_ARGS=(--bucket "${bucket}" --region "${region}")
  if [[ "${region}" != "us-east-1" ]]; then
    CREATE_BUCKET_ARGS+=(--create-bucket-configuration "LocationConstraint=${region}")
  fi
}

build_create_bucket_args "${S3_BUCKET}" "${AWS_REGION_NAME}"
# Output is discarded on purpose: the call also fails when the bucket already exists.
aws s3api create-bucket "${CREATE_BUCKET_ARGS[@]}" > /dev/null 2>&1
aws s3 cp --quiet --recursive stability-hpc "s3://${S3_BUCKET}/1click-hpc" --region "${AWS_REGION_NAME}"

#needed to config the domain
ADName=$(aws ds describe-directories --directory-id "${AD_ID}" --query 'DirectoryDescriptions[*].Name' --output text)
if [[ -z "${ADName}" ]]; then
  echo "warning: no directory name found for AD_ID '${AD_ID}'; DC0/DC1/DC2 will be empty" >&2
fi

#######################################
# Splits a dotted domain name into its first three labels and exports them.
# Labels beyond the third are ignored; missing labels become empty strings.
# Globals:   DC0, DC1, DC2 (exported)
# Arguments: $1 - domain name, e.g. corp.example.com
#######################################
split_domain_components() {
  local rest
  # One read with IFS=. replaces three echo|awk pipelines.
  IFS=. read -r DC0 DC1 DC2 rest <<< "$1"
  export DC0 DC1 DC2
}

split_domain_components "${ADName}"
export OU="AD-Manage"

#note that pcluster will refuse to create two lustre file systems if both values are AUTO
if [[ "${FSX_ID}" == "AUTO" && "${ADMINFSX_ID}" == "AUTO" ]]; then
  echo "warning: FSX_ID and ADMINFSX_ID are both AUTO; pcluster will refuse to create two lustre file systems" >&2
fi

#######################################
# Prints one FsxLustre storage list entry as a YAML fragment.
# With a file system id of AUTO the entry describes a new 1200 GiB SCRATCH_2
# file system with LZ4 compression; otherwise it references the given id.
# NOTE(review): the leading spaces must match the list nesting at the point in
# the cluster template where ${FSX} / ${ADMINFSX} are substituted - confirm.
# Arguments: $1 - mount directory (e.g. /fsx)
#            $2 - name suffix; the entry is named new<suffix> or existing<suffix>
#            $3 - existing file system id, or AUTO
# Outputs:   YAML fragment on stdout
#######################################
fsx_storage_entry() {
  local mount_dir=$1 suffix=$2 fs_id=$3
  if [[ "${fs_id}" == "AUTO" ]]; then
    cat <<EOF
  - MountDir: ${mount_dir}
    Name: new${suffix}
    StorageType: FsxLustre
    FsxLustreSettings:
      StorageCapacity: 1200
      DeploymentType: SCRATCH_2
      DataCompressionType: LZ4
EOF
  else
    cat <<EOF
  - MountDir: ${mount_dir}
    Name: existing${suffix}
    StorageType: FsxLustre
    FsxLustreSettings:
      FileSystemId: ${fs_id}
EOF
  fi
}

FSX=$(fsx_storage_entry /fsx fsx "${FSX_ID}")
ADMINFSX=$(fsx_storage_entry /admin admin "${ADMINFSX_ID}")

# Exported so envsubst can substitute them into the cluster template.
export FSX ADMINFSX

#######################################
# Chooses which subnets the cluster uses.
# When PRIVATE_SUBNET_ID is the literal NONE, SUBNET_ID falls back to the
# public subnet and USE_PUBLIC_IPS is 'true'; otherwise SUBNET_ID is the
# private subnet and USE_PUBLIC_IPS is 'false'.
# Globals:   PRIVATE_SUBNET_ID, PUBLIC_SUBNET_ID (read)
#            SUBNET_ID, USE_PUBLIC_IPS, HN_SUBNET_ID (exported)
#######################################
select_subnets() {
  # HN_SUBNET_ID is the public subnet in both cases, so set it once.
  export HN_SUBNET_ID="${PUBLIC_SUBNET_ID}"
  if [[ "${PRIVATE_SUBNET_ID}" == "NONE" ]]; then
    export SUBNET_ID="${PUBLIC_SUBNET_ID}"
    export USE_PUBLIC_IPS='true'
  else
    export SUBNET_ID="${PRIVATE_SUBNET_ID}"
    export USE_PUBLIC_IPS='false'
  fi
}

select_subnets

# Render the cluster configuration by substituting exported environment
# variables into the selected template from the stability-hpc checkout.
template_file="stability-hpc/parallelcluster/${TEMPLATE}"
config_file="config.${CLUSTER_NAME}.yaml"
if [[ ! -f "${template_file}" ]]; then
  # Without this check a missing template would leave a stale or absent config
  # file behind and the script would carry on regardless.
  echo "error: cluster template '${template_file}' not found" >&2
  exit 1
fi
/usr/bin/envsubst < "${template_file}" > "${config_file}"

# Private key for the cluster's EC2 key pair.
key_file="/home/ec2-user/.ssh/id_rsa_${CLUSTER_NAME}"

#######################################
# Asks EC2 for a new key pair and writes the private key material to key_file.
# Globals: KEY_PAIR, key_file (read)
# Returns: exit status of the aws call
#######################################
request_key_pair() {
  # Subshell with a restrictive umask so a newly created key file is never
  # group- or world-readable, even before the chmod below.
  (
    umask 077
    aws ec2 create-key-pair --key-name "${KEY_PAIR}" --query KeyMaterial --output text > "${key_file}"
  )
}

if ! request_key_pair; then
  # Creation failed; delete any key pair already registered under this name
  # and try once more.
  aws ec2 delete-key-pair --key-name "${KEY_PAIR}"
  request_key_pair || echo "error: could not create key pair '${KEY_PAIR}'" >&2
fi
sudo chmod 600 "${key_file}"

#Create the cluster and wait
# The rendered configuration is named after the cluster, not the region.
config_file="config.${CLUSTER_NAME}.yaml"
/home/ec2-user/.local/bin/pcluster create-cluster \
  --cluster-name "stability-${CLUSTER_NAME}" \
  --cluster-configuration "${config_file}" \
  --rollback-on-failure false \
  --wait >> bootstrap.log

# Keep the creation log and the configuration that was used in the cluster bucket.
aws s3 cp --quiet bootstrap.log "s3://${S3_BUCKET}/install.log" --region "${AWS_REGION_NAME}"
# Upload the file that was actually rendered (config.<cluster>.yaml); the
# destination key keeps its existing region-based name.
aws s3 cp --quiet "${config_file}" "s3://${S3_BUCKET}/config.${AWS_REGION_NAME}.yaml" --region "${AWS_REGION_NAME}"
0 commit comments