Skip to content

Commit a804928

Browse files
committed
more monitoring
1 parent 23e5054 commit a804928

File tree

5 files changed

+56
-25
lines changed

5 files changed

+56
-25
lines changed

Diff for: Templates/AWS-HPC-Cluster.yaml

+9-24
Original file line numberDiff line numberDiff line change
@@ -66,37 +66,37 @@ Parameters:
6666
Description: 'Please, enter your VPC ID, or just leave "AUTO" if you want to re-use an existing one.'
6767
Type: String
6868
AllowedPattern: ^(AUTO|vpc-[0-9a-z]+)$
69-
Default: AUTO
69+
Default: vpc-4678d63b
7070

7171
PublicSubnetAId:
7272
Description: 'Please, enter the ID of the Public Subnet you wish to use, or just leave "AUTO" if you want to re-use an existing one.'
7373
Type: String
7474
AllowedPattern: ^(AUTO|subnet-[0-9a-z]+)$
75-
Default : AUTO
75+
Default : subnet-2fd94770
7676

7777
PublicSubnetBId:
7878
Description: 'Please, enter another ID of the Public Subnet you wish to use, or just leave "AUTO" if you want to re-use an existing one.'
7979
Type: String
8080
AllowedPattern: ^(AUTO|subnet-[0-9a-z]+)$
81-
Default : AUTO
81+
Default : subnet-79775a34
8282

8383
PrivateSubnetAId:
8484
Description: 'Please, enter the ID of the Private Subnet you wish to use, or put NONE if you want to use only Public Subnets, or just leave "AUTO" if you want to re-use an existing one.'
8585
Type: String
8686
AllowedPattern: ^(AUTO|NONE|subnet-[0-9a-z]+)$
87-
Default : AUTO
87+
Default : subnet-0b1fd419324ad200e
8888

8989
FSx:
9090
Description: 'Please, enter your FSx ID, or just leave "AUTO" if you want to re-use an existing one.'
9191
Type: String
9292
AllowedPattern: ^(AUTO|fs-[0-9a-z]+)$
93-
Default : AUTO
93+
Default : fs-00bb39380c5b90f76
9494

9595
SlurmDBE:
9696
Description: 'Please, enter your organization SlurmDB Endpoint, or just leave "AUTO" to create a new one.'
9797
Type: String
9898
AllowedPattern: ^(AUTO|[0-9a-z\-\.]+\.rds.amazonaws.com)$
99-
Default : AUTO
99+
Default : stability-sacct.c2yrzdczuacd.us-east-1.rds.amazonaws.com
100100

101101
SlurmDBEPassword:
102102
Description: 'Please, enter the password for the Slurm DB Endpoint. Leave empty if Slurm DB Endpoint is set to AUTO'
@@ -110,19 +110,19 @@ Parameters:
110110
Description: 'Please, enter your Active Directory ID, or just leave "AUTO" if you want to create a new one.'
111111
Type: String
112112
AllowedPattern: ^(AUTO|d-[0-9a-z]+)$
113-
Default : AUTO
113+
Default : d-9067b1eaa3
114114

115115
ADDNS0:
116116
Description: 'Please enter IP of the first DNS server of the custom AD domain'
117117
Type: String
118118
AllowedPattern: (\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})
119-
Default: '0.0.0.0'
119+
Default: '172.31.18.41'
120120

121121
ADDNS1:
122122
Description: 'Please enter IP of the second DNS server of the custom AD domain'
123123
Type: String
124124
AllowedPattern: (\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})
125-
Default: '0.0.0.0'
125+
Default: '172.31.38.78'
126126

127127
ADPassword:
128128
Description: 'Please enter Admin user password of the custom AD domain'
@@ -1039,21 +1039,6 @@ Resources:
10391039
TargetGroupArn: !Ref TargetGroupMonitoring
10401040
Type: "forward"
10411041

1042-
8888SListener:
1043-
Type: "AWS::ElasticLoadBalancingV2::Listener"
1044-
Properties:
1045-
LoadBalancerArn: !Ref ApplicationLoadBalancer
1046-
Port: 8888
1047-
Protocol: "HTTPS"
1048-
SslPolicy: "ELBSecurityPolicy-2016-08"
1049-
Certificates:
1050-
- CertificateArn: !GetAtt LBInit.IamCertificateArn
1051-
DefaultActions:
1052-
-
1053-
Order: 1
1054-
TargetGroupArn: arn:aws:elasticloadbalancing:us-east-1:842865360552:targetgroup/ad-selfservice/2e6c3f1b41b09a53
1055-
Type: "forward"
1056-
10571042
HTTPListener:
10581043
Type: "AWS::ElasticLoadBalancingV2::Listener"
10591044
Properties:

Diff for: modules/40.install.monitoring.gpu.sh

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/bash
2+
set -x
3+
set -e
4+
5+
#######################################
# Install the GPU/EFA CloudWatch exporters shipped in the
# aws-efa-nccl-baseami-pipeline sample repository, generate the wrapper
# script that launches them, and enable + start aws-hw-monitor.service.
#
# Safe to re-run: the repository is cloned into a fresh temporary directory
# (removed afterwards) and the exporter tree is copied over any existing
# /opt/aws/cloudwatch instead of being moved into it.
#
# Globals:   none
# Arguments: none
# Outputs:   command output from pip, yum, git and systemctl
# Returns:   non-zero (or aborts under `set -e`) if any step fails
#######################################
installEFAmon() {
    local repo_url="https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline.git"
    local cw_dir="/opt/aws/cloudwatch"
    local wrapper="${cw_dir}/aws-cloudwatch-wrapper.sh"
    local clone_dir

    # Install EFA Exporter prerequisites.
    # Use the same interpreter the wrapper script runs, so boto3 is
    # guaranteed to be importable by the exporters.
    /usr/bin/python3 -m pip install --upgrade pip
    /usr/bin/python3 -m pip install boto3
    yum install amazon-cloudwatch-agent -y

    # Clone into a unique temporary directory: a fixed /tmp path makes the
    # second run fail ("destination path already exists").
    clone_dir="$(mktemp -d)"
    git clone --depth 1 "${repo_url}" "${clone_dir}/repo"

    # Copy the *contents* of the cloudwatch tree so that an already existing
    # ${cw_dir} is refreshed rather than getting a nested cloudwatch/ dir.
    mkdir -p "${cw_dir}"
    cp -a "${clone_dir}/repo/nvidia-efa-ami_base/cloudwatch/." "${cw_dir}/"
    rm -rf -- "${clone_dir}"

    # The unit file lives in the systemd directory, not next to the exporters.
    mv -f "${cw_dir}/aws-hw-monitor.service" /lib/systemd/system

    # Wrapper script: every exporter is started in the background.
    # The quoted delimiter keeps the content literal (no expansion).
    cat > "${wrapper}" <<'EOF'
#!/bin/sh

/usr/bin/python3 /opt/aws/cloudwatch/nvidia/aws-hwaccel-error-parser.py &

/usr/bin/python3 /opt/aws/cloudwatch/nvidia/accel-to-cw.py /opt/aws/cloudwatch/nvidia/nvidia-exporter >> /dev/null 2>&1 &

/usr/bin/python3 /opt/aws/cloudwatch/efa/efa-to-cw.py /opt/aws/cloudwatch/efa/efa-exporter >> /dev/null 2>&1 &

EOF
    chmod +x "${wrapper}"

    systemctl enable aws-hw-monitor.service
    systemctl start aws-hw-monitor.service
}
21+
22+
# main
23+
# ----------------------------------------------------------------------------
24+
# Entry point: log a START marker to stderr, run the EFA/GPU monitoring
# installer, then log a STOP marker to stderr.
main() {
    local stamp

    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    printf '%s\n' "[INFO][${stamp}] 40.install.monitoring.gpu.sh: START" >&2

    installEFAmon

    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    printf '%s\n' "[INFO][${stamp}] 40.install.monitoring.gpu.sh: STOP" >&2
}
29+
30+
main "$@"

Diff for: modules/90.allow.nvidia.debug.gpu.sh

+9
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,21 @@ allowDebugGPU() {
66
aws s3 cp --quiet "${post_install_base}/nvidia/99-nvidia-debug" /etc/sudoers.d/ --region "${cfn_region}" || exit 1
77
}
88

9+
#######################################
# Install NVIDIA Data Center GPU Manager (DCGM) from the NVIDIA CUDA RHEL7
# yum repository and launch its host engine.
#
# Globals:   none
# Arguments: none
# Returns:   exit status of the last command (nv-hostengine launch)
#######################################
installDCGM() {
    # Register the NVIDIA CUDA repository, then refresh yum metadata so the
    # new repo is picked up by the install below.
    yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
    yum clean all
    yum install -y datacenter-gpu-manager

    # Start nv-hostengine (arguments kept exactly as in the original call).
    sudo -u root nv-hostengine -b 0
}
16+
917

1018
# main
1119
# ----------------------------------------------------------------------------
1220
# Entry point: log a START marker to stderr, allow the NVIDIA debug sudo
# rules, install DCGM, then log a STOP marker to stderr.
main() {
    local stamp

    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    printf '%s\n' "[INFO][${stamp}] 90.allow.nvidia.debug.gpu.sh: START" >&2

    allowDebugGPU
    installDCGM

    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    printf '%s\n' "[INFO][${stamp}] 90.allow.nvidia.debug.gpu.sh: STOP" >&2
}
1726

Diff for: nvidia/99-nvidia-debug

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
Cmnd_Alias NVIDIA_DEBUG = /usr/bin/nvidia-bug-report.sh
2+
Cmnd_Alias NVIDIA_SMI = /usr/bin/nvidia-smi
23

3-
slurm ALL = (root) NOPASSWD: NVIDIA_DEBUG
4+
%hpc-cluster-users ALL=(ALL) NOPASSWD: NVIDIA_DEBUG, NVIDIA_SMI

Diff for: parallelcluster/config.us-east-1.sample.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -81,11 +81,13 @@ Scheduling:
8181
- 04.configure.disable.anacron.compute.sh
8282
- 35.boost.gpu.clock.gpu.sh
8383
- 40.install.monitoring.compute.sh
84+
- 40.install.monitoring.gpu.sh
8485
- 45.install.nccl.compute.sh
8586
- 60.install.gpumon.cloudwatch.metrics.sh
8687
- 70.install.enroot.pyxis.sh
8788
- 70.install.singularity.compute.sh
8889
- 80.fast.aws.cli.compute.sh
90+
- 90.allow.nvidia.debug.gpu.sh
8991
Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh
9092
Iam:
9193
AdditionalIamPolicies:
@@ -131,10 +133,12 @@ Scheduling:
131133
- 04.configure.disable.anacron.compute.sh
132134
- 35.boost.gpu.clock.gpu.sh
133135
- 40.install.monitoring.compute.sh
136+
- 40.install.monitoring.gpu.sh
134137
- 45.install.nccl.compute.sh
135138
- 60.install.gpumon.cloudwatch.metrics.sh
136139
- 80.fast.aws.cli.compute.sh
137140
- 85.install.jupiter.notebook.gpu.sh
141+
- 90.allow.nvidia.debug.gpu.sh
138142
Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh
139143
Iam:
140144
AdditionalIamPolicies:
@@ -180,11 +184,13 @@ Scheduling:
180184
- 04.configure.disable.anacron.compute.sh
181185
- 35.boost.gpu.clock.gpu.sh
182186
- 40.install.monitoring.compute.sh
187+
- 40.install.monitoring.gpu.sh
183188
- 45.install.nccl.compute.sh
184189
- 60.install.gpumon.cloudwatch.metrics.sh
185190
- 70.install.enroot.pyxis.sh
186191
- 70.install.singularity.compute.sh
187192
- 80.fast.aws.cli.compute.sh
193+
- 90.allow.nvidia.debug.gpu.sh
188194
Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh
189195
Iam:
190196
AdditionalIamPolicies:

0 commit comments

Comments
 (0)