Skip to content

Commit 21f326d

Browse files
authored
MPICH support (#562)
* Add support for MPICH * Fix CI errors * Temporary: manual trigger * Fix file name * Add an empty line at the end of the file * Fix formatting * Revert "Temporary: manual trigger" This reverts commit 15164a8b7036482f138562d49630c96eb92114c5. * fix formatting * Regenerate the mpi-operator.yaml * Adding an empty line at the end of Dockerfiles * Share the same entrypoint for Intel and MPICH * share hostfile generation between Intel and MPICH * Add validation test for MPICH * Fix formatting * Don't over engineer the tests - be explicit * add non-root tests for IntelMPI and MPICH
1 parent caa1112 commit 21f326d

26 files changed

+381
-59
lines changed

Makefile

+5
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ BASE_IMAGE_SSH_PORT?=2222
2323
IMG_BUILDER=docker
2424
PLATFORMS ?= linux/amd64
2525
INTEL_PLATFORMS ?= linux/amd64
26+
MPICH_PLATFORMS ?= linux/amd64
2627
LD_FLAGS_V2=" \
2728
-X '${REPO_PATH}/pkg/version.GitSHA=${GitSHA}' \
2829
-X '${REPO_PATH}/pkg/version.Built=${Date}' \
@@ -71,6 +72,7 @@ test: bin/envtest scheduler-plugins-crd
7172
test_e2e: export TEST_MPI_OPERATOR_IMAGE=${IMAGE_NAME}:${RELEASE_VERSION}
7273
test_e2e: export TEST_OPENMPI_IMAGE=mpioperator/mpi-pi:${RELEASE_VERSION}-openmpi
7374
test_e2e: export TEST_INTELMPI_IMAGE=mpioperator/mpi-pi:${RELEASE_VERSION}-intel
75+
test_e2e: export TEST_MPICH_IMAGE=mpioperator/mpi-pi:${RELEASE_VERSION}-mpich
7476
test_e2e: bin/kubectl kind helm images test_images dev_manifest scheduler-plugins-chart
7577
go test -v ./test/e2e/...
7678

@@ -108,6 +110,9 @@ test_images:
108110
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(INTEL_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/intel:${RELEASE_VERSION} build/base -f build/base/intel.Dockerfile
109111
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(INTEL_PLATFORMS) -t mpioperator/intel-builder:${RELEASE_VERSION} build/base -f build/base/intel-builder.Dockerfile
110112
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(INTEL_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/mpi-pi:${RELEASE_VERSION}-intel examples/v2beta1/pi -f examples/v2beta1/pi/intel.Dockerfile
113+
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(MPICH_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/mpich:${RELEASE_VERSION} build/base -f build/base/mpich.Dockerfile
114+
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(MPICH_PLATFORMS) -t mpioperator/mpich-builder:${RELEASE_VERSION} build/base -f build/base/mpich-builder.Dockerfile
115+
${IMG_BUILDER} build $(BUILD_ARGS) --platform $(MPICH_PLATFORMS) --build-arg BASE_LABEL=${RELEASE_VERSION} -t mpioperator/mpi-pi:${RELEASE_VERSION}-mpich examples/v2beta1/pi -f examples/v2beta1/pi/mpich.Dockerfile
111116

112117
.PHONY: tidy
113118
tidy:

README.md

+6
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,12 @@ For a sample that uses Intel MPI, see:
218218
cat examples/pi/pi-intel.yaml
219219
```
220220

221+
For a sample that uses MPICH, see:
222+
223+
```bash
224+
cat examples/pi/pi-mpich.yaml
225+
```
226+
221227
## Exposed Metrics
222228

223229
| Metric name | Metric type | Description | Labels |
File renamed without changes.

build/base/intel.Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,5 @@ RUN apt update \
2222
intel-oneapi-mpi \
2323
&& rm -rf /var/lib/apt/lists/*
2424

25-
COPY intel-entrypoint.sh /entrypoint.sh
25+
COPY entrypoint.sh /entrypoint.sh
2626
ENTRYPOINT ["/entrypoint.sh"]

build/base/mpich-builder.Dockerfile

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
FROM debian:bullseye as builder
2+
3+
RUN apt update \
4+
&& apt install -y --no-install-recommends \
5+
g++ \
6+
libmpich-dev \
7+
&& rm -rf /var/lib/apt/lists/*

build/base/mpich.Dockerfile

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
ARG BASE_LABEL
2+
3+
FROM mpioperator/base:${BASE_LABEL}
4+
5+
RUN apt update \
6+
&& apt install -y --no-install-recommends \
7+
dnsutils \
8+
mpich \
9+
&& rm -rf /var/lib/apt/lists/*
10+
11+
COPY entrypoint.sh /entrypoint.sh
12+
ENTRYPOINT ["/entrypoint.sh"]

deploy/v2beta1/mpi-operator.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,11 @@ spec:
5858
mpiImplementation:
5959
default: OpenMPI
6060
description: MPIImplementation is the MPI implementation. Options
61-
are "OpenMPI" (default) and "Intel".
61+
are "OpenMPI" (default), "Intel" and "MPICH".
6262
enum:
6363
- OpenMPI
6464
- Intel
65+
- MPICH
6566
type: string
6667
mpiReplicaSpecs:
6768
additionalProperties:

examples/v2beta1/pi/Dockerfile

+1-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ FROM mpioperator/openmpi-builder:${BASE_LABEL} as builder
55
COPY pi.cc /src/pi.cc
66
RUN mpic++ /src/pi.cc -o /pi
77

8-
98
FROM mpioperator/openmpi:${BASE_LABEL}
109

11-
COPY --from=builder /pi /home/mpiuser/pi
10+
COPY --from=builder /pi /home/mpiuser/pi

examples/v2beta1/pi/README.md

+7-1
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,15 @@ For Intel MPI:
1919
docker build -t mpi-pi . -f intel.Dockerfile
2020
```
2121

22+
For MPICH:
23+
24+
```bash
25+
docker build -t mpi-pi . -f mpich.Dockerfile
26+
```
27+
2228
## Create MPIJob
2329

24-
Modify `pi.yaml` (for OpenMPI) or `pi-intel.yaml` (for Intel MPI) to set up the
30+
Modify `pi.yaml` (for OpenMPI), `pi-intel.yaml` (for Intel MPI) or `pi-mpich.yaml` (for MPICH) to set up the
2531
image name from your own registry.
2632

2733
Then, run:

examples/v2beta1/pi/intel.Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@ RUN bash -c "source /opt/intel/oneapi/setvars.sh && mpicxx /src/pi.cc -o /pi"
77

88
FROM mpioperator/intel:${BASE_LABEL}
99

10-
COPY --from=builder /pi /home/mpiuser/pi
10+
COPY --from=builder /pi /home/mpiuser/pi

examples/v2beta1/pi/mpich.Dockerfile

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
ARG BASE_LABEL
2+
3+
FROM mpioperator/mpich-builder:${BASE_LABEL} as builder
4+
5+
COPY pi.cc /src/pi.cc
6+
RUN mpic++ /src/pi.cc -o /pi
7+
8+
FROM mpioperator/mpich:${BASE_LABEL}
9+
10+
COPY --from=builder /pi /home/mpiuser/pi

examples/v2beta1/pi/pi-mpich.yaml

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
apiVersion: kubeflow.org/v2beta1
2+
kind: MPIJob
3+
metadata:
4+
name: pi
5+
spec:
6+
slotsPerWorker: 1
7+
runPolicy:
8+
cleanPodPolicy: Running
9+
sshAuthMountPath: /home/mpiuser/.ssh
10+
mpiImplementation: MPICH
11+
mpiReplicaSpecs:
12+
Launcher:
13+
replicas: 1
14+
template:
15+
spec:
16+
containers:
17+
- image: mpioperator/mpi-pi:mpich
18+
imagePullPolicy: Always
19+
name: mpi-launcher
20+
securityContext:
21+
runAsUser: 1000
22+
args:
23+
- mpirun
24+
- -n
25+
- "2"
26+
- /home/mpiuser/pi
27+
resources:
28+
limits:
29+
cpu: 1
30+
memory: 1Gi
31+
Worker:
32+
replicas: 2
33+
template:
34+
spec:
35+
containers:
36+
- image: mpioperator/mpi-pi:mpich
37+
imagePullPolicy: Always
38+
name: mpi-worker
39+
securityContext:
40+
runAsUser: 1000
41+
command:
42+
args:
43+
- /usr/sbin/sshd
44+
- -De
45+
- -f
46+
- /home/mpiuser/.sshd_config
47+
readinessProbe:
48+
tcpSocket:
49+
port: 2222
50+
initialDelaySeconds: 2
51+
resources:
52+
limits:
53+
cpu: 1
54+
memory: 1Gi

manifests/base/kubeflow.org_mpijobs.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,11 @@ spec:
3535
mpiImplementation:
3636
default: OpenMPI
3737
description: MPIImplementation is the MPI implementation. Options
38-
are "OpenMPI" (default) and "Intel".
38+
are "OpenMPI" (default), "Intel" and "MPICH".
3939
enum:
4040
- OpenMPI
4141
- Intel
42+
- MPICH
4243
type: string
4344
mpiReplicaSpecs:
4445
additionalProperties:

pkg/apis/kubeflow/v2beta1/default_test.go

+29-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ func TestSetDefaults_MPIJob(t *testing.T) {
3838
},
3939
},
4040
},
41-
"base defaults overridden": {
41+
"base defaults overridden (intel)": {
4242
job: MPIJob{
4343
Spec: MPIJobSpec{
4444
SlotsPerWorker: newInt32(10),
@@ -66,6 +66,34 @@ func TestSetDefaults_MPIJob(t *testing.T) {
6666
},
6767
},
6868
},
69+
"base defaults overridden (mpich)": {
70+
job: MPIJob{
71+
Spec: MPIJobSpec{
72+
SlotsPerWorker: newInt32(10),
73+
RunPolicy: RunPolicy{
74+
CleanPodPolicy: NewCleanPodPolicy(CleanPodPolicyRunning),
75+
TTLSecondsAfterFinished: newInt32(2),
76+
ActiveDeadlineSeconds: newInt64(3),
77+
BackoffLimit: newInt32(4),
78+
},
79+
SSHAuthMountPath: "/home/mpiuser/.ssh",
80+
MPIImplementation: MPIImplementationMPICH,
81+
},
82+
},
83+
want: MPIJob{
84+
Spec: MPIJobSpec{
85+
SlotsPerWorker: newInt32(10),
86+
RunPolicy: RunPolicy{
87+
CleanPodPolicy: NewCleanPodPolicy(CleanPodPolicyRunning),
88+
TTLSecondsAfterFinished: newInt32(2),
89+
ActiveDeadlineSeconds: newInt64(3),
90+
BackoffLimit: newInt32(4),
91+
},
92+
SSHAuthMountPath: "/home/mpiuser/.ssh",
93+
MPIImplementation: MPIImplementationMPICH,
94+
},
95+
},
96+
},
6997
"launcher defaults": {
7098
job: MPIJob{
7199
Spec: MPIJobSpec{

pkg/apis/kubeflow/v2beta1/openapi_generated.go

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/apis/kubeflow/v2beta1/swagger.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@
322322
],
323323
"properties": {
324324
"mpiImplementation": {
325-
"description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default) and \"Intel\".",
325+
"description": "MPIImplementation is the MPI implementation. Options are \"OpenMPI\" (default), \"Intel\" and \"MPICH\".",
326326
"type": "string"
327327
},
328328
"mpiReplicaSpecs": {

pkg/apis/kubeflow/v2beta1/types.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -155,8 +155,8 @@ type MPIJobSpec struct {
155155
SSHAuthMountPath string `json:"sshAuthMountPath,omitempty"`
156156

157157
// MPIImplementation is the MPI implementation.
158-
// Options are "OpenMPI" (default) and "Intel".
159-
// +kubebuilder:validation:Enum:=OpenMPI;Intel
158+
// Options are "OpenMPI" (default), "Intel" and "MPICH".
159+
// +kubebuilder:validation:Enum:=OpenMPI;Intel;MPICH
160160
// +kubebuilder:default:=OpenMPI
161161
MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"`
162162
}
@@ -177,6 +177,7 @@ type MPIImplementation string
177177
const (
178178
MPIImplementationOpenMPI MPIImplementation = "OpenMPI"
179179
MPIImplementationIntel MPIImplementation = "Intel"
180+
MPIImplementationMPICH MPIImplementation = "MPICH"
180181
)
181182

182183
// JobStatus represents the current observed state of the training Job.

pkg/apis/kubeflow/validation/validation.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ var (
3535

3636
validMPIImplementations = sets.NewString(
3737
string(kubeflow.MPIImplementationOpenMPI),
38-
string(kubeflow.MPIImplementationIntel))
38+
string(kubeflow.MPIImplementationIntel),
39+
string(kubeflow.MPIImplementationMPICH))
3940

4041
validRestartPolicies = sets.NewString(
4142
string(common.RestartPolicyNever),

pkg/apis/kubeflow/validation/validation_test.go

+63-2
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ func TestValidateMPIJob(t *testing.T) {
3131
job kubeflow.MPIJob
3232
wantErrs field.ErrorList
3333
}{
34-
"valid": {
34+
"valid (intel)": {
3535
job: kubeflow.MPIJob{
3636
ObjectMeta: metav1.ObjectMeta{
3737
Name: "foo",
@@ -57,7 +57,7 @@ func TestValidateMPIJob(t *testing.T) {
5757
},
5858
},
5959
},
60-
"valid with worker": {
60+
"valid with worker (intel)": {
6161
job: kubeflow.MPIJob{
6262
ObjectMeta: metav1.ObjectMeta{
6363
Name: "foo",
@@ -92,6 +92,67 @@ func TestValidateMPIJob(t *testing.T) {
9292
},
9393
},
9494
},
95+
"valid (mpich)": {
96+
job: kubeflow.MPIJob{
97+
ObjectMeta: metav1.ObjectMeta{
98+
Name: "foo",
99+
},
100+
Spec: kubeflow.MPIJobSpec{
101+
SlotsPerWorker: newInt32(2),
102+
RunPolicy: kubeflow.RunPolicy{
103+
CleanPodPolicy: kubeflow.NewCleanPodPolicy(kubeflow.CleanPodPolicyRunning),
104+
},
105+
SSHAuthMountPath: "/home/mpiuser/.ssh",
106+
MPIImplementation: kubeflow.MPIImplementationMPICH,
107+
MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*common.ReplicaSpec{
108+
kubeflow.MPIReplicaTypeLauncher: {
109+
Replicas: newInt32(1),
110+
RestartPolicy: common.RestartPolicyNever,
111+
Template: corev1.PodTemplateSpec{
112+
Spec: corev1.PodSpec{
113+
Containers: []corev1.Container{{}},
114+
},
115+
},
116+
},
117+
},
118+
},
119+
},
120+
},
121+
"valid with worker (mpich)": {
122+
job: kubeflow.MPIJob{
123+
ObjectMeta: metav1.ObjectMeta{
124+
Name: "foo",
125+
},
126+
Spec: kubeflow.MPIJobSpec{
127+
SlotsPerWorker: newInt32(2),
128+
RunPolicy: kubeflow.RunPolicy{
129+
CleanPodPolicy: kubeflow.NewCleanPodPolicy(kubeflow.CleanPodPolicyRunning),
130+
},
131+
SSHAuthMountPath: "/home/mpiuser/.ssh",
132+
MPIImplementation: kubeflow.MPIImplementationMPICH,
133+
MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*common.ReplicaSpec{
134+
kubeflow.MPIReplicaTypeLauncher: {
135+
Replicas: newInt32(1),
136+
RestartPolicy: common.RestartPolicyOnFailure,
137+
Template: corev1.PodTemplateSpec{
138+
Spec: corev1.PodSpec{
139+
Containers: []corev1.Container{{}},
140+
},
141+
},
142+
},
143+
kubeflow.MPIReplicaTypeWorker: {
144+
Replicas: newInt32(3),
145+
RestartPolicy: common.RestartPolicyNever,
146+
Template: corev1.PodTemplateSpec{
147+
Spec: corev1.PodSpec{
148+
Containers: []corev1.Container{{}},
149+
},
150+
},
151+
},
152+
},
153+
},
154+
},
155+
},
95156
"empty job": {
96157
wantErrs: field.ErrorList{
97158
&field.Error{

0 commit comments

Comments
 (0)