
Commit aa90861

Add shm_volume option (#427)
Add the possibility to mount a tmpfs volume to `/dev/shm` to avoid issues like [this](docker-library/postgres#416). To achieve that, two new options were introduced:

* `enableShmVolume` in the PostgreSQL manifest, to specify whether or not to mount this volume per database cluster
* `enable_shm_volume` in the operator configuration, to specify whether or not to mount it operator-wide

The first one, `enableShmVolume`, takes precedence, to allow per-cluster overrides.
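The resolution between the two options is implemented by `mountShmVolumeNeeded` in `pkg/cluster/k8sres.go` below. As a minimal standalone sketch of the same rule, with hypothetical names (`needShmVolume`, `manifestSetting`), assuming a nullable manifest field and a boolean operator default:

```go
package main

import "fmt"

// needShmVolume mirrors the precedence described above: a value set in the
// cluster manifest always wins; otherwise the operator-wide default applies.
func needShmVolume(manifestSetting *bool, operatorDefault bool) bool {
	if manifestSetting != nil {
		return *manifestSetting
	}
	return operatorDefault
}

func main() {
	off := false
	fmt.Println(needShmVolume(nil, true))  // true: operator default applies
	fmt.Println(needShmVolume(&off, true)) // false: manifest overrides the operator
}
```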
1 parent: 0bdc3e4 · commit: aa90861

10 files changed: +137 −7 lines

docs/reference/cluster_manifest.md (+14 −1)

@@ -96,7 +96,19 @@ Those are parameters grouped directly under the `spec` key in the manifest.
   that should be assigned to the cluster pods. When not specified, the value
   is taken from the `pod_priority_class_name` operator parameter, if not set
   then the default priority class is taken. The priority class itself must be defined in advance.
-
+
+* **enableShmVolume**
+  Start a database pod without limitations on shm memory. By default Docker
+  limits `/dev/shm` to `64M` (see e.g. the [docker
+  issue](https://github.com/docker-library/postgres/issues/416)), which may
+  not be enough if PostgreSQL uses parallel workers heavily. If this option is
+  present and set to `true`, a new tmpfs volume is mounted into the target
+  database pod to remove this limitation. If it is not present, the decision
+  about mounting the volume is made based on the operator configuration
+  (`enable_shm_volume`, which is `true` by default). If it is present and set
+  to `false`, no volume is mounted no matter how the operator is configured
+  (so you can override the operator configuration).
+
 ## Postgres parameters
 
 Those parameters are grouped under the `postgresql` top-level key.

@@ -112,6 +124,7 @@ Those parameters are grouped under the `postgresql` top-level key.
   cluster. Optional (Spilo automatically sets reasonable defaults for
   parameters like work_mem or max_connections).
 
+
 ## Patroni parameters
 
 Those parameters are grouped under the `patroni` top-level key. See the [patroni

docs/reference/operator_parameters.md (+8)

@@ -224,6 +224,14 @@ CRD-based configuration.
 * **set_memory_request_to_limit**
   Set `memory_request` to `memory_limit` for all Postgres clusters (the default value is also increased). This prevents certain cases of memory overcommitment at the cost of overprovisioning memory and potential scheduling problems for containers with high memory limits due to the lack of memory on Kubernetes cluster nodes. This affects all containers (Postgres, Scalyr sidecar, and other sidecars). The default is `false`.
 
+* **enable_shm_volume**
+  Instruct the operator to start any new database pod without limitations on
+  shm memory. If this option is enabled, a new tmpfs volume is mounted into
+  the target database pod to remove the shm memory limitation (see e.g. the
+  [docker issue](https://github.com/docker-library/postgres/issues/416)).
+  This option is global for an operator object and can be overridden by the
+  `enableShmVolume` parameter from the Postgres manifest. The default is `true`.
+
 ## Operator timeouts
 
 This set of parameters defines various timeouts related to some operator

manifests/complete-postgres-manifest.yaml (+2 −1)

@@ -13,12 +13,13 @@ spec:
   - superuser
   - createdb
   enableMasterLoadBalancer: true
-  enableReplicaLoadBalancer: true
+  enableReplicaLoadBalancer: true
   allowedSourceRanges:  # load balancers' source ranges for both master and replica services
   - 127.0.0.1/32
   databases:
     foo: zalando
   #Expert section
+  enableShmVolume: true
   postgresql:
     version: "10"
     parameters:

pkg/apis/acid.zalan.do/v1/postgresql_type.go (+1)

@@ -51,6 +51,7 @@ type PostgresSpec struct {
 	Tolerations          []v1.Toleration `json:"tolerations,omitempty"`
 	Sidecars             []Sidecar       `json:"sidecars,omitempty"`
 	PodPriorityClassName string          `json:"pod_priority_class_name,omitempty"`
+	ShmVolume            *bool           `json:"enableShmVolume,omitempty"`
 }
 
 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
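The field is a `*bool` rather than a `bool` so that an absent key can be told apart from an explicit `false`. A small self-contained sketch (the `spec` struct here is a stand-in, not the operator's type) showing the three states after JSON unmarshalling:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// spec is a stand-in for PostgresSpec with just the new field.
type spec struct {
	ShmVolume *bool `json:"enableShmVolume,omitempty"`
}

func main() {
	for _, doc := range []string{`{}`, `{"enableShmVolume": false}`, `{"enableShmVolume": true}`} {
		var s spec
		if err := json.Unmarshal([]byte(doc), &s); err != nil {
			panic(err)
		}
		if s.ShmVolume == nil {
			fmt.Printf("%-30s -> unset, operator config decides\n", doc)
		} else {
			fmt.Printf("%-30s -> %v\n", doc, *s.ShmVolume)
		}
	}
}
```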

pkg/apis/acid.zalan.do/v1/util_test.go (+3 −3)

@@ -499,19 +499,19 @@ func TestMarshal(t *testing.T) {
 			t.Errorf("Marshal error: %v", err)
 		}
 		if !bytes.Equal(m, tt.marshal) {
-			t.Errorf("Marshal Postgresql expected: %q, got: %q", string(tt.marshal), string(m))
+			t.Errorf("Marshal Postgresql \nexpected: %q, \ngot: %q", string(tt.marshal), string(m))
 		}
 	}
 }
 
 func TestPostgresMeta(t *testing.T) {
 	for _, tt := range unmarshalCluster {
 		if a := tt.out.GetObjectKind(); a != &tt.out.TypeMeta {
-			t.Errorf("GetObjectKindMeta expected: %v, got: %v", tt.out.TypeMeta, a)
+			t.Errorf("GetObjectKindMeta \nexpected: %v, \ngot: %v", tt.out.TypeMeta, a)
 		}
 
 		if a := tt.out.GetObjectMeta(); reflect.DeepEqual(a, tt.out.ObjectMeta) {
-			t.Errorf("GetObjectMeta expected: %v, got: %v", tt.out.ObjectMeta, a)
+			t.Errorf("GetObjectMeta \nexpected: %v, \ngot: %v", tt.out.ObjectMeta, a)
 		}
 	}
 }

pkg/cluster/k8sres.go (+50 −2)

@@ -18,6 +18,7 @@ import (
 	acidv1 "github.com/zalando-incubator/postgres-operator/pkg/apis/acid.zalan.do/v1"
 	"github.com/zalando-incubator/postgres-operator/pkg/spec"
 	"github.com/zalando-incubator/postgres-operator/pkg/util"
+	"github.com/zalando-incubator/postgres-operator/pkg/util/config"
 	"github.com/zalando-incubator/postgres-operator/pkg/util/constants"
 	"k8s.io/apimachinery/pkg/labels"
 )

@@ -396,6 +397,16 @@ func generateSidecarContainers(sidecars []acidv1.Sidecar,
 	return nil, nil
 }
 
+// Check whether or not we are requested to mount an shm volume,
+// taking into account that the PostgreSQL manifest has precedence.
+func mountShmVolumeNeeded(opConfig config.Config, pgSpec *acidv1.PostgresSpec) bool {
+	if pgSpec.ShmVolume != nil {
+		return *pgSpec.ShmVolume
+	}
+
+	return opConfig.ShmVolume
+}
+
 func generatePodTemplate(
 	namespace string,
 	labels labels.Set,

@@ -407,6 +418,7 @@ func generatePodTemplate(
 	podServiceAccountName string,
 	kubeIAMRole string,
 	priorityClassName string,
+	shmVolume bool,
 ) (*v1.PodTemplateSpec, error) {
 
 	terminateGracePeriodSeconds := terminateGracePeriod

@@ -420,6 +432,10 @@ func generatePodTemplate(
 		Tolerations: *tolerationsSpec,
 	}
 
+	if shmVolume {
+		addShmVolume(&podSpec)
+	}
+
 	if nodeAffinity != nil {
 		podSpec.Affinity = nodeAffinity
 	}

@@ -733,7 +749,12 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*v1beta1.State
 	volumeMounts := generateVolumeMounts()
 
 	// generate the spilo container
-	spiloContainer := generateSpiloContainer(c.containerName(), &effectiveDockerImage, resourceRequirements, spiloEnvVars, volumeMounts)
+	spiloContainer := generateSpiloContainer(c.containerName(),
+		&effectiveDockerImage,
+		resourceRequirements,
+		spiloEnvVars,
+		volumeMounts,
+	)
 
 	// resolve conflicts between operator-global and per-cluster sidecars
 	sideCars := c.mergeSidecars(spec.Sidecars)

@@ -775,7 +796,8 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*v1beta1.State
 		int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
 		c.OpConfig.PodServiceAccountName,
 		c.OpConfig.KubeIAMRole,
-		effectivePodPriorityClassName); err != nil {
+		effectivePodPriorityClassName,
+		mountShmVolumeNeeded(c.OpConfig, spec)); err != nil {
 		return nil, fmt.Errorf("could not generate pod template: %v", err)
 	}

@@ -882,6 +904,32 @@ func (c *Cluster) getNumberOfInstances(spec *acidv1.PostgresSpec) int32 {
 	return newcur
 }
 
+// To avoid issues with the limited /dev/shm inside a docker environment,
+// where PostgreSQL can't allocate enough dsa segments from it, we can
+// mount an extra memory volume.
+//
+// See https://docs.okd.io/latest/dev_guide/shared_memory.html
+func addShmVolume(podSpec *v1.PodSpec) {
+	volumes := append(podSpec.Volumes, v1.Volume{
+		Name: constants.ShmVolumeName,
+		VolumeSource: v1.VolumeSource{
+			EmptyDir: &v1.EmptyDirVolumeSource{
+				Medium: "Memory",
+			},
+		},
+	})
+
+	pgIdx := constants.PostgresContainerIdx
+	mounts := append(podSpec.Containers[pgIdx].VolumeMounts,
+		v1.VolumeMount{
+			Name:      constants.ShmVolumeName,
+			MountPath: constants.ShmVolumePath,
+		})
+
+	podSpec.Containers[pgIdx].VolumeMounts = mounts
+	podSpec.Volumes = volumes
+}
+
 func generatePersistentVolumeClaimTemplate(volumeSize, volumeStorageClass string) (*v1.PersistentVolumeClaim, error) {
 
 	var storageClassName *string
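What `addShmVolume` produces is a memory-backed `emptyDir` mounted at `/dev/shm`, the standard Kubernetes workaround for the 64M container default. A self-contained sketch using the upstream `k8s.io/api/core/v1` types; the typed constant `v1.StorageMediumMemory` is the same `"Memory"` string the diff uses literally:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func main() {
	podSpec := v1.PodSpec{
		Containers: []v1.Container{{Name: "postgres"}},
	}

	// A tmpfs-backed emptyDir volume...
	podSpec.Volumes = append(podSpec.Volumes, v1.Volume{
		Name: "dshm",
		VolumeSource: v1.VolumeSource{
			EmptyDir: &v1.EmptyDirVolumeSource{Medium: v1.StorageMediumMemory},
		},
	})
	// ...mounted over /dev/shm in the postgres container.
	podSpec.Containers[0].VolumeMounts = append(podSpec.Containers[0].VolumeMounts,
		v1.VolumeMount{Name: "dshm", MountPath: "/dev/shm"})

	fmt.Printf("volume: %+v\n", podSpec.Volumes[0])
	fmt.Printf("mount:  %+v\n", podSpec.Containers[0].VolumeMounts[0])
}
```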

pkg/cluster/k8sres_test.go (+54)

@@ -1,8 +1,11 @@
 package cluster
 
 import (
+	"k8s.io/api/core/v1"
+
 	acidv1 "github.com/zalando-incubator/postgres-operator/pkg/apis/acid.zalan.do/v1"
 	"github.com/zalando-incubator/postgres-operator/pkg/util/config"
+	"github.com/zalando-incubator/postgres-operator/pkg/util/constants"
 	"github.com/zalando-incubator/postgres-operator/pkg/util/k8sutil"
 	"testing"
 )

@@ -75,3 +78,54 @@ func TestCreateLoadBalancerLogic(t *testing.T) {
 		}
 	}
 }
+
+func TestShmVolume(t *testing.T) {
+	testName := "TestShmVolume"
+	tests := []struct {
+		subTest string
+		podSpec *v1.PodSpec
+		shmPos  int
+	}{
+		{
+			subTest: "empty PodSpec",
+			podSpec: &v1.PodSpec{
+				Volumes: []v1.Volume{},
+				Containers: []v1.Container{
+					{
+						VolumeMounts: []v1.VolumeMount{},
+					},
+				},
+			},
+			shmPos: 0,
+		},
+		{
+			subTest: "non-empty PodSpec",
+			podSpec: &v1.PodSpec{
+				Volumes: []v1.Volume{{}},
+				Containers: []v1.Container{
+					{
+						VolumeMounts: []v1.VolumeMount{
+							{},
+						},
+					},
+				},
+			},
+			shmPos: 1,
+		},
+	}
+	for _, tt := range tests {
+		addShmVolume(tt.podSpec)
+
+		volumeName := tt.podSpec.Volumes[tt.shmPos].Name
+		volumeMountName := tt.podSpec.Containers[0].VolumeMounts[tt.shmPos].Name
+
+		if volumeName != constants.ShmVolumeName {
+			t.Errorf("%s %s: Expected volume %s was not created, have %s instead",
+				testName, tt.subTest, constants.ShmVolumeName, volumeName)
+		}
+		if volumeMountName != constants.ShmVolumeName {
+			t.Errorf("%s %s: Expected mount %s was not created, have %s instead",
+				testName, tt.subTest, constants.ShmVolumeName, volumeMountName)
+		}
+	}
+}

pkg/util/config/config.go (+1)

@@ -38,6 +38,7 @@ type Resources struct {
 	NodeReadinessLabel map[string]string `name:"node_readiness_label" default:""`
 	MaxInstances       int32             `name:"max_instances" default:"-1"`
 	MinInstances       int32             `name:"min_instances" default:"-1"`
+	ShmVolume          bool              `name:"enable_shm_volume" default:"true"`
 }
 
 // Auth describes authentication specific configuration parameters
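The `default:"true"` tag is what makes the shm volume opt-out rather than opt-in. How the loader in `pkg/util/config` consumes these tags is not shown in this diff; as an illustration only, here is a simplified, hypothetical sketch of filling boolean defaults from struct tags via reflection:

```go
package main

import (
	"fmt"
	"reflect"
	"strconv"
)

// Resources is reduced to the one new field for this sketch.
type Resources struct {
	ShmVolume bool `name:"enable_shm_volume" default:"true"`
}

// applyBoolDefaults walks the struct fields and applies any `default` tag
// to boolean fields. Hypothetical helper, not the operator's actual loader.
func applyBoolDefaults(v interface{}) error {
	rv := reflect.ValueOf(v).Elem()
	rt := rv.Type()
	for i := 0; i < rt.NumField(); i++ {
		def, ok := rt.Field(i).Tag.Lookup("default")
		if !ok || rv.Field(i).Kind() != reflect.Bool {
			continue
		}
		b, err := strconv.ParseBool(def)
		if err != nil {
			return fmt.Errorf("field %s: %v", rt.Field(i).Name, err)
		}
		rv.Field(i).SetBool(b)
	}
	return nil
}

func main() {
	var r Resources
	if err := applyBoolDefaults(&r); err != nil {
		panic(err)
	}
	fmt.Println(r.ShmVolume) // true
}
```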

pkg/util/constants/kubernetes.go (+1)

@@ -5,6 +5,7 @@ import "time"
 // General kubernetes-related constants
 const (
 	PostgresContainerName       = "postgres"
+	PostgresContainerIdx        = 0
 	K8sAPIPath                  = "/apis"
 	StatefulsetDeletionInterval = 1 * time.Second
 	StatefulsetDeletionTimeout  = 30 * time.Second

pkg/util/constants/postgresql.go (+3)

@@ -10,4 +10,7 @@ const (
 
 	PostgresConnectRetryTimeout = 2 * time.Minute
 	PostgresConnectTimeout      = 15 * time.Second
+
+	ShmVolumeName = "dshm"
+	ShmVolumePath = "/dev/shm"
 )