diff --git a/cmd/nerdctl/container/container_create.go b/cmd/nerdctl/container/container_create.go index 1ef0f958f9c..934551aa84a 100644 --- a/cmd/nerdctl/container/container_create.go +++ b/cmd/nerdctl/container/container_create.go @@ -199,19 +199,42 @@ func createOptions(cmd *cobra.Command) (types.ContainerCreateOptions, error) { if err != nil { return opt, err } + opt.Cgroupns, err = cmd.Flags().GetString("cgroupns") + if err != nil { + return opt, err + } + opt.CgroupParent, err = cmd.Flags().GetString("cgroup-parent") + if err != nil { + return opt, err + } + opt.Device, err = cmd.Flags().GetStringSlice("device") + if err != nil { + return opt, err + } + // #endregion + + // #region for blkio flags opt.BlkioWeight, err = cmd.Flags().GetUint16("blkio-weight") if err != nil { return opt, err } - opt.Cgroupns, err = cmd.Flags().GetString("cgroupns") + opt.BlkioWeightDevice, err = cmd.Flags().GetStringArray("blkio-weight-device") if err != nil { return opt, err } - opt.CgroupParent, err = cmd.Flags().GetString("cgroup-parent") + opt.BlkioDeviceReadBps, err = cmd.Flags().GetStringArray("device-read-bps") if err != nil { return opt, err } - opt.Device, err = cmd.Flags().GetStringSlice("device") + opt.BlkioDeviceWriteBps, err = cmd.Flags().GetStringArray("device-write-bps") + if err != nil { + return opt, err + } + opt.BlkioDeviceReadIOps, err = cmd.Flags().GetStringArray("device-read-iops") + if err != nil { + return opt, err + } + opt.BlkioDeviceWriteIOps, err = cmd.Flags().GetStringArray("device-write-iops") if err != nil { return opt, err } diff --git a/cmd/nerdctl/container/container_inspect_linux_test.go b/cmd/nerdctl/container/container_inspect_linux_test.go index 2c7bc7b8197..610c17e45f2 100644 --- a/cmd/nerdctl/container/container_inspect_linux_test.go +++ b/cmd/nerdctl/container/container_inspect_linux_test.go @@ -19,6 +19,7 @@ package container import ( "fmt" "os" + "os/exec" "slices" "strings" "testing" @@ -246,7 +247,6 @@ func 
TestContainerInspectHostConfig(t *testing.T) { base.Cmd("run", "-d", "--name", testContainer, "--cpuset-cpus", "0-1", "--cpuset-mems", "0", - "--blkio-weight", "500", "--cpu-shares", "1024", "--cpu-quota", "100000", "--group-add", "1000", @@ -266,7 +266,6 @@ func TestContainerInspectHostConfig(t *testing.T) { assert.Equal(t, "0-1", inspect.HostConfig.CPUSetCPUs) assert.Equal(t, "0", inspect.HostConfig.CPUSetMems) - assert.Equal(t, uint16(500), inspect.HostConfig.BlkioWeight) assert.Equal(t, uint64(1024), inspect.HostConfig.CPUShares) assert.Equal(t, int64(100000), inspect.HostConfig.CPUQuota) assert.Assert(t, slices.Contains(inspect.HostConfig.GroupAdd, "1000"), "Expected '1000' to be in GroupAdd") @@ -311,6 +310,11 @@ func TestContainerInspectHostConfigDefaults(t *testing.T) { assert.Equal(t, "", inspect.HostConfig.CPUSetCPUs) assert.Equal(t, "", inspect.HostConfig.CPUSetMems) assert.Equal(t, uint16(0), inspect.HostConfig.BlkioWeight) + assert.Equal(t, 0, len(inspect.HostConfig.BlkioWeightDevice)) + assert.Equal(t, 0, len(inspect.HostConfig.BlkioDeviceReadBps)) + assert.Equal(t, 0, len(inspect.HostConfig.BlkioDeviceReadIOps)) + assert.Equal(t, 0, len(inspect.HostConfig.BlkioDeviceWriteBps)) + assert.Equal(t, 0, len(inspect.HostConfig.BlkioDeviceWriteIOps)) assert.Equal(t, uint64(0), inspect.HostConfig.CPUShares) assert.Equal(t, int64(0), inspect.HostConfig.CPUQuota) assert.Equal(t, hc.GroupAddSize, len(inspect.HostConfig.GroupAdd)) @@ -456,6 +460,60 @@ func TestContainerInspectDevices(t *testing.T) { assert.DeepEqual(t, expectedDevices, inspect.HostConfig.Devices) } +func TestContainerInspectBlkioSettings(t *testing.T) { + testutil.DockerIncompatible(t) + testContainer := testutil.Identifier(t) + // Some of the blkio settings are not supported in cgroup v1. 
+ // So skip this test if running on cgroup v1 + if infoutil.CgroupsVersion() == "1" { + t.Skip("test skipped for rootless containers or if running with cgroup v1") + } + + if rootlessutil.IsRootless() { + t.Skip("test requires root privilege to create a dummy device") + } + + devPath := "/dev/dummy-zero" + // a dummy zero device: mknod /dev/dummy-zero c 1 5 + helperCmd := exec.Command("mknod", []string{devPath, "c", "1", "5"}...) + if out, err := helperCmd.CombinedOutput(); err != nil { + err = fmt.Errorf("cannot create %q: %q: %w", devPath, string(out), err) + t.Fatal(err) + } + + // ensure the dummy device file is removed even if the test fails + defer func() { + if err := exec.Command("rm", "-f", devPath).Run(); err != nil { + t.Logf("failed to remove device %s: %v", devPath, err) + } + }() + + base := testutil.NewBase(t) + defer base.Cmd("rm", "-f", testContainer).AssertOK() + + base.Cmd("run", "-d", "--name", testContainer, + "--blkio-weight", "500", + "--blkio-weight-device", "/dev/dummy-zero:500", + "--device-read-bps", "/dev/dummy-zero:1048576", + "--device-read-iops", "/dev/dummy-zero:1000", + "--device-write-bps", "/dev/dummy-zero:2097152", + "--device-write-iops", "/dev/dummy-zero:2000", + testutil.AlpineImage, "sleep", "infinity").AssertOK() + + inspect := base.InspectContainer(testContainer) + assert.Equal(t, uint16(500), inspect.HostConfig.BlkioWeight) + assert.Equal(t, 1, len(inspect.HostConfig.BlkioWeightDevice)) + assert.Equal(t, uint16(500), *inspect.HostConfig.BlkioWeightDevice[0].Weight) + assert.Equal(t, 1, len(inspect.HostConfig.BlkioDeviceReadBps)) + assert.Equal(t, uint64(1048576), inspect.HostConfig.BlkioDeviceReadBps[0].Rate) + assert.Equal(t, 1, len(inspect.HostConfig.BlkioDeviceWriteBps)) + assert.Equal(t, uint64(2097152), inspect.HostConfig.BlkioDeviceWriteBps[0].Rate) + assert.Equal(t, 1, len(inspect.HostConfig.BlkioDeviceReadIOps)) + assert.Equal(t, uint64(1000), inspect.HostConfig.BlkioDeviceReadIOps[0].Rate) + assert.Equal(t, 1, 
len(inspect.HostConfig.BlkioDeviceWriteIOps)) + assert.Equal(t, uint64(2000), inspect.HostConfig.BlkioDeviceWriteIOps[0].Rate) +} + type hostConfigValues struct { Driver string ShmSize int64 diff --git a/cmd/nerdctl/container/container_run.go b/cmd/nerdctl/container/container_run.go index e45a1617c5a..d42d61e1d2f 100644 --- a/cmd/nerdctl/container/container_run.go +++ b/cmd/nerdctl/container/container_run.go @@ -155,7 +155,6 @@ func setCreateFlags(cmd *cobra.Command) { }) cmd.Flags().Int64("pids-limit", -1, "Tune container pids limit (set -1 for unlimited)") cmd.Flags().StringSlice("cgroup-conf", nil, "Configure cgroup v2 (key=value)") - cmd.Flags().Uint16("blkio-weight", 0, "Block IO (relative weight), between 10 and 1000, or 0 to disable (default 0)") cmd.Flags().String("cgroupns", defaults.CgroupnsMode(), `Cgroup namespace to use, the default depends on the cgroup version ("host"|"private")`) cmd.Flags().String("cgroup-parent", "", "Optional parent cgroup for the container") cmd.RegisterFlagCompletionFunc("cgroupns", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { @@ -173,6 +172,15 @@ func setCreateFlags(cmd *cobra.Command) { cmd.Flags().String("rdt-class", "", "Name of the RDT class (or CLOS) to associate the container with") // #endregion + // #region blkio flags + cmd.Flags().Uint16("blkio-weight", 0, "Block IO (relative weight), between 10 and 1000, or 0 to disable (default 0)") + cmd.Flags().StringArray("blkio-weight-device", []string{}, "Block IO weight (relative device weight) (default [])") + cmd.Flags().StringArray("device-read-bps", []string{}, "Limit read rate (bytes per second) from a device (default [])") + cmd.Flags().StringArray("device-read-iops", []string{}, "Limit read rate (IO per second) from a device (default [])") + cmd.Flags().StringArray("device-write-bps", []string{}, "Limit write rate (bytes per second) to a device (default [])") + cmd.Flags().StringArray("device-write-iops", []string{}, 
"Limit write rate (IO per second) to a device (default [])") + // #endregion + // user flags cmd.Flags().StringP("user", "u", "", "Username or UID (format: [:])") cmd.Flags().String("umask", "", "Set the umask inside the container. Defaults to 0022") diff --git a/cmd/nerdctl/container/container_run_cgroup_linux_test.go b/cmd/nerdctl/container/container_run_cgroup_linux_test.go index edf42711730..2e1fce340df 100644 --- a/cmd/nerdctl/container/container_run_cgroup_linux_test.go +++ b/cmd/nerdctl/container/container_run_cgroup_linux_test.go @@ -21,8 +21,10 @@ import ( "context" "fmt" "os" + "os/exec" "path/filepath" "strconv" + "strings" "testing" "gotest.tools/v3/assert" @@ -477,3 +479,192 @@ func TestRunBlkioWeightCgroupV2(t *testing.T) { base.Cmd("update", containerName, "--blkio-weight", "400").AssertOK() base.Cmd("exec", containerName, "cat", "io.bfq.weight").AssertOutExactly("default 400\n") } + +func TestRunBlkioSettingCgroupV2(t *testing.T) { + testCase := nerdtest.Setup() + testCase.Require = nerdtest.Rootful + + // Create dummy device path + dummyDev := "/dev/dummy-zero" + + testCase.Setup = func(data test.Data, helpers test.Helpers) { + // Create dummy device + helperCmd := exec.Command("mknod", dummyDev, "c", "1", "5") + if out, err := helperCmd.CombinedOutput(); err != nil { + t.Fatalf("cannot create %q: %q: %v", dummyDev, string(out), err) + } + } + + testCase.Cleanup = func(data test.Data, helpers test.Helpers) { + // Clean up the dummy device + if err := exec.Command("rm", "-f", dummyDev).Run(); err != nil { + t.Logf("failed to remove device %s: %v", dummyDev, err) + } + } + + testCase.SubTests = []*test.Case{ + { + Description: "blkio-weight", + Require: nerdtest.CGroupV2, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + return helpers.Command("run", "-d", "--name", data.Identifier(), + "--blkio-weight", "150", + testutil.AlpineImage, "sleep", "infinity") + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + 
helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All( + func(stdout string, info string, t *testing.T) { + assert.Assert(t, strings.Contains(helpers.Capture("inspect", "--format", "{{.HostConfig.BlkioWeight}}", data.Identifier()), "150")) + }, + ), + } + }, + }, + { + Description: "blkio-weight-device", + Require: nerdtest.CGroupV2, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + return helpers.Command("run", "-d", "--name", data.Identifier(), + "--blkio-weight-device", dummyDev+":100", + testutil.AlpineImage, "sleep", "infinity") + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All( + func(stdout string, info string, t *testing.T) { + inspectOut := helpers.Capture("inspect", "--format", "{{range .HostConfig.BlkioWeightDevice}}{{.Weight}}{{end}}", data.Identifier()) + assert.Assert(t, strings.Contains(inspectOut, "100")) + }, + ), + } + }, + }, + { + Description: "device-read-bps", + Require: require.All( + nerdtest.CGroupV2, + // Docker cli (v26.1.3) available in github runners has a bug where some of the blkio options + // do not work https://github.com/docker/cli/issues/5321. The fix has been merged to the latest releases + // but not currently available in the v26 release. 
+ require.Not(nerdtest.Docker), + ), + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + return helpers.Command("run", "-d", "--name", data.Identifier(), + "--device-read-bps", dummyDev+":1048576", + testutil.AlpineImage, "sleep", "infinity") + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All( + func(stdout string, info string, t *testing.T) { + inspectOut := helpers.Capture("inspect", "--format", "{{range .HostConfig.BlkioDeviceReadBps}}{{.Rate}}{{end}}", data.Identifier()) + assert.Assert(t, strings.Contains(inspectOut, "1048576")) + }, + ), + } + }, + }, + { + Description: "device-write-bps", + Require: require.All( + nerdtest.CGroupV2, + // Docker cli (v26.1.3) available in github runners has a bug where some of the blkio options + // do not work https://github.com/docker/cli/issues/5321. The fix has been merged to the latest releases + // but not currently available in the v26 release. 
+ require.Not(nerdtest.Docker), + ), + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + return helpers.Command("run", "-d", "--name", data.Identifier(), + "--device-write-bps", dummyDev+":2097152", + testutil.AlpineImage, "sleep", "infinity") + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All( + func(stdout string, info string, t *testing.T) { + inspectOut := helpers.Capture("inspect", "--format", "{{range .HostConfig.BlkioDeviceWriteBps}}{{.Rate}}{{end}}", data.Identifier()) + assert.Assert(t, strings.Contains(inspectOut, "2097152")) + }, + ), + } + }, + }, + { + Description: "device-read-iops", + Require: require.All( + nerdtest.CGroupV2, + // Docker cli (v26.1.3) available in github runners has a bug where some of the blkio options + // do not work https://github.com/docker/cli/issues/5321. The fix has been merged to the latest releases + // but not currently available in the v26 release. 
+ require.Not(nerdtest.Docker), + ), + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + return helpers.Command("run", "-d", "--name", data.Identifier(), + "--device-read-iops", dummyDev+":1000", + testutil.AlpineImage, "sleep", "infinity") + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All( + func(stdout string, info string, t *testing.T) { + inspectOut := helpers.Capture("inspect", "--format", "{{range .HostConfig.BlkioDeviceReadIOps}}{{.Rate}}{{end}}", data.Identifier()) + assert.Assert(t, strings.Contains(inspectOut, "1000")) + }, + ), + } + }, + }, + { + Description: "device-write-iops", + Require: require.All( + nerdtest.CGroupV2, + // Docker cli (v26.1.3) available in github runners has a bug where some of the blkio options + // do not work https://github.com/docker/cli/issues/5321. The fix has been merged to the latest releases + // but not currently available in the v26 release. 
+ require.Not(nerdtest.Docker), + ), + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + return helpers.Command("run", "-d", "--name", data.Identifier(), + "--device-write-iops", dummyDev+":2000", + testutil.AlpineImage, "sleep", "infinity") + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All( + func(stdout string, info string, t *testing.T) { + inspectOut := helpers.Capture("inspect", "--format", "{{range .HostConfig.BlkioDeviceWriteIOps}}{{.Rate}}{{end}}", data.Identifier()) + assert.Assert(t, strings.Contains(inspectOut, "2000")) + }, + ), + } + }, + }, + } + + testCase.Run(t) +} diff --git a/docs/command-reference.md b/docs/command-reference.md index 41fcd969fd6..a4d2f7af7be 100644 --- a/docs/command-reference.md +++ b/docs/command-reference.md @@ -213,6 +213,11 @@ Resource flags: - :whale: `--pids-limit`: Tune container pids limit - :nerd_face: `--cgroup-conf`: Configure cgroup v2 (key=value) - :whale: `--blkio-weight`: Block IO (relative weight), between 10 and 1000, or 0 to disable (default 0) +- :whale: `--blkio-weight-device`: Block IO weight (relative device weight) +- :whale: `--device-read-bps`: Limit read rate (bytes per second) from a device +- :whale: `--device-read-iops`: Limit read rate (IO per second) from a device +- :whale: `--device-write-bps`: Limit write rate (bytes per second) to a device +- :whale: `--device-write-iops`: Limit write rate (IO per second) to a device - :whale: `--cgroupns=(host|private)`: Cgroup namespace to use - Default: "private" on cgroup v2 hosts, "host" on cgroup v1 hosts - :whale: `--cgroup-parent`: Optional parent cgroup for the container @@ -414,7 +419,7 @@ IPFS flags: - :nerd_face: `--ipfs-address`: Multiaddr of IPFS API (default uses `$IPFS_PATH` env variable if defined or local directory 
`~/.ipfs`) Unimplemented `docker run` flags: - `--blkio-weight-device`, `--cpu-rt-*`, `--device-*`, + `--cpu-rt-*`, `--device-cgroup-rule`, `--disable-content-trust`, `--expose`, `--health-*`, `--isolation`, `--no-healthcheck`, `--link*`, `--publish-all`, `--storage-opt`, `--userns`, `--volume-driver` diff --git a/pkg/api/types/container_types.go b/pkg/api/types/container_types.go index 3d0ae773ce1..5325915631d 100644 --- a/pkg/api/types/container_types.go +++ b/pkg/api/types/container_types.go @@ -140,8 +140,6 @@ type ContainerCreateOptions struct { PidsLimit int64 // CgroupConf specifies to configure cgroup v2 (key=value) CgroupConf []string - // BlkioWeight specifies the block IO (relative weight), between 10 and 1000, or 0 to disable (default 0) - BlkioWeight uint16 // Cgroupns specifies the cgroup namespace to use Cgroupns string // CgroupParent specifies the optional parent cgroup for the container @@ -150,6 +148,21 @@ type ContainerCreateOptions struct { Device []string // #endregion + // #region for blkio related flags + // BlkioWeight specifies the block IO (relative weight), between 10 and 1000, or 0 to disable (default 0) + BlkioWeight uint16 + // BlkioWeightDevice specifies the Block IO weight (relative device weight) + BlkioWeightDevice []string + // BlkioDeviceReadBps specifies the Block IO read rate limit (bytes per second) of a device + BlkioDeviceReadBps []string + // BlkioDeviceWriteBps specifies the Block IO write rate limit (bytes per second) of a device + BlkioDeviceWriteBps []string + // BlkioDeviceReadIOps specifies the Block IO read rate limit (IO per second) of a device + BlkioDeviceReadIOps []string + // BlkioDeviceWriteIOps specifies the Block IO write rate limit (IO per second) of a device + BlkioDeviceWriteIOps []string + // #endregion + // #region for intel RDT flags // RDTClass specifies the Intel Resource Director Technology (RDT) class RDTClass string diff --git a/pkg/cmd/container/create.go b/pkg/cmd/container/create.go index 
1dfe123bd0a..7127e364302 100644 --- a/pkg/cmd/container/create.go +++ b/pkg/cmd/container/create.go @@ -331,8 +331,6 @@ func Create(ctx context.Context, client *containerd.Client, args []string, netMa internalLabels.rm = containerutil.EncodeContainerRmOptLabel(options.Rm) - internalLabels.blkioWeight = options.BlkioWeight - // TODO: abolish internal labels and only use annotations ilOpt, err := withInternalLabels(internalLabels) if err != nil { @@ -624,11 +622,10 @@ func withStop(stopSignal string, stopTimeout int, ensuredImage *imgutil.EnsuredI type internalLabels struct { // labels from cmd options - namespace string - platform string - extraHosts []string - pidFile string - blkioWeight uint16 + namespace string + platform string + extraHosts []string + pidFile string // labels from cmd options or automatically set name string hostname string @@ -754,10 +751,6 @@ func withInternalLabels(internalLabels internalLabels) (containerd.NewContainerO m[labels.ContainerAutoRemove] = internalLabels.rm } - if internalLabels.blkioWeight > 0 { - hostConfigLabel.BlkioWeight = internalLabels.blkioWeight - } - if internalLabels.cidFile != "" { hostConfigLabel.CidFile = internalLabels.cidFile } diff --git a/pkg/cmd/container/run_blkio_linux.go b/pkg/cmd/container/run_blkio_linux.go new file mode 100644 index 00000000000..1b3f03929c3 --- /dev/null +++ b/pkg/cmd/container/run_blkio_linux.go @@ -0,0 +1,346 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package container + +import ( + "context" + "errors" + "fmt" + "strconv" + "strings" + + "github.com/docker/go-units" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + + "github.com/containerd/containerd/v2/core/containers" + "github.com/containerd/containerd/v2/pkg/oci" + "github.com/containerd/log" + + "github.com/containerd/nerdctl/v2/pkg/api/types" + "github.com/containerd/nerdctl/v2/pkg/infoutil" +) + +// WeightDevice is a structure that holds device:weight pair +type WeightDevice struct { + Path string + Weight uint16 +} + +func (w *WeightDevice) String() string { + return fmt.Sprintf("%s:%d", w.Path, w.Weight) +} + +// ThrottleDevice is a structure that holds device:rate_per_second pair +type ThrottleDevice struct { + Path string + Rate uint64 +} + +func (t *ThrottleDevice) String() string { + return fmt.Sprintf("%s:%d", t.Path, t.Rate) +} + +func toOCIWeightDevices(weightDevices []*WeightDevice) ([]specs.LinuxWeightDevice, error) { + var stat unix.Stat_t + blkioWeightDevices := make([]specs.LinuxWeightDevice, 0, len(weightDevices)) + + for _, weightDevice := range weightDevices { + if err := unix.Stat(weightDevice.Path, &stat); err != nil { + return nil, fmt.Errorf("failed to stat %s: %w", weightDevice.Path, err) + } + weight := weightDevice.Weight + d := specs.LinuxWeightDevice{Weight: &weight} + // The type is 32bit on mips. 
+ d.Major = int64(unix.Major(uint64(stat.Rdev))) //nolint: unconvert + d.Minor = int64(unix.Minor(uint64(stat.Rdev))) //nolint: unconvert + blkioWeightDevices = append(blkioWeightDevices, d) + } + + return blkioWeightDevices, nil +} + +func toOCIThrottleDevices(devs []*ThrottleDevice) ([]specs.LinuxThrottleDevice, error) { + var stat unix.Stat_t + throttleDevices := make([]specs.LinuxThrottleDevice, 0, len(devs)) + + for _, d := range devs { + if err := unix.Stat(d.Path, &stat); err != nil { + return nil, fmt.Errorf("failed to stat %s: %w", d.Path, err) + } + d := specs.LinuxThrottleDevice{Rate: d.Rate} + // the type is 32bit on mips + d.Major = int64(unix.Major(uint64(stat.Rdev))) //nolint: unconvert + d.Minor = int64(unix.Minor(uint64(stat.Rdev))) //nolint: unconvert + throttleDevices = append(throttleDevices, d) + } + + return throttleDevices, nil +} + +func withBlkioWeight(blkioWeight uint16) oci.SpecOpts { + return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { + if s.Linux.Resources.BlockIO == nil { + s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{} + } + s.Linux.Resources.BlockIO.Weight = &blkioWeight + return nil + } +} + +func withBlkioWeightDevice(weightDevices []specs.LinuxWeightDevice) oci.SpecOpts { + return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { + if s.Linux.Resources.BlockIO == nil { + s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{} + } + s.Linux.Resources.BlockIO.WeightDevice = weightDevices + return nil + } +} + +func withBlkioReadBpsDevice(devices []specs.LinuxThrottleDevice) oci.SpecOpts { + return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { + if s.Linux.Resources.BlockIO == nil { + s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{} + } + s.Linux.Resources.BlockIO.ThrottleReadBpsDevice = devices + return nil + } +} + +func withBlkioWriteBpsDevice(devices []specs.LinuxThrottleDevice) oci.SpecOpts { + return func(_ 
context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { + if s.Linux.Resources.BlockIO == nil { + s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{} + } + s.Linux.Resources.BlockIO.ThrottleWriteBpsDevice = devices + return nil + } +} + +func withBlkioReadIOPSDevice(devices []specs.LinuxThrottleDevice) oci.SpecOpts { + return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { + if s.Linux.Resources.BlockIO == nil { + s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{} + } + s.Linux.Resources.BlockIO.ThrottleReadIOPSDevice = devices + return nil + } +} + +func withBlkioWriteIOPSDevice(devices []specs.LinuxThrottleDevice) oci.SpecOpts { + return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { + if s.Linux.Resources.BlockIO == nil { + s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{} + } + s.Linux.Resources.BlockIO.ThrottleWriteIOPSDevice = devices + return nil + } +} + +func BlkioOCIOpts(options types.ContainerCreateOptions) ([]oci.SpecOpts, error) { + var opts []oci.SpecOpts + + // Handle BlkioWeight + if options.BlkioWeight != 0 { + if !infoutil.BlockIOWeight(options.GOptions.CgroupManager) { + // blkio weight is not available on cgroup v1 since kernel 5.0. + // On cgroup v2, blkio weight is implemented using io.weight + log.L.Warn("kernel support for cgroup blkio weight missing, weight discarded") + } else { + if options.BlkioWeight < 10 || options.BlkioWeight > 1000 { + return nil, errors.New("range of blkio weight is from 10 to 1000") + } + opts = append(opts, withBlkioWeight(options.BlkioWeight)) + } + } + + // Handle BlkioWeightDevice + if len(options.BlkioWeightDevice) > 0 { + if !infoutil.BlockIOWeightDevice(options.GOptions.CgroupManager) { + // blkio weight device is not available on cgroup v1 since kernel 5.0. 
+ // On cgroup v2, blkio weight is implemented using io.weight + log.L.Warn("kernel support for cgroup blkio weight device missing, weight device discarded") + } else { + weightDevices, err := validateWeightDevices(options.BlkioWeightDevice) + if err != nil { + return nil, fmt.Errorf("invalid weight device: %w", err) + } + linuxWeightDevices, err := toOCIWeightDevices(weightDevices) + if err != nil { + return nil, err + } + opts = append(opts, withBlkioWeightDevice(linuxWeightDevices)) + } + } + + // Handle BlockIOReadBpsDevice + if len(options.BlkioDeviceReadBps) > 0 { + if !infoutil.BlockIOReadBpsDevice(options.GOptions.CgroupManager) { + log.L.Warn("kernel support for cgroup blkio read bps device missing, read bps device discarded") + } else { + readBpsDevices, err := validateThrottleBpsDevices(options.BlkioDeviceReadBps) + if err != nil { + return nil, fmt.Errorf("invalid read bps device: %w", err) + } + throttleDevices, err := toOCIThrottleDevices(readBpsDevices) + if err != nil { + return nil, err + } + opts = append(opts, withBlkioReadBpsDevice(throttleDevices)) + } + } + + // Handle BlockIOWriteBpsDevice + if len(options.BlkioDeviceWriteBps) > 0 { + if !infoutil.BlockIOWriteBpsDevice(options.GOptions.CgroupManager) { + log.L.Warn("kernel support for cgroup blkio write bps device missing, write bps device discarded") + } else { + writeBpsDevices, err := validateThrottleBpsDevices(options.BlkioDeviceWriteBps) + if err != nil { + return nil, fmt.Errorf("invalid write bps device: %w", err) + } + throttleDevices, err := toOCIThrottleDevices(writeBpsDevices) + if err != nil { + return nil, err + } + opts = append(opts, withBlkioWriteBpsDevice(throttleDevices)) + } + } + + // Handle BlockIOReadIopsDevice + if len(options.BlkioDeviceReadIOps) > 0 { + if !infoutil.BlockIOReadIOpsDevice(options.GOptions.CgroupManager) { + log.L.Warn("kernel support for cgroup blkio read iops device missing, read iops device discarded") + } else { + readIopsDevices, err := 
validateThrottleIOpsDevices(options.BlkioDeviceReadIOps) + if err != nil { + return nil, fmt.Errorf("invalid read iops device: %w", err) + } + throttleDevices, err := toOCIThrottleDevices(readIopsDevices) + if err != nil { + return nil, err + } + opts = append(opts, withBlkioReadIOPSDevice(throttleDevices)) + } + } + + // Handle BlockIOWriteIopsDevice + if len(options.BlkioDeviceWriteIOps) > 0 { + if !infoutil.BlockIOWriteIOpsDevice(options.GOptions.CgroupManager) { + log.L.Warn("kernel support for cgroup blkio write iops device missing, write iops device discarded") + } else { + writeIopsDevices, err := validateThrottleIOpsDevices(options.BlkioDeviceWriteIOps) + if err != nil { + return nil, fmt.Errorf("invalid write iops device: %w", err) + } + throttleDevices, err := toOCIThrottleDevices(writeIopsDevices) + if err != nil { + return nil, err + } + opts = append(opts, withBlkioWriteIOPSDevice(throttleDevices)) + } + } + + return opts, nil +} + +// validateWeightDevices validates an array of device-weight strings +// +// from https://github.com/docker/cli/blob/master/opts/weightdevice.go#L15 +func validateWeightDevices(vals []string) ([]*WeightDevice, error) { + weightDevices := make([]*WeightDevice, 0, len(vals)) + for _, val := range vals { + k, v, ok := strings.Cut(val, ":") + if !ok || k == "" { + return nil, fmt.Errorf("bad format: %s", val) + } + if !strings.HasPrefix(k, "/dev/") { + return nil, fmt.Errorf("bad format for device path: %s", val) + } + weight, err := strconv.ParseUint(v, 10, 16) + if err != nil { + return nil, fmt.Errorf("invalid weight for device: %s", val) + } + if weight > 0 && (weight < 10 || weight > 1000) { + return nil, fmt.Errorf("invalid weight for device: %s", val) + } + + weightDevices = append(weightDevices, &WeightDevice{ + Path: k, + Weight: uint16(weight), + }) + } + return weightDevices, nil +} + +// validateThrottleBpsDevices validates an array of device-rate strings for bytes per second +// +// from 
https://github.com/docker/cli/blob/master/opts/throttledevice.go#L16 +func validateThrottleBpsDevices(vals []string) ([]*ThrottleDevice, error) { + throttleDevices := make([]*ThrottleDevice, 0, len(vals)) + for _, val := range vals { + k, v, ok := strings.Cut(val, ":") + if !ok || k == "" { + return nil, fmt.Errorf("bad format: %s", val) + } + + if !strings.HasPrefix(k, "/dev/") { + return nil, fmt.Errorf("bad format for device path: %s", val) + } + rate, err := units.RAMInBytes(v) + if err != nil { + return nil, fmt.Errorf("invalid rate for device: %s. The correct format is :[]. Number must be a positive integer. Unit is optional and can be kb, mb, or gb", val) + } + if rate < 0 { + return nil, fmt.Errorf("invalid rate for device: %s. The correct format is :[]. Number must be a positive integer. Unit is optional and can be kb, mb, or gb", val) + } + + throttleDevices = append(throttleDevices, &ThrottleDevice{ + Path: k, + Rate: uint64(rate), + }) + } + return throttleDevices, nil +} + +// validateThrottleIOpsDevices validates an array of device-rate strings for IO operations per second +// +// from https://github.com/docker/cli/blob/master/opts/throttledevice.go#L40 +func validateThrottleIOpsDevices(vals []string) ([]*ThrottleDevice, error) { + throttleDevices := make([]*ThrottleDevice, 0, len(vals)) + for _, val := range vals { + k, v, ok := strings.Cut(val, ":") + if !ok || k == "" { + return nil, fmt.Errorf("bad format: %s", val) + } + + if !strings.HasPrefix(k, "/dev/") { + return nil, fmt.Errorf("bad format for device path: %s", val) + } + rate, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("invalid rate for device: %s. The correct format is :. 
Number must be a positive integer", val) + } + + throttleDevices = append(throttleDevices, &ThrottleDevice{ + Path: k, + Rate: rate, + }) + } + return throttleDevices, nil +} diff --git a/pkg/cmd/container/run_cgroup_linux.go b/pkg/cmd/container/run_cgroup_linux.go index a4d6fb7a266..5216ecc9c57 100644 --- a/pkg/cmd/container/run_cgroup_linux.go +++ b/pkg/cmd/container/run_cgroup_linux.go @@ -180,14 +180,11 @@ func generateCgroupOpts(id string, options types.ContainerCreateOptions, interna } opts = append(opts, withUnified(unifieds)) - if options.BlkioWeight != 0 && !infoutil.BlockIOWeight(options.GOptions.CgroupManager) { - log.L.Warn("kernel support for cgroup blkio weight missing, weight discarded") - options.BlkioWeight = 0 - } - if options.BlkioWeight > 0 && options.BlkioWeight < 10 || options.BlkioWeight > 1000 { - return nil, errors.New("range of blkio weight is from 10 to 1000") + blkioOpts, err := BlkioOCIOpts(options) + if err != nil { + return nil, err } - opts = append(opts, withBlkioWeight(options.BlkioWeight)) + opts = append(opts, blkioOpts...) 
switch options.Cgroupns { case "private": @@ -314,16 +311,6 @@ func withUnified(unified map[string]string) oci.SpecOpts { } } -func withBlkioWeight(blkioWeight uint16) oci.SpecOpts { - return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error { - if blkioWeight == 0 { - return nil - } - s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{Weight: &blkioWeight} - return nil - } -} - func withCustomMemoryResources(memoryOptions customMemoryOptions) oci.SpecOpts { return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error { if s.Linux != nil { diff --git a/pkg/infoutil/infoutil.go b/pkg/infoutil/infoutil.go index 2886c9fe049..965ae4a0c90 100644 --- a/pkg/infoutil/infoutil.go +++ b/pkg/infoutil/infoutil.go @@ -26,6 +26,7 @@ import ( "time" "github.com/Masterminds/semver/v3" + "github.com/docker/docker/pkg/sysinfo" containerd "github.com/containerd/containerd/v2/client" "github.com/containerd/containerd/v2/core/introspection" @@ -244,13 +245,42 @@ func parseRuncVersion(runcVersionStdout []byte) (*dockercompat.ComponentVersion, }, nil } -// BlockIOWeight return whether Block IO weight is supported or not -func BlockIOWeight(cgroupManager string) bool { +// getMobySysInfo returns the moby system info for the given cgroup manager +func getMobySysInfo(cgroupManager string) *sysinfo.SysInfo { var info dockercompat.Info info.CgroupVersion = CgroupsVersion() info.CgroupDriver = cgroupManager - mobySysInfo := mobySysInfo(&info) + return mobySysInfo(&info) +} + +// BlockIOWeight returns whether Block IO weight is supported or not +func BlockIOWeight(cgroupManager string) bool { // blkio weight is not available on cgroup v1 since kernel 5.0. 
// On cgroup v2, blkio weight is implemented using io.weight - return mobySysInfo.BlkioWeight + return getMobySysInfo(cgroupManager).BlkioWeight +} + +// BlockIOWeightDevice returns whether Block IO weight device is supported or not +func BlockIOWeightDevice(cgroupManager string) bool { + return getMobySysInfo(cgroupManager).BlkioWeightDevice +} + +// BlockIOReadBpsDevice returns whether Block IO read limit in bytes per second is supported or not +func BlockIOReadBpsDevice(cgroupManager string) bool { + return getMobySysInfo(cgroupManager).BlkioReadBpsDevice +} + +// BlockIOWriteBpsDevice returns whether Block IO write limit in bytes per second is supported or not +func BlockIOWriteBpsDevice(cgroupManager string) bool { + return getMobySysInfo(cgroupManager).BlkioWriteBpsDevice +} + +// BlockIOReadIOpsDevice returns whether Block IO read limit in IO per second is supported or not +func BlockIOReadIOpsDevice(cgroupManager string) bool { + return getMobySysInfo(cgroupManager).BlkioReadIOpsDevice +} + +// BlockIOWriteIOpsDevice returns whether Block IO write limit in IO per second is supported or not +func BlockIOWriteIOpsDevice(cgroupManager string) bool { + return getMobySysInfo(cgroupManager).BlkioWriteIOpsDevice } diff --git a/pkg/inspecttypes/dockercompat/dockercompat.go b/pkg/inspecttypes/dockercompat/dockercompat.go index 027726f14d8..b0831352d54 100644 --- a/pkg/inspecttypes/dockercompat/dockercompat.go +++ b/pkg/inspecttypes/dockercompat/dockercompat.go @@ -167,19 +167,18 @@ type HostConfig struct { Tmpfs map[string]string `json:"Tmpfs,omitempty"` // List of tmpfs (mounts) used for the container UTSMode string // UTS namespace to use for the container // UsernsMode UsernsMode // The user namespace to use for the container - ShmSize int64 // Size of /dev/shm in bytes. The size must be greater than 0. 
- Sysctls map[string]string // List of Namespaced sysctls used for the container - Runtime string // Runtime to use with this container - - BlkioWeight uint16 // Block IO weight (relative weight vs. other containers) - CPUSetMems string `json:"CpusetMems"` // CpusetMems 0-2, 0,1 - CPUSetCPUs string `json:"CpusetCpus"` // CpusetCpus 0-2, 0,1 - CPUQuota int64 `json:"CpuQuota"` // CPU CFS (Completely Fair Scheduler) quota - CPUShares uint64 `json:"CpuShares"` // CPU shares (relative weight vs. other containers) - Memory int64 // Memory limit (in bytes) - MemorySwap int64 // Total memory usage (memory + swap); set `-1` to enable unlimited swap - OomKillDisable bool // specifies whether to disable OOM Killer - Devices []DeviceMapping // List of devices to map inside the container + ShmSize int64 // Size of /dev/shm in bytes. The size must be greater than 0. + Sysctls map[string]string // List of Namespaced sysctls used for the container + Runtime string // Runtime to use with this container + CPUSetMems string `json:"CpusetMems"` // CpusetMems 0-2, 0,1 + CPUSetCPUs string `json:"CpusetCpus"` // CpusetCpus 0-2, 0,1 + CPUQuota int64 `json:"CpuQuota"` // CPU CFS (Completely Fair Scheduler) quota + CPUShares uint64 `json:"CpuShares"` // CPU shares (relative weight vs. other containers) + Memory int64 // Memory limit (in bytes) + MemorySwap int64 // Total memory usage (memory + swap); set `-1` to enable unlimited swap + OomKillDisable bool // specifies whether to disable OOM Killer + Devices []DeviceMapping // List of devices to map inside the container + LinuxBlkioSettings } // From https://github.com/moby/moby/blob/v20.10.1/api/types/types.go#L416-L427 @@ -303,6 +302,15 @@ type NetworkEndpointSettings struct { // TODO DriverOpts map[string]string } +type LinuxBlkioSettings struct { + BlkioWeight uint16 // Block IO weight (relative weight vs. 
other containers) + BlkioWeightDevice []*specs.LinuxWeightDevice + BlkioDeviceReadBps []*specs.LinuxThrottleDevice + BlkioDeviceWriteBps []*specs.LinuxThrottleDevice + BlkioDeviceReadIOps []*specs.LinuxThrottleDevice + BlkioDeviceWriteIOps []*specs.LinuxThrottleDevice +} + // ContainerFromNative instantiates a Docker-compatible Container from containerd-native Container. func ContainerFromNative(n *native.Container) (*Container, error) { var hostname string @@ -540,6 +548,11 @@ func ContainerFromNative(n *native.Container) (*Container, error) { pidMode = n.Labels[labels.PIDContainer] } c.HostConfig.PidMode = pidMode + + if err := getBlkioSettingsFromSpec(n.Spec.(*specs.Spec), c.HostConfig); err != nil { + return nil, fmt.Errorf("failed to get blkio settings: %w", err) + } + return c, nil } @@ -925,3 +938,78 @@ func ParseMountProperties(option []string) (rw bool, propagation string) { } return } + +func getDefaultLinuxBlkioSettings() LinuxBlkioSettings { + return LinuxBlkioSettings{ + BlkioWeight: 0, + BlkioWeightDevice: make([]*specs.LinuxWeightDevice, 0), + BlkioDeviceReadBps: make([]*specs.LinuxThrottleDevice, 0), + BlkioDeviceWriteBps: make([]*specs.LinuxThrottleDevice, 0), + BlkioDeviceReadIOps: make([]*specs.LinuxThrottleDevice, 0), + BlkioDeviceWriteIOps: make([]*specs.LinuxThrottleDevice, 0), + } +} + +func getBlkioSettingsFromSpec(spec *specs.Spec, hostConfig *HostConfig) error { + if spec == nil { + return fmt.Errorf("spec cannot be nil") + } + if hostConfig == nil { + return fmt.Errorf("hostConfig cannot be nil") + } + + // Initialize empty arrays by default + hostConfig.LinuxBlkioSettings = getDefaultLinuxBlkioSettings() + + if spec.Linux == nil || spec.Linux.Resources == nil || spec.Linux.Resources.BlockIO == nil { + return nil + } + + blockIO := spec.Linux.Resources.BlockIO + + // Set block IO weight + if blockIO.Weight != nil { + hostConfig.BlkioWeight = *blockIO.Weight + } + + // Set weight devices + if len(blockIO.WeightDevice) > 0 { + 
hostConfig.BlkioWeightDevice = make([]*specs.LinuxWeightDevice, len(blockIO.WeightDevice)) + for i, dev := range blockIO.WeightDevice { + hostConfig.BlkioWeightDevice[i] = &dev + } + } + + // Set throttle devices for read BPS + if len(blockIO.ThrottleReadBpsDevice) > 0 { + hostConfig.BlkioDeviceReadBps = make([]*specs.LinuxThrottleDevice, len(blockIO.ThrottleReadBpsDevice)) + for i, dev := range blockIO.ThrottleReadBpsDevice { + hostConfig.BlkioDeviceReadBps[i] = &dev + } + } + + // Set throttle devices for write BPS + if len(blockIO.ThrottleWriteBpsDevice) > 0 { + hostConfig.BlkioDeviceWriteBps = make([]*specs.LinuxThrottleDevice, len(blockIO.ThrottleWriteBpsDevice)) + for i, dev := range blockIO.ThrottleWriteBpsDevice { + hostConfig.BlkioDeviceWriteBps[i] = &dev + } + } + + // Set throttle devices for read IOPs + if len(blockIO.ThrottleReadIOPSDevice) > 0 { + hostConfig.BlkioDeviceReadIOps = make([]*specs.LinuxThrottleDevice, len(blockIO.ThrottleReadIOPSDevice)) + for i, dev := range blockIO.ThrottleReadIOPSDevice { + hostConfig.BlkioDeviceReadIOps[i] = &dev + } + } + + // Set throttle devices for write IOPs + if len(blockIO.ThrottleWriteIOPSDevice) > 0 { + hostConfig.BlkioDeviceWriteIOps = make([]*specs.LinuxThrottleDevice, len(blockIO.ThrottleWriteIOPSDevice)) + for i, dev := range blockIO.ThrottleWriteIOPSDevice { + hostConfig.BlkioDeviceWriteIOps[i] = &dev + } + } + return nil +} diff --git a/pkg/inspecttypes/dockercompat/dockercompat_test.go b/pkg/inspecttypes/dockercompat/dockercompat_test.go index 27beead6c59..ddb621b9f39 100644 --- a/pkg/inspecttypes/dockercompat/dockercompat_test.go +++ b/pkg/inspecttypes/dockercompat/dockercompat_test.go @@ -82,8 +82,9 @@ func TestContainerFromNative(t *testing.T) { Driver: "json-file", Opts: map[string]string{}, }, - UTSMode: "host", - Tmpfs: map[string]string{}, + UTSMode: "host", + Tmpfs: map[string]string{}, + LinuxBlkioSettings: getDefaultLinuxBlkioSettings(), }, Mounts: []MountPoint{ { @@ -174,8 +175,9 @@ func 
TestContainerFromNative(t *testing.T) { Driver: "json-file", Opts: map[string]string{}, }, - UTSMode: "host", - Tmpfs: map[string]string{}, + UTSMode: "host", + Tmpfs: map[string]string{}, + LinuxBlkioSettings: getDefaultLinuxBlkioSettings(), }, Mounts: []MountPoint{ { @@ -264,8 +266,9 @@ func TestContainerFromNative(t *testing.T) { Driver: "json-file", Opts: map[string]string{}, }, - UTSMode: "host", - Tmpfs: map[string]string{}, + UTSMode: "host", + Tmpfs: map[string]string{}, + LinuxBlkioSettings: getDefaultLinuxBlkioSettings(), }, Mounts: []MountPoint{ { diff --git a/pkg/testutil/nerdtest/requirements.go b/pkg/testutil/nerdtest/requirements.go index 88acba1802d..3566b273f0d 100644 --- a/pkg/testutil/nerdtest/requirements.go +++ b/pkg/testutil/nerdtest/requirements.go @@ -190,6 +190,28 @@ var CgroupsAccessible = require.All( }, ) +// CGroupV2 requires that cgroup is enabled and cgroup version is 2 +var CGroupV2 = &test.Requirement{ + Check: func(data test.Data, helpers test.Helpers) (ret bool, mess string) { + ret = true + mess = "cgroup is enabled" + stdout := helpers.Capture("info", "--format", "{{ json . }}") + var dinf dockercompat.Info + err := json.Unmarshal([]byte(stdout), &dinf) + assert.NilError(helpers.T(), err, "failed to parse docker info") + switch dinf.CgroupDriver { + case "none", "": + ret = false + mess = "cgroup is none" + } + if dinf.CgroupVersion != "2" { + ret = false + mess = "cgroup version is not 2" + } + return ret, mess + }, +} + // Soci requires that the soci snapshotter is enabled var Soci = &test.Requirement{ Check: func(data test.Data, helpers test.Helpers) (ret bool, mess string) {