From ce352accdfb07a91b5527e70ec8bce658a8b68de Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 23 Feb 2021 18:27:42 -0800 Subject: [PATCH 4/5] Fix cgroup2 mount for rootless case In case of rootless, cgroup2 mount is not possible (see [1] for more details), so since commit 9c81440fb5a7 runc bind-mounts the whole /sys/fs/cgroup into container. Problem is, if cgroupns is enabled, /sys/fs/cgroup inside the container is supposed to show the cgroup files for this cgroup, not the root one. The fix is to pass through and use the cgroup path in case cgroup2 mount failed, cgroupns is enabled, and the path is non-empty. Surely this requires the /sys/fs/cgroup mount in the spec, so modify runc spec --rootless to keep it. Before: $ ./runc run aaa # find /sys/fs/cgroup/ -type d /sys/fs/cgroup /sys/fs/cgroup/user.slice /sys/fs/cgroup/user.slice/user-1000.slice /sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service ... # ls -l /sys/fs/cgroup/cgroup.controllers -r--r--r-- 1 nobody nogroup 0 Feb 24 02:22 /sys/fs/cgroup/cgroup.controllers # wc -w /sys/fs/cgroup/cgroup.procs 142 /sys/fs/cgroup/cgroup.procs # cat /sys/fs/cgroup/memory.current cat: can't open '/sys/fs/cgroup/memory.current': No such file or directory After: # find /sys/fs/cgroup/ -type d /sys/fs/cgroup/ # ls -l /sys/fs/cgroup/cgroup.controllers -r--r--r-- 1 root root 0 Feb 24 02:43 /sys/fs/cgroup/cgroup.controllers # wc -w /sys/fs/cgroup/cgroup.procs 2 /sys/fs/cgroup/cgroup.procs # cat /sys/fs/cgroup/memory.current 577536 [1] https://github.com/opencontainers/runc/issues/2158 Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 3 +++ libcontainer/init_linux.go | 1 + libcontainer/rootfs_linux.go | 28 +++++++++++++++++++++------- libcontainer/specconv/example.go | 18 +++++++++--------- 4 files changed, 34 insertions(+), 16 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index b6100aae9d5a..1cbc734172d0 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -610,6 +610,9 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { if len(process.Rlimits) > 0 { cfg.Rlimits = process.Rlimits } + if cgroups.IsCgroup2UnifiedMode() { + cfg.Cgroup2Path = c.cgroupManager.Path("") + } return cfg } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index c57af0eebb8b..681797099f46 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -70,6 +70,7 @@ type initConfig struct { RootlessEUID bool `json:"rootless_euid,omitempty"` RootlessCgroups bool `json:"rootless_cgroups,omitempty"` SpecState *specs.State `json:"spec_state,omitempty"` + Cgroup2Path string `json:"cgroup2_path,omitempty"` } type initer interface { diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 0f0495b93b3e..5d2d74cf924b 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -31,9 +31,11 @@ import ( const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV type mountConfig struct { - root string - label string - cgroupns bool + root string + label string + cgroup2Path string + rootlessCgroups bool + cgroupns bool } // needsSetupDev returns true if /dev needs to be set up. @@ -56,9 +58,11 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { } mountConfig := &mountConfig{ - root: config.Rootfs, - label: config.MountLabel, - cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), + root: config.Rootfs, + label: config.MountLabel, + cgroup2Path: iConfig.Cgroup2Path, + rootlessCgroups: iConfig.RootlessCgroups, + cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), } setupDev := needsSetupDev(config) for _, m := range config.Mounts { @@ -307,7 +311,17 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error { // when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158) if err == unix.EPERM || err == unix.EBUSY { src := fs2.UnifiedMountpoint - return unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "") + if c.cgroupns && c.cgroup2Path != "" { + // Emulate cgroupns by bind-mounting + // the container cgroup path rather than + // the whole /sys/fs/cgroup. + src = c.cgroup2Path + } + err = unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "") + if err == unix.ENOENT && c.rootlessCgroups { + err = nil + } + return err } return err } diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go index 8a201bc78dd9..56bab3bfbfa5 100644 --- a/libcontainer/specconv/example.go +++ b/libcontainer/specconv/example.go @@ -2,6 +2,7 @@ package specconv import ( "os" + "path/filepath" "strings" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -200,8 +201,14 @@ func ToRootless(spec *specs.Spec) { // Fix up mounts. var mounts []specs.Mount for _, mount := range spec.Mounts { - // Ignore all mounts that are under /sys. - if strings.HasPrefix(mount.Destination, "/sys") { + // Replace the /sys mount with an rbind. + if filepath.Clean(mount.Destination) == "/sys" { + mounts = append(mounts, specs.Mount{ + Source: "/sys", + Destination: "/sys", + Type: "none", + Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, + }) continue } @@ -216,13 +223,6 @@ func ToRootless(spec *specs.Spec) { mount.Options = options mounts = append(mounts, mount) } - // Add the sysfs mount as an rbind. - mounts = append(mounts, specs.Mount{ - Source: "/sys", - Destination: "/sys", - Type: "none", - Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, - }) spec.Mounts = mounts // Remove cgroup settings. -- 2.31.1