Blob Blame History Raw
From ce352accdfb07a91b5527e70ec8bce658a8b68de Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Tue, 23 Feb 2021 18:27:42 -0800
Subject: [PATCH 4/5] Fix cgroup2 mount for rootless case

In case of rootless, cgroup2 mount is not possible (see [1] for more
details), so since commit 9c81440fb5a7 runc bind-mounts the whole
/sys/fs/cgroup into container.

Problem is, if cgroupns is enabled, /sys/fs/cgroup inside the container
is supposed to show the cgroup files for this cgroup, not the root one.

The fix is to pass through and use the cgroup path in case cgroup2
mount failed, cgroupns is enabled, and the path is non-empty.

Surely this requires the /sys/fs/cgroup mount in the spec, so modify
runc spec --rootless to keep it.

Before:

	$ ./runc run aaa
	# find /sys/fs/cgroup/ -type d
	/sys/fs/cgroup
	/sys/fs/cgroup/user.slice
	/sys/fs/cgroup/user.slice/user-1000.slice
	/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service
	...
	# ls -l /sys/fs/cgroup/cgroup.controllers
	-r--r--r--    1 nobody   nogroup          0 Feb 24 02:22 /sys/fs/cgroup/cgroup.controllers
	# wc -w /sys/fs/cgroup/cgroup.procs
	142 /sys/fs/cgroup/cgroup.procs
	# cat /sys/fs/cgroup/memory.current
	cat: can't open '/sys/fs/cgroup/memory.current': No such file or directory

After:

	# find /sys/fs/cgroup/ -type d
	/sys/fs/cgroup/
	# ls -l /sys/fs/cgroup/cgroup.controllers
	-r--r--r--    1 root     root             0 Feb 24 02:43 /sys/fs/cgroup/cgroup.controllers
	# wc -w /sys/fs/cgroup/cgroup.procs
	2 /sys/fs/cgroup/cgroup.procs
	# cat /sys/fs/cgroup/memory.current
	577536

[1] https://github.com/opencontainers/runc/issues/2158

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/container_linux.go  |  3 +++
 libcontainer/init_linux.go       |  1 +
 libcontainer/rootfs_linux.go     | 28 +++++++++++++++++++++-------
 libcontainer/specconv/example.go | 18 +++++++++---------
 4 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index b6100aae9d5a..1cbc734172d0 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -610,6 +610,9 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
 	if len(process.Rlimits) > 0 {
 		cfg.Rlimits = process.Rlimits
 	}
+	if cgroups.IsCgroup2UnifiedMode() {
+		cfg.Cgroup2Path = c.cgroupManager.Path("")
+	}
 
 	return cfg
 }
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index c57af0eebb8b..681797099f46 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -70,6 +70,7 @@ type initConfig struct {
 	RootlessEUID     bool                  `json:"rootless_euid,omitempty"`
 	RootlessCgroups  bool                  `json:"rootless_cgroups,omitempty"`
 	SpecState        *specs.State          `json:"spec_state,omitempty"`
+	Cgroup2Path      string                `json:"cgroup2_path,omitempty"`
 }
 
 type initer interface {
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 0f0495b93b3e..5d2d74cf924b 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -31,9 +31,11 @@ import (
 const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
 
 type mountConfig struct {
-	root     string
-	label    string
-	cgroupns bool
+	root            string
+	label           string
+	cgroup2Path     string
+	rootlessCgroups bool
+	cgroupns        bool
 }
 
 // needsSetupDev returns true if /dev needs to be set up.
@@ -56,9 +58,11 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
 	}
 
 	mountConfig := &mountConfig{
-		root:     config.Rootfs,
-		label:    config.MountLabel,
-		cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
+		root:            config.Rootfs,
+		label:           config.MountLabel,
+		cgroup2Path:     iConfig.Cgroup2Path,
+		rootlessCgroups: iConfig.RootlessCgroups,
+		cgroupns:        config.Namespaces.Contains(configs.NEWCGROUP),
 	}
 	setupDev := needsSetupDev(config)
 	for _, m := range config.Mounts {
@@ -307,7 +311,17 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 		// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
 		if err == unix.EPERM || err == unix.EBUSY {
 			src := fs2.UnifiedMountpoint
-			return unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "")
+			if c.cgroupns && c.cgroup2Path != "" {
+				// Emulate cgroupns by bind-mounting
+				// the container cgroup path rather than
+				// the whole /sys/fs/cgroup.
+				src = c.cgroup2Path
+			}
+			err = unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "")
+			if err == unix.ENOENT && c.rootlessCgroups {
+				err = nil
+			}
+			return err
 		}
 		return err
 	}
diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go
index 8a201bc78dd9..56bab3bfbfa5 100644
--- a/libcontainer/specconv/example.go
+++ b/libcontainer/specconv/example.go
@@ -2,6 +2,7 @@ package specconv
 
 import (
 	"os"
+	"path/filepath"
 	"strings"
 
 	"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -200,8 +201,14 @@ func ToRootless(spec *specs.Spec) {
 	// Fix up mounts.
 	var mounts []specs.Mount
 	for _, mount := range spec.Mounts {
-		// Ignore all mounts that are under /sys.
-		if strings.HasPrefix(mount.Destination, "/sys") {
+		// Replace the /sys mount with an rbind.
+		if filepath.Clean(mount.Destination) == "/sys" {
+			mounts = append(mounts, specs.Mount{
+				Source:      "/sys",
+				Destination: "/sys",
+				Type:        "none",
+				Options:     []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
+			})
 			continue
 		}
 
@@ -216,13 +223,6 @@ func ToRootless(spec *specs.Spec) {
 		mount.Options = options
 		mounts = append(mounts, mount)
 	}
-	// Add the sysfs mount as an rbind.
-	mounts = append(mounts, specs.Mount{
-		Source:      "/sys",
-		Destination: "/sys",
-		Type:        "none",
-		Options:     []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
-	})
 	spec.Mounts = mounts
 
 	// Remove cgroup settings.
-- 
2.31.1