16d03e
From ce352accdfb07a91b5527e70ec8bce658a8b68de Mon Sep 17 00:00:00 2001
16d03e
From: Kir Kolyshkin <kolyshkin@gmail.com>
16d03e
Date: Tue, 23 Feb 2021 18:27:42 -0800
16d03e
Subject: [PATCH 4/5] Fix cgroup2 mount for rootless case
16d03e
16d03e
In the rootless case, a cgroup2 mount is not possible (see [1] for more
16d03e
details), so since commit 9c81440fb5a7 runc bind-mounts the whole
16d03e
/sys/fs/cgroup into the container.
16d03e
16d03e
Problem is, if cgroupns is enabled, /sys/fs/cgroup inside the container
16d03e
is supposed to show the cgroup files for this cgroup, not the root one.
16d03e
16d03e
The fix is to pass through and use the cgroup path in case cgroup2
16d03e
mount fails, cgroupns is enabled, and the path is non-empty.
16d03e
16d03e
Surely this requires the /sys/fs/cgroup mount in the spec, so modify
16d03e
runc spec --rootless to keep it.
16d03e
16d03e
Before:
16d03e
16d03e
	$ ./runc run aaa
16d03e
	# find /sys/fs/cgroup/ -type d
16d03e
	/sys/fs/cgroup
16d03e
	/sys/fs/cgroup/user.slice
16d03e
	/sys/fs/cgroup/user.slice/user-1000.slice
16d03e
	/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service
16d03e
	...
16d03e
	# ls -l /sys/fs/cgroup/cgroup.controllers
16d03e
	-r--r--r--    1 nobody   nogroup          0 Feb 24 02:22 /sys/fs/cgroup/cgroup.controllers
16d03e
	# wc -w /sys/fs/cgroup/cgroup.procs
16d03e
	142 /sys/fs/cgroup/cgroup.procs
16d03e
	# cat /sys/fs/cgroup/memory.current
16d03e
	cat: can't open '/sys/fs/cgroup/memory.current': No such file or directory
16d03e
16d03e
After:
16d03e
16d03e
	# find /sys/fs/cgroup/ -type d
16d03e
	/sys/fs/cgroup/
16d03e
	# ls -l /sys/fs/cgroup/cgroup.controllers
16d03e
	-r--r--r--    1 root     root             0 Feb 24 02:43 /sys/fs/cgroup/cgroup.controllers
16d03e
	# wc -w /sys/fs/cgroup/cgroup.procs
16d03e
	2 /sys/fs/cgroup/cgroup.procs
16d03e
	# cat /sys/fs/cgroup/memory.current
16d03e
	577536
16d03e
16d03e
[1] https://github.com/opencontainers/runc/issues/2158
16d03e
16d03e
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
16d03e
---
16d03e
 libcontainer/container_linux.go  |  3 +++
16d03e
 libcontainer/init_linux.go       |  1 +
16d03e
 libcontainer/rootfs_linux.go     | 28 +++++++++++++++++++++-------
16d03e
 libcontainer/specconv/example.go | 18 +++++++++---------
16d03e
 4 files changed, 34 insertions(+), 16 deletions(-)
16d03e
16d03e
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
16d03e
index b6100aae9d5a..1cbc734172d0 100644
16d03e
--- a/libcontainer/container_linux.go
16d03e
+++ b/libcontainer/container_linux.go
16d03e
@@ -610,6 +610,9 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
16d03e
 	if len(process.Rlimits) > 0 {
16d03e
 		cfg.Rlimits = process.Rlimits
16d03e
 	}
16d03e
+	if cgroups.IsCgroup2UnifiedMode() {
16d03e
+		cfg.Cgroup2Path = c.cgroupManager.Path("")
16d03e
+	}
16d03e
 
16d03e
 	return cfg
16d03e
 }
16d03e
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
16d03e
index c57af0eebb8b..681797099f46 100644
16d03e
--- a/libcontainer/init_linux.go
16d03e
+++ b/libcontainer/init_linux.go
16d03e
@@ -70,6 +70,7 @@ type initConfig struct {
16d03e
 	RootlessEUID     bool                  `json:"rootless_euid,omitempty"`
16d03e
 	RootlessCgroups  bool                  `json:"rootless_cgroups,omitempty"`
16d03e
 	SpecState        *specs.State          `json:"spec_state,omitempty"`
16d03e
+	Cgroup2Path      string                `json:"cgroup2_path,omitempty"`
16d03e
 }
16d03e
 
16d03e
 type initer interface {
16d03e
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
16d03e
index 0f0495b93b3e..5d2d74cf924b 100644
16d03e
--- a/libcontainer/rootfs_linux.go
16d03e
+++ b/libcontainer/rootfs_linux.go
16d03e
@@ -31,9 +31,11 @@ import (
16d03e
 const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
16d03e
 
16d03e
 type mountConfig struct {
16d03e
-	root     string
16d03e
-	label    string
16d03e
-	cgroupns bool
16d03e
+	root            string
16d03e
+	label           string
16d03e
+	cgroup2Path     string
16d03e
+	rootlessCgroups bool
16d03e
+	cgroupns        bool
16d03e
 }
16d03e
 
16d03e
 // needsSetupDev returns true if /dev needs to be set up.
16d03e
@@ -56,9 +58,11 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
16d03e
 	}
16d03e
 
16d03e
 	mountConfig := &mountConfig{
16d03e
-		root:     config.Rootfs,
16d03e
-		label:    config.MountLabel,
16d03e
-		cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
16d03e
+		root:            config.Rootfs,
16d03e
+		label:           config.MountLabel,
16d03e
+		cgroup2Path:     iConfig.Cgroup2Path,
16d03e
+		rootlessCgroups: iConfig.RootlessCgroups,
16d03e
+		cgroupns:        config.Namespaces.Contains(configs.NEWCGROUP),
16d03e
 	}
16d03e
 	setupDev := needsSetupDev(config)
16d03e
 	for _, m := range config.Mounts {
16d03e
@@ -307,7 +311,17 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
16d03e
 		// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
16d03e
 		if err == unix.EPERM || err == unix.EBUSY {
16d03e
 			src := fs2.UnifiedMountpoint
16d03e
-			return unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "")
16d03e
+			if c.cgroupns && c.cgroup2Path != "" {
16d03e
+				// Emulate cgroupns by bind-mounting
16d03e
+				// the container cgroup path rather than
16d03e
+				// the whole /sys/fs/cgroup.
16d03e
+				src = c.cgroup2Path
16d03e
+			}
16d03e
+			err = unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "")
16d03e
+			if err == unix.ENOENT && c.rootlessCgroups {
16d03e
+				err = nil
16d03e
+			}
16d03e
+			return err
16d03e
 		}
16d03e
 		return err
16d03e
 	}
16d03e
diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go
16d03e
index 8a201bc78dd9..56bab3bfbfa5 100644
16d03e
--- a/libcontainer/specconv/example.go
16d03e
+++ b/libcontainer/specconv/example.go
16d03e
@@ -2,6 +2,7 @@ package specconv
16d03e
 
16d03e
 import (
16d03e
 	"os"
16d03e
+	"path/filepath"
16d03e
 	"strings"
16d03e
 
16d03e
 	"github.com/opencontainers/runc/libcontainer/cgroups"
16d03e
@@ -200,8 +201,14 @@ func ToRootless(spec *specs.Spec) {
16d03e
 	// Fix up mounts.
16d03e
 	var mounts []specs.Mount
16d03e
 	for _, mount := range spec.Mounts {
16d03e
-		// Ignore all mounts that are under /sys.
16d03e
-		if strings.HasPrefix(mount.Destination, "/sys") {
16d03e
+		// Replace the /sys mount with an rbind.
16d03e
+		if filepath.Clean(mount.Destination) == "/sys" {
16d03e
+			mounts = append(mounts, specs.Mount{
16d03e
+				Source:      "/sys",
16d03e
+				Destination: "/sys",
16d03e
+				Type:        "none",
16d03e
+				Options:     []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
16d03e
+			})
16d03e
 			continue
16d03e
 		}
16d03e
 
16d03e
@@ -216,13 +223,6 @@ func ToRootless(spec *specs.Spec) {
16d03e
 		mount.Options = options
16d03e
 		mounts = append(mounts, mount)
16d03e
 	}
16d03e
-	// Add the sysfs mount as an rbind.
16d03e
-	mounts = append(mounts, specs.Mount{
16d03e
-		Source:      "/sys",
16d03e
-		Destination: "/sys",
16d03e
-		Type:        "none",
16d03e
-		Options:     []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
16d03e
-	})
16d03e
 	spec.Mounts = mounts
16d03e
 
16d03e
 	// Remove cgroup settings.
16d03e
-- 
16d03e
2.31.1
16d03e