|
 |
16d03e |
From ce352accdfb07a91b5527e70ec8bce658a8b68de Mon Sep 17 00:00:00 2001
|
|
 |
16d03e |
From: Kir Kolyshkin <kolyshkin@gmail.com>
|
|
 |
16d03e |
Date: Tue, 23 Feb 2021 18:27:42 -0800
|
|
 |
16d03e |
Subject: [PATCH 4/5] Fix cgroup2 mount for rootless case
|
|
 |
16d03e |
|
|
 |
16d03e |
In the rootless case, mounting cgroup2 is not possible (see [1] for more
|
|
 |
16d03e |
details), so since commit 9c81440fb5a7 runc bind-mounts the whole
|
|
 |
16d03e |
/sys/fs/cgroup into container.
|
|
 |
16d03e |
|
|
 |
16d03e |
The problem: if cgroupns is enabled, /sys/fs/cgroup inside the container
|
|
 |
16d03e |
is supposed to show the cgroup files for this cgroup, not the root one.
|
|
 |
16d03e |
|
|
 |
16d03e |
The fix is to pass the cgroup path through and use it in case cgroup2
|
|
 |
16d03e |
mount failed, cgroupns is enabled, and the path is non-empty.
|
|
 |
16d03e |
|
|
 |
16d03e |
This requires the /sys/fs/cgroup mount to be in the spec, so modify
|
|
 |
16d03e |
runc spec --rootless to keep it.
|
|
 |
16d03e |
|
|
 |
16d03e |
Before:
|
|
 |
16d03e |
|
|
 |
16d03e |
$ ./runc run aaa
|
|
 |
16d03e |
# find /sys/fs/cgroup/ -type d
|
|
 |
16d03e |
/sys/fs/cgroup
|
|
 |
16d03e |
/sys/fs/cgroup/user.slice
|
|
 |
16d03e |
/sys/fs/cgroup/user.slice/user-1000.slice
|
|
 |
16d03e |
/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service
|
|
 |
16d03e |
...
|
|
 |
16d03e |
# ls -l /sys/fs/cgroup/cgroup.controllers
|
|
 |
16d03e |
-r--r--r-- 1 nobody nogroup 0 Feb 24 02:22 /sys/fs/cgroup/cgroup.controllers
|
|
 |
16d03e |
# wc -w /sys/fs/cgroup/cgroup.procs
|
|
 |
16d03e |
142 /sys/fs/cgroup/cgroup.procs
|
|
 |
16d03e |
# cat /sys/fs/cgroup/memory.current
|
|
 |
16d03e |
cat: can't open '/sys/fs/cgroup/memory.current': No such file or directory
|
|
 |
16d03e |
|
|
 |
16d03e |
After:
|
|
 |
16d03e |
|
|
 |
16d03e |
# find /sys/fs/cgroup/ -type d
|
|
 |
16d03e |
/sys/fs/cgroup/
|
|
 |
16d03e |
# ls -l /sys/fs/cgroup/cgroup.controllers
|
|
 |
16d03e |
-r--r--r-- 1 root root 0 Feb 24 02:43 /sys/fs/cgroup/cgroup.controllers
|
|
 |
16d03e |
# wc -w /sys/fs/cgroup/cgroup.procs
|
|
 |
16d03e |
2 /sys/fs/cgroup/cgroup.procs
|
|
 |
16d03e |
# cat /sys/fs/cgroup/memory.current
|
|
 |
16d03e |
577536
|
|
 |
16d03e |
|
|
 |
16d03e |
[1] https://github.com/opencontainers/runc/issues/2158
|
|
 |
16d03e |
|
|
 |
16d03e |
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
|
 |
16d03e |
---
|
|
 |
16d03e |
libcontainer/container_linux.go | 3 +++
|
|
 |
16d03e |
libcontainer/init_linux.go | 1 +
|
|
 |
16d03e |
libcontainer/rootfs_linux.go | 28 +++++++++++++++++++++-------
|
|
 |
16d03e |
libcontainer/specconv/example.go | 18 +++++++++---------
|
|
 |
16d03e |
4 files changed, 34 insertions(+), 16 deletions(-)
|
|
 |
16d03e |
|
|
 |
16d03e |
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
|
|
 |
16d03e |
index b6100aae9d5a..1cbc734172d0 100644
|
|
 |
16d03e |
--- a/libcontainer/container_linux.go
|
|
 |
16d03e |
+++ b/libcontainer/container_linux.go
|
|
 |
16d03e |
@@ -610,6 +610,9 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
|
|
 |
16d03e |
if len(process.Rlimits) > 0 {
|
|
 |
16d03e |
cfg.Rlimits = process.Rlimits
|
|
 |
16d03e |
}
|
|
 |
16d03e |
+ if cgroups.IsCgroup2UnifiedMode() {
|
|
 |
16d03e |
+ cfg.Cgroup2Path = c.cgroupManager.Path("")
|
|
 |
16d03e |
+ }
|
|
 |
16d03e |
|
|
 |
16d03e |
return cfg
|
|
 |
16d03e |
}
|
|
 |
16d03e |
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
|
|
 |
16d03e |
index c57af0eebb8b..681797099f46 100644
|
|
 |
16d03e |
--- a/libcontainer/init_linux.go
|
|
 |
16d03e |
+++ b/libcontainer/init_linux.go
|
|
 |
16d03e |
@@ -70,6 +70,7 @@ type initConfig struct {
|
|
 |
16d03e |
RootlessEUID bool `json:"rootless_euid,omitempty"`
|
|
 |
16d03e |
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
|
|
 |
16d03e |
SpecState *specs.State `json:"spec_state,omitempty"`
|
|
 |
16d03e |
+ Cgroup2Path string `json:"cgroup2_path,omitempty"`
|
|
 |
16d03e |
}
|
|
 |
16d03e |
|
|
 |
16d03e |
type initer interface {
|
|
 |
16d03e |
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
|
|
 |
16d03e |
index 0f0495b93b3e..5d2d74cf924b 100644
|
|
 |
16d03e |
--- a/libcontainer/rootfs_linux.go
|
|
 |
16d03e |
+++ b/libcontainer/rootfs_linux.go
|
|
 |
16d03e |
@@ -31,9 +31,11 @@ import (
|
|
 |
16d03e |
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
|
|
 |
16d03e |
|
|
 |
16d03e |
type mountConfig struct {
|
|
 |
16d03e |
- root string
|
|
 |
16d03e |
- label string
|
|
 |
16d03e |
- cgroupns bool
|
|
 |
16d03e |
+ root string
|
|
 |
16d03e |
+ label string
|
|
 |
16d03e |
+ cgroup2Path string
|
|
 |
16d03e |
+ rootlessCgroups bool
|
|
 |
16d03e |
+ cgroupns bool
|
|
 |
16d03e |
}
|
|
 |
16d03e |
|
|
 |
16d03e |
// needsSetupDev returns true if /dev needs to be set up.
|
|
 |
16d03e |
@@ -56,9 +58,11 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
|
|
 |
16d03e |
}
|
|
 |
16d03e |
|
|
 |
16d03e |
mountConfig := &mountConfig{
|
|
 |
16d03e |
- root: config.Rootfs,
|
|
 |
16d03e |
- label: config.MountLabel,
|
|
 |
16d03e |
- cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
|
|
 |
16d03e |
+ root: config.Rootfs,
|
|
 |
16d03e |
+ label: config.MountLabel,
|
|
 |
16d03e |
+ cgroup2Path: iConfig.Cgroup2Path,
|
|
 |
16d03e |
+ rootlessCgroups: iConfig.RootlessCgroups,
|
|
 |
16d03e |
+ cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
|
|
 |
16d03e |
}
|
|
 |
16d03e |
setupDev := needsSetupDev(config)
|
|
 |
16d03e |
for _, m := range config.Mounts {
|
|
 |
16d03e |
@@ -307,7 +311,17 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
|
|
 |
16d03e |
// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
|
|
 |
16d03e |
if err == unix.EPERM || err == unix.EBUSY {
|
|
 |
16d03e |
src := fs2.UnifiedMountpoint
|
|
 |
16d03e |
- return unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "")
|
|
 |
16d03e |
+ if c.cgroupns && c.cgroup2Path != "" {
|
|
 |
16d03e |
+ // Emulate cgroupns by bind-mounting
|
|
 |
16d03e |
+ // the container cgroup path rather than
|
|
 |
16d03e |
+ // the whole /sys/fs/cgroup.
|
|
 |
16d03e |
+ src = c.cgroup2Path
|
|
 |
16d03e |
+ }
|
|
 |
16d03e |
+ err = unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "")
|
|
 |
16d03e |
+ if err == unix.ENOENT && c.rootlessCgroups {
|
|
 |
16d03e |
+ err = nil
|
|
 |
16d03e |
+ }
|
|
 |
16d03e |
+ return err
|
|
 |
16d03e |
}
|
|
 |
16d03e |
return err
|
|
 |
16d03e |
}
|
|
 |
16d03e |
diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go
|
|
 |
16d03e |
index 8a201bc78dd9..56bab3bfbfa5 100644
|
|
 |
16d03e |
--- a/libcontainer/specconv/example.go
|
|
 |
16d03e |
+++ b/libcontainer/specconv/example.go
|
|
 |
16d03e |
@@ -2,6 +2,7 @@ package specconv
|
|
 |
16d03e |
|
|
 |
16d03e |
import (
|
|
 |
16d03e |
"os"
|
|
 |
16d03e |
+ "path/filepath"
|
|
 |
16d03e |
"strings"
|
|
 |
16d03e |
|
|
 |
16d03e |
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
 |
16d03e |
@@ -200,8 +201,14 @@ func ToRootless(spec *specs.Spec) {
|
|
 |
16d03e |
// Fix up mounts.
|
|
 |
16d03e |
var mounts []specs.Mount
|
|
 |
16d03e |
for _, mount := range spec.Mounts {
|
|
 |
16d03e |
- // Ignore all mounts that are under /sys.
|
|
 |
16d03e |
- if strings.HasPrefix(mount.Destination, "/sys") {
|
|
 |
16d03e |
+ // Replace the /sys mount with an rbind.
|
|
 |
16d03e |
+ if filepath.Clean(mount.Destination) == "/sys" {
|
|
 |
16d03e |
+ mounts = append(mounts, specs.Mount{
|
|
 |
16d03e |
+ Source: "/sys",
|
|
 |
16d03e |
+ Destination: "/sys",
|
|
 |
16d03e |
+ Type: "none",
|
|
 |
16d03e |
+ Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
|
|
 |
16d03e |
+ })
|
|
 |
16d03e |
continue
|
|
 |
16d03e |
}
|
|
 |
16d03e |
|
|
 |
16d03e |
@@ -216,13 +223,6 @@ func ToRootless(spec *specs.Spec) {
|
|
 |
16d03e |
mount.Options = options
|
|
 |
16d03e |
mounts = append(mounts, mount)
|
|
 |
16d03e |
}
|
|
 |
16d03e |
- // Add the sysfs mount as an rbind.
|
|
 |
16d03e |
- mounts = append(mounts, specs.Mount{
|
|
 |
16d03e |
- Source: "/sys",
|
|
 |
16d03e |
- Destination: "/sys",
|
|
 |
16d03e |
- Type: "none",
|
|
 |
16d03e |
- Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
|
|
 |
16d03e |
- })
|
|
 |
16d03e |
spec.Mounts = mounts
|
|
 |
16d03e |
|
|
 |
16d03e |
// Remove cgroup settings.
|
|
 |
16d03e |
--
|
|
 |
16d03e |
2.31.1
|
|
 |
16d03e |
|