2b979c
From 3ca79e786cb0e0098f1d2ab06212a5608a8b257a Mon Sep 17 00:00:00 2001
2b979c
From: Aleksa Sarai <cyphar@cyphar.com>
2b979c
Date: Thu, 1 Apr 2021 12:00:31 -0700
2b979c
Subject: [PATCH] [rc92] rootfs: add mount destination validation
2b979c
2b979c
This is a manual backport of the upstream fix for CVE-2021-30465 to runc
2b979c
v1.0.0-rc92. Original description follows.
2b979c
2b979c
----
2b979c
2b979c
Because the target of a mount is inside a container (which may be a
2b979c
volume that is shared with another container), there exists a race
2b979c
condition where the target of the mount may change to a path containing
2b979c
a symlink after we have sanitised the path -- resulting in us
2b979c
inadvertently mounting the path outside of the container.
2b979c
2b979c
This is not immediately useful because we are in a mount namespace with
2b979c
MS_SLAVE mount propagation applied to "/", so we cannot mount on top of
2b979c
host paths in the host namespace. However, if any subsequent mountpoints
2b979c
in the configuration use a subdirectory of that host path as a source,
2b979c
those subsequent mounts will use an attacker-controlled source path
2b979c
(resolved within the host rootfs) -- allowing the bind-mounting of "/"
2b979c
into the container.
2b979c
2b979c
While arguably configuration issues like this are not entirely within
2b979c
runc's threat model, within the context of Kubernetes (and possibly
2b979c
other container managers that provide semi-arbitrary container creation
2b979c
privileges to untrusted users) this is a legitimate issue. Since we
2b979c
cannot block mounting from the host into the container, we need to block
2b979c
the first stage of this attack (mounting onto a path outside the
2b979c
container).
2b979c
2b979c
The long-term plan to solve this would be to migrate to libpathrs, but
2b979c
as a stop-gap we implement libpathrs-like path verification through
2b979c
readlink(/proc/self/fd/$n) and then do mount operations through the
2b979c
procfd once it's been verified to be inside the container. The target
2b979c
could move after we've checked it, but if it is inside the container
2b979c
then we can assume that it is safe for the same reason that libpathrs
2b979c
operations would be safe.
2b979c
2b979c
A slight wrinkle is the "copyup" functionality we provide for tmpfs,
2b979c
which is the only case where we want to do a mount on the host
2b979c
filesystem. To facilitate this, I split out the copy-up functionality
2b979c
entirely so that the logic isn't interspersed with the regular tmpfs
2b979c
logic. In addition, all dependencies on m.Destination being overwritten
2b979c
have been removed since that pattern was just begging to be a source of
2b979c
more mount-target bugs (we do still have to modify m.Destination for
2b979c
tmpfs-copyup but we only do it temporarily).
2b979c
2b979c
Fixes: CVE-2021-30465
2b979c
Reported-by: Etienne Champetier <champetier.etienne@gmail.com>
2b979c
Co-authored-by: Noah Meyerhans <nmeyerha@amazon.com>
2b979c
Reviewed-by: Samuel Karp <skarp@amazon.com>
2b979c
Reviewed-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
2b979c
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2b979c
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2b979c
---
2b979c
 libcontainer/rootfs_linux.go     | 226 ++++++++++++++++---------------
2b979c
 libcontainer/utils/utils.go      |  54 ++++++++
2b979c
 libcontainer/utils/utils_test.go |  35 +++++
2b979c
 3 files changed, 205 insertions(+), 110 deletions(-)
2b979c
2b979c
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
2b979c
index e00df0a2..e24e0e0c 100644
2b979c
--- a/libcontainer/rootfs_linux.go
2b979c
+++ b/libcontainer/rootfs_linux.go
2b979c
@@ -19,9 +19,10 @@ import (
2b979c
 	"github.com/opencontainers/runc/libcontainer/cgroups"
2b979c
 	"github.com/opencontainers/runc/libcontainer/configs"
2b979c
 	"github.com/opencontainers/runc/libcontainer/system"
2b979c
-	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
2b979c
+	"github.com/opencontainers/runc/libcontainer/utils"
2b979c
 	"github.com/opencontainers/runtime-spec/specs-go"
2b979c
 	"github.com/opencontainers/selinux/go-selinux/label"
2b979c
+	"github.com/sirupsen/logrus"
2b979c
 
2b979c
 	"golang.org/x/sys/unix"
2b979c
 )
2b979c
@@ -31,7 +32,7 @@ const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
2b979c
 // needsSetupDev returns true if /dev needs to be set up.
2b979c
 func needsSetupDev(config *configs.Config) bool {
2b979c
 	for _, m := range config.Mounts {
2b979c
-		if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
2b979c
+		if m.Device == "bind" && utils.CleanPath(m.Destination) == "/dev" {
2b979c
 			return false
2b979c
 		}
2b979c
 	}
2b979c
@@ -139,7 +140,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
2b979c
 func finalizeRootfs(config *configs.Config) (err error) {
2b979c
 	// remount dev as ro if specified
2b979c
 	for _, m := range config.Mounts {
2b979c
-		if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
2b979c
+		if utils.CleanPath(m.Destination) == "/dev" {
2b979c
 			if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY {
2b979c
 				if err := remountReadonly(m); err != nil {
2b979c
 					return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
2b979c
@@ -208,8 +209,6 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
2b979c
 	if err := checkProcMount(rootfs, dest, m.Source); err != nil {
2b979c
 		return err
2b979c
 	}
2b979c
-	// update the mount with the correct dest after symlinks are resolved.
2b979c
-	m.Destination = dest
2b979c
 	if err := createIfNotExists(dest, stat.IsDir()); err != nil {
2b979c
 		return err
2b979c
 	}
2b979c
@@ -246,18 +245,21 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
2b979c
 			if err := os.MkdirAll(subsystemPath, 0755); err != nil {
2b979c
 				return err
2b979c
 			}
2b979c
-			flags := defaultMountFlags
2b979c
-			if m.Flags&unix.MS_RDONLY != 0 {
2b979c
-				flags = flags | unix.MS_RDONLY
2b979c
-			}
2b979c
-			cgroupmount := &configs.Mount{
2b979c
-				Source:      "cgroup",
2b979c
-				Device:      "cgroup", // this is actually fstype
2b979c
-				Destination: subsystemPath,
2b979c
-				Flags:       flags,
2b979c
-				Data:        filepath.Base(subsystemPath),
2b979c
-			}
2b979c
-			if err := mountNewCgroup(cgroupmount); err != nil {
2b979c
+			if err := utils.WithProcfd(rootfs, b.Destination, func(procfd string) error {
2b979c
+				flags := defaultMountFlags
2b979c
+				if m.Flags&unix.MS_RDONLY != 0 {
2b979c
+					flags = flags | unix.MS_RDONLY
2b979c
+				}
2b979c
+				var (
2b979c
+					source = "cgroup"
2b979c
+					data   = filepath.Base(subsystemPath)
2b979c
+				)
2b979c
+				if data == "systemd" {
2b979c
+					data = cgroups.CgroupNamePrefix + data
2b979c
+					source = "systemd"
2b979c
+				}
2b979c
+				return unix.Mount(source, procfd, "cgroup", uintptr(flags), data)
2b979c
+			}); err != nil {
2b979c
 				return err
2b979c
 			}
2b979c
 		} else {
2b979c
@@ -287,22 +289,67 @@ func mountCgroupV2(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
2b979c
 	if err := os.MkdirAll(cgroupPath, 0755); err != nil {
2b979c
 		return err
2b979c
 	}
2b979c
-	if err := unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
2b979c
-		// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
2b979c
-		if err == unix.EPERM || err == unix.EBUSY {
2b979c
-			return unix.Mount("/sys/fs/cgroup", cgroupPath, "", uintptr(m.Flags)|unix.MS_BIND, "")
2b979c
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
2b979c
+		if err := unix.Mount(m.Source, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
2b979c
+			// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
2b979c
+			if err == unix.EPERM || err == unix.EBUSY {
2b979c
+				return unix.Mount("/sys/fs/cgroup", procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
2b979c
+			}
2b979c
+			return err
2b979c
 		}
2b979c
+		return nil
2b979c
+	})
2b979c
+}
2b979c
+
2b979c
+func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
2b979c
+	// Set up a scratch dir for the tmpfs on the host.
2b979c
+	tmpdir, err := prepareTmp("/tmp")
2b979c
+	if err != nil {
2b979c
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
2b979c
+	}
2b979c
+	defer cleanupTmp(tmpdir)
2b979c
+	tmpDir, err := ioutil.TempDir(tmpdir, "runctmpdir")
2b979c
+	if err != nil {
2b979c
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
2b979c
+	}
2b979c
+	defer os.RemoveAll(tmpDir)
2b979c
+
2b979c
+	// Configure the *host* tmpdir as if it's the container mount. We change
2b979c
+	// m.Destination since we are going to mount *on the host*.
2b979c
+	oldDest := m.Destination
2b979c
+	m.Destination = tmpDir
2b979c
+	err = mountPropagate(m, "/", mountLabel)
2b979c
+	m.Destination = oldDest
2b979c
+	if err != nil {
2b979c
 		return err
2b979c
 	}
2b979c
-	return nil
2b979c
+	defer func() {
2b979c
+		if Err != nil {
2b979c
+			if err := unix.Unmount(tmpDir, unix.MNT_DETACH); err != nil {
2b979c
+				logrus.Warnf("tmpcopyup: failed to unmount tmpdir on error: %v", err)
2b979c
+			}
2b979c
+		}
2b979c
+	}()
2b979c
+
2b979c
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) (Err error) {
2b979c
+		// Copy the container data to the host tmpdir. We append "/" to force
2b979c
+		// CopyDirectory to resolve the symlink rather than trying to copy the
2b979c
+		// symlink itself.
2b979c
+		if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil {
2b979c
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, procfd, tmpDir, err)
2b979c
+		}
2b979c
+		// Now move the mount into the container.
2b979c
+		if err := unix.Mount(tmpDir, procfd, "", unix.MS_MOVE, ""); err != nil {
2b979c
+			return fmt.Errorf("tmpcopyup: failed to move mount %s to %s (%s): %w", tmpDir, procfd, m.Destination, err)
2b979c
+		}
2b979c
+		return nil
2b979c
+	})
2b979c
 }
2b979c
 
2b979c
 func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
2b979c
-	var (
2b979c
-		dest = m.Destination
2b979c
-	)
2b979c
-	if !strings.HasPrefix(dest, rootfs) {
2b979c
-		dest = filepath.Join(rootfs, dest)
2b979c
+	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
2b979c
+	if err != nil {
2b979c
+		return err
2b979c
 	}
2b979c
 
2b979c
 	switch m.Device {
2b979c
@@ -337,46 +384,22 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
2b979c
 		}
2b979c
 		return nil
2b979c
 	case "tmpfs":
2b979c
-		copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
2b979c
-		tmpDir := ""
2b979c
 		stat, err := os.Stat(dest)
2b979c
 		if err != nil {
2b979c
 			if err := os.MkdirAll(dest, 0755); err != nil {
2b979c
 				return err
2b979c
 			}
2b979c
 		}
2b979c
-		if copyUp {
2b979c
-			tmpdir, err := prepareTmp("/tmp")
2b979c
-			if err != nil {
2b979c
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
2b979c
-			}
2b979c
-			defer cleanupTmp(tmpdir)
2b979c
-			tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
2b979c
-			if err != nil {
2b979c
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
2b979c
-			}
2b979c
-			defer os.RemoveAll(tmpDir)
2b979c
-			m.Destination = tmpDir
2b979c
+
2b979c
+		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
2b979c
+			err = doTmpfsCopyUp(m, rootfs, mountLabel)
2b979c
+		} else {
2b979c
+			err = mountPropagate(m, rootfs, mountLabel)
2b979c
 		}
2b979c
-		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
2b979c
+		if err != nil {
2b979c
 			return err
2b979c
 		}
2b979c
-		if copyUp {
2b979c
-			if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
2b979c
-				errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
2b979c
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
2b979c
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
2b979c
-				}
2b979c
-				return errMsg
2b979c
-			}
2b979c
-			if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil {
2b979c
-				errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
2b979c
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
2b979c
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
2b979c
-				}
2b979c
-				return errMsg
2b979c
-			}
2b979c
-		}
2b979c
+
2b979c
 		if stat != nil {
2b979c
 			if err = os.Chmod(dest, stat.Mode()); err != nil {
2b979c
 				return err
2b979c
@@ -414,19 +437,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
2b979c
 		}
2b979c
 		return mountCgroupV1(m, rootfs, mountLabel, enableCgroupns)
2b979c
 	default:
2b979c
-		// ensure that the destination of the mount is resolved of symlinks at mount time because
2b979c
-		// any previous mounts can invalidate the next mount's destination.
2b979c
-		// this can happen when a user specifies mounts within other mounts to cause breakouts or other
2b979c
-		// evil stuff to try to escape the container's rootfs.
2b979c
-		var err error
2b979c
-		if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
2b979c
-			return err
2b979c
-		}
2b979c
 		if err := checkProcMount(rootfs, dest, m.Source); err != nil {
2b979c
 			return err
2b979c
 		}
2b979c
-		// update the mount with the correct dest after symlinks are resolved.
2b979c
-		m.Destination = dest
2b979c
 		if err := os.MkdirAll(dest, 0755); err != nil {
2b979c
 			return err
2b979c
 		}
2b979c
@@ -601,7 +614,7 @@ func createDevices(config *configs.Config) error {
2b979c
 	return nil
2b979c
 }
2b979c
 
2b979c
-func bindMountDeviceNode(dest string, node *configs.Device) error {
2b979c
+func bindMountDeviceNode(rootfs, dest string, node *configs.Device) error {
2b979c
 	f, err := os.Create(dest)
2b979c
 	if err != nil && !os.IsExist(err) {
2b979c
 		return err
2b979c
@@ -609,7 +622,9 @@ func bindMountDeviceNode(dest string, node *configs.Device) error {
2b979c
 	if f != nil {
2b979c
 		f.Close()
2b979c
 	}
2b979c
-	return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
2b979c
+	return utils.WithProcfd(rootfs, dest, func(procfd string) error {
2b979c
+		return unix.Mount(node.Path, procfd, "bind", unix.MS_BIND, "")
2b979c
+	})
2b979c
 }
2b979c
 
2b979c
 // Creates the device node in the rootfs of the container.
2b979c
@@ -618,18 +633,21 @@ func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
2b979c
 		// The node only exists for cgroup reasons, ignore it here.
2b979c
 		return nil
2b979c
 	}
2b979c
-	dest := filepath.Join(rootfs, node.Path)
2b979c
+	dest, err := securejoin.SecureJoin(rootfs, node.Path)
2b979c
+	if err != nil {
2b979c
+		return err
2b979c
+	}
2b979c
 	if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
2b979c
 		return err
2b979c
 	}
2b979c
 	if bind {
2b979c
-		return bindMountDeviceNode(dest, node)
2b979c
+		return bindMountDeviceNode(rootfs, dest, node)
2b979c
 	}
2b979c
 	if err := mknodDevice(dest, node); err != nil {
2b979c
 		if os.IsExist(err) {
2b979c
 			return nil
2b979c
 		} else if os.IsPermission(err) {
2b979c
-			return bindMountDeviceNode(dest, node)
2b979c
+			return bindMountDeviceNode(rootfs, dest, node)
2b979c
 		}
2b979c
 		return err
2b979c
 	}
2b979c
@@ -929,55 +947,43 @@ func writeSystemProperty(key, value string) error {
2b979c
 }
2b979c
 
2b979c
 func remount(m *configs.Mount, rootfs string) error {
2b979c
-	var (
2b979c
-		dest = m.Destination
2b979c
-	)
2b979c
-	if !strings.HasPrefix(dest, rootfs) {
2b979c
-		dest = filepath.Join(rootfs, dest)
2b979c
-	}
2b979c
-	return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
2b979c
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
2b979c
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
2b979c
+	})
2b979c
 }
2b979c
 
2b979c
 // Do the mount operation followed by additional mounts required to take care
2b979c
-// of propagation flags.
2b979c
+// of propagation flags. This will always be scoped inside the container rootfs.
2b979c
 func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
2b979c
 	var (
2b979c
-		dest  = m.Destination
2b979c
 		data  = label.FormatMountLabel(m.Data, mountLabel)
2b979c
 		flags = m.Flags
2b979c
 	)
2b979c
-	if libcontainerUtils.CleanPath(dest) == "/dev" {
2b979c
+	if utils.CleanPath(m.Destination) == "/dev" {
2b979c
 		flags &= ^unix.MS_RDONLY
2b979c
 	}
2b979c
 
2b979c
-	copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
2b979c
-	if !(copyUp || strings.HasPrefix(dest, rootfs)) {
2b979c
-		dest = filepath.Join(rootfs, dest)
2b979c
-	}
2b979c
-
2b979c
-	if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
2b979c
-		return err
2b979c
-	}
2b979c
-
2b979c
-	for _, pflag := range m.PropagationFlags {
2b979c
-		if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil {
2b979c
-			return err
2b979c
+	// Because the destination is inside a container path which might be
2b979c
+	// mutating underneath us, we verify that we are actually going to mount
2b979c
+	// inside the container with WithProcfd() -- mounting through a procfd
2b979c
+	// mounts on the target.
2b979c
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
2b979c
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data)
2b979c
+	}); err != nil {
2b979c
+		return fmt.Errorf("mount through procfd: %w", err)
2b979c
+	}
2b979c
+	// We have to apply mount propagation flags in a separate WithProcfd() call
2b979c
+	// because the previous call invalidates the passed procfd -- the mount
2b979c
+	// target needs to be re-opened.
2b979c
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
2b979c
+		for _, pflag := range m.PropagationFlags {
2b979c
+			if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
2b979c
+				return err
2b979c
+			}
2b979c
 		}
2b979c
-	}
2b979c
-	return nil
2b979c
-}
2b979c
-
2b979c
-func mountNewCgroup(m *configs.Mount) error {
2b979c
-	var (
2b979c
-		data   = m.Data
2b979c
-		source = m.Source
2b979c
-	)
2b979c
-	if data == "systemd" {
2b979c
-		data = cgroups.CgroupNamePrefix + data
2b979c
-		source = "systemd"
2b979c
-	}
2b979c
-	if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil {
2b979c
-		return err
2b979c
+		return nil
2b979c
+	}); err != nil {
2b979c
+		return fmt.Errorf("change mount propagation through procfd: %w", err)
2b979c
 	}
2b979c
 	return nil
2b979c
 }
2b979c
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
2b979c
index 40ccfaa1..53563951 100644
2b979c
--- a/libcontainer/utils/utils.go
2b979c
+++ b/libcontainer/utils/utils.go
2b979c
@@ -2,12 +2,15 @@ package utils
2b979c
 
2b979c
 import (
2b979c
 	"encoding/json"
2b979c
+	"fmt"
2b979c
 	"io"
2b979c
 	"os"
2b979c
 	"path/filepath"
2b979c
+	"strconv"
2b979c
 	"strings"
2b979c
 	"unsafe"
2b979c
 
2b979c
+	"github.com/cyphar/filepath-securejoin"
2b979c
 	"golang.org/x/sys/unix"
2b979c
 )
2b979c
 
2b979c
@@ -73,6 +76,57 @@ func CleanPath(path string) string {
2b979c
 	return filepath.Clean(path)
2b979c
 }
2b979c
 
2b979c
+// stripRoot returns the passed path, stripping the root path if it was
2b979c
+// (lexically) inside it. Note that both passed paths will always be treated
2b979c
+// as absolute, and the returned path will also always be absolute. In
2b979c
+// addition, the paths are cleaned before stripping the root.
2b979c
+func stripRoot(root, path string) string {
2b979c
+	// Make the paths clean and absolute.
2b979c
+	root, path = CleanPath("/"+root), CleanPath("/"+path)
2b979c
+	switch {
2b979c
+	case path == root:
2b979c
+		path = "/"
2b979c
+	case root == "/":
2b979c
+		// do nothing
2b979c
+	case strings.HasPrefix(path, root+"/"):
2b979c
+		path = strings.TrimPrefix(path, root+"/")
2b979c
+	}
2b979c
+	return CleanPath("/" + path)
2b979c
+}
2b979c
+
2b979c
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
2b979c
+// corresponding to the unsafePath resolved within the root. Before passing the
2b979c
+// fd, this path is verified to have been inside the root -- so operating on it
2b979c
+// through the passed fdpath should be safe. Do not access this path through
2b979c
+// the original path strings, and do not attempt to use the pathname outside of
2b979c
+// the passed closure (the file handle will be freed once the closure returns).
2b979c
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
2b979c
+	// Remove the root then forcefully resolve inside the root.
2b979c
+	unsafePath = stripRoot(root, unsafePath)
2b979c
+	path, err := securejoin.SecureJoin(root, unsafePath)
2b979c
+	if err != nil {
2b979c
+		return fmt.Errorf("resolving path inside rootfs failed: %v", err)
2b979c
+	}
2b979c
+
2b979c
+	// Open the target path.
2b979c
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
2b979c
+	if err != nil {
2b979c
+		return fmt.Errorf("open o_path procfd: %w", err)
2b979c
+	}
2b979c
+	defer fh.Close()
2b979c
+
2b979c
+	// Double-check the path is the one we expected.
2b979c
+	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
2b979c
+	if realpath, err := os.Readlink(procfd); err != nil {
2b979c
+		return fmt.Errorf("procfd verification failed: %w", err)
2b979c
+	} else if realpath != path {
2b979c
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
2b979c
+	}
2b979c
+
2b979c
+	// Run the closure.
2b979c
+	return fn(procfd)
2b979c
+}
2b979c
+
2b979c
 // SearchLabels searches a list of key-value pairs for the provided key and
2b979c
 // returns the corresponding value. The pairs must be separated with '='.
2b979c
 func SearchLabels(labels []string, query string) string {
2b979c
diff --git a/libcontainer/utils/utils_test.go b/libcontainer/utils/utils_test.go
2b979c
index 395eedcf..5b80cac6 100644
2b979c
--- a/libcontainer/utils/utils_test.go
2b979c
+++ b/libcontainer/utils/utils_test.go
2b979c
@@ -140,3 +140,38 @@ func TestCleanPath(t *testing.T) {
2b979c
 		t.Errorf("expected to receive '/foo' and received %s", path)
2b979c
 	}
2b979c
 }
2b979c
+
2b979c
+func TestStripRoot(t *testing.T) {
2b979c
+	for _, test := range []struct {
2b979c
+		root, path, out string
2b979c
+	}{
2b979c
+		// Works with multiple components.
2b979c
+		{"/a/b", "/a/b/c", "/c"},
2b979c
+		{"/hello/world", "/hello/world/the/quick-brown/fox", "/the/quick-brown/fox"},
2b979c
+		// '/' must be a no-op.
2b979c
+		{"/", "/a/b/c", "/a/b/c"},
2b979c
+		// Must be the correct order.
2b979c
+		{"/a/b", "/a/c/b", "/a/c/b"},
2b979c
+		// Must be at start.
2b979c
+		{"/abc/def", "/foo/abc/def/bar", "/foo/abc/def/bar"},
2b979c
+		// Must be a lexical parent.
2b979c
+		{"/foo/bar", "/foo/barSAMECOMPONENT", "/foo/barSAMECOMPONENT"},
2b979c
+		// Must only strip the root once.
2b979c
+		{"/foo/bar", "/foo/bar/foo/bar/baz", "/foo/bar/baz"},
2b979c
+		// Deal with .. in a fairly sane way.
2b979c
+		{"/foo/bar", "/foo/bar/../baz", "/foo/baz"},
2b979c
+		{"/foo/bar", "../../../../../../foo/bar/baz", "/baz"},
2b979c
+		{"/foo/bar", "/../../../../../../foo/bar/baz", "/baz"},
2b979c
+		{"/foo/bar/../baz", "/foo/baz/bar", "/bar"},
2b979c
+		{"/foo/bar/../baz", "/foo/baz/../bar/../baz/./foo", "/foo"},
2b979c
+		// All paths are made absolute before stripping.
2b979c
+		{"foo/bar", "/foo/bar/baz/bee", "/baz/bee"},
2b979c
+		{"/foo/bar", "foo/bar/baz/beef", "/baz/beef"},
2b979c
+		{"foo/bar", "foo/bar/baz/beets", "/baz/beets"},
2b979c
+	} {
2b979c
+		got := stripRoot(test.root, test.path)
2b979c
+		if got != test.out {
2b979c
+			t.Errorf("stripRoot(%q, %q) -- got %q, expected %q", test.root, test.path, got, test.out)
2b979c
+		}
2b979c
+	}
2b979c
+}
2b979c
-- 
2b979c
2.31.1
2b979c