16d03e
From 14faf1c20948688a48edb9b41367ab07ac11ca91 Mon Sep 17 00:00:00 2001
16d03e
From: Aleksa Sarai <cyphar@cyphar.com>
16d03e
Date: Thu, 1 Apr 2021 12:00:31 -0700
16d03e
Subject: [PATCH 5/5] rootfs: add mount destination validation
16d03e
16d03e
Because the target of a mount is inside a container (which may be a
16d03e
volume that is shared with another container), there exists a race
16d03e
condition where the target of the mount may change to a path containing
16d03e
a symlink after we have sanitised the path -- resulting in us
16d03e
inadvertently mounting the path outside of the container.
16d03e
16d03e
This is not immediately useful because we are in a mount namespace with
16d03e
MS_SLAVE mount propagation applied to "/", so we cannot mount on top of
16d03e
host paths in the host namespace. However, if any subsequent mountpoints
16d03e
in the configuration use a subdirectory of that host path as a source,
16d03e
those subsequent mounts will use an attacker-controlled source path
16d03e
(resolved within the host rootfs) -- allowing the bind-mounting of "/"
16d03e
into the container.
16d03e
16d03e
While arguably configuration issues like this are not entirely within
16d03e
runc's threat model, within the context of Kubernetes (and possibly
16d03e
other container managers that provide semi-arbitrary container creation
16d03e
privileges to untrusted users) this is a legitimate issue. Since we
16d03e
cannot block mounting from the host into the container, we need to block
16d03e
the first stage of this attack (mounting onto a path outside the
16d03e
container).
16d03e
16d03e
The long-term plan to solve this would be to migrate to libpathrs, but
16d03e
as a stop-gap we implement libpathrs-like path verification through
16d03e
readlink(/proc/self/fd/$n) and then do mount operations through the
16d03e
procfd once it's been verified to be inside the container. The target
16d03e
could move after we've checked it, but if it is inside the container
16d03e
then we can assume that it is safe for the same reason that libpathrs
16d03e
operations would be safe.
16d03e
16d03e
A slight wrinkle is the "copyup" functionality we provide for tmpfs,
16d03e
which is the only case where we want to do a mount on the host
16d03e
filesystem. To facilitate this, I split out the copy-up functionality
16d03e
entirely so that the logic isn't interspersed with the regular tmpfs
16d03e
logic. In addition, all dependencies on m.Destination being overwritten
16d03e
have been removed since that pattern was just begging to be a source of
16d03e
more mount-target bugs (we do still have to modify m.Destination for
16d03e
tmpfs-copyup but we only do it temporarily).
16d03e
16d03e
Fixes: CVE-2021-30465
16d03e
Reported-by: Etienne Champetier <champetier.etienne@gmail.com>
16d03e
Co-authored-by: Noah Meyerhans <nmeyerha@amazon.com>
16d03e
Reviewed-by: Samuel Karp <skarp@amazon.com>
16d03e
Reviewed-by: Kir Kolyshkin <kolyshkin@gmail.com> (@kolyshkin)
16d03e
Reviewed-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
16d03e
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
16d03e
---
16d03e
 libcontainer/container_linux.go  |   1 -
16d03e
 libcontainer/rootfs_linux.go     | 251 +++++++++++++++----------------
16d03e
 libcontainer/utils/utils.go      |  54 +++++++
16d03e
 libcontainer/utils/utils_test.go |  35 +++++
16d03e
 4 files changed, 213 insertions(+), 128 deletions(-)
16d03e
16d03e
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
16d03e
index 1cbc734172d0..70b388b1252e 100644
16d03e
--- a/libcontainer/container_linux.go
16d03e
+++ b/libcontainer/container_linux.go
16d03e
@@ -1202,7 +1202,6 @@ func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
16d03e
 		if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
16d03e
 			return err
16d03e
 		}
16d03e
-		m.Destination = dest
16d03e
 		if err := os.MkdirAll(dest, 0755); err != nil {
16d03e
 			return err
16d03e
 		}
16d03e
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
16d03e
index 5d2d74cf924b..96be669c365e 100644
16d03e
--- a/libcontainer/rootfs_linux.go
16d03e
+++ b/libcontainer/rootfs_linux.go
16d03e
@@ -25,6 +25,7 @@ import (
16d03e
 	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
16d03e
 	"github.com/opencontainers/runtime-spec/specs-go"
16d03e
 	"github.com/opencontainers/selinux/go-selinux/label"
16d03e
+	"github.com/sirupsen/logrus"
16d03e
 	"golang.org/x/sys/unix"
16d03e
 )
16d03e
 
16d03e
@@ -228,8 +229,6 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
16d03e
 	if err := checkProcMount(rootfs, dest, m.Source); err != nil {
16d03e
 		return err
16d03e
 	}
16d03e
-	// update the mount with the correct dest after symlinks are resolved.
16d03e
-	m.Destination = dest
16d03e
 	if err := createIfNotExists(dest, stat.IsDir()); err != nil {
16d03e
 		return err
16d03e
 	}
16d03e
@@ -266,18 +265,21 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
16d03e
 			if err := os.MkdirAll(subsystemPath, 0755); err != nil {
16d03e
 				return err
16d03e
 			}
16d03e
-			flags := defaultMountFlags
16d03e
-			if m.Flags&unix.MS_RDONLY != 0 {
16d03e
-				flags = flags | unix.MS_RDONLY
16d03e
-			}
16d03e
-			cgroupmount := &configs.Mount{
16d03e
-				Source:      "cgroup",
16d03e
-				Device:      "cgroup", // this is actually fstype
16d03e
-				Destination: subsystemPath,
16d03e
-				Flags:       flags,
16d03e
-				Data:        filepath.Base(subsystemPath),
16d03e
-			}
16d03e
-			if err := mountNewCgroup(cgroupmount); err != nil {
16d03e
+			if err := utils.WithProcfd(c.root, b.Destination, func(procfd string) error {
16d03e
+				flags := defaultMountFlags
16d03e
+				if m.Flags&unix.MS_RDONLY != 0 {
16d03e
+					flags = flags | unix.MS_RDONLY
16d03e
+				}
16d03e
+				var (
16d03e
+					source = "cgroup"
16d03e
+					data   = filepath.Base(subsystemPath)
16d03e
+				)
16d03e
+				if data == "systemd" {
16d03e
+					data = cgroups.CgroupNamePrefix + data
16d03e
+					source = "systemd"
16d03e
+				}
16d03e
+				return unix.Mount(source, procfd, "cgroup", uintptr(flags), data)
16d03e
+			}); err != nil {
16d03e
 				return err
16d03e
 			}
16d03e
 		} else {
16d03e
@@ -307,33 +309,79 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
16d03e
 	if err := os.MkdirAll(dest, 0755); err != nil {
16d03e
 		return err
16d03e
 	}
16d03e
-	if err := unix.Mount(m.Source, dest, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
16d03e
-		// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
16d03e
-		if err == unix.EPERM || err == unix.EBUSY {
16d03e
-			src := fs2.UnifiedMountpoint
16d03e
-			if c.cgroupns && c.cgroup2Path != "" {
16d03e
-				// Emulate cgroupns by bind-mounting
16d03e
-				// the container cgroup path rather than
16d03e
-				// the whole /sys/fs/cgroup.
16d03e
-				src = c.cgroup2Path
16d03e
-			}
16d03e
-			err = unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "")
16d03e
-			if err == unix.ENOENT && c.rootlessCgroups {
16d03e
-				err = nil
16d03e
+	return utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
16d03e
+		if err := unix.Mount(m.Source, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
16d03e
+			// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
16d03e
+			if err == unix.EPERM || err == unix.EBUSY {
16d03e
+				src := fs2.UnifiedMountpoint
16d03e
+				if c.cgroupns && c.cgroup2Path != "" {
16d03e
+					// Emulate cgroupns by bind-mounting
16d03e
+					// the container cgroup path rather than
16d03e
+					// the whole /sys/fs/cgroup.
16d03e
+					src = c.cgroup2Path
16d03e
+				}
16d03e
+				err = unix.Mount(src, procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
16d03e
+				if err == unix.ENOENT && c.rootlessCgroups {
16d03e
+					err = nil
16d03e
+				}
16d03e
 			}
16d03e
 			return err
16d03e
 		}
16d03e
+		return nil
16d03e
+	})
16d03e
+}
16d03e
+
16d03e
+func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
16d03e
+	// Set up a scratch dir for the tmpfs on the host.
16d03e
+	tmpdir, err := prepareTmp("/tmp")
16d03e
+	if err != nil {
16d03e
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
16d03e
+	}
16d03e
+	defer cleanupTmp(tmpdir)
16d03e
+	tmpDir, err := ioutil.TempDir(tmpdir, "runctmpdir")
16d03e
+	if err != nil {
16d03e
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
16d03e
+	}
16d03e
+	defer os.RemoveAll(tmpDir)
16d03e
+
16d03e
+	// Configure the *host* tmpdir as if it's the container mount. We change
16d03e
+	// m.Destination since we are going to mount *on the host*.
16d03e
+	oldDest := m.Destination
16d03e
+	m.Destination = tmpDir
16d03e
+	err = mountPropagate(m, "/", mountLabel)
16d03e
+	m.Destination = oldDest
16d03e
+	if err != nil {
16d03e
 		return err
16d03e
 	}
16d03e
-	return nil
16d03e
+	defer func() {
16d03e
+		if Err != nil {
16d03e
+			if err := unix.Unmount(tmpDir, unix.MNT_DETACH); err != nil {
16d03e
+				logrus.Warnf("tmpcopyup: failed to unmount tmpdir on error: %v", err)
16d03e
+			}
16d03e
+		}
16d03e
+	}()
16d03e
+
16d03e
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) (Err error) {
16d03e
+		// Copy the container data to the host tmpdir. We append "/" to force
16d03e
+		// CopyDirectory to resolve the symlink rather than trying to copy the
16d03e
+		// symlink itself.
16d03e
+		if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil {
16d03e
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, procfd, tmpDir, err)
16d03e
+		}
16d03e
+		// Now move the mount into the container.
16d03e
+		if err := unix.Mount(tmpDir, procfd, "", unix.MS_MOVE, ""); err != nil {
16d03e
+			return fmt.Errorf("tmpcopyup: failed to move mount %s to %s (%s): %w", tmpDir, procfd, m.Destination, err)
16d03e
+		}
16d03e
+		return nil
16d03e
+	})
16d03e
 }
16d03e
 
16d03e
 func mountToRootfs(m *configs.Mount, c *mountConfig) error {
16d03e
 	rootfs := c.root
16d03e
 	mountLabel := c.label
16d03e
-	dest := m.Destination
16d03e
-	if !strings.HasPrefix(dest, rootfs) {
16d03e
-		dest = filepath.Join(rootfs, dest)
16d03e
+	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
16d03e
+	if err != nil {
16d03e
+		return err
16d03e
 	}
16d03e
 
16d03e
 	switch m.Device {
16d03e
@@ -364,53 +412,21 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
16d03e
 		}
16d03e
 		return label.SetFileLabel(dest, mountLabel)
16d03e
 	case "tmpfs":
16d03e
-		copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
16d03e
-		tmpDir := ""
16d03e
-		// dest might be an absolute symlink, so it needs
16d03e
-		// to be resolved under rootfs.
16d03e
-		dest, err := securejoin.SecureJoin(rootfs, m.Destination)
16d03e
-		if err != nil {
16d03e
-			return err
16d03e
-		}
16d03e
-		m.Destination = dest
16d03e
 		stat, err := os.Stat(dest)
16d03e
 		if err != nil {
16d03e
 			if err := os.MkdirAll(dest, 0755); err != nil {
16d03e
 				return err
16d03e
 			}
16d03e
 		}
16d03e
-		if copyUp {
16d03e
-			tmpdir, err := prepareTmp("/tmp")
16d03e
-			if err != nil {
16d03e
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
16d03e
-			}
16d03e
-			defer cleanupTmp(tmpdir)
16d03e
-			tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
16d03e
-			if err != nil {
16d03e
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
16d03e
-			}
16d03e
-			defer os.RemoveAll(tmpDir)
16d03e
-			m.Destination = tmpDir
16d03e
+
16d03e
+		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
16d03e
+			err = doTmpfsCopyUp(m, rootfs, mountLabel)
16d03e
+		} else {
16d03e
+			err = mountPropagate(m, rootfs, mountLabel)
16d03e
 		}
16d03e
-		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
16d03e
+		if err != nil {
16d03e
 			return err
16d03e
 		}
16d03e
-		if copyUp {
16d03e
-			if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
16d03e
-				errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
16d03e
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
16d03e
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
16d03e
-				}
16d03e
-				return errMsg
16d03e
-			}
16d03e
-			if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil {
16d03e
-				errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
16d03e
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
16d03e
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
16d03e
-				}
16d03e
-				return errMsg
16d03e
-			}
16d03e
-		}
16d03e
 		if stat != nil {
16d03e
 			if err = os.Chmod(dest, stat.Mode()); err != nil {
16d03e
 				return err
16d03e
@@ -454,19 +470,9 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
16d03e
 		}
16d03e
 		return mountCgroupV1(m, c)
16d03e
 	default:
16d03e
-		// ensure that the destination of the mount is resolved of symlinks at mount time because
16d03e
-		// any previous mounts can invalidate the next mount's destination.
16d03e
-		// this can happen when a user specifies mounts within other mounts to cause breakouts or other
16d03e
-		// evil stuff to try to escape the container's rootfs.
16d03e
-		var err error
16d03e
-		if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
16d03e
-			return err
16d03e
-		}
16d03e
 		if err := checkProcMount(rootfs, dest, m.Source); err != nil {
16d03e
 			return err
16d03e
 		}
16d03e
-		// update the mount with the correct dest after symlinks are resolved.
16d03e
-		m.Destination = dest
16d03e
 		if err := os.MkdirAll(dest, 0755); err != nil {
16d03e
 			return err
16d03e
 		}
16d03e
@@ -649,7 +655,7 @@ func createDevices(config *configs.Config) error {
16d03e
 	return nil
16d03e
 }
16d03e
 
16d03e
-func bindMountDeviceNode(dest string, node *devices.Device) error {
16d03e
+func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
16d03e
 	f, err := os.Create(dest)
16d03e
 	if err != nil && !os.IsExist(err) {
16d03e
 		return err
16d03e
@@ -657,7 +663,9 @@ func bindMountDeviceNode(dest string, node *devices.Device) error {
16d03e
 	if f != nil {
16d03e
 		f.Close()
16d03e
 	}
16d03e
-	return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
16d03e
+	return utils.WithProcfd(rootfs, dest, func(procfd string) error {
16d03e
+		return unix.Mount(node.Path, procfd, "bind", unix.MS_BIND, "")
16d03e
+	})
16d03e
 }
16d03e
 
16d03e
 // Creates the device node in the rootfs of the container.
16d03e
@@ -666,18 +674,21 @@ func createDeviceNode(rootfs string, node *devices.Device, bind bool) error {
16d03e
 		// The node only exists for cgroup reasons, ignore it here.
16d03e
 		return nil
16d03e
 	}
16d03e
-	dest := filepath.Join(rootfs, node.Path)
16d03e
+	dest, err := securejoin.SecureJoin(rootfs, node.Path)
16d03e
+	if err != nil {
16d03e
+		return err
16d03e
+	}
16d03e
 	if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
16d03e
 		return err
16d03e
 	}
16d03e
 	if bind {
16d03e
-		return bindMountDeviceNode(dest, node)
16d03e
+		return bindMountDeviceNode(rootfs, dest, node)
16d03e
 	}
16d03e
 	if err := mknodDevice(dest, node); err != nil {
16d03e
 		if os.IsExist(err) {
16d03e
 			return nil
16d03e
 		} else if os.IsPermission(err) {
16d03e
-			return bindMountDeviceNode(dest, node)
16d03e
+			return bindMountDeviceNode(rootfs, dest, node)
16d03e
 		}
16d03e
 		return err
16d03e
 	}
16d03e
@@ -1013,61 +1024,47 @@ func writeSystemProperty(key, value string) error {
16d03e
 }
16d03e
 
16d03e
 func remount(m *configs.Mount, rootfs string) error {
16d03e
-	var (
16d03e
-		dest = m.Destination
16d03e
-	)
16d03e
-	if !strings.HasPrefix(dest, rootfs) {
16d03e
-		dest = filepath.Join(rootfs, dest)
16d03e
-	}
16d03e
-	return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
16d03e
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
16d03e
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
16d03e
+	})
16d03e
 }
16d03e
 
16d03e
 // Do the mount operation followed by additional mounts required to take care
16d03e
-// of propagation flags.
16d03e
+// of propagation flags. This will always be scoped inside the container rootfs.
16d03e
 func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
16d03e
 	var (
16d03e
-		dest  = m.Destination
16d03e
 		data  = label.FormatMountLabel(m.Data, mountLabel)
16d03e
 		flags = m.Flags
16d03e
 	)
16d03e
-	if libcontainerUtils.CleanPath(dest) == "/dev" {
16d03e
-		flags &= ^unix.MS_RDONLY
16d03e
-	}
16d03e
-
16d03e
-	// Mount it rw to allow chmod operation. A remount will be performed
16d03e
-	// later to make it ro if set.
16d03e
-	if m.Device == "tmpfs" {
16d03e
+	// Delay mounting the filesystem read-only if we need to do further
16d03e
+	// operations on it. We need to set up files in "/dev" and tmpfs mounts may
16d03e
+	// need to be chmod-ed after mounting. The mount will be remounted ro later
16d03e
+	// in finalizeRootfs() if necessary.
16d03e
+	if libcontainerUtils.CleanPath(m.Destination) == "/dev" || m.Device == "tmpfs" {
16d03e
 		flags &= ^unix.MS_RDONLY
16d03e
 	}
16d03e
 
16d03e
-	copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
16d03e
-	if !(copyUp || strings.HasPrefix(dest, rootfs)) {
16d03e
-		dest = filepath.Join(rootfs, dest)
16d03e
-	}
16d03e
-
16d03e
-	if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
16d03e
-		return err
16d03e
-	}
16d03e
-
16d03e
-	for _, pflag := range m.PropagationFlags {
16d03e
-		if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil {
16d03e
-			return err
16d03e
+	// Because the destination is inside a container path which might be
16d03e
+	// mutating underneath us, we verify that we are actually going to mount
16d03e
+	// inside the container with WithProcfd() -- mounting through a procfd
16d03e
+	// mounts on the target.
16d03e
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
16d03e
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data)
16d03e
+	}); err != nil {
16d03e
+		return fmt.Errorf("mount through procfd: %w", err)
16d03e
+	}
16d03e
+	// We have to apply mount propagation flags in a separate WithProcfd() call
16d03e
+	// because the previous call invalidates the passed procfd -- the mount
16d03e
+	// target needs to be re-opened.
16d03e
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
16d03e
+		for _, pflag := range m.PropagationFlags {
16d03e
+			if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
16d03e
+				return err
16d03e
+			}
16d03e
 		}
16d03e
-	}
16d03e
-	return nil
16d03e
-}
16d03e
-
16d03e
-func mountNewCgroup(m *configs.Mount) error {
16d03e
-	var (
16d03e
-		data   = m.Data
16d03e
-		source = m.Source
16d03e
-	)
16d03e
-	if data == "systemd" {
16d03e
-		data = cgroups.CgroupNamePrefix + data
16d03e
-		source = "systemd"
16d03e
-	}
16d03e
-	if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil {
16d03e
-		return err
16d03e
+		return nil
16d03e
+	}); err != nil {
16d03e
+		return fmt.Errorf("change mount propagation through procfd: %w", err)
16d03e
 	}
16d03e
 	return nil
16d03e
 }
16d03e
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
16d03e
index 1b72b7a1c1ba..cd78f23e1bd0 100644
16d03e
--- a/libcontainer/utils/utils.go
16d03e
+++ b/libcontainer/utils/utils.go
16d03e
@@ -3,12 +3,15 @@ package utils
16d03e
 import (
16d03e
 	"encoding/binary"
16d03e
 	"encoding/json"
16d03e
+	"fmt"
16d03e
 	"io"
16d03e
 	"os"
16d03e
 	"path/filepath"
16d03e
+	"strconv"
16d03e
 	"strings"
16d03e
 	"unsafe"
16d03e
 
16d03e
+	"github.com/cyphar/filepath-securejoin"
16d03e
 	"golang.org/x/sys/unix"
16d03e
 )
16d03e
 
16d03e
@@ -88,6 +91,57 @@ func CleanPath(path string) string {
16d03e
 	return filepath.Clean(path)
16d03e
 }
16d03e
 
16d03e
+// stripRoot returns the passed path, stripping the root path if it was
16d03e
+// (lexically) inside it. Note that both passed paths will always be treated
16d03e
+// as absolute, and the returned path will also always be absolute. In
16d03e
+// addition, the paths are cleaned before stripping the root.
16d03e
+func stripRoot(root, path string) string {
16d03e
+	// Make the paths clean and absolute.
16d03e
+	root, path = CleanPath("/"+root), CleanPath("/"+path)
16d03e
+	switch {
16d03e
+	case path == root:
16d03e
+		path = "/"
16d03e
+	case root == "/":
16d03e
+		// do nothing
16d03e
+	case strings.HasPrefix(path, root+"/"):
16d03e
+		path = strings.TrimPrefix(path, root+"/")
16d03e
+	}
16d03e
+	return CleanPath("/" + path)
16d03e
+}
16d03e
+
16d03e
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
16d03e
+// corresponding to the unsafePath resolved within the root. Before passing the
16d03e
+// fd, this path is verified to have been inside the root -- so operating on it
16d03e
+// through the passed fdpath should be safe. Do not access this path through
16d03e
+// the original path strings, and do not attempt to use the pathname outside of
16d03e
+// the passed closure (the file handle will be freed once the closure returns).
16d03e
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
16d03e
+	// Remove the root then forcefully resolve inside the root.
16d03e
+	unsafePath = stripRoot(root, unsafePath)
16d03e
+	path, err := securejoin.SecureJoin(root, unsafePath)
16d03e
+	if err != nil {
16d03e
+		return fmt.Errorf("resolving path inside rootfs failed: %v", err)
16d03e
+	}
16d03e
+
16d03e
+	// Open the target path.
16d03e
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
16d03e
+	if err != nil {
16d03e
+		return fmt.Errorf("open o_path procfd: %w", err)
16d03e
+	}
16d03e
+	defer fh.Close()
16d03e
+
16d03e
+	// Double-check the path is the one we expected.
16d03e
+	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
16d03e
+	if realpath, err := os.Readlink(procfd); err != nil {
16d03e
+		return fmt.Errorf("procfd verification failed: %w", err)
16d03e
+	} else if realpath != path {
16d03e
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
16d03e
+	}
16d03e
+
16d03e
+	// Run the closure.
16d03e
+	return fn(procfd)
16d03e
+}
16d03e
+
16d03e
 // SearchLabels searches a list of key-value pairs for the provided key and
16d03e
 // returns the corresponding value. The pairs must be separated with '='.
16d03e
 func SearchLabels(labels []string, query string) string {
16d03e
diff --git a/libcontainer/utils/utils_test.go b/libcontainer/utils/utils_test.go
16d03e
index 7f38ed169a6b..d33662238d36 100644
16d03e
--- a/libcontainer/utils/utils_test.go
16d03e
+++ b/libcontainer/utils/utils_test.go
16d03e
@@ -143,3 +143,38 @@ func TestCleanPath(t *testing.T) {
16d03e
 		t.Errorf("expected to receive '/foo' and received %s", path)
16d03e
 	}
16d03e
 }
16d03e
+
16d03e
+func TestStripRoot(t *testing.T) {
16d03e
+	for _, test := range []struct {
16d03e
+		root, path, out string
16d03e
+	}{
16d03e
+		// Works with multiple components.
16d03e
+		{"/a/b", "/a/b/c", "/c"},
16d03e
+		{"/hello/world", "/hello/world/the/quick-brown/fox", "/the/quick-brown/fox"},
16d03e
+		// '/' must be a no-op.
16d03e
+		{"/", "/a/b/c", "/a/b/c"},
16d03e
+		// Must be the correct order.
16d03e
+		{"/a/b", "/a/c/b", "/a/c/b"},
16d03e
+		// Must be at start.
16d03e
+		{"/abc/def", "/foo/abc/def/bar", "/foo/abc/def/bar"},
16d03e
+		// Must be a lexical parent.
16d03e
+		{"/foo/bar", "/foo/barSAMECOMPONENT", "/foo/barSAMECOMPONENT"},
16d03e
+		// Must only strip the root once.
16d03e
+		{"/foo/bar", "/foo/bar/foo/bar/baz", "/foo/bar/baz"},
16d03e
+		// Deal with .. in a fairly sane way.
16d03e
+		{"/foo/bar", "/foo/bar/../baz", "/foo/baz"},
16d03e
+		{"/foo/bar", "../../../../../../foo/bar/baz", "/baz"},
16d03e
+		{"/foo/bar", "/../../../../../../foo/bar/baz", "/baz"},
16d03e
+		{"/foo/bar/../baz", "/foo/baz/bar", "/bar"},
16d03e
+		{"/foo/bar/../baz", "/foo/baz/../bar/../baz/./foo", "/foo"},
16d03e
+		// All paths are made absolute before stripping.
16d03e
+		{"foo/bar", "/foo/bar/baz/bee", "/baz/bee"},
16d03e
+		{"/foo/bar", "foo/bar/baz/beef", "/baz/beef"},
16d03e
+		{"foo/bar", "foo/bar/baz/beets", "/baz/beets"},
16d03e
+	} {
16d03e
+		got := stripRoot(test.root, test.path)
16d03e
+		if got != test.out {
16d03e
+			t.Errorf("stripRoot(%q, %q) -- got %q, expected %q", test.root, test.path, got, test.out)
16d03e
+		}
16d03e
+	}
16d03e
+}
16d03e
-- 
16d03e
2.31.1
16d03e