d036ea
From 2dd156b190c02476191fc2522f9b0e0a1a098608 Mon Sep 17 00:00:00 2001
d036ea
From: Kir Kolyshkin <kolyshkin@gmail.com>
d036ea
Date: Mon, 17 May 2021 16:11:35 -0700
d036ea
Subject: [PATCH] rootfs: add mount destination validation
d036ea
d036ea
This is a manual backport of the fix for CVE-2021-30465 to runc-1.0.0-rc10
d036ea
(aka -rc90), upstream commit 84c14b43fa703db7 by Aleksa Sarai.
d036ea
d036ea
Original description follows.
d036ea
d036ea
----
d036ea
d036ea
Because the target of a mount is inside a container (which may be a
d036ea
volume that is shared with another container), there exists a race
d036ea
condition where the target of the mount may change to a path containing
d036ea
a symlink after we have sanitised the path -- resulting in us
d036ea
inadvertently mounting the path outside of the container.
d036ea
d036ea
This is not immediately useful because we are in a mount namespace with
d036ea
MS_SLAVE mount propagation applied to "/", so we cannot mount on top of
d036ea
host paths in the host namespace. However, if any subsequent mountpoints
d036ea
in the configuration use a subdirectory of that host path as a source,
d036ea
those subsequent mounts will use an attacker-controlled source path
d036ea
(resolved within the host rootfs) -- allowing the bind-mounting of "/"
d036ea
into the container.
d036ea
d036ea
While arguably configuration issues like this are not entirely within
d036ea
runc's threat model, within the context of Kubernetes (and possibly
d036ea
other container managers that provide semi-arbitrary container creation
d036ea
privileges to untrusted users) this is a legitimate issue. Since we
d036ea
cannot block mounting from the host into the container, we need to block
d036ea
the first stage of this attack (mounting onto a path outside the
d036ea
container).
d036ea
d036ea
The long-term plan to solve this would be to migrate to libpathrs, but
d036ea
as a stop-gap we implement libpathrs-like path verification through
d036ea
readlink(/proc/self/fd/$n) and then do mount operations through the
d036ea
procfd once it's been verified to be inside the container. The target
d036ea
could move after we've checked it, but if it is inside the container
d036ea
then we can assume that it is safe for the same reason that libpathrs
d036ea
operations would be safe.
d036ea
d036ea
A slight wrinkle is the "copyup" functionality we provide for tmpfs,
d036ea
which is the only case where we want to do a mount on the host
d036ea
filesystem. To facilitate this, I split out the copy-up functionality
d036ea
entirely so that the logic isn't interspersed with the regular tmpfs
d036ea
logic. In addition, all dependencies on m.Destination being overwritten
d036ea
have been removed since that pattern was just begging to be a source of
d036ea
more mount-target bugs (we do still have to modify m.Destination for
d036ea
tmpfs-copyup but we only do it temporarily).
d036ea
d036ea
Fixes: CVE-2021-30465
d036ea
Reported-by: Etienne Champetier <champetier.etienne@gmail.com>
d036ea
Co-authored-by: Noah Meyerhans <nmeyerha@amazon.com>
d036ea
Reviewed-by: Samuel Karp <skarp@amazon.com>
d036ea
Reviewed-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
d036ea
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
d036ea
d036ea
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
d036ea
---
d036ea
 libcontainer/rootfs_linux.go     | 225 ++++++++++++++++---------------
d036ea
 libcontainer/utils/utils.go      |  54 ++++++++
d036ea
 libcontainer/utils/utils_test.go |  35 +++++
d036ea
 3 files changed, 204 insertions(+), 110 deletions(-)
d036ea
d036ea
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
d036ea
index 106c4c2b..fe9afe48 100644
d036ea
--- a/libcontainer/rootfs_linux.go
d036ea
+++ b/libcontainer/rootfs_linux.go
d036ea
@@ -19,8 +19,9 @@ import (
d036ea
 	"github.com/opencontainers/runc/libcontainer/configs"
d036ea
 	"github.com/opencontainers/runc/libcontainer/mount"
d036ea
 	"github.com/opencontainers/runc/libcontainer/system"
d036ea
-	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
d036ea
+	"github.com/opencontainers/runc/libcontainer/utils"
d036ea
 	"github.com/opencontainers/selinux/go-selinux/label"
d036ea
+	"github.com/sirupsen/logrus"
d036ea
 
d036ea
 	"golang.org/x/sys/unix"
d036ea
 )
d036ea
@@ -30,7 +31,7 @@ const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
d036ea
 // needsSetupDev returns true if /dev needs to be set up.
d036ea
 func needsSetupDev(config *configs.Config) bool {
d036ea
 	for _, m := range config.Mounts {
d036ea
-		if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
d036ea
+		if m.Device == "bind" && utils.CleanPath(m.Destination) == "/dev" {
d036ea
 			return false
d036ea
 		}
d036ea
 	}
d036ea
@@ -131,7 +132,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
d036ea
 func finalizeRootfs(config *configs.Config) (err error) {
d036ea
 	// remount dev as ro if specified
d036ea
 	for _, m := range config.Mounts {
d036ea
-		if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
d036ea
+		if utils.CleanPath(m.Destination) == "/dev" {
d036ea
 			if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY {
d036ea
 				if err := remountReadonly(m); err != nil {
d036ea
 					return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
d036ea
@@ -200,8 +201,6 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
d036ea
 	if err := checkProcMount(rootfs, dest, m.Source); err != nil {
d036ea
 		return err
d036ea
 	}
d036ea
-	// update the mount with the correct dest after symlinks are resolved.
d036ea
-	m.Destination = dest
d036ea
 	if err := createIfNotExists(dest, stat.IsDir()); err != nil {
d036ea
 		return err
d036ea
 	}
d036ea
@@ -238,18 +237,21 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
d036ea
 			if err := os.MkdirAll(subsystemPath, 0755); err != nil {
d036ea
 				return err
d036ea
 			}
d036ea
-			flags := defaultMountFlags
d036ea
-			if m.Flags&unix.MS_RDONLY != 0 {
d036ea
-				flags = flags | unix.MS_RDONLY
d036ea
-			}
d036ea
-			cgroupmount := &configs.Mount{
d036ea
-				Source:      "cgroup",
d036ea
-				Device:      "cgroup",
d036ea
-				Destination: subsystemPath,
d036ea
-				Flags:       flags,
d036ea
-				Data:        filepath.Base(subsystemPath),
d036ea
-			}
d036ea
-			if err := mountNewCgroup(cgroupmount); err != nil {
d036ea
+			if err := utils.WithProcfd(rootfs, b.Destination, func(procfd string) error {
d036ea
+				flags := defaultMountFlags
d036ea
+				if m.Flags&unix.MS_RDONLY != 0 {
d036ea
+					flags = flags | unix.MS_RDONLY
d036ea
+				}
d036ea
+				var (
d036ea
+					source = "cgroup"
d036ea
+					data   = filepath.Base(subsystemPath)
d036ea
+				)
d036ea
+				if data == "systemd" {
d036ea
+					data = cgroups.CgroupNamePrefix + data
d036ea
+					source = "systemd"
d036ea
+				}
d036ea
+				return unix.Mount(source, procfd, "cgroup", uintptr(flags), data)
d036ea
+			}); err != nil {
d036ea
 				return err
d036ea
 			}
d036ea
 		} else {
d036ea
@@ -279,22 +281,67 @@ func mountCgroupV2(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
d036ea
 	if err := os.MkdirAll(cgroupPath, 0755); err != nil {
d036ea
 		return err
d036ea
 	}
d036ea
-	if err := unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
d036ea
-		// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
d036ea
-		if err == unix.EPERM || err == unix.EBUSY {
d036ea
-			return unix.Mount("/sys/fs/cgroup", cgroupPath, "", uintptr(m.Flags)|unix.MS_BIND, "")
d036ea
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
d036ea
+		if err := unix.Mount(m.Source, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
d036ea
+			// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
d036ea
+			if err == unix.EPERM || err == unix.EBUSY {
d036ea
+				return unix.Mount("/sys/fs/cgroup", procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
d036ea
+			}
d036ea
+			return err
d036ea
 		}
d036ea
+		return nil
d036ea
+	})
d036ea
+}
d036ea
+
d036ea
+func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
d036ea
+	// Set up a scratch dir for the tmpfs on the host.
d036ea
+	tmpdir, err := prepareTmp("/tmp")
d036ea
+	if err != nil {
d036ea
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
d036ea
+	}
d036ea
+	defer cleanupTmp(tmpdir)
d036ea
+	tmpDir, err := ioutil.TempDir(tmpdir, "runctmpdir")
d036ea
+	if err != nil {
d036ea
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
d036ea
+	}
d036ea
+	defer os.RemoveAll(tmpDir)
d036ea
+
d036ea
+	// Configure the *host* tmpdir as if it's the container mount. We change
d036ea
+	// m.Destination since we are going to mount *on the host*.
d036ea
+	oldDest := m.Destination
d036ea
+	m.Destination = tmpDir
d036ea
+	err = mountPropagate(m, "/", mountLabel)
d036ea
+	m.Destination = oldDest
d036ea
+	if err != nil {
d036ea
 		return err
d036ea
 	}
d036ea
-	return nil
d036ea
+	defer func() {
d036ea
+		if Err != nil {
d036ea
+			if err := unix.Unmount(tmpDir, unix.MNT_DETACH); err != nil {
d036ea
+				logrus.Warnf("tmpcopyup: failed to unmount tmpdir on error: %v", err)
d036ea
+			}
d036ea
+		}
d036ea
+	}()
d036ea
+
d036ea
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) (Err error) {
d036ea
+		// Copy the container data to the host tmpdir. We append "/" to force
d036ea
+		// CopyDirectory to resolve the symlink rather than trying to copy the
d036ea
+		// symlink itself.
d036ea
+		if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil {
d036ea
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %v", m.Destination, procfd, tmpDir, err)
d036ea
+		}
d036ea
+		// Now move the mount into the container.
d036ea
+		if err := unix.Mount(tmpDir, procfd, "", unix.MS_MOVE, ""); err != nil {
d036ea
+			return fmt.Errorf("tmpcopyup: failed to move mount %s to %s (%s): %v", tmpDir, procfd, m.Destination, err)
d036ea
+		}
d036ea
+		return nil
d036ea
+	})
d036ea
 }
d036ea
 
d036ea
 func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
d036ea
-	var (
d036ea
-		dest = m.Destination
d036ea
-	)
d036ea
-	if !strings.HasPrefix(dest, rootfs) {
d036ea
-		dest = filepath.Join(rootfs, dest)
d036ea
+	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
d036ea
+	if err != nil {
d036ea
+		return err
d036ea
 	}
d036ea
 
d036ea
 	switch m.Device {
d036ea
@@ -329,46 +376,21 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
d036ea
 		}
d036ea
 		return nil
d036ea
 	case "tmpfs":
d036ea
-		copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
d036ea
-		tmpDir := ""
d036ea
 		stat, err := os.Stat(dest)
d036ea
 		if err != nil {
d036ea
 			if err := os.MkdirAll(dest, 0755); err != nil {
d036ea
 				return err
d036ea
 			}
d036ea
 		}
d036ea
-		if copyUp {
d036ea
-			tmpdir, err := prepareTmp("/tmp")
d036ea
-			if err != nil {
d036ea
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
d036ea
-			}
d036ea
-			defer cleanupTmp(tmpdir)
d036ea
-			tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
d036ea
-			if err != nil {
d036ea
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
d036ea
-			}
d036ea
-			defer os.RemoveAll(tmpDir)
d036ea
-			m.Destination = tmpDir
d036ea
+
d036ea
+		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
d036ea
+			err = doTmpfsCopyUp(m, rootfs, mountLabel)
d036ea
+		} else {
d036ea
+			err = mountPropagate(m, rootfs, mountLabel)
d036ea
 		}
d036ea
-		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
d036ea
+		if err != nil {
d036ea
 			return err
d036ea
 		}
d036ea
-		if copyUp {
d036ea
-			if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
d036ea
-				errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
d036ea
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
d036ea
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
d036ea
-				}
d036ea
-				return errMsg
d036ea
-			}
d036ea
-			if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil {
d036ea
-				errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
d036ea
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
d036ea
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
d036ea
-				}
d036ea
-				return errMsg
d036ea
-			}
d036ea
-		}
d036ea
 		if stat != nil {
d036ea
 			if err = os.Chmod(dest, stat.Mode()); err != nil {
d036ea
 				return err
d036ea
@@ -424,19 +446,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
d036ea
 			}
d036ea
 		}
d036ea
 	default:
d036ea
-		// ensure that the destination of the mount is resolved of symlinks at mount time because
d036ea
-		// any previous mounts can invalidate the next mount's destination.
d036ea
-		// this can happen when a user specifies mounts within other mounts to cause breakouts or other
d036ea
-		// evil stuff to try to escape the container's rootfs.
d036ea
-		var err error
d036ea
-		if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
d036ea
-			return err
d036ea
-		}
d036ea
 		if err := checkProcMount(rootfs, dest, m.Source); err != nil {
d036ea
 			return err
d036ea
 		}
d036ea
-		// update the mount with the correct dest after symlinks are resolved.
d036ea
-		m.Destination = dest
d036ea
 		if err := os.MkdirAll(dest, 0755); err != nil {
d036ea
 			return err
d036ea
 		}
d036ea
@@ -611,7 +623,7 @@ func createDevices(config *configs.Config) error {
d036ea
 	return nil
d036ea
 }
d036ea
 
d036ea
-func bindMountDeviceNode(dest string, node *configs.Device) error {
d036ea
+func bindMountDeviceNode(rootfs, dest string, node *configs.Device) error {
d036ea
 	f, err := os.Create(dest)
d036ea
 	if err != nil && !os.IsExist(err) {
d036ea
 		return err
d036ea
@@ -619,24 +631,29 @@ func bindMountDeviceNode(dest string, node *configs.Device) error {
d036ea
 	if f != nil {
d036ea
 		f.Close()
d036ea
 	}
d036ea
-	return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
d036ea
+	return utils.WithProcfd(rootfs, dest, func(procfd string) error {
d036ea
+		return unix.Mount(node.Path, procfd, "bind", unix.MS_BIND, "")
d036ea
+	})
d036ea
 }
d036ea
 
d036ea
 // Creates the device node in the rootfs of the container.
d036ea
 func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
d036ea
-	dest := filepath.Join(rootfs, node.Path)
d036ea
+	dest, err := securejoin.SecureJoin(rootfs, node.Path)
d036ea
+	if err != nil {
d036ea
+		return err
d036ea
+	}
d036ea
 	if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
d036ea
 		return err
d036ea
 	}
d036ea
 
d036ea
 	if bind {
d036ea
-		return bindMountDeviceNode(dest, node)
d036ea
+		return bindMountDeviceNode(rootfs, dest, node)
d036ea
 	}
d036ea
 	if err := mknodDevice(dest, node); err != nil {
d036ea
 		if os.IsExist(err) {
d036ea
 			return nil
d036ea
 		} else if os.IsPermission(err) {
d036ea
-			return bindMountDeviceNode(dest, node)
d036ea
+			return bindMountDeviceNode(rootfs, dest, node)
d036ea
 		}
d036ea
 		return err
d036ea
 	}
d036ea
@@ -955,55 +972,43 @@ func writeSystemProperty(key, value string) error {
d036ea
 }
d036ea
 
d036ea
 func remount(m *configs.Mount, rootfs string) error {
d036ea
-	var (
d036ea
-		dest = m.Destination
d036ea
-	)
d036ea
-	if !strings.HasPrefix(dest, rootfs) {
d036ea
-		dest = filepath.Join(rootfs, dest)
d036ea
-	}
d036ea
-	return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
d036ea
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
d036ea
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
d036ea
+	})
d036ea
 }
d036ea
 
d036ea
 // Do the mount operation followed by additional mounts required to take care
d036ea
-// of propagation flags.
d036ea
+// of propagation flags.  This will always be scoped inside the container rootfs.
d036ea
 func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
d036ea
 	var (
d036ea
-		dest  = m.Destination
d036ea
 		data  = label.FormatMountLabel(m.Data, mountLabel)
d036ea
 		flags = m.Flags
d036ea
 	)
d036ea
-	if libcontainerUtils.CleanPath(dest) == "/dev" {
d036ea
+	if utils.CleanPath(m.Destination) == "/dev" {
d036ea
 		flags &= ^unix.MS_RDONLY
d036ea
 	}
d036ea
 
d036ea
-	copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
d036ea
-	if !(copyUp || strings.HasPrefix(dest, rootfs)) {
d036ea
-		dest = filepath.Join(rootfs, dest)
d036ea
-	}
d036ea
-
d036ea
-	if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
d036ea
-		return err
d036ea
-	}
d036ea
-
d036ea
-	for _, pflag := range m.PropagationFlags {
d036ea
-		if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil {
d036ea
-			return err
d036ea
+	// Because the destination is inside a container path which might be
d036ea
+	// mutating underneath us, we verify that we are actually going to mount
d036ea
+	// inside the container with WithProcfd() -- mounting through a procfd
d036ea
+	// mounts on the target.
d036ea
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
d036ea
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data)
d036ea
+	}); err != nil {
d036ea
+		return fmt.Errorf("mount through procfd: %v", err)
d036ea
+	}
d036ea
+	// We have to apply mount propagation flags in a separate WithProcfd() call
d036ea
+	// because the previous call invalidates the passed procfd -- the mount
d036ea
+	// target needs to be re-opened.
d036ea
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
d036ea
+		for _, pflag := range m.PropagationFlags {
d036ea
+			if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
d036ea
+				return err
d036ea
+			}
d036ea
 		}
d036ea
-	}
d036ea
-	return nil
d036ea
-}
d036ea
-
d036ea
-func mountNewCgroup(m *configs.Mount) error {
d036ea
-	var (
d036ea
-		data   = m.Data
d036ea
-		source = m.Source
d036ea
-	)
d036ea
-	if data == "systemd" {
d036ea
-		data = cgroups.CgroupNamePrefix + data
d036ea
-		source = "systemd"
d036ea
-	}
d036ea
-	if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil {
d036ea
-		return err
d036ea
+		return nil
d036ea
+	}); err != nil {
d036ea
+		return fmt.Errorf("change mount propagation through procfd: %v", err)
d036ea
 	}
d036ea
 	return nil
d036ea
 }
d036ea
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
d036ea
index 40ccfaa1..c1418ef9 100644
d036ea
--- a/libcontainer/utils/utils.go
d036ea
+++ b/libcontainer/utils/utils.go
d036ea
@@ -2,12 +2,15 @@ package utils
d036ea
 
d036ea
 import (
d036ea
 	"encoding/json"
d036ea
+	"fmt"
d036ea
 	"io"
d036ea
 	"os"
d036ea
 	"path/filepath"
d036ea
+	"strconv"
d036ea
 	"strings"
d036ea
 	"unsafe"
d036ea
 
d036ea
+	securejoin "github.com/cyphar/filepath-securejoin"
d036ea
 	"golang.org/x/sys/unix"
d036ea
 )
d036ea
 
d036ea
@@ -73,6 +76,57 @@ func CleanPath(path string) string {
d036ea
 	return filepath.Clean(path)
d036ea
 }
d036ea
 
d036ea
+// stripRoot returns the passed path, stripping the root path if it was
d036ea
+// (lexically) inside it. Note that both passed paths will always be treated
d036ea
+// as absolute, and the returned path will also always be absolute. In
d036ea
+// addition, the paths are cleaned before stripping the root.
d036ea
+func stripRoot(root, path string) string {
d036ea
+	// Make the paths clean and absolute.
d036ea
+	root, path = CleanPath("/"+root), CleanPath("/"+path)
d036ea
+	switch {
d036ea
+	case path == root:
d036ea
+		path = "/"
d036ea
+	case root == "/":
d036ea
+		// do nothing
d036ea
+	case strings.HasPrefix(path, root+"/"):
d036ea
+		path = strings.TrimPrefix(path, root+"/")
d036ea
+	}
d036ea
+	return CleanPath("/" + path)
d036ea
+}
d036ea
+
d036ea
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
d036ea
+// corresponding to the unsafePath resolved within the root. Before passing the
d036ea
+// fd, this path is verified to have been inside the root -- so operating on it
d036ea
+// through the passed fdpath should be safe. Do not access this path through
d036ea
+// the original path strings, and do not attempt to use the pathname outside of
d036ea
+// the passed closure (the file handle will be freed once the closure returns).
d036ea
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
d036ea
+	// Remove the root then forcefully resolve inside the root.
d036ea
+	unsafePath = stripRoot(root, unsafePath)
d036ea
+	path, err := securejoin.SecureJoin(root, unsafePath)
d036ea
+	if err != nil {
d036ea
+		return fmt.Errorf("resolving path inside rootfs failed: %v", err)
d036ea
+	}
d036ea
+
d036ea
+	// Open the target path.
d036ea
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
d036ea
+	if err != nil {
d036ea
+		return fmt.Errorf("open o_path procfd: %v", err)
d036ea
+	}
d036ea
+	defer fh.Close()
d036ea
+
d036ea
+	// Double-check the path is the one we expected.
d036ea
+	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
d036ea
+	if realpath, err := os.Readlink(procfd); err != nil {
d036ea
+		return fmt.Errorf("procfd verification failed: %v", err)
d036ea
+	} else if realpath != path {
d036ea
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
d036ea
+	}
d036ea
+
d036ea
+	// Run the closure.
d036ea
+	return fn(procfd)
d036ea
+}
d036ea
+
d036ea
 // SearchLabels searches a list of key-value pairs for the provided key and
d036ea
 // returns the corresponding value. The pairs must be separated with '='.
d036ea
 func SearchLabels(labels []string, query string) string {
d036ea
diff --git a/libcontainer/utils/utils_test.go b/libcontainer/utils/utils_test.go
d036ea
index 395eedcf..5b80cac6 100644
d036ea
--- a/libcontainer/utils/utils_test.go
d036ea
+++ b/libcontainer/utils/utils_test.go
d036ea
@@ -140,3 +140,38 @@ func TestCleanPath(t *testing.T) {
d036ea
 		t.Errorf("expected to receive '/foo' and received %s", path)
d036ea
 	}
d036ea
 }
d036ea
+
d036ea
+func TestStripRoot(t *testing.T) {
d036ea
+	for _, test := range []struct {
d036ea
+		root, path, out string
d036ea
+	}{
d036ea
+		// Works with multiple components.
d036ea
+		{"/a/b", "/a/b/c", "/c"},
d036ea
+		{"/hello/world", "/hello/world/the/quick-brown/fox", "/the/quick-brown/fox"},
d036ea
+		// '/' must be a no-op.
d036ea
+		{"/", "/a/b/c", "/a/b/c"},
d036ea
+		// Must be the correct order.
d036ea
+		{"/a/b", "/a/c/b", "/a/c/b"},
d036ea
+		// Must be at start.
d036ea
+		{"/abc/def", "/foo/abc/def/bar", "/foo/abc/def/bar"},
d036ea
+		// Must be a lexical parent.
d036ea
+		{"/foo/bar", "/foo/barSAMECOMPONENT", "/foo/barSAMECOMPONENT"},
d036ea
+		// Must only strip the root once.
d036ea
+		{"/foo/bar", "/foo/bar/foo/bar/baz", "/foo/bar/baz"},
d036ea
+		// Deal with .. in a fairly sane way.
d036ea
+		{"/foo/bar", "/foo/bar/../baz", "/foo/baz"},
d036ea
+		{"/foo/bar", "../../../../../../foo/bar/baz", "/baz"},
d036ea
+		{"/foo/bar", "/../../../../../../foo/bar/baz", "/baz"},
d036ea
+		{"/foo/bar/../baz", "/foo/baz/bar", "/bar"},
d036ea
+		{"/foo/bar/../baz", "/foo/baz/../bar/../baz/./foo", "/foo"},
d036ea
+		// All paths are made absolute before stripping.
d036ea
+		{"foo/bar", "/foo/bar/baz/bee", "/baz/bee"},
d036ea
+		{"/foo/bar", "foo/bar/baz/beef", "/baz/beef"},
d036ea
+		{"foo/bar", "foo/bar/baz/beets", "/baz/beets"},
d036ea
+	} {
d036ea
+		got := stripRoot(test.root, test.path)
d036ea
+		if got != test.out {
d036ea
+			t.Errorf("stripRoot(%q, %q) -- got %q, expected %q", test.root, test.path, got, test.out)
d036ea
+		}
d036ea
+	}
d036ea
+}
d036ea
-- 
d036ea
2.31.1
d036ea