76e8ff
From 2dd156b190c02476191fc2522f9b0e0a1a098608 Mon Sep 17 00:00:00 2001
76e8ff
From: Kir Kolyshkin <kolyshkin@gmail.com>
76e8ff
Date: Mon, 17 May 2021 16:11:35 -0700
76e8ff
Subject: [PATCH] rootfs: add mount destination validation
76e8ff
76e8ff
This is a manual backport of the fix for CVE-2021-30465 to runc-1.0.0-rc10
76e8ff
(aka -rc90), upstream commit 84c14b43fa703db7 by Aleksa Sarai.
76e8ff
76e8ff
Original description follows.
76e8ff
76e8ff
----
76e8ff
76e8ff
Because the target of a mount is inside a container (which may be a
76e8ff
volume that is shared with another container), there exists a race
76e8ff
condition where the target of the mount may change to a path containing
76e8ff
a symlink after we have sanitised the path -- resulting in us
76e8ff
inadvertently mounting the path outside of the container.
76e8ff
76e8ff
This is not immediately useful because we are in a mount namespace with
76e8ff
MS_SLAVE mount propagation applied to "/", so we cannot mount on top of
76e8ff
host paths in the host namespace. However, if any subsequent mountpoints
76e8ff
in the configuration use a subdirectory of that host path as a source,
76e8ff
those subsequent mounts will use an attacker-controlled source path
76e8ff
(resolved within the host rootfs) -- allowing the bind-mounting of "/"
76e8ff
into the container.
76e8ff
76e8ff
While arguably configuration issues like this are not entirely within
76e8ff
runc's threat model, within the context of Kubernetes (and possibly
76e8ff
other container managers that provide semi-arbitrary container creation
76e8ff
privileges to untrusted users) this is a legitimate issue. Since we
76e8ff
cannot block mounting from the host into the container, we need to block
76e8ff
the first stage of this attack (mounting onto a path outside the
76e8ff
container).
76e8ff
76e8ff
The long-term plan to solve this would be to migrate to libpathrs, but
76e8ff
as a stop-gap we implement libpathrs-like path verification through
76e8ff
readlink(/proc/self/fd/$n) and then do mount operations through the
76e8ff
procfd once it's been verified to be inside the container. The target
76e8ff
could move after we've checked it, but if it is inside the container
76e8ff
then we can assume that it is safe for the same reason that libpathrs
76e8ff
operations would be safe.
76e8ff
76e8ff
A slight wrinkle is the "copyup" functionality we provide for tmpfs,
76e8ff
which is the only case where we want to do a mount on the host
76e8ff
filesystem. To facilitate this, I split out the copy-up functionality
76e8ff
entirely so that the logic isn't interspersed with the regular tmpfs
76e8ff
logic. In addition, all dependencies on m.Destination being overwritten
76e8ff
have been removed since that pattern was just begging to be a source of
76e8ff
more mount-target bugs (we do still have to modify m.Destination for
76e8ff
tmpfs-copyup but we only do it temporarily).
76e8ff
76e8ff
Fixes: CVE-2021-30465
76e8ff
Reported-by: Etienne Champetier <champetier.etienne@gmail.com>
76e8ff
Co-authored-by: Noah Meyerhans <nmeyerha@amazon.com>
76e8ff
Reviewed-by: Samuel Karp <skarp@amazon.com>
76e8ff
Reviewed-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
76e8ff
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
76e8ff
76e8ff
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
76e8ff
---
76e8ff
 libcontainer/rootfs_linux.go     | 225 ++++++++++++++++---------------
76e8ff
 libcontainer/utils/utils.go      |  54 ++++++++
76e8ff
 libcontainer/utils/utils_test.go |  35 +++++
76e8ff
 3 files changed, 204 insertions(+), 110 deletions(-)
76e8ff
76e8ff
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
76e8ff
index 106c4c2b..fe9afe48 100644
76e8ff
--- a/libcontainer/rootfs_linux.go
76e8ff
+++ b/libcontainer/rootfs_linux.go
76e8ff
@@ -19,8 +19,9 @@ import (
76e8ff
 	"github.com/opencontainers/runc/libcontainer/configs"
76e8ff
 	"github.com/opencontainers/runc/libcontainer/mount"
76e8ff
 	"github.com/opencontainers/runc/libcontainer/system"
76e8ff
-	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
76e8ff
+	"github.com/opencontainers/runc/libcontainer/utils"
76e8ff
 	"github.com/opencontainers/selinux/go-selinux/label"
76e8ff
+	"github.com/sirupsen/logrus"
76e8ff
 
76e8ff
 	"golang.org/x/sys/unix"
76e8ff
 )
76e8ff
@@ -30,7 +31,7 @@ const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
76e8ff
 // needsSetupDev returns true if /dev needs to be set up.
76e8ff
 func needsSetupDev(config *configs.Config) bool {
76e8ff
 	for _, m := range config.Mounts {
76e8ff
-		if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
76e8ff
+		if m.Device == "bind" && utils.CleanPath(m.Destination) == "/dev" {
76e8ff
 			return false
76e8ff
 		}
76e8ff
 	}
76e8ff
@@ -131,7 +132,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
76e8ff
 func finalizeRootfs(config *configs.Config) (err error) {
76e8ff
 	// remount dev as ro if specified
76e8ff
 	for _, m := range config.Mounts {
76e8ff
-		if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
76e8ff
+		if utils.CleanPath(m.Destination) == "/dev" {
76e8ff
 			if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY {
76e8ff
 				if err := remountReadonly(m); err != nil {
76e8ff
 					return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
76e8ff
@@ -200,8 +201,6 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
76e8ff
 	if err := checkProcMount(rootfs, dest, m.Source); err != nil {
76e8ff
 		return err
76e8ff
 	}
76e8ff
-	// update the mount with the correct dest after symlinks are resolved.
76e8ff
-	m.Destination = dest
76e8ff
 	if err := createIfNotExists(dest, stat.IsDir()); err != nil {
76e8ff
 		return err
76e8ff
 	}
76e8ff
@@ -238,18 +237,21 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
76e8ff
 			if err := os.MkdirAll(subsystemPath, 0755); err != nil {
76e8ff
 				return err
76e8ff
 			}
76e8ff
-			flags := defaultMountFlags
76e8ff
-			if m.Flags&unix.MS_RDONLY != 0 {
76e8ff
-				flags = flags | unix.MS_RDONLY
76e8ff
-			}
76e8ff
-			cgroupmount := &configs.Mount{
76e8ff
-				Source:      "cgroup",
76e8ff
-				Device:      "cgroup",
76e8ff
-				Destination: subsystemPath,
76e8ff
-				Flags:       flags,
76e8ff
-				Data:        filepath.Base(subsystemPath),
76e8ff
-			}
76e8ff
-			if err := mountNewCgroup(cgroupmount); err != nil {
76e8ff
+			if err := utils.WithProcfd(rootfs, b.Destination, func(procfd string) error {
76e8ff
+				flags := defaultMountFlags
76e8ff
+				if m.Flags&unix.MS_RDONLY != 0 {
76e8ff
+					flags = flags | unix.MS_RDONLY
76e8ff
+				}
76e8ff
+				var (
76e8ff
+					source = "cgroup"
76e8ff
+					data   = filepath.Base(subsystemPath)
76e8ff
+				)
76e8ff
+				if data == "systemd" {
76e8ff
+					data = cgroups.CgroupNamePrefix + data
76e8ff
+					source = "systemd"
76e8ff
+				}
76e8ff
+				return unix.Mount(source, procfd, "cgroup", uintptr(flags), data)
76e8ff
+			}); err != nil {
76e8ff
 				return err
76e8ff
 			}
76e8ff
 		} else {
76e8ff
@@ -279,22 +281,67 @@ func mountCgroupV2(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
76e8ff
 	if err := os.MkdirAll(cgroupPath, 0755); err != nil {
76e8ff
 		return err
76e8ff
 	}
76e8ff
-	if err := unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
76e8ff
-		// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
76e8ff
-		if err == unix.EPERM || err == unix.EBUSY {
76e8ff
-			return unix.Mount("/sys/fs/cgroup", cgroupPath, "", uintptr(m.Flags)|unix.MS_BIND, "")
76e8ff
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
76e8ff
+		if err := unix.Mount(m.Source, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
76e8ff
+			// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
76e8ff
+			if err == unix.EPERM || err == unix.EBUSY {
76e8ff
+				return unix.Mount("/sys/fs/cgroup", procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
76e8ff
+			}
76e8ff
+			return err
76e8ff
 		}
76e8ff
+		return nil
76e8ff
+	})
76e8ff
+}
76e8ff
+
76e8ff
+func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
76e8ff
+	// Set up a scratch dir for the tmpfs on the host.
76e8ff
+	tmpdir, err := prepareTmp("/tmp")
76e8ff
+	if err != nil {
76e8ff
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
76e8ff
+	}
76e8ff
+	defer cleanupTmp(tmpdir)
76e8ff
+	tmpDir, err := ioutil.TempDir(tmpdir, "runctmpdir")
76e8ff
+	if err != nil {
76e8ff
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
76e8ff
+	}
76e8ff
+	defer os.RemoveAll(tmpDir)
76e8ff
+
76e8ff
+	// Configure the *host* tmpdir as if it's the container mount. We change
76e8ff
+	// m.Destination since we are going to mount *on the host*.
76e8ff
+	oldDest := m.Destination
76e8ff
+	m.Destination = tmpDir
76e8ff
+	err = mountPropagate(m, "/", mountLabel)
76e8ff
+	m.Destination = oldDest
76e8ff
+	if err != nil {
76e8ff
 		return err
76e8ff
 	}
76e8ff
-	return nil
76e8ff
+	defer func() {
76e8ff
+		if Err != nil {
76e8ff
+			if err := unix.Unmount(tmpDir, unix.MNT_DETACH); err != nil {
76e8ff
+				logrus.Warnf("tmpcopyup: failed to unmount tmpdir on error: %v", err)
76e8ff
+			}
76e8ff
+		}
76e8ff
+	}()
76e8ff
+
76e8ff
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) (Err error) {
76e8ff
+		// Copy the container data to the host tmpdir. We append "/" to force
76e8ff
+		// CopyDirectory to resolve the symlink rather than trying to copy the
76e8ff
+		// symlink itself.
76e8ff
+		if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil {
76e8ff
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %v", m.Destination, procfd, tmpDir, err)
76e8ff
+		}
76e8ff
+		// Now move the mount into the container.
76e8ff
+		if err := unix.Mount(tmpDir, procfd, "", unix.MS_MOVE, ""); err != nil {
76e8ff
+			return fmt.Errorf("tmpcopyup: failed to move mount %s to %s (%s): %v", tmpDir, procfd, m.Destination, err)
76e8ff
+		}
76e8ff
+		return nil
76e8ff
+	})
76e8ff
 }
76e8ff
 
76e8ff
 func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
76e8ff
-	var (
76e8ff
-		dest = m.Destination
76e8ff
-	)
76e8ff
-	if !strings.HasPrefix(dest, rootfs) {
76e8ff
-		dest = filepath.Join(rootfs, dest)
76e8ff
+	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
76e8ff
+	if err != nil {
76e8ff
+		return err
76e8ff
 	}
76e8ff
 
76e8ff
 	switch m.Device {
76e8ff
@@ -329,46 +376,21 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
76e8ff
 		}
76e8ff
 		return nil
76e8ff
 	case "tmpfs":
76e8ff
-		copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
76e8ff
-		tmpDir := ""
76e8ff
 		stat, err := os.Stat(dest)
76e8ff
 		if err != nil {
76e8ff
 			if err := os.MkdirAll(dest, 0755); err != nil {
76e8ff
 				return err
76e8ff
 			}
76e8ff
 		}
76e8ff
-		if copyUp {
76e8ff
-			tmpdir, err := prepareTmp("/tmp")
76e8ff
-			if err != nil {
76e8ff
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
76e8ff
-			}
76e8ff
-			defer cleanupTmp(tmpdir)
76e8ff
-			tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
76e8ff
-			if err != nil {
76e8ff
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
76e8ff
-			}
76e8ff
-			defer os.RemoveAll(tmpDir)
76e8ff
-			m.Destination = tmpDir
76e8ff
+
76e8ff
+		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
76e8ff
+			err = doTmpfsCopyUp(m, rootfs, mountLabel)
76e8ff
+		} else {
76e8ff
+			err = mountPropagate(m, rootfs, mountLabel)
76e8ff
 		}
76e8ff
-		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
76e8ff
+		if err != nil {
76e8ff
 			return err
76e8ff
 		}
76e8ff
-		if copyUp {
76e8ff
-			if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
76e8ff
-				errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
76e8ff
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
76e8ff
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
76e8ff
-				}
76e8ff
-				return errMsg
76e8ff
-			}
76e8ff
-			if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil {
76e8ff
-				errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
76e8ff
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
76e8ff
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
76e8ff
-				}
76e8ff
-				return errMsg
76e8ff
-			}
76e8ff
-		}
76e8ff
 		if stat != nil {
76e8ff
 			if err = os.Chmod(dest, stat.Mode()); err != nil {
76e8ff
 				return err
76e8ff
@@ -424,19 +446,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
76e8ff
 			}
76e8ff
 		}
76e8ff
 	default:
76e8ff
-		// ensure that the destination of the mount is resolved of symlinks at mount time because
76e8ff
-		// any previous mounts can invalidate the next mount's destination.
76e8ff
-		// this can happen when a user specifies mounts within other mounts to cause breakouts or other
76e8ff
-		// evil stuff to try to escape the container's rootfs.
76e8ff
-		var err error
76e8ff
-		if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
76e8ff
-			return err
76e8ff
-		}
76e8ff
 		if err := checkProcMount(rootfs, dest, m.Source); err != nil {
76e8ff
 			return err
76e8ff
 		}
76e8ff
-		// update the mount with the correct dest after symlinks are resolved.
76e8ff
-		m.Destination = dest
76e8ff
 		if err := os.MkdirAll(dest, 0755); err != nil {
76e8ff
 			return err
76e8ff
 		}
76e8ff
@@ -611,7 +623,7 @@ func createDevices(config *configs.Config) error {
76e8ff
 	return nil
76e8ff
 }
76e8ff
 
76e8ff
-func bindMountDeviceNode(dest string, node *configs.Device) error {
76e8ff
+func bindMountDeviceNode(rootfs, dest string, node *configs.Device) error {
76e8ff
 	f, err := os.Create(dest)
76e8ff
 	if err != nil && !os.IsExist(err) {
76e8ff
 		return err
76e8ff
@@ -619,24 +631,29 @@ func bindMountDeviceNode(dest string, node *configs.Device) error {
76e8ff
 	if f != nil {
76e8ff
 		f.Close()
76e8ff
 	}
76e8ff
-	return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
76e8ff
+	return utils.WithProcfd(rootfs, dest, func(procfd string) error {
76e8ff
+		return unix.Mount(node.Path, procfd, "bind", unix.MS_BIND, "")
76e8ff
+	})
76e8ff
 }
76e8ff
 
76e8ff
 // Creates the device node in the rootfs of the container.
76e8ff
 func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
76e8ff
-	dest := filepath.Join(rootfs, node.Path)
76e8ff
+	dest, err := securejoin.SecureJoin(rootfs, node.Path)
76e8ff
+	if err != nil {
76e8ff
+		return err
76e8ff
+	}
76e8ff
 	if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
76e8ff
 		return err
76e8ff
 	}
76e8ff
 
76e8ff
 	if bind {
76e8ff
-		return bindMountDeviceNode(dest, node)
76e8ff
+		return bindMountDeviceNode(rootfs, dest, node)
76e8ff
 	}
76e8ff
 	if err := mknodDevice(dest, node); err != nil {
76e8ff
 		if os.IsExist(err) {
76e8ff
 			return nil
76e8ff
 		} else if os.IsPermission(err) {
76e8ff
-			return bindMountDeviceNode(dest, node)
76e8ff
+			return bindMountDeviceNode(rootfs, dest, node)
76e8ff
 		}
76e8ff
 		return err
76e8ff
 	}
76e8ff
@@ -955,55 +972,43 @@ func writeSystemProperty(key, value string) error {
76e8ff
 }
76e8ff
 
76e8ff
 func remount(m *configs.Mount, rootfs string) error {
76e8ff
-	var (
76e8ff
-		dest = m.Destination
76e8ff
-	)
76e8ff
-	if !strings.HasPrefix(dest, rootfs) {
76e8ff
-		dest = filepath.Join(rootfs, dest)
76e8ff
-	}
76e8ff
-	return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
76e8ff
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
76e8ff
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
76e8ff
+	})
76e8ff
 }
76e8ff
 
76e8ff
 // Do the mount operation followed by additional mounts required to take care
76e8ff
-// of propagation flags.
76e8ff
+// of propagation flags. The mount is always scoped inside the passed rootfs.
76e8ff
 func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
76e8ff
 	var (
76e8ff
-		dest  = m.Destination
76e8ff
 		data  = label.FormatMountLabel(m.Data, mountLabel)
76e8ff
 		flags = m.Flags
76e8ff
 	)
76e8ff
-	if libcontainerUtils.CleanPath(dest) == "/dev" {
76e8ff
+	if utils.CleanPath(m.Destination) == "/dev" {
76e8ff
 		flags &= ^unix.MS_RDONLY
76e8ff
 	}
76e8ff
 
76e8ff
-	copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
76e8ff
-	if !(copyUp || strings.HasPrefix(dest, rootfs)) {
76e8ff
-		dest = filepath.Join(rootfs, dest)
76e8ff
-	}
76e8ff
-
76e8ff
-	if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
76e8ff
-		return err
76e8ff
-	}
76e8ff
-
76e8ff
-	for _, pflag := range m.PropagationFlags {
76e8ff
-		if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil {
76e8ff
-			return err
76e8ff
+	// Because the destination is inside a container path which might be
76e8ff
+	// mutating underneath us, we verify that we are actually going to mount
76e8ff
+	// inside the container with WithProcfd() -- mounting through a procfd
76e8ff
+	// mounts on the target.
76e8ff
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
76e8ff
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data)
76e8ff
+	}); err != nil {
76e8ff
+		return fmt.Errorf("mount through procfd: %v", err)
76e8ff
+	}
76e8ff
+	// We have to apply mount propagation flags in a separate WithProcfd() call
76e8ff
+	// because the previous call invalidates the passed procfd -- the mount
76e8ff
+	// target needs to be re-opened.
76e8ff
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
76e8ff
+		for _, pflag := range m.PropagationFlags {
76e8ff
+			if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
76e8ff
+				return err
76e8ff
+			}
76e8ff
 		}
76e8ff
-	}
76e8ff
-	return nil
76e8ff
-}
76e8ff
-
76e8ff
-func mountNewCgroup(m *configs.Mount) error {
76e8ff
-	var (
76e8ff
-		data   = m.Data
76e8ff
-		source = m.Source
76e8ff
-	)
76e8ff
-	if data == "systemd" {
76e8ff
-		data = cgroups.CgroupNamePrefix + data
76e8ff
-		source = "systemd"
76e8ff
-	}
76e8ff
-	if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil {
76e8ff
-		return err
76e8ff
+		return nil
76e8ff
+	}); err != nil {
76e8ff
+		return fmt.Errorf("change mount propagation through procfd: %v", err)
76e8ff
 	}
76e8ff
 	return nil
76e8ff
 }
76e8ff
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
76e8ff
index 40ccfaa1..c1418ef9 100644
76e8ff
--- a/libcontainer/utils/utils.go
76e8ff
+++ b/libcontainer/utils/utils.go
76e8ff
@@ -2,12 +2,15 @@ package utils
76e8ff
 
76e8ff
 import (
76e8ff
 	"encoding/json"
76e8ff
+	"fmt"
76e8ff
 	"io"
76e8ff
 	"os"
76e8ff
 	"path/filepath"
76e8ff
+	"strconv"
76e8ff
 	"strings"
76e8ff
 	"unsafe"
76e8ff
 
76e8ff
+	securejoin "github.com/cyphar/filepath-securejoin"
76e8ff
 	"golang.org/x/sys/unix"
76e8ff
 )
76e8ff
 
76e8ff
@@ -73,6 +76,57 @@ func CleanPath(path string) string {
76e8ff
 	return filepath.Clean(path)
76e8ff
 }
76e8ff
 
76e8ff
+// stripRoot returns the passed path, stripping the root path if it was
76e8ff
+// (lexically) inside it. Note that both passed paths will always be treated
76e8ff
+// as absolute, and the returned path will also always be absolute. In
76e8ff
+// addition, the paths are cleaned before stripping the root.
76e8ff
+func stripRoot(root, path string) string {
76e8ff
+	// Make the paths clean and absolute.
76e8ff
+	root, path = CleanPath("/"+root), CleanPath("/"+path)
76e8ff
+	switch {
76e8ff
+	case path == root:
76e8ff
+		path = "/"
76e8ff
+	case root == "/":
76e8ff
+		// do nothing
76e8ff
+	case strings.HasPrefix(path, root+"/"):
76e8ff
+		path = strings.TrimPrefix(path, root+"/")
76e8ff
+	}
76e8ff
+	return CleanPath("/" + path)
76e8ff
+}
76e8ff
+
76e8ff
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
76e8ff
+// corresponding to the unsafePath resolved within the root. Before passing the
76e8ff
+// fd, this path is verified to have been inside the root -- so operating on it
76e8ff
+// through the passed fdpath should be safe. Do not access this path through
76e8ff
+// the original path strings, and do not attempt to use the pathname outside of
76e8ff
+// the passed closure (the file handle will be freed once the closure returns).
76e8ff
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
76e8ff
+	// Remove the root then forcefully resolve inside the root.
76e8ff
+	unsafePath = stripRoot(root, unsafePath)
76e8ff
+	path, err := securejoin.SecureJoin(root, unsafePath)
76e8ff
+	if err != nil {
76e8ff
+		return fmt.Errorf("resolving path inside rootfs failed: %v", err)
76e8ff
+	}
76e8ff
+
76e8ff
+	// Open the target path.
76e8ff
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
76e8ff
+	if err != nil {
76e8ff
+		return fmt.Errorf("open o_path procfd: %v", err)
76e8ff
+	}
76e8ff
+	defer fh.Close()
76e8ff
+
76e8ff
+	// Double-check the path is the one we expected.
76e8ff
+	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
76e8ff
+	if realpath, err := os.Readlink(procfd); err != nil {
76e8ff
+		return fmt.Errorf("procfd verification failed: %v", err)
76e8ff
+	} else if realpath != path {
76e8ff
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
76e8ff
+	}
76e8ff
+
76e8ff
+	// Run the closure.
76e8ff
+	return fn(procfd)
76e8ff
+}
76e8ff
+
76e8ff
 // SearchLabels searches a list of key-value pairs for the provided key and
76e8ff
 // returns the corresponding value. The pairs must be separated with '='.
76e8ff
 func SearchLabels(labels []string, query string) string {
76e8ff
diff --git a/libcontainer/utils/utils_test.go b/libcontainer/utils/utils_test.go
76e8ff
index 395eedcf..5b80cac6 100644
76e8ff
--- a/libcontainer/utils/utils_test.go
76e8ff
+++ b/libcontainer/utils/utils_test.go
76e8ff
@@ -140,3 +140,38 @@ func TestCleanPath(t *testing.T) {
76e8ff
 		t.Errorf("expected to receive '/foo' and received %s", path)
76e8ff
 	}
76e8ff
 }
76e8ff
+
76e8ff
+func TestStripRoot(t *testing.T) {
76e8ff
+	for _, test := range []struct {
76e8ff
+		root, path, out string
76e8ff
+	}{
76e8ff
+		// Works with multiple components.
76e8ff
+		{"/a/b", "/a/b/c", "/c"},
76e8ff
+		{"/hello/world", "/hello/world/the/quick-brown/fox", "/the/quick-brown/fox"},
76e8ff
+		// '/' must be a no-op.
76e8ff
+		{"/", "/a/b/c", "/a/b/c"},
76e8ff
+		// Must be the correct order.
76e8ff
+		{"/a/b", "/a/c/b", "/a/c/b"},
76e8ff
+		// Must be at start.
76e8ff
+		{"/abc/def", "/foo/abc/def/bar", "/foo/abc/def/bar"},
76e8ff
+		// Must be a lexical parent.
76e8ff
+		{"/foo/bar", "/foo/barSAMECOMPONENT", "/foo/barSAMECOMPONENT"},
76e8ff
+		// Must only strip the root once.
76e8ff
+		{"/foo/bar", "/foo/bar/foo/bar/baz", "/foo/bar/baz"},
76e8ff
+		// Deal with .. in a fairly sane way.
76e8ff
+		{"/foo/bar", "/foo/bar/../baz", "/foo/baz"},
76e8ff
+		{"/foo/bar", "../../../../../../foo/bar/baz", "/baz"},
76e8ff
+		{"/foo/bar", "/../../../../../../foo/bar/baz", "/baz"},
76e8ff
+		{"/foo/bar/../baz", "/foo/baz/bar", "/bar"},
76e8ff
+		{"/foo/bar/../baz", "/foo/baz/../bar/../baz/./foo", "/foo"},
76e8ff
+		// All paths are made absolute before stripping.
76e8ff
+		{"foo/bar", "/foo/bar/baz/bee", "/baz/bee"},
76e8ff
+		{"/foo/bar", "foo/bar/baz/beef", "/baz/beef"},
76e8ff
+		{"foo/bar", "foo/bar/baz/beets", "/baz/beets"},
76e8ff
+	} {
76e8ff
+		got := stripRoot(test.root, test.path)
76e8ff
+		if got != test.out {
76e8ff
+			t.Errorf("stripRoot(%q, %q) -- got %q, expected %q", test.root, test.path, got, test.out)
76e8ff
+		}
76e8ff
+	}
76e8ff
+}
76e8ff
-- 
76e8ff
2.31.1
76e8ff