fd5e94
From 2dd156b190c02476191fc2522f9b0e0a1a098608 Mon Sep 17 00:00:00 2001
fd5e94
From: Kir Kolyshkin <kolyshkin@gmail.com>
fd5e94
Date: Mon, 17 May 2021 16:11:35 -0700
fd5e94
Subject: [PATCH] rootfs: add mount destination validation
fd5e94
fd5e94
This is a manual backport of fix for CVE-2021-30465 to runc-1.0.0-rc10
fd5e94
(aka -rc90), upstream commit 84c14b43fa703db7 by Aleksa Sarai.
fd5e94
fd5e94
Original description follows.
fd5e94
fd5e94
----
fd5e94
fd5e94
Because the target of a mount is inside a container (which may be a
fd5e94
volume that is shared with another container), there exists a race
fd5e94
condition where the target of the mount may change to a path containing
fd5e94
a symlink after we have sanitised the path -- resulting in us
fd5e94
inadvertently mounting the path outside of the container.
fd5e94
fd5e94
This is not immediately useful because we are in a mount namespace with
fd5e94
MS_SLAVE mount propagation applied to "/", so we cannot mount on top of
fd5e94
host paths in the host namespace. However, if any subsequent mountpoints
fd5e94
in the configuration use a subdirectory of that host path as a source,
fd5e94
those subsequent mounts will use an attacker-controlled source path
fd5e94
(resolved within the host rootfs) -- allowing the bind-mounting of "/"
fd5e94
into the container.
fd5e94
fd5e94
While arguably configuration issues like this are not entirely within
fd5e94
runc's threat model, within the context of Kubernetes (and possibly
fd5e94
other container managers that provide semi-arbitrary container creation
fd5e94
privileges to untrusted users) this is a legitimate issue. Since we
fd5e94
cannot block mounting from the host into the container, we need to block
fd5e94
the first stage of this attack (mounting onto a path outside the
fd5e94
container).
fd5e94
fd5e94
The long-term plan to solve this would be to migrate to libpathrs, but
fd5e94
as a stop-gap we implement libpathrs-like path verification through
fd5e94
readlink(/proc/self/fd/$n) and then do mount operations through the
fd5e94
procfd once it's been verified to be inside the container. The target
fd5e94
could move after we've checked it, but if it is inside the container
fd5e94
then we can assume that it is safe for the same reason that libpathrs
fd5e94
operations would be safe.
fd5e94
fd5e94
A slight wrinkle is the "copyup" functionality we provide for tmpfs,
fd5e94
which is the only case where we want to do a mount on the host
fd5e94
filesystem. To facilitate this, I split out the copy-up functionality
fd5e94
entirely so that the logic isn't interspersed with the regular tmpfs
fd5e94
logic. In addition, all dependencies on m.Destination being overwritten
fd5e94
have been removed since that pattern was just begging to be a source of
fd5e94
more mount-target bugs (we do still have to modify m.Destination for
fd5e94
tmpfs-copyup but we only do it temporarily).
fd5e94
fd5e94
Fixes: CVE-2021-30465
fd5e94
Reported-by: Etienne Champetier <champetier.etienne@gmail.com>
fd5e94
Co-authored-by: Noah Meyerhans <nmeyerha@amazon.com>
fd5e94
Reviewed-by: Samuel Karp <skarp@amazon.com>
fd5e94
Reviewed-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
fd5e94
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
fd5e94
fd5e94
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
fd5e94
---
fd5e94
 libcontainer/rootfs_linux.go     | 225 ++++++++++++++++---------------
fd5e94
 libcontainer/utils/utils.go      |  54 ++++++++
fd5e94
 libcontainer/utils/utils_test.go |  35 +++++
fd5e94
 3 files changed, 204 insertions(+), 110 deletions(-)
fd5e94
fd5e94
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
fd5e94
index 106c4c2b..fe9afe48 100644
fd5e94
--- a/libcontainer/rootfs_linux.go
fd5e94
+++ b/libcontainer/rootfs_linux.go
fd5e94
@@ -19,8 +19,9 @@ import (
fd5e94
 	"github.com/opencontainers/runc/libcontainer/configs"
fd5e94
 	"github.com/opencontainers/runc/libcontainer/mount"
fd5e94
 	"github.com/opencontainers/runc/libcontainer/system"
fd5e94
-	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
fd5e94
+	"github.com/opencontainers/runc/libcontainer/utils"
fd5e94
 	"github.com/opencontainers/selinux/go-selinux/label"
fd5e94
+	"github.com/sirupsen/logrus"
fd5e94
 
fd5e94
 	"golang.org/x/sys/unix"
fd5e94
 )
fd5e94
@@ -30,7 +31,7 @@ const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
fd5e94
 // needsSetupDev returns true if /dev needs to be set up.
fd5e94
 func needsSetupDev(config *configs.Config) bool {
fd5e94
 	for _, m := range config.Mounts {
fd5e94
-		if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
fd5e94
+		if m.Device == "bind" && utils.CleanPath(m.Destination) == "/dev" {
fd5e94
 			return false
fd5e94
 		}
fd5e94
 	}
fd5e94
@@ -131,7 +132,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
fd5e94
 func finalizeRootfs(config *configs.Config) (err error) {
fd5e94
 	// remount dev as ro if specified
fd5e94
 	for _, m := range config.Mounts {
fd5e94
-		if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
fd5e94
+		if utils.CleanPath(m.Destination) == "/dev" {
fd5e94
 			if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY {
fd5e94
 				if err := remountReadonly(m); err != nil {
fd5e94
 					return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
fd5e94
@@ -200,8 +201,6 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
fd5e94
 	if err := checkProcMount(rootfs, dest, m.Source); err != nil {
fd5e94
 		return err
fd5e94
 	}
fd5e94
-	// update the mount with the correct dest after symlinks are resolved.
fd5e94
-	m.Destination = dest
fd5e94
 	if err := createIfNotExists(dest, stat.IsDir()); err != nil {
fd5e94
 		return err
fd5e94
 	}
fd5e94
@@ -238,18 +237,21 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
fd5e94
 			if err := os.MkdirAll(subsystemPath, 0755); err != nil {
fd5e94
 				return err
fd5e94
 			}
fd5e94
-			flags := defaultMountFlags
fd5e94
-			if m.Flags&unix.MS_RDONLY != 0 {
fd5e94
-				flags = flags | unix.MS_RDONLY
fd5e94
-			}
fd5e94
-			cgroupmount := &configs.Mount{
fd5e94
-				Source:      "cgroup",
fd5e94
-				Device:      "cgroup",
fd5e94
-				Destination: subsystemPath,
fd5e94
-				Flags:       flags,
fd5e94
-				Data:        filepath.Base(subsystemPath),
fd5e94
-			}
fd5e94
-			if err := mountNewCgroup(cgroupmount); err != nil {
fd5e94
+			if err := utils.WithProcfd(rootfs, b.Destination, func(procfd string) error {
fd5e94
+				flags := defaultMountFlags
fd5e94
+				if m.Flags&unix.MS_RDONLY != 0 {
fd5e94
+					flags = flags | unix.MS_RDONLY
fd5e94
+				}
fd5e94
+				var (
fd5e94
+					source = "cgroup"
fd5e94
+					data   = filepath.Base(subsystemPath)
fd5e94
+				)
fd5e94
+				if data == "systemd" {
fd5e94
+					data = cgroups.CgroupNamePrefix + data
fd5e94
+					source = "systemd"
fd5e94
+				}
fd5e94
+				return unix.Mount(source, procfd, "cgroup", uintptr(flags), data)
fd5e94
+			}); err != nil {
fd5e94
 				return err
fd5e94
 			}
fd5e94
 		} else {
fd5e94
@@ -279,22 +281,67 @@ func mountCgroupV2(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
fd5e94
 	if err := os.MkdirAll(cgroupPath, 0755); err != nil {
fd5e94
 		return err
fd5e94
 	}
fd5e94
-	if err := unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
fd5e94
-		// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
fd5e94
-		if err == unix.EPERM || err == unix.EBUSY {
fd5e94
-			return unix.Mount("/sys/fs/cgroup", cgroupPath, "", uintptr(m.Flags)|unix.MS_BIND, "")
fd5e94
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
fd5e94
+		if err := unix.Mount(m.Source, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
fd5e94
+			// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
fd5e94
+			if err == unix.EPERM || err == unix.EBUSY {
fd5e94
+				return unix.Mount("/sys/fs/cgroup", procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
fd5e94
+			}
fd5e94
+			return err
fd5e94
 		}
fd5e94
+		return nil
fd5e94
+	})
fd5e94
+}
fd5e94
+
fd5e94
+func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
fd5e94
+	// Set up a scratch dir for the tmpfs on the host.
fd5e94
+	tmpdir, err := prepareTmp("/tmp")
fd5e94
+	if err != nil {
fd5e94
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
fd5e94
+	}
fd5e94
+	defer cleanupTmp(tmpdir)
fd5e94
+	tmpDir, err := ioutil.TempDir(tmpdir, "runctmpdir")
fd5e94
+	if err != nil {
fd5e94
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
fd5e94
+	}
fd5e94
+	defer os.RemoveAll(tmpDir)
fd5e94
+
fd5e94
+	// Configure the *host* tmpdir as if it's the container mount. We change
fd5e94
+	// m.Destination since we are going to mount *on the host*.
fd5e94
+	oldDest := m.Destination
fd5e94
+	m.Destination = tmpDir
fd5e94
+	err = mountPropagate(m, "/", mountLabel)
fd5e94
+	m.Destination = oldDest
fd5e94
+	if err != nil {
fd5e94
 		return err
fd5e94
 	}
fd5e94
-	return nil
fd5e94
+	defer func() {
fd5e94
+		if Err != nil {
fd5e94
+			if err := unix.Unmount(tmpDir, unix.MNT_DETACH); err != nil {
fd5e94
+				logrus.Warnf("tmpcopyup: failed to unmount tmpdir on error: %v", err)
fd5e94
+			}
fd5e94
+		}
fd5e94
+	}()
fd5e94
+
fd5e94
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) (Err error) {
fd5e94
+		// Copy the container data to the host tmpdir. We append "/" to force
fd5e94
+		// CopyDirectory to resolve the symlink rather than trying to copy the
fd5e94
+		// symlink itself.
fd5e94
+		if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil {
fd5e94
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %v", m.Destination, procfd, tmpDir, err)
fd5e94
+		}
fd5e94
+		// Now move the mount into the container.
fd5e94
+		if err := unix.Mount(tmpDir, procfd, "", unix.MS_MOVE, ""); err != nil {
fd5e94
+			return fmt.Errorf("tmpcopyup: failed to move mount %s to %s (%s): %v", tmpDir, procfd, m.Destination, err)
fd5e94
+		}
fd5e94
+		return nil
fd5e94
+	})
fd5e94
 }
fd5e94
 
fd5e94
 func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
fd5e94
-	var (
fd5e94
-		dest = m.Destination
fd5e94
-	)
fd5e94
-	if !strings.HasPrefix(dest, rootfs) {
fd5e94
-		dest = filepath.Join(rootfs, dest)
fd5e94
+	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
fd5e94
+	if err != nil {
fd5e94
+		return err
fd5e94
 	}
fd5e94
 
fd5e94
 	switch m.Device {
fd5e94
@@ -329,46 +376,21 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
fd5e94
 		}
fd5e94
 		return nil
fd5e94
 	case "tmpfs":
fd5e94
-		copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
fd5e94
-		tmpDir := ""
fd5e94
 		stat, err := os.Stat(dest)
fd5e94
 		if err != nil {
fd5e94
 			if err := os.MkdirAll(dest, 0755); err != nil {
fd5e94
 				return err
fd5e94
 			}
fd5e94
 		}
fd5e94
-		if copyUp {
fd5e94
-			tmpdir, err := prepareTmp("/tmp")
fd5e94
-			if err != nil {
fd5e94
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
fd5e94
-			}
fd5e94
-			defer cleanupTmp(tmpdir)
fd5e94
-			tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
fd5e94
-			if err != nil {
fd5e94
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
fd5e94
-			}
fd5e94
-			defer os.RemoveAll(tmpDir)
fd5e94
-			m.Destination = tmpDir
fd5e94
+
fd5e94
+		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
fd5e94
+			err = doTmpfsCopyUp(m, rootfs, mountLabel)
fd5e94
+		} else {
fd5e94
+			err = mountPropagate(m, rootfs, mountLabel)
fd5e94
 		}
fd5e94
-		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
fd5e94
+		if err != nil {
fd5e94
 			return err
fd5e94
 		}
fd5e94
-		if copyUp {
fd5e94
-			if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
fd5e94
-				errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
fd5e94
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
fd5e94
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
fd5e94
-				}
fd5e94
-				return errMsg
fd5e94
-			}
fd5e94
-			if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil {
fd5e94
-				errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
fd5e94
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
fd5e94
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
fd5e94
-				}
fd5e94
-				return errMsg
fd5e94
-			}
fd5e94
-		}
fd5e94
 		if stat != nil {
fd5e94
 			if err = os.Chmod(dest, stat.Mode()); err != nil {
fd5e94
 				return err
fd5e94
@@ -424,19 +446,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
fd5e94
 			}
fd5e94
 		}
fd5e94
 	default:
fd5e94
-		// ensure that the destination of the mount is resolved of symlinks at mount time because
fd5e94
-		// any previous mounts can invalidate the next mount's destination.
fd5e94
-		// this can happen when a user specifies mounts within other mounts to cause breakouts or other
fd5e94
-		// evil stuff to try to escape the container's rootfs.
fd5e94
-		var err error
fd5e94
-		if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
fd5e94
-			return err
fd5e94
-		}
fd5e94
 		if err := checkProcMount(rootfs, dest, m.Source); err != nil {
fd5e94
 			return err
fd5e94
 		}
fd5e94
-		// update the mount with the correct dest after symlinks are resolved.
fd5e94
-		m.Destination = dest
fd5e94
 		if err := os.MkdirAll(dest, 0755); err != nil {
fd5e94
 			return err
fd5e94
 		}
fd5e94
@@ -611,7 +623,7 @@ func createDevices(config *configs.Config) error {
fd5e94
 	return nil
fd5e94
 }
fd5e94
 
fd5e94
-func bindMountDeviceNode(dest string, node *configs.Device) error {
fd5e94
+func bindMountDeviceNode(rootfs, dest string, node *configs.Device) error {
fd5e94
 	f, err := os.Create(dest)
fd5e94
 	if err != nil && !os.IsExist(err) {
fd5e94
 		return err
fd5e94
@@ -619,24 +631,29 @@ func bindMountDeviceNode(dest string, node *configs.Device) error {
fd5e94
 	if f != nil {
fd5e94
 		f.Close()
fd5e94
 	}
fd5e94
-	return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
fd5e94
+	return utils.WithProcfd(rootfs, dest, func(procfd string) error {
fd5e94
+		return unix.Mount(node.Path, procfd, "bind", unix.MS_BIND, "")
fd5e94
+	})
fd5e94
 }
fd5e94
 
fd5e94
 // Creates the device node in the rootfs of the container.
fd5e94
 func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
fd5e94
-	dest := filepath.Join(rootfs, node.Path)
fd5e94
+	dest, err := securejoin.SecureJoin(rootfs, node.Path)
fd5e94
+	if err != nil {
fd5e94
+		return err
fd5e94
+	}
fd5e94
 	if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
fd5e94
 		return err
fd5e94
 	}
fd5e94
 
fd5e94
 	if bind {
fd5e94
-		return bindMountDeviceNode(dest, node)
fd5e94
+		return bindMountDeviceNode(rootfs, dest, node)
fd5e94
 	}
fd5e94
 	if err := mknodDevice(dest, node); err != nil {
fd5e94
 		if os.IsExist(err) {
fd5e94
 			return nil
fd5e94
 		} else if os.IsPermission(err) {
fd5e94
-			return bindMountDeviceNode(dest, node)
fd5e94
+			return bindMountDeviceNode(rootfs, dest, node)
fd5e94
 		}
fd5e94
 		return err
fd5e94
 	}
fd5e94
@@ -955,55 +972,43 @@ func writeSystemProperty(key, value string) error {
fd5e94
 }
fd5e94
 
fd5e94
 func remount(m *configs.Mount, rootfs string) error {
fd5e94
-	var (
fd5e94
-		dest = m.Destination
fd5e94
-	)
fd5e94
-	if !strings.HasPrefix(dest, rootfs) {
fd5e94
-		dest = filepath.Join(rootfs, dest)
fd5e94
-	}
fd5e94
-	return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
fd5e94
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
fd5e94
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
fd5e94
+	})
fd5e94
 }
fd5e94
 
fd5e94
 // Do the mount operation followed by additional mounts required to take care
fd5e94
-// of propagation flags.
fd5e94
+// of propagation flags.  This will always be scoped inside the container rootfs.
fd5e94
 func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
fd5e94
 	var (
fd5e94
-		dest  = m.Destination
fd5e94
 		data  = label.FormatMountLabel(m.Data, mountLabel)
fd5e94
 		flags = m.Flags
fd5e94
 	)
fd5e94
-	if libcontainerUtils.CleanPath(dest) == "/dev" {
fd5e94
+	if utils.CleanPath(m.Destination) == "/dev" {
fd5e94
 		flags &= ^unix.MS_RDONLY
fd5e94
 	}
fd5e94
 
fd5e94
-	copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
fd5e94
-	if !(copyUp || strings.HasPrefix(dest, rootfs)) {
fd5e94
-		dest = filepath.Join(rootfs, dest)
fd5e94
-	}
fd5e94
-
fd5e94
-	if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
fd5e94
-		return err
fd5e94
-	}
fd5e94
-
fd5e94
-	for _, pflag := range m.PropagationFlags {
fd5e94
-		if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil {
fd5e94
-			return err
fd5e94
+	// Because the destination is inside a container path which might be
fd5e94
+	// mutating underneath us, we verify that we are actually going to mount
fd5e94
+	// inside the container with WithProcfd() -- mounting through a procfd
fd5e94
+	// mounts on the target.
fd5e94
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
fd5e94
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data)
fd5e94
+	}); err != nil {
fd5e94
+		return fmt.Errorf("mount through procfd: %v", err)
fd5e94
+	}
fd5e94
+	// We have to apply mount propagation flags in a separate WithProcfd() call
fd5e94
+	// because the previous call invalidates the passed procfd -- the mount
fd5e94
+	// target needs to be re-opened.
fd5e94
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
fd5e94
+		for _, pflag := range m.PropagationFlags {
fd5e94
+			if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
fd5e94
+				return err
fd5e94
+			}
fd5e94
 		}
fd5e94
-	}
fd5e94
-	return nil
fd5e94
-}
fd5e94
-
fd5e94
-func mountNewCgroup(m *configs.Mount) error {
fd5e94
-	var (
fd5e94
-		data   = m.Data
fd5e94
-		source = m.Source
fd5e94
-	)
fd5e94
-	if data == "systemd" {
fd5e94
-		data = cgroups.CgroupNamePrefix + data
fd5e94
-		source = "systemd"
fd5e94
-	}
fd5e94
-	if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil {
fd5e94
-		return err
fd5e94
+		return nil
fd5e94
+	}); err != nil {
fd5e94
+		return fmt.Errorf("change mount propagation through procfd: %v", err)
fd5e94
 	}
fd5e94
 	return nil
fd5e94
 }
fd5e94
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
fd5e94
index 40ccfaa1..c1418ef9 100644
fd5e94
--- a/libcontainer/utils/utils.go
fd5e94
+++ b/libcontainer/utils/utils.go
fd5e94
@@ -2,12 +2,15 @@ package utils
fd5e94
 
fd5e94
 import (
fd5e94
 	"encoding/json"
fd5e94
+	"fmt"
fd5e94
 	"io"
fd5e94
 	"os"
fd5e94
 	"path/filepath"
fd5e94
+	"strconv"
fd5e94
 	"strings"
fd5e94
 	"unsafe"
fd5e94
 
fd5e94
+	securejoin "github.com/cyphar/filepath-securejoin"
fd5e94
 	"golang.org/x/sys/unix"
fd5e94
 )
fd5e94
 
fd5e94
@@ -73,6 +76,57 @@ func CleanPath(path string) string {
fd5e94
 	return filepath.Clean(path)
fd5e94
 }
fd5e94
 
fd5e94
+// stripRoot returns the passed path, stripping the root path if it was
fd5e94
+// (lexicially) inside it. Note that both passed paths will always be treated
fd5e94
+// as absolute, and the returned path will also always be absolute. In
fd5e94
+// addition, the paths are cleaned before stripping the root.
fd5e94
+func stripRoot(root, path string) string {
fd5e94
+	// Make the paths clean and absolute.
fd5e94
+	root, path = CleanPath("/"+root), CleanPath("/"+path)
fd5e94
+	switch {
fd5e94
+	case path == root:
fd5e94
+		path = "/"
fd5e94
+	case root == "/":
fd5e94
+		// do nothing
fd5e94
+	case strings.HasPrefix(path, root+"/"):
fd5e94
+		path = strings.TrimPrefix(path, root+"/")
fd5e94
+	}
fd5e94
+	return CleanPath("/" + path)
fd5e94
+}
fd5e94
+
fd5e94
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
fd5e94
+// corresponding to the unsafePath resolved within the root. Before passing the
fd5e94
+// fd, this path is verified to have been inside the root -- so operating on it
fd5e94
+// through the passed fdpath should be safe. Do not access this path through
fd5e94
+// the original path strings, and do not attempt to use the pathname outside of
fd5e94
+// the passed closure (the file handle will be freed once the closure returns).
fd5e94
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
fd5e94
+	// Remove the root then forcefully resolve inside the root.
fd5e94
+	unsafePath = stripRoot(root, unsafePath)
fd5e94
+	path, err := securejoin.SecureJoin(root, unsafePath)
fd5e94
+	if err != nil {
fd5e94
+		return fmt.Errorf("resolving path inside rootfs failed: %v", err)
fd5e94
+	}
fd5e94
+
fd5e94
+	// Open the target path.
fd5e94
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
fd5e94
+	if err != nil {
fd5e94
+		return fmt.Errorf("open o_path procfd: %v", err)
fd5e94
+	}
fd5e94
+	defer fh.Close()
fd5e94
+
fd5e94
+	// Double-check the path is the one we expected.
fd5e94
+	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
fd5e94
+	if realpath, err := os.Readlink(procfd); err != nil {
fd5e94
+		return fmt.Errorf("procfd verification failed: %v", err)
fd5e94
+	} else if realpath != path {
fd5e94
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
fd5e94
+	}
fd5e94
+
fd5e94
+	// Run the closure.
fd5e94
+	return fn(procfd)
fd5e94
+}
fd5e94
+
fd5e94
 // SearchLabels searches a list of key-value pairs for the provided key and
fd5e94
 // returns the corresponding value. The pairs must be separated with '='.
fd5e94
 func SearchLabels(labels []string, query string) string {
fd5e94
diff --git a/libcontainer/utils/utils_test.go b/libcontainer/utils/utils_test.go
fd5e94
index 395eedcf..5b80cac6 100644
fd5e94
--- a/libcontainer/utils/utils_test.go
fd5e94
+++ b/libcontainer/utils/utils_test.go
fd5e94
@@ -140,3 +140,38 @@ func TestCleanPath(t *testing.T) {
fd5e94
 		t.Errorf("expected to receive '/foo' and received %s", path)
fd5e94
 	}
fd5e94
 }
fd5e94
+
fd5e94
+func TestStripRoot(t *testing.T) {
fd5e94
+	for _, test := range []struct {
fd5e94
+		root, path, out string
fd5e94
+	}{
fd5e94
+		// Works with multiple components.
fd5e94
+		{"/a/b", "/a/b/c", "/c"},
fd5e94
+		{"/hello/world", "/hello/world/the/quick-brown/fox", "/the/quick-brown/fox"},
fd5e94
+		// '/' must be a no-op.
fd5e94
+		{"/", "/a/b/c", "/a/b/c"},
fd5e94
+		// Must be the correct order.
fd5e94
+		{"/a/b", "/a/c/b", "/a/c/b"},
fd5e94
+		// Must be at start.
fd5e94
+		{"/abc/def", "/foo/abc/def/bar", "/foo/abc/def/bar"},
fd5e94
+		// Must be a lexical parent.
fd5e94
+		{"/foo/bar", "/foo/barSAMECOMPONENT", "/foo/barSAMECOMPONENT"},
fd5e94
+		// Must only strip the root once.
fd5e94
+		{"/foo/bar", "/foo/bar/foo/bar/baz", "/foo/bar/baz"},
fd5e94
+		// Deal with .. in a fairly sane way.
fd5e94
+		{"/foo/bar", "/foo/bar/../baz", "/foo/baz"},
fd5e94
+		{"/foo/bar", "../../../../../../foo/bar/baz", "/baz"},
fd5e94
+		{"/foo/bar", "/../../../../../../foo/bar/baz", "/baz"},
fd5e94
+		{"/foo/bar/../baz", "/foo/baz/bar", "/bar"},
fd5e94
+		{"/foo/bar/../baz", "/foo/baz/../bar/../baz/./foo", "/foo"},
fd5e94
+		// All paths are made absolute before stripping.
fd5e94
+		{"foo/bar", "/foo/bar/baz/bee", "/baz/bee"},
fd5e94
+		{"/foo/bar", "foo/bar/baz/beef", "/baz/beef"},
fd5e94
+		{"foo/bar", "foo/bar/baz/beets", "/baz/beets"},
fd5e94
+	} {
fd5e94
+		got := stripRoot(test.root, test.path)
fd5e94
+		if got != test.out {
fd5e94
+			t.Errorf("stripRoot(%q, %q) -- got %q, expected %q", test.root, test.path, got, test.out)
fd5e94
+		}
fd5e94
+	}
fd5e94
+}
fd5e94
-- 
fd5e94
2.31.1
fd5e94