Blame SOURCES/0001-rootfs-add-mount-destination-validation.patch

5a4e4c
From 9e0940c49a1b65f0fa656905a6b6a14ed2f70dfe Mon Sep 17 00:00:00 2001
5a4e4c
From: Kir Kolyshkin <kolyshkin@gmail.com>
5a4e4c
Date: Mon, 17 May 2021 14:11:35 -0700
5a4e4c
Subject: [PATCH] rootfs: add mount destination validation
5a4e4c
5a4e4c
This is a manual backport of the CVE-2021-30465 fix. Original
5a4e4c
description follows.
5a4e4c
5a4e4c
---
5a4e4c
5a4e4c
Because the target of a mount is inside a container (which may be a
5a4e4c
volume that is shared with another container), there exists a race
5a4e4c
condition where the target of the mount may change to a path containing
5a4e4c
a symlink after we have sanitised the path -- resulting in us
5a4e4c
inadvertently mounting the path outside of the container.
5a4e4c
5a4e4c
This is not immediately useful because we are in a mount namespace with
5a4e4c
MS_SLAVE mount propagation applied to "/", so we cannot mount on top of
5a4e4c
host paths in the host namespace. However, if any subsequent mountpoints
5a4e4c
in the configuration use a subdirectory of that host path as a source,
5a4e4c
those subsequent mounts will use an attacker-controlled source path
5a4e4c
(resolved within the host rootfs) -- allowing the bind-mounting of "/"
5a4e4c
into the container.
5a4e4c
5a4e4c
While arguably configuration issues like this are not entirely within
5a4e4c
runc's threat model, within the context of Kubernetes (and possibly
5a4e4c
other container managers that provide semi-arbitrary container creation
5a4e4c
privileges to untrusted users) this is a legitimate issue. Since we
5a4e4c
cannot block mounting from the host into the container, we need to block
5a4e4c
the first stage of this attack (mounting onto a path outside the
5a4e4c
container).
5a4e4c
5a4e4c
The long-term plan to solve this would be to migrate to libpathrs, but
5a4e4c
as a stop-gap we implement libpathrs-like path verification through
5a4e4c
readlink(/proc/self/fd/$n) and then do mount operations through the
5a4e4c
procfd once it's been verified to be inside the container. The target
5a4e4c
could move after we've checked it, but if it is inside the container
5a4e4c
then we can assume that it is safe for the same reason that libpathrs
5a4e4c
operations would be safe.
5a4e4c
5a4e4c
A slight wrinkle is the "copyup" functionality we provide for tmpfs,
5a4e4c
which is the only case where we want to do a mount on the host
5a4e4c
filesystem. To facilitate this, I split out the copy-up functionality
5a4e4c
entirely so that the logic isn't interspersed with the regular tmpfs
5a4e4c
logic. In addition, all dependencies on m.Destination being overwritten
5a4e4c
have been removed since that pattern was just begging to be a source of
5a4e4c
more mount-target bugs (we do still have to modify m.Destination for
5a4e4c
tmpfs-copyup but we only do it temporarily).
5a4e4c
5a4e4c
Fixes: CVE-2021-30465
5a4e4c
Reported-by: Etienne Champetier <champetier.etienne@gmail.com>
5a4e4c
Co-authored-by: Noah Meyerhans <nmeyerha@amazon.com>
5a4e4c
Reviewed-by: Samuel Karp <skarp@amazon.com>
5a4e4c
Reviewed-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
5a4e4c
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
5a4e4c
5a4e4c
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
5a4e4c
---
5a4e4c
 libcontainer/rootfs_linux.go | 158 ++++++++++++++++++-----------------
5a4e4c
 libcontainer/utils/utils.go  |  56 +++++++++++++
5a4e4c
 2 files changed, 139 insertions(+), 75 deletions(-)
5a4e4c
5a4e4c
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
5a4e4c
index 42b66141..56bfb39e 100644
5a4e4c
--- a/runc-66aedde759f33c190954815fb765eedc1d782dd9/libcontainer/rootfs_linux.go
5a4e4c
+++ b/runc-66aedde759f33c190954815fb765eedc1d782dd9/libcontainer/rootfs_linux.go
5a4e4c
@@ -14,6 +14,7 @@ import (
5a4e4c
 	"syscall"
5a4e4c
 	"time"
5a4e4c
 
5a4e4c
+	"github.com/Sirupsen/logrus"
5a4e4c
 	securejoin "github.com/cyphar/filepath-securejoin"
5a4e4c
 	"github.com/docker/docker/pkg/mount"
5a4e4c
 	"github.com/mrunalp/fileutils"
5a4e4c
@@ -145,8 +146,6 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
5a4e4c
 	if err := checkProcMount(rootfs, dest, m.Source); err != nil {
5a4e4c
 		return err
5a4e4c
 	}
5a4e4c
-	// update the mount with the correct dest after symlinks are resolved.
5a4e4c
-	m.Destination = dest
5a4e4c
 	if err := createIfNotExists(dest, stat.IsDir()); err != nil {
5a4e4c
 		return err
5a4e4c
 	}
5a4e4c
@@ -154,12 +153,50 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
5a4e4c
 	return nil
5a4e4c
 }
5a4e4c
 
5a4e4c
+func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
5a4e4c
+	// Set up a scratch dir for the tmpfs on the host.
5a4e4c
+	tmpDir, err := ioutil.TempDir("/tmp", "runctmpdir")
5a4e4c
+	if err != nil {
5a4e4c
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
5a4e4c
+	}
5a4e4c
+	defer os.RemoveAll(tmpDir)
5a4e4c
+
5a4e4c
+	// Configure the *host* tmpdir as if it's the container mount. We change
5a4e4c
+	// m.Destination since we are going to mount *on the host*.
5a4e4c
+	oldDest := m.Destination
5a4e4c
+	m.Destination = tmpDir
5a4e4c
+	err = mountPropagate(m, "/", mountLabel)
5a4e4c
+	m.Destination = oldDest
5a4e4c
+	if err != nil {
5a4e4c
+		return err
5a4e4c
+	}
5a4e4c
+	defer func() {
5a4e4c
+		if Err != nil {
5a4e4c
+			if err := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err != nil {
5a4e4c
+				logrus.Warnf("tmpcopyup: failed to unmount tmpdir on error: %v", err)
5a4e4c
+			}
5a4e4c
+		}
5a4e4c
+	}()
5a4e4c
+
5a4e4c
+	return libcontainerUtils.WithProcfd(rootfs, m.Destination, func(procfd string) (Err error) {
5a4e4c
+		// Copy the container data to the host tmpdir. We append "/" to force
5a4e4c
+		// CopyDirectory to resolve the symlink rather than trying to copy the
5a4e4c
+		// symlink itself.
5a4e4c
+		if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil {
5a4e4c
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %v", m.Destination, procfd, tmpDir, err)
5a4e4c
+		}
5a4e4c
+		// Now move the mount into the container.
5a4e4c
+		if err := syscall.Mount(tmpDir, procfd, "", syscall.MS_MOVE, ""); err != nil {
5a4e4c
+			return fmt.Errorf("tmpcopyup: failed to move mount %s to %s (%s): %v", tmpDir, procfd, m.Destination, err)
5a4e4c
+		}
5a4e4c
+		return nil
5a4e4c
+	})
5a4e4c
+}
5a4e4c
+
5a4e4c
 func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
5a4e4c
-	var (
5a4e4c
-		dest = m.Destination
5a4e4c
-	)
5a4e4c
-	if !strings.HasPrefix(dest, rootfs) {
5a4e4c
-		dest = filepath.Join(rootfs, dest)
5a4e4c
+	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
5a4e4c
+	if err != nil {
5a4e4c
+		return err
5a4e4c
 	}
5a4e4c
 
5a4e4c
 	switch m.Device {
5a4e4c
@@ -182,41 +219,20 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
5a4e4c
 		}
5a4e4c
 		return nil
5a4e4c
 	case "tmpfs":
5a4e4c
-		copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
5a4e4c
-		tmpDir := ""
5a4e4c
 		stat, err := os.Stat(dest)
5a4e4c
 		if err != nil {
5a4e4c
 			if err := os.MkdirAll(dest, 0755); err != nil {
5a4e4c
 				return err
5a4e4c
 			}
5a4e4c
 		}
5a4e4c
-		if copyUp {
5a4e4c
-			tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir")
5a4e4c
-			if err != nil {
5a4e4c
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
5a4e4c
-			}
5a4e4c
-			defer os.RemoveAll(tmpDir)
5a4e4c
-			m.Destination = tmpDir
5a4e4c
+		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
5a4e4c
+			err = doTmpfsCopyUp(m, rootfs, mountLabel)
5a4e4c
+		} else {
5a4e4c
+			err = mountPropagate(m, rootfs, mountLabel)
5a4e4c
 		}
5a4e4c
-		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
5a4e4c
+		if err != nil {
5a4e4c
 			return err
5a4e4c
 		}
5a4e4c
-		if copyUp {
5a4e4c
-			if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
5a4e4c
-				errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
5a4e4c
-				if err1 := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err1 != nil {
5a4e4c
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
5a4e4c
-				}
5a4e4c
-				return errMsg
5a4e4c
-			}
5a4e4c
-			if err := syscall.Mount(tmpDir, dest, "", syscall.MS_MOVE, ""); err != nil {
5a4e4c
-				errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
5a4e4c
-				if err1 := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err1 != nil {
5a4e4c
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
5a4e4c
-				}
5a4e4c
-				return errMsg
5a4e4c
-			}
5a4e4c
-		}
5a4e4c
 		if stat != nil {
5a4e4c
 			if err = os.Chmod(dest, stat.Mode()); err != nil {
5a4e4c
 				return err
5a4e4c
@@ -299,19 +315,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
5a4e4c
 			}
5a4e4c
 		}
5a4e4c
 	default:
5a4e4c
-		// ensure that the destination of the mount is resolved of symlinks at mount time because
5a4e4c
-		// any previous mounts can invalidate the next mount's destination.
5a4e4c
-		// this can happen when a user specifies mounts within other mounts to cause breakouts or other
5a4e4c
-		// evil stuff to try to escape the container's rootfs.
5a4e4c
-		var err error
5a4e4c
-		if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
5a4e4c
-			return err
5a4e4c
-		}
5a4e4c
 		if err := checkProcMount(rootfs, dest, m.Source); err != nil {
5a4e4c
 			return err
5a4e4c
 		}
5a4e4c
-		// update the mount with the correct dest after symlinks are resolved.
5a4e4c
-		m.Destination = dest
5a4e4c
 		if err := os.MkdirAll(dest, 0755); err != nil {
5a4e4c
 			return err
5a4e4c
 		}
5a4e4c
@@ -485,7 +491,7 @@ func createDevices(config *configs.Config) error {
5a4e4c
 	return nil
5a4e4c
 }
5a4e4c
 
5a4e4c
-func bindMountDeviceNode(dest string, node *configs.Device) error {
5a4e4c
+func bindMountDeviceNode(rootfs, dest string, node *configs.Device) error {
5a4e4c
 	f, err := os.Create(dest)
5a4e4c
 	if err != nil && !os.IsExist(err) {
5a4e4c
 		return err
5a4e4c
@@ -493,24 +499,29 @@ func bindMountDeviceNode(dest string, node *configs.Device) error {
5a4e4c
 	if f != nil {
5a4e4c
 		f.Close()
5a4e4c
 	}
5a4e4c
-	return syscall.Mount(node.Path, dest, "bind", syscall.MS_BIND, "")
5a4e4c
+	return libcontainerUtils.WithProcfd(rootfs, dest, func(procfd string) error {
5a4e4c
+		return syscall.Mount(node.Path, procfd, "bind", syscall.MS_BIND, "")
5a4e4c
+	})
5a4e4c
 }
5a4e4c
 
5a4e4c
 // Creates the device node in the rootfs of the container.
5a4e4c
 func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
5a4e4c
-	dest := filepath.Join(rootfs, node.Path)
5a4e4c
+	dest, err := securejoin.SecureJoin(rootfs, node.Path)
5a4e4c
+	if err != nil {
5a4e4c
+		return err
5a4e4c
+	}
5a4e4c
 	if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
5a4e4c
 		return err
5a4e4c
 	}
5a4e4c
 
5a4e4c
 	if bind {
5a4e4c
-		return bindMountDeviceNode(dest, node)
5a4e4c
+		return bindMountDeviceNode(rootfs, dest, node)
5a4e4c
 	}
5a4e4c
 	if err := mknodDevice(dest, node); err != nil {
5a4e4c
 		if os.IsExist(err) {
5a4e4c
 			return nil
5a4e4c
 		} else if os.IsPermission(err) {
5a4e4c
-			return bindMountDeviceNode(dest, node)
5a4e4c
+			return bindMountDeviceNode(rootfs, dest, node)
5a4e4c
 		}
5a4e4c
 		return err
5a4e4c
 	}
5a4e4c
@@ -681,10 +692,6 @@ func pivotRoot(rootfs string) error {
5a4e4c
 
5a4e4c
 	// Make oldroot rprivate to make sure our unmounts don't propagate to the
5a4e4c
 	// host (and thus bork the machine).
5a4e4c
-	if err := syscall.Mount("", ".", "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
5a4e4c
-		return err
5a4e4c
-	}
5a4e4c
-	// Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
5a4e4c
 	if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil {
5a4e4c
 		return err
5a4e4c
 	}
5a4e4c
@@ -807,43 +814,44 @@ func writeSystemProperty(key, value string) error {
5a4e4c
 }
5a4e4c
 
5a4e4c
 func remount(m *configs.Mount, rootfs string) error {
5a4e4c
-	var (
5a4e4c
-		dest = m.Destination
5a4e4c
-	)
5a4e4c
-	if !strings.HasPrefix(dest, rootfs) {
5a4e4c
-		dest = filepath.Join(rootfs, dest)
5a4e4c
-	}
5a4e4c
-	if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags|syscall.MS_REMOUNT), ""); err != nil {
5a4e4c
-		return err
5a4e4c
-	}
5a4e4c
-	return nil
5a4e4c
+	return libcontainerUtils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
5a4e4c
+		return syscall.Mount(m.Source, procfd, m.Device, uintptr(m.Flags|syscall.MS_REMOUNT), "")
5a4e4c
+	})
5a4e4c
 }
5a4e4c
 
5a4e4c
 // Do the mount operation followed by additional mounts required to take care
5a4e4c
-// of propagation flags.
5a4e4c
+// of propagation flags.  This will always be scoped inside the container rootfs.
5a4e4c
 func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
5a4e4c
 	var (
5a4e4c
-		dest  = m.Destination
5a4e4c
 		data  = label.FormatMountLabel(m.Data, mountLabel)
5a4e4c
 		flags = m.Flags
5a4e4c
 	)
5a4e4c
-	if libcontainerUtils.CleanPath(dest) == "/dev" {
5a4e4c
+	if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
5a4e4c
 		flags &= ^syscall.MS_RDONLY
5a4e4c
 	}
5a4e4c
 
5a4e4c
-	copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
5a4e4c
-	if !(copyUp || strings.HasPrefix(dest, rootfs)) {
5a4e4c
-		dest = filepath.Join(rootfs, dest)
5a4e4c
+	// Because the destination is inside a container path which might be
5a4e4c
+	// mutating underneath us, we verify that we are actually going to mount
5a4e4c
+	// inside the container with WithProcfd() -- mounting through a procfd
5a4e4c
+	// mounts on the target.
5a4e4c
+	if err := libcontainerUtils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
5a4e4c
+		return syscall.Mount(m.Source, procfd, m.Device, uintptr(flags), data)
5a4e4c
+	}); err != nil {
5a4e4c
+		return fmt.Errorf("mount through procfd: %v", err)
5a4e4c
 	}
5a4e4c
 
5a4e4c
-	if err := syscall.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
5a4e4c
-		return err
5a4e4c
-	}
5a4e4c
-
5a4e4c
-	for _, pflag := range m.PropagationFlags {
5a4e4c
-		if err := syscall.Mount("", dest, "", uintptr(pflag), ""); err != nil {
5a4e4c
-			return err
5a4e4c
+	// We have to apply mount propagation flags in a separate WithProcfd() call
5a4e4c
+	// because the previous call invalidates the passed procfd -- the mount
5a4e4c
+	// target needs to be re-opened.
5a4e4c
+	if err := libcontainerUtils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
5a4e4c
+		for _, pflag := range m.PropagationFlags {
5a4e4c
+			if err := syscall.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
5a4e4c
+				return err
5a4e4c
+			}
5a4e4c
 		}
5a4e4c
+		return nil
5a4e4c
+	}); err != nil {
5a4e4c
+		return fmt.Errorf("change mount propagation through procfd: %v", err)
5a4e4c
 	}
5a4e4c
 	return nil
5a4e4c
 }
5a4e4c
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
5a4e4c
index 2b35b9a7..a89fc6a2 100644
5a4e4c
--- a/runc-66aedde759f33c190954815fb765eedc1d782dd9/libcontainer/utils/utils.go
5a4e4c
+++ b/runc-66aedde759f33c190954815fb765eedc1d782dd9/libcontainer/utils/utils.go
5a4e4c
@@ -4,12 +4,17 @@ import (
5a4e4c
 	"crypto/rand"
5a4e4c
 	"encoding/hex"
5a4e4c
 	"encoding/json"
5a4e4c
+	"fmt"
5a4e4c
 	"io"
5a4e4c
 	"os"
5a4e4c
 	"path/filepath"
5a4e4c
+	"strconv"
5a4e4c
 	"strings"
5a4e4c
 	"syscall"
5a4e4c
 	"unsafe"
5a4e4c
+
5a4e4c
+	securejoin "github.com/cyphar/filepath-securejoin"
5a4e4c
+	"golang.org/x/sys/unix"
5a4e4c
 )
5a4e4c
 
5a4e4c
 const (
5a4e4c
@@ -87,6 +92,57 @@ func CleanPath(path string) string {
5a4e4c
 	return filepath.Clean(path)
5a4e4c
 }
5a4e4c
 
5a4e4c
+// stripRoot returns the passed path, stripping the root path if it was
5a4e4c
+// (lexicially) inside it. Note that both passed paths will always be treated
5a4e4c
+// as absolute, and the returned path will also always be absolute. In
5a4e4c
+// addition, the paths are cleaned before stripping the root.
5a4e4c
+func stripRoot(root, path string) string {
5a4e4c
+	// Make the paths clean and absolute.
5a4e4c
+	root, path = CleanPath("/"+root), CleanPath("/"+path)
5a4e4c
+	switch {
5a4e4c
+	case path == root:
5a4e4c
+		path = "/"
5a4e4c
+	case root == "/":
5a4e4c
+		// do nothing
5a4e4c
+	case strings.HasPrefix(path, root+"/"):
5a4e4c
+		path = strings.TrimPrefix(path, root+"/")
5a4e4c
+	}
5a4e4c
+	return CleanPath("/" + path)
5a4e4c
+}
5a4e4c
+
5a4e4c
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
5a4e4c
+// corresponding to the unsafePath resolved within the root. Before passing the
5a4e4c
+// fd, this path is verified to have been inside the root -- so operating on it
5a4e4c
+// through the passed fdpath should be safe. Do not access this path through
5a4e4c
+// the original path strings, and do not attempt to use the pathname outside of
5a4e4c
+// the passed closure (the file handle will be freed once the closure returns).
5a4e4c
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
5a4e4c
+	// Remove the root then forcefully resolve inside the root.
5a4e4c
+	unsafePath = stripRoot(root, unsafePath)
5a4e4c
+	path, err := securejoin.SecureJoin(root, unsafePath)
5a4e4c
+	if err != nil {
5a4e4c
+		return fmt.Errorf("resolving path inside rootfs failed: %v", err)
5a4e4c
+	}
5a4e4c
+
5a4e4c
+	// Open the target path.
5a4e4c
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
5a4e4c
+	if err != nil {
5a4e4c
+		return fmt.Errorf("open o_path procfd: %v", err)
5a4e4c
+	}
5a4e4c
+	defer fh.Close()
5a4e4c
+
5a4e4c
+	// Double-check the path is the one we expected.
5a4e4c
+	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
5a4e4c
+	if realpath, err := os.Readlink(procfd); err != nil {
5a4e4c
+		return fmt.Errorf("procfd verification failed: %v", err)
5a4e4c
+	} else if realpath != path {
5a4e4c
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
5a4e4c
+	}
5a4e4c
+
5a4e4c
+	// Run the closure.
5a4e4c
+	return fn(procfd)
5a4e4c
+}
5a4e4c
+
5a4e4c
 // SearchLabels searches a list of key-value pairs for the provided key and
5a4e4c
 // returns the corresponding value. The pairs must be separated with '='.
5a4e4c
 func SearchLabels(labels []string, query string) string {
5a4e4c
-- 
5a4e4c
2.31.1
5a4e4c