diff -urN runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/libcontainer/container_linux.go runc-my/libcontainer/container_linux.go --- runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/libcontainer/container_linux.go 2020-01-22 08:19:15.000000000 -0800 +++ runc-my/libcontainer/container_linux.go 2024-01-23 15:03:45.697128791 -0800 @@ -341,6 +341,17 @@ return newSystemErrorWithCause(err, "creating new parent process") } parent.forwardChildLogs() + + // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC + // to make sure we don't leak any files into "runc init". Any files to be + // passed to "runc init" through ExtraFiles will get dup2'd by the Go + // runtime and thus their O_CLOEXEC flag will be cleared. This is some + // additional protection against attacks like CVE-2024-21626, by making + // sure we never leak files to "runc init" we didn't intend to. + if err := utils.CloseExecFrom(3); err != nil { + return newSystemErrorWithCause(err, "unable to mark non-stdio fds as cloexec") + } + if err := parent.start(); err != nil { // terminate the process to ensure that it properly is reaped. if err := ignoreTerminateErrors(parent.terminate()); err != nil { diff -urN runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/libcontainer/init_linux.go runc-my/libcontainer/init_linux.go --- runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/libcontainer/init_linux.go 2020-01-22 08:19:15.000000000 -0800 +++ runc-my/libcontainer/init_linux.go 2024-01-23 15:25:54.398330999 -0800 @@ -9,6 +9,7 @@ "io/ioutil" "net" "os" + "path/filepath" "strings" "syscall" // only for Errno "unsafe" @@ -116,6 +117,32 @@ return nil } +// verifyCwd ensures that the current directory is actually inside the mount +// namespace root of the current process. +func verifyCwd() error { + // getcwd(2) on Linux detects if cwd is outside of the rootfs of the + // current mount namespace root, and in that case prefixes "(unreachable)" + // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect + // when this happens and return ENOENT rather than returning a non-absolute + // path. In both cases we can therefore easily detect if we have an invalid + // cwd by checking the return value of getcwd(3). See getcwd(3) for more + // details, and CVE-2024-21626 for the security issue that motivated this + // check. + // + // We have to use unix.Getwd() here because os.Getwd() has a workaround for + // $PWD which involves doing stat(.), which can fail if the current + // directory is inaccessible to the container process. + if wd, err := unix.Getwd(); err == unix.ENOENT { + return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected") + } else if err != nil { + return fmt.Errorf("failed to verify if current working directory is safe: %s", err) + } else if !filepath.IsAbs(wd) { + // We shouldn't ever hit this, but check just in case. + return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd) + } + return nil +} + // finalizeNamespace drops the caps, sets the correct user // and working dir, and closes any leaked file descriptors // before executing the command inside the namespace @@ -154,6 +181,10 @@ if err := setupUser(config); err != nil { return errors.Wrap(err, "setup user") } + // Make sure our final working directory is inside the container. + if err := verifyCwd(); err != nil { + return err + } if err := system.ClearKeepCaps(); err != nil { return errors.Wrap(err, "clear keep caps") } diff -urN runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/libcontainer/setns_init_linux.go runc-my/libcontainer/setns_init_linux.go --- runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/libcontainer/setns_init_linux.go 2020-01-22 08:19:15.000000000 -0800 +++ runc-my/libcontainer/setns_init_linux.go 2024-01-23 15:26:54.230209188 -0800 @@ -5,12 +5,15 @@ import ( "fmt" "os" + "os/exec" "runtime" + "syscall" //only for Exec "github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/selinux/go-selinux/label" "github.com/pkg/errors" @@ -80,6 +83,12 @@ if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } + // Check for the arg before waiting to make sure it exists and it is + // returned as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } // Set seccomp as close to execve as possible, so as few syscalls take // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). @@ -88,5 +97,21 @@ return newSystemErrorWithCause(err, "init seccomp") } } - return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } + if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { + return newSystemErrorWithCause(err, "exec user process") + } + return nil } diff -urN runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/libcontainer/standard_init_linux.go runc-my/libcontainer/standard_init_linux.go --- runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/libcontainer/standard_init_linux.go 2020-01-22 08:19:15.000000000 -0800 +++ runc-my/libcontainer/standard_init_linux.go 2024-01-23 15:27:25.396145744 -0800 @@ -14,6 +14,7 @@ "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/selinux/go-selinux/label" "github.com/pkg/errors" @@ -207,6 +208,19 @@ return newSystemErrorWithCause(err, "init seccomp") } } + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { return newSystemErrorWithCause(err, "exec user process") } diff -urN runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/libcontainer/utils/utils_unix.go runc-my/libcontainer/utils/utils_unix.go --- runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/libcontainer/utils/utils_unix.go 2020-01-22 08:19:15.000000000 -0800 +++ runc-my/libcontainer/utils/utils_unix.go 2024-01-23 15:27:53.435088668 -0800 @@ -6,6 +6,7 @@ "fmt" "os" "strconv" + _ "unsafe" // for go:linkname "golang.org/x/sys/unix" ) @@ -22,9 +23,11 @@ return nil } -// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for -// the process (except for those below the given fd value). -func CloseExecFrom(minFd int) error { +type fdFunc func(fd int) + +// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in +// the current process. +func fdRangeFrom(minFd int, fn fdFunc) error { fdDir, err := os.Open("/proc/self/fd") if err != nil { return err @@ -49,15 +52,60 @@ if fd < minFd { continue } - // Intentionally ignore errors from unix.CloseOnExec -- the cases where - // this might fail are basically file descriptors that have already - // been closed (including and especially the one that was created when - // ioutil.ReadDir did the "opendir" syscall). - unix.CloseOnExec(fd) + // Ignore the file descriptor we used for readdir, as it will be closed + // when we return. + if uintptr(fd) == fdDir.Fd() { + continue + } + // Run the closure. + fn(fd) } return nil } +// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or +// equal to minFd in the current process. +func CloseExecFrom(minFd int) error { + return fdRangeFrom(minFd, unix.CloseOnExec) +} + +//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor + +// In order to make sure we do not close the internal epoll descriptors the Go +// runtime uses, we need to ensure that we skip descriptors that match +// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, +// unfortunately there's no other way to be sure we're only keeping the file +// descriptors the Go runtime needs. Hopefully nothing blows up doing this... +func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive + +// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the +// current process, except for those critical to Go's runtime (such as the +// netpoll management descriptors). +// +// NOTE: That this function is incredibly dangerous to use in most Go code, as +// closing file descriptors from underneath *os.File handles can lead to very +// bad behaviour (the closed file descriptor can be re-used and then any +// *os.File operations would apply to the wrong file). This function is only +// intended to be called from the last stage of runc init. +func UnsafeCloseFrom(minFd int) error { + // We must not close some file descriptors. + return fdRangeFrom(minFd, func(fd int) { + if runtime_IsPollDescriptor(uintptr(fd)) { + // These are the Go runtimes internal netpoll file descriptors. + // These file descriptors are operated on deep in the Go scheduler, + // and closing those files from underneath Go can result in panics. + // There is no issue with keeping them because they are not + // executable and are not useful to an attacker anyway. Also we + // don't have any choice. + return + } + // There's nothing we can do about errors from close(2), and the + // only likely error to be seen is EBADF which indicates the fd was + // already closed (in which case, we got what we wanted). + _ = unix.Close(fd) + }) +} + // NewSockPair returns a new unix socket pair func NewSockPair(name string) (parent *os.File, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) diff -urN runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/update.go runc-my/update.go --- runc-dc9208a3303feef5b3839f4323d9beb36df0a9dd/update.go 2020-01-22 08:19:15.000000000 -0800 +++ runc-my/update.go 2024-01-23 14:41:07.528142160 -0800 @@ -173,6 +173,7 @@ return err } } + defer f.Close() err = json.NewDecoder(f).Decode(&r) if err != nil { return err