bcd21b
From bf6405284aa3870a39b402309003633a1c230ed9 Mon Sep 17 00:00:00 2001
bcd21b
From: Aleksa Sarai <asarai@suse.de>
bcd21b
Date: Wed, 9 Jan 2019 13:40:01 +1100
bcd21b
Subject: [PATCH 1/1] nsenter: clone /proc/self/exe to avoid exposing host
bcd21b
 binary to container
bcd21b
bcd21b
There are quite a few circumstances where /proc/self/exe pointing to a
bcd21b
pretty important container binary is a _bad_ thing, so to avoid this we
bcd21b
have to make a copy (preferably doing self-clean-up and not being
bcd21b
writeable).
bcd21b
bcd21b
As a hotfix we require memfd_create(2), but we can always extend this to
bcd21b
use a scratch MNT_DETACH overlayfs or tmpfs. The main downside to this
bcd21b
approach is no page-cache sharing for the runc binary (which overlayfs
bcd21b
would give us) but this is far less complicated.
bcd21b
bcd21b
This is only done during nsenter so that it happens transparently to the
bcd21b
Go code, and any libcontainer users benefit from it. This also makes
bcd21b
ExtraFiles and --preserve-fds handling trivial (because we don't need to
bcd21b
worry about it).
bcd21b
bcd21b
Fixes: CVE-2019-5736
bcd21b
Co-developed-by: Christian Brauner <christian.brauner@ubuntu.com>
bcd21b
Signed-off-by: Aleksa Sarai <asarai@suse.de>
bcd21b
Signed-off-by: Mrunal Patel <mrunalp@gmail.com>
bcd21b
---
bcd21b
 libcontainer/nsenter/cloned_binary.c | 221 +++++++++++++++++++++++++++
bcd21b
 libcontainer/nsenter/nsexec.c        |  11 ++
bcd21b
 2 files changed, 232 insertions(+)
bcd21b
 create mode 100644 libcontainer/nsenter/cloned_binary.c
bcd21b
bcd21b
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
bcd21b
new file mode 100644
bcd21b
index 00000000..d9f6093a
bcd21b
--- /dev/null
bcd21b
+++ b/libcontainer/nsenter/cloned_binary.c
bcd21b
@@ -0,0 +1,221 @@
bcd21b
+#define _GNU_SOURCE
bcd21b
+#include <unistd.h>
bcd21b
+#include <stdio.h>
bcd21b
+#include <stdlib.h>
bcd21b
+#include <stdbool.h>
bcd21b
+#include <string.h>
bcd21b
+#include <limits.h>
bcd21b
+#include <fcntl.h>
bcd21b
+#include <errno.h>
bcd21b
+
bcd21b
+#include <sys/types.h>
bcd21b
+#include <sys/stat.h>
bcd21b
+#include <sys/vfs.h>
bcd21b
+#include <sys/mman.h>
bcd21b
+#include <sys/sendfile.h>
bcd21b
+#include <sys/syscall.h>
bcd21b
+
bcd21b
+#include <linux/magic.h>
bcd21b
+#include <linux/memfd.h>
bcd21b
+
bcd21b
+/* Our own memfd_create(2) wrapper: a raw syscall(2), needs SYS_memfd_create. */
bcd21b
+#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
bcd21b
+#  define SYS_memfd_create __NR_memfd_create
bcd21b
+#endif
bcd21b
+#ifndef SYS_memfd_create
bcd21b
+#  error "memfd_create(2) syscall not supported by this glibc version"
bcd21b
+#endif
bcd21b
+int memfd_create(const char *name, unsigned int flags)
bcd21b
+{
bcd21b
+	return syscall(SYS_memfd_create, name, flags);
bcd21b
+}
bcd21b
+
bcd21b
+/* This comes directly from <linux/fcntl.h>. */
bcd21b
+#ifndef F_LINUX_SPECIFIC_BASE
bcd21b
+#  define F_LINUX_SPECIFIC_BASE 1024
bcd21b
+#endif
bcd21b
+#ifndef F_ADD_SEALS
bcd21b
+#  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
bcd21b
+#  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
bcd21b
+#endif
bcd21b
+#ifndef F_SEAL_SEAL
bcd21b
+#  define F_SEAL_SEAL   0x0001	/* prevent further seals from being set */
bcd21b
+#  define F_SEAL_SHRINK 0x0002	/* prevent file from shrinking */
bcd21b
+#  define F_SEAL_GROW   0x0004	/* prevent file from growing */
bcd21b
+#  define F_SEAL_WRITE  0x0008	/* prevent writes */
bcd21b
+#endif
bcd21b
+
bcd21b
+
bcd21b
+#define OUR_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
bcd21b
+#define OUR_MEMFD_SEALS \
bcd21b
+	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
bcd21b
+
bcd21b
+static void *must_realloc(void *ptr, size_t size)
bcd21b
+{
bcd21b
+	void *old = ptr;	/* still valid whenever realloc() fails, so retry */
bcd21b
+	do {			/* size 0 -> 1: realloc(p, 0) may free p, return NULL */
bcd21b
+		ptr = realloc(old, size ? size : 1);
bcd21b
+	} while(!ptr);
bcd21b
+	return ptr;		/* never NULL */
bcd21b
+}
bcd21b
+
bcd21b
+/*
bcd21b
+ * Check whether /proc/self/exe is already our sealed memfd clone. F_GET_SEALS
bcd21b
+ * only succeeds for memfds (or rather shmem files). Returns 1 if the seals are
bcd21b
+ * exactly OUR_MEMFD_SEALS, 0 if not, or -ENOTRECOVERABLE if open(2) fails.
bcd21b
+ */
bcd21b
+static int is_self_cloned(void)
bcd21b
+{
bcd21b
+	int fd, seals;
bcd21b
+
bcd21b
+	fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
bcd21b
+	if (fd < 0)
bcd21b
+		return -ENOTRECOVERABLE;
bcd21b
+
bcd21b
+	seals = fcntl(fd, F_GET_SEALS);
bcd21b
+	close(fd);
bcd21b
+	return seals == OUR_MEMFD_SEALS;
bcd21b
+}
bcd21b
+
bcd21b
+/*
bcd21b
+ * Read the whole of @path with read(2) into a heap buffer and store its size
bcd21b
+ * in @length. Returns NULL on error or for an empty file; caller must free().
bcd21b
+ */
bcd21b
+static char *read_file(char *path, size_t *length)
bcd21b
+{
bcd21b
+	int fd;
bcd21b
+	char buf[4096], *copy = NULL;
bcd21b
+
bcd21b
+	if (!length)
bcd21b
+		return NULL;
bcd21b
+
bcd21b
+	fd = open(path, O_RDONLY | O_CLOEXEC);
bcd21b
+	if (fd < 0)
bcd21b
+		return NULL;
bcd21b
+
bcd21b
+	*length = 0;
bcd21b
+	for (;;) {
bcd21b
+		int n;
bcd21b
+
bcd21b
+		n = read(fd, buf, sizeof(buf));
bcd21b
+		if (n < 0)
bcd21b
+			goto error;
bcd21b
+		if (!n)
bcd21b
+			break;
bcd21b
+
bcd21b
+		copy = must_realloc(copy, (*length + n) * sizeof(*copy));
bcd21b
+		memcpy(copy + *length, buf, n);
bcd21b
+		*length += n;
bcd21b
+	}
bcd21b
+	close(fd);
bcd21b
+	return copy;
bcd21b
+
bcd21b
+error:
bcd21b
+	close(fd);
bcd21b
+	free(copy);
bcd21b
+	return NULL;
bcd21b
+}
bcd21b
+
bcd21b
+/*
bcd21b
+ * A poor-man's version of "xargs -0". Builds in *@output (which must start as
bcd21b
+ * NULL) a NULL-terminated array of pointers to each NUL-delimited entry in
bcd21b
+ * @data. Returns the entry count, or -1 for bad arguments or empty @data.
bcd21b
+ */
bcd21b
+static int parse_xargs(char *data, int data_length, char ***output)
bcd21b
+{
bcd21b
+	int num = 0;
bcd21b
+	char *cur = data;
bcd21b
+
bcd21b
+	if (!data || data_length <= 0 || *output != NULL)
bcd21b
+		return -1;
bcd21b
+
bcd21b
+	while (cur < data + data_length) {
bcd21b
+		num++;
bcd21b
+		*output = must_realloc(*output, (num + 1) * sizeof(**output));
bcd21b
+		(*output)[num - 1] = cur;
bcd21b
+		cur += strlen(cur) + 1;
bcd21b
+	}
bcd21b
+	(*output)[num] = NULL;
bcd21b
+	return num;
bcd21b
+}
bcd21b
+
bcd21b
+/*
bcd21b
+ * "Parse" out argv and envp from /proc/self/cmdline and /proc/self/environ,
bcd21b
+ * as we have no main() to take them from. On success (0) both file buffers
bcd21b
+ * stay allocated because *argv / *envp point into them; -EINVAL on failure.
bcd21b
+ */
bcd21b
+static int fetchve(char ***argv, char ***envp)
bcd21b
+{
bcd21b
+	char *cmdline = NULL, *environ = NULL;
bcd21b
+	size_t cmdline_size, environ_size;
bcd21b
+
bcd21b
+	cmdline = read_file("/proc/self/cmdline", &cmdline_size);
bcd21b
+	if (!cmdline)
bcd21b
+		goto error;
bcd21b
+	environ = read_file("/proc/self/environ", &environ_size);
bcd21b
+	if (!environ)
bcd21b
+		goto error;
bcd21b
+
bcd21b
+	if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
bcd21b
+		goto error;
bcd21b
+	if (parse_xargs(environ, environ_size, envp) <= 0)
bcd21b
+		goto error;
bcd21b
+
bcd21b
+	return 0;
bcd21b
+
bcd21b
+error:
bcd21b
+	free(environ);
bcd21b
+	free(cmdline);
bcd21b
+	return -EINVAL;
bcd21b
+}
bcd21b
+
bcd21b
+#define SENDFILE_MAX 0x7FFFF000 /* sendfile(2) is limited to 2GB. */
bcd21b
+/* Copy /proc/self/exe into a new sealed memfd; returns its fd or a -errno. */
bcd21b
+static int clone_binary(void)
bcd21b
+{
bcd21b
+	int binfd, memfd;
bcd21b
+	ssize_t sent;
bcd21b
+
bcd21b
+	memfd = memfd_create(OUR_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
bcd21b
+	if (memfd < 0)
bcd21b
+		return -ENOTRECOVERABLE;
bcd21b
+
bcd21b
+	binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
bcd21b
+	if (binfd < 0)
bcd21b
+		goto error;
bcd21b
+	/* One sendfile(2) call may copy only part of the file: loop until EOF. */
bcd21b
+	do {
bcd21b
+		sent = sendfile(memfd, binfd, NULL, SENDFILE_MAX);
bcd21b
+	} while (sent > 0);
bcd21b
+	close(binfd);
bcd21b
+	if (sent < 0)
bcd21b
+		goto error;
bcd21b
+	if (fcntl(memfd, F_ADD_SEALS, OUR_MEMFD_SEALS) < 0)
bcd21b
+		goto error;
bcd21b
+	return memfd;
bcd21b
+
bcd21b
+error:
bcd21b
+	close(memfd);
bcd21b
+	return -EIO;
bcd21b
+}
bcd21b
+
bcd21b
+int ensure_cloned_binary(void)
bcd21b
+{
bcd21b
+	int execfd;
bcd21b
+	char **argv = NULL, **envp = NULL;
bcd21b
+
bcd21b
+	/* Return early if already cloned (> 0) or if the check itself failed. */
bcd21b
+	int cloned = is_self_cloned();
bcd21b
+	if (cloned > 0 || cloned == -ENOTRECOVERABLE)
bcd21b
+		return cloned;
bcd21b
+
bcd21b
+	if (fetchve(&argv, &envp) < 0)
bcd21b
+		return -EINVAL;
bcd21b
+
bcd21b
+	execfd = clone_binary();
bcd21b
+	if (execfd < 0)
bcd21b
+		return -EIO;
bcd21b
+
bcd21b
+	fexecve(execfd, argv, envp);
bcd21b
+	return -ENOEXEC;	/* fexecve(3) only returns on failure */
bcd21b
+}
bcd21b
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
bcd21b
index cb224314..784fd9b0 100644
bcd21b
--- a/libcontainer/nsenter/nsexec.c
bcd21b
+++ b/libcontainer/nsenter/nsexec.c
bcd21b
@@ -528,6 +528,9 @@ void join_namespaces(char *nslist)
bcd21b
 	free(namespaces);
bcd21b
 }
bcd21b
 
bcd21b
+/* Defined in cloned_binary.c. */
bcd21b
+int ensure_cloned_binary(void);
bcd21b
+
bcd21b
 void nsexec(void)
bcd21b
 {
bcd21b
 	int pipenum;
bcd21b
@@ -543,6 +546,14 @@ void nsexec(void)
bcd21b
 	if (pipenum == -1)
bcd21b
 		return;
bcd21b
 
bcd21b
+	/*
bcd21b
+	 * We need to re-exec if we are not in a cloned binary. This is necessary
bcd21b
+	 * to ensure that containers won't be able to access the host binary
bcd21b
+	 * through /proc/self/exe. See CVE-2019-5736.
bcd21b
+	 */
bcd21b
+	if (ensure_cloned_binary() < 0)
bcd21b
+		bail("could not ensure we are a cloned binary");
bcd21b
+
bcd21b
 	/* Parse all of the netlink configuration. */
bcd21b
 	nl_parse(pipenum, &config);
bcd21b
 
bcd21b
-- 
bcd21b
2.20.1
bcd21b