3bb17a
From bf6405284aa3870a39b402309003633a1c230ed9 Mon Sep 17 00:00:00 2001
3bb17a
From: Aleksa Sarai <asarai@suse.de>
3bb17a
Date: Wed, 9 Jan 2019 13:40:01 +1100
3bb17a
Subject: [PATCH 1/1] nsenter: clone /proc/self/exe to avoid exposing host
3bb17a
 binary to container
3bb17a
3bb17a
There are quite a few circumstances where /proc/self/exe pointing to a
3bb17a
pretty important container binary is a _bad_ thing, so to avoid this we
3bb17a
have to make a copy (preferably doing self-clean-up and not being
3bb17a
writeable).
3bb17a
3bb17a
As a hotfix we require memfd_create(2), but we can always extend this to
3bb17a
use a scratch MNT_DETACH overlayfs or tmpfs. The main downside to this
3bb17a
approach is no page-cache sharing for the runc binary (which overlayfs
3bb17a
would give us) but this is far less complicated.
3bb17a
3bb17a
This is only done during nsenter so that it happens transparently to the
3bb17a
Go code, and any libcontainer users benefit from it. This also makes
3bb17a
ExtraFiles and --preserve-fds handling trivial (because we don't need to
3bb17a
worry about it).
3bb17a
3bb17a
Fixes: CVE-2019-5736
3bb17a
Co-developed-by: Christian Brauner <christian.brauner@ubuntu.com>
3bb17a
Signed-off-by: Aleksa Sarai <asarai@suse.de>
3bb17a
Signed-off-by: Mrunal Patel <mrunalp@gmail.com>
3bb17a
---
3bb17a
 libcontainer/nsenter/cloned_binary.c | 221 +++++++++++++++++++++++++++
3bb17a
 libcontainer/nsenter/nsexec.c        |  11 ++
3bb17a
 2 files changed, 232 insertions(+)
3bb17a
 create mode 100644 libcontainer/nsenter/cloned_binary.c
3bb17a
3bb17a
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
3bb17a
new file mode 100644
3bb17a
index 00000000..d9f6093a
3bb17a
--- /dev/null
3bb17a
+++ b/libcontainer/nsenter/cloned_binary.c
3bb17a
@@ -0,0 +1,221 @@
3bb17a
+#define _GNU_SOURCE
3bb17a
+#include <unistd.h>
3bb17a
+#include <stdio.h>
3bb17a
+#include <stdlib.h>
3bb17a
+#include <stdbool.h>
3bb17a
+#include <string.h>
3bb17a
+#include <limits.h>
3bb17a
+#include <fcntl.h>
3bb17a
+#include <errno.h>
3bb17a
+
3bb17a
+#include <sys/types.h>
3bb17a
+#include <sys/stat.h>
3bb17a
+#include <sys/vfs.h>
3bb17a
+#include <sys/mman.h>
3bb17a
+#include <sys/sendfile.h>
3bb17a
+#include <sys/syscall.h>
3bb17a
+
3bb17a
+#include <linux/magic.h>
3bb17a
+#include <linux/memfd.h>
3bb17a
+
3bb17a
+/* Use our own wrapper for memfd_create. */
3bb17a
+#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
3bb17a
+#  define SYS_memfd_create __NR_memfd_create
3bb17a
+#endif
3bb17a
+#ifndef SYS_memfd_create
3bb17a
+#  error "memfd_create(2) syscall not supported by this glibc version"
3bb17a
+#endif
3bb17a
+int memfd_create(const char *name, unsigned int flags)
3bb17a
+{
3bb17a
+	return syscall(SYS_memfd_create, name, flags);
3bb17a
+}
3bb17a
+
3bb17a
+/* This comes directly from <linux/fcntl.h>. */
3bb17a
+#ifndef F_LINUX_SPECIFIC_BASE
3bb17a
+#  define F_LINUX_SPECIFIC_BASE 1024
3bb17a
+#endif
3bb17a
+#ifndef F_ADD_SEALS
3bb17a
+#  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
3bb17a
+#  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
3bb17a
+#endif
3bb17a
+#ifndef F_SEAL_SEAL
3bb17a
+#  define F_SEAL_SEAL   0x0001	/* prevent further seals from being set */
3bb17a
+#  define F_SEAL_SHRINK 0x0002	/* prevent file from shrinking */
3bb17a
+#  define F_SEAL_GROW   0x0004	/* prevent file from growing */
3bb17a
+#  define F_SEAL_WRITE  0x0008	/* prevent writes */
3bb17a
+#endif
3bb17a
+
3bb17a
+
3bb17a
+#define OUR_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
3bb17a
+#define OUR_MEMFD_SEALS \
3bb17a
+	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
3bb17a
+
3bb17a
+static void *must_realloc(void *ptr, size_t size)
3bb17a
+{
3bb17a
+	void *old = ptr;
3bb17a
+	do {
3bb17a
+		ptr = realloc(old, size);
3bb17a
+	} while(!ptr);
3bb17a
+	return ptr;
3bb17a
+}
3bb17a
+
3bb17a
+/*
3bb17a
+ * Verify whether we are currently in a self-cloned program (namely, is
3bb17a
+ * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
3bb17a
+ * for shmem files), and we want to be sure it's actually sealed.
3bb17a
+ */
3bb17a
+static int is_self_cloned(void)
3bb17a
+{
3bb17a
+	int fd, seals;
3bb17a
+
3bb17a
+	fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
3bb17a
+	if (fd < 0)
3bb17a
+		return -ENOTRECOVERABLE;
3bb17a
+
3bb17a
+	seals = fcntl(fd, F_GET_SEALS);
3bb17a
+	close(fd);
3bb17a
+	return seals == OUR_MEMFD_SEALS;
3bb17a
+}
3bb17a
+
3bb17a
+/*
3bb17a
+ * Reads the whole file with read(2) into a heap buffer and reports its length
3bb17a
+ * so it can be treated as an ordinary buffer. The caller owns the copy.
3bb17a
+ */
3bb17a
+static char *read_file(char *path, size_t *length)
3bb17a
+{
3bb17a
+	int fd;
3bb17a
+	char buf[4096], *copy = NULL;
3bb17a
+
3bb17a
+	if (!length)
3bb17a
+		return NULL;
3bb17a
+
3bb17a
+	fd = open(path, O_RDONLY | O_CLOEXEC);
3bb17a
+	if (fd < 0)
3bb17a
+		return NULL;
3bb17a
+
3bb17a
+	*length = 0;
3bb17a
+	for (;;) {
3bb17a
+		int n;
3bb17a
+
3bb17a
+		n = read(fd, buf, sizeof(buf));
3bb17a
+		if (n < 0)
3bb17a
+			goto error;
3bb17a
+		if (!n)
3bb17a
+			break;
3bb17a
+
3bb17a
+		copy = must_realloc(copy, (*length + n) * sizeof(*copy));
3bb17a
+		memcpy(copy + *length, buf, n);
3bb17a
+		*length += n;
3bb17a
+	}
3bb17a
+	close(fd);
3bb17a
+	return copy;
3bb17a
+
3bb17a
+error:
3bb17a
+	close(fd);
3bb17a
+	free(copy);
3bb17a
+	return NULL;
3bb17a
+}
3bb17a
+
3bb17a
+/*
3bb17a
+ * A poor-man's version of "xargs -0". Basically parses a given block of
3bb17a
+ * NUL-delimited data, within the given length and adds a pointer to each entry
3bb17a
+ * to the array of pointers.
3bb17a
+ */
3bb17a
+static int parse_xargs(char *data, int data_length, char ***output)
3bb17a
+{
3bb17a
+	int num = 0;
3bb17a
+	char *cur = data;
3bb17a
+
3bb17a
+	if (!data || *output != NULL)
3bb17a
+		return -1;
3bb17a
+
3bb17a
+	while (cur < data + data_length) {
3bb17a
+		num++;
3bb17a
+		*output = must_realloc(*output, (num + 1) * sizeof(**output));
3bb17a
+		(*output)[num - 1] = cur;
3bb17a
+		cur += strlen(cur) + 1;
3bb17a
+	}
3bb17a
+	(*output)[num] = NULL;
3bb17a
+	return num;
3bb17a
+}
3bb17a
+
3bb17a
+/*
3bb17a
+ * "Parse" out argv and envp from /proc/self/cmdline and /proc/self/environ.
3bb17a
+ * This is necessary because we are running in a context where we don't have a
3bb17a
+ * main() that we can just get the arguments from.
3bb17a
+ */
3bb17a
+static int fetchve(char ***argv, char ***envp)
3bb17a
+{
3bb17a
+	char *cmdline = NULL, *environ = NULL;
3bb17a
+	size_t cmdline_size, environ_size;
3bb17a
+
3bb17a
+	cmdline = read_file("/proc/self/cmdline", &cmdline_size);
3bb17a
+	if (!cmdline)
3bb17a
+		goto error;
3bb17a
+	environ = read_file("/proc/self/environ", &environ_size);
3bb17a
+	if (!environ)
3bb17a
+		goto error;
3bb17a
+
3bb17a
+	if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
3bb17a
+		goto error;
3bb17a
+	if (parse_xargs(environ, environ_size, envp) <= 0)
3bb17a
+		goto error;
3bb17a
+
3bb17a
+	return 0;
3bb17a
+
3bb17a
+error:
3bb17a
+	free(environ);
3bb17a
+	free(cmdline);
3bb17a
+	return -EINVAL;
3bb17a
+}
3bb17a
+
3bb17a
+#define SENDFILE_MAX 0x7FFFF000 /* sendfile(2) is limited to 2GB. */
3bb17a
+static int clone_binary(void)
3bb17a
+{
3bb17a
+	int binfd, memfd, err;
3bb17a
+	ssize_t sent = 0;
3bb17a
+
3bb17a
+	memfd = memfd_create(OUR_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
3bb17a
+	if (memfd < 0)
3bb17a
+		return -ENOTRECOVERABLE;
3bb17a
+
3bb17a
+	binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
3bb17a
+	if (binfd < 0)
3bb17a
+		goto error;
3bb17a
+
3bb17a
+	sent = sendfile(memfd, binfd, NULL, SENDFILE_MAX);
3bb17a
+	close(binfd);
3bb17a
+	if (sent < 0)
3bb17a
+		goto error;
3bb17a
+
3bb17a
+	err = fcntl(memfd, F_ADD_SEALS, OUR_MEMFD_SEALS);
3bb17a
+	if (err < 0)
3bb17a
+		goto error;
3bb17a
+
3bb17a
+	return memfd;
3bb17a
+
3bb17a
+error:
3bb17a
+	close(memfd);
3bb17a
+	return -EIO;
3bb17a
+}
3bb17a
+
3bb17a
+int ensure_cloned_binary(void)
3bb17a
+{
3bb17a
+	int execfd;
3bb17a
+	char **argv = NULL, **envp = NULL;
3bb17a
+
3bb17a
+	/* Check that we're not self-cloned, and if we are then bail. */
3bb17a
+	int cloned = is_self_cloned();
3bb17a
+	if (cloned > 0 || cloned == -ENOTRECOVERABLE)
3bb17a
+		return cloned;
3bb17a
+
3bb17a
+	if (fetchve(&argv, &envp) < 0)
3bb17a
+		return -EINVAL;
3bb17a
+
3bb17a
+	execfd = clone_binary();
3bb17a
+	if (execfd < 0)
3bb17a
+		return -EIO;
3bb17a
+
3bb17a
+	fexecve(execfd, argv, envp);
3bb17a
+	return -ENOEXEC;
3bb17a
+}
3bb17a
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
3bb17a
index cb224314..784fd9b0 100644
3bb17a
--- a/libcontainer/nsenter/nsexec.c
3bb17a
+++ b/libcontainer/nsenter/nsexec.c
3bb17a
@@ -528,6 +528,9 @@ void join_namespaces(char *nslist)
3bb17a
 	free(namespaces);
3bb17a
 }
3bb17a
 
3bb17a
+/* Defined in cloned_binary.c. */
3bb17a
+int ensure_cloned_binary(void);
3bb17a
+
3bb17a
 void nsexec(void)
3bb17a
 {
3bb17a
 	int pipenum;
3bb17a
@@ -543,6 +546,14 @@ void nsexec(void)
3bb17a
 	if (pipenum == -1)
3bb17a
 		return;
3bb17a
 
3bb17a
+	/*
3bb17a
+	 * We need to re-exec if we are not in a cloned binary. This is necessary
3bb17a
+	 * to ensure that containers won't be able to access the host binary
3bb17a
+	 * through /proc/self/exe. See CVE-2019-5736.
3bb17a
+	 */
3bb17a
+	if (ensure_cloned_binary() < 0)
3bb17a
+		bail("could not ensure we are a cloned binary");
3bb17a
+
3bb17a
 	/* Parse all of the netlink configuration. */
3bb17a
 	nl_parse(pipenum, &config);
3bb17a
 
3bb17a
-- 
3bb17a
2.20.1
3bb17a