Blob Blame History Raw
diff -up util-linux-2.23.2/include/pathnames.h.kzak util-linux-2.23.2/include/pathnames.h
--- util-linux-2.23.2/include/pathnames.h.kzak	2015-06-26 10:00:19.111877564 +0200
+++ util-linux-2.23.2/include/pathnames.h	2015-06-26 10:00:51.623630869 +0200
@@ -85,6 +85,10 @@
 #define _PATH_PROC_LOCKS        "/proc/locks"
 #define _PATH_PROC_CDROMINFO	"/proc/sys/dev/cdrom/info"
 
+#define _PATH_PROC_UIDMAP	"/proc/self/uid_map"
+#define _PATH_PROC_GIDMAP	"/proc/self/gid_map"
+#define _PATH_PROC_SETGROUPS	"/proc/self/setgroups"
+
 #define _PATH_PROC_ATTR_CURRENT	"/proc/self/attr/current"
 #define _PATH_PROC_ATTR_EXEC	"/proc/self/attr/exec"
 #define _PATH_PROC_CAPLASTCAP	"/proc/sys/kernel/cap_last_cap"
diff -up util-linux-2.23.2/sys-utils/Makemodule.am.kzak util-linux-2.23.2/sys-utils/Makemodule.am
diff -up util-linux-2.23.2/sys-utils/nsenter.1.kzak util-linux-2.23.2/sys-utils/nsenter.1
--- util-linux-2.23.2/sys-utils/nsenter.1.kzak	2015-06-26 09:58:39.468633643 +0200
+++ util-linux-2.23.2/sys-utils/nsenter.1	2015-06-26 09:58:51.672541041 +0200
@@ -1,44 +1,45 @@
-.TH NSENTER 1 "January 2013" "util-linux" "User Commands"
+.TH NSENTER 1 "June 2013" "util-linux" "User Commands"
 .SH NAME
 nsenter \- run program with namespaces of other processes
 .SH SYNOPSIS
 .B nsenter
-.RI [ options ]
-.RI [ program ]
-.RI [ arguments ]
+[options]
+.RI [ program
+.RI [ arguments ]]
 .SH DESCRIPTION
 Enters the namespaces of one or more other processes and then executes the specified
 program.  Enterable namespaces are:
 .TP
 .B mount namespace
-mounting and unmounting filesystems will not affect rest of the system
+Mounting and unmounting filesystems will not affect the rest of the system
 .RB ( CLONE_\:NEWNS
-flag), except for filesystems which are explicitly marked as shared (by mount
---make-\:shared).  See /proc\:/self\:/mountinfo for the shared flag.
+flag), except for filesystems which are explicitly marked as shared (with
+\fBmount --make-\:shared\fP; see \fI/proc\:/self\:/mountinfo\fP for the
+\fBshared\fP flag).
 .TP
 .B UTS namespace
-setting hostname, domainname will not affect rest of the system
+Setting hostname or domainname will not affect the rest of the system.
 .RB ( CLONE_\:NEWUTS
-flag).
+flag)
 .TP
 .B IPC namespace
-process will have independent namespace for System V message queues, semaphore
-sets and shared memory segments
+The process will have an independent namespace for System V message queues,
+semaphore sets and shared memory segments.
 .RB ( CLONE_\:NEWIPC
-flag).
+flag)
 .TP
 .B network namespace
-process will have independent IPv4 and IPv6 stacks, IP routing tables, firewall
-rules, the
+The process will have independent IPv4 and IPv6 stacks, IP routing tables,
+firewall rules, the
 .I /proc\:/net
 and
 .I /sys\:/class\:/net
-directory trees, sockets etc.
+directory trees, sockets, etc.
 .RB ( CLONE_\:NEWNET
-flag).
+flag)
 .TP
 .B PID namespace
-children will have a set of PID to process mappings separate from the
+Children will have a set of PID to process mappings separate from the
 .B nsenter
 process
 .RB ( CLONE_\:NEWPID
@@ -46,18 +47,18 @@ flag).
 .B nsenter
 will fork by default if changing the PID namespace, so that the new program
 and its children share the same PID namespace and are visible to each other.
-If \-\-no\-fork is used, the new program will be exec'ed without forking.
-.PP
-See the
-.BR clone (2)
-for exact semantics of the flags.
+If \fB\-\-no\-fork\fP is used, the new program will be exec'ed without forking.
 .TP
-If program is not given, run ``${SHELL}'' (default: /bin\:/sh).
+.B user namespace
+The process will have a distinct set of UIDs, GIDs and capabilities.
+.RB ( CLONE_\:NEWUSER
+flag)
+.TP
+See \fBclone\fP(2) for the exact semantics of the flags.
+.TP
+If \fIprogram\fP is not given, then ``${SHELL}'' is run (default: /bin\:/sh).
 
 .SH OPTIONS
-Argument with square brakets, such as [\fIfile\fR], means optional argument.
-Command line syntax to specify optional argument \-\-mount=/path\:/to\:/file.
-Please notice the equals sign.
 .TP
 \fB\-t\fR, \fB\-\-target\fR \fIpid\fP
 Specify a target process to get contexts from.  The paths to the contexts
@@ -83,6 +84,9 @@ the network namespace
 /proc/\fIpid\fR/ns/pid
 the PID namespace
 .TP
+/proc/\fIpid\fR/ns/user
+the user namespace
+.TP
 /proc/\fIpid\fR/root
 the root directory
 .TP
@@ -91,51 +95,71 @@ the working directory respectively
 .PD
 .RE
 .TP
-\fB\-m\fR, \fB\-\-mount\fR [\fIfile\fR]
-Enter the mount namespace.  If no file is specified enter the mount namespace
-of the target process.  If file is specified enter the mount namespace
+\fB\-m\fR, \fB\-\-mount\fR[=\fIfile\fR]
+Enter the mount namespace.  If no file is specified, enter the mount namespace
+of the target process.  If file is specified, enter the mount namespace
 specified by file.
 .TP
-\fB\-u\fR, \fB\-\-uts\fR [\fIfile\fR]
-Enter the UTS namespace.  If no file is specified enter the UTS namespace of
-the target process.  If file is specified enter the UTS namespace specified by
+\fB\-u\fR, \fB\-\-uts\fR[=\fIfile\fR]
+Enter the UTS namespace.  If no file is specified, enter the UTS namespace of
+the target process.  If file is specified, enter the UTS namespace specified by
 file.
 .TP
-\fB\-i\fR, \fB\-\-ipc\fR [\fIfile\fR]
-Enter the IPC namespace.  If no file is specified enter the IPC namespace of
-the target process.  If file is specified enter the IPC namespace specified by
+\fB\-i\fR, \fB\-\-ipc\fR[=\fIfile\fR]
+Enter the IPC namespace.  If no file is specified, enter the IPC namespace of
+the target process.  If file is specified, enter the IPC namespace specified by
 file.
 .TP
-\fB\-n\fR, \fB\-\-net\fR [\fIfile\fR]
-Enter the network namespace.  If no file is specified enter the network
-namespace of the target process.  If file is specified enter the network
+\fB\-n\fR, \fB\-\-net\fR[=\fIfile\fR]
+Enter the network namespace.  If no file is specified, enter the network
+namespace of the target process.  If file is specified, enter the network
 namespace specified by file.
 .TP
-\fB\-p\fR, \fB\-\-pid\fR [\fIfile\fR]
-Enter the PID namespace.  If no file is specified enter the PID namespace of
-the target process.  If file is specified enter the PID namespace specified by
+\fB\-p\fR, \fB\-\-pid\fR[=\fIfile\fR]
+Enter the PID namespace.  If no file is specified, enter the PID namespace of
+the target process.  If file is specified, enter the PID namespace specified by
 file.
 .TP
-\fB\-r\fR, \fB\-\-root\fR [\fIdirectory\fR]
-Set the root directory.  If no directory is specified set the root directory to
-the root directory of the target process.  If directory is specified set the
+\fB\-U\fR, \fB\-\-user\fR[=\fIfile\fR]
+Enter the user namespace.  If no file is specified, enter the user namespace of
+the target process.  If file is specified, enter the user namespace specified by
+file.  See also the \fB\-\-setuid\fR and \fB\-\-setgid\fR options.
+.TP
+\fB\-G\fR, \fB\-\-setgid\fR \fIgid\fR
+Set the group ID which will be used in the entered namespace and drop
+supplementary groups.
+.BR nsenter (1)
+always sets GID for user namespaces, the default is 0.
+.TP
+\fB\-S\fR, \fB\-\-setuid\fR \fIuid\fR
+Set the user ID which will be used in the entered namespace.
+.BR nsenter (1)
+always sets UID for user namespaces, the default is 0.
+.TP
+\fB\-\-preserve\-credentials\fR
+Don't modify UID and GID when enter user namespace. The default is to
+drops supplementary groups and sets GID and UID to 0.
+.TP
+\fB\-r\fR, \fB\-\-root\fR[=\fIdirectory\fR]
+Set the root directory.  If no directory is specified, set the root directory to
+the root directory of the target process.  If directory is specified, set the
 root directory to the specified directory.
 .TP
-\fB\-w\fR, \fB\-\-wd\fR [\fIdirectory\fR]
-Set the working directory.  If no directory is specified set the working
+\fB\-w\fR, \fB\-\-wd\fR[=\fIdirectory\fR]
+Set the working directory.  If no directory is specified, set the working
 directory to the working directory of the target process.  If directory is
-specified set the working directory to the specified directory.
+specified, set the working directory to the specified directory.
 .TP
-\fB\-F\fR, \fB\-\-no-fork\fR
-Do not fork before exec'ing the specified program.  By default when entering a
-pid namespace enter calls fork before calling exec so that the children will be
-in the newly entered pid namespace.
+\fB\-F\fR, \fB\-\-no\-fork\fR
+Do not fork before exec'ing the specified program.  By default, when entering a
+PID namespace, \fBnsenter\fP calls \fBfork\fP before calling \fBexec\fP so that
+any children will also be in the newly entered PID namespace.
 .TP
 \fB\-V\fR, \fB\-\-version\fR
 Display version information and exit.
 .TP
 \fB\-h\fR, \fB\-\-help\fR
-Print a help message.
+Display help text and exit.
 .SH SEE ALSO
 .BR setns (2),
 .BR clone (2)
diff -up util-linux-2.23.2/sys-utils/nsenter.c.kzak util-linux-2.23.2/sys-utils/nsenter.c
--- util-linux-2.23.2/sys-utils/nsenter.c.kzak	2015-06-26 09:58:39.468633643 +0200
+++ util-linux-2.23.2/sys-utils/nsenter.c	2015-06-26 09:58:51.673541033 +0200
@@ -28,6 +28,7 @@
 #include <assert.h>
 #include <sys/types.h>
 #include <sys/wait.h>
+#include <grp.h>
 
 #include "strutils.h"
 #include "nls.h"
@@ -42,7 +43,12 @@ static struct namespace_file {
 	int fd;
 } namespace_files[] = {
 	/* Careful the order is significant in this array.
+	 *
+	 * The user namespace comes first, so that it is entered
+	 * first.  This gives an unprivileged user the potential to
+	 * enter the other namespaces.
 	 */
+	{ .nstype = CLONE_NEWUSER, .name = "ns/user", .fd = -1 },
 	{ .nstype = CLONE_NEWIPC,  .name = "ns/ipc",  .fd = -1 },
 	{ .nstype = CLONE_NEWUTS,  .name = "ns/uts",  .fd = -1 },
 	{ .nstype = CLONE_NEWNET,  .name = "ns/net",  .fd = -1 },
@@ -56,18 +62,25 @@ static void usage(int status)
 	FILE *out = status == EXIT_SUCCESS ? stdout : stderr;
 
 	fputs(USAGE_HEADER, out);
-	fprintf(out, _(" %s [options] <program> [args...]\n"),
+	fprintf(out, _(" %s [options] <program> [<argument>...]\n"),
 		program_invocation_short_name);
 
+	fputs(USAGE_SEPARATOR, out);
+	fputs(_("Run a program with namespaces of other processes.\n"), out);
+
 	fputs(USAGE_OPTIONS, out);
 	fputs(_(" -t, --target <pid>     target process to get namespaces from\n"), out);
-	fputs(_(" -m, --mount [=<file>]  enter mount namespace\n"), out);
-	fputs(_(" -u, --uts   [=<file>]  enter UTS namespace (hostname etc)\n"), out);
-	fputs(_(" -i, --ipc   [=<file>]  enter System V IPC namespace\n"), out);
-	fputs(_(" -n, --net   [=<file>]  enter network namespace\n"), out);
-	fputs(_(" -p, --pid   [=<file>]  enter pid namespace\n"), out);
-	fputs(_(" -r, --root  [=<dir>]   set the root directory\n"), out);
-	fputs(_(" -w, --wd    [=<dir>]   set the working directory\n"), out);
+	fputs(_(" -m, --mount[=<file>]   enter mount namespace\n"), out);
+	fputs(_(" -u, --uts[=<file>]     enter UTS namespace (hostname etc)\n"), out);
+	fputs(_(" -i, --ipc[=<file>]     enter System V IPC namespace\n"), out);
+	fputs(_(" -n, --net[=<file>]     enter network namespace\n"), out);
+	fputs(_(" -p, --pid[=<file>]     enter pid namespace\n"), out);
+	fputs(_(" -U, --user[=<file>]    enter user namespace\n"), out);
+	fputs(_(" -S, --setuid <uid>     set uid in entered namespace\n"), out);
+	fputs(_(" -G, --setgid <gid>     set gid in entered namespace\n"), out);
+	fputs(_("     --preserve-credentials do not touch uids or gids\n"), out);
+	fputs(_(" -r, --root[=<dir>]     set the root directory\n"), out);
+	fputs(_(" -w, --wd[=<dir>]       set the working directory\n"), out);
 	fputs(_(" -F, --no-fork          do not fork before exec'ing <program>\n"), out);
 
 	fputs(USAGE_SEPARATOR, out);
@@ -153,6 +166,9 @@ static void continue_as_child(void)
 
 int main(int argc, char *argv[])
 {
+	enum {
+		OPT_PRESERVE_CRED = CHAR_MAX + 1
+	};
 	static const struct option longopts[] = {
 		{ "help", no_argument, NULL, 'h' },
 		{ "version", no_argument, NULL, 'V'},
@@ -162,24 +178,30 @@ int main(int argc, char *argv[])
 		{ "ipc", optional_argument, NULL, 'i' },
 		{ "net", optional_argument, NULL, 'n' },
 		{ "pid", optional_argument, NULL, 'p' },
+		{ "user", optional_argument, NULL, 'U' },
+		{ "setuid", required_argument, NULL, 'S' },
+		{ "setgid", required_argument, NULL, 'G' },
 		{ "root", optional_argument, NULL, 'r' },
 		{ "wd", optional_argument, NULL, 'w' },
 		{ "no-fork", no_argument, NULL, 'F' },
+		{ "preserve-credentials", no_argument, NULL, OPT_PRESERVE_CRED },
 		{ NULL, 0, NULL, 0 }
 	};
 
 	struct namespace_file *nsfile;
-	int c, namespaces = 0;
-	bool do_rd = false, do_wd = false;
+	int c, namespaces = 0, setgroups_nerrs = 0, preserve_cred = 0;
+	bool do_rd = false, do_wd = false, force_uid = false, force_gid = false;
 	int do_fork = -1; /* unknown yet */
+	uid_t uid = 0;
+	gid_t gid = 0;
 
-	setlocale(LC_MESSAGES, "");
+	setlocale(LC_ALL, "");
 	bindtextdomain(PACKAGE, LOCALEDIR);
 	textdomain(PACKAGE);
 	atexit(close_stdout);
 
 	while ((c =
-		getopt_long(argc, argv, "hVt:m::u::i::n::p::r::w::F",
+		getopt_long(argc, argv, "+hVt:m::u::i::n::p::U::S:G:r::w::F",
 			    longopts, NULL)) != -1) {
 		switch (c) {
 		case 'h':
@@ -221,6 +243,20 @@ int main(int argc, char *argv[])
 			else
 				namespaces |= CLONE_NEWPID;
 			break;
+		case 'U':
+			if (optarg)
+				open_namespace_fd(CLONE_NEWUSER, optarg);
+			else
+				namespaces |= CLONE_NEWUSER;
+			break;
+		case 'S':
+			uid = strtoul_or_err(optarg, _("failed to parse uid"));
+			force_uid = true;
+			break;
+		case 'G':
+			gid = strtoul_or_err(optarg, _("failed to parse gid"));
+			force_gid = true;
+			break;
 		case 'F':
 			do_fork = 0;
 			break;
@@ -236,6 +272,9 @@ int main(int argc, char *argv[])
 			else
 				do_wd = true;
 			break;
+		case OPT_PRESERVE_CRED:
+			preserve_cred = 1;
+			break;
 		default:
 			usage(EXIT_FAILURE);
 		}
@@ -253,6 +292,26 @@ int main(int argc, char *argv[])
 		open_target_fd(&wd_fd, "cwd", NULL);
 
 	/*
+	 * Update namespaces variable to contain all requested namespaces
+	 */
+	for (nsfile = namespace_files; nsfile->nstype; nsfile++) {
+		if (nsfile->fd < 0)
+			continue;
+		namespaces |= nsfile->nstype;
+	}
+
+	/* for user namespaces we always set UID and GID (default is 0)
+	 * and clear root's groups if --preserve-credentials is no specified */
+	if ((namespaces & CLONE_NEWUSER) && !preserve_cred) {
+		force_uid = true, force_gid = true;
+
+		/* We call setgroups() before and after we enter user namespace,
+		 * let's complain only if both fail */
+		if (setgroups(0, NULL) != 0)
+			setgroups_nerrs++;
+	}
+
+	/*
 	 * Now that we know which namespaces we want to enter, enter them.
 	 */
 	for (nsfile = namespace_files; nsfile->nstype; nsfile++) {
@@ -302,6 +361,15 @@ int main(int argc, char *argv[])
 	if (do_fork == 1)
 		continue_as_child();
 
+	if (force_uid || force_gid) {
+		if (force_gid && setgroups(0, NULL) != 0 && setgroups_nerrs)	/* drop supplementary groups */
+			err(EXIT_FAILURE, _("setgroups failed"));
+		if (force_gid && setgid(gid) < 0)		/* change GID */
+			err(EXIT_FAILURE, _("setgid failed"));
+		if (force_uid && setuid(uid) < 0)		/* change UID */
+			err(EXIT_FAILURE, _("setuid failed"));
+	}
+
 	if (optind < argc) {
 		execvp(argv[optind], argv + optind);
 		err(EXIT_FAILURE, _("failed to execute %s"), argv[optind]);
diff -up util-linux-2.23.2/sys-utils/unshare.1.kzak util-linux-2.23.2/sys-utils/unshare.1
--- util-linux-2.23.2/sys-utils/unshare.1.kzak	2015-06-26 09:58:39.484633521 +0200
+++ util-linux-2.23.2/sys-utils/unshare.1	2015-06-26 09:58:51.673541033 +0200
@@ -1,28 +1,27 @@
-.\" Process this file with
-.\" groff -man -Tascii lscpu.1
-.\"
-.TH UNSHARE 1 "July 2013" "util-linux" "User Commands"
+.TH UNSHARE 1 "July 2014" "util-linux" "User Commands"
 .SH NAME
 unshare \- run program with some namespaces unshared from parent
 .SH SYNOPSIS
 .B unshare
-.RI [ options ]
+[options]
 .I program
 .RI [ arguments ]
 .SH DESCRIPTION
 Unshares the indicated namespaces from the parent process and then executes
-the specified program.  The namespaces to be unshared are indicated via
+the specified \fIprogram\fR.  The namespaces to be unshared are indicated via
 options.  Unshareable namespaces are:
 .TP
 .BR "mount namespace"
 Mounting and unmounting filesystems will not affect the rest of the system
 (\fBCLONE_NEWNS\fP flag), except for filesystems which are explicitly marked as
-shared (with \fBmount --make-shared\fP; see \fI/proc/self/mountinfo\fP for the
-\fBshared\fP flags).
-
-It's recommended to use \fBmount --make-rprivate\fP or \fBmount --make-rslave\fP
-after \fBunshare --mount\fP to make sure that mountpoints in the new namespace
-are really unshared from parental namespace.
+shared (with \fBmount --make-shared\fP; see \fI/proc/self/mountinfo\fP or
+\fBfindmnt -o+PROPAGATION\fP for the \fBshared\fP flags).
+.sp
+.B unshare
+automatically sets propagation to \fBprivate\fP
+in the new mount namespace to make sure that the new namespace is really
+unshared. This feature is possible to disable by option \fB\-\-propagation unchanged\fP.
+Note that \fBprivate\fP is the kernel default.
 .TP
 .BR "UTS namespace"
 Setting hostname or domainname will not affect the rest of the system.
@@ -40,13 +39,14 @@ sockets, etc.  (\fBCLONE_NEWNET\fP flag)
 .BR "pid namespace"
 Children will have a distinct set of PID to process mappings from their parent.
 (\fBCLONE_NEWPID\fP flag)
+.TP
+.BR "user namespace"
+The process will have a distinct set of UIDs, GIDs and capabilities.
+(\fBCLONE_NEWUSER\fP flag)
 .PP
 See \fBclone\fR(2) for the exact semantics of the flags.
 .SH OPTIONS
 .TP
-.BR \-h , " \-\-help"
-Display help text and exit.
-.TP
 .BR \-i , " \-\-ipc"
 Unshare the IPC namespace.
 .TP
@@ -63,16 +63,68 @@ See also the \fB--fork\fP and \fB--mount
 .BR \-u , " \-\-uts"
 Unshare the UTS namespace.
 .TP
+.BR \-U , " \-\-user"
+Unshare the user namespace.
+.TP
 .BR \-f , " \-\-fork"
 Fork the specified \fIprogram\fR as a child process of \fBunshare\fR rather than
 running it directly.  This is useful when creating a new pid namespace.
 .TP
-.BR \-\-mount-proc "[=\fImountpoint\fP]"
-Just before running the program, mount the proc filesystem at the \fImountpoint\fP
+.BR \-\-mount\-proc "[=\fImountpoint\fP]"
+Just before running the program, mount the proc filesystem at \fImountpoint\fP
 (default is /proc).  This is useful when creating a new pid namespace.  It also
 implies creating a new mount namespace since the /proc mount would otherwise
-mess up existing programs on the system. The new proc filesystem is explicitly
+mess up existing programs on the system.  The new proc filesystem is explicitly
 mounted as private (by MS_PRIVATE|MS_REC).
+.TP
+.BR \-r , " \-\-map\-root\-user"
+Run the program only after the current effective user and group IDs have been mapped to
+the superuser UID and GID in the newly created user namespace.  This makes it possible to
+conveniently gain capabilities needed to manage various aspects of the newly created
+namespaces (such as configuring interfaces in the network namespace or mounting filesystems in
+the mount namespace) even when run unprivileged.  As a mere convenience feature, it does not support
+more sophisticated use cases, such as mapping multiple ranges of UIDs and GIDs.
+This option implies --setgroups=deny.
+.TP
+.BR "\-\-propagation \fIprivate|shared|slave|unchanged\fP"
+Recursively sets mount propagation flag in the new mount namespace. The default
+is to set the propagation to \fIprivate\fP, this feature is possible to disable
+by \fIunchanged\fP argument. The options is silently ignored when mount namespace (\fB\-\-mount\fP)
+is not requested.
+.TP
+.BR "\-\-setgroups \fIallow|deny\fP"
+Allow or deny
+.BR setgroups (2)
+syscall in user namespaces.
+
+.BR setgroups(2)
+is only callable with CAP_SETGID and CAP_SETGID in a user
+namespace (since Linux 3.19) does not give you permission to call setgroups(2)
+until after GID map has been set. The GID map is writable by root when
+.BR setgroups(2)
+is enabled and GID map becomes writable by unprivileged processes when
+.BR setgroups(2)
+is permanently disabled.
+.TP
+.BR \-V , " \-\-version"
+Display version information and exit.
+.TP
+.BR \-h , " \-\-help"
+Display help text and exit.
+.SH EXAMPLES
+.TP
+.B # unshare --fork --pid --mount-proc readlink /proc/self
+.TQ
+1
+.br
+Establish a PID namespace, ensure we're PID 1 in it against newly mounted
+procfs instance.
+.TP
+.B $ unshare --map-root-user --user sh -c whoami
+.TQ
+root
+.br
+Establish a user namespace as an unprivileged user with a root user within it.
 .SH SEE ALSO
 .BR unshare (2),
 .BR clone (2),
diff -up util-linux-2.23.2/sys-utils/unshare.c.kzak util-linux-2.23.2/sys-utils/unshare.c
--- util-linux-2.23.2/sys-utils/unshare.c.kzak	2015-06-26 09:58:39.484633521 +0200
+++ util-linux-2.23.2/sys-utils/unshare.c	2015-06-26 09:58:51.673541033 +0200
@@ -32,19 +32,117 @@
 
 #include "nls.h"
 #include "c.h"
+#include "closestream.h"
 #include "namespace.h"
 #include "exec_shell.h"
 #include "xalloc.h"
 #include "pathnames.h"
+#include "all-io.h"
 
+/* 'private' is kernel default */
+#define UNSHARE_PROPAGATION_DEFAULT	(MS_REC | MS_PRIVATE)
+
+enum {
+	SETGROUPS_NONE = -1,
+	SETGROUPS_DENY = 0,
+	SETGROUPS_ALLOW = 1,
+};
+
+static const char *setgroups_strings[] =
+{
+	[SETGROUPS_DENY] = "deny",
+	[SETGROUPS_ALLOW] = "allow"
+};
+
+static int setgroups_str2id(const char *str)
+{
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(setgroups_strings); i++)
+		if (strcmp(str, setgroups_strings[i]) == 0)
+			return i;
+
+	errx(EXIT_FAILURE, _("unsupported --setgroups argument '%s'"), str);
+}
+
+static void setgroups_control(int action)
+{
+	const char *file = _PATH_PROC_SETGROUPS;
+	const char *cmd;
+	int fd;
+
+	if (action < 0 || (size_t) action >= ARRAY_SIZE(setgroups_strings))
+		return;
+	cmd = setgroups_strings[action];
+
+	fd = open(file, O_WRONLY);
+	if (fd < 0) {
+		if (errno == ENOENT)
+			return;
+		 err(EXIT_FAILURE, _("cannot open %s"), file);
+	}
+
+	if (write_all(fd, cmd, strlen(cmd)))
+		err(EXIT_FAILURE, _("write failed %s"), file);
+	close(fd);
+}
+
+static void map_id(const char *file, uint32_t from, uint32_t to)
+{
+	char *buf;
+	int fd;
+
+	fd = open(file, O_WRONLY);
+	if (fd < 0)
+		 err(EXIT_FAILURE, _("cannot open %s"), file);
+
+	xasprintf(&buf, "%u %u 1", from, to);
+	if (write_all(fd, buf, strlen(buf)))
+		err(EXIT_FAILURE, _("write failed %s"), file);
+	free(buf);
+	close(fd);
+}
+
+static unsigned long parse_propagation(const char *str)
+{
+	size_t i;
+	static const struct prop_opts {
+		const char *name;
+		unsigned long flag;
+	} opts[] = {
+		{ "slave",	MS_REC | MS_SLAVE },
+		{ "private",	MS_REC | MS_PRIVATE },
+		{ "shared",     MS_REC | MS_SHARED },
+		{ "unchanged",        0 }
+	};
+
+	for (i = 0; i < ARRAY_SIZE(opts); i++) {
+		if (strcmp(opts[i].name, str) == 0)
+			return opts[i].flag;
+	}
+
+	errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str);
+}
+
+static void set_propagation(unsigned long flags)
+{
+	if (flags == 0)
+		return;
+
+	if (mount("none", "/", NULL, flags, NULL) != 0)
+		err(EXIT_FAILURE, _("cannot change root filesystem propagation"));
+}
 
 static void usage(int status)
 {
 	FILE *out = status == EXIT_SUCCESS ? stdout : stderr;
 
 	fputs(USAGE_HEADER, out);
-	fprintf(out,
-	      _(" %s [options] <program> [args...]\n"),	program_invocation_short_name);
+	fprintf(out, _(" %s [options] <program> [<argument>...]\n"),
+		program_invocation_short_name);
+
+	fputs(USAGE_SEPARATOR, out);
+	fputs(_("Run a program with some namespaces unshared from the parent.\n"), out);
 
 	fputs(USAGE_OPTIONS, out);
 	fputs(_(" -m, --mount               unshare mounts namespace\n"), out);
@@ -52,8 +150,13 @@ static void usage(int status)
 	fputs(_(" -i, --ipc                 unshare System V IPC namespace\n"), out);
 	fputs(_(" -n, --net                 unshare network namespace\n"), out);
 	fputs(_(" -p, --pid                 unshare pid namespace\n"), out);
+	fputs(_(" -U, --user                unshare user namespace\n"), out);
 	fputs(_(" -f, --fork                fork before launching <program>\n"), out);
 	fputs(_("     --mount-proc[=<dir>]  mount proc filesystem first (implies --mount)\n"), out);
+	fputs(_(" -r, --map-root-user       map current user to root (implies --user)\n"), out);
+	fputs(_("     --propagation <slave|shared|private|unchanged>\n"
+	        "                           modify mount propagation in mount namespace\n"), out);
+	fputs(_(" -s, --setgroups allow|deny  control the setgroups syscall in user namespaces\n"), out);
 
 	fputs(USAGE_SEPARATOR, out);
 	fputs(USAGE_HELP, out);
@@ -66,7 +169,9 @@ static void usage(int status)
 int main(int argc, char *argv[])
 {
 	enum {
-		OPT_MOUNTPROC = CHAR_MAX + 1
+		OPT_MOUNTPROC = CHAR_MAX + 1,
+		OPT_PROPAGATION,
+		OPT_SETGROUPS
 	};
 	static const struct option longopts[] = {
 		{ "help", no_argument, 0, 'h' },
@@ -76,20 +181,29 @@ int main(int argc, char *argv[])
 		{ "ipc", no_argument, 0, 'i' },
 		{ "net", no_argument, 0, 'n' },
 		{ "pid", no_argument, 0, 'p' },
+		{ "user", no_argument, 0, 'U' },
 		{ "fork", no_argument, 0, 'f' },
 		{ "mount-proc", optional_argument, 0, OPT_MOUNTPROC },
+		{ "map-root-user", no_argument, 0, 'r' },
+		{ "propagation", required_argument, 0, OPT_PROPAGATION },
+		{ "setgroups", required_argument, 0, OPT_SETGROUPS },
 		{ NULL, 0, 0, 0 }
 	};
 
+	int setgrpcmd = SETGROUPS_NONE;
 	int unshare_flags = 0;
-	int c, forkit = 0;
+	int c, forkit = 0, maproot = 0;
 	const char *procmnt = NULL;
+	unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT;
+	uid_t real_euid = geteuid();
+	gid_t real_egid = getegid();;
 
 	setlocale(LC_ALL, "");
 	bindtextdomain(PACKAGE, LOCALEDIR);
 	textdomain(PACKAGE);
+	atexit(close_stdout);
 
-	while ((c = getopt_long(argc, argv, "+fhVmuinp", longopts, NULL)) != -1) {
+	while ((c = getopt_long(argc, argv, "+fhVmuinpUr", longopts, NULL)) != -1) {
 		switch (c) {
 		case 'f':
 			forkit = 1;
@@ -114,10 +228,23 @@ int main(int argc, char *argv[])
 		case 'p':
 			unshare_flags |= CLONE_NEWPID;
 			break;
+		case 'U':
+			unshare_flags |= CLONE_NEWUSER;
+			break;
 		case OPT_MOUNTPROC:
 			unshare_flags |= CLONE_NEWNS;
 			procmnt = optarg ? optarg : "/proc";
 			break;
+		case 'r':
+			unshare_flags |= CLONE_NEWUSER;
+			maproot = 1;
+			break;
+		case OPT_SETGROUPS:
+			setgrpcmd = setgroups_str2id(optarg);
+			break;
+		case OPT_PROPAGATION:
+			propagation = parse_propagation(optarg);
+			break;
 		default:
 			usage(EXIT_FAILURE);
 		}
@@ -146,6 +273,25 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	if (maproot) {
+		if (setgrpcmd == SETGROUPS_ALLOW)
+			errx(EXIT_FAILURE, _("options --setgroups=allow and "
+					"--map-root-user are mutually exclusive"));
+
+		/* since Linux 3.19 unprivileged writing of /proc/self/gid_map
+		 * has s been disabled unless /proc/self/setgroups is written
+		 * first to permanently disable the ability to call setgroups
+		 * in that user namespace. */
+		setgroups_control(SETGROUPS_DENY);
+		map_id(_PATH_PROC_UIDMAP, 0, real_euid);
+		map_id(_PATH_PROC_GIDMAP, 0, real_egid);
+
+	} else if (setgrpcmd != SETGROUPS_NONE)
+		setgroups_control(setgrpcmd);
+
+	if ((unshare_flags & CLONE_NEWNS) && propagation)
+		set_propagation(propagation);
+
 	if (procmnt &&
 	    (mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL) != 0 ||
 	     mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0))