From 41e91ccdf9fa3097d7b90718cc83e743f4dc8d6b Mon Sep 17 00:00:00 2001 From: Jan Rybar Date: Thu, 17 Aug 2017 18:01:42 +0200 Subject: [PATCH] nspawn: new option to start as PID2 Cherry-picked from: 7732f92 Resolves: #1417387 --- Makefile.am | 2 + man/systemd-nspawn.xml | 65 +++++++++-- src/nspawn/nspawn-stub-pid1.c | 196 ++++++++++++++++++++++++++++++++++ src/nspawn/nspawn-stub-pid1.h | 22 ++++ src/nspawn/nspawn.c | 56 ++++++++-- 5 files changed, 328 insertions(+), 13 deletions(-) create mode 100644 src/nspawn/nspawn-stub-pid1.c create mode 100644 src/nspawn/nspawn-stub-pid1.h diff --git a/Makefile.am b/Makefile.am index 7c58fd0504..0e2f8d561c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2658,6 +2658,8 @@ systemd_cgtop_LDADD = \ # ------------------------------------------------------------------------------ systemd_nspawn_SOURCES = \ src/nspawn/nspawn.c \ + src/nspawn/nspawn-stub-pid1.c \ + src/nspawn/nspawn-stub-pid1.h \ src/core/mount-setup.c \ src/core/mount-setup.h \ src/core/loopback-setup.c \ diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index cbd44d4aba..d0eddaacc3 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -241,16 +241,69 @@ . + + + + + Invoke the shell or specified program as process ID (PID) 2 instead of PID 1 (init). By + default, if neither this option nor is used, the selected binary is run as process with + PID 1, a mode only suitable for programs that are aware of the special semantics that the process with PID 1 + has on UNIX. For example, it needs to reap all processes reparented to it, and should implement + sysvinit compatible signal handling (specifically: it needs to reboot on SIGINT, reexecute + on SIGTERM, reload configuration on SIGHUP, and so on). With a minimal stub init + process is run as PID 1 and the selected binary is executed as PID 2 (and hence does not need to implement any + special semantics). The stub init process will reap processes as necessary and react appropriately to + signals. It is recommended to use this mode to invoke arbitrary commands in containers, unless they have been + modified to run correctly as PID 1. Or in other words: this switch should be used for pretty much all commands, + except when the command refers to an init or shell implementation, as these are generally capable of running + correctly as PID 1). This option may not be combined with or + . + + + - Automatically search for an init binary and - invoke it instead of a shell or a user supplied program. If - this option is used, arguments specified on the command line - are used as arguments for the init binary. This option may not - be combined with . - + Automatically search for an init binary and invoke it as PID 1, instead of a shell or a user + supplied program. If this option is used, arguments specified on the command line are used as arguments for the + init binary. This option may not be combined with or + . + + The following table explains the different modes of invocation and relationship to + (see above): + + + Invocation Mode + + + + + + Switch + Explanation + + + + + Neither nor specified + The passed parameters are interpreted as command line, which is executed as PID 1 in the container. + + + + specified + The passed parameters are interpreted as command line, which are executed as PID 2 in the container. A stub init process is run as PID 1. + + + + specified + An init binary as automatically searched and run as PID 1 in the container. The passed parameters are used as invocation parameters for this process. + + + + +
+
diff --git a/src/nspawn/nspawn-stub-pid1.c b/src/nspawn/nspawn-stub-pid1.c new file mode 100644 index 0000000000..11c11560c4 --- /dev/null +++ b/src/nspawn/nspawn-stub-pid1.c @@ -0,0 +1,196 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include +#include +#include + +#include "log.h" +#include "nspawn-stub-pid1.h" +#include "util.h" +#include "time-util.h" +#include "def.h" + +static int reset_environ(const char *new_environment, size_t length) { + unsigned long start, end; + + start = (unsigned long) new_environment; + end = start + length; + + if (prctl(PR_SET_MM, PR_SET_MM_ENV_START, start, 0, 0) < 0) + return -errno; + + if (prctl(PR_SET_MM, PR_SET_MM_ENV_END, end, 0, 0) < 0) + return -errno; + + return 0; +} + +int stub_pid1(sd_id128_t uuid) { + enum { + STATE_RUNNING, + STATE_REBOOT, + STATE_POWEROFF, + } state = STATE_RUNNING; + + sigset_t fullmask, oldmask, waitmask; + usec_t quit_usec = USEC_INFINITY; + pid_t pid; + int r; + + /* The new environment we set up, on the stack. */ + char new_environment[] = + "container=systemd-nspawn\0" + "container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"; + + /* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful + * for allowing arbitrary processes run in a container, and still have all zombies reaped. */ + + assert_se(sigfillset(&fullmask) >= 0); + assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0); + + pid = fork(); + if (pid < 0) + return log_error_errno(errno, "Failed to fork child pid: %m"); + + if (pid == 0) { + /* Return in the child */ + assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0); + setsid(); + return 0; + } + + reset_all_signal_handlers(); + + log_close(); + close_all_fds(NULL, 0); + log_open(); + + /* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also, + * set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ + * find them set. */ + sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX); + reset_environ(new_environment, sizeof(new_environment)); + + rename_process("STUBINIT"); + + assert_se(sigemptyset(&waitmask) >= 0); + + sigset_add_many(&waitmask, + SIGCHLD, /* posix: process died */ + SIGINT, /* sysv: ctrl-alt-del */ + SIGRTMIN+3, /* systemd: halt */ + SIGRTMIN+4, /* systemd: poweroff */ + SIGRTMIN+5, /* systemd: reboot */ + SIGRTMIN+6, /* systemd: kexec */ + SIGRTMIN+13, /* systemd: halt */ + SIGRTMIN+14, /* systemd: poweroff */ + SIGRTMIN+15, /* systemd: reboot */ + SIGRTMIN+16, /* systemd: kexec */ + -1); + + /* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't + * support reexec/reloading in this stub process. */ + + for (;;) { + siginfo_t si; + usec_t current_usec; + + si.si_pid = 0; + r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG); + if (r < 0) { + r = log_error_errno(errno, "Failed to reap children: %m"); + goto finish; + } + + current_usec = now(CLOCK_MONOTONIC); + + if (si.si_pid == pid || current_usec >= quit_usec) { + + /* The child we started ourselves died or we reached a timeout. */ + + if (state == STATE_REBOOT) { /* dispatch a queued reboot */ + (void) reboot(RB_AUTOBOOT); + r = log_error_errno(errno, "Failed to reboot: %m"); + goto finish; + + } else if (state == STATE_POWEROFF) + (void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */ + + if (si.si_pid == pid && si.si_code == CLD_EXITED) + r = si.si_status; /* pass on exit code */ + else + r = 255; /* signal, coredump, timeout, … */ + + goto finish; + } + if (si.si_pid != 0) + /* We reaped something. Retry until there's nothing more to reap. */ + continue; + + if (quit_usec == USEC_INFINITY) + r = sigwaitinfo(&waitmask, &si); + else { + struct timespec ts; + r = sigtimedwait(&waitmask, &si, timespec_store(&ts, quit_usec - current_usec)); + } + if (r < 0) { + if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */ + continue; + if (errno == EAGAIN) /* timeout reached */ + continue; + + r = log_error_errno(errno, "Failed to wait for signal: %m"); + goto finish; + } + + if (si.si_signo == SIGCHLD) + continue; /* Let's reap this */ + + if (state != STATE_RUNNING) + continue; + + /* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a + * constant… */ + + if (si.si_signo == SIGRTMIN+3 || + si.si_signo == SIGRTMIN+4 || + si.si_signo == SIGRTMIN+13 || + si.si_signo == SIGRTMIN+14) + + state = STATE_POWEROFF; + + else if (si.si_signo == SIGINT || + si.si_signo == SIGRTMIN+5 || + si.si_signo == SIGRTMIN+6 || + si.si_signo == SIGRTMIN+15 || + si.si_signo == SIGRTMIN+16) + + state = STATE_REBOOT; + else + assert_not_reached("Got unexpected signal"); + + /* (void) kill_and_sigcont(pid, SIGTERM); */ + quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC; + } + +finish: + _exit(r < 0 ? EXIT_FAILURE : r); +} diff --git a/src/nspawn/nspawn-stub-pid1.h b/src/nspawn/nspawn-stub-pid1.h new file mode 100644 index 0000000000..be0f1af4cb --- /dev/null +++ b/src/nspawn/nspawn-stub-pid1.h @@ -0,0 +1,22 @@ +#pragma once + +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +int stub_pid1(sd_id128_t uuid); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index d0003d3790..ea365b3f9b 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -99,6 +99,7 @@ #include "in-addr-util.h" #include "fw-util.h" #include "local-addresses.h" +#include "nspawn-stub-pid1.h" #ifdef HAVE_SECCOMP #include "seccomp-util.h" @@ -129,6 +130,14 @@ typedef enum Volatile { VOLATILE_STATE, } Volatile; +typedef enum StartMode { + START_PID1, /* Run parameters as command line as process 1 */ + START_PID2, /* Use stub init process as PID 1, run parameters as command line as process 2 */ + START_BOOT, /* Search for init system, pass arguments as parameters */ + _START_MODE_MAX, + _START_MODE_INVALID = -1 +} StartMode; + static char *arg_directory = NULL; static char *arg_template = NULL; static char *arg_user = NULL; @@ -139,7 +148,7 @@ static const char *arg_selinux_apifs_context = NULL; static const char *arg_slice = NULL; static bool arg_private_network = false; static bool arg_read_only = false; -static bool arg_boot = false; +static StartMode arg_start_mode = START_PID1; static bool arg_ephemeral = false; static LinkJournal arg_link_journal = LINK_AUTO; static bool arg_link_journal_try = false; @@ -200,6 +209,7 @@ static void help(void) { " -x --ephemeral Run container with snapshot of root directory, and\n" " remove it after exit\n" " -i --image=PATH File system device or disk image for the container\n" + " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n" " -b --boot Boot up full system (i.e. invoke init)\n" " -u --user=USER Run the command under specified user or uid\n" " -M --machine=NAME Set the machine name for the container\n" @@ -304,6 +314,7 @@ static int parse_argv(int argc, char *argv[]) { { "ephemeral", no_argument, NULL, 'x' }, { "user", required_argument, NULL, 'u' }, { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK }, + { "as-pid2", no_argument, NULL, 'a' }, { "boot", no_argument, NULL, 'b' }, { "uuid", required_argument, NULL, ARG_UUID }, { "read-only", no_argument, NULL, ARG_READ_ONLY }, @@ -340,7 +351,7 @@ static int parse_argv(int argc, char *argv[]) { assert(argc >= 0); assert(argv); - while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0) switch (c) { @@ -421,7 +432,21 @@ static int parse_argv(int argc, char *argv[]) { break; case 'b': - arg_boot = true; + if (arg_start_mode == START_PID2) { + log_error("--boot and --as-pid2 may not be combined."); + return -EINVAL; + } + + arg_start_mode = START_BOOT; + break; + + case 'a': + if (arg_start_mode == START_BOOT) { + log_error("--boot and --as-pid2 may not be combined."); + return -EINVAL; + } + + arg_start_mode = START_PID2; break; case ARG_UUID: @@ -741,7 +766,7 @@ static int parse_argv(int argc, char *argv[]) { if (arg_share_system) arg_register = false; - if (arg_boot && arg_share_system) { + if (arg_start_mode != START_PID1 && arg_share_system) { log_error("--boot and --share-system may not be combined."); return -EINVAL; } @@ -3586,6 +3611,10 @@ int main(int argc, char *argv[]) { log_parse_environment(); log_open(); + /* Make sure rename_process() in the stub init process can work */ + saved_argv = argv; + saved_argc = argc; + r = parse_argv(argc, argv); if (r <= 0) goto finish; @@ -3694,7 +3723,7 @@ int main(int argc, char *argv[]) { } } - if (arg_boot) { + if (arg_start_mode == START_BOOT) { if (path_is_os_tree(arg_directory) <= 0) { log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory); r = -EINVAL; @@ -4109,7 +4138,19 @@ int main(int argc, char *argv[]) { if (!barrier_place_and_sync(&barrier)) _exit(EXIT_FAILURE); - if (arg_boot) { + if (arg_start_mode == START_PID2) { + r = stub_pid1(arg_uuid); + if (r < 0) + { + log_error_errno(r, "Failed to start as PID2: %m"); + _exit(EXIT_FAILURE); + } + } + + log_close(); + (void) fdset_close_others(fds); + + if (arg_start_mode == START_BOOT) { char **a; size_t l; @@ -4135,6 +4176,7 @@ int main(int argc, char *argv[]) { execle("/bin/sh", "-sh", NULL, env_use); } + log_open(); log_error_errno(errno, "execv() failed: %m"); _exit(EXIT_FAILURE); } @@ -4210,7 +4252,7 @@ int main(int argc, char *argv[]) { goto finish; } - if (arg_boot) { + if (arg_start_mode == START_BOOT) { /* Try to kill the init system on SIGINT or SIGTERM */ sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid)); sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));