ff6046
From c7861c541e49e0bf3678d9f3c9093ee819ed436a Mon Sep 17 00:00:00 2001
ff6046
From: Lennart Poettering <lennart@poettering.net>
ff6046
Date: Tue, 17 Jul 2018 11:47:14 +0200
ff6046
Subject: [PATCH] core: introduce new Type=exec service type
ff6046
ff6046
Users are often surprised that "systemd-run" command lines like
ff6046
"systemd-run -p User=idontexist /bin/true" will return successfully,
ff6046
even though the logs show that the process couldn't be invoked, as the
ff6046
user "idontexist" doesn't exist. This is because Type=simple will only
ff6046
wait until fork() succeeded before returning start-up success.
ff6046
ff6046
This patch adds a new service type Type=exec, which is very similar to
ff6046
Type=simple, but waits until the child process completed the execve()
ff6046
before returning success. It uses a pipe that has O_CLOEXEC set for this
ff6046
logic, so that the kernel automatically sends POLLHUP on it when the
ff6046
execve() succeeded but leaves the pipe open if not. This means PID 1
ff6046
waits exactly until the execve() succeeded in the child, and not longer
ff6046
and not shorter, which is the desired functionality.
ff6046
ff6046
Making use of this new functionality, the command line
ff6046
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
ff6046
as expected.
ff6046
ff6046
(cherry picked from commit 5686391b006ee82d8a4559067ad9818e3e631247)
ff6046
ff6046
Resolves: #1683334
ff6046
---
ff6046
 src/core/execute.c |  89 +++++++++++++++++++++---
ff6046
 src/core/execute.h |   3 +
ff6046
 src/core/mount.c   |   9 +--
ff6046
 src/core/service.c | 167 ++++++++++++++++++++++++++++++++++++++++++---
ff6046
 src/core/service.h |   4 ++
ff6046
 src/core/socket.c  |   9 +--
ff6046
 src/core/swap.c    |   1 +
ff6046
 7 files changed, 254 insertions(+), 28 deletions(-)
ff6046
ff6046
diff --git a/src/core/execute.c b/src/core/execute.c
ff6046
index 7476ac51da..c62f3cf849 100644
ff6046
--- a/src/core/execute.c
ff6046
+++ b/src/core/execute.c
ff6046
@@ -2566,6 +2566,7 @@ static int close_remaining_fds(
ff6046
                 const DynamicCreds *dcreds,
ff6046
                 int user_lookup_fd,
ff6046
                 int socket_fd,
ff6046
+                int exec_fd,
ff6046
                 int *fds, size_t n_fds) {
ff6046
 
ff6046
         size_t n_dont_close = 0;
ff6046
@@ -2582,6 +2583,8 @@ static int close_remaining_fds(
ff6046
 
ff6046
         if (socket_fd >= 0)
ff6046
                 dont_close[n_dont_close++] = socket_fd;
ff6046
+        if (exec_fd >= 0)
ff6046
+                dont_close[n_dont_close++] = exec_fd;
ff6046
         if (n_fds > 0) {
ff6046
                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
ff6046
                 n_dont_close += n_fds;
ff6046
@@ -2725,9 +2728,10 @@ static int exec_child(
ff6046
                 int *exit_status) {
ff6046
 
ff6046
         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
ff6046
-        _cleanup_free_ char *home_buffer = NULL;
ff6046
+        int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
ff6046
         _cleanup_free_ gid_t *supplementary_gids = NULL;
ff6046
         const char *username = NULL, *groupname = NULL;
ff6046
+        _cleanup_free_ char *home_buffer = NULL;
ff6046
         const char *home = NULL, *shell = NULL;
ff6046
         dev_t journal_stream_dev = 0;
ff6046
         ino_t journal_stream_ino = 0;
ff6046
@@ -2747,7 +2751,6 @@ static int exec_child(
ff6046
 #endif
ff6046
         uid_t uid = UID_INVALID;
ff6046
         gid_t gid = GID_INVALID;
ff6046
-        int r, ngids = 0;
ff6046
         size_t n_fds;
ff6046
         ExecDirectoryType dt;
ff6046
         int secure_bits;
ff6046
@@ -2791,8 +2794,8 @@ static int exec_child(
ff6046
         /* In case anything used libc syslog(), close this here, too */
ff6046
         closelog();
ff6046
 
ff6046
-        n_fds = n_storage_fds + n_socket_fds;
ff6046
-        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
ff6046
+        n_fds = n_socket_fds + n_storage_fds;
ff6046
+        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
ff6046
         if (r < 0) {
ff6046
                 *exit_status = EXIT_FDS;
ff6046
                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
ff6046
@@ -3165,9 +3168,45 @@ static int exec_child(
ff6046
         }
ff6046
 
ff6046
         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
ff6046
-         * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
ff6046
-         * was needed to upload the policy and can now be closed as well. */
ff6046
-        r = close_all_fds(fds, n_fds);
ff6046
+         * more aggressive this time since we don't need socket_fd and the netns fds anymore. We do keep the exec_fd
ff6046
+         * however if we have it as we want to keep it open until the final execve(). */
ff6046
+
ff6046
+        if (params->exec_fd >= 0) {
ff6046
+                exec_fd = params->exec_fd;
ff6046
+
ff6046
+                if (exec_fd < 3 + (int) n_fds) {
ff6046
+                        int moved_fd;
ff6046
+
ff6046
+                        /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
ff6046
+                         * process we are about to execute. */
ff6046
+
ff6046
+                        moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
ff6046
+                        if (moved_fd < 0) {
ff6046
+                                *exit_status = EXIT_FDS;
ff6046
+                                return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
ff6046
+                        }
ff6046
+
ff6046
+                        safe_close(exec_fd);
ff6046
+                        exec_fd = moved_fd;
ff6046
+                } else {
ff6046
+                        /* This fd should be FD_CLOEXEC already, but let's make sure. */
ff6046
+                        r = fd_cloexec(exec_fd, true);
ff6046
+                        if (r < 0) {
ff6046
+                                *exit_status = EXIT_FDS;
ff6046
+                                return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
ff6046
+                        }
ff6046
+                }
ff6046
+
ff6046
+                fds_with_exec_fd = newa(int, n_fds + 1);
ff6046
+                memcpy(fds_with_exec_fd, fds, n_fds * sizeof(int));
ff6046
+                fds_with_exec_fd[n_fds] = exec_fd;
ff6046
+                n_fds_with_exec_fd = n_fds + 1;
ff6046
+        } else {
ff6046
+                fds_with_exec_fd = fds;
ff6046
+                n_fds_with_exec_fd = n_fds;
ff6046
+        }
ff6046
+
ff6046
+        r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
ff6046
         if (r >= 0)
ff6046
                 r = shift_fds(fds, n_fds);
ff6046
         if (r >= 0)
ff6046
@@ -3177,6 +3216,11 @@ static int exec_child(
ff6046
                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
ff6046
         }
ff6046
 
ff6046
+        /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
ff6046
+         * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
ff6046
+         * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
ff6046
+         * came this far. */
ff6046
+
ff6046
         secure_bits = context->secure_bits;
ff6046
 
ff6046
         if (needs_sandboxing) {
ff6046
@@ -3407,10 +3451,35 @@ static int exec_child(
ff6046
                                    LOG_UNIT_INVOCATION_ID(unit));
ff6046
         }
ff6046
 
ff6046
+        if (exec_fd >= 0) {
ff6046
+                uint8_t hot = 1;
ff6046
+
ff6046
+                /* We have finished with all our initializations. Let's now let the manager know that. From this point
ff6046
+                 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
ff6046
+
ff6046
+                if (write(exec_fd, &hot, sizeof(hot)) < 0) {
ff6046
+                        *exit_status = EXIT_EXEC;
ff6046
+                        return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
ff6046
+                }
ff6046
+        }
ff6046
+
ff6046
         execve(command->path, final_argv, accum_env);
ff6046
+        r = -errno;
ff6046
+
ff6046
+        if (exec_fd >= 0) {
ff6046
+                uint8_t hot = 0;
ff6046
+
ff6046
+                /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
ff6046
+                 * that POLLHUP on it no longer means execve() succeeded. */
ff6046
+
ff6046
+                if (write(exec_fd, &hot, sizeof(hot)) < 0) {
ff6046
+                        *exit_status = EXIT_EXEC;
ff6046
+                        return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
ff6046
+                }
ff6046
+        }
ff6046
 
ff6046
-        if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
ff6046
-                log_struct_errno(LOG_INFO, errno,
ff6046
+        if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
ff6046
+                log_struct_errno(LOG_INFO, r,
ff6046
                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
ff6046
                                  LOG_UNIT_ID(unit),
ff6046
                                  LOG_UNIT_INVOCATION_ID(unit),
ff6046
@@ -3421,7 +3490,7 @@ static int exec_child(
ff6046
         }
ff6046
 
ff6046
         *exit_status = EXIT_EXEC;
ff6046
-        return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
ff6046
+        return log_unit_error_errno(unit, r, "Failed to execute command: %m");
ff6046
 }
ff6046
 
ff6046
 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
ff6046
diff --git a/src/core/execute.h b/src/core/execute.h
ff6046
index f24dbf581a..bff1634b88 100644
ff6046
--- a/src/core/execute.h
ff6046
+++ b/src/core/execute.h
ff6046
@@ -316,6 +316,9 @@ struct ExecParameters {
ff6046
         int stdin_fd;
ff6046
         int stdout_fd;
ff6046
         int stderr_fd;
ff6046
+
ff6046
+        /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */
ff6046
+        int exec_fd;
ff6046
 };
ff6046
 
ff6046
 #include "unit.h"
ff6046
diff --git a/src/core/mount.c b/src/core/mount.c
ff6046
index 21437dad08..16229d4af1 100644
ff6046
--- a/src/core/mount.c
ff6046
+++ b/src/core/mount.c
ff6046
@@ -747,10 +747,11 @@ static void mount_dump(Unit *u, FILE *f, const char *prefix) {
ff6046
 static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) {
ff6046
 
ff6046
         ExecParameters exec_params = {
ff6046
-                .flags      = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
ff6046
-                .stdin_fd   = -1,
ff6046
-                .stdout_fd  = -1,
ff6046
-                .stderr_fd  = -1,
ff6046
+                .flags     = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
ff6046
+                .stdin_fd  = -1,
ff6046
+                .stdout_fd = -1,
ff6046
+                .stderr_fd = -1,
ff6046
+                .exec_fd   = -1,
ff6046
         };
ff6046
         pid_t pid;
ff6046
         int r;
ff6046
diff --git a/src/core/service.c b/src/core/service.c
ff6046
index 7f8ce1b998..3eab749362 100644
ff6046
--- a/src/core/service.c
ff6046
+++ b/src/core/service.c
ff6046
@@ -79,9 +79,10 @@ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] =
ff6046
         [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING
ff6046
 };
ff6046
 
ff6046
-static int service_dispatch_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
ff6046
+static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
ff6046
 static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
ff6046
 static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata);
ff6046
+static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
ff6046
 
ff6046
 static void service_enter_signal(Service *s, ServiceState state, ServiceResult f);
ff6046
 static void service_enter_reload_by_notify(Service *s);
ff6046
@@ -389,6 +390,7 @@ static void service_done(Unit *u) {
ff6046
         service_stop_watchdog(s);
ff6046
 
ff6046
         s->timer_event_source = sd_event_source_unref(s->timer_event_source);
ff6046
+        s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source);
ff6046
 
ff6046
         service_release_resources(u);
ff6046
 }
ff6046
@@ -1066,6 +1068,9 @@ static void service_set_state(Service *s, ServiceState state) {
ff6046
             !(state == SERVICE_DEAD && UNIT(s)->job))
ff6046
                 service_close_socket_fd(s);
ff6046
 
ff6046
+        if (state != SERVICE_START)
ff6046
+                s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source);
ff6046
+
ff6046
         if (!IN_SET(state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
ff6046
                 service_stop_watchdog(s);
ff6046
 
ff6046
@@ -1296,6 +1301,63 @@ static int service_collect_fds(
ff6046
         return 0;
ff6046
 }
ff6046
 
ff6046
+static int service_allocate_exec_fd_event_source(
ff6046
+                Service *s,
ff6046
+                int fd,
ff6046
+                sd_event_source **ret_event_source) {
ff6046
+
ff6046
+        _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL;
ff6046
+        int r;
ff6046
+
ff6046
+        assert(s);
ff6046
+        assert(fd >= 0);
ff6046
+        assert(ret_event_source);
ff6046
+
ff6046
+        r = sd_event_add_io(UNIT(s)->manager->event, &source, fd, 0, service_dispatch_exec_io, s);
ff6046
+        if (r < 0)
ff6046
+                return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m");
ff6046
+
ff6046
+        /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */
ff6046
+
ff6046
+        r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3);
ff6046
+        if (r < 0)
ff6046
+                return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m");
ff6046
+
ff6046
+        (void) sd_event_source_set_description(source, "service event_fd");
ff6046
+
ff6046
+        r = sd_event_source_set_io_fd_own(source, true);
ff6046
+        if (r < 0)
ff6046
+                return log_unit_error_errno(UNIT(s), r, "Failed to pass ownership of fd to event source: %m");
ff6046
+
ff6046
+        *ret_event_source = TAKE_PTR(source);
ff6046
+        return 0;
ff6046
+}
ff6046
+
ff6046
+static int service_allocate_exec_fd(
ff6046
+                Service *s,
ff6046
+                sd_event_source **ret_event_source,
ff6046
+                int* ret_exec_fd) {
ff6046
+
ff6046
+        _cleanup_close_pair_ int p[2] = { -1, -1 };
ff6046
+        int r;
ff6046
+
ff6046
+        assert(s);
ff6046
+        assert(ret_event_source);
ff6046
+        assert(ret_exec_fd);
ff6046
+
ff6046
+        if (pipe2(p, O_CLOEXEC|O_NONBLOCK) < 0)
ff6046
+                return log_unit_error_errno(UNIT(s), errno, "Failed to allocate exec_fd pipe: %m");
ff6046
+
ff6046
+        r = service_allocate_exec_fd_event_source(s, p[0], ret_event_source);
ff6046
+        if (r < 0)
ff6046
+                return r;
ff6046
+
ff6046
+        p[0] = -1;
ff6046
+        *ret_exec_fd = TAKE_FD(p[1]);
ff6046
+
ff6046
+        return 0;
ff6046
+}
ff6046
+
ff6046
 static bool service_exec_needs_notify_socket(Service *s, ExecFlags flags) {
ff6046
         assert(s);
ff6046
 
ff6046
@@ -1330,7 +1392,9 @@ static int service_spawn(
ff6046
                 .exec_fd    = -1,
ff6046
         };
ff6046
         _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL, **fd_names = NULL;
ff6046
+        _cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL;
ff6046
         size_t n_socket_fds = 0, n_storage_fds = 0, n_env = 0;
ff6046
+        _cleanup_close_ int exec_fd = -1;
ff6046
         _cleanup_free_ int *fds = NULL;
ff6046
         pid_t pid;
ff6046
         int r;
ff6046
@@ -1363,6 +1427,14 @@ static int service_spawn(
ff6046
                 log_unit_debug(UNIT(s), "Passing %zu fds to service", n_socket_fds + n_storage_fds);
ff6046
         }
ff6046
 
ff6046
+        if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
ff6046
+                assert(!s->exec_fd_event_source);
ff6046
+
ff6046
+                r = service_allocate_exec_fd(s, &exec_fd_source, &exec_fd);
ff6046
+                if (r < 0)
ff6046
+                        return r;
ff6046
+        }
ff6046
+
ff6046
         r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), timeout));
ff6046
         if (r < 0)
ff6046
                 return r;
ff6046
@@ -1462,6 +1534,7 @@ static int service_spawn(
ff6046
         exec_params.stdin_fd = s->stdin_fd;
ff6046
         exec_params.stdout_fd = s->stdout_fd;
ff6046
         exec_params.stderr_fd = s->stderr_fd;
ff6046
+        exec_params.exec_fd = exec_fd;
ff6046
 
ff6046
         r = exec_spawn(UNIT(s),
ff6046
                        c,
ff6046
@@ -1473,6 +1546,9 @@ static int service_spawn(
ff6046
         if (r < 0)
ff6046
                 return r;
ff6046
 
ff6046
+        s->exec_fd_event_source = TAKE_PTR(exec_fd_source);
ff6046
+        s->exec_fd_hot = false;
ff6046
+
ff6046
         r = unit_watch_pid(UNIT(s), pid);
ff6046
         if (r < 0) /* FIXME: we need to do something here */
ff6046
                 return r;
ff6046
@@ -1984,14 +2060,12 @@ static void service_enter_start(Service *s) {
ff6046
                 s->control_pid = pid;
ff6046
                 service_set_state(s, SERVICE_START);
ff6046
 
ff6046
-        } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY)) {
ff6046
+        } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_EXEC)) {
ff6046
 
ff6046
-                /* For oneshot services we wait until the start
ff6046
-                 * process exited, too, but it is our main process. */
ff6046
+                /* For oneshot services we wait until the start process exited, too, but it is our main process. */
ff6046
 
ff6046
-                /* For D-Bus services we know the main pid right away,
ff6046
-                 * but wait for the bus name to appear on the
ff6046
-                 * bus. Notify services are similar. */
ff6046
+                /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the
ff6046
+                 * bus. 'notify' and 'exec' services are similar. */
ff6046
 
ff6046
                 service_set_main_pid(s, pid);
ff6046
                 service_set_state(s, SERVICE_START);
ff6046
@@ -2444,6 +2518,13 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
ff6046
         if (r < 0)
ff6046
                 return r;
ff6046
 
ff6046
+        if (s->exec_fd_event_source) {
ff6046
+                r = unit_serialize_item_fd(u, f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source));
ff6046
+                if (r < 0)
ff6046
+                        return r;
ff6046
+                unit_serialize_item(u, f, "exec-fd-hot", yes_no(s->exec_fd_hot));
ff6046
+        }
ff6046
+
ff6046
         if (UNIT_ISSET(s->accept_socket)) {
ff6046
                 r = unit_serialize_item(u, f, "accept-socket", UNIT_DEREF(s->accept_socket)->id);
ff6046
                 if (r < 0)
ff6046
@@ -2777,6 +2858,18 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
ff6046
                         s->stderr_fd = fdset_remove(fds, fd);
ff6046
                         s->exec_context.stdio_as_fds = true;
ff6046
                 }
ff6046
+        } else if (streq(key, "exec-fd")) {
ff6046
+                int fd;
ff6046
+
ff6046
+                if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
ff6046
+                        log_unit_debug(u, "Failed to parse exec-fd value: %s", value);
ff6046
+                else {
ff6046
+                        s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source);
ff6046
+
ff6046
+                        fd = fdset_remove(fds, fd);
ff6046
+                        if (service_allocate_exec_fd_event_source(s, fd, &s->exec_fd_event_source) < 0)
ff6046
+                                safe_close(fd);
ff6046
+                }
ff6046
         } else if (streq(key, "watchdog-override-usec")) {
ff6046
                 usec_t watchdog_override_usec;
ff6046
                 if (timestamp_deserialize(value, &watchdog_override_usec) < 0)
ff6046
@@ -2860,7 +2953,7 @@ static int service_watch_pid_file(Service *s) {
ff6046
 
ff6046
         log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path);
ff6046
 
ff6046
-        r = path_spec_watch(s->pid_file_pathspec, service_dispatch_io);
ff6046
+        r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io);
ff6046
         if (r < 0)
ff6046
                 goto fail;
ff6046
 
ff6046
@@ -2904,7 +2997,7 @@ static int service_demand_pid_file(Service *s) {
ff6046
         return service_watch_pid_file(s);
ff6046
 }
ff6046
 
ff6046
-static int service_dispatch_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
ff6046
+static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
ff6046
         PathSpec *p = userdata;
ff6046
         Service *s;
ff6046
 
ff6046
@@ -2937,6 +3030,59 @@ fail:
ff6046
         return 0;
ff6046
 }
ff6046
 
ff6046
+static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
ff6046
+        Service *s = SERVICE(userdata);
ff6046
+
ff6046
+        assert(s);
ff6046
+
ff6046
+        log_unit_debug(UNIT(s), "got exec-fd event");
ff6046
+
ff6046
+        /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve()
ff6046
+         * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically
ff6046
+         * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the
ff6046
+         * parent. We need to be careful however, as there are other reasons that we might cause the child's side of
ff6046
+         * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
ff6046
+         * the child signalled us first that it is about to call the execve(). It does so by sending us a simple
ff6046
+         * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
ff6046
+         * sends a zero byte we'll ignore POLLHUP on the fd again. */
ff6046
+
ff6046
+        for (;;) {
ff6046
+                uint8_t x;
ff6046
+                ssize_t n;
ff6046
+
ff6046
+                n = read(fd, &x, sizeof(x));
ff6046
+                if (n < 0) {
ff6046
+                        if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
ff6046
+                                return 0;
ff6046
+
ff6046
+                        return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
ff6046
+                }
ff6046
+                if (n == 0) { /* EOF → the event we are waiting for */
ff6046
+
ff6046
+                        s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source);
ff6046
+
ff6046
+                        if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
ff6046
+                                log_unit_debug(UNIT(s), "Got EOF on exec-fd");
ff6046
+
ff6046
+                                s->exec_fd_hot = false;
ff6046
+
ff6046
+                                /* Nice! This is what we have been waiting for. Transition to next state. */
ff6046
+                                if (s->type == SERVICE_EXEC && s->state == SERVICE_START)
ff6046
+                                        service_enter_start_post(s);
ff6046
+                        } else
ff6046
+                                log_unit_debug(UNIT(s), "Got EOF on exec-fd while it was disabled, ignoring.");
ff6046
+
ff6046
+                        return 0;
ff6046
+                }
ff6046
+
ff6046
+                /* A byte was read → this turns on/off the exec fd logic */
ff6046
+                assert(n == sizeof(x));
ff6046
+                s->exec_fd_hot = x;
ff6046
+        }
ff6046
+
ff6046
+        return 0;
ff6046
+}
ff6046
+
ff6046
 static void service_notify_cgroup_empty_event(Unit *u) {
ff6046
         Service *s = SERVICE(u);
ff6046
 
ff6046
@@ -3850,7 +3996,8 @@ static const char* const service_type_table[_SERVICE_TYPE_MAX] = {
ff6046
         [SERVICE_ONESHOT] = "oneshot",
ff6046
         [SERVICE_DBUS] = "dbus",
ff6046
         [SERVICE_NOTIFY] = "notify",
ff6046
-        [SERVICE_IDLE] = "idle"
ff6046
+        [SERVICE_IDLE] = "idle",
ff6046
+        [SERVICE_EXEC] = "exec",
ff6046
 };
ff6046
 
ff6046
 DEFINE_STRING_TABLE_LOOKUP(service_type, ServiceType);
ff6046
diff --git a/src/core/service.h b/src/core/service.h
ff6046
index a142b09f0d..1206e3cdda 100644
ff6046
--- a/src/core/service.h
ff6046
+++ b/src/core/service.h
ff6046
@@ -30,6 +30,7 @@ typedef enum ServiceType {
ff6046
         SERVICE_DBUS,     /* we fork and wait until a specific D-Bus name appears on the bus */
ff6046
         SERVICE_NOTIFY,   /* we fork and wait until a daemon sends us a ready message with sd_notify() */
ff6046
         SERVICE_IDLE,     /* much like simple, but delay exec() until all jobs are dispatched. */
ff6046
+        SERVICE_EXEC,     /* we fork and wait until we execute exec() (this means our own setup is waited for) */
ff6046
         _SERVICE_TYPE_MAX,
ff6046
         _SERVICE_TYPE_INVALID = -1
ff6046
 } ServiceType;
ff6046
@@ -165,6 +166,8 @@ struct Service {
ff6046
         NotifyAccess notify_access;
ff6046
         NotifyState notify_state;
ff6046
 
ff6046
+        sd_event_source *exec_fd_event_source;
ff6046
+
ff6046
         ServiceFDStore *fd_store;
ff6046
         size_t n_fd_store;
ff6046
         unsigned n_fd_store_max;
ff6046
@@ -179,6 +182,7 @@ struct Service {
ff6046
 
ff6046
         unsigned n_restarts;
ff6046
         bool flush_n_restarts;
ff6046
+        bool exec_fd_hot;
ff6046
 };
ff6046
 
ff6046
 extern const UnitVTable service_vtable;
ff6046
diff --git a/src/core/socket.c b/src/core/socket.c
ff6046
index 56d32225c4..d488c64e91 100644
ff6046
--- a/src/core/socket.c
ff6046
+++ b/src/core/socket.c
ff6046
@@ -1867,10 +1867,11 @@ static int socket_coldplug(Unit *u) {
ff6046
 static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) {
ff6046
 
ff6046
         ExecParameters exec_params = {
ff6046
-                .flags      = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
ff6046
-                .stdin_fd   = -1,
ff6046
-                .stdout_fd  = -1,
ff6046
-                .stderr_fd  = -1,
ff6046
+                .flags     = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
ff6046
+                .stdin_fd  = -1,
ff6046
+                .stdout_fd = -1,
ff6046
+                .stderr_fd = -1,
ff6046
+                .exec_fd   = -1,
ff6046
         };
ff6046
         pid_t pid;
ff6046
         int r;
ff6046
diff --git a/src/core/swap.c b/src/core/swap.c
ff6046
index b78b1aa266..e01e61e56d 100644
ff6046
--- a/src/core/swap.c
ff6046
+++ b/src/core/swap.c
ff6046
@@ -606,6 +606,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) {
ff6046
                 .stdin_fd  = -1,
ff6046
                 .stdout_fd = -1,
ff6046
                 .stderr_fd = -1,
ff6046
+                .exec_fd   = -1,
ff6046
         };
ff6046
         pid_t pid;
ff6046
         int r;