4bff0a
From c7861c541e49e0bf3678d9f3c9093ee819ed436a Mon Sep 17 00:00:00 2001
4bff0a
From: Lennart Poettering <lennart@poettering.net>
4bff0a
Date: Tue, 17 Jul 2018 11:47:14 +0200
4bff0a
Subject: [PATCH] core: introduce new Type=exec service type
4bff0a
4bff0a
Users are often surprised that "systemd-run" command lines like
4bff0a
"systemd-run -p User=idontexist /bin/true" will return successfully,
4bff0a
even though the logs show that the process couldn't be invoked, as the
4bff0a
user "idontexist" doesn't exist. This is because Type=simple will only
4bff0a
wait until fork() has succeeded before returning start-up success.
4bff0a
4bff0a
This patch adds a new service type Type=exec, which is very similar to
4bff0a
Type=simple, but waits until the child process has completed the execve()
4bff0a
before returning success. It uses a pipe that has O_CLOEXEC set for this
4bff0a
logic, so that the kernel automatically sends POLLHUP on it when the
4bff0a
execve() succeeds but leaves the pipe open if it fails. This means PID 1
4bff0a
waits exactly until the execve() has succeeded in the child, no longer
4bff0a
and no shorter, which is the desired functionality.
4bff0a
4bff0a
Making use of this new functionality, the command line
4bff0a
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
4bff0a
as expected.
4bff0a
4bff0a
(cherry picked from commit 5686391b006ee82d8a4559067ad9818e3e631247)
4bff0a
4bff0a
Resolves: #1683334
4bff0a
---
4bff0a
 src/core/execute.c |  89 +++++++++++++++++++++---
4bff0a
 src/core/execute.h |   3 +
4bff0a
 src/core/mount.c   |   9 +--
4bff0a
 src/core/service.c | 167 ++++++++++++++++++++++++++++++++++++++++++---
4bff0a
 src/core/service.h |   4 ++
4bff0a
 src/core/socket.c  |   9 +--
4bff0a
 src/core/swap.c    |   1 +
4bff0a
 7 files changed, 254 insertions(+), 28 deletions(-)
4bff0a
4bff0a
diff --git a/src/core/execute.c b/src/core/execute.c
4bff0a
index 7476ac51da..c62f3cf849 100644
4bff0a
--- a/src/core/execute.c
4bff0a
+++ b/src/core/execute.c
4bff0a
@@ -2566,6 +2566,7 @@ static int close_remaining_fds(
4bff0a
                 const DynamicCreds *dcreds,
4bff0a
                 int user_lookup_fd,
4bff0a
                 int socket_fd,
4bff0a
+                int exec_fd,
4bff0a
                 int *fds, size_t n_fds) {
4bff0a
 
4bff0a
         size_t n_dont_close = 0;
4bff0a
@@ -2582,6 +2583,8 @@ static int close_remaining_fds(
4bff0a
 
4bff0a
         if (socket_fd >= 0)
4bff0a
                 dont_close[n_dont_close++] = socket_fd;
4bff0a
+        if (exec_fd >= 0)
4bff0a
+                dont_close[n_dont_close++] = exec_fd;
4bff0a
         if (n_fds > 0) {
4bff0a
                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
4bff0a
                 n_dont_close += n_fds;
4bff0a
@@ -2725,9 +2728,10 @@ static int exec_child(
4bff0a
                 int *exit_status) {
4bff0a
 
4bff0a
         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
4bff0a
-        _cleanup_free_ char *home_buffer = NULL;
4bff0a
+        int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
4bff0a
         _cleanup_free_ gid_t *supplementary_gids = NULL;
4bff0a
         const char *username = NULL, *groupname = NULL;
4bff0a
+        _cleanup_free_ char *home_buffer = NULL;
4bff0a
         const char *home = NULL, *shell = NULL;
4bff0a
         dev_t journal_stream_dev = 0;
4bff0a
         ino_t journal_stream_ino = 0;
4bff0a
@@ -2747,7 +2751,6 @@ static int exec_child(
4bff0a
 #endif
4bff0a
         uid_t uid = UID_INVALID;
4bff0a
         gid_t gid = GID_INVALID;
4bff0a
-        int r, ngids = 0;
4bff0a
         size_t n_fds;
4bff0a
         ExecDirectoryType dt;
4bff0a
         int secure_bits;
4bff0a
@@ -2791,8 +2794,8 @@ static int exec_child(
4bff0a
         /* In case anything used libc syslog(), close this here, too */
4bff0a
         closelog();
4bff0a
 
4bff0a
-        n_fds = n_storage_fds + n_socket_fds;
4bff0a
-        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
4bff0a
+        n_fds = n_socket_fds + n_storage_fds;
4bff0a
+        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
4bff0a
         if (r < 0) {
4bff0a
                 *exit_status = EXIT_FDS;
4bff0a
                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4bff0a
@@ -3165,9 +3168,45 @@ static int exec_child(
4bff0a
         }
4bff0a
 
4bff0a
         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4bff0a
-         * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
4bff0a
-         * was needed to upload the policy and can now be closed as well. */
4bff0a
-        r = close_all_fds(fds, n_fds);
4bff0a
+         * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
4bff0a
+         * however if we have it as we want to keep it open until the final execve(). */
4bff0a
+
4bff0a
+        if (params->exec_fd >= 0) {
4bff0a
+                exec_fd = params->exec_fd;
4bff0a
+
4bff0a
+                if (exec_fd < 3 + (int) n_fds) {
4bff0a
+                        int moved_fd;
4bff0a
+
4bff0a
+                        /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
4bff0a
+                         * process we are about to execute. */
4bff0a
+
4bff0a
+                        moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
4bff0a
+                        if (moved_fd < 0) {
4bff0a
+                                *exit_status = EXIT_FDS;
4bff0a
+                                return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
4bff0a
+                        }
4bff0a
+
4bff0a
+                        safe_close(exec_fd);
4bff0a
+                        exec_fd = moved_fd;
4bff0a
+                } else {
4bff0a
+                        /* This fd should be FD_CLOEXEC already, but let's make sure. */
4bff0a
+                        r = fd_cloexec(exec_fd, true);
4bff0a
+                        if (r < 0) {
4bff0a
+                                *exit_status = EXIT_FDS;
4bff0a
+                                return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
4bff0a
+                        }
4bff0a
+                }
4bff0a
+
4bff0a
+                fds_with_exec_fd = newa(int, n_fds + 1);
4bff0a
+                memcpy(fds_with_exec_fd, fds, n_fds * sizeof(int));
4bff0a
+                fds_with_exec_fd[n_fds] = exec_fd;
4bff0a
+                n_fds_with_exec_fd = n_fds + 1;
4bff0a
+        } else {
4bff0a
+                fds_with_exec_fd = fds;
4bff0a
+                n_fds_with_exec_fd = n_fds;
4bff0a
+        }
4bff0a
+
4bff0a
+        r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
4bff0a
         if (r >= 0)
4bff0a
                 r = shift_fds(fds, n_fds);
4bff0a
         if (r >= 0)
4bff0a
@@ -3177,6 +3216,11 @@ static int exec_child(
4bff0a
                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4bff0a
         }
4bff0a
 
4bff0a
+        /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4bff0a
+         * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4bff0a
+         * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4bff0a
+         * came this far. */
4bff0a
+
4bff0a
         secure_bits = context->secure_bits;
4bff0a
 
4bff0a
         if (needs_sandboxing) {
4bff0a
@@ -3407,10 +3451,35 @@ static int exec_child(
4bff0a
                                    LOG_UNIT_INVOCATION_ID(unit));
4bff0a
         }
4bff0a
 
4bff0a
+        if (exec_fd >= 0) {
4bff0a
+                uint8_t hot = 1;
4bff0a
+
4bff0a
+                /* We have finished with all our initializations. Let's now let the manager know that. From this point
4bff0a
+                 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4bff0a
+
4bff0a
+                if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4bff0a
+                        *exit_status = EXIT_EXEC;
4bff0a
+                        return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4bff0a
+                }
4bff0a
+        }
4bff0a
+
4bff0a
         execve(command->path, final_argv, accum_env);
4bff0a
+        r = -errno;
4bff0a
+
4bff0a
+        if (exec_fd >= 0) {
4bff0a
+                uint8_t hot = 0;
4bff0a
+
4bff0a
+                /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4bff0a
+                 * that POLLHUP on it no longer means execve() succeeded. */
4bff0a
+
4bff0a
+                if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4bff0a
+                        *exit_status = EXIT_EXEC;
4bff0a
+                        return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4bff0a
+                }
4bff0a
+        }
4bff0a
 
4bff0a
-        if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4bff0a
-                log_struct_errno(LOG_INFO, errno,
4bff0a
+        if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4bff0a
+                log_struct_errno(LOG_INFO, r,
4bff0a
                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4bff0a
                                  LOG_UNIT_ID(unit),
4bff0a
                                  LOG_UNIT_INVOCATION_ID(unit),
4bff0a
@@ -3421,7 +3490,7 @@ static int exec_child(
4bff0a
         }
4bff0a
 
4bff0a
         *exit_status = EXIT_EXEC;
4bff0a
-        return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
4bff0a
+        return log_unit_error_errno(unit, r, "Failed to execute command: %m");
4bff0a
 }
4bff0a
 
4bff0a
 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4bff0a
diff --git a/src/core/execute.h b/src/core/execute.h
4bff0a
index f24dbf581a..bff1634b88 100644
4bff0a
--- a/src/core/execute.h
4bff0a
+++ b/src/core/execute.h
4bff0a
@@ -316,6 +316,9 @@ struct ExecParameters {
4bff0a
         int stdin_fd;
4bff0a
         int stdout_fd;
4bff0a
         int stderr_fd;
4bff0a
+
4bff0a
+        /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */
4bff0a
+        int exec_fd;
4bff0a
 };
4bff0a
 
4bff0a
 #include "unit.h"
4bff0a
diff --git a/src/core/mount.c b/src/core/mount.c
4bff0a
index 21437dad08..16229d4af1 100644
4bff0a
--- a/src/core/mount.c
4bff0a
+++ b/src/core/mount.c
4bff0a
@@ -747,10 +747,11 @@ static void mount_dump(Unit *u, FILE *f, const char *prefix) {
4bff0a
 static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) {
4bff0a
 
4bff0a
         ExecParameters exec_params = {
4bff0a
-                .flags      = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
4bff0a
-                .stdin_fd   = -1,
4bff0a
-                .stdout_fd  = -1,
4bff0a
-                .stderr_fd  = -1,
4bff0a
+                .flags     = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
4bff0a
+                .stdin_fd  = -1,
4bff0a
+                .stdout_fd = -1,
4bff0a
+                .stderr_fd = -1,
4bff0a
+                .exec_fd   = -1,
4bff0a
         };
4bff0a
         pid_t pid;
4bff0a
         int r;
4bff0a
diff --git a/src/core/service.c b/src/core/service.c
4bff0a
index 7f8ce1b998..3eab749362 100644
4bff0a
--- a/src/core/service.c
4bff0a
+++ b/src/core/service.c
4bff0a
@@ -79,9 +79,10 @@ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] =
4bff0a
         [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING
4bff0a
 };
4bff0a
 
4bff0a
-static int service_dispatch_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
4bff0a
+static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
4bff0a
 static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
4bff0a
 static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata);
4bff0a
+static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
4bff0a
 
4bff0a
 static void service_enter_signal(Service *s, ServiceState state, ServiceResult f);
4bff0a
 static void service_enter_reload_by_notify(Service *s);
4bff0a
@@ -389,6 +390,7 @@ static void service_done(Unit *u) {
4bff0a
         service_stop_watchdog(s);
4bff0a
 
4bff0a
         s->timer_event_source = sd_event_source_unref(s->timer_event_source);
4bff0a
+        s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source);
4bff0a
 
4bff0a
         service_release_resources(u);
4bff0a
 }
4bff0a
@@ -1066,6 +1068,9 @@ static void service_set_state(Service *s, ServiceState state) {
4bff0a
             !(state == SERVICE_DEAD && UNIT(s)->job))
4bff0a
                 service_close_socket_fd(s);
4bff0a
 
4bff0a
+        if (state != SERVICE_START)
4bff0a
+                s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source);
4bff0a
+
4bff0a
         if (!IN_SET(state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
4bff0a
                 service_stop_watchdog(s);
4bff0a
 
4bff0a
@@ -1296,6 +1301,63 @@ static int service_collect_fds(
4bff0a
         return 0;
4bff0a
 }
4bff0a
 
4bff0a
+static int service_allocate_exec_fd_event_source(
4bff0a
+                Service *s,
4bff0a
+                int fd,
4bff0a
+                sd_event_source **ret_event_source) {
4bff0a
+
4bff0a
+        _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL;
4bff0a
+        int r;
4bff0a
+
4bff0a
+        assert(s);
4bff0a
+        assert(fd >= 0);
4bff0a
+        assert(ret_event_source);
4bff0a
+
4bff0a
+        r = sd_event_add_io(UNIT(s)->manager->event, &source, fd, 0, service_dispatch_exec_io, s);
4bff0a
+        if (r < 0)
4bff0a
+                return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m");
4bff0a
+
4bff0a
+        /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */
4bff0a
+
4bff0a
+        r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3);
4bff0a
+        if (r < 0)
4bff0a
+                return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m");
4bff0a
+
4bff0a
+        (void) sd_event_source_set_description(source, "service event_fd");
4bff0a
+
4bff0a
+        r = sd_event_source_set_io_fd_own(source, true);
4bff0a
+        if (r < 0)
4bff0a
+                return log_unit_error_errno(UNIT(s), r, "Failed to pass ownership of fd to event source: %m");
4bff0a
+
4bff0a
+        *ret_event_source = TAKE_PTR(source);
4bff0a
+        return 0;
4bff0a
+}
4bff0a
+
4bff0a
+static int service_allocate_exec_fd(
4bff0a
+                Service *s,
4bff0a
+                sd_event_source **ret_event_source,
4bff0a
+                int* ret_exec_fd) {
4bff0a
+
4bff0a
+        _cleanup_close_pair_ int p[2] = { -1, -1 };
4bff0a
+        int r;
4bff0a
+
4bff0a
+        assert(s);
4bff0a
+        assert(ret_event_source);
4bff0a
+        assert(ret_exec_fd);
4bff0a
+
4bff0a
+        if (pipe2(p, O_CLOEXEC|O_NONBLOCK) < 0)
4bff0a
+                return log_unit_error_errno(UNIT(s), errno, "Failed to allocate exec_fd pipe: %m");
4bff0a
+
4bff0a
+        r = service_allocate_exec_fd_event_source(s, p[0], ret_event_source);
4bff0a
+        if (r < 0)
4bff0a
+                return r;
4bff0a
+
4bff0a
+        p[0] = -1;
4bff0a
+        *ret_exec_fd = TAKE_FD(p[1]);
4bff0a
+
4bff0a
+        return 0;
4bff0a
+}
4bff0a
+
4bff0a
 static bool service_exec_needs_notify_socket(Service *s, ExecFlags flags) {
4bff0a
         assert(s);
4bff0a
 
4bff0a
@@ -1330,7 +1392,9 @@ static int service_spawn(
4bff0a
                 .exec_fd    = -1,
4bff0a
         };
4bff0a
         _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL, **fd_names = NULL;
4bff0a
+        _cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL;
4bff0a
         size_t n_socket_fds = 0, n_storage_fds = 0, n_env = 0;
4bff0a
+        _cleanup_close_ int exec_fd = -1;
4bff0a
         _cleanup_free_ int *fds = NULL;
4bff0a
         pid_t pid;
4bff0a
         int r;
4bff0a
@@ -1363,6 +1427,14 @@ static int service_spawn(
4bff0a
                 log_unit_debug(UNIT(s), "Passing %zu fds to service", n_socket_fds + n_storage_fds);
4bff0a
         }
4bff0a
 
4bff0a
+        if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
4bff0a
+                assert(!s->exec_fd_event_source);
4bff0a
+
4bff0a
+                r = service_allocate_exec_fd(s, &exec_fd_source, &exec_fd);
4bff0a
+                if (r < 0)
4bff0a
+                        return r;
4bff0a
+        }
4bff0a
+
4bff0a
         r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), timeout));
4bff0a
         if (r < 0)
4bff0a
                 return r;
4bff0a
@@ -1462,6 +1534,7 @@ static int service_spawn(
4bff0a
         exec_params.stdin_fd = s->stdin_fd;
4bff0a
         exec_params.stdout_fd = s->stdout_fd;
4bff0a
         exec_params.stderr_fd = s->stderr_fd;
4bff0a
+        exec_params.exec_fd = exec_fd;
4bff0a
 
4bff0a
         r = exec_spawn(UNIT(s),
4bff0a
                        c,
4bff0a
@@ -1473,6 +1546,9 @@ static int service_spawn(
4bff0a
         if (r < 0)
4bff0a
                 return r;
4bff0a
 
4bff0a
+        s->exec_fd_event_source = TAKE_PTR(exec_fd_source);
4bff0a
+        s->exec_fd_hot = false;
4bff0a
+
4bff0a
         r = unit_watch_pid(UNIT(s), pid);
4bff0a
         if (r < 0) /* FIXME: we need to do something here */
4bff0a
                 return r;
4bff0a
@@ -1984,14 +2060,12 @@ static void service_enter_start(Service *s) {
4bff0a
                 s->control_pid = pid;
4bff0a
                 service_set_state(s, SERVICE_START);
4bff0a
 
4bff0a
-        } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY)) {
4bff0a
+        } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_EXEC)) {
4bff0a
 
4bff0a
-                /* For oneshot services we wait until the start
4bff0a
-                 * process exited, too, but it is our main process. */
4bff0a
+                /* For oneshot services we wait until the start process exited, too, but it is our main process. */
4bff0a
 
4bff0a
-                /* For D-Bus services we know the main pid right away,
4bff0a
-                 * but wait for the bus name to appear on the
4bff0a
-                 * bus. Notify services are similar. */
4bff0a
+                /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the
4bff0a
+                 * bus. 'notify' and 'exec' services are similar. */
4bff0a
 
4bff0a
                 service_set_main_pid(s, pid);
4bff0a
                 service_set_state(s, SERVICE_START);
4bff0a
@@ -2444,6 +2518,13 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
4bff0a
         if (r < 0)
4bff0a
                 return r;
4bff0a
 
4bff0a
+        if (s->exec_fd_event_source) {
4bff0a
+                r = unit_serialize_item_fd(u, f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source));
4bff0a
+                if (r < 0)
4bff0a
+                        return r;
4bff0a
+                unit_serialize_item(u, f, "exec-fd-hot", yes_no(s->exec_fd_hot));
4bff0a
+        }
4bff0a
+
4bff0a
         if (UNIT_ISSET(s->accept_socket)) {
4bff0a
                 r = unit_serialize_item(u, f, "accept-socket", UNIT_DEREF(s->accept_socket)->id);
4bff0a
                 if (r < 0)
4bff0a
@@ -2777,6 +2858,18 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
4bff0a
                         s->stderr_fd = fdset_remove(fds, fd);
4bff0a
                         s->exec_context.stdio_as_fds = true;
4bff0a
                 }
4bff0a
+        } else if (streq(key, "exec-fd")) {
4bff0a
+                int fd;
4bff0a
+
4bff0a
+                if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
4bff0a
+                        log_unit_debug(u, "Failed to parse exec-fd value: %s", value);
4bff0a
+                else {
4bff0a
+                        s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source);
4bff0a
+
4bff0a
+                        fd = fdset_remove(fds, fd);
4bff0a
+                        if (service_allocate_exec_fd_event_source(s, fd, &s->exec_fd_event_source) < 0)
4bff0a
+                                safe_close(fd);
4bff0a
+                }
4bff0a
         } else if (streq(key, "watchdog-override-usec")) {
4bff0a
                 usec_t watchdog_override_usec;
4bff0a
                 if (timestamp_deserialize(value, &watchdog_override_usec) < 0)
4bff0a
@@ -2860,7 +2953,7 @@ static int service_watch_pid_file(Service *s) {
4bff0a
 
4bff0a
         log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path);
4bff0a
 
4bff0a
-        r = path_spec_watch(s->pid_file_pathspec, service_dispatch_io);
4bff0a
+        r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io);
4bff0a
         if (r < 0)
4bff0a
                 goto fail;
4bff0a
 
4bff0a
@@ -2904,7 +2997,7 @@ static int service_demand_pid_file(Service *s) {
4bff0a
         return service_watch_pid_file(s);
4bff0a
 }
4bff0a
 
4bff0a
-static int service_dispatch_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
4bff0a
+static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
4bff0a
         PathSpec *p = userdata;
4bff0a
         Service *s;
4bff0a
 
4bff0a
@@ -2937,6 +3030,59 @@ fail:
4bff0a
         return 0;
4bff0a
 }
4bff0a
 
4bff0a
+static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
4bff0a
+        Service *s = SERVICE(userdata);
4bff0a
+
4bff0a
+        assert(s);
4bff0a
+
4bff0a
+        log_unit_debug(UNIT(s), "got exec-fd event");
4bff0a
+
4bff0a
+        /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve()
4bff0a
+         * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically
4bff0a
+         * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the
4bff0a
+         * parent. We need to be careful however, as there are other reasons that we might cause the child's side of
4bff0a
+         * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
4bff0a
+         * the child signalled us first that it is about to call the execve(). It does so by sending us a simple
4bff0a
+         * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
4bff0a
+         * sends a zero byte we'll ignore POLLHUP on the fd again. */
4bff0a
+
4bff0a
+        for (;;) {
4bff0a
+                uint8_t x;
4bff0a
+                ssize_t n;
4bff0a
+
4bff0a
+                n = read(fd, &x, sizeof(x));
4bff0a
+                if (n < 0) {
4bff0a
+                        if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
4bff0a
+                                return 0;
4bff0a
+
4bff0a
+                        return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
4bff0a
+                }
4bff0a
+                if (n == 0) { /* EOF → the event we are waiting for */
4bff0a
+
4bff0a
+                        s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source);
4bff0a
+
4bff0a
+                        if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
4bff0a
+                                log_unit_debug(UNIT(s), "Got EOF on exec-fd");
4bff0a
+
4bff0a
+                                s->exec_fd_hot = false;
4bff0a
+
4bff0a
+                                /* Nice! This is what we have been waiting for. Transition to next state. */
4bff0a
+                                if (s->type == SERVICE_EXEC && s->state == SERVICE_START)
4bff0a
+                                        service_enter_start_post(s);
4bff0a
+                        } else
4bff0a
+                                log_unit_debug(UNIT(s), "Got EOF on exec-fd while it was disabled, ignoring.");
4bff0a
+
4bff0a
+                        return 0;
4bff0a
+                }
4bff0a
+
4bff0a
+                /* A byte was read → this turns on/off the exec fd logic */
4bff0a
+                assert(n == sizeof(x));
4bff0a
+                s->exec_fd_hot = x;
4bff0a
+        }
4bff0a
+
4bff0a
+        return 0;
4bff0a
+}
4bff0a
+
4bff0a
 static void service_notify_cgroup_empty_event(Unit *u) {
4bff0a
         Service *s = SERVICE(u);
4bff0a
 
4bff0a
@@ -3850,7 +3996,8 @@ static const char* const service_type_table[_SERVICE_TYPE_MAX] = {
4bff0a
         [SERVICE_ONESHOT] = "oneshot",
4bff0a
         [SERVICE_DBUS] = "dbus",
4bff0a
         [SERVICE_NOTIFY] = "notify",
4bff0a
-        [SERVICE_IDLE] = "idle"
4bff0a
+        [SERVICE_IDLE] = "idle",
4bff0a
+        [SERVICE_EXEC] = "exec",
4bff0a
 };
4bff0a
 
4bff0a
 DEFINE_STRING_TABLE_LOOKUP(service_type, ServiceType);
4bff0a
diff --git a/src/core/service.h b/src/core/service.h
4bff0a
index a142b09f0d..1206e3cdda 100644
4bff0a
--- a/src/core/service.h
4bff0a
+++ b/src/core/service.h
4bff0a
@@ -30,6 +30,7 @@ typedef enum ServiceType {
4bff0a
         SERVICE_DBUS,     /* we fork and wait until a specific D-Bus name appears on the bus */
4bff0a
         SERVICE_NOTIFY,   /* we fork and wait until a daemon sends us a ready message with sd_notify() */
4bff0a
         SERVICE_IDLE,     /* much like simple, but delay exec() until all jobs are dispatched. */
4bff0a
+        SERVICE_EXEC,     /* we fork and wait until we execute exec() (this means our own setup is waited for) */
4bff0a
         _SERVICE_TYPE_MAX,
4bff0a
         _SERVICE_TYPE_INVALID = -1
4bff0a
 } ServiceType;
4bff0a
@@ -165,6 +166,8 @@ struct Service {
4bff0a
         NotifyAccess notify_access;
4bff0a
         NotifyState notify_state;
4bff0a
 
4bff0a
+        sd_event_source *exec_fd_event_source;
4bff0a
+
4bff0a
         ServiceFDStore *fd_store;
4bff0a
         size_t n_fd_store;
4bff0a
         unsigned n_fd_store_max;
4bff0a
@@ -179,6 +182,7 @@ struct Service {
4bff0a
 
4bff0a
         unsigned n_restarts;
4bff0a
         bool flush_n_restarts;
4bff0a
+        bool exec_fd_hot;
4bff0a
 };
4bff0a
 
4bff0a
 extern const UnitVTable service_vtable;
4bff0a
diff --git a/src/core/socket.c b/src/core/socket.c
4bff0a
index 56d32225c4..d488c64e91 100644
4bff0a
--- a/src/core/socket.c
4bff0a
+++ b/src/core/socket.c
4bff0a
@@ -1867,10 +1867,11 @@ static int socket_coldplug(Unit *u) {
4bff0a
 static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) {
4bff0a
 
4bff0a
         ExecParameters exec_params = {
4bff0a
-                .flags      = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
4bff0a
-                .stdin_fd   = -1,
4bff0a
-                .stdout_fd  = -1,
4bff0a
-                .stderr_fd  = -1,
4bff0a
+                .flags     = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
4bff0a
+                .stdin_fd  = -1,
4bff0a
+                .stdout_fd = -1,
4bff0a
+                .stderr_fd = -1,
4bff0a
+                .exec_fd   = -1,
4bff0a
         };
4bff0a
         pid_t pid;
4bff0a
         int r;
4bff0a
diff --git a/src/core/swap.c b/src/core/swap.c
4bff0a
index b78b1aa266..e01e61e56d 100644
4bff0a
--- a/src/core/swap.c
4bff0a
+++ b/src/core/swap.c
4bff0a
@@ -606,6 +606,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) {
4bff0a
                 .stdin_fd  = -1,
4bff0a
                 .stdout_fd = -1,
4bff0a
                 .stderr_fd = -1,
4bff0a
+                .exec_fd   = -1,
4bff0a
         };
4bff0a
         pid_t pid;
4bff0a
         int r;