aed857
From 13bcf85ffab4b4e67039599246604a3f5b503975 Mon Sep 17 00:00:00 2001
aed857
From: David Tardon <dtardon@redhat.com>
aed857
Date: Tue, 24 Apr 2018 15:19:38 +0200
aed857
Subject: [PATCH] fix race between daemon-reload and other commands
aed857
aed857
When "systemctl daemon-reload" is run at the same time as "systemctl
aed857
start foo", the latter might hang. That's because commands like start
aed857
wait for JobRemoved signal to know when the job is finished. But if the
aed857
job is finished during reloading, the signal is never sent.
aed857
aed857
The hang can be easily reproduced by running
aed857
aed857
    # for ((N=1; N>0; N++)) ; do echo $N ; systemctl daemon-reload ; done
aed857
    # for ((N=1; N>0; N++)) ; do echo $N ; systemctl start systemd-coredump.socket ; done
aed857
aed857
in two different terminals. The start command will hang after 1-2
aed857
iterations.
aed857
aed857
This keeps track of jobs that were started before reload and finished
aed857
during it and sends JobRemoved after the reload has finished.
aed857
aed857
(cherry picked from commit a7a7163df7fc8a9f794f6803b2f6c9c9b0745a1f)
aed857
---
aed857
 src/core/job.c     | 45 ++++++++++++++++++++++++++++++++++++++++-----
aed857
 src/core/job.h     |  2 ++
aed857
 src/core/manager.c | 15 +++++++++++++++
aed857
 src/core/manager.h |  3 +++
aed857
 4 files changed, 60 insertions(+), 5 deletions(-)
aed857
aed857
diff --git a/src/core/job.c b/src/core/job.c
c62b8e
index 1861c8a633..275503169b 100644
aed857
--- a/src/core/job.c
aed857
+++ b/src/core/job.c
aed857
@@ -53,6 +53,7 @@ Job* job_new_raw(Unit *unit) {
aed857
         j->manager = unit->manager;
aed857
         j->unit = unit;
aed857
         j->type = _JOB_TYPE_INVALID;
aed857
+        j->reloaded = false;
aed857
 
aed857
         return j;
aed857
 }
aed857
@@ -74,7 +75,7 @@ Job* job_new(Unit *unit, JobType type) {
aed857
         return j;
aed857
 }
aed857
 
aed857
-void job_free(Job *j) {
aed857
+void job_unlink(Job *j) {
aed857
         assert(j);
aed857
         assert(!j->installed);
aed857
         assert(!j->transaction_prev);
aed857
@@ -82,13 +83,28 @@ void job_free(Job *j) {
aed857
         assert(!j->subject_list);
aed857
         assert(!j->object_list);
aed857
 
aed857
-        if (j->in_run_queue)
aed857
+        if (j->in_run_queue) {
aed857
                 LIST_REMOVE(run_queue, j->manager->run_queue, j);
aed857
+                j->in_run_queue = false;
aed857
+        }
aed857
 
aed857
-        if (j->in_dbus_queue)
aed857
+        if (j->in_dbus_queue) {
aed857
                 LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j);
aed857
+                j->in_dbus_queue = false;
aed857
+        }
aed857
+
aed857
+        j->timer_event_source = sd_event_source_unref(j->timer_event_source);
aed857
+}
aed857
+
aed857
+void job_free(Job *j) {
aed857
+        assert(j);
aed857
+        assert(!j->installed);
aed857
+        assert(!j->transaction_prev);
aed857
+        assert(!j->transaction_next);
aed857
+        assert(!j->subject_list);
aed857
+        assert(!j->object_list);
aed857
 
aed857
-        sd_event_source_unref(j->timer_event_source);
aed857
+        job_unlink(j);
aed857
 
aed857
         sd_bus_track_unref(j->clients);
aed857
         strv_free(j->deserialized_clients);
aed857
@@ -246,6 +262,7 @@ int job_install_deserialized(Job *j) {
aed857
 
aed857
         *pj = j;
aed857
         j->installed = true;
aed857
+        j->reloaded = true;
aed857
 
aed857
         if (j->state == JOB_RUNNING)
aed857
                 j->unit->manager->n_running_jobs++;
aed857
@@ -790,6 +807,19 @@ static void job_emit_status_message(Unit *u, JobType t, JobResult result) {
aed857
                 job_print_status_message(u, t, result);
aed857
 }
aed857
 
aed857
+static int job_save_pending_finished_job(Job *j) {
aed857
+        int r;
aed857
+
aed857
+        assert(j);
aed857
+
aed857
+        r = set_ensure_allocated(&j->manager->pending_finished_jobs, NULL);
aed857
+        if (r < 0)
aed857
+                return r;
aed857
+
aed857
+        job_unlink(j);
aed857
+        return set_put(j->manager->pending_finished_jobs, j);
aed857
+}
aed857
+
aed857
 int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) {
aed857
         Unit *u;
aed857
         Unit *other;
aed857
@@ -829,7 +859,12 @@ int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool alr
aed857
                 j->manager->n_failed_jobs ++;
aed857
 
aed857
         job_uninstall(j);
aed857
-        job_free(j);
aed857
+        /* Remember jobs started before the reload */
aed857
+        if (j->manager->n_reloading > 0 && j->reloaded) {
aed857
+                if (job_save_pending_finished_job(j) < 0)
aed857
+                        job_free(j);
aed857
+        } else
aed857
+                job_free(j);
aed857
 
aed857
         /* Fail depending jobs on failure */
aed857
         if (result != JOB_DONE && recursive) {
aed857
diff --git a/src/core/job.h b/src/core/job.h
c62b8e
index 535052b48f..4ae6f28020 100644
aed857
--- a/src/core/job.h
aed857
+++ b/src/core/job.h
aed857
@@ -172,10 +172,12 @@ struct Job {
aed857
         bool sent_dbus_new_signal:1;
aed857
         bool ignore_order:1;
aed857
         bool irreversible:1;
aed857
+        bool reloaded:1;
aed857
 };
aed857
 
aed857
 Job* job_new(Unit *unit, JobType type);
aed857
 Job* job_new_raw(Unit *unit);
aed857
+void job_unlink(Job *job);
aed857
 void job_free(Job *job);
aed857
 Job* job_install(Job *j);
aed857
 int job_install_deserialized(Job *j);
aed857
diff --git a/src/core/manager.c b/src/core/manager.c
c62b8e
index 47b09e1e93..9c406bb5bf 100644
aed857
--- a/src/core/manager.c
aed857
+++ b/src/core/manager.c
aed857
@@ -2702,6 +2702,18 @@ finish:
aed857
         return r;
aed857
 }
aed857
 
aed857
+static void manager_flush_finished_jobs(Manager *m) {
aed857
+        Job *j;
aed857
+
aed857
+        while ((j = set_steal_first(m->pending_finished_jobs))) {
aed857
+                bus_job_send_removed_signal(j);
aed857
+                job_free(j);
aed857
+        }
aed857
+
aed857
+        set_free(m->pending_finished_jobs);
aed857
+        m->pending_finished_jobs = NULL;
aed857
+}
aed857
+
aed857
 int manager_reload(Manager *m) {
aed857
         int r, q;
aed857
         _cleanup_fclose_ FILE *f = NULL;
aed857
@@ -2784,6 +2796,9 @@ int manager_reload(Manager *m) {
aed857
         assert(m->n_reloading > 0);
aed857
         m->n_reloading--;
aed857
 
aed857
+        if (m->n_reloading <= 0)
aed857
+                manager_flush_finished_jobs(m);
aed857
+
aed857
         m->send_reloading_done = true;
aed857
 
aed857
         return r;
aed857
diff --git a/src/core/manager.h b/src/core/manager.h
c62b8e
index e91e7bd8b3..90d2d982e1 100644
aed857
--- a/src/core/manager.h
aed857
+++ b/src/core/manager.h
aed857
@@ -270,6 +270,9 @@ struct Manager {
aed857
 
aed857
         /* non-zero if we are reloading or reexecuting, */
aed857
         int n_reloading;
aed857
+        /* A set which contains all jobs that started before reload and finished
aed857
+         * during it */
aed857
+        Set *pending_finished_jobs;
aed857
 
aed857
         unsigned n_installed_jobs;
aed857
         unsigned n_failed_jobs;