Blame SOURCES/0044-Ticket-48184-close-connections-at-shutdown-cleanly.patch

b045b9
From 0c1fbfaf77d6f7b2a6628deaf309bbe1c3e7a8e8 Mon Sep 17 00:00:00 2001
b045b9
From: William Brown <firstyear@redhat.com>
b045b9
Date: Tue, 28 Nov 2017 13:39:19 +0100
b045b9
Subject: [PATCH] Ticket 48184 - close connections at shutdown cleanly.
b045b9
b045b9
Bug Description:  During shutdown we would not close connections.
b045b9
In the past this may have just been an annoyance, but now with the way
b045b9
nunc-stans works, io events can still trigger on open xeisting connectinos
b045b9
during shutdown.
b045b9
b045b9
Fix Description:  Close connections during shutdown rather than
b045b9
leaving them alive.
b045b9
b045b9
https://pagure.io/389-ds-base/issue/48184
b045b9
b045b9
Author: wibrown
b045b9
b045b9
Review by: lkrispen, vashirov (Thank you!)
b045b9
---
b045b9
 ldap/servers/slapd/conntable.c | 13 +++++++
b045b9
 ldap/servers/slapd/daemon.c    | 77 ++++++++++++++++++++++++++----------------
b045b9
 ldap/servers/slapd/fe.h        |  1 +
b045b9
 ldap/servers/slapd/slap.h      |  1 +
b045b9
 4 files changed, 63 insertions(+), 29 deletions(-)
b045b9
b045b9
diff --git a/ldap/servers/slapd/conntable.c b/ldap/servers/slapd/conntable.c
b045b9
index 7c57b47cd..f2f763dfa 100644
b045b9
--- a/ldap/servers/slapd/conntable.c
b045b9
+++ b/ldap/servers/slapd/conntable.c
b045b9
@@ -91,6 +91,19 @@ connection_table_abandon_all_operations(Connection_Table *ct)
b045b9
     }
b045b9
 }
b045b9
 
b045b9
+void
b045b9
+connection_table_disconnect_all(Connection_Table *ct)
b045b9
+{
b045b9
+    for (size_t i = 0; i < ct->size; i++) {
b045b9
+        if (ct->c[i].c_mutex) {
b045b9
+            Connection *c = &(ct->c[i]);
b045b9
+            PR_EnterMonitor(c->c_mutex);
b045b9
+            disconnect_server_nomutex(c, c->c_connid, -1, SLAPD_DISCONNECT_ABORT, ECANCELED);
b045b9
+            PR_ExitMonitor(c->c_mutex);
b045b9
+        }
b045b9
+    }
b045b9
+}
b045b9
+
b045b9
 /* Given a file descriptor for a socket, this function will return
b045b9
  * a slot in the connection table to use.
b045b9
  *
b045b9
diff --git a/ldap/servers/slapd/daemon.c b/ldap/servers/slapd/daemon.c
b045b9
index 4e0466ab3..c245a4d4e 100644
b045b9
--- a/ldap/servers/slapd/daemon.c
b045b9
+++ b/ldap/servers/slapd/daemon.c
b045b9
@@ -1176,6 +1176,30 @@ slapd_daemon(daemon_ports_t *ports, ns_thrpool_t *tp)
b045b9
     housekeeping_stop(); /* Run this after op_thread_cleanup() logged sth */
b045b9
     disk_monitoring_stop();
b045b9
 
b045b9
+    /*
b045b9
+     * Now that they are abandonded, we need to mark them as done.
b045b9
+     * In NS while it's safe to allow excess jobs to be cleaned by
b045b9
+     * by the walk and ns_job_done of remaining queued events, the
b045b9
+     * issue is that if we allow something to live past this point
b045b9
+     * the CT is freed from underneath, and bad things happen (tm).
b045b9
+     *
b045b9
+     * NOTE: We do this after we stop psearch, because there could
b045b9
+     * be a race between flagging the psearch done, and users still
b045b9
+     * try to send on the connection. Similar with op_threads.
b045b9
+     */
b045b9
+    connection_table_disconnect_all(the_connection_table);
b045b9
+
b045b9
+    /*
b045b9
+     * WARNING: Normally we should close the tp in main
b045b9
+     * but because of issues in the current connection design
b045b9
+     * we need to close it here to guarantee events won't fire!
b045b9
+     *
b045b9
+     * All the connection close jobs "should" complete before
b045b9
+     * shutdown at least.
b045b9
+     */
b045b9
+    ns_thrpool_shutdown(tp);
b045b9
+    ns_thrpool_wait(tp);
b045b9
+
b045b9
     threads = g_get_active_threadcnt();
b045b9
     if (threads > 0) {
b045b9
         slapi_log_err(SLAPI_LOG_INFO, "slapd_daemon",
b045b9
@@ -1628,23 +1652,18 @@ ns_handle_closure(struct ns_job_t *job)
b045b9
     Connection *c = (Connection *)ns_job_get_data(job);
b045b9
     int do_yield = 0;
b045b9
 
b045b9
-/* this function must be called from the event loop thread */
b045b9
-#ifdef DEBUG
b045b9
-    PR_ASSERT(0 == NS_JOB_IS_THREAD(ns_job_get_type(job)));
b045b9
-#else
b045b9
-    /* This doesn't actually confirm it's in the event loop thread, but it's a start */
b045b9
-    if (NS_JOB_IS_THREAD(ns_job_get_type(job)) != 0) {
b045b9
-        slapi_log_err(SLAPI_LOG_ERR, "ns_handle_closure", "Attempt to close outside of event loop thread %" PRIu64 " for fd=%d\n",
b045b9
-                      c->c_connid, c->c_sd);
b045b9
-        return;
b045b9
-    }
b045b9
-#endif
b045b9
     PR_EnterMonitor(c->c_mutex);
b045b9
+    /* Assert we really have the right job state. */
b045b9
+    PR_ASSERT(job == c->c_job);
b045b9
+
b045b9
     connection_release_nolock_ext(c, 1); /* release ref acquired for event framework */
b045b9
     PR_ASSERT(c->c_ns_close_jobs == 1);  /* should be exactly 1 active close job - this one */
b045b9
     c->c_ns_close_jobs--;                /* this job is processing closure */
b045b9
+    /* Because handle closure will add a new job, we need to detach our current one. */
b045b9
+    c->c_job = NULL;
b045b9
     do_yield = ns_handle_closure_nomutex(c);
b045b9
     PR_ExitMonitor(c->c_mutex);
b045b9
+    /* Remove this task now. */
b045b9
     ns_job_done(job);
b045b9
     if (do_yield) {
b045b9
         /* closure not done - another reference still outstanding */
b045b9
@@ -1667,6 +1686,14 @@ ns_connection_post_io_or_closing(Connection *conn)
b045b9
         return;
b045b9
     }
b045b9
 
b045b9
+    /*
b045b9
+     * Cancel any existing ns jobs we have registered.
b045b9
+     */
b045b9
+    if (conn->c_job != NULL) {
b045b9
+        ns_job_done(conn->c_job);
b045b9
+        conn->c_job = NULL;
b045b9
+    }
b045b9
+
b045b9
     if (CONN_NEEDS_CLOSING(conn)) {
b045b9
         /* there should only ever be 0 or 1 active closure jobs */
b045b9
         PR_ASSERT((conn->c_ns_close_jobs == 0) || (conn->c_ns_close_jobs == 1));
b045b9
@@ -1676,13 +1703,10 @@ ns_connection_post_io_or_closing(Connection *conn)
b045b9
                           conn->c_connid, conn->c_sd);
b045b9
             return;
b045b9
         } else {
b045b9
-            /* just make sure we schedule the event to be closed in a timely manner */
b045b9
-            tv.tv_sec = 0;
b045b9
-            tv.tv_usec = slapd_wakeup_timer * 1000;
b045b9
             conn->c_ns_close_jobs++;                                                      /* now 1 active closure job */
b045b9
             connection_acquire_nolock_ext(conn, 1 /* allow acquire even when closing */); /* event framework now has a reference */
b045b9
-            ns_result_t job_result = ns_add_timeout_job(conn->c_tp, &tv, NS_JOB_TIMER,
b045b9
-                                                        ns_handle_closure, conn, NULL);
b045b9
+            /* Close the job asynchronously. Why? */
b045b9
+            ns_result_t job_result = ns_add_job(conn->c_tp, NS_JOB_TIMER, ns_handle_closure, conn, &(conn->c_job));
b045b9
             if (job_result != NS_SUCCESS) {
b045b9
                 if (job_result == NS_SHUTDOWN) {
b045b9
                     slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post closure job "
b045b9
@@ -1726,7 +1750,7 @@ ns_connection_post_io_or_closing(Connection *conn)
b045b9
 #endif
b045b9
         ns_result_t job_result = ns_add_io_timeout_job(conn->c_tp, conn->c_prfd, &tv,
b045b9
                                                        NS_JOB_READ | NS_JOB_PRESERVE_FD,
b045b9
-                                                       ns_handle_pr_read_ready, conn, NULL);
b045b9
+                                                       ns_handle_pr_read_ready, conn, &(conn->c_job));
b045b9
         if (job_result != NS_SUCCESS) {
b045b9
             if (job_result == NS_SHUTDOWN) {
b045b9
                 slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post I/O job for "
b045b9
@@ -1755,19 +1779,13 @@ ns_handle_pr_read_ready(struct ns_job_t *job)
b045b9
     int maxthreads = config_get_maxthreadsperconn();
b045b9
     Connection *c = (Connection *)ns_job_get_data(job);
b045b9
 
b045b9
-/* this function must be called from the event loop thread */
b045b9
-#ifdef DEBUG
b045b9
-    PR_ASSERT(0 == NS_JOB_IS_THREAD(ns_job_get_type(job)));
b045b9
-#else
b045b9
-    /* This doesn't actually confirm it's in the event loop thread, but it's a start */
b045b9
-    if (NS_JOB_IS_THREAD(ns_job_get_type(job)) != 0) {
b045b9
-        slapi_log_err(SLAPI_LOG_ERR, "ns_handle_pr_read_ready", "Attempt to handle read ready outside of event loop thread %" PRIu64 " for fd=%d\n",
b045b9
-                      c->c_connid, c->c_sd);
b045b9
-        return;
b045b9
-    }
b045b9
-#endif
b045b9
-
b045b9
     PR_EnterMonitor(c->c_mutex);
b045b9
+    /* Assert we really have the right job state. */
b045b9
+    PR_ASSERT(job == c->c_job);
b045b9
+
b045b9
+    /* On all code paths we remove the job, so set it null now */
b045b9
+    c->c_job = NULL;
b045b9
+
b045b9
     slapi_log_err(SLAPI_LOG_CONNS, "ns_handle_pr_read_ready", "activity on conn %" PRIu64 " for fd=%d\n",
b045b9
                   c->c_connid, c->c_sd);
b045b9
     /* if we were called due to some i/o event, see what the state of the socket is */
b045b9
@@ -1826,6 +1844,7 @@ ns_handle_pr_read_ready(struct ns_job_t *job)
b045b9
         slapi_log_err(SLAPI_LOG_CONNS, "ns_handle_pr_read_ready", "queued conn %" PRIu64 " for fd=%d\n",
b045b9
                       c->c_connid, c->c_sd);
b045b9
     }
b045b9
+    /* Since we call done on the job, we need to remove it here. */
b045b9
     PR_ExitMonitor(c->c_mutex);
b045b9
     ns_job_done(job);
b045b9
     return;
b045b9
diff --git a/ldap/servers/slapd/fe.h b/ldap/servers/slapd/fe.h
b045b9
index 4d25a9fb8..f47bb6145 100644
b045b9
--- a/ldap/servers/slapd/fe.h
b045b9
+++ b/ldap/servers/slapd/fe.h
b045b9
@@ -100,6 +100,7 @@ extern Connection_Table *the_connection_table; /* JCM - Exported from globals.c
b045b9
 Connection_Table *connection_table_new(int table_size);
b045b9
 void connection_table_free(Connection_Table *ct);
b045b9
 void connection_table_abandon_all_operations(Connection_Table *ct);
b045b9
+void connection_table_disconnect_all(Connection_Table *ct);
b045b9
 Connection *connection_table_get_connection(Connection_Table *ct, int sd);
b045b9
 int connection_table_move_connection_out_of_active_list(Connection_Table *ct, Connection *c);
b045b9
 void connection_table_move_connection_on_to_active_list(Connection_Table *ct, Connection *c);
b045b9
diff --git a/ldap/servers/slapd/slap.h b/ldap/servers/slapd/slap.h
b045b9
index 830944f72..08754d8fb 100644
b045b9
--- a/ldap/servers/slapd/slap.h
b045b9
+++ b/ldap/servers/slapd/slap.h
b045b9
@@ -1644,6 +1644,7 @@ typedef struct conn
b045b9
     void *c_io_layer_cb_data;            /* callback data */
b045b9
     struct connection_table *c_ct;       /* connection table that this connection belongs to */
b045b9
     ns_thrpool_t *c_tp;                  /* thread pool for this connection */
b045b9
+    struct ns_job_t *c_job;                     /* If it exists, the current ns_job_t */
b045b9
     int c_ns_close_jobs;                 /* number of current close jobs */
b045b9
     char *c_ipaddr;                      /* ip address str - used by monitor */
b045b9
 } Connection;
b045b9
-- 
b045b9
2.13.6
b045b9