Blame SOURCES/0083-Ticket-48184-clean-up-and-delete-connections-at-shut.patch

3b7e51
From 5a5d3dffd0b36edb543fd31fa53d7128dd5161c2 Mon Sep 17 00:00:00 2001
3b7e51
From: Thierry Bordaz <tbordaz@redhat.com>
3b7e51
Date: Fri, 18 May 2018 10:13:46 +0200
3b7e51
Subject: [PATCH] Ticket 48184 - clean up and delete connections at shutdown
3b7e51
 (2nd try)
3b7e51
3b7e51
Bug description:
3b7e51
    During shutdown we would not close connections.
3b7e51
    In the past this may have just been an annoyance, but now with the way
3b7e51
    nunc-stans works, io events can still trigger on open xeisting connectinos
3b7e51
    during shutdown.
3b7e51
3b7e51
    Because of NS dynamic it can happen that several jobs wants to work on the
3b7e51
    same connection. In such case (a job is already set in c_job) we delay the
3b7e51
    new job that will retry.
3b7e51
    In addition:
3b7e51
	- some call needed c_mutex
3b7e51
	- test uninitialized nunc-stans in case of shutdown while startup is not completed
3b7e51
3b7e51
Fix Description:  Close connections during shutdown rather than
3b7e51
    leaving them alive.
3b7e51
3b7e51
https://pagure.io/389-ds-base/issue/48184
3b7e51
3b7e51
Reviewed by:
3b7e51
	Original was Ludwig and Viktor
3b7e51
	Second fix reviewed by Mark
3b7e51
3b7e51
Platforms tested: F26
3b7e51
3b7e51
Flag Day: no
3b7e51
3b7e51
Doc impact: no
3b7e51
3b7e51
(cherry picked from commit e562157ca3e97867d902996cc18fb04f90dc10a8)
3b7e51
---
3b7e51
 ldap/servers/slapd/connection.c |   2 +
3b7e51
 ldap/servers/slapd/conntable.c  |  13 ++++
3b7e51
 ldap/servers/slapd/daemon.c     | 131 ++++++++++++++++++++++++++++------------
3b7e51
 ldap/servers/slapd/fe.h         |   1 +
3b7e51
 ldap/servers/slapd/slap.h       |   1 +
3b7e51
 5 files changed, 108 insertions(+), 40 deletions(-)
3b7e51
3b7e51
diff --git a/ldap/servers/slapd/connection.c b/ldap/servers/slapd/connection.c
3b7e51
index b5030f0cb..76e83112b 100644
3b7e51
--- a/ldap/servers/slapd/connection.c
3b7e51
+++ b/ldap/servers/slapd/connection.c
3b7e51
@@ -1716,7 +1716,9 @@ connection_threadmain()
3b7e51
         if ((tag != LDAP_REQ_UNBIND) && !thread_turbo_flag && !replication_connection) {
3b7e51
             if (!more_data) {
3b7e51
                 conn->c_flags &= ~CONN_FLAG_MAX_THREADS;
3b7e51
+                PR_EnterMonitor(conn->c_mutex);
3b7e51
                 connection_make_readable_nolock(conn);
3b7e51
+                PR_ExitMonitor(conn->c_mutex);
3b7e51
                 /* once the connection is readable, another thread may access conn,
3b7e51
                  * so need locking from here on */
3b7e51
                 signal_listner();
3b7e51
diff --git a/ldap/servers/slapd/conntable.c b/ldap/servers/slapd/conntable.c
3b7e51
index 7c57b47cd..f2f763dfa 100644
3b7e51
--- a/ldap/servers/slapd/conntable.c
3b7e51
+++ b/ldap/servers/slapd/conntable.c
3b7e51
@@ -91,6 +91,19 @@ connection_table_abandon_all_operations(Connection_Table *ct)
3b7e51
     }
3b7e51
 }
3b7e51
 
3b7e51
+void
3b7e51
+connection_table_disconnect_all(Connection_Table *ct)
3b7e51
+{
3b7e51
+    for (size_t i = 0; i < ct->size; i++) {
3b7e51
+        if (ct->c[i].c_mutex) {
3b7e51
+            Connection *c = &(ct->c[i]);
3b7e51
+            PR_EnterMonitor(c->c_mutex);
3b7e51
+            disconnect_server_nomutex(c, c->c_connid, -1, SLAPD_DISCONNECT_ABORT, ECANCELED);
3b7e51
+            PR_ExitMonitor(c->c_mutex);
3b7e51
+        }
3b7e51
+    }
3b7e51
+}
3b7e51
+
3b7e51
 /* Given a file descriptor for a socket, this function will return
3b7e51
  * a slot in the connection table to use.
3b7e51
  *
3b7e51
diff --git a/ldap/servers/slapd/daemon.c b/ldap/servers/slapd/daemon.c
3b7e51
index fcc461a90..50e67474e 100644
3b7e51
--- a/ldap/servers/slapd/daemon.c
3b7e51
+++ b/ldap/servers/slapd/daemon.c
3b7e51
@@ -1087,12 +1087,18 @@ slapd_daemon(daemon_ports_t *ports, ns_thrpool_t *tp)
3b7e51
         /* we have exited from ns_thrpool_wait. This means we are shutting down! */
3b7e51
         /* Please see https://firstyear.fedorapeople.org/nunc-stans/md_docs_job-safety.html */
3b7e51
         /* tldr is shutdown needs to run first to allow job_done on an ARMED job */
3b7e51
-        for (size_t i = 0; i < listeners; i++) {
3b7e51
-            PRStatus shutdown_status = ns_job_done(listener_idxs[i].ns_job);
3b7e51
-            if (shutdown_status != PR_SUCCESS) {
3b7e51
-                slapi_log_err(SLAPI_LOG_CRIT, "ns_set_shutdown", "Failed to shutdown listener idx %" PRIu64 " !\n", i);
3b7e51
+        for (uint64_t i = 0; i < listeners; i++) {
3b7e51
+            PRStatus shutdown_status;
3b7e51
+
3b7e51
+            if (listener_idxs[i].ns_job) {
3b7e51
+                shutdown_status = ns_job_done(listener_idxs[i].ns_job);
3b7e51
+                if (shutdown_status != PR_SUCCESS) {
3b7e51
+                    slapi_log_err(SLAPI_LOG_CRIT, "ns_set_shutdown", "Failed to shutdown listener idx %" PRIu64 " !\n", i);
3b7e51
+                }
3b7e51
+                PR_ASSERT(shutdown_status == PR_SUCCESS);
3b7e51
+            } else {
3b7e51
+                slapi_log_err(SLAPI_LOG_CRIT, "slapd_daemon", "Listeners uninitialized. Possibly the server was shutdown while starting\n");
3b7e51
             }
3b7e51
-            PR_ASSERT(shutdown_status == PR_SUCCESS);
3b7e51
             listener_idxs[i].ns_job = NULL;
3b7e51
         }
3b7e51
     } else {
3b7e51
@@ -1176,6 +1182,32 @@ slapd_daemon(daemon_ports_t *ports, ns_thrpool_t *tp)
3b7e51
     housekeeping_stop(); /* Run this after op_thread_cleanup() logged sth */
3b7e51
     disk_monitoring_stop();
3b7e51
 
3b7e51
+    /*
3b7e51
+     * Now that they are abandonded, we need to mark them as done.
3b7e51
+     * In NS while it's safe to allow excess jobs to be cleaned by
3b7e51
+     * by the walk and ns_job_done of remaining queued events, the
3b7e51
+     * issue is that if we allow something to live past this point
3b7e51
+     * the CT is freed from underneath, and bad things happen (tm).
3b7e51
+     *
3b7e51
+     * NOTE: We do this after we stop psearch, because there could
3b7e51
+     * be a race between flagging the psearch done, and users still
3b7e51
+     * try to send on the connection. Similar with op_threads.
3b7e51
+     */
3b7e51
+    connection_table_disconnect_all(the_connection_table);
3b7e51
+
3b7e51
+    /*
3b7e51
+     * WARNING: Normally we should close the tp in main
3b7e51
+     * but because of issues in the current connection design
3b7e51
+     * we need to close it here to guarantee events won't fire!
3b7e51
+     *
3b7e51
+     * All the connection close jobs "should" complete before
3b7e51
+     * shutdown at least.
3b7e51
+     */
3b7e51
+    if (enable_nunc_stans) {
3b7e51
+        ns_thrpool_shutdown(tp);
3b7e51
+        ns_thrpool_wait(tp);
3b7e51
+    }
3b7e51
+
3b7e51
     threads = g_get_active_threadcnt();
3b7e51
     if (threads > 0) {
3b7e51
         slapi_log_err(SLAPI_LOG_INFO, "slapd_daemon",
3b7e51
@@ -1628,25 +1660,18 @@ ns_handle_closure(struct ns_job_t *job)
3b7e51
     Connection *c = (Connection *)ns_job_get_data(job);
3b7e51
     int do_yield = 0;
3b7e51
 
3b7e51
-/* this function must be called from the event loop thread */
3b7e51
-#ifdef DEBUG
3b7e51
-    PR_ASSERT(0 == NS_JOB_IS_THREAD(ns_job_get_type(job)));
3b7e51
-#else
3b7e51
-    /* This doesn't actually confirm it's in the event loop thread, but it's a start */
3b7e51
-    if (NS_JOB_IS_THREAD(ns_job_get_type(job)) != 0) {
3b7e51
-        slapi_log_err(SLAPI_LOG_ERR, "ns_handle_closure", "Attempt to close outside of event loop thread %" PRIu64 " for fd=%d\n",
3b7e51
-                      c->c_connid, c->c_sd);
3b7e51
-        return;
3b7e51
-    }
3b7e51
-#endif
3b7e51
-
3b7e51
     PR_EnterMonitor(c->c_mutex);
3b7e51
+    /* Assert we really have the right job state. */
3b7e51
+    PR_ASSERT(job == c->c_job);
3b7e51
 
3b7e51
     connection_release_nolock_ext(c, 1); /* release ref acquired for event framework */
3b7e51
     PR_ASSERT(c->c_ns_close_jobs == 1);  /* should be exactly 1 active close job - this one */
3b7e51
     c->c_ns_close_jobs--;                /* this job is processing closure */
3b7e51
+    /* Because handle closure will add a new job, we need to detach our current one. */
3b7e51
+    c->c_job = NULL;
3b7e51
     do_yield = ns_handle_closure_nomutex(c);
3b7e51
     PR_ExitMonitor(c->c_mutex);
3b7e51
+    /* Remove this task now. */
3b7e51
     ns_job_done(job);
3b7e51
     if (do_yield) {
3b7e51
         /* closure not done - another reference still outstanding */
3b7e51
@@ -1659,14 +1684,25 @@ ns_handle_closure(struct ns_job_t *job)
3b7e51
 /**
3b7e51
  * Schedule more I/O for this connection, or make sure that it
3b7e51
  * is closed in the event loop.
3b7e51
+ * caller must hold c_mutex
3b7e51
+ * It returns
3b7e51
+ *  0 on success
3b7e51
+ *  1 on need to retry
3b7e51
  */
3b7e51
-void
3b7e51
-ns_connection_post_io_or_closing(Connection *conn)
3b7e51
+static int
3b7e51
+ns_connection_post_io_or_closing_try(Connection *conn)
3b7e51
 {
3b7e51
     struct timeval tv;
3b7e51
 
3b7e51
     if (!enable_nunc_stans) {
3b7e51
-        return;
3b7e51
+        return 0;
3b7e51
+    }
3b7e51
+
3b7e51
+    /*
3b7e51
+     * Cancel any existing ns jobs we have registered.
3b7e51
+     */
3b7e51
+    if (conn->c_job != NULL) {
3b7e51
+        return 1;
3b7e51
     }
3b7e51
 
3b7e51
     if (CONN_NEEDS_CLOSING(conn)) {
3b7e51
@@ -1676,15 +1712,12 @@ ns_connection_post_io_or_closing(Connection *conn)
3b7e51
             slapi_log_err(SLAPI_LOG_CONNS, "ns_connection_post_io_or_closing", "Already a close "
3b7e51
                                                                                "job in progress on conn %" PRIu64 " for fd=%d\n",
3b7e51
                           conn->c_connid, conn->c_sd);
3b7e51
-            return;
3b7e51
+            return 0;
3b7e51
         } else {
3b7e51
-            /* just make sure we schedule the event to be closed in a timely manner */
3b7e51
-            tv.tv_sec = 0;
3b7e51
-            tv.tv_usec = slapd_wakeup_timer * 1000;
3b7e51
             conn->c_ns_close_jobs++;                                                      /* now 1 active closure job */
3b7e51
             connection_acquire_nolock_ext(conn, 1 /* allow acquire even when closing */); /* event framework now has a reference */
3b7e51
-            ns_result_t job_result = ns_add_timeout_job(conn->c_tp, &tv, NS_JOB_TIMER,
3b7e51
-                                                        ns_handle_closure, conn, NULL);
3b7e51
+            /* Close the job asynchronously. Why? */
3b7e51
+            ns_result_t job_result = ns_add_job(conn->c_tp, NS_JOB_TIMER, ns_handle_closure, conn, &(conn->c_job));
3b7e51
             if (job_result != NS_SUCCESS) {
3b7e51
                 if (job_result == NS_SHUTDOWN) {
3b7e51
                     slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post closure job "
3b7e51
@@ -1723,12 +1756,12 @@ ns_connection_post_io_or_closing(Connection *conn)
3b7e51
              * The error occurs when we get a connection in a closing state.
3b7e51
              * For now we return, but there is probably a better way to handle the error case.
3b7e51
              */
3b7e51
-            return;
3b7e51
+            return 0;
3b7e51
         }
3b7e51
 #endif
3b7e51
         ns_result_t job_result = ns_add_io_timeout_job(conn->c_tp, conn->c_prfd, &tv,
3b7e51
                                                        NS_JOB_READ | NS_JOB_PRESERVE_FD,
3b7e51
-                                                       ns_handle_pr_read_ready, conn, NULL);
3b7e51
+                                                       ns_handle_pr_read_ready, conn, &(conn->c_job));
3b7e51
         if (job_result != NS_SUCCESS) {
3b7e51
             if (job_result == NS_SHUTDOWN) {
3b7e51
                 slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post I/O job for "
3b7e51
@@ -1745,6 +1778,28 @@ ns_connection_post_io_or_closing(Connection *conn)
3b7e51
                           conn->c_connid, conn->c_sd);
3b7e51
         }
3b7e51
     }
3b7e51
+    return 0;
3b7e51
+}
3b7e51
+void
3b7e51
+ns_connection_post_io_or_closing(Connection *conn)
3b7e51
+{
3b7e51
+    while (ns_connection_post_io_or_closing_try(conn)) {
3b7e51
+	/* we should retry later */
3b7e51
+	
3b7e51
+	/* We are not suppose to work immediately on the connection that is taken by
3b7e51
+	 * another job
3b7e51
+	 * release the lock and give some time
3b7e51
+	 */
3b7e51
+	
3b7e51
+	if (CONN_NEEDS_CLOSING(conn) && conn->c_ns_close_jobs) {
3b7e51
+	    return;
3b7e51
+	} else {
3b7e51
+	    PR_ExitMonitor(conn->c_mutex);
3b7e51
+	    DS_Sleep(PR_MillisecondsToInterval(100));
3b7e51
+
3b7e51
+	    PR_EnterMonitor(conn->c_mutex);
3b7e51
+	}
3b7e51
+    }
3b7e51
 }
3b7e51
 
3b7e51
 /* This function must be called without the thread flag, in the
3b7e51
@@ -1757,19 +1812,12 @@ ns_handle_pr_read_ready(struct ns_job_t *job)
3b7e51
     int maxthreads = config_get_maxthreadsperconn();
3b7e51
     Connection *c = (Connection *)ns_job_get_data(job);
3b7e51
 
3b7e51
-/* this function must be called from the event loop thread */
3b7e51
-#ifdef DEBUG
3b7e51
-    PR_ASSERT(0 == NS_JOB_IS_THREAD(ns_job_get_type(job)));
3b7e51
-#else
3b7e51
-    /* This doesn't actually confirm it's in the event loop thread, but it's a start */
3b7e51
-    if (NS_JOB_IS_THREAD(ns_job_get_type(job)) != 0) {
3b7e51
-        slapi_log_err(SLAPI_LOG_ERR, "ns_handle_pr_read_ready", "Attempt to handle read ready outside of event loop thread %" PRIu64 " for fd=%d\n",
3b7e51
-                      c->c_connid, c->c_sd);
3b7e51
-        return;
3b7e51
-    }
3b7e51
-#endif
3b7e51
-
3b7e51
     PR_EnterMonitor(c->c_mutex);
3b7e51
+    /* Assert we really have the right job state. */
3b7e51
+    PR_ASSERT(job == c->c_job);
3b7e51
+
3b7e51
+    /* On all code paths we remove the job, so set it null now */
3b7e51
+    c->c_job = NULL;
3b7e51
 
3b7e51
     slapi_log_err(SLAPI_LOG_CONNS, "ns_handle_pr_read_ready", "activity on conn %" PRIu64 " for fd=%d\n",
3b7e51
                   c->c_connid, c->c_sd);
3b7e51
@@ -1829,6 +1877,7 @@ ns_handle_pr_read_ready(struct ns_job_t *job)
3b7e51
         slapi_log_err(SLAPI_LOG_CONNS, "ns_handle_pr_read_ready", "queued conn %" PRIu64 " for fd=%d\n",
3b7e51
                       c->c_connid, c->c_sd);
3b7e51
     }
3b7e51
+    /* Since we call done on the job, we need to remove it here. */
3b7e51
     PR_ExitMonitor(c->c_mutex);
3b7e51
     ns_job_done(job);
3b7e51
     return;
3b7e51
@@ -2451,7 +2500,9 @@ ns_handle_new_connection(struct ns_job_t *job)
3b7e51
      * that poll() was avoided, even at the expense of putting this new fd back
3b7e51
      * in nunc-stans to poll for read ready.
3b7e51
      */
3b7e51
+    PR_EnterMonitor(c->c_mutex);
3b7e51
     ns_connection_post_io_or_closing(c);
3b7e51
+    PR_ExitMonitor(c->c_mutex);
3b7e51
     return;
3b7e51
 }
3b7e51
 
3b7e51
diff --git a/ldap/servers/slapd/fe.h b/ldap/servers/slapd/fe.h
3b7e51
index 4d25a9fb8..f47bb6145 100644
3b7e51
--- a/ldap/servers/slapd/fe.h
3b7e51
+++ b/ldap/servers/slapd/fe.h
3b7e51
@@ -100,6 +100,7 @@ extern Connection_Table *the_connection_table; /* JCM - Exported from globals.c
3b7e51
 Connection_Table *connection_table_new(int table_size);
3b7e51
 void connection_table_free(Connection_Table *ct);
3b7e51
 void connection_table_abandon_all_operations(Connection_Table *ct);
3b7e51
+void connection_table_disconnect_all(Connection_Table *ct);
3b7e51
 Connection *connection_table_get_connection(Connection_Table *ct, int sd);
3b7e51
 int connection_table_move_connection_out_of_active_list(Connection_Table *ct, Connection *c);
3b7e51
 void connection_table_move_connection_on_to_active_list(Connection_Table *ct, Connection *c);
3b7e51
diff --git a/ldap/servers/slapd/slap.h b/ldap/servers/slapd/slap.h
3b7e51
index 03355f5fe..de4ac35c0 100644
3b7e51
--- a/ldap/servers/slapd/slap.h
3b7e51
+++ b/ldap/servers/slapd/slap.h
3b7e51
@@ -1650,6 +1650,7 @@ typedef struct conn
3b7e51
     void *c_io_layer_cb_data;            /* callback data */
3b7e51
     struct connection_table *c_ct;       /* connection table that this connection belongs to */
3b7e51
     ns_thrpool_t *c_tp;                  /* thread pool for this connection */
3b7e51
+    struct ns_job_t *c_job;              /* If it exists, the current ns_job_t */
3b7e51
     int c_ns_close_jobs;                 /* number of current close jobs */
3b7e51
     char *c_ipaddr;                      /* ip address str - used by monitor */
3b7e51
 } Connection;
3b7e51
-- 
3b7e51
2.13.6
3b7e51