Blame SOURCES/0091-Ticket-49765-Async-operations-can-hang-when-the-serv.patch

967e4b
From 70e6c68196d381d05d35414c138894b54939d236 Mon Sep 17 00:00:00 2001
967e4b
From: Thierry Bordaz <tbordaz@redhat.com>
967e4b
Date: Thu, 7 Jun 2018 16:19:34 +0200
967e4b
Subject: [PATCH] Ticket 49765 - Async operations can hang when the server is
967e4b
 running nunc-stans
967e4b
967e4b
Bug Description:
967e4b
	The fix https://pagure.io/389-ds-base/issue/48184 allowed to schedule
967e4b
	several NS handlers where each handler waits for the dispatch of the
967e4b
	previous handler before being schedule.
967e4b
967e4b
	In case the current handler is never called (connection inactivity)
967e4b
	those that are waiting can wait indefinitely (until timeout or connection
967e4b
	closure). But those that are waiting delay the processing of the operation
967e4b
	when the scheduling is called by connection_threadmain.
967e4b
967e4b
	The some operations can appear hanging.
967e4b
	This scenario happens with async operations
967e4b
967e4b
Fix Description:
967e4b
	Instead of waiting for the completion of the scheduled handler,
967e4b
	evaluates if the scheduled handler needs to be cleared (ns_job_done)
967e4b
	or the waiting  handler to be canceled.
967e4b
967e4b
https://pagure.io/389-ds-base/issue/49765
967e4b
967e4b
Reviewed by: Mark Reynolds (thanks Mark !)
967e4b
967e4b
Platforms tested: F26
967e4b
967e4b
Flag Day: no
967e4b
967e4b
Doc impact: no
967e4b
---
967e4b
 ldap/servers/slapd/daemon.c         | 142 +++++++++++++++-------------
967e4b
 src/nunc-stans/include/nunc-stans.h |   3 +
967e4b
 src/nunc-stans/ns/ns_thrpool.c      |   5 +
967e4b
 3 files changed, 84 insertions(+), 66 deletions(-)
967e4b
967e4b
diff --git a/ldap/servers/slapd/daemon.c b/ldap/servers/slapd/daemon.c
967e4b
index 35cfe7de0..0a723c4a8 100644
967e4b
--- a/ldap/servers/slapd/daemon.c
967e4b
+++ b/ldap/servers/slapd/daemon.c
967e4b
@@ -152,12 +152,21 @@ accept_and_configure(int s __attribute__((unused)), PRFileDesc *pr_acceptfd, PRN
967e4b
  */
967e4b
 static int handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *pr_acceptfd, int secure, int local, Connection **newconn);
967e4b
 static void ns_handle_new_connection(struct ns_job_t *job);
967e4b
+static void ns_handle_closure(struct ns_job_t *job);
967e4b
 static void handle_pr_read_ready(Connection_Table *ct, PRIntn num_poll);
967e4b
 static int clear_signal(struct POLL_STRUCT *fds);
967e4b
 static void unfurl_banners(Connection_Table *ct, daemon_ports_t *ports, PRFileDesc **n_tcps, PRFileDesc **s_tcps, PRFileDesc **i_unix);
967e4b
 static int write_pid_file(void);
967e4b
 static int init_shutdown_detect(void);
967e4b
 
967e4b
+#define NS_HANDLER_NEW_CONNECTION   0
967e4b
+#define NS_HANDLER_READ_CONNECTION  1
967e4b
+#define NS_HANDLER_CLOSE_CONNECTION 2
967e4b
+static ns_job_func_t ns_handlers[] = {
967e4b
+    ns_handle_new_connection,
967e4b
+    ns_handle_pr_read_ready,
967e4b
+    ns_handle_closure
967e4b
+};
967e4b
 /*
967e4b
  * NSPR has different implementations for PRMonitor, depending
967e4b
  * on the availble threading model
967e4b
@@ -1058,7 +1067,7 @@ slapd_daemon(daemon_ports_t *ports, ns_thrpool_t *tp)
967e4b
         for (size_t ii = 0; ii < listeners; ++ii) {
967e4b
             listener_idxs[ii].ct = the_connection_table; /* to pass to handle_new_connection */
967e4b
             ns_result_t result = ns_add_io_job(tp, listener_idxs[ii].listenfd, NS_JOB_ACCEPT | NS_JOB_PERSIST | NS_JOB_PRESERVE_FD,
967e4b
-                                               ns_handle_new_connection, &listener_idxs[ii], &(listener_idxs[ii].ns_job));
967e4b
+                                               ns_handlers[NS_HANDLER_NEW_CONNECTION], &listener_idxs[ii], &(listener_idxs[ii].ns_job));
967e4b
             if (result != NS_SUCCESS) {
967e4b
                 slapi_log_err(SLAPI_LOG_CRIT, "slapd_daemon", "ns_add_io_job failed to create add acceptor %d\n", result);
967e4b
             }
967e4b
@@ -1684,28 +1693,84 @@ ns_handle_closure(struct ns_job_t *job)
967e4b
 /**
967e4b
  * Schedule more I/O for this connection, or make sure that it
967e4b
  * is closed in the event loop.
967e4b
+ * 
967e4b
  * caller must hold c_mutex
967e4b
- * It returns
967e4b
- *  0 on success
967e4b
- *  1 on need to retry
967e4b
  */
967e4b
-static int
967e4b
-ns_connection_post_io_or_closing_try(Connection *conn)
967e4b
+void
967e4b
+ns_connection_post_io_or_closing(Connection *conn)
967e4b
 {
967e4b
     struct timeval tv;
967e4b
 
967e4b
     if (!enable_nunc_stans) {
967e4b
-        return 0;
967e4b
+        return;
967e4b
     }
967e4b
 
967e4b
     /*
967e4b
      * A job was already scheduled.
967e4b
-     * Let it be dispatched first
967e4b
+     * Check if it is the appropriate one
967e4b
      */
967e4b
     if (conn->c_job != NULL) {
967e4b
-        return 1;
967e4b
+        if (connection_is_free(conn, 0)) {
967e4b
+            PRStatus shutdown_status;
967e4b
+
967e4b
+            /* The connection being freed,
967e4b
+             * It means that ns_handle_closure already completed and the connection
967e4b
+             * is no longer on the active list.
967e4b
+             * The scheduled job is useless and scheduling a new one as well
967e4b
+             */
967e4b
+            shutdown_status = ns_job_done(conn->c_job);
967e4b
+            if (shutdown_status != PR_SUCCESS) {
967e4b
+                slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed cancel a job on a freed connection %d !\n", conn->c_sd);
967e4b
+            }
967e4b
+            conn->c_job = NULL;
967e4b
+            return;
967e4b
+        }
967e4b
+        if (CONN_NEEDS_CLOSING(conn)) {
967e4b
+            if (ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_CLOSE_CONNECTION])) {
967e4b
+                /* Due to the closing state we would schedule a ns_handle_closure
967e4b
+                 * but one is already registered.
967e4b
+                 * Just return;
967e4b
+                 */
967e4b
+                slapi_log_err(SLAPI_LOG_CONNS, "ns_connection_post_io_or_closing", "Already ns_handle_closure "
967e4b
+                        "job in progress on conn %" PRIu64 " for fd=%d\n",
967e4b
+                        conn->c_connid, conn->c_sd);
967e4b
+                return;
967e4b
+            } else {
967e4b
+                /* Due to the closing state we would schedule a ns_handle_closure
967e4b
+                 * but a different handler is registered. Stop it and schedule (below) ns_handle_closure
967e4b
+                 */
967e4b
+                ns_job_done(conn->c_job);
967e4b
+                conn->c_job = NULL;
967e4b
+            }
967e4b
+        } else {
967e4b
+            /* Here the connection is still active => ignore the call and return */
967e4b
+            if (ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_READ_CONNECTION])) {
967e4b
+                /* Connection is still active and a read_ready is already scheduled
967e4b
+                 * Likely a consequence of async operations
967e4b
+                 * Just let the current read_ready do its job
967e4b
+                 */
967e4b
+                slapi_log_err(SLAPI_LOG_CONNS, "ns_connection_post_io_or_closing", "Already ns_handle_pr_read_ready "
967e4b
+                                                                               "job in progress on conn %" PRIu64 " for fd=%d\n",
967e4b
+                          conn->c_connid, conn->c_sd);
967e4b
+            } else {
967e4b
+                /* Weird situation where the connection is not flagged closing but ns_handle_closure
967e4b
+                 * is scheduled.
967e4b
+                 * We should not try to read it anymore
967e4b
+                 */
967e4b
+                PR_ASSERT(ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_CLOSE_CONNECTION]));
967e4b
+            }
967e4b
+            return;
967e4b
+        }
967e4b
     }
967e4b
 
967e4b
+    /* At this point conn->c_job is NULL
967e4b
+     * Either it was null when the function was called
967e4b
+     * Or we cleared it (+ns_job_done) if the wrong (according
967e4b
+     * to the connection state) handler was scheduled
967e4b
+     *
967e4b
+     * Now we need to determine which handler to schedule
967e4b
+     */
967e4b
+
967e4b
     if (CONN_NEEDS_CLOSING(conn)) {
967e4b
         /* there should only ever be 0 or 1 active closure jobs */
967e4b
         PR_ASSERT((conn->c_ns_close_jobs == 0) || (conn->c_ns_close_jobs == 1));
967e4b
@@ -1718,7 +1783,7 @@ ns_connection_post_io_or_closing_try(Connection *conn)
967e4b
             conn->c_ns_close_jobs++;                                                      /* now 1 active closure job */
967e4b
             connection_acquire_nolock_ext(conn, 1 /* allow acquire even when closing */); /* event framework now has a reference */
967e4b
             /* Close the job asynchronously. Why? */
967e4b
-            ns_result_t job_result = ns_add_job(conn->c_tp, NS_JOB_TIMER, ns_handle_closure, conn, &(conn->c_job));
967e4b
+            ns_result_t job_result = ns_add_job(conn->c_tp, NS_JOB_TIMER, ns_handlers[NS_HANDLER_CLOSE_CONNECTION], conn, &(conn->c_job));
967e4b
             if (job_result != NS_SUCCESS) {
967e4b
                 if (job_result == NS_SHUTDOWN) {
967e4b
                     slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post closure job "
967e4b
@@ -1762,7 +1827,7 @@ ns_connection_post_io_or_closing_try(Connection *conn)
967e4b
 #endif
967e4b
         ns_result_t job_result = ns_add_io_timeout_job(conn->c_tp, conn->c_prfd, &tv,
967e4b
                                                        NS_JOB_READ | NS_JOB_PRESERVE_FD,
967e4b
-                                                       ns_handle_pr_read_ready, conn, &(conn->c_job));
967e4b
+                                                       ns_handlers[NS_HANDLER_READ_CONNECTION], conn, &(conn->c_job));
967e4b
         if (job_result != NS_SUCCESS) {
967e4b
             if (job_result == NS_SHUTDOWN) {
967e4b
                 slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post I/O job for "
967e4b
@@ -1782,61 +1847,6 @@ ns_connection_post_io_or_closing_try(Connection *conn)
967e4b
     return 0;
967e4b
 }
967e4b
 
967e4b
-/*
967e4b
- * Tries to schedule I/O for this connection
967e4b
- * If the connection is already busy with a scheduled I/O
967e4b
- * it can wait until scheduled I/O is dispatched
967e4b
- *
967e4b
- * caller must hold c_mutex
967e4b
- */
967e4b
-void
967e4b
-ns_connection_post_io_or_closing(Connection *conn)
967e4b
-{
967e4b
-    while (ns_connection_post_io_or_closing_try(conn)) {
967e4b
-        /* Here a job is currently scheduled (c->job is set) and not yet dispatched
967e4b
-         * Job can be either:
967e4b
-         *  - ns_handle_closure
967e4b
-         *  - ns_handle_pr_read_ready
967e4b
-         */
967e4b
-
967e4b
-        if (connection_is_free(conn, 0)) {
967e4b
-            PRStatus shutdown_status;
967e4b
-
967e4b
-            /* The connection being freed,
967e4b
-             * It means that ns_handle_closure already completed and the connection
967e4b
-             * is no longer on the active list.
967e4b
-             * The scheduled job is useless and scheduling a new one as well
967e4b
-             */
967e4b
-            shutdown_status = ns_job_done(conn->c_job);
967e4b
-            if (shutdown_status != PR_SUCCESS) {
967e4b
-                slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed cancel a job on a freed connection %d !\n", conn->c_sd);
967e4b
-            }
967e4b
-            conn->c_job = NULL;
967e4b
-            return;
967e4b
-        }
967e4b
-        if (g_get_shutdown() && CONN_NEEDS_CLOSING(conn)) {
967e4b
-            PRStatus shutdown_status;
967e4b
-
967e4b
-            /* This is shutting down cancel any scheduled job to register ns_handle_closure
967e4b
-             */
967e4b
-            shutdown_status = ns_job_done(conn->c_job);
967e4b
-            if (shutdown_status != PR_SUCCESS) {
967e4b
-                slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed to cancel a job during shutdown %d !\n", conn->c_sd);
967e4b
-            }
967e4b
-            conn->c_job = NULL;
967e4b
-            continue;
967e4b
-        }
967e4b
-
967e4b
-        /* We are not suppose to work immediately on the connection that is taken by
967e4b
-         * another job
967e4b
-         * release the lock and give some time
967e4b
-         */
967e4b
-        PR_ExitMonitor(conn->c_mutex);
967e4b
-        DS_Sleep(PR_MillisecondsToInterval(100));
967e4b
-        PR_EnterMonitor(conn->c_mutex);
967e4b
-    }
967e4b
-}
967e4b
-
967e4b
 /* This function must be called without the thread flag, in the
967e4b
  * event loop.  This function may free the connection.  This can
967e4b
  * only be done in the event loop thread.
967e4b
diff --git a/src/nunc-stans/include/nunc-stans.h b/src/nunc-stans/include/nunc-stans.h
967e4b
index 192e38ec3..a0ddbdb42 100644
967e4b
--- a/src/nunc-stans/include/nunc-stans.h
967e4b
+++ b/src/nunc-stans/include/nunc-stans.h
967e4b
@@ -959,4 +959,7 @@ ns_result_t ns_thrpool_wait(struct ns_thrpool_t *tp);
967e4b
  */
967e4b
 ns_result_t ns_job_rearm(struct ns_job_t *job);
967e4b
 
967e4b
+int
967e4b
+ns_job_is_func(struct ns_job_t *job, ns_job_func_t func);
967e4b
+
967e4b
 #endif /* NS_THRPOOL_H */
967e4b
diff --git a/src/nunc-stans/ns/ns_thrpool.c b/src/nunc-stans/ns/ns_thrpool.c
967e4b
index d95b0c38b..774607c88 100644
967e4b
--- a/src/nunc-stans/ns/ns_thrpool.c
967e4b
+++ b/src/nunc-stans/ns/ns_thrpool.c
967e4b
@@ -1237,6 +1237,11 @@ ns_job_rearm(ns_job_t *job)
967e4b
     /* Unreachable code .... */
967e4b
     return NS_INVALID_REQUEST;
967e4b
 }
967e4b
+int
967e4b
+ns_job_is_func(struct ns_job_t *job, ns_job_func_t func)
967e4b
+{
967e4b
+    return(job && job->func == func);
967e4b
+}
967e4b
 
967e4b
 static void
967e4b
 ns_thrpool_delete(ns_thrpool_t *tp)
967e4b
-- 
967e4b
2.17.1
967e4b