Blame SOURCES/0091-Ticket-49765-Async-operations-can-hang-when-the-serv.patch

e06446
From 70e6c68196d381d05d35414c138894b54939d236 Mon Sep 17 00:00:00 2001
e06446
From: Thierry Bordaz <tbordaz@redhat.com>
e06446
Date: Thu, 7 Jun 2018 16:19:34 +0200
e06446
Subject: [PATCH] Ticket 49765 - Async operations can hang when the server is
e06446
 running nunc-stans
e06446
e06446
Bug Description:
e06446
	The fix https://pagure.io/389-ds-base/issue/48184 allowed to schedule
e06446
	several NS handlers where each handler waits for the dispatch of the
e06446
	previous handler before being schedule.
e06446
e06446
	In case the current handler is never called (connection inactivity)
e06446
	those that are waiting can wait indefinitely (until timeout or connection
e06446
	closure). But those that are waiting delay the processing of the operation
e06446
	when the scheduling is called by connection_threadmain.
e06446
e06446
	The some operations can appear hanging.
e06446
	This scenario happens with async operations
e06446
e06446
Fix Description:
e06446
	Instead of waiting for the completion of the scheduled handler,
e06446
	evaluates if the scheduled handler needs to be cleared (ns_job_done)
e06446
	or the waiting  handler to be canceled.
e06446
e06446
https://pagure.io/389-ds-base/issue/49765
e06446
e06446
Reviewed by: Mark Reynolds (thanks Mark !)
e06446
e06446
Platforms tested: F26
e06446
e06446
Flag Day: no
e06446
e06446
Doc impact: no
e06446
---
e06446
 ldap/servers/slapd/daemon.c         | 142 +++++++++++++++-------------
e06446
 src/nunc-stans/include/nunc-stans.h |   3 +
e06446
 src/nunc-stans/ns/ns_thrpool.c      |   5 +
e06446
 3 files changed, 84 insertions(+), 66 deletions(-)
e06446
e06446
diff --git a/ldap/servers/slapd/daemon.c b/ldap/servers/slapd/daemon.c
e06446
index 35cfe7de0..0a723c4a8 100644
e06446
--- a/ldap/servers/slapd/daemon.c
e06446
+++ b/ldap/servers/slapd/daemon.c
e06446
@@ -152,12 +152,21 @@ accept_and_configure(int s __attribute__((unused)), PRFileDesc *pr_acceptfd, PRN
e06446
  */
e06446
 static int handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *pr_acceptfd, int secure, int local, Connection **newconn);
e06446
 static void ns_handle_new_connection(struct ns_job_t *job);
e06446
+static void ns_handle_closure(struct ns_job_t *job);
e06446
 static void handle_pr_read_ready(Connection_Table *ct, PRIntn num_poll);
e06446
 static int clear_signal(struct POLL_STRUCT *fds);
e06446
 static void unfurl_banners(Connection_Table *ct, daemon_ports_t *ports, PRFileDesc **n_tcps, PRFileDesc **s_tcps, PRFileDesc **i_unix);
e06446
 static int write_pid_file(void);
e06446
 static int init_shutdown_detect(void);
e06446
 
e06446
+#define NS_HANDLER_NEW_CONNECTION   0
e06446
+#define NS_HANDLER_READ_CONNECTION  1
e06446
+#define NS_HANDLER_CLOSE_CONNECTION 2
e06446
+static ns_job_func_t ns_handlers[] = {
e06446
+    ns_handle_new_connection,
e06446
+    ns_handle_pr_read_ready,
e06446
+    ns_handle_closure
e06446
+};
e06446
 /*
e06446
  * NSPR has different implementations for PRMonitor, depending
e06446
  * on the availble threading model
e06446
@@ -1058,7 +1067,7 @@ slapd_daemon(daemon_ports_t *ports, ns_thrpool_t *tp)
e06446
         for (size_t ii = 0; ii < listeners; ++ii) {
e06446
             listener_idxs[ii].ct = the_connection_table; /* to pass to handle_new_connection */
e06446
             ns_result_t result = ns_add_io_job(tp, listener_idxs[ii].listenfd, NS_JOB_ACCEPT | NS_JOB_PERSIST | NS_JOB_PRESERVE_FD,
e06446
-                                               ns_handle_new_connection, &listener_idxs[ii], &(listener_idxs[ii].ns_job));
e06446
+                                               ns_handlers[NS_HANDLER_NEW_CONNECTION], &listener_idxs[ii], &(listener_idxs[ii].ns_job));
e06446
             if (result != NS_SUCCESS) {
e06446
                 slapi_log_err(SLAPI_LOG_CRIT, "slapd_daemon", "ns_add_io_job failed to create add acceptor %d\n", result);
e06446
             }
e06446
@@ -1684,28 +1693,84 @@ ns_handle_closure(struct ns_job_t *job)
e06446
 /**
e06446
  * Schedule more I/O for this connection, or make sure that it
e06446
  * is closed in the event loop.
e06446
+ * 
e06446
  * caller must hold c_mutex
e06446
- * It returns
e06446
- *  0 on success
e06446
- *  1 on need to retry
e06446
  */
e06446
-static int
e06446
-ns_connection_post_io_or_closing_try(Connection *conn)
e06446
+void
e06446
+ns_connection_post_io_or_closing(Connection *conn)
e06446
 {
e06446
     struct timeval tv;
e06446
 
e06446
     if (!enable_nunc_stans) {
e06446
-        return 0;
e06446
+        return;
e06446
     }
e06446
 
e06446
     /*
e06446
      * A job was already scheduled.
e06446
-     * Let it be dispatched first
e06446
+     * Check if it is the appropriate one
e06446
      */
e06446
     if (conn->c_job != NULL) {
e06446
-        return 1;
e06446
+        if (connection_is_free(conn, 0)) {
e06446
+            PRStatus shutdown_status;
e06446
+
e06446
+            /* The connection being freed,
e06446
+             * It means that ns_handle_closure already completed and the connection
e06446
+             * is no longer on the active list.
e06446
+             * The scheduled job is useless and scheduling a new one as well
e06446
+             */
e06446
+            shutdown_status = ns_job_done(conn->c_job);
e06446
+            if (shutdown_status != PR_SUCCESS) {
e06446
+                slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed cancel a job on a freed connection %d !\n", conn->c_sd);
e06446
+            }
e06446
+            conn->c_job = NULL;
e06446
+            return;
e06446
+        }
e06446
+        if (CONN_NEEDS_CLOSING(conn)) {
e06446
+            if (ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_CLOSE_CONNECTION])) {
e06446
+                /* Due to the closing state we would schedule a ns_handle_closure
e06446
+                 * but one is already registered.
e06446
+                 * Just return;
e06446
+                 */
e06446
+                slapi_log_err(SLAPI_LOG_CONNS, "ns_connection_post_io_or_closing", "Already ns_handle_closure "
e06446
+                        "job in progress on conn %" PRIu64 " for fd=%d\n",
e06446
+                        conn->c_connid, conn->c_sd);
e06446
+                return;
e06446
+            } else {
e06446
+                /* Due to the closing state we would schedule a ns_handle_closure
e06446
+                 * but a different handler is registered. Stop it and schedule (below) ns_handle_closure
e06446
+                 */
e06446
+                ns_job_done(conn->c_job);
e06446
+                conn->c_job = NULL;
e06446
+            }
e06446
+        } else {
e06446
+            /* Here the connection is still active => ignore the call and return */
e06446
+            if (ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_READ_CONNECTION])) {
e06446
+                /* Connection is still active and a read_ready is already scheduled
e06446
+                 * Likely a consequence of async operations
e06446
+                 * Just let the current read_ready do its job
e06446
+                 */
e06446
+                slapi_log_err(SLAPI_LOG_CONNS, "ns_connection_post_io_or_closing", "Already ns_handle_pr_read_ready "
e06446
+                                                                               "job in progress on conn %" PRIu64 " for fd=%d\n",
e06446
+                          conn->c_connid, conn->c_sd);
e06446
+            } else {
e06446
+                /* Weird situation where the connection is not flagged closing but ns_handle_closure
e06446
+                 * is scheduled.
e06446
+                 * We should not try to read it anymore
e06446
+                 */
e06446
+                PR_ASSERT(ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_CLOSE_CONNECTION]));
e06446
+            }
e06446
+            return;
e06446
+        }
e06446
     }
e06446
 
e06446
+    /* At this point conn->c_job is NULL
e06446
+     * Either it was null when the function was called
e06446
+     * Or we cleared it (+ns_job_done) if the wrong (according
e06446
+     * to the connection state) handler was scheduled
e06446
+     *
e06446
+     * Now we need to determine which handler to schedule
e06446
+     */
e06446
+
e06446
     if (CONN_NEEDS_CLOSING(conn)) {
e06446
         /* there should only ever be 0 or 1 active closure jobs */
e06446
         PR_ASSERT((conn->c_ns_close_jobs == 0) || (conn->c_ns_close_jobs == 1));
e06446
@@ -1718,7 +1783,7 @@ ns_connection_post_io_or_closing_try(Connection *conn)
e06446
             conn->c_ns_close_jobs++;                                                      /* now 1 active closure job */
e06446
             connection_acquire_nolock_ext(conn, 1 /* allow acquire even when closing */); /* event framework now has a reference */
e06446
             /* Close the job asynchronously. Why? */
e06446
-            ns_result_t job_result = ns_add_job(conn->c_tp, NS_JOB_TIMER, ns_handle_closure, conn, &(conn->c_job));
e06446
+            ns_result_t job_result = ns_add_job(conn->c_tp, NS_JOB_TIMER, ns_handlers[NS_HANDLER_CLOSE_CONNECTION], conn, &(conn->c_job));
e06446
             if (job_result != NS_SUCCESS) {
e06446
                 if (job_result == NS_SHUTDOWN) {
e06446
                     slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post closure job "
e06446
@@ -1762,7 +1827,7 @@ ns_connection_post_io_or_closing_try(Connection *conn)
e06446
 #endif
e06446
         ns_result_t job_result = ns_add_io_timeout_job(conn->c_tp, conn->c_prfd, &tv,
e06446
                                                        NS_JOB_READ | NS_JOB_PRESERVE_FD,
e06446
-                                                       ns_handle_pr_read_ready, conn, &(conn->c_job));
e06446
+                                                       ns_handlers[NS_HANDLER_READ_CONNECTION], conn, &(conn->c_job));
e06446
         if (job_result != NS_SUCCESS) {
e06446
             if (job_result == NS_SHUTDOWN) {
e06446
                 slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post I/O job for "
e06446
@@ -1782,61 +1847,6 @@ ns_connection_post_io_or_closing_try(Connection *conn)
e06446
     return 0;
e06446
 }
e06446
 
e06446
-/*
e06446
- * Tries to schedule I/O for this connection
e06446
- * If the connection is already busy with a scheduled I/O
e06446
- * it can wait until scheduled I/O is dispatched
e06446
- *
e06446
- * caller must hold c_mutex
e06446
- */
e06446
-void
e06446
-ns_connection_post_io_or_closing(Connection *conn)
e06446
-{
e06446
-    while (ns_connection_post_io_or_closing_try(conn)) {
e06446
-        /* Here a job is currently scheduled (c->job is set) and not yet dispatched
e06446
-         * Job can be either:
e06446
-         *  - ns_handle_closure
e06446
-         *  - ns_handle_pr_read_ready
e06446
-         */
e06446
-
e06446
-        if (connection_is_free(conn, 0)) {
e06446
-            PRStatus shutdown_status;
e06446
-
e06446
-            /* The connection being freed,
e06446
-             * It means that ns_handle_closure already completed and the connection
e06446
-             * is no longer on the active list.
e06446
-             * The scheduled job is useless and scheduling a new one as well
e06446
-             */
e06446
-            shutdown_status = ns_job_done(conn->c_job);
e06446
-            if (shutdown_status != PR_SUCCESS) {
e06446
-                slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed cancel a job on a freed connection %d !\n", conn->c_sd);
e06446
-            }
e06446
-            conn->c_job = NULL;
e06446
-            return;
e06446
-        }
e06446
-        if (g_get_shutdown() && CONN_NEEDS_CLOSING(conn)) {
e06446
-            PRStatus shutdown_status;
e06446
-
e06446
-            /* This is shutting down cancel any scheduled job to register ns_handle_closure
e06446
-             */
e06446
-            shutdown_status = ns_job_done(conn->c_job);
e06446
-            if (shutdown_status != PR_SUCCESS) {
e06446
-                slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed to cancel a job during shutdown %d !\n", conn->c_sd);
e06446
-            }
e06446
-            conn->c_job = NULL;
e06446
-            continue;
e06446
-        }
e06446
-
e06446
-        /* We are not suppose to work immediately on the connection that is taken by
e06446
-         * another job
e06446
-         * release the lock and give some time
e06446
-         */
e06446
-        PR_ExitMonitor(conn->c_mutex);
e06446
-        DS_Sleep(PR_MillisecondsToInterval(100));
e06446
-        PR_EnterMonitor(conn->c_mutex);
e06446
-    }
e06446
-}
e06446
-
e06446
 /* This function must be called without the thread flag, in the
e06446
  * event loop.  This function may free the connection.  This can
e06446
  * only be done in the event loop thread.
e06446
diff --git a/src/nunc-stans/include/nunc-stans.h b/src/nunc-stans/include/nunc-stans.h
e06446
index 192e38ec3..a0ddbdb42 100644
e06446
--- a/src/nunc-stans/include/nunc-stans.h
e06446
+++ b/src/nunc-stans/include/nunc-stans.h
e06446
@@ -959,4 +959,7 @@ ns_result_t ns_thrpool_wait(struct ns_thrpool_t *tp);
e06446
  */
e06446
 ns_result_t ns_job_rearm(struct ns_job_t *job);
e06446
 
e06446
+int
e06446
+ns_job_is_func(struct ns_job_t *job, ns_job_func_t func);
e06446
+
e06446
 #endif /* NS_THRPOOL_H */
e06446
diff --git a/src/nunc-stans/ns/ns_thrpool.c b/src/nunc-stans/ns/ns_thrpool.c
e06446
index d95b0c38b..774607c88 100644
e06446
--- a/src/nunc-stans/ns/ns_thrpool.c
e06446
+++ b/src/nunc-stans/ns/ns_thrpool.c
e06446
@@ -1237,6 +1237,11 @@ ns_job_rearm(ns_job_t *job)
e06446
     /* Unreachable code .... */
e06446
     return NS_INVALID_REQUEST;
e06446
 }
e06446
+int
e06446
+ns_job_is_func(struct ns_job_t *job, ns_job_func_t func)
e06446
+{
e06446
+    return(job && job->func == func);
e06446
+}
e06446
 
e06446
 static void
e06446
 ns_thrpool_delete(ns_thrpool_t *tp)
e06446
-- 
e06446
2.17.1
e06446