Tree - rpms/389-ds-base - CentOS Git server

rpms / 389-ds-base

Blame SOURCES/0091-Ticket-49765-Async-operations-can-hang-when-the-serv.patch

Blob History Raw

		967e4b	`From 70e6c68196d381d05d35414c138894b54939d236 Mon Sep 17 00:00:00 2001`
		967e4b	`From: Thierry Bordaz <tbordaz@redhat.com>`
		967e4b	`Date: Thu, 7 Jun 2018 16:19:34 +0200`
		967e4b	`Subject: [PATCH] Ticket 49765 - Async operations can hang when the server is`
		967e4b	`running nunc-stans`
		967e4b
		967e4b	`Bug Description:`
		967e4b	`The fix https://pagure.io/389-ds-base/issue/48184 allowed to schedule`
		967e4b	`several NS handlers where each handler waits for the dispatch of the`
		967e4b	`previous handler before being schedule.`
		967e4b
		967e4b	`In case the current handler is never called (connection inactivity)`
		967e4b	`those that are waiting can wait indefinitely (until timeout or connection`
		967e4b	`closure). But those that are waiting delay the processing of the operation`
		967e4b	`when the scheduling is called by connection_threadmain.`
		967e4b
		967e4b	`The some operations can appear hanging.`
		967e4b	`This scenario happens with async operations`
		967e4b
		967e4b	`Fix Description:`
		967e4b	`Instead of waiting for the completion of the scheduled handler,`
		967e4b	`evaluates if the scheduled handler needs to be cleared (ns_job_done)`
		967e4b	`or the waiting handler to be canceled.`
		967e4b
		967e4b	`https://pagure.io/389-ds-base/issue/49765`
		967e4b
		967e4b	`Reviewed by: Mark Reynolds (thanks Mark !)`
		967e4b
		967e4b	`Platforms tested: F26`
		967e4b
		967e4b	`Flag Day: no`
		967e4b
		967e4b	`Doc impact: no`
		967e4b	`---`
		967e4b	`ldap/servers/slapd/daemon.c \| 142 +++++++++++++++-------------`
		967e4b	`src/nunc-stans/include/nunc-stans.h \| 3 +`
		967e4b	`src/nunc-stans/ns/ns_thrpool.c \| 5 +`
		967e4b	`3 files changed, 84 insertions(+), 66 deletions(-)`
		967e4b
		967e4b	`diff --git a/ldap/servers/slapd/daemon.c b/ldap/servers/slapd/daemon.c`
		967e4b	`index 35cfe7de0..0a723c4a8 100644`
		967e4b	`--- a/ldap/servers/slapd/daemon.c`
		967e4b	`+++ b/ldap/servers/slapd/daemon.c`
		967e4b	`@@ -152,12 +152,21 @@ accept_and_configure(int s __attribute__((unused)), PRFileDesc *pr_acceptfd, PRN`
		967e4b	`*/`
		967e4b	`static int handle_new_connection(Connection_Table ct, int tcps, PRFileDesc pr_acceptfd, int secure, int local, Connection **newconn);`
		967e4b	`static void ns_handle_new_connection(struct ns_job_t *job);`
		967e4b	`+static void ns_handle_closure(struct ns_job_t *job);`
		967e4b	`static void handle_pr_read_ready(Connection_Table *ct, PRIntn num_poll);`
		967e4b	`static int clear_signal(struct POLL_STRUCT *fds);`
		967e4b	`static void unfurl_banners(Connection_Table ct, daemon_ports_t ports, PRFileDesc n_tcps, PRFileDesc s_tcps, PRFileDesc **i_unix);`
		967e4b	`static int write_pid_file(void);`
		967e4b	`static int init_shutdown_detect(void);`
		967e4b
		967e4b	`+#define NS_HANDLER_NEW_CONNECTION 0`
		967e4b	`+#define NS_HANDLER_READ_CONNECTION 1`
		967e4b	`+#define NS_HANDLER_CLOSE_CONNECTION 2`
		967e4b	`+static ns_job_func_t ns_handlers[] = {`
		967e4b	`+ ns_handle_new_connection,`
		967e4b	`+ ns_handle_pr_read_ready,`
		967e4b	`+ ns_handle_closure`
		967e4b	`+};`
		967e4b	`/*`
		967e4b	`* NSPR has different implementations for PRMonitor, depending`
		967e4b	`* on the availble threading model`
		967e4b	`@@ -1058,7 +1067,7 @@ slapd_daemon(daemon_ports_t ports, ns_thrpool_t tp)`
		967e4b	`for (size_t ii = 0; ii < listeners; ++ii) {`
		967e4b	`listener_idxs[ii].ct = the_connection_table; /* to pass to handle_new_connection */`
		967e4b	`ns_result_t result = ns_add_io_job(tp, listener_idxs[ii].listenfd, NS_JOB_ACCEPT \| NS_JOB_PERSIST \| NS_JOB_PRESERVE_FD,`
		967e4b	`- ns_handle_new_connection, &listener_idxs[ii], &(listener_idxs[ii].ns_job));`
		967e4b	`+ ns_handlers[NS_HANDLER_NEW_CONNECTION], &listener_idxs[ii], &(listener_idxs[ii].ns_job));`
		967e4b	`if (result != NS_SUCCESS) {`
		967e4b	`slapi_log_err(SLAPI_LOG_CRIT, "slapd_daemon", "ns_add_io_job failed to create add acceptor %d\n", result);`
		967e4b	`}`
		967e4b	`@@ -1684,28 +1693,84 @@ ns_handle_closure(struct ns_job_t *job)`
		967e4b	`/**`
		967e4b	`* Schedule more I/O for this connection, or make sure that it`
		967e4b	`* is closed in the event loop.`
		967e4b	`+ *`
		967e4b	`* caller must hold c_mutex`
		967e4b	`- * It returns`
		967e4b	`- * 0 on success`
		967e4b	`- * 1 on need to retry`
		967e4b	`*/`
		967e4b	`-static int`
		967e4b	`-ns_connection_post_io_or_closing_try(Connection *conn)`
		967e4b	`+void`
		967e4b	`+ns_connection_post_io_or_closing(Connection *conn)`
		967e4b	`{`
		967e4b	`struct timeval tv;`
		967e4b
		967e4b	`if (!enable_nunc_stans) {`
		967e4b	`- return 0;`
		967e4b	`+ return;`
		967e4b	`}`
		967e4b
		967e4b	`/*`
		967e4b	`* A job was already scheduled.`
		967e4b	`- * Let it be dispatched first`
		967e4b	`+ * Check if it is the appropriate one`
		967e4b	`*/`
		967e4b	`if (conn->c_job != NULL) {`
		967e4b	`- return 1;`
		967e4b	`+ if (connection_is_free(conn, 0)) {`
		967e4b	`+ PRStatus shutdown_status;`
		967e4b	`+`
		967e4b	`+ /* The connection being freed,`
		967e4b	`+ * It means that ns_handle_closure already completed and the connection`
		967e4b	`+ * is no longer on the active list.`
		967e4b	`+ * The scheduled job is useless and scheduling a new one as well`
		967e4b	`+ */`
		967e4b	`+ shutdown_status = ns_job_done(conn->c_job);`
		967e4b	`+ if (shutdown_status != PR_SUCCESS) {`
		967e4b	`+ slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed cancel a job on a freed connection %d !\n", conn->c_sd);`
		967e4b	`+ }`
		967e4b	`+ conn->c_job = NULL;`
		967e4b	`+ return;`
		967e4b	`+ }`
		967e4b	`+ if (CONN_NEEDS_CLOSING(conn)) {`
		967e4b	`+ if (ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_CLOSE_CONNECTION])) {`
		967e4b	`+ /* Due to the closing state we would schedule a ns_handle_closure`
		967e4b	`+ * but one is already registered.`
		967e4b	`+ * Just return;`
		967e4b	`+ */`
		967e4b	`+ slapi_log_err(SLAPI_LOG_CONNS, "ns_connection_post_io_or_closing", "Already ns_handle_closure "`
		967e4b	`+ "job in progress on conn %" PRIu64 " for fd=%d\n",`
		967e4b	`+ conn->c_connid, conn->c_sd);`
		967e4b	`+ return;`
		967e4b	`+ } else {`
		967e4b	`+ /* Due to the closing state we would schedule a ns_handle_closure`
		967e4b	`+ * but a different handler is registered. Stop it and schedule (below) ns_handle_closure`
		967e4b	`+ */`
		967e4b	`+ ns_job_done(conn->c_job);`
		967e4b	`+ conn->c_job = NULL;`
		967e4b	`+ }`
		967e4b	`+ } else {`
		967e4b	`+ /* Here the connection is still active => ignore the call and return */`
		967e4b	`+ if (ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_READ_CONNECTION])) {`
		967e4b	`+ /* Connection is still active and a read_ready is already scheduled`
		967e4b	`+ * Likely a consequence of async operations`
		967e4b	`+ * Just let the current read_ready do its job`
		967e4b	`+ */`
		967e4b	`+ slapi_log_err(SLAPI_LOG_CONNS, "ns_connection_post_io_or_closing", "Already ns_handle_pr_read_ready "`
		967e4b	`+ "job in progress on conn %" PRIu64 " for fd=%d\n",`
		967e4b	`+ conn->c_connid, conn->c_sd);`
		967e4b	`+ } else {`
		967e4b	`+ /* Weird situation where the connection is not flagged closing but ns_handle_closure`
		967e4b	`+ * is scheduled.`
		967e4b	`+ * We should not try to read it anymore`
		967e4b	`+ */`
		967e4b	`+ PR_ASSERT(ns_job_is_func(conn->c_job, ns_handlers[NS_HANDLER_CLOSE_CONNECTION]));`
		967e4b	`+ }`
		967e4b	`+ return;`
		967e4b	`+ }`
		967e4b	`}`
		967e4b
		967e4b	`+ /* At this point conn->c_job is NULL`
		967e4b	`+ * Either it was null when the function was called`
		967e4b	`+ * Or we cleared it (+ns_job_done) if the wrong (according`
		967e4b	`+ * to the connection state) handler was scheduled`
		967e4b	`+ *`
		967e4b	`+ * Now we need to determine which handler to schedule`
		967e4b	`+ */`
		967e4b	`+`
		967e4b	`if (CONN_NEEDS_CLOSING(conn)) {`
		967e4b	`/* there should only ever be 0 or 1 active closure jobs */`
		967e4b	`PR_ASSERT((conn->c_ns_close_jobs == 0) \|\| (conn->c_ns_close_jobs == 1));`
		967e4b	`@@ -1718,7 +1783,7 @@ ns_connection_post_io_or_closing_try(Connection *conn)`
		967e4b	`conn->c_ns_close_jobs++; /* now 1 active closure job */`
		967e4b	`connection_acquire_nolock_ext(conn, 1 /* allow acquire even when closing /); / event framework now has a reference */`
		967e4b	`/* Close the job asynchronously. Why? */`
		967e4b	`- ns_result_t job_result = ns_add_job(conn->c_tp, NS_JOB_TIMER, ns_handle_closure, conn, &(conn->c_job));`
		967e4b	`+ ns_result_t job_result = ns_add_job(conn->c_tp, NS_JOB_TIMER, ns_handlers[NS_HANDLER_CLOSE_CONNECTION], conn, &(conn->c_job));`
		967e4b	`if (job_result != NS_SUCCESS) {`
		967e4b	`if (job_result == NS_SHUTDOWN) {`
		967e4b	`slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post closure job "`
		967e4b	`@@ -1762,7 +1827,7 @@ ns_connection_post_io_or_closing_try(Connection *conn)`
		967e4b	`#endif`
		967e4b	`ns_result_t job_result = ns_add_io_timeout_job(conn->c_tp, conn->c_prfd, &tv,`
		967e4b	`NS_JOB_READ \| NS_JOB_PRESERVE_FD,`
		967e4b	`- ns_handle_pr_read_ready, conn, &(conn->c_job));`
		967e4b	`+ ns_handlers[NS_HANDLER_READ_CONNECTION], conn, &(conn->c_job));`
		967e4b	`if (job_result != NS_SUCCESS) {`
		967e4b	`if (job_result == NS_SHUTDOWN) {`
		967e4b	`slapi_log_err(SLAPI_LOG_INFO, "ns_connection_post_io_or_closing", "post I/O job for "`
		967e4b	`@@ -1782,61 +1847,6 @@ ns_connection_post_io_or_closing_try(Connection *conn)`
		967e4b	`return 0;`
		967e4b	`}`
		967e4b
		967e4b	`-/*`
		967e4b	`- * Tries to schedule I/O for this connection`
		967e4b	`- * If the connection is already busy with a scheduled I/O`
		967e4b	`- * it can wait until scheduled I/O is dispatched`
		967e4b	`- *`
		967e4b	`- * caller must hold c_mutex`
		967e4b	`- */`
		967e4b	`-void`
		967e4b	`-ns_connection_post_io_or_closing(Connection *conn)`
		967e4b	`-{`
		967e4b	`- while (ns_connection_post_io_or_closing_try(conn)) {`
		967e4b	`- /* Here a job is currently scheduled (c->job is set) and not yet dispatched`
		967e4b	`- * Job can be either:`
		967e4b	`- * - ns_handle_closure`
		967e4b	`- * - ns_handle_pr_read_ready`
		967e4b	`- */`
		967e4b	`-`
		967e4b	`- if (connection_is_free(conn, 0)) {`
		967e4b	`- PRStatus shutdown_status;`
		967e4b	`-`
		967e4b	`- /* The connection being freed,`
		967e4b	`- * It means that ns_handle_closure already completed and the connection`
		967e4b	`- * is no longer on the active list.`
		967e4b	`- * The scheduled job is useless and scheduling a new one as well`
		967e4b	`- */`
		967e4b	`- shutdown_status = ns_job_done(conn->c_job);`
		967e4b	`- if (shutdown_status != PR_SUCCESS) {`
		967e4b	`- slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed cancel a job on a freed connection %d !\n", conn->c_sd);`
		967e4b	`- }`
		967e4b	`- conn->c_job = NULL;`
		967e4b	`- return;`
		967e4b	`- }`
		967e4b	`- if (g_get_shutdown() && CONN_NEEDS_CLOSING(conn)) {`
		967e4b	`- PRStatus shutdown_status;`
		967e4b	`-`
		967e4b	`- /* This is shutting down cancel any scheduled job to register ns_handle_closure`
		967e4b	`- */`
		967e4b	`- shutdown_status = ns_job_done(conn->c_job);`
		967e4b	`- if (shutdown_status != PR_SUCCESS) {`
		967e4b	`- slapi_log_err(SLAPI_LOG_CRIT, "ns_connection_post_io_or_closing", "Failed to cancel a job during shutdown %d !\n", conn->c_sd);`
		967e4b	`- }`
		967e4b	`- conn->c_job = NULL;`
		967e4b	`- continue;`
		967e4b	`- }`
		967e4b	`-`
		967e4b	`- /* We are not suppose to work immediately on the connection that is taken by`
		967e4b	`- * another job`
		967e4b	`- * release the lock and give some time`
		967e4b	`- */`
		967e4b	`- PR_ExitMonitor(conn->c_mutex);`
		967e4b	`- DS_Sleep(PR_MillisecondsToInterval(100));`
		967e4b	`- PR_EnterMonitor(conn->c_mutex);`
		967e4b	`- }`
		967e4b	`-}`
		967e4b	`-`
		967e4b	`/* This function must be called without the thread flag, in the`
		967e4b	`* event loop. This function may free the connection. This can`
		967e4b	`* only be done in the event loop thread.`
		967e4b	`diff --git a/src/nunc-stans/include/nunc-stans.h b/src/nunc-stans/include/nunc-stans.h`
		967e4b	`index 192e38ec3..a0ddbdb42 100644`
		967e4b	`--- a/src/nunc-stans/include/nunc-stans.h`
		967e4b	`+++ b/src/nunc-stans/include/nunc-stans.h`
		967e4b	`@@ -959,4 +959,7 @@ ns_result_t ns_thrpool_wait(struct ns_thrpool_t *tp);`
		967e4b	`*/`
		967e4b	`ns_result_t ns_job_rearm(struct ns_job_t *job);`
		967e4b
		967e4b	`+int`
		967e4b	`+ns_job_is_func(struct ns_job_t *job, ns_job_func_t func);`
		967e4b	`+`
		967e4b	`#endif /* NS_THRPOOL_H */`
		967e4b	`diff --git a/src/nunc-stans/ns/ns_thrpool.c b/src/nunc-stans/ns/ns_thrpool.c`
		967e4b	`index d95b0c38b..774607c88 100644`
		967e4b	`--- a/src/nunc-stans/ns/ns_thrpool.c`
		967e4b	`+++ b/src/nunc-stans/ns/ns_thrpool.c`
		967e4b	`@@ -1237,6 +1237,11 @@ ns_job_rearm(ns_job_t *job)`
		967e4b	`/* Unreachable code .... */`
		967e4b	`return NS_INVALID_REQUEST;`
		967e4b	`}`
		967e4b	`+int`
		967e4b	`+ns_job_is_func(struct ns_job_t *job, ns_job_func_t func)`
		967e4b	`+{`
		967e4b	`+ return(job && job->func == func);`
		967e4b	`+}`
		967e4b
		967e4b	`static void`
		967e4b	`ns_thrpool_delete(ns_thrpool_t *tp)`
		967e4b	`--`
		967e4b	`2.17.1`
		967e4b

rpms / 389-ds-base

Source Code

Blame SOURCES/0091-Ticket-49765-Async-operations-can-hang-when-the-serv.patch