diff --git a/include/crm/common/logging.h b/include/crm/common/logging.h index 5b8e47f..642fa92 100644 --- a/include/crm/common/logging.h +++ b/include/crm/common/logging.h @@ -157,7 +157,7 @@ unsigned int get_crm_log_level(void); } \ } while(0) -# define do_crm_log_always(level, fmt, args...) qb_log(level, "%s: " fmt, __FUNCTION__ , ##args) +# define do_crm_log_always(level, fmt, args...) qb_log(level, fmt , ##args) # define crm_perror(level, fmt, args...) do { \ const char *err = strerror(errno); \ diff --git a/include/crm/services.h b/include/crm/services.h index 5310709..1a02656 100644 --- a/include/crm/services.h +++ b/include/crm/services.h @@ -262,8 +262,9 @@ enum nagios_exitcode { */ svc_action_t *services_action_create_generic(const char *exec, const char *args[]); - void - services_action_free(svc_action_t * op); + void services_action_cleanup(svc_action_t * op); + + void services_action_free(svc_action_t * op); gboolean services_action_sync(svc_action_t * op); diff --git a/lib/cib/cib_utils.c b/lib/cib/cib_utils.c index 15b354b..ea37c4b 100644 --- a/lib/cib/cib_utils.c +++ b/lib/cib/cib_utils.c @@ -112,8 +112,13 @@ get_cib_copy(cib_t * cib) { xmlNode *xml_cib; int options = cib_scope_local | cib_sync_call; - int rc = cib->cmds->query(cib, NULL, &xml_cib, options); + int rc = pcmk_ok; + + if (cib->state == cib_disconnected) { + return NULL; + } + rc = cib->cmds->query(cib, NULL, &xml_cib, options); if (rc == -EACCES) { return NULL; diff --git a/lib/services/dbus.c b/lib/services/dbus.c index c0153b5..f44b590 100644 --- a/lib/services/dbus.c +++ b/lib/services/dbus.c @@ -145,7 +145,7 @@ DBusMessage *pcmk_dbus_send_recv(DBusMessage *msg, DBusConnection *connection, D return reply; } -bool pcmk_dbus_send(DBusMessage *msg, DBusConnection *connection, +DBusPendingCall* pcmk_dbus_send(DBusMessage *msg, DBusConnection *connection, void(*done)(DBusPendingCall *pending, void *user_data), void *user_data) { DBusError error; @@ -161,27 +161,30 @@ bool pcmk_dbus_send(DBusMessage *msg, DBusConnection *connection, // send message and get a handle for a reply if (!dbus_connection_send_with_reply (connection, msg, &pending, -1/* aka. DBUS_TIMEOUT_USE_DEFAULT */)) { // -1 is default timeout crm_err("Send with reply failed for %s", method); - return FALSE; + return NULL; } else if (pending == NULL) { crm_err("No pending call found for %s", method); - return FALSE; - + return NULL; } + crm_trace("DBus %s call sent", method); if (dbus_pending_call_get_completed(pending)) { - crm_info("DBus %s call completed too soon"); -#if 1 + crm_info("DBus %s call completed too soon", method); + if(done) { +#if 0 /* This sounds like a good idea, but allegedly it breaks things */ done(pending, user_data); + pending = NULL; #else CRM_ASSERT(dbus_pending_call_set_notify(pending, done, user_data, NULL)); #endif + } - } else { + } else if(done) { CRM_ASSERT(dbus_pending_call_set_notify(pending, done, user_data, NULL)); } - return TRUE; + return pending; } bool pcmk_dbus_type_check(DBusMessage *msg, DBusMessageIter *field, int expected, const char *function, int line) @@ -286,6 +289,11 @@ pcmk_dbus_lookup_result(DBusMessage *reply, struct db_getall_data *data) dbus_message_iter_next (&dict); } + if(data->name && data->callback) { + crm_trace("No value for property %s[%s]", data->object, data->name); + data->callback(data->name, NULL, data->userdata); + } + cleanup: free(data->target); free(data->object); @@ -306,6 +314,9 @@ pcmk_dbus_lookup_cb(DBusPendingCall *pending, void *user_data) pcmk_dbus_lookup_result(reply, user_data); + if(pending) { + dbus_pending_call_unref(pending); + } if(reply) { dbus_message_unref(reply); } @@ -375,20 +386,59 @@ static void pcmk_dbus_connection_dispatch(DBusConnection *connection, DBusDispat crm_trace("status %d for %p", new_status, data); if (new_status == DBUS_DISPATCH_DATA_REMAINS){ dbus_connection_dispatch(connection); + + while (dbus_connection_get_dispatch_status(connection) == DBUS_DISPATCH_DATA_REMAINS) { + dbus_connection_dispatch(connection); + } } } +/* Copied from dbus-watch.c */ + +static const char* +dbus_watch_flags_to_string (int flags) +{ + const char *watch_type; + + if ((flags & DBUS_WATCH_READABLE) && + (flags & DBUS_WATCH_WRITABLE)) + watch_type = "readwrite"; + else if (flags & DBUS_WATCH_READABLE) + watch_type = "read"; + else if (flags & DBUS_WATCH_WRITABLE) + watch_type = "write"; + else + watch_type = "not read or write"; + return watch_type; +} + static int pcmk_dbus_watch_dispatch(gpointer userdata) { + bool oom = FALSE; DBusWatch *watch = userdata; int flags = dbus_watch_get_flags(watch); + bool enabled = dbus_watch_get_enabled (watch); + mainloop_io_t *client = dbus_watch_get_data(watch); - crm_trace("Dispatching %p with flags %d", watch, flags); - if(flags & DBUS_WATCH_READABLE) { - dbus_watch_handle(watch, DBUS_WATCH_READABLE); - } else { - dbus_watch_handle(watch, DBUS_WATCH_ERROR); + crm_trace("Dispatching client %p: %s", client, dbus_watch_flags_to_string(flags)); + if (enabled && is_set(flags, DBUS_WATCH_READABLE)) { + oom = !dbus_watch_handle(watch, flags); + + } else if (enabled && is_set(flags, DBUS_WATCH_READABLE)) { + oom = !dbus_watch_handle(watch, flags); + + } else if(enabled) { + oom = !dbus_watch_handle(watch, DBUS_WATCH_ERROR); + } + + if(flags != dbus_watch_get_flags(watch)) { + flags = dbus_watch_get_flags(watch); + crm_trace("Dispatched client %p: %s (%d)", client, dbus_watch_flags_to_string(flags), flags); + } + + if(oom) { + crm_err("DBus encountered OOM while attempting to dispatch %p (%s)", client, dbus_watch_flags_to_string(flags)); } return 0; } @@ -396,7 +446,8 @@ pcmk_dbus_watch_dispatch(gpointer userdata) static void pcmk_dbus_watch_destroy(gpointer userdata) { - crm_trace("Destroyed %p", userdata); + mainloop_io_t *client = dbus_watch_get_data(userdata); + crm_trace("Destroyed %p", client); } @@ -412,7 +463,7 @@ pcmk_dbus_watch_add(DBusWatch *watch, void *data){ mainloop_io_t *client = mainloop_add_fd( "dbus", G_PRIORITY_DEFAULT, fd, watch, &pcmk_dbus_cb); - crm_trace("Added %p with fd=%d", watch, fd); + crm_trace("Added watch %p with fd=%d to client %p", watch, fd, client); dbus_watch_set_data(watch, client, NULL); return TRUE; } @@ -429,14 +480,14 @@ static void pcmk_dbus_watch_remove(DBusWatch *watch, void *data){ mainloop_io_t *client = dbus_watch_get_data(watch); - crm_trace("Removed %p", watch); + crm_trace("Removed client %p (%p)", client, data); mainloop_del_fd(client); } static gboolean pcmk_dbus_timeout_dispatch(gpointer data) { - crm_trace("Timeout for %p"); + crm_info("Timeout %p expired", data); dbus_timeout_handle(data); return FALSE; } @@ -445,6 +496,8 @@ static dbus_bool_t pcmk_dbus_timeout_add(DBusTimeout *timeout, void *data){ guint id = g_timeout_add(dbus_timeout_get_interval(timeout), pcmk_dbus_timeout_dispatch, timeout); + crm_trace("Adding timeout %p (%ld)", timeout, dbus_timeout_get_interval(timeout)); + if(id) { dbus_timeout_set_data(timeout, GUINT_TO_POINTER(id), NULL); } @@ -456,6 +509,8 @@ pcmk_dbus_timeout_remove(DBusTimeout *timeout, void *data){ void *vid = dbus_timeout_get_data(timeout); guint id = GPOINTER_TO_UINT(vid); + crm_trace("Removing timeout %p (%p)", timeout, data); + if(id) { g_source_remove(id); dbus_timeout_set_data(timeout, 0, NULL); @@ -464,7 +519,11 @@ pcmk_dbus_timeout_remove(DBusTimeout *timeout, void *data){ static void pcmk_dbus_timeout_toggle(DBusTimeout *timeout, void *data){ - if(dbus_timeout_get_enabled(timeout)) { + bool enabled = dbus_timeout_get_enabled(timeout); + + crm_trace("Toggling timeout for %p to %s", timeout, enabled?"off":"on"); + + if(enabled) { pcmk_dbus_timeout_add(timeout, data); } else { pcmk_dbus_timeout_remove(timeout, data); diff --git a/lib/services/pcmk-dbus.h b/lib/services/pcmk-dbus.h index ed80c5f..468020e 100644 --- a/lib/services/pcmk-dbus.h +++ b/lib/services/pcmk-dbus.h @@ -2,7 +2,7 @@ DBusConnection *pcmk_dbus_connect(void); void pcmk_dbus_connection_setup_with_select(DBusConnection *c); void pcmk_dbus_disconnect(DBusConnection *connection); -bool pcmk_dbus_send(DBusMessage *msg, DBusConnection *connection, +DBusPendingCall *pcmk_dbus_send(DBusMessage *msg, DBusConnection *connection, void(*done)(DBusPendingCall *pending, void *user_data), void *user_data); DBusMessage *pcmk_dbus_send_recv(DBusMessage *msg, DBusConnection *connection, DBusError *error); bool pcmk_dbus_type_check(DBusMessage *msg, DBusMessageIter *field, int expected, const char *function, int line); diff --git a/lib/services/services.c b/lib/services/services.c index 9936c72..582fbe1 100644 --- a/lib/services/services.c +++ b/lib/services/services.c @@ -303,18 +303,24 @@ services_action_create_generic(const char *exec, const char *args[]) } void -services_action_free(svc_action_t * op) +services_action_cleanup(svc_action_t * op) { - unsigned int i; - - if (op == NULL) { - return; + if(op->opaque->timerid != 0) { + crm_trace("Removing timer for call %s to %s", op->action, op->rsc); + g_source_remove(op->opaque->timerid); + op->opaque->timerid = 0; } - if (op->opaque->repeat_timer) { - g_source_remove(op->opaque->repeat_timer); - op->opaque->repeat_timer = 0; + if(op->opaque->pending) { + crm_trace("Cleaning up pending dbus call %p %s for %s", op->opaque->pending, op->action, op->rsc); + if(dbus_pending_call_get_completed(op->opaque->pending)) { + crm_warn("Pending dbus call %s for %s did not complete", op->action, op->rsc); + } + dbus_pending_call_cancel(op->opaque->pending); + dbus_pending_call_unref(op->opaque->pending); + op->opaque->pending = NULL; } + if (op->opaque->stderr_gsource) { mainloop_del_fd(op->opaque->stderr_gsource); op->opaque->stderr_gsource = NULL; @@ -324,6 +330,23 @@ services_action_free(svc_action_t * op) mainloop_del_fd(op->opaque->stdout_gsource); op->opaque->stdout_gsource = NULL; } +} + +void +services_action_free(svc_action_t * op) +{ + unsigned int i; + + if (op == NULL) { + return; + } + + services_action_cleanup(op); + + if (op->opaque->repeat_timer) { + g_source_remove(op->opaque->repeat_timer); + op->opaque->repeat_timer = 0; + } free(op->id); free(op->opaque->exec); diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c index 2279e4e..8d6f450 100644 --- a/lib/services/services_linux.c +++ b/lib/services/services_linux.c @@ -264,6 +264,8 @@ operation_finalize(svc_action_t * op) services_action_free(op); return TRUE; } + + services_action_cleanup(op); return FALSE; } diff --git a/lib/services/services_private.h b/lib/services/services_private.h index dd759e3..bcf882c 100644 --- a/lib/services/services_private.h +++ b/lib/services/services_private.h @@ -19,6 +19,10 @@ #ifndef __MH_SERVICES_PRIVATE_H__ # define __MH_SERVICES_PRIVATE_H__ +#if SUPPORT_DBUS +# include +#endif + struct svc_action_private_s { char *exec; char *args[255]; @@ -31,6 +35,10 @@ struct svc_action_private_s { int stdout_fd; mainloop_io_t *stdout_gsource; +#if SUPPORT_DBUS + DBusPendingCall* pending; + unsigned timerid; +#endif }; GList *services_os_get_directory_list(const char *root, gboolean files, gboolean executable); diff --git a/lib/services/systemd.c b/lib/services/systemd.c index 9a7b078..51ade44 100644 --- a/lib/services/systemd.c +++ b/lib/services/systemd.c @@ -110,20 +110,48 @@ systemd_service_name(const char *name) return g_strdup_printf("%s.service", name); } -static bool -systemd_daemon_reload(void) +static void +systemd_daemon_reload_complete(DBusPendingCall *pending, void *user_data) { - /* TODO: Make this asynchronous */ - const char *method = "Reload"; + DBusError error; DBusMessage *reply = NULL; - DBusMessage *msg = systemd_new_method(BUS_NAME".Manager", method); + unsigned int reload_count = GPOINTER_TO_UINT(user_data); - CRM_ASSERT(msg != NULL); - reply = pcmk_dbus_send_recv(msg, systemd_proxy, NULL); - dbus_message_unref(msg); + dbus_error_init(&error); + if(pending) { + reply = dbus_pending_call_steal_reply(pending); + } + + if(pcmk_dbus_find_error("Reload", pending, reply, &error)) { + crm_err("Could not issue systemd reload %d: %s", reload_count, error.message); + + } else { + crm_trace("Reload %d complete", reload_count); + } + + if(pending) { + dbus_pending_call_unref(pending); + } if(reply) { dbus_message_unref(reply); } +} + +static bool +systemd_daemon_reload(void) +{ + static unsigned int reload_count = 0; + const char *method = "Reload"; + + + reload_count++; + if(reload_count % 10 == 0) { + DBusMessage *msg = systemd_new_method(BUS_NAME".Manager", method); + + CRM_ASSERT(msg != NULL); + pcmk_dbus_send(msg, systemd_proxy, systemd_daemon_reload_complete, GUINT_TO_POINTER(reload_count)); + dbus_message_unref(msg); + } return TRUE; } @@ -155,13 +183,22 @@ static void systemd_loadunit_cb(DBusPendingCall *pending, void *user_data) { DBusMessage *reply = NULL; + svc_action_t * op = user_data; if(pending) { reply = dbus_pending_call_steal_reply(pending); } + if(op) { + crm_trace("Got result: %p for %p for %s, %s", reply, pending, op->rsc, op->action); + } else { + crm_trace("Got result: %p for %p", reply, pending); + } systemd_loadunit_result(reply, user_data); + if(pending) { + dbus_pending_call_unref(pending); + } if(reply) { dbus_message_unref(reply); } @@ -213,6 +250,7 @@ systemd_unit_by_name(const gchar * arg_name, svc_action_t *op) } pcmk_dbus_send(msg, systemd_proxy, systemd_loadunit_cb, op); + dbus_message_unref(msg); return NULL; } @@ -421,6 +459,12 @@ systemd_async_dispatch(DBusPendingCall *pending, void *user_data) reply = dbus_pending_call_steal_reply(pending); } + if(op) { + crm_trace("Got result: %p for %p for %s, %s", reply, pending, op->rsc, op->action); + } else { + crm_trace("Got result: %p for %p", reply, pending); + } + op->opaque->pending = NULL; systemd_exec_result(reply, op); if(pending) { @@ -437,10 +481,13 @@ static void systemd_unit_check(const char *name, const char *state, void *userdata) { svc_action_t * op = userdata; - - CRM_ASSERT(state != NULL); - if (g_strcmp0(state, "active") == 0) { + crm_trace("Resource %s has %s='%s'", op->rsc, name, state); + + if(state == NULL) { + op->rc = PCMK_OCF_NOT_RUNNING; + + } else if (g_strcmp0(state, "active") == 0) { op->rc = PCMK_OCF_OK; } else if (g_strcmp0(state, "activating") == 0) { op->rc = PCMK_OCF_PENDING; @@ -449,6 +496,7 @@ systemd_unit_check(const char *name, const char *state, void *userdata) } if (op->synchronous == FALSE) { + op->opaque->pending = NULL; operation_finalize(op); } } @@ -539,28 +587,29 @@ systemd_unit_exec_with_unit(svc_action_t * op, const char *unit) } if (op->synchronous == FALSE) { - return pcmk_dbus_send(msg, systemd_proxy, systemd_async_dispatch, op); + DBusPendingCall* pending = pcmk_dbus_send(msg, systemd_proxy, systemd_async_dispatch, op); + + dbus_message_unref(msg); + if(pending) { + dbus_pending_call_ref(pending); + op->opaque->pending = pending; + return TRUE; + } + return FALSE; } else { DBusError error; reply = pcmk_dbus_send_recv(msg, systemd_proxy, &error); + dbus_message_unref(msg); systemd_exec_result(reply, op); if(reply) { dbus_message_unref(reply); } - if(msg) { - dbus_message_unref(msg); - } - return FALSE; } - if(msg) { - dbus_message_unref(msg); - } - cleanup: if (op->synchronous == FALSE) { operation_finalize(op); @@ -570,6 +619,18 @@ systemd_unit_exec_with_unit(svc_action_t * op, const char *unit) return op->rc == PCMK_OCF_OK; } +static gboolean +systemd_timeout_callback(gpointer p) +{ + svc_action_t * op = p; + + op->opaque->timerid = 0; + crm_warn("%s operation on systemd unit %s named '%s' timed out", op->action, op->agent, op->rsc); + operation_finalize(op); + + return FALSE; +} + gboolean systemd_unit_exec(svc_action_t * op) { @@ -596,6 +657,7 @@ systemd_unit_exec(svc_action_t * op) free(unit); if (op->synchronous == FALSE) { + op->opaque->timerid = g_timeout_add(op->timeout + 5000, systemd_timeout_callback, op); return TRUE; } diff --git a/lib/services/upstart.c b/lib/services/upstart.c index 4c7211d..01ff817 100644 --- a/lib/services/upstart.c +++ b/lib/services/upstart.c @@ -513,8 +513,15 @@ upstart_job_exec(svc_action_t * op, gboolean synchronous) CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_BOOLEAN, &arg_wait, DBUS_TYPE_INVALID)); if (op->synchronous == FALSE) { + DBusPendingCall* pending = pcmk_dbus_send(msg, upstart_proxy, upstart_async_dispatch, op); free(job); - return pcmk_dbus_send(msg, upstart_proxy, upstart_async_dispatch, op); + + if(pending) { + dbus_pending_call_ref(pending); + op->opaque->pending = pending; + return TRUE; + } + return FALSE; } dbus_error_init(&error); diff --git a/lrmd/regression.py.in b/lrmd/regression.py.in index a9a32ef..649c984 100755 --- a/lrmd/regression.py.in +++ b/lrmd/regression.py.in @@ -27,12 +27,12 @@ build_dir="@abs_top_builddir@" test_dir=sys.path[0] new_path=os.environ['PATH'] - if os.path.exists("%s/regression.py.in" % test_dir): print "Running tests from the source tree: %s (%s)" % (build_dir, test_dir) new_path = "%s/lrmd:%s" % (build_dir, new_path) # For lrmd, lrmd_test and pacemaker_remoted new_path = "%s/tools:%s" % (build_dir, new_path) # For crm_resource new_path = "%s/fencing:%s" % (build_dir, new_path) # For stonithd + else: print "Running tests from the install tree: @CRM_DAEMON_DIR@ (not %s)" % test_dir new_path = "@CRM_DAEMON_DIR@:%s" % (new_path) # For stonithd, lrmd, lrmd_test and pacemaker_remoted @@ -434,13 +434,15 @@ if __name__ == "__main__": for ra in [ "Dummy", "Stateful", "ping" ]: os.system("cp %s/extra/resources/%s @OCF_RA_DIR@/pacemaker/%s" % (build_dir, ra, ra)) os.system("chmod a+x @OCF_RA_DIR@/pacemaker/%s" % (ra)) - else: - # Assume it's installed - print "Using @datadir@/@PACKAGE@/tests/cts/LSBDummy" - os.system("cp @datadir@/@PACKAGE@/tests/cts/LSBDummy /etc/init.d/LSBDummy") - os.system("chmod a+x /etc/init.d/LSBDummy") - os.system("ls -al /etc/init.d/LSBDummy") + else: + # Assume it's installed + print "Using @datadir@/@PACKAGE@/tests/cts/LSBDummy" + os.system("cp @datadir@/@PACKAGE@/tests/cts/LSBDummy /etc/init.d/LSBDummy") + + os.system("chmod a+x /etc/init.d/LSBDummy") + os.system("ls -al /etc/init.d/LSBDummy") + os.system("mkdir -p @CRM_CORE_DIR@/root") if os.path.exists("/bin/systemctl"): @@ -747,6 +749,33 @@ if __name__ == "__main__": test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+ "-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ") + ### stress tests ### + def build_stress_tests(self): + timeout = "-t 20000" + iterations = 25 + + test = self.new_test("ocf_stress", "Verify systemd dbus connection works under load") + for i in range(iterations): + test.add_cmd("-c register_rsc -r rsc_%s %s -C ocf -P heartbeat -T Dummy -l \"NEW_EVENT event_type:register rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i)) + test.add_cmd("-c exec -r rsc_%s -a start %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:start rc:ok op_status:complete\"" % (i, timeout, i)) + test.add_cmd("-c exec -r rsc_%s -a monitor %s -i 1000 -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:monitor rc:ok op_status:complete\"" % (i, timeout, i)) + for i in range(iterations): + test.add_cmd("-c exec -r rsc_%s -a stop %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:stop rc:ok op_status:complete\"" % (i, timeout, i)) + test.add_cmd("-c unregister_rsc -r rsc_%s %s -l \"NEW_EVENT event_type:unregister rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i)) + + + if "systemd" in self.rsc_classes: + test = self.new_test("systemd_stress", "Verify systemd dbus connection works under load") + for i in range(iterations): + test.add_cmd("-c register_rsc -r rsc_%s %s -C systemd -T lrmd_dummy_daemon -l \"NEW_EVENT event_type:register rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i)) + test.add_cmd("-c exec -r rsc_%s -a start %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:start rc:ok op_status:complete\"" % (i, timeout, i)) + test.add_cmd("-c exec -r rsc_%s -a monitor %s -i 1000 -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:monitor rc:ok op_status:complete\"" % (i, timeout, i)) + + for i in range(iterations): + test.add_cmd("-c exec -r rsc_%s -a stop %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:stop rc:ok op_status:complete\"" % (i, timeout, i)) + test.add_cmd("-c unregister_rsc -r rsc_%s %s -l \"NEW_EVENT event_type:unregister rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i)) + + ### These are tests that target specific cases ### def build_custom_tests(self): @@ -1016,6 +1045,7 @@ def main(argv): tests.build_multi_rsc_tests() tests.build_negative_tests() tests.build_custom_tests() + tests.build_stress_tests() tests.setup_test_environment()