commit 0ea59c13caf51db459bfc6448ce8b7661778405d Author: Andrew Beekhof Date: Fri Jun 27 13:26:57 2014 +1000 Fix: lrmd: Handle systemd reporting 'done' before a resource is actually stopped (cherry picked from commit 3bd6c30adbb46891ee962cd2c1f2e191da88b808) diff --git a/lib/services/systemd.c b/lib/services/systemd.c index a28ae14..f9d6d29 100644 --- a/lib/services/systemd.c +++ b/lib/services/systemd.c @@ -422,6 +422,8 @@ systemd_unit_exec(svc_action_t * op, gboolean synchronous) if (g_strcmp0(state, "active") == 0) { op->rc = PCMK_OCF_OK; + } else if (g_strcmp0(state, "activating") == 0) { + op->rc = PCMK_OCF_PENDING; } else { op->rc = PCMK_OCF_NOT_RUNNING; } diff --git a/lrmd/lrmd.c b/lrmd/lrmd.c index 517e98f..5443fa4 100644 --- a/lrmd/lrmd.c +++ b/lrmd/lrmd.c @@ -58,6 +58,7 @@ typedef struct lrmd_cmd_s { char *origin; char *rsc_id; char *action; + char *real_action; char *output; char *userdata_str; @@ -359,7 +360,11 @@ send_cmd_complete_notify(lrmd_cmd_t * cmd) crm_xml_add(notify, F_LRMD_OPERATION, LRMD_OP_RSC_EXEC); crm_xml_add(notify, F_LRMD_RSC_ID, cmd->rsc_id); - crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->action); + if(cmd->real_action) { + crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->real_action); + } else { + crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->action); + } crm_xml_add(notify, F_LRMD_RSC_USERDATA_STR, cmd->userdata_str); crm_xml_add(notify, F_LRMD_RSC_OUTPUT, cmd->output); @@ -584,6 +589,8 @@ action_complete(svc_action_t * action) lrmd_rsc_t *rsc; lrmd_cmd_t *cmd = action->cb_data; + bool goagain = false; + if (!cmd) { crm_err("LRMD action (%s) completed does not match any known operations.", action->id); return; @@ -604,6 +611,30 @@ action_complete(svc_action_t * action) } else if (action->stdout_data) { cmd->output = strdup(action->stdout_data); } + + if (rsc && safe_str_eq(rsc->class, "systemd")) { + if(safe_str_eq(cmd->action, "start")) { + /* systemd I curse thee! 
+ * + * systemd returns from start actions after the start _begins_ + * not after it completes. + * + * So we have to jump through a few hoops so that we don't + * report 'complete' to the rest of pacemaker until, you know, + * it's actually done. + */ + goagain = true; + cmd->real_action = cmd->action; + cmd->action = strdup("monitor"); + + } else if(cmd->real_action) { + /* Ok, so this is the follow-up monitor action to check if start actually completed */ + if(cmd->lrmd_op_status == PCMK_LRM_OP_DONE && cmd->exec_rc == PCMK_OCF_PENDING) { + goagain = true; + } + } + } + #if SUPPORT_NAGIOS if (rsc && safe_str_eq(rsc->class, "nagios")) { if (safe_str_eq(cmd->action, "monitor") && @@ -612,41 +643,46 @@ action_complete(svc_action_t * action) cmd->exec_rc = PCMK_OCF_NOT_RUNNING; } else if (safe_str_eq(cmd->action, "start") && cmd->exec_rc != PCMK_OCF_OK) { - int time_sum = 0; - int timeout_left = 0; - int delay = cmd->timeout_orig / 10; + goagain = true; + } + } +#endif + + if(goagain) { + int time_sum = 0; + int timeout_left = 0; + int delay = cmd->timeout_orig / 10; # ifdef HAVE_SYS_TIMEB_H - struct timeb now = { 0, }; + struct timeb now = { 0, }; - ftime(&now); - time_sum = time_diff_ms(&now, &cmd->t_first_run); - timeout_left = cmd->timeout_orig - time_sum; - if (delay < timeout_left) { - cmd->start_delay = delay; - cmd->timeout = timeout_left; + ftime(&now); + time_sum = time_diff_ms(&now, &cmd->t_first_run); + timeout_left = cmd->timeout_orig - time_sum; + if (delay < timeout_left) { + cmd->start_delay = delay; + cmd->timeout = timeout_left; + if(cmd->exec_rc != PCMK_OCF_OK) { crm_notice ("%s %s failed (rc=%d): re-scheduling (time_sum=%dms, start_delay=%dms, timeout=%dms)", cmd->rsc_id, cmd->action, cmd->exec_rc, time_sum, cmd->start_delay, cmd->timeout); + } - cmd->lrmd_op_status = 0; - cmd->last_pid = 0; - memset(&cmd->t_run, 0, sizeof(cmd->t_run)); - memset(&cmd->t_queue, 0, sizeof(cmd->t_queue)); - free(cmd->output); - cmd->output = NULL; + 
cmd->lrmd_op_status = 0; + cmd->last_pid = 0; + memset(&cmd->t_run, 0, sizeof(cmd->t_run)); + memset(&cmd->t_queue, 0, sizeof(cmd->t_queue)); + free(cmd->output); + cmd->output = NULL; - rsc->active = NULL; - schedule_lrmd_cmd(rsc, cmd); - return; - } -# endif + rsc->active = NULL; + schedule_lrmd_cmd(rsc, cmd); + return; } +# endif } -#endif - cmd_finalize(cmd, rsc); }