commit 0ea59c13caf51db459bfc6448ce8b7661778405d
Author: Andrew Beekhof <andrew@beekhof.net>
Date: Fri Jun 27 13:26:57 2014 +1000
Fix: lrmd: Handle systemd reporting 'done' before a resource is actually stopped
(cherry picked from commit 3bd6c30adbb46891ee962cd2c1f2e191da88b808)
diff --git a/lib/services/systemd.c b/lib/services/systemd.c
index a28ae14..f9d6d29 100644
--- a/lib/services/systemd.c
+++ b/lib/services/systemd.c
@@ -422,6 +422,8 @@ systemd_unit_exec(svc_action_t * op, gboolean synchronous)
if (g_strcmp0(state, "active") == 0) {
op->rc = PCMK_OCF_OK;
+ } else if (g_strcmp0(state, "activating") == 0) {
+ op->rc = PCMK_OCF_PENDING;
} else {
op->rc = PCMK_OCF_NOT_RUNNING;
}
diff --git a/lrmd/lrmd.c b/lrmd/lrmd.c
index 517e98f..5443fa4 100644
--- a/lrmd/lrmd.c
+++ b/lrmd/lrmd.c
@@ -58,6 +58,7 @@ typedef struct lrmd_cmd_s {
char *origin;
char *rsc_id;
char *action;
+ char *real_action;
char *output;
char *userdata_str;
@@ -359,7 +360,11 @@ send_cmd_complete_notify(lrmd_cmd_t * cmd)
crm_xml_add(notify, F_LRMD_OPERATION, LRMD_OP_RSC_EXEC);
crm_xml_add(notify, F_LRMD_RSC_ID, cmd->rsc_id);
- crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->action);
+ if(cmd->real_action) {
+ crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->real_action);
+ } else {
+ crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->action);
+ }
crm_xml_add(notify, F_LRMD_RSC_USERDATA_STR, cmd->userdata_str);
crm_xml_add(notify, F_LRMD_RSC_OUTPUT, cmd->output);
@@ -584,6 +589,8 @@ action_complete(svc_action_t * action)
lrmd_rsc_t *rsc;
lrmd_cmd_t *cmd = action->cb_data;
+ bool goagain = false;
+
if (!cmd) {
crm_err("LRMD action (%s) completed does not match any known operations.", action->id);
return;
@@ -604,6 +611,30 @@ action_complete(svc_action_t * action)
} else if (action->stdout_data) {
cmd->output = strdup(action->stdout_data);
}
+
+ if (rsc && safe_str_eq(rsc->class, "systemd")) {
+ if(safe_str_eq(cmd->action, "start")) {
+ /* systemd I curse thee!
+ *
+ * systemd returns from start actions after the start _begins_
+ * not after it completes.
+ *
+ * So we have to jump through a few hoops so that we don't
+ * report 'complete' to the rest of pacemaker until, you know,
+ * its actually done.
+ */
+ goagain = true;
+ cmd->real_action = cmd->action;
+ cmd->action = strdup("monitor");
+
+ } else if(cmd->real_action) {
+ /* Ok, so this is the follow up monitor action to check if start actually completed */
+ if(cmd->lrmd_op_status == PCMK_LRM_OP_DONE && cmd->exec_rc == PCMK_OCF_PENDING) {
+ goagain = true;
+ }
+ }
+ }
+
#if SUPPORT_NAGIOS
if (rsc && safe_str_eq(rsc->class, "nagios")) {
if (safe_str_eq(cmd->action, "monitor") &&
@@ -612,41 +643,46 @@ action_complete(svc_action_t * action)
cmd->exec_rc = PCMK_OCF_NOT_RUNNING;
} else if (safe_str_eq(cmd->action, "start") && cmd->exec_rc != PCMK_OCF_OK) {
- int time_sum = 0;
- int timeout_left = 0;
- int delay = cmd->timeout_orig / 10;
+ goagain = true;
+ }
+ }
+#endif
+
+ if(goagain) {
+ int time_sum = 0;
+ int timeout_left = 0;
+ int delay = cmd->timeout_orig / 10;
# ifdef HAVE_SYS_TIMEB_H
- struct timeb now = { 0, };
+ struct timeb now = { 0, };
- ftime(&now);
- time_sum = time_diff_ms(&now, &cmd->t_first_run);
- timeout_left = cmd->timeout_orig - time_sum;
- if (delay < timeout_left) {
- cmd->start_delay = delay;
- cmd->timeout = timeout_left;
+ ftime(&now);
+ time_sum = time_diff_ms(&now, &cmd->t_first_run);
+ timeout_left = cmd->timeout_orig - time_sum;
+ if (delay < timeout_left) {
+ cmd->start_delay = delay;
+ cmd->timeout = timeout_left;
+ if(cmd->exec_rc != PCMK_OCF_OK) {
crm_notice
("%s %s failed (rc=%d): re-scheduling (time_sum=%dms, start_delay=%dms, timeout=%dms)",
cmd->rsc_id, cmd->action, cmd->exec_rc, time_sum, cmd->start_delay,
cmd->timeout);
+ }
- cmd->lrmd_op_status = 0;
- cmd->last_pid = 0;
- memset(&cmd->t_run, 0, sizeof(cmd->t_run));
- memset(&cmd->t_queue, 0, sizeof(cmd->t_queue));
- free(cmd->output);
- cmd->output = NULL;
+ cmd->lrmd_op_status = 0;
+ cmd->last_pid = 0;
+ memset(&cmd->t_run, 0, sizeof(cmd->t_run));
+ memset(&cmd->t_queue, 0, sizeof(cmd->t_queue));
+ free(cmd->output);
+ cmd->output = NULL;
- rsc->active = NULL;
- schedule_lrmd_cmd(rsc, cmd);
- return;
- }
-# endif
+ rsc->active = NULL;
+ schedule_lrmd_cmd(rsc, cmd);
+ return;
}
+# endif
}
-#endif
-
cmd_finalize(cmd, rsc);
}