From 6016283dfdcb45bf750f96715fc653a4c0904bca Mon Sep 17 00:00:00 2001 From: Damien Ciabrini Date: Fri, 28 Jun 2019 13:34:40 +0200 Subject: [PATCH] podman: only use exec to manage container's lifecycle Under heavy IO load, podman may be impacted and take a long time to execute some actions. If that takes more than the default 20s container monitoring timeout, containers will restart unexpectedly. Replace all IO-sensitive podman calls (inspect, exists...) with equivalent "podman exec" calls, because the latter command seems less prone to performance degradation under IO load. With this commit, the resource agent now requires podman 1.0.2+, because it relies on two different patches [1,2] that improve IO performance and make it possible to distinguish the "container stopped" and "container doesn't exist" error codes. Tested on an OpenStack environment with podman 1.0.2, with the following scenarios: . regular start/stop/monitor operations . probe operations (pcs resource cleanup/refresh) . unmanage/manage operations . reboot [1] https://github.com/containers/libpod/commit/90b835db69d589de559462d988cb3fae5cf1ef49 [2] https://github.com/containers/libpod/commit/a19975f96d2ee7efe186d9aa0be42285cfafa3f4 --- heartbeat/podman | 75 ++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/heartbeat/podman b/heartbeat/podman index 51f6ba883..8fc2c4695 100755 --- a/heartbeat/podman +++ b/heartbeat/podman @@ -129,9 +129,6 @@ the health of the container. This command must return 0 to indicate that the container is healthy. A non-zero return code will indicate that the container has failed and should be recovered. -If 'podman exec' is supported, it is used to execute the command. If not, -nsenter is used. 
- Note: Using this method for monitoring processes inside a container is not recommended, as containerd tries to track processes running inside the container and does not deal well with many short-lived @@ -192,17 +189,13 @@ monitor_cmd_exec() local rc=$OCF_SUCCESS local out - if [ -z "$OCF_RESKEY_monitor_cmd" ]; then - return $rc - fi - out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1) rc=$? - if [ $rc -eq 127 ]; then - ocf_log err "monitor cmd failed (rc=$rc), output: $out" - ocf_exit_reason "monitor_cmd, ${OCF_RESKEY_monitor_cmd} , not found within container." - # there is no recovering from this, exit immediately - exit $OCF_ERR_ARGS + # 125: no container with name or ID ${CONTAINER} found + # 126: container state improper (not running) + # 127: any other error + if [ $rc -eq 125 ] || [ $rc -eq 126 ]; then + rc=$OCF_NOT_RUNNING elif [ $rc -ne 0 ]; then ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out" rc=$OCF_ERR_GENERIC @@ -215,7 +208,16 @@ monitor_cmd_exec() container_exists() { - podman inspect --format {{.State.Running}} $CONTAINER | egrep '(true|false)' >/dev/null 2>&1 + local rc + local out + + out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1) + rc=$? + # 125: no container with name or ID ${CONTAINER} found + if [ $rc -ne 125 ]; then + return 0 + fi + return 1 } remove_container() @@ -236,30 +238,30 @@ remove_container() podman_simple_status() { - local val - - # retrieve the 'Running' attribute for the container - val=$(podman inspect --format {{.State.Running}} $CONTAINER 2>/dev/null) - if [ $? -ne 0 ]; then - #not running as a result of container not being found - return $OCF_NOT_RUNNING - fi + local rc - if ocf_is_true "$val"; then - # container exists and is running - return $OCF_SUCCESS + # simple status is implemented via podman exec + # everything besides success is considered "not running" + monitor_cmd_exec + rc=$? 
+ if [ $rc -ne $OCF_SUCCESS ]; then + rc=$OCF_NOT_RUNNING; fi - - return $OCF_NOT_RUNNING + return $rc } podman_monitor() { - if [ -z "$OCF_RESKEY_monitor_cmd" ]; then - podman_simple_status - return $? - fi + # We rely on running podman exec to monitor the container + # state because that command seems to be less prone to + # performance issues under IO load. + # + # For probes to work, we expect monitor_cmd_exec to be able to report + # when a container is not running. Here, we're not interested + # in distinguishing whether it's stopped or nonexistent + # (there's the container_exists function for that) monitor_cmd_exec + return $? } podman_create_mounts() { @@ -416,14 +418,6 @@ podman_validate() exit $OCF_ERR_CONFIGURED fi - if [ -n "$OCF_RESKEY_monitor_cmd" ]; then - podman exec --help >/dev/null 2>&1 - if [ ! $? ]; then - ocf_log info "checking for nsenter, which is required when 'monitor_cmd' is specified" - check_binary nsenter - fi - fi - image_exists if [ $? -ne 0 ]; then ocf_exit_reason "base image, ${OCF_RESKEY_image}, could not be found." @@ -457,6 +451,11 @@ fi CONTAINER=$OCF_RESKEY_name +# Note: we currently monitor podman containers with the "podman exec" +# command, so make sure that invocation is always valid by enforcing the +# exec command to be non-empty +: ${OCF_RESKEY_monitor_cmd:=/bin/true} + case $__OCF_ACTION in meta-data) meta_data exit $OCF_SUCCESS;;