From 2a080e5986331989a3164a35129e576641b2cca5 Mon Sep 17 00:00:00 2001
From: Tomas Jelinek
Date: Tue, 19 Jul 2016 16:42:44 +0200
Subject: [PATCH 1/2] allow to remove a dead node from a cluster

---
Notes (reviewer commentary, not part of the commit message):
  * destroy_cluster() gains keep_going (default False). When it is true, a
    failure reported by utils.destroyCluster is printed as a warning
    instead of being passed to utils.err().
  * cluster_node() skips the utils.checkAuthorization() check only when a
    node is being removed and --force is given. When the check fails for
    a removal, ", use --force to override" is appended to the message.
  * The "not yet authenticated" message is built from two literals; the
    second one must not start with a space, otherwise the output contains
    a double space before "(try pcs cluster auth ...)".

 pcs/cluster.py | 41 +++++++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/pcs/cluster.py b/pcs/cluster.py
index baa0f44..7a8615d 100644
--- a/pcs/cluster.py
+++ b/pcs/cluster.py
@@ -1076,7 +1076,7 @@ def disable_cluster_nodes(nodes):
     if len(error_list) > 0:
         utils.err("unable to disable all nodes\n" + "\n".join(error_list))
 
-def destroy_cluster(argv):
+def destroy_cluster(argv, keep_going=False):
     if len(argv) > 0:
         # stop pacemaker and resources while cluster is still quorate
         nodes = argv
@@ -1085,7 +1085,14 @@ def destroy_cluster(argv):
         # destroy will stop any remaining cluster daemons
         error_list = parallel_for_nodes(utils.destroyCluster, nodes, quiet=True)
         if error_list:
-            utils.err("unable to destroy cluster\n" + "\n".join(error_list))
+            if keep_going:
+                print(
+                    "Warning: unable to destroy cluster\n"
+                    +
+                    "\n".join(error_list)
+                )
+            else:
+                utils.err("unable to destroy cluster\n" + "\n".join(error_list))
 
 def stop_cluster(argv):
     if len(argv) > 0:
@@ -1347,19 +1354,25 @@ def cluster_node(argv):
 
     node = argv[1]
     node0, node1 = utils.parse_multiring_node(node)
-
     if not node0:
         utils.err("missing ring 0 address of the node")
-    status,output = utils.checkAuthorization(node0)
-    if status == 2:
-        utils.err("pcsd is not running on %s" % node0)
-    elif status == 3:
-        utils.err(
-            "%s is not yet authenticated (try pcs cluster auth %s)"
-            % (node0, node0)
-        )
-    elif status != 0:
-        utils.err(output)
+
+    # allow to continue if removing a node with --force
+    if add_node or "--force" not in utils.pcs_options:
+        status, output = utils.checkAuthorization(node0)
+        if status != 0:
+            if status == 2:
+                msg = "pcsd is not running on {0}".format(node0)
+            elif status == 3:
+                msg = (
+                    "{node} is not yet authenticated "
+                    + "(try pcs cluster auth {node})"
+                ).format(node=node0)
+            else:
+                msg = output
+            if not add_node:
+                msg += ", use --force to override"
+            utils.err(msg)
 
     if add_node == True:
         wait = False
@@ -1540,7 +1553,7 @@ def cluster_node(argv):
 
         nodesRemoved = False
         c_nodes = utils.getNodesFromCorosyncConf()
-        destroy_cluster([node0])
+        destroy_cluster([node0], keep_going=("--force" in utils.pcs_options))
         for my_node in c_nodes:
             if my_node == node0:
                 continue
-- 
1.8.3.1

From c48716233ace08c16e7e4b66075aebeca9366321 Mon Sep 17 00:00:00 2001
From: Tomas Jelinek
Date: Wed, 20 Jul 2016 10:01:13 +0200
Subject: [PATCH 2/2] gui: allow to remove a dead node from a cluster

---
Notes (reviewer commentary, not part of the commit message):
  * remote_remove_nodes: a non-zero exit of "pcs cluster stop" now aborts
    with HTTP 400 only when params['force'] is not set; with force the
    handler continues to the node removal loop.
  * The 400 response body gets ", use --force to override" appended unless
    the text from pcs already contains it.

 pcsd/remote.rb | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pcsd/remote.rb b/pcsd/remote.rb
index 25fb74d..05a6d03 100644
--- a/pcsd/remote.rb
+++ b/pcsd/remote.rb
@@ -837,8 +837,15 @@ def remote_remove_nodes(params, request, auth_user)
   stdout, stderr, retval = run_cmd(
     auth_user, PCS, "cluster", "stop", *stop_params
   )
-  if retval != 0
-    return [400, stderr.join]
+  if retval != 0 and not params['force']
+    # If forced, keep going even if unable to stop all nodes (they may be dead).
+    # Add info this error is forceable if pcs did not do it (e.g. when unable
+    # to connect to some nodes).
+    message = stderr.join
+    if not message.include?(', use --force to override')
+      message += ', use --force to override'
+    end
+    return [400, message]
   end
 
   node_list.each {|node|
-- 
1.8.3.1