From cf68ded959ad03244c94de308b79fc1af806a474 Mon Sep 17 00:00:00 2001 From: Ondrej Mular Date: Wed, 15 Sep 2021 07:55:50 +0200 Subject: [PATCH 1/2] fix unfencing in `pcs stonith update-scsi-devices` * do not unfence newly added devices on fenced cluster nodes --- pcs/common/reports/codes.py | 6 ++ pcs/common/reports/messages.py | 41 +++++++ pcs/lib/commands/scsi.py | 55 +++++++++- pcs/lib/commands/stonith.py | 26 +++-- pcs/lib/communication/scsi.py | 40 ++++--- .../tier0/common/reports/test_messages.py | 24 +++++ pcs_test/tier0/lib/commands/test_scsi.py | 101 ++++++++++++++++-- .../test_stonith_update_scsi_devices.py | 87 ++++++++++++--- .../tools/command_env/config_http_scsi.py | 16 ++- .../tools/command_env/config_runner_scsi.py | 36 ++++++- pcsd/api_v1.rb | 2 +- pcsd/capabilities.xml | 8 +- 12 files changed, 387 insertions(+), 55 deletions(-) diff --git a/pcs/common/reports/codes.py b/pcs/common/reports/codes.py index bbd61500..4bee0bac 100644 --- a/pcs/common/reports/codes.py +++ b/pcs/common/reports/codes.py @@ -468,6 +468,12 @@ STONITH_RESTARTLESS_UPDATE_UNSUPPORTED_AGENT = M( "STONITH_RESTARTLESS_UPDATE_UNSUPPORTED_AGENT" ) STONITH_UNFENCING_FAILED = M("STONITH_UNFENCING_FAILED") +STONITH_UNFENCING_DEVICE_STATUS_FAILED = M( + "STONITH_UNFENCING_DEVICE_STATUS_FAILED" +) +STONITH_UNFENCING_SKIPPED_DEVICES_FENCED = M( + "STONITH_UNFENCING_SKIPPED_DEVICES_FENCED" +) STONITH_RESTARTLESS_UPDATE_UNABLE_TO_PERFORM = M( "STONITH_RESTARTLESS_UPDATE_UNABLE_TO_PERFORM" ) diff --git a/pcs/common/reports/messages.py b/pcs/common/reports/messages.py index f9688437..be8dd154 100644 --- a/pcs/common/reports/messages.py +++ b/pcs/common/reports/messages.py @@ -2782,6 +2782,47 @@ class StonithUnfencingFailed(ReportItemMessage): return f"Unfencing failed:\n{self.reason}" +@dataclass(frozen=True) +class StonithUnfencingDeviceStatusFailed(ReportItemMessage): + """ + Unfencing failed on a cluster node. + """ + + device: str + reason: str + + _code = codes.STONITH_UNFENCING_DEVICE_STATUS_FAILED + + @property + def message(self) -> str: + return ( + "Unfencing failed, unable to check status of device " + f"'{self.device}': {self.reason}" + ) + + +@dataclass(frozen=True) +class StonithUnfencingSkippedDevicesFenced(ReportItemMessage): + """ + Unfencing skipped on a cluster node, because fenced devices were found on + the node. + """ + + devices: List[str] + + _code = codes.STONITH_UNFENCING_SKIPPED_DEVICES_FENCED + + @property + def message(self) -> str: + return ( + "Unfencing skipped, {device_pl} {devices} {is_pl} fenced" + ).format( + device_pl=format_plural(self.devices, "device"), + devices=format_list(self.devices), + is_pl=format_plural(self.devices, "is", "are"), + ) + + @dataclass(frozen=True) class StonithRestartlessUpdateUnableToPerform(ReportItemMessage): """ diff --git a/pcs/lib/commands/scsi.py b/pcs/lib/commands/scsi.py index 31a3ef2d..ff20a563 100644 --- a/pcs/lib/commands/scsi.py +++ b/pcs/lib/commands/scsi.py @@ -8,20 +8,65 @@ from pcs.lib.env import LibraryEnvironment from pcs.lib.errors import LibraryError -def unfence_node(env: LibraryEnvironment, node: str, devices: Iterable[str]): +def unfence_node( + env: LibraryEnvironment, + node: str, + original_devices: Iterable[str], + updated_devices: Iterable[str], +) -> None: """ - Unfence scsi devices on a node by calling fence_scsi agent script. + Unfence scsi devices on a node by calling fence_scsi agent script. Only + newly added devices will be unfenced (set(updated_devices) - + set(original_devices)). Before unfencing, original devices are be checked + if any of them are not fenced. If there is a fenced device, unfencing will + be skipped. env -- provides communication with externals node -- node name on wich is unfencing performed - devices -- scsi devices to be unfenced + original_devices -- list of devices defined before update + updated_devices -- list of devices defined after update """ + devices_to_unfence = set(updated_devices) - set(original_devices) + if not devices_to_unfence: + return + fence_scsi_bin = os.path.join(settings.fence_agent_binaries, "fence_scsi") + fenced_devices = [] + for device in original_devices: + stdout, stderr, return_code = env.cmd_runner().run( + [ + fence_scsi_bin, + "--action=status", + f"--devices={device}", + f"--plug={node}", + ] + ) + if return_code == 2: + fenced_devices.append(device) + elif return_code != 0: + raise LibraryError( + reports.ReportItem.error( + reports.messages.StonithUnfencingDeviceStatusFailed( + device, join_multilines([stderr, stdout]) + ) + ) + ) + if fenced_devices: + # At least one of existing devices is off, which means the node has + # been fenced and new devices should not be unfenced. + env.report_processor.report( + reports.ReportItem.info( + reports.messages.StonithUnfencingSkippedDevicesFenced( + fenced_devices + ) + ) + ) + return stdout, stderr, return_code = env.cmd_runner().run( [ - os.path.join(settings.fence_agent_binaries, "fence_scsi"), + fence_scsi_bin, "--action=on", "--devices", - ",".join(sorted(devices)), + ",".join(sorted(devices_to_unfence)), f"--plug={node}", ], ) diff --git a/pcs/lib/commands/stonith.py b/pcs/lib/commands/stonith.py index 6f26e7d3..0dcf44f2 100644 --- a/pcs/lib/commands/stonith.py +++ b/pcs/lib/commands/stonith.py @@ -453,7 +453,8 @@ def _update_scsi_devices_get_element_and_devices( def _unfencing_scsi_devices( env: LibraryEnvironment, - device_list: Iterable[str], + original_devices: Iterable[str], + updated_devices: Iterable[str], force_flags: Container[reports.types.ForceCode] = (), ) -> None: """ @@ -461,9 +462,13 @@ def _unfencing_scsi_devices( to pcsd and corosync is running. env -- provides all for communication with externals - device_list -- devices to be unfenced + original_devices -- devices before update + updated_devices -- devices after update force_flags -- list of flags codes """ + devices_to_unfence = set(updated_devices) - set(original_devices) + if not devices_to_unfence: + return cluster_nodes_names, nodes_report_list = get_existing_nodes_names( env.get_corosync_conf(), error_on_missing_name=True, @@ -487,7 +492,11 @@ def _unfencing_scsi_devices( online_corosync_target_list = run_and_raise( env.get_node_communicator(), com_cmd ) - com_cmd = Unfence(env.report_processor, sorted(device_list)) + com_cmd = Unfence( + env.report_processor, + original_devices=sorted(original_devices), + updated_devices=sorted(updated_devices), + ) com_cmd.set_targets(online_corosync_target_list) run_and_raise(env.get_node_communicator(), com_cmd) @@ -531,9 +540,9 @@ def update_scsi_devices( IdProvider(stonith_el), set_device_list, ) - devices_for_unfencing = set(set_device_list).difference(current_device_list) - if devices_for_unfencing: - _unfencing_scsi_devices(env, devices_for_unfencing, force_flags) + _unfencing_scsi_devices( + env, current_device_list, set_device_list, force_flags + ) env.push_cib() @@ -585,6 +594,7 @@ def update_scsi_devices_add_remove( IdProvider(stonith_el), updated_device_set, ) - if add_device_list: - _unfencing_scsi_devices(env, add_device_list, force_flags) + _unfencing_scsi_devices( + env, current_device_list, updated_device_set, force_flags + ) env.push_cib() diff --git a/pcs/lib/communication/scsi.py b/pcs/lib/communication/scsi.py index 7b272017..250d67aa 100644 --- a/pcs/lib/communication/scsi.py +++ b/pcs/lib/communication/scsi.py @@ -1,4 +1,5 @@ import json +from typing import Iterable from dacite import DaciteError @@ -26,9 +27,15 @@ class Unfence( MarkSuccessfulMixin, RunRemotelyBase, ): - def __init__(self, report_processor, devices): + def __init__( + self, + report_processor: reports.ReportProcessor, + original_devices: Iterable[str], + updated_devices: Iterable[str], + ) -> None: super().__init__(report_processor) - self._devices = devices + self._original_devices = original_devices + self._updated_devices = updated_devices def _get_request_data(self): return None @@ -38,9 +45,13 @@ class Unfence( Request( target, RequestData( - "api/v1/scsi-unfence-node/v1", + "api/v1/scsi-unfence-node/v2", data=json.dumps( - {"devices": self._devices, "node": target.label} + dict( + node=target.label, + original_devices=self._original_devices, + updated_devices=self._updated_devices, + ) ), ), ) @@ -48,7 +59,9 @@ class Unfence( ] def _process_response(self, response): - report_item = response_to_report_item(response) + report_item = response_to_report_item( + response, report_pcsd_too_old_on_404=True + ) if report_item: self._report(report_item) return @@ -57,15 +70,14 @@ class Unfence( result = from_dict( InternalCommunicationResultDto, json.loads(response.data) ) - if result.status != const.COM_STATUS_SUCCESS: - context = reports.ReportItemContext(node_label) - self._report_list( - [ - reports.report_dto_to_item(report, context) - for report in result.report_list - ] - ) - else: + context = reports.ReportItemContext(node_label) + self._report_list( + [ + reports.report_dto_to_item(report, context) + for report in result.report_list + ] + ) + if result.status == const.COM_STATUS_SUCCESS: self._on_success() except (json.JSONDecodeError, DaciteError): diff --git a/pcs_test/tier0/common/reports/test_messages.py b/pcs_test/tier0/common/reports/test_messages.py index b0826cfd..05c3f619 100644 --- a/pcs_test/tier0/common/reports/test_messages.py +++ b/pcs_test/tier0/common/reports/test_messages.py @@ -1904,6 +1904,30 @@ class StonithUnfencingFailed(NameBuildTest): ) +class StonithUnfencingDeviceStatusFailed(NameBuildTest): + def test_build_message(self): + self.assert_message_from_report( + "Unfencing failed, unable to check status of device 'dev1': reason", + reports.StonithUnfencingDeviceStatusFailed("dev1", "reason"), + ) + + +class StonithUnfencingSkippedDevicesFenced(NameBuildTest): + def test_one_device(self): + self.assert_message_from_report( + "Unfencing skipped, device 'dev1' is fenced", + reports.StonithUnfencingSkippedDevicesFenced(["dev1"]), + ) + + def test_multiple_devices(self): + self.assert_message_from_report( + "Unfencing skipped, devices 'dev1', 'dev2', 'dev3' are fenced", + reports.StonithUnfencingSkippedDevicesFenced( + ["dev2", "dev1", "dev3"] + ), + ) + + class StonithRestartlessUpdateUnableToPerform(NameBuildTest): def test_build_message(self): self.assert_message_from_report( diff --git a/pcs_test/tier0/lib/commands/test_scsi.py b/pcs_test/tier0/lib/commands/test_scsi.py index de75743f..8ef9836a 100644 --- a/pcs_test/tier0/lib/commands/test_scsi.py +++ b/pcs_test/tier0/lib/commands/test_scsi.py @@ -10,26 +10,113 @@ from pcs.lib.commands import scsi class TestUnfenceNode(TestCase): def setUp(self): self.env_assist, self.config = get_env_tools(self) + self.old_devices = ["device1", "device3"] + self.new_devices = ["device3", "device0", "device2"] + self.added_devices = set(self.new_devices) - set(self.old_devices) + self.node = "node1" - def test_success(self): - self.config.runner.scsi.unfence_node("node1", ["/dev/sda", "/dev/sdb"]) + def test_success_devices_to_unfence(self): + for old_dev in self.old_devices: + self.config.runner.scsi.get_status( + self.node, old_dev, name=f"runner.scsi.is_fenced.{old_dev}" + ) + self.config.runner.scsi.unfence_node(self.node, self.added_devices) scsi.unfence_node( - self.env_assist.get_env(), "node1", ["/dev/sdb", "/dev/sda"] + self.env_assist.get_env(), + self.node, + self.old_devices, + self.new_devices, ) self.env_assist.assert_reports([]) - def test_failure(self): + def test_success_no_devices_to_unfence(self): + scsi.unfence_node( + self.env_assist.get_env(), + self.node, + {"device1", "device2", "device3"}, + {"device3"}, + ) + self.env_assist.assert_reports([]) + + def test_unfencing_failure(self): + err_msg = "stderr" + for old_dev in self.old_devices: + self.config.runner.scsi.get_status( + self.node, old_dev, name=f"runner.scsi.is_fenced.{old_dev}" + ) self.config.runner.scsi.unfence_node( - "node1", ["/dev/sda", "/dev/sdb"], stderr="stderr", return_code=1 + self.node, self.added_devices, stderr=err_msg, return_code=1 ) self.env_assist.assert_raise_library_error( lambda: scsi.unfence_node( - self.env_assist.get_env(), "node1", ["/dev/sdb", "/dev/sda"] + self.env_assist.get_env(), + self.node, + self.old_devices, + self.new_devices, ), [ fixture.error( - report_codes.STONITH_UNFENCING_FAILED, reason="stderr" + report_codes.STONITH_UNFENCING_FAILED, reason=err_msg ) ], expected_in_processor=False, ) + + def test_device_status_failed(self): + err_msg = "stderr" + new_devices = ["device1", "device2", "device3", "device4"] + old_devices = new_devices[:-1] + ok_devices = new_devices[0:2] + err_device = new_devices[2] + for dev in ok_devices: + self.config.runner.scsi.get_status( + self.node, dev, name=f"runner.scsi.is_fenced.{dev}" + ) + self.config.runner.scsi.get_status( + self.node, + err_device, + name=f"runner.scsi.is_fenced.{err_device}", + stderr=err_msg, + return_code=1, + ) + self.env_assist.assert_raise_library_error( + lambda: scsi.unfence_node( + self.env_assist.get_env(), + self.node, + old_devices, + new_devices, + ), + [ + fixture.error( + report_codes.STONITH_UNFENCING_DEVICE_STATUS_FAILED, + device=err_device, + reason=err_msg, + ) + ], + expected_in_processor=False, + ) + + def test_unfencing_skipped_devices_are_fenced(self): + stdout_off = "Status: OFF" + for old_dev in self.old_devices: + self.config.runner.scsi.get_status( + self.node, + old_dev, + name=f"runner.scsi.is_fenced.{old_dev}", + stdout=stdout_off, + return_code=2, + ) + scsi.unfence_node( + self.env_assist.get_env(), + self.node, + self.old_devices, + self.new_devices, + ) + self.env_assist.assert_reports( + [ + fixture.info( + report_codes.STONITH_UNFENCING_SKIPPED_DEVICES_FENCED, + devices=sorted(self.old_devices), + ) + ] + ) diff --git a/pcs_test/tier0/lib/commands/test_stonith_update_scsi_devices.py b/pcs_test/tier0/lib/commands/test_stonith_update_scsi_devices.py index 6ff6b99a..ed8f5d4f 100644 --- a/pcs_test/tier0/lib/commands/test_stonith_update_scsi_devices.py +++ b/pcs_test/tier0/lib/commands/test_stonith_update_scsi_devices.py @@ -1,3 +1,4 @@ +# pylint: disable=too-many-lines import json from unittest import mock, TestCase @@ -297,7 +298,9 @@ class UpdateScsiDevicesMixin: node_labels=self.existing_nodes ) self.config.http.scsi.unfence_node( - unfence, node_labels=self.existing_nodes + original_devices=devices_before, + updated_devices=devices_updated, + node_labels=self.existing_nodes, ) self.config.env.push_cib( resources=fixture_scsi( @@ -449,14 +452,14 @@ class UpdateScsiDevicesFailuresMixin: node_labels=self.existing_nodes ) self.config.http.scsi.unfence_node( - DEVICES_2, communication_list=[ dict( label=self.existing_nodes[0], raw_data=json.dumps( dict( - devices=[DEV_2], node=self.existing_nodes[0], + original_devices=DEVICES_1, + updated_devices=DEVICES_2, ) ), was_connected=False, @@ -466,8 +469,9 @@ class UpdateScsiDevicesFailuresMixin: label=self.existing_nodes[1], raw_data=json.dumps( dict( - devices=[DEV_2], node=self.existing_nodes[1], + original_devices=DEVICES_1, + updated_devices=DEVICES_2, ) ), output=json.dumps( @@ -491,8 +495,9 @@ class UpdateScsiDevicesFailuresMixin: label=self.existing_nodes[2], raw_data=json.dumps( dict( - devices=[DEV_2], node=self.existing_nodes[2], + original_devices=DEVICES_1, + updated_devices=DEVICES_2, ) ), ), @@ -504,7 +509,7 @@ class UpdateScsiDevicesFailuresMixin: fixture.error( reports.codes.NODE_COMMUNICATION_ERROR_UNABLE_TO_CONNECT, node=self.existing_nodes[0], - command="api/v1/scsi-unfence-node/v1", + command="api/v1/scsi-unfence-node/v2", reason="errA", ), fixture.error( @@ -517,20 +522,76 @@ class UpdateScsiDevicesFailuresMixin: ] ) + def test_unfence_failure_unknown_command(self): + self._unfence_failure_common_calls() + self.config.http.corosync.get_corosync_online_targets( + node_labels=self.existing_nodes + ) + communication_list = [ + dict( + label=node, + raw_data=json.dumps( + dict( + node=node, + original_devices=DEVICES_1, + updated_devices=DEVICES_2, + ) + ), + ) + for node in self.existing_nodes[0:2] + ] + communication_list.append( + dict( + label=self.existing_nodes[2], + response_code=404, + raw_data=json.dumps( + dict( + node=self.existing_nodes[2], + original_devices=DEVICES_1, + updated_devices=DEVICES_2, + ) + ), + output=json.dumps( + dto.to_dict( + communication.dto.InternalCommunicationResultDto( + status=communication.const.COM_STATUS_UNKNOWN_CMD, + status_msg=( + "Unknown command '/api/v1/scsi-unfence-node/v2'" + ), + report_list=[], + data=None, + ) + ) + ), + ), + ) + self.config.http.scsi.unfence_node( + communication_list=communication_list + ) + self.env_assist.assert_raise_library_error(self.command()) + self.env_assist.assert_reports( + [ + fixture.error( + reports.codes.PCSD_VERSION_TOO_OLD, + node=self.existing_nodes[2], + ), + ] + ) + def test_unfence_failure_agent_script_failed(self): self._unfence_failure_common_calls() self.config.http.corosync.get_corosync_online_targets( node_labels=self.existing_nodes ) self.config.http.scsi.unfence_node( - DEVICES_2, communication_list=[ dict( label=self.existing_nodes[0], raw_data=json.dumps( dict( - devices=[DEV_2], node=self.existing_nodes[0], + original_devices=DEVICES_1, + updated_devices=DEVICES_2, ) ), ), @@ -538,8 +599,9 @@ class UpdateScsiDevicesFailuresMixin: label=self.existing_nodes[1], raw_data=json.dumps( dict( - devices=[DEV_2], node=self.existing_nodes[1], + original_devices=DEVICES_1, + updated_devices=DEVICES_2, ) ), output=json.dumps( @@ -563,8 +625,9 @@ class UpdateScsiDevicesFailuresMixin: label=self.existing_nodes[2], raw_data=json.dumps( dict( - devices=[DEV_2], node=self.existing_nodes[2], + original_devices=DEVICES_1, + updated_devices=DEVICES_2, ) ), ), @@ -639,14 +702,14 @@ class UpdateScsiDevicesFailuresMixin: ] ) self.config.http.scsi.unfence_node( - DEVICES_2, communication_list=[ dict( label=self.existing_nodes[0], raw_data=json.dumps( dict( - devices=[DEV_2], node=self.existing_nodes[0], + original_devices=DEVICES_1, + updated_devices=DEVICES_2, ) ), ), diff --git a/pcs_test/tools/command_env/config_http_scsi.py b/pcs_test/tools/command_env/config_http_scsi.py index 0e9f63af..7150eef9 100644 --- a/pcs_test/tools/command_env/config_http_scsi.py +++ b/pcs_test/tools/command_env/config_http_scsi.py @@ -14,7 +14,8 @@ class ScsiShortcuts: def unfence_node( self, - devices, + original_devices=(), + updated_devices=(), node_labels=None, communication_list=None, name="http.scsi.unfence_node", @@ -22,7 +23,8 @@ class ScsiShortcuts: """ Create a calls for node unfencing - list devices -- list of scsi devices + list original_devices -- list of scsi devices before an update + list updated_devices -- list of scsi devices after an update list node_labels -- create success responses from these nodes list communication_list -- use these custom responses string name -- the key of this call @@ -39,7 +41,13 @@ class ScsiShortcuts: communication_list = [ dict( label=node, - raw_data=json.dumps(dict(devices=devices, node=node)), + raw_data=json.dumps( + dict( + node=node, + original_devices=original_devices, + updated_devices=updated_devices, + ) + ), ) for node in node_labels ] @@ -47,7 +55,7 @@ class ScsiShortcuts: self.__calls, name, communication_list, - action="api/v1/scsi-unfence-node/v1", + action="api/v1/scsi-unfence-node/v2", output=json.dumps( to_dict( communication.dto.InternalCommunicationResultDto( diff --git a/pcs_test/tools/command_env/config_runner_scsi.py b/pcs_test/tools/command_env/config_runner_scsi.py index 4b671bb7..3cee13d6 100644 --- a/pcs_test/tools/command_env/config_runner_scsi.py +++ b/pcs_test/tools/command_env/config_runner_scsi.py @@ -35,7 +35,41 @@ class ScsiShortcuts: os.path.join(settings.fence_agent_binaries, "fence_scsi"), "--action=on", "--devices", - ",".join(devices), + ",".join(sorted(devices)), + f"--plug={node}", + ], + stdout=stdout, + stderr=stderr, + returncode=return_code, + ), + ) + + def get_status( + self, + node, + device, + stdout="", + stderr="", + return_code=0, + name="runner.scsi.is_fenced", + ): + """ + Create a call for getting scsi status + + string node -- a node from which is unfencing performed + str device -- a device to check + string stdout -- stdout from fence_scsi agent script + string stderr -- stderr from fence_scsi agent script + int return_code -- return code of the fence_scsi agent script + string name -- the key of this call + """ + self.__calls.place( + name, + RunnerCall( + [ + os.path.join(settings.fence_agent_binaries, "fence_scsi"), + "--action=status", + f"--devices={device}", f"--plug={node}", ], stdout=stdout, diff --git a/pcsd/api_v1.rb b/pcsd/api_v1.rb index 7edeeabf..e55c2be7 100644 --- a/pcsd/api_v1.rb +++ b/pcsd/api_v1.rb @@ -291,7 +291,7 @@ def route_api_v1(auth_user, params, request) :only_superuser => false, :permissions => Permissions::WRITE, }, - 'scsi-unfence-node/v1' => { + 'scsi-unfence-node/v2' => { :cmd => 'scsi.unfence_node', :only_superuser => false, :permissions => Permissions::WRITE, diff --git a/pcsd/capabilities.xml b/pcsd/capabilities.xml index 58ebcf0f..3954aa5d 100644 --- a/pcsd/capabilities.xml +++ b/pcsd/capabilities.xml @@ -1892,11 +1892,13 @@ pcs commands: stonith update-scsi-devices - + - Unfence scsi devices on a cluster node. + Unfence scsi devices on a cluster node. In comparison with v1, only + newly added devices are unfenced. In case any existing device is + fenced, unfencing will be skipped. - daemon urls: /api/v1/scsi-unfence-node/v1 + daemon urls: /api/v1/scsi-unfence-node/v2 -- 2.31.1