diff --git a/SOURCES/bz2042070-1-gcp-pd-move-gcp-vpc-move-route-dont-fail-1st-try.patch b/SOURCES/bz2042070-1-gcp-pd-move-gcp-vpc-move-route-dont-fail-1st-try.patch new file mode 100644 index 0000000..00a04b4 --- /dev/null +++ b/SOURCES/bz2042070-1-gcp-pd-move-gcp-vpc-move-route-dont-fail-1st-try.patch @@ -0,0 +1,64 @@ +From fcd2565602146c0b9317d159cecb8935e304c7ce Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 30 Sep 2021 10:23:17 +0200 +Subject: [PATCH] gcp-pd-move/gcp-vpc-move-route: dont fail failed resources + instantly (caused by OCF_ERR_CONFIGURED) + +--- + heartbeat/gcp-pd-move.in | 4 ++-- + heartbeat/gcp-vpc-move-route.in | 6 +++--- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/heartbeat/gcp-pd-move.in b/heartbeat/gcp-pd-move.in +index e99cc71f88..cbe703c3c5 100644 +--- a/heartbeat/gcp-pd-move.in ++++ b/heartbeat/gcp-pd-move.in +@@ -157,7 +157,7 @@ def populate_vars(): + CONN = googleapiclient.discovery.build('compute', 'v1') + except Exception as e: + logger.error('Couldn\'t connect with google api: ' + str(e)) +- sys.exit(ocf.OCF_ERR_CONFIGURED) ++ sys.exit(ocf.OCF_ERR_GENERIC) + + for param in PARAMETERS: + value = os.environ.get('OCF_RESKEY_%s' % param, PARAMETERS[param]) +@@ -172,7 +172,7 @@ def populate_vars(): + except Exception as e: + logger.error( + 'Couldn\'t get instance name, is this running inside GCE?: ' + str(e)) +- sys.exit(ocf.OCF_ERR_CONFIGURED) ++ sys.exit(ocf.OCF_ERR_GENERIC) + + PROJECT = get_metadata('project/project-id') + if PARAMETERS['disk_scope'] in ['detect', 'regional']: +diff --git a/heartbeat/gcp-vpc-move-route.in b/heartbeat/gcp-vpc-move-route.in +index dac6e4ea8c..6b240c04d0 100644 +--- a/heartbeat/gcp-vpc-move-route.in ++++ b/heartbeat/gcp-vpc-move-route.in +@@ -243,7 +243,7 @@ def validate(ctx): + ctx.conn = googleapiclient.discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False) + except Exception as e: + logger.error('Couldn\'t connect with google api: ' + 
str(e)) +- sys.exit(OCF_ERR_CONFIGURED) ++ sys.exit(OCF_ERR_GENERIC) + + ctx.ip = os.environ.get('OCF_RESKEY_ip') + if not ctx.ip: +@@ -258,7 +258,7 @@ def validate(ctx): + except Exception as e: + logger.error( + 'Instance information not found. Is this a GCE instance ?: %s', str(e)) +- sys.exit(OCF_ERR_CONFIGURED) ++ sys.exit(OCF_ERR_GENERIC) + + ctx.instance_url = '%s/projects/%s/zones/%s/instances/%s' % ( + GCP_API_URL_PREFIX, ctx.project, ctx.zone, ctx.instance) +@@ -273,7 +273,7 @@ def validate(ctx): + idxs = ctx.iproute.link_lookup(ifname=ctx.interface) + if not idxs: + logger.error('Network interface not found') +- sys.exit(OCF_ERR_CONFIGURED) ++ sys.exit(OCF_ERR_GENERIC) + ctx.iface_idx = idxs[0] + + diff --git a/SOURCES/bz2042070-2-gcp-vpc-move-vip-add-retries.patch b/SOURCES/bz2042070-2-gcp-vpc-move-vip-add-retries.patch new file mode 100644 index 0000000..2350f1a --- /dev/null +++ b/SOURCES/bz2042070-2-gcp-vpc-move-vip-add-retries.patch @@ -0,0 +1,102 @@ +From 3ae6d8f0a34d099945d9bf005ed45dbfe9452202 Mon Sep 17 00:00:00 2001 +From: kj1724 <78624900+kj1724@users.noreply.github.com> +Date: Wed, 28 Apr 2021 10:22:38 -0400 +Subject: [PATCH] gcp-vpc-move-vip.in: Adds retries + +If the cluster fails a monitoring event, it will try to restart the resource. If the resource agent makes an API/metadata call that fails at that time, the resource will be considered "failed", but in certain cases also "unconfigured", which prevents further operations. + +These changes can help the agent recover on certain intermittent failures. 
+--- + heartbeat/gcp-vpc-move-vip.in | 62 ++++++++++++++++++++--------------- + 1 file changed, 35 insertions(+), 27 deletions(-) + +diff --git a/heartbeat/gcp-vpc-move-vip.in b/heartbeat/gcp-vpc-move-vip.in +index bbbd87b7a9..c411555110 100755 +--- a/heartbeat/gcp-vpc-move-vip.in ++++ b/heartbeat/gcp-vpc-move-vip.in +@@ -50,6 +50,8 @@ REMOVE = 1 + CONN = None + THIS_VM = None + ALIAS = None ++MAX_RETRIES = 3 ++RETRY_BACKOFF_SECS = 1 + METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/' + METADATA_HEADERS = {'Metadata-Flavor': 'Google'} + METADATA = \ +@@ -111,18 +113,37 @@ def get_metadata(metadata_key, params=None, timeout=None): + + Returns: + HTTP response from the GET request. +- +- Raises: +- urlerror.HTTPError: raises when the GET request fails. + """ +- timeout = timeout or 60 +- metadata_url = os.path.join(METADATA_SERVER, metadata_key) +- params = urlparse.urlencode(params or {}) +- url = '%s?%s' % (metadata_url, params) +- request = urlrequest.Request(url, headers=METADATA_HEADERS) +- request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) +- return request_opener.open( +- request, timeout=timeout * 1.1).read().decode("utf-8") ++ for i in range(MAX_RETRIES): ++ try: ++ timeout = timeout or 60 ++ metadata_url = os.path.join(METADATA_SERVER, metadata_key) ++ params = urlparse.urlencode(params or {}) ++ url = '%s?%s' % (metadata_url, params) ++ request = urlrequest.Request(url, headers=METADATA_HEADERS) ++ request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) ++ return request_opener.open( ++ request, timeout=timeout * 1.1).read().decode("utf-8") ++ except Exception as e: ++ logger.error('Couldn\'t get instance name, is this running inside GCE?: ' ++ + str(e)) ++ time.sleep(RETRY_BACKOFF_SECS * (i + 1)) ++ ++ # If the retries are exhausted we exit with a generic error. 
++ sys.exit(OCF_ERR_GENERIC) ++ ++ ++def create_api_connection(): ++ for i in range(MAX_RETRIES): ++ try: ++ return googleapiclient.discovery.build('compute', 'v1', ++ cache_discovery=False) ++ except Exception as e: ++ logger.error('Couldn\'t connect with google api: ' + str(e)) ++ time.sleep(RETRY_BACKOFF_SECS * (i + 1)) ++ ++ # If the retries are exhausted we exit with a generic error. ++ sys.exit(OCF_ERR_GENERIC) + + + def get_instance(project, zone, instance): +@@ -358,24 +379,11 @@ def gcp_alias_status(alias): + + def validate(): + global ALIAS +- global CONN + global THIS_VM ++ global CONN + +- # Populate global vars +- try: +- CONN = googleapiclient.discovery.build('compute', 'v1', +- cache_discovery=False) +- except Exception as e: +- logger.error('Couldn\'t connect with google api: ' + str(e)) +- sys.exit(OCF_ERR_CONFIGURED) +- +- try: +- THIS_VM = get_metadata('instance/name') +- except Exception as e: +- logger.error('Couldn\'t get instance name, is this running inside GCE?: ' +- + str(e)) +- sys.exit(OCF_ERR_CONFIGURED) +- ++ CONN = create_api_connection() ++ THIS_VM = get_metadata('instance/name') + ALIAS = os.environ.get('OCF_RESKEY_alias_ip') + if not ALIAS: + logger.error('Missing alias_ip parameter') diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index 9660896..09165bb 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -99,7 +99,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.1.1 -Release: 61%{?dist}.13 +Release: 61%{?dist}.15 License: GPLv2+ and LGPLv2+ and ASL 2.0 URL: https://github.com/ClusterLabs/resource-agents %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} @@ -253,6 +253,8 @@ Patch127: bz1913936-3-gcp-vpc-move-route-make-vpc_network-optional.patch Patch128: bz1937151-azure-lb-redirect-to-avoid-nc-dying-EPIPE-error.patch Patch129: bz1939282-aws-vpc-move-ip-add-ENI-lookup.patch Patch130: 
bz1935798-gcp-pd-move-fix-partially-matched-disk_name.patch +Patch131: bz2042070-1-gcp-pd-move-gcp-vpc-move-route-dont-fail-1st-try.patch +Patch132: bz2042070-2-gcp-vpc-move-vip-add-retries.patch # SAPHana* Patch500: bz1855888-SAPHana-use-actual-mode.patch @@ -450,7 +452,7 @@ SAP instances to be managed in a cluster environment. License: GPLv2+ Summary: SAP HANA Scale-Out cluster resource agents Version: 0.164.0 -Release: 6%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.13 +Release: 6%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.15 %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} Group: System Environment/Base %else @@ -470,7 +472,7 @@ environment. License: GPLv2+ Summary: SAP cluster connector script Version: 3.0.1 -Release: 37%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.13 +Release: 37%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.15 %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} Group: System Environment/Base %else @@ -625,6 +627,8 @@ exit 1 %patch128 -p1 %patch129 -p1 -F2 %patch130 -p1 +%patch131 -p1 -F1 +%patch132 -p1 # add SAPHana agents to Makefile.am mv %{saphana_prefix}-%{saphana_version}/ra/SAPHana* heartbeat @@ -1286,6 +1290,12 @@ ccs_update_schema > /dev/null 2>&1 ||: %endif %changelog +* Tue Jan 25 2022 Oyvind Albrigtsen - 4.1.1-61.15 +- gcp-pd-move/gcp-vpc-move-route/gcp-vpc-move-vip: don't fail with + configuration rc when it might be a network hiccup + + Resolves: rhbz#2042070 + * Thu Jul 22 2021 Oyvind Albrigtsen - 4.1.1-61.13 - SAPHana: use actual_mode from global.ini and fallback to mode when it's not set