Blame SOURCES/bz1957765-gcp-vpc-move-vip-retry.patch

4dce39
From 3ae6d8f0a34d099945d9bf005ed45dbfe9452202 Mon Sep 17 00:00:00 2001
4dce39
From: kj1724 <78624900+kj1724@users.noreply.github.com>
4dce39
Date: Wed, 28 Apr 2021 10:22:38 -0400
4dce39
Subject: [PATCH] gcp-vpc-move-vip.in: Adds retries
4dce39
4dce39
If the cluster fails a monitoring event, it will try to restart the resource. If the resource agent makes an API/metadata call that fails at that time, the resource will be considered "failed", but in certain cases also "unconfigured", which prevents further operations.
4dce39
4dce39
These changes can help the agent recover from certain intermittent failures.
4dce39
---
4dce39
 heartbeat/gcp-vpc-move-vip.in | 62 ++++++++++++++++++++---------------
4dce39
 1 file changed, 35 insertions(+), 27 deletions(-)
4dce39
4dce39
diff --git a/heartbeat/gcp-vpc-move-vip.in b/heartbeat/gcp-vpc-move-vip.in
4dce39
index bbbd87b7a9..c411555110 100755
4dce39
--- a/heartbeat/gcp-vpc-move-vip.in
4dce39
+++ b/heartbeat/gcp-vpc-move-vip.in
4dce39
@@ -50,6 +50,8 @@ REMOVE = 1
4dce39
 CONN = None
4dce39
 THIS_VM = None
4dce39
 ALIAS = None
4dce39
+MAX_RETRIES = 3
4dce39
+RETRY_BACKOFF_SECS = 1
4dce39
 METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/'
4dce39
 METADATA_HEADERS = {'Metadata-Flavor': 'Google'}
4dce39
 METADATA = \
4dce39
@@ -111,18 +113,37 @@ def get_metadata(metadata_key, params=None, timeout=None):
4dce39
 
4dce39
   Returns:
4dce39
     HTTP response from the GET request.
4dce39
-
4dce39
-  Raises:
4dce39
-    urlerror.HTTPError: raises when the GET request fails.
4dce39
   """
4dce39
-  timeout = timeout or 60
4dce39
-  metadata_url = os.path.join(METADATA_SERVER, metadata_key)
4dce39
-  params = urlparse.urlencode(params or {})
4dce39
-  url = '%s?%s' % (metadata_url, params)
4dce39
-  request = urlrequest.Request(url, headers=METADATA_HEADERS)
4dce39
-  request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({}))
4dce39
-  return request_opener.open(
4dce39
-      request, timeout=timeout * 1.1).read().decode("utf-8")
4dce39
+  for i in range(MAX_RETRIES):
4dce39
+    try:
4dce39
+      timeout = timeout or 60
4dce39
+      metadata_url = os.path.join(METADATA_SERVER, metadata_key)
4dce39
+      params = urlparse.urlencode(params or {})
4dce39
+      url = '%s?%s' % (metadata_url, params)
4dce39
+      request = urlrequest.Request(url, headers=METADATA_HEADERS)
4dce39
+      request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({}))
4dce39
+      return request_opener.open(
4dce39
+          request, timeout=timeout * 1.1).read().decode("utf-8")
4dce39
+    except Exception as e:
4dce39
+      logger.error('Couldn\'t get instance name, is this running inside GCE?: '
4dce39
+                   + str(e))
4dce39
+      time.sleep(RETRY_BACKOFF_SECS * (i + 1))
4dce39
+
4dce39
+  # If the retries are exhausted we exit with a generic error.
4dce39
+  sys.exit(OCF_ERR_GENERIC)
4dce39
+
4dce39
+
4dce39
+def create_api_connection():
4dce39
+  for i in range(MAX_RETRIES):
4dce39
+    try:
4dce39
+      return googleapiclient.discovery.build('compute', 'v1',
4dce39
+                                             cache_discovery=False)
4dce39
+    except Exception as e:
4dce39
+      logger.error('Couldn\'t connect with google api: ' + str(e))
4dce39
+      time.sleep(RETRY_BACKOFF_SECS * (i + 1))
4dce39
+
4dce39
+  # If the retries are exhausted we exit with a generic error.
4dce39
+  sys.exit(OCF_ERR_GENERIC)
4dce39
 
4dce39
 
4dce39
 def get_instance(project, zone, instance):
4dce39
@@ -358,24 +379,11 @@ def gcp_alias_status(alias):
4dce39
 
4dce39
 def validate():
4dce39
   global ALIAS
4dce39
-  global CONN
4dce39
   global THIS_VM
4dce39
+  global CONN
4dce39
 
4dce39
-  # Populate global vars
4dce39
-  try:
4dce39
-    CONN = googleapiclient.discovery.build('compute', 'v1',
4dce39
-                                           cache_discovery=False)
4dce39
-  except Exception as e:
4dce39
-    logger.error('Couldn\'t connect with google api: ' + str(e))
4dce39
-    sys.exit(OCF_ERR_CONFIGURED)
4dce39
-
4dce39
-  try:
4dce39
-    THIS_VM = get_metadata('instance/name')
4dce39
-  except Exception as e:
4dce39
-    logger.error('Couldn\'t get instance name, is this running inside GCE?: '
4dce39
-                 + str(e))
4dce39
-    sys.exit(OCF_ERR_CONFIGURED)
4dce39
-
4dce39
+  CONN = create_api_connection()
4dce39
+  THIS_VM = get_metadata('instance/name')
4dce39
   ALIAS = os.environ.get('OCF_RESKEY_alias_ip')
4dce39
   if not ALIAS:
4dce39
     logger.error('Missing alias_ip parameter')