2737e7
From 83445df2894426cdaf2f09f8bf2ac4c829922620 Mon Sep 17 00:00:00 2001
2737e7
From: Christian Heimes <cheimes@redhat.com>
2737e7
Date: Sun, 8 Jul 2018 11:53:58 +0200
2737e7
Subject: [PATCH] Auto-retry failed certmonger requests
2737e7
2737e7
During parallel replica installation, a request sometimes fails with
2737e7
CA_REJECTED or CA_UNREACHABLE. The error occurs when the master is
2737e7
either busy or some information hasn't been replicated yet. Even
2737e7
a stuck request can be recovered, e.g. when permission and group
2737e7
information have been replicated.
2737e7
2737e7
request_and_wait_for_cert() gains a resubmit_timeout and resubmits failing
2737e7
requests until it times out.
2737e7
2737e7
Fixes: https://pagure.io/freeipa/issue/7623
2737e7
Signed-off-by: Christian Heimes <cheimes@redhat.com>
2737e7
Reviewed-By: Tibor Dudlak <tdudlak@redhat.com>
2737e7
---
2737e7
 ipalib/install/certmonger.py      | 64 ++++++++++++++++++++++++-------
2737e7
 ipaserver/install/cainstance.py   |  4 +-
2737e7
 ipaserver/install/certs.py        | 19 ++++++---
2737e7
 ipaserver/install/dsinstance.py   |  4 +-
2737e7
 ipaserver/install/httpinstance.py |  5 ++-
2737e7
 ipaserver/install/krbinstance.py  |  4 +-
2737e7
 6 files changed, 76 insertions(+), 24 deletions(-)
2737e7
2737e7
diff --git a/ipalib/install/certmonger.py b/ipalib/install/certmonger.py
2737e7
index d2b782ddb0c746a3dfd96d0222bb31c6a960fdff..d915cbf7638c68f3ad5170b2878a706ad39def62 100644
2737e7
--- a/ipalib/install/certmonger.py
2737e7
+++ b/ipalib/install/certmonger.py
2737e7
@@ -302,20 +302,56 @@ def add_subject(request_id, subject):
2737e7
 def request_and_wait_for_cert(
2737e7
         certpath, subject, principal, nickname=None, passwd_fname=None,
2737e7
         dns=None, ca='IPA', profile=None,
2737e7
-        pre_command=None, post_command=None, storage='NSSDB', perms=None):
2737e7
-    """
2737e7
-    Execute certmonger to request a server certificate.
2737e7
-
2737e7
-    The method also waits for the certificate to be available.
2737e7
-    """
2737e7
-    reqId = request_cert(certpath, subject, principal, nickname,
2737e7
-                         passwd_fname, dns, ca, profile,
2737e7
-                         pre_command, post_command, storage, perms)
2737e7
-    state = wait_for_request(reqId, api.env.startup_timeout)
2737e7
-    ca_error = get_request_value(reqId, 'ca-error')
2737e7
-    if state != 'MONITORING' or ca_error:
2737e7
-        raise RuntimeError("Certificate issuance failed ({})".format(state))
2737e7
-    return reqId
2737e7
+        pre_command=None, post_command=None, storage='NSSDB', perms=None,
2737e7
+        resubmit_timeout=0):
2737e7
+    """Request certificate, wait and possibly resubmit failing requests
2737e7
+
2737e7
+    Submit a cert request to certmonger and wait until the request has
2737e7
+    finished.
2737e7
+
2737e7
+    With timeout, a failed request is resubmitted. During parallel replica
2737e7
+    installation, a request sometimes fails with CA_REJECTED or
2737e7
+    CA_UNREACHABLE. The error occurs when the master is either busy or some
2737e7
+    information haven't been replicated yet. Even a stuck request can be
2737e7
+    recovered, e.g. when permission and group information have been
2737e7
+    replicated.
2737e7
+    """
2737e7
+    req_id = request_cert(
2737e7
+        certpath, subject, principal, nickname, passwd_fname, dns, ca,
2737e7
+        profile, pre_command, post_command, storage, perms
2737e7
+    )
2737e7
+
2737e7
+    deadline = time.time() + resubmit_timeout
2737e7
+    while True:  # until success, timeout, or error
2737e7
+        state = wait_for_request(req_id, api.env.replication_wait_timeout)
2737e7
+        ca_error = get_request_value(req_id, 'ca-error')
2737e7
+        if state == 'MONITORING' and ca_error is None:
2737e7
+            # we got a winner, exiting
2737e7
+            root_logger.debug("Cert request %s was successful", req_id)
2737e7
+            return req_id
2737e7
+
2737e7
+        root_logger.debug(
2737e7
+            "Cert request %s failed: %s (%s)", req_id, state, ca_error
2737e7
+        )
2737e7
+        if state not in {'CA_REJECTED', 'CA_UNREACHABLE'}:
2737e7
+            # probably unrecoverable error
2737e7
+            root_logger.debug("Giving up on cert request %s", req_id)
2737e7
+            break
2737e7
+        elif not resubmit_timeout:
2737e7
+            # no resubmit
2737e7
+            break
2737e7
+        elif time.time() > deadline:
2737e7
+            root_logger.debug("Request %s reached resubmit dead line", req_id)
2737e7
+            break
2737e7
+        else:
2737e7
+            # sleep and resubmit
2737e7
+            root_logger.debug("Sleep and resubmit cert request %s", req_id)
2737e7
+            time.sleep(10)
2737e7
+            resubmit_request(req_id)
2737e7
+
2737e7
+    raise RuntimeError(
2737e7
+        "Certificate issuance failed ({}: {})".format(state, ca_error)
2737e7
+    )
2737e7
 
2737e7
 
2737e7
 def request_cert(
2737e7
diff --git a/ipaserver/install/cainstance.py b/ipaserver/install/cainstance.py
2737e7
index 62e9ad7de6f00eabb48f726a3931eb8acf0ba22b..e207911814e3553c5aa5310694170d3575337c55 100644
2737e7
--- a/ipaserver/install/cainstance.py
2737e7
+++ b/ipaserver/install/cainstance.py
2737e7
@@ -863,7 +863,9 @@ class CAInstance(DogtagInstance):
2737e7
                 profile='caServerCert',
2737e7
                 pre_command='renew_ra_cert_pre',
2737e7
                 post_command='renew_ra_cert',
2737e7
-                storage="FILE")
2737e7
+                storage="FILE",
2737e7
+                resubmit_timeout=api.env.replication_wait_timeout
2737e7
+            )
2737e7
             self.__set_ra_cert_perms()
2737e7
 
2737e7
             self.requestId = str(reqId)
2737e7
diff --git a/ipaserver/install/certs.py b/ipaserver/install/certs.py
2737e7
index de96318db51b03f2515814d574cfebf1b242b6a6..5670d468bb1b168af7ada7b2d8924b8ec9f5d9c1 100644
2737e7
--- a/ipaserver/install/certs.py
2737e7
+++ b/ipaserver/install/certs.py
2737e7
@@ -663,12 +663,19 @@ class CertDB(object):
2737e7
     def export_pem_cert(self, nickname, location):
2737e7
         return self.nssdb.export_pem_cert(nickname, location)
2737e7
 
2737e7
-    def request_service_cert(self, nickname, principal, host):
2737e7
-        certmonger.request_and_wait_for_cert(certpath=self.secdir,
2737e7
-                                             nickname=nickname,
2737e7
-                                             principal=principal,
2737e7
-                                             subject=host,
2737e7
-                                             passwd_fname=self.passwd_fname)
2737e7
+    def request_service_cert(self, nickname, principal, host,
2737e7
+                             resubmit_timeout=None):
2737e7
+        if resubmit_timeout is None:
2737e7
+            resubmit_timeout = api.env.replication_wait_timeout
2737e7
+        return certmonger.request_and_wait_for_cert(
2737e7
+            certpath=self.secdir,
2737e7
+            storage='NSSDB',
2737e7
+            nickname=nickname,
2737e7
+            principal=principal,
2737e7
+            subject=host,
2737e7
+            passwd_fname=self.passwd_fname,
2737e7
+            resubmit_timeout=resubmit_timeout
2737e7
+        )
2737e7
 
2737e7
     def is_ipa_issued_cert(self, api, nickname):
2737e7
         """
2737e7
diff --git a/ipaserver/install/dsinstance.py b/ipaserver/install/dsinstance.py
2737e7
index 7adaabd3c1280709150329003130f70233de37f4..8ec5cdd7a4a324a1a40dbe968defcc797db8f054 100644
2737e7
--- a/ipaserver/install/dsinstance.py
2737e7
+++ b/ipaserver/install/dsinstance.py
2737e7
@@ -847,7 +847,9 @@ class DsInstance(service.Service):
2737e7
                     ca='IPA',
2737e7
                     profile=dogtag.DEFAULT_PROFILE,
2737e7
                     dns=[self.fqdn],
2737e7
-                    post_command=cmd)
2737e7
+                    post_command=cmd,
2737e7
+                    resubmit_timeout=api.env.replication_wait_timeout
2737e7
+                )
2737e7
             finally:
2737e7
                 if prev_helper is not None:
2737e7
                     certmonger.modify_ca_helper('IPA', prev_helper)
2737e7
diff --git a/ipaserver/install/httpinstance.py b/ipaserver/install/httpinstance.py
2737e7
index 2df51eaa77d3ee3246027a6bcbc4023dbad61160..2366698bb0d6a7c9cd481ba9d5568d320742f6bb 100644
2737e7
--- a/ipaserver/install/httpinstance.py
2737e7
+++ b/ipaserver/install/httpinstance.py
2737e7
@@ -450,7 +450,10 @@ class HTTPInstance(service.Service):
2737e7
                     ca='IPA',
2737e7
                     profile=dogtag.DEFAULT_PROFILE,
2737e7
                     dns=[self.fqdn],
2737e7
-                    post_command='restart_httpd')
2737e7
+                    post_command='restart_httpd',
2737e7
+                    storage='NSSDB',
2737e7
+                    resubmit_timeout=api.env.replication_wait_timeout
2737e7
+                )
2737e7
             finally:
2737e7
                 if prev_helper is not None:
2737e7
                     certmonger.modify_ca_helper('IPA', prev_helper)
2737e7
diff --git a/ipaserver/install/krbinstance.py b/ipaserver/install/krbinstance.py
2737e7
index 4041d1b5fb3c3cf3db78b6cb282ce5f17793a0e3..62de118b7104fe7d72d2c1fd5577e9a76000c663 100644
2737e7
--- a/ipaserver/install/krbinstance.py
2737e7
+++ b/ipaserver/install/krbinstance.py
2737e7
@@ -436,7 +436,9 @@ class KrbInstance(service.Service):
2737e7
                 storage='FILE',
2737e7
                 profile=KDC_PROFILE,
2737e7
                 post_command='renew_kdc_cert',
2737e7
-                perms=(0o644, 0o600))
2737e7
+                perms=(0o644, 0o600),
2737e7
+                resubmit_timeout=api.env.replication_wait_timeout
2737e7
+            )
2737e7
         except dbus.DBusException as e:
2737e7
             # if the certificate is already tracked, ignore the error
2737e7
             name = e.get_dbus_name()
2737e7
-- 
2737e7
2.17.1
2737e7