Blob Blame History Raw
From d24a4a587dbd9e67d4401d3032008cdfe015f69b Mon Sep 17 00:00:00 2001
From: Fraser Tweedale <ftweedal@redhat.com>
Date: Mon, 12 Nov 2018 11:47:48 +1100
Subject: [PATCH 1/2] LDAPProfileSubsystem: add watchdog timer for initial load

During initial profile loading, if we receive fewer entries than
indicated by the parent entry's numSubordinates attribute, the
AsyncLoader will not unlock, and the Dogtag startup thread is
blocked.  This situation can arise when there are entries that are
contributing to the numSubordinates count, which are not visible to
Dogtag.  Replication conflicts are one such example.

The implementation currently uses a persistent search that also
returns existing entries.  The alternative approach - a regular
search followed by a persistent search - leaves open the possibility
of missing replicated changes to the subtree that were processed in
between the regular and persistent search.  Therefore we use a
single search, which avoids this possibility.

We also *do* want to block startup until all profiles are loaded.
The system reporting ready before profiles are loaded has led to
issues in CI and production environments.  During a persistent
search, there is no in-band signal that indicates when all the
"immediate" results have been delivered.  The solution was to read
the numSubordinates value of the container to know how many
immediate results to process.  So we have to address the corner
cases discussed above.

The approach to resolving this is to use a watchdog timer during
initial load of profiles.  The AsyncLoader is now initialised with a
timeout value (in seconds).  A timer is started and the lock is
forcibly released after the timeout.  A value <= 0 suppresses the
watchdog.  Update the LDAPProfileSubsystem to time out the loader
after 10 seconds.  The existing behaviour of unlocking when the
expected number of entries have been processed is maintained.

Also add a log message when the start await gets interrupted, to
indicate that there may be replication conflicts or other extraneous
data in the profile configuration subtree.

Fixes: https://pagure.io/dogtagpki/issue/3078
(cherry picked from commit 758d2a7e551e532f464419d68306cf13e096fe85)
---
 .../src/com/netscape/certsrv/util/AsyncLoader.java | 105 +++++++++++++++++++--
 .../cmscore/profile/LDAPProfileSubsystem.java      |  65 +++++++++----
 2 files changed, 145 insertions(+), 25 deletions(-)

diff --git a/base/common/src/com/netscape/certsrv/util/AsyncLoader.java b/base/common/src/com/netscape/certsrv/util/AsyncLoader.java
index 39f8efd..6d10531 100644
--- a/base/common/src/com/netscape/certsrv/util/AsyncLoader.java
+++ b/base/common/src/com/netscape/certsrv/util/AsyncLoader.java
@@ -18,6 +18,8 @@
 
 package com.netscape.certsrv.util;
 
+import java.util.Timer;
+import java.util.TimerTask;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.locks.ReentrantLock;
 
@@ -33,28 +35,54 @@ import java.util.concurrent.locks.ReentrantLock;
  */
 public class AsyncLoader {
     private CountDownLatch producerInitialised = new CountDownLatch(1);
-    private ReentrantLock loadingLock = new ReentrantLock();
+    private GoAwayLock loadingLock = new GoAwayLock();
     private Integer numItems = null;
     private int numItemsLoaded = 0;
+    private boolean loading = true;
+    private int timeoutSeconds = 0;
+    private Timer timer = new Timer("AsyncLoader watchdog");
+    private TimerTask watchdog = null;
+
+    /** Create an AsyncLoader with the specified timeout.
+     *
+     * If timeoutSeconds > 0, startLoading() will start a timer
+     * that will forcibly unlock the loader after the specified
+     * timeout.
+     */
+    public AsyncLoader(int timeoutSeconds) {
+        this.timeoutSeconds = timeoutSeconds;
+    }
 
     /**
-     * Acquire the lock as a producer.
+     * Acquire the lock as a producer and reset
+     * progress-tracking variables.
      */
     public void startLoading() {
+        loadingLock.lock();
+        loading = true;
         numItems = null;
         numItemsLoaded = 0;
-        loadingLock.lock();
         producerInitialised.countDown();
+        if (timeoutSeconds > 0) {
+            if (watchdog != null)
+                watchdog.cancel();
+            watchdog = new AsyncLoaderWatchdog();
+            timer.schedule(watchdog, timeoutSeconds * 1000);
+        }
     }
 
     /**
      * Increment the number of items loaded by 1.  If the number
      * of items is known and that many items have been loaded,
      * unlock the loader.
+     *
+     * If the loader is not currently loading, does nothing.
      */
     public void increment() {
-        numItemsLoaded += 1;
-        checkLoadDone();
+        if (loading) {
+            numItemsLoaded += 1;
+            checkLoadDone();
+        }
     }
 
     /**
@@ -69,18 +97,77 @@ public class AsyncLoader {
 
     private void checkLoadDone() {
         if (numItems != null && numItemsLoaded >= numItems) {
+            watchdog.cancel();
+            loading = false;
             while (loadingLock.isHeldByCurrentThread())
                 loadingLock.unlock();
         }
     }
 
+    /**
+     * Wait upon the consumer to finish loading items.
+     *
+     * @throws InterruptedException if the thread is interrupted
+     * while waiting for the loading lock.  This can happen due
+     * to timeout.
+     */
     public void awaitLoadDone() throws InterruptedException {
         /* A consumer may await upon the Loader immediately after
          * starting the producer.  To ensure that the producer
-         * has time to acquire the lock, we use a CountDownLatch.
+         * has time to acquire the lock, we use a CountDownLatch
+         * that only the producer can release (in 'startLoading').
          */
-        producerInitialised.await();
-        loadingLock.lock();
-        loadingLock.unlock();
+        if (loading) {
+            producerInitialised.await();
+            loadingLock.lockInterruptibly();
+            loadingLock.unlock();
+        }
+    }
+
+    /** Forcibly unlock this AsyncLoader.
+     *
+     * There's no way we can safely interrupt the producer to
+     * release the loadingLock.  So here's what we do.
+     *
+     * - Interrupt all threads that are waiting on the lock.
+     * - Set loading = false so that future call to awaitLoadDone()
+     *   return immediately.
+     *
+     * Upon subseqent re-loads (e.g. due to loss and reesablishment
+     * of LDAP persistent search), the producer thread will call
+     * startLoading() again, which will increment the producer's
+     * hold count.  That's OK because when the unlock condition is
+     * met, checkLoadDone() will call loadingLock.unlock() as many
+     * times as needed to effect the unlock.
+     *
+     * This method DOES NOT interrupt threads waiting on the
+     * producerInitialised CountDownLatch.  The producer MUST call
+     * startLoading() which will acquire the loading lock then
+     * release the CountDownLatch.
+     */
+    private void forceUnlock() {
+        loading = false;
+        loadingLock.interruptWaitingThreads();
+    }
+
+    /** Subclass of ReentrantLock that can tell waiting threads
+     * to go away (by interrupting them).  Awaiters must use
+     * lockInterruptibly() to acquire the lock.
+     *
+     * This needed to be a subclass of ReentrantLock because
+     * ReentrantLock.getQueuedThreads() has visibility 'protected'.
+     */
+    private static class GoAwayLock extends ReentrantLock {
+        public void interruptWaitingThreads() {
+            for (Thread thread : getQueuedThreads()) {
+                thread.interrupt();
+            }
+        }
+    }
+
+    private class AsyncLoaderWatchdog extends TimerTask {
+        public void run() {
+            forceUnlock();
+        }
     }
 }
diff --git a/base/server/cmscore/src/com/netscape/cmscore/profile/LDAPProfileSubsystem.java b/base/server/cmscore/src/com/netscape/cmscore/profile/LDAPProfileSubsystem.java
index bce675e..7a9d0ba 100644
--- a/base/server/cmscore/src/com/netscape/cmscore/profile/LDAPProfileSubsystem.java
+++ b/base/server/cmscore/src/com/netscape/cmscore/profile/LDAPProfileSubsystem.java
@@ -58,7 +58,9 @@ public class LDAPProfileSubsystem
         extends AbstractProfileSubsystem
         implements IProfileSubsystem, Runnable {
 
-    private String dn;
+    private String profileContainerDNString;
+    private DN profileContainerDN;
+
     private ILdapConnFactory dbFactory;
 
     private boolean stopped = false;
@@ -73,7 +75,7 @@ public class LDAPProfileSubsystem
     /* Set of nsUniqueIds of deleted entries */
     private TreeSet<String> deletedNsUniqueIds;
 
-    private AsyncLoader loader = new AsyncLoader();
+    private AsyncLoader loader = new AsyncLoader(10 /*10s timeout*/);
 
     /**
      * Initializes this subsystem with the given configuration
@@ -112,7 +114,8 @@ public class LDAPProfileSubsystem
 
         // read profile id, implementation, and its configuration files
         String basedn = cs.getString("internaldb.basedn");
-        dn = "ou=certificateProfiles,ou=ca," + basedn;
+        profileContainerDNString = "ou=certificateProfiles,ou=ca," + basedn;
+        profileContainerDN = new DN(profileContainerDNString);
 
         monitor = new Thread(this, "profileChangeMonitor");
         monitor.start();
@@ -121,6 +124,8 @@ public class LDAPProfileSubsystem
         } catch (InterruptedException e) {
             CMS.debug("LDAPProfileSubsystem: caught InterruptedException "
                     + "while waiting for initial load of profiles.");
+            CMS.debug("You may have replication conflict entries or "
+                    + "extraneous data under " + profileContainerDNString);
         }
         CMS.debug("LDAPProfileSubsystem: finished init");
     }
@@ -294,12 +299,10 @@ public class LDAPProfileSubsystem
     }
 
     private synchronized void handleMODDN(DN oldDN, LDAPEntry entry) {
-        DN profilesDN = new DN(dn);
-
-        if (oldDN.isDescendantOf(profilesDN))
+        if (oldDN.isDescendantOf(profileContainerDN))
             forgetProfile(oldDN.explodeDN(true)[0]);
 
-        if ((new DN(entry.getDN())).isDescendantOf(profilesDN))
+        if ((new DN(entry.getDN())).isDescendantOf(profileContainerDN))
             readProfile(entry);
     }
 
@@ -384,12 +387,14 @@ public class LDAPProfileSubsystem
         if (id == null) {
             throw new EProfileException("CMS_PROFILE_ID_NOT_FOUND");
         }
-        return "cn=" + id + "," + dn;
+        return "cn=" + id + "," + profileContainerDNString;
     }
 
     private void ensureProfilesOU(LDAPConnection conn) throws LDAPException {
         try {
-            conn.search(dn, LDAPConnection.SCOPE_BASE, "(objectclass=*)", null, false);
+            conn.search(
+                profileContainerDNString, LDAPConnection.SCOPE_BASE,
+                "(objectclass=*)", null, false);
         } catch (LDAPException e) {
             if (e.getLDAPResultCode() == LDAPException.NO_SUCH_OBJECT) {
                 CMS.debug("Adding LDAP certificate profiles container");
@@ -398,7 +403,7 @@ public class LDAPProfileSubsystem
                     new LDAPAttribute("ou", "certificateProfiles")
                 };
                 LDAPAttributeSet attrSet = new LDAPAttributeSet(attrs);
-                LDAPEntry entry = new LDAPEntry(dn, attrSet);
+                LDAPEntry entry = new LDAPEntry(profileContainerDNString, attrSet);
                 conn.add(entry);
             }
         }
@@ -426,8 +431,8 @@ public class LDAPProfileSubsystem
                 cons.setServerTimeLimit(0 /* seconds */);
                 String[] attrs = {"*", "entryUSN", "nsUniqueId", "numSubordinates"};
                 LDAPSearchResults results = conn.search(
-                    dn, LDAPConnection.SCOPE_SUB, "(objectclass=*)",
-                    attrs, false, cons);
+                    profileContainerDNString, LDAPConnection.SCOPE_SUB,
+                    "(objectclass=*)", attrs, false, cons);
 
                 /* Wait until the last possible moment before taking
                  * the load lock and dropping all profiles, so that
@@ -443,16 +448,44 @@ public class LDAPProfileSubsystem
 
                 while (!stopped && results.hasMoreElements()) {
                     LDAPEntry entry = results.next();
-
-                    String[] objectClasses =
-                        entry.getAttribute("objectClass").getStringValueArray();
-                    if (Arrays.asList(objectClasses).contains("organizationalUnit")) {
+                    DN entryDN = new DN(entry.getDN());
+
+                    if (entryDN.countRDNs() == profileContainerDN.countRDNs()) {
+                        /* This is the profile container.  Read numSubordinates to get
+                         * the expected number of profiles entries to read.
+                         *
+                         * numSubordinates is not reliable; it may be too high
+                         * due to objects we cannot see (e.g. replication conflict
+                         * entries).  In that case AsyncLoader has a watchdog
+                         * timer to interrupt waiting threads.
+                         */
                         loader.setNumItems(new Integer(
                             entry.getAttribute("numSubordinates")
                                 .getStringValueArray()[0]));
                         continue;
                     }
 
+                    if (entryDN.countRDNs() > profileContainerDN.countRDNs() + 1) {
+                        /* This entry is unexpectedly deep.  We ignore it.
+                         * numSubordinates only counts immediate subordinates
+                         * (https://tools.ietf.org/html/draft-boreham-numsubordinates-01)
+                         * so don't increment() the AsyncLoader.
+                         */
+                        continue;
+                    }
+
+                    /* This entry is at the expected depth.  Is it a certProfile? */
+                    String[] objectClasses =
+                        entry.getAttribute("objectClass").getStringValueArray();
+                    if (!Arrays.asList(objectClasses).contains("certProfile")) {
+                        /* It is not a certProfile; ignore it.  But it does
+                         * contribute to numSubordinates so increment the loader. */
+                        loader.increment();
+                        continue;
+                    }
+
+                    /* We have a profile.  Process it. */
+
                     LDAPEntryChangeControl changeControl = (LDAPEntryChangeControl)
                         LDAPUtil.getControl(
                             LDAPEntryChangeControl.class, results.getResponseControls());
-- 
1.8.3.1


From 94ce76abe56b680982b363bca8edf6d10d6d9a7f Mon Sep 17 00:00:00 2001
From: Fraser Tweedale <ftweedal@redhat.com>
Date: Thu, 28 Mar 2019 18:06:20 +1100
Subject: [PATCH 2/2] Add watchdog timer for initial load of LWCAs

Similar to the work done for LDAPProfileSubsystem, to avoid hanging
startup when the number of entries processed during initial load of
LWCAs is less than suggested by the numSubordinates attribute of the
container entry (replication conflict entries can cause this).
Switch the authority monitor to use AsyncLoader which provides the
watchdog timer, and takes care of some of the existing logic.

Also add a log message when the startup await gets interrupted, to
indicate that there may be replication conflicts or other extraneous
data in the LWCA subtree.

Related: https://pagure.io/dogtagpki/issue/3078
(cherry picked from commit 54c15eb4eba3568eace3791d183f8d2700e5d04e)
---
 .../src/com/netscape/ca/CertificateAuthority.java  | 102 +++++++++++----------
 1 file changed, 55 insertions(+), 47 deletions(-)

diff --git a/base/ca/src/com/netscape/ca/CertificateAuthority.java b/base/ca/src/com/netscape/ca/CertificateAuthority.java
index 90a8fba..0281db0 100644
--- a/base/ca/src/com/netscape/ca/CertificateAuthority.java
+++ b/base/ca/src/com/netscape/ca/CertificateAuthority.java
@@ -125,6 +125,7 @@ import com.netscape.certsrv.request.IRequestScheduler;
 import com.netscape.certsrv.request.IService;
 import com.netscape.certsrv.request.RequestStatus;
 import com.netscape.certsrv.security.ISigningUnit;
+import com.netscape.certsrv.util.AsyncLoader;
 import com.netscape.certsrv.util.IStatsSubsystem;
 import com.netscape.cms.logging.Logger;
 import com.netscape.cms.logging.SignedAuditLogger;
@@ -335,20 +336,10 @@ public class CertificateAuthority
     private boolean mUseNonces = true;
     private int mMaxNonces = 100;
 
-    /* Variables to manage loading and tracking of lightweight CAs
-     *
-     * The initialLoadDone latch causes the host authority's 'init'
-     * method to block until the monitor thread has finished the
-     * initial loading of lightweight CAs.
-     *
-     * In other words: the "server startup" cannot complete until
-     * all the lightweight CAs that exist at start time are loaded.
-     */
+    /* Variables to manage loading and tracking of lightweight CAs */
     private static boolean stopped = false;
     private static boolean foundHostAuthority = false;
-    private static Integer initialNumAuthorities = null;
-    private static int numAuthoritiesLoaded = 0;
-    private static CountDownLatch initialLoadDone = new CountDownLatch(1);
+    private AsyncLoader lwcaLoader = new AsyncLoader(10 /*10s timeout*/);
 
     /* Maps and sets of entryUSNs and nsUniqueIds for avoiding race
      * conditions and unnecessary reloads related to replication */
@@ -637,10 +628,17 @@ public class CertificateAuthority
             if (isHostAuthority() && haveLightweightCAsContainer()) {
                 new Thread(this, "authorityMonitor").start();
                 try {
-                    initialLoadDone.await();
+                    // block until the expected number of authorities
+                    // have been loaded (based on numSubordinates of
+                    // container entry), or watchdog times it out (in case
+                    // numSubordinates is larger than the number of entries
+                    // we can see, e.g. replication conflict entries).
+                    lwcaLoader.awaitLoadDone();
                 } catch (InterruptedException e) {
                     CMS.debug("CertificateAuthority: caught InterruptedException "
                             + "while waiting for initial load of authorities.");
+                    CMS.debug("You may have replication conflict entries or "
+                            + "extraneous data under " + authorityBaseDN());
                 }
 
                 if (!foundHostAuthority) {
@@ -3260,12 +3258,6 @@ public class CertificateAuthority
         }
     }
 
-    private void checkInitialLoadDone() {
-        if (initialNumAuthorities != null
-                && numAuthoritiesLoaded >= initialNumAuthorities)
-            initialLoadDone.countDown();
-    }
-
     public void run() {
         int op = LDAPPersistSearchControl.ADD
             | LDAPPersistSearchControl.MODIFY
@@ -3274,6 +3266,9 @@ public class CertificateAuthority
         LDAPPersistSearchControl persistCtrl =
             new LDAPPersistSearchControl(op, false, true, true);
 
+        String lwcaContainerDNString = authorityBaseDN();
+        DN lwcaContainerDN = new DN(lwcaContainerDNString);
+
         CMS.debug("authorityMonitor: starting.");
 
         while (!stopped) {
@@ -3286,38 +3281,52 @@ public class CertificateAuthority
                 cons.setServerTimeLimit(0 /* seconds */);
                 String[] attrs = {"*", "entryUSN", "nsUniqueId", "numSubordinates"};
                 LDAPSearchResults results = conn.search(
-                    authorityBaseDN(), LDAPConnection.SCOPE_SUB,
+                    lwcaContainerDNString, LDAPConnection.SCOPE_SUB,
                     "(objectclass=*)", attrs, false, cons);
+
+                /* Wait until the last possible moment before taking
+                 * the load lock so that we can continue to service
+                 * requests while LDAP is down.
+                 */
+                lwcaLoader.startLoading();
+
                 while (!stopped && results.hasMoreElements()) {
                     LDAPEntry entry = results.next();
+                    DN entryDN = new DN(entry.getDN());
+
+                    if (entryDN.countRDNs() == lwcaContainerDN.countRDNs()) {
+                        /* This must be the base entry of the search, i.e. the
+                         * LWCA container.  Read numSubordinates to get the
+                         * expected number of LWCA entries to read.
+                         *
+                         * numSubordinates is not reliable; it may be too high
+                         * due to objects we cannot see (e.g. replication
+                         * conflict entries).  In that case AsyncLoader has a
+                         * watchdog timer to interrupt waiting threads after it
+                         * times out.
+                         */
+                        lwcaLoader.setNumItems(new Integer(
+                            entry.getAttribute("numSubordinates")
+                                .getStringValueArray()[0]));
+                        continue;
+                    }
 
-                    /* This behaviour requires detailed explanation.
-                     *
-                     * We want to block startup until all the
-                     * lightweight CAs existing at startup time are
-                     * loaded.  To do this, we need to know how many
-                     * authority entries there are.  And we must do
-                     * this atomically - we cannot issue two LDAP
-                     * searches in case things change.
-                     *
-                     * Therefore, we do a subtree search from the
-                     * authority container.  When we find the
-                     * container (objectClass=organizationalUnit),
-                     * we set initialNumAuthorities to the value of
-                     * its numSubordinates attribute.
-                     *
-                     * We increment numAuthoritiesLoaded for each
-                     * authority entry.  When numAuthoritiesLoaded
-                     * equals initialNumAuthorities, we unlock the
-                     * initialLoadDone latch.
-                     */
+                    if (entryDN.countRDNs() > lwcaContainerDN.countRDNs() + 1) {
+                        /* This entry is unexpectedly deep.  We ignore it.
+                         * numSubordinates only counts immediate subordinates
+                         * (https://tools.ietf.org/html/draft-boreham-numsubordinates-01)
+                         * so don't increment() the AsyncLoader.
+                         */
+                        continue;
+                    }
+
+                    /* This entry is at the expected depth.  Is it a LWCA entry? */
                     String[] objectClasses =
                         entry.getAttribute("objectClass").getStringValueArray();
-                    if (Arrays.asList(objectClasses).contains("organizationalUnit")) {
-                        initialNumAuthorities = new Integer(
-                            entry.getAttribute("numSubordinates")
-                                .getStringValueArray()[0]);
-                        checkInitialLoadDone();
+                    if (!Arrays.asList(objectClasses).contains("authority")) {
+                        /* It is not a LWCA entry; ignore it.  But it does
+                         * contribute to numSubordinates so increment the loader. */
+                        lwcaLoader.increment();
                         continue;
                     }
 
@@ -3352,8 +3361,7 @@ public class CertificateAuthority
                     } else {
                         CMS.debug("authorityMonitor: immediate result");
                         readAuthority(entry);
-                        numAuthoritiesLoaded += 1;
-                        checkInitialLoadDone();
+                        lwcaLoader.increment();
                     }
                 }
             } catch (ELdapException e) {
-- 
1.8.3.1