Blob Blame Raw
From 29c9e1c3c760f0941b022d45d14c248e9ceb9738 Mon Sep 17 00:00:00 2001
From: progier389 <72748589+progier389@users.noreply.github.com>
Date: Tue, 3 Nov 2020 12:18:50 +0100
Subject: [PATCH 2/3] ticket 2058: Add keep alive entry after on-line
 initialization - second version (#4399)

Bug description:
Keep alive entry is not created on target master after on line initialization,
and its RUVelement stays empty until a direct update is issued on that master

Fix description:
The patch allows a consumer (configured as a master) to create (if it did not
exist before) the consumer's keep alive entry. It creates it at the end of a
replication session at a time we are sure the changelog exists and will not
be reset. It allows a consumer to have RUVelement with csn in the RUV at the
first incoming replication session.

That is basically lkrispen's proposal with an associated pytest testcase

Second version changes:
   - moved the testcase to suites/replication/regression_test.py
   - set up the topology from a 2 master topology then
    reinitialized the replicas from an ldif without replication metadata
    rather than using the cli.
   - search for keepalive entries using search_s instead of getEntry
   - add a comment about keep alive entries purpose

last commit:
   - wait that ruv are in sync before checking keep alive entries

Reviewed by: droideck, Firstyear

Platforms tested: F32

relates: #2058
---
 .../suites/replication/regression_test.py     | 130 ++++++++++++++++++
 .../plugins/replication/repl5_replica.c       |  14 ++
 ldap/servers/plugins/replication/repl_extop.c |   4 +
 3 files changed, 148 insertions(+)

diff --git a/dirsrvtests/tests/suites/replication/regression_test.py b/dirsrvtests/tests/suites/replication/regression_test.py
index 844d762b9..14b9d6a44 100644
--- a/dirsrvtests/tests/suites/replication/regression_test.py
+++ b/dirsrvtests/tests/suites/replication/regression_test.py
@@ -98,6 +98,30 @@ def _move_ruv(ldif_file):
         for dn, entry in ldif_list:
             ldif_writer.unparse(dn, entry)
 
+def _remove_replication_data(ldif_file):
+    """ Remove the replication data from ldif file:
+        db2lif without -r includes some of the replica data like 
+        - nsUniqueId
+        - keepalive entries
+        This function filters the ldif fil to remove these data
+    """
+
+    with open(ldif_file) as f:
+        parser = ldif.LDIFRecordList(f)
+        parser.parse()
+
+        ldif_list = parser.all_records
+        # Iterate on a copy of the ldif entry list
+        for dn, entry in ldif_list[:]:
+            if dn.startswith('cn=repl keep alive'):
+                ldif_list.remove((dn,entry))
+            else:
+                entry.pop('nsUniqueId')
+    with open(ldif_file, 'w') as f:
+        ldif_writer = ldif.LDIFWriter(f)
+        for dn, entry in ldif_list:
+            ldif_writer.unparse(dn, entry)
+
 
 @pytest.fixture(scope="module")
 def topo_with_sigkill(request):
@@ -897,6 +921,112 @@ def test_moving_entry_make_online_init_fail(topology_m2):
     assert len(m1entries) == len(m2entries)
 
 
+def get_keepalive_entries(instance,replica):
+    # Returns the keep alive entries that exists with the suffix of the server instance
+    try:
+        entries = instance.search_s(replica.get_suffix(), ldap.SCOPE_ONELEVEL,
+                    "(&(objectclass=ldapsubentry)(cn=repl keep alive*))",
+                    ['cn', 'nsUniqueId', 'modifierTimestamp'])
+    except ldap.LDAPError as e:
+        log.fatal('Failed to retrieve keepalive entry (%s) on instance %s: error %s' % (dn, instance, str(e)))
+        assert False
+    # No error, so lets log the keepalive entries
+    if log.isEnabledFor(logging.DEBUG):
+        for ret in entries:
+            log.debug("Found keepalive entry:\n"+str(ret));
+    return entries
+
+def verify_keepalive_entries(topo, expected):
+    #Check that keep alive entries exists (or not exists) for every masters on every masters
+    #Note: The testing method is quite basic: counting that there is one keepalive entry per master.
+    # that is ok for simple test cases like test_online_init_should_create_keepalive_entries but
+    # not for the general case as keep alive associated with no more existing master may exists
+    # (for example after: db2ldif / demote a master / ldif2db / init other masters)
+    # ==> if the function is somehow pushed in lib389, a check better than simply counting the entries
+    # should be done.
+    for masterId in topo.ms:
+        master=topo.ms[masterId]
+        for replica in Replicas(master).list():
+            if (replica.get_role() != ReplicaRole.MASTER):
+               continue
+            replica_info = f'master: {masterId} RID: {replica.get_rid()} suffix: {replica.get_suffix()}'
+            log.debug(f'Checking keepAliveEntries on {replica_info}')
+            keepaliveEntries = get_keepalive_entries(master, replica);
+            expectedCount = len(topo.ms) if expected else 0
+            foundCount = len(keepaliveEntries)
+            if (foundCount == expectedCount):
+                log.debug(f'Found {foundCount} keepalive entries as expected on {replica_info}.')
+            else:
+                log.error(f'{foundCount} Keepalive entries are found '
+                          f'while {expectedCount} were expected on {replica_info}.')
+                assert False
+
+
+def test_online_init_should_create_keepalive_entries(topo_m2):
+    """Check that keep alive entries are created when initializinf a master from another one
+
+    :id: d5940e71-d18a-4b71-aaf7-b9185361fffe
+    :setup: Two masters replication setup
+    :steps:
+        1. Generate ldif without replication data
+        2  Init both masters from that ldif
+        3  Check that keep alive entries does not exists
+        4  Perform on line init of master2 from master1
+        5  Check that keep alive entries exists
+    :expectedresults:
+        1. No error while generating ldif
+        2. No error while importing the ldif file
+        3. No keepalive entrie should exists on any masters
+        4. No error while initializing master2
+        5. All keepalive entries should exist on every masters
+
+    """
+
+    repl = ReplicationManager(DEFAULT_SUFFIX)
+    m1 = topo_m2.ms["master1"]
+    m2 = topo_m2.ms["master2"]
+    # Step 1: Generate ldif without replication data
+    m1.stop()
+    m2.stop()
+    ldif_file = '%s/norepl.ldif' % m1.get_ldif_dir()
+    m1.db2ldif(bename=DEFAULT_BENAME, suffixes=[DEFAULT_SUFFIX],
+               excludeSuffixes=None, repl_data=False,
+               outputfile=ldif_file, encrypt=False)
+    # Remove replication metadata that are still in the ldif
+    _remove_replication_data(ldif_file)
+
+    # Step 2: Init both masters from that ldif
+    m1.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m2.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m1.start()
+    m2.start()
+
+    """ Replica state is now as if CLI setup has been done using:
+        dsconf master1 replication enable --suffix "${SUFFIX}" --role master
+        dsconf master2 replication enable --suffix "${SUFFIX}" --role master
+        dsconf master1 replication create-manager --name "${REPLICATION_MANAGER_NAME}" --passwd "${REPLICATION_MANAGER_PASSWORD}"
+        dsconf master2 replication create-manager --name "${REPLICATION_MANAGER_NAME}" --passwd "${REPLICATION_MANAGER_PASSWORD}"
+        dsconf master1 repl-agmt create --suffix "${SUFFIX}"
+        dsconf master2 repl-agmt create --suffix "${SUFFIX}"
+    """
+
+    # Step 3: No keepalive entrie should exists on any masters
+    verify_keepalive_entries(topo_m2, False)
+
+    # Step 4: Perform on line init of master2 from master1
+    agmt = Agreements(m1).list()[0]
+    agmt.begin_reinit()
+    (done, error) = agmt.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 5: All keepalive entries should exists on every masters
+    #  Verify the keep alive entry once replication is in sync
+    # (that is the step that fails when bug is not fixed)
+    repl.wait_for_ruv(m2,m1)
+    verify_keepalive_entries(topo_m2, True);
+
+
 if __name__ == '__main__':
     # Run isolated
     # -s for DEBUG mode
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
index f01782330..f0ea0f8ef 100644
--- a/ldap/servers/plugins/replication/repl5_replica.c
+++ b/ldap/servers/plugins/replication/repl5_replica.c
@@ -373,6 +373,20 @@ replica_destroy(void **arg)
     slapi_ch_free((void **)arg);
 }
 
+/******************************************************************************
+ ******************** REPLICATION KEEP ALIVE ENTRIES **************************
+ ******************************************************************************
+ * They are subentries of the replicated suffix and there is one per master.  *
+ * These entries exist only to trigger a change that get replicated over the  *
+ * topology.                                                                  *
+ * Their main purpose is to generate records in the changelog and they are    *
+ * updated from time to time by fractional replication to insure that at      *
+ * least a change must be replicated by FR after a great number of not        *
+ * replicated changes are found in the changelog. The interest is that the    *
+ * fractional RUV get then updated so less changes need to be walked in the   *
+ * changelog when searching for the first change to send                      *
+ ******************************************************************************/
+
 #define KEEP_ALIVE_ATTR "keepalivetimestamp"
 #define KEEP_ALIVE_ENTRY "repl keep alive"
 #define KEEP_ALIVE_DN_FORMAT "cn=%s %d,%s"
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
index 14c8e0bcc..af486f730 100644
--- a/ldap/servers/plugins/replication/repl_extop.c
+++ b/ldap/servers/plugins/replication/repl_extop.c
@@ -1173,6 +1173,10 @@ multimaster_extop_EndNSDS50ReplicationRequest(Slapi_PBlock *pb)
                 */
                 if (cl5GetState() == CL5_STATE_OPEN) {
                     replica_log_ruv_elements(r);
+                    /* now that the changelog is open and started, we can alos cretae the
+                     * keep alive entry without risk that db and cl will not match
+                     */
+                    replica_subentry_check(replica_get_root(r), replica_get_rid(r));
                 }
 
                 /* ONREPL code that dealt with new RUV, etc was moved into the code
-- 
2.26.2