Blame SOURCES/0002-ticket-2058-Add-keep-alive-entry-after-on-line-initi.patch

be9751
From 29c9e1c3c760f0941b022d45d14c248e9ceb9738 Mon Sep 17 00:00:00 2001
be9751
From: progier389 <72748589+progier389@users.noreply.github.com>
be9751
Date: Tue, 3 Nov 2020 12:18:50 +0100
be9751
Subject: [PATCH 2/3] ticket 2058: Add keep alive entry after on-line
be9751
 initialization - second version (#4399)
be9751
be9751
Bug description:
be9751
Keep alive entry is not created on target master after on line initialization,
be9751
and its RUVelement stays empty until a direct update is issued on that master
be9751
be9751
Fix description:
be9751
The patch allows a consumer (configured as a master) to create (if it did not
be9751
exist before) the consumer's keep alive entry. It creates it at the end of a
be9751
replication session at a time we are sure the changelog exists and will not
be9751
be reset. It allows a consumer to have RUVelement with csn in the RUV at the
be9751
first incoming replication session.
be9751
be9751
That is basically lkrispen's proposal with an associated pytest testcase
be9751
be9751
Second version changes:
be9751
   - moved the testcase to suites/replication/regression_test.py
be9751
   - set up the topology from a 2 master topology then
be9751
    reinitialized the replicas from an ldif without replication metadata
be9751
    rather than using the cli.
be9751
   - search for keepalive entries using search_s instead of getEntry
be9751
   - add a comment about keep alive entries purpose
be9751
be9751
last commit:
be9751
   - wait that ruv are in sync before checking keep alive entries
be9751
be9751
Reviewed by: droideck, Firstyear
be9751
be9751
Platforms tested: F32
be9751
be9751
relates: #2058
be9751
---
be9751
 .../suites/replication/regression_test.py     | 130 ++++++++++++++++++
be9751
 .../plugins/replication/repl5_replica.c       |  14 ++
be9751
 ldap/servers/plugins/replication/repl_extop.c |   4 +
be9751
 3 files changed, 148 insertions(+)
be9751
be9751
diff --git a/dirsrvtests/tests/suites/replication/regression_test.py b/dirsrvtests/tests/suites/replication/regression_test.py
be9751
index 844d762b9..14b9d6a44 100644
be9751
--- a/dirsrvtests/tests/suites/replication/regression_test.py
be9751
+++ b/dirsrvtests/tests/suites/replication/regression_test.py
be9751
@@ -98,6 +98,30 @@ def _move_ruv(ldif_file):
be9751
         for dn, entry in ldif_list:
be9751
             ldif_writer.unparse(dn, entry)
be9751
 
be9751
+def _remove_replication_data(ldif_file):
be9751
+    """ Remove the replication data from ldif file:
be9751
+        db2ldif without -r includes some of the replica data like
be9751
+        - nsUniqueId
be9751
+        - keepalive entries
be9751
+        This function filters the ldif file to remove these data
be9751
+    """
be9751
+
be9751
+    with open(ldif_file) as f:
be9751
+        parser = ldif.LDIFRecordList(f)
be9751
+        parser.parse()
be9751
+
be9751
+        ldif_list = parser.all_records
be9751
+        # Iterate on a copy of the ldif entry list
be9751
+        for dn, entry in ldif_list[:]:
be9751
+            if dn.startswith('cn=repl keep alive'):
be9751
+                ldif_list.remove((dn,entry))
be9751
+            else:
be9751
+                entry.pop('nsUniqueId')
be9751
+    with open(ldif_file, 'w') as f:
be9751
+        ldif_writer = ldif.LDIFWriter(f)
be9751
+        for dn, entry in ldif_list:
be9751
+            ldif_writer.unparse(dn, entry)
be9751
+
be9751
 
be9751
 @pytest.fixture(scope="module")
be9751
 def topo_with_sigkill(request):
be9751
@@ -897,6 +921,112 @@ def test_moving_entry_make_online_init_fail(topology_m2):
be9751
     assert len(m1entries) == len(m2entries)
be9751
 
be9751
 
be9751
+def get_keepalive_entries(instance,replica):
be9751
+    # Returns the keep alive entries that exists with the suffix of the server instance
be9751
+    try:
be9751
+        entries = instance.search_s(replica.get_suffix(), ldap.SCOPE_ONELEVEL,
be9751
+                    "(&(objectclass=ldapsubentry)(cn=repl keep alive*))",
be9751
+                    ['cn', 'nsUniqueId', 'modifierTimestamp'])
be9751
+    except ldap.LDAPError as e:
be9751
+        log.fatal('Failed to retrieve keepalive entry (%s) on instance %s: error %s' % (dn, instance, str(e)))
be9751
+        assert False
be9751
+    # No error, so lets log the keepalive entries
be9751
+    if log.isEnabledFor(logging.DEBUG):
be9751
+        for ret in entries:
be9751
+            log.debug("Found keepalive entry:\n"+str(ret));
be9751
+    return entries
be9751
+
be9751
+def verify_keepalive_entries(topo, expected):
be9751
+    #Check that keep alive entries exists (or not exists) for every masters on every masters
be9751
+    #Note: The testing method is quite basic: counting that there is one keepalive entry per master.
be9751
+    # that is ok for simple test cases like test_online_init_should_create_keepalive_entries but
be9751
+    # not for the general case as keep alive associated with no more existing master may exists
be9751
+    # (for example after: db2ldif / demote a master / ldif2db / init other masters)
be9751
+    # ==> if the function is somehow pushed in lib389, a check better than simply counting the entries
be9751
+    # should be done.
be9751
+    for masterId in topo.ms:
be9751
+        master=topo.ms[masterId]
be9751
+        for replica in Replicas(master).list():
be9751
+            if (replica.get_role() != ReplicaRole.MASTER):
be9751
+               continue
be9751
+            replica_info = f'master: {masterId} RID: {replica.get_rid()} suffix: {replica.get_suffix()}'
be9751
+            log.debug(f'Checking keepAliveEntries on {replica_info}')
be9751
+            keepaliveEntries = get_keepalive_entries(master, replica);
be9751
+            expectedCount = len(topo.ms) if expected else 0
be9751
+            foundCount = len(keepaliveEntries)
be9751
+            if (foundCount == expectedCount):
be9751
+                log.debug(f'Found {foundCount} keepalive entries as expected on {replica_info}.')
be9751
+            else:
be9751
+                log.error(f'{foundCount} Keepalive entries are found '
be9751
+                          f'while {expectedCount} were expected on {replica_info}.')
be9751
+                assert False
be9751
+
be9751
+
be9751
+def test_online_init_should_create_keepalive_entries(topo_m2):
be9751
+    """Check that keep alive entries are created when initializing a master from another one
be9751
+
be9751
+    :id: d5940e71-d18a-4b71-aaf7-b9185361fffe
be9751
+    :setup: Two masters replication setup
be9751
+    :steps:
be9751
+        1. Generate ldif without replication data
be9751
+        2  Init both masters from that ldif
be9751
+        3  Check that keep alive entries do not exist
be9751
+        4  Perform on line init of master2 from master1
be9751
+        5  Check that keep alive entries exists
be9751
+    :expectedresults:
be9751
+        1. No error while generating ldif
be9751
+        2. No error while importing the ldif file
be9751
+        3. No keepalive entries should exist on any masters
be9751
+        4. No error while initializing master2
be9751
+        5. All keepalive entries should exist on every masters
be9751
+
be9751
+    """
be9751
+
be9751
+    repl = ReplicationManager(DEFAULT_SUFFIX)
be9751
+    m1 = topo_m2.ms["master1"]
be9751
+    m2 = topo_m2.ms["master2"]
be9751
+    # Step 1: Generate ldif without replication data
be9751
+    m1.stop()
be9751
+    m2.stop()
be9751
+    ldif_file = '%s/norepl.ldif' % m1.get_ldif_dir()
be9751
+    m1.db2ldif(bename=DEFAULT_BENAME, suffixes=[DEFAULT_SUFFIX],
be9751
+               excludeSuffixes=None, repl_data=False,
be9751
+               outputfile=ldif_file, encrypt=False)
be9751
+    # Remove replication metadata that are still in the ldif
be9751
+    _remove_replication_data(ldif_file)
be9751
+
be9751
+    # Step 2: Init both masters from that ldif
be9751
+    m1.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
be9751
+    m2.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
be9751
+    m1.start()
be9751
+    m2.start()
be9751
+
be9751
+    """ Replica state is now as if CLI setup has been done using:
be9751
+        dsconf master1 replication enable --suffix "${SUFFIX}" --role master
be9751
+        dsconf master2 replication enable --suffix "${SUFFIX}" --role master
be9751
+        dsconf master1 replication create-manager --name "${REPLICATION_MANAGER_NAME}" --passwd "${REPLICATION_MANAGER_PASSWORD}"
be9751
+        dsconf master2 replication create-manager --name "${REPLICATION_MANAGER_NAME}" --passwd "${REPLICATION_MANAGER_PASSWORD}"
be9751
+        dsconf master1 repl-agmt create --suffix "${SUFFIX}"
be9751
+        dsconf master2 repl-agmt create --suffix "${SUFFIX}"
be9751
+    """
be9751
+
be9751
+    # Step 3: No keepalive entrie should exists on any masters
be9751
+    verify_keepalive_entries(topo_m2, False)
be9751
+
be9751
+    # Step 4: Perform on line init of master2 from master1
be9751
+    agmt = Agreements(m1).list()[0]
be9751
+    agmt.begin_reinit()
be9751
+    (done, error) = agmt.wait_reinit()
be9751
+    assert done is True
be9751
+    assert error is False
be9751
+
be9751
+    # Step 5: All keepalive entries should exists on every masters
be9751
+    #  Verify the keep alive entry once replication is in sync
be9751
+    # (that is the step that fails when bug is not fixed)
be9751
+    repl.wait_for_ruv(m2,m1)
be9751
+    verify_keepalive_entries(topo_m2, True);
be9751
+
be9751
+
be9751
 if __name__ == '__main__':
be9751
     # Run isolated
be9751
     # -s for DEBUG mode
be9751
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
be9751
index f01782330..f0ea0f8ef 100644
be9751
--- a/ldap/servers/plugins/replication/repl5_replica.c
be9751
+++ b/ldap/servers/plugins/replication/repl5_replica.c
be9751
@@ -373,6 +373,20 @@ replica_destroy(void **arg)
be9751
     slapi_ch_free((void **)arg);
be9751
 }
be9751
 
be9751
+/******************************************************************************
be9751
+ ******************** REPLICATION KEEP ALIVE ENTRIES **************************
be9751
+ ******************************************************************************
be9751
+ * They are subentries of the replicated suffix and there is one per master.  *
be9751
+ * These entries exist only to trigger a change that get replicated over the  *
be9751
+ * topology.                                                                  *
be9751
+ * Their main purpose is to generate records in the changelog and they are    *
be9751
+ * updated from time to time by fractional replication to insure that at      *
be9751
+ * least a change must be replicated by FR after a great number of not        *
be9751
+ * replicated changes are found in the changelog. The interest is that the    *
be9751
+ * fractional RUV get then updated so less changes need to be walked in the   *
be9751
+ * changelog when searching for the first change to send                      *
be9751
+ ******************************************************************************/
be9751
+
be9751
 #define KEEP_ALIVE_ATTR "keepalivetimestamp"
be9751
 #define KEEP_ALIVE_ENTRY "repl keep alive"
be9751
 #define KEEP_ALIVE_DN_FORMAT "cn=%s %d,%s"
be9751
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
be9751
index 14c8e0bcc..af486f730 100644
be9751
--- a/ldap/servers/plugins/replication/repl_extop.c
be9751
+++ b/ldap/servers/plugins/replication/repl_extop.c
be9751
@@ -1173,6 +1173,10 @@ multimaster_extop_EndNSDS50ReplicationRequest(Slapi_PBlock *pb)
be9751
                 */
be9751
                 if (cl5GetState() == CL5_STATE_OPEN) {
be9751
                     replica_log_ruv_elements(r);
be9751
+                    /* now that the changelog is open and started, we can also create the
be9751
+                     * keep alive entry without risk that db and cl will not match
be9751
+                     */
be9751
+                    replica_subentry_check(replica_get_root(r), replica_get_rid(r));
be9751
                 }
be9751
 
be9751
                 /* ONREPL code that dealt with new RUV, etc was moved into the code
be9751
-- 
be9751
2.26.2
be9751