SOURCES/0002-ticket-2058-Add-keep-alive-entry-after-on-line-initi.patch

From 29c9e1c3c760f0941b022d45d14c248e9ceb9738 Mon Sep 17 00:00:00 2001
From: progier389 <72748589+progier389@users.noreply.github.com>
Date: Tue, 3 Nov 2020 12:18:50 +0100
Subject: [PATCH 2/3] ticket 2058: Add keep alive entry after on-line
 initialization - second version (#4399)

Bug description:
The keep alive entry is not created on the target master after an on-line
initialization, and its RUV element stays empty until a direct update is
issued on that master.

Fix description:
The patch allows a consumer (configured as a master) to create its own
keep alive entry if it did not exist before. The entry is created at the
end of a replication session, at a point where we are sure the changelog
exists and will not be reset. This allows a consumer to have a RUV element
with a CSN in the RUV as of the first incoming replication session.

That is basically lkrispen's proposal, with an associated pytest testcase.

Second version changes:
   - moved the testcase to suites/replication/regression_test.py
   - set up the topology from a 2 master topology, then
     reinitialized the replicas from an ldif without replication metadata
     rather than using the cli.
   - search for keepalive entries using search_s instead of getEntry
     (see the search sketch below)
   - add a comment about the keep alive entries' purpose

last commit:
   - wait until the RUVs are in sync before checking the keep alive entries

Reviewed by: droideck, Firstyear

Platforms tested: F32

relates: #2058
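
For reference, a minimal, illustrative sketch of the search_s check the
testcase performs: it searches one level under the replicated suffix for the
per-master keep alive subentries. The connection URL, bind credentials and
suffix below are placeholders, not values taken from this patch.

    import ldap

    # Placeholder connection details; adapt to the instance under test.
    conn = ldap.initialize('ldap://localhost:389')
    conn.simple_bind_s('cn=Directory Manager', 'password')

    # One "cn=repl keep alive <rid>" subentry per master is expected once
    # on-line initialization has created them.
    entries = conn.search_s('dc=example,dc=com', ldap.SCOPE_ONELEVEL,
                            '(&(objectclass=ldapsubentry)(cn=repl keep alive*))',
                            ['cn', 'keepalivetimestamp'])
    for dn, attrs in entries:
        print(dn, attrs)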
---
 .../suites/replication/regression_test.py     | 130 ++++++++++++++++++
 .../plugins/replication/repl5_replica.c       |  14 ++
 ldap/servers/plugins/replication/repl_extop.c |   4 +
 3 files changed, 148 insertions(+)

diff --git a/dirsrvtests/tests/suites/replication/regression_test.py b/dirsrvtests/tests/suites/replication/regression_test.py
index 844d762b9..14b9d6a44 100644
--- a/dirsrvtests/tests/suites/replication/regression_test.py
+++ b/dirsrvtests/tests/suites/replication/regression_test.py
@@ -98,6 +98,30 @@ def _move_ruv(ldif_file):
         for dn, entry in ldif_list:
             ldif_writer.unparse(dn, entry)
 
+def _remove_replication_data(ldif_file):
+    """ Remove the replication data from an ldif file:
+        db2ldif without -r includes some of the replica data like
+        - nsUniqueId
+        - keepalive entries
+        This function filters the ldif file to remove this data
+    """
+
+    with open(ldif_file) as f:
+        parser = ldif.LDIFRecordList(f)
+        parser.parse()
+
+        ldif_list = parser.all_records
+        # Iterate on a copy of the ldif entry list
+        for dn, entry in ldif_list[:]:
+            if dn.startswith('cn=repl keep alive'):
+                ldif_list.remove((dn, entry))
+            else:
+                entry.pop('nsUniqueId')
+    with open(ldif_file, 'w') as f:
+        ldif_writer = ldif.LDIFWriter(f)
+        for dn, entry in ldif_list:
+            ldif_writer.unparse(dn, entry)
+
 
 @pytest.fixture(scope="module")
 def topo_with_sigkill(request):
@@ -897,6 +921,112 @@ def test_moving_entry_make_online_init_fail(topology_m2):
     assert len(m1entries) == len(m2entries)
 
 
+def get_keepalive_entries(instance, replica):
+    # Returns the keep alive entries that exist under the suffix of the server instance
+    try:
+        entries = instance.search_s(replica.get_suffix(), ldap.SCOPE_ONELEVEL,
+                    "(&(objectclass=ldapsubentry)(cn=repl keep alive*))",
+                    ['cn', 'nsUniqueId', 'modifierTimestamp'])
+    except ldap.LDAPError as e:
+        log.fatal('Failed to retrieve keepalive entries on instance %s: error %s' % (instance, str(e)))
+        assert False
+    # No error, so let's log the keepalive entries
+    if log.isEnabledFor(logging.DEBUG):
+        for ret in entries:
+            log.debug("Found keepalive entry:\n" + str(ret))
+    return entries
+
+def verify_keepalive_entries(topo, expected):
+    # Check that keep alive entries exist (or do not exist) for every master on every master
+    # Note: The testing method is quite basic: counting that there is one keepalive entry per master.
+    # That is ok for simple test cases like test_online_init_should_create_keepalive_entries but
+    # not for the general case, as keep alive entries associated with masters that no longer exist may remain
+    # (for example after: db2ldif / demote a master / ldif2db / init other masters)
+    # ==> if the function is somehow pushed into lib389, a check better than simply counting the entries
+    # should be done.
+    for masterId in topo.ms:
+        master = topo.ms[masterId]
+        for replica in Replicas(master).list():
+            if (replica.get_role() != ReplicaRole.MASTER):
+                continue
+            replica_info = f'master: {masterId} RID: {replica.get_rid()} suffix: {replica.get_suffix()}'
+            log.debug(f'Checking keepAliveEntries on {replica_info}')
+            keepaliveEntries = get_keepalive_entries(master, replica)
+            expectedCount = len(topo.ms) if expected else 0
+            foundCount = len(keepaliveEntries)
+            if (foundCount == expectedCount):
+                log.debug(f'Found {foundCount} keepalive entries as expected on {replica_info}.')
+            else:
+                log.error(f'{foundCount} keepalive entries were found '
+                          f'while {expectedCount} were expected on {replica_info}.')
+                assert False
+
+
+def test_online_init_should_create_keepalive_entries(topo_m2):
+    """Check that keep alive entries are created when initializing a master from another one
+
+    :id: d5940e71-d18a-4b71-aaf7-b9185361fffe
+    :setup: Two masters replication setup
+    :steps:
+        1. Generate ldif without replication data
+        2. Init both masters from that ldif
+        3. Check that keep alive entries do not exist
+        4. Perform online init of master2 from master1
+        5. Check that keep alive entries exist
+    :expectedresults:
+        1. No error while generating ldif
+        2. No error while importing the ldif file
+        3. No keepalive entries should exist on any master
+        4. No error while initializing master2
+        5. All keepalive entries should exist on every master
+
+    """
+
+    repl = ReplicationManager(DEFAULT_SUFFIX)
+    m1 = topo_m2.ms["master1"]
+    m2 = topo_m2.ms["master2"]
+    # Step 1: Generate ldif without replication data
+    m1.stop()
+    m2.stop()
+    ldif_file = '%s/norepl.ldif' % m1.get_ldif_dir()
+    m1.db2ldif(bename=DEFAULT_BENAME, suffixes=[DEFAULT_SUFFIX],
+               excludeSuffixes=None, repl_data=False,
+               outputfile=ldif_file, encrypt=False)
+    # Remove replication metadata that are still in the ldif
+    _remove_replication_data(ldif_file)
+
+    # Step 2: Init both masters from that ldif
+    m1.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m2.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m1.start()
+    m2.start()
+
+    """ Replica state is now as if CLI setup has been done using:
+        dsconf master1 replication enable --suffix "${SUFFIX}" --role master
+        dsconf master2 replication enable --suffix "${SUFFIX}" --role master
+        dsconf master1 replication create-manager --name "${REPLICATION_MANAGER_NAME}" --passwd "${REPLICATION_MANAGER_PASSWORD}"
+        dsconf master2 replication create-manager --name "${REPLICATION_MANAGER_NAME}" --passwd "${REPLICATION_MANAGER_PASSWORD}"
+        dsconf master1 repl-agmt create --suffix "${SUFFIX}"
+        dsconf master2 repl-agmt create --suffix "${SUFFIX}"
+    """
+
+    # Step 3: No keepalive entries should exist on any master
+    verify_keepalive_entries(topo_m2, False)
+
+    # Step 4: Perform online init of master2 from master1
+    agmt = Agreements(m1).list()[0]
+    agmt.begin_reinit()
+    (done, error) = agmt.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 5: All keepalive entries should exist on every master
+    # Verify the keep alive entries once replication is in sync
+    # (that is the step that fails when the bug is not fixed)
+    repl.wait_for_ruv(m2, m1)
+    verify_keepalive_entries(topo_m2, True)
+
+
 if __name__ == '__main__':
     # Run isolated
     # -s for DEBUG mode
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
index f01782330..f0ea0f8ef 100644
--- a/ldap/servers/plugins/replication/repl5_replica.c
+++ b/ldap/servers/plugins/replication/repl5_replica.c
@@ -373,6 +373,20 @@ replica_destroy(void **arg)
     slapi_ch_free((void **)arg);
 }
 
+/******************************************************************************
+ ******************** REPLICATION KEEP ALIVE ENTRIES **************************
+ ******************************************************************************
+ * They are subentries of the replicated suffix and there is one per master.  *
+ * These entries exist only to trigger a change that gets replicated over the *
+ * topology.                                                                  *
+ * Their main purpose is to generate records in the changelog and they are    *
+ * updated from time to time by fractional replication to ensure that at      *
+ * least one change is replicated by FR after a large number of not           *
+ * replicated changes are found in the changelog. The benefit is that the     *
+ * fractional RUV is then updated, so fewer changes need to be walked in the  *
+ * changelog when searching for the first change to send.                     *
+ ******************************************************************************/
+
 #define KEEP_ALIVE_ATTR "keepalivetimestamp"
 #define KEEP_ALIVE_ENTRY "repl keep alive"
 #define KEEP_ALIVE_DN_FORMAT "cn=%s %d,%s"
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
index 14c8e0bcc..af486f730 100644
--- a/ldap/servers/plugins/replication/repl_extop.c
+++ b/ldap/servers/plugins/replication/repl_extop.c
@@ -1173,6 +1173,10 @@ multimaster_extop_EndNSDS50ReplicationRequest(Slapi_PBlock *pb)
                 */
                 if (cl5GetState() == CL5_STATE_OPEN) {
                     replica_log_ruv_elements(r);
+                    /* now that the changelog is open and started, we can also create the
+                     * keep alive entry without risk that db and changelog will not match
+                     */
+                    replica_subentry_check(replica_get_root(r), replica_get_rid(r));
                 }
 
                 /* ONREPL code that dealt with new RUV, etc was moved into the code
-- 
2.26.2