From 29c9e1c3c760f0941b022d45d14c248e9ceb9738 Mon Sep 17 00:00:00 2001
From: progier389 <72748589+progier389@users.noreply.github.com>
Date: Tue, 3 Nov 2020 12:18:50 +0100
Subject: [PATCH 2/3] ticket 2058: Add keep alive entry after on-line
 initialization - second version (#4399)

Bug description:
The keep alive entry is not created on the target master after online
initialization, and its RUV element stays empty until a direct update
is issued on that master.

Fix description:
The patch allows a consumer (configured as a master) to create its own
keep alive entry if it does not already exist. The entry is created at
the end of a replication session, at a point where we are sure that the
changelog exists and will not be reset. This lets the consumer have a
RUV element with a CSN in the RUV after the first incoming replication
session.

That is basically lkrispen's proposal with an associated pytest testcase.

Second version changes:
   - moved the testcase to suites/replication/regression_test.py
   - set up the topology from a 2-master topology, then reinitialized
     the replicas from an ldif without replication metadata rather than
     using the CLI
   - searched for keepalive entries using search_s instead of getEntry
   - added a comment about the purpose of keep alive entries

Last commit:
   - wait until the RUVs are in sync before checking the keep alive entries

Reviewed by: droideck, Firstyear

Platforms tested: F32

relates: #2058
---
 .../suites/replication/regression_test.py     | 130 ++++++++++++++++++
 .../plugins/replication/repl5_replica.c       |  14 ++
 ldap/servers/plugins/replication/repl_extop.c |   4 +
 3 files changed, 148 insertions(+)

diff --git a/dirsrvtests/tests/suites/replication/regression_test.py b/dirsrvtests/tests/suites/replication/regression_test.py
index 844d762b9..14b9d6a44 100644
--- a/dirsrvtests/tests/suites/replication/regression_test.py
+++ b/dirsrvtests/tests/suites/replication/regression_test.py
@@ -98,6 +98,30 @@ def _move_ruv(ldif_file):
         for dn, entry in ldif_list:
             ldif_writer.unparse(dn, entry)
 
+def _remove_replication_data(ldif_file):
+    """ Remove the replication data from an ldif file:
+        db2ldif without -r still includes some of the replica data, like
+        - nsUniqueId
+        - keepalive entries
+        This function filters the ldif file to remove this data
+    """
+
+    with open(ldif_file) as f:
+        parser = ldif.LDIFRecordList(f)
+        parser.parse()
+
+        ldif_list = parser.all_records
+        # Iterate on a copy of the ldif entry list
+        for dn, entry in ldif_list[:]:
+            if dn.startswith('cn=repl keep alive'):
+                ldif_list.remove((dn, entry))
+            else:
+                entry.pop('nsUniqueId')
+    with open(ldif_file, 'w') as f:
+        ldif_writer = ldif.LDIFWriter(f)
+        for dn, entry in ldif_list:
+            ldif_writer.unparse(dn, entry)
+
 
 @pytest.fixture(scope="module")
 def topo_with_sigkill(request):
@@ -897,6 +921,112 @@ def test_moving_entry_make_online_init_fail(topology_m2):
     assert len(m1entries) == len(m2entries)
 
 
+def get_keepalive_entries(instance, replica):
+    # Return the keep alive entries that exist under the suffix of the server instance
+    try:
+        entries = instance.search_s(replica.get_suffix(), ldap.SCOPE_ONELEVEL,
+                    "(&(objectclass=ldapsubentry)(cn=repl keep alive*))",
+                    ['cn', 'nsUniqueId', 'modifierTimestamp'])
+    except ldap.LDAPError as e:
+        log.fatal('Failed to retrieve keepalive entries under %s on instance %s: error %s' % (replica.get_suffix(), instance, str(e)))
+        assert False
+    # No error, so let's log the keepalive entries
+    if log.isEnabledFor(logging.DEBUG):
+        for ret in entries:
+            log.debug("Found keepalive entry:\n" + str(ret))
+    return entries
+
+def verify_keepalive_entries(topo, expected):
+    # Check that keep alive entries exist (or do not exist) for every master on every master.
+    # Note: the testing method is quite basic: we count that there is one keepalive entry per master.
+    # That is OK for simple test cases like test_online_init_should_create_keepalive_entries, but
+    # not for the general case, as keep alive entries associated with masters that no longer exist
+    # may remain (for example after: db2ldif / demote a master / ldif2db / init other masters).
+    # ==> if this function is ever pushed into lib389, a better check than simply counting the
+    # entries should be done.
+    for masterId in topo.ms:
+        master = topo.ms[masterId]
+        for replica in Replicas(master).list():
+            if replica.get_role() != ReplicaRole.MASTER:
+                continue
+            replica_info = f'master: {masterId} RID: {replica.get_rid()} suffix: {replica.get_suffix()}'
+            log.debug(f'Checking keepAliveEntries on {replica_info}')
+            keepaliveEntries = get_keepalive_entries(master, replica)
+            expectedCount = len(topo.ms) if expected else 0
+            foundCount = len(keepaliveEntries)
+            if foundCount == expectedCount:
+                log.debug(f'Found {foundCount} keepalive entries as expected on {replica_info}.')
+            else:
+                log.error(f'{foundCount} keepalive entries were found '
+                          f'while {expectedCount} were expected on {replica_info}.')
+                assert False
+
+
+def test_online_init_should_create_keepalive_entries(topo_m2):
+    """Check that keep alive entries are created when initializing a master from another one
+
+    :id: d5940e71-d18a-4b71-aaf7-b9185361fffe
+    :setup: Two masters replication setup
+    :steps:
+        1. Generate an ldif without replication data
+        2. Init both masters from that ldif
+        3. Check that keep alive entries do not exist
+        4. Perform online init of master2 from master1
+        5. Check that keep alive entries exist
+    :expectedresults:
+        1. No error while generating the ldif
+        2. No error while importing the ldif file
+        3. No keepalive entry should exist on any master
+        4. No error while initializing master2
+        5. All keepalive entries should exist on every master
+
+    """
+
+    repl = ReplicationManager(DEFAULT_SUFFIX)
+    m1 = topo_m2.ms["master1"]
+    m2 = topo_m2.ms["master2"]
+    # Step 1: Generate an ldif without replication data
+    m1.stop()
+    m2.stop()
+    ldif_file = '%s/norepl.ldif' % m1.get_ldif_dir()
+    m1.db2ldif(bename=DEFAULT_BENAME, suffixes=[DEFAULT_SUFFIX],
+               excludeSuffixes=None, repl_data=False,
+               outputfile=ldif_file, encrypt=False)
+    # Remove the replication metadata that is still in the ldif
+    _remove_replication_data(ldif_file)
+
+    # Step 2: Init both masters from that ldif
+    m1.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m2.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m1.start()
+    m2.start()
+
+    """ Replica state is now as if CLI setup had been done using:
+        dsconf master1 replication enable --suffix "${SUFFIX}" --role master
+        dsconf master2 replication enable --suffix "${SUFFIX}" --role master
+        dsconf master1 replication create-manager --name "${REPLICATION_MANAGER_NAME}" --passwd "${REPLICATION_MANAGER_PASSWORD}"
+        dsconf master2 replication create-manager --name "${REPLICATION_MANAGER_NAME}" --passwd "${REPLICATION_MANAGER_PASSWORD}"
+        dsconf master1 repl-agmt create --suffix "${SUFFIX}"
+        dsconf master2 repl-agmt create --suffix "${SUFFIX}"
+    """
+
+    # Step 3: No keepalive entry should exist on any master
+    verify_keepalive_entries(topo_m2, False)
+
+    # Step 4: Perform online init of master2 from master1
+    agmt = Agreements(m1).list()[0]
+    agmt.begin_reinit()
+    (done, error) = agmt.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 5: All keepalive entries should exist on every master
+    # Verify the keep alive entries once replication is in sync
+    # (this is the step that fails when the bug is not fixed)
+    repl.wait_for_ruv(m2, m1)
+    verify_keepalive_entries(topo_m2, True)
+
+
 if __name__ == '__main__':
     # Run isolated
     # -s for DEBUG mode
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
index f01782330..f0ea0f8ef 100644
--- a/ldap/servers/plugins/replication/repl5_replica.c
+++ b/ldap/servers/plugins/replication/repl5_replica.c
@@ -373,6 +373,20 @@ replica_destroy(void **arg)
     slapi_ch_free((void **)arg);
 }
 
+/******************************************************************************
+ ******************** REPLICATION KEEP ALIVE ENTRIES **************************
+ ******************************************************************************
+ * They are subentries of the replicated suffix and there is one per master.  *
+ * These entries exist only to trigger a change that gets replicated over     *
+ * the topology.                                                              *
+ * Their main purpose is to generate records in the changelog; they are       *
+ * updated from time to time by fractional replication to ensure that at      *
+ * least one change is replicated by FR after a large number of               *
+ * non-replicated changes have accumulated in the changelog. The benefit is   *
+ * that the fractional RUV then gets updated, so fewer changes need to be     *
+ * walked in the changelog when searching for the first change to send.       *
+ ******************************************************************************/
+
 #define KEEP_ALIVE_ATTR "keepalivetimestamp"
 #define KEEP_ALIVE_ENTRY "repl keep alive"
 #define KEEP_ALIVE_DN_FORMAT "cn=%s %d,%s"
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
index 14c8e0bcc..af486f730 100644
--- a/ldap/servers/plugins/replication/repl_extop.c
+++ b/ldap/servers/plugins/replication/repl_extop.c
@@ -1173,6 +1173,10 @@ multimaster_extop_EndNSDS50ReplicationRequest(Slapi_PBlock *pb)
              */
             if (cl5GetState() == CL5_STATE_OPEN) {
                 replica_log_ruv_elements(r);
+                /* Now that the changelog is open and started, we can also create
+                 * the keep alive entry without the risk that the db and the
+                 * changelog get out of sync. */
+                replica_subentry_check(replica_get_root(r), replica_get_rid(r));
             }
 
             /* ONREPL code that dealt with new RUV, etc was moved into the code
-- 
2.26.2
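
Note: the RUV element mentioned in the bug description can also be inspected directly over LDAP. The sketch below is not part of the patch; it assumes a lib389 DirSrv instance (`inst`) that is already bound with sufficient rights and the same DEFAULT_SUFFIX used by the testcase, and it prints the nsds50ruv values stored in the RUV tombstone entry, where the freshly initialized consumer's element should show CSNs once its keep alive entry has been created and replicated.

    # Minimal sketch (assumption: `inst` is an already bound lib389 DirSrv instance)
    import ldap
    from lib389._constants import DEFAULT_SUFFIX

    def dump_ruv_elements(inst, suffix=DEFAULT_SUFFIX):
        """Print the per-replica RUV elements stored in the RUV tombstone entry."""
        # The RUV lives in a special tombstone entry directly under the suffix.
        entries = inst.search_s(suffix, ldap.SCOPE_SUBTREE,
                                '(&(nsuniqueid=ffffffff-ffffffff-ffffffff-ffffffff)'
                                '(objectclass=nstombstone))',
                                ['nsds50ruv'])
        for _dn, attrs in entries:
            for value in attrs.get('nsds50ruv', []):
                # Each element looks like:
                #   {replica <rid> ldap://host:port} <min csn> <max csn>
                # An element with no CSNs is the symptom described in the bug report.
                print(value.decode('utf-8'))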