From e202c62c3b4c92163d2de9f3da9a9f3efc81e4b8 Mon Sep 17 00:00:00 2001 From: progier389 <72748589+progier389@users.noreply.github.com> Date: Thu, 12 Nov 2020 18:50:04 +0100 Subject: [PATCH 3/3] do not add referrals for masters with different data generation #2054 (#4427) Bug description: The problem is that some operations mandatory in the usual cases are also performed when replication cannot take place because the database sets are different (i.e: RUV generation ids are different) One of the issues is that the csn generator state is updated when starting a replication session (it is a problem when trying to reset the time skew, as freshly reinstalled replicas get infected by the old ones) A second issue is that the RUV got updated when ending a replication session (which may add a replica that does not share the same data set; then update operations on the consumer return referrals towards wrong masters) Fix description: The fix checks the RUVs generation id before updating the csn generator and before updating the RUV. 
Reviewed by: mreynolds firstyear vashirov Platforms tested: F32 --- .../suites/replication/regression_test.py | 290 ++++++++++++++++++ ldap/servers/plugins/replication/repl5.h | 1 + .../plugins/replication/repl5_inc_protocol.c | 20 +- .../plugins/replication/repl5_replica.c | 39 ++- src/lib389/lib389/dseldif.py | 37 +++ 5 files changed, 368 insertions(+), 19 deletions(-) diff --git a/dirsrvtests/tests/suites/replication/regression_test.py b/dirsrvtests/tests/suites/replication/regression_test.py index 14b9d6a44..a72af6b30 100644 --- a/dirsrvtests/tests/suites/replication/regression_test.py +++ b/dirsrvtests/tests/suites/replication/regression_test.py @@ -13,6 +13,7 @@ from lib389.idm.user import TEST_USER_PROPERTIES, UserAccounts from lib389.pwpolicy import PwPolicyManager from lib389.utils import * from lib389.topologies import topology_m2 as topo_m2, TopologyMain, topology_m3 as topo_m3, create_topology, _remove_ssca_db, topology_i2 as topo_i2 +from lib389.topologies import topology_m2c2 as topo_m2c2 from lib389._constants import * from lib389.idm.organizationalunit import OrganizationalUnits from lib389.idm.user import UserAccount @@ -22,6 +23,7 @@ from lib389.idm.directorymanager import DirectoryManager from lib389.replica import Replicas, ReplicationManager, Changelog5, BootstrapReplicationManager from lib389.agreement import Agreements from lib389 import pid_from_file +from lib389.dseldif import * pytestmark = pytest.mark.tier1 @@ -1027,6 +1029,294 @@ def test_online_init_should_create_keepalive_entries(topo_m2): verify_keepalive_entries(topo_m2, True); +def get_agreement(agmts, consumer): + # Get agreement towards consumer among the agremment list + for agmt in agmts.list(): + if (agmt.get_attr_val_utf8('nsDS5ReplicaPort') == str(consumer.port) and + agmt.get_attr_val_utf8('nsDS5ReplicaHost') == consumer.host): + return agmt + return None; + + +def test_ruv_url_not_added_if_different_uuid(topo_m2c2): + """Check that RUV url is not updated if RUV generation 
uuid are different + + :id: 7cc30a4e-0ffd-4758-8f00-e500279af344 + :setup: Two masters + two consumers replication setup + :steps: + 1. Generate ldif without replication data + 2. Init both masters from that ldif + (to clear the ruvs and generates different generation uuid) + 3. Perform on line init from master1 to consumer1 + and from master2 to consumer2 + 4. Perform update on both masters + 5. Check that c1 RUV does not contains URL towards m2 + 6. Check that c2 RUV does contains URL towards m2 + 7. Perform on line init from master1 to master2 + 8. Perform update on master2 + 9. Check that c1 RUV does contains URL towards m2 + :expectedresults: + 1. No error while generating ldif + 2. No error while importing the ldif file + 3. No error and Initialization done. + 4. No error + 5. master2 replicaid should not be in the consumer1 RUV + 6. master2 replicaid should be in the consumer2 RUV + 7. No error and Initialization done. + 8. No error + 9. master2 replicaid should be in the consumer1 RUV + + """ + + # Variables initialization + repl = ReplicationManager(DEFAULT_SUFFIX) + + m1 = topo_m2c2.ms["master1"] + m2 = topo_m2c2.ms["master2"] + c1 = topo_m2c2.cs["consumer1"] + c2 = topo_m2c2.cs["consumer2"] + + replica_m1 = Replicas(m1).get(DEFAULT_SUFFIX) + replica_m2 = Replicas(m2).get(DEFAULT_SUFFIX) + replica_c1 = Replicas(c1).get(DEFAULT_SUFFIX) + replica_c2 = Replicas(c2).get(DEFAULT_SUFFIX) + + replicid_m2 = replica_m2.get_rid() + + agmts_m1 = Agreements(m1, replica_m1.dn) + agmts_m2 = Agreements(m2, replica_m2.dn) + + m1_m2 = get_agreement(agmts_m1, m2) + m1_c1 = get_agreement(agmts_m1, c1) + m1_c2 = get_agreement(agmts_m1, c2) + m2_m1 = get_agreement(agmts_m2, m1) + m2_c1 = get_agreement(agmts_m2, c1) + m2_c2 = get_agreement(agmts_m2, c2) + + # Step 1: Generate ldif without replication data + m1.stop() + m2.stop() + ldif_file = '%s/norepl.ldif' % m1.get_ldif_dir() + m1.db2ldif(bename=DEFAULT_BENAME, suffixes=[DEFAULT_SUFFIX], + excludeSuffixes=None, 
repl_data=False, + outputfile=ldif_file, encrypt=False) + # Remove replication metadata that are still in the ldif + # _remove_replication_data(ldif_file) + + # Step 2: Init both masters from that ldif + m1.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file) + m2.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file) + m1.start() + m2.start() + + # Step 3: Perform on line init from master1 to consumer1 + # and from master2 to consumer2 + m1_c1.begin_reinit() + m2_c2.begin_reinit() + (done, error) = m1_c1.wait_reinit() + assert done is True + assert error is False + (done, error) = m2_c2.wait_reinit() + assert done is True + assert error is False + + # Step 4: Perform update on both masters + repl.test_replication(m1, c1) + repl.test_replication(m2, c2) + + # Step 5: Check that c1 RUV does not contains URL towards m2 + ruv = replica_c1.get_ruv() + log.debug(f"c1 RUV: {ruv}") + url=ruv._rid_url.get(replica_m2.get_rid()) + if (url == None): + log.debug(f"No URL for RID {replica_m2.get_rid()} in RUV"); + else: + log.debug(f"URL for RID {replica_m2.get_rid()} in RUV is {url}"); + log.error(f"URL for RID {replica_m2.get_rid()} found in RUV") + #Note: this assertion fails if issue 2054 is not fixed. 
+ assert False + + # Step 6: Check that c2 RUV does contains URL towards m2 + ruv = replica_c2.get_ruv() + log.debug(f"c1 RUV: {ruv} {ruv._rids} ") + url=ruv._rid_url.get(replica_m2.get_rid()) + if (url == None): + log.error(f"No URL for RID {replica_m2.get_rid()} in RUV"); + assert False + else: + log.debug(f"URL for RID {replica_m2.get_rid()} in RUV is {url}"); + + + # Step 7: Perform on line init from master1 to master2 + m1_m2.begin_reinit() + (done, error) = m1_m2.wait_reinit() + assert done is True + assert error is False + + # Step 8: Perform update on master2 + repl.test_replication(m2, c1) + + # Step 9: Check that c1 RUV does contains URL towards m2 + ruv = replica_c1.get_ruv() + log.debug(f"c1 RUV: {ruv} {ruv._rids} ") + url=ruv._rid_url.get(replica_m2.get_rid()) + if (url == None): + log.error(f"No URL for RID {replica_m2.get_rid()} in RUV"); + assert False + else: + log.debug(f"URL for RID {replica_m2.get_rid()} in RUV is {url}"); + + +def test_csngen_state_not_updated_if_different_uuid(topo_m2c2): + """Check that csngen remote offset is not updated if RUV generation uuid are different + + :id: 77694b8e-22ae-11eb-89b2-482ae39447e5 + :setup: Two masters + two consumers replication setup + :steps: + 1. Disable m1<->m2 agreement to avoid propagate timeSkew + 2. Generate ldif without replication data + 3. Increase time skew on master2 + 4. Init both masters from that ldif + (to clear the ruvs and generates different generation uuid) + 5. Perform on line init from master1 to consumer1 and master2 to consumer2 + 6. Perform update on both masters + 7: Check that c1 has no time skew + 8: Check that c2 has time skew + 9. Init master2 from master1 + 10. Perform update on master2 + 11. Check that c1 has time skew + :expectedresults: + 1. No error + 2. No error while generating ldif + 3. No error + 4. No error while importing the ldif file + 5. No error and Initialization done. + 6. No error + 7. c1 time skew should be lesser than threshold + 8. 
c2 time skew should be higher than threshold + 9. No error and Initialization done. + 10. No error + 11. c1 time skew should be higher than threshold + + """ + + # Variables initialization + repl = ReplicationManager(DEFAULT_SUFFIX) + + m1 = topo_m2c2.ms["master1"] + m2 = topo_m2c2.ms["master2"] + c1 = topo_m2c2.cs["consumer1"] + c2 = topo_m2c2.cs["consumer2"] + + replica_m1 = Replicas(m1).get(DEFAULT_SUFFIX) + replica_m2 = Replicas(m2).get(DEFAULT_SUFFIX) + replica_c1 = Replicas(c1).get(DEFAULT_SUFFIX) + replica_c2 = Replicas(c2).get(DEFAULT_SUFFIX) + + replicid_m2 = replica_m2.get_rid() + + agmts_m1 = Agreements(m1, replica_m1.dn) + agmts_m2 = Agreements(m2, replica_m2.dn) + + m1_m2 = get_agreement(agmts_m1, m2) + m1_c1 = get_agreement(agmts_m1, c1) + m1_c2 = get_agreement(agmts_m1, c2) + m2_m1 = get_agreement(agmts_m2, m1) + m2_c1 = get_agreement(agmts_m2, c1) + m2_c2 = get_agreement(agmts_m2, c2) + + # Step 1: Disable m1<->m2 agreement to avoid propagate timeSkew + m1_m2.pause() + m2_m1.pause() + + # Step 2: Generate ldif without replication data + m1.stop() + m2.stop() + ldif_file = '%s/norepl.ldif' % m1.get_ldif_dir() + m1.db2ldif(bename=DEFAULT_BENAME, suffixes=[DEFAULT_SUFFIX], + excludeSuffixes=None, repl_data=False, + outputfile=ldif_file, encrypt=False) + # Remove replication metadata that are still in the ldif + # _remove_replication_data(ldif_file) + + # Step 3: Increase time skew on master2 + timeSkew=6*3600 + # We can modify master2 time skew + # But the time skew on the consumer may be smaller + # depending on when the cnsgen generation time is updated + # and when first csn get replicated. 
+ # Since we use timeSkew has threshold value to detect + # whether there are time skew or not, + # lets add a significative margin (longer than the test duration) + # to avoid any risk of erroneous failure + timeSkewMargin = 300 + DSEldif(m2)._increaseTimeSkew(DEFAULT_SUFFIX, timeSkew+timeSkewMargin) + + # Step 4: Init both masters from that ldif + m1.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file) + m2.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file) + m1.start() + m2.start() + + # Step 5: Perform on line init from master1 to consumer1 + # and from master2 to consumer2 + m1_c1.begin_reinit() + m2_c2.begin_reinit() + (done, error) = m1_c1.wait_reinit() + assert done is True + assert error is False + (done, error) = m2_c2.wait_reinit() + assert done is True + assert error is False + + # Step 6: Perform update on both masters + repl.test_replication(m1, c1) + repl.test_replication(m2, c2) + + # Step 7: Check that c1 has no time skew + # Stop server to insure that dse.ldif is uptodate + c1.stop() + c1_nsState = DSEldif(c1).readNsState(DEFAULT_SUFFIX)[0] + c1_timeSkew = int(c1_nsState['time_skew']) + log.debug(f"c1 time skew: {c1_timeSkew}") + if (c1_timeSkew >= timeSkew): + log.error(f"c1 csngen state has unexpectedly been synchronized with m2: time skew {c1_timeSkew}") + assert False + c1.start() + + # Step 8: Check that c2 has time skew + # Stop server to insure that dse.ldif is uptodate + c2.stop() + c2_nsState = DSEldif(c2).readNsState(DEFAULT_SUFFIX)[0] + c2_timeSkew = int(c2_nsState['time_skew']) + log.debug(f"c2 time skew: {c2_timeSkew}") + if (c2_timeSkew < timeSkew): + log.error(f"c2 csngen state has not been synchronized with m2: time skew {c2_timeSkew}") + assert False + c2.start() + + # Step 9: Perform on line init from master1 to master2 + m1_c1.pause() + m1_m2.resume() + m1_m2.begin_reinit() + (done, error) = m1_m2.wait_reinit() + assert done is True + assert error is False + + # Step 10: Perform update on master2 + repl.test_replication(m2, 
c1) + + # Step 11: Check that c1 has time skew + # Stop server to insure that dse.ldif is uptodate + c1.stop() + c1_nsState = DSEldif(c1).readNsState(DEFAULT_SUFFIX)[0] + c1_timeSkew = int(c1_nsState['time_skew']) + log.debug(f"c1 time skew: {c1_timeSkew}") + if (c1_timeSkew < timeSkew): + log.error(f"c1 csngen state has not been synchronized with m2: time skew {c1_timeSkew}") + assert False + + if __name__ == '__main__': # Run isolated # -s for DEBUG mode diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h index b35f724c2..f1c596a3f 100644 --- a/ldap/servers/plugins/replication/repl5.h +++ b/ldap/servers/plugins/replication/repl5.h @@ -708,6 +708,7 @@ void replica_dump(Replica *r); void replica_set_enabled(Replica *r, PRBool enable); Replica *replica_get_replica_from_dn(const Slapi_DN *dn); Replica *replica_get_replica_from_root(const char *repl_root); +int replica_check_generation(Replica *r, const RUV *remote_ruv); int replica_update_ruv(Replica *replica, const CSN *csn, const char *replica_purl); Replica *replica_get_replica_for_op(Slapi_PBlock *pb); /* the functions below manipulate replica hash */ diff --git a/ldap/servers/plugins/replication/repl5_inc_protocol.c b/ldap/servers/plugins/replication/repl5_inc_protocol.c index 29b1fb073..af5e5897c 100644 --- a/ldap/servers/plugins/replication/repl5_inc_protocol.c +++ b/ldap/servers/plugins/replication/repl5_inc_protocol.c @@ -2161,26 +2161,12 @@ examine_update_vector(Private_Repl_Protocol *prp, RUV *remote_ruv) } else if (NULL == remote_ruv) { return_value = EXAMINE_RUV_PRISTINE_REPLICA; } else { - char *local_gen = NULL; - char *remote_gen = ruv_get_replica_generation(remote_ruv); - Object *local_ruv_obj; - RUV *local_ruv; - PR_ASSERT(NULL != prp->replica); - local_ruv_obj = replica_get_ruv(prp->replica); - if (NULL != local_ruv_obj) { - local_ruv = (RUV *)object_get_data(local_ruv_obj); - PR_ASSERT(local_ruv); - local_gen = ruv_get_replica_generation(local_ruv); - 
object_release(local_ruv_obj); - } - if (NULL == remote_gen || NULL == local_gen || strcmp(remote_gen, local_gen) != 0) { - return_value = EXAMINE_RUV_GENERATION_MISMATCH; - } else { + if (replica_check_generation(prp->replica, remote_ruv)) { return_value = EXAMINE_RUV_OK; + } else { + return_value = EXAMINE_RUV_GENERATION_MISMATCH; } - slapi_ch_free((void **)&remote_gen); - slapi_ch_free((void **)&local_gen); } return return_value; } diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c index f0ea0f8ef..7e56d6557 100644 --- a/ldap/servers/plugins/replication/repl5_replica.c +++ b/ldap/servers/plugins/replication/repl5_replica.c @@ -812,6 +812,36 @@ replica_set_ruv(Replica *r, RUV *ruv) replica_unlock(r->repl_lock); } +/* + * Check if replica generation is the same than the remote ruv one + */ +int +replica_check_generation(Replica *r, const RUV *remote_ruv) +{ + int return_value; + char *local_gen = NULL; + char *remote_gen = ruv_get_replica_generation(remote_ruv); + Object *local_ruv_obj; + RUV *local_ruv; + + PR_ASSERT(NULL != r); + local_ruv_obj = replica_get_ruv(r); + if (NULL != local_ruv_obj) { + local_ruv = (RUV *)object_get_data(local_ruv_obj); + PR_ASSERT(local_ruv); + local_gen = ruv_get_replica_generation(local_ruv); + object_release(local_ruv_obj); + } + if (NULL == remote_gen || NULL == local_gen || strcmp(remote_gen, local_gen) != 0) { + return_value = PR_FALSE; + } else { + return_value = PR_TRUE; + } + slapi_ch_free_string(&remote_gen); + slapi_ch_free_string(&local_gen); + return return_value; +} + /* * Update one particular CSN in an RUV. 
This is meant to be called * whenever (a) the server has processed a client operation and @@ -1298,6 +1328,11 @@ replica_update_csngen_state_ext(Replica *r, const RUV *ruv, const CSN *extracsn) PR_ASSERT(r && ruv); + if (!replica_check_generation(r, ruv)) /* ruv has wrong generation - we are done */ + { + return 0; + } + rc = ruv_get_max_csn(ruv, &csn); if (rc != RUV_SUCCESS) { return -1; @@ -3713,8 +3748,8 @@ replica_update_ruv_consumer(Replica *r, RUV *supplier_ruv) replica_lock(r->repl_lock); local_ruv = (RUV *)object_get_data(r->repl_ruv); - - if (is_cleaned_rid(supplier_id) || local_ruv == NULL) { + if (is_cleaned_rid(supplier_id) || local_ruv == NULL || + !replica_check_generation(r, supplier_ruv)) { replica_unlock(r->repl_lock); return; } diff --git a/src/lib389/lib389/dseldif.py b/src/lib389/lib389/dseldif.py index 10baba4d7..6850c9a8a 100644 --- a/src/lib389/lib389/dseldif.py +++ b/src/lib389/lib389/dseldif.py @@ -317,6 +317,43 @@ class DSEldif(DSLint): return states + def _increaseTimeSkew(self, suffix, timeSkew): + # Increase csngen state local_offset by timeSkew + # Warning: instance must be stopped before calling this function + assert (timeSkew >= 0) + nsState = self.readNsState(suffix)[0] + self._instance.log.debug(f'_increaseTimeSkew nsState is {nsState}') + oldNsState = self.get(nsState['dn'], 'nsState', True) + self._instance.log.debug(f'oldNsState is {oldNsState}') + + # Lets reencode the new nsState + from lib389.utils import print_nice_time + if pack('h', 1) == pack('=h',1): + end = '>' + else: + raise ValueError("Unknown endian, unable to proceed") + + thelen = len(oldNsState) + if thelen <= 20: + pad = 2 # padding for short H values + timefmt = 'I' # timevals are unsigned 32-bit int + else: + pad = 6 # padding for short H values + timefmt = 'Q' # timevals are unsigned 64-bit int + fmtstr = "%sH%dx3%sH%dx" % (end, pad, timefmt, pad) + newNsState = base64.b64encode(pack(fmtstr, int(nsState['rid']), + int(nsState['gen_time']), 
int(nsState['local_offset'])+timeSkew, + int(nsState['remote_offset']), int(nsState['seq_num']))) + newNsState = newNsState.decode('utf-8') + self._instance.log.debug(f'newNsState is {newNsState}') + # Lets replace the value. + (entry_dn_i, attr_data) = self._find_attr(nsState['dn'], 'nsState') + attr_i = next(iter(attr_data)) + self._contents[entry_dn_i + attr_i] = f"nsState:: {newNsState}" + self._update() + class FSChecks(DSLint): """This is for the healthcheck feature, check commonly used system config files the -- 2.26.2