Blob Blame Raw
From e202c62c3b4c92163d2de9f3da9a9f3efc81e4b8 Mon Sep 17 00:00:00 2001
From: progier389 <72748589+progier389@users.noreply.github.com>
Date: Thu, 12 Nov 2020 18:50:04 +0100
Subject: [PATCH 3/3] do not add referrals for masters with different data
 generation #2054 (#4427)

Bug description:
The problem is that some operation mandatory in the usual cases are
also performed when replication cannot take place because the
database set are differents (i.e: RUV generation ids are different)

One of the issue is that the csn generator state is updated when
starting a replication session (it is a problem when trying to
reset the time skew, as freshly reinstalled replicas get infected
by the old ones)

A second issue is that the RUV got updated when ending a replication session
(which may add replica that does not share the same data set,
then update operations on consumer retun referrals towards wrong masters

Fix description:
The fix checks the RUVs generation id before updating the csn generator
and before updating the RUV.

Reviewed by: mreynolds
             firstyear
             vashirov

Platforms tested: F32
---
 .../suites/replication/regression_test.py     | 290 ++++++++++++++++++
 ldap/servers/plugins/replication/repl5.h      |   1 +
 .../plugins/replication/repl5_inc_protocol.c  |  20 +-
 .../plugins/replication/repl5_replica.c       |  39 ++-
 src/lib389/lib389/dseldif.py                  |  37 +++
 5 files changed, 368 insertions(+), 19 deletions(-)

diff --git a/dirsrvtests/tests/suites/replication/regression_test.py b/dirsrvtests/tests/suites/replication/regression_test.py
index 14b9d6a44..a72af6b30 100644
--- a/dirsrvtests/tests/suites/replication/regression_test.py
+++ b/dirsrvtests/tests/suites/replication/regression_test.py
@@ -13,6 +13,7 @@ from lib389.idm.user import TEST_USER_PROPERTIES, UserAccounts
 from lib389.pwpolicy import PwPolicyManager
 from lib389.utils import *
 from lib389.topologies import topology_m2 as topo_m2, TopologyMain, topology_m3 as topo_m3, create_topology, _remove_ssca_db, topology_i2 as topo_i2
+from lib389.topologies import topology_m2c2 as topo_m2c2
 from lib389._constants import *
 from lib389.idm.organizationalunit import OrganizationalUnits
 from lib389.idm.user import UserAccount
@@ -22,6 +23,7 @@ from lib389.idm.directorymanager import DirectoryManager
 from lib389.replica import Replicas, ReplicationManager, Changelog5, BootstrapReplicationManager
 from lib389.agreement import Agreements
 from lib389 import pid_from_file
+from lib389.dseldif import *
 
 
 pytestmark = pytest.mark.tier1
@@ -1027,6 +1029,294 @@ def test_online_init_should_create_keepalive_entries(topo_m2):
     verify_keepalive_entries(topo_m2, True);
 
 
+def get_agreement(agmts, consumer):
+    # Get agreement towards consumer among the agremment list
+    for agmt in agmts.list():
+        if (agmt.get_attr_val_utf8('nsDS5ReplicaPort') == str(consumer.port) and
+              agmt.get_attr_val_utf8('nsDS5ReplicaHost') == consumer.host):
+            return agmt
+    return None;
+
+
+def test_ruv_url_not_added_if_different_uuid(topo_m2c2):
+    """Check that RUV url is not updated if RUV generation uuid are different
+
+    :id: 7cc30a4e-0ffd-4758-8f00-e500279af344
+    :setup: Two masters + two consumers replication setup
+    :steps:
+        1. Generate ldif without replication data
+        2. Init both masters from that ldif
+             (to clear the ruvs and generates different generation uuid)
+        3. Perform on line init from master1 to consumer1
+               and from master2 to consumer2
+        4. Perform update on both masters
+        5. Check that c1 RUV does not contains URL towards m2
+        6. Check that c2 RUV does contains URL towards m2
+        7. Perform on line init from master1 to master2
+        8. Perform update on master2
+        9. Check that c1 RUV does contains URL towards m2
+    :expectedresults:
+        1. No error while generating ldif
+        2. No error while importing the ldif file
+        3. No error and Initialization done.
+        4. No error
+        5. master2 replicaid should not be in the consumer1 RUV
+        6. master2 replicaid should be in the consumer2 RUV
+        7. No error and Initialization done.
+        8. No error
+        9. master2 replicaid should be in the consumer1 RUV
+
+    """
+
+    # Variables initialization
+    repl = ReplicationManager(DEFAULT_SUFFIX)
+
+    m1 = topo_m2c2.ms["master1"]
+    m2 = topo_m2c2.ms["master2"]
+    c1 = topo_m2c2.cs["consumer1"]
+    c2 = topo_m2c2.cs["consumer2"]
+
+    replica_m1 = Replicas(m1).get(DEFAULT_SUFFIX)
+    replica_m2 = Replicas(m2).get(DEFAULT_SUFFIX)
+    replica_c1 = Replicas(c1).get(DEFAULT_SUFFIX)
+    replica_c2 = Replicas(c2).get(DEFAULT_SUFFIX)
+
+    replicid_m2 = replica_m2.get_rid()
+
+    agmts_m1 = Agreements(m1, replica_m1.dn)
+    agmts_m2 = Agreements(m2, replica_m2.dn)
+
+    m1_m2 = get_agreement(agmts_m1, m2)
+    m1_c1 = get_agreement(agmts_m1, c1)
+    m1_c2 = get_agreement(agmts_m1, c2)
+    m2_m1 = get_agreement(agmts_m2, m1)
+    m2_c1 = get_agreement(agmts_m2, c1)
+    m2_c2 = get_agreement(agmts_m2, c2)
+
+    # Step 1: Generate ldif without replication data
+    m1.stop()
+    m2.stop()
+    ldif_file = '%s/norepl.ldif' % m1.get_ldif_dir()
+    m1.db2ldif(bename=DEFAULT_BENAME, suffixes=[DEFAULT_SUFFIX],
+               excludeSuffixes=None, repl_data=False,
+               outputfile=ldif_file, encrypt=False)
+    # Remove replication metadata that are still in the ldif
+    # _remove_replication_data(ldif_file)
+
+    # Step 2: Init both masters from that ldif
+    m1.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m2.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m1.start()
+    m2.start()
+
+    # Step 3: Perform on line init from master1 to consumer1
+    #          and from master2 to consumer2
+    m1_c1.begin_reinit()
+    m2_c2.begin_reinit()
+    (done, error) = m1_c1.wait_reinit()
+    assert done is True
+    assert error is False
+    (done, error) = m2_c2.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 4: Perform update on both masters
+    repl.test_replication(m1, c1)
+    repl.test_replication(m2, c2)
+
+    # Step 5: Check that c1 RUV does not contains URL towards m2
+    ruv = replica_c1.get_ruv()
+    log.debug(f"c1 RUV: {ruv}")
+    url=ruv._rid_url.get(replica_m2.get_rid())
+    if (url == None):
+        log.debug(f"No URL for RID {replica_m2.get_rid()} in RUV");
+    else:
+        log.debug(f"URL for RID {replica_m2.get_rid()} in RUV is {url}");
+        log.error(f"URL for RID {replica_m2.get_rid()} found in RUV")
+        #Note: this assertion fails if issue 2054 is not fixed.
+        assert False
+
+    # Step 6: Check that c2 RUV does contains URL towards m2
+    ruv = replica_c2.get_ruv()
+    log.debug(f"c1 RUV: {ruv} {ruv._rids} ")
+    url=ruv._rid_url.get(replica_m2.get_rid())
+    if (url == None):
+        log.error(f"No URL for RID {replica_m2.get_rid()} in RUV");
+        assert False
+    else:
+        log.debug(f"URL for RID {replica_m2.get_rid()} in RUV is {url}");
+
+
+    # Step 7: Perform on line init from master1 to master2
+    m1_m2.begin_reinit()
+    (done, error) = m1_m2.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 8: Perform update on master2
+    repl.test_replication(m2, c1)
+
+    # Step 9: Check that c1 RUV does contains URL towards m2
+    ruv = replica_c1.get_ruv()
+    log.debug(f"c1 RUV: {ruv} {ruv._rids} ")
+    url=ruv._rid_url.get(replica_m2.get_rid())
+    if (url == None):
+        log.error(f"No URL for RID {replica_m2.get_rid()} in RUV");
+        assert False
+    else:
+        log.debug(f"URL for RID {replica_m2.get_rid()} in RUV is {url}");
+
+
+def test_csngen_state_not_updated_if_different_uuid(topo_m2c2):
+    """Check that csngen remote offset is not updated if RUV generation uuid are different
+
+    :id: 77694b8e-22ae-11eb-89b2-482ae39447e5
+    :setup: Two masters + two consumers replication setup
+    :steps:
+        1. Disable m1<->m2 agreement to avoid propagate timeSkew
+        2. Generate ldif without replication data
+        3. Increase time skew on master2
+        4. Init both masters from that ldif
+             (to clear the ruvs and generates different generation uuid)
+        5. Perform on line init from master1 to consumer1 and master2 to consumer2
+        6. Perform update on both masters
+        7: Check that c1 has no time skew
+        8: Check that c2 has time skew
+        9. Init master2 from master1
+        10. Perform update on master2
+        11. Check that c1 has time skew
+    :expectedresults:
+        1. No error
+        2. No error while generating ldif
+        3. No error
+        4. No error while importing the ldif file
+        5. No error and Initialization done.
+        6. No error
+        7. c1 time skew should be lesser than threshold
+        8. c2 time skew should be higher than threshold
+        9. No error and Initialization done.
+        10. No error
+        11. c1 time skew should be higher than threshold
+
+    """
+
+    # Variables initialization
+    repl = ReplicationManager(DEFAULT_SUFFIX)
+
+    m1 = topo_m2c2.ms["master1"]
+    m2 = topo_m2c2.ms["master2"]
+    c1 = topo_m2c2.cs["consumer1"]
+    c2 = topo_m2c2.cs["consumer2"]
+
+    replica_m1 = Replicas(m1).get(DEFAULT_SUFFIX)
+    replica_m2 = Replicas(m2).get(DEFAULT_SUFFIX)
+    replica_c1 = Replicas(c1).get(DEFAULT_SUFFIX)
+    replica_c2 = Replicas(c2).get(DEFAULT_SUFFIX)
+
+    replicid_m2 = replica_m2.get_rid()
+
+    agmts_m1 = Agreements(m1, replica_m1.dn)
+    agmts_m2 = Agreements(m2, replica_m2.dn)
+
+    m1_m2 = get_agreement(agmts_m1, m2)
+    m1_c1 = get_agreement(agmts_m1, c1)
+    m1_c2 = get_agreement(agmts_m1, c2)
+    m2_m1 = get_agreement(agmts_m2, m1)
+    m2_c1 = get_agreement(agmts_m2, c1)
+    m2_c2 = get_agreement(agmts_m2, c2)
+
+    # Step 1: Disable m1<->m2 agreement to avoid propagate timeSkew
+    m1_m2.pause()
+    m2_m1.pause()
+
+    # Step 2: Generate ldif without replication data
+    m1.stop()
+    m2.stop()
+    ldif_file = '%s/norepl.ldif' % m1.get_ldif_dir()
+    m1.db2ldif(bename=DEFAULT_BENAME, suffixes=[DEFAULT_SUFFIX],
+               excludeSuffixes=None, repl_data=False,
+               outputfile=ldif_file, encrypt=False)
+    # Remove replication metadata that are still in the ldif
+    # _remove_replication_data(ldif_file)
+
+    # Step 3: Increase time skew on master2
+    timeSkew=6*3600
+    # We can modify master2 time skew
+    # But the time skew on the consumer may be smaller
+    # depending on when the cnsgen generation time is updated
+    # and when first csn get replicated.
+    # Since we use timeSkew has threshold value to detect
+    # whether there are time skew or not,
+    # lets add a significative margin (longer than the test duration)
+    # to avoid any risk of erroneous failure
+    timeSkewMargin = 300
+    DSEldif(m2)._increaseTimeSkew(DEFAULT_SUFFIX, timeSkew+timeSkewMargin)
+
+    # Step 4: Init both masters from that ldif
+    m1.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m2.ldif2db(DEFAULT_BENAME, None, None, None, ldif_file)
+    m1.start()
+    m2.start()
+
+    # Step 5: Perform on line init from master1 to consumer1
+    #          and from master2 to consumer2
+    m1_c1.begin_reinit()
+    m2_c2.begin_reinit()
+    (done, error) = m1_c1.wait_reinit()
+    assert done is True
+    assert error is False
+    (done, error) = m2_c2.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 6: Perform update on both masters
+    repl.test_replication(m1, c1)
+    repl.test_replication(m2, c2)
+
+    # Step 7: Check that c1 has no time skew
+    # Stop server to insure that dse.ldif is uptodate
+    c1.stop()
+    c1_nsState = DSEldif(c1).readNsState(DEFAULT_SUFFIX)[0]
+    c1_timeSkew = int(c1_nsState['time_skew'])
+    log.debug(f"c1 time skew: {c1_timeSkew}")
+    if (c1_timeSkew >= timeSkew):
+        log.error(f"c1 csngen state has unexpectedly been synchronized with m2: time skew {c1_timeSkew}")
+        assert False
+    c1.start()
+
+    # Step 8: Check that c2 has time skew
+    # Stop server to insure that dse.ldif is uptodate
+    c2.stop()
+    c2_nsState = DSEldif(c2).readNsState(DEFAULT_SUFFIX)[0]
+    c2_timeSkew = int(c2_nsState['time_skew'])
+    log.debug(f"c2 time skew: {c2_timeSkew}")
+    if (c2_timeSkew < timeSkew):
+        log.error(f"c2 csngen state has not been synchronized with m2: time skew {c2_timeSkew}")
+        assert False
+    c2.start()
+
+    # Step 9: Perform on line init from master1 to master2
+    m1_c1.pause()
+    m1_m2.resume()
+    m1_m2.begin_reinit()
+    (done, error) = m1_m2.wait_reinit()
+    assert done is True
+    assert error is False
+
+    # Step 10: Perform update on master2
+    repl.test_replication(m2, c1)
+
+    # Step 11: Check that c1 has time skew
+    # Stop server to insure that dse.ldif is uptodate
+    c1.stop()
+    c1_nsState = DSEldif(c1).readNsState(DEFAULT_SUFFIX)[0]
+    c1_timeSkew = int(c1_nsState['time_skew'])
+    log.debug(f"c1 time skew: {c1_timeSkew}")
+    if (c1_timeSkew < timeSkew):
+        log.error(f"c1 csngen state has not been synchronized with m2: time skew {c1_timeSkew}")
+        assert False
+
+
 if __name__ == '__main__':
     # Run isolated
     # -s for DEBUG mode
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
index b35f724c2..f1c596a3f 100644
--- a/ldap/servers/plugins/replication/repl5.h
+++ b/ldap/servers/plugins/replication/repl5.h
@@ -708,6 +708,7 @@ void replica_dump(Replica *r);
 void replica_set_enabled(Replica *r, PRBool enable);
 Replica *replica_get_replica_from_dn(const Slapi_DN *dn);
 Replica *replica_get_replica_from_root(const char *repl_root);
+int replica_check_generation(Replica *r, const RUV *remote_ruv);
 int replica_update_ruv(Replica *replica, const CSN *csn, const char *replica_purl);
 Replica *replica_get_replica_for_op(Slapi_PBlock *pb);
 /* the functions below manipulate replica hash */
diff --git a/ldap/servers/plugins/replication/repl5_inc_protocol.c b/ldap/servers/plugins/replication/repl5_inc_protocol.c
index 29b1fb073..af5e5897c 100644
--- a/ldap/servers/plugins/replication/repl5_inc_protocol.c
+++ b/ldap/servers/plugins/replication/repl5_inc_protocol.c
@@ -2161,26 +2161,12 @@ examine_update_vector(Private_Repl_Protocol *prp, RUV *remote_ruv)
     } else if (NULL == remote_ruv) {
         return_value = EXAMINE_RUV_PRISTINE_REPLICA;
     } else {
-        char *local_gen = NULL;
-        char *remote_gen = ruv_get_replica_generation(remote_ruv);
-        Object *local_ruv_obj;
-        RUV *local_ruv;
-
         PR_ASSERT(NULL != prp->replica);
-        local_ruv_obj = replica_get_ruv(prp->replica);
-        if (NULL != local_ruv_obj) {
-            local_ruv = (RUV *)object_get_data(local_ruv_obj);
-            PR_ASSERT(local_ruv);
-            local_gen = ruv_get_replica_generation(local_ruv);
-            object_release(local_ruv_obj);
-        }
-        if (NULL == remote_gen || NULL == local_gen || strcmp(remote_gen, local_gen) != 0) {
-            return_value = EXAMINE_RUV_GENERATION_MISMATCH;
-        } else {
+        if (replica_check_generation(prp->replica, remote_ruv)) {
             return_value = EXAMINE_RUV_OK;
+        } else {
+            return_value = EXAMINE_RUV_GENERATION_MISMATCH;
         }
-        slapi_ch_free((void **)&remote_gen);
-        slapi_ch_free((void **)&local_gen);
     }
     return return_value;
 }
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
index f0ea0f8ef..7e56d6557 100644
--- a/ldap/servers/plugins/replication/repl5_replica.c
+++ b/ldap/servers/plugins/replication/repl5_replica.c
@@ -812,6 +812,36 @@ replica_set_ruv(Replica *r, RUV *ruv)
     replica_unlock(r->repl_lock);
 }
 
+/*
+ * Check if replica generation is the same than the remote ruv one
+ */
+int
+replica_check_generation(Replica *r, const RUV *remote_ruv)
+{
+    int return_value;
+    char *local_gen = NULL;
+    char *remote_gen = ruv_get_replica_generation(remote_ruv);
+    Object *local_ruv_obj;
+    RUV *local_ruv;
+
+    PR_ASSERT(NULL != r);
+    local_ruv_obj = replica_get_ruv(r);
+    if (NULL != local_ruv_obj) {
+        local_ruv = (RUV *)object_get_data(local_ruv_obj);
+        PR_ASSERT(local_ruv);
+        local_gen = ruv_get_replica_generation(local_ruv);
+        object_release(local_ruv_obj);
+    }
+    if (NULL == remote_gen || NULL == local_gen || strcmp(remote_gen, local_gen) != 0) {
+        return_value = PR_FALSE;
+    } else {
+        return_value = PR_TRUE;
+    }
+    slapi_ch_free_string(&remote_gen);
+    slapi_ch_free_string(&local_gen);
+    return return_value;
+}
+
 /*
  * Update one particular CSN in an RUV. This is meant to be called
  * whenever (a) the server has processed a client operation and
@@ -1298,6 +1328,11 @@ replica_update_csngen_state_ext(Replica *r, const RUV *ruv, const CSN *extracsn)
 
     PR_ASSERT(r && ruv);
 
+    if (!replica_check_generation(r, ruv)) /* ruv has wrong generation - we are done */
+    {
+        return 0;
+    }
+
     rc = ruv_get_max_csn(ruv, &csn);
     if (rc != RUV_SUCCESS) {
         return -1;
@@ -3713,8 +3748,8 @@ replica_update_ruv_consumer(Replica *r, RUV *supplier_ruv)
         replica_lock(r->repl_lock);
 
         local_ruv = (RUV *)object_get_data(r->repl_ruv);
-
-        if (is_cleaned_rid(supplier_id) || local_ruv == NULL) {
+        if (is_cleaned_rid(supplier_id) || local_ruv == NULL ||
+                !replica_check_generation(r, supplier_ruv)) {
             replica_unlock(r->repl_lock);
             return;
         }
diff --git a/src/lib389/lib389/dseldif.py b/src/lib389/lib389/dseldif.py
index 10baba4d7..6850c9a8a 100644
--- a/src/lib389/lib389/dseldif.py
+++ b/src/lib389/lib389/dseldif.py
@@ -317,6 +317,43 @@ class DSEldif(DSLint):
 
         return states
 
+    def _increaseTimeSkew(self, suffix, timeSkew):
+        # Increase csngen state local_offset by timeSkew
+        # Warning: instance must be stopped before calling this function
+        assert (timeSkew >= 0)
+        nsState = self.readNsState(suffix)[0]
+        self._instance.log.debug(f'_increaseTimeSkew nsState is {nsState}')
+        oldNsState = self.get(nsState['dn'], 'nsState', True)
+        self._instance.log.debug(f'oldNsState is {oldNsState}')
+
+        # Lets reencode the new nsState
+        from lib389.utils import print_nice_time
+        if pack('<h', 1) == pack('=h',1):
+            end = '<'
+        elif pack('>h', 1) == pack('=h',1):
+            end = '>'
+        else:
+            raise ValueError("Unknown endian, unable to proceed")
+
+        thelen = len(oldNsState)
+        if thelen <= 20:
+            pad = 2 # padding for short H values
+            timefmt = 'I' # timevals are unsigned 32-bit int
+        else:
+            pad = 6 # padding for short H values
+            timefmt = 'Q' # timevals are unsigned 64-bit int
+        fmtstr = "%sH%dx3%sH%dx" % (end, pad, timefmt, pad)
+        newNsState = base64.b64encode(pack(fmtstr, int(nsState['rid']),
+           int(nsState['gen_time']), int(nsState['local_offset'])+timeSkew,
+           int(nsState['remote_offset']), int(nsState['seq_num'])))
+        newNsState = newNsState.decode('utf-8')
+        self._instance.log.debug(f'newNsState is {newNsState}')
+        # Lets replace the value.
+        (entry_dn_i, attr_data) = self._find_attr(nsState['dn'], 'nsState')
+        attr_i = next(iter(attr_data))
+        self._contents[entry_dn_i + attr_i] = f"nsState:: {newNsState}"
+        self._update()
+
 
 class FSChecks(DSLint):
     """This is for the healthcheck feature, check commonly used system config files the
-- 
2.26.2