Blob Blame Raw
From e98e41731051b7bf4a443b51a9d3563fc1853773 Mon Sep 17 00:00:00 2001
From: Rich Megginson <rmeggins@redhat.com>
Date: Wed, 6 Nov 2013 14:22:31 -0700
Subject: [PATCH 47/49] Ticket #47585 Replication Failures related to skipped
 entries due to cleaned rids

https://fedorahosted.org/389/ticket/47585
Reviewed by: nhosoi (Thanks!)
Branch: 389-ds-base-1.3.1
Fix Description: If a change found in the changelog buffer was skipped
because its replica ID (rid) was unknown, the entire buffer was marked as
CLC_STATE_NEW_RID.  When the buffer was exhausted and the iterator code went
to read in the next buffer, it did not load it, because a new buffer is only
loaded if the current buffer state is CLC_STATE_READY.  I don't know why the
entire buffer would be marked as CLC_STATE_NEW_RID and stop iteration.  It
seems to me that just the update should be skipped, but new buffers should
still be loaded in order to keep sending non-skipped updates to the consumer.
This fix no longer changes the buffer state when such an update is skipped.
It is possible for a CSN with an unknown RID to get into the changelog if
the server with that RID had been removed by cleanruv/cleanallruv.  In that
case, the CSN should be skipped.  It is assumed that the change was already
sent - cleanallruv is supposed to wait until all known changes have been
seen before removing the RID from the RUV - so it is safe to skip it.
Added debug logging and per-reason skip counters, so that we can better tell
why changelog entries were skipped.
Platforms tested: RHEL6 x86_64
Flag Day: no
Doc impact: no
(cherry picked from commit cf08f1274404e4796966011a98a6a0acbbfd6070)
(cherry picked from commit 30bb98fb693ea1aac9774bdc43b923eacd72570a)
(cherry picked from commit fc70e4ac6accaa14d140e333829e98897f6ff164)
---
 ldap/servers/plugins/replication/cl5_clcache.c | 48 ++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/ldap/servers/plugins/replication/cl5_clcache.c b/ldap/servers/plugins/replication/cl5_clcache.c
index 7a6a446..8218312 100644
--- a/ldap/servers/plugins/replication/cl5_clcache.c
+++ b/ldap/servers/plugins/replication/cl5_clcache.c
@@ -120,6 +120,11 @@ struct clc_buffer {
 	int		 	 buf_load_cnt;		/* number of loads for session */
 	int		 	 buf_record_cnt;	/* number of changes for session */
 	int		 	 buf_record_skipped;	/* number of changes skipped */
+	int		 	 buf_skipped_new_rid;	/* number of changes skipped due to new_rid */
+	int		 	 buf_skipped_csn_gt_cons_maxcsn;	/* number of changes skipped due to csn greater than consumer maxcsn */
+	int		 	 buf_skipped_up_to_date;	/* number of changes skipped due to consumer being up-to-date for the given rid */
+	int		 	 buf_skipped_csn_gt_ruv;	/* number of changes skipped due to preceedents are not covered by local RUV snapshot */
+	int		 	 buf_skipped_csn_covered;	/* number of changes skipped due to CSNs already covered by consumer RUV */
 
 	/*
 	 * fields that should be accessed via bl_lock or pl_lock
@@ -252,6 +257,11 @@ clcache_get_buffer ( CLC_Buffer **buf, DB *db, ReplicaId consumer_rid, const RUV
 		(*buf)->buf_record_skipped = 0;
 		(*buf)->buf_cursor = NULL;
 		(*buf)->buf_num_cscbs = 0;
+		(*buf)->buf_skipped_new_rid = 0;
+		(*buf)->buf_skipped_csn_gt_cons_maxcsn = 0;
+		(*buf)->buf_skipped_up_to_date = 0;
+		(*buf)->buf_skipped_csn_gt_ruv = 0;
+		(*buf)->buf_skipped_csn_covered = 0;
 	}
 	else {
 		*buf = clcache_new_buffer ( consumer_rid );
@@ -287,11 +297,16 @@ clcache_return_buffer ( CLC_Buffer **buf )
 	int i;
 
 	slapi_log_error ( SLAPI_LOG_REPL, (*buf)->buf_agmt_name,
-			"session end: state=%d load=%d sent=%d skipped=%d\n",
-			 (*buf)->buf_state,
-			 (*buf)->buf_load_cnt,
-			 (*buf)->buf_record_cnt - (*buf)->buf_record_skipped,
-			 (*buf)->buf_record_skipped );
+			  "session end: state=%d load=%d sent=%d skipped=%d skipped_new_rid=%d "
+			  "skipped_csn_gt_cons_maxcsn=%d skipped_up_to_date=%d "
+			  "skipped_csn_gt_ruv=%d skipped_csn_covered=%d\n",
+			  (*buf)->buf_state,
+			  (*buf)->buf_load_cnt,
+			  (*buf)->buf_record_cnt - (*buf)->buf_record_skipped,
+			  (*buf)->buf_record_skipped, (*buf)->buf_skipped_new_rid,
+			  (*buf)->buf_skipped_csn_gt_cons_maxcsn,
+			  (*buf)->buf_skipped_up_to_date, (*buf)->buf_skipped_csn_gt_ruv,
+			  (*buf)->buf_skipped_csn_covered);
 
 	for ( i = 0; i < (*buf)->buf_num_cscbs; i++ ) {
 		clcache_free_cscb ( &(*buf)->buf_cscbs[i] );
@@ -676,6 +691,8 @@ clcache_skip_change ( CLC_Buffer *buf )
 	ReplicaId rid;
 	int skip = 1;
 	int i;
+	char buf_cur_csn_str[CSN_STRSIZE];
+	char oth_csn_str[CSN_STRSIZE];
 
 	do {
 
@@ -697,6 +714,14 @@ clcache_skip_change ( CLC_Buffer *buf )
 				 *  The consumer must have been "restored" and needs this newer update.
 				 */
 				skip = 0;
+			} else if (slapi_is_loglevel_set(SLAPI_LOG_REPL)) {
+				csn_as_string(buf->buf_current_csn, 0, buf_cur_csn_str);
+				csn_as_string(cons_maxcsn, 0, oth_csn_str);
+				slapi_log_error(SLAPI_LOG_REPL, buf->buf_agmt_name,
+					"Skipping update because the changelog buffer current csn [%s] is "
+				        "less than or equal to the consumer max csn [%s]\n",
+				        buf_cur_csn_str, oth_csn_str);
+				buf->buf_skipped_csn_gt_cons_maxcsn++;
 			}
 			csn_free(&cons_maxcsn);
 			break;
@@ -714,7 +739,14 @@ clcache_skip_change ( CLC_Buffer *buf )
 
 		/* Skip CSN whose RID is unknown to the local RUV snapshot */
 		if ( i >= buf->buf_num_cscbs ) {
-			buf->buf_state = CLC_STATE_NEW_RID;
+			if (slapi_is_loglevel_set(SLAPI_LOG_REPL)) {
+				csn_as_string(buf->buf_current_csn, 0, buf_cur_csn_str);
+				slapi_log_error(SLAPI_LOG_REPL, buf->buf_agmt_name,
+					"Skipping update because the changelog buffer current csn [%s] rid "
+				        "[%d] is not in the list of changelog csn buffers (length %d)\n",
+				        buf_cur_csn_str, rid, buf->buf_num_cscbs);
+			}
+			buf->buf_skipped_new_rid++;
 			break;
 		}
 
@@ -722,17 +754,20 @@ clcache_skip_change ( CLC_Buffer *buf )
 
 		/* Skip if the consumer is already up-to-date for the RID */
 		if ( cscb->state == CLC_STATE_UP_TO_DATE ) {
+			buf->buf_skipped_up_to_date++;
 			break;
 		}
 
 		/* Skip CSN whose preceedents are not covered by local RUV snapshot */
 		if ( cscb->state == CLC_STATE_CSN_GT_RUV ) {
+			buf->buf_skipped_csn_gt_ruv++;
 			break;
 		}
 
 		/* Skip CSNs already covered by consumer RUV */
 		if ( cscb->consumer_maxcsn &&
 			 csn_compare ( buf->buf_current_csn, cscb->consumer_maxcsn ) <= 0 ) {
+			buf->buf_skipped_csn_covered++;
 				break;
 		}
 
@@ -762,6 +797,7 @@ clcache_skip_change ( CLC_Buffer *buf )
 
 		/* Skip CSNs not covered by local RUV snapshot */
 		cscb->state = CLC_STATE_CSN_GT_RUV;
+		buf->buf_skipped_csn_gt_ruv++;
 
 	} while (0);
 
-- 
1.8.1.4