Blame SOURCES/0047-Ticket-47585-Replication-Failures-related-to-skipped.patch

ba46c7
From e98e41731051b7bf4a443b51a9d3563fc1853773 Mon Sep 17 00:00:00 2001
ba46c7
From: Rich Megginson <rmeggins@redhat.com>
ba46c7
Date: Wed, 6 Nov 2013 14:22:31 -0700
ba46c7
Subject: [PATCH 47/49] Ticket #47585 Replication Failures related to skipped
ba46c7
 entries due to cleaned rids
ba46c7
ba46c7
https://fedorahosted.org/389/ticket/47585
ba46c7
Reviewed by: nhosoi (Thanks!)
ba46c7
Branch: 389-ds-base-1.3.1
ba46c7
Fix Description: If a change was found in the changelog buffer that is
ba46c7
skipped due to having an unknown replica ID (rid), the entire buffer was
ba46c7
marked as CLC_STATE_NEW_RID.  When the buffer is exhausted and the iterator
ba46c7
code goes to read in the new buffer, it would not read in the new buffer
ba46c7
because it only loads a new buffer if the current buffer state is
ba46c7
CLC_STATE_READY.  I don't know why the entire buffer would be marked as
ba46c7
CLC_STATE_NEW_RID and stop iteration.  It seems to me that just the update
ba46c7
should be skipped, but new buffers should be loaded in order to keep sending
ba46c7
non-skipped updates to the consumer.
ba46c7
It is possible for a CSN with an unknown RID to get into the changelog if
ba46c7
the server with that RID had been removed by cleanruv/cleanallruv.  In that
ba46c7
case, the CSN should be skipped.  It is assumed that the change was already
ba46c7
sent - cleanallruv is supposed to wait until all known changes have been
ba46c7
seen before removing the RID from the RUV - so it is safe to skip it.
ba46c7
Added additional debugging, so that we can better tell why changelog entries
ba46c7
were skipped.
ba46c7
Platforms tested: RHEL6 x86_64
ba46c7
Flag Day: no
ba46c7
Doc impact: no
ba46c7
(cherry picked from commit cf08f1274404e4796966011a98a6a0acbbfd6070)
ba46c7
(cherry picked from commit 30bb98fb693ea1aac9774bdc43b923eacd72570a)
ba46c7
(cherry picked from commit fc70e4ac6accaa14d140e333829e98897f6ff164)
ba46c7
---
ba46c7
 ldap/servers/plugins/replication/cl5_clcache.c | 48 ++++++++++++++++++++++----
ba46c7
 1 file changed, 42 insertions(+), 6 deletions(-)
ba46c7
ba46c7
diff --git a/ldap/servers/plugins/replication/cl5_clcache.c b/ldap/servers/plugins/replication/cl5_clcache.c
ba46c7
index 7a6a446..8218312 100644
ba46c7
--- a/ldap/servers/plugins/replication/cl5_clcache.c
ba46c7
+++ b/ldap/servers/plugins/replication/cl5_clcache.c
ba46c7
@@ -120,6 +120,11 @@ struct clc_buffer {
ba46c7
 	int		 	 buf_load_cnt;		/* number of loads for session */
ba46c7
 	int		 	 buf_record_cnt;	/* number of changes for session */
ba46c7
 	int		 	 buf_record_skipped;	/* number of changes skipped */
ba46c7
+	int		 	 buf_skipped_new_rid;	/* number of changes skipped due to new_rid */
ba46c7
+	int		 	 buf_skipped_csn_gt_cons_maxcsn;	/* number of changes skipped because csn is less than or equal to the consumer maxcsn (despite the field name) */
ba46c7
+	int		 	 buf_skipped_up_to_date;	/* number of changes skipped due to consumer being up-to-date for the given rid */
ba46c7
+	int		 	 buf_skipped_csn_gt_ruv;	/* number of changes skipped because their precedents are not covered by the local RUV snapshot */
ba46c7
+	int		 	 buf_skipped_csn_covered;	/* number of changes skipped due to CSNs already covered by consumer RUV */
ba46c7
 
ba46c7
 	/*
ba46c7
 	 * fields that should be accessed via bl_lock or pl_lock
ba46c7
@@ -252,6 +257,11 @@ clcache_get_buffer ( CLC_Buffer **buf, DB *db, ReplicaId consumer_rid, const RUV
ba46c7
 		(*buf)->buf_record_skipped = 0;
ba46c7
 		(*buf)->buf_cursor = NULL;
ba46c7
 		(*buf)->buf_num_cscbs = 0;
ba46c7
+		(*buf)->buf_skipped_new_rid = 0;
ba46c7
+		(*buf)->buf_skipped_csn_gt_cons_maxcsn = 0;
ba46c7
+		(*buf)->buf_skipped_up_to_date = 0;
ba46c7
+		(*buf)->buf_skipped_csn_gt_ruv = 0;
ba46c7
+		(*buf)->buf_skipped_csn_covered = 0;
ba46c7
 	}
ba46c7
 	else {
ba46c7
 		*buf = clcache_new_buffer ( consumer_rid );
ba46c7
@@ -287,11 +297,16 @@ clcache_return_buffer ( CLC_Buffer **buf )
ba46c7
 	int i;
ba46c7
 
ba46c7
 	slapi_log_error ( SLAPI_LOG_REPL, (*buf)->buf_agmt_name,
ba46c7
-			"session end: state=%d load=%d sent=%d skipped=%d\n",
ba46c7
-			 (*buf)->buf_state,
ba46c7
-			 (*buf)->buf_load_cnt,
ba46c7
-			 (*buf)->buf_record_cnt - (*buf)->buf_record_skipped,
ba46c7
-			 (*buf)->buf_record_skipped );
ba46c7
+			  "session end: state=%d load=%d sent=%d skipped=%d skipped_new_rid=%d "
ba46c7
+			  "skipped_csn_gt_cons_maxcsn=%d skipped_up_to_date=%d "
ba46c7
+			  "skipped_csn_gt_ruv=%d skipped_csn_covered=%d\n",
ba46c7
+			  (*buf)->buf_state,
ba46c7
+			  (*buf)->buf_load_cnt,
ba46c7
+			  (*buf)->buf_record_cnt - (*buf)->buf_record_skipped,
ba46c7
+			  (*buf)->buf_record_skipped, (*buf)->buf_skipped_new_rid,
ba46c7
+			  (*buf)->buf_skipped_csn_gt_cons_maxcsn,
ba46c7
+			  (*buf)->buf_skipped_up_to_date, (*buf)->buf_skipped_csn_gt_ruv,
ba46c7
+			  (*buf)->buf_skipped_csn_covered);
ba46c7
 
ba46c7
 	for ( i = 0; i < (*buf)->buf_num_cscbs; i++ ) {
ba46c7
 		clcache_free_cscb ( &(*buf)->buf_cscbs[i] );
ba46c7
@@ -676,6 +691,8 @@ clcache_skip_change ( CLC_Buffer *buf )
ba46c7
 	ReplicaId rid;
ba46c7
 	int skip = 1;
ba46c7
 	int i;
ba46c7
+	char buf_cur_csn_str[CSN_STRSIZE];
ba46c7
+	char oth_csn_str[CSN_STRSIZE];
ba46c7
 
ba46c7
 	do {
ba46c7
 
ba46c7
@@ -697,6 +714,14 @@ clcache_skip_change ( CLC_Buffer *buf )
ba46c7
 				 *  The consumer must have been "restored" and needs this newer update.
ba46c7
 				 */
ba46c7
 				skip = 0;
ba46c7
+			} else if (slapi_is_loglevel_set(SLAPI_LOG_REPL)) {
ba46c7
+				csn_as_string(buf->buf_current_csn, 0, buf_cur_csn_str);
ba46c7
+				csn_as_string(cons_maxcsn, 0, oth_csn_str);
ba46c7
+				slapi_log_error(SLAPI_LOG_REPL, buf->buf_agmt_name,
ba46c7
+					"Skipping update because the changelog buffer current csn [%s] is "
ba46c7
+				        "less than or equal to the consumer max csn [%s]\n",
ba46c7
+				        buf_cur_csn_str, oth_csn_str);
ba46c7
+				buf->buf_skipped_csn_gt_cons_maxcsn++;
ba46c7
 			}
ba46c7
 			csn_free(&cons_maxcsn);
ba46c7
 			break;
ba46c7
@@ -714,7 +739,14 @@ clcache_skip_change ( CLC_Buffer *buf )
ba46c7
 
ba46c7
 		/* Skip CSN whose RID is unknown to the local RUV snapshot */
ba46c7
 		if ( i >= buf->buf_num_cscbs ) {
ba46c7
-			buf->buf_state = CLC_STATE_NEW_RID;
ba46c7
+			if (slapi_is_loglevel_set(SLAPI_LOG_REPL)) {
ba46c7
+				csn_as_string(buf->buf_current_csn, 0, buf_cur_csn_str);
ba46c7
+				slapi_log_error(SLAPI_LOG_REPL, buf->buf_agmt_name,
ba46c7
+					"Skipping update because the changelog buffer current csn [%s] rid "
ba46c7
+				        "[%d] is not in the list of changelog csn buffers (length %d)\n",
ba46c7
+				        buf_cur_csn_str, rid, buf->buf_num_cscbs);
ba46c7
+			}
ba46c7
+			buf->buf_skipped_new_rid++;
ba46c7
 			break;
ba46c7
 		}
ba46c7
 
ba46c7
@@ -722,17 +754,20 @@ clcache_skip_change ( CLC_Buffer *buf )
ba46c7
 
ba46c7
 		/* Skip if the consumer is already up-to-date for the RID */
ba46c7
 		if ( cscb->state == CLC_STATE_UP_TO_DATE ) {
ba46c7
+			buf->buf_skipped_up_to_date++;
ba46c7
 			break;
ba46c7
 		}
ba46c7
 
ba46c7
 		/* Skip CSN whose preceedents are not covered by local RUV snapshot */
ba46c7
 		if ( cscb->state == CLC_STATE_CSN_GT_RUV ) {
ba46c7
+			buf->buf_skipped_csn_gt_ruv++;
ba46c7
 			break;
ba46c7
 		}
ba46c7
 
ba46c7
 		/* Skip CSNs already covered by consumer RUV */
ba46c7
 		if ( cscb->consumer_maxcsn &&
ba46c7
 			 csn_compare ( buf->buf_current_csn, cscb->consumer_maxcsn ) <= 0 ) {
ba46c7
+			buf->buf_skipped_csn_covered++;
ba46c7
 				break;
ba46c7
 		}
ba46c7
 
ba46c7
@@ -762,6 +797,7 @@ clcache_skip_change ( CLC_Buffer *buf )
ba46c7
 
ba46c7
 		/* Skip CSNs not covered by local RUV snapshot */
ba46c7
 		cscb->state = CLC_STATE_CSN_GT_RUV;
ba46c7
+		buf->buf_skipped_csn_gt_ruv++;
ba46c7
 
ba46c7
 	} while (0);
ba46c7
 
ba46c7
-- 
ba46c7
1.8.1.4
ba46c7