From e98e41731051b7bf4a443b51a9d3563fc1853773 Mon Sep 17 00:00:00 2001
From: Rich Megginson
Date: Wed, 6 Nov 2013 14:22:31 -0700
Subject: [PATCH 47/49] Ticket #47585 Replication Failures related to skipped entries due to cleaned rids

https://fedorahosted.org/389/ticket/47585
Reviewed by: nhosoi (Thanks!)
Branch: 389-ds-base-1.3.1
Fix Description: If a change was found in the changelog buffer that is
skipped due to having an unknown replica ID (rid), the entire buffer was
marked as CLC_STATE_NEW_RID. When the buffer is exhausted and the iterator
code goes to read in the new buffer, it would not read in the new buffer
because it only loads a new buffer if the current buffer state is
CLC_STATE_READY. I don't know why the entire buffer would be marked as
CLC_STATE_NEW_RID and stop iteration. It seems to me that just the update
should be skipped, but new buffers should be loaded in order to keep sending
non-skipped updates to the consumer.
It is possible for a CSN with an unknown RID to get into the changelog if
the server with that RID had been removed by cleanruv/cleanallruv. In that
case, the CSN should be skipped. It is assumed that the change was already
sent - cleanallruv is supposed to wait until all known changes have been
seen before removing the RID from the RUV - so it is safe to skip it.
Added additional debugging, so that we can better tell why changelog entries
were skipped.
Platforms tested: RHEL6 x86_64 Flag Day: no Doc impact: no (cherry picked from commit cf08f1274404e4796966011a98a6a0acbbfd6070) (cherry picked from commit 30bb98fb693ea1aac9774bdc43b923eacd72570a) (cherry picked from commit fc70e4ac6accaa14d140e333829e98897f6ff164) --- ldap/servers/plugins/replication/cl5_clcache.c | 48 ++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/ldap/servers/plugins/replication/cl5_clcache.c b/ldap/servers/plugins/replication/cl5_clcache.c index 7a6a446..8218312 100644 --- a/ldap/servers/plugins/replication/cl5_clcache.c +++ b/ldap/servers/plugins/replication/cl5_clcache.c @@ -120,6 +120,11 @@ struct clc_buffer { int buf_load_cnt; /* number of loads for session */ int buf_record_cnt; /* number of changes for session */ int buf_record_skipped; /* number of changes skipped */ + int buf_skipped_new_rid; /* number of changes skipped due to new_rid */ + int buf_skipped_csn_gt_cons_maxcsn; /* number of changes skipped due to csn greater than consumer maxcsn */ + int buf_skipped_up_to_date; /* number of changes skipped due to consumer being up-to-date for the given rid */ + int buf_skipped_csn_gt_ruv; /* number of changes skipped due to preceedents are not covered by local RUV snapshot */ + int buf_skipped_csn_covered; /* number of changes skipped due to CSNs already covered by consumer RUV */ /* * fields that should be accessed via bl_lock or pl_lock @@ -252,6 +257,11 @@ clcache_get_buffer ( CLC_Buffer **buf, DB *db, ReplicaId consumer_rid, const RUV (*buf)->buf_record_skipped = 0; (*buf)->buf_cursor = NULL; (*buf)->buf_num_cscbs = 0; + (*buf)->buf_skipped_new_rid = 0; + (*buf)->buf_skipped_csn_gt_cons_maxcsn = 0; + (*buf)->buf_skipped_up_to_date = 0; + (*buf)->buf_skipped_csn_gt_ruv = 0; + (*buf)->buf_skipped_csn_covered = 0; } else { *buf = clcache_new_buffer ( consumer_rid ); @@ -287,11 +297,16 @@ clcache_return_buffer ( CLC_Buffer **buf ) int i; slapi_log_error ( SLAPI_LOG_REPL, (*buf)->buf_agmt_name, - 
"session end: state=%d load=%d sent=%d skipped=%d\n", - (*buf)->buf_state, - (*buf)->buf_load_cnt, - (*buf)->buf_record_cnt - (*buf)->buf_record_skipped, - (*buf)->buf_record_skipped ); + "session end: state=%d load=%d sent=%d skipped=%d skipped_new_rid=%d " + "skipped_csn_gt_cons_maxcsn=%d skipped_up_to_date=%d " + "skipped_csn_gt_ruv=%d skipped_csn_covered=%d\n", + (*buf)->buf_state, + (*buf)->buf_load_cnt, + (*buf)->buf_record_cnt - (*buf)->buf_record_skipped, + (*buf)->buf_record_skipped, (*buf)->buf_skipped_new_rid, + (*buf)->buf_skipped_csn_gt_cons_maxcsn, + (*buf)->buf_skipped_up_to_date, (*buf)->buf_skipped_csn_gt_ruv, + (*buf)->buf_skipped_csn_covered); for ( i = 0; i < (*buf)->buf_num_cscbs; i++ ) { clcache_free_cscb ( &(*buf)->buf_cscbs[i] ); @@ -676,6 +691,8 @@ clcache_skip_change ( CLC_Buffer *buf ) ReplicaId rid; int skip = 1; int i; + char buf_cur_csn_str[CSN_STRSIZE]; + char oth_csn_str[CSN_STRSIZE]; do { @@ -697,6 +714,14 @@ clcache_skip_change ( CLC_Buffer *buf ) * The consumer must have been "restored" and needs this newer update. 
*/ skip = 0; + } else if (slapi_is_loglevel_set(SLAPI_LOG_REPL)) { + csn_as_string(buf->buf_current_csn, 0, buf_cur_csn_str); + csn_as_string(cons_maxcsn, 0, oth_csn_str); + slapi_log_error(SLAPI_LOG_REPL, buf->buf_agmt_name, + "Skipping update because the changelog buffer current csn [%s] is " + "less than or equal to the consumer max csn [%s]\n", + buf_cur_csn_str, oth_csn_str); + buf->buf_skipped_csn_gt_cons_maxcsn++; } csn_free(&cons_maxcsn); break; @@ -714,7 +739,14 @@ clcache_skip_change ( CLC_Buffer *buf ) /* Skip CSN whose RID is unknown to the local RUV snapshot */ if ( i >= buf->buf_num_cscbs ) { - buf->buf_state = CLC_STATE_NEW_RID; + if (slapi_is_loglevel_set(SLAPI_LOG_REPL)) { + csn_as_string(buf->buf_current_csn, 0, buf_cur_csn_str); + slapi_log_error(SLAPI_LOG_REPL, buf->buf_agmt_name, + "Skipping update because the changelog buffer current csn [%s] rid " + "[%d] is not in the list of changelog csn buffers (length %d)\n", + buf_cur_csn_str, rid, buf->buf_num_cscbs); + } + buf->buf_skipped_new_rid++; break; } @@ -722,17 +754,20 @@ clcache_skip_change ( CLC_Buffer *buf ) /* Skip if the consumer is already up-to-date for the RID */ if ( cscb->state == CLC_STATE_UP_TO_DATE ) { + buf->buf_skipped_up_to_date++; break; } /* Skip CSN whose preceedents are not covered by local RUV snapshot */ if ( cscb->state == CLC_STATE_CSN_GT_RUV ) { + buf->buf_skipped_csn_gt_ruv++; break; } /* Skip CSNs already covered by consumer RUV */ if ( cscb->consumer_maxcsn && csn_compare ( buf->buf_current_csn, cscb->consumer_maxcsn ) <= 0 ) { + buf->buf_skipped_csn_covered++; break; } @@ -762,6 +797,7 @@ clcache_skip_change ( CLC_Buffer *buf ) /* Skip CSNs not covered by local RUV snapshot */ cscb->state = CLC_STATE_CSN_GT_RUV; + buf->buf_skipped_csn_gt_ruv++; } while (0); -- 1.8.1.4