Blame SOURCES/redhat-bugzilla-1975069.patch

5fe4d4
3f5ba2218 libpcp_web: add mutex to struct webgroup protecting the context dict
5fe4d4
107633192 src/libpcp: be more careful when calling __pmLogChangeVol()
5fe4d4
49bdfdfff libpcp: redefine __pmLogSetTime()
5fe4d4
5e3b792d3 libpcp_web: plug mem leak in redisMapInsert during daily log-rolling
5fe4d4
2a00a90b0 libpcp_web/discovery: improve lock handling and scalability
5fe4d4
5fe4d4
commit 3f5ba221842e6a02e9fb22e23c754854271c3c9a
5fe4d4
Author: Mark Goodwin <mgoodwin@redhat.com>
5fe4d4
Date:   Wed Jun 9 16:44:30 2021 +1000
5fe4d4
5fe4d4
    libpcp_web: add mutex to struct webgroup protecting the context dict
5fe4d4
    
5fe4d4
    Add a mutex to the local webgroups structure in libpcp_web and
5fe4d4
    use it to protect multithreaded parallel updates (dictAdd,
5fe4d4
    dictDelete) to the groups->contexts dict and the dict traversal
5fe4d4
    in the timer driven garbage collector.
5fe4d4
    
5fe4d4
    Tested by qa/297 and related tests and also an updated version
5fe4d4
    of qa/1457 (which now stress tests parallel http and https/tls
5fe4d4
    pmproxy RESTAPI calls .. in a later commit).
5fe4d4
    
5fe4d4
    Related: RHBZ#1947989
5fe4d4
    Resolves: https://github.com/performancecopilot/pcp/issues/1311
5fe4d4
5fe4d4
diff --git a/src/libpcp_web/src/webgroup.c b/src/libpcp_web/src/webgroup.c
5fe4d4
index 08c2518ed..35f05441b 100644
5fe4d4
--- a/src/libpcp_web/src/webgroup.c
5fe4d4
+++ b/src/libpcp_web/src/webgroup.c
5fe4d4
@@ -51,14 +51,20 @@ typedef struct webgroups {
5fe4d4
     uv_loop_t		*events;
5fe4d4
     unsigned int	active;
5fe4d4
     uv_timer_t		timer;
5fe4d4
+    uv_mutex_t		mutex;
5fe4d4
 } webgroups;
5fe4d4
 
5fe4d4
 static struct webgroups *
5fe4d4
 webgroups_lookup(pmWebGroupModule *module)
5fe4d4
 {
5fe4d4
-    if (module->privdata == NULL)
5fe4d4
+    struct webgroups *groups = module->privdata;
5fe4d4
+
5fe4d4
+    if (module->privdata == NULL) {
5fe4d4
 	module->privdata = calloc(1, sizeof(struct webgroups));
5fe4d4
-    return (struct webgroups *)module->privdata;
5fe4d4
+	groups = (struct webgroups *)module->privdata;
5fe4d4
+	uv_mutex_init(&groups->mutex);
5fe4d4
+    }
5fe4d4
+    return groups;
5fe4d4
 }
5fe4d4
 
5fe4d4
 static int
5fe4d4
@@ -94,8 +100,11 @@ webgroup_drop_context(struct context *context, struct webgroups *groups)
5fe4d4
 	    context->garbage = 1;
5fe4d4
 	    uv_timer_stop(&context->timer);
5fe4d4
 	}
5fe4d4
-	if (groups)
5fe4d4
+	if (groups) {
5fe4d4
+	    uv_mutex_lock(&groups->mutex);
5fe4d4
 	    dictDelete(groups->contexts, &context->randomid);
5fe4d4
+	    uv_mutex_unlock(&groups->mutex);
5fe4d4
+	}
5fe4d4
 	uv_close((uv_handle_t *)&context->timer, webgroup_release_context);
5fe4d4
     }
5fe4d4
 }
5fe4d4
@@ -207,13 +216,16 @@ webgroup_new_context(pmWebGroupSettings *sp, dict *params,
5fe4d4
     cp->context = -1;
5fe4d4
     cp->timeout = polltime;
5fe4d4
 
5fe4d4
+    uv_mutex_lock(&groups->mutex);
5fe4d4
     if ((cp->randomid = random()) < 0 ||
5fe4d4
 	dictFind(groups->contexts, &cp->randomid) != NULL) {
5fe4d4
 	infofmt(*message, "random number failure on new web context");
5fe4d4
 	pmwebapi_free_context(cp);
5fe4d4
 	*status = -ESRCH;
5fe4d4
+	uv_mutex_unlock(&groups->mutex);
5fe4d4
 	return NULL;
5fe4d4
     }
5fe4d4
+    uv_mutex_unlock(&groups->mutex);
5fe4d4
     cp->origin = sdscatfmt(sdsempty(), "%i", cp->randomid);
5fe4d4
     cp->name.sds = sdsdup(hostspec ? hostspec : LOCALHOST);
5fe4d4
     cp->realm = sdscatfmt(sdsempty(), "pmapi/%i", cp->randomid);
5fe4d4
@@ -242,7 +254,9 @@ webgroup_new_context(pmWebGroupSettings *sp, dict *params,
5fe4d4
 	pmwebapi_free_context(cp);
5fe4d4
 	return NULL;
5fe4d4
     }
5fe4d4
+    uv_mutex_lock(&groups->mutex);
5fe4d4
     dictAdd(groups->contexts, &cp->randomid, cp);
5fe4d4
+    uv_mutex_unlock(&groups->mutex);
5fe4d4
 
5fe4d4
     /* leave until the end because uv_timer_init makes this visible in uv_run */
5fe4d4
     handle = (uv_handle_t *)&cp->timer;
5fe4d4
@@ -261,25 +275,34 @@ webgroup_new_context(pmWebGroupSettings *sp, dict *params,
5fe4d4
 static void
5fe4d4
 webgroup_garbage_collect(struct webgroups *groups)
5fe4d4
 {
5fe4d4
-    dictIterator        *iterator = dictGetSafeIterator(groups->contexts);
5fe4d4
+    dictIterator        *iterator;
5fe4d4
     dictEntry           *entry;
5fe4d4
     context_t		*cp;
5fe4d4
 
5fe4d4
     if (pmDebugOptions.http || pmDebugOptions.libweb)
5fe4d4
 	fprintf(stderr, "%s: started\n", "webgroup_garbage_collect");
5fe4d4
 
5fe4d4
-    while ((entry = dictNext(iterator)) != NULL) {
5fe4d4
-	cp = (context_t *)dictGetVal(entry);
5fe4d4
-	if (cp->garbage && cp->privdata == groups) {
5fe4d4
-	    if (pmDebugOptions.http || pmDebugOptions.libweb)
5fe4d4
-		fprintf(stderr, "GC context %u (%p)\n", cp->randomid, cp);
5fe4d4
-	    webgroup_drop_context(cp, groups);
5fe4d4
+    /* do context GC if we get the lock (else don't block here) */
5fe4d4
+    if (uv_mutex_trylock(&groups->mutex) == 0) {
5fe4d4
+	iterator = dictGetSafeIterator(groups->contexts);
5fe4d4
+	for (entry = dictNext(iterator); entry;) {
5fe4d4
+	    cp = (context_t *)dictGetVal(entry);
5fe4d4
+	    entry = dictNext(iterator);
5fe4d4
+	    if (cp->garbage && cp->privdata == groups) {
5fe4d4
+		if (pmDebugOptions.http || pmDebugOptions.libweb)
5fe4d4
+		    fprintf(stderr, "GC context %u (%p)\n", cp->randomid, cp);
5fe4d4
+		uv_mutex_unlock(&groups->mutex);
5fe4d4
+		webgroup_drop_context(cp, groups);
5fe4d4
+		uv_mutex_lock(&groups->mutex);
5fe4d4
+	    }
5fe4d4
 	}
5fe4d4
+	dictReleaseIterator(iterator);
5fe4d4
+	uv_mutex_unlock(&groups->mutex);
5fe4d4
     }
5fe4d4
-    dictReleaseIterator(iterator);
5fe4d4
 
5fe4d4
     /* TODO - trim maps, particularly instmap if proc metrics are not excluded */
5fe4d4
 
5fe4d4
+    /* TODO move the following to a new stats timer */
5fe4d4
     if (groups->metrics_handle) {
5fe4d4
 	mmv_stats_set(groups->metrics_handle, "contextmap.size",
5fe4d4
 	    NULL, dictSize(contextmap));
5fe4d4
5fe4d4
commit 107633192326b27ae571d4d4955052b8d86222c2
5fe4d4
Author: Ken McDonell <kenj@kenj.id.au>
5fe4d4
Date:   Fri Jul 2 16:52:48 2021 +1000
5fe4d4
5fe4d4
    src/libpcp: be more careful when calling __pmLogChangeVol()
5fe4d4
    
5fe4d4
    Mark observed a SEGV which looks like __pmLogFetch() died because
5fe4d4
    ctxp->c_archctl->ac_mfp was (unexpectedly) NULL.
5fe4d4
    
5fe4d4
    See: https://github.com/performancecopilot/pcp/issues/1338
5fe4d4
    
5fe4d4
    Initial guess is that a physical file was removed by concurrent
5fe4d4
    activity (like pmlogger_check or pmlogger_daily), causing
5fe4d4
    __pmLogChangeVol() to fail ... and this was not being checked for
5fe4d4
    on the __pmLogFetch() path and in a couple of other places.
5fe4d4
    
5fe4d4
    modified:   interp.c
5fe4d4
    modified:   logutil.c
5fe4d4
5fe4d4
diff --git a/src/libpcp/src/interp.c b/src/libpcp/src/interp.c
5fe4d4
index d7effbc1e..c8f6fe382 100644
5fe4d4
--- a/src/libpcp/src/interp.c
5fe4d4
+++ b/src/libpcp/src/interp.c
5fe4d4
@@ -1312,7 +1312,9 @@ __pmLogFetchInterp(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **r
5fe4d4
     }
5fe4d4
 
5fe4d4
     /* get to the last remembered place */
5fe4d4
-    __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
5fe4d4
+    sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
5fe4d4
+    if (sts < 0)
5fe4d4
+	goto all_done;
5fe4d4
     __pmFseek(ctxp->c_archctl->ac_mfp, ctxp->c_archctl->ac_offset, SEEK_SET);
5fe4d4
 
5fe4d4
     seen_mark = 0;	/* interested in <mark> records seen from here on */
5fe4d4
@@ -1397,7 +1399,9 @@ __pmLogFetchInterp(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **r
5fe4d4
 	 * at least one metric requires a bound from earlier in the log ...
5fe4d4
 	 * position ourselves, ... and search
5fe4d4
 	 */
5fe4d4
-	__pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
5fe4d4
+	sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
5fe4d4
+	if (sts < 0)
5fe4d4
+	    goto all_done;
5fe4d4
 	__pmFseek(ctxp->c_archctl->ac_mfp, ctxp->c_archctl->ac_offset, SEEK_SET);
5fe4d4
 	done = 0;
5fe4d4
 
5fe4d4
@@ -1542,7 +1546,9 @@ __pmLogFetchInterp(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **r
5fe4d4
 	 * at least one metric requires a bound from later in the log ...
5fe4d4
 	 * position ourselves ... and search
5fe4d4
 	 */
5fe4d4
-	__pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
5fe4d4
+	sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
5fe4d4
+	if (sts < 0)
5fe4d4
+	    goto all_done;
5fe4d4
 	__pmFseek(ctxp->c_archctl->ac_mfp, ctxp->c_archctl->ac_offset, SEEK_SET);
5fe4d4
 	done = 0;
5fe4d4
 
5fe4d4
diff --git a/src/libpcp/src/logutil.c b/src/libpcp/src/logutil.c
5fe4d4
index fe35ed422..0ef76de25 100644
5fe4d4
--- a/src/libpcp/src/logutil.c
5fe4d4
+++ b/src/libpcp/src/logutil.c
5fe4d4
@@ -1992,7 +1992,10 @@ __pmLogFetch(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **result)
5fe4d4
     all_derived = check_all_derived(numpmid, pmidlist);
5fe4d4
 
5fe4d4
     /* re-establish position */
5fe4d4
-    __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
5fe4d4
+    sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
5fe4d4
+    if (sts < 0)
5fe4d4
+	goto func_return;
5fe4d4
+    assert(ctxp->c_archctl->ac_mfp != NULL);
5fe4d4
     __pmFseek(ctxp->c_archctl->ac_mfp, 
5fe4d4
 	    (long)ctxp->c_archctl->ac_offset, SEEK_SET);
5fe4d4
 
5fe4d4
@@ -2489,10 +2492,12 @@ __pmLogSetTime(__pmContext *ctxp)
5fe4d4
 	/* index either not available, or not useful */
5fe4d4
 	if (mode == PM_MODE_FORW) {
5fe4d4
 	    __pmLogChangeVol(acp, lcp->l_minvol);
5fe4d4
+	    assert(acp->ac_mfp != NULL);
5fe4d4
 	    __pmFseek(acp->ac_mfp, (long)(sizeof(__pmLogLabel) + 2*sizeof(int)), SEEK_SET);
5fe4d4
 	}
5fe4d4
 	else if (mode == PM_MODE_BACK) {
5fe4d4
 	    __pmLogChangeVol(acp, lcp->l_maxvol);
5fe4d4
+	    assert(acp->ac_mfp != NULL);
5fe4d4
 	    __pmFseek(acp->ac_mfp, (long)0, SEEK_END);
5fe4d4
 	}
5fe4d4
 
5fe4d4
@@ -3141,6 +3146,7 @@ LogChangeToPreviousArchive(__pmContext *ctxp)
5fe4d4
 
5fe4d4
     /* Set up to scan backwards from the end of the archive. */
5fe4d4
     __pmLogChangeVol(acp, lcp->l_maxvol);
5fe4d4
+    assert(acp->ac_mfp != NULL);
5fe4d4
     __pmFseek(acp->ac_mfp, (long)0, SEEK_END);
5fe4d4
     ctxp->c_archctl->ac_offset = __pmFtell(acp->ac_mfp);
5fe4d4
     assert(ctxp->c_archctl->ac_offset >= 0);
5fe4d4
5fe4d4
commit 49bdfdfff83ac165de2bdc9a40e61a56512585d8
5fe4d4
Author: Ken McDonell <kenj@kenj.id.au>
5fe4d4
Date:   Sun Jul 4 10:07:09 2021 +1000
5fe4d4
5fe4d4
    libpcp: redefine __pmLogSetTime()
5fe4d4
    
5fe4d4
    The problem is that if physical files for the data volumes of an
5fe4d4
    archive are removed (asynchronously by someone else) while we're
5fe4d4
    trying to switch volumes then we don't handle this safely.
5fe4d4
    
5fe4d4
    The previous commit 10763319 was as stop-gap to address Mark's SEGV
5fe4d4
    issue at https://github.com/performancecopilot/pcp/issues/1338 and
5fe4d4
    simply handled direct calls to __pmLogChangeVol() and ensured the
5fe4d4
    return status was checked.
5fe4d4
    
5fe4d4
    I was aware, then Coverity made a lot more people aware, that this
5fe4d4
    "fix" was incomplete, specifically the calls to __pmLogChangeVol()
5fe4d4
    from within __pmLogSetTime() were not checked.
5fe4d4
    
5fe4d4
    To fix the latter we have to change the type of __pmLogSetTime() from
5fe4d4
    void to int so we can return status to indicate that __pmLogChangeVol()
5fe4d4
    has failed.  And then make sure all the callers of __pmLogSetTime()
5fe4d4
    check the status returned from that function.
5fe4d4
    
5fe4d4
        modified:   src/libpcp/src/fetch.c
5fe4d4
        modified:   src/libpcp/src/internal.h
5fe4d4
        modified:   src/libpcp/src/logutil.c
5fe4d4
    
5fe4d4
    Because this introduces some new -Dlog diagnostics, qa/251 needed
5fe4d4
    a bit of a make-over.
5fe4d4
5fe4d4
diff --git a/qa/251 b/qa/251
5fe4d4
index 2b8a07917..f9b293e98 100755
5fe4d4
--- a/qa/251
5fe4d4
+++ b/qa/251
5fe4d4
@@ -37,7 +37,7 @@ _filter()
5fe4d4
 
5fe4d4
 status=1	# failure is the default!
5fe4d4
 $sudo rm -rf $tmp.* $seq.full
5fe4d4
-trap "cd $here; rm -rf $tmp; exit \$status" 0 1 2 3 15
5fe4d4
+trap "cd $here; rm -rf $tmp $tmp.*; exit \$status" 0 1 2 3 15
5fe4d4
 
5fe4d4
 # real QA test starts here
5fe4d4
 mkdir $tmp
5fe4d4
@@ -50,56 +50,62 @@ cd $tmp
5fe4d4
 for inst in "bin-100" "bin-100,bin-500,bin-900"
5fe4d4
 do
5fe4d4
     echo
5fe4d4
-    echo "All volumes present ... $inst ..."
5fe4d4
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
5fe4d4
-    egrep 'Skip|Change' err
5fe4d4
-    _filter 
5fe4d4
+    echo "All volumes present ... $inst ..." | tee -a $here/$seq.full
5fe4d4
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
5fe4d4
+    cat $tmp.err >>$here/$seq.full
5fe4d4
+    grep '^__pmLogChangeVol:' $tmp.err
5fe4d4
+    _filter <$tmp.out
5fe4d4
     [ -f die ] && exit
5fe4d4
 
5fe4d4
     echo
5fe4d4
-    echo "First volume missing ... $inst ..."
5fe4d4
+    echo "First volume missing ... $inst ..." | tee -a $here/$seq.full
5fe4d4
     mv ok-mv-bar.0 foo.0
5fe4d4
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
5fe4d4
-    egrep 'Skip|Change' err
5fe4d4
-    _filter 
5fe4d4
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
5fe4d4
+    cat $tmp.err >>$here/$seq.full
5fe4d4
+    grep '^__pmLogChangeVol:' $tmp.err
5fe4d4
+    _filter <$tmp.out
5fe4d4
     [ -f die ] && exit
5fe4d4
     mv foo.0 ok-mv-bar.0
5fe4d4
 
5fe4d4
     echo
5fe4d4
-    echo "Last volume missing ... $inst ..."
5fe4d4
+    echo "Last volume missing ... $inst ..." | tee -a $here/$seq.full
5fe4d4
     mv ok-mv-bar.3 foo.3
5fe4d4
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
5fe4d4
-    egrep 'Skip|Change' err
5fe4d4
-    _filter 
5fe4d4
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
5fe4d4
+    cat $tmp.err >>$here/$seq.full
5fe4d4
+    grep '^__pmLogChangeVol:' $tmp.err
5fe4d4
+    _filter <$tmp.out
5fe4d4
     [ -f die ] && exit
5fe4d4
     mv foo.3 ok-mv-bar.3
5fe4d4
 
5fe4d4
     echo
5fe4d4
-    echo "Second volume missing ... $inst ..."
5fe4d4
+    echo "Second volume missing ... $inst ..." | tee -a $here/$seq.full
5fe4d4
     mv ok-mv-bar.1 foo.1
5fe4d4
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
5fe4d4
-    egrep 'Skip|Change' err
5fe4d4
-    _filter 
5fe4d4
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
5fe4d4
+    cat $tmp.err >>$here/$seq.full
5fe4d4
+    grep '^__pmLogChangeVol:' $tmp.err
5fe4d4
+    _filter <$tmp.out
5fe4d4
     [ -f die ] && exit
5fe4d4
     mv foo.1 ok-mv-bar.1
5fe4d4
 
5fe4d4
     echo
5fe4d4
-    echo "Second last volume missing ... $inst ..."
5fe4d4
+    echo "Second last volume missing ... $inst ..." | tee -a $here/$seq.full
5fe4d4
     mv ok-mv-bar.2 foo.2
5fe4d4
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
5fe4d4
-    egrep 'Skip|Change' err
5fe4d4
-    _filter 
5fe4d4
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
5fe4d4
+    cat $tmp.err >>$here/$seq.full
5fe4d4
+    grep '^__pmLogChangeVol:' $tmp.err
5fe4d4
+    _filter <$tmp.out
5fe4d4
     [ -f die ] && exit
5fe4d4
     mv foo.2 ok-mv-bar.2
5fe4d4
 
5fe4d4
     echo
5fe4d4
-    echo "All volumes but second missing ... $inst ..."
5fe4d4
+    echo "All volumes but second missing ... $inst ..." | tee -a $here/$seq.full
5fe4d4
     mv ok-mv-bar.0 foo.0
5fe4d4
     mv ok-mv-bar.2 foo.2
5fe4d4
     mv ok-mv-bar.3 foo.3
5fe4d4
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
5fe4d4
-    egrep 'Skip|Change' err
5fe4d4
-    _filter 
5fe4d4
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
5fe4d4
+    cat $tmp.err >>$here/$seq.full
5fe4d4
+    grep '^__pmLogChangeVol:' $tmp.err
5fe4d4
+    _filter <$tmp.out
5fe4d4
     [ -f die ] && exit
5fe4d4
     mv foo.0 ok-mv-bar.0
5fe4d4
     mv foo.2 ok-mv-bar.2
5fe4d4
diff --git a/src/libpcp/src/fetch.c b/src/libpcp/src/fetch.c
5fe4d4
index 5328a2807..01d5bf7fc 100644
5fe4d4
--- a/src/libpcp/src/fetch.c
5fe4d4
+++ b/src/libpcp/src/fetch.c
5fe4d4
@@ -458,6 +458,7 @@ pmSetMode(int mode, const struct timeval *when, int delta)
5fe4d4
 	    /* assume PM_CONTEXT_ARCHIVE */
5fe4d4
 	    if (l_mode == PM_MODE_INTERP ||
5fe4d4
 		l_mode == PM_MODE_FORW || l_mode == PM_MODE_BACK) {
5fe4d4
+		int	lsts;
5fe4d4
 		if (when != NULL) {
5fe4d4
 		    /*
5fe4d4
 		     * special case of NULL for timestamp
5fe4d4
@@ -468,7 +469,18 @@ pmSetMode(int mode, const struct timeval *when, int delta)
5fe4d4
 		}
5fe4d4
 		ctxp->c_mode = mode;
5fe4d4
 		ctxp->c_delta = delta;
5fe4d4
-		__pmLogSetTime(ctxp);
5fe4d4
+		lsts = __pmLogSetTime(ctxp);
5fe4d4
+		if (lsts < 0) {
5fe4d4
+		    /*
5fe4d4
+		     * most unlikely; not much we can do here but expect
5fe4d4
+		     * PMAPI error to be returned once pmFetch's start
5fe4d4
+		     */
5fe4d4
+		    if (pmDebugOptions.log) {
5fe4d4
+			char	errmsg[PM_MAXERRMSGLEN];
5fe4d4
+			fprintf(stderr, "pmSetMode: __pmLogSetTime failed: %s\n",
5fe4d4
+			    pmErrStr_r(lsts, errmsg, sizeof(errmsg)));
5fe4d4
+		    }
5fe4d4
+		}
5fe4d4
 		__pmLogResetInterp(ctxp);
5fe4d4
 		sts = 0;
5fe4d4
 	    }
5fe4d4
diff --git a/src/libpcp/src/internal.h b/src/libpcp/src/internal.h
5fe4d4
index 977efdcf6..fd8d6e740 100644
5fe4d4
--- a/src/libpcp/src/internal.h
5fe4d4
+++ b/src/libpcp/src/internal.h
5fe4d4
@@ -407,7 +407,7 @@ extern int __pmLogGenerateMark(__pmLogCtl *, int, pmResult **) _PCP_HIDDEN;
5fe4d4
 extern int __pmLogFetchInterp(__pmContext *, int, pmID *, pmResult **) _PCP_HIDDEN;
5fe4d4
 extern int __pmGetArchiveLabel(__pmLogCtl *, pmLogLabel *) _PCP_HIDDEN;
5fe4d4
 extern pmTimeval *__pmLogStartTime(__pmArchCtl *) _PCP_HIDDEN;
5fe4d4
-extern void __pmLogSetTime(__pmContext *) _PCP_HIDDEN;
5fe4d4
+extern int __pmLogSetTime(__pmContext *) _PCP_HIDDEN;
5fe4d4
 extern void __pmLogResetInterp(__pmContext *) _PCP_HIDDEN;
5fe4d4
 extern void __pmArchCtlFree(__pmArchCtl *) _PCP_HIDDEN;
5fe4d4
 extern int __pmLogChangeArchive(__pmContext *, int) _PCP_HIDDEN;
5fe4d4
diff --git a/src/libpcp/src/logutil.c b/src/libpcp/src/logutil.c
5fe4d4
index 0ef76de25..2ea559bfe 100644
5fe4d4
--- a/src/libpcp/src/logutil.c
5fe4d4
+++ b/src/libpcp/src/logutil.c
5fe4d4
@@ -1995,7 +1995,6 @@ __pmLogFetch(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **result)
5fe4d4
     sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
5fe4d4
     if (sts < 0)
5fe4d4
 	goto func_return;
5fe4d4
-    assert(ctxp->c_archctl->ac_mfp != NULL);
5fe4d4
     __pmFseek(ctxp->c_archctl->ac_mfp, 
5fe4d4
 	    (long)ctxp->c_archctl->ac_offset, SEEK_SET);
5fe4d4
 
5fe4d4
@@ -2010,7 +2009,9 @@ more:
5fe4d4
 	     * no serial access, so need to make sure we are
5fe4d4
 	     * starting in the correct place
5fe4d4
 	     */
5fe4d4
-	    __pmLogSetTime(ctxp);
5fe4d4
+	    sts = __pmLogSetTime(ctxp);
5fe4d4
+	    if (sts < 0)
5fe4d4
+		goto func_return;
5fe4d4
 	    ctxp->c_archctl->ac_offset = __pmFtell(ctxp->c_archctl->ac_mfp);
5fe4d4
 	    ctxp->c_archctl->ac_vol = ctxp->c_archctl->ac_curvol;
5fe4d4
 	    /*
5fe4d4
@@ -2299,7 +2300,7 @@ VolSkip(__pmArchCtl *acp, int mode,  int j)
5fe4d4
     return PM_ERR_EOL;
5fe4d4
 }
5fe4d4
 
5fe4d4
-void
5fe4d4
+int
5fe4d4
 __pmLogSetTime(__pmContext *ctxp)
5fe4d4
 {
5fe4d4
     __pmArchCtl	*acp = ctxp->c_archctl;
5fe4d4
@@ -2356,6 +2357,7 @@ __pmLogSetTime(__pmContext *ctxp)
5fe4d4
     if (lcp->l_numti) {
5fe4d4
 	/* we have a temporal index, use it! */
5fe4d4
 	int		j = -1;
5fe4d4
+	int		try;
5fe4d4
 	int		toobig = 0;
5fe4d4
 	int		match = 0;
5fe4d4
 	int		vol;
5fe4d4
@@ -2406,9 +2408,13 @@ __pmLogSetTime(__pmContext *ctxp)
5fe4d4
 	acp->ac_serial = 1;
5fe4d4
 
5fe4d4
 	if (match) {
5fe4d4
+	    try = j;
5fe4d4
 	    j = VolSkip(acp, mode, j);
5fe4d4
-	    if (j < 0)
5fe4d4
-		return;
5fe4d4
+	    if (j < 0) {
5fe4d4
+		if (pmDebugOptions.log)
5fe4d4
+		    fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #1\n", mode, try);
5fe4d4
+		return PM_ERR_LOGFILE;
5fe4d4
+	    }
5fe4d4
 	    __pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
5fe4d4
 	    if (mode == PM_MODE_BACK)
5fe4d4
 		acp->ac_serial = 0;
5fe4d4
@@ -2418,9 +2424,13 @@ __pmLogSetTime(__pmContext *ctxp)
5fe4d4
 	    }
5fe4d4
 	}
5fe4d4
 	else if (j < 1) {
5fe4d4
+	    try = 0;
5fe4d4
 	    j = VolSkip(acp, PM_MODE_FORW, 0);
5fe4d4
-	    if (j < 0)
5fe4d4
-		return;
5fe4d4
+	    if (j < 0) {
5fe4d4
+		if (pmDebugOptions.log)
5fe4d4
+		    fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #2\n", PM_MODE_FORW, try);
5fe4d4
+		return PM_ERR_LOGFILE;
5fe4d4
+	    }
5fe4d4
 	    __pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
5fe4d4
 	    if (pmDebugOptions.log) {
5fe4d4
 		fprintf(stderr, " before start ti@");
5fe4d4
@@ -2428,9 +2438,13 @@ __pmLogSetTime(__pmContext *ctxp)
5fe4d4
 	    }
5fe4d4
 	}
5fe4d4
 	else if (j == numti) {
5fe4d4
+	    try = numti-1;
5fe4d4
 	    j = VolSkip(acp, PM_MODE_BACK, numti-1);
5fe4d4
-	    if (j < 0)
5fe4d4
-		return;
5fe4d4
+	    if (j < 0) {
5fe4d4
+		if (pmDebugOptions.log)
5fe4d4
+		    fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #3\n", PM_MODE_BACK, try);
5fe4d4
+		return PM_ERR_LOGFILE;
5fe4d4
+	    }
5fe4d4
 	    __pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
5fe4d4
 	    if (mode == PM_MODE_BACK)
5fe4d4
 		acp->ac_serial = 0;
5fe4d4
@@ -2450,9 +2464,13 @@ __pmLogSetTime(__pmContext *ctxp)
5fe4d4
 	    t_hi = __pmTimevalSub(&lcp->l_ti[j].ti_stamp, &ctxp->c_origin);
5fe4d4
 	    t_lo = __pmTimevalSub(&ctxp->c_origin, &lcp->l_ti[j-1].ti_stamp);
5fe4d4
 	    if (t_hi <= t_lo && !toobig) {
5fe4d4
+		try = j;
5fe4d4
 		j = VolSkip(acp, mode, j);
5fe4d4
-		if (j < 0)
5fe4d4
-		    return;
5fe4d4
+		if (j < 0) {
5fe4d4
+		    if (pmDebugOptions.log)
5fe4d4
+			fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #4\n", mode, try);
5fe4d4
+		    return PM_ERR_LOGFILE;
5fe4d4
+		}
5fe4d4
 		__pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
5fe4d4
 		if (mode == PM_MODE_FORW)
5fe4d4
 		    acp->ac_serial = 0;
5fe4d4
@@ -2462,9 +2480,13 @@ __pmLogSetTime(__pmContext *ctxp)
5fe4d4
 		}
5fe4d4
 	    }
5fe4d4
 	    else {
5fe4d4
+		try = j-1;
5fe4d4
 		j = VolSkip(acp, mode, j-1);
5fe4d4
-		if (j < 0)
5fe4d4
-		    return;
5fe4d4
+		if (j < 0) {
5fe4d4
+		    if (pmDebugOptions.log)
5fe4d4
+			fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #5\n", mode, try);
5fe4d4
+		    return PM_ERR_LOGFILE;
5fe4d4
+		}
5fe4d4
 		__pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
5fe4d4
 		if (mode == PM_MODE_BACK)
5fe4d4
 		    acp->ac_serial = 0;
5fe4d4
@@ -2490,14 +2512,37 @@ __pmLogSetTime(__pmContext *ctxp)
5fe4d4
     }
5fe4d4
     else {
5fe4d4
 	/* index either not available, or not useful */
5fe4d4
+	int	j;
5fe4d4
 	if (mode == PM_MODE_FORW) {
5fe4d4
-	    __pmLogChangeVol(acp, lcp->l_minvol);
5fe4d4
-	    assert(acp->ac_mfp != NULL);
5fe4d4
+	    for (j = lcp->l_minvol; j <= lcp->l_maxvol; j++) {
5fe4d4
+		if (__pmLogChangeVol(acp, j) >= 0)
5fe4d4
+		    break;
5fe4d4
+	    }
5fe4d4
+	    if (j > lcp->l_maxvol) {
5fe4d4
+		/* no volume found */
5fe4d4
+		if (pmDebugOptions.log)
5fe4d4
+		    fprintf(stderr, " index not useful, no volume between %d...%d\n",
5fe4d4
+			    lcp->l_minvol, lcp->l_maxvol);
5fe4d4
+		acp->ac_curvol = -1;
5fe4d4
+		acp->ac_mfp = NULL;
5fe4d4
+		return PM_ERR_LOGFILE;
5fe4d4
+	    }
5fe4d4
 	    __pmFseek(acp->ac_mfp, (long)(sizeof(__pmLogLabel) + 2*sizeof(int)), SEEK_SET);
5fe4d4
 	}
5fe4d4
 	else if (mode == PM_MODE_BACK) {
5fe4d4
-	    __pmLogChangeVol(acp, lcp->l_maxvol);
5fe4d4
-	    assert(acp->ac_mfp != NULL);
5fe4d4
+	    for (j = lcp->l_maxvol; j >= lcp->l_minvol; j--) {
5fe4d4
+		if (__pmLogChangeVol(acp, j) >= 0)
5fe4d4
+		    break;
5fe4d4
+	    }
5fe4d4
+	    if (j < lcp->l_minvol) {
5fe4d4
+		/* no volume found */
5fe4d4
+		if (pmDebugOptions.log)
5fe4d4
+		    fprintf(stderr, " index not useful, no volume between %d...%d\n",
5fe4d4
+			    lcp->l_maxvol, lcp->l_minvol);
5fe4d4
+		acp->ac_curvol = -1;
5fe4d4
+		acp->ac_mfp = NULL;
5fe4d4
+		return PM_ERR_LOGFILE;
5fe4d4
+	    }
5fe4d4
 	    __pmFseek(acp->ac_mfp, (long)0, SEEK_END);
5fe4d4
 	}
5fe4d4
 
5fe4d4
@@ -2513,6 +2558,8 @@ __pmLogSetTime(__pmContext *ctxp)
5fe4d4
     acp->ac_offset = __pmFtell(acp->ac_mfp);
5fe4d4
     assert(acp->ac_offset >= 0);
5fe4d4
     acp->ac_vol = acp->ac_curvol;
5fe4d4
+
5fe4d4
+    return 0;
5fe4d4
 }
5fe4d4
 
5fe4d4
 /* Read the label of the current archive. */
5fe4d4
@@ -3100,6 +3147,7 @@ LogChangeToPreviousArchive(__pmContext *ctxp)
5fe4d4
     pmTimeval		save_origin;
5fe4d4
     int			save_mode;
5fe4d4
     int			sts;
5fe4d4
+    int			j;
5fe4d4
 
5fe4d4
     /*
5fe4d4
      * Check whether there is a previous archive to switch to.
5fe4d4
@@ -3145,12 +3193,23 @@ LogChangeToPreviousArchive(__pmContext *ctxp)
5fe4d4
     }
5fe4d4
 
5fe4d4
     /* Set up to scan backwards from the end of the archive. */
5fe4d4
-    __pmLogChangeVol(acp, lcp->l_maxvol);
5fe4d4
-    assert(acp->ac_mfp != NULL);
5fe4d4
+    for (j = lcp->l_maxvol; j >= lcp->l_minvol; j--) {
5fe4d4
+	if (__pmLogChangeVol(acp, j) >= 0)
5fe4d4
+	    break;
5fe4d4
+    }
5fe4d4
+    if (j < lcp->l_minvol) {
5fe4d4
+	/* no volume found */
5fe4d4
+	if (pmDebugOptions.log)
5fe4d4
+	    fprintf(stderr, "LogChangeToPreviousArchive: no volume between %d...%d\n",
5fe4d4
+		    lcp->l_maxvol, lcp->l_minvol);
5fe4d4
+	acp->ac_curvol = -1;
5fe4d4
+	acp->ac_mfp = NULL;
5fe4d4
+	return PM_ERR_LOGFILE;
5fe4d4
+    }
5fe4d4
     __pmFseek(acp->ac_mfp, (long)0, SEEK_END);
5fe4d4
-    ctxp->c_archctl->ac_offset = __pmFtell(acp->ac_mfp);
5fe4d4
-    assert(ctxp->c_archctl->ac_offset >= 0);
5fe4d4
-    ctxp->c_archctl->ac_vol = ctxp->c_archctl->ac_curvol;
5fe4d4
+    acp->ac_offset = __pmFtell(acp->ac_mfp);
5fe4d4
+    assert(acp->ac_offset >= 0);
5fe4d4
+    acp->ac_vol = acp->ac_curvol;
5fe4d4
 
5fe4d4
     /*
5fe4d4
      * Check for temporal overlap here. Do this last in case the API client
5fe4d4
5fe4d4
commit 5e3b792d3d8ae60f2cebbd51c37b9b0722c3b26e
5fe4d4
Author: Mark Goodwin <mgoodwin@redhat.com>
5fe4d4
Date:   Tue Jul 6 20:09:28 2021 +1000
5fe4d4
5fe4d4
    libpcp_web: plug mem leak in redisMapInsert during daily log-rolling
5fe4d4
    
5fe4d4
    When pmlogger_daily processes daily archives, the resulting
5fe4d4
    merged archive(s) are discovered and processed by pmproxy
5fe4d4
    (if the discovery module is enabled). Since the metadata and
5fe4d4
    logvol data in each merged archive is likely to have already
5fe4d4
    been previously processed (but discovery doesn't know this),
5fe4d4
    we see a lot of dict updates for existing keys and values that
5fe4d4
    are already mapped.
5fe4d4
    
5fe4d4
    Static analysis by Coverity (CID323605 Resource Leak) shows
5fe4d4
    when redisMapInsert calls dictAdd for an existing key, the
5fe4d4
    new value field is assigned but the old value is not free'd,
5fe4d4
    and so it leaks.
5fe4d4
    
5fe4d4
    Related: RHBZ1975069 and https://github.com/performancecopilot/pcp/issues/1318
5fe4d4
5fe4d4
diff --git a/src/libpcp_web/src/maps.c b/src/libpcp_web/src/maps.c
5fe4d4
index 013ef02d3..ce20476c9 100644
5fe4d4
--- a/src/libpcp_web/src/maps.c
5fe4d4
+++ b/src/libpcp_web/src/maps.c
5fe4d4
@@ -160,6 +160,12 @@ redisMapLookup(redisMap *map, sds key)
5fe4d4
 void
5fe4d4
 redisMapInsert(redisMap *map, sds key, sds value)
5fe4d4
 {
5fe4d4
+    redisMapEntry *entry = redisMapLookup(map, key);
5fe4d4
+
5fe4d4
+    if (entry) {
5fe4d4
+	/* fix for Coverity CID323605 Resource Leak */
5fe4d4
+	dictDelete(map, key);
5fe4d4
+    }
5fe4d4
     dictAdd(map, key, value);
5fe4d4
 }
5fe4d4
 
5fe4d4
5fe4d4
commit 2a00a90b0bc3aecb8465fd32aef1ddbe745b2c91
5fe4d4
Author: Mark Goodwin <mgoodwin@redhat.com>
5fe4d4
Date:   Tue Jul 6 20:43:01 2021 +1000
5fe4d4
5fe4d4
    libpcp_web/discovery: improve lock handling and scalability
5fe4d4
    
5fe4d4
    Rework the global log-rolling lock detection with finer grain
5fe4d4
    (per-pmlogger directory) detection, and break early in
5fe4d4
    process_meta() and process_logvol() if a lock file is found
5fe4d4
    in the same directory as a monitored archive. This is much
5fe4d4
    more scalable since archive directories that are not locked
5fe4d4
    can continue to be processed and ingested whilst log-rolling
5fe4d4
    progresses elsewhere. Also uses much less CPU time since we
5fe4d4
    don't need a traversal of all monitored archives looking for
5fe4d4
    locks on every fs change event.
5fe4d4
    
5fe4d4
    Also improve process_logvol/meta handling for archives that
5fe4d4
    are deleted whilst being processed by the discovery module.
5fe4d4
    In conjunction with Kenj's changes in libpcp - stop processing
5fe4d4
    metadata and logvols if pmFetchArchive returns -ENOENT .. the
5fe4d4
    archive has been deleted so there is no point further ingesting
5fe4d4
    it's data.
5fe4d4
    
5fe4d4
    Related: RHBZ1975069
5fe4d4
    Related: https://github.com/performancecopilot/pcp/issues/1338
5fe4d4
5fe4d4
diff --git a/src/libpcp_web/src/discover.c b/src/libpcp_web/src/discover.c
5fe4d4
index 991055ce5..964813f66 100644
5fe4d4
--- a/src/libpcp_web/src/discover.c
5fe4d4
+++ b/src/libpcp_web/src/discover.c
5fe4d4
@@ -33,9 +33,6 @@ static char *pmDiscoverFlagsStr(pmDiscover *);
5fe4d4
 #define PM_DISCOVER_HASHTAB_SIZE 32
5fe4d4
 static pmDiscover *discover_hashtable[PM_DISCOVER_HASHTAB_SIZE];
5fe4d4
 
5fe4d4
-/* pmlogger_daily log-roll lock count */
5fe4d4
-static int logrolling = 0;
5fe4d4
-
5fe4d4
 /* number of archives or directories currently being monitored */
5fe4d4
 static int n_monitored = 0;
5fe4d4
 
5fe4d4
@@ -426,28 +423,6 @@ is_deleted(pmDiscover *p, struct stat *sbuf)
5fe4d4
     return ret;
5fe4d4
 }
5fe4d4
 
5fe4d4
-static int
5fe4d4
-check_for_locks()
5fe4d4
-{
5fe4d4
-    int			i;
5fe4d4
-    pmDiscover		*p;
5fe4d4
-    char                sep = pmPathSeparator();
5fe4d4
-    char                path[MAXNAMELEN];
5fe4d4
-
5fe4d4
-    for (i=0; i < PM_DISCOVER_HASHTAB_SIZE; i++) {
5fe4d4
-    	for (p = discover_hashtable[i]; p; p = p->next) {
5fe4d4
-	    if (p->flags & PM_DISCOVER_FLAGS_DIRECTORY) {
5fe4d4
-		pmsprintf(path, sizeof(path), "%s%c%s", p->context.name, sep, "lock");
5fe4d4
-		if (access(path, F_OK) == 0)
5fe4d4
-		    return 1;
5fe4d4
-	    }
5fe4d4
-	}
5fe4d4
-    }
5fe4d4
-
5fe4d4
-    /* no locks */
5fe4d4
-    return 0;
5fe4d4
-}
5fe4d4
-
5fe4d4
 static void
5fe4d4
 check_deleted(pmDiscover *p)
5fe4d4
 {
5fe4d4
@@ -465,37 +440,8 @@ fs_change_callBack(uv_fs_event_t *handle, const char *filename, int events, int
5fe4d4
     pmDiscover		*p;
5fe4d4
     char		*s;
5fe4d4
     sds			path;
5fe4d4
-    int			locksfound = 0;
5fe4d4
     struct stat		statbuf;
5fe4d4
 
5fe4d4
-    /*
5fe4d4
-     * check if logs are currently being rolled by pmlogger_daily et al
5fe4d4
-     * in any of the directories we are tracking. For mutex, the log control
5fe4d4
-     * scripts use a 'lock' file in each directory as it is processed.
5fe4d4
-     */
5fe4d4
-    locksfound = check_for_locks();
5fe4d4
-
5fe4d4
-    if (!logrolling && locksfound) {
5fe4d4
-	/* log-rolling has started */
5fe4d4
-	if (pmDebugOptions.discovery)
5fe4d4
-	    fprintf(stderr, "%s discovery callback: log-rolling in progress\n", stamp());
5fe4d4
-	logrolling = locksfound;
5fe4d4
-	return;
5fe4d4
-    }
5fe4d4
-
5fe4d4
-    if (logrolling && locksfound) {
5fe4d4
-	logrolling = locksfound;
5fe4d4
-    	return; /* still in progress */
5fe4d4
-    }
5fe4d4
-
5fe4d4
-    if (logrolling && !locksfound) {
5fe4d4
-    	/* log-rolling is finished: check what got deleted, and then purge */
5fe4d4
-	if (pmDebugOptions.discovery)
5fe4d4
-	    fprintf(stderr, "%s discovery callback: finished log-rolling\n", stamp());
5fe4d4
-	pmDiscoverTraverse(PM_DISCOVER_FLAGS_META|PM_DISCOVER_FLAGS_DATAVOL, check_deleted);
5fe4d4
-    }
5fe4d4
-    logrolling = locksfound;
5fe4d4
-
5fe4d4
     uv_fs_event_getpath(handle, buffer, &bytes);
5fe4d4
     path = sdsnewlen(buffer, bytes);
5fe4d4
 
5fe4d4
@@ -1037,6 +983,17 @@ pmDiscoverNewSource(pmDiscover *p, int context)
5fe4d4
     pmDiscoverInvokeSourceCallBacks(p, &timestamp);
5fe4d4
 }
5fe4d4
 
5fe4d4
+static char *
5fe4d4
+archive_dir_lock_path(pmDiscover *p)
5fe4d4
+{
5fe4d4
+    char	path[MAXNAMELEN], lockpath[MAXNAMELEN];
5fe4d4
+    int		sep = pmPathSeparator();
5fe4d4
+
5fe4d4
+    strncpy(path, p->context.name, sizeof(path)-1);
5fe4d4
+    pmsprintf(lockpath, sizeof(lockpath), "%s%c%s", dirname(path), sep, "lock");
5fe4d4
+    return strndup(lockpath, sizeof(lockpath));
5fe4d4
+}
5fe4d4
+
5fe4d4
 /*
5fe4d4
  * Process metadata records until EOF. That can span multiple
5fe4d4
  * callbacks if we get a partial record read.
5fe4d4
@@ -1059,6 +1016,7 @@ process_metadata(pmDiscover *p)
5fe4d4
     __pmLogHdr		hdr;
5fe4d4
     sds			msg, source;
5fe4d4
     static uint32_t	*buf = NULL;
5fe4d4
+    char		*lock_path;
5fe4d4
     int			deleted;
5fe4d4
     struct stat		sbuf;
5fe4d4
     static int		buflen = 0;
5fe4d4
@@ -1073,7 +1031,10 @@ process_metadata(pmDiscover *p)
5fe4d4
 	fprintf(stderr, "process_metadata: %s in progress %s\n",
5fe4d4
 		p->context.name, pmDiscoverFlagsStr(p));
5fe4d4
     pmDiscoverStatsAdd(p->module, "metadata.callbacks", NULL, 1);
5fe4d4
+    lock_path = archive_dir_lock_path(p);
5fe4d4
     for (;;) {
5fe4d4
+	if (lock_path && access(lock_path, F_OK) == 0)
5fe4d4
+	    break;
5fe4d4
 	pmDiscoverStatsAdd(p->module, "metadata.loops", NULL, 1);
5fe4d4
 	off = lseek(p->fd, 0, SEEK_CUR);
5fe4d4
 	nb = read(p->fd, &hdr, sizeof(__pmLogHdr));
5fe4d4
@@ -1240,6 +1201,9 @@ process_metadata(pmDiscover *p)
5fe4d4
 	/* flag that all available metadata has now been read */
5fe4d4
 	p->flags &= ~PM_DISCOVER_FLAGS_META_IN_PROGRESS;
5fe4d4
 
5fe4d4
+    if (lock_path)
5fe4d4
+    	free(lock_path);
5fe4d4
+
5fe4d4
     if (pmDebugOptions.discovery)
5fe4d4
 	fprintf(stderr, "%s: completed, partial=%d %s %s\n",
5fe4d4
 			"process_metadata", partial, p->context.name, pmDiscoverFlagsStr(p));
5fe4d4
@@ -1266,14 +1230,18 @@ static void
5fe4d4
 process_logvol(pmDiscover *p)
5fe4d4
 {
5fe4d4
     int			sts;
5fe4d4
-    pmResult		*r;
5fe4d4
+    pmResult		*r = NULL;
5fe4d4
     pmTimespec		ts;
5fe4d4
     int			oldcurvol;
5fe4d4
     __pmContext		*ctxp;
5fe4d4
     __pmArchCtl		*acp;
5fe4d4
+    char		*lock_path;
5fe4d4
 
5fe4d4
     pmDiscoverStatsAdd(p->module, "logvol.callbacks", NULL, 1);
5fe4d4
+    lock_path = archive_dir_lock_path(p);
5fe4d4
     for (;;) {
5fe4d4
+	if (lock_path && access(lock_path, F_OK) == 0)
5fe4d4
+	    break;
5fe4d4
 	pmDiscoverStatsAdd(p->module, "logvol.loops", NULL, 1);
5fe4d4
 	pmUseContext(p->ctx);
5fe4d4
 	ctxp = __pmHandleToPtr(p->ctx);
5fe4d4
@@ -1312,6 +1280,7 @@ process_logvol(pmDiscover *p)
5fe4d4
 	    }
5fe4d4
 
5fe4d4
 	    /* we are done - return and wait for another callback */
5fe4d4
+	    r = NULL;
5fe4d4
 	    break;
5fe4d4
 	}
5fe4d4
 
5fe4d4
@@ -1328,14 +1297,15 @@ process_logvol(pmDiscover *p)
5fe4d4
 	}
5fe4d4
 
5fe4d4
 	/*
5fe4d4
-	 * TODO: persistently save current timestamp, so after being restarted,
5fe4d4
-	 * pmproxy can resume where it left off for each archive.
5fe4d4
+	 * TODO (perhaps): persistently save current timestamp, so after being
5fe4d4
+	 * restarted, pmproxy can resume where it left off for each archive.
5fe4d4
 	 */
5fe4d4
 	ts.tv_sec = r->timestamp.tv_sec;
5fe4d4
 	ts.tv_nsec = r->timestamp.tv_usec * 1000;
5fe4d4
 	bump_logvol_decode_stats(p, r);
5fe4d4
 	pmDiscoverInvokeValuesCallBack(p, &ts, r);
5fe4d4
 	pmFreeResult(r);
5fe4d4
+	r = NULL;
5fe4d4
     }
5fe4d4
 
5fe4d4
     if (r) {
5fe4d4
@@ -1348,6 +1318,9 @@ process_logvol(pmDiscover *p)
5fe4d4
 
5fe4d4
     /* datavol is now up-to-date and at EOF */
5fe4d4
     p->flags &= ~PM_DISCOVER_FLAGS_DATAVOL_READY;
5fe4d4
+
5fe4d4
+    if (lock_path)
5fe4d4
+    	free(lock_path);
5fe4d4
 }
5fe4d4
 
5fe4d4
 static void
5fe4d4
@@ -1357,6 +1330,10 @@ pmDiscoverInvokeCallBacks(pmDiscover *p)
5fe4d4
     sds			msg;
5fe4d4
     sds			metaname;
5fe4d4
 
5fe4d4
+    check_deleted(p);
5fe4d4
+    if (p->flags & PM_DISCOVER_FLAGS_DELETED)
5fe4d4
+    	return; /* ignore deleted archive */
5fe4d4
+
5fe4d4
     if (p->ctx < 0) {
5fe4d4
 	/*
5fe4d4
 	 * once off initialization on the first event
5fe4d4
@@ -1366,16 +1343,23 @@ pmDiscoverInvokeCallBacks(pmDiscover *p)
5fe4d4
 
5fe4d4
 	    /* create the PMAPI context (once off) */
5fe4d4
 	    if ((sts = pmNewContext(p->context.type, p->context.name)) < 0) {
5fe4d4
-		/*
5fe4d4
-		 * Likely an early callback on a new (still empty) archive.
5fe4d4
-		 * If so, just ignore the callback and don't log any scary
5fe4d4
-		 * looking messages. We'll get another CB soon.
5fe4d4
-		 */
5fe4d4
-		if (sts != PM_ERR_NODATA || pmDebugOptions.desperate) {
5fe4d4
-		    infofmt(msg, "pmNewContext failed for %s: %s\n",
5fe4d4
-			    p->context.name, pmErrStr(sts));
5fe4d4
-		    moduleinfo(p->module, PMLOG_ERROR, msg, p->data);
5fe4d4
+		if (sts == -ENOENT) {
5fe4d4
+		    /* newly deleted archive */
5fe4d4
+		    p->flags |= PM_DISCOVER_FLAGS_DELETED;
5fe4d4
 		}
5fe4d4
+		else {
5fe4d4
+		    /*
5fe4d4
+		     * Likely an early callback on a new (still empty) archive.
5fe4d4
+		     * If so, just ignore the callback and don't log any scary
5fe4d4
+		     * looking messages. We'll get another CB soon.
5fe4d4
+		     */
5fe4d4
+		    if (sts != PM_ERR_NODATA || pmDebugOptions.desperate) {
5fe4d4
+			infofmt(msg, "pmNewContext failed for %s: %s\n",
5fe4d4
+				p->context.name, pmErrStr(sts));
5fe4d4
+			moduleinfo(p->module, PMLOG_ERROR, msg, p->data);
5fe4d4
+		    }
5fe4d4
+		}
5fe4d4
+		/* no further processing for this archive */
5fe4d4
 		return;
5fe4d4
 	    }
5fe4d4
 	    pmDiscoverStatsAdd(p->module, "logvol.new_contexts", NULL, 1);
5fe4d4
@@ -1410,8 +1394,12 @@ pmDiscoverInvokeCallBacks(pmDiscover *p)
5fe4d4
 	    metaname = sdsnew(p->context.name);
5fe4d4
 	    metaname = sdscat(metaname, ".meta");
5fe4d4
 	    if ((p->fd = open(metaname, O_RDONLY)) < 0) {
5fe4d4
-		infofmt(msg, "open failed for %s: %s\n", metaname, osstrerror());
5fe4d4
-		moduleinfo(p->module, PMLOG_ERROR, msg, p->data);
5fe4d4
+		if (p->fd == -ENOENT)
5fe4d4
+		    p->flags |= PM_DISCOVER_FLAGS_DELETED;
5fe4d4
+		else {
5fe4d4
+		    infofmt(msg, "open failed for %s: %s\n", metaname, osstrerror());
5fe4d4
+		    moduleinfo(p->module, PMLOG_ERROR, msg, p->data);
5fe4d4
+		}
5fe4d4
 		sdsfree(metaname);
5fe4d4
 		return;
5fe4d4
 	    }