Blame SOURCES/redhat-bugzilla-1975069.patch

3e5d4c
3f5ba2218 libpcp_web: add mutex to struct webgroup protecting the context dict
3e5d4c
107633192 src/libpcp: be more careful when calling __pmLogChangeVol()
3e5d4c
49bdfdfff libpcp: redefine __pmLogSetTime()
3e5d4c
5e3b792d3 libpcp_web: plug mem leak in redisMapInsert during daily log-rolling
3e5d4c
2a00a90b0 libpcp_web/discovery: improve lock handling and scalability
3e5d4c
3e5d4c
commit 3f5ba221842e6a02e9fb22e23c754854271c3c9a
3e5d4c
Author: Mark Goodwin <mgoodwin@redhat.com>
3e5d4c
Date:   Wed Jun 9 16:44:30 2021 +1000
3e5d4c
3e5d4c
    libpcp_web: add mutex to struct webgroup protecting the context dict
3e5d4c
    
3e5d4c
    Add a mutex to the local webgroups structure in libpcp_web and
3e5d4c
    use it to protect multithreaded parallel updates (dictAdd,
3e5d4c
    dictDelete) to the groups->contexts dict and the dict traversal
3e5d4c
    in the timer driven garbage collector.
3e5d4c
    
3e5d4c
    Tested by qa/297 and related tests and also an updated version
3e5d4c
    of qa/1457 (which now stress tests parallel http and https/tls
3e5d4c
    pmproxy RESTAPI calls .. in a later commit).
3e5d4c
    
3e5d4c
    Related: RHBZ#1947989
3e5d4c
    Resolves: https://github.com/performancecopilot/pcp/issues/1311
3e5d4c
3e5d4c
diff --git a/src/libpcp_web/src/webgroup.c b/src/libpcp_web/src/webgroup.c
3e5d4c
index 08c2518ed..35f05441b 100644
3e5d4c
--- a/src/libpcp_web/src/webgroup.c
3e5d4c
+++ b/src/libpcp_web/src/webgroup.c
3e5d4c
@@ -51,14 +51,20 @@ typedef struct webgroups {
3e5d4c
     uv_loop_t		*events;
3e5d4c
     unsigned int	active;
3e5d4c
     uv_timer_t		timer;
3e5d4c
+    uv_mutex_t		mutex;
3e5d4c
 } webgroups;
3e5d4c
 
3e5d4c
 static struct webgroups *
3e5d4c
 webgroups_lookup(pmWebGroupModule *module)
3e5d4c
 {
3e5d4c
-    if (module->privdata == NULL)
3e5d4c
+    struct webgroups *groups = module->privdata;
3e5d4c
+
3e5d4c
+    if (module->privdata == NULL) {
3e5d4c
 	module->privdata = calloc(1, sizeof(struct webgroups));
3e5d4c
-    return (struct webgroups *)module->privdata;
3e5d4c
+	groups = (struct webgroups *)module->privdata;
3e5d4c
+	uv_mutex_init(&groups->mutex);
3e5d4c
+    }
3e5d4c
+    return groups;
3e5d4c
 }
3e5d4c
 
3e5d4c
 static int
3e5d4c
@@ -94,8 +100,11 @@ webgroup_drop_context(struct context *context, struct webgroups *groups)
3e5d4c
 	    context->garbage = 1;
3e5d4c
 	    uv_timer_stop(&context->timer);
3e5d4c
 	}
3e5d4c
-	if (groups)
3e5d4c
+	if (groups) {
3e5d4c
+	    uv_mutex_lock(&groups->mutex);
3e5d4c
 	    dictDelete(groups->contexts, &context->randomid);
3e5d4c
+	    uv_mutex_unlock(&groups->mutex);
3e5d4c
+	}
3e5d4c
 	uv_close((uv_handle_t *)&context->timer, webgroup_release_context);
3e5d4c
     }
3e5d4c
 }
3e5d4c
@@ -207,13 +216,16 @@ webgroup_new_context(pmWebGroupSettings *sp, dict *params,
3e5d4c
     cp->context = -1;
3e5d4c
     cp->timeout = polltime;
3e5d4c
 
3e5d4c
+    uv_mutex_lock(&groups->mutex);
3e5d4c
     if ((cp->randomid = random()) < 0 ||
3e5d4c
 	dictFind(groups->contexts, &cp->randomid) != NULL) {
3e5d4c
 	infofmt(*message, "random number failure on new web context");
3e5d4c
 	pmwebapi_free_context(cp);
3e5d4c
 	*status = -ESRCH;
3e5d4c
+	uv_mutex_unlock(&groups->mutex);
3e5d4c
 	return NULL;
3e5d4c
     }
3e5d4c
+    uv_mutex_unlock(&groups->mutex);
3e5d4c
     cp->origin = sdscatfmt(sdsempty(), "%i", cp->randomid);
3e5d4c
     cp->name.sds = sdsdup(hostspec ? hostspec : LOCALHOST);
3e5d4c
     cp->realm = sdscatfmt(sdsempty(), "pmapi/%i", cp->randomid);
3e5d4c
@@ -242,7 +254,9 @@ webgroup_new_context(pmWebGroupSettings *sp, dict *params,
3e5d4c
 	pmwebapi_free_context(cp);
3e5d4c
 	return NULL;
3e5d4c
     }
3e5d4c
+    uv_mutex_lock(&groups->mutex);
3e5d4c
     dictAdd(groups->contexts, &cp->randomid, cp);
3e5d4c
+    uv_mutex_unlock(&groups->mutex);
3e5d4c
 
3e5d4c
     /* leave until the end because uv_timer_init makes this visible in uv_run */
3e5d4c
     handle = (uv_handle_t *)&cp->timer;
3e5d4c
@@ -261,25 +275,34 @@ webgroup_new_context(pmWebGroupSettings *sp, dict *params,
3e5d4c
 static void
3e5d4c
 webgroup_garbage_collect(struct webgroups *groups)
3e5d4c
 {
3e5d4c
-    dictIterator        *iterator = dictGetSafeIterator(groups->contexts);
3e5d4c
+    dictIterator        *iterator;
3e5d4c
     dictEntry           *entry;
3e5d4c
     context_t		*cp;
3e5d4c
 
3e5d4c
     if (pmDebugOptions.http || pmDebugOptions.libweb)
3e5d4c
 	fprintf(stderr, "%s: started\n", "webgroup_garbage_collect");
3e5d4c
 
3e5d4c
-    while ((entry = dictNext(iterator)) != NULL) {
3e5d4c
-	cp = (context_t *)dictGetVal(entry);
3e5d4c
-	if (cp->garbage && cp->privdata == groups) {
3e5d4c
-	    if (pmDebugOptions.http || pmDebugOptions.libweb)
3e5d4c
-		fprintf(stderr, "GC context %u (%p)\n", cp->randomid, cp);
3e5d4c
-	    webgroup_drop_context(cp, groups);
3e5d4c
+    /* do context GC if we get the lock (else don't block here) */
3e5d4c
+    if (uv_mutex_trylock(&groups->mutex) == 0) {
3e5d4c
+	iterator = dictGetSafeIterator(groups->contexts);
3e5d4c
+	for (entry = dictNext(iterator); entry;) {
3e5d4c
+	    cp = (context_t *)dictGetVal(entry);
3e5d4c
+	    entry = dictNext(iterator);
3e5d4c
+	    if (cp->garbage && cp->privdata == groups) {
3e5d4c
+		if (pmDebugOptions.http || pmDebugOptions.libweb)
3e5d4c
+		    fprintf(stderr, "GC context %u (%p)\n", cp->randomid, cp);
3e5d4c
+		uv_mutex_unlock(&groups->mutex);
3e5d4c
+		webgroup_drop_context(cp, groups);
3e5d4c
+		uv_mutex_lock(&groups->mutex);
3e5d4c
+	    }
3e5d4c
 	}
3e5d4c
+	dictReleaseIterator(iterator);
3e5d4c
+	uv_mutex_unlock(&groups->mutex);
3e5d4c
     }
3e5d4c
-    dictReleaseIterator(iterator);
3e5d4c
 
3e5d4c
     /* TODO - trim maps, particularly instmap if proc metrics are not excluded */
3e5d4c
 
3e5d4c
+    /* TODO move the following to a new stats timer */
3e5d4c
     if (groups->metrics_handle) {
3e5d4c
 	mmv_stats_set(groups->metrics_handle, "contextmap.size",
3e5d4c
 	    NULL, dictSize(contextmap));
3e5d4c
3e5d4c
commit 107633192326b27ae571d4d4955052b8d86222c2
3e5d4c
Author: Ken McDonell <kenj@kenj.id.au>
3e5d4c
Date:   Fri Jul 2 16:52:48 2021 +1000
3e5d4c
3e5d4c
    src/libpcp: be more careful when calling __pmLogChangeVol()
3e5d4c
    
3e5d4c
    Mark observed a SEGV which looks like __pmLogFetch() died because
3e5d4c
    ctxp->c_archctl->ac_mfp was (unexpectedly) NULL.
3e5d4c
    
3e5d4c
    See: https://github.com/performancecopilot/pcp/issues/1338
3e5d4c
    
3e5d4c
    Initial guess is that a physical file was removed by concurrent
3e5d4c
    activity (like pmlogger_check or pmlogger_daily), causing
3e5d4c
    __pmLogChangeVol() to fail ... and this was not being checked for
3e5d4c
    on the __pmLogFetch() path and in a couple of other places.
3e5d4c
    
3e5d4c
    modified:   interp.c
3e5d4c
    modified:   logutil.c
3e5d4c
3e5d4c
diff --git a/src/libpcp/src/interp.c b/src/libpcp/src/interp.c
3e5d4c
index d7effbc1e..c8f6fe382 100644
3e5d4c
--- a/src/libpcp/src/interp.c
3e5d4c
+++ b/src/libpcp/src/interp.c
3e5d4c
@@ -1312,7 +1312,9 @@ __pmLogFetchInterp(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **r
3e5d4c
     }
3e5d4c
 
3e5d4c
     /* get to the last remembered place */
3e5d4c
-    __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
3e5d4c
+    sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
3e5d4c
+    if (sts < 0)
3e5d4c
+	goto all_done;
3e5d4c
     __pmFseek(ctxp->c_archctl->ac_mfp, ctxp->c_archctl->ac_offset, SEEK_SET);
3e5d4c
 
3e5d4c
     seen_mark = 0;	/* interested in <mark> records seen from here on */
3e5d4c
@@ -1397,7 +1399,9 @@ __pmLogFetchInterp(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **r
3e5d4c
 	 * at least one metric requires a bound from earlier in the log ...
3e5d4c
 	 * position ourselves, ... and search
3e5d4c
 	 */
3e5d4c
-	__pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
3e5d4c
+	sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
3e5d4c
+	if (sts < 0)
3e5d4c
+	    goto all_done;
3e5d4c
 	__pmFseek(ctxp->c_archctl->ac_mfp, ctxp->c_archctl->ac_offset, SEEK_SET);
3e5d4c
 	done = 0;
3e5d4c
 
3e5d4c
@@ -1542,7 +1546,9 @@ __pmLogFetchInterp(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **r
3e5d4c
 	 * at least one metric requires a bound from later in the log ...
3e5d4c
 	 * position ourselves ... and search
3e5d4c
 	 */
3e5d4c
-	__pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
3e5d4c
+	sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
3e5d4c
+	if (sts < 0)
3e5d4c
+	    goto all_done;
3e5d4c
 	__pmFseek(ctxp->c_archctl->ac_mfp, ctxp->c_archctl->ac_offset, SEEK_SET);
3e5d4c
 	done = 0;
3e5d4c
 
3e5d4c
diff --git a/src/libpcp/src/logutil.c b/src/libpcp/src/logutil.c
3e5d4c
index fe35ed422..0ef76de25 100644
3e5d4c
--- a/src/libpcp/src/logutil.c
3e5d4c
+++ b/src/libpcp/src/logutil.c
3e5d4c
@@ -1992,7 +1992,10 @@ __pmLogFetch(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **result)
3e5d4c
     all_derived = check_all_derived(numpmid, pmidlist);
3e5d4c
 
3e5d4c
     /* re-establish position */
3e5d4c
-    __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
3e5d4c
+    sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
3e5d4c
+    if (sts < 0)
3e5d4c
+	goto func_return;
3e5d4c
+    assert(ctxp->c_archctl->ac_mfp != NULL);
3e5d4c
     __pmFseek(ctxp->c_archctl->ac_mfp, 
3e5d4c
 	    (long)ctxp->c_archctl->ac_offset, SEEK_SET);
3e5d4c
 
3e5d4c
@@ -2489,10 +2492,12 @@ __pmLogSetTime(__pmContext *ctxp)
3e5d4c
 	/* index either not available, or not useful */
3e5d4c
 	if (mode == PM_MODE_FORW) {
3e5d4c
 	    __pmLogChangeVol(acp, lcp->l_minvol);
3e5d4c
+	    assert(acp->ac_mfp != NULL);
3e5d4c
 	    __pmFseek(acp->ac_mfp, (long)(sizeof(__pmLogLabel) + 2*sizeof(int)), SEEK_SET);
3e5d4c
 	}
3e5d4c
 	else if (mode == PM_MODE_BACK) {
3e5d4c
 	    __pmLogChangeVol(acp, lcp->l_maxvol);
3e5d4c
+	    assert(acp->ac_mfp != NULL);
3e5d4c
 	    __pmFseek(acp->ac_mfp, (long)0, SEEK_END);
3e5d4c
 	}
3e5d4c
 
3e5d4c
@@ -3141,6 +3146,7 @@ LogChangeToPreviousArchive(__pmContext *ctxp)
3e5d4c
 
3e5d4c
     /* Set up to scan backwards from the end of the archive. */
3e5d4c
     __pmLogChangeVol(acp, lcp->l_maxvol);
3e5d4c
+    assert(acp->ac_mfp != NULL);
3e5d4c
     __pmFseek(acp->ac_mfp, (long)0, SEEK_END);
3e5d4c
     ctxp->c_archctl->ac_offset = __pmFtell(acp->ac_mfp);
3e5d4c
     assert(ctxp->c_archctl->ac_offset >= 0);
3e5d4c
3e5d4c
commit 49bdfdfff83ac165de2bdc9a40e61a56512585d8
3e5d4c
Author: Ken McDonell <kenj@kenj.id.au>
3e5d4c
Date:   Sun Jul 4 10:07:09 2021 +1000
3e5d4c
3e5d4c
    libpcp: redefine __pmLogSetTime()
3e5d4c
    
3e5d4c
    The problem is that if physical files for the data volumes of an
3e5d4c
    archive are removed (asynchronously by someone else) while we're
3e5d4c
    trying to switch volumes then we don't handle this safely.
3e5d4c
    
3e5d4c
    The previous commit 10763319 was as stop-gap to address Mark's SEGV
3e5d4c
    issue at https://github.com/performancecopilot/pcp/issues/1338 and
3e5d4c
    simply handled direct calls to __pmLogChangeVol() and ensured the
3e5d4c
    return status was checked.
3e5d4c
    
3e5d4c
    I was aware, then Coverity made a lot more people aware, that this
3e5d4c
    "fix" was incomplete, specifically the calls to __pmLogChangeVol()
3e5d4c
    from within __pmLogSetTime() were not checked.
3e5d4c
    
3e5d4c
    To fix the latter we have to change the type of __pmLogSetTime() from
3e5d4c
    void to int so we can return status to indicate that __pmLogChangeVol()
3e5d4c
    has failed.  And then make sure all the callers of __pmLogSetTime()
3e5d4c
    check the status returned from that function.
3e5d4c
    
3e5d4c
        modified:   src/libpcp/src/fetch.c
3e5d4c
        modified:   src/libpcp/src/internal.h
3e5d4c
        modified:   src/libpcp/src/logutil.c
3e5d4c
    
3e5d4c
    Because this introduces some new -Dlog diagnostics, qa/251 needed
3e5d4c
    a bit of a make-over.
3e5d4c
3e5d4c
diff --git a/qa/251 b/qa/251
3e5d4c
index 2b8a07917..f9b293e98 100755
3e5d4c
--- a/qa/251
3e5d4c
+++ b/qa/251
3e5d4c
@@ -37,7 +37,7 @@ _filter()
3e5d4c
 
3e5d4c
 status=1	# failure is the default!
3e5d4c
 $sudo rm -rf $tmp.* $seq.full
3e5d4c
-trap "cd $here; rm -rf $tmp; exit \$status" 0 1 2 3 15
3e5d4c
+trap "cd $here; rm -rf $tmp $tmp.*; exit \$status" 0 1 2 3 15
3e5d4c
 
3e5d4c
 # real QA test starts here
3e5d4c
 mkdir $tmp
3e5d4c
@@ -50,56 +50,62 @@ cd $tmp
3e5d4c
 for inst in "bin-100" "bin-100,bin-500,bin-900"
3e5d4c
 do
3e5d4c
     echo
3e5d4c
-    echo "All volumes present ... $inst ..."
3e5d4c
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
3e5d4c
-    egrep 'Skip|Change' err
3e5d4c
-    _filter 
3e5d4c
+    echo "All volumes present ... $inst ..." | tee -a $here/$seq.full
3e5d4c
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
3e5d4c
+    cat $tmp.err >>$here/$seq.full
3e5d4c
+    grep '^__pmLogChangeVol:' $tmp.err
3e5d4c
+    _filter <$tmp.out
3e5d4c
     [ -f die ] && exit
3e5d4c
 
3e5d4c
     echo
3e5d4c
-    echo "First volume missing ... $inst ..."
3e5d4c
+    echo "First volume missing ... $inst ..." | tee -a $here/$seq.full
3e5d4c
     mv ok-mv-bar.0 foo.0
3e5d4c
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
3e5d4c
-    egrep 'Skip|Change' err
3e5d4c
-    _filter 
3e5d4c
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
3e5d4c
+    cat $tmp.err >>$here/$seq.full
3e5d4c
+    grep '^__pmLogChangeVol:' $tmp.err
3e5d4c
+    _filter <$tmp.out
3e5d4c
     [ -f die ] && exit
3e5d4c
     mv foo.0 ok-mv-bar.0
3e5d4c
 
3e5d4c
     echo
3e5d4c
-    echo "Last volume missing ... $inst ..."
3e5d4c
+    echo "Last volume missing ... $inst ..." | tee -a $here/$seq.full
3e5d4c
     mv ok-mv-bar.3 foo.3
3e5d4c
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
3e5d4c
-    egrep 'Skip|Change' err
3e5d4c
-    _filter 
3e5d4c
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
3e5d4c
+    cat $tmp.err >>$here/$seq.full
3e5d4c
+    grep '^__pmLogChangeVol:' $tmp.err
3e5d4c
+    _filter <$tmp.out
3e5d4c
     [ -f die ] && exit
3e5d4c
     mv foo.3 ok-mv-bar.3
3e5d4c
 
3e5d4c
     echo
3e5d4c
-    echo "Second volume missing ... $inst ..."
3e5d4c
+    echo "Second volume missing ... $inst ..." | tee -a $here/$seq.full
3e5d4c
     mv ok-mv-bar.1 foo.1
3e5d4c
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
3e5d4c
-    egrep 'Skip|Change' err
3e5d4c
-    _filter 
3e5d4c
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
3e5d4c
+    cat $tmp.err >>$here/$seq.full
3e5d4c
+    grep '^__pmLogChangeVol:' $tmp.err
3e5d4c
+    _filter <$tmp.out
3e5d4c
     [ -f die ] && exit
3e5d4c
     mv foo.1 ok-mv-bar.1
3e5d4c
 
3e5d4c
     echo
3e5d4c
-    echo "Second last volume missing ... $inst ..."
3e5d4c
+    echo "Second last volume missing ... $inst ..." | tee -a $here/$seq.full
3e5d4c
     mv ok-mv-bar.2 foo.2
3e5d4c
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
3e5d4c
-    egrep 'Skip|Change' err
3e5d4c
-    _filter 
3e5d4c
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
3e5d4c
+    cat $tmp.err >>$here/$seq.full
3e5d4c
+    grep '^__pmLogChangeVol:' $tmp.err
3e5d4c
+    _filter <$tmp.out
3e5d4c
     [ -f die ] && exit
3e5d4c
     mv foo.2 ok-mv-bar.2
3e5d4c
 
3e5d4c
     echo
3e5d4c
-    echo "All volumes but second missing ... $inst ..."
3e5d4c
+    echo "All volumes but second missing ... $inst ..." | tee -a $here/$seq.full
3e5d4c
     mv ok-mv-bar.0 foo.0
3e5d4c
     mv ok-mv-bar.2 foo.2
3e5d4c
     mv ok-mv-bar.3 foo.3
3e5d4c
-    pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
3e5d4c
-    egrep 'Skip|Change' err
3e5d4c
-    _filter 
3e5d4c
+    pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
3e5d4c
+    cat $tmp.err >>$here/$seq.full
3e5d4c
+    grep '^__pmLogChangeVol:' $tmp.err
3e5d4c
+    _filter <$tmp.out
3e5d4c
     [ -f die ] && exit
3e5d4c
     mv foo.0 ok-mv-bar.0
3e5d4c
     mv foo.2 ok-mv-bar.2
3e5d4c
diff --git a/src/libpcp/src/fetch.c b/src/libpcp/src/fetch.c
3e5d4c
index 5328a2807..01d5bf7fc 100644
3e5d4c
--- a/src/libpcp/src/fetch.c
3e5d4c
+++ b/src/libpcp/src/fetch.c
3e5d4c
@@ -458,6 +458,7 @@ pmSetMode(int mode, const struct timeval *when, int delta)
3e5d4c
 	    /* assume PM_CONTEXT_ARCHIVE */
3e5d4c
 	    if (l_mode == PM_MODE_INTERP ||
3e5d4c
 		l_mode == PM_MODE_FORW || l_mode == PM_MODE_BACK) {
3e5d4c
+		int	lsts;
3e5d4c
 		if (when != NULL) {
3e5d4c
 		    /*
3e5d4c
 		     * special case of NULL for timestamp
3e5d4c
@@ -468,7 +469,18 @@ pmSetMode(int mode, const struct timeval *when, int delta)
3e5d4c
 		}
3e5d4c
 		ctxp->c_mode = mode;
3e5d4c
 		ctxp->c_delta = delta;
3e5d4c
-		__pmLogSetTime(ctxp);
3e5d4c
+		lsts = __pmLogSetTime(ctxp);
3e5d4c
+		if (lsts < 0) {
3e5d4c
+		    /*
3e5d4c
+		     * most unlikely; not much we can do here but expect
3e5d4c
+		     * PMAPI error to be returned once pmFetch's start
3e5d4c
+		     */
3e5d4c
+		    if (pmDebugOptions.log) {
3e5d4c
+			char	errmsg[PM_MAXERRMSGLEN];
3e5d4c
+			fprintf(stderr, "pmSetMode: __pmLogSetTime failed: %s\n",
3e5d4c
+			    pmErrStr_r(lsts, errmsg, sizeof(errmsg)));
3e5d4c
+		    }
3e5d4c
+		}
3e5d4c
 		__pmLogResetInterp(ctxp);
3e5d4c
 		sts = 0;
3e5d4c
 	    }
3e5d4c
diff --git a/src/libpcp/src/internal.h b/src/libpcp/src/internal.h
3e5d4c
index 977efdcf6..fd8d6e740 100644
3e5d4c
--- a/src/libpcp/src/internal.h
3e5d4c
+++ b/src/libpcp/src/internal.h
3e5d4c
@@ -407,7 +407,7 @@ extern int __pmLogGenerateMark(__pmLogCtl *, int, pmResult **) _PCP_HIDDEN;
3e5d4c
 extern int __pmLogFetchInterp(__pmContext *, int, pmID *, pmResult **) _PCP_HIDDEN;
3e5d4c
 extern int __pmGetArchiveLabel(__pmLogCtl *, pmLogLabel *) _PCP_HIDDEN;
3e5d4c
 extern pmTimeval *__pmLogStartTime(__pmArchCtl *) _PCP_HIDDEN;
3e5d4c
-extern void __pmLogSetTime(__pmContext *) _PCP_HIDDEN;
3e5d4c
+extern int __pmLogSetTime(__pmContext *) _PCP_HIDDEN;
3e5d4c
 extern void __pmLogResetInterp(__pmContext *) _PCP_HIDDEN;
3e5d4c
 extern void __pmArchCtlFree(__pmArchCtl *) _PCP_HIDDEN;
3e5d4c
 extern int __pmLogChangeArchive(__pmContext *, int) _PCP_HIDDEN;
3e5d4c
diff --git a/src/libpcp/src/logutil.c b/src/libpcp/src/logutil.c
3e5d4c
index 0ef76de25..2ea559bfe 100644
3e5d4c
--- a/src/libpcp/src/logutil.c
3e5d4c
+++ b/src/libpcp/src/logutil.c
3e5d4c
@@ -1995,7 +1995,6 @@ __pmLogFetch(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **result)
3e5d4c
     sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
3e5d4c
     if (sts < 0)
3e5d4c
 	goto func_return;
3e5d4c
-    assert(ctxp->c_archctl->ac_mfp != NULL);
3e5d4c
     __pmFseek(ctxp->c_archctl->ac_mfp, 
3e5d4c
 	    (long)ctxp->c_archctl->ac_offset, SEEK_SET);
3e5d4c
 
3e5d4c
@@ -2010,7 +2009,9 @@ more:
3e5d4c
 	     * no serial access, so need to make sure we are
3e5d4c
 	     * starting in the correct place
3e5d4c
 	     */
3e5d4c
-	    __pmLogSetTime(ctxp);
3e5d4c
+	    sts = __pmLogSetTime(ctxp);
3e5d4c
+	    if (sts < 0)
3e5d4c
+		goto func_return;
3e5d4c
 	    ctxp->c_archctl->ac_offset = __pmFtell(ctxp->c_archctl->ac_mfp);
3e5d4c
 	    ctxp->c_archctl->ac_vol = ctxp->c_archctl->ac_curvol;
3e5d4c
 	    /*
3e5d4c
@@ -2299,7 +2300,7 @@ VolSkip(__pmArchCtl *acp, int mode,  int j)
3e5d4c
     return PM_ERR_EOL;
3e5d4c
 }
3e5d4c
 
3e5d4c
-void
3e5d4c
+int
3e5d4c
 __pmLogSetTime(__pmContext *ctxp)
3e5d4c
 {
3e5d4c
     __pmArchCtl	*acp = ctxp->c_archctl;
3e5d4c
@@ -2356,6 +2357,7 @@ __pmLogSetTime(__pmContext *ctxp)
3e5d4c
     if (lcp->l_numti) {
3e5d4c
 	/* we have a temporal index, use it! */
3e5d4c
 	int		j = -1;
3e5d4c
+	int		try;
3e5d4c
 	int		toobig = 0;
3e5d4c
 	int		match = 0;
3e5d4c
 	int		vol;
3e5d4c
@@ -2406,9 +2408,13 @@ __pmLogSetTime(__pmContext *ctxp)
3e5d4c
 	acp->ac_serial = 1;
3e5d4c
 
3e5d4c
 	if (match) {
3e5d4c
+	    try = j;
3e5d4c
 	    j = VolSkip(acp, mode, j);
3e5d4c
-	    if (j < 0)
3e5d4c
-		return;
3e5d4c
+	    if (j < 0) {
3e5d4c
+		if (pmDebugOptions.log)
3e5d4c
+		    fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #1\n", mode, try);
3e5d4c
+		return PM_ERR_LOGFILE;
3e5d4c
+	    }
3e5d4c
 	    __pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
3e5d4c
 	    if (mode == PM_MODE_BACK)
3e5d4c
 		acp->ac_serial = 0;
3e5d4c
@@ -2418,9 +2424,13 @@ __pmLogSetTime(__pmContext *ctxp)
3e5d4c
 	    }
3e5d4c
 	}
3e5d4c
 	else if (j < 1) {
3e5d4c
+	    try = 0;
3e5d4c
 	    j = VolSkip(acp, PM_MODE_FORW, 0);
3e5d4c
-	    if (j < 0)
3e5d4c
-		return;
3e5d4c
+	    if (j < 0) {
3e5d4c
+		if (pmDebugOptions.log)
3e5d4c
+		    fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #2\n", PM_MODE_FORW, try);
3e5d4c
+		return PM_ERR_LOGFILE;
3e5d4c
+	    }
3e5d4c
 	    __pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
3e5d4c
 	    if (pmDebugOptions.log) {
3e5d4c
 		fprintf(stderr, " before start ti@");
3e5d4c
@@ -2428,9 +2438,13 @@ __pmLogSetTime(__pmContext *ctxp)
3e5d4c
 	    }
3e5d4c
 	}
3e5d4c
 	else if (j == numti) {
3e5d4c
+	    try = numti-1;
3e5d4c
 	    j = VolSkip(acp, PM_MODE_BACK, numti-1);
3e5d4c
-	    if (j < 0)
3e5d4c
-		return;
3e5d4c
+	    if (j < 0) {
3e5d4c
+		if (pmDebugOptions.log)
3e5d4c
+		    fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #3\n", PM_MODE_BACK, try);
3e5d4c
+		return PM_ERR_LOGFILE;
3e5d4c
+	    }
3e5d4c
 	    __pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
3e5d4c
 	    if (mode == PM_MODE_BACK)
3e5d4c
 		acp->ac_serial = 0;
3e5d4c
@@ -2450,9 +2464,13 @@ __pmLogSetTime(__pmContext *ctxp)
3e5d4c
 	    t_hi = __pmTimevalSub(&lcp->l_ti[j].ti_stamp, &ctxp->c_origin);
3e5d4c
 	    t_lo = __pmTimevalSub(&ctxp->c_origin, &lcp->l_ti[j-1].ti_stamp);
3e5d4c
 	    if (t_hi <= t_lo && !toobig) {
3e5d4c
+		try = j;
3e5d4c
 		j = VolSkip(acp, mode, j);
3e5d4c
-		if (j < 0)
3e5d4c
-		    return;
3e5d4c
+		if (j < 0) {
3e5d4c
+		    if (pmDebugOptions.log)
3e5d4c
+			fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #4\n", mode, try);
3e5d4c
+		    return PM_ERR_LOGFILE;
3e5d4c
+		}
3e5d4c
 		__pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
3e5d4c
 		if (mode == PM_MODE_FORW)
3e5d4c
 		    acp->ac_serial = 0;
3e5d4c
@@ -2462,9 +2480,13 @@ __pmLogSetTime(__pmContext *ctxp)
3e5d4c
 		}
3e5d4c
 	    }
3e5d4c
 	    else {
3e5d4c
+		try = j-1;
3e5d4c
 		j = VolSkip(acp, mode, j-1);
3e5d4c
-		if (j < 0)
3e5d4c
-		    return;
3e5d4c
+		if (j < 0) {
3e5d4c
+		    if (pmDebugOptions.log)
3e5d4c
+			fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #5\n", mode, try);
3e5d4c
+		    return PM_ERR_LOGFILE;
3e5d4c
+		}
3e5d4c
 		__pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
3e5d4c
 		if (mode == PM_MODE_BACK)
3e5d4c
 		    acp->ac_serial = 0;
3e5d4c
@@ -2490,14 +2512,37 @@ __pmLogSetTime(__pmContext *ctxp)
3e5d4c
     }
3e5d4c
     else {
3e5d4c
 	/* index either not available, or not useful */
3e5d4c
+	int	j;
3e5d4c
 	if (mode == PM_MODE_FORW) {
3e5d4c
-	    __pmLogChangeVol(acp, lcp->l_minvol);
3e5d4c
-	    assert(acp->ac_mfp != NULL);
3e5d4c
+	    for (j = lcp->l_minvol; j <= lcp->l_maxvol; j++) {
3e5d4c
+		if (__pmLogChangeVol(acp, j) >= 0)
3e5d4c
+		    break;
3e5d4c
+	    }
3e5d4c
+	    if (j > lcp->l_maxvol) {
3e5d4c
+		/* no volume found */
3e5d4c
+		if (pmDebugOptions.log)
3e5d4c
+		    fprintf(stderr, " index not useful, no volume between %d...%d\n",
3e5d4c
+			    lcp->l_minvol, lcp->l_maxvol);
3e5d4c
+		acp->ac_curvol = -1;
3e5d4c
+		acp->ac_mfp = NULL;
3e5d4c
+		return PM_ERR_LOGFILE;
3e5d4c
+	    }
3e5d4c
 	    __pmFseek(acp->ac_mfp, (long)(sizeof(__pmLogLabel) + 2*sizeof(int)), SEEK_SET);
3e5d4c
 	}
3e5d4c
 	else if (mode == PM_MODE_BACK) {
3e5d4c
-	    __pmLogChangeVol(acp, lcp->l_maxvol);
3e5d4c
-	    assert(acp->ac_mfp != NULL);
3e5d4c
+	    for (j = lcp->l_maxvol; j >= lcp->l_minvol; j--) {
3e5d4c
+		if (__pmLogChangeVol(acp, j) >= 0)
3e5d4c
+		    break;
3e5d4c
+	    }
3e5d4c
+	    if (j < lcp->l_minvol) {
3e5d4c
+		/* no volume found */
3e5d4c
+		if (pmDebugOptions.log)
3e5d4c
+		    fprintf(stderr, " index not useful, no volume between %d...%d\n",
3e5d4c
+			    lcp->l_maxvol, lcp->l_minvol);
3e5d4c
+		acp->ac_curvol = -1;
3e5d4c
+		acp->ac_mfp = NULL;
3e5d4c
+		return PM_ERR_LOGFILE;
3e5d4c
+	    }
3e5d4c
 	    __pmFseek(acp->ac_mfp, (long)0, SEEK_END);
3e5d4c
 	}
3e5d4c
 
3e5d4c
@@ -2513,6 +2558,8 @@ __pmLogSetTime(__pmContext *ctxp)
3e5d4c
     acp->ac_offset = __pmFtell(acp->ac_mfp);
3e5d4c
     assert(acp->ac_offset >= 0);
3e5d4c
     acp->ac_vol = acp->ac_curvol;
3e5d4c
+
3e5d4c
+    return 0;
3e5d4c
 }
3e5d4c
 
3e5d4c
 /* Read the label of the current archive. */
3e5d4c
@@ -3100,6 +3147,7 @@ LogChangeToPreviousArchive(__pmContext *ctxp)
3e5d4c
     pmTimeval		save_origin;
3e5d4c
     int			save_mode;
3e5d4c
     int			sts;
3e5d4c
+    int			j;
3e5d4c
 
3e5d4c
     /*
3e5d4c
      * Check whether there is a previous archive to switch to.
3e5d4c
@@ -3145,12 +3193,23 @@ LogChangeToPreviousArchive(__pmContext *ctxp)
3e5d4c
     }
3e5d4c
 
3e5d4c
     /* Set up to scan backwards from the end of the archive. */
3e5d4c
-    __pmLogChangeVol(acp, lcp->l_maxvol);
3e5d4c
-    assert(acp->ac_mfp != NULL);
3e5d4c
+    for (j = lcp->l_maxvol; j >= lcp->l_minvol; j--) {
3e5d4c
+	if (__pmLogChangeVol(acp, j) >= 0)
3e5d4c
+	    break;
3e5d4c
+    }
3e5d4c
+    if (j < lcp->l_minvol) {
3e5d4c
+	/* no volume found */
3e5d4c
+	if (pmDebugOptions.log)
3e5d4c
+	    fprintf(stderr, "LogChangeToPreviousArchive: no volume between %d...%d\n",
3e5d4c
+		    lcp->l_maxvol, lcp->l_minvol);
3e5d4c
+	acp->ac_curvol = -1;
3e5d4c
+	acp->ac_mfp = NULL;
3e5d4c
+	return PM_ERR_LOGFILE;
3e5d4c
+    }
3e5d4c
     __pmFseek(acp->ac_mfp, (long)0, SEEK_END);
3e5d4c
-    ctxp->c_archctl->ac_offset = __pmFtell(acp->ac_mfp);
3e5d4c
-    assert(ctxp->c_archctl->ac_offset >= 0);
3e5d4c
-    ctxp->c_archctl->ac_vol = ctxp->c_archctl->ac_curvol;
3e5d4c
+    acp->ac_offset = __pmFtell(acp->ac_mfp);
3e5d4c
+    assert(acp->ac_offset >= 0);
3e5d4c
+    acp->ac_vol = acp->ac_curvol;
3e5d4c
 
3e5d4c
     /*
3e5d4c
      * Check for temporal overlap here. Do this last in case the API client
3e5d4c
3e5d4c
commit 5e3b792d3d8ae60f2cebbd51c37b9b0722c3b26e
3e5d4c
Author: Mark Goodwin <mgoodwin@redhat.com>
3e5d4c
Date:   Tue Jul 6 20:09:28 2021 +1000
3e5d4c
3e5d4c
    libpcp_web: plug mem leak in redisMapInsert during daily log-rolling
3e5d4c
    
3e5d4c
    When pmlogger_daily processes daily archives, the resulting
3e5d4c
    merged archive(s) are discovered and processed by pmproxy
3e5d4c
    (if the discovery module is enabled). Since the metadata and
3e5d4c
    logvol data in each merged archive is likely to have already
3e5d4c
    been previously processed (but discovery doesn't know this),
3e5d4c
    we see a lot of dict updates for existing keys and values that
3e5d4c
    are already mapped.
3e5d4c
    
3e5d4c
    Static analysis by Coverity (CID323605 Resource Leak) shows
3e5d4c
    when redisMapInsert calls dictAdd for an existing key, the
3e5d4c
    new value field is assigned but the old value is not free'd,
3e5d4c
    and so it leaks.
3e5d4c
    
3e5d4c
    Related: RHBZ1975069 and https://github.com/performancecopilot/pcp/issues/1318
3e5d4c
3e5d4c
diff --git a/src/libpcp_web/src/maps.c b/src/libpcp_web/src/maps.c
3e5d4c
index 013ef02d3..ce20476c9 100644
3e5d4c
--- a/src/libpcp_web/src/maps.c
3e5d4c
+++ b/src/libpcp_web/src/maps.c
3e5d4c
@@ -160,6 +160,12 @@ redisMapLookup(redisMap *map, sds key)
3e5d4c
 void
3e5d4c
 redisMapInsert(redisMap *map, sds key, sds value)
3e5d4c
 {
3e5d4c
+    redisMapEntry *entry = redisMapLookup(map, key);
3e5d4c
+
3e5d4c
+    if (entry) {
3e5d4c
+	/* fix for Coverity CID323605 Resource Leak */
3e5d4c
+	dictDelete(map, key);
3e5d4c
+    }
3e5d4c
     dictAdd(map, key, value);
3e5d4c
 }
3e5d4c
 
3e5d4c
3e5d4c
commit 2a00a90b0bc3aecb8465fd32aef1ddbe745b2c91
3e5d4c
Author: Mark Goodwin <mgoodwin@redhat.com>
3e5d4c
Date:   Tue Jul 6 20:43:01 2021 +1000
3e5d4c
3e5d4c
    libpcp_web/discovery: improve lock handling and scalability
3e5d4c
    
3e5d4c
    Rework the global log-rolling lock detection with finer grain
3e5d4c
    (per-pmlogger directory) detection, and break early in
3e5d4c
    process_meta() and process_logvol() if a lock file is found
3e5d4c
    in the same directory as a monitored archive. This is much
3e5d4c
    more scalable since archive directories that are not locked
3e5d4c
    can continue to be processed and ingested whilst log-rolling
3e5d4c
    progresses elsewhere. Also uses much less CPU time since we
3e5d4c
    don't need a traversal of all monitored archives looking for
3e5d4c
    locks on every fs change event.
3e5d4c
    
3e5d4c
    Also improve process_logvol/meta handling for archives that
3e5d4c
    are deleted whilst being processed by the discovery module.
3e5d4c
    In conjunction with Kenj's changes in libpcp - stop processing
3e5d4c
    metadata and logvols if pmFetchArchive returns -ENOENT .. the
3e5d4c
    archive has been deleted so there is no point further ingesting
3e5d4c
    it's data.
3e5d4c
    
3e5d4c
    Related: RHBZ1975069
3e5d4c
    Related: https://github.com/performancecopilot/pcp/issues/1338
3e5d4c
3e5d4c
diff --git a/src/libpcp_web/src/discover.c b/src/libpcp_web/src/discover.c
3e5d4c
index 991055ce5..964813f66 100644
3e5d4c
--- a/src/libpcp_web/src/discover.c
3e5d4c
+++ b/src/libpcp_web/src/discover.c
3e5d4c
@@ -33,9 +33,6 @@ static char *pmDiscoverFlagsStr(pmDiscover *);
3e5d4c
 #define PM_DISCOVER_HASHTAB_SIZE 32
3e5d4c
 static pmDiscover *discover_hashtable[PM_DISCOVER_HASHTAB_SIZE];
3e5d4c
 
3e5d4c
-/* pmlogger_daily log-roll lock count */
3e5d4c
-static int logrolling = 0;
3e5d4c
-
3e5d4c
 /* number of archives or directories currently being monitored */
3e5d4c
 static int n_monitored = 0;
3e5d4c
 
3e5d4c
@@ -426,28 +423,6 @@ is_deleted(pmDiscover *p, struct stat *sbuf)
3e5d4c
     return ret;
3e5d4c
 }
3e5d4c
 
3e5d4c
-static int
3e5d4c
-check_for_locks()
3e5d4c
-{
3e5d4c
-    int			i;
3e5d4c
-    pmDiscover		*p;
3e5d4c
-    char                sep = pmPathSeparator();
3e5d4c
-    char                path[MAXNAMELEN];
3e5d4c
-
3e5d4c
-    for (i=0; i < PM_DISCOVER_HASHTAB_SIZE; i++) {
3e5d4c
-    	for (p = discover_hashtable[i]; p; p = p->next) {
3e5d4c
-	    if (p->flags & PM_DISCOVER_FLAGS_DIRECTORY) {
3e5d4c
-		pmsprintf(path, sizeof(path), "%s%c%s", p->context.name, sep, "lock");
3e5d4c
-		if (access(path, F_OK) == 0)
3e5d4c
-		    return 1;
3e5d4c
-	    }
3e5d4c
-	}
3e5d4c
-    }
3e5d4c
-
3e5d4c
-    /* no locks */
3e5d4c
-    return 0;
3e5d4c
-}
3e5d4c
-
3e5d4c
 static void
3e5d4c
 check_deleted(pmDiscover *p)
3e5d4c
 {
3e5d4c
@@ -465,37 +440,8 @@ fs_change_callBack(uv_fs_event_t *handle, const char *filename, int events, int
3e5d4c
     pmDiscover		*p;
3e5d4c
     char		*s;
3e5d4c
     sds			path;
3e5d4c
-    int			locksfound = 0;
3e5d4c
     struct stat		statbuf;
3e5d4c
 
3e5d4c
-    /*
3e5d4c
-     * check if logs are currently being rolled by pmlogger_daily et al
3e5d4c
-     * in any of the directories we are tracking. For mutex, the log control
3e5d4c
-     * scripts use a 'lock' file in each directory as it is processed.
3e5d4c
-     */
3e5d4c
-    locksfound = check_for_locks();
3e5d4c
-
3e5d4c
-    if (!logrolling && locksfound) {
3e5d4c
-	/* log-rolling has started */
3e5d4c
-	if (pmDebugOptions.discovery)
3e5d4c
-	    fprintf(stderr, "%s discovery callback: log-rolling in progress\n", stamp());
3e5d4c
-	logrolling = locksfound;
3e5d4c
-	return;
3e5d4c
-    }
3e5d4c
-
3e5d4c
-    if (logrolling && locksfound) {
3e5d4c
-	logrolling = locksfound;
3e5d4c
-    	return; /* still in progress */
3e5d4c
-    }
3e5d4c
-
3e5d4c
-    if (logrolling && !locksfound) {
3e5d4c
-    	/* log-rolling is finished: check what got deleted, and then purge */
3e5d4c
-	if (pmDebugOptions.discovery)
3e5d4c
-	    fprintf(stderr, "%s discovery callback: finished log-rolling\n", stamp());
3e5d4c
-	pmDiscoverTraverse(PM_DISCOVER_FLAGS_META|PM_DISCOVER_FLAGS_DATAVOL, check_deleted);
3e5d4c
-    }
3e5d4c
-    logrolling = locksfound;
3e5d4c
-
3e5d4c
     uv_fs_event_getpath(handle, buffer, &bytes);
3e5d4c
     path = sdsnewlen(buffer, bytes);
3e5d4c
 
3e5d4c
@@ -1037,6 +983,17 @@ pmDiscoverNewSource(pmDiscover *p, int context)
3e5d4c
     pmDiscoverInvokeSourceCallBacks(p, &timestamp);
3e5d4c
 }
3e5d4c
 
3e5d4c
+static char *
3e5d4c
+archive_dir_lock_path(pmDiscover *p)
3e5d4c
+{
3e5d4c
+    char	path[MAXNAMELEN], lockpath[MAXNAMELEN];
3e5d4c
+    int		sep = pmPathSeparator();
3e5d4c
+
3e5d4c
+    strncpy(path, p->context.name, sizeof(path)-1);
3e5d4c
+    pmsprintf(lockpath, sizeof(lockpath), "%s%c%s", dirname(path), sep, "lock");
3e5d4c
+    return strndup(lockpath, sizeof(lockpath));
3e5d4c
+}
3e5d4c
+
3e5d4c
 /*
3e5d4c
  * Process metadata records until EOF. That can span multiple
3e5d4c
  * callbacks if we get a partial record read.
3e5d4c
@@ -1059,6 +1016,7 @@ process_metadata(pmDiscover *p)
3e5d4c
     __pmLogHdr		hdr;
3e5d4c
     sds			msg, source;
3e5d4c
     static uint32_t	*buf = NULL;
3e5d4c
+    char		*lock_path;
3e5d4c
     int			deleted;
3e5d4c
     struct stat		sbuf;
3e5d4c
     static int		buflen = 0;
3e5d4c
@@ -1073,7 +1031,10 @@ process_metadata(pmDiscover *p)
3e5d4c
 	fprintf(stderr, "process_metadata: %s in progress %s\n",
3e5d4c
 		p->context.name, pmDiscoverFlagsStr(p));
3e5d4c
     pmDiscoverStatsAdd(p->module, "metadata.callbacks", NULL, 1);
3e5d4c
+    lock_path = archive_dir_lock_path(p);
3e5d4c
     for (;;) {
3e5d4c
+	if (lock_path && access(lock_path, F_OK) == 0)
3e5d4c
+	    break;
3e5d4c
 	pmDiscoverStatsAdd(p->module, "metadata.loops", NULL, 1);
3e5d4c
 	off = lseek(p->fd, 0, SEEK_CUR);
3e5d4c
 	nb = read(p->fd, &hdr, sizeof(__pmLogHdr));
3e5d4c
@@ -1240,6 +1201,9 @@ process_metadata(pmDiscover *p)
3e5d4c
 	/* flag that all available metadata has now been read */
3e5d4c
 	p->flags &= ~PM_DISCOVER_FLAGS_META_IN_PROGRESS;
3e5d4c
 
3e5d4c
+    if (lock_path)
3e5d4c
+    	free(lock_path);
3e5d4c
+
3e5d4c
     if (pmDebugOptions.discovery)
3e5d4c
 	fprintf(stderr, "%s: completed, partial=%d %s %s\n",
3e5d4c
 			"process_metadata", partial, p->context.name, pmDiscoverFlagsStr(p));
3e5d4c
@@ -1266,14 +1230,18 @@ static void
3e5d4c
 process_logvol(pmDiscover *p)
3e5d4c
 {
3e5d4c
     int			sts;
3e5d4c
-    pmResult		*r;
3e5d4c
+    pmResult		*r = NULL;
3e5d4c
     pmTimespec		ts;
3e5d4c
     int			oldcurvol;
3e5d4c
     __pmContext		*ctxp;
3e5d4c
     __pmArchCtl		*acp;
3e5d4c
+    char		*lock_path;
3e5d4c
 
3e5d4c
     pmDiscoverStatsAdd(p->module, "logvol.callbacks", NULL, 1);
3e5d4c
+    lock_path = archive_dir_lock_path(p);
3e5d4c
     for (;;) {
3e5d4c
+	if (lock_path && access(lock_path, F_OK) == 0)
3e5d4c
+	    break;
3e5d4c
 	pmDiscoverStatsAdd(p->module, "logvol.loops", NULL, 1);
3e5d4c
 	pmUseContext(p->ctx);
3e5d4c
 	ctxp = __pmHandleToPtr(p->ctx);
3e5d4c
@@ -1312,6 +1280,7 @@ process_logvol(pmDiscover *p)
3e5d4c
 	    }
3e5d4c
 
3e5d4c
 	    /* we are done - return and wait for another callback */
3e5d4c
+	    r = NULL;
3e5d4c
 	    break;
3e5d4c
 	}
3e5d4c
 
3e5d4c
@@ -1328,14 +1297,15 @@ process_logvol(pmDiscover *p)
3e5d4c
 	}
3e5d4c
 
3e5d4c
 	/*
3e5d4c
-	 * TODO: persistently save current timestamp, so after being restarted,
3e5d4c
-	 * pmproxy can resume where it left off for each archive.
3e5d4c
+	 * TODO (perhaps): persistently save current timestamp, so after being
3e5d4c
+	 * restarted, pmproxy can resume where it left off for each archive.
3e5d4c
 	 */
3e5d4c
 	ts.tv_sec = r->timestamp.tv_sec;
3e5d4c
 	ts.tv_nsec = r->timestamp.tv_usec * 1000;
3e5d4c
 	bump_logvol_decode_stats(p, r);
3e5d4c
 	pmDiscoverInvokeValuesCallBack(p, &ts, r);
3e5d4c
 	pmFreeResult(r);
3e5d4c
+	r = NULL;
3e5d4c
     }
3e5d4c
 
3e5d4c
     if (r) {
3e5d4c
@@ -1348,6 +1318,9 @@ process_logvol(pmDiscover *p)
3e5d4c
 
3e5d4c
     /* datavol is now up-to-date and at EOF */
3e5d4c
     p->flags &= ~PM_DISCOVER_FLAGS_DATAVOL_READY;
3e5d4c
+
3e5d4c
+    if (lock_path)
3e5d4c
+    	free(lock_path);
3e5d4c
 }
3e5d4c
 
3e5d4c
 static void
3e5d4c
@@ -1357,6 +1330,10 @@ pmDiscoverInvokeCallBacks(pmDiscover *p)
3e5d4c
     sds			msg;
3e5d4c
     sds			metaname;
3e5d4c
 
3e5d4c
+    check_deleted(p);
3e5d4c
+    if (p->flags & PM_DISCOVER_FLAGS_DELETED)
3e5d4c
+    	return; /* ignore deleted archive */
3e5d4c
+
3e5d4c
     if (p->ctx < 0) {
3e5d4c
 	/*
3e5d4c
 	 * once off initialization on the first event
3e5d4c
@@ -1366,16 +1343,23 @@ pmDiscoverInvokeCallBacks(pmDiscover *p)
3e5d4c
 
3e5d4c
 	    /* create the PMAPI context (once off) */
3e5d4c
 	    if ((sts = pmNewContext(p->context.type, p->context.name)) < 0) {
3e5d4c
-		/*
3e5d4c
-		 * Likely an early callback on a new (still empty) archive.
3e5d4c
-		 * If so, just ignore the callback and don't log any scary
3e5d4c
-		 * looking messages. We'll get another CB soon.
3e5d4c
-		 */
3e5d4c
-		if (sts != PM_ERR_NODATA || pmDebugOptions.desperate) {
3e5d4c
-		    infofmt(msg, "pmNewContext failed for %s: %s\n",
3e5d4c
-			    p->context.name, pmErrStr(sts));
3e5d4c
-		    moduleinfo(p->module, PMLOG_ERROR, msg, p->data);
3e5d4c
+		if (sts == -ENOENT) {
3e5d4c
+		    /* newly deleted archive */
3e5d4c
+		    p->flags |= PM_DISCOVER_FLAGS_DELETED;
3e5d4c
 		}
3e5d4c
+		else {
3e5d4c
+		    /*
3e5d4c
+		     * Likely an early callback on a new (still empty) archive.
3e5d4c
+		     * If so, just ignore the callback and don't log any scary
3e5d4c
+		     * looking messages. We'll get another CB soon.
3e5d4c
+		     */
3e5d4c
+		    if (sts != PM_ERR_NODATA || pmDebugOptions.desperate) {
3e5d4c
+			infofmt(msg, "pmNewContext failed for %s: %s\n",
3e5d4c
+				p->context.name, pmErrStr(sts));
3e5d4c
+			moduleinfo(p->module, PMLOG_ERROR, msg, p->data);
3e5d4c
+		    }
3e5d4c
+		}
3e5d4c
+		/* no further processing for this archive */
3e5d4c
 		return;
3e5d4c
 	    }
3e5d4c
 	    pmDiscoverStatsAdd(p->module, "logvol.new_contexts", NULL, 1);
3e5d4c
@@ -1410,8 +1394,12 @@ pmDiscoverInvokeCallBacks(pmDiscover *p)
3e5d4c
 	    metaname = sdsnew(p->context.name);
3e5d4c
 	    metaname = sdscat(metaname, ".meta");
3e5d4c
 	    if ((p->fd = open(metaname, O_RDONLY)) < 0) {
3e5d4c
-		infofmt(msg, "open failed for %s: %s\n", metaname, osstrerror());
3e5d4c
-		moduleinfo(p->module, PMLOG_ERROR, msg, p->data);
3e5d4c
+		if (p->fd == -ENOENT)
3e5d4c
+		    p->flags |= PM_DISCOVER_FLAGS_DELETED;
3e5d4c
+		else {
3e5d4c
+		    infofmt(msg, "open failed for %s: %s\n", metaname, osstrerror());
3e5d4c
+		    moduleinfo(p->module, PMLOG_ERROR, msg, p->data);
3e5d4c
+		}
3e5d4c
 		sdsfree(metaname);
3e5d4c
 		return;
3e5d4c
 	    }