Blob Blame History Raw
diff -Naurp a/man/man1/pmlogger.1 b/man/man1/pmlogger.1
--- a/man/man1/pmlogger.1	2014-01-14 08:40:15.000000000 +1100
+++ b/man/man1/pmlogger.1	2014-02-26 14:34:37.597237966 +1100
@@ -35,6 +35,7 @@
 [\f3\-v\f1 \f2volsize\f1]
 [\f3\-V\f1 \f2version\f1]
 [\f3\-x\f1 \f2fd\f1]
+[\f3\-y\f1]
 \f2archive\f1
 .SH DESCRIPTION
 .B pmlogger
@@ -329,6 +330,11 @@ The
 option may be used to limit the execution time using the format
 of time as prescribed by 
 .BR PCPIntro (1). 
+The time is interpreted within the time zone of the PMCD server,
+unless the
+.B \-y
+option is given, in which case the time zone at this logger
+host is used.
 .nf
 Some examples of different formats:
 .in 1i
diff -Naurp a/src/pmlogger/pmnewlog.sh b/src/pmlogger/pmnewlog.sh
--- a/src/pmlogger/pmnewlog.sh	2014-01-09 07:45:21.000000000 +1100
+++ b/src/pmlogger/pmnewlog.sh	2014-02-26 14:34:37.598238056 +1100
@@ -179,7 +179,7 @@ _do_cmd()
 # part of the pmlogger control file for a long-running pmlogger.
 #
 
-while getopts "a:C:c:D:Ll:Nm:n:Pp:rst:T:Vv:" c
+while getopts "a:C:c:D:Ll:Nm:n:Pp:rst:T:Vv:y" c
 do
     case $c
     in
@@ -253,7 +253,7 @@ do
 # pmlogger flags passed through
 #
 
-	L|r)	
+	L|r|y)
 		args="$args-$c "
 		;;
 
diff -Naurp a/src/pmlogger/src/pmlogger.c b/src/pmlogger/src/pmlogger.c
--- a/src/pmlogger/src/pmlogger.c	2014-01-14 14:52:52.000000000 +1100
+++ b/src/pmlogger/src/pmlogger.c	2014-02-26 14:34:37.599237958 +1100
@@ -476,6 +476,7 @@ main(int argc, char **argv)
     int			sts;
     int			sep = __pmPathSeparator();
     int			errflag = 0;
+    int			use_localtime = 0;
     int			isdaemon = 0;
     char		*pmnsfile = PM_NS_DEFAULT;
     char		*username;
@@ -500,7 +501,7 @@ main(int argc, char **argv)
      *		corresponding changes are made to pmnewlog when pmlogger
      *		options are passed through from the control file
      */
-    while ((c = getopt(argc, argv, "c:D:h:l:Lm:n:Prs:T:t:uU:v:V:x:?")) != EOF) {
+    while ((c = getopt(argc, argv, "c:D:h:l:Lm:n:Prs:T:t:uU:v:V:x:y?")) != EOF) {
 	switch (c) {
 
 	case 'c':		/* config file */
@@ -633,6 +634,10 @@ main(int argc, char **argv)
 	    time(&rsc_start);
 	    break;
 
+	case 'y':
+	    use_localtime = 1;
+	    break;
+
 	case '?':
 	default:
 	    errflag++;
@@ -661,7 +666,8 @@ Options:\n\
   -v volsize	switch log volumes after volsize has been accumulated\n\
   -V version    version for archive (default and only version is 2)\n\
   -x fd		control file descriptor for application launching pmlogger\n\
-		via pmRecordControl(3)\n",
+		via pmRecordControl(3)\n\
+  -y		set timezone for times to local time rather than that of PMCD host\n",
 			pmProgname);
 	exit(1);
     }
@@ -817,7 +823,8 @@ Options:\n\
 		strcpy(logctl.l_label.ill_tz, resp->vset[0]->vlist[0].value.pval->vbuf);
 		/* prefer to use remote time to avoid clock drift problems */
 		epoch = resp->timestamp;		/* struct assignment */
-		pmNewZone(logctl.l_label.ill_tz);
+		if (! use_localtime)
+		    pmNewZone(logctl.l_label.ill_tz);
 	    }
 #ifdef PCP_DEBUG
 	    else if (pmDebug & DBG_TRACE_LOG) {
diff -Naurp a/src/pmmgr/config/GNUmakefile b/src/pmmgr/config/GNUmakefile
--- a/src/pmmgr/config/GNUmakefile	2014-01-08 17:25:46.000000000 +1100
+++ b/src/pmmgr/config/GNUmakefile	2014-02-26 14:34:37.417238128 +1100
@@ -1,6 +1,6 @@
 #!gmake
 #
-# Copyright (c) 2013 Red Hat.
+# Copyright (c) 2013-2014 Red Hat.
 # 
 # This program is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by the
@@ -18,7 +18,7 @@ LLDIRT =
 TOPDIR = ../../..
 include $(TOPDIR)/src/include/builddefs
 
-LSRCFILES = pmie pmieconf pmlogconf pmlogger pmlogmerge \
+LSRCFILES = pmie pmieconf pmlogconf pmlogger pmlogmerge pmlogmerge-rewrite pmlogmerge-granular \
 	README target-discovery.example-avahi 
 
 PMMGR_SYSCONF_DIR=$(PCP_SYSCONF_DIR)/pmmgr
diff -Naurp a/src/pmmgr/GNUmakefile b/src/pmmgr/GNUmakefile
--- a/src/pmmgr/GNUmakefile	2014-01-10 08:45:59.000000000 +1100
+++ b/src/pmmgr/GNUmakefile	2014-02-26 14:34:37.552225230 +1100
@@ -1,6 +1,6 @@
 #!gmake
 #
-# Copyright (c) 2013 Red Hat.
+# Copyright (c) 2013-2014 Red Hat.
 # 
 # This program is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by the
@@ -39,7 +39,7 @@ install: $(SUBDIRS) $(CXXMDTARGET)
 	$(INSTALL) -m 644 pmmgr.options $(PCP_PMMGROPTIONS_PATH)
 	$(INSTALL) -m 755 rc_pmmgr $(PCP_RC_DIR)/pmmgr
 	$(INSTALL) -m 755 $(CXXMDTARGET) $(PCP_BINADM_DIR)/$(CXXMDTARGET)
-	$(INSTALL) -m 775 -o $(PCP_USER) -g $(PCP_USER) -d $(PCP_LOG_DIR)/pmmgr
+	$(INSTALL) -m 775 -o $(PCP_USER) -g $(PCP_GROUP) -d $(PCP_LOG_DIR)/pmmgr
 else
 build-me:
 	@echo not building pmmgr
diff -Naurp a/src/pmmgr/pmmgr.cxx b/src/pmmgr/pmmgr.cxx
--- a/src/pmmgr/pmmgr.cxx	2014-01-08 17:25:46.000000000 +1100
+++ b/src/pmmgr/pmmgr.cxx	2014-02-26 14:34:37.559238008 +1100
@@ -1,18 +1,20 @@
 /*
  * Copyright (c) 2013-2014 Red Hat.
- * 
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
  * Free Software Foundation; either version 2 of the License, or (at your
  * option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  * for more details.
  */
 
-
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600
+#endif
 #include "pmmgr.h"
 #include "impl.h"
 
@@ -23,6 +25,8 @@
 #include <iostream>
 
 extern "C" {
+#include <fcntl.h>
+#include <unistd.h>
 #include <glob.h>
 #include <sys/wait.h>
 #ifdef HAVE_PTHREAD_H
@@ -40,6 +44,7 @@ using namespace std;
 
 
 int quit;
+int polltime = 60;
 
 
 // ------------------------------------------------------------------------
@@ -76,7 +81,7 @@ timestamp(ostream &o)
   if (now2)
     now2[19] = '\0'; // overwrite \n
 
-  return o << "[" << (now2 ? now2 : "") << "] " << pmProgname << "(" 
+  return o << "[" << (now2 ? now2 : "") << "] " << pmProgname << "("
            << getpid()
 #ifdef HAVE_PTHREAD_H
 #ifdef IS_LINUX
@@ -89,13 +94,6 @@ timestamp(ostream &o)
 }
 
 
-extern "C" int
-pmValue_compare (const void* a, const void* b)
-{
-  return ((pmValue *)a)->inst - ((pmValue *)b)->inst;
-}
-
-
 extern "C" void *
 pmmgr_daemon_poll_thread (void* a)
 {
@@ -105,6 +103,51 @@ pmmgr_daemon_poll_thread (void* a)
 }
 
 
+// A wrapper for something like system(3), but responding quicker to
+// interrupts and standardizing tracing.  Returns the raw waitpid(2)
+// status of "sh -c cmd" (0 on success); it stays -1 if the fork failed
+// or the wait ended without collecting a status (e.g. on quit).
+int
+pmmgr_configurable::wrap_system(const std::string& cmd)
+{
+  if (pmDebug & DBG_TRACE_APPL0)
+    timestamp(cout) << "running " << cmd << endl;
+
+  int pid = fork();
+  if (pid == 0)
+    {
+      // child; NB: the execl(3) list terminator must be a pointer, not a bare NULL
+      int rc = execl ("/bin/sh", "sh", "-c", cmd.c_str(), (char *) NULL);
+      timestamp(cerr) << "failed to execl sh -c " << cmd << " rc=" << rc << endl;
+      _exit (1); // only reached if execl failed
+    }
+  else if (pid < 0)
+    {
+      // error
+      timestamp(cerr) << "fork for " << cmd << " failed: errno=" << errno << endl;
+      return -1;
+    }
+  else
+    {
+      // parent
+      int status = -1;
+      int rc;
+
+      // Retry waitpid on EINTR (a la TEMP_FAILURE_RETRY), unless asked to quit
+      do { rc = waitpid(pid, &status, 0); } while (!quit && rc == -1 && errno == EINTR);
+      if (quit)
+        {
+          kill (pid, SIGTERM); // just to be on the safe side
+          // it might linger a few seconds in zombie mode
+        }
+
+      if (status != 0)
+        timestamp(cerr) << "system(" << cmd << ") failed: rc=" << status << endl;
+      return status;
+    }
+}
+
+
 
 // ------------------------------------------------------------------------
 
@@ -255,15 +298,12 @@ pmmgr_job_spec::compute_hostid (const pc
         continue;
       // NB: after this point, 'continue' must also pmFreeResult(r)
 
+      // in-place sort value list by indom number
+      pmSortInstances(r);
+
       // only vset[0] will be set, for csb->pmid
       if (r->vset[0]->numval > 0)
         {
-          // in-place sort value list by indom number
-          qsort (r->vset[0]->vlist,
-                 (size_t) r->vset[0]->numval,
-                 sizeof(pmValue),
-                 pmValue_compare);
-
           for (int j=0; j<r->vset[0]->numval; j++) // iterate over instances
             {
               // fetch the string value
@@ -273,7 +313,7 @@ pmmgr_job_spec::compute_hostid (const pc
                                   PM_TYPE_STRING, & av, PM_TYPE_STRING);
               if (rc < 0)
                 continue;
-              
+
               // at last!  we have a string we can accumulate
               hostid_fields.push_back (av.cp);
               free (av.cp);
@@ -356,10 +396,10 @@ pmmgr_job_spec::poll()
     new_specs.insert(target_hosts[i]);
 
   vector<string> target_discovery = get_config_multi("target-discovery");
-  for (unsigned i=0; i<target_discovery.size(); i++)
+  for (unsigned i=0; i<target_discovery.size() && !quit; i++)
     {
       char **urls = NULL;
-      const char *discovery = (target_discovery[i] == "") 
+      const char *discovery = (target_discovery[i] == "")
         ? NULL
         : target_discovery[i].c_str();
       int numUrls = pmDiscoverServices (PM_SERVER_SERVICE_SPEC, discovery, &urls);
@@ -381,15 +421,29 @@ pmmgr_job_spec::poll()
   known_targets.clear();
 
   // phase 3: map the context-specs to hostids to find new hosts
+  map<pmmgr_hostid,double> known_target_scores;
   for (set<pcp_context_spec>::iterator it = new_specs.begin();
-       it != new_specs.end();
+       it != new_specs.end() && !quit;
        ++it)
     {
+      struct timeval before, after;
+      __pmtimevalNow(& before);
       pmmgr_hostid hostid = compute_hostid (*it);
+      __pmtimevalNow(& after);
+      double score = __pmtimevalSub(& after, & before); // the smaller, the preferreder
+
       if (hostid != "") // verified existence/liveness
-        known_targets[hostid] = *it;
-      // NB: for hostid's with multiple specs, this logic will pick an
-      // *arbitrary* one.  Perhaps we want to tie-break deterministically.
+        {
+          if (pmDebug & DBG_TRACE_APPL0)
+            timestamp(cout) << "hostid " << hostid << " via " << *it << " time " << score << endl;
+
+          if (known_target_scores.find(hostid) == known_target_scores.end() ||
+              known_target_scores[hostid] > score) // previous slower than this one
+            {
+              known_targets[hostid] = *it;
+              known_target_scores[hostid] = score;
+            }
+        }
     }
 
   // phase 4a: compare old_known_targets vs. known_targets: look for any recently died
@@ -415,11 +469,11 @@ pmmgr_job_spec::poll()
   // phase 5: poll all the live daemons
   // NB: there is a parallelism opportunity, as running many pmlogconf/etc.'s in series
   // is a possible bottleneck.
-#ifdef HAVE_PTHREAD_H  
+#ifdef HAVE_PTHREAD_H
   vector<pthread_t> threads;
 #endif
   for (multimap<pmmgr_hostid,pmmgr_daemon*>::iterator it = daemons.begin();
-       it != daemons.end();
+       it != daemons.end() && !quit;
        ++it)
     {
 #ifdef HAVE_PTHREAD_H
@@ -468,7 +522,7 @@ pmmgr_job_spec::poll()
 
   glob_t the_blob;
   string glob_pattern = log_dir + (char)__pmPathSeparator() + "*";
-  rc = glob (glob_pattern.c_str(), 
+  rc = glob (glob_pattern.c_str(),
              GLOB_NOESCAPE
 #ifdef GLOB_ONLYDIR
              | GLOB_ONLYDIR
@@ -476,10 +530,10 @@ pmmgr_job_spec::poll()
              , NULL, & the_blob);
   if (rc == 0)
     {
-      for (unsigned i=0; i<the_blob.gl_pathc; i++)
+      for (unsigned i=0; i<the_blob.gl_pathc && !quit; i++)
         {
           string item_name = the_blob.gl_pathv[i];
-          
+
           // Reject if currently live hostid
           // NB: basename(3) might modify the argument string, so we don't feed
           // it item_name.c_str().
@@ -496,11 +550,7 @@ pmmgr_job_spec::poll()
               // <Janine Melnitz>We've got one!!!!!</>
               timestamp(cout) << "gc subdirectory " << item_name << endl;
               string cleanup_cmd = "/bin/rm -rf " + sh_quote(item_name);
-              if (pmDebug & DBG_TRACE_APPL0)
-                timestamp(cout) << "running " << cleanup_cmd << endl;
-              rc = system(cleanup_cmd.c_str());
-              if (rc != 0) 
-                timestamp(cerr) << "system(" << cleanup_cmd << ") failed: rc=" << rc << endl;
+              (void) wrap_system(cleanup_cmd);
             }
         }
     }
@@ -546,18 +596,19 @@ pmmgr_job_spec::note_dead_hostid(const p
 
 
 pmmgr_daemon::pmmgr_daemon(const std::string& config_directory,
-                           const pmmgr_hostid& hostid, 
+                           const pmmgr_hostid& hostid,
                            const pcp_context_spec& spec):
   pmmgr_configurable(config_directory),
   hostid(hostid),
   spec(spec),
-  pid(0)
+  pid(0),
+  last_restart_attempt(0)
 {
 }
 
 
 pmmgr_pmlogger_daemon::pmmgr_pmlogger_daemon(const std::string& config_directory,
-                                             const pmmgr_hostid& hostid, 
+                                             const pmmgr_hostid& hostid,
                                              const pcp_context_spec& spec):
   pmmgr_daemon(config_directory, hostid, spec)
 {
@@ -565,7 +616,7 @@ pmmgr_pmlogger_daemon::pmmgr_pmlogger_da
 
 
 pmmgr_pmie_daemon::pmmgr_pmie_daemon(const std::string& config_directory,
-                                         const pmmgr_hostid& hostid, 
+                                         const pmmgr_hostid& hostid,
                                          const pcp_context_spec& spec):
   pmmgr_daemon(config_directory, hostid, spec)
 {
@@ -596,7 +647,7 @@ void pmmgr_daemon::poll()
       int rc = waitpid ((pid_t) pid, &ignored, WNOHANG);
 
       rc = kill ((pid_t) pid, 0);
-      if (rc < 0) 
+      if (rc < 0)
         {
           if (pmDebug & DBG_TRACE_APPL0)
             timestamp(cout) << "daemon pid " << pid << " found dead" << endl;
@@ -607,7 +658,32 @@ void pmmgr_daemon::poll()
 
   if (pid == 0) // needs a restart
     {
-      string commandline = daemon_command_line();
+      time_t now;
+      time (& now);
+
+      // Prevent an error in the environment or the pmmgr daemon
+      // command lines from generating a tight loop of failure /
+      // retry, wasting time and log file space.  Limit retry attempts
+      // to one per poll interval (pmmgr -p N parameter).
+      if (last_restart_attempt && (last_restart_attempt + polltime) >= now)
+        return; // quietly, without attempting to restart
+
+      string commandline = daemon_command_line(); // <--- may take many seconds!
+
+      // NB: Note this time as a restart attempt, even if daemon_command_line() 
+      // returned an empty string, so that we don't try to restart it too soon.
+      // We note this time rather than the beginning of daemon_command_line(), 
+      // to ensure at least polltime seconds of rest between attempts.
+      last_restart_attempt = now;
+
+      if (quit) return; // without starting the daemon process
+
+      if (commandline == "") // error in some intermediate processing stage
+        {
+          timestamp(cerr) << "failed to prepare daemon command line" << endl;
+          return;
+        }
+
       if (pmDebug & DBG_TRACE_APPL0)
         timestamp(cout) << "fork/exec sh -c " << commandline << endl;
       pid = fork();
@@ -633,7 +709,7 @@ void pmmgr_daemon::poll()
 }
 
 
-std::string                                             
+std::string
 pmmgr_pmlogger_daemon::daemon_command_line()
 {
   string default_log_dir =
@@ -648,7 +724,7 @@ pmmgr_pmlogger_daemon::daemon_command_li
   (void) mkdir2 (host_log_dir.c_str(), 0777);
   // (errors creating actual files under host_log_dir will be noted shortly)
 
-  string pmlogger_command = 
+  string pmlogger_command =
         string(pmGetConfig("PCP_BIN_DIR")) + (char)__pmPathSeparator() + "pmlogger";
   string pmlogger_options = sh_quote(pmlogger_command);
   pmlogger_options += " " + get_config_single ("pmlogger");
@@ -658,20 +734,17 @@ pmmgr_pmlogger_daemon::daemon_command_li
     {
       string pmlogconf_output_file = host_log_dir + (char)__pmPathSeparator() + "config.pmlogger";
       (void) unlink (pmlogconf_output_file.c_str());
-      string pmlogconf_command = 
+      string pmlogconf_command =
         string(pmGetConfig("PCP_BINADM_DIR")) + (char)__pmPathSeparator() + "pmlogconf";
-      string pmlogconf_options = 
+      string pmlogconf_options =
         sh_quote(pmlogconf_command)
         + " -c -r -h " + sh_quote(spec)
         + " " + get_config_single ("pmlogconf")
         + " " + sh_quote(pmlogconf_output_file)
         + " >/dev/null"; // pmlogconf is too chatty
-       
-      if (pmDebug & DBG_TRACE_APPL0)
-        timestamp(cout) << "running " << pmlogconf_options << endl;
-      int rc = system(pmlogconf_options.c_str());
-      if (rc != 0) 
-        timestamp(cerr) << "system(" << pmlogconf_options << ") failed: rc=" << rc << endl;
+
+      int rc = wrap_system(pmlogconf_options);
+      if (rc) return "";
 
       pmlogger_options += " -c " + sh_quote(pmlogconf_output_file);
     }
@@ -688,61 +761,176 @@ pmmgr_pmlogger_daemon::daemon_command_li
   // do log merging
   if (get_config_exists ("pmlogmerge"))
     {
-      string pmlogextract_command = 
+      string pmlogextract_command =
         string(pmGetConfig("PCP_BIN_DIR")) + (char)__pmPathSeparator() + "pmlogextract";
 
-      string pmlogcheck_command = 
+      string pmlogcheck_command =
         string(pmGetConfig("PCP_BIN_DIR")) + (char)__pmPathSeparator() + "pmlogcheck";
 
+      string pmlogrewrite_command =
+        string(pmGetConfig("PCP_BINADM_DIR")) + (char)__pmPathSeparator() + "pmlogrewrite";
+
       string pmlogextract_options = sh_quote(pmlogextract_command);
 
       string retention = get_config_single ("pmlogmerge-retain");
       if (retention == "") retention = "14days";
+      struct timeval retention_tv;
+      char *errmsg;
+      int rc = pmParseInterval(retention.c_str(), &retention_tv, &errmsg);
+      if (rc)
+        {
+          timestamp(cerr) << "pmlogmerge-retain '" << retention << "' parse error: " << errmsg << endl;
+          free (errmsg);
+          retention = "14days";
+          retention_tv.tv_sec = 14*24*60*60;
+          retention_tv.tv_usec = 0;
+        }
       pmlogextract_options += " -S -" + sh_quote(retention);
 
       // Arrange our new pmlogger to kill itself after the given
       // period, to give us a chance to rerun.
       string period = get_config_single ("pmlogmerge");
       if (period == "") period = "24hours";
-      pmlogger_options += " -s " + sh_quote(period);
-      
-      // Find prior archives by globbing for *.index files, 
-      // just like pmlogger_merge does
-      vector<string> old_archives;
+      struct timeval period_tv;
+      rc = pmParseInterval(period.c_str(), &period_tv, &errmsg);
+      if (rc)
+        {
+          timestamp(cerr) << "pmlogmerge '" << period << "' parse error: " << errmsg << endl;
+          free (errmsg);
+          period = "24hours";
+          period_tv.tv_sec = 24*60*60;
+          period_tv.tv_usec = 0;
+        }
+      if (get_config_exists ("pmlogmerge-granular"))
+        {
+          // adjust stopping time to the next multiple of period strictly after now
+          struct timeval now_tv;
+          __pmtimevalNow (&now_tv);
+          time_t period_s = period_tv.tv_sec;
+          if (period_s < 1) period_s = 1; // at least one second
+          time_t period_end = (now_tv.tv_sec / period_s + 1) * period_s; // NB: plain round-up gives "now" on an exact boundary
+          period = string(" @") +
+            string(ctime(& period_end)).substr(0,24); // 24: ctime(3) magic value, sans \n
+        }
+      pmlogger_options += " -y -T " + sh_quote(period); // NB: pmmgr host local time!
+
+      // Find prior archives by globbing for *.index files,
+      // just like pmlogger_merge does.
+      // Er ... but aren't .index files optional?
+      vector<string> mergeable_archives; // those to merge
       glob_t the_blob;
       string glob_pattern = host_log_dir + (char)__pmPathSeparator() + "*.index";
-      int rc = glob (glob_pattern.c_str(), GLOB_NOESCAPE, NULL, & the_blob);
+      rc = glob (glob_pattern.c_str(), GLOB_NOESCAPE, NULL, & the_blob);
       if (rc == 0)
         {
+          // compute the period-aligned window just before the one containing now
+          struct timeval now_tv;
+          __pmtimevalNow (&now_tv);
+          time_t period_s = period_tv.tv_sec;
+          if (period_s < 1) period_s = 1; // at least one second
+          time_t prior_period_start = ((now_tv.tv_sec - period_s) / period_s) * period_s;
+          time_t prior_period_end = prior_period_start + period_s;
+
           for (unsigned i=0; i<the_blob.gl_pathc; i++)
             {
+              if (quit) { globfree (& the_blob); return ""; } // don't leak the glob results
+
               string index_name = the_blob.gl_pathv[i];
               string base_name = index_name.substr(0,index_name.length()-6); // trim .index
 
+              // Manage retention based upon the stat timestamps of the .index file,
+              // because the archives might be so corrupt that even loglabel-based
+              // checks could fail.  Non-corrupt archives will have already been merged
+              // into a fresher archive.
+              struct stat foo;
+              rc = stat (the_blob.gl_pathv[i], & foo);
+              if (rc)
+                {
+                  // this apprx. can't happen
+                  timestamp(cerr) << "stat '" << the_blob.gl_pathv[i] << "' error; skipping cleanup" << endl;
+                  continue; // likely nothing can be done to this one
+                }
+              else if ((foo.st_mtime + retention_tv.tv_sec) < now_tv.tv_sec)
+                {
+                  string bnq = sh_quote(base_name);
+                  string cleanup_cmd = string("/bin/rm -f")
+                    + " " + bnq + ".[0-9]*"
+                    + " " + bnq + ".index" +
+                    + " " + bnq + ".meta";
+
+                  (void) wrap_system(cleanup_cmd);
+                  continue; // it's gone now; don't try to merge it or anything
+                }
+
+              if (quit) { globfree (& the_blob); return ""; } // don't leak the glob results
+
               // sic pmlogcheck on it; if it is broken, pmlogextract
               // will give up and make no progress
-
               string pmlogcheck_options = sh_quote(pmlogcheck_command);
               pmlogcheck_options += " " + sh_quote(base_name) + " >/dev/null";
 
-              if (pmDebug & DBG_TRACE_APPL0)
-                timestamp(cout) << "running " << pmlogcheck_options << endl;
-              rc = system(pmlogcheck_options.c_str());
+              rc = wrap_system(pmlogcheck_options);
               if (rc != 0)
                 {
-                  timestamp(cerr) << "system(" << pmlogcheck_options << ") failed: rc=" << rc << endl;
                   timestamp(cerr) << "corrupt archive " << base_name << " preserved." << endl;
                   continue;
                 }
 
-              // XXX: pmlogrewrite here
+              if (quit) { globfree (& the_blob); return ""; } // don't leak the glob results
+
+              // In granular mode, skip if this file is too old or too new.  NB: Decide
+              // based upon the log-label, not fstat timestamps, since files postdate
+              // the time region they cover.
+              if (get_config_exists ("pmlogmerge-granular"))
+                {
+                  // One could do this the pmloglabel(1) __pmLog* way,
+                  // rather than the pmlogsummary(1) PMAPI way.
+
+                  int ctx = pmNewContext(PM_CONTEXT_ARCHIVE, base_name.c_str());
+                  if (ctx < 0)
+                    continue; // skip; gc later
+
+                  pmLogLabel label;
+                  rc = pmGetArchiveLabel (& label);
+                  if (rc < 0) // NB: ctx is open from here on; release it before skipping
+                    { pmDestroyContext (ctx); continue; } // skip; gc later
+
+                  if (label.ll_start.tv_sec >= prior_period_end) // archive too new?
+                    {
+                      if (pmDebug & DBG_TRACE_APPL0)
+                        timestamp(cout) << "skipping merge of too-new archive " << base_name << endl;
+                      pmDestroyContext (ctx);
+                      continue;
+                    }
+
+                  struct timeval archive_end;
+                  rc = pmGetArchiveEnd(&archive_end);
+                  if (rc < 0)
+                    {
+                      pmDestroyContext (ctx);
+                      continue; // skip; gc later
+                    }
+
+                  if (archive_end.tv_sec < prior_period_start) // archive too old?
+                    {
+                      if (pmDebug & DBG_TRACE_APPL0)
+                        timestamp(cout) << "skipping merge of too-old archive " << base_name << endl;
+                      pmDestroyContext (ctx);
+                      continue; // skip; gc later
+                    }
 
-              old_archives.push_back (base_name);
+                  pmDestroyContext (ctx);
+                  // fallthrough: the archive intersects the prior_period_{start,end} interval
+
+                  // XXX: What happens for archives that span across granular periods?
+                }
+
+              mergeable_archives.push_back (base_name);
             }
           globfree (& the_blob);
         }
 
-      string timestr = "merged-archive";
+      string timestr = "archive";
       time_t now2 = time(NULL);
       struct tm *now = gmtime(& now2);
       if (now != NULL)
@@ -754,42 +942,52 @@ pmmgr_pmlogger_daemon::daemon_command_li
         }
       string merged_archive_name = host_log_dir + (char)__pmPathSeparator() + timestr;
 
-      if (old_archives.size() > 1) // 1 or 0 are not worth merging!
+      if (mergeable_archives.size() > 1) // 1 or 0 are not worth merging!
         {
           // assemble final bits of pmlogextract command line: the inputs and the output
-          for (unsigned i=0; i<old_archives.size(); i++)
-            pmlogextract_options += " " + sh_quote(old_archives[i]);
+          for (unsigned i=0; i<mergeable_archives.size(); i++)
+            {
+              if (quit) return "";
 
-          pmlogextract_options += " " + sh_quote(merged_archive_name);
+              if (get_config_exists("pmlogmerge-rewrite"))
+                {
+                  string pmlogrewrite_options = sh_quote(pmlogrewrite_command);
+                  pmlogrewrite_options += " -i " + get_config_single("pmlogmerge-rewrite");
+                  pmlogrewrite_options += " " + sh_quote(mergeable_archives[i]);
+
+                  (void) wrap_system(pmlogrewrite_options.c_str());
+                  // In case of error, don't break; let's try to merge it anyway.
+                  // Maybe pmlogrewrite will succeed and will get rid of this file.
+                }
 
-          if (pmDebug & DBG_TRACE_APPL0)
-            timestamp(cout) << "running " << pmlogextract_options << endl;
+              pmlogextract_options += " " + sh_quote(mergeable_archives[i]);
+            }
 
-          rc = system(pmlogextract_options.c_str());
-          if (rc != 0) 
-            // will try again later
-            timestamp(cerr) << "system(" << pmlogextract_options << ") failed: rc=" << rc << endl;
-          else
+          if (quit) return "";
+
+          pmlogextract_options += " " + sh_quote(merged_archive_name);
+
+          rc = wrap_system(pmlogextract_options.c_str());
+          if (rc == 0)
             {
-              // zap the previous archive files 
-              for (unsigned i=0; i<old_archives.size(); i++)
+              // zap the previous archive files
+              //
+              // Don't skip this upon "if (quit)", since the new merged archive is already complete;
+              // it'd be a waste to keep these files around for a future re-merge.
+              for (unsigned i=0; i<mergeable_archives.size(); i++)
                 {
-                  string base_name = sh_quote(old_archives[i]);
+                  string base_name = sh_quote(mergeable_archives[i]);
                   string cleanup_cmd = string("/bin/rm -f")
                     + " " + base_name + ".[0-9]*"
-                    + " " + base_name + ".index" + 
+                    + " " + base_name + ".index" +
                     + " " + base_name + ".meta";
 
-                  if (pmDebug & DBG_TRACE_APPL0)
-                    timestamp(cout) << "running " << cleanup_cmd << endl;
-                  rc = system(cleanup_cmd.c_str());
-                  if (rc != 0) 
-                    timestamp(cerr) << "system(" << cleanup_cmd << ") failed: rc=" << rc << endl;
+                  (void) wrap_system(cleanup_cmd.c_str());
                 }
             }
         }
     }
-  
+
   // synthesize a logfile name similarly as pmlogger_check, but add %S (seconds)
   // to reduce likelihood of conflict with a short poll interval
   string timestr = "archive";
@@ -800,7 +998,7 @@ pmmgr_pmlogger_daemon::daemon_command_li
       char timestr2[100];
       int rc = strftime(timestr2, sizeof(timestr2), "-%Y%m%d.%H%M%S", now);
       if (rc > 0)
-        timestr += timestr2;
+        timestr += timestr2; // no sh_quote required
     }
 
   // last argument
@@ -810,7 +1008,7 @@ pmmgr_pmlogger_daemon::daemon_command_li
 }
 
 
-std::string                                             
+std::string
 pmmgr_pmie_daemon::daemon_command_line()
 {
   string default_log_dir =
@@ -825,7 +1023,7 @@ pmmgr_pmie_daemon::daemon_command_line()
   (void) mkdir2 (host_log_dir.c_str(), 0777);
   // (errors creating actual files under host_log_dir will be noted shortly)
 
-  string pmie_command = 
+  string pmie_command =
         string(pmGetConfig("PCP_BIN_DIR")) + (char)__pmPathSeparator() + "pmie";
   string pmie_options = sh_quote (pmie_command);
 
@@ -835,29 +1033,29 @@ pmmgr_pmie_daemon::daemon_command_line()
   if (get_config_exists ("pmieconf"))
     {
       string pmieconf_output_file = host_log_dir + (char)__pmPathSeparator() + "config.pmie";
-      string pmieconf_command = 
+      string pmieconf_command =
         string(pmGetConfig("PCP_BIN_DIR")) + (char)__pmPathSeparator() + "pmieconf";
 
       // NB: pmieconf doesn't take a host name as an argument, unlike pmlogconf
-      string pmieconf_options = 
+      string pmieconf_options =
         sh_quote(pmieconf_command)
         + " -F -c " + get_config_single ("pmieconf")
         + " -f " + sh_quote(pmieconf_output_file);
-      if (pmDebug & DBG_TRACE_APPL0)
-        timestamp(cout) << "running " << pmieconf_options << endl;
-      int rc = system(pmieconf_options.c_str());
-      if (rc != 0) 
-        timestamp(cerr) << "system(" << pmieconf_options << ") failed: rc=" << rc << endl;
+
+      int rc = wrap_system(pmieconf_options.c_str());
+      if (rc) return "";
 
       pmie_options += "-c " + sh_quote(pmieconf_output_file);
     }
 
+  if (quit) return "";
+
   // collect -h direction
   pmie_options += " -h " + sh_quote(spec);
 
   // collect -f, to get it to run in the foreground, avoid setuid
   pmie_options += " -f";
- 
+
   // collect subsidiary pmlogger diagnostics
   pmie_options += " -l " + sh_quote(host_log_dir + (char)__pmPathSeparator() + "pmie.log");
 
@@ -876,10 +1074,10 @@ void handle_interrupt (int sig)
   // recursive signals or whatnot, despite sa_mask in
   // setup_signals()).
   if (quit == 0)
-    kill(-getpid(), sig);
+    kill(-getpid(), SIGTERM);
 
   quit ++;
-  if (quit > 2)
+  if (quit > 3) // ignore 1 from user; 1 from kill(-getpid) above; 1 from same near main() exit
     {
       char msg[] = "Too many interrupts received, exiting.\n";
       int rc = write (2, msg, sizeof(msg)-1);
@@ -928,21 +1126,22 @@ void setup_signals()
 
 int main (int argc, char *argv[])
 {
+  /* Become our own process group, to assist signal passing to children. */
+  setpgid(getpid(), 0);
   __pmSetProgname(argv[0]);
-
   setup_signals();
 
-  string default_config_dir = 
+  string default_config_dir =
     string(pmGetConfig("PCP_SYSCONF_DIR")) + (char)__pmPathSeparator() + "pmmgr";
   vector<pmmgr_job_spec*> js;
 
   int c;
-  int polltime = 60;
   char* username_str;
   __pmGetUsername(& username_str);
   string username = username_str;
+  char* output_filename = NULL;
 
-  while ((c = getopt(argc, argv, "D:c:vp:U:h")) != EOF)
+  while ((c = getopt(argc, argv, "D:c:vp:U:l:h")) != EOF)
     {
       switch (c)
         {
@@ -950,13 +1149,17 @@ int main (int argc, char *argv[])
           (void) __pmParseDebug(optarg);
           break;
 
+        case 'l':
+          output_filename = optarg;
+          break;
+
         case 'v':
           pmDebug |= DBG_TRACE_APPL0;
           break;
 
         case 'p':
           polltime = atoi(optarg);
-          if (polltime <= 0) 
+          if (polltime <= 0)
             {
               cerr << "Poll time too short." << endl;
               exit(1);
@@ -994,8 +1197,34 @@ int main (int argc, char *argv[])
   // lose root privileges if we have them
   __pmSetProcessIdentity(username.c_str());
 
+  // (re)create log file, redirect stdout/stderr
+  // NB: must be done after __pmSetProcessIdentity() for proper file permissions
+  if (output_filename)
+    {
+      int fd;
+      (void) unlink (output_filename); // in case one's left over from a previous other-uid run
+      fd = open (output_filename, O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, 0666);
+      if (fd < 0)
+        timestamp(cerr) << "Cannot re-create logfile " << output_filename << endl;
+      else
+        {
+          int rc;
+          // Move the new file descriptors on top of stdout/stderr
+          rc = dup2 (fd, STDOUT_FILENO);
+          if (rc < 0) // rather unlikely
+            timestamp(cerr) << "Cannot redirect logfile to stdout" << endl;
+          rc = dup2 (fd, STDERR_FILENO);
+          if (rc < 0) // rather unlikely
+            timestamp(cerr) << "Cannot redirect logfile to stderr" << endl;
+          rc = close (fd);
+          if (rc < 0) // rather unlikely
+            timestamp(cerr) << "Cannot close logfile fd" << endl;
+        }
+
+    }
+
   timestamp(cout) << "Log started" << endl;
-  while (1)
+  while (! quit)
     {
       // In this section, we must not fidget with SIGCHLD, due to use of system(3).
       for (unsigned i=0; i<js.size() && !quit; i++)
@@ -1014,10 +1243,15 @@ int main (int argc, char *argv[])
       (void) signal (SIGALRM, SIG_DFL);
     }
 
+  // NB: don't let this cleanup be interrupted by pending-quit signals;
+  // we want the daemon pids killed.
   for (unsigned i=0; i<js.size(); i++)
     delete js[i];
-  // XXX: send a suicide signal to the process group
 
   timestamp(cout) << "Log finished" << endl;
+
+  // Send a last-gasp signal out, just in case daemons somehow missed the earlier ones
+  kill(-getpid(), SIGTERM);
+
   return 0;
 }
diff -Naurp a/src/pmmgr/pmmgr.h b/src/pmmgr/pmmgr.h
--- a/src/pmmgr/pmmgr.h	2014-01-08 17:25:46.000000000 +1100
+++ b/src/pmmgr/pmmgr.h	2014-02-26 14:34:37.567238053 +1100
@@ -46,6 +46,7 @@ protected:
   std::string config_directory;
 
   std::ostream& timestamp(std::ostream&);
+  int wrap_system(const std::string& cmd);
 };
 
 
@@ -64,6 +65,7 @@ protected:
   pmmgr_hostid hostid;
   pcp_context_spec spec;
   int pid;
+  time_t last_restart_attempt;
 
   virtual std::string daemon_command_line() = 0;
 };
diff -Naurp a/src/pmmgr/pmmgr.options b/src/pmmgr/pmmgr.options
--- a/src/pmmgr/pmmgr.options	2014-01-08 17:25:46.000000000 +1100
+++ b/src/pmmgr/pmmgr.options	2014-02-26 14:34:37.576238032 +1100
@@ -15,6 +15,9 @@
 # assume identity of some user other than "pcp"
 # -U foobar
 
+# make log go someplace else
+# -l /some/place/else
+
 # setting of environment variables for pmmgr
 
 # timeouts for interactions with pmcd on behalf of clients
diff -Naurp a/src/pmmgr/rc_pmmgr b/src/pmmgr/rc_pmmgr
--- a/src/pmmgr/rc_pmmgr	2014-01-08 17:25:46.000000000 +1100
+++ b/src/pmmgr/rc_pmmgr	2014-02-26 14:34:37.584237959 +1100
@@ -215,7 +215,11 @@ case "$1" in
 Error: pmmgr control file "$PMMGROPTS" is missing, cannot start pmmgr.'
 		exit
 	    fi
-	    [ ! -d $RUNDIR ] && mkdir -p $RUNDIR
+	    if [ ! -d "$RUNDIR" ]
+	    then
+		mkdir -p -m 775 "$RUNDIR"
+		chown $PCP_USER:$PCP_GROUP "$RUNDIR"
+	    fi
 	    cd $RUNDIR
 
 	    # salvage the previous versions of any pmmgr
@@ -226,7 +230,7 @@ Error: pmmgr control file "$PMMGROPTS" i
 		mv pmmgr.log pmmgr.log.prev
 	    fi
 
-	    $ECHO $PCP_ECHO_N "Performance Co-Pilot starting pmmgr (logfile is $RUNDIR/pmmgr.log) ..." "$PCP_ECHO_C"
+	    $ECHO $PCP_ECHO_N "Starting pmmgr ..." "$PCP_ECHO_C"
 	    # options file processing ...
 	    # only consider lines which start with a hyphen
 	    # get rid of the -f option
@@ -250,7 +254,7 @@ BEGIN			{ exports="" }
 			}
 END			{ if (exports != "") print "export", exports }'`
 
-	    $PMMGR $OPTS > pmmgr.log 2>&1 &
+	    $PMMGR -l pmmgr.log $OPTS &
 	    $RC_STATUS -v
 
 	    pmpost "start pmmgr from $pmprog"
diff -Naurp a/src/pmmgr/TODO b/src/pmmgr/TODO
--- a/src/pmmgr/TODO	2014-01-13 13:27:28.000000000 +1100
+++ b/src/pmmgr/TODO	2014-02-26 14:34:37.593238086 +1100
@@ -1,8 +1,12 @@
-- pmlogrewrite
-- log aging without logmerge (which is high on I/O)
+- pmmgr.1 EXAMPLE CONFIGURATIONS
+- optionally delay pm*conf
+- pmlogreduce
+- log aging in background while new pmlogger's already running
 - old log compression (until we get libpcp zlib or something)
 - email error reporting?
-- test buildability on non-linux
 - qa
 - port to mingw?
+- port to cygwin?
 - pmlogger/pmie .log rotation
+- pid->pid_t cleanup
+