diff -Naurp a/man/man1/pmlogger.1 b/man/man1/pmlogger.1
--- a/man/man1/pmlogger.1 2014-01-14 08:40:15.000000000 +1100
+++ b/man/man1/pmlogger.1 2014-02-26 14:34:37.597237966 +1100
@@ -35,6 +35,7 @@
[\f3\-v\f1 \f2volsize\f1]
[\f3\-V\f1 \f2version\f1]
[\f3\-x\f1 \f2fd\f1]
+[\f3\-y\f1]
\f2archive\f1
.SH DESCRIPTION
.B pmlogger
@@ -329,6 +330,11 @@ The
option may be used to limit the execution time using the format
of time as prescribed by
.BR PCPIntro (1).
+The time is interpreted within the time zone of the PMCD server,
+unless the
+.B \-y
+option is given, in which case the time zone at this logger
+host is used.
.nf
Some examples of different formats:
.in 1i
diff -Naurp a/src/pmlogger/pmnewlog.sh b/src/pmlogger/pmnewlog.sh
--- a/src/pmlogger/pmnewlog.sh 2014-01-09 07:45:21.000000000 +1100
+++ b/src/pmlogger/pmnewlog.sh 2014-02-26 14:34:37.598238056 +1100
@@ -179,7 +179,7 @@ _do_cmd()
# part of the pmlogger control file for a long-running pmlogger.
#
-while getopts "a:C:c:D:Ll:Nm:n:Pp:rst:T:Vv:" c
+while getopts "a:C:c:D:Ll:Nm:n:Pp:rst:T:Vv:y" c
do
case $c
in
@@ -253,7 +253,7 @@ do
# pmlogger flags passed through
#
- L|r)
+ L|r|y)
args="$args-$c "
;;
diff -Naurp a/src/pmlogger/src/pmlogger.c b/src/pmlogger/src/pmlogger.c
--- a/src/pmlogger/src/pmlogger.c 2014-01-14 14:52:52.000000000 +1100
+++ b/src/pmlogger/src/pmlogger.c 2014-02-26 14:34:37.599237958 +1100
@@ -476,6 +476,7 @@ main(int argc, char **argv)
int sts;
int sep = __pmPathSeparator();
int errflag = 0;
+ int use_localtime = 0;
int isdaemon = 0;
char *pmnsfile = PM_NS_DEFAULT;
char *username;
@@ -500,7 +501,7 @@ main(int argc, char **argv)
* corresponding changes are made to pmnewlog when pmlogger
* options are passed through from the control file
*/
- while ((c = getopt(argc, argv, "c:D:h:l:Lm:n:Prs:T:t:uU:v:V:x:?")) != EOF) {
+ while ((c = getopt(argc, argv, "c:D:h:l:Lm:n:Prs:T:t:uU:v:V:x:y?")) != EOF) {
switch (c) {
case 'c': /* config file */
@@ -633,6 +634,10 @@ main(int argc, char **argv)
time(&rsc_start);
break;
+ case 'y':
+ use_localtime = 1;
+ break;
+
case '?':
default:
errflag++;
@@ -661,7 +666,8 @@ Options:\n\
-v volsize switch log volumes after volsize has been accumulated\n\
-V version version for archive (default and only version is 2)\n\
-x fd control file descriptor for application launching pmlogger\n\
- via pmRecordControl(3)\n",
+ via pmRecordControl(3)\n\
+ -y set timezone for times to local time rather than that of PMCD host\n",
pmProgname);
exit(1);
}
@@ -817,7 +823,8 @@ Options:\n\
strcpy(logctl.l_label.ill_tz, resp->vset[0]->vlist[0].value.pval->vbuf);
/* prefer to use remote time to avoid clock drift problems */
epoch = resp->timestamp; /* struct assignment */
- pmNewZone(logctl.l_label.ill_tz);
+ if (! use_localtime)
+ pmNewZone(logctl.l_label.ill_tz);
}
#ifdef PCP_DEBUG
else if (pmDebug & DBG_TRACE_LOG) {
diff -Naurp a/src/pmmgr/config/GNUmakefile b/src/pmmgr/config/GNUmakefile
--- a/src/pmmgr/config/GNUmakefile 2014-01-08 17:25:46.000000000 +1100
+++ b/src/pmmgr/config/GNUmakefile 2014-02-26 14:34:37.417238128 +1100
@@ -1,6 +1,6 @@
#!gmake
#
-# Copyright (c) 2013 Red Hat.
+# Copyright (c) 2013-2014 Red Hat.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
@@ -18,7 +18,7 @@ LLDIRT =
TOPDIR = ../../..
include $(TOPDIR)/src/include/builddefs
-LSRCFILES = pmie pmieconf pmlogconf pmlogger pmlogmerge \
+LSRCFILES = pmie pmieconf pmlogconf pmlogger pmlogmerge pmlogmerge-rewrite pmlogmerge-granular \
README target-discovery.example-avahi
PMMGR_SYSCONF_DIR=$(PCP_SYSCONF_DIR)/pmmgr
diff -Naurp a/src/pmmgr/GNUmakefile b/src/pmmgr/GNUmakefile
--- a/src/pmmgr/GNUmakefile 2014-01-10 08:45:59.000000000 +1100
+++ b/src/pmmgr/GNUmakefile 2014-02-26 14:34:37.552225230 +1100
@@ -1,6 +1,6 @@
#!gmake
#
-# Copyright (c) 2013 Red Hat.
+# Copyright (c) 2013-2014 Red Hat.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
@@ -39,7 +39,7 @@ install: $(SUBDIRS) $(CXXMDTARGET)
$(INSTALL) -m 644 pmmgr.options $(PCP_PMMGROPTIONS_PATH)
$(INSTALL) -m 755 rc_pmmgr $(PCP_RC_DIR)/pmmgr
$(INSTALL) -m 755 $(CXXMDTARGET) $(PCP_BINADM_DIR)/$(CXXMDTARGET)
- $(INSTALL) -m 775 -o $(PCP_USER) -g $(PCP_USER) -d $(PCP_LOG_DIR)/pmmgr
+ $(INSTALL) -m 775 -o $(PCP_USER) -g $(PCP_GROUP) -d $(PCP_LOG_DIR)/pmmgr
else
build-me:
@echo not building pmmgr
diff -Naurp a/src/pmmgr/pmmgr.cxx b/src/pmmgr/pmmgr.cxx
--- a/src/pmmgr/pmmgr.cxx 2014-01-08 17:25:46.000000000 +1100
+++ b/src/pmmgr/pmmgr.cxx 2014-02-26 14:34:37.559238008 +1100
@@ -1,18 +1,20 @@
/*
* Copyright (c) 2013-2014 Red Hat.
- *
+ *
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
- *
+ *
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*/
-
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600
+#endif
#include "pmmgr.h"
#include "impl.h"
@@ -23,6 +25,8 @@
#include <iostream>
extern "C" {
+#include <fcntl.h>
+#include <unistd.h>
#include <glob.h>
#include <sys/wait.h>
#ifdef HAVE_PTHREAD_H
@@ -40,6 +44,7 @@ using namespace std;
int quit;
+int polltime = 60;
// ------------------------------------------------------------------------
@@ -76,7 +81,7 @@ timestamp(ostream &o)
if (now2)
now2[19] = '\0'; // overwrite \n
- return o << "[" << (now2 ? now2 : "") << "] " << pmProgname << "("
+ return o << "[" << (now2 ? now2 : "") << "] " << pmProgname << "("
<< getpid()
#ifdef HAVE_PTHREAD_H
#ifdef IS_LINUX
@@ -89,13 +94,6 @@ timestamp(ostream &o)
}
-extern "C" int
-pmValue_compare (const void* a, const void* b)
-{
- return ((pmValue *)a)->inst - ((pmValue *)b)->inst;
-}
-
-
extern "C" void *
pmmgr_daemon_poll_thread (void* a)
{
@@ -105,6 +103,51 @@ pmmgr_daemon_poll_thread (void* a)
}
+// Run cmd via /bin/sh -c, like system(3), but respond quicker to interrupts and
+// standardize tracing.  Returns the raw waitpid() status, or -1 if fork() fails.
+int
+pmmgr_configurable::wrap_system(const std::string& cmd)
+{
+ if (pmDebug & DBG_TRACE_APPL0)
+ timestamp(cout) << "running " << cmd << endl;
+
+ int pid = fork();
+ if (pid == 0)
+ {
+ // child: execl() returns only on failure; its variadic list needs a pointer-typed terminator
+ int rc = execl ("/bin/sh", "sh", "-c", cmd.c_str(), (char *) NULL);
+ timestamp(cerr) << "failed to execl sh -c " << cmd << " rc=" << rc << endl;
+ _exit (1);
+ }
+ else if (pid < 0)
+ {
+ // error
+ timestamp(cerr) << "fork for " << cmd << " failed: errno=" << errno << endl;
+ return -1;
+ }
+ else
+ {
+ // parent
+ int status = -1;
+ int rc;
+ // Retry waitpid() when interrupted (EINTR), but stop waiting once quit is set.
+
+ do { rc = waitpid(pid, &status, 0); } while (!quit && rc == -1 && errno == EINTR); // TEMP_FAILURE_RETRY
+ if (quit && rc != pid) // only signal a child we have not reaped; its pid may otherwise be recycled
+ {
+ // Shutting down: do not keep waiting on the child, just ask it to terminate.
+ kill (pid, SIGTERM); // just to be on the safe side
+ // it might linger a few seconds in zombie mode
+ }
+
+ // status still holds its initial -1 if waitpid() did not store a result.
+ if (status != 0)
+ timestamp(cerr) << "system(" << cmd << ") failed: rc=" << status << endl;
+ return status;
+ }
+}
+
+
// ------------------------------------------------------------------------
@@ -255,15 +298,12 @@ pmmgr_job_spec::compute_hostid (const pc
continue;
// NB: after this point, 'continue' must also pmFreeResult(r)
+ // in-place sort value list by indom number
+ pmSortInstances(r);
+
// only vset[0] will be set, for csb->pmid
if (r->vset[0]->numval > 0)
{
- // in-place sort value list by indom number
- qsort (r->vset[0]->vlist,
- (size_t) r->vset[0]->numval,
- sizeof(pmValue),
- pmValue_compare);
-
for (int j=0; j<r->vset[0]->numval; j++) // iterate over instances
{
// fetch the string value
@@ -273,7 +313,7 @@ pmmgr_job_spec::compute_hostid (const pc
PM_TYPE_STRING, & av, PM_TYPE_STRING);
if (rc < 0)
continue;
-
+
// at last! we have a string we can accumulate
hostid_fields.push_back (av.cp);
free (av.cp);
@@ -356,10 +396,10 @@ pmmgr_job_spec::poll()
new_specs.insert(target_hosts[i]);
vector<string> target_discovery = get_config_multi("target-discovery");
- for (unsigned i=0; i<target_discovery.size(); i++)
+ for (unsigned i=0; i<target_discovery.size() && !quit; i++)
{
char **urls = NULL;
- const char *discovery = (target_discovery[i] == "")
+ const char *discovery = (target_discovery[i] == "")
? NULL
: target_discovery[i].c_str();
int numUrls = pmDiscoverServices (PM_SERVER_SERVICE_SPEC, discovery, &urls);
@@ -381,15 +421,29 @@ pmmgr_job_spec::poll()
known_targets.clear();
// phase 3: map the context-specs to hostids to find new hosts
+ map<pmmgr_hostid,double> known_target_scores;
for (set<pcp_context_spec>::iterator it = new_specs.begin();
- it != new_specs.end();
+ it != new_specs.end() && !quit;
++it)
{
+ struct timeval before, after;
+ __pmtimevalNow(& before);
pmmgr_hostid hostid = compute_hostid (*it);
+ __pmtimevalNow(& after);
+ double score = __pmtimevalSub(& after, & before); // the smaller, the preferreder
+
if (hostid != "") // verified existence/liveness
- known_targets[hostid] = *it;
- // NB: for hostid's with multiple specs, this logic will pick an
- // *arbitrary* one. Perhaps we want to tie-break deterministically.
+ {
+ if (pmDebug & DBG_TRACE_APPL0)
+ timestamp(cout) << "hostid " << hostid << " via " << *it << " time " << score << endl;
+
+ if (known_target_scores.find(hostid) == known_target_scores.end() ||
+ known_target_scores[hostid] > score) // previous slower than this one
+ {
+ known_targets[hostid] = *it;
+ known_target_scores[hostid] = score;
+ }
+ }
}
// phase 4a: compare old_known_targets vs. known_targets: look for any recently died
@@ -415,11 +469,11 @@ pmmgr_job_spec::poll()
// phase 5: poll all the live daemons
// NB: there is a parallelism opportunity, as running many pmlogconf/etc.'s in series
// is a possible bottleneck.
-#ifdef HAVE_PTHREAD_H
+#ifdef HAVE_PTHREAD_H
vector<pthread_t> threads;
#endif
for (multimap<pmmgr_hostid,pmmgr_daemon*>::iterator it = daemons.begin();
- it != daemons.end();
+ it != daemons.end() && !quit;
++it)
{
#ifdef HAVE_PTHREAD_H
@@ -468,7 +522,7 @@ pmmgr_job_spec::poll()
glob_t the_blob;
string glob_pattern = log_dir + (char)__pmPathSeparator() + "*";
- rc = glob (glob_pattern.c_str(),
+ rc = glob (glob_pattern.c_str(),
GLOB_NOESCAPE
#ifdef GLOB_ONLYDIR
| GLOB_ONLYDIR
@@ -476,10 +530,10 @@ pmmgr_job_spec::poll()
, NULL, & the_blob);
if (rc == 0)
{
- for (unsigned i=0; i<the_blob.gl_pathc; i++)
+ for (unsigned i=0; i<the_blob.gl_pathc && !quit; i++)
{
string item_name = the_blob.gl_pathv[i];
-
+
// Reject if currently live hostid
// NB: basename(3) might modify the argument string, so we don't feed
// it item_name.c_str().
@@ -496,11 +550,7 @@ pmmgr_job_spec::poll()
// <Janine Melnitz>We've got one!!!!!</>
timestamp(cout) << "gc subdirectory " << item_name << endl;
string cleanup_cmd = "/bin/rm -rf " + sh_quote(item_name);
- if (pmDebug & DBG_TRACE_APPL0)
- timestamp(cout) << "running " << cleanup_cmd << endl;
- rc = system(cleanup_cmd.c_str());
- if (rc != 0)
- timestamp(cerr) << "system(" << cleanup_cmd << ") failed: rc=" << rc << endl;
+ (void) wrap_system(cleanup_cmd);
}
}
}
@@ -546,18 +596,19 @@ pmmgr_job_spec::note_dead_hostid(const p
pmmgr_daemon::pmmgr_daemon(const std::string& config_directory,
- const pmmgr_hostid& hostid,
+ const pmmgr_hostid& hostid,
const pcp_context_spec& spec):
pmmgr_configurable(config_directory),
hostid(hostid),
spec(spec),
- pid(0)
+ pid(0),
+ last_restart_attempt(0)
{
}
pmmgr_pmlogger_daemon::pmmgr_pmlogger_daemon(const std::string& config_directory,
- const pmmgr_hostid& hostid,
+ const pmmgr_hostid& hostid,
const pcp_context_spec& spec):
pmmgr_daemon(config_directory, hostid, spec)
{
@@ -565,7 +616,7 @@ pmmgr_pmlogger_daemon::pmmgr_pmlogger_da
pmmgr_pmie_daemon::pmmgr_pmie_daemon(const std::string& config_directory,
- const pmmgr_hostid& hostid,
+ const pmmgr_hostid& hostid,
const pcp_context_spec& spec):
pmmgr_daemon(config_directory, hostid, spec)
{
@@ -596,7 +647,7 @@ void pmmgr_daemon::poll()
int rc = waitpid ((pid_t) pid, &ignored, WNOHANG);
rc = kill ((pid_t) pid, 0);
- if (rc < 0)
+ if (rc < 0)
{
if (pmDebug & DBG_TRACE_APPL0)
timestamp(cout) << "daemon pid " << pid << " found dead" << endl;
@@ -607,7 +658,32 @@ void pmmgr_daemon::poll()
if (pid == 0) // needs a restart
{
- string commandline = daemon_command_line();
+ time_t now;
+ time (& now);
+
+ // Prevent an error in the environment or the pmmgr daemon
+ // command lines from generating a tight loop of failure /
+ // retry, wasting time and log file space. Limit retry attempts
+ // to one per poll interval (pmmgr -p N parameter).
+ if (last_restart_attempt && (last_restart_attempt + polltime) >= now)
+ return; // quietly, without attempting to restart
+
+ string commandline = daemon_command_line(); // <--- may take many seconds!
+
+ // NB: Note this time as a restart attempt, even if daemon_command_line()
+ // returned an empty string, so that we don't try to restart it too soon.
+ // We note this time rather than the beginning of daemon_command_line(),
+ // to ensure at least polltime seconds of rest between attempts.
+ last_restart_attempt = now;
+
+ if (quit) return; // without starting the daemon process
+
+ if (commandline == "") // error in some intermediate processing stage
+ {
+ timestamp(cerr) << "failed to prepare daemon command line" << endl;
+ return;
+ }
+
if (pmDebug & DBG_TRACE_APPL0)
timestamp(cout) << "fork/exec sh -c " << commandline << endl;
pid = fork();
@@ -633,7 +709,7 @@ void pmmgr_daemon::poll()
}
-std::string
+std::string
pmmgr_pmlogger_daemon::daemon_command_line()
{
string default_log_dir =
@@ -648,7 +724,7 @@ pmmgr_pmlogger_daemon::daemon_command_li
(void) mkdir2 (host_log_dir.c_str(), 0777);
// (errors creating actual files under host_log_dir will be noted shortly)
- string pmlogger_command =
+ string pmlogger_command =
string(pmGetConfig("PCP_BIN_DIR")) + (char)__pmPathSeparator() + "pmlogger";
string pmlogger_options = sh_quote(pmlogger_command);
pmlogger_options += " " + get_config_single ("pmlogger");
@@ -658,20 +734,17 @@ pmmgr_pmlogger_daemon::daemon_command_li
{
string pmlogconf_output_file = host_log_dir + (char)__pmPathSeparator() + "config.pmlogger";
(void) unlink (pmlogconf_output_file.c_str());
- string pmlogconf_command =
+ string pmlogconf_command =
string(pmGetConfig("PCP_BINADM_DIR")) + (char)__pmPathSeparator() + "pmlogconf";
- string pmlogconf_options =
+ string pmlogconf_options =
sh_quote(pmlogconf_command)
+ " -c -r -h " + sh_quote(spec)
+ " " + get_config_single ("pmlogconf")
+ " " + sh_quote(pmlogconf_output_file)
+ " >/dev/null"; // pmlogconf is too chatty
-
- if (pmDebug & DBG_TRACE_APPL0)
- timestamp(cout) << "running " << pmlogconf_options << endl;
- int rc = system(pmlogconf_options.c_str());
- if (rc != 0)
- timestamp(cerr) << "system(" << pmlogconf_options << ") failed: rc=" << rc << endl;
+
+ int rc = wrap_system(pmlogconf_options);
+ if (rc) return "";
pmlogger_options += " -c " + sh_quote(pmlogconf_output_file);
}
@@ -688,61 +761,176 @@ pmmgr_pmlogger_daemon::daemon_command_li
// do log merging
if (get_config_exists ("pmlogmerge"))
{
- string pmlogextract_command =
+ string pmlogextract_command =
string(pmGetConfig("PCP_BIN_DIR")) + (char)__pmPathSeparator() + "pmlogextract";
- string pmlogcheck_command =
+ string pmlogcheck_command =
string(pmGetConfig("PCP_BIN_DIR")) + (char)__pmPathSeparator() + "pmlogcheck";
+ string pmlogrewrite_command =
+ string(pmGetConfig("PCP_BINADM_DIR")) + (char)__pmPathSeparator() + "pmlogrewrite";
+
string pmlogextract_options = sh_quote(pmlogextract_command);
string retention = get_config_single ("pmlogmerge-retain");
if (retention == "") retention = "14days";
+ struct timeval retention_tv;
+ char *errmsg;
+ int rc = pmParseInterval(retention.c_str(), &retention_tv, &errmsg);
+ if (rc)
+ {
+ timestamp(cerr) << "pmlogmerge-retain '" << retention << "' parse error: " << errmsg << endl;
+ free (errmsg);
+ retention = "14days";
+ retention_tv.tv_sec = 14*24*60*60;
+ retention_tv.tv_usec = 0;
+ }
pmlogextract_options += " -S -" + sh_quote(retention);
// Arrange our new pmlogger to kill itself after the given
// period, to give us a chance to rerun.
string period = get_config_single ("pmlogmerge");
if (period == "") period = "24hours";
- pmlogger_options += " -s " + sh_quote(period);
-
- // Find prior archives by globbing for *.index files,
- // just like pmlogger_merge does
- vector<string> old_archives;
+ struct timeval period_tv;
+ rc = pmParseInterval(period.c_str(), &period_tv, &errmsg);
+ if (rc)
+ {
+ timestamp(cerr) << "pmlogmerge '" << period << "' parse error: " << errmsg << endl;
+ free (errmsg);
+ period = "24hours";
+ period_tv.tv_sec = 24*60*60;
+ period_tv.tv_usec = 0;
+ }
+ if (get_config_exists ("pmlogmerge-granular"))
+ {
+ // adjust stopping time to the next multiple of period
+ struct timeval now_tv;
+ __pmtimevalNow (&now_tv);
+ time_t period_s = period_tv.tv_sec;
+ if (period_s < 1) period_s = 1; // at least one second
+ time_t period_end = ((now_tv.tv_sec + period_s - 1) / period_s) * period_s;
+ period = string(" @") +
+ string(ctime(& period_end)).substr(0,24); // 24: ctime(3) magic value, sans \n
+ }
+ pmlogger_options += " -y -T " + sh_quote(period); // NB: pmmgr host local time!
+
+ // Find prior archives by globbing for *.index files,
+ // just like pmlogger_merge does.
+ // Er ... but aren't .index files optional?
+ vector<string> mergeable_archives; // those to merge
glob_t the_blob;
string glob_pattern = host_log_dir + (char)__pmPathSeparator() + "*.index";
- int rc = glob (glob_pattern.c_str(), GLOB_NOESCAPE, NULL, & the_blob);
+ rc = glob (glob_pattern.c_str(), GLOB_NOESCAPE, NULL, & the_blob);
if (rc == 0)
{
+ // compute the bounds of the most recently completed merge period
+ struct timeval now_tv;
+ __pmtimevalNow (&now_tv);
+ time_t period_s = period_tv.tv_sec;
+ if (period_s < 1) period_s = 1; // at least one second
+ time_t prior_period_start = ((now_tv.tv_sec - period_s) / period_s) * period_s;
+ time_t prior_period_end = prior_period_start + period_s;
+
for (unsigned i=0; i<the_blob.gl_pathc; i++)
{
+ if (quit) return "";
+
string index_name = the_blob.gl_pathv[i];
string base_name = index_name.substr(0,index_name.length()-6); // trim .index
+ // Manage retention based upon the stat timestamps of the .index file,
+ // because the archives might be so corrupt that even loglabel-based
+ // checks could fail. Non-corrupt archives will have already been merged
+ // into a fresher archive.
+ struct stat foo;
+ rc = stat (the_blob.gl_pathv[i], & foo);
+ if (rc)
+ {
+ // this apprx. can't happen
+ timestamp(cerr) << "stat '" << the_blob.gl_pathv[i] << "' error; skipping cleanup" << endl;
+ continue; // likely nothing can be done to this one
+ }
+ else if ((foo.st_mtime + retention_tv.tv_sec) < now_tv.tv_sec)
+ {
+ string bnq = sh_quote(base_name);
+ string cleanup_cmd = string("/bin/rm -f")
+ + " " + bnq + ".[0-9]*"
+ + " " + bnq + ".index" +
+ + " " + bnq + ".meta";
+
+ (void) wrap_system(cleanup_cmd);
+ continue; // it's gone now; don't try to merge it or anything
+ }
+
+ if (quit) return "";
+
// sic pmlogcheck on it; if it is broken, pmlogextract
// will give up and make no progress
-
string pmlogcheck_options = sh_quote(pmlogcheck_command);
pmlogcheck_options += " " + sh_quote(base_name) + " >/dev/null";
- if (pmDebug & DBG_TRACE_APPL0)
- timestamp(cout) << "running " << pmlogcheck_options << endl;
- rc = system(pmlogcheck_options.c_str());
+ rc = wrap_system(pmlogcheck_options);
if (rc != 0)
{
- timestamp(cerr) << "system(" << pmlogcheck_options << ") failed: rc=" << rc << endl;
timestamp(cerr) << "corrupt archive " << base_name << " preserved." << endl;
continue;
}
- // XXX: pmlogrewrite here
+ if (quit) return "";
+
+ // In granular mode, skip if this file is too old or too new. NB: Decide
+ // based upon the log-label, not fstat timestamps, since files postdate
+ // the time region they cover.
+ if (get_config_exists ("pmlogmerge-granular"))
+ {
+ // One could do this the pmloglabel(1) __pmLog* way,
+ // rather than the pmlogsummary(1) PMAPI way.
+
+ int ctx = pmNewContext(PM_CONTEXT_ARCHIVE, base_name.c_str()); // open the archive to inspect its label
+ if (ctx < 0)
+ continue; // skip; gc later (no context was created, nothing to release)
+
+ pmLogLabel label;
+ rc = pmGetArchiveLabel (& label);
+ if (rc < 0)
+ { pmDestroyContext (ctx); continue; } // release the context before skipping; gc later
+
+ if (label.ll_start.tv_sec >= prior_period_end) // archive too new?
+ {
+ if (pmDebug & DBG_TRACE_APPL0)
+ timestamp(cout) << "skipping merge of too-new archive " << base_name << endl;
+ pmDestroyContext (ctx);
+ continue;
+ }
+
+ struct timeval archive_end;
+ rc = pmGetArchiveEnd(&archive_end);
+ if (rc < 0)
+ {
+ pmDestroyContext (ctx);
+ continue; // skip; gc later
+ }
+
+ if (archive_end.tv_sec < prior_period_start) // archive too old?
+ {
+ if (pmDebug & DBG_TRACE_APPL0)
+ timestamp(cout) << "skipping merge of too-old archive " << base_name << endl;
+ pmDestroyContext (ctx);
+ continue; // skip; gc later
+ }
- old_archives.push_back (base_name);
+ pmDestroyContext (ctx);
+ // fallthrough: the archive intersects the prior_period_{start,end} interval
+
+ // XXX: What happens for archives that span across granular periods?
+ }
+
+ mergeable_archives.push_back (base_name);
}
globfree (& the_blob);
}
- string timestr = "merged-archive";
+ string timestr = "archive";
time_t now2 = time(NULL);
struct tm *now = gmtime(& now2);
if (now != NULL)
@@ -754,42 +942,52 @@ pmmgr_pmlogger_daemon::daemon_command_li
}
string merged_archive_name = host_log_dir + (char)__pmPathSeparator() + timestr;
- if (old_archives.size() > 1) // 1 or 0 are not worth merging!
+ if (mergeable_archives.size() > 1) // 1 or 0 are not worth merging!
{
// assemble final bits of pmlogextract command line: the inputs and the output
- for (unsigned i=0; i<old_archives.size(); i++)
- pmlogextract_options += " " + sh_quote(old_archives[i]);
+ for (unsigned i=0; i<mergeable_archives.size(); i++)
+ {
+ if (quit) return "";
- pmlogextract_options += " " + sh_quote(merged_archive_name);
+ if (get_config_exists("pmlogmerge-rewrite"))
+ {
+ string pmlogrewrite_options = sh_quote(pmlogrewrite_command);
+ pmlogrewrite_options += " -i " + get_config_single("pmlogmerge-rewrite");
+ pmlogrewrite_options += " " + sh_quote(mergeable_archives[i]);
+
+ (void) wrap_system(pmlogrewrite_options.c_str());
+ // In case of error, don't break; let's try to merge it anyway.
+ // Maybe pmlogrewrite will succeed and will get rid of this file.
+ }
- if (pmDebug & DBG_TRACE_APPL0)
- timestamp(cout) << "running " << pmlogextract_options << endl;
+ pmlogextract_options += " " + sh_quote(mergeable_archives[i]);
+ }
- rc = system(pmlogextract_options.c_str());
- if (rc != 0)
- // will try again later
- timestamp(cerr) << "system(" << pmlogextract_options << ") failed: rc=" << rc << endl;
- else
+ if (quit) return "";
+
+ pmlogextract_options += " " + sh_quote(merged_archive_name);
+
+ rc = wrap_system(pmlogextract_options.c_str());
+ if (rc == 0)
{
- // zap the previous archive files
- for (unsigned i=0; i<old_archives.size(); i++)
+ // zap the previous archive files
+ //
+ // Don't skip this upon "if (quit)", since the new merged archive is already complete;
+ // it'd be a waste to keep these files around for a future re-merge.
+ for (unsigned i=0; i<mergeable_archives.size(); i++)
{
- string base_name = sh_quote(old_archives[i]);
+ string base_name = sh_quote(mergeable_archives[i]);
string cleanup_cmd = string("/bin/rm -f")
+ " " + base_name + ".[0-9]*"
- + " " + base_name + ".index" +
+ + " " + base_name + ".index" +
+ " " + base_name + ".meta";
- if (pmDebug & DBG_TRACE_APPL0)
- timestamp(cout) << "running " << cleanup_cmd << endl;
- rc = system(cleanup_cmd.c_str());
- if (rc != 0)
- timestamp(cerr) << "system(" << cleanup_cmd << ") failed: rc=" << rc << endl;
+ (void) wrap_system(cleanup_cmd.c_str());
}
}
}
}
-
+
// synthesize a logfile name similarly as pmlogger_check, but add %S (seconds)
// to reduce likelihood of conflict with a short poll interval
string timestr = "archive";
@@ -800,7 +998,7 @@ pmmgr_pmlogger_daemon::daemon_command_li
char timestr2[100];
int rc = strftime(timestr2, sizeof(timestr2), "-%Y%m%d.%H%M%S", now);
if (rc > 0)
- timestr += timestr2;
+ timestr += timestr2; // no sh_quote required
}
// last argument
@@ -810,7 +1008,7 @@ pmmgr_pmlogger_daemon::daemon_command_li
}
-std::string
+std::string
pmmgr_pmie_daemon::daemon_command_line()
{
string default_log_dir =
@@ -825,7 +1023,7 @@ pmmgr_pmie_daemon::daemon_command_line()
(void) mkdir2 (host_log_dir.c_str(), 0777);
// (errors creating actual files under host_log_dir will be noted shortly)
- string pmie_command =
+ string pmie_command =
string(pmGetConfig("PCP_BIN_DIR")) + (char)__pmPathSeparator() + "pmie";
string pmie_options = sh_quote (pmie_command);
@@ -835,29 +1033,29 @@ pmmgr_pmie_daemon::daemon_command_line()
if (get_config_exists ("pmieconf"))
{
string pmieconf_output_file = host_log_dir + (char)__pmPathSeparator() + "config.pmie";
- string pmieconf_command =
+ string pmieconf_command =
string(pmGetConfig("PCP_BIN_DIR")) + (char)__pmPathSeparator() + "pmieconf";
// NB: pmieconf doesn't take a host name as an argument, unlike pmlogconf
- string pmieconf_options =
+ string pmieconf_options =
sh_quote(pmieconf_command)
+ " -F -c " + get_config_single ("pmieconf")
+ " -f " + sh_quote(pmieconf_output_file);
- if (pmDebug & DBG_TRACE_APPL0)
- timestamp(cout) << "running " << pmieconf_options << endl;
- int rc = system(pmieconf_options.c_str());
- if (rc != 0)
- timestamp(cerr) << "system(" << pmieconf_options << ") failed: rc=" << rc << endl;
+
+ int rc = wrap_system(pmieconf_options.c_str());
+ if (rc) return "";
pmie_options += "-c " + sh_quote(pmieconf_output_file);
}
+ if (quit) return "";
+
// collect -h direction
pmie_options += " -h " + sh_quote(spec);
// collect -f, to get it to run in the foreground, avoid setuid
pmie_options += " -f";
-
+
// collect subsidiary pmlogger diagnostics
pmie_options += " -l " + sh_quote(host_log_dir + (char)__pmPathSeparator() + "pmie.log");
@@ -876,10 +1074,10 @@ void handle_interrupt (int sig)
// recursive signals or whatnot, despite sa_mask in
// setup_signals()).
if (quit == 0)
- kill(-getpid(), sig);
+ kill(-getpid(), SIGTERM);
quit ++;
- if (quit > 2)
+ if (quit > 3) // ignore 1 from user; 1 from kill(-getpid) above; 1 from same near main() exit
{
char msg[] = "Too many interrupts received, exiting.\n";
int rc = write (2, msg, sizeof(msg)-1);
@@ -928,21 +1126,22 @@ void setup_signals()
int main (int argc, char *argv[])
{
+ /* Become our own process group, to assist signal passing to children. */
+ setpgid(getpid(), 0);
__pmSetProgname(argv[0]);
-
setup_signals();
- string default_config_dir =
+ string default_config_dir =
string(pmGetConfig("PCP_SYSCONF_DIR")) + (char)__pmPathSeparator() + "pmmgr";
vector<pmmgr_job_spec*> js;
int c;
- int polltime = 60;
char* username_str;
__pmGetUsername(& username_str);
string username = username_str;
+ char* output_filename = NULL;
- while ((c = getopt(argc, argv, "D:c:vp:U:h")) != EOF)
+ while ((c = getopt(argc, argv, "D:c:vp:U:l:h")) != EOF)
{
switch (c)
{
@@ -950,13 +1149,17 @@ int main (int argc, char *argv[])
(void) __pmParseDebug(optarg);
break;
+ case 'l':
+ output_filename = optarg;
+ break;
+
case 'v':
pmDebug |= DBG_TRACE_APPL0;
break;
case 'p':
polltime = atoi(optarg);
- if (polltime <= 0)
+ if (polltime <= 0)
{
cerr << "Poll time too short." << endl;
exit(1);
@@ -994,8 +1197,34 @@ int main (int argc, char *argv[])
// lose root privileges if we have them
__pmSetProcessIdentity(username.c_str());
+ // (re)create log file, redirect stdout/stderr
+ // NB: must be done after __pmSetProcessIdentity() for proper file permissions
+ if (output_filename)
+ {
+ int fd;
+ (void) unlink (output_filename); // in case one's left over from a previous other-uid run
+ fd = open (output_filename, O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, 0666);
+ if (fd < 0)
+ timestamp(cerr) << "Cannot re-create logfile " << output_filename << endl;
+ else
+ {
+ int rc;
+ // Move the new file descriptors on top of stdout/stderr
+ rc = dup2 (fd, STDOUT_FILENO);
+ if (rc < 0) // rather unlikely
+ timestamp(cerr) << "Cannot redirect logfile to stdout" << endl;
+ rc = dup2 (fd, STDERR_FILENO);
+ if (rc < 0) // rather unlikely
+ timestamp(cerr) << "Cannot redirect logfile to stderr" << endl;
+ rc = close (fd);
+ if (rc < 0) // rather unlikely
+ timestamp(cerr) << "Cannot close logfile fd" << endl;
+ }
+
+ }
+
timestamp(cout) << "Log started" << endl;
- while (1)
+ while (! quit)
{
// In this section, we must not fidget with SIGCHLD, due to use of system(3).
for (unsigned i=0; i<js.size() && !quit; i++)
@@ -1014,10 +1243,15 @@ int main (int argc, char *argv[])
(void) signal (SIGALRM, SIG_DFL);
}
+ // NB: don't let this cleanup be interrupted by pending-quit signals;
+ // we want the daemon pid's killed.
for (unsigned i=0; i<js.size(); i++)
delete js[i];
- // XXX: send a suicide signal to the process group
timestamp(cout) << "Log finished" << endl;
+
+ // Send a last-gasp signal out, just in case daemons somehow missed
+ kill(-getpid(), SIGTERM);
+
return 0;
}
diff -Naurp a/src/pmmgr/pmmgr.h b/src/pmmgr/pmmgr.h
--- a/src/pmmgr/pmmgr.h 2014-01-08 17:25:46.000000000 +1100
+++ b/src/pmmgr/pmmgr.h 2014-02-26 14:34:37.567238053 +1100
@@ -46,6 +46,7 @@ protected:
std::string config_directory;
std::ostream& timestamp(std::ostream&);
+ int wrap_system(const std::string& cmd);
};
@@ -64,6 +65,7 @@ protected:
pmmgr_hostid hostid;
pcp_context_spec spec;
int pid;
+ time_t last_restart_attempt;
virtual std::string daemon_command_line() = 0;
};
diff -Naurp a/src/pmmgr/pmmgr.options b/src/pmmgr/pmmgr.options
--- a/src/pmmgr/pmmgr.options 2014-01-08 17:25:46.000000000 +1100
+++ b/src/pmmgr/pmmgr.options 2014-02-26 14:34:37.576238032 +1100
@@ -15,6 +15,9 @@
# assume identity of some user other than "pcp"
# -U foobar
+# make log go someplace else
+# -l /some/place/else
+
# setting of environment variables for pmmgr
# timeouts for interactions with pmcd on behalf of clients
diff -Naurp a/src/pmmgr/rc_pmmgr b/src/pmmgr/rc_pmmgr
--- a/src/pmmgr/rc_pmmgr 2014-01-08 17:25:46.000000000 +1100
+++ b/src/pmmgr/rc_pmmgr 2014-02-26 14:34:37.584237959 +1100
@@ -215,7 +215,11 @@ case "$1" in
Error: pmmgr control file "$PMMGROPTS" is missing, cannot start pmmgr.'
exit
fi
- [ ! -d $RUNDIR ] && mkdir -p $RUNDIR
+ if [ ! -d "$RUNDIR" ]
+ then
+ mkdir -p -m 775 "$RUNDIR"
+ chown $PCP_USER:$PCP_GROUP "$RUNDIR"
+ fi
cd $RUNDIR
# salvage the previous versions of any pmmgr
@@ -226,7 +230,7 @@ Error: pmmgr control file "$PMMGROPTS" i
mv pmmgr.log pmmgr.log.prev
fi
- $ECHO $PCP_ECHO_N "Performance Co-Pilot starting pmmgr (logfile is $RUNDIR/pmmgr.log) ..." "$PCP_ECHO_C"
+ $ECHO $PCP_ECHO_N "Starting pmmgr ..." "$PCP_ECHO_C"
# options file processing ...
# only consider lines which start with a hyphen
# get rid of the -f option
@@ -250,7 +254,7 @@ BEGIN { exports="" }
}
END { if (exports != "") print "export", exports }'`
- $PMMGR $OPTS > pmmgr.log 2>&1 &
+ $PMMGR -l pmmgr.log $OPTS &
$RC_STATUS -v
pmpost "start pmmgr from $pmprog"
diff -Naurp a/src/pmmgr/TODO b/src/pmmgr/TODO
--- a/src/pmmgr/TODO 2014-01-13 13:27:28.000000000 +1100
+++ b/src/pmmgr/TODO 2014-02-26 14:34:37.593238086 +1100
@@ -1,8 +1,12 @@
-- pmlogrewrite
-- log aging without logmerge (which is high on I/O)
+- pmmgr.1 EXAMPLE CONFIGURATIONS
+- optionally delay pm*conf
+- pmlogreduce
+- log aging in background while new pmlogger's already running
- old log compression (until we get libpcp zlib or something)
- email error reporting?
-- test buildability on non-linux
- qa
- port to mingw?
+- port to cygwin?
- pmlogger/pmie .log rotation
+- pid->pid_t cleanup
+