Blob Blame History Raw
commit e79e5040a0e7efd622ecdd572bee40c90e59c3bd
Author: Miroslav Lichvar <mlichvar@redhat.com>
Date:   Fri Apr 13 17:11:58 2018 +0200

    timemaster: restart terminated processes.
    
    If a ptp4l or phc2sys process is terminated (e.g. due to a crash) and
    timemaster was running for at least one second (i.e. it's not an error
    in ptp4l/phc2sys configuration), start the process again. Restart all
    processes corresponding to the same time source at the same time to
    ensure phc2sys is always connected to the currently running ptp4l.
    
    Add a new option to disable the restarting.
    
    Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>

diff --git a/timemaster.8 b/timemaster.8
index e0e22eb..7288972 100644
--- a/timemaster.8
+++ b/timemaster.8
@@ -87,6 +87,16 @@ Specify the first number in a sequence of SHM segments that will be used by
 can be useful to avoid conflicts with time sources that are not started by
 \fBtimemaster\fR, e.g. \fBgpsd\fR using segments number 0 and 1.
 
+.TP
+.B restart_processes
+Enable or disable restarting of processes started by \fBtimemaster\fR. If the
+option is set to a non-zero value, all processes except \fBchronyd\fR and
+\fBntpd\fR will be automatically restarted when they terminate, provided that
+\fBtimemaster\fR has been running for at least one second (i.e. the termination
+is unlikely to be caused by a configuration error). If a terminated process is
+not started again, \fBtimemaster\fR will kill the other processes and exit with
+a non-zero status. The default value is 1 (enabled).
+
 .SS [ntp_server address]
 
 The \fBntp_server\fR section specifies an NTP server that should be used as a
@@ -318,6 +328,7 @@ ptp4l_option delay_mechanism P2P
 ntp_program chronyd
 rundir /var/run/timemaster
 first_shm_segment 1
+restart_processes 0
 
 [chronyd]
 path /usr/sbin/chronyd
diff --git a/timemaster.c b/timemaster.c
index fc3ba31..4ba921e 100644
--- a/timemaster.c
+++ b/timemaster.c
@@ -44,6 +44,7 @@
 #define DEFAULT_RUNDIR "/var/run/timemaster"
 
 #define DEFAULT_FIRST_SHM_SEGMENT 0
+#define DEFAULT_RESTART_PROCESSES 1
 
 #define DEFAULT_NTP_PROGRAM CHRONYD
 #define DEFAULT_NTP_MINPOLL 6
@@ -108,6 +109,7 @@ struct timemaster_config {
 	enum ntp_program ntp_program;
 	char *rundir;
 	int first_shm_segment;
+	int restart_processes;
 	struct program_config chronyd;
 	struct program_config ntpd;
 	struct program_config phc2sys;
@@ -122,6 +124,9 @@ struct config_file {
 struct script {
 	struct config_file **configs;
 	char ***commands;
+	int **command_groups;
+	int restart_groups;
+	int no_restart_group;
 };
 
 static void free_parray(void **a)
@@ -385,6 +390,8 @@ static int parse_timemaster_settings(char **settings,
 			replace_string(value, &config->rundir);
 		} else if (!strcasecmp(name, "first_shm_segment")) {
 			r = parse_int(value, &config->first_shm_segment);
+		} else if (!strcasecmp(name, "restart_processes")) {
+			r = parse_int(value, &config->restart_processes);
 		} else {
 			pr_err("unknown timemaster setting %s", name);
 			return 1;
@@ -508,6 +515,7 @@ static struct timemaster_config *config_parse(char *path)
 	config->ntp_program = DEFAULT_NTP_PROGRAM;
 	config->rundir = xstrdup(DEFAULT_RUNDIR);
 	config->first_shm_segment = DEFAULT_FIRST_SHM_SEGMENT;
+	config->restart_processes = DEFAULT_RESTART_PROCESSES;
 
 	init_program_config(&config->chronyd, "chronyd",
 			    NULL, DEFAULT_CHRONYD_SETTINGS, NULL);
@@ -632,6 +640,18 @@ static char *get_refid(char *prefix, unsigned int number)
 	return NULL;
 };
 
+static void add_command(char **command, int command_group,
+			struct script *script)
+{
+	int *group;
+
+	parray_append((void ***)&script->commands, command);
+
+	group = xmalloc(sizeof(int));
+	*group = command_group;
+	parray_append((void ***)&script->command_groups, group);
+}
+
 static void add_shm_source(int shm_segment, int poll, int dpoll, double delay,
 			   char *ntp_options, char *prefix,
 			   struct timemaster_config *config, char **ntp_config)
@@ -671,8 +691,8 @@ static int add_ntp_source(struct ntp_server *source, char **ntp_config)
 
 static int add_ptp_source(struct ptp_domain *source,
 			  struct timemaster_config *config, int *shm_segment,
-			  int ***allocated_phcs, char **ntp_config,
-			  struct script *script)
+			  int *command_group, int ***allocated_phcs,
+			  char **ntp_config, struct script *script)
 {
 	struct config_file *config_file;
 	char **command, *uds_path, **interfaces, *message_tag;
@@ -798,19 +818,19 @@ static int add_ptp_source(struct ptp_domain *source,
 			/* HW time stamping */
 			command = get_ptp4l_command(&config->ptp4l, config_file,
 						    interfaces, 1);
-			parray_append((void ***)&script->commands, command);
+			add_command(command, *command_group, script);
 
 			command = get_phc2sys_command(&config->phc2sys,
 						      source->domain,
 						      source->phc2sys_poll,
 						      *shm_segment, uds_path,
 						      message_tag);
-			parray_append((void ***)&script->commands, command);
+			add_command(command, (*command_group)++, script);
 		} else {
 			/* SW time stamping */
 			command = get_ptp4l_command(&config->ptp4l, config_file,
 						    interfaces, 0);
-			parray_append((void ***)&script->commands, command);
+			add_command(command, (*command_group)++, script);
 
 			string_appendf(&config_file->content,
 				       "clock_servo ntpshm\n"
@@ -862,7 +882,8 @@ static char **get_ntpd_command(struct program_config *config,
 }
 
 static struct config_file *add_ntp_program(struct timemaster_config *config,
-					   struct script *script)
+					   struct script *script,
+					   int command_group)
 {
 	struct config_file *ntp_config = xmalloc(sizeof(*ntp_config));
 	char **command = NULL;
@@ -886,7 +907,7 @@ static struct config_file *add_ntp_program(struct timemaster_config *config,
 	}
 
 	parray_append((void ***)&script->configs, ntp_config);
-	parray_append((void ***)&script->commands, command);
+	add_command(command, command_group, script);
 
 	return ntp_config;
 }
@@ -894,6 +915,7 @@ static struct config_file *add_ntp_program(struct timemaster_config *config,
 static void script_destroy(struct script *script)
 {
 	char ***commands, **command;
+	int **groups;
 	struct config_file *config, **configs;
 
 	for (configs = script->configs; *configs; configs++) {
@@ -911,6 +933,10 @@ static void script_destroy(struct script *script)
 	}
 	free(script->commands);
 
+	for (groups = script->command_groups; *groups; groups++)
+		free(*groups);
+	free(script->command_groups);
+
 	free(script);
 }
 
@@ -920,12 +946,15 @@ static struct script *script_create(struct timemaster_config *config)
 	struct source *source, **sources;
 	struct config_file *ntp_config = NULL;
 	int **allocated_phcs = (int **)parray_new();
-	int ret = 0, shm_segment;
+	int ret = 0, shm_segment, command_group = 0;
 
 	script->configs = (struct config_file **)parray_new();
 	script->commands = (char ***)parray_new();
+	script->command_groups = (int **)parray_new();
+	script->no_restart_group = command_group;
+	script->restart_groups = config->restart_processes;
 
-	ntp_config = add_ntp_program(config, script);
+	ntp_config = add_ntp_program(config, script, command_group++);
 	shm_segment = config->first_shm_segment;
 
 	for (sources = config->sources; (source = *sources); sources++) {
@@ -936,7 +965,7 @@ static struct script *script_create(struct timemaster_config *config)
 			break;
 		case PTP_DOMAIN:
 			if (add_ptp_source(&source->ptp, config, &shm_segment,
-					   &allocated_phcs,
+					   &command_group, &allocated_phcs,
 					   &ntp_config->content, script))
 				ret = 1;
 			break;
@@ -1063,10 +1092,11 @@ static int remove_config_files(struct config_file **configs)
 
 static int script_run(struct script *script)
 {
+	struct timespec ts_start, ts_now;
 	sigset_t mask, old_mask;
 	siginfo_t info;
 	pid_t pid, *pids;
-	int i, num_commands, status, ret = 0;
+	int i, group, num_commands, status, quit = 0, ret = 0;
 
 	for (num_commands = 0; script->commands[num_commands]; num_commands++)
 		;
@@ -1101,7 +1131,9 @@ static int script_run(struct script *script)
 		}
 	}
 
-	/* wait for one of the blocked signals */
+	clock_gettime(CLOCK_MONOTONIC, &ts_start);
+
+	/* process the blocked signals */
 	while (1) {
 		if (sigwaitinfo(&mask, &info) < 0) {
 			if (errno == EINTR)
@@ -1110,36 +1142,111 @@ static int script_run(struct script *script)
 			break;
 		}
 
-		/*
-		 * assume only the first process (i.e. chronyd or ntpd) is
-		 * essential and continue if other processes terminate
-		 */
-		if (info.si_signo == SIGCHLD && info.si_pid != pids[0]) {
-			pr_info("process %d terminated (ignored)", info.si_pid);
+		clock_gettime(CLOCK_MONOTONIC, &ts_now);
+
+		if (info.si_signo != SIGCHLD) {
+			if (quit)
+				continue;
+
+			quit = 1;
+			pr_debug("exiting on signal %d", info.si_signo);
+
+			/* terminate remaining processes */
+			for (i = 0; i < num_commands; i++) {
+				if (pids[i] > 0) {
+					pr_debug("killing process %d", pids[i]);
+					kill(pids[i], SIGTERM);
+				}
+			}
+
 			continue;
 		}
 
-		pr_info("received signal %d", info.si_signo);
-		break;
-	}
+		/* wait for all terminated processes */
+		while (1) {
+			pid = waitpid(-1, &status, WNOHANG);
+			if (pid <= 0)
+				break;
 
-	/* kill all started processes */
-	for (i = 0; i < num_commands; i++) {
-		if (pids[i] > 0) {
-			pr_debug("killing process %d", pids[i]);
-			kill(pids[i], SIGTERM);
+			if (!WIFEXITED(status)) {
+				pr_info("process %d terminated abnormally",
+					pid);
+			} else {
+				pr_info("process %d terminated with status %d",
+					pid, WEXITSTATUS(status));
+			}
+
+			for (i = 0; i < num_commands; i++) {
+				if (pids[i] == pid)
+					pids[i] = 0;
+			}
 		}
-	}
 
-	while ((pid = wait(&status)) >= 0) {
-		if (!WIFEXITED(status)) {
-			pr_info("process %d terminated abnormally", pid);
-			ret = 1;
-		} else {
-			if (WEXITSTATUS(status))
+		/* wait for all processes to terminate when exiting */
+		if (quit) {
+			for (i = 0; i < num_commands; i++) {
+				if (pids[i])
+					break;
+			}
+			if (i == num_commands)
+				break;
+
+			pr_debug("waiting for other processes to terminate");
+			continue;
+		}
+
+		/*
+		 * terminate (and then restart if allowed) all processes in
+		 * groups that have a terminated process
+		 */
+		for (group = 0; group < num_commands; group++) {
+			int terminated = 0, running = 0;
+
+			for (i = 0; i < num_commands; i++) {
+				if (*(script->command_groups[i]) != group)
+					continue;
+				if (pids[i])
+					running++;
+				else
+					terminated++;
+			}
+
+			if (!terminated)
+				continue;
+
+			/*
+			 * exit with a non-zero status if the group should not
+			 * be restarted (i.e. chronyd/ntpd), timemaster has
+			 * been running only for a short time (so the failure
+			 * is likely a configuration error), or restarting is
+			 * disabled completely
+			 */
+			if (group == script->no_restart_group ||
+			    ts_now.tv_sec - ts_start.tv_sec <= 1 ||
+			    !script->restart_groups) {
+				kill(getpid(), SIGTERM);
 				ret = 1;
-			pr_info("process %d terminated with status %d", pid,
-				WEXITSTATUS(status));
+				break;
+			}
+
+			for (i = 0; i < num_commands; i++) {
+				if (*(script->command_groups[i]) != group)
+					continue;
+
+				/* terminate all processes in the group first */
+				if (running && pids[i]) {
+					pr_debug("killing process %d", pids[i]);
+					kill(pids[i], SIGTERM);
+				} else if (!running && !pids[i]) {
+					pids[i] = start_program(script->commands[i],
+								&old_mask);
+					if (!pids[i])
+						kill(getpid(), SIGTERM);
+
+					/* limit restarting rate */
+					sleep(1);
+				}
+			}
 		}
 	}
 
@@ -1154,6 +1261,7 @@ static int script_run(struct script *script)
 static void script_print(struct script *script)
 {
 	char ***commands, **command;
+	int **groups;
 	struct config_file *config, **configs;
 
 	for (configs = script->configs; *configs; configs++) {
@@ -1162,7 +1270,9 @@ static void script_print(struct script *script)
 	}
 
 	fprintf(stderr, "commands:\n\n");
-	for (commands = script->commands; *commands; commands++) {
+	for (commands = script->commands, groups = script->command_groups;
+	     *commands; commands++, groups++) {
+		fprintf(stderr, "[%d] ", **groups);
 		for (command = *commands; *command; command++)
 			fprintf(stderr, "%s ", *command);
 		fprintf(stderr, "\n");