c1aeb8
commit e79e5040a0e7efd622ecdd572bee40c90e59c3bd
c1aeb8
Author: Miroslav Lichvar <mlichvar@redhat.com>
c1aeb8
Date:   Fri Apr 13 17:11:58 2018 +0200
c1aeb8
c1aeb8
    timemaster: restart terminated processes.
c1aeb8
    
c1aeb8
    If a ptp4l or phc2sys process is terminated (e.g. due to a crash) and
c1aeb8
    timemaster was running for at least one second (i.e. it's not an error
c1aeb8
    in ptp4l/phc2sys configuration), start the process again. Restart all
c1aeb8
    processes corresponding to the same time source at the same time to
c1aeb8
    ensure phc2sys is always connected to the currently running ptp4l.
c1aeb8
    
c1aeb8
    Add a new option to disable the restarting.
c1aeb8
    
c1aeb8
    Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
c1aeb8
c1aeb8
diff --git a/timemaster.8 b/timemaster.8
c1aeb8
index e0e22eb..7288972 100644
c1aeb8
--- a/timemaster.8
c1aeb8
+++ b/timemaster.8
c1aeb8
@@ -87,6 +87,16 @@ Specify the first number in a sequence of SHM segments that will be used by
c1aeb8
 can be useful to avoid conflicts with time sources that are not started by
c1aeb8
 \fBtimemaster\fR, e.g. \fBgpsd\fR using segments number 0 and 1.
c1aeb8
 
c1aeb8
+.TP
c1aeb8
+.B restart_processes
c1aeb8
+Enable or disable restarting of processes started by \fBtimemaster\fR. If the
c1aeb8
+option is set to a non-zero value, all processes except \fBchronyd\fR and
c1aeb8
+\fBntpd\fR will be automatically restarted when terminated and \fBtimemaster\fR
c1aeb8
+has been running for at least one second (i.e. the process did not terminate due to a
c1aeb8
+configuration error). If a process was terminated and is not started again,
c1aeb8
+\fBtimemaster\fR will kill the other processes and exit with a non-zero status.
c1aeb8
+The default value is 1 (enabled).
c1aeb8
+
c1aeb8
 .SS [ntp_server address]
c1aeb8
 
c1aeb8
 The \fBntp_server\fR section specifies an NTP server that should be used as a
c1aeb8
@@ -318,6 +328,7 @@ ptp4l_option delay_mechanism P2P
c1aeb8
 ntp_program chronyd
c1aeb8
 rundir /var/run/timemaster
c1aeb8
 first_shm_segment 1
c1aeb8
+restart_processes 0
c1aeb8
 
c1aeb8
 [chronyd]
c1aeb8
 path /usr/sbin/chronyd
c1aeb8
diff --git a/timemaster.c b/timemaster.c
c1aeb8
index fc3ba31..4ba921e 100644
c1aeb8
--- a/timemaster.c
c1aeb8
+++ b/timemaster.c
c1aeb8
@@ -44,6 +44,7 @@
c1aeb8
 #define DEFAULT_RUNDIR "/var/run/timemaster"
c1aeb8
 
c1aeb8
 #define DEFAULT_FIRST_SHM_SEGMENT 0
c1aeb8
+#define DEFAULT_RESTART_PROCESSES 1
c1aeb8
 
c1aeb8
 #define DEFAULT_NTP_PROGRAM CHRONYD
c1aeb8
 #define DEFAULT_NTP_MINPOLL 6
c1aeb8
@@ -108,6 +109,7 @@ struct timemaster_config {
c1aeb8
 	enum ntp_program ntp_program;
c1aeb8
 	char *rundir;
c1aeb8
 	int first_shm_segment;
c1aeb8
+	int restart_processes;
c1aeb8
 	struct program_config chronyd;
c1aeb8
 	struct program_config ntpd;
c1aeb8
 	struct program_config phc2sys;
c1aeb8
@@ -122,6 +124,9 @@ struct config_file {
c1aeb8
 struct script {
c1aeb8
 	struct config_file **configs;
c1aeb8
 	char ***commands;
c1aeb8
+	int **command_groups;
c1aeb8
+	int restart_groups;
c1aeb8
+	int no_restart_group;
c1aeb8
 };
c1aeb8
 
c1aeb8
 static void free_parray(void **a)
c1aeb8
@@ -385,6 +390,8 @@ static int parse_timemaster_settings(char **settings,
c1aeb8
 			replace_string(value, &config->rundir);
c1aeb8
 		} else if (!strcasecmp(name, "first_shm_segment")) {
c1aeb8
 			r = parse_int(value, &config->first_shm_segment);
c1aeb8
+		} else if (!strcasecmp(name, "restart_processes")) {
c1aeb8
+			r = parse_int(value, &config->restart_processes);
c1aeb8
 		} else {
c1aeb8
 			pr_err("unknown timemaster setting %s", name);
c1aeb8
 			return 1;
c1aeb8
@@ -508,6 +515,7 @@ static struct timemaster_config *config_parse(char *path)
c1aeb8
 	config->ntp_program = DEFAULT_NTP_PROGRAM;
c1aeb8
 	config->rundir = xstrdup(DEFAULT_RUNDIR);
c1aeb8
 	config->first_shm_segment = DEFAULT_FIRST_SHM_SEGMENT;
c1aeb8
+	config->restart_processes = DEFAULT_RESTART_PROCESSES;
c1aeb8
 
c1aeb8
 	init_program_config(&config->chronyd, "chronyd",
c1aeb8
 			    NULL, DEFAULT_CHRONYD_SETTINGS, NULL);
c1aeb8
@@ -632,6 +640,18 @@ static char *get_refid(char *prefix, unsigned int number)
c1aeb8
 	return NULL;
c1aeb8
 };
c1aeb8
 
c1aeb8
+static void add_command(char **command, int command_group,
c1aeb8
+			struct script *script)
c1aeb8
+{
c1aeb8
+	int *group;
c1aeb8
+
c1aeb8
+	parray_append((void ***)&script->commands, command);
c1aeb8
+
c1aeb8
+	group = xmalloc(sizeof(int));
c1aeb8
+	*group = command_group;
c1aeb8
+	parray_append((void ***)&script->command_groups, group);
c1aeb8
+}
c1aeb8
+
c1aeb8
 static void add_shm_source(int shm_segment, int poll, int dpoll, double delay,
c1aeb8
 			   char *ntp_options, char *prefix,
c1aeb8
 			   struct timemaster_config *config, char **ntp_config)
c1aeb8
@@ -671,8 +691,8 @@ static int add_ntp_source(struct ntp_server *source, char **ntp_config)
c1aeb8
 
c1aeb8
 static int add_ptp_source(struct ptp_domain *source,
c1aeb8
 			  struct timemaster_config *config, int *shm_segment,
c1aeb8
-			  int ***allocated_phcs, char **ntp_config,
c1aeb8
-			  struct script *script)
c1aeb8
+			  int *command_group, int ***allocated_phcs,
c1aeb8
+			  char **ntp_config, struct script *script)
c1aeb8
 {
c1aeb8
 	struct config_file *config_file;
c1aeb8
 	char **command, *uds_path, **interfaces, *message_tag;
c1aeb8
@@ -798,19 +818,19 @@ static int add_ptp_source(struct ptp_domain *source,
c1aeb8
 			/* HW time stamping */
c1aeb8
 			command = get_ptp4l_command(&config->ptp4l, config_file,
c1aeb8
 						    interfaces, 1);
c1aeb8
-			parray_append((void ***)&script->commands, command);
c1aeb8
+			add_command(command, *command_group, script);
c1aeb8
 
c1aeb8
 			command = get_phc2sys_command(&config->phc2sys,
c1aeb8
 						      source->domain,
c1aeb8
 						      source->phc2sys_poll,
c1aeb8
 						      *shm_segment, uds_path,
c1aeb8
 						      message_tag);
c1aeb8
-			parray_append((void ***)&script->commands, command);
c1aeb8
+			add_command(command, (*command_group)++, script);
c1aeb8
 		} else {
c1aeb8
 			/* SW time stamping */
c1aeb8
 			command = get_ptp4l_command(&config->ptp4l, config_file,
c1aeb8
 						    interfaces, 0);
c1aeb8
-			parray_append((void ***)&script->commands, command);
c1aeb8
+			add_command(command, (*command_group)++, script);
c1aeb8
 
c1aeb8
 			string_appendf(&config_file->content,
c1aeb8
 				       "clock_servo ntpshm\n"
c1aeb8
@@ -862,7 +882,8 @@ static char **get_ntpd_command(struct program_config *config,
c1aeb8
 }
c1aeb8
 
c1aeb8
 static struct config_file *add_ntp_program(struct timemaster_config *config,
c1aeb8
-					   struct script *script)
c1aeb8
+					   struct script *script,
c1aeb8
+					   int command_group)
c1aeb8
 {
c1aeb8
 	struct config_file *ntp_config = xmalloc(sizeof(*ntp_config));
c1aeb8
 	char **command = NULL;
c1aeb8
@@ -886,7 +907,7 @@ static struct config_file *add_ntp_program(struct timemaster_config *config,
c1aeb8
 	}
c1aeb8
 
c1aeb8
 	parray_append((void ***)&script->configs, ntp_config);
c1aeb8
-	parray_append((void ***)&script->commands, command);
c1aeb8
+	add_command(command, command_group, script);
c1aeb8
 
c1aeb8
 	return ntp_config;
c1aeb8
 }
c1aeb8
@@ -894,6 +915,7 @@ static struct config_file *add_ntp_program(struct timemaster_config *config,
c1aeb8
 static void script_destroy(struct script *script)
c1aeb8
 {
c1aeb8
 	char ***commands, **command;
c1aeb8
+	int **groups;
c1aeb8
 	struct config_file *config, **configs;
c1aeb8
 
c1aeb8
 	for (configs = script->configs; *configs; configs++) {
c1aeb8
@@ -911,6 +933,10 @@ static void script_destroy(struct script *script)
c1aeb8
 	}
c1aeb8
 	free(script->commands);
c1aeb8
 
c1aeb8
+	for (groups = script->command_groups; *groups; groups++)
c1aeb8
+		free(*groups);
c1aeb8
+	free(script->command_groups);
c1aeb8
+
c1aeb8
 	free(script);
c1aeb8
 }
c1aeb8
 
c1aeb8
@@ -920,12 +946,15 @@ static struct script *script_create(struct timemaster_config *config)
c1aeb8
 	struct source *source, **sources;
c1aeb8
 	struct config_file *ntp_config = NULL;
c1aeb8
 	int **allocated_phcs = (int **)parray_new();
c1aeb8
-	int ret = 0, shm_segment;
c1aeb8
+	int ret = 0, shm_segment, command_group = 0;
c1aeb8
 
c1aeb8
 	script->configs = (struct config_file **)parray_new();
c1aeb8
 	script->commands = (char ***)parray_new();
c1aeb8
+	script->command_groups = (int **)parray_new();
c1aeb8
+	script->no_restart_group = command_group;
c1aeb8
+	script->restart_groups = config->restart_processes;
c1aeb8
 
c1aeb8
-	ntp_config = add_ntp_program(config, script);
c1aeb8
+	ntp_config = add_ntp_program(config, script, command_group++);
c1aeb8
 	shm_segment = config->first_shm_segment;
c1aeb8
 
c1aeb8
 	for (sources = config->sources; (source = *sources); sources++) {
c1aeb8
@@ -936,7 +965,7 @@ static struct script *script_create(struct timemaster_config *config)
c1aeb8
 			break;
c1aeb8
 		case PTP_DOMAIN:
c1aeb8
 			if (add_ptp_source(&source->ptp, config, &shm_segment,
c1aeb8
-					   &allocated_phcs,
c1aeb8
+					   &command_group, &allocated_phcs,
c1aeb8
 					   &ntp_config->content, script))
c1aeb8
 				ret = 1;
c1aeb8
 			break;
c1aeb8
@@ -1063,10 +1092,11 @@ static int remove_config_files(struct config_file **configs)
c1aeb8
 
c1aeb8
 static int script_run(struct script *script)
c1aeb8
 {
c1aeb8
+	struct timespec ts_start, ts_now;
c1aeb8
 	sigset_t mask, old_mask;
c1aeb8
 	siginfo_t info;
c1aeb8
 	pid_t pid, *pids;
c1aeb8
-	int i, num_commands, status, ret = 0;
c1aeb8
+	int i, group, num_commands, status, quit = 0, ret = 0;
c1aeb8
 
c1aeb8
 	for (num_commands = 0; script->commands[num_commands]; num_commands++)
c1aeb8
 		;
c1aeb8
@@ -1101,7 +1131,9 @@ static int script_run(struct script *script)
c1aeb8
 		}
c1aeb8
 	}
c1aeb8
 
c1aeb8
-	/* wait for one of the blocked signals */
c1aeb8
+	clock_gettime(CLOCK_MONOTONIC, &ts_start);
c1aeb8
+
c1aeb8
+	/* process the blocked signals */
c1aeb8
 	while (1) {
c1aeb8
 		if (sigwaitinfo(&mask, &info) < 0) {
c1aeb8
 			if (errno == EINTR)
c1aeb8
@@ -1110,36 +1142,111 @@ static int script_run(struct script *script)
c1aeb8
 			break;
c1aeb8
 		}
c1aeb8
 
c1aeb8
-		/*
c1aeb8
-		 * assume only the first process (i.e. chronyd or ntpd) is
c1aeb8
-		 * essential and continue if other processes terminate
c1aeb8
-		 */
c1aeb8
-		if (info.si_signo == SIGCHLD && info.si_pid != pids[0]) {
c1aeb8
-			pr_info("process %d terminated (ignored)", info.si_pid);
c1aeb8
+		clock_gettime(CLOCK_MONOTONIC, &ts_now);
c1aeb8
+
c1aeb8
+		if (info.si_signo != SIGCHLD) {
c1aeb8
+			if (quit)
c1aeb8
+				continue;
c1aeb8
+
c1aeb8
+			quit = 1;
c1aeb8
+			pr_debug("exiting on signal %d", info.si_signo);
c1aeb8
+
c1aeb8
+			/* terminate remaining processes */
c1aeb8
+			for (i = 0; i < num_commands; i++) {
c1aeb8
+				if (pids[i] > 0) {
c1aeb8
+					pr_debug("killing process %d", pids[i]);
c1aeb8
+					kill(pids[i], SIGTERM);
c1aeb8
+				}
c1aeb8
+			}
c1aeb8
+
c1aeb8
 			continue;
c1aeb8
 		}
c1aeb8
 
c1aeb8
-		pr_info("received signal %d", info.si_signo);
c1aeb8
-		break;
c1aeb8
-	}
c1aeb8
+		/* wait for all terminated processes */
c1aeb8
+		while (1) {
c1aeb8
+			pid = waitpid(-1, &status, WNOHANG);
c1aeb8
+			if (pid <= 0)
c1aeb8
+				break;
c1aeb8
 
c1aeb8
-	/* kill all started processes */
c1aeb8
-	for (i = 0; i < num_commands; i++) {
c1aeb8
-		if (pids[i] > 0) {
c1aeb8
-			pr_debug("killing process %d", pids[i]);
c1aeb8
-			kill(pids[i], SIGTERM);
c1aeb8
+			if (!WIFEXITED(status)) {
c1aeb8
+				pr_info("process %d terminated abnormally",
c1aeb8
+					pid);
c1aeb8
+			} else {
c1aeb8
+				pr_info("process %d terminated with status %d",
c1aeb8
+					pid, WEXITSTATUS(status));
c1aeb8
+			}
c1aeb8
+
c1aeb8
+			for (i = 0; i < num_commands; i++) {
c1aeb8
+				if (pids[i] == pid)
c1aeb8
+					pids[i] = 0;
c1aeb8
+			}
c1aeb8
 		}
c1aeb8
-	}
c1aeb8
 
c1aeb8
-	while ((pid = wait(&status)) >= 0) {
c1aeb8
-		if (!WIFEXITED(status)) {
c1aeb8
-			pr_info("process %d terminated abnormally", pid);
c1aeb8
-			ret = 1;
c1aeb8
-		} else {
c1aeb8
-			if (WEXITSTATUS(status))
c1aeb8
+		/* wait for all processes to terminate when exiting */
c1aeb8
+		if (quit) {
c1aeb8
+			for (i = 0; i < num_commands; i++) {
c1aeb8
+				if (pids[i])
c1aeb8
+					break;
c1aeb8
+			}
c1aeb8
+			if (i == num_commands)
c1aeb8
+				break;
c1aeb8
+
c1aeb8
+			pr_debug("waiting for other processes to terminate");
c1aeb8
+			continue;
c1aeb8
+		}
c1aeb8
+
c1aeb8
+		/*
c1aeb8
+		 * terminate (and then restart if allowed) all processes in
c1aeb8
+		 * groups that have a terminated process
c1aeb8
+		 */
c1aeb8
+		for (group = 0; group < num_commands; group++) {
c1aeb8
+			int terminated = 0, running = 0;
c1aeb8
+
c1aeb8
+			for (i = 0; i < num_commands; i++) {
c1aeb8
+				if (*(script->command_groups[i]) != group)
c1aeb8
+					continue;
c1aeb8
+				if (pids[i])
c1aeb8
+					running++;
c1aeb8
+				else
c1aeb8
+					terminated++;
c1aeb8
+			}
c1aeb8
+
c1aeb8
+			if (!terminated)
c1aeb8
+				continue;
c1aeb8
+
c1aeb8
+			/*
c1aeb8
+			 * exit with a non-zero status if the group should not
c1aeb8
+			 * be restarted (i.e. chronyd/ntpd), timemaster is
c1aeb8
+			 * running only for a short time (and it is likely a
c1aeb8
+			 * configuration error), or restarting is disabled
c1aeb8
+			 * completely
c1aeb8
+			 */
c1aeb8
+			if (group == script->no_restart_group ||
c1aeb8
+			    ts_now.tv_sec - ts_start.tv_sec <= 1 ||
c1aeb8
+			    !script->restart_groups) {
c1aeb8
+				kill(getpid(), SIGTERM);
c1aeb8
 				ret = 1;
c1aeb8
-			pr_info("process %d terminated with status %d", pid,
c1aeb8
-				WEXITSTATUS(status));
c1aeb8
+				break;
c1aeb8
+			}
c1aeb8
+
c1aeb8
+			for (i = 0; i < num_commands; i++) {
c1aeb8
+				if (*(script->command_groups[i]) != group)
c1aeb8
+					continue;
c1aeb8
+
c1aeb8
+				/* terminate all processes in the group first */
c1aeb8
+				if (running && pids[i]) {
c1aeb8
+					pr_debug("killing process %d", pids[i]);
c1aeb8
+					kill(pids[i], SIGTERM);
c1aeb8
+				} else if (!running && !pids[i]) {
c1aeb8
+					pids[i] = start_program(script->commands[i],
c1aeb8
+								&old_mask);
c1aeb8
+					if (!pids[i])
c1aeb8
+						kill(getpid(), SIGTERM);
c1aeb8
+
c1aeb8
+					/* limit restarting rate */
c1aeb8
+					sleep(1);
c1aeb8
+				}
c1aeb8
+			}
c1aeb8
 		}
c1aeb8
 	}
c1aeb8
 
c1aeb8
@@ -1154,6 +1261,7 @@ static int script_run(struct script *script)
c1aeb8
 static void script_print(struct script *script)
c1aeb8
 {
c1aeb8
 	char ***commands, **command;
c1aeb8
+	int **groups;
c1aeb8
 	struct config_file *config, **configs;
c1aeb8
 
c1aeb8
 	for (configs = script->configs; *configs; configs++) {
c1aeb8
@@ -1162,7 +1270,9 @@ static void script_print(struct script *script)
c1aeb8
 	}
c1aeb8
 
c1aeb8
 	fprintf(stderr, "commands:\n\n");
c1aeb8
-	for (commands = script->commands; *commands; commands++) {
c1aeb8
+	for (commands = script->commands, groups = script->command_groups;
c1aeb8
+	     *commands; commands++, groups++) {
c1aeb8
+		fprintf(stderr, "[%d] ", **groups);
c1aeb8
 		for (command = *commands; *command; command++)
c1aeb8
 			fprintf(stderr, "%s ", *command);
c1aeb8
 		fprintf(stderr, "\n");