6717ab
From 4a44a54b3caf77923f0e3f1d5bdf5eda6ef07f62 Mon Sep 17 00:00:00 2001
6717ab
From: Chris MacGregor <chrismacgregor@google.com>
6717ab
Date: Thu, 27 Feb 2014 10:40:59 -0800
6717ab
Subject: [PATCH] hwclock: fix possible hang and other
6717ab
 set_hardware_clock_exact() issues
6717ab
6717ab
In sys-utils/hwclock.c, set_hardware_clock_exact() has some problems when the
6717ab
process gets pre-empted (for more than 100ms) before reaching the time for
6717ab
which it waits:
6717ab
6717ab
1. The "continue" statement causes execution to skip the final tdiff
6717ab
assignment at the end of the do...while loop, leading to the while condition
6717ab
using the wrong value of tdiff, and thus always exiting the loop once
6717ab
newhwtime != sethwtime (e.g., after 1 second).  This masks bug # 2, below.
6717ab
6717ab
2. The previously-existing bug is that because it starts over waiting for the
6717ab
desired time whenever two successive calls to gettimeofday() return values >
6717ab
100ms apart, the loop will never terminate unless the process holds the CPU
6717ab
(without losing it for more than 100ms) for at least 500ms.  This can happen
6717ab
on a heavily loaded machine or on a virtual machine (or on a heavily loaded
6717ab
virtual machine).  This has been observed to occur, preventing a machine from
6717ab
completing the shutdown or reboot process due to a "hwclock --systohc" call in
6717ab
a shutdown script.
6717ab
6717ab
The new implementation presented in this patch takes a somewhat different
6717ab
approach, intended to accomplish the same goals:
6717ab
6717ab
It computes the desired target system time (at which the requested hardware
6717ab
clock time will be applied to the hardware clock), and waits for that time to
6717ab
arrive.  If it misses the time (such as due to being pre-empted for too long),
6717ab
it recalculates the target time, and increases the tolerance (how late it can
6717ab
be relative to the target time, and still be "close enough".  Thus, if all is
6717ab
well, the time will be set *very* precisely.  On a machine where the hwclock
6717ab
process is repeatedly pre-empted, it will set the time as precisely as is
6717ab
possible under the conditions present on that particular machine.  In any
6717ab
case, it will always terminate eventually (and pretty quickly); it will never
6717ab
hang forever.
6717ab
6717ab
[kzak@redhat.com: - tiny coding style changes]
6717ab
6717ab
Signed-off-by: Chris MacGregor <chrismacgregor@google.com>
6717ab
Signed-off-by: Karel Zak <kzak@redhat.com>
6717ab
---
6717ab
 sys-utils/hwclock.c | 170 ++++++++++++++++++++++++++++++++++++++++------------
6717ab
 1 file changed, 131 insertions(+), 39 deletions(-)
6717ab
6717ab
diff --git a/sys-utils/hwclock.c b/sys-utils/hwclock.c
6717ab
index 30660d4..395b5c3 100644
6717ab
--- a/sys-utils/hwclock.c
6717ab
+++ b/sys-utils/hwclock.c
6717ab
@@ -125,7 +125,7 @@ struct adjtime {
6717ab
  * We are running in debug mode, wherein we put a lot of information about
6717ab
  * what we're doing to standard output.
6717ab
  */
6717ab
-bool debug;
6717ab
+int debug;
6717ab
 
6717ab
 /* Workaround for Award 4.50g BIOS bug: keep the year in a file. */
6717ab
 bool badyear;
6717ab
@@ -526,43 +526,141 @@ set_hardware_clock_exact(const time_t sethwtime,
6717ab
 			 const struct timeval refsystime,
6717ab
 			 const bool universal, const bool testing)
6717ab
 {
6717ab
-	time_t newhwtime = sethwtime;
6717ab
-	struct timeval beginsystime, nowsystime;
6717ab
-	double tdiff;
6717ab
-	int time_resync = 1;
6717ab
-
6717ab
 	/*
6717ab
-	 * Now delay some more until Hardware Clock time newhwtime arrives.
6717ab
-	 * The 0.5 s is because the Hardware Clock always sets to your set
6717ab
-	 * time plus 500 ms (because it is designed to update to the next
6717ab
-	 * second precisely 500 ms after you finish the setting).
6717ab
+	 * The Hardware Clock can only be set to any integer time plus one
6717ab
+	 * half second.	 The integer time is required because there is no
6717ab
+	 * interface to set or get a fractional second.	 The additional half
6717ab
+	 * second is because the Hardware Clock updates to the following
6717ab
+	 * second precisely 500 ms (not 1 second!) after you release the
6717ab
+	 * divider reset (after setting the new time) - see description of
6717ab
+	 * DV2, DV1, DV0 in Register A in the MC146818A data sheet (and note
6717ab
+	 * that although that document doesn't say so, real-world code seems
6717ab
+	 * to expect that the SET bit in Register B functions the same way).
6717ab
+	 * That means that, e.g., when you set the clock to 1:02:03, it
6717ab
+	 * effectively really sets it to 1:02:03.5, because it will update to
6717ab
+	 * 1:02:04 only half a second later.  Our caller passes the desired
6717ab
+	 * integer Hardware Clock time in sethwtime, and the corresponding
6717ab
+	 * system time (which may have a fractional part, and which may or may
6717ab
+	 * not be the same!) in refsystime.  In an ideal situation, we would
6717ab
+	 * then apply sethwtime to the Hardware Clock at refsystime+500ms, so
6717ab
+	 * that when the Hardware Clock ticks forward to sethwtime+1s half a
6717ab
+	 * second later at refsystime+1000ms, everything is in sync.  So we
6717ab
+	 * spin, waiting for gettimeofday() to return a time at or after that
6717ab
+	 * time (refsystime+500ms) up to a tolerance value, initially 1ms.  If
6717ab
+	 * we miss that time due to being preempted for some other process,
6717ab
+	 * then we increase the margin a little bit (initially 1ms, doubling
6717ab
+	 * each time), add 1 second (or more, if needed to get a time that is
6717ab
+	 * in the future) to both the time for which we are waiting and the
6717ab
+	 * time that we will apply to the Hardware Clock, and start waiting
6717ab
+	 * again.
6717ab
+	 * 
6717ab
+	 * For example, the caller requests that we set the Hardware Clock to
6717ab
+	 * 1:02:03, with reference time (current system time) = 6:07:08.250.
6717ab
+	 * We want the Hardware Clock to update to 1:02:04 at 6:07:09.250 on
6717ab
+	 * the system clock, and the first such update will occur 0.500
6717ab
+	 * seconds after we write to the Hardware Clock, so we spin until the
6717ab
+	 * system clock reads 6:07:08.750.  If we get there, great, but let's
6717ab
+	 * imagine the system is so heavily loaded that our process is
6717ab
+	 * preempted and by the time we get to run again, the system clock
6717ab
+	 * reads 6:07:11.990.  We now want to wait until the next xx:xx:xx.750
6717ab
+	 * time, which is 6:07:12.750 (4.5 seconds after the reference time),
6717ab
+	 * at which point we will set the Hardware Clock to 1:02:07 (4 seconds
6717ab
+	 * after the originally requested time).  If we do that successfully,
6717ab
+	 * then at 6:07:13.250 (5 seconds after the reference time), the
6717ab
+	 * Hardware Clock will update to 1:02:08 (5 seconds after the
6717ab
+	 * originally requested time), and all is well thereafter.
6717ab
 	 */
6717ab
-	do {
6717ab
-		if (time_resync) {
6717ab
-			gettimeofday(&beginsystime, NULL);
6717ab
-			tdiff = time_diff(beginsystime, refsystime);
6717ab
-			newhwtime = sethwtime + (int)(tdiff + 0.5);
6717ab
-			if (debug)
6717ab
-				printf(_
6717ab
-				       ("Time elapsed since reference time has been %.6f seconds.\n"
6717ab
-					"Delaying further to reach the new time.\n"),
6717ab
-				       tdiff);
6717ab
-			time_resync = 0;
6717ab
+
6717ab
+	time_t newhwtime = sethwtime;
6717ab
+	double target_time_tolerance_secs = 0.001;  /* initial value */
6717ab
+	double tolerance_incr_secs = 0.001;	    /* initial value */
6717ab
+	const double RTC_SET_DELAY_SECS = 0.5;	    /* 500 ms */
6717ab
+	const struct timeval RTC_SET_DELAY_TV = { 0, RTC_SET_DELAY_SECS * 1E6 };
6717ab
+
6717ab
+	struct timeval targetsystime;
6717ab
+	struct timeval nowsystime;
6717ab
+	struct timeval prevsystime = refsystime;
6717ab
+	double deltavstarget;
6717ab
+
6717ab
+	timeradd(&refsystime, &RTC_SET_DELAY_TV, &targetsystime);
6717ab
+
6717ab
+	while (1) {
6717ab
+		double ticksize;
6717ab
+
6717ab
+		/* FOR TESTING ONLY: inject random delays of up to 1000ms */
6717ab
+		if (debug >= 10) {
6717ab
+			int usec = random() % 1000000;
6717ab
+			printf(_("sleeping ~%d usec\n"), usec);
6717ab
+			usleep(usec);
6717ab
 		}
6717ab
 
6717ab
 		gettimeofday(&nowsystime, NULL);
6717ab
-		tdiff = time_diff(nowsystime, beginsystime);
6717ab
-		if (tdiff < 0) {
6717ab
-			time_resync = 1;	/* probably backward time reset */
6717ab
-			continue;
6717ab
-		}
6717ab
-		if (tdiff > 0.1) {
6717ab
-			time_resync = 1;	/* probably forward time reset */
6717ab
-			continue;
6717ab
+		deltavstarget = time_diff(nowsystime, targetsystime);
6717ab
+		ticksize = time_diff(nowsystime, prevsystime);
6717ab
+		prevsystime = nowsystime;
6717ab
+
6717ab
+		if (ticksize < 0) {
6717ab
+			if (debug)
6717ab
+				printf(_("time jumped backward %.6f seconds "
6717ab
+					 "to %ld.%06d - retargeting\n"),
6717ab
+				       ticksize, (long)nowsystime.tv_sec,
6717ab
+				       (int)nowsystime.tv_usec);
6717ab
+			/* The retarget is handled at the end of the loop. */
6717ab
+		} else if (deltavstarget < 0) {
6717ab
+			/* deltavstarget < 0 if current time < target time */
6717ab
+			if (debug >= 2)
6717ab
+				printf(_("%ld.%06d < %ld.%06d (%.6f)\n"),
6717ab
+				       (long)nowsystime.tv_sec,
6717ab
+				       (int)nowsystime.tv_usec,
6717ab
+				       (long)targetsystime.tv_sec,
6717ab
+				       (int)targetsystime.tv_usec,
6717ab
+				       deltavstarget);
6717ab
+			continue;  /* not there yet - keep spinning */
6717ab
+		} else if (deltavstarget <= target_time_tolerance_secs) {
6717ab
+			/* Close enough to the target time; done waiting. */
6717ab
+			break;
6717ab
+		} else /* (deltavstarget > target_time_tolerance_secs) */ {
6717ab
+			/*
6717ab
+			 * We missed our window.  Increase the tolerance and
6717ab
+			 * aim for the next opportunity.
6717ab
+			 */
6717ab
+			if (debug)
6717ab
+				printf(_("missed it - %ld.%06d is too far "
6717ab
+					 "past %ld.%06d (%.6f > %.6f)\n"),
6717ab
+				       (long)nowsystime.tv_sec,
6717ab
+				       (int)nowsystime.tv_usec,
6717ab
+				       (long)targetsystime.tv_sec,
6717ab
+				       (int)targetsystime.tv_usec,
6717ab
+				       deltavstarget,
6717ab
+				       target_time_tolerance_secs);
6717ab
+			target_time_tolerance_secs += tolerance_incr_secs;
6717ab
+			tolerance_incr_secs *= 2;
6717ab
 		}
6717ab
-		beginsystime = nowsystime;
6717ab
-		tdiff = time_diff(nowsystime, refsystime);
6717ab
-	} while (newhwtime == sethwtime + (int)(tdiff + 0.5));
6717ab
+
6717ab
+		/*
6717ab
+		 * Aim for the same offset (tv_usec) within the second in
6717ab
+		 * either the current second (if that offset hasn't arrived
6717ab
+		 * yet), or the next second.
6717ab
+		 */
6717ab
+		if (nowsystime.tv_usec < targetsystime.tv_usec)
6717ab
+			targetsystime.tv_sec = nowsystime.tv_sec;
6717ab
+		else
6717ab
+			targetsystime.tv_sec = nowsystime.tv_sec + 1;
6717ab
+	}
6717ab
+
6717ab
+	newhwtime = sethwtime
6717ab
+		    + (int)(time_diff(nowsystime, refsystime)
6717ab
+			    - RTC_SET_DELAY_SECS /* don't count this */
6717ab
+			    + 0.5 /* for rounding */);
6717ab
+	if (debug)
6717ab
+		printf(_("%ld.%06d is close enough to %ld.%06d (%.6f < %.6f)\n"
6717ab
+			 "Set RTC to %ld (%ld + %d; refsystime = %ld.%06d)\n"),
6717ab
+		       (long)nowsystime.tv_sec, (int)nowsystime.tv_usec,
6717ab
+		       (long)targetsystime.tv_sec, (int)targetsystime.tv_usec,
6717ab
+		       deltavstarget, target_time_tolerance_secs,
6717ab
+		       (long)newhwtime, (long)sethwtime,
6717ab
+		       (int)(newhwtime - sethwtime),
6717ab
+		       (long)refsystime.tv_sec, (int)refsystime.tv_usec);
6717ab
 
6717ab
 	set_hardware_clock(newhwtime, universal, testing);
6717ab
 }
6717ab
@@ -1636,7 +1734,7 @@ int main(int argc, char **argv)
6717ab
 
6717ab
 		switch (c) {
6717ab
 		case 'D':
6717ab
-			debug = TRUE;
6717ab
+			++debug;
6717ab
 			break;
6717ab
 		case 'a':
6717ab
 			adjust = TRUE;
6717ab
@@ -1953,10 +2051,4 @@ void __attribute__((__noreturn__)) hwaudit_exit(int status)
6717ab
  *
6717ab
  * hwclock uses this method, and considers the Hardware Clock to have
6717ab
  * infinite precision.
6717ab
- *
6717ab
- * TODO: Enhancements needed:
6717ab
- *
6717ab
- *  - When waiting for whole second boundary in set_hardware_clock_exact,
6717ab
- *    fail if we miss the goal by more than .1 second, as could happen if we
6717ab
- *    get pre-empted (by the kernel dispatcher).
6717ab
  */
6717ab
-- 
6717ab
1.9.3
6717ab