05ad79
From 4a44a54b3caf77923f0e3f1d5bdf5eda6ef07f62 Mon Sep 17 00:00:00 2001
05ad79
From: Chris MacGregor <chrismacgregor@google.com>
05ad79
Date: Thu, 27 Feb 2014 10:40:59 -0800
05ad79
Subject: [PATCH] hwclock: fix possible hang and other
05ad79
 set_hardware_clock_exact() issues
05ad79
05ad79
In sys-utils/hwclock.c, set_hardware_clock_exact() has some problems when the
05ad79
process gets pre-empted (for more than 100ms) before reaching the time for
05ad79
which it waits:
05ad79
05ad79
1. The "continue" statement causes execution to skip the final tdiff
05ad79
assignment at the end of the do...while loop, leading to the while condition
05ad79
using the wrong value of tdiff, and thus always exiting the loop once
05ad79
newhwtime != sethwtime (e.g., after 1 second).  This masks bug # 2, below.
05ad79
05ad79
2. The previously-existing bug is that because it starts over waiting for the
05ad79
desired time whenever two successive calls to gettimeofday() return values >
05ad79
100ms apart, the loop will never terminate unless the process holds the CPU
05ad79
(without losing it for more than 100ms) for at least 500ms.  This can happen
05ad79
on a heavily loaded machine or on a virtual machine (or on a heavily loaded
05ad79
virtual machine).  This has been observed to occur, preventing a machine from
05ad79
completing the shutdown or reboot process due to a "hwclock --systohc" call in
05ad79
a shutdown script.
05ad79
05ad79
The new implementation presented in this patch takes a somewhat different
05ad79
approach, intended to accomplish the same goals:
05ad79
05ad79
It computes the desired target system time (at which the requested hardware
05ad79
clock time will be applied to the hardware clock), and waits for that time to
05ad79
arrive.  If it misses the time (such as due to being pre-empted for too long),
05ad79
it recalculates the target time, and increases the tolerance (how late it can
05ad79
be relative to the target time, and still be "close enough".  Thus, if all is
05ad79
well, the time will be set *very* precisely.  On a machine where the hwclock
05ad79
process is repeatedly pre-empted, it will set the time as precisely as is
05ad79
possible under the conditions present on that particular machine.  In any
05ad79
case, it will always terminate eventually (and pretty quickly); it will never
05ad79
hang forever.
05ad79
05ad79
[kzak@redhat.com: - tiny coding style changes]
05ad79
05ad79
Signed-off-by: Chris MacGregor <chrismacgregor@google.com>
05ad79
Signed-off-by: Karel Zak <kzak@redhat.com>
05ad79
---
05ad79
 sys-utils/hwclock.c | 170 ++++++++++++++++++++++++++++++++++++++++------------
05ad79
 1 file changed, 131 insertions(+), 39 deletions(-)
05ad79
05ad79
diff --git a/sys-utils/hwclock.c b/sys-utils/hwclock.c
05ad79
index 30660d4..395b5c3 100644
05ad79
--- a/sys-utils/hwclock.c
05ad79
+++ b/sys-utils/hwclock.c
05ad79
@@ -125,7 +125,7 @@ struct adjtime {
05ad79
  * We are running in debug mode, wherein we put a lot of information about
05ad79
  * what we're doing to standard output.
05ad79
  */
05ad79
-bool debug;
05ad79
+int debug;
05ad79
 
05ad79
 /* Workaround for Award 4.50g BIOS bug: keep the year in a file. */
05ad79
 bool badyear;
05ad79
@@ -526,43 +526,141 @@ set_hardware_clock_exact(const time_t sethwtime,
05ad79
 			 const struct timeval refsystime,
05ad79
 			 const bool universal, const bool testing)
05ad79
 {
05ad79
-	time_t newhwtime = sethwtime;
05ad79
-	struct timeval beginsystime, nowsystime;
05ad79
-	double tdiff;
05ad79
-	int time_resync = 1;
05ad79
-
05ad79
 	/*
05ad79
-	 * Now delay some more until Hardware Clock time newhwtime arrives.
05ad79
-	 * The 0.5 s is because the Hardware Clock always sets to your set
05ad79
-	 * time plus 500 ms (because it is designed to update to the next
05ad79
-	 * second precisely 500 ms after you finish the setting).
05ad79
+	 * The Hardware Clock can only be set to any integer time plus one
05ad79
+	 * half second.	 The integer time is required because there is no
05ad79
+	 * interface to set or get a fractional second.	 The additional half
05ad79
+	 * second is because the Hardware Clock updates to the following
05ad79
+	 * second precisely 500 ms (not 1 second!) after you release the
05ad79
+	 * divider reset (after setting the new time) - see description of
05ad79
+	 * DV2, DV1, DV0 in Register A in the MC146818A data sheet (and note
05ad79
+	 * that although that document doesn't say so, real-world code seems
05ad79
+	 * to expect that the SET bit in Register B functions the same way).
05ad79
+	 * That means that, e.g., when you set the clock to 1:02:03, it
05ad79
+	 * effectively really sets it to 1:02:03.5, because it will update to
05ad79
+	 * 1:02:04 only half a second later.  Our caller passes the desired
05ad79
+	 * integer Hardware Clock time in sethwtime, and the corresponding
05ad79
+	 * system time (which may have a fractional part, and which may or may
05ad79
+	 * not be the same!) in refsystime.  In an ideal situation, we would
05ad79
+	 * then apply sethwtime to the Hardware Clock at refsystime+500ms, so
05ad79
+	 * that when the Hardware Clock ticks forward to sethwtime+1s half a
05ad79
+	 * second later at refsystime+1000ms, everything is in sync.  So we
05ad79
+	 * spin, waiting for gettimeofday() to return a time at or after that
05ad79
+	 * time (refsystime+500ms) up to a tolerance value, initially 1ms.  If
05ad79
+	 * we miss that time due to being preempted for some other process,
05ad79
+	 * then we increase the margin a little bit (initially 1ms, doubling
05ad79
+	 * each time), add 1 second (or more, if needed to get a time that is
05ad79
+	 * in the future) to both the time for which we are waiting and the
05ad79
+	 * time that we will apply to the Hardware Clock, and start waiting
05ad79
+	 * again.
05ad79
+	 * 
05ad79
+	 * For example, the caller requests that we set the Hardware Clock to
05ad79
+	 * 1:02:03, with reference time (current system time) = 6:07:08.250.
05ad79
+	 * We want the Hardware Clock to update to 1:02:04 at 6:07:09.250 on
05ad79
+	 * the system clock, and the first such update will occur 0.500
05ad79
+	 * seconds after we write to the Hardware Clock, so we spin until the
05ad79
+	 * system clock reads 6:07:08.750.  If we get there, great, but let's
05ad79
+	 * imagine the system is so heavily loaded that our process is
05ad79
+	 * preempted and by the time we get to run again, the system clock
05ad79
+	 * reads 6:07:11.990.  We now want to wait until the next xx:xx:xx.750
05ad79
+	 * time, which is 6:07:12.750 (4.5 seconds after the reference time),
05ad79
+	 * at which point we will set the Hardware Clock to 1:02:07 (4 seconds
05ad79
+	 * after the originally requested time).  If we do that successfully,
05ad79
+	 * then at 6:07:13.250 (5 seconds after the reference time), the
05ad79
+	 * Hardware Clock will update to 1:02:08 (5 seconds after the
05ad79
+	 * originally requested time), and all is well thereafter.
05ad79
 	 */
05ad79
-	do {
05ad79
-		if (time_resync) {
05ad79
-			gettimeofday(&beginsystime, NULL);
05ad79
-			tdiff = time_diff(beginsystime, refsystime);
05ad79
-			newhwtime = sethwtime + (int)(tdiff + 0.5);
05ad79
-			if (debug)
05ad79
-				printf(_
05ad79
-				       ("Time elapsed since reference time has been %.6f seconds.\n"
05ad79
-					"Delaying further to reach the new time.\n"),
05ad79
-				       tdiff);
05ad79
-			time_resync = 0;
05ad79
+
05ad79
+	time_t newhwtime = sethwtime;
05ad79
+	double target_time_tolerance_secs = 0.001;  /* initial value */
05ad79
+	double tolerance_incr_secs = 0.001;	    /* initial value */
05ad79
+	const double RTC_SET_DELAY_SECS = 0.5;	    /* 500 ms */
05ad79
+	const struct timeval RTC_SET_DELAY_TV = { 0, RTC_SET_DELAY_SECS * 1E6 };
05ad79
+
05ad79
+	struct timeval targetsystime;
05ad79
+	struct timeval nowsystime;
05ad79
+	struct timeval prevsystime = refsystime;
05ad79
+	double deltavstarget;
05ad79
+
05ad79
+	timeradd(&refsystime, &RTC_SET_DELAY_TV, &targetsystime);
05ad79
+
05ad79
+	while (1) {
05ad79
+		double ticksize;
05ad79
+
05ad79
+		/* FOR TESTING ONLY: inject random delays of up to 1000ms */
05ad79
+		if (debug >= 10) {
05ad79
+			int usec = random() % 1000000;
05ad79
+			printf(_("sleeping ~%d usec\n"), usec);
05ad79
+			usleep(usec);
05ad79
 		}
05ad79
 
05ad79
 		gettimeofday(&nowsystime, NULL);
05ad79
-		tdiff = time_diff(nowsystime, beginsystime);
05ad79
-		if (tdiff < 0) {
05ad79
-			time_resync = 1;	/* probably backward time reset */
05ad79
-			continue;
05ad79
-		}
05ad79
-		if (tdiff > 0.1) {
05ad79
-			time_resync = 1;	/* probably forward time reset */
05ad79
-			continue;
05ad79
+		deltavstarget = time_diff(nowsystime, targetsystime);
05ad79
+		ticksize = time_diff(nowsystime, prevsystime);
05ad79
+		prevsystime = nowsystime;
05ad79
+
05ad79
+		if (ticksize < 0) {
05ad79
+			if (debug)
05ad79
+				printf(_("time jumped backward %.6f seconds "
05ad79
+					 "to %ld.%06d - retargeting\n"),
05ad79
+				       ticksize, (long)nowsystime.tv_sec,
05ad79
+				       (int)nowsystime.tv_usec);
05ad79
+			/* The retarget is handled at the end of the loop. */
05ad79
+		} else if (deltavstarget < 0) {
05ad79
+			/* deltavstarget < 0 if current time < target time */
05ad79
+			if (debug >= 2)
05ad79
+				printf(_("%ld.%06d < %ld.%06d (%.6f)\n"),
05ad79
+				       (long)nowsystime.tv_sec,
05ad79
+				       (int)nowsystime.tv_usec,
05ad79
+				       (long)targetsystime.tv_sec,
05ad79
+				       (int)targetsystime.tv_usec,
05ad79
+				       deltavstarget);
05ad79
+			continue;  /* not there yet - keep spinning */
05ad79
+		} else if (deltavstarget <= target_time_tolerance_secs) {
05ad79
+			/* Close enough to the target time; done waiting. */
05ad79
+			break;
05ad79
+		} else /* (deltavstarget > target_time_tolerance_secs) */ {
05ad79
+			/*
05ad79
+			 * We missed our window.  Increase the tolerance and
05ad79
+			 * aim for the next opportunity.
05ad79
+			 */
05ad79
+			if (debug)
05ad79
+				printf(_("missed it - %ld.%06d is too far "
05ad79
+					 "past %ld.%06d (%.6f > %.6f)\n"),
05ad79
+				       (long)nowsystime.tv_sec,
05ad79
+				       (int)nowsystime.tv_usec,
05ad79
+				       (long)targetsystime.tv_sec,
05ad79
+				       (int)targetsystime.tv_usec,
05ad79
+				       deltavstarget,
05ad79
+				       target_time_tolerance_secs);
05ad79
+			target_time_tolerance_secs += tolerance_incr_secs;
05ad79
+			tolerance_incr_secs *= 2;
05ad79
 		}
05ad79
-		beginsystime = nowsystime;
05ad79
-		tdiff = time_diff(nowsystime, refsystime);
05ad79
-	} while (newhwtime == sethwtime + (int)(tdiff + 0.5));
05ad79
+
05ad79
+		/*
05ad79
+		 * Aim for the same offset (tv_usec) within the second in
05ad79
+		 * either the current second (if that offset hasn't arrived
05ad79
+		 * yet), or the next second.
05ad79
+		 */
05ad79
+		if (nowsystime.tv_usec < targetsystime.tv_usec)
05ad79
+			targetsystime.tv_sec = nowsystime.tv_sec;
05ad79
+		else
05ad79
+			targetsystime.tv_sec = nowsystime.tv_sec + 1;
05ad79
+	}
05ad79
+
05ad79
+	newhwtime = sethwtime
05ad79
+		    + (int)(time_diff(nowsystime, refsystime)
05ad79
+			    - RTC_SET_DELAY_SECS /* don't count this */
05ad79
+			    + 0.5 /* for rounding */);
05ad79
+	if (debug)
05ad79
+		printf(_("%ld.%06d is close enough to %ld.%06d (%.6f < %.6f)\n"
05ad79
+			 "Set RTC to %ld (%ld + %d; refsystime = %ld.%06d)\n"),
05ad79
+		       (long)nowsystime.tv_sec, (int)nowsystime.tv_usec,
05ad79
+		       (long)targetsystime.tv_sec, (int)targetsystime.tv_usec,
05ad79
+		       deltavstarget, target_time_tolerance_secs,
05ad79
+		       (long)newhwtime, (long)sethwtime,
05ad79
+		       (int)(newhwtime - sethwtime),
05ad79
+		       (long)refsystime.tv_sec, (int)refsystime.tv_usec);
05ad79
 
05ad79
 	set_hardware_clock(newhwtime, universal, testing);
05ad79
 }
05ad79
@@ -1636,7 +1734,7 @@ int main(int argc, char **argv)
05ad79
 
05ad79
 		switch (c) {
05ad79
 		case 'D':
05ad79
-			debug = TRUE;
05ad79
+			++debug;
05ad79
 			break;
05ad79
 		case 'a':
05ad79
 			adjust = TRUE;
05ad79
@@ -1953,10 +2051,4 @@ void __attribute__((__noreturn__)) hwaudit_exit(int status)
05ad79
  *
05ad79
  * hwclock uses this method, and considers the Hardware Clock to have
05ad79
  * infinite precision.
05ad79
- *
05ad79
- * TODO: Enhancements needed:
05ad79
- *
05ad79
- *  - When waiting for whole second boundary in set_hardware_clock_exact,
05ad79
- *    fail if we miss the goal by more than .1 second, as could happen if we
05ad79
- *    get pre-empted (by the kernel dispatcher).
05ad79
  */
05ad79
-- 
05ad79
1.9.3
05ad79