3df8c3
From 4a44a54b3caf77923f0e3f1d5bdf5eda6ef07f62 Mon Sep 17 00:00:00 2001
3df8c3
From: Chris MacGregor <chrismacgregor@google.com>
3df8c3
Date: Thu, 27 Feb 2014 10:40:59 -0800
3df8c3
Subject: [PATCH] hwclock: fix possible hang and other
3df8c3
 set_hardware_clock_exact() issues
3df8c3
3df8c3
In sys-utils/hwclock.c, set_hardware_clock_exact() has some problems when the
3df8c3
process gets pre-empted (for more than 100ms) before reaching the time for
3df8c3
which it waits:
3df8c3
3df8c3
1. The "continue" statement causes execution to skip the final tdiff
3df8c3
assignment at the end of the do...while loop, leading to the while condition
3df8c3
using the wrong value of tdiff, and thus always exiting the loop once
3df8c3
newhwtime != sethwtime (e.g., after 1 second).  This masks bug # 2, below.
3df8c3
3df8c3
2. The previously-existing bug is that because it starts over waiting for the
3df8c3
desired time whenever two successive calls to gettimeofday() return values >
3df8c3
100ms apart, the loop will never terminate unless the process holds the CPU
3df8c3
(without losing it for more than 100ms) for at least 500ms.  This can happen
3df8c3
on a heavily loaded machine or on a virtual machine (or on a heavily loaded
3df8c3
virtual machine).  This has been observed to occur, preventing a machine from
3df8c3
completing the shutdown or reboot process due to a "hwclock --systohc" call in
3df8c3
a shutdown script.
3df8c3
3df8c3
The new implementation presented in this patch takes a somewhat different
3df8c3
approach, intended to accomplish the same goals:
3df8c3
3df8c3
It computes the desired target system time (at which the requested hardware
3df8c3
clock time will be applied to the hardware clock), and waits for that time to
3df8c3
arrive.  If it misses the time (such as due to being pre-empted for too long),
3df8c3
it recalculates the target time, and increases the tolerance (how late it can
3df8c3
be relative to the target time, and still be "close enough".  Thus, if all is
3df8c3
well, the time will be set *very* precisely.  On a machine where the hwclock
3df8c3
process is repeatedly pre-empted, it will set the time as precisely as is
3df8c3
possible under the conditions present on that particular machine.  In any
3df8c3
case, it will always terminate eventually (and pretty quickly); it will never
3df8c3
hang forever.
3df8c3
3df8c3
[kzak@redhat.com: - tiny coding style changes]
3df8c3
3df8c3
Signed-off-by: Chris MacGregor <chrismacgregor@google.com>
3df8c3
Signed-off-by: Karel Zak <kzak@redhat.com>
3df8c3
---
3df8c3
 sys-utils/hwclock.c | 170 ++++++++++++++++++++++++++++++++++++++++------------
3df8c3
 1 file changed, 131 insertions(+), 39 deletions(-)
3df8c3
3df8c3
diff --git a/sys-utils/hwclock.c b/sys-utils/hwclock.c
3df8c3
index 30660d4..395b5c3 100644
3df8c3
--- a/sys-utils/hwclock.c
3df8c3
+++ b/sys-utils/hwclock.c
3df8c3
@@ -125,7 +125,7 @@ struct adjtime {
3df8c3
  * We are running in debug mode, wherein we put a lot of information about
3df8c3
  * what we're doing to standard output.
3df8c3
  */
3df8c3
-bool debug;
3df8c3
+int debug;
3df8c3
 
3df8c3
 /* Workaround for Award 4.50g BIOS bug: keep the year in a file. */
3df8c3
 bool badyear;
3df8c3
@@ -526,43 +526,141 @@ set_hardware_clock_exact(const time_t sethwtime,
3df8c3
 			 const struct timeval refsystime,
3df8c3
 			 const bool universal, const bool testing)
3df8c3
 {
3df8c3
-	time_t newhwtime = sethwtime;
3df8c3
-	struct timeval beginsystime, nowsystime;
3df8c3
-	double tdiff;
3df8c3
-	int time_resync = 1;
3df8c3
-
3df8c3
 	/*
3df8c3
-	 * Now delay some more until Hardware Clock time newhwtime arrives.
3df8c3
-	 * The 0.5 s is because the Hardware Clock always sets to your set
3df8c3
-	 * time plus 500 ms (because it is designed to update to the next
3df8c3
-	 * second precisely 500 ms after you finish the setting).
3df8c3
+	 * The Hardware Clock can only be set to any integer time plus one
3df8c3
+	 * half second.	 The integer time is required because there is no
3df8c3
+	 * interface to set or get a fractional second.	 The additional half
3df8c3
+	 * second is because the Hardware Clock updates to the following
3df8c3
+	 * second precisely 500 ms (not 1 second!) after you release the
3df8c3
+	 * divider reset (after setting the new time) - see description of
3df8c3
+	 * DV2, DV1, DV0 in Register A in the MC146818A data sheet (and note
3df8c3
+	 * that although that document doesn't say so, real-world code seems
3df8c3
+	 * to expect that the SET bit in Register B functions the same way).
3df8c3
+	 * That means that, e.g., when you set the clock to 1:02:03, it
3df8c3
+	 * effectively really sets it to 1:02:03.5, because it will update to
3df8c3
+	 * 1:02:04 only half a second later.  Our caller passes the desired
3df8c3
+	 * integer Hardware Clock time in sethwtime, and the corresponding
3df8c3
+	 * system time (which may have a fractional part, and which may or may
3df8c3
+	 * not be the same!) in refsystime.  In an ideal situation, we would
3df8c3
+	 * then apply sethwtime to the Hardware Clock at refsystime+500ms, so
3df8c3
+	 * that when the Hardware Clock ticks forward to sethwtime+1s half a
3df8c3
+	 * second later at refsystime+1000ms, everything is in sync.  So we
3df8c3
+	 * spin, waiting for gettimeofday() to return a time at or after that
3df8c3
+	 * time (refsystime+500ms) up to a tolerance value, initially 1ms.  If
3df8c3
+	 * we miss that time due to being preempted for some other process,
3df8c3
+	 * then we increase the margin a little bit (initially 1ms, doubling
3df8c3
+	 * each time), add 1 second (or more, if needed to get a time that is
3df8c3
+	 * in the future) to both the time for which we are waiting and the
3df8c3
+	 * time that we will apply to the Hardware Clock, and start waiting
3df8c3
+	 * again.
3df8c3
+	 * 
3df8c3
+	 * For example, the caller requests that we set the Hardware Clock to
3df8c3
+	 * 1:02:03, with reference time (current system time) = 6:07:08.250.
3df8c3
+	 * We want the Hardware Clock to update to 1:02:04 at 6:07:09.250 on
3df8c3
+	 * the system clock, and the first such update will occur 0.500
3df8c3
+	 * seconds after we write to the Hardware Clock, so we spin until the
3df8c3
+	 * system clock reads 6:07:08.750.  If we get there, great, but let's
3df8c3
+	 * imagine the system is so heavily loaded that our process is
3df8c3
+	 * preempted and by the time we get to run again, the system clock
3df8c3
+	 * reads 6:07:11.990.  We now want to wait until the next xx:xx:xx.750
3df8c3
+	 * time, which is 6:07:12.750 (4.5 seconds after the reference time),
3df8c3
+	 * at which point we will set the Hardware Clock to 1:02:07 (4 seconds
3df8c3
+	 * after the originally requested time).  If we do that successfully,
3df8c3
+	 * then at 6:07:13.250 (5 seconds after the reference time), the
3df8c3
+	 * Hardware Clock will update to 1:02:08 (5 seconds after the
3df8c3
+	 * originally requested time), and all is well thereafter.
3df8c3
 	 */
3df8c3
-	do {
3df8c3
-		if (time_resync) {
3df8c3
-			gettimeofday(&beginsystime, NULL);
3df8c3
-			tdiff = time_diff(beginsystime, refsystime);
3df8c3
-			newhwtime = sethwtime + (int)(tdiff + 0.5);
3df8c3
-			if (debug)
3df8c3
-				printf(_
3df8c3
-				       ("Time elapsed since reference time has been %.6f seconds.\n"
3df8c3
-					"Delaying further to reach the new time.\n"),
3df8c3
-				       tdiff);
3df8c3
-			time_resync = 0;
3df8c3
+
3df8c3
+	time_t newhwtime = sethwtime;
3df8c3
+	double target_time_tolerance_secs = 0.001;  /* initial value */
3df8c3
+	double tolerance_incr_secs = 0.001;	    /* initial value */
3df8c3
+	const double RTC_SET_DELAY_SECS = 0.5;	    /* 500 ms */
3df8c3
+	const struct timeval RTC_SET_DELAY_TV = { 0, RTC_SET_DELAY_SECS * 1E6 };
3df8c3
+
3df8c3
+	struct timeval targetsystime;
3df8c3
+	struct timeval nowsystime;
3df8c3
+	struct timeval prevsystime = refsystime;
3df8c3
+	double deltavstarget;
3df8c3
+
3df8c3
+	timeradd(&refsystime, &RTC_SET_DELAY_TV, &targetsystime);
3df8c3
+
3df8c3
+	while (1) {
3df8c3
+		double ticksize;
3df8c3
+
3df8c3
+		/* FOR TESTING ONLY: inject random delays of up to 1000ms */
3df8c3
+		if (debug >= 10) {
3df8c3
+			int usec = random() % 1000000;
3df8c3
+			printf(_("sleeping ~%d usec\n"), usec);
3df8c3
+			usleep(usec);
3df8c3
 		}
3df8c3
 
3df8c3
 		gettimeofday(&nowsystime, NULL);
3df8c3
-		tdiff = time_diff(nowsystime, beginsystime);
3df8c3
-		if (tdiff < 0) {
3df8c3
-			time_resync = 1;	/* probably backward time reset */
3df8c3
-			continue;
3df8c3
-		}
3df8c3
-		if (tdiff > 0.1) {
3df8c3
-			time_resync = 1;	/* probably forward time reset */
3df8c3
-			continue;
3df8c3
+		deltavstarget = time_diff(nowsystime, targetsystime);
3df8c3
+		ticksize = time_diff(nowsystime, prevsystime);
3df8c3
+		prevsystime = nowsystime;
3df8c3
+
3df8c3
+		if (ticksize < 0) {
3df8c3
+			if (debug)
3df8c3
+				printf(_("time jumped backward %.6f seconds "
3df8c3
+					 "to %ld.%06d - retargeting\n"),
3df8c3
+				       ticksize, (long)nowsystime.tv_sec,
3df8c3
+				       (int)nowsystime.tv_usec);
3df8c3
+			/* The retarget is handled at the end of the loop. */
3df8c3
+		} else if (deltavstarget < 0) {
3df8c3
+			/* deltavstarget < 0 if current time < target time */
3df8c3
+			if (debug >= 2)
3df8c3
+				printf(_("%ld.%06d < %ld.%06d (%.6f)\n"),
3df8c3
+				       (long)nowsystime.tv_sec,
3df8c3
+				       (int)nowsystime.tv_usec,
3df8c3
+				       (long)targetsystime.tv_sec,
3df8c3
+				       (int)targetsystime.tv_usec,
3df8c3
+				       deltavstarget);
3df8c3
+			continue;  /* not there yet - keep spinning */
3df8c3
+		} else if (deltavstarget <= target_time_tolerance_secs) {
3df8c3
+			/* Close enough to the target time; done waiting. */
3df8c3
+			break;
3df8c3
+		} else /* (deltavstarget > target_time_tolerance_secs) */ {
3df8c3
+			/*
3df8c3
+			 * We missed our window.  Increase the tolerance and
3df8c3
+			 * aim for the next opportunity.
3df8c3
+			 */
3df8c3
+			if (debug)
3df8c3
+				printf(_("missed it - %ld.%06d is too far "
3df8c3
+					 "past %ld.%06d (%.6f > %.6f)\n"),
3df8c3
+				       (long)nowsystime.tv_sec,
3df8c3
+				       (int)nowsystime.tv_usec,
3df8c3
+				       (long)targetsystime.tv_sec,
3df8c3
+				       (int)targetsystime.tv_usec,
3df8c3
+				       deltavstarget,
3df8c3
+				       target_time_tolerance_secs);
3df8c3
+			target_time_tolerance_secs += tolerance_incr_secs;
3df8c3
+			tolerance_incr_secs *= 2;
3df8c3
 		}
3df8c3
-		beginsystime = nowsystime;
3df8c3
-		tdiff = time_diff(nowsystime, refsystime);
3df8c3
-	} while (newhwtime == sethwtime + (int)(tdiff + 0.5));
3df8c3
+
3df8c3
+		/*
3df8c3
+		 * Aim for the same offset (tv_usec) within the second in
3df8c3
+		 * either the current second (if that offset hasn't arrived
3df8c3
+		 * yet), or the next second.
3df8c3
+		 */
3df8c3
+		if (nowsystime.tv_usec < targetsystime.tv_usec)
3df8c3
+			targetsystime.tv_sec = nowsystime.tv_sec;
3df8c3
+		else
3df8c3
+			targetsystime.tv_sec = nowsystime.tv_sec + 1;
3df8c3
+	}
3df8c3
+
3df8c3
+	newhwtime = sethwtime
3df8c3
+		    + (int)(time_diff(nowsystime, refsystime)
3df8c3
+			    - RTC_SET_DELAY_SECS /* don't count this */
3df8c3
+			    + 0.5 /* for rounding */);
3df8c3
+	if (debug)
3df8c3
+		printf(_("%ld.%06d is close enough to %ld.%06d (%.6f < %.6f)\n"
3df8c3
+			 "Set RTC to %ld (%ld + %d; refsystime = %ld.%06d)\n"),
3df8c3
+		       (long)nowsystime.tv_sec, (int)nowsystime.tv_usec,
3df8c3
+		       (long)targetsystime.tv_sec, (int)targetsystime.tv_usec,
3df8c3
+		       deltavstarget, target_time_tolerance_secs,
3df8c3
+		       (long)newhwtime, (long)sethwtime,
3df8c3
+		       (int)(newhwtime - sethwtime),
3df8c3
+		       (long)refsystime.tv_sec, (int)refsystime.tv_usec);
3df8c3
 
3df8c3
 	set_hardware_clock(newhwtime, universal, testing);
3df8c3
 }
3df8c3
@@ -1636,7 +1734,7 @@ int main(int argc, char **argv)
3df8c3
 
3df8c3
 		switch (c) {
3df8c3
 		case 'D':
3df8c3
-			debug = TRUE;
3df8c3
+			++debug;
3df8c3
 			break;
3df8c3
 		case 'a':
3df8c3
 			adjust = TRUE;
3df8c3
@@ -1953,10 +2051,4 @@ void __attribute__((__noreturn__)) hwaudit_exit(int status)
3df8c3
  *
3df8c3
  * hwclock uses this method, and considers the Hardware Clock to have
3df8c3
  * infinite precision.
3df8c3
- *
3df8c3
- * TODO: Enhancements needed:
3df8c3
- *
3df8c3
- *  - When waiting for whole second boundary in set_hardware_clock_exact,
3df8c3
- *    fail if we miss the goal by more than .1 second, as could happen if we
3df8c3
- *    get pre-empted (by the kernel dispatcher).
3df8c3
  */
3df8c3
-- 
3df8c3
1.9.3
3df8c3