|
|
3df8c3 |
From 4a44a54b3caf77923f0e3f1d5bdf5eda6ef07f62 Mon Sep 17 00:00:00 2001
|
|
|
3df8c3 |
From: Chris MacGregor <chrismacgregor@google.com>
|
|
|
3df8c3 |
Date: Thu, 27 Feb 2014 10:40:59 -0800
|
|
|
3df8c3 |
Subject: [PATCH] hwclock: fix possible hang and other
|
|
|
3df8c3 |
set_hardware_clock_exact() issues
|
|
|
3df8c3 |
|
|
|
3df8c3 |
In sys-utils/hwclock.c, set_hardware_clock_exact() has some problems when the
|
|
|
3df8c3 |
process gets pre-empted (for more than 100ms) before reaching the time for
|
|
|
3df8c3 |
which it waits:
|
|
|
3df8c3 |
|
|
|
3df8c3 |
1. The "continue" statement causes execution to skip the final tdiff
|
|
|
3df8c3 |
assignment at the end of the do...while loop, leading to the while condition
|
|
|
3df8c3 |
using the wrong value of tdiff, and thus always exiting the loop once
|
|
|
3df8c3 |
newhwtime != sethwtime (e.g., after 1 second). This masks bug # 2, below.
|
|
|
3df8c3 |
|
|
|
3df8c3 |
2. The previously-existing bug is that because it starts over waiting for the
|
|
|
3df8c3 |
desired time whenever two successive calls to gettimeofday() return values >
|
|
|
3df8c3 |
100ms apart, the loop will never terminate unless the process holds the CPU
|
|
|
3df8c3 |
(without losing it for more than 100ms) for at least 500ms. This can happen
|
|
|
3df8c3 |
on a heavily loaded machine or on a virtual machine (or on a heavily loaded
|
|
|
3df8c3 |
virtual machine). This has been observed to occur, preventing a machine from
|
|
|
3df8c3 |
completing the shutdown or reboot process due to a "hwclock --systohc" call in
|
|
|
3df8c3 |
a shutdown script.
|
|
|
3df8c3 |
|
|
|
3df8c3 |
The new implementation presented in this patch takes a somewhat different
|
|
|
3df8c3 |
approach, intended to accomplish the same goals:
|
|
|
3df8c3 |
|
|
|
3df8c3 |
It computes the desired target system time (at which the requested hardware
|
|
|
3df8c3 |
clock time will be applied to the hardware clock), and waits for that time to
|
|
|
3df8c3 |
arrive. If it misses the time (such as due to being pre-empted for too long),
|
|
|
3df8c3 |
it recalculates the target time, and increases the tolerance (how late it can
|
|
|
3df8c3 |
be relative to the target time, and still be "close enough". Thus, if all is
|
|
|
3df8c3 |
well, the time will be set *very* precisely. On a machine where the hwclock
|
|
|
3df8c3 |
process is repeatedly pre-empted, it will set the time as precisely as is
|
|
|
3df8c3 |
possible under the conditions present on that particular machine. In any
|
|
|
3df8c3 |
case, it will always terminate eventually (and pretty quickly); it will never
|
|
|
3df8c3 |
hang forever.
|
|
|
3df8c3 |
|
|
|
3df8c3 |
[kzak@redhat.com: - tiny coding style changes]
|
|
|
3df8c3 |
|
|
|
3df8c3 |
Signed-off-by: Chris MacGregor <chrismacgregor@google.com>
|
|
|
3df8c3 |
Signed-off-by: Karel Zak <kzak@redhat.com>
|
|
|
3df8c3 |
---
|
|
|
3df8c3 |
sys-utils/hwclock.c | 170 ++++++++++++++++++++++++++++++++++++++++------------
|
|
|
3df8c3 |
1 file changed, 131 insertions(+), 39 deletions(-)
|
|
|
3df8c3 |
|
|
|
3df8c3 |
diff --git a/sys-utils/hwclock.c b/sys-utils/hwclock.c
|
|
|
3df8c3 |
index 30660d4..395b5c3 100644
|
|
|
3df8c3 |
--- a/sys-utils/hwclock.c
|
|
|
3df8c3 |
+++ b/sys-utils/hwclock.c
|
|
|
3df8c3 |
@@ -125,7 +125,7 @@ struct adjtime {
|
|
|
3df8c3 |
* We are running in debug mode, wherein we put a lot of information about
|
|
|
3df8c3 |
* what we're doing to standard output.
|
|
|
3df8c3 |
*/
|
|
|
3df8c3 |
-bool debug;
|
|
|
3df8c3 |
+int debug;
|
|
|
3df8c3 |
|
|
|
3df8c3 |
/* Workaround for Award 4.50g BIOS bug: keep the year in a file. */
|
|
|
3df8c3 |
bool badyear;
|
|
|
3df8c3 |
@@ -526,43 +526,141 @@ set_hardware_clock_exact(const time_t sethwtime,
|
|
|
3df8c3 |
const struct timeval refsystime,
|
|
|
3df8c3 |
const bool universal, const bool testing)
|
|
|
3df8c3 |
{
|
|
|
3df8c3 |
- time_t newhwtime = sethwtime;
|
|
|
3df8c3 |
- struct timeval beginsystime, nowsystime;
|
|
|
3df8c3 |
- double tdiff;
|
|
|
3df8c3 |
- int time_resync = 1;
|
|
|
3df8c3 |
-
|
|
|
3df8c3 |
/*
|
|
|
3df8c3 |
- * Now delay some more until Hardware Clock time newhwtime arrives.
|
|
|
3df8c3 |
- * The 0.5 s is because the Hardware Clock always sets to your set
|
|
|
3df8c3 |
- * time plus 500 ms (because it is designed to update to the next
|
|
|
3df8c3 |
- * second precisely 500 ms after you finish the setting).
|
|
|
3df8c3 |
+ * The Hardware Clock can only be set to any integer time plus one
|
|
|
3df8c3 |
+ * half second. The integer time is required because there is no
|
|
|
3df8c3 |
+ * interface to set or get a fractional second. The additional half
|
|
|
3df8c3 |
+ * second is because the Hardware Clock updates to the following
|
|
|
3df8c3 |
+ * second precisely 500 ms (not 1 second!) after you release the
|
|
|
3df8c3 |
+ * divider reset (after setting the new time) - see description of
|
|
|
3df8c3 |
+ * DV2, DV1, DV0 in Register A in the MC146818A data sheet (and note
|
|
|
3df8c3 |
+ * that although that document doesn't say so, real-world code seems
|
|
|
3df8c3 |
+ * to expect that the SET bit in Register B functions the same way).
|
|
|
3df8c3 |
+ * That means that, e.g., when you set the clock to 1:02:03, it
|
|
|
3df8c3 |
+ * effectively really sets it to 1:02:03.5, because it will update to
|
|
|
3df8c3 |
+ * 1:02:04 only half a second later. Our caller passes the desired
|
|
|
3df8c3 |
+ * integer Hardware Clock time in sethwtime, and the corresponding
|
|
|
3df8c3 |
+ * system time (which may have a fractional part, and which may or may
|
|
|
3df8c3 |
+ * not be the same!) in refsystime. In an ideal situation, we would
|
|
|
3df8c3 |
+ * then apply sethwtime to the Hardware Clock at refsystime+500ms, so
|
|
|
3df8c3 |
+ * that when the Hardware Clock ticks forward to sethwtime+1s half a
|
|
|
3df8c3 |
+ * second later at refsystime+1000ms, everything is in sync. So we
|
|
|
3df8c3 |
+ * spin, waiting for gettimeofday() to return a time at or after that
|
|
|
3df8c3 |
+ * time (refsystime+500ms) up to a tolerance value, initially 1ms. If
|
|
|
3df8c3 |
+ * we miss that time due to being preempted for some other process,
|
|
|
3df8c3 |
+ * then we increase the margin a little bit (initially 1ms, doubling
|
|
|
3df8c3 |
+ * each time), add 1 second (or more, if needed to get a time that is
|
|
|
3df8c3 |
+ * in the future) to both the time for which we are waiting and the
|
|
|
3df8c3 |
+ * time that we will apply to the Hardware Clock, and start waiting
|
|
|
3df8c3 |
+ * again.
|
|
|
3df8c3 |
+ *
|
|
|
3df8c3 |
+ * For example, the caller requests that we set the Hardware Clock to
|
|
|
3df8c3 |
+ * 1:02:03, with reference time (current system time) = 6:07:08.250.
|
|
|
3df8c3 |
+ * We want the Hardware Clock to update to 1:02:04 at 6:07:09.250 on
|
|
|
3df8c3 |
+ * the system clock, and the first such update will occur 0.500
|
|
|
3df8c3 |
+ * seconds after we write to the Hardware Clock, so we spin until the
|
|
|
3df8c3 |
+ * system clock reads 6:07:08.750. If we get there, great, but let's
|
|
|
3df8c3 |
+ * imagine the system is so heavily loaded that our process is
|
|
|
3df8c3 |
+ * preempted and by the time we get to run again, the system clock
|
|
|
3df8c3 |
+ * reads 6:07:11.990. We now want to wait until the next xx:xx:xx.750
|
|
|
3df8c3 |
+ * time, which is 6:07:12.750 (4.5 seconds after the reference time),
|
|
|
3df8c3 |
+ * at which point we will set the Hardware Clock to 1:02:07 (4 seconds
|
|
|
3df8c3 |
+ * after the originally requested time). If we do that successfully,
|
|
|
3df8c3 |
+ * then at 6:07:13.250 (5 seconds after the reference time), the
|
|
|
3df8c3 |
+ * Hardware Clock will update to 1:02:08 (5 seconds after the
|
|
|
3df8c3 |
+ * originally requested time), and all is well thereafter.
|
|
|
3df8c3 |
*/
|
|
|
3df8c3 |
- do {
|
|
|
3df8c3 |
- if (time_resync) {
|
|
|
3df8c3 |
- gettimeofday(&beginsystime, NULL);
|
|
|
3df8c3 |
- tdiff = time_diff(beginsystime, refsystime);
|
|
|
3df8c3 |
- newhwtime = sethwtime + (int)(tdiff + 0.5);
|
|
|
3df8c3 |
- if (debug)
|
|
|
3df8c3 |
- printf(_
|
|
|
3df8c3 |
- ("Time elapsed since reference time has been %.6f seconds.\n"
|
|
|
3df8c3 |
- "Delaying further to reach the new time.\n"),
|
|
|
3df8c3 |
- tdiff);
|
|
|
3df8c3 |
- time_resync = 0;
|
|
|
3df8c3 |
+
|
|
|
3df8c3 |
+ time_t newhwtime = sethwtime;
|
|
|
3df8c3 |
+ double target_time_tolerance_secs = 0.001; /* initial value */
|
|
|
3df8c3 |
+ double tolerance_incr_secs = 0.001; /* initial value */
|
|
|
3df8c3 |
+ const double RTC_SET_DELAY_SECS = 0.5; /* 500 ms */
|
|
|
3df8c3 |
+ const struct timeval RTC_SET_DELAY_TV = { 0, RTC_SET_DELAY_SECS * 1E6 };
|
|
|
3df8c3 |
+
|
|
|
3df8c3 |
+ struct timeval targetsystime;
|
|
|
3df8c3 |
+ struct timeval nowsystime;
|
|
|
3df8c3 |
+ struct timeval prevsystime = refsystime;
|
|
|
3df8c3 |
+ double deltavstarget;
|
|
|
3df8c3 |
+
|
|
|
3df8c3 |
+ timeradd(&refsystime, &RTC_SET_DELAY_TV, &targetsystime);
|
|
|
3df8c3 |
+
|
|
|
3df8c3 |
+ while (1) {
|
|
|
3df8c3 |
+ double ticksize;
|
|
|
3df8c3 |
+
|
|
|
3df8c3 |
+ /* FOR TESTING ONLY: inject random delays of up to 1000ms */
|
|
|
3df8c3 |
+ if (debug >= 10) {
|
|
|
3df8c3 |
+ int usec = random() % 1000000;
|
|
|
3df8c3 |
+ printf(_("sleeping ~%d usec\n"), usec);
|
|
|
3df8c3 |
+ usleep(usec);
|
|
|
3df8c3 |
}
|
|
|
3df8c3 |
|
|
|
3df8c3 |
gettimeofday(&nowsystime, NULL);
|
|
|
3df8c3 |
- tdiff = time_diff(nowsystime, beginsystime);
|
|
|
3df8c3 |
- if (tdiff < 0) {
|
|
|
3df8c3 |
- time_resync = 1; /* probably backward time reset */
|
|
|
3df8c3 |
- continue;
|
|
|
3df8c3 |
- }
|
|
|
3df8c3 |
- if (tdiff > 0.1) {
|
|
|
3df8c3 |
- time_resync = 1; /* probably forward time reset */
|
|
|
3df8c3 |
- continue;
|
|
|
3df8c3 |
+ deltavstarget = time_diff(nowsystime, targetsystime);
|
|
|
3df8c3 |
+ ticksize = time_diff(nowsystime, prevsystime);
|
|
|
3df8c3 |
+ prevsystime = nowsystime;
|
|
|
3df8c3 |
+
|
|
|
3df8c3 |
+ if (ticksize < 0) {
|
|
|
3df8c3 |
+ if (debug)
|
|
|
3df8c3 |
+ printf(_("time jumped backward %.6f seconds "
|
|
|
3df8c3 |
+ "to %ld.%06d - retargeting\n"),
|
|
|
3df8c3 |
+ ticksize, (long)nowsystime.tv_sec,
|
|
|
3df8c3 |
+ (int)nowsystime.tv_usec);
|
|
|
3df8c3 |
+ /* The retarget is handled at the end of the loop. */
|
|
|
3df8c3 |
+ } else if (deltavstarget < 0) {
|
|
|
3df8c3 |
+ /* deltavstarget < 0 if current time < target time */
|
|
|
3df8c3 |
+ if (debug >= 2)
|
|
|
3df8c3 |
+ printf(_("%ld.%06d < %ld.%06d (%.6f)\n"),
|
|
|
3df8c3 |
+ (long)nowsystime.tv_sec,
|
|
|
3df8c3 |
+ (int)nowsystime.tv_usec,
|
|
|
3df8c3 |
+ (long)targetsystime.tv_sec,
|
|
|
3df8c3 |
+ (int)targetsystime.tv_usec,
|
|
|
3df8c3 |
+ deltavstarget);
|
|
|
3df8c3 |
+ continue; /* not there yet - keep spinning */
|
|
|
3df8c3 |
+ } else if (deltavstarget <= target_time_tolerance_secs) {
|
|
|
3df8c3 |
+ /* Close enough to the target time; done waiting. */
|
|
|
3df8c3 |
+ break;
|
|
|
3df8c3 |
+ } else /* (deltavstarget > target_time_tolerance_secs) */ {
|
|
|
3df8c3 |
+ /*
|
|
|
3df8c3 |
+ * We missed our window. Increase the tolerance and
|
|
|
3df8c3 |
+ * aim for the next opportunity.
|
|
|
3df8c3 |
+ */
|
|
|
3df8c3 |
+ if (debug)
|
|
|
3df8c3 |
+ printf(_("missed it - %ld.%06d is too far "
|
|
|
3df8c3 |
+ "past %ld.%06d (%.6f > %.6f)\n"),
|
|
|
3df8c3 |
+ (long)nowsystime.tv_sec,
|
|
|
3df8c3 |
+ (int)nowsystime.tv_usec,
|
|
|
3df8c3 |
+ (long)targetsystime.tv_sec,
|
|
|
3df8c3 |
+ (int)targetsystime.tv_usec,
|
|
|
3df8c3 |
+ deltavstarget,
|
|
|
3df8c3 |
+ target_time_tolerance_secs);
|
|
|
3df8c3 |
+ target_time_tolerance_secs += tolerance_incr_secs;
|
|
|
3df8c3 |
+ tolerance_incr_secs *= 2;
|
|
|
3df8c3 |
}
|
|
|
3df8c3 |
- beginsystime = nowsystime;
|
|
|
3df8c3 |
- tdiff = time_diff(nowsystime, refsystime);
|
|
|
3df8c3 |
- } while (newhwtime == sethwtime + (int)(tdiff + 0.5));
|
|
|
3df8c3 |
+
|
|
|
3df8c3 |
+ /*
|
|
|
3df8c3 |
+ * Aim for the same offset (tv_usec) within the second in
|
|
|
3df8c3 |
+ * either the current second (if that offset hasn't arrived
|
|
|
3df8c3 |
+ * yet), or the next second.
|
|
|
3df8c3 |
+ */
|
|
|
3df8c3 |
+ if (nowsystime.tv_usec < targetsystime.tv_usec)
|
|
|
3df8c3 |
+ targetsystime.tv_sec = nowsystime.tv_sec;
|
|
|
3df8c3 |
+ else
|
|
|
3df8c3 |
+ targetsystime.tv_sec = nowsystime.tv_sec + 1;
|
|
|
3df8c3 |
+ }
|
|
|
3df8c3 |
+
|
|
|
3df8c3 |
+ newhwtime = sethwtime
|
|
|
3df8c3 |
+ + (int)(time_diff(nowsystime, refsystime)
|
|
|
3df8c3 |
+ - RTC_SET_DELAY_SECS /* don't count this */
|
|
|
3df8c3 |
+ + 0.5 /* for rounding */);
|
|
|
3df8c3 |
+ if (debug)
|
|
|
3df8c3 |
+ printf(_("%ld.%06d is close enough to %ld.%06d (%.6f < %.6f)\n"
|
|
|
3df8c3 |
+ "Set RTC to %ld (%ld + %d; refsystime = %ld.%06d)\n"),
|
|
|
3df8c3 |
+ (long)nowsystime.tv_sec, (int)nowsystime.tv_usec,
|
|
|
3df8c3 |
+ (long)targetsystime.tv_sec, (int)targetsystime.tv_usec,
|
|
|
3df8c3 |
+ deltavstarget, target_time_tolerance_secs,
|
|
|
3df8c3 |
+ (long)newhwtime, (long)sethwtime,
|
|
|
3df8c3 |
+ (int)(newhwtime - sethwtime),
|
|
|
3df8c3 |
+ (long)refsystime.tv_sec, (int)refsystime.tv_usec);
|
|
|
3df8c3 |
|
|
|
3df8c3 |
set_hardware_clock(newhwtime, universal, testing);
|
|
|
3df8c3 |
}
|
|
|
3df8c3 |
@@ -1636,7 +1734,7 @@ int main(int argc, char **argv)
|
|
|
3df8c3 |
|
|
|
3df8c3 |
switch (c) {
|
|
|
3df8c3 |
case 'D':
|
|
|
3df8c3 |
- debug = TRUE;
|
|
|
3df8c3 |
+ ++debug;
|
|
|
3df8c3 |
break;
|
|
|
3df8c3 |
case 'a':
|
|
|
3df8c3 |
adjust = TRUE;
|
|
|
3df8c3 |
@@ -1953,10 +2051,4 @@ void __attribute__((__noreturn__)) hwaudit_exit(int status)
|
|
|
3df8c3 |
*
|
|
|
3df8c3 |
* hwclock uses this method, and considers the Hardware Clock to have
|
|
|
3df8c3 |
* infinite precision.
|
|
|
3df8c3 |
- *
|
|
|
3df8c3 |
- * TODO: Enhancements needed:
|
|
|
3df8c3 |
- *
|
|
|
3df8c3 |
- * - When waiting for whole second boundary in set_hardware_clock_exact,
|
|
|
3df8c3 |
- * fail if we miss the goal by more than .1 second, as could happen if we
|
|
|
3df8c3 |
- * get pre-empted (by the kernel dispatcher).
|
|
|
3df8c3 |
*/
|
|
|
3df8c3 |
--
|
|
|
3df8c3 |
1.9.3
|
|
|
3df8c3 |
|