Blob Blame History Raw
 WHATS_NEW      |  1 +
 tools/pvscan.c | 34 +++++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/WHATS_NEW b/WHATS_NEW
index aabfc78..7fd107a 100644
--- a/WHATS_NEW
+++ b/WHATS_NEW
@@ -1,5 +1,6 @@
 Version 2.02.104 - 
 ===================================
+  Workaround VG refresh race during autoactivation by retrying the refresh.
   Add dev-block-<major>:<minor>.device systemd alias for complete PV tracking.
   Use major:minor as short form of --major and --minor arg for pvscan --cache.
   Fix lvconvert swap of poolmetadata volume for active thin pool.
diff --git a/tools/pvscan.c b/tools/pvscan.c
index b6a07bd..ce8c446 100644
--- a/tools/pvscan.c
+++ b/tools/pvscan.c
@@ -91,10 +91,15 @@ static void _pvscan_display_single(struct cmd_context *cmd,
 				display_size(cmd, (uint64_t) (pv_pe_count(pv) - pv_pe_alloc_count(pv)) * pv_pe_size(pv)));
 }
 
+#define REFRESH_BEFORE_AUTOACTIVATION_RETRIES 5
+#define REFRESH_BEFORE_AUTOACTIVATION_RETRY_USLEEP_DELAY 100000
+
 static int _auto_activation_handler(struct cmd_context *cmd,
 				    const char *vgid, int partial,
 				    activation_change_t activate)
 {
+	unsigned int refresh_retries = REFRESH_BEFORE_AUTOACTIVATION_RETRIES;
+	int refresh_done = 0;
 	struct volume_group *vg;
 	int consistent = 0;
 	struct id vgid_raw;
@@ -115,7 +120,34 @@ static int _auto_activation_handler(struct cmd_context *cmd,
 		r = 1; goto out;
 	}
 
-	if (!vg_refresh_visible(vg->cmd, vg)) {
+	/* FIXME: There's a tiny race when suspending the device which is part
+	 * of the refresh because when suspend ioctl is performed, the dm
+	 * kernel driver executes (do_suspend and dm_suspend kernel fn):
+	 *
+	 *          step 1: a check whether the dev is already suspended and
+	 *                  if yes it returns success immediately as there's
+	 *                  nothing to do
+	 *          step 2: it grabs the suspend lock
+	 *          step 3: another check whether the dev is already suspended
+	 *                  and if found suspended, it exits with -EINVAL now
+	 *
+	 * The race can occur in between step 1 and step 2. To prevent premature
+	 * autoactivation failure, we're using a simple retry logic here before
+	 * we fail completely. For a complete solution, we need to fix the
+	 * locking so there's no possibility for suspend calls to interleave
+	 * each other to cause this kind of race.
+	 *
+	 * Remove this workaround with "refresh_retries" once we have proper locking in!
+	 */
+	while (refresh_retries--) {
+		if (vg_refresh_visible(vg->cmd, vg)) {
+			refresh_done = 1;
+			break;
+		}
+		usleep(REFRESH_BEFORE_AUTOACTIVATION_RETRY_USLEEP_DELAY);
+	}
+
+	if (!refresh_done) {
 		log_error("%s: refresh before autoactivation failed.", vg->name);
 		goto out;
 	}