From b0acf414158ed72f3f2ac7550839af74b0530c8e Mon Sep 17 00:00:00 2001
From: Nigel Croxon <ncroxon@redhat.com>
Date: Wed, 31 Jul 2013 15:12:19 +0200
Subject: Force auto-convergence of live migration

RH-Author: Nigel Croxon <ncroxon@redhat.com>
Message-id: <1375283539-18714-4-git-send-email-ncroxon@redhat.com>
Patchwork-id: 52873
O-Subject: [RHEL7 PATCH 3/3] Force auto-convergence of live migration
Bugzilla: 985958
RH-Acked-by: Orit Wasserman <owasserm@redhat.com>
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
RH-Acked-by: Laszlo Ersek <lersek@redhat.com>

Bugzilla: 985958 - Throttle-down guest to help with live migration convergence (backport to RHEL7.0)
https://bugzilla.redhat.com/show_bug.cgi?id=985958
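The auto-converge capability defaults to off. One way to enable it from the HMP monitor before starting a migration, shown here only as a usage sketch (the speed and downtime values mirror the test setup quoted from the upstream commit below, and the destination URI is a placeholder):

 (qemu) migrate_set_capability auto-converge on
 (qemu) migrate_set_speed 20G
 (qemu) migrate_set_downtime 4
 (qemu) migrate -d tcp:<dest-host>:4444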
Backported from the following upstream commit:

commit 7ca1dfad952d8a8655b32e78623edcc38a51b14a
Author: Chegu Vinod <chegu_vinod@hp.com>
Date:   Mon Jun 24 03:47:39 2013 -0600

    Force auto-convergence of live migration

    If a user chooses to turn on the auto-converge migration capability,
    these changes detect the lack of convergence and throttle down the
    guest, i.e. force the VCPUs out of the guest for some duration
    and let the migration thread catch up and help converge.

    Verified the convergence using the following:
     - Java Warehouse workload running on a 20VCPU/256G guest (~80% busy)
     - OLTP-like workload running on an 80VCPU/512G guest (~80% busy)

    Sample results with the Java warehouse workload (migrate speed set to 20Gb and
    migrate downtime set to 4 seconds):

     (qemu) info migrate
     capabilities: xbzrle: off auto-converge: off  <----
     Migration status: active
     total time: 1487503 milliseconds
     expected downtime: 519 milliseconds
     transferred ram: 383749347 kbytes
     remaining ram: 2753372 kbytes
     total ram: 268444224 kbytes
     duplicate: 65461532 pages
     skipped: 64901568 pages
     normal: 95750218 pages
     normal bytes: 383000872 kbytes
     dirty pages rate: 67551 pages

     ---

     (qemu) info migrate
     capabilities: xbzrle: off auto-converge: on   <----
     Migration status: completed
     total time: 241161 milliseconds
     downtime: 6373 milliseconds
     transferred ram: 28235307 kbytes
     remaining ram: 0 kbytes
     total ram: 268444224 kbytes
     duplicate: 64946416 pages
     skipped: 64903523 pages
     normal: 7044971 pages
     normal bytes: 28179884 kbytes

    Signed-off-by: Chegu Vinod <chegu_vinod@hp.com>
    Signed-off-by: Juan Quintela <quintela@redhat.com>

diff --git a/arch_init.c b/arch_init.c
index 522caeb..d7a5d7c 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -104,6 +104,9 @@ int graphic_depth = 15;
 #endif
 
 const uint32_t arch_type = QEMU_ARCH;
+static bool mig_throttle_on;
+static int dirty_rate_high_cnt;
+static void check_guest_throttling(void);
 
 /***********************************************************/
 /* ram save/restore */
@@ -378,8 +381,14 @@ static void migration_bitmap_sync(void)
     uint64_t num_dirty_pages_init = migration_dirty_pages;
     MigrationState *s = migrate_get_current();
     static int64_t start_time;
+    static int64_t bytes_xfer_prev;
     static int64_t num_dirty_pages_period;
     int64_t end_time;
+    int64_t bytes_xfer_now;
+
+    if (!bytes_xfer_prev) {
+        bytes_xfer_prev = ram_bytes_transferred();
+    }
 
     if (!start_time) {
         start_time = qemu_get_clock_ms(rt_clock);
@@ -404,6 +413,25 @@ static void migration_bitmap_sync(void)
 
     /* more than 1 second = 1000 millisecons */
     if (end_time > start_time + 1000) {
+        if (migrate_auto_converge()) {
+            /* The following detection logic can be refined later. For now:
+               Check to see if the dirtied bytes is 50% more than the approx.
+               amount of bytes that just got transferred since the last time we
+               were in this routine. If that happens >N times (for now N==4)
+               we turn on the throttle down logic */
+            bytes_xfer_now = ram_bytes_transferred();
+            if (s->dirty_pages_rate &&
+               (num_dirty_pages_period * TARGET_PAGE_SIZE >
+                   (bytes_xfer_now - bytes_xfer_prev)/2) &&
+               (dirty_rate_high_cnt++ > 4)) {
+                    trace_migration_throttle();
+                    mig_throttle_on = true;
+                    dirty_rate_high_cnt = 0;
+             }
+             bytes_xfer_prev = bytes_xfer_now;
+        } else {
+             mig_throttle_on = false;
+        }
         s->dirty_pages_rate = num_dirty_pages_period * 1000
             / (end_time - start_time);
         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
@@ -566,6 +594,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     migration_bitmap = bitmap_new(ram_pages);
     bitmap_set(migration_bitmap, 0, ram_pages);
     migration_dirty_pages = ram_pages;
+    mig_throttle_on = false;
+    dirty_rate_high_cnt = 0;
 
     if (migrate_use_xbzrle()) {
         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
@@ -628,6 +658,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         }
         total_sent += bytes_sent;
         acct_info.iterations++;
+        check_guest_throttling();
         /* we want to check in the 1st loop, just in case it was the 1st time
            and we had to sync the dirty bitmap.
            qemu_get_clock_ns() is a bit expensive, so we only check each some
@@ -1097,3 +1128,53 @@ TargetInfo *qmp_query_target(Error **errp)
 
     return info;
 }
+
+/* Stub function that's gets run on the vcpu when its brought out of the
+   VM to run inside qemu via async_run_on_cpu()*/
+static void mig_sleep_cpu(void *opq)
+{
+    qemu_mutex_unlock_iothread();
+    g_usleep(30*1000);
+    qemu_mutex_lock_iothread();
+}
+
+/* To reduce the dirty rate explicitly disallow the VCPUs from spending
+   much time in the VM. The migration thread will try to catchup.
+   Workload will experience a performance drop.
+*/
+static void mig_throttle_cpu_down(CPUState *cpu, void *data)
+{
+    async_run_on_cpu(cpu, mig_sleep_cpu, NULL);
+}
+
+static void mig_throttle_guest_down(void)
+{
+    qemu_mutex_lock_iothread();
+    qemu_for_each_cpu(mig_throttle_cpu_down, NULL);
+    qemu_mutex_unlock_iothread();
+}
+
+static void check_guest_throttling(void)
+{
+    static int64_t t0;
+    int64_t        t1;
+
+    if (!mig_throttle_on) {
+        return;
+    }
+
+    if (!t0)  {
+        t0 = qemu_get_clock_ns(rt_clock);
+        return;
+    }
+
+    t1 = qemu_get_clock_ns(rt_clock);
+
+    /* If it has been more than 40 ms since the last time the guest
+     * was throttled then do it again.
+     */
+    if (40 < (t1-t0)/1000000) {
+        mig_throttle_guest_down();
+        t0 = t1;
+    }
+}
diff --git a/trace-events b/trace-events
index 9c73931..7cd335d 100644
--- a/trace-events
+++ b/trace-events
@@ -1031,6 +1031,7 @@ savevm_section_end(unsigned int section_id) "section_id %u"
 # arch_init.c
 migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
+migration_throttle(void) ""
 
 # hw/qxl.c
 disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
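
Note: the migration_throttle trace event added by this patch can be used to confirm when the throttling actually kicks in. A sketch, assuming QEMU was built with a trace backend that supports runtime control (for example the "simple" backend):

 (qemu) trace-event migration_throttle on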