218e99
From b0acf414158ed72f3f2ac7550839af74b0530c8e Mon Sep 17 00:00:00 2001
218e99
From: Nigel Croxon <ncroxon@redhat.com>
218e99
Date: Wed, 31 Jul 2013 15:12:19 +0200
218e99
Subject: Force auto-convegence of live migration
218e99
218e99
RH-Author: Nigel Croxon <ncroxon@redhat.com>
218e99
Message-id: <1375283539-18714-4-git-send-email-ncroxon@redhat.com>
218e99
Patchwork-id: 52873
218e99
O-Subject: [RHEL7 PATCH 3/3] Force auto-convegence of live migration
218e99
Bugzilla: 985958
218e99
RH-Acked-by: Orit Wasserman <owasserm@redhat.com>
218e99
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
218e99
RH-Acked-by: Laszlo Ersek <lersek@redhat.com>
218e99
218e99
Bugzilla: 985958 - Throttle-down guest to help with live migration convergence (backport to RHEL7.0)
218e99
https://bugzilla.redhat.com/show_bug.cgi?id=985958
218e99
218e99
Backported from the following upstream commit:
218e99
218e99
commit 7ca1dfad952d8a8655b32e78623edcc38a51b14a
218e99
Author: Chegu Vinod <chegu_vinod@hp.com>
218e99
Date:   Mon Jun 24 03:47:39 2013 -0600
218e99
218e99
    Force auto-convegence of live migration
218e99
218e99
    If a user chooses to turn on the auto-converge migration capability
218e99
    these changes detect the lack of convergence and throttle down the
218e99
    guest. i.e. force the VCPUs out of the guest for some duration
218e99
    and let the migration thread catchup and help converge.
218e99
218e99
    Verified the convergence using the following :
218e99
     - Java Warehouse workload running on a 20VCPU/256G guest(~80% busy)
218e99
     - OLTP like workload running on a 80VCPU/512G guest (~80% busy)
218e99
218e99
    Sample results with Java warehouse workload : (migrate speed set to 20Gb and
218e99
    migrate downtime set to 4seconds).
218e99
218e99
     (qemu) info migrate
218e99
     capabilities: xbzrle: off auto-converge: off  <----
218e99
     Migration status: active
218e99
     total time: 1487503 milliseconds
218e99
     expected downtime: 519 milliseconds
218e99
     transferred ram: 383749347 kbytes
218e99
     remaining ram: 2753372 kbytes
218e99
     total ram: 268444224 kbytes
218e99
     duplicate: 65461532 pages
218e99
     skipped: 64901568 pages
218e99
     normal: 95750218 pages
218e99
     normal bytes: 383000872 kbytes
218e99
     dirty pages rate: 67551 pages
218e99
218e99
     ---
218e99
218e99
     (qemu) info migrate
218e99
     capabilities: xbzrle: off auto-converge: on   <----
218e99
     Migration status: completed
218e99
     total time: 241161 milliseconds
218e99
     downtime: 6373 milliseconds
218e99
     transferred ram: 28235307 kbytes
218e99
     remaining ram: 0 kbytes
218e99
     total ram: 268444224 kbytes
218e99
     duplicate: 64946416 pages
218e99
     skipped: 64903523 pages
218e99
     normal: 7044971 pages
218e99
     normal bytes: 28179884 kbytes
218e99
218e99
    Signed-off-by: Chegu Vinod <chegu_vinod@hp.com>
218e99
    Signed-off-by: Juan Quintela <quintela@redhat.com>
218e99
218e99
diff --git a/arch_init.c b/arch_init.c
218e99
index 522caeb..d7a5d7c 100644
218e99
--- a/arch_init.c
218e99
+++ b/arch_init.c
218e99
@@ -104,6 +104,9 @@ int graphic_depth = 15;
218e99
 #endif
218e99
 
218e99
 const uint32_t arch_type = QEMU_ARCH;
218e99
+static bool mig_throttle_on;
218e99
+static int dirty_rate_high_cnt;
218e99
+static void check_guest_throttling(void);
218e99
 
218e99
 /***********************************************************/
218e99
 /* ram save/restore */
218e99
@@ -378,8 +381,14 @@ static void migration_bitmap_sync(void)
218e99
     uint64_t num_dirty_pages_init = migration_dirty_pages;
218e99
     MigrationState *s = migrate_get_current();
218e99
     static int64_t start_time;
218e99
+    static int64_t bytes_xfer_prev;
218e99
     static int64_t num_dirty_pages_period;
218e99
     int64_t end_time;
218e99
+    int64_t bytes_xfer_now;
218e99
+
218e99
+    if (!bytes_xfer_prev) {
218e99
+        bytes_xfer_prev = ram_bytes_transferred();
218e99
+    }
218e99
 
218e99
     if (!start_time) {
218e99
         start_time = qemu_get_clock_ms(rt_clock);
218e99
@@ -404,6 +413,25 @@ static void migration_bitmap_sync(void)
218e99
 
218e99
     /* more than 1 second = 1000 millisecons */
218e99
     if (end_time > start_time + 1000) {
218e99
+        if (migrate_auto_converge()) {
218e99
+            /* The following detection logic can be refined later. For now:
218e99
+               Check to see if the dirtied bytes exceed 50% of the approx.
218e99
+               amount of bytes that just got transferred since the last time we
218e99
+               were in this routine. If that happens >N times (for now N==4)
218e99
+               we turn on the throttle down logic */
218e99
+            bytes_xfer_now = ram_bytes_transferred();
218e99
+            if (s->dirty_pages_rate &&
218e99
+               (num_dirty_pages_period * TARGET_PAGE_SIZE >
218e99
+                   (bytes_xfer_now - bytes_xfer_prev)/2) &&
218e99
+               (dirty_rate_high_cnt++ > 4)) {
218e99
+                    trace_migration_throttle();
218e99
+                    mig_throttle_on = true;
218e99
+                    dirty_rate_high_cnt = 0;
218e99
+             }
218e99
+             bytes_xfer_prev = bytes_xfer_now;
218e99
+        } else {
218e99
+             mig_throttle_on = false;
218e99
+        }
218e99
         s->dirty_pages_rate = num_dirty_pages_period * 1000
218e99
             / (end_time - start_time);
218e99
         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
218e99
@@ -566,6 +594,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
218e99
     migration_bitmap = bitmap_new(ram_pages);
218e99
     bitmap_set(migration_bitmap, 0, ram_pages);
218e99
     migration_dirty_pages = ram_pages;
218e99
+    mig_throttle_on = false;
218e99
+    dirty_rate_high_cnt = 0;
218e99
 
218e99
     if (migrate_use_xbzrle()) {
218e99
         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
218e99
@@ -628,6 +658,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
218e99
         }
218e99
         total_sent += bytes_sent;
218e99
         acct_info.iterations++;
218e99
+        check_guest_throttling();
218e99
         /* we want to check in the 1st loop, just in case it was the 1st time
218e99
            and we had to sync the dirty bitmap.
218e99
            qemu_get_clock_ns() is a bit expensive, so we only check each some
218e99
@@ -1097,3 +1128,53 @@ TargetInfo *qmp_query_target(Error **errp)
218e99
 
218e99
     return info;
218e99
 }
218e99
+
218e99
+/* Stub function that gets run on the vcpu when it's brought out of the
218e99
+   VM to run inside qemu via async_run_on_cpu()*/
218e99
+static void mig_sleep_cpu(void *opq)
218e99
+{
218e99
+    qemu_mutex_unlock_iothread();
218e99
+    g_usleep(30*1000);
218e99
+    qemu_mutex_lock_iothread();
218e99
+}
218e99
+
218e99
+/* To reduce the dirty rate explicitly disallow the VCPUs from spending
218e99
+   much time in the VM. The migration thread will try to catch up.
218e99
+   Workload will experience a performance drop.
218e99
+*/
218e99
+static void mig_throttle_cpu_down(CPUState *cpu, void *data)
218e99
+{
218e99
+    async_run_on_cpu(cpu, mig_sleep_cpu, NULL);
218e99
+}
218e99
+
218e99
+static void mig_throttle_guest_down(void)
218e99
+{
218e99
+    qemu_mutex_lock_iothread();
218e99
+    qemu_for_each_cpu(mig_throttle_cpu_down, NULL);
218e99
+    qemu_mutex_unlock_iothread();
218e99
+}
218e99
+
218e99
+static void check_guest_throttling(void)
218e99
+{
218e99
+    static int64_t t0;
218e99
+    int64_t        t1;
218e99
+
218e99
+    if (!mig_throttle_on) {
218e99
+        return;
218e99
+    }
218e99
+
218e99
+    if (!t0)  {
218e99
+        t0 = qemu_get_clock_ns(rt_clock);
218e99
+        return;
218e99
+    }
218e99
+
218e99
+    t1 = qemu_get_clock_ns(rt_clock);
218e99
+
218e99
+    /* If it has been more than 40 ms since the last time the guest
218e99
+     * was throttled then do it again.
218e99
+     */
218e99
+    if (40 < (t1-t0)/1000000) {
218e99
+        mig_throttle_guest_down();
218e99
+        t0 = t1;
218e99
+    }
218e99
+}
218e99
diff --git a/trace-events b/trace-events
218e99
index 9c73931..7cd335d 100644
218e99
--- a/trace-events
218e99
+++ b/trace-events
218e99
@@ -1031,6 +1031,7 @@ savevm_section_end(unsigned int section_id) "section_id %u"
218e99
 # arch_init.c
218e99
 migration_bitmap_sync_start(void) ""
218e99
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
218e99
+migration_throttle(void) ""
218e99
 
218e99
 # hw/qxl.c
218e99
 disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"