From b0acf414158ed72f3f2ac7550839af74b0530c8e Mon Sep 17 00:00:00 2001
From: Nigel Croxon <ncroxon@redhat.com>
Date: Wed, 31 Jul 2013 15:12:19 +0200
Subject: Force auto-convergence of live migration

RH-Author: Nigel Croxon <ncroxon@redhat.com>
Message-id: <1375283539-18714-4-git-send-email-ncroxon@redhat.com>
Patchwork-id: 52873
O-Subject: [RHEL7 PATCH 3/3] Force auto-convergence of live migration
Bugzilla: 985958
RH-Acked-by: Orit Wasserman <owasserm@redhat.com>
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
RH-Acked-by: Laszlo Ersek <lersek@redhat.com>

Bugzilla: 985958 - Throttle-down guest to help with live migration convergence (backport to RHEL7.0)
https://bugzilla.redhat.com/show_bug.cgi?id=985958

Backported from the following upstream commit:

commit 7ca1dfad952d8a8655b32e78623edcc38a51b14a
Author: Chegu Vinod <chegu_vinod@hp.com>
Date: Mon Jun 24 03:47:39 2013 -0600

Force auto-convergence of live migration

If a user chooses to turn on the auto-converge migration capability,
these changes detect the lack of convergence and throttle down the
guest, i.e. force the VCPUs out of the guest for some duration and
let the migration thread catch up and help the migration converge.
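
For context, auto-converge is a migration capability and is off by default.
A minimal HMP sketch for enabling it on the source before starting the
migration could look like the following (the destination URI is a
placeholder, not taken from the test below):

(qemu) migrate_set_capability auto-converge on
(qemu) migrate -d tcp:<dest-host>:4444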

Verified the convergence using the following:
- Java Warehouse workload running on a 20VCPU/256G guest (~80% busy)
- OLTP-like workload running on an 80VCPU/512G guest (~80% busy)

Sample results with the Java warehouse workload are shown below (migrate
speed set to 20Gb and migrate downtime set to 4 seconds).
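
The bandwidth and downtime limits are normally applied from the HMP monitor
before the migration is started; a rough sketch using the values quoted
above (not a capture from the original run):

(qemu) migrate_set_speed 20G
(qemu) migrate_set_downtime 4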

(qemu) info migrate
capabilities: xbzrle: off auto-converge: off  <----
Migration status: active
total time: 1487503 milliseconds
expected downtime: 519 milliseconds
transferred ram: 383749347 kbytes
remaining ram: 2753372 kbytes
total ram: 268444224 kbytes
duplicate: 65461532 pages
skipped: 64901568 pages
normal: 95750218 pages
normal bytes: 383000872 kbytes
dirty pages rate: 67551 pages

---

(qemu) info migrate
capabilities: xbzrle: off auto-converge: on  <----
Migration status: completed
total time: 241161 milliseconds
downtime: 6373 milliseconds
transferred ram: 28235307 kbytes
remaining ram: 0 kbytes
total ram: 268444224 kbytes
duplicate: 64946416 pages
skipped: 64903523 pages
normal: 7044971 pages
normal bytes: 28179884 kbytes

Signed-off-by: Chegu Vinod <chegu_vinod@hp.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>

diff --git a/arch_init.c b/arch_init.c
index 522caeb..d7a5d7c 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -104,6 +104,9 @@ int graphic_depth = 15;
 #endif
 
 const uint32_t arch_type = QEMU_ARCH;
+static bool mig_throttle_on;
+static int dirty_rate_high_cnt;
+static void check_guest_throttling(void);
 
 /***********************************************************/
 /* ram save/restore */
@@ -378,8 +381,14 @@ static void migration_bitmap_sync(void)
     uint64_t num_dirty_pages_init = migration_dirty_pages;
     MigrationState *s = migrate_get_current();
     static int64_t start_time;
+    static int64_t bytes_xfer_prev;
     static int64_t num_dirty_pages_period;
     int64_t end_time;
+    int64_t bytes_xfer_now;
+
+    if (!bytes_xfer_prev) {
+        bytes_xfer_prev = ram_bytes_transferred();
+    }
 
     if (!start_time) {
         start_time = qemu_get_clock_ms(rt_clock);
@@ -404,6 +413,25 @@ static void migration_bitmap_sync(void)
 
     /* more than 1 second = 1000 millisecons */
     if (end_time > start_time + 1000) {
+        if (migrate_auto_converge()) {
+            /* The following detection logic can be refined later. For now:
+               Check to see if the dirtied bytes is 50% more than the approx.
+               amount of bytes that just got transferred since the last time we
+               were in this routine. If that happens >N times (for now N==4)
+               we turn on the throttle down logic */
+            bytes_xfer_now = ram_bytes_transferred();
+            if (s->dirty_pages_rate &&
+                (num_dirty_pages_period * TARGET_PAGE_SIZE >
+                 (bytes_xfer_now - bytes_xfer_prev)/2) &&
+                (dirty_rate_high_cnt++ > 4)) {
+                trace_migration_throttle();
+                mig_throttle_on = true;
+                dirty_rate_high_cnt = 0;
+            }
+            bytes_xfer_prev = bytes_xfer_now;
+        } else {
+            mig_throttle_on = false;
+        }
         s->dirty_pages_rate = num_dirty_pages_period * 1000
             / (end_time - start_time);
         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
@@ -566,6 +594,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     migration_bitmap = bitmap_new(ram_pages);
     bitmap_set(migration_bitmap, 0, ram_pages);
     migration_dirty_pages = ram_pages;
+    mig_throttle_on = false;
+    dirty_rate_high_cnt = 0;
 
     if (migrate_use_xbzrle()) {
         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
@@ -628,6 +658,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         }
         total_sent += bytes_sent;
         acct_info.iterations++;
+        check_guest_throttling();
         /* we want to check in the 1st loop, just in case it was the 1st time
            and we had to sync the dirty bitmap.
            qemu_get_clock_ns() is a bit expensive, so we only check each some
@@ -1097,3 +1128,53 @@ TargetInfo *qmp_query_target(Error **errp)
 
     return info;
 }
+
+/* Stub function that's gets run on the vcpu when its brought out of the
+   VM to run inside qemu via async_run_on_cpu()*/
+static void mig_sleep_cpu(void *opq)
+{
+    qemu_mutex_unlock_iothread();
+    g_usleep(30*1000);
+    qemu_mutex_lock_iothread();
+}
+
+/* To reduce the dirty rate explicitly disallow the VCPUs from spending
+   much time in the VM. The migration thread will try to catchup.
+   Workload will experience a performance drop.
+*/
+static void mig_throttle_cpu_down(CPUState *cpu, void *data)
+{
+    async_run_on_cpu(cpu, mig_sleep_cpu, NULL);
+}
+
+static void mig_throttle_guest_down(void)
+{
+    qemu_mutex_lock_iothread();
+    qemu_for_each_cpu(mig_throttle_cpu_down, NULL);
+    qemu_mutex_unlock_iothread();
+}
+
+static void check_guest_throttling(void)
+{
+    static int64_t t0;
+    int64_t t1;
+
+    if (!mig_throttle_on) {
+        return;
+    }
+
+    if (!t0) {
+        t0 = qemu_get_clock_ns(rt_clock);
+        return;
+    }
+
+    t1 = qemu_get_clock_ns(rt_clock);
+
+    /* If it has been more than 40 ms since the last time the guest
+     * was throttled then do it again.
+     */
+    if (40 < (t1-t0)/1000000) {
+        mig_throttle_guest_down();
+        t0 = t1;
+    }
+}
diff --git a/trace-events b/trace-events
index 9c73931..7cd335d 100644
--- a/trace-events
+++ b/trace-events
@@ -1031,6 +1031,7 @@ savevm_section_end(unsigned int section_id) "section_id %u"
 # arch_init.c
 migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
+migration_throttle(void) ""
 
 # hw/qxl.c
 disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"