|
|
ed5979 |
From 29eee1fbb84c0e2f0ece9e6d996afa7238ed2912 Mon Sep 17 00:00:00 2001
|
|
|
ed5979 |
From: "manish.mishra" <manish.mishra@nutanix.com>
|
|
|
ed5979 |
Date: Tue, 20 Dec 2022 18:44:18 +0000
|
|
|
ed5979 |
Subject: [PATCH 7/8] migration: check magic value for deciding the mapping of
|
|
|
ed5979 |
channels
|
|
|
ed5979 |
MIME-Version: 1.0
|
|
|
ed5979 |
Content-Type: text/plain; charset=UTF-8
|
|
|
ed5979 |
Content-Transfer-Encoding: 8bit
|
|
|
ed5979 |
|
|
|
ed5979 |
RH-Author: Peter Xu <peterx@redhat.com>
|
|
|
ed5979 |
RH-MergeRequest: 150: migration: Fix multifd crash on channel disorders
|
|
|
ed5979 |
RH-Bugzilla: 2169732
|
|
|
ed5979 |
RH-Acked-by: quintela1 <quintela@redhat.com>
|
|
|
ed5979 |
RH-Acked-by: Leonardo BrĂ¡s <leobras@redhat.com>
|
|
|
ed5979 |
RH-Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
|
|
|
ed5979 |
RH-Commit: [2/2] 4fb9408478923415a91fe0527bf4b1a0f022f329 (peterx/qemu-kvm)
|
|
|
ed5979 |
|
|
|
ed5979 |
Current logic assumes that channel connections on the destination side are
|
|
|
ed5979 |
always established in the same order as the source and the first one will
|
|
|
ed5979 |
always be the main channel followed by the multifid or post-copy
|
|
|
ed5979 |
preemption channel. This may not be always true, as even if a channel has a
|
|
|
ed5979 |
connection established on the source side it can be in the pending state on
|
|
|
ed5979 |
the destination side and a newer connection can be established first.
|
|
|
ed5979 |
Basically causing out of order mapping of channels on the destination side.
|
|
|
ed5979 |
Currently, all channels except post-copy preempt send a magic number, this
|
|
|
ed5979 |
patch uses that magic number to decide the type of channel. This logic is
|
|
|
ed5979 |
applicable only for precopy(multifd) live migration, as mentioned, the
|
|
|
ed5979 |
post-copy preempt channel does not send any magic number. Also, tls live
|
|
|
ed5979 |
migrations already does tls handshake before creating other channels, so
|
|
|
ed5979 |
this issue is not possible with tls, hence this logic is avoided for tls
|
|
|
ed5979 |
live migrations. This patch uses read peek to check the magic number of
|
|
|
ed5979 |
channels so that current data/control stream management remains
|
|
|
ed5979 |
un-effected.
|
|
|
ed5979 |
|
|
|
ed5979 |
Reviewed-by: Peter Xu <peterx@redhat.com>
|
|
|
ed5979 |
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
|
|
|
ed5979 |
Reviewed-by: Juan Quintela <quintela@redhat.com>
|
|
|
ed5979 |
Suggested-by: Daniel P. Berrange <berrange@redhat.com>
|
|
|
ed5979 |
Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
|
|
|
ed5979 |
Signed-off-by: Juan Quintela <quintela@redhat.com>
|
|
|
ed5979 |
(cherry picked from commit 6720c2b32725e6ac404f22851a0ecd0a71d0cbe2)
|
|
|
ed5979 |
Signed-off-by: Peter Xu <peterx@redhat.com>
|
|
|
ed5979 |
---
|
|
|
ed5979 |
migration/channel.c | 45 +++++++++++++++++++++++++++++++++
|
|
|
ed5979 |
migration/channel.h | 5 ++++
|
|
|
ed5979 |
migration/migration.c | 54 ++++++++++++++++++++++++++++------------
|
|
|
ed5979 |
migration/multifd.c | 19 +++++++-------
|
|
|
ed5979 |
migration/multifd.h | 2 +-
|
|
|
ed5979 |
migration/postcopy-ram.c | 5 +---
|
|
|
ed5979 |
migration/postcopy-ram.h | 2 +-
|
|
|
ed5979 |
7 files changed, 101 insertions(+), 31 deletions(-)
|
|
|
ed5979 |
|
|
|
ed5979 |
diff --git a/migration/channel.c b/migration/channel.c
|
|
|
ed5979 |
index 1b0815039f..ca3319a309 100644
|
|
|
ed5979 |
--- a/migration/channel.c
|
|
|
ed5979 |
+++ b/migration/channel.c
|
|
|
ed5979 |
@@ -92,3 +92,48 @@ void migration_channel_connect(MigrationState *s,
|
|
|
ed5979 |
migrate_fd_connect(s, error);
|
|
|
ed5979 |
error_free(error);
|
|
|
ed5979 |
}
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+/**
|
|
|
ed5979 |
+ * @migration_channel_read_peek - Peek at migration channel, without
|
|
|
ed5979 |
+ * actually removing it from channel buffer.
|
|
|
ed5979 |
+ *
|
|
|
ed5979 |
+ * @ioc: the channel object
|
|
|
ed5979 |
+ * @buf: the memory region to read data into
|
|
|
ed5979 |
+ * @buflen: the number of bytes to read in @buf
|
|
|
ed5979 |
+ * @errp: pointer to a NULL-initialized error object
|
|
|
ed5979 |
+ *
|
|
|
ed5979 |
+ * Returns 0 if successful, returns -1 and sets @errp if fails.
|
|
|
ed5979 |
+ */
|
|
|
ed5979 |
+int migration_channel_read_peek(QIOChannel *ioc,
|
|
|
ed5979 |
+ const char *buf,
|
|
|
ed5979 |
+ const size_t buflen,
|
|
|
ed5979 |
+ Error **errp)
|
|
|
ed5979 |
+{
|
|
|
ed5979 |
+ ssize_t len = 0;
|
|
|
ed5979 |
+ struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+ while (true) {
|
|
|
ed5979 |
+ len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL,
|
|
|
ed5979 |
+ QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+ if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) {
|
|
|
ed5979 |
+ error_setg(errp,
|
|
|
ed5979 |
+ "Failed to peek at channel");
|
|
|
ed5979 |
+ return -1;
|
|
|
ed5979 |
+ }
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+ if (len == buflen) {
|
|
|
ed5979 |
+ break;
|
|
|
ed5979 |
+ }
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+ /* 1ms sleep. */
|
|
|
ed5979 |
+ if (qemu_in_coroutine()) {
|
|
|
ed5979 |
+ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
|
|
|
ed5979 |
+ } else {
|
|
|
ed5979 |
+ g_usleep(1000);
|
|
|
ed5979 |
+ }
|
|
|
ed5979 |
+ }
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+ return 0;
|
|
|
ed5979 |
+}
|
|
|
ed5979 |
diff --git a/migration/channel.h b/migration/channel.h
|
|
|
ed5979 |
index 67a461c28a..5bdb8208a7 100644
|
|
|
ed5979 |
--- a/migration/channel.h
|
|
|
ed5979 |
+++ b/migration/channel.h
|
|
|
ed5979 |
@@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s,
|
|
|
ed5979 |
QIOChannel *ioc,
|
|
|
ed5979 |
const char *hostname,
|
|
|
ed5979 |
Error *error_in);
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+int migration_channel_read_peek(QIOChannel *ioc,
|
|
|
ed5979 |
+ const char *buf,
|
|
|
ed5979 |
+ const size_t buflen,
|
|
|
ed5979 |
+ Error **errp);
|
|
|
ed5979 |
#endif
|
|
|
ed5979 |
diff --git a/migration/migration.c b/migration/migration.c
|
|
|
ed5979 |
index f485eea5fb..593dbd25de 100644
|
|
|
ed5979 |
--- a/migration/migration.c
|
|
|
ed5979 |
+++ b/migration/migration.c
|
|
|
ed5979 |
@@ -31,6 +31,7 @@
|
|
|
ed5979 |
#include "migration.h"
|
|
|
ed5979 |
#include "savevm.h"
|
|
|
ed5979 |
#include "qemu-file.h"
|
|
|
ed5979 |
+#include "channel.h"
|
|
|
ed5979 |
#include "migration/vmstate.h"
|
|
|
ed5979 |
#include "block/block.h"
|
|
|
ed5979 |
#include "qapi/error.h"
|
|
|
ed5979 |
@@ -663,10 +664,6 @@ static bool migration_incoming_setup(QEMUFile *f, Error **errp)
|
|
|
ed5979 |
{
|
|
|
ed5979 |
MigrationIncomingState *mis = migration_incoming_get_current();
|
|
|
ed5979 |
|
|
|
ed5979 |
- if (multifd_load_setup(errp) != 0) {
|
|
|
ed5979 |
- return false;
|
|
|
ed5979 |
- }
|
|
|
ed5979 |
-
|
|
|
ed5979 |
if (!mis->from_src_file) {
|
|
|
ed5979 |
mis->from_src_file = f;
|
|
|
ed5979 |
}
|
|
|
ed5979 |
@@ -733,31 +730,56 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
|
|
|
ed5979 |
{
|
|
|
ed5979 |
MigrationIncomingState *mis = migration_incoming_get_current();
|
|
|
ed5979 |
Error *local_err = NULL;
|
|
|
ed5979 |
- bool start_migration;
|
|
|
ed5979 |
QEMUFile *f;
|
|
|
ed5979 |
+ bool default_channel = true;
|
|
|
ed5979 |
+ uint32_t channel_magic = 0;
|
|
|
ed5979 |
+ int ret = 0;
|
|
|
ed5979 |
|
|
|
ed5979 |
- if (!mis->from_src_file) {
|
|
|
ed5979 |
- /* The first connection (multifd may have multiple) */
|
|
|
ed5979 |
+ if (migrate_use_multifd() && !migrate_postcopy_ram() &&
|
|
|
ed5979 |
+ qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
|
|
|
ed5979 |
+ /*
|
|
|
ed5979 |
+ * With multiple channels, it is possible that we receive channels
|
|
|
ed5979 |
+ * out of order on destination side, causing incorrect mapping of
|
|
|
ed5979 |
+ * source channels on destination side. Check channel MAGIC to
|
|
|
ed5979 |
+ * decide type of channel. Please note this is best effort, postcopy
|
|
|
ed5979 |
+ * preempt channel does not send any magic number so avoid it for
|
|
|
ed5979 |
+ * postcopy live migration. Also tls live migration already does
|
|
|
ed5979 |
+ * tls handshake while initializing main channel so with tls this
|
|
|
ed5979 |
+ * issue is not possible.
|
|
|
ed5979 |
+ */
|
|
|
ed5979 |
+ ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
|
|
|
ed5979 |
+ sizeof(channel_magic), &local_err);
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+ if (ret != 0) {
|
|
|
ed5979 |
+ error_propagate(errp, local_err);
|
|
|
ed5979 |
+ return;
|
|
|
ed5979 |
+ }
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+ default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
|
|
|
ed5979 |
+ } else {
|
|
|
ed5979 |
+ default_channel = !mis->from_src_file;
|
|
|
ed5979 |
+ }
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+ if (multifd_load_setup(errp) != 0) {
|
|
|
ed5979 |
+ error_setg(errp, "Failed to setup multifd channels");
|
|
|
ed5979 |
+ return;
|
|
|
ed5979 |
+ }
|
|
|
ed5979 |
+
|
|
|
ed5979 |
+ if (default_channel) {
|
|
|
ed5979 |
f = qemu_file_new_input(ioc);
|
|
|
ed5979 |
|
|
|
ed5979 |
if (!migration_incoming_setup(f, errp)) {
|
|
|
ed5979 |
return;
|
|
|
ed5979 |
}
|
|
|
ed5979 |
-
|
|
|
ed5979 |
- /*
|
|
|
ed5979 |
- * Common migration only needs one channel, so we can start
|
|
|
ed5979 |
- * right now. Some features need more than one channel, we wait.
|
|
|
ed5979 |
- */
|
|
|
ed5979 |
- start_migration = !migration_needs_multiple_sockets();
|
|
|
ed5979 |
} else {
|
|
|
ed5979 |
/* Multiple connections */
|
|
|
ed5979 |
assert(migration_needs_multiple_sockets());
|
|
|
ed5979 |
if (migrate_use_multifd()) {
|
|
|
ed5979 |
- start_migration = multifd_recv_new_channel(ioc, &local_err);
|
|
|
ed5979 |
+ multifd_recv_new_channel(ioc, &local_err);
|
|
|
ed5979 |
} else {
|
|
|
ed5979 |
assert(migrate_postcopy_preempt());
|
|
|
ed5979 |
f = qemu_file_new_input(ioc);
|
|
|
ed5979 |
- start_migration = postcopy_preempt_new_channel(mis, f);
|
|
|
ed5979 |
+ postcopy_preempt_new_channel(mis, f);
|
|
|
ed5979 |
}
|
|
|
ed5979 |
if (local_err) {
|
|
|
ed5979 |
error_propagate(errp, local_err);
|
|
|
ed5979 |
@@ -765,7 +787,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
|
|
|
ed5979 |
}
|
|
|
ed5979 |
}
|
|
|
ed5979 |
|
|
|
ed5979 |
- if (start_migration) {
|
|
|
ed5979 |
+ if (migration_has_all_channels()) {
|
|
|
ed5979 |
/* If it's a recovery, we're done */
|
|
|
ed5979 |
if (postcopy_try_recover()) {
|
|
|
ed5979 |
return;
|
|
|
ed5979 |
diff --git a/migration/multifd.c b/migration/multifd.c
|
|
|
ed5979 |
index 509bbbe3bf..c3385529cf 100644
|
|
|
ed5979 |
--- a/migration/multifd.c
|
|
|
ed5979 |
+++ b/migration/multifd.c
|
|
|
ed5979 |
@@ -1167,9 +1167,14 @@ int multifd_load_setup(Error **errp)
|
|
|
ed5979 |
uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
|
|
|
ed5979 |
uint8_t i;
|
|
|
ed5979 |
|
|
|
ed5979 |
- if (!migrate_use_multifd()) {
|
|
|
ed5979 |
+ /*
|
|
|
ed5979 |
+ * Return successfully if multiFD recv state is already initialised
|
|
|
ed5979 |
+ * or multiFD is not enabled.
|
|
|
ed5979 |
+ */
|
|
|
ed5979 |
+ if (multifd_recv_state || !migrate_use_multifd()) {
|
|
|
ed5979 |
return 0;
|
|
|
ed5979 |
}
|
|
|
ed5979 |
+
|
|
|
ed5979 |
if (!migrate_multi_channels_is_allowed()) {
|
|
|
ed5979 |
error_setg(errp, "multifd is not supported by current protocol");
|
|
|
ed5979 |
return -1;
|
|
|
ed5979 |
@@ -1228,11 +1233,9 @@ bool multifd_recv_all_channels_created(void)
|
|
|
ed5979 |
|
|
|
ed5979 |
/*
|
|
|
ed5979 |
* Try to receive all multifd channels to get ready for the migration.
|
|
|
ed5979 |
- * - Return true and do not set @errp when correctly receiving all channels;
|
|
|
ed5979 |
- * - Return false and do not set @errp when correctly receiving the current one;
|
|
|
ed5979 |
- * - Return false and set @errp when failing to receive the current channel.
|
|
|
ed5979 |
+ * Sets @errp when failing to receive the current channel.
|
|
|
ed5979 |
*/
|
|
|
ed5979 |
-bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
|
|
|
ed5979 |
+void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
|
|
|
ed5979 |
{
|
|
|
ed5979 |
MultiFDRecvParams *p;
|
|
|
ed5979 |
Error *local_err = NULL;
|
|
|
ed5979 |
@@ -1245,7 +1248,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
|
|
|
ed5979 |
"failed to receive packet"
|
|
|
ed5979 |
" via multifd channel %d: ",
|
|
|
ed5979 |
qatomic_read(&multifd_recv_state->count));
|
|
|
ed5979 |
- return false;
|
|
|
ed5979 |
+ return;
|
|
|
ed5979 |
}
|
|
|
ed5979 |
trace_multifd_recv_new_channel(id);
|
|
|
ed5979 |
|
|
|
ed5979 |
@@ -1255,7 +1258,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
|
|
|
ed5979 |
id);
|
|
|
ed5979 |
multifd_recv_terminate_threads(local_err);
|
|
|
ed5979 |
error_propagate(errp, local_err);
|
|
|
ed5979 |
- return false;
|
|
|
ed5979 |
+ return;
|
|
|
ed5979 |
}
|
|
|
ed5979 |
p->c = ioc;
|
|
|
ed5979 |
object_ref(OBJECT(ioc));
|
|
|
ed5979 |
@@ -1266,6 +1269,4 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
|
|
|
ed5979 |
qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
|
|
|
ed5979 |
QEMU_THREAD_JOINABLE);
|
|
|
ed5979 |
qatomic_inc(&multifd_recv_state->count);
|
|
|
ed5979 |
- return qatomic_read(&multifd_recv_state->count) ==
|
|
|
ed5979 |
- migrate_multifd_channels();
|
|
|
ed5979 |
}
|
|
|
ed5979 |
diff --git a/migration/multifd.h b/migration/multifd.h
|
|
|
ed5979 |
index 519f498643..913e4ba274 100644
|
|
|
ed5979 |
--- a/migration/multifd.h
|
|
|
ed5979 |
+++ b/migration/multifd.h
|
|
|
ed5979 |
@@ -18,7 +18,7 @@ void multifd_save_cleanup(void);
|
|
|
ed5979 |
int multifd_load_setup(Error **errp);
|
|
|
ed5979 |
int multifd_load_cleanup(Error **errp);
|
|
|
ed5979 |
bool multifd_recv_all_channels_created(void);
|
|
|
ed5979 |
-bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
|
|
|
ed5979 |
+void multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
|
|
|
ed5979 |
void multifd_recv_sync_main(void);
|
|
|
ed5979 |
int multifd_send_sync_main(QEMUFile *f);
|
|
|
ed5979 |
int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset);
|
|
|
ed5979 |
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
|
|
|
ed5979 |
index 0c55df0e52..b98e95dab0 100644
|
|
|
ed5979 |
--- a/migration/postcopy-ram.c
|
|
|
ed5979 |
+++ b/migration/postcopy-ram.c
|
|
|
ed5979 |
@@ -1538,7 +1538,7 @@ void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
|
|
|
ed5979 |
}
|
|
|
ed5979 |
}
|
|
|
ed5979 |
|
|
|
ed5979 |
-bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
|
|
|
ed5979 |
+void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
|
|
|
ed5979 |
{
|
|
|
ed5979 |
/*
|
|
|
ed5979 |
* The new loading channel has its own threads, so it needs to be
|
|
|
ed5979 |
@@ -1547,9 +1547,6 @@ bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
|
|
|
ed5979 |
qemu_file_set_blocking(file, true);
|
|
|
ed5979 |
mis->postcopy_qemufile_dst = file;
|
|
|
ed5979 |
trace_postcopy_preempt_new_channel();
|
|
|
ed5979 |
-
|
|
|
ed5979 |
- /* Start the migration immediately */
|
|
|
ed5979 |
- return true;
|
|
|
ed5979 |
}
|
|
|
ed5979 |
|
|
|
ed5979 |
/*
|
|
|
ed5979 |
diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h
|
|
|
ed5979 |
index 6147bf7d1d..25881c4127 100644
|
|
|
ed5979 |
--- a/migration/postcopy-ram.h
|
|
|
ed5979 |
+++ b/migration/postcopy-ram.h
|
|
|
ed5979 |
@@ -190,7 +190,7 @@ enum PostcopyChannels {
|
|
|
ed5979 |
RAM_CHANNEL_MAX,
|
|
|
ed5979 |
};
|
|
|
ed5979 |
|
|
|
ed5979 |
-bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file);
|
|
|
ed5979 |
+void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file);
|
|
|
ed5979 |
int postcopy_preempt_setup(MigrationState *s, Error **errp);
|
|
|
ed5979 |
int postcopy_preempt_wait_channel(MigrationState *s);
|
|
|
ed5979 |
|
|
|
ed5979 |
--
|
|
|
ed5979 |
2.31.1
|
|
|
ed5979 |
|