Blob Blame History Raw
From 876b9284b61269d977d0b6b8585ba29758957622 Mon Sep 17 00:00:00 2001
Message-Id: <876b9284b61269d977d0b6b8585ba29758957622.1387382496.git.minovotn@redhat.com>
In-Reply-To: <c5386144fbf09f628148101bc674e2421cdd16e3.1387382496.git.minovotn@redhat.com>
References: <c5386144fbf09f628148101bc674e2421cdd16e3.1387382496.git.minovotn@redhat.com>
From: Nigel Croxon <ncroxon@redhat.com>
Date: Thu, 14 Nov 2013 22:53:07 +0100
Subject: [PATCH 31/46] rdma: IPv6 over Ethernet (RoCE) is broken in linux -
 workaround

RH-Author: Nigel Croxon <ncroxon@redhat.com>
Message-id: <1384469598-13137-32-git-send-email-ncroxon@redhat.com>
Patchwork-id: 55713
O-Subject: [RHEL7.0 PATCH 31/42] rdma: IPv6 over Ethernet (RoCE) is broken in linux - workaround
Bugzilla: 1011720
RH-Acked-by: Orit Wasserman <owasserm@redhat.com>
RH-Acked-by: Amit Shah <amit.shah@redhat.com>
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>

Bugzilla: 1011720
https://bugzilla.redhat.com/show_bug.cgi?id=1011720

>From commit ID:
commit 7fc5b13fd7b05babc7bcad9dcb8281ae202a9494
Author: Michael R. Hines <mrhines@us.ibm.com>
Date:   Fri Aug 9 16:05:44 2013 -0400

    rdma: IPv6 over Ethernet (RoCE) is broken in linux - workaround

    We've gotten reports from multiple testers (including Frank Yangjie
    and myself) that RDMA IPv6 support over RocE (Ethernet) is broken
    in linux.

    A patch to Linux is still in review:

    http://comments.gmane.org/gmane.linux.drivers.rdma/16448

    If the user is listening on '[::]', then we will not have a opened a device
    yet and have no way of verifying if the device is RoCE or not.

    In this case, the source VM will throw an error for ALL types of
    connections (both IPv4 and IPv6) if the destination machine does not have
    a regular infiniband network available for use.

    The only way to gaurantee that an error is thrown for broken kernels is
    for the management software to choose a *specific* interface at bind time
    and validate what time of hardware it is.

    Unfortunately, this puts the user in a fix:

     If the source VM connects with an IPv4 address without knowing that the
     destination has bound to '[::]' the migration will unconditionally fail
     unless the management software is not explicitly listening on the the IPv4
     address while using a RoCE-based device.

     If the source VM connects with an IPv6 address, then we're OK because we can
     throw an error on the source (and similarly on the destination).

     But in mixed environments, this will be broken for a while until it is fixed
     inside linux.

    We do provide a *tiny* bit of help in mixed environments, though in this patch:

    We can list all of the devices in the system and check to see if all the
    devices are RoCE or Infiniband.

    If we detect that we have a *pure* RoCE environment, then we can safely
    thrown an error even if the management sofware has specified '[::]' as the
    bind address.

    However, if there is are multiple hetergeneous devices, then we cannot make
    this assumption and the user just has to be sure they know what they are doing.

    Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
    Message-id: 1376078746-24948-6-git-send-email-mrhines@linux.vnet.ibm.com
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 migration-rdma.c |  189 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 169 insertions(+), 20 deletions(-)

Signed-off-by: Michal Novotny <minovotn@redhat.com>
---
 migration-rdma.c | 189 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 169 insertions(+), 20 deletions(-)

diff --git a/migration-rdma.c b/migration-rdma.c
index e6fd77a..ada488e 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -707,15 +707,27 @@ static int __qemu_rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
  */
 static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
 {
+    struct ibv_port_attr port;
+
+    if (ibv_query_port(verbs, 1, &port)) {
+        fprintf(stderr, "FAILED TO QUERY PORT INFORMATION!\n");
+        return;
+    }
+
     printf("%s RDMA Device opened: kernel name %s "
            "uverbs device name %s, "
-           "infiniband_verbs class device path %s,"
-           " infiniband class device path %s\n",
+           "infiniband_verbs class device path %s, "
+           "infiniband class device path %s, "
+           "transport: (%d) %s\n",
                 who,
                 verbs->device->name,
                 verbs->device->dev_name,
                 verbs->device->dev_path,
-                verbs->device->ibdev_path);
+                verbs->device->ibdev_path,
+                port.link_layer,
+                (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
+                 ((port.link_layer == IBV_LINK_LAYER_ETHERNET) 
+                    ? "Ethernet" : "Unknown"));
 }
 
 /*
@@ -733,6 +745,132 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
 }
 
 /*
+ * As of now, IPv6 over RoCE / iWARP is not supported by linux.
+ * We will try the next addrinfo struct, and fail if there are
+ * no other valid addresses to bind against.
+ *
+ * If user is listening on '[::]', then we will not have a opened a device
+ * yet and have no way of verifying if the device is RoCE or not.
+ *
+ * In this case, the source VM will throw an error for ALL types of
+ * connections (both IPv4 and IPv6) if the destination machine does not have
+ * a regular infiniband network available for use.
+ *
+ * The only way to gaurantee that an error is thrown for broken kernels is
+ * for the management software to choose a *specific* interface at bind time
+ * and validate what time of hardware it is.
+ *
+ * Unfortunately, this puts the user in a fix:
+ * 
+ *  If the source VM connects with an IPv4 address without knowing that the
+ *  destination has bound to '[::]' the migration will unconditionally fail
+ *  unless the management software is explicitly listening on the the IPv4
+ *  address while using a RoCE-based device.
+ *
+ *  If the source VM connects with an IPv6 address, then we're OK because we can
+ *  throw an error on the source (and similarly on the destination).
+ * 
+ *  But in mixed environments, this will be broken for a while until it is fixed
+ *  inside linux.
+ *
+ * We do provide a *tiny* bit of help in this function: We can list all of the
+ * devices in the system and check to see if all the devices are RoCE or
+ * Infiniband. 
+ *
+ * If we detect that we have a *pure* RoCE environment, then we can safely
+ * thrown an error even if the management sofware has specified '[::]' as the
+ * bind address.
+ *
+ * However, if there is are multiple hetergeneous devices, then we cannot make
+ * this assumption and the user just has to be sure they know what they are
+ * doing.
+ *
+ * Patches are being reviewed on linux-rdma.
+ */
+static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
+{
+    struct ibv_port_attr port_attr;
+
+    /* This bug only exists in linux, to our knowledge. */
+#ifdef CONFIG_LINUX
+
+    /* 
+     * Verbs are only NULL if management has bound to '[::]'.
+     * 
+     * Let's iterate through all the devices and see if there any pure IB
+     * devices (non-ethernet).
+     * 
+     * If not, then we can safely proceed with the migration.
+     * Otherwise, there are no gaurantees until the bug is fixed in linux.
+     */
+    if (!verbs) {
+	    int num_devices, x;
+        struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
+        bool roce_found = false;
+        bool ib_found = false;
+
+        for (x = 0; x < num_devices; x++) {
+            verbs = ibv_open_device(dev_list[x]);
+
+            if (ibv_query_port(verbs, 1, &port_attr)) {
+                ibv_close_device(verbs);
+                ERROR(errp, "Could not query initial IB port");
+                return -EINVAL;
+            }
+
+            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
+                ib_found = true;
+            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+                roce_found = true;
+            }
+
+            ibv_close_device(verbs);
+
+        }
+
+        if (roce_found) {
+            if (ib_found) {
+                fprintf(stderr, "WARN: migrations may fail:"
+                                " IPv6 over RoCE / iWARP in linux"
+                                " is broken. But since you appear to have a"
+                                " mixed RoCE / IB environment, be sure to only"
+                                " migrate over the IB fabric until the kernel "
+                                " fixes the bug.\n");
+            } else {
+                ERROR(errp, "You only have RoCE / iWARP devices in your systems"
+                            " and your management software has specified '[::]'"
+                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
+                return -ENONET;
+            }
+        }
+
+        return 0;
+    }
+
+    /*
+     * If we have a verbs context, that means that some other than '[::]' was
+     * used by the management software for binding. In which case we can actually 
+     * warn the user about a potential broken kernel;
+     */
+
+    /* IB ports start with 1, not 0 */
+    if (ibv_query_port(verbs, 1, &port_attr)) {
+        ERROR(errp, "Could not query initial IB port");
+        return -EINVAL;
+    }
+
+    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
+                    "(but patches on linux-rdma in progress)");
+        return -ENONET;
+    }
+
+#endif
+
+    return 0;
+}
+
+/*
  * Figure out which RDMA device corresponds to the requested IP hostname
  * Also create the initial connection manager identifiers for opening
  * the connection.
@@ -740,22 +878,22 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
 static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
 {
     int ret;
-    struct addrinfo *res;
+    struct rdma_addrinfo *res;
     char port_str[16];
     struct rdma_cm_event *cm_event;
     char ip[40] = "unknown";
-    struct addrinfo *e;
+    struct rdma_addrinfo *e;
 
     if (rdma->host == NULL || !strcmp(rdma->host, "")) {
         ERROR(errp, "RDMA hostname has not been set");
-        return -1;
+        return -EINVAL;
     }
 
     /* create CM channel */
     rdma->channel = rdma_create_event_channel();
     if (!rdma->channel) {
         ERROR(errp, "could not create CM channel");
-        return -1;
+        return -EINVAL;
     }
 
     /* create CM id */
@@ -768,21 +906,24 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
     snprintf(port_str, 16, "%d", rdma->port);
     port_str[15] = '\0';
 
-    ret = getaddrinfo(rdma->host, port_str, NULL, &res);
+    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
     if (ret < 0) {
-        ERROR(errp, "could not getaddrinfo address %s", rdma->host);
+        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
         goto err_resolve_get_addr;
     }
 
     for (e = res; e != NULL; e = e->ai_next) {
         inet_ntop(e->ai_family,
-            &((struct sockaddr_in *) e->ai_addr)->sin_addr, ip, sizeof ip);
+            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
         DPRINTF("Trying %s => %s\n", rdma->host, ip);
 
-        /* resolve the first address */
-        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_addr,
+        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                 RDMA_RESOLVE_TIMEOUT_MS);
         if (!ret) {
+            ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs);
+            if (ret) {
+                continue;
+            }
             goto route;
         }
     }
@@ -803,6 +944,7 @@ route:
         ERROR(errp, "result not equal to event_addr_resolved %s",
                 rdma_event_str(cm_event->event));
         perror("rdma_resolve_addr");
+        ret = -EINVAL;
         goto err_resolve_get_addr;
     }
     rdma_ack_cm_event(cm_event);
@@ -823,6 +965,7 @@ route:
         ERROR(errp, "result not equal to event_route_resolved: %s",
                         rdma_event_str(cm_event->event));
         rdma_ack_cm_event(cm_event);
+        ret = -EINVAL;
         goto err_resolve_get_addr;
     }
     rdma_ack_cm_event(cm_event);
@@ -837,8 +980,7 @@ err_resolve_get_addr:
 err_resolve_create_id:
     rdma_destroy_event_channel(rdma->channel);
     rdma->channel = NULL;
-
-    return -1;
+    return ret;
 }
 
 /*
@@ -2266,7 +2408,7 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
     int ret = -EINVAL, idx;
     struct rdma_cm_id *listen_id;
     char ip[40] = "unknown";
-    struct addrinfo *res;
+    struct rdma_addrinfo *res;
     char port_str[16];
 
     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
@@ -2298,20 +2440,27 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
     port_str[15] = '\0';
 
     if (rdma->host && strcmp("", rdma->host)) {
-        struct addrinfo *e;
+        struct rdma_addrinfo *e;
 
-        ret = getaddrinfo(rdma->host, port_str, NULL, &res);
+        ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
         if (ret < 0) {
-            ERROR(errp, "could not getaddrinfo address %s", rdma->host);
+            ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
             goto err_dest_init_bind_addr;
         }
 
         for (e = res; e != NULL; e = e->ai_next) {
             inet_ntop(e->ai_family,
-                &((struct sockaddr_in *) e->ai_addr)->sin_addr, ip, sizeof ip);
+                &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
             DPRINTF("Trying %s => %s\n", rdma->host, ip);
-            ret = rdma_bind_addr(listen_id, e->ai_addr);
+            ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
             if (!ret) {
+                if (e->ai_family == AF_INET6) {
+                    ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs);
+                    if (ret) {
+                        continue;
+                    }
+                }
+                    
                 goto listen;
             }
         }
-- 
1.7.11.7