Blame SOURCES/kvm-rdma-IPv6-over-Ethernet-RoCE-is-broken-in-linux-work.patch

9ae3a8
From 876b9284b61269d977d0b6b8585ba29758957622 Mon Sep 17 00:00:00 2001
9ae3a8
Message-Id: <876b9284b61269d977d0b6b8585ba29758957622.1387382496.git.minovotn@redhat.com>
9ae3a8
In-Reply-To: <c5386144fbf09f628148101bc674e2421cdd16e3.1387382496.git.minovotn@redhat.com>
9ae3a8
References: <c5386144fbf09f628148101bc674e2421cdd16e3.1387382496.git.minovotn@redhat.com>
9ae3a8
From: Nigel Croxon <ncroxon@redhat.com>
9ae3a8
Date: Thu, 14 Nov 2013 22:53:07 +0100
9ae3a8
Subject: [PATCH 31/46] rdma: IPv6 over Ethernet (RoCE) is broken in linux -
9ae3a8
 workaround
9ae3a8
9ae3a8
RH-Author: Nigel Croxon <ncroxon@redhat.com>
9ae3a8
Message-id: <1384469598-13137-32-git-send-email-ncroxon@redhat.com>
9ae3a8
Patchwork-id: 55713
9ae3a8
O-Subject: [RHEL7.0 PATCH 31/42] rdma: IPv6 over Ethernet (RoCE) is broken in linux - workaround
9ae3a8
Bugzilla: 1011720
9ae3a8
RH-Acked-by: Orit Wasserman <owasserm@redhat.com>
9ae3a8
RH-Acked-by: Amit Shah <amit.shah@redhat.com>
9ae3a8
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
9ae3a8
9ae3a8
Bugzilla: 1011720
9ae3a8
https://bugzilla.redhat.com/show_bug.cgi?id=1011720
9ae3a8
9ae3a8
>From commit ID:
9ae3a8
commit 7fc5b13fd7b05babc7bcad9dcb8281ae202a9494
9ae3a8
Author: Michael R. Hines <mrhines@us.ibm.com>
9ae3a8
Date:   Fri Aug 9 16:05:44 2013 -0400
9ae3a8
9ae3a8
    rdma: IPv6 over Ethernet (RoCE) is broken in linux - workaround
9ae3a8
9ae3a8
    We've gotten reports from multiple testers (including Frank Yangjie
9ae3a8
    and myself) that RDMA IPv6 support over RoCE (Ethernet) is broken
9ae3a8
    in linux.
9ae3a8
9ae3a8
    A patch to Linux is still in review:
9ae3a8
9ae3a8
    http://comments.gmane.org/gmane.linux.drivers.rdma/16448
9ae3a8
9ae3a8
    If the user is listening on '[::]', then we will not have opened a device
9ae3a8
    yet and have no way of verifying if the device is RoCE or not.
9ae3a8
9ae3a8
    In this case, the source VM will throw an error for ALL types of
9ae3a8
    connections (both IPv4 and IPv6) if the destination machine does not have
9ae3a8
    a regular infiniband network available for use.
9ae3a8
9ae3a8
    The only way to guarantee that an error is thrown for broken kernels is
9ae3a8
    for the management software to choose a *specific* interface at bind time
9ae3a8
    and validate what type of hardware it is.
9ae3a8
9ae3a8
    Unfortunately, this puts the user in a fix:
9ae3a8
9ae3a8
     If the source VM connects with an IPv4 address without knowing that the
9ae3a8
     destination has bound to '[::]' the migration will unconditionally fail
9ae3a8
     unless the management software is explicitly listening on the IPv4
9ae3a8
     address while using a RoCE-based device.
9ae3a8
9ae3a8
     If the source VM connects with an IPv6 address, then we're OK because we can
9ae3a8
     throw an error on the source (and similarly on the destination).
9ae3a8
9ae3a8
     But in mixed environments, this will be broken for a while until it is fixed
9ae3a8
     inside linux.
9ae3a8
9ae3a8
    We do provide a *tiny* bit of help in mixed environments, though in this patch:
9ae3a8
9ae3a8
    We can list all of the devices in the system and check to see if all the
9ae3a8
    devices are RoCE or Infiniband.
9ae3a8
9ae3a8
    If we detect that we have a *pure* RoCE environment, then we can safely
9ae3a8
    throw an error even if the management software has specified '[::]' as the
9ae3a8
    bind address.
9ae3a8
9ae3a8
    However, if there are multiple heterogeneous devices, then we cannot make
9ae3a8
    this assumption and the user just has to be sure they know what they are doing.
9ae3a8
9ae3a8
    Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
9ae3a8
    Message-id: 1376078746-24948-6-git-send-email-mrhines@linux.vnet.ibm.com
9ae3a8
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
9ae3a8
---
9ae3a8
 migration-rdma.c |  189 ++++++++++++++++++++++++++++++++++++++++++++++++------
9ae3a8
 1 files changed, 169 insertions(+), 20 deletions(-)
9ae3a8
9ae3a8
Signed-off-by: Michal Novotny <minovotn@redhat.com>
9ae3a8
---
9ae3a8
 migration-rdma.c | 189 +++++++++++++++++++++++++++++++++++++++++++++++++------
9ae3a8
 1 file changed, 169 insertions(+), 20 deletions(-)
9ae3a8
9ae3a8
diff --git a/migration-rdma.c b/migration-rdma.c
9ae3a8
index e6fd77a..ada488e 100644
9ae3a8
--- a/migration-rdma.c
9ae3a8
+++ b/migration-rdma.c
9ae3a8
@@ -707,15 +707,27 @@ static int __qemu_rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
9ae3a8
  */
9ae3a8
 static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
9ae3a8
 {
9ae3a8
+    struct ibv_port_attr port;
9ae3a8
+
9ae3a8
+    if (ibv_query_port(verbs, 1, &port)) {
9ae3a8
+        fprintf(stderr, "FAILED TO QUERY PORT INFORMATION!\n");
9ae3a8
+        return;
9ae3a8
+    }
9ae3a8
+
9ae3a8
     printf("%s RDMA Device opened: kernel name %s "
9ae3a8
            "uverbs device name %s, "
9ae3a8
-           "infiniband_verbs class device path %s,"
9ae3a8
-           " infiniband class device path %s\n",
9ae3a8
+           "infiniband_verbs class device path %s, "
9ae3a8
+           "infiniband class device path %s, "
9ae3a8
+           "transport: (%d) %s\n",
9ae3a8
                 who,
9ae3a8
                 verbs->device->name,
9ae3a8
                 verbs->device->dev_name,
9ae3a8
                 verbs->device->dev_path,
9ae3a8
-                verbs->device->ibdev_path);
9ae3a8
+                verbs->device->ibdev_path,
9ae3a8
+                port.link_layer,
9ae3a8
+                (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
9ae3a8
+                 ((port.link_layer == IBV_LINK_LAYER_ETHERNET) 
9ae3a8
+                    ? "Ethernet" : "Unknown"));
9ae3a8
 }
9ae3a8
 
9ae3a8
 /*
9ae3a8
@@ -733,6 +745,132 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
9ae3a8
 }
9ae3a8
 
9ae3a8
 /*
9ae3a8
+ * As of now, IPv6 over RoCE / iWARP is not supported by linux.
9ae3a8
+ * We will try the next addrinfo struct, and fail if there are
9ae3a8
+ * no other valid addresses to bind against.
9ae3a8
+ *
9ae3a8
+ * If user is listening on '[::]', then we will not have opened a device
9ae3a8
+ * yet and have no way of verifying if the device is RoCE or not.
9ae3a8
+ *
9ae3a8
+ * In this case, the source VM will throw an error for ALL types of
9ae3a8
+ * connections (both IPv4 and IPv6) if the destination machine does not have
9ae3a8
+ * a regular infiniband network available for use.
9ae3a8
+ *
9ae3a8
+ * The only way to guarantee that an error is thrown for broken kernels is
9ae3a8
+ * for the management software to choose a *specific* interface at bind time
9ae3a8
+ * and validate what type of hardware it is.
9ae3a8
+ *
9ae3a8
+ * Unfortunately, this puts the user in a fix:
9ae3a8
+ * 
9ae3a8
+ *  If the source VM connects with an IPv4 address without knowing that the
9ae3a8
+ *  destination has bound to '[::]' the migration will unconditionally fail
9ae3a8
+ *  unless the management software is explicitly listening on the IPv4
9ae3a8
+ *  address while using a RoCE-based device.
9ae3a8
+ *
9ae3a8
+ *  If the source VM connects with an IPv6 address, then we're OK because we can
9ae3a8
+ *  throw an error on the source (and similarly on the destination).
9ae3a8
+ * 
9ae3a8
+ *  But in mixed environments, this will be broken for a while until it is fixed
9ae3a8
+ *  inside linux.
9ae3a8
+ *
9ae3a8
+ * We do provide a *tiny* bit of help in this function: We can list all of the
9ae3a8
+ * devices in the system and check to see if all the devices are RoCE or
9ae3a8
+ * Infiniband. 
9ae3a8
+ *
9ae3a8
+ * If we detect that we have a *pure* RoCE environment, then we can safely
9ae3a8
+ * throw an error even if the management software has specified '[::]' as the
9ae3a8
+ * bind address.
9ae3a8
+ *
9ae3a8
+ * However, if there are multiple heterogeneous devices, then we cannot make
9ae3a8
+ * this assumption and the user just has to be sure they know what they are
9ae3a8
+ * doing.
9ae3a8
+ *
9ae3a8
+ * Patches are being reviewed on linux-rdma.
9ae3a8
+ */
9ae3a8
+static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
9ae3a8
+{
9ae3a8
+    struct ibv_port_attr port_attr;
9ae3a8
+
9ae3a8
+    /* This bug only exists in linux, to our knowledge. */
9ae3a8
+#ifdef CONFIG_LINUX
9ae3a8
+
9ae3a8
+    /* 
9ae3a8
+     * Verbs are only NULL if management has bound to '[::]'.
9ae3a8
+     * 
9ae3a8
+     * Let's iterate through all the devices and see if there are any pure IB
9ae3a8
+     * devices (non-ethernet).
9ae3a8
+     * 
9ae3a8
+     * If not, then we can safely proceed with the migration.
9ae3a8
+     * Otherwise, there are no guarantees until the bug is fixed in linux.
9ae3a8
+     */
9ae3a8
+    if (!verbs) {
9ae3a8
+	    int num_devices, x;
9ae3a8
+        struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
9ae3a8
+        bool roce_found = false;
9ae3a8
+        bool ib_found = false;
9ae3a8
+
9ae3a8
+        for (x = 0; x < num_devices; x++) {
9ae3a8
+            verbs = ibv_open_device(dev_list[x]);
9ae3a8
+
9ae3a8
+            if (ibv_query_port(verbs, 1, &port_attr)) {
9ae3a8
+                ibv_close_device(verbs);
9ae3a8
+                ERROR(errp, "Could not query initial IB port");
9ae3a8
+                return -EINVAL;
9ae3a8
+            }
9ae3a8
+
9ae3a8
+            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
9ae3a8
+                ib_found = true;
9ae3a8
+            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
9ae3a8
+                roce_found = true;
9ae3a8
+            }
9ae3a8
+
9ae3a8
+            ibv_close_device(verbs);
9ae3a8
+
9ae3a8
+        }
9ae3a8
+
9ae3a8
+        if (roce_found) {
9ae3a8
+            if (ib_found) {
9ae3a8
+                fprintf(stderr, "WARN: migrations may fail:"
9ae3a8
+                                " IPv6 over RoCE / iWARP in linux"
9ae3a8
+                                " is broken. But since you appear to have a"
9ae3a8
+                                " mixed RoCE / IB environment, be sure to only"
9ae3a8
+                                " migrate over the IB fabric until the kernel "
9ae3a8
+                                " fixes the bug.\n");
9ae3a8
+            } else {
9ae3a8
+                ERROR(errp, "You only have RoCE / iWARP devices in your systems"
9ae3a8
+                            " and your management software has specified '[::]'"
9ae3a8
+                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
9ae3a8
+                return -ENONET;
9ae3a8
+            }
9ae3a8
+        }
9ae3a8
+
9ae3a8
+        return 0;
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    /*
9ae3a8
+     * If we have a verbs context, that means that something other than '[::]' was
9ae3a8
+     * used by the management software for binding. In which case we can actually 
9ae3a8
+     * warn the user about a potential broken kernel;
9ae3a8
+     */
9ae3a8
+
9ae3a8
+    /* IB ports start with 1, not 0 */
9ae3a8
+    if (ibv_query_port(verbs, 1, &port_attr)) {
9ae3a8
+        ERROR(errp, "Could not query initial IB port");
9ae3a8
+        return -EINVAL;
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
9ae3a8
+        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
9ae3a8
+                    "(but patches on linux-rdma in progress)");
9ae3a8
+        return -ENONET;
9ae3a8
+    }
9ae3a8
+
9ae3a8
+#endif
9ae3a8
+
9ae3a8
+    return 0;
9ae3a8
+}
9ae3a8
+
9ae3a8
+/*
9ae3a8
  * Figure out which RDMA device corresponds to the requested IP hostname
9ae3a8
  * Also create the initial connection manager identifiers for opening
9ae3a8
  * the connection.
9ae3a8
@@ -740,22 +878,22 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
9ae3a8
 static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
9ae3a8
 {
9ae3a8
     int ret;
9ae3a8
-    struct addrinfo *res;
9ae3a8
+    struct rdma_addrinfo *res;
9ae3a8
     char port_str[16];
9ae3a8
     struct rdma_cm_event *cm_event;
9ae3a8
     char ip[40] = "unknown";
9ae3a8
-    struct addrinfo *e;
9ae3a8
+    struct rdma_addrinfo *e;
9ae3a8
 
9ae3a8
     if (rdma->host == NULL || !strcmp(rdma->host, "")) {
9ae3a8
         ERROR(errp, "RDMA hostname has not been set");
9ae3a8
-        return -1;
9ae3a8
+        return -EINVAL;
9ae3a8
     }
9ae3a8
 
9ae3a8
     /* create CM channel */
9ae3a8
     rdma->channel = rdma_create_event_channel();
9ae3a8
     if (!rdma->channel) {
9ae3a8
         ERROR(errp, "could not create CM channel");
9ae3a8
-        return -1;
9ae3a8
+        return -EINVAL;
9ae3a8
     }
9ae3a8
 
9ae3a8
     /* create CM id */
9ae3a8
@@ -768,21 +906,24 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
9ae3a8
     snprintf(port_str, 16, "%d", rdma->port);
9ae3a8
     port_str[15] = '\0';
9ae3a8
 
9ae3a8
-    ret = getaddrinfo(rdma->host, port_str, NULL, &res;;
9ae3a8
+    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res;;
9ae3a8
     if (ret < 0) {
9ae3a8
-        ERROR(errp, "could not getaddrinfo address %s", rdma->host);
9ae3a8
+        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
9ae3a8
         goto err_resolve_get_addr;
9ae3a8
     }
9ae3a8
 
9ae3a8
     for (e = res; e != NULL; e = e->ai_next) {
9ae3a8
         inet_ntop(e->ai_family,
9ae3a8
-            &((struct sockaddr_in *) e->ai_addr)->sin_addr, ip, sizeof ip);
9ae3a8
+            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
9ae3a8
         DPRINTF("Trying %s => %s\n", rdma->host, ip);
9ae3a8
 
9ae3a8
-        /* resolve the first address */
9ae3a8
-        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_addr,
9ae3a8
+        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
9ae3a8
                 RDMA_RESOLVE_TIMEOUT_MS);
9ae3a8
         if (!ret) {
9ae3a8
+            ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs);
9ae3a8
+            if (ret) {
9ae3a8
+                continue;
9ae3a8
+            }
9ae3a8
             goto route;
9ae3a8
         }
9ae3a8
     }
9ae3a8
@@ -803,6 +944,7 @@ route:
9ae3a8
         ERROR(errp, "result not equal to event_addr_resolved %s",
9ae3a8
                 rdma_event_str(cm_event->event));
9ae3a8
         perror("rdma_resolve_addr");
9ae3a8
+        ret = -EINVAL;
9ae3a8
         goto err_resolve_get_addr;
9ae3a8
     }
9ae3a8
     rdma_ack_cm_event(cm_event);
9ae3a8
@@ -823,6 +965,7 @@ route:
9ae3a8
         ERROR(errp, "result not equal to event_route_resolved: %s",
9ae3a8
                         rdma_event_str(cm_event->event));
9ae3a8
         rdma_ack_cm_event(cm_event);
9ae3a8
+        ret = -EINVAL;
9ae3a8
         goto err_resolve_get_addr;
9ae3a8
     }
9ae3a8
     rdma_ack_cm_event(cm_event);
9ae3a8
@@ -837,8 +980,7 @@ err_resolve_get_addr:
9ae3a8
 err_resolve_create_id:
9ae3a8
     rdma_destroy_event_channel(rdma->channel);
9ae3a8
     rdma->channel = NULL;
9ae3a8
-
9ae3a8
-    return -1;
9ae3a8
+    return ret;
9ae3a8
 }
9ae3a8
 
9ae3a8
 /*
9ae3a8
@@ -2266,7 +2408,7 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
9ae3a8
     int ret = -EINVAL, idx;
9ae3a8
     struct rdma_cm_id *listen_id;
9ae3a8
     char ip[40] = "unknown";
9ae3a8
-    struct addrinfo *res;
9ae3a8
+    struct rdma_addrinfo *res;
9ae3a8
     char port_str[16];
9ae3a8
 
9ae3a8
     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
9ae3a8
@@ -2298,20 +2440,27 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
9ae3a8
     port_str[15] = '\0';
9ae3a8
 
9ae3a8
     if (rdma->host && strcmp("", rdma->host)) {
9ae3a8
-        struct addrinfo *e;
9ae3a8
+        struct rdma_addrinfo *e;
9ae3a8
 
9ae3a8
-        ret = getaddrinfo(rdma->host, port_str, NULL, &res;;
9ae3a8
+        ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res;;
9ae3a8
         if (ret < 0) {
9ae3a8
-            ERROR(errp, "could not getaddrinfo address %s", rdma->host);
9ae3a8
+            ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
9ae3a8
             goto err_dest_init_bind_addr;
9ae3a8
         }
9ae3a8
 
9ae3a8
         for (e = res; e != NULL; e = e->ai_next) {
9ae3a8
             inet_ntop(e->ai_family,
9ae3a8
-                &((struct sockaddr_in *) e->ai_addr)->sin_addr, ip, sizeof ip);
9ae3a8
+                &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
9ae3a8
             DPRINTF("Trying %s => %s\n", rdma->host, ip);
9ae3a8
-            ret = rdma_bind_addr(listen_id, e->ai_addr);
9ae3a8
+            ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
9ae3a8
             if (!ret) {
9ae3a8
+                if (e->ai_family == AF_INET6) {
9ae3a8
+                    ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs);
9ae3a8
+                    if (ret) {
9ae3a8
+                        continue;
9ae3a8
+                    }
9ae3a8
+                }
9ae3a8
+                    
9ae3a8
                 goto listen;
9ae3a8
             }
9ae3a8
         }
9ae3a8
-- 
9ae3a8
1.7.11.7
9ae3a8