0a122b
From 876b9284b61269d977d0b6b8585ba29758957622 Mon Sep 17 00:00:00 2001
0a122b
Message-Id: <876b9284b61269d977d0b6b8585ba29758957622.1387382496.git.minovotn@redhat.com>
0a122b
In-Reply-To: <c5386144fbf09f628148101bc674e2421cdd16e3.1387382496.git.minovotn@redhat.com>
0a122b
References: <c5386144fbf09f628148101bc674e2421cdd16e3.1387382496.git.minovotn@redhat.com>
0a122b
From: Nigel Croxon <ncroxon@redhat.com>
0a122b
Date: Thu, 14 Nov 2013 22:53:07 +0100
0a122b
Subject: [PATCH 31/46] rdma: IPv6 over Ethernet (RoCE) is broken in linux -
0a122b
 workaround
0a122b
0a122b
RH-Author: Nigel Croxon <ncroxon@redhat.com>
0a122b
Message-id: <1384469598-13137-32-git-send-email-ncroxon@redhat.com>
0a122b
Patchwork-id: 55713
0a122b
O-Subject: [RHEL7.0 PATCH 31/42] rdma: IPv6 over Ethernet (RoCE) is broken in linux - workaround
0a122b
Bugzilla: 1011720
0a122b
RH-Acked-by: Orit Wasserman <owasserm@redhat.com>
0a122b
RH-Acked-by: Amit Shah <amit.shah@redhat.com>
0a122b
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
0a122b
0a122b
Bugzilla: 1011720
0a122b
https://bugzilla.redhat.com/show_bug.cgi?id=1011720
0a122b
0a122b
>From commit ID:
0a122b
commit 7fc5b13fd7b05babc7bcad9dcb8281ae202a9494
0a122b
Author: Michael R. Hines <mrhines@us.ibm.com>
0a122b
Date:   Fri Aug 9 16:05:44 2013 -0400
0a122b
0a122b
    rdma: IPv6 over Ethernet (RoCE) is broken in linux - workaround
0a122b
0a122b
    We've gotten reports from multiple testers (including Frank Yangjie
0a122b
    and myself) that RDMA IPv6 support over RoCE (Ethernet) is broken
0a122b
    in linux.
0a122b
0a122b
    A patch to Linux is still in review:
0a122b
0a122b
    http://comments.gmane.org/gmane.linux.drivers.rdma/16448
0a122b
0a122b
    If the user is listening on '[::]', then we will not have opened a device
0a122b
    yet and have no way of verifying if the device is RoCE or not.
0a122b
0a122b
    In this case, the source VM will throw an error for ALL types of
0a122b
    connections (both IPv4 and IPv6) if the destination machine does not have
0a122b
    a regular infiniband network available for use.
0a122b
0a122b
    The only way to guarantee that an error is thrown for broken kernels is
0a122b
    for the management software to choose a *specific* interface at bind time
0a122b
    and validate what type of hardware it is.
0a122b
0a122b
    Unfortunately, this puts the user in a fix:
0a122b
0a122b
     If the source VM connects with an IPv4 address without knowing that the
0a122b
     destination has bound to '[::]' the migration will unconditionally fail
0a122b
     unless the management software is explicitly listening on the IPv4
0a122b
     address while using a RoCE-based device.
0a122b
0a122b
     If the source VM connects with an IPv6 address, then we're OK because we can
0a122b
     throw an error on the source (and similarly on the destination).
0a122b
0a122b
     But in mixed environments, this will be broken for a while until it is fixed
0a122b
     inside linux.
0a122b
0a122b
    We do provide a *tiny* bit of help in mixed environments, though in this patch:
0a122b
0a122b
    We can list all of the devices in the system and check to see if all the
0a122b
    devices are RoCE or Infiniband.
0a122b
0a122b
    If we detect that we have a *pure* RoCE environment, then we can safely
0a122b
    throw an error even if the management software has specified '[::]' as the
0a122b
    bind address.
0a122b
0a122b
    However, if there are multiple heterogeneous devices, then we cannot make
0a122b
    this assumption and the user just has to be sure they know what they are doing.
0a122b
0a122b
    Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
0a122b
    Message-id: 1376078746-24948-6-git-send-email-mrhines@linux.vnet.ibm.com
0a122b
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
0a122b
---
0a122b
 migration-rdma.c |  189 ++++++++++++++++++++++++++++++++++++++++++++++++------
0a122b
 1 files changed, 169 insertions(+), 20 deletions(-)
0a122b
0a122b
Signed-off-by: Michal Novotny <minovotn@redhat.com>
0a122b
---
0a122b
 migration-rdma.c | 189 +++++++++++++++++++++++++++++++++++++++++++++++++------
0a122b
 1 file changed, 169 insertions(+), 20 deletions(-)
0a122b
0a122b
diff --git a/migration-rdma.c b/migration-rdma.c
0a122b
index e6fd77a..ada488e 100644
0a122b
--- a/migration-rdma.c
0a122b
+++ b/migration-rdma.c
0a122b
@@ -707,15 +707,27 @@ static int __qemu_rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
0a122b
  */
0a122b
 static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
0a122b
 {
0a122b
+    struct ibv_port_attr port;
0a122b
+
0a122b
+    if (ibv_query_port(verbs, 1, &port)) {
0a122b
+        fprintf(stderr, "FAILED TO QUERY PORT INFORMATION!\n");
0a122b
+        return;
0a122b
+    }
0a122b
+
0a122b
     printf("%s RDMA Device opened: kernel name %s "
0a122b
            "uverbs device name %s, "
0a122b
-           "infiniband_verbs class device path %s,"
0a122b
-           " infiniband class device path %s\n",
0a122b
+           "infiniband_verbs class device path %s, "
0a122b
+           "infiniband class device path %s, "
0a122b
+           "transport: (%d) %s\n",
0a122b
                 who,
0a122b
                 verbs->device->name,
0a122b
                 verbs->device->dev_name,
0a122b
                 verbs->device->dev_path,
0a122b
-                verbs->device->ibdev_path);
0a122b
+                verbs->device->ibdev_path,
0a122b
+                port.link_layer,
0a122b
+                (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
0a122b
+                 ((port.link_layer == IBV_LINK_LAYER_ETHERNET) 
0a122b
+                    ? "Ethernet" : "Unknown"));
0a122b
 }
0a122b
 
0a122b
 /*
0a122b
@@ -733,6 +745,132 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
0a122b
 }
0a122b
 
0a122b
 /*
0a122b
+ * As of now, IPv6 over RoCE / iWARP is not supported by linux.
0a122b
+ * We will try the next addrinfo struct, and fail if there are
0a122b
+ * no other valid addresses to bind against.
0a122b
+ *
0a122b
+ * If user is listening on '[::]', then we will not have opened a device
0a122b
+ * yet and have no way of verifying if the device is RoCE or not.
0a122b
+ *
0a122b
+ * In this case, the source VM will throw an error for ALL types of
0a122b
+ * connections (both IPv4 and IPv6) if the destination machine does not have
0a122b
+ * a regular infiniband network available for use.
0a122b
+ *
0a122b
+ * The only way to guarantee that an error is thrown for broken kernels is
0a122b
+ * for the management software to choose a *specific* interface at bind time
0a122b
+ * and validate what type of hardware it is.
0a122b
+ *
0a122b
+ * Unfortunately, this puts the user in a fix:
0a122b
+ * 
0a122b
+ *  If the source VM connects with an IPv4 address without knowing that the
0a122b
+ *  destination has bound to '[::]' the migration will unconditionally fail
0a122b
+ *  unless the management software is explicitly listening on the IPv4
0a122b
+ *  address while using a RoCE-based device.
0a122b
+ *
0a122b
+ *  If the source VM connects with an IPv6 address, then we're OK because we can
0a122b
+ *  throw an error on the source (and similarly on the destination).
0a122b
+ * 
0a122b
+ *  But in mixed environments, this will be broken for a while until it is fixed
0a122b
+ *  inside linux.
0a122b
+ *
0a122b
+ * We do provide a *tiny* bit of help in this function: We can list all of the
0a122b
+ * devices in the system and check to see if all the devices are RoCE or
0a122b
+ * Infiniband. 
0a122b
+ *
0a122b
+ * If we detect that we have a *pure* RoCE environment, then we can safely
0a122b
+ * throw an error even if the management software has specified '[::]' as the
0a122b
+ * bind address.
0a122b
+ *
0a122b
+ * However, if there are multiple heterogeneous devices, then we cannot make
0a122b
+ * this assumption and the user just has to be sure they know what they are
0a122b
+ * doing.
0a122b
+ *
0a122b
+ * Patches are being reviewed on linux-rdma.
0a122b
+ */
0a122b
+static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
0a122b
+{
0a122b
+    struct ibv_port_attr port_attr;
0a122b
+
0a122b
+    /* This bug only exists in linux, to our knowledge. */
0a122b
+#ifdef CONFIG_LINUX
0a122b
+
0a122b
+    /* 
0a122b
+     * Verbs are only NULL if management has bound to '[::]'.
0a122b
+     * 
0a122b
+     * Let's iterate through all the devices and see if there are any pure IB
0a122b
+     * devices (non-ethernet).
0a122b
+     * 
0a122b
+     * If not, then we can safely proceed with the migration.
0a122b
+     * Otherwise, there are no guarantees until the bug is fixed in linux.
0a122b
+     */
0a122b
+    if (!verbs) {
0a122b
+	    int num_devices, x;
0a122b
+        struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
0a122b
+        bool roce_found = false;
0a122b
+        bool ib_found = false;
0a122b
+
0a122b
+        for (x = 0; x < num_devices; x++) {
0a122b
+            verbs = ibv_open_device(dev_list[x]);
0a122b
+
0a122b
+            if (ibv_query_port(verbs, 1, &port_attr)) {
0a122b
+                ibv_close_device(verbs);
0a122b
+                ERROR(errp, "Could not query initial IB port");
0a122b
+                return -EINVAL;
0a122b
+            }
0a122b
+
0a122b
+            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
0a122b
+                ib_found = true;
0a122b
+            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
0a122b
+                roce_found = true;
0a122b
+            }
0a122b
+
0a122b
+            ibv_close_device(verbs);
0a122b
+
0a122b
+        }
0a122b
+
0a122b
+        if (roce_found) {
0a122b
+            if (ib_found) {
0a122b
+                fprintf(stderr, "WARN: migrations may fail:"
0a122b
+                                " IPv6 over RoCE / iWARP in linux"
0a122b
+                                " is broken. But since you appear to have a"
0a122b
+                                " mixed RoCE / IB environment, be sure to only"
0a122b
+                                " migrate over the IB fabric until the kernel "
0a122b
+                                " fixes the bug.\n");
0a122b
+            } else {
0a122b
+                ERROR(errp, "You only have RoCE / iWARP devices in your systems"
0a122b
+                            " and your management software has specified '[::]'"
0a122b
+                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
0a122b
+                return -ENONET;
0a122b
+            }
0a122b
+        }
0a122b
+
0a122b
+        return 0;
0a122b
+    }
0a122b
+
0a122b
+    /*
0a122b
+     * If we have a verbs context, that means that something other than '[::]' was
0a122b
+     * used by the management software for binding. In which case we can actually 
0a122b
+     * warn the user about a potential broken kernel;
0a122b
+     */
0a122b
+
0a122b
+    /* IB ports start with 1, not 0 */
0a122b
+    if (ibv_query_port(verbs, 1, &port_attr)) {
0a122b
+        ERROR(errp, "Could not query initial IB port");
0a122b
+        return -EINVAL;
0a122b
+    }
0a122b
+
0a122b
+    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
0a122b
+        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
0a122b
+                    "(but patches on linux-rdma in progress)");
0a122b
+        return -ENONET;
0a122b
+    }
0a122b
+
0a122b
+#endif
0a122b
+
0a122b
+    return 0;
0a122b
+}
0a122b
+
0a122b
+/*
0a122b
  * Figure out which RDMA device corresponds to the requested IP hostname
0a122b
  * Also create the initial connection manager identifiers for opening
0a122b
  * the connection.
0a122b
@@ -740,22 +878,22 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
0a122b
 static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
0a122b
 {
0a122b
     int ret;
0a122b
-    struct addrinfo *res;
0a122b
+    struct rdma_addrinfo *res;
0a122b
     char port_str[16];
0a122b
     struct rdma_cm_event *cm_event;
0a122b
     char ip[40] = "unknown";
0a122b
-    struct addrinfo *e;
0a122b
+    struct rdma_addrinfo *e;
0a122b
 
0a122b
     if (rdma->host == NULL || !strcmp(rdma->host, "")) {
0a122b
         ERROR(errp, "RDMA hostname has not been set");
0a122b
-        return -1;
0a122b
+        return -EINVAL;
0a122b
     }
0a122b
 
0a122b
     /* create CM channel */
0a122b
     rdma->channel = rdma_create_event_channel();
0a122b
     if (!rdma->channel) {
0a122b
         ERROR(errp, "could not create CM channel");
0a122b
-        return -1;
0a122b
+        return -EINVAL;
0a122b
     }
0a122b
 
0a122b
     /* create CM id */
0a122b
@@ -768,21 +906,24 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
0a122b
     snprintf(port_str, 16, "%d", rdma->port);
0a122b
     port_str[15] = '\0';
0a122b
 
0a122b
-    ret = getaddrinfo(rdma->host, port_str, NULL, &res);
0a122b
+    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
0a122b
     if (ret < 0) {
0a122b
-        ERROR(errp, "could not getaddrinfo address %s", rdma->host);
0a122b
+        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
0a122b
         goto err_resolve_get_addr;
0a122b
     }
0a122b
 
0a122b
     for (e = res; e != NULL; e = e->ai_next) {
0a122b
         inet_ntop(e->ai_family,
0a122b
-            &((struct sockaddr_in *) e->ai_addr)->sin_addr, ip, sizeof ip);
0a122b
+            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
0a122b
         DPRINTF("Trying %s => %s\n", rdma->host, ip);
0a122b
 
0a122b
-        /* resolve the first address */
0a122b
-        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_addr,
0a122b
+        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
0a122b
                 RDMA_RESOLVE_TIMEOUT_MS);
0a122b
         if (!ret) {
0a122b
+            ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs);
0a122b
+            if (ret) {
0a122b
+                continue;
0a122b
+            }
0a122b
             goto route;
0a122b
         }
0a122b
     }
0a122b
@@ -803,6 +944,7 @@ route:
0a122b
         ERROR(errp, "result not equal to event_addr_resolved %s",
0a122b
                 rdma_event_str(cm_event->event));
0a122b
         perror("rdma_resolve_addr");
0a122b
+        ret = -EINVAL;
0a122b
         goto err_resolve_get_addr;
0a122b
     }
0a122b
     rdma_ack_cm_event(cm_event);
0a122b
@@ -823,6 +965,7 @@ route:
0a122b
         ERROR(errp, "result not equal to event_route_resolved: %s",
0a122b
                         rdma_event_str(cm_event->event));
0a122b
         rdma_ack_cm_event(cm_event);
0a122b
+        ret = -EINVAL;
0a122b
         goto err_resolve_get_addr;
0a122b
     }
0a122b
     rdma_ack_cm_event(cm_event);
0a122b
@@ -837,8 +980,7 @@ err_resolve_get_addr:
0a122b
 err_resolve_create_id:
0a122b
     rdma_destroy_event_channel(rdma->channel);
0a122b
     rdma->channel = NULL;
0a122b
-
0a122b
-    return -1;
0a122b
+    return ret;
0a122b
 }
0a122b
 
0a122b
 /*
0a122b
@@ -2266,7 +2408,7 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
0a122b
     int ret = -EINVAL, idx;
0a122b
     struct rdma_cm_id *listen_id;
0a122b
     char ip[40] = "unknown";
0a122b
-    struct addrinfo *res;
0a122b
+    struct rdma_addrinfo *res;
0a122b
     char port_str[16];
0a122b
 
0a122b
     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
0a122b
@@ -2298,20 +2440,27 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
0a122b
     port_str[15] = '\0';
0a122b
 
0a122b
     if (rdma->host && strcmp("", rdma->host)) {
0a122b
-        struct addrinfo *e;
0a122b
+        struct rdma_addrinfo *e;
0a122b
 
0a122b
-        ret = getaddrinfo(rdma->host, port_str, NULL, &res);
0a122b
+        ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
0a122b
         if (ret < 0) {
0a122b
-            ERROR(errp, "could not getaddrinfo address %s", rdma->host);
0a122b
+            ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
0a122b
             goto err_dest_init_bind_addr;
0a122b
         }
0a122b
 
0a122b
         for (e = res; e != NULL; e = e->ai_next) {
0a122b
             inet_ntop(e->ai_family,
0a122b
-                &((struct sockaddr_in *) e->ai_addr)->sin_addr, ip, sizeof ip);
0a122b
+                &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
0a122b
             DPRINTF("Trying %s => %s\n", rdma->host, ip);
0a122b
-            ret = rdma_bind_addr(listen_id, e->ai_addr);
0a122b
+            ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
0a122b
             if (!ret) {
0a122b
+                if (e->ai_family == AF_INET6) {
0a122b
+                    ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs);
0a122b
+                    if (ret) {
0a122b
+                        continue;
0a122b
+                    }
0a122b
+                }
0a122b
+                    
0a122b
                 goto listen;
0a122b
             }
0a122b
         }
0a122b
-- 
0a122b
1.7.11.7
0a122b