Blob Blame History Raw
From a5f055c8eb5c761473088866022167d4a9442673 Mon Sep 17 00:00:00 2001
Message-Id: <a5f055c8eb5c761473088866022167d4a9442673@dist-git>
From: Andrea Bolognani <abologna@redhat.com>
Date: Tue, 18 Jul 2017 12:10:07 +0200
Subject: [PATCH] qemu: Isolate hostdevs on pSeries guests

All the pieces are now in place, so we can finally start
using isolation groups to achieve our initial goal, which is
separating hostdevs from emulated PCI devices while keeping
hostdevs that belong to the same host IOMMU group together.

Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1280542

Signed-off-by: Andrea Bolognani <abologna@redhat.com>
Reviewed-by: Laine Stump <laine@laine.org>
(cherry picked from commit b84b6ab5027c386d19299771387b4c4cf5e844cd)

Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1280542

Signed-off-by: Andrea Bolognani <abologna@redhat.com>
Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
---
 src/qemu/qemu_domain_address.c                     | 241 +++++++++++++++++++++
 src/qemu/qemu_domain_address.h                     |   4 +
 src/qemu/qemu_hotplug.c                            |   7 +
 tests/qemumemlocktest.c                            |   2 +-
 .../qemuxml2argv-pseries-hostdevs-1.args           |   8 +-
 .../qemuxml2argv-pseries-hostdevs-2.args           |   3 +-
 .../qemuxml2argv-pseries-hostdevs-3.args           |   2 +-
 .../qemuxml2xmlout-pseries-hostdevs-1.xml          |  14 +-
 .../qemuxml2xmlout-pseries-hostdevs-2.xml          |   6 +-
 .../qemuxml2xmlout-pseries-hostdevs-3.xml          |   2 +-
 10 files changed, 278 insertions(+), 11 deletions(-)

diff --git a/src/qemu/qemu_domain_address.c b/src/qemu/qemu_domain_address.c
index 02e214b8dd..756cd97970 100644
--- a/src/qemu/qemu_domain_address.c
+++ b/src/qemu/qemu_domain_address.c
@@ -25,6 +25,7 @@
 
 #include "qemu_domain_address.h"
 #include "qemu_domain.h"
+#include "network/bridge_driver.h"
 #include "viralloc.h"
 #include "virerror.h"
 #include "virlog.h"
@@ -901,6 +902,243 @@ qemuDomainFillAllPCIConnectFlags(virDomainDefPtr def,
 
 
 /**
+ * qemuDomainFindUnusedIsolationGroupIter:
+ * @def: domain definition (unused)
+ * @dev: device definition (unused)
+ * @info: device information
+ * @opaque: pointer to the isolation group (unsigned int) being checked
+ *
+ * Used to implement qemuDomainFindUnusedIsolationGroup(). You probably
+ * don't want to call this directly.
+ *
+ * Return: 0 if the isolation group is not used by the device, <0 otherwise.
+ */
+static int
+qemuDomainFindUnusedIsolationGroupIter(virDomainDefPtr def ATTRIBUTE_UNUSED,
+                                       virDomainDeviceDefPtr dev ATTRIBUTE_UNUSED,
+                                       virDomainDeviceInfoPtr info,
+                                       void *opaque)
+{
+    unsigned int *isolationGroup = opaque;
+
+    if (info->isolationGroup == *isolationGroup)
+        return -1;
+
+    return 0;
+}
+
+
+/**
+ * qemuDomainFindUnusedIsolationGroup:
+ * @def: domain definition
+ *
+ * Find an isolation group that is not used by any device in @def yet.
+ *
+ * Normally, we'd look up the device's IOMMU group and base its isolation
+ * group on that; however, when a network interface uses a network backed
+ * by SR-IOV Virtual Functions, we can't know at PCI address assignment
+ * time which host device will be used so we can't look up its IOMMU group.
+ *
+ * We still want such a device to be isolated: this function can be used
+ * to obtain a synthetic isolation group usable for the purpose.
+ *
+ * Return: unused isolation group, or 0 if no unused group could be found
+ */
+static unsigned int
+qemuDomainFindUnusedIsolationGroup(virDomainDefPtr def)
+{
+    unsigned int isolationGroup = UINT_MAX;
+
+    /* We start from the highest possible isolation group and work our
+     * way backwards so that we're working in a completely different range
+     * from IOMMU groups, thus avoiding clashes. We're realistically going
+     * to call this function just a few times per guest anyway */
+    while (isolationGroup > 0 &&
+           virDomainDeviceInfoIterate(def,
+                                      qemuDomainFindUnusedIsolationGroupIter,
+                                      &isolationGroup) < 0) {
+        isolationGroup--;
+    }
+
+    return isolationGroup;
+}
+
+
+/**
+ * qemuDomainFillDeviceIsolationGroup:
+ * @def: domain definition
+ * @dev: device definition
+ *
+ * Fill isolation group information for a single device.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int
+qemuDomainFillDeviceIsolationGroup(virDomainDefPtr def,
+                                   virDomainDeviceDefPtr dev)
+{
+    int ret = -1;
+
+    /* Only host devices need their isolation group to be different from
+     * the default. Interfaces of type hostdev are just host devices in
+     * disguise, but we don't need to handle them separately because for
+     * each such interface a corresponding hostdev is also added to the
+     * guest configuration */
+    if (dev->type == VIR_DOMAIN_DEVICE_HOSTDEV) {
+        virDomainHostdevDefPtr hostdev = dev->data.hostdev;
+        virDomainDeviceInfoPtr info = hostdev->info;
+        virPCIDeviceAddressPtr hostAddr;
+        int tmp;
+
+        /* Only PCI host devices are subject to isolation */
+        if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+            hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) {
+            goto skip;
+        }
+
+        hostAddr = &hostdev->source.subsys.u.pci.addr;
+
+        /* If a non-default isolation group has already been assigned to the
+         * device, we can avoid looking up the information again */
+        if (info->isolationGroup > 0)
+            goto skip;
+
+        /* The isolation group depends on the IOMMU group assigned by the host */
+        tmp = virPCIDeviceAddressGetIOMMUGroupNum(hostAddr);
+
+        if (tmp < 0) {
+            VIR_WARN("Can't look up isolation group for host device "
+                     "%04x:%02x:%02x.%x",
+                     hostAddr->domain, hostAddr->bus,
+                     hostAddr->slot, hostAddr->function);
+            goto cleanup;
+        }
+
+        /* The isolation group for a host device is its IOMMU group,
+         * increased by one: this is because zero is a valid IOMMU group but
+         * that's also the default isolation group, which we want to save
+         * for emulated devices. Shifting isolation groups for host devices
+         * by one ensures there is no overlap */
+        info->isolationGroup = tmp + 1;
+
+        VIR_DEBUG("Isolation group for host device %04x:%02x:%02x.%x is %u",
+                  hostAddr->domain, hostAddr->bus,
+                  hostAddr->slot, hostAddr->function,
+                  info->isolationGroup);
+
+    } else if (dev->type == VIR_DOMAIN_DEVICE_NET) {
+        virDomainNetDefPtr iface = dev->data.net;
+        virDomainDeviceInfoPtr info = &iface->info;
+        unsigned int tmp;
+
+        /* Network interfaces can ultimately result in the guest being
+         * assigned a host device if the libvirt network they're connected
+         * to is of type hostdev. All other kinds of network interfaces don't
+         * require us to isolate the guest device, so we can skip them */
+        if (iface->type != VIR_DOMAIN_NET_TYPE_NETWORK ||
+            networkGetActualType(iface) != VIR_DOMAIN_NET_TYPE_HOSTDEV) {
+            goto skip;
+        }
+
+        /* If a non-default isolation group has already been assigned to the
+         * device, we can avoid looking up the information again */
+        if (info->isolationGroup > 0)
+            goto skip;
+
+        /* Obtain a synthetic isolation group for the device, since at this
+         * point in time we don't have access to the IOMMU group of the host
+         * device that will eventually be used by the guest */
+        tmp = qemuDomainFindUnusedIsolationGroup(def);
+
+        if (tmp == 0) {
+            VIR_WARN("Can't obtain usable isolation group for interface "
+                     "configured to use hostdev-backed network '%s'",
+                     iface->data.network.name);
+            goto cleanup;
+        }
+
+        info->isolationGroup = tmp;
+
+        VIR_DEBUG("Isolation group for interface configured to use "
+                  "hostdev-backed network '%s' is %u",
+                  iface->data.network.name, info->isolationGroup);
+    }
+
+ skip:
+    ret = 0;
+
+ cleanup:
+    return ret;
+}
+
+
+/**
+ * qemuDomainFillDeviceIsolationGroupIter:
+ * @def: domain definition
+ * @dev: device definition
+ * @info: device information (unused)
+ * @opaque: user data (unused)
+ *
+ * A version of qemuDomainFillDeviceIsolationGroup() to be used
+ * with virDomainDeviceInfoIterate().
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int
+qemuDomainFillDeviceIsolationGroupIter(virDomainDefPtr def,
+                                       virDomainDeviceDefPtr dev,
+                                       virDomainDeviceInfoPtr info ATTRIBUTE_UNUSED,
+                                       void *opaque ATTRIBUTE_UNUSED)
+{
+    return qemuDomainFillDeviceIsolationGroup(def, dev);
+}
+
+
+/**
+ * qemuDomainSetupIsolationGroups:
+ * @def: domain definition
+ *
+ * High-level function to set up isolation groups for all devices
+ * and controllers in @def. Isolation groups will only be set up if
+ * the guest architecture and machine type require it, so this
+ * function can and should be called unconditionally before attempting
+ * to assign any PCI address.
+ *
+ * Return: 0 on success or if nothing needed to be done, <0 on failure
+ */
+static int
+qemuDomainSetupIsolationGroups(virDomainDefPtr def)
+{
+    int idx;
+    int ret = -1;
+
+    /* Only pSeries guests care about isolation groups at the moment */
+    if (!qemuDomainIsPSeries(def))
+        return 0;
+
+    idx = virDomainControllerFind(def, VIR_DOMAIN_CONTROLLER_TYPE_PCI, 0);
+    if (idx < 0)
+        goto cleanup;
+
+    /* We want to prevent hostdevs from being plugged into the default PHB:
+     * we can make sure that doesn't happen by locking its isolation group */
+    def->controllers[idx]->info.isolationGroupLocked = true;
+
+    /* Fill in isolation groups for all other devices */
+    if (virDomainDeviceInfoIterate(def,
+                                   qemuDomainFillDeviceIsolationGroupIter,
+                                   NULL) < 0) {
+        goto cleanup;
+    }
+
+    ret = 0;
+
+ cleanup:
+    return ret;
+}
+
+
+/**
  * qemuDomainFillDevicePCIConnectFlags:
  *
  * @def: the entire DomainDef
@@ -2049,6 +2287,9 @@ qemuDomainAssignPCIAddresses(virDomainDefPtr def,
     if (qemuDomainFillAllPCIConnectFlags(def, qemuCaps, driver) < 0)
         goto cleanup;
 
+    if (qemuDomainSetupIsolationGroups(def) < 0)
+        goto cleanup;
+
     if (nbuses > 0) {
         /* 1st pass to figure out how many PCI bridges we need */
         if (!(addrs = qemuDomainPCIAddressSetCreate(def, nbuses, true)))
diff --git a/src/qemu/qemu_domain_address.h b/src/qemu/qemu_domain_address.h
index 067f4e7997..b5644fa9c2 100644
--- a/src/qemu/qemu_domain_address.h
+++ b/src/qemu/qemu_domain_address.h
@@ -44,6 +44,10 @@ int qemuDomainEnsurePCIAddress(virDomainObjPtr obj,
                                virQEMUDriverPtr driver)
     ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3);
 
+int qemuDomainFillDeviceIsolationGroup(virDomainDefPtr def,
+                                       virDomainDeviceDefPtr dev)
+    ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2);
+
 void qemuDomainReleaseDeviceAddress(virDomainObjPtr vm,
                                     virDomainDeviceInfoPtr info,
                                     const char *devstr);
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index 476e2b81a3..34f1a646e9 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -1468,6 +1468,13 @@ qemuDomainAttachHostPCIDevice(virQEMUDriverPtr driver,
 
     if (qemuAssignDeviceHostdevAlias(vm->def, &info->alias, -1) < 0)
         goto error;
+
+    if (qemuDomainIsPSeries(vm->def)) {
+        /* Isolation groups are only relevant for pSeries guests */
+        if (qemuDomainFillDeviceIsolationGroup(vm->def, &dev) < 0)
+            goto error;
+    }
+
     if (qemuDomainEnsurePCIAddress(vm, &dev, driver) < 0)
         goto error;
     releaseaddr = true;
diff --git a/tests/qemumemlocktest.c b/tests/qemumemlocktest.c
index ea25cd9a66..42561ac19e 100644
--- a/tests/qemumemlocktest.c
+++ b/tests/qemumemlocktest.c
@@ -131,7 +131,7 @@ mymain(void)
 
     DO_TEST("pseries-hardlimit", 2147483648);
     DO_TEST("pseries-locked", VIR_DOMAIN_MEMORY_PARAM_UNLIMITED);
-    DO_TEST("pseries-hostdev", 2168455168);
+    DO_TEST("pseries-hostdev", 4320133120);
 
     DO_TEST("pseries-hardlimit+locked", 2147483648);
     DO_TEST("pseries-hardlimit+hostdev", 2147483648);
diff --git a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args
index 051ffdeb3e..8a4a4c5a63 100644
--- a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args
+++ b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-1.args
@@ -18,6 +18,8 @@ QEMU_AUDIO_DRV=none \
 server,nowait \
 -mon chardev=charmonitor,id=monitor,mode=readline \
 -boot c \
--device vfio-pci,host=0005:90:01.0,id=hostdev0,bus=pci.0,addr=0x1 \
--device vfio-pci,host=0001:01:00.0,id=hostdev1,bus=pci.0,addr=0x2 \
--device vfio-pci,host=0001:01:00.1,id=hostdev2,bus=pci.0,addr=0x3
+-device spapr-pci-host-bridge,index=1,id=pci.1 \
+-device spapr-pci-host-bridge,index=2,id=pci.2 \
+-device vfio-pci,host=0005:90:01.0,id=hostdev0,bus=pci.1.0,addr=0x1 \
+-device vfio-pci,host=0001:01:00.0,id=hostdev1,bus=pci.2.0,addr=0x1 \
+-device vfio-pci,host=0001:01:00.1,id=hostdev2,bus=pci.2.0,addr=0x2
diff --git a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args
index 83d4306036..cd5b66404e 100644
--- a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args
+++ b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-2.args
@@ -19,6 +19,7 @@ server,nowait \
 -mon chardev=charmonitor,id=monitor,mode=readline \
 -boot c \
 -device spapr-pci-host-bridge,index=1,id=pci.1 \
+-device spapr-pci-host-bridge,index=2,id=pci.2 \
 -device virtio-scsi-pci,id=scsi0,bus=pci.1.0,addr=0x1 \
 -device vfio-pci,host=0001:01:00.0,id=hostdev0,bus=pci.1.0,addr=0x2 \
--device vfio-pci,host=0005:90:01.0,id=hostdev1,bus=pci.0,addr=0x1
+-device vfio-pci,host=0005:90:01.0,id=hostdev1,bus=pci.2.0,addr=0x1
diff --git a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args
index eda6cc73ac..66a31ba1a8 100644
--- a/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args
+++ b/tests/qemuxml2argvdata/qemuxml2argv-pseries-hostdevs-3.args
@@ -21,4 +21,4 @@ server,nowait \
 -device spapr-pci-host-bridge,index=1,id=pci.1 \
 -device spapr-pci-host-bridge,index=2,id=pci.2 \
 -device vfio-pci,host=0001:01:00.0,id=hostdev0,bus=pci.2.0,addr=0x1 \
--device vfio-pci,host=0001:01:00.1,id=hostdev1,bus=pci.0,addr=0x1
+-device vfio-pci,host=0001:01:00.1,id=hostdev1,bus=pci.2.0,addr=0x2
diff --git a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml
index fa9e4daca5..e77a060a38 100644
--- a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml
+++ b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-1.xml
@@ -19,27 +19,35 @@
       <model name='spapr-pci-host-bridge'/>
       <target index='0'/>
     </controller>
+    <controller type='pci' index='1' model='pci-root'>
+      <model name='spapr-pci-host-bridge'/>
+      <target index='1'/>
+    </controller>
+    <controller type='pci' index='2' model='pci-root'>
+      <model name='spapr-pci-host-bridge'/>
+      <target index='2'/>
+    </controller>
     <interface type='hostdev' managed='yes'>
       <mac address='52:54:00:6d:90:02'/>
       <driver name='vfio'/>
       <source>
         <address type='pci' domain='0x0005' bus='0x90' slot='0x01' function='0x0'/>
       </source>
-      <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/>
+      <address type='pci' domain='0x0000' bus='0x01' slot='0x01' function='0x0'/>
     </interface>
     <hostdev mode='subsystem' type='pci' managed='yes'>
       <driver name='vfio'/>
       <source>
         <address domain='0x0001' bus='0x01' slot='0x00' function='0x0'/>
       </source>
-      <address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x0'/>
+      <address type='pci' domain='0x0000' bus='0x02' slot='0x01' function='0x0'/>
     </hostdev>
     <hostdev mode='subsystem' type='pci' managed='yes'>
       <driver name='vfio'/>
       <source>
         <address domain='0x0001' bus='0x01' slot='0x00' function='0x1'/>
       </source>
-      <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/>
+      <address type='pci' domain='0x0000' bus='0x02' slot='0x02' function='0x0'/>
     </hostdev>
     <memballoon model='none'/>
     <panic model='pseries'/>
diff --git a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml
index 17ff4c8537..cfa395b001 100644
--- a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml
+++ b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-2.xml
@@ -26,6 +26,10 @@
       <model name='spapr-pci-host-bridge'/>
       <target index='1'/>
     </controller>
+    <controller type='pci' index='2' model='pci-root'>
+      <model name='spapr-pci-host-bridge'/>
+      <target index='2'/>
+    </controller>
     <hostdev mode='subsystem' type='pci' managed='yes'>
       <driver name='vfio'/>
       <source>
@@ -38,7 +42,7 @@
       <source>
         <address domain='0x0005' bus='0x90' slot='0x01' function='0x0'/>
       </source>
-      <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/>
+      <address type='pci' domain='0x0000' bus='0x02' slot='0x01' function='0x0'/>
     </hostdev>
     <memballoon model='none'/>
     <panic model='pseries'/>
diff --git a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml
index 58023ecd72..f91959b805 100644
--- a/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml
+++ b/tests/qemuxml2xmloutdata/qemuxml2xmlout-pseries-hostdevs-3.xml
@@ -39,7 +39,7 @@
       <source>
         <address domain='0x0001' bus='0x01' slot='0x00' function='0x1'/>
       </source>
-      <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/>
+      <address type='pci' domain='0x0000' bus='0x02' slot='0x02' function='0x0'/>
     </hostdev>
     <memballoon model='none'/>
     <panic model='pseries'/>
-- 
2.13.3