Blame SOURCES/kvm-intel-iommu-rework-the-page-walk-logic.patch

383d26
From 0b6a29f743e4dcbf340d5abe0e7e230591b7d5d3 Mon Sep 17 00:00:00 2001
383d26
From: Peter Xu <peterx@redhat.com>
383d26
Date: Mon, 3 Sep 2018 04:52:41 +0200
383d26
Subject: [PATCH 26/29] intel-iommu: rework the page walk logic
383d26
383d26
RH-Author: Peter Xu <peterx@redhat.com>
383d26
Message-id: <20180903045241.6456-10-peterx@redhat.com>
383d26
Patchwork-id: 82029
383d26
O-Subject: [RHEL-7.6 qemu-kvm-rhev PATCH 9/9] intel-iommu: rework the page walk logic
383d26
Bugzilla: 1623859
383d26
RH-Acked-by: Xiao Wang <jasowang@redhat.com>
383d26
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
383d26
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
383d26
383d26
This patch fixes a potential small window that the DMA page table might
383d26
be incomplete or invalid when the guest sends domain/context
383d26
invalidations to a device.  This can cause random DMA errors for
383d26
assigned devices.
383d26
383d26
This is a major change to the VT-d shadow page walking logic. It
383d26
includes but is not limited to:
383d26
383d26
- For each VTDAddressSpace, now we maintain what IOVA ranges we have
383d26
  mapped and what we have not.  With that information, now we only send
383d26
  MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we
383d26
  know we have already mapped the range, meanwhile we don't send UNMAP
383d26
  notifies if we know we never mapped the range at all.
383d26
383d26
- Introduce vtd_sync_shadow_page_table[_range] APIs so that we can call
383d26
  in any places to resync the shadow page table for a device.
383d26
383d26
- When we receive domain/context invalidation, we should not really run
383d26
  the replay logic, instead we use the new sync shadow page table API to
383d26
  resync the whole shadow page table without unmapping the whole
383d26
  region.  After this change, we'll only do the page walk once for each
383d26
  domain invalidations (before this, it can be multiple, depending on
383d26
  number of notifiers per address space).
383d26
383d26
While at it, the page walking logic is also refactored to be simpler.
383d26
383d26
CC: QEMU Stable <qemu-stable@nongnu.org>
383d26
Reported-by: Jintack Lim <jintack@cs.columbia.edu>
383d26
Tested-by: Jintack Lim <jintack@cs.columbia.edu>
383d26
Signed-off-by: Peter Xu <peterx@redhat.com>
383d26
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
383d26
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
383d26
(cherry picked from commit 63b88968f139b6a77f2f81e6f1eedf70c0170a85)
383d26
Signed-off-by: Peter Xu <peterx@redhat.com>
383d26
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
383d26
---
383d26
 hw/i386/intel_iommu.c         | 213 ++++++++++++++++++++++++++++++------------
383d26
 hw/i386/trace-events          |   3 +-
383d26
 include/hw/i386/intel_iommu.h |   2 +
383d26
 3 files changed, 159 insertions(+), 59 deletions(-)
383d26
383d26
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
383d26
index 61bb3d3..b5a09b7 100644
383d26
--- a/hw/i386/intel_iommu.c
383d26
+++ b/hw/i386/intel_iommu.c
383d26
@@ -769,10 +769,77 @@ typedef struct {
383d26
 
383d26
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
383d26
 {
383d26
+    VTDAddressSpace *as = info->as;
383d26
     vtd_page_walk_hook hook_fn = info->hook_fn;
383d26
     void *private = info->private;
383d26
+    DMAMap target = {
383d26
+        .iova = entry->iova,
383d26
+        .size = entry->addr_mask,
383d26
+        .translated_addr = entry->translated_addr,
383d26
+        .perm = entry->perm,
383d26
+    };
383d26
+    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
383d26
+
383d26
+    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
383d26
+        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
383d26
+        return 0;
383d26
+    }
383d26
 
383d26
     assert(hook_fn);
383d26
+
383d26
+    /* Update local IOVA mapped ranges */
383d26
+    if (entry->perm) {
383d26
+        if (mapped) {
383d26
+            /* If it's exactly the same translation, skip */
383d26
+            if (!memcmp(mapped, &target, sizeof(target))) {
383d26
+                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
383d26
+                                                 entry->translated_addr);
383d26
+                return 0;
383d26
+            } else {
383d26
+                /*
383d26
+                 * Translation changed.  Normally this should not
383d26
+                 * happen, but it can happen when with buggy guest
383d26
+                 * OSes.  Note that there will be a small window that
383d26
+                 * we don't have map at all.  But that's the best
383d26
+                 * effort we can do.  The ideal way to emulate this is
383d26
+                 * atomically modify the PTE to follow what has
383d26
+                 * changed, but we can't.  One example is that vfio
383d26
+                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
383d26
+                 * interface to modify a mapping (meanwhile it seems
383d26
+                 * meaningless to even provide one).  Anyway, let's
383d26
+                 * mark this as a TODO in case one day we'll have
383d26
+                 * a better solution.
383d26
+                 */
383d26
+                IOMMUAccessFlags cache_perm = entry->perm;
383d26
+                int ret;
383d26
+
383d26
+                /* Emulate an UNMAP */
383d26
+                entry->perm = IOMMU_NONE;
383d26
+                trace_vtd_page_walk_one(info->domain_id,
383d26
+                                        entry->iova,
383d26
+                                        entry->translated_addr,
383d26
+                                        entry->addr_mask,
383d26
+                                        entry->perm);
383d26
+                ret = hook_fn(entry, private);
383d26
+                if (ret) {
383d26
+                    return ret;
383d26
+                }
383d26
+                /* Drop any existing mapping */
383d26
+                iova_tree_remove(as->iova_tree, &target);
383d26
+                /* Recover the correct permission */
383d26
+                entry->perm = cache_perm;
383d26
+            }
383d26
+        }
383d26
+        iova_tree_insert(as->iova_tree, &target);
383d26
+    } else {
383d26
+        if (!mapped) {
383d26
+            /* Skip since we didn't map this range at all */
383d26
+            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
383d26
+            return 0;
383d26
+        }
383d26
+        iova_tree_remove(as->iova_tree, &target);
383d26
+    }
383d26
+
383d26
     trace_vtd_page_walk_one(info->domain_id, entry->iova,
383d26
                             entry->translated_addr, entry->addr_mask,
383d26
                             entry->perm);
383d26
@@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
383d26
          */
383d26
         entry_valid = read_cur | write_cur;
383d26
 
383d26
-        entry.target_as = &address_space_memory;
383d26
-        entry.iova = iova & subpage_mask;
383d26
-        entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
383d26
-        entry.addr_mask = ~subpage_mask;
383d26
-
383d26
-        if (vtd_is_last_slpte(slpte, level)) {
383d26
-            /* NOTE: this is only meaningful if entry_valid == true */
383d26
-            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
383d26
-            if (!entry_valid && !info->notify_unmap) {
383d26
-                trace_vtd_page_walk_skip_perm(iova, iova_next);
383d26
-                goto next;
383d26
-            }
383d26
-            ret = vtd_page_walk_one(&entry, info);
383d26
-            if (ret < 0) {
383d26
-                return ret;
383d26
-            }
383d26
-        } else {
383d26
-            if (!entry_valid) {
383d26
-                if (info->notify_unmap) {
383d26
-                    /*
383d26
-                     * The whole entry is invalid; unmap it all.
383d26
-                     * Translated address is meaningless, zero it.
383d26
-                     */
383d26
-                    entry.translated_addr = 0x0;
383d26
-                    ret = vtd_page_walk_one(&entry, info);
383d26
-                    if (ret < 0) {
383d26
-                        return ret;
383d26
-                    }
383d26
-                } else {
383d26
-                    trace_vtd_page_walk_skip_perm(iova, iova_next);
383d26
-                }
383d26
-                goto next;
383d26
-            }
383d26
+        if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
383d26
+            /*
383d26
+             * This is a valid PDE (or even bigger than PDE).  We need
383d26
+             * to walk one further level.
383d26
+             */
383d26
             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
383d26
                                       iova, MIN(iova_next, end), level - 1,
383d26
                                       read_cur, write_cur, info);
383d26
-            if (ret < 0) {
383d26
-                return ret;
383d26
-            }
383d26
+        } else {
383d26
+            /*
383d26
+             * This means we are either:
383d26
+             *
383d26
+             * (1) the real page entry (either 4K page, or huge page)
383d26
+             * (2) the whole range is invalid
383d26
+             *
383d26
+             * In either case, we send an IOTLB notification down.
383d26
+             */
383d26
+            entry.target_as = &address_space_memory;
383d26
+            entry.iova = iova & subpage_mask;
383d26
+            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
383d26
+            entry.addr_mask = ~subpage_mask;
383d26
+            /* NOTE: this is only meaningful if entry_valid == true */
383d26
+            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
383d26
+            ret = vtd_page_walk_one(&entry, info);
383d26
+        }
383d26
+
383d26
+        if (ret < 0) {
383d26
+            return ret;
383d26
         }
383d26
 
383d26
 next:
383d26
@@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
383d26
     return 0;
383d26
 }
383d26
 
383d26
+static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
383d26
+                                     void *private)
383d26
+{
383d26
+    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
383d26
+    return 0;
383d26
+}
383d26
+
383d26
+/* If context entry is NULL, we'll try to fetch it on our own. */
383d26
+static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
383d26
+                                            VTDContextEntry *ce,
383d26
+                                            hwaddr addr, hwaddr size)
383d26
+{
383d26
+    IntelIOMMUState *s = vtd_as->iommu_state;
383d26
+    vtd_page_walk_info info = {
383d26
+        .hook_fn = vtd_sync_shadow_page_hook,
383d26
+        .private = (void *)&vtd_as->iommu,
383d26
+        .notify_unmap = true,
383d26
+        .aw = s->aw_bits,
383d26
+        .as = vtd_as,
383d26
+    };
383d26
+    VTDContextEntry ce_cache;
383d26
+    int ret;
383d26
+
383d26
+    if (ce) {
383d26
+        /* If the caller provided context entry, use it */
383d26
+        ce_cache = *ce;
383d26
+    } else {
383d26
+        /* If the caller didn't provide ce, try to fetch */
383d26
+        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
383d26
+                                       vtd_as->devfn, &ce_cache);
383d26
+        if (ret) {
383d26
+            /*
383d26
+             * This should not really happen, but in case it happens,
383d26
+             * we just skip the sync for this time.  After all we even
383d26
+             * don't have the root table pointer!
383d26
+             */
383d26
+            trace_vtd_err("Detected invalid context entry when "
383d26
+                          "trying to sync shadow page table");
383d26
+            return 0;
383d26
+        }
383d26
+    }
383d26
+
383d26
+    info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);
383d26
+
383d26
+    return vtd_page_walk(&ce_cache, addr, addr + size, &info;;
383d26
+}
383d26
+
383d26
+static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
383d26
+{
383d26
+    return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
383d26
+}
383d26
+
383d26
 /*
383d26
  * Fetch translation type for specific device. Returns <0 if error
383d26
  * happens, otherwise return the shifted type to check against
383d26
@@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
383d26
     VTDAddressSpace *vtd_as;
383d26
 
383d26
     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
383d26
-        memory_region_iommu_replay_all(&vtd_as->iommu);
383d26
+        vtd_sync_shadow_page_table(vtd_as);
383d26
     }
383d26
 }
383d26
 
383d26
@@ -1371,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
383d26
                 vtd_switch_address_space(vtd_as);
383d26
                 /*
383d26
                  * So a device is moving out of (or moving into) a
383d26
-                 * domain, a replay() suites here to notify all the
383d26
-                 * IOMMU_NOTIFIER_MAP registers about this change.
383d26
+                 * domain, resync the shadow page table.
383d26
                  * This won't bring bad even if we have no such
383d26
                  * notifier registered - the IOMMU notification
383d26
                  * framework will skip MAP notifications if that
383d26
                  * happened.
383d26
                  */
383d26
-                memory_region_iommu_replay_all(&vtd_as->iommu);
383d26
+                vtd_sync_shadow_page_table(vtd_as);
383d26
             }
383d26
         }
383d26
     }
383d26
@@ -1436,18 +1543,11 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
383d26
         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
383d26
                                       vtd_as->devfn, &ce) &&
383d26
             domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
383d26
-            memory_region_iommu_replay_all(&vtd_as->iommu);
383d26
+            vtd_sync_shadow_page_table(vtd_as);
383d26
         }
383d26
     }
383d26
 }
383d26
 
383d26
-static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
383d26
-                                           void *private)
383d26
-{
383d26
-    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
383d26
-    return 0;
383d26
-}
383d26
-
383d26
 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
383d26
                                            uint16_t domain_id, hwaddr addr,
383d26
                                            uint8_t am)
383d26
@@ -1462,21 +1562,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
383d26
                                        vtd_as->devfn, &ce);
383d26
         if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
383d26
             if (vtd_as_has_map_notifier(vtd_as)) {
383d26
-                vtd_page_walk_info info = {
383d26
-                    .hook_fn = vtd_page_invalidate_notify_hook,
383d26
-                    .private = (void *)&vtd_as->iommu,
383d26
-                    .notify_unmap = true,
383d26
-                    .aw = s->aw_bits,
383d26
-                    .as = vtd_as,
383d26
-                    .domain_id = domain_id,
383d26
-                };
383d26
-
383d26
                 /*
383d26
                  * As long as we have MAP notifications registered in
383d26
                  * any of our IOMMU notifiers, we need to sync the
383d26
                  * shadow page table.
383d26
                  */
383d26
-                vtd_page_walk(&ce, addr, addr + size, &info;;
383d26
+                vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
383d26
             } else {
383d26
                 /*
383d26
                  * For UNMAP-only notifiers, we don't need to walk the
383d26
@@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
383d26
         vtd_dev_as->devfn = (uint8_t)devfn;
383d26
         vtd_dev_as->iommu_state = s;
383d26
         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
383d26
+        vtd_dev_as->iova_tree = iova_tree_new();
383d26
 
383d26
         /*
383d26
          * Memory region relationships looks like (Address range shows
383d26
@@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
383d26
     hwaddr start = n->start;
383d26
     hwaddr end = n->end;
383d26
     IntelIOMMUState *s = as->iommu_state;
383d26
+    DMAMap map;
383d26
 
383d26
     /*
383d26
      * Note: all the codes in this function has a assumption that IOVA
383d26
@@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
383d26
                              VTD_PCI_FUNC(as->devfn),
383d26
                              entry.iova, size);
383d26
 
383d26
+    map.iova = entry.iova;
383d26
+    map.size = entry.addr_mask;
383d26
+    iova_tree_remove(as->iova_tree, &map);
383d26
+
383d26
     memory_region_notify_one(n, &entry);
383d26
 }
383d26
 
383d26
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
383d26
index ca23ba9..e14d06e 100644
383d26
--- a/hw/i386/trace-events
383d26
+++ b/hw/i386/trace-events
383d26
@@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6
383d26
 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
383d26
 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
383d26
 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
383d26
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64
383d26
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
383d26
 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
383d26
-vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
383d26
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
383d26
 vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
383d26
 vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
383d26
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
383d26
index 156f35e..fbfedcb 100644
383d26
--- a/include/hw/i386/intel_iommu.h
383d26
+++ b/include/hw/i386/intel_iommu.h
383d26
@@ -27,6 +27,7 @@
383d26
 #include "hw/i386/ioapic.h"
383d26
 #include "hw/pci/msi.h"
383d26
 #include "hw/sysbus.h"
383d26
+#include "qemu/iova-tree.h"
383d26
 
383d26
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
383d26
 #define INTEL_IOMMU_DEVICE(obj) \
383d26
@@ -95,6 +96,7 @@ struct VTDAddressSpace {
383d26
     QLIST_ENTRY(VTDAddressSpace) next;
383d26
     /* Superset of notifier flags that this address space has */
383d26
     IOMMUNotifierFlag notifier_flags;
383d26
+    IOVATree *iova_tree;          /* Traces mapped IOVA ranges */
383d26
 };
383d26
 
383d26
 struct VTDBus {
383d26
-- 
383d26
1.8.3.1
383d26