ae23c9
From 560b10746036972ee8be6ff5c252ab2fc56de0e4 Mon Sep 17 00:00:00 2001
ae23c9
From: Peter Xu <peterx@redhat.com>
ae23c9
Date: Fri, 12 Oct 2018 07:58:46 +0100
ae23c9
Subject: [PATCH 16/17] intel-iommu: rework the page walk logic
ae23c9
ae23c9
RH-Author: Peter Xu <peterx@redhat.com>
ae23c9
Message-id: <20181012075846.25449-10-peterx@redhat.com>
ae23c9
Patchwork-id: 82682
ae23c9
O-Subject: [RHEL-8 qemu-kvm PATCH 9/9] intel-iommu: rework the page walk logic
ae23c9
Bugzilla: 1450712
ae23c9
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
ae23c9
RH-Acked-by: Xiao Wang <jasowang@redhat.com>
ae23c9
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
ae23c9
ae23c9
This patch fixes a potential small window that the DMA page table might
ae23c9
be incomplete or invalid when the guest sends domain/context
ae23c9
invalidations to a device.  This can cause random DMA errors for
ae23c9
assigned devices.
ae23c9
ae23c9
This is a major change to the VT-d shadow page walking logic. It
ae23c9
includes but is not limited to:
ae23c9
ae23c9
- For each VTDAddressSpace, now we maintain what IOVA ranges we have
ae23c9
  mapped and what we have not.  With that information, now we only send
ae23c9
  MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we
ae23c9
  know we have already mapped the range, meanwhile we don't send UNMAP
ae23c9
  notifies if we know we never mapped the range at all.
ae23c9
ae23c9
- Introduce vtd_sync_shadow_page_table[_range] APIs so that we can call
ae23c9
  in any places to resync the shadow page table for a device.
ae23c9
ae23c9
- When we receive domain/context invalidation, we should not really run
ae23c9
  the replay logic, instead we use the new sync shadow page table API to
ae23c9
  resync the whole shadow page table without unmapping the whole
ae23c9
  region.  After this change, we'll only do the page walk once for each
ae23c9
  domain invalidations (before this, it can be multiple, depending on
ae23c9
  number of notifiers per address space).
ae23c9
ae23c9
While at it, the page walking logic is also refactored to be simpler.
ae23c9
ae23c9
CC: QEMU Stable <qemu-stable@nongnu.org>
ae23c9
Reported-by: Jintack Lim <jintack@cs.columbia.edu>
ae23c9
Tested-by: Jintack Lim <jintack@cs.columbia.edu>
ae23c9
Signed-off-by: Peter Xu <peterx@redhat.com>
ae23c9
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
ae23c9
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
ae23c9
(cherry picked from commit 63b88968f139b6a77f2f81e6f1eedf70c0170a85)
ae23c9
Signed-off-by: Peter Xu <peterx@redhat.com>
ae23c9
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
ae23c9
---
ae23c9
 hw/i386/intel_iommu.c         | 213 ++++++++++++++++++++++++++++++------------
ae23c9
 hw/i386/trace-events          |   3 +-
ae23c9
 include/hw/i386/intel_iommu.h |   2 +
ae23c9
 3 files changed, 159 insertions(+), 59 deletions(-)
ae23c9
ae23c9
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
ae23c9
index 61bb3d3..b5a09b7 100644
ae23c9
--- a/hw/i386/intel_iommu.c
ae23c9
+++ b/hw/i386/intel_iommu.c
ae23c9
@@ -769,10 +769,77 @@ typedef struct {
ae23c9
 
ae23c9
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
ae23c9
 {
ae23c9
+    VTDAddressSpace *as = info->as;
ae23c9
     vtd_page_walk_hook hook_fn = info->hook_fn;
ae23c9
     void *private = info->private;
ae23c9
+    DMAMap target = {
ae23c9
+        .iova = entry->iova,
ae23c9
+        .size = entry->addr_mask,
ae23c9
+        .translated_addr = entry->translated_addr,
ae23c9
+        .perm = entry->perm,
ae23c9
+    };
ae23c9
+    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
ae23c9
+
ae23c9
+    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
ae23c9
+        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
ae23c9
+        return 0;
ae23c9
+    }
ae23c9
 
ae23c9
     assert(hook_fn);
ae23c9
+
ae23c9
+    /* Update local IOVA mapped ranges */
ae23c9
+    if (entry->perm) {
ae23c9
+        if (mapped) {
ae23c9
+            /* If it's exactly the same translation, skip */
ae23c9
+            if (!memcmp(mapped, &target, sizeof(target))) {
ae23c9
+                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
ae23c9
+                                                 entry->translated_addr);
ae23c9
+                return 0;
ae23c9
+            } else {
ae23c9
+                /*
ae23c9
+                 * Translation changed.  Normally this should not
ae23c9
+                 * happen, but it can happen when with buggy guest
ae23c9
+                 * OSes.  Note that there will be a small window that
ae23c9
+                 * we don't have map at all.  But that's the best
ae23c9
+                 * effort we can do.  The ideal way to emulate this is
ae23c9
+                 * atomically modify the PTE to follow what has
ae23c9
+                 * changed, but we can't.  One example is that vfio
ae23c9
+                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
ae23c9
+                 * interface to modify a mapping (meanwhile it seems
ae23c9
+                 * meaningless to even provide one).  Anyway, let's
ae23c9
+                 * mark this as a TODO in case one day we'll have
ae23c9
+                 * a better solution.
ae23c9
+                 */
ae23c9
+                IOMMUAccessFlags cache_perm = entry->perm;
ae23c9
+                int ret;
ae23c9
+
ae23c9
+                /* Emulate an UNMAP */
ae23c9
+                entry->perm = IOMMU_NONE;
ae23c9
+                trace_vtd_page_walk_one(info->domain_id,
ae23c9
+                                        entry->iova,
ae23c9
+                                        entry->translated_addr,
ae23c9
+                                        entry->addr_mask,
ae23c9
+                                        entry->perm);
ae23c9
+                ret = hook_fn(entry, private);
ae23c9
+                if (ret) {
ae23c9
+                    return ret;
ae23c9
+                }
ae23c9
+                /* Drop any existing mapping */
ae23c9
+                iova_tree_remove(as->iova_tree, &target);
ae23c9
+                /* Recover the correct permission */
ae23c9
+                entry->perm = cache_perm;
ae23c9
+            }
ae23c9
+        }
ae23c9
+        iova_tree_insert(as->iova_tree, &target);
ae23c9
+    } else {
ae23c9
+        if (!mapped) {
ae23c9
+            /* Skip since we didn't map this range at all */
ae23c9
+            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
ae23c9
+            return 0;
ae23c9
+        }
ae23c9
+        iova_tree_remove(as->iova_tree, &target);
ae23c9
+    }
ae23c9
+
ae23c9
     trace_vtd_page_walk_one(info->domain_id, entry->iova,
ae23c9
                             entry->translated_addr, entry->addr_mask,
ae23c9
                             entry->perm);
ae23c9
@@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
ae23c9
          */
ae23c9
         entry_valid = read_cur | write_cur;
ae23c9
 
ae23c9
-        entry.target_as = &address_space_memory;
ae23c9
-        entry.iova = iova & subpage_mask;
ae23c9
-        entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
ae23c9
-        entry.addr_mask = ~subpage_mask;
ae23c9
-
ae23c9
-        if (vtd_is_last_slpte(slpte, level)) {
ae23c9
-            /* NOTE: this is only meaningful if entry_valid == true */
ae23c9
-            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
ae23c9
-            if (!entry_valid && !info->notify_unmap) {
ae23c9
-                trace_vtd_page_walk_skip_perm(iova, iova_next);
ae23c9
-                goto next;
ae23c9
-            }
ae23c9
-            ret = vtd_page_walk_one(&entry, info);
ae23c9
-            if (ret < 0) {
ae23c9
-                return ret;
ae23c9
-            }
ae23c9
-        } else {
ae23c9
-            if (!entry_valid) {
ae23c9
-                if (info->notify_unmap) {
ae23c9
-                    /*
ae23c9
-                     * The whole entry is invalid; unmap it all.
ae23c9
-                     * Translated address is meaningless, zero it.
ae23c9
-                     */
ae23c9
-                    entry.translated_addr = 0x0;
ae23c9
-                    ret = vtd_page_walk_one(&entry, info);
ae23c9
-                    if (ret < 0) {
ae23c9
-                        return ret;
ae23c9
-                    }
ae23c9
-                } else {
ae23c9
-                    trace_vtd_page_walk_skip_perm(iova, iova_next);
ae23c9
-                }
ae23c9
-                goto next;
ae23c9
-            }
ae23c9
+        if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
ae23c9
+            /*
ae23c9
+             * This is a valid PDE (or even bigger than PDE).  We need
ae23c9
+             * to walk one further level.
ae23c9
+             */
ae23c9
             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
ae23c9
                                       iova, MIN(iova_next, end), level - 1,
ae23c9
                                       read_cur, write_cur, info);
ae23c9
-            if (ret < 0) {
ae23c9
-                return ret;
ae23c9
-            }
ae23c9
+        } else {
ae23c9
+            /*
ae23c9
+             * This means we are either:
ae23c9
+             *
ae23c9
+             * (1) the real page entry (either 4K page, or huge page)
ae23c9
+             * (2) the whole range is invalid
ae23c9
+             *
ae23c9
+             * In either case, we send an IOTLB notification down.
ae23c9
+             */
ae23c9
+            entry.target_as = &address_space_memory;
ae23c9
+            entry.iova = iova & subpage_mask;
ae23c9
+            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
ae23c9
+            entry.addr_mask = ~subpage_mask;
ae23c9
+            /* NOTE: this is only meaningful if entry_valid == true */
ae23c9
+            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
ae23c9
+            ret = vtd_page_walk_one(&entry, info);
ae23c9
+        }
ae23c9
+
ae23c9
+        if (ret < 0) {
ae23c9
+            return ret;
ae23c9
         }
ae23c9
 
ae23c9
 next:
ae23c9
@@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
ae23c9
     return 0;
ae23c9
 }
ae23c9
 
ae23c9
+static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
ae23c9
+                                     void *private)
ae23c9
+{
ae23c9
+    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
ae23c9
+    return 0;
ae23c9
+}
ae23c9
+
ae23c9
+/* If context entry is NULL, we'll try to fetch it on our own. */
ae23c9
+static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
ae23c9
+                                            VTDContextEntry *ce,
ae23c9
+                                            hwaddr addr, hwaddr size)
ae23c9
+{
ae23c9
+    IntelIOMMUState *s = vtd_as->iommu_state;
ae23c9
+    vtd_page_walk_info info = {
ae23c9
+        .hook_fn = vtd_sync_shadow_page_hook,
ae23c9
+        .private = (void *)&vtd_as->iommu,
ae23c9
+        .notify_unmap = true,
ae23c9
+        .aw = s->aw_bits,
ae23c9
+        .as = vtd_as,
ae23c9
+    };
ae23c9
+    VTDContextEntry ce_cache;
ae23c9
+    int ret;
ae23c9
+
ae23c9
+    if (ce) {
ae23c9
+        /* If the caller provided context entry, use it */
ae23c9
+        ce_cache = *ce;
ae23c9
+    } else {
ae23c9
+        /* If the caller didn't provide ce, try to fetch */
ae23c9
+        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
ae23c9
+                                       vtd_as->devfn, &ce_cache);
ae23c9
+        if (ret) {
ae23c9
+            /*
ae23c9
+             * This should not really happen, but in case it happens,
ae23c9
+             * we just skip the sync for this time.  After all we even
ae23c9
+             * don't have the root table pointer!
ae23c9
+             */
ae23c9
+            trace_vtd_err("Detected invalid context entry when "
ae23c9
+                          "trying to sync shadow page table");
ae23c9
+            return 0;
ae23c9
+        }
ae23c9
+    }
ae23c9
+
ae23c9
+    info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);
ae23c9
+
ae23c9
+    return vtd_page_walk(&ce_cache, addr, addr + size, &info;;
ae23c9
+}
ae23c9
+
ae23c9
+static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
ae23c9
+{
ae23c9
+    return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
ae23c9
+}
ae23c9
+
ae23c9
 /*
ae23c9
  * Fetch translation type for specific device. Returns <0 if error
ae23c9
  * happens, otherwise return the shifted type to check against
ae23c9
@@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
ae23c9
     VTDAddressSpace *vtd_as;
ae23c9
 
ae23c9
     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
ae23c9
-        memory_region_iommu_replay_all(&vtd_as->iommu);
ae23c9
+        vtd_sync_shadow_page_table(vtd_as);
ae23c9
     }
ae23c9
 }
ae23c9
 
ae23c9
@@ -1371,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
ae23c9
                 vtd_switch_address_space(vtd_as);
ae23c9
                 /*
ae23c9
                  * So a device is moving out of (or moving into) a
ae23c9
-                 * domain, a replay() suites here to notify all the
ae23c9
-                 * IOMMU_NOTIFIER_MAP registers about this change.
ae23c9
+                 * domain, resync the shadow page table.
ae23c9
                  * This won't bring bad even if we have no such
ae23c9
                  * notifier registered - the IOMMU notification
ae23c9
                  * framework will skip MAP notifications if that
ae23c9
                  * happened.
ae23c9
                  */
ae23c9
-                memory_region_iommu_replay_all(&vtd_as->iommu);
ae23c9
+                vtd_sync_shadow_page_table(vtd_as);
ae23c9
             }
ae23c9
         }
ae23c9
     }
ae23c9
@@ -1436,18 +1543,11 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
ae23c9
         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
ae23c9
                                       vtd_as->devfn, &ce) &&
ae23c9
             domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
ae23c9
-            memory_region_iommu_replay_all(&vtd_as->iommu);
ae23c9
+            vtd_sync_shadow_page_table(vtd_as);
ae23c9
         }
ae23c9
     }
ae23c9
 }
ae23c9
 
ae23c9
-static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
ae23c9
-                                           void *private)
ae23c9
-{
ae23c9
-    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
ae23c9
-    return 0;
ae23c9
-}
ae23c9
-
ae23c9
 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
ae23c9
                                            uint16_t domain_id, hwaddr addr,
ae23c9
                                            uint8_t am)
ae23c9
@@ -1462,21 +1562,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
ae23c9
                                        vtd_as->devfn, &ce);
ae23c9
         if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
ae23c9
             if (vtd_as_has_map_notifier(vtd_as)) {
ae23c9
-                vtd_page_walk_info info = {
ae23c9
-                    .hook_fn = vtd_page_invalidate_notify_hook,
ae23c9
-                    .private = (void *)&vtd_as->iommu,
ae23c9
-                    .notify_unmap = true,
ae23c9
-                    .aw = s->aw_bits,
ae23c9
-                    .as = vtd_as,
ae23c9
-                    .domain_id = domain_id,
ae23c9
-                };
ae23c9
-
ae23c9
                 /*
ae23c9
                  * As long as we have MAP notifications registered in
ae23c9
                  * any of our IOMMU notifiers, we need to sync the
ae23c9
                  * shadow page table.
ae23c9
                  */
ae23c9
-                vtd_page_walk(&ce, addr, addr + size, &info;;
ae23c9
+                vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
ae23c9
             } else {
ae23c9
                 /*
ae23c9
                  * For UNMAP-only notifiers, we don't need to walk the
ae23c9
@@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
ae23c9
         vtd_dev_as->devfn = (uint8_t)devfn;
ae23c9
         vtd_dev_as->iommu_state = s;
ae23c9
         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
ae23c9
+        vtd_dev_as->iova_tree = iova_tree_new();
ae23c9
 
ae23c9
         /*
ae23c9
          * Memory region relationships looks like (Address range shows
ae23c9
@@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
ae23c9
     hwaddr start = n->start;
ae23c9
     hwaddr end = n->end;
ae23c9
     IntelIOMMUState *s = as->iommu_state;
ae23c9
+    DMAMap map;
ae23c9
 
ae23c9
     /*
ae23c9
      * Note: all the codes in this function has a assumption that IOVA
ae23c9
@@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
ae23c9
                              VTD_PCI_FUNC(as->devfn),
ae23c9
                              entry.iova, size);
ae23c9
 
ae23c9
+    map.iova = entry.iova;
ae23c9
+    map.size = entry.addr_mask;
ae23c9
+    iova_tree_remove(as->iova_tree, &map);
ae23c9
+
ae23c9
     memory_region_notify_one(n, &entry);
ae23c9
 }
ae23c9
 
ae23c9
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
ae23c9
index ca23ba9..e14d06e 100644
ae23c9
--- a/hw/i386/trace-events
ae23c9
+++ b/hw/i386/trace-events
ae23c9
@@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6
ae23c9
 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
ae23c9
 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
ae23c9
 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
ae23c9
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64
ae23c9
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
ae23c9
 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
ae23c9
-vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
ae23c9
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
ae23c9
 vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
ae23c9
 vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
ae23c9
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
ae23c9
index 156f35e..fbfedcb 100644
ae23c9
--- a/include/hw/i386/intel_iommu.h
ae23c9
+++ b/include/hw/i386/intel_iommu.h
ae23c9
@@ -27,6 +27,7 @@
ae23c9
 #include "hw/i386/ioapic.h"
ae23c9
 #include "hw/pci/msi.h"
ae23c9
 #include "hw/sysbus.h"
ae23c9
+#include "qemu/iova-tree.h"
ae23c9
 
ae23c9
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
ae23c9
 #define INTEL_IOMMU_DEVICE(obj) \
ae23c9
@@ -95,6 +96,7 @@ struct VTDAddressSpace {
ae23c9
     QLIST_ENTRY(VTDAddressSpace) next;
ae23c9
     /* Superset of notifier flags that this address space has */
ae23c9
     IOMMUNotifierFlag notifier_flags;
ae23c9
+    IOVATree *iova_tree;          /* Traces mapped IOVA ranges */
ae23c9
 };
ae23c9
 
ae23c9
 struct VTDBus {
ae23c9
-- 
ae23c9
1.8.3.1
ae23c9