26ba25
From 560b10746036972ee8be6ff5c252ab2fc56de0e4 Mon Sep 17 00:00:00 2001
26ba25
From: Peter Xu <peterx@redhat.com>
26ba25
Date: Fri, 12 Oct 2018 07:58:46 +0100
26ba25
Subject: [PATCH 16/17] intel-iommu: rework the page walk logic
26ba25
26ba25
RH-Author: Peter Xu <peterx@redhat.com>
26ba25
Message-id: <20181012075846.25449-10-peterx@redhat.com>
26ba25
Patchwork-id: 82682
26ba25
O-Subject: [RHEL-8 qemu-kvm PATCH 9/9] intel-iommu: rework the page walk logic
26ba25
Bugzilla: 1450712
26ba25
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
26ba25
RH-Acked-by: Xiao Wang <jasowang@redhat.com>
26ba25
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
26ba25
26ba25
This patch fixes a potential small window that the DMA page table might
26ba25
be incomplete or invalid when the guest sends domain/context
26ba25
invalidations to a device.  This can cause random DMA errors for
26ba25
assigned devices.
26ba25
26ba25
This is a major change to the VT-d shadow page walking logic. It
26ba25
includes but is not limited to:
26ba25
26ba25
- For each VTDAddressSpace, now we maintain what IOVA ranges we have
26ba25
  mapped and what we have not.  With that information, now we only send
26ba25
  MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we
26ba25
  know we have already mapped the range, meanwhile we don't send UNMAP
26ba25
  notifies if we know we never mapped the range at all.
26ba25
26ba25
- Introduce vtd_sync_shadow_page_table[_range] APIs so that we can call
26ba25
  in any places to resync the shadow page table for a device.
26ba25
26ba25
- When we receive domain/context invalidation, we should not really run
26ba25
  the replay logic, instead we use the new sync shadow page table API to
26ba25
  resync the whole shadow page table without unmapping the whole
26ba25
  region.  After this change, we'll only do the page walk once for each
26ba25
  domain invalidations (before this, it can be multiple, depending on
26ba25
  number of notifiers per address space).
26ba25
26ba25
While at it, the page walking logic is also refactored to be simpler.
26ba25
26ba25
CC: QEMU Stable <qemu-stable@nongnu.org>
26ba25
Reported-by: Jintack Lim <jintack@cs.columbia.edu>
26ba25
Tested-by: Jintack Lim <jintack@cs.columbia.edu>
26ba25
Signed-off-by: Peter Xu <peterx@redhat.com>
26ba25
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
26ba25
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
26ba25
(cherry picked from commit 63b88968f139b6a77f2f81e6f1eedf70c0170a85)
26ba25
Signed-off-by: Peter Xu <peterx@redhat.com>
26ba25
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
26ba25
---
26ba25
 hw/i386/intel_iommu.c         | 213 ++++++++++++++++++++++++++++++------------
26ba25
 hw/i386/trace-events          |   3 +-
26ba25
 include/hw/i386/intel_iommu.h |   2 +
26ba25
 3 files changed, 159 insertions(+), 59 deletions(-)
26ba25
26ba25
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
26ba25
index 61bb3d3..b5a09b7 100644
26ba25
--- a/hw/i386/intel_iommu.c
26ba25
+++ b/hw/i386/intel_iommu.c
26ba25
@@ -769,10 +769,77 @@ typedef struct {
26ba25
 
26ba25
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
26ba25
 {
26ba25
+    VTDAddressSpace *as = info->as;
26ba25
     vtd_page_walk_hook hook_fn = info->hook_fn;
26ba25
     void *private = info->private;
26ba25
+    DMAMap target = {
26ba25
+        .iova = entry->iova,
26ba25
+        .size = entry->addr_mask,
26ba25
+        .translated_addr = entry->translated_addr,
26ba25
+        .perm = entry->perm,
26ba25
+    };
26ba25
+    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
26ba25
+
26ba25
+    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
26ba25
+        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
26ba25
+        return 0;
26ba25
+    }
26ba25
 
26ba25
     assert(hook_fn);
26ba25
+
26ba25
+    /* Update local IOVA mapped ranges */
26ba25
+    if (entry->perm) {
26ba25
+        if (mapped) {
26ba25
+            /* If it's exactly the same translation, skip */
26ba25
+            if (!memcmp(mapped, &target, sizeof(target))) {
26ba25
+                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
26ba25
+                                                 entry->translated_addr);
26ba25
+                return 0;
26ba25
+            } else {
26ba25
+                /*
26ba25
+                 * Translation changed.  Normally this should not
26ba25
+                 * happen, but it can happen when with buggy guest
26ba25
+                 * OSes.  Note that there will be a small window that
26ba25
+                 * we don't have map at all.  But that's the best
26ba25
+                 * effort we can do.  The ideal way to emulate this is
26ba25
+                 * atomically modify the PTE to follow what has
26ba25
+                 * changed, but we can't.  One example is that vfio
26ba25
+                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
26ba25
+                 * interface to modify a mapping (meanwhile it seems
26ba25
+                 * meaningless to even provide one).  Anyway, let's
26ba25
+                 * mark this as a TODO in case one day we'll have
26ba25
+                 * a better solution.
26ba25
+                 */
26ba25
+                IOMMUAccessFlags cache_perm = entry->perm;
26ba25
+                int ret;
26ba25
+
26ba25
+                /* Emulate an UNMAP */
26ba25
+                entry->perm = IOMMU_NONE;
26ba25
+                trace_vtd_page_walk_one(info->domain_id,
26ba25
+                                        entry->iova,
26ba25
+                                        entry->translated_addr,
26ba25
+                                        entry->addr_mask,
26ba25
+                                        entry->perm);
26ba25
+                ret = hook_fn(entry, private);
26ba25
+                if (ret) {
26ba25
+                    return ret;
26ba25
+                }
26ba25
+                /* Drop any existing mapping */
26ba25
+                iova_tree_remove(as->iova_tree, &target);
26ba25
+                /* Recover the correct permission */
26ba25
+                entry->perm = cache_perm;
26ba25
+            }
26ba25
+        }
26ba25
+        iova_tree_insert(as->iova_tree, &target);
26ba25
+    } else {
26ba25
+        if (!mapped) {
26ba25
+            /* Skip since we didn't map this range at all */
26ba25
+            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
26ba25
+            return 0;
26ba25
+        }
26ba25
+        iova_tree_remove(as->iova_tree, &target);
26ba25
+    }
26ba25
+
26ba25
     trace_vtd_page_walk_one(info->domain_id, entry->iova,
26ba25
                             entry->translated_addr, entry->addr_mask,
26ba25
                             entry->perm);
26ba25
@@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
26ba25
          */
26ba25
         entry_valid = read_cur | write_cur;
26ba25
 
26ba25
-        entry.target_as = &address_space_memory;
26ba25
-        entry.iova = iova & subpage_mask;
26ba25
-        entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
26ba25
-        entry.addr_mask = ~subpage_mask;
26ba25
-
26ba25
-        if (vtd_is_last_slpte(slpte, level)) {
26ba25
-            /* NOTE: this is only meaningful if entry_valid == true */
26ba25
-            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
26ba25
-            if (!entry_valid && !info->notify_unmap) {
26ba25
-                trace_vtd_page_walk_skip_perm(iova, iova_next);
26ba25
-                goto next;
26ba25
-            }
26ba25
-            ret = vtd_page_walk_one(&entry, info);
26ba25
-            if (ret < 0) {
26ba25
-                return ret;
26ba25
-            }
26ba25
-        } else {
26ba25
-            if (!entry_valid) {
26ba25
-                if (info->notify_unmap) {
26ba25
-                    /*
26ba25
-                     * The whole entry is invalid; unmap it all.
26ba25
-                     * Translated address is meaningless, zero it.
26ba25
-                     */
26ba25
-                    entry.translated_addr = 0x0;
26ba25
-                    ret = vtd_page_walk_one(&entry, info);
26ba25
-                    if (ret < 0) {
26ba25
-                        return ret;
26ba25
-                    }
26ba25
-                } else {
26ba25
-                    trace_vtd_page_walk_skip_perm(iova, iova_next);
26ba25
-                }
26ba25
-                goto next;
26ba25
-            }
26ba25
+        if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
26ba25
+            /*
26ba25
+             * This is a valid PDE (or even bigger than PDE).  We need
26ba25
+             * to walk one further level.
26ba25
+             */
26ba25
             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
26ba25
                                       iova, MIN(iova_next, end), level - 1,
26ba25
                                       read_cur, write_cur, info);
26ba25
-            if (ret < 0) {
26ba25
-                return ret;
26ba25
-            }
26ba25
+        } else {
26ba25
+            /*
26ba25
+             * This means we are either:
26ba25
+             *
26ba25
+             * (1) the real page entry (either 4K page, or huge page)
26ba25
+             * (2) the whole range is invalid
26ba25
+             *
26ba25
+             * In either case, we send an IOTLB notification down.
26ba25
+             */
26ba25
+            entry.target_as = &address_space_memory;
26ba25
+            entry.iova = iova & subpage_mask;
26ba25
+            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
26ba25
+            entry.addr_mask = ~subpage_mask;
26ba25
+            /* NOTE: this is only meaningful if entry_valid == true */
26ba25
+            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
26ba25
+            ret = vtd_page_walk_one(&entry, info);
26ba25
+        }
26ba25
+
26ba25
+        if (ret < 0) {
26ba25
+            return ret;
26ba25
         }
26ba25
 
26ba25
 next:
26ba25
@@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
26ba25
     return 0;
26ba25
 }
26ba25
 
26ba25
+static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
26ba25
+                                     void *private)
26ba25
+{
26ba25
+    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
26ba25
+    return 0;
26ba25
+}
26ba25
+
26ba25
+/* If context entry is NULL, we'll try to fetch it on our own. */
26ba25
+static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
26ba25
+                                            VTDContextEntry *ce,
26ba25
+                                            hwaddr addr, hwaddr size)
26ba25
+{
26ba25
+    IntelIOMMUState *s = vtd_as->iommu_state;
26ba25
+    vtd_page_walk_info info = {
26ba25
+        .hook_fn = vtd_sync_shadow_page_hook,
26ba25
+        .private = (void *)&vtd_as->iommu,
26ba25
+        .notify_unmap = true,
26ba25
+        .aw = s->aw_bits,
26ba25
+        .as = vtd_as,
26ba25
+    };
26ba25
+    VTDContextEntry ce_cache;
26ba25
+    int ret;
26ba25
+
26ba25
+    if (ce) {
26ba25
+        /* If the caller provided context entry, use it */
26ba25
+        ce_cache = *ce;
26ba25
+    } else {
26ba25
+        /* If the caller didn't provide ce, try to fetch */
26ba25
+        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
26ba25
+                                       vtd_as->devfn, &ce_cache);
26ba25
+        if (ret) {
26ba25
+            /*
26ba25
+             * This should not really happen, but in case it happens,
26ba25
+             * we just skip the sync for this time.  After all we even
26ba25
+             * don't have the root table pointer!
26ba25
+             */
26ba25
+            trace_vtd_err("Detected invalid context entry when "
26ba25
+                          "trying to sync shadow page table");
26ba25
+            return 0;
26ba25
+        }
26ba25
+    }
26ba25
+
26ba25
+    info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);
26ba25
+
26ba25
+    return vtd_page_walk(&ce_cache, addr, addr + size, &info;;
26ba25
+}
26ba25
+
26ba25
+static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
26ba25
+{
26ba25
+    return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
26ba25
+}
26ba25
+
26ba25
 /*
26ba25
  * Fetch translation type for specific device. Returns <0 if error
26ba25
  * happens, otherwise return the shifted type to check against
26ba25
@@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
26ba25
     VTDAddressSpace *vtd_as;
26ba25
 
26ba25
     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
26ba25
-        memory_region_iommu_replay_all(&vtd_as->iommu);
26ba25
+        vtd_sync_shadow_page_table(vtd_as);
26ba25
     }
26ba25
 }
26ba25
 
26ba25
@@ -1371,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
26ba25
                 vtd_switch_address_space(vtd_as);
26ba25
                 /*
26ba25
                  * So a device is moving out of (or moving into) a
26ba25
-                 * domain, a replay() suites here to notify all the
26ba25
-                 * IOMMU_NOTIFIER_MAP registers about this change.
26ba25
+                 * domain, resync the shadow page table.
26ba25
                  * This won't bring bad even if we have no such
26ba25
                  * notifier registered - the IOMMU notification
26ba25
                  * framework will skip MAP notifications if that
26ba25
                  * happened.
26ba25
                  */
26ba25
-                memory_region_iommu_replay_all(&vtd_as->iommu);
26ba25
+                vtd_sync_shadow_page_table(vtd_as);
26ba25
             }
26ba25
         }
26ba25
     }
26ba25
@@ -1436,18 +1543,11 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
26ba25
         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
26ba25
                                       vtd_as->devfn, &ce) &&
26ba25
             domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
26ba25
-            memory_region_iommu_replay_all(&vtd_as->iommu);
26ba25
+            vtd_sync_shadow_page_table(vtd_as);
26ba25
         }
26ba25
     }
26ba25
 }
26ba25
 
26ba25
-static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
26ba25
-                                           void *private)
26ba25
-{
26ba25
-    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
26ba25
-    return 0;
26ba25
-}
26ba25
-
26ba25
 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
26ba25
                                            uint16_t domain_id, hwaddr addr,
26ba25
                                            uint8_t am)
26ba25
@@ -1462,21 +1562,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
26ba25
                                        vtd_as->devfn, &ce);
26ba25
         if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
26ba25
             if (vtd_as_has_map_notifier(vtd_as)) {
26ba25
-                vtd_page_walk_info info = {
26ba25
-                    .hook_fn = vtd_page_invalidate_notify_hook,
26ba25
-                    .private = (void *)&vtd_as->iommu,
26ba25
-                    .notify_unmap = true,
26ba25
-                    .aw = s->aw_bits,
26ba25
-                    .as = vtd_as,
26ba25
-                    .domain_id = domain_id,
26ba25
-                };
26ba25
-
26ba25
                 /*
26ba25
                  * As long as we have MAP notifications registered in
26ba25
                  * any of our IOMMU notifiers, we need to sync the
26ba25
                  * shadow page table.
26ba25
                  */
26ba25
-                vtd_page_walk(&ce, addr, addr + size, &info;;
26ba25
+                vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
26ba25
             } else {
26ba25
                 /*
26ba25
                  * For UNMAP-only notifiers, we don't need to walk the
26ba25
@@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
26ba25
         vtd_dev_as->devfn = (uint8_t)devfn;
26ba25
         vtd_dev_as->iommu_state = s;
26ba25
         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
26ba25
+        vtd_dev_as->iova_tree = iova_tree_new();
26ba25
 
26ba25
         /*
26ba25
          * Memory region relationships looks like (Address range shows
26ba25
@@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
26ba25
     hwaddr start = n->start;
26ba25
     hwaddr end = n->end;
26ba25
     IntelIOMMUState *s = as->iommu_state;
26ba25
+    DMAMap map;
26ba25
 
26ba25
     /*
26ba25
      * Note: all the codes in this function has a assumption that IOVA
26ba25
@@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
26ba25
                              VTD_PCI_FUNC(as->devfn),
26ba25
                              entry.iova, size);
26ba25
 
26ba25
+    map.iova = entry.iova;
26ba25
+    map.size = entry.addr_mask;
26ba25
+    iova_tree_remove(as->iova_tree, &map);
26ba25
+
26ba25
     memory_region_notify_one(n, &entry);
26ba25
 }
26ba25
 
26ba25
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
26ba25
index ca23ba9..e14d06e 100644
26ba25
--- a/hw/i386/trace-events
26ba25
+++ b/hw/i386/trace-events
26ba25
@@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6
26ba25
 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
26ba25
 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
26ba25
 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
26ba25
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64
26ba25
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
26ba25
 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
26ba25
-vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
26ba25
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
26ba25
 vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
26ba25
 vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
26ba25
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
26ba25
index 156f35e..fbfedcb 100644
26ba25
--- a/include/hw/i386/intel_iommu.h
26ba25
+++ b/include/hw/i386/intel_iommu.h
26ba25
@@ -27,6 +27,7 @@
26ba25
 #include "hw/i386/ioapic.h"
26ba25
 #include "hw/pci/msi.h"
26ba25
 #include "hw/sysbus.h"
26ba25
+#include "qemu/iova-tree.h"
26ba25
 
26ba25
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
26ba25
 #define INTEL_IOMMU_DEVICE(obj) \
26ba25
@@ -95,6 +96,7 @@ struct VTDAddressSpace {
26ba25
     QLIST_ENTRY(VTDAddressSpace) next;
26ba25
     /* Superset of notifier flags that this address space has */
26ba25
     IOMMUNotifierFlag notifier_flags;
26ba25
+    IOVATree *iova_tree;          /* Traces mapped IOVA ranges */
26ba25
 };
26ba25
 
26ba25
 struct VTDBus {
26ba25
-- 
26ba25
1.8.3.1
26ba25