SOURCES/kvm-intel-iommu-rework-the-page-walk-logic.patch

From 0b6a29f743e4dcbf340d5abe0e7e230591b7d5d3 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 3 Sep 2018 04:52:41 +0200
Subject: [PATCH 26/29] intel-iommu: rework the page walk logic

RH-Author: Peter Xu <peterx@redhat.com>
Message-id: <20180903045241.6456-10-peterx@redhat.com>
Patchwork-id: 82029
O-Subject: [RHEL-7.6 qemu-kvm-rhev PATCH 9/9] intel-iommu: rework the page walk logic
Bugzilla: 1623859
RH-Acked-by: Xiao Wang <jasowang@redhat.com>
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>

This patch fixes a small window during which the DMA page table might
be incomplete or invalid when the guest sends domain/context
invalidations for a device.  That window can cause random DMA errors
for assigned devices.

This is a major change to the VT-d shadow page walking logic.  It
includes, but is not limited to:

- For each VTDAddressSpace, we now maintain which IOVA ranges we have
  mapped and which we have not.  With that information, we only send
  MAP or UNMAP notifications when necessary: we don't send MAP notifies
  if we know we have already mapped the range, and we don't send UNMAP
  notifies if we know we never mapped the range at all (see the sketch
  after this list).

- Introduce the vtd_sync_shadow_page_table[_range] APIs so that we can
  call them from anywhere to resync the shadow page table for a device.

- When we receive a domain/context invalidation, we no longer run the
  replay logic; instead we use the new sync API to resync the whole
  shadow page table without unmapping the whole region.  After this
  change, we do the page walk only once per domain invalidation
  (before, it could run multiple times, depending on the number of
  notifiers per address space).

While at it, the page walking logic is also refactored to be simpler.

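To make the range tracking concrete, here is a minimal, self-contained
sketch of the idea (toy types and a fixed array stand in for QEMU's
IOVATree; all names here are hypothetical, not the patch's API):

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t iova;        /* guest IOVA the range starts at */
    uint64_t size;        /* length of the range in bytes */
    uint64_t translated;  /* address the range translates to */
    bool     in_use;      /* slot occupied? */
} Range;

#define MAX_RANGES 16
static Range tracked[MAX_RANGES];   /* what we believe is mapped */

static Range *find_tracked(uint64_t iova)
{
    for (int i = 0; i < MAX_RANGES; i++) {
        if (tracked[i].in_use && tracked[i].iova == iova) {
            return &tracked[i];
        }
    }
    return NULL;
}

static Range *find_free(void)
{
    for (int i = 0; i < MAX_RANGES; i++) {
        if (!tracked[i].in_use) {
            return &tracked[i];
        }
    }
    return NULL;
}

/* Forward MAP/UNMAP to the notifier only when the state really changes. */
static void sync_one(uint64_t iova, uint64_t size, uint64_t translated,
                     bool is_map)
{
    Range *old = find_tracked(iova);

    if (is_map) {
        if (old && old->size == size && old->translated == translated) {
            printf("skip MAP     iova=0x%" PRIx64 " (already mapped)\n", iova);
            return;
        }
        if (old) {
            /* Translation changed: emulate an UNMAP before re-mapping */
            printf("notify UNMAP iova=0x%" PRIx64 "\n", iova);
            old->in_use = false;
        }
        Range *slot = find_free();
        if (slot) {
            *slot = (Range){ iova, size, translated, true };
        }
        printf("notify MAP   iova=0x%" PRIx64 " -> 0x%" PRIx64 "\n",
               iova, translated);
    } else {
        if (!old) {
            printf("skip UNMAP   iova=0x%" PRIx64 " (never mapped)\n", iova);
            return;
        }
        old->in_use = false;
        printf("notify UNMAP iova=0x%" PRIx64 "\n", iova);
    }
}

int main(void)
{
    sync_one(0x1000, 0x1000, 0x40000, true);   /* notify MAP */
    sync_one(0x1000, 0x1000, 0x40000, true);   /* skip: same translation */
    sync_one(0x1000, 0x1000, 0x50000, true);   /* changed: UNMAP, then MAP */
    sync_one(0x2000, 0x1000, 0x0, false);      /* skip: never mapped */
    sync_one(0x1000, 0x1000, 0x0, false);      /* notify UNMAP */
    return 0;
}

The real code keys lookups by interval rather than by exact start
address and compares the whole DMAMap for equality, but the skip and
remap decisions are the same as in vtd_page_walk_one() below.
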
CC: QEMU Stable <qemu-stable@nongnu.org>
Reported-by: Jintack Lim <jintack@cs.columbia.edu>
Tested-by: Jintack Lim <jintack@cs.columbia.edu>
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit 63b88968f139b6a77f2f81e6f1eedf70c0170a85)
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
---
 hw/i386/intel_iommu.c         | 213 ++++++++++++++++++++++++++++++------------
 hw/i386/trace-events          |   3 +-
 include/hw/i386/intel_iommu.h |   2 +
 3 files changed, 159 insertions(+), 59 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 61bb3d3..b5a09b7 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -769,10 +769,77 @@ typedef struct {
 
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
 {
+    VTDAddressSpace *as = info->as;
     vtd_page_walk_hook hook_fn = info->hook_fn;
     void *private = info->private;
+    DMAMap target = {
+        .iova = entry->iova,
+        .size = entry->addr_mask,
+        .translated_addr = entry->translated_addr,
+        .perm = entry->perm,
+    };
+    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
+        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+        return 0;
+    }
 
     assert(hook_fn);
+
+    /* Update local IOVA mapped ranges */
+    if (entry->perm) {
+        if (mapped) {
+            /* If it's exactly the same translation, skip */
+            if (!memcmp(mapped, &target, sizeof(target))) {
+                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
+                                                 entry->translated_addr);
+                return 0;
+            } else {
+                /*
+                 * Translation changed.  Normally this should not
+                 * happen, but it can happen when with buggy guest
+                 * OSes.  Note that there will be a small window that
+                 * we don't have map at all.  But that's the best
+                 * effort we can do.  The ideal way to emulate this is
+                 * atomically modify the PTE to follow what has
+                 * changed, but we can't.  One example is that vfio
+                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
+                 * interface to modify a mapping (meanwhile it seems
+                 * meaningless to even provide one).  Anyway, let's
+                 * mark this as a TODO in case one day we'll have
+                 * a better solution.
+                 */
+                IOMMUAccessFlags cache_perm = entry->perm;
+                int ret;
+
+                /* Emulate an UNMAP */
+                entry->perm = IOMMU_NONE;
+                trace_vtd_page_walk_one(info->domain_id,
+                                        entry->iova,
+                                        entry->translated_addr,
+                                        entry->addr_mask,
+                                        entry->perm);
+                ret = hook_fn(entry, private);
+                if (ret) {
+                    return ret;
+                }
+                /* Drop any existing mapping */
+                iova_tree_remove(as->iova_tree, &target);
+                /* Recover the correct permission */
+                entry->perm = cache_perm;
+            }
+        }
+        iova_tree_insert(as->iova_tree, &target);
+    } else {
+        if (!mapped) {
+            /* Skip since we didn't map this range at all */
+            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+            return 0;
+        }
+        iova_tree_remove(as->iova_tree, &target);
+    }
+
     trace_vtd_page_walk_one(info->domain_id, entry->iova,
                             entry->translated_addr, entry->addr_mask,
                             entry->perm);
@@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
          */
         entry_valid = read_cur | write_cur;
 
-        entry.target_as = &address_space_memory;
-        entry.iova = iova & subpage_mask;
-        entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
-        entry.addr_mask = ~subpage_mask;
-
-        if (vtd_is_last_slpte(slpte, level)) {
-            /* NOTE: this is only meaningful if entry_valid == true */
-            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
-            if (!entry_valid && !info->notify_unmap) {
-                trace_vtd_page_walk_skip_perm(iova, iova_next);
-                goto next;
-            }
-            ret = vtd_page_walk_one(&entry, info);
-            if (ret < 0) {
-                return ret;
-            }
-        } else {
-            if (!entry_valid) {
-                if (info->notify_unmap) {
-                    /*
-                     * The whole entry is invalid; unmap it all.
-                     * Translated address is meaningless, zero it.
-                     */
-                    entry.translated_addr = 0x0;
-                    ret = vtd_page_walk_one(&entry, info);
-                    if (ret < 0) {
-                        return ret;
-                    }
-                } else {
-                    trace_vtd_page_walk_skip_perm(iova, iova_next);
-                }
-                goto next;
-            }
+        if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
+            /*
+             * This is a valid PDE (or even bigger than PDE).  We need
+             * to walk one further level.
+             */
             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
                                       iova, MIN(iova_next, end), level - 1,
                                       read_cur, write_cur, info);
-            if (ret < 0) {
-                return ret;
-            }
+        } else {
+            /*
+             * This means we are either:
+             *
+             * (1) the real page entry (either 4K page, or huge page)
+             * (2) the whole range is invalid
+             *
+             * In either case, we send an IOTLB notification down.
+             */
+            entry.target_as = &address_space_memory;
+            entry.iova = iova & subpage_mask;
+            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
+            entry.addr_mask = ~subpage_mask;
+            /* NOTE: this is only meaningful if entry_valid == true */
+            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
+            ret = vtd_page_walk_one(&entry, info);
+        }
+
+        if (ret < 0) {
+            return ret;
         }
 
 next:
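
The restructured walk above can be pictured as follows: recurse for a
valid intermediate entry, otherwise emit exactly one notification and
let vtd_page_walk_one() decide whether to forward it.  A toy model of
that shape (hypothetical types, fan-out of 4 instead of 512):

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FANOUT 4

typedef struct Node Node;
struct Node {
    bool valid;                 /* read or write permission present */
    bool leaf;                  /* a real (4K or huge) page entry */
    uint64_t target;            /* translated address, if a leaf */
    Node *children[FANOUT];     /* next-level table, if not a leaf */
};

static void notify(uint64_t iova, uint64_t size, bool valid)
{
    printf("%s iova=0x%" PRIx64 " size=0x%" PRIx64 "\n",
           valid ? "MAP  " : "UNMAP", iova, size);
}

static void walk_level(Node *table, uint64_t iova, uint64_t span)
{
    uint64_t child_span = span / FANOUT;

    for (int i = 0; i < FANOUT; i++) {
        Node *n = table->children[i];
        uint64_t child_iova = iova + i * child_span;

        if (n && n->valid && !n->leaf) {
            /* Valid intermediate entry: walk one further level */
            walk_level(n, child_iova, child_span);
        } else {
            /* Real page entry, or invalid range: notify either way */
            notify(child_iova, child_span, n && n->valid);
        }
    }
}

int main(void)
{
    Node leaf = { .valid = true, .leaf = true, .target = 0x40000 };
    Node mid  = { .valid = true };
    Node root = { .valid = true };

    mid.children[0] = &leaf;        /* one mapped page */
    root.children[0] = &mid;        /* everything else is invalid */

    walk_level(&root, 0, 0x10000);  /* toy 64K IOVA space */
    return 0;
}
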
@@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
     return 0;
 }
 
+static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
+                                     void *private)
+{
+    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
+    return 0;
+}
+
+/* If context entry is NULL, we'll try to fetch it on our own. */
+static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
+                                            VTDContextEntry *ce,
+                                            hwaddr addr, hwaddr size)
+{
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    vtd_page_walk_info info = {
+        .hook_fn = vtd_sync_shadow_page_hook,
+        .private = (void *)&vtd_as->iommu,
+        .notify_unmap = true,
+        .aw = s->aw_bits,
+        .as = vtd_as,
+    };
+    VTDContextEntry ce_cache;
+    int ret;
+
+    if (ce) {
+        /* If the caller provided context entry, use it */
+        ce_cache = *ce;
+    } else {
+        /* If the caller didn't provide ce, try to fetch */
+        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+                                       vtd_as->devfn, &ce_cache);
+        if (ret) {
+            /*
+             * This should not really happen, but in case it happens,
+             * we just skip the sync for this time.  After all we even
+             * don't have the root table pointer!
+             */
+            trace_vtd_err("Detected invalid context entry when "
+                          "trying to sync shadow page table");
+            return 0;
+        }
+    }
+
+    info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);
+
+    return vtd_page_walk(&ce_cache, addr, addr + size, &info);
+}
+
+static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
+{
+    return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
+}
+
 /*
  * Fetch translation type for specific device. Returns <0 if error
  * happens, otherwise return the shifted type to check against
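
The new sync APIs reuse the existing page walker through a hook: the
walker invokes hook_fn for every entry it decides to notify, the hook
forwards the entry to the IOMMU notifier, and a full-table resync is
simply the range walk over [0, UINT64_MAX].  A stripped-down sketch of
that callback shape (hypothetical types and names, not the QEMU
signatures):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t iova;
    uint64_t size;
    int      perm;              /* 0 means the range is invalid */
} TlbEntry;

typedef int (*WalkHook)(TlbEntry *entry, void *private);

typedef struct {
    WalkHook hook_fn;           /* called once per entry to notify */
    void    *private;           /* opaque pointer handed to the hook */
    int      notify_unmap;      /* deliver invalid ranges as UNMAPs? */
} WalkInfo;

/* Stand-in for the real walker: emit two made-up entries in range. */
static int page_walk(uint64_t start, uint64_t end, WalkInfo *info)
{
    TlbEntry entries[] = {
        { 0x1000, 0x1000, 3 },  /* a mapped range */
        { 0x2000, 0x1000, 0 },  /* an invalid range */
    };
    (void)start;
    (void)end;

    for (int i = 0; i < 2; i++) {
        if (entries[i].perm == 0 && !info->notify_unmap) {
            continue;           /* caller doesn't care about unmaps */
        }
        int ret = info->hook_fn(&entries[i], info->private);
        if (ret) {
            return ret;         /* propagate hook failure */
        }
    }
    return 0;
}

/* The shadow-sync hook: forward each entry to a notifier. */
static int sync_shadow_hook(TlbEntry *entry, void *private)
{
    (void)private;
    printf("notify %s iova=0x%" PRIx64 "\n",
           entry->perm ? "MAP" : "UNMAP", entry->iova);
    return 0;
}

/* Full-table resync: the range walk over the whole IOVA space. */
static int sync_shadow_page_table(void)
{
    WalkInfo info = { sync_shadow_hook, NULL, 1 };
    return page_walk(0, UINT64_MAX, &info);
}

int main(void)
{
    return sync_shadow_page_table();
}
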
@@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
     VTDAddressSpace *vtd_as;
 
     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
-        memory_region_iommu_replay_all(&vtd_as->iommu);
+        vtd_sync_shadow_page_table(vtd_as);
     }
 }
 
@@ -1371,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
                 vtd_switch_address_space(vtd_as);
                 /*
                  * So a device is moving out of (or moving into) a
-                 * domain, a replay() suites here to notify all the
-                 * IOMMU_NOTIFIER_MAP registers about this change.
+                 * domain, resync the shadow page table.
                  * This won't bring bad even if we have no such
                  * notifier registered - the IOMMU notification
                  * framework will skip MAP notifications if that
                  * happened.
                  */
-                memory_region_iommu_replay_all(&vtd_as->iommu);
+                vtd_sync_shadow_page_table(vtd_as);
             }
         }
     }
@@ -1436,18 +1543,11 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                       vtd_as->devfn, &ce) &&
             domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
-            memory_region_iommu_replay_all(&vtd_as->iommu);
+            vtd_sync_shadow_page_table(vtd_as);
         }
     }
 }
 
-static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
-                                           void *private)
-{
-    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
-    return 0;
-}
-
 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
                                            uint16_t domain_id, hwaddr addr,
                                            uint8_t am)
@@ -1462,21 +1562,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
                                        vtd_as->devfn, &ce);
         if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
             if (vtd_as_has_map_notifier(vtd_as)) {
-                vtd_page_walk_info info = {
-                    .hook_fn = vtd_page_invalidate_notify_hook,
-                    .private = (void *)&vtd_as->iommu,
-                    .notify_unmap = true,
-                    .aw = s->aw_bits,
-                    .as = vtd_as,
-                    .domain_id = domain_id,
-                };
-
                 /*
                  * As long as we have MAP notifications registered in
                  * any of our IOMMU notifiers, we need to sync the
                  * shadow page table.
                  */
-                vtd_page_walk(&ce, addr, addr + size, &info);
+                vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
             } else {
                 /*
                  * For UNMAP-only notifiers, we don't need to walk the
@@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
         vtd_dev_as->devfn = (uint8_t)devfn;
         vtd_dev_as->iommu_state = s;
         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
+        vtd_dev_as->iova_tree = iova_tree_new();
 
         /*
          * Memory region relationships looks like (Address range shows
@@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
     hwaddr start = n->start;
     hwaddr end = n->end;
     IntelIOMMUState *s = as->iommu_state;
+    DMAMap map;
 
     /*
      * Note: all the codes in this function has a assumption that IOVA
@@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
                              VTD_PCI_FUNC(as->devfn),
                              entry.iova, size);
 
+    map.iova = entry.iova;
+    map.size = entry.addr_mask;
+    iova_tree_remove(as->iova_tree, &map);
+
     memory_region_notify_one(n, &entry);
 }
 
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index ca23ba9..e14d06e 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6
 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
-vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
 vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
 vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 156f35e..fbfedcb 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -27,6 +27,7 @@
 #include "hw/i386/ioapic.h"
 #include "hw/pci/msi.h"
 #include "hw/sysbus.h"
+#include "qemu/iova-tree.h"
 
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
 #define INTEL_IOMMU_DEVICE(obj) \
@@ -95,6 +96,7 @@ struct VTDAddressSpace {
     QLIST_ENTRY(VTDAddressSpace) next;
     /* Superset of notifier flags that this address space has */
     IOMMUNotifierFlag notifier_flags;
+    IOVATree *iova_tree;          /* Traces mapped IOVA ranges */
 };
 
 struct VTDBus {
-- 
1.8.3.1