Blame SOURCES/kvm-intel-iommu-rework-the-page-walk-logic.patch

357786
From 0b6a29f743e4dcbf340d5abe0e7e230591b7d5d3 Mon Sep 17 00:00:00 2001
357786
From: Peter Xu <peterx@redhat.com>
357786
Date: Mon, 3 Sep 2018 04:52:41 +0200
357786
Subject: [PATCH 26/29] intel-iommu: rework the page walk logic
357786
357786
RH-Author: Peter Xu <peterx@redhat.com>
357786
Message-id: <20180903045241.6456-10-peterx@redhat.com>
357786
Patchwork-id: 82029
357786
O-Subject: [RHEL-7.6 qemu-kvm-rhev PATCH 9/9] intel-iommu: rework the page walk logic
357786
Bugzilla: 1623859
357786
RH-Acked-by: Xiao Wang <jasowang@redhat.com>
357786
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
357786
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
357786
357786
This patch fixes a potential small window that the DMA page table might
357786
be incomplete or invalid when the guest sends domain/context
357786
invalidations to a device.  This can cause random DMA errors for
357786
assigned devices.
357786
357786
This is a major change to the VT-d shadow page walking logic. It
357786
includes but is not limited to:
357786
357786
- For each VTDAddressSpace, now we maintain what IOVA ranges we have
357786
  mapped and what we have not.  With that information, now we only send
357786
  MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we
357786
  know we have already mapped the range, meanwhile we don't send UNMAP
357786
  notifies if we know we never mapped the range at all.
357786
357786
- Introduce vtd_sync_shadow_page_table[_range] APIs so that we can call
357786
  in any places to resync the shadow page table for a device.
357786
357786
- When we receive domain/context invalidation, we should not really run
357786
  the replay logic, instead we use the new sync shadow page table API to
357786
  resync the whole shadow page table without unmapping the whole
357786
  region.  After this change, we'll only do the page walk once for each
357786
  domain invalidations (before this, it can be multiple, depending on
357786
  number of notifiers per address space).
357786
357786
While at it, the page walking logic is also refactored to be simpler.
357786
357786
CC: QEMU Stable <qemu-stable@nongnu.org>
357786
Reported-by: Jintack Lim <jintack@cs.columbia.edu>
357786
Tested-by: Jintack Lim <jintack@cs.columbia.edu>
357786
Signed-off-by: Peter Xu <peterx@redhat.com>
357786
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
357786
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
357786
(cherry picked from commit 63b88968f139b6a77f2f81e6f1eedf70c0170a85)
357786
Signed-off-by: Peter Xu <peterx@redhat.com>
357786
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
357786
---
357786
 hw/i386/intel_iommu.c         | 213 ++++++++++++++++++++++++++++++------------
357786
 hw/i386/trace-events          |   3 +-
357786
 include/hw/i386/intel_iommu.h |   2 +
357786
 3 files changed, 159 insertions(+), 59 deletions(-)
357786
357786
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
357786
index 61bb3d3..b5a09b7 100644
357786
--- a/hw/i386/intel_iommu.c
357786
+++ b/hw/i386/intel_iommu.c
357786
@@ -769,10 +769,77 @@ typedef struct {
357786
 
357786
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
357786
 {
357786
+    VTDAddressSpace *as = info->as;
357786
     vtd_page_walk_hook hook_fn = info->hook_fn;
357786
     void *private = info->private;
357786
+    DMAMap target = {
357786
+        .iova = entry->iova,
357786
+        .size = entry->addr_mask,
357786
+        .translated_addr = entry->translated_addr,
357786
+        .perm = entry->perm,
357786
+    };
357786
+    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
357786
+
357786
+    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
357786
+        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
357786
+        return 0;
357786
+    }
357786
 
357786
     assert(hook_fn);
357786
+
357786
+    /* Update local IOVA mapped ranges */
357786
+    if (entry->perm) {
357786
+        if (mapped) {
357786
+            /* If it's exactly the same translation, skip */
357786
+            if (!memcmp(mapped, &target, sizeof(target))) {
357786
+                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
357786
+                                                 entry->translated_addr);
357786
+                return 0;
357786
+            } else {
357786
+                /*
357786
+                 * Translation changed.  Normally this should not
357786
+                 * happen, but it can happen when with buggy guest
357786
+                 * OSes.  Note that there will be a small window that
357786
+                 * we don't have map at all.  But that's the best
357786
+                 * effort we can do.  The ideal way to emulate this is
357786
+                 * atomically modify the PTE to follow what has
357786
+                 * changed, but we can't.  One example is that vfio
357786
+                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
357786
+                 * interface to modify a mapping (meanwhile it seems
357786
+                 * meaningless to even provide one).  Anyway, let's
357786
+                 * mark this as a TODO in case one day we'll have
357786
+                 * a better solution.
357786
+                 */
357786
+                IOMMUAccessFlags cache_perm = entry->perm;
357786
+                int ret;
357786
+
357786
+                /* Emulate an UNMAP */
357786
+                entry->perm = IOMMU_NONE;
357786
+                trace_vtd_page_walk_one(info->domain_id,
357786
+                                        entry->iova,
357786
+                                        entry->translated_addr,
357786
+                                        entry->addr_mask,
357786
+                                        entry->perm);
357786
+                ret = hook_fn(entry, private);
357786
+                if (ret) {
357786
+                    return ret;
357786
+                }
357786
+                /* Drop any existing mapping */
357786
+                iova_tree_remove(as->iova_tree, &target);
357786
+                /* Recover the correct permission */
357786
+                entry->perm = cache_perm;
357786
+            }
357786
+        }
357786
+        iova_tree_insert(as->iova_tree, &target);
357786
+    } else {
357786
+        if (!mapped) {
357786
+            /* Skip since we didn't map this range at all */
357786
+            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
357786
+            return 0;
357786
+        }
357786
+        iova_tree_remove(as->iova_tree, &target);
357786
+    }
357786
+
357786
     trace_vtd_page_walk_one(info->domain_id, entry->iova,
357786
                             entry->translated_addr, entry->addr_mask,
357786
                             entry->perm);
357786
@@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
357786
          */
357786
         entry_valid = read_cur | write_cur;
357786
 
357786
-        entry.target_as = &address_space_memory;
357786
-        entry.iova = iova & subpage_mask;
357786
-        entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
357786
-        entry.addr_mask = ~subpage_mask;
357786
-
357786
-        if (vtd_is_last_slpte(slpte, level)) {
357786
-            /* NOTE: this is only meaningful if entry_valid == true */
357786
-            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
357786
-            if (!entry_valid && !info->notify_unmap) {
357786
-                trace_vtd_page_walk_skip_perm(iova, iova_next);
357786
-                goto next;
357786
-            }
357786
-            ret = vtd_page_walk_one(&entry, info);
357786
-            if (ret < 0) {
357786
-                return ret;
357786
-            }
357786
-        } else {
357786
-            if (!entry_valid) {
357786
-                if (info->notify_unmap) {
357786
-                    /*
357786
-                     * The whole entry is invalid; unmap it all.
357786
-                     * Translated address is meaningless, zero it.
357786
-                     */
357786
-                    entry.translated_addr = 0x0;
357786
-                    ret = vtd_page_walk_one(&entry, info);
357786
-                    if (ret < 0) {
357786
-                        return ret;
357786
-                    }
357786
-                } else {
357786
-                    trace_vtd_page_walk_skip_perm(iova, iova_next);
357786
-                }
357786
-                goto next;
357786
-            }
357786
+        if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
357786
+            /*
357786
+             * This is a valid PDE (or even bigger than PDE).  We need
357786
+             * to walk one further level.
357786
+             */
357786
             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
357786
                                       iova, MIN(iova_next, end), level - 1,
357786
                                       read_cur, write_cur, info);
357786
-            if (ret < 0) {
357786
-                return ret;
357786
-            }
357786
+        } else {
357786
+            /*
357786
+             * This means we are either:
357786
+             *
357786
+             * (1) the real page entry (either 4K page, or huge page)
357786
+             * (2) the whole range is invalid
357786
+             *
357786
+             * In either case, we send an IOTLB notification down.
357786
+             */
357786
+            entry.target_as = &address_space_memory;
357786
+            entry.iova = iova & subpage_mask;
357786
+            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
357786
+            entry.addr_mask = ~subpage_mask;
357786
+            /* NOTE: this is only meaningful if entry_valid == true */
357786
+            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
357786
+            ret = vtd_page_walk_one(&entry, info);
357786
+        }
357786
+
357786
+        if (ret < 0) {
357786
+            return ret;
357786
         }
357786
 
357786
 next:
357786
@@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
357786
     return 0;
357786
 }
357786
 
357786
+static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
357786
+                                     void *private)
357786
+{
357786
+    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
357786
+    return 0;
357786
+}
357786
+
357786
+/* If context entry is NULL, we'll try to fetch it on our own. */
357786
+static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
357786
+                                            VTDContextEntry *ce,
357786
+                                            hwaddr addr, hwaddr size)
357786
+{
357786
+    IntelIOMMUState *s = vtd_as->iommu_state;
357786
+    vtd_page_walk_info info = {
357786
+        .hook_fn = vtd_sync_shadow_page_hook,
357786
+        .private = (void *)&vtd_as->iommu,
357786
+        .notify_unmap = true,
357786
+        .aw = s->aw_bits,
357786
+        .as = vtd_as,
357786
+    };
357786
+    VTDContextEntry ce_cache;
357786
+    int ret;
357786
+
357786
+    if (ce) {
357786
+        /* If the caller provided context entry, use it */
357786
+        ce_cache = *ce;
357786
+    } else {
357786
+        /* If the caller didn't provide ce, try to fetch */
357786
+        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
357786
+                                       vtd_as->devfn, &ce_cache);
357786
+        if (ret) {
357786
+            /*
357786
+             * This should not really happen, but in case it happens,
357786
+             * we just skip the sync for this time.  After all we even
357786
+             * don't have the root table pointer!
357786
+             */
357786
+            trace_vtd_err("Detected invalid context entry when "
357786
+                          "trying to sync shadow page table");
357786
+            return 0;
357786
+        }
357786
+    }
357786
+
357786
+    info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);
357786
+
357786
+    return vtd_page_walk(&ce_cache, addr, addr + size, &info;;
357786
+}
357786
+
357786
+static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
357786
+{
357786
+    return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
357786
+}
357786
+
357786
 /*
357786
  * Fetch translation type for specific device. Returns <0 if error
357786
  * happens, otherwise return the shifted type to check against
357786
@@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
357786
     VTDAddressSpace *vtd_as;
357786
 
357786
     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
357786
-        memory_region_iommu_replay_all(&vtd_as->iommu);
357786
+        vtd_sync_shadow_page_table(vtd_as);
357786
     }
357786
 }
357786
 
357786
@@ -1371,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
357786
                 vtd_switch_address_space(vtd_as);
357786
                 /*
357786
                  * So a device is moving out of (or moving into) a
357786
-                 * domain, a replay() suites here to notify all the
357786
-                 * IOMMU_NOTIFIER_MAP registers about this change.
357786
+                 * domain, resync the shadow page table.
357786
                  * This won't bring bad even if we have no such
357786
                  * notifier registered - the IOMMU notification
357786
                  * framework will skip MAP notifications if that
357786
                  * happened.
357786
                  */
357786
-                memory_region_iommu_replay_all(&vtd_as->iommu);
357786
+                vtd_sync_shadow_page_table(vtd_as);
357786
             }
357786
         }
357786
     }
357786
@@ -1436,18 +1543,11 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
357786
         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
357786
                                       vtd_as->devfn, &ce) &&
357786
             domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
357786
-            memory_region_iommu_replay_all(&vtd_as->iommu);
357786
+            vtd_sync_shadow_page_table(vtd_as);
357786
         }
357786
     }
357786
 }
357786
 
357786
-static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
357786
-                                           void *private)
357786
-{
357786
-    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
357786
-    return 0;
357786
-}
357786
-
357786
 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
357786
                                            uint16_t domain_id, hwaddr addr,
357786
                                            uint8_t am)
357786
@@ -1462,21 +1562,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
357786
                                        vtd_as->devfn, &ce);
357786
         if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
357786
             if (vtd_as_has_map_notifier(vtd_as)) {
357786
-                vtd_page_walk_info info = {
357786
-                    .hook_fn = vtd_page_invalidate_notify_hook,
357786
-                    .private = (void *)&vtd_as->iommu,
357786
-                    .notify_unmap = true,
357786
-                    .aw = s->aw_bits,
357786
-                    .as = vtd_as,
357786
-                    .domain_id = domain_id,
357786
-                };
357786
-
357786
                 /*
357786
                  * As long as we have MAP notifications registered in
357786
                  * any of our IOMMU notifiers, we need to sync the
357786
                  * shadow page table.
357786
                  */
357786
-                vtd_page_walk(&ce, addr, addr + size, &info;;
357786
+                vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
357786
             } else {
357786
                 /*
357786
                  * For UNMAP-only notifiers, we don't need to walk the
357786
@@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
357786
         vtd_dev_as->devfn = (uint8_t)devfn;
357786
         vtd_dev_as->iommu_state = s;
357786
         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
357786
+        vtd_dev_as->iova_tree = iova_tree_new();
357786
 
357786
         /*
357786
          * Memory region relationships looks like (Address range shows
357786
@@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
357786
     hwaddr start = n->start;
357786
     hwaddr end = n->end;
357786
     IntelIOMMUState *s = as->iommu_state;
357786
+    DMAMap map;
357786
 
357786
     /*
357786
      * Note: all the codes in this function has a assumption that IOVA
357786
@@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
357786
                              VTD_PCI_FUNC(as->devfn),
357786
                              entry.iova, size);
357786
 
357786
+    map.iova = entry.iova;
357786
+    map.size = entry.addr_mask;
357786
+    iova_tree_remove(as->iova_tree, &map);
357786
+
357786
     memory_region_notify_one(n, &entry);
357786
 }
357786
 
357786
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
357786
index ca23ba9..e14d06e 100644
357786
--- a/hw/i386/trace-events
357786
+++ b/hw/i386/trace-events
357786
@@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6
357786
 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
357786
 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
357786
 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
357786
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64
357786
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
357786
 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
357786
-vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
357786
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
357786
 vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
357786
 vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
357786
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
357786
index 156f35e..fbfedcb 100644
357786
--- a/include/hw/i386/intel_iommu.h
357786
+++ b/include/hw/i386/intel_iommu.h
357786
@@ -27,6 +27,7 @@
357786
 #include "hw/i386/ioapic.h"
357786
 #include "hw/pci/msi.h"
357786
 #include "hw/sysbus.h"
357786
+#include "qemu/iova-tree.h"
357786
 
357786
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
357786
 #define INTEL_IOMMU_DEVICE(obj) \
357786
@@ -95,6 +96,7 @@ struct VTDAddressSpace {
357786
     QLIST_ENTRY(VTDAddressSpace) next;
357786
     /* Superset of notifier flags that this address space has */
357786
     IOMMUNotifierFlag notifier_flags;
357786
+    IOVATree *iova_tree;          /* Traces mapped IOVA ranges */
357786
 };
357786
 
357786
 struct VTDBus {
357786
-- 
357786
1.8.3.1
357786