26ba25
From a09020ea2e2e645b95ed603e075938d413f1114f Mon Sep 17 00:00:00 2001
26ba25
From: Peter Xu <peterx@redhat.com>
26ba25
Date: Fri, 12 Oct 2018 07:58:38 +0100
26ba25
Subject: [PATCH 08/17] intel-iommu: send PSI always even if across PDEs
26ba25
26ba25
RH-Author: Peter Xu <peterx@redhat.com>
26ba25
Message-id: <20181012075846.25449-2-peterx@redhat.com>
26ba25
Patchwork-id: 82674
26ba25
O-Subject: [RHEL-8 qemu-kvm PATCH 1/9] intel-iommu: send PSI always even if across PDEs
26ba25
Bugzilla: 1450712
26ba25
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
26ba25
RH-Acked-by: Xiao Wang <jasowang@redhat.com>
26ba25
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
26ba25
26ba25
SECURITY IMPLICATION: without this patch, any guest with both assigned
26ba25
device and a vIOMMU might encounter stale IO page mappings even if guest
26ba25
has already unmapped the page, which may lead to guest memory
26ba25
corruption.  The stale mappings will only be limited to the guest's own
26ba25
memory range, so it should not affect the host memory or other guests on
26ba25
the host.
26ba25
26ba25
During IOVA page table walking, there is a special case when the PSI
26ba25
covers one whole PDE (Page Directory Entry, which contains 512 Page
26ba25
Table Entries) or more.  In the past, we skip that entry and we don't
26ba25
notify the IOMMU notifiers.  This is not correct.  We should send UNMAP
26ba25
notification to registered UNMAP notifiers in this case.
26ba25
26ba25
For UNMAP only notifiers, this might cause IOTLBs cached in the devices
26ba25
even if they were already invalid.  For MAP/UNMAP notifiers like
26ba25
vfio-pci, this will cause stale page mappings.
26ba25
26ba25
This special case doesn't trigger often, but it is very easy to be
26ba25
triggered by nested device assignments, since in that case we'll
26ba25
possibly map the whole L2 guest RAM region into the device's IOVA
26ba25
address space (several GBs at least), which is far bigger than normal
26ba25
kernel driver usages of the device (tens of MBs normally).
26ba25
26ba25
Without this patch applied to L1 QEMU, nested device assignment to L2
26ba25
guests will dump some errors like:
26ba25
26ba25
qemu-system-x86_64: VFIO_MAP_DMA: -17
26ba25
qemu-system-x86_64: vfio_dma_map(0x557305420c30, 0xad000, 0x1000,
26ba25
                    0x7f89a920d000) = -17 (File exists)
26ba25
26ba25
CC: QEMU Stable <qemu-stable@nongnu.org>
26ba25
Acked-by: Jason Wang <jasowang@redhat.com>
26ba25
[peterx: rewrite the commit message]
26ba25
Signed-off-by: Peter Xu <peterx@redhat.com>
26ba25
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
26ba25
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
26ba25
(cherry picked from commit 36d2d52bdb45f5b753a61fdaf0fe7891f1f5b61d)
26ba25
Signed-off-by: Peter Xu <peterx@redhat.com>
26ba25
26ba25
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
26ba25
---
26ba25
 hw/i386/intel_iommu.c | 42 ++++++++++++++++++++++++++++++------------
26ba25
 1 file changed, 30 insertions(+), 12 deletions(-)
26ba25
26ba25
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
26ba25
index fb31de9..b359efd 100644
26ba25
--- a/hw/i386/intel_iommu.c
26ba25
+++ b/hw/i386/intel_iommu.c
26ba25
@@ -722,6 +722,15 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
26ba25
 
26ba25
 typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
26ba25
 
26ba25
+static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
26ba25
+                             vtd_page_walk_hook hook_fn, void *private)
26ba25
+{
26ba25
+    assert(hook_fn);
26ba25
+    trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr,
26ba25
+                            entry->addr_mask, entry->perm);
26ba25
+    return hook_fn(entry, private);
26ba25
+}
26ba25
+
26ba25
 /**
26ba25
  * vtd_page_walk_level - walk over specific level for IOVA range
26ba25
  *
26ba25
@@ -781,28 +790,37 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
26ba25
          */
26ba25
         entry_valid = read_cur | write_cur;
26ba25
 
26ba25
+        entry.target_as = &address_space_memory;
26ba25
+        entry.iova = iova & subpage_mask;
26ba25
+        entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
26ba25
+        entry.addr_mask = ~subpage_mask;
26ba25
+
26ba25
         if (vtd_is_last_slpte(slpte, level)) {
26ba25
-            entry.target_as = &address_space_memory;
26ba25
-            entry.iova = iova & subpage_mask;
26ba25
             /* NOTE: this is only meaningful if entry_valid == true */
26ba25
             entry.translated_addr = vtd_get_slpte_addr(slpte, aw);
26ba25
-            entry.addr_mask = ~subpage_mask;
26ba25
-            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
26ba25
             if (!entry_valid && !notify_unmap) {
26ba25
                 trace_vtd_page_walk_skip_perm(iova, iova_next);
26ba25
                 goto next;
26ba25
             }
26ba25
-            trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
26ba25
-                                    entry.addr_mask, entry.perm);
26ba25
-            if (hook_fn) {
26ba25
-                ret = hook_fn(&entry, private);
26ba25
-                if (ret < 0) {
26ba25
-                    return ret;
26ba25
-                }
26ba25
+            ret = vtd_page_walk_one(&entry, level, hook_fn, private);
26ba25
+            if (ret < 0) {
26ba25
+                return ret;
26ba25
             }
26ba25
         } else {
26ba25
             if (!entry_valid) {
26ba25
-                trace_vtd_page_walk_skip_perm(iova, iova_next);
26ba25
+                if (notify_unmap) {
26ba25
+                    /*
26ba25
+                     * The whole entry is invalid; unmap it all.
26ba25
+                     * Translated address is meaningless, zero it.
26ba25
+                     */
26ba25
+                    entry.translated_addr = 0x0;
26ba25
+                    ret = vtd_page_walk_one(&entry, level, hook_fn, private);
26ba25
+                    if (ret < 0) {
26ba25
+                        return ret;
26ba25
+                    }
26ba25
+                } else {
26ba25
+                    trace_vtd_page_walk_skip_perm(iova, iova_next);
26ba25
+                }
26ba25
                 goto next;
26ba25
             }
26ba25
             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova,
26ba25
-- 
26ba25
1.8.3.1
26ba25