Blame SOURCES/kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch

77c23f
From 70f8bbb27f9f357ea83ff6639fc00aa60fc902b9 Mon Sep 17 00:00:00 2001
77c23f
From: "plai@redhat.com" <plai@redhat.com>
77c23f
Date: Thu, 21 May 2020 23:56:47 +0100
77c23f
Subject: [PATCH 04/12] numa: Extend CLI to provide initiator information for
77c23f
 numa nodes
77c23f
77c23f
RH-Author: plai@redhat.com
77c23f
Message-id: <20200521235655.27141-4-plai@redhat.com>
77c23f
Patchwork-id: 96736
77c23f
O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 03/11] numa: Extend CLI to provide initiator information for numa nodes
77c23f
Bugzilla: 1600217
77c23f
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
77c23f
RH-Acked-by: Igor Mammedov <imammedo@redhat.com>
77c23f
RH-Acked-by: Eduardo Habkost <ehabkost@redhat.com>
77c23f
77c23f
From: Tao Xu <tao3.xu@intel.com>
77c23f
77c23f
In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT),
77c23f
the initiator represents a processor which accesses memory. In 5.2.27.3
77c23f
Memory Proximity Domain Attributes Structure, the attached initiator is
77c23f
defined as the location of the memory controller responsible for a memory
77c23f
proximity domain. With attached initiator information, the topology of
77c23f
heterogeneous memory can be described. Add new machine property 'hmat' to
77c23f
enable all HMAT specific options.
77c23f
77c23f
Extend the CLI of the "-numa node" option to indicate the initiator NUMA
77c23f
node-id. In the Linux kernel, the code in drivers/acpi/hmat/hmat.c parses and
77c23f
reports the platform's HMAT tables. Before using the initiator option, enable
77c23f
HMAT with -machine hmat=on.
77c23f
77c23f
Acked-by: Markus Armbruster <armbru@redhat.com>
77c23f
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
77c23f
Reviewed-by: Jingqi Liu <jingqi.liu@intel.com>
77c23f
Suggested-by: Dan Williams <dan.j.williams@intel.com>
77c23f
Signed-off-by: Tao Xu <tao3.xu@intel.com>
77c23f
Message-Id: <20191213011929.2520-2-tao3.xu@intel.com>
77c23f
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
77c23f
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
77c23f
(cherry picked from commit 244b3f4485a07c7ce4b7123d6ce9d8c6012756e8)
77c23f
Signed-off-by: Paul Lai <plai@redhat.com>
77c23f
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
77c23f
---
77c23f
 hw/core/machine.c     | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++
77c23f
 hw/core/numa.c        | 23 ++++++++++++++++++
77c23f
 include/sysemu/numa.h |  5 ++++
77c23f
 qapi/machine.json     | 10 +++++++-
77c23f
 qemu-options.hx       | 35 ++++++++++++++++++++++++----
77c23f
 5 files changed, 131 insertions(+), 6 deletions(-)
77c23f
77c23f
diff --git a/hw/core/machine.c b/hw/core/machine.c
77c23f
index 19c78c6..cb21ae1 100644
77c23f
--- a/hw/core/machine.c
77c23f
+++ b/hw/core/machine.c
77c23f
@@ -688,6 +688,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp)
77c23f
     ms->nvdimms_state->is_enabled = value;
77c23f
 }
77c23f
 
77c23f
+static bool machine_get_hmat(Object *obj, Error **errp)
77c23f
+{
77c23f
+    MachineState *ms = MACHINE(obj);
77c23f
+
77c23f
+    return ms->numa_state->hmat_enabled;
77c23f
+}
77c23f
+
77c23f
+static void machine_set_hmat(Object *obj, bool value, Error **errp)
77c23f
+{
77c23f
+    MachineState *ms = MACHINE(obj);
77c23f
+
77c23f
+    ms->numa_state->hmat_enabled = value;
77c23f
+}
77c23f
+
77c23f
 static char *machine_get_nvdimm_persistence(Object *obj, Error **errp)
77c23f
 {
77c23f
     MachineState *ms = MACHINE(obj);
77c23f
@@ -815,6 +829,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
77c23f
                                const CpuInstanceProperties *props, Error **errp)
77c23f
 {
77c23f
     MachineClass *mc = MACHINE_GET_CLASS(machine);
77c23f
+    NodeInfo *numa_info = machine->numa_state->nodes;
77c23f
     bool match = false;
77c23f
     int i;
77c23f
 
77c23f
@@ -884,6 +899,17 @@ void machine_set_cpu_numa_node(MachineState *machine,
77c23f
         match = true;
77c23f
         slot->props.node_id = props->node_id;
77c23f
         slot->props.has_node_id = props->has_node_id;
77c23f
+
77c23f
+        if (machine->numa_state->hmat_enabled) {
77c23f
+            if ((numa_info[props->node_id].initiator < MAX_NODES) &&
77c23f
+                (props->node_id != numa_info[props->node_id].initiator)) {
77c23f
+                error_setg(errp, "The initiator of CPU NUMA node %" PRId64
77c23f
+                        " should be itself", props->node_id);
77c23f
+                return;
77c23f
+            }
77c23f
+            numa_info[props->node_id].has_cpu = true;
77c23f
+            numa_info[props->node_id].initiator = props->node_id;
77c23f
+        }
77c23f
     }
77c23f
 
77c23f
     if (!match) {
77c23f
@@ -1130,6 +1156,13 @@ static void machine_initfn(Object *obj)
77c23f
 
77c23f
     if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) {
77c23f
         ms->numa_state = g_new0(NumaState, 1);
77c23f
+        object_property_add_bool(obj, "hmat",
77c23f
+                                 machine_get_hmat, machine_set_hmat,
77c23f
+                                 &error_abort);
77c23f
+        object_property_set_description(obj, "hmat",
77c23f
+                                        "Set on/off to enable/disable "
77c23f
+                                        "ACPI Heterogeneous Memory Attribute "
77c23f
+                                        "Table (HMAT)", NULL);
77c23f
     }
77c23f
 
77c23f
     /* Register notifier when init is done for sysbus sanity checks */
77c23f
@@ -1218,6 +1251,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu)
77c23f
     return g_string_free(s, false);
77c23f
 }
77c23f
 
77c23f
+static void numa_validate_initiator(NumaState *numa_state)
77c23f
+{
77c23f
+    int i;
77c23f
+    NodeInfo *numa_info = numa_state->nodes;
77c23f
+
77c23f
+    for (i = 0; i < numa_state->num_nodes; i++) {
77c23f
+        if (numa_info[i].initiator == MAX_NODES) {
77c23f
+            error_report("The initiator of NUMA node %d is missing, use "
77c23f
+                         "'-numa node,initiator' option to declare it", i);
77c23f
+            exit(1);
77c23f
+        }
77c23f
+
77c23f
+        if (!numa_info[numa_info[i].initiator].present) {
77c23f
+            error_report("NUMA node %" PRIu16 " is missing, use "
77c23f
+                         "'-numa node' option to declare it first",
77c23f
+                         numa_info[i].initiator);
77c23f
+            exit(1);
77c23f
+        }
77c23f
+
77c23f
+        if (!numa_info[numa_info[i].initiator].has_cpu) {
77c23f
+            error_report("The initiator of NUMA node %d is invalid", i);
77c23f
+            exit(1);
77c23f
+        }
77c23f
+    }
77c23f
+}
77c23f
+
77c23f
 static void machine_numa_finish_cpu_init(MachineState *machine)
77c23f
 {
77c23f
     int i;
77c23f
@@ -1258,6 +1317,11 @@ static void machine_numa_finish_cpu_init(MachineState *machine)
77c23f
             machine_set_cpu_numa_node(machine, &props, &error_fatal);
77c23f
         }
77c23f
     }
77c23f
+
77c23f
+    if (machine->numa_state->hmat_enabled) {
77c23f
+        numa_validate_initiator(machine->numa_state);
77c23f
+    }
77c23f
+
77c23f
     if (s->len && !qtest_enabled()) {
77c23f
         warn_report("CPU(s) not present in any NUMA nodes: %s",
77c23f
                     s->str);
77c23f
diff --git a/hw/core/numa.c b/hw/core/numa.c
77c23f
index 19f082d..a07eef9 100644
77c23f
--- a/hw/core/numa.c
77c23f
+++ b/hw/core/numa.c
77c23f
@@ -129,6 +129,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
77c23f
         numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
77c23f
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
77c23f
     }
77c23f
+
77c23f
+    /*
77c23f
+     * If the initiator is not set, default it to MAX_NODES. If
77c23f
+     * HMAT is enabled and this node has no CPUs, QEMU will raise an error.
77c23f
+     */
77c23f
+    numa_info[nodenr].initiator = MAX_NODES;
77c23f
+    if (node->has_initiator) {
77c23f
+        if (!ms->numa_state->hmat_enabled) {
77c23f
+            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
77c23f
+                       "(HMAT) is disabled, enable it with -machine hmat=on "
77c23f
+                       "before using any of hmat specific options");
77c23f
+            return;
77c23f
+        }
77c23f
+
77c23f
+        if (node->initiator >= MAX_NODES) {
77c23f
+            error_report("The initiator id %" PRIu16 " expects an integer "
77c23f
+                         "between 0 and %d", node->initiator,
77c23f
+                         MAX_NODES - 1);
77c23f
+            return;
77c23f
+        }
77c23f
+
77c23f
+        numa_info[nodenr].initiator = node->initiator;
77c23f
+    }
77c23f
     numa_info[nodenr].present = true;
77c23f
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
77c23f
     ms->numa_state->num_nodes++;
77c23f
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
77c23f
index ae9c41d..788cbec 100644
77c23f
--- a/include/sysemu/numa.h
77c23f
+++ b/include/sysemu/numa.h
77c23f
@@ -18,6 +18,8 @@ struct NodeInfo {
77c23f
     uint64_t node_mem;
77c23f
     struct HostMemoryBackend *node_memdev;
77c23f
     bool present;
77c23f
+    bool has_cpu;
77c23f
+    uint16_t initiator;
77c23f
     uint8_t distance[MAX_NODES];
77c23f
 };
77c23f
 
77c23f
@@ -33,6 +35,9 @@ struct NumaState {
77c23f
     /* Allow setting NUMA distance for different NUMA nodes */
77c23f
     bool have_numa_distance;
77c23f
 
77c23f
+    /* Detect if HMAT support is enabled. */
77c23f
+    bool hmat_enabled;
77c23f
+
77c23f
     /* NUMA nodes information */
77c23f
     NodeInfo nodes[MAX_NODES];
77c23f
 };
77c23f
diff --git a/qapi/machine.json b/qapi/machine.json
77c23f
index ca26779..27d0e37 100644
77c23f
--- a/qapi/machine.json
77c23f
+++ b/qapi/machine.json
77c23f
@@ -463,6 +463,13 @@
77c23f
 # @memdev: memory backend object.  If specified for one node,
77c23f
 #          it must be specified for all nodes.
77c23f
 #
77c23f
+# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145,
77c23f
+#             points to the nodeid which has the memory controller
77c23f
+#             responsible for this NUMA node. This field provides
77c23f
+#             additional information as to the initiator node that
77c23f
+#             is closest (as in directly attached) to this node, and
77c23f
+#             therefore has the best performance (since 5.0)
77c23f
+#
77c23f
 # Since: 2.1
77c23f
 ##
77c23f
 { 'struct': 'NumaNodeOptions',
77c23f
@@ -470,7 +477,8 @@
77c23f
    '*nodeid': 'uint16',
77c23f
    '*cpus':   ['uint16'],
77c23f
    '*mem':    'size',
77c23f
-   '*memdev': 'str' }}
77c23f
+   '*memdev': 'str',
77c23f
+   '*initiator': 'uint16' }}
77c23f
 
77c23f
 ##
77c23f
 # @NumaDistOptions:
77c23f
diff --git a/qemu-options.hx b/qemu-options.hx
77c23f
index df1d27b..e2ce754 100644
77c23f
--- a/qemu-options.hx
77c23f
+++ b/qemu-options.hx
77c23f
@@ -43,7 +43,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
77c23f
     "                suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
77c23f
     "                nvdimm=on|off controls NVDIMM support (default=off)\n"
77c23f
     "                enforce-config-section=on|off enforce configuration section migration (default=off)\n"
77c23f
-    "                memory-encryption=@var{} memory encryption object to use (default=none)\n",
77c23f
+    "                memory-encryption=@var{} memory encryption object to use (default=none)\n"
77c23f
+    "                hmat=on|off controls ACPI HMAT support (default=off)\n",
77c23f
     QEMU_ARCH_ALL)
77c23f
 STEXI
77c23f
 @item -machine [type=]@var{name}[,prop=@var{value}[,...]]
77c23f
@@ -103,6 +104,9 @@ NOTE: this parameter is deprecated. Please use @option{-global}
77c23f
 @option{migration.send-configuration}=@var{on|off} instead.
77c23f
 @item memory-encryption=@var{}
77c23f
 Memory encryption object to use. The default is none.
77c23f
+@item hmat=on|off
77c23f
+Enables or disables ACPI Heterogeneous Memory Attribute Table (HMAT) support.
77c23f
+The default is off.
77c23f
 @end table
77c23f
 ETEXI
77c23f
 
77c23f
@@ -161,14 +165,14 @@ If any on the three values is given, the total number of CPUs @var{n} can be omi
77c23f
 ETEXI
77c23f
 
77c23f
 DEF("numa", HAS_ARG, QEMU_OPTION_numa,
77c23f
-    "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
77c23f
-    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
77c23f
+    "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
77c23f
+    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
77c23f
     "-numa dist,src=source,dst=destination,val=distance\n"
77c23f
     "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n",
77c23f
     QEMU_ARCH_ALL)
77c23f
 STEXI
77c23f
-@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
77c23f
-@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
77c23f
+@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
77c23f
+@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
77c23f
 @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
77c23f
 @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
77c23f
 @findex -numa
77c23f
@@ -215,6 +219,27 @@ split equally between them.
77c23f
 @samp{mem} and @samp{memdev} are mutually exclusive. Furthermore,
77c23f
 if one node uses @samp{memdev}, all of them have to use it.
77c23f
 
77c23f
+@samp{initiator} is an additional option that points to an @var{initiator}
77c23f
+NUMA node that has the best performance (lowest latency or largest bandwidth)
77c23f
+to this NUMA @var{node}. Note that this option can be set only when
77c23f
+the machine property 'hmat' is set to 'on'.
77c23f
+
77c23f
+The following example creates a machine with 2 NUMA nodes: node 0 has CPUs,
77c23f
+and node 1 has only memory, with node 0 as its initiator. Note that because
77c23f
+node 0 has CPUs, by default the initiator of node 0 is itself and must be
77c23f
+itself.
77c23f
+@example
77c23f
+-machine hmat=on \
77c23f
+-m 2G,slots=2,maxmem=4G \
77c23f
+-object memory-backend-ram,size=1G,id=m0 \
77c23f
+-object memory-backend-ram,size=1G,id=m1 \
77c23f
+-numa node,nodeid=0,memdev=m0 \
77c23f
+-numa node,nodeid=1,memdev=m1,initiator=0 \
77c23f
+-smp 2,sockets=2,maxcpus=2  \
77c23f
+-numa cpu,node-id=0,socket-id=0 \
77c23f
+-numa cpu,node-id=0,socket-id=1
77c23f
+@end example
77c23f
+
77c23f
 @var{source} and @var{destination} are NUMA node IDs.
77c23f
 @var{distance} is the NUMA distance from @var{source} to @var{destination}.
77c23f
 The distance from a node to itself is always 10. If any pair of nodes is
77c23f
-- 
77c23f
1.8.3.1
77c23f