Blame SOURCES/kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch

ddf19c
From 70f8bbb27f9f357ea83ff6639fc00aa60fc902b9 Mon Sep 17 00:00:00 2001
ddf19c
From: "plai@redhat.com" <plai@redhat.com>
ddf19c
Date: Thu, 21 May 2020 23:56:47 +0100
ddf19c
Subject: [PATCH 04/12] numa: Extend CLI to provide initiator information for
ddf19c
 numa nodes
ddf19c
ddf19c
RH-Author: plai@redhat.com
ddf19c
Message-id: <20200521235655.27141-4-plai@redhat.com>
ddf19c
Patchwork-id: 96736
ddf19c
O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 03/11] numa: Extend CLI to provide initiator information for numa nodes
ddf19c
Bugzilla: 1600217
ddf19c
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
ddf19c
RH-Acked-by: Igor Mammedov <imammedo@redhat.com>
ddf19c
RH-Acked-by: Eduardo Habkost <ehabkost@redhat.com>
ddf19c
ddf19c
From: Tao Xu <tao3.xu@intel.com>
ddf19c
ddf19c
In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT),
ddf19c
the initiator represents a processor that accesses memory. In 5.2.27.3
ddf19c
Memory Proximity Domain Attributes Structure, the attached initiator is
ddf19c
defined as the location of the memory controller responsible for a memory proximity
ddf19c
domain. With attached initiator information, the topology of heterogeneous
ddf19c
memory can be described. Add new machine property 'hmat' to enable all
ddf19c
HMAT specific options.
ddf19c
ddf19c
Extend CLI of "-numa node" option to indicate the initiator numa node-id.
ddf19c
In the Linux kernel, the code in drivers/acpi/hmat/hmat.c parses and reports
ddf19c
the platform's HMAT tables. Before using initiator option, enable HMAT with
ddf19c
-machine hmat=on.
ddf19c
ddf19c
Acked-by: Markus Armbruster <armbru@redhat.com>
ddf19c
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
ddf19c
Reviewed-by: Jingqi Liu <jingqi.liu@intel.com>
ddf19c
Suggested-by: Dan Williams <dan.j.williams@intel.com>
ddf19c
Signed-off-by: Tao Xu <tao3.xu@intel.com>
ddf19c
Message-Id: <20191213011929.2520-2-tao3.xu@intel.com>
ddf19c
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
ddf19c
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
ddf19c
(cherry picked from commit 244b3f4485a07c7ce4b7123d6ce9d8c6012756e8)
ddf19c
Signed-off-by: Paul Lai <plai@redhat.com>
ddf19c
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
ddf19c
---
ddf19c
 hw/core/machine.c     | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++
ddf19c
 hw/core/numa.c        | 23 ++++++++++++++++++
ddf19c
 include/sysemu/numa.h |  5 ++++
ddf19c
 qapi/machine.json     | 10 +++++++-
ddf19c
 qemu-options.hx       | 35 ++++++++++++++++++++++++----
ddf19c
 5 files changed, 131 insertions(+), 6 deletions(-)
ddf19c
ddf19c
diff --git a/hw/core/machine.c b/hw/core/machine.c
ddf19c
index 19c78c6..cb21ae1 100644
ddf19c
--- a/hw/core/machine.c
ddf19c
+++ b/hw/core/machine.c
ddf19c
@@ -688,6 +688,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp)
ddf19c
     ms->nvdimms_state->is_enabled = value;
ddf19c
 }
ddf19c
 
ddf19c
+static bool machine_get_hmat(Object *obj, Error **errp)
ddf19c
+{
ddf19c
+    MachineState *ms = MACHINE(obj);
ddf19c
+
ddf19c
+    return ms->numa_state->hmat_enabled;
ddf19c
+}
ddf19c
+
ddf19c
+static void machine_set_hmat(Object *obj, bool value, Error **errp)
ddf19c
+{
ddf19c
+    MachineState *ms = MACHINE(obj);
ddf19c
+
ddf19c
+    ms->numa_state->hmat_enabled = value;
ddf19c
+}
ddf19c
+
ddf19c
 static char *machine_get_nvdimm_persistence(Object *obj, Error **errp)
ddf19c
 {
ddf19c
     MachineState *ms = MACHINE(obj);
ddf19c
@@ -815,6 +829,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
ddf19c
                                const CpuInstanceProperties *props, Error **errp)
ddf19c
 {
ddf19c
     MachineClass *mc = MACHINE_GET_CLASS(machine);
ddf19c
+    NodeInfo *numa_info = machine->numa_state->nodes;
ddf19c
     bool match = false;
ddf19c
     int i;
ddf19c
 
ddf19c
@@ -884,6 +899,17 @@ void machine_set_cpu_numa_node(MachineState *machine,
ddf19c
         match = true;
ddf19c
         slot->props.node_id = props->node_id;
ddf19c
         slot->props.has_node_id = props->has_node_id;
ddf19c
+
ddf19c
+        if (machine->numa_state->hmat_enabled) {
ddf19c
+            if ((numa_info[props->node_id].initiator < MAX_NODES) &&
ddf19c
+                (props->node_id != numa_info[props->node_id].initiator)) {
ddf19c
+                error_setg(errp, "The initiator of CPU NUMA node %" PRId64
ddf19c
+                        " should be itself", props->node_id);
ddf19c
+                return;
ddf19c
+            }
ddf19c
+            numa_info[props->node_id].has_cpu = true;
ddf19c
+            numa_info[props->node_id].initiator = props->node_id;
ddf19c
+        }
ddf19c
     }
ddf19c
 
ddf19c
     if (!match) {
ddf19c
@@ -1130,6 +1156,13 @@ static void machine_initfn(Object *obj)
ddf19c
 
ddf19c
     if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) {
ddf19c
         ms->numa_state = g_new0(NumaState, 1);
ddf19c
+        object_property_add_bool(obj, "hmat",
ddf19c
+                                 machine_get_hmat, machine_set_hmat,
ddf19c
+                                 &error_abort);
ddf19c
+        object_property_set_description(obj, "hmat",
ddf19c
+                                        "Set on/off to enable/disable "
ddf19c
+                                        "ACPI Heterogeneous Memory Attribute "
ddf19c
+                                        "Table (HMAT)", NULL);
ddf19c
     }
ddf19c
 
ddf19c
     /* Register notifier when init is done for sysbus sanity checks */
ddf19c
@@ -1218,6 +1251,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu)
ddf19c
     return g_string_free(s, false);
ddf19c
 }
ddf19c
 
ddf19c
+static void numa_validate_initiator(NumaState *numa_state)
ddf19c
+{
ddf19c
+    int i;
ddf19c
+    NodeInfo *numa_info = numa_state->nodes;
ddf19c
+
ddf19c
+    for (i = 0; i < numa_state->num_nodes; i++) {
ddf19c
+        if (numa_info[i].initiator == MAX_NODES) {
ddf19c
+            error_report("The initiator of NUMA node %d is missing, use "
ddf19c
+                         "'-numa node,initiator' option to declare it", i);
ddf19c
+            exit(1);
ddf19c
+        }
ddf19c
+
ddf19c
+        if (!numa_info[numa_info[i].initiator].present) {
ddf19c
+            error_report("NUMA node %" PRIu16 " is missing, use "
ddf19c
+                         "'-numa node' option to declare it first",
ddf19c
+                         numa_info[i].initiator);
ddf19c
+            exit(1);
ddf19c
+        }
ddf19c
+
ddf19c
+        if (!numa_info[numa_info[i].initiator].has_cpu) {
ddf19c
+            error_report("The initiator of NUMA node %d is invalid", i);
ddf19c
+            exit(1);
ddf19c
+        }
ddf19c
+    }
ddf19c
+}
ddf19c
+
ddf19c
 static void machine_numa_finish_cpu_init(MachineState *machine)
ddf19c
 {
ddf19c
     int i;
ddf19c
@@ -1258,6 +1317,11 @@ static void machine_numa_finish_cpu_init(MachineState *machine)
ddf19c
             machine_set_cpu_numa_node(machine, &props, &error_fatal);
ddf19c
         }
ddf19c
     }
ddf19c
+
ddf19c
+    if (machine->numa_state->hmat_enabled) {
ddf19c
+        numa_validate_initiator(machine->numa_state);
ddf19c
+    }
ddf19c
+
ddf19c
     if (s->len && !qtest_enabled()) {
ddf19c
         warn_report("CPU(s) not present in any NUMA nodes: %s",
ddf19c
                     s->str);
ddf19c
diff --git a/hw/core/numa.c b/hw/core/numa.c
ddf19c
index 19f082d..a07eef9 100644
ddf19c
--- a/hw/core/numa.c
ddf19c
+++ b/hw/core/numa.c
ddf19c
@@ -129,6 +129,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
ddf19c
         numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
ddf19c
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
ddf19c
     }
ddf19c
+
ddf19c
+    /*
ddf19c
+     * If the initiator is not set, set it to MAX_NODES. And if
ddf19c
+     * HMAT is enabled and this node has no cpus, QEMU will raise error.
ddf19c
+     */
ddf19c
+    numa_info[nodenr].initiator = MAX_NODES;
ddf19c
+    if (node->has_initiator) {
ddf19c
+        if (!ms->numa_state->hmat_enabled) {
ddf19c
+            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
ddf19c
+                       "(HMAT) is disabled, enable it with -machine hmat=on "
ddf19c
+                       "before using any of hmat specific options");
ddf19c
+            return;
ddf19c
+        }
ddf19c
+
ddf19c
+        if (node->initiator >= MAX_NODES) {
ddf19c
+            error_report("The initiator id %" PRIu16 " expects an integer "
ddf19c
+                         "between 0 and %d", node->initiator,
ddf19c
+                         MAX_NODES - 1);
ddf19c
+            return;
ddf19c
+        }
ddf19c
+
ddf19c
+        numa_info[nodenr].initiator = node->initiator;
ddf19c
+    }
ddf19c
     numa_info[nodenr].present = true;
ddf19c
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
ddf19c
     ms->numa_state->num_nodes++;
ddf19c
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
ddf19c
index ae9c41d..788cbec 100644
ddf19c
--- a/include/sysemu/numa.h
ddf19c
+++ b/include/sysemu/numa.h
ddf19c
@@ -18,6 +18,8 @@ struct NodeInfo {
ddf19c
     uint64_t node_mem;
ddf19c
     struct HostMemoryBackend *node_memdev;
ddf19c
     bool present;
ddf19c
+    bool has_cpu;
ddf19c
+    uint16_t initiator;
ddf19c
     uint8_t distance[MAX_NODES];
ddf19c
 };
ddf19c
 
ddf19c
@@ -33,6 +35,9 @@ struct NumaState {
ddf19c
     /* Allow setting NUMA distance for different NUMA nodes */
ddf19c
     bool have_numa_distance;
ddf19c
 
ddf19c
+    /* Detect if HMAT support is enabled. */
ddf19c
+    bool hmat_enabled;
ddf19c
+
ddf19c
     /* NUMA nodes information */
ddf19c
     NodeInfo nodes[MAX_NODES];
ddf19c
 };
ddf19c
diff --git a/qapi/machine.json b/qapi/machine.json
ddf19c
index ca26779..27d0e37 100644
ddf19c
--- a/qapi/machine.json
ddf19c
+++ b/qapi/machine.json
ddf19c
@@ -463,6 +463,13 @@
ddf19c
 # @memdev: memory backend object.  If specified for one node,
ddf19c
 #          it must be specified for all nodes.
ddf19c
 #
ddf19c
+# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145,
ddf19c
+#             points to the nodeid which has the memory controller
ddf19c
+#             responsible for this NUMA node. This field provides
ddf19c
+#             additional information as to the initiator node that
ddf19c
+#             is closest (as in directly attached) to this node, and
ddf19c
+#             therefore has the best performance (since 5.0)
ddf19c
+#
ddf19c
 # Since: 2.1
ddf19c
 ##
ddf19c
 { 'struct': 'NumaNodeOptions',
ddf19c
@@ -470,7 +477,8 @@
ddf19c
    '*nodeid': 'uint16',
ddf19c
    '*cpus':   ['uint16'],
ddf19c
    '*mem':    'size',
ddf19c
-   '*memdev': 'str' }}
ddf19c
+   '*memdev': 'str',
ddf19c
+   '*initiator': 'uint16' }}
ddf19c
 
ddf19c
 ##
ddf19c
 # @NumaDistOptions:
ddf19c
diff --git a/qemu-options.hx b/qemu-options.hx
ddf19c
index df1d27b..e2ce754 100644
ddf19c
--- a/qemu-options.hx
ddf19c
+++ b/qemu-options.hx
ddf19c
@@ -43,7 +43,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
ddf19c
     "                suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
ddf19c
     "                nvdimm=on|off controls NVDIMM support (default=off)\n"
ddf19c
     "                enforce-config-section=on|off enforce configuration section migration (default=off)\n"
ddf19c
-    "                memory-encryption=@var{} memory encryption object to use (default=none)\n",
ddf19c
+    "                memory-encryption=@var{} memory encryption object to use (default=none)\n"
ddf19c
+    "                hmat=on|off controls ACPI HMAT support (default=off)\n",
ddf19c
     QEMU_ARCH_ALL)
ddf19c
 STEXI
ddf19c
 @item -machine [type=]@var{name}[,prop=@var{value}[,...]]
ddf19c
@@ -103,6 +104,9 @@ NOTE: this parameter is deprecated. Please use @option{-global}
ddf19c
 @option{migration.send-configuration}=@var{on|off} instead.
ddf19c
 @item memory-encryption=@var{}
ddf19c
 Memory encryption object to use. The default is none.
ddf19c
+@item hmat=on|off
ddf19c
+Enables or disables ACPI Heterogeneous Memory Attribute Table (HMAT) support.
ddf19c
+The default is off.
ddf19c
 @end table
ddf19c
 ETEXI
ddf19c
 
ddf19c
@@ -161,14 +165,14 @@ If any on the three values is given, the total number of CPUs @var{n} can be omi
ddf19c
 ETEXI
ddf19c
 
ddf19c
 DEF("numa", HAS_ARG, QEMU_OPTION_numa,
ddf19c
-    "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
ddf19c
-    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
ddf19c
+    "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
ddf19c
+    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
ddf19c
     "-numa dist,src=source,dst=destination,val=distance\n"
ddf19c
     "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n",
ddf19c
     QEMU_ARCH_ALL)
ddf19c
 STEXI
ddf19c
-@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
ddf19c
-@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
ddf19c
+@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
ddf19c
+@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
ddf19c
 @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
ddf19c
 @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
ddf19c
 @findex -numa
ddf19c
@@ -215,6 +219,27 @@ split equally between them.
ddf19c
 @samp{mem} and @samp{memdev} are mutually exclusive. Furthermore,
ddf19c
 if one node uses @samp{memdev}, all of them have to use it.
ddf19c
 
ddf19c
+@samp{initiator} is an additional option that points to an @var{initiator}
ddf19c
+NUMA node that has best performance (the lowest latency or largest bandwidth)
ddf19c
+to this NUMA @var{node}. Note that this option can be set only when
ddf19c
+the machine property 'hmat' is set to 'on'.
ddf19c
+
ddf19c
+The following example creates a machine with 2 NUMA nodes: node 0 has CPUs;
ddf19c
+node 1 has only memory, and its initiator is node 0. Note that because
ddf19c
+node 0 has CPU, by default the initiator of node 0 is itself and must be
ddf19c
+itself.
ddf19c
+@example
ddf19c
+-machine hmat=on \
ddf19c
+-m 2G,slots=2,maxmem=4G \
ddf19c
+-object memory-backend-ram,size=1G,id=m0 \
ddf19c
+-object memory-backend-ram,size=1G,id=m1 \
ddf19c
+-numa node,nodeid=0,memdev=m0 \
ddf19c
+-numa node,nodeid=1,memdev=m1,initiator=0 \
ddf19c
+-smp 2,sockets=2,maxcpus=2  \
ddf19c
+-numa cpu,node-id=0,socket-id=0 \
ddf19c
+-numa cpu,node-id=0,socket-id=1
ddf19c
+@end example
ddf19c
+
ddf19c
 @var{source} and @var{destination} are NUMA node IDs.
ddf19c
 @var{distance} is the NUMA distance from @var{source} to @var{destination}.
ddf19c
 The distance from a node to itself is always 10. If any pair of nodes is
ddf19c
-- 
ddf19c
1.8.3.1
ddf19c