Blame SOURCES/kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch

902636
From 70f8bbb27f9f357ea83ff6639fc00aa60fc902b9 Mon Sep 17 00:00:00 2001
902636
From: "plai@redhat.com" <plai@redhat.com>
902636
Date: Thu, 21 May 2020 23:56:47 +0100
902636
Subject: [PATCH 04/12] numa: Extend CLI to provide initiator information for
902636
 numa nodes
902636
902636
RH-Author: plai@redhat.com
902636
Message-id: <20200521235655.27141-4-plai@redhat.com>
902636
Patchwork-id: 96736
902636
O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 03/11] numa: Extend CLI to provide initiator information for numa nodes
902636
Bugzilla: 1600217
902636
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
902636
RH-Acked-by: Igor Mammedov <imammedo@redhat.com>
902636
RH-Acked-by: Eduardo Habkost <ehabkost@redhat.com>
902636
902636
From: Tao Xu <tao3.xu@intel.com>
902636
902636
In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT),
902636
the initiator represents a processor that accesses memory. And in 5.2.27.3
902636
Memory Proximity Domain Attributes Structure, the attached initiator is
902636
defined as the location of the memory controller responsible for a memory
902636
proximity domain. With attached initiator information, the topology of
902636
heterogeneous memory can be described. Add new machine property 'hmat' to
902636
enable all HMAT specific options.
902636
902636
Extend CLI of "-numa node" option to indicate the initiator numa node-id.
902636
In the Linux kernel, the code in drivers/acpi/hmat/hmat.c parses and reports
902636
the platform's HMAT tables. Before using the initiator option, enable HMAT with
902636
-machine hmat=on.
902636
902636
Acked-by: Markus Armbruster <armbru@redhat.com>
902636
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
902636
Reviewed-by: Jingqi Liu <jingqi.liu@intel.com>
902636
Suggested-by: Dan Williams <dan.j.williams@intel.com>
902636
Signed-off-by: Tao Xu <tao3.xu@intel.com>
902636
Message-Id: <20191213011929.2520-2-tao3.xu@intel.com>
902636
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
902636
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
902636
(cherry picked from commit 244b3f4485a07c7ce4b7123d6ce9d8c6012756e8)
902636
Signed-off-by: Paul Lai <plai@redhat.com>
902636
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
902636
---
902636
 hw/core/machine.c     | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++
902636
 hw/core/numa.c        | 23 ++++++++++++++++++
902636
 include/sysemu/numa.h |  5 ++++
902636
 qapi/machine.json     | 10 +++++++-
902636
 qemu-options.hx       | 35 ++++++++++++++++++++++++----
902636
 5 files changed, 131 insertions(+), 6 deletions(-)
902636
902636
diff --git a/hw/core/machine.c b/hw/core/machine.c
902636
index 19c78c6..cb21ae1 100644
902636
--- a/hw/core/machine.c
902636
+++ b/hw/core/machine.c
902636
@@ -688,6 +688,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp)
902636
     ms->nvdimms_state->is_enabled = value;
902636
 }
902636
 
902636
+static bool machine_get_hmat(Object *obj, Error **errp)
902636
+{
902636
+    MachineState *ms = MACHINE(obj);
902636
+
902636
+    return ms->numa_state->hmat_enabled;
902636
+}
902636
+
902636
+static void machine_set_hmat(Object *obj, bool value, Error **errp)
902636
+{
902636
+    MachineState *ms = MACHINE(obj);
902636
+
902636
+    ms->numa_state->hmat_enabled = value;
902636
+}
902636
+
902636
 static char *machine_get_nvdimm_persistence(Object *obj, Error **errp)
902636
 {
902636
     MachineState *ms = MACHINE(obj);
902636
@@ -815,6 +829,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
902636
                                const CpuInstanceProperties *props, Error **errp)
902636
 {
902636
     MachineClass *mc = MACHINE_GET_CLASS(machine);
902636
+    NodeInfo *numa_info = machine->numa_state->nodes;
902636
     bool match = false;
902636
     int i;
902636
 
902636
@@ -884,6 +899,17 @@ void machine_set_cpu_numa_node(MachineState *machine,
902636
         match = true;
902636
         slot->props.node_id = props->node_id;
902636
         slot->props.has_node_id = props->has_node_id;
902636
+
902636
+        if (machine->numa_state->hmat_enabled) {
902636
+            if ((numa_info[props->node_id].initiator < MAX_NODES) &&
902636
+                (props->node_id != numa_info[props->node_id].initiator)) {
902636
+                error_setg(errp, "The initiator of CPU NUMA node %" PRId64
902636
+                        " should be itself", props->node_id);
902636
+                return;
902636
+            }
902636
+            numa_info[props->node_id].has_cpu = true;
902636
+            numa_info[props->node_id].initiator = props->node_id;
902636
+        }
902636
     }
902636
 
902636
     if (!match) {
902636
@@ -1130,6 +1156,13 @@ static void machine_initfn(Object *obj)
902636
 
902636
     if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) {
902636
         ms->numa_state = g_new0(NumaState, 1);
902636
+        object_property_add_bool(obj, "hmat",
902636
+                                 machine_get_hmat, machine_set_hmat,
902636
+                                 &error_abort);
902636
+        object_property_set_description(obj, "hmat",
902636
+                                        "Set on/off to enable/disable "
902636
+                                        "ACPI Heterogeneous Memory Attribute "
902636
+                                        "Table (HMAT)", NULL);
902636
     }
902636
 
902636
     /* Register notifier when init is done for sysbus sanity checks */
902636
@@ -1218,6 +1251,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu)
902636
     return g_string_free(s, false);
902636
 }
902636
 
902636
+static void numa_validate_initiator(NumaState *numa_state)
902636
+{
902636
+    int i;
902636
+    NodeInfo *numa_info = numa_state->nodes;
902636
+
902636
+    for (i = 0; i < numa_state->num_nodes; i++) {
902636
+        if (numa_info[i].initiator == MAX_NODES) {
902636
+            error_report("The initiator of NUMA node %d is missing, use "
902636
+                         "'-numa node,initiator' option to declare it", i);
902636
+            exit(1);
902636
+        }
902636
+
902636
+        if (!numa_info[numa_info[i].initiator].present) {
902636
+            error_report("NUMA node %" PRIu16 " is missing, use "
902636
+                         "'-numa node' option to declare it first",
902636
+                         numa_info[i].initiator);
902636
+            exit(1);
902636
+        }
902636
+
902636
+        if (!numa_info[numa_info[i].initiator].has_cpu) {
902636
+            error_report("The initiator of NUMA node %d is invalid", i);
902636
+            exit(1);
902636
+        }
902636
+    }
902636
+}
902636
+
902636
 static void machine_numa_finish_cpu_init(MachineState *machine)
902636
 {
902636
     int i;
902636
@@ -1258,6 +1317,11 @@ static void machine_numa_finish_cpu_init(MachineState *machine)
902636
             machine_set_cpu_numa_node(machine, &props, &error_fatal);
902636
         }
902636
     }
902636
+
902636
+    if (machine->numa_state->hmat_enabled) {
902636
+        numa_validate_initiator(machine->numa_state);
902636
+    }
902636
+
902636
     if (s->len && !qtest_enabled()) {
902636
         warn_report("CPU(s) not present in any NUMA nodes: %s",
902636
                     s->str);
902636
diff --git a/hw/core/numa.c b/hw/core/numa.c
902636
index 19f082d..a07eef9 100644
902636
--- a/hw/core/numa.c
902636
+++ b/hw/core/numa.c
902636
@@ -129,6 +129,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
902636
         numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
902636
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
902636
     }
902636
+
902636
+    /*
902636
+     * If the initiator is not set, set it to MAX_NODES. And if
902636
+     * HMAT is enabled and this node has no CPUs, QEMU will raise an error.
902636
+     */
902636
+    numa_info[nodenr].initiator = MAX_NODES;
902636
+    if (node->has_initiator) {
902636
+        if (!ms->numa_state->hmat_enabled) {
902636
+            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
902636
+                       "(HMAT) is disabled, enable it with -machine hmat=on "
902636
+                       "before using any of hmat specific options");
902636
+            return;
902636
+        }
902636
+
902636
+        if (node->initiator >= MAX_NODES) {
902636
+            error_report("The initiator id %" PRIu16 " expects an integer "
902636
+                         "between 0 and %d", node->initiator,
902636
+                         MAX_NODES - 1);
902636
+            return;
902636
+        }
902636
+
902636
+        numa_info[nodenr].initiator = node->initiator;
902636
+    }
902636
     numa_info[nodenr].present = true;
902636
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
902636
     ms->numa_state->num_nodes++;
902636
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
902636
index ae9c41d..788cbec 100644
902636
--- a/include/sysemu/numa.h
902636
+++ b/include/sysemu/numa.h
902636
@@ -18,6 +18,8 @@ struct NodeInfo {
902636
     uint64_t node_mem;
902636
     struct HostMemoryBackend *node_memdev;
902636
     bool present;
902636
+    bool has_cpu;
902636
+    uint16_t initiator;
902636
     uint8_t distance[MAX_NODES];
902636
 };
902636
 
902636
@@ -33,6 +35,9 @@ struct NumaState {
902636
     /* Allow setting NUMA distance for different NUMA nodes */
902636
     bool have_numa_distance;
902636
 
902636
+    /* Detect if HMAT support is enabled. */
902636
+    bool hmat_enabled;
902636
+
902636
     /* NUMA nodes information */
902636
     NodeInfo nodes[MAX_NODES];
902636
 };
902636
diff --git a/qapi/machine.json b/qapi/machine.json
902636
index ca26779..27d0e37 100644
902636
--- a/qapi/machine.json
902636
+++ b/qapi/machine.json
902636
@@ -463,6 +463,13 @@
902636
 # @memdev: memory backend object.  If specified for one node,
902636
 #          it must be specified for all nodes.
902636
 #
902636
+# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145,
902636
+#             points to the nodeid which has the memory controller
902636
+#             responsible for this NUMA node. This field provides
902636
+#             additional information as to the initiator node that
902636
+#             is closest (as in directly attached) to this node, and
902636
+#             therefore has the best performance (since 5.0)
902636
+#
902636
 # Since: 2.1
902636
 ##
902636
 { 'struct': 'NumaNodeOptions',
902636
@@ -470,7 +477,8 @@
902636
    '*nodeid': 'uint16',
902636
    '*cpus':   ['uint16'],
902636
    '*mem':    'size',
902636
-   '*memdev': 'str' }}
902636
+   '*memdev': 'str',
902636
+   '*initiator': 'uint16' }}
902636
 
902636
 ##
902636
 # @NumaDistOptions:
902636
diff --git a/qemu-options.hx b/qemu-options.hx
902636
index df1d27b..e2ce754 100644
902636
--- a/qemu-options.hx
902636
+++ b/qemu-options.hx
902636
@@ -43,7 +43,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
902636
     "                suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
902636
     "                nvdimm=on|off controls NVDIMM support (default=off)\n"
902636
     "                enforce-config-section=on|off enforce configuration section migration (default=off)\n"
902636
-    "                memory-encryption=@var{} memory encryption object to use (default=none)\n",
902636
+    "                memory-encryption=@var{} memory encryption object to use (default=none)\n"
902636
+    "                hmat=on|off controls ACPI HMAT support (default=off)\n",
902636
     QEMU_ARCH_ALL)
902636
 STEXI
902636
 @item -machine [type=]@var{name}[,prop=@var{value}[,...]]
902636
@@ -103,6 +104,9 @@ NOTE: this parameter is deprecated. Please use @option{-global}
902636
 @option{migration.send-configuration}=@var{on|off} instead.
902636
 @item memory-encryption=@var{}
902636
 Memory encryption object to use. The default is none.
902636
+@item hmat=on|off
902636
+Enables or disables ACPI Heterogeneous Memory Attribute Table (HMAT) support.
902636
+The default is off.
902636
 @end table
902636
 ETEXI
902636
 
902636
@@ -161,14 +165,14 @@ If any on the three values is given, the total number of CPUs @var{n} can be omi
902636
 ETEXI
902636
 
902636
 DEF("numa", HAS_ARG, QEMU_OPTION_numa,
902636
-    "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
902636
-    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
902636
+    "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
902636
+    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
902636
     "-numa dist,src=source,dst=destination,val=distance\n"
902636
     "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n",
902636
     QEMU_ARCH_ALL)
902636
 STEXI
902636
-@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
902636
-@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
902636
+@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
902636
+@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
902636
 @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
902636
 @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
902636
 @findex -numa
902636
@@ -215,6 +219,27 @@ split equally between them.
902636
 @samp{mem} and @samp{memdev} are mutually exclusive. Furthermore,
902636
 if one node uses @samp{memdev}, all of them have to use it.
902636
 
902636
+@samp{initiator} is an additional option that points to an @var{initiator}
902636
+NUMA node that has the best performance (lowest latency or largest bandwidth)
902636
+to this NUMA @var{node}. Note that this option can be set only when
902636
+the machine property 'hmat' is set to 'on'.
902636
+
902636
+The following example creates a machine with 2 NUMA nodes: node 0 has CPUs,
902636
+node 1 has only memory, and its initiator is node 0. Note that because
902636
+node 0 has CPUs, by default the initiator of node 0 is itself and must be
902636
+itself.
902636
+@example
902636
+-machine hmat=on \
902636
+-m 2G,slots=2,maxmem=4G \
902636
+-object memory-backend-ram,size=1G,id=m0 \
902636
+-object memory-backend-ram,size=1G,id=m1 \
902636
+-numa node,nodeid=0,memdev=m0 \
902636
+-numa node,nodeid=1,memdev=m1,initiator=0 \
902636
+-smp 2,sockets=2,maxcpus=2  \
902636
+-numa cpu,node-id=0,socket-id=0 \
902636
+-numa cpu,node-id=0,socket-id=1
902636
+@end example
902636
+
902636
 @var{source} and @var{destination} are NUMA node IDs.
902636
 @var{distance} is the NUMA distance from @var{source} to @var{destination}.
902636
 The distance from a node to itself is always 10. If any pair of nodes is
902636
-- 
902636
1.8.3.1
902636