|
|
ddf19c |
From 70f8bbb27f9f357ea83ff6639fc00aa60fc902b9 Mon Sep 17 00:00:00 2001
|
|
|
ddf19c |
From: "plai@redhat.com" <plai@redhat.com>
|
|
|
ddf19c |
Date: Thu, 21 May 2020 23:56:47 +0100
|
|
|
ddf19c |
Subject: [PATCH 04/12] numa: Extend CLI to provide initiator information for
|
|
|
ddf19c |
numa nodes
|
|
|
ddf19c |
|
|
|
ddf19c |
RH-Author: plai@redhat.com
|
|
|
ddf19c |
Message-id: <20200521235655.27141-4-plai@redhat.com>
|
|
|
ddf19c |
Patchwork-id: 96736
|
|
|
ddf19c |
O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 03/11] numa: Extend CLI to provide initiator information for numa nodes
|
|
|
ddf19c |
Bugzilla: 1600217
|
|
|
ddf19c |
RH-Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
|
|
ddf19c |
RH-Acked-by: Igor Mammedov <imammedo@redhat.com>
|
|
|
ddf19c |
RH-Acked-by: Eduardo Habkost <ehabkost@redhat.com>
|
|
|
ddf19c |
|
|
|
ddf19c |
From: Tao Xu <tao3.xu@intel.com>
|
|
|
ddf19c |
|
|
|
ddf19c |
In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT),
|
|
|
ddf19c |
the initiator represents the processor that accesses memory. And in 5.2.27.3
|
|
|
ddf19c |
Memory Proximity Domain Attributes Structure, the attached initiator is
|
|
|
ddf19c |
defined as where the memory controller responsible for a memory proximity
|
|
|
ddf19c |
domain is located. With attached initiator information, the topology of heterogeneous
|
|
|
ddf19c |
memory can be described. Add new machine property 'hmat' to enable all
|
|
|
ddf19c |
HMAT specific options.
|
|
|
ddf19c |
|
|
|
ddf19c |
Extend CLI of "-numa node" option to indicate the initiator numa node-id.
|
|
|
ddf19c |
In the Linux kernel, the code in drivers/acpi/hmat/hmat.c parses and reports
|
|
|
ddf19c |
the platform's HMAT tables. Before using the initiator option, enable HMAT with
|
|
|
ddf19c |
-machine hmat=on.
|
|
|
ddf19c |
|
|
|
ddf19c |
Acked-by: Markus Armbruster <armbru@redhat.com>
|
|
|
ddf19c |
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
|
|
|
ddf19c |
Reviewed-by: Jingqi Liu <jingqi.liu@intel.com>
|
|
|
ddf19c |
Suggested-by: Dan Williams <dan.j.williams@intel.com>
|
|
|
ddf19c |
Signed-off-by: Tao Xu <tao3.xu@intel.com>
|
|
|
ddf19c |
Message-Id: <20191213011929.2520-2-tao3.xu@intel.com>
|
|
|
ddf19c |
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
|
|
ddf19c |
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
|
|
ddf19c |
(cherry picked from commit 244b3f4485a07c7ce4b7123d6ce9d8c6012756e8)
|
|
|
ddf19c |
Signed-off-by: Paul Lai <plai@redhat.com>
|
|
|
ddf19c |
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
|
|
|
ddf19c |
---
|
|
|
ddf19c |
hw/core/machine.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++
|
|
|
ddf19c |
hw/core/numa.c | 23 ++++++++++++++++++
|
|
|
ddf19c |
include/sysemu/numa.h | 5 ++++
|
|
|
ddf19c |
qapi/machine.json | 10 +++++++-
|
|
|
ddf19c |
qemu-options.hx | 35 ++++++++++++++++++++++++----
|
|
|
ddf19c |
5 files changed, 131 insertions(+), 6 deletions(-)
|
|
|
ddf19c |
|
|
|
ddf19c |
diff --git a/hw/core/machine.c b/hw/core/machine.c
|
|
|
ddf19c |
index 19c78c6..cb21ae1 100644
|
|
|
ddf19c |
--- a/hw/core/machine.c
|
|
|
ddf19c |
+++ b/hw/core/machine.c
|
|
|
ddf19c |
@@ -688,6 +688,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp)
|
|
|
ddf19c |
ms->nvdimms_state->is_enabled = value;
|
|
|
ddf19c |
}
|
|
|
ddf19c |
|
|
|
ddf19c |
+static bool machine_get_hmat(Object *obj, Error **errp)
|
|
|
ddf19c |
+{
|
|
|
ddf19c |
+ MachineState *ms = MACHINE(obj);
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+ return ms->numa_state->hmat_enabled;
|
|
|
ddf19c |
+}
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+static void machine_set_hmat(Object *obj, bool value, Error **errp)
|
|
|
ddf19c |
+{
|
|
|
ddf19c |
+ MachineState *ms = MACHINE(obj);
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+ ms->numa_state->hmat_enabled = value;
|
|
|
ddf19c |
+}
|
|
|
ddf19c |
+
|
|
|
ddf19c |
static char *machine_get_nvdimm_persistence(Object *obj, Error **errp)
|
|
|
ddf19c |
{
|
|
|
ddf19c |
MachineState *ms = MACHINE(obj);
|
|
|
ddf19c |
@@ -815,6 +829,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
|
|
|
ddf19c |
const CpuInstanceProperties *props, Error **errp)
|
|
|
ddf19c |
{
|
|
|
ddf19c |
MachineClass *mc = MACHINE_GET_CLASS(machine);
|
|
|
ddf19c |
+ NodeInfo *numa_info = machine->numa_state->nodes;
|
|
|
ddf19c |
bool match = false;
|
|
|
ddf19c |
int i;
|
|
|
ddf19c |
|
|
|
ddf19c |
@@ -884,6 +899,17 @@ void machine_set_cpu_numa_node(MachineState *machine,
|
|
|
ddf19c |
match = true;
|
|
|
ddf19c |
slot->props.node_id = props->node_id;
|
|
|
ddf19c |
slot->props.has_node_id = props->has_node_id;
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+ if (machine->numa_state->hmat_enabled) {
|
|
|
ddf19c |
+ if ((numa_info[props->node_id].initiator < MAX_NODES) &&
|
|
|
ddf19c |
+ (props->node_id != numa_info[props->node_id].initiator)) {
|
|
|
ddf19c |
+ error_setg(errp, "The initiator of CPU NUMA node %" PRId64
|
|
|
ddf19c |
+ " should be itself", props->node_id);
|
|
|
ddf19c |
+ return;
|
|
|
ddf19c |
+ }
|
|
|
ddf19c |
+ numa_info[props->node_id].has_cpu = true;
|
|
|
ddf19c |
+ numa_info[props->node_id].initiator = props->node_id;
|
|
|
ddf19c |
+ }
|
|
|
ddf19c |
}
|
|
|
ddf19c |
|
|
|
ddf19c |
if (!match) {
|
|
|
ddf19c |
@@ -1130,6 +1156,13 @@ static void machine_initfn(Object *obj)
|
|
|
ddf19c |
|
|
|
ddf19c |
if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) {
|
|
|
ddf19c |
ms->numa_state = g_new0(NumaState, 1);
|
|
|
ddf19c |
+ object_property_add_bool(obj, "hmat",
|
|
|
ddf19c |
+ machine_get_hmat, machine_set_hmat,
|
|
|
ddf19c |
+ &error_abort);
|
|
|
ddf19c |
+ object_property_set_description(obj, "hmat",
|
|
|
ddf19c |
+ "Set on/off to enable/disable "
|
|
|
ddf19c |
+ "ACPI Heterogeneous Memory Attribute "
|
|
|
ddf19c |
+ "Table (HMAT)", NULL);
|
|
|
ddf19c |
}
|
|
|
ddf19c |
|
|
|
ddf19c |
/* Register notifier when init is done for sysbus sanity checks */
|
|
|
ddf19c |
@@ -1218,6 +1251,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu)
|
|
|
ddf19c |
return g_string_free(s, false);
|
|
|
ddf19c |
}
|
|
|
ddf19c |
|
|
|
ddf19c |
+static void numa_validate_initiator(NumaState *numa_state)
|
|
|
ddf19c |
+{
|
|
|
ddf19c |
+ int i;
|
|
|
ddf19c |
+ NodeInfo *numa_info = numa_state->nodes;
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+ for (i = 0; i < numa_state->num_nodes; i++) {
|
|
|
ddf19c |
+ if (numa_info[i].initiator == MAX_NODES) {
|
|
|
ddf19c |
+ error_report("The initiator of NUMA node %d is missing, use "
|
|
|
ddf19c |
+ "'-numa node,initiator' option to declare it", i);
|
|
|
ddf19c |
+ exit(1);
|
|
|
ddf19c |
+ }
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+ if (!numa_info[numa_info[i].initiator].present) {
|
|
|
ddf19c |
+ error_report("NUMA node %" PRIu16 " is missing, use "
|
|
|
ddf19c |
+ "'-numa node' option to declare it first",
|
|
|
ddf19c |
+ numa_info[i].initiator);
|
|
|
ddf19c |
+ exit(1);
|
|
|
ddf19c |
+ }
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+ if (!numa_info[numa_info[i].initiator].has_cpu) {
|
|
|
ddf19c |
+ error_report("The initiator of NUMA node %d is invalid", i);
|
|
|
ddf19c |
+ exit(1);
|
|
|
ddf19c |
+ }
|
|
|
ddf19c |
+ }
|
|
|
ddf19c |
+}
|
|
|
ddf19c |
+
|
|
|
ddf19c |
static void machine_numa_finish_cpu_init(MachineState *machine)
|
|
|
ddf19c |
{
|
|
|
ddf19c |
int i;
|
|
|
ddf19c |
@@ -1258,6 +1317,11 @@ static void machine_numa_finish_cpu_init(MachineState *machine)
|
|
|
ddf19c |
machine_set_cpu_numa_node(machine, &props, &error_fatal);
|
|
|
ddf19c |
}
|
|
|
ddf19c |
}
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+ if (machine->numa_state->hmat_enabled) {
|
|
|
ddf19c |
+ numa_validate_initiator(machine->numa_state);
|
|
|
ddf19c |
+ }
|
|
|
ddf19c |
+
|
|
|
ddf19c |
if (s->len && !qtest_enabled()) {
|
|
|
ddf19c |
warn_report("CPU(s) not present in any NUMA nodes: %s",
|
|
|
ddf19c |
s->str);
|
|
|
ddf19c |
diff --git a/hw/core/numa.c b/hw/core/numa.c
|
|
|
ddf19c |
index 19f082d..a07eef9 100644
|
|
|
ddf19c |
--- a/hw/core/numa.c
|
|
|
ddf19c |
+++ b/hw/core/numa.c
|
|
|
ddf19c |
@@ -129,6 +129,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
|
|
|
ddf19c |
numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
|
|
|
ddf19c |
numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
|
|
|
ddf19c |
}
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+ /*
|
|
|
ddf19c |
+ * If the initiator is not set, set it to MAX_NODES. And if
|
|
|
ddf19c |
+ * HMAT is enabled and this node has no cpus, QEMU will raise error.
|
|
|
ddf19c |
+ */
|
|
|
ddf19c |
+ numa_info[nodenr].initiator = MAX_NODES;
|
|
|
ddf19c |
+ if (node->has_initiator) {
|
|
|
ddf19c |
+ if (!ms->numa_state->hmat_enabled) {
|
|
|
ddf19c |
+ error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
|
|
|
ddf19c |
+ "(HMAT) is disabled, enable it with -machine hmat=on "
|
|
|
ddf19c |
+ "before using any of hmat specific options");
|
|
|
ddf19c |
+ return;
|
|
|
ddf19c |
+ }
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+ if (node->initiator >= MAX_NODES) {
|
|
|
ddf19c |
+ error_report("The initiator id %" PRIu16 " expects an integer "
|
|
|
ddf19c |
+ "between 0 and %d", node->initiator,
|
|
|
ddf19c |
+ MAX_NODES - 1);
|
|
|
ddf19c |
+ return;
|
|
|
ddf19c |
+ }
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+ numa_info[nodenr].initiator = node->initiator;
|
|
|
ddf19c |
+ }
|
|
|
ddf19c |
numa_info[nodenr].present = true;
|
|
|
ddf19c |
max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
|
|
|
ddf19c |
ms->numa_state->num_nodes++;
|
|
|
ddf19c |
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
|
|
|
ddf19c |
index ae9c41d..788cbec 100644
|
|
|
ddf19c |
--- a/include/sysemu/numa.h
|
|
|
ddf19c |
+++ b/include/sysemu/numa.h
|
|
|
ddf19c |
@@ -18,6 +18,8 @@ struct NodeInfo {
|
|
|
ddf19c |
uint64_t node_mem;
|
|
|
ddf19c |
struct HostMemoryBackend *node_memdev;
|
|
|
ddf19c |
bool present;
|
|
|
ddf19c |
+ bool has_cpu;
|
|
|
ddf19c |
+ uint16_t initiator;
|
|
|
ddf19c |
uint8_t distance[MAX_NODES];
|
|
|
ddf19c |
};
|
|
|
ddf19c |
|
|
|
ddf19c |
@@ -33,6 +35,9 @@ struct NumaState {
|
|
|
ddf19c |
/* Allow setting NUMA distance for different NUMA nodes */
|
|
|
ddf19c |
bool have_numa_distance;
|
|
|
ddf19c |
|
|
|
ddf19c |
+ /* Detect if HMAT support is enabled. */
|
|
|
ddf19c |
+ bool hmat_enabled;
|
|
|
ddf19c |
+
|
|
|
ddf19c |
/* NUMA nodes information */
|
|
|
ddf19c |
NodeInfo nodes[MAX_NODES];
|
|
|
ddf19c |
};
|
|
|
ddf19c |
diff --git a/qapi/machine.json b/qapi/machine.json
|
|
|
ddf19c |
index ca26779..27d0e37 100644
|
|
|
ddf19c |
--- a/qapi/machine.json
|
|
|
ddf19c |
+++ b/qapi/machine.json
|
|
|
ddf19c |
@@ -463,6 +463,13 @@
|
|
|
ddf19c |
# @memdev: memory backend object. If specified for one node,
|
|
|
ddf19c |
# it must be specified for all nodes.
|
|
|
ddf19c |
#
|
|
|
ddf19c |
+# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145,
|
|
|
ddf19c |
+# points to the nodeid which has the memory controller
|
|
|
ddf19c |
+# responsible for this NUMA node. This field provides
|
|
|
ddf19c |
+# additional information as to the initiator node that
|
|
|
ddf19c |
+# is closest (as in directly attached) to this node, and
|
|
|
ddf19c |
+# therefore has the best performance (since 5.0)
|
|
|
ddf19c |
+#
|
|
|
ddf19c |
# Since: 2.1
|
|
|
ddf19c |
##
|
|
|
ddf19c |
{ 'struct': 'NumaNodeOptions',
|
|
|
ddf19c |
@@ -470,7 +477,8 @@
|
|
|
ddf19c |
'*nodeid': 'uint16',
|
|
|
ddf19c |
'*cpus': ['uint16'],
|
|
|
ddf19c |
'*mem': 'size',
|
|
|
ddf19c |
- '*memdev': 'str' }}
|
|
|
ddf19c |
+ '*memdev': 'str',
|
|
|
ddf19c |
+ '*initiator': 'uint16' }}
|
|
|
ddf19c |
|
|
|
ddf19c |
##
|
|
|
ddf19c |
# @NumaDistOptions:
|
|
|
ddf19c |
diff --git a/qemu-options.hx b/qemu-options.hx
|
|
|
ddf19c |
index df1d27b..e2ce754 100644
|
|
|
ddf19c |
--- a/qemu-options.hx
|
|
|
ddf19c |
+++ b/qemu-options.hx
|
|
|
ddf19c |
@@ -43,7 +43,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
|
|
|
ddf19c |
" suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
|
|
|
ddf19c |
" nvdimm=on|off controls NVDIMM support (default=off)\n"
|
|
|
ddf19c |
" enforce-config-section=on|off enforce configuration section migration (default=off)\n"
|
|
|
ddf19c |
- " memory-encryption=@var{} memory encryption object to use (default=none)\n",
|
|
|
ddf19c |
+ " memory-encryption=@var{} memory encryption object to use (default=none)\n"
|
|
|
ddf19c |
+ " hmat=on|off controls ACPI HMAT support (default=off)\n",
|
|
|
ddf19c |
QEMU_ARCH_ALL)
|
|
|
ddf19c |
STEXI
|
|
|
ddf19c |
@item -machine [type=]@var{name}[,prop=@var{value}[,...]]
|
|
|
ddf19c |
@@ -103,6 +104,9 @@ NOTE: this parameter is deprecated. Please use @option{-global}
|
|
|
ddf19c |
@option{migration.send-configuration}=@var{on|off} instead.
|
|
|
ddf19c |
@item memory-encryption=@var{}
|
|
|
ddf19c |
Memory encryption object to use. The default is none.
|
|
|
ddf19c |
+@item hmat=on|off
|
|
|
ddf19c |
+Enables or disables ACPI Heterogeneous Memory Attribute Table (HMAT) support.
|
|
|
ddf19c |
+The default is off.
|
|
|
ddf19c |
@end table
|
|
|
ddf19c |
ETEXI
|
|
|
ddf19c |
|
|
|
ddf19c |
@@ -161,14 +165,14 @@ If any on the three values is given, the total number of CPUs @var{n} can be omi
|
|
|
ddf19c |
ETEXI
|
|
|
ddf19c |
|
|
|
ddf19c |
DEF("numa", HAS_ARG, QEMU_OPTION_numa,
|
|
|
ddf19c |
- "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
|
|
|
ddf19c |
- "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
|
|
|
ddf19c |
+ "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
|
|
|
ddf19c |
+ "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
|
|
|
ddf19c |
"-numa dist,src=source,dst=destination,val=distance\n"
|
|
|
ddf19c |
"-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n",
|
|
|
ddf19c |
QEMU_ARCH_ALL)
|
|
|
ddf19c |
STEXI
|
|
|
ddf19c |
-@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
|
|
|
ddf19c |
-@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
|
|
|
ddf19c |
+@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
|
|
|
ddf19c |
+@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
|
|
|
ddf19c |
@itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
|
|
|
ddf19c |
@itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
|
|
|
ddf19c |
@findex -numa
|
|
|
ddf19c |
@@ -215,6 +219,27 @@ split equally between them.
|
|
|
ddf19c |
@samp{mem} and @samp{memdev} are mutually exclusive. Furthermore,
|
|
|
ddf19c |
if one node uses @samp{memdev}, all of them have to use it.
|
|
|
ddf19c |
|
|
|
ddf19c |
+@samp{initiator} is an additional option that points to an @var{initiator}
|
|
|
ddf19c |
+NUMA node that has the best performance (the lowest latency or largest bandwidth)
|
|
|
ddf19c |
+to this NUMA @var{node}. Note that this option can be set only when
|
|
|
ddf19c |
+the machine property 'hmat' is set to 'on'.
|
|
|
ddf19c |
+
|
|
|
ddf19c |
+The following example creates a machine with 2 NUMA nodes: node 0 has CPUs,
|
|
|
ddf19c |
+node 1 has only memory, and its initiator is node 0. Note that because
|
|
|
ddf19c |
+node 0 has CPU, by default the initiator of node 0 is itself and must be
|
|
|
ddf19c |
+itself.
|
|
|
ddf19c |
+@example
|
|
|
ddf19c |
+-machine hmat=on \
|
|
|
ddf19c |
+-m 2G,slots=2,maxmem=4G \
|
|
|
ddf19c |
+-object memory-backend-ram,size=1G,id=m0 \
|
|
|
ddf19c |
+-object memory-backend-ram,size=1G,id=m1 \
|
|
|
ddf19c |
+-numa node,nodeid=0,memdev=m0 \
|
|
|
ddf19c |
+-numa node,nodeid=1,memdev=m1,initiator=0 \
|
|
|
ddf19c |
+-smp 2,sockets=2,maxcpus=2 \
|
|
|
ddf19c |
+-numa cpu,node-id=0,socket-id=0 \
|
|
|
ddf19c |
+-numa cpu,node-id=0,socket-id=1
|
|
|
ddf19c |
+@end example
|
|
|
ddf19c |
+
|
|
|
ddf19c |
@var{source} and @var{destination} are NUMA node IDs.
|
|
|
ddf19c |
@var{distance} is the NUMA distance from @var{source} to @var{destination}.
|
|
|
ddf19c |
The distance from a node to itself is always 10. If any pair of nodes is
|
|
|
ddf19c |
--
|
|
|
ddf19c |
1.8.3.1
|
|
|
ddf19c |
|