| From 70f8bbb27f9f357ea83ff6639fc00aa60fc902b9 Mon Sep 17 00:00:00 2001 |
| From: "plai@redhat.com" <plai@redhat.com> |
| Date: Thu, 21 May 2020 23:56:47 +0100 |
| Subject: [PATCH 04/12] numa: Extend CLI to provide initiator information for |
| numa nodes |
| |
| RH-Author: plai@redhat.com |
| Message-id: <20200521235655.27141-4-plai@redhat.com> |
| Patchwork-id: 96736 |
| O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 03/11] numa: Extend CLI to provide initiator information for numa nodes |
| Bugzilla: 1600217 |
| RH-Acked-by: Michael S. Tsirkin <mst@redhat.com> |
| RH-Acked-by: Igor Mammedov <imammedo@redhat.com> |
| RH-Acked-by: Eduardo Habkost <ehabkost@redhat.com> |
| |
| From: Tao Xu <tao3.xu@intel.com> |
| |
| In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT), |
| The initiator represents processor which access to memory. And in 5.2.27.3 |
| Memory Proximity Domain Attributes Structure, the attached initiator is |
| defined as where the memory controller responsible for a memory proximity |
| domain. With attached initiator information, the topology of heterogeneous |
| memory can be described. Add new machine property 'hmat' to enable all |
| HMAT specific options. |
| |
| Extend CLI of "-numa node" option to indicate the initiator numa node-id. |
| In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report |
| the platform's HMAT tables. Before using initiator option, enable HMAT with |
| -machine hmat=on. |
| |
| Acked-by: Markus Armbruster <armbru@redhat.com> |
| Reviewed-by: Igor Mammedov <imammedo@redhat.com> |
| Reviewed-by: Jingqi Liu <jingqi.liu@intel.com> |
| Suggested-by: Dan Williams <dan.j.williams@intel.com> |
| Signed-off-by: Tao Xu <tao3.xu@intel.com> |
| Message-Id: <20191213011929.2520-2-tao3.xu@intel.com> |
| Reviewed-by: Michael S. Tsirkin <mst@redhat.com> |
| Signed-off-by: Michael S. Tsirkin <mst@redhat.com> |
| (cherry picked from commit 244b3f4485a07c7ce4b7123d6ce9d8c6012756e8) |
| Signed-off-by: Paul Lai <plai@redhat.com> |
| Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com> |
| |
| hw/core/machine.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ |
| hw/core/numa.c | 23 ++++++++++++++++++ |
| include/sysemu/numa.h | 5 ++++ |
| qapi/machine.json | 10 +++++++- |
| qemu-options.hx | 35 ++++++++++++++++++++++++---- |
| 5 files changed, 131 insertions(+), 6 deletions(-) |
| |
| diff --git a/hw/core/machine.c b/hw/core/machine.c |
| index 19c78c6..cb21ae1 100644 |
| |
| |
| @@ -688,6 +688,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp) |
| ms->nvdimms_state->is_enabled = value; |
| } |
| |
| +static bool machine_get_hmat(Object *obj, Error **errp) |
| +{ |
| + MachineState *ms = MACHINE(obj); |
| + |
| + return ms->numa_state->hmat_enabled; |
| +} |
| + |
| +static void machine_set_hmat(Object *obj, bool value, Error **errp) |
| +{ |
| + MachineState *ms = MACHINE(obj); |
| + |
| + ms->numa_state->hmat_enabled = value; |
| +} |
| + |
| static char *machine_get_nvdimm_persistence(Object *obj, Error **errp) |
| { |
| MachineState *ms = MACHINE(obj); |
| @@ -815,6 +829,7 @@ void machine_set_cpu_numa_node(MachineState *machine, |
| const CpuInstanceProperties *props, Error **errp) |
| { |
| MachineClass *mc = MACHINE_GET_CLASS(machine); |
| + NodeInfo *numa_info = machine->numa_state->nodes; |
| bool match = false; |
| int i; |
| |
| @@ -884,6 +899,17 @@ void machine_set_cpu_numa_node(MachineState *machine, |
| match = true; |
| slot->props.node_id = props->node_id; |
| slot->props.has_node_id = props->has_node_id; |
| + |
| + if (machine->numa_state->hmat_enabled) { |
| + if ((numa_info[props->node_id].initiator < MAX_NODES) && |
| + (props->node_id != numa_info[props->node_id].initiator)) { |
| + error_setg(errp, "The initiator of CPU NUMA node %" PRId64 |
| + " should be itself", props->node_id); |
| + return; |
| + } |
| + numa_info[props->node_id].has_cpu = true; |
| + numa_info[props->node_id].initiator = props->node_id; |
| + } |
| } |
| |
| if (!match) { |
| @@ -1130,6 +1156,13 @@ static void machine_initfn(Object *obj) |
| |
| if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) { |
| ms->numa_state = g_new0(NumaState, 1); |
| + object_property_add_bool(obj, "hmat", |
| + machine_get_hmat, machine_set_hmat, |
| + &error_abort); |
| + object_property_set_description(obj, "hmat", |
| + "Set on/off to enable/disable " |
| + "ACPI Heterogeneous Memory Attribute " |
| + "Table (HMAT)", NULL); |
| } |
| |
| /* Register notifier when init is done for sysbus sanity checks */ |
| @@ -1218,6 +1251,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu) |
| return g_string_free(s, false); |
| } |
| |
| +static void numa_validate_initiator(NumaState *numa_state) |
| +{ |
| + int i; |
| + NodeInfo *numa_info = numa_state->nodes; |
| + |
| + for (i = 0; i < numa_state->num_nodes; i++) { |
| + if (numa_info[i].initiator == MAX_NODES) { |
| + error_report("The initiator of NUMA node %d is missing, use " |
| + "'-numa node,initiator' option to declare it", i); |
| + exit(1); |
| + } |
| + |
| + if (!numa_info[numa_info[i].initiator].present) { |
| + error_report("NUMA node %" PRIu16 " is missing, use " |
| + "'-numa node' option to declare it first", |
| + numa_info[i].initiator); |
| + exit(1); |
| + } |
| + |
| + if (!numa_info[numa_info[i].initiator].has_cpu) { |
| + error_report("The initiator of NUMA node %d is invalid", i); |
| + exit(1); |
| + } |
| + } |
| +} |
| + |
| static void machine_numa_finish_cpu_init(MachineState *machine) |
| { |
| int i; |
| @@ -1258,6 +1317,11 @@ static void machine_numa_finish_cpu_init(MachineState *machine) |
| machine_set_cpu_numa_node(machine, &props, &error_fatal); |
| } |
| } |
| + |
| + if (machine->numa_state->hmat_enabled) { |
| + numa_validate_initiator(machine->numa_state); |
| + } |
| + |
| if (s->len && !qtest_enabled()) { |
| warn_report("CPU(s) not present in any NUMA nodes: %s", |
| s->str); |
| diff --git a/hw/core/numa.c b/hw/core/numa.c |
| index 19f082d..a07eef9 100644 |
| |
| |
| @@ -129,6 +129,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, |
| numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL); |
| numa_info[nodenr].node_memdev = MEMORY_BACKEND(o); |
| } |
| + |
| + /* |
| + * If not set the initiator, set it to MAX_NODES. And if |
| + * HMAT is enabled and this node has no cpus, QEMU will raise error. |
| + */ |
| + numa_info[nodenr].initiator = MAX_NODES; |
| + if (node->has_initiator) { |
| + if (!ms->numa_state->hmat_enabled) { |
| + error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " |
| + "(HMAT) is disabled, enable it with -machine hmat=on " |
| + "before using any of hmat specific options"); |
| + return; |
| + } |
| + |
| + if (node->initiator >= MAX_NODES) { |
| + error_report("The initiator id %" PRIu16 " expects an integer " |
| + "between 0 and %d", node->initiator, |
| + MAX_NODES - 1); |
| + return; |
| + } |
| + |
| + numa_info[nodenr].initiator = node->initiator; |
| + } |
| numa_info[nodenr].present = true; |
| max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); |
| ms->numa_state->num_nodes++; |
| diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h |
| index ae9c41d..788cbec 100644 |
| |
| |
| @@ -18,6 +18,8 @@ struct NodeInfo { |
| uint64_t node_mem; |
| struct HostMemoryBackend *node_memdev; |
| bool present; |
| + bool has_cpu; |
| + uint16_t initiator; |
| uint8_t distance[MAX_NODES]; |
| }; |
| |
| @@ -33,6 +35,9 @@ struct NumaState { |
| /* Allow setting NUMA distance for different NUMA nodes */ |
| bool have_numa_distance; |
| |
| + /* Detect if HMAT support is enabled. */ |
| + bool hmat_enabled; |
| + |
| /* NUMA nodes information */ |
| NodeInfo nodes[MAX_NODES]; |
| }; |
| diff --git a/qapi/machine.json b/qapi/machine.json |
| index ca26779..27d0e37 100644 |
| |
| |
| @@ -463,6 +463,13 @@ |
| # @memdev: memory backend object. If specified for one node, |
| # it must be specified for all nodes. |
| # |
| +# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, |
| +# points to the nodeid which has the memory controller |
| +# responsible for this NUMA node. This field provides |
| +# additional information as to the initiator node that |
| +# is closest (as in directly attached) to this node, and |
| +# therefore has the best performance (since 5.0) |
| +# |
| # Since: 2.1 |
| ## |
| { 'struct': 'NumaNodeOptions', |
| @@ -470,7 +477,8 @@ |
| '*nodeid': 'uint16', |
| '*cpus': ['uint16'], |
| '*mem': 'size', |
| - '*memdev': 'str' }} |
| + '*memdev': 'str', |
| + '*initiator': 'uint16' }} |
| |
| ## |
| # @NumaDistOptions: |
| diff --git a/qemu-options.hx b/qemu-options.hx |
| index df1d27b..e2ce754 100644 |
| |
| |
| @@ -43,7 +43,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \ |
| " suppress-vmdesc=on|off disables self-describing migration (default=off)\n" |
| " nvdimm=on|off controls NVDIMM support (default=off)\n" |
| " enforce-config-section=on|off enforce configuration section migration (default=off)\n" |
| - " memory-encryption=@var{} memory encryption object to use (default=none)\n", |
| + " memory-encryption=@var{} memory encryption object to use (default=none)\n" |
| + " hmat=on|off controls ACPI HMAT support (default=off)\n", |
| QEMU_ARCH_ALL) |
| STEXI |
| @item -machine [type=]@var{name}[,prop=@var{value}[,...]] |
| @@ -103,6 +104,9 @@ NOTE: this parameter is deprecated. Please use @option{-global} |
| @option{migration.send-configuration}=@var{on|off} instead. |
| @item memory-encryption=@var{} |
| Memory encryption object to use. The default is none. |
| +@item hmat=on|off |
| +Enables or disables ACPI Heterogeneous Memory Attribute Table (HMAT) support. |
| +The default is off. |
| @end table |
| ETEXI |
| |
| @@ -161,14 +165,14 @@ If any on the three values is given, the total number of CPUs @var{n} can be omi |
| ETEXI |
| |
| DEF("numa", HAS_ARG, QEMU_OPTION_numa, |
| - "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" |
| - "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" |
| + "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" |
| + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" |
| "-numa dist,src=source,dst=destination,val=distance\n" |
| "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n", |
| QEMU_ARCH_ALL) |
| STEXI |
| -@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] |
| -@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] |
| +@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] |
| +@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] |
| @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} |
| @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] |
| @findex -numa |
| @@ -215,6 +219,27 @@ split equally between them. |
| @samp{mem} and @samp{memdev} are mutually exclusive. Furthermore, |
| if one node uses @samp{memdev}, all of them have to use it. |
| |
| +@samp{initiator} is an additional option that points to an @var{initiator} |
| +NUMA node that has best performance (the lowest latency or largest bandwidth) |
| +to this NUMA @var{node}. Note that this option can be set only when |
| +the machine property 'hmat' is set to 'on'. |
| + |
| +Following example creates a machine with 2 NUMA nodes, node 0 has CPU. |
| +node 1 has only memory, and its initiator is node 0. Note that because |
| +node 0 has CPU, by default the initiator of node 0 is itself and must be |
| +itself. |
| +@example |
| +-machine hmat=on \ |
| +-m 2G,slots=2,maxmem=4G \ |
| +-object memory-backend-ram,size=1G,id=m0 \ |
| +-object memory-backend-ram,size=1G,id=m1 \ |
| +-numa node,nodeid=0,memdev=m0 \ |
| +-numa node,nodeid=1,memdev=m1,initiator=0 \ |
| +-smp 2,sockets=2,maxcpus=2 \ |
| +-numa cpu,node-id=0,socket-id=0 \ |
| +-numa cpu,node-id=0,socket-id=1 |
| +@end example |
| + |
| @var{source} and @var{destination} are NUMA node IDs. |
| @var{distance} is the NUMA distance from @var{source} to @var{destination}. |
| The distance from a node to itself is always 10. If any pair of nodes is |
| -- |
| 1.8.3.1 |
| |