|
|
71bb80 |
From: Prarit Bhargava <prarit@redhat.com>
|
|
|
71bb80 |
|
|
|
71bb80 |
Subject: linux: fix support for NUMA node0 being offline
|
|
|
71bb80 |
|
|
|
71bb80 |
commit 0114c2b0b3e39265e0829eebfff87ac9f4412fe9
|
|
|
71bb80 |
Author: Brice Goglin <Brice.Goglin@inria.fr>
|
|
|
71bb80 |
Date: Mon Apr 26 20:35:42 2021 +0200
|
|
|
71bb80 |
|
|
|
71bb80 |
linux: fix support for NUMA node0 being offline
|
|
|
71bb80 |
|
|
|
71bb80 |
Just like we didn't support offline CPU#0 until commit
|
|
|
71bb80 |
7bcc273efd50536961ba16d474efca4ae163229b, we need to
|
|
|
71bb80 |
support node0 being offline as well.
|
|
|
71bb80 |
It's not clear whether it's a new Linux feature or not;
|
|
|
71bb80 |
this was reported on a POWER LPAR VM.
|
|
|
71bb80 |
|
|
|
71bb80 |
The symptoms are different here because we got no NUMA
|
|
|
71bb80 |
nodes at all, hence the core hwloc added a default
|
|
|
71bb80 |
machine-wide node. But this node got marked disallowed
|
|
|
71bb80 |
by Linux cgroups. Hence load() failed with
|
|
|
71bb80 |
"Topology does not contain any NUMA node, aborting!"
|
|
|
71bb80 |
|
|
|
71bb80 |
We opportunistically assume node0 is online to avoid
|
|
|
71bb80 |
the overhead in the vast majority of cases. If node0
|
|
|
71bb80 |
is missing, we parse "online" to find the first node.
|
|
|
71bb80 |
|
|
|
71bb80 |
Thanks to Jirka Hladky for the report.
|
|
|
71bb80 |
|
|
|
71bb80 |
Signed-off-by: Brice Goglin <Brice.Goglin@inria.fr>
|
|
|
71bb80 |
|
|
|
71bb80 |
Signed-off-by: Prarit Bhargava <prarit@redhat.com>
|
|
|
71bb80 |
|
|
|
71bb80 |
diff -urNp hwloc-2.2.0.orig/hwloc/topology-linux.c hwloc-2.2.0/hwloc/topology-linux.c
|
|
|
71bb80 |
--- hwloc-2.2.0.orig/hwloc/topology-linux.c 2021-05-10 14:44:42.690559128 -0400
|
|
|
71bb80 |
+++ hwloc-2.2.0/hwloc/topology-linux.c 2021-05-10 14:44:57.858982883 -0400
|
|
|
71bb80 |
@@ -5342,6 +5342,9 @@ static const char *find_sysfs_cpu_path(i
|
|
|
71bb80 |
|
|
|
71bb80 |
static const char *find_sysfs_node_path(int root_fd)
|
|
|
71bb80 |
{
|
|
|
71bb80 |
+ unsigned first;
|
|
|
71bb80 |
+ int err;
|
|
|
71bb80 |
+
|
|
|
71bb80 |
if (!hwloc_access("/sys/bus/node/devices", R_OK|X_OK, root_fd)
|
|
|
71bb80 |
&& !hwloc_access("/sys/bus/node/devices/node0/cpumap", R_OK, root_fd))
|
|
|
71bb80 |
return "/sys/bus/node/devices";
|
|
|
71bb80 |
@@ -5350,6 +5353,28 @@ static const char *find_sysfs_node_path(
|
|
|
71bb80 |
&& !hwloc_access("/sys/devices/system/node/node0/cpumap", R_OK, root_fd))
|
|
|
71bb80 |
return "/sys/devices/system/node";
|
|
|
71bb80 |
|
|
|
71bb80 |
+ /* node0 might be offline, fall back to looking at the first online node.
|
|
|
71bb80 |
+ * online contains comma-separated ranges, just read the first number.
|
|
|
71bb80 |
+ */
|
|
|
71bb80 |
+ hwloc_debug("Failed to find sysfs node files using node0, looking at online nodes...\n");
|
|
|
71bb80 |
+ err = hwloc_read_path_as_uint("/sys/devices/system/node/online", &first, root_fd);
|
|
|
71bb80 |
+ if (err) {
|
|
|
71bb80 |
+ hwloc_debug("Failed to find read /sys/devices/system/node/online.\n");
|
|
|
71bb80 |
+ } else {
|
|
|
71bb80 |
+ char path[PATH_MAX];
|
|
|
71bb80 |
+ hwloc_debug("Found node#%u as first online node\n", first);
|
|
|
71bb80 |
+
|
|
|
71bb80 |
+ snprintf(path, sizeof(path), "/sys/bus/node/devices/node%u/cpumap", first);
|
|
|
71bb80 |
+ if (!hwloc_access("/sys/bus/node/devices", R_OK|X_OK, root_fd)
|
|
|
71bb80 |
+ && !hwloc_access(path, R_OK, root_fd))
|
|
|
71bb80 |
+ return "/sys/bus/node/devices";
|
|
|
71bb80 |
+
|
|
|
71bb80 |
+ snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpumap", first);
|
|
|
71bb80 |
+ if (!hwloc_access("/sys/devices/system/node", R_OK|X_OK, root_fd)
|
|
|
71bb80 |
+ && !hwloc_access(path, R_OK, root_fd))
|
|
|
71bb80 |
+ return "/sys/devices/system/node";
|
|
|
71bb80 |
+ }
|
|
|
71bb80 |
+
|
|
|
71bb80 |
return NULL;
|
|
|
71bb80 |
}
|
|
|
71bb80 |
|