Blob Blame History Raw
From: Prarit Bhargava <prarit@redhat.com>

Subject: linux: fix support for NUMA node0 being offline

commit 0114c2b0b3e39265e0829eebfff87ac9f4412fe9
Author: Brice Goglin <Brice.Goglin@inria.fr>
Date:   Mon Apr 26 20:35:42 2021 +0200

    linux: fix support for NUMA node0 being offline
    
    Just like we didn't support offline CPU#0 until commit
    7bcc273efd50536961ba16d474efca4ae163229b, we need to
    support node0 being offline as well.
    It's not clear whether this is a new Linux feature or not;
    it was reported on a POWER LPAR VM.
    
    The symptoms are different here because we got no NUMA
    nodes at all, hence the core hwloc added a default
    machine-wide node. But this node got marked disallowed
    by Linux cgroups. Hence load() failed with
     "Topology does not contain any NUMA node, aborting!"
    
    We opportunistically assume node0 is online to avoid
    the overhead in the vast majority of cases. If node0 is
    missing, we parse "online" to find the first online node.
    
    Thanks to Jirka Hladky for the report.
    
    Signed-off-by: Brice Goglin <Brice.Goglin@inria.fr>

Signed-off-by: Prarit Bhargava <prarit@redhat.com>

diff -urNp hwloc-2.2.0.orig/hwloc/topology-linux.c hwloc-2.2.0/hwloc/topology-linux.c
--- hwloc-2.2.0.orig/hwloc/topology-linux.c	2021-05-10 14:44:42.690559128 -0400
+++ hwloc-2.2.0/hwloc/topology-linux.c	2021-05-10 14:44:57.858982883 -0400
@@ -5342,6 +5342,9 @@ static const char *find_sysfs_cpu_path(i
 
 static const char *find_sysfs_node_path(int root_fd)
 {
+  unsigned first;
+  int err;
+
   if (!hwloc_access("/sys/bus/node/devices", R_OK|X_OK, root_fd)
       && !hwloc_access("/sys/bus/node/devices/node0/cpumap", R_OK, root_fd))
     return "/sys/bus/node/devices";
@@ -5350,6 +5353,28 @@ static const char *find_sysfs_node_path(
       && !hwloc_access("/sys/devices/system/node/node0/cpumap", R_OK, root_fd))
     return "/sys/devices/system/node";
 
+  /* node0 might be offline, fall back to looking at the first online node.
+   * The "online" file contains comma-separated ranges, just read the first number.
+   */
+  hwloc_debug("Failed to find sysfs node files using node0, looking at online nodes...\n");
+  err = hwloc_read_path_as_uint("/sys/devices/system/node/online", &first, root_fd);
+  if (err) {
+    hwloc_debug("Failed to find read /sys/devices/system/node/online.\n");
+  } else {
+    char path[PATH_MAX];
+    hwloc_debug("Found node#%u as first online node\n", first);
+
+    snprintf(path, sizeof(path), "/sys/bus/node/devices/node%u/cpumap", first);
+    if (!hwloc_access("/sys/bus/node/devices", R_OK|X_OK, root_fd)
+        && !hwloc_access(path, R_OK, root_fd))
+      return "/sys/bus/node/devices";
+
+    snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpumap", first);
+    if (!hwloc_access("/sys/devices/system/node", R_OK|X_OK, root_fd)
+        && !hwloc_access(path, R_OK, root_fd))
+      return "/sys/devices/system/node";
+  }
+
   return NULL;
 }