This patch modifies libnuma to use variable-length bit masks (the
bitmask structure).

Diffed against numactl-1.0.2

Signed-off-by: Cliff Wickman <cpw@sgi.com>
---
 Makefile  |    2 
 libnuma.c |  804 ++++++++++++++++++++++++++++++++++++++++++++------------------
 numa.h    |  124 ++++++---
 numaif.h  |    7 
 numaint.h |   14 -
 shm.c     |   46 ++-
 shm.h     |    6 
 syscall.c |   30 +-
 util.c    |  211 ++++++++--------
 util.h    |    8 
 10 files changed, 831 insertions(+), 421 deletions(-)

Index: numactl-1.0.2/libnuma.c
===================================================================
--- numactl-1.0.2.orig/libnuma.c
+++ numactl-1.0.2/libnuma.c
@@ -30,28 +30,150 @@
 #include <sys/mman.h>
 #include <limits.h>
 
-#include "numaif.h"
 #include "numa.h"
+#include "numaif.h"
 #include "numaint.h"
 #include "util.h"
 
 #define WEAK __attribute__((weak))
 
-#define MAX_NR_CPUS		4096
-#define CPU_BUFFER_SIZE ((MAX_NR_CPUS + 8 - 1) / 8)
-
-const nodemask_t numa_no_nodes;
-const nodemask_t numa_all_nodes;
+struct bitmask *numa_no_nodes = NULL;
+struct bitmask *numa_all_nodes = NULL;
+struct bitmask *numa_all_cpus = NULL;
+struct bitmask **node_cpu_mask;
 
 #ifdef __thread
 #warning "not threadsafe"
 #endif
 
 static __thread int bind_policy = MPOL_BIND; 
-static __thread int mbind_flags = 0;
+static __thread unsigned int mbind_flags = 0;
+static int sizes_set=0;
+static int maxconfigurednode = -1;
+static int maxconfiguredcpu = -1;
+static int maxprocnode = -1;
+static int maxproccpu = -1;
+static int nodemask_sz = 0;
+static int cpumask_sz = 0;
 
 int numa_exit_on_error = 0;
 
+/*
+ * The following bitmask declarations, bitmask_*() routines, and associated
+ * _setbit() and _getbit() routines are:
+ * Copyright (c) 2004 Silicon Graphics, Inc. (SGI) All rights reserved.
+ * SGI publishes it under the terms of the GNU General Public License, v2,
+ * as published by the Free Software Foundation.
+ */
+static unsigned int
+_getbit(const struct bitmask *bmp, unsigned int n)
+{
+	if (n < bmp->size)
+		return (bmp->maskp[n/bitsperlong] >> (n % bitsperlong)) & 1;
+	else
+		return 0;
+}
+
+static void
+_setbit(struct bitmask *bmp, unsigned int n, unsigned int v)
+{
+	if (n < bmp->size) {
+		if (v)
+			bmp->maskp[n/bitsperlong] |= 1UL << (n % bitsperlong);
+		else
+			bmp->maskp[n/bitsperlong] &= ~(1UL << (n % bitsperlong));
+	}
+}
+
+int
+bitmask_isbitset(const struct bitmask *bmp, unsigned int i)
+{
+	return _getbit(bmp, i);
+}
+
+struct bitmask *
+bitmask_setall(struct bitmask *bmp)
+{
+	unsigned int i;
+	for (i = 0; i < bmp->size; i++)
+		_setbit(bmp, i, 1);
+	return bmp;
+}
+
+struct bitmask *
+bitmask_clearall(struct bitmask *bmp)
+{
+	unsigned int i;
+	for (i = 0; i < bmp->size; i++)
+		_setbit(bmp, i, 0);
+	return bmp;
+}
+
+struct bitmask *
+bitmask_setbit(struct bitmask *bmp, unsigned int i)
+{
+	_setbit(bmp, i, 1);
+	return bmp;
+}
+
+struct bitmask *
+bitmask_clearbit(struct bitmask *bmp, unsigned int i)
+{
+	_setbit(bmp, i, 0);
+	return bmp;
+}
+
+unsigned int
+bitmask_nbytes(struct bitmask *bmp)
+{
+	return longsperbits(bmp->size) * sizeof(unsigned long);
+}
+
+/* where n is the number of bits in the map */
+struct bitmask *
+bitmask_alloc(unsigned int n)
+{
+	struct bitmask *bmp;
+
+	if (n < 1) {
+		printf ("request to allocate mask for %d bits; abort\n", n);
+		exit(1);
+	}
+	bmp = malloc(sizeof(*bmp));
+	if (bmp == 0)
+		return 0;
+	bmp->size = n;
+	bmp->maskp = calloc(longsperbits(n), sizeof(unsigned long));
+	if (bmp->maskp == 0) {
+		free(bmp);
+		return 0;
+	}
+	return bmp;
+}
+
+void
+bitmask_free(struct bitmask *bmp)
+{
+	if (bmp == 0)
+		return;
+	free(bmp->maskp);
+	bmp->maskp = (unsigned long *)0xdeadcdef;  /* double free tripwire */
+	free(bmp);
+	return;
+}
+
+/* True if two bitmasks are equal */
+int
+bitmask_equal(const struct bitmask *bmp1, const struct bitmask *bmp2)
+{
+	unsigned int i;
+	for (i = 0; i < bmp1->size || i < bmp2->size; i++)
+		if (_getbit(bmp1, i) != _getbit(bmp2, i))
+			return 0;
+	return 1;
+}
+/* *****end of bitmask_  routines ************ */
+
 make_internal_alias(numa_exit_on_error);
 
 /* Next two can be overwritten by the application for different error handling */
@@ -84,22 +206,21 @@ WEAK void numa_warn(int num, char *fmt, 
 	errno = olde;
 } 
 
-static void setpol(int policy, nodemask_t mask) 
+static void setpol(int policy, struct bitmask *bmp)
 { 
-	if (set_mempolicy_int(policy, &mask.n[0], NUMA_NUM_NODES+1) < 0) 
+	if (set_mempolicy(policy, bmp->maskp, bmp->size + 1) < 0) /* maxnode is bits + 1, as before */
 		numa_error("set_mempolicy");
 } 
 
-static void getpol(int *oldpolicy, nodemask_t *oldmask)
+static void getpol(int *oldpolicy, struct bitmask *bmp)
 { 
-	if (get_mempolicy_int(oldpolicy, oldmask->n, NUMA_NUM_NODES+1, 0, 0) < 0) 
+	if (get_mempolicy(oldpolicy, bmp->maskp, bmp->size + 1, 0, 0) < 0) /* maxnode is bits + 1, as before */
 		numa_error("get_mempolicy");
 } 
 
-static void dombind(void *mem, size_t size, int pol, const nodemask_t *nodes)
+static void dombind(void *mem, size_t size, int pol, struct bitmask *bmp)
 { 
-	if (mbind_int(mem, size, pol, nodes->n, nodes ? NUMA_NUM_NODES+1 : 0, mbind_flags) 
-	    < 0) 
+	if (mbind(mem, size, pol, bmp->maskp, bmp->size + 1, mbind_flags) < 0) /* maxnode is bits + 1, as before */
 		numa_error("mbind"); 
 } 
 
@@ -116,41 +237,100 @@ int numa_pagesize(void)
 
 make_internal_alias(numa_pagesize);
 
-int maxnode = -1;
-int maxcpus = -1;
-
-unsigned long numa_all_cpus[(CPU_BUFFER_SIZE + BYTES_PER_LONG - 1) / BYTES_PER_LONG];
+/*
+ * Find the highest numbered existing memory node: maxconfigurednode.
+ */
+void
+set_configured_nodes(void)
+{
+	DIR *d;
+	struct dirent *de;
 
-static int number_of_configured_cpus(void)
-{ 
-	int len = 16; 
-	int n;
-	int olde = errno;
+	d = opendir("/sys/devices/system/node");
+	if (!d) {
+		numa_warn(W_nosysfs,
+		   "/sys not mounted or no numa system. Assuming one node: %s",
+		  	strerror(errno));
+		maxconfigurednode = 0;
+	} else {
+		while ((de = readdir(d)) != NULL) {
+			int nd;
+			if (strncmp(de->d_name, "node", 4))
+				continue;
+			nd = strtoul(de->d_name+4, NULL, 0);
+			if (maxconfigurednode < nd)
+				maxconfigurednode = nd;
+		}
+		closedir(d);
+	}
+}
 
-	if (maxcpus >= 0) 
-		return maxcpus + 1;
+/*
+ * Convert the string length of an ascii hex mask to the number
+ * of bits represented by that mask.
+ */
+static int s2nbits(const char *s)
+{
+	return strlen(s) * 32 / 9;
+}
 
-	do { 
-		unsigned long buffer[CPU_LONGS(len)];
-		memset(buffer, 0, sizeof(buffer));
-		n = numa_sched_getaffinity_int(0, CPU_BYTES(len), buffer);
-		if (n < 0 && errno == EINVAL) {
-			if (len >= 1024*1024) 
-				break;
-			len *= 2;
-			continue;
-		}
-	} while (n < 0);
-	errno = olde;
-	return len;
-} 
+/*
+ * Determine number of bytes in a seekable open file, without
+ * assuming that stat(2) on that file has a useful size.
+ * Has the side effect of leaving the file rewound to the beginning.
+ */
+static int filesize(FILE *fp)
+{
+	int sz = 0;
+	rewind(fp);
+	while (fgetc(fp) != EOF)
+		sz++;
+	rewind(fp);
+	return sz;
+}
+/* Is string 'pre' a prefix of string 's'? */
+static int strprefix(const char *s, const char *pre)
+{
+	return strncmp(s, pre, strlen(pre)) == 0;
+}
 
-static int fallback_max_node(void)
+static const char *mask_size_file = "/proc/self/status";
+static const char *nodemask_prefix = "Mems_allowed:\t";
+/*
+ * (do this the way Paul Jackson's libcpuset does it)
+ * The nodemask values in /proc/self/status are in an
+ * ascii format that uses 9 characters for each 32 bits of mask.
+ * (this could also be used to find the cpumask size)
+ */
+static void set_nodemask_size()
 {
-	numa_warn(W_nosysfs, "/sys not mounted or no numa system. Assuming one node: %s",
-		  strerror(errno));
-	maxnode = 0;
-	return maxnode;
+	FILE *fp = NULL;
+	char *buf = NULL;
+	int fsize;
+
+	if ((fp = fopen(mask_size_file, "r")) == NULL)
+		goto done;
+	fsize = filesize(fp);
+	if ((buf = malloc(fsize)) == NULL)
+		goto done;
+
+	/*
+	 * Beware: mask sizing arithmetic is fussy.
+	 * The trailing newline left by fgets() is required.
+	 */
+	while (fgets(buf, fsize, fp)) {
+		if (strprefix(buf, nodemask_prefix)) {
+			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
+			break;
+		}
+	}
+done:
+	if (buf != NULL)
+		free(buf);
+	if (fp != NULL)
+		fclose(fp);
+	if (nodemask_sz == 0) /* fall back on error */
+		nodemask_sz = maxconfigurednode+1;
 }
 
 /*
@@ -158,10 +338,11 @@ static int fallback_max_node(void)
  * commas. Order them correctly and return the number of the last bit
  * set.
  */
-int read_mask(char *s, unsigned long *mask)
+int
+read_mask(char *s, struct bitmask *bmp)
 {
 	char *end = s;
-	unsigned int *start = (unsigned int *)mask;
+	unsigned int *start = (unsigned int *)bmp->maskp;
 	unsigned int *p = start;
 	unsigned int *q;
 	unsigned int i;
@@ -207,75 +388,208 @@ int read_mask(char *s, unsigned long *ma
 }
 
 /*
- * Read a processes constraints in terms of nodes and cpus from /proc/pid/status.
+ * Read a process's constraints in terms of nodes and cpus from
+ * /proc/pid/status.
  */
-int read_constraints(void)
+void
+set_task_constraints(void)
 {
+	int max_cpus = number_of_possible_cpus();
+	int buflen;
+	char *buffer;
 	FILE *f;
 	/*
 	 * The maximum line size consists of the string at the beginning plus
 	 * a digit for each 4 cpus and a comma for each 64 cpus.
 	 */
-	char buffer[MAX_NR_CPUS / 4 + MAX_NR_CPUS / BITS_PER_LONG + 20];
+	buflen = max_cpus / 4 + max_cpus / BITS_PER_LONG + 40;
+	buffer = malloc(buflen);
+
+	numa_all_cpus = allocate_cpumask();
+	numa_all_nodes = allocate_nodemask();
 
 	sprintf(buffer,"/proc/%d/status", getpid());
 	f = fopen(buffer, "r");
-	if (!f)
-		return 0;
+	if (!f) {
+		numa_warn(W_cpumap, "Cannot parse /proc/%d/status", getpid());
+		return;
+	}
 
-	while (fgets(buffer, sizeof(buffer), f)) {
+	while (fgets(buffer, buflen, f)) {
 
 		if (strncmp(buffer,"Cpus_allowed",12) == 0)
-			maxcpus = read_mask(buffer + 14, numa_all_cpus);
+			maxproccpu = read_mask(buffer + 14, numa_all_cpus);
 
 		if (strncmp(buffer,"Mems_allowed",12) == 0) {
-			*(nodemask_t *)&numa_all_nodes = numa_no_nodes;
-			maxnode = read_mask(buffer + 14,
-					(unsigned long *)numa_all_nodes.n);
+			maxprocnode =
+				read_mask(buffer + 14, numa_all_nodes);
 		}
 	}
 	fclose(f);
+	free (buffer);
 
-	if (maxnode < 0)
-		return 0;
-
-	return 1;
+	if (maxprocnode < 0) {
+		numa_warn(W_cpumap, "Cannot parse /proc/%d/status", getpid());
+		return;
+	}
+	return;
 }
 
-void determine_nodes(void)
+/*
+ * Find the highest cpu number possible (in other words the size
+ * of a kernel cpumask_t (in bits) - 1)
+ */
+void
+set_numa_max_cpu(void)
 {
-	DIR *d;
-	struct dirent *de;
-	int found;
+	int len = 2048;
+	int n;
+	int olde = errno;
+	struct bitmask *buffer;
 
-	d = opendir("/sys/devices/system/node");
-	if (!d)
-		goto fail;
+	do {
+		buffer = bitmask_alloc(len);
+		n = numa_sched_getaffinity_int(0, buffer);
+		/* on success, returns size of kernel cpumask_t, in bytes */
+		if (n < 0 && errno == EINVAL) {
+			if (len >= 1024*1024)
+				break;
+			len *= 2;
+			bitmask_free(buffer);
+			continue;
+		}
+	} while (n < 0);
+	bitmask_free(buffer);
+	errno = olde;
+	cpumask_sz = (n > 0) ? n*8 : len;	/* probe failed: fall back to last size tried */
+}
 
-	found = 0;
-	while ((de = readdir(d)) != NULL) {
-		int nd;
-		if (strncmp(de->d_name, "node", 4))
+/*
+ * get the total (configured) number of cpus - both online and offline
+ */
+void
+set_configured_cpus()
+{
+	int		filecount=0;
+	char		*dirnamep = "/sys/devices/system/cpu";
+	struct dirent	*dirent;
+	DIR		*dir;
+	dir = opendir(dirnamep);
+
+	if (dir == NULL) {
+		fprintf (stderr,
+			"cannot open directory %s\n", dirnamep);
+		return;
+	}
+	while ((dirent = readdir(dir)) != 0) {
+		if (!strncmp("cpu", dirent->d_name, 3)) {
+			filecount++;
+		} else {
 			continue;
-		found++;
-		nd = strtoul(de->d_name+4, NULL, 0);
-		if (maxnode < nd)
-			maxnode = nd;
+		}
 	}
-	closedir(d);
-	if (found)
-		return;
-fail:
-	maxnode  = fallback_max_node();
+	closedir(dir);
+	maxconfiguredcpu = filecount-1; /* high cpu number */
+	return;
+}
+
+/*
+ * Initialize all the sizes.
+ */
+void
+set_sizes()
+{
+	sizes_set++;
+	set_configured_nodes();	/* configured nodes listed in /sys */
+	set_nodemask_size();	/* size of nodemask_t */
+	set_numa_max_cpu();	/* size of cpumask_t */
+	set_configured_cpus();	/* cpus listed in /sys/devices/system/cpu */
+	set_task_constraints();	/* cpus and nodes for current task */
 }
-int numa_max_node(void)
+
+int
+number_of_configured_nodes()
+{
+	if (!sizes_set)
+		set_sizes();
+	return maxconfigurednode+1;
+}
+
+int
+number_of_configured_cpus(void)
+{
+
+	if (!sizes_set)
+		set_sizes();
+	return maxconfiguredcpu+1;
+}
+
+int
+number_of_possible_nodes()
+{
+	if (!sizes_set)
+		set_sizes();
+	return nodemask_sz;
+}
+
+int
+number_of_possible_cpus()
+{
+	if (!sizes_set)
+		set_sizes();
+	return cpumask_sz;
+}
+
+int
+number_of_task_nodes()
 {
-	if (maxnode >= 0)
-		return maxnode;
-	if (!read_constraints())
-		determine_nodes();
+	if (!sizes_set)
+		set_sizes();
+	return maxprocnode+1;
+}
 
-	return maxnode;
+int
+number_of_task_cpus()
+{
+	if (!sizes_set)
+		set_sizes();
+	return maxproccpu+1;
+}
+
+/*
+ * Allocate a bitmask for cpus, of a size large enough to
+ * match the kernel's cpumask_t.
+ */
+struct bitmask *
+allocate_cpumask()
+{
+	int ncpus = number_of_possible_cpus();
+
+	return bitmask_alloc(ncpus);
+}
+
+/*
+ * Allocate a bitmask for nodes, of a size large enough to
+ * match the kernel's nodemask_t.
+ */
+struct bitmask *
+allocate_nodemask()
+{
+	int nnodes = numa_max_node()+1;
+
+	return bitmask_alloc(nnodes);
+}
+
+/*
+ * Return the number of the highest node in the system, in other words
+ * the size of a kernel nodemask_t (in bits) - 1.
+ */
+int
+numa_max_node(void)
+{
+	if (!sizes_set)
+		set_sizes();
+	return nodemask_sz -1;
 }
 
 make_internal_alias(numa_max_node);
@@ -342,28 +656,29 @@ long numa_node_size(int node, long *free
 
 int numa_available(void)
 {
-	if (get_mempolicy_int(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS) 
+	if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
 		return -1; 
 	numa_max_node_int();
 	return 0;
 } 
 
-void numa_interleave_memory(void *mem, size_t size, const nodemask_t *mask)
+void numa_interleave_memory(void *mem, size_t size, struct bitmask *bmp)
 { 
-	dombind(mem, size, MPOL_INTERLEAVE, mask);
+	dombind(mem, size, MPOL_INTERLEAVE, bmp);
 } 
 
 void numa_tonode_memory(void *mem, size_t size, int node)
 {
-	nodemask_t nodes;
-	nodemask_zero(&nodes); 
-	nodemask_set(&nodes, node); 
-	dombind(mem, size,  bind_policy, &nodes); 
+	struct bitmask *nodes;
+
+	nodes = allocate_nodemask();
+	bitmask_setbit(nodes, node);
+	dombind(mem, size, bind_policy, nodes);
 }
 
-void numa_tonodemask_memory(void *mem, size_t size, const nodemask_t *mask)
+void numa_tonodemask_memory(void *mem, size_t size, struct bitmask *bmp)
 {
-	dombind(mem, size,  bind_policy, mask); 
+	dombind(mem, size,  bind_policy, bmp);
 }
 
 void numa_setlocal_memory(void *mem, size_t size)
@@ -392,7 +707,7 @@ void *numa_alloc(size_t size)
 	return mem;
 } 
 
-void *numa_alloc_interleaved_subset(size_t size, const nodemask_t *mask) 
+void *numa_alloc_interleaved_subset(size_t size, struct bitmask *bmp)
 { 
 	char *mem;	
 
@@ -400,7 +715,7 @@ void *numa_alloc_interleaved_subset(size
 		   0, 0); 
 	if (mem == (char *)-1) 
 		return NULL;
-	dombind(mem, size, MPOL_INTERLEAVE, mask);
+	dombind(mem, size, MPOL_INTERLEAVE, bmp);
 	return mem;
 } 
 
@@ -408,24 +723,30 @@ make_internal_alias(numa_alloc_interleav
 
 void *numa_alloc_interleaved(size_t size)
 { 
-	return numa_alloc_interleaved_subset_int(size, &numa_all_nodes); 
+	return numa_alloc_interleaved_subset_int(size, numa_all_nodes);
 } 
 
-void numa_set_interleave_mask(const nodemask_t *mask)
+void numa_set_interleave_mask(struct bitmask *bmp)
 { 
-	if (nodemask_equal(mask, &numa_no_nodes))
-		setpol(MPOL_DEFAULT, *mask); 
+	if (!numa_no_nodes)
+		numa_no_nodes = allocate_nodemask();
+
+	if (bitmask_equal(bmp, numa_no_nodes))
+		setpol(MPOL_DEFAULT, bmp);
 	else
-		setpol(MPOL_INTERLEAVE, *mask);
+		setpol(MPOL_INTERLEAVE, bmp);
 } 
 
-nodemask_t numa_get_interleave_mask(void)
+struct bitmask *
+numa_get_interleave_mask(void)
 { 
 	int oldpolicy;
-	nodemask_t mask; 
-	getpol(&oldpolicy, &mask); 
+	struct bitmask *bmp;
+
+	bmp = allocate_nodemask();
+	getpol(&oldpolicy, bmp);
 	if (oldpolicy == MPOL_INTERLEAVE)
-		return mask;
+		return bmp;
 	return numa_no_nodes; 
 } 
 
@@ -433,7 +754,7 @@ nodemask_t numa_get_interleave_mask(void
 int numa_get_interleave_node(void)
 { 
 	int nd;
-	if (get_mempolicy_int(&nd, NULL, 0, 0, MPOL_F_NODE) == 0)
+	if (get_mempolicy(&nd, NULL, 0, 0, MPOL_F_NODE) == 0)
 		return nd;
 	return 0;	
 } 
@@ -441,14 +762,15 @@ int numa_get_interleave_node(void)
 void *numa_alloc_onnode(size_t size, int node) 
 { 
 	char *mem; 
-	nodemask_t nodes;
-	nodemask_zero(&nodes); 
-	nodemask_set(&nodes, node); 
+	struct bitmask *bmp;
+
+	bmp = allocate_nodemask();
+	bitmask_setbit(bmp, node);
 	mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS,
 		   0, 0);  
 	if (mem == (char *)-1)
 		return NULL;		
-	dombind(mem, size, bind_policy, &nodes); 
+	dombind(mem, size, bind_policy, bmp);
 	return mem; 	
 } 
 
@@ -471,20 +793,23 @@ void numa_set_bind_policy(int strict) 
 		bind_policy = MPOL_PREFERRED;
 } 
 
-void numa_set_membind(const nodemask_t *mask) 
+void numa_set_membind(struct bitmask *bmp)
 { 
-	setpol(MPOL_BIND, *mask);
+	setpol(MPOL_BIND, bmp);
 } 
 
 make_internal_alias(numa_set_membind);
 
-nodemask_t numa_get_membind(void)
+struct bitmask *
+numa_get_membind(void)
 {
 	int oldpolicy;
-	nodemask_t nodes;
-	getpol(&oldpolicy, &nodes);
+	struct bitmask *bmp;
+
+	bmp = allocate_nodemask();
+	getpol(&oldpolicy, bmp);
 	if (oldpolicy == MPOL_BIND)
-		return nodes;
+		return bmp;
 	return numa_all_nodes;	
 } 
 
@@ -493,21 +818,21 @@ void numa_free(void *mem, size_t size)
 	munmap(mem, size); 
 } 
 
-static unsigned long *node_cpu_mask[NUMA_NUM_NODES];  
-
-static void bad_cpumap(int ncpus, unsigned long *mask)
+static void bad_cpumap(struct bitmask *mask)
 {
 	int n;
-	for (n = 0; n < ncpus; n++)
-		set_bit(n, mask);
+
+	for (n = 0; n < mask->size; n++)
+		bitmask_setbit(mask, n);
 }
 
-int numa_parse_bitmap(char *line, unsigned long *mask, int ncpus)
+int numa_parse_bitmap(char *line, struct bitmask *mask)
 {
-	int i;
+	int i, ncpus;
 	char *p = strchr(line, '\n'); 
 	if (!p)
 		return -1;
+	ncpus = mask->size;
 
 	for (i = 0; p > line;i++) {
 		char *oldp, *endp; 
@@ -526,70 +851,90 @@ int numa_parse_bitmap(char *line, unsign
 		if (*p == ',')
 			p++;
 		if (i >= CPU_LONGS(ncpus))
-			return 0; /* filled the mask */
-		mask[i] = strtoul(p, &endp, 16);
+			return -1;
+		mask->maskp[i] = strtoul(p, &endp, 16);
 		if (endp != oldp)
-			return 0;  /* we filled the mask */
+			return -1;
 		p--;
 	}
 	return 0;
 }
 
+void
+init_node_cpu_mask()
+{
+	int nnodes = numa_max_node()+1;
+	node_cpu_mask = calloc (nnodes, sizeof(struct bitmask *));
+}
+
+/*
+ * test whether a node has cpus
+ */
 /* This would be better with some locking, but I don't want to make libnuma
    dependent on pthreads right now. The races are relatively harmless. */
-int numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen) 
+/*
+ * deliver a bitmask of cpus representing the cpus on a given node
+ */
+int numa_node_to_cpus(int node, struct bitmask *buffer)
 {
-	int err = 0;
-	char fn[64];
+	int err = 0, bufferlen;
+	int nnodes = number_of_configured_nodes();
+	char fn[64], *line = NULL;
 	FILE *f; 
-	char *line = NULL; 
 	size_t len = 0; 
-	int buflen_needed;
-	unsigned long *mask;
-	int ncpus = number_of_configured_cpus();
+	struct bitmask *mask;
+
+	if (!node_cpu_mask)
+		init_node_cpu_mask();
 
-	buflen_needed = CPU_BYTES(ncpus);
-	if ((unsigned)node > maxnode || bufferlen < buflen_needed) { 
+	bufferlen = bitmask_nbytes(buffer);
+	if (node < 0 || node > nnodes-1) {	/* negative node would index node_cpu_mask[] out of bounds */
 		errno = ERANGE;
 		return -1;
 	}
-	if (bufferlen > buflen_needed)
-		memset(buffer, 0, bufferlen); 
+	bitmask_clearall(buffer);
+
 	if (node_cpu_mask[node]) { 
-		memcpy(buffer, node_cpu_mask[node], buflen_needed);
+		/* have already constructed a mask for this node */
+		if (buffer->size != node_cpu_mask[node]->size) {
+			printf ("map size mismatch; abort\n");
+			exit(1);
+		}
+		memcpy(buffer->maskp, node_cpu_mask[node]->maskp, bufferlen);
 		return 0;
 	}
 
-	mask = malloc(buflen_needed);
-	if (!mask) 
-		mask = (unsigned long *)buffer; 
-	memset(mask, 0, buflen_needed); 
+	/* need a new mask for this node */
+	mask = allocate_cpumask();
 
+	/* this is a kernel cpumask_t (see node_read_cpumap()) */
 	sprintf(fn, "/sys/devices/system/node/node%d/cpumap", node); 
 	f = fopen(fn, "r"); 
 	if (!f || getdelim(&line, &len, '\n', f) < 1) { 
 		numa_warn(W_nosysfs2,
 		   "/sys not mounted or invalid. Assuming one node: %s",
 			  strerror(errno)); 
-		bad_cpumap(ncpus, mask);
+		bad_cpumap(mask);
 		err = -1;
 	} 
 	if (f)
 		fclose(f);
 
-	if (line && numa_parse_bitmap(line, mask, ncpus) < 0) {
+	if (line && numa_parse_bitmap(line, mask) < 0) {
 		numa_warn(W_cpumap, "Cannot parse cpumap. Assuming one node");
-		bad_cpumap(ncpus, mask);
+		bad_cpumap(mask);
 		err = -1;
 	}
 
 	free(line);
-	memcpy(buffer, mask, buflen_needed);
+	memcpy(buffer->maskp, mask->maskp, bufferlen);
 
 	/* slightly racy, see above */ 
+	/* save the mask we created */
 	if (node_cpu_mask[node]) {
+		/* how could this be? */
 		if (mask != buffer)
-			free(mask); 	       
+			bitmask_free(mask);
 	} else {
 		node_cpu_mask[node] = mask; 
 	} 
@@ -598,88 +943,83 @@ int numa_node_to_cpus(int node, unsigned
 
 make_internal_alias(numa_node_to_cpus);
 
-int numa_run_on_node_mask(const nodemask_t *mask)
+/*
+ * Given a node mask (size of a kernel nodemask_t) (probably populated by
+ * a user argument list) set up a map of cpus (map "cpus") on those nodes.
+ * Then set affinity to those cpus.
+ */
+int numa_run_on_node_mask(struct bitmask *bmp)
 { 	
-	int ncpus = number_of_configured_cpus();
-	int i, k, err;
-	unsigned long cpus[CPU_LONGS(ncpus)], nodecpus[CPU_LONGS(ncpus)];
-	memset(cpus, 0, CPU_BYTES(ncpus));
-	for (i = 0; i < NUMA_NUM_NODES; i++) { 
-		if (mask->n[i / BITS_PER_LONG] == 0)
+	int ncpus, i, k, err;
+	struct bitmask *cpus, *nodecpus;
+
+	cpus = allocate_cpumask();
+	ncpus = cpus->size;
+	nodecpus = allocate_cpumask();
+
+	for (i = 0; i < bmp->size; i++) {
+		if (bmp->maskp[i / BITS_PER_LONG] == 0)
 			continue;
-		if (nodemask_isset(mask, i)) { 
-			if (numa_node_to_cpus_int(i, nodecpus, CPU_BYTES(ncpus)) < 0) { 
+		if (bitmask_isbitset(bmp, i)) {
+			if (numa_node_to_cpus_int(i, nodecpus) < 0) {
 				numa_warn(W_noderunmask, 
-					  "Cannot read node cpumask from sysfs");
+					"Cannot read node cpumask from sysfs");
 				continue;
 			}
 			for (k = 0; k < CPU_LONGS(ncpus); k++)
-				cpus[k] |= nodecpus[k];
+				cpus->maskp[k] |= nodecpus->maskp[k];
 		}	
 	}
-	err = numa_sched_setaffinity_int(0, CPU_BYTES(ncpus), cpus);
+	err = numa_sched_setaffinity_int(0, cpus);
+
+	/* used to have to consider that this could fail - it shouldn't now */
+	if (err < 0) {
+		printf ("numa_sched_setaffinity_int() failed; abort\n");
+		exit(1);
+	}
+	bitmask_free(cpus);
+	bitmask_free(nodecpus);
 
-	/* The sched_setaffinity API is broken because it expects
-	   the user to guess the kernel cpuset size. Do this in a
-	   brute force way. */
-	if (err < 0 && errno == EINVAL) { 
-		int savederrno = errno;
-		char *bigbuf;
-		static int size = -1;
-		if (size == -1) 
-			size = CPU_BYTES(ncpus) * 2; 
-		bigbuf = malloc(CPU_BUFFER_SIZE);
-		if (!bigbuf) {
-			errno = ENOMEM; 
-			return -1;
-		}
-		errno = savederrno;
-		while (size <= CPU_BUFFER_SIZE) { 
-			memcpy(bigbuf, cpus, CPU_BYTES(ncpus)); 
-			memset(bigbuf + CPU_BYTES(ncpus), 0,
-			       CPU_BUFFER_SIZE - CPU_BYTES(ncpus));
-			err = numa_sched_setaffinity_int(0, size, (unsigned long *)bigbuf);
-			if (err == 0 || errno != EINVAL)
-				break;
-			size *= 2;
-		}
-		savederrno = errno;
-		free(bigbuf);
-		errno = savederrno;
-	} 
 	return err;
 } 
 
 make_internal_alias(numa_run_on_node_mask);
 
-nodemask_t numa_get_run_node_mask(void)
+struct bitmask *
+numa_get_run_node_mask(void)
 { 
-	int ncpus = NUMA_NUM_NODES;
-	nodemask_t mask;
+	int ncpus = number_of_configured_cpus();
 	int i, k;
 	int max = numa_max_node_int();
-	unsigned long cpus[CPU_LONGS(ncpus)], nodecpus[CPU_LONGS(ncpus)];
+	struct bitmask *bmp, *cpus, *nodecpus;
+
+	bmp = allocate_nodemask();	/* result is indexed by node, not by cpu */
+	cpus = allocate_cpumask();
+	nodecpus = allocate_cpumask();
 
-	memset(cpus, 0, CPU_BYTES(ncpus));
-	nodemask_zero(&mask);
-	if (numa_sched_getaffinity_int(0, CPU_BYTES(ncpus), cpus) < 0) 
+	if (numa_sched_getaffinity_int(0, cpus) < 0)
 		return numa_no_nodes; 
+
 	for (i = 0; i <= max; i++) {
-		if (numa_node_to_cpus_int(i, nodecpus, CPU_BYTES(ncpus)) < 0) {
+		if (numa_node_to_cpus_int(i, nodecpus) < 0) {
 			/* It's possible for the node to not exist */
 			continue;
 		}
 		for (k = 0; k < CPU_LONGS(ncpus); k++) {
-			if (nodecpus[k] & cpus[k])
-				nodemask_set(&mask, i); 
+			if (nodecpus->maskp[k] & cpus->maskp[k])
+				bitmask_setbit(bmp, i);
 		}
 	}		
-	return mask;
+	return bmp;
 } 
 
-int numa_migrate_pages(int pid, const nodemask_t *fromnodes, const nodemask_t *tonodes)
+int
+numa_migrate_pages(int pid, struct bitmask *fromnodes, struct bitmask *tonodes)
 {
-	return migrate_pages(pid, NUMA_NUM_NODES + 1, &fromnodes->n[0], &tonodes->n[0]);
+	int numa_num_nodes = number_of_possible_nodes();
+
+	return migrate_pages(pid, numa_num_nodes + 1, fromnodes->maskp,
+							tonodes->maskp);
 }
 
 int numa_move_pages(int pid, unsigned long count,
@@ -690,36 +1030,37 @@ int numa_move_pages(int pid, unsigned lo
 
 int numa_run_on_node(int node)
 { 
-	int ncpus = number_of_configured_cpus();
-	unsigned long cpus[CPU_LONGS(ncpus)];
+	int numa_num_nodes = number_of_possible_nodes();
+	struct bitmask *cpus;
 
 	if (node == -1) {
-		int i;
-		memset(cpus, 0, CPU_BYTES(ncpus));
-		for (i = 0; i < ncpus; i++) 
-			cpus[i / BITS_PER_LONG] |= 1UL << (i%BITS_PER_LONG);
-	} else if (node < NUMA_NUM_NODES) {
-		if (numa_node_to_cpus_int(node, cpus, CPU_BYTES(ncpus)) < 0) {
-			numa_warn(W_noderunmask, "Cannot read node cpumask from sysfs");
+		cpus = allocate_cpumask();
+		bitmask_setall(cpus);
+	} else if (node < numa_num_nodes) {
+		cpus = allocate_cpumask();	/* numa_node_to_cpus needs a real buffer */
+		if (numa_node_to_cpus_int(node, cpus) < 0) {
+			numa_warn(W_noderunmask, "Cannot read node cpumask from sysfs");
 			return -1; 
 		} 		
 	} else { 
 		errno = EINVAL;
 		return -1; 
 	}
-	return numa_sched_setaffinity_int(0, CPU_BYTES(ncpus), cpus);
+	return numa_sched_setaffinity_int(0, cpus);
 } 
 
 int numa_preferred(void)
 { 
 	int policy;
-	nodemask_t nodes;
-	getpol(&policy, &nodes);
+	struct bitmask *bmp;
+
+	bmp = allocate_nodemask();
+	getpol(&policy, bmp);
 	if (policy == MPOL_PREFERRED || policy == MPOL_BIND) { 
 		int i;
-		int max = NUMA_NUM_NODES;
+		int max = number_of_possible_nodes();
 		for (i = 0; i < max ; i++) 
-			if (nodemask_isset(&nodes, i))
+			if (bitmask_isbitset(bmp, i))
 				return i; 
 	}
 	/* could read the current CPU from /proc/self/status. Probably 
@@ -729,29 +1070,30 @@ int numa_preferred(void)
 
 void numa_set_preferred(int node)
 { 
-	nodemask_t n;
-	if (node < 0) {
-		nodemask_t empty;
-		nodemask_zero(&empty);
-		setpol(MPOL_DEFAULT, empty);
-		return;
-	}
-	nodemask_zero(&n);
-	nodemask_set(&n, node); 
-	setpol(MPOL_PREFERRED, n);
+	struct bitmask *bmp;
+
+	bmp = allocate_nodemask();
+	if (node >= 0)
+		bitmask_setbit(bmp, node);
+	setpol(MPOL_PREFERRED, bmp);
+	bitmask_free(bmp);
+	return;
 } 
 
 void numa_set_localalloc(void) 
 {	
-	nodemask_t empty;
-	nodemask_zero(&empty);
-	setpol(MPOL_PREFERRED, empty);
+	struct bitmask *bmp;
+
+	bmp = allocate_nodemask();
+	setpol(MPOL_PREFERRED, bmp);
+	bitmask_free(bmp);
+	return;
 } 
 
-void numa_bind(const nodemask_t *nodemask)
+void numa_bind(struct bitmask *bmp)
 {
-	numa_run_on_node_mask_int(nodemask); 
-	numa_set_membind_int(nodemask);
+	numa_run_on_node_mask_int(bmp);
+	numa_set_membind_int(bmp);
 }
 
 void numa_set_strict(int flag)
Index: numactl-1.0.2/shm.c
===================================================================
--- numactl-1.0.2.orig/shm.c
+++ numactl-1.0.2/shm.c
@@ -160,7 +160,7 @@ void attach_shared(char *name)
 } 
 
 static void 
-dumppol(unsigned long long start, unsigned long long end, int pol, nodemask_t mask)
+dumppol(unsigned long long start, unsigned long long end, int pol, struct bitmask *mask)
 { 
 	if (pol == MPOL_DEFAULT)
 		return;
@@ -168,13 +168,13 @@ dumppol(unsigned long long start, unsign
 	       shmoffset+start, 
 	       shmoffset+end, 
 	       policy_name(pol));
-	printmask("", &mask);
+	printmask("", mask);
 } 
 
 /* Dump policies in a shared memory segment. */
 void dump_shm(void) 
 { 
-	nodemask_t nodes, prevnodes;
+	struct bitmask *nodes, *prevnodes;
 	int prevpol = -1, pol; 
 	unsigned long long c, start; 
 
@@ -184,9 +184,12 @@ void dump_shm(void) 
 		return;
 	}
 
+	nodes = allocate_nodemask();
+	prevnodes = allocate_nodemask();
+
 	for (c = 0; c < shmlen; c += shm_pagesize) { 
-		if (get_mempolicy(&pol, nodes.n, NUMA_NUM_NODES+1, c+shmptr, 
-				  MPOL_F_ADDR) < 0) 
+		if (get_mempolicy(&pol, nodes->maskp, nodes->size, c+shmptr,
+						MPOL_F_ADDR) < 0)
 			err("get_mempolicy on shm");
 		if (pol == prevpol) 
 			continue;
@@ -210,42 +213,47 @@ static void vwarn(char *ptr, char *fmt, 
 	exitcode = 1;
 } 
 
-static unsigned interleave_next(unsigned cur, nodemask_t *mask)
+static unsigned interleave_next(unsigned cur, struct bitmask *mask)
 {
+	int numa_num_nodes = number_of_possible_nodes();
+
 	++cur;
 	while (!nodemask_isset(mask, cur)) { 		
-		cur = (cur+1) % NUMA_NUM_NODES;
+		cur = (cur+1) % numa_num_nodes;
 	} 
 	return cur;
 }
 
 /* Verify policy in a shared memory segment */
-void verify_shm(int policy, nodemask_t nodes)
+void verify_shm(int policy, struct bitmask *nodes)
 {
 	char *p; 
 	int ilnode, node;
 	int pol2;
-	nodemask_t nodes2;
+	struct bitmask *nodes2;
+
+	nodes2 = allocate_nodemask();
 	
 	if (policy == MPOL_INTERLEAVE) {
-		if (get_mempolicy(&ilnode, NULL, 0, shmptr, MPOL_F_ADDR|MPOL_F_NODE) 
+		if (get_mempolicy(&ilnode, NULL, 0, shmptr,
+					MPOL_F_ADDR|MPOL_F_NODE)
 		    < 0) 
 			err("get_mempolicy");
 	} 
 	
 	for (p = shmptr; p - (char *)shmptr < shmlen; p += shm_pagesize) { 
-		if (get_mempolicy(&pol2, nodes2.n, NUMA_NUM_NODES, p, MPOL_F_ADDR)
-		    < 0) 
+		if (get_mempolicy(&pol2, nodes2->maskp, nodes2->size, p,
+							MPOL_F_ADDR) < 0)
 			err("get_mempolicy");
 		if (pol2 != policy) { 
 			vwarn(p, "wrong policy %s, expected %s\n", 
 			      policy_name(pol2), policy_name(policy));
 			return;
 		}
-		if (memcmp(&nodes2, &nodes, sizeof(nodemask_t))) { 
+		if (memcmp(nodes2->maskp, nodes->maskp, bitmask_nbytes(nodes))) { /* compare mask words, not headers */
 			vwarn(p, "mismatched node mask\n"); 
-			printmask("expected", &nodes);
-			printmask("real", &nodes2);
+			printmask("expected", nodes);
+			printmask("real", nodes2);
 		} 
 
 		if (get_mempolicy(&node, NULL, 0, p, MPOL_F_ADDR|MPOL_F_NODE) < 0) 
@@ -253,20 +261,20 @@ void verify_shm(int policy, nodemask_t n
 
 		switch (policy) { 
 		case MPOL_INTERLEAVE: 
-			if (node < 0 || !nodemask_isset(&nodes2, node))
+			if (node < 0 || !nodemask_isset(nodes2, node))
 				vwarn(p, "interleave node out of range %d\n", node);
 			if (node != ilnode) { 
 				vwarn(p, "expected interleave node %d, got %d\n",
 				     ilnode,node); 
 				return;
 			}
-			ilnode = interleave_next(ilnode, &nodes2); 
+			ilnode = interleave_next(ilnode, nodes2);
 			break;
 		case MPOL_PREFERRED:
 		case MPOL_BIND:
-			if (!nodemask_isset(&nodes2, node)) {
+			if (!nodemask_isset(nodes2, node)) {
 				vwarn(p, "unexpected node %d\n", node);
-				printmask("expected", &nodes2);
+				printmask("expected", nodes2);
 			}	
 			break;
 
Index: numactl-1.0.2/syscall.c
===================================================================
--- numactl-1.0.2.orig/syscall.c
+++ numactl-1.0.2/syscall.c
@@ -17,6 +17,7 @@
 #include <sys/types.h>
 #include <asm/unistd.h>
 #include <errno.h>
+#include "numa.h"
 #include "numaif.h"
 #include "numaint.h"
 
@@ -132,23 +133,26 @@ long syscall6(long call, long a, long b,
 #define syscall6 syscall
 #endif
 
-long WEAK get_mempolicy(int *policy, 
-		   const unsigned long *nmask, unsigned long maxnode,
-		   void *addr, int flags)          
+long WEAK get_mempolicy(int *policy, const unsigned long *nmask,
+				unsigned long maxnode, void *addr, int flags)
 {
-	return syscall(__NR_get_mempolicy, policy, nmask, maxnode, addr, flags);
+	return syscall(__NR_get_mempolicy, policy, nmask,
+					maxnode, addr, flags);
 }
 
 long WEAK mbind(void *start, unsigned long len, int mode, 
-	   const unsigned long *nmask, unsigned long maxnode, unsigned flags) 
+	const unsigned long *nmask, unsigned long maxnode, unsigned flags)
 {
-	return syscall6(__NR_mbind, (long)start, len, mode, (long)nmask, maxnode, flags); 
+	/* syscall6() may be a real function taking longs, so cast the pointers */
+	return syscall6(__NR_mbind, (long)start, len, mode, (long)nmask, maxnode, flags);
 }
 
 long WEAK set_mempolicy(int mode, const unsigned long *nmask, 
                                    unsigned long maxnode)
 {
-	return syscall(__NR_set_mempolicy,mode,nmask,maxnode);
+	long i;
+	i = syscall(__NR_set_mempolicy,mode,nmask,maxnode);
+	return i;
 }
 
 long WEAK migrate_pages(int pid, unsigned long maxnode,
@@ -165,14 +169,18 @@ long WEAK move_pages(int pid, unsigned l
 
 /* SLES8 glibc doesn't define those */
 
-int numa_sched_setaffinity(pid_t pid, unsigned len, const unsigned long *mask)
+int numa_sched_setaffinity(pid_t pid, struct bitmask *mask)
 {
-	return syscall(__NR_sched_setaffinity,pid,len,mask);
+	/* hand the kernel the mask length in bytes and the raw mask words */
+	return syscall(__NR_sched_setaffinity, pid, bitmask_nbytes(mask), mask->maskp);
 }
 
-int numa_sched_getaffinity(pid_t pid, unsigned len, const unsigned long *mask)
+int numa_sched_getaffinity(pid_t pid, struct bitmask *mask)
 {
-	return syscall(__NR_sched_getaffinity,pid,len,mask);
+	/* the length handed to the kernel is bitmask_nbytes(mask), i.e. in bytes */
+	return syscall(__NR_sched_getaffinity, pid, bitmask_nbytes(mask),
+								mask->maskp);
+	/* note: on success the raw syscall returns sizeof(cpumask_t), not 0 */
 
 }
 
Index: numactl-1.0.2/util.c
===================================================================
--- numactl-1.0.2.orig/util.c
+++ numactl-1.0.2/util.c
@@ -25,26 +25,13 @@
 #include <errno.h>
 #include <unistd.h>
 
-void printmask(char *name, nodemask_t *mask)
+void printmask(char *name, struct bitmask *mask)
 { 
 	int i;
-	int max = numa_max_node();
+	/* print "name: " then every set bit; valid bits are 0 .. size-1 */
 	printf("%s: ", name); 
-#if 0
-	int full = 1;
-	for (i = 0; i <= max; i++) { 
-		if (nodemask_isset(&numa_all_nodes, i) && !nodemask_isset(mask, i)) {
-			full = 0;
-			break;
-		}		
-	} 
-	if (full) { 
-		printf("all nodes\n"); 
-		return;
-	}	
-#endif
-	for (i = 0; i <= max; i++) 
-		if (nodemask_isset(mask, i))
+	for (i = 0; i < mask->size; i++)
+		if (bitmask_isbitset(mask, i))
 			printf("%d ", i); 
 	putchar('\n');
 } 
@@ -52,82 +39,90 @@ void printmask(char *name, nodemask_t *m
 /*
  * Extract a node or processor number from the given string.
  * Allow a relative node / processor specification within the allowed
- * set if a + is prepended to the number.
+ * set (bmp) if "relative" is nonzero; 0 then means the first set bit of bmp.
  */
-unsigned long get_nr(char *s, char **end, int max, unsigned long *mask)
+unsigned long get_nr(char *s, char **end, struct bitmask *bmp, int relative)
 {
-	unsigned long i, nr;
+	long i, nr;
 
-	if (*s != '+')
+	if (!relative)
 		return strtoul(s, end, 0);
-	s++;
+	/* relative: nr counts set bits of bmp; nothing parsed => s == *end */
 	nr = strtoul(s, end, 0);
 	if (s == *end)
 		return nr;
 	/* Find the nth set bit */
-	for (i = 0; nr > 0 && i <= max; i++)
-		if (test_bit(i, mask))
+	for (i = 0; nr >= 0 && i <= bmp->size; i++)
+		if (bitmask_isbitset(bmp, i))
 			nr--;
-	if (nr)
-		*end = s;
-	return i;
-
+	return i-1;	/* i is one past the bit found (presumably bmp->size if none) */
 }
 
-int numcpus; 
-extern unsigned long numa_all_cpus[];
-extern int maxcpus;
-
-/* caller must free buffer */
-unsigned long *cpumask(char *s, int *ncpus) 
+/*
+ * cpumask() is called to create a cpu mask (a struct bitmask), given
+ * an ASCII string such as 25 or 12-15 or 1,3,5-7 or +6-10.
+ * (the + indicates that the numbers are cpuset-relative)
+ * A leading ! inverts the resulting mask.
+ *
+ * The cpus may be specified as absolute, or relative to the current cpuset.
+ * The list of available cpus for this task is in map "numa_all_cpus",
+ * which may represent all cpus or the cpus in the current cpuset.
+ * (it is set up by read_constraints() from the current task's Cpus_allowed)
+ * The caller must free the returned cpubuf buffer.
+ */
+struct bitmask *
+cpumask(char *s, int *ncpus)
 {
 	int invert = 0, relative=0;
+	int conf_cpus = number_of_configured_cpus();
+	int task_cpus = number_of_task_cpus();
 	char *end; 
+	struct bitmask *cpubuf;
 
-	int cpubufsize = round_up(maxcpus, BITS_PER_LONG) / 8;
-	unsigned long *cpubuf = calloc(cpubufsize,1); 
-	if (!cpubuf) 
-		complain("Out of memory");
+	cpubuf = allocate_cpumask();
 
 	if (s[0] == 0) 
 		return cpubuf;
 	if (*s == '!') { 
 		invert = 1;
-		++s;
+		s++;
+	}
+	if (*s == '+') {
+		relative++;
+		s++;
 	}
-	do {		
+	do {
 		unsigned long arg;
+		int i;
 
 		if (!strcmp(s,"all")) { 
 			int i;
-			for (i = 0; i < numcpus; i++)
-				set_bit(i, cpubuf);
+			for (i = 0; i < task_cpus; i++)
+				bitmask_setbit(cpubuf, i);
+			s+=4;
 			break;
 		}
-		if (*s == '+') relative++;
-		arg = get_nr(s, &end, maxcpus, numa_all_cpus);
+		arg = get_nr(s, &end, numa_all_cpus, relative);
 		if (end == s)
-			complain("unparseable node description `%s'\n", s);
-		if (arg > maxcpus)
-			complain("cpu argument %d is out of range\n", arg);
-		set_bit(arg, cpubuf);
+			complain("unparseable cpu description `%s'\n", s);
+		if (arg >= task_cpus)
+			complain("cpu argument %s is out of range\n", s);
+		i = arg;
+		bitmask_setbit(cpubuf, i);
 		s = end; 
 		if (*s == '-') {
 			char *end2;
 			unsigned long arg2;
-			if (relative && *(s+1) != '+') {
-				*s = '+';
-				arg2 = get_nr(s,&end2,maxcpus,numa_all_cpus);
-			} else {
-				arg2 = get_nr(++s,&end2,maxcpus,numa_all_cpus);
-			}
+			int i;
+			arg2 = get_nr(++s, &end2, numa_all_cpus, relative);
 			if (end2 == s)
 				complain("missing cpu argument %s\n", s);
-			if (arg2 > maxcpus)
-				complain("cpu argument %d out of range\n",arg2);
+			if (arg2 >= task_cpus)
+				complain("cpu argument %s out of range\n", s);
 			while (arg <= arg2) {
-				if (test_bit(arg, numa_all_cpus))
-					set_bit(arg, cpubuf);
+				i = arg;
+				if (bitmask_isbitset(numa_all_cpus, i))
+					bitmask_setbit(cpubuf, i);
 				arg++;
 			}
 			s = end2;
@@ -137,67 +132,92 @@ unsigned long *cpumask(char *s, int *ncp
 		usage();
 	if (invert) { 
 		int i;
-		for (i = 0; i <= maxcpus; i++) {
-			if (test_bit(i, cpubuf))
-				clear_bit(i, cpubuf);
+		for (i = 0; i < conf_cpus; i++) {
+			if (bitmask_isbitset(cpubuf, i))
+				bitmask_clearbit(cpubuf, i);
 			else
-				set_bit(i, cpubuf);
+				bitmask_setbit(cpubuf, i);
 		}
 	} 
-	*ncpus = cpubufsize;
+	*ncpus = cpubuf->size;
 	return cpubuf;	
 }
 
-void printcpumask(char *name, unsigned long *mask, int size)
+void printcpumask(char *name, struct bitmask *mask)
 { 
 	int i;
 	printf("%s: ", name);
-	for (i = 0; i < size*8; i++) {
-		if (test_bit(i, mask))
+	for (i = 0; i < mask->size; i++) {
+		if (bitmask_isbitset(mask, i))
 			printf("%d ", i);
 	}
 	putchar('\n');
 } 
 
-nodemask_t nodemask(char *s) 
-{ 
-	int max = numa_max_node();
-	nodemask_t mask;
-	int invert = 0;
+/*
+ * nodemask() is called to create a node mask, given
+ * an ascii string such as 25 or 12-15 or 1,3,5-7 or +6-10.
+ * (the + indicates that the numbers are cpuset-relative; a leading !
+ * inverts the resulting mask)
+ * The nodes may be specified as absolute, or relative to the current cpuset.
+ * The list of available nodes is in map "numa_all_nodes",
+ * which may represent all nodes or the nodes in the current cpuset.
+ * (it is set up by read_constraints() from the current task's Mems_allowed)
+ *
+ * Always returns a newly allocated mask, which the caller must free.
+ */
+struct bitmask *
+nodemask(char *s)
+{
+	int maxnode = numa_max_node();
+	int invert = 0, relative = 0;
+	int conf_nodes = number_of_configured_nodes();
+	int task_nodes = number_of_task_nodes();
 	char *end; 
-	nodemask_zero(&mask);
+	struct bitmask *nodebuf;
+
+	nodebuf = allocate_nodemask();
+
 	if (s[0] == 0) 
-		return numa_no_nodes; 
+		return nodebuf;
 	if (*s == '!') { 
 		invert = 1;
-		++s;
+		s++;
+	}
+	if (*s == '+') {
+		relative++;
+		s++;
 	}
-	do {		
+	do {
 		unsigned long arg;
-
+		int i;
 		if (!strcmp(s,"all")) { 
-			s += 4;
-			mask = numa_all_nodes;
+			/* "all" selects nodes 0 .. task_nodes-1 */
+			for (i = 0; i < task_nodes; i++)
+				bitmask_setbit(nodebuf, i);
+			s+=4;
 			break;
 		}
-		arg = get_nr(s, &end, max, (unsigned long *)numa_all_nodes.n);
+		arg = get_nr(s, &end, numa_all_nodes, relative);
 		if (end == s)
 			complain("unparseable node description `%s'\n", s);
-		if (arg > max)
+		if (arg > maxnode)
-			complain("node argument %d is out of range\n", arg);
+			complain("node argument %lu is out of range\n", arg);
-		nodemask_set(&mask, arg);
+		i = arg;
+		bitmask_setbit(nodebuf, i);
 		s = end; 
 		if (*s == '-') { 
 			char *end2;
-			unsigned long arg2 = get_nr(++s, &end2, max,
-					(unsigned long *)numa_all_nodes.n);
+			unsigned long arg2;
+			arg2 = get_nr(++s, &end2, numa_all_nodes, relative);
 			if (end2 == s)
-				complain("missing cpu argument %s\n", s);
-			if (arg2 > max)
-				complain("node argument %d out of range\n",arg2);
+				complain("missing node argument %s\n", s);
+			if (arg2 >= task_nodes)
+				complain("node argument %lu out of range\n", arg2);
 			while (arg <= arg2) {
-				if (nodemask_isset(&numa_all_nodes, arg))
-					nodemask_set(&mask, arg);
+				i = arg;
+				if (bitmask_isbitset(numa_all_nodes, i))
+					bitmask_setbit(nodebuf, i);
 				arg++;
 			}
 			s = end2;
@@ -207,17 +227,15 @@ nodemask_t nodemask(char *s) 
 		usage();
 	if (invert) { 
 		int i;
-		for (i = 0; i <= max; i++) {
-			if (!nodemask_isset(&numa_all_nodes, i))
-				continue;
-			if (nodemask_isset(&mask, i))
-				nodemask_clr(&mask, i);
+		for (i = 0; i < conf_nodes; i++) {
+			if (bitmask_isbitset(nodebuf, i))
+				bitmask_clearbit(nodebuf, i);
 			else
-				nodemask_set(&mask, i); 
+				bitmask_setbit(nodebuf, i);
 		}
 	} 
-	return mask;
-} 
+	return nodebuf;
+}
 
 void complain(char *fmt, ...)
 {
@@ -306,4 +324,3 @@ void print_policies(void)
 		printf(" %s", policies[i].name);
 	printf("\n"); 
 }
-
Index: numactl-1.0.2/Makefile
===================================================================
--- numactl-1.0.2.orig/Makefile
+++ numactl-1.0.2/Makefile
@@ -24,7 +24,7 @@ CLEANFILES := numactl.o libnuma.o numact
 	      .depend .depend.X test/nodemap test/distance test/tbitmap \
 	      test/after test/before threadtest test_move_pages \
 	      test/mbind_mig_pages test/migrate_pages \
-	      migratepages migspeed
+	      migratepages migspeed migspeed.o
 SOURCES := bitops.c libnuma.c distance.c memhog.c numactl.c numademo.c \
 	numamon.c shm.c stream_lib.c stream_main.c syscall.c util.c mt.c \
 	test/*.c
Index: numactl-1.0.2/numa.h
===================================================================
--- numactl-1.0.2.orig/numa.h
+++ numactl-1.0.2/numa.h
@@ -22,52 +22,51 @@
 #include <stddef.h>
 #include <string.h>
 
-#if defined(__x86_64__) || defined(__i386__) 
-#define NUMA_NUM_NODES 	128
-#else
-#define NUMA_NUM_NODES	2048
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef struct { 
-	unsigned long n[NUMA_NUM_NODES/(sizeof(unsigned long)*8)];
-} nodemask_t;
+struct bitmask {
+	unsigned long size; /* number of bits in the map */
+	unsigned long *maskp;
+};
+
+#define howmany(x,y) (((x)+((y)-1))/(y))	/* x/y rounded up */
+#define bitsperlong (8 * sizeof(unsigned long))
+#define longsperbits(n) howmany(n, bitsperlong)	/* longs needed for n bits */
+#define bytesperbits(x) (((x)+7)/8)	/* bytes needed for x bits */
+
+int bitmask_isbitset(const struct bitmask *, unsigned int);
+struct bitmask *bitmask_clearall(struct bitmask *);
+struct bitmask *bitmask_setbit(struct bitmask *, unsigned int);
+struct bitmask *bitmask_clearbit(struct bitmask *, unsigned int);
+unsigned int bitmask_nbytes(struct bitmask *);
+struct bitmask *bitmask_alloc(unsigned int);
+void bitmask_free(struct bitmask *);
+int bitmask_equal(const struct bitmask *, const struct bitmask *);
 
-static inline void nodemask_zero(nodemask_t *mask)
+static inline void nodemask_zero(struct bitmask *mask)
 { 
-	memset(mask->n, 0, sizeof(mask->n)); 
+	bitmask_clearall(mask);
 } 
 
-static inline void nodemask_set(nodemask_t *mask, int node)
+static inline void nodemask_set(struct bitmask *mask, int node)
 {
-	mask->n[node / (8*sizeof(unsigned long))] |=
-		(1UL<<(node%(8*sizeof(unsigned long))));		
+	bitmask_setbit(mask, node);
 } 
 
-static inline void nodemask_clr(nodemask_t *mask, int node)
+static inline void nodemask_clr(struct bitmask *mask, int node)
 {
-	mask->n[node / (8*sizeof(unsigned long))] &= 
-		~(1UL<<(node%(8*sizeof(unsigned long))));	
+	bitmask_clearbit(mask, node);
 }
-static inline int nodemask_isset(const nodemask_t *mask, int node)
+/* Test bit "node" of mask; the mask is only read, hence const. */
+static inline int nodemask_isset(const struct bitmask *mask, int node)
 {
-	if ((unsigned)node >= NUMA_NUM_NODES)
-		return 0;
-	if (mask->n[node / (8*sizeof(unsigned long))] & 
-		(1UL<<(node%(8*sizeof(unsigned long)))))
-		return 1;
-	return 0;	
+	return bitmask_isbitset(mask, node);
 }
-static inline int nodemask_equal(const nodemask_t *a, const nodemask_t *b) 
+static inline int nodemask_equal(const struct bitmask *a, const struct bitmask *b)
 { 
-	int i;
-	for (i = 0; i < NUMA_NUM_NODES/(sizeof(unsigned long)*8); i++) 
-		if (a->n[i] != b->n[i]) 
-			return 0; 
-	return 1;
+	return bitmask_equal(a, b);	/* neither mask is modified */
 } 
 
 /* NUMA support available. If this returns a negative value all other function
@@ -88,25 +87,34 @@ long numa_node_size(int node, long *free
 int numa_pagesize(void); 
 
 /* Set with all nodes. Only valid after numa_available. */
-extern const nodemask_t numa_all_nodes;
+extern struct bitmask *numa_all_nodes;
 
 /* Set with no nodes */
-extern const nodemask_t numa_no_nodes;
+extern struct bitmask *numa_no_nodes;
 
 /* Only run and allocate memory from a specific set of nodes. */
-void numa_bind(const nodemask_t *nodes); 
+void numa_bind(struct bitmask *nodes);
+
 /* Set the NUMA node interleaving mask. 0 to turn off interleaving */
-void numa_set_interleave_mask(const nodemask_t *nodemask); 
+void numa_set_interleave_mask(struct bitmask *nodemask);
+
 /* Return the current interleaving mask */
-nodemask_t numa_get_interleave_mask(void);
+struct bitmask *numa_get_interleave_mask(void);
+
+/* allocate a bitmask big enough for all nodes */
+struct bitmask *allocate_nodemask(void);
+
 /* Some node to preferably allocate memory from for thread. */
 void numa_set_preferred(int node);
+
 /* Set local memory allocation policy for thread */
 void numa_set_localalloc(void);
+
 /* Only allocate memory from the nodes set in mask. 0 to turn off */
-void numa_set_membind(const nodemask_t *nodemask); 
+void numa_set_membind(struct bitmask *nodemask);
+
 /* Return current membind */ 
-nodemask_t numa_get_membind(void);
+struct bitmask *numa_get_membind(void);
 
 int numa_get_interleave_node(void);
 
@@ -114,7 +122,7 @@ int numa_get_interleave_node(void);
    and are relatively slow. */
 
 /* Alloc memory page interleaved on nodes in mask */ 
-void *numa_alloc_interleaved_subset(size_t size, const nodemask_t *nodemask);
+void *numa_alloc_interleaved_subset(size_t size, struct bitmask *nodemask);
 /* Alloc memory page interleaved on all nodes. */
 void *numa_alloc_interleaved(size_t size);
 /* Alloc memory located on node */
@@ -130,13 +138,13 @@ void numa_free(void *mem, size_t size);
    processed by these must not be touched yet */
 
 /* Interleave an memory area. */
-void numa_interleave_memory(void *mem, size_t size, const nodemask_t *mask);
+void numa_interleave_memory(void *mem, size_t size, struct bitmask *mask);
 
 /* Allocate a memory area on a specific node. */
 void numa_tonode_memory(void *start, size_t size, int node);
 
 /* Allocate memory on a mask of nodes. */
-void numa_tonodemask_memory(void *mem, size_t size, const nodemask_t *mask);
+void numa_tonodemask_memory(void *mem, size_t size, struct bitmask *mask);
 
 /* Allocate a memory area on the current node. */
 void numa_setlocal_memory(void *start, size_t size);
@@ -145,11 +153,11 @@ void numa_setlocal_memory(void *start, s
 void numa_police_memory(void *start, size_t size);
 
 /* Run current thread only on nodes in mask */
-int numa_run_on_node_mask(const nodemask_t *mask);
+int numa_run_on_node_mask(struct bitmask *mask);
 /* Run current thread only on node */
 int numa_run_on_node(int node);
 /* Return current mask of nodes the thread can run on */
-nodemask_t numa_get_run_node_mask(void);
+struct bitmask * numa_get_run_node_mask(void);
 
 /* When strict fail allocation when memory cannot be allocated in target node(s). */
 void numa_set_bind_policy(int strict);  
@@ -157,8 +165,35 @@ void numa_set_bind_policy(int strict);  
 /* Fail when existing memory has incompatible policy */
 void numa_set_strict(int flag);
 
+/* maximum nodes (size of kernel nodemask_t) */
+int number_of_possible_nodes(void);
+
+/* maximum cpus (size of kernel cpumask_t) */
+int number_of_possible_cpus(void);
+
+/* nodes in the system */
+int number_of_configured_nodes(void);
+
+/* maximum cpus */
+int number_of_configured_cpus(void);
+
+/* maximum cpus allowed to current task */
+int number_of_task_cpus(void);
+
+/* maximum nodes allowed to current task */
+int number_of_task_nodes(void);
+
+/* allocate a bitmask the size of the kernel cpumask_t */
+struct bitmask *allocate_cpumask(void);
+
+/* allocate a bitmask the size of the kernel nodemask_t */
+struct bitmask *allocate_nodemask(void);
+
+/* set up to represent the cpus available to the current task (defined in libnuma.c) */
+extern struct bitmask *numa_all_cpus;
+
 /* Convert node to CPU mask. -1/errno on failure, otherwise 0. */
-int numa_node_to_cpus(int node, unsigned long *buffer, int buffer_len);
+int numa_node_to_cpus(int node, struct bitmask *buffer);
 
 /* Report distance of node1 from node2. 0 on error.*/
 int numa_distance(int node1, int node2);
@@ -176,11 +211,12 @@ extern int numa_exit_on_error;
    once. */
 void numa_warn(int num, char *fmt, ...);
 
-int numa_migrate_pages(int pid, const nodemask_t *from, const nodemask_t *to);
+int numa_migrate_pages(int pid, struct bitmask *from, struct bitmask *to);
 
 int numa_move_pages(int pid, unsigned long count, void **pages,
 		const int *nodes, int *status, int flags);
 
+
 #ifdef __cplusplus
 }
 #endif
Index: numactl-1.0.2/numaif.h
===================================================================
--- numactl-1.0.2.orig/numaif.h
+++ numactl-1.0.2/numaif.h
@@ -8,11 +8,10 @@ extern "C" { 
 /* Kernel interface for NUMA API */
 
 /* System calls */
-extern long get_mempolicy(int *policy, 
-			  const unsigned long *nmask, unsigned long maxnode,
-			  void *addr, int flags);
+extern long get_mempolicy(int *policy, const unsigned long *nmask,
+			unsigned long maxnode, void *addr, int flags);
 extern long mbind(void *start, unsigned long len, int mode, 
-		  const unsigned long *nmask, unsigned long maxnode, unsigned flags);
+	const unsigned long *nmask, unsigned long maxnode, unsigned flags);
 extern long set_mempolicy(int mode, const unsigned long *nmask, 
 			  unsigned long maxnode);
 extern long migratepages(int pid, unsigned long maxnode, unsigned long *fromnode,
Index: numactl-1.0.2/numaint.h
===================================================================
--- numactl-1.0.2.orig/numaint.h
+++ numactl-1.0.2/numaint.h
@@ -1,14 +1,14 @@
 /* Internal interfaces of libnuma */
 #include "bitops.h"
 
-extern int numa_sched_setaffinity(pid_t pid, unsigned len, const unsigned long *mask);
-extern int numa_sched_getaffinity(pid_t pid, unsigned len, const unsigned long *mask);
-extern int numa_sched_setaffinity_int(pid_t pid, unsigned len,const unsigned long *mask);
-extern int numa_sched_getaffinity_int(pid_t pid, unsigned len,const unsigned long *mask);
-extern long get_mempolicy_int(int *policy, const unsigned long *nmask, 
-			      unsigned long maxnode, void *addr, int flags);
+extern int numa_sched_setaffinity(pid_t pid, struct bitmask *mask);
+extern int numa_sched_getaffinity(pid_t pid, struct bitmask *mask);
+extern int numa_sched_setaffinity_int(pid_t pid, struct bitmask *mask);
+extern int numa_sched_getaffinity_int(pid_t pid, struct bitmask *mask);
+extern long get_mempolicy_int(int *policy, const unsigned long *nmask,
+                              unsigned long maxnode, void *addr, int flags);
 extern long mbind_int(void *start, unsigned long len, int mode, 
-		  const unsigned long *nmask, unsigned long maxnode, unsigned flags);
+	  const unsigned long *nmask, unsigned long maxnode, unsigned flags);
 extern long set_mempolicy_int(int mode, const unsigned long *nmask, 
 			  unsigned long maxnode);
 extern long migrate_pages(int pid, unsigned long maxnode, const unsigned long *frommask,
Index: numactl-1.0.2/shm.h
===================================================================
--- numactl-1.0.2.orig/shm.h
+++ numactl-1.0.2/shm.h
@@ -8,9 +8,9 @@ extern unsigned long long shmoffset;
 extern int shmflags;
 
 extern void dump_shm(void);
-extern void attach_shared(char *name);
-extern void attach_sysvshm(char *name); 
-extern void verify_shm(int policy, nodemask_t nodes);
+extern void attach_shared(char *);
+extern void attach_sysvshm(char *);
+extern void verify_shm(int policy, struct bitmask *);
 
 /* in numactl.c */
 extern int exitcode;
Index: numactl-1.0.2/util.h
===================================================================
--- numactl-1.0.2.orig/util.h
+++ numactl-1.0.2/util.h
@@ -1,7 +1,7 @@
-extern void printmask(char *name, nodemask_t *mask);
-extern void printcpumask(char *name, unsigned long *mask, int len);
-extern nodemask_t nodemask(char *s);
-extern unsigned long *cpumask(char *s, int *ncpus);
+extern void printmask(char *name, struct bitmask *mask);
+extern void printcpumask(char *name, struct bitmask *mask);
+extern struct bitmask *nodemask(char *s);
+extern struct bitmask *cpumask(char *s, int *ncpus);
 extern int read_sysctl(char *name);
 extern void complain(char *fmt, ...);
 extern void nerror(char *fmt, ...);
