Blame SOURCES/numad-0.5git-version.patch

dafe70
diff -rup numad-0.5git/numad.8 numad-0.5git-new/numad.8
dafe70
--- numad-0.5git/numad.8	2012-12-03 15:40:40.000000000 +0100
dafe70
+++ numad-0.5git-new/numad.8	2016-08-30 08:45:19.000000000 +0200
dafe70
@@ -1,45 +1,56 @@
dafe70
 .TH "numad" "8" "1.0.0" "Bill Gray" "Administration"
dafe70
-.SH "numad"
dafe70
-.LP 
dafe70
+.SH "NAME"
dafe70
+.LP
dafe70
 numad \- A user\-level daemon that provides placement advice and process
dafe70
 management for efficient use of CPUs and memory on systems with NUMA topology.
dafe70
-.SH "SYNTAX"
dafe70
-.LP 
dafe70
+.SH "SYNOPSIS"
dafe70
+.LP
dafe70
 numad [\fI\-dhvV\fP]
dafe70
-.br 
dafe70
-.LP 
dafe70
-numad  [\fI\-D non-standard-cgroup-mount-point\fP]
dafe70
-.br 
dafe70
-.LP 
dafe70
+.br
dafe70
+.LP
dafe70
+numad  [\fI\-C 0|1\fP]
dafe70
+.br
dafe70
+.LP
dafe70
+numad  [\fI\-H THP_hugepage_scan_sleep_ms\fP]
dafe70
+.br
dafe70
+.LP
dafe70
 numad  [\fI\-i [min_interval:]max_interval\fP]
dafe70
-.br 
dafe70
-.LP 
dafe70
+.br
dafe70
+.LP
dafe70
 numad  [\fI\-K 0|1\fP]
dafe70
-.br 
dafe70
-.LP 
dafe70
+.br
dafe70
+.LP
dafe70
 numad  [\fI\-l log_level\fP]
dafe70
-.br 
dafe70
-.LP 
dafe70
+.br
dafe70
+.LP
dafe70
+numad  [\fI\-m target_memory_locality\fP]
dafe70
+.br
dafe70
+.LP
dafe70
 numad  [\fI\-p PID\fP]
dafe70
-.br 
dafe70
-.LP 
dafe70
+.br
dafe70
+.LP
dafe70
 numad  [\fI\-r PID\fP]
dafe70
-.br 
dafe70
-.LP 
dafe70
+.br
dafe70
+.LP
dafe70
+numad  [\fI\-R reserved-CPU-list\fP]
dafe70
+.br
dafe70
+.LP
dafe70
 numad  [\fI\-S 0|1\fP]
dafe70
-.br 
dafe70
-.LP 
dafe70
+.br
dafe70
+.LP
dafe70
+numad  [\fI\-t logical_CPU_percent\fP]
dafe70
+.br
dafe70
+.LP
dafe70
 numad  [\fI\-u target_utilization\fP]
dafe70
-.br 
dafe70
-.LP 
dafe70
+.br
dafe70
+.LP
dafe70
 numad  [\fI\-w NCPUS[:MB]\fP]
dafe70
-.br 
dafe70
-.LP 
dafe70
+.br
dafe70
+.LP
dafe70
 numad  [\fI\-x PID\fP]
dafe70
-.br 
dafe70
-
dafe70
+.br
dafe70
 .SH "DESCRIPTION"
dafe70
-.LP 
dafe70
+.LP
dafe70
 Numad is a system daemon that monitors NUMA topology and resource usage. It
dafe70
 will attempt to locate processes for efficient NUMA locality and affinity,
dafe70
 dynamically adjusting to changing system conditions.  Numad also provides
dafe70
@@ -53,25 +64,42 @@ large in-memory database application, fo
dafe70
 accesses will likely remain unpredictable -- numad will probably not improve
dafe70
 performance.
dafe70
 .SH "OPTIONS"
dafe70
-.LP 
dafe70
-.TP 
dafe70
+.LP
dafe70
+.TP
dafe70
+\fB\-C\fR <\fI0|1\fP>
dafe70
+This option controls whether or not numad treats inactive file cache as
dafe70
+available memory. By default, numad assumes it can count inactive file cache as
dafe70
+"free" memory when considering resources to match with processes.  Specify
dafe70
+\fI\-C 0\fP if numad should instead consider inactive file cache as a consumed
dafe70
+resource.
dafe70
+.TP
dafe70
 \fB\-d\fR
dafe70
 Debug output in log, sets the log level to LOG_DEBUG.  Same effect as \fI\-l 7\fP.
dafe70
 .TP
dafe70
-\fB\-D\fR <\fInon-standard-cgroup-mount-point\fP>
dafe70
-This option can be used to communicate a non-standard cgroup mount point to
dafe70
-numad.  This is not normally necessary.
dafe70
-.TP 
dafe70
 \fB\-h\fR
dafe70
 Display usage help information and then exit.
dafe70
-.TP 
dafe70
+.TP
dafe70
+\fB\-H\fR  <\fITHP_scan_sleep_ms\fP>
dafe70
+Set the desired transparent hugepage scan interval in ms.  The
dafe70
+.na
dafe70
+/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
dafe70
+.ad
dafe70
+tunable is usually set to 10000ms by the operating system.  The default is
dafe70
+changed by numad to be 1000ms since it is helpful for the hugepage daemon to be
dafe70
+more aggressive when memory moves between nodes.  Specifying (\fI\-H 0\fP) will
dafe70
+cause numad to retain the system default value.  You can also make the hugepage
dafe70
+daemon more or less aggressive by specifying an alternate value with this
dafe70
+option.  For example, setting this value to 100ms (\fI\-H 100\fP) might improve
dafe70
+the performance of workloads which use many transparent hugepages.
dafe70
+.TP
dafe70
 \fB\-i\fR <\fI[min_interval:]max_interval\fP>
dafe70
 Sets the time interval that numad waits between system scans, in seconds to
dafe70
 <\fImax_interval\fP>. Default <\fImax_interval\fP> is 15 seconds, default
dafe70
 <\fImin_interval\fP> is 5 seconds.  Setting a <\fImax_interval\fP> of zero will
dafe70
 cause the daemon to exit.  (This is the normal mechanism to terminate the
dafe70
 daemon.)  A bigger <\fImax_interval\fP> will decrease numad overhead but also
dafe70
-decrease responsiveness to changing loads.
dafe70
+decrease responsiveness to changing loads.  The default numad max_interval can
dafe70
+be changed in the numad.conf file.
dafe70
 .TP
dafe70
 \fB\-K\fR <\fI0|1\fP>
dafe70
 This option controls whether numad keeps interleaved memory spread across NUMA
dafe70
@@ -82,10 +110,24 @@ a large, single-instance application tha
dafe70
 the workload will have continuous unpredictable memory access patterns (e.g. a
dafe70
 large in-memory database), you might get better results by specifying \fI\-K
dafe70
 1\fP to instruct numad to keep interleaved memory distributed.
dafe70
-.TP 
dafe70
+.TP
dafe70
 \fB\-l\fR <\fIlog_level\fP>
dafe70
 Sets the log level to <\fIlog_level\fP>.  Reasonable choices are 5, 6, or 7.
dafe70
-The default value is 5.
dafe70
+The default value is 5.  Note that CPU values are scaled by a factor of 100
dafe70
+internally and in the numad log files.  Unfortunately, you don't actually have
dafe70
+that many CPUs.
dafe70
+.TP
dafe70
+\fB\-m\fR  <\fItarget_memory_locality\fP>
dafe70
+Set the desired memory locality threshold to stop moving process memory.  Numad
dafe70
+might stop retrying to coalesce process memory when more than this percentage
dafe70
+of the process's memory is already localized in the target node(s).  The
dafe70
+default is 90%. Numad will frequently localize more than the localization
dafe70
+threshold percent, but it will not necessarily do so.  Decrease the threshold
dafe70
+to allow numad to leave more process memory distributed on various nodes.
dafe70
+Increase the threshold to instruct numad to try to localize more memory.
dafe70
+Acceptable values are between 50 and 100 percent.  Note that setting the target
dafe70
+memory locality to 100% might cause numad to continually retry to move memory
dafe70
+that the kernel will never successfully move.
dafe70
 .TP
dafe70
 \fB\-p\fR <\fIPID\fP>
dafe70
 Add PID to explicit inclusion list of processes to consider for managing, if
dafe70
@@ -102,6 +144,12 @@ processes.  After daemon start, only one
dafe70
 process lists per subsequent numad invocation.  Use with \-S and \-p and \-x to
dafe70
 precisely control the scope of processes numad can manage.
dafe70
 .TP
dafe70
+\fB\-R\fR <\fICPU_LIST\fP>
dafe70
+Specify a list of CPUs that numad should assume are reserved for non-numad use.
dafe70
+No processes will be bound to the specified CPUs by numad.  This option is
dafe70
+effective only when starting numad.  You cannot change reserved CPUs
dafe70
+dynamically while numad is already running.
dafe70
+.TP
dafe70
 \fB\-S\fR <\fI0|1\fP>
dafe70
 This option controls whether numad scans all system processes or only the
dafe70
 processes on the explicit inclusion PID list.  The default is to scan all
dafe70
@@ -113,18 +161,30 @@ exclusion list).  Starting numad as
dafe70
 .br
dafe70
 will limit scanning, and thus also automatic NUMA management, to only those
dafe70
 three explicitly specified processes.
dafe70
-.TP 
dafe70
+.TP
dafe70
+\fB\-t\fR  <\fIlogical_CPU_percent\fP>
dafe70
+Specify the resource value of logical CPUs.  Hardware threads typically share
dafe70
+most core resources, and so logical CPUs add only a fraction of CPU power for
dafe70
+many workloads.  By default numad considers logical CPUs to be only 20 percent
dafe70
+of a dedicated hardware core.
dafe70
+.TP
dafe70
 \fB\-u\fR  <\fItarget_utilization\fP>
dafe70
 Set the desired maximum consumption percentage of a node. Default is 85%.
dafe70
 Decrease the target value to maintain more available resource margin on each
dafe70
 node.  Increase the target value to more exhaustively consume node resources.
dafe70
-.TP 
dafe70
+If you have sized your workloads to precisely fit inside a NUMA node,
dafe70
+specifying (\fI\-u 100\fP) might improve system performance by telling numad to
dafe70
+go ahead and consume all the resources in each node.  It is possible to specify
dafe70
+values up to 130 percent to oversubscribe CPUs in the nodes, but memory
dafe70
+utilization is always capped at 100%.  Use oversubscription values very
dafe70
+carefully.
dafe70
+.TP
dafe70
 \fB\-v\fR
dafe70
 Verbose output in log, sets the log level to LOG_INFO.  Same effect as \fI\-l 6\fP.
dafe70
-.TP 
dafe70
+.TP
dafe70
 \fB\-V\fR
dafe70
 Display version information and exit.
dafe70
-.TP 
dafe70
+.TP
dafe70
 \fB\-w\fR <\fINCPUS[:MB]\fP>
dafe70
 Queries numad for the best NUMA nodes to bind an entity that needs
dafe70
 <\fINCPUS\fP>.  The amount of memory (in MBs) is optional, but should normally
dafe70
@@ -145,32 +205,37 @@ Add PID to explicit exclusion list of pr
dafe70
 Multiple \fI\-x PID\fP options can be specified at daemon start, but after
dafe70
 daemon start, only one PID can be added to the exclusion list per subsequent
dafe70
 numad invocation.  Use with \-S to precisely control the scope of processes
dafe70
-numad can manage.  
dafe70
+numad can manage.
dafe70
 .SH "FILES"
dafe70
-.LP 
dafe70
-\fI/usr/bin/numad\fP 
dafe70
-.br 
dafe70
-\fI/var/log/numad.log\fP 
dafe70
-.br 
dafe70
-\fI/var/run/numad.pid\fP 
dafe70
+.LP
dafe70
+\fI/usr/bin/numad\fP
dafe70
+.br
dafe70
+\fI/etc/numad.conf\fP
dafe70
+.br
dafe70
+\fI/var/log/numad.log\fP
dafe70
+.br
dafe70
+\fI/var/run/numad.pid\fP
dafe70
 .SH "ENVIRONMENT VARIABLES"
dafe70
-.LP 
dafe70
-.TP 
dafe70
+.LP
dafe70
+.TP
dafe70
 None.
dafe70
 .SH "EXAMPLES"
dafe70
-.LP 
dafe70
-Numad is normally run as a system daemon and should be managed by the 
dafe70
+.LP
dafe70
+Numad can be run as a system daemon and can be managed by the
dafe70
 standard init mechanisms of the host.
dafe70
-.LP  
dafe70
+.LP
dafe70
 If interactive (manual) control is desired, you can start the daemon manually by typing:
dafe70
-.LP 
dafe70
+.LP
dafe70
 /usr/bin/numad
dafe70
 .LP
dafe70
-Subsequent numad invocations while the daemon is running can be used to dynamically change run-time options.
dafe70
+Subsequent numad invocations while the daemon is running can be used to dynamically change most run-time options.
dafe70
+.LP
dafe70
+You can terminate a running numad daemon by typing:
dafe70
+.LP
dafe70
+/usr/bin/numad -i0
dafe70
 .SH "AUTHORS"
dafe70
-.LP 
dafe70
+.LP
dafe70
 Bill Gray <bgray@redhat.com>
dafe70
 .SH "SEE ALSO"
dafe70
-.LP 
dafe70
+.LP
dafe70
 numactl(8)
dafe70
-
dafe70
diff -rup numad-0.5git/numad.c numad-0.5git-new/numad.c
dafe70
--- numad-0.5git/numad.c	2012-12-03 15:40:40.000000000 +0100
dafe70
+++ numad-0.5git-new/numad.c	2016-08-30 08:45:19.000000000 +0200
dafe70
@@ -19,7 +19,7 @@ Inc., 59 Temple Place, Suite 330, Boston
dafe70
 */ 
dafe70
 
dafe70
 
dafe70
-// Compile with: gcc -O -std=gnu99 -Wall -pthread -o numad numad.c -lrt
dafe70
+// Compile with: gcc -std=gnu99 -g -Wall -pthread -o numad numad.c -lrt -lm
dafe70
 
dafe70
 
dafe70
 #define _GNU_SOURCE
dafe70
@@ -40,6 +40,10 @@ Inc., 59 Temple Place, Suite 330, Boston
dafe70
 #include <stdio.h>
dafe70
 #include <stdlib.h>
dafe70
 #include <string.h>
dafe70
+#include <time.h>
dafe70
+#include <unistd.h>
dafe70
+#include <values.h>
dafe70
+
dafe70
 #include <sys/ipc.h>
dafe70
 #include <sys/mman.h>
dafe70
 #include <sys/msg.h>
dafe70
@@ -49,26 +53,16 @@ Inc., 59 Temple Place, Suite 330, Boston
dafe70
 #include <sys/syslog.h>
dafe70
 #include <sys/time.h>
dafe70
 #include <sys/types.h>
dafe70
-#include <time.h>
dafe70
-#include <unistd.h>
dafe70
-#include <values.h>
dafe70
+
dafe70
+#include <asm/unistd.h>
dafe70
 
dafe70
 
dafe70
-#define VERSION_STRING "20121130"
dafe70
+#define VERSION_STRING "20150602"
dafe70
 
dafe70
 
dafe70
 #define VAR_RUN_FILE "/var/run/numad.pid"
dafe70
 #define VAR_LOG_FILE "/var/log/numad.log"
dafe70
 
dafe70
-char *cpuset_dir = NULL;
dafe70
-char *cpuset_dir_list[] =  {
dafe70
-    NULL,
dafe70
-    "/sys/fs/cgroup/cpuset",
dafe70
-    "/cgroup/cpuset",
dafe70
-    NULL
dafe70
-};
dafe70
-
dafe70
-
dafe70
 #define KILOBYTE (1024)
dafe70
 #define MEGABYTE (1024 * 1024)
dafe70
 
dafe70
@@ -86,14 +80,11 @@ char *cpuset_dir_list[] =  {
dafe70
 #define MAX_INTERVAL 15
dafe70
 #define CPU_THRESHOLD     50
dafe70
 #define MEMORY_THRESHOLD 300
dafe70
-#define TARGET_UTILIZATION_PERCENT 85
dafe70
-#define IMPROVEMENT_THRESHOLD_PERCENT 5
dafe70
-
dafe70
+#define DEFAULT_HTT_PERCENT 20
dafe70
+#define DEFAULT_THP_SCAN_SLEEP_MS 1000
dafe70
+#define DEFAULT_UTILIZATION_PERCENT 85
dafe70
+#define DEFAULT_MEMLOCALITY_PERCENT 90
dafe70
 
dafe70
-#define ELIM_NEW_LINE(s) \
dafe70
-    if (s[strlen(s) - 1] == '\n') { \
dafe70
-        s[strlen(s) - 1] = '\0'; \
dafe70
-    }
dafe70
 
dafe70
 #define CONVERT_DIGITS_TO_NUM(p, n) \
dafe70
     n = *p++ - '0'; \
dafe70
@@ -105,19 +96,36 @@ char *cpuset_dir_list[] =  {
dafe70
 
dafe70
 int num_cpus = 0;
dafe70
 int num_nodes = 0;
dafe70
-int page_size_in_bytes = 0;
dafe70
-int huge_page_size_in_bytes = 0;
dafe70
+int threads_per_core = 0;
dafe70
+uint64_t page_size_in_bytes = 0;
dafe70
+uint64_t huge_page_size_in_bytes = 0;
dafe70
 
dafe70
 int min_interval = MIN_INTERVAL;
dafe70
 int max_interval = MAX_INTERVAL;
dafe70
-int target_utilization  = TARGET_UTILIZATION_PERCENT;
dafe70
+int htt_percent = DEFAULT_HTT_PERCENT;
dafe70
+int thp_scan_sleep_ms = DEFAULT_THP_SCAN_SLEEP_MS;
dafe70
+int target_utilization  = DEFAULT_UTILIZATION_PERCENT;
dafe70
+int target_memlocality  = DEFAULT_MEMLOCALITY_PERCENT;
dafe70
 int scan_all_processes = 1;
dafe70
 int keep_interleaved_memory = 0;
dafe70
+int use_inactive_file_cache = 1;
dafe70
 
dafe70
 pthread_mutex_t pid_list_mutex;
dafe70
 pthread_mutex_t node_info_mutex;
dafe70
+long sum_CPUs_total = 0;
dafe70
 int requested_mbs = 0;
dafe70
 int requested_cpus = 0;
dafe70
+int got_sighup = 0;
dafe70
+int got_sigterm = 0;
dafe70
+int got_sigquit = 0;
dafe70
+
dafe70
+void sig_handler(int signum) { 
dafe70
+    switch (signum) {
dafe70
+        case SIGHUP:  got_sighup  = 1; break;
dafe70
+        case SIGTERM: got_sigterm = 1; break;
dafe70
+        case SIGQUIT: got_sigquit = 1; break;
dafe70
+    }
dafe70
+}
dafe70
 
dafe70
 
dafe70
 
dafe70
@@ -139,7 +147,7 @@ void numad_log(int level, const char *fm
dafe70
     }
dafe70
     char buf[BUF_SIZE];
dafe70
     time_t ts = time(NULL);
dafe70
-    sprintf(buf, ctime(&ts));
dafe70
+    strncpy(buf, ctime(&ts), sizeof(buf));
dafe70
     char *p = &buf[strlen(buf) - 1];
dafe70
     *p++ = ':';
dafe70
     *p++ = ' ';
dafe70
@@ -155,13 +163,16 @@ void open_log_file() {
dafe70
     log_fs = fopen(VAR_LOG_FILE, "a");
dafe70
     if (log_fs == NULL) {
dafe70
         log_fs = stderr;
dafe70
-        numad_log(LOG_ERR, "Cannot open numad log file -- using stderr\n");
dafe70
+        numad_log(LOG_ERR, "Cannot open numad log file (errno: %d) -- using stderr\n", errno);
dafe70
     }
dafe70
 }
dafe70
 
dafe70
+
dafe70
 void close_log_file() {
dafe70
     if (log_fs != NULL) {
dafe70
-        fclose(log_fs);
dafe70
+        if (log_fs != stderr) {
dafe70
+            fclose(log_fs);
dafe70
+        }
dafe70
         log_fs = NULL;
dafe70
     }
dafe70
 }
dafe70
@@ -235,23 +246,32 @@ void send_msg(long dst_pid, long cmd, lo
dafe70
 
dafe70
 
dafe70
 typedef struct id_list {
dafe70
-    // Use CPU_SET(3) <sched.h> cpuset bitmasks,
dafe70
+    // Use CPU_SET(3) <sched.h> bitmasks,
dafe70
     // but bundle size and pointer together
dafe70
     // and genericize for both CPU and Node IDs
dafe70
     cpu_set_t *set_p; 
dafe70
     size_t bytes;
dafe70
 } id_list_t, *id_list_p;
dafe70
 
dafe70
-#define INIT_ID_LIST(list_p) \
dafe70
+#define ID_LIST_SET_P(list_p) (list_p->set_p)
dafe70
+#define ID_LIST_BYTES(list_p) (list_p->bytes)
dafe70
+
dafe70
+#define INIT_ID_LIST(list_p, num_elements) \
dafe70
     list_p = malloc(sizeof(id_list_t)); \
dafe70
     if (list_p == NULL) { numad_log(LOG_CRIT, "INIT_ID_LIST malloc failed\n"); exit(EXIT_FAILURE); } \
dafe70
-    list_p->set_p = CPU_ALLOC(num_cpus); \
dafe70
+    list_p->set_p = CPU_ALLOC(num_elements); \
dafe70
     if (list_p->set_p == NULL) { numad_log(LOG_CRIT, "CPU_ALLOC failed\n"); exit(EXIT_FAILURE); } \
dafe70
-    list_p->bytes = CPU_ALLOC_SIZE(num_cpus);
dafe70
+    list_p->bytes = CPU_ALLOC_SIZE(num_elements);
dafe70
 
dafe70
-#define CLEAR_LIST(list_p) \
dafe70
+#define CLEAR_CPU_LIST(list_p) \
dafe70
     if (list_p == NULL) { \
dafe70
-        INIT_ID_LIST(list_p); \
dafe70
+        INIT_ID_LIST(list_p, num_cpus); \
dafe70
+    } \
dafe70
+    CPU_ZERO_S(list_p->bytes, list_p->set_p)
dafe70
+
dafe70
+#define CLEAR_NODE_LIST(list_p) \
dafe70
+    if (list_p == NULL) { \
dafe70
+        INIT_ID_LIST(list_p, num_nodes); \
dafe70
     } \
dafe70
     CPU_ZERO_S(list_p->bytes, list_p->set_p)
dafe70
 
dafe70
@@ -262,6 +282,9 @@ typedef struct id_list {
dafe70
         list_p = NULL; \
dafe70
     }
dafe70
 
dafe70
+#define COPY_LIST(orig_list_p, copy_list_p) \
dafe70
+    memcpy(copy_list_p->set_p, orig_list_p->set_p, orig_list_p->bytes)
dafe70
+
dafe70
 #define NUM_IDS_IN_LIST(list_p)     CPU_COUNT_S(list_p->bytes, list_p->set_p)
dafe70
 #define ADD_ID_TO_LIST(k, list_p)  CPU_SET_S(k, list_p->bytes, list_p->set_p)
dafe70
 #define CLR_ID_IN_LIST(k, list_p)  CPU_CLR_S(k, list_p->bytes, list_p->set_p)
dafe70
@@ -272,6 +295,25 @@ typedef struct id_list {
dafe70
 #define  OR_LISTS( or_list_p, list_1_p, list_2_p)  CPU_OR_S( or_list_p->bytes,  or_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
dafe70
 #define XOR_LISTS(xor_list_p, list_1_p, list_2_p) CPU_XOR_S(xor_list_p->bytes, xor_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
dafe70
 
dafe70
+int negate_cpu_list(id_list_p list_p) {
dafe70
+    if (list_p == NULL) {
dafe70
+        numad_log(LOG_CRIT, "Cannot negate a NULL list\n");
dafe70
+        exit(EXIT_FAILURE);
dafe70
+    }
dafe70
+    if (num_cpus < 1) {
dafe70
+        numad_log(LOG_CRIT, "No CPUs to negate in list!\n");
dafe70
+        exit(EXIT_FAILURE);
dafe70
+    }
dafe70
+    for (int ix = 0;  (ix < num_cpus);  ix++) {
dafe70
+        if (ID_IS_IN_LIST(ix, list_p)) {
dafe70
+            CLR_ID_IN_LIST(ix, list_p);
dafe70
+        } else {
dafe70
+            ADD_ID_TO_LIST(ix, list_p);
dafe70
+        }
dafe70
+    }
dafe70
+    return NUM_IDS_IN_LIST(list_p);
dafe70
+}
dafe70
+
dafe70
 int add_ids_to_list_from_str(id_list_p list_p, char *s) {
dafe70
     if (list_p == NULL) {
dafe70
         numad_log(LOG_CRIT, "Cannot add to NULL list\n");
dafe70
@@ -352,9 +394,21 @@ typedef struct node_data {
dafe70
     uint8_t *distance;
dafe70
     id_list_p cpu_list_p; 
dafe70
 } node_data_t, *node_data_p;
dafe70
-
dafe70
 node_data_p node = NULL;
dafe70
 
dafe70
+int min_node_CPUs_free_ix = -1;
dafe70
+int min_node_MBs_free_ix = -1;
dafe70
+long min_node_CPUs_free = MAXINT;
dafe70
+long min_node_MBs_free = MAXINT;
dafe70
+long max_node_CPUs_free = 0;
dafe70
+long max_node_MBs_free = 0;
dafe70
+long avg_node_CPUs_free = 0;
dafe70
+long avg_node_MBs_free = 0;
dafe70
+double stddev_node_CPUs_free = 0.0;
dafe70
+double stddev_node_MBs_free = 0.0;
dafe70
+
dafe70
+
dafe70
+
dafe70
 // RING_BUF_SIZE must be a power of two
dafe70
 #define RING_BUF_SIZE 8
dafe70
 
dafe70
@@ -366,14 +420,15 @@ typedef struct process_data {
dafe70
     uint64_t data_time_stamp; // hundredths of seconds
dafe70
     uint64_t bind_time_stamp;
dafe70
     uint64_t num_threads;
dafe70
+    uint64_t MBs_size;
dafe70
     uint64_t MBs_used;
dafe70
     uint64_t cpu_util;
dafe70
     uint64_t CPUs_used;  // scaled * ONE_HUNDRED
dafe70
     uint64_t CPUs_used_ring_buf[RING_BUF_SIZE];
dafe70
     int ring_buf_ix;
dafe70
-    int dup_bind_count;
dafe70
     char *comm;
dafe70
-    char *cpuset_name;
dafe70
+    id_list_p node_list_p;
dafe70
+    uint64_t *process_MBs;
dafe70
 } process_data_t, *process_data_p;
dafe70
 
dafe70
 
dafe70
@@ -433,7 +488,8 @@ int process_hash_insert(int pid) {
dafe70
 }
dafe70
 
dafe70
 int process_hash_update(process_data_p newp) {
dafe70
-    // This updates hash table stats for processes we are monitoring
dafe70
+    // This updates hash table stats for processes we are monitoring. Only the
dafe70
+    // scalar resource consumption stats need to be updated here.
dafe70
     int new_hash_table_entry = 1;
dafe70
     int ix = process_hash_insert(newp->pid);
dafe70
     if (ix >= 0) {
dafe70
@@ -460,6 +516,7 @@ int process_hash_update(process_data_p n
dafe70
             }
dafe70
             p->comm = strdup(newp->comm);
dafe70
         }
dafe70
+        p->MBs_size = newp->MBs_size;
dafe70
         p->MBs_used = newp->MBs_used;
dafe70
         p->cpu_util = newp->cpu_util;
dafe70
         p->num_threads = newp->num_threads;
dafe70
@@ -468,6 +525,11 @@ int process_hash_update(process_data_p n
dafe70
     return new_hash_table_entry;
dafe70
 }
dafe70
 
dafe70
+void process_hash_clear_all_bind_time_stamps() {
dafe70
+    for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
dafe70
+        process_hash_table[ix].bind_time_stamp = 0;
dafe70
+    }
dafe70
+}
dafe70
 
dafe70
 int process_hash_rehash(int old_ix) {
dafe70
     // Given the index of a table entry that would otherwise be orphaned by
dafe70
@@ -489,7 +551,8 @@ int process_hash_remove(int pid) {
dafe70
         // remove the target
dafe70
         process_data_p dp = &process_hash_table[ix];
dafe70
         if (dp->comm) { free(dp->comm); }
dafe70
-        if (dp->cpuset_name) { free(dp->cpuset_name); }
dafe70
+        if (dp->process_MBs) { free(dp->process_MBs); }
dafe70
+        FREE_LIST(dp->node_list_p);
dafe70
         memset(dp, 0, sizeof(process_data_t));
dafe70
         // bubble up the collision chain and rehash if neeeded
dafe70
         for (;;) {
dafe70
@@ -543,15 +606,15 @@ void process_hash_table_dump() {
dafe70
         process_data_p p = &process_hash_table[ix];
dafe70
         if (p->pid) {
dafe70
             numad_log(LOG_DEBUG,
dafe70
-                "ix: %d  PID: %d %s  Thds: %d  CPU %ld  MBs: %ld Data TS: %ld  Bind TS: %ld\n",
dafe70
+                "ix: %d  PID: %d %s  Thds: %d  CPU %ld  MBs: %ld/%ld Data TS: %ld  Bind TS: %ld\n",
dafe70
                 ix, p->pid, ((p->comm != NULL) ? p->comm : "(Null)"), p->num_threads,
dafe70
-                p->CPUs_used, p->MBs_used, p->data_time_stamp, p->bind_time_stamp);
dafe70
+                p->CPUs_used, p->MBs_used, p->MBs_size, p->data_time_stamp, p->bind_time_stamp);
dafe70
+            // FIXME: make this dump every field, but this is not even currently used
dafe70
         }
dafe70
     }
dafe70
 }
dafe70
 
dafe70
 void process_hash_table_cleanup(uint64_t update_time) {
dafe70
-    int cpusets_removed = 0;
dafe70
     int num_hash_entries_used = 0;
dafe70
     for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
dafe70
         process_data_p p = &process_hash_table[ix];
dafe70
@@ -562,34 +625,14 @@ void process_hash_table_cleanup(uint64_t
dafe70
                 p->data_time_stamp = 0;
dafe70
                 p->CPUs_used = 0;
dafe70
                 // Check for dead pids and remove them...
dafe70
-                char fname[FNAME_SIZE];
dafe70
-                snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
dafe70
-                if (access(fname, F_OK) < 0) {
dafe70
-                    // Seems dead.  Forget this pid -- after first checking 
dafe70
-                    // and removing obsolete numad.PID cpuset directories.  
dafe70
-                    snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid);
dafe70
-                    if (access(fname, F_OK) == 0) {
dafe70
-                        numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname);
dafe70
-                        int rc = rmdir(fname);
dafe70
-                        if (rc >= 0) {
dafe70
-                            cpusets_removed += 1;
dafe70
-                        } else {
dafe70
-                            numad_log(LOG_ERR, "bad cpuset rmdir\n");
dafe70
-                            // exit(EXIT_FAILURE);
dafe70
-                        }
dafe70
-                    }
dafe70
+                if ((kill(p->pid, 0) == -1) && (errno == ESRCH)) {
dafe70
+                    // Seems dead.  Forget this pid
dafe70
                     process_hash_remove(p->pid);
dafe70
                     num_hash_entries_used -= 1;
dafe70
                 }
dafe70
             }
dafe70
         }
dafe70
     }
dafe70
-    if (cpusets_removed > 0) {
dafe70
-        // Expire all the duplicate bind counts so things will be re-evaluated sooner.
dafe70
-        for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
dafe70
-            process_hash_table[ix].dup_bind_count = 0;
dafe70
-        }
dafe70
-    }
dafe70
     // Keep hash table approximately half empty
dafe70
     if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) {
dafe70
         process_hash_table_expand();
dafe70
@@ -610,9 +653,7 @@ pid_list_p insert_pid_into_pid_list(pid_
dafe70
     if (process_hash_table != NULL) {
dafe70
         int hash_ix = process_hash_lookup(pid);
dafe70
         if ((hash_ix >= 0) && (list_ptr == include_pid_list)) {
dafe70
-            // Clear dup_bind_count and interleaved flag,
dafe70
-            // in case user wants it to be re-evaluated soon
dafe70
-            process_hash_table[hash_ix].dup_bind_count = 0;
dafe70
+            // Clear interleaved flag, in case user wants it to be re-evaluated
dafe70
             process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED;
dafe70
         }
dafe70
     }
dafe70
@@ -678,18 +719,23 @@ void print_version_and_exit(char *prog_n
dafe70
 
dafe70
 void print_usage_and_exit(char *prog_name) {
dafe70
     fprintf(stderr, "Usage: %s <options> ...\n", prog_name);
dafe70
+    fprintf(stderr, "-C 1  to count inactive file cache as available memory (default 1)\n");
dafe70
+    fprintf(stderr, "-C 0  to count inactive file cache memory as unavailable (default 1)\n");
dafe70
     fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n");
dafe70
-    fprintf(stderr, "-D <CGROUP_MOUNT_POINT> to specify cgroup mount point\n");
dafe70
     fprintf(stderr, "-h to print this usage info\n");
dafe70
+    fprintf(stderr, "-H <N> to set THP scan_sleep_ms (default %d)\n", DEFAULT_THP_SCAN_SLEEP_MS);
dafe70
     fprintf(stderr, "-i [<MIN>:]<MAX> to specify interval seconds\n");
dafe70
-    fprintf(stderr, "-K 1  to keep interleaved memory spread across nodes\n");
dafe70
-    fprintf(stderr, "-K 0  to merge interleaved memory to local NUMA nodes\n");
dafe70
-    fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7)\n");
dafe70
+    fprintf(stderr, "-K 1  to keep interleaved memory spread across nodes (default 0)\n");
dafe70
+    fprintf(stderr, "-K 0  to merge interleaved memory to local NUMA nodes (default 0)\n");
dafe70
+    fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7 -- default 5)\n");
dafe70
+    fprintf(stderr, "-m <N> to specify memory locality target percent (default %d)\n", DEFAULT_MEMLOCALITY_PERCENT);
dafe70
     fprintf(stderr, "-p <PID> to add PID to inclusion pid list\n");
dafe70
     fprintf(stderr, "-r <PID> to remove PID from explicit pid lists\n");
dafe70
-    fprintf(stderr, "-S 1  to scan all processes\n");
dafe70
-    fprintf(stderr, "-S 0  to scan only explicit PID list processes\n");
dafe70
-    fprintf(stderr, "-u <N> to specify target utilization percent (default 85)\n");
dafe70
+    fprintf(stderr, "-R <CPU_LIST> to reserve some CPUs for non-numad use\n");
dafe70
+    fprintf(stderr, "-S 1  to scan all processes (default 1)\n");
dafe70
+    fprintf(stderr, "-S 0  to scan only explicit PID list processes (default 1)\n");
dafe70
+    fprintf(stderr, "-t <N> to specify thread / logical CPU valuation percent (default %d)\n", DEFAULT_HTT_PERCENT);
dafe70
+    fprintf(stderr, "-u <N> to specify utilization target percent (default %d)\n", DEFAULT_UTILIZATION_PERCENT);
dafe70
     fprintf(stderr, "-v for verbose  (same effect as '-l 6')\n");
dafe70
     fprintf(stderr, "-V to show version info\n");
dafe70
     fprintf(stderr, "-w <CPUs>[:<MBs>] for NUMA node suggestions\n");
dafe70
@@ -698,62 +744,35 @@ void print_usage_and_exit(char *prog_nam
dafe70
 }
dafe70
 
dafe70
 
dafe70
-void check_prereqs(char *prog_name) {
dafe70
-    // Verify cpusets are available on this system.
dafe70
-    char **dir = &cpuset_dir_list[0];
dafe70
-    if (*dir == NULL) { dir++; }
dafe70
-    while (*dir != NULL) {
dafe70
-        cpuset_dir = *dir;
dafe70
-        char fname[FNAME_SIZE];
dafe70
-        snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_dir);
dafe70
-        if (access(fname, F_OK) == 0) {
dafe70
-            break;
dafe70
-        }
dafe70
-        dir++;
dafe70
-    }
dafe70
-    if (*dir == NULL) {
dafe70
-        fprintf(stderr, "\n");
dafe70
-        fprintf(stderr, "Are CPUSETs enabled on this system?\n");
dafe70
-        fprintf(stderr, "They are required for %s to function.\n\n", prog_name);
dafe70
-        fprintf(stderr, "Check manpage CPUSET(7). You might need to do something like:\n");
dafe70
-        fprintf(stderr, "    # mkdir <DIRECTORY_MOUNT_POINT>\n");
dafe70
-        fprintf(stderr, "    # mount cgroup -t cgroup -o cpuset <DIRECTORY_MOUNT_POINT>\n");
dafe70
-        fprintf(stderr, "    where <DIRECTORY_MOUNT_POINT> is something like:\n");
dafe70
-        dir = &cpuset_dir_list[0];
dafe70
-        if (*dir == NULL) { dir++; }
dafe70
-        while (*dir != NULL) {
dafe70
-            fprintf(stderr, "      - %s\n", *dir);
dafe70
-            dir++;
dafe70
-        }
dafe70
-        fprintf(stderr, "and then try again...\n");
dafe70
-        fprintf(stderr, "Or, use '-D <DIRECTORY_MOUNT_POINT>' to specify the correct mount point\n");
dafe70
-        fprintf(stderr, "\n");
dafe70
-        exit(EXIT_FAILURE);
dafe70
+void set_thp_scan_sleep_ms(int new_ms) {
dafe70
+    if (new_ms < 1) {
dafe70
+        // 0 means do not change the system default
dafe70
+        return;
dafe70
     }
dafe70
-    // Check on THP scan sleep time.
dafe70
-    char *thp_scan_fname = "/sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs";
dafe70
-    int fd = open(thp_scan_fname, O_RDONLY, 0);
dafe70
+    char *thp_scan_fname = "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs";
dafe70
+    int fd = open(thp_scan_fname, O_RDWR, 0);
dafe70
     if (fd >= 0) {
dafe70
-        int ms;
dafe70
         char buf[BUF_SIZE];
dafe70
         int bytes = read(fd, buf, BUF_SIZE);
dafe70
-        close(fd);
dafe70
         if (bytes > 0) {
dafe70
+            buf[bytes] = '\0';
dafe70
+            int cur_ms;
dafe70
             char *p = buf;
dafe70
-            CONVERT_DIGITS_TO_NUM(p, ms);
dafe70
-            if (ms > 150) {
dafe70
-                fprintf(stderr, "\n");
dafe70
-                numad_log(LOG_NOTICE, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
dafe70
-                fprintf(stderr,       "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
dafe70
-                fprintf(stderr, "Consider increasing the frequency of THP scanning,\n");
dafe70
-                fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname);
dafe70
-                fprintf(stderr, "to more aggressively (re)construct THPs.  For example:\n");
dafe70
-                fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n");
dafe70
-                fprintf(stderr, "\n");
dafe70
+            CONVERT_DIGITS_TO_NUM(p, cur_ms);
dafe70
+            if (cur_ms != new_ms) {
dafe70
+                lseek(fd, 0, SEEK_SET);
dafe70
+                numad_log(LOG_NOTICE, "Changing THP scan time in %s from %d to %d ms.\n", thp_scan_fname, cur_ms, new_ms);
dafe70
+                sprintf(buf, "%d\n", new_ms);
dafe70
+                write(fd, buf, strlen(buf));
dafe70
             }
dafe70
         }
dafe70
+        close(fd);
dafe70
     }
dafe70
-    // FIXME: ?? check for enabled ksmd, and recommend disabling ksm?
dafe70
+}
dafe70
+
dafe70
+void check_prereqs(char *prog_name) {
dafe70
+    // Adjust kernel tunable to scan for THP more frequently...
dafe70
+    set_thp_scan_sleep_ms(thp_scan_sleep_ms);
dafe70
 }
dafe70
 
dafe70
 
dafe70
@@ -785,7 +804,6 @@ int get_daemon_pid() {
dafe70
     return pid; 
dafe70
 }
dafe70
 
dafe70
-
dafe70
 int register_numad_pid() {
dafe70
     int pid;
dafe70
     char buf[BUF_SIZE];
dafe70
@@ -831,6 +849,43 @@ fail_numad_run_file:
dafe70
 }
dafe70
 
dafe70
 
dafe70
+int count_set_bits_in_hex_list_file(char *fname) {
dafe70
+    int sum = 0;
dafe70
+    int fd = open(fname, O_RDONLY, 0);
dafe70
+    if (fd >= 0) {
dafe70
+        char buf[BUF_SIZE];
dafe70
+        int bytes = read(fd, buf, BUF_SIZE);
dafe70
+        close(fd);
dafe70
+        for (int ix = 0;  (ix < bytes);  ix++) {
dafe70
+            char c = tolower(buf[ix]);
dafe70
+            switch (c) {
dafe70
+                case '0'  : sum += 0; break;
dafe70
+                case '1'  : sum += 1; break;
dafe70
+                case '2'  : sum += 1; break;
dafe70
+                case '3'  : sum += 2; break;
dafe70
+                case '4'  : sum += 1; break;
dafe70
+                case '5'  : sum += 2; break;
dafe70
+                case '6'  : sum += 2; break;
dafe70
+                case '7'  : sum += 3; break;
dafe70
+                case '8'  : sum += 1; break;
dafe70
+                case '9'  : sum += 2; break;
dafe70
+                case 'a'  : sum += 2; break;
dafe70
+                case 'b'  : sum += 3; break;
dafe70
+                case 'c'  : sum += 2; break;
dafe70
+                case 'd'  : sum += 3; break;
dafe70
+                case 'e'  : sum += 3; break;
dafe70
+                case 'f'  : sum += 4; break;
dafe70
+                case ' '  : sum += 0; break;
dafe70
+                case ','  : sum += 0; break;
dafe70
+                case '\n' : sum += 0; break;
dafe70
+                default : numad_log(LOG_CRIT, "Unexpected character in list\n"); exit(EXIT_FAILURE);
dafe70
+            }
dafe70
+        }
dafe70
+    }
dafe70
+    return sum;
dafe70
+}
dafe70
+
dafe70
+
dafe70
 int get_num_cpus() {
dafe70
     int n1 = sysconf(_SC_NPROCESSORS_CONF);
dafe70
     int n2 = sysconf(_SC_NPROCESSORS_ONLN);
dafe70
@@ -848,7 +903,7 @@ int get_num_cpus() {
dafe70
 int get_num_kvm_vcpu_threads(int pid) {
dafe70
     // Try to return the number of vCPU threads for this VM guest,
dafe70
     // excluding the IO threads.  All failures return MAXINT.
dafe70
-    // FIXME: figure out some better way to do this...
dafe70
+    // FIXME: someday figure out some better way to do this...
dafe70
     char fname[FNAME_SIZE];
dafe70
     snprintf(fname, FNAME_SIZE, "/proc/%d/cmdline", pid);
dafe70
     int fd = open(fname, O_RDONLY, 0);
dafe70
@@ -876,8 +931,8 @@ int get_num_kvm_vcpu_threads(int pid) {
dafe70
 }
dafe70
 
dafe70
 
dafe70
-int get_huge_page_size_in_bytes() {
dafe70
-    int huge_page_size = 0;;
dafe70
+uint64_t get_huge_page_size_in_bytes() {
dafe70
+    uint64_t huge_page_size = 0;;
dafe70
     FILE *fs = fopen("/proc/meminfo", "r");
dafe70
     if (!fs) {
dafe70
         numad_log(LOG_CRIT, "Can't open /proc/meminfo\n");
dafe70
@@ -890,7 +945,7 @@ int get_huge_page_size_in_bytes() {
dafe70
             while ((!isdigit(*p)) && (p < buf + BUF_SIZE)) {
dafe70
                 p++;
dafe70
             }
dafe70
-            huge_page_size = atoi(p);
dafe70
+            huge_page_size = atol(p);
dafe70
             break;
dafe70
         }
dafe70
     }
dafe70
@@ -916,143 +971,134 @@ static int name_starts_with_digit(const
dafe70
 }
dafe70
 
dafe70
 
dafe70
-int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_list_p, id_list_p cpu_list_p) {
dafe70
-    // Check basic parameter validity.  
dafe70
-    if (pid <= 0) {
dafe70
+
dafe70
+#define BITS_IN_LONG (CHAR_BIT * sizeof(unsigned long))
dafe70
+#define   SET_BIT(i,a)   (a)[(i) / BITS_IN_LONG] |=  (1u << ((i) % BITS_IN_LONG))
dafe70
+#define  TEST_BIT(i,a) (((a)[(i) / BITS_IN_LONG] &   (1u << ((i) % BITS_IN_LONG))) != 0)
dafe70
+#define CLEAR_BIT(i,a)   (a)[(i) / BITS_IN_LONG] &= ~(1u << ((i) % BITS_IN_LONG))
dafe70
+
dafe70
+int bind_process_and_migrate_memory(process_data_p p) {
dafe70
+    uint64_t t0 = get_time_stamp();
dafe70
+    // Parameter p is a pointer to an element in the hash table
dafe70
+    if ((!p) || (p->pid < 1)) {
dafe70
         numad_log(LOG_CRIT, "Bad PID to bind\n");
dafe70
         exit(EXIT_FAILURE);
dafe70
     }
dafe70
-    if ((cpuset_name == NULL) || (strlen(cpuset_name) == 0)) {
dafe70
-        numad_log(LOG_CRIT, "Bad cpuset name to bind\n");
dafe70
-        exit(EXIT_FAILURE);
dafe70
-    }
dafe70
-    int nodes;
dafe70
-    if ((node_list_p == NULL) || ((nodes = NUM_IDS_IN_LIST(node_list_p)) == 0)) {
dafe70
-        numad_log(LOG_CRIT, "Cannot bind to unspecified node\n");
dafe70
+    if (!p->node_list_p) {
dafe70
+        numad_log(LOG_CRIT, "Cannot bind to unspecified node(s)\n");
dafe70
         exit(EXIT_FAILURE);
dafe70
     }
dafe70
-    // Cpu_list_p is optional and may be NULL...
dafe70
-    // Generate CPU id list from the specified node list if necessary
dafe70
-    if (cpu_list_p == NULL) {
dafe70
-        static id_list_p tmp_cpu_list_p;
dafe70
-        CLEAR_LIST(tmp_cpu_list_p);
dafe70
-        int node_id = 0;
dafe70
-        while (nodes) {
dafe70
-            if (ID_IS_IN_LIST(node_id, node_list_p)) {
dafe70
-                OR_LISTS(tmp_cpu_list_p, tmp_cpu_list_p, node[node_id].cpu_list_p);
dafe70
-                nodes -= 1;
dafe70
-            }
dafe70
-            node_id += 1;
dafe70
-        }
dafe70
-        cpu_list_p = tmp_cpu_list_p;
dafe70
-    }
dafe70
-    // Make the cpuset directory if necessary
dafe70
-    char cpuset_name_buf[FNAME_SIZE];
dafe70
-    snprintf(cpuset_name_buf, FNAME_SIZE, "%s%s", cpuset_dir, cpuset_name);
dafe70
-    char *p = &cpuset_name_buf[strlen(cpuset_dir)];
dafe70
-    if (!strcmp(p, "/")) {
dafe70
-        // Make a cpuset directory for this process
dafe70
-        snprintf(cpuset_name_buf, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid);
dafe70
-        numad_log(LOG_NOTICE, "Making new cpuset: %s\n", cpuset_name_buf);
dafe70
-        int rc = mkdir(cpuset_name_buf, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
dafe70
-        if (rc == -1) {
dafe70
-            numad_log(LOG_CRIT, "Bad cpuset mkdir -- errno: %d\n", errno);
dafe70
-            return 0;
dafe70
+    // Generate CPU list derived from target node list.
dafe70
+    static id_list_p cpu_bind_list_p;
dafe70
+    CLEAR_CPU_LIST(cpu_bind_list_p);
dafe70
+    int nodes = NUM_IDS_IN_LIST(p->node_list_p);
dafe70
+    int node_id = 0;
dafe70
+    while (nodes) {
dafe70
+        if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
dafe70
+            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p);
dafe70
+            nodes -= 1;
dafe70
         }
dafe70
+        node_id += 1;
dafe70
     }
dafe70
-    cpuset_name = cpuset_name_buf;
dafe70
-    // Now that we have a cpuset for pid and a populated cpulist,
dafe70
-    // start the actual binding and migration.
dafe70
-    uint64_t t0 = get_time_stamp();
dafe70
-
dafe70
-    // Write "1" out to cpuset.memory_migrate file
dafe70
     char fname[FNAME_SIZE];
dafe70
-    snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name);
dafe70
-    int fd = open(fname, O_WRONLY | O_TRUNC, 0);
dafe70
-    if (fd == -1) {
dafe70
-        numad_log(LOG_CRIT, "Could not open cpuset.memory_migrate -- errno: %d\n", errno);
dafe70
-        return 0;
dafe70
-    }
dafe70
-    write(fd, "1", 1);
dafe70
-    close(fd);
dafe70
-
dafe70
-    // Write node IDs out to cpuset.mems file
dafe70
-    char node_list_buf[BUF_SIZE];
dafe70
-    snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name);
dafe70
-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
dafe70
-    if (fd == -1) {
dafe70
-        numad_log(LOG_CRIT, "Could not open cpuset.mems -- errno: %d\n", errno);
dafe70
-        return 0;
dafe70
-    }
dafe70
-    int len = str_from_id_list(node_list_buf, BUF_SIZE, node_list_p);
dafe70
-    write(fd, node_list_buf, len);
dafe70
-    close(fd);
dafe70
-
dafe70
-    // Write CPU IDs out to cpuset.cpus file
dafe70
-    char cpu_list_buf[BUF_SIZE];
dafe70
-    snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name);
dafe70
-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
dafe70
-    if (fd == -1) {
dafe70
-        numad_log(LOG_CRIT, "Could not open cpuset.cpus -- errno: %d\n", errno);
dafe70
-        return 0;
dafe70
-    }
dafe70
-    len = str_from_id_list(cpu_list_buf, BUF_SIZE, cpu_list_p);
dafe70
-    write(fd, cpu_list_buf, len);
dafe70
-    close(fd);
dafe70
-
dafe70
-    // Copy pid tasks one at a time to tasks file
dafe70
-    snprintf(fname, FNAME_SIZE, "%s/tasks", cpuset_name);
dafe70
-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
dafe70
-    if (fd == -1) {
dafe70
-        numad_log(LOG_CRIT, "Could not open tasks -- errno: %d\n", errno);
dafe70
-        return 0;
dafe70
-    }
dafe70
-    snprintf(fname, FNAME_SIZE, "/proc/%d/task", pid);
dafe70
     struct dirent **namelist;
dafe70
-    int files = scandir(fname, &namelist, name_starts_with_digit, NULL);
dafe70
-    if (files < 0) {
dafe70
-        numad_log(LOG_WARNING, "Could not scandir task list\n");
dafe70
+    snprintf(fname, FNAME_SIZE, "/proc/%d/task", p->pid);
dafe70
+    int num_tasks = scandir(fname, &namelist, name_starts_with_digit, NULL);
dafe70
+    if (num_tasks <= 0) {
dafe70
+        numad_log(LOG_WARNING, "Could not scandir task list for PID: %d\n", p->pid);
dafe70
         return 0;  // Assume the process terminated
dafe70
     }
dafe70
-    for (int ix = 0;  (ix < files);  ix++) {
dafe70
-        // copy pid tasks, one at a time
dafe70
-        numad_log(LOG_NOTICE, "Including task: %s\n", namelist[ix]->d_name);
dafe70
-        write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name));
dafe70
-        free(namelist[ix]);
dafe70
+    // Set the affinity of each task in the process...
dafe70
+    for (int namelist_ix = 0;  (namelist_ix < num_tasks);  namelist_ix++) {
dafe70
+        int tid = atoi(namelist[namelist_ix]->d_name);
dafe70
+        int rc = sched_setaffinity(tid, ID_LIST_BYTES(cpu_bind_list_p), ID_LIST_SET_P(cpu_bind_list_p));
dafe70
+        if (rc < 0) {
dafe70
+            // Check errno
dafe70
+            if (errno == ESRCH) {
dafe70
+                numad_log(LOG_WARNING, "Tried to move PID %d, TID %d, but it apparently went away.\n", p->pid, tid);
dafe70
+            }
dafe70
+            numad_log(LOG_ERR, "Bad sched_setaffinity() on PID %d, TID %d -- errno: %d\n", p->pid, tid, errno);
dafe70
+        }
dafe70
+        free(namelist[namelist_ix]);
dafe70
     }
dafe70
     free(namelist);
dafe70
-    close(fd);
dafe70
-
dafe70
-    uint64_t t1 = get_time_stamp();
dafe70
+    // Now move the memory to the target nodes....
dafe70
+    static unsigned long *dest_mask;
dafe70
+    static unsigned long *from_mask;
dafe70
+    static int allocated_bytes_in_masks;
dafe70
+    // Lie about num_nodes being one bigger because of kernel bug...
dafe70
+    int num_bytes_in_masks = (1 + ((num_nodes + 1) / BITS_IN_LONG)) * sizeof(unsigned long);
dafe70
+    if (allocated_bytes_in_masks < num_bytes_in_masks) {
dafe70
+        allocated_bytes_in_masks = num_bytes_in_masks;
dafe70
+        dest_mask = realloc(dest_mask, num_bytes_in_masks);
dafe70
+        from_mask = realloc(from_mask, num_bytes_in_masks);
dafe70
+        if ((dest_mask == NULL) || (from_mask == NULL)) {
dafe70
+            numad_log(LOG_CRIT, "bit mask malloc failed\n");
dafe70
+            exit(EXIT_FAILURE);
dafe70
+        }
dafe70
+    }
dafe70
+    // In an effort to put semi-balanced memory in each target node, move the
dafe70
+    // contents from the source node with the max amount of memory to the
dafe70
+    // destination node with the least amount of memory.  Repeat until done.
dafe70
+    int prev_from_node_id = -1;
dafe70
+    for (;;) {
dafe70
+        int min_dest_node_id = -1;
dafe70
+        int max_from_node_id = -1;
dafe70
+        for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
dafe70
+            node_id = node[node_ix].node_id;
dafe70
+            if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
dafe70
+                if ((min_dest_node_id < 0) || (p->process_MBs[min_dest_node_id] >= p->process_MBs[node_id])) {
dafe70
+                    // The ">=" above is intentional, so we tend to move memory to higher numbered nodes
dafe70
+                    min_dest_node_id = node_id;
dafe70
+                }
dafe70
+            } else {
dafe70
+                if ((max_from_node_id < 0) || (p->process_MBs[max_from_node_id] < p->process_MBs[node_id])) {
dafe70
+                    max_from_node_id = node_id;
dafe70
+                }
dafe70
+            }
dafe70
+        }
dafe70
+        if ((p->process_MBs[max_from_node_id] == 0) || (max_from_node_id == prev_from_node_id)) {
dafe70
+            break;
dafe70
+        }
dafe70
+        memset(dest_mask, 0, num_bytes_in_masks);
dafe70
+        memset(from_mask, 0, num_bytes_in_masks);
dafe70
+        SET_BIT(max_from_node_id, from_mask);
dafe70
+        SET_BIT(min_dest_node_id, dest_mask);
dafe70
+        numad_log(LOG_DEBUG, "Moving memory from node: %d to node %d\n", max_from_node_id, min_dest_node_id);
dafe70
+        // Lie about num_nodes being one bigger because of kernel bug...
dafe70
+        int rc = syscall(__NR_migrate_pages, p->pid, num_nodes + 1, from_mask, dest_mask);
dafe70
+        if (rc > 2) {
dafe70
+            // rc == the number of pages that could not be moved.  
dafe70
+            // A couple pages not moving is probably not a problem, hence ignoring rc == 1 or 2.
dafe70
+            numad_log(LOG_WARNING, "Tried to move PID %d, but %d pages would not move.\n", p->pid, rc);
dafe70
+        } else if (rc < 0) {
dafe70
+            // Check errno
dafe70
+            if (errno == ESRCH) {
dafe70
+                numad_log(LOG_WARNING, "Tried to move PID %d, but it apparently went away.\n", p->pid);
dafe70
+                return 0;  // Assume the process terminated
dafe70
+            }
dafe70
+        }
dafe70
+        // Assume memory did move for current accounting purposes...
dafe70
+        p->process_MBs[min_dest_node_id] += p->process_MBs[max_from_node_id];
dafe70
+        p->process_MBs[max_from_node_id] = 0;
dafe70
+        prev_from_node_id = max_from_node_id;
dafe70
+    }
dafe70
     // Check pid still active
dafe70
-    snprintf(fname, FNAME_SIZE, "/proc/%d", pid);
dafe70
+    snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
dafe70
     if (access(fname, F_OK) < 0) {
dafe70
-        numad_log(LOG_WARNING, "Could not migrate pid\n");
dafe70
-        return 0;  // Assume the process terminated
dafe70
+        numad_log(LOG_WARNING, "Could not migrate pid %d.  Apparently it went away.\n", p->pid);
dafe70
+        return 0;
dafe70
+    } else {
dafe70
+        uint64_t t1 = get_time_stamp();
dafe70
+        p->bind_time_stamp = t1;
dafe70
+        char node_list_str[BUF_SIZE];
dafe70
+        str_from_id_list(node_list_str, BUF_SIZE, p->node_list_p);
dafe70
+        numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", p->pid, node_list_str, (t1-t0)/100, (t1-t0)%100);
dafe70
+        return 1;
dafe70
     }
dafe70
-    numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", pid, node_list_buf, (t1-t0)/100, (t1-t0)%100);
dafe70
-    return 1;
dafe70
 }
dafe70
 
dafe70
 
dafe70
-void show_nodes() {
dafe70
-    time_t ts = time(NULL);
dafe70
-    fprintf(log_fs, "%s", ctime(&ts));
dafe70
-    fprintf(log_fs, "Nodes: %d\n", num_nodes);
dafe70
-    for (int ix = 0;  (ix < num_nodes);  ix++) {
dafe70
-        fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld,  Distance: ", 
dafe70
-            ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
dafe70
-        for (int d = 0;  (d < num_nodes);  d++) {
dafe70
-            fprintf(log_fs, "%d ", node[ix].distance[d]);
dafe70
-        }
dafe70
-        char buf[BUF_SIZE];
dafe70
-        str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
dafe70
-        fprintf(log_fs, " CPUs: %s\n", buf);
dafe70
-    }
dafe70
-    fprintf(log_fs, "\n");
dafe70
-    fflush(log_fs);
dafe70
-}
dafe70
-
dafe70
 
dafe70
 typedef struct cpu_data {
dafe70
     uint64_t time_stamp;
dafe70
@@ -1062,10 +1108,9 @@ typedef struct cpu_data {
dafe70
 cpu_data_t cpu_data_buf[2];  // Two sets, to calc deltas
dafe70
 int cur_cpu_data_buf = 0;
dafe70
 
dafe70
-
dafe70
 void update_cpu_data() {
dafe70
     // Parse idle percents from CPU stats in /proc/stat cpu<N> lines
dafe70
-    static FILE *fs = NULL;
dafe70
+    static FILE *fs;
dafe70
     if (fs != NULL) {
dafe70
         rewind(fs);
dafe70
     } else {
dafe70
@@ -1107,14 +1152,14 @@ void update_cpu_data() {
dafe70
             while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; }  // skip nice
dafe70
             while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; }  // skip system
dafe70
             while (!isdigit(*p)) { p++; }
dafe70
-            uint64_t idle = *p++ - '0'; while (isdigit(*p)) { idle *= 10; idle += (*p++ - '0'); }
dafe70
+            uint64_t idle;
dafe70
+            CONVERT_DIGITS_TO_NUM(p, idle);
dafe70
             cpu_data_buf[new].idle[cpu_id] = idle;
dafe70
         }
dafe70
     }
dafe70
     cur_cpu_data_buf = new;
dafe70
 }
dafe70
 
dafe70
-
dafe70
 int node_and_digits(const struct dirent *dptr) {
dafe70
     char *p = (char *)(dptr->d_name);
dafe70
     if (*p++ != 'n') return 0;
dafe70
@@ -1129,10 +1174,31 @@ int node_and_digits(const struct dirent
dafe70
 }
dafe70
 
dafe70
 
dafe70
+uint64_t node_info_time_stamp = 0;
dafe70
 id_list_p all_cpus_list_p = NULL;
dafe70
 id_list_p all_nodes_list_p = NULL;
dafe70
-uint64_t node_info_time_stamp = 0;
dafe70
+id_list_p reserved_cpu_mask_list_p = NULL;
dafe70
+char *reserved_cpu_str = NULL;
dafe70
 
dafe70
+void show_nodes() {
dafe70
+    fprintf(log_fs, "\n");
dafe70
+    numad_log(LOG_INFO, "Nodes: %d\n", num_nodes);
dafe70
+    fprintf(log_fs, "Min CPUs free: %ld, Max CPUs: %ld, Avg CPUs: %ld, StdDev: %lg\n", 
dafe70
+        min_node_CPUs_free, max_node_CPUs_free, avg_node_CPUs_free, stddev_node_CPUs_free);
dafe70
+    fprintf(log_fs, "Min MBs free: %ld, Max MBs: %ld, Avg MBs: %ld, StdDev: %lg\n", 
dafe70
+        min_node_MBs_free, max_node_MBs_free, avg_node_MBs_free, stddev_node_MBs_free);
dafe70
+    for (int ix = 0;  (ix < num_nodes);  ix++) {
dafe70
+        fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld,  Distance: ", 
dafe70
+            ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
dafe70
+        for (int d = 0;  (d < num_nodes);  d++) {
dafe70
+            fprintf(log_fs, "%d ", node[ix].distance[d]);
dafe70
+        }
dafe70
+        char buf[BUF_SIZE];
dafe70
+        str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
dafe70
+        fprintf(log_fs, " CPUs: %s\n", buf);
dafe70
+    }
dafe70
+    fflush(log_fs);
dafe70
+}
dafe70
 
dafe70
 int update_nodes() {
dafe70
     char fname[FNAME_SIZE];
dafe70
@@ -1141,6 +1207,7 @@ int update_nodes() {
dafe70
     uint64_t time_stamp = get_time_stamp();
dafe70
 #define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED)
dafe70
     if ((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) {
dafe70
+        node_info_time_stamp = time_stamp;
dafe70
         // Count directory names of the form: /sys/devices/system/node/node<N>
dafe70
         struct dirent **namelist;
dafe70
         int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL);
dafe70
@@ -1167,8 +1234,15 @@ int update_nodes() {
dafe70
             }
dafe70
             num_nodes = num_files;
dafe70
         }
dafe70
-        CLEAR_LIST(all_cpus_list_p);
dafe70
-        CLEAR_LIST(all_nodes_list_p);
dafe70
+        sum_CPUs_total = 0;
dafe70
+        CLEAR_CPU_LIST(all_cpus_list_p);
dafe70
+        CLEAR_NODE_LIST(all_nodes_list_p);
dafe70
+        // Figure out how many threads per core there are (for later discounting of hyper-threads)
dafe70
+        threads_per_core = count_set_bits_in_hex_list_file("/sys/devices/system/cpu/cpu0/topology/thread_siblings");
dafe70
+        if (threads_per_core < 1) {
dafe70
+            numad_log(LOG_CRIT, "Could not count threads per core\n");
dafe70
+            exit(EXIT_FAILURE);
dafe70
+        }
dafe70
         // For each "node<N>" filename present, save <N> in node[ix].node_id
dafe70
         // Note that the node id might not necessarily match the node ix.
dafe70
         // Also populate the cpu lists and distance vectors for this node.
dafe70
@@ -1184,11 +1258,24 @@ int update_nodes() {
dafe70
             snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/cpulist", node_id);
dafe70
             int fd = open(fname, O_RDONLY, 0);
dafe70
             if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
dafe70
+                buf[BIG_BUF_SIZE - 1] = '\0';
dafe70
                 // get cpulist from the cpulist string
dafe70
-                CLEAR_LIST(node[node_ix].cpu_list_p);
dafe70
+                CLEAR_CPU_LIST(node[node_ix].cpu_list_p);
dafe70
                 int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf);
dafe70
+                if (reserved_cpu_str != NULL) {
dafe70
+                    AND_LISTS(node[node_ix].cpu_list_p, node[node_ix].cpu_list_p, reserved_cpu_mask_list_p);
dafe70
+                    n = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
dafe70
+                }
dafe70
                 OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p);
dafe70
-                node[node_ix].CPUs_total = n * ONE_HUNDRED;
dafe70
+                // Calculate total CPUs, but possibly discount hyper-threads
dafe70
+                if ((threads_per_core == 1) || (htt_percent >= 100)) {
dafe70
+                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
dafe70
+                } else {
dafe70
+                    n /= threads_per_core;
dafe70
+                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
dafe70
+                    node[node_ix].CPUs_total += n * (threads_per_core - 1) * htt_percent;
dafe70
+                }
dafe70
+                sum_CPUs_total += node[node_ix].CPUs_total;
dafe70
                 close(fd);
dafe70
             } else {
dafe70
                 numad_log(LOG_CRIT, "Could not get node cpu list\n");
dafe70
@@ -1220,15 +1307,30 @@ int update_nodes() {
dafe70
         }
dafe70
         free(namelist);
dafe70
     }
dafe70
-    // Second, get the dynamic free memory and available CPU capacity
dafe70
+    // Second, update the dynamic free memory and available CPU capacity
dafe70
+    while (cpu_data_buf[cur_cpu_data_buf].time_stamp + 7 >= time_stamp) {
dafe70
+        // Make sure at least 7/100 of a second has passed.
dafe70
+        // Otherwise sleep for 1/10 second.
dafe70
+	struct timespec ts = { 0, 100000000 }; 
dafe70
+	nanosleep(&ts, &ts);
dafe70
+	time_stamp = get_time_stamp();
dafe70
+    }
dafe70
     update_cpu_data();
dafe70
+    max_node_MBs_free = 0;
dafe70
+    max_node_CPUs_free = 0;
dafe70
+    min_node_MBs_free = MAXINT;
dafe70
+    min_node_CPUs_free = MAXINT;
dafe70
+    uint64_t sum_of_node_MBs_free = 0;
dafe70
+    uint64_t sum_of_node_CPUs_free = 0;
dafe70
     for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
dafe70
         int node_id = node[node_ix].node_id;
dafe70
         // Get available memory info from node<N>/meminfo file
dafe70
         snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id);
dafe70
         int fd = open(fname, O_RDONLY, 0);
dafe70
         if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
dafe70
+            close(fd);
dafe70
             uint64_t KB;
dafe70
+            buf[BIG_BUF_SIZE - 1] = '\0';
dafe70
             char *p = strstr(buf, "MemTotal:");
dafe70
             if (p != NULL) {
dafe70
                 p += 9;
dafe70
@@ -1238,7 +1340,11 @@ int update_nodes() {
dafe70
             }
dafe70
             while (!isdigit(*p)) { p++; }
dafe70
             CONVERT_DIGITS_TO_NUM(p, KB);
dafe70
-            node[node_ix].MBs_total = KB / KILOBYTE;
dafe70
+            node[node_ix].MBs_total = (KB / KILOBYTE);
dafe70
+            if (node[node_ix].MBs_total < 1) {
dafe70
+                // If a node has zero memory, remove it from the all_nodes_list...
dafe70
+                CLR_ID_IN_LIST(node_id, all_nodes_list_p);
dafe70
+            }
dafe70
             p = strstr(p, "MemFree:");
dafe70
             if (p != NULL) {
dafe70
                 p += 8;
dafe70
@@ -1248,8 +1354,28 @@ int update_nodes() {
dafe70
             }
dafe70
             while (!isdigit(*p)) { p++; }
dafe70
             CONVERT_DIGITS_TO_NUM(p, KB);
dafe70
-            node[node_ix].MBs_free = KB / KILOBYTE;
dafe70
-            close(fd);
dafe70
+            node[node_ix].MBs_free = (KB / KILOBYTE);
dafe70
+            if (use_inactive_file_cache) {
dafe70
+                // Add inactive file cache quantity to "free" memory
dafe70
+                p = strstr(p, "Inactive(file):");
dafe70
+                if (p != NULL) {
dafe70
+                    p += 15;
dafe70
+                } else {
dafe70
+                    numad_log(LOG_CRIT, "Could not get node Inactive(file)\n");
dafe70
+                    exit(EXIT_FAILURE);
dafe70
+                }
dafe70
+                while (!isdigit(*p)) { p++; }
dafe70
+                CONVERT_DIGITS_TO_NUM(p, KB);
dafe70
+                node[node_ix].MBs_free += (KB / KILOBYTE);
dafe70
+            }
dafe70
+            sum_of_node_MBs_free += node[node_ix].MBs_free;
dafe70
+            if (min_node_MBs_free > node[node_ix].MBs_free) {
dafe70
+                min_node_MBs_free = node[node_ix].MBs_free;
dafe70
+                min_node_MBs_free_ix = node[node_ix].node_id;
dafe70
+            }
dafe70
+            if (max_node_MBs_free < node[node_ix].MBs_free) {
dafe70
+                max_node_MBs_free = node[node_ix].MBs_free;
dafe70
+            }
dafe70
         } else {
dafe70
             numad_log(LOG_CRIT, "Could not get node meminfo\n");
dafe70
             exit(EXIT_FAILURE);
dafe70
@@ -1260,7 +1386,8 @@ int update_nodes() {
dafe70
         if (cpu_data_buf[old_cpu_data_buf].time_stamp > 0) {
dafe70
             uint64_t idle_ticks = 0;
dafe70
             int cpu = 0;
dafe70
-            int num_cpus_to_process = node[node_ix].CPUs_total / ONE_HUNDRED;
dafe70
+            int num_lcpus = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
dafe70
+            int num_cpus_to_process = num_lcpus;
dafe70
             while (num_cpus_to_process) {
dafe70
                 if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) {
dafe70
                     idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu]
dafe70
@@ -1274,15 +1401,46 @@ int update_nodes() {
dafe70
             // printf("Node: %d   CPUs: %ld   time diff %ld   Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks);
dafe70
             // assert(time_diff > 0);
dafe70
             node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff;
dafe70
+            // Possibly discount hyper-threads
dafe70
+            if ((threads_per_core > 1) && (htt_percent < 100)) {
dafe70
+                uint64_t htt_discount = (num_lcpus - (num_lcpus / threads_per_core)) * (100 - htt_percent);
dafe70
+                if (node[node_ix].CPUs_free > htt_discount) {
dafe70
+                    node[node_ix].CPUs_free -= htt_discount;
dafe70
+                } else {
dafe70
+                    node[node_ix].CPUs_free = 0;
dafe70
+                }
dafe70
+            }
dafe70
             if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) {
dafe70
                 node[node_ix].CPUs_free = node[node_ix].CPUs_total;
dafe70
             }
dafe70
+            sum_of_node_CPUs_free += node[node_ix].CPUs_free;
dafe70
+            if (min_node_CPUs_free > node[node_ix].CPUs_free) {
dafe70
+                min_node_CPUs_free = node[node_ix].CPUs_free;
dafe70
+                min_node_CPUs_free_ix = node[node_ix].node_id;
dafe70
+            }
dafe70
+            if (max_node_CPUs_free < node[node_ix].CPUs_free) {
dafe70
+                max_node_CPUs_free = node[node_ix].CPUs_free;
dafe70
+            }
dafe70
             node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free;
dafe70
         } else {
dafe70
             node[node_ix].CPUs_free = 0;
dafe70
             node[node_ix].magnitude = 0;
dafe70
         }
dafe70
     }
dafe70
+    avg_node_MBs_free = sum_of_node_MBs_free / num_nodes;
dafe70
+    avg_node_CPUs_free = sum_of_node_CPUs_free / num_nodes;
dafe70
+    double MBs_variance_sum = 0.0;
dafe70
+    double CPUs_variance_sum = 0.0;
dafe70
+    for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
dafe70
+        double MBs_diff = (double)node[node_ix].MBs_free - (double)avg_node_MBs_free;
dafe70
+        double CPUs_diff = (double)node[node_ix].CPUs_free - (double)avg_node_CPUs_free;
dafe70
+        MBs_variance_sum += MBs_diff * MBs_diff;
dafe70
+        CPUs_variance_sum += CPUs_diff * CPUs_diff;
dafe70
+    }
dafe70
+    double MBs_variance = MBs_variance_sum / (num_nodes);
dafe70
+    double CPUs_variance = CPUs_variance_sum / (num_nodes);
dafe70
+    stddev_node_MBs_free = sqrt(MBs_variance);
dafe70
+    stddev_node_CPUs_free = sqrt(CPUs_variance);
dafe70
     if (log_level >= LOG_INFO) {
dafe70
         show_nodes();
dafe70
     }
dafe70
@@ -1316,7 +1474,7 @@ typedef struct stat_data {
dafe70
     int64_t num_threads;  // 19
dafe70
     int64_t itrealvalue;
dafe70
     uint64_t starttime;
dafe70
-    uint64_t vsize;
dafe70
+    uint64_t vsize;       // 22
dafe70
     int64_t rss;          // 23
dafe70
     uint64_t rsslim;
dafe70
     uint64_t startcode;
dafe70
@@ -1356,15 +1514,16 @@ process_data_p get_stat_data_for_pid(int
dafe70
     }
dafe70
     static char buf[BUF_SIZE];
dafe70
     int bytes = read(fd, buf, BUF_SIZE);
dafe70
+    close(fd);
dafe70
     if (bytes < 50) {
dafe70
         numad_log(LOG_WARNING, "Could not read stat file: %s\n", fname);
dafe70
         return NULL;
dafe70
     }
dafe70
-    close(fd);
dafe70
+    uint64_t val;
dafe70
     char *p = buf;
dafe70
     static process_data_t data;
dafe70
     // Get PID from field 0
dafe70
-    uint64_t val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
dafe70
+    CONVERT_DIGITS_TO_NUM(p, val);
dafe70
     data.pid = val;
dafe70
     // Copy comm from field 1
dafe70
     while (*p == ' ') { p++; }
dafe70
@@ -1373,23 +1532,27 @@ process_data_p get_stat_data_for_pid(int
dafe70
     // Skip fields 2 through 12
dafe70
     for (int ix = 0;  (ix < 11);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
dafe70
     // Get utime from field 13 for cpu_util
dafe70
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
dafe70
+    CONVERT_DIGITS_TO_NUM(p, val);
dafe70
     data.cpu_util = val;
dafe70
     // Get stime from field 14 to add on to cpu_util (which already has utime)
dafe70
     while (*p == ' ') { p++; }
dafe70
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
dafe70
+    CONVERT_DIGITS_TO_NUM(p, val);
dafe70
     data.cpu_util += val;
dafe70
     // Skip fields 15 through 18
dafe70
     while (*p == ' ') { p++; }
dafe70
     for (int ix = 0;  (ix < 4);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
dafe70
     // Get num_threads from field 19
dafe70
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
dafe70
+    CONVERT_DIGITS_TO_NUM(p, val);
dafe70
     data.num_threads = val;
dafe70
-    // Skip fields 20 through 22
dafe70
+    // Skip fields 20 through 21
dafe70
     while (*p == ' ') { p++; }
dafe70
-    for (int ix = 0;  (ix < 3);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
dafe70
+    for (int ix = 0;  (ix < 2);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
dafe70
+    // Get vsize from field 22 to compute MBs_size
dafe70
+    CONVERT_DIGITS_TO_NUM(p, val);
dafe70
+    data.MBs_size = val / MEGABYTE;
dafe70
     // Get rss from field 23 to compute MBs_used
dafe70
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
dafe70
+    while (*p == ' ') { p++; }
dafe70
+    CONVERT_DIGITS_TO_NUM(p, val);
dafe70
     data.MBs_used = (val * page_size_in_bytes) / MEGABYTE;
dafe70
     // Return pointer to data
dafe70
     return &data;
dafe70
@@ -1471,446 +1634,409 @@ int update_processes() {
dafe70
 }
dafe70
 
dafe70
 
dafe70
+int initialize_mem_node_list(process_data_p p) {
dafe70
+    // Parameter p is a pointer to an element in the hash table
dafe70
+    if ((!p) || (p->pid < 1)) {
dafe70
+        numad_log(LOG_CRIT, "Cannot initialize mem node lists with bad PID\n");
dafe70
+        exit(EXIT_FAILURE);
dafe70
+    }
dafe70
+    int n = 0;
dafe70
+    char fname[FNAME_SIZE];
dafe70
+    char buf[BIG_BUF_SIZE];
dafe70
+    p->process_MBs = NULL;
dafe70
+    CLEAR_NODE_LIST(p->node_list_p);
dafe70
+    snprintf(fname, FNAME_SIZE, "/proc/%d/status", p->pid);
dafe70
+    int fd = open(fname, O_RDONLY, 0);
dafe70
+    if (fd < 0) {
dafe70
+        numad_log(LOG_WARNING, "Tried to research PID %d, but it apparently went away.\n", p->pid);
dafe70
+        return 0;  // Assume the process terminated
dafe70
+    }
dafe70
+    int bytes = read(fd, buf, BIG_BUF_SIZE);
dafe70
+    close(fd);
dafe70
+    if (bytes <= 0) {
dafe70
+        numad_log(LOG_WARNING, "Tried to research PID %d, but cannot read status file.\n", p->pid);
dafe70
+        return 0;  // Assume the process terminated
dafe70
+    } else if (bytes >= BIG_BUF_SIZE) {
dafe70
+        buf[BIG_BUF_SIZE - 1] = '\0';
dafe70
+    } else {
dafe70
+        buf[bytes] = '\0';
dafe70
+    }
dafe70
+    char *list_str_p = strstr(buf, "Mems_allowed_list:");
dafe70
+    if (!list_str_p) {
dafe70
+        numad_log(LOG_CRIT, "Could not get node Mems_allowed_list\n");
dafe70
+        exit(EXIT_FAILURE);
dafe70
+    }
dafe70
+    list_str_p += 18;
dafe70
+    while (!isdigit(*list_str_p)) { list_str_p++; }
dafe70
+    n = add_ids_to_list_from_str(p->node_list_p, list_str_p);
dafe70
+    if (n < num_nodes) {
dafe70
+        // If process already bound to a subset of nodes when we discover it,
dafe70
+        // set initial bind_time_stamp to 30 minutes ago...
dafe70
+        p->bind_time_stamp = get_time_stamp() - (1800 * ONE_HUNDRED);
dafe70
+    }
dafe70
+    return n;
dafe70
+}
dafe70
 
dafe70
-id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
dafe70
-    char buf[BUF_SIZE];
dafe70
-    char buf2[BUF_SIZE];
dafe70
+
dafe70
+uint64_t combined_value_of_weighted_resources(int ix, int mbs, int cpus, uint64_t MBs_free, uint64_t CPUs_free) {
dafe70
+    int64_t needed_mem;
dafe70
+    int64_t needed_cpu;
dafe70
+    int64_t excess_mem;
dafe70
+    int64_t excess_cpu;
dafe70
+    if (MBs_free > mbs) {
dafe70
+        needed_mem = mbs;
dafe70
+        excess_mem = MBs_free - mbs;
dafe70
+    } else {
dafe70
+        needed_mem = MBs_free;
dafe70
+        excess_mem = 0;
dafe70
+    }
dafe70
+    if (CPUs_free > cpus) {
dafe70
+        needed_cpu = cpus;
dafe70
+        excess_cpu = CPUs_free - cpus;
dafe70
+    } else {
dafe70
+        needed_cpu = CPUs_free;
dafe70
+        excess_cpu = 0;
dafe70
+    }
dafe70
+    // Weight the available resources, and then calculate magnitude as
dafe70
+    // product of available CPUs and available MBs.
dafe70
+    int64_t memfactor = (needed_mem * 10 + excess_mem * 4);
dafe70
+    int64_t cpufactor = (needed_cpu *  6 + excess_cpu * 1);
dafe70
+    numad_log(LOG_DEBUG, "    Node[%d]: mem: %ld  cpu: %ld\n", ix, memfactor, cpufactor);
dafe70
+    return (memfactor * cpufactor);
dafe70
+}
dafe70
+
dafe70
+
dafe70
+id_list_p pick_numa_nodes(int pid, int cpus, int mbs, int assume_enough_cpus) {
dafe70
     if (log_level >= LOG_DEBUG) {
dafe70
         numad_log(LOG_DEBUG, "PICK NODES FOR:  PID: %d,  CPUs %d,  MBs %d\n", pid, cpus, mbs);
dafe70
     }
dafe70
-    int num_existing_mems = 0;
dafe70
-    static id_list_p existing_mems_list_p;
dafe70
-    CLEAR_LIST(existing_mems_list_p);
dafe70
-    uint64_t time_stamp = get_time_stamp();
dafe70
-    static node_data_p tmp_node;
dafe70
-    static uint64_t *process_MBs;
dafe70
-    static uint64_t *saved_magnitude_for_node;
dafe70
-    static int process_MBs_num_nodes;
dafe70
-    // See if dynamic structures need to grow.
dafe70
-    if (process_MBs_num_nodes < num_nodes + 1) {
dafe70
-        process_MBs_num_nodes = num_nodes + 1;
dafe70
-        // The "+1 node" is for accumulating interleaved memory
dafe70
-        process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t));
dafe70
-        tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
dafe70
-        saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t));
dafe70
-        if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) {
dafe70
-            numad_log(LOG_CRIT, "process_MBs realloc failed\n");
dafe70
-            exit(EXIT_FAILURE);
dafe70
-        }
dafe70
-    }
dafe70
+    char buf[BUF_SIZE];
dafe70
+    uint64_t proc_avg_node_CPUs_free = 0;
dafe70
     // For existing processes, get miscellaneous process specific details
dafe70
     int pid_ix;
dafe70
     process_data_p p = NULL;
dafe70
     if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) {
dafe70
         p = &process_hash_table[pid_ix];
dafe70
-        // Quick rejection if this process has interleaved memory, but recheck it once an hour...
dafe70
-#define MIN_DELAY_FOR_INTERLEAVE (3600 * ONE_HUNDRED)
dafe70
-        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
dafe70
-          && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
dafe70
-            if (log_level >= LOG_DEBUG) {
dafe70
-                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
dafe70
-            }
dafe70
-            return NULL;
dafe70
-        }
dafe70
-        // Get cpuset name for this process, and existing mems binding, if any.
dafe70
+        // Add up per-node memory in use by this process.
dafe70
+        // This scanning is expensive and should be minimized.
dafe70
         char fname[FNAME_SIZE];
dafe70
-        snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", pid);
dafe70
-        FILE *fs = fopen(fname, "r");
dafe70
-        if (!fs) {
dafe70
-            numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
dafe70
-            return NULL;  // Assume the process terminated?
dafe70
-        }
dafe70
-        if (!fgets(buf, BUF_SIZE, fs)) {
dafe70
-            numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
dafe70
-            fclose(fs);
dafe70
-            return NULL;  // Assume the process terminated?
dafe70
-        }
dafe70
-        fclose(fs);
dafe70
-        ELIM_NEW_LINE(buf);
dafe70
-        if ((!p->cpuset_name) || (strcmp(p->cpuset_name, buf))) {
dafe70
-            if (p->cpuset_name != NULL) {
dafe70
-                free(p->cpuset_name);
dafe70
-            }
dafe70
-            p->cpuset_name = strdup(buf);
dafe70
-        }
dafe70
-        if (log_level >= LOG_DEBUG) {
dafe70
-            numad_log(LOG_DEBUG, "CPUSET_NAME: %s\n", p->cpuset_name);
dafe70
-        }
dafe70
-        snprintf(fname, FNAME_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
dafe70
-        fs = fopen(fname, "r");
dafe70
-        if ((fs) && (fgets(buf, BUF_SIZE, fs))) {
dafe70
-            fclose(fs);
dafe70
-            num_existing_mems = add_ids_to_list_from_str(existing_mems_list_p, buf);
dafe70
-            if (log_level >= LOG_DEBUG) {
dafe70
-                str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
dafe70
-                numad_log(LOG_DEBUG, "EXISTING CPUSET NODE LIST: %s\n", buf);
dafe70
-            }
dafe70
-        } 
dafe70
-        // If this process was just recently bound, enforce a minimum delay
dafe70
-        // period between repeated attempts to potentially move the memory.
dafe70
-        // FIXME: ?? might this retard appropriate process expansion too much?  
dafe70
-#define MIN_DELAY_FOR_REEVALUATION (30 * ONE_HUNDRED)
dafe70
-        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
dafe70
-            // Skip re-evaluation because we just did it recently.
dafe70
-            if (log_level >= LOG_DEBUG) {
dafe70
-                numad_log(LOG_DEBUG, "Skipping evaluation because done too recently.\n");
dafe70
-            }
dafe70
-            return NULL;
dafe70
-        }
dafe70
-        // Look for short cut because of duplicate bindings.  If we have bound
dafe70
-        // this process to the same nodes multiple times already, and the load
dafe70
-        // on those nodes still seems acceptable, skip the rest of this and
dafe70
-        // just return NULL to indicate no change needed.  FIXME: should figure
dafe70
-        // out what can change that would make a rebinding desirable (e.g. (1)
dafe70
-        // some process gets sub-optimal allocation on busy machine which
dafe70
-        // subsequently becomes less busy leaving disadvantaged process. (2)
dafe70
-        // node load imbalance, (3) any process split across nodes which should
dafe70
-        // fit within a single node.) For now, just expire the dup_bid_count
dafe70
-        // occasionally, which is a reasonably good mitigation.
dafe70
-        // So, check to see if we should decay the dup_bind_count...
dafe70
-#define DUP_BIND_TIME_OUT (300 * ONE_HUNDRED)
dafe70
-        if ((p->dup_bind_count > 0) && (p->bind_time_stamp + DUP_BIND_TIME_OUT < time_stamp)) {
dafe70
-            p->dup_bind_count -= 1;
dafe70
-        }
dafe70
-        // Now, look for short cut because of duplicate bindings
dafe70
-        if (p->dup_bind_count > 0) {
dafe70
-            int node_id = 0;
dafe70
-            int nodes_have_cpu = 1;
dafe70
-            int nodes_have_ram = 1;
dafe70
-            int n = num_existing_mems;
dafe70
-            int min_resource_pct = 100 - target_utilization;
dafe70
-            if (min_resource_pct < 5) {
dafe70
-                min_resource_pct = 5;
dafe70
-            }
dafe70
-            while (n) {
dafe70
-                if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
dafe70
-                    nodes_have_cpu &= ((100 * node[node_id].CPUs_free / node[node_id].CPUs_total) >= (min_resource_pct));
dafe70
-                    nodes_have_ram &= ((100 * node[node_id].MBs_free  / node[node_id].MBs_total)  >= (min_resource_pct));
dafe70
-                    n -= 1;
dafe70
-                }
dafe70
-                node_id += 1;
dafe70
-            }
dafe70
-            if ((nodes_have_cpu) && (nodes_have_ram)) {
dafe70
-                if (log_level >= LOG_DEBUG) {
dafe70
-                    numad_log(LOG_DEBUG, "Skipping evaluation because of repeat binding\n");
dafe70
-                }
dafe70
-                return NULL;
dafe70
-            }
dafe70
-            if (log_level >= LOG_DEBUG) {
dafe70
-                numad_log(LOG_DEBUG, "Evaluated for skipping by repeat binding, but CPUS: %d, RAM: %d\n", nodes_have_cpu, nodes_have_ram);
dafe70
-            }
dafe70
-        }
dafe70
-        // Fourth, add up per-node memory in use by this process. This scanning
dafe70
-        // is expensive and should be minimized.  Also, old kernels dismantle
dafe70
-        // transparent huge pages while producing the numa_maps memory
dafe70
-        // information! 
dafe70
-        memset(process_MBs, 0, process_MBs_num_nodes * sizeof(uint64_t));
dafe70
         snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid);
dafe70
-        fs = fopen(fname, "r");
dafe70
+        FILE *fs = fopen(fname, "r");
dafe70
         if (!fs) {
dafe70
             numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid);
dafe70
             return NULL;  // Assume the process terminated
dafe70
         }
dafe70
+        // Allocate and zero per node memory array.
dafe70
+        // The "+1 node" is for accumulating interleaved memory
dafe70
+        p->process_MBs = realloc(p->process_MBs, (num_nodes + 1) * sizeof(uint64_t));
dafe70
+        if (p->process_MBs == NULL) {
dafe70
+            numad_log(LOG_CRIT, "p->process_MBs realloc failed\n");
dafe70
+            exit(EXIT_FAILURE);
dafe70
+        }
dafe70
+        memset(p->process_MBs, 0, (num_nodes + 1) * sizeof(uint64_t));
dafe70
         int process_has_interleaved_memory = 0;
dafe70
         while (fgets(buf, BUF_SIZE, fs)) {
dafe70
             int interleaved_memory = 0;
dafe70
             uint64_t page_size = page_size_in_bytes;
dafe70
             const char *delimiters = " \n";
dafe70
-            char *p = strtok(buf, delimiters);
dafe70
-            while (p) {
dafe70
-                if (!strncmp(p, "interleave", 10)) {
dafe70
+            char *str_p = strtok(buf, delimiters);
dafe70
+            while (str_p) {
dafe70
+                if (!strncmp(str_p, "interleave", 10)) {
dafe70
                     interleaved_memory = 1;
dafe70
                     process_has_interleaved_memory = 1;
dafe70
-                } else if (!strcmp(p, "huge")) {
dafe70
+                } else if (!strcmp(str_p, "huge")) {
dafe70
                     page_size = huge_page_size_in_bytes;
dafe70
-                } else if (*p++ == 'N') {
dafe70
+                } else if (*str_p++ == 'N') {
dafe70
                     int node;
dafe70
                     uint64_t pages;
dafe70
-                    CONVERT_DIGITS_TO_NUM(p, node);
dafe70
-                    if (*p++ != '=') {
dafe70
+                    CONVERT_DIGITS_TO_NUM(str_p, node);
dafe70
+                    if (*str_p++ != '=') {
dafe70
                         numad_log(LOG_CRIT, "numa_maps node number parse error\n");
dafe70
                         exit(EXIT_FAILURE);
dafe70
                     }
dafe70
-                    CONVERT_DIGITS_TO_NUM(p, pages);
dafe70
-                    process_MBs[node] += (pages * page_size);
dafe70
+                    CONVERT_DIGITS_TO_NUM(str_p, pages);
dafe70
+                    p->process_MBs[node] += (pages * page_size);
dafe70
                     if (interleaved_memory) {
dafe70
                         // sum interleaved quantity in "extra node"
dafe70
-                        process_MBs[num_nodes] += (pages * page_size);
dafe70
+                        p->process_MBs[num_nodes] += (pages * page_size);
dafe70
                     }
dafe70
                 }
dafe70
                 // Get next token on the line
dafe70
-                p = strtok(NULL, delimiters);
dafe70
+                str_p = strtok(NULL, delimiters);
dafe70
             }
dafe70
         }
dafe70
         fclose(fs);
dafe70
+        proc_avg_node_CPUs_free = p->CPUs_used;
dafe70
         for (int ix = 0;  (ix <= num_nodes);  ix++) {
dafe70
-            process_MBs[ix] /= MEGABYTE;
dafe70
-            if (log_level >= LOG_DEBUG) {
dafe70
-                numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]);
dafe70
+            p->process_MBs[ix] /= MEGABYTE;
dafe70
+            if ((log_level >= LOG_DEBUG) && (p->process_MBs[ix] > 0)) {
dafe70
+                if (ix == num_nodes) {
dafe70
+                    numad_log(LOG_DEBUG, "Interleaved MBs: %ld\n", p->process_MBs[ix]);
dafe70
+                } else {
dafe70
+                    numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, p->process_MBs[ix]);
dafe70
+                }
dafe70
+            }
dafe70
+            if (ID_IS_IN_LIST(ix, p->node_list_p)) {
dafe70
+                proc_avg_node_CPUs_free += node[ix].CPUs_free;
dafe70
             }
dafe70
         }
dafe70
+        proc_avg_node_CPUs_free /= NUM_IDS_IN_LIST(p->node_list_p);
dafe70
         if ((process_has_interleaved_memory) && (keep_interleaved_memory)) {
dafe70
             // Mark this process as having interleaved memory so we do not
dafe70
-            // merge the interleaved memory.  Time stamp it as done.
dafe70
+            // merge the interleaved memory.  Time stamp it as done and return.
dafe70
             p->flags |= PROCESS_FLAG_INTERLEAVED;
dafe70
             p->bind_time_stamp = get_time_stamp();
dafe70
             if (log_level >= LOG_DEBUG) {
dafe70
-                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
dafe70
+                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
dafe70
             }
dafe70
             return NULL;
dafe70
         }
dafe70
     }  // end of existing PID conditional
dafe70
     // Make a copy of node available resources array.  Add in info specific to
dafe70
     // this process to equalize available resource quantities wrt locations of
dafe70
-    // resources already in use by this process.  Inflate the value of already
dafe70
-    // assigned memory by approximately 3/2, because moving memory is
dafe70
-    // expensive.  Average the amount of CPUs_free across the existing nodes
dafe70
-    // used, because the threads are free to move around in that domain.  After
dafe70
-    // calculating combined magnitude of available resources, bias the values
dafe70
-    // towards existing locations for this process.
dafe70
-    int target_using_all_nodes = 0;
dafe70
-    uint64_t node_CPUs_free_for_this_process = 0;
dafe70
-    memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
dafe70
-    if (num_existing_mems > 0) {
dafe70
-        node_CPUs_free_for_this_process = cpus; // ?? Correct for utilization target inflation?
dafe70
-        int node_id = 0;
dafe70
-        int n = num_existing_mems;
dafe70
-        while (n) {
dafe70
-            if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
dafe70
-                node_CPUs_free_for_this_process += tmp_node[node_id].CPUs_free;
dafe70
-                n -= 1;
dafe70
-            }
dafe70
-            node_id += 1;
dafe70
-        }
dafe70
-        // Divide to get average CPUs_free for the nodes in use by process
dafe70
-        node_CPUs_free_for_this_process /= num_existing_mems;
dafe70
+    // resources already in use by this process.
dafe70
+    static node_data_p tmp_node;
dafe70
+    tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
dafe70
+    if (tmp_node == NULL) {
dafe70
+        numad_log(LOG_CRIT, "tmp_node realloc failed\n");
dafe70
+        exit(EXIT_FAILURE);
dafe70
     }
dafe70
+    memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
dafe70
+    uint64_t sum_of_node_CPUs_free = 0;
dafe70
     for (int ix = 0;  (ix < num_nodes);  ix++) {
dafe70
         if (pid > 0) {
dafe70
-            tmp_node[ix].MBs_free  += ((process_MBs[ix] * 12) / 8);
dafe70
-        }
dafe70
-        if ((num_existing_mems > 0) && (ID_IS_IN_LIST(ix, existing_mems_list_p))) {
dafe70
-            tmp_node[ix].CPUs_free = node_CPUs_free_for_this_process;
dafe70
-        }
dafe70
-        if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
dafe70
-            tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
dafe70
-        }
dafe70
-        if (log_level >= LOG_DEBUG) {
dafe70
-            numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, tmp_node[ix].CPUs_free);
dafe70
+            if (NUM_IDS_IN_LIST(p->node_list_p) >= num_nodes) {
dafe70
+                // Process not yet bound to a subset of nodes.
dafe70
+                // Add back memory used by this process on this node.
dafe70
+                tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 17) / 16);  // Apply light mem bias
dafe70
+                // Add back CPU used by this process in proportion to the memory used on this node.
dafe70
+                tmp_node[ix].CPUs_free += ((p->CPUs_used * p->process_MBs[ix]) / p->MBs_used);
dafe70
+            } else {
dafe70
+                // If the process is currently running on less than all the
dafe70
+                // nodes, first add back (biased) memory already used by this
dafe70
+                // process on this node, then assign average process CPU / node
dafe70
+                // for this process iff the process is present on this node.
dafe70
+                tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 5) / 4);  // Apply heavy mem bias
dafe70
+                if (ID_IS_IN_LIST(ix, p->node_list_p)) {
dafe70
+                    tmp_node[ix].CPUs_free = proc_avg_node_CPUs_free;
dafe70
+                }
dafe70
+            }
dafe70
+            sum_of_node_CPUs_free += tmp_node[ix].CPUs_free;
dafe70
+            if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
dafe70
+                tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
dafe70
+            }
dafe70
+            if (tmp_node[ix].MBs_free > tmp_node[ix].MBs_total) {
dafe70
+                tmp_node[ix].MBs_free = tmp_node[ix].MBs_total;
dafe70
+            }
dafe70
         }
dafe70
-        // Calculate magnitude as product of available CPUs and available MBs
dafe70
-        tmp_node[ix].magnitude = tmp_node[ix].CPUs_free * tmp_node[ix].MBs_free;
dafe70
-        // Bias combined magnitude towards already assigned nodes
dafe70
-        if (ID_IS_IN_LIST(ix, existing_mems_list_p)) {
dafe70
-            tmp_node[ix].magnitude *= 9;
dafe70
-            tmp_node[ix].magnitude /= 8;
dafe70
+        // Enforce 1/100th CPU minimum
dafe70
+        if (tmp_node[ix].CPUs_free < 1) {
dafe70
+            tmp_node[ix].CPUs_free = 1;
dafe70
         }
dafe70
-        // Save the current magnitudes
dafe70
-        saved_magnitude_for_node[ix] = tmp_node[ix].magnitude;
dafe70
+        // numad_log(LOG_DEBUG, "Raw Node[%d]: mem: %ld  cpu: %ld\n", ix, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
dafe70
+        tmp_node[ix].magnitude = combined_value_of_weighted_resources(ix, mbs, cpus, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
dafe70
     }
dafe70
-    // OK, figure out where to get resources for this request.
dafe70
+    // Now figure out where to get resources for this request....
dafe70
     static id_list_p target_node_list_p;
dafe70
-    CLEAR_LIST(target_node_list_p);
dafe70
-    int prev_node_used = -1;
dafe70
-    // Continue to allocate more resources until request are met.
dafe70
-    // OK if not not quite all the CPU request is met.
dafe70
-    // FIXME: ?? Is half of the utilization margin a good amount of CPU flexing?
dafe70
-    int cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200; 
dafe70
-    if (pid <= 0) {
dafe70
-        // If trying to find resources for pre-placement advice request, do not
dafe70
-        // underestimate the amount of CPUs needed.  Instead, err on the side
dafe70
-        // of providing too many resources.  So, no flexing here...
dafe70
-        cpu_flex = 0;
dafe70
-    }
dafe70
-    while ((mbs > 0) || (cpus > cpu_flex)) {
dafe70
-        if (log_level >= LOG_DEBUG) {
dafe70
-            numad_log(LOG_DEBUG, "MBs: %d,  CPUs: %d\n", mbs, cpus);
dafe70
+    CLEAR_NODE_LIST(target_node_list_p);
dafe70
+    if ((pid > 0) && (cpus > sum_of_node_CPUs_free)) {
dafe70
+        // System CPUs might be oversubscribed, but...
dafe70
+        assume_enough_cpus = 1;
dafe70
+        // and rely on available memory for placement.
dafe70
+    }
dafe70
+    // Establish a CPU flex fudge factor, on the presumption it is OK if not
dafe70
+    // quite all the CPU request is met.  However, if trying to find resources
dafe70
+    // for pre-placement advice request, do not underestimate the amount of
dafe70
+    // CPUs needed.  Instead, err on the side of providing too many resources.
dafe70
+    int cpu_flex = 0;
dafe70
+    if ((pid > 0) && (target_utilization < 100)) {
dafe70
+        // FIXME: Is half of the utilization margin a good amount of CPU flexing?
dafe70
+        cpu_flex = ((100 - target_utilization) * node[0].CPUs_total) / 200;
dafe70
+    }
dafe70
+    // Figure out minimum number of nodes required
dafe70
+    int mem_req_nodes = ceil((double)mbs  / (double)node[0].MBs_total);
dafe70
+    int cpu_req_nodes = ceil((double)(cpus - cpu_flex) / (double)node[0].CPUs_total); 
dafe70
+    int min_req_nodes = mem_req_nodes;
dafe70
+    if (min_req_nodes < cpu_req_nodes) {
dafe70
+        min_req_nodes = cpu_req_nodes;
dafe70
+    }
dafe70
+    if (min_req_nodes > num_nodes) {
dafe70
+        min_req_nodes = num_nodes;
dafe70
+    }
dafe70
+    // Use an index to sort NUMA connected resource chain for each node
dafe70
+    int index[num_nodes];
dafe70
+    uint64_t totmag[num_nodes];
dafe70
+    for (int ix = 0;  (ix < num_nodes);  ix++) {
dafe70
+        // Reset the index each time
dafe70
+        for (int n = 0;  (n < num_nodes);  n++) {
dafe70
+            index[n] = n;
dafe70
         }
dafe70
-        // Sort nodes by magnitude of available resources.  Note that
dafe70
-        // inter-node distances (to the previous node used) are factored into
dafe70
-        // the sort.
dafe70
+        // Sort by minimum relative NUMA distance from node[ix],
dafe70
+        // breaking distance ties with magnitude of available resources
dafe70
         for (int ij = 0;  (ij < num_nodes);  ij++) {
dafe70
-            int big_ix = ij;
dafe70
+            int best_ix = ij;
dafe70
             for (int ik = ij + 1;  (ik < num_nodes);  ik++) {
dafe70
-                uint64_t ik_dist = 1;
dafe70
-                uint64_t big_ix_dist = 1;
dafe70
-                if (prev_node_used >= 0) {
dafe70
-                    ik_dist = tmp_node[ik].distance[prev_node_used];
dafe70
-                    big_ix_dist = tmp_node[big_ix].distance[prev_node_used];
dafe70
-                }
dafe70
-                // Scale magnitude comparison by distances to previous node used...
dafe70
-                if ((tmp_node[big_ix].magnitude / big_ix_dist) < (tmp_node[ik].magnitude / ik_dist)) {
dafe70
-                    big_ix = ik;
dafe70
-                }
dafe70
-            }
dafe70
-            if (big_ix != ij) {
dafe70
-                node_data_t tmp;
dafe70
-                memcpy((void *)&tmp, (void *)&tmp_node[ij], sizeof(node_data_t) );
dafe70
-                memcpy((void *)&tmp_node[ij], (void *)&tmp_node[big_ix], sizeof(node_data_t) );
dafe70
-                memcpy((void *)&tmp_node[big_ix], (void *)&tmp, sizeof(node_data_t) );
dafe70
+                int ik_dist = tmp_node[index[ik]].distance[ix];
dafe70
+                int best_ix_dist = tmp_node[index[best_ix]].distance[ix];
dafe70
+                if (best_ix_dist > ik_dist) {
dafe70
+                    best_ix = ik;
dafe70
+                } else if (best_ix_dist == ik_dist) {
dafe70
+                    if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) {
dafe70
+                        best_ix = ik;
dafe70
+                    }
dafe70
+                }
dafe70
+            }
dafe70
+            if (best_ix != ij) {
dafe70
+                int tmp = index[ij];
dafe70
+                index[ij] = index[best_ix];
dafe70
+                index[best_ix] = tmp;
dafe70
             }
dafe70
         }
dafe70
+#if 0
dafe70
         if (log_level >= LOG_DEBUG) {
dafe70
-            for (int ix = 0;  (ix < num_nodes);  ix++) {
dafe70
-                numad_log(LOG_DEBUG, "Sorted magnitude[%d]: %ld\n", tmp_node[ix].node_id, tmp_node[ix].magnitude);
dafe70
+            for (int iq = 0;  (iq < num_nodes);  iq++) {
dafe70
+                numad_log(LOG_DEBUG, "Node: %d  Dist: %d  Magnitude: %ld\n",
dafe70
+                    tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[ix], tmp_node[index[iq]].magnitude);
dafe70
+            }
dafe70
+        }
dafe70
+#endif
dafe70
+        // Save the totmag[] sum of the magnitudes of expected needed nodes,
dafe70
+        // "normalized" by NUMA distance (by dividing each magnitude by the
dafe70
+        // relative distance squared).
dafe70
+        totmag[ix] = 0;
dafe70
+        for (int ij = 0;  (ij < min_req_nodes);  ij++) {
dafe70
+            int dist = tmp_node[index[ij]].distance[ix];
dafe70
+            totmag[ix] += (tmp_node[index[ij]].magnitude / (dist * dist));
dafe70
+        }
dafe70
+        numad_log(LOG_DEBUG, "Totmag[%d]: %ld\n", ix, totmag[ix]);
dafe70
+    }
dafe70
+    // Now find the best NUMA node based on the normalized sum of node
dafe70
+    // magnitudes expected to be used.
dafe70
+    int best_node_ix = 0;
dafe70
+    for (int ix = 0;  (ix < num_nodes);  ix++) {
dafe70
+        if (totmag[best_node_ix] < totmag[ix]) {
dafe70
+            best_node_ix = ix;
dafe70
+        }
dafe70
+    }
dafe70
+    numad_log(LOG_DEBUG, "best_node_ix: %d\n", best_node_ix);
dafe70
+    // Reset sorting index again
dafe70
+    for (int n = 0;  (n < num_nodes);  n++) {
dafe70
+        index[n] = n;
dafe70
+    }
dafe70
+    // Sort index by distance from node[best_node_ix],
dafe70
+    // breaking distance ties with magnitude
dafe70
+    for (int ij = 0;  (ij < num_nodes);  ij++) {
dafe70
+        int best_ix = ij;
dafe70
+        for (int ik = ij + 1;  (ik < num_nodes);  ik++) {
dafe70
+            int ik_dist = tmp_node[index[ik]].distance[best_node_ix];
dafe70
+            int best_ix_dist = tmp_node[index[best_ix]].distance[best_node_ix];
dafe70
+            if (best_ix_dist > ik_dist) {
dafe70
+                best_ix = ik;
dafe70
+            } else if (best_ix_dist == ik_dist) {
dafe70
+                if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) {
dafe70
+                    best_ix = ik;
dafe70
+                }
dafe70
             }
dafe70
         }
dafe70
-        if (tmp_node[0].node_id == prev_node_used) {
dafe70
-            // Hmmm.  Looks like the best node for more resources, is also the
dafe70
-            // last one we used.  This is not going to make progress...  So
dafe70
-            // just punt and use everything.
dafe70
-            OR_LISTS(target_node_list_p, target_node_list_p, all_nodes_list_p);
dafe70
-            target_using_all_nodes = 1;
dafe70
-            break;
dafe70
+        if (best_ix != ij) {
dafe70
+            int tmp = index[ij];
dafe70
+            index[ij] = index[best_ix];
dafe70
+            index[best_ix] = tmp;
dafe70
+        }
dafe70
+    }
dafe70
+    if (log_level >= LOG_DEBUG) {
dafe70
+        for (int iq = 0;  (iq < num_nodes);  iq++) {
dafe70
+            numad_log(LOG_DEBUG, "Node: %d  Dist: %d  Magnitude: %ld\n",
dafe70
+                tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[best_node_ix], tmp_node[index[iq]].magnitude);
dafe70
         }
dafe70
-        prev_node_used = tmp_node[0].node_id;
dafe70
-        ADD_ID_TO_LIST(tmp_node[0].node_id, target_node_list_p);
dafe70
+    }
dafe70
+    // Allocate more resources until request is met.
dafe70
+    best_node_ix = 0;
dafe70
+    while ((min_req_nodes > 0) || (mbs > 0) || ((cpus > cpu_flex) && (!assume_enough_cpus))) {
dafe70
         if (log_level >= LOG_DEBUG) {
dafe70
-            str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
dafe70
-            str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
dafe70
-            numad_log(LOG_DEBUG, "Existing nodes: %s  Target nodes: %s\n", buf, buf2);
dafe70
+            numad_log(LOG_DEBUG, "MBs: %d,  CPUs: %d\n", mbs, cpus);
dafe70
         }
dafe70
+        numad_log(LOG_DEBUG, "Assigning resources from node %d\n", index[best_node_ix]);
dafe70
+        ADD_ID_TO_LIST(tmp_node[index[best_node_ix]].node_id, target_node_list_p);
dafe70
+        min_req_nodes -= 1;
dafe70
         if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) {
dafe70
             // Apparently we must use all resource nodes...
dafe70
-            target_using_all_nodes = 1;
dafe70
             break;
dafe70
         }
dafe70
-#define MBS_MARGIN 10
dafe70
-        if (tmp_node[0].MBs_free >= (mbs + MBS_MARGIN)) {
dafe70
-            tmp_node[0].MBs_free -= mbs;
dafe70
+        // "Consume" the resources on this node
dafe70
+#define CPUS_MARGIN 0
dafe70
+#define MBS_MARGIN 100
dafe70
+        if (tmp_node[index[best_node_ix]].MBs_free >= (mbs + MBS_MARGIN)) {
dafe70
+            tmp_node[index[best_node_ix]].MBs_free -= mbs;
dafe70
             mbs = 0;
dafe70
         } else {
dafe70
-            mbs -= (tmp_node[0].MBs_free - MBS_MARGIN);
dafe70
-            tmp_node[0].MBs_free = MBS_MARGIN;
dafe70
+            mbs -= (tmp_node[index[best_node_ix]].MBs_free - MBS_MARGIN);
dafe70
+            tmp_node[index[best_node_ix]].MBs_free = MBS_MARGIN;
dafe70
         }
dafe70
-#define CPUS_MARGIN 0
dafe70
-        if (tmp_node[0].CPUs_free >= (cpus + CPUS_MARGIN)) {
dafe70
-            tmp_node[0].CPUs_free -= cpus;
dafe70
+        if (tmp_node[index[best_node_ix]].CPUs_free >= (cpus + CPUS_MARGIN)) {
dafe70
+            tmp_node[index[best_node_ix]].CPUs_free -= cpus;
dafe70
             cpus = 0;
dafe70
         } else {
dafe70
-            cpus -= (tmp_node[0].CPUs_free - CPUS_MARGIN);
dafe70
-            tmp_node[0].CPUs_free = CPUS_MARGIN;
dafe70
-        }
dafe70
-        tmp_node[0].magnitude = tmp_node[0].CPUs_free * tmp_node[0].MBs_free;
dafe70
-    }
dafe70
-    // If this existing process is already located where we want it, and almost
dafe70
-    // all memory is already moved to those nodes, then return NULL indicating
dafe70
-    // no need to change binding this time.
dafe70
-    if ((pid > 0) && (EQUAL_LISTS(target_node_list_p, existing_mems_list_p))) {
dafe70
-        // May not need to change binding.  However, if there is any significant
dafe70
-        // memory still on non-target nodes, advise the bind anyway because
dafe70
-        // there are some scenarios when the kernel will not move it all the
dafe70
-        // first time.
dafe70
-        if (!target_using_all_nodes) {
dafe70
-            p->dup_bind_count += 1;
dafe70
-            for (int ix = 0;  (ix < num_nodes);  ix++) {
dafe70
-                if ((process_MBs[ix] > 10) && (!ID_IS_IN_LIST(ix, target_node_list_p))) {
dafe70
-                    goto try_memory_move_again;
dafe70
-                }
dafe70
-            }
dafe70
-            // We will accept these memory locations.  Stamp it as done.
dafe70
-            p->bind_time_stamp = get_time_stamp();
dafe70
-        }
dafe70
-        // Skip rebinding either because practically all memory is in the
dafe70
-        // target nodes, or because we are stuck using all the nodes.
dafe70
-        if (log_level >= LOG_DEBUG) {
dafe70
-            numad_log(LOG_DEBUG, "Skipping evaluation because memory is reasonably situated.\n");
dafe70
+            cpus -= (tmp_node[index[best_node_ix]].CPUs_free - CPUS_MARGIN);
dafe70
+            tmp_node[index[best_node_ix]].CPUs_free = CPUS_MARGIN;
dafe70
         }
dafe70
-        return NULL;
dafe70
-    } else {
dafe70
-        // Either a non-existing process, or a new binding for an existing process.
dafe70
-        if (p != NULL) {
dafe70
-            // Must be a new binding for an existing process, so reset dup_bind_count.
dafe70
-            p->dup_bind_count = 0;
dafe70
-        }
dafe70
-    }
dafe70
-    // See if this proposed move will make a significant difference.
dafe70
-    // If not, return null instead of advising the move.
dafe70
-    uint64_t target_magnitude = 0;
dafe70
-    uint64_t existing_magnitude = 0;
dafe70
-    int num_target_nodes   = NUM_IDS_IN_LIST(target_node_list_p);
dafe70
-    int num_existing_nodes = NUM_IDS_IN_LIST(existing_mems_list_p);
dafe70
-    /* FIXME: this expansion seems to cause excessive growth
dafe70
-     * So calculate the improvement before hastily expanding nodes.
dafe70
-    if (num_target_nodes > num_existing_nodes) { goto try_memory_move_again; }
dafe70
-    */
dafe70
-    int node_id = 0;
dafe70
-    int n = num_existing_nodes + num_target_nodes;
dafe70
-    while (n) {
dafe70
-        if (ID_IS_IN_LIST(node_id, target_node_list_p)) {
dafe70
-            target_magnitude += saved_magnitude_for_node[node_id];
dafe70
-            n -= 1;
dafe70
-        }
dafe70
-        if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
dafe70
-            existing_magnitude += saved_magnitude_for_node[node_id];
dafe70
-            n -= 1;
dafe70
-        }
dafe70
-        node_id += 1;
dafe70
-    }
dafe70
-    if (existing_magnitude > 0) {
dafe70
-        uint64_t magnitude_change = ((target_magnitude - existing_magnitude) * 100) / existing_magnitude;
dafe70
-        if (magnitude_change < 0) {
dafe70
-            magnitude_change = -(magnitude_change);
dafe70
-        }
dafe70
-        if (magnitude_change <= IMPROVEMENT_THRESHOLD_PERCENT) {
dafe70
-            // Not significant enough percentage change to do rebind
dafe70
+        // Next line optional, since we will not look at that node again
dafe70
+        tmp_node[index[best_node_ix]].magnitude = combined_value_of_weighted_resources(0, mbs, cpus, tmp_node[index[best_node_ix]].MBs_free, tmp_node[index[best_node_ix]].CPUs_free);
dafe70
+        best_node_ix += 1;
dafe70
+    }
dafe70
+    // For existing processes, calculate the non-local memory percent to see if
dafe70
+    // process is already in the right place.
dafe70
+    if ((pid > 0) && (p != NULL)) {
dafe70
+        uint64_t nonlocal_memory = 0;
dafe70
+        for (int ix = 0;  (ix < num_nodes);  ix++) {
dafe70
+            if (!ID_IS_IN_LIST(ix, target_node_list_p)) {
dafe70
+                // Accumulate total of nonlocal memory
dafe70
+                nonlocal_memory += p->process_MBs[ix];
dafe70
+            }
dafe70
+        }
dafe70
+        int disp_percent = (100 * nonlocal_memory) / p->MBs_used;
dafe70
+        // If this existing process is already located where we want it, then just
dafe70
+        // return NULL indicating no need to change binding this time.  Check the
dafe70
+        // amount of nonlocal memory against the target_memlocality percent.
dafe70
+        if ((disp_percent <= (100 - target_memlocality)) && (p->bind_time_stamp) && (EQUAL_LISTS(target_node_list_p, p->node_list_p))) {
dafe70
+            // Already bound to targets, and enough of the memory is located where we want it, so no need to rebind
dafe70
             if (log_level >= LOG_DEBUG) {
dafe70
-                str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
dafe70
-                str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
dafe70
-                numad_log(LOG_DEBUG, "Moving pid %d from nodes (%s) to nodes (%s) skipped as insignificant improvement: %ld percent.\n",
dafe70
-                    pid, buf, buf2, magnitude_change);
dafe70
+                numad_log(LOG_DEBUG, "Process %d already %d percent localized to target nodes.\n", p->pid, 100 - disp_percent);
dafe70
             }
dafe70
-            // We decided this is almost good enough.  Stamp it as done.
dafe70
             p->bind_time_stamp = get_time_stamp();
dafe70
             return NULL;
dafe70
         }
dafe70
     }
dafe70
-    if ((pid <= 0) && (num_target_nodes <= 0)) {
dafe70
-        // Always provide at least one node for pre-placement advice
dafe70
+    // Must always provide at least one node for pre-placement advice
dafe70
+    // FIXME: verify this can happen only if no resources requested...
dafe70
+    if ((pid <= 0) && (NUM_IDS_IN_LIST(target_node_list_p) <= 0)) {
dafe70
         ADD_ID_TO_LIST(node[0].node_id, target_node_list_p);
dafe70
     }
dafe70
-try_memory_move_again:
dafe70
-    str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
dafe70
+    // Log advice, and return target node list
dafe70
+    if ((pid > 0) && (p->bind_time_stamp)) {
dafe70
+        str_from_id_list(buf,  BUF_SIZE, p->node_list_p);
dafe70
+    } else {
dafe70
+        str_from_id_list(buf,  BUF_SIZE, all_nodes_list_p);
dafe70
+    }
dafe70
+    char buf2[BUF_SIZE];
dafe70
     str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
dafe70
     char *cmd_name = "(unknown)";
dafe70
     if ((p) && (p->comm)) {
dafe70
         cmd_name = p->comm;
dafe70
     }
dafe70
     numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2);
dafe70
+    if (pid > 0) {
dafe70
+        COPY_LIST(target_node_list_p, p->node_list_p);
dafe70
+    }
dafe70
     return target_node_list_p;
dafe70
 }
dafe70
 
dafe70
 
dafe70
-
dafe70
-void show_processes(process_data_p *ptr, int nprocs) {
dafe70
-    time_t ts = time(NULL);
dafe70
-    fprintf(log_fs, "%s", ctime(&ts));
dafe70
-    fprintf(log_fs, "Candidates: %d\n", nprocs);
dafe70
-    for (int ix = 0;  (ix < nprocs);  ix++) {
dafe70
-        process_data_p p = ptr[ix];
dafe70
-        char buf[BUF_SIZE];
dafe70
-        snprintf(buf, BUF_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
dafe70
-        FILE *fs = fopen(buf, "r");
dafe70
-        buf[0] = '\0';
dafe70
-        if (fs) {
dafe70
-            if (fgets(buf, BUF_SIZE, fs)) {
dafe70
-                ELIM_NEW_LINE(buf);
dafe70
-            }
dafe70
-            fclose(fs);
dafe70
-        }
dafe70
-        fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", 
dafe70
-            p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
dafe70
-        }
dafe70
-    fprintf(log_fs, "\n");
dafe70
-    fflush(log_fs);
dafe70
-}
dafe70
-
dafe70
-
dafe70
-
dafe70
 int manage_loads() {
dafe70
+    uint64_t time_stamp = get_time_stamp();
dafe70
     // Use temporary index to access and sort hash table entries
dafe70
-    static process_data_p *pindex;
dafe70
     static int pindex_size;
dafe70
+    static process_data_p *pindex;
dafe70
     if (pindex_size < process_hash_table_size) {
dafe70
         pindex_size = process_hash_table_size;
dafe70
         pindex = realloc(pindex, pindex_size * sizeof(process_data_p));
dafe70
@@ -1923,19 +2049,54 @@ int manage_loads() {
dafe70
         return min_interval / 2;
dafe70
     }
dafe70
     memset(pindex, 0, pindex_size * sizeof(process_data_p));
dafe70
-    // Copy live candidate pointers to the index for sorting, etc
dafe70
+    // Copy live candidate pointers to the index for sorting
dafe70
+    // if they meet the threshold for memory usage and CPU usage.
dafe70
     int nprocs = 0;
dafe70
+    long sum_CPUs_used = 0;
dafe70
     for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
dafe70
         process_data_p p = &process_hash_table[ix];
dafe70
-        if (p->pid) {
dafe70
+        if ((p->pid) && (p->CPUs_used > CPU_THRESHOLD) && (p->MBs_used > MEMORY_THRESHOLD)) {
dafe70
             pindex[nprocs++] = p;
dafe70
+            sum_CPUs_used += p->CPUs_used;
dafe70
+            // Initialize node list, if not already done for this process.
dafe70
+            if (p->node_list_p == NULL) {
dafe70
+                initialize_mem_node_list(p);
dafe70
+            }
dafe70
         }
dafe70
     }
dafe70
-    // Sort index by amount of CPU used * amount of memory used.  Not expecting
dafe70
-    // a long list here.  Use a simple sort -- however, sort into bins,
dafe70
-    // treating values within 10% as aquivalent.  Within bins, order by
dafe70
-    // bind_time_stamp so oldest bound will be higher priority to evaluate.
dafe70
+    // Order candidate considerations using timestamps and magnitude: amount of
dafe70
+    // CPU used * amount of memory used.  Not expecting a long list here.  Use
dafe70
+    // a simplistic sort -- however move all not yet bound to front of list and
dafe70
+    // order by decreasing magnitude.  Previously bound processes follow in
dafe70
+    // bins of increasing magnitude treating values within 20% as equivalent.
dafe70
+    // Within bins, order by bind_time_stamp so oldest bound will be higher
dafe70
+    // priority to evaluate.  Start by moving all unbound to beginning.
dafe70
+    int num_unbound = 0;
dafe70
     for (int ij = 0;  (ij < nprocs);  ij++) {
dafe70
+        if (pindex[ij]->bind_time_stamp == 0) {
dafe70
+            process_data_p tmp = pindex[num_unbound];
dafe70
+            pindex[num_unbound++] = pindex[ij];
dafe70
+            pindex[ij] = tmp;
dafe70
+        }
dafe70
+    }
dafe70
+    // Sort all unbound so biggest magnitude comes first
dafe70
+    for (int ij = 0;  (ij < num_unbound);  ij++) {
dafe70
+        int best = ij;
dafe70
+        for (int ik = ij + 1;  (ik < num_unbound);  ik++) {
dafe70
+            uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_used);
dafe70
+            uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used);
dafe70
+            if (ik_mag <= best_mag) continue;
dafe70
+            best = ik;
dafe70
+        }
dafe70
+        if (best != ij) {
dafe70
+            process_data_p tmp = pindex[ij];
dafe70
+            pindex[ij] = pindex[best];
dafe70
+            pindex[best] = tmp;
dafe70
+        }
dafe70
+    }
dafe70
+    // Sort the remaining candidates into bins of increasing magnitude, and by
dafe70
+    // timestamp within bins.
dafe70
+    for (int ij = num_unbound;  (ij < nprocs);  ij++) {
dafe70
         int best = ij;
dafe70
         for (int ik = ij + 1;  (ik < nprocs);  ik++) {
dafe70
             uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_used);
dafe70
@@ -1946,11 +2107,11 @@ int manage_loads() {
dafe70
                 diff_mag = -(diff_mag);
dafe70
                 min_mag = best_mag;
dafe70
             }
dafe70
-            if ((diff_mag > 0) && (min_mag / diff_mag < 10)) {
dafe70
-                // difference > 10 percent.  Use strict ordering
dafe70
+            if ((diff_mag > 0) && (min_mag / diff_mag < 5)) {
dafe70
+                // difference > 20 percent.  Use magnitude ordering
dafe70
                 if (ik_mag <= best_mag) continue;
dafe70
             } else {
dafe70
-                // difference within 10 percent.  Sort these by bind_time_stamp.
dafe70
+                // difference within 20 percent.  Sort these by bind_time_stamp.
dafe70
                 if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue;
dafe70
             }
dafe70
             best = ik;
dafe70
@@ -1961,23 +2122,57 @@ int manage_loads() {
dafe70
             pindex[best] = tmp;
dafe70
         }
dafe70
     }
dafe70
+    // Show the candidate processes in the log file
dafe70
     if ((log_level >= LOG_INFO) && (nprocs > 0)) {
dafe70
-        show_processes(pindex, nprocs);
dafe70
+        numad_log(LOG_INFO, "Candidates: %d\n", nprocs);
dafe70
+        for (int ix = 0;  (ix < nprocs);  ix++) {
dafe70
+            process_data_p p = pindex[ix];
dafe70
+            char buf[BUF_SIZE];
dafe70
+            str_from_id_list(buf, BUF_SIZE, p->node_list_p);
dafe70
+            fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_size %6ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", 
dafe70
+                p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_size, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
dafe70
+            }
dafe70
+        fflush(log_fs);
dafe70
     }
dafe70
-    // Estimate desired size and make resource requests for each significant process
dafe70
+    // Estimate desired size (+ margin capacity) and
dafe70
+    // make resource requests for each candidate process
dafe70
     for (int ix = 0;  (ix < nprocs);  ix++) {
dafe70
         process_data_p p = pindex[ix];
dafe70
-        if (p->CPUs_used * p->MBs_used < CPU_THRESHOLD * MEMORY_THRESHOLD) {
dafe70
-            break; // No more significant processes worth worrying about...
dafe70
+        // If this process has interleaved memory, recheck it only every 30 minutes...
dafe70
+#define MIN_DELAY_FOR_INTERLEAVE (1800 * ONE_HUNDRED)
dafe70
+        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
dafe70
+          && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
dafe70
+            if (log_level >= LOG_DEBUG) {
dafe70
+                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
dafe70
+            }
dafe70
+            continue;
dafe70
+        }
dafe70
+        // Expand resources needed estimate using target_utilization factor.
dafe70
+        // Start with the CPUs actually used (capped by number of threads) for
dafe70
+        // CPUs required, and the RSS MBs actually used for the MBs
dafe70
+        // requirement,
dafe70
+        int mem_target_utilization = target_utilization;
dafe70
+        int cpu_target_utilization = target_utilization;
dafe70
+        // Cap memory utilization at 100 percent (but allow CPUs to oversubscribe)
dafe70
+        if (mem_target_utilization > 100) {
dafe70
+            mem_target_utilization = 100;
dafe70
+        }
dafe70
+        // If the process virtual memory size is bigger than one node, and it
dafe70
+        // is already using more than 80 percent of a node, then request MBs
dafe70
+        // based on the virtual size rather than on the current amount in use.
dafe70
+        int mb_request;
dafe70
+        if ((p->MBs_size > node[0].MBs_total) && ((p->MBs_used * 5 / 4) > node[0].MBs_total)) {
dafe70
+            mb_request = (p->MBs_size * 100) / mem_target_utilization;
dafe70
+        } else {
dafe70
+            mb_request = (p->MBs_used * 100) / mem_target_utilization;
dafe70
         }
dafe70
-        int mb_request  =  (p->MBs_used * 100) / target_utilization;
dafe70
-        int cpu_request = (p->CPUs_used * 100) / target_utilization;
dafe70
-        // Do not give a process more CPUs than it has threads!
dafe70
-        // FIXME: For guest VMs, should limit max to VCPU threads. Will
dafe70
-        // need to do something more intelligent with guest IO threads
dafe70
-        // when eventually considering devices and IRQs.
dafe70
+        int cpu_request = (p->CPUs_used * 100) / cpu_target_utilization;
dafe70
+        // But do not give a process more CPUs than it has threads!
dafe70
         int thread_limit = p->num_threads;
dafe70
-        // If process looks like a KVM guest, try to limit to number of vCPU threads
dafe70
+        // If process looks like a KVM guest, try to limit thread count to the
dafe70
+        // number of vCPU threads.  FIXME: Will need to do something more
dafe70
+        // intelligent than this with guest IO threads when eventually
dafe70
+        // considering devices and IRQs.
dafe70
         if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) {
dafe70
             int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid);
dafe70
             if (thread_limit > kvm_vcpu_threads) {
dafe70
@@ -1988,23 +2183,51 @@ int manage_loads() {
dafe70
         if (cpu_request > thread_limit) {
dafe70
             cpu_request = thread_limit;
dafe70
         }
dafe70
+        // If this process was recently bound, enforce a five-minute minimum
dafe70
+        // delay between repeated attempts to potentially move the process.
dafe70
+#define MIN_DELAY_FOR_REEVALUATION (300 * ONE_HUNDRED)
dafe70
+        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
dafe70
+            // Skip re-evaluation because we just did it recently, but check
dafe70
+            // first for node utilization balance to see if we should
dafe70
+            // re-evaluate this particular process right now.  If this process
dafe70
+            // is running on one of the busiest nodes, go ahead and re-evaluate
dafe70
+            // it if it looks like it should have a better place with
dafe70
+            // sufficient resources.  FIXME: this is currently implemented for
dafe70
+            // only smallish processes that will fit in a single node.
dafe70
+            if ( ( ID_IS_IN_LIST(min_node_CPUs_free_ix, p->node_list_p) || ID_IS_IN_LIST(min_node_MBs_free_ix, p->node_list_p))
dafe70
+                && (cpu_request < node[0].CPUs_total) && (mb_request < node[0].MBs_total) 
dafe70
+                && (abs(min_node_CPUs_free + p->CPUs_used - avg_node_CPUs_free) 
dafe70
+                    + abs((max_node_CPUs_free - p->CPUs_used) - avg_node_CPUs_free) 
dafe70
+                    < (max_node_CPUs_free - min_node_CPUs_free) - CPU_THRESHOLD)  // CPU slop
dafe70
+                && (abs(min_node_MBs_free + p->MBs_used - avg_node_MBs_free)
dafe70
+                    + abs((max_node_MBs_free - p->MBs_used) - avg_node_MBs_free) 
dafe70
+                    < (max_node_MBs_free - min_node_MBs_free)) ) { 
dafe70
+                if (log_level >= LOG_DEBUG) {
dafe70
+                    numad_log(LOG_DEBUG, "Bypassing delay for %d because it looks like it can do better.\n", p->pid);
dafe70
+                }
dafe70
+            } else {
dafe70
+                if (log_level >= LOG_DEBUG) {
dafe70
+                    numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because done too recently.\n", p->pid);
dafe70
+                }
dafe70
+                continue;
dafe70
+            }
dafe70
+        }
dafe70
+        // OK, now pick NUMA nodes for this process and bind it!
dafe70
         pthread_mutex_lock(&node_info_mutex);
dafe70
-        id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request);
dafe70
-        // FIXME: ?? copy node_list_p to shorten mutex region?
dafe70
-        if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p->pid, p->cpuset_name, node_list_p, NULL))) {
dafe70
-            // Shorten interval if actively moving processes
dafe70
+        int assume_enough_cpus = (sum_CPUs_used <= sum_CPUs_total);
dafe70
+        id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request, assume_enough_cpus);
dafe70
+        if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p))) {
dafe70
             pthread_mutex_unlock(&node_info_mutex);
dafe70
-            p->bind_time_stamp = get_time_stamp();
dafe70
+            // Return minimum interval when actively moving processes
dafe70
             return min_interval;
dafe70
         }
dafe70
         pthread_mutex_unlock(&node_info_mutex);
dafe70
     }
dafe70
-    // Return maximum interval if no process movement
dafe70
+    // Return maximum interval when no process movement
dafe70
     return max_interval;
dafe70
 }
dafe70
 
dafe70
 
dafe70
-
dafe70
 void *set_dynamic_options(void *arg) {
dafe70
     // int arg_value = *(int *)arg;
dafe70
     char buf[BUF_SIZE];
dafe70
@@ -2013,6 +2236,18 @@ void *set_dynamic_options(void *arg) {
dafe70
         msg_t msg;
dafe70
         recv_msg(&msg);
dafe70
         switch (msg.body.cmd) {
dafe70
+        case 'C':
dafe70
+            use_inactive_file_cache = (msg.body.arg1 != 0);
dafe70
+            if (use_inactive_file_cache) {
dafe70
+                numad_log(LOG_NOTICE, "Counting inactive file cache as available\n");
dafe70
+            } else {
dafe70
+                numad_log(LOG_NOTICE, "Counting inactive file cache as unavailable\n");
dafe70
+            }
dafe70
+            break;
dafe70
+        case 'H':
dafe70
+            thp_scan_sleep_ms = msg.body.arg1;
dafe70
+            set_thp_scan_sleep_ms(thp_scan_sleep_ms);
dafe70
+            break;
dafe70
         case 'i':
dafe70
             min_interval = msg.body.arg1;
dafe70
             max_interval = msg.body.arg2;
dafe70
@@ -2033,6 +2268,10 @@ void *set_dynamic_options(void *arg) {
dafe70
             numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1);
dafe70
             log_level = msg.body.arg1;
dafe70
             break;
dafe70
+        case 'm':
dafe70
+            numad_log(LOG_NOTICE, "Changing target memory locality to %d\n", msg.body.arg1);
dafe70
+            target_memlocality = msg.body.arg1;
dafe70
+            break;
dafe70
         case 'p':
dafe70
             numad_log(LOG_NOTICE, "Adding PID %d to inclusion PID list\n", msg.body.arg1);
dafe70
             pthread_mutex_lock(&pid_list_mutex);
dafe70
@@ -2055,6 +2294,11 @@ void *set_dynamic_options(void *arg) {
dafe70
                 numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n");
dafe70
             }
dafe70
             break;
dafe70
+        case 't':
dafe70
+            numad_log(LOG_NOTICE, "Changing logical CPU thread percent to %d\n", msg.body.arg1);
dafe70
+            htt_percent = msg.body.arg1;
dafe70
+            node_info_time_stamp = 0; // to force rescan of nodes/cpus soon
dafe70
+            break;
dafe70
         case 'u':
dafe70
             numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1);
dafe70
             target_utilization = msg.body.arg1;
dafe70
@@ -2064,7 +2308,7 @@ void *set_dynamic_options(void *arg) {
dafe70
                                     msg.body.arg1, msg.body.arg2);
dafe70
             pthread_mutex_lock(&node_info_mutex);
dafe70
             update_nodes();
dafe70
-            id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2);
dafe70
+            id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2, 0);
dafe70
             str_from_id_list(buf, BUF_SIZE, node_list_p);
dafe70
             pthread_mutex_unlock(&node_info_mutex);
dafe70
             send_msg(msg.body.src_pid, 'w', 0, 0, buf);
dafe70
@@ -2134,30 +2378,50 @@ void parse_two_arg_values(char *p, int *
dafe70
 
dafe70
 int main(int argc, char *argv[]) {
dafe70
     int opt;
dafe70
+    int C_flag = 0;
dafe70
     int d_flag = 0;
dafe70
+    int H_flag = 0;
dafe70
     int i_flag = 0;
dafe70
     int K_flag = 0;
dafe70
     int l_flag = 0;
dafe70
+    int m_flag = 0;
dafe70
     int p_flag = 0;
dafe70
     int r_flag = 0;
dafe70
     int S_flag = 0;
dafe70
+    int t_flag = 0;
dafe70
     int u_flag = 0;
dafe70
     int v_flag = 0;
dafe70
     int w_flag = 0;
dafe70
     int x_flag = 0;
dafe70
+    int tmp_int = 0;
dafe70
     long list_pid = 0;
dafe70
-    while ((opt = getopt(argc, argv, "dD:hi:K:l:p:r:S:u:vVw:x:")) != -1) {
dafe70
+    while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) {
dafe70
         switch (opt) {
dafe70
+        case 'C':
dafe70
+            C_flag = 1;
dafe70
+            use_inactive_file_cache = (atoi(optarg) != 0);
dafe70
+            break;
dafe70
         case 'd':
dafe70
             d_flag = 1;
dafe70
             log_level = LOG_DEBUG;
dafe70
             break;
dafe70
         case 'D':
dafe70
-            cpuset_dir_list[0] = strdup(optarg);
dafe70
+            // obsoleted
dafe70
             break;
dafe70
         case 'h':
dafe70
             print_usage_and_exit(argv[0]);
dafe70
             break;
dafe70
+        case 'H':
dafe70
+            tmp_int = atoi(optarg);
dafe70
+            if ((tmp_int == 0) || ((tmp_int > 9) && (tmp_int < 1000001))) {
dafe70
+                // 0 means do not change the system default value
dafe70
+                H_flag = 1;
dafe70
+                thp_scan_sleep_ms = tmp_int;
dafe70
+            } else {
dafe70
+		fprintf(stderr, "THP scan_sleep_ms must be > 9 and < 1000001\n");
dafe70
+		exit(EXIT_FAILURE);
dafe70
+	    }
dafe70
+            break;
dafe70
         case 'i':
dafe70
             i_flag = 1;
dafe70
             parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0);
dafe70
@@ -2170,6 +2434,13 @@ int main(int argc, char *argv[]) {
dafe70
             l_flag = 1;
dafe70
             log_level = atoi(optarg);
dafe70
             break;
dafe70
+        case 'm':
dafe70
+            tmp_int = atoi(optarg);
dafe70
+            if ((tmp_int >= 50) && (tmp_int <= 100)) {
dafe70
+                m_flag = 1;
dafe70
+                target_memlocality = tmp_int;
dafe70
+            }
dafe70
+            break;
dafe70
         case 'p':
dafe70
             p_flag = 1;
dafe70
             list_pid = atol(optarg);
dafe70
@@ -2183,13 +2454,26 @@ int main(int argc, char *argv[]) {
dafe70
             include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid);
dafe70
             exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid);
dafe70
             break;
dafe70
+        case 'R':
dafe70
+            reserved_cpu_str = strdup(optarg);
dafe70
+            break;
dafe70
         case 'S':
dafe70
             S_flag = 1;
dafe70
             scan_all_processes = (atoi(optarg) != 0);
dafe70
             break;
dafe70
+        case 't':
dafe70
+            tmp_int = atoi(optarg);
dafe70
+            if ((tmp_int >= 0) && (tmp_int <= 100)) {
dafe70
+                t_flag = 1;
dafe70
+                htt_percent = tmp_int;
dafe70
+            }
dafe70
+            break;
dafe70
         case 'u':
dafe70
-            u_flag = 1;
dafe70
-            target_utilization = atoi(optarg);
dafe70
+            tmp_int = atoi(optarg);
dafe70
+            if ((tmp_int >= 10) && (tmp_int <= 130)) {
dafe70
+                u_flag = 1;
dafe70
+                target_utilization = tmp_int;
dafe70
+            }
dafe70
             break;
dafe70
         case 'v':
dafe70
             v_flag = 1;
dafe70
@@ -2234,6 +2518,12 @@ int main(int argc, char *argv[]) {
dafe70
         // Daemon is already running.  So send dynamic options to persistant
dafe70
         // thread to handle requests, get the response (if any), and finish.
dafe70
         msg_t msg; 
dafe70
+        if (C_flag) {
dafe70
+            send_msg(daemon_pid, 'C', use_inactive_file_cache, 0, "");
dafe70
+        }
dafe70
+        if (H_flag) {
dafe70
+            send_msg(daemon_pid, 'H', thp_scan_sleep_ms, 0, "");
dafe70
+        }
dafe70
         if (i_flag) {
dafe70
             send_msg(daemon_pid, 'i', min_interval, max_interval, "");
dafe70
         }
dafe70
@@ -2243,6 +2533,9 @@ int main(int argc, char *argv[]) {
dafe70
         if (d_flag || l_flag || v_flag) {
dafe70
             send_msg(daemon_pid, 'l', log_level, 0, "");
dafe70
         }
dafe70
+        if (m_flag) {
dafe70
+            send_msg(daemon_pid, 'm', target_memlocality, 0, "");
dafe70
+        }
dafe70
         if (p_flag) {
dafe70
             send_msg(daemon_pid, 'p', list_pid, 0, "");
dafe70
         }
dafe70
@@ -2252,6 +2545,9 @@ int main(int argc, char *argv[]) {
dafe70
         if (S_flag) {
dafe70
             send_msg(daemon_pid, 'S', scan_all_processes, 0, "");
dafe70
         }
dafe70
+        if (t_flag) {
dafe70
+            send_msg(daemon_pid, 't', htt_percent, 0, "");
dafe70
+        }
dafe70
         if (u_flag) {
dafe70
             send_msg(daemon_pid, 'u', target_utilization, 0, "");
dafe70
         }
dafe70
@@ -2263,14 +2559,30 @@ int main(int argc, char *argv[]) {
dafe70
         if (x_flag) {
dafe70
             send_msg(daemon_pid, 'x', list_pid, 0, "");
dafe70
         }
dafe70
-    } else if (w_flag) {
dafe70
-        // Get pre-placement NUMA advice without starting daemon
dafe70
+        close_log_file();
dafe70
+        exit(EXIT_SUCCESS);
dafe70
+    }
dafe70
+    // No numad daemon running yet.
dafe70
+    // First, make note of any reserved CPUs....
dafe70
+    if (reserved_cpu_str != NULL) {
dafe70
+        CLEAR_CPU_LIST(reserved_cpu_mask_list_p);
dafe70
+        int n = add_ids_to_list_from_str(reserved_cpu_mask_list_p, reserved_cpu_str);
dafe70
         char buf[BUF_SIZE];
dafe70
+        str_from_id_list(buf, BUF_SIZE, reserved_cpu_mask_list_p);
dafe70
+        numad_log(LOG_NOTICE, "Reserving %d CPUs (%s) for non-numad use\n", n, buf);
dafe70
+        // turn reserved list into a negated mask for later ANDing use...
dafe70
+        negate_cpu_list(reserved_cpu_mask_list_p);
dafe70
+    }
dafe70
+    // If it is a "-w" pre-placement request, handle that without starting
dafe70
+    // the daemon.  Otherwise start the numad daemon.
dafe70
+    if (w_flag) {
dafe70
+        // Get pre-placement NUMA advice without starting daemon
dafe70
         update_nodes();
dafe70
         sleep(2);
dafe70
         update_nodes();
dafe70
         numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs);
dafe70
-        id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs);
dafe70
+        id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs, 0);
dafe70
+        char buf[BUF_SIZE];
dafe70
         str_from_id_list(buf, BUF_SIZE, node_list_p);
dafe70
         fprintf(stdout, "%s\n", buf);
dafe70
         close_log_file();
dafe70
@@ -2278,6 +2590,7 @@ int main(int argc, char *argv[]) {
dafe70
     } else if (max_interval > 0) {
dafe70
         // Start the numad daemon...
dafe70
         check_prereqs(argv[0]);
dafe70
+#if (!NO_DAEMON)
dafe70
         // Daemonize self...
dafe70
         daemon_pid = fork();
dafe70
         if (daemon_pid < 0) { numad_log(LOG_CRIT, "fork() failed\n"); exit(EXIT_FAILURE); }
dafe70
@@ -2298,9 +2611,20 @@ int main(int argc, char *argv[]) {
dafe70
         if (log_fs != stderr) {
dafe70
             fclose(stderr);
dafe70
         }
dafe70
+#endif
dafe70
+        // Set up signal handlers
dafe70
+        struct sigaction sa;
dafe70
+        memset(&sa, 0, sizeof(sa)); 
dafe70
+        sa.sa_handler = sig_handler;
dafe70
+        if (sigaction(SIGHUP, &sa, NULL)
dafe70
+            || sigaction(SIGTERM, &sa, NULL)
dafe70
+            || sigaction(SIGQUIT, &sa, NULL)) {
dafe70
+            numad_log(LOG_CRIT, "sigaction does not work?\n");
dafe70
+            exit(EXIT_FAILURE);
dafe70
+        }
dafe70
         // Allocate initial process hash table
dafe70
         process_hash_table_expand();
dafe70
-        // Spawn thread to handle messages from subsequent invocation requests
dafe70
+        // Spawn a thread to handle messages from subsequent invocation requests
dafe70
         pthread_mutex_init(&pid_list_mutex, NULL);
dafe70
         pthread_mutex_init(&node_info_mutex, NULL);
dafe70
         pthread_attr_t attr;
dafe70
@@ -2310,7 +2634,7 @@ int main(int argc, char *argv[]) {
dafe70
         }
dafe70
         pthread_t tid;
dafe70
         if (pthread_create(&tid, &attr, &set_dynamic_options, &tid) != 0) {
dafe70
-            numad_log(LOG_CRIT, "pthread_create failure\n");
dafe70
+            numad_log(LOG_CRIT, "pthread_create failure: setting thread\n");
dafe70
             exit(EXIT_FAILURE);
dafe70
         }
dafe70
         // Loop here forwever...
dafe70
@@ -2322,16 +2646,26 @@ int main(int argc, char *argv[]) {
dafe70
             if (nodes > 1) {
dafe70
                 update_processes();
dafe70
                 interval = manage_loads();
dafe70
+                if (interval < max_interval) {
dafe70
+                    // Update node info since we moved something
dafe70
+                    nodes = update_nodes();
dafe70
+                }
dafe70
             }
dafe70
             sleep(interval);
dafe70
+            if (got_sigterm | got_sigquit) {
dafe70
+                shut_down_numad();
dafe70
+            }
dafe70
+            if (got_sighup) {
dafe70
+                got_sighup = 0;
dafe70
+                close_log_file();
dafe70
+                open_log_file();
dafe70
+            }
dafe70
         }
dafe70
         if (pthread_attr_destroy(&attr) != 0) {
dafe70
             numad_log(LOG_WARNING, "pthread_attr_destroy failure\n");
dafe70
         }
dafe70
         pthread_mutex_destroy(&pid_list_mutex);
dafe70
         pthread_mutex_destroy(&node_info_mutex);
dafe70
-    } else {
dafe70
-        shut_down_numad();
dafe70
     }
dafe70
     exit(EXIT_SUCCESS);
dafe70
 }
dafe70
diff -rup numad-0.5git/numad.init numad-0.5git-new/numad.init
dafe70
--- numad-0.5git/numad.init	2012-12-03 15:40:40.000000000 +0100
dafe70
+++ numad-0.5git-new/numad.init	2016-08-30 08:45:19.000000000 +0200
dafe70
@@ -37,7 +37,7 @@ start() {
dafe70
     [ -f $config ] || exit 6
dafe70
     echo -n $"Starting $prog: "
dafe70
     . $config
dafe70
-    daemon "$exec -i $INTERVAL"
dafe70
+    daemon $exec -i $INTERVAL
dafe70
     retval=$?
dafe70
     echo
dafe70
     [ $retval -eq 0 ] && touch $lockfile