96ab97
diff -rup numad-0.5git/numad.8 numad-0.5git-new/numad.8
96ab97
--- numad-0.5git/numad.8	2012-12-03 15:40:40.000000000 +0100
96ab97
+++ numad-0.5git-new/numad.8	2016-08-30 08:45:19.000000000 +0200
96ab97
@@ -1,45 +1,56 @@
96ab97
 .TH "numad" "8" "1.0.0" "Bill Gray" "Administration"
96ab97
-.SH "numad"
96ab97
-.LP 
96ab97
+.SH "NAME"
96ab97
+.LP
96ab97
 numad \- A user\-level daemon that provides placement advice and process
96ab97
 management for efficient use of CPUs and memory on systems with NUMA topology.
96ab97
-.SH "SYNTAX"
96ab97
-.LP 
96ab97
+.SH "SYNOPSIS"
96ab97
+.LP
96ab97
 numad [\fI\-dhvV\fP]
96ab97
-.br 
96ab97
-.LP 
96ab97
-numad  [\fI\-D non-standard-cgroup-mount-point\fP]
96ab97
-.br 
96ab97
-.LP 
96ab97
+.br
96ab97
+.LP
96ab97
+numad  [\fI\-C 0|1\fP]
96ab97
+.br
96ab97
+.LP
96ab97
+numad  [\fI\-H THP_hugepage_scan_sleep_ms\fP]
96ab97
+.br
96ab97
+.LP
96ab97
 numad  [\fI\-i [min_interval:]max_interval\fP]
96ab97
-.br 
96ab97
-.LP 
96ab97
+.br
96ab97
+.LP
96ab97
 numad  [\fI\-K 0|1\fP]
96ab97
-.br 
96ab97
-.LP 
96ab97
+.br
96ab97
+.LP
96ab97
 numad  [\fI\-l log_level\fP]
96ab97
-.br 
96ab97
-.LP 
96ab97
+.br
96ab97
+.LP
96ab97
+numad  [\fI\-m target_memory_locality\fP]
96ab97
+.br
96ab97
+.LP
96ab97
 numad  [\fI\-p PID\fP]
96ab97
-.br 
96ab97
-.LP 
96ab97
+.br
96ab97
+.LP
96ab97
 numad  [\fI\-r PID\fP]
96ab97
-.br 
96ab97
-.LP 
96ab97
+.br
96ab97
+.LP
96ab97
+numad  [\fI\-R reserved-CPU-list\fP]
96ab97
+.br
96ab97
+.LP
96ab97
 numad  [\fI\-S 0|1\fP]
96ab97
-.br 
96ab97
-.LP 
96ab97
+.br
96ab97
+.LP
96ab97
+numad  [\fI\-t logical_CPU_percent\fP]
96ab97
+.br
96ab97
+.LP
96ab97
 numad  [\fI\-u target_utilization\fP]
96ab97
-.br 
96ab97
-.LP 
96ab97
+.br
96ab97
+.LP
96ab97
 numad  [\fI\-w NCPUS[:MB]\fP]
96ab97
-.br 
96ab97
-.LP 
96ab97
+.br
96ab97
+.LP
96ab97
 numad  [\fI\-x PID\fP]
96ab97
-.br 
96ab97
-
96ab97
+.br
96ab97
 .SH "DESCRIPTION"
96ab97
-.LP 
96ab97
+.LP
96ab97
 Numad is a system daemon that monitors NUMA topology and resource usage. It
96ab97
 will attempt to locate processes for efficient NUMA locality and affinity,
96ab97
 dynamically adjusting to changing system conditions.  Numad also provides
96ab97
@@ -53,25 +64,42 @@ large in-memory database application, fo
96ab97
 accesses will likely remain unpredictable -- numad will probably not improve
96ab97
 performance.
96ab97
 .SH "OPTIONS"
96ab97
-.LP 
96ab97
-.TP 
96ab97
+.LP
96ab97
+.TP
96ab97
+\fB\-C\fR <\fI0|1\fP>
96ab97
+This option controls whether or not numad treats inactive file cache as
96ab97
+available memory. By default, numad assumes it can count inactive file cache as
96ab97
+"free" memory when considering resources to match with processes.  Specify
96ab97
+\fI\-C 0\fP if numad should instead consider inactive file cache as a consumed
96ab97
+resource.
96ab97
+.TP
96ab97
 \fB\-d\fR
96ab97
 Debug output in log, sets the log level to LOG_DEBUG.  Same effect as \fI\-l 7\fP.
96ab97
 .TP
96ab97
-\fB\-D\fR <\fInon-standard-cgroup-mount-point\fP>
96ab97
-This option can be used to communicate a non-standard cgroup mount point to
96ab97
-numad.  This is not normally necessary.
96ab97
-.TP 
96ab97
 \fB\-h\fR
96ab97
 Display usage help information and then exit.
96ab97
-.TP 
96ab97
+.TP
96ab97
+\fB\-H\fR  <\fITHP_scan_sleep_ms\fP>
96ab97
+Set the desired transparent hugepage scan interval in ms.  The
96ab97
+.na
96ab97
+/sys/kernel/mm/tranparent_hugepage/khugepaged/scan_sleep_millisecs
96ab97
+.ad
96ab97
+tunable is usually set to 10000ms by the operating system.  The default is
96ab97
+changed by numad to be 1000ms since it is helpful for the hugepage daemon to be
96ab97
+more aggressive when memory moves between nodes.  Specifying (\fI\-H 0\fP) will
96ab97
+cause numad to retain the system default value.  You can also make the hugepage
96ab97
+daemon more or less aggressive by specifying an alternate value with this
96ab97
+option.  For example, setting this value to 100ms (\fI\-H 100\fP) might improve
96ab97
+the performance of workloads which use many transparent hugepages.
96ab97
+.TP
96ab97
 \fB\-i\fR <\fI[min_interval:]max_interval\fP>
96ab97
 Sets the time interval that numad waits between system scans, in seconds to
96ab97
 <\fImax_interval\fP>. Default <\fImax_interval\fP> is 15 seconds, default
96ab97
 <\fImin_interval\fP> is 5 seconds.  Setting a <\fImax_interval\fP> of zero will
96ab97
 cause the daemon to exit.  (This is the normal mechanism to terminate the
96ab97
 daemon.)  A bigger <\fImax_interval\fP> will decrease numad overhead but also
96ab97
-decrease responsiveness to changing loads.
96ab97
+decrease responsiveness to changing loads.  The default numad max_interval can
96ab97
+be changed in the numad.conf file.
96ab97
 .TP
96ab97
 \fB\-K\fR <\fI0|1\fP>
96ab97
 This option controls whether numad keeps interleaved memory spread across NUMA
96ab97
@@ -82,10 +110,24 @@ a large, single-instance application tha
96ab97
 the workload will have continuous unpredictable memory access patterns (e.g. a
96ab97
 large in-memory database), you might get better results by specifying \fI\-K
96ab97
 1\fP to instruct numad to keep interleaved memory distributed.
96ab97
-.TP 
96ab97
+.TP
96ab97
 \fB\-l\fR <\fIlog_level\fP>
96ab97
 Sets the log level to <\fIlog_level\fP>.  Reasonable choices are 5, 6, or 7.
96ab97
-The default value is 5.
96ab97
+The default value is 5.  Note that CPU values are scaled by a factor of 100
96ab97
+internally and in the numad log files.  Unfortunately, you don't actually have
96ab97
+that many CPUs.
96ab97
+.TP
96ab97
+\fB\-m\fR  <\fItarget_memory_locality\fP>
96ab97
+Set the desired memory locality threshold to stop moving process memory.  Numad
96ab97
+might stop retrying to coalesce process memory when more than this percentage
96ab97
+of the process's memory is already localized in the target node(s).  The
96ab97
+default is 90%. Numad will frequently localize more than the localization
96ab97
+threshold percent, but it will not necessarily do so.  Decrease the threshold
96ab97
+to allow numad to leave more process memory distributed on various nodes.
96ab97
+Increase the threshold to instruct numad to try to localize more memory.
96ab97
+Acceptable values are between 50 and 100 percent.  Note that setting the target
96ab97
+memory locality to 100% might cause numad to continually retry to move memory
96ab97
+that the kernel will never succesfully move.
96ab97
 .TP
96ab97
 \fB\-p\fR <\fIPID\fP>
96ab97
 Add PID to explicit inclusion list of processes to consider for managing, if
96ab97
@@ -102,6 +144,12 @@ processes.  After daemon start, only one
96ab97
 process lists per subsequent numad invocation.  Use with \-S and \-p and \-x to
96ab97
 precisely control the scope of processes numad can manage.
96ab97
 .TP
96ab97
+\fB\-R\fR <\fICPU_LIST\fP>
96ab97
+Specify a list of CPUs that numad should assume are reserved for non-numad use.
96ab97
+No processes will be bound to the specified CPUs by numad.  This option is
96ab97
+effective only when starting numad.  You cannot change reserved CPUs
96ab97
+dynamically while numad is already running.
96ab97
+.TP
96ab97
 \fB\-S\fR <\fI0|1\fP>
96ab97
 This option controls whether numad scans all system processes or only the
96ab97
 processes on the explicit inclusion PID list.  The default is to scan all
96ab97
@@ -113,18 +161,30 @@ exclusion list).  Starting numad as
96ab97
 .br
96ab97
 will limit scanning, and thus also automatic NUMA management, to only those
96ab97
 three explicitly specified processes.
96ab97
-.TP 
96ab97
+.TP
96ab97
+\fB\-t\fR  <\fIlogical_CPU_percent\fP>
96ab97
+Specify the resource value of logical CPUs.  Hardware threads typically share
96ab97
+most core resources, and so logical CPUs add only a fraction of CPU power for
96ab97
+many workloads.  By default numad considers logical CPUs to be only 20 percent
96ab97
+of a dedicated hardware core.
96ab97
+.TP
96ab97
 \fB\-u\fR  <\fItarget_utilization\fP>
96ab97
 Set the desired maximum consumption percentage of a node. Default is 85%.
96ab97
 Decrease the target value to maintain more available resource margin on each
96ab97
 node.  Increase the target value to more exhaustively consume node resources.
96ab97
-.TP 
96ab97
+If you have sized your workloads to precisely fit inside a NUMA node,
96ab97
+specifying (\fI\-u 100\fP) might improve system performance by telling numad to
96ab97
+go ahead and consume all the resources in each node.  It is possible to specify
96ab97
+values up to 130 percent to oversubscribe CPUs in the nodes, but memory
96ab97
+utilization is always capped at 100%.  Use oversubscription values very
96ab97
+carefully.
96ab97
+.TP
96ab97
 \fB\-v\fR
96ab97
 Verbose output in log, sets the log level to LOG_INFO.  Same effect as \fI\-l 6\fP.
96ab97
-.TP 
96ab97
+.TP
96ab97
 \fB\-V\fR
96ab97
 Display version information and exit.
96ab97
-.TP 
96ab97
+.TP
96ab97
 \fB\-w\fR <\fINCPUS[:MB]\fP>
96ab97
 Queries numad for the best NUMA nodes to bind an entity that needs
96ab97
 <\fINCPUS\fP>.  The amount of memory (in MBs) is optional, but should normally
96ab97
@@ -145,32 +205,37 @@ Add PID to explicit exclusion list of pr
96ab97
 Multiple \fI\-x PID\fP options can be specified at daemon start, but after
96ab97
 daemon start, only one PID can be added to the exclusion list per subsequent
96ab97
 numad invocation.  Use with \-S to precisely control the scope of processes
96ab97
-numad can manage.  
96ab97
+numad can manage.
96ab97
 .SH "FILES"
96ab97
-.LP 
96ab97
-\fI/usr/bin/numad\fP 
96ab97
-.br 
96ab97
-\fI/var/log/numad.log\fP 
96ab97
-.br 
96ab97
-\fI/var/run/numad.pid\fP 
96ab97
+.LP
96ab97
+\fI/usr/bin/numad\fP
96ab97
+.br
96ab97
+\fI/etc/numad.conf\fP
96ab97
+.br
96ab97
+\fI/var/log/numad.log\fP
96ab97
+.br
96ab97
+\fI/var/run/numad.pid\fP
96ab97
 .SH "ENVIRONMENT VARIABLES"
96ab97
-.LP 
96ab97
-.TP 
96ab97
+.LP
96ab97
+.TP
96ab97
 None.
96ab97
 .SH "EXAMPLES"
96ab97
-.LP 
96ab97
-Numad is normally run as a system daemon and should be managed by the 
96ab97
+.LP
96ab97
+Numad can be run as a system daemon and can be managed by the
96ab97
 standard init mechanisms of the host.
96ab97
-.LP  
96ab97
+.LP
96ab97
 If interactive (manual) control is desired, you can start the daemon manually by typing:
96ab97
-.LP 
96ab97
+.LP
96ab97
 /usr/bin/numad
96ab97
 .LP
96ab97
-Subsequent numad invocations while the daemon is running can be used to dynamically change run-time options.
96ab97
+Subsequent numad invocations while the daemon is running can be used to dynamically change most run-time options.
96ab97
+.LP
96ab97
+You can terminate numad from running by typing:
96ab97
+.LP
96ab97
+/usr/bin/numad -i0
96ab97
 .SH "AUTHORS"
96ab97
-.LP 
96ab97
+.LP
96ab97
 Bill Gray <bgray@redhat.com>
96ab97
 .SH "SEE ALSO"
96ab97
-.LP 
96ab97
+.LP
96ab97
 numactl(8)
96ab97
-
96ab97
diff -rup numad-0.5git/numad.c numad-0.5git-new/numad.c
96ab97
--- numad-0.5git/numad.c	2012-12-03 15:40:40.000000000 +0100
96ab97
+++ numad-0.5git-new/numad.c	2016-08-30 08:45:19.000000000 +0200
96ab97
@@ -19,7 +19,7 @@ Inc., 59 Temple Place, Suite 330, Boston
96ab97
 */ 
96ab97
 
96ab97
 
96ab97
-// Compile with: gcc -O -std=gnu99 -Wall -pthread -o numad numad.c -lrt
96ab97
+// Compile with: gcc -std=gnu99 -g -Wall -pthread -o numad numad.c -lrt -lm
96ab97
 
96ab97
 
96ab97
 #define _GNU_SOURCE
96ab97
@@ -40,6 +40,10 @@ Inc., 59 Temple Place, Suite 330, Boston
96ab97
 #include <stdio.h>
96ab97
 #include <stdlib.h>
96ab97
 #include <string.h>
96ab97
+#include <time.h>
96ab97
+#include <unistd.h>
96ab97
+#include <values.h>
96ab97
+
96ab97
 #include <sys/ipc.h>
96ab97
 #include <sys/mman.h>
96ab97
 #include <sys/msg.h>
96ab97
@@ -49,26 +53,16 @@ Inc., 59 Temple Place, Suite 330, Boston
96ab97
 #include <sys/syslog.h>
96ab97
 #include <sys/time.h>
96ab97
 #include <sys/types.h>
96ab97
-#include <time.h>
96ab97
-#include <unistd.h>
96ab97
-#include <values.h>
96ab97
+
96ab97
+#include <asm/unistd.h>
96ab97
 
96ab97
 
96ab97
-#define VERSION_STRING "20121130"
96ab97
+#define VERSION_STRING "20150602"
96ab97
 
96ab97
 
96ab97
 #define VAR_RUN_FILE "/var/run/numad.pid"
96ab97
 #define VAR_LOG_FILE "/var/log/numad.log"
96ab97
 
96ab97
-char *cpuset_dir = NULL;
96ab97
-char *cpuset_dir_list[] =  {
96ab97
-    NULL,
96ab97
-    "/sys/fs/cgroup/cpuset",
96ab97
-    "/cgroup/cpuset",
96ab97
-    NULL
96ab97
-};
96ab97
-
96ab97
-
96ab97
 #define KILOBYTE (1024)
96ab97
 #define MEGABYTE (1024 * 1024)
96ab97
 
96ab97
@@ -86,14 +80,11 @@ char *cpuset_dir_list[] =  {
96ab97
 #define MAX_INTERVAL 15
96ab97
 #define CPU_THRESHOLD     50
96ab97
 #define MEMORY_THRESHOLD 300
96ab97
-#define TARGET_UTILIZATION_PERCENT 85
96ab97
-#define IMPROVEMENT_THRESHOLD_PERCENT 5
96ab97
-
96ab97
+#define DEFAULT_HTT_PERCENT 20
96ab97
+#define DEFAULT_THP_SCAN_SLEEP_MS 1000
96ab97
+#define DEFAULT_UTILIZATION_PERCENT 85
96ab97
+#define DEFAULT_MEMLOCALITY_PERCENT 90
96ab97
 
96ab97
-#define ELIM_NEW_LINE(s) \
96ab97
-    if (s[strlen(s) - 1] == '\n') { \
96ab97
-        s[strlen(s) - 1] = '\0'; \
96ab97
-    }
96ab97
 
96ab97
 #define CONVERT_DIGITS_TO_NUM(p, n) \
96ab97
     n = *p++ - '0'; \
96ab97
@@ -105,19 +96,36 @@ char *cpuset_dir_list[] =  {
96ab97
 
96ab97
 int num_cpus = 0;
96ab97
 int num_nodes = 0;
96ab97
-int page_size_in_bytes = 0;
96ab97
-int huge_page_size_in_bytes = 0;
96ab97
+int threads_per_core = 0;
96ab97
+uint64_t page_size_in_bytes = 0;
96ab97
+uint64_t huge_page_size_in_bytes = 0;
96ab97
 
96ab97
 int min_interval = MIN_INTERVAL;
96ab97
 int max_interval = MAX_INTERVAL;
96ab97
-int target_utilization  = TARGET_UTILIZATION_PERCENT;
96ab97
+int htt_percent = DEFAULT_HTT_PERCENT;
96ab97
+int thp_scan_sleep_ms = DEFAULT_THP_SCAN_SLEEP_MS;
96ab97
+int target_utilization  = DEFAULT_UTILIZATION_PERCENT;
96ab97
+int target_memlocality  = DEFAULT_MEMLOCALITY_PERCENT;
96ab97
 int scan_all_processes = 1;
96ab97
 int keep_interleaved_memory = 0;
96ab97
+int use_inactive_file_cache = 1;
96ab97
 
96ab97
 pthread_mutex_t pid_list_mutex;
96ab97
 pthread_mutex_t node_info_mutex;
96ab97
+long sum_CPUs_total = 0;
96ab97
 int requested_mbs = 0;
96ab97
 int requested_cpus = 0;
96ab97
+int got_sighup = 0;
96ab97
+int got_sigterm = 0;
96ab97
+int got_sigquit = 0;
96ab97
+
96ab97
+void sig_handler(int signum) { 
96ab97
+    switch (signum) {
96ab97
+        case SIGHUP:  got_sighup  = 1; break;
96ab97
+        case SIGTERM: got_sigterm = 1; break;
96ab97
+        case SIGQUIT: got_sigquit = 1; break;
96ab97
+    }
96ab97
+}
96ab97
 
96ab97
 
96ab97
 
96ab97
@@ -139,7 +147,7 @@ void numad_log(int level, const char *fm
96ab97
     }
96ab97
     char buf[BUF_SIZE];
96ab97
     time_t ts = time(NULL);
96ab97
-    sprintf(buf, ctime(&ts);;
96ab97
+    strncpy(buf, ctime(&ts), sizeof(buf));
96ab97
     char *p = &buf[strlen(buf) - 1];
96ab97
     *p++ = ':';
96ab97
     *p++ = ' ';
96ab97
@@ -155,13 +163,16 @@ void open_log_file() {
96ab97
     log_fs = fopen(VAR_LOG_FILE, "a");
96ab97
     if (log_fs == NULL) {
96ab97
         log_fs = stderr;
96ab97
-        numad_log(LOG_ERR, "Cannot open numad log file -- using stderr\n");
96ab97
+        numad_log(LOG_ERR, "Cannot open numad log file (errno: %d) -- using stderr\n", errno);
96ab97
     }
96ab97
 }
96ab97
 
96ab97
+
96ab97
 void close_log_file() {
96ab97
     if (log_fs != NULL) {
96ab97
-        fclose(log_fs);
96ab97
+        if (log_fs != stderr) {
96ab97
+            fclose(log_fs);
96ab97
+        }
96ab97
         log_fs = NULL;
96ab97
     }
96ab97
 }
96ab97
@@ -235,23 +246,32 @@ void send_msg(long dst_pid, long cmd, lo
96ab97
 
96ab97
 
96ab97
 typedef struct id_list {
96ab97
-    // Use CPU_SET(3) <sched.h> cpuset bitmasks,
96ab97
+    // Use CPU_SET(3) <sched.h> bitmasks,
96ab97
     // but bundle size and pointer together
96ab97
     // and genericize for both CPU and Node IDs
96ab97
     cpu_set_t *set_p; 
96ab97
     size_t bytes;
96ab97
 } id_list_t, *id_list_p;
96ab97
 
96ab97
-#define INIT_ID_LIST(list_p) \
96ab97
+#define ID_LIST_SET_P(list_p) (list_p->set_p)
96ab97
+#define ID_LIST_BYTES(list_p) (list_p->bytes)
96ab97
+
96ab97
+#define INIT_ID_LIST(list_p, num_elements) \
96ab97
     list_p = malloc(sizeof(id_list_t)); \
96ab97
     if (list_p == NULL) { numad_log(LOG_CRIT, "INIT_ID_LIST malloc failed\n"); exit(EXIT_FAILURE); } \
96ab97
-    list_p->set_p = CPU_ALLOC(num_cpus); \
96ab97
+    list_p->set_p = CPU_ALLOC(num_elements); \
96ab97
     if (list_p->set_p == NULL) { numad_log(LOG_CRIT, "CPU_ALLOC failed\n"); exit(EXIT_FAILURE); } \
96ab97
-    list_p->bytes = CPU_ALLOC_SIZE(num_cpus);
96ab97
+    list_p->bytes = CPU_ALLOC_SIZE(num_elements);
96ab97
 
96ab97
-#define CLEAR_LIST(list_p) \
96ab97
+#define CLEAR_CPU_LIST(list_p) \
96ab97
     if (list_p == NULL) { \
96ab97
-        INIT_ID_LIST(list_p); \
96ab97
+        INIT_ID_LIST(list_p, num_cpus); \
96ab97
+    } \
96ab97
+    CPU_ZERO_S(list_p->bytes, list_p->set_p)
96ab97
+
96ab97
+#define CLEAR_NODE_LIST(list_p) \
96ab97
+    if (list_p == NULL) { \
96ab97
+        INIT_ID_LIST(list_p, num_nodes); \
96ab97
     } \
96ab97
     CPU_ZERO_S(list_p->bytes, list_p->set_p)
96ab97
 
96ab97
@@ -262,6 +282,9 @@ typedef struct id_list {
96ab97
         list_p = NULL; \
96ab97
     }
96ab97
 
96ab97
+#define COPY_LIST(orig_list_p, copy_list_p) \
96ab97
+    memcpy(copy_list_p->set_p, orig_list_p->set_p, orig_list_p->bytes)
96ab97
+
96ab97
 #define NUM_IDS_IN_LIST(list_p)     CPU_COUNT_S(list_p->bytes, list_p->set_p)
96ab97
 #define ADD_ID_TO_LIST(k, list_p)  CPU_SET_S(k, list_p->bytes, list_p->set_p)
96ab97
 #define CLR_ID_IN_LIST(k, list_p)  CPU_CLR_S(k, list_p->bytes, list_p->set_p)
96ab97
@@ -272,6 +295,25 @@ typedef struct id_list {
96ab97
 #define  OR_LISTS( or_list_p, list_1_p, list_2_p)  CPU_OR_S( or_list_p->bytes,  or_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
96ab97
 #define XOR_LISTS(xor_list_p, list_1_p, list_2_p) CPU_XOR_S(xor_list_p->bytes, xor_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
96ab97
 
96ab97
+int negate_cpu_list(id_list_p list_p) {
96ab97
+    if (list_p == NULL) {
96ab97
+        numad_log(LOG_CRIT, "Cannot negate a NULL list\n");
96ab97
+        exit(EXIT_FAILURE);
96ab97
+    }
96ab97
+    if (num_cpus < 1) {
96ab97
+        numad_log(LOG_CRIT, "No CPUs to negate in list!\n");
96ab97
+        exit(EXIT_FAILURE);
96ab97
+    }
96ab97
+    for (int ix = 0;  (ix < num_cpus);  ix++) {
96ab97
+        if (ID_IS_IN_LIST(ix, list_p)) {
96ab97
+            CLR_ID_IN_LIST(ix, list_p);
96ab97
+        } else {
96ab97
+            ADD_ID_TO_LIST(ix, list_p);
96ab97
+        }
96ab97
+    }
96ab97
+    return NUM_IDS_IN_LIST(list_p);
96ab97
+}
96ab97
+
96ab97
 int add_ids_to_list_from_str(id_list_p list_p, char *s) {
96ab97
     if (list_p == NULL) {
96ab97
         numad_log(LOG_CRIT, "Cannot add to NULL list\n");
96ab97
@@ -352,9 +394,21 @@ typedef struct node_data {
96ab97
     uint8_t *distance;
96ab97
     id_list_p cpu_list_p; 
96ab97
 } node_data_t, *node_data_p;
96ab97
-
96ab97
 node_data_p node = NULL;
96ab97
 
96ab97
+int min_node_CPUs_free_ix = -1;
96ab97
+int min_node_MBs_free_ix = -1;
96ab97
+long min_node_CPUs_free = MAXINT;
96ab97
+long min_node_MBs_free = MAXINT;
96ab97
+long max_node_CPUs_free = 0;
96ab97
+long max_node_MBs_free = 0;
96ab97
+long avg_node_CPUs_free = 0;
96ab97
+long avg_node_MBs_free = 0;
96ab97
+double stddev_node_CPUs_free = 0.0;
96ab97
+double stddev_node_MBs_free = 0.0;
96ab97
+
96ab97
+
96ab97
+
96ab97
 // RING_BUF_SIZE must be a power of two
96ab97
 #define RING_BUF_SIZE 8
96ab97
 
96ab97
@@ -366,14 +420,15 @@ typedef struct process_data {
96ab97
     uint64_t data_time_stamp; // hundredths of seconds
96ab97
     uint64_t bind_time_stamp;
96ab97
     uint64_t num_threads;
96ab97
+    uint64_t MBs_size;
96ab97
     uint64_t MBs_used;
96ab97
     uint64_t cpu_util;
96ab97
     uint64_t CPUs_used;  // scaled * ONE_HUNDRED
96ab97
     uint64_t CPUs_used_ring_buf[RING_BUF_SIZE];
96ab97
     int ring_buf_ix;
96ab97
-    int dup_bind_count;
96ab97
     char *comm;
96ab97
-    char *cpuset_name;
96ab97
+    id_list_p node_list_p;
96ab97
+    uint64_t *process_MBs;
96ab97
 } process_data_t, *process_data_p;
96ab97
 
96ab97
 
96ab97
@@ -433,7 +488,8 @@ int process_hash_insert(int pid) {
96ab97
 }
96ab97
 
96ab97
 int process_hash_update(process_data_p newp) {
96ab97
-    // This updates hash table stats for processes we are monitoring
96ab97
+    // This updates hash table stats for processes we are monitoring. Only the
96ab97
+    // scalar resource consumption stats need to be updated here.
96ab97
     int new_hash_table_entry = 1;
96ab97
     int ix = process_hash_insert(newp->pid);
96ab97
     if (ix >= 0) {
96ab97
@@ -460,6 +516,7 @@ int process_hash_update(process_data_p n
96ab97
             }
96ab97
             p->comm = strdup(newp->comm);
96ab97
         }
96ab97
+        p->MBs_size = newp->MBs_size;
96ab97
         p->MBs_used = newp->MBs_used;
96ab97
         p->cpu_util = newp->cpu_util;
96ab97
         p->num_threads = newp->num_threads;
96ab97
@@ -468,6 +525,11 @@ int process_hash_update(process_data_p n
96ab97
     return new_hash_table_entry;
96ab97
 }
96ab97
 
96ab97
+void process_hash_clear_all_bind_time_stamps() {
96ab97
+    for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
96ab97
+        process_hash_table[ix].bind_time_stamp = 0;
96ab97
+    }
96ab97
+}
96ab97
 
96ab97
 int process_hash_rehash(int old_ix) {
96ab97
     // Given the index of a table entry that would otherwise be orphaned by
96ab97
@@ -489,7 +551,8 @@ int process_hash_remove(int pid) {
96ab97
         // remove the target
96ab97
         process_data_p dp = &process_hash_table[ix];
96ab97
         if (dp->comm) { free(dp->comm); }
96ab97
-        if (dp->cpuset_name) { free(dp->cpuset_name); }
96ab97
+        if (dp->process_MBs) { free(dp->process_MBs); }
96ab97
+        FREE_LIST(dp->node_list_p);
96ab97
         memset(dp, 0, sizeof(process_data_t));
96ab97
         // bubble up the collision chain and rehash if neeeded
96ab97
         for (;;) {
96ab97
@@ -543,15 +606,15 @@ void process_hash_table_dump() {
96ab97
         process_data_p p = &process_hash_table[ix];
96ab97
         if (p->pid) {
96ab97
             numad_log(LOG_DEBUG,
96ab97
-                "ix: %d  PID: %d %s  Thds: %d  CPU %ld  MBs: %ld Data TS: %ld  Bind TS: %ld\n",
96ab97
+                "ix: %d  PID: %d %s  Thds: %d  CPU %ld  MBs: %ld/%ld Data TS: %ld  Bind TS: %ld\n",
96ab97
                 ix, p->pid, ((p->comm != NULL) ? p->comm : "(Null)"), p->num_threads,
96ab97
-                p->CPUs_used, p->MBs_used, p->data_time_stamp, p->bind_time_stamp);
96ab97
+                p->CPUs_used, p->MBs_used, p->MBs_size, p->data_time_stamp, p->bind_time_stamp);
96ab97
+            // FIXME: make this dump every field, but this is not even currently used
96ab97
         }
96ab97
     }
96ab97
 }
96ab97
 
96ab97
 void process_hash_table_cleanup(uint64_t update_time) {
96ab97
-    int cpusets_removed = 0;
96ab97
     int num_hash_entries_used = 0;
96ab97
     for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
96ab97
         process_data_p p = &process_hash_table[ix];
96ab97
@@ -562,34 +625,14 @@ void process_hash_table_cleanup(uint64_t
96ab97
                 p->data_time_stamp = 0;
96ab97
                 p->CPUs_used = 0;
96ab97
                 // Check for dead pids and remove them...
96ab97
-                char fname[FNAME_SIZE];
96ab97
-                snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
96ab97
-                if (access(fname, F_OK) < 0) {
96ab97
-                    // Seems dead.  Forget this pid -- after first checking 
96ab97
-                    // and removing obsolete numad.PID cpuset directories.  
96ab97
-                    snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid);
96ab97
-                    if (access(fname, F_OK) == 0) {
96ab97
-                        numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname);
96ab97
-                        int rc = rmdir(fname);
96ab97
-                        if (rc >= 0) {
96ab97
-                            cpusets_removed += 1;
96ab97
-                        } else {
96ab97
-                            numad_log(LOG_ERR, "bad cpuset rmdir\n");
96ab97
-                            // exit(EXIT_FAILURE);
96ab97
-                        }
96ab97
-                    }
96ab97
+                if ((kill(p->pid, 0) == -1) && (errno == ESRCH)) {
96ab97
+                    // Seems dead.  Forget this pid
96ab97
                     process_hash_remove(p->pid);
96ab97
                     num_hash_entries_used -= 1;
96ab97
                 }
96ab97
             }
96ab97
         }
96ab97
     }
96ab97
-    if (cpusets_removed > 0) {
96ab97
-        // Expire all the duplicate bind counts so things will be re-evaluated sooner.
96ab97
-        for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
96ab97
-            process_hash_table[ix].dup_bind_count = 0;
96ab97
-        }
96ab97
-    }
96ab97
     // Keep hash table approximately half empty
96ab97
     if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) {
96ab97
         process_hash_table_expand();
96ab97
@@ -610,9 +653,7 @@ pid_list_p insert_pid_into_pid_list(pid_
96ab97
     if (process_hash_table != NULL) {
96ab97
         int hash_ix = process_hash_lookup(pid);
96ab97
         if ((hash_ix >= 0) && (list_ptr == include_pid_list)) {
96ab97
-            // Clear dup_bind_count and interleaved flag,
96ab97
-            // in case user wants it to be re-evaluated soon
96ab97
-            process_hash_table[hash_ix].dup_bind_count = 0;
96ab97
+            // Clear interleaved flag, in case user wants it to be re-evaluated
96ab97
             process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED;
96ab97
         }
96ab97
     }
96ab97
@@ -678,18 +719,23 @@ void print_version_and_exit(char *prog_n
96ab97
 
96ab97
 void print_usage_and_exit(char *prog_name) {
96ab97
     fprintf(stderr, "Usage: %s <options> ...\n", prog_name);
96ab97
+    fprintf(stderr, "-C 1  to count inactive file cache as available memory (default 1)\n");
96ab97
+    fprintf(stderr, "-C 0  to count inactive file cache memory as unavailable (default 1)\n");
96ab97
     fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n");
96ab97
-    fprintf(stderr, "-D <CGROUP_MOUNT_POINT> to specify cgroup mount point\n");
96ab97
     fprintf(stderr, "-h to print this usage info\n");
96ab97
+    fprintf(stderr, "-H <N> to set THP scan_sleep_ms (default %d)\n", DEFAULT_THP_SCAN_SLEEP_MS);
96ab97
     fprintf(stderr, "-i [<MIN>:]<MAX> to specify interval seconds\n");
96ab97
-    fprintf(stderr, "-K 1  to keep interleaved memory spread across nodes\n");
96ab97
-    fprintf(stderr, "-K 0  to merge interleaved memory to local NUMA nodes\n");
96ab97
-    fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7)\n");
96ab97
+    fprintf(stderr, "-K 1  to keep interleaved memory spread across nodes (default 0)\n");
96ab97
+    fprintf(stderr, "-K 0  to merge interleaved memory to local NUMA nodes (default 0)\n");
96ab97
+    fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7 -- default 5)\n");
96ab97
+    fprintf(stderr, "-m <N> to specify memory locality target percent (default %d)\n", DEFAULT_MEMLOCALITY_PERCENT);
96ab97
     fprintf(stderr, "-p <PID> to add PID to inclusion pid list\n");
96ab97
     fprintf(stderr, "-r <PID> to remove PID from explicit pid lists\n");
96ab97
-    fprintf(stderr, "-S 1  to scan all processes\n");
96ab97
-    fprintf(stderr, "-S 0  to scan only explicit PID list processes\n");
96ab97
-    fprintf(stderr, "-u <N> to specify target utilization percent (default 85)\n");
96ab97
+    fprintf(stderr, "-R <CPU_LIST> to reserve some CPUs for non-numad use\n");
96ab97
+    fprintf(stderr, "-S 1  to scan all processes (default 1)\n");
96ab97
+    fprintf(stderr, "-S 0  to scan only explicit PID list processes (default 1)\n");
96ab97
+    fprintf(stderr, "-t <N> to specify thread / logical CPU valuation percent (default %d)\n", DEFAULT_HTT_PERCENT);
96ab97
+    fprintf(stderr, "-u <N> to specify utilization target percent (default %d)\n", DEFAULT_UTILIZATION_PERCENT);
96ab97
     fprintf(stderr, "-v for verbose  (same effect as '-l 6')\n");
96ab97
     fprintf(stderr, "-V to show version info\n");
96ab97
     fprintf(stderr, "-w <CPUs>[:<MBs>] for NUMA node suggestions\n");
96ab97
@@ -698,62 +744,35 @@ void print_usage_and_exit(char *prog_nam
96ab97
 }
96ab97
 
96ab97
 
96ab97
-void check_prereqs(char *prog_name) {
96ab97
-    // Verify cpusets are available on this system.
96ab97
-    char **dir = &cpuset_dir_list[0];
96ab97
-    if (*dir == NULL) { dir++; }
96ab97
-    while (*dir != NULL) {
96ab97
-        cpuset_dir = *dir;
96ab97
-        char fname[FNAME_SIZE];
96ab97
-        snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_dir);
96ab97
-        if (access(fname, F_OK) == 0) {
96ab97
-            break;
96ab97
-        }
96ab97
-        dir++;
96ab97
-    }
96ab97
-    if (*dir == NULL) {
96ab97
-        fprintf(stderr, "\n");
96ab97
-        fprintf(stderr, "Are CPUSETs enabled on this system?\n");
96ab97
-        fprintf(stderr, "They are required for %s to function.\n\n", prog_name);
96ab97
-        fprintf(stderr, "Check manpage CPUSET(7). You might need to do something like:\n");
96ab97
-        fprintf(stderr, "    # mkdir <DIRECTORY_MOUNT_POINT>\n");
96ab97
-        fprintf(stderr, "    # mount cgroup -t cgroup -o cpuset <DIRECTORY_MOUNT_POINT>\n");
96ab97
-        fprintf(stderr, "    where <DIRECTORY_MOUNT_POINT> is something like:\n");
96ab97
-        dir = &cpuset_dir_list[0];
96ab97
-        if (*dir == NULL) { dir++; }
96ab97
-        while (*dir != NULL) {
96ab97
-            fprintf(stderr, "      - %s\n", *dir);
96ab97
-            dir++;
96ab97
-        }
96ab97
-        fprintf(stderr, "and then try again...\n");
96ab97
-        fprintf(stderr, "Or, use '-D <DIRECTORY_MOUNT_POINT>' to specify the correct mount point\n");
96ab97
-        fprintf(stderr, "\n");
96ab97
-        exit(EXIT_FAILURE);
96ab97
+void set_thp_scan_sleep_ms(int new_ms) {
96ab97
+    if (new_ms < 1) {
96ab97
+        // 0 means do not change the system default
96ab97
+        return;
96ab97
     }
96ab97
-    // Check on THP scan sleep time.
96ab97
-    char *thp_scan_fname = "/sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs";
96ab97
-    int fd = open(thp_scan_fname, O_RDONLY, 0);
96ab97
+    char *thp_scan_fname = "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs";
96ab97
+    int fd = open(thp_scan_fname, O_RDWR, 0);
96ab97
     if (fd >= 0) {
96ab97
-        int ms;
96ab97
         char buf[BUF_SIZE];
96ab97
         int bytes = read(fd, buf, BUF_SIZE);
96ab97
-        close(fd);
96ab97
         if (bytes > 0) {
96ab97
+            buf[bytes] = '\0';
96ab97
+            int cur_ms;
96ab97
             char *p = buf;
96ab97
-            CONVERT_DIGITS_TO_NUM(p, ms);
96ab97
-            if (ms > 150) {
96ab97
-                fprintf(stderr, "\n");
96ab97
-                numad_log(LOG_NOTICE, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
96ab97
-                fprintf(stderr,       "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
96ab97
-                fprintf(stderr, "Consider increasing the frequency of THP scanning,\n");
96ab97
-                fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname);
96ab97
-                fprintf(stderr, "to more aggressively (re)construct THPs.  For example:\n");
96ab97
-                fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n");
96ab97
-                fprintf(stderr, "\n");
96ab97
+            CONVERT_DIGITS_TO_NUM(p, cur_ms);
96ab97
+            if (cur_ms != new_ms) {
96ab97
+                lseek(fd, 0, SEEK_SET);
96ab97
+                numad_log(LOG_NOTICE, "Changing THP scan time in %s from %d to %d ms.\n", thp_scan_fname, cur_ms, new_ms);
96ab97
+                sprintf(buf, "%d\n", new_ms);
96ab97
+                write(fd, buf, strlen(buf));
96ab97
             }
96ab97
         }
96ab97
+        close(fd);
96ab97
     }
96ab97
-    // FIXME: ?? check for enabled ksmd, and recommend disabling ksm?
96ab97
+}
96ab97
+
96ab97
+void check_prereqs(char *prog_name) {
96ab97
+    // Adjust kernel tunable to scan for THP more frequently...
96ab97
+    set_thp_scan_sleep_ms(thp_scan_sleep_ms);
96ab97
 }
96ab97
 
96ab97
 
96ab97
@@ -785,7 +804,6 @@ int get_daemon_pid() {
96ab97
     return pid; 
96ab97
 }
96ab97
 
96ab97
-
96ab97
 int register_numad_pid() {
96ab97
     int pid;
96ab97
     char buf[BUF_SIZE];
96ab97
@@ -831,6 +849,43 @@ fail_numad_run_file:
96ab97
 }
96ab97
 
96ab97
 
96ab97
+int count_set_bits_in_hex_list_file(char *fname) {
96ab97
+    int sum = 0;
96ab97
+    int fd = open(fname, O_RDONLY, 0);
96ab97
+    if (fd >= 0) {
96ab97
+        char buf[BUF_SIZE];
96ab97
+        int bytes = read(fd, buf, BUF_SIZE);
96ab97
+        close(fd);
96ab97
+        for (int ix = 0;  (ix < bytes);  ix++) {
96ab97
+            char c = tolower(buf[ix]);
96ab97
+            switch (c) {
96ab97
+                case '0'  : sum += 0; break;
96ab97
+                case '1'  : sum += 1; break;
96ab97
+                case '2'  : sum += 1; break;
96ab97
+                case '3'  : sum += 2; break;
96ab97
+                case '4'  : sum += 1; break;
96ab97
+                case '5'  : sum += 2; break;
96ab97
+                case '6'  : sum += 2; break;
96ab97
+                case '7'  : sum += 3; break;
96ab97
+                case '8'  : sum += 1; break;
96ab97
+                case '9'  : sum += 2; break;
96ab97
+                case 'a'  : sum += 2; break;
96ab97
+                case 'b'  : sum += 3; break;
96ab97
+                case 'c'  : sum += 2; break;
96ab97
+                case 'd'  : sum += 3; break;
96ab97
+                case 'e'  : sum += 3; break;
96ab97
+                case 'f'  : sum += 4; break;
96ab97
+                case ' '  : sum += 0; break;
96ab97
+                case ','  : sum += 0; break;
96ab97
+                case '\n' : sum += 0; break;
96ab97
+                default : numad_log(LOG_CRIT, "Unexpected character in list\n"); exit(EXIT_FAILURE);
96ab97
+            }
96ab97
+        }
96ab97
+    }
96ab97
+    return sum;
96ab97
+}
96ab97
+
96ab97
+
96ab97
 int get_num_cpus() {
96ab97
     int n1 = sysconf(_SC_NPROCESSORS_CONF);
96ab97
     int n2 = sysconf(_SC_NPROCESSORS_ONLN);
96ab97
@@ -848,7 +903,7 @@ int get_num_cpus() {
96ab97
 int get_num_kvm_vcpu_threads(int pid) {
96ab97
     // Try to return the number of vCPU threads for this VM guest,
96ab97
     // excluding the IO threads.  All failures return MAXINT.
96ab97
-    // FIXME: figure out some better way to do this...
96ab97
+    // FIXME: someday figure out some better way to do this...
96ab97
     char fname[FNAME_SIZE];
96ab97
     snprintf(fname, FNAME_SIZE, "/proc/%d/cmdline", pid);
96ab97
     int fd = open(fname, O_RDONLY, 0);
96ab97
@@ -876,8 +931,8 @@ int get_num_kvm_vcpu_threads(int pid) {
96ab97
 }
96ab97
 
96ab97
 
96ab97
-int get_huge_page_size_in_bytes() {
96ab97
-    int huge_page_size = 0;;
96ab97
+uint64_t get_huge_page_size_in_bytes() {
96ab97
+    uint64_t huge_page_size = 0;;
96ab97
     FILE *fs = fopen("/proc/meminfo", "r");
96ab97
     if (!fs) {
96ab97
         numad_log(LOG_CRIT, "Can't open /proc/meminfo\n");
96ab97
@@ -890,7 +945,7 @@ int get_huge_page_size_in_bytes() {
96ab97
             while ((!isdigit(*p)) && (p < buf + BUF_SIZE)) {
96ab97
                 p++;
96ab97
             }
96ab97
-            huge_page_size = atoi(p);
96ab97
+            huge_page_size = atol(p);
96ab97
             break;
96ab97
         }
96ab97
     }
96ab97
@@ -916,143 +971,134 @@ static int name_starts_with_digit(const
96ab97
 }
96ab97
 
96ab97
 
96ab97
-int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_list_p, id_list_p cpu_list_p) {
96ab97
-    // Check basic parameter validity.  
96ab97
-    if (pid <= 0) {
96ab97
+
96ab97
+#define BITS_IN_LONG (CHAR_BIT * sizeof(unsigned long))
96ab97
+#define   SET_BIT(i,a)   (a)[(i) / BITS_IN_LONG] |=  (1u << ((i) % BITS_IN_LONG))
96ab97
+#define  TEST_BIT(i,a) (((a)[(i) / BITS_IN_LONG] &   (1u << ((i) % BITS_IN_LONG))) != 0)
96ab97
+#define CLEAR_BIT(i,a)   (a)[(i) / BITS_IN_LONG] &= ~(1u << ((i) % BITS_IN_LONG))
96ab97
+
96ab97
+int bind_process_and_migrate_memory(process_data_p p) {
96ab97
+    uint64_t t0 = get_time_stamp();
96ab97
+    // Parameter p is a pointer to an element in the hash table
96ab97
+    if ((!p) || (p->pid < 1)) {
96ab97
         numad_log(LOG_CRIT, "Bad PID to bind\n");
96ab97
         exit(EXIT_FAILURE);
96ab97
     }
96ab97
-    if ((cpuset_name == NULL) || (strlen(cpuset_name) == 0)) {
96ab97
-        numad_log(LOG_CRIT, "Bad cpuset name to bind\n");
96ab97
-        exit(EXIT_FAILURE);
96ab97
-    }
96ab97
-    int nodes;
96ab97
-    if ((node_list_p == NULL) || ((nodes = NUM_IDS_IN_LIST(node_list_p)) == 0)) {
96ab97
-        numad_log(LOG_CRIT, "Cannot bind to unspecified node\n");
96ab97
+    if (!p->node_list_p) {
96ab97
+        numad_log(LOG_CRIT, "Cannot bind to unspecified node(s)\n");
96ab97
         exit(EXIT_FAILURE);
96ab97
     }
96ab97
-    // Cpu_list_p is optional and may be NULL...
96ab97
-    // Generate CPU id list from the specified node list if necessary
96ab97
-    if (cpu_list_p == NULL) {
96ab97
-        static id_list_p tmp_cpu_list_p;
96ab97
-        CLEAR_LIST(tmp_cpu_list_p);
96ab97
-        int node_id = 0;
96ab97
-        while (nodes) {
96ab97
-            if (ID_IS_IN_LIST(node_id, node_list_p)) {
96ab97
-                OR_LISTS(tmp_cpu_list_p, tmp_cpu_list_p, node[node_id].cpu_list_p);
96ab97
-                nodes -= 1;
96ab97
-            }
96ab97
-            node_id += 1;
96ab97
-        }
96ab97
-        cpu_list_p = tmp_cpu_list_p;
96ab97
-    }
96ab97
-    // Make the cpuset directory if necessary
96ab97
-    char cpuset_name_buf[FNAME_SIZE];
96ab97
-    snprintf(cpuset_name_buf, FNAME_SIZE, "%s%s", cpuset_dir, cpuset_name);
96ab97
-    char *p = &cpuset_name_buf[strlen(cpuset_dir)];
96ab97
-    if (!strcmp(p, "/")) {
96ab97
-        // Make a cpuset directory for this process
96ab97
-        snprintf(cpuset_name_buf, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid);
96ab97
-        numad_log(LOG_NOTICE, "Making new cpuset: %s\n", cpuset_name_buf);
96ab97
-        int rc = mkdir(cpuset_name_buf, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
96ab97
-        if (rc == -1) {
96ab97
-            numad_log(LOG_CRIT, "Bad cpuset mkdir -- errno: %d\n", errno);
96ab97
-            return 0;
96ab97
+    // Generate CPU list derived from target node list.
96ab97
+    static id_list_p cpu_bind_list_p;
96ab97
+    CLEAR_CPU_LIST(cpu_bind_list_p);
96ab97
+    int nodes = NUM_IDS_IN_LIST(p->node_list_p);
96ab97
+    int node_id = 0;
96ab97
+    while (nodes) {
96ab97
+        if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
96ab97
+            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p);
96ab97
+            nodes -= 1;
96ab97
         }
96ab97
+        node_id += 1;
96ab97
     }
96ab97
-    cpuset_name = cpuset_name_buf;
96ab97
-    // Now that we have a cpuset for pid and a populated cpulist,
96ab97
-    // start the actual binding and migration.
96ab97
-    uint64_t t0 = get_time_stamp();
96ab97
-
96ab97
-    // Write "1" out to cpuset.memory_migrate file
96ab97
     char fname[FNAME_SIZE];
96ab97
-    snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name);
96ab97
-    int fd = open(fname, O_WRONLY | O_TRUNC, 0);
96ab97
-    if (fd == -1) {
96ab97
-        numad_log(LOG_CRIT, "Could not open cpuset.memory_migrate -- errno: %d\n", errno);
96ab97
-        return 0;
96ab97
-    }
96ab97
-    write(fd, "1", 1);
96ab97
-    close(fd);
96ab97
-
96ab97
-    // Write node IDs out to cpuset.mems file
96ab97
-    char node_list_buf[BUF_SIZE];
96ab97
-    snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name);
96ab97
-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
96ab97
-    if (fd == -1) {
96ab97
-        numad_log(LOG_CRIT, "Could not open cpuset.mems -- errno: %d\n", errno);
96ab97
-        return 0;
96ab97
-    }
96ab97
-    int len = str_from_id_list(node_list_buf, BUF_SIZE, node_list_p);
96ab97
-    write(fd, node_list_buf, len);
96ab97
-    close(fd);
96ab97
-
96ab97
-    // Write CPU IDs out to cpuset.cpus file
96ab97
-    char cpu_list_buf[BUF_SIZE];
96ab97
-    snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name);
96ab97
-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
96ab97
-    if (fd == -1) {
96ab97
-        numad_log(LOG_CRIT, "Could not open cpuset.cpus -- errno: %d\n", errno);
96ab97
-        return 0;
96ab97
-    }
96ab97
-    len = str_from_id_list(cpu_list_buf, BUF_SIZE, cpu_list_p);
96ab97
-    write(fd, cpu_list_buf, len);
96ab97
-    close(fd);
96ab97
-
96ab97
-    // Copy pid tasks one at a time to tasks file
96ab97
-    snprintf(fname, FNAME_SIZE, "%s/tasks", cpuset_name);
96ab97
-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
96ab97
-    if (fd == -1) {
96ab97
-        numad_log(LOG_CRIT, "Could not open tasks -- errno: %d\n", errno);
96ab97
-        return 0;
96ab97
-    }
96ab97
-    snprintf(fname, FNAME_SIZE, "/proc/%d/task", pid);
96ab97
     struct dirent **namelist;
96ab97
-    int files = scandir(fname, &namelist, name_starts_with_digit, NULL);
96ab97
-    if (files < 0) {
96ab97
-        numad_log(LOG_WARNING, "Could not scandir task list\n");
96ab97
+    snprintf(fname, FNAME_SIZE, "/proc/%d/task", p->pid);
96ab97
+    int num_tasks = scandir(fname, &namelist, name_starts_with_digit, NULL);
96ab97
+    if (num_tasks <= 0) {
96ab97
+        numad_log(LOG_WARNING, "Could not scandir task list for PID: %d\n", p->pid);
96ab97
         return 0;  // Assume the process terminated
96ab97
     }
96ab97
-    for (int ix = 0;  (ix < files);  ix++) {
96ab97
-        // copy pid tasks, one at a time
96ab97
-        numad_log(LOG_NOTICE, "Including task: %s\n", namelist[ix]->d_name);
96ab97
-        write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name));
96ab97
-        free(namelist[ix]);
96ab97
+    // Set the affinity of each task in the process...
96ab97
+    for (int namelist_ix = 0;  (namelist_ix < num_tasks);  namelist_ix++) {
96ab97
+        int tid = atoi(namelist[namelist_ix]->d_name);
96ab97
+        int rc = sched_setaffinity(tid, ID_LIST_BYTES(cpu_bind_list_p), ID_LIST_SET_P(cpu_bind_list_p));
96ab97
+        if (rc < 0) {
96ab97
+            // Check errno
96ab97
+            if (errno == ESRCH) {
96ab97
+                numad_log(LOG_WARNING, "Tried to move PID %d, TID %d, but it apparently went away.\n", p->pid, tid);
96ab97
+            }
96ab97
+            numad_log(LOG_ERR, "Bad sched_setaffinity() on PID %d, TID %d -- errno: %d\n", p->pid, tid, errno);
96ab97
+        }
96ab97
+        free(namelist[namelist_ix]);
96ab97
     }
96ab97
     free(namelist);
96ab97
-    close(fd);
96ab97
-
96ab97
-    uint64_t t1 = get_time_stamp();
96ab97
+    // Now move the memory to the target nodes....
96ab97
+    static unsigned long *dest_mask;
96ab97
+    static unsigned long *from_mask;
96ab97
+    static int allocated_bytes_in_masks;
96ab97
+    // Lie about num_nodes being one bigger because of kernel bug...
96ab97
+    int num_bytes_in_masks = (1 + ((num_nodes + 1) / BITS_IN_LONG)) * sizeof(unsigned long);
96ab97
+    if (allocated_bytes_in_masks < num_bytes_in_masks) {
96ab97
+        allocated_bytes_in_masks = num_bytes_in_masks;
96ab97
+        dest_mask = realloc(dest_mask, num_bytes_in_masks);
96ab97
+        from_mask = realloc(from_mask, num_bytes_in_masks);
96ab97
+        if ((dest_mask == NULL) || (from_mask == NULL)) {
96ab97
+            numad_log(LOG_CRIT, "bit mask malloc failed\n");
96ab97
+            exit(EXIT_FAILURE);
96ab97
+        }
96ab97
+    }
96ab97
+    // In an effort to put semi-balanced memory in each target node, move the
96ab97
+    // contents from the source node with the max amount of memory to the
96ab97
+    // destination node with the least amount of memory.  Repeat until done.
96ab97
+    int prev_from_node_id = -1;
96ab97
+    for (;;) {
96ab97
+        int min_dest_node_id = -1;
96ab97
+        int max_from_node_id = -1;
96ab97
+        for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
96ab97
+            node_id = node[node_ix].node_id;
96ab97
+            if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
96ab97
+                if ((min_dest_node_id < 0) || (p->process_MBs[min_dest_node_id] >= p->process_MBs[node_id])) {
96ab97
+                    // The ">=" above is intentional, so we tend to move memory to higher numbered nodes
96ab97
+                    min_dest_node_id = node_id;
96ab97
+                }
96ab97
+            } else {
96ab97
+                if ((max_from_node_id < 0) || (p->process_MBs[max_from_node_id] < p->process_MBs[node_id])) {
96ab97
+                    max_from_node_id = node_id;
96ab97
+                }
96ab97
+            }
96ab97
+        }
96ab97
+        if ((p->process_MBs[max_from_node_id] == 0) || (max_from_node_id == prev_from_node_id)) {
96ab97
+            break;
96ab97
+        }
96ab97
+        memset(dest_mask, 0, num_bytes_in_masks);
96ab97
+        memset(from_mask, 0, num_bytes_in_masks);
96ab97
+        SET_BIT(max_from_node_id, from_mask);
96ab97
+        SET_BIT(min_dest_node_id, dest_mask);
96ab97
+        numad_log(LOG_DEBUG, "Moving memory from node: %d to node %d\n", max_from_node_id, min_dest_node_id);
96ab97
+        // Lie about num_nodes being one bigger because of kernel bug...
96ab97
+        int rc = syscall(__NR_migrate_pages, p->pid, num_nodes + 1, from_mask, dest_mask);
96ab97
+        if (rc > 2) {
96ab97
+            // rc == the number of pages that could not be moved.  
96ab97
+            // A couple pages not moving is probably not a problem, hence ignoring rc == 1 or 2.
96ab97
+            numad_log(LOG_WARNING, "Tried to move PID %d, but %d pages would not move.\n", p->pid, rc);
96ab97
+        } else if (rc < 0) {
96ab97
+            // Check errno
96ab97
+            if (errno == ESRCH) {
96ab97
+                numad_log(LOG_WARNING, "Tried to move PID %d, but it apparently went away.\n", p->pid);
96ab97
+                return 0;  // Assume the process terminated
96ab97
+            }
96ab97
+        }
96ab97
+        // Assume memory did move for current accounting purposes...
96ab97
+        p->process_MBs[min_dest_node_id] += p->process_MBs[max_from_node_id];
96ab97
+        p->process_MBs[max_from_node_id] = 0;
96ab97
+        prev_from_node_id = max_from_node_id;
96ab97
+    }
96ab97
     // Check pid still active
96ab97
-    snprintf(fname, FNAME_SIZE, "/proc/%d", pid);
96ab97
+    snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
96ab97
     if (access(fname, F_OK) < 0) {
96ab97
-        numad_log(LOG_WARNING, "Could not migrate pid\n");
96ab97
-        return 0;  // Assume the process terminated
96ab97
+        numad_log(LOG_WARNING, "Could not migrate pid %d.  Apparently it went away.\n", p->pid);
96ab97
+        return 0;
96ab97
+    } else {
96ab97
+        uint64_t t1 = get_time_stamp();
96ab97
+        p->bind_time_stamp = t1;
96ab97
+        char node_list_str[BUF_SIZE];
96ab97
+        str_from_id_list(node_list_str, BUF_SIZE, p->node_list_p);
96ab97
+        numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", p->pid, node_list_str, (t1-t0)/100, (t1-t0)%100);
96ab97
+        return 1;
96ab97
     }
96ab97
-    numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", pid, node_list_buf, (t1-t0)/100, (t1-t0)%100);
96ab97
-    return 1;
96ab97
 }
96ab97
 
96ab97
 
96ab97
-void show_nodes() {
96ab97
-    time_t ts = time(NULL);
96ab97
-    fprintf(log_fs, "%s", ctime(&ts);;
96ab97
-    fprintf(log_fs, "Nodes: %d\n", num_nodes);
96ab97
-    for (int ix = 0;  (ix < num_nodes);  ix++) {
96ab97
-        fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld,  Distance: ", 
96ab97
-            ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
96ab97
-        for (int d = 0;  (d < num_nodes);  d++) {
96ab97
-            fprintf(log_fs, "%d ", node[ix].distance[d]);
96ab97
-        }
96ab97
-        char buf[BUF_SIZE];
96ab97
-        str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
96ab97
-        fprintf(log_fs, " CPUs: %s\n", buf);
96ab97
-    }
96ab97
-    fprintf(log_fs, "\n");
96ab97
-    fflush(log_fs);
96ab97
-}
96ab97
-
96ab97
 
96ab97
 typedef struct cpu_data {
96ab97
     uint64_t time_stamp;
96ab97
@@ -1062,10 +1108,9 @@ typedef struct cpu_data {
96ab97
 cpu_data_t cpu_data_buf[2];  // Two sets, to calc deltas
96ab97
 int cur_cpu_data_buf = 0;
96ab97
 
96ab97
-
96ab97
 void update_cpu_data() {
96ab97
     // Parse idle percents from CPU stats in /proc/stat cpu<N> lines
96ab97
-    static FILE *fs = NULL;
96ab97
+    static FILE *fs;
96ab97
     if (fs != NULL) {
96ab97
         rewind(fs);
96ab97
     } else {
96ab97
@@ -1107,14 +1152,14 @@ void update_cpu_data() {
96ab97
             while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; }  // skip nice
96ab97
             while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; }  // skip system
96ab97
             while (!isdigit(*p)) { p++; }
96ab97
-            uint64_t idle = *p++ - '0'; while (isdigit(*p)) { idle *= 10; idle += (*p++ - '0'); }
96ab97
+            uint64_t idle;
96ab97
+            CONVERT_DIGITS_TO_NUM(p, idle);
96ab97
             cpu_data_buf[new].idle[cpu_id] = idle;
96ab97
         }
96ab97
     }
96ab97
     cur_cpu_data_buf = new;
96ab97
 }
96ab97
 
96ab97
-
96ab97
 int node_and_digits(const struct dirent *dptr) {
96ab97
     char *p = (char *)(dptr->d_name);
96ab97
     if (*p++ != 'n') return 0;
96ab97
@@ -1129,10 +1174,31 @@ int node_and_digits(const struct dirent
96ab97
 }
96ab97
 
96ab97
 
96ab97
+uint64_t node_info_time_stamp = 0;
96ab97
 id_list_p all_cpus_list_p = NULL;
96ab97
 id_list_p all_nodes_list_p = NULL;
96ab97
-uint64_t node_info_time_stamp = 0;
96ab97
+id_list_p reserved_cpu_mask_list_p = NULL;
96ab97
+char *reserved_cpu_str = NULL;
96ab97
 
96ab97
+void show_nodes() {
96ab97
+    fprintf(log_fs, "\n");
96ab97
+    numad_log(LOG_INFO, "Nodes: %d\n", num_nodes);
96ab97
+    fprintf(log_fs, "Min CPUs free: %ld, Max CPUs: %ld, Avg CPUs: %ld, StdDev: %lg\n", 
96ab97
+        min_node_CPUs_free, max_node_CPUs_free, avg_node_CPUs_free, stddev_node_CPUs_free);
96ab97
+    fprintf(log_fs, "Min MBs free: %ld, Max MBs: %ld, Avg MBs: %ld, StdDev: %lg\n", 
96ab97
+        min_node_MBs_free, max_node_MBs_free, avg_node_MBs_free, stddev_node_MBs_free);
96ab97
+    for (int ix = 0;  (ix < num_nodes);  ix++) {
96ab97
+        fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld,  Distance: ", 
96ab97
+            ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
96ab97
+        for (int d = 0;  (d < num_nodes);  d++) {
96ab97
+            fprintf(log_fs, "%d ", node[ix].distance[d]);
96ab97
+        }
96ab97
+        char buf[BUF_SIZE];
96ab97
+        str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
96ab97
+        fprintf(log_fs, " CPUs: %s\n", buf);
96ab97
+    }
96ab97
+    fflush(log_fs);
96ab97
+}
96ab97
 
96ab97
 int update_nodes() {
96ab97
     char fname[FNAME_SIZE];
96ab97
@@ -1141,6 +1207,7 @@ int update_nodes() {
96ab97
     uint64_t time_stamp = get_time_stamp();
96ab97
 #define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED)
96ab97
     if ((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) {
96ab97
+        node_info_time_stamp = time_stamp;
96ab97
         // Count directory names of the form: /sys/devices/system/node/node<N>
96ab97
         struct dirent **namelist;
96ab97
         int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL);
96ab97
@@ -1167,8 +1234,15 @@ int update_nodes() {
96ab97
             }
96ab97
             num_nodes = num_files;
96ab97
         }
96ab97
-        CLEAR_LIST(all_cpus_list_p);
96ab97
-        CLEAR_LIST(all_nodes_list_p);
96ab97
+        sum_CPUs_total = 0;
96ab97
+        CLEAR_CPU_LIST(all_cpus_list_p);
96ab97
+        CLEAR_NODE_LIST(all_nodes_list_p);
96ab97
+        // Figure out how many threads per core there are (for later discounting of hyper-threads)
96ab97
+        threads_per_core = count_set_bits_in_hex_list_file("/sys/devices/system/cpu/cpu0/topology/thread_siblings");
96ab97
+        if (threads_per_core < 1) {
96ab97
+            numad_log(LOG_CRIT, "Could not count threads per core\n");
96ab97
+            exit(EXIT_FAILURE);
96ab97
+        }
96ab97
         // For each "node<N>" filename present, save <N> in node[ix].node_id
96ab97
         // Note that the node id might not necessarily match the node ix.
96ab97
         // Also populate the cpu lists and distance vectors for this node.
96ab97
@@ -1184,11 +1258,24 @@ int update_nodes() {
96ab97
             snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/cpulist", node_id);
96ab97
             int fd = open(fname, O_RDONLY, 0);
96ab97
             if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
96ab97
+                buf[BIG_BUF_SIZE - 1] = '\0';
96ab97
                 // get cpulist from the cpulist string
96ab97
-                CLEAR_LIST(node[node_ix].cpu_list_p);
96ab97
+                CLEAR_CPU_LIST(node[node_ix].cpu_list_p);
96ab97
                 int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf);
96ab97
+                if (reserved_cpu_str != NULL) {
96ab97
+                    AND_LISTS(node[node_ix].cpu_list_p, node[node_ix].cpu_list_p, reserved_cpu_mask_list_p);
96ab97
+                    n = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
96ab97
+                }
96ab97
                 OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p);
96ab97
-                node[node_ix].CPUs_total = n * ONE_HUNDRED;
96ab97
+                // Calculate total CPUs, but possibly discount hyper-threads
96ab97
+                if ((threads_per_core == 1) || (htt_percent >= 100)) {
96ab97
+                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
96ab97
+                } else {
96ab97
+                    n /= threads_per_core;
96ab97
+                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
96ab97
+                    node[node_ix].CPUs_total += n * (threads_per_core - 1) * htt_percent;
96ab97
+                }
96ab97
+                sum_CPUs_total += node[node_ix].CPUs_total;
96ab97
                 close(fd);
96ab97
             } else {
96ab97
                 numad_log(LOG_CRIT, "Could not get node cpu list\n");
96ab97
@@ -1220,15 +1307,30 @@ int update_nodes() {
96ab97
         }
96ab97
         free(namelist);
96ab97
     }
96ab97
-    // Second, get the dynamic free memory and available CPU capacity
96ab97
+    // Second, update the dynamic free memory and available CPU capacity
96ab97
+    while (cpu_data_buf[cur_cpu_data_buf].time_stamp + 7 >= time_stamp) {
96ab97
+        // Make sure at least 7/100 of a second has passed.
96ab97
+        // Otherwise sleep for 1/10 second.
96ab97
+	struct timespec ts = { 0, 100000000 }; 
96ab97
+	nanosleep(&ts, &ts);
96ab97
+	time_stamp = get_time_stamp();
96ab97
+    }
96ab97
     update_cpu_data();
96ab97
+    max_node_MBs_free = 0;
96ab97
+    max_node_CPUs_free = 0;
96ab97
+    min_node_MBs_free = MAXINT;
96ab97
+    min_node_CPUs_free = MAXINT;
96ab97
+    uint64_t sum_of_node_MBs_free = 0;
96ab97
+    uint64_t sum_of_node_CPUs_free = 0;
96ab97
     for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
96ab97
         int node_id = node[node_ix].node_id;
96ab97
         // Get available memory info from node<N>/meminfo file
96ab97
         snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id);
96ab97
         int fd = open(fname, O_RDONLY, 0);
96ab97
         if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
96ab97
+            close(fd);
96ab97
             uint64_t KB;
96ab97
+            buf[BIG_BUF_SIZE - 1] = '\0';
96ab97
             char *p = strstr(buf, "MemTotal:");
96ab97
             if (p != NULL) {
96ab97
                 p += 9;
96ab97
@@ -1238,7 +1340,11 @@ int update_nodes() {
96ab97
             }
96ab97
             while (!isdigit(*p)) { p++; }
96ab97
             CONVERT_DIGITS_TO_NUM(p, KB);
96ab97
-            node[node_ix].MBs_total = KB / KILOBYTE;
96ab97
+            node[node_ix].MBs_total = (KB / KILOBYTE);
96ab97
+            if (node[node_ix].MBs_total < 1) {
96ab97
+                // If a node has zero memory, remove it from the all_nodes_list...
96ab97
+                CLR_ID_IN_LIST(node_id, all_nodes_list_p);
96ab97
+            }
96ab97
             p = strstr(p, "MemFree:");
96ab97
             if (p != NULL) {
96ab97
                 p += 8;
96ab97
@@ -1248,8 +1354,28 @@ int update_nodes() {
96ab97
             }
96ab97
             while (!isdigit(*p)) { p++; }
96ab97
             CONVERT_DIGITS_TO_NUM(p, KB);
96ab97
-            node[node_ix].MBs_free = KB / KILOBYTE;
96ab97
-            close(fd);
96ab97
+            node[node_ix].MBs_free = (KB / KILOBYTE);
96ab97
+            if (use_inactive_file_cache) {
96ab97
+                // Add inactive file cache quantity to "free" memory
96ab97
+                p = strstr(p, "Inactive(file):");
96ab97
+                if (p != NULL) {
96ab97
+                    p += 15;
96ab97
+                } else {
96ab97
+                    numad_log(LOG_CRIT, "Could not get node Inactive(file)\n");
96ab97
+                    exit(EXIT_FAILURE);
96ab97
+                }
96ab97
+                while (!isdigit(*p)) { p++; }
96ab97
+                CONVERT_DIGITS_TO_NUM(p, KB);
96ab97
+                node[node_ix].MBs_free += (KB / KILOBYTE);
96ab97
+            }
96ab97
+            sum_of_node_MBs_free += node[node_ix].MBs_free;
96ab97
+            if (min_node_MBs_free > node[node_ix].MBs_free) {
96ab97
+                min_node_MBs_free = node[node_ix].MBs_free;
96ab97
+                min_node_MBs_free_ix = node[node_ix].node_id;
96ab97
+            }
96ab97
+            if (max_node_MBs_free < node[node_ix].MBs_free) {
96ab97
+                max_node_MBs_free = node[node_ix].MBs_free;
96ab97
+            }
96ab97
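The meminfo scan above adds the node's Inactive(file) quantity to MBs_free unless that was disabled with -C 0. A self-contained sketch of the same strstr/digit walk over a meminfo-style buffer; the kb_after() helper is illustrative, not a numad function:

    #include <ctype.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // Return the KB value that follows "label" in a meminfo-style buffer, or 0.
    static uint64_t kb_after(const char *buf, const char *label) {
        const char *p = strstr(buf, label);
        if (p == NULL) return 0;
        p += strlen(label);
        while (*p && !isdigit((unsigned char)*p)) p++;
        uint64_t kb = 0;
        while (isdigit((unsigned char)*p)) { kb = kb * 10 + (uint64_t)(*p++ - '0'); }
        return kb;
    }

    int main(void) {
        const char buf[] = "Node 0 MemFree:  409600 kB\nNode 0 Inactive(file): 204800 kB\n";
        uint64_t mbs_free = kb_after(buf, "MemFree:") / 1024;
        mbs_free += kb_after(buf, "Inactive(file):") / 1024;        // default -C 1 behavior
        printf("MBs_free = %llu\n", (unsigned long long)mbs_free);  // 600
        return 0;
    }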
         } else {
96ab97
             numad_log(LOG_CRIT, "Could not get node meminfo\n");
96ab97
             exit(EXIT_FAILURE);
96ab97
@@ -1260,7 +1386,8 @@ int update_nodes() {
96ab97
         if (cpu_data_buf[old_cpu_data_buf].time_stamp > 0) {
96ab97
             uint64_t idle_ticks = 0;
96ab97
             int cpu = 0;
96ab97
-            int num_cpus_to_process = node[node_ix].CPUs_total / ONE_HUNDRED;
96ab97
+            int num_lcpus = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
96ab97
+            int num_cpus_to_process = num_lcpus;
96ab97
             while (num_cpus_to_process) {
96ab97
                 if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) {
96ab97
                     idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu]
96ab97
@@ -1274,15 +1401,46 @@ int update_nodes() {
96ab97
             // printf("Node: %d   CPUs: %ld   time diff %ld   Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks);
96ab97
             // assert(time_diff > 0);
96ab97
             node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff;
96ab97
+            // Possibly discount hyper-threads
96ab97
+            if ((threads_per_core > 1) && (htt_percent < 100)) {
96ab97
+                uint64_t htt_discount = (num_lcpus - (num_lcpus / threads_per_core)) * (100 - htt_percent);
96ab97
+                if (node[node_ix].CPUs_free > htt_discount) {
96ab97
+                    node[node_ix].CPUs_free -= htt_discount;
96ab97
+                } else {
96ab97
+                    node[node_ix].CPUs_free = 0;
96ab97
+                }
96ab97
+            }
96ab97
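Worked numbers for the idle-capacity discount above: with 8 fully idle logical CPUs on the node, 2 threads per core, and htt_percent 50, the discount is (8 - 4) * (100 - 50) = 200, so 800 hundredths of measured idle capacity is reported as 600 -- consistent with the CPUs_total discount applied earlier.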
             if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) {
96ab97
                 node[node_ix].CPUs_free = node[node_ix].CPUs_total;
96ab97
             }
96ab97
+            sum_of_node_CPUs_free += node[node_ix].CPUs_free;
96ab97
+            if (min_node_CPUs_free > node[node_ix].CPUs_free) {
96ab97
+                min_node_CPUs_free = node[node_ix].CPUs_free;
96ab97
+                min_node_CPUs_free_ix = node[node_ix].node_id;
96ab97
+            }
96ab97
+            if (max_node_CPUs_free < node[node_ix].CPUs_free) {
96ab97
+                max_node_CPUs_free = node[node_ix].CPUs_free;
96ab97
+            }
96ab97
             node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free;
96ab97
         } else {
96ab97
             node[node_ix].CPUs_free = 0;
96ab97
             node[node_ix].magnitude = 0;
96ab97
         }
96ab97
     }
96ab97
+    avg_node_MBs_free = sum_of_node_MBs_free / num_nodes;
96ab97
+    avg_node_CPUs_free = sum_of_node_CPUs_free / num_nodes;
96ab97
+    double MBs_variance_sum = 0.0;
96ab97
+    double CPUs_variance_sum = 0.0;
96ab97
+    for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
96ab97
+        double MBs_diff = (double)node[node_ix].MBs_free - (double)avg_node_MBs_free;
96ab97
+        double CPUs_diff = (double)node[node_ix].CPUs_free - (double)avg_node_CPUs_free;
96ab97
+        MBs_variance_sum += MBs_diff * MBs_diff;
96ab97
+        CPUs_variance_sum += CPUs_diff * CPUs_diff;
96ab97
+    }
96ab97
+    double MBs_variance = MBs_variance_sum / (num_nodes);
96ab97
+    double CPUs_variance = CPUs_variance_sum / (num_nodes);
96ab97
+    stddev_node_MBs_free = sqrt(MBs_variance);
96ab97
+    stddev_node_CPUs_free = sqrt(CPUs_variance);
96ab97
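The balance statistics above are a per-node mean and population standard deviation (dividing by num_nodes, not num_nodes - 1). A minimal standalone equivalent with illustrative values, not part of the patch:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        double mbs_free[] = { 1000, 1400, 600, 1000 };   // illustrative per-node MBs_free
        int num_nodes = 4;
        double sum = 0.0, var_sum = 0.0;
        for (int ix = 0; ix < num_nodes; ix++) sum += mbs_free[ix];
        double avg = sum / num_nodes;
        for (int ix = 0; ix < num_nodes; ix++) {
            double diff = mbs_free[ix] - avg;
            var_sum += diff * diff;
        }
        double stddev = sqrt(var_sum / num_nodes);        // population variance, as above
        printf("avg %.1f  stddev %.1f\n", avg, stddev);   // avg 1000.0  stddev 282.8
        return 0;
    }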
     if (log_level >= LOG_INFO) {
96ab97
         show_nodes();
96ab97
     }
96ab97
@@ -1316,7 +1474,7 @@ typedef struct stat_data {
96ab97
     int64_t num_threads;  // 19
96ab97
     int64_t itrealvalue;
96ab97
     uint64_t starttime;
96ab97
-    uint64_t vsize;
96ab97
+    uint64_t vsize;       // 22
96ab97
     int64_t rss;          // 23
96ab97
     uint64_t rsslim;
96ab97
     uint64_t startcode;
96ab97
@@ -1356,15 +1514,16 @@ process_data_p get_stat_data_for_pid(int
96ab97
     }
96ab97
     static char buf[BUF_SIZE];
96ab97
     int bytes = read(fd, buf, BUF_SIZE);
96ab97
+    close(fd);
96ab97
     if (bytes < 50) {
96ab97
         numad_log(LOG_WARNING, "Could not read stat file: %s\n", fname);
96ab97
         return NULL;
96ab97
     }
96ab97
-    close(fd);
96ab97
+    uint64_t val;
96ab97
     char *p = buf;
96ab97
     static process_data_t data;
96ab97
     // Get PID from field 0
96ab97
-    uint64_t val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
96ab97
+    CONVERT_DIGITS_TO_NUM(p, val);
96ab97
     data.pid = val;
96ab97
     // Copy comm from field 1
96ab97
     while (*p == ' ') { p++; }
96ab97
@@ -1373,23 +1532,27 @@ process_data_p get_stat_data_for_pid(int
96ab97
     // Skip fields 2 through 12
96ab97
     for (int ix = 0;  (ix < 11);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
96ab97
     // Get utime from field 13 for cpu_util
96ab97
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
96ab97
+    CONVERT_DIGITS_TO_NUM(p, val);
96ab97
     data.cpu_util = val;
96ab97
     // Get stime from field 14 to add on to cpu_util (which already has utime)
96ab97
     while (*p == ' ') { p++; }
96ab97
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
96ab97
+    CONVERT_DIGITS_TO_NUM(p, val);
96ab97
     data.cpu_util += val;
96ab97
     // Skip fields 15 through 18
96ab97
     while (*p == ' ') { p++; }
96ab97
     for (int ix = 0;  (ix < 4);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
96ab97
     // Get num_threads from field 19
96ab97
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
96ab97
+    CONVERT_DIGITS_TO_NUM(p, val);
96ab97
     data.num_threads = val;
96ab97
-    // Skip fields 20 through 22
96ab97
+    // Skip fields 20 through 21
96ab97
     while (*p == ' ') { p++; }
96ab97
-    for (int ix = 0;  (ix < 3);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
96ab97
+    for (int ix = 0;  (ix < 2);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
96ab97
+    // Get vsize from field 22 to compute MBs_size
96ab97
+    CONVERT_DIGITS_TO_NUM(p, val);
96ab97
+    data.MBs_size = val / MEGABYTE;
96ab97
     // Get rss from field 23 to compute MBs_used
96ab97
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
96ab97
+    while (*p == ' ') { p++; }
96ab97
+    CONVERT_DIGITS_TO_NUM(p, val);
96ab97
     data.MBs_used = (val * page_size_in_bytes) / MEGABYTE;
96ab97
     // Return pointer to data
96ab97
     return &data;
96ab97
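The CONVERT_DIGITS_TO_NUM() uses above replace the open-coded digit loops on the removed lines. The macro itself is defined earlier in numad.c; judging from the inline code it replaces, it is presumably shaped roughly like this (an assumption, not copied from this patch; relies on isdigit() from <ctype.h>):

    #define CONVERT_DIGITS_TO_NUM(p, n) \
        n = *p++ - '0'; \
        while (isdigit(*p)) { n *= 10; n += (*p++ - '0'); }

Note the unit difference the renumbered fields rely on: /proc/<pid>/stat field 22 (vsize) is in bytes, hence the divide by MEGABYTE, while field 23 (rss) is a page count, hence the extra multiply by page_size_in_bytes.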
@@ -1471,446 +1634,409 @@ int update_processes() {
96ab97
 }
96ab97
 
96ab97
 
96ab97
+int initialize_mem_node_list(process_data_p p) {
96ab97
+    // Parameter p is a pointer to an element in the hash table
96ab97
+    if ((!p) || (p->pid < 1)) {
96ab97
+        numad_log(LOG_CRIT, "Cannot initialize mem node lists with bad PID\n");
96ab97
+        exit(EXIT_FAILURE);
96ab97
+    }
96ab97
+    int n = 0;
96ab97
+    char fname[FNAME_SIZE];
96ab97
+    char buf[BIG_BUF_SIZE];
96ab97
+    p->process_MBs = NULL;
96ab97
+    CLEAR_NODE_LIST(p->node_list_p);
96ab97
+    snprintf(fname, FNAME_SIZE, "/proc/%d/status", p->pid);
96ab97
+    int fd = open(fname, O_RDONLY, 0);
96ab97
+    if (fd < 0) {
96ab97
+        numad_log(LOG_WARNING, "Tried to research PID %d, but it apparently went away.\n", p->pid);
96ab97
+        return 0;  // Assume the process terminated
96ab97
+    }
96ab97
+    int bytes = read(fd, buf, BIG_BUF_SIZE);
96ab97
+    close(fd);
96ab97
+    if (bytes <= 0) {
96ab97
+        numad_log(LOG_WARNING, "Tried to research PID %d, but cannot read status file.\n", p->pid);
96ab97
+        return 0;  // Assume the process terminated
96ab97
+    } else if (bytes >= BIG_BUF_SIZE) {
96ab97
+        buf[BIG_BUF_SIZE - 1] = '\0';
96ab97
+    } else {
96ab97
+        buf[bytes] = '\0';
96ab97
+    }
96ab97
+    char *list_str_p = strstr(buf, "Mems_allowed_list:");
96ab97
+    if (!list_str_p) {
96ab97
+        numad_log(LOG_CRIT, "Could not get node Mems_allowed_list\n");
96ab97
+        exit(EXIT_FAILURE);
96ab97
+    }
96ab97
+    list_str_p += 18;
96ab97
+    while (!isdigit(*list_str_p)) { list_str_p++; }
96ab97
+    n = add_ids_to_list_from_str(p->node_list_p, list_str_p);
96ab97
+    if (n < num_nodes) {
96ab97
+        // If process already bound to a subset of nodes when we discover it,
96ab97
+        // set initial bind_time_stamp to 30 minutes ago...
96ab97
+        p->bind_time_stamp = get_time_stamp() - (1800 * ONE_HUNDRED);
96ab97
+    }
96ab97
+    return n;
96ab97
+}
96ab97
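initialize_mem_node_list() hands the text after "Mems_allowed_list:" in /proc/<pid>/status (e.g. "0-1,3") to add_ids_to_list_from_str(). A standalone sketch of that kind of cpuset range-string parsing; ids_from_str() is an illustrative stand-in, not the real helper:

    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>

    // Count (and print) the node IDs named by a range string such as "0-1,3".
    static int ids_from_str(const char *s) {
        int count = 0;
        char *end;
        while (isdigit((unsigned char)*s)) {
            int lo = (int)strtol(s, &end, 10);
            int hi = lo;
            if (*end == '-') { hi = (int)strtol(end + 1, &end, 10); }
            for (int id = lo; id <= hi; id++) { printf("node %d\n", id); count++; }
            if (*end == ',') s = end + 1; else break;
        }
        return count;
    }

    int main(void) {
        return ids_from_str("0-1,3") == 3 ? 0 : 1;   // prints nodes 0, 1, 3
    }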
 
96ab97
-id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
96ab97
-    char buf[BUF_SIZE];
96ab97
-    char buf2[BUF_SIZE];
96ab97
+
96ab97
+uint64_t combined_value_of_weighted_resources(int ix, int mbs, int cpus, uint64_t MBs_free, uint64_t CPUs_free) {
96ab97
+    int64_t needed_mem;
96ab97
+    int64_t needed_cpu;
96ab97
+    int64_t excess_mem;
96ab97
+    int64_t excess_cpu;
96ab97
+    if (MBs_free > mbs) {
96ab97
+        needed_mem = mbs;
96ab97
+        excess_mem = MBs_free - mbs;
96ab97
+    } else {
96ab97
+        needed_mem = MBs_free;
96ab97
+        excess_mem = 0;
96ab97
+    }
96ab97
+    if (CPUs_free > cpus) {
96ab97
+        needed_cpu = cpus;
96ab97
+        excess_cpu = CPUs_free - cpus;
96ab97
+    } else {
96ab97
+        needed_cpu = CPUs_free;
96ab97
+        excess_cpu = 0;
96ab97
+    }
96ab97
+    // Weight the available resources, and then calculate magnitude as
96ab97
+    // product of available CPUs and available MBs.
96ab97
+    int64_t memfactor = (needed_mem * 10 + excess_mem * 4);
96ab97
+    int64_t cpufactor = (needed_cpu *  6 + excess_cpu * 1);
96ab97
+    numad_log(LOG_DEBUG, "    Node[%d]: mem: %ld  cpu: %ld\n", ix, memfactor, cpufactor);
96ab97
+    return (memfactor * cpufactor);
96ab97
+}
96ab97
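For example, a node with 2000 MBs free and 300 hundredths of CPU free, evaluated against a request of 1500 MBs and 400 hundredths of CPU, yields needed_mem 1500, excess_mem 500, needed_cpu 300, excess_cpu 0, so memfactor = 1500*10 + 500*4 = 17000 and cpufactor = 300*6 = 1800, for a combined magnitude of 30,600,000. The 10:4 and 6:1 weights make resources that actually satisfy the request count far more than surplus capacity, and memory count more than CPU.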
+
96ab97
+
96ab97
+id_list_p pick_numa_nodes(int pid, int cpus, int mbs, int assume_enough_cpus) {
96ab97
     if (log_level >= LOG_DEBUG) {
96ab97
         numad_log(LOG_DEBUG, "PICK NODES FOR:  PID: %d,  CPUs %d,  MBs %d\n", pid, cpus, mbs);
96ab97
     }
96ab97
-    int num_existing_mems = 0;
96ab97
-    static id_list_p existing_mems_list_p;
96ab97
-    CLEAR_LIST(existing_mems_list_p);
96ab97
-    uint64_t time_stamp = get_time_stamp();
96ab97
-    static node_data_p tmp_node;
96ab97
-    static uint64_t *process_MBs;
96ab97
-    static uint64_t *saved_magnitude_for_node;
96ab97
-    static int process_MBs_num_nodes;
96ab97
-    // See if dynamic structures need to grow.
96ab97
-    if (process_MBs_num_nodes < num_nodes + 1) {
96ab97
-        process_MBs_num_nodes = num_nodes + 1;
96ab97
-        // The "+1 node" is for accumulating interleaved memory
96ab97
-        process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t));
96ab97
-        tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
96ab97
-        saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t));
96ab97
-        if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) {
96ab97
-            numad_log(LOG_CRIT, "process_MBs realloc failed\n");
96ab97
-            exit(EXIT_FAILURE);
96ab97
-        }
96ab97
-    }
96ab97
+    char buf[BUF_SIZE];
96ab97
+    uint64_t proc_avg_node_CPUs_free = 0;
96ab97
     // For existing processes, get miscellaneous process specific details
96ab97
     int pid_ix;
96ab97
     process_data_p p = NULL;
96ab97
     if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) {
96ab97
         p = &process_hash_table[pid_ix];
96ab97
-        // Quick rejection if this process has interleaved memory, but recheck it once an hour...
96ab97
-#define MIN_DELAY_FOR_INTERLEAVE (3600 * ONE_HUNDRED)
96ab97
-        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
96ab97
-          && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
96ab97
-            if (log_level >= LOG_DEBUG) {
96ab97
-                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
96ab97
-            }
96ab97
-            return NULL;
96ab97
-        }
96ab97
-        // Get cpuset name for this process, and existing mems binding, if any.
96ab97
+        // Add up per-node memory in use by this process.
96ab97
+        // This scanning is expensive and should be minimized.
96ab97
         char fname[FNAME_SIZE];
96ab97
-        snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", pid);
96ab97
-        FILE *fs = fopen(fname, "r");
96ab97
-        if (!fs) {
96ab97
-            numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
96ab97
-            return NULL;  // Assume the process terminated?
96ab97
-        }
96ab97
-        if (!fgets(buf, BUF_SIZE, fs)) {
96ab97
-            numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
96ab97
-            fclose(fs);
96ab97
-            return NULL;  // Assume the process terminated?
96ab97
-        }
96ab97
-        fclose(fs);
96ab97
-        ELIM_NEW_LINE(buf);
96ab97
-        if ((!p->cpuset_name) || (strcmp(p->cpuset_name, buf))) {
96ab97
-            if (p->cpuset_name != NULL) {
96ab97
-                free(p->cpuset_name);
96ab97
-            }
96ab97
-            p->cpuset_name = strdup(buf);
96ab97
-        }
96ab97
-        if (log_level >= LOG_DEBUG) {
96ab97
-            numad_log(LOG_DEBUG, "CPUSET_NAME: %s\n", p->cpuset_name);
96ab97
-        }
96ab97
-        snprintf(fname, FNAME_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
96ab97
-        fs = fopen(fname, "r");
96ab97
-        if ((fs) && (fgets(buf, BUF_SIZE, fs))) {
96ab97
-            fclose(fs);
96ab97
-            num_existing_mems = add_ids_to_list_from_str(existing_mems_list_p, buf);
96ab97
-            if (log_level >= LOG_DEBUG) {
96ab97
-                str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
96ab97
-                numad_log(LOG_DEBUG, "EXISTING CPUSET NODE LIST: %s\n", buf);
96ab97
-            }
96ab97
-        } 
96ab97
-        // If this process was just recently bound, enforce a minimum delay
96ab97
-        // period between repeated attempts to potentially move the memory.
96ab97
-        // FIXME: ?? might this retard appropriate process expansion too much?  
96ab97
-#define MIN_DELAY_FOR_REEVALUATION (30 * ONE_HUNDRED)
96ab97
-        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
96ab97
-            // Skip re-evaluation because we just did it recently.
96ab97
-            if (log_level >= LOG_DEBUG) {
96ab97
-                numad_log(LOG_DEBUG, "Skipping evaluation because done too recently.\n");
96ab97
-            }
96ab97
-            return NULL;
96ab97
-        }
96ab97
-        // Look for short cut because of duplicate bindings.  If we have bound
96ab97
-        // this process to the same nodes multiple times already, and the load
96ab97
-        // on those nodes still seems acceptable, skip the rest of this and
96ab97
-        // just return NULL to indicate no change needed.  FIXME: should figure
96ab97
-        // out what can change that would make a rebinding desirable (e.g. (1)
96ab97
-        // some process gets sub-optimal allocation on busy machine which
96ab97
-        // subsequently becomes less busy leaving disadvantaged process. (2)
96ab97
-        // node load imbalance, (3) any process split across nodes which should
96ab97
-        // fit within a single node.) For now, just expire the dup_bid_count
96ab97
-        // occasionally, which is a reasonably good mitigation.
96ab97
-        // So, check to see if we should decay the dup_bind_count...
96ab97
-#define DUP_BIND_TIME_OUT (300 * ONE_HUNDRED)
96ab97
-        if ((p->dup_bind_count > 0) && (p->bind_time_stamp + DUP_BIND_TIME_OUT < time_stamp)) {
96ab97
-            p->dup_bind_count -= 1;
96ab97
-        }
96ab97
-        // Now, look for short cut because of duplicate bindings
96ab97
-        if (p->dup_bind_count > 0) {
96ab97
-            int node_id = 0;
96ab97
-            int nodes_have_cpu = 1;
96ab97
-            int nodes_have_ram = 1;
96ab97
-            int n = num_existing_mems;
96ab97
-            int min_resource_pct = 100 - target_utilization;
96ab97
-            if (min_resource_pct < 5) {
96ab97
-                min_resource_pct = 5;
96ab97
-            }
96ab97
-            while (n) {
96ab97
-                if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
96ab97
-                    nodes_have_cpu &= ((100 * node[node_id].CPUs_free / node[node_id].CPUs_total) >= (min_resource_pct));
96ab97
-                    nodes_have_ram &= ((100 * node[node_id].MBs_free  / node[node_id].MBs_total)  >= (min_resource_pct));
96ab97
-                    n -= 1;
96ab97
-                }
96ab97
-                node_id += 1;
96ab97
-            }
96ab97
-            if ((nodes_have_cpu) && (nodes_have_ram)) {
96ab97
-                if (log_level >= LOG_DEBUG) {
96ab97
-                    numad_log(LOG_DEBUG, "Skipping evaluation because of repeat binding\n");
96ab97
-                }
96ab97
-                return NULL;
96ab97
-            }
96ab97
-            if (log_level >= LOG_DEBUG) {
96ab97
-                numad_log(LOG_DEBUG, "Evaluated for skipping by repeat binding, but CPUS: %d, RAM: %d\n", nodes_have_cpu, nodes_have_ram);
96ab97
-            }
96ab97
-        }
96ab97
-        // Fourth, add up per-node memory in use by this process. This scanning
96ab97
-        // is expensive and should be minimized.  Also, old kernels dismantle
96ab97
-        // transparent huge pages while producing the numa_maps memory
96ab97
-        // information! 
96ab97
-        memset(process_MBs, 0, process_MBs_num_nodes * sizeof(uint64_t));
96ab97
         snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid);
96ab97
-        fs = fopen(fname, "r");
96ab97
+        FILE *fs = fopen(fname, "r");
96ab97
         if (!fs) {
96ab97
             numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid);
96ab97
             return NULL;  // Assume the process terminated
96ab97
         }
96ab97
+        // Allocate and zero per node memory array.
96ab97
+        // The "+1 node" is for accumulating interleaved memory
96ab97
+        p->process_MBs = realloc(p->process_MBs, (num_nodes + 1) * sizeof(uint64_t));
96ab97
+        if (p->process_MBs == NULL) {
96ab97
+            numad_log(LOG_CRIT, "p->process_MBs realloc failed\n");
96ab97
+            exit(EXIT_FAILURE);
96ab97
+        }
96ab97
+        memset(p->process_MBs, 0, (num_nodes + 1) * sizeof(uint64_t));
96ab97
         int process_has_interleaved_memory = 0;
96ab97
         while (fgets(buf, BUF_SIZE, fs)) {
96ab97
             int interleaved_memory = 0;
96ab97
             uint64_t page_size = page_size_in_bytes;
96ab97
             const char *delimiters = " \n";
96ab97
-            char *p = strtok(buf, delimiters);
96ab97
-            while (p) {
96ab97
-                if (!strncmp(p, "interleave", 10)) {
96ab97
+            char *str_p = strtok(buf, delimiters);
96ab97
+            while (str_p) {
96ab97
+                if (!strncmp(str_p, "interleave", 10)) {
96ab97
                     interleaved_memory = 1;
96ab97
                     process_has_interleaved_memory = 1;
96ab97
-                } else if (!strcmp(p, "huge")) {
96ab97
+                } else if (!strcmp(str_p, "huge")) {
96ab97
                     page_size = huge_page_size_in_bytes;
96ab97
-                } else if (*p++ == 'N') {
96ab97
+                } else if (*str_p++ == 'N') {
96ab97
                     int node;
96ab97
                     uint64_t pages;
96ab97
-                    CONVERT_DIGITS_TO_NUM(p, node);
96ab97
-                    if (*p++ != '=') {
96ab97
+                    CONVERT_DIGITS_TO_NUM(str_p, node);
96ab97
+                    if (*str_p++ != '=') {
96ab97
                         numad_log(LOG_CRIT, "numa_maps node number parse error\n");
96ab97
                         exit(EXIT_FAILURE);
96ab97
                     }
96ab97
-                    CONVERT_DIGITS_TO_NUM(p, pages);
96ab97
-                    process_MBs[node] += (pages * page_size);
96ab97
+                    CONVERT_DIGITS_TO_NUM(str_p, pages);
96ab97
+                    p->process_MBs[node] += (pages * page_size);
96ab97
                     if (interleaved_memory) {
96ab97
                         // sum interleaved quantity in "extra node"
96ab97
-                        process_MBs[num_nodes] += (pages * page_size);
96ab97
+                        p->process_MBs[num_nodes] += (pages * page_size);
96ab97
                     }
96ab97
                 }
96ab97
                 // Get next token on the line
96ab97
-                p = strtok(NULL, delimiters);
96ab97
+                str_p = strtok(NULL, delimiters);
96ab97
             }
96ab97
         }
96ab97
         fclose(fs);
96ab97
+        proc_avg_node_CPUs_free = p->CPUs_used;
96ab97
         for (int ix = 0;  (ix <= num_nodes);  ix++) {
96ab97
-            process_MBs[ix] /= MEGABYTE;
96ab97
-            if (log_level >= LOG_DEBUG) {
96ab97
-                numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]);
96ab97
+            p->process_MBs[ix] /= MEGABYTE;
96ab97
+            if ((log_level >= LOG_DEBUG) && (p->process_MBs[ix] > 0)) {
96ab97
+                if (ix == num_nodes) {
96ab97
+                    numad_log(LOG_DEBUG, "Interleaved MBs: %ld\n", p->process_MBs[ix]);
96ab97
+                } else {
96ab97
+                    numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, p->process_MBs[ix]);
96ab97
+                }
96ab97
+            }
96ab97
+            if (ID_IS_IN_LIST(ix, p->node_list_p)) {
96ab97
+                proc_avg_node_CPUs_free += node[ix].CPUs_free;
96ab97
             }
96ab97
         }
96ab97
+        proc_avg_node_CPUs_free /= NUM_IDS_IN_LIST(p->node_list_p);
96ab97
         if ((process_has_interleaved_memory) && (keep_interleaved_memory)) {
96ab97
             // Mark this process as having interleaved memory so we do not
96ab97
-            // merge the interleaved memory.  Time stamp it as done.
96ab97
+            // merge the interleaved memory.  Time stamp it as done and return.
96ab97
             p->flags |= PROCESS_FLAG_INTERLEAVED;
96ab97
             p->bind_time_stamp = get_time_stamp();
96ab97
             if (log_level >= LOG_DEBUG) {
96ab97
-                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
96ab97
+                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
96ab97
             }
96ab97
             return NULL;
96ab97
         }
96ab97
     }  // end of existing PID conditional
96ab97
     // Make a copy of node available resources array.  Add in info specific to
96ab97
     // this process to equalize available resource quantities wrt locations of
96ab97
-    // resources already in use by this process.  Inflate the value of already
96ab97
-    // assigned memory by approximately 3/2, because moving memory is
96ab97
-    // expensive.  Average the amount of CPUs_free across the existing nodes
96ab97
-    // used, because the threads are free to move around in that domain.  After
96ab97
-    // calculating combined magnitude of available resources, bias the values
96ab97
-    // towards existing locations for this process.
96ab97
-    int target_using_all_nodes = 0;
96ab97
-    uint64_t node_CPUs_free_for_this_process = 0;
96ab97
-    memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
96ab97
-    if (num_existing_mems > 0) {
96ab97
-        node_CPUs_free_for_this_process = cpus; // ?? Correct for utilization target inflation?
96ab97
-        int node_id = 0;
96ab97
-        int n = num_existing_mems;
96ab97
-        while (n) {
96ab97
-            if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
96ab97
-                node_CPUs_free_for_this_process += tmp_node[node_id].CPUs_free;
96ab97
-                n -= 1;
96ab97
-            }
96ab97
-            node_id += 1;
96ab97
-        }
96ab97
-        // Divide to get average CPUs_free for the nodes in use by process
96ab97
-        node_CPUs_free_for_this_process /= num_existing_mems;
96ab97
+    // resources already in use by this process.
96ab97
+    static node_data_p tmp_node;
96ab97
+    tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
96ab97
+    if (tmp_node == NULL) {
96ab97
+        numad_log(LOG_CRIT, "tmp_node realloc failed\n");
96ab97
+        exit(EXIT_FAILURE);
96ab97
     }
96ab97
+    memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
96ab97
+    uint64_t sum_of_node_CPUs_free = 0;
96ab97
     for (int ix = 0;  (ix < num_nodes);  ix++) {
96ab97
         if (pid > 0) {
96ab97
-            tmp_node[ix].MBs_free  += ((process_MBs[ix] * 12) / 8);
96ab97
-        }
96ab97
-        if ((num_existing_mems > 0) && (ID_IS_IN_LIST(ix, existing_mems_list_p))) {
96ab97
-            tmp_node[ix].CPUs_free = node_CPUs_free_for_this_process;
96ab97
-        }
96ab97
-        if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
96ab97
-            tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
96ab97
-        }
96ab97
-        if (log_level >= LOG_DEBUG) {
96ab97
-            numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, tmp_node[ix].CPUs_free);
96ab97
+            if (NUM_IDS_IN_LIST(p->node_list_p) >= num_nodes) {
96ab97
+                // Process not yet bound to a subset of nodes.
96ab97
+                // Add back memory used by this process on this node.
96ab97
+                tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 17) / 16);  // Apply light mem bias
96ab97
+                // Add back CPU used by this process in proportion to the memory used on this node.
96ab97
+                tmp_node[ix].CPUs_free += ((p->CPUs_used * p->process_MBs[ix]) / p->MBs_used);
96ab97
+            } else {
96ab97
+                // If the process is currently running on less than all the
96ab97
+                // nodes, first add back (biased) memory already used by this
96ab97
+                // process on this node, then assign average process CPU / node
96ab97
+                // for this process iff the process is present on this node.
96ab97
+                tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 5) / 4);  // Apply heavy mem bias
96ab97
+                if (ID_IS_IN_LIST(ix, p->node_list_p)) {
96ab97
+                    tmp_node[ix].CPUs_free = proc_avg_node_CPUs_free;
96ab97
+                }
96ab97
+            }
96ab97
+            sum_of_node_CPUs_free += tmp_node[ix].CPUs_free;
96ab97
+            if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
96ab97
+                tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
96ab97
+            }
96ab97
+            if (tmp_node[ix].MBs_free > tmp_node[ix].MBs_total) {
96ab97
+                tmp_node[ix].MBs_free = tmp_node[ix].MBs_total;
96ab97
+            }
96ab97
         }
96ab97
-        // Calculate magnitude as product of available CPUs and available MBs
96ab97
-        tmp_node[ix].magnitude = tmp_node[ix].CPUs_free * tmp_node[ix].MBs_free;
96ab97
-        // Bias combined magnitude towards already assigned nodes
96ab97
-        if (ID_IS_IN_LIST(ix, existing_mems_list_p)) {
96ab97
-            tmp_node[ix].magnitude *= 9;
96ab97
-            tmp_node[ix].magnitude /= 8;
96ab97
+        // Enforce 1/100th CPU minimum
96ab97
+        if (tmp_node[ix].CPUs_free < 1) {
96ab97
+            tmp_node[ix].CPUs_free = 1;
96ab97
         }
96ab97
-        // Save the current magnitudes
96ab97
-        saved_magnitude_for_node[ix] = tmp_node[ix].magnitude;
96ab97
+        // numad_log(LOG_DEBUG, "Raw Node[%d]: mem: %ld  cpu: %ld\n", ix, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
96ab97
+        tmp_node[ix].magnitude = combined_value_of_weighted_resources(ix, mbs, cpus, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
96ab97
     }
96ab97
-    // OK, figure out where to get resources for this request.
96ab97
+    // Now figure out where to get resources for this request....
96ab97
     static id_list_p target_node_list_p;
96ab97
-    CLEAR_LIST(target_node_list_p);
96ab97
-    int prev_node_used = -1;
96ab97
-    // Continue to allocate more resources until request are met.
96ab97
-    // OK if not not quite all the CPU request is met.
96ab97
-    // FIXME: ?? Is half of the utilization margin a good amount of CPU flexing?
96ab97
-    int cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200; 
96ab97
-    if (pid <= 0) {
96ab97
-        // If trying to find resources for pre-placement advice request, do not
96ab97
-        // underestimate the amount of CPUs needed.  Instead, err on the side
96ab97
-        // of providing too many resources.  So, no flexing here...
96ab97
-        cpu_flex = 0;
96ab97
-    }
96ab97
-    while ((mbs > 0) || (cpus > cpu_flex)) {
96ab97
-        if (log_level >= LOG_DEBUG) {
96ab97
-            numad_log(LOG_DEBUG, "MBs: %d,  CPUs: %d\n", mbs, cpus);
96ab97
+    CLEAR_NODE_LIST(target_node_list_p);
96ab97
+    if ((pid > 0) && (cpus > sum_of_node_CPUs_free)) {
96ab97
+        // System CPUs might be oversubscribed, but...
96ab97
+        assume_enough_cpus = 1;
96ab97
+        // and rely on available memory for placement.
96ab97
+    }
96ab97
+    // Establish a CPU flex fudge factor, on the presumption it is OK if not
96ab97
+    // quite all the CPU request is met.  However, if trying to find resources
96ab97
+    // for pre-placement advice request, do not underestimate the amount of
96ab97
+    // CPUs needed.  Instead, err on the side of providing too many resources.
96ab97
+    int cpu_flex = 0;
96ab97
+    if ((pid > 0) && (target_utilization < 100)) {
96ab97
+        // FIXME: Is half of the utilization margin a good amount of CPU flexing?
96ab97
+        cpu_flex = ((100 - target_utilization) * node[0].CPUs_total) / 200;
96ab97
+    }
96ab97
+    // Figure out minimum number of nodes required
96ab97
+    int mem_req_nodes = ceil((double)mbs  / (double)node[0].MBs_total);
96ab97
+    int cpu_req_nodes = ceil((double)(cpus - cpu_flex) / (double)node[0].CPUs_total); 
96ab97
+    int min_req_nodes = mem_req_nodes;
96ab97
+    if (min_req_nodes < cpu_req_nodes) {
96ab97
+        min_req_nodes = cpu_req_nodes;
96ab97
+    }
96ab97
+    if (min_req_nodes > num_nodes) {
96ab97
+        min_req_nodes = num_nodes;
96ab97
+    }
96ab97
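For example, with node[0].MBs_total = 16384 and node[0].CPUs_total = 1600, a request for 40000 MBs and 900 hundredths of CPU (after cpu_flex) gives mem_req_nodes = ceil(40000/16384) = 3 and cpu_req_nodes = ceil(900/1600) = 1, so min_req_nodes = 3, capped at num_nodes. Note the estimate sizes every node by node[0]'s totals.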
+    // Use an index to sort NUMA connected resource chain for each node
96ab97
+    int index[num_nodes];
96ab97
+    uint64_t totmag[num_nodes];
96ab97
+    for (int ix = 0;  (ix < num_nodes);  ix++) {
96ab97
+        // Reset the index each time
96ab97
+        for (int n = 0;  (n < num_nodes);  n++) {
96ab97
+            index[n] = n;
96ab97
         }
96ab97
-        // Sort nodes by magnitude of available resources.  Note that
96ab97
-        // inter-node distances (to the previous node used) are factored into
96ab97
-        // the sort.
96ab97
+        // Sort by minimum relative NUMA distance from node[ix],
96ab97
+        // breaking distance ties with magnitude of available resources
96ab97
         for (int ij = 0;  (ij < num_nodes);  ij++) {
96ab97
-            int big_ix = ij;
96ab97
+            int best_ix = ij;
96ab97
             for (int ik = ij + 1;  (ik < num_nodes);  ik++) {
96ab97
-                uint64_t ik_dist = 1;
96ab97
-                uint64_t big_ix_dist = 1;
96ab97
-                if (prev_node_used >= 0) {
96ab97
-                    ik_dist = tmp_node[ik].distance[prev_node_used];
96ab97
-                    big_ix_dist = tmp_node[big_ix].distance[prev_node_used];
96ab97
-                }
96ab97
-                // Scale magnitude comparison by distances to previous node used...
96ab97
-                if ((tmp_node[big_ix].magnitude / big_ix_dist) < (tmp_node[ik].magnitude / ik_dist)) {
96ab97
-                    big_ix = ik;
96ab97
-                }
96ab97
-            }
96ab97
-            if (big_ix != ij) {
96ab97
-                node_data_t tmp;
96ab97
-                memcpy((void *)&tmp, (void *)&tmp_node[ij], sizeof(node_data_t) );
96ab97
-                memcpy((void *)&tmp_node[ij], (void *)&tmp_node[big_ix], sizeof(node_data_t) );
96ab97
-                memcpy((void *)&tmp_node[big_ix], (void *)&tmp, sizeof(node_data_t) );
96ab97
+                int ik_dist = tmp_node[index[ik]].distance[ix];
96ab97
+                int best_ix_dist = tmp_node[index[best_ix]].distance[ix];
96ab97
+                if (best_ix_dist > ik_dist) {
96ab97
+                    best_ix = ik;
96ab97
+                } else if (best_ix_dist == ik_dist) {
96ab97
+                    if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) {
96ab97
+                        best_ix = ik;
96ab97
+                    }
96ab97
+                }
96ab97
+            }
96ab97
+            if (best_ix != ij) {
96ab97
+                int tmp = index[ij];
96ab97
+                index[ij] = index[best_ix];
96ab97
+                index[best_ix] = tmp;
96ab97
             }
96ab97
         }
96ab97
+#if 0
96ab97
         if (log_level >= LOG_DEBUG) {
96ab97
-            for (int ix = 0;  (ix < num_nodes);  ix++) {
96ab97
-                numad_log(LOG_DEBUG, "Sorted magnitude[%d]: %ld\n", tmp_node[ix].node_id, tmp_node[ix].magnitude);
96ab97
+            for (int iq = 0;  (iq < num_nodes);  iq++) {
96ab97
+                numad_log(LOG_DEBUG, "Node: %d  Dist: %d  Magnitude: %ld\n",
96ab97
+                    tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[ix], tmp_node[index[iq]].magnitude);
96ab97
+            }
96ab97
+        }
96ab97
+#endif
96ab97
+        // Save the totmag[] sum of the magnitudes of expected needed nodes,
96ab97
+        // "normalized" by NUMA distance (by dividing each magnitude by the
96ab97
+        // relative distance squared).
96ab97
+        totmag[ix] = 0;
96ab97
+        for (int ij = 0;  (ij < min_req_nodes);  ij++) {
96ab97
+            int dist = tmp_node[index[ij]].distance[ix];
96ab97
+            totmag[ix] += (tmp_node[index[ij]].magnitude / (dist * dist));
96ab97
+        }
96ab97
+        numad_log(LOG_DEBUG, "Totmag[%d]: %ld\n", ix, totmag[ix]);
96ab97
+    }
96ab97
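For example, with min_req_nodes = 2 and typical SLIT distance values, a candidate node whose two closest choices have magnitudes 9,000,000 at distance 10 (itself) and 6,000,000 at distance 21 scores totmag = 9,000,000/100 + 6,000,000/441 = 90,000 + 13,605 = 103,605 (integer division), so a remote node's capacity is discounted by the square of its relative distance.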
+    // Now find the best NUMA node based on the normalized sum of node
96ab97
+    // magnitudes expected to be used.
96ab97
+    int best_node_ix = 0;
96ab97
+    for (int ix = 0;  (ix < num_nodes);  ix++) {
96ab97
+        if (totmag[best_node_ix] < totmag[ix]) {
96ab97
+            best_node_ix = ix;
96ab97
+        }
96ab97
+    }
96ab97
+    numad_log(LOG_DEBUG, "best_node_ix: %d\n", best_node_ix);
96ab97
+    // Reset sorting index again
96ab97
+    for (int n = 0;  (n < num_nodes);  n++) {
96ab97
+        index[n] = n;
96ab97
+    }
96ab97
+    // Sort index by distance from node[best_node_ix],
96ab97
+    // breaking distance ties with magnitude
96ab97
+    for (int ij = 0;  (ij < num_nodes);  ij++) {
96ab97
+        int best_ix = ij;
96ab97
+        for (int ik = ij + 1;  (ik < num_nodes);  ik++) {
96ab97
+            int ik_dist = tmp_node[index[ik]].distance[best_node_ix];
96ab97
+            int best_ix_dist = tmp_node[index[best_ix]].distance[best_node_ix];
96ab97
+            if (best_ix_dist > ik_dist) {
96ab97
+                best_ix = ik;
96ab97
+            } else if (best_ix_dist == ik_dist) {
96ab97
+                if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) {
96ab97
+                    best_ix = ik;
96ab97
+                }
96ab97
             }
96ab97
         }
96ab97
-        if (tmp_node[0].node_id == prev_node_used) {
96ab97
-            // Hmmm.  Looks like the best node for more resources, is also the
96ab97
-            // last one we used.  This is not going to make progress...  So
96ab97
-            // just punt and use everything.
96ab97
-            OR_LISTS(target_node_list_p, target_node_list_p, all_nodes_list_p);
96ab97
-            target_using_all_nodes = 1;
96ab97
-            break;
96ab97
+        if (best_ix != ij) {
96ab97
+            int tmp = index[ij];
96ab97
+            index[ij] = index[best_ix];
96ab97
+            index[best_ix] = tmp;
96ab97
+        }
96ab97
+    }
96ab97
+    if (log_level >= LOG_DEBUG) {
96ab97
+        for (int iq = 0;  (iq < num_nodes);  iq++) {
96ab97
+            numad_log(LOG_DEBUG, "Node: %d  Dist: %d  Magnitude: %ld\n",
96ab97
+                tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[best_node_ix], tmp_node[index[iq]].magnitude);
96ab97
         }
96ab97
-        prev_node_used = tmp_node[0].node_id;
96ab97
-        ADD_ID_TO_LIST(tmp_node[0].node_id, target_node_list_p);
96ab97
+    }
96ab97
+    // Allocate more resources until request is met.
96ab97
+    best_node_ix = 0;
96ab97
+    while ((min_req_nodes > 0) || (mbs > 0) || ((cpus > cpu_flex) && (!assume_enough_cpus))) {
96ab97
         if (log_level >= LOG_DEBUG) {
96ab97
-            str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
96ab97
-            str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
96ab97
-            numad_log(LOG_DEBUG, "Existing nodes: %s  Target nodes: %s\n", buf, buf2);
96ab97
+            numad_log(LOG_DEBUG, "MBs: %d,  CPUs: %d\n", mbs, cpus);
96ab97
         }
96ab97
+        numad_log(LOG_DEBUG, "Assigning resources from node %d\n", index[best_node_ix]);
96ab97
+        ADD_ID_TO_LIST(tmp_node[index[best_node_ix]].node_id, target_node_list_p);
96ab97
+        min_req_nodes -= 1;
96ab97
         if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) {
96ab97
             // Apparently we must use all resource nodes...
96ab97
-            target_using_all_nodes = 1;
96ab97
             break;
96ab97
         }
96ab97
-#define MBS_MARGIN 10
96ab97
-        if (tmp_node[0].MBs_free >= (mbs + MBS_MARGIN)) {
96ab97
-            tmp_node[0].MBs_free -= mbs;
96ab97
+        // "Consume" the resources on this node
96ab97
+#define CPUS_MARGIN 0
96ab97
+#define MBS_MARGIN 100
96ab97
+        if (tmp_node[index[best_node_ix]].MBs_free >= (mbs + MBS_MARGIN)) {
96ab97
+            tmp_node[index[best_node_ix]].MBs_free -= mbs;
96ab97
             mbs = 0;
96ab97
         } else {
96ab97
-            mbs -= (tmp_node[0].MBs_free - MBS_MARGIN);
96ab97
-            tmp_node[0].MBs_free = MBS_MARGIN;
96ab97
+            mbs -= (tmp_node[index[best_node_ix]].MBs_free - MBS_MARGIN);
96ab97
+            tmp_node[index[best_node_ix]].MBs_free = MBS_MARGIN;
96ab97
         }
96ab97
-#define CPUS_MARGIN 0
96ab97
-        if (tmp_node[0].CPUs_free >= (cpus + CPUS_MARGIN)) {
96ab97
-            tmp_node[0].CPUs_free -= cpus;
96ab97
+        if (tmp_node[index[best_node_ix]].CPUs_free >= (cpus + CPUS_MARGIN)) {
96ab97
+            tmp_node[index[best_node_ix]].CPUs_free -= cpus;
96ab97
             cpus = 0;
96ab97
         } else {
96ab97
-            cpus -= (tmp_node[0].CPUs_free - CPUS_MARGIN);
96ab97
-            tmp_node[0].CPUs_free = CPUS_MARGIN;
96ab97
-        }
96ab97
-        tmp_node[0].magnitude = tmp_node[0].CPUs_free * tmp_node[0].MBs_free;
96ab97
-    }
96ab97
-    // If this existing process is already located where we want it, and almost
96ab97
-    // all memory is already moved to those nodes, then return NULL indicating
96ab97
-    // no need to change binding this time.
96ab97
-    if ((pid > 0) && (EQUAL_LISTS(target_node_list_p, existing_mems_list_p))) {
96ab97
-        // May not need to change binding.  However, if there is any significant
96ab97
-        // memory still on non-target nodes, advise the bind anyway because
96ab97
-        // there are some scenarios when the kernel will not move it all the
96ab97
-        // first time.
96ab97
-        if (!target_using_all_nodes) {
96ab97
-            p->dup_bind_count += 1;
96ab97
-            for (int ix = 0;  (ix < num_nodes);  ix++) {
96ab97
-                if ((process_MBs[ix] > 10) && (!ID_IS_IN_LIST(ix, target_node_list_p))) {
96ab97
-                    goto try_memory_move_again;
96ab97
-                }
96ab97
-            }
96ab97
-            // We will accept these memory locations.  Stamp it as done.
96ab97
-            p->bind_time_stamp = get_time_stamp();
96ab97
-        }
96ab97
-        // Skip rebinding either because practically all memory is in the
96ab97
-        // target nodes, or because we are stuck using all the nodes.
96ab97
-        if (log_level >= LOG_DEBUG) {
96ab97
-            numad_log(LOG_DEBUG, "Skipping evaluation because memory is reasonably situated.\n");
96ab97
+            cpus -= (tmp_node[index[best_node_ix]].CPUs_free - CPUS_MARGIN);
96ab97
+            tmp_node[index[best_node_ix]].CPUs_free = CPUS_MARGIN;
96ab97
         }
96ab97
-        return NULL;
96ab97
-    } else {
96ab97
-        // Either a non-existing process, or a new binding for an existing process.
96ab97
-        if (p != NULL) {
96ab97
-            // Must be a new binding for an existing process, so reset dup_bind_count.
96ab97
-            p->dup_bind_count = 0;
96ab97
-        }
96ab97
-    }
96ab97
-    // See if this proposed move will make a significant difference.
96ab97
-    // If not, return null instead of advising the move.
96ab97
-    uint64_t target_magnitude = 0;
96ab97
-    uint64_t existing_magnitude = 0;
96ab97
-    int num_target_nodes   = NUM_IDS_IN_LIST(target_node_list_p);
96ab97
-    int num_existing_nodes = NUM_IDS_IN_LIST(existing_mems_list_p);
96ab97
-    /* FIXME: this expansion seems to cause excessive growth
96ab97
-     * So calculate the improvement before hastily expanding nodes.
96ab97
-    if (num_target_nodes > num_existing_nodes) { goto try_memory_move_again; }
96ab97
-    */
96ab97
-    int node_id = 0;
96ab97
-    int n = num_existing_nodes + num_target_nodes;
96ab97
-    while (n) {
96ab97
-        if (ID_IS_IN_LIST(node_id, target_node_list_p)) {
96ab97
-            target_magnitude += saved_magnitude_for_node[node_id];
96ab97
-            n -= 1;
96ab97
-        }
96ab97
-        if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
96ab97
-            existing_magnitude += saved_magnitude_for_node[node_id];
96ab97
-            n -= 1;
96ab97
-        }
96ab97
-        node_id += 1;
96ab97
-    }
96ab97
-    if (existing_magnitude > 0) {
96ab97
-        uint64_t magnitude_change = ((target_magnitude - existing_magnitude) * 100) / existing_magnitude;
96ab97
-        if (magnitude_change < 0) {
96ab97
-            magnitude_change = -(magnitude_change);
96ab97
-        }
96ab97
-        if (magnitude_change <= IMPROVEMENT_THRESHOLD_PERCENT) {
96ab97
-            // Not significant enough percentage change to do rebind
96ab97
+        // Next line optional, since we will not look at that node again
96ab97
+        tmp_node[index[best_node_ix]].magnitude = combined_value_of_weighted_resources(0, mbs, cpus, tmp_node[index[best_node_ix]].MBs_free, tmp_node[index[best_node_ix]].CPUs_free);
96ab97
+        best_node_ix += 1;
96ab97
+    }
96ab97
+    // For existing processes, calculate the non-local memory percent to see if
96ab97
+    // process is already in the right place.
96ab97
+    if ((pid > 0) && (p != NULL)) {
96ab97
+        uint64_t nonlocal_memory = 0;
96ab97
+        for (int ix = 0;  (ix < num_nodes);  ix++) {
96ab97
+            if (!ID_IS_IN_LIST(ix, target_node_list_p)) {
96ab97
+                // Accumulate total of nonlocal memory
96ab97
+                nonlocal_memory += p->process_MBs[ix];
96ab97
+            }
96ab97
+        }
96ab97
+        int disp_percent = (100 * nonlocal_memory) / p->MBs_used;
96ab97
+        // If this existing process is already located where we want it, then just
96ab97
+        // return NULL indicating no need to change binding this time.  Check the
96ab97
+        // amount of nonlocal memory against the target memory locality percent.
96ab97
+        if ((disp_percent <= (100 - target_memlocality)) && (p->bind_time_stamp) && (EQUAL_LISTS(target_node_list_p, p->node_list_p))) {
96ab97
+            // Already bound to targets, and enough of the memory is located where we want it, so no need to rebind
96ab97
             if (log_level >= LOG_DEBUG) {
96ab97
-                str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
96ab97
-                str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
96ab97
-                numad_log(LOG_DEBUG, "Moving pid %d from nodes (%s) to nodes (%s) skipped as insignificant improvement: %ld percent.\n",
96ab97
-                    pid, buf, buf2, magnitude_change);
96ab97
+                numad_log(LOG_DEBUG, "Process %d already %d percent localized to target nodes.\n", p->pid, 100 - disp_percent);
96ab97
             }
96ab97
-            // We decided this is almost good enough.  Stamp it as done.
96ab97
             p->bind_time_stamp = get_time_stamp();
96ab97
             return NULL;
96ab97
         }
96ab97
     }
96ab97
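For example, a process with MBs_used = 8000 and 400 MB resident outside the target nodes has disp_percent = 5; with a target memory locality of, say, 90 percent, the rebind is skipped whenever disp_percent <= 10, provided the process has been bound before and its current node list already equals the target list.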
-    if ((pid <= 0) && (num_target_nodes <= 0)) {
96ab97
-        // Always provide at least one node for pre-placement advice
96ab97
+    // Must always provide at least one node for pre-placement advice
96ab97
+    // FIXME: verify this can happen only if no resources requested...
96ab97
+    if ((pid <= 0) && (NUM_IDS_IN_LIST(target_node_list_p) <= 0)) {
96ab97
         ADD_ID_TO_LIST(node[0].node_id, target_node_list_p);
96ab97
     }
96ab97
-try_memory_move_again:
96ab97
-    str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
96ab97
+    // Log advice, and return target node list
96ab97
+    if ((pid > 0) && (p->bind_time_stamp)) {
96ab97
+        str_from_id_list(buf,  BUF_SIZE, p->node_list_p);
96ab97
+    } else {
96ab97
+        str_from_id_list(buf,  BUF_SIZE, all_nodes_list_p);
96ab97
+    }
96ab97
+    char buf2[BUF_SIZE];
96ab97
     str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
96ab97
     char *cmd_name = "(unknown)";
96ab97
     if ((p) && (p->comm)) {
96ab97
         cmd_name = p->comm;
96ab97
     }
96ab97
     numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2);
96ab97
+    if (pid > 0) {
96ab97
+        COPY_LIST(target_node_list_p, p->node_list_p);
96ab97
+    }
96ab97
     return target_node_list_p;
96ab97
 }
96ab97
 
96ab97
 
96ab97
-
96ab97
-void show_processes(process_data_p *ptr, int nprocs) {
96ab97
-    time_t ts = time(NULL);
96ab97
-    fprintf(log_fs, "%s", ctime(&ts));
96ab97
-    fprintf(log_fs, "Candidates: %d\n", nprocs);
96ab97
-    for (int ix = 0;  (ix < nprocs);  ix++) {
96ab97
-        process_data_p p = ptr[ix];
96ab97
-        char buf[BUF_SIZE];
96ab97
-        snprintf(buf, BUF_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
96ab97
-        FILE *fs = fopen(buf, "r");
96ab97
-        buf[0] = '\0';
96ab97
-        if (fs) {
96ab97
-            if (fgets(buf, BUF_SIZE, fs)) {
96ab97
-                ELIM_NEW_LINE(buf);
96ab97
-            }
96ab97
-            fclose(fs);
96ab97
-        }
96ab97
-        fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", 
96ab97
-            p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
96ab97
-        }
96ab97
-    fprintf(log_fs, "\n");
96ab97
-    fflush(log_fs);
96ab97
-}
96ab97
-
96ab97
-
96ab97
-
96ab97
 int manage_loads() {
96ab97
+    uint64_t time_stamp = get_time_stamp();
96ab97
     // Use temporary index to access and sort hash table entries
96ab97
-    static process_data_p *pindex;
96ab97
     static int pindex_size;
96ab97
+    static process_data_p *pindex;
96ab97
     if (pindex_size < process_hash_table_size) {
96ab97
         pindex_size = process_hash_table_size;
96ab97
         pindex = realloc(pindex, pindex_size * sizeof(process_data_p));
96ab97
@@ -1923,19 +2049,54 @@ int manage_loads() {
96ab97
         return min_interval / 2;
96ab97
     }
96ab97
     memset(pindex, 0, pindex_size * sizeof(process_data_p));
96ab97
-    // Copy live candidate pointers to the index for sorting, etc
96ab97
+    // Copy live candidate pointers to the index for sorting
96ab97
+    // if they meet the threshold for memory usage and CPU usage.
96ab97
     int nprocs = 0;
96ab97
+    long sum_CPUs_used = 0;
96ab97
     for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
96ab97
         process_data_p p = &process_hash_table[ix];
96ab97
-        if (p->pid) {
96ab97
+        if ((p->pid) && (p->CPUs_used > CPU_THRESHOLD) && (p->MBs_used > MEMORY_THRESHOLD)) {
96ab97
             pindex[nprocs++] = p;
96ab97
+            sum_CPUs_used += p->CPUs_used;
96ab97
+            // Initialize node list, if not already done for this process.
96ab97
+            if (p->node_list_p == NULL) {
96ab97
+                initialize_mem_node_list(p);
96ab97
+            }
96ab97
         }
96ab97
     }
96ab97
-    // Sort index by amount of CPU used * amount of memory used.  Not expecting
96ab97
-    // a long list here.  Use a simple sort -- however, sort into bins,
96ab97
-    // treating values within 10% as aquivalent.  Within bins, order by
96ab97
-    // bind_time_stamp so oldest bound will be higher priority to evaluate.
96ab97
+    // Order candidate considerations using timestamps and magnitude: amount of
96ab97
+    // CPU used * amount of memory used.  Not expecting a long list here.  Use
96ab97
+    // a simplistic sort -- however, move all not-yet-bound processes to the
96ab97
+    // front of the list and order them by decreasing magnitude.  Previously
96ab97
+    // bound processes follow in bins of increasing magnitude, treating values
96ab97
+    // within 20% as equivalent.
96ab97
+    // Within bins, order by bind_time_stamp so oldest bound will be higher
96ab97
+    // priority to evaluate.  Start by moving all unbound to beginning.
96ab97
+    int num_unbound = 0;
96ab97
     for (int ij = 0;  (ij < nprocs);  ij++) {
96ab97
+        if (pindex[ij]->bind_time_stamp == 0) {
96ab97
+            process_data_p tmp = pindex[num_unbound];
96ab97
+            pindex[num_unbound++] = pindex[ij];
96ab97
+            pindex[ij] = tmp;
96ab97
+        }
96ab97
+    }
96ab97
+    // Sort all unbound so biggest magnitude comes first
96ab97
+    for (int ij = 0;  (ij < num_unbound);  ij++) {
96ab97
+        int best = ij;
96ab97
+        for (int ik = ij + 1;  (ik < num_unbound);  ik++) {
96ab97
+            uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_used);
96ab97
+            uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used);
96ab97
+            if (ik_mag <= best_mag) continue;
96ab97
+            best = ik;
96ab97
+        }
96ab97
+        if (best != ij) {
96ab97
+            process_data_p tmp = pindex[ij];
96ab97
+            pindex[ij] = pindex[best];
96ab97
+            pindex[best] = tmp;
96ab97
+        }
96ab97
+    }
96ab97
+    // Sort the remaining candidates into bins of increasing magnitude, and by
96ab97
+    // timestamp within bins.
96ab97
+    for (int ij = num_unbound;  (ij < nprocs);  ij++) {
96ab97
         int best = ij;
96ab97
         for (int ik = ij + 1;  (ik < nprocs);  ik++) {
96ab97
             uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_used);
96ab97
@@ -1946,11 +2107,11 @@ int manage_loads() {
96ab97
                 diff_mag = -(diff_mag);
96ab97
                 min_mag = best_mag;
96ab97
             }
96ab97
-            if ((diff_mag > 0) && (min_mag / diff_mag < 10)) {
96ab97
-                // difference > 10 percent.  Use strict ordering
96ab97
+            if ((diff_mag > 0) && (min_mag / diff_mag < 5)) {
96ab97
+                // difference > 20 percent.  Use magnitude ordering
96ab97
                 if (ik_mag <= best_mag) continue;
96ab97
             } else {
96ab97
-                // difference within 10 percent.  Sort these by bind_time_stamp.
96ab97
+                // difference within 20 percent.  Sort these by bind_time_stamp.
96ab97
                 if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue;
96ab97
             }
96ab97
             best = ik;
96ab97
@@ -1961,23 +2122,57 @@ int manage_loads() {
96ab97
             pindex[best] = tmp;
96ab97
         }
96ab97
     }
96ab97
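Worked example of the 20 percent binning above: magnitudes 1000 and 850 differ by 150, and 850/150 = 5 in integer division, which is not < 5, so the two land in the same bin and are ordered by bind_time_stamp; magnitudes 1000 and 790 differ by 210, and 790/210 = 3 < 5, so strict magnitude ordering applies.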
+    // Show the candidate processes in the log file
96ab97
     if ((log_level >= LOG_INFO) && (nprocs > 0)) {
96ab97
-        show_processes(pindex, nprocs);
96ab97
+        numad_log(LOG_INFO, "Candidates: %d\n", nprocs);
96ab97
+        for (int ix = 0;  (ix < nprocs);  ix++) {
96ab97
+            process_data_p p = pindex[ix];
96ab97
+            char buf[BUF_SIZE];
96ab97
+            str_from_id_list(buf, BUF_SIZE, p->node_list_p);
96ab97
+            fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_size %6ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", 
96ab97
+                p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_size, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
96ab97
+        }
96ab97
+        fflush(log_fs);
96ab97
     }
96ab97
-    // Estimate desired size and make resource requests for each significant process
96ab97
+    // Estimate desired size (+ margin capacity) and
96ab97
+    // make resource requests for each candidate process
96ab97
     for (int ix = 0;  (ix < nprocs);  ix++) {
96ab97
         process_data_p p = pindex[ix];
96ab97
-        if (p->CPUs_used * p->MBs_used < CPU_THRESHOLD * MEMORY_THRESHOLD) {
96ab97
-            break; // No more significant processes worth worrying about...
96ab97
+        // If this process has interleaved memory, recheck it only every 30 minutes...
96ab97
+#define MIN_DELAY_FOR_INTERLEAVE (1800 * ONE_HUNDRED)
96ab97
+        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
96ab97
+          && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
96ab97
+            if (log_level >= LOG_DEBUG) {
96ab97
+                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
96ab97
+            }
96ab97
+            continue;
96ab97
+        }
96ab97
+        // Expand resources needed estimate using target_utilization factor.
96ab97
+        // Start with the CPUs actually used (capped by number of threads) for
96ab97
+        // CPUs required, and the RSS MBs actually used for the MBs
96ab97
+        // requirement.
96ab97
+        int mem_target_utilization = target_utilization;
96ab97
+        int cpu_target_utilization = target_utilization;
96ab97
+        // Cap memory utilization at 100 percent (but allow CPUs to oversubscribe)
96ab97
+        if (mem_target_utilization > 100) {
96ab97
+            mem_target_utilization = 100;
96ab97
+        }
96ab97
+        // If the process virtual memory size is bigger than one node, and it
96ab97
+        // is already using more than 80 percent of a node, then request MBs
96ab97
+        // based on the virtual size rather than on the current amount in use.
96ab97
+        int mb_request;
96ab97
+        if ((p->MBs_size > node[0].MBs_total) && ((p->MBs_used * 5 / 4) > node[0].MBs_total)) {
96ab97
+            mb_request = (p->MBs_size * 100) / mem_target_utilization;
96ab97
+        } else {
96ab97
+            mb_request = (p->MBs_used * 100) / mem_target_utilization;
96ab97
         }
96ab97
-        int mb_request  =  (p->MBs_used * 100) / target_utilization;
96ab97
-        int cpu_request = (p->CPUs_used * 100) / target_utilization;
96ab97
-        // Do not give a process more CPUs than it has threads!
96ab97
-        // FIXME: For guest VMs, should limit max to VCPU threads. Will
96ab97
-        // need to do something more intelligent with guest IO threads
96ab97
-        // when eventually considering devices and IRQs.
96ab97
+        int cpu_request = (p->CPUs_used * 100) / cpu_target_utilization;
96ab97
+        // But do not give a process more CPUs than it has threads!
96ab97
         int thread_limit = p->num_threads;
96ab97
-        // If process looks like a KVM guest, try to limit to number of vCPU threads
96ab97
+        // If process looks like a KVM guest, try to limit thread count to the
96ab97
+        // number of vCPU threads.  FIXME: Will need to do something more
96ab97
+        // intelligent than this with guest IO threads when eventually
96ab97
+        // considering devices and IRQs.
96ab97
         if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) {
96ab97
             int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid);
96ab97
             if (thread_limit > kvm_vcpu_threads) {
96ab97
@@ -1988,23 +2183,51 @@ int manage_loads() {
96ab97
         if (cpu_request > thread_limit) {
96ab97
             cpu_request = thread_limit;
96ab97
         }
96ab97
+        // If this process was recently bound, enforce a five-minute minimum
96ab97
+        // delay between repeated attempts to potentially move the process.
96ab97
+#define MIN_DELAY_FOR_REEVALUATION (300 * ONE_HUNDRED)
96ab97
+        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
96ab97
+            // Skip re-evaluation because we just did it recently, but check
96ab97
+            // first for node utilization balance to see if we should
96ab97
+            // re-evaluate this particular process right now.  If this process
96ab97
+            // is running on one of the busiest nodes, go ahead and re-evaluate
96ab97
+            // it if it looks like it should have a better place with
96ab97
+            // sufficient resources.  FIXME: this is currently implemented for
96ab97
+            // only smallish processes that will fit in a single node.
96ab97
+            if ( ( ID_IS_IN_LIST(min_node_CPUs_free_ix, p->node_list_p) || ID_IS_IN_LIST(min_node_MBs_free_ix, p->node_list_p))
96ab97
+                && (cpu_request < node[0].CPUs_total) && (mb_request < node[0].MBs_total) 
96ab97
+                && (abs(min_node_CPUs_free + p->CPUs_used - avg_node_CPUs_free) 
96ab97
+                    + abs((max_node_CPUs_free - p->CPUs_used) - avg_node_CPUs_free) 
96ab97
+                    < (max_node_CPUs_free - min_node_CPUs_free) - CPU_THRESHOLD)  // CPU slop
96ab97
+                && (abs(min_node_MBs_free + p->MBs_used - avg_node_MBs_free)
96ab97
+                    + abs((max_node_MBs_free - p->MBs_used) - avg_node_MBs_free) 
96ab97
+                    < (max_node_MBs_free - min_node_MBs_free)) ) { 
96ab97
+                if (log_level >= LOG_DEBUG) {
96ab97
+                    numad_log(LOG_DEBUG, "Bypassing delay for %d because it looks like it can do better.\n", p->pid);
96ab97
+                }
96ab97
+            } else {
96ab97
+                if (log_level >= LOG_DEBUG) {
96ab97
+                    numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because done too recently.\n", p->pid);
96ab97
+                }
96ab97
+                continue;
96ab97
+            }
96ab97
+        }
96ab97
+        // OK, now pick NUMA nodes for this process and bind it!
96ab97
         pthread_mutex_lock(&node_info_mutex);
96ab97
-        id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request);
96ab97
-        // FIXME: ?? copy node_list_p to shorten mutex region?
96ab97
-        if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p->pid, p->cpuset_name, node_list_p, NULL))) {
96ab97
-            // Shorten interval if actively moving processes
96ab97
+        int assume_enough_cpus = (sum_CPUs_used <= sum_CPUs_total);
96ab97
+        id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request, assume_enough_cpus);
96ab97
+        if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p))) {
96ab97
             pthread_mutex_unlock(&node_info_mutex);
96ab97
-            p->bind_time_stamp = get_time_stamp();
96ab97
+            // Return minimum interval when actively moving processes
96ab97
             return min_interval;
96ab97
         }
96ab97
         pthread_mutex_unlock(&node_info_mutex);
96ab97
     }
96ab97
-    // Return maximum interval if no process movement
96ab97
+    // Return maximum interval when no process movement
96ab97
     return max_interval;
96ab97
 }
96ab97
 
96ab97
 
96ab97
-
96ab97
 void *set_dynamic_options(void *arg) {
96ab97
     // int arg_value = *(int *)arg;
96ab97
     char buf[BUF_SIZE];
96ab97
@@ -2013,6 +2236,18 @@ void *set_dynamic_options(void *arg) {
96ab97
         msg_t msg;
96ab97
         recv_msg(&msg;;
96ab97
         switch (msg.body.cmd) {
96ab97
+        case 'C':
96ab97
+            use_inactive_file_cache = (msg.body.arg1 != 0);
96ab97
+            if (use_inactive_file_cache) {
96ab97
+                numad_log(LOG_NOTICE, "Counting inactive file cache as available\n");
96ab97
+            } else {
96ab97
+                numad_log(LOG_NOTICE, "Counting inactive file cache as unavailable\n");
96ab97
+            }
96ab97
+            break;
96ab97
+        case 'H':
96ab97
+            thp_scan_sleep_ms = msg.body.arg1;
96ab97
+            set_thp_scan_sleep_ms(thp_scan_sleep_ms);
96ab97
+            break;
96ab97
         case 'i':
96ab97
             min_interval = msg.body.arg1;
96ab97
             max_interval = msg.body.arg2;
96ab97
@@ -2033,6 +2268,10 @@ void *set_dynamic_options(void *arg) {
96ab97
             numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1);
96ab97
             log_level = msg.body.arg1;
96ab97
             break;
96ab97
+        case 'm':
96ab97
+            numad_log(LOG_NOTICE, "Changing target memory locality to %d\n", msg.body.arg1);
96ab97
+            target_memlocality = msg.body.arg1;
96ab97
+            break;
96ab97
         case 'p':
96ab97
             numad_log(LOG_NOTICE, "Adding PID %d to inclusion PID list\n", msg.body.arg1);
96ab97
             pthread_mutex_lock(&pid_list_mutex);
96ab97
@@ -2055,6 +2294,11 @@ void *set_dynamic_options(void *arg) {
96ab97
                 numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n");
96ab97
             }
96ab97
             break;
96ab97
+        case 't':
96ab97
+            numad_log(LOG_NOTICE, "Changing logical CPU thread percent to %d\n", msg.body.arg1);
96ab97
+            htt_percent = msg.body.arg1;
96ab97
+            node_info_time_stamp = 0; // to force rescan of nodes/cpus soon
96ab97
+            break;
96ab97
         case 'u':
96ab97
             numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1);
96ab97
             target_utilization = msg.body.arg1;
96ab97
@@ -2064,7 +2308,7 @@ void *set_dynamic_options(void *arg) {
96ab97
                                     msg.body.arg1, msg.body.arg2);
96ab97
             pthread_mutex_lock(&node_info_mutex);
96ab97
             update_nodes();
96ab97
-            id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2);
96ab97
+            id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2, 0);
96ab97
             str_from_id_list(buf, BUF_SIZE, node_list_p);
96ab97
             pthread_mutex_unlock(&node_info_mutex);
96ab97
             send_msg(msg.body.src_pid, 'w', 0, 0, buf);
96ab97
@@ -2134,30 +2378,50 @@ void parse_two_arg_values(char *p, int *
96ab97
 
96ab97
 int main(int argc, char *argv[]) {
96ab97
     int opt;
96ab97
+    int C_flag = 0;
96ab97
     int d_flag = 0;
96ab97
+    int H_flag = 0;
96ab97
     int i_flag = 0;
96ab97
     int K_flag = 0;
96ab97
     int l_flag = 0;
96ab97
+    int m_flag = 0;
96ab97
     int p_flag = 0;
96ab97
     int r_flag = 0;
96ab97
     int S_flag = 0;
96ab97
+    int t_flag = 0;
96ab97
     int u_flag = 0;
96ab97
     int v_flag = 0;
96ab97
     int w_flag = 0;
96ab97
     int x_flag = 0;
96ab97
+    int tmp_int = 0;
96ab97
     long list_pid = 0;
96ab97
-    while ((opt = getopt(argc, argv, "dD:hi:K:l:p:r:S:u:vVw:x:")) != -1) {
96ab97
+    while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) {
96ab97
         switch (opt) {
96ab97
+        case 'C':
96ab97
+            C_flag = 1;
96ab97
+            use_inactive_file_cache = (atoi(optarg) != 0);
96ab97
+            break;
96ab97
         case 'd':
96ab97
             d_flag = 1;
96ab97
             log_level = LOG_DEBUG;
96ab97
             break;
96ab97
         case 'D':
96ab97
-            cpuset_dir_list[0] = strdup(optarg);
96ab97
+            // obsoleted
96ab97
             break;
96ab97
         case 'h':
96ab97
             print_usage_and_exit(argv[0]);
96ab97
             break;
96ab97
+        case 'H':
96ab97
+            tmp_int = atoi(optarg);
96ab97
+            if ((tmp_int == 0) || ((tmp_int > 9) && (tmp_int < 1000001))) {
96ab97
+                // 0 means do not change the system default value
96ab97
+                H_flag = 1;
96ab97
+                thp_scan_sleep_ms = tmp_int;
96ab97
+            } else {
96ab97
+		fprintf(stderr, "THP scan_sleep_ms must be > 9 and < 1000001\n");
96ab97
+		exit(EXIT_FAILURE);
96ab97
+	    }
96ab97
+            break;
96ab97
         case 'i':
96ab97
             i_flag = 1;
96ab97
             parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0);
96ab97
@@ -2170,6 +2434,13 @@ int main(int argc, char *argv[]) {
96ab97
             l_flag = 1;
96ab97
             log_level = atoi(optarg);
96ab97
             break;
96ab97
+        case 'm':
96ab97
+            tmp_int = atoi(optarg);
96ab97
+            if ((tmp_int >= 50) && (tmp_int <= 100)) {
96ab97
+                m_flag = 1;
96ab97
+                target_memlocality = tmp_int;
96ab97
+            }
96ab97
+            break;
96ab97
         case 'p':
96ab97
             p_flag = 1;
96ab97
             list_pid = atol(optarg);
96ab97
@@ -2183,13 +2454,26 @@ int main(int argc, char *argv[]) {
96ab97
             include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid);
96ab97
             exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid);
96ab97
             break;
96ab97
+        case 'R':
96ab97
+            reserved_cpu_str = strdup(optarg);
96ab97
+            break;
96ab97
         case 'S':
96ab97
             S_flag = 1;
96ab97
             scan_all_processes = (atoi(optarg) != 0);
96ab97
             break;
96ab97
+        case 't':
96ab97
+            tmp_int = atoi(optarg);
96ab97
+            if ((tmp_int >= 0) && (tmp_int <= 100)) {
96ab97
+                t_flag = 1;
96ab97
+                htt_percent = tmp_int;
96ab97
+            }
96ab97
+            break;
96ab97
         case 'u':
96ab97
-            u_flag = 1;
96ab97
-            target_utilization = atoi(optarg);
96ab97
+            tmp_int = atoi(optarg);
96ab97
+            if ((tmp_int >= 10) && (tmp_int <= 130)) {
96ab97
+                u_flag = 1;
96ab97
+                target_utilization = tmp_int;
96ab97
+            }
96ab97
             break;
96ab97
         case 'v':
96ab97
             v_flag = 1;
96ab97
@@ -2234,6 +2518,12 @@ int main(int argc, char *argv[]) {
96ab97
         // Daemon is already running.  So send dynamic options to persistant
96ab97
         // thread to handle requests, get the response (if any), and finish.
96ab97
         msg_t msg; 
96ab97
+        if (C_flag) {
96ab97
+            send_msg(daemon_pid, 'C', use_inactive_file_cache, 0, "");
96ab97
+        }
96ab97
+        if (H_flag) {
96ab97
+            send_msg(daemon_pid, 'H', thp_scan_sleep_ms, 0, "");
96ab97
+        }
96ab97
         if (i_flag) {
96ab97
             send_msg(daemon_pid, 'i', min_interval, max_interval, "");
96ab97
         }
96ab97
@@ -2243,6 +2533,9 @@ int main(int argc, char *argv[]) {
96ab97
         if (d_flag || l_flag || v_flag) {
96ab97
             send_msg(daemon_pid, 'l', log_level, 0, "");
96ab97
         }
96ab97
+        if (m_flag) {
96ab97
+            send_msg(daemon_pid, 'm', target_memlocality, 0, "");
96ab97
+        }
96ab97
         if (p_flag) {
96ab97
             send_msg(daemon_pid, 'p', list_pid, 0, "");
96ab97
         }
96ab97
@@ -2252,6 +2545,9 @@ int main(int argc, char *argv[]) {
96ab97
         if (S_flag) {
96ab97
             send_msg(daemon_pid, 'S', scan_all_processes, 0, "");
96ab97
         }
96ab97
+        if (t_flag) {
96ab97
+            send_msg(daemon_pid, 't', htt_percent, 0, "");
96ab97
+        }
96ab97
         if (u_flag) {
96ab97
             send_msg(daemon_pid, 'u', target_utilization, 0, "");
96ab97
         }
96ab97
@@ -2263,14 +2559,30 @@ int main(int argc, char *argv[]) {
96ab97
         if (x_flag) {
96ab97
             send_msg(daemon_pid, 'x', list_pid, 0, "");
96ab97
         }
96ab97
-    } else if (w_flag) {
96ab97
-        // Get pre-placement NUMA advice without starting daemon
96ab97
+        close_log_file();
96ab97
+        exit(EXIT_SUCCESS);
96ab97
+    }
96ab97
+    // No numad daemon running yet.
96ab97
+    // First, make note of any reserved CPUs....
96ab97
+    if (reserved_cpu_str != NULL) {
96ab97
+        CLEAR_CPU_LIST(reserved_cpu_mask_list_p);
96ab97
+        int n = add_ids_to_list_from_str(reserved_cpu_mask_list_p, reserved_cpu_str);
96ab97
         char buf[BUF_SIZE];
96ab97
+        str_from_id_list(buf, BUF_SIZE, reserved_cpu_mask_list_p);
96ab97
+        numad_log(LOG_NOTICE, "Reserving %d CPUs (%s) for non-numad use\n", n, buf);
96ab97
+        // turn reserved list into a negated mask for later ANDing use...
96ab97
+        negate_cpu_list(reserved_cpu_mask_list_p);
96ab97
+    }
96ab97
+    // If it is a "-w" pre-placement request, handle that without starting
96ab97
+    // the daemon.  Otherwise start the numad daemon.
96ab97
+    if (w_flag) {
96ab97
+        // Get pre-placement NUMA advice without starting daemon
96ab97
         update_nodes();
96ab97
         sleep(2);
96ab97
         update_nodes();
96ab97
         numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs);
96ab97
-        id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs);
96ab97
+        id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs, 0);
96ab97
+        char buf[BUF_SIZE];
96ab97
         str_from_id_list(buf, BUF_SIZE, node_list_p);
96ab97
         fprintf(stdout, "%s\n", buf);
96ab97
         close_log_file();
96ab97
@@ -2278,6 +2590,7 @@ int main(int argc, char *argv[]) {
96ab97
     } else if (max_interval > 0) {
96ab97
         // Start the numad daemon...
96ab97
         check_prereqs(argv[0]);
96ab97
+#if (!NO_DAEMON)
96ab97
         // Daemonize self...
96ab97
         daemon_pid = fork();
96ab97
         if (daemon_pid < 0) { numad_log(LOG_CRIT, "fork() failed\n"); exit(EXIT_FAILURE); }
96ab97
@@ -2298,9 +2611,20 @@ int main(int argc, char *argv[]) {
96ab97
         if (log_fs != stderr) {
96ab97
             fclose(stderr);
96ab97
         }
96ab97
+#endif
96ab97
+        // Set up signal handlers
96ab97
+        struct sigaction sa;
96ab97
+        memset(&sa, 0, sizeof(sa)); 
96ab97
+        sa.sa_handler = sig_handler;
96ab97
+        if (sigaction(SIGHUP, &sa, NULL)
96ab97
+            || sigaction(SIGTERM, &sa, NULL)
96ab97
+            || sigaction(SIGQUIT, &sa, NULL)) {
96ab97
+            numad_log(LOG_CRIT, "sigaction does not work?\n");
96ab97
+            exit(EXIT_FAILURE);
96ab97
+        }
96ab97
         // Allocate initial process hash table
96ab97
         process_hash_table_expand();
96ab97
-        // Spawn thread to handle messages from subsequent invocation requests
96ab97
+        // Spawn a thread to handle messages from subsequent invocation requests
96ab97
         pthread_mutex_init(&pid_list_mutex, NULL);
96ab97
         pthread_mutex_init(&node_info_mutex, NULL);
96ab97
         pthread_attr_t attr;
96ab97
@@ -2310,7 +2634,7 @@ int main(int argc, char *argv[]) {
96ab97
         }
96ab97
         pthread_t tid;
96ab97
         if (pthread_create(&tid, &attr, &set_dynamic_options, &tid) != 0) {
96ab97
-            numad_log(LOG_CRIT, "pthread_create failure\n");
96ab97
+            numad_log(LOG_CRIT, "pthread_create failure: setting thread\n");
96ab97
             exit(EXIT_FAILURE);
96ab97
         }
96ab97
         // Loop here forwever...
96ab97
@@ -2322,16 +2646,26 @@ int main(int argc, char *argv[]) {
96ab97
             if (nodes > 1) {
96ab97
                 update_processes();
96ab97
                 interval = manage_loads();
96ab97
+                if (interval < max_interval) {
96ab97
+                    // Update node info since we moved something
96ab97
+                    nodes = update_nodes();
96ab97
+                }
96ab97
             }
96ab97
             sleep(interval);
96ab97
+            if (got_sigterm | got_sigquit) {
96ab97
+                shut_down_numad();
96ab97
+            }
96ab97
+            if (got_sighup) {
96ab97
+                got_sighup = 0;
96ab97
+                close_log_file();
96ab97
+                open_log_file();
96ab97
+            }
96ab97
         }
96ab97
         if (pthread_attr_destroy(&attr) != 0) {
96ab97
             numad_log(LOG_WARNING, "pthread_attr_destroy failure\n");
96ab97
         }
96ab97
         pthread_mutex_destroy(&pid_list_mutex);
96ab97
         pthread_mutex_destroy(&node_info_mutex);
96ab97
-    } else {
96ab97
-        shut_down_numad();
96ab97
     }
96ab97
     exit(EXIT_SUCCESS);
96ab97
 }
96ab97
diff -rup numad-0.5git/numad.init numad-0.5git-new/numad.init
96ab97
--- numad-0.5git/numad.init	2012-12-03 15:40:40.000000000 +0100
96ab97
+++ numad-0.5git-new/numad.init	2016-08-30 08:45:19.000000000 +0200
96ab97
@@ -37,7 +37,7 @@ start() {
96ab97
     [ -f $config ] || exit 6
96ab97
     echo -n $"Starting $prog: "
96ab97
     . $config
96ab97
-    daemon "$exec -i $INTERVAL"
96ab97
+    daemon $exec -i $INTERVAL
96ab97
     retval=$?
96ab97
     echo
96ab97
     [ $retval -eq 0 ] && touch $lockfile