845250
diff -rup numad-0.5git/numad.8 numad-0.5git-new/numad.8
845250
--- numad-0.5git/numad.8	2012-12-03 15:40:40.000000000 +0100
845250
+++ numad-0.5git-new/numad.8	2016-08-30 08:45:19.000000000 +0200
845250
@@ -1,45 +1,56 @@
845250
 .TH "numad" "8" "1.0.0" "Bill Gray" "Administration"
845250
-.SH "numad"
845250
-.LP 
845250
+.SH "NAME"
845250
+.LP
845250
 numad \- A user\-level daemon that provides placement advice and process
845250
 management for efficient use of CPUs and memory on systems with NUMA topology.
845250
-.SH "SYNTAX"
845250
-.LP 
845250
+.SH "SYNOPSIS"
845250
+.LP
845250
 numad [\fI\-dhvV\fP]
845250
-.br 
845250
-.LP 
845250
-numad  [\fI\-D non-standard-cgroup-mount-point\fP]
845250
-.br 
845250
-.LP 
845250
+.br
845250
+.LP
845250
+numad  [\fI\-C 0|1\fP]
845250
+.br
845250
+.LP
845250
+numad  [\fI\-H THP_hugepage_scan_sleep_ms\fP]
845250
+.br
845250
+.LP
845250
 numad  [\fI\-i [min_interval:]max_interval\fP]
845250
-.br 
845250
-.LP 
845250
+.br
845250
+.LP
845250
 numad  [\fI\-K 0|1\fP]
845250
-.br 
845250
-.LP 
845250
+.br
845250
+.LP
845250
 numad  [\fI\-l log_level\fP]
845250
-.br 
845250
-.LP 
845250
+.br
845250
+.LP
845250
+numad  [\fI\-m target_memory_locality\fP]
845250
+.br
845250
+.LP
845250
 numad  [\fI\-p PID\fP]
845250
-.br 
845250
-.LP 
845250
+.br
845250
+.LP
845250
 numad  [\fI\-r PID\fP]
845250
-.br 
845250
-.LP 
845250
+.br
845250
+.LP
845250
+numad  [\fI\-R reserved-CPU-list\fP]
845250
+.br
845250
+.LP
845250
 numad  [\fI\-S 0|1\fP]
845250
-.br 
845250
-.LP 
845250
+.br
845250
+.LP
845250
+numad  [\fI\-t logical_CPU_percent\fP]
845250
+.br
845250
+.LP
845250
 numad  [\fI\-u target_utilization\fP]
845250
-.br 
845250
-.LP 
845250
+.br
845250
+.LP
845250
 numad  [\fI\-w NCPUS[:MB]\fP]
845250
-.br 
845250
-.LP 
845250
+.br
845250
+.LP
845250
 numad  [\fI\-x PID\fP]
845250
-.br 
845250
-
845250
+.br
845250
 .SH "DESCRIPTION"
845250
-.LP 
845250
+.LP
845250
 Numad is a system daemon that monitors NUMA topology and resource usage. It
845250
 will attempt to locate processes for efficient NUMA locality and affinity,
845250
 dynamically adjusting to changing system conditions.  Numad also provides
845250
@@ -53,25 +64,42 @@ large in-memory database application, fo
845250
 accesses will likely remain unpredictable -- numad will probably not improve
845250
 performance.
845250
 .SH "OPTIONS"
845250
-.LP 
845250
-.TP 
845250
+.LP
845250
+.TP
845250
+\fB\-C\fR <\fI0|1\fP>
845250
+This option controls whether or not numad treats inactive file cache as
845250
+available memory. By default, numad assumes it can count inactive file cache as
845250
+"free" memory when considering resources to match with processes.  Specify
845250
+\fI\-C 0\fP if numad should instead consider inactive file cache as a consumed
845250
+resource.
845250
+.TP
845250
 \fB\-d\fR
845250
 Debug output in log, sets the log level to LOG_DEBUG.  Same effect as \fI\-l 7\fP.
845250
 .TP
845250
-\fB\-D\fR <\fInon-standard-cgroup-mount-point\fP>
845250
-This option can be used to communicate a non-standard cgroup mount point to
845250
-numad.  This is not normally necessary.
845250
-.TP 
845250
 \fB\-h\fR
845250
 Display usage help information and then exit.
845250
-.TP 
845250
+.TP
845250
+\fB\-H\fR  <\fITHP_scan_sleep_ms\fP>
845250
+Set the desired transparent hugepage scan interval in ms.  The
845250
+.na
845250
+/sys/kernel/mm/tranparent_hugepage/khugepaged/scan_sleep_millisecs
845250
+.ad
845250
+tunable is usually set to 10000ms by the operating system.  The default is
845250
+changed by numad to be 1000ms since it is helpful for the hugepage daemon to be
845250
+more aggressive when memory moves between nodes.  Specifying (\fI\-H 0\fP) will
845250
+cause numad to retain the system default value.  You can also make the hugepage
845250
+daemon more or less aggressive by specifying an alternate value with this
845250
+option.  For example, setting this value to 100ms (\fI\-H 100\fP) might improve
845250
+the performance of workloads which use many transparent hugepages.
845250
+.TP
845250
 \fB\-i\fR <\fI[min_interval:]max_interval\fP>
845250
 Sets the time interval that numad waits between system scans, in seconds to
845250
 <\fImax_interval\fP>. Default <\fImax_interval\fP> is 15 seconds, default
845250
 <\fImin_interval\fP> is 5 seconds.  Setting a <\fImax_interval\fP> of zero will
845250
 cause the daemon to exit.  (This is the normal mechanism to terminate the
845250
 daemon.)  A bigger <\fImax_interval\fP> will decrease numad overhead but also
845250
-decrease responsiveness to changing loads.
845250
+decrease responsiveness to changing loads.  The default numad max_interval can
845250
+be changed in the numad.conf file.
845250
 .TP
845250
 \fB\-K\fR <\fI0|1\fP>
845250
 This option controls whether numad keeps interleaved memory spread across NUMA
845250
@@ -82,10 +110,24 @@ a large, single-instance application tha
845250
 the workload will have continuous unpredictable memory access patterns (e.g. a
845250
 large in-memory database), you might get better results by specifying \fI\-K
845250
 1\fP to instruct numad to keep interleaved memory distributed.
845250
-.TP 
845250
+.TP
845250
 \fB\-l\fR <\fIlog_level\fP>
845250
 Sets the log level to <\fIlog_level\fP>.  Reasonable choices are 5, 6, or 7.
845250
-The default value is 5.
845250
+The default value is 5.  Note that CPU values are scaled by a factor of 100
845250
+internally and in the numad log files.  Unfortunately, you don't actually have
845250
+that many CPUs.
845250
+.TP
845250
+\fB\-m\fR  <\fItarget_memory_locality\fP>
845250
+Set the desired memory locality threshold to stop moving process memory.  Numad
845250
+might stop retrying to coalesce process memory when more than this percentage
845250
+of the process's memory is already localized in the target node(s).  The
845250
+default is 90%. Numad will frequently localize more than the localization
845250
+threshold percent, but it will not necessarily do so.  Decrease the threshold
845250
+to allow numad to leave more process memory distributed on various nodes.
845250
+Increase the threshold to instruct numad to try to localize more memory.
845250
+Acceptable values are between 50 and 100 percent.  Note that setting the target
845250
+memory locality to 100% might cause numad to continually retry to move memory
845250
+that the kernel will never succesfully move.
845250
 .TP
845250
 \fB\-p\fR <\fIPID\fP>
845250
 Add PID to explicit inclusion list of processes to consider for managing, if
845250
@@ -102,6 +144,12 @@ processes.  After daemon start, only one
845250
 process lists per subsequent numad invocation.  Use with \-S and \-p and \-x to
845250
 precisely control the scope of processes numad can manage.
845250
 .TP
845250
+\fB\-R\fR <\fICPU_LIST\fP>
845250
+Specify a list of CPUs that numad should assume are reserved for non-numad use.
845250
+No processes will be bound to the specified CPUs by numad.  This option is
845250
+effective only when starting numad.  You cannot change reserved CPUs
845250
+dynamically while numad is already running.
845250
+.TP
845250
 \fB\-S\fR <\fI0|1\fP>
845250
 This option controls whether numad scans all system processes or only the
845250
 processes on the explicit inclusion PID list.  The default is to scan all
845250
@@ -113,18 +161,30 @@ exclusion list).  Starting numad as
845250
 .br
845250
 will limit scanning, and thus also automatic NUMA management, to only those
845250
 three explicitly specified processes.
845250
-.TP 
845250
+.TP
845250
+\fB\-t\fR  <\fIlogical_CPU_percent\fP>
845250
+Specify the resource value of logical CPUs.  Hardware threads typically share
845250
+most core resources, and so logical CPUs add only a fraction of CPU power for
845250
+many workloads.  By default numad considers logical CPUs to be only 20 percent
845250
+of a dedicated hardware core.
845250
+.TP
845250
 \fB\-u\fR  <\fItarget_utilization\fP>
845250
 Set the desired maximum consumption percentage of a node. Default is 85%.
845250
 Decrease the target value to maintain more available resource margin on each
845250
 node.  Increase the target value to more exhaustively consume node resources.
845250
-.TP 
845250
+If you have sized your workloads to precisely fit inside a NUMA node,
845250
+specifying (\fI\-u 100\fP) might improve system performance by telling numad to
845250
+go ahead and consume all the resources in each node.  It is possible to specify
845250
+values up to 130 percent to oversubscribe CPUs in the nodes, but memory
845250
+utilization is always capped at 100%.  Use oversubscription values very
845250
+carefully.
845250
+.TP
845250
 \fB\-v\fR
845250
 Verbose output in log, sets the log level to LOG_INFO.  Same effect as \fI\-l 6\fP.
845250
-.TP 
845250
+.TP
845250
 \fB\-V\fR
845250
 Display version information and exit.
845250
-.TP 
845250
+.TP
845250
 \fB\-w\fR <\fINCPUS[:MB]\fP>
845250
 Queries numad for the best NUMA nodes to bind an entity that needs
845250
 <\fINCPUS\fP>.  The amount of memory (in MBs) is optional, but should normally
845250
@@ -145,32 +205,37 @@ Add PID to explicit exclusion list of pr
845250
 Multiple \fI\-x PID\fP options can be specified at daemon start, but after
845250
 daemon start, only one PID can be added to the exclusion list per subsequent
845250
 numad invocation.  Use with \-S to precisely control the scope of processes
845250
-numad can manage.  
845250
+numad can manage.
845250
 .SH "FILES"
845250
-.LP 
845250
-\fI/usr/bin/numad\fP 
845250
-.br 
845250
-\fI/var/log/numad.log\fP 
845250
-.br 
845250
-\fI/var/run/numad.pid\fP 
845250
+.LP
845250
+\fI/usr/bin/numad\fP
845250
+.br
845250
+\fI/etc/numad.conf\fP
845250
+.br
845250
+\fI/var/log/numad.log\fP
845250
+.br
845250
+\fI/var/run/numad.pid\fP
845250
 .SH "ENVIRONMENT VARIABLES"
845250
-.LP 
845250
-.TP 
845250
+.LP
845250
+.TP
845250
 None.
845250
 .SH "EXAMPLES"
845250
-.LP 
845250
-Numad is normally run as a system daemon and should be managed by the 
845250
+.LP
845250
+Numad can be run as a system daemon and can be managed by the
845250
 standard init mechanisms of the host.
845250
-.LP  
845250
+.LP
845250
 If interactive (manual) control is desired, you can start the daemon manually by typing:
845250
-.LP 
845250
+.LP
845250
 /usr/bin/numad
845250
 .LP
845250
-Subsequent numad invocations while the daemon is running can be used to dynamically change run-time options.
845250
+Subsequent numad invocations while the daemon is running can be used to dynamically change most run-time options.
845250
+.LP
845250
+You can terminate numad from running by typing:
845250
+.LP
845250
+/usr/bin/numad -i0
845250
 .SH "AUTHORS"
845250
-.LP 
845250
+.LP
845250
 Bill Gray <bgray@redhat.com>
845250
 .SH "SEE ALSO"
845250
-.LP 
845250
+.LP
845250
 numactl(8)
845250
-
845250
diff -rup numad-0.5git/numad.c numad-0.5git-new/numad.c
845250
--- numad-0.5git/numad.c	2012-12-03 15:40:40.000000000 +0100
845250
+++ numad-0.5git-new/numad.c	2016-08-30 08:45:19.000000000 +0200
845250
@@ -19,7 +19,7 @@ Inc., 59 Temple Place, Suite 330, Boston
845250
 */ 
845250
 
845250
 
845250
-// Compile with: gcc -O -std=gnu99 -Wall -pthread -o numad numad.c -lrt
845250
+// Compile with: gcc -std=gnu99 -g -Wall -pthread -o numad numad.c -lrt -lm
845250
 
845250
 
845250
 #define _GNU_SOURCE
845250
@@ -40,6 +40,10 @@ Inc., 59 Temple Place, Suite 330, Boston
845250
 #include <stdio.h>
845250
 #include <stdlib.h>
845250
 #include <string.h>
845250
+#include <time.h>
845250
+#include <unistd.h>
845250
+#include <values.h>
845250
+
845250
 #include <sys/ipc.h>
845250
 #include <sys/mman.h>
845250
 #include <sys/msg.h>
845250
@@ -49,26 +53,16 @@ Inc., 59 Temple Place, Suite 330, Boston
845250
 #include <sys/syslog.h>
845250
 #include <sys/time.h>
845250
 #include <sys/types.h>
845250
-#include <time.h>
845250
-#include <unistd.h>
845250
-#include <values.h>
845250
+
845250
+#include <asm/unistd.h>
845250
 
845250
 
845250
-#define VERSION_STRING "20121130"
845250
+#define VERSION_STRING "20150602"
845250
 
845250
 
845250
 #define VAR_RUN_FILE "/var/run/numad.pid"
845250
 #define VAR_LOG_FILE "/var/log/numad.log"
845250
 
845250
-char *cpuset_dir = NULL;
845250
-char *cpuset_dir_list[] =  {
845250
-    NULL,
845250
-    "/sys/fs/cgroup/cpuset",
845250
-    "/cgroup/cpuset",
845250
-    NULL
845250
-};
845250
-
845250
-
845250
 #define KILOBYTE (1024)
845250
 #define MEGABYTE (1024 * 1024)
845250
 
845250
@@ -86,14 +80,11 @@ char *cpuset_dir_list[] =  {
845250
 #define MAX_INTERVAL 15
845250
 #define CPU_THRESHOLD     50
845250
 #define MEMORY_THRESHOLD 300
845250
-#define TARGET_UTILIZATION_PERCENT 85
845250
-#define IMPROVEMENT_THRESHOLD_PERCENT 5
845250
-
845250
+#define DEFAULT_HTT_PERCENT 20
845250
+#define DEFAULT_THP_SCAN_SLEEP_MS 1000
845250
+#define DEFAULT_UTILIZATION_PERCENT 85
845250
+#define DEFAULT_MEMLOCALITY_PERCENT 90
845250
 
845250
-#define ELIM_NEW_LINE(s) \
845250
-    if (s[strlen(s) - 1] == '\n') { \
845250
-        s[strlen(s) - 1] = '\0'; \
845250
-    }
845250
 
845250
 #define CONVERT_DIGITS_TO_NUM(p, n) \
845250
     n = *p++ - '0'; \
845250
@@ -105,19 +96,36 @@ char *cpuset_dir_list[] =  {
845250
 
845250
 int num_cpus = 0;
845250
 int num_nodes = 0;
845250
-int page_size_in_bytes = 0;
845250
-int huge_page_size_in_bytes = 0;
845250
+int threads_per_core = 0;
845250
+uint64_t page_size_in_bytes = 0;
845250
+uint64_t huge_page_size_in_bytes = 0;
845250
 
845250
 int min_interval = MIN_INTERVAL;
845250
 int max_interval = MAX_INTERVAL;
845250
-int target_utilization  = TARGET_UTILIZATION_PERCENT;
845250
+int htt_percent = DEFAULT_HTT_PERCENT;
845250
+int thp_scan_sleep_ms = DEFAULT_THP_SCAN_SLEEP_MS;
845250
+int target_utilization  = DEFAULT_UTILIZATION_PERCENT;
845250
+int target_memlocality  = DEFAULT_MEMLOCALITY_PERCENT;
845250
 int scan_all_processes = 1;
845250
 int keep_interleaved_memory = 0;
845250
+int use_inactive_file_cache = 1;
845250
 
845250
 pthread_mutex_t pid_list_mutex;
845250
 pthread_mutex_t node_info_mutex;
845250
+long sum_CPUs_total = 0;
845250
 int requested_mbs = 0;
845250
 int requested_cpus = 0;
845250
+int got_sighup = 0;
845250
+int got_sigterm = 0;
845250
+int got_sigquit = 0;
845250
+
845250
+void sig_handler(int signum) { 
845250
+    switch (signum) {
845250
+        case SIGHUP:  got_sighup  = 1; break;
845250
+        case SIGTERM: got_sigterm = 1; break;
845250
+        case SIGQUIT: got_sigquit = 1; break;
845250
+    }
845250
+}
845250
 
845250
 
845250
 
845250
@@ -139,7 +147,7 @@ void numad_log(int level, const char *fm
845250
     }
845250
     char buf[BUF_SIZE];
845250
     time_t ts = time(NULL);
845250
-    sprintf(buf, ctime(&ts);;
845250
+    strncpy(buf, ctime(&ts), sizeof(buf));
845250
     char *p = &buf[strlen(buf) - 1];
845250
     *p++ = ':';
845250
     *p++ = ' ';
845250
@@ -155,13 +163,16 @@ void open_log_file() {
845250
     log_fs = fopen(VAR_LOG_FILE, "a");
845250
     if (log_fs == NULL) {
845250
         log_fs = stderr;
845250
-        numad_log(LOG_ERR, "Cannot open numad log file -- using stderr\n");
845250
+        numad_log(LOG_ERR, "Cannot open numad log file (errno: %d) -- using stderr\n", errno);
845250
     }
845250
 }
845250
 
845250
+
845250
 void close_log_file() {
845250
     if (log_fs != NULL) {
845250
-        fclose(log_fs);
845250
+        if (log_fs != stderr) {
845250
+            fclose(log_fs);
845250
+        }
845250
         log_fs = NULL;
845250
     }
845250
 }
845250
@@ -235,23 +246,32 @@ void send_msg(long dst_pid, long cmd, lo
845250
 
845250
 
845250
 typedef struct id_list {
845250
-    // Use CPU_SET(3) <sched.h> cpuset bitmasks,
845250
+    // Use CPU_SET(3) <sched.h> bitmasks,
845250
     // but bundle size and pointer together
845250
     // and genericize for both CPU and Node IDs
845250
     cpu_set_t *set_p; 
845250
     size_t bytes;
845250
 } id_list_t, *id_list_p;
845250
 
845250
-#define INIT_ID_LIST(list_p) \
845250
+#define ID_LIST_SET_P(list_p) (list_p->set_p)
845250
+#define ID_LIST_BYTES(list_p) (list_p->bytes)
845250
+
845250
+#define INIT_ID_LIST(list_p, num_elements) \
845250
     list_p = malloc(sizeof(id_list_t)); \
845250
     if (list_p == NULL) { numad_log(LOG_CRIT, "INIT_ID_LIST malloc failed\n"); exit(EXIT_FAILURE); } \
845250
-    list_p->set_p = CPU_ALLOC(num_cpus); \
845250
+    list_p->set_p = CPU_ALLOC(num_elements); \
845250
     if (list_p->set_p == NULL) { numad_log(LOG_CRIT, "CPU_ALLOC failed\n"); exit(EXIT_FAILURE); } \
845250
-    list_p->bytes = CPU_ALLOC_SIZE(num_cpus);
845250
+    list_p->bytes = CPU_ALLOC_SIZE(num_elements);
845250
 
845250
-#define CLEAR_LIST(list_p) \
845250
+#define CLEAR_CPU_LIST(list_p) \
845250
     if (list_p == NULL) { \
845250
-        INIT_ID_LIST(list_p); \
845250
+        INIT_ID_LIST(list_p, num_cpus); \
845250
+    } \
845250
+    CPU_ZERO_S(list_p->bytes, list_p->set_p)
845250
+
845250
+#define CLEAR_NODE_LIST(list_p) \
845250
+    if (list_p == NULL) { \
845250
+        INIT_ID_LIST(list_p, num_nodes); \
845250
     } \
845250
     CPU_ZERO_S(list_p->bytes, list_p->set_p)
845250
 
845250
@@ -262,6 +282,9 @@ typedef struct id_list {
845250
         list_p = NULL; \
845250
     }
845250
 
845250
+#define COPY_LIST(orig_list_p, copy_list_p) \
845250
+    memcpy(copy_list_p->set_p, orig_list_p->set_p, orig_list_p->bytes)
845250
+
845250
 #define NUM_IDS_IN_LIST(list_p)     CPU_COUNT_S(list_p->bytes, list_p->set_p)
845250
 #define ADD_ID_TO_LIST(k, list_p)  CPU_SET_S(k, list_p->bytes, list_p->set_p)
845250
 #define CLR_ID_IN_LIST(k, list_p)  CPU_CLR_S(k, list_p->bytes, list_p->set_p)
845250
@@ -272,6 +295,25 @@ typedef struct id_list {
845250
 #define  OR_LISTS( or_list_p, list_1_p, list_2_p)  CPU_OR_S( or_list_p->bytes,  or_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
845250
 #define XOR_LISTS(xor_list_p, list_1_p, list_2_p) CPU_XOR_S(xor_list_p->bytes, xor_list_p->set_p, list_1_p->set_p, list_2_p->set_p)
845250
 
845250
+int negate_cpu_list(id_list_p list_p) {
845250
+    if (list_p == NULL) {
845250
+        numad_log(LOG_CRIT, "Cannot negate a NULL list\n");
845250
+        exit(EXIT_FAILURE);
845250
+    }
845250
+    if (num_cpus < 1) {
845250
+        numad_log(LOG_CRIT, "No CPUs to negate in list!\n");
845250
+        exit(EXIT_FAILURE);
845250
+    }
845250
+    for (int ix = 0;  (ix < num_cpus);  ix++) {
845250
+        if (ID_IS_IN_LIST(ix, list_p)) {
845250
+            CLR_ID_IN_LIST(ix, list_p);
845250
+        } else {
845250
+            ADD_ID_TO_LIST(ix, list_p);
845250
+        }
845250
+    }
845250
+    return NUM_IDS_IN_LIST(list_p);
845250
+}
845250
+
845250
 int add_ids_to_list_from_str(id_list_p list_p, char *s) {
845250
     if (list_p == NULL) {
845250
         numad_log(LOG_CRIT, "Cannot add to NULL list\n");
845250
@@ -352,9 +394,21 @@ typedef struct node_data {
845250
     uint8_t *distance;
845250
     id_list_p cpu_list_p; 
845250
 } node_data_t, *node_data_p;
845250
-
845250
 node_data_p node = NULL;
845250
 
845250
+int min_node_CPUs_free_ix = -1;
845250
+int min_node_MBs_free_ix = -1;
845250
+long min_node_CPUs_free = MAXINT;
845250
+long min_node_MBs_free = MAXINT;
845250
+long max_node_CPUs_free = 0;
845250
+long max_node_MBs_free = 0;
845250
+long avg_node_CPUs_free = 0;
845250
+long avg_node_MBs_free = 0;
845250
+double stddev_node_CPUs_free = 0.0;
845250
+double stddev_node_MBs_free = 0.0;
845250
+
845250
+
845250
+
845250
 // RING_BUF_SIZE must be a power of two
845250
 #define RING_BUF_SIZE 8
845250
 
845250
@@ -366,14 +420,15 @@ typedef struct process_data {
845250
     uint64_t data_time_stamp; // hundredths of seconds
845250
     uint64_t bind_time_stamp;
845250
     uint64_t num_threads;
845250
+    uint64_t MBs_size;
845250
     uint64_t MBs_used;
845250
     uint64_t cpu_util;
845250
     uint64_t CPUs_used;  // scaled * ONE_HUNDRED
845250
     uint64_t CPUs_used_ring_buf[RING_BUF_SIZE];
845250
     int ring_buf_ix;
845250
-    int dup_bind_count;
845250
     char *comm;
845250
-    char *cpuset_name;
845250
+    id_list_p node_list_p;
845250
+    uint64_t *process_MBs;
845250
 } process_data_t, *process_data_p;
845250
 
845250
 
845250
@@ -433,7 +488,8 @@ int process_hash_insert(int pid) {
845250
 }
845250
 
845250
 int process_hash_update(process_data_p newp) {
845250
-    // This updates hash table stats for processes we are monitoring
845250
+    // This updates hash table stats for processes we are monitoring. Only the
845250
+    // scalar resource consumption stats need to be updated here.
845250
     int new_hash_table_entry = 1;
845250
     int ix = process_hash_insert(newp->pid);
845250
     if (ix >= 0) {
845250
@@ -460,6 +516,7 @@ int process_hash_update(process_data_p n
845250
             }
845250
             p->comm = strdup(newp->comm);
845250
         }
845250
+        p->MBs_size = newp->MBs_size;
845250
         p->MBs_used = newp->MBs_used;
845250
         p->cpu_util = newp->cpu_util;
845250
         p->num_threads = newp->num_threads;
845250
@@ -468,6 +525,11 @@ int process_hash_update(process_data_p n
845250
     return new_hash_table_entry;
845250
 }
845250
 
845250
+void process_hash_clear_all_bind_time_stamps() {
845250
+    for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
845250
+        process_hash_table[ix].bind_time_stamp = 0;
845250
+    }
845250
+}
845250
 
845250
 int process_hash_rehash(int old_ix) {
845250
     // Given the index of a table entry that would otherwise be orphaned by
845250
@@ -489,7 +551,8 @@ int process_hash_remove(int pid) {
845250
         // remove the target
845250
         process_data_p dp = &process_hash_table[ix];
845250
         if (dp->comm) { free(dp->comm); }
845250
-        if (dp->cpuset_name) { free(dp->cpuset_name); }
845250
+        if (dp->process_MBs) { free(dp->process_MBs); }
845250
+        FREE_LIST(dp->node_list_p);
845250
         memset(dp, 0, sizeof(process_data_t));
845250
         // bubble up the collision chain and rehash if neeeded
845250
         for (;;) {
845250
@@ -543,15 +606,15 @@ void process_hash_table_dump() {
845250
         process_data_p p = &process_hash_table[ix];
845250
         if (p->pid) {
845250
             numad_log(LOG_DEBUG,
845250
-                "ix: %d  PID: %d %s  Thds: %d  CPU %ld  MBs: %ld Data TS: %ld  Bind TS: %ld\n",
845250
+                "ix: %d  PID: %d %s  Thds: %d  CPU %ld  MBs: %ld/%ld Data TS: %ld  Bind TS: %ld\n",
845250
                 ix, p->pid, ((p->comm != NULL) ? p->comm : "(Null)"), p->num_threads,
845250
-                p->CPUs_used, p->MBs_used, p->data_time_stamp, p->bind_time_stamp);
845250
+                p->CPUs_used, p->MBs_used, p->MBs_size, p->data_time_stamp, p->bind_time_stamp);
845250
+            // FIXME: make this dump every field, but this is not even currently used
845250
         }
845250
     }
845250
 }
845250
 
845250
 void process_hash_table_cleanup(uint64_t update_time) {
845250
-    int cpusets_removed = 0;
845250
     int num_hash_entries_used = 0;
845250
     for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
845250
         process_data_p p = &process_hash_table[ix];
845250
@@ -562,34 +625,14 @@ void process_hash_table_cleanup(uint64_t
845250
                 p->data_time_stamp = 0;
845250
                 p->CPUs_used = 0;
845250
                 // Check for dead pids and remove them...
845250
-                char fname[FNAME_SIZE];
845250
-                snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
845250
-                if (access(fname, F_OK) < 0) {
845250
-                    // Seems dead.  Forget this pid -- after first checking 
845250
-                    // and removing obsolete numad.PID cpuset directories.  
845250
-                    snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid);
845250
-                    if (access(fname, F_OK) == 0) {
845250
-                        numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname);
845250
-                        int rc = rmdir(fname);
845250
-                        if (rc >= 0) {
845250
-                            cpusets_removed += 1;
845250
-                        } else {
845250
-                            numad_log(LOG_ERR, "bad cpuset rmdir\n");
845250
-                            // exit(EXIT_FAILURE);
845250
-                        }
845250
-                    }
845250
+                if ((kill(p->pid, 0) == -1) && (errno == ESRCH)) {
845250
+                    // Seems dead.  Forget this pid
845250
                     process_hash_remove(p->pid);
845250
                     num_hash_entries_used -= 1;
845250
                 }
845250
             }
845250
         }
845250
     }
845250
-    if (cpusets_removed > 0) {
845250
-        // Expire all the duplicate bind counts so things will be re-evaluated sooner.
845250
-        for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
845250
-            process_hash_table[ix].dup_bind_count = 0;
845250
-        }
845250
-    }
845250
     // Keep hash table approximately half empty
845250
     if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) {
845250
         process_hash_table_expand();
845250
@@ -610,9 +653,7 @@ pid_list_p insert_pid_into_pid_list(pid_
845250
     if (process_hash_table != NULL) {
845250
         int hash_ix = process_hash_lookup(pid);
845250
         if ((hash_ix >= 0) && (list_ptr == include_pid_list)) {
845250
-            // Clear dup_bind_count and interleaved flag,
845250
-            // in case user wants it to be re-evaluated soon
845250
-            process_hash_table[hash_ix].dup_bind_count = 0;
845250
+            // Clear interleaved flag, in case user wants it to be re-evaluated
845250
             process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED;
845250
         }
845250
     }
845250
@@ -678,18 +719,23 @@ void print_version_and_exit(char *prog_n
845250
 
845250
 void print_usage_and_exit(char *prog_name) {
845250
     fprintf(stderr, "Usage: %s <options> ...\n", prog_name);
845250
+    fprintf(stderr, "-C 1  to count inactive file cache as available memory (default 1)\n");
845250
+    fprintf(stderr, "-C 0  to count inactive file cache memory as unavailable (default 1)\n");
845250
     fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n");
845250
-    fprintf(stderr, "-D <CGROUP_MOUNT_POINT> to specify cgroup mount point\n");
845250
     fprintf(stderr, "-h to print this usage info\n");
845250
+    fprintf(stderr, "-H <N> to set THP scan_sleep_ms (default %d)\n", DEFAULT_THP_SCAN_SLEEP_MS);
845250
     fprintf(stderr, "-i [<MIN>:]<MAX> to specify interval seconds\n");
845250
-    fprintf(stderr, "-K 1  to keep interleaved memory spread across nodes\n");
845250
-    fprintf(stderr, "-K 0  to merge interleaved memory to local NUMA nodes\n");
845250
-    fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7)\n");
845250
+    fprintf(stderr, "-K 1  to keep interleaved memory spread across nodes (default 0)\n");
845250
+    fprintf(stderr, "-K 0  to merge interleaved memory to local NUMA nodes (default 0)\n");
845250
+    fprintf(stderr, "-l <N> to specify logging level (usually 5, 6, or 7 -- default 5)\n");
845250
+    fprintf(stderr, "-m <N> to specify memory locality target percent (default %d)\n", DEFAULT_MEMLOCALITY_PERCENT);
845250
     fprintf(stderr, "-p <PID> to add PID to inclusion pid list\n");
845250
     fprintf(stderr, "-r <PID> to remove PID from explicit pid lists\n");
845250
-    fprintf(stderr, "-S 1  to scan all processes\n");
845250
-    fprintf(stderr, "-S 0  to scan only explicit PID list processes\n");
845250
-    fprintf(stderr, "-u <N> to specify target utilization percent (default 85)\n");
845250
+    fprintf(stderr, "-R <CPU_LIST> to reserve some CPUs for non-numad use\n");
845250
+    fprintf(stderr, "-S 1  to scan all processes (default 1)\n");
845250
+    fprintf(stderr, "-S 0  to scan only explicit PID list processes (default 1)\n");
845250
+    fprintf(stderr, "-t <N> to specify thread / logical CPU valuation percent (default %d)\n", DEFAULT_HTT_PERCENT);
845250
+    fprintf(stderr, "-u <N> to specify utilization target percent (default %d)\n", DEFAULT_UTILIZATION_PERCENT);
845250
     fprintf(stderr, "-v for verbose  (same effect as '-l 6')\n");
845250
     fprintf(stderr, "-V to show version info\n");
845250
     fprintf(stderr, "-w <CPUs>[:<MBs>] for NUMA node suggestions\n");
845250
@@ -698,62 +744,35 @@ void print_usage_and_exit(char *prog_nam
845250
 }
845250
 
845250
 
845250
-void check_prereqs(char *prog_name) {
845250
-    // Verify cpusets are available on this system.
845250
-    char **dir = &cpuset_dir_list[0];
845250
-    if (*dir == NULL) { dir++; }
845250
-    while (*dir != NULL) {
845250
-        cpuset_dir = *dir;
845250
-        char fname[FNAME_SIZE];
845250
-        snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_dir);
845250
-        if (access(fname, F_OK) == 0) {
845250
-            break;
845250
-        }
845250
-        dir++;
845250
-    }
845250
-    if (*dir == NULL) {
845250
-        fprintf(stderr, "\n");
845250
-        fprintf(stderr, "Are CPUSETs enabled on this system?\n");
845250
-        fprintf(stderr, "They are required for %s to function.\n\n", prog_name);
845250
-        fprintf(stderr, "Check manpage CPUSET(7). You might need to do something like:\n");
845250
-        fprintf(stderr, "    # mkdir <DIRECTORY_MOUNT_POINT>\n");
845250
-        fprintf(stderr, "    # mount cgroup -t cgroup -o cpuset <DIRECTORY_MOUNT_POINT>\n");
845250
-        fprintf(stderr, "    where <DIRECTORY_MOUNT_POINT> is something like:\n");
845250
-        dir = &cpuset_dir_list[0];
845250
-        if (*dir == NULL) { dir++; }
845250
-        while (*dir != NULL) {
845250
-            fprintf(stderr, "      - %s\n", *dir);
845250
-            dir++;
845250
-        }
845250
-        fprintf(stderr, "and then try again...\n");
845250
-        fprintf(stderr, "Or, use '-D <DIRECTORY_MOUNT_POINT>' to specify the correct mount point\n");
845250
-        fprintf(stderr, "\n");
845250
-        exit(EXIT_FAILURE);
845250
+void set_thp_scan_sleep_ms(int new_ms) {
845250
+    if (new_ms < 1) {
845250
+        // 0 means do not change the system default
845250
+        return;
845250
     }
845250
-    // Check on THP scan sleep time.
845250
-    char *thp_scan_fname = "/sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs";
845250
-    int fd = open(thp_scan_fname, O_RDONLY, 0);
845250
+    char *thp_scan_fname = "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs";
845250
+    int fd = open(thp_scan_fname, O_RDWR, 0);
845250
     if (fd >= 0) {
845250
-        int ms;
845250
         char buf[BUF_SIZE];
845250
         int bytes = read(fd, buf, BUF_SIZE);
845250
-        close(fd);
845250
         if (bytes > 0) {
845250
+            buf[bytes] = '\0';
845250
+            int cur_ms;
845250
             char *p = buf;
845250
-            CONVERT_DIGITS_TO_NUM(p, ms);
845250
-            if (ms > 150) {
845250
-                fprintf(stderr, "\n");
845250
-                numad_log(LOG_NOTICE, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
845250
-                fprintf(stderr,       "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms);
845250
-                fprintf(stderr, "Consider increasing the frequency of THP scanning,\n");
845250
-                fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname);
845250
-                fprintf(stderr, "to more aggressively (re)construct THPs.  For example:\n");
845250
-                fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n");
845250
-                fprintf(stderr, "\n");
845250
+            CONVERT_DIGITS_TO_NUM(p, cur_ms);
845250
+            if (cur_ms != new_ms) {
845250
+                lseek(fd, 0, SEEK_SET);
845250
+                numad_log(LOG_NOTICE, "Changing THP scan time in %s from %d to %d ms.\n", thp_scan_fname, cur_ms, new_ms);
845250
+                sprintf(buf, "%d\n", new_ms);
845250
+                write(fd, buf, strlen(buf));
845250
             }
845250
         }
845250
+        close(fd);
845250
     }
845250
-    // FIXME: ?? check for enabled ksmd, and recommend disabling ksm?
845250
+}
845250
+
845250
+void check_prereqs(char *prog_name) {
845250
+    // Adjust kernel tunable to scan for THP more frequently...
845250
+    set_thp_scan_sleep_ms(thp_scan_sleep_ms);
845250
 }
845250
 
845250
 
845250
@@ -785,7 +804,6 @@ int get_daemon_pid() {
845250
     return pid; 
845250
 }
845250
 
845250
-
845250
 int register_numad_pid() {
845250
     int pid;
845250
     char buf[BUF_SIZE];
845250
@@ -831,6 +849,43 @@ fail_numad_run_file:
845250
 }
845250
 
845250
 
845250
+int count_set_bits_in_hex_list_file(char *fname) {
845250
+    int sum = 0;
845250
+    int fd = open(fname, O_RDONLY, 0);
845250
+    if (fd >= 0) {
845250
+        char buf[BUF_SIZE];
845250
+        int bytes = read(fd, buf, BUF_SIZE);
845250
+        close(fd);
845250
+        for (int ix = 0;  (ix < bytes);  ix++) {
845250
+            char c = tolower(buf[ix]);
845250
+            switch (c) {
845250
+                case '0'  : sum += 0; break;
845250
+                case '1'  : sum += 1; break;
845250
+                case '2'  : sum += 1; break;
845250
+                case '3'  : sum += 2; break;
845250
+                case '4'  : sum += 1; break;
845250
+                case '5'  : sum += 2; break;
845250
+                case '6'  : sum += 2; break;
845250
+                case '7'  : sum += 3; break;
845250
+                case '8'  : sum += 1; break;
845250
+                case '9'  : sum += 2; break;
845250
+                case 'a'  : sum += 2; break;
845250
+                case 'b'  : sum += 3; break;
845250
+                case 'c'  : sum += 2; break;
845250
+                case 'd'  : sum += 3; break;
845250
+                case 'e'  : sum += 3; break;
845250
+                case 'f'  : sum += 4; break;
845250
+                case ' '  : sum += 0; break;
845250
+                case ','  : sum += 0; break;
845250
+                case '\n' : sum += 0; break;
845250
+                default : numad_log(LOG_CRIT, "Unexpected character in list\n"); exit(EXIT_FAILURE);
845250
+            }
845250
+        }
845250
+    }
845250
+    return sum;
845250
+}
845250
+
845250
+
845250
 int get_num_cpus() {
845250
     int n1 = sysconf(_SC_NPROCESSORS_CONF);
845250
     int n2 = sysconf(_SC_NPROCESSORS_ONLN);
845250
@@ -848,7 +903,7 @@ int get_num_cpus() {
845250
 int get_num_kvm_vcpu_threads(int pid) {
845250
     // Try to return the number of vCPU threads for this VM guest,
845250
     // excluding the IO threads.  All failures return MAXINT.
845250
-    // FIXME: figure out some better way to do this...
845250
+    // FIXME: someday figure out some better way to do this...
845250
     char fname[FNAME_SIZE];
845250
     snprintf(fname, FNAME_SIZE, "/proc/%d/cmdline", pid);
845250
     int fd = open(fname, O_RDONLY, 0);
845250
@@ -876,8 +931,8 @@ int get_num_kvm_vcpu_threads(int pid) {
845250
 }
845250
 
845250
 
845250
-int get_huge_page_size_in_bytes() {
845250
-    int huge_page_size = 0;;
845250
+uint64_t get_huge_page_size_in_bytes() {
845250
+    uint64_t huge_page_size = 0;;
845250
     FILE *fs = fopen("/proc/meminfo", "r");
845250
     if (!fs) {
845250
         numad_log(LOG_CRIT, "Can't open /proc/meminfo\n");
845250
@@ -890,7 +945,7 @@ int get_huge_page_size_in_bytes() {
845250
             while ((!isdigit(*p)) && (p < buf + BUF_SIZE)) {
845250
                 p++;
845250
             }
845250
-            huge_page_size = atoi(p);
845250
+            huge_page_size = atol(p);
845250
             break;
845250
         }
845250
     }
845250
@@ -916,143 +971,134 @@ static int name_starts_with_digit(const
845250
 }
845250
 
845250
 
845250
-int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_list_p, id_list_p cpu_list_p) {
845250
-    // Check basic parameter validity.  
845250
-    if (pid <= 0) {
845250
+
845250
+#define BITS_IN_LONG (CHAR_BIT * sizeof(unsigned long))
845250
+#define   SET_BIT(i,a)   (a)[(i) / BITS_IN_LONG] |=  (1u << ((i) % BITS_IN_LONG))
845250
+#define  TEST_BIT(i,a) (((a)[(i) / BITS_IN_LONG] &   (1u << ((i) % BITS_IN_LONG))) != 0)
845250
+#define CLEAR_BIT(i,a)   (a)[(i) / BITS_IN_LONG] &= ~(1u << ((i) % BITS_IN_LONG))
845250
+
845250
+int bind_process_and_migrate_memory(process_data_p p) {
845250
+    uint64_t t0 = get_time_stamp();
845250
+    // Parameter p is a pointer to an element in the hash table
845250
+    if ((!p) || (p->pid < 1)) {
845250
         numad_log(LOG_CRIT, "Bad PID to bind\n");
845250
         exit(EXIT_FAILURE);
845250
     }
845250
-    if ((cpuset_name == NULL) || (strlen(cpuset_name) == 0)) {
845250
-        numad_log(LOG_CRIT, "Bad cpuset name to bind\n");
845250
-        exit(EXIT_FAILURE);
845250
-    }
845250
-    int nodes;
845250
-    if ((node_list_p == NULL) || ((nodes = NUM_IDS_IN_LIST(node_list_p)) == 0)) {
845250
-        numad_log(LOG_CRIT, "Cannot bind to unspecified node\n");
845250
+    if (!p->node_list_p) {
845250
+        numad_log(LOG_CRIT, "Cannot bind to unspecified node(s)\n");
845250
         exit(EXIT_FAILURE);
845250
     }
845250
-    // Cpu_list_p is optional and may be NULL...
845250
-    // Generate CPU id list from the specified node list if necessary
845250
-    if (cpu_list_p == NULL) {
845250
-        static id_list_p tmp_cpu_list_p;
845250
-        CLEAR_LIST(tmp_cpu_list_p);
845250
-        int node_id = 0;
845250
-        while (nodes) {
845250
-            if (ID_IS_IN_LIST(node_id, node_list_p)) {
845250
-                OR_LISTS(tmp_cpu_list_p, tmp_cpu_list_p, node[node_id].cpu_list_p);
845250
-                nodes -= 1;
845250
-            }
845250
-            node_id += 1;
845250
-        }
845250
-        cpu_list_p = tmp_cpu_list_p;
845250
-    }
845250
-    // Make the cpuset directory if necessary
845250
-    char cpuset_name_buf[FNAME_SIZE];
845250
-    snprintf(cpuset_name_buf, FNAME_SIZE, "%s%s", cpuset_dir, cpuset_name);
845250
-    char *p = &cpuset_name_buf[strlen(cpuset_dir)];
845250
-    if (!strcmp(p, "/")) {
845250
-        // Make a cpuset directory for this process
845250
-        snprintf(cpuset_name_buf, FNAME_SIZE, "%s/numad.%d", cpuset_dir, pid);
845250
-        numad_log(LOG_NOTICE, "Making new cpuset: %s\n", cpuset_name_buf);
845250
-        int rc = mkdir(cpuset_name_buf, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
845250
-        if (rc == -1) {
845250
-            numad_log(LOG_CRIT, "Bad cpuset mkdir -- errno: %d\n", errno);
845250
-            return 0;
845250
+    // Generate CPU list derived from target node list.
845250
+    static id_list_p cpu_bind_list_p;
845250
+    CLEAR_CPU_LIST(cpu_bind_list_p);
845250
+    int nodes = NUM_IDS_IN_LIST(p->node_list_p);
845250
+    int node_id = 0;
845250
+    while (nodes) {
845250
+        if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
845250
+            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p);
845250
+            nodes -= 1;
845250
         }
845250
+        node_id += 1;
845250
     }
845250
-    cpuset_name = cpuset_name_buf;
845250
-    // Now that we have a cpuset for pid and a populated cpulist,
845250
-    // start the actual binding and migration.
845250
-    uint64_t t0 = get_time_stamp();
845250
-
845250
-    // Write "1" out to cpuset.memory_migrate file
845250
     char fname[FNAME_SIZE];
845250
-    snprintf(fname, FNAME_SIZE, "%s/cpuset.memory_migrate", cpuset_name);
845250
-    int fd = open(fname, O_WRONLY | O_TRUNC, 0);
845250
-    if (fd == -1) {
845250
-        numad_log(LOG_CRIT, "Could not open cpuset.memory_migrate -- errno: %d\n", errno);
845250
-        return 0;
845250
-    }
845250
-    write(fd, "1", 1);
845250
-    close(fd);
845250
-
845250
-    // Write node IDs out to cpuset.mems file
845250
-    char node_list_buf[BUF_SIZE];
845250
-    snprintf(fname, FNAME_SIZE, "%s/cpuset.mems", cpuset_name);
845250
-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
845250
-    if (fd == -1) {
845250
-        numad_log(LOG_CRIT, "Could not open cpuset.mems -- errno: %d\n", errno);
845250
-        return 0;
845250
-    }
845250
-    int len = str_from_id_list(node_list_buf, BUF_SIZE, node_list_p);
845250
-    write(fd, node_list_buf, len);
845250
-    close(fd);
845250
-
845250
-    // Write CPU IDs out to cpuset.cpus file
845250
-    char cpu_list_buf[BUF_SIZE];
845250
-    snprintf(fname, FNAME_SIZE, "%s/cpuset.cpus", cpuset_name);
845250
-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
845250
-    if (fd == -1) {
845250
-        numad_log(LOG_CRIT, "Could not open cpuset.cpus -- errno: %d\n", errno);
845250
-        return 0;
845250
-    }
845250
-    len = str_from_id_list(cpu_list_buf, BUF_SIZE, cpu_list_p);
845250
-    write(fd, cpu_list_buf, len);
845250
-    close(fd);
845250
-
845250
-    // Copy pid tasks one at a time to tasks file
845250
-    snprintf(fname, FNAME_SIZE, "%s/tasks", cpuset_name);
845250
-    fd = open(fname, O_WRONLY | O_TRUNC, 0);
845250
-    if (fd == -1) {
845250
-        numad_log(LOG_CRIT, "Could not open tasks -- errno: %d\n", errno);
845250
-        return 0;
845250
-    }
845250
-    snprintf(fname, FNAME_SIZE, "/proc/%d/task", pid);
845250
     struct dirent **namelist;
845250
-    int files = scandir(fname, &namelist, name_starts_with_digit, NULL);
845250
-    if (files < 0) {
845250
-        numad_log(LOG_WARNING, "Could not scandir task list\n");
845250
+    snprintf(fname, FNAME_SIZE, "/proc/%d/task", p->pid);
845250
+    int num_tasks = scandir(fname, &namelist, name_starts_with_digit, NULL);
845250
+    if (num_tasks <= 0) {
845250
+        numad_log(LOG_WARNING, "Could not scandir task list for PID: %d\n", p->pid);
845250
         return 0;  // Assume the process terminated
845250
     }
845250
-    for (int ix = 0;  (ix < files);  ix++) {
845250
-        // copy pid tasks, one at a time
845250
-        numad_log(LOG_NOTICE, "Including task: %s\n", namelist[ix]->d_name);
845250
-        write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name));
845250
-        free(namelist[ix]);
845250
+    // Set the affinity of each task in the process...
845250
+    for (int namelist_ix = 0;  (namelist_ix < num_tasks);  namelist_ix++) {
845250
+        int tid = atoi(namelist[namelist_ix]->d_name);
845250
+        int rc = sched_setaffinity(tid, ID_LIST_BYTES(cpu_bind_list_p), ID_LIST_SET_P(cpu_bind_list_p));
845250
+        if (rc < 0) {
845250
+            // Check errno
845250
+            if (errno == ESRCH) {
845250
+                numad_log(LOG_WARNING, "Tried to move PID %d, TID %d, but it apparently went away.\n", p->pid, tid);
845250
+            }
845250
+            numad_log(LOG_ERR, "Bad sched_setaffinity() on PID %d, TID %d -- errno: %d\n", p->pid, tid, errno);
845250
+        }
845250
+        free(namelist[namelist_ix]);
845250
     }
845250
     free(namelist);
845250
-    close(fd);
845250
-
845250
-    uint64_t t1 = get_time_stamp();
845250
+    // Now move the memory to the target nodes....
845250
+    static unsigned long *dest_mask;
845250
+    static unsigned long *from_mask;
845250
+    static int allocated_bytes_in_masks;
845250
+    // Lie about num_nodes being one bigger because of kernel bug...
845250
+    int num_bytes_in_masks = (1 + ((num_nodes + 1) / BITS_IN_LONG)) * sizeof(unsigned long);
845250
+    if (allocated_bytes_in_masks < num_bytes_in_masks) {
845250
+        allocated_bytes_in_masks = num_bytes_in_masks;
845250
+        dest_mask = realloc(dest_mask, num_bytes_in_masks);
845250
+        from_mask = realloc(from_mask, num_bytes_in_masks);
845250
+        if ((dest_mask == NULL) || (from_mask == NULL)) {
845250
+            numad_log(LOG_CRIT, "bit mask malloc failed\n");
845250
+            exit(EXIT_FAILURE);
845250
+        }
845250
+    }
845250
+    // In an effort to put semi-balanced memory in each target node, move the
845250
+    // contents from the source node with the max amount of memory to the
845250
+    // destination node with the least amount of memory.  Repeat until done.
845250
+    int prev_from_node_id = -1;
845250
+    for (;;) {
845250
+        int min_dest_node_id = -1;
845250
+        int max_from_node_id = -1;
845250
+        for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
845250
+            node_id = node[node_ix].node_id;
845250
+            if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
845250
+                if ((min_dest_node_id < 0) || (p->process_MBs[min_dest_node_id] >= p->process_MBs[node_id])) {
845250
+                    // The ">=" above is intentional, so we tend to move memory to higher numbered nodes
845250
+                    min_dest_node_id = node_id;
845250
+                }
845250
+            } else {
845250
+                if ((max_from_node_id < 0) || (p->process_MBs[max_from_node_id] < p->process_MBs[node_id])) {
845250
+                    max_from_node_id = node_id;
845250
+                }
845250
+            }
845250
+        }
845250
+        if ((p->process_MBs[max_from_node_id] == 0) || (max_from_node_id == prev_from_node_id)) {
845250
+            break;
845250
+        }
845250
+        memset(dest_mask, 0, num_bytes_in_masks);
845250
+        memset(from_mask, 0, num_bytes_in_masks);
845250
+        SET_BIT(max_from_node_id, from_mask);
845250
+        SET_BIT(min_dest_node_id, dest_mask);
845250
+        numad_log(LOG_DEBUG, "Moving memory from node: %d to node %d\n", max_from_node_id, min_dest_node_id);
845250
+        // Lie about num_nodes being one bigger because of kernel bug...
845250
+        int rc = syscall(__NR_migrate_pages, p->pid, num_nodes + 1, from_mask, dest_mask);
845250
+        if (rc > 2) {
845250
+            // rc == the number of pages that could not be moved.  
845250
+            // A couple pages not moving is probably not a problem, hence ignoring rc == 1 or 2.
845250
+            numad_log(LOG_WARNING, "Tried to move PID %d, but %d pages would not move.\n", p->pid, rc);
845250
+        } else if (rc < 0) {
845250
+            // Check errno
845250
+            if (errno == ESRCH) {
845250
+                numad_log(LOG_WARNING, "Tried to move PID %d, but it apparently went away.\n", p->pid);
845250
+                return 0;  // Assume the process terminated
845250
+            }
845250
+        }
845250
+        // Assume memory did move for current accounting purposes...
845250
+        p->process_MBs[min_dest_node_id] += p->process_MBs[max_from_node_id];
845250
+        p->process_MBs[max_from_node_id] = 0;
845250
+        prev_from_node_id = max_from_node_id;
845250
+    }
845250
     // Check pid still active
845250
-    snprintf(fname, FNAME_SIZE, "/proc/%d", pid);
845250
+    snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
845250
     if (access(fname, F_OK) < 0) {
845250
-        numad_log(LOG_WARNING, "Could not migrate pid\n");
845250
-        return 0;  // Assume the process terminated
845250
+        numad_log(LOG_WARNING, "Could not migrate pid %d.  Apparently it went away.\n", p->pid);
845250
+        return 0;
845250
+    } else {
845250
+        uint64_t t1 = get_time_stamp();
845250
+        p->bind_time_stamp = t1;
845250
+        char node_list_str[BUF_SIZE];
845250
+        str_from_id_list(node_list_str, BUF_SIZE, p->node_list_p);
845250
+        numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", p->pid, node_list_str, (t1-t0)/100, (t1-t0)%100);
845250
+        return 1;
845250
     }
845250
-    numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", pid, node_list_buf, (t1-t0)/100, (t1-t0)%100);
845250
-    return 1;
845250
 }
845250
 
845250
 
845250
-void show_nodes() {
845250
-    time_t ts = time(NULL);
845250
-    fprintf(log_fs, "%s", ctime(&ts);;
845250
-    fprintf(log_fs, "Nodes: %d\n", num_nodes);
845250
-    for (int ix = 0;  (ix < num_nodes);  ix++) {
845250
-        fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld,  Distance: ", 
845250
-            ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
845250
-        for (int d = 0;  (d < num_nodes);  d++) {
845250
-            fprintf(log_fs, "%d ", node[ix].distance[d]);
845250
-        }
845250
-        char buf[BUF_SIZE];
845250
-        str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
845250
-        fprintf(log_fs, " CPUs: %s\n", buf);
845250
-    }
845250
-    fprintf(log_fs, "\n");
845250
-    fflush(log_fs);
845250
-}
845250
-
845250
 
845250
 typedef struct cpu_data {
845250
     uint64_t time_stamp;
845250
@@ -1062,10 +1108,9 @@ typedef struct cpu_data {
845250
 cpu_data_t cpu_data_buf[2];  // Two sets, to calc deltas
845250
 int cur_cpu_data_buf = 0;
845250
 
845250
-
845250
 void update_cpu_data() {
845250
     // Parse idle percents from CPU stats in /proc/stat cpu<N> lines
845250
-    static FILE *fs = NULL;
845250
+    static FILE *fs;
845250
     if (fs != NULL) {
845250
         rewind(fs);
845250
     } else {
845250
@@ -1107,14 +1152,14 @@ void update_cpu_data() {
845250
             while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; }  // skip nice
845250
             while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; }  // skip system
845250
             while (!isdigit(*p)) { p++; }
845250
-            uint64_t idle = *p++ - '0'; while (isdigit(*p)) { idle *= 10; idle += (*p++ - '0'); }
845250
+            uint64_t idle;
845250
+            CONVERT_DIGITS_TO_NUM(p, idle);
845250
             cpu_data_buf[new].idle[cpu_id] = idle;
845250
         }
845250
     }
845250
     cur_cpu_data_buf = new;
845250
 }
845250
 
845250
-
845250
 int node_and_digits(const struct dirent *dptr) {
845250
     char *p = (char *)(dptr->d_name);
845250
     if (*p++ != 'n') return 0;
845250
@@ -1129,10 +1174,31 @@ int node_and_digits(const struct dirent
845250
 }
845250
 
845250
 
845250
+uint64_t node_info_time_stamp = 0;
845250
 id_list_p all_cpus_list_p = NULL;
845250
 id_list_p all_nodes_list_p = NULL;
845250
-uint64_t node_info_time_stamp = 0;
845250
+id_list_p reserved_cpu_mask_list_p = NULL;
845250
+char *reserved_cpu_str = NULL;
845250
 
845250
+void show_nodes() {
845250
+    fprintf(log_fs, "\n");
845250
+    numad_log(LOG_INFO, "Nodes: %d\n", num_nodes);
845250
+    fprintf(log_fs, "Min CPUs free: %ld, Max CPUs: %ld, Avg CPUs: %ld, StdDev: %lg\n", 
845250
+        min_node_CPUs_free, max_node_CPUs_free, avg_node_CPUs_free, stddev_node_CPUs_free);
845250
+    fprintf(log_fs, "Min MBs free: %ld, Max MBs: %ld, Avg MBs: %ld, StdDev: %lg\n", 
845250
+        min_node_MBs_free, max_node_MBs_free, avg_node_MBs_free, stddev_node_MBs_free);
845250
+    for (int ix = 0;  (ix < num_nodes);  ix++) {
845250
+        fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld,  Distance: ", 
845250
+            ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
845250
+        for (int d = 0;  (d < num_nodes);  d++) {
845250
+            fprintf(log_fs, "%d ", node[ix].distance[d]);
845250
+        }
845250
+        char buf[BUF_SIZE];
845250
+        str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
845250
+        fprintf(log_fs, " CPUs: %s\n", buf);
845250
+    }
845250
+    fflush(log_fs);
845250
+}
845250
 
845250
 int update_nodes() {
845250
     char fname[FNAME_SIZE];
845250
@@ -1141,6 +1207,7 @@ int update_nodes() {
845250
     uint64_t time_stamp = get_time_stamp();
845250
 #define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED)
845250
     if ((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) {
845250
+        node_info_time_stamp = time_stamp;
845250
         // Count directory names of the form: /sys/devices/system/node/node<N>
845250
         struct dirent **namelist;
845250
         int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL);
845250
@@ -1167,8 +1234,15 @@ int update_nodes() {
845250
             }
845250
             num_nodes = num_files;
845250
         }
845250
-        CLEAR_LIST(all_cpus_list_p);
845250
-        CLEAR_LIST(all_nodes_list_p);
845250
+        sum_CPUs_total = 0;
845250
+        CLEAR_CPU_LIST(all_cpus_list_p);
845250
+        CLEAR_NODE_LIST(all_nodes_list_p);
845250
+        // Figure out how many threads per core there are (for later discounting of hyper-threads)
845250
+        threads_per_core = count_set_bits_in_hex_list_file("/sys/devices/system/cpu/cpu0/topology/thread_siblings");
845250
+        if (threads_per_core < 1) {
845250
+            numad_log(LOG_CRIT, "Could not count threads per core\n");
845250
+            exit(EXIT_FAILURE);
845250
+        }
845250
         // For each "node<N>" filename present, save <N> in node[ix].node_id
845250
         // Note that the node id might not necessarily match the node ix.
845250
         // Also populate the cpu lists and distance vectors for this node.
845250
@@ -1184,11 +1258,24 @@ int update_nodes() {
845250
             snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/cpulist", node_id);
845250
             int fd = open(fname, O_RDONLY, 0);
845250
             if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
845250
+                buf[BIG_BUF_SIZE - 1] = '\0';
845250
                 // get cpulist from the cpulist string
845250
-                CLEAR_LIST(node[node_ix].cpu_list_p);
845250
+                CLEAR_CPU_LIST(node[node_ix].cpu_list_p);
845250
                 int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf);
845250
+                if (reserved_cpu_str != NULL) {
845250
+                    AND_LISTS(node[node_ix].cpu_list_p, node[node_ix].cpu_list_p, reserved_cpu_mask_list_p);
845250
+                    n = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
845250
+                }
845250
                 OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p);
845250
-                node[node_ix].CPUs_total = n * ONE_HUNDRED;
845250
+                // Calculate total CPUs, but possibly discount hyper-threads
845250
+                if ((threads_per_core == 1) || (htt_percent >= 100)) {
845250
+                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
845250
+                } else {
845250
+                    n /= threads_per_core;
845250
+                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
845250
+                    node[node_ix].CPUs_total += n * (threads_per_core - 1) * htt_percent;
845250
+                }
845250
+                sum_CPUs_total += node[node_ix].CPUs_total;
845250
                 close(fd);
845250
             } else {
845250
                 numad_log(LOG_CRIT, "Could not get node cpu list\n");
845250
@@ -1220,15 +1307,30 @@ int update_nodes() {
845250
         }
845250
         free(namelist);
845250
     }
845250
-    // Second, get the dynamic free memory and available CPU capacity
845250
+    // Second, update the dynamic free memory and available CPU capacity
845250
+    while (cpu_data_buf[cur_cpu_data_buf].time_stamp + 7 >= time_stamp) {
845250
+        // Make sure at least 7/100 of a second has passed.
845250
+        // Otherwise sleep for 1/10 second.
845250
+	struct timespec ts = { 0, 100000000 }; 
845250
+	nanosleep(&ts, &ts);
845250
+	time_stamp = get_time_stamp();
845250
+    }
845250
     update_cpu_data();
845250
+    max_node_MBs_free = 0;
845250
+    max_node_CPUs_free = 0;
845250
+    min_node_MBs_free = MAXINT;
845250
+    min_node_CPUs_free = MAXINT;
845250
+    uint64_t sum_of_node_MBs_free = 0;
845250
+    uint64_t sum_of_node_CPUs_free = 0;
845250
     for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
845250
         int node_id = node[node_ix].node_id;
845250
         // Get available memory info from node<N>/meminfo file
845250
         snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id);
845250
         int fd = open(fname, O_RDONLY, 0);
845250
         if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
845250
+            close(fd);
845250
             uint64_t KB;
845250
+            buf[BIG_BUF_SIZE - 1] = '\0';
845250
             char *p = strstr(buf, "MemTotal:");
845250
             if (p != NULL) {
845250
                 p += 9;
845250
@@ -1238,7 +1340,11 @@ int update_nodes() {
845250
             }
845250
             while (!isdigit(*p)) { p++; }
845250
             CONVERT_DIGITS_TO_NUM(p, KB);
845250
-            node[node_ix].MBs_total = KB / KILOBYTE;
845250
+            node[node_ix].MBs_total = (KB / KILOBYTE);
845250
+            if (node[node_ix].MBs_total < 1) {
845250
+                // If a node has zero memory, remove it from the all_nodes_list...
845250
+                CLR_ID_IN_LIST(node_id, all_nodes_list_p);
845250
+            }
845250
             p = strstr(p, "MemFree:");
845250
             if (p != NULL) {
845250
                 p += 8;
845250
@@ -1248,8 +1354,28 @@ int update_nodes() {
845250
             }
845250
             while (!isdigit(*p)) { p++; }
845250
             CONVERT_DIGITS_TO_NUM(p, KB);
845250
-            node[node_ix].MBs_free = KB / KILOBYTE;
845250
-            close(fd);
845250
+            node[node_ix].MBs_free = (KB / KILOBYTE);
845250
+            if (use_inactive_file_cache) {
845250
+                // Add inactive file cache quantity to "free" memory
845250
+                p = strstr(p, "Inactive(file):");
845250
+                if (p != NULL) {
845250
+                    p += 15;
845250
+                } else {
845250
+                    numad_log(LOG_CRIT, "Could not get node Inactive(file)\n");
845250
+                    exit(EXIT_FAILURE);
845250
+                }
845250
+                while (!isdigit(*p)) { p++; }
845250
+                CONVERT_DIGITS_TO_NUM(p, KB);
845250
+                node[node_ix].MBs_free += (KB / KILOBYTE);
845250
+            }
845250
+            sum_of_node_MBs_free += node[node_ix].MBs_free;
845250
+            if (min_node_MBs_free > node[node_ix].MBs_free) {
845250
+                min_node_MBs_free = node[node_ix].MBs_free;
845250
+                min_node_MBs_free_ix = node[node_ix].node_id;
845250
+            }
845250
+            if (max_node_MBs_free < node[node_ix].MBs_free) {
845250
+                max_node_MBs_free = node[node_ix].MBs_free;
845250
+            }
845250
         } else {
845250
             numad_log(LOG_CRIT, "Could not get node meminfo\n");
845250
             exit(EXIT_FAILURE);
845250
@@ -1260,7 +1386,8 @@ int update_nodes() {
845250
         if (cpu_data_buf[old_cpu_data_buf].time_stamp > 0) {
845250
             uint64_t idle_ticks = 0;
845250
             int cpu = 0;
845250
-            int num_cpus_to_process = node[node_ix].CPUs_total / ONE_HUNDRED;
845250
+            int num_lcpus = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
845250
+            int num_cpus_to_process = num_lcpus;
845250
             while (num_cpus_to_process) {
845250
                 if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) {
845250
                     idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu]
845250
@@ -1274,15 +1401,46 @@ int update_nodes() {
845250
             // printf("Node: %d   CPUs: %ld   time diff %ld   Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks);
845250
             // assert(time_diff > 0);
845250
             node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff;
845250
+            // Possibly discount hyper-threads
845250
+            if ((threads_per_core > 1) && (htt_percent < 100)) {
845250
+                uint64_t htt_discount = (num_lcpus - (num_lcpus / threads_per_core)) * (100 - htt_percent);
845250
+                if (node[node_ix].CPUs_free > htt_discount) {
845250
+                    node[node_ix].CPUs_free -= htt_discount;
845250
+                } else {
845250
+                    node[node_ix].CPUs_free = 0;
845250
+                }
845250
+            }
845250
             if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) {
845250
                 node[node_ix].CPUs_free = node[node_ix].CPUs_total;
845250
             }
845250
+            sum_of_node_CPUs_free += node[node_ix].CPUs_free;
845250
+            if (min_node_CPUs_free > node[node_ix].CPUs_free) {
845250
+                min_node_CPUs_free = node[node_ix].CPUs_free;
845250
+                min_node_CPUs_free_ix = node[node_ix].node_id;
845250
+            }
845250
+            if (max_node_CPUs_free < node[node_ix].CPUs_free) {
845250
+                max_node_CPUs_free = node[node_ix].CPUs_free;
845250
+            }
845250
             node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free;
845250
         } else {
845250
             node[node_ix].CPUs_free = 0;
845250
             node[node_ix].magnitude = 0;
845250
         }
845250
     }
845250
+    avg_node_MBs_free = sum_of_node_MBs_free / num_nodes;
845250
+    avg_node_CPUs_free = sum_of_node_CPUs_free / num_nodes;
845250
+    double MBs_variance_sum = 0.0;
845250
+    double CPUs_variance_sum = 0.0;
845250
+    for (int node_ix = 0;  (node_ix < num_nodes);  node_ix++) {
845250
+        double MBs_diff = (double)node[node_ix].MBs_free - (double)avg_node_MBs_free;
845250
+        double CPUs_diff = (double)node[node_ix].CPUs_free - (double)avg_node_CPUs_free;
845250
+        MBs_variance_sum += MBs_diff * MBs_diff;
845250
+        CPUs_variance_sum += CPUs_diff * CPUs_diff;
845250
+    }
845250
+    double MBs_variance = MBs_variance_sum / (num_nodes);
845250
+    double CPUs_variance = CPUs_variance_sum / (num_nodes);
845250
+    stddev_node_MBs_free = sqrt(MBs_variance);
845250
+    stddev_node_CPUs_free = sqrt(CPUs_variance);
845250
     if (log_level >= LOG_INFO) {
845250
         show_nodes();
845250
     }
845250
@@ -1316,7 +1474,7 @@ typedef struct stat_data {
845250
     int64_t num_threads;  // 19
845250
     int64_t itrealvalue;
845250
     uint64_t starttime;
845250
-    uint64_t vsize;
845250
+    uint64_t vsize;       // 22
845250
     int64_t rss;          // 23
845250
     uint64_t rsslim;
845250
     uint64_t startcode;
845250
@@ -1356,15 +1514,16 @@ process_data_p get_stat_data_for_pid(int
845250
     }
845250
     static char buf[BUF_SIZE];
845250
     int bytes = read(fd, buf, BUF_SIZE);
845250
+    close(fd);
845250
     if (bytes < 50) {
845250
         numad_log(LOG_WARNING, "Could not read stat file: %s\n", fname);
845250
         return NULL;
845250
     }
845250
-    close(fd);
845250
+    uint64_t val;
845250
     char *p = buf;
845250
     static process_data_t data;
845250
     // Get PID from field 0
845250
-    uint64_t val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
845250
+    CONVERT_DIGITS_TO_NUM(p, val);
845250
     data.pid = val;
845250
     // Copy comm from field 1
845250
     while (*p == ' ') { p++; }
845250
@@ -1373,23 +1532,27 @@ process_data_p get_stat_data_for_pid(int
845250
     // Skip fields 2 through 12
845250
     for (int ix = 0;  (ix < 11);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
845250
     // Get utime from field 13 for cpu_util
845250
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
845250
+    CONVERT_DIGITS_TO_NUM(p, val);
845250
     data.cpu_util = val;
845250
     // Get stime from field 14 to add on to cpu_util (which already has utime)
845250
     while (*p == ' ') { p++; }
845250
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
845250
+    CONVERT_DIGITS_TO_NUM(p, val);
845250
     data.cpu_util += val;
845250
     // Skip fields 15 through 18
845250
     while (*p == ' ') { p++; }
845250
     for (int ix = 0;  (ix < 4);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
845250
     // Get num_threads from field 19
845250
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
845250
+    CONVERT_DIGITS_TO_NUM(p, val);
845250
     data.num_threads = val;
845250
-    // Skip fields 20 through 22
845250
+    // Skip fields 20 through 21
845250
     while (*p == ' ') { p++; }
845250
-    for (int ix = 0;  (ix < 3);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
845250
+    for (int ix = 0;  (ix < 2);  ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } }
845250
+    // Get vsize from field 22 to compute MBs_size
845250
+    CONVERT_DIGITS_TO_NUM(p, val);
845250
+    data.MBs_size = val / MEGABYTE;
845250
     // Get rss from field 23 to compute MBs_used
845250
-    val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); }
845250
+    while (*p == ' ') { p++; }
845250
+    CONVERT_DIGITS_TO_NUM(p, val);
845250
     data.MBs_used = (val * page_size_in_bytes) / MEGABYTE;
845250
     // Return pointer to data
845250
     return &dat;;
845250
@@ -1471,446 +1634,409 @@ int update_processes() {
845250
 }
845250
 
845250
 
845250
+int initialize_mem_node_list(process_data_p p) {
845250
+    // Parameter p is a pointer to an element in the hash table
845250
+    if ((!p) || (p->pid < 1)) {
845250
+        numad_log(LOG_CRIT, "Cannot initialize mem node lists with bad PID\n");
845250
+        exit(EXIT_FAILURE);
845250
+    }
845250
+    int n = 0;
845250
+    char fname[FNAME_SIZE];
845250
+    char buf[BIG_BUF_SIZE];
845250
+    p->process_MBs = NULL;
845250
+    CLEAR_NODE_LIST(p->node_list_p);
845250
+    snprintf(fname, FNAME_SIZE, "/proc/%d/status", p->pid);
845250
+    int fd = open(fname, O_RDONLY, 0);
845250
+    if (fd < 0) {
845250
+        numad_log(LOG_WARNING, "Tried to research PID %d, but it apparently went away.\n", p->pid);
845250
+        return 0;  // Assume the process terminated
845250
+    }
845250
+    int bytes = read(fd, buf, BIG_BUF_SIZE);
845250
+    close(fd);
845250
+    if (bytes <= 0) {
845250
+        numad_log(LOG_WARNING, "Tried to research PID %d, but cannot read status file.\n", p->pid);
845250
+        return 0;  // Assume the process terminated
845250
+    } else if (bytes >= BIG_BUF_SIZE) {
845250
+        buf[BIG_BUF_SIZE - 1] = '\0';
845250
+    } else {
845250
+        buf[bytes] = '\0';
845250
+    }
845250
+    char *list_str_p = strstr(buf, "Mems_allowed_list:");
845250
+    if (!list_str_p) {
845250
+        numad_log(LOG_CRIT, "Could not get node Mems_allowed_list\n");
845250
+        exit(EXIT_FAILURE);
845250
+    }
845250
+    list_str_p += 18;
845250
+    while (!isdigit(*list_str_p)) { list_str_p++; }
845250
+    n = add_ids_to_list_from_str(p->node_list_p, list_str_p);
845250
+    if (n < num_nodes) {
845250
+        // If process already bound to a subset of nodes when we discover it,
845250
+        // set initial bind_time_stamp to 30 minutes ago...
845250
+        p->bind_time_stamp = get_time_stamp() - (1800 * ONE_HUNDRED);
845250
+    }
845250
+    return n;
845250
+}
845250
 
845250
-id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
845250
-    char buf[BUF_SIZE];
845250
-    char buf2[BUF_SIZE];
845250
+
845250
+uint64_t combined_value_of_weighted_resources(int ix, int mbs, int cpus, uint64_t MBs_free, uint64_t CPUs_free) {
845250
+    int64_t needed_mem;
845250
+    int64_t needed_cpu;
845250
+    int64_t excess_mem;
845250
+    int64_t excess_cpu;
845250
+    if (MBs_free > mbs) {
845250
+        needed_mem = mbs;
845250
+        excess_mem = MBs_free - mbs;
845250
+    } else {
845250
+        needed_mem = MBs_free;
845250
+        excess_mem = 0;
845250
+    }
845250
+    if (CPUs_free > cpus) {
845250
+        needed_cpu = cpus;
845250
+        excess_cpu = CPUs_free - cpus;
845250
+    } else {
845250
+        needed_cpu = CPUs_free;
845250
+        excess_cpu = 0;
845250
+    }
845250
+    // Weight the available resources, and then calculate magnitude as
845250
+    // product of available CPUs and available MBs.
845250
+    int64_t memfactor = (needed_mem * 10 + excess_mem * 4);
845250
+    int64_t cpufactor = (needed_cpu *  6 + excess_cpu * 1);
845250
+    numad_log(LOG_DEBUG, "    Node[%d]: mem: %ld  cpu: %ld\n", ix, memfactor, cpufactor);
845250
+    return (memfactor * cpufactor);
845250
+}
845250
+
845250
+
845250
+id_list_p pick_numa_nodes(int pid, int cpus, int mbs, int assume_enough_cpus) {
845250
     if (log_level >= LOG_DEBUG) {
845250
         numad_log(LOG_DEBUG, "PICK NODES FOR:  PID: %d,  CPUs %d,  MBs %d\n", pid, cpus, mbs);
845250
     }
845250
-    int num_existing_mems = 0;
845250
-    static id_list_p existing_mems_list_p;
845250
-    CLEAR_LIST(existing_mems_list_p);
845250
-    uint64_t time_stamp = get_time_stamp();
845250
-    static node_data_p tmp_node;
845250
-    static uint64_t *process_MBs;
845250
-    static uint64_t *saved_magnitude_for_node;
845250
-    static int process_MBs_num_nodes;
845250
-    // See if dynamic structures need to grow.
845250
-    if (process_MBs_num_nodes < num_nodes + 1) {
845250
-        process_MBs_num_nodes = num_nodes + 1;
845250
-        // The "+1 node" is for accumulating interleaved memory
845250
-        process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t));
845250
-        tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
845250
-        saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t));
845250
-        if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) {
845250
-            numad_log(LOG_CRIT, "process_MBs realloc failed\n");
845250
-            exit(EXIT_FAILURE);
845250
-        }
845250
-    }
845250
+    char buf[BUF_SIZE];
845250
+    uint64_t proc_avg_node_CPUs_free = 0;
845250
     // For existing processes, get miscellaneous process specific details
845250
     int pid_ix;
845250
     process_data_p p = NULL;
845250
     if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) {
845250
         p = &process_hash_table[pid_ix];
845250
-        // Quick rejection if this process has interleaved memory, but recheck it once an hour...
845250
-#define MIN_DELAY_FOR_INTERLEAVE (3600 * ONE_HUNDRED)
845250
-        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
845250
-          && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
845250
-            if (log_level >= LOG_DEBUG) {
845250
-                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
845250
-            }
845250
-            return NULL;
845250
-        }
845250
-        // Get cpuset name for this process, and existing mems binding, if any.
845250
+        // Add up per-node memory in use by this process.
845250
+        // This scanning is expensive and should be minimized.
845250
         char fname[FNAME_SIZE];
845250
-        snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", pid);
845250
-        FILE *fs = fopen(fname, "r");
845250
-        if (!fs) {
845250
-            numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
845250
-            return NULL;  // Assume the process terminated?
845250
-        }
845250
-        if (!fgets(buf, BUF_SIZE, fs)) {
845250
-            numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid);
845250
-            fclose(fs);
845250
-            return NULL;  // Assume the process terminated?
845250
-        }
845250
-        fclose(fs);
845250
-        ELIM_NEW_LINE(buf);
845250
-        if ((!p->cpuset_name) || (strcmp(p->cpuset_name, buf))) {
845250
-            if (p->cpuset_name != NULL) {
845250
-                free(p->cpuset_name);
845250
-            }
845250
-            p->cpuset_name = strdup(buf);
845250
-        }
845250
-        if (log_level >= LOG_DEBUG) {
845250
-            numad_log(LOG_DEBUG, "CPUSET_NAME: %s\n", p->cpuset_name);
845250
-        }
845250
-        snprintf(fname, FNAME_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
845250
-        fs = fopen(fname, "r");
845250
-        if ((fs) && (fgets(buf, BUF_SIZE, fs))) {
845250
-            fclose(fs);
845250
-            num_existing_mems = add_ids_to_list_from_str(existing_mems_list_p, buf);
845250
-            if (log_level >= LOG_DEBUG) {
845250
-                str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
845250
-                numad_log(LOG_DEBUG, "EXISTING CPUSET NODE LIST: %s\n", buf);
845250
-            }
845250
-        } 
845250
-        // If this process was just recently bound, enforce a minimum delay
845250
-        // period between repeated attempts to potentially move the memory.
845250
-        // FIXME: ?? might this retard appropriate process expansion too much?  
845250
-#define MIN_DELAY_FOR_REEVALUATION (30 * ONE_HUNDRED)
845250
-        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
845250
-            // Skip re-evaluation because we just did it recently.
845250
-            if (log_level >= LOG_DEBUG) {
845250
-                numad_log(LOG_DEBUG, "Skipping evaluation because done too recently.\n");
845250
-            }
845250
-            return NULL;
845250
-        }
845250
-        // Look for short cut because of duplicate bindings.  If we have bound
845250
-        // this process to the same nodes multiple times already, and the load
845250
-        // on those nodes still seems acceptable, skip the rest of this and
845250
-        // just return NULL to indicate no change needed.  FIXME: should figure
845250
-        // out what can change that would make a rebinding desirable (e.g. (1)
845250
-        // some process gets sub-optimal allocation on busy machine which
845250
-        // subsequently becomes less busy leaving disadvantaged process. (2)
845250
-        // node load imbalance, (3) any process split across nodes which should
845250
-        // fit within a single node.) For now, just expire the dup_bid_count
845250
-        // occasionally, which is a reasonably good mitigation.
845250
-        // So, check to see if we should decay the dup_bind_count...
845250
-#define DUP_BIND_TIME_OUT (300 * ONE_HUNDRED)
845250
-        if ((p->dup_bind_count > 0) && (p->bind_time_stamp + DUP_BIND_TIME_OUT < time_stamp)) {
845250
-            p->dup_bind_count -= 1;
845250
-        }
845250
-        // Now, look for short cut because of duplicate bindings
845250
-        if (p->dup_bind_count > 0) {
845250
-            int node_id = 0;
845250
-            int nodes_have_cpu = 1;
845250
-            int nodes_have_ram = 1;
845250
-            int n = num_existing_mems;
845250
-            int min_resource_pct = 100 - target_utilization;
845250
-            if (min_resource_pct < 5) {
845250
-                min_resource_pct = 5;
845250
-            }
845250
-            while (n) {
845250
-                if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
845250
-                    nodes_have_cpu &= ((100 * node[node_id].CPUs_free / node[node_id].CPUs_total) >= (min_resource_pct));
845250
-                    nodes_have_ram &= ((100 * node[node_id].MBs_free  / node[node_id].MBs_total)  >= (min_resource_pct));
845250
-                    n -= 1;
845250
-                }
845250
-                node_id += 1;
845250
-            }
845250
-            if ((nodes_have_cpu) && (nodes_have_ram)) {
845250
-                if (log_level >= LOG_DEBUG) {
845250
-                    numad_log(LOG_DEBUG, "Skipping evaluation because of repeat binding\n");
845250
-                }
845250
-                return NULL;
845250
-            }
845250
-            if (log_level >= LOG_DEBUG) {
845250
-                numad_log(LOG_DEBUG, "Evaluated for skipping by repeat binding, but CPUS: %d, RAM: %d\n", nodes_have_cpu, nodes_have_ram);
845250
-            }
845250
-        }
845250
-        // Fourth, add up per-node memory in use by this process. This scanning
845250
-        // is expensive and should be minimized.  Also, old kernels dismantle
845250
-        // transparent huge pages while producing the numa_maps memory
845250
-        // information! 
845250
-        memset(process_MBs, 0, process_MBs_num_nodes * sizeof(uint64_t));
845250
         snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid);
845250
-        fs = fopen(fname, "r");
845250
+        FILE *fs = fopen(fname, "r");
845250
         if (!fs) {
845250
             numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid);
845250
             return NULL;  // Assume the process terminated
845250
         }
845250
+        // Allocate and zero per node memory array.
845250
+        // The "+1 node" is for accumulating interleaved memory
845250
+        p->process_MBs = realloc(p->process_MBs, (num_nodes + 1) * sizeof(uint64_t));
845250
+        if (p->process_MBs == NULL) {
845250
+            numad_log(LOG_CRIT, "p->process_MBs realloc failed\n");
845250
+            exit(EXIT_FAILURE);
845250
+        }
845250
+        memset(p->process_MBs, 0, (num_nodes + 1) * sizeof(uint64_t));
845250
         int process_has_interleaved_memory = 0;
845250
         while (fgets(buf, BUF_SIZE, fs)) {
845250
             int interleaved_memory = 0;
845250
             uint64_t page_size = page_size_in_bytes;
845250
             const char *delimiters = " \n";
845250
-            char *p = strtok(buf, delimiters);
845250
-            while (p) {
845250
-                if (!strncmp(p, "interleave", 10)) {
845250
+            char *str_p = strtok(buf, delimiters);
845250
+            while (str_p) {
845250
+                if (!strncmp(str_p, "interleave", 10)) {
845250
                     interleaved_memory = 1;
845250
                     process_has_interleaved_memory = 1;
845250
-                } else if (!strcmp(p, "huge")) {
845250
+                } else if (!strcmp(str_p, "huge")) {
845250
                     page_size = huge_page_size_in_bytes;
845250
-                } else if (*p++ == 'N') {
845250
+                } else if (*str_p++ == 'N') {
845250
                     int node;
845250
                     uint64_t pages;
845250
-                    CONVERT_DIGITS_TO_NUM(p, node);
845250
-                    if (*p++ != '=') {
845250
+                    CONVERT_DIGITS_TO_NUM(str_p, node);
845250
+                    if (*str_p++ != '=') {
845250
                         numad_log(LOG_CRIT, "numa_maps node number parse error\n");
845250
                         exit(EXIT_FAILURE);
845250
                     }
845250
-                    CONVERT_DIGITS_TO_NUM(p, pages);
845250
-                    process_MBs[node] += (pages * page_size);
845250
+                    CONVERT_DIGITS_TO_NUM(str_p, pages);
845250
+                    p->process_MBs[node] += (pages * page_size);
845250
                     if (interleaved_memory) {
845250
                         // sum interleaved quantity in "extra node"
845250
-                        process_MBs[num_nodes] += (pages * page_size);
845250
+                        p->process_MBs[num_nodes] += (pages * page_size);
845250
                     }
845250
                 }
845250
                 // Get next token on the line
845250
-                p = strtok(NULL, delimiters);
845250
+                str_p = strtok(NULL, delimiters);
845250
             }
845250
         }
845250
         fclose(fs);
845250
+        proc_avg_node_CPUs_free = p->CPUs_used;
845250
         for (int ix = 0;  (ix <= num_nodes);  ix++) {
845250
-            process_MBs[ix] /= MEGABYTE;
845250
-            if (log_level >= LOG_DEBUG) {
845250
-                numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]);
845250
+            p->process_MBs[ix] /= MEGABYTE;
845250
+            if ((log_level >= LOG_DEBUG) && (p->process_MBs[ix] > 0)) {
845250
+                if (ix == num_nodes) {
845250
+                    numad_log(LOG_DEBUG, "Interleaved MBs: %ld\n", ix, p->process_MBs[ix]);
845250
+                } else {
845250
+                    numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, p->process_MBs[ix]);
845250
+                }
845250
+            }
845250
+            if (ID_IS_IN_LIST(ix, p->node_list_p)) {
845250
+                proc_avg_node_CPUs_free += node[ix].CPUs_free;
845250
             }
845250
         }
845250
+        proc_avg_node_CPUs_free /= NUM_IDS_IN_LIST(p->node_list_p);
845250
         if ((process_has_interleaved_memory) && (keep_interleaved_memory)) {
845250
             // Mark this process as having interleaved memory so we do not
845250
-            // merge the interleaved memory.  Time stamp it as done.
845250
+            // merge the interleaved memory.  Time stamp it as done and return.
845250
             p->flags |= PROCESS_FLAG_INTERLEAVED;
845250
             p->bind_time_stamp = get_time_stamp();
845250
             if (log_level >= LOG_DEBUG) {
845250
-                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
845250
+                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
845250
             }
845250
             return NULL;
845250
         }
845250
     }  // end of existing PID conditional
845250
     // Make a copy of node available resources array.  Add in info specific to
845250
     // this process to equalize available resource quantities wrt locations of
845250
-    // resources already in use by this process.  Inflate the value of already
845250
-    // assigned memory by approximately 3/2, because moving memory is
845250
-    // expensive.  Average the amount of CPUs_free across the existing nodes
845250
-    // used, because the threads are free to move around in that domain.  After
845250
-    // calculating combined magnitude of available resources, bias the values
845250
-    // towards existing locations for this process.
845250
-    int target_using_all_nodes = 0;
845250
-    uint64_t node_CPUs_free_for_this_process = 0;
845250
-    memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
845250
-    if (num_existing_mems > 0) {
845250
-        node_CPUs_free_for_this_process = cpus; // ?? Correct for utilization target inflation?
845250
-        int node_id = 0;
845250
-        int n = num_existing_mems;
845250
-        while (n) {
845250
-            if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
845250
-                node_CPUs_free_for_this_process += tmp_node[node_id].CPUs_free;
845250
-                n -= 1;
845250
-            }
845250
-            node_id += 1;
845250
-        }
845250
-        // Divide to get average CPUs_free for the nodes in use by process
845250
-        node_CPUs_free_for_this_process /= num_existing_mems;
845250
+    // resources already in use by this process.
845250
+    static node_data_p tmp_node;
845250
+    tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
845250
+    if (tmp_node == NULL) {
845250
+        numad_log(LOG_CRIT, "tmp_node realloc failed\n");
845250
+        exit(EXIT_FAILURE);
845250
     }
845250
+    memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
845250
+    uint64_t sum_of_node_CPUs_free = 0;
845250
     for (int ix = 0;  (ix < num_nodes);  ix++) {
845250
         if (pid > 0) {
845250
-            tmp_node[ix].MBs_free  += ((process_MBs[ix] * 12) / 8);
845250
-        }
845250
-        if ((num_existing_mems > 0) && (ID_IS_IN_LIST(ix, existing_mems_list_p))) {
845250
-            tmp_node[ix].CPUs_free = node_CPUs_free_for_this_process;
845250
-        }
845250
-        if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
845250
-            tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
845250
-        }
845250
-        if (log_level >= LOG_DEBUG) {
845250
-            numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, tmp_node[ix].CPUs_free);
845250
+            if (NUM_IDS_IN_LIST(p->node_list_p) >= num_nodes) {
845250
+                // Process not yet bound to a subset of nodes.
845250
+                // Add back memory used by this process on this node.
845250
+                tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 17) / 16);  // Apply light mem bias
845250
+                // Add back CPU used by this process in proportion to the memory used on this node.
845250
+                tmp_node[ix].CPUs_free += ((p->CPUs_used * p->process_MBs[ix]) / p->MBs_used);
845250
+            } else {
845250
+                // If the process is currently running on less than all the
845250
+                // nodes, first add back (biased) memory already used by this
845250
+                // process on this node, then assign average process CPU / node
845250
+                // for this process iff the process is present on this node.
845250
+                tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 5) / 4);  // Apply heavy mem bias
845250
+                if (ID_IS_IN_LIST(ix, p->node_list_p)) {
845250
+                    tmp_node[ix].CPUs_free = proc_avg_node_CPUs_free;
845250
+                }
845250
+            }
845250
+            sum_of_node_CPUs_free += tmp_node[ix].CPUs_free;
845250
+            if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
845250
+                tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
845250
+            }
845250
+            if (tmp_node[ix].MBs_free > tmp_node[ix].MBs_total) {
845250
+                tmp_node[ix].MBs_free = tmp_node[ix].MBs_total;
845250
+            }
845250
         }
845250
-        // Calculate magnitude as product of available CPUs and available MBs
845250
-        tmp_node[ix].magnitude = tmp_node[ix].CPUs_free * tmp_node[ix].MBs_free;
845250
-        // Bias combined magnitude towards already assigned nodes
845250
-        if (ID_IS_IN_LIST(ix, existing_mems_list_p)) {
845250
-            tmp_node[ix].magnitude *= 9;
845250
-            tmp_node[ix].magnitude /= 8;
845250
+        // Enforce 1/100th CPU minimum
845250
+        if (tmp_node[ix].CPUs_free < 1) {
845250
+            tmp_node[ix].CPUs_free = 1;
845250
         }
845250
-        // Save the current magnitudes
845250
-        saved_magnitude_for_node[ix] = tmp_node[ix].magnitude;
845250
+        // numad_log(LOG_DEBUG, "Raw Node[%d]: mem: %ld  cpu: %ld\n", ix, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
845250
+        tmp_node[ix].magnitude = combined_value_of_weighted_resources(ix, mbs, cpus, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free);
845250
     }
845250
-    // OK, figure out where to get resources for this request.
845250
+    // Now figure out where to get resources for this request....
845250
     static id_list_p target_node_list_p;
845250
-    CLEAR_LIST(target_node_list_p);
845250
-    int prev_node_used = -1;
845250
-    // Continue to allocate more resources until request are met.
845250
-    // OK if not not quite all the CPU request is met.
845250
-    // FIXME: ?? Is half of the utilization margin a good amount of CPU flexing?
845250
-    int cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200; 
845250
-    if (pid <= 0) {
845250
-        // If trying to find resources for pre-placement advice request, do not
845250
-        // underestimate the amount of CPUs needed.  Instead, err on the side
845250
-        // of providing too many resources.  So, no flexing here...
845250
-        cpu_flex = 0;
845250
-    }
845250
-    while ((mbs > 0) || (cpus > cpu_flex)) {
845250
-        if (log_level >= LOG_DEBUG) {
845250
-            numad_log(LOG_DEBUG, "MBs: %d,  CPUs: %d\n", mbs, cpus);
845250
+    CLEAR_NODE_LIST(target_node_list_p);
845250
+    if ((pid > 0) && (cpus > sum_of_node_CPUs_free)) {
845250
+        // System CPUs might be oversubscribed, but...
845250
+        assume_enough_cpus = 1;
845250
+        // and rely on available memory for placement.
845250
+    }
845250
+    // Establish a CPU flex fudge factor, on the presumption it is OK if not
845250
+    // quite all the CPU request is met.  However, if trying to find resources
845250
+    // for pre-placement advice request, do not underestimate the amount of
845250
+    // CPUs needed.  Instead, err on the side of providing too many resources.
845250
+    int cpu_flex = 0;
845250
+    if ((pid > 0) && (target_utilization < 100)) {
845250
+        // FIXME: Is half of the utilization margin a good amount of CPU flexing?
845250
+        cpu_flex = ((100 - target_utilization) * node[0].CPUs_total) / 200;
845250
+    }
845250
+    // Figure out minimum number of nodes required
845250
+    int mem_req_nodes = ceil((double)mbs  / (double)node[0].MBs_total);
845250
+    int cpu_req_nodes = ceil((double)(cpus - cpu_flex) / (double)node[0].CPUs_total); 
845250
+    int min_req_nodes = mem_req_nodes;
845250
+    if (min_req_nodes < cpu_req_nodes) {
845250
+        min_req_nodes = cpu_req_nodes;
845250
+    }
845250
+    if (min_req_nodes > num_nodes) {
845250
+        min_req_nodes = num_nodes;
845250
+    }
845250
+    // Use an index to sort NUMA connected resource chain for each node
845250
+    int index[num_nodes];
845250
+    uint64_t totmag[num_nodes];
845250
+    for (int ix = 0;  (ix < num_nodes);  ix++) {
845250
+        // Reset the index each time
845250
+        for (int n = 0;  (n < num_nodes);  n++) {
845250
+            index[n] = n;
845250
         }
845250
-        // Sort nodes by magnitude of available resources.  Note that
845250
-        // inter-node distances (to the previous node used) are factored into
845250
-        // the sort.
845250
+        // Sort by minimum relative NUMA distance from node[ix],
845250
+        // breaking distance ties with magnitude of available resources
845250
         for (int ij = 0;  (ij < num_nodes);  ij++) {
845250
-            int big_ix = ij;
845250
+            int best_ix = ij;
845250
             for (int ik = ij + 1;  (ik < num_nodes);  ik++) {
845250
-                uint64_t ik_dist = 1;
845250
-                uint64_t big_ix_dist = 1;
845250
-                if (prev_node_used >= 0) {
845250
-                    ik_dist = tmp_node[ik].distance[prev_node_used];
845250
-                    big_ix_dist = tmp_node[big_ix].distance[prev_node_used];
845250
-                }
845250
-                // Scale magnitude comparison by distances to previous node used...
845250
-                if ((tmp_node[big_ix].magnitude / big_ix_dist) < (tmp_node[ik].magnitude / ik_dist)) {
845250
-                    big_ix = ik;
845250
-                }
845250
-            }
845250
-            if (big_ix != ij) {
845250
-                node_data_t tmp;
845250
-                memcpy((void *)&tmp, (void *)&tmp_node[ij], sizeof(node_data_t) );
845250
-                memcpy((void *)&tmp_node[ij], (void *)&tmp_node[big_ix], sizeof(node_data_t) );
845250
-                memcpy((void *)&tmp_node[big_ix], (void *)&tmp, sizeof(node_data_t) );
845250
+                int ik_dist = tmp_node[index[ik]].distance[ix];
845250
+                int best_ix_dist = tmp_node[index[best_ix]].distance[ix];
845250
+                if (best_ix_dist > ik_dist) {
845250
+                    best_ix = ik;
845250
+                } else if (best_ix_dist == ik_dist) {
845250
+                    if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) {
845250
+                        best_ix = ik;
845250
+                    }
845250
+                }
845250
+            }
845250
+            if (best_ix != ij) {
845250
+                int tmp = index[ij];
845250
+                index[ij] = index[best_ix];
845250
+                index[best_ix] = tmp;
845250
             }
845250
         }
845250
+#if 0
845250
         if (log_level >= LOG_DEBUG) {
845250
-            for (int ix = 0;  (ix < num_nodes);  ix++) {
845250
-                numad_log(LOG_DEBUG, "Sorted magnitude[%d]: %ld\n", tmp_node[ix].node_id, tmp_node[ix].magnitude);
845250
+            for (int iq = 0;  (iq < num_nodes);  iq++) {
845250
+                numad_log(LOG_DEBUG, "Node: %d  Dist: %d  Magnitude: %ld\n",
845250
+                    tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[ix], tmp_node[index[iq]].magnitude);
845250
+            }
845250
+        }
845250
+#endif
845250
+        // Save the totmag[] sum of the magnitudes of expected needed nodes,
845250
+        // "normalized" by NUMA distance (by dividing each magnitude by the
845250
+        // relative distance squared).
845250
+        totmag[ix] = 0;
845250
+        for (int ij = 0;  (ij < min_req_nodes);  ij++) {
845250
+            int dist = tmp_node[index[ij]].distance[ix];
845250
+            totmag[ix] += (tmp_node[index[ij]].magnitude / (dist * dist));
845250
+        }
845250
+        numad_log(LOG_DEBUG, "Totmag[%d]: %ld\n", ix, totmag[ix]);
845250
+    }
845250
+    // Now find the best NUMA node based on the normalized sum of node
845250
+    // magnitudes expected to be used.
845250
+    int best_node_ix = 0;
845250
+    for (int ix = 0;  (ix < num_nodes);  ix++) {
845250
+        if (totmag[best_node_ix] < totmag[ix]) {
845250
+            best_node_ix = ix;
845250
+        }
845250
+    }
845250
+    numad_log(LOG_DEBUG, "best_node_ix: %d\n", best_node_ix);
845250
+    // Reset sorting index again
845250
+    for (int n = 0;  (n < num_nodes);  n++) {
845250
+        index[n] = n;
845250
+    }
845250
+    // Sort index by distance from node[best_node_ix],
845250
+    // breaking distance ties with magnitude
845250
+    for (int ij = 0;  (ij < num_nodes);  ij++) {
845250
+        int best_ix = ij;
845250
+        for (int ik = ij + 1;  (ik < num_nodes);  ik++) {
845250
+            int ik_dist = tmp_node[index[ik]].distance[best_node_ix];
845250
+            int best_ix_dist = tmp_node[index[best_ix]].distance[best_node_ix];
845250
+            if (best_ix_dist > ik_dist) {
845250
+                best_ix = ik;
845250
+            } else if (best_ix_dist == ik_dist) {
845250
+                if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) {
845250
+                    best_ix = ik;
845250
+                }
845250
             }
845250
         }
845250
-        if (tmp_node[0].node_id == prev_node_used) {
845250
-            // Hmmm.  Looks like the best node for more resources, is also the
845250
-            // last one we used.  This is not going to make progress...  So
845250
-            // just punt and use everything.
845250
-            OR_LISTS(target_node_list_p, target_node_list_p, all_nodes_list_p);
845250
-            target_using_all_nodes = 1;
845250
-            break;
845250
+        if (best_ix != ij) {
845250
+            int tmp = index[ij];
845250
+            index[ij] = index[best_ix];
845250
+            index[best_ix] = tmp;
845250
+        }
845250
+    }
845250
+    if (log_level >= LOG_DEBUG) {
845250
+        for (int iq = 0;  (iq < num_nodes);  iq++) {
845250
+            numad_log(LOG_DEBUG, "Node: %d  Dist: %d  Magnitude: %ld\n",
845250
+                tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[best_node_ix], tmp_node[index[iq]].magnitude);
845250
         }
845250
-        prev_node_used = tmp_node[0].node_id;
845250
-        ADD_ID_TO_LIST(tmp_node[0].node_id, target_node_list_p);
845250
+    }
845250
+    // Allocate more resources until request is met.
845250
+    best_node_ix = 0;
845250
+    while ((min_req_nodes > 0) || (mbs > 0) || ((cpus > cpu_flex) && (!assume_enough_cpus))) {
845250
         if (log_level >= LOG_DEBUG) {
845250
-            str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
845250
-            str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
845250
-            numad_log(LOG_DEBUG, "Existing nodes: %s  Target nodes: %s\n", buf, buf2);
845250
+            numad_log(LOG_DEBUG, "MBs: %d,  CPUs: %d\n", mbs, cpus);
845250
         }
845250
+        numad_log(LOG_DEBUG, "Assigning resources from node %d\n", index[best_node_ix]);
845250
+        ADD_ID_TO_LIST(tmp_node[index[best_node_ix]].node_id, target_node_list_p);
845250
+        min_req_nodes -= 1;
845250
         if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) {
845250
             // Apparently we must use all resource nodes...
845250
-            target_using_all_nodes = 1;
845250
             break;
845250
         }
845250
-#define MBS_MARGIN 10
845250
-        if (tmp_node[0].MBs_free >= (mbs + MBS_MARGIN)) {
845250
-            tmp_node[0].MBs_free -= mbs;
845250
+        // "Consume" the resources on this node
845250
+#define CPUS_MARGIN 0
845250
+#define MBS_MARGIN 100
845250
+        if (tmp_node[index[best_node_ix]].MBs_free >= (mbs + MBS_MARGIN)) {
845250
+            tmp_node[index[best_node_ix]].MBs_free -= mbs;
845250
             mbs = 0;
845250
         } else {
845250
-            mbs -= (tmp_node[0].MBs_free - MBS_MARGIN);
845250
-            tmp_node[0].MBs_free = MBS_MARGIN;
845250
+            mbs -= (tmp_node[index[best_node_ix]].MBs_free - MBS_MARGIN);
845250
+            tmp_node[index[best_node_ix]].MBs_free = MBS_MARGIN;
845250
         }
845250
-#define CPUS_MARGIN 0
845250
-        if (tmp_node[0].CPUs_free >= (cpus + CPUS_MARGIN)) {
845250
-            tmp_node[0].CPUs_free -= cpus;
845250
+        if (tmp_node[index[best_node_ix]].CPUs_free >= (cpus + CPUS_MARGIN)) {
845250
+            tmp_node[index[best_node_ix]].CPUs_free -= cpus;
845250
             cpus = 0;
845250
         } else {
845250
-            cpus -= (tmp_node[0].CPUs_free - CPUS_MARGIN);
845250
-            tmp_node[0].CPUs_free = CPUS_MARGIN;
845250
-        }
845250
-        tmp_node[0].magnitude = tmp_node[0].CPUs_free * tmp_node[0].MBs_free;
845250
-    }
845250
-    // If this existing process is already located where we want it, and almost
845250
-    // all memory is already moved to those nodes, then return NULL indicating
845250
-    // no need to change binding this time.
845250
-    if ((pid > 0) && (EQUAL_LISTS(target_node_list_p, existing_mems_list_p))) {
845250
-        // May not need to change binding.  However, if there is any significant
845250
-        // memory still on non-target nodes, advise the bind anyway because
845250
-        // there are some scenarios when the kernel will not move it all the
845250
-        // first time.
845250
-        if (!target_using_all_nodes) {
845250
-            p->dup_bind_count += 1;
845250
-            for (int ix = 0;  (ix < num_nodes);  ix++) {
845250
-                if ((process_MBs[ix] > 10) && (!ID_IS_IN_LIST(ix, target_node_list_p))) {
845250
-                    goto try_memory_move_again;
845250
-                }
845250
-            }
845250
-            // We will accept these memory locations.  Stamp it as done.
845250
-            p->bind_time_stamp = get_time_stamp();
845250
-        }
845250
-        // Skip rebinding either because practically all memory is in the
845250
-        // target nodes, or because we are stuck using all the nodes.
845250
-        if (log_level >= LOG_DEBUG) {
845250
-            numad_log(LOG_DEBUG, "Skipping evaluation because memory is reasonably situated.\n");
845250
+            cpus -= (tmp_node[index[best_node_ix]].CPUs_free - CPUS_MARGIN);
845250
+            tmp_node[index[best_node_ix]].CPUs_free = CPUS_MARGIN;
845250
         }
845250
-        return NULL;
845250
-    } else {
845250
-        // Either a non-existing process, or a new binding for an existing process.
845250
-        if (p != NULL) {
845250
-            // Must be a new binding for an existing process, so reset dup_bind_count.
845250
-            p->dup_bind_count = 0;
845250
-        }
845250
-    }
845250
-    // See if this proposed move will make a significant difference.
845250
-    // If not, return null instead of advising the move.
845250
-    uint64_t target_magnitude = 0;
845250
-    uint64_t existing_magnitude = 0;
845250
-    int num_target_nodes   = NUM_IDS_IN_LIST(target_node_list_p);
845250
-    int num_existing_nodes = NUM_IDS_IN_LIST(existing_mems_list_p);
845250
-    /* FIXME: this expansion seems to cause excessive growth
845250
-     * So calculate the improvement before hastily expanding nodes.
845250
-    if (num_target_nodes > num_existing_nodes) { goto try_memory_move_again; }
845250
-    */
845250
-    int node_id = 0;
845250
-    int n = num_existing_nodes + num_target_nodes;
845250
-    while (n) {
845250
-        if (ID_IS_IN_LIST(node_id, target_node_list_p)) {
845250
-            target_magnitude += saved_magnitude_for_node[node_id];
845250
-            n -= 1;
845250
-        }
845250
-        if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
845250
-            existing_magnitude += saved_magnitude_for_node[node_id];
845250
-            n -= 1;
845250
-        }
845250
-        node_id += 1;
845250
-    }
845250
-    if (existing_magnitude > 0) {
845250
-        uint64_t magnitude_change = ((target_magnitude - existing_magnitude) * 100) / existing_magnitude;
845250
-        if (magnitude_change < 0) {
845250
-            magnitude_change = -(magnitude_change);
845250
-        }
845250
-        if (magnitude_change <= IMPROVEMENT_THRESHOLD_PERCENT) {
845250
-            // Not significant enough percentage change to do rebind
845250
+        // Next line optional, since we will not look at that node again
845250
+        tmp_node[index[best_node_ix]].magnitude = combined_value_of_weighted_resources(0, mbs, cpus, tmp_node[index[best_node_ix]].MBs_free, tmp_node[index[best_node_ix]].CPUs_free);
845250
+        best_node_ix += 1;
845250
+    }
845250
+    // For existing processes, calculate the non-local memory percent to see if
845250
+    // process is already in the right place.
845250
+    if ((pid > 0) && (p != NULL)) {
845250
+        uint64_t nonlocal_memory = 0;
845250
+        for (int ix = 0;  (ix < num_nodes);  ix++) {
845250
+            if (!ID_IS_IN_LIST(ix, target_node_list_p)) {
845250
+                // Accumulate total of nonlocal memory
845250
+                nonlocal_memory += p->process_MBs[ix];
845250
+            }
845250
+        }
845250
+        int disp_percent = (100 * nonlocal_memory) / p->MBs_used;
845250
+        // If this existing process is already located where we want it, then just
845250
+        // return NULL indicating no need to change binding this time.  Check the
845250
+        // amount of nonlocal memory against the target_memlocality_percent.
845250
+        if ((disp_percent <= (100 - target_memlocality)) && (p->bind_time_stamp) && (EQUAL_LISTS(target_node_list_p, p->node_list_p))) {
845250
+            // Already bound to targets, and enough of the memory is located where we want it, so no need to rebind
845250
             if (log_level >= LOG_DEBUG) {
845250
-                str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
845250
-                str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
845250
-                numad_log(LOG_DEBUG, "Moving pid %d from nodes (%s) to nodes (%s) skipped as insignificant improvement: %ld percent.\n",
845250
-                    pid, buf, buf2, magnitude_change);
845250
+                numad_log(LOG_DEBUG, "Process %d already %d percent localized to target nodes.\n", p->pid, 100 - disp_percent);
845250
             }
845250
-            // We decided this is almost good enough.  Stamp it as done.
845250
             p->bind_time_stamp = get_time_stamp();
845250
             return NULL;
845250
         }
845250
     }
845250
-    if ((pid <= 0) && (num_target_nodes <= 0)) {
845250
-        // Always provide at least one node for pre-placement advice
845250
+    // Must always provide at least one node for pre-placement advice
845250
+    // FIXME: verify this can happen only if no resources requested...
845250
+    if ((pid <= 0) && (NUM_IDS_IN_LIST(target_node_list_p) <= 0)) {
845250
         ADD_ID_TO_LIST(node[0].node_id, target_node_list_p);
845250
     }
845250
-try_memory_move_again:
845250
-    str_from_id_list(buf,  BUF_SIZE, existing_mems_list_p);
845250
+    // Log advice, and return target node list
845250
+    if ((pid > 0) && (p->bind_time_stamp)) {
845250
+        str_from_id_list(buf,  BUF_SIZE, p->node_list_p);
845250
+    } else {
845250
+        str_from_id_list(buf,  BUF_SIZE, all_nodes_list_p);
845250
+    }
845250
+    char buf2[BUF_SIZE];
845250
     str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
845250
     char *cmd_name = "(unknown)";
845250
     if ((p) && (p->comm)) {
845250
         cmd_name = p->comm;
845250
     }
845250
     numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2);
845250
+    if (pid > 0) {
845250
+        COPY_LIST(target_node_list_p, p->node_list_p);
845250
+    }
845250
     return target_node_list_p;
845250
 }
845250
 
845250
 
845250
-
845250
-void show_processes(process_data_p *ptr, int nprocs) {
845250
-    time_t ts = time(NULL);
845250
-    fprintf(log_fs, "%s", ctime(&ts));
845250
-    fprintf(log_fs, "Candidates: %d\n", nprocs);
845250
-    for (int ix = 0;  (ix < nprocs);  ix++) {
845250
-        process_data_p p = ptr[ix];
845250
-        char buf[BUF_SIZE];
845250
-        snprintf(buf, BUF_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
845250
-        FILE *fs = fopen(buf, "r");
845250
-        buf[0] = '\0';
845250
-        if (fs) {
845250
-            if (fgets(buf, BUF_SIZE, fs)) {
845250
-                ELIM_NEW_LINE(buf);
845250
-            }
845250
-            fclose(fs);
845250
-        }
845250
-        fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", 
845250
-            p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
845250
-        }
845250
-    fprintf(log_fs, "\n");
845250
-    fflush(log_fs);
845250
-}
845250
-
845250
-
845250
-
845250
 int manage_loads() {
845250
+    uint64_t time_stamp = get_time_stamp();
845250
     // Use temporary index to access and sort hash table entries
845250
-    static process_data_p *pindex;
845250
     static int pindex_size;
845250
+    static process_data_p *pindex;
845250
     if (pindex_size < process_hash_table_size) {
845250
         pindex_size = process_hash_table_size;
845250
         pindex = realloc(pindex, pindex_size * sizeof(process_data_p));
845250
@@ -1923,19 +2049,54 @@ int manage_loads() {
845250
         return min_interval / 2;
845250
     }
845250
     memset(pindex, 0, pindex_size * sizeof(process_data_p));
845250
-    // Copy live candidate pointers to the index for sorting, etc
845250
+    // Copy live candidate pointers to the index for sorting
845250
+    // if they meet the threshold for memory usage and CPU usage.
845250
     int nprocs = 0;
845250
+    long sum_CPUs_used = 0;
845250
     for (int ix = 0;  (ix < process_hash_table_size);  ix++) {
845250
         process_data_p p = &process_hash_table[ix];
845250
-        if (p->pid) {
845250
+        if ((p->pid) && (p->CPUs_used > CPU_THRESHOLD) && (p->MBs_used > MEMORY_THRESHOLD)) {
845250
             pindex[nprocs++] = p;
845250
+            sum_CPUs_used += p->CPUs_used;
845250
+            // Initialize node list, if not already done for this process.
845250
+            if (p->node_list_p == NULL) {
845250
+                initialize_mem_node_list(p);
845250
+            }
845250
         }
845250
     }
845250
-    // Sort index by amount of CPU used * amount of memory used.  Not expecting
845250
-    // a long list here.  Use a simple sort -- however, sort into bins,
845250
-    // treating values within 10% as aquivalent.  Within bins, order by
845250
-    // bind_time_stamp so oldest bound will be higher priority to evaluate.
845250
+    // Order candidate considerations using timestamps and magnitude: amount of
845250
+    // CPU used * amount of memory used.  Not expecting a long list here.  Use
845250
+    // a simplistic sort -- however move all not yet bound to front of list and
845250
+    // order by decreasing magnitude.  Previously bound processes follow in
845250
+    // bins of increasing magnitude treating values within 20% as equivalent.
845250
+    // Within bins, order by bind_time_stamp so oldest bound will be higher
845250
+    // priority to evaluate.  Start by moving all unbound to beginning.
845250
+    int num_unbound = 0;
845250
     for (int ij = 0;  (ij < nprocs);  ij++) {
845250
+        if (pindex[ij]->bind_time_stamp == 0) {
845250
+            process_data_p tmp = pindex[num_unbound];
845250
+            pindex[num_unbound++] = pindex[ij];
845250
+            pindex[ij] = tmp;
845250
+        }
845250
+    }
845250
+    // Sort all unbound so biggest magnitude comes first
845250
+    for (int ij = 0;  (ij < num_unbound);  ij++) {
845250
+        int best = ij;
845250
+        for (int ik = ij + 1;  (ik < num_unbound);  ik++) {
845250
+            uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_used);
845250
+            uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used);
845250
+            if (ik_mag <= best_mag) continue;
845250
+            best = ik;
845250
+        }
845250
+        if (best != ij) {
845250
+            process_data_p tmp = pindex[ij];
845250
+            pindex[ij] = pindex[best];
845250
+            pindex[best] = tmp;
845250
+        }
845250
+    }
845250
+    // Sort the remaining candidates into bins of increasing magnitude, and by
845250
+    // timestamp within bins.
845250
+    for (int ij = num_unbound;  (ij < nprocs);  ij++) {
845250
         int best = ij;
845250
         for (int ik = ij + 1;  (ik < nprocs);  ik++) {
845250
             uint64_t   ik_mag = (pindex[  ik]->CPUs_used * pindex[  ik]->MBs_used);
845250
@@ -1946,11 +2107,11 @@ int manage_loads() {
845250
                 diff_mag = -(diff_mag);
845250
                 min_mag = best_mag;
845250
             }
845250
-            if ((diff_mag > 0) && (min_mag / diff_mag < 10)) {
845250
-                // difference > 10 percent.  Use strict ordering
845250
+            if ((diff_mag > 0) && (min_mag / diff_mag < 5)) {
845250
+                // difference > 20 percent.  Use magnitude ordering
845250
                 if (ik_mag <= best_mag) continue;
845250
             } else {
845250
-                // difference within 10 percent.  Sort these by bind_time_stamp.
845250
+                // difference within 20 percent.  Sort these by bind_time_stamp.
845250
                 if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue;
845250
             }
845250
             best = ik;
845250
@@ -1961,23 +2122,57 @@ int manage_loads() {
845250
             pindex[best] = tmp;
845250
         }
845250
     }
845250
+    // Show the candidate processes in the log file
845250
     if ((log_level >= LOG_INFO) && (nprocs > 0)) {
845250
-        show_processes(pindex, nprocs);
845250
+        numad_log(LOG_INFO, "Candidates: %d\n", nprocs);
845250
+        for (int ix = 0;  (ix < nprocs);  ix++) {
845250
+            process_data_p p = pindex[ix];
845250
+            char buf[BUF_SIZE];
845250
+            str_from_id_list(buf, BUF_SIZE, p->node_list_p);
845250
+            fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_size %6ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", 
845250
+                p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_size, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
845250
+            }
845250
+        fflush(log_fs);
845250
     }
845250
-    // Estimate desired size and make resource requests for each significant process
845250
+    // Estimate desired size (+ margin capacity) and
845250
+    // make resource requests for each candidate process
845250
     for (int ix = 0;  (ix < nprocs);  ix++) {
845250
         process_data_p p = pindex[ix];
845250
-        if (p->CPUs_used * p->MBs_used < CPU_THRESHOLD * MEMORY_THRESHOLD) {
845250
-            break; // No more significant processes worth worrying about...
845250
+        // If this process has interleaved memory, recheck it only every 30 minutes...
845250
+#define MIN_DELAY_FOR_INTERLEAVE (1800 * ONE_HUNDRED)
845250
+        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
845250
+          && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
845250
+            if (log_level >= LOG_DEBUG) {
845250
+                numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid);
845250
+            }
845250
+            continue;
845250
+        }
845250
+        // Expand resources needed estimate using target_utilization factor.
845250
+        // Start with the CPUs actually used (capped by number of threads) for
845250
+        // CPUs required, and the RSS MBs actually used for the MBs
845250
+        // requirement,
845250
+        int mem_target_utilization = target_utilization;
845250
+        int cpu_target_utilization = target_utilization;
845250
+        // Cap memory utilization at 100 percent (but allow CPUs to oversubscribe)
845250
+        if (mem_target_utilization > 100) {
845250
+            mem_target_utilization = 100;
845250
+        }
845250
+        // If the process virtual memory size is bigger than one node, and it
845250
+        // is already using more than 80 percent of a node, then request MBs
845250
+        // based on the virtual size rather than on the current amount in use.
845250
+        int mb_request;
845250
+        if ((p->MBs_size > node[0].MBs_total) && ((p->MBs_used * 5 / 4) > node[0].MBs_total)) {
845250
+            mb_request = (p->MBs_size * 100) / mem_target_utilization;
845250
+        } else {
845250
+            mb_request = (p->MBs_used * 100) / mem_target_utilization;
845250
         }
845250
-        int mb_request  =  (p->MBs_used * 100) / target_utilization;
845250
-        int cpu_request = (p->CPUs_used * 100) / target_utilization;
845250
-        // Do not give a process more CPUs than it has threads!
845250
-        // FIXME: For guest VMs, should limit max to VCPU threads. Will
845250
-        // need to do something more intelligent with guest IO threads
845250
-        // when eventually considering devices and IRQs.
845250
+        int cpu_request = (p->CPUs_used * 100) / cpu_target_utilization;
845250
+        // But do not give a process more CPUs than it has threads!
845250
         int thread_limit = p->num_threads;
845250
-        // If process looks like a KVM guest, try to limit to number of vCPU threads
845250
+        // If process looks like a KVM guest, try to limit thread count to the
845250
+        // number of vCPU threads.  FIXME: Will need to do something more
845250
+        // intelligent than this with guest IO threads when eventually
845250
+        // considering devices and IRQs.
845250
         if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) {
845250
             int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid);
845250
             if (thread_limit > kvm_vcpu_threads) {
845250
@@ -1988,23 +2183,51 @@ int manage_loads() {
845250
         if (cpu_request > thread_limit) {
845250
             cpu_request = thread_limit;
845250
         }
845250
+        // If this process was recently bound, enforce a five-minute minimum
845250
+        // delay between repeated attempts to potentially move the process.
845250
+#define MIN_DELAY_FOR_REEVALUATION (300 * ONE_HUNDRED)
845250
+        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
845250
+            // Skip re-evaluation because we just did it recently, but check
845250
+            // first for node utilization balance to see if we should
845250
+            // re-evaluate this particular process right now.  If this process
845250
+            // is running on one of the busiest nodes, go ahead and re-evaluate
845250
+            // it if it looks like it should have a better place with
845250
+            // sufficient resources.  FIXME: this is currently implemented for
845250
+            // only smallish processes that will fit in a single node.
845250
+            if ( ( ID_IS_IN_LIST(min_node_CPUs_free_ix, p->node_list_p) || ID_IS_IN_LIST(min_node_MBs_free_ix, p->node_list_p))
845250
+                && (cpu_request < node[0].CPUs_total) && (mb_request < node[0].MBs_total) 
845250
+                && (abs(min_node_CPUs_free + p->CPUs_used - avg_node_CPUs_free) 
845250
+                    + abs((max_node_CPUs_free - p->CPUs_used) - avg_node_CPUs_free) 
845250
+                    < (max_node_CPUs_free - min_node_CPUs_free) - CPU_THRESHOLD)  // CPU slop
845250
+                && (abs(min_node_MBs_free + p->MBs_used - avg_node_MBs_free)
845250
+                    + abs((max_node_MBs_free - p->MBs_used) - avg_node_MBs_free) 
845250
+                    < (max_node_MBs_free - min_node_MBs_free)) ) { 
845250
+                if (log_level >= LOG_DEBUG) {
845250
+                    numad_log(LOG_DEBUG, "Bypassing delay for %d because it looks like it can do better.\n", p->pid);
845250
+                }
845250
+            } else {
845250
+                if (log_level >= LOG_DEBUG) {
845250
+                    numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because done too recently.\n", p->pid);
845250
+                }
845250
+                continue;
845250
+            }
845250
+        }
845250
+        // OK, now pick NUMA nodes for this process and bind it!
845250
         pthread_mutex_lock(&node_info_mutex);
845250
-        id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request);
845250
-        // FIXME: ?? copy node_list_p to shorten mutex region?
845250
-        if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p->pid, p->cpuset_name, node_list_p, NULL))) {
845250
-            // Shorten interval if actively moving processes
845250
+        int assume_enough_cpus = (sum_CPUs_used <= sum_CPUs_total);
845250
+        id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request, assume_enough_cpus);
845250
+        if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p))) {
845250
             pthread_mutex_unlock(&node_info_mutex);
845250
-            p->bind_time_stamp = get_time_stamp();
845250
+            // Return minimum interval when actively moving processes
845250
             return min_interval;
845250
         }
845250
         pthread_mutex_unlock(&node_info_mutex);
845250
     }
845250
-    // Return maximum interval if no process movement
845250
+    // Return maximum interval when no process movement
845250
     return max_interval;
845250
 }
845250
 
845250
 
845250
-
845250
 void *set_dynamic_options(void *arg) {
845250
     // int arg_value = *(int *)arg;
845250
     char buf[BUF_SIZE];
845250
@@ -2013,6 +2236,18 @@ void *set_dynamic_options(void *arg) {
845250
         msg_t msg;
845250
         recv_msg(&msg);
845250
         switch (msg.body.cmd) {
845250
+        case 'C':
845250
+            use_inactive_file_cache = (msg.body.arg1 != 0);
845250
+            if (use_inactive_file_cache) {
845250
+                numad_log(LOG_NOTICE, "Counting inactive file cache as available\n");
845250
+            } else {
845250
+                numad_log(LOG_NOTICE, "Counting inactive file cache as unavailable\n");
845250
+            }
845250
+            break;
845250
+        case 'H':
845250
+            thp_scan_sleep_ms = msg.body.arg1;
845250
+            set_thp_scan_sleep_ms(thp_scan_sleep_ms);
845250
+            break;
845250
         case 'i':
845250
             min_interval = msg.body.arg1;
845250
             max_interval = msg.body.arg2;
845250
@@ -2033,6 +2268,10 @@ void *set_dynamic_options(void *arg) {
845250
             numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1);
845250
             log_level = msg.body.arg1;
845250
             break;
845250
+        case 'm':
845250
+            numad_log(LOG_NOTICE, "Changing target memory locality to %d\n", msg.body.arg1);
845250
+            target_memlocality = msg.body.arg1;
845250
+            break;
845250
         case 'p':
845250
             numad_log(LOG_NOTICE, "Adding PID %d to inclusion PID list\n", msg.body.arg1);
845250
             pthread_mutex_lock(&pid_list_mutex);
845250
@@ -2055,6 +2294,11 @@ void *set_dynamic_options(void *arg) {
845250
                 numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n");
845250
             }
845250
             break;
845250
+        case 't':
845250
+            numad_log(LOG_NOTICE, "Changing logical CPU thread percent to %d\n", msg.body.arg1);
845250
+            htt_percent = msg.body.arg1;
845250
+            node_info_time_stamp = 0; // to force rescan of nodes/cpus soon
845250
+            break;
845250
         case 'u':
845250
             numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1);
845250
             target_utilization = msg.body.arg1;
845250
@@ -2064,7 +2308,7 @@ void *set_dynamic_options(void *arg) {
845250
                                     msg.body.arg1, msg.body.arg2);
845250
             pthread_mutex_lock(&node_info_mutex);
845250
             update_nodes();
845250
-            id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2);
845250
+            id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2, 0);
845250
             str_from_id_list(buf, BUF_SIZE, node_list_p);
845250
             pthread_mutex_unlock(&node_info_mutex);
845250
             send_msg(msg.body.src_pid, 'w', 0, 0, buf);
845250
@@ -2134,30 +2378,50 @@ void parse_two_arg_values(char *p, int *
845250
 
845250
 int main(int argc, char *argv[]) {
845250
     int opt;
845250
+    int C_flag = 0;
845250
     int d_flag = 0;
845250
+    int H_flag = 0;
845250
     int i_flag = 0;
845250
     int K_flag = 0;
845250
     int l_flag = 0;
845250
+    int m_flag = 0;
845250
     int p_flag = 0;
845250
     int r_flag = 0;
845250
     int S_flag = 0;
845250
+    int t_flag = 0;
845250
     int u_flag = 0;
845250
     int v_flag = 0;
845250
     int w_flag = 0;
845250
     int x_flag = 0;
845250
+    int tmp_int = 0;
845250
     long list_pid = 0;
845250
-    while ((opt = getopt(argc, argv, "dD:hi:K:l:p:r:S:u:vVw:x:")) != -1) {
845250
+    while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) {
845250
         switch (opt) {
845250
+        case 'C':
845250
+            C_flag = 1;
845250
+            use_inactive_file_cache = (atoi(optarg) != 0);
845250
+            break;
845250
         case 'd':
845250
             d_flag = 1;
845250
             log_level = LOG_DEBUG;
845250
             break;
845250
         case 'D':
845250
-            cpuset_dir_list[0] = strdup(optarg);
845250
+            // obsoleted
845250
             break;
845250
         case 'h':
845250
             print_usage_and_exit(argv[0]);
845250
             break;
845250
+        case 'H':
845250
+            tmp_int = atoi(optarg);
845250
+            if ((tmp_int == 0) || ((tmp_int > 9) && (tmp_int < 1000001))) {
845250
+                // 0 means do not change the system default value
845250
+                H_flag = 1;
845250
+                thp_scan_sleep_ms = tmp_int;
845250
+            } else {
845250
+		fprintf(stderr, "THP scan_sleep_ms must be > 9 and < 1000001\n");
845250
+		exit(EXIT_FAILURE);
845250
+	    }
845250
+            break;
845250
         case 'i':
845250
             i_flag = 1;
845250
             parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0);
845250
@@ -2170,6 +2434,13 @@ int main(int argc, char *argv[]) {
845250
             l_flag = 1;
845250
             log_level = atoi(optarg);
845250
             break;
845250
+        case 'm':
845250
+            tmp_int = atoi(optarg);
845250
+            if ((tmp_int >= 50) && (tmp_int <= 100)) {
845250
+                m_flag = 1;
845250
+                target_memlocality = tmp_int;
845250
+            }
845250
+            break;
845250
         case 'p':
845250
             p_flag = 1;
845250
             list_pid = atol(optarg);
845250
@@ -2183,13 +2454,26 @@ int main(int argc, char *argv[]) {
845250
             include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid);
845250
             exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid);
845250
             break;
845250
+        case 'R':
845250
+            reserved_cpu_str = strdup(optarg);
845250
+            break;
845250
         case 'S':
845250
             S_flag = 1;
845250
             scan_all_processes = (atoi(optarg) != 0);
845250
             break;
845250
+        case 't':
845250
+            tmp_int = atoi(optarg);
845250
+            if ((tmp_int >= 0) && (tmp_int <= 100)) {
845250
+                t_flag = 1;
845250
+                htt_percent = tmp_int;
845250
+            }
845250
+            break;
845250
         case 'u':
845250
-            u_flag = 1;
845250
-            target_utilization = atoi(optarg);
845250
+            tmp_int = atoi(optarg);
845250
+            if ((tmp_int >= 10) && (tmp_int <= 130)) {
845250
+                u_flag = 1;
845250
+                target_utilization = tmp_int;
845250
+            }
845250
             break;
845250
         case 'v':
845250
             v_flag = 1;
845250
@@ -2234,6 +2518,12 @@ int main(int argc, char *argv[]) {
845250
         // Daemon is already running.  So send dynamic options to persistant
845250
         // thread to handle requests, get the response (if any), and finish.
845250
         msg_t msg; 
845250
+        if (C_flag) {
845250
+            send_msg(daemon_pid, 'C', use_inactive_file_cache, 0, "");
845250
+        }
845250
+        if (H_flag) {
845250
+            send_msg(daemon_pid, 'H', thp_scan_sleep_ms, 0, "");
845250
+        }
845250
         if (i_flag) {
845250
             send_msg(daemon_pid, 'i', min_interval, max_interval, "");
845250
         }
845250
@@ -2243,6 +2533,9 @@ int main(int argc, char *argv[]) {
845250
         if (d_flag || l_flag || v_flag) {
845250
             send_msg(daemon_pid, 'l', log_level, 0, "");
845250
         }
845250
+        if (m_flag) {
845250
+            send_msg(daemon_pid, 'm', target_memlocality, 0, "");
845250
+        }
845250
         if (p_flag) {
845250
             send_msg(daemon_pid, 'p', list_pid, 0, "");
845250
         }
845250
@@ -2252,6 +2545,9 @@ int main(int argc, char *argv[]) {
845250
         if (S_flag) {
845250
             send_msg(daemon_pid, 'S', scan_all_processes, 0, "");
845250
         }
845250
+        if (t_flag) {
845250
+            send_msg(daemon_pid, 't', htt_percent, 0, "");
845250
+        }
845250
         if (u_flag) {
845250
             send_msg(daemon_pid, 'u', target_utilization, 0, "");
845250
         }
845250
@@ -2263,14 +2559,30 @@ int main(int argc, char *argv[]) {
845250
         if (x_flag) {
845250
             send_msg(daemon_pid, 'x', list_pid, 0, "");
845250
         }
845250
-    } else if (w_flag) {
845250
-        // Get pre-placement NUMA advice without starting daemon
845250
+        close_log_file();
845250
+        exit(EXIT_SUCCESS);
845250
+    }
845250
+    // No numad daemon running yet.
845250
+    // First, make note of any reserved CPUs....
845250
+    if (reserved_cpu_str != NULL) {
845250
+        CLEAR_CPU_LIST(reserved_cpu_mask_list_p);
845250
+        int n = add_ids_to_list_from_str(reserved_cpu_mask_list_p, reserved_cpu_str);
845250
         char buf[BUF_SIZE];
845250
+        str_from_id_list(buf, BUF_SIZE, reserved_cpu_mask_list_p);
845250
+        numad_log(LOG_NOTICE, "Reserving %d CPUs (%s) for non-numad use\n", n, buf);
845250
+        // turn reserved list into a negated mask for later ANDing use...
845250
+        negate_cpu_list(reserved_cpu_mask_list_p);
845250
+    }
845250
+    // If it is a "-w" pre-placement request, handle that without starting
845250
+    // the daemon.  Otherwise start the numad daemon.
845250
+    if (w_flag) {
845250
+        // Get pre-placement NUMA advice without starting daemon
845250
         update_nodes();
845250
         sleep(2);
845250
         update_nodes();
845250
         numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs);
845250
-        id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs);
845250
+        id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs, 0);
845250
+        char buf[BUF_SIZE];
845250
         str_from_id_list(buf, BUF_SIZE, node_list_p);
845250
         fprintf(stdout, "%s\n", buf);
845250
         close_log_file();
845250
@@ -2278,6 +2590,7 @@ int main(int argc, char *argv[]) {
845250
     } else if (max_interval > 0) {
845250
         // Start the numad daemon...
845250
         check_prereqs(argv[0]);
845250
+#if (!NO_DAEMON)
845250
         // Daemonize self...
845250
         daemon_pid = fork();
845250
         if (daemon_pid < 0) { numad_log(LOG_CRIT, "fork() failed\n"); exit(EXIT_FAILURE); }
845250
@@ -2298,9 +2611,20 @@ int main(int argc, char *argv[]) {
845250
         if (log_fs != stderr) {
845250
             fclose(stderr);
845250
         }
845250
+#endif
845250
+        // Set up signal handlers
845250
+        struct sigaction sa;
845250
+        memset(&sa, 0, sizeof(sa)); 
845250
+        sa.sa_handler = sig_handler;
845250
+        if (sigaction(SIGHUP, &sa, NULL)
845250
+            || sigaction(SIGTERM, &sa, NULL)
845250
+            || sigaction(SIGQUIT, &sa, NULL)) {
845250
+            numad_log(LOG_CRIT, "sigaction does not work?\n");
845250
+            exit(EXIT_FAILURE);
845250
+        }
845250
         // Allocate initial process hash table
845250
         process_hash_table_expand();
845250
-        // Spawn thread to handle messages from subsequent invocation requests
845250
+        // Spawn a thread to handle messages from subsequent invocation requests
845250
         pthread_mutex_init(&pid_list_mutex, NULL);
845250
         pthread_mutex_init(&node_info_mutex, NULL);
845250
         pthread_attr_t attr;
845250
@@ -2310,7 +2634,7 @@ int main(int argc, char *argv[]) {
845250
         }
845250
         pthread_t tid;
845250
         if (pthread_create(&tid, &attr, &set_dynamic_options, &tid) != 0) {
845250
-            numad_log(LOG_CRIT, "pthread_create failure\n");
845250
+            numad_log(LOG_CRIT, "pthread_create failure: setting thread\n");
845250
             exit(EXIT_FAILURE);
845250
         }
845250
         // Loop here forwever...
845250
@@ -2322,16 +2646,26 @@ int main(int argc, char *argv[]) {
845250
             if (nodes > 1) {
845250
                 update_processes();
845250
                 interval = manage_loads();
845250
+                if (interval < max_interval) {
845250
+                    // Update node info since we moved something
845250
+                    nodes = update_nodes();
845250
+                }
845250
             }
845250
             sleep(interval);
845250
+            if (got_sigterm | got_sigquit) {
845250
+                shut_down_numad();
845250
+            }
845250
+            if (got_sighup) {
845250
+                got_sighup = 0;
845250
+                close_log_file();
845250
+                open_log_file();
845250
+            }
845250
         }
845250
         if (pthread_attr_destroy(&attr) != 0) {
845250
             numad_log(LOG_WARNING, "pthread_attr_destroy failure\n");
845250
         }
845250
         pthread_mutex_destroy(&pid_list_mutex);
845250
         pthread_mutex_destroy(&node_info_mutex);
845250
-    } else {
845250
-        shut_down_numad();
845250
     }
845250
     exit(EXIT_SUCCESS);
845250
 }
845250
diff -rup numad-0.5git/numad.init numad-0.5git-new/numad.init
845250
--- numad-0.5git/numad.init	2012-12-03 15:40:40.000000000 +0100
845250
+++ numad-0.5git-new/numad.init	2016-08-30 08:45:19.000000000 +0200
845250
@@ -37,7 +37,7 @@ start() {
845250
     [ -f $config ] || exit 6
845250
     echo -n $"Starting $prog: "
845250
     . $config
845250
-    daemon "$exec -i $INTERVAL"
845250
+    daemon $exec -i $INTERVAL
845250
     retval=$?
845250
     echo
845250
     [ $retval -eq 0 ] && touch $lockfile