move_pages support in numactl

This patch adds support for move_pages(2) (sys_move_pages()) to numactl.
Adds numa_move_pages() to libnuma.

There is a FIXME in here because powerpc has not added support
for sys_move_pages() yet.

Diffed against numactl-1.0.2

Signed-off-by: Christoph Lameter<clameter@sgi.com>

---
 libnuma.c    |    6 ++
 move_pages.2 |  155 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 numa.h       |    3 +
 numaif.h     |    3 +
 numaint.h    |    3 +
 syscall.c    |   16 ++++--
 6 files changed, 182 insertions(+), 4 deletions(-)

Index: numactl-1.0.2/syscall.c
===================================================================
--- numactl-1.0.2.orig/syscall.c
+++ numactl-1.0.2/syscall.c
@@ -36,11 +36,13 @@
 #define __NR_set_mempolicy 238
 #define __NR_get_mempolicy 239
 #define __NR_migrate_pages 256
+#define __NR_move_pages 279
 
 #elif defined(__ia64__)
 #define __NR_sched_setaffinity    1231
 #define __NR_sched_getaffinity    1232
 #define __NR_migrate_pages	1280
+#define __NR_move_pages 1276
 
 /* Official allocation */
 
@@ -54,6 +56,7 @@
 #define __NR_get_mempolicy 275
 #define __NR_set_mempolicy 276
 #define __NR_migrate_pages 294
+#define __NR_move_pages 317
 
 #elif defined(__powerpc__)
 
@@ -61,6 +64,9 @@
 #define __NR_get_mempolicy 260
 #define __NR_set_mempolicy 261
 #define __NR_migrate_pages 258
+/* FIXME: powerpc is missing move pages!!!
+#define __NR_move_pages xxx
+*/
 
 #elif !defined(DEPS_RUN)
 #error "Add syscalls for your architecture or update kernel headers"
@@ -151,6 +157,12 @@ long WEAK migrate_pages(int pid, unsigne
 	return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask);
 }
 
+long WEAK move_pages(int pid, unsigned long count,
+	void **pages, const int *nodes, int *status, int flags)
+{
+	return syscall(__NR_move_pages, pid, count, pages, nodes, status, flags);
+}
+
 /* SLES8 glibc doesn't define those */
 
 int numa_sched_setaffinity(pid_t pid, unsigned len, const unsigned long *mask)
@@ -167,7 +179,3 @@ int numa_sched_getaffinity(pid_t pid, un
 make_internal_alias(numa_sched_getaffinity);
 make_internal_alias(numa_sched_setaffinity);
 make_internal_alias(get_mempolicy);
-make_internal_alias(set_mempolicy);
-make_internal_alias(mbind);
-make_internal_alias(migrate_pages);
-
Index: numactl-1.0.2/libnuma.c
===================================================================
--- numactl-1.0.2.orig/libnuma.c
+++ numactl-1.0.2/libnuma.c
@@ -585,6 +585,12 @@ int numa_migrate_pages(int pid, const no
 	return migrate_pages(pid, NUMA_NUM_NODES + 1, &fromnodes->n[0], &tonodes->n[0]);
 }
 
+int numa_move_pages(int pid, unsigned long count,
+	void **pages, const int *nodes, int *status, int flags)
+{
+	return move_pages(pid, count, pages, nodes, status, flags);
+}
+
 int numa_run_on_node(int node)
 { 
 	int ncpus = number_of_configured_cpus();
Index: numactl-1.0.2/numa.h
===================================================================
--- numactl-1.0.2.orig/numa.h
+++ numactl-1.0.2/numa.h
@@ -178,6 +178,9 @@ void numa_warn(int num, char *fmt, ...);
 
 int numa_migrate_pages(int pid, const nodemask_t *from, const nodemask_t *to);
 
+int numa_move_pages(int pid, unsigned long count, void **pages,
+		const int *nodes, int *status, int flags);
+
 #ifdef __cplusplus
 }
 #endif
Index: numactl-1.0.2/numaint.h
===================================================================
--- numactl-1.0.2.orig/numaint.h
+++ numactl-1.0.2/numaint.h
@@ -14,6 +14,9 @@ extern long set_mempolicy_int(int mode, 
 extern long migrate_pages(int pid, unsigned long maxnode, const unsigned long *frommask,
 	const unsigned long *tomask);
 
+extern long move_pages(int pid, unsigned long count,
+	void **pages, const int *nodes, int *status, int flags);
+
 #define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
 
 #define CPU_BYTES(x) (round_up(x, BITS_PER_LONG)/8)
Index: numactl-1.0.2/move_pages.2
===================================================================
--- /dev/null
+++ numactl-1.0.2/move_pages.2
@@ -0,0 +1,155 @@
+.\" Hey Emacs! This file is -*- nroff -*- source.
+.\"
+.\" This manpage is Copyright (C) 2006 Silicon Graphics, Inc.
+.\"                               Christoph Lameter
+.\"
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of this
+.\" manual under the conditions for verbatim copying, provided that the
+.\" entire resulting derived work is distributed under the terms of a
+.\" permission notice identical to this one.
+.\"
+.TH MOVE_PAGES 2 2006-10-31 "Linux 2.6.18" "Linux Programmer's Manual"
+.SH NAME
+move_pages \- Move individual pages of a process to another node
+.SH SYNOPSIS
+.B #include <numaif.h>
+.sp
+.BI "long move_pages(int " pid ", unsigned long count, void ** " pages ", const int * " nodes ", int * " status ", int " flags );
+.SH DESCRIPTION
+.BR move_pages ()
+moves
+.I count
+pages to the
+.I nodes.
+The result of the move is reflected in
+.I status.
+The
+.I flags
+indicate constraints on the pages to be moved.
+
+.I pid
+is the ID of the process whose pages are to be moved. Sufficient rights
+must exist to move pages of another process. This means the moving
+process either has root privileges, has SYS_NICE administrative rights or
+has the same owner. If pid is 0 then we move pages of the current process.
+
+.I count
+is the number of pages to move. It defines the size of the three
+arrays
+.I pages,
+.I nodes
+and
+.I status.
+
+.I pages
+is an array of pointers to the pages that should be moved. These are pointers
+that should be aligned to page boundaries. Addresses are specified as seen by
+the process specified by
+.I pid.
+
+.I nodes
+is either an array of integers that specify the desired location for each
+page or it is NULL. Each integer is a node number. If NULL is specified then
+move_pages will not move any pages but return the node of each page in
+the
+.I status
+array. Having the status of each page may be necessary to determine
+pages that need to be moved.
+
+.I status
+is an array of integers that return the status of each page. The array
+only contains valid values if
+.I move_pages
+did not return an error code.
+
+.I flags
+specify what types of pages to move.
+.B MPOL_MF_MOVE
+means that only pages that are in exclusive use by the process
+are to be moved.
+.B MPOL_MF_MOVE_ALL
+means that pages shared between multiple processes can also be moved.
+The process must have root privileges or SYS_NICE privileges.
+
+.SH Page states in the status array
+
+.TP
+.B 0..MAX_NUMNODES
+Indicates that the location of the page is on this node.
+.TP
+.B -ENOENT
+The page is not present.
+.TP
+.B -EACCES
+The page is mapped by multiple processes and can only be moved
+if
+.I MPOL_MF_MOVE_ALL
+is specified.
+.TP
+.B -EBUSY
+The page is currently busy and cannot be moved. Try again later.
+This occurs if a page is undergoing I/O or another kernel subsystem
+is holding a reference to the page.
+.TP
+.B -EFAULT
+This is a zero page or the memory area is not mapped by the process.
+.TP
+.B -ENOMEM
+Unable to allocate memory on target node.
+.TP
+.B -EIO
+Unable to write back a page. The page has to be written back
+in order to move it since the page is dirty and the filesystem
+does not provide a migration function that would allow the move
+of dirty pages.
+.TP
+.B -EINVAL
+A dirty page cannot be moved. The filesystem does not
+provide a migration function and has no ability to write back pages.
+
+.SH "RETURN VALUE"
+On success
+.B move_pages
+returns zero.
+.SH ERRORS
+.TP
+.B -ENOENT
+No pages were found that require moving. All pages are either already
+on the target node, not present, had an invalid address or could not be
+moved because they were mapped by multiple processes.
+.TP
+.B -EINVAL
+Flags other than
+.I MPOL_MF_MOVE
+and
+.I MPOL_MF_MOVE_ALL
+were specified or an attempt was made to migrate pages of a kernel thread.
+.TP
+.B -EPERM
+.I MPOL_MF_MOVE_ALL
+was specified without sufficient privileges or an attempt was made to move
+pages of a process belonging to another user.
+.TP
+.B -EACCES
+One of the target nodes is not allowed by the current cpuset.
+.TP
+.B -ENODEV
+One of the target nodes is not online.
+.TP
+.B -ESRCH
+Process does not exist.
+.TP
+.B -E2BIG
+Too many pages to move.
+.TP
+.B -EFAULT
+Parameter array could not be accessed.
+.SH "SEE ALSO"
+.BR numa_maps (5),
+.BR migratepages (8),
+.BR numa_stat (8),
+.BR numa (3)
Index: numactl-1.0.2/numaif.h
===================================================================
--- numactl-1.0.2.orig/numaif.h
+++ numactl-1.0.2/numaif.h
@@ -18,6 +18,9 @@ extern long set_mempolicy(int mode, cons
 extern long migratepages(int pid, unsigned long maxnode, unsigned long *fromnode,
 			unsigned long *tonode);
 
+extern long move_pages(int pid, unsigned long count,
+		void **pages, const int *nodes, int *status, int flags);
+
 /* Policies */
 #define MPOL_DEFAULT     0
 #define MPOL_PREFERRED    1
