aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@sgi.com>2006-01-08 04:00:51 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-01-08 23:12:42 -0500
commit39743889aaf76725152f16aa90ca3c45f6d52da3 (patch)
tree2a6f658d03dbbd9428934c5e030230a4acb6d5e0
parentdc9aa5b9d65fd11b1f5246b46ec610ee8b83c6dd (diff)
[PATCH] Swap Migration V5: sys_migrate_pages interface
sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--arch/i386/kernel/syscall_table.S1
-rw-r--r--arch/ia64/kernel/entry.S1
-rw-r--r--arch/x86_64/ia32/ia32entry.S1
-rw-r--r--include/asm-i386/unistd.h3
-rw-r--r--include/asm-ia64/unistd.h3
-rw-r--r--include/asm-x86_64/ia32_unistd.h3
-rw-r--r--include/asm-x86_64/unistd.h4
-rw-r--r--include/linux/mempolicy.h3
-rw-r--r--include/linux/syscalls.h2
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--mm/mempolicy.c94
11 files changed, 111 insertions, 5 deletions
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index f7ba4acc20ec..6ff3e5243226 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -293,3 +293,4 @@ ENTRY(sys_call_table)
293 .long sys_inotify_init 293 .long sys_inotify_init
294 .long sys_inotify_add_watch 294 .long sys_inotify_add_watch
295 .long sys_inotify_rm_watch 295 .long sys_inotify_rm_watch
296 .long sys_migrate_pages
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 0741b066b98f..7a6ffd613789 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1600,5 +1600,6 @@ sys_call_table:
1600 data8 sys_inotify_init 1600 data8 sys_inotify_init
1601 data8 sys_inotify_add_watch 1601 data8 sys_inotify_add_watch
1602 data8 sys_inotify_rm_watch 1602 data8 sys_inotify_rm_watch
1603 data8 sys_migrate_pages // 1280
1603 1604
1604 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls 1605 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index df0773c9bdbe..1f0ff5adc80e 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -643,6 +643,7 @@ ia32_sys_call_table:
643 .quad sys_inotify_init 643 .quad sys_inotify_init
644 .quad sys_inotify_add_watch 644 .quad sys_inotify_add_watch
645 .quad sys_inotify_rm_watch 645 .quad sys_inotify_rm_watch
646 .quad sys_migrate_pages
646ia32_syscall_end: 647ia32_syscall_end:
647 .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 648 .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
648 .quad ni_syscall 649 .quad ni_syscall
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fe38b9a96233..481c3c0ea720 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -299,8 +299,9 @@
299#define __NR_inotify_init 291 299#define __NR_inotify_init 291
300#define __NR_inotify_add_watch 292 300#define __NR_inotify_add_watch 292
301#define __NR_inotify_rm_watch 293 301#define __NR_inotify_rm_watch 293
302#define __NR_migrate_pages 294
302 303
303#define NR_syscalls 294 304#define NR_syscalls 295
304 305
305/* 306/*
306 * user-visible error numbers are in the range -1 - -128: see 307 * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 2bf543493cb8..962f9bd1bdff 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -269,12 +269,13 @@
269#define __NR_inotify_init 1277 269#define __NR_inotify_init 1277
270#define __NR_inotify_add_watch 1278 270#define __NR_inotify_add_watch 1278
271#define __NR_inotify_rm_watch 1279 271#define __NR_inotify_rm_watch 1279
272#define __NR_migrate_pages 1280
272 273
273#ifdef __KERNEL__ 274#ifdef __KERNEL__
274 275
275#include <linux/config.h> 276#include <linux/config.h>
276 277
277#define NR_syscalls 256 /* length of syscall table */ 278#define NR_syscalls 270 /* length of syscall table */
278 279
279#define __ARCH_WANT_SYS_RT_SIGACTION 280#define __ARCH_WANT_SYS_RT_SIGACTION
280 281
diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h
index d5166ec3868d..e8843362a6cc 100644
--- a/include/asm-x86_64/ia32_unistd.h
+++ b/include/asm-x86_64/ia32_unistd.h
@@ -299,7 +299,8 @@
299#define __NR_ia32_inotify_init 291 299#define __NR_ia32_inotify_init 291
300#define __NR_ia32_inotify_add_watch 292 300#define __NR_ia32_inotify_add_watch 292
301#define __NR_ia32_inotify_rm_watch 293 301#define __NR_ia32_inotify_rm_watch 293
302#define __NR_ia32_migrate_pages 294
302 303
303#define IA32_NR_syscalls 294 /* must be > than biggest syscall! */ 304#define IA32_NR_syscalls 295 /* must be > than biggest syscall! */
304 305
305#endif /* _ASM_X86_64_IA32_UNISTD_H_ */ 306#endif /* _ASM_X86_64_IA32_UNISTD_H_ */
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 2c42150bce0c..e6f896161c11 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -571,8 +571,10 @@ __SYSCALL(__NR_inotify_init, sys_inotify_init)
571__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) 571__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
572#define __NR_inotify_rm_watch 255 572#define __NR_inotify_rm_watch 255
573__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) 573__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
574#define __NR_migrate_pages 256
575__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
574 576
575#define __NR_syscall_max __NR_inotify_rm_watch 577#define __NR_syscall_max __NR_migrate_pages
576#ifndef __NO_STUBS 578#ifndef __NO_STUBS
577 579
578/* user-visible error numbers are in the range -1 - -4095 */ 580/* user-visible error numbers are in the range -1 - -4095 */
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 05443a766cb8..3e61e829681d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -162,6 +162,9 @@ static inline void check_highest_zone(int k)
162 policy_zone = k; 162 policy_zone = k;
163} 163}
164 164
165int do_migrate_pages(struct mm_struct *mm,
166 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
167
165#else 168#else
166 169
167struct mempolicy {}; 170struct mempolicy {};
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c7007b1db91d..e910d1a481df 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -511,5 +511,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
511asmlinkage long sys_ioprio_get(int which, int who); 511asmlinkage long sys_ioprio_get(int which, int who);
512asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 512asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
513 unsigned long maxnode); 513 unsigned long maxnode);
514asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
515 const unsigned long __user *from, const unsigned long __user *to);
514 516
515#endif 517#endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1ab2370e2efa..7a8bc7f60d91 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -82,6 +82,7 @@ cond_syscall(compat_sys_socketcall);
82cond_syscall(sys_inotify_init); 82cond_syscall(sys_inotify_init);
83cond_syscall(sys_inotify_add_watch); 83cond_syscall(sys_inotify_add_watch);
84cond_syscall(sys_inotify_rm_watch); 84cond_syscall(sys_inotify_rm_watch);
85cond_syscall(sys_migrate_pages);
85 86
86/* arch-specific weak syscall entries */ 87/* arch-specific weak syscall entries */
87cond_syscall(sys_pciconfig_read); 88cond_syscall(sys_pciconfig_read);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9cc6d962831d..20d5ad39fa41 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -615,11 +615,41 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
615} 615}
616 616
617/* 617/*
618 * For now migrate_pages simply swaps out the pages from nodes that are in
619 * the source set but not in the target set. In the future, we would
620 * want a function that moves pages between the two nodesets in such
621 * a way as to preserve the physical layout as much as possible.
622 *
623 * Returns the number of page that could not be moved.
624 */
625int do_migrate_pages(struct mm_struct *mm,
626 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
627{
628 LIST_HEAD(pagelist);
629 int count = 0;
630 nodemask_t nodes;
631
632 nodes_andnot(nodes, *from_nodes, *to_nodes);
633 nodes_complement(nodes, nodes);
634
635 down_read(&mm->mmap_sem);
636 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
637 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
638 if (!list_empty(&pagelist)) {
639 migrate_pages(&pagelist, NULL);
640 if (!list_empty(&pagelist))
641 count = putback_lru_pages(&pagelist);
642 }
643 up_read(&mm->mmap_sem);
644 return count;
645}
646
647/*
618 * User space interface with variable sized bitmaps for nodelists. 648 * User space interface with variable sized bitmaps for nodelists.
619 */ 649 */
620 650
621/* Copy a node mask from user space. */ 651/* Copy a node mask from user space. */
622static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, 652static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
623 unsigned long maxnode) 653 unsigned long maxnode)
624{ 654{
625 unsigned long k; 655 unsigned long k;
@@ -708,6 +738,68 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
708 return do_set_mempolicy(mode, &nodes); 738 return do_set_mempolicy(mode, &nodes);
709} 739}
710 740
741/* Macro needed until Paul implements this function in kernel/cpusets.c */
742#define cpuset_mems_allowed(task) node_online_map
743
744asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
745 const unsigned long __user *old_nodes,
746 const unsigned long __user *new_nodes)
747{
748 struct mm_struct *mm;
749 struct task_struct *task;
750 nodemask_t old;
751 nodemask_t new;
752 nodemask_t task_nodes;
753 int err;
754
755 err = get_nodes(&old, old_nodes, maxnode);
756 if (err)
757 return err;
758
759 err = get_nodes(&new, new_nodes, maxnode);
760 if (err)
761 return err;
762
763 /* Find the mm_struct */
764 read_lock(&tasklist_lock);
765 task = pid ? find_task_by_pid(pid) : current;
766 if (!task) {
767 read_unlock(&tasklist_lock);
768 return -ESRCH;
769 }
770 mm = get_task_mm(task);
771 read_unlock(&tasklist_lock);
772
773 if (!mm)
774 return -EINVAL;
775
776 /*
777 * Check if this process has the right to modify the specified
778 * process. The right exists if the process has administrative
779 * capabilities, superuser priviledges or the same
780 * userid as the target process.
781 */
782 if ((current->euid != task->suid) && (current->euid != task->uid) &&
783 (current->uid != task->suid) && (current->uid != task->uid) &&
784 !capable(CAP_SYS_ADMIN)) {
785 err = -EPERM;
786 goto out;
787 }
788
789 task_nodes = cpuset_mems_allowed(task);
790 /* Is the user allowed to access the target nodes? */
791 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
792 err = -EPERM;
793 goto out;
794 }
795
796 err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
797out:
798 mmput(mm);
799 return err;
800}
801
802
711/* Retrieve NUMA policy */ 803/* Retrieve NUMA policy */
712asmlinkage long sys_get_mempolicy(int __user *policy, 804asmlinkage long sys_get_mempolicy(int __user *policy,
713 unsigned long __user *nmask, 805 unsigned long __user *nmask,