 Documentation/vm/page_migration |  29 ++++-----
 arch/ia64/kernel/entry.S        |   2 +-
 include/asm-ia64/unistd.h       |   2 +-
 include/linux/migrate.h         |   2 +-
 include/linux/syscalls.h        |   5 +
 kernel/sys_ni.c                 |   1 +
 mm/mempolicy.c                  |   4 +-
 mm/migrate.c                    | 268 +++++++++++++++++++++++++++++++++-
 8 files changed, 288 insertions(+), 25 deletions(-)
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index 0a5d5fb1885..99f89aa1016 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -26,8 +26,13 @@ a process are located. See also the numa_maps manpage in the numactl package.
 Manual migration is useful if for example the scheduler has relocated
 a process to a processor on a distant node. A batch scheduler or an
 administrator may detect the situation and move the pages of the process
-nearer to the new processor. At some point in the future we may have
-some mechanism in the scheduler that will automatically move the pages.
+nearer to the new processor. The kernel itself does only provide
+manual page migration support. Automatic page migration may be implemented
+through user space processes that move pages. A special function call
+"move_pages" allows the moving of individual pages within a process.
+A NUMA profiler may f.e. obtain a log showing frequent off node
+accesses and may use the result to move pages to more advantageous
+locations.
 
 Larger installations usually partition the system using cpusets into
 sections of nodes. Paul Jackson has equipped cpusets with the ability to
@@ -62,22 +67,14 @@ A. In kernel use of migrate_pages()
    It also prevents the swapper or other scans to encounter
    the page.
 
-2. Generate a list of newly allocates pages. These pages will contain the
-   contents of the pages from the first list after page migration is
-   complete.
+2. We need to have a function of type new_page_t that can be
+   passed to migrate_pages(). This function should figure out
+   how to allocate the correct new page given the old page.
 
 3. The migrate_pages() function is called which attempts
-   to do the migration. It returns the moved pages in the
-   list specified as the third parameter and the failed
-   migrations in the fourth parameter. When the function
-   returns the first list will contain the pages that could still be retried.
-
-4. The leftover pages of various types are returned
-   to the LRU using putback_to_lru_pages() or otherwise
-   disposed of. The pages will still have the refcount as
-   increased by isolate_lru_pages() if putback_to_lru_pages() is not
-   used! The kernel may want to handle the various cases of failures in
-   different ways.
+   to do the migration. It will call the function to allocate
+   the new page for each page that is considered for
+   moving.
 
 B. How migrate_pages() works
 ----------------------------
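
The documentation above names move_pages as the user-visible entry point. A minimal user-space sketch of the call follows; it is illustrative only: the __NR_move_pages number (1276) is the ia64 assignment made later in this patch, and the MPOL_MF_MOVE value is assumed to mirror linux/mempolicy.h, so neither is a portable constant.

/* Hedged sketch (not part of this patch): move one page of a buffer to
 * an example target node and read back its per-page status. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_move_pages
#define __NR_move_pages 1276		/* ia64 number from this patch */
#endif
#define MPOL_MF_MOVE	(1 << 1)	/* assumed to mirror linux/mempolicy.h */

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *pages[1];
	int nodes[1] = { 1 };		/* made-up example target node */
	int status[1];
	long rc;

	pages[0] = malloc(page_size);
	memset(pages[0], 0, page_size);	/* fault the page in first */

	/* pid 0 means the calling process; on success status[0] holds the
	 * node the page now resides on, otherwise a negative errno. */
	rc = syscall(__NR_move_pages, 0, 1UL, pages, nodes, status,
		     MPOL_MF_MOVE);
	if (rc < 0)
		perror("move_pages");
	else
		printf("page is now on node %d\n", status[0]);
	return 0;
}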
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index bcb80ca5cf4..32c999f58d1 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1584,7 +1584,7 @@ sys_call_table:
 	data8 sys_keyctl
 	data8 sys_ioprio_set
 	data8 sys_ioprio_get			// 1275
-	data8 sys_ni_syscall
+	data8 sys_move_pages
 	data8 sys_inotify_init
 	data8 sys_inotify_add_watch
 	data8 sys_inotify_rm_watch
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 632f2eedf72..bb0eb727dcd 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -265,7 +265,7 @@
 #define __NR_keyctl			1273
 #define __NR_ioprio_set			1274
 #define __NR_ioprio_get			1275
-/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */
+#define __NR_move_pages			1276
 #define __NR_inotify_init		1277
 #define __NR_inotify_add_watch		1278
 #define __NR_inotify_rm_watch		1279
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 5b95d6568dc..5dba23a1c0d 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -3,7 +3,7 @@
 
 #include <linux/mm.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private);
+typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
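
The int ** parameter added to new_page_t lets an allocation callback hand migrate_pages() a place to record per-page status; unmap_and_move() in the mm/migrate.c hunks below stores an errno or the destination node id through it. A hypothetical callback conforming to the new signature, as a sketch only (the in-tree users are new_node_page() and new_page_node() further down):

/* Sketch of a new_page_t-conforming allocator (illustrative, not in this
 * patch).  'private' is assumed here to carry the target node id; *result
 * is left NULL, which makes unmap_and_move() skip status reporting. */
static struct page *example_new_page(struct page *old, unsigned long private,
					int **result)
{
	int nid = (int)private;	/* assumed encoding of the target node */

	/* A real callback may set *result to an int it owns so that
	 * unmap_and_move() can report an errno or the resulting node. */
	return alloc_pages_node(nid, GFP_HIGHUSER, 0);
}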
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index bd67a4413df..7e3f2349091 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -516,6 +516,11 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 				const unsigned long __user *from,
 				const unsigned long __user *to);
+asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
+				const void __user * __user *pages,
+				const int __user *nodes,
+				int __user *status,
+				int flags);
 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 				unsigned long mode,
 				unsigned long __user *nmask,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195040f..597229749de 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
 cond_syscall(sys_migrate_pages);
+cond_syscall(sys_move_pages);
 cond_syscall(sys_chown16);
 cond_syscall(sys_fchown16);
 cond_syscall(sys_getegid16);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f432642e9e6..05b84acf0bb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	isolate_lru_page(page, pagelist);
 }
 
-static struct page *new_node_page(struct page *page, unsigned long node)
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 {
 	return alloc_pages_node(node, GFP_HIGHUSER, 0);
 }
@@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm,
 
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private)
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 {
 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 251a8d15825..033a12f4c94 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -25,6 +25,8 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/writeback.h>
+#include <linux/mempolicy.h>
+#include <linux/vmalloc.h>
 
 #include "internal.h"
 
@@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
 }
 
 /*
- * migrate_prep() needs to be called after we have compiled the list of pages
- * to be migrated using isolate_lru_page() but before we begin a series of calls
- * to migrate_pages().
+ * migrate_prep() needs to be called before we start compiling a list of pages
+ * to be migrated using isolate_lru_page().
  */
 int migrate_prep(void)
 {
@@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 		struct page *page, int force)
 {
 	int rc = 0;
-	struct page *newpage = get_new_page(page, private);
+	int *result = NULL;
+	struct page *newpage = get_new_page(page, private, &result);
 
 	if (!newpage)
 		return -ENOMEM;
@@ -642,6 +644,12 @@ move_newpage:
 	 * then this will free the page.
 	 */
 	move_to_lru(newpage);
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(newpage);
+	}
 	return rc;
 }
 
@@ -710,3 +718,255 @@ out:
 	return nr_failed + retry;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Move a list of individual pages
+ */
+struct page_to_node {
+	unsigned long addr;
+	struct page *page;
+	int node;
+	int status;
+};
+
+static struct page *new_page_node(struct page *p, unsigned long private,
+		int **result)
+{
+	struct page_to_node *pm = (struct page_to_node *)private;
+
+	while (pm->node != MAX_NUMNODES && pm->page != p)
+		pm++;
+
+	if (pm->node == MAX_NUMNODES)
+		return NULL;
+
+	*result = &pm->status;
+
+	return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
+}
+
+/*
+ * Move a set of pages as indicated in the pm array. The addr
+ * field must be set to the virtual address of the page to be moved
+ * and the node number must contain a valid target node.
+ */
+static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
+				int migrate_all)
+{
+	int err;
+	struct page_to_node *pp;
+	LIST_HEAD(pagelist);
+
+	down_read(&mm->mmap_sem);
+
+	/*
+	 * Build a list of pages to migrate
+	 */
+	migrate_prep();
+	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
+		struct vm_area_struct *vma;
+		struct page *page;
+
+		/*
+		 * A valid page pointer that will not match any of the
+		 * pages that will be moved.
+		 */
+		pp->page = ZERO_PAGE(0);
+
+		err = -EFAULT;
+		vma = find_vma(mm, pp->addr);
+		if (!vma)
+			goto set_status;
+
+		page = follow_page(vma, pp->addr, FOLL_GET);
+		err = -ENOENT;
+		if (!page)
+			goto set_status;
+
+		if (PageReserved(page))		/* Check for zero page */
+			goto put_and_set;
+
+		pp->page = page;
+		err = page_to_nid(page);
+
+		if (err == pp->node)
+			/*
+			 * Node already in the right place
+			 */
+			goto put_and_set;
+
+		err = -EACCES;
+		if (page_mapcount(page) > 1 &&
+				!migrate_all)
+			goto put_and_set;
+
+		err = isolate_lru_page(page, &pagelist);
+put_and_set:
+		/*
+		 * Either remove the duplicate refcount from
+		 * isolate_lru_page() or drop the page ref if it was
+		 * not isolated.
+		 */
+		put_page(page);
+set_status:
+		pp->status = err;
+	}
+
+	if (!list_empty(&pagelist))
+		err = migrate_pages(&pagelist, new_page_node,
+				(unsigned long)pm);
+	else
+		err = -ENOENT;
+
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * Determine the nodes of a list of pages. The addr in the pm array
+ * must have been set to the virtual address of which we want to determine
+ * the node number.
+ */
+static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
+{
+	down_read(&mm->mmap_sem);
+
+	for ( ; pm->node != MAX_NUMNODES; pm++) {
+		struct vm_area_struct *vma;
+		struct page *page;
+		int err;
+
+		err = -EFAULT;
+		vma = find_vma(mm, pm->addr);
+		if (!vma)
+			goto set_status;
+
+		page = follow_page(vma, pm->addr, 0);
+		err = -ENOENT;
+		/* Use PageReserved to check for zero page */
+		if (!page || PageReserved(page))
+			goto set_status;
+
+		err = page_to_nid(page);
+set_status:
+		pm->status = err;
+	}
+
+	up_read(&mm->mmap_sem);
+	return 0;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
+			const void __user * __user *pages,
+			const int __user *nodes,
+			int __user *status, int flags)
+{
+	int err = 0;
+	int i;
+	struct task_struct *task;
+	nodemask_t task_nodes;
+	struct mm_struct *mm;
+	struct page_to_node *pm = NULL;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+		return -EINVAL;
+
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_pid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm)
+		return -EINVAL;
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_NICE)) {
+		err = -EPERM;
+		goto out2;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+
+	/* Limit nr_pages so that the multiplication may not overflow */
+	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
+		err = -E2BIG;
+		goto out2;
+	}
+
+	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
+	if (!pm) {
+		err = -ENOMEM;
+		goto out2;
+	}
+
+	/*
+	 * Get parameters from user space and initialize the pm
+	 * array. Return various errors if the user did something wrong.
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		const void *p;
+
+		err = -EFAULT;
+		if (get_user(p, pages + i))
+			goto out;
+
+		pm[i].addr = (unsigned long)p;
+		if (nodes) {
+			int node;
+
+			if (get_user(node, nodes + i))
+				goto out;
+
+			err = -ENODEV;
+			if (!node_online(node))
+				goto out;
+
+			err = -EACCES;
+			if (!node_isset(node, task_nodes))
+				goto out;
+
+			pm[i].node = node;
+		}
+	}
+	/* End marker */
+	pm[nr_pages].node = MAX_NUMNODES;
+
+	if (nodes)
+		err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
+	else
+		err = do_pages_stat(mm, pm);
+
+	if (err >= 0)
+		/* Return status information */
+		for (i = 0; i < nr_pages; i++)
+			if (put_user(pm[i].status, status + i))
+				err = -EFAULT;
+
+out:
+	vfree(pm);
+out2:
+	mmput(mm);
+	return err;
+}
+#endif
+
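
A side effect of the implementation above: passing nodes == NULL routes the call to do_pages_stat() instead of do_move_pages(), so the syscall doubles as a query for where pages currently reside. A hedged user-space sketch, under the same __NR_move_pages assumption as the earlier example:

/* Sketch: ask where each page of a buffer resides without moving it.
 * With nodes == NULL, sys_move_pages() fills status[i] with a node id,
 * or a negative errno such as -ENOENT for an unmapped address. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_move_pages
#define __NR_move_pages 1276		/* ia64 number from this patch */
#endif

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	enum { NR = 4 };
	char *buf = malloc(NR * page_size);
	void *pages[NR];
	int status[NR];
	int i;

	memset(buf, 0, NR * page_size);	/* make sure the pages exist */
	for (i = 0; i < NR; i++)
		pages[i] = buf + i * page_size;

	/* nodes == NULL selects the do_pages_stat() path */
	if (syscall(__NR_move_pages, 0, (unsigned long)NR, pages,
		    NULL, status, 0) == 0)
		for (i = 0; i < NR; i++)
			printf("page %d on node %d\n", i, status[i]);
	return 0;
}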