-rw-r--r--  Documentation/vm/page_migration |  29
-rw-r--r--  arch/ia64/kernel/entry.S        |   2
-rw-r--r--  include/asm-ia64/unistd.h       |   2
-rw-r--r--  include/linux/migrate.h         |   2
-rw-r--r--  include/linux/syscalls.h        |   5
-rw-r--r--  kernel/sys_ni.c                 |   1
-rw-r--r--  mm/mempolicy.c                  |   4
-rw-r--r--  mm/migrate.c                    | 268
8 files changed, 288 insertions, 25 deletions
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index 0a5d5fb18854..99f89aa10169 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -26,8 +26,13 @@ a process are located. See also the numa_maps manpage in the numactl package.
 Manual migration is useful if for example the scheduler has relocated
 a process to a processor on a distant node. A batch scheduler or an
 administrator may detect the situation and move the pages of the process
-nearer to the new processor. At some point in the future we may have
-some mechanism in the scheduler that will automatically move the pages.
+nearer to the new processor. The kernel itself only provides
+manual page migration support. Automatic page migration may be implemented
+through user space processes that move pages. A special function call
+"move_pages" allows the moving of individual pages within a process.
+A NUMA profiler may, for example, obtain a log showing frequent off-node
+accesses and may use the result to move pages to more advantageous
+locations.
 
 Larger installations usually partition the system using cpusets into
 sections of nodes. Paul Jackson has equipped cpusets with the ability to
@@ -62,22 +67,14 @@ A. In kernel use of migrate_pages()
    It also prevents the swapper or other scans to encounter
    the page.
 
-2. Generate a list of newly allocates pages. These pages will contain the
-   contents of the pages from the first list after page migration is
-   complete.
+2. We need to have a function of type new_page_t that can be
+   passed to migrate_pages(). This function should figure out
+   how to allocate the correct new page given the old page.
 
 3. The migrate_pages() function is called which attempts
-   to do the migration. It returns the moved pages in the
-   list specified as the third parameter and the failed
-   migrations in the fourth parameter. When the function
-   returns the first list will contain the pages that could still be retried.
-
-4. The leftover pages of various types are returned
-   to the LRU using putback_to_lru_pages() or otherwise
-   disposed of. The pages will still have the refcount as
-   increased by isolate_lru_pages() if putback_to_lru_pages() is not
-   used! The kernel may want to handle the various cases of failures in
-   different ways.
+   to do the migration. It will call the function to allocate
+   the new page for each page that is considered for
+   moving.
 
 B. How migrate_pages() works
 ----------------------------
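
The "move_pages" function call mentioned above is the syscall this patch
wires up. As a sketch of the user space side (not part of the patch; it
assumes a libc without a wrapper, hence raw syscall(2), and it copies the
MPOL_MF_MOVE value from the kernel's include/linux/mempolicy.h), a
profiler might move one page of its own process like this:

/* Sketch only: move one page of the calling process (pid 0) to node 1.
 * Assumes __NR_move_pages exists for this architecture (1276 on ia64
 * per this patch) and that node 1 is online and allowed by the cpuset.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>

#define MPOL_MF_MOVE	(1 << 1)	/* from include/linux/mempolicy.h */

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *pages[1];
	int nodes[1] = { 1 };		/* target node, an assumption */
	int status[1];
	char *buf;

	if (posix_memalign((void **)&buf, psz, psz))
		return 1;
	buf[0] = 1;			/* fault the page in first */

	pages[0] = buf;
	if (syscall(__NR_move_pages, 0, 1UL, pages, nodes, status,
		    MPOL_MF_MOVE) < 0) {
		perror("move_pages");
		return 1;
	}
	printf("page now on node %d\n", status[0]);
	return 0;
}

Per mm/migrate.c below, each status slot receives the node the page ended
up on, or a negative error code for that page.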
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index bcb80ca5cf40..32c999f58d12 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1584,7 +1584,7 @@ sys_call_table:
 	data8 sys_keyctl
 	data8 sys_ioprio_set
 	data8 sys_ioprio_get			// 1275
-	data8 sys_ni_syscall
+	data8 sys_move_pages
 	data8 sys_inotify_init
 	data8 sys_inotify_add_watch
 	data8 sys_inotify_rm_watch
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 632f2eedf72c..bb0eb727dcd0 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -265,7 +265,7 @@
 #define __NR_keyctl			1273
 #define __NR_ioprio_set			1274
 #define __NR_ioprio_get			1275
-/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */
+#define __NR_move_pages			1276
 #define __NR_inotify_init		1277
 #define __NR_inotify_add_watch		1278
 #define __NR_inotify_rm_watch		1279
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 5b95d6568dc4..5dba23a1c0d0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -3,7 +3,7 @@
 
 #include <linux/mm.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private);
+typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
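
The new third parameter lets an allocator hand back a location where
per-page status should be stored; unmap_and_move() in mm/migrate.c below
writes either the error code or the new page's node id through it. A
minimal conforming callback, hypothetical but mirroring new_node_page()
in mm/mempolicy.c, could look like:

/* Hypothetical new_page_t implementation: allocate the replacement page
 * on the node passed via 'private' and report no per-page status, which
 * is fine because unmap_and_move() checks for a NULL result pointer.
 */
static struct page *new_fixed_node_page(struct page *page,
					unsigned long private, int **result)
{
	return alloc_pages_node((int)private, GFP_HIGHUSER, 0);
}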
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index bd67a4413df7..7e3f23490918 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -516,6 +516,11 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *from,
				const unsigned long __user *to);
+asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
+				const void __user * __user *pages,
+				const int __user *nodes,
+				int __user *status,
+				int flags);
 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
				unsigned long mode,
				unsigned long __user *nmask,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195040f1..597229749dec 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
 cond_syscall(sys_migrate_pages);
+cond_syscall(sys_move_pages);
 cond_syscall(sys_chown16);
 cond_syscall(sys_fchown16);
 cond_syscall(sys_getegid16);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f432642e9e66..05b84acf0bb3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	isolate_lru_page(page, pagelist);
 }
 
-static struct page *new_node_page(struct page *page, unsigned long node)
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 {
 	return alloc_pages_node(node, GFP_HIGHUSER, 0);
 }
@@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm,
 
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private)
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 {
 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 251a8d158257..033a12f4c949 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -25,6 +25,8 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/writeback.h>
+#include <linux/mempolicy.h>
+#include <linux/vmalloc.h>
 
 #include "internal.h"
 
@@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
 }
 
 /*
- * migrate_prep() needs to be called after we have compiled the list of pages
- * to be migrated using isolate_lru_page() but before we begin a series of calls
- * to migrate_pages().
+ * migrate_prep() needs to be called before we start compiling a list of pages
+ * to be migrated using isolate_lru_page().
  */
 int migrate_prep(void)
 {
@@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 			struct page *page, int force)
 {
 	int rc = 0;
-	struct page *newpage = get_new_page(page, private);
+	int *result = NULL;
+	struct page *newpage = get_new_page(page, private, &result);
 
 	if (!newpage)
 		return -ENOMEM;
@@ -642,6 +644,12 @@ move_newpage:
 	 * then this will free the page.
 	 */
 	move_to_lru(newpage);
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(newpage);
+	}
 	return rc;
 }
 
@@ -710,3 +718,255 @@ out:
 	return nr_failed + retry;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Move a list of individual pages
+ */
+struct page_to_node {
+	unsigned long addr;
+	struct page *page;
+	int node;
+	int status;
+};
+
+static struct page *new_page_node(struct page *p, unsigned long private,
+		int **result)
+{
+	struct page_to_node *pm = (struct page_to_node *)private;
+
+	while (pm->node != MAX_NUMNODES && pm->page != p)
+		pm++;
+
+	if (pm->node == MAX_NUMNODES)
+		return NULL;
+
+	*result = &pm->status;
+
+	return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
+}
+
+/*
+ * Move a set of pages as indicated in the pm array. The addr
+ * field must be set to the virtual address of the page to be moved
+ * and the node number must contain a valid target node.
+ */
+static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
+				int migrate_all)
+{
+	int err;
+	struct page_to_node *pp;
+	LIST_HEAD(pagelist);
+
+	down_read(&mm->mmap_sem);
+
+	/*
+	 * Build a list of pages to migrate
+	 */
+	migrate_prep();
+	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
+		struct vm_area_struct *vma;
+		struct page *page;
+
+		/*
+		 * A valid page pointer that will not match any of the
+		 * pages that will be moved.
+		 */
+		pp->page = ZERO_PAGE(0);
+
+		err = -EFAULT;
+		vma = find_vma(mm, pp->addr);
+		if (!vma)
+			goto set_status;
+
+		page = follow_page(vma, pp->addr, FOLL_GET);
+		err = -ENOENT;
+		if (!page)
+			goto set_status;
+
+		if (PageReserved(page))		/* Check for zero page */
+			goto put_and_set;
+
+		pp->page = page;
+		err = page_to_nid(page);
+
+		if (err == pp->node)
+			/*
+			 * Node already in the right place
+			 */
+			goto put_and_set;
+
+		err = -EACCES;
+		if (page_mapcount(page) > 1 &&
+				!migrate_all)
+			goto put_and_set;
+
+		err = isolate_lru_page(page, &pagelist);
+put_and_set:
+		/*
+		 * Either remove the duplicate refcount from
+		 * isolate_lru_page() or drop the page ref if it was
+		 * not isolated.
+		 */
+		put_page(page);
+set_status:
+		pp->status = err;
+	}
+
+	if (!list_empty(&pagelist))
+		err = migrate_pages(&pagelist, new_page_node,
+				(unsigned long)pm);
+	else
+		err = -ENOENT;
+
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * Determine the nodes of a list of pages. The addr in the pm array
+ * must have been set to the virtual address of which we want to determine
+ * the node number.
+ */
+static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
+{
+	down_read(&mm->mmap_sem);
+
+	for ( ; pm->node != MAX_NUMNODES; pm++) {
+		struct vm_area_struct *vma;
+		struct page *page;
+		int err;
+
+		err = -EFAULT;
+		vma = find_vma(mm, pm->addr);
+		if (!vma)
+			goto set_status;
+
+		page = follow_page(vma, pm->addr, 0);
+		err = -ENOENT;
+		/* Use PageReserved to check for zero page */
+		if (!page || PageReserved(page))
+			goto set_status;
+
+		err = page_to_nid(page);
+set_status:
+		pm->status = err;
+	}
+
+	up_read(&mm->mmap_sem);
+	return 0;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
+			const void __user * __user *pages,
+			const int __user *nodes,
+			int __user *status, int flags)
+{
+	int err = 0;
+	int i;
+	struct task_struct *task;
+	nodemask_t task_nodes;
+	struct mm_struct *mm;
+	struct page_to_node *pm = NULL;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+		return -EINVAL;
+
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_pid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm)
+		return -EINVAL;
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_NICE)) {
+		err = -EPERM;
+		goto out2;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+
+	/* Limit nr_pages so that the multiplication may not overflow */
+	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
+		err = -E2BIG;
+		goto out2;
+	}
+
+	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
+	if (!pm) {
+		err = -ENOMEM;
+		goto out2;
+	}
+
+	/*
+	 * Get parameters from user space and initialize the pm
+	 * array. Return various errors if the user did something wrong.
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		const void *p;
+
+		err = -EFAULT;
+		if (get_user(p, pages + i))
+			goto out;
+
+		pm[i].addr = (unsigned long)p;
+		if (nodes) {
+			int node;
+
+			if (get_user(node, nodes + i))
+				goto out;
+
+			err = -ENODEV;
+			if (!node_online(node))
+				goto out;
+
+			err = -EACCES;
+			if (!node_isset(node, task_nodes))
+				goto out;
+
+			pm[i].node = node;
+		}
+	}
+	/* End marker */
+	pm[nr_pages].node = MAX_NUMNODES;
+
+	if (nodes)
+		err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
+	else
+		err = do_pages_stat(mm, pm);
+
+	if (err >= 0)
+		/* Return status information */
+		for (i = 0; i < nr_pages; i++)
+			if (put_user(pm[i].status, status + i))
+				err = -EFAULT;
+
+out:
+	vfree(pm);
+out2:
+	mmput(mm);
+	return err;
+}
+#endif
+
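
One consequence of the control flow above: passing nodes == NULL routes
the request through do_pages_stat(), so the same syscall doubles as a
page-location query. A hedged sketch (addr1 and addr2 are hypothetical
page-aligned addresses already faulted in by the calling process):

void *pages[2] = { addr1, addr2 };
int status[2];

if (syscall(__NR_move_pages, 0, 2UL, pages, NULL, status, 0) == 0) {
	/* status[i] is the node of pages[i], or -EFAULT / -ENOENT */
}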