aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@sgi.com>2006-06-23 05:03:55 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-06-23 10:42:53 -0400
commit742755a1d8ce2b548428f7aacf1758b4bba50080 (patch)
tree53426657e14dc19a694d418274c9a6f4dcb8a997 /mm
parent95a402c3847cc16f4ba03013cd01404fa0f14c2e (diff)
[PATCH] page migration: sys_move_pages(): support moving of individual pages
move_pages() is used to move individual pages of a process. The function can be used to determine the location of pages and to move them onto the desired node. move_pages() returns status information for each page. long move_pages(pid, number_of_pages_to_move, addresses_of_pages[], nodes[] or NULL, status[], flags); The addresses_of_pages argument is an array of void * pointing to the pages to be moved. The nodes array contains the node numbers that the pages should be moved to. If a NULL is passed instead of an array then no pages are moved but the status array is updated. The status request may be used to determine the page state before issuing another move_pages() to move pages. The status array will contain the state of all individual page migration attempts when the function terminates. The status array is only valid if move_pages() completed successfully. Possible page states in status[]: 0..MAX_NUMNODES The page is now on the indicated node. -ENOENT Page is not present -EACCES Page is mapped by multiple processes and can only be moved if MPOL_MF_MOVE_ALL is specified. -EPERM The page has been mlocked by a process/driver and cannot be moved. -EBUSY Page is busy and cannot be moved. Try again later. -EFAULT Invalid address (no VMA or zero page). -ENOMEM Unable to allocate memory on target node. -EIO Unable to write back page. The page must be written back in order to move it since the page is dirty and the filesystem does not provide a migration function that would allow the moving of dirty pages. -EINVAL A dirty page cannot be moved. The filesystem does not provide a migration function and has no ability to write back pages. The flags parameter indicates what types of pages to move: MPOL_MF_MOVE Move pages that are only mapped by the process. MPOL_MF_MOVE_ALL Also move pages that are mapped by multiple processes. Requires sufficient capabilities. Possible return codes from move_pages() -ENOENT No pages found that would require moving.
All pages are either already on the target node, not present, had an invalid address or could not be moved because they were mapped by multiple processes. -EINVAL Flags other than MPOL_MF_MOVE(_ALL) specified or an attempt to migrate pages in a kernel thread. -EPERM MPOL_MF_MOVE_ALL specified without sufficient privileges, or an attempt to move a process belonging to another user. -EACCES One of the target nodes is not allowed by the current cpuset. -ENODEV One of the target nodes is not online. -ESRCH Process does not exist. -E2BIG Too many pages to move. -ENOMEM Not enough memory to allocate control array. -EFAULT Parameters could not be accessed. A test program for move_pages() may be found with the patches on ftp.kernel.org:/pub/linux/kernel/people/christoph/pmig/patches-2.6.17-rc4-mm3 From: Christoph Lameter <clameter@sgi.com> Detailed results for sys_move_pages() Pass a pointer to an integer to get_new_page() that may be used to indicate where the completion status of a migration operation should be placed. This allows sys_move_pages() to report back exactly what happened to each page. Wish there would be a better way to do this. Looks a bit hacky. Signed-off-by: Christoph Lameter <clameter@sgi.com> Cc: Hugh Dickins <hugh@veritas.com> Cc: Jes Sorensen <jes@trained-monkey.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Andi Kleen <ak@muc.de> Cc: Michael Kerrisk <mtk-manpages@gmx.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/mempolicy.c4
-rw-r--r--mm/migrate.c268
2 files changed, 266 insertions, 6 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f432642e9e66..05b84acf0bb3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
588 isolate_lru_page(page, pagelist); 588 isolate_lru_page(page, pagelist);
589} 589}
590 590
591static struct page *new_node_page(struct page *page, unsigned long node) 591static struct page *new_node_page(struct page *page, unsigned long node, int **x)
592{ 592{
593 return alloc_pages_node(node, GFP_HIGHUSER, 0); 593 return alloc_pages_node(node, GFP_HIGHUSER, 0);
594} 594}
@@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm,
698 698
699} 699}
700 700
701static struct page *new_vma_page(struct page *page, unsigned long private) 701static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
702{ 702{
703 struct vm_area_struct *vma = (struct vm_area_struct *)private; 703 struct vm_area_struct *vma = (struct vm_area_struct *)private;
704 704
diff --git a/mm/migrate.c b/mm/migrate.c
index 251a8d158257..033a12f4c949 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -25,6 +25,8 @@
25#include <linux/cpu.h> 25#include <linux/cpu.h>
26#include <linux/cpuset.h> 26#include <linux/cpuset.h>
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/mempolicy.h>
29#include <linux/vmalloc.h>
28 30
29#include "internal.h" 31#include "internal.h"
30 32
@@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
62} 64}
63 65
64/* 66/*
65 * migrate_prep() needs to be called after we have compiled the list of pages 67 * migrate_prep() needs to be called before we start compiling a list of pages
66 * to be migrated using isolate_lru_page() but before we begin a series of calls 68 * to be migrated using isolate_lru_page().
67 * to migrate_pages().
68 */ 69 */
69int migrate_prep(void) 70int migrate_prep(void)
70{ 71{
@@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
588 struct page *page, int force) 589 struct page *page, int force)
589{ 590{
590 int rc = 0; 591 int rc = 0;
591 struct page *newpage = get_new_page(page, private); 592 int *result = NULL;
593 struct page *newpage = get_new_page(page, private, &result);
592 594
593 if (!newpage) 595 if (!newpage)
594 return -ENOMEM; 596 return -ENOMEM;
@@ -642,6 +644,12 @@ move_newpage:
642 * then this will free the page. 644 * then this will free the page.
643 */ 645 */
644 move_to_lru(newpage); 646 move_to_lru(newpage);
647 if (result) {
648 if (rc)
649 *result = rc;
650 else
651 *result = page_to_nid(newpage);
652 }
645 return rc; 653 return rc;
646} 654}
647 655
@@ -710,3 +718,255 @@ out:
710 return nr_failed + retry; 718 return nr_failed + retry;
711} 719}
712 720
721#ifdef CONFIG_NUMA
722/*
723 * Move a list of individual pages
724 */
/*
 * One entry per page handled by a move_pages() request. Entries are kept
 * in an array whose last element has node set to MAX_NUMNODES.
 */
struct page_to_node {
	unsigned long addr;	/* virtual address supplied by the caller */
	struct page *page;	/* page found at addr, used to match entries */
	int node;		/* target node; MAX_NUMNODES marks the end */
	int status;		/* node number or negative errno for this entry */
};
731
732static struct page *new_page_node(struct page *p, unsigned long private,
733 int **result)
734{
735 struct page_to_node *pm = (struct page_to_node *)private;
736
737 while (pm->node != MAX_NUMNODES && pm->page != p)
738 pm++;
739
740 if (pm->node == MAX_NUMNODES)
741 return NULL;
742
743 *result = &pm->status;
744
745 return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
746}
747
748/*
749 * Move a set of pages as indicated in the pm array. The addr
750 * field must be set to the virtual address of the page to be moved
751 * and the node number must contain a valid target node.
752 */
753static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
754 int migrate_all)
755{
756 int err;
757 struct page_to_node *pp;
758 LIST_HEAD(pagelist);
759
760 down_read(&mm->mmap_sem);
761
762 /*
763 * Build a list of pages to migrate
764 */
765 migrate_prep();
766 for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
767 struct vm_area_struct *vma;
768 struct page *page;
769
770 /*
771 * A valid page pointer that will not match any of the
772 * pages that will be moved.
773 */
774 pp->page = ZERO_PAGE(0);
775
776 err = -EFAULT;
777 vma = find_vma(mm, pp->addr);
778 if (!vma)
779 goto set_status;
780
781 page = follow_page(vma, pp->addr, FOLL_GET);
782 err = -ENOENT;
783 if (!page)
784 goto set_status;
785
786 if (PageReserved(page)) /* Check for zero page */
787 goto put_and_set;
788
789 pp->page = page;
790 err = page_to_nid(page);
791
792 if (err == pp->node)
793 /*
794 * Node already in the right place
795 */
796 goto put_and_set;
797
798 err = -EACCES;
799 if (page_mapcount(page) > 1 &&
800 !migrate_all)
801 goto put_and_set;
802
803 err = isolate_lru_page(page, &pagelist);
804put_and_set:
805 /*
806 * Either remove the duplicate refcount from
807 * isolate_lru_page() or drop the page ref if it was
808 * not isolated.
809 */
810 put_page(page);
811set_status:
812 pp->status = err;
813 }
814
815 if (!list_empty(&pagelist))
816 err = migrate_pages(&pagelist, new_page_node,
817 (unsigned long)pm);
818 else
819 err = -ENOENT;
820
821 up_read(&mm->mmap_sem);
822 return err;
823}
824
825/*
826 * Determine the nodes of a list of pages. The addr in the pm array
827 * must have been set to the virtual address of which we want to determine
828 * the node number.
829 */
830static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
831{
832 down_read(&mm->mmap_sem);
833
834 for ( ; pm->node != MAX_NUMNODES; pm++) {
835 struct vm_area_struct *vma;
836 struct page *page;
837 int err;
838
839 err = -EFAULT;
840 vma = find_vma(mm, pm->addr);
841 if (!vma)
842 goto set_status;
843
844 page = follow_page(vma, pm->addr, 0);
845 err = -ENOENT;
846 /* Use PageReserved to check for zero page */
847 if (!page || PageReserved(page))
848 goto set_status;
849
850 err = page_to_nid(page);
851set_status:
852 pm->status = err;
853 }
854
855 up_read(&mm->mmap_sem);
856 return 0;
857}
858
859/*
860 * Move a list of pages in the address space of the currently executing
861 * process.
862 */
863asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
864 const void __user * __user *pages,
865 const int __user *nodes,
866 int __user *status, int flags)
867{
868 int err = 0;
869 int i;
870 struct task_struct *task;
871 nodemask_t task_nodes;
872 struct mm_struct *mm;
873 struct page_to_node *pm = NULL;
874
875 /* Check flags */
876 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
877 return -EINVAL;
878
879 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
880 return -EPERM;
881
882 /* Find the mm_struct */
883 read_lock(&tasklist_lock);
884 task = pid ? find_task_by_pid(pid) : current;
885 if (!task) {
886 read_unlock(&tasklist_lock);
887 return -ESRCH;
888 }
889 mm = get_task_mm(task);
890 read_unlock(&tasklist_lock);
891
892 if (!mm)
893 return -EINVAL;
894
895 /*
896 * Check if this process has the right to modify the specified
897 * process. The right exists if the process has administrative
898 * capabilities, superuser privileges or the same
899 * userid as the target process.
900 */
901 if ((current->euid != task->suid) && (current->euid != task->uid) &&
902 (current->uid != task->suid) && (current->uid != task->uid) &&
903 !capable(CAP_SYS_NICE)) {
904 err = -EPERM;
905 goto out2;
906 }
907
908 task_nodes = cpuset_mems_allowed(task);
909
910 /* Limit nr_pages so that the multiplication may not overflow */
911 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
912 err = -E2BIG;
913 goto out2;
914 }
915
916 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
917 if (!pm) {
918 err = -ENOMEM;
919 goto out2;
920 }
921
922 /*
923 * Get parameters from user space and initialize the pm
924 * array. Return various errors if the user did something wrong.
925 */
926 for (i = 0; i < nr_pages; i++) {
927 const void *p;
928
929 err = -EFAULT;
930 if (get_user(p, pages + i))
931 goto out;
932
933 pm[i].addr = (unsigned long)p;
934 if (nodes) {
935 int node;
936
937 if (get_user(node, nodes + i))
938 goto out;
939
940 err = -ENODEV;
941 if (!node_online(node))
942 goto out;
943
944 err = -EACCES;
945 if (!node_isset(node, task_nodes))
946 goto out;
947
948 pm[i].node = node;
949 }
950 }
951 /* End marker */
952 pm[nr_pages].node = MAX_NUMNODES;
953
954 if (nodes)
955 err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
956 else
957 err = do_pages_stat(mm, pm);
958
959 if (err >= 0)
960 /* Return status information */
961 for (i = 0; i < nr_pages; i++)
962 if (put_user(pm[i].status, status + i))
963 err = -EFAULT;
964
965out:
966 vfree(pm);
967out2:
968 mmput(mm);
969 return err;
970}
971#endif
972