Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--   mm/mempolicy.c | 189
1 file changed, 164 insertions(+), 25 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 73790188b0eb..880831bd3003 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -95,6 +95,9 @@
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)	/* Gather statistics */
 
+/* The number of pages to migrate per call to migrate_pages() */
+#define MIGRATE_CHUNK_SIZE 256
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -129,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
+
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd;
+	int num, max, nd, k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes)
-		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+	/* First put in the highest zones from all nodes, then all the next
+	   lower zones etc. Avoid empty zones because the memory allocator
+	   doesn't like them. If you implement node hot removal you
+	   have to fix that. */
+	for (k = policy_zone; k >= 0; k--) {
+		for_each_node_mask(nd, *nodes) {
+			struct zone *z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				zl->zones[num++] = z;
+		}
+	}
 	zl->zones[num] = NULL;
 	return zl;
 }
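The ordering produced by the reworked bind_zonelist() is easier to see in a standalone model. The sketch below is plain userspace C, not kernel code: the node count, zone names and present_pages numbers are all invented, and the nested loops simply mirror the "walk zone types from policy_zone downwards, then every allowed node, skipping empty zones" structure of the hunk above.

/* Userspace model of the new bind_zonelist() ordering.  All data here is
 * made up for illustration; only the loop structure matches the patch. */
#include <stdio.h>

#define MAX_NR_ZONES 3          /* e.g. DMA, Normal, HighMem */
#define NR_NODES     2
static const char *zone_name[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };

/* present_pages[node][zone]: pretend node 1 has no DMA or HighMem memory */
static unsigned long present_pages[NR_NODES][MAX_NR_ZONES] = {
        { 4096, 262144, 131072 },
        {    0, 262144,      0 },
};

int main(void)
{
        int policy_zone = MAX_NR_ZONES - 1;     /* highest usable zone type */
        int k, nd, num = 0;

        /* Highest zones from all nodes first, then the next lower zones. */
        for (k = policy_zone; k >= 0; k--)
                for (nd = 0; nd < NR_NODES; nd++)
                        if (present_pages[nd][k] > 0)
                                printf("zones[%d] = node %d, %s\n",
                                       num++, nd, zone_name[k]);
        return 0;
}

With the data above the model prints node 0 HighMem, node 0 Normal, node 1 Normal, node 0 DMA; node 1's empty DMA and HighMem zones never appear, which is exactly the property the comment in the hunk calls out.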
@@ -543,24 +556,91 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	}
 }
 
-static int swap_pages(struct list_head *pagelist)
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+static int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest)
 {
+	LIST_HEAD(newlist);
 	LIST_HEAD(moved);
 	LIST_HEAD(failed);
-	int n;
+	int err = 0;
+	int nr_pages;
+	struct page *page;
+	struct list_head *p;
 
-	n = migrate_pages(pagelist, NULL, &moved, &failed);
-	putback_lru_pages(&failed);
-	putback_lru_pages(&moved);
+redo:
+	nr_pages = 0;
+	list_for_each(p, pagelist) {
+		if (vma)
+			page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+		else
+			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 
-	return n;
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		list_add(&page->lru, &newlist);
+		nr_pages++;
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
+			break;
+	}
+	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+	putback_lru_pages(&moved);	/* Call release pages instead ?? */
+
+	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+		goto redo;
+out:
+	/* Return leftover allocated pages */
+	while (!list_empty(&newlist)) {
+		page = list_entry(newlist.next, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	list_splice(&failed, pagelist);
+	if (err < 0)
+		return err;
+
+	/* Calculate number of leftover pages */
+	nr_pages = 0;
+	list_for_each(p, pagelist)
+		nr_pages++;
+	return nr_pages;
 }
 
 /*
- * For now migrate_pages simply swaps out the pages from nodes that are in
- * the source set but not in the target set. In the future, we would
- * want a function that moves pages between the two nodesets in such
- * a way as to preserve the physical layout as much as possible.
+ * Migrate pages from one node to a target node.
+ * Returns error or the number of pages not migrated.
+ */
+int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
+{
+	nodemask_t nmask;
+	LIST_HEAD(pagelist);
+	int err = 0;
+
+	nodes_clear(nmask);
+	node_set(source, nmask);
+
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+	if (!list_empty(&pagelist)) {
+		err = migrate_pages_to(&pagelist, NULL, dest);
+		if (!list_empty(&pagelist))
+			putback_lru_pages(&pagelist);
+	}
+	return err;
+}
+
+/*
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
  *
  * Returns the number of page that could not be moved.
  */
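The redo loop in migrate_pages_to() caps each pass at a chunk of roughly MIGRATE_CHUNK_SIZE freshly allocated target pages and repeats until the source list is drained or progress stops, so a large migration never pins an unbounded number of new pages at once. A rough userspace model of that chunking; migrate_chunk() and its failure rate are made-up stand-ins for the real migrate_pages() call:

/* Userspace model of the chunked migration loop.  Not kernel code. */
#include <stdio.h>

#define MIGRATE_CHUNK_SIZE 256

/* Pretend migration: returns how many pages of the chunk were NOT moved. */
static int migrate_chunk(int chunk)
{
        return chunk / 64;      /* arbitrary failure rate for the model */
}

int main(void)
{
        int remaining = 1000;   /* pages still on the source list */
        int failed = 0;         /* pages that could not be moved */
        int pass = 0;

        while (remaining > 0) {
                int chunk = remaining < MIGRATE_CHUNK_SIZE ?
                            remaining : MIGRATE_CHUNK_SIZE;
                int left = migrate_chunk(chunk);

                failed += left;
                remaining -= chunk;
                printf("pass %d: tried %d, failed %d\n", ++pass, chunk, left);
        }
        printf("pages not migrated: %d\n", failed);
        return 0;
}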
@@ -568,22 +648,76 @@ int do_migrate_pages(struct mm_struct *mm,
 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 {
 	LIST_HEAD(pagelist);
-	int count = 0;
-	nodemask_t nodes;
+	int busy = 0;
+	int err = 0;
+	nodemask_t tmp;
 
-	nodes_andnot(nodes, *from_nodes, *to_nodes);
+	down_read(&mm->mmap_sem);
 
-	down_read(&mm->mmap_sem);
-	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
-			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	/*
+	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+	 * bit in 'tmp', and return that <source, dest> pair for migration.
+	 * The pair of nodemasks 'to' and 'from' define the map.
+	 *
+	 * If no pair of bits is found that way, fallback to picking some
+	 * pair of 'source' and 'dest' bits that are not the same. If the
+	 * 'source' and 'dest' bits are the same, this represents a node
+	 * that will be migrating to itself, so no pages need move.
+	 *
+	 * If no bits are left in 'tmp', or if all remaining bits left
+	 * in 'tmp' correspond to the same bit in 'to', return false
+	 * (nothing left to migrate).
+	 *
+	 * This lets us pick a pair of nodes to migrate between, such that
+	 * if possible the dest node is not already occupied by some other
+	 * source node, minimizing the risk of overloading the memory on a
+	 * node that would happen if we migrated incoming memory to a node
+	 * before migrating outgoing memory source that same node.
+	 *
+	 * A single scan of tmp is sufficient. As we go, we remember the
+	 * most recent <s, d> pair that moved (s != d). If we find a pair
+	 * that not only moved, but what's better, moved to an empty slot
+	 * (d is not set in tmp), then we break out then, with that pair.
+	 * Otherwise when we finish scannng from_tmp, we at least have the
+	 * most recent <s, d> pair that moved. If we get all the way through
+	 * the scan of tmp without finding any node that moved, much less
+	 * moved to an empty node, then there is nothing left worth migrating.
+	 */
 
-	if (!list_empty(&pagelist)) {
-		count = swap_pages(&pagelist);
-		putback_lru_pages(&pagelist);
+	tmp = *from_nodes;
+	while (!nodes_empty(tmp)) {
+		int s,d;
+		int source = -1;
+		int dest = 0;
+
+		for_each_node_mask(s, tmp) {
+			d = node_remap(s, *from_nodes, *to_nodes);
+			if (s == d)
+				continue;
+
+			source = s;	/* Node moved. Memorize */
+			dest = d;
+
+			/* dest not in remaining from nodes? */
+			if (!node_isset(dest, tmp))
+				break;
+		}
+		if (source == -1)
+			break;
+
+		node_clear(source, tmp);
+		err = migrate_to_node(mm, source, dest, flags);
+		if (err > 0)
+			busy += err;
+		if (err < 0)
+			break;
 	}
 
 	up_read(&mm->mmap_sem);
-	return count;
+	if (err < 0)
+		return err;
+	return busy;
 }
 
 long do_mbind(unsigned long start, unsigned long len,
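The <source, dest> pair-selection loop is the interesting part of this hunk. Below is a self-contained userspace model of it: nodemask_t is replaced by a plain unsigned long bitmask, and node_remap_model() is a hand-rolled imitation of node_remap()'s positional remapping (Nth set bit of 'from' maps to the Nth set bit of 'to'). The from/to masks are arbitrary example values, not taken from the patch.

/* Userspace model of the pair selection in do_migrate_pages().  Not kernel
 * code; builds with GCC/Clang (__builtin_popcountl). */
#include <stdio.h>

static int nth_set_bit(unsigned long mask, int n)
{
        int bit;

        for (bit = 0; bit < (int)(8 * sizeof(mask)); bit++)
                if ((mask & (1UL << bit)) && n-- == 0)
                        return bit;
        return -1;
}

/* Positional remap, mimicking what node_remap() does for nodemasks. */
static int node_remap_model(int s, unsigned long from, unsigned long to)
{
        int n = 0, bit;

        for (bit = 0; bit < s; bit++)
                if (from & (1UL << bit))
                        n++;
        return nth_set_bit(to, n % __builtin_popcountl(to));
}

int main(void)
{
        unsigned long from = 0x0b;      /* nodes 0, 1, 3 */
        unsigned long to   = 0x34;      /* nodes 2, 4, 5 */
        unsigned long tmp  = from;

        while (tmp) {
                int s, d, source = -1, dest = 0;

                for (s = 0; s < 64; s++) {
                        if (!(tmp & (1UL << s)))
                                continue;
                        d = node_remap_model(s, from, to);
                        if (s == d)
                                continue;
                        source = s;
                        dest = d;
                        /* prefer a dest that is not itself a pending source */
                        if (!(tmp & (1UL << dest)))
                                break;
                }
                if (source == -1)
                        break;
                tmp &= ~(1UL << source);
                printf("migrate node %d -> node %d\n", source, dest);
        }
        return 0;
}

For from = {0,1,3} and to = {2,4,5} the model prints 0 -> 2, 1 -> 4, 3 -> 5, each destination chosen before it could be filled while still acting as a pending source, which is the overload-avoidance property the new comment describes.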
@@ -643,8 +777,9 @@ long do_mbind(unsigned long start, unsigned long len,
 		int nr_failed = 0;
 
 		err = mbind_range(vma, start, end, new);
+
 		if (!list_empty(&pagelist))
-			nr_failed = swap_pages(&pagelist);
+			nr_failed = migrate_pages_to(&pagelist, vma, -1);
 
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
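This is the path reached when userspace calls mbind(2) with MPOL_MF_MOVE: pages collected earlier in do_mbind() are now handed to migrate_pages_to() instead of being swapped out, and with MPOL_MF_STRICT any leftovers turn into -EIO. A minimal caller sketch, assuming a libnuma numaif.h that exposes the MPOL_MF_* flags (link with -lnuma):

/* Userspace sketch only; error handling kept minimal. */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        size_t len = 64 * page;
        void *buf = aligned_alloc(page, len);
        unsigned long nodemask = 1UL << 0;      /* bind to node 0 */

        if (!buf)
                return 1;
        memset(buf, 0, len);                    /* fault the pages in first */

        if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
                  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
                perror("mbind");        /* EIO if some pages could not move */

        free(buf);
        return 0;
}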
@@ -673,6 +808,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
+	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+		return -EINVAL;
 
 	nlongs = BITS_TO_LONGS(maxnode);
 	if ((maxnode % BITS_PER_LONG) == 0)
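The new bound caps the user-supplied nodemask at PAGE_SIZE * 8 bits before the kernel tries to copy it; maxnode is the argument userspace passes to mbind(2) and migrate_pages(2), and an oversized value now fails with EINVAL. A minimal sketch of a migrate_pages(2) call with a sane maxnode, assuming a libnuma that provides the wrapper (link with -lnuma):

/* Userspace sketch only.  Moves the calling process's pages off node 0
 * onto node 1; node numbers are illustrative. */
#include <numaif.h>
#include <stdio.h>

int main(void)
{
        unsigned long old_nodes = 1UL << 0;     /* move everything off node 0 */
        unsigned long new_nodes = 1UL << 1;     /* ...and onto node 1 */
        unsigned long maxnode = 8 * sizeof(unsigned long);
        long left;

        left = migrate_pages(0 /* self */, maxnode, &old_nodes, &new_nodes);
        if (left < 0)
                perror("migrate_pages");
        else
                printf("%ld pages could not be moved\n", left);
        return 0;
}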
@@ -1034,6 +1171,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 		return interleave_nodes(pol);
 }
 
+#ifdef CONFIG_HUGETLBFS
 /* Return a zonelist suitable for a huge page allocation. */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 {
@@ -1047,6 +1185,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 	}
 	return zonelist_policy(GFP_HIGHUSER, pol);
 }
+#endif
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */