From 1e275d406bf6b88e4de6925cf594b64bb2ec49bc Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Fri, 24 Feb 2006 13:04:12 -0800
Subject: [PATCH] page migration: Fix MPOL_INTERLEAVE behavior for migration via mbind()

migrate_pages_to() allocates a list of new pages on the intended target
node or with the intended policy and then uses the list of new pages as
targets for the migration of a list of pages out of place.

When the pages are allocated it is not yet clear which of the
out-of-place pages will be moved to which new page, so we cannot specify
an address as needed by alloc_page_vma().  This causes a problem for
MPOL_INTERLEAVE, which will currently allocate all the pages on the
first node of the set.  If mbind() is used on a vma that has an
MPOL_INTERLEAVE policy, the interleaving of pages may be destroyed.

This patch fixes that by generating a fake address for each
alloc_page_vma() call, which results in a distribution of pages as
prescribed by MPOL_INTERLEAVE.

Lee also noted that the sequence of nodes for the new pages seems to be
inverted, so we also invert the way the lists of pages for migration
are built.

Signed-off-by: Christoph Lameter
Signed-off-by: Lee Schermerhorn
Looks-ok-to: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 880831bd3003..67af4cea1e23 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -552,7 +552,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 		if (isolate_lru_page(page))
-			list_add(&page->lru, pagelist);
+			list_add_tail(&page->lru, pagelist);
 	}
 }
 
@@ -569,6 +569,7 @@ static int migrate_pages_to(struct list_head *pagelist,
 	LIST_HEAD(moved);
 	LIST_HEAD(failed);
 	int err = 0;
+	unsigned long offset = 0;
 	int nr_pages;
 	struct page *page;
 	struct list_head *p;
@@ -576,8 +577,21 @@
 redo:
 	nr_pages = 0;
 	list_for_each(p, pagelist) {
-		if (vma)
-			page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
 		else
 			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 
@@ -585,7 +599,7 @@ redo:
 			err = -ENOMEM;
 			goto out;
 		}
-		list_add(&page->lru, &newlist);
+		list_add_tail(&page->lru, &newlist);
 		nr_pages++;
 		if (nr_pages > MIGRATE_CHUNK_SIZE)
 			break;
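The fake-address trick works because MPOL_INTERLEAVE derives the target
node from the page's offset within the vma (see interleave_nid() and
offset_il_node() in mm/mempolicy.c): passing vma->vm_start for every
allocation therefore pins all new pages to one node, while stepping the
address by PAGE_SIZE walks the nodes round-robin.  The following
user-space sketch models only that arithmetic; interleave_node() and the
flat 0..nr_nodes-1 node numbering are illustrative assumptions, not the
kernel's implementation.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/*
 * Illustrative model of interleave node selection: pick the node from
 * the page offset within the vma, modulo the number of nodes. The real
 * kernel helpers (interleave_nid()/offset_il_node()) walk a nodemask
 * rather than assuming nodes are numbered 0..nr_nodes-1.
 */
static int interleave_node(unsigned long addr, unsigned long vm_start,
			   int nr_nodes)
{
	return (int)(((addr - vm_start) >> PAGE_SHIFT) % nr_nodes);
}

int main(void)
{
	unsigned long vm_start = 0x40000000UL;
	unsigned long offset = 0;
	int i;

	/* Old behavior: the same address every time lands on one node. */
	for (i = 0; i < 4; i++)
		printf("fixed address -> node %d\n",
		       interleave_node(vm_start, vm_start, 4));

	/* Patched behavior: a growing fake offset spreads the pages. */
	for (i = 0; i < 4; i++, offset += PAGE_SIZE)
		printf("faked address -> node %d\n",
		       interleave_node(vm_start + offset, vm_start, 4));

	return 0;
}

Compiled and run, the first loop prints node 0 four times while the
second prints nodes 0, 1, 2, 3 in turn -- the distribution the patch
restores.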
From d4f7796e9b387e471ab0e8ed4e0c2bd616b3c193 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Fri, 24 Feb 2006 13:04:22 -0800
Subject: [PATCH] vmscan: fix zone_reclaim

- PF_SWAPWRITE needs to be set for RECLAIM_SWAP to be able to write out
  pages to swap.  Currently RECLAIM_SWAP may not do that.

- Remove the setting of nr_reclaimed after slab reclaim, since the slab
  shrinking code does not use it and the existing nr_reclaimed count is
  just right for the intended follow-up action.

Signed-off-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1838c15ca4fd..b0af7593d01e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1908,7 +1908,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 
 	cond_resched();
-	p->flags |= PF_MEMALLOC;
+	/*
+	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
+	 * and we also need to be able to write out pages for RECLAIM_WRITE
+	 * and RECLAIM_SWAP.
+	 */
+	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
@@ -1932,11 +1937,10 @@
 		 * a long time.
 		 */
 		shrink_slab(sc.nr_scanned, gfp_mask, order);
-		sc.nr_reclaimed = 1;	/* Avoid getting the off node timeout */
 	}
 
 	p->reclaim_state = NULL;
-	current->flags &= ~PF_MEMALLOC;
+	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 
 	if (sc.nr_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;
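A side note on the flag handling above: the unconditional clear at the
end also drops PF_SWAPWRITE if the calling task already had it set on
entry.  That appears harmless here, since zone_reclaim() runs in the
context of an ordinary allocating task, but the defensive form of the
idiom saves and restores the pre-existing bits.  A sketch over a plain
flags word follows; set_flags_save()/restore_flags() are invented names,
and the PF_* values are only stand-ins for the real definitions in
include/linux/sched.h.

#include <stdio.h>

/* Flag values are illustrative; the real ones live in include/linux/sched.h. */
#define PF_MEMALLOC	0x00000800UL	/* may dip into memory reserves */
#define PF_SWAPWRITE	0x00800000UL	/* allowed to write pages to swap */

/* Set the requested bits, remembering which of them were already set. */
static unsigned long set_flags_save(unsigned long *flags, unsigned long bits)
{
	unsigned long old = *flags & bits;

	*flags |= bits;
	return old;
}

/* Clear the requested bits, then re-apply those set before the call. */
static void restore_flags(unsigned long *flags, unsigned long bits,
			  unsigned long old)
{
	*flags = (*flags & ~bits) | old;
}

int main(void)
{
	unsigned long flags = PF_SWAPWRITE;	/* already set on entry */
	unsigned long old;

	old = set_flags_save(&flags, PF_MEMALLOC | PF_SWAPWRITE);
	/* ... reclaim work would run here with both bits set ... */
	restore_flags(&flags, PF_MEMALLOC | PF_SWAPWRITE, old);

	printf("PF_SWAPWRITE preserved: %s\n",
	       (flags & PF_SWAPWRITE) ? "yes" : "no");
	return 0;
}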