From 2ae88149a27cadf2840e0ab8155bef13be285c03 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 28 Oct 2006 10:38:23 -0700 Subject: [PATCH] mm: clean up pagecache allocation - Consolidate page_cache_alloc - Fix splice: only the pagecache pages and filesystem data need to use mapping_gfp_mask. - Fix grab_cache_page_nowait: same as splice, also honour NUMA placement. Signed-off-by: Nick Piggin Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index cb26e33fd0ff..7b84dc814347 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -467,25 +467,15 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, } #ifdef CONFIG_NUMA -struct page *page_cache_alloc(struct address_space *x) +struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, mapping_gfp_mask(x), 0); + return alloc_pages_node(n, gfp, 0); } - return alloc_pages(mapping_gfp_mask(x), 0); + return alloc_pages(gfp, 0); } -EXPORT_SYMBOL(page_cache_alloc); - -struct page *page_cache_alloc_cold(struct address_space *x) -{ - if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); - } - return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); -} -EXPORT_SYMBOL(page_cache_alloc_cold); +EXPORT_SYMBOL(__page_cache_alloc); #endif static int __sleep_on_page_lock(void *word) @@ -826,7 +816,6 @@ struct page * grab_cache_page_nowait(struct address_space *mapping, unsigned long index) { struct page *page = find_get_page(mapping, index); - gfp_t gfp_mask; if (page) { if (!TestSetPageLocked(page)) @@ -834,9 +823,8 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) page_cache_release(page); return NULL; } - gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; - page = alloc_pages(gfp_mask, 0); - if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) { + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); + if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { page_cache_release(page); page = NULL; } -- cgit v1.2.2 From 3bb1a852ab6c9cdf211a2f4a2f502340c8c38eca Mon Sep 17 00:00:00 2001 From: Martin Bligh Date: Sat, 28 Oct 2006 10:38:24 -0700 Subject: [PATCH] vmscan: Fix temp_priority race The temp_priority field in zone is racy, as we can walk through a reclaim path, and just before we copy it into prev_priority, it can be overwritten (say with DEF_PRIORITY) by another reclaimer. The same bug is contained in both try_to_free_pages and balance_pgdat, but it is fixed slightly differently. In balance_pgdat, we keep a separate priority record per zone in a local array. In try_to_free_pages there is no need to do this, as the priority level is the same for all zones that we reclaim from. Impact of this bug is that temp_priority is copied into prev_priority, and setting this artificially high causes reclaimers to set distress artificially low. They then fail to reclaim mapped pages, when they are, in fact, under severe memory pressure (their priority may be as low as 0). This causes the OOM killer to fire incorrectly. From: Andrew Morton __zone_reclaim() isn't modifying zone->prev_priority. But zone->prev_priority is used in the decision whether or not to bring mapped pages onto the inactive list. 
Hence there's a risk here that __zone_reclaim() will fail because zone->prev_priority is large (ie: low urgency) and lots of mapped pages end up stuck on the active list. Fix that up by decreasing (ie making more urgent) zone->prev_priority as __zone_reclaim() scans the zone's pages. This bug perhaps explains why ZONE_RECLAIM_PRIORITY was created. It should be possible to remove that now, and to just start out at DEF_PRIORITY? Cc: Nick Piggin Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/vmscan.c | 55 +++++++++++++++++++++++++++++++++++++++++-------------- mm/vmstat.c | 2 -- 3 files changed, 42 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f5fc45472d5c..ecf853b5e30e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2407,7 +2407,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + zone->prev_priority = DEF_PRIORITY; zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); diff --git a/mm/vmscan.c b/mm/vmscan.c index f05527bf792b..b32560ead5c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -723,6 +723,20 @@ done: return nr_reclaimed; } +/* + * We are about to scan this zone at a certain priority level. If that priority + * level is smaller (ie: more urgent) than the previous priority, then note + * that priority level within the zone. This is done so that when the next + * process comes in to scan this zone, it will immediately start out at this + * priority level rather than having to build up its own scanning priority. + * Here, this priority affects only the reclaim-mapped threshold. + */ +static inline void note_zone_scanning_priority(struct zone *zone, int priority) +{ + if (priority < zone->prev_priority) + zone->prev_priority = priority; +} + static inline int zone_is_near_oom(struct zone *zone) { return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; @@ -972,9 +986,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = priority; - if (zone->prev_priority > priority) - zone->prev_priority = priority; + note_zone_scanning_priority(zone, priority); if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ @@ -1024,7 +1036,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = DEF_PRIORITY; lru_pages += zone->nr_active + zone->nr_inactive; } @@ -1065,13 +1076,22 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) if (!sc.all_unreclaimable) ret = 1; out: + /* + * Now that we've scanned all the zones at this priority level, note + * that level within the zone so that the next thread which performs + * scanning of this zone will immediately start out at this priority + * level. This affects only the decision whether or not to bring + * mapped pages onto the inactive list. 
+ */ + if (priority < 0) + priority = 0; for (i = 0; zones[i] != 0; i++) { struct zone *zone = zones[i]; if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->prev_priority = zone->temp_priority; + zone->prev_priority = priority; } return ret; } @@ -1111,6 +1131,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, }; + /* + * temp_priority is used to remember the scanning priority at which + * this zone was successfully refilled to free_pages == pages_high. + */ + int temp_priority[MAX_NR_ZONES]; loop_again: total_scanned = 0; @@ -1118,11 +1143,8 @@ loop_again: sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->temp_priority = DEF_PRIORITY; - } + for (i = 0; i < pgdat->nr_zones; i++) + temp_priority[i] = DEF_PRIORITY; for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -1183,10 +1205,9 @@ scan: if (!zone_watermark_ok(zone, order, zone->pages_high, end_zone, 0)) all_zones_ok = 0; - zone->temp_priority = priority; - if (zone->prev_priority > priority) - zone->prev_priority = priority; + temp_priority[i] = priority; sc.nr_scanned = 0; + note_zone_scanning_priority(zone, priority); nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -1226,10 +1247,15 @@ scan: break; } out: + /* + * Note within each zone the priority level at which this zone was + * brought into a happy state. So that the next thread which scans this + * zone will start out at that priority level. + */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; - zone->prev_priority = zone->temp_priority; + zone->prev_priority = temp_priority[i]; } if (!all_zones_ok) { cond_resched(); @@ -1614,6 +1640,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ priority = ZONE_RECLAIM_PRIORITY; do { + note_zone_scanning_priority(zone, priority); nr_reclaimed += shrink_zone(priority, zone, &sc); priority--; } while (priority >= 0 && nr_reclaimed < nr_pages); diff --git a/mm/vmstat.c b/mm/vmstat.c index 45b124e012f5..8614e8f6743b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -587,11 +587,9 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, "\n all_unreclaimable: %u" "\n prev_priority: %i" - "\n temp_priority: %i" "\n start_pfn: %lu", zone->all_unreclaimable, zone->prev_priority, - zone->temp_priority, zone->zone_start_pfn); spin_unlock_irqrestore(&zone->lock, flags); seq_putc(m, '\n'); -- cgit v1.2.2 From bbdb396a60b2ebf7de3b717991e5d3e28c8b7bbd Mon Sep 17 00:00:00 2001 From: Martin Bligh Date: Sat, 28 Oct 2006 10:38:25 -0700 Subject: [PATCH] Use min of two prio settings in calculating distress for reclaim If try_to_free_pages / balance_pgdat are called with a gfp_mask specifying GFP_IO and/or GFP_FS, they will reclaim the requisite number of pages, and then reset prev_priority to DEF_PRIORITY (or to some other high (ie: non-urgent) value). However, another reclaimer without those gfp_mask flags set (say, GFP_NOIO) may still be struggling to reclaim pages. The concurrent overwrite of zone->prev_priority will cause this GFP_NOIO thread to unexpectedly cease deactivating mapped pages, thus causing reclaim difficulties. 
The fix is to key the distress calculation not off zone->prev_priority alone, but to also take the local caller's priority into account, using min(zone->prev_priority, priority). Signed-off-by: Martin J. Bligh Cc: Nick Piggin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index b32560ead5c0..518540a4a2a6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -760,7 +760,7 @@ static inline int zone_is_near_oom(struct zone *zone) * But we had to alter page->flags anyway. */ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc) + struct scan_control *sc, int priority) { unsigned long pgmoved; int pgdeactivate = 0; @@ -784,7 +784,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * `distress' is a measure of how much trouble we're having * reclaiming pages. 0 -> no problems. 100 -> great trouble. */ - distress = 100 >> zone->prev_priority; + distress = 100 >> min(zone->prev_priority, priority); /* * The point of this algorithm is to decide when to start @@ -936,7 +936,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone, nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); nr_active -= nr_to_scan; - shrink_active_list(nr_to_scan, zone, sc); + shrink_active_list(nr_to_scan, zone, sc, priority); } if (nr_inactive) { @@ -1384,7 +1384,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, if (zone->nr_scan_active >= nr_pages || pass > 3) { zone->nr_scan_active = 0; nr_to_scan = min(nr_pages, zone->nr_active); - shrink_active_list(nr_to_scan, zone, sc); + shrink_active_list(nr_to_scan, zone, sc, prio); } } -- cgit v1.2.2 From f2d0aa5bf8d4f7ae4cb1a7feebf5b1afddd0b9b0 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Sat, 28 Oct 2006 10:38:32 -0700 Subject: [PATCH] memory hotplug: __GFP_NOWARN is better for __kmalloc_section_memmap() Add the __GFP_NOWARN flag to the __alloc_pages() call in __kmalloc_section_memmap(). It reduces noisy failure messages. On ia64 the section size is 1 GB, which means an order-8 allocation is needed for each section's memmap. That is often a very hard requirement to satisfy under heavy memory pressure. So __alloc_pages() gives up and prints many noisy stack traces reporting that no pages were available for the sections. (My current environment shows the stack trace 32 times....) But __kmalloc_section_memmap() falls back to vmalloc() after that failure, and the memmap allocation can then succeed. So the stack trace warning is just noise and shouldn't be shown. 
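As a rough illustration of the pattern this change relies on (the helper name below is hypothetical, not the mm/sparse.c code; alloc_pages(), page_address(), get_order() and vmalloc() are real kernel interfaces), the high-order allocation is attempted quietly and vmalloc() is the expected fallback:

/*
 * Sketch only: try the high-order, physically contiguous allocation
 * without warning on failure, then fall back to vmalloc().
 */
static void *alloc_memmap_sketch(unsigned long memmap_size)
{
	struct page *page;

	/* __GFP_NOWARN suppresses the allocation-failure stack trace */
	page = alloc_pages(GFP_KERNEL | __GFP_NOWARN, get_order(memmap_size));
	if (page)
		return page_address(page);

	/* expected fallback under heavy memory pressure */
	return vmalloc(memmap_size);
}
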
Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 86c52ab80878..b3c82ba30012 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -211,7 +211,7 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages) struct page *page, *ret; unsigned long memmap_size = sizeof(struct page) * nr_pages; - page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); + page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); if (page) goto got_map_page; -- cgit v1.2.2 From 52fd24ca1db3a741f144bbc229beefe044202cac Mon Sep 17 00:00:00 2001 From: Giridhar Pemmasani Date: Sat, 28 Oct 2006 10:38:34 -0700 Subject: [PATCH] __vmalloc with GFP_ATOMIC causes 'sleeping from invalid context' If __vmalloc is called to allocate memory with GFP_ATOMIC in atomic context, the chain of calls results in __get_vm_area_node allocating memory for vm_struct with GFP_KERNEL, causing the 'sleeping from invalid context' warning. This patch fixes it by passing the gfp flags along so __get_vm_area_node allocates memory for vm_struct with the same flags. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1133dd3aafcf..6d381df7c9b3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -160,13 +160,15 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) return err; } -struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end, int node) +static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end, + int node, gfp_t gfp_mask) { struct vm_struct **p, *tmp, *area; unsigned long align = 1; unsigned long addr; + BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { int bit = fls(size); @@ -180,7 +182,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, addr = ALIGN(start, align); size = PAGE_ALIGN(size); - area = kmalloc_node(sizeof(*area), GFP_KERNEL, node); + area = kmalloc_node(sizeof(*area), gfp_mask, node); if (unlikely(!area)) return NULL; @@ -236,7 +238,7 @@ out: struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, flags, start, end, -1); + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); } /** @@ -253,9 +255,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node) +struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, + int node, gfp_t gfp_mask) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, + gfp_mask); } /* Caller must hold vmlist_lock */ @@ -487,7 +491,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, if (!size || (size >> PAGE_SHIFT) > num_physpages) return NULL; - area = get_vm_area_node(size, VM_ALLOC, node); + area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask); if (!area) return NULL; -- cgit v1.2.2 From ebed4bfc8da8df5b6b0bc4a5064a949f04683509 Mon Sep 17 00:00:00 2001 From: 
Hugh Dickins Date: Sat, 28 Oct 2006 10:38:43 -0700 Subject: [PATCH] hugetlb: fix absurd HugePages_Rsvd If you truncated an mmap'ed hugetlbfs file, then faulted on the truncated area, /proc/meminfo's HugePages_Rsvd wrapped hugely "negative". Reinstate my preliminary i_size check before attempting to allocate the page (though this only fixes the most obvious case: more work will be needed here). Signed-off-by: Hugh Dickins Cc: Adam Litke Cc: David Gibson Cc: "Chen, Kenneth W" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2dbec90dc3ba..a088f593a807 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -478,6 +478,9 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, retry: page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> HPAGE_SHIFT; + if (idx >= size) + goto out; if (hugetlb_get_quota(mapping)) goto out; page = alloc_huge_page(vma, address); -- cgit v1.2.2 From 0c6cb974636dd29681b03f8eb0ae227decab01fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sat, 28 Oct 2006 10:38:59 -0700 Subject: [PATCH] Calculation fix for memory holes beyond the end of physical memory absent_pages_in_range() made the assumption that users of the arch-independent zone-sizing API would not care about holes beyond the end of physical memory. This was not the case and was "fixed" in a patch called "Account for holes that are outside the range of physical memory". However, when given a range that started before a hole in "real" memory and ended beyond the end of memory, it would get the result wrong. The bug is in mainline but a patch is below. It has been tested successfully on a number of machines and architectures. Additional credit to Keith Mannthey for discovering the problem, helping identify the correct fix and confirming it Worked For Him. Signed-off-by: Mel Gorman Cc: keith mannthey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ecf853b5e30e..b55bb358b832 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2261,7 +2261,7 @@ unsigned long __init __absent_pages_in_range(int nid, /* Account for ranges past physical memory on this node */ if (range_end_pfn > prev_end_pfn) - hole_pages = range_end_pfn - + hole_pages += range_end_pfn - max(range_start_pfn, prev_end_pfn); return hole_pages; -- cgit v1.2.2 From 5211e6e6c671f0d4b1e1a1023384d20227d8ee65 Mon Sep 17 00:00:00 2001 From: Giridhar Pemmasani Date: Sun, 29 Oct 2006 04:46:55 -0800 Subject: [PATCH] Fix GFP_HIGHMEM slab panic As reported by Martin J. Bligh , we let through some non-slab bits to slab allocation through __get_vm_area_node when doing a vmalloc. I haven't been able to reproduce this, although I understand why it happens: vmalloc allocates memory with GFP_KERNEL | __GFP_HIGHMEM and commit 52fd24ca1db3a741f144bbc229beefe044202cac resulted in the same flags being passed down to cache_alloc_refill, causing the BUG. The following patch fixes it. Note that when calling kmalloc_node, I am masking off __GFP_HIGHMEM with GFP_LEVEL_MASK, whereas __vmalloc_area_node does the same with ~(__GFP_HIGHMEM | __GFP_ZERO). IMHO, using GFP_LEVEL_MASK is preferable, but either should fix this problem. Signed-off-by: Giridhar Pemmasani (pgiri@yahoo.com) Cc: Martin J. 
Bligh Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6d381df7c9b3..46606c133e82 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -182,7 +182,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl addr = ALIGN(start, align); size = PAGE_ALIGN(size); - area = kmalloc_node(sizeof(*area), gfp_mask, node); + area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); if (unlikely(!area)) return NULL; -- cgit v1.2.2 From 941c7105dc4f4961727acc518e18e00b9a03cbf3 Mon Sep 17 00:00:00 2001 From: nkalmala Date: Thu, 2 Nov 2006 22:07:04 -0800 Subject: [PATCH] mm: un-needed add-store operation wastes a few bytes Un-needed add-store operation wastes a few bytes. 8 bytes wasted with -O2, on a ppc. Signed-off-by: nkalmala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b55bb358b832..bf2f6cff1d6a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -853,7 +853,7 @@ again: pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); if (!pcp->count) { - pcp->count += rmqueue_bulk(zone, 0, + pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); if (unlikely(!pcp->count)) goto failed; -- cgit v1.2.2 From 029e332ea717810172e965ec50f942755ad0c58a Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 2 Nov 2006 22:07:06 -0800 Subject: [PATCH] Cleanup read_pages() Currently, read_pages() assumes that ->readpages() frees the passed pages. This patch frees any pages still left on pages_list in read_pages(), so ->readpages() implementations can simply ignore whatever remains in pages_list. Signed-off-by: OGAWA Hirofumi Cc: Steven French Cc: Miklos Szeredi Cc: Steven Whitehouse Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 1ba736ac0367..23cb61a01c6e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -173,6 +173,8 @@ static int read_pages(struct address_space *mapping, struct file *filp, if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + /* Clean up the remaining pages */ + put_pages_list(pages); goto out; } -- cgit v1.2.2 From 7f6b8876c7e66b0d15af134e2a5b87e55514eb6d Mon Sep 17 00:00:00 2001 From: Daniel Yeisley Date: Thu, 2 Nov 2006 22:07:14 -0800 Subject: [PATCH] init_reap_node() initialization fix It looks like there is a bug in init_reap_node() in slab.c that can cause multiple oopses on certain ES7000 configurations. The variable reap_node is defined per cpu, but only initialized on a single CPU. This causes an oops in next_reap_node() when __get_cpu_var(reap_node) returns the wrong value. Fix is below. 
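For illustration, a minimal sketch of the rule the fix applies (the names below are hypothetical, not the slab.c code): initialisation running on one CPU must write the target CPU's per-cpu copy explicitly rather than going through __get_cpu_var():

/*
 * Sketch only: per_cpu(var, cpu) addresses the copy belonging to 'cpu',
 * while __get_cpu_var(var) addresses the copy of the CPU executing the
 * code - the wrong one when a single CPU initialises all of them.
 */
static DEFINE_PER_CPU(int, demo_reap_node);

static void demo_init_reap_node(int cpu, int node)
{
	/*
	 * Wrong: only the initialising CPU's copy would ever be set up:
	 *	__get_cpu_var(demo_reap_node) = node;
	 */

	/* Right: set up the copy belonging to 'cpu' */
	per_cpu(demo_reap_node, cpu) = node;
}
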
Signed-off-by: Dan Yeisley Cc: Andi Kleen Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Manfred Spraul Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 84c631f30741..3c4a7e34eddc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -883,7 +883,7 @@ static void init_reap_node(int cpu) if (node == MAX_NUMNODES) node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; + per_cpu(reap_node, cpu) = node; } static void next_reap_node(void) -- cgit v1.2.2 From 8ce08464d2c749610a52c4d6c7c11080a7eaaef1 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 2 Nov 2006 22:07:28 -0800 Subject: [PATCH] Fix sys_move_pages when a NULL node list is passed sys_move_pages() uses vmalloc() to allocate an array of structures that it fills with information passed from user mode and then passes to do_pages_stat() (in the case where the node list is NULL). do_pages_stat() depends on a marker in the node field of the structure to decide how large the array is, and this marker is correctly inserted into the last element of the array. However, vmalloc() doesn't zero the memory it allocates, and if the user passes NULL for the node list, the node fields are not filled in (except for the end marker). If the memory vmalloc() returned happened to have a word with the marker value in just the right place, do_pages_stat() will fail to fill the status field of part of the array and we will return (random) kernel data to user mode. Signed-off-by: Stephen Rothwell Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index ba2453f9483d..b4979d423d2b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -952,7 +952,8 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, goto out; pm[i].node = node; - } + } else + pm[i].node = 0; /* anything to not match MAX_NUMNODES */ } /* End marker */ pm[nr_pages].node = MAX_NUMNODES; -- cgit v1.2.2 From 2b4ac44e7c7e16cf9411b81693ff3e604f332bf1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Nov 2006 12:27:48 -0800 Subject: [PATCH] vmalloc: optimization, cleanup, bugfixes - reorder 'struct vm_struct' to speed up lookups on CPUs with small cache lines. The fields 'next,addr,size' should now be in the same cache line, to speed up lookups. - One minor cleanup in __get_vm_area_node() - Bugfixes in vmalloc_user() and vmalloc_32_user(): NULL returns from __vmalloc() and __find_vm_area() were not tested. [akpm@osdl.org: remove redundant BUG_ONs] Signed-off-by: Eric Dumazet Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 46606c133e82..7dc6aa745166 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -186,10 +186,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl if (unlikely(!area)) return NULL; - if (unlikely(!size)) { - kfree (area); + if (unlikely(!size)) return NULL; - } /* * We always allocate a guard page. 
@@ -532,11 +530,12 @@ void *vmalloc_user(unsigned long size) void *ret; ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - write_lock(&vmlist_lock); - area = __find_vm_area(ret); - area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); - + if (ret) { + write_lock(&vmlist_lock); + area = __find_vm_area(ret); + area->flags |= VM_USERMAP; + write_unlock(&vmlist_lock); + } return ret; } EXPORT_SYMBOL(vmalloc_user); @@ -605,11 +604,12 @@ void *vmalloc_32_user(unsigned long size) void *ret; ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); - write_lock(&vmlist_lock); - area = __find_vm_area(ret); - area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); - + if (ret) { + write_lock(&vmlist_lock); + area = __find_vm_area(ret); + area->flags |= VM_USERMAP; + write_unlock(&vmlist_lock); + } return ret; } EXPORT_SYMBOL(vmalloc_32_user); -- cgit v1.2.2 From 68589bc353037f233fe510ad9ff432338c95db66 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 14 Nov 2006 02:03:32 -0800 Subject: [PATCH] hugetlb: prepare_hugepage_range check offset too (David:) If hugetlbfs_file_mmap() returns a failure to do_mmap_pgoff() - for example, because the given file offset is not hugepage aligned - then do_mmap_pgoff will go to the unmap_and_free_vma backout path. But at this stage the vma hasn't been marked as hugepage, and the backout path will call unmap_region() on it. That will eventually call down to the non-hugepage version of unmap_page_range(). On ppc64, at least, that will cause serious problems if there are any existing hugepage pagetable entries in the vicinity - for example if there are any other hugepage mappings under the same PUD. unmap_page_range() will trigger a bad_pud() on the hugepage pud entries. I suspect this will also cause bad problems on ia64, though I don't have a machine to test it on. (Hugh:) prepare_hugepage_range() should check file offset alignment when it checks virtual address and length, to stop MAP_FIXED with a bad huge offset from unmapping before it fails further down. PowerPC should apply the same prepare_hugepage_range alignment checks as ia64 and all the others do. Then none of the alignment checks in hugetlbfs_file_mmap are required (nor is the check for too small a mapping); but even so, move up setting of VM_HUGETLB and add a comment to warn of what David Gibson discovered - if hugetlbfs_file_mmap fails before setting it, do_mmap_pgoff's unmap_region when unwinding from error will go the non-huge way, which may cause bad behaviour on architectures (powerpc and ia64) which segregate their huge mappings into a separate region of the address space. Signed-off-by: Hugh Dickins Cc: "Luck, Tony" Cc: "David S. Miller" Acked-by: Adam Litke Acked-by: David Gibson Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 497e502dfd6b..bdace87d7c01 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1379,7 +1379,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, * Check if the given range is hugepage aligned, and * can be made suitable for hugepages. 
*/ - ret = prepare_hugepage_range(addr, len); + ret = prepare_hugepage_range(addr, len, pgoff); } else { /* * Ensure that a normal request is not falling in a -- cgit v1.2.2 From cb07c9a1864a8eac9f3123e428100d5b2a16e65a Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 14 Nov 2006 02:03:38 -0800 Subject: [PATCH] hugetlb: check for brk() entering a hugepage region Unlike mmap(), the codepath for brk() creates a vma without first checking that it doesn't touch a region exclusively reserved for hugepages. On powerpc, this can allow it to create a normal page vma in a hugepage region, causing oopses and other badness. Add a test to prevent this. With this patch, brk() will simply fail if it attempts to move the break into a hugepage reserved region. Signed-off-by: David Gibson Cc: Adam Litke Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bdace87d7c01..2526463c99a7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1880,6 +1880,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if ((addr + len) > TASK_SIZE || (addr + len) < addr) return -EINVAL; + error = is_hugepage_only_range(current->mm, addr, len); + if (error) + return error; + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = arch_mmap_check(addr, len, flags); -- cgit v1.2.2 From cd2579d7aa7bfc966cc271a88e77f8cfc3b0b7ba Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 14 Nov 2006 13:43:38 +0000 Subject: [PATCH] hugetlb: fix error return for brk() entering a hugepage region Commit cb07c9a1864a8eac9f3123e428100d5b2a16e65a returns the wrong value. is_hugepage_only_range() is a boolean, so we should return -EINVAL rather than 1. Also - we can use "mm" instead of looking up "current->mm" again. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 2526463c99a7..7b40abd7cba2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1880,9 +1880,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if ((addr + len) > TASK_SIZE || (addr + len) < addr) return -EINVAL; - error = is_hugepage_only_range(current->mm, addr, len); - if (error) - return error; + if (is_hugepage_only_range(mm, addr, len)) + return -EINVAL; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; -- cgit v1.2.2 From 31be8309532a6743f301cb2e83bd12ca07988b09 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 16 Nov 2006 01:19:29 -0800 Subject: [PATCH] Fix strange size check in __get_vm_area_node() Recently, __get_vm_area_node() was changed as follows: if (unlikely(!area)) return NULL; - if (unlikely(!size)) { - kfree (area); + if (unlikely(!size)) return NULL; - } This leaks `area', and the original code already looked strange. The patch below is probably what was intended. 
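To make the ordering point concrete before the actual diff below, here is a minimal sketch (hypothetical names, not the mm/vmalloc.c code) of why validating the size before allocating the descriptor avoids both the leak and the extra kfree():

/*
 * Sketch only: check the arguments before allocating, so the error path
 * never has anything to free.
 */
struct demo_area {
	unsigned long size;
};

static struct demo_area *demo_get_area(unsigned long size, gfp_t gfp_mask)
{
	struct demo_area *area;

	size = PAGE_ALIGN(size);
	if (unlikely(!size))	/* reject before kmalloc(): nothing to leak */
		return NULL;

	area = kmalloc(sizeof(*area), gfp_mask);
	if (unlikely(!area))
		return NULL;

	area->size = size;
	return area;
}
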
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7dc6aa745166..86897ee792d6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -181,14 +181,13 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl } addr = ALIGN(start, align); size = PAGE_ALIGN(size); + if (unlikely(!size)) + return NULL; area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); if (unlikely(!area)) return NULL; - if (unlikely(!size)) - return NULL; - /* * We always allocate a guard page. */ -- cgit v1.2.2