From 6292d9aaf3047f1abd970bc64ab6d952eda258ac Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Wed, 1 Feb 2006 03:04:44 -0800 Subject: [PATCH] __cpuinit functions wrongly marked __meminit __meminit has overzelously been modified and crept its way into marking cpuup callbacks as __meminit. Signed-off-by: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df54e2fc8ee0..44b4eb4202d9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1799,7 +1799,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif -static int __meminit zone_batchsize(struct zone *zone) +static int __cpuinit zone_batchsize(struct zone *zone) { int batch; @@ -1893,7 +1893,7 @@ static struct per_cpu_pageset * Dynamically allocate memory for the * per cpu pageset array in struct zone. */ -static int __meminit process_zones(int cpu) +static int __cpuinit process_zones(int cpu) { struct zone *zone, *dzone; @@ -1934,7 +1934,7 @@ static inline void free_zone_pagesets(int cpu) } } -static int __meminit pageset_cpuup_callback(struct notifier_block *nfb, +static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { -- cgit v1.2.2 From 8928862398fef04a137e5673ac5fa9e797960c87 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:25 -0800 Subject: [PATCH] Optimize off-node performance of zone reclaim Ensure that the performance of off node pages stays the same as before. Off node pagefault tests showed an 18% drop in performance without this patch. - Increase the timeout to 30 seconds to reduce the overhead. - Move all code possible out of the off node hot path for zone reclaim (Sorry Andrew, the struct initialization had to be sacrificed). The read_page_state() bit us there. - Check first for the timeout before any other checks. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2e34b61a70c7..465bfa54dfd6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1589,24 +1589,20 @@ int zone_reclaim_mode __read_mostly; /* * Mininum time between zone reclaim scans */ -#define ZONE_RECLAIM_INTERVAL HZ/2 +#define ZONE_RECLAIM_INTERVAL 30*HZ /* * Try to free up some pages from this zone through reclaim. 
*/ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - int nr_pages = 1 << order; + int nr_pages; struct task_struct *p = current; struct reclaim_state reclaim_state; - struct scan_control sc = { - .gfp_mask = gfp_mask, - .may_writepage = 0, - .may_swap = 0, - .nr_mapped = read_page_state(nr_mapped), - .nr_scanned = 0, - .nr_reclaimed = 0, - .priority = 0 - }; + struct scan_control sc; + + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + return 0; if (!(gfp_mask & __GFP_WAIT) || zone->zone_pgdat->node_id != numa_node_id() || @@ -1614,12 +1610,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) atomic_read(&zone->reclaim_in_progress) > 0) return 0; - if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) - return 0; + sc.may_writepage = 0; + sc.may_swap = 0; + sc.nr_scanned = 0; + sc.nr_reclaimed = 0; + sc.priority = 0; + sc.nr_mapped = read_page_state(nr_mapped); + sc.gfp_mask = gfp_mask; disable_swap_token(); + nr_pages = 1 << order; if (nr_pages > SWAP_CLUSTER_MAX) sc.swap_cluster_max = nr_pages; else -- cgit v1.2.2 From 42c722d4cb4022e56ff200f3f5a58c0dfd7edac6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:26 -0800 Subject: [PATCH] zone_reclaim: reclaim on memory only node support Zone reclaim is usually only run on the local node. Headless nodes do not have any local processors. This patch checks for headless nodes and performs zone reclaim on them. Signed-off-by: Christoph Lameter Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 465bfa54dfd6..0ca6007d655b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1599,17 +1599,23 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; struct scan_control sc; + cpumask_t mask; + int node_id; if (time_before(jiffies, zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) return 0; if (!(gfp_mask & __GFP_WAIT) || - zone->zone_pgdat->node_id != numa_node_id() || zone->all_unreclaimable || atomic_read(&zone->reclaim_in_progress) > 0) return 0; + node_id = zone->zone_pgdat->node_id; + mask = node_to_cpumask(node_id); + if (!cpus_empty(mask) && node_id != numa_node_id()) + return 0; + sc.may_writepage = 0; sc.may_swap = 0; sc.nr_scanned = 0; -- cgit v1.2.2 From 52a8363eae3872af15880292ff4e06d0fab36986 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:28 -0800 Subject: [PATCH] mm: improve function of sc->may_writepage Make sc->may_writepage control the writeout behavior of shrink_list. Remove the laptop_mode trick from shrink_list and instead set may_writepage in try_to_free_pages properly. 
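For illustration, a minimal user-space sketch of the resulting division of responsibility (the names below are simplified stand-ins for this example, not the kernel's own): the page scanner consults only sc->may_writepage, and each entry point decides once, from laptop_mode, whether writeout is allowed. Later in this series zone_reclaim() derives the same flag from zone_reclaim_mode instead.

    #include <stdio.h>

    static int laptop_mode = 1;             /* stand-in for the global knob */

    struct scan_control { int may_writepage; };

    /* shrink_list() analogue: no longer looks at laptop_mode directly */
    static int may_write_dirty_page(const struct scan_control *sc)
    {
            return sc->may_writepage;
    }

    int main(void)
    {
            /* try_to_free_pages()/balance_pgdat() analogue: policy set once */
            struct scan_control sc = { .may_writepage = !laptop_mode };

            printf("writeout allowed: %d\n", may_write_dirty_page(&sc));
            return 0;
    }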
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 0ca6007d655b..a29efb2c06c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -492,7 +492,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) goto keep_locked; if (!may_enter_fs) goto keep_locked; - if (laptop_mode && !sc->may_writepage) + if (!sc->may_writepage) goto keep_locked; /* Page is dirty, try to write it out here */ @@ -1170,7 +1170,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) int i; sc.gfp_mask = gfp_mask; - sc.may_writepage = 0; + sc.may_writepage = !laptop_mode; sc.may_swap = 1; inc_page_state(allocstall); @@ -1273,7 +1273,7 @@ loop_again: total_scanned = 0; total_reclaimed = 0; sc.gfp_mask = GFP_KERNEL; - sc.may_writepage = 0; + sc.may_writepage = !laptop_mode; sc.may_swap = 1; sc.nr_mapped = read_page_state(nr_mapped); -- cgit v1.2.2 From c84db23c6e587d3ab00a41c51fedf758e1f6ecd4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:29 -0800 Subject: [PATCH] zone_reclaim: minor fixes - If we only reclaim nr_pages then its okay to stay on node. Switch from > to >= for the comparison. - vm_table[] entry for zone_reclaim_mode is a bit screwed up. - Add empty lines around shrink_zone to show that this is the central function to be called. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index a29efb2c06c8..61ca0097c834 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1636,14 +1636,16 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; + shrink_zone(zone, &sc); + p->reclaim_state = NULL; current->flags &= ~PF_MEMALLOC; if (sc.nr_reclaimed == 0) zone->last_unsuccessful_zone_reclaim = jiffies; - return sc.nr_reclaimed > nr_pages; + return sc.nr_reclaimed >= nr_pages; } #endif -- cgit v1.2.2 From 9884fd8df195fe48d4e1be2279b419be96127cae Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Wed, 1 Feb 2006 03:05:30 -0800 Subject: [PATCH] Use 32 bit division in slab_put_obj() Improve the performance of slab_put_obj(). Without the cast, gcc considers ptrdiff_t a 64 bit signed integer and ends up emitting code to use a full signed 128 bit divide on EM64T, which is substantially slower than a 32 bit unsigned divide. I noticed this when looking at the profile of a case where the slab balance is just on edge and thrashes back and forth freeing a block. 
Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6f8495e2185b..88082ae15736 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1398,7 +1398,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) struct slab *slabp = page_get_slab(virt_to_page(objp)); int objnr; - objnr = (objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; if (objnr) { objp = slabp->s_mem + (objnr - 1) * cachep->objsize; realobj = (char *)objp + obj_dbghead(cachep); @@ -2341,7 +2341,7 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = caller; - objnr = (objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; BUG_ON(objnr >= cachep->num); BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); @@ -2699,7 +2699,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, slabp = page_get_slab(virt_to_page(objp)); l3 = cachep->nodelists[node]; list_del(&slabp->list); - objnr = (objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; check_spinlock_acquired_node(cachep, node); check_slabp(cachep, slabp); -- cgit v1.2.2 From aa3f18b3391ac305baa01faead3fdf9147daf54b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:32 -0800 Subject: [PATCH] zone_reclaim: do not unmap file backed pages zone_reclaim should leave that to the real swapper. We are only interested in evicting unmapped pages. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 61ca0097c834..8277f93148b5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -477,6 +477,12 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { + /* + * No unmapping if we do not swap + */ + if (!sc->may_swap) + goto keep_locked; + switch (try_to_unmap(page)) { case SWAP_FAIL: goto activate_locked; -- cgit v1.2.2 From a92f71263af9d0ab77c260f709c0c079656221aa Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:32 -0800 Subject: [PATCH] zone_reclaim: partial scans instead of full scan Instead of scanning all the pages in a zone, imitate real swap and scan only a portion of the pages and gradually scan more if we do not free up enough pages. This avoids a zone suddenly loosing all unused pagecache pages (we may after all access some of these again so they deserve another chance) but it still frees up large chunks of memory if a zone only contains unused pagecache pages. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8277f93148b5..f8b94ea6f722 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1596,6 +1596,14 @@ int zone_reclaim_mode __read_mostly; * Mininum time between zone reclaim scans */ #define ZONE_RECLAIM_INTERVAL 30*HZ + +/* + * Priority for ZONE_RECLAIM. This determines the fraction of pages + * of a node considered for each zone_reclaim. 4 scans 1/16th of + * a zone. 
+ */ +#define ZONE_RECLAIM_PRIORITY 4 + /* * Try to free up some pages from this zone through reclaim. */ @@ -1626,7 +1634,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) sc.may_swap = 0; sc.nr_scanned = 0; sc.nr_reclaimed = 0; - sc.priority = 0; + sc.priority = ZONE_RECLAIM_PRIORITY + 1; sc.nr_mapped = read_page_state(nr_mapped); sc.gfp_mask = gfp_mask; @@ -1643,7 +1651,15 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - shrink_zone(zone, &sc); + /* + * Free memory by calling shrink zone with increasing priorities + * until we have enough memory freed. + */ + do { + sc.priority--; + shrink_zone(zone, &sc); + + } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); p->reclaim_state = NULL; current->flags &= ~PF_MEMALLOC; -- cgit v1.2.2 From 2a11ff06d7d12be5d1bbcf592fff649b45ac2388 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:33 -0800 Subject: [PATCH] zone_reclaim: configurable off node allocation period. Currently the zone_reclaim code has a fixed window of 30 seconds of off node allocations should a local zone have no unused pagecache pages left. Reclaim will be attempted again after this timeout period to avoid repeated useless scans for memory. This is also useful to established sufficiently large off node allocation chunks to relieve the local node. It may be beneficial to adjust that time period for some special situations. For example if memory use was exceeding node capacity one may want to give up for longer periods of time. If memory spikes intermittendly then one may want to shorten the time period to reduce the number of off node allocations. This patch allows just that.... Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f8b94ea6f722..8760a4abfa1f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1595,7 +1595,7 @@ int zone_reclaim_mode __read_mostly; /* * Mininum time between zone reclaim scans */ -#define ZONE_RECLAIM_INTERVAL 30*HZ +int zone_reclaim_interval __read_mostly = 30*HZ; /* * Priority for ZONE_RECLAIM. This determines the fraction of pages @@ -1617,7 +1617,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int node_id; if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) return 0; if (!(gfp_mask & __GFP_WAIT) || -- cgit v1.2.2 From 1b2ffb7896ad46067f5b9ebf7de1891d74a4cdef Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:34 -0800 Subject: [PATCH] Zone reclaim: Allow modification of zone reclaim behavior In some situations one may want zone_reclaim to behave differently. For example a process writing large amounts of memory will spew unto other nodes to cache the writes if many pages in a zone become dirty. This may impact the performance of processes running on other nodes. Allowing writes during reclaim puts a stop to that behavior and throttles the process by restricting the pages to the local zone. Similarly one may want to contain processes to local memory by enabling regular swap behavior during zone_reclaim. Off node memory allocation can then be controlled through memory policies and cpusets. 
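For illustration, a standalone sketch of how the new bitmask is interpreted; the RECLAIM_* values and the two !! conversions mirror the hunk below, while the mode chosen in main() is just an example.

    #include <stdio.h>

    #define RECLAIM_OFF   0
    #define RECLAIM_ZONE  (1 << 0)   /* Run shrink_cache on the zone */
    #define RECLAIM_WRITE (1 << 1)   /* Writeout pages during reclaim */
    #define RECLAIM_SWAP  (1 << 2)   /* Swap pages out during reclaim */

    int main(void)
    {
            int zone_reclaim_mode = RECLAIM_ZONE | RECLAIM_WRITE;   /* example */

            int may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
            int may_swap      = !!(zone_reclaim_mode & RECLAIM_SWAP);

            printf("may_writepage=%d may_swap=%d\n", may_writepage, may_swap);
            return 0;
    }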
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8760a4abfa1f..9e2ef3624d77 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1592,6 +1592,11 @@ module_init(kswapd_init) */ int zone_reclaim_mode __read_mostly; +#define RECLAIM_OFF 0 +#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ + /* * Mininum time between zone reclaim scans */ @@ -1630,8 +1635,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) if (!cpus_empty(mask) && node_id != numa_node_id()) return 0; - sc.may_writepage = 0; - sc.may_swap = 0; + sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); + sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); sc.nr_scanned = 0; sc.nr_reclaimed = 0; sc.priority = ZONE_RECLAIM_PRIORITY + 1; -- cgit v1.2.2 From 2a16e3f4b0c408b9e50297d2ec27e295d490267a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:35 -0800 Subject: [PATCH] Reclaim slab during zone reclaim If large amounts of zone memory are used by empty slabs then zone_reclaim becomes uneffective. This patch shakes the slab a bit. The problem with this patch is that the slab reclaim is not containable to a zone. Thus slab reclaim may affect the whole system and be extremely slow. This also means that we cannot determine how many pages were freed in this zone. Thus we need to go off node for at least one allocation. The functionality is disabled by default. We could modify the shrinkers to take a zone parameter but that would be quite invasive. Better ideas are welcome. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 9e2ef3624d77..aa4b80dbe3ad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1596,6 +1596,7 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ /* * Mininum time between zone reclaim scans @@ -1666,6 +1667,19 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); + if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + /* + * shrink_slab does not currently allow us to determine + * how many pages were freed in the zone. So we just + * shake the slab and then go offnode for a single allocation. + * + * shrink_slab will free memory on all zones and may take + * a long time. + */ + shrink_slab(sc.nr_scanned, gfp_mask, order); + sc.nr_reclaimed = 1; /* Avoid getting the off node timeout */ + } + p->reclaim_state = NULL; current->flags &= ~PF_MEMALLOC; -- cgit v1.2.2 From b16664e44c54525be89dc07ad15a13b4eeec5634 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:36 -0800 Subject: [PATCH] Direct Migration V9: PageSwapCache checks Check for PageSwapCache after looking up and locking a swap page. The page migration code may change a swap pte to point to a different page under lock_page(). 
If that happens then the vm must retry the lookup operation in the swap space to find the correct page number. There are a couple of locations in the VM where a lock_page() is done on a swap page. In these locations we need to check afterwards if the page was migrated. If the page was migrated then the old page that was looked up before was freed and no longer has the PageSwapCache bit set. Signed-off-by: Hirokazu Takahashi Signed-off-by: Dave Hansen Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 +++++++ mm/shmem.c | 8 ++++++++ mm/swapfile.c | 7 +++++++ 3 files changed, 22 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7a11ddd5060f..2bee1f21aa8a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1871,6 +1871,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); +again: page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1894,6 +1895,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, mark_page_accessed(page); lock_page(page); + if (!PageSwapCache(page)) { + /* Page migration has occured */ + unlock_page(page); + page_cache_release(page); + goto again; + } /* * Back out if somebody else already faulted in this pte. diff --git a/mm/shmem.c b/mm/shmem.c index ce501bce1c2e..f7ac7b812f92 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1028,6 +1028,14 @@ repeat: page_cache_release(swappage); goto repeat; } + if (!PageSwapCache(swappage)) { + /* Page migration has occured */ + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + unlock_page(swappage); + page_cache_release(swappage); + goto repeat; + } if (PageWriteback(swappage)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index f1e69c30d203..9678182e0eef 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -646,6 +646,7 @@ static int try_to_unuse(unsigned int type) */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); +again: page = read_swap_cache_async(entry, NULL, 0); if (!page) { /* @@ -680,6 +681,12 @@ static int try_to_unuse(unsigned int type) wait_on_page_locked(page); wait_on_page_writeback(page); lock_page(page); + if (!PageSwapCache(page)) { + /* Page migration has occured */ + unlock_page(page); + page_cache_release(page); + goto again; + } wait_on_page_writeback(page); /* -- cgit v1.2.2 From a48d07afdf18212de22b959715b16793c5a6e57a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:38 -0800 Subject: [PATCH] Direct Migration V9: migrate_pages() extension Add direct migration support with fall back to swap. Direct migration support on top of the swap based page migration facility. This allows the direct migration of anonymous pages and the migration of file backed pages by dropping the associated buffers (requires writeout). Fall back to swap out if necessary. The patch is based on lots of patches from the hotplug project but the code was restructured, documented and simplified as much as possible. Note that an additional patch that defines the migrate_page() method for filesystems is necessary in order to avoid writeback for anonymous and file backed pages. 
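For illustration, a condensed sketch of how a caller drives the extended interface; it follows the shape of migrate_pages_to() in the mempolicy patch later in this series and assumes that kernel context ('pagelist' and 'dest' are provided by the caller; chunking and error handling are omitted).

    LIST_HEAD(newlist);
    LIST_HEAD(moved);
    LIST_HEAD(failed);
    struct page *page;

    /* one target page per source page, allocated on the destination node */
    list_for_each_entry(page, pagelist, lru) {
            struct page *newpage = alloc_pages_node(dest, GFP_HIGHUSER, 0);

            if (!newpage)
                    break;
            list_add(&newpage->lru, &newlist);
    }

    /*
     * Direct migration into 'newlist' where possible, fallback to swap
     * otherwise; pages that could not be moved end up on 'failed'.
     */
    migrate_pages(pagelist, &newlist, &moved, &failed);

    putback_lru_pages(&moved);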
Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Mike Kravetz Signed-off-by: Christoph Lameter Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 21 +++--- mm/vmscan.c | 226 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 227 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index d85a99d28c03..13fad5fcdf79 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -52,6 +52,7 @@ #include #include #include +#include #include @@ -541,7 +542,8 @@ void page_remove_rmap(struct page *page) * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + int ignore_refs) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -564,7 +566,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) * skipped over this mm) then we should reactivate it. */ if ((vma->vm_flags & VM_LOCKED) || - ptep_clear_flush_young(vma, address, pte)) { + (ptep_clear_flush_young(vma, address, pte) + && !ignore_refs)) { ret = SWAP_FAIL; goto out_unmap; } @@ -698,7 +701,7 @@ static void try_to_unmap_cluster(unsigned long cursor, pte_unmap_unlock(pte - 1, ptl); } -static int try_to_unmap_anon(struct page *page) +static int try_to_unmap_anon(struct page *page, int ignore_refs) { struct anon_vma *anon_vma; struct vm_area_struct *vma; @@ -709,7 +712,7 @@ static int try_to_unmap_anon(struct page *page) return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - ret = try_to_unmap_one(page, vma); + ret = try_to_unmap_one(page, vma, ignore_refs); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -726,7 +729,7 @@ static int try_to_unmap_anon(struct page *page) * * This function is only called from try_to_unmap for object-based pages. 
*/ -static int try_to_unmap_file(struct page *page) +static int try_to_unmap_file(struct page *page, int ignore_refs) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -740,7 +743,7 @@ static int try_to_unmap_file(struct page *page) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma); + ret = try_to_unmap_one(page, vma, ignore_refs); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -825,16 +828,16 @@ out: * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable */ -int try_to_unmap(struct page *page) +int try_to_unmap(struct page *page, int ignore_refs) { int ret; BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page); + ret = try_to_unmap_anon(page, ignore_refs); else - ret = try_to_unmap_file(page); + ret = try_to_unmap_file(page, ignore_refs); if (!page_mapped(page)) ret = SWAP_SUCCESS; diff --git a/mm/vmscan.c b/mm/vmscan.c index aa4b80dbe3ad..8f326ce2b690 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -483,7 +483,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) if (!sc->may_swap) goto keep_locked; - switch (try_to_unmap(page)) { + switch (try_to_unmap(page, 0)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -623,7 +623,7 @@ static int swap_page(struct page *page) struct address_space *mapping = page_mapping(page); if (page_mapped(page) && mapping) - if (try_to_unmap(page) != SWAP_SUCCESS) + if (try_to_unmap(page, 0) != SWAP_SUCCESS) goto unlock_retry; if (PageDirty(page)) { @@ -659,6 +659,154 @@ unlock_retry: retry: return -EAGAIN; } + +/* + * Page migration was first developed in the context of the memory hotplug + * project. The main authors of the migration code are: + * + * IWAMOTO Toshihiro + * Hirokazu Takahashi + * Dave Hansen + * Christoph Lameter + */ + +/* + * Remove references for a page and establish the new page with the correct + * basic settings to be able to stop accesses to the page. + */ +static int migrate_page_remove_references(struct page *newpage, + struct page *page, int nr_refs) +{ + struct address_space *mapping = page_mapping(page); + struct page **radix_pointer; + + /* + * Avoid doing any of the following work if the page count + * indicates that the page is in use or truncate has removed + * the page. + */ + if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + return 1; + + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + try_to_unmap(page, 1); + + /* + * Give up if we were unable to remove all mappings. 
+ */ + if (page_mapcount(page)) + return 1; + + write_lock_irq(&mapping->tree_lock); + + radix_pointer = (struct page **)radix_tree_lookup_slot( + &mapping->page_tree, + page_index(page)); + + if (!page_mapping(page) || page_count(page) != nr_refs || + *radix_pointer != page) { + write_unlock_irq(&mapping->tree_lock); + return 1; + } + + /* + * Now we know that no one else is looking at the page. + * + * Certain minimal information about a page must be available + * in order for other subsystems to properly handle the page if they + * find it through the radix tree update before we are finished + * copying the page. + */ + get_page(newpage); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + + *radix_pointer = newpage; + __put_page(page); + write_unlock_irq(&mapping->tree_lock); + + return 0; +} + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageActive(page)) + SetPageActive(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + + ClearPageSwapCache(page); + ClearPageActive(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page->mapping = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct page *newpage, struct page *page) +{ + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + if (migrate_page_remove_references(newpage, page, 2)) + return -EAGAIN; + + migrate_page_copy(newpage, page); + + return 0; +} + /* * migrate_pages * @@ -672,11 +820,6 @@ retry: * are movable anymore because t has become empty * or no retryable pages exist anymore. * - * SIMPLIFIED VERSION: This implementation of migrate_pages - * is only swapping out pages and never touches the second - * list. The direct migration patchset - * extends this function to avoid the use of swap. - * * Return: Number of pages not migrated when "to" ran empty. */ int migrate_pages(struct list_head *from, struct list_head *to, @@ -697,6 +840,9 @@ redo: retry = 0; list_for_each_entry_safe(page, page2, from, lru) { + struct page *newpage = NULL; + struct address_space *mapping; + cond_resched(); rc = 0; @@ -704,6 +850,9 @@ redo: /* page was freed from under us. So we are done. */ goto next; + if (to && list_empty(to)) + break; + /* * Skip locked pages during the first two passes to give the * functions holding the lock time to release the page. Later we @@ -740,12 +889,64 @@ redo: } } + if (!to) { + rc = swap_page(page); + goto next; + } + + newpage = lru_to_page(to); + lock_page(newpage); + /* - * Page is properly locked and writeback is complete. + * Pages are properly locked and writeback is complete. * Try to migrate the page. 
*/ - rc = swap_page(page); - goto next; + mapping = page_mapping(page); + if (!mapping) + goto unlock_both; + + /* + * Trigger writeout if page is dirty + */ + if (PageDirty(page)) { + switch (pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_both; + + case PAGE_SUCCESS: + unlock_page(newpage); + goto next; + + case PAGE_CLEAN: + ; /* try to migrate the page below */ + } + } + /* + * If we have no buffer or can release the buffer + * then do a simple migration. + */ + if (!page_has_buffers(page) || + try_to_release_page(page, GFP_KERNEL)) { + rc = migrate_page(newpage, page); + goto unlock_both; + } + + /* + * On early passes with mapped pages simply + * retry. There may be a lock held for some + * buffers that may go away. Later + * swap them out. + */ + if (pass > 4) { + unlock_page(newpage); + newpage = NULL; + rc = swap_page(page); + goto next; + } + +unlock_both: + unlock_page(newpage); unlock_page: unlock_page(page); @@ -758,7 +959,10 @@ next: list_move(&page->lru, failed); nr_failed++; } else { - /* Success */ + if (newpage) { + /* Successful migration. Return page to LRU */ + move_to_lru(newpage); + } list_move(&page->lru, moved); } } -- cgit v1.2.2 From a3351e525e4768c29aa5d22ef59b5b38e0361e53 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:39 -0800 Subject: [PATCH] Direct Migration V9: remove_from_swap() to remove swap ptes Add remove_from_swap remove_from_swap() allows the restoration of the pte entries that existed before page migration occurred for anonymous pages by walking the reverse maps. This reduces swap use and establishes regular pte's without the need for page faults. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 29 +++++++++++++++++++++++++++++ mm/swapfile.c | 9 +++++++++ mm/vmscan.c | 9 +++++++++ 3 files changed, 47 insertions(+) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 13fad5fcdf79..f4b91d7aa5cf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -206,6 +206,35 @@ out: return anon_vma; } +#ifdef CONFIG_MIGRATION +/* + * Remove an anonymous page from swap replacing the swap pte's + * through real pte's pointing to valid pages and then releasing + * the page from the swap cache. + * + * Must hold page lock on page. + */ +void remove_from_swap(struct page *page) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + + if (!PageAnon(page) || !PageSwapCache(page)) + return; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + remove_vma_swap(vma, page); + + spin_unlock(&anon_vma->lock); + + delete_from_swap_cache(page); +} +#endif + /* * At what user virtual address is page expected in vma? */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 9678182e0eef..1f9cf0d073b8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -554,6 +554,15 @@ static int unuse_mm(struct mm_struct *mm, return 0; } +#ifdef CONFIG_MIGRATION +int remove_vma_swap(struct vm_area_struct *vma, struct page *page) +{ + swp_entry_t entry = { .val = page_private(page) }; + + return unuse_vma(vma, entry, page); +} +#endif + /* * Scan swap_map from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. 
diff --git a/mm/vmscan.c b/mm/vmscan.c index 8f326ce2b690..5e98b86feb74 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -804,6 +804,15 @@ int migrate_page(struct page *newpage, struct page *page) migrate_page_copy(newpage, page); + /* + * Remove auxiliary swap entries and replace + * them with real ptes. + * + * Note that a real pte entry will allow processes that are not + * waiting on the page lock to use the new page via the page tables + * before the new page is unlocked. + */ + remove_from_swap(newpage); return 0; } -- cgit v1.2.2 From 7e2ab150d1b3b286a4c864c60a549b2601777b63 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:40 -0800 Subject: [PATCH] Direct Migration V9: upgrade MPOL_MF_MOVE and sys_migrate_pages() Modify policy layer to support direct page migration - Add migrate_pages_to() allowing the migration of a list of pages to a a specified node or to vma with a specific allocation policy in sets of MIGRATE_CHUNK_SIZE pages - Modify do_migrate_pages() to do a staged move of pages from the source nodes to the target nodes. Signed-off-by: Paul Jackson Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 146 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 73790188b0eb..27da6d5c77ba 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -95,6 +95,9 @@ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ +/* The number of pages to migrate per call to migrate_pages() */ +#define MIGRATE_CHUNK_SIZE 256 + static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -543,24 +546,91 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, } } -static int swap_pages(struct list_head *pagelist) +/* + * Migrate the list 'pagelist' of pages to a certain destination. + * + * Specify destination with either non-NULL vma or dest_node >= 0 + * Return the number of pages not migrated or error code + */ +static int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest) { + LIST_HEAD(newlist); LIST_HEAD(moved); LIST_HEAD(failed); - int n; + int err = 0; + int nr_pages; + struct page *page; + struct list_head *p; - n = migrate_pages(pagelist, NULL, &moved, &failed); - putback_lru_pages(&failed); - putback_lru_pages(&moved); +redo: + nr_pages = 0; + list_for_each(p, pagelist) { + if (vma) + page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start); + else + page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - return n; + if (!page) { + err = -ENOMEM; + goto out; + } + list_add(&page->lru, &newlist); + nr_pages++; + if (nr_pages > MIGRATE_CHUNK_SIZE); + break; + } + err = migrate_pages(pagelist, &newlist, &moved, &failed); + + putback_lru_pages(&moved); /* Call release pages instead ?? */ + + if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) + goto redo; +out: + /* Return leftover allocated pages */ + while (!list_empty(&newlist)) { + page = list_entry(newlist.next, struct page, lru); + list_del(&page->lru); + __free_page(page); + } + list_splice(&failed, pagelist); + if (err < 0) + return err; + + /* Calculate number of leftover pages */ + nr_pages = 0; + list_for_each(p, pagelist) + nr_pages++; + return nr_pages; +} + +/* + * Migrate pages from one node to a target node. + * Returns error or the number of pages not migrated. 
+ */ +int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) +{ + nodemask_t nmask; + LIST_HEAD(pagelist); + int err = 0; + + nodes_clear(nmask); + node_set(source, nmask); + + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + + if (!list_empty(&pagelist)) { + err = migrate_pages_to(&pagelist, NULL, dest); + if (!list_empty(&pagelist)) + putback_lru_pages(&pagelist); + } + return err; } /* - * For now migrate_pages simply swaps out the pages from nodes that are in - * the source set but not in the target set. In the future, we would - * want a function that moves pages between the two nodesets in such - * a way as to preserve the physical layout as much as possible. + * Move pages between the two nodesets so as to preserve the physical + * layout as much as possible. * * Returns the number of page that could not be moved. */ @@ -568,22 +638,76 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) { LIST_HEAD(pagelist); - int count = 0; - nodemask_t nodes; + int busy = 0; + int err = 0; + nodemask_t tmp; - nodes_andnot(nodes, *from_nodes, *to_nodes); + down_read(&mm->mmap_sem); - down_read(&mm->mmap_sem); - check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, - flags | MPOL_MF_DISCONTIG_OK, &pagelist); +/* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ - if (!list_empty(&pagelist)) { - count = swap_pages(&pagelist); - putback_lru_pages(&pagelist); + tmp = *from_nodes; + while (!nodes_empty(tmp)) { + int s,d; + int source = -1; + int dest = 0; + + for_each_node_mask(s, tmp) { + d = node_remap(s, *from_nodes, *to_nodes); + if (s == d) + continue; + + source = s; /* Node moved. Memorize */ + dest = d; + + /* dest not in remaining from nodes? 
*/ + if (!node_isset(dest, tmp)) + break; + } + if (source == -1) + break; + + node_clear(source, tmp); + err = migrate_to_node(mm, source, dest, flags); + if (err > 0) + busy += err; + if (err < 0) + break; } up_read(&mm->mmap_sem); - return count; + if (err < 0) + return err; + return busy; } long do_mbind(unsigned long start, unsigned long len, @@ -643,8 +767,9 @@ long do_mbind(unsigned long start, unsigned long len, int nr_failed = 0; err = mbind_range(vma, start, end, new); + if (!list_empty(&pagelist)) - nr_failed = swap_pages(&pagelist); + nr_failed = migrate_pages_to(&pagelist, vma, -1); if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; -- cgit v1.2.2 From e965f9630c651fa4249039fd4b80c9392d07a856 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:41 -0800 Subject: [PATCH] Direct Migration V9: Avoid writeback / page_migrate() method Migrate a page with buffers without requiring writeback This introduces a new address space operation migratepage() that may be used by a filesystem to implement its own version of page migration. A version is provided that migrates buffers attached to pages. Some filesystems (ext2, ext3, xfs) are modified to utilize this feature. The swapper address space operation are modified so that a regular migrate_page() will occur for anonymous pages without writeback (migrate_pages forces every anonymous page to have a swap entry). Signed-off-by: Mike Kravetz Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 1 + mm/swap_state.c | 1 + mm/vmscan.c | 20 +++++++++++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index f4b91d7aa5cf..df2c41c2a9a2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -233,6 +233,7 @@ void remove_from_swap(struct page *page) delete_from_swap_cache(page); } +EXPORT_SYMBOL(remove_from_swap); #endif /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 7b09ac503fec..db8a3d3e1636 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -27,6 +27,7 @@ static struct address_space_operations swap_aops = { .writepage = swap_writepage, .sync_page = block_sync_page, .set_page_dirty = __set_page_dirty_nobuffers, + .migratepage = migrate_page, }; static struct backing_dev_info swap_backing_dev_info = { diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e98b86feb74..5a610804cd06 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -614,6 +614,15 @@ int putback_lru_pages(struct list_head *l) return count; } +/* + * Non migratable page + */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + /* * swapout a single page * page is locked upon entry, unlocked on exit @@ -659,6 +668,7 @@ unlock_retry: retry: return -EAGAIN; } +EXPORT_SYMBOL(swap_page); /* * Page migration was first developed in the context of the memory hotplug @@ -674,7 +684,7 @@ retry: * Remove references for a page and establish the new page with the correct * basic settings to be able to stop accesses to the page. 
*/ -static int migrate_page_remove_references(struct page *newpage, +int migrate_page_remove_references(struct page *newpage, struct page *page, int nr_refs) { struct address_space *mapping = page_mapping(page); @@ -749,6 +759,7 @@ static int migrate_page_remove_references(struct page *newpage, return 0; } +EXPORT_SYMBOL(migrate_page_remove_references); /* * Copy the page to its new location @@ -788,6 +799,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageWriteback(newpage)) end_page_writeback(newpage); } +EXPORT_SYMBOL(migrate_page_copy); /* * Common logic to directly migrate a single page suitable for @@ -815,6 +827,7 @@ int migrate_page(struct page *newpage, struct page *page) remove_from_swap(newpage); return 0; } +EXPORT_SYMBOL(migrate_page); /* * migrate_pages @@ -914,6 +927,11 @@ redo: if (!mapping) goto unlock_both; + if (mapping->a_ops->migratepage) { + rc = mapping->a_ops->migratepage(newpage, page); + goto unlock_both; + } + /* * Trigger writeout if page is dirty */ -- cgit v1.2.2 From 3dafccf22751429e69b6266636cf3acf45b48075 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 1 Feb 2006 03:05:42 -0800 Subject: [PATCH] slab: distinguish between object and buffer size An object cache has two different object lengths: - the amount of memory available for the user (object size) - the amount of memory allocated internally (buffer size) This patch does some renames to make the code reflect that better. Signed-off-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 154 ++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 80 insertions(+), 74 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 88082ae15736..1a014aaf4491 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -375,7 +375,7 @@ struct kmem_cache { unsigned int batchcount; unsigned int limit; unsigned int shared; - unsigned int objsize; + unsigned int buffer_size; /* 2) touched by every alloc & free from the backend */ struct kmem_list3 *nodelists[MAX_NUMNODES]; unsigned int flags; /* constant flags */ @@ -423,8 +423,14 @@ struct kmem_cache { atomic_t freemiss; #endif #if DEBUG - int dbghead; - int reallen; + /* + * If debugging is enabled, then the allocator can add additional + * fields and/or padding to every object. buffer_size contains the total + * object size including these internal fields, the following two + * variables contain the offset to the user object and its size. + */ + int obj_offset; + int obj_size; #endif }; @@ -495,50 +501,50 @@ struct kmem_cache { /* memory layout of objects: * 0 : objp - * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that + * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that * the end of an object is aligned with the end of the real * allocation. Catches writes behind the end of the allocation. - * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1: + * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: * redzone word. - * cachep->dbghead: The real object. - * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] + * cachep->obj_offset: The real object. 
+ * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] + * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] */ -static int obj_dbghead(kmem_cache_t *cachep) +static int obj_offset(kmem_cache_t *cachep) { - return cachep->dbghead; + return cachep->obj_offset; } -static int obj_reallen(kmem_cache_t *cachep) +static int obj_size(kmem_cache_t *cachep) { - return cachep->reallen; + return cachep->obj_size; } static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD); + return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); } static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) - return (unsigned long *)(objp + cachep->objsize - + return (unsigned long *)(objp + cachep->buffer_size - 2 * BYTES_PER_WORD); - return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD); + return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); } static void **dbg_userword(kmem_cache_t *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_STORE_USER)); - return (void **)(objp + cachep->objsize - BYTES_PER_WORD); + return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); } #else -#define obj_dbghead(x) 0 -#define obj_reallen(cachep) (cachep->objsize) +#define obj_offset(x) 0 +#define obj_size(cachep) (cachep->buffer_size) #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) @@ -623,12 +629,12 @@ static kmem_cache_t cache_cache = { .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, - .objsize = sizeof(kmem_cache_t), + .buffer_size = sizeof(kmem_cache_t), .flags = SLAB_NO_REAP, .spinlock = SPIN_LOCK_UNLOCKED, .name = "kmem_cache", #if DEBUG - .reallen = sizeof(kmem_cache_t), + .obj_size = sizeof(kmem_cache_t), #endif }; @@ -1057,9 +1063,9 @@ void __init kmem_cache_init(void) cache_cache.array[smp_processor_id()] = &initarray_cache.cache; cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; - cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); + cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); - cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, + cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0, &left_over, &cache_cache.num); if (!cache_cache.num) BUG(); @@ -1274,9 +1280,9 @@ static void kmem_rcu_free(struct rcu_head *head) static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, unsigned long caller) { - int size = obj_reallen(cachep); + int size = obj_size(cachep); - addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)]; + addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; if (size < 5 * sizeof(unsigned long)) return; @@ -1306,8 +1312,8 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) { - int size = obj_reallen(cachep); - addr = &((char *)addr)[obj_dbghead(cachep)]; + int size = obj_size(cachep); + addr = &((char *)addr)[obj_offset(cachep)]; memset(addr, val, size); *(unsigned char *)(addr + size - 1) = POISON_END; @@ -1344,8 +1350,8 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) (unsigned 
long)*dbg_userword(cachep, objp)); printk("\n"); } - realobj = (char *)objp + obj_dbghead(cachep); - size = obj_reallen(cachep); + realobj = (char *)objp + obj_offset(cachep); + size = obj_size(cachep); for (i = 0; i < size && lines; i += 16, lines--) { int limit; limit = 16; @@ -1361,8 +1367,8 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) int size, i; int lines = 0; - realobj = (char *)objp + obj_dbghead(cachep); - size = obj_reallen(cachep); + realobj = (char *)objp + obj_offset(cachep); + size = obj_size(cachep); for (i = 0; i < size; i++) { char exp = POISON_FREE; @@ -1398,17 +1404,17 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) struct slab *slabp = page_get_slab(virt_to_page(objp)); int objnr; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; if (objnr) { - objp = slabp->s_mem + (objnr - 1) * cachep->objsize; - realobj = (char *)objp + obj_dbghead(cachep); + objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; + realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = slabp->s_mem + (objnr + 1) * cachep->objsize; - realobj = (char *)objp + obj_dbghead(cachep); + objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; + realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); @@ -1428,14 +1434,14 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) #if DEBUG int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->objsize * i; + void *objp = slabp->s_mem + cachep->buffer_size * i; if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->objsize % PAGE_SIZE) == 0 + if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->objsize / PAGE_SIZE, + cachep->buffer_size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); @@ -1452,13 +1458,13 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) "was overwritten"); } if (cachep->dtor && !(cachep->flags & SLAB_POISON)) - (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0); + (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); } #else if (cachep->dtor) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->objsize * i; + void *objp = slabp->s_mem + cachep->buffer_size * i; (cachep->dtor) (objp, cachep, 0); } } @@ -1478,7 +1484,7 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) } } -/* For setting up all the kmem_list3s for cache whose objsize is same +/* For setting up all the kmem_list3s for cache whose buffer_size is same as size of kmem_list3. 
*/ static inline void set_up_list3s(kmem_cache_t *cachep, int index) { @@ -1611,7 +1617,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, set_fs(old_fs); if (res) { printk("SLAB: cache with size %d has lost its name\n", - pc->objsize); + pc->buffer_size); continue; } @@ -1702,14 +1708,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, memset(cachep, 0, sizeof(kmem_cache_t)); #if DEBUG - cachep->reallen = size; + cachep->obj_size = size; if (flags & SLAB_RED_ZONE) { /* redzoning only works with word aligned caches */ align = BYTES_PER_WORD; /* add space for red zone words */ - cachep->dbghead += BYTES_PER_WORD; + cachep->obj_offset += BYTES_PER_WORD; size += 2 * BYTES_PER_WORD; } if (flags & SLAB_STORE_USER) { @@ -1722,8 +1728,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { - cachep->dbghead += PAGE_SIZE - size; + && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - size; size = PAGE_SIZE; } #endif @@ -1786,7 +1792,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (flags & SLAB_CACHE_DMA) cachep->gfpflags |= GFP_DMA; spin_lock_init(&cachep->spinlock); - cachep->objsize = size; + cachep->buffer_size = size; if (flags & CFLGS_OFF_SLAB) cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); @@ -2118,7 +2124,7 @@ static void cache_init_objs(kmem_cache_t *cachep, int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->objsize * i; + void *objp = slabp->s_mem + cachep->buffer_size * i; #if DEBUG /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) @@ -2136,7 +2142,7 @@ static void cache_init_objs(kmem_cache_t *cachep, * Otherwise, deadlock. They must also be threaded. */ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(objp + obj_dbghead(cachep), cachep, + cachep->ctor(objp + obj_offset(cachep), cachep, ctor_flags); if (cachep->flags & SLAB_RED_ZONE) { @@ -2147,10 +2153,10 @@ static void cache_init_objs(kmem_cache_t *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) + if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) kernel_map_pages(virt_to_page(objp), - cachep->objsize / PAGE_SIZE, 0); + cachep->buffer_size / PAGE_SIZE, 0); #else if (cachep->ctor) cachep->ctor(objp, cachep, ctor_flags); @@ -2309,7 +2315,7 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, unsigned int objnr; struct slab *slabp; - objp -= obj_dbghead(cachep); + objp -= obj_offset(cachep); kfree_debugcheck(objp); page = virt_to_page(objp); @@ -2341,31 +2347,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = caller; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; BUG_ON(objnr >= cachep->num); - BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); + BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); if (cachep->flags & SLAB_DEBUG_INITIAL) { /* Need to call the slab's constructor so the * caller can perform a verify of its state (debugging). * Called without the cache-lock held. 
*/ - cachep->ctor(objp + obj_dbghead(cachep), + cachep->ctor(objp + obj_offset(cachep), cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); } if (cachep->flags & SLAB_POISON && cachep->dtor) { /* we want to cache poison the object, * call the destruction callback */ - cachep->dtor(objp + obj_dbghead(cachep), cachep, 0); + cachep->dtor(objp + obj_offset(cachep), cachep, 0); } if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { + if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, (unsigned long)caller); kernel_map_pages(virt_to_page(objp), - cachep->objsize / PAGE_SIZE, 0); + cachep->buffer_size / PAGE_SIZE, 0); } else { poison_obj(cachep, objp, POISON_FREE); } @@ -2468,7 +2474,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) /* get obj pointer */ ac->entry[ac->avail++] = slabp->s_mem + - slabp->free * cachep->objsize; + slabp->free * cachep->buffer_size; slabp->inuse++; next = slab_bufctl(slabp)[slabp->free]; @@ -2526,9 +2532,9 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, return objp; if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->objsize / PAGE_SIZE, 1); + cachep->buffer_size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -2553,7 +2559,7 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } - objp += obj_dbghead(cachep); + objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) { unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; @@ -2648,7 +2654,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) BUG_ON(slabp->inuse == cachep->num); /* get obj pointer */ - obj = slabp->s_mem + slabp->free * cachep->objsize; + obj = slabp->s_mem + slabp->free * cachep->buffer_size; slabp->inuse++; next = slab_bufctl(slabp)[slabp->free]; #if DEBUG @@ -2699,7 +2705,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, slabp = page_get_slab(virt_to_page(objp)); l3 = cachep->nodelists[node]; list_del(&slabp->list); - objnr = (unsigned)(objp - slabp->s_mem) / cachep->objsize; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; check_spinlock_acquired_node(cachep, node); check_slabp(cachep, slabp); @@ -2881,7 +2887,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) unsigned long addr = (unsigned long)ptr; unsigned long min_addr = PAGE_OFFSET; unsigned long align_mask = BYTES_PER_WORD - 1; - unsigned long size = cachep->objsize; + unsigned long size = cachep->buffer_size; struct page *page; if (unlikely(addr < min_addr)) @@ -3083,7 +3089,7 @@ void kfree(const void *objp) local_irq_save(flags); kfree_debugcheck(objp); c = page_get_cache(virt_to_page(objp)); - mutex_debug_check_no_locks_freed(objp, obj_reallen(c)); + mutex_debug_check_no_locks_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); } @@ -3114,7 +3120,7 @@ EXPORT_SYMBOL(free_percpu); unsigned int kmem_cache_size(kmem_cache_t *cachep) { - return obj_reallen(cachep); + return obj_size(cachep); } EXPORT_SYMBOL(kmem_cache_size); @@ -3258,13 +3264,13 @@ static void enable_cpucache(kmem_cache_t *cachep) * The numbers are guessed, 
we should auto-tune as described by * Bonwick. */ - if (cachep->objsize > 131072) + if (cachep->buffer_size > 131072) limit = 1; - else if (cachep->objsize > PAGE_SIZE) + else if (cachep->buffer_size > PAGE_SIZE) limit = 8; - else if (cachep->objsize > 1024) + else if (cachep->buffer_size > 1024) limit = 24; - else if (cachep->objsize > 256) + else if (cachep->buffer_size > 256) limit = 54; else limit = 120; @@ -3279,7 +3285,7 @@ static void enable_cpucache(kmem_cache_t *cachep) */ shared = 0; #ifdef CONFIG_SMP - if (cachep->objsize <= PAGE_SIZE) + if (cachep->buffer_size <= PAGE_SIZE) shared = 8; #endif @@ -3528,7 +3534,7 @@ static int s_show(struct seq_file *m, void *p) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - name, active_objs, num_objs, cachep->objsize, + name, active_objs, num_objs, cachep->buffer_size, cachep->num, (1 << cachep->gfporder)); seq_printf(m, " : tunables %4u %4u %4u", cachep->limit, cachep->batchcount, cachep->shared); @@ -3656,5 +3662,5 @@ unsigned int ksize(const void *objp) if (unlikely(objp == NULL)) return 0; - return obj_reallen(page_get_cache(virt_to_page(objp))); + return obj_size(page_get_cache(virt_to_page(objp))); } -- cgit v1.2.2 From 18f820f655ce93b1e4d9b48fc6fcafc64157c6bc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:43 -0800 Subject: [PATCH] slab: minor cleanup to kmem_cache_alloc_node Clean up kmem_cache_alloc_node a bit. Signed-off-by: Christoph Lameter Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 1a014aaf4491..bb7a9837b949 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2928,27 +2928,18 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) unsigned long save_flags; void *ptr; - if (nodeid == -1) - return __cache_alloc(cachep, flags); - - if (unlikely(!cachep->nodelists[nodeid])) { - /* Fall back to __cache_alloc if we run into trouble */ - printk(KERN_WARNING - "slab: not allocating in inactive node %d for cache %s\n", - nodeid, cachep->name); - return __cache_alloc(cachep, flags); - } - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); - if (nodeid == numa_node_id()) + + if (nodeid == -1 || nodeid == numa_node_id() || + !cachep->nodelists[nodeid]) ptr = ____cache_alloc(cachep, flags); else ptr = __cache_alloc_node(cachep, flags, nodeid); local_irq_restore(save_flags); - ptr = - cache_alloc_debugcheck_after(cachep, flags, ptr, - __builtin_return_address(0)); + + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, + __builtin_return_address(0)); return ptr; } -- cgit v1.2.2 From 5ec8a847bb8ae2ba6395cfb7cb4bfdc78ada82ed Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Feb 2006 03:05:44 -0800 Subject: [PATCH] slab: have index_of bug at compile time I noticed the code for index_of is a creative way of finding the cache index using the compiler to optimize to a single hard coded number. But I couldn't help noticing that it uses two methods to let you know that someone used it wrong. One is at compile time (the correct way), and the other is at run time (not good). 
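For readers who have not met the trick before, here is a minimal stand-alone sketch of the link-time check the patch below standardises on (the __bad_index() name, the sizes and the gcc -O2 build are invented for the example and are not kernel code): a call to an extern function that is declared but never defined costs nothing while the compiler can prove the call dead, and becomes a hard link error the moment the helper is misused, which is exactly the early failure a run-time BUG() cannot give.

extern void __bad_index(void);          /* intentionally never defined */

static inline __attribute__((always_inline))
int index_of_demo(unsigned long size)
{
        if (__builtin_constant_p(size)) {
                if (size == 32)
                        return 0;
                if (size == 64)
                        return 1;
                if (size == 128)
                        return 2;
                /* constant but unsupported size: this call survives
                 * optimisation, so the final link fails loudly */
                __bad_index();
        } else
                __bad_index();  /* non-constant size: also a link error */
        return 0;
}

int main(void)
{
        return index_of_demo(64);       /* folds to 1, links cleanly */
        /* index_of_demo(100), or any non-constant size, would leave a
         * reference to __bad_index() behind and break the build instead
         * of waiting for a BUG() at run time; build with gcc -O2, as the
         * kernel itself is always built with optimisation */
}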
Signed-off-by: Steven Rostedt Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index bb7a9837b949..613d385519fe 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -316,6 +316,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; */ static __always_inline int index_of(const size_t size) { + extern void __bad_size(void); + if (__builtin_constant_p(size)) { int i = 0; @@ -326,12 +328,9 @@ static __always_inline int index_of(const size_t size) i++; #include "linux/kmalloc_sizes.h" #undef CACHE - { - extern void __bad_size(void); - __bad_size(); - } + __bad_size(); } else - BUG(); + __bad_size(); return 0; } -- cgit v1.2.2 From fbaccacff1f17c65ae0972085368a7ec75be6062 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Feb 2006 03:05:45 -0800 Subject: [PATCH] slab: cache_estimate cleanup Clean up cache_estimate() in mm/slab.c and improves the algorithm from O(n) to O(1). We first calculate the maximum number of objects a slab can hold after struct slab and kmem_bufctl_t for each object has been given enough space. After that, to respect alignment rules, we decrease the number of objects if necessary. As required padding is at most align-1 and memory of obj_size is at least align, it is always enough to decrease number of objects by one. The optimization was originally made by Balbir Singh with more improvements from Steven Rostedt. Manfred Spraul provider further modifications: no loop at all for the off-slab case and added comments to explain the background. Acked-by: Balbir Singh Signed-off-by: Manfred Spraul Signed-off-by: Steven Rostedt Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 613d385519fe..e869400ea731 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -702,32 +702,69 @@ kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags) } EXPORT_SYMBOL(kmem_find_general_cachep); -/* Cal the num objs, wastage, and bytes left over for a given slab size. */ -static void cache_estimate(unsigned long gfporder, size_t size, size_t align, - int flags, size_t *left_over, unsigned int *num) +static size_t slab_mgmt_size(size_t nr_objs, size_t align) { - int i; - size_t wastage = PAGE_SIZE << gfporder; - size_t extra = 0; - size_t base = 0; + return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); +} - if (!(flags & CFLGS_OFF_SLAB)) { - base = sizeof(struct slab); - extra = sizeof(kmem_bufctl_t); - } - i = 0; - while (i * size + ALIGN(base + i * extra, align) <= wastage) - i++; - if (i > 0) - i--; +/* Calculate the number of objects and left-over bytes for a given + buffer size. */ +static void cache_estimate(unsigned long gfporder, size_t buffer_size, + size_t align, int flags, size_t *left_over, + unsigned int *num) +{ + int nr_objs; + size_t mgmt_size; + size_t slab_size = PAGE_SIZE << gfporder; - if (i > SLAB_LIMIT) - i = SLAB_LIMIT; + /* + * The slab management structure can be either off the slab or + * on it. 
For the latter case, the memory allocated for a + * slab is used for: + * + * - The struct slab + * - One kmem_bufctl_t for each object + * - Padding to respect alignment of @align + * - @buffer_size bytes for each object + * + * If the slab management structure is off the slab, then the + * alignment will already be calculated into the size. Because + * the slabs are all pages aligned, the objects will be at the + * correct alignment when allocated. + */ + if (flags & CFLGS_OFF_SLAB) { + mgmt_size = 0; + nr_objs = slab_size / buffer_size; - *num = i; - wastage -= i * size; - wastage -= ALIGN(base + i * extra, align); - *left_over = wastage; + if (nr_objs > SLAB_LIMIT) + nr_objs = SLAB_LIMIT; + } else { + /* + * Ignore padding for the initial guess. The padding + * is at most @align-1 bytes, and @buffer_size is at + * least @align. In the worst case, this result will + * be one greater than the number of objects that fit + * into the memory allocation when taking the padding + * into account. + */ + nr_objs = (slab_size - sizeof(struct slab)) / + (buffer_size + sizeof(kmem_bufctl_t)); + + /* + * This calculated number will be either the right + * amount, or one greater than what we want. + */ + if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size + > slab_size) + nr_objs--; + + if (nr_objs > SLAB_LIMIT) + nr_objs = SLAB_LIMIT; + + mgmt_size = slab_mgmt_size(nr_objs, align); + } + *num = nr_objs; + *left_over = slab_size - nr_objs*buffer_size - mgmt_size; } #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) -- cgit v1.2.2 From 12dd36faec5d3bd96da84fa8f76efecc632930ab Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Wed, 1 Feb 2006 03:05:46 -0800 Subject: [PATCH] slab: extract slab_destroy_objs() Create a helper function, slab_destroy_objs() which called from slab_destroy(). This makes slab_destroy() smaller and more readable, and moves ifdefs outside the function body. Signed-off-by: Matthew Dobson Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e869400ea731..85adf0992011 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1459,15 +1459,13 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) } #endif -/* Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. - * The cache-lock is not held/needed. +#if DEBUG +/** + * slab_destroy_objs - call the registered destructor for each object in + * a slab that is to be destroyed. 
*/ -static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) +static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) { - void *addr = slabp->s_mem - slabp->colouroff; - -#if DEBUG int i; for (i = 0; i < cachep->num; i++) { void *objp = slabp->s_mem + cachep->buffer_size * i; @@ -1496,7 +1494,10 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) if (cachep->dtor && !(cachep->flags & SLAB_POISON)) (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); } +} #else +static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) +{ if (cachep->dtor) { int i; for (i = 0; i < cachep->num; i++) { @@ -1504,8 +1505,19 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) (cachep->dtor) (objp, cachep, 0); } } +} #endif +/** + * Destroy all the objs in a slab, and release the mem back to the system. + * Before calling the slab must have been unlinked from the cache. + * The cache-lock is not held/needed. + */ +static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) +{ + void *addr = slabp->s_mem - slabp->colouroff; + + slab_destroy_objs(cachep, slabp); if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { struct slab_rcu *slab_rcu; -- cgit v1.2.2 From 78d382d77c84229d031431931bf6490d5da6ab86 Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Wed, 1 Feb 2006 03:05:47 -0800 Subject: [PATCH] slab: extract slab_{put|get}_obj Create two helper functions slab_get_obj() and slab_put_obj() to replace duplicated code in mm/slab.c Signed-off-by: Matthew Dobson Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 77 +++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 85adf0992011..594a9155c7d8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2226,6 +2226,42 @@ static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) } } +static void *slab_get_obj(kmem_cache_t *cachep, struct slab *slabp, int nodeid) +{ + void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); + kmem_bufctl_t next; + + slabp->inuse++; + next = slab_bufctl(slabp)[slabp->free]; +#if DEBUG + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; + WARN_ON(slabp->nodeid != nodeid); +#endif + slabp->free = next; + + return objp; +} + +static void slab_put_obj(kmem_cache_t *cachep, struct slab *slabp, void *objp, + int nodeid) +{ + unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; + +#if DEBUG + /* Verify that the slab belongs to the intended node */ + WARN_ON(slabp->nodeid != nodeid); + + if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { + printk(KERN_ERR "slab: double free detected in cache " + "'%s', objp %p\n", cachep->name, objp); + BUG(); + } +#endif + slab_bufctl(slabp)[objnr] = slabp->free; + slabp->free = objnr; + slabp->inuse--; +} + static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) { int i; @@ -2515,22 +2551,12 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) check_slabp(cachep, slabp); check_spinlock_acquired(cachep); while (slabp->inuse < cachep->num && batchcount--) { - kmem_bufctl_t next; STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - /* get obj pointer */ - ac->entry[ac->avail++] = slabp->s_mem + - slabp->free * cachep->buffer_size; - - slabp->inuse++; - next = slab_bufctl(slabp)[slabp->free]; -#if DEBUG - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; - 
WARN_ON(numa_node_id() != slabp->nodeid); -#endif - slabp->free = next; + ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, + numa_node_id()); } check_slabp(cachep, slabp); @@ -2675,7 +2701,6 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) struct slab *slabp; struct kmem_list3 *l3; void *obj; - kmem_bufctl_t next; int x; l3 = cachep->nodelists[nodeid]; @@ -2701,14 +2726,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) BUG_ON(slabp->inuse == cachep->num); - /* get obj pointer */ - obj = slabp->s_mem + slabp->free * cachep->buffer_size; - slabp->inuse++; - next = slab_bufctl(slabp)[slabp->free]; -#if DEBUG - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; -#endif - slabp->free = next; + obj = slab_get_obj(cachep, slabp, nodeid); check_slabp(cachep, slabp); l3->free_objects--; /* move slabp to correct slabp list: */ @@ -2748,29 +2766,14 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, for (i = 0; i < nr_objects; i++) { void *objp = objpp[i]; struct slab *slabp; - unsigned int objnr; slabp = page_get_slab(virt_to_page(objp)); l3 = cachep->nodelists[node]; list_del(&slabp->list); - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; check_spinlock_acquired_node(cachep, node); check_slabp(cachep, slabp); - -#if DEBUG - /* Verify that the slab belongs to the intended node */ - WARN_ON(slabp->nodeid != node); - - if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); - BUG(); - } -#endif - slab_bufctl(slabp)[objnr] = slabp->free; - slabp->free = objnr; + slab_put_obj(cachep, slabp, objp, node); STATS_DEC_ACTIVE(cachep); - slabp->inuse--; l3->free_objects++; check_slabp(cachep, slabp); -- cgit v1.2.2 From 5295a74cc0bcf1291686eb734ccb06baa3d55c1a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:05:48 -0800 Subject: [PATCH] slab: reduce inlining From: Manfred Spraul Reduce the amount of inline functions in slab to the functions that are used in the hot path: - no inline for debug functions - no __always_inline, inline is already __always_inline - remove inline from a few numa support functions. 
Before: text data bss dec hex filename 13588 752 48 14388 3834 mm/slab.o (defconfig) 16671 2492 48 19211 4b0b mm/slab.o (numa) After: text data bss dec hex filename 13366 752 48 14166 3756 mm/slab.o (defconfig) 16230 2492 48 18770 4952 mm/slab.o (numa) Signed-off-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 594a9155c7d8..ba288b3877d1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -337,7 +337,7 @@ static __always_inline int index_of(const size_t size) #define INDEX_AC index_of(sizeof(struct arraycache_init)) #define INDEX_L3 index_of(sizeof(struct kmem_list3)) -static inline void kmem_list3_init(struct kmem_list3 *parent) +static void kmem_list3_init(struct kmem_list3 *parent) { INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); @@ -818,7 +818,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, #ifdef CONFIG_NUMA static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int); -static inline struct array_cache **alloc_alien_cache(int node, int limit) +static struct array_cache **alloc_alien_cache(int node, int limit) { struct array_cache **ac_ptr; int memsize = sizeof(void *) * MAX_NUMNODES; @@ -845,7 +845,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit) return ac_ptr; } -static inline void free_alien_cache(struct array_cache **ac_ptr) +static void free_alien_cache(struct array_cache **ac_ptr) { int i; @@ -858,8 +858,8 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) kfree(ac_ptr); } -static inline void __drain_alien_cache(kmem_cache_t *cachep, - struct array_cache *ac, int node) +static void __drain_alien_cache(kmem_cache_t *cachep, + struct array_cache *ac, int node) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -1534,7 +1534,7 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) /* For setting up all the kmem_list3s for cache whose buffer_size is same as size of kmem_list3. */ -static inline void set_up_list3s(kmem_cache_t *cachep, int index) +static void set_up_list3s(kmem_cache_t *cachep, int index) { int node; @@ -1937,7 +1937,7 @@ static void check_spinlock_acquired(kmem_cache_t *cachep) #endif } -static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) +static void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); -- cgit v1.2.2 From 6ed5eb2211204224799b2821656bbbfde26ef200 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:05:49 -0800 Subject: [PATCH] slab: extract virt_to_{cache|slab} Introduce virt_to_cache() and virt_to_slab() functions to reduce duplicate code and introduce a proper abstraction should we want to support other kind of mapping for address to slab and cache (eg. for vmalloc() or I/O memory). 
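To make the abstraction argument concrete, a rough stand-alone sketch with invented names (struct toy_cache, struct toy_slab and the page-masking scheme are illustration only; the real patch keeps the mapping in struct page): once every caller goes through virt_to_*-style helpers, the lookup behind them can be swapped for a different scheme without touching a single call site.

#include <stdint.h>

#define TOY_PAGE_SIZE 4096UL

struct toy_cache { const char *name; };
struct toy_slab  { struct toy_cache *cache; };

/* in this toy, each object lives in a page whose first bytes hold the
 * slab descriptor, so the lookup is a mask; any other mapping (a hash
 * table, a field in struct page, ...) could sit here instead */
static inline struct toy_slab *toy_virt_to_slab(const void *obj)
{
        return (struct toy_slab *)((uintptr_t)obj & ~(TOY_PAGE_SIZE - 1));
}

static inline struct toy_cache *toy_virt_to_cache(const void *obj)
{
        return toy_virt_to_slab(obj)->cache;
}

/* callers written against the helpers never see the representation */
static inline const char *toy_owner_name(const void *obj)
{
        return toy_virt_to_cache(obj)->name;
}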
Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index ba288b3877d1..c2f9e0a330ff 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -596,6 +596,18 @@ static inline struct slab *page_get_slab(struct page *page) return (struct slab *)page->lru.prev; } +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct page *page = virt_to_page(obj); + return page_get_cache(page); +} + +static inline struct slab *virt_to_slab(const void *obj) +{ + struct page *page = virt_to_page(obj); + return page_get_slab(page); +} + /* These are the default caches for kmalloc. Custom caches can have other sizes. */ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, @@ -1437,7 +1449,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) /* Print some data about the neighboring objects, if they * exist: */ - struct slab *slabp = page_get_slab(virt_to_page(objp)); + struct slab *slabp = virt_to_slab(objp); int objnr; objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; @@ -2767,7 +2779,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, void *objp = objpp[i]; struct slab *slabp; - slabp = page_get_slab(virt_to_page(objp)); + slabp = virt_to_slab(objp); l3 = cachep->nodelists[node]; list_del(&slabp->list); check_spinlock_acquired_node(cachep, node); @@ -2867,7 +2879,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) #ifdef CONFIG_NUMA { struct slab *slabp; - slabp = page_get_slab(virt_to_page(objp)); + slabp = virt_to_slab(objp); if (unlikely(slabp->nodeid != numa_node_id())) { struct array_cache *alien = NULL; int nodeid = slabp->nodeid; @@ -3130,7 +3142,7 @@ void kfree(const void *objp) return; local_irq_save(flags); kfree_debugcheck(objp); - c = page_get_cache(virt_to_page(objp)); + c = virt_to_cache(objp); mutex_debug_check_no_locks_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); @@ -3704,5 +3716,5 @@ unsigned int ksize(const void *objp) if (unlikely(objp == NULL)) return 0; - return obj_size(page_get_cache(virt_to_page(objp))); + return obj_size(virt_to_cache(objp)); } -- cgit v1.2.2 From 9a2dba4b4912b493070cbc170629fdbf440b01d7 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:05:49 -0800 Subject: [PATCH] slab: rename ac_data to cpu_cache_get Rename the ac_data() function to more descriptive cpu_cache_get(). 
Acked-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c2f9e0a330ff..b19093864998 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -679,7 +679,7 @@ static void enable_cpucache(kmem_cache_t *cachep); static void cache_reap(void *unused); static int __node_shrink(kmem_cache_t *cachep, int node); -static inline struct array_cache *ac_data(kmem_cache_t *cachep) +static inline struct array_cache *cpu_cache_get(kmem_cache_t *cachep) { return cachep->array[smp_processor_id()]; } @@ -1186,8 +1186,8 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); local_irq_disable(); - BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, ac_data(&cache_cache), + BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(&cache_cache), sizeof(struct arraycache_init)); cache_cache.array[smp_processor_id()] = ptr; local_irq_enable(); @@ -1195,9 +1195,9 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); local_irq_disable(); - BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) + BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) != &initarray_generic.cache); - memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), + memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), sizeof(struct arraycache_init)); malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = ptr; @@ -1235,7 +1235,7 @@ void __init kmem_cache_init(void) g_cpucache_up = FULL; /* Register a cpu startup notifier callback - * that initializes ac_data for all new cpus + * that initializes cpu_cache_get for all new cpus */ register_cpu_notifier(&cpucache_notifier); @@ -1909,11 +1909,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - BUG_ON(!ac_data(cachep)); - ac_data(cachep)->avail = 0; - ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - ac_data(cachep)->batchcount = 1; - ac_data(cachep)->touched = 0; + BUG_ON(!cpu_cache_get(cachep)); + cpu_cache_get(cachep)->avail = 0; + cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep)->batchcount = 1; + cpu_cache_get(cachep)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; } @@ -1992,7 +1992,7 @@ static void do_drain(void *arg) int node = numa_node_id(); check_irq_off(); - ac = ac_data(cachep); + ac = cpu_cache_get(cachep); spin_lock(&cachep->nodelists[node]->list_lock); free_block(cachep, ac->entry, ac->avail, node); spin_unlock(&cachep->nodelists[node]->list_lock); @@ -2518,7 +2518,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) struct array_cache *ac; check_irq_off(); - ac = ac_data(cachep); + ac = cpu_cache_get(cachep); retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2590,7 +2590,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) x = cache_grow(cachep, flags, numa_node_id()); // cache_grow can reenable interrupts, then ac could change. - ac = ac_data(cachep); + ac = cpu_cache_get(cachep); if (!x && ac->avail == 0) // no objects in sight? 
abort return NULL; @@ -2675,7 +2675,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) #endif check_irq_off(); - ac = ac_data(cachep); + ac = cpu_cache_get(cachep); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); ac->touched = 1; @@ -2868,7 +2868,7 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) */ static inline void __cache_free(kmem_cache_t *cachep, void *objp) { - struct array_cache *ac = ac_data(cachep); + struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); @@ -3253,7 +3253,7 @@ static void do_ccupdate_local(void *info) struct array_cache *old; check_irq_off(); - old = ac_data(new->cachep); + old = cpu_cache_get(new->cachep); new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; new->new[smp_processor_id()] = old; @@ -3419,7 +3419,7 @@ static void cache_reap(void *unused) drain_alien_cache(searchp, l3); spin_lock_irq(&l3->list_lock); - drain_array_locked(searchp, ac_data(searchp), 0, + drain_array_locked(searchp, cpu_cache_get(searchp), 0, numa_node_id()); if (time_after(l3->next_reap, jiffies)) -- cgit v1.2.2 From 343e0d7a93951e35065fdb5e3dd61aece0ec6b3c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:05:50 -0800 Subject: [PATCH] slab: replace kmem_cache_t with struct kmem_cache Replace uses of kmem_cache_t with proper struct kmem_cache in mm/slab.c. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 195 +++++++++++++++++++++++++++++++------------------------------- 1 file changed, 98 insertions(+), 97 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index b19093864998..6fbd6a1cdeb4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -55,7 +55,7 @@ * * SMP synchronization: * constructors and destructors are called without any locking. - * Several members in kmem_cache_t and struct slab never change, they + * Several members in struct kmem_cache and struct slab never change, they * are accessed without any locking. * The per-cpu arrays are never accessed from the wrong cpu, no locking, * and local interrupts are disabled so slab code is preempt-safe. @@ -244,7 +244,7 @@ struct slab { */ struct slab_rcu { struct rcu_head head; - kmem_cache_t *cachep; + struct kmem_cache *cachep; void *addr; }; @@ -363,7 +363,7 @@ static void kmem_list3_init(struct kmem_list3 *parent) } while (0) /* - * kmem_cache_t + * struct kmem_cache * * manages a cache. 
*/ @@ -391,15 +391,15 @@ struct kmem_cache { size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ unsigned int colour_next; /* cache colouring */ - kmem_cache_t *slabp_cache; + struct kmem_cache *slabp_cache; unsigned int slab_size; unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor) (void *, kmem_cache_t *, unsigned long); + void (*ctor) (void *, struct kmem_cache *, unsigned long); /* de-constructor func */ - void (*dtor) (void *, kmem_cache_t *, unsigned long); + void (*dtor) (void *, struct kmem_cache *, unsigned long); /* 4) cache creation/removal */ const char *name; @@ -509,23 +509,23 @@ struct kmem_cache { * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] */ -static int obj_offset(kmem_cache_t *cachep) +static int obj_offset(struct kmem_cache *cachep) { return cachep->obj_offset; } -static int obj_size(kmem_cache_t *cachep) +static int obj_size(struct kmem_cache *cachep) { return cachep->obj_size; } -static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) +static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); } -static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) +static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) @@ -534,7 +534,7 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); } -static void **dbg_userword(kmem_cache_t *cachep, void *objp) +static void **dbg_userword(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_STORE_USER)); return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); @@ -636,16 +636,16 @@ static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ -static kmem_cache_t cache_cache = { +static struct kmem_cache cache_cache = { .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, - .buffer_size = sizeof(kmem_cache_t), + .buffer_size = sizeof(struct kmem_cache), .flags = SLAB_NO_REAP, .spinlock = SPIN_LOCK_UNLOCKED, .name = "kmem_cache", #if DEBUG - .obj_size = sizeof(kmem_cache_t), + .obj_size = sizeof(struct kmem_cache), #endif }; @@ -674,17 +674,17 @@ static enum { static DEFINE_PER_CPU(struct work_struct, reap_work); -static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node); -static void enable_cpucache(kmem_cache_t *cachep); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); +static void enable_cpucache(struct kmem_cache *cachep); static void cache_reap(void *unused); -static int __node_shrink(kmem_cache_t *cachep, int node); +static int __node_shrink(struct kmem_cache *cachep, int node); -static inline struct array_cache *cpu_cache_get(kmem_cache_t *cachep) +static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { return cachep->array[smp_processor_id()]; } -static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags) +static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) { struct cache_sizes *csizep = malloc_sizes; @@ -708,7 +708,7 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t 
gfpflags) return csizep->cs_cachep; } -kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags) +struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) { return __find_general_cachep(size, gfpflags); } @@ -781,7 +781,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) -static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) +static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) { printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); @@ -828,7 +828,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, } #ifdef CONFIG_NUMA -static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int); +static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); static struct array_cache **alloc_alien_cache(int node, int limit) { @@ -870,7 +870,7 @@ static void free_alien_cache(struct array_cache **ac_ptr) kfree(ac_ptr); } -static void __drain_alien_cache(kmem_cache_t *cachep, +static void __drain_alien_cache(struct kmem_cache *cachep, struct array_cache *ac, int node) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -883,7 +883,7 @@ static void __drain_alien_cache(kmem_cache_t *cachep, } } -static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) +static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3) { int i = 0; struct array_cache *ac; @@ -908,7 +908,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; - kmem_cache_t *cachep; + struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); int memsize = sizeof(struct kmem_list3); @@ -1046,7 +1046,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid) +static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) { struct kmem_list3 *ptr; @@ -1086,14 +1086,14 @@ void __init kmem_cache_init(void) /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: - * 1) initialize the cache_cache cache: it contains the kmem_cache_t + * 1) initialize the cache_cache cache: it contains the struct kmem_cache * structures of all caches, except cache_cache itself: cache_cache * is statically allocated. * Initially an __init data area is used for the head array and the * kmem_list3 structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. * 2) Create the first kmalloc cache. - * The kmem_cache_t for the new cache is allocated normally. + * The struct kmem_cache for the new cache is allocated normally. * An __init data area is used for the head array. * 3) Create the remaining kmalloc caches, with minimally sized * head arrays. @@ -1224,7 +1224,7 @@ void __init kmem_cache_init(void) /* 6) resize the head arrays to their final sizes */ { - kmem_cache_t *cachep; + struct kmem_cache *cachep; mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) enable_cpucache(cachep); @@ -1267,7 +1267,7 @@ __initcall(cpucache_init); * did not request dmaable memory, we might get it, but that * would be relatively rare and ignorable. 
*/ -static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) +static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct page *page; void *addr; @@ -1293,7 +1293,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) /* * Interface to system's page release. */ -static void kmem_freepages(kmem_cache_t *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, void *addr) { unsigned long i = (1 << cachep->gfporder); struct page *page = virt_to_page(addr); @@ -1315,7 +1315,7 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr) static void kmem_rcu_free(struct rcu_head *head) { struct slab_rcu *slab_rcu = (struct slab_rcu *)head; - kmem_cache_t *cachep = slab_rcu->cachep; + struct kmem_cache *cachep = slab_rcu->cachep; kmem_freepages(cachep, slab_rcu->addr); if (OFF_SLAB(cachep)) @@ -1325,7 +1325,7 @@ static void kmem_rcu_free(struct rcu_head *head) #if DEBUG #ifdef CONFIG_DEBUG_PAGEALLOC -static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, +static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, unsigned long caller) { int size = obj_size(cachep); @@ -1358,7 +1358,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, } #endif -static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) +static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) { int size = obj_size(cachep); addr = &((char *)addr)[obj_offset(cachep)]; @@ -1380,7 +1380,7 @@ static void dump_line(char *data, int offset, int limit) #if DEBUG -static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) +static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) { int i, size; char *realobj; @@ -1409,7 +1409,7 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) } } -static void check_poison_obj(kmem_cache_t *cachep, void *objp) +static void check_poison_obj(struct kmem_cache *cachep, void *objp) { char *realobj; int size, i; @@ -1476,7 +1476,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) * slab_destroy_objs - call the registered destructor for each object in * a slab that is to be destroyed. */ -static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) +static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { @@ -1508,7 +1508,7 @@ static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) } } #else -static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) +static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { if (cachep->dtor) { int i; @@ -1525,7 +1525,7 @@ static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp) * Before calling the slab must have been unlinked from the cache. * The cache-lock is not held/needed. */ -static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) +static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) { void *addr = slabp->s_mem - slabp->colouroff; @@ -1546,7 +1546,7 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) /* For setting up all the kmem_list3s for cache whose buffer_size is same as size of kmem_list3. 
*/ -static void set_up_list3s(kmem_cache_t *cachep, int index) +static void set_up_list3s(struct kmem_cache *cachep, int index) { int node; @@ -1566,7 +1566,7 @@ static void set_up_list3s(kmem_cache_t *cachep, int index) * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. */ -static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, +static inline size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, gfp_t flags) { size_t left_over = 0; @@ -1638,13 +1638,13 @@ static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ -kmem_cache_t * +struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), - void (*dtor)(void*, kmem_cache_t *, unsigned long)) + unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), + void (*dtor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; - kmem_cache_t *cachep = NULL; + struct kmem_cache *cachep = NULL; struct list_head *p; /* @@ -1662,7 +1662,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, mutex_lock(&cache_chain_mutex); list_for_each(p, &cache_chain) { - kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); + struct kmem_cache *pc = list_entry(p, struct kmem_cache, next); mm_segment_t old_fs = get_fs(); char tmp; int res; @@ -1762,10 +1762,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, align = ralign; /* Get cache's description obj. */ - cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); + cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL); if (!cachep) goto oops; - memset(cachep, 0, sizeof(kmem_cache_t)); + memset(cachep, 0, sizeof(struct kmem_cache)); #if DEBUG cachep->obj_size = size; @@ -1941,7 +1941,7 @@ static void check_irq_on(void) BUG_ON(irqs_disabled()); } -static void check_spinlock_acquired(kmem_cache_t *cachep) +static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); @@ -1949,7 +1949,7 @@ static void check_spinlock_acquired(kmem_cache_t *cachep) #endif } -static void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) +static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); @@ -1982,12 +1982,12 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) preempt_enable(); } -static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, +static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, int force, int node); static void do_drain(void *arg) { - kmem_cache_t *cachep = (kmem_cache_t *) arg; + struct kmem_cache *cachep = (struct kmem_cache *) arg; struct array_cache *ac; int node = numa_node_id(); @@ -1999,7 +1999,7 @@ static void do_drain(void *arg) ac->avail = 0; } -static void drain_cpu_caches(kmem_cache_t *cachep) +static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_list3 *l3; int node; @@ -2020,7 +2020,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep) spin_unlock_irq(&cachep->spinlock); } -static int __node_shrink(kmem_cache_t *cachep, int node) +static int __node_shrink(struct kmem_cache *cachep, int node) { struct slab *slabp; struct kmem_list3 *l3 = cachep->nodelists[node]; @@ -2049,7 +2049,7 @@ static int 
__node_shrink(kmem_cache_t *cachep, int node) return ret; } -static int __cache_shrink(kmem_cache_t *cachep) +static int __cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; struct kmem_list3 *l3; @@ -2075,7 +2075,7 @@ static int __cache_shrink(kmem_cache_t *cachep) * Releases as many slabs as possible for a cache. * To help debugging, a zero exit status indicates all slabs were released. */ -int kmem_cache_shrink(kmem_cache_t *cachep) +int kmem_cache_shrink(struct kmem_cache *cachep) { if (!cachep || in_interrupt()) BUG(); @@ -2088,7 +2088,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * kmem_cache_destroy - delete a cache * @cachep: the cache to destroy * - * Remove a kmem_cache_t object from the slab cache. + * Remove a struct kmem_cache object from the slab cache. * Returns 0 on success. * * It is expected this function will be called by a module when it is @@ -2101,7 +2101,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * The caller must guarantee that noone will allocate memory from the cache * during the kmem_cache_destroy(). */ -int kmem_cache_destroy(kmem_cache_t *cachep) +int kmem_cache_destroy(struct kmem_cache *cachep) { int i; struct kmem_list3 *l3; @@ -2152,7 +2152,7 @@ int kmem_cache_destroy(kmem_cache_t *cachep) EXPORT_SYMBOL(kmem_cache_destroy); /* Get the memory for a slab management obj. */ -static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, +static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, int colour_off, gfp_t local_flags) { struct slab *slabp; @@ -2178,7 +2178,7 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) return (kmem_bufctl_t *) (slabp + 1); } -static void cache_init_objs(kmem_cache_t *cachep, +static void cache_init_objs(struct kmem_cache *cachep, struct slab *slabp, unsigned long ctor_flags) { int i; @@ -2227,7 +2227,7 @@ static void cache_init_objs(kmem_cache_t *cachep, slabp->free = 0; } -static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) +static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { if (flags & SLAB_DMA) { if (!(cachep->gfpflags & GFP_DMA)) @@ -2238,7 +2238,7 @@ static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) } } -static void *slab_get_obj(kmem_cache_t *cachep, struct slab *slabp, int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) { void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); kmem_bufctl_t next; @@ -2254,7 +2254,7 @@ static void *slab_get_obj(kmem_cache_t *cachep, struct slab *slabp, int nodeid) return objp; } -static void slab_put_obj(kmem_cache_t *cachep, struct slab *slabp, void *objp, +static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, int nodeid) { unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; @@ -2274,7 +2274,7 @@ static void slab_put_obj(kmem_cache_t *cachep, struct slab *slabp, void *objp, slabp->inuse--; } -static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) +static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) { int i; struct page *page; @@ -2293,7 +2293,7 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. 
*/ -static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) +static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct slab *slabp; void *objp; @@ -2404,7 +2404,7 @@ static void kfree_debugcheck(const void *objp) } } -static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, +static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, void *caller) { struct page *page; @@ -2478,7 +2478,7 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, return objp; } -static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) +static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) { kmem_bufctl_t i; int entries = 0; @@ -2511,7 +2511,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) #define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_list3 *l3; @@ -2602,7 +2602,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) } static inline void -cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) +cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { might_sleep_if(flags & __GFP_WAIT); #if DEBUG @@ -2611,7 +2611,7 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) } #if DEBUG -static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, void *caller) { if (!objp) @@ -2660,7 +2660,7 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) #endif -static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) +static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; @@ -2687,7 +2687,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) return objp; } -static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) +static inline void *__cache_alloc(struct kmem_cache *cachep, gfp_t flags) { unsigned long save_flags; void *objp; @@ -2707,7 +2707,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) /* * A interface to enable slab creation on nodeid */ -static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) +static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct list_head *entry; struct slab *slabp; @@ -2769,7 +2769,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) /* * Caller needs to acquire correct kmem_list's list_lock */ -static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, +static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node) { int i; @@ -2807,7 +2807,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, } } -static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) +static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) { int batchcount; struct kmem_list3 *l3; @@ -2866,7 +2866,7 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) * * Called with disabled ints. 
*/ -static inline void __cache_free(kmem_cache_t *cachep, void *objp) +static inline void __cache_free(struct kmem_cache *cachep, void *objp) { struct array_cache *ac = cpu_cache_get(cachep); @@ -2925,7 +2925,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) * Allocate an object from this cache. The flags are only relevant * if the cache has no available objects. */ -void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags) +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { return __cache_alloc(cachep, flags); } @@ -2945,7 +2945,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); * * Currently only used for dentry validation. */ -int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) +int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) { unsigned long addr = (unsigned long)ptr; unsigned long min_addr = PAGE_OFFSET; @@ -2986,7 +2986,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) * New and improved: it will now make sure that the object gets * put on the correct node list so that there is no false sharing. */ -void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { unsigned long save_flags; void *ptr; @@ -3010,7 +3010,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); void *kmalloc_node(size_t size, gfp_t flags, int node) { - kmem_cache_t *cachep; + struct kmem_cache *cachep; cachep = kmem_find_general_cachep(size, flags); if (unlikely(cachep == NULL)) @@ -3043,7 +3043,7 @@ EXPORT_SYMBOL(kmalloc_node); */ void *__kmalloc(size_t size, gfp_t flags) { - kmem_cache_t *cachep; + struct kmem_cache *cachep; /* If you want to save a few bytes .text space: replace * __ with kmem_. @@ -3114,7 +3114,7 @@ EXPORT_SYMBOL(__alloc_percpu); * Free an object which was previously allocated from this * cache. */ -void kmem_cache_free(kmem_cache_t *cachep, void *objp) +void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; @@ -3135,7 +3135,7 @@ EXPORT_SYMBOL(kmem_cache_free); */ void kfree(const void *objp) { - kmem_cache_t *c; + struct kmem_cache *c; unsigned long flags; if (unlikely(!objp)) @@ -3172,13 +3172,13 @@ void free_percpu(const void *objp) EXPORT_SYMBOL(free_percpu); #endif -unsigned int kmem_cache_size(kmem_cache_t *cachep) +unsigned int kmem_cache_size(struct kmem_cache *cachep) { return obj_size(cachep); } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(kmem_cache_t *cachep) +const char *kmem_cache_name(struct kmem_cache *cachep) { return cachep->name; } @@ -3187,7 +3187,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); /* * This initializes kmem_list3 for all nodes. 
*/ -static int alloc_kmemlist(kmem_cache_t *cachep) +static int alloc_kmemlist(struct kmem_cache *cachep) { int node; struct kmem_list3 *l3; @@ -3243,7 +3243,7 @@ static int alloc_kmemlist(kmem_cache_t *cachep) } struct ccupdate_struct { - kmem_cache_t *cachep; + struct kmem_cache *cachep; struct array_cache *new[NR_CPUS]; }; @@ -3259,7 +3259,7 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } -static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared) { struct ccupdate_struct new; @@ -3305,7 +3305,7 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, return 0; } -static void enable_cpucache(kmem_cache_t *cachep) +static void enable_cpucache(struct kmem_cache *cachep) { int err; int limit, shared; @@ -3357,7 +3357,7 @@ static void enable_cpucache(kmem_cache_t *cachep) cachep->name, -err); } -static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, +static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, int force, int node) { int tofree; @@ -3402,12 +3402,12 @@ static void cache_reap(void *unused) } list_for_each(walk, &cache_chain) { - kmem_cache_t *searchp; + struct kmem_cache *searchp; struct list_head *p; int tofree; struct slab *slabp; - searchp = list_entry(walk, kmem_cache_t, next); + searchp = list_entry(walk, struct kmem_cache, next); if (searchp->flags & SLAB_NO_REAP) goto next; @@ -3510,15 +3510,15 @@ static void *s_start(struct seq_file *m, loff_t *pos) if (p == &cache_chain) return NULL; } - return list_entry(p, kmem_cache_t, next); + return list_entry(p, struct kmem_cache, next); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - kmem_cache_t *cachep = p; + struct kmem_cache *cachep = p; ++*pos; return cachep->next.next == &cache_chain ? NULL - : list_entry(cachep->next.next, kmem_cache_t, next); + : list_entry(cachep->next.next, struct kmem_cache, next); } static void s_stop(struct seq_file *m, void *p) @@ -3528,7 +3528,7 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { - kmem_cache_t *cachep = p; + struct kmem_cache *cachep = p; struct list_head *q; struct slab *slabp; unsigned long active_objs; @@ -3678,7 +3678,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, mutex_lock(&cache_chain_mutex); res = -EINVAL; list_for_each(p, &cache_chain) { - kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, + next); if (!strcmp(cachep->name, kbuf)) { if (limit < 1 || -- cgit v1.2.2 From b958f7d9f35bfb61625f201cd92a3fc39504af7a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 1 Feb 2006 03:05:51 -0800 Subject: [PATCH] dump_stack() in oom handler Sometimes it's nice to know who's calling. 
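Spelled out, the reporting block the one-liner extends looks roughly like the sketch below (a paraphrase, not the real out_of_memory() body; the report_oom() name is invented): the output stays behind printk_ratelimit() so a burst of failed allocations cannot flood the log, and dump_stack() supplies the call chain that show_mem() alone does not.

#include <linux/kernel.h>
#include <linux/mm.h>

void report_oom(gfp_t gfp_mask, int order)
{
        if (printk_ratelimit()) {
                printk(KERN_WARNING "oom-killer: gfp_mask=0x%x, order=%d\n",
                       gfp_mask, order);
                dump_stack();   /* who was asking for memory */
                show_mem();     /* state of the zones and free lists */
        }
}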
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 14bd4ec79597..b05ab8f2a562 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -271,6 +271,7 @@ void out_of_memory(gfp_t gfp_mask, int order) if (printk_ratelimit()) { printk("oom-killer: gfp_mask=0x%x, order=%d\n", gfp_mask, order); + dump_stack(); show_mem(); } -- cgit v1.2.2 From 7fd6b1413082c303613fc137aca9a004740cacf0 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:05:52 -0800 Subject: [PATCH] slab: fix kzalloc and kstrdup caller report for CONFIG_DEBUG_SLAB Fix kzalloc() and kstrdup() caller report for CONFIG_DEBUG_SLAB. We must pass the caller to __cache_alloc() instead of directly doing __builtin_return_address(0) there; otherwise kzalloc() and kstrdup() are reported as the allocation site instead of the real one. Thanks to Valdis Kletnieks for reporting the problem and Steven Rostedt for the original idea. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6fbd6a1cdeb4..67527268b01c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2687,7 +2687,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) return objp; } -static inline void *__cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static __always_inline void * +__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) { unsigned long save_flags; void *objp; @@ -2698,7 +2699,7 @@ static inline void *__cache_alloc(struct kmem_cache *cachep, gfp_t flags) objp = ____cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, - __builtin_return_address(0)); + caller); prefetchw(objp); return objp; } @@ -2927,7 +2928,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags); + return __cache_alloc(cachep, flags, __builtin_return_address(0)); } EXPORT_SYMBOL(kmem_cache_alloc); @@ -3041,7 +3042,8 @@ EXPORT_SYMBOL(kmalloc_node); * platforms. For example, on i386, it means that the memory must come * from the first 16MB. */ -void *__kmalloc(size_t size, gfp_t flags) +static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, + void *caller) { struct kmem_cache *cachep; @@ -3053,10 +3055,27 @@ void *__kmalloc(size_t size, gfp_t flags) cachep = __find_general_cachep(size, flags); if (unlikely(cachep == NULL)) return NULL; - return __cache_alloc(cachep, flags); + return __cache_alloc(cachep, flags, caller); +} + +#ifndef CONFIG_DEBUG_SLAB + +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc(size, flags, NULL); } EXPORT_SYMBOL(__kmalloc); +#else + +void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) +{ + return __do_kmalloc(size, flags, caller); +} +EXPORT_SYMBOL(__kmalloc_track_caller); + +#endif + #ifdef CONFIG_SMP /** * __alloc_percpu - allocate one copy of the object for every present -- cgit v1.2.2 From a70773ddb96b74c7afe5a5bc859ba45e3d02899e Mon Sep 17 00:00:00 2001 From: "Randy.Dunlap" Date: Wed, 1 Feb 2006 03:05:52 -0800 Subject: [PATCH] mm/slab: add kernel-doc for one function Fix kernel-doc for calculate_slab_order(). 
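For reference, the shape the fix restores is the standard kernel-doc layout, sketched here with an invented function (my_helper, foo and bar are placeholders, not anything in mm/slab.c): an opening double-star comment, a one-line "name - summary", one @arg: line per parameter, then a blank comment line and the free-form description that scripts/kernel-doc extracts.

/**
 * my_helper - one-line summary of what the function does
 * @foo: meaning of the first argument
 * @bar: meaning of the second argument
 *
 * Free-form description.  The tooling keys off the summary line and the
 * @arg: lines, which is the shape the hunk below gives
 * calculate_slab_order().
 */
int my_helper(int foo, int bar)
{
        return foo + bar;       /* placeholder body for the sketch */
}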
Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 67527268b01c..afe9c5f8c57a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1559,8 +1559,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) } /** - * calculate_slab_order - calculate size (page order) of slabs and the number - * of objects per slab. + * calculate_slab_order - calculate size (page order) of slabs + * @cachep: pointer to the cache that is being created + * @size: size of objects to be created in this cache. + * @align: required alignment for the objects. + * @flags: slab allocation flags + * + * Also calculates the number of objects per slab. * * This could be made much more intelligent. For now, try to avoid using * high order pages for slabs. When the gfp() functions are more friendly -- cgit v1.2.2 From ee13d785eac1fbe7e79ecca77bf7e902734a0b30 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 1 Feb 2006 03:05:53 -0800 Subject: [PATCH] slab: fix sparse warning mm/slab.c:1522:13: error: incompatible types for operation (&) Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index afe9c5f8c57a..71370256a7eb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1571,8 +1571,8 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. */ -static inline size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, - size_t align, gfp_t flags) +static inline size_t calculate_slab_order(struct kmem_cache *cachep, + size_t size, size_t align, unsigned long flags) { size_t left_over = 0; -- cgit v1.2.2 From 00ac59adfca8f2f339beb0b67054e786c275553e Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Fri, 3 Feb 2006 21:51:14 +0100 Subject: [PATCH] x86_64: Fix memory policy build without CONFIG_HUGETLBFS > mm/mempolicy.c: In function `huge_zonelist': > mm/mempolicy.c:1045: error: `HPAGE_SHIFT' undeclared (first use in this function) > mm/mempolicy.c:1045: error: (Each undeclared identifier is reported only once > mm/mempolicy.c:1045: error: for each function it appears in.) > make[1]: *** [mm/mempolicy.o] Error 1 Need to wrap huge_zonelist function with CONFIG_HUGETLBFS. Signed-off-by: Ken Chen Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 27da6d5c77ba..3bd7fb7e4b75 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1159,6 +1159,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } +#ifdef CONFIG_HUGETLBFS /* Return a zonelist suitable for a huge page allocation. */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) { @@ -1172,6 +1173,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) } return zonelist_policy(GFP_HIGHUSER, pol); } +#endif /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. 
*/ -- cgit v1.2.2 From 88a2a4ac6b671a4b0dd5d2d762418904c05f4104 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Feb 2006 23:27:36 -0800 Subject: [PATCH] percpu data: only iterate over possible CPUs percpu_data blindly allocates bootmem memory to store NR_CPUS instances of cpudata, instead of allocating memory only for possible cpus. As a preparation for changing that, we need to convert various 0 -> NR_CPUS loops to use for_each_cpu(). (The above only applies to users of asm-generic/percpu.h. powerpc has gone it alone and is presently only allocating memory for present CPUs, so it's currently corrupting memory). Signed-off-by: Eric Dumazet Cc: "David S. Miller" Cc: James Bottomley Acked-by: Ingo Molnar Cc: Jens Axboe Cc: Anton Blanchard Acked-by: William Irwin Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44b4eb4202d9..dde04ff4be31 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1213,18 +1213,21 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; - memset(ret, 0, sizeof(*ret)); + memset(ret, 0, nr * sizeof(unsigned long)); cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { unsigned long *in, *out, off; + if (!cpu_isset(cpu, *cpumask)) + continue; + in = (unsigned long *)&per_cpu(page_states, cpu); cpu = next_cpu(cpu, *cpumask); - if (cpu < NR_CPUS) + if (likely(cpu < NR_CPUS)) prefetch(&per_cpu(page_states, cpu)); out = (unsigned long *)ret; @@ -1886,8 +1889,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static struct per_cpu_pageset - boot_pageset[NR_CPUS]; +static struct per_cpu_pageset boot_pageset[NR_CPUS]; /* * Dynamically allocate memory for the -- cgit v1.2.2 From 64b4a954b03a1153fb8ae38d6ffbd991e01a1e80 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 4 Feb 2006 23:27:55 -0800 Subject: [PATCH] hugetlb: add comment explaining reasons for Bus Errors I just spent some time researching a Bus Error. Turns out that the huge page fault handler can return VM_FAULT_SIGBUS for various conditions where no huge page is available. Add a note explaining the reasoning in the source. Signed-off-by: Christoph Lameter Acked-by: William Lee Irwin III Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b21d78c941b5..ceb3ebb3c399 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -444,6 +444,15 @@ retry: page = alloc_huge_page(vma, address); if (!page) { hugetlb_put_quota(mapping); + /* + * No huge pages available. So this is an OOM + * condition but we do not want to trigger the OOM + * killer, so we return VM_FAULT_SIGBUS. + * + * A program using hugepages may fault with Bus Error + * because no huge pages are available in the cpuset, per + * memory policy or because all are in use! 
+ */ goto out; } -- cgit v1.2.2 From 2e1217cf96b54d3b2d0162930608159e73507fbf Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sat, 4 Feb 2006 23:27:56 -0800 Subject: [PATCH] NUMA slab locking fixes: move color_next to l3 colour_next is used as an index to add a colouring offset to a new slab in the cache (colour_off * colour_next). Now with the NUMA aware slab allocator, it makes sense to colour slabs added on the same node sequentially with colour_next. This patch moves the colouring index "colour_next" per-node by placing it on kmem_list3 rather than kmem_cache. This also helps simplify locking for CPU up and down paths. Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 71370256a7eb..2317096166dd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -294,6 +294,7 @@ struct kmem_list3 { unsigned long next_reap; int free_touched; unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ spinlock_t list_lock; struct array_cache *shared; /* shared per node */ struct array_cache **alien; /* on other nodes */ @@ -344,6 +345,7 @@ static void kmem_list3_init(struct kmem_list3 *parent) INIT_LIST_HEAD(&parent->slabs_free); parent->shared = NULL; parent->alien = NULL; + parent->colour_next = 0; spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; @@ -390,7 +392,6 @@ struct kmem_cache { size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ - unsigned int colour_next; /* cache colouring */ struct kmem_cache *slabp_cache; unsigned int slab_size; unsigned int dflags; /* dynamic flags */ @@ -1119,7 +1120,6 @@ void __init kmem_cache_init(void) BUG(); cache_cache.colour = left_over / cache_cache.colour_off; - cache_cache.colour_next = 0; cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + sizeof(struct slab), cache_line_size()); @@ -2324,18 +2324,19 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) */ ctor_flags |= SLAB_CTOR_ATOMIC; - /* About to mess with non-constant members - lock. */ + /* Take the l3 list lock to change the colour_next on this node */ check_irq_off(); - spin_lock(&cachep->spinlock); + l3 = cachep->nodelists[nodeid]; + spin_lock(&l3->list_lock); /* Get colour for the slab, and cal the next value. */ - offset = cachep->colour_next; - cachep->colour_next++; - if (cachep->colour_next >= cachep->colour) - cachep->colour_next = 0; - offset *= cachep->colour_off; + offset = l3->colour_next; + l3->colour_next++; + if (l3->colour_next >= cachep->colour) + l3->colour_next = 0; + spin_unlock(&l3->list_lock); - spin_unlock(&cachep->spinlock); + offset *= cachep->colour_off; check_irq_off(); if (local_flags & __GFP_WAIT) @@ -2367,7 +2368,6 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (local_flags & __GFP_WAIT) local_irq_disable(); check_irq_off(); - l3 = cachep->nodelists[nodeid]; spin_lock(&l3->list_lock); /* Make slab active. 
*/ -- cgit v1.2.2 From ca3b9b91735316f0ec7f01976f85842e0bfe5c6e Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sat, 4 Feb 2006 23:27:58 -0800 Subject: [PATCH] NUMA slab locking fixes: irq disabling from cahep->spinlock to l3 lock Earlier, we had to disable on chip interrupts while taking the cachep->spinlock because, at cache_grow, on every addition of a slab to a slab cache, we incremented colour_next which was protected by the cachep->spinlock, and cache_grow could occur at interrupt context. Since, now we protect the per-node colour_next with the node's list_lock, we do not need to disable on chip interrupts while taking the per-cache spinlock, but we just need to disable interrupts when taking the per-node kmem_list3 list_lock. Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2317096166dd..d3f68543f9f4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -987,7 +987,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, cpumask_t mask; mask = node_to_cpumask(node); - spin_lock_irq(&cachep->spinlock); + spin_lock(&cachep->spinlock); /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; @@ -996,7 +996,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, if (!l3) goto unlock_cache; - spin_lock(&l3->list_lock); + spin_lock_irq(&l3->list_lock); /* Free limit for this kmem_list3 */ l3->free_limit -= cachep->batchcount; @@ -1004,7 +1004,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, free_block(cachep, nc->entry, nc->avail, node); if (!cpus_empty(mask)) { - spin_unlock(&l3->list_lock); + spin_unlock_irq(&l3->list_lock); goto unlock_cache; } @@ -1023,13 +1023,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, /* free slabs belonging to this node */ if (__node_shrink(cachep, node)) { cachep->nodelists[node] = NULL; - spin_unlock(&l3->list_lock); + spin_unlock_irq(&l3->list_lock); kfree(l3); } else { - spin_unlock(&l3->list_lock); + spin_unlock_irq(&l3->list_lock); } unlock_cache: - spin_unlock_irq(&cachep->spinlock); + spin_unlock(&cachep->spinlock); kfree(nc); } mutex_unlock(&cache_chain_mutex); @@ -2011,18 +2011,18 @@ static void drain_cpu_caches(struct kmem_cache *cachep) smp_call_function_all_cpus(do_drain, cachep); check_irq_on(); - spin_lock_irq(&cachep->spinlock); + spin_lock(&cachep->spinlock); for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { - spin_lock(&l3->list_lock); + spin_lock_irq(&l3->list_lock); drain_array_locked(cachep, l3->shared, 1, node); - spin_unlock(&l3->list_lock); + spin_unlock_irq(&l3->list_lock); if (l3->alien) drain_alien_cache(cachep, l3); } } - spin_unlock_irq(&cachep->spinlock); + spin_unlock(&cachep->spinlock); } static int __node_shrink(struct kmem_cache *cachep, int node) @@ -2338,7 +2338,6 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) offset *= cachep->colour_off; - check_irq_off(); if (local_flags & __GFP_WAIT) local_irq_enable(); @@ -2725,6 +2724,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node BUG_ON(!l3); retry: + check_irq_off(); spin_lock(&l3->list_lock); entry = l3->slabs_partial.next; if (entry == &l3->slabs_partial) { @@ -3304,11 +3304,11 @@ static int 
do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); check_irq_on(); - spin_lock_irq(&cachep->spinlock); + spin_lock(&cachep->spinlock); cachep->batchcount = batchcount; cachep->limit = limit; cachep->shared = shared; - spin_unlock_irq(&cachep->spinlock); + spin_unlock(&cachep->spinlock); for_each_online_cpu(i) { struct array_cache *ccold = new.new[i]; @@ -3564,8 +3564,7 @@ static int s_show(struct seq_file *m, void *p) int node; struct kmem_list3 *l3; - check_irq_on(); - spin_lock_irq(&cachep->spinlock); + spin_lock(&cachep->spinlock); active_objs = 0; num_slabs = 0; for_each_online_node(node) { @@ -3573,7 +3572,8 @@ static int s_show(struct seq_file *m, void *p) if (!l3) continue; - spin_lock(&l3->list_lock); + check_irq_on(); + spin_lock_irq(&l3->list_lock); list_for_each(q, &l3->slabs_full) { slabp = list_entry(q, struct slab, list); @@ -3600,7 +3600,7 @@ static int s_show(struct seq_file *m, void *p) free_objects += l3->free_objects; shared_avail += l3->shared->avail; - spin_unlock(&l3->list_lock); + spin_unlock_irq(&l3->list_lock); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -3644,7 +3644,7 @@ static int s_show(struct seq_file *m, void *p) } #endif seq_putc(m, '\n'); - spin_unlock_irq(&cachep->spinlock); + spin_unlock(&cachep->spinlock); return 0; } -- cgit v1.2.2 From 4484ebf12bdb0ebcdc6e8951243cbab3d7f6f4c1 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sat, 4 Feb 2006 23:27:59 -0800 Subject: [PATCH] NUMA slab locking fixes: fix cpu down and up locking This fixes locking and bugs in cpu_down and cpu_up paths of the NUMA slab allocator. Sonny Rao reported problems sometime back on POWER5 boxes, when the last cpu on the nodes were being offlined. We could not reproduce the same on x86_64 because the cpumask (node_to_cpumask) was not being updated on cpu down. Since that issue is now fixed, we can reproduce Sonny's problems on x86_64 NUMA, and here is the fix. The problem earlier was on CPU_DOWN, if it was the last cpu on the node to go down, the array_caches (shared, alien) and the kmem_list3 of the node were being freed (kfree) with the kmem_list3 lock held. If the l3 or the array_caches were to come from the same cache being cleared, we hit on badness. This patch cleans up the locking in cpu_up and cpu_down path. We cannot really free l3 on cpu down because, there is no node offlining yet and even though a cpu is not yet up, node local memory can be allocated for it. So l3s are usually allocated at keme_cache_create and destroyed at kmem_cache_destroy. Hence, we don't need cachep->spinlock protection to get to the cachep->nodelist[nodeid] either. Patch survived onlining and offlining on a 4 core 2 node Tyan box with a 4 dbench process running all the time. 
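The heart of the CPU_DEAD path after this change is a detach-under-lock, free-after-unlock idiom. A simplified sketch of that pattern (condensed from the hunks below; error handling and the free_block() call on the shared array are omitted):

	spin_lock_irq(&l3->list_lock);
	shared = l3->shared;		/* detach while the list lock is held */
	l3->shared = NULL;
	alien = l3->alien;
	l3->alien = NULL;
	spin_unlock_irq(&l3->list_lock);

	kfree(shared);			/* free only after the lock is dropped */
	if (alien) {
		drain_alien_cache(cachep, alien);
		free_alien_cache(alien);
	}

That ordering guarantees kfree() never runs with the kmem_list3 lock held, which is exactly the badness described above when the freed object comes from the cache being torn down.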
Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 123 +++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 85 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d3f68543f9f4..9cc049a942c6 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -884,14 +884,14 @@ static void __drain_alien_cache(struct kmem_cache *cachep, } } -static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3) +static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) { int i = 0; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { - ac = l3->alien[i]; + ac = alien[i]; if (ac) { spin_lock_irqsave(&ac->lock, flags); __drain_alien_cache(cachep, ac, i); @@ -901,8 +901,11 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3) } #else #define alloc_alien_cache(node, limit) do { } while (0) -#define free_alien_cache(ac_ptr) do { } while (0) -#define drain_alien_cache(cachep, l3) do { } while (0) +#define drain_alien_cache(cachep, alien) do { } while (0) + +static inline void free_alien_cache(struct array_cache **ac_ptr) +{ +} #endif static int __devinit cpuup_callback(struct notifier_block *nfb, @@ -936,6 +939,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + /* + * The l3s don't come and go as CPUs come and + * go. cache_chain_mutex is sufficient + * protection here. + */ cachep->nodelists[node] = l3; } @@ -950,26 +958,47 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, & array cache's */ list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; + struct array_cache *shared; + struct array_cache **alien; nc = alloc_arraycache(node, cachep->limit, - cachep->batchcount); + cachep->batchcount); if (!nc) goto bad; + shared = alloc_arraycache(node, + cachep->shared * cachep->batchcount, + 0xbaadf00d); + if (!shared) + goto bad; +#ifdef CONFIG_NUMA + alien = alloc_alien_cache(node, cachep->limit); + if (!alien) + goto bad; +#endif cachep->array[cpu] = nc; l3 = cachep->nodelists[node]; BUG_ON(!l3); - if (!l3->shared) { - if (!(nc = alloc_arraycache(node, - cachep->shared * - cachep->batchcount, - 0xbaadf00d))) - goto bad; - /* we are serialised from CPU_DEAD or - CPU_UP_CANCELLED by the cpucontrol lock */ - l3->shared = nc; + spin_lock_irq(&l3->list_lock); + if (!l3->shared) { + /* + * We are serialised from CPU_DEAD or + * CPU_UP_CANCELLED by the cpucontrol lock + */ + l3->shared = shared; + shared = NULL; } +#ifdef CONFIG_NUMA + if (!l3->alien) { + l3->alien = alien; + alien = NULL; + } +#endif + spin_unlock_irq(&l3->list_lock); + + kfree(shared); + free_alien_cache(alien); } mutex_unlock(&cache_chain_mutex); break; @@ -978,23 +1007,32 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + /* + * Even if all the cpus of a node are down, we don't free the + * kmem_list3 of any cache. This to avoid a race between + * cpu_down, and a kmalloc allocation from another cpu for + * memory from the node of the cpu going down. The list3 + * structure is usually allocated from kmem_cache_create() and + * gets destroyed at kmem_cache_destroy(). 
+ */ /* fall thru */ case CPU_UP_CANCELED: mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; + struct array_cache *shared; + struct array_cache **alien; cpumask_t mask; mask = node_to_cpumask(node); - spin_lock(&cachep->spinlock); /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; l3 = cachep->nodelists[node]; if (!l3) - goto unlock_cache; + goto free_array_cache; spin_lock_irq(&l3->list_lock); @@ -1005,33 +1043,43 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, if (!cpus_empty(mask)) { spin_unlock_irq(&l3->list_lock); - goto unlock_cache; + goto free_array_cache; } - if (l3->shared) { + shared = l3->shared; + if (shared) { free_block(cachep, l3->shared->entry, l3->shared->avail, node); - kfree(l3->shared); l3->shared = NULL; } - if (l3->alien) { - drain_alien_cache(cachep, l3); - free_alien_cache(l3->alien); - l3->alien = NULL; - } - /* free slabs belonging to this node */ - if (__node_shrink(cachep, node)) { - cachep->nodelists[node] = NULL; - spin_unlock_irq(&l3->list_lock); - kfree(l3); - } else { - spin_unlock_irq(&l3->list_lock); + alien = l3->alien; + l3->alien = NULL; + + spin_unlock_irq(&l3->list_lock); + + kfree(shared); + if (alien) { + drain_alien_cache(cachep, alien); + free_alien_cache(alien); } - unlock_cache: - spin_unlock(&cachep->spinlock); +free_array_cache: kfree(nc); } + /* + * In the previous loop, all the objects were freed to + * the respective cache's slabs, now we can go ahead and + * shrink each nodelist to its limit. + */ + list_for_each_entry(cachep, &cache_chain, next) { + l3 = cachep->nodelists[node]; + if (!l3) + continue; + spin_lock_irq(&l3->list_lock); + /* free slabs belonging to this node */ + __node_shrink(cachep, node); + spin_unlock_irq(&l3->list_lock); + } mutex_unlock(&cache_chain_mutex); break; #endif @@ -2011,7 +2059,6 @@ static void drain_cpu_caches(struct kmem_cache *cachep) smp_call_function_all_cpus(do_drain, cachep); check_irq_on(); - spin_lock(&cachep->spinlock); for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { @@ -2019,10 +2066,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) drain_array_locked(cachep, l3->shared, 1, node); spin_unlock_irq(&l3->list_lock); if (l3->alien) - drain_alien_cache(cachep, l3); + drain_alien_cache(cachep, l3->alien); } } - spin_unlock(&cachep->spinlock); } static int __node_shrink(struct kmem_cache *cachep, int node) @@ -3440,7 +3486,7 @@ static void cache_reap(void *unused) l3 = searchp->nodelists[numa_node_id()]; if (l3->alien) - drain_alien_cache(searchp, l3); + drain_alien_cache(searchp, l3->alien); spin_lock_irq(&l3->list_lock); drain_array_locked(searchp, cpu_cache_get(searchp), 0, @@ -3598,7 +3644,8 @@ static int s_show(struct seq_file *m, void *p) num_slabs++; } free_objects += l3->free_objects; - shared_avail += l3->shared->avail; + if (l3->shared) + shared_avail += l3->shared->avail; spin_unlock_irq(&l3->list_lock); } -- cgit v1.2.2 From 7a21ef6fe902ac0ad53b45af6851ae5ec3a64299 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 5 Feb 2006 11:26:38 -0800 Subject: mm/slab.c (non-NUMA): Fix compile warning and clean up code The non-NUMA case would do an unmatched "free_alien_cache()" on an alien pointer that had never been allocated. It might not matter from a code generation standpoint (since in the non-NUMA case, the code doesn't actually _do_ anything), but it not only results in a compiler warning, it's really really ugly too. 
Fix the compiler warning by just having a matching dummy allocation. That also avoids an unnecessary #ifdef in the code. Signed-off-by: Linus Torvalds --- mm/slab.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 9cc049a942c6..d66c2b0d9715 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -900,12 +900,18 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al } } #else -#define alloc_alien_cache(node, limit) do { } while (0) + #define drain_alien_cache(cachep, alien) do { } while (0) +static inline struct array_cache **alloc_alien_cache(int node, int limit) +{ + return (struct array_cache **) 0x01020304ul; +} + static inline void free_alien_cache(struct array_cache **ac_ptr) { } + #endif static int __devinit cpuup_callback(struct notifier_block *nfb, @@ -970,11 +976,10 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, 0xbaadf00d); if (!shared) goto bad; -#ifdef CONFIG_NUMA + alien = alloc_alien_cache(node, cachep->limit); if (!alien) goto bad; -#endif cachep->array[cpu] = nc; l3 = cachep->nodelists[node]; -- cgit v1.2.2 From a2dfef6947139db9b886fce510c4d0c913beb5f0 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 7 Feb 2006 12:58:25 -0800 Subject: [PATCH] Hugepages need clear_user_highpage() not clear_highpage() When hugepages are newly allocated to a file in mm/hugetlb.c, we clear them with a call to clear_highpage() on each of the subpages. We should be using clear_user_highpage(): on powerpc, at least, clear_highpage() doesn't correctly mark the page as icache dirty so if the page is executed shortly after it's possible to get strange results. Signed-off-by: David Gibson Acked-by: William Lee Irwin III Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ceb3ebb3c399..3255ca420fc8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -107,7 +107,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) set_page_count(page, 1); page[1].mapping = (void *)free_huge_page; for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_highpage(&page[i]); + clear_user_highpage(&page[i], addr); return page; } -- cgit v1.2.2 From 0df420d8b6c718d9a5e37531c3a9a6804493e9f4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 7 Feb 2006 12:58:30 -0800 Subject: [PATCH] hugetlbpage: return VM_FAULT_OOM on oom Remove wrong and misleading comments. Return VM_FAULT_OOM if the hugetlbpage fault handler cannot allocate a page. do_no_page will end up doing do_exit(SIGKILL). Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3255ca420fc8..67f29516662a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -391,12 +391,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) { page_cache_release(old_page); - - /* Logically this is OOM, not a SIGBUS, but an OOM - * could cause the kernel to go killing other - * processes which won't help the hugepage situation - * at all (?) */ - return VM_FAULT_SIGBUS; + return VM_FAULT_OOM; } spin_unlock(&mm->page_table_lock); @@ -444,15 +439,7 @@ retry: page = alloc_huge_page(vma, address); if (!page) { hugetlb_put_quota(mapping); - /* - * No huge pages available. 
So this is an OOM - * condition but we do not want to trigger the OOM - * killer, so we return VM_FAULT_SIGBUS. - * - * A program using hugepages may fault with Bus Error - * because no huge pages are available in the cpuset, per - * memory policy or because all are in use! - */ + ret = VM_FAULT_OOM; goto out; } -- cgit v1.2.2 From 8519fb30e438f8088b71a94a7d5a660a814d3872 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 7 Feb 2006 12:58:52 -0800 Subject: [PATCH] mm: compound release fix Compound pages on SMP systems can now often be freed from pagetables via the release_pages path. This uses put_page_testzero which does not handle compound pages at all. Releasing constituent pages from process mappings decrements their count to a large negative number and leaks the reference at the head page - net result is a memory leak. The problem was hidden because the debug check in put_page_testzero itself actually did take compound pages into consideration. Fix the bug and the debug check. Signed-off-by: Nick Piggin Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index bc2442a7b0ee..76247424dea1 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -34,19 +34,22 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -void put_page(struct page *page) +static void put_compound_page(struct page *page) { - if (unlikely(PageCompound(page))) { - page = (struct page *)page_private(page); - if (put_page_testzero(page)) { - void (*dtor)(struct page *page); + page = (struct page *)page_private(page); + if (put_page_testzero(page)) { + void (*dtor)(struct page *page); - dtor = (void (*)(struct page *))page[1].mapping; - (*dtor)(page); - } - return; + dtor = (void (*)(struct page *))page[1].mapping; + (*dtor)(page); } - if (put_page_testzero(page)) +} + +void put_page(struct page *page) +{ + if (unlikely(PageCompound(page))) + put_compound_page(page); + else if (put_page_testzero(page)) __page_cache_release(page); } EXPORT_SYMBOL(put_page); @@ -244,6 +247,15 @@ void release_pages(struct page **pages, int nr, int cold) struct page *page = pages[i]; struct zone *pagezone; + if (unlikely(PageCompound(page))) { + if (zone) { + spin_unlock_irq(&zone->lru_lock); + zone = NULL; + } + put_compound_page(page); + continue; + } + if (!put_page_testzero(page)) continue; -- cgit v1.2.2 From 9934a7939e1cdce62ece9ef7d25ebb3c55547fac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2006 10:11:56 +0100 Subject: [PATCH] SLOB=y && SMP=y fix fix CONFIG_SLOB=y (when CONFIG_SMP=y): get rid of the 'align' parameter from its __alloc_percpu() implementation. Boot-tested on x86. 
Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- mm/slob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 1c240c4b71d9..a1f42bdc0245 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -336,7 +336,7 @@ EXPORT_SYMBOL(slab_reclaim_pages); #ifdef CONFIG_SMP -void *__alloc_percpu(size_t size, size_t align) +void *__alloc_percpu(size_t size) { int i; struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); -- cgit v1.2.2 From f0188f47482efdbd2e005103bb4f0224a835dfad Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Fri, 10 Feb 2006 01:51:13 -0800 Subject: [PATCH] slab: Avoid deadlock at kmem_cache_create/kmem_cache_destroy Prevents deadlock situation between kmem_cache_create()/kmem_cache_destory(), and kmem_cache_create() /cpu hotplug. The locking order probably got moved over time. Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d66c2b0d9715..add05d808a4a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1717,6 +1717,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, BUG(); } + /* + * Prevent CPUs from coming and going. + * lock_cpu_hotplug() nests outside cache_chain_mutex + */ + lock_cpu_hotplug(); + mutex_lock(&cache_chain_mutex); list_for_each(p, &cache_chain) { @@ -1918,8 +1924,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->dtor = dtor; cachep->name = name; - /* Don't let CPUs to come and go */ - lock_cpu_hotplug(); if (g_cpucache_up == FULL) { enable_cpucache(cachep); @@ -1978,12 +1982,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); - unlock_cpu_hotplug(); oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", name); mutex_unlock(&cache_chain_mutex); + unlock_cpu_hotplug(); return cachep; } EXPORT_SYMBOL(kmem_cache_create); -- cgit v1.2.2 From 418aade459f03318defd18ef0b11981a63bd81b0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 10 Feb 2006 01:51:15 -0800 Subject: [PATCH] Updates for page migration This adds some additional comments in order to help others figure out how exactly the code works. And fix a variable name. Also swap_page does need to ignore all reference bits when unmapping a page. Otherwise we may have to repeatedly unmap a frequently touched page. So change the try_to_unmap parameter to 1. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5a610804cd06..5db32fdfaf39 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -632,7 +632,7 @@ static int swap_page(struct page *page) struct address_space *mapping = page_mapping(page); if (page_mapped(page) && mapping) - if (try_to_unmap(page, 0) != SWAP_SUCCESS) + if (try_to_unmap(page, 1) != SWAP_SUCCESS) goto unlock_retry; if (PageDirty(page)) { @@ -839,7 +839,7 @@ EXPORT_SYMBOL(migrate_page); * pages are swapped out. * * The function returns after 10 attempts or if no pages - * are movable anymore because t has become empty + * are movable anymore because to has become empty * or no retryable pages exist anymore. 
* * Return: Number of pages not migrated when "to" ran empty. @@ -928,12 +928,21 @@ redo: goto unlock_both; if (mapping->a_ops->migratepage) { + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ rc = mapping->a_ops->migratepage(newpage, page); goto unlock_both; } /* - * Trigger writeout if page is dirty + * Default handling if a filesystem does not provide + * a migration function. We can only migrate clean + * pages so try to write out any dirty pages first. */ if (PageDirty(page)) { switch (pageout(page, mapping)) { @@ -949,9 +958,10 @@ redo: ; /* try to migrate the page below */ } } + /* - * If we have no buffer or can release the buffer - * then do a simple migration. + * Buffers are managed in a filesystem specific way. + * We must have no buffers or drop them. */ if (!page_has_buffers(page) || try_to_release_page(page, GFP_KERNEL)) { @@ -966,6 +976,11 @@ redo: * swap them out. */ if (pass > 4) { + /* + * Persistently unable to drop buffers..... As a + * measure of last resort we fall back to + * swap_page(). + */ unlock_page(newpage); newpage = NULL; rc = swap_page(page); -- cgit v1.2.2 From 80e4342601abfafacb5f20571e40b56d73d10819 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 11 Feb 2006 17:55:53 -0800 Subject: [PATCH] zone reclaim: do not check references to a page during zone reclaim shrink_list() and refill_inactive() check all ptes pointing to a page for reference bits in order to decide if the page should be put on the active list. This is not necessary for zone_reclaim since we are only interested in removing unmapped pages. Skip the checks in both functions. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5db32fdfaf39..e1c64230ffdd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) BUG_ON(PageActive(page)); sc->nr_scanned++; + + if (!sc->may_swap && page_mapped(page)) + goto keep_locked; + /* Double the slab pressure for mapped and swapcache pages */ if (page_mapped(page) || PageSwapCache(page)) sc->nr_scanned++; @@ -1231,7 +1235,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) * Now use this metric to decide whether to start moving mapped memory * onto the inactive list. */ - if (swap_tendency >= 100) + if (swap_tendency >= 100 && sc->may_swap) reclaim_mapped = 1; while (!list_empty(&l_hold)) { -- cgit v1.2.2 From 072eaa5d9cc3e63f567ffd9ad87b36194fdd8010 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 11 Feb 2006 17:55:54 -0800 Subject: [PATCH] vmscan: remove duplicate increment of reclaim_in_progress shrink_zone() already increments reclaim_in_progress. No need to do it in balance_pgdat. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index e1c64230ffdd..58ed5125b1a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1614,9 +1614,7 @@ scan: sc.nr_reclaimed = 0; sc.priority = priority; sc.swap_cluster_max = nr_pages? 
nr_pages : SWAP_CLUSTER_MAX; - atomic_inc(&zone->reclaim_in_progress); shrink_zone(zone, &sc); - atomic_dec(&zone->reclaim_in_progress); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); -- cgit v1.2.2 From 2903fb1694dcb08a3c1d9d823cfae7ba30e66cd3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 11 Feb 2006 17:55:55 -0800 Subject: [PATCH] vmscan: skip reclaim_mapped determination if we do not swap This puts the variables and the way to get to reclaim_mapped in one block. And allows zone_reclaim or other things to skip the determination (maybe this whole block of code does not belong into refill_inactive_zone()?) Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 75 +++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 58ed5125b1a7..1838c15ca4fd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1195,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) struct page *page; struct pagevec pvec; int reclaim_mapped = 0; - long mapped_ratio; - long distress; - long swap_tendency; + + if (unlikely(sc->may_swap)) { + long mapped_ratio; + long distress; + long swap_tendency; + + /* + * `distress' is a measure of how much trouble we're having + * reclaiming pages. 0 -> no problems. 100 -> great trouble. + */ + distress = 100 >> zone->prev_priority; + + /* + * The point of this algorithm is to decide when to start + * reclaiming mapped memory instead of just pagecache. Work out + * how much memory + * is mapped. + */ + mapped_ratio = (sc->nr_mapped * 100) / total_memory; + + /* + * Now decide how much we really want to unmap some pages. The + * mapped ratio is downgraded - just because there's a lot of + * mapped memory doesn't necessarily mean that page reclaim + * isn't succeeding. + * + * The distress ratio is important - we don't want to start + * going oom. + * + * A 100% value of vm_swappiness overrides this algorithm + * altogether. + */ + swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; + + /* + * Now use this metric to decide whether to start moving mapped + * memory onto the inactive list. + */ + if (swap_tendency >= 100) + reclaim_mapped = 1; + } lru_add_drain(); spin_lock_irq(&zone->lru_lock); @@ -1207,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) zone->nr_active -= pgmoved; spin_unlock_irq(&zone->lru_lock); - /* - * `distress' is a measure of how much trouble we're having reclaiming - * pages. 0 -> no problems. 100 -> great trouble. - */ - distress = 100 >> zone->prev_priority; - - /* - * The point of this algorithm is to decide when to start reclaiming - * mapped memory instead of just pagecache. Work out how much memory - * is mapped. - */ - mapped_ratio = (sc->nr_mapped * 100) / total_memory; - - /* - * Now decide how much we really want to unmap some pages. The mapped - * ratio is downgraded - just because there's a lot of mapped memory - * doesn't necessarily mean that page reclaim isn't succeeding. - * - * The distress ratio is important - we don't want to start going oom. - * - * A 100% value of vm_swappiness overrides this algorithm altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; - - /* - * Now use this metric to decide whether to start moving mapped memory - * onto the inactive list. 
- */ - if (swap_tendency >= 100 && sc->may_swap) - reclaim_mapped = 1; - while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); -- cgit v1.2.2 From 41d78ba55037468e6c86c53e3076d1a74841de39 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 14 Feb 2006 13:52:58 -0800 Subject: [PATCH] compound page: use page[1].lru If a compound page has its own put_page_testzero destructor (the only current example is free_huge_page), that is noted in page[1].mapping of the compound page. But that's rather a poor place to keep it: functions which call set_page_dirty_lock after get_user_pages (e.g. Infiniband's __ib_umem_release) ought to be checking first, otherwise set_page_dirty is liable to crash on what's not the address of a struct address_space. And now I'm about to make that worse: it turns out that every compound page needs a destructor, so we can no longer rely on hugetlb pages going their own special way, to avoid further problems of page->mapping reuse. For example, not many people know that: on 50% of i386 -Os builds, the first tail page of a compound page purports to be PageAnon (when its destructor has an odd address), which surprises page_add_file_rmap. Keep the compound page destructor in page[1].lru.next instead. And to free up the common pairing of mapping and index, also move compound page order from index to lru.prev. Slab reuses page->lru too: but if we ever need slab to use compound pages, it can easily stack its use above this. (akpm: decoded version of the above: the tail pages of a compound page now have ->mapping==NULL, so there's no need for the set_page_dirty[_lock]() caller to check that they're not compund pages before doing the dirty). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- mm/page_alloc.c | 15 ++++++--------- mm/swap.c | 2 +- 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 67f29516662a..508707704d2c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -85,7 +85,7 @@ void free_huge_page(struct page *page) BUG_ON(page_count(page)); INIT_LIST_HEAD(&page->lru); - page[1].mapping = NULL; + page[1].lru.next = NULL; /* reset dtor */ spin_lock(&hugetlb_lock); enqueue_huge_page(page); @@ -105,7 +105,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) } spin_unlock(&hugetlb_lock); set_page_count(page, 1); - page[1].mapping = (void *)free_huge_page; + page[1].lru.next = (void *)free_huge_page; /* set dtor */ for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) clear_user_highpage(&page[i], addr); return page; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dde04ff4be31..eec89ab39bb6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -169,20 +169,17 @@ static void bad_page(struct page *page) * All pages have PG_compound set. All pages have their ->private pointing at * the head page (even the head page has this). * - * The first tail page's ->mapping, if non-zero, holds the address of the - * compound page's put_page() function. - * - * The order of the allocation is stored in the first tail page's ->index - * This is only for debug at present. This usage means that zero-order pages - * may not be compound. + * The first tail page's ->lru.next holds the address of the compound page's + * put_page() function. Its ->lru.prev holds the order of allocation. + * This usage means that zero-order pages may not be compound. 
*/ static void prep_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; - page[1].mapping = NULL; - page[1].index = order; + page[1].lru.next = NULL; /* set dtor */ + page[1].lru.prev = (void *)order; for (i = 0; i < nr_pages; i++) { struct page *p = page + i; @@ -196,7 +193,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - if (unlikely(page[1].index != order)) + if (unlikely((unsigned long)page[1].lru.prev != order)) bad_page(page); for (i = 0; i < nr_pages; i++) { diff --git a/mm/swap.c b/mm/swap.c index 76247424dea1..cce3dda59c59 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -40,7 +40,7 @@ static void put_compound_page(struct page *page) if (put_page_testzero(page)) { void (*dtor)(struct page *page); - dtor = (void (*)(struct page *))page[1].mapping; + dtor = (void (*)(struct page *))page[1].lru.next; (*dtor)(page); } } -- cgit v1.2.2 From d98c7a09843621f1b145ca5ae8ed03ff04085edb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 14 Feb 2006 13:52:59 -0800 Subject: [PATCH] compound page: default destructor Somehow I imagined that calling a NULL destructor would free a compound page rather than oopsing. No, we must supply a default destructor, __free_pages_ok using the order noted by prep_compound_page. hugetlb can still replace this as before with its own free_huge_page pointer. The case that needs this is not common: rarely does put_compound_page's put_page_testzero bring the count down to 0. But if get_user_pages is applied to some part of a compound page, without immediate release (e.g. AIO or Infiniband), then it's possible for its put_page to come after the containing vma has been unmapped and the driver done its free_pages. That's just the kind of case compound pages are supposed to be guarding against (but Nick points out, nor did PageReserved handle this right). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eec89ab39bb6..62c122528587 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -56,6 +56,7 @@ long nr_swap_pages; int percpu_pagelist_fraction; static void fastcall free_hot_cold_page(struct page *page, int cold); +static void __free_pages_ok(struct page *page, unsigned int order); /* * results with 256, 32 in the lowmem_reserve sysctl: @@ -173,12 +174,18 @@ static void bad_page(struct page *page) * put_page() function. Its ->lru.prev holds the order of allocation. * This usage means that zero-order pages may not be compound. */ + +static void free_compound_page(struct page *page) +{ + __free_pages_ok(page, (unsigned long)page[1].lru.prev); +} + static void prep_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; - page[1].lru.next = NULL; /* set dtor */ + page[1].lru.next = (void *)free_compound_page; /* set dtor */ page[1].lru.prev = (void *)order; for (i = 0; i < nr_pages; i++) { struct page *p = page + i; -- cgit v1.2.2 From f822566165dd46ff5de9bf895cfa6c51f53bb0c4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 14 Feb 2006 13:53:08 -0800 Subject: [PATCH] madvise MADV_DONTFORK/MADV_DOFORK Currently, copy-on-write may change the physical address of a page even if the user requested that the page is pinned in memory (either by mlock or by get_user_pages). 
This happens if the process forks meanwhile, and the parent writes to that page. As a result, the page is orphaned: in case of get_user_pages, the application will never see any data hardware DMA's into this page after the COW. In case of mlock'd memory, the parent is not getting the realtime/security benefits of mlock. In particular, this affects the Infiniband modules which do DMA from and into user pages all the time. This patch adds madvise options to control whether memory range is inherited across fork. Useful e.g. for when hardware is doing DMA from/into these pages. Could also be useful to an application wanting to speed up its forks by cutting large areas out of consideration. Signed-off-by: Michael S. Tsirkin Acked-by: Hugh Dickins Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index ae0ae3ea299a..af3d573b0141 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma, struct mm_struct * mm = vma->vm_mm; int error = 0; pgoff_t pgoff; - int new_flags = vma->vm_flags & ~VM_READHINTMASK; + int new_flags = vma->vm_flags; switch (behavior) { + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; case MADV_SEQUENTIAL: - new_flags |= VM_SEQ_READ; + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; break; case MADV_RANDOM: - new_flags |= VM_RAND_READ; + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; break; - default: + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + new_flags &= ~VM_DONTCOPY; break; } @@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, long error; switch (behavior) { + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + break; + } + case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: -- cgit v1.2.2 From a62eaf151d9cb478d127cfbc2e93c498869785b0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 16 Feb 2006 23:41:58 +0100 Subject: [PATCH] x86_64: Add boot option to disable randomized mappings and cleanup AMD SimNow!'s JIT doesn't like them at all in the guest. For distribution installation it's easiest if it's a boot time option. Also I moved the variable to a more appropiate place and make it independent from sysctl And marked __read_mostly which it is. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/memory.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 2bee1f21aa8a..9abc6008544b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(vmalloc_earlyreserve); +int randomize_va_space __read_mostly = 1; + +static int __init disable_randmaps(char *s) +{ + randomize_va_space = 0; + return 0; +} +__setup("norandmaps", disable_randmaps); + + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. 
Usually (but -- cgit v1.2.2 From dd942ae331425812930cd01766178b7e28e65f2d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 17 Feb 2006 01:39:16 +0100 Subject: [PATCH] Handle all and empty zones when setting up custom zonelists for mbind The memory allocator doesn't like empty zones (which have an uninitialized freelist), so a x86-64 system with a node fully in GFP_DMA32 only would crash on mbind. Fix that up by putting all possible zones as fallback into the zonelist and skipping the empty ones. In fact the code always enough allocated space for all zones, but only used it for the highest. This change just uses all the memory that was allocated before. This should work fine for now, but whoever implements node hot removal needs to fix this somewhere else too (or make sure zone datastructures by itself never go away, only their memory) Signed-off-by: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3bd7fb7e4b75..323fdcf128c4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) } return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; } + /* Generate a custom zonelist for the BIND policy. */ static struct zonelist *bind_zonelist(nodemask_t *nodes) { struct zonelist *zl; - int num, max, nd; + int num, max, nd, k; max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); - zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); + zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); if (!zl) return NULL; num = 0; - for_each_node_mask(nd, *nodes) - zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; + /* First put in the highest zones from all nodes, then all the next + lower zones etc. Avoid empty zones because the memory allocator + doesn't like them. If you implement node hot removal you + have to fix that. */ + for (k = policy_zone; k >= 0; k--) { + for_each_node_mask(nd, *nodes) { + struct zone *z = &NODE_DATA(nd)->node_zones[k]; + if (z->present_pages > 0) + zl->zones[num++] = z; + } + } zl->zones[num] = NULL; return zl; } -- cgit v1.2.2 From 4cf808eb443ead42777a0230b73aec0cee7fb298 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 17 Feb 2006 20:38:21 +0100 Subject: [PATCH] Handle holes in node mask in node fallback list setup Change the find_next_best_node algorithm to correctly skip over holes in the node online mask. Previously it would not handle missing nodes correctly and cause crashes at boot. 
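Put differently, the candidate walk has to iterate the online node map directly instead of assuming node numbers are contiguous. Condensed from the hunk below, the corrected loop has this shape (the real code additionally penalizes lower-numbered nodes and prefers headless, unused nodes):

	for_each_online_node(n) {
		if (node_isset(n, *used_node_mask))
			continue;		/* already in the fallback list */
		val = node_distance(node, n);
		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}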
[Written by Linus, tested by AK] Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62c122528587..208812b25597 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1541,29 +1541,29 @@ static int __initdata node_load[MAX_NUMNODES]; */ static int __init find_next_best_node(int node, nodemask_t *used_node_mask) { - int i, n, val; + int n, val; int min_val = INT_MAX; int best_node = -1; - for_each_online_node(i) { - cpumask_t tmp; + /* Use the local node if we haven't already */ + if (!node_isset(node, *used_node_mask)) { + node_set(node, *used_node_mask); + return node; + } - /* Start from local node */ - n = (node+i) % num_online_nodes(); + for_each_online_node(n) { + cpumask_t tmp; /* Don't want a node to appear more than once */ if (node_isset(n, *used_node_mask)) continue; - /* Use the local node if we haven't already */ - if (!node_isset(node, *used_node_mask)) { - best_node = node; - break; - } - /* Use the distance array to find the distance */ val = node_distance(node, n); + /* Penalize nodes under us ("prefer the next node") */ + val += (n < node); + /* Give preference to headless and unused nodes */ tmp = node_to_cpumask(n); if (!cpus_empty(tmp)) -- cgit v1.2.2 From 636f13c174dd7c84a437d3c3e8fa66f03f7fda63 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Fri, 17 Feb 2006 13:59:36 -0800 Subject: [PATCH] sys_mbind sanity checking Make sure maxnodes is safe size before calculating nlongs in get_nodes(). Signed-off-by: Chris Wright Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 323fdcf128c4..bedfa4f09c80 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -808,6 +808,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; + if (maxnode > PAGE_SIZE) + return -EINVAL; nlongs = BITS_TO_LONGS(maxnode); if ((maxnode % BITS_PER_LONG) == 0) -- cgit v1.2.2 From 9827b781f20828e5ceb911b879f268f78fe90815 Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Mon, 20 Feb 2006 18:27:51 -0800 Subject: [PATCH] OOM kill: children accounting In the badness() calculation, there's currently this piece of code: /* * Processes which fork a lot of child processes are likely * a good choice. We add the vmsize of the children if they * have an own mm. This prevents forking servers to flood the * machine with an endless amount of children */ list_for_each(tsk, &p->children) { struct task_struct *chld; chld = list_entry(tsk, struct task_struct, sibling); if (chld->mm = p->mm && chld->mm) points += chld->mm->total_vm; } The intention is clear: If some server (apache) keeps spawning new children and we run OOM, we want to kill the father rather than picking a child. This -- to some degree -- also helps a bit with getting fork bombs under control, though I'd consider this a desirable side-effect rather than a feature. There's one problem with this: No matter how many or few children there are, if just one of them misbehaves, and all others (including the father) do everything right, we still always kill the whole family. This hits in real life; whether it's javascript in konqueror resulting in kdeinit (and thus the whole KDE session) being hit or just a classical server that spawns children. 
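To put rough, made-up numbers on it: take a parent with 10 MB of address space of its own, ten well-behaved 20 MB children and one runaway 400 MB child. Counting children in full, the parent scores roughly 10 + 200 + 400 = 610 MB worth of badness and is always chosen, family and all. Counting each child at half, as the patch below does, it scores roughly 10 + 100 + 200 = 310 MB, so the 400 MB child outscores it and gets picked instead (ignoring the runtime and niceness factors badness() also applies).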
Sidenote: The killer does kill all direct children as well, not only the selected father, see oom_kill_process(). The idea in attached patch is that we do want to account the memory consumption of the (direct) children to the father -- however not fully. This maintains the property that fathers with too many children will still very likely be picked, whereas a single misbehaving child has the chance to be picked by the OOM killer. In the patch I account only half (rounded up) of the children's vm_size to the parent. This means that if one child eats more mem than the rest of the family, it will be picked, otherwise it's still the father and thus the whole family that gets selected. This is heuristics -- we could debate whether accounting for a fourth would be better than for half of it. Or -- if people would consider it worth the trouble -- make it a sysctl. For now I sticked to accounting for half, which should IMHO be a significant improvement. The patch does one more thing: As users tend to be irritated by the choice of killed processes (mainly because the children are killed first, despite some of them having a very low OOM score), I added some more output: The selected (father) process will be reported first and it's oom_score printed to syslog. Description: Only account for half of children's vm size in oom score calculation This should still give the parent enough point in case of fork bombs. If any child however has more than 50% of the vm size of all children together, it'll get a higher score and be elected. This patch also makes the kernel display the oom_score. Signed-off-by: Kurt Garloff Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b05ab8f2a562..949eba1d5ba3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,15 +58,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) /* * Processes which fork a lot of child processes are likely - * a good choice. We add the vmsize of the children if they + * a good choice. We add half the vmsize of the children if they * have an own mm. This prevents forking servers to flood the - * machine with an endless amount of children + * machine with an endless amount of children. In case a single + * child is eating the vast majority of memory, adding only half + * to the parents will make the child our kill candidate of choice. 
*/ list_for_each(tsk, &p->children) { struct task_struct *chld; chld = list_entry(tsk, struct task_struct, sibling); if (chld->mm != p->mm && chld->mm) - points += chld->mm->total_vm; + points += chld->mm->total_vm/2 + 1; } /* @@ -136,12 +138,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct * select_bad_process(void) +static struct task_struct *select_bad_process(unsigned long *ppoints) { - unsigned long maxpoints = 0; struct task_struct *g, *p; struct task_struct *chosen = NULL; struct timespec uptime; + *ppoints = 0; do_posix_clock_monotonic_gettime(&uptime); do_each_thread(g, p) { @@ -169,9 +171,9 @@ static struct task_struct * select_bad_process(void) return p; points = badness(p, uptime.tv_sec); - if (points > maxpoints || !chosen) { + if (points > *ppoints || !chosen) { chosen = p; - maxpoints = points; + *ppoints = points; } } while_each_thread(g, p); return chosen; @@ -237,12 +239,15 @@ static struct mm_struct *oom_kill_task(task_t *p) return mm; } -static struct mm_struct *oom_kill_process(struct task_struct *p) +static struct mm_struct *oom_kill_process(struct task_struct *p, + unsigned long points) { struct mm_struct *mm; struct task_struct *c; struct list_head *tsk; + printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " + "children.\n", p->pid, p->comm, points); /* Try to kill a child first */ list_for_each(tsk, &p->children) { c = list_entry(tsk, struct task_struct, sibling); @@ -267,6 +272,7 @@ void out_of_memory(gfp_t gfp_mask, int order) { struct mm_struct *mm = NULL; task_t * p; + unsigned long points; if (printk_ratelimit()) { printk("oom-killer: gfp_mask=0x%x, order=%d\n", @@ -278,7 +284,7 @@ void out_of_memory(gfp_t gfp_mask, int order) cpuset_lock(); read_lock(&tasklist_lock); retry: - p = select_bad_process(); + p = select_bad_process(&points); if (PTR_ERR(p) == -1UL) goto out; @@ -290,7 +296,7 @@ retry: panic("Out of memory and no killable processes...\n"); } - mm = oom_kill_process(p); + mm = oom_kill_process(p, points); if (!mm) goto retry; -- cgit v1.2.2 From 9b0f8b040acd8dfd23860754c0d09ff4f44e2cbc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 20 Feb 2006 18:27:52 -0800 Subject: [PATCH] Terminate process that fails on a constrained allocation Some allocations are restricted to a limited set of nodes (due to memory policies or cpuset constraints). If the page allocator is not able to find enough memory then that does not mean that overall system memory is low. In particular going postal and more or less randomly shooting at processes is not likely going to help the situation but may just lead to suicide (the whole system coming down). It is better to signal to the process that no memory exists given the constraints that the process (or the configuration of the process) has placed on the allocation behavior. The process may be killed but then the sysadmin or developer can investigate the situation. The solution is similar to what we do when running out of hugepages. This patch adds a check before we kill processes. At that point performance considerations do not matter much so we just scan the zonelist and reconstruct a list of nodes. If the list of nodes does not contain all online nodes then this is a constrained allocation and we should kill the current process. 
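The shape of that check can be sketched in user space with plain bitmasks standing in for nodemask_t (the names, the classification strings and the example masks are invented; the real implementation is in the diff that follows):

  #include <stdio.h>

  /* Start from the online nodes, drop every node the zonelist lets us
   * allocate from, and classify what kind of limit (if any) is left. */
  static const char *classify(unsigned long online,
                              unsigned long zonelist_nodes,
                              unsigned long cpuset_allowed)
  {
          /* a zone the cpuset forbids: the cpuset is the constraint */
          if (zonelist_nodes & ~cpuset_allowed)
                  return "cpuset";
          /* online nodes missing from the zonelist: memory policy */
          if (online & ~zonelist_nodes)
                  return "memory policy";
          return "none";
  }

  int main(void)
  {
          /* four online nodes; MPOL_BIND restricted the zonelist to 0-1 */
          printf("%s\n", classify(0xf, 0x3, 0xf));   /* memory policy */
          /* full zonelist, but the cpuset only allows node 0 */
          printf("%s\n", classify(0xf, 0xf, 0x1));   /* cpuset */
          /* nothing in the way */
          printf("%s\n", classify(0xf, 0xf, 0xf));   /* none */
          return 0;
  }

In both constrained cases the patch then kills current instead of hunting for a system-wide victim.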
Signed-off-by: Christoph Lameter Cc: Nick Piggin Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 103 ++++++++++++++++++++++++++++++++++++++++++-------------- mm/page_alloc.c | 2 +- 2 files changed, 79 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 949eba1d5ba3..8123fad5a485 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -132,6 +132,36 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) return points; } +/* + * Types of limitations to the nodes from which allocations may occur + */ +#define CONSTRAINT_NONE 1 +#define CONSTRAINT_MEMORY_POLICY 2 +#define CONSTRAINT_CPUSET 3 + +/* + * Determine the type of allocation constraint. + */ +static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) +{ +#ifdef CONFIG_NUMA + struct zone **z; + nodemask_t nodes = node_online_map; + + for (z = zonelist->zones; *z; z++) + if (cpuset_zone_allowed(*z, gfp_mask)) + node_clear((*z)->zone_pgdat->node_id, + nodes); + else + return CONSTRAINT_CPUSET; + + if (!nodes_empty(nodes)) + return CONSTRAINT_MEMORY_POLICY; +#endif + + return CONSTRAINT_NONE; +} + /* * Simple selection loop. We chose the process with the highest * number of 'points'. We expect the caller will lock the tasklist. @@ -184,7 +214,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that * we select a process with CAP_SYS_RAW_IO set). */ -static void __oom_kill_task(task_t *p) +static void __oom_kill_task(task_t *p, const char *message) { if (p->pid == 1) { WARN_ON(1); @@ -200,8 +230,8 @@ static void __oom_kill_task(task_t *p) return; } task_unlock(p); - printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", - p->pid, p->comm); + printk(KERN_ERR "%s: Killed process %d (%s).\n", + message, p->pid, p->comm); /* * We give our sacrificial lamb high priority and access to @@ -214,7 +244,7 @@ static void __oom_kill_task(task_t *p) force_sig(SIGKILL, p); } -static struct mm_struct *oom_kill_task(task_t *p) +static struct mm_struct *oom_kill_task(task_t *p, const char *message) { struct mm_struct *mm = get_task_mm(p); task_t * g, * q; @@ -226,21 +256,21 @@ static struct mm_struct *oom_kill_task(task_t *p) return NULL; } - __oom_kill_task(p); + __oom_kill_task(p, message); /* * kill all processes that share the ->mm (i.e. all threads), * but are in a different thread group */ do_each_thread(g, q) if (q->mm == mm && q->tgid != p->tgid) - __oom_kill_task(q); + __oom_kill_task(q, message); while_each_thread(g, q); return mm; } static struct mm_struct *oom_kill_process(struct task_struct *p, - unsigned long points) + unsigned long points, const char *message) { struct mm_struct *mm; struct task_struct *c; @@ -253,11 +283,11 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, c = list_entry(tsk, struct task_struct, sibling); if (c->mm == p->mm) continue; - mm = oom_kill_task(c); + mm = oom_kill_task(c, message); if (mm) return mm; } - return oom_kill_task(p); + return oom_kill_task(p, message); } /** @@ -268,10 +298,10 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. 
*/ -void out_of_memory(gfp_t gfp_mask, int order) +void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { struct mm_struct *mm = NULL; - task_t * p; + task_t *p; unsigned long points; if (printk_ratelimit()) { @@ -283,25 +313,48 @@ void out_of_memory(gfp_t gfp_mask, int order) cpuset_lock(); read_lock(&tasklist_lock); + + /* + * Check if there were limitations on the allocation (only relevant for + * NUMA) that may require different handling. + */ + switch (constrained_alloc(zonelist, gfp_mask)) { + case CONSTRAINT_MEMORY_POLICY: + mm = oom_kill_process(current, points, + "No available memory (MPOL_BIND)"); + break; + + case CONSTRAINT_CPUSET: + mm = oom_kill_process(current, points, + "No available memory in cpuset"); + break; + + case CONSTRAINT_NONE: retry: - p = select_bad_process(&points); + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ + p = select_bad_process(&points); - if (PTR_ERR(p) == -1UL) - goto out; + if (PTR_ERR(p) == -1UL) + goto out; - /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - read_unlock(&tasklist_lock); - cpuset_unlock(); - panic("Out of memory and no killable processes...\n"); - } + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + cpuset_unlock(); + panic("Out of memory and no killable processes...\n"); + } - mm = oom_kill_process(p, points); - if (!mm) - goto retry; + mm = oom_kill_process(p, points, "Out of memory"); + if (!mm) + goto retry; + + break; + } - out: - read_unlock(&tasklist_lock); +out: cpuset_unlock(); if (mm) mmput(mm); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 208812b25597..791690d7d3fa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1015,7 +1015,7 @@ rebalance: if (page) goto got_pg; - out_of_memory(gfp_mask, order); + out_of_memory(zonelist, gfp_mask, order); goto restart; } -- cgit v1.2.2 From a9c930bac163c5e616ca0ba9378e7dc746c93227 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 20 Feb 2006 18:27:59 -0800 Subject: [PATCH] Fix units in mbind check maxnode is a bit index and can't be directly compared against a byte length like PAGE_SIZE Signed-off-by: Andi Kleen Cc: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bedfa4f09c80..6422fe478113 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -808,7 +808,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; - if (maxnode > PAGE_SIZE) + if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; nlongs = BITS_TO_LONGS(maxnode); -- cgit v1.2.2 From 7a9166e3b037296366cea6f3c97f705d33e209e6 Mon Sep 17 00:00:00 2001 From: Luke Yang Date: Mon, 20 Feb 2006 18:28:07 -0800 Subject: [PATCH] Fix undefined symbols for nommu architecture Signed-off-by: Luke Yang Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index c10262d68232..99d21020ec9d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -57,6 +57,8 @@ EXPORT_SYMBOL(vmalloc); EXPORT_SYMBOL(vfree); EXPORT_SYMBOL(vmalloc_to_page); EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmap); +EXPORT_SYMBOL(vunmap); /* * Handle all mappings that got truncated by a "truncate()" -- cgit v1.2.2 From 
fcab6f351305029fc5e3c632209d45cae57e4835 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 20 Feb 2006 18:28:10 -0800 Subject: [PATCH] mm/mempolicy.c: fix 'if ();' typo [akpm; it happens that the code was still correct, only inefficient ] Signed-off-by: Alexey Dobriyan Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6422fe478113..880831bd3003 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -587,7 +587,7 @@ redo: } list_add(&page->lru, &newlist); nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE); + if (nr_pages > MIGRATE_CHUNK_SIZE) break; } err = migrate_pages(pagelist, &newlist, &moved, &failed); -- cgit v1.2.2 From b00dc3ad74fdb676552d46ee573b88e927240d0c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 21 Feb 2006 23:49:47 +0000 Subject: [PATCH] tmpfs: fix mount mpol nodelist parsing I've been dissatisfied with the mpol_nodelist mount option which was added to tmpfs earlier in -rc. Replace it by mpol=policy:nodelist. And it was broken: a nodelist is a comma-separated list of numbers and ranges; the mount options are a comma-separated list of token=values. Whoops, blindly strsep'ing on commas doesn't work so well: since we've no numeric tokens, and unlikely to add them, use that to distinguish. Move the mpol= parsing to shmem_parse_mpol under CONFIG_NUMA, reject all its options as invalid if not NUMA. /proc shows MPOL_PREFERRED as "prefer", so use that name for the policy instead of "preferred". Enforce that mpol=default has no nodelist; that mpol=prefer has one node only; that mpol=bind has a nodelist; but let mpol=interleave use node_online_map if no nodelist given. Describe this in tmpfs.txt. 
Signed-off-by: Hugh Dickins Acked-by: Robin Holt Acked-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/shmem.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index f7ac7b812f92..7c455fbaff7b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -874,6 +875,51 @@ redirty: } #ifdef CONFIG_NUMA +static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) +{ + char *nodelist = strchr(value, ':'); + int err = 1; + + if (nodelist) { + /* NUL-terminate policy string */ + *nodelist++ = '\0'; + if (nodelist_parse(nodelist, *policy_nodes)) + goto out; + } + if (!strcmp(value, "default")) { + *policy = MPOL_DEFAULT; + /* Don't allow a nodelist */ + if (!nodelist) + err = 0; + } else if (!strcmp(value, "prefer")) { + *policy = MPOL_PREFERRED; + /* Insist on a nodelist of one node only */ + if (nodelist) { + char *rest = nodelist; + while (isdigit(*rest)) + rest++; + if (!*rest) + err = 0; + } + } else if (!strcmp(value, "bind")) { + *policy = MPOL_BIND; + /* Insist on a nodelist */ + if (nodelist) + err = 0; + } else if (!strcmp(value, "interleave")) { + *policy = MPOL_INTERLEAVE; + /* Default to nodes online if no nodelist */ + if (!nodelist) + *policy_nodes = node_online_map; + err = 0; + } +out: + /* Restore string for error message */ + if (nodelist) + *--nodelist = ':'; + return err; +} + static struct page *shmem_swapin_async(struct shared_policy *p, swp_entry_t entry, unsigned long idx) { @@ -926,6 +972,11 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, return page; } #else +static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) +{ + return 1; +} + static inline struct page * shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) { @@ -1859,7 +1910,23 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, { char *this_char, *value, *rest; - while ((this_char = strsep(&options, ",")) != NULL) { + while (options != NULL) { + this_char = options; + for (;;) { + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + options = strchr(options, ','); + if (options == NULL) + break; + options++; + if (!isdigit(*options)) { + options[-1] = '\0'; + break; + } + } if (!*this_char) continue; if ((value = strchr(this_char,'=')) != NULL) { @@ -1910,18 +1977,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, if (*rest) goto bad_val; } else if (!strcmp(this_char,"mpol")) { - if (!strcmp(value,"default")) - *policy = MPOL_DEFAULT; - else if (!strcmp(value,"preferred")) - *policy = MPOL_PREFERRED; - else if (!strcmp(value,"bind")) - *policy = MPOL_BIND; - else if (!strcmp(value,"interleave")) - *policy = MPOL_INTERLEAVE; - else + if (shmem_parse_mpol(value,policy,policy_nodes)) goto bad_val; - } else if (!strcmp(this_char,"mpol_nodelist")) { - nodelist_parse(value, *policy_nodes); } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", this_char); -- cgit v1.2.2
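The comma rule described above (a comma ends a mount option only when the character after it is not a digit, so a nodelist such as 0-3,5 survives inside mpol=...) can be sketched in user space as follows; the sample option string and the helper are invented for the example and this is not the shmem code itself:

  #include <ctype.h>
  #include <stdio.h>
  #include <string.h>

  /* Split a tmpfs-style option string on commas, except where the comma
   * is followed by a digit (then it belongs to a numeric nodelist). */
  static void split_options(char *options)
  {
          while (options && *options) {
                  char *this_opt = options;
                  char *p = options;

                  while ((p = strchr(p, ',')) != NULL) {
                          if (!isdigit((unsigned char)p[1]))
                                  break;  /* real separator */
                          p++;            /* comma inside a nodelist */
                  }
                  if (p) {
                          *p = '\0';
                          options = p + 1;
                  } else {
                          options = NULL;
                  }
                  printf("option: %s\n", this_opt);
          }
  }

  int main(void)
  {
          char opts[] = "size=1g,mpol=bind:0-3,5,mode=700";

          /* prints: size=1g / mpol=bind:0-3,5 / mode=700 */
          split_options(opts);
          return 0;
  }

With the options split this way, an mpol value such as bind:0-3,5 reaches shmem_parse_mpol intact and can then be checked against the per-policy nodelist rules the patch enforces.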