From 69cf0fac6052c5bd3fb3469a41d4216e926028f8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 17 Apr 2006 22:46:32 +0100 Subject: [PATCH] Fix MADV_REMOVE protection checking madvise_remove needs to respect file and mmap protections. Signed-off-by: Hugh Dickins [ Will the real CVE-2006-1524 stand up, please.. ] Signed-off-by: Linus Torvalds --- mm/madvise.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index af3d573b0141..4e196155a0c3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -168,6 +168,9 @@ static long madvise_remove(struct vm_area_struct *vma, return -EINVAL; } + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + mapping = vma->vm_file->f_mapping; offset = (loff_t)(start - vma->vm_start) -- cgit v1.2.2 From 75129e297e861e6c61038aa4cdbf604b022de4ff Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Tue, 18 Apr 2006 22:20:33 -0700 Subject: [PATCH] mm/slob.c: for_each_possible_cpu(), not NR_CPUS Convert for-loops that explicitly reference "NR_CPUS" into the potentially more efficient for_each_possible_cpu() construct. Signed-off-by: John Hawkes Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slob.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 9bcc7e2cabfd..a68255ba4553 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -354,9 +354,7 @@ void *__alloc_percpu(size_t size) if (!pdata) return NULL; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_possible_cpu(i) { pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); if (!pdata->ptrs[i]) goto unwind_oom; @@ -383,11 +381,9 @@ free_percpu(const void *objp) int i; struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_possible_cpu(i) kfree(p->ptrs[i]); - } + kfree(p); } EXPORT_SYMBOL(free_percpu); -- cgit v1.2.2 From 97c2c9b84d0c1edf4926b13661d5af3f0edccbce Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Apr 2006 22:20:38 -0700 Subject: [PATCH] oom-kill: mm locking fix Dave Peterson points out that badness() is playing with mm_structs without taking a reference on them. mmput() can sleep, so taking a reference here (inside tasklist_lock) is hard. Fix it up via task_lock() instead. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 78747afad6b0..9a643c4bf77c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -46,15 +46,25 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) { unsigned long points, cpu_time, run_time, s; - struct list_head *tsk; + struct mm_struct *mm; + struct task_struct *child; - if (!p->mm) + task_lock(p); + mm = p->mm; + if (!mm) { + task_unlock(p); return 0; + } /* * The memory size of the process is the basis for the badness. */ - points = p->mm->total_vm; + points = mm->total_vm; + + /* + * After this unlock we can no longer dereference local variable `mm' + */ + task_unlock(p); /* * Processes which fork a lot of child processes are likely @@ -64,11 +74,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * child is eating the vast majority of memory, adding only half * to the parents will make the child our kill candidate of choice. 
*/ - list_for_each(tsk, &p->children) { - struct task_struct *chld; - chld = list_entry(tsk, struct task_struct, sibling); - if (chld->mm != p->mm && chld->mm) - points += chld->mm->total_vm/2 + 1; + list_for_each_entry(child, &p->children, sibling) { + task_lock(child); + if (child->mm != mm && child->mm) + points += child->mm->total_vm/2 + 1; + task_unlock(child); } /* -- cgit v1.2.2 From 013159227b840dfd441bd2e4c8b4d77ffb3cc42e Mon Sep 17 00:00:00 2001 From: Dave Peterson Date: Tue, 18 Apr 2006 22:20:44 -0700 Subject: [PATCH] mm: fix mm_struct reference counting bugs in mm/oom_kill.c Fix oom_kill_task() so it doesn't call mmput() (which may sleep) while holding tasklist_lock. Signed-off-by: David S. Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9a643c4bf77c..042e6436c3ee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -254,17 +254,24 @@ static void __oom_kill_task(task_t *p, const char *message) force_sig(SIGKILL, p); } -static struct mm_struct *oom_kill_task(task_t *p, const char *message) +static int oom_kill_task(task_t *p, const char *message) { - struct mm_struct *mm = get_task_mm(p); + struct mm_struct *mm; task_t * g, * q; - if (!mm) - return NULL; - if (mm == &init_mm) { - mmput(mm); - return NULL; - } + mm = p->mm; + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. 
+ */ + + if (mm == NULL || mm == &init_mm) + return 1; __oom_kill_task(p, message); /* @@ -276,13 +283,12 @@ static struct mm_struct *oom_kill_task(task_t *p, const char *message) __oom_kill_task(q, message); while_each_thread(g, q); - return mm; + return 0; } -static struct mm_struct *oom_kill_process(struct task_struct *p, - unsigned long points, const char *message) +static int oom_kill_process(struct task_struct *p, unsigned long points, + const char *message) { - struct mm_struct *mm; struct task_struct *c; struct list_head *tsk; @@ -293,9 +299,8 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, c = list_entry(tsk, struct task_struct, sibling); if (c->mm == p->mm) continue; - mm = oom_kill_task(c, message); - if (mm) - return mm; + if (!oom_kill_task(c, message)) + return 0; } return oom_kill_task(p, message); } @@ -310,7 +315,6 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, */ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { - struct mm_struct *mm = NULL; task_t *p; unsigned long points = 0; @@ -330,12 +334,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) */ switch (constrained_alloc(zonelist, gfp_mask)) { case CONSTRAINT_MEMORY_POLICY: - mm = oom_kill_process(current, points, + oom_kill_process(current, points, "No available memory (MPOL_BIND)"); break; case CONSTRAINT_CPUSET: - mm = oom_kill_process(current, points, + oom_kill_process(current, points, "No available memory in cpuset"); break; @@ -357,8 +361,7 @@ retry: panic("Out of memory and no killable processes...\n"); } - mm = oom_kill_process(p, points, "Out of memory"); - if (!mm) + if (oom_kill_process(p, points, "Out of memory")) goto retry; break; @@ -367,8 +370,6 @@ retry: out: read_unlock(&tasklist_lock); cpuset_unlock(); - if (mm) - mmput(mm); /* * Give "p" a good chance of killing itself before we -- cgit v1.2.2 From 6aa3001b239b387d98a7f945e4a51edeb59e4f2d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Apr 2006 22:20:52 -0700 Subject: [PATCH] page_alloc.c: buddy handling cleanup Fix up some whitespace damage. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97d6827c7d66..123c60586740 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -232,11 +232,13 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * zone->lock is already acquired when we use these. * So, we don't need atomic page->flags operations here. */ -static inline unsigned long page_order(struct page *page) { +static inline unsigned long page_order(struct page *page) +{ return page_private(page); } -static inline void set_page_order(struct page *page, int order) { +static inline void set_page_order(struct page *page, int order) +{ set_page_private(page, order); __SetPageBuddy(page); } @@ -299,9 +301,9 @@ static inline int page_is_buddy(struct page *page, int order) if (PageBuddy(page) && page_order(page) == order) { BUG_ON(page_count(page) != 0); - return 1; + return 1; } - return 0; + return 0; } /* -- cgit v1.2.2 From 6d472be37896b1c41b50f3da124f8b7718ba7797 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 20 Apr 2006 02:43:12 -0700 Subject: [PATCH] Remove cond_resched in gather_stats() gather_stats() is called with a spinlock held from check_pte_range. We cannot reschedule with a lock held. 
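For illustration only (not part of the patch; names and locking details are schematic), the bug class being removed looks roughly like this:

	spin_lock(ptl);				/* page-table lock held: may not sleep */
	for (; addr != end; pte++, addr += PAGE_SIZE)
		gather_stats(page, md, pte_dirty(*pte));
	spin_unlock(ptl);

	/* A cond_resched() inside gather_stats() could schedule away while
	 * ptl is held ("scheduling while atomic").  Removing it is the fix;
	 * any rescheduling must happen in the caller, outside the lock. */
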
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dec8249e972d..8778f58880c4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1761,7 +1761,6 @@ static void gather_stats(struct page *page, void *private, int pte_dirty) md->mapcount_max = count; md->node[page_to_nid(page)]++; - cond_resched(); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.2 From 304dbdb7a4fbb7f40a6ad5c5836fdd456c233c63 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Sat, 22 Apr 2006 02:35:48 -0700 Subject: [PATCH] add migratepage address space op to shmem Basic problem: pages of a shared memory segment can only be migrated once. In 2.6.16 through 2.6.17-rc1, shared memory mappings do not have a migratepage address space op. Therefore, migrate_pages() falls back to default processing. In this path, it will try to pageout() dirty pages. Once a shared memory page has been migrated it becomes dirty, so migrate_pages() will try to page it out. However, because the page count is 3 [cache + current + pte], pageout() will return PAGE_KEEP because is_page_cache_freeable() returns false. This will abort all subsequent migrations. This patch adds a migratepage address space op to shared memory segments to avoid taking the default path. We use the "migrate_page()" function because it knows how to migrate dirty pages. This allows shared memory segment pages to migrate, subject to other conditions such as # pte's referencing the page [page_mapcount(page)], when requested. I think this is safe. If we're migrating a shared memory page, then we found the page via a page table, so it must be in memory. Can be verified with memtoy and the shmem-mbind-test script, both available at: http://free.linux.hp.com/~lts/Tools/ Signed-off-by: Lee Schermerhorn Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 37eaf42ed2c6..4c5e68e4e9ae 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -46,6 +46,8 @@ #include #include #include +#include + #include #include #include @@ -2173,6 +2175,7 @@ static struct address_space_operations shmem_aops = { .prepare_write = shmem_prepare_write, .commit_write = simple_commit_write, #endif + .migratepage = migrate_page, }; static struct file_operations shmem_file_operations = { -- cgit v1.2.2 From 83d722f7e198b034699b1500d98729beff930efd Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Mon, 24 Apr 2006 19:35:21 -0700 Subject: [PATCH] Remove __devinit and __cpuinit from notifier_call definitions Few of the notifier_chain_register() callers use __init in the definition of notifier_call. It is incorrect as the function definition should be available after the initializations (they do not unregister them during initializations). This patch fixes all such usages to _not_ have the notifier_call __init section. 
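For illustration (hypothetical names, not from the patch), the failure mode is a registered notifier whose callback body lives in memory that is discarded once boot completes; plain __init is used here for clarity:

	/* BROKEN: the callback text is freed after boot, but the notifier
	 * block registered below keeps pointing into it. */
	static int __init my_cpuup_callback(struct notifier_block *nfb,
					    unsigned long action, void *hcpu)
	{
		return NOTIFY_OK;
	}

	static struct notifier_block my_cpuup_nb = {
		.notifier_call = my_cpuup_callback,
	};

	/* register_cpu_notifier(&my_cpuup_nb) is never undone, so an event
	 * delivered after init jumps into freed memory.  Dropping the
	 * __init-family annotation keeps the callback text resident. */
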
Signed-off-by: Chandra Seetharaman Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/slab.c | 2 +- mm/vmscan.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 123c60586740..ea77c999047e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1962,7 +1962,7 @@ static inline void free_zone_pagesets(int cpu) } } -static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, +static int pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { diff --git a/mm/slab.c b/mm/slab.c index e6ef9bd52335..af5c5237e11a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1036,7 +1036,7 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) #endif -static int __devinit cpuup_callback(struct notifier_block *nfb, +static int cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; diff --git a/mm/vmscan.c b/mm/vmscan.c index acdf001d6941..4649a63a8cb6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1328,7 +1328,7 @@ repeat: not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int __devinit cpu_callback(struct notifier_block *nfb, +static int cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { pg_data_t *pgdat; -- cgit v1.2.2 From ebf43500ef148a380bd132743c3fc530111ac620 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Apr 2006 08:46:01 +0200 Subject: [PATCH] Add find_get_pages_contig(): contiguous variant of find_get_pages() find_get_pages_contig() will break out if we hit a hole in the page cache. From Andrew Morton, small modifications and documentation by me. Signed-off-by: Jens Axboe --- mm/filemap.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 3ef20739e725..fd57442186cb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -697,6 +697,38 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return ret; } +/** + * find_get_pages_contig - gang contiguous pagecache lookup + * @mapping: The address_space to search + * @index: The starting page index + * @nr_pages: The maximum number of pages + * @pages: Where the resulting pages are placed + * + * find_get_pages_contig() works exactly like find_get_pages(), except + * that the returned number of pages are guaranteed to be contiguous. + * + * find_get_pages_contig() returns the number of pages which were found. + */ +unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, + unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + + read_lock_irq(&mapping->tree_lock); + ret = radix_tree_gang_lookup(&mapping->page_tree, + (void **)pages, index, nr_pages); + for (i = 0; i < ret; i++) { + if (pages[i]->mapping == NULL || pages[i]->index != index) + break; + + page_cache_get(pages[i]); + index++; + } + read_unlock_irq(&mapping->tree_lock); + return i; +} + /* * Like find_get_pages, except we only return pages which are tagged with * `tag'. We update *index to index the next page for the traversal. 
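A brief usage sketch of the new helper (illustrative caller, not from the patch): the return value is the number of contiguous pages actually found, each handed back with an extra reference the caller must drop:

	struct page *pages[16];
	unsigned int i, nr;

	nr = find_get_pages_contig(mapping, index, 16, pages);
	/* pages[0..nr-1] cover index, index+1, ... with no holes */
	for (i = 0; i < nr; i++) {
		/* ... operate on pages[i] ... */
		page_cache_release(pages[i]);	/* drop the lookup's reference */
	}
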
-- cgit v1.2.2 From 693f7d362055261882659475d2ef022e32edbff1 Mon Sep 17 00:00:00 2001 From: "shin, jacob" Date: Fri, 28 Apr 2006 10:54:37 -0500 Subject: [PATCH] slab: fix crash on __drain_alien_cahce() during CPU Hotplug transfer_objects should only be called when all of the cpus in the node are online. CPU_DEAD notifier callback marks l3->shared to NULL. Signed-off-by: Jacob Shin Signed-off-by: Linus Torvalds --- mm/slab.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index af5c5237e11a..c32af7e7581e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -979,7 +979,8 @@ static void __drain_alien_cache(struct kmem_cache *cachep, * That way we could avoid the overhead of putting the objects * into the free lists and getting them back later. */ - transfer_objects(rl3->shared, ac, ac->limit); + if (rl3->shared) + transfer_objects(rl3->shared, ac, ac->limit); free_block(cachep, ac->entry, ac->avail, node); ac->avail = 0; -- cgit v1.2.2 From 4c28f81193b6778f7b49090930d88e6d12bcb928 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 1 May 2006 12:16:08 -0700 Subject: [PATCH] page migration: Fix fallback behavior for dirty pages Currently we check PageDirty() in order to make the decision to swap out the page. However, the dirty information may be only be contained in the ptes pointing to the page. We need to first unmap the ptes before checking for PageDirty(). If unmap is successful then the page count of the page will also be decreased so that pageout() works properly. This is a fix necessary for 2.6.17. Without this fix we may migrate dirty pages for filesystems without migration functions. Filesystems may keep pointers to dirty pages. Migration of dirty pages can result in the filesystem keeping pointers to freed pages. Unmapping is currently not be separated out from removing all the references to a page and moving the mapping. Therefore try_to_unmap will be called again in migrate_page() if the writeout is successful. However, it wont do anything since the ptes are already removed. The coming updates to the page migration code will restructure the code so that this is no longer necessary. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index d444229f2599..1c25040693d2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -439,6 +439,17 @@ redo: goto unlock_both; } + /* Make sure the dirty bit is up to date */ + if (try_to_unmap(page, 1) == SWAP_FAIL) { + rc = -EPERM; + goto unlock_both; + } + + if (page_mapcount(page)) { + rc = -EAGAIN; + goto unlock_both; + } + /* * Default handling if a filesystem does not provide * a migration function. We can only migrate clean -- cgit v1.2.2 From 46a66eecdf7bc12562ecb492297447ed0e1ecf59 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 1 May 2006 12:16:09 -0700 Subject: [PATCH] sparsemem interaction with memory add bug fixes This patch fixes two bugs with the way sparsemem interacts with memory add. They are: - memory leak if memmap for section already exists - calling alloc_bootmem_node() after boot These bugs were discovered and a first cut at the fixes were provided by Arnd Bergmann and Joel Schopp . 
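For illustration (summarizing the hunk below rather than adding to it), the rule the fix applies is that bootmem is only usable while the system is still booting, so the allocator has to be chosen at run time:

	if (system_state == SYSTEM_RUNNING)	/* memory hot-add, after boot */
		section = kmalloc_node(array_size, GFP_KERNEL, nid);
	else					/* early boot */
		section = alloc_bootmem_node(NODE_DATA(nid), array_size);

A later patch in this series replaces the system_state test with the more precise slab_is_available().
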
Signed-off-by: Mike Kravetz Signed-off-by: Joel Schopp Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 0a51f36ba3a1..d7c32de99ee8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -32,7 +32,10 @@ static struct mem_section *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - section = alloc_bootmem_node(NODE_DATA(nid), array_size); + if (system_state == SYSTEM_RUNNING) + section = kmalloc_node(array_size, GFP_KERNEL, nid); + else + section = alloc_bootmem_node(NODE_DATA(nid), array_size); if (section) memset(section, 0, array_size); @@ -281,9 +284,9 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, ret = sparse_init_one_section(ms, section_nr, memmap); - if (ret <= 0) - __kfree_section_memmap(memmap, nr_pages); out: pgdat_resize_unlock(pgdat, &flags); + if (ret <= 0) + __kfree_section_memmap(memmap, nr_pages); return ret; } -- cgit v1.2.2 From bed120c64eb07b6838bb758109811484af8cebba Mon Sep 17 00:00:00 2001 From: Joel H Schopp Date: Mon, 1 May 2006 12:16:11 -0700 Subject: [PATCH] spufs: fix for CONFIG_NUMA Based on an older patch from Mike Kravetz We need to have a mem_map for high addresses in order to make fops->no_page work on spufs mem and register files. So far, we have used the memory_present() function during early bootup, but that did not work when CONFIG_NUMA was enabled. We now use the __add_pages() function to add the mem_map when loading the spufs module, which is a lot nicer. Signed-off-by: Arnd Bergmann Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1fe76d963ac2..1ae2b2cc3a54 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -69,12 +69,16 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { err = __add_section(zone, phys_start_pfn + i); - if (err) + /* We want to keep adding the rest of the + * sections if the first ones already exist + */ + if (err && (err != -EEXIST)) break; } return err; } +EXPORT_SYMBOL_GPL(__add_pages); static void grow_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) -- cgit v1.2.2 From ac924c6034d9095f95ee889f7e31bbb9145da0c2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 15 May 2006 09:43:59 -0700 Subject: [PATCH] setup_per_zone_pages_min() overflow fix As pointed out in http://bugzilla.kernel.org/show_bug.cgi?id=6490, this function can experience overflows on 32-bit machines, causing our response to changed values of min_free_kbytes to go whacky. Fixing it efficiently is all too hard, so fix it with 64-bit math instead. 
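For a concrete sense of the overflow (numbers purely illustrative): with 4 KB pages, pages_min = 16384 (a 64 MB min_free_kbytes) and zone->present_pages = 262144 (a 1 GB zone) give

	pages_min * zone->present_pages = 16384 * 262144 = 4294967296

which already exceeds the 32-bit unsigned long maximum of 4294967295. Promoting one operand to u64 and dividing with do_div(), as in the hunk below, keeps the full intermediate product.
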
Cc: Ake Sandgren Cc: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea77c999047e..813b4ec1298a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -39,6 +39,7 @@ #include #include +#include #include "internal.h" /* @@ -2566,9 +2567,11 @@ void setup_per_zone_pages_min(void) } for_each_zone(zone) { - unsigned long tmp; + u64 tmp; + spin_lock_irqsave(&zone->lru_lock, flags); - tmp = (pages_min * zone->present_pages) / lowmem_pages; + tmp = (u64)pages_min * zone->present_pages; + do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -2595,8 +2598,8 @@ void setup_per_zone_pages_min(void) zone->pages_min = tmp; } - zone->pages_low = zone->pages_min + tmp / 4; - zone->pages_high = zone->pages_min + tmp / 2; + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); spin_unlock_irqrestore(&zone->lru_lock, flags); } -- cgit v1.2.2 From 39d24e64263cd3211705d3b61ea4171c65030921 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 15 May 2006 09:44:13 -0700 Subject: [PATCH] add slab_is_available() routine for boot code slab_is_available() indicates slab based allocators are available for use. SPARSEMEM code needs to know this as it can be called at various times during the boot process. Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 8 ++++++++ mm/sparse.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c32af7e7581e..b1d643b5238d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -700,6 +700,14 @@ static enum { FULL } g_cpucache_up; +/* + * used by boot code to determine if it can use slab based allocator + */ +int slab_is_available(void) +{ + return g_cpucache_up == FULL; +} + static DEFINE_PER_CPU(struct work_struct, reap_work); static void free_block(struct kmem_cache *cachep, void **objpp, int len, diff --git a/mm/sparse.c b/mm/sparse.c index d7c32de99ee8..c5e89eb9ac8f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -32,7 +32,7 @@ static struct mem_section *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (system_state == SYSTEM_RUNNING) + if (slab_is_available()) section = kmalloc_node(array_size, GFP_KERNEL, nid); else section = alloc_bootmem_node(NODE_DATA(nid), array_size); -- cgit v1.2.2 From a4523a8b38089478f93bc053c31f678c63f5ee1b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 15 May 2006 11:41:00 -0700 Subject: [PATCH] slab: Fix kmem_cache_destroy() on NUMA With CONFIG_NUMA set, kmem_cache_destroy() may fail and say "Can't free all objects." The problem is caused by sequences such as the following (suppose we are on a NUMA machine with two nodes, 0 and 1): * Allocate an object from cache on node 0. * Free the object on node 1. The object is put into node 1's alien array_cache for node 0. * Call kmem_cache_destroy(), which ultimately ends up in __cache_shrink(). * __cache_shrink() does drain_cpu_caches(), which loops through all nodes. For each node it drains the shared array_cache and then handles the alien array_cache for the other node. However this means that node 0's shared array_cache will be drained, and then node 1 will move the contents of its alien[0] array_cache into that same shared array_cache. 
node 0's shared array_cache is never looked at again, so the objects left there will appear to be in use when __cache_shrink() calls __node_shrink() for node 0. So __node_shrink() will return 1 and kmem_cache_destroy() will fail. This patch fixes this by having drain_cpu_caches() do drain_alien_cache() on every node before it does drain_array() on the nodes' shared array_caches. The problem was originally reported by Or Gerlitz . Signed-off-by: Roland Dreier Acked-by: Christoph Lameter Acked-by: Pekka Enberg Signed-off-by: Linus Torvalds --- mm/slab.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index b1d643b5238d..d31a06bfbea5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2200,11 +2200,14 @@ static void drain_cpu_caches(struct kmem_cache *cachep) check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; - if (l3) { + if (l3 && l3->alien) + drain_alien_cache(cachep, l3->alien); + } + + for_each_online_node(node) { + l3 = cachep->nodelists[node]; + if (l3) drain_array(cachep, l3, l3->shared, 1, node); - if (l3->alien) - drain_alien_cache(cachep, l3->alien); - } } } -- cgit v1.2.2 From 12783b002db1f02c29353c8f698a85514420b9f4 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 20 May 2006 15:00:05 -0700 Subject: [PATCH] SPARSEMEM incorrectly calculates section number A bad calculation/loop in __section_nr() could result in incorrect section information being put into sysfs memory entries. This primarily impacts memory add operations as the sysfs information is used while onlining new memory. Fix suggested by Dave Hansen. Note that the bug may not be obvious from the patch. It actually occurs in the function's return statement: return (root_nr * SECTIONS_PER_ROOT) + (ms - root); In the existing code, root_nr has already been multiplied by SECTIONS_PER_ROOT. Signed-off-by: Mike Kravetz Cc: Dave Hansen Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index c5e89eb9ac8f..100040c0dfb6 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -87,11 +87,8 @@ int __section_nr(struct mem_section* ms) unsigned long root_nr; struct mem_section* root; - for (root_nr = 0; - root_nr < NR_MEM_SECTIONS; - root_nr += SECTIONS_PER_ROOT) { - root = __nr_to_section(root_nr); - + for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { + root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); if (!root) continue; -- cgit v1.2.2 From bdd804f478a0cc74bf7db8e9f9d5fd379d1b31ca Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sat, 20 May 2006 15:00:09 -0700 Subject: [PATCH] Cpuset: might sleep checking zones allowed fix Fix a couple of infrequently encountered 'sleeping function called from invalid context' in the cpuset hooks in __alloc_pages. Could sleep while interrupts disabled. The routine cpuset_zone_allowed() is called by code in mm/page_alloc.c __alloc_pages() to determine if a zone is allowed in the current tasks cpuset. This routine can sleep, for certain GFP_KERNEL allocations, if the zone is on a memory node not allowed in the current cpuset, but might be allowed in a parent cpuset. But we can't sleep in __alloc_pages() if in interrupt, nor if called for a GFP_ATOMIC request (__GFP_WAIT not set in gfp_flags). 
The rule was intended to be: Don't call cpuset_zone_allowed() if you can't sleep, unless you pass in the __GFP_HARDWALL flag set in gfp_flag, which disables the code that might scan up ancestor cpusets and sleep. This rule was being violated in a couple of places, due to a bogus change made (by myself, pj) to __alloc_pages() as part of the November 2005 effort to cleanup its logic, and also due to a later fix to constrain which swap daemons were awoken. The bogus change can be seen at: http://linux.derkeiler.com/Mailing-Lists/Kernel/2005-11/4691.html [PATCH 01/05] mm fix __alloc_pages cpuset ALLOC_* flags This was first noticed on a tight memory system, in code that was disabling interrupts and doing allocation requests with __GFP_WAIT not set, which resulted in __might_sleep() writing complaints to the log "Debug: sleeping function called ...", when the code in cpuset_zone_allowed() tried to take the callback_sem cpuset semaphore. We haven't seen a system hang on this 'might_sleep' yet, but we are at decent risk of seeing it fairly soon, especially since the additional cpuset_zone_allowed() check was added, conditioning wakeup_kswapd(), in March 2006. Special thanks to Dave Chinner, for figuring this out, and a tip of the hat to Nick Piggin who warned me of this back in Nov 2005, before I was ready to listen. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 813b4ec1298a..bb3416932ab0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -951,7 +951,7 @@ restart: goto got_pg; do { - if (cpuset_zone_allowed(*z, gfp_mask)) + if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL)) wakeup_kswapd(*z, order); } while (*(++z)); @@ -970,7 +970,8 @@ restart: alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - alloc_flags |= ALLOC_CPUSET; + if (wait) + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations -- cgit v1.2.2 From e984bb43f7450312ba66fe0e67a99efa6be3b246 Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Sat, 20 May 2006 15:00:31 -0700 Subject: [PATCH] Align the node_mem_map endpoints to a MAX_ORDER boundary Andy added code to buddy allocator which does not require the zone's endpoints to be aligned to MAX_ORDER. An issue is that the buddy allocator requires the node_mem_map's endpoints to be MAX_ORDER aligned. Otherwise __page_find_buddy could compute a buddy not in node_mem_map for partial MAX_ORDER regions at zone's endpoints. page_is_buddy will detect that these pages at endpoints are not PG_buddy (they were zeroed out by bootmem allocator and not part of zone). Of course the negative here is we could waste a little memory but the positive is eliminating all the old checks for zone boundary conditions. SPARSEMEM won't encounter this issue because of MAX_ORDER size constraint when SPARSEMEM is configured. ia64 VIRTUAL_MEM_MAP doesn't need the logic either because the holes and endpoints are handled differently. This leaves checking alloc_remap and other arches which privately allocate for node_mem_map. 
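To see why the map endpoints matter (illustrative sketch, not from the patch): the buddy of a page is located by flipping one bit of its index, with no bounds check against the zone, along the lines of __page_find_buddy():

	buddy_idx = page_idx ^ (1 << order);
	buddy = page + (buddy_idx - page_idx);

If node_mem_map began or ended at a pfn that is not MAX_ORDER aligned, the computed buddy could point at struct pages that were never allocated. Rounding the map's start down and its end up to MAX_ORDER_NR_PAGES, as the hunk below does, guarantees the buddy always lands on a real (possibly zeroed, non-PG_buddy) struct page, which page_is_buddy() then safely rejects.
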
Signed-off-by: Bob Picco Acked-by: Mel Gorman Cc: Dave Hansen Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bb3416932ab0..253a450c400d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2125,14 +2125,22 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) #ifdef CONFIG_FLAT_NODE_MEM_MAP /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { - unsigned long size; + unsigned long size, start, end; struct page *map; - size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + /* + * The zone's endpoints aren't required to be MAX_ORDER + * aligned but the node_mem_map endpoints must be in order + * for the buddy allocator to function correctly. + */ + start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + end = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end = ALIGN(end, MAX_ORDER_NR_PAGES); + size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) map = alloc_bootmem_node(pgdat, size); - pgdat->node_mem_map = map; + pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifdef CONFIG_FLATMEM /* -- cgit v1.2.2 From 25a6df952542ad9f284421b6ffe28f3eb3df1305 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 30 May 2006 21:25:42 -0700 Subject: [PATCH] spanned_pages is not updated at a case of memory hot-add From: Yasunori Goto If hot-added memory's address is smaller than old area, spanned_pages will not be updated. It must be fixed. example) Old zone_start_pfn = 0x60000, and spanned_pages = 0x10000 Added new memory's start_pfn = 0x50000, and end_pfn = 0x60000 new spanned_pages will be still 0x10000 by old code. (It should be updated to 0x20000.) Because old_zone_end_pfn will be 0x70000, and end_pfn smaller than it. So, spanned_pages will not be updated. In current code, spanned_pages is updated only when end_pfn is updated. But, it should be updated by subtraction between bigger end_pfn and new zone_start_pfn. Signed-off-by: Yasunori Goto Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1ae2b2cc3a54..70df5c0d957e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -91,8 +91,8 @@ static void grow_zone_span(struct zone *zone, if (start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; - if (end_pfn > old_zone_end_pfn) - zone->spanned_pages = end_pfn - zone->zone_start_pfn; + zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - + zone->zone_start_pfn; zone_span_writeunlock(zone); } @@ -106,8 +106,8 @@ static void grow_pgdat_span(struct pglist_data *pgdat, if (start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; - if (end_pfn > old_pgdat_end_pfn) - pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn; + pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - + pgdat->node_start_pfn; } int online_pages(unsigned long pfn, unsigned long nr_pages) -- cgit v1.2.2 From b1ab41c4943008375c149a63602d7407f61de5b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 2 Jun 2006 15:44:58 +0200 Subject: [PATCH] slab.c: fix offslab_limit bug mm/slab.c's offlab_limit logic is totally broken. 
Firstly, "offslab_limit" is a global variable while it should either be calculated in situ or should be passed in as a parameter. Secondly, the more serious problem with it is that the condition for calculating it: if (!(OFF_SLAB(sizes->cs_cachep))) { offslab_limit = sizes->cs_size - sizeof(struct slab); offslab_limit /= sizeof(kmem_bufctl_t); is in total disconnect with the condition that makes use of it: /* More than offslab_limit objects will cause problems */ if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit) break; but due to offslab_limit being a global variable this breakage was hidden. Up until lockdep came along and perturbed the slab sizes sufficiently so that the first off-slab cache would still see a (non-calculated) zero value for offslab_limit and would panic with: kmem_cache_create: couldn't create cache size-512. Call Trace: [] show_trace+0x96/0x1c8 [] dump_stack+0x13/0x15 [] panic+0x39/0x21a [] kmem_cache_create+0x5a0/0x5d0 [] kmem_cache_init+0x193/0x379 [] start_kernel+0x17f/0x218 [] _sinittext+0x263/0x26a Kernel panic - not syncing: kmem_cache_create(): failed to create slab `size-512' Paolo Ornati's config on x86_64 managed to trigger it. The fix is to move the calculation to the place that makes use of it. This also makes slab.o 54 bytes smaller. Btw., the check itself is quite silly. Its intention is to test whether the number of objects per slab would be higher than the number of slab control pointers possible. In theory it could be triggered: if someone tried to allocate 4-byte objects cache and explicitly requested with CFLGS_OFF_SLAB. So i kept the check. Out of historic interest i checked how old this bug was and it's ancient, 10 years old! It is the oldest hidden and then truly triggering bugs i ever saw being fixed in the kernel! Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- mm/slab.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d31a06bfbea5..f1b644eb39d8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -207,11 +207,6 @@ typedef unsigned int kmem_bufctl_t; #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) -/* Max number of objs-per-slab for caches which use off-slab slabs. - * Needed to avoid a possible looping condition in cache_grow(). - */ -static unsigned long offslab_limit; - /* * struct slab * @@ -1356,12 +1351,6 @@ void __init kmem_cache_init(void) NULL, NULL); } - /* Inc off-slab bufctl limit until the ceiling is hit. */ - if (!(OFF_SLAB(sizes->cs_cachep))) { - offslab_limit = sizes->cs_size - sizeof(struct slab); - offslab_limit /= sizeof(kmem_bufctl_t); - } - sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, @@ -1780,6 +1769,7 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { + unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -1791,9 +1781,18 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, if (!num) continue; - /* More than offslab_limit objects will cause problems */ - if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit) - break; + if (flags & CFLGS_OFF_SLAB) { + /* + * Max number of objs-per-slab for caches which + * use off-slab slabs. Needed to avoid a possible + * looping condition in cache_grow(). 
+ */ + offslab_limit = size - sizeof(struct slab); + offslab_limit /= sizeof(kmem_bufctl_t); + + if (num > offslab_limit) + break; + } /* Found something acceptable - save it away */ cachep->num = num; -- cgit v1.2.2
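As a worked example of the relocated check (struct sizes are illustrative, 32-bit style): for a cache of 512-byte objects using off-slab management,

	offslab_limit = (size - sizeof(struct slab)) / sizeof(kmem_bufctl_t)
	              = (512 - 32) / 4
	              = 120

so a candidate order is only rejected when cache_estimate() reports more than 120 objects per slab, which ordinary object sizes never approach; as the changelog notes, only something like a deliberately off-slab cache of tiny objects could plausibly trip it.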