From 99ee4ca746dda71326db7645463b4075ac1d665c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 Mar 2010 17:50:17 -0800
Subject: rcu: Suppress __mpol_dup() false positive from RCU lockdep

Common code is used during task creation and after the task has
started running.  RCU protection is not needed during task
creation because no other CPU has access to the
under-construction task.  Provide the RCU protection anyway to
suppress the false positive, as there does not appear to be a
good way for the common code to recognize that the task is only
accessible to the CPU creating it.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1267667418-32233-2-git-send-email-paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 mm/mempolicy.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 290fb5bf044..3cec080faa2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1730,10 +1730,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 
 	if (!new)
 		return ERR_PTR(-ENOMEM);
+	rcu_read_lock();
 	if (current_cpuset_is_being_rebound()) {
 		nodemask_t mems = cpuset_mems_allowed(current);
 		mpol_rebind_policy(old, &mems);
 	}
+	rcu_read_unlock();
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
 	return new;
-- 
cgit v1.2.2


From e9e58a4ec3b1086d1ed8c915311aef1ae55454fd Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 15 Mar 2010 00:34:57 -0400
Subject: memcg: avoid use cmpxchg in swap cgroup maintainance

swap_cgroup uses 2bytes data and uses cmpxchg in a new operation.  2byte
cmpxchg/xchg is not available on some archs.  This patch replaces
cmpxchg/xchg with operations under lock.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reported-by: Sachin Sant <sachinp@in.ibm.com> wrote:
Acked-by: Balbir Singh <balbir@in.ibm.com>
Acked-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_cgroup.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3dd88539a0e..6c0081441a3 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -284,6 +284,7 @@ static DEFINE_MUTEX(swap_cgroup_mutex);
 struct swap_cgroup_ctrl {
 	struct page **map;
 	unsigned long length;
+	spinlock_t	lock;
 };
 
 struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
@@ -353,16 +354,22 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
 	struct swap_cgroup_ctrl *ctrl;
 	struct page *mappage;
 	struct swap_cgroup *sc;
+	unsigned long flags;
+	unsigned short retval;
 
 	ctrl = &swap_cgroup_ctrl[type];
 
 	mappage = ctrl->map[idx];
 	sc = page_address(mappage);
 	sc += pos;
-	if (cmpxchg(&sc->id, old, new) == old)
-		return old;
+	spin_lock_irqsave(&ctrl->lock, flags);
+	retval = sc->id;
+	if (retval == old)
+		sc->id = new;
 	else
-		return 0;
+		retval = 0;
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+	return retval;
 }
 
 /**
@@ -383,13 +390,17 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
 	struct page *mappage;
 	struct swap_cgroup *sc;
 	unsigned short old;
+	unsigned long flags;
 
 	ctrl = &swap_cgroup_ctrl[type];
 
 	mappage = ctrl->map[idx];
 	sc = page_address(mappage);
 	sc += pos;
-	old = xchg(&sc->id, id);
+	spin_lock_irqsave(&ctrl->lock, flags);
+	old = sc->id;
+	sc->id = id;
+	spin_unlock_irqrestore(&ctrl->lock, flags);
 
 	return old;
 }
@@ -441,6 +452,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
 	mutex_lock(&swap_cgroup_mutex);
 	ctrl->length = length;
 	ctrl->map = array;
+	spin_lock_init(&ctrl->lock);
 	if (swap_cgroup_prepare(type)) {
 		/* memory shortage */
 		ctrl->map = NULL;
-- 
cgit v1.2.2


From c26f91a3df1999ec1b3298372d73f90cbab81106 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Mon, 22 Mar 2010 09:32:26 +0100
Subject: x86: Remove excessive early_res debug output

Commit 08677214e318297 ("x86: Make 64 bit use early_res instead
of bootmem  before slab") introduced early_res replacement for
bootmem, but left code  in __free_pages_memory() which dumps all
the ranges that are beeing freed,  without any additional
information, causing some noise in dmesg during  bootup.

Just remove printing of the ranges, that doesn't provide
anything useful  anyway.

While at it, remove other commented-out KERN_DEBUG messages in
the NO_BOOTMEM code as well.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Found-OK-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <alpine.LNX.2.00.1003220931360.18642@pobox.suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 mm/bootmem.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'mm')

diff --git a/mm/bootmem.c b/mm/bootmem.c
index d7c791ef003..9b134460b01 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -180,19 +180,12 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
 	end_aligned = end & ~(BITS_PER_LONG - 1);
 
 	if (end_aligned <= start_aligned) {
-#if 1
-		printk(KERN_DEBUG " %lx - %lx\n", start, end);
-#endif
 		for (i = start; i < end; i++)
 			__free_pages_bootmem(pfn_to_page(i), 0);
 
 		return;
 	}
 
-#if 1
-	printk(KERN_DEBUG " %lx %lx - %lx %lx\n",
-		 start, start_aligned, end_aligned, end);
-#endif
 	for (i = start; i < start_aligned; i++)
 		__free_pages_bootmem(pfn_to_page(i), 0);
 
@@ -428,9 +421,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 {
 #ifdef CONFIG_NO_BOOTMEM
 	free_early(physaddr, physaddr + size);
-#if 0
-	printk(KERN_DEBUG "free %lx %lx\n", physaddr, size);
-#endif
 #else
 	unsigned long start, end;
 
@@ -456,9 +446,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 {
 #ifdef CONFIG_NO_BOOTMEM
 	free_early(addr, addr + size);
-#if 0
-	printk(KERN_DEBUG "free %lx %lx\n", addr, size);
-#endif
 #else
 	unsigned long start, end;
 
-- 
cgit v1.2.2


From 5cfb80a73b5a52fb19d8b0611203e4dd58e8e9a2 Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura <d-nishimura@mtf.biglobe.ne.jp>
Date: Tue, 23 Mar 2010 13:35:11 -0700
Subject: memcg: disable move charge in no mmu case

In commit 02491447 ("memcg: move charges of anonymous swap"), I tried to
disable move charge feature in no mmu case by enclosing all the related
functions with "#ifdef CONFIG_MMU", but the commit places these ifdefs in
wrong place.  (it seems that it's mangled while handling some fixes...)

This patch fixes it up.

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7973b5221fb..00dda352144 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3946,28 +3946,6 @@ one_by_one:
 	}
 	return ret;
 }
-#else	/* !CONFIG_MMU */
-static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
-				struct cgroup *cgroup,
-				struct task_struct *p,
-				bool threadgroup)
-{
-	return 0;
-}
-static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
-				struct cgroup *cgroup,
-				struct task_struct *p,
-				bool threadgroup)
-{
-}
-static void mem_cgroup_move_task(struct cgroup_subsys *ss,
-				struct cgroup *cont,
-				struct cgroup *old_cont,
-				struct task_struct *p,
-				bool threadgroup)
-{
-}
-#endif
 
 /**
  * is_target_pte_for_mc - check a pte whether it is valid for move charge
@@ -4330,6 +4308,28 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 	}
 	mem_cgroup_clear_mc();
 }
+#else	/* !CONFIG_MMU */
+static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
+				struct cgroup *cgroup,
+				struct task_struct *p,
+				bool threadgroup)
+{
+	return 0;
+}
+static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
+				struct cgroup *cgroup,
+				struct task_struct *p,
+				bool threadgroup)
+{
+}
+static void mem_cgroup_move_task(struct cgroup_subsys *ss,
+				struct cgroup *cont,
+				struct cgroup *old_cont,
+				struct task_struct *p,
+				bool threadgroup)
+{
+}
+#endif
 
 struct cgroup_subsys mem_cgroup_subsys = {
 	.name = "memory",
-- 
cgit v1.2.2


From e7bbcdf3747e3919c31cfa87853c69d178bce548 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Tue, 23 Mar 2010 13:35:12 -0700
Subject: memcontrol: fix potential null deref

There was a potential null deref introduced in c62b1a3b31b5 ("memcg: use
generic percpu instead of private implementation").

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00dda352144..9ed760dc744 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3691,8 +3691,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	else
 		mem = vmalloc(size);
 
-	if (mem)
-		memset(mem, 0, size);
+	if (!mem)
+		return NULL;
+
+	memset(mem, 0, size);
 	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
 	if (!mem->stat) {
 		if (size < PAGE_SIZE)
-- 
cgit v1.2.2


From 3fa30460ea502133a18a07b14452cd660906f16f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 23 Mar 2010 13:35:21 -0700
Subject: nommu: fix an incorrect comment in the do_mmap_shared_file()

Fix an incorrect comment in the do_mmap_shared_file().  If a mapping is
requested MAP_SHARED, then a private copy cannot be made and still provide
correct semantics.

Signed-off-by: David Howells <dhowells@redhat.com>
Reported-by: Dave Hudson <uclinux@blueteddy.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/nommu.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/nommu.c b/mm/nommu.c
index 605ace8982a..e4b8f4d28a3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1040,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
 	if (ret != -ENOSYS)
 		return ret;
 
-	/* getting an ENOSYS error indicates that direct mmap isn't
-	 * possible (as opposed to tried but failed) so we'll fall
-	 * through to making a private copy of the data and mapping
-	 * that if we can */
+	/* getting -ENOSYS indicates that direct mmap isn't possible (as
+	 * opposed to tried but failed) so we can only give a suitable error as
+	 * it's not possible to make a private copy if MAP_SHARED was given */
 	return -ENODEV;
 }
 
-- 
cgit v1.2.2


From cb53237513bd1e090cce120efe12ede72c932b5f Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Tue, 23 Mar 2010 13:35:26 -0700
Subject: mm/ksm.c is doing an unneeded _notify in write_protect_page.

ksm.c's write_protect_page implements a lockless means of verifying a page
does not have any users of the page which are not accounted for via other
kernel tracking means.  It does this by removing the writable pte with TLB
flushes, checking the page_count against the total known users, and then
using set_pte_at_notify to make it a read-only entry.

An unneeded mmu_notifier callout is made in the case where the known users
does not match the page_count.  In that event, we are inserting the
identical pte and there is no need for the set_pte_at_notify, but rather
the simpler set_pte_at suffices.

Signed-off-by: Robin Holt <holt@sgi.com>
Acked-by: Izik Eidus <ieidus@redhat.com>
Acked-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/ksm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/ksm.c b/mm/ksm.c
index a93f1b7f508..8cdfc2a1e8b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -751,7 +751,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 		 * page
 		 */
 		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
-			set_pte_at_notify(mm, addr, ptep, entry);
+			set_pte_at(mm, addr, ptep, entry);
 			goto out_unlock;
 		}
 		entry = pte_wrprotect(entry);
-- 
cgit v1.2.2


From 413b43deab8377819aba1dbad2abf0c15d59b491 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Tue, 23 Mar 2010 13:35:28 -0700
Subject: tmpfs: fix oops on mounts with mpol=default

Fix an 'oops' when a tmpfs mount point is mounted with the mpol=default
mempolicy.

Upon remounting a tmpfs mount point with 'mpol=default' option, the mount
code crashed with a null pointer dereference.  The initial problem report
was on 2.6.27, but the problem exists in mainline 2.6.34-rc as well.  On
examining the code, we see that mpol_new returns NULL if default mempolicy
was requested.  This 'NULL' mempolicy is accessed to store the node mask
resulting in oops.

The following patch fixes it.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 643f66e1018..745ce90308a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2215,10 +2215,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			goto out;
 		mode = MPOL_PREFERRED;
 		break;
-
+	case MPOL_DEFAULT:
+		/*
+		 * Insist on a empty nodelist
+		 */
+		if (!nodelist)
+			err = 0;
+		goto out;
 	/*
 	 * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
-	 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
 	 */
 	}
 
-- 
cgit v1.2.2


From d69b2e63e9172afb4d07c305601b79a55509ac4c Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 23 Mar 2010 13:35:30 -0700
Subject: tmpfs: mpol=bind:0 don't cause mount error.

Currently, following mount operation cause mount error.

% mount -t tmpfs -ompol=bind:0 none /tmp

Because commit 71fe804b6d5 (mempolicy: use struct mempolicy pointer in
shmem_sb_info) corrupted MPOL_BIND parse code.

This patch restore the needed one.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 745ce90308a..10db44f9574 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2222,9 +2222,13 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		if (!nodelist)
 			err = 0;
 		goto out;
-	/*
-	 * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
-	 */
+	case MPOL_BIND:
+		/*
+		 * Insist on a nodelist
+		 */
+		if (!nodelist)
+			goto out;
+		err = 0;
 	}
 
 	mode_flags = 0;
-- 
cgit v1.2.2


From 12821f5fb942e795f8009ece14bde868893bd811 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 23 Mar 2010 13:35:31 -0700
Subject: tmpfs: handle MPOL_LOCAL mount option properly

commit 71fe804b6d5 (mempolicy: use struct mempolicy pointer in
shmem_sb_info) added mpol=local mount option.  but its feature is broken
since it was born.  because such code always return 1 (i.e.  mount
failure).

This patch fixes it.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10db44f9574..fb71790398f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2214,6 +2214,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		if (nodelist)
 			goto out;
 		mode = MPOL_PREFERRED;
+		err = 0;
 		break;
 	case MPOL_DEFAULT:
 		/*
-- 
cgit v1.2.2


From 926f2ae04f183098cf9a30521776fb2759c8afeb Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 23 Mar 2010 13:35:32 -0700
Subject: tmpfs: cleanup mpol_parse_str()

mpol_parse_str() made lots 'err' variable related bug.  Because it is ugly
and reviewing unfriendly.

This patch simplifies it.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index fb71790398f..6cdfa1df57f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2195,8 +2195,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			char *rest = nodelist;
 			while (isdigit(*rest))
 				rest++;
-			if (!*rest)
-				err = 0;
+			if (*rest)
+				goto out;
 		}
 		break;
 	case MPOL_INTERLEAVE:
@@ -2205,7 +2205,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		 */
 		if (!nodelist)
 			nodes = node_states[N_HIGH_MEMORY];
-		err = 0;
 		break;
 	case MPOL_LOCAL:
 		/*
@@ -2214,7 +2213,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		if (nodelist)
 			goto out;
 		mode = MPOL_PREFERRED;
-		err = 0;
 		break;
 	case MPOL_DEFAULT:
 		/*
@@ -2229,7 +2227,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		 */
 		if (!nodelist)
 			goto out;
-		err = 0;
 	}
 
 	mode_flags = 0;
@@ -2243,13 +2240,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		else if (!strcmp(flags, "relative"))
 			mode_flags |= MPOL_F_RELATIVE_NODES;
 		else
-			err = 1;
+			goto out;
 	}
 
 	new = mpol_new(mode, mode_flags, &nodes);
 	if (IS_ERR(new))
-		err = 1;
-	else {
+		goto out;
+
+	{
 		int ret;
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
@@ -2260,13 +2258,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			ret = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
 		if (ret) {
-			err = 1;
 			mpol_put(new);
-		} else if (no_context) {
-			/* save for contextualization */
-			new->w.user_nodemask = nodes;
+			goto out;
 		}
 	}
+	err = 0;
+	if (no_context) {
+		/* save for contextualization */
+		new->w.user_nodemask = nodes;
+	}
 
 out:
 	/* Restore string for error message */
-- 
cgit v1.2.2


From 298359c5bf06c04258d7cf552426e198c47e83c1 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 23 Mar 2010 13:35:37 -0700
Subject: exit: fix oops in sync_mm_rss

In 2.6.34-rc1, removing vhost_net module causes an oops in sync_mm_rss
(called from do_exit) when workqueue is destroyed.  This does not happen
on net-next, or with vhost on top of to 2.6.33.

The issue seems to be introduced by
34e55232e59f7b19050267a05ff1226e5cd122a5 ("mm: avoid false sharing of
mm_counter) which added sync_mm_rss() that is passed task->mm, and
dereferences it without checking.  If task is a kernel thread, mm might be
NULL.  I think this might also happen e.g.  with aio.

This patch fixes the oops by calling sync_mm_rss when task->mm is set to
NULL.  I also added BUG_ON to detect any other cases where counters get
incremented while mm is NULL.

The oops I observed looks like this:

BUG: unable to handle kernel NULL pointer dereference at 00000000000002a8
IP: [<ffffffff810b436d>] sync_mm_rss+0x33/0x6f
PGD 0
Oops: 0002 [#1] SMP
last sysfs file: /sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_map
CPU 2
Modules linked in: vhost_net(-) tun bridge stp sunrpc ipv6 cpufreq_ondemand acpi_cpufreq freq_table kvm_intel kvm i5000_edac edac_core rtc_cmos bnx2 button i2c_i801 i2c_core rtc_core e1000e sg joydev ide_cd_mod serio_raw pcspkr rtc_lib cdrom virtio_net virtio_blk virtio_pci virtio_ring virtio af_packet e1000 shpchp aacraid uhci_hcd ohci_hcd ehci_hcd [last unloaded: microcode]

Pid: 2046, comm: vhost Not tainted 2.6.34-rc1-vhost #25 System Planar/IBM System x3550 -[7978B3G]-
RIP: 0010:[<ffffffff810b436d>]  [<ffffffff810b436d>] sync_mm_rss+0x33/0x6f
RSP: 0018:ffff8802379b7e60  EFLAGS: 00010202
RAX: 0000000000000008 RBX: ffff88023f2390c0 RCX: 0000000000000000
RDX: ffff88023f2396b0 RSI: 0000000000000000 RDI: ffff88023f2390c0
RBP: ffff8802379b7e60 R08: 0000000000000000 R09: 0000000000000000
R10: ffff88023aecfbc0 R11: 0000000000013240 R12: 0000000000000000
R13: ffffffff81051a6c R14: ffffe8ffffc0f540 R15: 0000000000000000
FS:  0000000000000000(0000) GS:ffff880001e80000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00000000000002a8 CR3: 000000023af23000 CR4: 00000000000406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process vhost (pid: 2046, threadinfo ffff8802379b6000, task ffff88023f2390c0)
Stack:
 ffff8802379b7ee0 ffffffff81040687 ffffe8ffffc0f558 ffffffffa00a3e2d
<0> 0000000000000000 ffff88023f2390c0 ffffffff81055817 ffff8802379b7e98
<0> ffff8802379b7e98 0000000100000286 ffff8802379b7ee0 ffff88023ad47d78
Call Trace:
 [<ffffffff81040687>] do_exit+0x147/0x6c4
 [<ffffffffa00a3e2d>] ? handle_rx_net+0x0/0x17 [vhost_net]
 [<ffffffff81055817>] ? autoremove_wake_function+0x0/0x39
 [<ffffffff81051a6c>] ? worker_thread+0x0/0x229
 [<ffffffff810553c9>] kthreadd+0x0/0xf2
 [<ffffffff810038d4>] kernel_thread_helper+0x4/0x10
 [<ffffffff81055342>] ? kthread+0x0/0x87
 [<ffffffff810038d0>] ? kernel_thread_helper+0x0/0x10
Code: 00 8b 87 6c 02 00 00 85 c0 74 14 48 98 f0 48 01 86 a0 02 00 00 c7 87 6c 02 00 00 00 00 00 00 8b 87 70 02 00 00 85 c0 74 14 48 98 <f0> 48 01 86 a8 02 00 00 c7 87 70 02 00 00 00 00 00 00 8b 87 74
RIP  [<ffffffff810b436d>] sync_mm_rss+0x33/0x6f
 RSP <ffff8802379b7e60>
CR2: 00000000000002a8
---[ end trace 41603ba922beddd2 ]---
Fixing recursive fault but reboot is needed!

(note: handle_rx_net is a work item using workqueue in question).
sync_mm_rss+0x33/0x6f gave me a hint. I also tried reverting
34e55232e59f7b19050267a05ff1226e5cd122a5 and the oops goes away.

The module in question calls use_mm and later unuse_mm from a kernel
thread.  It is when this kernel thread is destroyed that the crash
happens.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c      | 1 +
 mm/mmu_context.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 5b7f2002e54..bc9ba5a1f5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -130,6 +130,7 @@ void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
 
 	for (i = 0; i < NR_MM_COUNTERS; i++) {
 		if (task->rss_stat.count[i]) {
+			BUG_ON(!mm);
 			add_mm_counter(mm, i, task->rss_stat.count[i]);
 			task->rss_stat.count[i] = 0;
 		}
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 0777654147c..9e82e937000 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,6 +53,7 @@ void unuse_mm(struct mm_struct *mm)
 	struct task_struct *tsk = current;
 
 	task_lock(tsk);
+	sync_mm_rss(tsk, mm);
 	tsk->mm = NULL;
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
-- 
cgit v1.2.2


From c6b6ef8bb05af632889c5536513b9f4004961f73 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Tue, 23 Mar 2010 13:35:41 -0700
Subject: mempolicy: fix get_mempolicy() for relative and static nodes

Discovered while testing other mempolicy changes:

get_mempolicy() does not handle static/relative mode flags correctly.
Return the value that the user specified so that it can be restored
via set_mempolicy() if desired.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6cdfa1df57f..8034abd3a13 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -806,9 +806,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 
 	err = 0;
 	if (nmask) {
-		task_lock(current);
-		get_policy_nodemask(pol, nmask);
-		task_unlock(current);
+		if (mpol_store_user_nodemask(pol)) {
+			*nmask = pol->w.user_nodemask;
+		} else {
+			task_lock(current);
+			get_policy_nodemask(pol, nmask);
+			task_unlock(current);
+		}
 	}
 
  out:
-- 
cgit v1.2.2


From 7561e8ca0dfaf6fca3feef982830de3b65300e5b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 25 Mar 2010 16:48:38 +0000
Subject: NOMMU: Revert 'nommu: get_user_pages(): pin last page on
 non-page-aligned start'

Revert the following patch:

	commit c08c6e1f54c85fc299cf9f88cf330d6dd28a9a1d
	Author: Steven J. Magnani <steve@digidescorp.com>
	Date:   Fri Mar 5 13:42:24 2010 -0800

	nommu: get_user_pages(): pin last page on non-page-aligned start

As it assumes that the mappings begin at the start of pages - something that
isn't necessarily true on NOMMU systems.  On NOMMU systems, it is possible for
a mapping to only occupy part of the page, and not necessarily touch either end
of it; in fact it's also possible for multiple non-overlapping mappings to
coexist on one page (consider direct mappings of ROMFS files, for example).

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Steven J. Magnani <steve@digidescorp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/nommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/nommu.c b/mm/nommu.c
index e4b8f4d28a3..089982f5a4c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -146,7 +146,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 
 	for (i = 0; i < nr_pages; i++) {
-		vma = find_extend_vma(mm, start);
+		vma = find_vma(mm, start);
 		if (!vma)
 			goto finish_or_fault;
 
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(find_vma);
  */
 struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
 {
-	return find_vma(mm, addr & PAGE_MASK);
+	return find_vma(mm, addr);
 }
 
 /*
-- 
cgit v1.2.2


From e1ee65d85904c5dd4b9cea1b15d5e85e20eae8a1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 25 Mar 2010 16:48:44 +0000
Subject: NOMMU: Fix __get_user_pages() to pin last page on offset buffers

Fix __get_user_pages() to make it pin the last page on a buffer that doesn't
begin at the start of a page, but is a multiple of PAGE_SIZE in size.

The problem is that __get_user_pages() advances the pointer too much when it
iterates to the next page if the page it's currently looking at isn't used from
the first byte.  This can cause the end of a short VMA to be reached
prematurely, resulting in the last page being lost.

Signed-off-by: Steven J. Magnani <steve@digidescorp.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/nommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/nommu.c b/mm/nommu.c
index 089982f5a4c..63fa17d121f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -162,7 +162,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		}
 		if (vmas)
 			vmas[i] = vma;
-		start += PAGE_SIZE;
+		start = (start + PAGE_SIZE) & PAGE_MASK;
 	}
 
 	return i;
-- 
cgit v1.2.2


From 10fad5e46f6c7bdfb01b1a012380a38e3c6ab346 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 10 Mar 2010 18:57:54 +0900
Subject: percpu, module: implement and use is_kernel/module_percpu_address()

lockdep has custom code to check whether a pointer belongs to static
percpu area which is somewhat broken.  Implement proper
is_kernel/module_percpu_address() and replace the custom code.

On UP, percpu variables are regular static variables and can't be
distinguished from them.  Always return %false on UP.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
---
 mm/percpu.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'mm')

diff --git a/mm/percpu.c b/mm/percpu.c
index 768419d44ad..6e09741ddc6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1303,6 +1303,32 @@ void free_percpu(void __percpu *ptr)
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
+/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area.  Module
+ * static percpu areas are not considered.  For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+	const size_t static_size = __per_cpu_end - __per_cpu_start;
+	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *start = per_cpu_ptr(base, cpu);
+
+		if ((void *)addr >= start && (void *)addr < start + static_size)
+			return true;
+        }
+	return false;
+}
+
 /**
  * per_cpu_ptr_to_phys - convert translated percpu address to physical address
  * @addr: the address to be converted to physical address
-- 
cgit v1.2.2


From 5a0e3ad6af8660be21ca98a971cd00f331318c05 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 24 Mar 2010 17:04:11 +0900
Subject: include cleanup: Update gfp.h and slab.h includes to prepare for
 breaking implicit slab.h inclusion from percpu.h

percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files.  percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.

percpu.h -> slab.h dependency is about to be removed.  Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability.  As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.

  http://userweb.kernel.org/~tj/misc/slabh-sweep.py

The script does the followings.

* Scan files for gfp and slab usages and update includes such that
  only the necessary includes are there.  ie. if only gfp is used,
  gfp.h, if slab is used, slab.h.

* When the script inserts a new include, it looks at the include
  blocks and try to put the new include such that its order conforms
  to its surrounding.  It's put in the include block which contains
  core kernel includes, in the same order that the rest are ordered -
  alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
  doesn't seem to be any matching order.

* If the script can't find a place to put a new include (mostly
  because the file doesn't have fitting include block), it prints out
  an error message indicating which .h file needs to be added to the
  file.

The conversion was done in the following steps.

1. The initial automatic conversion of all .c files updated slightly
   over 4000 files, deleting around 700 includes and adding ~480 gfp.h
   and ~3000 slab.h inclusions.  The script emitted errors for ~400
   files.

2. Each error was manually checked.  Some didn't need the inclusion,
   some needed manual addition while adding it to implementation .h or
   embedding .c file was more appropriate for others.  This step added
   inclusions to around 150 files.

3. The script was run again and the output was compared to the edits
   from #2 to make sure no file was left behind.

4. Several build tests were done and a couple of problems were fixed.
   e.g. lib/decompress_*.c used malloc/free() wrappers around slab
   APIs requiring slab.h to be added manually.

5. The script was run on all .h files but without automatically
   editing them as sprinkling gfp.h and slab.h inclusions around .h
   files could easily lead to inclusion dependency hell.  Most gfp.h
   inclusion directives were ignored as stuff from gfp.h was usually
   wildly available and often used in preprocessor macros.  Each
   slab.h inclusion directive was examined and added manually as
   necessary.

6. percpu.h was updated not to include slab.h.

7. Build test were done on the following configurations and failures
   were fixed.  CONFIG_GCOV_KERNEL was turned off for all tests (as my
   distributed build env didn't work with gcov compiles) and a few
   more options had to be turned off depending on archs to make things
   build (like ipr on powerpc/64 which failed due to missing writeq).

   * x86 and x86_64 UP and SMP allmodconfig and a custom test config.
   * powerpc and powerpc64 SMP allmodconfig
   * sparc and sparc64 SMP allmodconfig
   * ia64 SMP allmodconfig
   * s390 SMP allmodconfig
   * alpha SMP allmodconfig
   * um on x86_64 SMP allmodconfig

8. percpu.h modifications were reverted so that it could be applied as
   a separate patch and serve as bisection point.

Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable easily on most builds of
the specific arch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
---
 mm/bootmem.c        | 1 +
 mm/bounce.c         | 1 +
 mm/failslab.c       | 1 -
 mm/filemap.c        | 2 +-
 mm/filemap_xip.c    | 1 +
 mm/hugetlb.c        | 2 +-
 mm/kmemcheck.c      | 1 -
 mm/kmemleak.c       | 1 -
 mm/memory-failure.c | 1 +
 mm/memory.c         | 1 +
 mm/mempolicy.c      | 1 -
 mm/migrate.c        | 1 +
 mm/mincore.c        | 2 +-
 mm/mmu_notifier.c   | 1 +
 mm/mprotect.c       | 1 -
 mm/mremap.c         | 1 -
 mm/oom_kill.c       | 1 +
 mm/page_io.c        | 1 +
 mm/quicklist.c      | 1 +
 mm/readahead.c      | 1 +
 mm/sparse-vmemmap.c | 1 +
 mm/sparse.c         | 1 +
 mm/swap.c           | 1 +
 mm/swap_state.c     | 1 +
 mm/truncate.c       | 1 +
 mm/vmscan.c         | 2 +-
 mm/vmstat.c         | 1 +
 27 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 9b134460b01..eff22422057 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -10,6 +10,7 @@
  */
 #include <linux/init.h>
 #include <linux/pfn.h>
+#include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/kmemleak.h>
diff --git a/mm/bounce.c b/mm/bounce.c
index a2b76a588e3..13b6dad1eed 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/gfp.h>
 #include <linux/bio.h>
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
diff --git a/mm/failslab.c b/mm/failslab.c
index bb41f98dd8b..c5f88f240dd 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,5 +1,4 @@
 #include <linux/fault-inject.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 
 static struct {
diff --git a/mm/filemap.c b/mm/filemap.c
index 045b31c3765..140ebda9640 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -10,13 +10,13 @@
  * the NFS filesystem used to do this differently, for example)
  */
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/aio.h>
 #include <linux/capability.h>
 #include <linux/kernel_stat.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/mman.h>
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 78b94f0b6d5..83364df74a3 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -17,6 +17,7 @@
 #include <linux/sched.h>
 #include <linux/seqlock.h>
 #include <linux/mutex.h>
+#include <linux/gfp.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a5aeb37c11..6034dc9e979 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2,7 +2,6 @@
  * Generic hugetlb support.
  * (C) William Irwin, April 2004
  */
-#include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -18,6 +17,7 @@
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
+#include <linux/slab.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index fd814fd6131..8f8e48acf7d 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -1,7 +1,6 @@
 #include <linux/gfp.h>
 #include <linux/mm_types.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/kmemcheck.h>
 
 void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5b069e4f5e4..2c0d032ac89 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -72,7 +72,6 @@
 #include <linux/module.h>
 #include <linux/kthread.h>
 #include <linux/prio_tree.h>
-#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d1f33516297..620b0b46159 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -44,6 +44,7 @@
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
 #include <linux/suspend.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
diff --git a/mm/memory.c b/mm/memory.c
index bc9ba5a1f5b..1d2ea39260e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -56,6 +56,7 @@
 #include <linux/kallsyms.h>
 #include <linux/swapops.h>
 #include <linux/elf.h>
+#include <linux/gfp.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8034abd3a13..08f40a2f3fe 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -73,7 +73,6 @@
 #include <linux/sched.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/module.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 88000b89fc9..d3f3f7f8107 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
+#include <linux/gfp.h>
 
 #include "internal.h"
 
diff --git a/mm/mincore.c b/mm/mincore.c
index 7a3436ef39e..f77433c2027 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -7,8 +7,8 @@
 /*
  * The mincore() system call.
  */
-#include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/syscalls.h>
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 7e33f2cb3c7..438951d366f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -16,6 +16,7 @@
 #include <linux/err.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 /*
  * This function can't run concurrently against mmu_notifier_register
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8bc969d8112..2d1bf7cf885 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -10,7 +10,6 @@
 
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
-#include <linux/slab.h>
 #include <linux/shm.h>
 #include <linux/mman.h>
 #include <linux/fs.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index e9c75efce60..cde56ee51ef 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,7 +9,6 @@
 
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
-#include <linux/slab.h>
 #include <linux/shm.h>
 #include <linux/ksm.h>
 #include <linux/mman.h>
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9b223af6a14..b68e802a7a7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -18,6 +18,7 @@
 #include <linux/oom.h>
 #include <linux/mm.h>
 #include <linux/err.h>
+#include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
 #include <linux/timex.h>
diff --git a/mm/page_io.c b/mm/page_io.c
index a19af956ee1..31a3b962230 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -12,6 +12,7 @@
 
 #include <linux/mm.h>
 #include <linux/kernel_stat.h>
+#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 6633965bb27..2876349339a 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -14,6 +14,7 @@
  */
 #include <linux/kernel.h>
 
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/module.h>
diff --git a/mm/readahead.c b/mm/readahead.c
index 337b20e946f..999b54bb462 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 392b9bb5bc0..aa33fd67fa4 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -22,6 +22,7 @@
 #include <linux/bootmem.h>
 #include <linux/highmem.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
diff --git a/mm/sparse.c b/mm/sparse.c
index 22896d58913..dc0cc4d43ff 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -2,6 +2,7 @@
  * sparse memory mappings.
  */
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/mmzone.h>
 #include <linux/bootmem.h>
 #include <linux/highmem.h>
diff --git a/mm/swap.c b/mm/swap.c
index 9036b89813a..7cd60bf0a97 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
 #include <linux/notifier.h>
 #include <linux/backing-dev.h>
 #include <linux/memcontrol.h>
+#include <linux/gfp.h>
 
 #include "internal.h"
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6d1daeb1cb4..e10f5833167 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -8,6 +8,7 @@
  */
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/gfp.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
diff --git a/mm/truncate.c b/mm/truncate.c
index e87e3724482..f42675a3615 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/backing-dev.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/module.h>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 79c809895fb..e0e5f15bb72 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -13,7 +13,7 @@
 
 #include <linux/mm.h>
 #include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/gfp.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7f760cbc73f..fa12ea3051f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/vmstat.h>
 #include <linux/sched.h>
-- 
cgit v1.2.2


From ea5a9f0c3447889abceb7482c391bb977472eab9 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 30 Mar 2010 03:01:14 +0900
Subject: kmemcheck: Fix build errors due to missing slab.h

mm/kmemcheck.c:69: error: dereferencing pointer to incomplete type
mm/kmemcheck.c:69: error: 'SLAB_NOTRACK' undeclared (first use in this function)
mm/kmemcheck.c:82: error: dereferencing pointer to incomplete type
mm/kmemcheck.c:94: error: dereferencing pointer to incomplete type
mm/kmemcheck.c:94: error: dereferencing pointer to incomplete type
mm/kmemcheck.c:94: error: 'SLAB_DESTROY_BY_RCU' undeclared (first use in this function)

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/kmemcheck.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 8f8e48acf7d..fd814fd6131 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -1,6 +1,7 @@
 #include <linux/gfp.h>
 #include <linux/mm_types.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/kmemcheck.h>
 
 void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
-- 
cgit v1.2.2


From de380b55f92986c1a84198149cb71b7228d15fbd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 24 Mar 2010 17:06:43 +0900
Subject: percpu: don't implicitly include slab.h from percpu.h

percpu.h has always been including slab.h to get k[mz]alloc/free() for
UP inline implementation.  percpu.h being used by very low level
headers including module.h and sched.h, this meant that a lot files
unintentionally got slab.h inclusion.

Lee Schermerhorn was trying to make topology.h use percpu.h and got
bitten by this implicit inclusion.  The right thing to do is break
this ultimately unnecessary dependency.  The previous patch added
explicit inclusion of either gfp.h or slab.h to the source files using
them.  This patch updates percpu.h such that slab.h is no longer
included from percpu.h.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
---
 mm/Makefile    |  6 +++++-
 mm/percpu_up.c | 30 ++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 mm/percpu_up.c

(limited to 'mm')

diff --git a/mm/Makefile b/mm/Makefile
index 7a68d2ab556..6c2a73a54a4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,11 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-obj-$(CONFIG_SMP) += percpu.o
+ifdef CONFIG_SMP
+obj-y += percpu.o
+else
+obj-y += percpu_up.o
+endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
new file mode 100644
index 00000000000..c4351c7f57d
--- /dev/null
+++ b/mm/percpu_up.c
@@ -0,0 +1,30 @@
+/*
+ * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
+ */
+
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+
+void __percpu *__alloc_percpu(size_t size, size_t align)
+{
+	/*
+	 * Can't easily make larger alignment work with kmalloc.  WARN
+	 * on it.  Larger alignment should only be used for module
+	 * percpu sections on SMP for which this path isn't used.
+	 */
+	WARN_ON_ONCE(align > SMP_CACHE_BYTES);
+	return kzalloc(size, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu);
+
+void free_percpu(void __percpu *p)
+{
+	kfree(p);
+}
+EXPORT_SYMBOL_GPL(free_percpu);
+
+phys_addr_t per_cpu_ptr_to_phys(void *addr)
+{
+	return __pa(addr);
+}
-- 
cgit v1.2.2


From 337998587f802535896e9ed16d19f97915ccd368 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 31 Mar 2010 20:44:09 -0700
Subject: nobootmem, x86: Fix 32bit numa system without RAM on node 0

On one system without RAM on node0, got following boot dump with a 32
bit NUMA kernel:

early_node_map[4] active PFN ranges
    1: 0x00000010 -> 0x00000099
    1: 0x00000100 -> 0x0007da00
    1: 0x0007e800 -> 0x0007ffa0
    1: 0x0007ffae -> 0x0007ffb0
...
Subtract (29 early reservations)
  #000 [0000001000 - 0000002000]
  #001 [0000089000 - 000008f000]
  #002 [0000091000 - 0000093500]
...
  #027 [007cbfef40 - 007e800000]
  #028 [007e9ca000 - 007ff95000]
(0 free memory ranges)
Initializing HighMem for node 0 (00000000:00000000)
Initializing HighMem for node 1 (00000000:00000000)
Memory: 0k/2096832k available (6662k kernel code, 2096300k reserved, 4829k data, 484k init, 0k highmem)
...
Checking if this processor honours the WP bit even in supervisor mode...Ok.
swapper: page allocation failure. order:0, mode:0x0
Pid: 0, comm: swapper Not tainted 2.6.34-rc3-tip-03818-g4b1ea6c-dirty #35
Call Trace:
 [<4087a5dc>] ? printk+0xf/0x11
 [<40286728>] __alloc_pages_nodemask+0x417/0x487
 [<402a9ce1>] new_slab+0xe2/0x1fe
 [<402aa5b2>] kmem_cache_open+0x185/0x358
 [<402abbc0>] T.954+0x1c/0x60
 [<40d52a29>] kmem_cache_init+0x24/0x113
 [<40d39738>] start_kernel+0x166/0x2e4
 [<40d3940e>] ? unknown_bootoption+0x0/0x18e
 [<40d390ce>] i386_start_kernel+0xce/0xd5
Mem-Info:
Node 1 DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
Node 1 Normal per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
active_anon:0 inactive_anon:0 isolated_anon:0
 active_file:0 inactive_file:0 isolated_file:0
 unevictable:0 dirty:0 writeback:0 unstable:0
 free:0 slab_reclaimable:0 slab_unreclaimable:0
 mapped:0 shmem:0 pagetables:0 bounce:0

When 32bit NUMA is used, free_all_bootmem() will still only go over with
node id 0.

If node 0 doesn't have RAM installed, We need to go with node1
because early_node_map still use 1 for all ranges, and ram from node1
become low ram.

Use MAX_NUMNODES like 64-bit NUMA does.

Note: BOOTMEM path has the same problem.
      this bug exist before We have NO_BOOTMEM support.

-v3: add more comments, and fix bootmem path too.
-v4: seperate bootmem path fix

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4BB41689.9090502@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 mm/bootmem.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 9b134460b01..2058cb7595f 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -303,7 +303,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 unsigned long __init free_all_bootmem(void)
 {
 #ifdef CONFIG_NO_BOOTMEM
-	return free_all_memory_core_early(NODE_DATA(0)->node_id);
+	/*
+	 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+	 *  because in some case like Node0 doesnt have RAM installed
+	 *  low ram will be on Node1
+	 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
+	 *  will be used instead of only Node0 related
+	 */
+	return free_all_memory_core_early(MAX_NUMNODES);
 #else
 	return free_all_bootmem_core(NODE_DATA(0)->bdata);
 #endif
-- 
cgit v1.2.2


From aa235fc712f379d4194cff9217f07026c452c141 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 31 Mar 2010 20:45:27 -0700
Subject: bootmem, x86: Fix 32bit numa system without RAM on node 0

When 32bit numa is used, free_all_bootmem() will still only go over with
node id 0.

If node 0 doesn't have RAM installed, the lowest populated node
becomes low RAM.

This one fixes BOOTMEM path by iterating over the bdata_list.

-v3: add more comments, and fix bootmem path too.
-v4: seperate from one big patch

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4BB416D7.6090203@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 mm/bootmem.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2058cb7595f..ba37d62b684 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -312,7 +312,13 @@ unsigned long __init free_all_bootmem(void)
 	 */
 	return free_all_memory_core_early(MAX_NUMNODES);
 #else
-	return free_all_bootmem_core(NODE_DATA(0)->bdata);
+	unsigned long total_pages = 0;
+	bootmem_data_t *bdata;
+
+	list_for_each_entry(bdata, &bdata_list, list)
+		total_pages += free_all_bootmem_core(bdata);
+
+	return total_pages;
 #endif
 }
 
-- 
cgit v1.2.2


From 144214537370b4f133a735446ebe86e90cfb2501 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Fri, 2 Apr 2010 09:46:55 +0200
Subject: backing-dev: Handle class_create() failure

I hit this when we had a bug in IDR for a few days. Basically sysfs would
fail to create new inodes since it uses an IDR and therefore class_create would
fail.

While we are unlikely to see this fail we may as well handle it instead of
oopsing.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 mm/backing-dev.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca034770..f13e067e146 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -227,6 +227,9 @@ static struct device_attribute bdi_dev_attrs[] = {
 static __init int bdi_class_init(void)
 {
 	bdi_class = class_create(THIS_MODULE, "bdi");
+	if (IS_ERR(bdi_class))
+		return PTR_ERR(bdi_class);
+
 	bdi_class->dev_attrs = bdi_dev_attrs;
 	bdi_debug_init();
 	return 0;
-- 
cgit v1.2.2


From 4946d54cb55e86a156216fcfeed5568514b0830f Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 5 Apr 2010 12:13:33 -0400
Subject: rmap: fix anon_vma_fork() memory leak

Fix a memory leak in anon_vma_fork(), where we fail to tear down the
anon_vmas attached to the new VMA in case setting up the new anon_vma
fails.

This bug also has the potential to leave behind anon_vma_chain structs
with pointers to invalid memory.

Reported-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index fcd593c9c99..eaa7a09eb72 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -232,6 +232,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
  out_error_free_anon_vma:
 	anon_vma_free(anon_vma);
  out_error:
+	unlink_anon_vmas(vma);
 	return -ENOMEM;
 }
 
-- 
cgit v1.2.2


From a3a2e76c77fa22b114e421ac11dec0c56c3503fb Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 6 Apr 2010 14:34:42 -0700
Subject: mm: avoid null-pointer deref in sync_mm_rss()

- We weren't zeroing p->rss_stat[] at fork()

- Consequently sync_mm_rss() was dereferencing tsk->mm for kernel
  threads and was oopsing.

- Make __sync_task_rss_stat() static, too.

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=15648

[akpm@linux-foundation.org: remove the BUG_ON(!mm->rss)]
Reported-by: Troels Liebe Bentsen <tlb@rapanden.dk>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
"Michael S. Tsirkin" <mst@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 1d2ea39260e..833952d8b74 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,13 +125,12 @@ core_initcall(init_zero_pfn);
 
 #if defined(SPLIT_RSS_COUNTING)
 
-void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
 {
 	int i;
 
 	for (i = 0; i < NR_MM_COUNTERS; i++) {
 		if (task->rss_stat.count[i]) {
-			BUG_ON(!mm);
 			add_mm_counter(mm, i, task->rss_stat.count[i]);
 			task->rss_stat.count[i] = 0;
 		}
-- 
cgit v1.2.2


From 70655c06bd3f25111312d63985888112aed15ac5 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Tue, 6 Apr 2010 14:34:53 -0700
Subject: readahead: fix NULL filp dereference

btrfs relocate_file_extent_cluster() calls us with NULL filp:

  [ 4005.426805] BUG: unable to handle kernel NULL pointer dereference at 00000021
  [ 4005.426818] IP: [<c109a130>] page_cache_sync_readahead+0x18/0x3e

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Yan Zheng <yanzheng@21cn.com>
Reported-by: Kirill A. Shutemov <kirill@shutemov.name>
Tested-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/readahead.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/readahead.c b/mm/readahead.c
index 999b54bb462..dfa9a1a03a1 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -503,7 +503,7 @@ void page_cache_sync_readahead(struct address_space *mapping,
 		return;
 
 	/* be dumb */
-	if (filp->f_mode & FMODE_RANDOM) {
+	if (filp && (filp->f_mode & FMODE_RANDOM)) {
 		force_page_cache_readahead(mapping, filp, offset, req_size);
 		return;
 	}
-- 
cgit v1.2.2


From d6da1a5abc2bf3a06a5bda08e0f6833409234666 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 6 Apr 2010 14:34:56 -0700
Subject: mm: revert "vmscan: get_scan_ratio() cleanup"

Shaohua Li reported his tmpfs streaming I/O test can lead to make oom.
The test uses a 6G tmpfs in a system with 3G memory.  In the tmpfs, there
are 6 copies of kernel source and the test does kbuild for each copy.  His
investigation shows the test has a lot of rotated anon pages and quite few
file pages, so get_scan_ratio calculates percent[0] (i.e.  scanning
percent for anon) to be zero.  Actually the percent[0] shoule be a big
value, but our calculation round it to zero.

Although before commit 84b18490 ("vmscan: get_scan_ratio() cleanup") , we
have the same problem too.  But the old logic can rescue percent[0]==0
case only when priority==0.  It had hided the real issue.  I didn't think
merely streaming io can makes percent[0]==0 && priority==0 situation.  but
I was wrong.

So, definitely we have to fix such tmpfs streaming io issue.  but anyway I
revert the regression commit at first.

This reverts commit 84b18490d1f1bc7ed5095c929f78bc002eb70f26.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reported-by: Shaohua Li <shaohua.li@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index e0e5f15bb72..3ff3311447f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1535,13 +1535,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	unsigned long ap, fp;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || (nr_swap_pages <= 0)) {
-		percent[0] = 0;
-		percent[1] = 100;
-		return;
-	}
-
 	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
 		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
 	file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1639,20 +1632,22 @@ static void shrink_zone(int priority, struct zone *zone,
 	unsigned long nr_reclaimed = sc->nr_reclaimed;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+	int noswap = 0;
 
-	get_scan_ratio(zone, sc, percent);
+	/* If we have no swap space, do not bother scanning anon pages. */
+	if (!sc->may_swap || (nr_swap_pages <= 0)) {
+		noswap = 1;
+		percent[0] = 0;
+		percent[1] = 100;
+	} else
+		get_scan_ratio(zone, sc, percent);
 
 	for_each_evictable_lru(l) {
 		int file = is_file_lru(l);
 		unsigned long scan;
 
-		if (percent[file] == 0) {
-			nr[l] = 0;
-			continue;
-		}
-
 		scan = zone_nr_lru_pages(zone, sc, l);
-		if (priority) {
+		if (priority || noswap) {
 			scan >>= priority;
 			scan = (scan * percent[file]) / 100;
 		}
-- 
cgit v1.2.2


From 116354d177ba2da37e91cf884e3d11e67f825efd Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Tue, 6 Apr 2010 14:35:04 -0700
Subject: pagemap: fix pfn calculation for hugepage

When we look into pagemap using page-types with option -p, the value of
pfn for hugepages looks wrong (see below.) This is because pte was
evaluated only once for one vma although it should be updated for each
hugepage.  This patch fixes it.

  $ page-types -p 3277 -Nl -b huge
  voffset   offset  len     flags
  7f21e8a00 11e400  1       ___U___________H_G________________
  7f21e8a01 11e401  1ff     ________________TG________________
               ^^^
  7f21e8c00 11e400  1       ___U___________H_G________________
  7f21e8c01 11e401  1ff     ________________TG________________
               ^^^

One hugepage contains 1 head page and 511 tail pages in x86_64 and each
two lines represent each hugepage.  Voffset and offset mean virtual
address and physical address in the page unit, respectively.  The
different hugepages should not have the same offset value.

With this patch applied:

  $ page-types -p 3386 -Nl -b huge
  voffset   offset   len    flags
  7fec7a600 112c00   1      ___UD__________H_G________________
  7fec7a601 112c01   1ff    ________________TG________________
               ^^^
  7fec7a800 113200   1      ___UD__________H_G________________
  7fec7a801 113201   1ff    ________________TG________________
               ^^^
               OK

More info:

- This patch modifies walk_page_range()'s hugepage walker.  But the
  change only affects pagemap_read(), which is the only caller of hugepage
  callback.

- Without this patch, hugetlb_entry() callback is called per vma, that
  doesn't match the natural expectation from its name.

- With this patch, hugetlb_entry() is called per hugepte entry and the
  callback can become much simpler.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/pagewalk.c | 47 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7b47a57b664..8b1a2ce21ee 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -80,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return err;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
+				       unsigned long end)
+{
+	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
+	return boundary < end ? boundary : end;
+}
+
+static int walk_hugetlb_range(struct vm_area_struct *vma,
+			      unsigned long addr, unsigned long end,
+			      struct mm_walk *walk)
+{
+	struct hstate *h = hstate_vma(vma);
+	unsigned long next;
+	unsigned long hmask = huge_page_mask(h);
+	pte_t *pte;
+	int err = 0;
+
+	do {
+		next = hugetlb_entry_end(h, addr, end);
+		pte = huge_pte_offset(walk->mm, addr & hmask);
+		if (pte && walk->hugetlb_entry)
+			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
+		if (err)
+			return err;
+	} while (addr = next, addr != end);
+
+	return 0;
+}
+#endif
+
 /**
  * walk_page_range - walk a memory map's page tables with a callback
  * @mm: memory map to walk
@@ -128,20 +159,16 @@ int walk_page_range(unsigned long addr, unsigned long end,
 		vma = find_vma(walk->mm, addr);
 #ifdef CONFIG_HUGETLB_PAGE
 		if (vma && is_vm_hugetlb_page(vma)) {
-			pte_t *pte;
-			struct hstate *hs;
-
 			if (vma->vm_end < next)
 				next = vma->vm_end;
-			hs = hstate_vma(vma);
-			pte = huge_pte_offset(walk->mm,
-					      addr & huge_page_mask(hs));
-			if (pte && !huge_pte_none(huge_ptep_get(pte))
-			    && walk->hugetlb_entry)
-				err = walk->hugetlb_entry(pte, addr,
-							  next, walk);
+			/*
+			 * Hugepage is very tightly coupled with vma, so
+			 * walk through hugetlb entries within a given vma.
+			 */
+			err = walk_hugetlb_range(vma, addr, next, walk);
 			if (err)
 				break;
+			pgd = pgd_offset(walk->mm, next);
 			continue;
 		}
 #endif
-- 
cgit v1.2.2


From 8725d5416213a145ccc9c236dbd26830ba409e00 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 6 Apr 2010 14:35:05 -0700
Subject: memcg: fix race in file_mapped accounting

Presently, memcg's FILE_MAPPED accounting has following race with
move_account (happens at rmdir()).

    increment page->mapcount (rmap.c)
    mem_cgroup_update_file_mapped()           move_account()
					      lock_page_cgroup()
					      check page_mapped() if
					      page_mapped(page)>1 {
						FILE_MAPPED -1 from old memcg
						FILE_MAPPED +1 to old memcg
					      }
					      .....
					      overwrite pc->mem_cgroup
					      unlock_page_cgroup()
    lock_page_cgroup()
    FILE_MAPPED + 1 to pc->mem_cgroup
    unlock_page_cgroup()

Then,
	old memcg (-1 file mapped)
	new memcg (+2 file mapped)

This happens because move_account see page_mapped() which is not guarded
by lock_page_cgroup().  This patch adds FILE_MAPPED flag to page_cgroup
and move account information based on it.  Now, all checks are synchronous
with lock_page_cgroup().

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Balbir Singh <balbir@in.ibm.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Andrea Righi <arighi@develer.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9ed760dc744..f4ede99c8b9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1359,16 +1359,19 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
 
 	lock_page_cgroup(pc);
 	mem = pc->mem_cgroup;
-	if (!mem)
-		goto done;
-
-	if (!PageCgroupUsed(pc))
+	if (!mem || !PageCgroupUsed(pc))
 		goto done;
 
 	/*
 	 * Preemption is already disabled. We can use __this_cpu_xxx
 	 */
-	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
+	if (val > 0) {
+		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		SetPageCgroupFileMapped(pc);
+	} else {
+		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		ClearPageCgroupFileMapped(pc);
+	}
 
 done:
 	unlock_page_cgroup(pc);
@@ -1801,16 +1804,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
-	struct page *page;
-
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
 	VM_BUG_ON(!PageCgroupLocked(pc));
 	VM_BUG_ON(!PageCgroupUsed(pc));
 	VM_BUG_ON(pc->mem_cgroup != from);
 
-	page = pc->page;
-	if (page_mapped(page) && !PageAnon(page)) {
+	if (PageCgroupFileMapped(pc)) {
 		/* Update mapped_file data for mem_cgroup */
 		preempt_disable();
 		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-- 
cgit v1.2.2


From fc1c183353a113c71675fecd0485e5aa0fe68d72 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 7 Apr 2010 19:23:40 +0300
Subject: slab: Generify kernel pointer validation

As suggested by Linus, introduce a kern_ptr_validate() helper that does some
sanity checks to make sure a pointer is a valid kernel pointer.  This is a
preparational step for fixing SLUB kmem_ptr_validate().

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 13 +------------
 mm/util.c | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index a9f325b28be..bac0f4fcc21 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3602,21 +3602,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_notrace);
  */
 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
 {
-	unsigned long addr = (unsigned long)ptr;
-	unsigned long min_addr = PAGE_OFFSET;
-	unsigned long align_mask = BYTES_PER_WORD - 1;
 	unsigned long size = cachep->buffer_size;
 	struct page *page;
 
-	if (unlikely(addr < min_addr))
-		goto out;
-	if (unlikely(addr > (unsigned long)high_memory - size))
-		goto out;
-	if (unlikely(addr & align_mask))
-		goto out;
-	if (unlikely(!kern_addr_valid(addr)))
-		goto out;
-	if (unlikely(!kern_addr_valid(addr + size - 1)))
+	if (unlikely(!kern_ptr_validate(ptr, size)))
 		goto out;
 	page = virt_to_page(ptr);
 	if (unlikely(!PageSlab(page)))
diff --git a/mm/util.c b/mm/util.c
index 834db7be240..f5712e8964b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,6 +186,27 @@ void kzfree(const void *p)
 }
 EXPORT_SYMBOL(kzfree);
 
+int kern_ptr_validate(const void *ptr, unsigned long size)
+{
+	unsigned long addr = (unsigned long)ptr;
+	unsigned long min_addr = PAGE_OFFSET;
+	unsigned long align_mask = sizeof(void *) - 1;
+
+	if (unlikely(addr < min_addr))
+		goto out;
+	if (unlikely(addr > (unsigned long)high_memory - size))
+		goto out;
+	if (unlikely(addr & align_mask))
+		goto out;
+	if (unlikely(!kern_addr_valid(addr)))
+		goto out;
+	if (unlikely(!kern_addr_valid(addr + size - 1)))
+		goto out;
+	return 1;
+out:
+	return 0;
+}
+
 /*
  * strndup_user - duplicate an existing string from user space
  * @s: The string to duplicate
-- 
cgit v1.2.2


From d3e06e2b15590b70ea73733fc4612e4741ff46e0 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 7 Apr 2010 19:23:41 +0300
Subject: slub: Fix kmem_ptr_validate() for non-kernel pointers

As suggested by Linus, fix up kmem_ptr_validate() to handle non-kernel pointers
more graciously. The patch changes kmem_ptr_validate() to use the newly
introduced kern_ptr_validate() helper to check that a pointer is a valid kernel
pointer before we attempt to convert it into a 'struct page'.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index b364844a106..7d6c8b1ccf6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2386,6 +2386,9 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object)
 {
 	struct page *page;
 
+	if (!kern_ptr_validate(object, s->size))
+		return 0;
+
 	page = get_object_page(object);
 
 	if (!page || s != page->slab)
-- 
cgit v1.2.2


From d0e9fe1758f222f13ec893f856552d81a10d266d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 10 Apr 2010 10:36:19 -0700
Subject: Simplify and comment on anon_vma re-use for anon_vma_prepare()

This changes the anon_vma reuse case to require that we only reuse
simple anon_vma's - ie the case when the vma only has a single anon_vma
associated with it.

This means that a reuse of an anon_vma from an adjacent vma will always
guarantee that both vma's are associated not only with the same
anon_vma, they will also have the same anon_vma chain (of just a single
entry in this case).

And since anon_vma re-use was the only case where the same anon_vma
might be associated with different chains of anon_vma's, we now have the
case that every vma that shares the same anon_vma will always also have
the same chain.  That makes it much easier to think about merging vma's
that share the same anon_vma's: you can always just drop the other
anon_vma chain in anon_vma_merge() since you know that they are always
identical.

This also splits up the function to validate the anon_vma re-use, and
adds a lot of commentary about the possible races.

Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Borislav Petkov <bp@alien8.de> [ "That didn't fix it" ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 86 +++++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 62 insertions(+), 24 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 75557c639ad..acb023e2d35 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -824,6 +824,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	return NULL;
 }
 
+/*
+ * Rough compatbility check to quickly see if it's even worth looking
+ * at sharing an anon_vma.
+ *
+ * They need to have the same vm_file, and the flags can only differ
+ * in things that mprotect may change.
+ *
+ * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
+ * we can merge the two vma's. For example, we refuse to merge a vma if
+ * there is a vm_ops->close() function, because that indicates that the
+ * driver is doing some kind of reference counting. But that doesn't
+ * really matter for the anon_vma sharing case.
+ */
+static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
+{
+	return a->vm_end == b->vm_start &&
+		mpol_equal(vma_policy(a), vma_policy(b)) &&
+		a->vm_file == b->vm_file &&
+		!((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
+		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
+}
+
+/*
+ * Do some basic sanity checking to see if we can re-use the anon_vma
+ * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
+ * the same as 'old', the other will be the new one that is trying
+ * to share the anon_vma.
+ *
+ * NOTE! This runs with mm_sem held for reading, so it is possible that
+ * the anon_vma of 'old' is concurrently in the process of being set up
+ * by another page fault trying to merge _that_. But that's ok: if it
+ * is being set up, that automatically means that it will be a singleton
+ * acceptable for merging, so we can do all of this optimistically. But
+ * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
+ *
+ * IOW: that the "list_is_singular()" test on the anon_vma_chain only
+ * matters for the 'stable anon_vma' case (ie the thing we want to avoid
+ * is to return an anon_vma that is "complex" due to having gone through
+ * a fork).
+ *
+ * We also make sure that the two vma's are compatible (adjacent,
+ * and with the same memory policies). That's all stable, even with just
+ * a read lock on the mm_sem.
+ */
+static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
+{
+	if (anon_vma_compatible(a, b)) {
+		struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
+
+		if (anon_vma && list_is_singular(&old->anon_vma_chain))
+			return anon_vma;
+	}
+	return NULL;
+}
+
 /*
  * find_mergeable_anon_vma is used by anon_vma_prepare, to check
  * neighbouring vmas for a suitable anon_vma, before it goes off
@@ -834,28 +889,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
  */
 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
 {
+	struct anon_vma *anon_vma;
 	struct vm_area_struct *near;
-	unsigned long vm_flags;
 
 	near = vma->vm_next;
 	if (!near)
 		goto try_prev;
 
-	/*
-	 * Since only mprotect tries to remerge vmas, match flags
-	 * which might be mprotected into each other later on.
-	 * Neither mlock nor madvise tries to remerge at present,
-	 * so leave their flags as obstructing a merge.
-	 */
-	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
-	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
-
-	if (near->anon_vma && vma->vm_end == near->vm_start &&
- 			mpol_equal(vma_policy(vma), vma_policy(near)) &&
-			can_vma_merge_before(near, vm_flags,
-				NULL, vma->vm_file, vma->vm_pgoff +
-				((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
-		return near->anon_vma;
+	anon_vma = reusable_anon_vma(near, vma, near);
+	if (anon_vma)
+		return anon_vma;
 try_prev:
 	/*
 	 * It is potentially slow to have to call find_vma_prev here.
@@ -868,14 +911,9 @@ try_prev:
 	if (!near)
 		goto none;
 
-	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
-	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
-
-	if (near->anon_vma && near->vm_end == vma->vm_start &&
-  			mpol_equal(vma_policy(near), vma_policy(vma)) &&
-			can_vma_merge_after(near, vm_flags,
-				NULL, vma->vm_file, vma->vm_pgoff))
-		return near->anon_vma;
+	anon_vma = reusable_anon_vma(near, near, vma);
+	if (anon_vma)
+		return anon_vma;
 none:
 	/*
 	 * There's no absolute need to look only at touching neighbours:
-- 
cgit v1.2.2


From 287d97ac032136724143cde8d5964b414d562ee3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 10 Apr 2010 15:22:30 -0700
Subject: vma_adjust: fix the copying of anon_vma chains

When we move the boundaries between two vma's due to things like
mprotect, we need to make sure that the anon_vma of the pages that got
moved from one vma to another gets properly copied around.  And that was
not always the case, in this rather hard-to-follow code sequence.

Clarify the code, and fix it so that it copies the anon_vma from the
right source.

Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Borislav Petkov <bp@alien8.de> [ "Yeah, not so much this one either" ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index acb023e2d35..f90ea92f755 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -507,11 +507,12 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	struct address_space *mapping = NULL;
 	struct prio_tree_root *root = NULL;
 	struct file *file = vma->vm_file;
-	struct anon_vma *anon_vma = NULL;
 	long adjust_next = 0;
 	int remove_next = 0;
 
 	if (next && !insert) {
+		struct vm_area_struct *exporter = NULL;
+
 		if (end >= next->vm_end) {
 			/*
 			 * vma expands, overlapping all the next, and
@@ -519,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 			 */
 again:			remove_next = 1 + (end > next->vm_end);
 			end = next->vm_end;
-			anon_vma = next->anon_vma;
+			exporter = next;
 			importer = vma;
 		} else if (end > next->vm_start) {
 			/*
@@ -527,7 +528,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 			 * mprotect case 5 shifting the boundary up.
 			 */
 			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
-			anon_vma = next->anon_vma;
+			exporter = next;
 			importer = vma;
 		} else if (end < vma->vm_end) {
 			/*
@@ -536,28 +537,19 @@ again:			remove_next = 1 + (end > next->vm_end);
 			 * mprotect case 4 shifting the boundary down.
 			 */
 			adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
-			anon_vma = next->anon_vma;
+			exporter = vma;
 			importer = next;
 		}
-	}
 
-	/*
-	 * When changing only vma->vm_end, we don't really need anon_vma lock.
-	 */
-	if (vma->anon_vma && (insert || importer || start != vma->vm_start))
-		anon_vma = vma->anon_vma;
-	if (anon_vma) {
 		/*
 		 * Easily overlooked: when mprotect shifts the boundary,
 		 * make sure the expanding vma has anon_vma set if the
 		 * shrinking vma had, to cover any anon pages imported.
 		 */
-		if (importer && !importer->anon_vma) {
-			/* Block reverse map lookups until things are set up. */
-			if (anon_vma_clone(importer, vma)) {
+		if (exporter && exporter->anon_vma && !importer->anon_vma) {
+			if (anon_vma_clone(importer, exporter))
 				return -ENOMEM;
-			}
-			importer->anon_vma = anon_vma;
+			importer->anon_vma = exporter->anon_vma;
 		}
 	}
 
-- 
cgit v1.2.2


From 646d87b481dab4ba8301716600dfd276605b0ab0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 11 Apr 2010 17:15:03 -0700
Subject: anon_vma: clone the anon_vma chain in the right order

We want to walk the chain in reverse order when cloning it, so that the
order of the result chain will be the same as the order in the source
chain.  When we add entries to the chain, they go at the head of the
chain, so we want to add the source head last.

Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Borislav Petkov <bp@alien8.de> [ "No, it still oopses" ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index eaa7a09eb72..ee97d38ed7d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -182,7 +182,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
 	struct anon_vma_chain *avc, *pavc;
 
-	list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
+	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
 		avc = anon_vma_chain_alloc();
 		if (!avc)
 			goto enomem_failure;
-- 
cgit v1.2.2


From ea90002b0fa7bdee86ec22eba1d951f30bf043a6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 12 Apr 2010 12:44:29 -0700
Subject: anonvma: when setting up page->mapping, we need to pick the _oldest_
 anonvma

Otherwise we might be mapping in a page in a new mapping, but that page
(through the swapcache) would later be mapped into an old mapping too.
The page->mapping must be the case that works for everybody, not just
the mapping that happened to page it in first.

Here's the scenario:

 - page gets allocated/mapped by process A. Let's call the anon_vma we
   associate the page with 'A' to keep it easy to track.

 - Process A forks, creating process B. The anon_vma in B is 'B', and has
   a chain that looks like 'B' -> 'A'. Everything is fine.

 - Swapping happens. The page (with mapping pointing to 'A') gets swapped
   out (perhaps not to disk - it's enough to assume that it's just not
   mapped any more, and lives entirely in the swap-cache)

 - Process B pages it in, which goes like this:

        do_swap_page ->
          page = lookup_swap_cache(entry);
         ...
          set_pte_at(mm, address, page_table, pte);
          page_add_anon_rmap(page, vma, address);

   And think about what happens here!

   In particular, what happens is that this will now be the "first"
   mapping of that page, so page_add_anon_rmap() used to do

        if (first)
                __page_set_anon_rmap(page, vma, address);

   and notice what anon_vma it will use? It will use the anon_vma for
   process B!

   What happens then? Trivial: process 'A' also pages it in (nothing
   happens, it's not the first mapping), and then process 'B' execve's
   or exits or unmaps, making anon_vma B go away.

   End result: process A has a page that points to anon_vma B, but
   anon_vma B does not exist any more.  This can go on forever.  Forget
   about RCU grace periods, forget about locking, forget anything like
   that.  The bug is simply that page->mapping points to an anon_vma
   that was correct at one point, but was _not_ the one that was shared
   by all users of that possible mapping.

Changing it to always use the deepest anon_vma in the anonvma chain gets
us to the safest model.

This can be improved in certain cases: if we know the page is private to
just this particular mapping (for example, it's a new page, or it is the
only swapcache entry), we could pick the top (most specific) anon_vma.

But that's a future optimization. Make it _work_ reliably first.

Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Borislav Petkov <bp@alien8.de> [ "What do you know, I think you fixed it!" ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index ee97d38ed7d..4bad3267537 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -734,9 +734,20 @@ void page_move_anon_rmap(struct page *page,
 static void __page_set_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma_chain *avc;
+	struct anon_vma *anon_vma;
+
+	BUG_ON(!vma->anon_vma);
+
+	/*
+	 * We must use the _oldest_ possible anon_vma for the page mapping!
+	 *
+	 * So take the last AVC chain entry in the vma, which is the deepest
+	 * ancestor, and use the anon_vma from that.
+	 */
+	avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
+	anon_vma = avc->anon_vma;
 
-	BUG_ON(!anon_vma);
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 	page->mapping = (struct address_space *) anon_vma;
 	page->index = linear_page_index(vma, address);
-- 
cgit v1.2.2


From e8a03feb54ca7f1768bbdc2b491f9ef654e6d01d Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Wed, 14 Apr 2010 17:59:28 -0400
Subject: rmap: add exclusively owned pages to the newest anon_vma

The recent anon_vma fixes cause many anonymous pages to end up
in the parent process anon_vma, even when the page is exclusively
owned by the current process.

Adding exclusively owned anonymous pages to the top anon_vma
reduces rmap scanning overhead, especially in workloads with
forking servers.

This patch adds a parameter to __page_set_anon_rmap that can
be used to indicate whether or not the added page is exclusively
owned by the current process.

Pages added through page_add_new_anon_rmap are exclusively
owned by the current process, and can be added to the top
anon_vma.

Pages added through page_add_anon_rmap can be either shared
or exclusively owned, so we do the conservative thing and
add it to the oldest anon_vma.

A next step would be to add the exclusive parameter to
page_add_anon_rmap, to be used from functions where we do
know for sure whether a page is exclusively owned.

Signed-off-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Lightly-tested-by: Borislav Petkov <bp@alien8.de>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
[ Edited to look nicer  - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index 4bad3267537..526704e8215 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -730,23 +730,28 @@ void page_move_anon_rmap(struct page *page,
  * @page:	the page to add the mapping to
  * @vma:	the vm area in which the mapping is added
  * @address:	the user virtual address mapped
+ * @exclusive:	the page is exclusively owned by the current process
  */
 static void __page_set_anon_rmap(struct page *page,
-	struct vm_area_struct *vma, unsigned long address)
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
 {
-	struct anon_vma_chain *avc;
-	struct anon_vma *anon_vma;
+	struct anon_vma *anon_vma = vma->anon_vma;
 
-	BUG_ON(!vma->anon_vma);
+	BUG_ON(!anon_vma);
 
 	/*
-	 * We must use the _oldest_ possible anon_vma for the page mapping!
+	 * If the page isn't exclusively mapped into this vma,
+	 * we must use the _oldest_ possible anon_vma for the
+	 * page mapping!
 	 *
-	 * So take the last AVC chain entry in the vma, which is the deepest
-	 * ancestor, and use the anon_vma from that.
+	 * So take the last AVC chain entry in the vma, which is
+	 * the deepest ancestor, and use the anon_vma from that.
 	 */
-	avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
-	anon_vma = avc->anon_vma;
+	if (!exclusive) {
+		struct anon_vma_chain *avc;
+		avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
+		anon_vma = avc->anon_vma;
+	}
 
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 	page->mapping = (struct address_space *) anon_vma;
@@ -802,7 +807,7 @@ void page_add_anon_rmap(struct page *page,
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	if (first)
-		__page_set_anon_rmap(page, vma, address);
+		__page_set_anon_rmap(page, vma, address, 0);
 	else
 		__page_check_anon_rmap(page, vma, address);
 }
@@ -824,7 +829,7 @@ void page_add_new_anon_rmap(struct page *page,
 	SetPageSwapBacked(page);
 	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
 	__inc_zone_page_state(page, NR_ANON_PAGES);
-	__page_set_anon_rmap(page, vma, address);
+	__page_set_anon_rmap(page, vma, address, 1);
 	if (page_evictable(page, vma))
 		lru_cache_add_lru(page, LRU_ACTIVE_ANON);
 	else
-- 
cgit v1.2.2