From 73c101011926c5832e6e141682180c4debe2cf45 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Mar 2011 13:19:51 +0100 Subject: block: initial patch for on-stack per-task plugging This patch adds support for creating a queuing context outside of the queue itself. This enables us to batch up pieces of IO before grabbing the block device queue lock and submitting them to the IO scheduler. The context is created on the stack of the process and assigned in the task structure, so that we can auto-unplug it if we hit a schedule event. The current queue plugging happens implicitly if IO is submitted to an empty device, yet callers have to remember to unplug that IO when they are going to wait for it. This is an ugly API and has caused bugs in the past. Additionally, it requires hacks in the vm (->sync_page() callback) to handle that logic. By switching to an explicit plugging scheme we make the API a lot nicer and can get rid of the ->sync_page() hack in the vm. Signed-off-by: Jens Axboe --- kernel/exit.c | 1 + kernel/fork.c | 3 +++ kernel/sched.c | 12 ++++++++++++ 3 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f9a45ebcc7b1..6a488ad2dce5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code) profile_task_exit(tsk); WARN_ON(atomic_read(&tsk->fs_excl)); + WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); diff --git a/kernel/fork.c b/kernel/fork.c index 25e429152ddc..027c80e5162f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1204,6 +1204,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; +#ifdef CONFIG_BLOCK + p->plug = NULL; +#endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..ca098bf4cc65 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3978,6 +3978,16 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } + /* + * If we are going to sleep and we have plugged IO queued, make + * sure to submit it to avoid deadlocks. + */ + if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) { + raw_spin_unlock(&rq->lock); + blk_flush_plug(prev); + raw_spin_lock(&rq->lock); + } + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) @@ -5333,6 +5343,7 @@ void __sched io_schedule(void) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; schedule(); current->in_iowait = 0; @@ -5348,6 +5359,7 @@ long __sched io_schedule_timeout(long timeout) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; ret = schedule_timeout(timeout); current->in_iowait = 0; -- cgit v1.2.2 From 721a9602e6607417c6bc15b18e97a2f35266c690 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Mar 2011 11:56:30 +0100 Subject: block: kill off REQ_UNPLUG With the plugging now being explicitly controlled by the submitter, callers need not pass down unplugging hints to the block layer. If they want to unplug, it's because they manually plugged on their own - in which case, they should just unplug at will. 
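As an illustration (not part of either patch): from a submitter's point of view, the explicit scheme introduced here looks roughly like the following sketch, where bio_a and bio_b stand for already-built bios.

	struct blk_plug plug;

	blk_start_plug(&plug);		/* install an on-stack plug on current */
	submit_bio(WRITE, bio_a);	/* batched; queue lock not taken yet */
	submit_bio(WRITE, bio_b);
	blk_finish_plug(&plug);		/* unplug: hand the batch to the IO scheduler */

If the task blocks while plugged, the schedule() hook added above flushes the pending IO via blk_flush_plug(), so a forgotten unplug can no longer deadlock a waiter.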
Signed-off-by: Jens Axboe --- kernel/power/block_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 83bbc7c02df9..d09dd10c5a5e 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -28,7 +28,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct page *page, struct bio **bio_chain) { - const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; + const int bio_rw = rw | REQ_SYNC; struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); -- cgit v1.2.2 From 805f6b5e1cbfedfb9b3d354013e7f4b13a79270f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 11 Mar 2011 20:11:59 +0100 Subject: blktrace: Use rq->cmd_flags directly in blk_add_trace_rq. In blk_add_trace_rq, we only took the lower 2 bits from the request's cmd_flags and did some checking for discard, so most of the other flags (e.g. REQ_SYNC) were missing. For example, with a sync write, after blkparse we get: 8,16 1 1 0.001776503 7509 A WS 1349632 + 1024 <- (8,17) 1347584 8,16 1 2 0.001776813 7509 Q WS 1349632 + 1024 [dd] 8,16 1 3 0.001780395 7509 G WS 1349632 + 1024 [dd] 8,16 1 5 0.001783186 7509 I W 1349632 + 1024 [dd] 8,16 1 11 0.001816987 7509 D W 1349632 + 1024 [dd] 8,16 0 2 0.006218192 0 C W 1349632 + 1024 [0] Now that we have integrated the flags of both bio and request, it is safe to pass rq->cmd_flags directly to __blk_add_trace. With this patch, after a sync write we get: 8,16 1 1 0.001776900 5425 A WS 1189888 + 1024 <- (8,17) 1187840 8,16 1 2 0.001777179 5425 Q WS 1189888 + 1024 [dd] 8,16 1 3 0.001780797 5425 G WS 1189888 + 1024 [dd] 8,16 1 5 0.001783402 5425 I WS 1189888 + 1024 [dd] 8,16 1 11 0.001817468 5425 D WS 1189888 + 1024 [dd] 8,16 0 2 0.005640709 0 C WS 1189888 + 1024 [0] Signed-off-by: Tao Ma Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cbafed7d4f38..7aa40f8e182d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) + u32 what) { struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; if (likely(!bt)) return; - if (rq->cmd_flags & REQ_DISCARD) - rw |= REQ_DISCARD; - - if (rq->cmd_flags & REQ_SECURE) - rw |= REQ_SECURE; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, - what, rq->errors, 0, NULL); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq->cmd_flags, what, rq->errors, 0, NULL); } } -- cgit v1.2.2 From ed3cd4a86562eee79de25b567a00e648cc3dc2bf Mon Sep 17 00:00:00 2001 From: matt mooney Date: Fri, 14 Jan 2011 06:12:24 -0800 Subject: kernel: change to new flag variable Replace EXTRA_CFLAGS with ccflags-y.
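A sketch of the kbuild idiom (CONFIG_FOO and the flag values are placeholders; the conditional form matches the hunks below):

	# old, deprecated form
	EXTRA_CFLAGS := -DDEBUG
	# new form; conditional flags hang off the config symbol
	ccflags-y := -DDEBUG
	ccflags-$(CONFIG_FOO) += -DFOO_DEBUG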
Signed-off-by: matt mooney Acked-by: WANG Cong Signed-off-by: Michal Marek --- kernel/gcov/Makefile | 2 +- kernel/power/Makefile | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 3f761001d517..e97ca59e2520 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,3 @@ -EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' +ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c350e18b53e3..c5ebc6a90643 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,4 +1,5 @@ -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG + +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o -- cgit v1.2.2 From d57f078b193981d1b7d24193f3118c6b806db0ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 18 Mar 2011 16:54:31 +0000 Subject: KGDB: Notify GDB of machine halt, reboot or power off Notify GDB of the machine halting, rebooting or powering off by sending it an exited command (remote protocol command 'W'). This is done by calling: void gdbstub_exit(int status) from the arch's machine_{halt,restart,power_off}() functions with an appropriate exit status to be reported to GDB. Signed-off-by: David Howells --- kernel/debug/gdbstub.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 481a7bd2dfe7..a11db956dd62 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) put_packet(remcom_out_buffer); return 0; } + +/** + * gdbstub_exit - Send an exit message to GDB + * @status: The exit code to report. + */ +void gdbstub_exit(int status) +{ + unsigned char checksum, ch, buffer[3]; + int loop; + + buffer[0] = 'W'; + buffer[1] = hex_asc_hi(status); + buffer[2] = hex_asc_lo(status); + + dbg_io_ops->write_char('$'); + checksum = 0; + + for (loop = 0; loop < 3; loop++) { + ch = buffer[loop]; + checksum += ch; + dbg_io_ops->write_char(ch); + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + + /* make sure the output is flushed, lest the bootloader clobber it */ + dbg_io_ops->flush(); +} -- cgit v1.2.2 From 16addf954d3954a72fd56abc02ffcba3c18529a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Mar 2011 09:34:53 -0700 Subject: sched: Fix yield_to kernel-doc Add missing function parameters for yield_to(): Warning(kernel/sched.c:5470): No description found for parameter 'p' Warning(kernel/sched.c:5470): No description found for parameter 'preempt' Signed-off-by: Randy Dunlap Cc: Peter Zijlstra LKML-Reference: <20110318093453.8f7489a4.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 58d66ea7d200..052120d67706 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5467,6 +5467,8 @@ EXPORT_SYMBOL(yield); * yield_to - yield the current processor to another thread in * your thread group, or accelerate that thread toward the * processor it's on. 
+ * @p: target task + * @preempt: whether task preemption is allowed or not * * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. -- cgit v1.2.2 From da48524eb20662618854bb3df2db01fc65f3070c Mon Sep 17 00:00:00 2001 From: Julien Tinnes Date: Fri, 18 Mar 2011 15:05:21 -0700 Subject: Prevent rt_sigqueueinfo and rt_tgsigqueueinfo from spoofing the signal code Userland should be able to trust the pid and uid of the sender of a signal if the si_code is SI_TKILL. Unfortunately, the kernel has historically allowed sigqueueinfo() to send any si_code at all (as long as it was negative - to distinguish it from kernel-generated signals like SIGILL etc), so it could spoof a SI_TKILL with incorrect siginfo values. Happily, it looks like glibc has always set si_code to the appropriate SI_QUEUE, so there is probably no actual user code that ever uses anything but the appropriate SI_QUEUE flag. So just tighten the check for si_code (we used to allow any negative value), and add a (one-time) warning in case there are binaries out there that might depend on using other si_code values. Signed-off-by: Julien Tinnes Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/signal.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 4e3cff10fdce..31751868de88 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2421,9 +2421,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return -EFAULT; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info.si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info.si_code != SI_QUEUE) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info.si_code < 0); return -EPERM; + } info.si_signo = sig; /* POSIX.1b doesn't mention process groups. */ @@ -2437,9 +2441,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) return -EINVAL; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info->si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info->si_code != SI_QUEUE) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); return -EPERM; + } info->si_signo = sig; return do_send_specific(tgid, pid, sig, info); -- cgit v1.2.2 From 1106b6997df7d0c0487e21fd9c9dd2ce3d4a52db Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Feb 2011 17:35:34 +0100 Subject: tracing: Fix set_ftrace_filter probe function display If one or more function probes (like traceon) are enabled, and there's no other function filter, the first probe func is skipped (which one depends on the position in the hash). $ echo sys_open:traceon sys_close:traceon > ./set_ftrace_filter $ cat set_ftrace_filter #### all functions enabled #### sys_close:traceon:unlimited $ The reason was that, in the case of no other function filter, func_pos was not properly updated before calling t_hash_start.
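With func_pos kept in sync, the same sequence is expected to list both probes (illustrative output):

	$ cat set_ftrace_filter
	#### all functions enabled ####
	sys_open:traceon:unlimited
	sys_close:traceon:unlimited
	$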
Signed-off-by: Jiri Olsa LKML-Reference: <1297874134-7008-1-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 888b611897d3..c075f4ea6b94 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) return t_hash_next(m, pos); (*pos)++; - iter->pos = *pos; + iter->pos = iter->func_pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) return t_hash_start(m, pos); @@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (!rec) return t_hash_start(m, pos); - iter->func_pos = *pos; iter->func = rec; return iter; -- cgit v1.2.2 From 8d2587970b8bdf7c8d9208e3f4bb93182aef1a0f Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Tue, 22 Mar 2011 16:30:13 -0700 Subject: cgroups: if you list_empty() a head then don't list_del() it list_del() leaves poison in the prev and next pointers. The next list_empty() will compare those poisons, and say the list isn't empty. Any list operations that assume the node is on a list because of such a check will be fooled into dereferencing poison. One needs to INIT the node after the del, and fortunately there's already a wrapper for that - list_del_init(). Some of the dels are followed by deallocations, so can be ignored, and one can be merged with an add to make a move. Apart from that, I erred on the side of caution in making nodes list_empty()-queriable. Signed-off-by: Phil Carmody Reviewed-by: Paul Menage Cc: Li Zefan Acked-by: Kirill A. Shutemov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95362d15128c..e31b220a743d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - list_del(&tsk->cg_list); - list_add(&tsk->cg_list, &newcg->tasks); - } + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); write_unlock(&css_set_lock); for_each_subsys(root, ss) { @@ -3655,12 +3653,12 @@ again: spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) - list_del(&cgrp->release_list); + list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ - list_del(&cgrp->sibling); + list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); d = dget(cgrp->dentry); @@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ - list_del(&ss->sibling); + list_del_init(&ss->sibling); /* * disentangle the css from all css_sets attached to the dummytop. 
as @@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) if (!list_empty(&tsk->cg_list)) { write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) - list_del(&tsk->cg_list); + list_del_init(&tsk->cg_list); write_unlock(&css_set_lock); } -- cgit v1.2.2 From 504f52b5439aaf26d3e2c1d45ec10fce38c8dd27 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:41 -0700 Subject: mm: NUMA aware alloc_task_struct_node() All kthreads being created from a single helper task, they all use memory from a single node for their kernel stack and task struct. This patch suite creates kthread_create_on_cpu(), adding a 'cpu' parameter to parameters already used by kthread_create(). This parameter serves in allocating memory for the new kthread on its memory node if available. Users of this new function are : ksoftirqd, kworker, migration, pktgend... This patch: Add a node parameter to alloc_task_struct(), and change its name to alloc_task_struct_node() This change is needed to allow NUMA aware kthread_create_on_cpu() Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 05b92c457010..cffbe8a4e1fc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -109,8 +109,10 @@ int nr_processes(void) } #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) -# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +# define alloc_task_struct_node(node) \ + kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) +# define free_task_struct(tsk) \ + kmem_cache_free(task_struct_cachep, (tsk)) static struct kmem_cache *task_struct_cachep; #endif @@ -249,12 +251,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - + int node = numa_node_id(); int err; prepare_to_copy(orig); - tsk = alloc_task_struct(); + tsk = alloc_task_struct_node(node); if (!tsk) return NULL; -- cgit v1.2.2 From b6a84016bd2598e35ead635147fa53619982648d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:42 -0700 Subject: mm: NUMA aware alloc_thread_info_node() Add a node parameter to alloc_thread_info(), and change its name to alloc_thread_info_node() This change is needed to allow NUMA aware kthread_create_on_cpu() Signed-off-by: Eric Dumazet Acked-by: David S. 
Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index cffbe8a4e1fc..cbc6adc6e891 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -117,14 +117,17 @@ static struct kmem_cache *task_struct_cachep; #endif #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR -static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, + int node) { #ifdef CONFIG_DEBUG_STACK_USAGE gfp_t mask = GFP_KERNEL | __GFP_ZERO; #else gfp_t mask = GFP_KERNEL; #endif - return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) @@ -260,7 +263,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) if (!tsk) return NULL; - ti = alloc_thread_info(tsk); + ti = alloc_thread_info_node(tsk, node); if (!ti) { free_task_struct(tsk); return NULL; -- cgit v1.2.2 From 207205a2ba2655652fe46a60b49838af6c16a919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:44 -0700 Subject: kthread: NUMA aware kthread_create_on_node() All kthreads being created from a single helper task, they all use memory from a single node for their kernel stack and task struct. This patch suite creates kthread_create_on_node(), adding a 'cpu' parameter to parameters already used by kthread_create(). This parameter serves in allocating memory for the new kthread on its memory node if possible. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- kernel/kthread.c | 31 +++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index cbc6adc6e891..a8f64f8ec7e1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -254,7 +255,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - int node = numa_node_id(); + int node = tsk_fork_get_node(orig); int err; prepare_to_copy(orig); diff --git a/kernel/kthread.c b/kernel/kthread.c index c55afba990a3..684ab3f7dd72 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ struct kthread_create_info /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; + int node; /* Result passed back to kthread_create() from kthreadd. 
*/ struct task_struct *result; @@ -98,10 +99,23 @@ static int kthread(void *_create) do_exit(ret); } +/* called from do_fork() to get node information for about to be created task */ +int tsk_fork_get_node(struct task_struct *tsk) +{ +#ifdef CONFIG_NUMA + if (tsk == kthreadd_task) + return tsk->pref_node_fork; +#endif + return numa_node_id(); +} + static void create_kthread(struct kthread_create_info *create) { int pid; +#ifdef CONFIG_NUMA + current->pref_node_fork = create->node; +#endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { @@ -111,15 +125,18 @@ static void create_kthread(struct kthread_create_info *create) } /** - * kthread_create - create a kthread. + * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. + * @node: memory node number. * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give -1. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a * standalone thread for which noone will call kthread_stop(), or @@ -129,15 +146,17 @@ static void create_kthread(struct kthread_create_info *create) * * Returns a task_struct or ERR_PTR(-ENOMEM). */ -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], - ...) +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, + int node, + const char namefmt[], + ...) { struct kthread_create_info create; create.threadfn = threadfn; create.data = data; + create.node = node; init_completion(&create.done); spin_lock(&kthread_create_lock); @@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } return create.result; } -EXPORT_SYMBOL(kthread_create); +EXPORT_SYMBOL(kthread_create_on_node); /** * kthread_bind - bind a just-created kthread to a cpu. -- cgit v1.2.2 From 94dcf29a11b3d20a28790598d701f98484a969da Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:45 -0700 Subject: kthread: use kthread_create_on_node() ksoftirqd, kworker, migration, and pktgend kthreads can be created with kthread_create_on_node(), to get proper NUMA affinities for their stack and task_struct. Signed-off-by: Eric Dumazet Acked-by: David S. 
Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Acked-by: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 5 ++++- kernel/stop_machine.c | 6 ++++-- kernel/workqueue.c | 6 ++++-- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 56e5dec837f0..735d87095172 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_create_on_node(run_ksoftirqd, + hcpu, + cpu_to_node(hotcpu), + "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return notifier_from_errno(PTR_ERR(p)); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2df820b03beb..e3516b29076c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: BUG_ON(stopper->thread || stopper->enabled || !list_empty(&stopper->works)); - p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", - cpu); + p = kthread_create_on_node(cpu_stopper_thread, + stopper, + cpu_to_node(cpu), + "migration/%d", cpu); if (IS_ERR(p)) return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ca7ce9ce754..04ef830690ec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1366,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) worker->id = id; if (!on_unbound_cpu) - worker->task = kthread_create(worker_thread, worker, - "kworker/%u:%d", gcwq->cpu, id); + worker->task = kthread_create_on_node(worker_thread, + worker, + cpu_to_node(gcwq->cpu), + "kworker/%u:%d", gcwq->cpu, id); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d", id); -- cgit v1.2.2 From d404ab0a1133e95557bb7deab2a49b348dfeba85 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 22 Mar 2011 16:34:04 -0700 Subject: move x86 specific oops=panic to generic code The oops=panic cmdline option is not x86-specific, so move it to generic code. Update documentation. Signed-off-by: Olaf Hering Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87a1704..69231670eb95 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); -- cgit v1.2.2 From 34db18a054c600b6f81787165669dc572fe4de25 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 22 Mar 2011 16:34:06 -0700 Subject: smp: move smp setup functions to kernel/smp.c Move setup_nr_cpu_ids(), smp_init() and some other SMP boot parameter setup functions from init/main.c to kernel/smp.c, saving some #ifdef CONFIG_SMP.
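Illustrative command-line usage of the boot parameters handled by the moved code (see the kernel/smp.c hunk below):

	nosmp		# boot with the boot CPU only; SMP support disabled
	maxcpus=2	# activate at most two CPUs at boot
	nr_cpus=4	# hard-limit nr_cpu_ids to four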
Signed-off-by: WANG Cong Cc: Rakib Mullick Cc: David Howells Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Tejun Heo Cc: Arnd Bergmann Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 7cbd0f293df4..73a195193558 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -604,6 +604,87 @@ void ipi_call_unlock_irq(void) } #endif /* USE_GENERIC_SMP_HELPERS */ +/* Setup configured maximum number of CPUs to activate */ +unsigned int setup_max_cpus = NR_CPUS; +EXPORT_SYMBOL(setup_max_cpus); + + +/* + * Setup routine for controlling SMP activation + * + * Command-line option of "nosmp" or "maxcpus=0" will disable SMP + * activation entirely (the MPS table probe still happens, though). + * + * Command-line option of "maxcpus=", where is an integer + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to . + */ + +void __weak arch_disable_smp_support(void) { } + +static int __init nosmp(char *str) +{ + setup_max_cpus = 0; + arch_disable_smp_support(); + + return 0; +} + +early_param("nosmp", nosmp); + +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + +static int __init maxcpus(char *str) +{ + get_option(&str, &setup_max_cpus); + if (setup_max_cpus == 0) + arch_disable_smp_support(); + + return 0; +} + +early_param("maxcpus", maxcpus); + +/* Setup number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + +/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ +void __init setup_nr_cpu_ids(void) +{ + nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; +} + +/* Called by boot processor to activate the rest. */ +void __init smp_init(void) +{ + unsigned int cpu; + + /* FIXME: This should be done in userspace --RR */ + for_each_present_cpu(cpu) { + if (num_online_cpus() >= setup_max_cpus) + break; + if (!cpu_online(cpu)) + cpu_up(cpu); + } + + /* Any cleanup work */ + printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_cpus_done(setup_max_cpus); +} + /* * Call a function on all processors. May be used during early boot while * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead -- cgit v1.2.2 From 4d51985e484dd11d9047dfcd1278ec9ccfb435d5 Mon Sep 17 00:00:00 2001 From: Michael Rodriguez Date: Tue, 22 Mar 2011 16:34:07 -0700 Subject: kernel/cpu.c: fix many errors related to style. Change the printk() calls to have the KERN_INFO/KERN_ERROR stuff, and fixes other coding style errors. Not _all_ of them are gone, though. 
[akpm@linux-foundation.org: revert the bits I disagree with] Signed-off-by: Michael Rodriguez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 156cc5556140..c95fc4df0faa 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v) { BUG_ON(cpu_notify(val, v)); } - EXPORT_SYMBOL(register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) @@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); - return 0; } @@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { nr_calls--; @@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk("%s: attempt to bring up CPU %u failed\n", + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", __func__, cpu); goto out_notify; } @@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk("Enabling non-boot CPUs ...\n"); + printk(KERN_INFO "Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { - printk("CPU%d is up\n", cpu); + printk(KERN_INFO "CPU%d is up\n", cpu); continue; } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); @@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) */ /* cpu_bit_bitmap[0] is empty - so we can back into it */ -#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) +#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) -- cgit v1.2.2 From 9bfb23fc4a481650e60d22dbe84c0fd5a9d49bba Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Mar 2011 16:34:09 -0700 Subject: sys_unshare: remove the dead CLONE_THREAD/SIGHAND/VM code Cleanup: kill the dead code which does nothing but complicates the code and confuses the reader. sys_unshare(CLONE_THREAD/SIGHAND/VM) is not really implemented, and I doubt very much it will ever work. At least, nobody even tried since the original 99d1419d96d7df9cfa56 ("unshare system call -v5: system call handler function") was applied more than 4 years ago. And the code is not consistent. unshare_thread() always fails unconditionally, while unshare_sighand() and unshare_vm() pretend to work if there is nothing to unshare. Remove unshare_thread(), unshare_sighand(), unshare_vm() helpers and related variables and add a simple CLONE_THREAD | CLONE_SIGHAND| CLONE_VM check into check_unshare_flags(). Also, move the "CLONE_NEWNS needs CLONE_FS" check from check_unshare_flags() to sys_unshare(). This looks more consistent and matches the similar do_sysvsem check in sys_unshare(). Note: with or without this patch "atomic_read(mm->mm_users) > 1" can give a false positive due to get_task_mm(). Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Janak Desai Cc: Daniel Lezcano Cc: "Eric W. 
Biederman" Cc: KOSAKI Motohiro Cc: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 123 ++++++++++++---------------------------------------------- 1 file changed, 25 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a8f64f8ec7e1..f2b494d7c557 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1519,38 +1519,24 @@ void __init proc_caches_init(void) } /* - * Check constraints on flags passed to the unshare system call and - * force unsharing of additional process context as appropriate. + * Check constraints on flags passed to the unshare system call. */ -static void check_unshare_flags(unsigned long *flags_ptr) +static int check_unshare_flags(unsigned long unshare_flags) { + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + return -EINVAL; /* - * If unsharing a thread from a thread group, must also - * unshare vm. - */ - if (*flags_ptr & CLONE_THREAD) - *flags_ptr |= CLONE_VM; - - /* - * If unsharing vm, must also unshare signal handlers. - */ - if (*flags_ptr & CLONE_VM) - *flags_ptr |= CLONE_SIGHAND; - - /* - * If unsharing namespace, must also unshare filesystem information. + * Not implemented, but pretend it works if there is nothing to + * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND + * needs to unshare vm. */ - if (*flags_ptr & CLONE_NEWNS) - *flags_ptr |= CLONE_FS; -} - -/* - * Unsharing of tasks created with CLONE_THREAD is not supported yet - */ -static int unshare_thread(unsigned long unshare_flags) -{ - if (unshare_flags & CLONE_THREAD) - return -EINVAL; + if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { + /* FIXME: get_task_mm() increments ->mm_users */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + } return 0; } @@ -1576,34 +1562,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) return 0; } -/* - * Unsharing of sighand is not supported yet - */ -static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) -{ - struct sighand_struct *sigh = current->sighand; - - if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) - return -EINVAL; - else - return 0; -} - -/* - * Unshare vm if it is being shared - */ -static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) -{ - struct mm_struct *mm = current->mm; - - if ((unshare_flags & CLONE_VM) && - (mm && atomic_read(&mm->mm_users) > 1)) { - return -EINVAL; - } - - return 0; -} - /* * Unshare file descriptor table if it is being shared */ @@ -1632,23 +1590,21 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp */ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { - int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct sighand_struct *new_sigh = NULL; - struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; + int err; - check_unshare_flags(&unshare_flags); - - /* Return -EINVAL for all unsupported flags */ - err = -EINVAL; - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + err = check_unshare_flags(unshare_flags); + if (err) goto bad_unshare_out; + /* + * If unsharing namespace, must also unshare filesystem information. 
+ */ + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1656,21 +1612,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_thread(unshare_flags))) - goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) - goto bad_unshare_cleanup_thread; - if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_fs; - if ((err = unshare_vm(unshare_flags, &new_mm))) - goto bad_unshare_cleanup_sigh; + goto bad_unshare_out; if ((err = unshare_fd(unshare_flags, &new_fd))) - goto bad_unshare_cleanup_vm; + goto bad_unshare_cleanup_fs; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1696,19 +1646,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) spin_unlock(&fs->lock); } - if (new_mm) { - mm = current->mm; - active_mm = current->active_mm; - current->mm = new_mm; - current->active_mm = new_mm; - if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&mm->oom_disable_count); - atomic_inc(&new_mm->oom_disable_count); - } - activate_mm(active_mm, new_mm); - new_mm = mm; - } - if (new_fd) { fd = current->files; current->files = new_fd; @@ -1725,20 +1662,10 @@ bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); -bad_unshare_cleanup_vm: - if (new_mm) - mmput(new_mm); - -bad_unshare_cleanup_sigh: - if (new_sigh) - if (atomic_dec_and_test(&new_sigh->count)) - kmem_cache_free(sighand_cachep, new_sigh); - bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); -bad_unshare_cleanup_thread: bad_unshare_out: return err; } -- cgit v1.2.2 From fef2c9bc1b54c0261324a96e948c0b849796e896 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 22 Mar 2011 16:34:16 -0700 Subject: kernel/watchdog.c: allow hardlockup to panic by default When a cpu is considered stuck, instead of limping along and just printing a warning, it is sometimes preferred to just panic, let kdump capture the vmcore and reboot. This gets the machine back into a stable state quickly while saving the info that got it into a stuck state to begin with. Add a Kconfig option to allow users to set the hardlockup to panic by default. Also add in a 'nmi_watchdog=nopanic' to override this. 
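Illustrative boot-line usage of the option parsed by hardlockup_panic_setup() in the hunk below:

	nmi_watchdog=panic	# panic the machine on a hard lockup
	nmi_watchdog=nopanic	# warn only, overriding the Kconfig default
	nmi_watchdog=0		# disable the watchdog entirely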
[akpm@linux-foundation.org: fix strncmp length] Signed-off-by: Don Zickus Acked-by: Peter Zijlstra Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18bb15776c57..054a67cca9da 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic; +static int hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_enabled = 0; return 1; -- cgit v1.2.2 From f99a99330f85a84c346ddeb4adc72dbfad9b9e3e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 22 Mar 2011 16:34:17 -0700 Subject: kernel/watchdog.c: always return NOTIFY_OK during cpu up/down events This patch addresses a couple of problems. One was that when the hardlockup detector failed to start, the softlockup detector also failed to start. There were valid cases when the hardlockup shouldn't start and that shouldn't block the softlockup (no lapic, bios controls perf counters). The second problem was that when the hardlockup detector failed to start on boxes (because there was no lapic, or the bios controlled the perf counters), it reported failure to the cpu notifier chain. This blocked the notifier from continuing to start other more critical pieces of cpu bring-up (in our case based on a 2.6.32 fork, it was the mce). As a result, during soft cpu online/offline testing, the system would panic when a cpu was offlined because the cpu notifier would succeed in processing a watchdog disable cpu event and would panic in the mce case as a result of un-initialized variables from a never executed cpu up event. I realized the hardlockup/softlockup cases are really just debugging aids and should never impede the progress of a cpu up/down event. Therefore I modified the code to always return NOTIFY_OK and instead rely on printks to inform the user of problems.
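The resulting notifier shape, as a sketch (do_watchdog_action() is a hypothetical stand-in for the enable/disable calls in the real callback):

	static int __cpuinit
	cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
	{
		int err = do_watchdog_action(action, hcpu);	/* hypothetical */

		if (err)
			printk(KERN_ERR "watchdog setup failed: %d\n", err);
		return NOTIFY_OK;	/* a debugging aid must not veto cpu bring-up */
	}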
Signed-off-by: Don Zickus Acked-by: Peter Zijlstra Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 054a67cca9da..140dce750450 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -418,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu) static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err; + int err = 0; /* enable the perf event */ err = watchdog_nmi_enable(cpu); - if (err) - return err; + + /* Regardless of err above, fall through and start softlockup */ /* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - return PTR_ERR(p); + if (!err) + /* if hardlockup hasn't already set this */ + err = PTR_ERR(p); + goto out; } kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; @@ -438,7 +441,8 @@ static int watchdog_enable(int cpu) wake_up_process(p); } - return 0; +out: + return err; } static void watchdog_disable(int cpu) @@ -550,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif /* CONFIG_HOTPLUG_CPU */ } - return notifier_from_errno(err); + + /* + * hardlockup and softlockup are not important enough + * to block cpu bring up. Just always succeed and + * rely on printk output to flag problems. + */ + return NOTIFY_OK; } static struct notifier_block __cpuinitdata cpu_nfb = { -- cgit v1.2.2 From 7bf693951a8e5f7e600a45b74d91d962a453146e Mon Sep 17 00:00:00 2001 From: "Fabio M. Di Nitto" Date: Tue, 22 Mar 2011 16:34:20 -0700 Subject: console: allow to retain boot console via boot option keep_bootcon On some architectures, the boot process involves de-registering the boot console (early boot), initialize drivers and then re-register the console. This mechanism introduces a window in which no printk can happen on the console and messages are buffered and then printed once the new console is available. If a kernel crashes during this window, all it's left on the boot console is "console [foo] enabled, bootconsole disabled" making debug of the crash rather 'interesting'. By adding "keep_bootcon" option, do not unregister the boot console, that will allow to printk everything that is happening up to the crash. The option is clearly meant only for debugging purposes as it introduces lots of duplicated info printed on console, but will make bug report from users easier as it doesn't require a kernel build just to figure out where we crash. Signed-off-by: Fabio M. Di Nitto Acked-by: David S. 
Miller Cc: Alan Cox Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 33284adb2189..2b591f252e55 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1316,6 +1316,18 @@ void console_start(struct console *console) } EXPORT_SYMBOL(console_start); +static int __read_mostly keep_bootcon; + +static int __init keep_bootcon_setup(char *str) +{ + keep_bootcon = 1; + printk(KERN_INFO "debug: skip boot console de-registration.\n"); + + return 0; +} + +early_param("keep_bootcon", keep_bootcon_setup); + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -1463,7 +1475,9 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + if (bcon && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && + !keep_bootcon) { /* we need to iterate through twice, to make sure we print * everything out, before we unregister the console(s) */ -- cgit v1.2.2 From fe3d8ad31cf51b062bbb8a9609eeb1d0c41a7f30 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 22 Mar 2011 16:34:21 -0700 Subject: console: prevent registered consoles from dumping old kernel message over again For a platform with many consoles like: "console=tty1 console=ttyMFD2 console=ttyS0 earlyprintk=mrst" Each time when the non "selected_console" (tty1 and ttyMFD2 here) get registered, the existing kernel message will be printed out on registered consoles again, the "mrst" early console will get some same message for 3 times, and "tty1" will get some for twice. As suggested by Andrew Morton, every time a new console is registered, it will be set as the "exclusive" console which will dump the already existing kernel messages. Signed-off-by: Feng Tang Cc: Greg KH Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2b591f252e55..a53607eea6d0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -112,6 +112,11 @@ static unsigned log_start; /* Index into log_buf: next char to be read by syslog static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ +/* + * If exclusive_console is non-NULL then only this console is to be printed to. 
+ */ +static struct console *exclusive_console; + /* * Array of consoles built from command line options (console=) */ @@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end) struct console *con; for_each_console(con) { + if (exclusive_console && con != exclusive_console) + continue; if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) @@ -1230,6 +1237,11 @@ void console_unlock(void) local_irq_restore(flags); } console_locked = 0; + + /* Release the exclusive_console once it is used */ + if (unlikely(exclusive_console)) + exclusive_console = NULL; + up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) @@ -1464,6 +1476,12 @@ void register_console(struct console *newcon) spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; spin_unlock_irqrestore(&logbuf_lock, flags); + /* + * We're about to replay the log buffer. Only do this to the + * just-registered console to avoid excessive message spam to + * the already-registered consoles. + */ + exclusive_console = newcon; } console_unlock(); console_sysfs_notify(); -- cgit v1.2.2 From 9f36e2c448007b54851e7e4fa48da97d1477a175 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Mar 2011 16:34:22 -0700 Subject: printk: use %pK for /proc/kallsyms and /proc/modules In an effort to reduce kernel address leaks that might be used to help target kernel privilege escalation exploits, this patch uses %pK when displaying addresses in /proc/kallsyms, /proc/modules, and /sys/module/*/sections/*. Note that this changes %x to %p, so some legitimately 0 values in /proc/kallsyms would have changed from 00000000 to "(null)". To avoid this, "(null)" is not used when using the "K" format. Anything that was already successfully parsing "(null)" in addition to full hex digits should have no problem with this change. (Thanks to Joe Perches for the suggestion.) Due to the %x to %p, "void *" casts are needed since these addresses are already "unsigned long" everywhere internally, due to their starting life as ELF section offsets. Signed-off-by: Kees Cook Cc: Eugene Teo Cc: Dan Rosenberg Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 10 ++++------ kernel/module.c | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..75dcca37d61a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -477,13 +477,11 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? 
toupper(iter->type) : tolower(iter->type); - seq_printf(m, "%0*lx %c %s\t[%s]\n", - (int)(2 * sizeof(void *)), - iter->value, type, iter->name, iter->module_name); + seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, + type, iter->name, iter->module_name); } else - seq_printf(m, "%0*lx %c %s\n", - (int)(2 * sizeof(void *)), - iter->value, iter->type, iter->name); + seq_printf(m, "%pK %c %s\n", (void *)iter->value, + iter->type, iter->name); return 0; } diff --git a/kernel/module.c b/kernel/module.c index efa290ea94bf..1f9f7bc56ca1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%lx\n", sattr->address); + return sprintf(buf, "0x%pK\n", (void *)sattr->address); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading": "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%p", mod->module_core); + seq_printf(m, " 0x%pK", mod->module_core); /* Taints info */ if (mod->taints) -- cgit v1.2.2 From 5af5bcb8d37f99ba415a1adc6da71051b84f93a5 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 22 Mar 2011 16:34:23 -0700 Subject: printk: allow setting DEFAULT_MESSAGE_LEVEL via Kconfig We've been burned by regressions/bugs which we later realized could have been triaged quicker if only we'd paid closer attention to dmesg. To make it easier to audit dmesg, we'd like to make DEFAULT_MESSAGE_LEVEL Kconfig-settable. That way we can set it to KERN_NOTICE and audit any messages <= KERN_WARNING. Signed-off-by: Mandeep Singh Baines Cc: Ingo Molnar Cc: Joe Perches Cc: Olof Johansson Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a53607eea6d0..da8ca817eae3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) /* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL /* We show everything that is MORE important than this.. 
*/ #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -- cgit v1.2.2 From 20dd67407160eac577656cd2f8ee9a1fead960b8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 23 Mar 2011 13:17:23 +0200 Subject: sched: Remove unused 'rq' variable and cpu_rq() call from alloc_fair_sched_group() Signed-off-by: Sergey Senozhatsky Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <20110323111722.GA4244@swordfish.minsk.epam.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 052120d67706..a361e20ec2cd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8443,7 +8443,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; struct sched_entity *se; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8456,8 +8455,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) -- cgit v1.2.2 From dec2960827c85253d76938dbfa909df3be34958b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 23 Mar 2011 14:38:28 +0200 Subject: lockdep: Remove unused 'factor' variable from lockdep_stats_show() Signed-off-by: Sergey Senozhatsky Cc: Peter Zijlstra LKML-Reference: <20110323123828.GB4244@swordfish.minsk.epam.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 1969d2fc4b36..71edd2f60c02 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, - sum_forward_deps = 0, factor = 0; + sum_forward_deps = 0; list_for_each_entry(class, &all_lock_classes, lock_entry) { @@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_unsafe * nr_hardirq_safe + nr_list_entries); - /* - * Estimated factor between direct and indirect - * dependencies: - */ - if (nr_list_entries) - factor = sum_forward_deps / nr_list_entries; - #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); -- cgit v1.2.2 From 1232d6132a986125f6a687ab9b61a4330e319270 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 22 Mar 2011 18:46:18 +0100 Subject: sched, doc: Update sched-design-CFS.txt Correct ->dequeue_tree() thinko into sched_class->dequeue_task and drop all references to ->task_new() since it is obviously gone. 
Signed-off-by: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <1300815978-16618-1-git-send-email-bp@amd64.org> Signed-off-by: Ingo Molnar --- kernel/sched_idletask.c | 2 -- kernel/sched_stoptask.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index c82f26c1b7c3..a776a6396427 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -94,6 +94,4 @@ static const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, - - /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 84ec9bcf82d9..1ba2bd40fdac 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -102,6 +102,4 @@ static const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, - - /* no .task_new for stop tasks */ }; -- cgit v1.2.2 From 68cacd29167b1926d237bd1b153aa2a990201729 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 23 Mar 2011 16:03:06 +0100 Subject: perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx() This patch solves a stale pointer problem in update_cgrp_time_from_cpuctx(). The cpuctx->cgrp was not cleared on all possible event exit paths, including: close() perf_release() perf_release_kernel() list_del_event() This patch fixes list_del_event() to clear cpuctx->cgrp when there are no cgroup events left in the context. [ This second version makes the code compile when CONFIG_CGROUP_PERF is not enabled. We unconditionally define perf_cpu_context->cgrp. ] Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: perfmon2-devel@lists.sf.net Cc: paulus@samba.org Cc: davem@davemloft.net LKML-Reference: <20110323150306.GA1580@quad> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3472bb1a070c..0c714226ae0c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -941,6 +941,7 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_cpu_context *cpuctx; /* * We can have double detach due to exit/hot-unplug + close. */ @@ -949,8 +950,17 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) + if (is_cgroup_event(event)) { ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then cler cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } ctx->nr_events--; if (event->attr.inherit_stat) -- cgit v1.2.2 From 3b9038912828384e38d82409c281124631c8533b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Mar 2011 00:24:11 +0100 Subject: genirq; Remove the last leftovers of the old sparse irq code All users converted. Get rid of it. 
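For callers, the open-coded replacement mirrors the old helper's body (sketch; compare the removed function in the hunk below):

	int res = irq_alloc_descs(irq, irq, 1, node);
	struct irq_desc *desc;

	desc = (res == irq || res == -EEXIST) ? irq_to_desc(irq) : NULL;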
Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index dbccc799407f..6fb014f172f7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -198,15 +198,6 @@ err: return -ENOMEM; } -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - int res = irq_alloc_descs(irq, irq, 1, node); - - if (res == -EEXIST || res == irq) - return irq_to_desc(irq); - return NULL; -} - static int irq_expand_nr_irqs(unsigned int nr) { if (nr > IRQ_BITMAP_BITS) @@ -283,11 +274,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} - static void free_desc(unsigned int irq) { dynamic_irq_cleanup(irq); -- cgit v1.2.2 From 880f57318450dbead6a03f9e31a1468924d6dd88 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 23 Mar 2011 19:29:39 +0100 Subject: perf: Better fit max unprivileged mlock pages for tools needs The maximum kilobytes of locked memory that an unprivileged user can reserve is 512 kB = 128 pages by default, scaled to the number of onlined CPUs, which fits well with the tools that use 128 data pages by default. However the tools actually use 129 pages, because they need one more for the user control page. Thus the default mlock threshold is not sufficient for the default tools' needs and we always end up evaluating the constant mlock rlimit policy, which doesn't have this scaling with the number of online CPUs. Hence, on systems that have more than 16 CPUs, we exceed the rlimit threshold and fail to mmap: $ perf record ls Error: failed to mmap with 1 (Operation not permitted) Just increase the max unprivileged mlock threshold by one page so that it properly supports the perf tools even beyond 16 CPUs. Reported-by: Han Pingtian Reported-by: Peter Zijlstra Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: Stable LKML-Reference: <1300904979-5508-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 0c714226ae0c..c75925c4d1e2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -145,7 +145,8 @@ static struct srcu_struct pmus_srcu; */ int sysctl_perf_event_paranoid __read_mostly = 1; -int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ +/* Minimum for 128 pages + 1 for the user control page */ +int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */ /* * max perf event sample rate -- cgit v1.2.2 From cae5d39032acf26c265f6b1dc73d7ce6ff4bc387 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:17 -0400 Subject: mm: arch: rename in_gate_area_no_task to in_gate_area_no_mm Now that gate VMAs are referenced with respect to a particular mm and not a particular task, it only makes sense to propagate the change to this predicate as well. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Signed-off-by: Al Viro --- kernel/kallsyms.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..b9d0fd1d21c7 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr) if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || arch_is_kernel_text(addr)) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static int is_ksym_addr(unsigned long addr) -- cgit v1.2.2 From e1a85b2c519551d4792180cdab4074d7e99bf2c9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 Mar 2011 22:16:04 +0100 Subject: timekeeping: Use syscore_ops instead of sysdev class and sysdev The timekeeping subsystem uses a sysdev class and a sysdev for executing timekeeping_suspend() after interrupts have been turned off on the boot CPU (during system suspend) and for executing timekeeping_resume() before turning on interrupts on the boot CPU (during system resume). However, since both of these functions ignore their arguments, the entire mechanism may be replaced with a struct syscore_ops object which is simpler. Signed-off-by: Rafael J. Wysocki Reviewed-by: Thomas Gleixner --- kernel/time/timekeeping.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3bd7e3d5c632..8ad5d576755e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -597,13 +597,12 @@ static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused * * This is for the generic clocksource timekeeping. * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. 
*/ -static int timekeeping_resume(struct sys_device *dev) +static void timekeeping_resume(void) { unsigned long flags; struct timespec ts; @@ -632,11 +631,9 @@ static int timekeeping_resume(struct sys_device *dev) /* Resume hrtimers */ hres_timers_resume(); - - return 0; } -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +static int timekeeping_suspend(void) { unsigned long flags; @@ -654,26 +651,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) } /* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .name = "timekeeping", +static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) +static int __init timekeeping_init_ops(void) { - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; + register_syscore_ops(&timekeeping_syscore_ops); + return 0; } -device_initcall(timekeeping_init_device); +device_initcall(timekeeping_init_ops); /* * If the error is already larger, we look ahead even further -- cgit v1.2.2 From 6c191cd01a935e5b53ef43c9403c771bb7a32b60 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 23 Mar 2011 16:42:18 -0700 Subject: memcg: res_counter_read_u64(): fix potential races on 32-bit machines res_counter_read_u64 reads u64 value without lock. It's dangerous in a 32bit environment. Add locking. Signed-off-by: KAMEZAWA Hiroyuki Cc: Greg Thelen Cc: Johannes Weiner Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index c7eaa37a768b..34683efa2cce 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member, pos, buf, s - buf); } +#if BITS_PER_LONG == 32 +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&counter->lock, flags); + ret = *res_counter_member(counter, member); + spin_unlock_irqrestore(&counter->lock, flags); + + return ret; +} +#else u64 res_counter_read_u64(struct res_counter *counter, int member) { return *res_counter_member(counter, member); } +#endif int res_counter_memparse_write_strategy(const char *buf, unsigned long long *res) -- cgit v1.2.2 From 6b3ae58efca06623c197fd6d91ded4aa3a8fe039 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:30 -0700 Subject: memcg: remove direct page_cgroup-to-page pointer In struct page_cgroup, we have a full word for flags but only a few are reserved. Use the remaining upper bits to encode, depending on configuration, the node or the section, to enable page_cgroup-to-page lookups without a direct pointer. This saves a full word for every page in a system with memory cgroups enabled. 
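To make the encoding scheme described above concrete, here is a small self-contained sketch of the idea in userspace C, for illustration only; the bit widths and names are invented, and the real page_cgroup code derives the shift from the flag count and stores a node or section number rather than an arbitrary id:

#include <assert.h>
#include <stdio.h>

#define NR_FLAG_BITS 8UL          /* assumed: flag bits occupy the low byte */
#define ID_SHIFT     NR_FLAG_BITS /* the node/section id lives above them   */

static unsigned long pcg_pack(unsigned long flags, unsigned long id)
{
	assert(flags < (1UL << NR_FLAG_BITS));
	return (id << ID_SHIFT) | flags;        /* id and flags share one word */
}

static unsigned long pcg_id(unsigned long word)
{
	return word >> ID_SHIFT;                /* recover the id */
}

static unsigned long pcg_flags(unsigned long word)
{
	return word & ((1UL << ID_SHIFT) - 1);  /* recover the flags */
}

int main(void)
{
	unsigned long w = pcg_pack(0x3UL, 42UL); /* two flags set, node 42 */
	printf("id=%lu flags=%#lx\n", pcg_id(w), pcg_flags(w));
	return 0;
}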
Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/bounds.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bounds.c b/kernel/bounds.c index 98a51f26c136..0c9b862292b2 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,11 +9,13 @@ #include #include #include +#include void foo(void) { /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); + DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); /* End of constants */ } -- cgit v1.2.2 From 9303e0c4814d2a6afca878cc35433291e862169c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:45 -0700 Subject: cpuset: remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() It's not necessary to copy cpuset->mems_allowed to a buffer allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf(). As spotted by Paul, a side effect is that we fix a bug where the function could return -ENOMEM even though the caller doesn't expect a negative return value. Therefore change the return value of cpuset_sprintf_cpulist() and cpuset_sprintf_memlist() from int to size_t. Signed-off-by: Li Zefan Acked-by: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e92e98189032..4683fe728c9b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1610,34 +1610,26 @@ out: * across a page fault. */ -static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { - int ret; + size_t count; mutex_lock(&callback_mutex); - ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); + count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); mutex_unlock(&callback_mutex); - return ret; + return count; } -static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); - int retval; - - if (mask == NULL) - return -ENOMEM; + size_t count; mutex_lock(&callback_mutex); - *mask = cs->mems_allowed; + count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); mutex_unlock(&callback_mutex); - retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); - - NODEMASK_FREE(mask); - - return retval; + return count; } static ssize_t cpuset_common_file_read(struct cgroup *cont, -- cgit v1.2.2 From c8163ca8afcac0fc54593fc60d1e1110edbd0eb2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:46 -0700 Subject: cpuset: remove unneeded NODEMASK_ALLOC() in cpuset_attach() oldcs->mems_allowed is not modified during cpuset_attach(), so we don't have to copy it to a buffer allocated by NODEMASK_ALLOC(). Just pass it to cpuset_migrate_mm().
Signed-off-by: Li Zefan Cc: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4683fe728c9b..7f384f4013b2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1438,10 +1438,9 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); - if (from == NULL || to == NULL) + if (to == NULL) goto alloc_fail; if (cs == &top_cpuset) { @@ -1463,18 +1462,16 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, } /* change mm; only needs to be done once even if threadgroup */ - *from = oldcs->mems_allowed; *to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { mpol_rebind_mm(mm, to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, from, to); + cpuset_migrate_mm(mm, &oldcs->mems_allowed, to); mmput(mm); } alloc_fail: - NODEMASK_FREE(from); NODEMASK_FREE(to); } -- cgit v1.2.2 From ee24d3797780eee6ffe581a7b78d27896f9b494a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:47 -0700 Subject: cpuset: fix unchecked calls to NODEMASK_ALLOC() Those functions that use NODEMASK_ALLOC() can't propagate errno to users, but will fail silently. Fix it by using a static nodemask_t variable for each function, and those variables are protected by cgroup_mutex; [akpm@linux-foundation.org: fix comment spelling, strengthen cgroup_lock comment] Signed-off-by: Li Zefan Cc: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 51 ++++++++++++++++----------------------------------- 1 file changed, 16 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7f384f4013b2..e472fe139192 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); - - if (!newmems) - return; + static nodemask_t newmems; /* protected by cgroup_mutex */ cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, newmems); + guarantee_online_mems(cs, &newmems); - cpuset_change_task_nodemask(p, newmems); - - NODEMASK_FREE(newmems); + cpuset_change_task_nodemask(p, &newmems); mm = get_task_mm(p); if (!mm) @@ -1438,41 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); - - if (to == NULL) - goto alloc_fail; + static nodemask_t to; /* protected by cgroup_mutex */ if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); } else { guarantee_online_cpus(cs, cpus_attach); } - guarantee_online_mems(cs, to); + guarantee_online_mems(cs, &to); /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, to, cs); + cpuset_attach_task(tsk, &to, cs); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, to, cs); + cpuset_attach_task(c, &to, cs); } rcu_read_unlock(); } /* change 
mm; only needs to be done once even if threadgroup */ - *to = cs->mems_allowed; + to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, to); + mpol_rebind_mm(mm, &to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &oldcs->mems_allowed, to); + cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); mmput(mm); } - -alloc_fail: - NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -2055,10 +2044,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return; + static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2075,7 +2061,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - *oldmems = cp->mems_allowed; + oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); @@ -2091,10 +2077,9 @@ remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, oldmems, NULL); + update_tasks_nodemask(cp, &oldmems, NULL); } } - NODEMASK_FREE(oldmems); } /* @@ -2136,19 +2121,16 @@ void cpuset_update_active_cpus(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return NOTIFY_DONE; + static nodemask_t oldmems; /* protected by cgroup_mutex */ cgroup_lock(); switch (action) { case MEM_ONLINE: - *oldmems = top_cpuset.mems_allowed; + oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, oldmems, NULL); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); break; case MEM_OFFLINE: /* @@ -2162,7 +2144,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, } cgroup_unlock(); - NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif -- cgit v1.2.2 From 523fb486bfd94e3a3b16a42bcb21b1959cf14df8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:48 -0700 Subject: cpuset: hold callback_mutex in cpuset_post_clone() Changing cpuset->mems/cpuset->cpus should be protected under callback_mutex. cpuset_clone() doesn't follow this rule. It's ok because it's called when creating and initializing a cgroup, but we'd better hold the lock to avoid subtle breakage in the future.
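The rule the patch below enforces is the classic one: fields that must stay consistent as a pair are written only under the lock their readers take, even on an initialization-only path. A userspace analogy, for illustration only (not kernel code):

#include <pthread.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static int field_a, field_b;	/* must only ever be observed as a pair */

static void update_pair(int a, int b)
{
	pthread_mutex_lock(&m);		/* writers take the readers' lock... */
	field_a = a;
	field_b = b;
	pthread_mutex_unlock(&m);
}

static void read_pair(int *a, int *b)
{
	pthread_mutex_lock(&m);		/* ...so readers never see a torn pair */
	*a = field_a;
	*b = field_b;
	pthread_mutex_unlock(&m);
}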
Signed-off-by: Li Zefan Acked-by: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e472fe139192..33eee16addb8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1840,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, cs = cgroup_cs(cgroup); parent_cs = cgroup_cs(parent); + mutex_lock(&callback_mutex); cs->mems_allowed = parent_cs->mems_allowed; cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); + mutex_unlock(&callback_mutex); return; } -- cgit v1.2.2 From 814ecf6e5b7854504ae83255173e53836c5d8420 Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Wed, 23 Mar 2011 16:43:08 -0700 Subject: sysctl_check: drop table->procname checks Since the for loop checks for table->procname, drop the useless table->procname checks inside the loop body. Signed-off-by: Denis Kirjanov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_check.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 10b90d8a03c4..3a01c3e46494 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) const char *fail = NULL; if (table->parent) { - if (table->procname && !table->parent->procname) + if (!table->parent->procname) set_fail(&fail, table, "Parent without procname"); } - if (!table->procname) - set_fail(&fail, table, "No procname"); if (table->child) { if (table->data) set_fail(&fail, table, "Directory with data?"); @@ -144,7 +142,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "No maxlen"); } #ifdef CONFIG_PROC_SYSCTL - if (table->procname && !table->proc_handler) + if (!table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif #if 0 -- cgit v1.2.2 From 256c53a65128cbc8a766b1503f3f25a52a8d07cb Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Wed, 23 Mar 2011 16:43:08 -0700 Subject: sysctl_check: drop dead code Drop dead code. Signed-off-by: Denis Kirjanov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_check.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 3a01c3e46494..4e4932a7b360 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -144,10 +144,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) #ifdef CONFIG_PROC_SYSCTL if (!table->proc_handler) set_fail(&fail, table, "No proc_handler"); -#endif -#if 0 - if (!table->procname && table->proc_handler) - set_fail(&fail, table, "proc_handler without procname"); #endif sysctl_check_leaf(namespaces, table, &fail); } -- cgit v1.2.2 From cb16e95fa2996743a6e80a665ed2ed0590bd38cf Mon Sep 17 00:00:00 2001 From: Petr Holasek Date: Wed, 23 Mar 2011 16:43:09 -0700 Subject: sysctl: add some missing input constraint checks Add boundaries of allowed input ranges for: dirty_expire_centisecs, drop_caches, overcommit_memory, page-cluster and panic_on_oom.
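The recurring pattern in the diff below is worth seeing in one piece: switch a ctl_table entry from proc_dointvec to proc_dointvec_minmax and point extra1/extra2 at the bounds. A minimal sketch in kernel-style C (the knob name and limits here are invented for illustration):

static int example_min = 0;
static int example_max = 2;
static int example_knob;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &example_min,	/* writes below 0 are rejected */
		.extra2		= &example_max,	/* writes above 2 are rejected */
	},
	{ }
};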
Signed-off-by: Petr Holasek Acked-by: Dave Young Cc: David Rientjes Cc: Wu Fengguang Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 40245d697602..97ab1690f5ed 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -117,6 +117,7 @@ static int neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -971,14 +972,18 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "oom_kill_allocating_task", @@ -1006,7 +1011,8 @@ static struct ctl_table vm_table[] = { .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "dirty_background_ratio", @@ -1054,7 +1060,8 @@ static struct ctl_table vm_table[] = { .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "nr_pdflush_threads", @@ -1130,6 +1137,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, + .extra1 = &one, + .extra2 = &three, }, #ifdef CONFIG_COMPACTION { -- cgit v1.2.2 From bfdc0b497faa82a0ba2f9dddcf109231dd519fcc Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 23 Mar 2011 16:43:11 -0700 Subject: sysctl: restrict write access to dmesg_restrict When dmesg_restrict is set to 1, CAP_SYS_ADMIN is needed to read the kernel ring buffer. But a root user without CAP_SYS_ADMIN is able to reset dmesg_restrict to 0. This is an issue when e.g. LXC (Linux Containers) is used and the complete user space runs without CAP_SYS_ADMIN. An unprivileged and jailed root user can bypass the dmesg_restrict protection. With this patch, writing to dmesg_restrict is only allowed when root has CAP_SYS_ADMIN. Signed-off-by: Richard Weinberger Acked-by: Dan Rosenberg Acked-by: Serge E.
Hallyn Cc: Eric Paris Cc: Kees Cook Cc: James Morris Cc: Eugene Teo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97ab1690f5ed..c0bb32414b17 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -170,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; @@ -707,7 +712,7 @@ static struct ctl_table kern_table[] = { .data = &kptr_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dmesg_restrict, .extra1 = &zero, .extra2 = &two, }, @@ -2394,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write, return err; } +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + struct do_proc_dointvec_minmax_conv_param { int *min; int *max; -- cgit v1.2.2 From 45a68628d37222e655219febce9e91b6484789b2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Mar 2011 16:43:12 -0700 Subject: pid: remove the child_reaper special case in init/main.c This patchset is a cleanup and a preparation to unshare the pid namespace. These prerequisites prepare for Eric's patchset to give a file descriptor to a namespace and join an existing namespace. This patch: It turns out that the existing assignment in copy_process of the child_reaper can handle the initial assignment of child_reaper; we just need to generalize the test in kernel/fork.c. Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Cc: Oleg Nesterov Cc: Alexey Dobriyan Acked-by: Serge E.
Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 ------ kernel/pid_namespace.c | 11 +++++++++-- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 17aed4378eda..457fff2e17e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1187,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; - - if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); - if (retval < 0) - goto bad_fork_free_pid; - } } p->pid = pid_nr(pid); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a5aff94e1f0b..e9c9adc84ca6 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i; + int i, err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) @@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + err = pid_ns_prepare_proc(ns); + if (err) + goto out_put_parent_pid_ns; + return ns; +out_put_parent_pid_ns: + put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: kmem_cache_free(pid_ns_cachep, ns); out: - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void destroy_pid_namespace(struct pid_namespace *ns) -- cgit v1.2.2 From 59607db367c57f515183cb203642291bb14d9c40 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:16 -0700 Subject: userns: add a user_namespace as creator/owner of uts_namespace The expected course of development for user namespaces targeted capabilities is laid out at https://wiki.ubuntu.com/UserNamespace. Goals: - Make it safe for an unprivileged user to unshare namespaces. They will be privileged with respect to the new namespace, but this should only include resources which the unprivileged user already owns. - Provide separate limits and accounting for userids in different namespaces. Status: Currently (as of 2.6.38) you can clone with the CLONE_NEWUSER flag to get a new user namespace if you have the CAP_SYS_ADMIN, CAP_SETUID, and CAP_SETGID capabilities. What this gets you is a whole new set of userids, meaning that user 500 will have a different 'struct user' in your namespace than in other namespaces. So any accounting information stored in struct user will be unique to your namespace. However, throughout the kernel there are checks which - simply check for a capability. Since root in a child namespace has all capabilities, this means that a child namespace is not constrained. - simply compare uid1 == uid2. Since these are the integer uids, uid 500 in namespace 1 will be said to be equal to uid 500 in namespace 2. As a result, the lxc implementation at lxc.sf.net does not use user namespaces. This is actually helpful because it leaves us free to develop user namespaces in such a way that, for some time, user namespaces may be unuseful. Bugs aside, this patchset is supposed to not at all affect systems which are not actively using user namespaces, and only restrict what tasks in child user namespace can do. 
They begin to limit privilege to a user namespace, so that root in a container cannot kill or ptrace tasks in the parent user namespace, and can only get world access rights to files. Since all files currently belong to the initial user namespace, that means that child user namespaces can only get world access rights to *all* files. While this temporarily makes user namespaces bad for system containers, it starts to get useful for some sandboxing. I've run the 'runltplite.sh' with and without this patchset and found no difference. This patch: copy_process() handles CLONE_NEWUSER before the rest of the namespaces. So in the case of clone(CLONE_NEWUSER|CLONE_NEWUTS) the new uts namespace will have the new user namespace as its owner. That is what we want, since we want root in that new userns to be able to have privilege over it. Changelog: Feb 15: don't set uts_ns->user_ns if we didn't create a new uts_ns. Feb 23: Move extern init_user_ns declaration from init/version.c to utsname.h. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 5 +++++ kernel/user.c | 8 ++++++-- kernel/utsname.c | 4 ++++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f74e6c00e26d..034dc2ed13ac 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -74,6 +74,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } + if (new_nsp->uts_ns != tsk->nsproxy->uts_ns) { + put_user_ns(new_nsp->uts_ns->user_ns); + new_nsp->uts_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; + get_user_ns(new_nsp->uts_ns->user_ns); + } new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { diff --git a/kernel/user.c b/kernel/user.c index 5c598ca781df..9e03e9c1df8d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -17,9 +17,13 @@ #include #include +/* + * userns count is 1 for root user, 1 for init_uts_ns, + * and 1 for... ? + */ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(2), + .refcount = ATOMIC_INIT(3), }, .creator = &root_user, }; @@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); -/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ +/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ struct user_struct root_user = { .__count = ATOMIC_INIT(2), .processes = ATOMIC_INIT(1), diff --git a/kernel/utsname.c b/kernel/utsname.c index 8a82b4b8ea52..a7b3a8d1ad24 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct uts_namespace *create_uts_ns(void) { @@ -40,6 +41,8 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + ns->user_ns = old_ns->user_ns; + get_user_ns(ns->user_ns); up_read(&uts_sem); return ns; } @@ -71,5 +74,6 @@ void free_uts_ns(struct kref *kref) struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); + put_user_ns(ns->user_ns); kfree(ns); } -- cgit v1.2.2 From 3486740a4f32a6a466f5ac931654d154790ba648 Mon Sep 17 00:00:00 2001 From: "Serge E.
Hallyn" Date: Wed, 23 Mar 2011 16:43:17 -0700 Subject: userns: security: make capabilities relative to the user namespace - Introduce ns_capable to test for a capability in a non-default user namespace. - Teach cap_capable to handle capabilities in a non-default user namespace. The motivation is to get to the unprivileged creation of new namespaces. It looks like this gets us 90% of the way there, with only potential uid confusion issues left. I still need to handle getting all caps after creation but otherwise I think I have a good starter patch that achieves all of your goals. Changelog: 11/05/2010: [serge] add apparmor 12/14/2010: [serge] fix capabilities to created user namespaces Without this, if user serge creates a user_ns, he won't have capabilities to the user_ns he created. THis is because we were first checking whether his effective caps had the caps he needed and returning -EPERM if not, and THEN checking whether he was the creator. Reverse those checks. 12/16/2010: [serge] security_real_capable needs ns argument in !security case 01/11/2011: [serge] add task_ns_capable helper 01/11/2011: [serge] add nsown_capable() helper per Bastian Blank suggestion 02/16/2011: [serge] fix a logic bug: the root user is always creator of init_user_ns, but should not always have capabilities to it! Fix the check in cap_capable(). 02/21/2011: Add the required user_ns parameter to security_capable, fixing a compile failure. 02/23/2011: Convert some macros to functions as per akpm comments. Some couldn't be converted because we can't easily forward-declare them (they are inline if !SECURITY, extern if SECURITY). Add a current_user_ns function so we can use it in capability.h without #including cred.h. Move all forward declarations together to the top of the #ifdef __KERNEL__ section, and use kernel-doc format. 02/23/2011: Per dhowells, clean up comment in cap_capable(). 02/23/2011: Per akpm, remove unreachable 'return -EPERM' in cap_capable. (Original written and signed off by Eric; latest, modified version acked by him) [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: export current_user_ns() for ecryptfs] [serge.hallyn@canonical.com: remove unneeded extra argument in selinux's task_has_capability] Signed-off-by: Eric W. Biederman Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 42 +++++++++++++++++++++++++++++++++++++----- kernel/cred.c | 6 ++++++ 2 files changed, 43 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 9e9385f132c8..0a3d2c863a1c 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -14,6 +14,7 @@ #include #include #include +#include #include /* @@ -299,17 +300,48 @@ error: * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ -int capable(int cap) +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); + +/** + * ns_capable - Determine if the current task has a superior capability in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. 
+ * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable(struct user_namespace *ns, int cap) { if (unlikely(!cap_valid(cap))) { printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); BUG(); } - if (security_capable(current_cred(), cap) == 0) { + if (security_capable(ns, current_cred(), cap) == 0) { current->flags |= PF_SUPERPRIV; - return 1; + return true; } - return 0; + return false; } -EXPORT_SYMBOL(capable); +EXPORT_SYMBOL(ns_capable); + +/** + * task_ns_capable - Determine whether current task has a superior + * capability targeted at a specific task's user namespace. + * @t: The task whose user namespace is targeted. + * @cap: The capability in question. + * + * Return true if it does, false otherwise. + */ +bool task_ns_capable(struct task_struct *t, int cap) +{ + return ns_capable(task_cred_xxx(t, user)->user_ns, cap); +} +EXPORT_SYMBOL(task_ns_capable); diff --git a/kernel/cred.c b/kernel/cred.c index 2343c132c5a7..5557b55048df 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode) } EXPORT_SYMBOL(set_create_files_as); +struct user_namespace *current_user_ns(void) +{ + return _current_user_ns(); +} +EXPORT_SYMBOL(current_user_ns); + #ifdef CONFIG_DEBUG_CREDENTIALS bool creds_are_invalid(const struct cred *cred) -- cgit v1.2.2 From bb96a6f50be27390dc959ff67d9ea0ea0cfbe177 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:18 -0700 Subject: userns: allow sethostname in a container Changelog: Feb 23: let clone_uts_ns() handle setting uts->user_ns. To do so we need to pass in the task_struct who'll get the utsname, so we can get its user_ns. Feb 23: As per Oleg's comment, just pass in tsk, instead of two of its members. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W.
Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 7 +------ kernel/sys.c | 2 +- kernel/utsname.c | 12 +++++++----- 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 034dc2ed13ac..b97fc9d04ddf 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -69,16 +69,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); + new_nsp->uts_ns = copy_utsname(flags, tsk); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - if (new_nsp->uts_ns != tsk->nsproxy->uts_ns) { - put_user_ns(new_nsp->uts_ns->user_ns); - new_nsp->uts_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; - get_user_ns(new_nsp->uts_ns->user_ns); - } new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { diff --git a/kernel/sys.c b/kernel/sys.c index 1ad48b3b9068..5761c53e19e3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1181,7 +1181,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; diff --git a/kernel/utsname.c b/kernel/utsname.c index a7b3a8d1ad24..44646179eaba 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -31,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, + struct uts_namespace *old_ns) { struct uts_namespace *ns; @@ -41,8 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = old_ns->user_ns; - get_user_ns(ns->user_ns); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); up_read(&uts_sem); return ns; } @@ -53,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) +struct uts_namespace *copy_utsname(unsigned long flags, + struct task_struct *tsk) { + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -63,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(old_ns); + new_ns = clone_uts_ns(tsk, old_ns); put_uts_ns(old_ns); return new_ns; -- cgit v1.2.2 From 39fd33933b0209e4b6254743f2cede07c5ad4c52 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:19 -0700 Subject: userns: allow killing tasks in your own or child userns Changelog: Dec 8: Fixed bug in my check_kill_permission pointed out by Eric Biederman. Dec 13: Apply Eric's suggestion to pass target task into kill_ok_by_cred() for clarity Dec 31: address comment by Eric Biederman: don't need cred/tcred in check_kill_permission. Jan 1: use const cred struct. Jan 11: Per Bastian Blank's advice, clean up kill_ok_by_cred(). 
Feb 16: kill_ok_by_cred: fix bad parentheses Feb 23: per akpm, let compiler inline kill_ok_by_cred Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 31751868de88..324eff5468ad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -635,6 +635,27 @@ static inline bool si_fromuser(const struct siginfo *info) (!is_si_special(info) && SI_FROMUSER(info)); } +/* + * called with RCU read lock from check_kill_permission() + */ +static int kill_ok_by_cred(struct task_struct *t) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = __task_cred(t); + + if (cred->user->user_ns == tcred->user->user_ns && + (cred->euid == tcred->suid || + cred->euid == tcred->uid || + cred->uid == tcred->suid || + cred->uid == tcred->uid)) + return 1; + + if (ns_capable(tcred->user->user_ns, CAP_KILL)) + return 1; + + return 0; +} + /* * Bad permissions for sending the signal * - the caller must hold the RCU read lock @@ -642,7 +663,6 @@ static inline bool si_fromuser(const struct siginfo *info) static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { - const struct cred *cred, *tcred; struct pid *sid; int error; @@ -656,14 +676,8 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - cred = current_cred(); - tcred = __task_cred(t); if (!same_thread_group(current, t) && - (cred->euid ^ tcred->suid) && - (cred->euid ^ tcred->uid) && - (cred->uid ^ tcred->suid) && - (cred->uid ^ tcred->uid) && - !capable(CAP_KILL)) { + !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); -- cgit v1.2.2 From 8409cca7056113bee3236cb6a8e4d8d4d1eef102 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:20 -0700 Subject: userns: allow ptrace from non-init user namespaces ptrace is allowed to tasks in the same user namespace according to the usual rules (i.e. the same rules as for two tasks in the init user namespace). ptrace is also allowed to a user namespace to which the current task has the CAP_SYS_PTRACE capability. Changelog: Dec 31: Address feedback by Eric: . Correct ptrace uid check . Rename may_ptrace_ns to ptrace_capable . Also fix the cap_ptrace checks. Jan 1: Use const cred struct Jan 11: use task_ns_capable() in place of ptrace_capable(). Feb 23: same_or_ancestore_user_ns() was not an appropriate check to constrain cap_issubset. Rather, cap_issubset() is only meaningful when both capsets are in the same user_ns. Signed-off-by: Serge E. Hallyn Cc: "Eric W.
Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e2302e40b360..0fc1eed28d27 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) return 0; rcu_read_lock(); tcred = __task_cred(task); - if ((cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_PTRACE)) { - rcu_read_unlock(); - return -EPERM; - } + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + goto ok; + if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + goto ok; + rcu_read_unlock(); + return -EPERM; +ok: rcu_read_unlock(); smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !capable(CAP_SYS_PTRACE)) + if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) return -EPERM; return security_ptrace_access_check(task, mode); @@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task) goto unlock_tasklist; task->ptrace = PT_PTRACED; - if (capable(CAP_SYS_PTRACE)) + if (task_ns_capable(task, CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); -- cgit v1.2.2 From 3263245de48344ad7bdd0e7256bf1606d2592f88 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:21 -0700 Subject: userns: make has_capability* into real functions So we can let type safety keep things sane, and as a bonus we can remove the declaration of init_user_ns in capability.h. Signed-off-by: Serge E. Hallyn Cc: "Eric W. Biederman" Cc: Daniel Lezcano Cc: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 0a3d2c863a1c..bf0c734d0c12 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -290,6 +290,60 @@ error: return ret; } +/** + * has_capability - Does a task have a capability in init_user_ns + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the initial user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability(struct task_struct *t, int cap) +{ + int ret = security_real_capable(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** + * has_capability - Does a task have a capability in a specific user ns + * @t: The task in question + * @ns: target user namespace + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the specified user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. 
+ */ +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + int ret = security_real_capable(t, ns, cap); + + return (ret == 0); +} + +/** + * has_capability_noaudit - Does a task have a capability (unaudited) + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability_noaudit(struct task_struct *t, int cap) +{ + int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + + return (ret == 0); +} + /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for -- cgit v1.2.2 From fc832ad3645f0507f24d11752544525a50a83c71 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:22 -0700 Subject: userns: user namespaces: convert all capable checks in kernel/sys.c This allows setuid/setgid in containers. It also fixes some corner cases where kernel logic foregoes capability checks when uids are equivalent. The latter will need to be done throughout the whole kernel. Changelog: Jan 11: Use nsown_capable() as suggested by Bastian Blank. Jan 11: Fix logic errors in uid checks pointed out by Bastian. Feb 15: allow prlimit to current (was regression in previous version) Feb 23: remove debugging printks, uninline set_one_prio_perm and make it bool, and document its return value. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 75 +++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 5761c53e19e3..af468edf096a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -119,17 +119,34 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); +/* + * Returns true if current's euid is same as p's uid or euid, + * or has CAP_SYS_NICE to p's user_ns. 
+ * + * Called with rcu_read_lock, creds are safe + */ +static bool set_one_prio_perm(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + + if (pcred->user->user_ns == cred->user->user_ns && + (pcred->uid == cred->euid || + pcred->euid == cred->euid)) + return true; + if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) + return true; + return false; +} + /* * set the priority of a task * - the caller must hold the RCU read lock */ static int set_one_prio(struct task_struct *p, int niceval, int error) { - const struct cred *cred = current_cred(), *pcred = __task_cred(p); int no_nice; - if (pcred->uid != cred->euid && - pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { + if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } @@ -506,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (rgid != (gid_t) -1) { if (old->gid == rgid || old->egid == rgid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->gid = rgid; else goto error; @@ -515,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (old->gid == egid || old->egid == egid || old->sgid == egid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->egid = egid; else goto error; @@ -550,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETGID)) + if (nsown_capable(CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = gid; else if (gid == old->gid || gid == old->sgid) new->egid = new->fsgid = gid; @@ -617,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) new->uid = ruid; if (old->uid != ruid && old->euid != ruid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -626,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) if (old->uid != euid && old->euid != euid && old->suid != euid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -674,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETUID)) { + if (nsown_capable(CAP_SETUID)) { new->suid = new->uid = uid; if (uid != old->uid) { retval = set_user(new); @@ -716,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETUID)) { + if (!nsown_capable(CAP_SETUID)) { if (ruid != (uid_t) -1 && ruid != old->uid && ruid != old->euid && ruid != old->suid) goto error; @@ -780,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETGID)) { + if (!nsown_capable(CAP_SETGID)) { if (rgid != (gid_t) -1 && rgid != old->gid && rgid != old->egid && rgid != old->sgid) goto error; @@ -840,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) if (uid == old->uid || uid == old->euid || uid == old->suid || uid == old->fsuid || - capable(CAP_SETUID)) { + nsown_capable(CAP_SETUID)) { if (uid != old_fsuid) { new->fsuid = uid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -873,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) if (gid == old->gid || gid == old->egid || gid == old->sgid || gid == old->fsgid || - capable(CAP_SETGID)) { + nsown_capable(CAP_SETGID)) { if (gid != old_fsgid) { new->fsgid = gid; goto change_okay; @@ -1183,6 +1200,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; down_write(&uts_sem); @@ -1230,7 
+1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1345,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, rlim = tsk->signal->rlim + resource; task_lock(tsk->group_leader); if (new_rlim) { + /* Keep the capable check against init_user_ns until + cgroups can contain all limits */ if (new_rlim->rlim_max > rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; @@ -1388,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task) { const struct cred *cred = current_cred(), *tcred; - tcred = __task_cred(task); - if (current != task && - (cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_RESOURCE)) { - return -EPERM; - } + if (current == task) + return 0; - return 0; + tcred = __task_cred(task); + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + return 0; + if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) + return 0; + + return -EPERM; } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, -- cgit v1.2.2 From b515498f5bb5f38fc0e390b4ff7d00b6077de127 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:23 -0700 Subject: userns: add a user namespace owner of ipc ns Changelog: Feb 15: Don't set new ipc->user_ns if we didn't create a new ipc_ns. Feb 23: Move extern declaration to ipc_namespace.h, and group fwd declarations at top. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b97fc9d04ddf..ac8a56e90bf8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -80,6 +80,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } + if (new_nsp->ipc_ns != tsk->nsproxy->ipc_ns) { + put_user_ns(new_nsp->ipc_ns->user_ns); + new_nsp->ipc_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; + get_user_ns(new_nsp->ipc_ns->user_ns); + } new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); if (IS_ERR(new_nsp->pid_ns)) { -- cgit v1.2.2 From b0e77598f87107001a00b8a4ece9c95e4254ccc4 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:24 -0700 Subject: userns: user namespaces: convert several capable() calls CAP_IPC_OWNER and CAP_IPC_LOCK can be checked against current_user_ns(), because the resource comes from current's own ipc namespace. setuid/setgid are to uids in own namespace, so again checks can be against current_user_ns(). Changelog: Jan 11: Use task_ns_capable() in place of sched_capable(). Jan 11: Use nsown_capable() as suggested by Bastian Blank. 
Jan 11: Clarify (hopefully) some logic in futex and sched.c Feb 15: use ns_capable for ipc, not nsown_capable Feb 23: let copy_ipcs handle setting ipc_ns->user_ns Feb 23: pass ns down rather than taking it from current [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 11 ++++++++++- kernel/futex_compat.c | 11 ++++++++++- kernel/groups.c | 2 +- kernel/nsproxy.c | 7 +------ kernel/sched.c | 9 ++++++--- kernel/uid16.c | 2 +- 6 files changed, 29 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bda415715382..6570c459f31c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2418,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->robust_list; rcu_read_unlock(); } diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index a7934ac75e5b..5f9e689dc8f0 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->compat_robust_list; rcu_read_unlock(); } diff --git a/kernel/groups.c b/kernel/groups.c index 253dc0f35cf4..1cc476d52dd3 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ac8a56e90bf8..a05d191ffdd9 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -75,16 +75,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); + new_nsp->ipc_ns = copy_ipcs(flags, tsk); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } - if (new_nsp->ipc_ns != tsk->nsproxy->ipc_ns) { - put_user_ns(new_nsp->ipc_ns->user_ns); - new_nsp->ipc_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; - get_user_ns(new_nsp->ipc_ns->user_ns); - } new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); if (IS_ERR(new_nsp->pid_ns)) { diff --git a/kernel/sched.c b/kernel/sched.c index a172494a9a63..480adeb63f8f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4892,8 
+4892,11 @@ static bool check_same_owner(struct task_struct *p) rcu_read_lock(); pcred = __task_cred(p); - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); + if (cred->user->user_ns == pcred->user->user_ns) + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + else + match = false; rcu_read_unlock(); return match; } @@ -5221,7 +5224,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); diff --git a/kernel/uid16.c b/kernel/uid16.c index 419209893d87..51c6e89e8619 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; -- cgit v1.2.2 From f9b182e24ecb2b3bb33340f053ba31c8c4e1d895 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 23 Mar 2011 16:43:27 -0700 Subject: taskstats: use appropriate printk priority level printk()s without a priority level default to KERN_WARNING. To reduce noise at KERN_WARNING, this patch set the priority level appriopriately for unleveled printks()s. This should be useful to folks that look at dmesg warnings closely. Signed-off-by: Mandeep Singh Baines Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3971c6b9d58d..9ffea360a778 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -685,7 +685,7 @@ static int __init taskstats_init(void) goto err_cgroup_ops; family_registered = 1; - printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); + pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); return 0; err_cgroup_ops: genl_unregister_ops(&family, &taskstats_ops); -- cgit v1.2.2 From 93a72052be81823fa1584b9be037d51924f9efa4 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 23 Mar 2011 16:43:29 -0700 Subject: crash_dump: export is_kdump_kernel to modules, consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn The Xen PV drivers in a crashed HVM guest can not connect to the dom0 backend drivers because both frontend and backend drivers are still in connected state. To run the connection reset function only in case of a crashdump, the is_kdump_kernel() function needs to be available for the PV driver modules. Consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn into kernel/crash_dump.c Also export elfcorehdr_addr to make is_kdump_kernel() usable for modules. Leave 'elfcorehdr' as early_param(). This changes powerpc from __setup() to early_param(). It adds an address range check from x86 also on ia64 and powerpc. [akpm@linux-foundation.org: additional #includes] [akpm@linux-foundation.org: remove elfcorehdr_addr export] [akpm@linux-foundation.org: fix for Tejun's mm/nobootmem.c changes] Signed-off-by: Olaf Hering Cc: Russell King Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/crash_dump.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 kernel/crash_dump.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 353d3fe8ba33..85cbfb31e73e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 000000000000..5f85690285d4 --- /dev/null +++ b/kernel/crash_dump.c @@ -0,0 +1,34 @@ +#include +#include +#include +#include +#include + +/* + * If we have booted due to a crash, max_pfn will be a very low value. We need + * to know the amount of memory that the previous kernel used. + */ +unsigned long saved_max_pfn; + +/* + * stores the physical address of elf header of crash image + * + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence put + * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + +/* + * elfcorehdr= specifies the location of elf core header stored by the crashed + * kernel. This option will be passed by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); -- cgit v1.2.2 From 0f77a8d378254f27df4a114a5da67223af1fe93f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 24 Mar 2011 11:42:29 +0900 Subject: vsprintf: Introduce %pB format specifier The %pB format specifier is for stack backtrace. Its handler sprint_backtrace() does symbol lookup using (address-1) to ensure the address will not point outside of the function. If there is a tail-call to the function marked "noreturn", gcc optimized out the code after the call then causes saved return address points outside of the function (i.e. the start of the next function), so pollutes call trace somewhat. This patch adds the %pB printk mechanism that allows architecture call-trace printout functions to improve backtrace printouts. Signed-off-by: Namhyung Kim Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Cc: linux-arch@vger.kernel.org LKML-Reference: <1300934550-21394-1-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- kernel/kallsyms.c | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..59e879929b17 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, } /* Look up a kernel symbol and return it in a text buffer. 
*/ -int sprint_symbol(char *buffer, unsigned long address) +static int __sprint_symbol(char *buffer, unsigned long address, + int symbol_offset) { char *modname; const char *name; unsigned long offset, size; int len; + address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) return sprintf(buffer, "0x%lx", address); @@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address) strcpy(buffer, name); len = strlen(buffer); buffer += len; + offset -= symbol_offset; if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", - offset, size, modname); + len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); else len += sprintf(buffer, "+%#lx/%#lx", offset, size); return len; } + +/** + * sprint_symbol - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name, + * offset, size and module name to @buffer if possible. If no symbol was found, + * just saves its @address as is. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_symbol(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0); +} + EXPORT_SYMBOL_GPL(sprint_symbol); +/** + * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function is for stack backtrace and does the same thing as + * sprint_symbol() but with modified/decreased @address. If there is a + * tail-call to the function marked "noreturn", gcc optimized out code after + * the call so that the stack-saved return address could point outside of the + * caller. This function ensures that kallsyms will find the original caller + * by decreasing @address. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_backtrace(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, -1); +} + /* Look up a kernel symbol and print it to the kernel messages. */ void __print_symbol(const char *fmt, unsigned long address) { -- cgit v1.2.2 From 29096202176ceaa5016a17ea2dd1aea19a4e90e2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Mar 2011 15:21:07 -0400 Subject: futex: Fix WARN_ON() test for UP An update of the futex code had a WARN_ON(!spin_is_locked(q->lock_ptr)) But on UP, spin_is_locked() is always false, and will trigger this warning, and even worse, it will exit the function without doing the necessary work. Converting this to a WARN_ON_SMP() fixes the problem. 
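To see why the old check misfires, here is a minimal userspace sketch of the UP semantics; the macro bodies are simplified stand-ins for illustration (GNU C statement expression, as in the kernel's own WARN_ON), not the kernel's actual definitions:

#include <stdio.h>

/* UP (!CONFIG_SMP) stand-ins: the spinlock is vestigial, so a held
 * lock is indistinguishable from a free one, and the SMP-only assert
 * evaluates to 0 without even inspecting its argument. */
#define spin_is_locked(l)  ((void)(l), 0)
#define WARN_ON(cond)      ({ int __c = !!(cond); if (__c) fprintf(stderr, "WARNING\n"); __c; })
#define WARN_ON_SMP(cond)  (0)

int main(void)
{
	int lock = 1;	/* conceptually held, but UP cannot tell */

	/* Old form: always warns on UP and makes the caller bail out
	 * before doing the unqueue work. */
	if (WARN_ON(!spin_is_locked(&lock)))
		puts("old form: spurious warning, work skipped");

	/* New form: the whole check compiles away on UP. */
	if (WARN_ON_SMP(!spin_is_locked(&lock)))
		puts("unreachable on UP");
	else
		puts("new form: work proceeds");

	return 0;
}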
Reported-by: Richard Weinberger Tested-by: Richard Weinberger Signed-off-by: Steven Rostedt Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Darren Hart Cc: Lai Jiangshan LKML-Reference: <20110317192208.682654502@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bda415715382..823aae3e2a96 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -782,8 +782,8 @@ static void __unqueue_futex(struct futex_q *q) { struct futex_hash_bucket *hb; - if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) - || plist_node_empty(&q->list))) + if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) + || WARN_ON(plist_node_empty(&q->list))) return; hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); -- cgit v1.2.2 From ab7798ffcf98b11a9525cf65bacdae3fd58d357f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Mar 2011 16:48:50 +0100 Subject: genirq: Expand generic show_interrupts() Some archs want to print extra information for certain irq_chips which is per irq and not per chip. Allow them to provide a chip callback to print the chip name and the extra information. PowerPC wants to print the LEVEL/EDGE type information. Make it configurable. Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 4 ++++ kernel/irq/proc.c | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 09bef82d74cb..00f2c037267a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -31,6 +31,10 @@ config GENERIC_IRQ_PROBE config GENERIC_IRQ_SHOW bool +# Print level/edge extra information +config GENERIC_IRQ_SHOW_LEVEL + bool + # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 760248de109d..626d092eed9a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -404,7 +404,20 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->irq_data.chip->name); + + if (desc->irq_data.chip) { + if (desc->irq_data.chip->irq_print_chip) + desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); + else if (desc->irq_data.chip->name) + seq_printf(p, " %8s", desc->irq_data.chip->name); + else + seq_printf(p, " %8s", "-"); + } else { + seq_printf(p, " %8s", "None"); + } +#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL + seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ?
"Level" : "Edge"); +#endif if (desc->name) seq_printf(p, "-%-8s", desc->name); -- cgit v1.2.2 From 27029c339b1beebe79bb4e64422ad1bb8d0b6440 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Mon, 15 Mar 2010 07:28:00 -0500 Subject: kdb: code cleanup to use macro instead of value It's better to use macro KDB_BASE_CMD_MAX instead of 50 Signed-off-by: Jovi Zhang Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bd3e8e29caa3..38a85428c70f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic; static kdbtab_t *kdb_commands; #define KDB_BASE_CMD_MAX 50 static int kdb_max_commands = KDB_BASE_CMD_MAX; -static kdbtab_t kdb_base_commands[50]; +static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; #define for_each_kdbcmd(cmd, num) \ for ((cmd) = kdb_base_commands, (num) = 0; \ num < kdb_max_commands; \ -- cgit v1.2.2 From 0d3db28daed2529ab90933a3aaaaf46446fdfda8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 15 Mar 2010 07:28:00 -0500 Subject: kdb: add usage string of 'per_cpu' command Signed-off-by: Namhyung Kim Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 38a85428c70f..6bc6e3bc4f9c 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void) "Send a signal to a process", 0, KDB_REPEAT_NONE); kdb_register_repeat("summary", kdb_summary, "", "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, "", + kdb_register_repeat("per_cpu", kdb_per_cpu, " [] []", "Display per_cpu variables", 3, KDB_REPEAT_NONE); kdb_register_repeat("grephelp", kdb_grep_help, "", "Display help on | grep", 0, KDB_REPEAT_NONE); -- cgit v1.2.2 From d72274e5895d11570a0a4a3214a1933c86d5ccb7 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 25 Mar 2011 12:38:48 -0700 Subject: genirq: Reserve the irq when calling irq_set_chip() The helper macros and functions like for_each_active_irq() don't work unless the irq is in the allocated_irqs set. In the case of !CONFIG_SPARSE_IRQ, instead of forcing all users of the irq infrastructure to explicitly call irq_reserve_irq(), do it for them. Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Cc: ralf@linux-mips.org LKML-Reference: <1301081931-11240-2-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c9c0601f0615..c35d74c08b50 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -37,6 +37,12 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) irq_chip_set_defaults(chip); desc->irq_data.chip = chip; irq_put_desc_unlock(desc, flags); + /* + * For !CONFIG_SPARSE_IRQ make the irq show up in + * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is + * already marked, and this call is harmless. 
+ */ + irq_reserve_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); -- cgit v1.2.2 From 801a0e9ae36e9b487092e31699d28c0b9a21ad52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 27 Mar 2011 11:02:49 +0200 Subject: genirq: Add irq disabled flag to irq_data state Some irq_chip implementation require to know the disabled state of the interrupt in certain callbacks. Add a state flag and accessor to irq_data. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 5 +++-- kernel/irq/irqdesc.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c35d74c08b50..0a890bdd9c63 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -141,12 +141,14 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { desc->istate &= ~IRQS_DISABLED; + irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_clr_disabled(desc); } static void irq_state_set_disabled(struct irq_desc *desc) { desc->istate |= IRQS_DISABLED; + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_set_disabled(desc); } @@ -648,8 +650,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - irq_compat_set_disabled(desc); - desc->istate |= IRQS_DISABLED; + irq_state_set_disabled(desc); desc->depth = 1; } desc->handle_irq = handle; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6fb014f172f7..96c3268a509d 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -80,6 +80,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; -- cgit v1.2.2 From 0fdb4b259ed3e91b687ac26848202f5e7c217e62 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 25 Mar 2011 12:38:49 -0700 Subject: genirq: Add chip hooks for taking CPUs on/off line. [ tglx: Removed the enabled argument as this is now available in irq_data ] Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Cc: ralf@linux-mips.org LKML-Reference: <1301081931-11240-3-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0a890bdd9c63..44b16a1ecd9a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -696,3 +696,61 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_put_desc_unlock(desc, flags); } + +/** + * irq_cpu_online - Invoke all irq_cpu_online functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_online() + * for each. + */ +void irq_cpu_online(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + + if (chip && chip->irq_cpu_online) + chip->irq_cpu_online(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} + +/** + * irq_cpu_offline - Invoke all irq_cpu_offline functions. 
+ * + * Iterate through all irqs and invoke the chip.irq_cpu_offline() + * for each. + */ +void irq_cpu_offline(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + + if (chip && chip->irq_cpu_offline) + chip->irq_cpu_offline(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} -- cgit v1.2.2 From b3d422329f2e061d66af4f933ef316e50e5edcac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 27 Mar 2011 16:05:36 +0200 Subject: genirq: Add chip flag for restricting cpu_on/offline calls Add a flag which indicates that the on/offline callback should only be called on enabled interrupts. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 44b16a1ecd9a..9283d3300ea9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -718,8 +718,9 @@ void irq_cpu_online(void) raw_spin_lock_irqsave(&desc->lock, flags); chip = irq_data_get_irq_chip(&desc->irq_data); - - if (chip && chip->irq_cpu_online) + if (chip && chip->irq_cpu_online && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !(desc->istate & IRQS_DISABLED))) chip->irq_cpu_online(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -747,8 +748,9 @@ void irq_cpu_offline(void) raw_spin_lock_irqsave(&desc->lock, flags); chip = irq_data_get_irq_chip(&desc->irq_data); - - if (chip && chip->irq_cpu_offline) + if (chip && chip->irq_cpu_offline && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !(desc->istate & IRQS_DISABLED))) chip->irq_cpu_offline(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.2 From c2d0c555c22242c3a76e366074c4d83ef9fa3b8c Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 25 Mar 2011 12:38:50 -0700 Subject: genirq: Split irq_set_affinity() so it can be called with lock held. The .irq_cpu_online() and .irq_cpu_offline() functions may need to adjust affinity, but they are called with the descriptor lock held. Create __irq_set_affinity_locked() which is called with the lock held. Make irq_set_affinity() just a wrapper that acquires the lock. 
[ tglx: Changed the argument to irq_data, added a !desc check and moved the !irq_set_affinity check where it belongs ] Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Cc: ralf@linux-mips.org LKML-Reference: <1301081931-11240-4-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a2aa73e536c..3d151fd762ad 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -139,35 +139,26 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - */ -int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; - unsigned long flags; + struct irq_chip *chip = irq_data_get_irq_chip(data); + struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; - if (!chip->irq_set_affinity) + if (!chip || !chip->irq_set_affinity) return -EINVAL; - raw_spin_lock_irqsave(&desc->lock, flags); - - if (irq_can_move_pcntxt(desc)) { - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + if (irqd_can_move_in_process_context(data)) { + ret = chip->irq_set_affinity(data, mask, false); switch (ret) { case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); + cpumask_copy(data->affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); ret = 0; } } else { - irqd_set_move_pending(&desc->irq_data); + irqd_set_move_pending(data); irq_copy_pending(desc, mask); } @@ -176,7 +167,28 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) schedule_work(&desc->affinity_notify->work); } irq_compat_set_affinity(desc); - irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); + irqd_set(data, IRQD_AFFINITY_SET); + + return ret; +} + +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + */ +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + int ret; + + if (!desc) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } -- cgit v1.2.2 From 32f4125ebffee4f3c4dbc6a437fc656129eb9e60 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 14:10:52 +0200 Subject: genirq: Move INPROGRESS, MASKED and DISABLED state flags to irq_data We really need these flags for some of the interrupt chips. Move it from internal state to irq_data and provide proper accessors. 
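The accessors themselves live outside this kernel/ diff; as an illustrative reconstruction (assumed to match the include/linux/irq.h of the time, with the field name taken from the irqd_set()/irqd_clear() helpers visible further down), each test is a simple flag check on the state word embedded in irq_data:

/* Illustrative reconstruction -- not the verbatim header text. */
static inline bool irqd_irq_disabled(struct irq_data *d)
{
	return d->state_use_accessors & IRQD_IRQ_DISABLED;
}

static inline bool irqd_irq_masked(struct irq_data *d)
{
	return d->state_use_accessors & IRQD_IRQ_MASKED;
}

static inline bool irqd_irq_inprogress(struct irq_data *d)
{
	return d->state_use_accessors & IRQD_IRQ_INPROGRESS;
}

Keeping these bits in irq_data rather than in the descriptor's istate is what lets irq_chip implementations query them from chip callbacks, which only receive an irq_data pointer.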
Signed-off-by: Thomas Gleixner Cc: David Daney --- kernel/irq/chip.c | 40 +++++++++++++++++++--------------------- kernel/irq/debug.h | 10 +++++++--- kernel/irq/handle.c | 4 ++-- kernel/irq/internals.h | 6 ------ kernel/irq/irqdesc.c | 2 -- kernel/irq/manage.c | 30 ++++++++++++++---------------- kernel/irq/migration.c | 4 ++-- kernel/irq/spurious.c | 10 +++++----- 8 files changed, 49 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9283d3300ea9..e00bdc56269f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -140,27 +140,25 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { - desc->istate &= ~IRQS_DISABLED; irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_clr_disabled(desc); } static void irq_state_set_disabled(struct irq_desc *desc) { - desc->istate |= IRQS_DISABLED; irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_set_disabled(desc); } static void irq_state_clr_masked(struct irq_desc *desc) { - desc->istate &= ~IRQS_MASKED; + irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); irq_compat_clr_masked(desc); } static void irq_state_set_masked(struct irq_desc *desc) { - desc->istate |= IRQS_MASKED; + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); irq_compat_set_masked(desc); } @@ -380,11 +378,11 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; irq_compat_set_progress(desc); - desc->istate |= IRQS_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -392,7 +390,7 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->istate &= ~IRQS_INPROGRESS; + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); irq_compat_clr_progress(desc); out_unlock: @@ -424,14 +422,14 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; handle_irq_event(desc); @@ -456,7 +454,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out_unlock; @@ -467,12 +465,12 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; handle_irq_event(desc); - if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) + if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); @@ -504,7 +502,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if 
(unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out; @@ -515,7 +513,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_irq(desc); @@ -566,8 +564,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || - !desc->action))) { + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { if (!irq_check_poll(desc)) { irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; @@ -592,15 +590,15 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * Renable it, if it was not disabled in meantime. */ if (unlikely(desc->istate & IRQS_PENDING)) { - if (!(desc->istate & IRQS_DISABLED) && - (desc->istate & IRQS_MASKED)) + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) unmask_irq(desc); } handle_irq_event(desc); } while ((desc->istate & IRQS_PENDING) && - !(desc->istate & IRQS_DISABLED)); + !irqd_irq_disabled(&desc->irq_data)); out_unlock: raw_spin_unlock(&desc->lock); @@ -720,7 +718,7 @@ void irq_cpu_online(void) chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_online && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || - !(desc->istate & IRQS_DISABLED))) + !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_online(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -750,7 +748,7 @@ void irq_cpu_offline(void) chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_offline && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || - !(desc->istate & IRQS_DISABLED))) + !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_offline(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index d1a33b7fa61d..a0bd875ba3d5 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -6,6 +6,8 @@ #define P(f) if (desc->status & f) printk("%14s set\n", #f) #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) +/* FIXME */ +#define PD(f) do { } while (0) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -28,13 +30,15 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOAUTOEN); PS(IRQS_AUTODETECT); - PS(IRQS_INPROGRESS); PS(IRQS_REPLAY); PS(IRQS_WAITING); - PS(IRQS_DISABLED); PS(IRQS_PENDING); - PS(IRQS_MASKED); + + PD(IRQS_INPROGRESS); + PD(IRQS_DISABLED); + PD(IRQS_MASKED); } #undef P #undef PS +#undef PD diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 517561fc7317..60fd5cd75c77 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -178,13 +178,13 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; irq_compat_set_progress(desc); - desc->istate |= IRQS_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); ret = handle_irq_event_percpu(desc, action); raw_spin_lock(&desc->lock); - desc->istate &= ~IRQS_INPROGRESS; + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); irq_compat_clr_progress(desc); return ret; } diff --git 
a/kernel/irq/internals.h b/kernel/irq/internals.h index 6c6ec9a49027..6b8b9713e28d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -44,26 +44,20 @@ enum { * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection * IRQS_POLL_INPROGRESS - polling in progress - * IRQS_INPROGRESS - Interrupt in progress * IRQS_ONESHOT - irq is not unmasked in primary handler * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting - * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later - * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, - IRQS_INPROGRESS = 0x00000010, IRQS_ONESHOT = 0x00000020, IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, - IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, - IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, }; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 96c3268a509d..2c039c9b9383 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -81,7 +81,6 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); - desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -239,7 +238,6 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3d151fd762ad..6e8acb755993 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -41,7 +41,7 @@ early_param("threadirqs", setup_forced_irqthreads); void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int state; + bool inprogress; if (!desc) return; @@ -53,16 +53,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->istate & IRQS_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); - state = desc->istate; + inprogress = irqd_irq_inprogress(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ - } while (state & IRQS_INPROGRESS); + } while (inprogress); /* * We made sure that no hardirq handler is running. Now verify @@ -563,9 +563,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, flags &= IRQ_TYPE_SENSE_MASK; if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { - if (!(desc->istate & IRQS_MASKED)) + if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); - if (!(desc->istate & IRQS_DISABLED)) + if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } @@ -663,7 +663,7 @@ again: * irq_wake_thread(). See the comment there which explains the * serialization. 
*/ - if (unlikely(desc->istate & IRQS_INPROGRESS)) { + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); @@ -680,12 +680,10 @@ again: desc->threads_oneshot &= ~action->thread_mask; - if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && - (desc->istate & IRQS_MASKED)) { - irq_compat_clr_masked(desc); - desc->istate &= ~IRQS_MASKED; - desc->irq_data.chip->irq_unmask(&desc->irq_data); - } + if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) + unmask_irq(desc); + out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); @@ -779,7 +777,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->istate & IRQS_DISABLED)) { + if (unlikely(irqd_irq_disabled(&desc->irq_data))) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which @@ -997,8 +995,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS | IRQS_ONESHOT | \ - IRQS_WAITING); + IRQS_ONESHOT | IRQS_WAITING); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); if (new->flags & IRQF_PERCPU) { irqd_set(&desc->irq_data, IRQD_PER_CPU); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index ec4806d4778b..5e81d34b08d6 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -66,7 +66,7 @@ void irq_move_irq(struct irq_data *idata) if (likely(!irqd_is_setaffinity_pending(idata))) return; - if (unlikely(desc->istate & IRQS_DISABLED)) + if (unlikely(irqd_irq_disabled(idata))) return; /* @@ -74,7 +74,7 @@ void irq_move_irq(struct irq_data *idata) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. */ - masked = desc->istate & IRQS_MASKED; + masked = irqd_irq_masked(idata); if (!masked) idata->chip->irq_mask(idata); irq_move_masked_irq(idata); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd586ebf9c8c..cd424cdf17fc 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -45,12 +45,12 @@ bool irq_wait_for_poll(struct irq_desc *desc) #ifdef CONFIG_SMP do { raw_spin_unlock(&desc->lock); - while (desc->istate & IRQS_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); raw_spin_lock(&desc->lock); - } while (desc->istate & IRQS_INPROGRESS); + } while irqd_irq_inprogress(&desc->irq_data); /* Might have been disabled in meantime */ - return !(desc->istate & IRQS_DISABLED) && desc->action; + return !irqd_irq_disabled(&desc->irq_data) && desc->action; #else return false; #endif @@ -75,7 +75,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitely. 
*/ - if ((desc->istate & IRQS_DISABLED) && !force) goto out; /* @@ -88,7 +88,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; /* Already running on another processor */ - if (desc->istate & IRQS_INPROGRESS) { + if (irqd_irq_inprogress(&desc->irq_data)) { /* * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too -- cgit v1.2.2 From 0521c8fbb3da45c2a58cd551ca6e9644983f6028 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 16:13:24 +0200 Subject: genirq: Provide edge_eoi flow handler This is a replacement for the cell flow handler which is in the way of cleanups. Must be selected to avoid general bloat. Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 4 ++++ kernel/irq/chip.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 00f2c037267a..72606ba10b14 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -51,6 +51,10 @@ config HARDIRQS_SW_RESEND config IRQ_PREFLOW_FASTEOI bool +# Edge style eoi based handler (cell) +config IRQ_EDGE_EOI_HANDLER + bool + # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e00bdc56269f..451d1e81c15c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -604,6 +604,51 @@ out_unlock: raw_spin_unlock(&desc->lock); } +#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER +/** + * handle_edge_eoi_irq - edge eoi type IRQ handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Similar as the above handle_edge_irq, but using eoi and w/o the + * mask/unmask logic. + */ +void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + + raw_spin_lock(&desc->lock); + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + /* + * If we're currently running this IRQ, or its disabled, + * we shouldn't process the IRQ. Mark it pending, handle + * the necessary masking and go out + */ + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (!irq_check_poll(desc)) { + desc->istate |= IRQS_PENDING; + goto out_eoi; + } + } + kstat_incr_irqs_this_cpu(irq, desc); + + do { + if (unlikely(!desc->action)) + goto out_eoi; + + handle_irq_event(desc); + + } while ((desc->istate & IRQS_PENDING) && + !irqd_irq_disabled(&desc->irq_data)); + +out_eoi: + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +#endif + /** * handle_percpu_irq - Per CPU local irq handler * @irq: the interrupt number -- cgit v1.2.2 From 33b054b867b84015173a38d9cd9ff513b6498818 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 16:27:31 +0200 Subject: genirq: Remove handle_IRQ_event Last user gone.
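Because the wrapper (removed in the diff below) merely forwarded to handle_irq_event_percpu(), any leftover caller would migrate with a one-line change; the call site here is hypothetical, shown only for illustration, since no in-tree user remains:

/* hypothetical out-of-tree call site */
static irqreturn_t run_chain(unsigned int irq, struct irqaction *action)
{
	/* was: return handle_IRQ_event(irq, action); */
	return handle_irq_event_percpu(irq_to_desc(irq), action);
}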
Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 60fd5cd75c77..1a2fb77f2fd6 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -188,15 +188,3 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irq_compat_clr_progress(desc); return ret; } - -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) -{ - return handle_irq_event_percpu(irq_to_desc(irq), action); -} -- cgit v1.2.2 From 30398bf6c684a77274dbdabf7efc1f24e4a99028 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Mar 2011 09:33:56 -0700 Subject: genirq: Fix new kernel-doc warnings Fix new irq-related kernel-doc warnings in 2.6.38: Warning(kernel/irq/manage.c:149): No description found for parameter 'mask' Warning(kernel/irq/manage.c:149): Excess function parameter 'cpumask' description in 'irq_set_affinity' Warning(include/linux/irq.h:161): No description found for parameter 'state_use_accessors' Warning(include/linux/irq.h:161): Excess struct/union/enum/typedef member 'state_use_accessor' description in 'irq_data' Signed-off-by: Randy Dunlap LKML-Reference: <20110318093356.b939558d.randy.dunlap@oracle.com> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6e8acb755993..805c6a0ce780 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -175,7 +175,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity - * @cpumask: cpumask + * @mask: cpumask * */ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) -- cgit v1.2.2 From a6aeddd1c4e464a2150f97ca2d1c3d68cfbd9296 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 20:28:56 +0200 Subject: genirq: Fix typo and remove unused variable Sigh, I'm overworked. 
Signed-off-by: Thomas Gleixner --- kernel/irq/migration.c | 1 - kernel/irq/spurious.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 5e81d34b08d6..e33d9c8d5089 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -60,7 +60,6 @@ void move_masked_irq(int irq) void irq_move_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_data_to_desc(idata); bool masked; if (likely(!irqd_is_setaffinity_pending(idata))) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index cd424cdf17fc..83f4799f46be 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -48,7 +48,7 @@ bool irq_wait_for_poll(struct irq_desc *desc) while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); raw_spin_lock(&desc->lock); - } while irqd_irq_inprogress(&desc->irq_data); + } while (irqd_irq_inprogress(&desc->irq_data)); /* Might have been disabled in meantime */ return !irqd_irq_disabled(&desc->irq_data) && desc->action; #else -- cgit v1.2.2 From 243b422af9ea9af4ead07a8ad54c90d4f9b6081a Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 28 Mar 2011 14:13:35 -0700 Subject: Relax si_code check in rt_sigqueueinfo and rt_tgsigqueueinfo Commit da48524eb206 ("Prevent rt_sigqueueinfo and rt_tgsigqueueinfo from spoofing the signal code") made the check on si_code too strict. There are several legitimate places where glibc wants to queue a negative si_code different from SI_QUEUE: - This was first noticed with glibc's aio implementation, which wants to queue a signal with si_code SI_ASYNCIO; the current kernel causes glibc's tst-aio4 test to fail because rt_sigqueueinfo() fails with EPERM. - Further examination of the glibc source shows that getaddrinfo_a() wants to use SI_ASYNCNL (which the kernel does not even define). The timer_create() fallback code wants to queue signals with SI_TIMER. As suggested by Oleg Nesterov , loosen the check to forbid only the problematic SI_TKILL case. Reported-by: Klaus Dittrich Acked-by: Julien Tinnes Cc: Signed-off-by: Roland Dreier Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 324eff5468ad..1186cf7fac77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2437,7 +2437,7 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ - if (info.si_code != SI_QUEUE) { + if (info.si_code >= 0 || info.si_code == SI_TKILL) { /* We used to allow any < 0 si_code */ WARN_ON_ONCE(info.si_code < 0); return -EPERM; @@ -2457,7 +2457,7 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ - if (info->si_code != SI_QUEUE) { + if (info->si_code >= 0 || info->si_code == SI_TKILL) { /* We used to allow any < 0 si_code */ WARN_ON_ONCE(info->si_code < 0); return -EPERM; -- cgit v1.2.2 From 0ef5ca1e1f0de71300142b8f730f26ded6a0c2f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 21:59:37 +0200 Subject: genirq; Fix cleanup fallout I missed the CONFIG_GENERIC_PENDING_IRQ dependency in the affinity related functions and the IRQ_LEVEL propagation into irq_data state. Did not pop up on my main test platforms. 
:( Signed-off-by: Thomas Gleixner Tested-by: David Daney --- kernel/irq/chip.c | 2 ++ kernel/irq/manage.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 451d1e81c15c..03099d521f5e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -734,6 +734,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, IRQD_PER_CPU); if (irq_settings_can_move_pcntxt(desc)) irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); + if (irq_settings_is_level(desc)) + irqd_set(&desc->irq_data, IRQD_LEVEL); irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 805c6a0ce780..acf540768b8f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -112,13 +112,13 @@ void irq_set_thread_affinity(struct irq_desc *desc) } #ifdef CONFIG_GENERIC_PENDING_IRQ -static inline bool irq_can_move_pcntxt(struct irq_desc *desc) +static inline bool irq_can_move_pcntxt(struct irq_data *data) { - return irq_settings_can_move_pcntxt(desc); + return irqd_can_move_in_process_context(data); } -static inline bool irq_move_pending(struct irq_desc *desc) +static inline bool irq_move_pending(struct irq_data *data) { - return irqd_is_setaffinity_pending(&desc->irq_data); + return irqd_is_setaffinity_pending(data); } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) @@ -131,8 +131,8 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc) cpumask_copy(mask, desc->pending_mask); } #else -static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } -static inline bool irq_move_pending(struct irq_desc *desc) { return false; } +static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } +static inline bool irq_move_pending(struct irq_desc *data) { return false; } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } static inline void @@ -148,7 +148,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) if (!chip || !chip->irq_set_affinity) return -EINVAL; - if (irqd_can_move_in_process_context(data)) { + if (irq_can_move_pcntxt(data)) { ret = chip->irq_set_affinity(data, mask, false); switch (ret) { case IRQ_SET_MASK_OK: @@ -218,7 +218,7 @@ static void irq_affinity_notify(struct work_struct *work) goto out; raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_move_pending(desc)) + if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_data.affinity); -- cgit v1.2.2 From cd22c0e44b105aecd78e5f9e77abab3a1b8dc00c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Mar 2011 11:36:05 +0200 Subject: genirq: Fix harmless typo The late night fixup missed to convert the data type from irq_desc to irq_data, which results in a harmless but annoying warning. 
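The warning is easy to reproduce in miniature: with CONFIG_GENERIC_PENDING_IRQ disabled, the stub took a struct irq_desc * while its caller passes a struct irq_data *, so gcc flags an incompatible pointer type even though the stub never touches the argument. A reduced sketch, standalone C with hypothetical names rather than the kernel headers:

struct irq_desc;
struct irq_data;

/* the mistyped stub from the previous cleanup */
static inline int irq_move_pending(struct irq_desc *data) { return 0; }

static int affinity_notify(struct irq_data *d)
{
	/* warning: passing 'struct irq_data *' where 'struct irq_desc *'
	 * is expected -- harmless, because the stub ignores its argument,
	 * but emitted on every !GENERIC_PENDING_IRQ build. */
	return irq_move_pending(d);
}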
Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index acf540768b8f..b3bf54f7d977 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -132,7 +132,7 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc) } #else static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } -static inline bool irq_move_pending(struct irq_desc *data) { return false; } +static inline bool irq_move_pending(struct irq_data *data) { return false; } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } static inline void -- cgit v1.2.2 From a6e120ed42004d6051fff7c3233e2554f12ccecb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Mar 2011 22:20:51 +0100 Subject: alpha: Use generic show_interrupts() The only subtle difference is that alpha uses ACTUAL_NR_IRQS and prints the IRQF_DISABLED flag. Change the generic implementation to deal with ACTUAL_NR_IRQS if defined. The IRQF_DISABLED printing is pointless, as we nowadays run all interrupts with irqs disabled. Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 626d092eed9a..dd201bd35103 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -364,6 +364,10 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec) return 0; } +#ifndef ACTUAL_NR_IRQS +# define ACTUAL_NR_IRQS nr_irqs +#endif + int show_interrupts(struct seq_file *p, void *v) { static int prec; @@ -373,10 +377,10 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction *action; struct irq_desc *desc; - if (i > nr_irqs) + if (i > ACTUAL_NR_IRQS) return 0; - if (i == nr_irqs) + if (i == ACTUAL_NR_IRQS) return arch_show_interrupts(p, prec); /* print header and calculate the width of the first column */ -- cgit v1.2.2 From 0c6f8a8b917ad361319c8ace3e9f28e69bfdb4c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 13:32:20 +0200 Subject: genirq: Remove compat code Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 4 -- kernel/irq/autoprobe.c | 4 +- kernel/irq/chip.c | 129 ------------------------------------------------- kernel/irq/compat.h | 72 --------------------------- kernel/irq/debug.h | 2 +- kernel/irq/dummychip.c | 9 ---- kernel/irq/handle.c | 3 -- kernel/irq/internals.h | 10 ---- kernel/irq/manage.c | 10 +--- kernel/irq/resend.c | 1 - kernel/irq/settings.h | 55 ++++++++------------- kernel/irq/spurious.c | 1 - 12 files changed, 24 insertions(+), 276 deletions(-) delete mode 100644 kernel/irq/compat.h (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 72606ba10b14..a69c333f78e4 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -10,10 +10,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -# Select this to disable the deprecated stuff -config GENERIC_HARDIRQS_NO_DEPRECATED - bool - config GENERIC_HARDIRQS_NO_COMPAT bool diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 394784c57060..342d8f44e401 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -70,10 +70,8 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) { - irq_compat_set_pending(desc); + if (irq_startup(desc)) desc->istate 
|= IRQS_PENDING; - } } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 03099d521f5e..616ec1c6b06f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,7 +34,6 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - irq_chip_set_defaults(chip); desc->irq_data.chip = chip; irq_put_desc_unlock(desc, flags); /* @@ -141,25 +140,21 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); - irq_compat_clr_disabled(desc); } static void irq_state_set_disabled(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); - irq_compat_set_disabled(desc); } static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); - irq_compat_clr_masked(desc); } static void irq_state_set_masked(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); - irq_compat_set_masked(desc); } int irq_startup(struct irq_desc *desc) @@ -209,126 +204,6 @@ void irq_disable(struct irq_desc *desc) } } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -/* Temporary migration helpers */ -static void compat_irq_mask(struct irq_data *data) -{ - data->chip->mask(data->irq); -} - -static void compat_irq_unmask(struct irq_data *data) -{ - data->chip->unmask(data->irq); -} - -static void compat_irq_ack(struct irq_data *data) -{ - data->chip->ack(data->irq); -} - -static void compat_irq_mask_ack(struct irq_data *data) -{ - data->chip->mask_ack(data->irq); -} - -static void compat_irq_eoi(struct irq_data *data) -{ - data->chip->eoi(data->irq); -} - -static void compat_irq_enable(struct irq_data *data) -{ - data->chip->enable(data->irq); -} - -static void compat_irq_disable(struct irq_data *data) -{ - data->chip->disable(data->irq); -} - -static void compat_irq_shutdown(struct irq_data *data) -{ - data->chip->shutdown(data->irq); -} - -static unsigned int compat_irq_startup(struct irq_data *data) -{ - return data->chip->startup(data->irq); -} - -static int compat_irq_set_affinity(struct irq_data *data, - const struct cpumask *dest, bool force) -{ - return data->chip->set_affinity(data->irq, dest); -} - -static int compat_irq_set_type(struct irq_data *data, unsigned int type) -{ - return data->chip->set_type(data->irq, type); -} - -static int compat_irq_set_wake(struct irq_data *data, unsigned int on) -{ - return data->chip->set_wake(data->irq, on); -} - -static int compat_irq_retrigger(struct irq_data *data) -{ - return data->chip->retrigger(data->irq); -} - -static void compat_bus_lock(struct irq_data *data) -{ - data->chip->bus_lock(data->irq); -} - -static void compat_bus_sync_unlock(struct irq_data *data) -{ - data->chip->bus_sync_unlock(data->irq); -} -#endif - -/* - * Fixup enable/disable function pointers - */ -void irq_chip_set_defaults(struct irq_chip *chip) -{ -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - if (chip->enable) - chip->irq_enable = compat_irq_enable; - if (chip->disable) - chip->irq_disable = compat_irq_disable; - if (chip->shutdown) - chip->irq_shutdown = compat_irq_shutdown; - if (chip->startup) - chip->irq_startup = compat_irq_startup; - if (!chip->end) - chip->end = dummy_irq_chip.end; - if (chip->bus_lock) - chip->irq_bus_lock = compat_bus_lock; - if (chip->bus_sync_unlock) - chip->irq_bus_sync_unlock = compat_bus_sync_unlock; - if (chip->mask) - chip->irq_mask = compat_irq_mask; - if (chip->unmask) - chip->irq_unmask = compat_irq_unmask; - if (chip->ack) - chip->irq_ack 
= compat_irq_ack; - if (chip->mask_ack) - chip->irq_mask_ack = compat_irq_mask_ack; - if (chip->eoi) - chip->irq_eoi = compat_irq_eoi; - if (chip->set_affinity) - chip->irq_set_affinity = compat_irq_set_affinity; - if (chip->set_type) - chip->irq_set_type = compat_irq_set_type; - if (chip->set_wake) - chip->irq_set_wake = compat_irq_set_wake; - if (chip->retrigger) - chip->irq_retrigger = compat_irq_retrigger; -#endif -} - static inline void mask_ack_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask_ack) @@ -381,7 +256,6 @@ void handle_nested_irq(unsigned int irq) if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; - irq_compat_set_progress(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -391,7 +265,6 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - irq_compat_clr_progress(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); @@ -514,7 +387,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * then mask it and get out of here: */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; @@ -567,7 +439,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(irqd_irq_disabled(&desc->irq_data) || irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { if (!irq_check_poll(desc)) { - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h deleted file mode 100644 index 6bbaf66aca85..000000000000 --- a/kernel/irq/compat.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Compat layer for transition period - */ -#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -static inline void irq_compat_set_progress(struct irq_desc *desc) -{ - desc->status |= IRQ_INPROGRESS; -} - -static inline void irq_compat_clr_progress(struct irq_desc *desc) -{ - desc->status &= ~IRQ_INPROGRESS; -} -static inline void irq_compat_set_disabled(struct irq_desc *desc) -{ - desc->status |= IRQ_DISABLED; -} -static inline void irq_compat_clr_disabled(struct irq_desc *desc) -{ - desc->status &= ~IRQ_DISABLED; -} -static inline void irq_compat_set_pending(struct irq_desc *desc) -{ - desc->status |= IRQ_PENDING; -} - -static inline void irq_compat_clr_pending(struct irq_desc *desc) -{ - desc->status &= ~IRQ_PENDING; -} -static inline void irq_compat_set_masked(struct irq_desc *desc) -{ - desc->status |= IRQ_MASKED; -} - -static inline void irq_compat_clr_masked(struct irq_desc *desc) -{ - desc->status &= ~IRQ_MASKED; -} -static inline void irq_compat_set_move_pending(struct irq_desc *desc) -{ - desc->status |= IRQ_MOVE_PENDING; -} - -static inline void irq_compat_clr_move_pending(struct irq_desc *desc) -{ - desc->status &= ~IRQ_MOVE_PENDING; -} -static inline void irq_compat_set_affinity(struct irq_desc *desc) -{ - desc->status |= IRQ_AFFINITY_SET; -} - -static inline void irq_compat_clr_affinity(struct irq_desc *desc) -{ - desc->status &= ~IRQ_AFFINITY_SET; -} -#else -static inline void irq_compat_set_progress(struct irq_desc *desc) { } -static inline void irq_compat_clr_progress(struct irq_desc *desc) { } -static inline void irq_compat_set_disabled(struct irq_desc *desc) { } -static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } -static inline void irq_compat_set_pending(struct irq_desc *desc) { } -static inline void irq_compat_clr_pending(struct 
irq_desc *desc) { } -static inline void irq_compat_set_masked(struct irq_desc *desc) { } -static inline void irq_compat_clr_masked(struct irq_desc *desc) { } -static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } -static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } -static inline void irq_compat_set_affinity(struct irq_desc *desc) { } -static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } -#endif - diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index a0bd875ba3d5..306cba37e9a5 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -4,7 +4,7 @@ #include -#define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) /* FIXME */ #define PD(f) do { } while (0) diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 20dc5474947e..b5fcd96c7102 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data) return 0; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static void compat_noop(unsigned int irq) { } -#define END_INIT .end = compat_noop -#else -#define END_INIT -#endif - /* * Generic no controller implementation */ @@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = { .irq_enable = noop, .irq_disable = noop, .irq_ack = ack_bad, - END_INIT }; /* @@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = { .irq_ack = noop, .irq_mask = noop, .irq_unmask = noop, - END_INIT }; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 1a2fb77f2fd6..90cb55f6d7eb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -175,9 +175,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) struct irqaction *action = desc->action; irqreturn_t ret; - irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; - irq_compat_set_progress(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); @@ -185,6 +183,5 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - irq_compat_clr_progress(desc); return ret; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6b8b9713e28d..6546431447d7 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,10 +15,6 @@ #define istate core_internal_state__do_not_mess_with_it -#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -# define status status_use_accessors -#endif - extern int noirqdebug; /* @@ -61,15 +57,11 @@ enum { IRQS_SUSPENDED = 0x00000800, }; -#include "compat.h" #include "debug.h" #include "settings.h" #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) -/* Set default functions for irq_chip structures: */ -extern void irq_chip_set_defaults(struct irq_chip *chip); - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); @@ -156,13 +148,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) static inline void irqd_set_move_pending(struct irq_data *d) { d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; - irq_compat_set_move_pending(irq_data_to_desc(d)); } static inline void irqd_clr_move_pending(struct irq_data *d) { d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; - irq_compat_clr_move_pending(irq_data_to_desc(d)); } static inline void irqd_clear(struct irq_data *d, unsigned int mask) diff 
--git a/kernel/irq/manage.c b/kernel/irq/manage.c index b3bf54f7d977..12a80fdae11c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -166,7 +166,6 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) kref_get(&desc->affinity_notify->kref); schedule_work(&desc->affinity_notify->work); } - irq_compat_set_affinity(desc); irqd_set(data, IRQD_AFFINITY_SET); return ret; @@ -297,10 +296,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(desc->irq_data.affinity, cpu_online_mask)) set = desc->irq_data.affinity; - else { - irq_compat_clr_affinity(desc); + else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); - } } cpumask_and(mask, cpu_online_mask, set); @@ -587,8 +584,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, irqd_set(&desc->irq_data, IRQD_LEVEL); } - if (chip != desc->irq_data.chip) - irq_chip_set_defaults(desc->irq_data.chip); ret = 0; break; default: @@ -785,7 +780,6 @@ static int irq_thread(void *data) * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { @@ -981,8 +975,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->thread_mask = 1 << ffz(thread_mask); if (!shared) { - irq_chip_set_defaults(desc->irq_data.chip); - init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index ad683a99b1ec..14dd5761e8c9 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -65,7 +65,6 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if (desc->istate & IRQS_REPLAY) return; if (desc->istate & IRQS_PENDING) { - irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; desc->istate |= IRQS_REPLAY; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 0227ad358272..0d91730b6330 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,17 +15,8 @@ enum { _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; -#define IRQ_INPROGRESS GOT_YOU_MORON -#define IRQ_REPLAY GOT_YOU_MORON -#define IRQ_WAITING GOT_YOU_MORON -#define IRQ_DISABLED GOT_YOU_MORON -#define IRQ_PENDING GOT_YOU_MORON -#define IRQ_MASKED GOT_YOU_MORON -#define IRQ_WAKEUP GOT_YOU_MORON -#define IRQ_MOVE_PENDING GOT_YOU_MORON #define IRQ_PER_CPU GOT_YOU_MORON #define IRQ_NO_BALANCING GOT_YOU_MORON -#define IRQ_AFFINITY_SET GOT_YOU_MORON #define IRQ_LEVEL GOT_YOU_MORON #define IRQ_NOPROBE GOT_YOU_MORON #define IRQ_NOREQUEST GOT_YOU_MORON @@ -37,102 +28,98 @@ enum { static inline void irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) { - desc->status &= ~(clr & _IRQF_MODIFY_MASK); - desc->status |= (set & _IRQF_MODIFY_MASK); + desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK); + desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); } static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) { - return desc->status & _IRQ_PER_CPU; + return desc->status_use_accessors & _IRQ_PER_CPU; } static inline void irq_settings_set_per_cpu(struct irq_desc *desc) { - desc->status |= _IRQ_PER_CPU; + desc->status_use_accessors |= _IRQ_PER_CPU; } static inline void irq_settings_set_no_balancing(struct irq_desc *desc) { - desc->status |= _IRQ_NO_BALANCING; + desc->status_use_accessors |= _IRQ_NO_BALANCING; } static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) { - return desc->status 
& _IRQ_NO_BALANCING; + return desc->status_use_accessors & _IRQ_NO_BALANCING; } static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) { - return desc->status & IRQ_TYPE_SENSE_MASK; + return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; } static inline void irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) { - desc->status &= ~IRQ_TYPE_SENSE_MASK; - desc->status |= mask & IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK; } static inline bool irq_settings_is_level(struct irq_desc *desc) { - return desc->status & _IRQ_LEVEL; + return desc->status_use_accessors & _IRQ_LEVEL; } static inline void irq_settings_clr_level(struct irq_desc *desc) { - desc->status &= ~_IRQ_LEVEL; + desc->status_use_accessors &= ~_IRQ_LEVEL; } static inline void irq_settings_set_level(struct irq_desc *desc) { - desc->status |= _IRQ_LEVEL; + desc->status_use_accessors |= _IRQ_LEVEL; } static inline bool irq_settings_can_request(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOREQUEST); + return !(desc->status_use_accessors & _IRQ_NOREQUEST); } static inline void irq_settings_clr_norequest(struct irq_desc *desc) { - desc->status &= ~_IRQ_NOREQUEST; + desc->status_use_accessors &= ~_IRQ_NOREQUEST; } static inline void irq_settings_set_norequest(struct irq_desc *desc) { - desc->status |= _IRQ_NOREQUEST; + desc->status_use_accessors |= _IRQ_NOREQUEST; } static inline bool irq_settings_can_probe(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOPROBE); + return !(desc->status_use_accessors & _IRQ_NOPROBE); } static inline void irq_settings_clr_noprobe(struct irq_desc *desc) { - desc->status &= ~_IRQ_NOPROBE; + desc->status_use_accessors &= ~_IRQ_NOPROBE; } static inline void irq_settings_set_noprobe(struct irq_desc *desc) { - desc->status |= _IRQ_NOPROBE; + desc->status_use_accessors |= _IRQ_NOPROBE; } static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) { - return desc->status & _IRQ_MOVE_PCNTXT; + return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; } static inline bool irq_settings_can_autoenable(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOAUTOEN); + return !(desc->status_use_accessors & _IRQ_NOAUTOEN); } static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { - return desc->status & _IRQ_NESTED_THREAD; + return desc->status_use_accessors & _IRQ_NESTED_THREAD; } - -/* Nothing should touch desc->status from now on */ -#undef status -#define status USE_THE_PROPER_WRAPPERS_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 83f4799f46be..dfbd550401b2 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -93,7 +93,6 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; goto out; } -- cgit v1.2.2 From 851d7cf647e0d31668eb5dc496f7698a2f6136b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Mar 2011 02:51:13 +0200 Subject: genirq: Remove move_*irq leftovers All users converted to new interface. 
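(ed: The removed move_masked_irq() and move_native_irq() were one-line wrappers, so any remaining caller converts mechanically. A minimal sketch of the mapping, taken straight from the deleted bodies shown in the diff below:

	/* old interface */              /* new irq_data-based interface */
	move_masked_irq(irq);     ->     irq_move_masked_irq(irq_get_irq_data(irq));
	move_native_irq(irq);     ->     irq_move_irq(irq_get_irq_data(irq));

Callers that already hold an irq_data pointer, as the flow handlers do, can pass it directly and skip the extra descriptor lookup the wrappers did on every call.)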
Signed-off-by: Thomas Gleixner --- kernel/irq/migration.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index e33d9c8d5089..bc6194698dfd 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -53,11 +53,6 @@ void irq_move_masked_irq(struct irq_data *idata) cpumask_clear(desc->pending_mask); } -void move_masked_irq(int irq) -{ - irq_move_masked_irq(irq_get_irq_data(irq)); -} - void irq_move_irq(struct irq_data *idata) { bool masked; @@ -80,8 +75,3 @@ void irq_move_irq(struct irq_data *idata) if (!masked) idata->chip->irq_unmask(idata); } - -void move_native_irq(int irq) -{ - irq_move_irq(irq_get_irq_data(irq)); -} -- cgit v1.2.2 From 353c8ed44f8f7414be614685ed29d1e23f5fa76b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Mar 2011 22:18:28 +0200 Subject: genirq: Fix misnamed label in handle_edge_eoi_irq Reported-by: michael@ellerman.id.au Signed-off-by: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 616ec1c6b06f..1dafc8652bd8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -514,7 +514,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); -out_unlock: +out_eoi: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } -- cgit v1.2.2 From 78c89825649a9a5ed526c507603196f467d781a5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Mar 2011 14:13:23 +0200 Subject: genirq: Remove the now obsolete config options and select statements Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index a69c333f78e4..c574f9a12c48 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -10,9 +10,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO_COMPAT - bool - # Options selectable by the architecture code # Make sparse irq Kconfig switch below available -- cgit v1.2.2 From a51e91981870d013fcfcc08b0117997edbcbc7a7 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 24 Mar 2011 14:00:18 +0100 Subject: sched: Leave sched_setscheduler() earlier if possible, do not disturb SCHED_FIFO tasks sched_setscheduler() (in sched.c) is called in order to change the scheduling policy and/or the real-time priority of a task. Thus, if we find out that neither of those is actually being modified, it is possible to return earlier and save the overhead of a full deactivate+activate cycle of the task in question. Besides that, if we have more than one SCHED_FIFO task with the same priority on the same rq (which means they share the same priority queue), having one of them change its position in that queue because of a sched_setscheduler() call that does not actually change the priority (as happens by means of the deactivate+activate) violates POSIX, which states, for SCHED_FIFO: "If a thread whose policy or priority has been modified by pthread_setschedprio() is a running thread or is runnable, the effect on its position in the thread list depends on the direction of the modification, as follows: a. <...> b. If the priority is unchanged, the thread does not change position in the thread list. c.
<...>" http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_08.html (ed: And the POSIX specification here does, briefly and somewhat unexpectedly, match what common sense tells us as well. ) Signed-off-by: Dario Faggioli Signed-off-by: Peter Zijlstra LKML-Reference: <1300971618.3960.82.camel@Palantir> Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f592ce6f8616..a8845516ace6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5011,6 +5011,17 @@ recheck: return -EINVAL; } + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* -- cgit v1.2.2 From 3436ae1298cb22d722a6520fc97f112dd767a9e1 Mon Sep 17 00:00:00 2001 From: Sisir Koppaka Date: Sat, 26 Mar 2011 18:22:55 +0530 Subject: sched: Fix rebalance interval calculation The interval for checking scheduling domains if they are due to be balanced currently depends on boot state NR_CPUS, which may not accurately reflect the number of online CPUs at the time of check. Thus replace NR_CPUS with num_online_cpus(). (ed: Should only affect those who set NR_CPUS really high, such as 4096 or so :-) Signed-off-by: Sisir Koppaka Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3f7ec9e27ee1..c7ec5c8e7b44 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -22,6 +22,7 @@ #include #include +#include /* * Targeted preemption latency for CPU-bound tasks: @@ -3850,8 +3851,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; + if (interval > HZ*num_online_cpus()/10) + interval = HZ*num_online_cpus()/10; need_serialize = sd->flags & SD_SERIALIZE; -- cgit v1.2.2 From 20443384fe090c5f8aeb016e7e85659c5bbdd69f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Mar 2011 03:33:29 +0200 Subject: perf: Rebase max unprivileged mlock threshold on top of page size Ensure we allow 512 kiB + 1 page for user control without assuming a 4096 bytes page size. 
Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian Cc: LKML-Reference: <1301535209-9679-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c75925c4d1e2..261690923ffb 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -145,8 +145,8 @@ static struct srcu_struct pmus_srcu; */ int sysctl_perf_event_paranoid __read_mostly = 1; -/* Minimum for 128 pages + 1 for the user control page */ -int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */ +/* Minimum for 512 kiB + 1 user control page */ +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ /* * max perf event sample rate -- cgit v1.2.2 From fd1edb3aa2c1d92618d8f0c6d15d44ea41fcac6a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Mar 2011 13:13:56 +0200 Subject: perf: Fix task_struct reference leak sys_perf_event_open() had an imbalance in the number of task refs it took, causing a memory leak. Cc: Jiri Olsa Cc: Oleg Nesterov Cc: stable@kernel.org # .37+ Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 261690923ffb..27960f114efd 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6531,6 +6531,11 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (task) { + put_task_struct(task); + task = NULL; + } + /* * Look up the group leader (we will attach this event to it): */ -- cgit v1.2.2 From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 30 Mar 2011 22:57:33 -0300 Subject: Fix common misspellings Fixes generated by 'codespell' and manually reviewed.
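(ed: For reference, codespell is a standalone tool that checks a source tree against a dictionary of common misspellings; an invocation along the lines of

	codespell kernel/

would reproduce the raw hit list for this subtree, though the exact options used for this sweep are not recorded here, so treat the command as illustrative. The manual review matters: codespell also flags identifiers and deliberately misspelled strings, which is why a raw run cannot be applied blindly.)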
Signed-off-by: Lucas De Marchi --- kernel/audit_tree.c | 2 +- kernel/auditsc.c | 2 +- kernel/cgroup.c | 2 +- kernel/cpu.c | 2 +- kernel/debug/debug_core.c | 2 +- kernel/debug/kdb/kdb_main.c | 6 +++--- kernel/debug/kdb/kdb_support.c | 2 +- kernel/exit.c | 2 +- kernel/irq/chip.c | 2 +- kernel/irq/migration.c | 2 +- kernel/kexec.c | 6 +++--- kernel/kthread.c | 2 +- kernel/latencytop.c | 2 +- kernel/lockdep.c | 4 ++-- kernel/module.c | 6 +++--- kernel/mutex.c | 2 +- kernel/padata.c | 8 ++++---- kernel/params.c | 2 +- kernel/posix-cpu-timers.c | 2 +- kernel/posix-timers.c | 2 +- kernel/power/main.c | 2 +- kernel/sched.c | 6 +++--- kernel/sched_autogroup.c | 2 +- kernel/sched_fair.c | 2 +- kernel/sched_rt.c | 4 ++-- kernel/signal.c | 2 +- kernel/softirq.c | 2 +- kernel/time/jiffies.c | 2 +- kernel/time/timer_stats.c | 2 +- kernel/trace/ftrace.c | 4 ++-- kernel/trace/ring_buffer.c | 4 ++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_clock.c | 2 +- kernel/trace/trace_entries.h | 2 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_kprobe.c | 2 +- kernel/user-return-notifier.c | 2 +- kernel/wait.c | 2 +- kernel/workqueue.c | 2 +- 40 files changed, 55 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 37b2bea170c8..e99dda04b126 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -607,7 +607,7 @@ void audit_trim_trees(void) spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); - /* this could be NULL if the watch is dieing else where... */ + /* this could be NULL if the watch is dying else where... */ struct inode *inode = chunk->mark.i.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f49a0318c2ed..b33513a08beb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, /* * to_send and len_sent accounting are very loose estimates. We aren't * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundry) + * within about 500 bytes (next page boundary) * * why snprintf? an int is up to 12 digits long. if we just assumed when * logging that a[%d]= was going to be 16 characters long we would be wasting diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e31b220a743d..25c7eb52de1a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -157,7 +157,7 @@ struct css_id { }; /* - * cgroup_event represents events which userspace want to recieve. + * cgroup_event represents events which userspace want to receive. */ struct cgroup_event { /* diff --git a/kernel/cpu.c b/kernel/cpu.c index c95fc4df0faa..12b7458f23b1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -126,7 +126,7 @@ static void cpu_hotplug_done(void) #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} -#endif /* #esle #if CONFIG_HOTPLUG_CPU */ +#endif /* #else #if CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? 
*/ int __ref register_cpu_notifier(struct notifier_block *nb) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index cefd4a11f6d9..bad6786dee88 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -538,7 +538,7 @@ return_normal: /* * For single stepping, try to only enter on the processor - * that was single stepping. To gaurd against a deadlock, the + * that was single stepping. To guard against a deadlock, the * kernel will only try for the value of sstep_tries before * giving up and continuing on. */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 6bc6e3bc4f9c..be14779bcef6 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -441,9 +441,9 @@ static int kdb_check_regs(void) * symbol name, and offset to the caller. * * The argument may consist of a numeric value (decimal or - * hexidecimal), a symbol name, a register name (preceeded by the + * hexidecimal), a symbol name, a register name (preceded by the * percent sign), an environment variable with a numeric value - * (preceeded by a dollar sign) or a simple arithmetic expression + * (preceded by a dollar sign) or a simple arithmetic expression * consisting of a symbol name, +/-, and a numeric constant value * (offset). * Parameters: @@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value) * error The hardware-defined error code * reason2 kdb's current reason code. * Initially error but can change - * acording to kdb state. + * according to kdb state. * db_result Result code from break or debug point. * regs The exception frame at time of fault/breakpoint. * should always be valid. diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 6b2485dcb050..5532dd37aa86 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) * Mask for process state. * Notes: * The mask folds data from several sources into a single long value, so - * be carefull not to overlap the bits. TASK_* bits are in the LSB, + * be careful not to overlap the bits. TASK_* bits are in the LSB, * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there * is no overlap between TASK_* and EXIT_* but that may not always be * true, so EXIT_* bits are shifted left 16 bits before being stored in diff --git a/kernel/exit.c b/kernel/exit.c index 6a488ad2dce5..f5d2f63bae0b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* Let father know we died * * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitary processes. + * that to send signals to arbitrary processes. * That stops right now. * * If the parent exec id doesn't match the exec id we saved diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1dafc8652bd8..4af1e2b244cb 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -415,7 +415,7 @@ out: * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware - * signal. The occurence is latched into the irq controller hardware + * signal. The occurrence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one * is handled by the associated event handler. 
If this happens it diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index bc6194698dfd..47420908fba0 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -35,7 +35,7 @@ void irq_move_masked_irq(struct irq_data *idata) * do the disable, re-program, enable sequence. * This is *not* particularly important for level triggered * but in a edge trigger case, we might be setting rte - * when an active trigger is comming in. This could + * when an active trigger is coming in. This could * cause some ioapics to mal-function. * Being paranoid i guess! * diff --git a/kernel/kexec.c b/kernel/kexec.c index ec19b92c7ebd..e7e3d9788dc3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -144,7 +144,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); - /* Initialize the list of unuseable pages */ + /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ @@ -454,7 +454,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single - * page allocations, and add everyting to image->dest_pages. + * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ @@ -602,7 +602,7 @@ static void kimage_free_extra_pages(struct kimage *image) /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); - /* Walk through and free any unuseable pages I have cached */ + /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unuseable_pages); } diff --git a/kernel/kthread.c b/kernel/kthread.c index 684ab3f7dd72..3b34d2732bce 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -139,7 +139,7 @@ static void create_kthread(struct kthread_create_info *create) * in @node, to get NUMA affinity for kthread stack, or else give -1. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which noone will call kthread_stop(), or + * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ee74b35e528d..376066e10413 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk, } /** - * __account_scheduler_latency - record an occured latency + * __account_scheduler_latency - record an occurred latency * @tsk - the task struct of the task hitting the latency * @usecs - the duration of the latency in microseconds * @inter - 1 if the sleep was interruptible, 0 if uninterruptible diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0d2058da80f5..53a68956f131 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2309,7 +2309,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(curr->hardirqs_enabled)) { /* * Neither irq nor preemption are disabled here - * so this is racy by nature but loosing one hit + * so this is racy by nature but losing one hit * in a stat is not a big deal. 
*/ __debug_atomic_inc(redundant_hardirqs_on); @@ -2620,7 +2620,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (!graph_lock()) return 0; /* - * Make sure we didnt race: + * Make sure we didn't race: */ if (unlikely(hlock_class(this)->usage_mask & new_mask)) { graph_unlock(); diff --git a/kernel/module.c b/kernel/module.c index 1f9f7bc56ca1..d5938a5c19c4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -809,7 +809,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, wait_for_zero_refcount(mod); mutex_unlock(&module_mutex); - /* Final destruction now noone is using it. */ + /* Final destruction now no one is using it. */ if (mod->exit != NULL) mod->exit(); blocking_notifier_call_chain(&module_notify_list, @@ -2777,7 +2777,7 @@ static struct module *load_module(void __user *umod, mod->state = MODULE_STATE_COMING; /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. Noone should access us, since + * info during argument parsing. No one should access us, since * strong_try_module_get() will fail. * lockdep/oops can run asynchronous, so use the RCU list insertion * function to insert in a way safe to concurrent readers. @@ -2971,7 +2971,7 @@ static const char *get_ksymbol(struct module *mod, else nextval = (unsigned long)mod->module_core+mod->core_text_size; - /* Scan for closest preceeding symbol, and next symbol. (ELF + /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ for (i = 1; i < mod->num_symtab; i++) { if (mod->symtab[i].st_shndx == SHN_UNDEF) diff --git a/kernel/mutex.c b/kernel/mutex.c index a5889fb28ecf..c4195fa98900 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -245,7 +245,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, state); - /* didnt get the lock, go to sleep: */ + /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable_no_resched(); schedule(); diff --git a/kernel/padata.c b/kernel/padata.c index 751019415d23..b91941df5e63 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd) /* * This cpu has to do the parallel processing of the next * object. It's waiting in the cpu's parallelization queue, - * so exit imediately. + * so exit immediately. */ if (PTR_ERR(padata) == -ENODATA) { del_timer(&pd->timer); @@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd) /* * The next object that needs serialization might have arrived to * the reorder queues in the meantime, we will be called again - * from the timer function if noone else cares for it. + * from the timer function if no one else cares for it. */ if (atomic_read(&pd->reorder_objects) && !(pinst->flags & PADATA_RESET)) @@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst) put_online_cpus(); } -/* Replace the internal control stucture with a new one. */ +/* Replace the internal control structure with a new one. */ static void padata_replace(struct padata_instance *pinst, struct parallel_data *pd_new) { @@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) } /** - * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) * padata cpumasks. 
* * @pinst: padata instance diff --git a/kernel/params.c b/kernel/params.c index 0da1411222b9..7ab388a48a2e 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -95,7 +95,7 @@ static int parse_one(char *param, /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { - /* Noone handled NULL, so do it here. */ + /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool) return -EINVAL; DEBUGP("They are equal! Calling %p\n", diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 67fea9d25d55..0791b13df7bf 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1347,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) /* * Now that all the timers on our list have the firing flag, - * noone will touch their list entries but us. We'll take + * no one will touch their list entries but us. We'll take * each timer's lock before clearing its firing flag, so no * timer call will interfere. */ diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4c0124919f9a..e5498d7405c3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -313,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr) * restarted (i.e. we have flagged this in the sys_private entry of the * info block). * - * To protect aginst the timer going away while the interrupt is queued, + * To protect against the timer going away while the interrupt is queued, * we require that the it_requeue_pending flag be set. */ void do_schedule_next_timer(struct siginfo *info) diff --git a/kernel/power/main.c b/kernel/power/main.c index 8eaba5f27b10..de9aef8742f4 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -224,7 +224,7 @@ power_attr(state); * writing to 'state'. It first should read from 'wakeup_count' and store * the read value. Then, after carrying out its own preparations for the system * transition to a sleep state, it should write the stored value to - * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count'. If that fails, at least one wakeup event has occurred since * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it * is allowed to write to 'state', but the transition will be aborted if there * are any wakeup events detected after 'wakeup_count' was written to. diff --git a/kernel/sched.c b/kernel/sched.c index f592ce6f8616..865b433fac5b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2309,7 +2309,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * Cause a process which is running on another CPU to enter * kernel-mode, without any delay. (to get signals handled.) * - * NOTE: this function doesnt have to take the runqueue lock, + * NOTE: this function doesn't have to take the runqueue lock, * because all it wants to ensure is that the remote task enters * the kernel. If the IPI races and the task has been migrated * to another CPU then no harm is done and the purpose has been @@ -4997,7 +4997,7 @@ recheck: */ raw_spin_lock_irqsave(&p->pi_lock, flags); /* - * To be able to change p->policy safely, the apropriate + * To be able to change p->policy safely, the appropriate * runqueue lock must be held. 
*/ rq = __task_rq_lock(p); @@ -5705,7 +5705,7 @@ void show_state_filter(unsigned long state_filter) do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: + * console might take a lot of time: */ touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 5946ac515602..429242f3c484 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -179,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p) struct autogroup *ag = autogroup_create(); autogroup_move_group(p, ag); - /* drop extra refrence added by autogroup_create() */ + /* drop extra reference added by autogroup_create() */ autogroup_kref_put(ag); } EXPORT_SYMBOL(sched_autogroup_create_attach); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3f7ec9e27ee1..3cb7f07887a1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3061,7 +3061,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, /* * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have + * there is no guarantee that any tasks will be moved so we'll have * a think about bumping its value to force at least one task to be * moved */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index db308cb08b75..e7cebdc65f82 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1378,7 +1378,7 @@ retry: task = pick_next_pushable_task(rq); if (task_cpu(next_task) == rq->cpu && task == next_task) { /* - * If we get here, the task hasnt moved at all, but + * If we get here, the task hasn't moved at all, but * it has failed to push. We will not try again, * since the other cpus will pull from us when they * are ready. @@ -1488,7 +1488,7 @@ static int pull_rt_task(struct rq *this_rq) /* * We continue with the search, just in * case there's an even higher prio task - * in another runqueue. (low likelyhood + * in another runqueue. (low likelihood * but possible) */ } diff --git a/kernel/signal.c b/kernel/signal.c index 1186cf7fac77..f486d10f3b8e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1885,7 +1885,7 @@ relock: for (;;) { struct k_sigaction *ka; /* - * Tracing can induce an artifical signal and choose sigaction. + * Tracing can induce an artificial signal and choose sigaction. * The return value in @signr determines the default action, * but @info->si_signo is the signal number we will report. */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 735d87095172..174f976c2874 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -567,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data) /** * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback funtion which gets called from softirq context + * @function: hrtimer callback function which gets called from softirq context * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) */ diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index b2fa506667c0..a470154e0408 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -34,7 +34,7 @@ * inaccuracies caused by missed or lost timer * interrupts and the inability for the timer * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not reccomended + * requested HZ value. 
It is also not recommended * for "tick-less" systems. */ #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 2f3b585b8d7d..a5d0a3a85dd8 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, unsigned int timer_flag) { /* - * It doesnt matter which lock we take: + * It doesn't matter which lock we take: */ raw_spinlock_t *lock; struct entry *entry, input; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c075f4ea6b94..ee24fa1935ac 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1268,7 +1268,7 @@ static int ftrace_update_code(struct module *mod) p->flags = 0L; /* - * Do the initial record convertion from mcount jump + * Do the initial record conversion from mcount jump * to the NOP instructions. */ if (!ftrace_code_disable(mod, p)) { @@ -3425,7 +3425,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ + /* make curr_ret_stack visible before we add the ret_stack */ smp_wmb(); t->ret_stack = ret_stack; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9c8bcafb120..0ef7b4b2a1f7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1478,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage) return local_read(&bpage->entries) & RB_WRITE_MASK; } -/* Size is determined by what has been commited */ +/* Size is determined by what has been committed */ static inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage); @@ -2932,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* * cpu_buffer->pages just needs to point to the buffer, it * has no specific buffer page to point to. Lets move it out - * of our way so we don't accidently swap it. + * of our way so we don't accidentally swap it. */ cpu_buffer->pages = reader->list.prev; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9541c27c1cf2..d38c16a06a6f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3239,7 +3239,7 @@ waitagain: trace_seq_init(&iter->seq); /* - * If there was nothing to send to user, inspite of consuming trace + * If there was nothing to send to user, in spite of consuming trace * entries, go back to wait for more entries. */ if (sret == -EBUSY) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 685a67d55db0..6302747a1398 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void) } /* - * trace_clock(): 'inbetween' trace clock. Not completely serialized, + * trace_clock(): 'between' trace clock. Not completely serialized, * but not completely incorrect when crossing CPUs either. * * This is based on cpu_clock(), which will allow at most ~1 jiffy of diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 1516cb3ec549..e32744c84d94 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -27,7 +27,7 @@ * in the structure. * * * for structures within structures, the format of the internal - * structure is layed out. This allows the internal structure + * structure is laid out. 
This allows the internal structure * to be deciphered for the format file. Although these macros * may become out of sync with the internal structure, they * will create a compile error if it happens. Since the diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 76b05980225c..962cdb24ed81 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, * * returns 1 if * - we are inside irq code - * - we just extered irq code + * - we just entered irq code * * retunns 0 if * - funcgraph-interrupts option is set diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 92b6e1e12d98..a4969b47afc1 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = { * skip the latency if the sequence has changed - some other section * did a maximum and could disturb our measurement with serial console * printouts, etc. Truly coinciding maximum latencies should be rare - * and what happens together happens separately as well, so this doesnt + * and what happens together happens separately as well, so this doesn't * decrease the validity of the maximum found: */ static __cacheline_aligned_in_smp unsigned long max_sequence; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8435b43b1782..35d55a386145 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1839,7 +1839,7 @@ static void unregister_probe_event(struct trace_probe *tp) kfree(tp->call.print_fmt); } -/* Make a debugfs interface for controling probe points */ +/* Make a debugfs interface for controlling probe points */ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index eb27fd3430a2..92cb706c7fc8 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); /* * Removes a registered user return notifier. Must be called from atomic - * context, and from the same cpu registration occured in. + * context, and from the same cpu registration occurred in. */ void user_return_notifier_unregister(struct user_return_notifier *urn) { diff --git a/kernel/wait.c b/kernel/wait.c index b0310eb6cc1e..f45ea8d2a1ce 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait); * woken up through the queue. * * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and noone wakes up + * aborts and is woken up concurrently and no one wakes up * the next waiter. */ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 04ef830690ec..8859a41806dd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1291,7 +1291,7 @@ __acquires(&gcwq->lock) return true; spin_unlock_irq(&gcwq->lock); - /* CPU has come up inbetween, retry migration */ + /* CPU has come up in between, retry migration */ cpu_relax(); } } -- cgit v1.2.2 From c0bb9e45f3a7f67fc358946727bc3d5f23d0f55d Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 25 Aug 2010 10:22:58 +1000 Subject: kdump: Allow shrinking of kdump region to be overridden On ppc64 the crashkernel region almost always overlaps an area of firmware. 
This works fine except when using the sysfs interface to reduce the kdump region. If we free the firmware area, we are guaranteed to crash. Rename free_reserved_phys_range() to crash_free_reserved_phys_range() and make it a weak function so we can override it. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- kernel/kexec.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ec19b92c7ebd..4e240a378df6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1099,7 +1099,8 @@ size_t crash_get_memory_size(void) return size; } -static void free_reserved_phys_range(unsigned long begin, unsigned long end) +void __weak crash_free_reserved_phys_range(unsigned long begin, + unsigned long end) { unsigned long addr; @@ -1135,7 +1136,7 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, PAGE_SIZE); end = roundup(start + new_size, PAGE_SIZE); - free_reserved_phys_range(end, crashk_res.end); + crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); -- cgit v1.2.2 From 4f5058c3b71ed5930bb2b478c4d5dbc799dd9ad1 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Sat, 2 Apr 2011 19:39:35 +0800 Subject: genirq: Fix cpumask leak in __setup_irq() The allocated cpumask should be freed in __setup_irq(). Signed-off-by: Xiaotian Feng LKML-Reference: <1301744375-6812-1-git-send-email-dfeng@redhat.com> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 12a80fdae11c..07c1611f3899 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1051,6 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); + free_cpumask_var(mask); return 0; -- cgit v1.2.2 From 4352d9d44b935e4d000be6ec89ddb55c2bf35f24 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Mon, 4 Apr 2011 08:31:23 -0700 Subject: ntp: fix non-privileged system time shifting The ADJ_SETOFFSET bit added in commit 094aa188 ("ntp: Add ADJ_SETOFFSET mode bit") also introduced a way for any user to change the system time. Sneaky or buggy calls to adjtimex() could set ADJ_OFFSET_SS_READ | ADJ_SETOFFSET, which would result in a successful call to timekeeping_inject_offset(). This patch fixes the issue by adding the missing capability check. Signed-off-by: Richard Cochran Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5f1bb8e2008f..f6117a4c7cb8 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -652,6 +652,8 @@ int do_adjtimex(struct timex *txc) struct timespec delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; + if (!capable(CAP_SYS_TIME)) + return -EPERM; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; result = timekeeping_inject_offset(&delta); -- cgit v1.2.2 From 5aba085ededa6c5a1ff465e2aebc3e8eb00a7567 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Apr 2011 14:59:31 -0700 Subject: kernel/signal.c: fix typos and coding style General coding style and comment fixes; no code changes: - Use multi-line-comment coding style. - Put some function signatures completely on one line. - Hyphenate some words. - Spell Posix as POSIX. - Correct typos & spellos in some comments.
- Drop trailing whitespace. - End sentences with periods. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/signal.c | 90 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1186cf7fac77..3ab90e8b6ecf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -226,7 +226,7 @@ static inline void print_dropped_signal(int sig) /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an - * appopriate lock must be held to stop the target task from exiting + * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) @@ -375,15 +375,15 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tracehook_consider_fatal_signal(tsk, sig); } - -/* Notify the system that a driver wants to block all signals for this +/* + * Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be * sent/acted upon. If the notifier routine returns non-zero, then the * signal will be acted upon after all. If the notifier routine returns 0, * then then signal will be blocked. Only one block per process is * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. */ - + * can use to determine if the signal should be blocked or not. + */ void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) { @@ -434,9 +434,10 @@ still_pending: copy_siginfo(info, &first->info); __sigqueue_free(first); } else { - /* Ok, it wasn't in the queue. This must be - a fast-pathed signal or we must have been - out of queue space. So zero out the info. + /* + * Ok, it wasn't in the queue. This must be + * a fast-pathed signal or we must have been + * out of queue space. So zero out the info. */ info->si_signo = sig; info->si_errno = 0; @@ -468,7 +469,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, } /* - * Dequeue a signal and return the element to the caller, which is + * Dequeue a signal and return the element to the caller, which is * expected to free it. * * All callers have to hold the siglock. @@ -490,7 +491,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * itimers are process shared and we restart periodic * itimers in the signal delivery path to prevent DoS * attacks in the high resolution timer case. This is - * compliant with the old way of self restarting + * compliant with the old way of self-restarting * itimers, as the SIGALRM is a legacy signal and only * queued once. Changing the restart behaviour to * restart the timer in the signal dequeue path is @@ -923,14 +924,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, if (info == SEND_SIG_FORCED) goto out_set; - /* Real-time signals must be queued if sent by sigqueue, or - some other real-time mechanism. It is implementation - defined whether kill() does so. We attempt to do so, on - the principle of least surprise, but since kill is not - allowed to fail with EAGAIN when low on memory we just - make sure at least one signal gets delivered and don't - pass on the info struct. */ - + /* + * Real-time signals must be queued if sent by sigqueue, or + * some other real-time mechanism. 
It is implementation + * defined whether kill() does so. We attempt to do so, on + * the principle of least surprise, but since kill is not + * allowed to fail with EAGAIN when low on memory we just + * make sure at least one signal gets delivered and don't + * pass on the info struct. + */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else @@ -1201,8 +1203,7 @@ retry: return error; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; rcu_read_lock(); @@ -1299,8 +1300,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -int -send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values @@ -1368,7 +1368,7 @@ EXPORT_SYMBOL(kill_pid); * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers + * expirations or I/O completions". In the case of POSIX Timers * we allocate the sigqueue structure from the timer_create. If this * allocation fails we are able to report the failure to the application * with an EAGAIN error. @@ -1553,7 +1553,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_signo = SIGCHLD; info.si_errno = 0; /* - * see comment in do_notify_parent() abot the following 3 lines + * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); @@ -1611,7 +1611,7 @@ static inline int may_ptrace_stop(void) } /* - * Return nonzero if there is a SIGKILL that should be waking us up. + * Return non-zero if there is a SIGKILL that should be waking us up. * Called with the siglock held. */ static int sigkill_pending(struct task_struct *tsk) @@ -1735,7 +1735,7 @@ void ptrace_notify(int exit_code) /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. - * Returns nonzero if we've actually stopped and released the siglock. + * Returns non-zero if we've actually stopped and released the siglock. * Returns zero if we didn't stop and still hold the siglock. */ static int do_signal_stop(int signr) @@ -1823,10 +1823,12 @@ static int ptrace_signal(int signr, siginfo_t *info, current->exit_code = 0; - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ + /* + * Update the siginfo structure if the signal has + * changed. If the debugger wanted something + * specific in the siginfo structure then it should + * have updated *info via PTRACE_SETSIGINFO. + */ if (signr != info->si_signo) { info->si_signo = signr; info->si_errno = 0; @@ -2034,7 +2036,8 @@ void exit_signals(struct task_struct *tsk) if (!signal_pending(tsk)) goto out; - /* It could be that __group_complete_signal() choose us to + /* + * It could be that __group_complete_signal() choose us to * notify about group-wide signal. Another thread should be * woken now to take the signal since we will not. 
*/ @@ -2183,7 +2186,7 @@ long do_sigpending(void __user *set, unsigned long sigsetsize) out: return error; -} +} SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { @@ -2233,9 +2236,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_trapno, &to->si_trapno); #endif #ifdef BUS_MCEERR_AO - /* + /* * Other callers might not initialize the si_lsb field, - * so check explicitely for the right codes here. + * so check explicitly for the right codes here. */ if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); @@ -2280,7 +2283,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; - + /* * Invert the set of allowed signals to get those we * want to block. @@ -2305,9 +2308,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, + (ts.tv_sec || ts.tv_nsec)); if (timeout) { - /* None ready -- temporarily unblock those we're + /* + * None ready -- temporarily unblock those we're * interested while we are sleeping in so that we'll - * be awakened when they arrive. */ + * be awakened when they arrive. + */ current->real_blocked = current->blocked; sigandsets(&current->blocked, &current->blocked, &these); recalc_sigpending(); @@ -2553,12 +2558,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s error = -EINVAL; /* - * - * Note - this code used to test ss_flags incorrectly + * Note - this code used to test ss_flags incorrectly: * old code may have been written using ss_flags==0 * to mean ss_flags==SS_ONSTACK (as this was the only * way that worked) - this fix preserves that older - * mechanism + * mechanism. */ if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) goto out; @@ -2600,8 +2604,10 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -/* Some platforms have their own version with special arguments others - support only sys_rt_sigprocmask. */ +/* + * Some platforms have their own version with special arguments; + * others support only sys_rt_sigprocmask. + */ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, old_sigset_t __user *, oset) -- cgit v1.2.2 From 41c57892a2895865afc89ff1a21f91a0f1506f66 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Apr 2011 15:00:26 -0700 Subject: kernel/signal.c: add kernel-doc notation to syscalls Add kernel-doc to syscalls in signal.c. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/signal.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 3ab90e8b6ecf..dc17929ab78a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2075,6 +2075,9 @@ EXPORT_SYMBOL(unblock_all_signals); * System call entry points. 
*/ +/** + * sys_restart_syscall - restart a system call + */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = &current_thread_info()->restart_block; @@ -2128,6 +2131,13 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return error; } +/** + * sys_rt_sigprocmask - change the list of currently blocked signals + * @how: whether to add, remove, or set signals + * @set: stores pending signals + * @oset: previous value of signal mask if non-null + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, sigset_t __user *, oset, size_t, sigsetsize) { @@ -2188,6 +2198,12 @@ out: return error; } +/** + * sys_rt_sigpending - examine a pending signal that has been raised + * while blocked + * @set: stores pending signals + * @sigsetsize: size of sigset_t type or larger + */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { return do_sigpending(set, sigsetsize); @@ -2267,6 +2283,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif +/** + * sys_rt_sigtimedwait - synchronously wait for queued signals specified + * in @uthese + * @uthese: queued signals to wait for + * @uinfo: if non-null, the signal's siginfo is returned here + * @uts: upper bound on process time suspension + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct timespec __user *, uts, size_t, sigsetsize) @@ -2344,6 +2368,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +/** + * sys_kill - send a signal to a process + * @pid: the PID of the process + * @sig: signal to be sent + */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; @@ -2419,7 +2448,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) return do_tkill(tgid, pid, sig); } -/* +/** + * sys_tkill - send signal to one specific task + * @pid: the PID of the task + * @sig: signal to be sent + * * Send a signal to only one task, even if it's a CLONE_THREAD task. */ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) @@ -2431,6 +2464,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) return do_tkill(0, pid, sig); } +/** + * sys_rt_sigqueueinfo - send signal information to a signal + * @pid: the PID of the thread + * @sig: signal to be sent + * @uinfo: signal info to be sent + */ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { @@ -2596,6 +2635,10 @@ out: #ifdef __ARCH_WANT_SYS_SIGPENDING +/** + * sys_sigpending - examine pending signals + * @set: where mask of pending signal is returned + */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { return do_sigpending(set, sizeof(*set)); @@ -2604,7 +2647,12 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -/* +/** + * sys_sigprocmask - examine and change blocked signals + * @how: whether to add, remove, or set signals + * @set: signals to add or remove (if non-null) + * @oset: previous value of signal mask if non-null + * * Some platforms have their own version with special arguments; * others support only sys_rt_sigprocmask. 
*/ @@ -2660,6 +2708,13 @@ out: #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifdef __ARCH_WANT_SYS_RT_SIGACTION +/** + * sys_rt_sigaction - alter an action taken by a process + * @sig: signal to be sent + * @act: the thread group ID of the thread + * @oact: the PID of the thread + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, @@ -2746,6 +2801,12 @@ SYSCALL_DEFINE0(pause) #endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND +/** + * sys_rt_sigsuspend - replace the signal mask for a value with the + * @unewset value until a signal is received + * @unewset: new signal mask value + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; -- cgit v1.2.2 From 49c022e657fbe661460d191fbe776a387132e2b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 10:14:25 +0200 Subject: sched: Clean up rebalance_domains() load-balance interval calculation Instead of the possible multiple-evaluation of num_online_cpus() in rebalance_domains() that Linus reported, avoid it altogether in the normal case since it's implemented with a Hamming weight function over a cpu bitmask, which can be darn expensive for those with big iron. This also makes it cleaner and smaller, and documents the code. Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra LKML-Reference: <1301991265.2225.12.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ kernel/sched_fair.c | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a8845516ace6..17b4d226ee0d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6331,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif } + + update_max_interval(); + return NOTIFY_OK; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c7ec5c8e7b44..80ecd09452e0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3820,6 +3820,17 @@ void select_nohz_load_balancer(int stop_tick) static DEFINE_SPINLOCK(balancing); +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +static void update_max_interval(void) +{ + max_load_balance_interval = HZ*num_online_cpus()/10; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -3849,10 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; - if (interval > HZ*num_online_cpus()/10) - interval = HZ*num_online_cpus()/10; + interval = clamp(interval, 1UL, max_load_balance_interval); need_serialize = sd->flags & SD_SERIALIZE; -- cgit v1.2.2 From f9fa0bc1fabe1d861e46d80ecbe7e85da359195c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Apr 2011 10:53:46 -0700 Subject: signal.c: fix erroneous syscall kernel-doc Fix erroneous syscall kernel-doc comments in kernel/signal.c. 
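As a reference point for the fix that follows: kernel-doc has a rigid shape that the scripts/kernel-doc tool parses - a /** opener, a one-line summary, and one @-line per real parameter, each accurately describing the parameter it names. A minimal sketch for a hypothetical syscall (the name and arguments are invented for illustration, not taken from the patch):

    /**
     * sys_example - one-line summary of what the call does
     * @val:  meaning of the first argument
     * @size: meaning of the second argument
     *
     * Optional longer description. Each @-line must describe the
     * parameter it names, which is what the fix below restores for
     * the @act and @oact arguments of sys_rt_sigaction.
     */
    SYSCALL_DEFINE2(example, int, val, size_t, size)
    {
            return 0;
    }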
Reported-by: Matt Fleming Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 29e233fd7a0f..7165af5f1b11 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2711,8 +2711,8 @@ out: /** * sys_rt_sigaction - alter an action taken by a process * @sig: signal to be sent - * @act: the thread group ID of the thread - * @oact: the PID of the thread + * @act: new sigaction + * @oact: used to save the previous sigaction * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigaction, int, sig, -- cgit v1.2.2 From e566b76ed30768140df8f0023904aed5a41244f7 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 6 Apr 2011 02:54:54 +0200 Subject: perf_event: Fix cgrp event scheduling bug in perf_enable_on_exec() There is a bug in perf_event_enable_on_exec() when cgroup events are active on a CPU: the cgroup events may be scheduled twice, causing event state corruptions which eventually may lead to kernel panics. The reason is that the function needs to first schedule out the cgroup events, just like for the per-thread events. The cgroup events are scheduled back in automatically from the perf_event_context_sched_in() function. The patch also adds a WARN_ON_ONCE() in perf_cgroup_switch() to catch any bogus state. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110406005454.GA1062@quad Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 27960f114efd..8e81a9860a0d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -364,6 +364,7 @@ void perf_cgroup_switch(struct task_struct *task, int mode) } if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); /* set cgrp before ctxsw in to * allow event_filter_match() to not * have to pass task around @@ -2423,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) if (!ctx || !ctx->nr_events) goto out; + /* + * We must ctxsw out cgroup events to avoid conflict + * when invoking perf_task_event_sched_in() later on + * in this function. Otherwise we end up trying to + * ctxswin cgroup events which are already scheduled + * in. + */ + perf_cgroup_sched_out(current); task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); @@ -2447,6 +2456,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); + /* + * Also calls ctxswin for cgroup events, if any: + */ perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); -- cgit v1.2.2 From b0432d8f162c7d5d9537b4cb749d44076b76a783 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Thu, 7 Apr 2011 17:23:22 -0700 Subject: sched: Fix sched-domain avg_load calculation In function find_busiest_group(), the sched-domain avg_load isn't calculated at all if there is a group imbalance within the domain. This will cause an erroneous imbalance calculation. The reason is that calculate_imbalance() sees sds->avg_load = 0 and it will dump the entire sds->max_load into the imbalance variable, which is used later on to migrate the entire load from the busiest CPU to the puller CPU. This has two really bad effects: 1. stampede of task migration, and they won't be able to break out of the bad state because of a positive feedback loop: large load delta -> heavier load migration -> larger imbalance and the cycle goes on. 2. 
severe imbalance in CPU queue depth. This causes a really long scheduling latency blip, which is very bad for applications with tight latency requirements. The fix is to have the kernel calculate the domain avg_load in both cases. This will ensure that the imbalance calculation is always sensible and the target is usually halfway between the busiest and the puller CPU. Signed-off-by: Ken Chen Signed-off-by: Peter Zijlstra Cc: Link: http://lkml.kernel.org/r/20110408002322.3A0D812217F@elm.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7f00772e57c9..60f9d407c5ec 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3127,6 +3127,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; + /* * If the busiest group is imbalanced the below checks don't * work because they assumes all things are equal, which typically @@ -3151,7 +3153,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Don't pull any tasks if this group is already above the domain * average load. */ - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; if (sds.this_load >= sds.avg_load) goto out_balanced; -- cgit v1.2.2 From b30aef17f71cf9e24b10c11cbb5e5f0ebe8a85ab Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Fri, 8 Apr 2011 12:20:16 -0700 Subject: sched: Fix erroneous all_pinned logic The scheduler load balancer has specific code to deal with cases of an unbalanced system due to lots of unmovable tasks (for example because of hard CPU affinity). In those situations, it excludes the busiest CPU that has pinned tasks from load-balance consideration such that it can perform a second load-balance pass on the rest of the system. This all works as designed if there is only one cgroup in the system. However, when we have multiple cgroups, this logic has false positives and triggers multiple load-balance passes even though there are actually no pinned tasks at all. The reason it has false positives is that the all-pinned logic is deep in the lowest function, can_migrate_task(), and is too low level: load_balance_fair() iterates over each task group and calls balance_tasks() to migrate the target load. Along the way, balance_tasks() will also set an all_pinned variable. Given that task-groups are iterated, this all_pinned variable is essentially the status of the last group in the scanning process. A task group can have any number of reasons why no load gets migrated, none of them due to cpu affinity. However, this status bit is being propagated back up to the higher-level load_balance(), which incorrectly thinks that no tasks were moved. It kicks off the all-pinned logic and starts multiple passes attempting to move load onto the puller CPU. To fix this, move the all_pinned aggregation up to the iterator level. This ensures that the status is aggregated over all task-groups, not just the last one in the list. 
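The shape of the bug is worth spelling out, since it is a generic pitfall: a flag computed inside a loop over groups must be aggregated across iterations rather than overwritten by each one. A stand-alone sketch (hypothetical types and names, not the scheduler code itself):

    #include <stdbool.h>

    struct group { int nr_movable; };

    /* Buggy shape: the flag ends up reflecting only the LAST group. */
    static bool all_pinned_buggy(const struct group *g, int n)
    {
            bool all_pinned = false;
            for (int i = 0; i < n; i++)
                    all_pinned = (g[i].nr_movable == 0); /* overwritten */
            return all_pinned;
    }

    /* Fixed shape: start pessimistic and clear the flag as soon as any
     * group in the whole scan has movable work. */
    static bool all_pinned_fixed(const struct group *g, int n)
    {
            bool all_pinned = true;
            for (int i = 0; i < n; i++)
                    if (g[i].nr_movable > 0)
                            all_pinned = false;
            return all_pinned;
    }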
Signed-off-by: Ken Chen Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/BANLkTi=ernzNawaR5tJZEsV_QVnfxqXmsQ@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 60f9d407c5ec..6fa833ab2cb8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2104,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct cfs_rq *busiest_cfs_rq) { - int loops = 0, pulled = 0, pinned = 0; + int loops = 0, pulled = 0; long rem_load_move = max_load_move; struct task_struct *p, *n; if (max_load_move == 0) goto out; - pinned = 1; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) break; if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) + !can_migrate_task(p, busiest, this_cpu, sd, idle, + all_pinned)) continue; pull_task(busiest, p, this_rq, this_cpu); @@ -2153,9 +2152,6 @@ out: */ schedstat_add(sd, lb_gained[idle], pulled); - if (all_pinned) - *all_pinned = pinned; - return max_load_move - rem_load_move; } @@ -3341,6 +3337,7 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ + all_pinned = 1; local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, -- cgit v1.2.2 From 1f112cee07b314e244ee9e71d9c1e6950dc13327 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 11 Apr 2011 22:54:42 +0200 Subject: PM / Hibernate: Introduce CONFIG_HIBERNATE_CALLBACKS Xen save/restore is going to use hibernate device callbacks for quiescing devices and putting them back to normal operation, and it would need to select CONFIG_HIBERNATION for this purpose. However, that also would cause the hibernate interfaces for user space to be enabled, which might confuse user space, because the Xen kernels don't support hibernation. Moreover, it would be wasteful, as it would make the Xen kernels include a substantial amount of code that they would never use. To address this issue, introduce a new power management Kconfig option, CONFIG_HIBERNATE_CALLBACKS, such that it will only select the code that is necessary for the hibernate device callbacks to work, and make CONFIG_HIBERNATION select it. Then, Xen save/restore will be able to select CONFIG_HIBERNATE_CALLBACKS without dragging the entire hibernate code along with it. Signed-off-by: Rafael J. Wysocki Tested-by: Shriram Rajagopalan --- kernel/power/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4603f08dc47b..049791468d37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -18,9 +18,13 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. 
+config HIBERNATE_CALLBACKS + bool + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATE_CALLBACKS select LZO_COMPRESS select LZO_DECOMPRESS ---help--- @@ -85,7 +89,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y - depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE + depends on SUSPEND || HIBERNATE_CALLBACKS || XEN_SAVE_RESTORE config PM_SLEEP_SMP def_bool y -- cgit v1.2.2 From d419e4c0f7584ffc5c72d9aeeaac485cc756ebcf Mon Sep 17 00:00:00 2001 From: Shriram Rajagopalan Date: Mon, 11 Apr 2011 22:54:48 +0200 Subject: fix XEN_SAVE_RESTORE Kconfig dependencies Make XEN_SAVE_RESTORE select HIBERNATE_CALLBACKS. Remove XEN_SAVE_RESTORE dependency from PM_SLEEP. Signed-off-by: Shriram Rajagopalan Acked-by: Ian Campbell Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 049791468d37..6de9a8fc3417 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -89,7 +89,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y - depends on SUSPEND || HIBERNATE_CALLBACKS || XEN_SAVE_RESTORE + depends on SUSPEND || HIBERNATE_CALLBACKS config PM_SLEEP_SMP def_bool y -- cgit v1.2.2 From d9c97833179036408e53ef5f3f5c7eaf781769bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2011 10:06:33 +0200 Subject: block: remove block_unplug_timer() trace point We no longer have an unplug timer running, so no point in keeping the trace point. Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7aa40f8e182d..824708cbfb7b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -863,19 +863,6 @@ static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) } } -static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, - sizeof(rpdu), &rpdu); - } -} - static void blk_add_trace_split(void *ignore, struct request_queue *q, struct bio *bio, unsigned int pdu) @@ -1015,8 +1002,6 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); - WARN_ON(ret); ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); @@ -1033,7 +1018,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); @@ -1348,7 +1332,6 @@ static const struct { [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, - [__BLK_TA_UNPLUG_TIMER] = {{ "UT", 
"unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, -- cgit v1.2.2 From 94b5eb28b41cc79d9713696e0005ae167b5afd1b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2011 10:12:19 +0200 Subject: block: fixup block IO unplug trace call It was removed with the on-stack plugging, readd it and track the depth of requests added when flushing the plug. Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 824708cbfb7b..3e3970d53d14 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -850,13 +850,13 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) +static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q, + unsigned int depth) { struct blk_trace *bt = q->blk_trace; if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); + __be64 rpdu = cpu_to_be64(depth); __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, sizeof(rpdu), &rpdu); -- cgit v1.2.2 From 6631e635c65dc33cb798cc2f51d0ddd69ada6319 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 Apr 2011 08:08:20 -0700 Subject: block: don't flush plugged IO on forced preemtion scheduling We really only want to unplug the pending IO when the process actually goes to sleep. So move the test for flushing the plug up to the place where we actually deactivate the task - where we have properly checked for preemption and for the process really sleeping. Acked-by: Jens Axboe Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 48013633d792..a187c3fe027b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4111,20 +4111,20 @@ need_resched: try_to_wake_up_local(to_wakeup); } deactivate_task(rq, prev, DEQUEUE_SLEEP); + + /* + * If we are going to sleep and we have plugged IO queued, make + * sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(prev)) { + raw_spin_unlock(&rq->lock); + blk_flush_plug(prev); + raw_spin_lock(&rq->lock); + } } switch_count = &prev->nvcsw; } - /* - * If we are going to sleep and we have plugged IO queued, make - * sure to submit it to avoid deadlocks. - */ - if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) { - raw_spin_unlock(&rq->lock); - blk_flush_plug(prev); - raw_spin_lock(&rq->lock); - } - pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) -- cgit v1.2.2 From 0cd9c6494ee5c19aef085152bc37f3a4e774a9e1 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 14 Apr 2011 15:41:57 -0700 Subject: futex: Set FLAGS_HAS_TIMEOUT during futex_wait restart setup The FLAGS_HAS_TIMEOUT flag was not getting set, causing the restart_block to restart futex_wait() without a timeout after a signal. Commit b41277dc7a18ee332d in 2.6.38 introduced the regression by accidentally removing the the FLAGS_HAS_TIMEOUT assignment from futex_wait() during the setup of the restart block. Restore the originaly behavior. 
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=32922 Reported-by: Tim Smith Reported-by: Torsten Hilbrich Signed-off-by: Darren Hart Signed-off-by: Eric Dumazet Cc: Peter Zijlstra Cc: John Kacur Cc: stable@kernel.org Link: http://lkml.kernel.org/r/%3Cdaac0eb3af607f72b9a4d3126b2ba8fb5ed3b883.1302820917.git.dvhart%40linux.intel.com%3E Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index dfb924ffe65b..fe28dc282eae 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1886,7 +1886,7 @@ retry: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = flags; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; ret = -ERESTART_RESTARTBLOCK; -- cgit v1.2.2 From a237c1c5bc5dc5c76a21be922dca4826f3eca8ca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Apr 2011 13:27:55 +0200 Subject: block: let io_schedule() flush the plug inline Linus correctly observes that the most important dispatch cases are now done from kblockd; this isn't ideal for latency reasons. The original reason for switching dispatches out-of-line was to avoid too deep a stack, so by _only_ letting the "accidental" flush directly in schedule() be guarded by offload to kblockd, we should be able to get the best of both worlds. So add a blk_schedule_flush_plug() that offloads to kblockd, and only use that from the schedule() path. Signed-off-by: Jens Axboe --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a187c3fe027b..312f8b95c2d4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4118,7 +4118,7 @@ need_resched: */ if (blk_needs_flush_plug(prev)) { raw_spin_unlock(&rq->lock); - blk_flush_plug(prev); + blk_schedule_flush_plug(prev); raw_spin_lock(&rq->lock); } } -- cgit v1.2.2 From 49cac01e1fa74174d72adb0e872504a7fefd7c01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Apr 2011 13:51:05 +0200 Subject: block: make unplug timer trace event correspond to the schedule() unplug It's a pretty close match to what we had before - the timer triggering would mean that nobody unplugged the plug in due time; in the new scheme this matches very closely what the schedule() unplug now is. It's essentially the difference between an explicit unplug (IO unplug) or an implicit unplug (timer unplug, we scheduled with pending IO queued). 
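In caller terms, the two flavors the trace now distinguishes look roughly like this (a schematic of the on-stack plugging API; submit_batched_io() is a hypothetical stand-in for real bio submission):

    struct blk_plug plug;

    blk_start_plug(&plug);          /* traced as P (plug) */
    submit_batched_io();            /* hypothetical: queue a batch of bios */
    blk_finish_plug(&plug);         /* explicit unplug: traced as U */

    /* Implicit flavor: if the task goes to sleep while the plug is
     * still active, schedule() flushes it via kblockd and the event
     * is logged as UT (unplug_timer) instead of U. */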
Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3e3970d53d14..6957aa298dfa 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -850,16 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q, - unsigned int depth) +static void blk_add_trace_unplug(void *ignore, struct request_queue *q, + unsigned int depth, bool explicit) { struct blk_trace *bt = q->blk_trace; if (bt) { __be64 rpdu = cpu_to_be64(depth); + u32 what; - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, - sizeof(rpdu), &rpdu); + if (explicit) + what = BLK_TA_UNPLUG_IO; + else + what = BLK_TA_UNPLUG_TIMER; + + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); } } @@ -1002,7 +1007,7 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); @@ -1017,7 +1022,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); @@ -1332,6 +1337,7 @@ static const struct { [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, -- cgit v1.2.2 From 1791f881435fab951939ad700e947b66c062e083 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 30 Mar 2011 15:24:21 +0200 Subject: posix clocks: Replace mutex with reader/writer semaphore A dynamic posix clock is protected from asynchronous removal by a mutex. However, using a mutex has the unwanted effect that a long running clock operation in one process will unnecessarily block other processes. For example, one process might call read() to get an external time stamp coming in at one pulse per second. A second process calling clock_gettime would have to wait for almost a whole second. This patch fixes the issue by using a reader/writer semaphore instead of a mutex. 
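The locking rule the patch moves to can be summarized with the generic rwsem idiom (a simplified sketch of the pattern, not the patch itself; do_clock_op() is a hypothetical placeholder):

    /* Readers: every clock operation takes the semaphore shared, so a
     * slow read() in one process no longer serializes clock_gettime()
     * in another. */
    down_read(&clk->rwsem);
    if (!clk->zombie)
            do_clock_op(clk);       /* hypothetical clock operation */
    up_read(&clk->rwsem);

    /* Writer: only unregistration takes it exclusive, to flip the
     * zombie flag while no reader is inside an operation. */
    down_write(&clk->rwsem);
    clk->zombie = true;
    up_write(&clk->rwsem);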
Signed-off-by: Richard Cochran Cc: John Stultz Link: http://lkml.kernel.org/r/%3C20110330132421.GA31771%40riccoc20.at.omicron.at%3E Signed-off-by: Thomas Gleixner --- kernel/time/posix-clock.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 25028dd4fa18..c340ca658f37 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -19,7 +19,6 @@ */ #include #include -#include #include #include #include @@ -34,19 +33,19 @@ static struct posix_clock *get_posix_clock(struct file *fp) { struct posix_clock *clk = fp->private_data; - mutex_lock(&clk->mutex); + down_read(&clk->rwsem); if (!clk->zombie) return clk; - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); return NULL; } static void put_posix_clock(struct posix_clock *clk) { - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); } static ssize_t posix_clock_read(struct file *fp, char __user *buf, @@ -156,7 +155,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) struct posix_clock *clk = container_of(inode->i_cdev, struct posix_clock, cdev); - mutex_lock(&clk->mutex); + down_read(&clk->rwsem); if (clk->zombie) { err = -ENODEV; @@ -172,7 +171,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) fp->private_data = clk; } out: - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); return err; } @@ -211,25 +210,20 @@ int posix_clock_register(struct posix_clock *clk, dev_t devid) int err; kref_init(&clk->kref); - mutex_init(&clk->mutex); + init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); clk->cdev.owner = clk->ops.owner; err = cdev_add(&clk->cdev, devid, 1); - if (err) - goto no_cdev; return err; -no_cdev: - mutex_destroy(&clk->mutex); - return err; } EXPORT_SYMBOL_GPL(posix_clock_register); static void delete_clock(struct kref *kref) { struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - mutex_destroy(&clk->mutex); + if (clk->release) clk->release(clk); } @@ -238,9 +232,9 @@ void posix_clock_unregister(struct posix_clock *clk) { cdev_del(&clk->cdev); - mutex_lock(&clk->mutex); + down_write(&clk->rwsem); clk->zombie = true; - mutex_unlock(&clk->mutex); + up_write(&clk->rwsem); kref_put(&clk->kref, delete_clock); } -- cgit v1.2.2 From c78193e9c7bcbf25b8237ad0dec82f805c4ea69b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 18 Apr 2011 10:35:30 -0700 Subject: next_pidmap: fix overflow condition next_pidmap() just quietly accepted whatever 'last' pid was passed in, which is not all that safe when one of the users is /proc. Admittedly the proc code should do some sanity checking on the range (and that will be the next commit), but that doesn't mean that the helper functions should just do that pidmap pointer arithmetic without checking the range of their arguments. So clamp 'last' to PID_MAX_LIMIT. The fact that we then do "last+1" doesn't really matter; the for-loop does check against the end of the pidmap array properly (it's only the actual pointer arithmetic overflow case we need to worry about, and going one bit beyond isn't going to overflow). [ Use PID_MAX_LIMIT rather than pid_max as per Eric Biederman ] Reported-by: Tavis Ormandy Analyzed-by: Robert Święcki Cc: Eric W. 
Biederman Cc: Pavel Emelyanov Signed-off-by: Linus Torvalds --- kernel/pid.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 02f221274265..57a8346a270e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) return -1; } -int next_pidmap(struct pid_namespace *pid_ns, int last) +int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) { int offset; struct pidmap *map, *end; + if (last >= PID_MAX_LIMIT) + return -1; + offset = (last + 1) & BITS_PER_PAGE_MASK; map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; end = &pid_ns->pidmap[PIDMAP_ENTRIES]; -- cgit v1.2.2 From 2ca6f62f595c01f689b269db6736de5544da7667 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 18 Apr 2011 23:58:59 +0200 Subject: PM: Fix error code paths executed after failing syscore_suspend() If syscore_suspend() fails in suspend_enter(), create_image() or resume_target_kernel(), it is necessary to call sysdev_resume(), because sysdev_suspend() has been called already and succeeded and we are going to abort the transition. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- kernel/power/hibernate.c | 10 ++++++++-- kernel/power/suspend.c | 5 ++++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aeabd26e3342..50aae660174d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -273,8 +273,11 @@ static int create_image(int platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_FREEZE); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); @@ -407,8 +410,11 @@ static int resume_target_kernel(bool platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_QUIESCE); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) goto Enable_irqs; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 2814c32aed51..8935369d503a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -164,8 +164,11 @@ static int suspend_enter(suspend_state_t state) BUG_ON(!irqs_disabled()); error = sysdev_suspend(PMSG_SUSPEND); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (!error) { if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); -- cgit v1.2.2 From 19234c0819da0e043a02710488dfd9b242b42eba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Apr 2011 00:36:11 +0200 Subject: PM: Add missing syscore_suspend() and syscore_resume() calls Device suspend/resume infrastructure is used not only by the suspend and hibernate code in kernel/power, but also by APM, Xen and the kexec jump feature. However, commit 40dc166cb5dddbd36aa4ad11c03915ea (PM / Core: Introduce struct syscore_ops for core subsystems PM) failed to add syscore_suspend() and syscore_resume() calls to that code, which generally leads to breakage when the features in question are used. To fix this problem, add the missing syscore_suspend() and syscore_resume() calls to arch/x86/kernel/apm_32.c, kernel/kexec.c and drivers/xen/manage.c. Signed-off-by: Rafael J. 
Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Ian Campbell --- kernel/kexec.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 55936f9cb251..87b77de03dd3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1532,6 +1533,11 @@ int kernel_kexec(void) local_irq_disable(); /* Suspend system devices */ error = sysdev_suspend(PMSG_FREEZE); + if (!error) { + error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) goto Enable_irqs; } else @@ -1546,6 +1552,7 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { + syscore_resume(); sysdev_resume(); Enable_irqs: local_irq_enable(); -- cgit v1.2.2
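The pattern shared by this kexec fix and the suspend/hibernate fixes above generalizes to any multi-stage suspend: each stage that succeeded must be unwound, in reverse order, if a later stage fails. A stand-alone sketch with hypothetical stages (the example_* helpers are stubs invented for illustration):

    /* Hypothetical two-stage suspend with unwind on partial failure. */
    static int example_suspend_devices(void) { return 0; }  /* stage A, stub */
    static void example_resume_devices(void) { }
    static int example_suspend_core(void) { return 0; }     /* stage B, stub */

    static int example_suspend_all(void)
    {
            int error;

            error = example_suspend_devices();      /* stage A */
            if (error)
                    return error;

            error = example_suspend_core();         /* stage B */
            if (error) {
                    example_resume_devices();       /* undo stage A */
                    return error;
            }
            return 0;       /* both stages down; resume runs in reverse */
    }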