From 408af87a397a8ddef56ad39a79481f592aa1ac1a Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 4 Nov 2010 21:44:41 +0100
Subject: Clean up relay_alloc_page_array() slightly by using vzalloc rather
 than vmalloc and memset

We can optimize kernel/relay.c::relay_alloc_page_array() slightly by
using vzalloc.  The patch makes these changes:

 - use vzalloc instead of vmalloc+memset.
 - remove redundant local variable 'array'.
 - declare local 'pa_size' as const.

Cuts down nicely on both source and object-code size.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Acked-by: Pekka Enberg <penberg@kernel.org>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/relay.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index c7cf397fb92..859ea5a9605 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
  */
 static struct page **relay_alloc_page_array(unsigned int n_pages)
 {
-	struct page **array;
-	size_t pa_size = n_pages * sizeof(struct page *);
-
-	if (pa_size > PAGE_SIZE) {
-		array = vmalloc(pa_size);
-		if (array)
-			memset(array, 0, pa_size);
-	} else {
-		array = kzalloc(pa_size, GFP_KERNEL);
-	}
-	return array;
+	const size_t pa_size = n_pages * sizeof(struct page *);
+	if (pa_size > PAGE_SIZE)
+		return vzalloc(pa_size);
+	return kzalloc(pa_size, GFP_KERNEL);
 }
 
 /*
-- 
cgit v1.2.2


From e0a70217107e6f9844628120412cb27bb4cea194 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 5 Nov 2010 16:53:42 +0100
Subject: posix-cpu-timers: workaround to suppress the problems with mt exec

posix-cpu-timers.c correctly assumes that the dying process does
posix_cpu_timers_exit_group() and removes all !CPUCLOCK_PERTHREAD
timers from signal->cpu_timers list.

But, it also assumes that timer->it.cpu.task is always the group
leader, and thus the dead ->task means the dead thread group.

This is obviously not true after de_thread() changes the leader.
After that almost every posix_cpu_timer_ method has problems.

It is not simple to fix this bug correctly. First of all, I think
that timer->it.cpu should use struct pid instead of task_struct.
Also, the locking should be reworked completely. In particular,
tasklist_lock should not be used at all. This all needs a lot of
nontrivial and hard-to-test changes.

Change __exit_signal() to do posix_cpu_timers_exit_group() when
the old leader dies during exec. This is not the fix, just the
temporary hack to hide the problem for 2.6.37 and stable. IOW,
this is obviously wrong but this is what we currently have anyway:
cpu timers do not work after mt exec.

In theory this change adds another race. The exiting leader can
detach the timers which were attached to the new leader. However,
the window between de_thread() and release_task() is small, we
can pretend that sys_timer_create() was called before de_thread().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index b194febf579..21aa7b3001f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -95,6 +95,14 @@ static void __exit_signal(struct task_struct *tsk)
 		tty = sig->tty;
 		sig->tty = NULL;
 	} else {
+		/*
+		 * This can only happen if the caller is de_thread().
+		 * FIXME: this is the temporary hack, we should teach
+		 * posix-cpu-timers to handle this case correctly.
+		 */
+		if (unlikely(has_group_leader_pid(tsk)))
+			posix_cpu_timers_exit_group(tsk);
+
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
-- 
cgit v1.2.2


From 433039e97f672b81e6c8f6daef385dcf035c6e29 Mon Sep 17 00:00:00 2001
From: David Daney <ddaney@caviumnetworks.com>
Date: Fri, 5 Nov 2010 16:17:39 -0700
Subject: watchdog: Fix section mismatch and potential undefined behavior.

Commit d9ca07a05ce1 ("watchdog: Avoid kernel crash when disabling
watchdog") introduces a section mismatch.

Now that we reference no_watchdog from non-__init code it can no longer
be __initdata.

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index bafba687a6d..6e3c41a4024 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
 
-static int __initdata no_watchdog;
+static int no_watchdog;
 
 
 /* boot commands */
-- 
cgit v1.2.2


From 02e031cbc843b010e72fcc05c76113c688b2860f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Nov 2010 14:54:09 +0100
Subject: block: remove REQ_HARDBARRIER

REQ_HARDBARRIER is dead now, so remove the leftovers.  What's left
at this point is:

 - various checks inside the block layer.
 - sanity checks in bio based drivers.
 - now unused bio_empty_barrier helper.
 - Xen blockfront use of BLKIF_OP_WRITE_BARRIER - it's dead for a while,
   but Xen really needs to sort out it's barrier situaton.
 - setting of ordered tags in uas - dead code copied from old scsi
   drivers.
 - scsi different retry for barriers - it's dead and should have been
   removed when flushes were converted to FS requests.
 - blktrace handling of barriers - removed.  Someone who knows blktrace
   better should add support for REQ_FLUSH and REQ_FUA, though.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 kernel/trace/blktrace.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc251ed6672..7b8ec028154 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
 				 BLK_TC_ACT(BLK_TC_WRITE) };
 
-#define BLK_TC_HARDBARRIER	BLK_TC_BARRIER
 #define BLK_TC_RAHEAD		BLK_TC_AHEAD
 
 /* The ilog2() calls fall out because they're constant */
@@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		return;
 
 	what |= ddir_act[rw & WRITE];
-	what |= MASK_TC_BIT(rw, HARDBARRIER);
 	what |= MASK_TC_BIT(rw, SYNC);
 	what |= MASK_TC_BIT(rw, RAHEAD);
 	what |= MASK_TC_BIT(rw, META);
@@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 
 	if (rw & REQ_RAHEAD)
 		rwbs[i++] = 'A';
-	if (rw & REQ_HARDBARRIER)
-		rwbs[i++] = 'B';
 	if (rw & REQ_SYNC)
 		rwbs[i++] = 'S';
 	if (rw & REQ_META)
-- 
cgit v1.2.2


From eed01528a45dc4138e9a08064b4b6cc1a9426899 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Tue, 26 Oct 2010 16:08:01 +0200
Subject: perf_events: Fix time tracking in samples

This patch corrects time tracking in samples. Without this patch
both time_enabled and time_running are bogus when user asks for
PERF_SAMPLE_READ.

One uses PERF_SAMPLE_READ to sample the values of other counters
in each sample. Because of multiplexing, it is necessary to know
both time_enabled, time_running to be able to scale counts correctly.

In this second version of the patch, we maintain a shadow
copy of ctx->time which allows us to compute ctx->time without
calling update_context_time() from NMI context. We avoid the
issue that update_context_time() must always be called with
ctx->lock held.

We do not keep shadow copies of the other event timings
because if the lead event is overflowing then it is active
and thus it's been scheduled in via event_sched_in() in
which case neither tstamp_stopped, tstamp_running can be modified.

This timing logic only applies to samples when PERF_SAMPLE_READ
is used.

Note that this patch does not address timing issues related
to sampling inheritance between tasks. This will be addressed
in a future patch.

With this patch, the libpfm4 example task_smpl now reports
correct counts (shown on 2.4GHz Core 2):

$ task_smpl -p 2400000000 -e unhalted_core_cycles:u,instructions_retired:u,baclears  noploop 5
noploop for 5 seconds
IIP:0x000000004006d6 PID:5596 TID:5596 TIME:466,210,211,430 STREAM_ID:33 PERIOD:2,400,000,000 ENA=1,010,157,814 RUN=1,010,157,814 NR=3
	2,400,000,254 unhalted_core_cycles:u (33)
	2,399,273,744 instructions_retired:u (34)
	53,340 baclears (35)

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4cc6e14b.1e07e30a.256e.5190@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 517d827f498..cb6c0d2af68 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -674,6 +674,8 @@ event_sched_in(struct perf_event *event,
 
 	event->tstamp_running += ctx->time - event->tstamp_stopped;
 
+	event->shadow_ctx_time = ctx->time - ctx->timestamp;
+
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -3396,7 +3398,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
 }
 
 static void perf_output_read_one(struct perf_output_handle *handle,
-				 struct perf_event *event)
+				 struct perf_event *event,
+				 u64 enabled, u64 running)
 {
 	u64 read_format = event->attr.read_format;
 	u64 values[4];
@@ -3404,11 +3407,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 
 	values[n++] = perf_event_count(event);
 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = event->total_time_enabled +
+		values[n++] = enabled +
 			atomic64_read(&event->child_total_time_enabled);
 	}
 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = event->total_time_running +
+		values[n++] = running +
 			atomic64_read(&event->child_total_time_running);
 	}
 	if (read_format & PERF_FORMAT_ID)
@@ -3421,7 +3424,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
  */
 static void perf_output_read_group(struct perf_output_handle *handle,
-			    struct perf_event *event)
+			    struct perf_event *event,
+			    u64 enabled, u64 running)
 {
 	struct perf_event *leader = event->group_leader, *sub;
 	u64 read_format = event->attr.read_format;
@@ -3431,10 +3435,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 	values[n++] = 1 + leader->nr_siblings;
 
 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		values[n++] = leader->total_time_enabled;
+		values[n++] = enabled;
 
 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		values[n++] = leader->total_time_running;
+		values[n++] = running;
 
 	if (leader != event)
 		leader->pmu->read(leader);
@@ -3459,13 +3463,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 	}
 }
 
+#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
+				 PERF_FORMAT_TOTAL_TIME_RUNNING)
+
 static void perf_output_read(struct perf_output_handle *handle,
 			     struct perf_event *event)
 {
+	u64 enabled = 0, running = 0, now, ctx_time;
+	u64 read_format = event->attr.read_format;
+
+	/*
+	 * compute total_time_enabled, total_time_running
+	 * based on snapshot values taken when the event
+	 * was last scheduled in.
+	 *
+	 * we cannot simply called update_context_time()
+	 * because of locking issue as we are called in
+	 * NMI context
+	 */
+	if (read_format & PERF_FORMAT_TOTAL_TIMES) {
+		now = perf_clock();
+		ctx_time = event->shadow_ctx_time + now;
+		enabled = ctx_time - event->tstamp_enabled;
+		running = ctx_time - event->tstamp_running;
+	}
+
 	if (event->attr.read_format & PERF_FORMAT_GROUP)
-		perf_output_read_group(handle, event);
+		perf_output_read_group(handle, event, enabled, running);
 	else
-		perf_output_read_one(handle, event);
+		perf_output_read_one(handle, event, enabled, running);
 }
 
 void perf_output_sample(struct perf_output_handle *handle,
-- 
cgit v1.2.2


From 834b40380e93e36f1c9b48ec1d280cebe3d7bd8c Mon Sep 17 00:00:00 2001
From: Alexey Khoroshilov <khoroshilov@ispras.ru>
Date: Thu, 11 Nov 2010 14:05:14 -0800
Subject: kernel/range.c: fix clean_sort_range() for the case of full array

clean_sort_range() should return a number of nonempty elements of range
array, but if the array is full clean_sort_range() returns 0.

The problem is that the number of nonempty elements is evaluated by
finding the first empty element of the array.  If there is no such element
it returns an initial value of local variable nr_range that is zero.

The fix is trivial: it changes initial value of nr_range to size of the
array.

The bug can lead to loss of information regarding all ranges, since
typically returned value of clean_sort_range() is considered as an actual
number of ranges in the array after a series of add/subtract operations.

Found by Analytical Verification project of Linux Verification Center
(linuxtesting.org), thanks to Alexander Kolosov.

Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/range.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb..37fa9b99ad5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
 
 int clean_sort_range(struct range *range, int az)
 {
-	int i, j, k = az - 1, nr_range = 0;
+	int i, j, k = az - 1, nr_range = az;
 
 	for (i = 0; i < k; i++) {
 		if (range[i].end)
-- 
cgit v1.2.2


From 38715258aa2e8cd94bd4aafadc544e5104efd551 Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Thu, 11 Nov 2010 14:05:16 -0800
Subject: latencytop: fix per task accumulator

Per task latencytop accumulator prematurely terminates due to erroneous
placement of latency_record_count.  It should be incremented whenever a
new record is allocated instead of increment on every latencytop event.

Also fix search iterator to only search known record events instead of
blindly searching all pre-allocated space.

Signed-off-by: Ken Chen <kenchen@google.com>
Reviewed-by: Arjan van de Ven <arjan@infradead.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/latencytop.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d41..17110a4a4fc 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
 
 	account_global_scheduler_latency(tsk, &lat);
 
-	/*
-	 * short term hack; if we're > 32 we stop; future we recycle:
-	 */
-	tsk->latency_record_count++;
-	if (tsk->latency_record_count >= LT_SAVECOUNT)
-		goto out_unlock;
-
-	for (i = 0; i < LT_SAVECOUNT; i++) {
+	for (i = 0; i < tsk->latency_record_count; i++) {
 		struct latency_record *mylat;
 		int same = 1;
 
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
 		}
 	}
 
+	/*
+	 * short term hack; if we're > 32 we stop; future we recycle:
+	 */
+	if (tsk->latency_record_count >= LT_SAVECOUNT)
+		goto out_unlock;
+
 	/* Allocated a new one: */
-	i = tsk->latency_record_count;
+	i = tsk->latency_record_count++;
 	memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
 
 out_unlock:
-- 
cgit v1.2.2


From eaf06b241b091357e72b76863ba16e89610d31bd Mon Sep 17 00:00:00 2001
From: Dan Rosenberg <drosenberg@vsecurity.com>
Date: Thu, 11 Nov 2010 14:05:18 -0800
Subject: Restrict unprivileged access to kernel syslog

The kernel syslog contains debugging information that is often useful
during exploitation of other vulnerabilities, such as kernel heap
addresses.  Rather than futilely attempt to sanitize hundreds (or
thousands) of printk statements and simultaneously cripple useful
debugging functionality, it is far simpler to create an option that
prevents unprivileged users from reading the syslog.

This patch, loosely based on grsecurity's GRKERNSEC_DMESG, creates the
dmesg_restrict sysctl.  When set to "0", the default, no restrictions are
enforced.  When set to "1", only users with CAP_SYS_ADMIN can read the
kernel syslog via dmesg(8) or other mechanisms.

[akpm@linux-foundation.org: explain the config option in kernel.txt]
Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Eugene Teo <eugeneteo@kernel.org>
Acked-by: Kees Cook <kees.cook@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 6 ++++++
 kernel/sysctl.c | 9 +++++++++
 2 files changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index b2ebaee8c37..38e7d5868d6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -261,6 +261,12 @@ static inline void boot_delay_msec(void)
 }
 #endif
 
+#ifdef CONFIG_SECURITY_DMESG_RESTRICT
+int dmesg_restrict = 1;
+#else
+int dmesg_restrict;
+#endif
+
 int do_syslog(int type, char __user *buf, int len, bool from_file)
 {
 	unsigned i, j, limit, count;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c33a1edb799..b65bf634035 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -703,6 +703,15 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &ten_thousand,
 	},
 #endif
+	{
+		.procname	= "dmesg_restrict",
+		.data		= &dmesg_restrict,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{
 		.procname	= "ngroups_max",
 		.data		= &ngroups_max,
-- 
cgit v1.2.2


From 12b3052c3ee8f508b2c7ee4ddd63ed03423409d8 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 15 Nov 2010 18:36:29 -0500
Subject: capabilities/syslog: open code cap_syslog logic to fix build failure

The addition of CONFIG_SECURITY_DMESG_RESTRICT resulted in a build
failure when CONFIG_PRINTK=n.  This is because the capabilities code
which used the new option was built even though the variable in question
didn't exist.

The patch here fixes this by moving the capabilities checks out of the
LSM and into the caller.  All (known) LSMs should have been calling the
capabilities hook already so it actually makes the code organization
better to eliminate the hook altogether.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 38e7d5868d6..9a2264fc42c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -274,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 	char c;
 	int error = 0;
 
-	error = security_syslog(type, from_file);
+	/*
+	 * If this is from /proc/kmsg we only do the capabilities checks
+	 * at open time.
+	 */
+	if (type == SYSLOG_ACTION_OPEN || !from_file) {
+		if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		if ((type != SYSLOG_ACTION_READ_ALL &&
+		     type != SYSLOG_ACTION_SIZE_BUFFER) &&
+		    !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+	}
+
+	error = security_syslog(type);
 	if (error)
 		return error;
 
-- 
cgit v1.2.2