From 408af87a397a8ddef56ad39a79481f592aa1ac1a Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 4 Nov 2010 21:44:41 +0100 Subject: Clean up relay_alloc_page_array() slightly by using vzalloc rather than vmalloc and memset We can optimize kernel/relay.c::relay_alloc_page_array() slightly by using vzalloc. The patch makes these changes: - use vzalloc instead of vmalloc+memset. - remove redundant local variable 'array'. - declare local 'pa_size' as const. Cuts down nicely on both source and object-code size. Signed-off-by: Jesper Juhl Acked-by: Pekka Enberg Acked-by: Mathieu Desnoyers Signed-off-by: Linus Torvalds --- kernel/relay.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index c7cf397fb92..859ea5a9605 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = { */ static struct page **relay_alloc_page_array(unsigned int n_pages) { - struct page **array; - size_t pa_size = n_pages * sizeof(struct page *); - - if (pa_size > PAGE_SIZE) { - array = vmalloc(pa_size); - if (array) - memset(array, 0, pa_size); - } else { - array = kzalloc(pa_size, GFP_KERNEL); - } - return array; + const size_t pa_size = n_pages * sizeof(struct page *); + if (pa_size > PAGE_SIZE) + return vzalloc(pa_size); + return kzalloc(pa_size, GFP_KERNEL); } /* -- cgit v1.2.2 From e0a70217107e6f9844628120412cb27bb4cea194 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 5 Nov 2010 16:53:42 +0100 Subject: posix-cpu-timers: workaround to suppress the problems with mt exec posix-cpu-timers.c correctly assumes that the dying process does posix_cpu_timers_exit_group() and removes all !CPUCLOCK_PERTHREAD timers from signal->cpu_timers list. But, it also assumes that timer->it.cpu.task is always the group leader, and thus the dead ->task means the dead thread group. This is obviously not true after de_thread() changes the leader. After that almost every posix_cpu_timer_ method has problems. It is not simple to fix this bug correctly. First of all, I think that timer->it.cpu should use struct pid instead of task_struct. Also, the locking should be reworked completely. In particular, tasklist_lock should not be used at all. This all needs a lot of nontrivial and hard-to-test changes. Change __exit_signal() to do posix_cpu_timers_exit_group() when the old leader dies during exec. This is not the fix, just the temporary hack to hide the problem for 2.6.37 and stable. IOW, this is obviously wrong but this is what we currently have anyway: cpu timers do not work after mt exec. In theory this change adds another race. The exiting leader can detach the timers which were attached to the new leader. However, the window between de_thread() and release_task() is small, we can pretend that sys_timer_create() was called before de_thread(). Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b194febf579..21aa7b3001f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -95,6 +95,14 @@ static void __exit_signal(struct task_struct *tsk) tty = sig->tty; sig->tty = NULL; } else { + /* + * This can only happen if the caller is de_thread(). + * FIXME: this is the temporary hack, we should teach + * posix-cpu-timers to handle this case correctly. + */ + if (unlikely(has_group_leader_pid(tsk))) + posix_cpu_timers_exit_group(tsk); + /* * If there is any task waiting for the group exit * then notify it: -- cgit v1.2.2 From 433039e97f672b81e6c8f6daef385dcf035c6e29 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 5 Nov 2010 16:17:39 -0700 Subject: watchdog: Fix section mismatch and potential undefined behavior. Commit d9ca07a05ce1 ("watchdog: Avoid kernel crash when disabling watchdog") introduces a section mismatch. Now that we reference no_watchdog from non-__init code it can no longer be __initdata. Signed-off-by: David Daney Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bafba687a6d..6e3c41a4024 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif -static int __initdata no_watchdog; +static int no_watchdog; /* boot commands */ -- cgit v1.2.2 From 02e031cbc843b010e72fcc05c76113c688b2860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Nov 2010 14:54:09 +0100 Subject: block: remove REQ_HARDBARRIER REQ_HARDBARRIER is dead now, so remove the leftovers. What's left at this point is: - various checks inside the block layer. - sanity checks in bio based drivers. - now unused bio_empty_barrier helper. - Xen blockfront use of BLKIF_OP_WRITE_BARRIER - it's dead for a while, but Xen really needs to sort out it's barrier situaton. - setting of ordered tags in uas - dead code copied from old scsi drivers. - scsi different retry for barriers - it's dead and should have been removed when flushes were converted to FS requests. - blktrace handling of barriers - removed. Someone who knows blktrace better should add support for REQ_FLUSH and REQ_FUA, though. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc251ed6672..7b8ec028154 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; -#define BLK_TC_HARDBARRIER BLK_TC_BARRIER #define BLK_TC_RAHEAD BLK_TC_AHEAD /* The ilog2() calls fall out because they're constant */ @@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, HARDBARRIER); what |= MASK_TC_BIT(rw, SYNC); what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); @@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_HARDBARRIER) - rwbs[i++] = 'B'; if (rw & REQ_SYNC) rwbs[i++] = 'S'; if (rw & REQ_META) -- cgit v1.2.2 From eed01528a45dc4138e9a08064b4b6cc1a9426899 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 26 Oct 2010 16:08:01 +0200 Subject: perf_events: Fix time tracking in samples This patch corrects time tracking in samples. Without this patch both time_enabled and time_running are bogus when user asks for PERF_SAMPLE_READ. One uses PERF_SAMPLE_READ to sample the values of other counters in each sample. Because of multiplexing, it is necessary to know both time_enabled, time_running to be able to scale counts correctly. In this second version of the patch, we maintain a shadow copy of ctx->time which allows us to compute ctx->time without calling update_context_time() from NMI context. We avoid the issue that update_context_time() must always be called with ctx->lock held. We do not keep shadow copies of the other event timings because if the lead event is overflowing then it is active and thus it's been scheduled in via event_sched_in() in which case neither tstamp_stopped, tstamp_running can be modified. This timing logic only applies to samples when PERF_SAMPLE_READ is used. Note that this patch does not address timing issues related to sampling inheritance between tasks. This will be addressed in a future patch. With this patch, the libpfm4 example task_smpl now reports correct counts (shown on 2.4GHz Core 2): $ task_smpl -p 2400000000 -e unhalted_core_cycles:u,instructions_retired:u,baclears noploop 5 noploop for 5 seconds IIP:0x000000004006d6 PID:5596 TID:5596 TIME:466,210,211,430 STREAM_ID:33 PERIOD:2,400,000,000 ENA=1,010,157,814 RUN=1,010,157,814 NR=3 2,400,000,254 unhalted_core_cycles:u (33) 2,399,273,744 instructions_retired:u (34) 53,340 baclears (35) Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4cc6e14b.1e07e30a.256e.5190@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f498..cb6c0d2af68 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -674,6 +674,8 @@ event_sched_in(struct perf_event *event, event->tstamp_running += ctx->time - event->tstamp_stopped; + event->shadow_ctx_time = ctx->time - ctx->timestamp; + if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -3396,7 +3398,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) } static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_event *event) + struct perf_event *event, + u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[4]; @@ -3404,11 +3407,11 @@ static void perf_output_read_one(struct perf_output_handle *handle, values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = event->total_time_enabled + + values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = event->total_time_running + + values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) @@ -3421,7 +3424,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. */ static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; @@ -3431,10 +3435,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; + values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; + values[n++] = running; if (leader != event) leader->pmu->read(leader); @@ -3459,13 +3463,35 @@ static void perf_output_read_group(struct perf_output_handle *handle, } } +#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ + PERF_FORMAT_TOTAL_TIME_RUNNING) + static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { + u64 enabled = 0, running = 0, now, ctx_time; + u64 read_format = event->attr.read_format; + + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. + * + * we cannot simply called update_context_time() + * because of locking issue as we are called in + * NMI context + */ + if (read_format & PERF_FORMAT_TOTAL_TIMES) { + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + enabled = ctx_time - event->tstamp_enabled; + running = ctx_time - event->tstamp_running; + } + if (event->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, event); + perf_output_read_group(handle, event, enabled, running); else - perf_output_read_one(handle, event); + perf_output_read_one(handle, event, enabled, running); } void perf_output_sample(struct perf_output_handle *handle, -- cgit v1.2.2 From 834b40380e93e36f1c9b48ec1d280cebe3d7bd8c Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Thu, 11 Nov 2010 14:05:14 -0800 Subject: kernel/range.c: fix clean_sort_range() for the case of full array clean_sort_range() should return a number of nonempty elements of range array, but if the array is full clean_sort_range() returns 0. The problem is that the number of nonempty elements is evaluated by finding the first empty element of the array. If there is no such element it returns an initial value of local variable nr_range that is zero. The fix is trivial: it changes initial value of nr_range to size of the array. The bug can lead to loss of information regarding all ranges, since typically returned value of clean_sort_range() is considered as an actual number of ranges in the array after a series of add/subtract operations. Found by Analytical Verification project of Linux Verification Center (linuxtesting.org), thanks to Alexander Kolosov. Signed-off-by: Alexey Khoroshilov Cc: Yinghai Lu Cc: "H. Peter Anvin" Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/range.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index 471b66acabb..37fa9b99ad5 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2) int clean_sort_range(struct range *range, int az) { - int i, j, k = az - 1, nr_range = 0; + int i, j, k = az - 1, nr_range = az; for (i = 0; i < k; i++) { if (range[i].end) -- cgit v1.2.2 From 38715258aa2e8cd94bd4aafadc544e5104efd551 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Thu, 11 Nov 2010 14:05:16 -0800 Subject: latencytop: fix per task accumulator Per task latencytop accumulator prematurely terminates due to erroneous placement of latency_record_count. It should be incremented whenever a new record is allocated instead of increment on every latencytop event. Also fix search iterator to only search known record events instead of blindly searching all pre-allocated space. Signed-off-by: Ken Chen Reviewed-by: Arjan van de Ven Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/latencytop.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 877fb306d41..17110a4a4fc 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) account_global_scheduler_latency(tsk, &lat); - /* - * short term hack; if we're > 32 we stop; future we recycle: - */ - tsk->latency_record_count++; - if (tsk->latency_record_count >= LT_SAVECOUNT) - goto out_unlock; - - for (i = 0; i < LT_SAVECOUNT; i++) { + for (i = 0; i < tsk->latency_record_count; i++) { struct latency_record *mylat; int same = 1; @@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) } } + /* + * short term hack; if we're > 32 we stop; future we recycle: + */ + if (tsk->latency_record_count >= LT_SAVECOUNT) + goto out_unlock; + /* Allocated a new one: */ - i = tsk->latency_record_count; + i = tsk->latency_record_count++; memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: -- cgit v1.2.2 From eaf06b241b091357e72b76863ba16e89610d31bd Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Thu, 11 Nov 2010 14:05:18 -0800 Subject: Restrict unprivileged access to kernel syslog The kernel syslog contains debugging information that is often useful during exploitation of other vulnerabilities, such as kernel heap addresses. Rather than futilely attempt to sanitize hundreds (or thousands) of printk statements and simultaneously cripple useful debugging functionality, it is far simpler to create an option that prevents unprivileged users from reading the syslog. This patch, loosely based on grsecurity's GRKERNSEC_DMESG, creates the dmesg_restrict sysctl. When set to "0", the default, no restrictions are enforced. When set to "1", only users with CAP_SYS_ADMIN can read the kernel syslog via dmesg(8) or other mechanisms. [akpm@linux-foundation.org: explain the config option in kernel.txt] Signed-off-by: Dan Rosenberg Acked-by: Ingo Molnar Acked-by: Eugene Teo Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 6 ++++++ kernel/sysctl.c | 9 +++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index b2ebaee8c37..38e7d5868d6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -261,6 +261,12 @@ static inline void boot_delay_msec(void) } #endif +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c33a1edb799..b65bf634035 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -703,6 +703,15 @@ static struct ctl_table kern_table[] = { .extra2 = &ten_thousand, }, #endif + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "ngroups_max", .data = &ngroups_max, -- cgit v1.2.2 From 12b3052c3ee8f508b2c7ee4ddd63ed03423409d8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 15 Nov 2010 18:36:29 -0500 Subject: capabilities/syslog: open code cap_syslog logic to fix build failure The addition of CONFIG_SECURITY_DMESG_RESTRICT resulted in a build failure when CONFIG_PRINTK=n. This is because the capabilities code which used the new option was built even though the variable in question didn't exist. The patch here fixes this by moving the capabilities checks out of the LSM and into the caller. All (known) LSMs should have been calling the capabilities hook already so it actually makes the code organization better to eliminate the hook altogether. Signed-off-by: Eric Paris Acked-by: James Morris Signed-off-by: Linus Torvalds --- kernel/printk.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 38e7d5868d6..9a2264fc42c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -274,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) char c; int error = 0; - error = security_syslog(type, from_file); + /* + * If this is from /proc/kmsg we only do the capabilities checks + * at open time. + */ + if (type == SYSLOG_ACTION_OPEN || !from_file) { + if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + error = security_syslog(type); if (error) return error; -- cgit v1.2.2