aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2010-01-13 03:58:37 -0500
committerIngo Molnar <mingo@elte.hu>2010-01-13 04:08:50 -0500
commit61405fea92c42d072d9b8bd189689f1502a838af (patch)
tree013ea3e7ed71f4114004d5852d40b6e89e128f76 /kernel
parent9c443dfdd31eddea6cbe6ee0ca469fbcc4e1dc3b (diff)
parent1703f2c321a8a531c393e137a82602e16c6061cb (diff)
Merge branch 'perf/urgent' into perf/core
Merge reason: queue up dependent patch, update to -rc4 Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit_tree.c13
-rw-r--r--kernel/auditsc.c1
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/cgroup.c1
-rw-r--r--kernel/cpu.c24
-rw-r--r--kernel/exit.c36
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/hw_breakpoint.c10
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kfifo.c361
-rw-r--r--kernel/kmod.c12
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/kthread.c23
-rw-r--r--kernel/module.c30
-rw-r--r--kernel/perf_event.c9
-rw-r--r--kernel/printk.c4
-rw-r--r--kernel/resource.c32
-rw-r--r--kernel/sched.c307
-rw-r--r--kernel/sched_clock.c23
-rw-r--r--kernel/sched_fair.c53
-rw-r--r--kernel/sched_rt.c4
-rw-r--r--kernel/signal.c28
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/sysctl.c4
-rw-r--r--kernel/sysctl_binary.c31
-rw-r--r--kernel/time.c1
-rw-r--r--kernel/time/clockevents.c18
-rw-r--r--kernel/time/timekeeping.c27
-rw-r--r--kernel/time/timer_list.c4
-rw-r--r--kernel/timer.c2
-rw-r--r--kernel/trace/Kconfig112
-rw-r--r--kernel/trace/trace.c4
-rw-r--r--kernel/trace/trace_export.c7
-rw-r--r--kernel/trace/trace_kprobe.c7
-rw-r--r--kernel/trace/trace_ksym.c140
35 files changed, 834 insertions, 504 deletions
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2451dc6f3282..4b05bd9479db 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -277,7 +277,7 @@ static void untag_chunk(struct node *p)
277 owner->root = NULL; 277 owner->root = NULL;
278 } 278 }
279 279
280 for (i = j = 0; i < size; i++, j++) { 280 for (i = j = 0; j <= size; i++, j++) {
281 struct audit_tree *s; 281 struct audit_tree *s;
282 if (&chunk->owners[j] == p) { 282 if (&chunk->owners[j] == p) {
283 list_del_init(&p->list); 283 list_del_init(&p->list);
@@ -290,7 +290,7 @@ static void untag_chunk(struct node *p)
290 if (!s) /* result of earlier fallback */ 290 if (!s) /* result of earlier fallback */
291 continue; 291 continue;
292 get_tree(s); 292 get_tree(s);
293 list_replace_init(&chunk->owners[i].list, &new->owners[j].list); 293 list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
294 } 294 }
295 295
296 list_replace_rcu(&chunk->hash, &new->hash); 296 list_replace_rcu(&chunk->hash, &new->hash);
@@ -373,15 +373,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
373 for (n = 0; n < old->count; n++) { 373 for (n = 0; n < old->count; n++) {
374 if (old->owners[n].owner == tree) { 374 if (old->owners[n].owner == tree) {
375 spin_unlock(&hash_lock); 375 spin_unlock(&hash_lock);
376 put_inotify_watch(watch); 376 put_inotify_watch(&old->watch);
377 return 0; 377 return 0;
378 } 378 }
379 } 379 }
380 spin_unlock(&hash_lock); 380 spin_unlock(&hash_lock);
381 381
382 chunk = alloc_chunk(old->count + 1); 382 chunk = alloc_chunk(old->count + 1);
383 if (!chunk) 383 if (!chunk) {
384 put_inotify_watch(&old->watch);
384 return -ENOMEM; 385 return -ENOMEM;
386 }
385 387
386 mutex_lock(&inode->inotify_mutex); 388 mutex_lock(&inode->inotify_mutex);
387 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 389 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
@@ -425,7 +427,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
425 spin_unlock(&hash_lock); 427 spin_unlock(&hash_lock);
426 inotify_evict_watch(&old->watch); 428 inotify_evict_watch(&old->watch);
427 mutex_unlock(&inode->inotify_mutex); 429 mutex_unlock(&inode->inotify_mutex);
428 put_inotify_watch(&old->watch); 430 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
431 put_inotify_watch(&old->watch); /* and kill it */
429 return 0; 432 return 0;
430} 433}
431 434
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 267e484f0198..fc0f928167e7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -250,7 +250,6 @@ struct audit_context {
250#endif 250#endif
251}; 251};
252 252
253#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
254static inline int open_arg(int flags, int mask) 253static inline int open_arg(int flags, int mask)
255{ 254{
256 int n = ACC_MODE(flags); 255 int n = ACC_MODE(flags);
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 3c5301381837..98a51f26c136 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -12,7 +12,7 @@
12 12
13void foo(void) 13void foo(void)
14{ 14{
15 /* The enum constants to put into include/linux/bounds.h */ 15 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
18 /* End of constants */ 18 /* End of constants */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0249f4be9b5c..1fbcc748044a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2468,7 +2468,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2468 /* make sure l doesn't vanish out from under us */ 2468 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2469 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2470 mutex_unlock(&cgrp->pidlist_mutex);
2471 l->use_count++;
2472 return l; 2471 return l;
2473 } 2472 }
2474 } 2473 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 291ac586f37f..1c8ddd6ee940 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -209,6 +209,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
209 return -ENOMEM; 209 return -ENOMEM;
210 210
211 cpu_hotplug_begin(); 211 cpu_hotplug_begin();
212 set_cpu_active(cpu, false);
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 213 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 214 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 215 if (err == NOTIFY_BAD) {
@@ -280,18 +281,6 @@ int __ref cpu_down(unsigned int cpu)
280 goto out; 281 goto out;
281 } 282 }
282 283
283 set_cpu_active(cpu, false);
284
285 /*
286 * Make sure the all cpus did the reschedule and are not
287 * using stale version of the cpu_active_mask.
288 * This is not strictly necessary becuase stop_machine()
289 * that we run down the line already provides the required
290 * synchronization. But it's really a side effect and we do not
291 * want to depend on the innards of the stop_machine here.
292 */
293 synchronize_sched();
294
295 err = _cpu_down(cpu, 0); 284 err = _cpu_down(cpu, 0);
296 285
297out: 286out:
@@ -382,19 +371,12 @@ int disable_nonboot_cpus(void)
382 return error; 371 return error;
383 cpu_maps_update_begin(); 372 cpu_maps_update_begin();
384 first_cpu = cpumask_first(cpu_online_mask); 373 first_cpu = cpumask_first(cpu_online_mask);
385 /* We take down all of the non-boot CPUs in one shot to avoid races 374 /*
375 * We take down all of the non-boot CPUs in one shot to avoid races
386 * with the userspace trying to use the CPU hotplug at the same time 376 * with the userspace trying to use the CPU hotplug at the same time
387 */ 377 */
388 cpumask_clear(frozen_cpus); 378 cpumask_clear(frozen_cpus);
389 379
390 for_each_online_cpu(cpu) {
391 if (cpu == first_cpu)
392 continue;
393 set_cpu_active(cpu, false);
394 }
395
396 synchronize_sched();
397
398 printk("Disabling non-boot CPUs ...\n"); 380 printk("Disabling non-boot CPUs ...\n");
399 for_each_online_cpu(cpu) { 381 for_each_online_cpu(cpu) {
400 if (cpu == first_cpu) 382 if (cpu == first_cpu)
diff --git a/kernel/exit.c b/kernel/exit.c
index 5962d7ccf243..546774a31a66 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,10 +68,10 @@ static void __unhash_process(struct task_struct *p)
68 detach_pid(p, PIDTYPE_SID); 68 detach_pid(p, PIDTYPE_SID);
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling);
71 __get_cpu_var(process_counts)--; 72 __get_cpu_var(process_counts)--;
72 } 73 }
73 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
74 list_del_init(&p->sibling);
75} 75}
76 76
77/* 77/*
@@ -736,12 +736,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
736/* 736/*
737* Any that need to be release_task'd are put on the @dead list. 737* Any that need to be release_task'd are put on the @dead list.
738 */ 738 */
739static void reparent_thread(struct task_struct *father, struct task_struct *p, 739static void reparent_leader(struct task_struct *father, struct task_struct *p,
740 struct list_head *dead) 740 struct list_head *dead)
741{ 741{
742 if (p->pdeath_signal)
743 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
744
745 list_move_tail(&p->sibling, &p->real_parent->children); 742 list_move_tail(&p->sibling, &p->real_parent->children);
746 743
747 if (task_detached(p)) 744 if (task_detached(p))
@@ -780,12 +777,18 @@ static void forget_original_parent(struct task_struct *father)
780 reaper = find_new_reaper(father); 777 reaper = find_new_reaper(father);
781 778
782 list_for_each_entry_safe(p, n, &father->children, sibling) { 779 list_for_each_entry_safe(p, n, &father->children, sibling) {
783 p->real_parent = reaper; 780 struct task_struct *t = p;
784 if (p->parent == father) { 781 do {
785 BUG_ON(task_ptrace(p)); 782 t->real_parent = reaper;
786 p->parent = p->real_parent; 783 if (t->parent == father) {
787 } 784 BUG_ON(task_ptrace(t));
788 reparent_thread(father, p, &dead_children); 785 t->parent = t->real_parent;
786 }
787 if (t->pdeath_signal)
788 group_send_sig_info(t->pdeath_signal,
789 SEND_SIG_NOINFO, t);
790 } while_each_thread(p, t);
791 reparent_leader(father, p, &dead_children);
789 } 792 }
790 write_unlock_irq(&tasklist_lock); 793 write_unlock_irq(&tasklist_lock);
791 794
@@ -1551,14 +1554,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1551 struct task_struct *p; 1554 struct task_struct *p;
1552 1555
1553 list_for_each_entry(p, &tsk->children, sibling) { 1556 list_for_each_entry(p, &tsk->children, sibling) {
1554 /* 1557 int ret = wait_consider_task(wo, 0, p);
1555 * Do not consider detached threads. 1558 if (ret)
1556 */ 1559 return ret;
1557 if (!task_detached(p)) {
1558 int ret = wait_consider_task(wo, 0, p);
1559 if (ret)
1560 return ret;
1561 }
1562 } 1560 }
1563 1561
1564 return 0; 1562 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 202a0ba63d3c..5b2959b3ffc2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1291,7 +1291,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1291 } 1291 }
1292 1292
1293 if (likely(p->pid)) { 1293 if (likely(p->pid)) {
1294 list_add_tail(&p->sibling, &p->real_parent->children);
1295 tracehook_finish_clone(p, clone_flags, trace); 1294 tracehook_finish_clone(p, clone_flags, trace);
1296 1295
1297 if (thread_group_leader(p)) { 1296 if (thread_group_leader(p)) {
@@ -1303,6 +1302,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1303 p->signal->tty = tty_kref_get(current->signal->tty); 1302 p->signal->tty = tty_kref_get(current->signal->tty);
1304 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1303 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1305 attach_pid(p, PIDTYPE_SID, task_session(current)); 1304 attach_pid(p, PIDTYPE_SID, task_session(current));
1305 list_add_tail(&p->sibling, &p->real_parent->children);
1306 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1306 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1307 __get_cpu_var(process_counts)++; 1307 __get_cpu_var(process_counts)++;
1308 } 1308 }
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index dbcbf6a33a08..50dbd5999588 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,6 +40,7 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/cpu.h>
43#include <linux/smp.h> 44#include <linux/smp.h>
44 45
45#include <linux/hw_breakpoint.h> 46#include <linux/hw_breakpoint.h>
@@ -388,7 +389,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
388 if (!cpu_events) 389 if (!cpu_events)
389 return ERR_PTR(-ENOMEM); 390 return ERR_PTR(-ENOMEM);
390 391
391 for_each_possible_cpu(cpu) { 392 get_online_cpus();
393 for_each_online_cpu(cpu) {
392 pevent = per_cpu_ptr(cpu_events, cpu); 394 pevent = per_cpu_ptr(cpu_events, cpu);
393 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 395 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
394 396
@@ -399,18 +401,20 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
399 goto fail; 401 goto fail;
400 } 402 }
401 } 403 }
404 put_online_cpus();
402 405
403 return cpu_events; 406 return cpu_events;
404 407
405fail: 408fail:
406 for_each_possible_cpu(cpu) { 409 for_each_online_cpu(cpu) {
407 pevent = per_cpu_ptr(cpu_events, cpu); 410 pevent = per_cpu_ptr(cpu_events, cpu);
408 if (IS_ERR(*pevent)) 411 if (IS_ERR(*pevent))
409 break; 412 break;
410 unregister_hw_breakpoint(*pevent); 413 unregister_hw_breakpoint(*pevent);
411 } 414 }
415 put_online_cpus();
416
412 free_percpu(cpu_events); 417 free_percpu(cpu_events);
413 /* return the error if any */
414 return ERR_PTR(err); 418 return ERR_PTR(err);
415} 419}
416EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 420EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 433e9fcc1fc5..a9a93d9ee7a7 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,7 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <linux/utsrelease.h> 24#include <generated/utsrelease.h>
25#include <linux/utsname.h> 25#include <linux/utsname.h>
26#include <linux/numa.h> 26#include <linux/numa.h>
27#include <linux/suspend.h> 27#include <linux/suspend.h>
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 3765ff3c1bbe..e92d519f93b1 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * A simple kernel FIFO implementation. 2 * A generic kernel FIFO implementation.
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
4 * Copyright (C) 2004 Stelian Pop <stelian@popies.net> 5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
5 * 6 *
6 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -25,50 +26,48 @@
25#include <linux/err.h> 26#include <linux/err.h>
26#include <linux/kfifo.h> 27#include <linux/kfifo.h>
27#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/uaccess.h>
30
31static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer,
32 unsigned int size)
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
28 39
29/** 40/**
30 * kfifo_init - allocates a new FIFO using a preallocated buffer 41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
31 * @buffer: the preallocated buffer to be used. 43 * @buffer: the preallocated buffer to be used.
32 * @size: the size of the internal buffer, this have to be a power of 2. 44 * @size: the size of the internal buffer, this have to be a power of 2.
33 * @gfp_mask: get_free_pages mask, passed to kmalloc()
34 * @lock: the lock to be used to protect the fifo buffer
35 * 45 *
36 * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
37 * &struct kfifo with kfree().
38 */ 46 */
39struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 47void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size)
40 gfp_t gfp_mask, spinlock_t *lock)
41{ 48{
42 struct kfifo *fifo;
43
44 /* size must be a power of 2 */ 49 /* size must be a power of 2 */
45 BUG_ON(!is_power_of_2(size)); 50 BUG_ON(!is_power_of_2(size));
46 51
47 fifo = kmalloc(sizeof(struct kfifo), gfp_mask); 52 _kfifo_init(fifo, buffer, size);
48 if (!fifo)
49 return ERR_PTR(-ENOMEM);
50
51 fifo->buffer = buffer;
52 fifo->size = size;
53 fifo->in = fifo->out = 0;
54 fifo->lock = lock;
55
56 return fifo;
57} 53}
58EXPORT_SYMBOL(kfifo_init); 54EXPORT_SYMBOL(kfifo_init);
59 55
60/** 56/**
61 * kfifo_alloc - allocates a new FIFO and its internal buffer 57 * kfifo_alloc - allocates a new FIFO internal buffer
62 * @size: the size of the internal buffer to be allocated. 58 * @fifo: the fifo to assign then new buffer
59 * @size: the size of the buffer to be allocated, this have to be a power of 2.
63 * @gfp_mask: get_free_pages mask, passed to kmalloc() 60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
64 * @lock: the lock to be used to protect the fifo buffer 61 *
62 * This function dynamically allocates a new fifo internal buffer
65 * 63 *
66 * The size will be rounded-up to a power of 2. 64 * The size will be rounded-up to a power of 2.
65 * The buffer will be release with kfifo_free().
66 * Return 0 if no error, otherwise the an error code
67 */ 67 */
68struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) 68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 69{
70 unsigned char *buffer; 70 unsigned char *buffer;
71 struct kfifo *ret;
72 71
73 /* 72 /*
74 * round up to the next power of 2, since our 'let the indices 73 * round up to the next power of 2, since our 'let the indices
@@ -80,48 +79,91 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
80 } 79 }
81 80
82 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
83 if (!buffer) 82 if (!buffer) {
84 return ERR_PTR(-ENOMEM); 83 _kfifo_init(fifo, 0, 0);
85 84 return -ENOMEM;
86 ret = kfifo_init(buffer, size, gfp_mask, lock); 85 }
87 86
88 if (IS_ERR(ret)) 87 _kfifo_init(fifo, buffer, size);
89 kfree(buffer);
90 88
91 return ret; 89 return 0;
92} 90}
93EXPORT_SYMBOL(kfifo_alloc); 91EXPORT_SYMBOL(kfifo_alloc);
94 92
95/** 93/**
96 * kfifo_free - frees the FIFO 94 * kfifo_free - frees the FIFO internal buffer
97 * @fifo: the fifo to be freed. 95 * @fifo: the fifo to be freed.
98 */ 96 */
99void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
100{ 98{
101 kfree(fifo->buffer); 99 kfree(fifo->buffer);
102 kfree(fifo);
103} 100}
104EXPORT_SYMBOL(kfifo_free); 101EXPORT_SYMBOL(kfifo_free);
105 102
106/** 103/**
107 * __kfifo_put - puts some data into the FIFO, no locking version 104 * kfifo_skip - skip output data
108 * @fifo: the fifo to be used. 105 * @fifo: the fifo to be used.
109 * @buffer: the data to be added. 106 * @len: number of bytes to skip
110 * @len: the length of the data to be added.
111 *
112 * This function copies at most @len bytes from the @buffer into
113 * the FIFO depending on the free space, and returns the number of
114 * bytes copied.
115 *
116 * Note that with only one concurrent reader and one concurrent
117 * writer, you don't need extra locking to use these functions.
118 */ 107 */
119unsigned int __kfifo_put(struct kfifo *fifo, 108void kfifo_skip(struct kfifo *fifo, unsigned int len)
120 const unsigned char *buffer, unsigned int len) 109{
110 if (len < kfifo_len(fifo)) {
111 __kfifo_add_out(fifo, len);
112 return;
113 }
114 kfifo_reset_out(fifo);
115}
116EXPORT_SYMBOL(kfifo_skip);
117
118static inline void __kfifo_in_data(struct kfifo *fifo,
119 const void *from, unsigned int len, unsigned int off)
121{ 120{
122 unsigned int l; 121 unsigned int l;
123 122
124 len = min(len, fifo->size - fifo->in + fifo->out); 123 /*
124 * Ensure that we sample the fifo->out index -before- we
125 * start putting bytes into the kfifo.
126 */
127
128 smp_mb();
129
130 off = __kfifo_off(fifo, fifo->in + off);
131
132 /* first put the data starting from fifo->in to buffer end */
133 l = min(len, fifo->size - off);
134 memcpy(fifo->buffer + off, from, l);
135
136 /* then put the rest (if any) at the beginning of the buffer */
137 memcpy(fifo->buffer, from + l, len - l);
138}
139
140static inline void __kfifo_out_data(struct kfifo *fifo,
141 void *to, unsigned int len, unsigned int off)
142{
143 unsigned int l;
144
145 /*
146 * Ensure that we sample the fifo->in index -before- we
147 * start removing bytes from the kfifo.
148 */
149
150 smp_rmb();
151
152 off = __kfifo_off(fifo, fifo->out + off);
153
154 /* first get the data from fifo->out until the end of the buffer */
155 l = min(len, fifo->size - off);
156 memcpy(to, fifo->buffer + off, l);
157
158 /* then get the rest (if any) from the beginning of the buffer */
159 memcpy(to + l, fifo->buffer, len - l);
160}
161
162static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo,
163 const void __user *from, unsigned int len, unsigned int off)
164{
165 unsigned int l;
166 int ret;
125 167
126 /* 168 /*
127 * Ensure that we sample the fifo->out index -before- we 169 * Ensure that we sample the fifo->out index -before- we
@@ -130,68 +172,229 @@ unsigned int __kfifo_put(struct kfifo *fifo,
130 172
131 smp_mb(); 173 smp_mb();
132 174
175 off = __kfifo_off(fifo, fifo->in + off);
176
133 /* first put the data starting from fifo->in to buffer end */ 177 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); 178 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); 179 ret = copy_from_user(fifo->buffer + off, from, l);
180
181 if (unlikely(ret))
182 return ret + len - l;
136 183
137 /* then put the rest (if any) at the beginning of the buffer */ 184 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, buffer + l, len - l); 185 return copy_from_user(fifo->buffer, from + l, len - l);
186}
187
188static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo,
189 void __user *to, unsigned int len, unsigned int off)
190{
191 unsigned int l;
192 int ret;
139 193
140 /* 194 /*
141 * Ensure that we add the bytes to the kfifo -before- 195 * Ensure that we sample the fifo->in index -before- we
142 * we update the fifo->in index. 196 * start removing bytes from the kfifo.
143 */ 197 */
144 198
145 smp_wmb(); 199 smp_rmb();
200
201 off = __kfifo_off(fifo, fifo->out + off);
202
203 /* first get the data from fifo->out until the end of the buffer */
204 l = min(len, fifo->size - off);
205 ret = copy_to_user(to, fifo->buffer + off, l);
206
207 if (unlikely(ret))
208 return ret + len - l;
209
210 /* then get the rest (if any) from the beginning of the buffer */
211 return copy_to_user(to + l, fifo->buffer, len - l);
212}
213
214unsigned int __kfifo_in_n(struct kfifo *fifo,
215 const void *from, unsigned int len, unsigned int recsize)
216{
217 if (kfifo_avail(fifo) < len + recsize)
218 return len + 1;
219
220 __kfifo_in_data(fifo, from, len, recsize);
221 return 0;
222}
223EXPORT_SYMBOL(__kfifo_in_n);
146 224
147 fifo->in += len; 225/**
226 * kfifo_in - puts some data into the FIFO
227 * @fifo: the fifo to be used.
228 * @from: the data to be added.
229 * @len: the length of the data to be added.
230 *
231 * This function copies at most @len bytes from the @from buffer into
232 * the FIFO depending on the free space, and returns the number of
233 * bytes copied.
234 *
235 * Note that with only one concurrent reader and one concurrent
236 * writer, you don't need extra locking to use these functions.
237 */
238unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from,
239 unsigned int len)
240{
241 len = min(kfifo_avail(fifo), len);
148 242
243 __kfifo_in_data(fifo, from, len, 0);
244 __kfifo_add_in(fifo, len);
149 return len; 245 return len;
150} 246}
151EXPORT_SYMBOL(__kfifo_put); 247EXPORT_SYMBOL(kfifo_in);
248
249unsigned int __kfifo_in_generic(struct kfifo *fifo,
250 const void *from, unsigned int len, unsigned int recsize)
251{
252 return __kfifo_in_rec(fifo, from, len, recsize);
253}
254EXPORT_SYMBOL(__kfifo_in_generic);
255
256unsigned int __kfifo_out_n(struct kfifo *fifo,
257 void *to, unsigned int len, unsigned int recsize)
258{
259 if (kfifo_len(fifo) < len + recsize)
260 return len;
261
262 __kfifo_out_data(fifo, to, len, recsize);
263 __kfifo_add_out(fifo, len + recsize);
264 return 0;
265}
266EXPORT_SYMBOL(__kfifo_out_n);
152 267
153/** 268/**
154 * __kfifo_get - gets some data from the FIFO, no locking version 269 * kfifo_out - gets some data from the FIFO
155 * @fifo: the fifo to be used. 270 * @fifo: the fifo to be used.
156 * @buffer: where the data must be copied. 271 * @to: where the data must be copied.
157 * @len: the size of the destination buffer. 272 * @len: the size of the destination buffer.
158 * 273 *
159 * This function copies at most @len bytes from the FIFO into the 274 * This function copies at most @len bytes from the FIFO into the
160 * @buffer and returns the number of copied bytes. 275 * @to buffer and returns the number of copied bytes.
161 * 276 *
162 * Note that with only one concurrent reader and one concurrent 277 * Note that with only one concurrent reader and one concurrent
163 * writer, you don't need extra locking to use these functions. 278 * writer, you don't need extra locking to use these functions.
164 */ 279 */
165unsigned int __kfifo_get(struct kfifo *fifo, 280unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len)
166 unsigned char *buffer, unsigned int len)
167{ 281{
168 unsigned int l; 282 len = min(kfifo_len(fifo), len);
169 283
170 len = min(len, fifo->in - fifo->out); 284 __kfifo_out_data(fifo, to, len, 0);
285 __kfifo_add_out(fifo, len);
171 286
172 /* 287 return len;
173 * Ensure that we sample the fifo->in index -before- we 288}
174 * start removing bytes from the kfifo. 289EXPORT_SYMBOL(kfifo_out);
175 */
176 290
177 smp_rmb(); 291unsigned int __kfifo_out_generic(struct kfifo *fifo,
292 void *to, unsigned int len, unsigned int recsize,
293 unsigned int *total)
294{
295 return __kfifo_out_rec(fifo, to, len, recsize, total);
296}
297EXPORT_SYMBOL(__kfifo_out_generic);
178 298
179 /* first get the data from fifo->out until the end of the buffer */ 299unsigned int __kfifo_from_user_n(struct kfifo *fifo,
180 l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); 300 const void __user *from, unsigned int len, unsigned int recsize)
181 memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); 301{
302 if (kfifo_avail(fifo) < len + recsize)
303 return len + 1;
182 304
183 /* then get the rest (if any) from the beginning of the buffer */ 305 return __kfifo_from_user_data(fifo, from, len, recsize);
184 memcpy(buffer + l, fifo->buffer, len - l); 306}
307EXPORT_SYMBOL(__kfifo_from_user_n);
185 308
186 /* 309/**
187 * Ensure that we remove the bytes from the kfifo -before- 310 * kfifo_from_user - puts some data from user space into the FIFO
188 * we update the fifo->out index. 311 * @fifo: the fifo to be used.
189 */ 312 * @from: pointer to the data to be added.
313 * @len: the length of the data to be added.
314 *
315 * This function copies at most @len bytes from the @from into the
316 * FIFO depending and returns the number of copied bytes.
317 *
318 * Note that with only one concurrent reader and one concurrent
319 * writer, you don't need extra locking to use these functions.
320 */
321unsigned int kfifo_from_user(struct kfifo *fifo,
322 const void __user *from, unsigned int len)
323{
324 len = min(kfifo_avail(fifo), len);
325 len -= __kfifo_from_user_data(fifo, from, len, 0);
326 __kfifo_add_in(fifo, len);
327 return len;
328}
329EXPORT_SYMBOL(kfifo_from_user);
190 330
191 smp_mb(); 331unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
332 const void __user *from, unsigned int len, unsigned int recsize)
333{
334 return __kfifo_from_user_rec(fifo, from, len, recsize);
335}
336EXPORT_SYMBOL(__kfifo_from_user_generic);
192 337
193 fifo->out += len; 338unsigned int __kfifo_to_user_n(struct kfifo *fifo,
339 void __user *to, unsigned int len, unsigned int reclen,
340 unsigned int recsize)
341{
342 unsigned int ret;
343
344 if (kfifo_len(fifo) < reclen + recsize)
345 return len;
346
347 ret = __kfifo_to_user_data(fifo, to, reclen, recsize);
194 348
349 if (likely(ret == 0))
350 __kfifo_add_out(fifo, reclen + recsize);
351
352 return ret;
353}
354EXPORT_SYMBOL(__kfifo_to_user_n);
355
356/**
357 * kfifo_to_user - gets data from the FIFO and write it to user space
358 * @fifo: the fifo to be used.
359 * @to: where the data must be copied.
360 * @len: the size of the destination buffer.
361 *
362 * This function copies at most @len bytes from the FIFO into the
363 * @to buffer and returns the number of copied bytes.
364 *
365 * Note that with only one concurrent reader and one concurrent
366 * writer, you don't need extra locking to use these functions.
367 */
368unsigned int kfifo_to_user(struct kfifo *fifo,
369 void __user *to, unsigned int len)
370{
371 len = min(kfifo_len(fifo), len);
372 len -= __kfifo_to_user_data(fifo, to, len, 0);
373 __kfifo_add_out(fifo, len);
195 return len; 374 return len;
196} 375}
197EXPORT_SYMBOL(__kfifo_get); 376EXPORT_SYMBOL(kfifo_to_user);
377
378unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
379 void __user *to, unsigned int len, unsigned int recsize,
380 unsigned int *total)
381{
382 return __kfifo_to_user_rec(fifo, to, len, recsize, total);
383}
384EXPORT_SYMBOL(__kfifo_to_user_generic);
385
386unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize)
387{
388 if (recsize == 0)
389 return kfifo_avail(fifo);
390
391 return __kfifo_peek_n(fifo, recsize);
392}
393EXPORT_SYMBOL(__kfifo_peek_generic);
394
395void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize)
396{
397 __kfifo_skip_rec(fifo, recsize);
398}
399EXPORT_SYMBOL(__kfifo_skip_generic);
400
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 25b103190364..bf0e231d9702 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
520 return -ENOMEM; 520 return -ENOMEM;
521 521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp); 522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) 523 if (ret < 0) {
524 goto out; 524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
525 527
526 return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); 528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
527 531
528 out:
529 call_usermodehelper_freeinfo(sub_info);
530 return ret; 532 return ret;
531} 533}
532EXPORT_SYMBOL(call_usermodehelper_pipe); 534EXPORT_SYMBOL(call_usermodehelper_pipe);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5342a344c43..b7df302a0204 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1035,7 +1035,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1035 /* Pre-allocate memory for max kretprobe instances */ 1035 /* Pre-allocate memory for max kretprobe instances */
1036 if (rp->maxactive <= 0) { 1036 if (rp->maxactive <= 0) {
1037#ifdef CONFIG_PREEMPT 1037#ifdef CONFIG_PREEMPT
1038 rp->maxactive = max(10, 2 * num_possible_cpus()); 1038 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
1039#else 1039#else
1040 rp->maxactive = num_possible_cpus(); 1040 rp->maxactive = num_possible_cpus();
1041#endif 1041#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ab7ae57773e1..fbb6222fe7e0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -150,6 +150,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
150EXPORT_SYMBOL(kthread_create); 150EXPORT_SYMBOL(kthread_create);
151 151
152/** 152/**
153 * kthread_bind - bind a just-created kthread to a cpu.
154 * @p: thread created by kthread_create().
155 * @cpu: cpu (might not be online, must be possible) for @k to run on.
156 *
157 * Description: This function is equivalent to set_cpus_allowed(),
158 * except that @cpu doesn't need to be online, and the thread must be
159 * stopped (i.e., just returned from kthread_create()).
160 */
161void kthread_bind(struct task_struct *p, unsigned int cpu)
162{
163 /* Must have done schedule() in kthread() before we set_task_cpu */
164 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
165 WARN_ON(1);
166 return;
167 }
168
169 p->cpus_allowed = cpumask_of_cpu(cpu);
170 p->rt.nr_cpus_allowed = 1;
171 p->flags |= PF_THREAD_BOUND;
172}
173EXPORT_SYMBOL(kthread_bind);
174
175/**
153 * kthread_stop - stop a thread created by kthread_create(). 176 * kthread_stop - stop a thread created by kthread_create().
154 * @k: thread created by kthread_create(). 177 * @k: thread created by kthread_create().
155 * 178 *
diff --git a/kernel/module.c b/kernel/module.c
index a65dc787a27b..f82386bd9ee9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1010,6 +1010,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1010 * J. Corbet <corbet@lwn.net> 1010 * J. Corbet <corbet@lwn.net>
1011 */ 1011 */
1012#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1012#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1013
1014static inline bool sect_empty(const Elf_Shdr *sect)
1015{
1016 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1017}
1018
1013struct module_sect_attr 1019struct module_sect_attr
1014{ 1020{
1015 struct module_attribute mattr; 1021 struct module_attribute mattr;
@@ -1051,8 +1057,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1051 1057
1052 /* Count loaded sections and allocate structures */ 1058 /* Count loaded sections and allocate structures */
1053 for (i = 0; i < nsect; i++) 1059 for (i = 0; i < nsect; i++)
1054 if (sechdrs[i].sh_flags & SHF_ALLOC 1060 if (!sect_empty(&sechdrs[i]))
1055 && sechdrs[i].sh_size)
1056 nloaded++; 1061 nloaded++;
1057 size[0] = ALIGN(sizeof(*sect_attrs) 1062 size[0] = ALIGN(sizeof(*sect_attrs)
1058 + nloaded * sizeof(sect_attrs->attrs[0]), 1063 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1070,9 +1075,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1070 sattr = &sect_attrs->attrs[0]; 1075 sattr = &sect_attrs->attrs[0];
1071 gattr = &sect_attrs->grp.attrs[0]; 1076 gattr = &sect_attrs->grp.attrs[0];
1072 for (i = 0; i < nsect; i++) { 1077 for (i = 0; i < nsect; i++) {
1073 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1078 if (sect_empty(&sechdrs[i]))
1074 continue;
1075 if (!sechdrs[i].sh_size)
1076 continue; 1079 continue;
1077 sattr->address = sechdrs[i].sh_addr; 1080 sattr->address = sechdrs[i].sh_addr;
1078 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1081 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
@@ -1156,7 +1159,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1156 /* Count notes sections and allocate structures. */ 1159 /* Count notes sections and allocate structures. */
1157 notes = 0; 1160 notes = 0;
1158 for (i = 0; i < nsect; i++) 1161 for (i = 0; i < nsect; i++)
1159 if ((sechdrs[i].sh_flags & SHF_ALLOC) && 1162 if (!sect_empty(&sechdrs[i]) &&
1160 (sechdrs[i].sh_type == SHT_NOTE)) 1163 (sechdrs[i].sh_type == SHT_NOTE))
1161 ++notes; 1164 ++notes;
1162 1165
@@ -1172,7 +1175,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1172 notes_attrs->notes = notes; 1175 notes_attrs->notes = notes;
1173 nattr = &notes_attrs->attrs[0]; 1176 nattr = &notes_attrs->attrs[0];
1174 for (loaded = i = 0; i < nsect; ++i) { 1177 for (loaded = i = 0; i < nsect; ++i) {
1175 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1178 if (sect_empty(&sechdrs[i]))
1176 continue; 1179 continue;
1177 if (sechdrs[i].sh_type == SHT_NOTE) { 1180 if (sechdrs[i].sh_type == SHT_NOTE) {
1178 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
@@ -1910,9 +1913,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1910 unsigned int i; 1913 unsigned int i;
1911 1914
1912 /* only scan the sections containing data */ 1915 /* only scan the sections containing data */
1913 kmemleak_scan_area(mod->module_core, (unsigned long)mod - 1916 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
1914 (unsigned long)mod->module_core,
1915 sizeof(struct module), GFP_KERNEL);
1916 1917
1917 for (i = 1; i < hdr->e_shnum; i++) { 1918 for (i = 1; i < hdr->e_shnum; i++) {
1918 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1919 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -1921,8 +1922,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1921 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) 1922 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
1922 continue; 1923 continue;
1923 1924
1924 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - 1925 kmemleak_scan_area((void *)sechdrs[i].sh_addr,
1925 (unsigned long)mod->module_core,
1926 sechdrs[i].sh_size, GFP_KERNEL); 1926 sechdrs[i].sh_size, GFP_KERNEL);
1927 } 1927 }
1928} 1928}
@@ -2250,6 +2250,12 @@ static noinline struct module *load_module(void __user *umod,
2250 "_ftrace_events", 2250 "_ftrace_events",
2251 sizeof(*mod->trace_events), 2251 sizeof(*mod->trace_events),
2252 &mod->num_trace_events); 2252 &mod->num_trace_events);
2253 /*
2254 * This section contains pointers to allocated objects in the trace
2255 * code and not scanning it leads to false positives.
2256 */
2257 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2258 mod->num_trace_events, GFP_KERNEL);
2253#endif 2259#endif
2254#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2260#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2255 /* sechdrs[0].sh_size is always zero */ 2261 /* sechdrs[0].sh_size is always zero */
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 5b987b4a98a8..27f69a04541d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1618,7 +1618,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1618 * offline CPU and activate it when the CPU comes up, but 1618 * offline CPU and activate it when the CPU comes up, but
1619 * that's for later. 1619 * that's for later.
1620 */ 1620 */
1621 if (!cpu_isset(cpu, cpu_online_map)) 1621 if (!cpu_online(cpu))
1622 return ERR_PTR(-ENODEV); 1622 return ERR_PTR(-ENODEV);
1623 1623
1624 cpuctx = &per_cpu(perf_cpu_context, cpu); 1624 cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -4725,7 +4725,7 @@ SYSCALL_DEFINE5(perf_event_open,
4725 if (IS_ERR(event)) 4725 if (IS_ERR(event))
4726 goto err_put_context; 4726 goto err_put_context;
4727 4727
4728 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); 4728 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4729 if (err < 0) 4729 if (err < 0)
4730 goto err_free_put_context; 4730 goto err_free_put_context;
4731 4731
@@ -5149,7 +5149,7 @@ int perf_event_init_task(struct task_struct *child)
5149 GFP_KERNEL); 5149 GFP_KERNEL);
5150 if (!child_ctx) { 5150 if (!child_ctx) {
5151 ret = -ENOMEM; 5151 ret = -ENOMEM;
5152 goto exit; 5152 break;
5153 } 5153 }
5154 5154
5155 __perf_event_init_context(child_ctx, child); 5155 __perf_event_init_context(child_ctx, child);
@@ -5165,7 +5165,7 @@ int perf_event_init_task(struct task_struct *child)
5165 } 5165 }
5166 } 5166 }
5167 5167
5168 if (inherited_all) { 5168 if (child_ctx && inherited_all) {
5169 /* 5169 /*
5170 * Mark the child context as a clone of the parent 5170 * Mark the child context as a clone of the parent
5171 * context, or of whatever the parent is a clone of. 5171 * context, or of whatever the parent is a clone of.
@@ -5185,7 +5185,6 @@ int perf_event_init_task(struct task_struct *child)
5185 get_ctx(child_ctx->parent_ctx); 5185 get_ctx(child_ctx->parent_ctx);
5186 } 5186 }
5187 5187
5188exit:
5189 mutex_unlock(&parent_ctx->mutex); 5188 mutex_unlock(&parent_ctx->mutex);
5190 5189
5191 perf_unpin_context(parent_ctx); 5190 perf_unpin_context(parent_ctx);
diff --git a/kernel/printk.c b/kernel/printk.c
index 1ded8e7dd19b..17463ca2e229 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1412,7 +1412,7 @@ static LIST_HEAD(dump_list);
1412 1412
1413/** 1413/**
1414 * kmsg_dump_register - register a kernel log dumper. 1414 * kmsg_dump_register - register a kernel log dumper.
1415 * @dump: pointer to the kmsg_dumper structure 1415 * @dumper: pointer to the kmsg_dumper structure
1416 * 1416 *
1417 * Adds a kernel log dumper to the system. The dump callback in the 1417 * Adds a kernel log dumper to the system. The dump callback in the
1418 * structure will be called when the kernel oopses or panics and must be 1418 * structure will be called when the kernel oopses or panics and must be
@@ -1442,7 +1442,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_register);
1442 1442
1443/** 1443/**
1444 * kmsg_dump_unregister - unregister a kmsg dumper. 1444 * kmsg_dump_unregister - unregister a kmsg dumper.
1445 * @dump: pointer to the kmsg_dumper structure 1445 * @dumper: pointer to the kmsg_dumper structure
1446 * 1446 *
1447 * Removes a dump device from the system. Returns zero on success and 1447 * Removes a dump device from the system. Returns zero on success and
1448 * %-EINVAL otherwise. 1448 * %-EINVAL otherwise.
diff --git a/kernel/resource.c b/kernel/resource.c
index dc15686b7a77..af96c1e4b54b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -308,37 +308,37 @@ static int find_resource(struct resource *root, struct resource *new,
308 void *alignf_data) 308 void *alignf_data)
309{ 309{
310 struct resource *this = root->child; 310 struct resource *this = root->child;
311 resource_size_t start, end; 311 struct resource tmp = *new;
312 312
313 start = root->start; 313 tmp.start = root->start;
314 /* 314 /*
315 * Skip past an allocated resource that starts at 0, since the assignment 315 * Skip past an allocated resource that starts at 0, since the assignment
316 * of this->start - 1 to new->end below would cause an underflow. 316 * of this->start - 1 to tmp->end below would cause an underflow.
317 */ 317 */
318 if (this && this->start == 0) { 318 if (this && this->start == 0) {
319 start = this->end + 1; 319 tmp.start = this->end + 1;
320 this = this->sibling; 320 this = this->sibling;
321 } 321 }
322 for(;;) { 322 for(;;) {
323 if (this) 323 if (this)
324 end = this->start - 1; 324 tmp.end = this->start - 1;
325 else 325 else
326 end = root->end; 326 tmp.end = root->end;
327 if (start < min) 327 if (tmp.start < min)
328 start = min; 328 tmp.start = min;
329 if (end > max) 329 if (tmp.end > max)
330 end = max; 330 tmp.end = max;
331 start = ALIGN(start, align); 331 tmp.start = ALIGN(tmp.start, align);
332 if (alignf) 332 if (alignf)
333 alignf(alignf_data, new, size, align); 333 alignf(alignf_data, &tmp, size, align);
334 if (start < end && end - start >= size - 1) { 334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
335 new->start = start; 335 new->start = tmp.start;
336 new->end = start + size - 1; 336 new->end = tmp.start + size - 1;
337 return 0; 337 return 0;
338 } 338 }
339 if (!this) 339 if (!this)
340 break; 340 break;
341 start = this->end + 1; 341 tmp.start = this->end + 1;
342 this = this->sibling; 342 this = this->sibling;
343 } 343 }
344 return -EBUSY; 344 return -EBUSY;
diff --git a/kernel/sched.c b/kernel/sched.c
index d6527ac0f6e7..e507af086b42 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2002,39 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2002 p->sched_class->prio_changed(rq, p, oldprio, running); 2002 p->sched_class->prio_changed(rq, p, oldprio, running);
2003} 2003}
2004 2004
2005/**
2006 * kthread_bind - bind a just-created kthread to a cpu.
2007 * @p: thread created by kthread_create().
2008 * @cpu: cpu (might not be online, must be possible) for @k to run on.
2009 *
2010 * Description: This function is equivalent to set_cpus_allowed(),
2011 * except that @cpu doesn't need to be online, and the thread must be
2012 * stopped (i.e., just returned from kthread_create()).
2013 *
2014 * Function lives here instead of kthread.c because it messes with
2015 * scheduler internals which require locking.
2016 */
2017void kthread_bind(struct task_struct *p, unsigned int cpu)
2018{
2019 struct rq *rq = cpu_rq(cpu);
2020 unsigned long flags;
2021
2022 /* Must have done schedule() in kthread() before we set_task_cpu */
2023 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2024 WARN_ON(1);
2025 return;
2026 }
2027
2028 raw_spin_lock_irqsave(&rq->lock, flags);
2029 update_rq_clock(rq);
2030 set_task_cpu(p, cpu);
2031 p->cpus_allowed = cpumask_of_cpu(cpu);
2032 p->rt.nr_cpus_allowed = 1;
2033 p->flags |= PF_THREAD_BOUND;
2034 raw_spin_unlock_irqrestore(&rq->lock, flags);
2035}
2036EXPORT_SYMBOL(kthread_bind);
2037
2038#ifdef CONFIG_SMP 2005#ifdef CONFIG_SMP
2039/* 2006/*
2040 * Is this task likely cache-hot: 2007 * Is this task likely cache-hot:
@@ -2044,6 +2011,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2044{ 2011{
2045 s64 delta; 2012 s64 delta;
2046 2013
2014 if (p->sched_class != &fair_sched_class)
2015 return 0;
2016
2047 /* 2017 /*
2048 * Buddy candidates are cache hot: 2018 * Buddy candidates are cache hot:
2049 */ 2019 */
@@ -2052,9 +2022,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2052 &p->se == cfs_rq_of(&p->se)->last)) 2022 &p->se == cfs_rq_of(&p->se)->last))
2053 return 1; 2023 return 1;
2054 2024
2055 if (p->sched_class != &fair_sched_class)
2056 return 0;
2057
2058 if (sysctl_sched_migration_cost == -1) 2025 if (sysctl_sched_migration_cost == -1)
2059 return 1; 2026 return 1;
2060 if (sysctl_sched_migration_cost == 0) 2027 if (sysctl_sched_migration_cost == 0)
@@ -2065,22 +2032,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2065 return delta < (s64)sysctl_sched_migration_cost; 2032 return delta < (s64)sysctl_sched_migration_cost;
2066} 2033}
2067 2034
2068
2069void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2035void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2070{ 2036{
2071 int old_cpu = task_cpu(p); 2037#ifdef CONFIG_SCHED_DEBUG
2072 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2038 /*
2073 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2039 * We should never call set_task_cpu() on a blocked task,
2040 * ttwu() will sort out the placement.
2041 */
2042 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2043 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2044#endif
2074 2045
2075 trace_sched_migrate_task(p, new_cpu); 2046 trace_sched_migrate_task(p, new_cpu);
2076 2047
2077 if (old_cpu != new_cpu) { 2048 if (task_cpu(p) != new_cpu) {
2078 p->se.nr_migrations++; 2049 p->se.nr_migrations++;
2079 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2050 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2080 1, 1, NULL, 0);
2081 } 2051 }
2082 p->se.vruntime -= old_cfsrq->min_vruntime -
2083 new_cfsrq->min_vruntime;
2084 2052
2085 __set_task_cpu(p, new_cpu); 2053 __set_task_cpu(p, new_cpu);
2086} 2054}
@@ -2105,13 +2073,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2105 2073
2106 /* 2074 /*
2107 * If the task is not on a runqueue (and not running), then 2075 * If the task is not on a runqueue (and not running), then
2108 * it is sufficient to simply update the task's cpu field. 2076 * the next wake-up will properly place the task.
2109 */ 2077 */
2110 if (!p->se.on_rq && !task_running(rq, p)) { 2078 if (!p->se.on_rq && !task_running(rq, p))
2111 update_rq_clock(rq);
2112 set_task_cpu(p, dest_cpu);
2113 return 0; 2079 return 0;
2114 }
2115 2080
2116 init_completion(&req->done); 2081 init_completion(&req->done);
2117 req->task = p; 2082 req->task = p;
@@ -2317,10 +2282,73 @@ void task_oncpu_function_call(struct task_struct *p,
2317} 2282}
2318 2283
2319#ifdef CONFIG_SMP 2284#ifdef CONFIG_SMP
2285static int select_fallback_rq(int cpu, struct task_struct *p)
2286{
2287 int dest_cpu;
2288 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2289
2290 /* Look for allowed, online CPU in same node. */
2291 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2292 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2293 return dest_cpu;
2294
2295 /* Any allowed, online CPU? */
2296 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2297 if (dest_cpu < nr_cpu_ids)
2298 return dest_cpu;
2299
2300 /* No more Mr. Nice Guy. */
2301 if (dest_cpu >= nr_cpu_ids) {
2302 rcu_read_lock();
2303 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2304 rcu_read_unlock();
2305 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2306
2307 /*
2308 * Don't tell them about moving exiting tasks or
2309 * kernel threads (both mm NULL), since they never
2310 * leave kernel.
2311 */
2312 if (p->mm && printk_ratelimit()) {
2313 printk(KERN_INFO "process %d (%s) no "
2314 "longer affine to cpu%d\n",
2315 task_pid_nr(p), p->comm, cpu);
2316 }
2317 }
2318
2319 return dest_cpu;
2320}
2321
2322/*
2323 * Called from:
2324 *
2325 * - fork, @p is stable because it isn't on the tasklist yet
2326 *
2327 * - exec, @p is unstable, retry loop
2328 *
2329 * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
2330 * we should be good.
2331 */
2320static inline 2332static inline
2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2333int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2322{ 2334{
2323 return p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2335 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2336
2337 /*
2338 * In order not to call set_task_cpu() on a blocking task we need
2339 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2340 * cpu.
2341 *
2342 * Since this is common to all placement strategies, this lives here.
2343 *
2344 * [ this allows ->select_task() to simply return task_cpu(p) and
2345 * not worry about this generic constraint ]
2346 */
2347 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2348 !cpu_online(cpu)))
2349 cpu = select_fallback_rq(task_cpu(p), p);
2350
2351 return cpu;
2324} 2352}
2325#endif 2353#endif
2326 2354
@@ -2375,6 +2403,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2375 if (task_contributes_to_load(p)) 2403 if (task_contributes_to_load(p))
2376 rq->nr_uninterruptible--; 2404 rq->nr_uninterruptible--;
2377 p->state = TASK_WAKING; 2405 p->state = TASK_WAKING;
2406
2407 if (p->sched_class->task_waking)
2408 p->sched_class->task_waking(rq, p);
2409
2378 __task_rq_unlock(rq); 2410 __task_rq_unlock(rq);
2379 2411
2380 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2412 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
@@ -2438,8 +2470,8 @@ out_running:
2438 2470
2439 p->state = TASK_RUNNING; 2471 p->state = TASK_RUNNING;
2440#ifdef CONFIG_SMP 2472#ifdef CONFIG_SMP
2441 if (p->sched_class->task_wake_up) 2473 if (p->sched_class->task_woken)
2442 p->sched_class->task_wake_up(rq, p); 2474 p->sched_class->task_woken(rq, p);
2443 2475
2444 if (unlikely(rq->idle_stamp)) { 2476 if (unlikely(rq->idle_stamp)) {
2445 u64 delta = rq->clock - rq->idle_stamp; 2477 u64 delta = rq->clock - rq->idle_stamp;
@@ -2538,14 +2570,6 @@ static void __sched_fork(struct task_struct *p)
2538#ifdef CONFIG_PREEMPT_NOTIFIERS 2570#ifdef CONFIG_PREEMPT_NOTIFIERS
2539 INIT_HLIST_HEAD(&p->preempt_notifiers); 2571 INIT_HLIST_HEAD(&p->preempt_notifiers);
2540#endif 2572#endif
2541
2542 /*
2543 * We mark the process as running here, but have not actually
2544 * inserted it onto the runqueue yet. This guarantees that
2545 * nobody will actually run it, and a signal or other external
2546 * event cannot wake it up and insert it on the runqueue either.
2547 */
2548 p->state = TASK_RUNNING;
2549} 2573}
2550 2574
2551/* 2575/*
@@ -2556,6 +2580,12 @@ void sched_fork(struct task_struct *p, int clone_flags)
2556 int cpu = get_cpu(); 2580 int cpu = get_cpu();
2557 2581
2558 __sched_fork(p); 2582 __sched_fork(p);
2583 /*
2584 * We mark the process as waking here. This guarantees that
2585 * nobody will actually run it, and a signal or other external
2586 * event cannot wake it up and insert it on the runqueue either.
2587 */
2588 p->state = TASK_WAKING;
2559 2589
2560 /* 2590 /*
2561 * Revert to default priority/policy on fork if requested. 2591 * Revert to default priority/policy on fork if requested.
@@ -2624,14 +2654,15 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2624 struct rq *rq; 2654 struct rq *rq;
2625 2655
2626 rq = task_rq_lock(p, &flags); 2656 rq = task_rq_lock(p, &flags);
2627 BUG_ON(p->state != TASK_RUNNING); 2657 BUG_ON(p->state != TASK_WAKING);
2658 p->state = TASK_RUNNING;
2628 update_rq_clock(rq); 2659 update_rq_clock(rq);
2629 activate_task(rq, p, 0); 2660 activate_task(rq, p, 0);
2630 trace_sched_wakeup_new(rq, p, 1); 2661 trace_sched_wakeup_new(rq, p, 1);
2631 check_preempt_curr(rq, p, WF_FORK); 2662 check_preempt_curr(rq, p, WF_FORK);
2632#ifdef CONFIG_SMP 2663#ifdef CONFIG_SMP
2633 if (p->sched_class->task_wake_up) 2664 if (p->sched_class->task_woken)
2634 p->sched_class->task_wake_up(rq, p); 2665 p->sched_class->task_woken(rq, p);
2635#endif 2666#endif
2636 task_rq_unlock(rq, &flags); 2667 task_rq_unlock(rq, &flags);
2637} 2668}
@@ -3101,21 +3132,36 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3101} 3132}
3102 3133
3103/* 3134/*
3104 * If dest_cpu is allowed for this process, migrate the task to it. 3135 * sched_exec - execve() is a valuable balancing opportunity, because at
3105 * This is accomplished by forcing the cpu_allowed mask to only 3136 * this point the task has the smallest effective memory and cache footprint.
3106 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3107 * the cpu_allowed mask is restored.
3108 */ 3137 */
3109static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3138void sched_exec(void)
3110{ 3139{
3140 struct task_struct *p = current;
3111 struct migration_req req; 3141 struct migration_req req;
3142 int dest_cpu, this_cpu;
3112 unsigned long flags; 3143 unsigned long flags;
3113 struct rq *rq; 3144 struct rq *rq;
3114 3145
3146again:
3147 this_cpu = get_cpu();
3148 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3149 if (dest_cpu == this_cpu) {
3150 put_cpu();
3151 return;
3152 }
3153
3115 rq = task_rq_lock(p, &flags); 3154 rq = task_rq_lock(p, &flags);
3155 put_cpu();
3156
3157 /*
3158 * select_task_rq() can race against ->cpus_allowed
3159 */
3116 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3160 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3117 || unlikely(!cpu_active(dest_cpu))) 3161 || unlikely(!cpu_active(dest_cpu))) {
3118 goto out; 3162 task_rq_unlock(rq, &flags);
3163 goto again;
3164 }
3119 3165
3120 /* force the process onto the specified CPU */ 3166 /* force the process onto the specified CPU */
3121 if (migrate_task(p, dest_cpu, &req)) { 3167 if (migrate_task(p, dest_cpu, &req)) {
@@ -3130,24 +3176,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3130 3176
3131 return; 3177 return;
3132 } 3178 }
3133out:
3134 task_rq_unlock(rq, &flags); 3179 task_rq_unlock(rq, &flags);
3135} 3180}
3136 3181
3137/* 3182/*
3138 * sched_exec - execve() is a valuable balancing opportunity, because at
3139 * this point the task has the smallest effective memory and cache footprint.
3140 */
3141void sched_exec(void)
3142{
3143 int new_cpu, this_cpu = get_cpu();
3144 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
3145 put_cpu();
3146 if (new_cpu != this_cpu)
3147 sched_migrate_task(current, new_cpu);
3148}
3149
3150/*
3151 * pull_task - move a task from a remote runqueue to the local runqueue. 3183 * pull_task - move a task from a remote runqueue to the local runqueue.
3152 * Both runqueues must be locked. 3184 * Both runqueues must be locked.
3153 */ 3185 */
@@ -5911,14 +5943,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
5911 */ 5943 */
5912bool try_wait_for_completion(struct completion *x) 5944bool try_wait_for_completion(struct completion *x)
5913{ 5945{
5946 unsigned long flags;
5914 int ret = 1; 5947 int ret = 1;
5915 5948
5916 spin_lock_irq(&x->wait.lock); 5949 spin_lock_irqsave(&x->wait.lock, flags);
5917 if (!x->done) 5950 if (!x->done)
5918 ret = 0; 5951 ret = 0;
5919 else 5952 else
5920 x->done--; 5953 x->done--;
5921 spin_unlock_irq(&x->wait.lock); 5954 spin_unlock_irqrestore(&x->wait.lock, flags);
5922 return ret; 5955 return ret;
5923} 5956}
5924EXPORT_SYMBOL(try_wait_for_completion); 5957EXPORT_SYMBOL(try_wait_for_completion);
@@ -5933,12 +5966,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
5933 */ 5966 */
5934bool completion_done(struct completion *x) 5967bool completion_done(struct completion *x)
5935{ 5968{
5969 unsigned long flags;
5936 int ret = 1; 5970 int ret = 1;
5937 5971
5938 spin_lock_irq(&x->wait.lock); 5972 spin_lock_irqsave(&x->wait.lock, flags);
5939 if (!x->done) 5973 if (!x->done)
5940 ret = 0; 5974 ret = 0;
5941 spin_unlock_irq(&x->wait.lock); 5975 spin_unlock_irqrestore(&x->wait.lock, flags);
5942 return ret; 5976 return ret;
5943} 5977}
5944EXPORT_SYMBOL(completion_done); 5978EXPORT_SYMBOL(completion_done);
@@ -6457,7 +6491,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6457 return -EINVAL; 6491 return -EINVAL;
6458 6492
6459 retval = -ESRCH; 6493 retval = -ESRCH;
6460 read_lock(&tasklist_lock); 6494 rcu_read_lock();
6461 p = find_process_by_pid(pid); 6495 p = find_process_by_pid(pid);
6462 if (p) { 6496 if (p) {
6463 retval = security_task_getscheduler(p); 6497 retval = security_task_getscheduler(p);
@@ -6465,7 +6499,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6465 retval = p->policy 6499 retval = p->policy
6466 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 6500 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6467 } 6501 }
6468 read_unlock(&tasklist_lock); 6502 rcu_read_unlock();
6469 return retval; 6503 return retval;
6470} 6504}
6471 6505
@@ -6483,7 +6517,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6483 if (!param || pid < 0) 6517 if (!param || pid < 0)
6484 return -EINVAL; 6518 return -EINVAL;
6485 6519
6486 read_lock(&tasklist_lock); 6520 rcu_read_lock();
6487 p = find_process_by_pid(pid); 6521 p = find_process_by_pid(pid);
6488 retval = -ESRCH; 6522 retval = -ESRCH;
6489 if (!p) 6523 if (!p)
@@ -6494,7 +6528,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6494 goto out_unlock; 6528 goto out_unlock;
6495 6529
6496 lp.sched_priority = p->rt_priority; 6530 lp.sched_priority = p->rt_priority;
6497 read_unlock(&tasklist_lock); 6531 rcu_read_unlock();
6498 6532
6499 /* 6533 /*
6500 * This one might sleep, we cannot do it with a spinlock held ... 6534 * This one might sleep, we cannot do it with a spinlock held ...
@@ -6504,7 +6538,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6504 return retval; 6538 return retval;
6505 6539
6506out_unlock: 6540out_unlock:
6507 read_unlock(&tasklist_lock); 6541 rcu_read_unlock();
6508 return retval; 6542 return retval;
6509} 6543}
6510 6544
@@ -6515,22 +6549,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6515 int retval; 6549 int retval;
6516 6550
6517 get_online_cpus(); 6551 get_online_cpus();
6518 read_lock(&tasklist_lock); 6552 rcu_read_lock();
6519 6553
6520 p = find_process_by_pid(pid); 6554 p = find_process_by_pid(pid);
6521 if (!p) { 6555 if (!p) {
6522 read_unlock(&tasklist_lock); 6556 rcu_read_unlock();
6523 put_online_cpus(); 6557 put_online_cpus();
6524 return -ESRCH; 6558 return -ESRCH;
6525 } 6559 }
6526 6560
6527 /* 6561 /* Prevent p going away */
6528 * It is not safe to call set_cpus_allowed with the
6529 * tasklist_lock held. We will bump the task_struct's
6530 * usage count and then drop tasklist_lock.
6531 */
6532 get_task_struct(p); 6562 get_task_struct(p);
6533 read_unlock(&tasklist_lock); 6563 rcu_read_unlock();
6534 6564
6535 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 6565 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6536 retval = -ENOMEM; 6566 retval = -ENOMEM;
@@ -6616,7 +6646,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6616 int retval; 6646 int retval;
6617 6647
6618 get_online_cpus(); 6648 get_online_cpus();
6619 read_lock(&tasklist_lock); 6649 rcu_read_lock();
6620 6650
6621 retval = -ESRCH; 6651 retval = -ESRCH;
6622 p = find_process_by_pid(pid); 6652 p = find_process_by_pid(pid);
@@ -6632,7 +6662,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6632 task_rq_unlock(rq, &flags); 6662 task_rq_unlock(rq, &flags);
6633 6663
6634out_unlock: 6664out_unlock:
6635 read_unlock(&tasklist_lock); 6665 rcu_read_unlock();
6636 put_online_cpus(); 6666 put_online_cpus();
6637 6667
6638 return retval; 6668 return retval;
@@ -6876,7 +6906,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6876 return -EINVAL; 6906 return -EINVAL;
6877 6907
6878 retval = -ESRCH; 6908 retval = -ESRCH;
6879 read_lock(&tasklist_lock); 6909 rcu_read_lock();
6880 p = find_process_by_pid(pid); 6910 p = find_process_by_pid(pid);
6881 if (!p) 6911 if (!p)
6882 goto out_unlock; 6912 goto out_unlock;
@@ -6889,13 +6919,13 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6889 time_slice = p->sched_class->get_rr_interval(rq, p); 6919 time_slice = p->sched_class->get_rr_interval(rq, p);
6890 task_rq_unlock(rq, &flags); 6920 task_rq_unlock(rq, &flags);
6891 6921
6892 read_unlock(&tasklist_lock); 6922 rcu_read_unlock();
6893 jiffies_to_timespec(time_slice, &t); 6923 jiffies_to_timespec(time_slice, &t);
6894 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6924 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6895 return retval; 6925 return retval;
6896 6926
6897out_unlock: 6927out_unlock:
6898 read_unlock(&tasklist_lock); 6928 rcu_read_unlock();
6899 return retval; 6929 return retval;
6900} 6930}
6901 6931
@@ -6986,6 +7016,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6986 raw_spin_lock_irqsave(&rq->lock, flags); 7016 raw_spin_lock_irqsave(&rq->lock, flags);
6987 7017
6988 __sched_fork(idle); 7018 __sched_fork(idle);
7019 idle->state = TASK_RUNNING;
6989 idle->se.exec_start = sched_clock(); 7020 idle->se.exec_start = sched_clock();
6990 7021
6991 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 7022 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
@@ -7100,7 +7131,23 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7100 struct rq *rq; 7131 struct rq *rq;
7101 int ret = 0; 7132 int ret = 0;
7102 7133
7134 /*
7135 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7136 * the ->cpus_allowed mask from under waking tasks, which would be
7137 * possible when we change rq->lock in ttwu(), so synchronize against
7138 * TASK_WAKING to avoid that.
7139 */
7140again:
7141 while (p->state == TASK_WAKING)
7142 cpu_relax();
7143
7103 rq = task_rq_lock(p, &flags); 7144 rq = task_rq_lock(p, &flags);
7145
7146 if (p->state == TASK_WAKING) {
7147 task_rq_unlock(rq, &flags);
7148 goto again;
7149 }
7150
7104 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 7151 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7105 ret = -EINVAL; 7152 ret = -EINVAL;
7106 goto out; 7153 goto out;
@@ -7156,7 +7203,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7156static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 7203static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7157{ 7204{
7158 struct rq *rq_dest, *rq_src; 7205 struct rq *rq_dest, *rq_src;
7159 int ret = 0, on_rq; 7206 int ret = 0;
7160 7207
7161 if (unlikely(!cpu_active(dest_cpu))) 7208 if (unlikely(!cpu_active(dest_cpu)))
7162 return ret; 7209 return ret;
@@ -7172,12 +7219,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7172 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7219 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7173 goto fail; 7220 goto fail;
7174 7221
7175 on_rq = p->se.on_rq; 7222 /*
7176 if (on_rq) 7223 * If we're not on a rq, the next wake-up will ensure we're
7224 * placed properly.
7225 */
7226 if (p->se.on_rq) {
7177 deactivate_task(rq_src, p, 0); 7227 deactivate_task(rq_src, p, 0);
7178 7228 set_task_cpu(p, dest_cpu);
7179 set_task_cpu(p, dest_cpu);
7180 if (on_rq) {
7181 activate_task(rq_dest, p, 0); 7229 activate_task(rq_dest, p, 0);
7182 check_preempt_curr(rq_dest, p, 0); 7230 check_preempt_curr(rq_dest, p, 0);
7183 } 7231 }
@@ -7273,37 +7321,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7273static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 7321static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7274{ 7322{
7275 int dest_cpu; 7323 int dest_cpu;
7276 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7277 7324
7278again: 7325again:
7279 /* Look for allowed, online CPU in same node. */ 7326 dest_cpu = select_fallback_rq(dead_cpu, p);
7280 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
7281 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7282 goto move;
7283
7284 /* Any allowed, online CPU? */
7285 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
7286 if (dest_cpu < nr_cpu_ids)
7287 goto move;
7288
7289 /* No more Mr. Nice Guy. */
7290 if (dest_cpu >= nr_cpu_ids) {
7291 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7292 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
7293
7294 /*
7295 * Don't tell them about moving exiting tasks or
7296 * kernel threads (both mm NULL), since they never
7297 * leave kernel.
7298 */
7299 if (p->mm && printk_ratelimit()) {
7300 printk(KERN_INFO "process %d (%s) no "
7301 "longer affine to cpu%d\n",
7302 task_pid_nr(p), p->comm, dead_cpu);
7303 }
7304 }
7305 7327
7306move:
7307 /* It can have affinity changed while we were choosing. */ 7328 /* It can have affinity changed while we were choosing. */
7308 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 7329 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7309 goto again; 7330 goto again;
@@ -9668,7 +9689,7 @@ void __init sched_init(void)
9668#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9689#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9669static inline int preempt_count_equals(int preempt_offset) 9690static inline int preempt_count_equals(int preempt_offset)
9670{ 9691{
9671 int nested = preempt_count() & ~PREEMPT_ACTIVE; 9692 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
9672 9693
9673 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 9694 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9674} 9695}
@@ -10083,7 +10104,7 @@ void sched_move_task(struct task_struct *tsk)
10083 10104
10084#ifdef CONFIG_FAIR_GROUP_SCHED 10105#ifdef CONFIG_FAIR_GROUP_SCHED
10085 if (tsk->sched_class->moved_group) 10106 if (tsk->sched_class->moved_group)
10086 tsk->sched_class->moved_group(tsk); 10107 tsk->sched_class->moved_group(tsk, on_rq);
10087#endif 10108#endif
10088 10109
10089 if (unlikely(running)) 10110 if (unlikely(running))
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 479ce5682d7c..5b496132c28a 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
236} 236}
237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
238 238
239unsigned long long cpu_clock(int cpu)
240{
241 unsigned long long clock;
242 unsigned long flags;
243
244 local_irq_save(flags);
245 clock = sched_clock_cpu(cpu);
246 local_irq_restore(flags);
247
248 return clock;
249}
250
239#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 251#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
240 252
241void sched_clock_init(void) 253void sched_clock_init(void)
@@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu)
251 return sched_clock(); 263 return sched_clock();
252} 264}
253 265
254#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
255 266
256unsigned long long cpu_clock(int cpu) 267unsigned long long cpu_clock(int cpu)
257{ 268{
258 unsigned long long clock; 269 return sched_clock_cpu(cpu);
259 unsigned long flags; 270}
260 271
261 local_irq_save(flags); 272#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
262 clock = sched_clock_cpu(cpu);
263 local_irq_restore(flags);
264 273
265 return clock;
266}
267EXPORT_SYMBOL_GPL(cpu_clock); 274EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5bedf6e3ebf3..42ac3c9f66f6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -510,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
510 curr->sum_exec_runtime += delta_exec; 510 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 511 schedstat_add(cfs_rq, exec_clock, delta_exec);
512 delta_exec_weighted = calc_delta_fair(delta_exec, curr); 512 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
513
513 curr->vruntime += delta_exec_weighted; 514 curr->vruntime += delta_exec_weighted;
514 update_min_vruntime(cfs_rq); 515 update_min_vruntime(cfs_rq);
515} 516}
@@ -765,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
765 se->vruntime = vruntime; 766 se->vruntime = vruntime;
766} 767}
767 768
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
768static void 772static void
769enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) 773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
770{ 774{
771 /* 775 /*
776 * Update the normalized vruntime before updating min_vruntime
777 * through callig update_curr().
778 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
780 se->vruntime += cfs_rq->min_vruntime;
781
782 /*
772 * Update run-time statistics of the 'current'. 783 * Update run-time statistics of the 'current'.
773 */ 784 */
774 update_curr(cfs_rq); 785 update_curr(cfs_rq);
775 account_entity_enqueue(cfs_rq, se); 786 account_entity_enqueue(cfs_rq, se);
776 787
777 if (wakeup) { 788 if (flags & ENQUEUE_WAKEUP) {
778 place_entity(cfs_rq, se, 0); 789 place_entity(cfs_rq, se, 0);
779 enqueue_sleeper(cfs_rq, se); 790 enqueue_sleeper(cfs_rq, se);
780 } 791 }
@@ -828,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
828 __dequeue_entity(cfs_rq, se); 839 __dequeue_entity(cfs_rq, se);
829 account_entity_dequeue(cfs_rq, se); 840 account_entity_dequeue(cfs_rq, se);
830 update_min_vruntime(cfs_rq); 841 update_min_vruntime(cfs_rq);
842
843 /*
844 * Normalize the entity after updating the min_vruntime because the
845 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position.
847 */
848 if (!sleep)
849 se->vruntime -= cfs_rq->min_vruntime;
831} 850}
832 851
833/* 852/*
@@ -1038,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
1038{ 1057{
1039 struct cfs_rq *cfs_rq; 1058 struct cfs_rq *cfs_rq;
1040 struct sched_entity *se = &p->se; 1059 struct sched_entity *se = &p->se;
1060 int flags = 0;
1061
1062 if (wakeup)
1063 flags |= ENQUEUE_WAKEUP;
1064 if (p->state == TASK_WAKING)
1065 flags |= ENQUEUE_MIGRATE;
1041 1066
1042 for_each_sched_entity(se) { 1067 for_each_sched_entity(se) {
1043 if (se->on_rq) 1068 if (se->on_rq)
1044 break; 1069 break;
1045 cfs_rq = cfs_rq_of(se); 1070 cfs_rq = cfs_rq_of(se);
1046 enqueue_entity(cfs_rq, se, wakeup); 1071 enqueue_entity(cfs_rq, se, flags);
1047 wakeup = 1; 1072 flags = ENQUEUE_WAKEUP;
1048 } 1073 }
1049 1074
1050 hrtick_update(rq); 1075 hrtick_update(rq);
@@ -1120,6 +1145,14 @@ static void yield_task_fair(struct rq *rq)
1120 1145
1121#ifdef CONFIG_SMP 1146#ifdef CONFIG_SMP
1122 1147
1148static void task_waking_fair(struct rq *rq, struct task_struct *p)
1149{
1150 struct sched_entity *se = &p->se;
1151 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1152
1153 se->vruntime -= cfs_rq->min_vruntime;
1154}
1155
1123#ifdef CONFIG_FAIR_GROUP_SCHED 1156#ifdef CONFIG_FAIR_GROUP_SCHED
1124/* 1157/*
1125 * effective_load() calculates the load change as seen from the root_task_group 1158 * effective_load() calculates the load change as seen from the root_task_group
@@ -1429,6 +1462,9 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1429 } 1462 }
1430 1463
1431 for_each_domain(cpu, tmp) { 1464 for_each_domain(cpu, tmp) {
1465 if (!(tmp->flags & SD_LOAD_BALANCE))
1466 continue;
1467
1432 /* 1468 /*
1433 * If power savings logic is enabled for a domain, see if we 1469 * If power savings logic is enabled for a domain, see if we
1434 * are not overloaded, if so, don't balance wider. 1470 * are not overloaded, if so, don't balance wider.
@@ -1975,6 +2011,8 @@ static void task_fork_fair(struct task_struct *p)
1975 resched_task(rq->curr); 2011 resched_task(rq->curr);
1976 } 2012 }
1977 2013
2014 se->vruntime -= cfs_rq->min_vruntime;
2015
1978 raw_spin_unlock_irqrestore(&rq->lock, flags); 2016 raw_spin_unlock_irqrestore(&rq->lock, flags);
1979} 2017}
1980 2018
@@ -2028,12 +2066,13 @@ static void set_curr_task_fair(struct rq *rq)
2028} 2066}
2029 2067
2030#ifdef CONFIG_FAIR_GROUP_SCHED 2068#ifdef CONFIG_FAIR_GROUP_SCHED
2031static void moved_group_fair(struct task_struct *p) 2069static void moved_group_fair(struct task_struct *p, int on_rq)
2032{ 2070{
2033 struct cfs_rq *cfs_rq = task_cfs_rq(p); 2071 struct cfs_rq *cfs_rq = task_cfs_rq(p);
2034 2072
2035 update_curr(cfs_rq); 2073 update_curr(cfs_rq);
2036 place_entity(cfs_rq, &p->se, 1); 2074 if (!on_rq)
2075 place_entity(cfs_rq, &p->se, 1);
2037} 2076}
2038#endif 2077#endif
2039 2078
@@ -2073,6 +2112,8 @@ static const struct sched_class fair_sched_class = {
2073 .move_one_task = move_one_task_fair, 2112 .move_one_task = move_one_task_fair,
2074 .rq_online = rq_online_fair, 2113 .rq_online = rq_online_fair,
2075 .rq_offline = rq_offline_fair, 2114 .rq_offline = rq_offline_fair,
2115
2116 .task_waking = task_waking_fair,
2076#endif 2117#endif
2077 2118
2078 .set_curr_task = set_curr_task_fair, 2119 .set_curr_task = set_curr_task_fair,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d2ea2828164e..f48328ac216f 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1472,7 +1472,7 @@ static void post_schedule_rt(struct rq *rq)
1472 * If we are not running and we are not going to reschedule soon, we should 1472 * If we are not running and we are not going to reschedule soon, we should
1473 * try to push tasks away now 1473 * try to push tasks away now
1474 */ 1474 */
1475static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1475static void task_woken_rt(struct rq *rq, struct task_struct *p)
1476{ 1476{
1477 if (!task_running(rq, p) && 1477 if (!task_running(rq, p) &&
1478 !test_tsk_need_resched(rq->curr) && 1478 !test_tsk_need_resched(rq->curr) &&
@@ -1753,7 +1753,7 @@ static const struct sched_class rt_sched_class = {
1753 .rq_offline = rq_offline_rt, 1753 .rq_offline = rq_offline_rt,
1754 .pre_schedule = pre_schedule_rt, 1754 .pre_schedule = pre_schedule_rt,
1755 .post_schedule = post_schedule_rt, 1755 .post_schedule = post_schedule_rt,
1756 .task_wake_up = task_wake_up_rt, 1756 .task_woken = task_woken_rt,
1757 .switched_from = switched_from_rt, 1757 .switched_from = switched_from_rt,
1758#endif 1758#endif
1759 1759
diff --git a/kernel/signal.c b/kernel/signal.c
index 1814e68e4de3..934ae5e687b9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -218,13 +218,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
218 struct user_struct *user; 218 struct user_struct *user;
219 219
220 /* 220 /*
221 * We won't get problems with the target's UID changing under us 221 * Protect access to @t credentials. This can go away when all
222 * because changing it requires RCU be used, and if t != current, the 222 * callers hold rcu read lock.
223 * caller must be holding the RCU readlock (by way of a spinlock) and
224 * we use RCU protection here
225 */ 223 */
224 rcu_read_lock();
226 user = get_uid(__task_cred(t)->user); 225 user = get_uid(__task_cred(t)->user);
227 atomic_inc(&user->sigpending); 226 atomic_inc(&user->sigpending);
227 rcu_read_unlock();
228 228
229 if (override_rlimit || 229 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 230 atomic_read(&user->sigpending) <=
@@ -979,7 +979,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
979 for (i = 0; i < 16; i++) { 979 for (i = 0; i < 16; i++) {
980 unsigned char insn; 980 unsigned char insn;
981 981
982 __get_user(insn, (unsigned char *)(regs->ip + i)); 982 if (get_user(insn, (unsigned char *)(regs->ip + i)))
983 break;
983 printk("%02x ", insn); 984 printk("%02x ", insn);
984 } 985 }
985 } 986 }
@@ -1179,11 +1180,12 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1179 int ret = -EINVAL; 1180 int ret = -EINVAL;
1180 struct task_struct *p; 1181 struct task_struct *p;
1181 const struct cred *pcred; 1182 const struct cred *pcred;
1183 unsigned long flags;
1182 1184
1183 if (!valid_signal(sig)) 1185 if (!valid_signal(sig))
1184 return ret; 1186 return ret;
1185 1187
1186 read_lock(&tasklist_lock); 1188 rcu_read_lock();
1187 p = pid_task(pid, PIDTYPE_PID); 1189 p = pid_task(pid, PIDTYPE_PID);
1188 if (!p) { 1190 if (!p) {
1189 ret = -ESRCH; 1191 ret = -ESRCH;
@@ -1199,14 +1201,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1199 ret = security_task_kill(p, info, sig, secid); 1201 ret = security_task_kill(p, info, sig, secid);
1200 if (ret) 1202 if (ret)
1201 goto out_unlock; 1203 goto out_unlock;
1202 if (sig && p->sighand) { 1204
1203 unsigned long flags; 1205 if (sig) {
1204 spin_lock_irqsave(&p->sighand->siglock, flags); 1206 if (lock_task_sighand(p, &flags)) {
1205 ret = __send_signal(sig, info, p, 1, 0); 1207 ret = __send_signal(sig, info, p, 1, 0);
1206 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1208 unlock_task_sighand(p, &flags);
1209 } else
1210 ret = -ESRCH;
1207 } 1211 }
1208out_unlock: 1212out_unlock:
1209 read_unlock(&tasklist_lock); 1213 rcu_read_unlock();
1210 return ret; 1214 return ret;
1211} 1215}
1212EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1216EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
diff --git a/kernel/sys.c b/kernel/sys.c
index 20ccfb5da6af..26a6b73a6b85 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -162,6 +162,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
162 if (niceval > 19) 162 if (niceval > 19)
163 niceval = 19; 163 niceval = 19;
164 164
165 rcu_read_lock();
165 read_lock(&tasklist_lock); 166 read_lock(&tasklist_lock);
166 switch (which) { 167 switch (which) {
167 case PRIO_PROCESS: 168 case PRIO_PROCESS:
@@ -199,6 +200,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
199 } 200 }
200out_unlock: 201out_unlock:
201 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
203 rcu_read_unlock();
202out: 204out:
203 return error; 205 return error;
204} 206}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 45e4bef0012a..8a68b2448468 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1131,7 +1131,7 @@ static struct ctl_table vm_table[] = {
1131 .data = &sysctl_max_map_count, 1131 .data = &sysctl_max_map_count,
1132 .maxlen = sizeof(sysctl_max_map_count), 1132 .maxlen = sizeof(sysctl_max_map_count),
1133 .mode = 0644, 1133 .mode = 0644,
1134 .proc_handler = proc_dointvec, 1134 .proc_handler = proc_dointvec_minmax,
1135 .extra1 = &zero, 1135 .extra1 = &zero,
1136 }, 1136 },
1137#else 1137#else
@@ -1214,6 +1214,7 @@ static struct ctl_table vm_table[] = {
1214 .proc_handler = proc_dointvec_jiffies, 1214 .proc_handler = proc_dointvec_jiffies,
1215 }, 1215 },
1216#endif 1216#endif
1217#ifdef CONFIG_MMU
1217 { 1218 {
1218 .procname = "mmap_min_addr", 1219 .procname = "mmap_min_addr",
1219 .data = &dac_mmap_min_addr, 1220 .data = &dac_mmap_min_addr,
@@ -1221,6 +1222,7 @@ static struct ctl_table vm_table[] = {
1221 .mode = 0644, 1222 .mode = 0644,
1222 .proc_handler = mmap_min_addr_handler, 1223 .proc_handler = mmap_min_addr_handler,
1223 }, 1224 },
1225#endif
1224#ifdef CONFIG_NUMA 1226#ifdef CONFIG_NUMA
1225 { 1227 {
1226 .procname = "numa_zonelist_order", 1228 .procname = "numa_zonelist_order",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 112533d5fc08..8f5d16e0707a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1417,6 +1417,35 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1417 return; 1417 return;
1418} 1418}
1419 1419
1420#define WARN_ONCE_HASH_BITS 8
1421#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
1422
1423static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
1424
1425#define FNV32_OFFSET 2166136261U
1426#define FNV32_PRIME 0x01000193
1427
1428/*
1429 * Print each legacy sysctl (approximately) only once.
1430 * To avoid making the tables non-const use a external
1431 * hash-table instead.
1432 * Worst case hash collision: 6, but very rarely.
1433 * NOTE! We don't use the SMP-safe bit tests. We simply
1434 * don't care enough.
1435 */
1436static void warn_on_bintable(const int *name, int nlen)
1437{
1438 int i;
1439 u32 hash = FNV32_OFFSET;
1440
1441 for (i = 0; i < nlen; i++)
1442 hash = (hash ^ name[i]) * FNV32_PRIME;
1443 hash %= WARN_ONCE_HASH_SIZE;
1444 if (__test_and_set_bit(hash, warn_once_bitmap))
1445 return;
1446 deprecated_sysctl_warning(name, nlen);
1447}
1448
1420static ssize_t do_sysctl(int __user *args_name, int nlen, 1449static ssize_t do_sysctl(int __user *args_name, int nlen,
1421 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1450 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1422{ 1451{
@@ -1431,7 +1460,7 @@ static ssize_t do_sysctl(int __user *args_name, int nlen,
1431 if (get_user(name[i], args_name + i)) 1460 if (get_user(name[i], args_name + i))
1432 return -EFAULT; 1461 return -EFAULT;
1433 1462
1434 deprecated_sysctl_warning(name, nlen); 1463 warn_on_bintable(name, nlen);
1435 1464
1436 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); 1465 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1437} 1466}
diff --git a/kernel/time.c b/kernel/time.c
index c6324d96009e..804798005d19 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,6 +136,7 @@ static inline void warp_clock(void)
136 write_seqlock_irq(&xtime_lock); 136 write_seqlock_irq(&xtime_lock);
137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 138 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
139 update_xtime_cache(0);
139 write_sequnlock_irq(&xtime_lock); 140 write_sequnlock_irq(&xtime_lock);
140 clock_was_set(); 141 clock_was_set();
141} 142}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d5fc0fd1cca..6f740d9f0948 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -238,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,
238 */ 238 */
239void clockevents_notify(unsigned long reason, void *arg) 239void clockevents_notify(unsigned long reason, void *arg)
240{ 240{
241 struct list_head *node, *tmp; 241 struct clock_event_device *dev, *tmp;
242 unsigned long flags; 242 unsigned long flags;
243 int cpu;
243 244
244 raw_spin_lock_irqsave(&clockevents_lock, flags); 245 raw_spin_lock_irqsave(&clockevents_lock, flags);
245 clockevents_do_notify(reason, arg); 246 clockevents_do_notify(reason, arg);
@@ -250,8 +251,19 @@ void clockevents_notify(unsigned long reason, void *arg)
250 * Unregister the clock event devices which were 251 * Unregister the clock event devices which were
251 * released from the users in the notify chain. 252 * released from the users in the notify chain.
252 */ 253 */
253 list_for_each_safe(node, tmp, &clockevents_released) 254 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
254 list_del(node); 255 list_del(&dev->list);
256 /*
257 * Now check whether the CPU has left unused per cpu devices
258 */
259 cpu = *((int *)arg);
260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
261 if (cpumask_test_cpu(cpu, dev->cpumask) &&
262 cpumask_weight(dev->cpumask) == 1) {
263 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
264 list_del(&dev->list);
265 }
266 }
255 break; 267 break;
256 default: 268 default:
257 break; 269 break;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index af4135f05825..7faaa32fbf4f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,6 +165,13 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
168/* must hold xtime_lock */ 175/* must hold xtime_lock */
169void timekeeping_leap_insert(int leapsecond) 176void timekeeping_leap_insert(int leapsecond)
170{ 177{
@@ -325,6 +332,8 @@ int do_settimeofday(struct timespec *tv)
325 332
326 xtime = *tv; 333 xtime = *tv;
327 334
335 update_xtime_cache(0);
336
328 timekeeper.ntp_error = 0; 337 timekeeper.ntp_error = 0;
329 ntp_clear(); 338 ntp_clear();
330 339
@@ -550,6 +559,7 @@ void __init timekeeping_init(void)
550 } 559 }
551 set_normalized_timespec(&wall_to_monotonic, 560 set_normalized_timespec(&wall_to_monotonic,
552 -boot.tv_sec, -boot.tv_nsec); 561 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
553 total_sleep_time.tv_sec = 0; 563 total_sleep_time.tv_sec = 0;
554 total_sleep_time.tv_nsec = 0; 564 total_sleep_time.tv_nsec = 0;
555 write_sequnlock_irqrestore(&xtime_lock, flags); 565 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -583,6 +593,7 @@ static int timekeeping_resume(struct sys_device *dev)
583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
584 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 594 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
585 } 595 }
596 update_xtime_cache(0);
586 /* re-base the last cycle value */ 597 /* re-base the last cycle value */
587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
588 timekeeper.ntp_error = 0; 599 timekeeper.ntp_error = 0;
@@ -722,6 +733,7 @@ static void timekeeping_adjust(s64 offset)
722 timekeeper.ntp_error_shift; 733 timekeeper.ntp_error_shift;
723} 734}
724 735
736
725/** 737/**
726 * logarithmic_accumulation - shifted accumulation of cycles 738 * logarithmic_accumulation - shifted accumulation of cycles
727 * 739 *
@@ -765,6 +777,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
765 return offset; 777 return offset;
766} 778}
767 779
780
768/** 781/**
769 * update_wall_time - Uses the current clocksource to increment the wall time 782 * update_wall_time - Uses the current clocksource to increment the wall time
770 * 783 *
@@ -774,6 +787,7 @@ void update_wall_time(void)
774{ 787{
775 struct clocksource *clock; 788 struct clocksource *clock;
776 cycle_t offset; 789 cycle_t offset;
790 u64 nsecs;
777 int shift = 0, maxshift; 791 int shift = 0, maxshift;
778 792
779 /* Make sure we're fully resumed: */ 793 /* Make sure we're fully resumed: */
@@ -839,6 +853,9 @@ void update_wall_time(void)
839 timekeeper.ntp_error += timekeeper.xtime_nsec << 853 timekeeper.ntp_error += timekeeper.xtime_nsec <<
840 timekeeper.ntp_error_shift; 854 timekeeper.ntp_error_shift;
841 855
856 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
857 update_xtime_cache(nsecs);
858
842 /* check to see if there is a new clocksource to use */ 859 /* check to see if there is a new clocksource to use */
843 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 860 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
844} 861}
@@ -875,13 +892,13 @@ void monotonic_to_bootbased(struct timespec *ts)
875 892
876unsigned long get_seconds(void) 893unsigned long get_seconds(void)
877{ 894{
878 return xtime.tv_sec; 895 return xtime_cache.tv_sec;
879} 896}
880EXPORT_SYMBOL(get_seconds); 897EXPORT_SYMBOL(get_seconds);
881 898
882struct timespec __current_kernel_time(void) 899struct timespec __current_kernel_time(void)
883{ 900{
884 return xtime; 901 return xtime_cache;
885} 902}
886 903
887struct timespec current_kernel_time(void) 904struct timespec current_kernel_time(void)
@@ -891,7 +908,8 @@ struct timespec current_kernel_time(void)
891 908
892 do { 909 do {
893 seq = read_seqbegin(&xtime_lock); 910 seq = read_seqbegin(&xtime_lock);
894 now = xtime; 911
912 now = xtime_cache;
895 } while (read_seqretry(&xtime_lock, seq)); 913 } while (read_seqretry(&xtime_lock, seq));
896 914
897 return now; 915 return now;
@@ -905,7 +923,8 @@ struct timespec get_monotonic_coarse(void)
905 923
906 do { 924 do {
907 seq = read_seqbegin(&xtime_lock); 925 seq = read_seqbegin(&xtime_lock);
908 now = xtime; 926
927 now = xtime_cache;
909 mono = wall_to_monotonic; 928 mono = wall_to_monotonic;
910 } while (read_seqretry(&xtime_lock, seq)); 929 } while (read_seqretry(&xtime_lock, seq));
911 930
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 28265636b6c2..bdfb8dd1050c 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -237,10 +237,10 @@ static void timer_list_show_tickdevices(struct seq_file *m)
237#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 237#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
238 print_tickdevice(m, tick_get_broadcast_device(), -1); 238 print_tickdevice(m, tick_get_broadcast_device(), -1);
239 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 239 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
240 tick_get_broadcast_mask()->bits[0]); 240 cpumask_bits(tick_get_broadcast_mask())[0]);
241#ifdef CONFIG_TICK_ONESHOT 241#ifdef CONFIG_TICK_ONESHOT
242 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", 242 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
243 tick_get_broadcast_oneshot_mask()->bits[0]); 243 cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
244#endif 244#endif
245 SEQ_printf(m, "\n"); 245 SEQ_printf(m, "\n");
246#endif 246#endif
diff --git a/kernel/timer.c b/kernel/timer.c
index 5db5a8d26811..15533b792397 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -656,8 +656,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
656 656
657 debug_activate(timer, expires); 657 debug_activate(timer, expires);
658 658
659 new_base = __get_cpu_var(tvec_bases);
660
661 cpu = smp_processor_id(); 659 cpu = smp_processor_id();
662 660
663#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 661#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d006554888dc..6c22d8a2f289 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,17 +12,17 @@ config NOP_TRACER
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help 14 help
15 See Documentation/trace/ftrace-implementation.txt 15 See Documentation/trace/ftrace-design.txt
16 16
17config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
18 bool 18 bool
19 help 19 help
20 See Documentation/trace/ftrace-implementation.txt 20 See Documentation/trace/ftrace-design.txt
21 21
22config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
23 bool 23 bool
24 help 24 help
25 See Documentation/trace/ftrace-implementation.txt 25 See Documentation/trace/ftrace-design.txt
26 26
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
@@ -34,17 +34,17 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 34config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 35 bool
36 help 36 help
37 See Documentation/trace/ftrace-implementation.txt 37 See Documentation/trace/ftrace-design.txt
38 38
39config HAVE_DYNAMIC_FTRACE 39config HAVE_DYNAMIC_FTRACE
40 bool 40 bool
41 help 41 help
42 See Documentation/trace/ftrace-implementation.txt 42 See Documentation/trace/ftrace-design.txt
43 43
44config HAVE_FTRACE_MCOUNT_RECORD 44config HAVE_FTRACE_MCOUNT_RECORD
45 bool 45 bool
46 help 46 help
47 See Documentation/trace/ftrace-implementation.txt 47 See Documentation/trace/ftrace-design.txt
48 48
49config HAVE_HW_BRANCH_TRACER 49config HAVE_HW_BRANCH_TRACER
50 bool 50 bool
@@ -52,7 +52,7 @@ config HAVE_HW_BRANCH_TRACER
52config HAVE_SYSCALL_TRACEPOINTS 52config HAVE_SYSCALL_TRACEPOINTS
53 bool 53 bool
54 help 54 help
55 See Documentation/trace/ftrace-implementation.txt 55 See Documentation/trace/ftrace-design.txt
56 56
57config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
58 bool 58 bool
@@ -83,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP
83# This allows those options to appear when no other tracer is selected. But the 83# This allows those options to appear when no other tracer is selected. But the
84# options do not appear when something else selects it. We need the two options 84# options do not appear when something else selects it. We need the two options
85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
86# hidding of the automatic options. 86# hiding of the automatic options.
87 87
88config TRACING 88config TRACING
89 bool 89 bool
@@ -119,7 +119,7 @@ menuconfig FTRACE
119 bool "Tracers" 119 bool "Tracers"
120 default y if DEBUG_KERNEL 120 default y if DEBUG_KERNEL
121 help 121 help
122 Enable the kernel tracing infrastructure. 122 Enable the kernel tracing infrastructure.
123 123
124if FTRACE 124if FTRACE
125 125
@@ -133,7 +133,7 @@ config FUNCTION_TRACER
133 help 133 help
134 Enable the kernel to trace every kernel function. This is done 134 Enable the kernel to trace every kernel function. This is done
135 by using a compiler feature to insert a small, 5-byte No-Operation 135 by using a compiler feature to insert a small, 5-byte No-Operation
136 instruction to the beginning of every kernel function, which NOP 136 instruction at the beginning of every kernel function, which NOP
137 sequence is then dynamically patched into a tracer call when 137 sequence is then dynamically patched into a tracer call when
138 tracing is enabled by the administrator. If it's runtime disabled 138 tracing is enabled by the administrator. If it's runtime disabled
139 (the bootup default), then the overhead of the instructions is very 139 (the bootup default), then the overhead of the instructions is very
@@ -150,7 +150,7 @@ config FUNCTION_GRAPH_TRACER
150 and its entry. 150 and its entry.
151 Its first purpose is to trace the duration of functions and 151 Its first purpose is to trace the duration of functions and
152 draw a call graph for each thread with some information like 152 draw a call graph for each thread with some information like
153 the return value. This is done by setting the current return 153 the return value. This is done by setting the current return
154 address on the current task structure into a stack of calls. 154 address on the current task structure into a stack of calls.
155 155
156 156
@@ -173,7 +173,7 @@ config IRQSOFF_TRACER
173 173
174 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 174 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
175 175
176 (Note that kernel size and overhead increases with this option 176 (Note that kernel size and overhead increase with this option
177 enabled. This option and the preempt-off timing option can be 177 enabled. This option and the preempt-off timing option can be
178 used together or separately.) 178 used together or separately.)
179 179
@@ -186,7 +186,7 @@ config PREEMPT_TRACER
186 select TRACER_MAX_TRACE 186 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP 187 select RING_BUFFER_ALLOW_SWAP
188 help 188 help
189 This option measures the time spent in preemption off critical 189 This option measures the time spent in preemption-off critical
190 sections, with microsecond accuracy. 190 sections, with microsecond accuracy.
191 191
192 The default measurement method is a maximum search, which is 192 The default measurement method is a maximum search, which is
@@ -195,7 +195,7 @@ config PREEMPT_TRACER
195 195
196 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 196 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
197 197
198 (Note that kernel size and overhead increases with this option 198 (Note that kernel size and overhead increase with this option
199 enabled. This option and the irqs-off timing option can be 199 enabled. This option and the irqs-off timing option can be
200 used together or separately.) 200 used together or separately.)
201 201
@@ -222,7 +222,7 @@ config ENABLE_DEFAULT_TRACERS
222 depends on !GENERIC_TRACER 222 depends on !GENERIC_TRACER
223 select TRACING 223 select TRACING
224 help 224 help
225 This tracer hooks to various trace points in the kernel 225 This tracer hooks to various trace points in the kernel,
226 allowing the user to pick and choose which trace point they 226 allowing the user to pick and choose which trace point they
227 want to trace. It also includes the sched_switch tracer plugin. 227 want to trace. It also includes the sched_switch tracer plugin.
228 228
@@ -265,19 +265,19 @@ choice
265 The likely/unlikely profiler only looks at the conditions that 265 The likely/unlikely profiler only looks at the conditions that
266 are annotated with a likely or unlikely macro. 266 are annotated with a likely or unlikely macro.
267 267
268 The "all branch" profiler will profile every if statement in the 268 The "all branch" profiler will profile every if-statement in the
269 kernel. This profiler will also enable the likely/unlikely 269 kernel. This profiler will also enable the likely/unlikely
270 profiler as well. 270 profiler.
271 271
272 Either of the above profilers add a bit of overhead to the system. 272 Either of the above profilers adds a bit of overhead to the system.
273 If unsure choose "No branch profiling". 273 If unsure, choose "No branch profiling".
274 274
275config BRANCH_PROFILE_NONE 275config BRANCH_PROFILE_NONE
276 bool "No branch profiling" 276 bool "No branch profiling"
277 help 277 help
278 No branch profiling. Branch profiling adds a bit of overhead. 278 No branch profiling. Branch profiling adds a bit of overhead.
279 Only enable it if you want to analyse the branching behavior. 279 Only enable it if you want to analyse the branching behavior.
280 Otherwise keep it disabled. 280 Otherwise keep it disabled.
281 281
282config PROFILE_ANNOTATED_BRANCHES 282config PROFILE_ANNOTATED_BRANCHES
283 bool "Trace likely/unlikely profiler" 283 bool "Trace likely/unlikely profiler"
@@ -288,7 +288,7 @@ config PROFILE_ANNOTATED_BRANCHES
288 288
289 /sys/kernel/debug/tracing/profile_annotated_branch 289 /sys/kernel/debug/tracing/profile_annotated_branch
290 290
291 Note: this will add a significant overhead, only turn this 291 Note: this will add a significant overhead; only turn this
292 on if you need to profile the system's use of these macros. 292 on if you need to profile the system's use of these macros.
293 293
294config PROFILE_ALL_BRANCHES 294config PROFILE_ALL_BRANCHES
@@ -305,7 +305,7 @@ config PROFILE_ALL_BRANCHES
305 305
306 This configuration, when enabled, will impose a great overhead 306 This configuration, when enabled, will impose a great overhead
307 on the system. This should only be enabled when the system 307 on the system. This should only be enabled when the system
308 is to be analyzed 308 is to be analyzed in much detail.
309endchoice 309endchoice
310 310
311config TRACING_BRANCHES 311config TRACING_BRANCHES
@@ -335,7 +335,7 @@ config POWER_TRACER
335 depends on X86 335 depends on X86
336 select GENERIC_TRACER 336 select GENERIC_TRACER
337 help 337 help
338 This tracer helps developers to analyze and optimize the kernels 338 This tracer helps developers to analyze and optimize the kernel's
339 power management decisions, specifically the C-state and P-state 339 power management decisions, specifically the C-state and P-state
340 behavior. 340 behavior.
341 341
@@ -391,14 +391,14 @@ config HW_BRANCH_TRACER
391 select GENERIC_TRACER 391 select GENERIC_TRACER
392 help 392 help
393 This tracer records all branches on the system in a circular 393 This tracer records all branches on the system in a circular
394 buffer giving access to the last N branches for each cpu. 394 buffer, giving access to the last N branches for each cpu.
395 395
396config KMEMTRACE 396config KMEMTRACE
397 bool "Trace SLAB allocations" 397 bool "Trace SLAB allocations"
398 select GENERIC_TRACER 398 select GENERIC_TRACER
399 help 399 help
400 kmemtrace provides tracing for slab allocator functions, such as 400 kmemtrace provides tracing for slab allocator functions, such as
401 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 401 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
402 data is then fed to the userspace application in order to analyse 402 data is then fed to the userspace application in order to analyse
403 allocation hotspots, internal fragmentation and so on, making it 403 allocation hotspots, internal fragmentation and so on, making it
404 possible to see how well an allocator performs, as well as debug 404 possible to see how well an allocator performs, as well as debug
@@ -417,15 +417,15 @@ config WORKQUEUE_TRACER
417 bool "Trace workqueues" 417 bool "Trace workqueues"
418 select GENERIC_TRACER 418 select GENERIC_TRACER
419 help 419 help
420 The workqueue tracer provides some statistical informations 420 The workqueue tracer provides some statistical information
421 about each cpu workqueue thread such as the number of the 421 about each cpu workqueue thread such as the number of the
422 works inserted and executed since their creation. It can help 422 works inserted and executed since their creation. It can help
423 to evaluate the amount of work each of them have to perform. 423 to evaluate the amount of work each of them has to perform.
424 For example it can help a developer to decide whether he should 424 For example it can help a developer to decide whether he should
425 choose a per cpu workqueue instead of a singlethreaded one. 425 choose a per-cpu workqueue instead of a singlethreaded one.
426 426
427config BLK_DEV_IO_TRACE 427config BLK_DEV_IO_TRACE
428 bool "Support for tracing block io actions" 428 bool "Support for tracing block IO actions"
429 depends on SYSFS 429 depends on SYSFS
430 depends on BLOCK 430 depends on BLOCK
431 select RELAY 431 select RELAY
@@ -456,15 +456,15 @@ config KPROBE_EVENT
456 select TRACING 456 select TRACING
457 default y 457 default y
458 help 458 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly 459 This allows the user to add tracing events (similar to tracepoints)
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt 460 on the fly via the ftrace interface. See
461 for more details. 461 Documentation/trace/kprobetrace.txt for more details.
462 462
463 Those events can be inserted wherever kprobes can probe, and record 463 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values. 464 various register and memory values.
465 465
466 This option is also required by perf-probe subcommand of perf tools. If 466 This option is also required by perf-probe subcommand of perf tools.
467 you want to use perf tools, this option is strongly recommended. 467 If you want to use perf tools, this option is strongly recommended.
468 468
469config DYNAMIC_FTRACE 469config DYNAMIC_FTRACE
470 bool "enable/disable ftrace tracepoints dynamically" 470 bool "enable/disable ftrace tracepoints dynamically"
@@ -472,32 +472,32 @@ config DYNAMIC_FTRACE
472 depends on HAVE_DYNAMIC_FTRACE 472 depends on HAVE_DYNAMIC_FTRACE
473 default y 473 default y
474 help 474 help
475 This option will modify all the calls to ftrace dynamically 475 This option will modify all the calls to ftrace dynamically
476 (will patch them out of the binary image and replaces them 476 (will patch them out of the binary image and replace them
477 with a No-Op instruction) as they are called. A table is 477 with a No-Op instruction) as they are called. A table is
478 created to dynamically enable them again. 478 created to dynamically enable them again.
479 479
480 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise 480 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
481 has native performance as long as no tracing is active. 481 otherwise has native performance as long as no tracing is active.
482 482
483 The changes to the code are done by a kernel thread that 483 The changes to the code are done by a kernel thread that
484 wakes up once a second and checks to see if any ftrace calls 484 wakes up once a second and checks to see if any ftrace calls
485 were made. If so, it runs stop_machine (stops all CPUS) 485 were made. If so, it runs stop_machine (stops all CPUS)
486 and modifies the code to jump over the call to ftrace. 486 and modifies the code to jump over the call to ftrace.
487 487
488config FUNCTION_PROFILER 488config FUNCTION_PROFILER
489 bool "Kernel function profiler" 489 bool "Kernel function profiler"
490 depends on FUNCTION_TRACER 490 depends on FUNCTION_TRACER
491 default n 491 default n
492 help 492 help
493 This option enables the kernel function profiler. A file is created 493 This option enables the kernel function profiler. A file is created
494 in debugfs called function_profile_enabled which defaults to zero. 494 in debugfs called function_profile_enabled which defaults to zero.
495 When a 1 is echoed into this file profiling begins, and when a 495 When a 1 is echoed into this file profiling begins, and when a
496 zero is entered, profiling stops. A file in the trace_stats 496 zero is entered, profiling stops. A "functions" file is created in
497 directory called functions, that show the list of functions that 497 the trace_stats directory; this file shows the list of functions that
498 have been hit and their counters. 498 have been hit and their counters.
499 499
500 If in doubt, say N 500 If in doubt, say N.
501 501
502config FTRACE_MCOUNT_RECORD 502config FTRACE_MCOUNT_RECORD
503 def_bool y 503 def_bool y
@@ -556,8 +556,8 @@ config RING_BUFFER_BENCHMARK
556 tristate "Ring buffer benchmark stress tester" 556 tristate "Ring buffer benchmark stress tester"
557 depends on RING_BUFFER 557 depends on RING_BUFFER
558 help 558 help
559 This option creates a test to stress the ring buffer and bench mark it. 559 This option creates a test to stress the ring buffer and benchmark it.
560 It creates its own ring buffer such that it will not interfer with 560 It creates its own ring buffer such that it will not interfere with
561 any other users of the ring buffer (such as ftrace). It then creates 561 any other users of the ring buffer (such as ftrace). It then creates
562 a producer and consumer that will run for 10 seconds and sleep for 562 a producer and consumer that will run for 10 seconds and sleep for
563 10 seconds. Each interval it will print out the number of events 563 10 seconds. Each interval it will print out the number of events
@@ -566,7 +566,7 @@ config RING_BUFFER_BENCHMARK
566 It does not disable interrupts or raise its priority, so it may be 566 It does not disable interrupts or raise its priority, so it may be
567 affected by processes that are running. 567 affected by processes that are running.
568 568
569 If unsure, say N 569 If unsure, say N.
570 570
571endif # FTRACE 571endif # FTRACE
572 572
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 06ba26747d7e..0df1b0f2cb9e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -12,7 +12,7 @@
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <linux/utsrelease.h> 15#include <generated/utsrelease.h>
16#include <linux/stacktrace.h> 16#include <linux/stacktrace.h>
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
@@ -3949,7 +3949,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3949 if (!!(topt->flags->val & topt->opt->bit) != val) { 3949 if (!!(topt->flags->val & topt->opt->bit) != val) {
3950 mutex_lock(&trace_types_lock); 3950 mutex_lock(&trace_types_lock);
3951 ret = __set_tracer_option(current_trace, topt->flags, 3951 ret = __set_tracer_option(current_trace, topt->flags,
3952 topt->opt, val); 3952 topt->opt, !val);
3953 mutex_unlock(&trace_types_lock); 3953 mutex_unlock(&trace_types_lock);
3954 if (ret) 3954 if (ret)
3955 return ret; 3955 return ret;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 458e5bfe26d0..d4fa5dc1ee4e 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -158,7 +158,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
160 offsetof(typeof(field), item), \ 160 offsetof(typeof(field), item), \
161 sizeof(field.item), 0, FILTER_OTHER); \ 161 sizeof(field.item), \
162 is_signed_type(type), FILTER_OTHER); \
162 if (ret) \ 163 if (ret) \
163 return ret; 164 return ret;
164 165
@@ -168,8 +169,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
168 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 169 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
169 offsetof(typeof(field), \ 170 offsetof(typeof(field), \
170 container.item), \ 171 container.item), \
171 sizeof(field.container.item), 0, \ 172 sizeof(field.container.item), \
172 FILTER_OTHER); \ 173 is_signed_type(type), FILTER_OTHER); \
173 if (ret) \ 174 if (ret) \
174 return ret; 175 return ret;
175 176
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 75d75dec226a..47f54ab57b68 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1201,10 +1201,11 @@ static int __probe_event_show_format(struct trace_seq *s,
1201#undef SHOW_FIELD 1201#undef SHOW_FIELD
1202#define SHOW_FIELD(type, item, name) \ 1202#define SHOW_FIELD(type, item, name) \
1203 do { \ 1203 do { \
1204 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ 1204 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \
1205 "offset:%u;\tsize:%u;\n", name, \ 1205 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1206 (unsigned int)offsetof(typeof(field), item),\ 1206 (unsigned int)offsetof(typeof(field), item),\
1207 (unsigned int)sizeof(type)); \ 1207 (unsigned int)sizeof(type), \
1208 is_signed_type(type)); \
1208 if (!ret) \ 1209 if (!ret) \
1209 return 0; \ 1210 return 0; \
1210 } while (0) 1211 } while (0)
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index faf37fa4408c..94103cdcf9d8 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -26,12 +26,13 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27 27
28#include "trace_output.h" 28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h" 29#include "trace.h"
31 30
32#include <linux/hw_breakpoint.h> 31#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h> 32#include <asm/hw_breakpoint.h>
34 33
34#include <asm/atomic.h>
35
35/* 36/*
36 * For now, let us restrict the no. of symbols traced simultaneously to number 37 * For now, let us restrict the no. of symbols traced simultaneously to number
37 * of available hardware breakpoint registers. 38 * of available hardware breakpoint registers.
@@ -44,7 +45,7 @@ struct trace_ksym {
44 struct perf_event **ksym_hbp; 45 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr; 46 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER 47#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter; 48 atomic64_t counter;
48#endif 49#endif
49 struct hlist_node ksym_hlist; 50 struct hlist_node ksym_hlist;
50}; 51};
@@ -69,9 +70,8 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
69 70
70 rcu_read_lock(); 71 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { 72 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) && 73 if (entry->attr.bp_addr == hbp_hit_addr) {
73 (entry->counter <= MAX_UL_INT)) { 74 atomic64_inc(&entry->counter);
74 entry->counter++;
75 break; 75 break;
76 } 76 }
77 } 77 }
@@ -197,7 +197,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
197 entry->attr.bp_addr = addr; 197 entry->attr.bp_addr = addr;
198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4; 198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
199 199
200 ret = -EAGAIN;
201 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, 200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
202 ksym_hbp_handler); 201 ksym_hbp_handler);
203 202
@@ -300,8 +299,8 @@ static ssize_t ksym_trace_filter_write(struct file *file,
300 * 2: echo 0 > ksym_trace_filter 299 * 2: echo 0 > ksym_trace_filter
301 * 3: echo "*:---" > ksym_trace_filter 300 * 3: echo "*:---" > ksym_trace_filter
302 */ 301 */
303 if (!buf[0] || !strcmp(buf, "0") || 302 if (!input_string[0] || !strcmp(input_string, "0") ||
304 !strcmp(buf, "*:---")) { 303 !strcmp(input_string, "*:---")) {
305 __ksym_trace_reset(); 304 __ksym_trace_reset();
306 ret = 0; 305 ret = 0;
307 goto out; 306 goto out;
@@ -444,102 +443,77 @@ struct tracer ksym_tracer __read_mostly =
444 .print_line = ksym_trace_output 443 .print_line = ksym_trace_output
445}; 444};
446 445
447__init static int init_ksym_trace(void)
448{
449 struct dentry *d_tracer;
450 struct dentry *entry;
451
452 d_tracer = tracing_init_dentry();
453 ksym_filter_entry_count = 0;
454
455 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
456 NULL, &ksym_tracing_fops);
457 if (!entry)
458 pr_warning("Could not create debugfs "
459 "'ksym_trace_filter' file\n");
460
461 return register_tracer(&ksym_tracer);
462}
463device_initcall(init_ksym_trace);
464
465
466#ifdef CONFIG_PROFILE_KSYM_TRACER 446#ifdef CONFIG_PROFILE_KSYM_TRACER
467static int ksym_tracer_stat_headers(struct seq_file *m) 447static int ksym_profile_show(struct seq_file *m, void *v)
468{ 448{
449 struct hlist_node *node;
450 struct trace_ksym *entry;
451 int access_type = 0;
452 char fn_name[KSYM_NAME_LEN];
453
469 seq_puts(m, " Access Type "); 454 seq_puts(m, " Access Type ");
470 seq_puts(m, " Symbol Counter\n"); 455 seq_puts(m, " Symbol Counter\n");
471 seq_puts(m, " ----------- "); 456 seq_puts(m, " ----------- ");
472 seq_puts(m, " ------ -------\n"); 457 seq_puts(m, " ------ -------\n");
473 return 0;
474}
475 458
476static int ksym_tracer_stat_show(struct seq_file *m, void *v) 459 rcu_read_lock();
477{ 460 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
478 struct hlist_node *stat = v;
479 struct trace_ksym *entry;
480 int access_type = 0;
481 char fn_name[KSYM_NAME_LEN];
482 461
483 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); 462 access_type = entry->attr.bp_type;
484 463
485 access_type = entry->attr.bp_type; 464 switch (access_type) {
465 case HW_BREAKPOINT_R:
466 seq_puts(m, " R ");
467 break;
468 case HW_BREAKPOINT_W:
469 seq_puts(m, " W ");
470 break;
471 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
472 seq_puts(m, " RW ");
473 break;
474 default:
475 seq_puts(m, " NA ");
476 }
486 477
487 switch (access_type) { 478 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
488 case HW_BREAKPOINT_R: 479 seq_printf(m, " %-36s", fn_name);
489 seq_puts(m, " R "); 480 else
490 break; 481 seq_printf(m, " %-36s", "<NA>");
491 case HW_BREAKPOINT_W: 482 seq_printf(m, " %15llu\n",
492 seq_puts(m, " W "); 483 (unsigned long long)atomic64_read(&entry->counter));
493 break;
494 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
495 seq_puts(m, " RW ");
496 break;
497 default:
498 seq_puts(m, " NA ");
499 } 484 }
500 485 rcu_read_unlock();
501 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
502 seq_printf(m, " %-36s", fn_name);
503 else
504 seq_printf(m, " %-36s", "<NA>");
505 seq_printf(m, " %15lu\n", entry->counter);
506 486
507 return 0; 487 return 0;
508} 488}
509 489
510static void *ksym_tracer_stat_start(struct tracer_stat *trace) 490static int ksym_profile_open(struct inode *node, struct file *file)
511{ 491{
512 return ksym_filter_head.first; 492 return single_open(file, ksym_profile_show, NULL);
513}
514
515static void *
516ksym_tracer_stat_next(void *v, int idx)
517{
518 struct hlist_node *stat = v;
519
520 return stat->next;
521} 493}
522 494
523static struct tracer_stat ksym_tracer_stats = { 495static const struct file_operations ksym_profile_fops = {
524 .name = "ksym_tracer", 496 .open = ksym_profile_open,
525 .stat_start = ksym_tracer_stat_start, 497 .read = seq_read,
526 .stat_next = ksym_tracer_stat_next, 498 .llseek = seq_lseek,
527 .stat_headers = ksym_tracer_stat_headers, 499 .release = single_release,
528 .stat_show = ksym_tracer_stat_show
529}; 500};
501#endif /* CONFIG_PROFILE_KSYM_TRACER */
530 502
531__init static int ksym_tracer_stat_init(void) 503__init static int init_ksym_trace(void)
532{ 504{
533 int ret; 505 struct dentry *d_tracer;
534 506
535 ret = register_stat_tracer(&ksym_tracer_stats); 507 d_tracer = tracing_init_dentry();
536 if (ret) {
537 printk(KERN_WARNING "Warning: could not register "
538 "ksym tracer stats\n");
539 return 1;
540 }
541 508
542 return 0; 509 trace_create_file("ksym_trace_filter", 0644, d_tracer,
510 NULL, &ksym_tracing_fops);
511
512#ifdef CONFIG_PROFILE_KSYM_TRACER
513 trace_create_file("ksym_profile", 0444, d_tracer,
514 NULL, &ksym_profile_fops);
515#endif
516
517 return register_tracer(&ksym_tracer);
543} 518}
544fs_initcall(ksym_tracer_stat_init); 519device_initcall(init_ksym_trace);
545#endif /* CONFIG_PROFILE_KSYM_TRACER */