Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 32
-rw-r--r--  kernel/audit_tree.c | 91
-rw-r--r--  kernel/auditfilter.c | 14
-rw-r--r--  kernel/auditsc.c | 24
-rw-r--r--  kernel/cgroup.c | 21
-rw-r--r--  kernel/cgroup_freezer.c | 19
-rw-r--r--  kernel/cpu.c | 2
-rw-r--r--  kernel/cpuset.c | 33
-rw-r--r--  kernel/exit.c | 21
-rw-r--r--  kernel/extable.c | 21
-rw-r--r--  kernel/fork.c | 39
-rw-r--r--  kernel/futex.c | 290
-rw-r--r--  kernel/hrtimer.c | 26
-rw-r--r--  kernel/irq/autoprobe.c | 15
-rw-r--r--  kernel/irq/chip.c | 3
-rw-r--r--  kernel/irq/handle.c | 181
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/irq/manage.c | 68
-rw-r--r--  kernel/irq/migration.c | 11
-rw-r--r--  kernel/irq/proc.c | 8
-rw-r--r--  kernel/irq/spurious.c | 5
-rw-r--r--  kernel/kallsyms.c | 17
-rw-r--r--  kernel/kprobes.c | 23
-rw-r--r--  kernel/kthread.c | 3
-rw-r--r--  kernel/latencytop.c | 2
-rw-r--r--  kernel/lockdep.c | 38
-rw-r--r--  kernel/lockdep_proc.c | 28
-rw-r--r--  kernel/marker.c | 192
-rw-r--r--  kernel/module.c | 13
-rw-r--r--  kernel/mutex.c | 10
-rw-r--r--  kernel/notifier.c | 8
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/posix-cpu-timers.c | 19
-rw-r--r--  kernel/power/disk.c | 13
-rw-r--r--  kernel/power/main.c | 7
-rw-r--r--  kernel/power/swap.c | 2
-rw-r--r--  kernel/profile.c | 6
-rw-r--r--  kernel/ptrace.c | 4
-rw-r--r--  kernel/rcuclassic.c | 6
-rw-r--r--  kernel/relay.c | 16
-rw-r--r--  kernel/sched.c | 1181
-rw-r--r--  kernel/sched_cpupri.c | 39
-rw-r--r--  kernel/sched_cpupri.h | 5
-rw-r--r--  kernel/sched_debug.c | 103
-rw-r--r--  kernel/sched_fair.c | 31
-rw-r--r--  kernel/sched_rt.c | 80
-rw-r--r--  kernel/sched_stats.h | 18
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  kernel/softirq.c | 7
-rw-r--r--  kernel/softlockup.c | 4
-rw-r--r--  kernel/stop_machine.c | 5
-rw-r--r--  kernel/sys.c | 2
-rw-r--r--  kernel/sys_ni.c | 2
-rw-r--r--  kernel/sysctl.c | 20
-rw-r--r--  kernel/time/tick-sched.c | 14
-rw-r--r--  kernel/time/timekeeping.c | 22
-rw-r--r--  kernel/trace/Kconfig | 102
-rw-r--r--  kernel/trace/Makefile | 9
-rw-r--r--  kernel/trace/ftrace.c | 997
-rw-r--r--  kernel/trace/ring_buffer.c | 776
-rw-r--r--  kernel/trace/trace.c | 919
-rw-r--r--  kernel/trace/trace.h | 265
-rw-r--r--  kernel/trace/trace_boot.c | 166
-rw-r--r--  kernel/trace/trace_branch.c | 342
-rw-r--r--  kernel/trace/trace_bts.c | 276
-rw-r--r--  kernel/trace/trace_functions.c | 18
-rw-r--r--  kernel/trace/trace_functions_graph.c | 611
-rw-r--r--  kernel/trace/trace_irqsoff.c | 61
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 43
-rw-r--r--  kernel/trace/trace_nop.c | 65
-rw-r--r--  kernel/trace/trace_power.c | 179
-rw-r--r--  kernel/trace/trace_sched_switch.c | 106
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 70
-rw-r--r--  kernel/trace/trace_selftest.c | 173
-rw-r--r--  kernel/trace/trace_stack.c | 45
-rw-r--r--  kernel/trace/trace_sysprof.c | 19
-rw-r--r--  kernel/tracepoint.c | 295
-rw-r--r--  kernel/user.c | 2
79 files changed, 6375 insertions(+), 2038 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 9a3ec66a9d84..6a212b842d86 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o
13 13
14CFLAGS_REMOVE_sched.o = -mno-spe
15
16ifdef CONFIG_FUNCTION_TRACER 14ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 15# Do not trace debug files and internal ftrace files
18CFLAGS_REMOVE_lockdep.o = -pg 16CFLAGS_REMOVE_lockdep.o = -pg
@@ -21,7 +19,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 19CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 20CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 21CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_sched.o = -mno-spe -pg
25endif 22endif
26 23
27obj-$(CONFIG_FREEZER) += freezer.o 24obj-$(CONFIG_FREEZER) += freezer.o
@@ -92,7 +89,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
92obj-$(CONFIG_TRACING) += trace/ 89obj-$(CONFIG_TRACING) += trace/
93obj-$(CONFIG_SMP) += sched_cpupri.o 90obj-$(CONFIG_SMP) += sched_cpupri.o
94 91
95ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 92ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
96# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 93# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
97# needed for x86 only. Why this used to be enabled for all architectures is beyond 94# needed for x86 only. Why this used to be enabled for all architectures is beyond
98# me. I suspect most platforms don't need this, but until we know that for sure 95# me. I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/audit.c b/kernel/audit.c
index 4414e93d8750..ce6d8ea3131e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,8 +61,11 @@
61 61
62#include "audit.h" 62#include "audit.h"
63 63
64/* No auditing will take place until audit_initialized != 0. 64/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
65 * (Initialization happens after skb_init is called.) */ 65 * (Initialization happens after skb_init is called.) */
66#define AUDIT_DISABLED -1
67#define AUDIT_UNINITIALIZED 0
68#define AUDIT_INITIALIZED 1
66static int audit_initialized; 69static int audit_initialized;
67 70
68#define AUDIT_OFF 0 71#define AUDIT_OFF 0
@@ -965,6 +968,9 @@ static int __init audit_init(void)
965{ 968{
966 int i; 969 int i;
967 970
971 if (audit_initialized == AUDIT_DISABLED)
972 return 0;
973
968 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 974 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
969 audit_default ? "enabled" : "disabled"); 975 audit_default ? "enabled" : "disabled");
970 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, 976 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
@@ -976,7 +982,7 @@ static int __init audit_init(void)
976 982
977 skb_queue_head_init(&audit_skb_queue); 983 skb_queue_head_init(&audit_skb_queue);
978 skb_queue_head_init(&audit_skb_hold_queue); 984 skb_queue_head_init(&audit_skb_hold_queue);
979 audit_initialized = 1; 985 audit_initialized = AUDIT_INITIALIZED;
980 audit_enabled = audit_default; 986 audit_enabled = audit_default;
981 audit_ever_enabled |= !!audit_default; 987 audit_ever_enabled |= !!audit_default;
982 988
@@ -999,13 +1005,21 @@ __initcall(audit_init);
999static int __init audit_enable(char *str) 1005static int __init audit_enable(char *str)
1000{ 1006{
1001 audit_default = !!simple_strtol(str, NULL, 0); 1007 audit_default = !!simple_strtol(str, NULL, 0);
1002 printk(KERN_INFO "audit: %s%s\n", 1008 if (!audit_default)
1003 audit_default ? "enabled" : "disabled", 1009 audit_initialized = AUDIT_DISABLED;
1004 audit_initialized ? "" : " (after initialization)"); 1010
1005 if (audit_initialized) { 1011 printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
1012
1013 if (audit_initialized == AUDIT_INITIALIZED) {
1006 audit_enabled = audit_default; 1014 audit_enabled = audit_default;
1007 audit_ever_enabled |= !!audit_default; 1015 audit_ever_enabled |= !!audit_default;
1016 } else if (audit_initialized == AUDIT_UNINITIALIZED) {
1017 printk(" (after initialization)");
1018 } else {
1019 printk(" (until reboot)");
1008 } 1020 }
1021 printk("\n");
1022
1009 return 1; 1023 return 1;
1010} 1024}
1011 1025
@@ -1107,9 +1121,7 @@ unsigned int audit_serial(void)
1107static inline void audit_get_stamp(struct audit_context *ctx, 1121static inline void audit_get_stamp(struct audit_context *ctx,
1108 struct timespec *t, unsigned int *serial) 1122 struct timespec *t, unsigned int *serial)
1109{ 1123{
1110 if (ctx) 1124 if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
1111 auditsc_get_stamp(ctx, t, serial);
1112 else {
1113 *t = CURRENT_TIME; 1125 *t = CURRENT_TIME;
1114 *serial = audit_serial(); 1126 *serial = audit_serial();
1115 } 1127 }
@@ -1146,7 +1158,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1146 int reserve; 1158 int reserve;
1147 unsigned long timeout_start = jiffies; 1159 unsigned long timeout_start = jiffies;
1148 1160
1149 if (!audit_initialized) 1161 if (audit_initialized != AUDIT_INITIALIZED)
1150 return NULL; 1162 return NULL;
1151 1163
1152 if (unlikely(audit_filter_type(type))) 1164 if (unlikely(audit_filter_type(type)))
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ba0e0d934f2..8b509441f49a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,6 +24,7 @@ struct audit_chunk {
24 struct list_head trees; /* with root here */ 24 struct list_head trees; /* with root here */
25 int dead; 25 int dead;
26 int count; 26 int count;
27 atomic_long_t refs;
27 struct rcu_head head; 28 struct rcu_head head;
28 struct node { 29 struct node {
29 struct list_head list; 30 struct list_head list;
@@ -56,7 +57,8 @@ static LIST_HEAD(prune_list);
56 * tree is refcounted; one reference for "some rules on rules_list refer to 57 * tree is refcounted; one reference for "some rules on rules_list refer to
57 * it", one for each chunk with pointer to it. 58 * it", one for each chunk with pointer to it.
58 * 59 *
59 * chunk is refcounted by embedded inotify_watch. 60 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
61 * of watch contributes 1 to .refs).
60 * 62 *
61 * node.index allows to get from node.list to containing chunk. 63 * node.index allows to get from node.list to containing chunk.
62 * MSB of that sucker is stolen to mark taggings that we might have to 64 * MSB of that sucker is stolen to mark taggings that we might have to
@@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count)
121 INIT_LIST_HEAD(&chunk->hash); 123 INIT_LIST_HEAD(&chunk->hash);
122 INIT_LIST_HEAD(&chunk->trees); 124 INIT_LIST_HEAD(&chunk->trees);
123 chunk->count = count; 125 chunk->count = count;
126 atomic_long_set(&chunk->refs, 1);
124 for (i = 0; i < count; i++) { 127 for (i = 0; i < count; i++) {
125 INIT_LIST_HEAD(&chunk->owners[i].list); 128 INIT_LIST_HEAD(&chunk->owners[i].list);
126 chunk->owners[i].index = i; 129 chunk->owners[i].index = i;
@@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count)
129 return chunk; 132 return chunk;
130} 133}
131 134
132static void __free_chunk(struct rcu_head *rcu) 135static void free_chunk(struct audit_chunk *chunk)
133{ 136{
134 struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
135 int i; 137 int i;
136 138
137 for (i = 0; i < chunk->count; i++) { 139 for (i = 0; i < chunk->count; i++) {
@@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu)
141 kfree(chunk); 143 kfree(chunk);
142} 144}
143 145
144static inline void free_chunk(struct audit_chunk *chunk) 146void audit_put_chunk(struct audit_chunk *chunk)
145{ 147{
146 call_rcu(&chunk->head, __free_chunk); 148 if (atomic_long_dec_and_test(&chunk->refs))
149 free_chunk(chunk);
147} 150}
148 151
149void audit_put_chunk(struct audit_chunk *chunk) 152static void __put_chunk(struct rcu_head *rcu)
150{ 153{
151 put_inotify_watch(&chunk->watch); 154 struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
155 audit_put_chunk(chunk);
152} 156}
153 157
154enum {HASH_SIZE = 128}; 158enum {HASH_SIZE = 128};
@@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
176 180
177 list_for_each_entry_rcu(p, list, hash) { 181 list_for_each_entry_rcu(p, list, hash) {
178 if (p->watch.inode == inode) { 182 if (p->watch.inode == inode) {
179 get_inotify_watch(&p->watch); 183 atomic_long_inc(&p->refs);
180 return p; 184 return p;
181 } 185 }
182 } 186 }
@@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
194 198
195/* tagging and untagging inodes with trees */ 199/* tagging and untagging inodes with trees */
196 200
197static void untag_chunk(struct audit_chunk *chunk, struct node *p) 201static struct audit_chunk *find_chunk(struct node *p)
202{
203 int index = p->index & ~(1U<<31);
204 p -= index;
205 return container_of(p, struct audit_chunk, owners[0]);
206}
207
208static void untag_chunk(struct node *p)
198{ 209{
210 struct audit_chunk *chunk = find_chunk(p);
199 struct audit_chunk *new; 211 struct audit_chunk *new;
200 struct audit_tree *owner; 212 struct audit_tree *owner;
201 int size = chunk->count - 1; 213 int size = chunk->count - 1;
202 int i, j; 214 int i, j;
203 215
216 if (!pin_inotify_watch(&chunk->watch)) {
217 /*
218 * Filesystem is shutting down; all watches are getting
219 * evicted, just take it off the node list for this
220 * tree and let the eviction logics take care of the
221 * rest.
222 */
223 owner = p->owner;
224 if (owner->root == chunk) {
225 list_del_init(&owner->same_root);
226 owner->root = NULL;
227 }
228 list_del_init(&p->list);
229 p->owner = NULL;
230 put_tree(owner);
231 return;
232 }
233
234 spin_unlock(&hash_lock);
235
236 /*
237 * pin_inotify_watch() succeeded, so the watch won't go away
238 * from under us.
239 */
204 mutex_lock(&chunk->watch.inode->inotify_mutex); 240 mutex_lock(&chunk->watch.inode->inotify_mutex);
205 if (chunk->dead) { 241 if (chunk->dead) {
206 mutex_unlock(&chunk->watch.inode->inotify_mutex); 242 mutex_unlock(&chunk->watch.inode->inotify_mutex);
207 return; 243 goto out;
208 } 244 }
209 245
210 owner = p->owner; 246 owner = p->owner;
@@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
221 inotify_evict_watch(&chunk->watch); 257 inotify_evict_watch(&chunk->watch);
222 mutex_unlock(&chunk->watch.inode->inotify_mutex); 258 mutex_unlock(&chunk->watch.inode->inotify_mutex);
223 put_inotify_watch(&chunk->watch); 259 put_inotify_watch(&chunk->watch);
224 return; 260 goto out;
225 } 261 }
226 262
227 new = alloc_chunk(size); 263 new = alloc_chunk(size);
@@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
263 inotify_evict_watch(&chunk->watch); 299 inotify_evict_watch(&chunk->watch);
264 mutex_unlock(&chunk->watch.inode->inotify_mutex); 300 mutex_unlock(&chunk->watch.inode->inotify_mutex);
265 put_inotify_watch(&chunk->watch); 301 put_inotify_watch(&chunk->watch);
266 return; 302 goto out;
267 303
268Fallback: 304Fallback:
269 // do the best we can 305 // do the best we can
@@ -277,6 +313,9 @@ Fallback:
277 put_tree(owner); 313 put_tree(owner);
278 spin_unlock(&hash_lock); 314 spin_unlock(&hash_lock);
279 mutex_unlock(&chunk->watch.inode->inotify_mutex); 315 mutex_unlock(&chunk->watch.inode->inotify_mutex);
316out:
317 unpin_inotify_watch(&chunk->watch);
318 spin_lock(&hash_lock);
280} 319}
281 320
282static int create_chunk(struct inode *inode, struct audit_tree *tree) 321static int create_chunk(struct inode *inode, struct audit_tree *tree)
@@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
387 return 0; 426 return 0;
388} 427}
389 428
390static struct audit_chunk *find_chunk(struct node *p)
391{
392 int index = p->index & ~(1U<<31);
393 p -= index;
394 return container_of(p, struct audit_chunk, owners[0]);
395}
396
397static void kill_rules(struct audit_tree *tree) 429static void kill_rules(struct audit_tree *tree)
398{ 430{
399 struct audit_krule *rule, *next; 431 struct audit_krule *rule, *next;
@@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim)
431 spin_lock(&hash_lock); 463 spin_lock(&hash_lock);
432 while (!list_empty(&victim->chunks)) { 464 while (!list_empty(&victim->chunks)) {
433 struct node *p; 465 struct node *p;
434 struct audit_chunk *chunk;
435 466
436 p = list_entry(victim->chunks.next, struct node, list); 467 p = list_entry(victim->chunks.next, struct node, list);
437 chunk = find_chunk(p);
438 get_inotify_watch(&chunk->watch);
439 spin_unlock(&hash_lock);
440
441 untag_chunk(chunk, p);
442 468
443 put_inotify_watch(&chunk->watch); 469 untag_chunk(p);
444 spin_lock(&hash_lock);
445 } 470 }
446 spin_unlock(&hash_lock); 471 spin_unlock(&hash_lock);
447 put_tree(victim); 472 put_tree(victim);
@@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree)
469 494
470 while (!list_empty(&tree->chunks)) { 495 while (!list_empty(&tree->chunks)) {
471 struct node *node; 496 struct node *node;
472 struct audit_chunk *chunk;
473 497
474 node = list_entry(tree->chunks.next, struct node, list); 498 node = list_entry(tree->chunks.next, struct node, list);
475 499
@@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree)
477 if (!(node->index & (1U<<31))) 501 if (!(node->index & (1U<<31)))
478 break; 502 break;
479 503
480 chunk = find_chunk(node); 504 untag_chunk(node);
481 get_inotify_watch(&chunk->watch);
482 spin_unlock(&hash_lock);
483
484 untag_chunk(chunk, node);
485
486 put_inotify_watch(&chunk->watch);
487 spin_lock(&hash_lock);
488 } 505 }
489 if (!tree->root && !tree->goner) { 506 if (!tree->root && !tree->goner) {
490 tree->goner = 1; 507 tree->goner = 1;
@@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
878static void destroy_watch(struct inotify_watch *watch) 895static void destroy_watch(struct inotify_watch *watch)
879{ 896{
880 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 897 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
881 free_chunk(chunk); 898 call_rcu(&chunk->head, __put_chunk);
882} 899}
883 900
884static const struct inotify_operations rtree_inotify_ops = { 901static const struct inotify_operations rtree_inotify_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b7d354e2b0ef..9fd85a4640a0 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list)
1094 list_for_each_entry_safe(p, n, in_list, ilist) { 1094 list_for_each_entry_safe(p, n, in_list, ilist) {
1095 list_del(&p->ilist); 1095 list_del(&p->ilist);
1096 inotify_rm_watch(audit_ih, &p->wdata); 1096 inotify_rm_watch(audit_ih, &p->wdata);
1097 /* the put matching the get in audit_do_del_rule() */ 1097 /* the unpin matching the pin in audit_do_del_rule() */
1098 put_inotify_watch(&p->wdata); 1098 unpin_inotify_watch(&p->wdata);
1099 } 1099 }
1100} 1100}
1101 1101
@@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry,
1389 /* Put parent on the inotify un-registration 1389 /* Put parent on the inotify un-registration
1390 * list. Grab a reference before releasing 1390 * list. Grab a reference before releasing
1391 * audit_filter_mutex, to be released in 1391 * audit_filter_mutex, to be released in
1392 * audit_inotify_unregister(). */ 1392 * audit_inotify_unregister().
1393 list_add(&parent->ilist, &inotify_list); 1393 * If filesystem is going away, just leave
1394 get_inotify_watch(&parent->wdata); 1394 * the sucker alone, eviction will take
1395 * care of it.
1396 */
1397 if (pin_inotify_watch(&parent->wdata))
1398 list_add(&parent->ilist, &inotify_list);
1395 } 1399 }
1396 } 1400 }
1397 } 1401 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cf5bc2f5f9c3..2a3f0afc4d2a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1459,7 +1459,6 @@ void audit_free(struct task_struct *tsk)
1459 1459
1460/** 1460/**
1461 * audit_syscall_entry - fill in an audit record at syscall entry 1461 * audit_syscall_entry - fill in an audit record at syscall entry
1462 * @tsk: task being audited
1463 * @arch: architecture type 1462 * @arch: architecture type
1464 * @major: major syscall type (function) 1463 * @major: major syscall type (function)
1465 * @a1: additional syscall register 1 1464 * @a1: additional syscall register 1
@@ -1548,9 +1547,25 @@ void audit_syscall_entry(int arch, int major,
1548 context->ppid = 0; 1547 context->ppid = 0;
1549} 1548}
1550 1549
1550void audit_finish_fork(struct task_struct *child)
1551{
1552 struct audit_context *ctx = current->audit_context;
1553 struct audit_context *p = child->audit_context;
1554 if (!p || !ctx || !ctx->auditable)
1555 return;
1556 p->arch = ctx->arch;
1557 p->major = ctx->major;
1558 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1559 p->ctime = ctx->ctime;
1560 p->dummy = ctx->dummy;
1561 p->auditable = ctx->auditable;
1562 p->in_syscall = ctx->in_syscall;
1563 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1564 p->ppid = current->pid;
1565}
1566
1551/** 1567/**
1552 * audit_syscall_exit - deallocate audit context after a system call 1568 * audit_syscall_exit - deallocate audit context after a system call
1553 * @tsk: task being audited
1554 * @valid: success/failure flag 1569 * @valid: success/failure flag
1555 * @return_code: syscall return value 1570 * @return_code: syscall return value
1556 * 1571 *
@@ -1942,15 +1957,18 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
1942 * 1957 *
1943 * Also sets the context as auditable. 1958 * Also sets the context as auditable.
1944 */ 1959 */
1945void auditsc_get_stamp(struct audit_context *ctx, 1960int auditsc_get_stamp(struct audit_context *ctx,
1946 struct timespec *t, unsigned int *serial) 1961 struct timespec *t, unsigned int *serial)
1947{ 1962{
1963 if (!ctx->in_syscall)
1964 return 0;
1948 if (!ctx->serial) 1965 if (!ctx->serial)
1949 ctx->serial = audit_serial(); 1966 ctx->serial = audit_serial();
1950 t->tv_sec = ctx->ctime.tv_sec; 1967 t->tv_sec = ctx->ctime.tv_sec;
1951 t->tv_nsec = ctx->ctime.tv_nsec; 1968 t->tv_nsec = ctx->ctime.tv_nsec;
1952 *serial = ctx->serial; 1969 *serial = ctx->serial;
1953 ctx->auditable = 1; 1970 ctx->auditable = 1;
1971 return 1;
1954} 1972}
1955 1973
1956/* global counter which is incremented every time something logs in */ 1974/* global counter which is incremented every time something logs in */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 358e77564e6f..fe00b3b983a8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2039,10 +2039,13 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2039 struct cgroup *cgrp; 2039 struct cgroup *cgrp;
2040 struct cgroup_iter it; 2040 struct cgroup_iter it;
2041 struct task_struct *tsk; 2041 struct task_struct *tsk;
2042
2042 /* 2043 /*
2043 * Validate dentry by checking the superblock operations 2044 * Validate dentry by checking the superblock operations,
2045 * and make sure it's a directory.
2044 */ 2046 */
2045 if (dentry->d_sb->s_op != &cgroup_ops) 2047 if (dentry->d_sb->s_op != &cgroup_ops ||
2048 !S_ISDIR(dentry->d_inode->i_mode))
2046 goto err; 2049 goto err;
2047 2050
2048 ret = 0; 2051 ret = 0;
@@ -2472,10 +2475,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2472 mutex_unlock(&cgroup_mutex); 2475 mutex_unlock(&cgroup_mutex);
2473 return -EBUSY; 2476 return -EBUSY;
2474 } 2477 }
2475 2478 mutex_unlock(&cgroup_mutex);
2476 parent = cgrp->parent;
2477 root = cgrp->root;
2478 sb = root->sb;
2479 2479
2480 /* 2480 /*
2481 * Call pre_destroy handlers of subsys. Notify subsystems 2481 * Call pre_destroy handlers of subsys. Notify subsystems
@@ -2483,7 +2483,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2483 */ 2483 */
2484 cgroup_call_pre_destroy(cgrp); 2484 cgroup_call_pre_destroy(cgrp);
2485 2485
2486 if (cgroup_has_css_refs(cgrp)) { 2486 mutex_lock(&cgroup_mutex);
2487 parent = cgrp->parent;
2488 root = cgrp->root;
2489 sb = root->sb;
2490
2491 if (atomic_read(&cgrp->count)
2492 || !list_empty(&cgrp->children)
2493 || cgroup_has_css_refs(cgrp)) {
2487 mutex_unlock(&cgroup_mutex); 2494 mutex_unlock(&cgroup_mutex);
2488 return -EBUSY; 2495 return -EBUSY;
2489 } 2496 }
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 7fa476f01d05..fb249e2bcada 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -184,9 +184,20 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
184{ 184{
185 struct freezer *freezer; 185 struct freezer *freezer;
186 186
187 task_lock(task); 187 /*
188 * No lock is needed, since the task isn't on tasklist yet,
189 * so it can't be moved to another cgroup, which means the
190 * freezer won't be removed and will be valid during this
191 * function call.
192 */
188 freezer = task_freezer(task); 193 freezer = task_freezer(task);
189 task_unlock(task); 194
195 /*
196 * The root cgroup is non-freezable, so we can skip the
197 * following check.
198 */
199 if (!freezer->css.cgroup->parent)
200 return;
190 201
191 spin_lock_irq(&freezer->lock); 202 spin_lock_irq(&freezer->lock);
192 BUG_ON(freezer->state == CGROUP_FROZEN); 203 BUG_ON(freezer->state == CGROUP_FROZEN);
@@ -331,7 +342,7 @@ static int freezer_write(struct cgroup *cgroup,
331 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) 342 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
332 goal_state = CGROUP_FROZEN; 343 goal_state = CGROUP_FROZEN;
333 else 344 else
334 return -EIO; 345 return -EINVAL;
335 346
336 if (!cgroup_lock_live_group(cgroup)) 347 if (!cgroup_lock_live_group(cgroup))
337 return -ENODEV; 348 return -ENODEV;
@@ -350,6 +361,8 @@ static struct cftype files[] = {
350 361
351static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) 362static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
352{ 363{
364 if (!cgroup->parent)
365 return 0;
353 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); 366 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
354} 367}
355 368
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5a732c5ef08b..8ea32e8d68b0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -462,7 +462,7 @@ out:
462 * It must be called by the arch code on the new cpu, before the new cpu 462 * It must be called by the arch code on the new cpu, before the new cpu
463 * enables interrupts and before the "boot" cpu returns from __cpu_up(). 463 * enables interrupts and before the "boot" cpu returns from __cpu_up().
464 */ 464 */
465void notify_cpu_starting(unsigned int cpu) 465void __cpuinit notify_cpu_starting(unsigned int cpu)
466{ 466{
467 unsigned long val = CPU_STARTING; 467 unsigned long val = CPU_STARTING;
468 468
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e00526f52ec..96c0ba13b8cd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -36,6 +36,7 @@
36#include <linux/list.h> 36#include <linux/list.h>
37#include <linux/mempolicy.h> 37#include <linux/mempolicy.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/memory.h>
39#include <linux/module.h> 40#include <linux/module.h>
40#include <linux/mount.h> 41#include <linux/mount.h>
41#include <linux/namei.h> 42#include <linux/namei.h>
@@ -584,10 +585,9 @@ static int generate_sched_domains(cpumask_t **domains,
584 int i, j, k; /* indices for partition finding loops */ 585 int i, j, k; /* indices for partition finding loops */
585 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 586 cpumask_t *doms; /* resulting partition; i.e. sched domains */
586 struct sched_domain_attr *dattr; /* attributes for custom domains */ 587 struct sched_domain_attr *dattr; /* attributes for custom domains */
587 int ndoms; /* number of sched domains in result */ 588 int ndoms = 0; /* number of sched domains in result */
588 int nslot; /* next empty doms[] cpumask_t slot */ 589 int nslot; /* next empty doms[] cpumask_t slot */
589 590
590 ndoms = 0;
591 doms = NULL; 591 doms = NULL;
592 dattr = NULL; 592 dattr = NULL;
593 csa = NULL; 593 csa = NULL;
@@ -674,10 +674,8 @@ restart:
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */ 675 */
676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
677 if (!doms) { 677 if (!doms)
678 ndoms = 0;
679 goto done; 678 goto done;
680 }
681 679
682 /* 680 /*
683 * The rest of the code, including the scheduler, can deal with 681 * The rest of the code, including the scheduler, can deal with
@@ -732,6 +730,13 @@ restart:
732done: 730done:
733 kfree(csa); 731 kfree(csa);
734 732
733 /*
734 * Fallback to the default domain if kmalloc() failed.
735 * See comments in partition_sched_domains().
736 */
737 if (doms == NULL)
738 ndoms = 1;
739
735 *domains = doms; 740 *domains = doms;
736 *attributes = dattr; 741 *attributes = dattr;
737 return ndoms; 742 return ndoms;
@@ -2011,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2011 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2016 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
2012 * See also the previous routine cpuset_track_online_cpus(). 2017 * See also the previous routine cpuset_track_online_cpus().
2013 */ 2018 */
2014void cpuset_track_online_nodes(void) 2019static int cpuset_track_online_nodes(struct notifier_block *self,
2020 unsigned long action, void *arg)
2015{ 2021{
2016 cgroup_lock(); 2022 cgroup_lock();
2017 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2023 switch (action) {
2018 scan_for_empty_cpusets(&top_cpuset); 2024 case MEM_ONLINE:
2025 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2026 break;
2027 case MEM_OFFLINE:
2028 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2029 scan_for_empty_cpusets(&top_cpuset);
2030 break;
2031 default:
2032 break;
2033 }
2019 cgroup_unlock(); 2034 cgroup_unlock();
2035 return NOTIFY_OK;
2020} 2036}
2021#endif 2037#endif
2022 2038
@@ -2032,6 +2048,7 @@ void __init cpuset_init_smp(void)
2032 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2048 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2033 2049
2034 hotcpu_notifier(cpuset_track_online_cpus, 0); 2050 hotcpu_notifier(cpuset_track_online_cpus, 0);
2051 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2035} 2052}
2036 2053
2037/** 2054/**
diff --git a/kernel/exit.c b/kernel/exit.c
index 80137a5d9467..61ba5b4b10cf 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -40,7 +40,6 @@
40#include <linux/cn_proc.h> 40#include <linux/cn_proc.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/futex.h> 42#include <linux/futex.h>
43#include <linux/compat.h>
44#include <linux/pipe_fs_i.h> 43#include <linux/pipe_fs_i.h>
45#include <linux/audit.h> /* for audit_free() */ 44#include <linux/audit.h> /* for audit_free() */
46#include <linux/resource.h> 45#include <linux/resource.h>
@@ -54,6 +53,10 @@
54#include <asm/pgtable.h> 53#include <asm/pgtable.h>
55#include <asm/mmu_context.h> 54#include <asm/mmu_context.h>
56 55
56DEFINE_TRACE(sched_process_free);
57DEFINE_TRACE(sched_process_exit);
58DEFINE_TRACE(sched_process_wait);
59
57static void exit_mm(struct task_struct * tsk); 60static void exit_mm(struct task_struct * tsk);
58 61
59static inline int task_detached(struct task_struct *p) 62static inline int task_detached(struct task_struct *p)
@@ -141,6 +144,11 @@ static void __exit_signal(struct task_struct *tsk)
141 if (sig) { 144 if (sig) {
142 flush_sigqueue(&sig->shared_pending); 145 flush_sigqueue(&sig->shared_pending);
143 taskstats_tgid_free(sig); 146 taskstats_tgid_free(sig);
147 /*
148 * Make sure ->signal can't go away under rq->lock,
149 * see account_group_exec_runtime().
150 */
151 task_rq_unlock_wait(tsk);
144 __cleanup_signal(sig); 152 __cleanup_signal(sig);
145 } 153 }
146} 154}
@@ -1054,14 +1062,6 @@ NORET_TYPE void do_exit(long code)
1054 exit_itimers(tsk->signal); 1062 exit_itimers(tsk->signal);
1055 } 1063 }
1056 acct_collect(code, group_dead); 1064 acct_collect(code, group_dead);
1057#ifdef CONFIG_FUTEX
1058 if (unlikely(tsk->robust_list))
1059 exit_robust_list(tsk);
1060#ifdef CONFIG_COMPAT
1061 if (unlikely(tsk->compat_robust_list))
1062 compat_exit_robust_list(tsk);
1063#endif
1064#endif
1065 if (group_dead) 1065 if (group_dead)
1066 tty_audit_exit(); 1066 tty_audit_exit();
1067 if (unlikely(tsk->audit_context)) 1067 if (unlikely(tsk->audit_context))
@@ -1127,7 +1127,6 @@ NORET_TYPE void do_exit(long code)
1127 preempt_disable(); 1127 preempt_disable();
1128 /* causes final put_task_struct in finish_task_switch(). */ 1128 /* causes final put_task_struct in finish_task_switch(). */
1129 tsk->state = TASK_DEAD; 1129 tsk->state = TASK_DEAD;
1130
1131 schedule(); 1130 schedule();
1132 BUG(); 1131 BUG();
1133 /* Avoid "noreturn function does return". */ 1132 /* Avoid "noreturn function does return". */
@@ -1325,10 +1324,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
1325 * group, which consolidates times for all threads in the 1324 * group, which consolidates times for all threads in the
1326 * group including the group leader. 1325 * group including the group leader.
1327 */ 1326 */
1327 thread_group_cputime(p, &cputime);
1328 spin_lock_irq(&p->parent->sighand->siglock); 1328 spin_lock_irq(&p->parent->sighand->siglock);
1329 psig = p->parent->signal; 1329 psig = p->parent->signal;
1330 sig = p->signal; 1330 sig = p->signal;
1331 thread_group_cputime(p, &cputime);
1332 psig->cutime = 1331 psig->cutime =
1333 cputime_add(psig->cutime, 1332 cputime_add(psig->cutime,
1334 cputime_add(cputime.utime, 1333 cputime_add(cputime.utime,
diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e17023..e136ed8d82ba 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -17,6 +17,7 @@
17*/ 17*/
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <linux/ftrace.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/sections.h> 22#include <asm/sections.h>
22 23
@@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
40 return e; 41 return e;
41} 42}
42 43
43int core_kernel_text(unsigned long addr) 44__notrace_funcgraph int core_kernel_text(unsigned long addr)
44{ 45{
45 if (addr >= (unsigned long)_stext && 46 if (addr >= (unsigned long)_stext &&
46 addr <= (unsigned long)_etext) 47 addr <= (unsigned long)_etext)
@@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr)
53 return 0; 54 return 0;
54} 55}
55 56
56int __kernel_text_address(unsigned long addr) 57__notrace_funcgraph int __kernel_text_address(unsigned long addr)
57{ 58{
58 if (core_kernel_text(addr)) 59 if (core_kernel_text(addr))
59 return 1; 60 return 1;
@@ -66,3 +67,19 @@ int kernel_text_address(unsigned long addr)
66 return 1; 67 return 1;
67 return module_text_address(addr) != NULL; 68 return module_text_address(addr) != NULL;
68} 69}
70
71/*
72 * On some architectures (PPC64, IA64) function pointers
73 * are actually only tokens to some data that then holds the
74 * real function address. As a result, to find if a function
75 * pointer is part of the kernel text, we need to do some
76 * special dereferencing first.
77 */
78int func_ptr_is_kernel_text(void *ptr)
79{
80 unsigned long addr;
81 addr = (unsigned long) dereference_function_descriptor(ptr);
82 if (core_kernel_text(addr))
83 return 1;
84 return module_text_address(addr) != NULL;
85}
diff --git a/kernel/fork.c b/kernel/fork.c
index f6083561dfe0..7b93da72d4a2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,12 +40,14 @@
40#include <linux/jiffies.h> 40#include <linux/jiffies.h>
41#include <linux/tracehook.h> 41#include <linux/tracehook.h>
42#include <linux/futex.h> 42#include <linux/futex.h>
43#include <linux/compat.h>
43#include <linux/task_io_accounting_ops.h> 44#include <linux/task_io_accounting_ops.h>
44#include <linux/rcupdate.h> 45#include <linux/rcupdate.h>
45#include <linux/ptrace.h> 46#include <linux/ptrace.h>
46#include <linux/mount.h> 47#include <linux/mount.h>
47#include <linux/audit.h> 48#include <linux/audit.h>
48#include <linux/memcontrol.h> 49#include <linux/memcontrol.h>
50#include <linux/ftrace.h>
49#include <linux/profile.h> 51#include <linux/profile.h>
50#include <linux/rmap.h> 52#include <linux/rmap.h>
51#include <linux/acct.h> 53#include <linux/acct.h>
@@ -79,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
79 81
80__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 82__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
81 83
84DEFINE_TRACE(sched_process_fork);
85
82int nr_processes(void) 86int nr_processes(void)
83{ 87{
84 int cpu; 88 int cpu;
@@ -136,6 +140,7 @@ void free_task(struct task_struct *tsk)
136 prop_local_destroy_single(&tsk->dirties); 140 prop_local_destroy_single(&tsk->dirties);
137 free_thread_info(tsk->stack); 141 free_thread_info(tsk->stack);
138 rt_mutex_debug_task_free(tsk); 142 rt_mutex_debug_task_free(tsk);
143 ftrace_graph_exit_task(tsk);
139 free_task_struct(tsk); 144 free_task_struct(tsk);
140} 145}
141EXPORT_SYMBOL(free_task); 146EXPORT_SYMBOL(free_task);
@@ -314,17 +319,20 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
314 file = tmp->vm_file; 319 file = tmp->vm_file;
315 if (file) { 320 if (file) {
316 struct inode *inode = file->f_path.dentry->d_inode; 321 struct inode *inode = file->f_path.dentry->d_inode;
322 struct address_space *mapping = file->f_mapping;
323
317 get_file(file); 324 get_file(file);
318 if (tmp->vm_flags & VM_DENYWRITE) 325 if (tmp->vm_flags & VM_DENYWRITE)
319 atomic_dec(&inode->i_writecount); 326 atomic_dec(&inode->i_writecount);
320 327 spin_lock(&mapping->i_mmap_lock);
321 /* insert tmp into the share list, just after mpnt */ 328 if (tmp->vm_flags & VM_SHARED)
322 spin_lock(&file->f_mapping->i_mmap_lock); 329 mapping->i_mmap_writable++;
323 tmp->vm_truncate_count = mpnt->vm_truncate_count; 330 tmp->vm_truncate_count = mpnt->vm_truncate_count;
324 flush_dcache_mmap_lock(file->f_mapping); 331 flush_dcache_mmap_lock(mapping);
332 /* insert tmp into the share list, just after mpnt */
325 vma_prio_tree_add(tmp, mpnt); 333 vma_prio_tree_add(tmp, mpnt);
326 flush_dcache_mmap_unlock(file->f_mapping); 334 flush_dcache_mmap_unlock(mapping);
327 spin_unlock(&file->f_mapping->i_mmap_lock); 335 spin_unlock(&mapping->i_mmap_lock);
328 } 336 }
329 337
330 /* 338 /*
@@ -519,6 +527,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
519{ 527{
520 struct completion *vfork_done = tsk->vfork_done; 528 struct completion *vfork_done = tsk->vfork_done;
521 529
530 /* Get rid of any futexes when releasing the mm */
531#ifdef CONFIG_FUTEX
532 if (unlikely(tsk->robust_list))
533 exit_robust_list(tsk);
534#ifdef CONFIG_COMPAT
535 if (unlikely(tsk->compat_robust_list))
536 compat_exit_robust_list(tsk);
537#endif
538#endif
539
522 /* Get rid of any cached register state */ 540 /* Get rid of any cached register state */
523 deactivate_mm(tsk, mm); 541 deactivate_mm(tsk, mm);
524 542
@@ -1122,6 +1140,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1122 } 1140 }
1123 } 1141 }
1124 1142
1143 ftrace_graph_init_task(p);
1144
1125 p->pid = pid_nr(pid); 1145 p->pid = pid_nr(pid);
1126 p->tgid = p->pid; 1146 p->tgid = p->pid;
1127 if (clone_flags & CLONE_THREAD) 1147 if (clone_flags & CLONE_THREAD)
@@ -1130,7 +1150,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1130 if (current->nsproxy != p->nsproxy) { 1150 if (current->nsproxy != p->nsproxy) {
1131 retval = ns_cgroup_clone(p, pid); 1151 retval = ns_cgroup_clone(p, pid);
1132 if (retval) 1152 if (retval)
1133 goto bad_fork_free_pid; 1153 goto bad_fork_free_graph;
1134 } 1154 }
1135 1155
1136 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1156 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1223,7 +1243,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1223 spin_unlock(&current->sighand->siglock); 1243 spin_unlock(&current->sighand->siglock);
1224 write_unlock_irq(&tasklist_lock); 1244 write_unlock_irq(&tasklist_lock);
1225 retval = -ERESTARTNOINTR; 1245 retval = -ERESTARTNOINTR;
1226 goto bad_fork_free_pid; 1246 goto bad_fork_free_graph;
1227 } 1247 }
1228 1248
1229 if (clone_flags & CLONE_THREAD) { 1249 if (clone_flags & CLONE_THREAD) {
@@ -1260,6 +1280,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1260 cgroup_post_fork(p); 1280 cgroup_post_fork(p);
1261 return p; 1281 return p;
1262 1282
1283bad_fork_free_graph:
1284 ftrace_graph_exit_task(p);
1263bad_fork_free_pid: 1285bad_fork_free_pid:
1264 if (pid != &init_struct_pid) 1286 if (pid != &init_struct_pid)
1265 free_pid(pid); 1287 free_pid(pid);
@@ -1387,6 +1409,7 @@ long do_fork(unsigned long clone_flags,
1387 init_completion(&vfork); 1409 init_completion(&vfork);
1388 } 1410 }
1389 1411
1412 audit_finish_fork(p);
1390 tracehook_report_clone(trace, regs, clone_flags, nr, p); 1413 tracehook_report_clone(trace, regs, clone_flags, nr, p);
1391 1414
1392 /* 1415 /*
diff --git a/kernel/futex.c b/kernel/futex.c
index 8af10027514b..e10c5c8786a6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -123,24 +123,6 @@ struct futex_hash_bucket {
123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
124 124
125/* 125/*
126 * Take mm->mmap_sem, when futex is shared
127 */
128static inline void futex_lock_mm(struct rw_semaphore *fshared)
129{
130 if (fshared)
131 down_read(fshared);
132}
133
134/*
135 * Release mm->mmap_sem, when the futex is shared
136 */
137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
138{
139 if (fshared)
140 up_read(fshared);
141}
142
143/*
144 * We hash on the keys returned from get_futex_key (see below). 126 * We hash on the keys returned from get_futex_key (see below).
145 */ 127 */
146static struct futex_hash_bucket *hash_futex(union futex_key *key) 128static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -161,6 +143,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
161 && key1->both.offset == key2->both.offset); 143 && key1->both.offset == key2->both.offset);
162} 144}
163 145
146/*
147 * Take a reference to the resource addressed by a key.
148 * Can be called while holding spinlocks.
149 *
150 */
151static void get_futex_key_refs(union futex_key *key)
152{
153 if (!key->both.ptr)
154 return;
155
156 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
157 case FUT_OFF_INODE:
158 atomic_inc(&key->shared.inode->i_count);
159 break;
160 case FUT_OFF_MMSHARED:
161 atomic_inc(&key->private.mm->mm_count);
162 break;
163 }
164}
165
166/*
167 * Drop a reference to the resource addressed by a key.
168 * The hash bucket spinlock must not be held.
169 */
170static void drop_futex_key_refs(union futex_key *key)
171{
172 if (!key->both.ptr)
173 return;
174
175 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
176 case FUT_OFF_INODE:
177 iput(key->shared.inode);
178 break;
179 case FUT_OFF_MMSHARED:
180 mmdrop(key->private.mm);
181 break;
182 }
183}
184
164/** 185/**
165 * get_futex_key - Get parameters which are the keys for a futex. 186 * get_futex_key - Get parameters which are the keys for a futex.
166 * @uaddr: virtual address of the futex 187 * @uaddr: virtual address of the futex
@@ -179,12 +200,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
179 * For other futexes, it points to &current->mm->mmap_sem and 200 * For other futexes, it points to &current->mm->mmap_sem and
180 * caller must have taken the reader lock. but NOT any spinlocks. 201 * caller must have taken the reader lock. but NOT any spinlocks.
181 */ 202 */
182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 203static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
183 union futex_key *key)
184{ 204{
185 unsigned long address = (unsigned long)uaddr; 205 unsigned long address = (unsigned long)uaddr;
186 struct mm_struct *mm = current->mm; 206 struct mm_struct *mm = current->mm;
187 struct vm_area_struct *vma;
188 struct page *page; 207 struct page *page;
189 int err; 208 int err;
190 209
@@ -208,100 +227,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
208 return -EFAULT; 227 return -EFAULT;
209 key->private.mm = mm; 228 key->private.mm = mm;
210 key->private.address = address; 229 key->private.address = address;
230 get_futex_key_refs(key);
211 return 0; 231 return 0;
212 } 232 }
213 /*
214 * The futex is hashed differently depending on whether
215 * it's in a shared or private mapping. So check vma first.
216 */
217 vma = find_extend_vma(mm, address);
218 if (unlikely(!vma))
219 return -EFAULT;
220 233
221 /* 234again:
222 * Permissions. 235 err = get_user_pages_fast(address, 1, 0, &page);
223 */ 236 if (err < 0)
224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 237 return err;
225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 238
239 lock_page(page);
240 if (!page->mapping) {
241 unlock_page(page);
242 put_page(page);
243 goto again;
244 }
226 245
227 /* 246 /*
228 * Private mappings are handled in a simple way. 247 * Private mappings are handled in a simple way.
229 * 248 *
230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 249 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
231 * it's a read-only handle, it's expected that futexes attach to 250 * it's a read-only handle, it's expected that futexes attach to
232 * the object not the particular process. Therefore we use 251 * the object not the particular process.
233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
234 * mappings of _writable_ handles.
235 */ 252 */
236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 253 if (PageAnon(page)) {
237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 254 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
238 key->private.mm = mm; 255 key->private.mm = mm;
239 key->private.address = address; 256 key->private.address = address;
240 return 0; 257 } else {
258 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
259 key->shared.inode = page->mapping->host;
260 key->shared.pgoff = page->index;
241 } 261 }
242 262
243 /* 263 get_futex_key_refs(key);
244 * Linear file mappings are also simple.
245 */
246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
250 + vma->vm_pgoff);
251 return 0;
252 }
253 264
254 /* 265 unlock_page(page);
255 * We could walk the page table to read the non-linear 266 put_page(page);
256 * pte, and get the page index without fetching the page 267 return 0;
257 * from swap. But that's a lot of code to duplicate here
258 * for a rare case, so we simply fetch the page.
259 */
260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
261 if (err >= 0) {
262 key->shared.pgoff =
263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
264 put_page(page);
265 return 0;
266 }
267 return err;
268}
269
270/*
271 * Take a reference to the resource addressed by a key.
272 * Can be called while holding spinlocks.
273 *
274 */
275static void get_futex_key_refs(union futex_key *key)
276{
277 if (key->both.ptr == NULL)
278 return;
279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
280 case FUT_OFF_INODE:
281 atomic_inc(&key->shared.inode->i_count);
282 break;
283 case FUT_OFF_MMSHARED:
284 atomic_inc(&key->private.mm->mm_count);
285 break;
286 }
287} 268}
288 269
289/* 270static inline
290 * Drop a reference to the resource addressed by a key. 271void put_futex_key(int fshared, union futex_key *key)
291 * The hash bucket spinlock must not be held.
292 */
293static void drop_futex_key_refs(union futex_key *key)
294{ 272{
295 if (!key->both.ptr) 273 drop_futex_key_refs(key);
296 return;
297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
298 case FUT_OFF_INODE:
299 iput(key->shared.inode);
300 break;
301 case FUT_OFF_MMSHARED:
302 mmdrop(key->private.mm);
303 break;
304 }
305} 274}
306 275
307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 276static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -328,10 +297,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
328 297
329/* 298/*
330 * Fault handling. 299 * Fault handling.
331 * if fshared is non NULL, current->mm->mmap_sem is already held
332 */ 300 */
333static int futex_handle_fault(unsigned long address, 301static int futex_handle_fault(unsigned long address, int attempt)
334 struct rw_semaphore *fshared, int attempt)
335{ 302{
336 struct vm_area_struct * vma; 303 struct vm_area_struct * vma;
337 struct mm_struct *mm = current->mm; 304 struct mm_struct *mm = current->mm;
@@ -340,8 +307,7 @@ static int futex_handle_fault(unsigned long address,
340 if (attempt > 2) 307 if (attempt > 2)
341 return ret; 308 return ret;
342 309
343 if (!fshared) 310 down_read(&mm->mmap_sem);
344 down_read(&mm->mmap_sem);
345 vma = find_vma(mm, address); 311 vma = find_vma(mm, address);
346 if (vma && address >= vma->vm_start && 312 if (vma && address >= vma->vm_start &&
347 (vma->vm_flags & VM_WRITE)) { 313 (vma->vm_flags & VM_WRITE)) {
@@ -361,8 +327,7 @@ static int futex_handle_fault(unsigned long address,
361 current->min_flt++; 327 current->min_flt++;
362 } 328 }
363 } 329 }
364 if (!fshared) 330 up_read(&mm->mmap_sem);
365 up_read(&mm->mmap_sem);
366 return ret; 331 return ret;
367} 332}
368 333
@@ -385,6 +350,7 @@ static int refill_pi_state_cache(void)
385 /* pi_mutex gets initialized later */ 350 /* pi_mutex gets initialized later */
386 pi_state->owner = NULL; 351 pi_state->owner = NULL;
387 atomic_set(&pi_state->refcount, 1); 352 atomic_set(&pi_state->refcount, 1);
353 pi_state->key = FUTEX_KEY_INIT;
388 354
389 current->pi_state_cache = pi_state; 355 current->pi_state_cache = pi_state;
390 356
@@ -462,7 +428,7 @@ void exit_pi_state_list(struct task_struct *curr)
462 struct list_head *next, *head = &curr->pi_state_list; 428 struct list_head *next, *head = &curr->pi_state_list;
463 struct futex_pi_state *pi_state; 429 struct futex_pi_state *pi_state;
464 struct futex_hash_bucket *hb; 430 struct futex_hash_bucket *hb;
465 union futex_key key; 431 union futex_key key = FUTEX_KEY_INIT;
466 432
467 if (!futex_cmpxchg_enabled) 433 if (!futex_cmpxchg_enabled)
468 return; 434 return;
@@ -719,20 +685,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
719 * Wake up all waiters hashed on the physical page that is mapped 685 * Wake up all waiters hashed on the physical page that is mapped
720 * to this virtual address: 686 * to this virtual address:
721 */ 687 */
722static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 688static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
723 int nr_wake, u32 bitset)
724{ 689{
725 struct futex_hash_bucket *hb; 690 struct futex_hash_bucket *hb;
726 struct futex_q *this, *next; 691 struct futex_q *this, *next;
727 struct plist_head *head; 692 struct plist_head *head;
728 union futex_key key; 693 union futex_key key = FUTEX_KEY_INIT;
729 int ret; 694 int ret;
730 695
731 if (!bitset) 696 if (!bitset)
732 return -EINVAL; 697 return -EINVAL;
733 698
734 futex_lock_mm(fshared);
735
736 ret = get_futex_key(uaddr, fshared, &key); 699 ret = get_futex_key(uaddr, fshared, &key);
737 if (unlikely(ret != 0)) 700 if (unlikely(ret != 0))
738 goto out; 701 goto out;
@@ -760,7 +723,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
760 723
761 spin_unlock(&hb->lock); 724 spin_unlock(&hb->lock);
762out: 725out:
763 futex_unlock_mm(fshared); 726 put_futex_key(fshared, &key);
764 return ret; 727 return ret;
765} 728}
766 729
@@ -769,19 +732,16 @@ out:
769 * to this virtual address: 732 * to this virtual address:
770 */ 733 */
771static int 734static int
772futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 735futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
773 u32 __user *uaddr2,
774 int nr_wake, int nr_wake2, int op) 736 int nr_wake, int nr_wake2, int op)
775{ 737{
776 union futex_key key1, key2; 738 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
777 struct futex_hash_bucket *hb1, *hb2; 739 struct futex_hash_bucket *hb1, *hb2;
778 struct plist_head *head; 740 struct plist_head *head;
779 struct futex_q *this, *next; 741 struct futex_q *this, *next;
780 int ret, op_ret, attempt = 0; 742 int ret, op_ret, attempt = 0;
781 743
782retryfull: 744retryfull:
783 futex_lock_mm(fshared);
784
785 ret = get_futex_key(uaddr1, fshared, &key1); 745 ret = get_futex_key(uaddr1, fshared, &key1);
786 if (unlikely(ret != 0)) 746 if (unlikely(ret != 0))
787 goto out; 747 goto out;
@@ -826,18 +786,12 @@ retry:
826 */ 786 */
827 if (attempt++) { 787 if (attempt++) {
828 ret = futex_handle_fault((unsigned long)uaddr2, 788 ret = futex_handle_fault((unsigned long)uaddr2,
829 fshared, attempt); 789 attempt);
830 if (ret) 790 if (ret)
831 goto out; 791 goto out;
832 goto retry; 792 goto retry;
833 } 793 }
834 794
835 /*
836 * If we would have faulted, release mmap_sem,
837 * fault it in and start all over again.
838 */
839 futex_unlock_mm(fshared);
840
841 ret = get_user(dummy, uaddr2); 795 ret = get_user(dummy, uaddr2);
842 if (ret) 796 if (ret)
843 return ret; 797 return ret;
@@ -873,7 +827,8 @@ retry:
873 if (hb1 != hb2) 827 if (hb1 != hb2)
874 spin_unlock(&hb2->lock); 828 spin_unlock(&hb2->lock);
875out: 829out:
876 futex_unlock_mm(fshared); 830 put_futex_key(fshared, &key2);
831 put_futex_key(fshared, &key1);
877 832
878 return ret; 833 return ret;
879} 834}
@@ -882,19 +837,16 @@ out:
882 * Requeue all waiters hashed on one physical page to another 837 * Requeue all waiters hashed on one physical page to another
883 * physical page. 838 * physical page.
884 */ 839 */
885static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, 840static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
886 u32 __user *uaddr2,
887 int nr_wake, int nr_requeue, u32 *cmpval) 841 int nr_wake, int nr_requeue, u32 *cmpval)
888{ 842{
889 union futex_key key1, key2; 843 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
890 struct futex_hash_bucket *hb1, *hb2; 844 struct futex_hash_bucket *hb1, *hb2;
891 struct plist_head *head1; 845 struct plist_head *head1;
892 struct futex_q *this, *next; 846 struct futex_q *this, *next;
893 int ret, drop_count = 0; 847 int ret, drop_count = 0;
894 848
895 retry: 849 retry:
896 futex_lock_mm(fshared);
897
898 ret = get_futex_key(uaddr1, fshared, &key1); 850 ret = get_futex_key(uaddr1, fshared, &key1);
899 if (unlikely(ret != 0)) 851 if (unlikely(ret != 0))
900 goto out; 852 goto out;
@@ -917,12 +869,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
917 if (hb1 != hb2) 869 if (hb1 != hb2)
918 spin_unlock(&hb2->lock); 870 spin_unlock(&hb2->lock);
919 871
920 /*
921 * If we would have faulted, release mmap_sem, fault
922 * it in and start all over again.
923 */
924 futex_unlock_mm(fshared);
925
926 ret = get_user(curval, uaddr1); 872 ret = get_user(curval, uaddr1);
927 873
928 if (!ret) 874 if (!ret)
@@ -974,7 +920,8 @@ out_unlock:
974 drop_futex_key_refs(&key1); 920 drop_futex_key_refs(&key1);
975 921
976out: 922out:
977 futex_unlock_mm(fshared); 923 put_futex_key(fshared, &key2);
924 put_futex_key(fshared, &key1);
978 return ret; 925 return ret;
979} 926}
980 927
@@ -1096,8 +1043,7 @@ static void unqueue_me_pi(struct futex_q *q)
1096 * private futexes. 1043 * private futexes.
1097 */ 1044 */
1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1045static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1099 struct task_struct *newowner, 1046 struct task_struct *newowner, int fshared)
1100 struct rw_semaphore *fshared)
1101{ 1047{
1102 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1048 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1103 struct futex_pi_state *pi_state = q->pi_state; 1049 struct futex_pi_state *pi_state = q->pi_state;
@@ -1176,7 +1122,7 @@ retry:
1176handle_fault: 1122handle_fault:
1177 spin_unlock(q->lock_ptr); 1123 spin_unlock(q->lock_ptr);
1178 1124
1179 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); 1125 ret = futex_handle_fault((unsigned long)uaddr, attempt++);
1180 1126
1181 spin_lock(q->lock_ptr); 1127 spin_lock(q->lock_ptr);
1182 1128
@@ -1200,7 +1146,7 @@ handle_fault:
1200 1146
1201static long futex_wait_restart(struct restart_block *restart); 1147static long futex_wait_restart(struct restart_block *restart);
1202 1148
1203static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1149static int futex_wait(u32 __user *uaddr, int fshared,
1204 u32 val, ktime_t *abs_time, u32 bitset) 1150 u32 val, ktime_t *abs_time, u32 bitset)
1205{ 1151{
1206 struct task_struct *curr = current; 1152 struct task_struct *curr = current;
@@ -1218,8 +1164,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1218 q.pi_state = NULL; 1164 q.pi_state = NULL;
1219 q.bitset = bitset; 1165 q.bitset = bitset;
1220 retry: 1166 retry:
1221 futex_lock_mm(fshared); 1167 q.key = FUTEX_KEY_INIT;
1222
1223 ret = get_futex_key(uaddr, fshared, &q.key); 1168 ret = get_futex_key(uaddr, fshared, &q.key);
1224 if (unlikely(ret != 0)) 1169 if (unlikely(ret != 0))
1225 goto out_release_sem; 1170 goto out_release_sem;
@@ -1251,12 +1196,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1251 if (unlikely(ret)) { 1196 if (unlikely(ret)) {
1252 queue_unlock(&q, hb); 1197 queue_unlock(&q, hb);
1253 1198
1254 /*
1255 * If we would have faulted, release mmap_sem, fault it in and
1256 * start all over again.
1257 */
1258 futex_unlock_mm(fshared);
1259
1260 ret = get_user(uval, uaddr); 1199 ret = get_user(uval, uaddr);
1261 1200
1262 if (!ret) 1201 if (!ret)
@@ -1271,12 +1210,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1271 queue_me(&q, hb); 1210 queue_me(&q, hb);
1272 1211
1273 /* 1212 /*
1274 * Now the futex is queued and we have checked the data, we
1275 * don't want to hold mmap_sem while we sleep.
1276 */
1277 futex_unlock_mm(fshared);
1278
1279 /*
1280 * There might have been scheduling since the queue_me(), as we 1213 * There might have been scheduling since the queue_me(), as we
1281 * cannot hold a spinlock across the get_user() in case it 1214 * cannot hold a spinlock across the get_user() in case it
1282 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1215 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
@@ -1363,7 +1296,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1363 queue_unlock(&q, hb); 1296 queue_unlock(&q, hb);
1364 1297
1365 out_release_sem: 1298 out_release_sem:
1366 futex_unlock_mm(fshared); 1299 put_futex_key(fshared, &q.key);
1367 return ret; 1300 return ret;
1368} 1301}
1369 1302
@@ -1371,13 +1304,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1371static long futex_wait_restart(struct restart_block *restart) 1304static long futex_wait_restart(struct restart_block *restart)
1372{ 1305{
1373 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1306 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1374 struct rw_semaphore *fshared = NULL; 1307 int fshared = 0;
1375 ktime_t t; 1308 ktime_t t;
1376 1309
1377 t.tv64 = restart->futex.time; 1310 t.tv64 = restart->futex.time;
1378 restart->fn = do_no_restart_syscall; 1311 restart->fn = do_no_restart_syscall;
1379 if (restart->futex.flags & FLAGS_SHARED) 1312 if (restart->futex.flags & FLAGS_SHARED)
1380 fshared = &current->mm->mmap_sem; 1313 fshared = 1;
1381 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1314 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1382 restart->futex.bitset); 1315 restart->futex.bitset);
1383} 1316}
@@ -1389,7 +1322,7 @@ static long futex_wait_restart(struct restart_block *restart)
1389 * if there are waiters then it will block, it does PI, etc. (Due to 1322 * if there are waiters then it will block, it does PI, etc. (Due to
1390 * races the kernel might see a 0 value of the futex too.) 1323 * races the kernel might see a 0 value of the futex too.)
1391 */ 1324 */
1392static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, 1325static int futex_lock_pi(u32 __user *uaddr, int fshared,
1393 int detect, ktime_t *time, int trylock) 1326 int detect, ktime_t *time, int trylock)
1394{ 1327{
1395 struct hrtimer_sleeper timeout, *to = NULL; 1328 struct hrtimer_sleeper timeout, *to = NULL;
@@ -1412,8 +1345,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1412 1345
1413 q.pi_state = NULL; 1346 q.pi_state = NULL;
1414 retry: 1347 retry:
1415 futex_lock_mm(fshared); 1348 q.key = FUTEX_KEY_INIT;
1416
1417 ret = get_futex_key(uaddr, fshared, &q.key); 1349 ret = get_futex_key(uaddr, fshared, &q.key);
1418 if (unlikely(ret != 0)) 1350 if (unlikely(ret != 0))
1419 goto out_release_sem; 1351 goto out_release_sem;
@@ -1502,7 +1434,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1502 * exit to complete. 1434 * exit to complete.
1503 */ 1435 */
1504 queue_unlock(&q, hb); 1436 queue_unlock(&q, hb);
1505 futex_unlock_mm(fshared);
1506 cond_resched(); 1437 cond_resched();
1507 goto retry; 1438 goto retry;
1508 1439
@@ -1534,12 +1465,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1534 */ 1465 */
1535 queue_me(&q, hb); 1466 queue_me(&q, hb);
1536 1467
1537 /*
1538 * Now the futex is queued and we have checked the data, we
1539 * don't want to hold mmap_sem while we sleep.
1540 */
1541 futex_unlock_mm(fshared);
1542
1543 WARN_ON(!q.pi_state); 1468 WARN_ON(!q.pi_state);
1544 /* 1469 /*
1545 * Block on the PI mutex: 1470 * Block on the PI mutex:
@@ -1552,7 +1477,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1552 ret = ret ? 0 : -EWOULDBLOCK; 1477 ret = ret ? 0 : -EWOULDBLOCK;
1553 } 1478 }
1554 1479
1555 futex_lock_mm(fshared);
1556 spin_lock(q.lock_ptr); 1480 spin_lock(q.lock_ptr);
1557 1481
1558 if (!ret) { 1482 if (!ret) {
@@ -1618,7 +1542,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1618 1542
1619 /* Unqueue and drop the lock */ 1543 /* Unqueue and drop the lock */
1620 unqueue_me_pi(&q); 1544 unqueue_me_pi(&q);
1621 futex_unlock_mm(fshared);
1622 1545
1623 if (to) 1546 if (to)
1624 destroy_hrtimer_on_stack(&to->timer); 1547 destroy_hrtimer_on_stack(&to->timer);
@@ -1628,7 +1551,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1628 queue_unlock(&q, hb); 1551 queue_unlock(&q, hb);
1629 1552
1630 out_release_sem: 1553 out_release_sem:
1631 futex_unlock_mm(fshared); 1554 put_futex_key(fshared, &q.key);
1632 if (to) 1555 if (to)
1633 destroy_hrtimer_on_stack(&to->timer); 1556 destroy_hrtimer_on_stack(&to->timer);
1634 return ret; 1557 return ret;
@@ -1645,15 +1568,12 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1645 queue_unlock(&q, hb); 1568 queue_unlock(&q, hb);
1646 1569
1647 if (attempt++) { 1570 if (attempt++) {
1648 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1571 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1649 attempt);
1650 if (ret) 1572 if (ret)
1651 goto out_release_sem; 1573 goto out_release_sem;
1652 goto retry_unlocked; 1574 goto retry_unlocked;
1653 } 1575 }
1654 1576
1655 futex_unlock_mm(fshared);
1656
1657 ret = get_user(uval, uaddr); 1577 ret = get_user(uval, uaddr);
1658 if (!ret && (uval != -EFAULT)) 1578 if (!ret && (uval != -EFAULT))
1659 goto retry; 1579 goto retry;
@@ -1668,13 +1588,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1668 * This is the in-kernel slowpath: we look up the PI state (if any), 1588 * This is the in-kernel slowpath: we look up the PI state (if any),
1669 * and do the rt-mutex unlock. 1589 * and do the rt-mutex unlock.
1670 */ 1590 */
1671static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) 1591static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1672{ 1592{
1673 struct futex_hash_bucket *hb; 1593 struct futex_hash_bucket *hb;
1674 struct futex_q *this, *next; 1594 struct futex_q *this, *next;
1675 u32 uval; 1595 u32 uval;
1676 struct plist_head *head; 1596 struct plist_head *head;
1677 union futex_key key; 1597 union futex_key key = FUTEX_KEY_INIT;
1678 int ret, attempt = 0; 1598 int ret, attempt = 0;
1679 1599
1680retry: 1600retry:
@@ -1685,10 +1605,6 @@ retry:
1685 */ 1605 */
1686 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1606 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1687 return -EPERM; 1607 return -EPERM;
1688 /*
1689 * First take all the futex related locks:
1690 */
1691 futex_lock_mm(fshared);
1692 1608
1693 ret = get_futex_key(uaddr, fshared, &key); 1609 ret = get_futex_key(uaddr, fshared, &key);
1694 if (unlikely(ret != 0)) 1610 if (unlikely(ret != 0))
@@ -1747,7 +1663,7 @@ retry_unlocked:
1747out_unlock: 1663out_unlock:
1748 spin_unlock(&hb->lock); 1664 spin_unlock(&hb->lock);
1749out: 1665out:
1750 futex_unlock_mm(fshared); 1666 put_futex_key(fshared, &key);
1751 1667
1752 return ret; 1668 return ret;
1753 1669
@@ -1763,16 +1679,13 @@ pi_faulted:
1763 spin_unlock(&hb->lock); 1679 spin_unlock(&hb->lock);
1764 1680
1765 if (attempt++) { 1681 if (attempt++) {
1766 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1682 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1767 attempt);
1768 if (ret) 1683 if (ret)
1769 goto out; 1684 goto out;
1770 uval = 0; 1685 uval = 0;
1771 goto retry_unlocked; 1686 goto retry_unlocked;
1772 } 1687 }
1773 1688
1774 futex_unlock_mm(fshared);
1775
1776 ret = get_user(uval, uaddr); 1689 ret = get_user(uval, uaddr);
1777 if (!ret && (uval != -EFAULT)) 1690 if (!ret && (uval != -EFAULT))
1778 goto retry; 1691 goto retry;
@@ -1898,8 +1811,7 @@ retry:
1898 * PI futexes happens in exit_pi_state(): 1811 * PI futexes happens in exit_pi_state():
1899 */ 1812 */
1900 if (!pi && (uval & FUTEX_WAITERS)) 1813 if (!pi && (uval & FUTEX_WAITERS))
1901 futex_wake(uaddr, &curr->mm->mmap_sem, 1, 1814 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
1902 FUTEX_BITSET_MATCH_ANY);
1903 } 1815 }
1904 return 0; 1816 return 0;
1905} 1817}
@@ -1995,10 +1907,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1995{ 1907{
1996 int ret = -ENOSYS; 1908 int ret = -ENOSYS;
1997 int cmd = op & FUTEX_CMD_MASK; 1909 int cmd = op & FUTEX_CMD_MASK;
1998 struct rw_semaphore *fshared = NULL; 1910 int fshared = 0;
1999 1911
2000 if (!(op & FUTEX_PRIVATE_FLAG)) 1912 if (!(op & FUTEX_PRIVATE_FLAG))
2001 fshared = &current->mm->mmap_sem; 1913 fshared = 1;
2002 1914
2003 switch (cmd) { 1915 switch (cmd) {
2004 case FUTEX_WAIT: 1916 case FUTEX_WAIT:
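
[note] The futex.c hunks above replace the old struct rw_semaphore *fshared parameter (a pointer to current->mm->mmap_sem) with a plain int flag, drop the futex_lock_mm()/futex_unlock_mm() pairs, and instead bracket each operation with get_futex_key()/put_futex_key(), initialising keys with FUTEX_KEY_INIT. A minimal sketch of the resulting pattern, assuming futex.c context; the function name and the simplified error handling are illustrative, not from the patch:

/*
 * Illustrative sketch only (hypothetical helper, not part of the patch):
 * shows the get/put pairing that replaces mmap_sem locking.
 */
static int example_wake_one(u32 __user *uaddr, int fshared)
{
	union futex_key key = FUTEX_KEY_INIT;
	int ret;

	/* Takes a reference on the key's backing object, so mmap_sem
	 * no longer has to be held across the operation. */
	ret = get_futex_key(uaddr, fshared, &key);
	if (unlikely(ret != 0))
		return ret;

	/* ... look up the hash bucket and wake a waiter here ... */

	put_futex_key(fshared, &key);	/* must be dropped on every exit path */
	return ret;
}
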
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2b465dfde426..47e63349d1b2 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -664,14 +664,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
664 664
665 /* Timer is expired, act upon the callback mode */ 665 /* Timer is expired, act upon the callback mode */
666 switch(timer->cb_mode) { 666 switch(timer->cb_mode) {
667 case HRTIMER_CB_IRQSAFE_NO_RESTART:
668 debug_hrtimer_deactivate(timer);
669 /*
670 * We can call the callback from here. No restart
671 * happens, so no danger of recursion
672 */
673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
674 return 1;
675 case HRTIMER_CB_IRQSAFE_PERCPU: 667 case HRTIMER_CB_IRQSAFE_PERCPU:
676 case HRTIMER_CB_IRQSAFE_UNLOCKED: 668 case HRTIMER_CB_IRQSAFE_UNLOCKED:
677 /* 669 /*
@@ -683,7 +675,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
683 */ 675 */
684 debug_hrtimer_deactivate(timer); 676 debug_hrtimer_deactivate(timer);
685 return 1; 677 return 1;
686 case HRTIMER_CB_IRQSAFE:
687 case HRTIMER_CB_SOFTIRQ: 678 case HRTIMER_CB_SOFTIRQ:
688 /* 679 /*
689 * Move everything else into the softirq pending list ! 680 * Move everything else into the softirq pending list !
@@ -1209,6 +1200,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1209 enum hrtimer_restart (*fn)(struct hrtimer *); 1200 enum hrtimer_restart (*fn)(struct hrtimer *);
1210 struct hrtimer *timer; 1201 struct hrtimer *timer;
1211 int restart; 1202 int restart;
1203 int emulate_hardirq_ctx = 0;
1212 1204
1213 timer = list_entry(cpu_base->cb_pending.next, 1205 timer = list_entry(cpu_base->cb_pending.next,
1214 struct hrtimer, cb_entry); 1206 struct hrtimer, cb_entry);
@@ -1217,10 +1209,24 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1217 timer_stats_account_hrtimer(timer); 1209 timer_stats_account_hrtimer(timer);
1218 1210
1219 fn = timer->function; 1211 fn = timer->function;
1212 /*
1213 * A timer might have been added to the cb_pending list
1214 * when it was migrated during a cpu-offline operation.
1215 * Emulate hardirq context for such timers.
1216 */
1217 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1218 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
1219 emulate_hardirq_ctx = 1;
1220
1220 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); 1221 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1221 spin_unlock_irq(&cpu_base->lock); 1222 spin_unlock_irq(&cpu_base->lock);
1222 1223
1223 restart = fn(timer); 1224 if (unlikely(emulate_hardirq_ctx)) {
1225 local_irq_disable();
1226 restart = fn(timer);
1227 local_irq_enable();
1228 } else
1229 restart = fn(timer);
1224 1230
1225 spin_lock_irq(&cpu_base->lock); 1231 spin_lock_irq(&cpu_base->lock);
1226 1232
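
[note] The hrtimer.c change removes the HRTIMER_CB_IRQSAFE and HRTIMER_CB_IRQSAFE_NO_RESTART callback modes and teaches run_hrtimer_pending() to emulate hardirq context for per-CPU/unlocked timers that landed on the cb_pending list during a CPU-offline migration. Condensed from the added lines (timer, fn, restart and cpu_base come from the surrounding function), the dispatch now reads roughly:

	int emulate_hardirq_ctx =
		(timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
		 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED);

	__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
	spin_unlock_irq(&cpu_base->lock);

	if (unlikely(emulate_hardirq_ctx)) {
		/* the callback expects hardirq context: run it with irqs off */
		local_irq_disable();
		restart = fn(timer);
		local_irq_enable();
	} else {
		restart = fn(timer);
	}

	spin_lock_irq(&cpu_base->lock);
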
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index cc0f7321b8ce..650ce4102a63 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -40,6 +40,9 @@ unsigned long probe_irq_on(void)
40 * flush such a longstanding irq before considering it as spurious. 40 * flush such a longstanding irq before considering it as spurious.
41 */ 41 */
42 for_each_irq_desc_reverse(i, desc) { 42 for_each_irq_desc_reverse(i, desc) {
43 if (!desc)
44 continue;
45
43 spin_lock_irq(&desc->lock); 46 spin_lock_irq(&desc->lock);
44 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 47 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
45 /* 48 /*
@@ -68,6 +71,9 @@ unsigned long probe_irq_on(void)
68 * happened in the previous stage, it may have masked itself) 71 * happened in the previous stage, it may have masked itself)
69 */ 72 */
70 for_each_irq_desc_reverse(i, desc) { 73 for_each_irq_desc_reverse(i, desc) {
74 if (!desc)
75 continue;
76
71 spin_lock_irq(&desc->lock); 77 spin_lock_irq(&desc->lock);
72 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
73 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -86,6 +92,9 @@ unsigned long probe_irq_on(void)
86 * Now filter out any obviously spurious interrupts 92 * Now filter out any obviously spurious interrupts
87 */ 93 */
88 for_each_irq_desc(i, desc) { 94 for_each_irq_desc(i, desc) {
95 if (!desc)
96 continue;
97
89 spin_lock_irq(&desc->lock); 98 spin_lock_irq(&desc->lock);
90 status = desc->status; 99 status = desc->status;
91 100
@@ -124,6 +133,9 @@ unsigned int probe_irq_mask(unsigned long val)
124 int i; 133 int i;
125 134
126 for_each_irq_desc(i, desc) { 135 for_each_irq_desc(i, desc) {
136 if (!desc)
137 continue;
138
127 spin_lock_irq(&desc->lock); 139 spin_lock_irq(&desc->lock);
128 status = desc->status; 140 status = desc->status;
129 141
@@ -166,6 +178,9 @@ int probe_irq_off(unsigned long val)
166 unsigned int status; 178 unsigned int status;
167 179
168 for_each_irq_desc(i, desc) { 180 for_each_irq_desc(i, desc) {
181 if (!desc)
182 continue;
183
169 spin_lock_irq(&desc->lock); 184 spin_lock_irq(&desc->lock);
170 status = desc->status; 185 status = desc->status;
171 186
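
[note] These if (!desc) checks exist because, with the sparse irq_desc support added in kernel/irq/handle.c further down, for_each_irq_desc() can yield NULL for interrupt numbers whose descriptor was never allocated. The same three-line guard recurs in kernel/irq/proc.c and kernel/irq/spurious.c below; the generic shape of such a walk is:

	struct irq_desc *desc;
	int irq;

	for_each_irq_desc(irq, desc) {
		if (!desc)	/* hole in the sparse descriptor array */
			continue;

		spin_lock_irq(&desc->lock);
		/* ... inspect or update desc->status here ... */
		spin_unlock_irq(&desc->lock);
	}
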
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 10b5092e9bfe..8e4fce4a1b1f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -24,9 +24,10 @@
24 */ 24 */
25void dynamic_irq_init(unsigned int irq) 25void dynamic_irq_init(unsigned int irq)
26{ 26{
27 struct irq_desc *desc = irq_to_desc(irq); 27 struct irq_desc *desc;
28 unsigned long flags; 28 unsigned long flags;
29 29
30 desc = irq_to_desc(irq);
30 if (!desc) { 31 if (!desc) {
31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); 32 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
32 return; 33 return;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c815b42d0f5b..8aa09547f5ef 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -15,9 +15,16 @@
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h>
19#include <linux/hash.h>
18 20
19#include "internals.h" 21#include "internals.h"
20 22
23/*
24 * lockdep: we want to handle all irq_desc locks as a single lock-class:
25 */
26static struct lock_class_key irq_desc_lock_class;
27
21/** 28/**
22 * handle_bad_irq - handle spurious and unhandled irqs 29 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number 30 * @irq: the interrupt number
@@ -49,6 +56,155 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
49int nr_irqs = NR_IRQS; 56int nr_irqs = NR_IRQS;
50EXPORT_SYMBOL_GPL(nr_irqs); 57EXPORT_SYMBOL_GPL(nr_irqs);
51 58
59void __init __attribute__((weak)) arch_early_irq_init(void)
60{
61}
62
63#ifdef CONFIG_SPARSE_IRQ
64static struct irq_desc irq_desc_init = {
65 .irq = -1,
66 .status = IRQ_DISABLED,
67 .chip = &no_irq_chip,
68 .handle_irq = handle_bad_irq,
69 .depth = 1,
70 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
71#ifdef CONFIG_SMP
72 .affinity = CPU_MASK_ALL
73#endif
74};
75
76static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
77{
78 unsigned long bytes;
79 char *ptr;
80 int node;
81
82 /* Compute how many bytes we need per irq and allocate them */
83 bytes = nr * sizeof(unsigned int);
84
85 node = cpu_to_node(cpu);
86 ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
87 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
88
89 if (ptr)
90 desc->kstat_irqs = (unsigned int *)ptr;
91}
92
93void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
94{
95}
96
97static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
98{
99 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
100 desc->irq = irq;
101#ifdef CONFIG_SMP
102 desc->cpu = cpu;
103#endif
104 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
105 init_kstat_irqs(desc, cpu, nr_cpu_ids);
106 if (!desc->kstat_irqs) {
107 printk(KERN_ERR "can not alloc kstat_irqs\n");
108 BUG_ON(1);
109 }
110 arch_init_chip_data(desc, cpu);
111}
112
113/*
114 * Protect the sparse_irqs:
115 */
116static DEFINE_SPINLOCK(sparse_irq_lock);
117
118struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
119
120static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
121 [0 ... NR_IRQS_LEGACY-1] = {
122 .irq = -1,
123 .status = IRQ_DISABLED,
124 .chip = &no_irq_chip,
125 .handle_irq = handle_bad_irq,
126 .depth = 1,
127 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
128#ifdef CONFIG_SMP
129 .affinity = CPU_MASK_ALL
130#endif
131 }
132};
133
134/* FIXME: use bootmem alloc ...*/
135static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
136
137void __init early_irq_init(void)
138{
139 struct irq_desc *desc;
140 int legacy_count;
141 int i;
142
143 desc = irq_desc_legacy;
144 legacy_count = ARRAY_SIZE(irq_desc_legacy);
145
146 for (i = 0; i < legacy_count; i++) {
147 desc[i].irq = i;
148 desc[i].kstat_irqs = kstat_irqs_legacy[i];
149
150 irq_desc_ptrs[i] = desc + i;
151 }
152
153 for (i = legacy_count; i < NR_IRQS; i++)
154 irq_desc_ptrs[i] = NULL;
155
156 arch_early_irq_init();
157}
158
159struct irq_desc *irq_to_desc(unsigned int irq)
160{
161 return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
162}
163
164struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
165{
166 struct irq_desc *desc;
167 unsigned long flags;
168 int node;
169
170 if (irq >= NR_IRQS) {
171 printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
172 irq, NR_IRQS);
173 WARN_ON(1);
174 return NULL;
175 }
176
177 desc = irq_desc_ptrs[irq];
178 if (desc)
179 return desc;
180
181 spin_lock_irqsave(&sparse_irq_lock, flags);
182
183 /* We have to check it to avoid races with another CPU */
184 desc = irq_desc_ptrs[irq];
185 if (desc)
186 goto out_unlock;
187
188 node = cpu_to_node(cpu);
189 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
190 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n",
191 irq, cpu, node);
192 if (!desc) {
193 printk(KERN_ERR "can not alloc irq_desc\n");
194 BUG_ON(1);
195 }
196 init_one_irq_desc(irq, desc, cpu);
197
198 irq_desc_ptrs[irq] = desc;
199
200out_unlock:
201 spin_unlock_irqrestore(&sparse_irq_lock, flags);
202
203 return desc;
204}
205
206#else
207
52struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 208struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
53 [0 ... NR_IRQS-1] = { 209 [0 ... NR_IRQS-1] = {
54 .status = IRQ_DISABLED, 210 .status = IRQ_DISABLED,
@@ -62,6 +218,8 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
62 } 218 }
63}; 219};
64 220
221#endif
222
65/* 223/*
66 * What should we do if we get a hw irq event on an illegal vector? 224 * What should we do if we get a hw irq event on an illegal vector?
67 * Each architecture has to answer this themself. 225 * Each architecture has to answer this themself.
@@ -261,17 +419,28 @@ out:
261 419
262 420
263#ifdef CONFIG_TRACE_IRQFLAGS 421#ifdef CONFIG_TRACE_IRQFLAGS
264/*
265 * lockdep: we want to handle all irq_desc locks as a single lock-class:
266 */
267static struct lock_class_key irq_desc_lock_class;
268
269void early_init_irq_lock_class(void) 422void early_init_irq_lock_class(void)
270{ 423{
424#ifndef CONFIG_SPARSE_IRQ
271 struct irq_desc *desc; 425 struct irq_desc *desc;
272 int i; 426 int i;
273 427
274 for_each_irq_desc(i, desc) 428 for_each_irq_desc(i, desc) {
429 if (!desc)
430 continue;
431
275 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 432 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
433 }
434#endif
435}
436#endif
437
438#ifdef CONFIG_SPARSE_IRQ
439unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
440{
441 struct irq_desc *desc = irq_to_desc(irq);
442 return desc->kstat_irqs[cpu];
276} 443}
277#endif 444#endif
445EXPORT_SYMBOL(kstat_irqs_cpu);
446
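
[note] With CONFIG_SPARSE_IRQ the static irq_desc[NR_IRQS] array becomes a table of pointers: the NR_IRQS_LEGACY descriptors are preallocated in early_irq_init(), everything else is created on demand. Code that may touch an interrupt before its descriptor exists is expected to use the allocating lookup; a sketch of the two lookup flavours introduced here (the surrounding error handling is illustrative):

	/* read-only fast path: NULL if the descriptor was never allocated */
	struct irq_desc *desc = irq_to_desc(irq);

	/* setup path: allocate on the node of the handling cpu if needed.
	 * NULL is only returned for an out-of-range irq number, since an
	 * allocation failure is treated as a BUG in this version. */
	desc = irq_to_desc_alloc_cpu(irq, cpu);
	if (!desc)
		return -EINVAL;
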
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c9767e641980..64c1c7253dae 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -25,6 +25,8 @@ static inline void unregister_handler_proc(unsigned int irq,
25 struct irqaction *action) { } 25 struct irqaction *action) { }
26#endif 26#endif
27 27
28extern int irq_select_affinity_usr(unsigned int irq);
29
28/* 30/*
29 * Debugging printout: 31 * Debugging printout:
30 */ 32 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c498a1b8c621..801addda3c43 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -82,24 +82,27 @@ int irq_can_set_affinity(unsigned int irq)
82int irq_set_affinity(unsigned int irq, cpumask_t cpumask) 82int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
83{ 83{
84 struct irq_desc *desc = irq_to_desc(irq); 84 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags;
85 86
86 if (!desc->chip->set_affinity) 87 if (!desc->chip->set_affinity)
87 return -EINVAL; 88 return -EINVAL;
88 89
90 spin_lock_irqsave(&desc->lock, flags);
91
89#ifdef CONFIG_GENERIC_PENDING_IRQ 92#ifdef CONFIG_GENERIC_PENDING_IRQ
90 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 93 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
91 unsigned long flags;
92
93 spin_lock_irqsave(&desc->lock, flags);
94 desc->affinity = cpumask; 94 desc->affinity = cpumask;
95 desc->chip->set_affinity(irq, cpumask); 95 desc->chip->set_affinity(irq, cpumask);
96 spin_unlock_irqrestore(&desc->lock, flags); 96 } else {
97 } else 97 desc->status |= IRQ_MOVE_PENDING;
98 set_pending_irq(irq, cpumask); 98 desc->pending_mask = cpumask;
99 }
99#else 100#else
100 desc->affinity = cpumask; 101 desc->affinity = cpumask;
101 desc->chip->set_affinity(irq, cpumask); 102 desc->chip->set_affinity(irq, cpumask);
102#endif 103#endif
104 desc->status |= IRQ_AFFINITY_SET;
105 spin_unlock_irqrestore(&desc->lock, flags);
103 return 0; 106 return 0;
104} 107}
105 108
@@ -107,24 +110,59 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
107/* 110/*
108 * Generic version of the affinity autoselector. 111 * Generic version of the affinity autoselector.
109 */ 112 */
110int irq_select_affinity(unsigned int irq) 113int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
111{ 114{
112 cpumask_t mask; 115 cpumask_t mask;
113 struct irq_desc *desc;
114 116
115 if (!irq_can_set_affinity(irq)) 117 if (!irq_can_set_affinity(irq))
116 return 0; 118 return 0;
117 119
118 cpus_and(mask, cpu_online_map, irq_default_affinity); 120 cpus_and(mask, cpu_online_map, irq_default_affinity);
119 121
120 desc = irq_to_desc(irq); 122 /*
123 * Preserve an userspace affinity setup, but make sure that
124 * one of the targets is online.
125 */
126 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
127 if (cpus_intersects(desc->affinity, cpu_online_map))
128 mask = desc->affinity;
129 else
130 desc->status &= ~IRQ_AFFINITY_SET;
131 }
132
121 desc->affinity = mask; 133 desc->affinity = mask;
122 desc->chip->set_affinity(irq, mask); 134 desc->chip->set_affinity(irq, mask);
123 135
124 return 0; 136 return 0;
125} 137}
138#else
139static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d)
140{
141 return irq_select_affinity(irq);
142}
126#endif 143#endif
127 144
145/*
146 * Called when affinity is set via /proc/irq
147 */
148int irq_select_affinity_usr(unsigned int irq)
149{
150 struct irq_desc *desc = irq_to_desc(irq);
151 unsigned long flags;
152 int ret;
153
154 spin_lock_irqsave(&desc->lock, flags);
155 ret = do_irq_select_affinity(irq, desc);
156 spin_unlock_irqrestore(&desc->lock, flags);
157
158 return ret;
159}
160
161#else
162static inline int do_irq_select_affinity(int irq, struct irq_desc *desc)
163{
164 return 0;
165}
128#endif 166#endif
129 167
130/** 168/**
@@ -327,7 +365,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
327 * IRQF_TRIGGER_* but the PIC does not support multiple 365 * IRQF_TRIGGER_* but the PIC does not support multiple
328 * flow-types? 366 * flow-types?
329 */ 367 */
330 pr_warning("No set_type function for IRQ %d (%s)\n", irq, 368 pr_debug("No set_type function for IRQ %d (%s)\n", irq,
331 chip ? (chip->name ? : "unknown") : "unknown"); 369 chip ? (chip->name ? : "unknown") : "unknown");
332 return 0; 370 return 0;
333 } 371 }
@@ -445,8 +483,12 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
445 /* Undo nested disables: */ 483 /* Undo nested disables: */
446 desc->depth = 1; 484 desc->depth = 1;
447 485
486 /* Exclude IRQ from balancing if requested */
487 if (new->flags & IRQF_NOBALANCING)
488 desc->status |= IRQ_NO_BALANCING;
489
448 /* Set default affinity mask once everything is setup */ 490 /* Set default affinity mask once everything is setup */
449 irq_select_affinity(irq); 491 do_irq_select_affinity(irq, desc);
450 492
451 } else if ((new->flags & IRQF_TRIGGER_MASK) 493 } else if ((new->flags & IRQF_TRIGGER_MASK)
452 && (new->flags & IRQF_TRIGGER_MASK) 494 && (new->flags & IRQF_TRIGGER_MASK)
@@ -459,10 +501,6 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
459 501
460 *p = new; 502 *p = new;
461 503
462 /* Exclude IRQ from balancing */
463 if (new->flags & IRQF_NOBALANCING)
464 desc->status |= IRQ_NO_BALANCING;
465
466 /* Reset broken irq detection when installing new handler */ 504 /* Reset broken irq detection when installing new handler */
467 desc->irq_count = 0; 505 desc->irq_count = 0;
468 desc->irqs_unhandled = 0; 506 desc->irqs_unhandled = 0;
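
[note] In kernel/irq/manage.c, irq_set_affinity() now does the whole update under desc->lock, marks the descriptor with IRQ_AFFINITY_SET, and open-codes the deferred move (making set_pending_irq(), deleted from kernel/irq/migration.c just below, unnecessary). do_irq_select_affinity() then keeps a user-chosen mask as long as it still intersects the online CPUs. Consolidated from the added lines, the mask selection is roughly:

	cpumask_t mask;

	cpus_and(mask, cpu_online_map, irq_default_affinity);

	if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
		if (cpus_intersects(desc->affinity, cpu_online_map))
			mask = desc->affinity;		   /* keep the user's setting */
		else
			desc->status &= ~IRQ_AFFINITY_SET; /* stale, fall back */
	}

	desc->affinity = mask;
	desc->chip->set_affinity(irq, mask);
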
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 90b920d3f52b..9db681d95814 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,17 +1,6 @@
1 1
2#include <linux/irq.h> 2#include <linux/irq.h>
3 3
4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{
6 struct irq_desc *desc = irq_to_desc(irq);
7 unsigned long flags;
8
9 spin_lock_irqsave(&desc->lock, flags);
10 desc->status |= IRQ_MOVE_PENDING;
11 desc->pending_mask = mask;
12 spin_unlock_irqrestore(&desc->lock, flags);
13}
14
15void move_masked_irq(int irq) 4void move_masked_irq(int irq)
16{ 5{
17 struct irq_desc *desc = irq_to_desc(irq); 6 struct irq_desc *desc = irq_to_desc(irq);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4d161c70ba55..f6b3440f05bc 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -62,7 +62,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
62 if (!cpus_intersects(new_value, cpu_online_map)) 62 if (!cpus_intersects(new_value, cpu_online_map))
63 /* Special case for empty set - allow the architecture 63 /* Special case for empty set - allow the architecture
64 code to set default SMP affinity. */ 64 code to set default SMP affinity. */
65 return irq_select_affinity(irq) ? -EINVAL : count; 65 return irq_select_affinity_usr(irq) ? -EINVAL : count;
66 66
67 irq_set_affinity(irq, new_value); 67 irq_set_affinity(irq, new_value);
68 68
@@ -243,7 +243,11 @@ void init_irq_proc(void)
243 /* 243 /*
244 * Create entries for all existing IRQs. 244 * Create entries for all existing IRQs.
245 */ 245 */
246 for_each_irq_desc(irq, desc) 246 for_each_irq_desc(irq, desc) {
247 if (!desc)
248 continue;
249
247 register_irq_proc(irq, desc); 250 register_irq_proc(irq, desc);
251 }
248} 252}
249 253
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56e..3738107531fd 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -91,6 +91,9 @@ static int misrouted_irq(int irq)
91 int i, ok = 0; 91 int i, ok = 0;
92 92
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 if (!desc)
95 continue;
96
94 if (!i) 97 if (!i)
95 continue; 98 continue;
96 99
@@ -112,6 +115,8 @@ static void poll_spurious_irqs(unsigned long dummy)
112 for_each_irq_desc(i, desc) { 115 for_each_irq_desc(i, desc) {
113 unsigned int status; 116 unsigned int status;
114 117
118 if (!desc)
119 continue;
115 if (!i) 120 if (!i)
116 continue; 121 continue;
117 122
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5072cf1685a2..7b8b0f21a5b1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -304,17 +304,24 @@ int sprint_symbol(char *buffer, unsigned long address)
304 char *modname; 304 char *modname;
305 const char *name; 305 const char *name;
306 unsigned long offset, size; 306 unsigned long offset, size;
307 char namebuf[KSYM_NAME_LEN]; 307 int len;
308 308
309 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); 309 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
310 if (!name) 310 if (!name)
311 return sprintf(buffer, "0x%lx", address); 311 return sprintf(buffer, "0x%lx", address);
312 312
313 if (name != buffer)
314 strcpy(buffer, name);
315 len = strlen(buffer);
316 buffer += len;
317
313 if (modname) 318 if (modname)
314 return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, 319 len += sprintf(buffer, "+%#lx/%#lx [%s]",
315 size, modname); 320 offset, size, modname);
316 else 321 else
317 return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); 322 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
323
324 return len;
318} 325}
319 326
320/* Look up a kernel symbol and print it to the kernel messages. */ 327/* Look up a kernel symbol and print it to the kernel messages. */
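
[note] sprint_symbol() now formats directly into the caller's buffer (the on-stack namebuf is gone) and returns the length of the full "name+offset/size [module]" string, which is why callers need KSYM_SYMBOL_LEN rather than KSYM_NAME_LEN, as the latencytop.c hunk below also shows. A minimal usage sketch (the function and printk format are illustrative):

#include <linux/kernel.h>
#include <linux/kallsyms.h>

static void show_caller(unsigned long addr)
{
	char sym[KSYM_SYMBOL_LEN];	/* must be the larger buffer */
	int len;

	len = sprint_symbol(sym, addr);
	printk(KERN_DEBUG "caller: %s (%d chars)\n", sym, len);
}
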
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 8b57a2597f21..9f8a3f25259a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,7 +72,7 @@ static bool kprobe_enabled;
72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct { 74static struct {
75 spinlock_t lock ____cacheline_aligned; 75 spinlock_t lock ____cacheline_aligned_in_smp;
76} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 76} kretprobe_table_locks[KPROBE_TABLE_SIZE];
77 77
78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
@@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p,
613 return -EINVAL; 613 return -EINVAL;
614 p->addr = addr; 614 p->addr = addr;
615 615
616 if (!kernel_text_address((unsigned long) p->addr) || 616 preempt_disable();
617 in_kprobes_functions((unsigned long) p->addr)) 617 if (!__kernel_text_address((unsigned long) p->addr) ||
618 in_kprobes_functions((unsigned long) p->addr)) {
619 preempt_enable();
618 return -EINVAL; 620 return -EINVAL;
621 }
619 622
620 p->mod_refcounted = 0; 623 p->mod_refcounted = 0;
621 624
622 /* 625 /*
623 * Check if are we probing a module. 626 * Check if are we probing a module.
624 */ 627 */
625 probed_mod = module_text_address((unsigned long) p->addr); 628 probed_mod = __module_text_address((unsigned long) p->addr);
626 if (probed_mod) { 629 if (probed_mod) {
627 struct module *calling_mod = module_text_address(called_from); 630 struct module *calling_mod;
631 calling_mod = __module_text_address(called_from);
628 /* 632 /*
629 * We must allow modules to probe themself and in this case 633 * We must allow modules to probe themself and in this case
630 * avoid incrementing the module refcount, so as to allow 634 * avoid incrementing the module refcount, so as to allow
631 * unloading of self probing modules. 635 * unloading of self probing modules.
632 */ 636 */
633 if (calling_mod && calling_mod != probed_mod) { 637 if (calling_mod && calling_mod != probed_mod) {
634 if (unlikely(!try_module_get(probed_mod))) 638 if (unlikely(!try_module_get(probed_mod))) {
639 preempt_enable();
635 return -EINVAL; 640 return -EINVAL;
641 }
636 p->mod_refcounted = 1; 642 p->mod_refcounted = 1;
637 } else 643 } else
638 probed_mod = NULL; 644 probed_mod = NULL;
639 } 645 }
646 preempt_enable();
640 647
641 p->nmissed = 0; 648 p->nmissed = 0;
642 INIT_LIST_HEAD(&p->list); 649 INIT_LIST_HEAD(&p->list);
@@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
718 struct kprobe *old_p; 725 struct kprobe *old_p;
719 726
720 if (p->mod_refcounted) { 727 if (p->mod_refcounted) {
728 /*
729 * Since we've already incremented refcount,
730 * we don't need to disable preemption.
731 */
721 mod = module_text_address((unsigned long)p->addr); 732 mod = module_text_address((unsigned long)p->addr);
722 if (mod) 733 if (mod)
723 module_put(mod); 734 module_put(mod);
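
[note] __register_kprobe() switches to the lockless __kernel_text_address()/__module_text_address() lookups, which are only valid while preemption is disabled, so the whole check-and-pin sequence is now wrapped in preempt_disable()/preempt_enable(); the unregister path keeps module_text_address() because the probe already holds a module reference. The pattern, as a hypothetical helper (name and error handling are illustrative):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/preempt.h>

static int pin_text_module(unsigned long addr, struct module **modp)
{
	struct module *mod = NULL;
	int ret = 0;

	preempt_disable();
	if (!__kernel_text_address(addr)) {
		ret = -EINVAL;		/* neither core kernel nor module text */
	} else {
		mod = __module_text_address(addr); /* only valid while non-preemptible */
		if (mod && !try_module_get(mod))
			ret = -EINVAL;	/* module is going away */
	}
	preempt_enable();

	*modp = ret ? NULL : mod;	/* caller must module_put() when done */
	return ret;
}
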
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8e7a7ce3ed0a..4fbc456f393d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
21static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
22struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
23 23
24DEFINE_TRACE(sched_kthread_stop);
25DEFINE_TRACE(sched_kthread_stop_ret);
26
24struct kthread_create_info 27struct kthread_create_info
25{ 28{
26 /* Information passed to kthread() from kthreadd. */ 29 /* Information passed to kthread() from kthreadd. */
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 5e7b45c56923..449db466bdbc 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -191,7 +191,7 @@ static int lstats_show(struct seq_file *m, void *v)
191 latency_record[i].time, 191 latency_record[i].time,
192 latency_record[i].max); 192 latency_record[i].max);
193 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 193 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
194 char sym[KSYM_NAME_LEN]; 194 char sym[KSYM_SYMBOL_LEN];
195 char *c; 195 char *c;
196 if (!latency_record[i].backtrace[q]) 196 if (!latency_record[i].backtrace[q])
197 break; 197 break;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 06e157119d2b..c4c7df23f8c7 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -25,6 +25,7 @@
25 * Thanks to Arjan van de Ven for coming up with the initial idea of 25 * Thanks to Arjan van de Ven for coming up with the initial idea of
26 * mapping lock dependencies runtime. 26 * mapping lock dependencies runtime.
27 */ 27 */
28#define DISABLE_BRANCH_PROFILING
28#include <linux/mutex.h> 29#include <linux/mutex.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/delay.h> 31#include <linux/delay.h>
@@ -136,16 +137,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
136#ifdef CONFIG_LOCK_STAT 137#ifdef CONFIG_LOCK_STAT
137static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
138 139
139static int lock_contention_point(struct lock_class *class, unsigned long ip) 140static int lock_point(unsigned long points[], unsigned long ip)
140{ 141{
141 int i; 142 int i;
142 143
143 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 144 for (i = 0; i < LOCKSTAT_POINTS; i++) {
144 if (class->contention_point[i] == 0) { 145 if (points[i] == 0) {
145 class->contention_point[i] = ip; 146 points[i] = ip;
146 break; 147 break;
147 } 148 }
148 if (class->contention_point[i] == ip) 149 if (points[i] == ip)
149 break; 150 break;
150 } 151 }
151 152
@@ -185,6 +186,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
185 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
186 stats.contention_point[i] += pcs->contention_point[i]; 187 stats.contention_point[i] += pcs->contention_point[i];
187 188
189 for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
190 stats.contending_point[i] += pcs->contending_point[i];
191
188 lock_time_add(&pcs->read_waittime, &stats.read_waittime); 192 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
189 lock_time_add(&pcs->write_waittime, &stats.write_waittime); 193 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
190 194
@@ -209,6 +213,7 @@ void clear_lock_stats(struct lock_class *class)
209 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 213 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
210 } 214 }
211 memset(class->contention_point, 0, sizeof(class->contention_point)); 215 memset(class->contention_point, 0, sizeof(class->contention_point));
216 memset(class->contending_point, 0, sizeof(class->contending_point));
212} 217}
213 218
214static struct lock_class_stats *get_lock_stats(struct lock_class *class) 219static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -2999,7 +3004,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
2999 struct held_lock *hlock, *prev_hlock; 3004 struct held_lock *hlock, *prev_hlock;
3000 struct lock_class_stats *stats; 3005 struct lock_class_stats *stats;
3001 unsigned int depth; 3006 unsigned int depth;
3002 int i, point; 3007 int i, contention_point, contending_point;
3003 3008
3004 depth = curr->lockdep_depth; 3009 depth = curr->lockdep_depth;
3005 if (DEBUG_LOCKS_WARN_ON(!depth)) 3010 if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3023,18 +3028,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3023found_it: 3028found_it:
3024 hlock->waittime_stamp = sched_clock(); 3029 hlock->waittime_stamp = sched_clock();
3025 3030
3026 point = lock_contention_point(hlock_class(hlock), ip); 3031 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3032 contending_point = lock_point(hlock_class(hlock)->contending_point,
3033 lock->ip);
3027 3034
3028 stats = get_lock_stats(hlock_class(hlock)); 3035 stats = get_lock_stats(hlock_class(hlock));
3029 if (point < ARRAY_SIZE(stats->contention_point)) 3036 if (contention_point < LOCKSTAT_POINTS)
3030 stats->contention_point[point]++; 3037 stats->contention_point[contention_point]++;
3038 if (contending_point < LOCKSTAT_POINTS)
3039 stats->contending_point[contending_point]++;
3031 if (lock->cpu != smp_processor_id()) 3040 if (lock->cpu != smp_processor_id())
3032 stats->bounces[bounce_contended + !!hlock->read]++; 3041 stats->bounces[bounce_contended + !!hlock->read]++;
3033 put_lock_stats(stats); 3042 put_lock_stats(stats);
3034} 3043}
3035 3044
3036static void 3045static void
3037__lock_acquired(struct lockdep_map *lock) 3046__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3038{ 3047{
3039 struct task_struct *curr = current; 3048 struct task_struct *curr = current;
3040 struct held_lock *hlock, *prev_hlock; 3049 struct held_lock *hlock, *prev_hlock;
@@ -3083,6 +3092,7 @@ found_it:
3083 put_lock_stats(stats); 3092 put_lock_stats(stats);
3084 3093
3085 lock->cpu = cpu; 3094 lock->cpu = cpu;
3095 lock->ip = ip;
3086} 3096}
3087 3097
3088void lock_contended(struct lockdep_map *lock, unsigned long ip) 3098void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3104,7 +3114,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3104} 3114}
3105EXPORT_SYMBOL_GPL(lock_contended); 3115EXPORT_SYMBOL_GPL(lock_contended);
3106 3116
3107void lock_acquired(struct lockdep_map *lock) 3117void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3108{ 3118{
3109 unsigned long flags; 3119 unsigned long flags;
3110 3120
@@ -3117,7 +3127,7 @@ void lock_acquired(struct lockdep_map *lock)
3117 raw_local_irq_save(flags); 3127 raw_local_irq_save(flags);
3118 check_flags(flags); 3128 check_flags(flags);
3119 current->lockdep_recursion = 1; 3129 current->lockdep_recursion = 1;
3120 __lock_acquired(lock); 3130 __lock_acquired(lock, ip);
3121 current->lockdep_recursion = 0; 3131 current->lockdep_recursion = 0;
3122 raw_local_irq_restore(flags); 3132 raw_local_irq_restore(flags);
3123} 3133}
@@ -3276,10 +3286,10 @@ void __init lockdep_info(void)
3276{ 3286{
3277 printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); 3287 printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
3278 3288
3279 printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); 3289 printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
3280 printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); 3290 printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
3281 printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); 3291 printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
3282 printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); 3292 printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
3283 printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); 3293 printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
3284 printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); 3294 printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
3285 printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); 3295 printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
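
[note] The lock_stat side of lockdep now keeps two histograms per class: contention_point[] (where a task had to wait) and the new contending_point[] (the acquisition site that was holding the lock, taken from lock->ip, which __lock_acquired() now records). lock_acquired() therefore grows an ip argument; a caller in a lock slowpath would pass its own return address, roughly like this (the helpers are hypothetical, only the two lockdep calls are real):

	unsigned long ip = _RET_IP_;

	if (!my_trylock(lock)) {			/* hypothetical fast path */
		lock_contended(&lock->dep_map, ip);	/* about to block */
		my_wait_for_lock(lock);			/* hypothetical slow path */
	}
	lock_acquired(&lock->dep_map, ip);		/* stores ip in lock->ip */
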
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 20dbcbf9c7dd..13716b813896 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
470 470
471static void snprint_time(char *buf, size_t bufsiz, s64 nr) 471static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 s64 div;
474 s32 rem;
474 475
475 nr += 5; /* for display rounding */ 476 nr += 5; /* for display rounding */
476 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 477 div = div_s64_rem(nr, 1000, &rem);
477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); 478 snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
478} 479}
479 480
480static void seq_time(struct seq_file *m, s64 time) 481static void seq_time(struct seq_file *m, s64 time)
@@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
556 if (stats->read_holdtime.nr) 557 if (stats->read_holdtime.nr)
557 namelen += 2; 558 namelen += 2;
558 559
559 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 560 for (i = 0; i < LOCKSTAT_POINTS; i++) {
560 char sym[KSYM_SYMBOL_LEN]; 561 char sym[KSYM_SYMBOL_LEN];
561 char ip[32]; 562 char ip[32];
562 563
@@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
573 stats->contention_point[i], 574 stats->contention_point[i],
574 ip, sym); 575 ip, sym);
575 } 576 }
577 for (i = 0; i < LOCKSTAT_POINTS; i++) {
578 char sym[KSYM_SYMBOL_LEN];
579 char ip[32];
580
581 if (class->contending_point[i] == 0)
582 break;
583
584 if (!i)
585 seq_line(m, '-', 40-namelen, namelen);
586
587 sprint_symbol(sym, class->contending_point[i]);
588 snprintf(ip, sizeof(ip), "[<%p>]",
589 (void *)class->contending_point[i]);
590 seq_printf(m, "%40s %14lu %29s %s\n", name,
591 stats->contending_point[i],
592 ip, sym);
593 }
576 if (i) { 594 if (i) {
577 seq_puts(m, "\n"); 595 seq_puts(m, "\n");
578 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 596 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
582 600
583static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
584{ 602{
585 seq_printf(m, "lock_stat version 0.2\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
586 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
587 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
588 "%14s %14s\n", 606 "%14s %14s\n",
diff --git a/kernel/marker.c b/kernel/marker.c
index e9c6b2bc9400..ea54f2647868 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex);
43 */ 43 */
44#define MARKER_HASH_BITS 6 44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -64,11 +65,10 @@ struct marker_entry {
64 void *oldptr; 65 void *oldptr;
65 int rcu_pending; 66 int rcu_pending;
66 unsigned char ptype:1; 67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
67 char name[0]; /* Contains name'\0'format'\0' */ 69 char name[0]; /* Contains name'\0'format'\0' */
68}; 70};
69 71
70static struct hlist_head marker_table[MARKER_TABLE_SIZE];
71
72/** 72/**
73 * __mark_empty_function - Empty probe callback 73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data 74 * @probe_private: probe private data
@@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
81 * though the function pointer change and the marker enabling are two distinct 81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code. 82 * operations that modifies the execution flow of preemptible code.
83 */ 83 */
84void __mark_empty_function(void *probe_private, void *call_private, 84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args) 85 const char *fmt, va_list *args)
86{ 86{
87} 87}
@@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
97 * need to put a full smp_rmb() in this branch. This is why we do not use 97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read. 98 * rcu_dereference() for the pointer read.
99 */ 99 */
100void marker_probe_cb(const struct marker *mdata, void *call_private, ...) 100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
101{ 102{
102 va_list args; 103 va_list args;
103 char ptype; 104 char ptype;
@@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
107 * sure the teardown of the callbacks can be done correctly when they 108 * sure the teardown of the callbacks can be done correctly when they
108 * are in modules and they insure RCU read coherency. 109 * are in modules and they insure RCU read coherency.
109 */ 110 */
110 rcu_read_lock_sched(); 111 rcu_read_lock_sched_notrace();
111 ptype = mdata->ptype; 112 ptype = mdata->ptype;
112 if (likely(!ptype)) { 113 if (likely(!ptype)) {
113 marker_probe_func *func; 114 marker_probe_func *func;
@@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
145 va_end(args); 146 va_end(args);
146 } 147 }
147 } 148 }
148 rcu_read_unlock_sched(); 149 rcu_read_unlock_sched_notrace();
149} 150}
150EXPORT_SYMBOL_GPL(marker_probe_cb); 151EXPORT_SYMBOL_GPL(marker_probe_cb);
151 152
@@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
157 * 158 *
158 * Should be connected to markers "MARK_NOARGS". 159 * Should be connected to markers "MARK_NOARGS".
159 */ 160 */
160void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) 161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
161{ 163{
162 va_list args; /* not initialized */ 164 va_list args; /* not initialized */
163 char ptype; 165 char ptype;
164 166
165 rcu_read_lock_sched(); 167 rcu_read_lock_sched_notrace();
166 ptype = mdata->ptype; 168 ptype = mdata->ptype;
167 if (likely(!ptype)) { 169 if (likely(!ptype)) {
168 marker_probe_func *func; 170 marker_probe_func *func;
@@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
195 multi[i].func(multi[i].probe_private, call_private, 197 multi[i].func(multi[i].probe_private, call_private,
196 mdata->format, &args); 198 mdata->format, &args);
197 } 199 }
198 rcu_read_unlock_sched(); 200 rcu_read_unlock_sched_notrace();
199} 201}
200EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
201 202
202static void free_old_closure(struct rcu_head *head) 203static void free_old_closure(struct rcu_head *head)
203{ 204{
@@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format)
416 e->single.probe_private = NULL; 417 e->single.probe_private = NULL;
417 e->multi = NULL; 418 e->multi = NULL;
418 e->ptype = 0; 419 e->ptype = 0;
420 e->format_allocated = 0;
419 e->refcount = 0; 421 e->refcount = 0;
420 e->rcu_pending = 0; 422 e->rcu_pending = 0;
421 hlist_add_head(&e->hlist, head); 423 hlist_add_head(&e->hlist, head);
@@ -447,6 +449,8 @@ static int remove_marker(const char *name)
447 if (e->single.func != __mark_empty_function) 449 if (e->single.func != __mark_empty_function)
448 return -EBUSY; 450 return -EBUSY;
449 hlist_del(&e->hlist); 451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
450 /* Make sure the call_rcu has been executed */ 454 /* Make sure the call_rcu has been executed */
451 if (e->rcu_pending) 455 if (e->rcu_pending)
452 rcu_barrier_sched(); 456 rcu_barrier_sched();
@@ -457,57 +461,34 @@ static int remove_marker(const char *name)
457/* 461/*
458 * Set the mark_entry format to the format found in the element. 462 * Set the mark_entry format to the format found in the element.
459 */ 463 */
460static int marker_set_format(struct marker_entry **entry, const char *format) 464static int marker_set_format(struct marker_entry *entry, const char *format)
461{ 465{
462 struct marker_entry *e; 466 entry->format = kstrdup(format, GFP_KERNEL);
463 size_t name_len = strlen((*entry)->name) + 1; 467 if (!entry->format)
464 size_t format_len = strlen(format) + 1;
465
466
467 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
468 GFP_KERNEL);
469 if (!e)
470 return -ENOMEM; 468 return -ENOMEM;
471 memcpy(&e->name[0], (*entry)->name, name_len); 469 entry->format_allocated = 1;
472 e->format = &e->name[name_len]; 470
473 memcpy(e->format, format, format_len);
474 if (strcmp(e->format, MARK_NOARGS) == 0)
475 e->call = marker_probe_cb_noarg;
476 else
477 e->call = marker_probe_cb;
478 e->single = (*entry)->single;
479 e->multi = (*entry)->multi;
480 e->ptype = (*entry)->ptype;
481 e->refcount = (*entry)->refcount;
482 e->rcu_pending = 0;
483 hlist_add_before(&e->hlist, &(*entry)->hlist);
484 hlist_del(&(*entry)->hlist);
485 /* Make sure the call_rcu has been executed */
486 if ((*entry)->rcu_pending)
487 rcu_barrier_sched();
488 kfree(*entry);
489 *entry = e;
490 trace_mark(core_marker_format, "name %s format %s", 471 trace_mark(core_marker_format, "name %s format %s",
491 e->name, e->format); 472 entry->name, entry->format);
492 return 0; 473 return 0;
493} 474}
494 475
495/* 476/*
496 * Sets the probe callback corresponding to one marker. 477 * Sets the probe callback corresponding to one marker.
497 */ 478 */
498static int set_marker(struct marker_entry **entry, struct marker *elem, 479static int set_marker(struct marker_entry *entry, struct marker *elem,
499 int active) 480 int active)
500{ 481{
501 int ret; 482 int ret = 0;
502 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 483 WARN_ON(strcmp(entry->name, elem->name) != 0);
503 484
504 if ((*entry)->format) { 485 if (entry->format) {
505 if (strcmp((*entry)->format, elem->format) != 0) { 486 if (strcmp(entry->format, elem->format) != 0) {
506 printk(KERN_NOTICE 487 printk(KERN_NOTICE
507 "Format mismatch for probe %s " 488 "Format mismatch for probe %s "
508 "(%s), marker (%s)\n", 489 "(%s), marker (%s)\n",
509 (*entry)->name, 490 entry->name,
510 (*entry)->format, 491 entry->format,
511 elem->format); 492 elem->format);
512 return -EPERM; 493 return -EPERM;
513 } 494 }
@@ -523,37 +504,67 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
523 * pass from a "safe" callback (with argument) to an "unsafe" 504 * pass from a "safe" callback (with argument) to an "unsafe"
524 * callback (does not set arguments). 505 * callback (does not set arguments).
525 */ 506 */
526 elem->call = (*entry)->call; 507 elem->call = entry->call;
527 /* 508 /*
528 * Sanity check : 509 * Sanity check :
529 * We only update the single probe private data when the ptr is 510 * We only update the single probe private data when the ptr is
530 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) 511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
531 */ 512 */
532 WARN_ON(elem->single.func != __mark_empty_function 513 WARN_ON(elem->single.func != __mark_empty_function
533 && elem->single.probe_private 514 && elem->single.probe_private != entry->single.probe_private
534 != (*entry)->single.probe_private && 515 && !elem->ptype);
535 !elem->ptype); 516 elem->single.probe_private = entry->single.probe_private;
536 elem->single.probe_private = (*entry)->single.probe_private;
537 /* 517 /*
538 * Make sure the private data is valid when we update the 518 * Make sure the private data is valid when we update the
539 * single probe ptr. 519 * single probe ptr.
540 */ 520 */
541 smp_wmb(); 521 smp_wmb();
542 elem->single.func = (*entry)->single.func; 522 elem->single.func = entry->single.func;
543 /* 523 /*
544 * We also make sure that the new probe callbacks array is consistent 524 * We also make sure that the new probe callbacks array is consistent
545 * before setting a pointer to it. 525 * before setting a pointer to it.
546 */ 526 */
547 rcu_assign_pointer(elem->multi, (*entry)->multi); 527 rcu_assign_pointer(elem->multi, entry->multi);
548 /* 528 /*
549 * Update the function or multi probe array pointer before setting the 529 * Update the function or multi probe array pointer before setting the
550 * ptype. 530 * ptype.
551 */ 531 */
552 smp_wmb(); 532 smp_wmb();
553 elem->ptype = (*entry)->ptype; 533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
554 elem->state = active; 565 elem->state = active;
555 566
556 return 0; 567 return ret;
557} 568}
558 569
559/* 570/*
@@ -564,7 +575,24 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
564 */ 575 */
565static void disable_marker(struct marker *elem) 576static void disable_marker(struct marker *elem)
566{ 577{
578 int ret;
579
567 /* leave "call" as is. It is known statically. */ 580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
568 elem->state = 0; 596 elem->state = 0;
569 elem->single.func = __mark_empty_function; 597 elem->single.func = __mark_empty_function;
570 /* Update the function before setting the ptype */ 598 /* Update the function before setting the ptype */
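
Both hunks above pair try_module_get(__module_text_address(...)) with module_put() so the module providing tp_cb cannot be unloaded while its callback may still be reachable, and the comments insist tracepoint_probe_update_all() runs before that unload. A hedged kernel-style sketch of the refcount pairing follows; the register/unregister steps are placeholders and the locking around the module lookup is omitted.

    #include <linux/module.h>
    #include <linux/errno.h>

    /* Pin the module that owns 'cb' before wiring it up, and drop the
     * reference only once the callback can no longer run. */
    static int pin_and_register(void (*cb)(void))
    {
            struct module *owner = __module_text_address((unsigned long)cb);

            if (owner && !try_module_get(owner))
                    return -ENODEV;         /* module already going away */

            /* ... register cb with the subsystem here ... */
            return 0;
    }

    static void unregister_and_unpin(void (*cb)(void))
    {
            struct module *owner = __module_text_address((unsigned long)cb);

            /* ... unregister cb and wait out in-flight callers first ... */
            if (owner)
                    module_put(owner);
    }
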
@@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin,
594 for (iter = begin; iter < end; iter++) { 622 for (iter = begin; iter < end; iter++) {
595 mark_entry = get_marker(iter->name); 623 mark_entry = get_marker(iter->name);
596 if (mark_entry) { 624 if (mark_entry) {
597 set_marker(&mark_entry, iter, 625 set_marker(mark_entry, iter, !!mark_entry->refcount);
598 !!mark_entry->refcount);
599 /* 626 /*
600 * ignore error, continue 627 * ignore error, continue
601 */ 628 */
@@ -629,6 +656,7 @@ static void marker_update_probes(void)
629 marker_update_probe_range(__start___markers, __stop___markers); 656 marker_update_probe_range(__start___markers, __stop___markers);
630 /* Markers in modules. */ 657 /* Markers in modules. */
631 module_update_markers(); 658 module_update_markers();
659 tracepoint_probe_update_all();
632} 660}
633 661
634/** 662/**
@@ -657,7 +685,7 @@ int marker_probe_register(const char *name, const char *format,
657 ret = PTR_ERR(entry); 685 ret = PTR_ERR(entry);
658 } else if (format) { 686 } else if (format) {
659 if (!entry->format) 687 if (!entry->format)
660 ret = marker_set_format(&entry, format); 688 ret = marker_set_format(entry, format);
661 else if (strcmp(entry->format, format)) 689 else if (strcmp(entry->format, format))
662 ret = -EPERM; 690 ret = -EPERM;
663 } 691 }
@@ -676,10 +704,11 @@ int marker_probe_register(const char *name, const char *format,
676 goto end; 704 goto end;
677 } 705 }
678 mutex_unlock(&markers_mutex); 706 mutex_unlock(&markers_mutex);
679 marker_update_probes(); /* may update entry */ 707 marker_update_probes();
680 mutex_lock(&markers_mutex); 708 mutex_lock(&markers_mutex);
681 entry = get_marker(name); 709 entry = get_marker(name);
682 WARN_ON(!entry); 710 if (!entry)
711 goto end;
683 if (entry->rcu_pending) 712 if (entry->rcu_pending)
684 rcu_barrier_sched(); 713 rcu_barrier_sched();
685 entry->oldptr = old; 714 entry->oldptr = old;
@@ -720,7 +749,7 @@ int marker_probe_unregister(const char *name,
720 rcu_barrier_sched(); 749 rcu_barrier_sched();
721 old = marker_entry_remove_probe(entry, probe, probe_private); 750 old = marker_entry_remove_probe(entry, probe, probe_private);
722 mutex_unlock(&markers_mutex); 751 mutex_unlock(&markers_mutex);
723 marker_update_probes(); /* may update entry */ 752 marker_update_probes();
724 mutex_lock(&markers_mutex); 753 mutex_lock(&markers_mutex);
725 entry = get_marker(name); 754 entry = get_marker(name);
726 if (!entry) 755 if (!entry)
@@ -801,10 +830,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
801 rcu_barrier_sched(); 830 rcu_barrier_sched();
802 old = marker_entry_remove_probe(entry, NULL, probe_private); 831 old = marker_entry_remove_probe(entry, NULL, probe_private);
803 mutex_unlock(&markers_mutex); 832 mutex_unlock(&markers_mutex);
804 marker_update_probes(); /* may update entry */ 833 marker_update_probes();
805 mutex_lock(&markers_mutex); 834 mutex_lock(&markers_mutex);
806 entry = get_marker_from_private_data(probe, probe_private); 835 entry = get_marker_from_private_data(probe, probe_private);
807 WARN_ON(!entry); 836 if (!entry)
837 goto end;
808 if (entry->rcu_pending) 838 if (entry->rcu_pending)
809 rcu_barrier_sched(); 839 rcu_barrier_sched();
810 entry->oldptr = old; 840 entry->oldptr = old;
@@ -848,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
848 if (!e->ptype) { 878 if (!e->ptype) {
849 if (num == 0 && e->single.func == probe) 879 if (num == 0 && e->single.func == probe)
850 return e->single.probe_private; 880 return e->single.probe_private;
851 else
852 break;
853 } else { 881 } else {
854 struct marker_probe_closure *closure; 882 struct marker_probe_closure *closure;
855 int match = 0; 883 int match = 0;
@@ -861,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
861 return closure[i].probe_private; 889 return closure[i].probe_private;
862 } 890 }
863 } 891 }
892 break;
864 } 893 }
865 } 894 }
866 return ERR_PTR(-ENOENT); 895 return ERR_PTR(-ENOENT);
867} 896}
868EXPORT_SYMBOL_GPL(marker_get_private_data); 897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 1f4cc00e0c20..dd2a54155b54 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2184,24 +2184,15 @@ static noinline struct module *load_module(void __user *umod,
2184 struct mod_debug *debug; 2184 struct mod_debug *debug;
2185 unsigned int num_debug; 2185 unsigned int num_debug;
2186 2186
2187#ifdef CONFIG_MARKERS
2188 marker_update_probe_range(mod->markers,
2189 mod->markers + mod->num_markers);
2190#endif
2191 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2187 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2192 sizeof(*debug), &num_debug); 2188 sizeof(*debug), &num_debug);
2193 dynamic_printk_setup(debug, num_debug); 2189 dynamic_printk_setup(debug, num_debug);
2194
2195#ifdef CONFIG_TRACEPOINTS
2196 tracepoint_update_probe_range(mod->tracepoints,
2197 mod->tracepoints + mod->num_tracepoints);
2198#endif
2199 } 2190 }
2200 2191
2201 /* sechdrs[0].sh_size is always zero */ 2192 /* sechdrs[0].sh_size is always zero */
2202 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", 2193 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2203 sizeof(*mseg), &num_mcount); 2194 sizeof(*mseg), &num_mcount);
2204 ftrace_init_module(mseg, mseg + num_mcount); 2195 ftrace_init_module(mod, mseg, mseg + num_mcount);
2205 2196
2206 err = module_finalize(hdr, sechdrs, mod); 2197 err = module_finalize(hdr, sechdrs, mod);
2207 if (err < 0) 2198 if (err < 0)
@@ -2713,7 +2704,7 @@ int is_module_address(unsigned long addr)
2713 2704
2714 2705
2715/* Is this a valid kernel address? */ 2706/* Is this a valid kernel address? */
2716struct module *__module_text_address(unsigned long addr) 2707__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
2717{ 2708{
2718 struct module *mod; 2709 struct module *mod;
2719 2710
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d4..4f45d4b658ef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
59 * We also put the fastpath first in the kernel image, to make sure the 59 * We also put the fastpath first in the kernel image, to make sure the
60 * branch is predicted by the CPU as default-untaken. 60 * branch is predicted by the CPU as default-untaken.
61 */ 61 */
62static void noinline __sched 62static __used noinline void __sched
63__mutex_lock_slowpath(atomic_t *lock_count); 63__mutex_lock_slowpath(atomic_t *lock_count);
64 64
65/*** 65/***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
96EXPORT_SYMBOL(mutex_lock); 96EXPORT_SYMBOL(mutex_lock);
97#endif 97#endif
98 98
99static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 99static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
100 100
101/*** 101/***
102 * mutex_unlock - release the mutex 102 * mutex_unlock - release the mutex
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
184 } 184 }
185 185
186done: 186done:
187 lock_acquired(&lock->dep_map); 187 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 188 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 189 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
190 debug_mutex_set_owner(lock, task_thread_info(task)); 190 debug_mutex_set_owner(lock, task_thread_info(task));
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
268/* 268/*
269 * Release the lock, slowpath: 269 * Release the lock, slowpath:
270 */ 270 */
271static noinline void 271static __used noinline void
272__mutex_unlock_slowpath(atomic_t *lock_count) 272__mutex_unlock_slowpath(atomic_t *lock_count)
273{ 273{
274 __mutex_unlock_common_slowpath(lock_count, 1); 274 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
313} 313}
314EXPORT_SYMBOL(mutex_lock_killable); 314EXPORT_SYMBOL(mutex_lock_killable);
315 315
316static noinline void __sched 316static __used noinline void __sched
317__mutex_lock_slowpath(atomic_t *lock_count) 317__mutex_lock_slowpath(atomic_t *lock_count)
318{ 318{
319 struct mutex *lock = container_of(lock_count, struct mutex, count); 319 struct mutex *lock = container_of(lock_count, struct mutex, count);
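
Most of the mutex.c hunks add __used to the slowpath functions: on several architectures they are reached from the fastpath's inline assembly rather than from any C call the compiler can see, so the attribute keeps the static noinline definitions from being treated as unreferenced. A small userspace illustration of the attribute, assuming GCC or Clang:

    #include <stdio.h>

    /* No C caller exists for this function; in the kernel the mutex
     * fastpath reaches its slowpath from per-architecture assembly.
     * __attribute__((used)) makes the compiler emit the symbol instead
     * of discarding an apparently unreferenced static function. */
    static __attribute__((used, noinline)) void slowpath_stub(void)
    {
            puts("slowpath");
    }

    int main(void)
    {
            /* Nothing calls slowpath_stub() from C; inspect the binary
             * (e.g. nm ./a.out | grep slowpath_stub) to see it was kept. */
            return 0;
    }
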
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4282c0a40a57..61d5aa5eced3 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference(nb->next);
85
86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
88 WARN(1, "Invalid notifier called!");
89 nb = next_nb;
90 continue;
91 }
92#endif
85 ret = nb->notifier_call(nb, val, v); 93 ret = nb->notifier_call(nb, val, v);
86 94
87 if (nr_calls) 95 if (nr_calls)
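
CONFIG_DEBUG_NOTIFIERS adds a guard that refuses to jump through a notifier_call pointer that does not point into kernel text, warning and skipping the entry instead. The userspace sketch below only mirrors the shape of the chain walk and the skip-on-bad-callback behaviour; a plain NULL check stands in for func_ptr_is_kernel_text(), which has no userspace equivalent, and all names are illustrative.

    #include <stdio.h>
    #include <stddef.h>

    #define NOTIFY_DONE 0
    #define NOTIFY_STOP 1

    struct notifier {
            int (*call)(struct notifier *nb, unsigned long val, void *data);
            struct notifier *next;
    };

    /* Walk the chain, skip entries whose callback is obviously bad, and
     * stop early if a callback asks for it -- the same shape as
     * notifier_call_chain() above. */
    static int call_chain(struct notifier *nb, unsigned long val, void *data)
    {
            int ret = NOTIFY_DONE;

            while (nb) {
                    struct notifier *next = nb->next;

                    if (!nb->call) {
                            fprintf(stderr, "invalid notifier skipped\n");
                            nb = next;
                            continue;
                    }
                    ret = nb->call(nb, val, data);
                    if (ret == NOTIFY_STOP)
                            break;
                    nb = next;
            }
            return ret;
    }

    static int hello(struct notifier *nb, unsigned long val, void *data)
    {
            printf("event %lu: %s\n", val, (char *)data);
            return NOTIFY_DONE;
    }

    int main(void)
    {
            struct notifier a = { .call = hello, .next = NULL };
            struct notifier bad = { .call = NULL, .next = &a };

            return call_chain(&bad, 1, "module coming") == NOTIFY_DONE ? 0 : 1;
    }
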
diff --git a/kernel/panic.c b/kernel/panic.c
index 6513aac8e992..4d5088355bfe 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -167,6 +167,7 @@ static const struct tnt tnts[] = {
167 * 'M' - System experienced a machine check exception. 167 * 'M' - System experienced a machine check exception.
168 * 'B' - System has hit bad_page. 168 * 'B' - System has hit bad_page.
169 * 'U' - Userspace-defined naughtiness. 169 * 'U' - Userspace-defined naughtiness.
170 * 'D' - Kernel has oopsed before
170 * 'A' - ACPI table overridden. 171 * 'A' - ACPI table overridden.
171 * 'W' - Taint on warning. 172 * 'W' - Taint on warning.
172 * 'C' - modules from drivers/staging are loaded. 173 * 'C' - modules from drivers/staging are loaded.
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 153dcb2639c3..157de3a47832 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -58,21 +58,21 @@ void thread_group_cputime(
58 struct task_struct *tsk, 58 struct task_struct *tsk,
59 struct task_cputime *times) 59 struct task_cputime *times)
60{ 60{
61 struct signal_struct *sig; 61 struct task_cputime *totals, *tot;
62 int i; 62 int i;
63 struct task_cputime *tot;
64 63
65 sig = tsk->signal; 64 totals = tsk->signal->cputime.totals;
66 if (unlikely(!sig) || !sig->cputime.totals) { 65 if (!totals) {
67 times->utime = tsk->utime; 66 times->utime = tsk->utime;
68 times->stime = tsk->stime; 67 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime; 68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return; 69 return;
71 } 70 }
71
72 times->stime = times->utime = cputime_zero; 72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0; 73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) { 74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i); 75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime); 76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime); 77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime; 78 times->sum_exec_runtime += tot->sum_exec_runtime;
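
thread_group_cputime() now loads signal->cputime.totals once and, when group accounting is enabled, folds each CPU's accumulator into a single task_cputime. The plain C sketch below performs the same fold over a fixed-size per-CPU array; the type and field names only mimic the kernel's, and NR_CPUS is an arbitrary constant here.

    #include <stdio.h>

    #define NR_CPUS 4

    struct cputime_totals {
            unsigned long long utime;
            unsigned long long stime;
            unsigned long long sum_exec_runtime;
    };

    /* One accumulator per CPU, as with the kernel's per-CPU allocation. */
    static struct cputime_totals per_cpu_totals[NR_CPUS] = {
            { 10, 5, 100 }, { 20, 5, 200 }, { 0, 0, 0 }, { 7, 3, 70 },
    };

    /* Fold every possible CPU's slot into one result, the way
     * thread_group_cputime() walks for_each_possible_cpu(). */
    static void sum_totals(struct cputime_totals *out)
    {
            int cpu;

            out->utime = out->stime = out->sum_exec_runtime = 0;
            for (cpu = 0; cpu < NR_CPUS; cpu++) {
                    out->utime += per_cpu_totals[cpu].utime;
                    out->stime += per_cpu_totals[cpu].stime;
                    out->sum_exec_runtime += per_cpu_totals[cpu].sum_exec_runtime;
            }
    }

    int main(void)
    {
            struct cputime_totals t;

            sum_totals(&t);
            printf("utime=%llu stime=%llu runtime=%llu\n",
                   t.utime, t.stime, t.sum_exec_runtime);
            return 0;
    }
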
@@ -311,7 +311,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
311 struct task_cputime cputime; 311 struct task_cputime cputime;
312 312
313 thread_group_cputime(p, &cputime); 313 thread_group_cputime(p, &cputime);
314 switch (which_clock) { 314 switch (CPUCLOCK_WHICH(which_clock)) {
315 default: 315 default:
316 return -EINVAL; 316 return -EINVAL;
317 case CPUCLOCK_PROF: 317 case CPUCLOCK_PROF:
@@ -1308,9 +1308,10 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
1308 */ 1308 */
1309static inline int fastpath_timer_check(struct task_struct *tsk) 1309static inline int fastpath_timer_check(struct task_struct *tsk)
1310{ 1310{
1311 struct signal_struct *sig = tsk->signal; 1311 struct signal_struct *sig;
1312 1312
1313 if (unlikely(!sig)) 1313 /* tsk == current, ensure it is safe to use ->signal/sighand */
1314 if (unlikely(tsk->exit_state))
1314 return 0; 1315 return 0;
1315 1316
1316 if (!task_cputime_zero(&tsk->cputime_expires)) { 1317 if (!task_cputime_zero(&tsk->cputime_expires)) {
@@ -1323,6 +1324,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1323 if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) 1324 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1324 return 1; 1325 return 1;
1325 } 1326 }
1327
1328 sig = tsk->signal;
1326 if (!task_cputime_zero(&sig->cputime_expires)) { 1329 if (!task_cputime_zero(&sig->cputime_expires)) {
1327 struct task_cputime group_sample; 1330 struct task_cputime group_sample;
1328 1331
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index c9d74083746f..f77d3819ef57 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,7 +22,6 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/ftrace.h>
26 25
27#include "power.h" 26#include "power.h"
28 27
@@ -257,7 +256,7 @@ static int create_image(int platform_mode)
257 256
258int hibernation_snapshot(int platform_mode) 257int hibernation_snapshot(int platform_mode)
259{ 258{
260 int error, ftrace_save; 259 int error;
261 260
262 /* Free memory before shutting down devices. */ 261 /* Free memory before shutting down devices. */
263 error = swsusp_shrink_memory(); 262 error = swsusp_shrink_memory();
@@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode)
269 goto Close; 268 goto Close;
270 269
271 suspend_console(); 270 suspend_console();
272 ftrace_save = __ftrace_enabled_save();
273 error = device_suspend(PMSG_FREEZE); 271 error = device_suspend(PMSG_FREEZE);
274 if (error) 272 if (error)
275 goto Recover_platform; 273 goto Recover_platform;
@@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode)
299 Resume_devices: 297 Resume_devices:
300 device_resume(in_suspend ? 298 device_resume(in_suspend ?
301 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 299 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
302 __ftrace_enabled_restore(ftrace_save);
303 resume_console(); 300 resume_console();
304 Close: 301 Close:
305 platform_end(platform_mode); 302 platform_end(platform_mode);
@@ -370,11 +367,10 @@ static int resume_target_kernel(void)
370 367
371int hibernation_restore(int platform_mode) 368int hibernation_restore(int platform_mode)
372{ 369{
373 int error, ftrace_save; 370 int error;
374 371
375 pm_prepare_console(); 372 pm_prepare_console();
376 suspend_console(); 373 suspend_console();
377 ftrace_save = __ftrace_enabled_save();
378 error = device_suspend(PMSG_QUIESCE); 374 error = device_suspend(PMSG_QUIESCE);
379 if (error) 375 if (error)
380 goto Finish; 376 goto Finish;
@@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode)
389 platform_restore_cleanup(platform_mode); 385 platform_restore_cleanup(platform_mode);
390 device_resume(PMSG_RECOVER); 386 device_resume(PMSG_RECOVER);
391 Finish: 387 Finish:
392 __ftrace_enabled_restore(ftrace_save);
393 resume_console(); 388 resume_console();
394 pm_restore_console(); 389 pm_restore_console();
395 return error; 390 return error;
@@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode)
402 397
403int hibernation_platform_enter(void) 398int hibernation_platform_enter(void)
404{ 399{
405 int error, ftrace_save; 400 int error;
406 401
407 if (!hibernation_ops) 402 if (!hibernation_ops)
408 return -ENOSYS; 403 return -ENOSYS;
@@ -417,7 +412,6 @@ int hibernation_platform_enter(void)
417 goto Close; 412 goto Close;
418 413
419 suspend_console(); 414 suspend_console();
420 ftrace_save = __ftrace_enabled_save();
421 error = device_suspend(PMSG_HIBERNATE); 415 error = device_suspend(PMSG_HIBERNATE);
422 if (error) { 416 if (error) {
423 if (hibernation_ops->recover) 417 if (hibernation_ops->recover)
@@ -452,7 +446,6 @@ int hibernation_platform_enter(void)
452 hibernation_ops->finish(); 446 hibernation_ops->finish();
453 Resume_devices: 447 Resume_devices:
454 device_resume(PMSG_RESTORE); 448 device_resume(PMSG_RESTORE);
455 __ftrace_enabled_restore(ftrace_save);
456 resume_console(); 449 resume_console();
457 Close: 450 Close:
458 hibernation_ops->end(); 451 hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 19122cf6d827..613f16941b85 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,7 +22,6 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/vmstat.h> 23#include <linux/vmstat.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
25#include <linux/ftrace.h>
26 25
27#include "power.h" 26#include "power.h"
28 27
@@ -174,7 +173,7 @@ static void suspend_test_finish(const char *label)
174 * has some performance issues. The stack dump of a WARN_ON 173 * has some performance issues. The stack dump of a WARN_ON
175 * is more likely to get the right attention than a printk... 174 * is more likely to get the right attention than a printk...
176 */ 175 */
177 WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); 176 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
178} 177}
179 178
180#else 179#else
@@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state)
317 */ 316 */
318int suspend_devices_and_enter(suspend_state_t state) 317int suspend_devices_and_enter(suspend_state_t state)
319{ 318{
320 int error, ftrace_save; 319 int error;
321 320
322 if (!suspend_ops) 321 if (!suspend_ops)
323 return -ENOSYS; 322 return -ENOSYS;
@@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)
328 goto Close; 327 goto Close;
329 } 328 }
330 suspend_console(); 329 suspend_console();
331 ftrace_save = __ftrace_enabled_save();
332 suspend_test_start(); 330 suspend_test_start();
333 error = device_suspend(PMSG_SUSPEND); 331 error = device_suspend(PMSG_SUSPEND);
334 if (error) { 332 if (error) {
@@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)
360 suspend_test_start(); 358 suspend_test_start();
361 device_resume(PMSG_RESUME); 359 device_resume(PMSG_RESUME);
362 suspend_test_finish("resume devices"); 360 suspend_test_finish("resume devices");
363 __ftrace_enabled_restore(ftrace_save);
364 resume_console(); 361 resume_console();
365 Close: 362 Close:
366 if (suspend_ops->end) 363 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b7713b53d07a..6da14358537c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -633,7 +633,7 @@ void swsusp_close(fmode_t mode)
633 return; 633 return;
634 } 634 }
635 635
636 blkdev_put(resume_bdev, mode); /* move up */ 636 blkdev_put(resume_bdev, mode);
637} 637}
638 638
639static int swsusp_header_init(void) 639static int swsusp_header_init(void)
diff --git a/kernel/profile.c b/kernel/profile.c
index 9830a037d8db..60adefb59b5e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -351,7 +351,7 @@ out:
351 put_cpu(); 351 put_cpu();
352} 352}
353 353
354static int __devinit profile_cpu_callback(struct notifier_block *info, 354static int __cpuinit profile_cpu_callback(struct notifier_block *info,
355 unsigned long action, void *__cpu) 355 unsigned long action, void *__cpu)
356{ 356{
357 int node, cpu = (unsigned long)__cpu; 357 int node, cpu = (unsigned long)__cpu;
@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
544}; 544};
545 545
546#ifdef CONFIG_SMP 546#ifdef CONFIG_SMP
547static void __init profile_nop(void *unused) 547static void profile_nop(void *unused)
548{ 548{
549} 549}
550 550
@@ -596,7 +596,7 @@ out_cleanup:
596#define create_hash_tables() ({ 0; }) 596#define create_hash_tables() ({ 0; })
597#endif 597#endif
598 598
599int create_proc_profile(void) 599int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
600{ 600{
601 struct proc_dir_entry *entry; 601 struct proc_dir_entry *entry;
602 602
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1e68e4c39e2c..4c8bcd7dd8e0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
612 return (copied == sizeof(data)) ? 0 : -EIO; 612 return (copied == sizeof(data)) ? 0 : -EIO;
613} 613}
614 614
615#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE 615#if defined CONFIG_COMPAT
616#include <linux/compat.h> 616#include <linux/compat.h>
617 617
618int compat_ptrace_request(struct task_struct *child, compat_long_t request, 618int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -709,4 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
709 unlock_kernel(); 709 unlock_kernel();
710 return ret; 710 return ret;
711} 711}
712#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ 712#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 37f72e551542..c03ca3e61919 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
191 191
192 /* OK, time to rat on our buddy... */ 192 /* OK, time to rat on our buddy... */
193 193
194 printk(KERN_ERR "RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{ 204{
205 unsigned long flags; 205 unsigned long flags;
206 206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", 207 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies, 208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start); 209 jiffies - rcp->gp_start);
210 dump_stack(); 210 dump_stack();
@@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
393 * unnecessarily. 393 * unnecessarily.
394 */ 394 */
395 smp_mb(); 395 smp_mb();
396 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); 396 cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
397 397
398 rcp->signaled = 0; 398 rcp->signaled = 0;
399 } 399 }
diff --git a/kernel/relay.c b/kernel/relay.c
index 8d13a7855c08..09ac2008f77b 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -400,7 +400,7 @@ void relay_reset(struct rchan *chan)
400 } 400 }
401 401
402 mutex_lock(&relay_channels_mutex); 402 mutex_lock(&relay_channels_mutex);
403 for_each_online_cpu(i) 403 for_each_possible_cpu(i)
404 if (chan->buf[i]) 404 if (chan->buf[i])
405 __relay_reset(chan->buf[i], 0); 405 __relay_reset(chan->buf[i], 0);
406 mutex_unlock(&relay_channels_mutex); 406 mutex_unlock(&relay_channels_mutex);
@@ -611,10 +611,9 @@ struct rchan *relay_open(const char *base_filename,
611 return chan; 611 return chan;
612 612
613free_bufs: 613free_bufs:
614 for_each_online_cpu(i) { 614 for_each_possible_cpu(i) {
615 if (!chan->buf[i]) 615 if (chan->buf[i])
616 break; 616 relay_close_buf(chan->buf[i]);
617 relay_close_buf(chan->buf[i]);
618 } 617 }
619 618
620 kref_put(&chan->kref, relay_destroy_channel); 619 kref_put(&chan->kref, relay_destroy_channel);
@@ -1318,12 +1317,9 @@ static ssize_t relay_file_splice_read(struct file *in,
1318 if (ret < 0) 1317 if (ret < 0)
1319 break; 1318 break;
1320 else if (!ret) { 1319 else if (!ret) {
1321 if (spliced) 1320 if (flags & SPLICE_F_NONBLOCK)
1322 break;
1323 if (flags & SPLICE_F_NONBLOCK) {
1324 ret = -EAGAIN; 1321 ret = -EAGAIN;
1325 break; 1322 break;
1326 }
1327 } 1323 }
1328 1324
1329 *ppos += ret; 1325 *ppos += ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index 57c933ffbee1..e00c92d22655 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
118 */ 118 */
119#define RUNTIME_INF ((u64)~0ULL) 119#define RUNTIME_INF ((u64)~0ULL)
120 120
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
121#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
122/* 128/*
123 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 129 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -261,6 +267,10 @@ struct task_group {
261 struct cgroup_subsys_state css; 267 struct cgroup_subsys_state css;
262#endif 268#endif
263 269
270#ifdef CONFIG_USER_SCHED
271 uid_t uid;
272#endif
273
264#ifdef CONFIG_FAIR_GROUP_SCHED 274#ifdef CONFIG_FAIR_GROUP_SCHED
265 /* schedulable entities of this group on each cpu */ 275 /* schedulable entities of this group on each cpu */
266 struct sched_entity **se; 276 struct sched_entity **se;
@@ -286,6 +296,12 @@ struct task_group {
286 296
287#ifdef CONFIG_USER_SCHED 297#ifdef CONFIG_USER_SCHED
288 298
299/* Helper function to pass uid information to create_sched_user() */
300void set_tg_uid(struct user_struct *user)
301{
302 user->tg->uid = user->uid;
303}
304
289/* 305/*
290 * Root task group. 306 * Root task group.
291 * Every UID task group (including init_task_group aka UID-0) will 307 * Every UID task group (including init_task_group aka UID-0) will
@@ -399,7 +415,7 @@ struct cfs_rq {
399 */ 415 */
400 struct sched_entity *curr, *next, *last; 416 struct sched_entity *curr, *next, *last;
401 417
402 unsigned long nr_spread_over; 418 unsigned int nr_spread_over;
403 419
404#ifdef CONFIG_FAIR_GROUP_SCHED 420#ifdef CONFIG_FAIR_GROUP_SCHED
405 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 421 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -481,14 +497,14 @@ struct rt_rq {
481 */ 497 */
482struct root_domain { 498struct root_domain {
483 atomic_t refcount; 499 atomic_t refcount;
484 cpumask_t span; 500 cpumask_var_t span;
485 cpumask_t online; 501 cpumask_var_t online;
486 502
487 /* 503 /*
488 * The "RT overload" flag: it gets set if a CPU has more than 504 * The "RT overload" flag: it gets set if a CPU has more than
489 * one runnable RT task. 505 * one runnable RT task.
490 */ 506 */
491 cpumask_t rto_mask; 507 cpumask_var_t rto_mask;
492 atomic_t rto_count; 508 atomic_t rto_count;
493#ifdef CONFIG_SMP 509#ifdef CONFIG_SMP
494 struct cpupri cpupri; 510 struct cpupri cpupri;
@@ -703,45 +719,18 @@ static __read_mostly char *sched_feat_names[] = {
703 719
704#undef SCHED_FEAT 720#undef SCHED_FEAT
705 721
706static int sched_feat_open(struct inode *inode, struct file *filp) 722static int sched_feat_show(struct seq_file *m, void *v)
707{
708 filp->private_data = inode->i_private;
709 return 0;
710}
711
712static ssize_t
713sched_feat_read(struct file *filp, char __user *ubuf,
714 size_t cnt, loff_t *ppos)
715{ 723{
716 char *buf;
717 int r = 0;
718 int len = 0;
719 int i; 724 int i;
720 725
721 for (i = 0; sched_feat_names[i]; i++) { 726 for (i = 0; sched_feat_names[i]; i++) {
722 len += strlen(sched_feat_names[i]); 727 if (!(sysctl_sched_features & (1UL << i)))
723 len += 4; 728 seq_puts(m, "NO_");
729 seq_printf(m, "%s ", sched_feat_names[i]);
724 } 730 }
731 seq_puts(m, "\n");
725 732
726 buf = kmalloc(len + 2, GFP_KERNEL); 733 return 0;
727 if (!buf)
728 return -ENOMEM;
729
730 for (i = 0; sched_feat_names[i]; i++) {
731 if (sysctl_sched_features & (1UL << i))
732 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
733 else
734 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
735 }
736
737 r += sprintf(buf + r, "\n");
738 WARN_ON(r >= len + 2);
739
740 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
741
742 kfree(buf);
743
744 return r;
745} 734}
746 735
747static ssize_t 736static ssize_t
@@ -786,10 +775,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
786 return cnt; 775 return cnt;
787} 776}
788 777
778static int sched_feat_open(struct inode *inode, struct file *filp)
779{
780 return single_open(filp, sched_feat_show, NULL);
781}
782
789static struct file_operations sched_feat_fops = { 783static struct file_operations sched_feat_fops = {
790 .open = sched_feat_open, 784 .open = sched_feat_open,
791 .read = sched_feat_read, 785 .write = sched_feat_write,
792 .write = sched_feat_write, 786 .read = seq_read,
787 .llseek = seq_lseek,
788 .release = single_release,
793}; 789};
794 790
795static __init int sched_init_debug(void) 791static __init int sched_init_debug(void)
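
The sched_features control file drops its hand-rolled kmalloc/sprintf read path in favour of the seq_file single_open() helpers, which take care of buffering, partial reads and llseek. A minimal kernel-style sketch of that pattern for a hypothetical read-only file is below; it is a fragment showing the wiring, not a complete buildable unit, and the file contents are made up.

    #include <linux/seq_file.h>
    #include <linux/fs.h>

    static int example_show(struct seq_file *m, void *v)
    {
            /* Emit the whole file in one go; seq_file does the buffering. */
            seq_printf(m, "FEATURE_A NO_FEATURE_B\n");
            return 0;
    }

    static int example_open(struct inode *inode, struct file *filp)
    {
            return single_open(filp, example_show, NULL);
    }

    static const struct file_operations example_fops = {
            .open    = example_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };
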
@@ -969,6 +965,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
969 } 965 }
970} 966}
971 967
968void task_rq_unlock_wait(struct task_struct *p)
969{
970 struct rq *rq = task_rq(p);
971
972 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
973 spin_unlock_wait(&rq->lock);
974}
975
972static void __task_rq_unlock(struct rq *rq) 976static void __task_rq_unlock(struct rq *rq)
973 __releases(rq->lock) 977 __releases(rq->lock)
974{ 978{
@@ -1445,9 +1449,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1445static unsigned long cpu_avg_load_per_task(int cpu) 1449static unsigned long cpu_avg_load_per_task(int cpu)
1446{ 1450{
1447 struct rq *rq = cpu_rq(cpu); 1451 struct rq *rq = cpu_rq(cpu);
1452 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1448 1453
1449 if (rq->nr_running) 1454 if (nr_running)
1450 rq->avg_load_per_task = rq->load.weight / rq->nr_running; 1455 rq->avg_load_per_task = rq->load.weight / nr_running;
1456 else
1457 rq->avg_load_per_task = 0;
1451 1458
1452 return rq->avg_load_per_task; 1459 return rq->avg_load_per_task;
1453} 1460}
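
cpu_avg_load_per_task() now reads rq->nr_running exactly once through ACCESS_ONCE, so the zero test and the division use the same snapshot even if another CPU changes the counter in between; re-reading it between the two uses would reintroduce a divide-by-zero window. A userspace sketch of the same idea, with a C11 relaxed atomic load standing in for ACCESS_ONCE and made-up variable names:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_ulong nr_running;
    static unsigned long load_weight = 1024;

    /* Read the shared counter exactly once into a local, then test and
     * divide that local copy -- never re-read between the two uses. */
    static unsigned long avg_load_per_task(void)
    {
            unsigned long n = atomic_load_explicit(&nr_running,
                                                   memory_order_relaxed);

            return n ? load_weight / n : 0;
    }

    int main(void)
    {
            atomic_store(&nr_running, 3);
            printf("avg=%lu\n", avg_load_per_task());
            atomic_store(&nr_running, 0);
            printf("avg=%lu\n", avg_load_per_task());
            return 0;
    }
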
@@ -1463,27 +1470,13 @@ static void
1463update_group_shares_cpu(struct task_group *tg, int cpu, 1470update_group_shares_cpu(struct task_group *tg, int cpu,
1464 unsigned long sd_shares, unsigned long sd_rq_weight) 1471 unsigned long sd_shares, unsigned long sd_rq_weight)
1465{ 1472{
1466 int boost = 0;
1467 unsigned long shares; 1473 unsigned long shares;
1468 unsigned long rq_weight; 1474 unsigned long rq_weight;
1469 1475
1470 if (!tg->se[cpu]) 1476 if (!tg->se[cpu])
1471 return; 1477 return;
1472 1478
1473 rq_weight = tg->cfs_rq[cpu]->load.weight; 1479 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1474
1475 /*
1476 * If there are currently no tasks on the cpu pretend there is one of
1477 * average load so that when a new task gets to run here it will not
1478 * get delayed by group starvation.
1479 */
1480 if (!rq_weight) {
1481 boost = 1;
1482 rq_weight = NICE_0_LOAD;
1483 }
1484
1485 if (unlikely(rq_weight > sd_rq_weight))
1486 rq_weight = sd_rq_weight;
1487 1480
1488 /* 1481 /*
1489 * \Sum shares * rq_weight 1482 * \Sum shares * rq_weight
@@ -1491,7 +1484,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1491 * \Sum rq_weight 1484 * \Sum rq_weight
1492 * 1485 *
1493 */ 1486 */
1494 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1487 shares = (sd_shares * rq_weight) / sd_rq_weight;
1495 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1488 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1496 1489
1497 if (abs(shares - tg->se[cpu]->load.weight) > 1490 if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1500,11 +1493,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1500 unsigned long flags; 1493 unsigned long flags;
1501 1494
1502 spin_lock_irqsave(&rq->lock, flags); 1495 spin_lock_irqsave(&rq->lock, flags);
1503 /* 1496 tg->cfs_rq[cpu]->shares = shares;
1504 * record the actual number of shares, not the boosted amount.
1505 */
1506 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1507 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1508 1497
1509 __set_se_shares(tg->se[cpu], shares); 1498 __set_se_shares(tg->se[cpu], shares);
1510 spin_unlock_irqrestore(&rq->lock, flags); 1499 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1518,13 +1507,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1518 */ 1507 */
1519static int tg_shares_up(struct task_group *tg, void *data) 1508static int tg_shares_up(struct task_group *tg, void *data)
1520{ 1509{
1521 unsigned long rq_weight = 0; 1510 unsigned long weight, rq_weight = 0;
1522 unsigned long shares = 0; 1511 unsigned long shares = 0;
1523 struct sched_domain *sd = data; 1512 struct sched_domain *sd = data;
1524 int i; 1513 int i;
1525 1514
1526 for_each_cpu_mask(i, sd->span) { 1515 for_each_cpu(i, sched_domain_span(sd)) {
1527 rq_weight += tg->cfs_rq[i]->load.weight; 1516 /*
1517 * If there are currently no tasks on the cpu pretend there
1518 * is one of average load so that when a new task gets to
1519 * run here it will not get delayed by group starvation.
1520 */
1521 weight = tg->cfs_rq[i]->load.weight;
1522 if (!weight)
1523 weight = NICE_0_LOAD;
1524
1525 tg->cfs_rq[i]->rq_weight = weight;
1526 rq_weight += weight;
1528 shares += tg->cfs_rq[i]->shares; 1527 shares += tg->cfs_rq[i]->shares;
1529 } 1528 }
1530 1529
@@ -1534,10 +1533,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1534 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1533 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1535 shares = tg->shares; 1534 shares = tg->shares;
1536 1535
1537 if (!rq_weight) 1536 for_each_cpu(i, sched_domain_span(sd))
1538 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1539
1540 for_each_cpu_mask(i, sd->span)
1541 update_group_shares_cpu(tg, i, shares, rq_weight); 1537 update_group_shares_cpu(tg, i, shares, rq_weight);
1542 1538
1543 return 0; 1539 return 0;
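
update_group_shares_cpu() computes each CPU's slice as \Sum shares * rq_weight / \Sum rq_weight (see the comment retained a few hunks up), and tg_shares_up() now substitutes NICE_0_LOAD for idle CPUs when it builds rq_weight instead of boosting afterwards. A small worked example of that arithmetic; the clamp bounds mirror the usual MIN_SHARES/MAX_SHARES values but are placeholders here, not read from this diff.

    #include <stdio.h>

    #define MIN_SHARES 2UL
    #define MAX_SHARES (1UL << 18)

    static unsigned long clampul(unsigned long v, unsigned long lo,
                                 unsigned long hi)
    {
            return v < lo ? lo : (v > hi ? hi : v);
    }

    /* shares_i = total_shares * rq_weight_i / sum(rq_weight), clamped. */
    static unsigned long cpu_shares(unsigned long total_shares,
                                    unsigned long rq_weight,
                                    unsigned long sum_rq_weight)
    {
            return clampul(total_shares * rq_weight / sum_rq_weight,
                           MIN_SHARES, MAX_SHARES);
    }

    int main(void)
    {
            /* A 1024-share group spread over CPUs weighted 3072 and 1024:
             * the first CPU gets 768 shares, the second 256. */
            printf("%lu %lu\n",
                   cpu_shares(1024, 3072, 4096),
                   cpu_shares(1024, 1024, 4096));
            return 0;
    }
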
@@ -1601,6 +1597,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1601 1597
1602#endif 1598#endif
1603 1599
1600/*
1601 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1602 */
1603static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1604 __releases(this_rq->lock)
1605 __acquires(busiest->lock)
1606 __acquires(this_rq->lock)
1607{
1608 int ret = 0;
1609
1610 if (unlikely(!irqs_disabled())) {
 1611 /* printk() doesn't work well under rq->lock */
1612 spin_unlock(&this_rq->lock);
1613 BUG_ON(1);
1614 }
1615 if (unlikely(!spin_trylock(&busiest->lock))) {
1616 if (busiest < this_rq) {
1617 spin_unlock(&this_rq->lock);
1618 spin_lock(&busiest->lock);
1619 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1620 ret = 1;
1621 } else
1622 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1623 }
1624 return ret;
1625}
1626
1627static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1628 __releases(busiest->lock)
1629{
1630 spin_unlock(&busiest->lock);
1631 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1632}
1604#endif 1633#endif
1605 1634
1606#ifdef CONFIG_FAIR_GROUP_SCHED 1635#ifdef CONFIG_FAIR_GROUP_SCHED
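
double_lock_balance(), relocated in this hunk, avoids an AB-BA deadlock between two runqueue locks: it trylocks the busiest runqueue and, if that fails, drops this_rq->lock and re-acquires both in a fixed order (lowest address first), telling the caller that the lock was dropped. A userspace pthreads sketch of the same discipline, with illustrative names and the address comparison done via uintptr_t:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t rq_a = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t rq_b = PTHREAD_MUTEX_INITIALIZER;

    /* Caller already holds 'held'.  Returns 1 if 'held' was dropped and
     * re-taken (so any state read under it may be stale), 0 otherwise --
     * the same contract double_lock_balance() gives its callers. */
    static int double_lock(pthread_mutex_t *held, pthread_mutex_t *busiest)
    {
            if (pthread_mutex_trylock(busiest) == 0)
                    return 0;

            if ((uintptr_t)busiest < (uintptr_t)held) {
                    /* Re-acquire in address order so every path agrees. */
                    pthread_mutex_unlock(held);
                    pthread_mutex_lock(busiest);
                    pthread_mutex_lock(held);
                    return 1;
            }
            pthread_mutex_lock(busiest);
            return 0;
    }

    int main(void)
    {
            int dropped;

            pthread_mutex_lock(&rq_a);
            dropped = double_lock(&rq_a, &rq_b);
            printf("held lock dropped and retaken: %d\n", dropped);
            pthread_mutex_unlock(&rq_b);
            pthread_mutex_unlock(&rq_a);
            return 0;
    }
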
@@ -2068,15 +2097,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2068 int i; 2097 int i;
2069 2098
2070 /* Skip over this group if it has no CPUs allowed */ 2099 /* Skip over this group if it has no CPUs allowed */
2071 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 2100 if (!cpumask_intersects(sched_group_cpus(group),
2101 &p->cpus_allowed))
2072 continue; 2102 continue;
2073 2103
2074 local_group = cpu_isset(this_cpu, group->cpumask); 2104 local_group = cpumask_test_cpu(this_cpu,
2105 sched_group_cpus(group));
2075 2106
2076 /* Tally up the load of all CPUs in the group */ 2107 /* Tally up the load of all CPUs in the group */
2077 avg_load = 0; 2108 avg_load = 0;
2078 2109
2079 for_each_cpu_mask_nr(i, group->cpumask) { 2110 for_each_cpu(i, sched_group_cpus(group)) {
2080 /* Bias balancing toward cpus of our domain */ 2111 /* Bias balancing toward cpus of our domain */
2081 if (local_group) 2112 if (local_group)
2082 load = source_load(i, load_idx); 2113 load = source_load(i, load_idx);
@@ -2108,17 +2139,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2108 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2139 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2109 */ 2140 */
2110static int 2141static int
2111find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, 2142find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2112 cpumask_t *tmp)
2113{ 2143{
2114 unsigned long load, min_load = ULONG_MAX; 2144 unsigned long load, min_load = ULONG_MAX;
2115 int idlest = -1; 2145 int idlest = -1;
2116 int i; 2146 int i;
2117 2147
2118 /* Traverse only the allowed CPUs */ 2148 /* Traverse only the allowed CPUs */
2119 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2149 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2120
2121 for_each_cpu_mask_nr(i, *tmp) {
2122 load = weighted_cpuload(i); 2150 load = weighted_cpuload(i);
2123 2151
2124 if (load < min_load || (load == min_load && i == this_cpu)) { 2152 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2160,7 +2188,6 @@ static int sched_balance_self(int cpu, int flag)
2160 update_shares(sd); 2188 update_shares(sd);
2161 2189
2162 while (sd) { 2190 while (sd) {
2163 cpumask_t span, tmpmask;
2164 struct sched_group *group; 2191 struct sched_group *group;
2165 int new_cpu, weight; 2192 int new_cpu, weight;
2166 2193
@@ -2169,14 +2196,13 @@ static int sched_balance_self(int cpu, int flag)
2169 continue; 2196 continue;
2170 } 2197 }
2171 2198
2172 span = sd->span;
2173 group = find_idlest_group(sd, t, cpu); 2199 group = find_idlest_group(sd, t, cpu);
2174 if (!group) { 2200 if (!group) {
2175 sd = sd->child; 2201 sd = sd->child;
2176 continue; 2202 continue;
2177 } 2203 }
2178 2204
2179 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); 2205 new_cpu = find_idlest_cpu(group, t, cpu);
2180 if (new_cpu == -1 || new_cpu == cpu) { 2206 if (new_cpu == -1 || new_cpu == cpu) {
2181 /* Now try balancing at a lower domain level of cpu */ 2207 /* Now try balancing at a lower domain level of cpu */
2182 sd = sd->child; 2208 sd = sd->child;
@@ -2185,10 +2211,10 @@ static int sched_balance_self(int cpu, int flag)
2185 2211
2186 /* Now try balancing at a lower domain level of new_cpu */ 2212 /* Now try balancing at a lower domain level of new_cpu */
2187 cpu = new_cpu; 2213 cpu = new_cpu;
2214 weight = cpumask_weight(sched_domain_span(sd));
2188 sd = NULL; 2215 sd = NULL;
2189 weight = cpus_weight(span);
2190 for_each_domain(cpu, tmp) { 2216 for_each_domain(cpu, tmp) {
2191 if (weight <= cpus_weight(tmp->span)) 2217 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2192 break; 2218 break;
2193 if (tmp->flags & flag) 2219 if (tmp->flags & flag)
2194 sd = tmp; 2220 sd = tmp;
@@ -2233,7 +2259,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2233 cpu = task_cpu(p); 2259 cpu = task_cpu(p);
2234 2260
2235 for_each_domain(this_cpu, sd) { 2261 for_each_domain(this_cpu, sd) {
2236 if (cpu_isset(cpu, sd->span)) { 2262 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2237 update_shares(sd); 2263 update_shares(sd);
2238 break; 2264 break;
2239 } 2265 }
@@ -2281,7 +2307,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2281 else { 2307 else {
2282 struct sched_domain *sd; 2308 struct sched_domain *sd;
2283 for_each_domain(this_cpu, sd) { 2309 for_each_domain(this_cpu, sd) {
2284 if (cpu_isset(cpu, sd->span)) { 2310 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2285 schedstat_inc(sd, ttwu_wake_remote); 2311 schedstat_inc(sd, ttwu_wake_remote);
2286 break; 2312 break;
2287 } 2313 }
@@ -2801,40 +2827,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2801} 2827}
2802 2828
2803/* 2829/*
2804 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2805 */
2806static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2807 __releases(this_rq->lock)
2808 __acquires(busiest->lock)
2809 __acquires(this_rq->lock)
2810{
2811 int ret = 0;
2812
2813 if (unlikely(!irqs_disabled())) {
2814 /* printk() doesn't work good under rq->lock */
2815 spin_unlock(&this_rq->lock);
2816 BUG_ON(1);
2817 }
2818 if (unlikely(!spin_trylock(&busiest->lock))) {
2819 if (busiest < this_rq) {
2820 spin_unlock(&this_rq->lock);
2821 spin_lock(&busiest->lock);
2822 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2823 ret = 1;
2824 } else
2825 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2826 }
2827 return ret;
2828}
2829
2830static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2831 __releases(busiest->lock)
2832{
2833 spin_unlock(&busiest->lock);
2834 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2835}
2836
2837/*
2838 * If dest_cpu is allowed for this process, migrate the task to it. 2830 * If dest_cpu is allowed for this process, migrate the task to it.
2839 * This is accomplished by forcing the cpu_allowed mask to only 2831 * This is accomplished by forcing the cpu_allowed mask to only
2840 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 2832 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -2847,7 +2839,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2847 struct rq *rq; 2839 struct rq *rq;
2848 2840
2849 rq = task_rq_lock(p, &flags); 2841 rq = task_rq_lock(p, &flags);
2850 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2842 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2851 || unlikely(!cpu_active(dest_cpu))) 2843 || unlikely(!cpu_active(dest_cpu)))
2852 goto out; 2844 goto out;
2853 2845
@@ -2913,7 +2905,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2913 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2905 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2914 * 3) are cache-hot on their current CPU. 2906 * 3) are cache-hot on their current CPU.
2915 */ 2907 */
2916 if (!cpu_isset(this_cpu, p->cpus_allowed)) { 2908 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
2917 schedstat_inc(p, se.nr_failed_migrations_affine); 2909 schedstat_inc(p, se.nr_failed_migrations_affine);
2918 return 0; 2910 return 0;
2919 } 2911 }
@@ -3088,7 +3080,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3088static struct sched_group * 3080static struct sched_group *
3089find_busiest_group(struct sched_domain *sd, int this_cpu, 3081find_busiest_group(struct sched_domain *sd, int this_cpu,
3090 unsigned long *imbalance, enum cpu_idle_type idle, 3082 unsigned long *imbalance, enum cpu_idle_type idle,
3091 int *sd_idle, const cpumask_t *cpus, int *balance) 3083 int *sd_idle, const struct cpumask *cpus, int *balance)
3092{ 3084{
3093 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3085 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3094 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3086 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3124,10 +3116,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3124 unsigned long sum_avg_load_per_task; 3116 unsigned long sum_avg_load_per_task;
3125 unsigned long avg_load_per_task; 3117 unsigned long avg_load_per_task;
3126 3118
3127 local_group = cpu_isset(this_cpu, group->cpumask); 3119 local_group = cpumask_test_cpu(this_cpu,
3120 sched_group_cpus(group));
3128 3121
3129 if (local_group) 3122 if (local_group)
3130 balance_cpu = first_cpu(group->cpumask); 3123 balance_cpu = cpumask_first(sched_group_cpus(group));
3131 3124
3132 /* Tally up the load of all CPUs in the group */ 3125 /* Tally up the load of all CPUs in the group */
3133 sum_weighted_load = sum_nr_running = avg_load = 0; 3126 sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3136,13 +3129,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3136 max_cpu_load = 0; 3129 max_cpu_load = 0;
3137 min_cpu_load = ~0UL; 3130 min_cpu_load = ~0UL;
3138 3131
3139 for_each_cpu_mask_nr(i, group->cpumask) { 3132 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3140 struct rq *rq; 3133 struct rq *rq = cpu_rq(i);
3141
3142 if (!cpu_isset(i, *cpus))
3143 continue;
3144
3145 rq = cpu_rq(i);
3146 3134
3147 if (*sd_idle && rq->nr_running) 3135 if (*sd_idle && rq->nr_running)
3148 *sd_idle = 0; 3136 *sd_idle = 0;
@@ -3253,8 +3241,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3253 */ 3241 */
3254 if ((sum_nr_running < min_nr_running) || 3242 if ((sum_nr_running < min_nr_running) ||
3255 (sum_nr_running == min_nr_running && 3243 (sum_nr_running == min_nr_running &&
3256 first_cpu(group->cpumask) < 3244 cpumask_first(sched_group_cpus(group)) <
3257 first_cpu(group_min->cpumask))) { 3245 cpumask_first(sched_group_cpus(group_min)))) {
3258 group_min = group; 3246 group_min = group;
3259 min_nr_running = sum_nr_running; 3247 min_nr_running = sum_nr_running;
3260 min_load_per_task = sum_weighted_load / 3248 min_load_per_task = sum_weighted_load /
@@ -3269,8 +3257,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3269 if (sum_nr_running <= group_capacity - 1) { 3257 if (sum_nr_running <= group_capacity - 1) {
3270 if (sum_nr_running > leader_nr_running || 3258 if (sum_nr_running > leader_nr_running ||
3271 (sum_nr_running == leader_nr_running && 3259 (sum_nr_running == leader_nr_running &&
3272 first_cpu(group->cpumask) > 3260 cpumask_first(sched_group_cpus(group)) >
3273 first_cpu(group_leader->cpumask))) { 3261 cpumask_first(sched_group_cpus(group_leader)))) {
3274 group_leader = group; 3262 group_leader = group;
3275 leader_nr_running = sum_nr_running; 3263 leader_nr_running = sum_nr_running;
3276 } 3264 }
@@ -3409,16 +3397,16 @@ ret:
3409 */ 3397 */
3410static struct rq * 3398static struct rq *
3411find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3399find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3412 unsigned long imbalance, const cpumask_t *cpus) 3400 unsigned long imbalance, const struct cpumask *cpus)
3413{ 3401{
3414 struct rq *busiest = NULL, *rq; 3402 struct rq *busiest = NULL, *rq;
3415 unsigned long max_load = 0; 3403 unsigned long max_load = 0;
3416 int i; 3404 int i;
3417 3405
3418 for_each_cpu_mask_nr(i, group->cpumask) { 3406 for_each_cpu(i, sched_group_cpus(group)) {
3419 unsigned long wl; 3407 unsigned long wl;
3420 3408
3421 if (!cpu_isset(i, *cpus)) 3409 if (!cpumask_test_cpu(i, cpus))
3422 continue; 3410 continue;
3423 3411
3424 rq = cpu_rq(i); 3412 rq = cpu_rq(i);
@@ -3448,7 +3436,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3448 */ 3436 */
3449static int load_balance(int this_cpu, struct rq *this_rq, 3437static int load_balance(int this_cpu, struct rq *this_rq,
3450 struct sched_domain *sd, enum cpu_idle_type idle, 3438 struct sched_domain *sd, enum cpu_idle_type idle,
3451 int *balance, cpumask_t *cpus) 3439 int *balance, struct cpumask *cpus)
3452{ 3440{
3453 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3441 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3454 struct sched_group *group; 3442 struct sched_group *group;
@@ -3456,7 +3444,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3456 struct rq *busiest; 3444 struct rq *busiest;
3457 unsigned long flags; 3445 unsigned long flags;
3458 3446
3459 cpus_setall(*cpus); 3447 cpumask_setall(cpus);
3460 3448
3461 /* 3449 /*
3462 * When power savings policy is enabled for the parent domain, idle 3450 * When power savings policy is enabled for the parent domain, idle
@@ -3516,8 +3504,8 @@ redo:
3516 3504
3517 /* All tasks on this runqueue were pinned by CPU affinity */ 3505 /* All tasks on this runqueue were pinned by CPU affinity */
3518 if (unlikely(all_pinned)) { 3506 if (unlikely(all_pinned)) {
3519 cpu_clear(cpu_of(busiest), *cpus); 3507 cpumask_clear_cpu(cpu_of(busiest), cpus);
3520 if (!cpus_empty(*cpus)) 3508 if (!cpumask_empty(cpus))
3521 goto redo; 3509 goto redo;
3522 goto out_balanced; 3510 goto out_balanced;
3523 } 3511 }
@@ -3534,7 +3522,8 @@ redo:
3534 /* don't kick the migration_thread, if the curr 3522 /* don't kick the migration_thread, if the curr
3535 * task on busiest cpu can't be moved to this_cpu 3523 * task on busiest cpu can't be moved to this_cpu
3536 */ 3524 */
3537 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 3525 if (!cpumask_test_cpu(this_cpu,
3526 &busiest->curr->cpus_allowed)) {
3538 spin_unlock_irqrestore(&busiest->lock, flags); 3527 spin_unlock_irqrestore(&busiest->lock, flags);
3539 all_pinned = 1; 3528 all_pinned = 1;
3540 goto out_one_pinned; 3529 goto out_one_pinned;
@@ -3609,7 +3598,7 @@ out:
3609 */ 3598 */
3610static int 3599static int
3611load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 3600load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3612 cpumask_t *cpus) 3601 struct cpumask *cpus)
3613{ 3602{
3614 struct sched_group *group; 3603 struct sched_group *group;
3615 struct rq *busiest = NULL; 3604 struct rq *busiest = NULL;
@@ -3618,7 +3607,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3618 int sd_idle = 0; 3607 int sd_idle = 0;
3619 int all_pinned = 0; 3608 int all_pinned = 0;
3620 3609
3621 cpus_setall(*cpus); 3610 cpumask_setall(cpus);
3622 3611
3623 /* 3612 /*
3624 * When power savings policy is enabled for the parent domain, idle 3613 * When power savings policy is enabled for the parent domain, idle
@@ -3662,8 +3651,8 @@ redo:
3662 double_unlock_balance(this_rq, busiest); 3651 double_unlock_balance(this_rq, busiest);
3663 3652
3664 if (unlikely(all_pinned)) { 3653 if (unlikely(all_pinned)) {
3665 cpu_clear(cpu_of(busiest), *cpus); 3654 cpumask_clear_cpu(cpu_of(busiest), cpus);
3666 if (!cpus_empty(*cpus)) 3655 if (!cpumask_empty(cpus))
3667 goto redo; 3656 goto redo;
3668 } 3657 }
3669 } 3658 }
@@ -3696,9 +3685,12 @@ out_balanced:
3696static void idle_balance(int this_cpu, struct rq *this_rq) 3685static void idle_balance(int this_cpu, struct rq *this_rq)
3697{ 3686{
3698 struct sched_domain *sd; 3687 struct sched_domain *sd;
3699 int pulled_task = -1; 3688 int pulled_task = 0;
3700 unsigned long next_balance = jiffies + HZ; 3689 unsigned long next_balance = jiffies + HZ;
3701 cpumask_t tmpmask; 3690 cpumask_var_t tmpmask;
3691
3692 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3693 return;
3702 3694
3703 for_each_domain(this_cpu, sd) { 3695 for_each_domain(this_cpu, sd) {
3704 unsigned long interval; 3696 unsigned long interval;
@@ -3709,7 +3701,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3709 if (sd->flags & SD_BALANCE_NEWIDLE) 3701 if (sd->flags & SD_BALANCE_NEWIDLE)
3710 /* If we've pulled tasks over stop searching: */ 3702 /* If we've pulled tasks over stop searching: */
3711 pulled_task = load_balance_newidle(this_cpu, this_rq, 3703 pulled_task = load_balance_newidle(this_cpu, this_rq,
3712 sd, &tmpmask); 3704 sd, tmpmask);
3713 3705
3714 interval = msecs_to_jiffies(sd->balance_interval); 3706 interval = msecs_to_jiffies(sd->balance_interval);
3715 if (time_after(next_balance, sd->last_balance + interval)) 3707 if (time_after(next_balance, sd->last_balance + interval))
@@ -3724,6 +3716,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3724 */ 3716 */
3725 this_rq->next_balance = next_balance; 3717 this_rq->next_balance = next_balance;
3726 } 3718 }
3719 free_cpumask_var(tmpmask);
3727} 3720}
3728 3721
3729/* 3722/*
@@ -3761,7 +3754,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3761 /* Search for an sd spanning us and the target CPU. */ 3754 /* Search for an sd spanning us and the target CPU. */
3762 for_each_domain(target_cpu, sd) { 3755 for_each_domain(target_cpu, sd) {
3763 if ((sd->flags & SD_LOAD_BALANCE) && 3756 if ((sd->flags & SD_LOAD_BALANCE) &&
3764 cpu_isset(busiest_cpu, sd->span)) 3757 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3765 break; 3758 break;
3766 } 3759 }
3767 3760
@@ -3780,10 +3773,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3780#ifdef CONFIG_NO_HZ 3773#ifdef CONFIG_NO_HZ
3781static struct { 3774static struct {
3782 atomic_t load_balancer; 3775 atomic_t load_balancer;
3783 cpumask_t cpu_mask; 3776 cpumask_var_t cpu_mask;
3784} nohz ____cacheline_aligned = { 3777} nohz ____cacheline_aligned = {
3785 .load_balancer = ATOMIC_INIT(-1), 3778 .load_balancer = ATOMIC_INIT(-1),
3786 .cpu_mask = CPU_MASK_NONE,
3787}; 3779};
3788 3780
3789/* 3781/*
@@ -3811,7 +3803,7 @@ int select_nohz_load_balancer(int stop_tick)
3811 int cpu = smp_processor_id(); 3803 int cpu = smp_processor_id();
3812 3804
3813 if (stop_tick) { 3805 if (stop_tick) {
3814 cpu_set(cpu, nohz.cpu_mask); 3806 cpumask_set_cpu(cpu, nohz.cpu_mask);
3815 cpu_rq(cpu)->in_nohz_recently = 1; 3807 cpu_rq(cpu)->in_nohz_recently = 1;
3816 3808
3817 /* 3809 /*
@@ -3825,7 +3817,7 @@ int select_nohz_load_balancer(int stop_tick)
3825 } 3817 }
3826 3818
3827 /* time for ilb owner also to sleep */ 3819 /* time for ilb owner also to sleep */
3828 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3820 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3829 if (atomic_read(&nohz.load_balancer) == cpu) 3821 if (atomic_read(&nohz.load_balancer) == cpu)
3830 atomic_set(&nohz.load_balancer, -1); 3822 atomic_set(&nohz.load_balancer, -1);
3831 return 0; 3823 return 0;
@@ -3838,10 +3830,10 @@ int select_nohz_load_balancer(int stop_tick)
3838 } else if (atomic_read(&nohz.load_balancer) == cpu) 3830 } else if (atomic_read(&nohz.load_balancer) == cpu)
3839 return 1; 3831 return 1;
3840 } else { 3832 } else {
3841 if (!cpu_isset(cpu, nohz.cpu_mask)) 3833 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3842 return 0; 3834 return 0;
3843 3835
3844 cpu_clear(cpu, nohz.cpu_mask); 3836 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3845 3837
3846 if (atomic_read(&nohz.load_balancer) == cpu) 3838 if (atomic_read(&nohz.load_balancer) == cpu)
3847 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3839 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3869,7 +3861,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3869 unsigned long next_balance = jiffies + 60*HZ; 3861 unsigned long next_balance = jiffies + 60*HZ;
3870 int update_next_balance = 0; 3862 int update_next_balance = 0;
3871 int need_serialize; 3863 int need_serialize;
3872 cpumask_t tmp; 3864 cpumask_var_t tmp;
3865
3866 /* Fails alloc? Rebalancing probably not a priority right now. */
3867 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3868 return;
3873 3869
3874 for_each_domain(cpu, sd) { 3870 for_each_domain(cpu, sd) {
3875 if (!(sd->flags & SD_LOAD_BALANCE)) 3871 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3894,7 +3890,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3894 } 3890 }
3895 3891
3896 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3892 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3897 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { 3893 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
3898 /* 3894 /*
3899 * We've pulled tasks over so either we're no 3895 * We've pulled tasks over so either we're no
3900 * longer idle, or one of our SMT siblings is 3896 * longer idle, or one of our SMT siblings is
@@ -3928,6 +3924,8 @@ out:
3928 */ 3924 */
3929 if (likely(update_next_balance)) 3925 if (likely(update_next_balance))
3930 rq->next_balance = next_balance; 3926 rq->next_balance = next_balance;
3927
3928 free_cpumask_var(tmp);
3931} 3929}
3932 3930
3933/* 3931/*
@@ -3952,12 +3950,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3952 */ 3950 */
3953 if (this_rq->idle_at_tick && 3951 if (this_rq->idle_at_tick &&
3954 atomic_read(&nohz.load_balancer) == this_cpu) { 3952 atomic_read(&nohz.load_balancer) == this_cpu) {
3955 cpumask_t cpus = nohz.cpu_mask;
3956 struct rq *rq; 3953 struct rq *rq;
3957 int balance_cpu; 3954 int balance_cpu;
3958 3955
3959 cpu_clear(this_cpu, cpus); 3956 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3960 for_each_cpu_mask_nr(balance_cpu, cpus) { 3957 if (balance_cpu == this_cpu)
3958 continue;
3959
3961 /* 3960 /*
3962 * If this cpu gets work to do, stop the load balancing 3961 * If this cpu gets work to do, stop the load balancing
3963 * work being done for other cpus. Next load 3962 * work being done for other cpus. Next load
@@ -3995,7 +3994,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
3995 rq->in_nohz_recently = 0; 3994 rq->in_nohz_recently = 0;
3996 3995
3997 if (atomic_read(&nohz.load_balancer) == cpu) { 3996 if (atomic_read(&nohz.load_balancer) == cpu) {
3998 cpu_clear(cpu, nohz.cpu_mask); 3997 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3999 atomic_set(&nohz.load_balancer, -1); 3998 atomic_set(&nohz.load_balancer, -1);
4000 } 3999 }
4001 4000
@@ -4008,7 +4007,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4008 * TBD: Traverse the sched domains and nominate 4007 * TBD: Traverse the sched domains and nominate
4009 * the nearest cpu in the nohz.cpu_mask. 4008 * the nearest cpu in the nohz.cpu_mask.
4010 */ 4009 */
4011 int ilb = first_cpu(nohz.cpu_mask); 4010 int ilb = cpumask_first(nohz.cpu_mask);
4012 4011
4013 if (ilb < nr_cpu_ids) 4012 if (ilb < nr_cpu_ids)
4014 resched_cpu(ilb); 4013 resched_cpu(ilb);
@@ -4020,7 +4019,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4020 * cpus with ticks stopped, is it time for that to stop? 4019 * cpus with ticks stopped, is it time for that to stop?
4021 */ 4020 */
4022 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4021 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4023 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 4022 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4024 resched_cpu(cpu); 4023 resched_cpu(cpu);
4025 return; 4024 return;
4026 } 4025 }
@@ -4030,7 +4029,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4030 * someone else, then no need raise the SCHED_SOFTIRQ 4029 * someone else, then no need raise the SCHED_SOFTIRQ
4031 */ 4030 */
4032 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4031 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4033 cpu_isset(cpu, nohz.cpu_mask)) 4032 cpumask_test_cpu(cpu, nohz.cpu_mask))
4034 return; 4033 return;
4035#endif 4034#endif
4036 if (time_after_eq(jiffies, rq->next_balance)) 4035 if (time_after_eq(jiffies, rq->next_balance))
@@ -4192,7 +4191,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4192 4191
4193 if (p == rq->idle) { 4192 if (p == rq->idle) {
4194 p->stime = cputime_add(p->stime, steal); 4193 p->stime = cputime_add(p->stime, steal);
4195 account_group_system_time(p, steal);
4196 if (atomic_read(&rq->nr_iowait) > 0) 4194 if (atomic_read(&rq->nr_iowait) > 0)
4197 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4195 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4198 else 4196 else
@@ -4328,7 +4326,7 @@ void __kprobes sub_preempt_count(int val)
4328 /* 4326 /*
4329 * Underflow? 4327 * Underflow?
4330 */ 4328 */
4331 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4329 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4332 return; 4330 return;
4333 /* 4331 /*
4334 * Is the spinlock portion underflowing? 4332 * Is the spinlock portion underflowing?
@@ -5389,10 +5387,9 @@ out_unlock:
5389 return retval; 5387 return retval;
5390} 5388}
5391 5389
5392long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) 5390long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5393{ 5391{
5394 cpumask_t cpus_allowed; 5392 cpumask_var_t cpus_allowed, new_mask;
5395 cpumask_t new_mask = *in_mask;
5396 struct task_struct *p; 5393 struct task_struct *p;
5397 int retval; 5394 int retval;
5398 5395
@@ -5414,6 +5411,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5414 get_task_struct(p); 5411 get_task_struct(p);
5415 read_unlock(&tasklist_lock); 5412 read_unlock(&tasklist_lock);
5416 5413
5414 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5415 retval = -ENOMEM;
5416 goto out_put_task;
5417 }
5418 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5419 retval = -ENOMEM;
5420 goto out_free_cpus_allowed;
5421 }
5417 retval = -EPERM; 5422 retval = -EPERM;
5418 if ((current->euid != p->euid) && (current->euid != p->uid) && 5423 if ((current->euid != p->euid) && (current->euid != p->uid) &&
5419 !capable(CAP_SYS_NICE)) 5424 !capable(CAP_SYS_NICE))
@@ -5423,37 +5428,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5423 if (retval) 5428 if (retval)
5424 goto out_unlock; 5429 goto out_unlock;
5425 5430
5426 cpuset_cpus_allowed(p, &cpus_allowed); 5431 cpuset_cpus_allowed(p, cpus_allowed);
5427 cpus_and(new_mask, new_mask, cpus_allowed); 5432 cpumask_and(new_mask, in_mask, cpus_allowed);
5428 again: 5433 again:
5429 retval = set_cpus_allowed_ptr(p, &new_mask); 5434 retval = set_cpus_allowed_ptr(p, new_mask);
5430 5435
5431 if (!retval) { 5436 if (!retval) {
5432 cpuset_cpus_allowed(p, &cpus_allowed); 5437 cpuset_cpus_allowed(p, cpus_allowed);
5433 if (!cpus_subset(new_mask, cpus_allowed)) { 5438 if (!cpumask_subset(new_mask, cpus_allowed)) {
5434 /* 5439 /*
5435 * We must have raced with a concurrent cpuset 5440 * We must have raced with a concurrent cpuset
5436 * update. Just reset the cpus_allowed to the 5441 * update. Just reset the cpus_allowed to the
5437 * cpuset's cpus_allowed 5442 * cpuset's cpus_allowed
5438 */ 5443 */
5439 new_mask = cpus_allowed; 5444 cpumask_copy(new_mask, cpus_allowed);
5440 goto again; 5445 goto again;
5441 } 5446 }
5442 } 5447 }
5443out_unlock: 5448out_unlock:
5449 free_cpumask_var(new_mask);
5450out_free_cpus_allowed:
5451 free_cpumask_var(cpus_allowed);
5452out_put_task:
5444 put_task_struct(p); 5453 put_task_struct(p);
5445 put_online_cpus(); 5454 put_online_cpus();
5446 return retval; 5455 return retval;
5447} 5456}
5448 5457
5449static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5458static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5450 cpumask_t *new_mask) 5459 struct cpumask *new_mask)
5451{ 5460{
5452 if (len < sizeof(cpumask_t)) { 5461 if (len < cpumask_size())
5453 memset(new_mask, 0, sizeof(cpumask_t)); 5462 cpumask_clear(new_mask);
5454 } else if (len > sizeof(cpumask_t)) { 5463 else if (len > cpumask_size())
5455 len = sizeof(cpumask_t); 5464 len = cpumask_size();
5456 } 5465
5457 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5466 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5458} 5467}
5459 5468
@@ -5466,17 +5475,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5466asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 5475asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5467 unsigned long __user *user_mask_ptr) 5476 unsigned long __user *user_mask_ptr)
5468{ 5477{
5469 cpumask_t new_mask; 5478 cpumask_var_t new_mask;
5470 int retval; 5479 int retval;
5471 5480
5472 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 5481 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5473 if (retval) 5482 return -ENOMEM;
5474 return retval;
5475 5483
5476 return sched_setaffinity(pid, &new_mask); 5484 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5485 if (retval == 0)
5486 retval = sched_setaffinity(pid, new_mask);
5487 free_cpumask_var(new_mask);
5488 return retval;
5477} 5489}
5478 5490
5479long sched_getaffinity(pid_t pid, cpumask_t *mask) 5491long sched_getaffinity(pid_t pid, struct cpumask *mask)
5480{ 5492{
5481 struct task_struct *p; 5493 struct task_struct *p;
5482 int retval; 5494 int retval;
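The user-space side of sys_sched_setaffinity() is untouched by this conversion: callers still hand in a bitmask and a byte length, which get_user_cpu_mask() (earlier in this diff) zero-fills when the length is short and truncates when it is long. A small, hypothetical user-space sketch using the glibc wrapper:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);       /* allow CPU 0 only */

        /* We pass sizeof(cpu_set_t); the kernel clamps the length
         * against its own cpumask size, as get_user_cpu_mask() shows. */
        if (sched_setaffinity(0, sizeof(set), &set) != 0) {
                perror("sched_setaffinity");
                return 1;
        }
        return 0;
}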
@@ -5493,7 +5505,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
5493 if (retval) 5505 if (retval)
5494 goto out_unlock; 5506 goto out_unlock;
5495 5507
5496 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 5508 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5497 5509
5498out_unlock: 5510out_unlock:
5499 read_unlock(&tasklist_lock); 5511 read_unlock(&tasklist_lock);
@@ -5512,19 +5524,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5512 unsigned long __user *user_mask_ptr) 5524 unsigned long __user *user_mask_ptr)
5513{ 5525{
5514 int ret; 5526 int ret;
5515 cpumask_t mask; 5527 cpumask_var_t mask;
5516 5528
5517 if (len < sizeof(cpumask_t)) 5529 if (len < cpumask_size())
5518 return -EINVAL; 5530 return -EINVAL;
5519 5531
5520 ret = sched_getaffinity(pid, &mask); 5532 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5521 if (ret < 0) 5533 return -ENOMEM;
5522 return ret;
5523 5534
5524 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 5535 ret = sched_getaffinity(pid, mask);
5525 return -EFAULT; 5536 if (ret == 0) {
5537 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
5538 ret = -EFAULT;
5539 else
5540 ret = cpumask_size();
5541 }
5542 free_cpumask_var(mask);
5526 5543
5527 return sizeof(cpumask_t); 5544 return ret;
5528} 5545}
5529 5546
5530/** 5547/**
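The hunk above also changes what the raw system call returns on success: cpumask_size(), the size of the kernel's cpumask in bytes, instead of sizeof(cpumask_t). The glibc wrapper discards that value, but the raw syscall exposes it; a minimal user-space sketch (the buffer size here is an arbitrary, generous choice):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        unsigned long mask[128];        /* arbitrary, generously sized */
        long ret;

        memset(mask, 0, sizeof(mask));
        /* The raw syscall reports how many bytes it copied out, which
         * after this patch is the kernel's cpumask_size(). */
        ret = syscall(SYS_sched_getaffinity, 0, sizeof(mask), mask);
        if (ret < 0) {
                perror("sched_getaffinity");
                return 1;
        }
        printf("kernel cpumask is %ld bytes; cpu0 %s allowed\n",
               ret, (mask[0] & 1UL) ? "is" : "is not");
        return 0;
}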
@@ -5860,14 +5877,15 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5860 struct rq *rq = cpu_rq(cpu); 5877 struct rq *rq = cpu_rq(cpu);
5861 unsigned long flags; 5878 unsigned long flags;
5862 5879
5880 spin_lock_irqsave(&rq->lock, flags);
5881
5863 __sched_fork(idle); 5882 __sched_fork(idle);
5864 idle->se.exec_start = sched_clock(); 5883 idle->se.exec_start = sched_clock();
5865 5884
5866 idle->prio = idle->normal_prio = MAX_PRIO; 5885 idle->prio = idle->normal_prio = MAX_PRIO;
5867 idle->cpus_allowed = cpumask_of_cpu(cpu); 5886 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5868 __set_task_cpu(idle, cpu); 5887 __set_task_cpu(idle, cpu);
5869 5888
5870 spin_lock_irqsave(&rq->lock, flags);
5871 rq->curr = rq->idle = idle; 5889 rq->curr = rq->idle = idle;
5872#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5890#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5873 idle->oncpu = 1; 5891 idle->oncpu = 1;
@@ -5884,6 +5902,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5884 * The idle tasks have their own, simple scheduling class: 5902 * The idle tasks have their own, simple scheduling class:
5885 */ 5903 */
5886 idle->sched_class = &idle_sched_class; 5904 idle->sched_class = &idle_sched_class;
5905 ftrace_graph_init_task(idle);
5887} 5906}
5888 5907
5889/* 5908/*
@@ -5891,9 +5910,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5891 * indicates which cpus entered this state. This is used 5910 * indicates which cpus entered this state. This is used
5892 * in the rcu update to wait only for active cpus. For system 5911 * in the rcu update to wait only for active cpus. For system
5893 * which do not switch off the HZ timer nohz_cpu_mask should 5912 * which do not switch off the HZ timer nohz_cpu_mask should
5894 * always be CPU_MASK_NONE. 5913 * always be CPU_BITS_NONE.
5895 */ 5914 */
5896cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 5915cpumask_var_t nohz_cpu_mask;
5897 5916
5898/* 5917/*
5899 * Increase the granularity value when there are more CPUs, 5918 * Increase the granularity value when there are more CPUs,
@@ -5948,7 +5967,7 @@ static inline void sched_init_granularity(void)
5948 * task must not exit() & deallocate itself prematurely. The 5967 * task must not exit() & deallocate itself prematurely. The
5949 * call is not atomic; no spinlocks may be held. 5968 * call is not atomic; no spinlocks may be held.
5950 */ 5969 */
5951int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) 5970int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5952{ 5971{
5953 struct migration_req req; 5972 struct migration_req req;
5954 unsigned long flags; 5973 unsigned long flags;
@@ -5956,13 +5975,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5956 int ret = 0; 5975 int ret = 0;
5957 5976
5958 rq = task_rq_lock(p, &flags); 5977 rq = task_rq_lock(p, &flags);
5959 if (!cpus_intersects(*new_mask, cpu_online_map)) { 5978 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
5960 ret = -EINVAL; 5979 ret = -EINVAL;
5961 goto out; 5980 goto out;
5962 } 5981 }
5963 5982
5964 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5983 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5965 !cpus_equal(p->cpus_allowed, *new_mask))) { 5984 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5966 ret = -EINVAL; 5985 ret = -EINVAL;
5967 goto out; 5986 goto out;
5968 } 5987 }
@@ -5970,15 +5989,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5970 if (p->sched_class->set_cpus_allowed) 5989 if (p->sched_class->set_cpus_allowed)
5971 p->sched_class->set_cpus_allowed(p, new_mask); 5990 p->sched_class->set_cpus_allowed(p, new_mask);
5972 else { 5991 else {
5973 p->cpus_allowed = *new_mask; 5992 cpumask_copy(&p->cpus_allowed, new_mask);
5974 p->rt.nr_cpus_allowed = cpus_weight(*new_mask); 5993 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5975 } 5994 }
5976 5995
5977 /* Can the task run on the task's current CPU? If so, we're done */ 5996 /* Can the task run on the task's current CPU? If so, we're done */
5978 if (cpu_isset(task_cpu(p), *new_mask)) 5997 if (cpumask_test_cpu(task_cpu(p), new_mask))
5979 goto out; 5998 goto out;
5980 5999
5981 if (migrate_task(p, any_online_cpu(*new_mask), &req)) { 6000 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
5982 /* Need help from migration thread: drop lock and wait. */ 6001 /* Need help from migration thread: drop lock and wait. */
5983 task_rq_unlock(rq, &flags); 6002 task_rq_unlock(rq, &flags);
5984 wake_up_process(rq->migration_thread); 6003 wake_up_process(rq->migration_thread);
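set_cpus_allowed_ptr() above now takes a const struct cpumask *, so a caller that wants to pin a task to one CPU can pass cpumask_of(cpu) directly instead of building a temporary mask (cpumask_of() is the same helper init_idle() uses further up to seed idle->cpus_allowed). A minimal, hypothetical caller:

#include <linux/cpumask.h>
#include <linux/sched.h>

/* Hypothetical helper: restrict @p to a single CPU via the pointer API. */
static int pin_task_sketch(struct task_struct *p, int cpu)
{
        /* cpumask_of(cpu) is a const struct cpumask * covering one CPU,
         * so no NR_CPUS-bit temporary is placed on the stack. */
        return set_cpus_allowed_ptr(p, cpumask_of(cpu));
}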
@@ -6020,7 +6039,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6020 if (task_cpu(p) != src_cpu) 6039 if (task_cpu(p) != src_cpu)
6021 goto done; 6040 goto done;
6022 /* Affinity changed (again). */ 6041 /* Affinity changed (again). */
6023 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6042 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6024 goto fail; 6043 goto fail;
6025 6044
6026 on_rq = p->se.on_rq; 6045 on_rq = p->se.on_rq;
@@ -6114,54 +6133,46 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6114 6133
6115/* 6134/*
6116 * Figure out where task on dead CPU should go, use force if necessary. 6135 * Figure out where task on dead CPU should go, use force if necessary.
6117 * NOTE: interrupts should be disabled by the caller
6118 */ 6136 */
6119static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6137static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6120{ 6138{
6121 unsigned long flags;
6122 cpumask_t mask;
6123 struct rq *rq;
6124 int dest_cpu; 6139 int dest_cpu;
6140 /* FIXME: Use cpumask_of_node here. */
6141 cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
6142 const struct cpumask *nodemask = &_nodemask;
6143
6144again:
6145 /* Look for allowed, online CPU in same node. */
6146 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
6147 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6148 goto move;
6149
6150 /* Any allowed, online CPU? */
6151 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
6152 if (dest_cpu < nr_cpu_ids)
6153 goto move;
6154
6155 /* No more Mr. Nice Guy. */
6156 if (dest_cpu >= nr_cpu_ids) {
6157 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
6158 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
6125 6159
6126 do { 6160 /*
6127 /* On same node? */ 6161 * Don't tell them about moving exiting tasks or
6128 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 6162 * kernel threads (both mm NULL), since they never
6129 cpus_and(mask, mask, p->cpus_allowed); 6163 * leave kernel.
6130 dest_cpu = any_online_cpu(mask); 6164 */
6131 6165 if (p->mm && printk_ratelimit()) {
6132 /* On any allowed CPU? */ 6166 printk(KERN_INFO "process %d (%s) no "
6133 if (dest_cpu >= nr_cpu_ids) 6167 "longer affine to cpu%d\n",
6134 dest_cpu = any_online_cpu(p->cpus_allowed); 6168 task_pid_nr(p), p->comm, dead_cpu);
6135
6136 /* No more Mr. Nice Guy. */
6137 if (dest_cpu >= nr_cpu_ids) {
6138 cpumask_t cpus_allowed;
6139
6140 cpuset_cpus_allowed_locked(p, &cpus_allowed);
6141 /*
6142 * Try to stay on the same cpuset, where the
6143 * current cpuset may be a subset of all cpus.
6144 * The cpuset_cpus_allowed_locked() variant of
6145 * cpuset_cpus_allowed() will not block. It must be
6146 * called within calls to cpuset_lock/cpuset_unlock.
6147 */
6148 rq = task_rq_lock(p, &flags);
6149 p->cpus_allowed = cpus_allowed;
6150 dest_cpu = any_online_cpu(p->cpus_allowed);
6151 task_rq_unlock(rq, &flags);
6152
6153 /*
6154 * Don't tell them about moving exiting tasks or
6155 * kernel threads (both mm NULL), since they never
6156 * leave kernel.
6157 */
6158 if (p->mm && printk_ratelimit()) {
6159 printk(KERN_INFO "process %d (%s) no "
6160 "longer affine to cpu%d\n",
6161 task_pid_nr(p), p->comm, dead_cpu);
6162 }
6163 } 6169 }
6164 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 6170 }
6171
6172move:
6173 /* It can have affinity changed while we were choosing. */
6174 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
6175 goto again;
6165} 6176}
6166 6177
6167/* 6178/*
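The rewritten move_task_off_dead_cpu() above replaces the old do/while retry body with an explicit fallback chain: try a CPU on the dead CPU's node, then any allowed online CPU, and only then widen p->cpus_allowed from the cpuset and retry. The test it leans on is the cpumask_any_and()/nr_cpu_ids idiom, sketched below with hypothetical masks; a result below nr_cpu_ids means the intersection is non-empty:

#include <linux/cpumask.h>

/* Hypothetical picker illustrating the cpumask_any_and() idiom. */
static int pick_cpu_sketch(const struct cpumask *preferred,
                           const struct cpumask *allowed)
{
        int cpu;

        /* First choice: a CPU that is both preferred and allowed. */
        cpu = cpumask_any_and(preferred, allowed);
        if (cpu < nr_cpu_ids)
                return cpu;

        /* Fallback: any allowed CPU that is online. */
        cpu = cpumask_any_and(allowed, cpu_online_mask);
        if (cpu < nr_cpu_ids)
                return cpu;

        return -1;      /* caller has to widen the allowed set and retry */
}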
@@ -6173,7 +6184,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6173 */ 6184 */
6174static void migrate_nr_uninterruptible(struct rq *rq_src) 6185static void migrate_nr_uninterruptible(struct rq *rq_src)
6175{ 6186{
6176 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); 6187 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
6177 unsigned long flags; 6188 unsigned long flags;
6178 6189
6179 local_irq_save(flags); 6190 local_irq_save(flags);
@@ -6463,7 +6474,7 @@ static void set_rq_online(struct rq *rq)
6463 if (!rq->online) { 6474 if (!rq->online) {
6464 const struct sched_class *class; 6475 const struct sched_class *class;
6465 6476
6466 cpu_set(rq->cpu, rq->rd->online); 6477 cpumask_set_cpu(rq->cpu, rq->rd->online);
6467 rq->online = 1; 6478 rq->online = 1;
6468 6479
6469 for_each_class(class) { 6480 for_each_class(class) {
@@ -6483,7 +6494,7 @@ static void set_rq_offline(struct rq *rq)
6483 class->rq_offline(rq); 6494 class->rq_offline(rq);
6484 } 6495 }
6485 6496
6486 cpu_clear(rq->cpu, rq->rd->online); 6497 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6487 rq->online = 0; 6498 rq->online = 0;
6488 } 6499 }
6489} 6500}
@@ -6524,7 +6535,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6524 rq = cpu_rq(cpu); 6535 rq = cpu_rq(cpu);
6525 spin_lock_irqsave(&rq->lock, flags); 6536 spin_lock_irqsave(&rq->lock, flags);
6526 if (rq->rd) { 6537 if (rq->rd) {
6527 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6538 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6528 6539
6529 set_rq_online(rq); 6540 set_rq_online(rq);
6530 } 6541 }
@@ -6538,7 +6549,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6538 break; 6549 break;
6539 /* Unbind it from offline cpu so it can run. Fall thru. */ 6550 /* Unbind it from offline cpu so it can run. Fall thru. */
6540 kthread_bind(cpu_rq(cpu)->migration_thread, 6551 kthread_bind(cpu_rq(cpu)->migration_thread,
6541 any_online_cpu(cpu_online_map)); 6552 cpumask_any(cpu_online_mask));
6542 kthread_stop(cpu_rq(cpu)->migration_thread); 6553 kthread_stop(cpu_rq(cpu)->migration_thread);
6543 cpu_rq(cpu)->migration_thread = NULL; 6554 cpu_rq(cpu)->migration_thread = NULL;
6544 break; 6555 break;
@@ -6575,7 +6586,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6575 req = list_entry(rq->migration_queue.next, 6586 req = list_entry(rq->migration_queue.next,
6576 struct migration_req, list); 6587 struct migration_req, list);
6577 list_del_init(&req->list); 6588 list_del_init(&req->list);
6589 spin_unlock_irq(&rq->lock);
6578 complete(&req->done); 6590 complete(&req->done);
6591 spin_lock_irq(&rq->lock);
6579 } 6592 }
6580 spin_unlock_irq(&rq->lock); 6593 spin_unlock_irq(&rq->lock);
6581 break; 6594 break;
@@ -6586,7 +6599,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6586 rq = cpu_rq(cpu); 6599 rq = cpu_rq(cpu);
6587 spin_lock_irqsave(&rq->lock, flags); 6600 spin_lock_irqsave(&rq->lock, flags);
6588 if (rq->rd) { 6601 if (rq->rd) {
6589 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6602 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6590 set_rq_offline(rq); 6603 set_rq_offline(rq);
6591 } 6604 }
6592 spin_unlock_irqrestore(&rq->lock, flags); 6605 spin_unlock_irqrestore(&rq->lock, flags);
@@ -6624,36 +6637,14 @@ early_initcall(migration_init);
6624 6637
6625#ifdef CONFIG_SCHED_DEBUG 6638#ifdef CONFIG_SCHED_DEBUG
6626 6639
6627static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6628{
6629 switch (lvl) {
6630 case SD_LV_NONE:
6631 return "NONE";
6632 case SD_LV_SIBLING:
6633 return "SIBLING";
6634 case SD_LV_MC:
6635 return "MC";
6636 case SD_LV_CPU:
6637 return "CPU";
6638 case SD_LV_NODE:
6639 return "NODE";
6640 case SD_LV_ALLNODES:
6641 return "ALLNODES";
6642 case SD_LV_MAX:
6643 return "MAX";
6644
6645 }
6646 return "MAX";
6647}
6648
6649static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6640static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6650 cpumask_t *groupmask) 6641 struct cpumask *groupmask)
6651{ 6642{
6652 struct sched_group *group = sd->groups; 6643 struct sched_group *group = sd->groups;
6653 char str[256]; 6644 char str[256];
6654 6645
6655 cpulist_scnprintf(str, sizeof(str), sd->span); 6646 cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd));
6656 cpus_clear(*groupmask); 6647 cpumask_clear(groupmask);
6657 6648
6658 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6649 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6659 6650
@@ -6665,14 +6656,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6665 return -1; 6656 return -1;
6666 } 6657 }
6667 6658
6668 printk(KERN_CONT "span %s level %s\n", 6659 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6669 str, sd_level_to_string(sd->level));
6670 6660
6671 if (!cpu_isset(cpu, sd->span)) { 6661 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6672 printk(KERN_ERR "ERROR: domain->span does not contain " 6662 printk(KERN_ERR "ERROR: domain->span does not contain "
6673 "CPU%d\n", cpu); 6663 "CPU%d\n", cpu);
6674 } 6664 }
6675 if (!cpu_isset(cpu, group->cpumask)) { 6665 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6676 printk(KERN_ERR "ERROR: domain->groups does not contain" 6666 printk(KERN_ERR "ERROR: domain->groups does not contain"
6677 " CPU%d\n", cpu); 6667 " CPU%d\n", cpu);
6678 } 6668 }
@@ -6692,31 +6682,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6692 break; 6682 break;
6693 } 6683 }
6694 6684
6695 if (!cpus_weight(group->cpumask)) { 6685 if (!cpumask_weight(sched_group_cpus(group))) {
6696 printk(KERN_CONT "\n"); 6686 printk(KERN_CONT "\n");
6697 printk(KERN_ERR "ERROR: empty group\n"); 6687 printk(KERN_ERR "ERROR: empty group\n");
6698 break; 6688 break;
6699 } 6689 }
6700 6690
6701 if (cpus_intersects(*groupmask, group->cpumask)) { 6691 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6702 printk(KERN_CONT "\n"); 6692 printk(KERN_CONT "\n");
6703 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6693 printk(KERN_ERR "ERROR: repeated CPUs\n");
6704 break; 6694 break;
6705 } 6695 }
6706 6696
6707 cpus_or(*groupmask, *groupmask, group->cpumask); 6697 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6708 6698
6709 cpulist_scnprintf(str, sizeof(str), group->cpumask); 6699 cpulist_scnprintf(str, sizeof(str), *sched_group_cpus(group));
6710 printk(KERN_CONT " %s", str); 6700 printk(KERN_CONT " %s", str);
6711 6701
6712 group = group->next; 6702 group = group->next;
6713 } while (group != sd->groups); 6703 } while (group != sd->groups);
6714 printk(KERN_CONT "\n"); 6704 printk(KERN_CONT "\n");
6715 6705
6716 if (!cpus_equal(sd->span, *groupmask)) 6706 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6717 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6707 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6718 6708
6719 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) 6709 if (sd->parent &&
6710 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6720 printk(KERN_ERR "ERROR: parent span is not a superset " 6711 printk(KERN_ERR "ERROR: parent span is not a superset "
6721 "of domain->span\n"); 6712 "of domain->span\n");
6722 return 0; 6713 return 0;
@@ -6724,7 +6715,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6724 6715
6725static void sched_domain_debug(struct sched_domain *sd, int cpu) 6716static void sched_domain_debug(struct sched_domain *sd, int cpu)
6726{ 6717{
6727 cpumask_t *groupmask; 6718 cpumask_var_t groupmask;
6728 int level = 0; 6719 int level = 0;
6729 6720
6730 if (!sd) { 6721 if (!sd) {
@@ -6734,8 +6725,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6734 6725
6735 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6726 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6736 6727
6737 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6728 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6738 if (!groupmask) {
6739 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6729 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6740 return; 6730 return;
6741 } 6731 }
@@ -6748,7 +6738,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6748 if (!sd) 6738 if (!sd)
6749 break; 6739 break;
6750 } 6740 }
6751 kfree(groupmask); 6741 free_cpumask_var(groupmask);
6752} 6742}
6753#else /* !CONFIG_SCHED_DEBUG */ 6743#else /* !CONFIG_SCHED_DEBUG */
6754# define sched_domain_debug(sd, cpu) do { } while (0) 6744# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6756,7 +6746,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6756 6746
6757static int sd_degenerate(struct sched_domain *sd) 6747static int sd_degenerate(struct sched_domain *sd)
6758{ 6748{
6759 if (cpus_weight(sd->span) == 1) 6749 if (cpumask_weight(sched_domain_span(sd)) == 1)
6760 return 1; 6750 return 1;
6761 6751
6762 /* Following flags need at least 2 groups */ 6752 /* Following flags need at least 2 groups */
@@ -6787,7 +6777,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6787 if (sd_degenerate(parent)) 6777 if (sd_degenerate(parent))
6788 return 1; 6778 return 1;
6789 6779
6790 if (!cpus_equal(sd->span, parent->span)) 6780 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6791 return 0; 6781 return 0;
6792 6782
6793 /* Does parent contain flags not in child? */ 6783 /* Does parent contain flags not in child? */
@@ -6802,6 +6792,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6802 SD_BALANCE_EXEC | 6792 SD_BALANCE_EXEC |
6803 SD_SHARE_CPUPOWER | 6793 SD_SHARE_CPUPOWER |
6804 SD_SHARE_PKG_RESOURCES); 6794 SD_SHARE_PKG_RESOURCES);
6795 if (nr_node_ids == 1)
6796 pflags &= ~SD_SERIALIZE;
6805 } 6797 }
6806 if (~cflags & pflags) 6798 if (~cflags & pflags)
6807 return 0; 6799 return 0;
@@ -6809,6 +6801,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6809 return 1; 6801 return 1;
6810} 6802}
6811 6803
6804static void free_rootdomain(struct root_domain *rd)
6805{
6806 cpupri_cleanup(&rd->cpupri);
6807
6808 free_cpumask_var(rd->rto_mask);
6809 free_cpumask_var(rd->online);
6810 free_cpumask_var(rd->span);
6811 kfree(rd);
6812}
6813
6812static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6814static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6813{ 6815{
6814 unsigned long flags; 6816 unsigned long flags;
@@ -6818,38 +6820,63 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6818 if (rq->rd) { 6820 if (rq->rd) {
6819 struct root_domain *old_rd = rq->rd; 6821 struct root_domain *old_rd = rq->rd;
6820 6822
6821 if (cpu_isset(rq->cpu, old_rd->online)) 6823 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6822 set_rq_offline(rq); 6824 set_rq_offline(rq);
6823 6825
6824 cpu_clear(rq->cpu, old_rd->span); 6826 cpumask_clear_cpu(rq->cpu, old_rd->span);
6825 6827
6826 if (atomic_dec_and_test(&old_rd->refcount)) 6828 if (atomic_dec_and_test(&old_rd->refcount))
6827 kfree(old_rd); 6829 free_rootdomain(old_rd);
6828 } 6830 }
6829 6831
6830 atomic_inc(&rd->refcount); 6832 atomic_inc(&rd->refcount);
6831 rq->rd = rd; 6833 rq->rd = rd;
6832 6834
6833 cpu_set(rq->cpu, rd->span); 6835 cpumask_set_cpu(rq->cpu, rd->span);
6834 if (cpu_isset(rq->cpu, cpu_online_map)) 6836 if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
6835 set_rq_online(rq); 6837 set_rq_online(rq);
6836 6838
6837 spin_unlock_irqrestore(&rq->lock, flags); 6839 spin_unlock_irqrestore(&rq->lock, flags);
6838} 6840}
6839 6841
6840static void init_rootdomain(struct root_domain *rd) 6842static int init_rootdomain(struct root_domain *rd, bool bootmem)
6841{ 6843{
6842 memset(rd, 0, sizeof(*rd)); 6844 memset(rd, 0, sizeof(*rd));
6843 6845
6844 cpus_clear(rd->span); 6846 if (bootmem) {
6845 cpus_clear(rd->online); 6847 alloc_bootmem_cpumask_var(&def_root_domain.span);
6848 alloc_bootmem_cpumask_var(&def_root_domain.online);
6849 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
6850 cpupri_init(&rd->cpupri, true);
6851 return 0;
6852 }
6853
6854 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6855 goto free_rd;
6856 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6857 goto free_span;
6858 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6859 goto free_online;
6860
6861 if (cpupri_init(&rd->cpupri, false) != 0)
6862 goto free_rto_mask;
6863 return 0;
6846 6864
6847 cpupri_init(&rd->cpupri); 6865free_rto_mask:
6866 free_cpumask_var(rd->rto_mask);
6867free_online:
6868 free_cpumask_var(rd->online);
6869free_span:
6870 free_cpumask_var(rd->span);
6871free_rd:
6872 kfree(rd);
6873 return -ENOMEM;
6848} 6874}
6849 6875
6850static void init_defrootdomain(void) 6876static void init_defrootdomain(void)
6851{ 6877{
6852 init_rootdomain(&def_root_domain); 6878 init_rootdomain(&def_root_domain, true);
6879
6853 atomic_set(&def_root_domain.refcount, 1); 6880 atomic_set(&def_root_domain.refcount, 1);
6854} 6881}
6855 6882
@@ -6861,7 +6888,10 @@ static struct root_domain *alloc_rootdomain(void)
6861 if (!rd) 6888 if (!rd)
6862 return NULL; 6889 return NULL;
6863 6890
6864 init_rootdomain(rd); 6891 if (init_rootdomain(rd, false) != 0) {
6892 kfree(rd);
6893 return NULL;
6894 }
6865 6895
6866 return rd; 6896 return rd;
6867} 6897}
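init_rootdomain() above now allocates its span, online and rto_mask cpumasks individually, unwinding the already-allocated ones in reverse order through chained goto labels when a later step fails, and free_rootdomain() releases them in the same reverse order. A stripped-down sketch of that pattern on a hypothetical two-mask object (kept deliberately simple: ownership stays in one place, so the object is freed exactly once):

#include <linux/cpumask.h>
#include <linux/slab.h>

/* Hypothetical object with two cpumask members. */
struct two_masks {
        cpumask_var_t a;
        cpumask_var_t b;
};

static struct two_masks *alloc_two_masks(void)
{
        struct two_masks *tm = kzalloc(sizeof(*tm), GFP_KERNEL);

        if (!tm)
                return NULL;
        if (!alloc_cpumask_var(&tm->a, GFP_KERNEL))
                goto free_tm;
        if (!alloc_cpumask_var(&tm->b, GFP_KERNEL))
                goto free_a;
        return tm;

free_a:
        free_cpumask_var(tm->a);
free_tm:
        kfree(tm);
        return NULL;
}

static void free_two_masks(struct two_masks *tm)
{
        /* Release in reverse order of allocation. */
        free_cpumask_var(tm->b);
        free_cpumask_var(tm->a);
        kfree(tm);
}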
@@ -6903,19 +6933,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6903} 6933}
6904 6934
6905/* cpus with isolated domains */ 6935/* cpus with isolated domains */
6906static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 6936static cpumask_var_t cpu_isolated_map;
6907 6937
6908/* Setup the mask of cpus configured for isolated domains */ 6938/* Setup the mask of cpus configured for isolated domains */
6909static int __init isolated_cpu_setup(char *str) 6939static int __init isolated_cpu_setup(char *str)
6910{ 6940{
6911 static int __initdata ints[NR_CPUS]; 6941 cpulist_parse(str, *cpu_isolated_map);
6912 int i;
6913
6914 str = get_options(str, ARRAY_SIZE(ints), ints);
6915 cpus_clear(cpu_isolated_map);
6916 for (i = 1; i <= ints[0]; i++)
6917 if (ints[i] < NR_CPUS)
6918 cpu_set(ints[i], cpu_isolated_map);
6919 return 1; 6942 return 1;
6920} 6943}
6921 6944
@@ -6924,42 +6947,43 @@ __setup("isolcpus=", isolated_cpu_setup);
6924/* 6947/*
6925 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6948 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6926 * to a function which identifies what group(along with sched group) a CPU 6949 * to a function which identifies what group(along with sched group) a CPU
6927 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS 6950 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6928 * (due to the fact that we keep track of groups covered with a cpumask_t). 6951 * (due to the fact that we keep track of groups covered with a struct cpumask).
6929 * 6952 *
6930 * init_sched_build_groups will build a circular linked list of the groups 6953 * init_sched_build_groups will build a circular linked list of the groups
6931 * covered by the given span, and will set each group's ->cpumask correctly, 6954 * covered by the given span, and will set each group's ->cpumask correctly,
6932 * and ->cpu_power to 0. 6955 * and ->cpu_power to 0.
6933 */ 6956 */
6934static void 6957static void
6935init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, 6958init_sched_build_groups(const struct cpumask *span,
6936 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 6959 const struct cpumask *cpu_map,
6960 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6937 struct sched_group **sg, 6961 struct sched_group **sg,
6938 cpumask_t *tmpmask), 6962 struct cpumask *tmpmask),
6939 cpumask_t *covered, cpumask_t *tmpmask) 6963 struct cpumask *covered, struct cpumask *tmpmask)
6940{ 6964{
6941 struct sched_group *first = NULL, *last = NULL; 6965 struct sched_group *first = NULL, *last = NULL;
6942 int i; 6966 int i;
6943 6967
6944 cpus_clear(*covered); 6968 cpumask_clear(covered);
6945 6969
6946 for_each_cpu_mask_nr(i, *span) { 6970 for_each_cpu(i, span) {
6947 struct sched_group *sg; 6971 struct sched_group *sg;
6948 int group = group_fn(i, cpu_map, &sg, tmpmask); 6972 int group = group_fn(i, cpu_map, &sg, tmpmask);
6949 int j; 6973 int j;
6950 6974
6951 if (cpu_isset(i, *covered)) 6975 if (cpumask_test_cpu(i, covered))
6952 continue; 6976 continue;
6953 6977
6954 cpus_clear(sg->cpumask); 6978 cpumask_clear(sched_group_cpus(sg));
6955 sg->__cpu_power = 0; 6979 sg->__cpu_power = 0;
6956 6980
6957 for_each_cpu_mask_nr(j, *span) { 6981 for_each_cpu(j, span) {
6958 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6982 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6959 continue; 6983 continue;
6960 6984
6961 cpu_set(j, *covered); 6985 cpumask_set_cpu(j, covered);
6962 cpu_set(j, sg->cpumask); 6986 cpumask_set_cpu(j, sched_group_cpus(sg));
6963 } 6987 }
6964 if (!first) 6988 if (!first)
6965 first = sg; 6989 first = sg;
@@ -7023,9 +7047,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
7023 * should be one that prevents unnecessary balancing, but also spreads tasks 7047 * should be one that prevents unnecessary balancing, but also spreads tasks
7024 * out optimally. 7048 * out optimally.
7025 */ 7049 */
7026static void sched_domain_node_span(int node, cpumask_t *span) 7050static void sched_domain_node_span(int node, struct cpumask *span)
7027{ 7051{
7028 nodemask_t used_nodes; 7052 nodemask_t used_nodes;
7053 /* FIXME: use cpumask_of_node() */
7029 node_to_cpumask_ptr(nodemask, node); 7054 node_to_cpumask_ptr(nodemask, node);
7030 int i; 7055 int i;
7031 7056
@@ -7047,18 +7072,33 @@ static void sched_domain_node_span(int node, cpumask_t *span)
7047int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7072int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7048 7073
7049/* 7074/*
7075 * The cpus mask in sched_group and sched_domain hangs off the end.
7076 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
7077 * for nr_cpu_ids < CONFIG_NR_CPUS.
7078 */
7079struct static_sched_group {
7080 struct sched_group sg;
7081 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
7082};
7083
7084struct static_sched_domain {
7085 struct sched_domain sd;
7086 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
7087};
7088
7089/*
7050 * SMT sched-domains: 7090 * SMT sched-domains:
7051 */ 7091 */
7052#ifdef CONFIG_SCHED_SMT 7092#ifdef CONFIG_SCHED_SMT
7053static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 7093static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
7054static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7094static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
7055 7095
7056static int 7096static int
7057cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7097cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
7058 cpumask_t *unused) 7098 struct sched_group **sg, struct cpumask *unused)
7059{ 7099{
7060 if (sg) 7100 if (sg)
7061 *sg = &per_cpu(sched_group_cpus, cpu); 7101 *sg = &per_cpu(sched_group_cpus, cpu).sg;
7062 return cpu; 7102 return cpu;
7063} 7103}
7064#endif /* CONFIG_SCHED_SMT */ 7104#endif /* CONFIG_SCHED_SMT */
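The static_sched_group and static_sched_domain wrappers above exist because, as the comment in that hunk says, the cpus mask now hangs off the end of sched_group and sched_domain: the wrapper appends DECLARE_BITMAP(..., CONFIG_NR_CPUS) storage directly behind the object, so the per-cpu variables need no separate allocation. A sketch of the general trick with hypothetical names (the real sched_group/sched_domain layouts are not reproduced here):

#include <linux/cpumask.h>

/* Hypothetical object whose mask storage is provided by a wrapper. */
struct thing {
        int power;
        unsigned long span[0];  /* zero-length array; storage follows */
};

struct static_thing {
        struct thing t;
        DECLARE_BITMAP(span_bits, CONFIG_NR_CPUS);
};

static struct static_thing example;

static struct cpumask *thing_span(struct thing *t)
{
        return to_cpumask(t->span);
}

static void static_thing_sketch(int cpu)
{
        cpumask_clear(thing_span(&example.t));
        cpumask_set_cpu(cpu, thing_span(&example.t));
}

Because span_bits sits immediately after the zero-length member, thing_span() on the embedded object simply sees that storage; this is the layout the FIXME in the hunk above says should eventually give way to cpumask_var_t or dynamic per-cpu allocation.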
@@ -7067,56 +7107,55 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7067 * multi-core sched-domains: 7107 * multi-core sched-domains:
7068 */ 7108 */
7069#ifdef CONFIG_SCHED_MC 7109#ifdef CONFIG_SCHED_MC
7070static DEFINE_PER_CPU(struct sched_domain, core_domains); 7110static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
7071static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7111static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
7072#endif /* CONFIG_SCHED_MC */ 7112#endif /* CONFIG_SCHED_MC */
7073 7113
7074#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7114#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7075static int 7115static int
7076cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7116cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7077 cpumask_t *mask) 7117 struct sched_group **sg, struct cpumask *mask)
7078{ 7118{
7079 int group; 7119 int group;
7080 7120
7081 *mask = per_cpu(cpu_sibling_map, cpu); 7121 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7082 cpus_and(*mask, *mask, *cpu_map); 7122 group = cpumask_first(mask);
7083 group = first_cpu(*mask);
7084 if (sg) 7123 if (sg)
7085 *sg = &per_cpu(sched_group_core, group); 7124 *sg = &per_cpu(sched_group_core, group).sg;
7086 return group; 7125 return group;
7087} 7126}
7088#elif defined(CONFIG_SCHED_MC) 7127#elif defined(CONFIG_SCHED_MC)
7089static int 7128static int
7090cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7129cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7091 cpumask_t *unused) 7130 struct sched_group **sg, struct cpumask *unused)
7092{ 7131{
7093 if (sg) 7132 if (sg)
7094 *sg = &per_cpu(sched_group_core, cpu); 7133 *sg = &per_cpu(sched_group_core, cpu).sg;
7095 return cpu; 7134 return cpu;
7096} 7135}
7097#endif 7136#endif
7098 7137
7099static DEFINE_PER_CPU(struct sched_domain, phys_domains); 7138static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
7100static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7139static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
7101 7140
7102static int 7141static int
7103cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7142cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7104 cpumask_t *mask) 7143 struct sched_group **sg, struct cpumask *mask)
7105{ 7144{
7106 int group; 7145 int group;
7107#ifdef CONFIG_SCHED_MC 7146#ifdef CONFIG_SCHED_MC
7147 /* FIXME: Use cpu_coregroup_mask. */
7108 *mask = cpu_coregroup_map(cpu); 7148 *mask = cpu_coregroup_map(cpu);
7109 cpus_and(*mask, *mask, *cpu_map); 7149 cpus_and(*mask, *mask, *cpu_map);
7110 group = first_cpu(*mask); 7150 group = cpumask_first(mask);
7111#elif defined(CONFIG_SCHED_SMT) 7151#elif defined(CONFIG_SCHED_SMT)
7112 *mask = per_cpu(cpu_sibling_map, cpu); 7152 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7113 cpus_and(*mask, *mask, *cpu_map); 7153 group = cpumask_first(mask);
7114 group = first_cpu(*mask);
7115#else 7154#else
7116 group = cpu; 7155 group = cpu;
7117#endif 7156#endif
7118 if (sg) 7157 if (sg)
7119 *sg = &per_cpu(sched_group_phys, group); 7158 *sg = &per_cpu(sched_group_phys, group).sg;
7120 return group; 7159 return group;
7121} 7160}
7122 7161
@@ -7130,19 +7169,21 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
7130static struct sched_group ***sched_group_nodes_bycpu; 7169static struct sched_group ***sched_group_nodes_bycpu;
7131 7170
7132static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7171static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7133static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7172static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7134 7173
7135static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7174static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7136 struct sched_group **sg, cpumask_t *nodemask) 7175 struct sched_group **sg,
7176 struct cpumask *nodemask)
7137{ 7177{
7138 int group; 7178 int group;
7179 /* FIXME: use cpumask_of_node */
7180 node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
7139 7181
7140 *nodemask = node_to_cpumask(cpu_to_node(cpu)); 7182 cpumask_and(nodemask, pnodemask, cpu_map);
7141 cpus_and(*nodemask, *nodemask, *cpu_map); 7183 group = cpumask_first(nodemask);
7142 group = first_cpu(*nodemask);
7143 7184
7144 if (sg) 7185 if (sg)
7145 *sg = &per_cpu(sched_group_allnodes, group); 7186 *sg = &per_cpu(sched_group_allnodes, group).sg;
7146 return group; 7187 return group;
7147} 7188}
7148 7189
@@ -7154,11 +7195,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7154 if (!sg) 7195 if (!sg)
7155 return; 7196 return;
7156 do { 7197 do {
7157 for_each_cpu_mask_nr(j, sg->cpumask) { 7198 for_each_cpu(j, sched_group_cpus(sg)) {
7158 struct sched_domain *sd; 7199 struct sched_domain *sd;
7159 7200
7160 sd = &per_cpu(phys_domains, j); 7201 sd = &per_cpu(phys_domains, j).sd;
7161 if (j != first_cpu(sd->groups->cpumask)) { 7202 if (j != cpumask_first(sched_group_cpus(sd->groups))) {
7162 /* 7203 /*
7163 * Only add "power" once for each 7204 * Only add "power" once for each
7164 * physical package. 7205 * physical package.
@@ -7175,11 +7216,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7175 7216
7176#ifdef CONFIG_NUMA 7217#ifdef CONFIG_NUMA
7177/* Free memory allocated for various sched_group structures */ 7218/* Free memory allocated for various sched_group structures */
7178static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7219static void free_sched_groups(const struct cpumask *cpu_map,
7220 struct cpumask *nodemask)
7179{ 7221{
7180 int cpu, i; 7222 int cpu, i;
7181 7223
7182 for_each_cpu_mask_nr(cpu, *cpu_map) { 7224 for_each_cpu(cpu, cpu_map) {
7183 struct sched_group **sched_group_nodes 7225 struct sched_group **sched_group_nodes
7184 = sched_group_nodes_bycpu[cpu]; 7226 = sched_group_nodes_bycpu[cpu];
7185 7227
@@ -7188,10 +7230,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7188 7230
7189 for (i = 0; i < nr_node_ids; i++) { 7231 for (i = 0; i < nr_node_ids; i++) {
7190 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7232 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7233 /* FIXME: Use cpumask_of_node */
7234 node_to_cpumask_ptr(pnodemask, i);
7191 7235
7192 *nodemask = node_to_cpumask(i); 7236 cpus_and(*nodemask, *pnodemask, *cpu_map);
7193 cpus_and(*nodemask, *nodemask, *cpu_map); 7237 if (cpumask_empty(nodemask))
7194 if (cpus_empty(*nodemask))
7195 continue; 7238 continue;
7196 7239
7197 if (sg == NULL) 7240 if (sg == NULL)
@@ -7209,7 +7252,8 @@ next_sg:
7209 } 7252 }
7210} 7253}
7211#else /* !CONFIG_NUMA */ 7254#else /* !CONFIG_NUMA */
7212static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7255static void free_sched_groups(const struct cpumask *cpu_map,
7256 struct cpumask *nodemask)
7213{ 7257{
7214} 7258}
7215#endif /* CONFIG_NUMA */ 7259#endif /* CONFIG_NUMA */
@@ -7235,7 +7279,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7235 7279
7236 WARN_ON(!sd || !sd->groups); 7280 WARN_ON(!sd || !sd->groups);
7237 7281
7238 if (cpu != first_cpu(sd->groups->cpumask)) 7282 if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
7239 return; 7283 return;
7240 7284
7241 child = sd->child; 7285 child = sd->child;
@@ -7300,40 +7344,6 @@ SD_INIT_FUNC(CPU)
7300 SD_INIT_FUNC(MC) 7344 SD_INIT_FUNC(MC)
7301#endif 7345#endif
7302 7346
7303/*
7304 * To minimize stack usage kmalloc room for cpumasks and share the
7305 * space as the usage in build_sched_domains() dictates. Used only
7306 * if the amount of space is significant.
7307 */
7308struct allmasks {
7309 cpumask_t tmpmask; /* make this one first */
7310 union {
7311 cpumask_t nodemask;
7312 cpumask_t this_sibling_map;
7313 cpumask_t this_core_map;
7314 };
7315 cpumask_t send_covered;
7316
7317#ifdef CONFIG_NUMA
7318 cpumask_t domainspan;
7319 cpumask_t covered;
7320 cpumask_t notcovered;
7321#endif
7322};
7323
7324#if NR_CPUS > 128
7325#define SCHED_CPUMASK_ALLOC 1
7326#define SCHED_CPUMASK_FREE(v) kfree(v)
7327#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7328#else
7329#define SCHED_CPUMASK_ALLOC 0
7330#define SCHED_CPUMASK_FREE(v)
7331#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7332#endif
7333
7334#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7335 ((unsigned long)(a) + offsetof(struct allmasks, v))
7336
7337static int default_relax_domain_level = -1; 7347static int default_relax_domain_level = -1;
7338 7348
7339static int __init setup_relax_domain_level(char *str) 7349static int __init setup_relax_domain_level(char *str)
@@ -7373,17 +7383,38 @@ static void set_domain_attribute(struct sched_domain *sd,
7373 * Build sched domains for a given set of cpus and attach the sched domains 7383 * Build sched domains for a given set of cpus and attach the sched domains
7374 * to the individual cpus 7384 * to the individual cpus
7375 */ 7385 */
7376static int __build_sched_domains(const cpumask_t *cpu_map, 7386static int __build_sched_domains(const struct cpumask *cpu_map,
7377 struct sched_domain_attr *attr) 7387 struct sched_domain_attr *attr)
7378{ 7388{
7379 int i; 7389 int i, err = -ENOMEM;
7380 struct root_domain *rd; 7390 struct root_domain *rd;
7381 SCHED_CPUMASK_DECLARE(allmasks); 7391 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
7382 cpumask_t *tmpmask; 7392 tmpmask;
7383#ifdef CONFIG_NUMA 7393#ifdef CONFIG_NUMA
7394 cpumask_var_t domainspan, covered, notcovered;
7384 struct sched_group **sched_group_nodes = NULL; 7395 struct sched_group **sched_group_nodes = NULL;
7385 int sd_allnodes = 0; 7396 int sd_allnodes = 0;
7386 7397
7398 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
7399 goto out;
7400 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
7401 goto free_domainspan;
7402 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
7403 goto free_covered;
7404#endif
7405
7406 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
7407 goto free_notcovered;
7408 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
7409 goto free_nodemask;
7410 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
7411 goto free_this_sibling_map;
7412 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
7413 goto free_this_core_map;
7414 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
7415 goto free_send_covered;
7416
7417#ifdef CONFIG_NUMA
7387 /* 7418 /*
7388 * Allocate the per-node list of sched groups 7419 * Allocate the per-node list of sched groups
7389 */ 7420 */
@@ -7391,55 +7422,37 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7391 GFP_KERNEL); 7422 GFP_KERNEL);
7392 if (!sched_group_nodes) { 7423 if (!sched_group_nodes) {
7393 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7424 printk(KERN_WARNING "Can not alloc sched group node list\n");
7394 return -ENOMEM; 7425 goto free_tmpmask;
7395 } 7426 }
7396#endif 7427#endif
7397 7428
7398 rd = alloc_rootdomain(); 7429 rd = alloc_rootdomain();
7399 if (!rd) { 7430 if (!rd) {
7400 printk(KERN_WARNING "Cannot alloc root domain\n"); 7431 printk(KERN_WARNING "Cannot alloc root domain\n");
7401#ifdef CONFIG_NUMA 7432 goto free_sched_groups;
7402 kfree(sched_group_nodes);
7403#endif
7404 return -ENOMEM;
7405 } 7433 }
7406 7434
7407#if SCHED_CPUMASK_ALLOC
7408 /* get space for all scratch cpumask variables */
7409 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7410 if (!allmasks) {
7411 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7412 kfree(rd);
7413#ifdef CONFIG_NUMA
7414 kfree(sched_group_nodes);
7415#endif
7416 return -ENOMEM;
7417 }
7418#endif
7419 tmpmask = (cpumask_t *)allmasks;
7420
7421
7422#ifdef CONFIG_NUMA 7435#ifdef CONFIG_NUMA
7423 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 7436 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
7424#endif 7437#endif
7425 7438
7426 /* 7439 /*
7427 * Set up domains for cpus specified by the cpu_map. 7440 * Set up domains for cpus specified by the cpu_map.
7428 */ 7441 */
7429 for_each_cpu_mask_nr(i, *cpu_map) { 7442 for_each_cpu(i, cpu_map) {
7430 struct sched_domain *sd = NULL, *p; 7443 struct sched_domain *sd = NULL, *p;
7431 SCHED_CPUMASK_VAR(nodemask, allmasks);
7432 7444
7445 /* FIXME: use cpumask_of_node */
7433 *nodemask = node_to_cpumask(cpu_to_node(i)); 7446 *nodemask = node_to_cpumask(cpu_to_node(i));
7434 cpus_and(*nodemask, *nodemask, *cpu_map); 7447 cpus_and(*nodemask, *nodemask, *cpu_map);
7435 7448
7436#ifdef CONFIG_NUMA 7449#ifdef CONFIG_NUMA
7437 if (cpus_weight(*cpu_map) > 7450 if (cpumask_weight(cpu_map) >
7438 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { 7451 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
7439 sd = &per_cpu(allnodes_domains, i); 7452 sd = &per_cpu(allnodes_domains, i);
7440 SD_INIT(sd, ALLNODES); 7453 SD_INIT(sd, ALLNODES);
7441 set_domain_attribute(sd, attr); 7454 set_domain_attribute(sd, attr);
7442 sd->span = *cpu_map; 7455 cpumask_copy(sched_domain_span(sd), cpu_map);
7443 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7456 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7444 p = sd; 7457 p = sd;
7445 sd_allnodes = 1; 7458 sd_allnodes = 1;
@@ -7449,18 +7462,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7449 sd = &per_cpu(node_domains, i); 7462 sd = &per_cpu(node_domains, i);
7450 SD_INIT(sd, NODE); 7463 SD_INIT(sd, NODE);
7451 set_domain_attribute(sd, attr); 7464 set_domain_attribute(sd, attr);
7452 sched_domain_node_span(cpu_to_node(i), &sd->span); 7465 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7453 sd->parent = p; 7466 sd->parent = p;
7454 if (p) 7467 if (p)
7455 p->child = sd; 7468 p->child = sd;
7456 cpus_and(sd->span, sd->span, *cpu_map); 7469 cpumask_and(sched_domain_span(sd),
7470 sched_domain_span(sd), cpu_map);
7457#endif 7471#endif
7458 7472
7459 p = sd; 7473 p = sd;
7460 sd = &per_cpu(phys_domains, i); 7474 sd = &per_cpu(phys_domains, i).sd;
7461 SD_INIT(sd, CPU); 7475 SD_INIT(sd, CPU);
7462 set_domain_attribute(sd, attr); 7476 set_domain_attribute(sd, attr);
7463 sd->span = *nodemask; 7477 cpumask_copy(sched_domain_span(sd), nodemask);
7464 sd->parent = p; 7478 sd->parent = p;
7465 if (p) 7479 if (p)
7466 p->child = sd; 7480 p->child = sd;
@@ -7468,11 +7482,12 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7468 7482
7469#ifdef CONFIG_SCHED_MC 7483#ifdef CONFIG_SCHED_MC
7470 p = sd; 7484 p = sd;
7471 sd = &per_cpu(core_domains, i); 7485 sd = &per_cpu(core_domains, i).sd;
7472 SD_INIT(sd, MC); 7486 SD_INIT(sd, MC);
7473 set_domain_attribute(sd, attr); 7487 set_domain_attribute(sd, attr);
7474 sd->span = cpu_coregroup_map(i); 7488 *sched_domain_span(sd) = cpu_coregroup_map(i);
7475 cpus_and(sd->span, sd->span, *cpu_map); 7489 cpumask_and(sched_domain_span(sd),
7490 sched_domain_span(sd), cpu_map);
7476 sd->parent = p; 7491 sd->parent = p;
7477 p->child = sd; 7492 p->child = sd;
7478 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); 7493 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7480,11 +7495,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7480 7495
7481#ifdef CONFIG_SCHED_SMT 7496#ifdef CONFIG_SCHED_SMT
7482 p = sd; 7497 p = sd;
7483 sd = &per_cpu(cpu_domains, i); 7498 sd = &per_cpu(cpu_domains, i).sd;
7484 SD_INIT(sd, SIBLING); 7499 SD_INIT(sd, SIBLING);
7485 set_domain_attribute(sd, attr); 7500 set_domain_attribute(sd, attr);
7486 sd->span = per_cpu(cpu_sibling_map, i); 7501 cpumask_and(sched_domain_span(sd),
7487 cpus_and(sd->span, sd->span, *cpu_map); 7502 &per_cpu(cpu_sibling_map, i), cpu_map);
7488 sd->parent = p; 7503 sd->parent = p;
7489 p->child = sd; 7504 p->child = sd;
7490 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 7505 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7493,13 +7508,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7493 7508
7494#ifdef CONFIG_SCHED_SMT 7509#ifdef CONFIG_SCHED_SMT
7495 /* Set up CPU (sibling) groups */ 7510 /* Set up CPU (sibling) groups */
7496 for_each_cpu_mask_nr(i, *cpu_map) { 7511 for_each_cpu(i, cpu_map) {
7497 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7512 cpumask_and(this_sibling_map,
7498 SCHED_CPUMASK_VAR(send_covered, allmasks); 7513 &per_cpu(cpu_sibling_map, i), cpu_map);
7499 7514 if (i != cpumask_first(this_sibling_map))
7500 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7501 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7502 if (i != first_cpu(*this_sibling_map))
7503 continue; 7515 continue;
7504 7516
7505 init_sched_build_groups(this_sibling_map, cpu_map, 7517 init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7510,13 +7522,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7510 7522
7511#ifdef CONFIG_SCHED_MC 7523#ifdef CONFIG_SCHED_MC
7512 /* Set up multi-core groups */ 7524 /* Set up multi-core groups */
7513 for_each_cpu_mask_nr(i, *cpu_map) { 7525 for_each_cpu(i, cpu_map) {
7514 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7526 /* FIXME: Use cpu_coregroup_mask */
7515 SCHED_CPUMASK_VAR(send_covered, allmasks);
7516
7517 *this_core_map = cpu_coregroup_map(i); 7527 *this_core_map = cpu_coregroup_map(i);
7518 cpus_and(*this_core_map, *this_core_map, *cpu_map); 7528 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7519 if (i != first_cpu(*this_core_map)) 7529 if (i != cpumask_first(this_core_map))
7520 continue; 7530 continue;
7521 7531
7522 init_sched_build_groups(this_core_map, cpu_map, 7532 init_sched_build_groups(this_core_map, cpu_map,
@@ -7527,12 +7537,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7527 7537
7528 /* Set up physical groups */ 7538 /* Set up physical groups */
7529 for (i = 0; i < nr_node_ids; i++) { 7539 for (i = 0; i < nr_node_ids; i++) {
7530 SCHED_CPUMASK_VAR(nodemask, allmasks); 7540 /* FIXME: Use cpumask_of_node */
7531 SCHED_CPUMASK_VAR(send_covered, allmasks);
7532
7533 *nodemask = node_to_cpumask(i); 7541 *nodemask = node_to_cpumask(i);
7534 cpus_and(*nodemask, *nodemask, *cpu_map); 7542 cpus_and(*nodemask, *nodemask, *cpu_map);
7535 if (cpus_empty(*nodemask)) 7543 if (cpumask_empty(nodemask))
7536 continue; 7544 continue;
7537 7545
7538 init_sched_build_groups(nodemask, cpu_map, 7546 init_sched_build_groups(nodemask, cpu_map,
@@ -7543,8 +7551,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7543#ifdef CONFIG_NUMA 7551#ifdef CONFIG_NUMA
7544 /* Set up node groups */ 7552 /* Set up node groups */
7545 if (sd_allnodes) { 7553 if (sd_allnodes) {
7546 SCHED_CPUMASK_VAR(send_covered, allmasks);
7547
7548 init_sched_build_groups(cpu_map, cpu_map, 7554 init_sched_build_groups(cpu_map, cpu_map,
7549 &cpu_to_allnodes_group, 7555 &cpu_to_allnodes_group,
7550 send_covered, tmpmask); 7556 send_covered, tmpmask);
@@ -7553,58 +7559,58 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7553 for (i = 0; i < nr_node_ids; i++) { 7559 for (i = 0; i < nr_node_ids; i++) {
7554 /* Set up node groups */ 7560 /* Set up node groups */
7555 struct sched_group *sg, *prev; 7561 struct sched_group *sg, *prev;
7556 SCHED_CPUMASK_VAR(nodemask, allmasks);
7557 SCHED_CPUMASK_VAR(domainspan, allmasks);
7558 SCHED_CPUMASK_VAR(covered, allmasks);
7559 int j; 7562 int j;
7560 7563
7564 /* FIXME: Use cpumask_of_node */
7561 *nodemask = node_to_cpumask(i); 7565 *nodemask = node_to_cpumask(i);
7562 cpus_clear(*covered); 7566 cpumask_clear(covered);
7563 7567
7564 cpus_and(*nodemask, *nodemask, *cpu_map); 7568 cpus_and(*nodemask, *nodemask, *cpu_map);
7565 if (cpus_empty(*nodemask)) { 7569 if (cpumask_empty(nodemask)) {
7566 sched_group_nodes[i] = NULL; 7570 sched_group_nodes[i] = NULL;
7567 continue; 7571 continue;
7568 } 7572 }
7569 7573
7570 sched_domain_node_span(i, domainspan); 7574 sched_domain_node_span(i, domainspan);
7571 cpus_and(*domainspan, *domainspan, *cpu_map); 7575 cpumask_and(domainspan, domainspan, cpu_map);
7572 7576
7573 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7577 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7578 GFP_KERNEL, i);
7574 if (!sg) { 7579 if (!sg) {
7575 printk(KERN_WARNING "Can not alloc domain group for " 7580 printk(KERN_WARNING "Can not alloc domain group for "
7576 "node %d\n", i); 7581 "node %d\n", i);
7577 goto error; 7582 goto error;
7578 } 7583 }
7579 sched_group_nodes[i] = sg; 7584 sched_group_nodes[i] = sg;
7580 for_each_cpu_mask_nr(j, *nodemask) { 7585 for_each_cpu(j, nodemask) {
7581 struct sched_domain *sd; 7586 struct sched_domain *sd;
7582 7587
7583 sd = &per_cpu(node_domains, j); 7588 sd = &per_cpu(node_domains, j);
7584 sd->groups = sg; 7589 sd->groups = sg;
7585 } 7590 }
7586 sg->__cpu_power = 0; 7591 sg->__cpu_power = 0;
7587 sg->cpumask = *nodemask; 7592 cpumask_copy(sched_group_cpus(sg), nodemask);
7588 sg->next = sg; 7593 sg->next = sg;
7589 cpus_or(*covered, *covered, *nodemask); 7594 cpumask_or(covered, covered, nodemask);
7590 prev = sg; 7595 prev = sg;
7591 7596
7592 for (j = 0; j < nr_node_ids; j++) { 7597 for (j = 0; j < nr_node_ids; j++) {
7593 SCHED_CPUMASK_VAR(notcovered, allmasks);
7594 int n = (i + j) % nr_node_ids; 7598 int n = (i + j) % nr_node_ids;
7599 /* FIXME: Use cpumask_of_node */
7595 node_to_cpumask_ptr(pnodemask, n); 7600 node_to_cpumask_ptr(pnodemask, n);
7596 7601
7597 cpus_complement(*notcovered, *covered); 7602 cpumask_complement(notcovered, covered);
7598 cpus_and(*tmpmask, *notcovered, *cpu_map); 7603 cpumask_and(tmpmask, notcovered, cpu_map);
7599 cpus_and(*tmpmask, *tmpmask, *domainspan); 7604 cpumask_and(tmpmask, tmpmask, domainspan);
7600 if (cpus_empty(*tmpmask)) 7605 if (cpumask_empty(tmpmask))
7601 break; 7606 break;
7602 7607
7603 cpus_and(*tmpmask, *tmpmask, *pnodemask); 7608 cpumask_and(tmpmask, tmpmask, pnodemask);
7604 if (cpus_empty(*tmpmask)) 7609 if (cpumask_empty(tmpmask))
7605 continue; 7610 continue;
7606 7611
7607 sg = kmalloc_node(sizeof(struct sched_group), 7612 sg = kmalloc_node(sizeof(struct sched_group) +
7613 cpumask_size(),
7608 GFP_KERNEL, i); 7614 GFP_KERNEL, i);
7609 if (!sg) { 7615 if (!sg) {
7610 printk(KERN_WARNING 7616 printk(KERN_WARNING
@@ -7612,9 +7618,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7612 goto error; 7618 goto error;
7613 } 7619 }
7614 sg->__cpu_power = 0; 7620 sg->__cpu_power = 0;
7615 sg->cpumask = *tmpmask; 7621 cpumask_copy(sched_group_cpus(sg), tmpmask);
7616 sg->next = prev->next; 7622 sg->next = prev->next;
7617 cpus_or(*covered, *covered, *tmpmask); 7623 cpumask_or(covered, covered, tmpmask);
7618 prev->next = sg; 7624 prev->next = sg;
7619 prev = sg; 7625 prev = sg;
7620 } 7626 }
@@ -7623,22 +7629,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7623 7629
7624 /* Calculate CPU power for physical packages and nodes */ 7630 /* Calculate CPU power for physical packages and nodes */
7625#ifdef CONFIG_SCHED_SMT 7631#ifdef CONFIG_SCHED_SMT
7626 for_each_cpu_mask_nr(i, *cpu_map) { 7632 for_each_cpu(i, cpu_map) {
7627 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7633 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
7628 7634
7629 init_sched_groups_power(i, sd); 7635 init_sched_groups_power(i, sd);
7630 } 7636 }
7631#endif 7637#endif
7632#ifdef CONFIG_SCHED_MC 7638#ifdef CONFIG_SCHED_MC
7633 for_each_cpu_mask_nr(i, *cpu_map) { 7639 for_each_cpu(i, cpu_map) {
7634 struct sched_domain *sd = &per_cpu(core_domains, i); 7640 struct sched_domain *sd = &per_cpu(core_domains, i).sd;
7635 7641
7636 init_sched_groups_power(i, sd); 7642 init_sched_groups_power(i, sd);
7637 } 7643 }
7638#endif 7644#endif
7639 7645
7640 for_each_cpu_mask_nr(i, *cpu_map) { 7646 for_each_cpu(i, cpu_map) {
7641 struct sched_domain *sd = &per_cpu(phys_domains, i); 7647 struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
7642 7648
7643 init_sched_groups_power(i, sd); 7649 init_sched_groups_power(i, sd);
7644 } 7650 }
@@ -7650,56 +7656,87 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7650 if (sd_allnodes) { 7656 if (sd_allnodes) {
7651 struct sched_group *sg; 7657 struct sched_group *sg;
7652 7658
7653 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, 7659 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7654 tmpmask); 7660 tmpmask);
7655 init_numa_sched_groups_power(sg); 7661 init_numa_sched_groups_power(sg);
7656 } 7662 }
7657#endif 7663#endif
7658 7664
7659 /* Attach the domains */ 7665 /* Attach the domains */
7660 for_each_cpu_mask_nr(i, *cpu_map) { 7666 for_each_cpu(i, cpu_map) {
7661 struct sched_domain *sd; 7667 struct sched_domain *sd;
7662#ifdef CONFIG_SCHED_SMT 7668#ifdef CONFIG_SCHED_SMT
7663 sd = &per_cpu(cpu_domains, i); 7669 sd = &per_cpu(cpu_domains, i).sd;
7664#elif defined(CONFIG_SCHED_MC) 7670#elif defined(CONFIG_SCHED_MC)
7665 sd = &per_cpu(core_domains, i); 7671 sd = &per_cpu(core_domains, i).sd;
7666#else 7672#else
7667 sd = &per_cpu(phys_domains, i); 7673 sd = &per_cpu(phys_domains, i).sd;
7668#endif 7674#endif
7669 cpu_attach_domain(sd, rd, i); 7675 cpu_attach_domain(sd, rd, i);
7670 } 7676 }
7671 7677
7672 SCHED_CPUMASK_FREE((void *)allmasks); 7678 err = 0;
7673 return 0; 7679
7680free_tmpmask:
7681 free_cpumask_var(tmpmask);
7682free_send_covered:
7683 free_cpumask_var(send_covered);
7684free_this_core_map:
7685 free_cpumask_var(this_core_map);
7686free_this_sibling_map:
7687 free_cpumask_var(this_sibling_map);
7688free_nodemask:
7689 free_cpumask_var(nodemask);
7690free_notcovered:
7691#ifdef CONFIG_NUMA
7692 free_cpumask_var(notcovered);
7693free_covered:
7694 free_cpumask_var(covered);
7695free_domainspan:
7696 free_cpumask_var(domainspan);
7697out:
7698#endif
7699 return err;
7700
7701free_sched_groups:
7702#ifdef CONFIG_NUMA
7703 kfree(sched_group_nodes);
7704#endif
7705 goto free_tmpmask;
7674 7706
7675#ifdef CONFIG_NUMA 7707#ifdef CONFIG_NUMA
7676error: 7708error:
7677 free_sched_groups(cpu_map, tmpmask); 7709 free_sched_groups(cpu_map, tmpmask);
7678 SCHED_CPUMASK_FREE((void *)allmasks); 7710 free_rootdomain(rd);
7679 kfree(rd); 7711 goto free_tmpmask;
7680 return -ENOMEM;
7681#endif 7712#endif
7682} 7713}
7683 7714
7684static int build_sched_domains(const cpumask_t *cpu_map) 7715static int build_sched_domains(const struct cpumask *cpu_map)
7685{ 7716{
7686 return __build_sched_domains(cpu_map, NULL); 7717 return __build_sched_domains(cpu_map, NULL);
7687} 7718}
7688 7719
7689static cpumask_t *doms_cur; /* current sched domains */ 7720static struct cpumask *doms_cur; /* current sched domains */
7690static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7721static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7691static struct sched_domain_attr *dattr_cur; 7722static struct sched_domain_attr *dattr_cur;
7692 /* attributes of custom domains in 'doms_cur' */ 7723 /* attributes of custom domains in 'doms_cur' */
7693 7724
7694/* 7725/*
7695 * Special case: If a kmalloc of a doms_cur partition (array of 7726 * Special case: If a kmalloc of a doms_cur partition (array of
7696 * cpumask_t) fails, then fallback to a single sched domain, 7727 * cpumask) fails, then fallback to a single sched domain,
7697 * as determined by the single cpumask_t fallback_doms. 7728 * as determined by the single cpumask fallback_doms.
7698 */ 7729 */
7699static cpumask_t fallback_doms; 7730static cpumask_var_t fallback_doms;
7700 7731
7701void __attribute__((weak)) arch_update_cpu_topology(void) 7732/*
7733 * arch_update_cpu_topology lets virtualized architectures update the
7734 * cpu core maps. It is supposed to return 1 if the topology changed
7735 * or 0 if it stayed the same.
7736 */
7737int __attribute__((weak)) arch_update_cpu_topology(void)
7702{ 7738{
7739 return 0;
7703} 7740}
7704 7741
7705/* 7742/*
@@ -7707,16 +7744,16 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7707 * For now this just excludes isolated cpus, but could be used to 7744 * For now this just excludes isolated cpus, but could be used to
7708 * exclude other special cases in the future. 7745 * exclude other special cases in the future.
7709 */ 7746 */
7710static int arch_init_sched_domains(const cpumask_t *cpu_map) 7747static int arch_init_sched_domains(const struct cpumask *cpu_map)
7711{ 7748{
7712 int err; 7749 int err;
7713 7750
7714 arch_update_cpu_topology(); 7751 arch_update_cpu_topology();
7715 ndoms_cur = 1; 7752 ndoms_cur = 1;
7716 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 7753 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
7717 if (!doms_cur) 7754 if (!doms_cur)
7718 doms_cur = &fallback_doms; 7755 doms_cur = fallback_doms;
7719 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7756 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
7720 dattr_cur = NULL; 7757 dattr_cur = NULL;
7721 err = build_sched_domains(doms_cur); 7758 err = build_sched_domains(doms_cur);
7722 register_sched_domain_sysctl(); 7759 register_sched_domain_sysctl();
@@ -7724,8 +7761,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
7724 return err; 7761 return err;
7725} 7762}
7726 7763
7727static void arch_destroy_sched_domains(const cpumask_t *cpu_map, 7764static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7728 cpumask_t *tmpmask) 7765 struct cpumask *tmpmask)
7729{ 7766{
7730 free_sched_groups(cpu_map, tmpmask); 7767 free_sched_groups(cpu_map, tmpmask);
7731} 7768}
@@ -7734,17 +7771,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7734 * Detach sched domains from a group of cpus specified in cpu_map 7771 * Detach sched domains from a group of cpus specified in cpu_map
7735 * These cpus will now be attached to the NULL domain 7772 * These cpus will now be attached to the NULL domain
7736 */ 7773 */
7737static void detach_destroy_domains(const cpumask_t *cpu_map) 7774static void detach_destroy_domains(const struct cpumask *cpu_map)
7738{ 7775{
7739 cpumask_t tmpmask; 7776 /* Save because hotplug lock held. */
7777 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7740 int i; 7778 int i;
7741 7779
7742 unregister_sched_domain_sysctl(); 7780 for_each_cpu(i, cpu_map)
7743
7744 for_each_cpu_mask_nr(i, *cpu_map)
7745 cpu_attach_domain(NULL, &def_root_domain, i); 7781 cpu_attach_domain(NULL, &def_root_domain, i);
7746 synchronize_sched(); 7782 synchronize_sched();
7747 arch_destroy_sched_domains(cpu_map, &tmpmask); 7783 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7748} 7784}
7749 7785
7750/* handle null as "default" */ 7786/* handle null as "default" */
@@ -7769,7 +7805,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7769 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7805 * doms_new[] to the current sched domain partitioning, doms_cur[].
7770 * It destroys each deleted domain and builds each new domain. 7806 * It destroys each deleted domain and builds each new domain.
7771 * 7807 *
7772 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 7808 * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
7773 * The masks don't intersect (don't overlap.) We should setup one 7809 * The masks don't intersect (don't overlap.) We should setup one
7774 * sched domain for each mask. CPUs not in any of the cpumasks will 7810 * sched domain for each mask. CPUs not in any of the cpumasks will
7775 * not be load balanced. If the same cpumask appears both in the 7811 * not be load balanced. If the same cpumask appears both in the
@@ -7778,32 +7814,38 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7778 * 7814 *
7779 * The passed in 'doms_new' should be kmalloc'd. This routine takes 7815 * The passed in 'doms_new' should be kmalloc'd. This routine takes
7780 * ownership of it and will kfree it when done with it. If the caller 7816 * ownership of it and will kfree it when done with it. If the caller
7781 * failed the kmalloc call, then it can pass in doms_new == NULL, 7817 * failed the kmalloc call, then it can pass in doms_new == NULL &&
7782 * and partition_sched_domains() will fallback to the single partition 7818 * ndoms_new == 1, and partition_sched_domains() will fallback to
7783 * 'fallback_doms', it also forces the domains to be rebuilt. 7819 * the single partition 'fallback_doms', it also forces the domains
7820 * to be rebuilt.
7784 * 7821 *
7785 * If doms_new==NULL it will be replaced with cpu_online_map. 7822 * If doms_new == NULL it will be replaced with cpu_online_mask.
7786 * ndoms_new==0 is a special case for destroying existing domains. 7823 * ndoms_new == 0 is a special case for destroying existing domains,
7787 * It will not create the default domain. 7824 * and it will not create the default domain.
7788 * 7825 *
7789 * Call with hotplug lock held 7826 * Call with hotplug lock held
7790 */ 7827 */
7791void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7828/* FIXME: Change to struct cpumask *doms_new[] */
7829void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
7792 struct sched_domain_attr *dattr_new) 7830 struct sched_domain_attr *dattr_new)
7793{ 7831{
7794 int i, j, n; 7832 int i, j, n;
7833 int new_topology;
7795 7834
7796 mutex_lock(&sched_domains_mutex); 7835 mutex_lock(&sched_domains_mutex);
7797 7836
7798 /* always unregister in case we don't destroy any domains */ 7837 /* always unregister in case we don't destroy any domains */
7799 unregister_sched_domain_sysctl(); 7838 unregister_sched_domain_sysctl();
7800 7839
7840 /* Let architecture update cpu core mappings. */
7841 new_topology = arch_update_cpu_topology();
7842
7801 n = doms_new ? ndoms_new : 0; 7843 n = doms_new ? ndoms_new : 0;
7802 7844
7803 /* Destroy deleted domains */ 7845 /* Destroy deleted domains */
7804 for (i = 0; i < ndoms_cur; i++) { 7846 for (i = 0; i < ndoms_cur; i++) {
7805 for (j = 0; j < n; j++) { 7847 for (j = 0; j < n && !new_topology; j++) {
7806 if (cpus_equal(doms_cur[i], doms_new[j]) 7848 if (cpumask_equal(&doms_cur[i], &doms_new[j])
7807 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7849 && dattrs_equal(dattr_cur, i, dattr_new, j))
7808 goto match1; 7850 goto match1;
7809 } 7851 }
@@ -7815,15 +7857,15 @@ match1:
7815 7857
7816 if (doms_new == NULL) { 7858 if (doms_new == NULL) {
7817 ndoms_cur = 0; 7859 ndoms_cur = 0;
7818 doms_new = &fallback_doms; 7860 doms_new = fallback_doms;
7819 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7861 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
7820 dattr_new = NULL; 7862 WARN_ON_ONCE(dattr_new);
7821 } 7863 }
7822 7864
7823 /* Build new domains */ 7865 /* Build new domains */
7824 for (i = 0; i < ndoms_new; i++) { 7866 for (i = 0; i < ndoms_new; i++) {
7825 for (j = 0; j < ndoms_cur; j++) { 7867 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7826 if (cpus_equal(doms_new[i], doms_cur[j]) 7868 if (cpumask_equal(&doms_new[i], &doms_cur[j])
7827 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7869 && dattrs_equal(dattr_new, i, dattr_cur, j))
7828 goto match2; 7870 goto match2;
7829 } 7871 }
@@ -7835,7 +7877,7 @@ match2:
7835 } 7877 }
7836 7878
7837 /* Remember the new sched domains */ 7879 /* Remember the new sched domains */
7838 if (doms_cur != &fallback_doms) 7880 if (doms_cur != fallback_doms)
7839 kfree(doms_cur); 7881 kfree(doms_cur);
7840 kfree(dattr_cur); /* kfree(NULL) is safe */ 7882 kfree(dattr_cur); /* kfree(NULL) is safe */
7841 doms_cur = doms_new; 7883 doms_cur = doms_new;
@@ -7975,7 +8017,9 @@ static int update_runtime(struct notifier_block *nfb,
7975 8017
7976void __init sched_init_smp(void) 8018void __init sched_init_smp(void)
7977{ 8019{
7978 cpumask_t non_isolated_cpus; 8020 cpumask_var_t non_isolated_cpus;
8021
8022 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7979 8023
7980#if defined(CONFIG_NUMA) 8024#if defined(CONFIG_NUMA)
7981 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 8025 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -7984,10 +8028,10 @@ void __init sched_init_smp(void)
7984#endif 8028#endif
7985 get_online_cpus(); 8029 get_online_cpus();
7986 mutex_lock(&sched_domains_mutex); 8030 mutex_lock(&sched_domains_mutex);
7987 arch_init_sched_domains(&cpu_online_map); 8031 arch_init_sched_domains(cpu_online_mask);
7988 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 8032 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7989 if (cpus_empty(non_isolated_cpus)) 8033 if (cpumask_empty(non_isolated_cpus))
7990 cpu_set(smp_processor_id(), non_isolated_cpus); 8034 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7991 mutex_unlock(&sched_domains_mutex); 8035 mutex_unlock(&sched_domains_mutex);
7992 put_online_cpus(); 8036 put_online_cpus();
7993 8037
@@ -8002,9 +8046,13 @@ void __init sched_init_smp(void)
8002 init_hrtick(); 8046 init_hrtick();
8003 8047
8004 /* Move init over to a non-isolated CPU */ 8048 /* Move init over to a non-isolated CPU */
8005 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 8049 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8006 BUG(); 8050 BUG();
8007 sched_init_granularity(); 8051 sched_init_granularity();
8052 free_cpumask_var(non_isolated_cpus);
8053
8054 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8055 init_sched_rt_class();
8008} 8056}
8009#else 8057#else
8010void __init sched_init_smp(void) 8058void __init sched_init_smp(void)
@@ -8319,6 +8367,15 @@ void __init sched_init(void)
8319 */ 8367 */
8320 current->sched_class = &fair_sched_class; 8368 current->sched_class = &fair_sched_class;
8321 8369
8370 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8371 alloc_bootmem_cpumask_var(&nohz_cpu_mask);
8372#ifdef CONFIG_SMP
8373#ifdef CONFIG_NO_HZ
8374 alloc_bootmem_cpumask_var(&nohz.cpu_mask);
8375#endif
8376 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8377#endif /* SMP */
8378
8322 scheduler_running = 1; 8379 scheduler_running = 1;
8323} 8380}
8324 8381
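
The sched.c hunks above replace on-stack cpumask_t variables with cpumask_var_t plus explicit allocation and the new free_* unwind labels. A minimal sketch of that pattern follows; the helper below is hypothetical and not part of the patch, it only assumes the generic cpumask API. With CONFIG_CPUMASK_OFFSTACK the variable is a pointer to heap storage, otherwise it degenerates to an ordinary on-stack bitmap, so every caller must check the allocation and free it on all paths.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/slab.h>

/* Hypothetical example: count CPUs that are possible but not online. */
static int count_offline_cpus(void)
{
        cpumask_var_t tmp;
        int cpu, n = 0;

        if (!alloc_cpumask_var(&tmp, GFP_KERNEL))       /* may fail when off-stack */
                return -ENOMEM;

        cpumask_andnot(tmp, cpu_possible_mask, cpu_online_mask);
        for_each_cpu(cpu, tmp)
                n++;

        free_cpumask_var(tmp);                          /* no-op in the on-stack case */
        return n;
}
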
@@ -8477,7 +8534,7 @@ static
8477int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8534int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8478{ 8535{
8479 struct cfs_rq *cfs_rq; 8536 struct cfs_rq *cfs_rq;
8480 struct sched_entity *se, *parent_se; 8537 struct sched_entity *se;
8481 struct rq *rq; 8538 struct rq *rq;
8482 int i; 8539 int i;
8483 8540
@@ -8493,18 +8550,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8493 for_each_possible_cpu(i) { 8550 for_each_possible_cpu(i) {
8494 rq = cpu_rq(i); 8551 rq = cpu_rq(i);
8495 8552
8496 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), 8553 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8497 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8554 GFP_KERNEL, cpu_to_node(i));
8498 if (!cfs_rq) 8555 if (!cfs_rq)
8499 goto err; 8556 goto err;
8500 8557
8501 se = kmalloc_node(sizeof(struct sched_entity), 8558 se = kzalloc_node(sizeof(struct sched_entity),
8502 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8559 GFP_KERNEL, cpu_to_node(i));
8503 if (!se) 8560 if (!se)
8504 goto err; 8561 goto err;
8505 8562
8506 parent_se = parent ? parent->se[i] : NULL; 8563 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
8507 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8508 } 8564 }
8509 8565
8510 return 1; 8566 return 1;
@@ -8565,7 +8621,7 @@ static
8565int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8621int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8566{ 8622{
8567 struct rt_rq *rt_rq; 8623 struct rt_rq *rt_rq;
8568 struct sched_rt_entity *rt_se, *parent_se; 8624 struct sched_rt_entity *rt_se;
8569 struct rq *rq; 8625 struct rq *rq;
8570 int i; 8626 int i;
8571 8627
@@ -8582,18 +8638,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8582 for_each_possible_cpu(i) { 8638 for_each_possible_cpu(i) {
8583 rq = cpu_rq(i); 8639 rq = cpu_rq(i);
8584 8640
8585 rt_rq = kmalloc_node(sizeof(struct rt_rq), 8641 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8586 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8642 GFP_KERNEL, cpu_to_node(i));
8587 if (!rt_rq) 8643 if (!rt_rq)
8588 goto err; 8644 goto err;
8589 8645
8590 rt_se = kmalloc_node(sizeof(struct sched_rt_entity), 8646 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8591 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8647 GFP_KERNEL, cpu_to_node(i));
8592 if (!rt_se) 8648 if (!rt_se)
8593 goto err; 8649 goto err;
8594 8650
8595 parent_se = parent ? parent->rt_se[i] : NULL; 8651 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8596 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8597 } 8652 }
8598 8653
8599 return 1; 8654 return 1;
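
The two allocation hunks above swap kmalloc_node(size, GFP_KERNEL|__GFP_ZERO, node) for the equivalent kzalloc_node(size, GFP_KERNEL, node). A small illustration of that shorthand, using a made-up structure and function name:

#include <linux/slab.h>

struct demo_node_data {
        int nid;
        char payload[64];
};

/* kzalloc_node() is kmalloc_node() with __GFP_ZERO: zeroed, node-local memory. */
static struct demo_node_data *demo_alloc_on_node(int nid)
{
        struct demo_node_data *p = kzalloc_node(sizeof(*p), GFP_KERNEL, nid);

        if (p)
                p->nid = nid;   /* every other field is already zero */
        return p;
}
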
@@ -9236,11 +9291,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9236 * (balbir@in.ibm.com). 9291 * (balbir@in.ibm.com).
9237 */ 9292 */
9238 9293
9239/* track cpu usage of a group of tasks */ 9294/* track cpu usage of a group of tasks and its child groups */
9240struct cpuacct { 9295struct cpuacct {
9241 struct cgroup_subsys_state css; 9296 struct cgroup_subsys_state css;
9242 /* cpuusage holds pointer to a u64-type object on every cpu */ 9297 /* cpuusage holds pointer to a u64-type object on every cpu */
9243 u64 *cpuusage; 9298 u64 *cpuusage;
9299 struct cpuacct *parent;
9244}; 9300};
9245 9301
9246struct cgroup_subsys cpuacct_subsys; 9302struct cgroup_subsys cpuacct_subsys;
@@ -9274,6 +9330,9 @@ static struct cgroup_subsys_state *cpuacct_create(
9274 return ERR_PTR(-ENOMEM); 9330 return ERR_PTR(-ENOMEM);
9275 } 9331 }
9276 9332
9333 if (cgrp->parent)
9334 ca->parent = cgroup_ca(cgrp->parent);
9335
9277 return &ca->css; 9336 return &ca->css;
9278} 9337}
9279 9338
@@ -9353,14 +9412,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9353static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9412static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9354{ 9413{
9355 struct cpuacct *ca; 9414 struct cpuacct *ca;
9415 int cpu;
9356 9416
9357 if (!cpuacct_subsys.active) 9417 if (!cpuacct_subsys.active)
9358 return; 9418 return;
9359 9419
9420 cpu = task_cpu(tsk);
9360 ca = task_ca(tsk); 9421 ca = task_ca(tsk);
9361 if (ca) {
9362 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9363 9422
9423 for (; ca; ca = ca->parent) {
9424 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9364 *cpuusage += cputime; 9425 *cpuusage += cputime;
9365 } 9426 }
9366} 9427}
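
The cpuacct hunk above changes cpuacct_charge() from updating a single group to walking the whole ancestor chain through the new parent pointer. A reduced, self-contained model of that walk; the type and names here are illustrative, not the kernel's:

struct acct_group {
        struct acct_group *parent;      /* NULL for the root group */
        unsigned long long usage;
};

/* Charge 'cputime' to a group and to every ancestor above it. */
static void acct_charge(struct acct_group *ca, unsigned long long cputime)
{
        for (; ca; ca = ca->parent)
                ca->usage += cputime;
}
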
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 52154fefab7e..018b7be1db2e 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -67,24 +67,21 @@ static int convert_prio(int prio)
67 * Returns: (int)bool - CPUs were found 67 * Returns: (int)bool - CPUs were found
68 */ 68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p, 69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask) 70 struct cpumask *lowest_mask)
71{ 71{
72 int idx = 0; 72 int idx = 0;
73 int task_pri = convert_prio(p->prio); 73 int task_pri = convert_prio(p->prio);
74 74
75 for_each_cpupri_active(cp->pri_active, idx) { 75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78 77
79 if (idx >= task_pri) 78 if (idx >= task_pri)
80 break; 79 break;
81 80
82 cpus_and(mask, p->cpus_allowed, vec->mask); 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
83
84 if (cpus_empty(mask))
85 continue; 82 continue;
86 83
87 *lowest_mask = mask; 84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
88 return 1; 85 return 1;
89 } 86 }
90 87
@@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
126 vec->count--; 123 vec->count--;
127 if (!vec->count) 124 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active); 125 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask); 126 cpumask_clear_cpu(cpu, vec->mask);
130 127
131 spin_unlock_irqrestore(&vec->lock, flags); 128 spin_unlock_irqrestore(&vec->lock, flags);
132 } 129 }
@@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
136 133
137 spin_lock_irqsave(&vec->lock, flags); 134 spin_lock_irqsave(&vec->lock, flags);
138 135
139 cpu_set(cpu, vec->mask); 136 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 137 vec->count++;
141 if (vec->count == 1) 138 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active); 139 set_bit(newpri, cp->pri_active);
@@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
150/** 147/**
151 * cpupri_init - initialize the cpupri structure 148 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context 149 * @cp: The cpupri context
150 * @bootmem: true if allocations need to use bootmem
153 * 151 *
154 * Returns: (void) 152 * Returns: -ENOMEM if memory fails.
155 */ 153 */
156void cpupri_init(struct cpupri *cp) 154int cpupri_init(struct cpupri *cp, bool bootmem)
157{ 155{
158 int i; 156 int i;
159 157
@@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp)
164 162
165 spin_lock_init(&vec->lock); 163 spin_lock_init(&vec->lock);
166 vec->count = 0; 164 vec->count = 0;
167 cpus_clear(vec->mask); 165 if (bootmem)
166 alloc_bootmem_cpumask_var(&vec->mask);
167 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
168 goto cleanup;
168 } 169 }
169 170
170 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID; 172 cp->cpu_to_pri[i] = CPUPRI_INVALID;
173 return 0;
174
175cleanup:
176 for (i--; i >= 0; i--)
177 free_cpumask_var(cp->pri_to_cpu[i].mask);
178 return -ENOMEM;
172} 179}
173 180
181/**
182 * cpupri_cleanup - clean up the cpupri structure
183 * @cp: The cpupri context
184 */
185void cpupri_cleanup(struct cpupri *cp)
186{
187 int i;
174 188
189 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
190 free_cpumask_var(cp->pri_to_cpu[i].mask);
191}
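
With the sched_cpupri.c change each priority vector's mask is a cpumask_var_t, so cpupri_init() can now fail and gains a cpupri_cleanup() counterpart. A hedged sketch of the resulting caller contract; the wrapper below is hypothetical and assumes the kernel-internal sched_cpupri.h declarations:

#include "sched_cpupri.h"       /* kernel-internal header for struct cpupri */

static int example_cpupri_setup(struct cpupri *cp)
{
        int err;

        err = cpupri_init(cp, false);   /* false: normal (non-bootmem) allocation */
        if (err)
                return err;             /* -ENOMEM if a vector mask could not be allocated */

        /* ... cpupri_set()/cpupri_find() users run here ... */

        cpupri_cleanup(cp);             /* frees the per-vector masks */
        return 0;
}
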
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index f25811b0f931..642a94ef8a0a 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -14,7 +14,7 @@
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 spinlock_t lock;
16 int count; 16 int count;
17 cpumask_t mask; 17 cpumask_var_t mask;
18}; 18};
19 19
20struct cpupri { 20struct cpupri {
@@ -27,7 +27,8 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask); 28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp); 30int cpupri_init(struct cpupri *cp, bool bootmem);
31void cpupri_cleanup(struct cpupri *cp);
31#else 32#else
32#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0) 34#define cpupri_init() do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ae17762ec32..4293cfa9681d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
53 53
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu,
58 struct task_group *tg)
59{
60 struct sched_entity *se = tg->se[cpu];
61 if (!se)
62 return;
63
64#define P(F) \
65 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
66#define PN(F) \
67 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
68
69 PN(se->exec_start);
70 PN(se->vruntime);
71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start);
74 PN(se->sleep_start);
75 PN(se->block_start);
76 PN(se->sleep_max);
77 PN(se->block_max);
78 PN(se->exec_max);
79 PN(se->slice_max);
80 PN(se->wait_max);
81 PN(se->wait_sum);
82 P(se->wait_count);
83#endif
84 P(se->load.weight);
85#undef PN
86#undef P
87}
88#endif
89
56static void 90static void
57print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
58{ 92{
@@ -121,20 +155,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
121 155
122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 156#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 char path[128] = ""; 157 char path[128] = "";
124 struct cgroup *cgroup = NULL;
125 struct task_group *tg = cfs_rq->tg; 158 struct task_group *tg = cfs_rq->tg;
126 159
127 if (tg) 160 cgroup_path(tg->css.cgroup, path, sizeof(path));
128 cgroup = tg->css.cgroup;
129
130 if (cgroup)
131 cgroup_path(cgroup, path, sizeof(path));
132 161
133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 162 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
163#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
164 {
165 uid_t uid = cfs_rq->tg->uid;
166 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
167 }
134#else 168#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 169 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 170#endif
137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 171 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
139 SPLIT_NS(cfs_rq->exec_clock)); 172 SPLIT_NS(cfs_rq->exec_clock));
140 173
@@ -144,7 +177,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
144 last = __pick_last_entity(cfs_rq); 177 last = __pick_last_entity(cfs_rq);
145 if (last) 178 if (last)
146 max_vruntime = last->vruntime; 179 max_vruntime = last->vruntime;
147 min_vruntime = rq->cfs.min_vruntime; 180 min_vruntime = cfs_rq->min_vruntime;
148 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 181 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
149 spin_unlock_irqrestore(&rq->lock, flags); 182 spin_unlock_irqrestore(&rq->lock, flags);
150 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 183 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
@@ -161,31 +194,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
161 SPLIT_NS(spread0)); 194 SPLIT_NS(spread0));
162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 195 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 196 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
164#ifdef CONFIG_SCHEDSTATS
165#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
166
167 P(yld_exp_empty);
168 P(yld_act_empty);
169 P(yld_both_empty);
170 P(yld_count);
171
172 P(sched_switch);
173 P(sched_count);
174 P(sched_goidle);
175 197
176 P(ttwu_count); 198 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
177 P(ttwu_local);
178
179 P(bkl_count);
180
181#undef P
182#endif
183 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
184 cfs_rq->nr_spread_over); 199 cfs_rq->nr_spread_over);
185#ifdef CONFIG_FAIR_GROUP_SCHED 200#ifdef CONFIG_FAIR_GROUP_SCHED
186#ifdef CONFIG_SMP 201#ifdef CONFIG_SMP
187 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 202 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
188#endif 203#endif
204 print_cfs_group_stats(m, cpu, cfs_rq->tg);
189#endif 205#endif
190} 206}
191 207
@@ -193,14 +209,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
193{ 209{
194#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 210#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
195 char path[128] = ""; 211 char path[128] = "";
196 struct cgroup *cgroup = NULL;
197 struct task_group *tg = rt_rq->tg; 212 struct task_group *tg = rt_rq->tg;
198 213
199 if (tg) 214 cgroup_path(tg->css.cgroup, path, sizeof(path));
200 cgroup = tg->css.cgroup;
201
202 if (cgroup)
203 cgroup_path(cgroup, path, sizeof(path));
204 215
205 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); 216 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
206#else 217#else
@@ -260,6 +271,25 @@ static void print_cpu(struct seq_file *m, int cpu)
260#undef P 271#undef P
261#undef PN 272#undef PN
262 273
274#ifdef CONFIG_SCHEDSTATS
275#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
276
277 P(yld_exp_empty);
278 P(yld_act_empty);
279 P(yld_both_empty);
280 P(yld_count);
281
282 P(sched_switch);
283 P(sched_count);
284 P(sched_goidle);
285
286 P(ttwu_count);
287 P(ttwu_local);
288
289 P(bkl_count);
290
291#undef P
292#endif
263 print_cfs_stats(m, cpu); 293 print_cfs_stats(m, cpu);
264 print_rt_stats(m, cpu); 294 print_rt_stats(m, cpu);
265 295
@@ -271,7 +301,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
271 u64 now = ktime_to_ns(ktime_get()); 301 u64 now = ktime_to_ns(ktime_get());
272 int cpu; 302 int cpu;
273 303
274 SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", 304 SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
275 init_utsname()->release, 305 init_utsname()->release,
276 (int)strcspn(init_utsname()->version, " "), 306 (int)strcspn(init_utsname()->version, " "),
277 init_utsname()->version); 307 init_utsname()->version);
@@ -422,10 +452,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
422#undef __P 452#undef __P
423 453
424 { 454 {
455 unsigned int this_cpu = raw_smp_processor_id();
425 u64 t0, t1; 456 u64 t0, t1;
426 457
427 t0 = sched_clock(); 458 t0 = cpu_clock(this_cpu);
428 t1 = sched_clock(); 459 t1 = cpu_clock(this_cpu);
429 SEQ_printf(m, "%-35s:%21Ld\n", 460 SEQ_printf(m, "%-35s:%21Ld\n",
430 "clock-delta", (long long)(t1-t0)); 461 "clock-delta", (long long)(t1-t0));
431 } 462 }
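
print_cfs_group_stats() added above leans on the usual '#F' stringification trick so each field prints under its own name. A standalone user-space illustration of the same idiom; nothing in this snippet comes from the kernel sources:

#include <stdio.h>

struct se_stats {
        long long exec_start;
        long long vruntime;
};

#define P(F)    printf("  .%-30s: %lld\n", #F, (long long)(F))

int main(void)
{
        struct se_stats se = { .exec_start = 123456, .vruntime = 789 };

        P(se.exec_start);       /* prints "  .se.exec_start ...: 123456" */
        P(se.vruntime);
        return 0;
}

#undef P
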
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 51aa3e102acb..08ffffd4a410 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -716,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
716 __enqueue_entity(cfs_rq, se); 716 __enqueue_entity(cfs_rq, se);
717} 717}
718 718
719static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
720{
721 if (cfs_rq->last == se)
722 cfs_rq->last = NULL;
723
724 if (cfs_rq->next == se)
725 cfs_rq->next = NULL;
726}
727
719static void 728static void
720dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 729dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
721{ 730{
@@ -738,11 +747,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
738#endif 747#endif
739 } 748 }
740 749
741 if (cfs_rq->last == se) 750 clear_buddies(cfs_rq, se);
742 cfs_rq->last = NULL;
743
744 if (cfs_rq->next == se)
745 cfs_rq->next = NULL;
746 751
747 if (se != cfs_rq->curr) 752 if (se != cfs_rq->curr)
748 __dequeue_entity(cfs_rq, se); 753 __dequeue_entity(cfs_rq, se);
@@ -977,6 +982,8 @@ static void yield_task_fair(struct rq *rq)
977 if (unlikely(cfs_rq->nr_running == 1)) 982 if (unlikely(cfs_rq->nr_running == 1))
978 return; 983 return;
979 984
985 clear_buddies(cfs_rq, se);
986
980 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { 987 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
981 update_rq_clock(rq); 988 update_rq_clock(rq);
982 /* 989 /*
@@ -1010,14 +1017,13 @@ static void yield_task_fair(struct rq *rq)
1010 * search starts with cpus closest then further out as needed, 1017 * search starts with cpus closest then further out as needed,
1011 * so we always favor a closer, idle cpu. 1018 * so we always favor a closer, idle cpu.
1012 * Domains may include CPUs that are not usable for migration, 1019 * Domains may include CPUs that are not usable for migration,
1013 * hence we need to mask them out (cpu_active_map) 1020 * hence we need to mask them out (cpu_active_mask)
1014 * 1021 *
1015 * Returns the CPU we should wake onto. 1022 * Returns the CPU we should wake onto.
1016 */ 1023 */
1017#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1024#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1018static int wake_idle(int cpu, struct task_struct *p) 1025static int wake_idle(int cpu, struct task_struct *p)
1019{ 1026{
1020 cpumask_t tmp;
1021 struct sched_domain *sd; 1027 struct sched_domain *sd;
1022 int i; 1028 int i;
1023 1029
@@ -1037,10 +1043,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1037 if ((sd->flags & SD_WAKE_IDLE) 1043 if ((sd->flags & SD_WAKE_IDLE)
1038 || ((sd->flags & SD_WAKE_IDLE_FAR) 1044 || ((sd->flags & SD_WAKE_IDLE_FAR)
1039 && !task_hot(p, task_rq(p)->clock, sd))) { 1045 && !task_hot(p, task_rq(p)->clock, sd))) {
1040 cpus_and(tmp, sd->span, p->cpus_allowed); 1046 for_each_cpu_and(i, sched_domain_span(sd),
1041 cpus_and(tmp, tmp, cpu_active_map); 1047 &p->cpus_allowed) {
1042 for_each_cpu_mask_nr(i, tmp) { 1048 if (cpu_active(i) && idle_cpu(i)) {
1043 if (idle_cpu(i)) {
1044 if (i != task_cpu(p)) { 1049 if (i != task_cpu(p)) {
1045 schedstat_inc(p, 1050 schedstat_inc(p,
1046 se.nr_wakeups_idle); 1051 se.nr_wakeups_idle);
@@ -1233,13 +1238,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1233 * this_cpu and prev_cpu are present in: 1238 * this_cpu and prev_cpu are present in:
1234 */ 1239 */
1235 for_each_domain(this_cpu, sd) { 1240 for_each_domain(this_cpu, sd) {
1236 if (cpu_isset(prev_cpu, sd->span)) { 1241 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
1237 this_sd = sd; 1242 this_sd = sd;
1238 break; 1243 break;
1239 } 1244 }
1240 } 1245 }
1241 1246
1242 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1247 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
1243 goto out; 1248 goto out;
1244 1249
1245 /* 1250 /*
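
The wake_idle() hunk drops the on-stack cpumask_t scratch variable in favour of for_each_cpu_and(), which walks the intersection of two masks directly. A minimal sketch of that iterator, with a hypothetical helper:

#include <linux/cpumask.h>

/* Count CPUs that are online and also present in 'allowed',
 * without building an intermediate mask. */
static int count_allowed_online(const struct cpumask *allowed)
{
        int cpu, n = 0;

        for_each_cpu_and(cpu, cpu_online_mask, allowed)
                n++;
        return n;
}
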
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d9ba9d5f99d6..1bbd99014011 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq)
15 if (!rq->online) 15 if (!rq->online)
16 return; 16 return;
17 17
18 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
19 /* 19 /*
20 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
21 * the overload count. That is checked to determine 21 * the overload count. That is checked to determine
@@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq)
34 34
35 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
36 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
37 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
38} 38}
39 39
40static void update_rt_migration(struct rq *rq) 40static void update_rt_migration(struct rq *rq)
@@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
139} 139}
140 140
141#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
142static inline cpumask_t sched_rt_period_mask(void) 142static inline const struct cpumask *sched_rt_period_mask(void)
143{ 143{
144 return cpu_rq(smp_processor_id())->rd->span; 144 return cpu_rq(smp_processor_id())->rd->span;
145} 145}
146#else 146#else
147static inline cpumask_t sched_rt_period_mask(void) 147static inline const struct cpumask *sched_rt_period_mask(void)
148{ 148{
149 return cpu_online_map; 149 return cpu_online_mask;
150} 150}
151#endif 151#endif
152 152
@@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
212 return rt_rq->rt_throttled; 212 return rt_rq->rt_throttled;
213} 213}
214 214
215static inline cpumask_t sched_rt_period_mask(void) 215static inline const struct cpumask *sched_rt_period_mask(void)
216{ 216{
217 return cpu_online_map; 217 return cpu_online_mask;
218} 218}
219 219
220static inline 220static inline
@@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
241 int i, weight, more = 0; 241 int i, weight, more = 0;
242 u64 rt_period; 242 u64 rt_period;
243 243
244 weight = cpus_weight(rd->span); 244 weight = cpumask_weight(rd->span);
245 245
246 spin_lock(&rt_b->rt_runtime_lock); 246 spin_lock(&rt_b->rt_runtime_lock);
247 rt_period = ktime_to_ns(rt_b->rt_period); 247 rt_period = ktime_to_ns(rt_b->rt_period);
248 for_each_cpu_mask_nr(i, rd->span) { 248 for_each_cpu(i, rd->span) {
249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
250 s64 diff; 250 s64 diff;
251 251
@@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq)
324 /* 324 /*
325 * Greedy reclaim, take back as much as we can. 325 * Greedy reclaim, take back as much as we can.
326 */ 326 */
327 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu(i, rd->span) {
328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
329 s64 diff; 329 s64 diff;
330 330
@@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
430{ 430{
431 int i, idle = 1; 431 int i, idle = 1;
432 cpumask_t span; 432 const struct cpumask *span;
433 433
434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
435 return 1; 435 return 1;
436 436
437 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
438 for_each_cpu_mask(i, span) { 438 for_each_cpu(i, span) {
439 int enqueue = 0; 439 int enqueue = 0;
440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
441 struct rq *rq = rq_of_rt_rq(rt_rq); 441 struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq)
537 for_each_sched_rt_entity(rt_se) { 537 for_each_sched_rt_entity(rt_se) {
538 rt_rq = rt_rq_of_se(rt_se); 538 rt_rq = rt_rq_of_se(rt_se);
539 539
540 spin_lock(&rt_rq->rt_runtime_lock);
541 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 540 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
541 spin_lock(&rt_rq->rt_runtime_lock);
542 rt_rq->rt_time += delta_exec; 542 rt_rq->rt_time += delta_exec;
543 if (sched_rt_runtime_exceeded(rt_rq)) 543 if (sched_rt_runtime_exceeded(rt_rq))
544 resched_task(curr); 544 resched_task(curr);
545 spin_unlock(&rt_rq->rt_runtime_lock);
545 } 546 }
546 spin_unlock(&rt_rq->rt_runtime_lock);
547 } 547 }
548} 548}
549 549
@@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
805 805
806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
807{ 807{
808 cpumask_t mask; 808 cpumask_var_t mask;
809 809
810 if (rq->curr->rt.nr_cpus_allowed == 1) 810 if (rq->curr->rt.nr_cpus_allowed == 1)
811 return; 811 return;
812 812
813 if (p->rt.nr_cpus_allowed != 1 813 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
814 && cpupri_find(&rq->rd->cpupri, p, &mask))
815 return; 814 return;
816 815
817 if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) 816 if (p->rt.nr_cpus_allowed != 1
818 return; 817 && cpupri_find(&rq->rd->cpupri, p, mask))
818 goto free;
819
820 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
821 goto free;
819 822
820 /* 823 /*
821 * There appears to be other cpus that can accept 824 * There appears to be other cpus that can accept
@@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
824 */ 827 */
825 requeue_task_rt(rq, p, 1); 828 requeue_task_rt(rq, p, 1);
826 resched_task(rq->curr); 829 resched_task(rq->curr);
830free:
831 free_cpumask_var(mask);
827} 832}
828 833
829#endif /* CONFIG_SMP */ 834#endif /* CONFIG_SMP */
@@ -909,15 +914,12 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
909/* Only try algorithms three times */ 914/* Only try algorithms three times */
910#define RT_MAX_TRIES 3 915#define RT_MAX_TRIES 3
911 916
912static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
913static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
914
915static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 917static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
916 918
917static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 919static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
918{ 920{
919 if (!task_running(rq, p) && 921 if (!task_running(rq, p) &&
920 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && 922 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
921 (p->rt.nr_cpus_allowed > 1)) 923 (p->rt.nr_cpus_allowed > 1))
922 return 1; 924 return 1;
923 return 0; 925 return 0;
@@ -956,7 +958,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
956 return next; 958 return next;
957} 959}
958 960
959static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
960 962
961static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 963static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
962{ 964{
@@ -976,7 +978,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
976static int find_lowest_rq(struct task_struct *task) 978static int find_lowest_rq(struct task_struct *task)
977{ 979{
978 struct sched_domain *sd; 980 struct sched_domain *sd;
979 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 981 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
980 int this_cpu = smp_processor_id(); 982 int this_cpu = smp_processor_id();
981 int cpu = task_cpu(task); 983 int cpu = task_cpu(task);
982 984
@@ -991,7 +993,7 @@ static int find_lowest_rq(struct task_struct *task)
991 * I guess we might want to change cpupri_find() to ignore those 993 * I guess we might want to change cpupri_find() to ignore those
992 * in the first place. 994 * in the first place.
993 */ 995 */
994 cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); 996 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
995 997
996 /* 998 /*
997 * At this point we have built a mask of cpus representing the 999 * At this point we have built a mask of cpus representing the
@@ -1001,7 +1003,7 @@ static int find_lowest_rq(struct task_struct *task)
1001 * We prioritize the last cpu that the task executed on since 1003 * We prioritize the last cpu that the task executed on since
1002 * it is most likely cache-hot in that location. 1004 * it is most likely cache-hot in that location.
1003 */ 1005 */
1004 if (cpu_isset(cpu, *lowest_mask)) 1006 if (cpumask_test_cpu(cpu, lowest_mask))
1005 return cpu; 1007 return cpu;
1006 1008
1007 /* 1009 /*
@@ -1016,7 +1018,8 @@ static int find_lowest_rq(struct task_struct *task)
1016 cpumask_t domain_mask; 1018 cpumask_t domain_mask;
1017 int best_cpu; 1019 int best_cpu;
1018 1020
1019 cpus_and(domain_mask, sd->span, *lowest_mask); 1021 cpumask_and(&domain_mask, sched_domain_span(sd),
1022 lowest_mask);
1020 1023
1021 best_cpu = pick_optimal_cpu(this_cpu, 1024 best_cpu = pick_optimal_cpu(this_cpu,
1022 &domain_mask); 1025 &domain_mask);
@@ -1057,8 +1060,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1057 * Also make sure that it wasn't scheduled on its rq. 1060 * Also make sure that it wasn't scheduled on its rq.
1058 */ 1061 */
1059 if (unlikely(task_rq(task) != rq || 1062 if (unlikely(task_rq(task) != rq ||
1060 !cpu_isset(lowest_rq->cpu, 1063 !cpumask_test_cpu(lowest_rq->cpu,
1061 task->cpus_allowed) || 1064 &task->cpus_allowed) ||
1062 task_running(rq, task) || 1065 task_running(rq, task) ||
1063 !task->se.on_rq)) { 1066 !task->se.on_rq)) {
1064 1067
@@ -1179,7 +1182,7 @@ static int pull_rt_task(struct rq *this_rq)
1179 1182
1180 next = pick_next_task_rt(this_rq); 1183 next = pick_next_task_rt(this_rq);
1181 1184
1182 for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { 1185 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1183 if (this_cpu == cpu) 1186 if (this_cpu == cpu)
1184 continue; 1187 continue;
1185 1188
@@ -1308,9 +1311,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1308} 1311}
1309 1312
1310static void set_cpus_allowed_rt(struct task_struct *p, 1313static void set_cpus_allowed_rt(struct task_struct *p,
1311 const cpumask_t *new_mask) 1314 const struct cpumask *new_mask)
1312{ 1315{
1313 int weight = cpus_weight(*new_mask); 1316 int weight = cpumask_weight(new_mask);
1314 1317
1315 BUG_ON(!rt_task(p)); 1318 BUG_ON(!rt_task(p));
1316 1319
@@ -1331,7 +1334,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1331 update_rt_migration(rq); 1334 update_rt_migration(rq);
1332 } 1335 }
1333 1336
1334 p->cpus_allowed = *new_mask; 1337 cpumask_copy(&p->cpus_allowed, new_mask);
1335 p->rt.nr_cpus_allowed = weight; 1338 p->rt.nr_cpus_allowed = weight;
1336} 1339}
1337 1340
@@ -1374,6 +1377,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1374 if (!rq->rt.rt_nr_running) 1377 if (!rq->rt.rt_nr_running)
1375 pull_rt_task(rq); 1378 pull_rt_task(rq);
1376} 1379}
1380
1381static inline void init_sched_rt_class(void)
1382{
1383 unsigned int i;
1384
1385 for_each_possible_cpu(i)
1386 alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
1387}
1377#endif /* CONFIG_SMP */ 1388#endif /* CONFIG_SMP */
1378 1389
1379/* 1390/*
@@ -1544,3 +1555,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1544 rcu_read_unlock(); 1555 rcu_read_unlock();
1545} 1556}
1546#endif /* CONFIG_SCHED_DEBUG */ 1557#endif /* CONFIG_SCHED_DEBUG */
1558
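
In sched_rt.c, local_cpu_mask becomes a per-CPU cpumask_var_t that init_sched_rt_class() must allocate once per possible CPU before find_lowest_rq() can use it. The same pattern is sketched below with a hypothetical mask name; it is not the patch's code, only an illustration of the per-CPU cpumask_var_t idiom:

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/slab.h>

static DEFINE_PER_CPU(cpumask_var_t, scratch_mask);     /* hypothetical */

static void scratch_masks_init(void)
{
        unsigned int cpu;

        for_each_possible_cpu(cpu)
                alloc_cpumask_var(&per_cpu(scratch_mask, cpu), GFP_KERNEL);
}

/* Caller runs with preemption disabled, much as find_lowest_rq() does. */
static struct cpumask *this_cpu_scratch_mask(void)
{
        return __get_cpu_var(scratch_mask);
}
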
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index ee71bec1da66..ce340835d055 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
42 for_each_domain(cpu, sd) { 42 for_each_domain(cpu, sd) {
43 enum cpu_idle_type itype; 43 enum cpu_idle_type itype;
44 44
45 cpumask_scnprintf(mask_str, mask_len, sd->span); 45 cpumask_scnprintf(mask_str, mask_len,
46 *sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str); 47 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 48 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) { 49 itype++) {
@@ -298,9 +299,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
298{ 299{
299 struct signal_struct *sig; 300 struct signal_struct *sig;
300 301
301 sig = tsk->signal; 302 /* tsk == current, ensure it is safe to use ->signal */
302 if (unlikely(!sig)) 303 if (unlikely(tsk->exit_state))
303 return; 304 return;
305
306 sig = tsk->signal;
304 if (sig->cputime.totals) { 307 if (sig->cputime.totals) {
305 struct task_cputime *times; 308 struct task_cputime *times;
306 309
@@ -325,9 +328,11 @@ static inline void account_group_system_time(struct task_struct *tsk,
325{ 328{
326 struct signal_struct *sig; 329 struct signal_struct *sig;
327 330
328 sig = tsk->signal; 331 /* tsk == current, ensure it is safe to use ->signal */
329 if (unlikely(!sig)) 332 if (unlikely(tsk->exit_state))
330 return; 333 return;
334
335 sig = tsk->signal;
331 if (sig->cputime.totals) { 336 if (sig->cputime.totals) {
332 struct task_cputime *times; 337 struct task_cputime *times;
333 338
@@ -353,8 +358,11 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
353 struct signal_struct *sig; 358 struct signal_struct *sig;
354 359
355 sig = tsk->signal; 360 sig = tsk->signal;
361 /* see __exit_signal()->task_rq_unlock_wait() */
362 barrier();
356 if (unlikely(!sig)) 363 if (unlikely(!sig))
357 return; 364 return;
365
358 if (sig->cputime.totals) { 366 if (sig->cputime.totals) {
359 struct task_cputime *times; 367 struct task_cputime *times;
360 368
diff --git a/kernel/signal.c b/kernel/signal.c
index 4530fc654455..e9afe63da24b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -41,6 +41,8 @@
41 41
42static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
43 43
44DEFINE_TRACE(sched_signal_send);
45
44static void __user *sig_handler(struct task_struct *t, int sig) 46static void __user *sig_handler(struct task_struct *t, int sig)
45{ 47{
46 return t->sighand->action[sig - 1].sa.sa_handler; 48 return t->sighand->action[sig - 1].sa.sa_handler;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7110daeb9a90..e7c69a720d69 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -269,10 +269,11 @@ void irq_enter(void)
269{ 269{
270 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
271 271
272 if (idle_cpu(cpu) && !in_interrupt()) 272 if (idle_cpu(cpu) && !in_interrupt()) {
273 __irq_enter();
273 tick_check_idle(cpu); 274 tick_check_idle(cpu);
274 275 } else
275 __irq_enter(); 276 __irq_enter();
276} 277}
277 278
278#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 279#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 3953e4aed733..1ab790c67b17 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
164/* 164/*
165 * Zero means infinite timeout - no checking done: 165 * Zero means infinite timeout - no checking done:
166 */ 166 */
167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
168 168
169unsigned long __read_mostly sysctl_hung_task_warnings = 10; 169unsigned long __read_mostly sysctl_hung_task_warnings = 10;
170 170
@@ -188,7 +188,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
188 if ((long)(now - t->last_switch_timestamp) < 188 if ((long)(now - t->last_switch_timestamp) <
189 sysctl_hung_task_timeout_secs) 189 sysctl_hung_task_timeout_secs)
190 return; 190 return;
191 if (sysctl_hung_task_warnings < 0) 191 if (!sysctl_hung_task_warnings)
192 return; 192 return;
193 sysctl_hung_task_warnings--; 193 sysctl_hung_task_warnings--;
194 194
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bc4c00872c9..24e8ceacc388 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -112,7 +112,7 @@ static int chill(void *unused)
112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
113{ 113{
114 struct work_struct *sm_work; 114 struct work_struct *sm_work;
115 int i; 115 int i, ret;
116 116
117 /* Set up initial state. */ 117 /* Set up initial state. */
118 mutex_lock(&lock); 118 mutex_lock(&lock);
@@ -137,8 +137,9 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
137 /* This will release the thread on our CPU. */ 137 /* This will release the thread on our CPU. */
138 put_cpu(); 138 put_cpu();
139 flush_workqueue(stop_machine_wq); 139 flush_workqueue(stop_machine_wq);
140 ret = active.fnret;
140 mutex_unlock(&lock); 141 mutex_unlock(&lock);
141 return active.fnret; 142 return ret;
142} 143}
143 144
144int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 145int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
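
The __stop_machine() fix reads active.fnret into a local while the mutex is still held, because another caller may reuse 'active' as soon as the lock drops. The general shape of that snapshot-under-the-lock pattern, with made-up names:

#include <linux/mutex.h>

static DEFINE_MUTEX(demo_lock);
static int shared_result;               /* protected by demo_lock */

static int demo_read_result(void)
{
        int ret;

        mutex_lock(&demo_lock);
        ret = shared_result;            /* copy while still protected */
        mutex_unlock(&demo_lock);

        return ret;                     /* the local copy cannot change under us */
}
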
diff --git a/kernel/sys.c b/kernel/sys.c
index 31deba8f7d16..5fc3a0cfb994 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -858,8 +858,8 @@ void do_sys_times(struct tms *tms)
858 struct task_cputime cputime; 858 struct task_cputime cputime;
859 cputime_t cutime, cstime; 859 cputime_t cutime, cstime;
860 860
861 spin_lock_irq(&current->sighand->siglock);
862 thread_group_cputime(current, &cputime); 861 thread_group_cputime(current, &cputime);
862 spin_lock_irq(&current->sighand->siglock);
863 cutime = current->signal->cutime; 863 cutime = current->signal->cutime;
864 cstime = current->signal->cstime; 864 cstime = current->signal->cstime;
865 spin_unlock_irq(&current->sighand->siglock); 865 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a77b27b11b04..e14a23281707 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,7 +31,7 @@ cond_syscall(sys_socketpair);
31cond_syscall(sys_bind); 31cond_syscall(sys_bind);
32cond_syscall(sys_listen); 32cond_syscall(sys_listen);
33cond_syscall(sys_accept); 33cond_syscall(sys_accept);
34cond_syscall(sys_paccept); 34cond_syscall(sys_accept4);
35cond_syscall(sys_connect); 35cond_syscall(sys_connect);
36cond_syscall(sys_getsockname); 36cond_syscall(sys_getsockname);
37cond_syscall(sys_getpeername); 37cond_syscall(sys_getpeername);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9d048fa2d902..c83f566e940a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -176,6 +176,9 @@ extern struct ctl_table random_table[];
176#ifdef CONFIG_INOTIFY_USER 176#ifdef CONFIG_INOTIFY_USER
177extern struct ctl_table inotify_table[]; 177extern struct ctl_table inotify_table[];
178#endif 178#endif
179#ifdef CONFIG_EPOLL
180extern struct ctl_table epoll_table[];
181#endif
179 182
180#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 183#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
181int sysctl_legacy_va_layout; 184int sysctl_legacy_va_layout;
@@ -484,6 +487,16 @@ static struct ctl_table kern_table[] = {
484 .proc_handler = &ftrace_enable_sysctl, 487 .proc_handler = &ftrace_enable_sysctl,
485 }, 488 },
486#endif 489#endif
490#ifdef CONFIG_TRACING
491 {
492 .ctl_name = CTL_UNNUMBERED,
493 .procname = "ftrace_dump_on_oops",
494 .data = &ftrace_dump_on_oops,
495 .maxlen = sizeof(int),
496 .mode = 0644,
497 .proc_handler = &proc_dointvec,
498 },
499#endif
487#ifdef CONFIG_MODULES 500#ifdef CONFIG_MODULES
488 { 501 {
489 .ctl_name = KERN_MODPROBE, 502 .ctl_name = KERN_MODPROBE,
@@ -1325,6 +1338,13 @@ static struct ctl_table fs_table[] = {
1325 .child = inotify_table, 1338 .child = inotify_table,
1326 }, 1339 },
1327#endif 1340#endif
1341#ifdef CONFIG_EPOLL
1342 {
1343 .procname = "epoll",
1344 .mode = 0555,
1345 .child = epoll_table,
1346 },
1347#endif
1328#endif 1348#endif
1329 { 1349 {
1330 .ctl_name = KERN_SETUID_DUMPABLE, 1350 .ctl_name = KERN_SETUID_DUMPABLE,
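With the CONFIG_TRACING entry added above, ftrace_dump_on_oops becomes a plain integer knob handled by proc_dointvec; it should appear as /proc/sys/kernel/ftrace_dump_on_oops (path inferred from the procname, not spelled out in the patch). A small userspace sketch that reads it back:

#include <stdio.h>

int main(void)
{
        int val;
        FILE *f = fopen("/proc/sys/kernel/ftrace_dump_on_oops", "r");

        if (!f)
                return 1;       /* kernel built without CONFIG_TRACING, or older kernel */
        if (fscanf(f, "%d", &val) == 1)
                printf("ftrace_dump_on_oops = %d\n", val);
        fclose(f);
        return 0;
}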
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 5bbb1044f847..70f872c71f4e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
144 if (!ts->tick_stopped) 144 if (!ts->tick_stopped)
145 return; 145 return;
146 146
147 cpu_clear(cpu, nohz_cpu_mask); 147 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get(); 148 now = ktime_get();
149 ts->idle_waketime = now; 149 ts->idle_waketime = now;
150 150
@@ -283,7 +283,7 @@ void tick_nohz_stop_sched_tick(int inidle)
283 if ((long)delta_jiffies >= 1) { 283 if ((long)delta_jiffies >= 1) {
284 284
285 if (delta_jiffies > 1) 285 if (delta_jiffies > 1)
286 cpu_set(cpu, nohz_cpu_mask); 286 cpumask_set_cpu(cpu, nohz_cpu_mask);
287 /* 287 /*
288 * nohz_stop_sched_tick can be called several times before 288 * nohz_stop_sched_tick can be called several times before
289 * the nohz_restart_sched_tick is called. This happens when 289 * the nohz_restart_sched_tick is called. This happens when
@@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(int inidle)
296 /* 296 /*
297 * sched tick not stopped! 297 * sched tick not stopped!
298 */ 298 */
299 cpu_clear(cpu, nohz_cpu_mask); 299 cpumask_clear_cpu(cpu, nohz_cpu_mask);
300 goto out; 300 goto out;
301 } 301 }
302 302
@@ -354,7 +354,7 @@ void tick_nohz_stop_sched_tick(int inidle)
354 * softirq. 354 * softirq.
355 */ 355 */
356 tick_do_update_jiffies64(ktime_get()); 356 tick_do_update_jiffies64(ktime_get());
357 cpu_clear(cpu, nohz_cpu_mask); 357 cpumask_clear_cpu(cpu, nohz_cpu_mask);
358 } 358 }
359 raise_softirq_irqoff(TIMER_SOFTIRQ); 359 raise_softirq_irqoff(TIMER_SOFTIRQ);
360out: 360out:
@@ -432,7 +432,7 @@ void tick_nohz_restart_sched_tick(void)
432 select_nohz_load_balancer(0); 432 select_nohz_load_balancer(0);
433 now = ktime_get(); 433 now = ktime_get();
434 tick_do_update_jiffies64(now); 434 tick_do_update_jiffies64(now);
435 cpu_clear(cpu, nohz_cpu_mask); 435 cpumask_clear_cpu(cpu, nohz_cpu_mask);
436 436
437 /* 437 /*
438 * We stopped the tick in idle. Update process times would miss the 438 * We stopped the tick in idle. Update process times would miss the
@@ -568,6 +568,9 @@ static void tick_nohz_switch_to_nohz(void)
568 */ 568 */
569static void tick_nohz_kick_tick(int cpu) 569static void tick_nohz_kick_tick(int cpu)
570{ 570{
571#if 0
572 /* Switch back to 2.6.27 behaviour */
573
571 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 574 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
572 ktime_t delta, now; 575 ktime_t delta, now;
573 576
@@ -584,6 +587,7 @@ static void tick_nohz_kick_tick(int cpu)
584 return; 587 return;
585 588
586 tick_nohz_restart(ts, now); 589 tick_nohz_restart(ts, now);
590#endif
587} 591}
588 592
589#else 593#else
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e7acfb482a68..fa05e88aa76f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -518,6 +518,28 @@ void update_wall_time(void)
518 /* correct the clock when NTP error is too big */ 518 /* correct the clock when NTP error is too big */
519 clocksource_adjust(offset); 519 clocksource_adjust(offset);
520 520
521 /*
522 * Since in the loop above, we accumulate any amount of time
523 * in xtime_nsec over a second into xtime.tv_sec, it's possible for
524 * xtime_nsec to be fairly small after the loop. Further, if we're
525 * slightly speeding the clocksource up in clocksource_adjust(),
526 * it's possible the required corrective factor to xtime_nsec could
527 * cause it to underflow.
528 *
529 * Now, we cannot simply roll the accumulated second back, since
530 * the NTP subsystem has been notified via second_overflow. So
531 * instead we push xtime_nsec forward by the amount we underflowed,
532 * and add that amount into the error.
533 *
534 * We'll correct this error next time through this function, when
535 * xtime_nsec is not as small.
536 */
537 if (unlikely((s64)clock->xtime_nsec < 0)) {
538 s64 neg = -(s64)clock->xtime_nsec;
539 clock->xtime_nsec = 0;
540 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
541 }
542
521 /* store full nanoseconds into xtime after rounding it up and 543 /* store full nanoseconds into xtime after rounding it up and
522 * add the remainder to the error difference. 544 * add the remainder to the error difference.
523 */ 545 */
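The underflow fix above can be checked with plain integer arithmetic: xtime_nsec is kept shifted by clock->shift, so pushing it up to zero and adding the same amount (rescaled by NTP_SCALE_SHIFT - shift) to clock->error keeps the total time accounted for. A standalone model with illustrative values (the NTP_SCALE_SHIFT and shift numbers here are assumptions, not taken from this patch):

#include <stdio.h>
#include <stdint.h>

#define NTP_SCALE_SHIFT 32              /* assumed value for illustration */

int main(void)
{
        int shift = 22;                 /* clocksource shift, illustrative */
        int64_t xtime_nsec = -3;        /* small underflow after clocksource_adjust() */
        int64_t error = 0;

        if (xtime_nsec < 0) {
                int64_t neg = -xtime_nsec;                 /* amount we went negative */
                xtime_nsec = 0;                            /* push the clock forward... */
                error += neg << (NTP_SCALE_SHIFT - shift); /* ...and book it as NTP error */
        }
        printf("xtime_nsec=%lld error=%lld\n",
               (long long)xtime_nsec, (long long)error);
        return 0;
}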
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 33dbefd471e8..bde6f03512d5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -3,18 +3,34 @@
3# select HAVE_FUNCTION_TRACER: 3# select HAVE_FUNCTION_TRACER:
4# 4#
5 5
6config USER_STACKTRACE_SUPPORT
7 bool
8
6config NOP_TRACER 9config NOP_TRACER
7 bool 10 bool
8 11
9config HAVE_FUNCTION_TRACER 12config HAVE_FUNCTION_TRACER
10 bool 13 bool
11 14
15config HAVE_FUNCTION_GRAPH_TRACER
16 bool
17
18config HAVE_FUNCTION_TRACE_MCOUNT_TEST
19 bool
20 help
21 This gets selected when the arch tests the function_trace_stop
22 variable at the mcount call site. Otherwise, this variable
23 is tested by the called function.
24
12config HAVE_DYNAMIC_FTRACE 25config HAVE_DYNAMIC_FTRACE
13 bool 26 bool
14 27
15config HAVE_FTRACE_MCOUNT_RECORD 28config HAVE_FTRACE_MCOUNT_RECORD
16 bool 29 bool
17 30
31config HAVE_HW_BRANCH_TRACER
32 bool
33
18config TRACER_MAX_TRACE 34config TRACER_MAX_TRACE
19 bool 35 bool
20 36
@@ -47,6 +63,20 @@ config FUNCTION_TRACER
47 (the bootup default), then the overhead of the instructions is very 63 (the bootup default), then the overhead of the instructions is very
48 small and not measurable even in micro-benchmarks. 64 small and not measurable even in micro-benchmarks.
49 65
66config FUNCTION_GRAPH_TRACER
67 bool "Kernel Function Graph Tracer"
68 depends on HAVE_FUNCTION_GRAPH_TRACER
69 depends on FUNCTION_TRACER
70 default y
71 help
72 Enable the kernel to trace a function at both its return
73 and its entry.
 74	  Its first purpose is to trace the duration of functions and
 75	  draw a call graph for each thread with some information such as
 76	  the return value.
77 This is done by setting the current return address on the current
78 task structure into a stack of calls.
79
50config IRQSOFF_TRACER 80config IRQSOFF_TRACER
51 bool "Interrupts-off Latency Tracer" 81 bool "Interrupts-off Latency Tracer"
52 default n 82 default n
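The FUNCTION_GRAPH_TRACER help text above describes hooking both entry and return by parking return addresses on a per-task stack. A toy userspace model of that stack (fixed depth, entry timestamp, pop on return); every name here is illustrative, the kernel's real structure shows up later in this patch as struct ftrace_ret_stack:

#include <stdio.h>

#define STACK_DEPTH 50

struct ret_entry {
        unsigned long ret_addr;         /* saved real return address */
        unsigned long long entry_time;  /* timestamp taken on entry */
};

static struct ret_entry ret_stack[STACK_DEPTH];
static int curr = -1;

static int push_return(unsigned long ret_addr, unsigned long long now)
{
        if (curr + 1 >= STACK_DEPTH)
                return -1;              /* overrun: stop tracing this call chain */
        curr++;
        ret_stack[curr].ret_addr = ret_addr;
        ret_stack[curr].entry_time = now;
        return 0;
}

static unsigned long pop_return(unsigned long long now, unsigned long long *duration)
{
        unsigned long ret_addr = ret_stack[curr].ret_addr;

        *duration = now - ret_stack[curr].entry_time;
        curr--;
        return ret_addr;                /* where the exit trampoline jumps back to */
}

int main(void)
{
        unsigned long long d;

        push_return(0x1234, 100);
        printf("return to %#lx after %llu units\n", pop_return(200, &d), d);
        return 0;
}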
@@ -138,6 +168,70 @@ config BOOT_TRACER
138 selected, because the self-tests are an initcall as well and that 168 selected, because the self-tests are an initcall as well and that
139 would invalidate the boot trace. ) 169 would invalidate the boot trace. )
140 170
171config TRACE_BRANCH_PROFILING
172 bool "Trace likely/unlikely profiler"
173 depends on DEBUG_KERNEL
174 select TRACING
175 help
176	  This tracer profiles all the likely and unlikely macros
177 in the kernel. It will display the results in:
178
179 /debugfs/tracing/profile_annotated_branch
180
181 Note: this will add a significant overhead, only turn this
182 on if you need to profile the system's use of these macros.
183
184 Say N if unsure.
185
186config PROFILE_ALL_BRANCHES
187 bool "Profile all if conditionals"
188 depends on TRACE_BRANCH_PROFILING
189 help
190	  This tracer profiles all branch conditions. Every if ()
191	  in the kernel is recorded, whether the branch was taken or not.
192 The results will be displayed in:
193
194 /debugfs/tracing/profile_branch
195
196 This configuration, when enabled, will impose a great overhead
197 on the system. This should only be enabled when the system
198	  is to be analyzed.
199
200 Say N if unsure.
201
202config TRACING_BRANCHES
203 bool
204 help
205 Selected by tracers that will trace the likely and unlikely
206 conditions. This prevents the tracers themselves from being
207 profiled. Profiling the tracing infrastructure can only happen
208 when the likelys and unlikelys are not being traced.
209
210config BRANCH_TRACER
211 bool "Trace likely/unlikely instances"
212 depends on TRACE_BRANCH_PROFILING
213 select TRACING_BRANCHES
214 help
215 This traces the events of likely and unlikely condition
216 calls in the kernel. The difference between this and the
217 "Trace likely/unlikely profiler" is that this is not a
218 histogram of the callers, but actually places the calling
219 events into a running trace buffer to see when and where the
220 events happened, as well as their results.
221
222 Say N if unsure.
223
224config POWER_TRACER
225 bool "Trace power consumption behavior"
226 depends on DEBUG_KERNEL
227 depends on X86
228 select TRACING
229 help
230	  This tracer helps developers analyze and optimize the kernel's
231 power management decisions, specifically the C-state and P-state
232 behavior.
233
234
141config STACK_TRACER 235config STACK_TRACER
142 bool "Trace max stack" 236 bool "Trace max stack"
143 depends on HAVE_FUNCTION_TRACER 237 depends on HAVE_FUNCTION_TRACER
@@ -157,6 +251,14 @@ config STACK_TRACER
157 251
158 Say N if unsure. 252 Say N if unsure.
159 253
254config BTS_TRACER
255 depends on HAVE_HW_BRANCH_TRACER
256 bool "Trace branches"
257 select TRACING
258 help
259 This tracer records all branches on the system in a circular
260 buffer giving access to the last N branches for each cpu.
261
160config DYNAMIC_FTRACE 262config DYNAMIC_FTRACE
161 bool "enable/disable ftrace tracepoints dynamically" 263 bool "enable/disable ftrace tracepoints dynamically"
162 depends on FUNCTION_TRACER 264 depends on FUNCTION_TRACER
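TRACE_BRANCH_PROFILING works by wrapping the likely()/unlikely() annotations so every evaluation also bumps a per-call-site hit/miss counter. A toy standalone version of the idea (one global record instead of the kernel's per-site section; all names here are mine):

#include <stdio.h>

struct branch_stat {
        const char *site;
        unsigned long correct;          /* annotation matched what happened */
        unsigned long incorrect;
};

static struct branch_stat example_site = { "example_cond", 0, 0 };

static int profiled_likely(struct branch_stat *s, int cond)
{
        if (cond)
                s->correct++;           /* likely() expected the branch to be taken */
        else
                s->incorrect++;
        return cond;
}

int main(void)
{
        int i;

        for (i = 0; i < 10; i++)
                if (profiled_likely(&example_site, i != 3))
                        ;               /* normal work would go here */
        printf("%s: correct=%lu incorrect=%lu\n",
               example_site.site, example_site.correct, example_site.incorrect);
        return 0;
}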
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c8228b1a49e9..62dc561b6676 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o 10obj-y += trace_selftest_dynamic.o
11endif 11endif
12 12
13# If unlikely tracing is enabled, do not trace these files
14ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif
17
13obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 18obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
14obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
15 20
@@ -24,5 +29,9 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
24obj-$(CONFIG_STACK_TRACER) += trace_stack.o 29obj-$(CONFIG_STACK_TRACER) += trace_stack.o
25obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 30obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
26obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 31obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
32obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
34obj-$(CONFIG_BTS_TRACER) += trace_bts.o
35obj-$(CONFIG_POWER_TRACER) += trace_power.o
27 36
28libftrace-y := ftrace.o 37libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4a39d24568c8..a12f80efceaa 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -47,6 +47,13 @@
47int ftrace_enabled __read_mostly; 47int ftrace_enabled __read_mostly;
48static int last_ftrace_enabled; 48static int last_ftrace_enabled;
49 49
50/* set when tracing only a pid */
51struct pid *ftrace_pid_trace;
52static struct pid * const ftrace_swapper_pid = &init_struct_pid;
53
54/* Quick disabling of function tracer. */
55int function_trace_stop;
56
50/* 57/*
51 * ftrace_disabled is set when an anomaly is discovered. 58 * ftrace_disabled is set when an anomaly is discovered.
52 * ftrace_disabled is much stronger than ftrace_enabled. 59 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -55,6 +62,7 @@ static int ftrace_disabled __read_mostly;
55 62
56static DEFINE_SPINLOCK(ftrace_lock); 63static DEFINE_SPINLOCK(ftrace_lock);
57static DEFINE_MUTEX(ftrace_sysctl_lock); 64static DEFINE_MUTEX(ftrace_sysctl_lock);
65static DEFINE_MUTEX(ftrace_start_lock);
58 66
59static struct ftrace_ops ftrace_list_end __read_mostly = 67static struct ftrace_ops ftrace_list_end __read_mostly =
60{ 68{
@@ -63,6 +71,8 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
63 71
64static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 72static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
65ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 73ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
74ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
75ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
66 76
67static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 77static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
68{ 78{
@@ -79,6 +89,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
79 }; 89 };
80} 90}
81 91
92static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
93{
94 if (!test_tsk_trace_trace(current))
95 return;
96
97 ftrace_pid_function(ip, parent_ip);
98}
99
100static void set_ftrace_pid_function(ftrace_func_t func)
101{
102 /* do not set ftrace_pid_function to itself! */
103 if (func != ftrace_pid_func)
104 ftrace_pid_function = func;
105}
106
82/** 107/**
83 * clear_ftrace_function - reset the ftrace function 108 * clear_ftrace_function - reset the ftrace function
84 * 109 *
@@ -88,8 +113,24 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
88void clear_ftrace_function(void) 113void clear_ftrace_function(void)
89{ 114{
90 ftrace_trace_function = ftrace_stub; 115 ftrace_trace_function = ftrace_stub;
116 __ftrace_trace_function = ftrace_stub;
117 ftrace_pid_function = ftrace_stub;
91} 118}
92 119
120#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
121/*
122 * For those archs that do not test ftrace_trace_stop in their
123 * mcount call site, we need to do it from C.
124 */
125static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
126{
127 if (function_trace_stop)
128 return;
129
130 __ftrace_trace_function(ip, parent_ip);
131}
132#endif
133
93static int __register_ftrace_function(struct ftrace_ops *ops) 134static int __register_ftrace_function(struct ftrace_ops *ops)
94{ 135{
95 /* should not be called from interrupt context */ 136 /* should not be called from interrupt context */
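The two additions above share one pattern: the registered trace callback is reached through an extra wrapper that can drop the call, either because function_trace_stop is set or because the current task is not the traced pid. A toy model of that indirection (plain function pointers and globals; current_task_traced stands in for test_tsk_trace_trace(current)):

#include <stdio.h>

typedef void (*trace_func_t)(unsigned long ip, unsigned long parent_ip);

static void real_tracer(unsigned long ip, unsigned long parent_ip)
{
        printf("traced ip=%#lx from %#lx\n", ip, parent_ip);
}

static trace_func_t saved_func = real_tracer;   /* the "real" callback */
static int function_trace_stop;                 /* quick global off switch */
static int current_task_traced;                 /* per-task pid-filter flag */

static void trace_wrapper(unsigned long ip, unsigned long parent_ip)
{
        if (function_trace_stop)
                return;                         /* fast disable, no unregistering */
        if (!current_task_traced)
                return;                         /* not the selected pid: drop it */
        saved_func(ip, parent_ip);
}

int main(void)
{
        trace_wrapper(0x1000, 0x2000);          /* filtered out */
        current_task_traced = 1;
        trace_wrapper(0x1000, 0x2000);          /* passes through to real_tracer */
        return 0;
}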
@@ -106,14 +147,28 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
106 ftrace_list = ops; 147 ftrace_list = ops;
107 148
108 if (ftrace_enabled) { 149 if (ftrace_enabled) {
150 ftrace_func_t func;
151
152 if (ops->next == &ftrace_list_end)
153 func = ops->func;
154 else
155 func = ftrace_list_func;
156
157 if (ftrace_pid_trace) {
158 set_ftrace_pid_function(func);
159 func = ftrace_pid_func;
160 }
161
109 /* 162 /*
110 * For one func, simply call it directly. 163 * For one func, simply call it directly.
111 * For more than one func, call the chain. 164 * For more than one func, call the chain.
112 */ 165 */
113 if (ops->next == &ftrace_list_end) 166#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
114 ftrace_trace_function = ops->func; 167 ftrace_trace_function = func;
115 else 168#else
116 ftrace_trace_function = ftrace_list_func; 169 __ftrace_trace_function = func;
170 ftrace_trace_function = ftrace_test_stop_func;
171#endif
117 } 172 }
118 173
119 spin_unlock(&ftrace_lock); 174 spin_unlock(&ftrace_lock);
@@ -152,9 +207,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
152 207
153 if (ftrace_enabled) { 208 if (ftrace_enabled) {
154 /* If we only have one func left, then call that directly */ 209 /* If we only have one func left, then call that directly */
155 if (ftrace_list == &ftrace_list_end || 210 if (ftrace_list->next == &ftrace_list_end) {
156 ftrace_list->next == &ftrace_list_end) 211 ftrace_func_t func = ftrace_list->func;
157 ftrace_trace_function = ftrace_list->func; 212
213 if (ftrace_pid_trace) {
214 set_ftrace_pid_function(func);
215 func = ftrace_pid_func;
216 }
217#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
218 ftrace_trace_function = func;
219#else
220 __ftrace_trace_function = func;
221#endif
222 }
158 } 223 }
159 224
160 out: 225 out:
@@ -163,6 +228,36 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
163 return ret; 228 return ret;
164} 229}
165 230
231static void ftrace_update_pid_func(void)
232{
233 ftrace_func_t func;
234
235 /* should not be called from interrupt context */
236 spin_lock(&ftrace_lock);
237
238 if (ftrace_trace_function == ftrace_stub)
239 goto out;
240
241 func = ftrace_trace_function;
242
243 if (ftrace_pid_trace) {
244 set_ftrace_pid_function(func);
245 func = ftrace_pid_func;
246 } else {
247 if (func == ftrace_pid_func)
248 func = ftrace_pid_function;
249 }
250
251#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
252 ftrace_trace_function = func;
253#else
254 __ftrace_trace_function = func;
255#endif
256
257 out:
258 spin_unlock(&ftrace_lock);
259}
260
166#ifdef CONFIG_DYNAMIC_FTRACE 261#ifdef CONFIG_DYNAMIC_FTRACE
167#ifndef CONFIG_FTRACE_MCOUNT_RECORD 262#ifndef CONFIG_FTRACE_MCOUNT_RECORD
168# error Dynamic ftrace depends on MCOUNT_RECORD 263# error Dynamic ftrace depends on MCOUNT_RECORD
@@ -182,10 +277,11 @@ enum {
182 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 277 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
183 FTRACE_ENABLE_MCOUNT = (1 << 3), 278 FTRACE_ENABLE_MCOUNT = (1 << 3),
184 FTRACE_DISABLE_MCOUNT = (1 << 4), 279 FTRACE_DISABLE_MCOUNT = (1 << 4),
280 FTRACE_START_FUNC_RET = (1 << 5),
281 FTRACE_STOP_FUNC_RET = (1 << 6),
185}; 282};
186 283
187static int ftrace_filtered; 284static int ftrace_filtered;
188static int tracing_on;
189 285
190static LIST_HEAD(ftrace_new_addrs); 286static LIST_HEAD(ftrace_new_addrs);
191 287
@@ -309,7 +405,7 @@ ftrace_record_ip(unsigned long ip)
309{ 405{
310 struct dyn_ftrace *rec; 406 struct dyn_ftrace *rec;
311 407
312 if (!ftrace_enabled || ftrace_disabled) 408 if (ftrace_disabled)
313 return NULL; 409 return NULL;
314 410
315 rec = ftrace_alloc_dyn_node(ip); 411 rec = ftrace_alloc_dyn_node(ip);
@@ -323,107 +419,131 @@ ftrace_record_ip(unsigned long ip)
323 return rec; 419 return rec;
324} 420}
325 421
326#define FTRACE_ADDR ((long)(ftrace_caller)) 422static void print_ip_ins(const char *fmt, unsigned char *p)
423{
424 int i;
425
426 printk(KERN_CONT "%s", fmt);
427
428 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
429 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
430}
431
432static void ftrace_bug(int failed, unsigned long ip)
433{
434 switch (failed) {
435 case -EFAULT:
436 FTRACE_WARN_ON_ONCE(1);
437 pr_info("ftrace faulted on modifying ");
438 print_ip_sym(ip);
439 break;
440 case -EINVAL:
441 FTRACE_WARN_ON_ONCE(1);
442 pr_info("ftrace failed to modify ");
443 print_ip_sym(ip);
444 print_ip_ins(" actual: ", (unsigned char *)ip);
445 printk(KERN_CONT "\n");
446 break;
447 case -EPERM:
448 FTRACE_WARN_ON_ONCE(1);
449 pr_info("ftrace faulted on writing ");
450 print_ip_sym(ip);
451 break;
452 default:
453 FTRACE_WARN_ON_ONCE(1);
454 pr_info("ftrace faulted on unknown error ");
455 print_ip_sym(ip);
456 }
457}
458
327 459
328static int 460static int
329__ftrace_replace_code(struct dyn_ftrace *rec, 461__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
330 unsigned char *old, unsigned char *new, int enable)
331{ 462{
332 unsigned long ip, fl; 463 unsigned long ip, fl;
464 unsigned long ftrace_addr;
465
466 ftrace_addr = (unsigned long)ftrace_caller;
333 467
334 ip = rec->ip; 468 ip = rec->ip;
335 469
336 if (ftrace_filtered && enable) { 470 /*
471 * If this record is not to be traced and
472 * it is not enabled then do nothing.
473 *
474 * If this record is not to be traced and
475 * it is enabled then disabled it.
476 *
477 */
478 if (rec->flags & FTRACE_FL_NOTRACE) {
479 if (rec->flags & FTRACE_FL_ENABLED)
480 rec->flags &= ~FTRACE_FL_ENABLED;
481 else
482 return 0;
483
484 } else if (ftrace_filtered && enable) {
337 /* 485 /*
338 * If filtering is on: 486 * Filtering is on:
339 *
340 * If this record is set to be filtered and
341 * is enabled then do nothing.
342 *
343 * If this record is set to be filtered and
344 * it is not enabled, enable it.
345 *
346 * If this record is not set to be filtered
347 * and it is not enabled do nothing.
348 *
349 * If this record is set not to trace then
350 * do nothing.
351 *
352 * If this record is set not to trace and
353 * it is enabled then disable it.
354 *
355 * If this record is not set to be filtered and
356 * it is enabled, disable it.
357 */ 487 */
358 488
359 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE | 489 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
360 FTRACE_FL_ENABLED);
361 490
362 if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || 491 /* Record is filtered and enabled, do nothing */
363 (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) || 492 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
364 !fl || (fl == FTRACE_FL_NOTRACE))
365 return 0; 493 return 0;
366 494
367 /* 495 /* Record is not filtered and is not enabled do nothing */
368 * If it is enabled disable it, 496 if (!fl)
369 * otherwise enable it! 497 return 0;
370 */ 498
371 if (fl & FTRACE_FL_ENABLED) { 499 /* Record is not filtered but enabled, disable it */
372 /* swap new and old */ 500 if (fl == FTRACE_FL_ENABLED)
373 new = old;
374 old = ftrace_call_replace(ip, FTRACE_ADDR);
375 rec->flags &= ~FTRACE_FL_ENABLED; 501 rec->flags &= ~FTRACE_FL_ENABLED;
376 } else { 502 else
377 new = ftrace_call_replace(ip, FTRACE_ADDR); 503 /* Otherwise record is filtered but not enabled, enable it */
378 rec->flags |= FTRACE_FL_ENABLED; 504 rec->flags |= FTRACE_FL_ENABLED;
379 }
380 } else { 505 } else {
506 /* Disable or not filtered */
381 507
382 if (enable) { 508 if (enable) {
383 /* 509 /* if record is enabled, do nothing */
384 * If this record is set not to trace and is
385 * not enabled, do nothing.
386 */
387 fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
388 if (fl == FTRACE_FL_NOTRACE)
389 return 0;
390
391 new = ftrace_call_replace(ip, FTRACE_ADDR);
392 } else
393 old = ftrace_call_replace(ip, FTRACE_ADDR);
394
395 if (enable) {
396 if (rec->flags & FTRACE_FL_ENABLED) 510 if (rec->flags & FTRACE_FL_ENABLED)
397 return 0; 511 return 0;
512
398 rec->flags |= FTRACE_FL_ENABLED; 513 rec->flags |= FTRACE_FL_ENABLED;
514
399 } else { 515 } else {
516
517 /* if record is not enabled do nothing */
400 if (!(rec->flags & FTRACE_FL_ENABLED)) 518 if (!(rec->flags & FTRACE_FL_ENABLED))
401 return 0; 519 return 0;
520
402 rec->flags &= ~FTRACE_FL_ENABLED; 521 rec->flags &= ~FTRACE_FL_ENABLED;
403 } 522 }
404 } 523 }
405 524
406 return ftrace_modify_code(ip, old, new); 525 if (rec->flags & FTRACE_FL_ENABLED)
526 return ftrace_make_call(rec, ftrace_addr);
527 else
528 return ftrace_make_nop(NULL, rec, ftrace_addr);
407} 529}
408 530
409static void ftrace_replace_code(int enable) 531static void ftrace_replace_code(int enable)
410{ 532{
411 int i, failed; 533 int i, failed;
412 unsigned char *new = NULL, *old = NULL;
413 struct dyn_ftrace *rec; 534 struct dyn_ftrace *rec;
414 struct ftrace_page *pg; 535 struct ftrace_page *pg;
415 536
416 if (enable)
417 old = ftrace_nop_replace();
418 else
419 new = ftrace_nop_replace();
420
421 for (pg = ftrace_pages_start; pg; pg = pg->next) { 537 for (pg = ftrace_pages_start; pg; pg = pg->next) {
422 for (i = 0; i < pg->index; i++) { 538 for (i = 0; i < pg->index; i++) {
423 rec = &pg->records[i]; 539 rec = &pg->records[i];
424 540
425 /* don't modify code that has already faulted */ 541 /*
426 if (rec->flags & FTRACE_FL_FAILED) 542 * Skip over free records and records that have
543 * failed.
544 */
545 if (rec->flags & FTRACE_FL_FREE ||
546 rec->flags & FTRACE_FL_FAILED)
427 continue; 547 continue;
428 548
429 /* ignore updates to this record's mcount site */ 549 /* ignore updates to this record's mcount site */
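The rewritten __ftrace_replace_code() above reduces to a small state machine over the FILTER/NOTRACE/ENABLED flags, with the final ENABLED bit deciding between ftrace_make_call() and ftrace_make_nop(). A compressed standalone model of that decision (return value 1 = patch in a call, 0 = patch in a nop, -1 = leave the site alone; the flag values are illustrative):

#include <stdio.h>

#define FL_FILTER   0x01
#define FL_ENABLED  0x02
#define FL_NOTRACE  0x04

static int desired_state(unsigned *flags, int enable, int filtered)
{
        if (*flags & FL_NOTRACE) {
                if (!(*flags & FL_ENABLED))
                        return -1;              /* not traced and not enabled: nothing to do */
                *flags &= ~FL_ENABLED;          /* not traced but enabled: turn it off */
        } else if (filtered && enable) {
                unsigned fl = *flags & (FL_FILTER | FL_ENABLED);

                if (fl == (FL_FILTER | FL_ENABLED) || !fl)
                        return -1;              /* already in the right state */
                if (fl == FL_ENABLED)
                        *flags &= ~FL_ENABLED;  /* enabled but not filtered: disable */
                else
                        *flags |= FL_ENABLED;   /* filtered but not enabled: enable */
        } else if (enable) {
                if (*flags & FL_ENABLED)
                        return -1;
                *flags |= FL_ENABLED;
        } else {
                if (!(*flags & FL_ENABLED))
                        return -1;
                *flags &= ~FL_ENABLED;
        }
        return (*flags & FL_ENABLED) ? 1 : 0;   /* call vs. nop at the mcount site */
}

int main(void)
{
        unsigned flags = FL_FILTER;

        printf("filtered record on enable -> %d\n", desired_state(&flags, 1, 1)); /* 1 */
        printf("same record on disable   -> %d\n", desired_state(&flags, 0, 1));  /* 0 */
        return 0;
}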
@@ -434,68 +554,30 @@ static void ftrace_replace_code(int enable)
434 unfreeze_record(rec); 554 unfreeze_record(rec);
435 } 555 }
436 556
437 failed = __ftrace_replace_code(rec, old, new, enable); 557 failed = __ftrace_replace_code(rec, enable);
438 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 558 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
439 rec->flags |= FTRACE_FL_FAILED; 559 rec->flags |= FTRACE_FL_FAILED;
440 if ((system_state == SYSTEM_BOOTING) || 560 if ((system_state == SYSTEM_BOOTING) ||
441 !core_kernel_text(rec->ip)) { 561 !core_kernel_text(rec->ip)) {
442 ftrace_free_rec(rec); 562 ftrace_free_rec(rec);
443 } 563 } else
564 ftrace_bug(failed, rec->ip);
444 } 565 }
445 } 566 }
446 } 567 }
447} 568}
448 569
449static void print_ip_ins(const char *fmt, unsigned char *p)
450{
451 int i;
452
453 printk(KERN_CONT "%s", fmt);
454
455 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
456 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
457}
458
459static int 570static int
460ftrace_code_disable(struct dyn_ftrace *rec) 571ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
461{ 572{
462 unsigned long ip; 573 unsigned long ip;
463 unsigned char *nop, *call;
464 int ret; 574 int ret;
465 575
466 ip = rec->ip; 576 ip = rec->ip;
467 577
468 nop = ftrace_nop_replace(); 578 ret = ftrace_make_nop(mod, rec, mcount_addr);
469 call = ftrace_call_replace(ip, mcount_addr);
470
471 ret = ftrace_modify_code(ip, call, nop);
472 if (ret) { 579 if (ret) {
473 switch (ret) { 580 ftrace_bug(ret, ip);
474 case -EFAULT:
475 FTRACE_WARN_ON_ONCE(1);
476 pr_info("ftrace faulted on modifying ");
477 print_ip_sym(ip);
478 break;
479 case -EINVAL:
480 FTRACE_WARN_ON_ONCE(1);
481 pr_info("ftrace failed to modify ");
482 print_ip_sym(ip);
483 print_ip_ins(" expected: ", call);
484 print_ip_ins(" actual: ", (unsigned char *)ip);
485 print_ip_ins(" replace: ", nop);
486 printk(KERN_CONT "\n");
487 break;
488 case -EPERM:
489 FTRACE_WARN_ON_ONCE(1);
490 pr_info("ftrace faulted on writing ");
491 print_ip_sym(ip);
492 break;
493 default:
494 FTRACE_WARN_ON_ONCE(1);
495 pr_info("ftrace faulted on unknown error ");
496 print_ip_sym(ip);
497 }
498
499 rec->flags |= FTRACE_FL_FAILED; 581 rec->flags |= FTRACE_FL_FAILED;
500 return 0; 582 return 0;
501 } 583 }
@@ -506,17 +588,19 @@ static int __ftrace_modify_code(void *data)
506{ 588{
507 int *command = data; 589 int *command = data;
508 590
509 if (*command & FTRACE_ENABLE_CALLS) { 591 if (*command & FTRACE_ENABLE_CALLS)
510 ftrace_replace_code(1); 592 ftrace_replace_code(1);
511 tracing_on = 1; 593 else if (*command & FTRACE_DISABLE_CALLS)
512 } else if (*command & FTRACE_DISABLE_CALLS) {
513 ftrace_replace_code(0); 594 ftrace_replace_code(0);
514 tracing_on = 0;
515 }
516 595
517 if (*command & FTRACE_UPDATE_TRACE_FUNC) 596 if (*command & FTRACE_UPDATE_TRACE_FUNC)
518 ftrace_update_ftrace_func(ftrace_trace_function); 597 ftrace_update_ftrace_func(ftrace_trace_function);
519 598
599 if (*command & FTRACE_START_FUNC_RET)
600 ftrace_enable_ftrace_graph_caller();
601 else if (*command & FTRACE_STOP_FUNC_RET)
602 ftrace_disable_ftrace_graph_caller();
603
520 return 0; 604 return 0;
521} 605}
522 606
@@ -526,44 +610,43 @@ static void ftrace_run_update_code(int command)
526} 610}
527 611
528static ftrace_func_t saved_ftrace_func; 612static ftrace_func_t saved_ftrace_func;
529static int ftrace_start; 613static int ftrace_start_up;
530static DEFINE_MUTEX(ftrace_start_lock);
531 614
532static void ftrace_startup(void) 615static void ftrace_startup_enable(int command)
533{ 616{
534 int command = 0;
535
536 if (unlikely(ftrace_disabled))
537 return;
538
539 mutex_lock(&ftrace_start_lock);
540 ftrace_start++;
541 if (ftrace_start == 1)
542 command |= FTRACE_ENABLE_CALLS;
543
544 if (saved_ftrace_func != ftrace_trace_function) { 617 if (saved_ftrace_func != ftrace_trace_function) {
545 saved_ftrace_func = ftrace_trace_function; 618 saved_ftrace_func = ftrace_trace_function;
546 command |= FTRACE_UPDATE_TRACE_FUNC; 619 command |= FTRACE_UPDATE_TRACE_FUNC;
547 } 620 }
548 621
549 if (!command || !ftrace_enabled) 622 if (!command || !ftrace_enabled)
550 goto out; 623 return;
551 624
552 ftrace_run_update_code(command); 625 ftrace_run_update_code(command);
553 out:
554 mutex_unlock(&ftrace_start_lock);
555} 626}
556 627
557static void ftrace_shutdown(void) 628static void ftrace_startup(int command)
558{ 629{
559 int command = 0; 630 if (unlikely(ftrace_disabled))
631 return;
632
633 mutex_lock(&ftrace_start_lock);
634 ftrace_start_up++;
635 command |= FTRACE_ENABLE_CALLS;
636
637 ftrace_startup_enable(command);
560 638
639 mutex_unlock(&ftrace_start_lock);
640}
641
642static void ftrace_shutdown(int command)
643{
561 if (unlikely(ftrace_disabled)) 644 if (unlikely(ftrace_disabled))
562 return; 645 return;
563 646
564 mutex_lock(&ftrace_start_lock); 647 mutex_lock(&ftrace_start_lock);
565 ftrace_start--; 648 ftrace_start_up--;
566 if (!ftrace_start) 649 if (!ftrace_start_up)
567 command |= FTRACE_DISABLE_CALLS; 650 command |= FTRACE_DISABLE_CALLS;
568 651
569 if (saved_ftrace_func != ftrace_trace_function) { 652 if (saved_ftrace_func != ftrace_trace_function) {
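The ftrace_start to ftrace_start_up rename above is the users counter for the patched call sites: registration bumps it and enables the calls, and ftrace_shutdown() only queues FTRACE_DISABLE_CALLS once the counter drops back to zero. A simplified standalone model of that first-user/last-user pattern (the printouts stand in for ftrace_run_update_code()):

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t start_lock = PTHREAD_MUTEX_INITIALIZER;
static int start_up;                    /* counts registered users */

static void enable_calls(void)  { printf("patch mcount sites -> calls\n"); }
static void disable_calls(void) { printf("patch mcount sites -> nops\n"); }

static void tracer_startup(void)
{
        pthread_mutex_lock(&start_lock);
        if (++start_up == 1)
                enable_calls();         /* first user does the real work */
        pthread_mutex_unlock(&start_lock);
}

static void tracer_shutdown(void)
{
        pthread_mutex_lock(&start_lock);
        if (--start_up == 0)
                disable_calls();        /* last user undoes it */
        pthread_mutex_unlock(&start_lock);
}

int main(void)
{
        tracer_startup();               /* patches */
        tracer_startup();               /* refcount only */
        tracer_shutdown();              /* refcount only */
        tracer_shutdown();              /* patches back */
        return 0;
}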
@@ -589,8 +672,8 @@ static void ftrace_startup_sysctl(void)
589 mutex_lock(&ftrace_start_lock); 672 mutex_lock(&ftrace_start_lock);
590 /* Force update next time */ 673 /* Force update next time */
591 saved_ftrace_func = NULL; 674 saved_ftrace_func = NULL;
592 /* ftrace_start is true if we want ftrace running */ 675 /* ftrace_start_up is true if we want ftrace running */
593 if (ftrace_start) 676 if (ftrace_start_up)
594 command |= FTRACE_ENABLE_CALLS; 677 command |= FTRACE_ENABLE_CALLS;
595 678
596 ftrace_run_update_code(command); 679 ftrace_run_update_code(command);
@@ -605,8 +688,8 @@ static void ftrace_shutdown_sysctl(void)
605 return; 688 return;
606 689
607 mutex_lock(&ftrace_start_lock); 690 mutex_lock(&ftrace_start_lock);
608 /* ftrace_start is true if ftrace is running */ 691 /* ftrace_start_up is true if ftrace is running */
609 if (ftrace_start) 692 if (ftrace_start_up)
610 command |= FTRACE_DISABLE_CALLS; 693 command |= FTRACE_DISABLE_CALLS;
611 694
612 ftrace_run_update_code(command); 695 ftrace_run_update_code(command);
@@ -617,7 +700,7 @@ static cycle_t ftrace_update_time;
617static unsigned long ftrace_update_cnt; 700static unsigned long ftrace_update_cnt;
618unsigned long ftrace_update_tot_cnt; 701unsigned long ftrace_update_tot_cnt;
619 702
620static int ftrace_update_code(void) 703static int ftrace_update_code(struct module *mod)
621{ 704{
622 struct dyn_ftrace *p, *t; 705 struct dyn_ftrace *p, *t;
623 cycle_t start, stop; 706 cycle_t start, stop;
@@ -634,7 +717,7 @@ static int ftrace_update_code(void)
634 list_del_init(&p->list); 717 list_del_init(&p->list);
635 718
636 /* convert record (i.e, patch mcount-call with NOP) */ 719 /* convert record (i.e, patch mcount-call with NOP) */
637 if (ftrace_code_disable(p)) { 720 if (ftrace_code_disable(mod, p)) {
638 p->flags |= FTRACE_FL_CONVERTED; 721 p->flags |= FTRACE_FL_CONVERTED;
639 ftrace_update_cnt++; 722 ftrace_update_cnt++;
640 } else 723 } else
@@ -677,7 +760,7 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
677 760
678 cnt = num_to_init / ENTRIES_PER_PAGE; 761 cnt = num_to_init / ENTRIES_PER_PAGE;
679 pr_info("ftrace: allocating %ld entries in %d pages\n", 762 pr_info("ftrace: allocating %ld entries in %d pages\n",
680 num_to_init, cnt); 763 num_to_init, cnt + 1);
681 764
682 for (i = 0; i < cnt; i++) { 765 for (i = 0; i < cnt; i++) {
683 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 766 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -702,7 +785,6 @@ enum {
702#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 785#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
703 786
704struct ftrace_iterator { 787struct ftrace_iterator {
705 loff_t pos;
706 struct ftrace_page *pg; 788 struct ftrace_page *pg;
707 unsigned idx; 789 unsigned idx;
708 unsigned flags; 790 unsigned flags;
@@ -727,6 +809,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
727 iter->pg = iter->pg->next; 809 iter->pg = iter->pg->next;
728 iter->idx = 0; 810 iter->idx = 0;
729 goto retry; 811 goto retry;
812 } else {
813 iter->idx = -1;
730 } 814 }
731 } else { 815 } else {
732 rec = &iter->pg->records[iter->idx++]; 816 rec = &iter->pg->records[iter->idx++];
@@ -738,6 +822,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
738 ((iter->flags & FTRACE_ITER_FAILURES) && 822 ((iter->flags & FTRACE_ITER_FAILURES) &&
739 !(rec->flags & FTRACE_FL_FAILED)) || 823 !(rec->flags & FTRACE_FL_FAILED)) ||
740 824
825 ((iter->flags & FTRACE_ITER_FILTER) &&
826 !(rec->flags & FTRACE_FL_FILTER)) ||
827
741 ((iter->flags & FTRACE_ITER_NOTRACE) && 828 ((iter->flags & FTRACE_ITER_NOTRACE) &&
742 !(rec->flags & FTRACE_FL_NOTRACE))) { 829 !(rec->flags & FTRACE_FL_NOTRACE))) {
743 rec = NULL; 830 rec = NULL;
@@ -746,8 +833,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
746 } 833 }
747 spin_unlock(&ftrace_lock); 834 spin_unlock(&ftrace_lock);
748 835
749 iter->pos = *pos;
750
751 return rec; 836 return rec;
752} 837}
753 838
@@ -755,16 +840,16 @@ static void *t_start(struct seq_file *m, loff_t *pos)
755{ 840{
756 struct ftrace_iterator *iter = m->private; 841 struct ftrace_iterator *iter = m->private;
757 void *p = NULL; 842 void *p = NULL;
758 loff_t l = -1;
759 843
760 if (*pos != iter->pos) { 844 if (*pos > 0) {
761 for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) 845 if (iter->idx < 0)
762 ; 846 return p;
763 } else { 847 (*pos)--;
764 l = *pos; 848 iter->idx--;
765 p = t_next(m, p, &l);
766 } 849 }
767 850
851 p = t_next(m, p, pos);
852
768 return p; 853 return p;
769} 854}
770 855
@@ -808,7 +893,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
808 return -ENOMEM; 893 return -ENOMEM;
809 894
810 iter->pg = ftrace_pages_start; 895 iter->pg = ftrace_pages_start;
811 iter->pos = -1;
812 896
813 ret = seq_open(file, &show_ftrace_seq_ops); 897 ret = seq_open(file, &show_ftrace_seq_ops);
814 if (!ret) { 898 if (!ret) {
@@ -895,7 +979,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
895 979
896 if (file->f_mode & FMODE_READ) { 980 if (file->f_mode & FMODE_READ) {
897 iter->pg = ftrace_pages_start; 981 iter->pg = ftrace_pages_start;
898 iter->pos = -1;
899 iter->flags = enable ? FTRACE_ITER_FILTER : 982 iter->flags = enable ? FTRACE_ITER_FILTER :
900 FTRACE_ITER_NOTRACE; 983 FTRACE_ITER_NOTRACE;
901 984
@@ -1186,7 +1269,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1186 1269
1187 mutex_lock(&ftrace_sysctl_lock); 1270 mutex_lock(&ftrace_sysctl_lock);
1188 mutex_lock(&ftrace_start_lock); 1271 mutex_lock(&ftrace_start_lock);
1189 if (iter->filtered && ftrace_start && ftrace_enabled) 1272 if (ftrace_start_up && ftrace_enabled)
1190 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1273 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1191 mutex_unlock(&ftrace_start_lock); 1274 mutex_unlock(&ftrace_start_lock);
1192 mutex_unlock(&ftrace_sysctl_lock); 1275 mutex_unlock(&ftrace_sysctl_lock);
@@ -1238,12 +1321,233 @@ static struct file_operations ftrace_notrace_fops = {
1238 .release = ftrace_notrace_release, 1321 .release = ftrace_notrace_release,
1239}; 1322};
1240 1323
1241static __init int ftrace_init_debugfs(void) 1324#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1325
1326static DEFINE_MUTEX(graph_lock);
1327
1328int ftrace_graph_count;
1329unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
1330
1331static void *
1332g_next(struct seq_file *m, void *v, loff_t *pos)
1242{ 1333{
1243 struct dentry *d_tracer; 1334 unsigned long *array = m->private;
1244 struct dentry *entry; 1335 int index = *pos;
1245 1336
1246 d_tracer = tracing_init_dentry(); 1337 (*pos)++;
1338
1339 if (index >= ftrace_graph_count)
1340 return NULL;
1341
1342 return &array[index];
1343}
1344
1345static void *g_start(struct seq_file *m, loff_t *pos)
1346{
1347 void *p = NULL;
1348
1349 mutex_lock(&graph_lock);
1350
1351 p = g_next(m, p, pos);
1352
1353 return p;
1354}
1355
1356static void g_stop(struct seq_file *m, void *p)
1357{
1358 mutex_unlock(&graph_lock);
1359}
1360
1361static int g_show(struct seq_file *m, void *v)
1362{
1363 unsigned long *ptr = v;
1364 char str[KSYM_SYMBOL_LEN];
1365
1366 if (!ptr)
1367 return 0;
1368
1369 kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
1370
1371 seq_printf(m, "%s\n", str);
1372
1373 return 0;
1374}
1375
1376static struct seq_operations ftrace_graph_seq_ops = {
1377 .start = g_start,
1378 .next = g_next,
1379 .stop = g_stop,
1380 .show = g_show,
1381};
1382
1383static int
1384ftrace_graph_open(struct inode *inode, struct file *file)
1385{
1386 int ret = 0;
1387
1388 if (unlikely(ftrace_disabled))
1389 return -ENODEV;
1390
1391 mutex_lock(&graph_lock);
1392 if ((file->f_mode & FMODE_WRITE) &&
1393 !(file->f_flags & O_APPEND)) {
1394 ftrace_graph_count = 0;
1395 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
1396 }
1397
1398 if (file->f_mode & FMODE_READ) {
1399 ret = seq_open(file, &ftrace_graph_seq_ops);
1400 if (!ret) {
1401 struct seq_file *m = file->private_data;
1402 m->private = ftrace_graph_funcs;
1403 }
1404 } else
1405 file->private_data = ftrace_graph_funcs;
1406 mutex_unlock(&graph_lock);
1407
1408 return ret;
1409}
1410
1411static ssize_t
1412ftrace_graph_read(struct file *file, char __user *ubuf,
1413 size_t cnt, loff_t *ppos)
1414{
1415 if (file->f_mode & FMODE_READ)
1416 return seq_read(file, ubuf, cnt, ppos);
1417 else
1418 return -EPERM;
1419}
1420
1421static int
1422ftrace_set_func(unsigned long *array, int idx, char *buffer)
1423{
1424 char str[KSYM_SYMBOL_LEN];
1425 struct dyn_ftrace *rec;
1426 struct ftrace_page *pg;
1427 int found = 0;
1428 int i, j;
1429
1430 if (ftrace_disabled)
1431 return -ENODEV;
1432
1433 /* should not be called from interrupt context */
1434 spin_lock(&ftrace_lock);
1435
1436 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1437 for (i = 0; i < pg->index; i++) {
1438 rec = &pg->records[i];
1439
1440 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
1441 continue;
1442
1443 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1444 if (strcmp(str, buffer) == 0) {
1445 found = 1;
1446 for (j = 0; j < idx; j++)
1447 if (array[j] == rec->ip) {
1448 found = 0;
1449 break;
1450 }
1451 if (found)
1452 array[idx] = rec->ip;
1453 break;
1454 }
1455 }
1456 }
1457 spin_unlock(&ftrace_lock);
1458
1459 return found ? 0 : -EINVAL;
1460}
1461
1462static ssize_t
1463ftrace_graph_write(struct file *file, const char __user *ubuf,
1464 size_t cnt, loff_t *ppos)
1465{
1466 unsigned char buffer[FTRACE_BUFF_MAX+1];
1467 unsigned long *array;
1468 size_t read = 0;
1469 ssize_t ret;
1470 int index = 0;
1471 char ch;
1472
1473 if (!cnt || cnt < 0)
1474 return 0;
1475
1476 mutex_lock(&graph_lock);
1477
1478 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
1479 ret = -EBUSY;
1480 goto out;
1481 }
1482
1483 if (file->f_mode & FMODE_READ) {
1484 struct seq_file *m = file->private_data;
1485 array = m->private;
1486 } else
1487 array = file->private_data;
1488
1489 ret = get_user(ch, ubuf++);
1490 if (ret)
1491 goto out;
1492 read++;
1493 cnt--;
1494
1495 /* skip white space */
1496 while (cnt && isspace(ch)) {
1497 ret = get_user(ch, ubuf++);
1498 if (ret)
1499 goto out;
1500 read++;
1501 cnt--;
1502 }
1503
1504 if (isspace(ch)) {
1505 *ppos += read;
1506 ret = read;
1507 goto out;
1508 }
1509
1510 while (cnt && !isspace(ch)) {
1511 if (index < FTRACE_BUFF_MAX)
1512 buffer[index++] = ch;
1513 else {
1514 ret = -EINVAL;
1515 goto out;
1516 }
1517 ret = get_user(ch, ubuf++);
1518 if (ret)
1519 goto out;
1520 read++;
1521 cnt--;
1522 }
1523 buffer[index] = 0;
1524
1525 /* we allow only one at a time */
1526 ret = ftrace_set_func(array, ftrace_graph_count, buffer);
1527 if (ret)
1528 goto out;
1529
1530 ftrace_graph_count++;
1531
1532 file->f_pos += read;
1533
1534 ret = read;
1535 out:
1536 mutex_unlock(&graph_lock);
1537
1538 return ret;
1539}
1540
1541static const struct file_operations ftrace_graph_fops = {
1542 .open = ftrace_graph_open,
1543 .read = ftrace_graph_read,
1544 .write = ftrace_graph_write,
1545};
1546#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1547
1548static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
1549{
1550 struct dentry *entry;
1247 1551
1248 entry = debugfs_create_file("available_filter_functions", 0444, 1552 entry = debugfs_create_file("available_filter_functions", 0444,
1249 d_tracer, NULL, &ftrace_avail_fops); 1553 d_tracer, NULL, &ftrace_avail_fops);
@@ -1268,12 +1572,20 @@ static __init int ftrace_init_debugfs(void)
1268 pr_warning("Could not create debugfs " 1572 pr_warning("Could not create debugfs "
1269 "'set_ftrace_notrace' entry\n"); 1573 "'set_ftrace_notrace' entry\n");
1270 1574
1575#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1576 entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
1577 NULL,
1578 &ftrace_graph_fops);
1579 if (!entry)
1580 pr_warning("Could not create debugfs "
1581 "'set_graph_function' entry\n");
1582#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1583
1271 return 0; 1584 return 0;
1272} 1585}
1273 1586
1274fs_initcall(ftrace_init_debugfs); 1587static int ftrace_convert_nops(struct module *mod,
1275 1588 unsigned long *start,
1276static int ftrace_convert_nops(unsigned long *start,
1277 unsigned long *end) 1589 unsigned long *end)
1278{ 1590{
1279 unsigned long *p; 1591 unsigned long *p;
@@ -1284,23 +1596,32 @@ static int ftrace_convert_nops(unsigned long *start,
1284 p = start; 1596 p = start;
1285 while (p < end) { 1597 while (p < end) {
1286 addr = ftrace_call_adjust(*p++); 1598 addr = ftrace_call_adjust(*p++);
1599 /*
1600 * Some architecture linkers will pad between
1601 * the different mcount_loc sections of different
1602 * object files to satisfy alignments.
1603 * Skip any NULL pointers.
1604 */
1605 if (!addr)
1606 continue;
1287 ftrace_record_ip(addr); 1607 ftrace_record_ip(addr);
1288 } 1608 }
1289 1609
1290 /* disable interrupts to prevent kstop machine */ 1610 /* disable interrupts to prevent kstop machine */
1291 local_irq_save(flags); 1611 local_irq_save(flags);
1292 ftrace_update_code(); 1612 ftrace_update_code(mod);
1293 local_irq_restore(flags); 1613 local_irq_restore(flags);
1294 mutex_unlock(&ftrace_start_lock); 1614 mutex_unlock(&ftrace_start_lock);
1295 1615
1296 return 0; 1616 return 0;
1297} 1617}
1298 1618
1299void ftrace_init_module(unsigned long *start, unsigned long *end) 1619void ftrace_init_module(struct module *mod,
1620 unsigned long *start, unsigned long *end)
1300{ 1621{
1301 if (ftrace_disabled || start == end) 1622 if (ftrace_disabled || start == end)
1302 return; 1623 return;
1303 ftrace_convert_nops(start, end); 1624 ftrace_convert_nops(mod, start, end);
1304} 1625}
1305 1626
1306extern unsigned long __start_mcount_loc[]; 1627extern unsigned long __start_mcount_loc[];
@@ -1330,7 +1651,8 @@ void __init ftrace_init(void)
1330 1651
1331 last_ftrace_enabled = ftrace_enabled = 1; 1652 last_ftrace_enabled = ftrace_enabled = 1;
1332 1653
1333 ret = ftrace_convert_nops(__start_mcount_loc, 1654 ret = ftrace_convert_nops(NULL,
1655 __start_mcount_loc,
1334 __stop_mcount_loc); 1656 __stop_mcount_loc);
1335 1657
1336 return; 1658 return;
@@ -1347,12 +1669,186 @@ static int __init ftrace_nodyn_init(void)
1347} 1669}
1348device_initcall(ftrace_nodyn_init); 1670device_initcall(ftrace_nodyn_init);
1349 1671
1350# define ftrace_startup() do { } while (0) 1672static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
1351# define ftrace_shutdown() do { } while (0) 1673static inline void ftrace_startup_enable(int command) { }
1674/* Keep as macros so we do not need to define the commands */
1675# define ftrace_startup(command) do { } while (0)
1676# define ftrace_shutdown(command) do { } while (0)
1352# define ftrace_startup_sysctl() do { } while (0) 1677# define ftrace_startup_sysctl() do { } while (0)
1353# define ftrace_shutdown_sysctl() do { } while (0) 1678# define ftrace_shutdown_sysctl() do { } while (0)
1354#endif /* CONFIG_DYNAMIC_FTRACE */ 1679#endif /* CONFIG_DYNAMIC_FTRACE */
1355 1680
1681static ssize_t
1682ftrace_pid_read(struct file *file, char __user *ubuf,
1683 size_t cnt, loff_t *ppos)
1684{
1685 char buf[64];
1686 int r;
1687
1688 if (ftrace_pid_trace == ftrace_swapper_pid)
1689 r = sprintf(buf, "swapper tasks\n");
1690 else if (ftrace_pid_trace)
1691 r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
1692 else
1693 r = sprintf(buf, "no pid\n");
1694
1695 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1696}
1697
1698static void clear_ftrace_swapper(void)
1699{
1700 struct task_struct *p;
1701 int cpu;
1702
1703 get_online_cpus();
1704 for_each_online_cpu(cpu) {
1705 p = idle_task(cpu);
1706 clear_tsk_trace_trace(p);
1707 }
1708 put_online_cpus();
1709}
1710
1711static void set_ftrace_swapper(void)
1712{
1713 struct task_struct *p;
1714 int cpu;
1715
1716 get_online_cpus();
1717 for_each_online_cpu(cpu) {
1718 p = idle_task(cpu);
1719 set_tsk_trace_trace(p);
1720 }
1721 put_online_cpus();
1722}
1723
1724static void clear_ftrace_pid(struct pid *pid)
1725{
1726 struct task_struct *p;
1727
1728 do_each_pid_task(pid, PIDTYPE_PID, p) {
1729 clear_tsk_trace_trace(p);
1730 } while_each_pid_task(pid, PIDTYPE_PID, p);
1731 put_pid(pid);
1732}
1733
1734static void set_ftrace_pid(struct pid *pid)
1735{
1736 struct task_struct *p;
1737
1738 do_each_pid_task(pid, PIDTYPE_PID, p) {
1739 set_tsk_trace_trace(p);
1740 } while_each_pid_task(pid, PIDTYPE_PID, p);
1741}
1742
1743static void clear_ftrace_pid_task(struct pid **pid)
1744{
1745 if (*pid == ftrace_swapper_pid)
1746 clear_ftrace_swapper();
1747 else
1748 clear_ftrace_pid(*pid);
1749
1750 *pid = NULL;
1751}
1752
1753static void set_ftrace_pid_task(struct pid *pid)
1754{
1755 if (pid == ftrace_swapper_pid)
1756 set_ftrace_swapper();
1757 else
1758 set_ftrace_pid(pid);
1759}
1760
1761static ssize_t
1762ftrace_pid_write(struct file *filp, const char __user *ubuf,
1763 size_t cnt, loff_t *ppos)
1764{
1765 struct pid *pid;
1766 char buf[64];
1767 long val;
1768 int ret;
1769
1770 if (cnt >= sizeof(buf))
1771 return -EINVAL;
1772
1773 if (copy_from_user(&buf, ubuf, cnt))
1774 return -EFAULT;
1775
1776 buf[cnt] = 0;
1777
1778 ret = strict_strtol(buf, 10, &val);
1779 if (ret < 0)
1780 return ret;
1781
1782 mutex_lock(&ftrace_start_lock);
1783 if (val < 0) {
1784 /* disable pid tracing */
1785 if (!ftrace_pid_trace)
1786 goto out;
1787
1788 clear_ftrace_pid_task(&ftrace_pid_trace);
1789
1790 } else {
1791 /* swapper task is special */
1792 if (!val) {
1793 pid = ftrace_swapper_pid;
1794 if (pid == ftrace_pid_trace)
1795 goto out;
1796 } else {
1797 pid = find_get_pid(val);
1798
1799 if (pid == ftrace_pid_trace) {
1800 put_pid(pid);
1801 goto out;
1802 }
1803 }
1804
1805 if (ftrace_pid_trace)
1806 clear_ftrace_pid_task(&ftrace_pid_trace);
1807
1808 if (!pid)
1809 goto out;
1810
1811 ftrace_pid_trace = pid;
1812
1813 set_ftrace_pid_task(ftrace_pid_trace);
1814 }
1815
1816 /* update the function call */
1817 ftrace_update_pid_func();
1818 ftrace_startup_enable(0);
1819
1820 out:
1821 mutex_unlock(&ftrace_start_lock);
1822
1823 return cnt;
1824}
1825
1826static struct file_operations ftrace_pid_fops = {
1827 .read = ftrace_pid_read,
1828 .write = ftrace_pid_write,
1829};
1830
1831static __init int ftrace_init_debugfs(void)
1832{
1833 struct dentry *d_tracer;
1834 struct dentry *entry;
1835
1836 d_tracer = tracing_init_dentry();
1837 if (!d_tracer)
1838 return 0;
1839
1840 ftrace_init_dyn_debugfs(d_tracer);
1841
1842 entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
1843 NULL, &ftrace_pid_fops);
1844 if (!entry)
1845 pr_warning("Could not create debugfs "
1846 "'set_ftrace_pid' entry\n");
1847 return 0;
1848}
1849
1850fs_initcall(ftrace_init_debugfs);
1851
1356/** 1852/**
1357 * ftrace_kill - kill ftrace 1853 * ftrace_kill - kill ftrace
1358 * 1854 *
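Per ftrace_pid_write() above, the new set_ftrace_pid file accepts a decimal pid, with 0 selecting the idle (swapper) tasks and any negative value clearing the filter. A userspace sketch that selects a pid; the debugfs mount point is an assumption (the Kconfig help earlier in this patch refers to the tracing directory simply as /debugfs/tracing):

#include <stdio.h>

int main(void)
{
        /* mount point assumed; adjust to wherever debugfs is mounted */
        FILE *f = fopen("/debugfs/tracing/set_ftrace_pid", "w");

        if (!f)
                return 1;               /* debugfs not mounted or tracer not built in */
        fprintf(f, "%d\n", 1234);       /* trace only pid 1234 (illustrative pid) */
        /* writing "-1" instead would clear the filter again */
        fclose(f);
        return 0;
}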
@@ -1386,10 +1882,11 @@ int register_ftrace_function(struct ftrace_ops *ops)
1386 return -1; 1882 return -1;
1387 1883
1388 mutex_lock(&ftrace_sysctl_lock); 1884 mutex_lock(&ftrace_sysctl_lock);
1885
1389 ret = __register_ftrace_function(ops); 1886 ret = __register_ftrace_function(ops);
1390 ftrace_startup(); 1887 ftrace_startup(0);
1391 mutex_unlock(&ftrace_sysctl_lock);
1392 1888
1889 mutex_unlock(&ftrace_sysctl_lock);
1393 return ret; 1890 return ret;
1394} 1891}
1395 1892
@@ -1405,7 +1902,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
1405 1902
1406 mutex_lock(&ftrace_sysctl_lock); 1903 mutex_lock(&ftrace_sysctl_lock);
1407 ret = __unregister_ftrace_function(ops); 1904 ret = __unregister_ftrace_function(ops);
1408 ftrace_shutdown(); 1905 ftrace_shutdown(0);
1409 mutex_unlock(&ftrace_sysctl_lock); 1906 mutex_unlock(&ftrace_sysctl_lock);
1410 1907
1411 return ret; 1908 return ret;
@@ -1454,3 +1951,153 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1454 return ret; 1951 return ret;
1455} 1952}
1456 1953
1954#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1955
1956static atomic_t ftrace_graph_active;
1957
1958int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
1959{
1960 return 0;
1961}
1962
1963/* The callbacks that hook a function */
1964trace_func_graph_ret_t ftrace_graph_return =
1965 (trace_func_graph_ret_t)ftrace_stub;
1966trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
1967
1968/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
1969static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
1970{
1971 int i;
1972 int ret = 0;
1973 unsigned long flags;
1974 int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
1975 struct task_struct *g, *t;
1976
1977 for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
1978 ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
1979 * sizeof(struct ftrace_ret_stack),
1980 GFP_KERNEL);
1981 if (!ret_stack_list[i]) {
1982 start = 0;
1983 end = i;
1984 ret = -ENOMEM;
1985 goto free;
1986 }
1987 }
1988
1989 read_lock_irqsave(&tasklist_lock, flags);
1990 do_each_thread(g, t) {
1991 if (start == end) {
1992 ret = -EAGAIN;
1993 goto unlock;
1994 }
1995
1996 if (t->ret_stack == NULL) {
1997 t->curr_ret_stack = -1;
1998 /* Make sure IRQs see the -1 first: */
1999 barrier();
2000 t->ret_stack = ret_stack_list[start++];
2001 atomic_set(&t->tracing_graph_pause, 0);
2002 atomic_set(&t->trace_overrun, 0);
2003 }
2004 } while_each_thread(g, t);
2005
2006unlock:
2007 read_unlock_irqrestore(&tasklist_lock, flags);
2008free:
2009 for (i = start; i < end; i++)
2010 kfree(ret_stack_list[i]);
2011 return ret;
2012}
2013
2014/* Allocate a return stack for each task */
2015static int start_graph_tracing(void)
2016{
2017 struct ftrace_ret_stack **ret_stack_list;
2018 int ret;
2019
2020 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
2021 sizeof(struct ftrace_ret_stack *),
2022 GFP_KERNEL);
2023
2024 if (!ret_stack_list)
2025 return -ENOMEM;
2026
2027 do {
2028 ret = alloc_retstack_tasklist(ret_stack_list);
2029 } while (ret == -EAGAIN);
2030
2031 kfree(ret_stack_list);
2032 return ret;
2033}
2034
2035int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2036 trace_func_graph_ent_t entryfunc)
2037{
2038 int ret = 0;
2039
2040 mutex_lock(&ftrace_sysctl_lock);
2041
2042 atomic_inc(&ftrace_graph_active);
2043 ret = start_graph_tracing();
2044 if (ret) {
2045 atomic_dec(&ftrace_graph_active);
2046 goto out;
2047 }
2048
2049 ftrace_graph_return = retfunc;
2050 ftrace_graph_entry = entryfunc;
2051
2052 ftrace_startup(FTRACE_START_FUNC_RET);
2053
2054out:
2055 mutex_unlock(&ftrace_sysctl_lock);
2056 return ret;
2057}
2058
2059void unregister_ftrace_graph(void)
2060{
2061 mutex_lock(&ftrace_sysctl_lock);
2062
2063 atomic_dec(&ftrace_graph_active);
2064 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2065 ftrace_graph_entry = ftrace_graph_entry_stub;
2066 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
2067
2068 mutex_unlock(&ftrace_sysctl_lock);
2069}
2070
2071/* Allocate a return stack for newly created task */
2072void ftrace_graph_init_task(struct task_struct *t)
2073{
2074 if (atomic_read(&ftrace_graph_active)) {
2075 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
2076 * sizeof(struct ftrace_ret_stack),
2077 GFP_KERNEL);
2078 if (!t->ret_stack)
2079 return;
2080 t->curr_ret_stack = -1;
2081 atomic_set(&t->tracing_graph_pause, 0);
2082 atomic_set(&t->trace_overrun, 0);
2083 } else
2084 t->ret_stack = NULL;
2085}
2086
2087void ftrace_graph_exit_task(struct task_struct *t)
2088{
2089 struct ftrace_ret_stack *ret_stack = t->ret_stack;
2090
2091 t->ret_stack = NULL;
2092 /* NULL must become visible to IRQs before we free it: */
2093 barrier();
2094
2095 kfree(ret_stack);
2096}
2097
2098void ftrace_graph_stop(void)
2099{
2100 ftrace_stop();
2101}
2102#endif
2103
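alloc_retstack_tasklist() above allocates a fixed batch of return stacks before taking tasklist_lock, hands them out under the lock, and reports -EAGAIN so the caller loops with a fresh batch if tasks were left over. A compressed userspace model of that allocate-outside, assign-inside, retry-on-shortage loop (all names and sizes are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define BATCH 32

/* Returns 0 when every task has a stack, -1 ("-EAGAIN") when the batch ran out. */
static int assign_stacks(int ntasks, int *covered)
{
        void *batch[BATCH];
        int i, used = 0;

        for (i = 0; i < BATCH; i++)
                batch[i] = malloc(64);          /* allocated outside the "lock" */

        /* pretend we now hold the task-list lock and hand stacks to tasks */
        while (*covered < ntasks) {
                if (used == BATCH)
                        return -1;              /* shortage: caller retries with a new batch */
                (*covered)++;                   /* batch[used] now belongs to a task */
                used++;
        }

        for (i = used; i < BATCH; i++)
                free(batch[i]);                 /* leftovers go back */
        return 0;
}

int main(void)
{
        int covered = 0, ntasks = 100;

        while (assign_stacks(ntasks, &covered) != 0)
                ;                               /* loop exactly like start_graph_tracing() */
        printf("covered %d tasks\n", covered);
        return 0;
}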
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3f3380638646..7f69cfeaadf7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -16,14 +16,100 @@
16#include <linux/list.h> 16#include <linux/list.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18 18
19#include "trace.h"
20
21/*
22 * A fast way to enable or disable all ring buffers is to
23 * call tracing_on or tracing_off. Turning off the ring buffers
24 * prevents all ring buffers from being recorded to.
25 * Turning this switch on, makes it OK to write to the
26 * ring buffer, if the ring buffer is enabled itself.
27 *
28 * There are three layers that must be on in order to write
29 * to the ring buffer.
30 *
31 * 1) This global flag must be set.
32 * 2) The ring buffer must be enabled for recording.
33 * 3) The per cpu buffer must be enabled for recording.
34 *
35 * In case of an anomaly, this global flag has a bit set that
36 * will permanently disable all ring buffers.
37 */
38
39/*
40 * Global flag to disable all recording to ring buffers
41 * This has two bits: ON, DISABLED
42 *
43 * ON DISABLED
44 * ---- ----------
45 * 0 0 : ring buffers are off
46 * 1 0 : ring buffers are on
47 * X 1 : ring buffers are permanently disabled
48 */
49
50enum {
51 RB_BUFFERS_ON_BIT = 0,
52 RB_BUFFERS_DISABLED_BIT = 1,
53};
54
55enum {
56 RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
57 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
58};
59
60static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
61
62/**
63 * tracing_on - enable all tracing buffers
64 *
65 * This function enables all tracing buffers that may have been
66 * disabled with tracing_off.
67 */
68void tracing_on(void)
69{
70 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
71}
72
73/**
74 * tracing_off - turn off all tracing buffers
75 *
76 * This function stops all tracing buffers from recording data.
77 * It does not disable any overhead the tracers themselves may
78 * be causing. This function simply causes all recording to
79 * the ring buffers to fail.
80 */
81void tracing_off(void)
82{
83 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
84}
85
86/**
87 * tracing_off_permanent - permanently disable ring buffers
88 *
89 * This function, once called, will disable all ring buffers
90 * permanenty.
91 */
92void tracing_off_permanent(void)
93{
94 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
95}
96
97#include "trace.h"
98
19/* Up this if you want to test the TIME_EXTENTS and normalization */ 99/* Up this if you want to test the TIME_EXTENTS and normalization */
20#define DEBUG_SHIFT 0 100#define DEBUG_SHIFT 0
21 101
22/* FIXME!!! */ 102/* FIXME!!! */
23u64 ring_buffer_time_stamp(int cpu) 103u64 ring_buffer_time_stamp(int cpu)
24{ 104{
105 u64 time;
106
107 preempt_disable_notrace();
25 /* shift to debug/test normalization and TIME_EXTENTS */ 108 /* shift to debug/test normalization and TIME_EXTENTS */
26 return sched_clock() << DEBUG_SHIFT; 109 time = sched_clock() << DEBUG_SHIFT;
110 preempt_enable_notrace();
111
112 return time;
27} 113}
28 114
29void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) 115void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
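The comment block above defines a two-bit global word: bit 0 is the on/off toggle flipped by tracing_on()/tracing_off(), and bit 1 is a one-way latch set by tracing_off_permanent(). Recording only makes sense when the word is exactly RB_BUFFERS_ON; a tiny standalone model of the three states (the recording_allowed() check is my paraphrase of how writers consult the flag, not a function from this patch):

#include <stdio.h>

enum {
        RB_BUFFERS_ON       = 1 << 0,
        RB_BUFFERS_DISABLED = 1 << 1,
};

static unsigned long ring_buffer_flags = RB_BUFFERS_ON;

static int recording_allowed(void)
{
        return ring_buffer_flags == RB_BUFFERS_ON;      /* both "off" and "disabled" fail */
}

int main(void)
{
        printf("on:        %d\n", recording_allowed()); /* 1 */
        ring_buffer_flags &= ~RB_BUFFERS_ON;            /* tracing_off() */
        printf("off:       %d\n", recording_allowed()); /* 0 */
        ring_buffer_flags |= RB_BUFFERS_ON;             /* tracing_on() */
        ring_buffer_flags |= RB_BUFFERS_DISABLED;       /* tracing_off_permanent() */
        printf("disabled:  %d\n", recording_allowed()); /* 0, no matter what ON says */
        return 0;
}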
@@ -109,20 +195,24 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
109#define TS_MASK ((1ULL << TS_SHIFT) - 1) 195#define TS_MASK ((1ULL << TS_SHIFT) - 1)
110#define TS_DELTA_TEST (~TS_MASK) 196#define TS_DELTA_TEST (~TS_MASK)
111 197
112/* 198struct buffer_data_page {
113 * This hack stolen from mm/slob.c.
114 * We can store per page timing information in the page frame of the page.
115 * Thanks to Peter Zijlstra for suggesting this idea.
116 */
117struct buffer_page {
118 u64 time_stamp; /* page time stamp */ 199 u64 time_stamp; /* page time stamp */
119 local_t write; /* index for next write */
120 local_t commit; /* write committed index */ 200 local_t commit; /* write committed index */
201 unsigned char data[]; /* data of buffer page */
202};
203
204struct buffer_page {
205 local_t write; /* index for next write */
121 unsigned read; /* index for next read */ 206 unsigned read; /* index for next read */
122 struct list_head list; /* list of free pages */ 207 struct list_head list; /* list of free pages */
123 void *page; /* Actual data page */ 208 struct buffer_data_page *page; /* Actual data page */
124}; 209};
125 210
211static void rb_init_page(struct buffer_data_page *bpage)
212{
213 local_set(&bpage->commit, 0);
214}
215
126/* 216/*
127 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing 217 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
128 * this issue out. 218 * this issue out.
@@ -144,7 +234,7 @@ static inline int test_time_stamp(u64 delta)
144 return 0; 234 return 0;
145} 235}
146 236
147#define BUF_PAGE_SIZE PAGE_SIZE 237#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
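
Since the timestamp and commit counter now live at the start of the data page itself, the usable payload per page shrinks by the size of that header. A small userspace sketch of the arithmetic, assuming a 4096-byte page and using plain C types as stand-ins for the kernel's u64 and local_t:

#include <stdio.h>

typedef long local_t;			/* stand-in for the kernel's local_t */

struct buffer_data_page {
	unsigned long long time_stamp;	/* page time stamp */
	local_t commit;			/* committed index */
	unsigned char data[];		/* payload starts here */
};

#define MY_PAGE_SIZE	 4096UL
#define MY_BUF_PAGE_SIZE (MY_PAGE_SIZE - sizeof(struct buffer_data_page))

int main(void)
{
	printf("header: %zu bytes, payload: %lu bytes per page\n",
	       sizeof(struct buffer_data_page), MY_BUF_PAGE_SIZE);
	return 0;
}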
148 238
149/* 239/*
150 * head_page == tail_page && head == tail then buffer is empty. 240 * head_page == tail_page && head == tail then buffer is empty.
@@ -152,7 +242,8 @@ static inline int test_time_stamp(u64 delta)
152struct ring_buffer_per_cpu { 242struct ring_buffer_per_cpu {
153 int cpu; 243 int cpu;
154 struct ring_buffer *buffer; 244 struct ring_buffer *buffer;
155 spinlock_t lock; 245 spinlock_t reader_lock; /* serialize readers */
246 raw_spinlock_t lock;
156 struct lock_class_key lock_key; 247 struct lock_class_key lock_key;
157 struct list_head pages; 248 struct list_head pages;
158 struct buffer_page *head_page; /* read from head */ 249 struct buffer_page *head_page; /* read from head */
@@ -186,32 +277,16 @@ struct ring_buffer_iter {
186 u64 read_stamp; 277 u64 read_stamp;
187}; 278};
188 279
280/* buffer may be either ring_buffer or ring_buffer_per_cpu */
189#define RB_WARN_ON(buffer, cond) \ 281#define RB_WARN_ON(buffer, cond) \
190 do { \ 282 ({ \
191 if (unlikely(cond)) { \ 283 int _____ret = unlikely(cond); \
192 atomic_inc(&buffer->record_disabled); \ 284 if (_____ret) { \
193 WARN_ON(1); \
194 } \
195 } while (0)
196
197#define RB_WARN_ON_RET(buffer, cond) \
198 do { \
199 if (unlikely(cond)) { \
200 atomic_inc(&buffer->record_disabled); \
201 WARN_ON(1); \
202 return -1; \
203 } \
204 } while (0)
205
206#define RB_WARN_ON_ONCE(buffer, cond) \
207 do { \
208 static int once; \
209 if (unlikely(cond) && !once) { \
210 once++; \
211 atomic_inc(&buffer->record_disabled); \ 285 atomic_inc(&buffer->record_disabled); \
212 WARN_ON(1); \ 286 WARN_ON(1); \
213 } \ 287 } \
214 } while (0) 288 _____ret; \
289 })
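
The rewritten RB_WARN_ON() uses a GNU C statement expression so it both reports the anomaly and evaluates to the condition, which lets each caller choose its own bail-out path and makes the old _RET and _ONCE variants unnecessary. A userspace sketch of the same pattern (the macro and helper names here are illustrative, not kernel API):

#include <stdio.h>

#define WARN_ON_COND(cond)						\
	({								\
		int _____ret = !!(cond);				\
		if (_____ret)						\
			fprintf(stderr, "warning: %s\n", #cond);	\
		_____ret;						\
	})

static int check_list(int broken)
{
	if (WARN_ON_COND(broken))	/* caller decides how to bail out */
		return -1;
	return 0;
}

int main(void)
{
	printf("ok: %d  broken: %d\n", check_list(0), check_list(1));
	return 0;
}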
215 290
216/** 291/**
217 * check_pages - integrity check of buffer pages 292 * check_pages - integrity check of buffer pages
@@ -223,16 +298,20 @@ struct ring_buffer_iter {
223static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 298static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
224{ 299{
225 struct list_head *head = &cpu_buffer->pages; 300 struct list_head *head = &cpu_buffer->pages;
226 struct buffer_page *page, *tmp; 301 struct buffer_page *bpage, *tmp;
227 302
228 RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); 303 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
229 RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); 304 return -1;
305 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
306 return -1;
230 307
231 list_for_each_entry_safe(page, tmp, head, list) { 308 list_for_each_entry_safe(bpage, tmp, head, list) {
232 RB_WARN_ON_RET(cpu_buffer, 309 if (RB_WARN_ON(cpu_buffer,
233 page->list.next->prev != &page->list); 310 bpage->list.next->prev != &bpage->list))
234 RB_WARN_ON_RET(cpu_buffer, 311 return -1;
235 page->list.prev->next != &page->list); 312 if (RB_WARN_ON(cpu_buffer,
313 bpage->list.prev->next != &bpage->list))
314 return -1;
236 } 315 }
237 316
238 return 0; 317 return 0;
@@ -242,22 +321,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
242 unsigned nr_pages) 321 unsigned nr_pages)
243{ 322{
244 struct list_head *head = &cpu_buffer->pages; 323 struct list_head *head = &cpu_buffer->pages;
245 struct buffer_page *page, *tmp; 324 struct buffer_page *bpage, *tmp;
246 unsigned long addr; 325 unsigned long addr;
247 LIST_HEAD(pages); 326 LIST_HEAD(pages);
248 unsigned i; 327 unsigned i;
249 328
250 for (i = 0; i < nr_pages; i++) { 329 for (i = 0; i < nr_pages; i++) {
251 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 330 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
252 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 331 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
253 if (!page) 332 if (!bpage)
254 goto free_pages; 333 goto free_pages;
255 list_add(&page->list, &pages); 334 list_add(&bpage->list, &pages);
256 335
257 addr = __get_free_page(GFP_KERNEL); 336 addr = __get_free_page(GFP_KERNEL);
258 if (!addr) 337 if (!addr)
259 goto free_pages; 338 goto free_pages;
260 page->page = (void *)addr; 339 bpage->page = (void *)addr;
340 rb_init_page(bpage->page);
261 } 341 }
262 342
263 list_splice(&pages, head); 343 list_splice(&pages, head);
@@ -267,9 +347,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
267 return 0; 347 return 0;
268 348
269 free_pages: 349 free_pages:
270 list_for_each_entry_safe(page, tmp, &pages, list) { 350 list_for_each_entry_safe(bpage, tmp, &pages, list) {
271 list_del_init(&page->list); 351 list_del_init(&bpage->list);
272 free_buffer_page(page); 352 free_buffer_page(bpage);
273 } 353 }
274 return -ENOMEM; 354 return -ENOMEM;
275} 355}
@@ -278,7 +358,7 @@ static struct ring_buffer_per_cpu *
278rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) 358rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
279{ 359{
280 struct ring_buffer_per_cpu *cpu_buffer; 360 struct ring_buffer_per_cpu *cpu_buffer;
281 struct buffer_page *page; 361 struct buffer_page *bpage;
282 unsigned long addr; 362 unsigned long addr;
283 int ret; 363 int ret;
284 364
@@ -289,19 +369,21 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
289 369
290 cpu_buffer->cpu = cpu; 370 cpu_buffer->cpu = cpu;
291 cpu_buffer->buffer = buffer; 371 cpu_buffer->buffer = buffer;
292 spin_lock_init(&cpu_buffer->lock); 372 spin_lock_init(&cpu_buffer->reader_lock);
373 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
293 INIT_LIST_HEAD(&cpu_buffer->pages); 374 INIT_LIST_HEAD(&cpu_buffer->pages);
294 375
295 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 376 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
296 GFP_KERNEL, cpu_to_node(cpu)); 377 GFP_KERNEL, cpu_to_node(cpu));
297 if (!page) 378 if (!bpage)
298 goto fail_free_buffer; 379 goto fail_free_buffer;
299 380
300 cpu_buffer->reader_page = page; 381 cpu_buffer->reader_page = bpage;
301 addr = __get_free_page(GFP_KERNEL); 382 addr = __get_free_page(GFP_KERNEL);
302 if (!addr) 383 if (!addr)
303 goto fail_free_reader; 384 goto fail_free_reader;
304 page->page = (void *)addr; 385 bpage->page = (void *)addr;
386 rb_init_page(bpage->page);
305 387
306 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 388 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
307 389
@@ -326,14 +408,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
326static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 408static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
327{ 409{
328 struct list_head *head = &cpu_buffer->pages; 410 struct list_head *head = &cpu_buffer->pages;
329 struct buffer_page *page, *tmp; 411 struct buffer_page *bpage, *tmp;
330 412
331 list_del_init(&cpu_buffer->reader_page->list); 413 list_del_init(&cpu_buffer->reader_page->list);
332 free_buffer_page(cpu_buffer->reader_page); 414 free_buffer_page(cpu_buffer->reader_page);
333 415
334 list_for_each_entry_safe(page, tmp, head, list) { 416 list_for_each_entry_safe(bpage, tmp, head, list) {
335 list_del_init(&page->list); 417 list_del_init(&bpage->list);
336 free_buffer_page(page); 418 free_buffer_page(bpage);
337 } 419 }
338 kfree(cpu_buffer); 420 kfree(cpu_buffer);
339} 421}
@@ -430,7 +512,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
430static void 512static void
431rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) 513rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
432{ 514{
433 struct buffer_page *page; 515 struct buffer_page *bpage;
434 struct list_head *p; 516 struct list_head *p;
435 unsigned i; 517 unsigned i;
436 518
@@ -438,13 +520,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
438 synchronize_sched(); 520 synchronize_sched();
439 521
440 for (i = 0; i < nr_pages; i++) { 522 for (i = 0; i < nr_pages; i++) {
441 BUG_ON(list_empty(&cpu_buffer->pages)); 523 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
524 return;
442 p = cpu_buffer->pages.next; 525 p = cpu_buffer->pages.next;
443 page = list_entry(p, struct buffer_page, list); 526 bpage = list_entry(p, struct buffer_page, list);
444 list_del_init(&page->list); 527 list_del_init(&bpage->list);
445 free_buffer_page(page); 528 free_buffer_page(bpage);
446 } 529 }
447 BUG_ON(list_empty(&cpu_buffer->pages)); 530 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
531 return;
448 532
449 rb_reset_cpu(cpu_buffer); 533 rb_reset_cpu(cpu_buffer);
450 534
@@ -458,7 +542,7 @@ static void
458rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, 542rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
459 struct list_head *pages, unsigned nr_pages) 543 struct list_head *pages, unsigned nr_pages)
460{ 544{
461 struct buffer_page *page; 545 struct buffer_page *bpage;
462 struct list_head *p; 546 struct list_head *p;
463 unsigned i; 547 unsigned i;
464 548
@@ -466,11 +550,12 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
466 synchronize_sched(); 550 synchronize_sched();
467 551
468 for (i = 0; i < nr_pages; i++) { 552 for (i = 0; i < nr_pages; i++) {
469 BUG_ON(list_empty(pages)); 553 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
554 return;
470 p = pages->next; 555 p = pages->next;
471 page = list_entry(p, struct buffer_page, list); 556 bpage = list_entry(p, struct buffer_page, list);
472 list_del_init(&page->list); 557 list_del_init(&bpage->list);
473 list_add_tail(&page->list, &cpu_buffer->pages); 558 list_add_tail(&bpage->list, &cpu_buffer->pages);
474 } 559 }
475 rb_reset_cpu(cpu_buffer); 560 rb_reset_cpu(cpu_buffer);
476 561
@@ -497,12 +582,18 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
497{ 582{
498 struct ring_buffer_per_cpu *cpu_buffer; 583 struct ring_buffer_per_cpu *cpu_buffer;
499 unsigned nr_pages, rm_pages, new_pages; 584 unsigned nr_pages, rm_pages, new_pages;
500 struct buffer_page *page, *tmp; 585 struct buffer_page *bpage, *tmp;
501 unsigned long buffer_size; 586 unsigned long buffer_size;
502 unsigned long addr; 587 unsigned long addr;
503 LIST_HEAD(pages); 588 LIST_HEAD(pages);
504 int i, cpu; 589 int i, cpu;
505 590
591 /*
592 * Always succeed at resizing a non-existent buffer:
593 */
594 if (!buffer)
595 return size;
596
506 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 597 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
507 size *= BUF_PAGE_SIZE; 598 size *= BUF_PAGE_SIZE;
508 buffer_size = buffer->pages * BUF_PAGE_SIZE; 599 buffer_size = buffer->pages * BUF_PAGE_SIZE;
@@ -521,7 +612,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
521 if (size < buffer_size) { 612 if (size < buffer_size) {
522 613
523 /* easy case, just free pages */ 614 /* easy case, just free pages */
524 BUG_ON(nr_pages >= buffer->pages); 615 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
616 mutex_unlock(&buffer->mutex);
617 return -1;
618 }
525 619
526 rm_pages = buffer->pages - nr_pages; 620 rm_pages = buffer->pages - nr_pages;
527 621
@@ -540,21 +634,26 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
540 * add these pages to the cpu_buffers. Otherwise we just free 634 * add these pages to the cpu_buffers. Otherwise we just free
541 * them all and return -ENOMEM; 635 * them all and return -ENOMEM;
542 */ 636 */
543 BUG_ON(nr_pages <= buffer->pages); 637 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
638 mutex_unlock(&buffer->mutex);
639 return -1;
640 }
641
544 new_pages = nr_pages - buffer->pages; 642 new_pages = nr_pages - buffer->pages;
545 643
546 for_each_buffer_cpu(buffer, cpu) { 644 for_each_buffer_cpu(buffer, cpu) {
547 for (i = 0; i < new_pages; i++) { 645 for (i = 0; i < new_pages; i++) {
548 page = kzalloc_node(ALIGN(sizeof(*page), 646 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
549 cache_line_size()), 647 cache_line_size()),
550 GFP_KERNEL, cpu_to_node(cpu)); 648 GFP_KERNEL, cpu_to_node(cpu));
551 if (!page) 649 if (!bpage)
552 goto free_pages; 650 goto free_pages;
553 list_add(&page->list, &pages); 651 list_add(&bpage->list, &pages);
554 addr = __get_free_page(GFP_KERNEL); 652 addr = __get_free_page(GFP_KERNEL);
555 if (!addr) 653 if (!addr)
556 goto free_pages; 654 goto free_pages;
557 page->page = (void *)addr; 655 bpage->page = (void *)addr;
656 rb_init_page(bpage->page);
558 } 657 }
559 } 658 }
560 659
@@ -563,7 +662,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
563 rb_insert_pages(cpu_buffer, &pages, new_pages); 662 rb_insert_pages(cpu_buffer, &pages, new_pages);
564 } 663 }
565 664
566 BUG_ON(!list_empty(&pages)); 665 if (RB_WARN_ON(buffer, !list_empty(&pages))) {
666 mutex_unlock(&buffer->mutex);
667 return -1;
668 }
567 669
568 out: 670 out:
569 buffer->pages = nr_pages; 671 buffer->pages = nr_pages;
@@ -572,10 +674,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
572 return size; 674 return size;
573 675
574 free_pages: 676 free_pages:
575 list_for_each_entry_safe(page, tmp, &pages, list) { 677 list_for_each_entry_safe(bpage, tmp, &pages, list) {
576 list_del_init(&page->list); 678 list_del_init(&bpage->list);
577 free_buffer_page(page); 679 free_buffer_page(bpage);
578 } 680 }
681 mutex_unlock(&buffer->mutex);
579 return -ENOMEM; 682 return -ENOMEM;
580} 683}
581 684
@@ -584,9 +687,15 @@ static inline int rb_null_event(struct ring_buffer_event *event)
584 return event->type == RINGBUF_TYPE_PADDING; 687 return event->type == RINGBUF_TYPE_PADDING;
585} 688}
586 689
587static inline void *__rb_page_index(struct buffer_page *page, unsigned index) 690static inline void *
691__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
588{ 692{
589 return page->page + index; 693 return bpage->data + index;
694}
695
696static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
697{
698 return bpage->page->data + index;
590} 699}
591 700
592static inline struct ring_buffer_event * 701static inline struct ring_buffer_event *
@@ -616,7 +725,7 @@ static inline unsigned rb_page_write(struct buffer_page *bpage)
616 725
617static inline unsigned rb_page_commit(struct buffer_page *bpage) 726static inline unsigned rb_page_commit(struct buffer_page *bpage)
618{ 727{
619 return local_read(&bpage->commit); 728 return local_read(&bpage->page->commit);
620} 729}
621 730
 622/* Size is determined by what has been committed */ 731/* Size is determined by what has been committed */
@@ -651,7 +760,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
651 head += rb_event_length(event)) { 760 head += rb_event_length(event)) {
652 761
653 event = __rb_page_index(cpu_buffer->head_page, head); 762 event = __rb_page_index(cpu_buffer->head_page, head);
654 BUG_ON(rb_null_event(event)); 763 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
764 return;
655 /* Only count data entries */ 765 /* Only count data entries */
656 if (event->type != RINGBUF_TYPE_DATA) 766 if (event->type != RINGBUF_TYPE_DATA)
657 continue; 767 continue;
@@ -661,14 +771,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
661} 771}
662 772
663static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, 773static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
664 struct buffer_page **page) 774 struct buffer_page **bpage)
665{ 775{
666 struct list_head *p = (*page)->list.next; 776 struct list_head *p = (*bpage)->list.next;
667 777
668 if (p == &cpu_buffer->pages) 778 if (p == &cpu_buffer->pages)
669 p = p->next; 779 p = p->next;
670 780
671 *page = list_entry(p, struct buffer_page, list); 781 *bpage = list_entry(p, struct buffer_page, list);
672} 782}
673 783
674static inline unsigned 784static inline unsigned
@@ -704,16 +814,18 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
704 addr &= PAGE_MASK; 814 addr &= PAGE_MASK;
705 815
706 while (cpu_buffer->commit_page->page != (void *)addr) { 816 while (cpu_buffer->commit_page->page != (void *)addr) {
707 RB_WARN_ON(cpu_buffer, 817 if (RB_WARN_ON(cpu_buffer,
708 cpu_buffer->commit_page == cpu_buffer->tail_page); 818 cpu_buffer->commit_page == cpu_buffer->tail_page))
709 cpu_buffer->commit_page->commit = 819 return;
820 cpu_buffer->commit_page->page->commit =
710 cpu_buffer->commit_page->write; 821 cpu_buffer->commit_page->write;
711 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 822 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
712 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; 823 cpu_buffer->write_stamp =
824 cpu_buffer->commit_page->page->time_stamp;
713 } 825 }
714 826
715 /* Now set the commit to the event's index */ 827 /* Now set the commit to the event's index */
716 local_set(&cpu_buffer->commit_page->commit, index); 828 local_set(&cpu_buffer->commit_page->page->commit, index);
717} 829}
718 830
719static inline void 831static inline void
@@ -728,16 +840,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
728 * assign the commit to the tail. 840 * assign the commit to the tail.
729 */ 841 */
730 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 842 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
731 cpu_buffer->commit_page->commit = 843 cpu_buffer->commit_page->page->commit =
732 cpu_buffer->commit_page->write; 844 cpu_buffer->commit_page->write;
733 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 845 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
734 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; 846 cpu_buffer->write_stamp =
847 cpu_buffer->commit_page->page->time_stamp;
735 /* add barrier to keep gcc from optimizing too much */ 848 /* add barrier to keep gcc from optimizing too much */
736 barrier(); 849 barrier();
737 } 850 }
738 while (rb_commit_index(cpu_buffer) != 851 while (rb_commit_index(cpu_buffer) !=
739 rb_page_write(cpu_buffer->commit_page)) { 852 rb_page_write(cpu_buffer->commit_page)) {
740 cpu_buffer->commit_page->commit = 853 cpu_buffer->commit_page->page->commit =
741 cpu_buffer->commit_page->write; 854 cpu_buffer->commit_page->write;
742 barrier(); 855 barrier();
743 } 856 }
@@ -745,7 +858,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
745 858
746static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 859static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
747{ 860{
748 cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; 861 cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
749 cpu_buffer->reader_page->read = 0; 862 cpu_buffer->reader_page->read = 0;
750} 863}
751 864
@@ -764,7 +877,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
764 else 877 else
765 rb_inc_page(cpu_buffer, &iter->head_page); 878 rb_inc_page(cpu_buffer, &iter->head_page);
766 879
767 iter->read_stamp = iter->head_page->time_stamp; 880 iter->read_stamp = iter->head_page->page->time_stamp;
768 iter->head = 0; 881 iter->head = 0;
769} 882}
770 883
@@ -852,7 +965,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
852 if (write > BUF_PAGE_SIZE) { 965 if (write > BUF_PAGE_SIZE) {
853 struct buffer_page *next_page = tail_page; 966 struct buffer_page *next_page = tail_page;
854 967
855 spin_lock_irqsave(&cpu_buffer->lock, flags); 968 local_irq_save(flags);
969 __raw_spin_lock(&cpu_buffer->lock);
856 970
857 rb_inc_page(cpu_buffer, &next_page); 971 rb_inc_page(cpu_buffer, &next_page);
858 972
@@ -860,7 +974,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
860 reader_page = cpu_buffer->reader_page; 974 reader_page = cpu_buffer->reader_page;
861 975
862 /* we grabbed the lock before incrementing */ 976 /* we grabbed the lock before incrementing */
863 RB_WARN_ON(cpu_buffer, next_page == reader_page); 977 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
978 goto out_unlock;
864 979
865 /* 980 /*
866 * If for some reason, we had an interrupt storm that made 981 * If for some reason, we had an interrupt storm that made
@@ -898,12 +1013,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
898 */ 1013 */
899 if (tail_page == cpu_buffer->tail_page) { 1014 if (tail_page == cpu_buffer->tail_page) {
900 local_set(&next_page->write, 0); 1015 local_set(&next_page->write, 0);
901 local_set(&next_page->commit, 0); 1016 local_set(&next_page->page->commit, 0);
902 cpu_buffer->tail_page = next_page; 1017 cpu_buffer->tail_page = next_page;
903 1018
904 /* reread the time stamp */ 1019 /* reread the time stamp */
905 *ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1020 *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
906 cpu_buffer->tail_page->time_stamp = *ts; 1021 cpu_buffer->tail_page->page->time_stamp = *ts;
907 } 1022 }
908 1023
909 /* 1024 /*
@@ -928,7 +1043,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
928 rb_set_commit_to_write(cpu_buffer); 1043 rb_set_commit_to_write(cpu_buffer);
929 } 1044 }
930 1045
931 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1046 __raw_spin_unlock(&cpu_buffer->lock);
1047 local_irq_restore(flags);
932 1048
933 /* fail and let the caller try again */ 1049 /* fail and let the caller try again */
934 return ERR_PTR(-EAGAIN); 1050 return ERR_PTR(-EAGAIN);
@@ -936,7 +1052,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
936 1052
937 /* We reserved something on the buffer */ 1053 /* We reserved something on the buffer */
938 1054
939 BUG_ON(write > BUF_PAGE_SIZE); 1055 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1056 return NULL;
940 1057
941 event = __rb_page_index(tail_page, tail); 1058 event = __rb_page_index(tail_page, tail);
942 rb_update_event(event, type, length); 1059 rb_update_event(event, type, length);
@@ -946,12 +1063,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
946 * this page's time stamp. 1063 * this page's time stamp.
947 */ 1064 */
948 if (!tail && rb_is_commit(cpu_buffer, event)) 1065 if (!tail && rb_is_commit(cpu_buffer, event))
949 cpu_buffer->commit_page->time_stamp = *ts; 1066 cpu_buffer->commit_page->page->time_stamp = *ts;
950 1067
951 return event; 1068 return event;
952 1069
953 out_unlock: 1070 out_unlock:
954 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1071 __raw_spin_unlock(&cpu_buffer->lock);
1072 local_irq_restore(flags);
955 return NULL; 1073 return NULL;
956} 1074}
957 1075
@@ -996,7 +1114,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
996 event->time_delta = *delta & TS_MASK; 1114 event->time_delta = *delta & TS_MASK;
997 event->array[0] = *delta >> TS_SHIFT; 1115 event->array[0] = *delta >> TS_SHIFT;
998 } else { 1116 } else {
999 cpu_buffer->commit_page->time_stamp = *ts; 1117 cpu_buffer->commit_page->page->time_stamp = *ts;
1000 event->time_delta = 0; 1118 event->time_delta = 0;
1001 event->array[0] = 0; 1119 event->array[0] = 0;
1002 } 1120 }
@@ -1034,10 +1152,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1034 * storm or we have something buggy. 1152 * storm or we have something buggy.
1035 * Bail! 1153 * Bail!
1036 */ 1154 */
1037 if (unlikely(++nr_loops > 1000)) { 1155 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1038 RB_WARN_ON(cpu_buffer, 1);
1039 return NULL; 1156 return NULL;
1040 }
1041 1157
1042 ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1158 ts = ring_buffer_time_stamp(cpu_buffer->cpu);
1043 1159
@@ -1060,7 +1176,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1060 1176
1061 /* Did the write stamp get updated already? */ 1177 /* Did the write stamp get updated already? */
1062 if (unlikely(ts < cpu_buffer->write_stamp)) 1178 if (unlikely(ts < cpu_buffer->write_stamp))
1063 goto again; 1179 delta = 0;
1064 1180
1065 if (test_time_stamp(delta)) { 1181 if (test_time_stamp(delta)) {
1066 1182
@@ -1133,12 +1249,14 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1133 struct ring_buffer_event *event; 1249 struct ring_buffer_event *event;
1134 int cpu, resched; 1250 int cpu, resched;
1135 1251
1252 if (ring_buffer_flags != RB_BUFFERS_ON)
1253 return NULL;
1254
1136 if (atomic_read(&buffer->record_disabled)) 1255 if (atomic_read(&buffer->record_disabled))
1137 return NULL; 1256 return NULL;
1138 1257
1139 /* If we are tracing schedule, we don't want to recurse */ 1258 /* If we are tracing schedule, we don't want to recurse */
1140 resched = need_resched(); 1259 resched = ftrace_preempt_disable();
1141 preempt_disable_notrace();
1142 1260
1143 cpu = raw_smp_processor_id(); 1261 cpu = raw_smp_processor_id();
1144 1262
@@ -1169,10 +1287,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1169 return event; 1287 return event;
1170 1288
1171 out: 1289 out:
1172 if (resched) 1290 ftrace_preempt_enable(resched);
1173 preempt_enable_notrace();
1174 else
1175 preempt_enable_notrace();
1176 return NULL; 1291 return NULL;
1177} 1292}
1178 1293
@@ -1214,12 +1329,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1214 /* 1329 /*
1215 * Only the last preempt count needs to restore preemption. 1330 * Only the last preempt count needs to restore preemption.
1216 */ 1331 */
1217 if (preempt_count() == 1) { 1332 if (preempt_count() == 1)
1218 if (per_cpu(rb_need_resched, cpu)) 1333 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
1219 preempt_enable_no_resched_notrace(); 1334 else
1220 else
1221 preempt_enable_notrace();
1222 } else
1223 preempt_enable_no_resched_notrace(); 1335 preempt_enable_no_resched_notrace();
1224 1336
1225 return 0; 1337 return 0;
@@ -1249,11 +1361,13 @@ int ring_buffer_write(struct ring_buffer *buffer,
1249 int ret = -EBUSY; 1361 int ret = -EBUSY;
1250 int cpu, resched; 1362 int cpu, resched;
1251 1363
1364 if (ring_buffer_flags != RB_BUFFERS_ON)
1365 return -EBUSY;
1366
1252 if (atomic_read(&buffer->record_disabled)) 1367 if (atomic_read(&buffer->record_disabled))
1253 return -EBUSY; 1368 return -EBUSY;
1254 1369
1255 resched = need_resched(); 1370 resched = ftrace_preempt_disable();
1256 preempt_disable_notrace();
1257 1371
1258 cpu = raw_smp_processor_id(); 1372 cpu = raw_smp_processor_id();
1259 1373
@@ -1279,10 +1393,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1279 1393
1280 ret = 0; 1394 ret = 0;
1281 out: 1395 out:
1282 if (resched) 1396 ftrace_preempt_enable(resched);
1283 preempt_enable_no_resched_notrace();
1284 else
1285 preempt_enable_notrace();
1286 1397
1287 return ret; 1398 return ret;
1288} 1399}
@@ -1441,14 +1552,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
1441 return overruns; 1552 return overruns;
1442} 1553}
1443 1554
1444/** 1555static void rb_iter_reset(struct ring_buffer_iter *iter)
1445 * ring_buffer_iter_reset - reset an iterator
1446 * @iter: The iterator to reset
1447 *
1448 * Resets the iterator, so that it will start from the beginning
1449 * again.
1450 */
1451void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1452{ 1556{
1453 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1557 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1454 1558
@@ -1463,7 +1567,24 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1463 if (iter->head) 1567 if (iter->head)
1464 iter->read_stamp = cpu_buffer->read_stamp; 1568 iter->read_stamp = cpu_buffer->read_stamp;
1465 else 1569 else
1466 iter->read_stamp = iter->head_page->time_stamp; 1570 iter->read_stamp = iter->head_page->page->time_stamp;
1571}
1572
1573/**
1574 * ring_buffer_iter_reset - reset an iterator
1575 * @iter: The iterator to reset
1576 *
1577 * Resets the iterator, so that it will start from the beginning
1578 * again.
1579 */
1580void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1581{
1582 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1583 unsigned long flags;
1584
1585 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1586 rb_iter_reset(iter);
1587 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1467} 1588}
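
The split above, an unlocked rb_iter_reset() helper wrapped by a public entry point that takes reader_lock, is repeated below for peek, consume and read, so that paths already holding the lock can call the helpers directly. A userspace sketch of that pattern, with a pthread spinlock standing in for the kernel spinlock and an illustrative iterator type:

#include <pthread.h>
#include <stdio.h>

struct iter {
	pthread_spinlock_t reader_lock;
	unsigned long head;
};

static void __iter_reset(struct iter *it)	/* caller holds reader_lock */
{
	it->head = 0;
}

static void iter_reset(struct iter *it)		/* public entry point */
{
	pthread_spin_lock(&it->reader_lock);
	__iter_reset(it);
	pthread_spin_unlock(&it->reader_lock);
}

int main(void)
{
	struct iter it = { .head = 42 };

	pthread_spin_init(&it.reader_lock, PTHREAD_PROCESS_PRIVATE);
	iter_reset(&it);
	printf("head after reset: %lu\n", it.head);
	pthread_spin_destroy(&it.reader_lock);
	return 0;
}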
1468 1589
1469/** 1590/**
@@ -1549,7 +1670,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1549 unsigned long flags; 1670 unsigned long flags;
1550 int nr_loops = 0; 1671 int nr_loops = 0;
1551 1672
1552 spin_lock_irqsave(&cpu_buffer->lock, flags); 1673 local_irq_save(flags);
1674 __raw_spin_lock(&cpu_buffer->lock);
1553 1675
1554 again: 1676 again:
1555 /* 1677 /*
@@ -1558,8 +1680,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1558 * a case where we will loop three times. There should be no 1680 * a case where we will loop three times. There should be no
1559 * reason to loop four times (that I know of). 1681 * reason to loop four times (that I know of).
1560 */ 1682 */
1561 if (unlikely(++nr_loops > 3)) { 1683 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
1562 RB_WARN_ON(cpu_buffer, 1);
1563 reader = NULL; 1684 reader = NULL;
1564 goto out; 1685 goto out;
1565 } 1686 }
@@ -1571,8 +1692,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1571 goto out; 1692 goto out;
1572 1693
1573 /* Never should we have an index greater than the size */ 1694 /* Never should we have an index greater than the size */
1574 RB_WARN_ON(cpu_buffer, 1695 if (RB_WARN_ON(cpu_buffer,
1575 cpu_buffer->reader_page->read > rb_page_size(reader)); 1696 cpu_buffer->reader_page->read > rb_page_size(reader)))
1697 goto out;
1576 1698
1577 /* check if we caught up to the tail */ 1699 /* check if we caught up to the tail */
1578 reader = NULL; 1700 reader = NULL;
@@ -1589,7 +1711,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1589 cpu_buffer->reader_page->list.prev = reader->list.prev; 1711 cpu_buffer->reader_page->list.prev = reader->list.prev;
1590 1712
1591 local_set(&cpu_buffer->reader_page->write, 0); 1713 local_set(&cpu_buffer->reader_page->write, 0);
1592 local_set(&cpu_buffer->reader_page->commit, 0); 1714 local_set(&cpu_buffer->reader_page->page->commit, 0);
1593 1715
1594 /* Make the reader page now replace the head */ 1716 /* Make the reader page now replace the head */
1595 reader->list.prev->next = &cpu_buffer->reader_page->list; 1717 reader->list.prev->next = &cpu_buffer->reader_page->list;
@@ -1611,7 +1733,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1611 goto again; 1733 goto again;
1612 1734
1613 out: 1735 out:
1614 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1736 __raw_spin_unlock(&cpu_buffer->lock);
1737 local_irq_restore(flags);
1615 1738
1616 return reader; 1739 return reader;
1617} 1740}
@@ -1625,7 +1748,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1625 reader = rb_get_reader_page(cpu_buffer); 1748 reader = rb_get_reader_page(cpu_buffer);
1626 1749
1627 /* This function should not be called when buffer is empty */ 1750 /* This function should not be called when buffer is empty */
1628 BUG_ON(!reader); 1751 if (RB_WARN_ON(cpu_buffer, !reader))
1752 return;
1629 1753
1630 event = rb_reader_event(cpu_buffer); 1754 event = rb_reader_event(cpu_buffer);
1631 1755
@@ -1652,7 +1776,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1652 * Check if we are at the end of the buffer. 1776 * Check if we are at the end of the buffer.
1653 */ 1777 */
1654 if (iter->head >= rb_page_size(iter->head_page)) { 1778 if (iter->head >= rb_page_size(iter->head_page)) {
1655 BUG_ON(iter->head_page == cpu_buffer->commit_page); 1779 if (RB_WARN_ON(buffer,
1780 iter->head_page == cpu_buffer->commit_page))
1781 return;
1656 rb_inc_iter(iter); 1782 rb_inc_iter(iter);
1657 return; 1783 return;
1658 } 1784 }
@@ -1665,8 +1791,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1665 * This should not be called to advance the header if we are 1791 * This should not be called to advance the header if we are
1666 * at the tail of the buffer. 1792 * at the tail of the buffer.
1667 */ 1793 */
1668 BUG_ON((iter->head_page == cpu_buffer->commit_page) && 1794 if (RB_WARN_ON(cpu_buffer,
1669 (iter->head + length > rb_commit_index(cpu_buffer))); 1795 (iter->head_page == cpu_buffer->commit_page) &&
1796 (iter->head + length > rb_commit_index(cpu_buffer))))
1797 return;
1670 1798
1671 rb_update_iter_read_stamp(iter, event); 1799 rb_update_iter_read_stamp(iter, event);
1672 1800
@@ -1678,17 +1806,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1678 rb_advance_iter(iter); 1806 rb_advance_iter(iter);
1679} 1807}
1680 1808
1681/** 1809static struct ring_buffer_event *
1682 * ring_buffer_peek - peek at the next event to be read 1810rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1683 * @buffer: The ring buffer to read
1684 * @cpu: The cpu to peak at
1685 * @ts: The timestamp counter of this event.
1686 *
1687 * This will return the event that will be read next, but does
1688 * not consume the data.
1689 */
1690struct ring_buffer_event *
1691ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1692{ 1811{
1693 struct ring_buffer_per_cpu *cpu_buffer; 1812 struct ring_buffer_per_cpu *cpu_buffer;
1694 struct ring_buffer_event *event; 1813 struct ring_buffer_event *event;
@@ -1709,10 +1828,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1709 * can have. Nesting 10 deep of interrupts is clearly 1828 * can have. Nesting 10 deep of interrupts is clearly
1710 * an anomaly. 1829 * an anomaly.
1711 */ 1830 */
1712 if (unlikely(++nr_loops > 10)) { 1831 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1713 RB_WARN_ON(cpu_buffer, 1);
1714 return NULL; 1832 return NULL;
1715 }
1716 1833
1717 reader = rb_get_reader_page(cpu_buffer); 1834 reader = rb_get_reader_page(cpu_buffer);
1718 if (!reader) 1835 if (!reader)
@@ -1750,16 +1867,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1750 return NULL; 1867 return NULL;
1751} 1868}
1752 1869
1753/** 1870static struct ring_buffer_event *
1754 * ring_buffer_iter_peek - peek at the next event to be read 1871rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1755 * @iter: The ring buffer iterator
1756 * @ts: The timestamp counter of this event.
1757 *
1758 * This will return the event that will be read next, but does
1759 * not increment the iterator.
1760 */
1761struct ring_buffer_event *
1762ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1763{ 1872{
1764 struct ring_buffer *buffer; 1873 struct ring_buffer *buffer;
1765 struct ring_buffer_per_cpu *cpu_buffer; 1874 struct ring_buffer_per_cpu *cpu_buffer;
@@ -1781,10 +1890,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1781 * can have. Nesting 10 deep of interrupts is clearly 1890 * can have. Nesting 10 deep of interrupts is clearly
1782 * an anomaly. 1891 * an anomaly.
1783 */ 1892 */
1784 if (unlikely(++nr_loops > 10)) { 1893 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1785 RB_WARN_ON(cpu_buffer, 1);
1786 return NULL; 1894 return NULL;
1787 }
1788 1895
1789 if (rb_per_cpu_empty(cpu_buffer)) 1896 if (rb_per_cpu_empty(cpu_buffer))
1790 return NULL; 1897 return NULL;
@@ -1821,6 +1928,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1821} 1928}
1822 1929
1823/** 1930/**
1931 * ring_buffer_peek - peek at the next event to be read
1932 * @buffer: The ring buffer to read
 1933 * @cpu: The cpu to peek at
1934 * @ts: The timestamp counter of this event.
1935 *
1936 * This will return the event that will be read next, but does
1937 * not consume the data.
1938 */
1939struct ring_buffer_event *
1940ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1941{
1942 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1943 struct ring_buffer_event *event;
1944 unsigned long flags;
1945
1946 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1947 event = rb_buffer_peek(buffer, cpu, ts);
1948 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1949
1950 return event;
1951}
1952
1953/**
1954 * ring_buffer_iter_peek - peek at the next event to be read
1955 * @iter: The ring buffer iterator
1956 * @ts: The timestamp counter of this event.
1957 *
1958 * This will return the event that will be read next, but does
1959 * not increment the iterator.
1960 */
1961struct ring_buffer_event *
1962ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1963{
1964 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1965 struct ring_buffer_event *event;
1966 unsigned long flags;
1967
1968 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1969 event = rb_iter_peek(iter, ts);
1970 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1971
1972 return event;
1973}
1974
1975/**
1824 * ring_buffer_consume - return an event and consume it 1976 * ring_buffer_consume - return an event and consume it
1825 * @buffer: The ring buffer to get the next event from 1977 * @buffer: The ring buffer to get the next event from
1826 * 1978 *
@@ -1831,19 +1983,24 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1831struct ring_buffer_event * 1983struct ring_buffer_event *
1832ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 1984ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
1833{ 1985{
1834 struct ring_buffer_per_cpu *cpu_buffer; 1986 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1835 struct ring_buffer_event *event; 1987 struct ring_buffer_event *event;
1988 unsigned long flags;
1836 1989
1837 if (!cpu_isset(cpu, buffer->cpumask)) 1990 if (!cpu_isset(cpu, buffer->cpumask))
1838 return NULL; 1991 return NULL;
1839 1992
1840 event = ring_buffer_peek(buffer, cpu, ts); 1993 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1994
1995 event = rb_buffer_peek(buffer, cpu, ts);
1841 if (!event) 1996 if (!event)
1842 return NULL; 1997 goto out;
1843 1998
1844 cpu_buffer = buffer->buffers[cpu];
1845 rb_advance_reader(cpu_buffer); 1999 rb_advance_reader(cpu_buffer);
1846 2000
2001 out:
2002 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2003
1847 return event; 2004 return event;
1848} 2005}
1849 2006
@@ -1880,9 +2037,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
1880 atomic_inc(&cpu_buffer->record_disabled); 2037 atomic_inc(&cpu_buffer->record_disabled);
1881 synchronize_sched(); 2038 synchronize_sched();
1882 2039
1883 spin_lock_irqsave(&cpu_buffer->lock, flags); 2040 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1884 ring_buffer_iter_reset(iter); 2041 __raw_spin_lock(&cpu_buffer->lock);
1885 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2042 rb_iter_reset(iter);
2043 __raw_spin_unlock(&cpu_buffer->lock);
2044 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1886 2045
1887 return iter; 2046 return iter;
1888} 2047}
@@ -1914,12 +2073,17 @@ struct ring_buffer_event *
1914ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) 2073ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
1915{ 2074{
1916 struct ring_buffer_event *event; 2075 struct ring_buffer_event *event;
2076 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2077 unsigned long flags;
1917 2078
1918 event = ring_buffer_iter_peek(iter, ts); 2079 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2080 event = rb_iter_peek(iter, ts);
1919 if (!event) 2081 if (!event)
1920 return NULL; 2082 goto out;
1921 2083
1922 rb_advance_iter(iter); 2084 rb_advance_iter(iter);
2085 out:
2086 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1923 2087
1924 return event; 2088 return event;
1925} 2089}
@@ -1939,7 +2103,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
1939 cpu_buffer->head_page 2103 cpu_buffer->head_page
1940 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 2104 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
1941 local_set(&cpu_buffer->head_page->write, 0); 2105 local_set(&cpu_buffer->head_page->write, 0);
1942 local_set(&cpu_buffer->head_page->commit, 0); 2106 local_set(&cpu_buffer->head_page->page->commit, 0);
1943 2107
1944 cpu_buffer->head_page->read = 0; 2108 cpu_buffer->head_page->read = 0;
1945 2109
@@ -1948,7 +2112,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
1948 2112
1949 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 2113 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1950 local_set(&cpu_buffer->reader_page->write, 0); 2114 local_set(&cpu_buffer->reader_page->write, 0);
1951 local_set(&cpu_buffer->reader_page->commit, 0); 2115 local_set(&cpu_buffer->reader_page->page->commit, 0);
1952 cpu_buffer->reader_page->read = 0; 2116 cpu_buffer->reader_page->read = 0;
1953 2117
1954 cpu_buffer->overrun = 0; 2118 cpu_buffer->overrun = 0;
@@ -1968,11 +2132,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
1968 if (!cpu_isset(cpu, buffer->cpumask)) 2132 if (!cpu_isset(cpu, buffer->cpumask))
1969 return; 2133 return;
1970 2134
1971 spin_lock_irqsave(&cpu_buffer->lock, flags); 2135 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2136
2137 __raw_spin_lock(&cpu_buffer->lock);
1972 2138
1973 rb_reset_cpu(cpu_buffer); 2139 rb_reset_cpu(cpu_buffer);
1974 2140
1975 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2141 __raw_spin_unlock(&cpu_buffer->lock);
2142
2143 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1976} 2144}
1977 2145
1978/** 2146/**
@@ -2070,3 +2238,233 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2070 return 0; 2238 return 0;
2071} 2239}
2072 2240
2241static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2242 struct buffer_data_page *bpage)
2243{
2244 struct ring_buffer_event *event;
2245 unsigned long head;
2246
2247 __raw_spin_lock(&cpu_buffer->lock);
2248 for (head = 0; head < local_read(&bpage->commit);
2249 head += rb_event_length(event)) {
2250
2251 event = __rb_data_page_index(bpage, head);
2252 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
2253 return;
2254 /* Only count data entries */
2255 if (event->type != RINGBUF_TYPE_DATA)
2256 continue;
2257 cpu_buffer->entries--;
2258 }
2259 __raw_spin_unlock(&cpu_buffer->lock);
2260}
2261
2262/**
2263 * ring_buffer_alloc_read_page - allocate a page to read from buffer
2264 * @buffer: the buffer to allocate for.
2265 *
2266 * This function is used in conjunction with ring_buffer_read_page.
2267 * When reading a full page from the ring buffer, these functions
2268 * can be used to speed up the process. The calling function should
2269 * allocate a few pages first with this function. Then when it
2270 * needs to get pages from the ring buffer, it passes the result
2271 * of this function into ring_buffer_read_page, which will swap
 2272 * the page that was allocated with the read page of the buffer.
2273 *
2274 * Returns:
2275 * The page allocated, or NULL on error.
2276 */
2277void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2278{
2279 unsigned long addr;
2280 struct buffer_data_page *bpage;
2281
2282 addr = __get_free_page(GFP_KERNEL);
2283 if (!addr)
2284 return NULL;
2285
2286 bpage = (void *)addr;
2287
2288 return bpage;
2289}
2290
2291/**
2292 * ring_buffer_free_read_page - free an allocated read page
 2293 * @buffer: the buffer the page was allocated for
2294 * @data: the page to free
2295 *
2296 * Free a page allocated from ring_buffer_alloc_read_page.
2297 */
2298void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2299{
2300 free_page((unsigned long)data);
2301}
2302
2303/**
2304 * ring_buffer_read_page - extract a page from the ring buffer
2305 * @buffer: buffer to extract from
2306 * @data_page: the page to use allocated from ring_buffer_alloc_read_page
2307 * @cpu: the cpu of the buffer to extract
2308 * @full: should the extraction only happen when the page is full.
2309 *
2310 * This function will pull out a page from the ring buffer and consume it.
2311 * @data_page must be the address of the variable that was returned
2312 * from ring_buffer_alloc_read_page. This is because the page might be used
2313 * to swap with a page in the ring buffer.
2314 *
2315 * for example:
 2316 *	rpage = ring_buffer_alloc_read_page(buffer);
2317 * if (!rpage)
2318 * return error;
2319 * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
2320 * if (ret)
2321 * process_page(rpage);
2322 *
2323 * When @full is set, the function will not return true unless
2324 * the writer is off the reader page.
2325 *
2326 * Note: it is up to the calling functions to handle sleeps and wakeups.
 2327 * The ring buffer can be used anywhere in the kernel and cannot
2328 * blindly call wake_up. The layer that uses the ring buffer must be
2329 * responsible for that.
2330 *
2331 * Returns:
2332 * 1 if data has been transferred
2333 * 0 if no data has been transferred.
2334 */
2335int ring_buffer_read_page(struct ring_buffer *buffer,
2336 void **data_page, int cpu, int full)
2337{
2338 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2339 struct ring_buffer_event *event;
2340 struct buffer_data_page *bpage;
2341 unsigned long flags;
2342 int ret = 0;
2343
2344 if (!data_page)
2345 return 0;
2346
2347 bpage = *data_page;
2348 if (!bpage)
2349 return 0;
2350
2351 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2352
2353 /*
 2354	 * rb_buffer_peek will get the next reader page if
2355 * the current reader page is empty.
2356 */
2357 event = rb_buffer_peek(buffer, cpu, NULL);
2358 if (!event)
2359 goto out;
2360
2361 /* check for data */
2362 if (!local_read(&cpu_buffer->reader_page->page->commit))
2363 goto out;
2364 /*
2365 * If the writer is already off of the read page, then simply
2366 * switch the read page with the given page. Otherwise
2367 * we need to copy the data from the reader to the writer.
2368 */
2369 if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
2370 unsigned int read = cpu_buffer->reader_page->read;
2371
2372 if (full)
2373 goto out;
2374 /* The writer is still on the reader page, we must copy */
2375 bpage = cpu_buffer->reader_page->page;
2376 memcpy(bpage->data,
2377 cpu_buffer->reader_page->page->data + read,
2378 local_read(&bpage->commit) - read);
2379
2380 /* consume what was read */
2381 cpu_buffer->reader_page += read;
2382
2383 } else {
2384 /* swap the pages */
2385 rb_init_page(bpage);
2386 bpage = cpu_buffer->reader_page->page;
2387 cpu_buffer->reader_page->page = *data_page;
2388 cpu_buffer->reader_page->read = 0;
2389 *data_page = bpage;
2390 }
2391 ret = 1;
2392
2393 /* update the entry counter */
2394 rb_remove_entries(cpu_buffer, bpage);
2395 out:
2396 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2397
2398 return ret;
2399}
2400
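
A hypothetical caller of the read-page interface, sketched in kernel style under the assumption that it runs in a context that may sleep; process_page() is an assumed consumer callback, not a kernel API:

#include <linux/ring_buffer.h>
#include <linux/errno.h>

static int drain_cpu_buffer(struct ring_buffer *buffer, int cpu,
			    void (*process_page)(void *data))
{
	void *rpage = ring_buffer_alloc_read_page(buffer);

	if (!rpage)
		return -ENOMEM;

	/* @full == 0: accept partially filled reader pages as well */
	while (ring_buffer_read_page(buffer, &rpage, cpu, 0))
		process_page(rpage);

	ring_buffer_free_read_page(buffer, rpage);
	return 0;
}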
2401static ssize_t
2402rb_simple_read(struct file *filp, char __user *ubuf,
2403 size_t cnt, loff_t *ppos)
2404{
2405 long *p = filp->private_data;
2406 char buf[64];
2407 int r;
2408
2409 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
2410 r = sprintf(buf, "permanently disabled\n");
2411 else
2412 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
2413
2414 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2415}
2416
2417static ssize_t
2418rb_simple_write(struct file *filp, const char __user *ubuf,
2419 size_t cnt, loff_t *ppos)
2420{
2421 long *p = filp->private_data;
2422 char buf[64];
2423 long val;
2424 int ret;
2425
2426 if (cnt >= sizeof(buf))
2427 return -EINVAL;
2428
2429 if (copy_from_user(&buf, ubuf, cnt))
2430 return -EFAULT;
2431
2432 buf[cnt] = 0;
2433
2434 ret = strict_strtoul(buf, 10, &val);
2435 if (ret < 0)
2436 return ret;
2437
2438 if (val)
2439 set_bit(RB_BUFFERS_ON_BIT, p);
2440 else
2441 clear_bit(RB_BUFFERS_ON_BIT, p);
2442
2443 (*ppos)++;
2444
2445 return cnt;
2446}
2447
2448static struct file_operations rb_simple_fops = {
2449 .open = tracing_open_generic,
2450 .read = rb_simple_read,
2451 .write = rb_simple_write,
2452};
2453
2454
2455static __init int rb_init_debugfs(void)
2456{
2457 struct dentry *d_tracer;
2458 struct dentry *entry;
2459
2460 d_tracer = tracing_init_dentry();
2461
2462 entry = debugfs_create_file("tracing_on", 0644, d_tracer,
2463 &ring_buffer_flags, &rb_simple_fops);
2464 if (!entry)
2465 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2466
2467 return 0;
2468}
2469
2470fs_initcall(rb_init_debugfs);
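
The resulting control file can then be driven from user space. A small sketch, assuming debugfs is mounted at /sys/kernel/debug and that tracing_init_dentry() places the file under the tracing directory there (the path is conventional, not guaranteed):

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/tracing_on";
	char state[64];
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(state, sizeof(state), f))
		printf("current state: %s", state);
	fclose(f);

	f = fopen(path, "w");
	if (f) {
		fputs("1\n", f);	/* re-enable writes to all ring buffers */
		fclose(f);
	}
	return 0;
}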
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9f3b478f9171..8ebe0070c47a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,6 +30,7 @@
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/kprobes.h> 32#include <linux/kprobes.h>
33#include <linux/seq_file.h>
33#include <linux/writeback.h> 34#include <linux/writeback.h>
34 35
35#include <linux/stacktrace.h> 36#include <linux/stacktrace.h>
@@ -43,6 +44,38 @@
43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 44unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
44unsigned long __read_mostly tracing_thresh; 45unsigned long __read_mostly tracing_thresh;
45 46
47/*
48 * We need to change this state when a selftest is running.
49 * A selftest will lurk into the ring-buffer to count the
50 * entries inserted during the selftest although some concurrent
51 * insertions into the ring-buffer such as ftrace_printk could occurred
52 * at the same time, giving false positive or negative results.
53 */
54static bool __read_mostly tracing_selftest_running;
55
56/* For tracers that don't implement custom flags */
57static struct tracer_opt dummy_tracer_opt[] = {
58 { }
59};
60
61static struct tracer_flags dummy_tracer_flags = {
62 .val = 0,
63 .opts = dummy_tracer_opt
64};
65
66static int dummy_set_flag(u32 old_flags, u32 bit, int set)
67{
68 return 0;
69}
70
71/*
72 * Kill all tracing for good (never come back).
 73 * It is initialized to 1 but will be set to zero if the initialization
 74 * of the tracer is successful. That is the only place that sets
 75 * it back to zero.
76 */
77int tracing_disabled = 1;
78
46static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 79static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
47 80
48static inline void ftrace_disable_cpu(void) 81static inline void ftrace_disable_cpu(void)
@@ -62,7 +95,36 @@ static cpumask_t __read_mostly tracing_buffer_mask;
62#define for_each_tracing_cpu(cpu) \ 95#define for_each_tracing_cpu(cpu) \
63 for_each_cpu_mask(cpu, tracing_buffer_mask) 96 for_each_cpu_mask(cpu, tracing_buffer_mask)
64 97
65static int tracing_disabled = 1; 98/*
99 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
100 *
101 * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
102 * is set, then ftrace_dump is called. This will output the contents
103 * of the ftrace buffers to the console. This is very useful for
 104 * capturing traces that lead to crashes and outputting them to a
105 * serial console.
106 *
 107 * It is off by default, but you can enable it either by specifying
 108 * "ftrace_dump_on_oops" on the kernel command line or by setting
 109 * /proc/sys/kernel/ftrace_dump_on_oops to true.
110 */
111int ftrace_dump_on_oops;
112
113static int tracing_set_tracer(char *buf);
114
115static int __init set_ftrace(char *str)
116{
117 tracing_set_tracer(str);
118 return 1;
119}
120__setup("ftrace", set_ftrace);
121
122static int __init set_ftrace_dump_on_oops(char *str)
123{
124 ftrace_dump_on_oops = 1;
125 return 1;
126}
127__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
66 128
67long 129long
68ns2usecs(cycle_t nsec) 130ns2usecs(cycle_t nsec)
@@ -112,6 +174,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
112/* tracer_enabled is used to toggle activation of a tracer */ 174/* tracer_enabled is used to toggle activation of a tracer */
113static int tracer_enabled = 1; 175static int tracer_enabled = 1;
114 176
177/**
178 * tracing_is_enabled - return tracer_enabled status
179 *
180 * This function is used by other tracers to know the status
181 * of the tracer_enabled flag. Tracers may use this function
182 * to know if it should enable their features when starting
183 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
184 */
185int tracing_is_enabled(void)
186{
187 return tracer_enabled;
188}
189
115/* function tracing enabled */ 190/* function tracing enabled */
116int ftrace_function_enabled; 191int ftrace_function_enabled;
117 192
@@ -153,8 +228,9 @@ static DEFINE_MUTEX(trace_types_lock);
153/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 228/* trace_wait is a waitqueue for tasks blocked on trace_poll */
154static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 229static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
155 230
156/* trace_flags holds iter_ctrl options */ 231/* trace_flags holds trace_options default values */
157unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; 232unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
233 TRACE_ITER_ANNOTATE;
158 234
159/** 235/**
160 * trace_wake_up - wake up tasks waiting for trace input 236 * trace_wake_up - wake up tasks waiting for trace input
@@ -193,13 +269,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
193 return nsecs / 1000; 269 return nsecs / 1000;
194} 270}
195 271
196/*
197 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
198 * control the output of kernel symbols.
199 */
200#define TRACE_ITER_SYM_MASK \
201 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
202
 203/* These must match the bit positions in trace_iterator_flags */ 272/* These must match the bit positions in trace_iterator_flags */
204static const char *trace_options[] = { 273static const char *trace_options[] = {
205 "print-parent", 274 "print-parent",
@@ -213,6 +282,11 @@ static const char *trace_options[] = {
213 "stacktrace", 282 "stacktrace",
214 "sched-tree", 283 "sched-tree",
215 "ftrace_printk", 284 "ftrace_printk",
285 "ftrace_preempt",
286 "branch",
287 "annotate",
288 "userstacktrace",
289 "sym-userobj",
216 NULL 290 NULL
217}; 291};
218 292
@@ -359,6 +433,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
359 return trace_seq_putmem(s, hex, j); 433 return trace_seq_putmem(s, hex, j);
360} 434}
361 435
436static int
437trace_seq_path(struct trace_seq *s, struct path *path)
438{
439 unsigned char *p;
440
441 if (s->len >= (PAGE_SIZE - 1))
442 return 0;
443 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
444 if (!IS_ERR(p)) {
445 p = mangle_path(s->buffer + s->len, p, "\n");
446 if (p) {
447 s->len = p - s->buffer;
448 return 1;
449 }
450 } else {
451 s->buffer[s->len++] = '?';
452 return 1;
453 }
454
455 return 0;
456}
457
362static void 458static void
363trace_seq_reset(struct trace_seq *s) 459trace_seq_reset(struct trace_seq *s)
364{ 460{
@@ -470,7 +566,17 @@ int register_tracer(struct tracer *type)
470 return -1; 566 return -1;
471 } 567 }
472 568
569 /*
570 * When this gets called we hold the BKL which means that
571 * preemption is disabled. Various trace selftests however
572 * need to disable and enable preemption for successful tests.
573 * So we drop the BKL here and grab it after the tests again.
574 */
575 unlock_kernel();
473 mutex_lock(&trace_types_lock); 576 mutex_lock(&trace_types_lock);
577
578 tracing_selftest_running = true;
579
474 for (t = trace_types; t; t = t->next) { 580 for (t = trace_types; t; t = t->next) {
475 if (strcmp(type->name, t->name) == 0) { 581 if (strcmp(type->name, t->name) == 0) {
476 /* already found */ 582 /* already found */
@@ -481,12 +587,20 @@ int register_tracer(struct tracer *type)
481 } 587 }
482 } 588 }
483 589
590 if (!type->set_flag)
591 type->set_flag = &dummy_set_flag;
592 if (!type->flags)
593 type->flags = &dummy_tracer_flags;
594 else
595 if (!type->flags->opts)
596 type->flags->opts = dummy_tracer_opt;
597
484#ifdef CONFIG_FTRACE_STARTUP_TEST 598#ifdef CONFIG_FTRACE_STARTUP_TEST
485 if (type->selftest) { 599 if (type->selftest) {
486 struct tracer *saved_tracer = current_trace; 600 struct tracer *saved_tracer = current_trace;
487 struct trace_array *tr = &global_trace; 601 struct trace_array *tr = &global_trace;
488 int saved_ctrl = tr->ctrl;
489 int i; 602 int i;
603
490 /* 604 /*
491 * Run a selftest on this tracer. 605 * Run a selftest on this tracer.
492 * Here we reset the trace buffer, and set the current 606 * Here we reset the trace buffer, and set the current
@@ -494,25 +608,23 @@ int register_tracer(struct tracer *type)
494 * internal tracing to verify that everything is in order. 608 * internal tracing to verify that everything is in order.
495 * If we fail, we do not register this tracer. 609 * If we fail, we do not register this tracer.
496 */ 610 */
497 for_each_tracing_cpu(i) { 611 for_each_tracing_cpu(i)
498 tracing_reset(tr, i); 612 tracing_reset(tr, i);
499 } 613
500 current_trace = type; 614 current_trace = type;
501 tr->ctrl = 0;
502 /* the test is responsible for initializing and enabling */ 615 /* the test is responsible for initializing and enabling */
503 pr_info("Testing tracer %s: ", type->name); 616 pr_info("Testing tracer %s: ", type->name);
504 ret = type->selftest(type, tr); 617 ret = type->selftest(type, tr);
505 /* the test is responsible for resetting too */ 618 /* the test is responsible for resetting too */
506 current_trace = saved_tracer; 619 current_trace = saved_tracer;
507 tr->ctrl = saved_ctrl;
508 if (ret) { 620 if (ret) {
509 printk(KERN_CONT "FAILED!\n"); 621 printk(KERN_CONT "FAILED!\n");
510 goto out; 622 goto out;
511 } 623 }
512 /* Only reset on passing, to avoid touching corrupted buffers */ 624 /* Only reset on passing, to avoid touching corrupted buffers */
513 for_each_tracing_cpu(i) { 625 for_each_tracing_cpu(i)
514 tracing_reset(tr, i); 626 tracing_reset(tr, i);
515 } 627
516 printk(KERN_CONT "PASSED\n"); 628 printk(KERN_CONT "PASSED\n");
517 } 629 }
518#endif 630#endif
@@ -524,7 +636,9 @@ int register_tracer(struct tracer *type)
524 max_tracer_type_len = len; 636 max_tracer_type_len = len;
525 637
526 out: 638 out:
639 tracing_selftest_running = false;
527 mutex_unlock(&trace_types_lock); 640 mutex_unlock(&trace_types_lock);
641 lock_kernel();
528 642
529 return ret; 643 return ret;
530} 644}
@@ -581,6 +695,91 @@ static void trace_init_cmdlines(void)
581 cmdline_idx = 0; 695 cmdline_idx = 0;
582} 696}
583 697
698static int trace_stop_count;
699static DEFINE_SPINLOCK(tracing_start_lock);
700
701/**
702 * ftrace_off_permanent - disable all ftrace code permanently
703 *
704 * This should only be called when a serious anomaly has
705 * been detected. This will turn off function tracing,
706 * ring buffers, and other tracing utilities. It takes no
707 * locks and can be called from any context.
708 */
709void ftrace_off_permanent(void)
710{
711 tracing_disabled = 1;
712 ftrace_stop();
713 tracing_off_permanent();
714}
715
716/**
717 * tracing_start - quick start of the tracer
718 *
719 * If tracing is enabled but was stopped by tracing_stop,
720 * this will start the tracer back up.
721 */
722void tracing_start(void)
723{
724 struct ring_buffer *buffer;
725 unsigned long flags;
726
727 if (tracing_disabled)
728 return;
729
730 spin_lock_irqsave(&tracing_start_lock, flags);
731 if (--trace_stop_count)
732 goto out;
733
734 if (trace_stop_count < 0) {
735 /* Someone screwed up their debugging */
736 WARN_ON_ONCE(1);
737 trace_stop_count = 0;
738 goto out;
739 }
740
741
742 buffer = global_trace.buffer;
743 if (buffer)
744 ring_buffer_record_enable(buffer);
745
746 buffer = max_tr.buffer;
747 if (buffer)
748 ring_buffer_record_enable(buffer);
749
750 ftrace_start();
751 out:
752 spin_unlock_irqrestore(&tracing_start_lock, flags);
753}
754
755/**
756 * tracing_stop - quick stop of the tracer
757 *
758 * Lightweight way to stop tracing. Use in conjunction with
759 * tracing_start.
760 */
761void tracing_stop(void)
762{
763 struct ring_buffer *buffer;
764 unsigned long flags;
765
766 ftrace_stop();
767 spin_lock_irqsave(&tracing_start_lock, flags);
768 if (trace_stop_count++)
769 goto out;
770
771 buffer = global_trace.buffer;
772 if (buffer)
773 ring_buffer_record_disable(buffer);
774
775 buffer = max_tr.buffer;
776 if (buffer)
777 ring_buffer_record_disable(buffer);
778
779 out:
780 spin_unlock_irqrestore(&tracing_start_lock, flags);
781}
782
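The pair above is reference counted through trace_stop_count: tracing_stop() disables recording on both global_trace and max_tr, and tracing_start() only re-enables them once every stop has been paired off. A minimal sketch of how a caller might quiesce the buffers around an inspection step (inspect_trace_buffers() is a hypothetical wrapper, not part of this patch):

static void inspect_trace_buffers(void)
{
	tracing_stop();		/* ring buffers stop recording; nests via trace_stop_count */

	/* ... walk or dump the ring buffers here ... */

	tracing_start();	/* recording resumes once the stop count drops back to zero */
}
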
584void trace_stop_cmdline_recording(void); 783void trace_stop_cmdline_recording(void);
585 784
586static void trace_save_cmdline(struct task_struct *tsk) 785static void trace_save_cmdline(struct task_struct *tsk)
@@ -618,7 +817,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
618 spin_unlock(&trace_cmdline_lock); 817 spin_unlock(&trace_cmdline_lock);
619} 818}
620 819
621static char *trace_find_cmdline(int pid) 820char *trace_find_cmdline(int pid)
622{ 821{
623 char *cmdline = "<...>"; 822 char *cmdline = "<...>";
624 unsigned map; 823 unsigned map;
@@ -655,6 +854,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
655 854
656 entry->preempt_count = pc & 0xff; 855 entry->preempt_count = pc & 0xff;
657 entry->pid = (tsk) ? tsk->pid : 0; 856 entry->pid = (tsk) ? tsk->pid : 0;
857 entry->tgid = (tsk) ? tsk->tgid : 0;
658 entry->flags = 858 entry->flags =
659#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 859#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
660 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 860 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -691,6 +891,56 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
691 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 891 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
692} 892}
693 893
894#ifdef CONFIG_FUNCTION_GRAPH_TRACER
895static void __trace_graph_entry(struct trace_array *tr,
896 struct trace_array_cpu *data,
897 struct ftrace_graph_ent *trace,
898 unsigned long flags,
899 int pc)
900{
901 struct ring_buffer_event *event;
902 struct ftrace_graph_ent_entry *entry;
903 unsigned long irq_flags;
904
905 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
906 return;
907
908 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
909 &irq_flags);
910 if (!event)
911 return;
912 entry = ring_buffer_event_data(event);
913 tracing_generic_entry_update(&entry->ent, flags, pc);
914 entry->ent.type = TRACE_GRAPH_ENT;
915 entry->graph_ent = *trace;
916 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
917}
918
919static void __trace_graph_return(struct trace_array *tr,
920 struct trace_array_cpu *data,
921 struct ftrace_graph_ret *trace,
922 unsigned long flags,
923 int pc)
924{
925 struct ring_buffer_event *event;
926 struct ftrace_graph_ret_entry *entry;
927 unsigned long irq_flags;
928
929 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
930 return;
931
932 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
933 &irq_flags);
934 if (!event)
935 return;
936 entry = ring_buffer_event_data(event);
937 tracing_generic_entry_update(&entry->ent, flags, pc);
938 entry->ent.type = TRACE_GRAPH_RET;
939 entry->ret = *trace;
940 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
941}
942#endif
943
694void 944void
695ftrace(struct trace_array *tr, struct trace_array_cpu *data, 945ftrace(struct trace_array *tr, struct trace_array_cpu *data,
696 unsigned long ip, unsigned long parent_ip, unsigned long flags, 946 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -742,6 +992,46 @@ void __trace_stack(struct trace_array *tr,
742 ftrace_trace_stack(tr, data, flags, skip, preempt_count()); 992 ftrace_trace_stack(tr, data, flags, skip, preempt_count());
743} 993}
744 994
995static void ftrace_trace_userstack(struct trace_array *tr,
996 struct trace_array_cpu *data,
997 unsigned long flags, int pc)
998{
999#ifdef CONFIG_STACKTRACE
1000 struct ring_buffer_event *event;
1001 struct userstack_entry *entry;
1002 struct stack_trace trace;
1003 unsigned long irq_flags;
1004
1005 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1006 return;
1007
1008 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
1009 &irq_flags);
1010 if (!event)
1011 return;
1012 entry = ring_buffer_event_data(event);
1013 tracing_generic_entry_update(&entry->ent, flags, pc);
1014 entry->ent.type = TRACE_USER_STACK;
1015
1016 memset(&entry->caller, 0, sizeof(entry->caller));
1017
1018 trace.nr_entries = 0;
1019 trace.max_entries = FTRACE_STACK_ENTRIES;
1020 trace.skip = 0;
1021 trace.entries = entry->caller;
1022
1023 save_stack_trace_user(&trace);
1024 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1025#endif
1026}
1027
1028void __trace_userstack(struct trace_array *tr,
1029 struct trace_array_cpu *data,
1030 unsigned long flags)
1031{
1032 ftrace_trace_userstack(tr, data, flags, preempt_count());
1033}
1034
745static void 1035static void
746ftrace_trace_special(void *__tr, void *__data, 1036ftrace_trace_special(void *__tr, void *__data,
747 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1037 unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -765,6 +1055,7 @@ ftrace_trace_special(void *__tr, void *__data,
765 entry->arg3 = arg3; 1055 entry->arg3 = arg3;
766 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1056 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
767 ftrace_trace_stack(tr, data, irq_flags, 4, pc); 1057 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
1058 ftrace_trace_userstack(tr, data, irq_flags, pc);
768 1059
769 trace_wake_up(); 1060 trace_wake_up();
770} 1061}
@@ -803,6 +1094,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
803 entry->next_cpu = task_cpu(next); 1094 entry->next_cpu = task_cpu(next);
804 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1095 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
805 ftrace_trace_stack(tr, data, flags, 5, pc); 1096 ftrace_trace_stack(tr, data, flags, 5, pc);
1097 ftrace_trace_userstack(tr, data, flags, pc);
806} 1098}
807 1099
808void 1100void
@@ -832,6 +1124,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
832 entry->next_cpu = task_cpu(wakee); 1124 entry->next_cpu = task_cpu(wakee);
833 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1125 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
834 ftrace_trace_stack(tr, data, flags, 6, pc); 1126 ftrace_trace_stack(tr, data, flags, 6, pc);
1127 ftrace_trace_userstack(tr, data, flags, pc);
835 1128
836 trace_wake_up(); 1129 trace_wake_up();
837} 1130}
@@ -841,26 +1134,28 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
841{ 1134{
842 struct trace_array *tr = &global_trace; 1135 struct trace_array *tr = &global_trace;
843 struct trace_array_cpu *data; 1136 struct trace_array_cpu *data;
1137 unsigned long flags;
844 int cpu; 1138 int cpu;
845 int pc; 1139 int pc;
846 1140
847 if (tracing_disabled || !tr->ctrl) 1141 if (tracing_disabled)
848 return; 1142 return;
849 1143
850 pc = preempt_count(); 1144 pc = preempt_count();
851 preempt_disable_notrace(); 1145 local_irq_save(flags);
852 cpu = raw_smp_processor_id(); 1146 cpu = raw_smp_processor_id();
853 data = tr->data[cpu]; 1147 data = tr->data[cpu];
854 1148
855 if (likely(!atomic_read(&data->disabled))) 1149 if (likely(atomic_inc_return(&data->disabled) == 1))
856 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); 1150 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
857 1151
858 preempt_enable_notrace(); 1152 atomic_dec(&data->disabled);
1153 local_irq_restore(flags);
859} 1154}
860 1155
861#ifdef CONFIG_FUNCTION_TRACER 1156#ifdef CONFIG_FUNCTION_TRACER
862static void 1157static void
863function_trace_call(unsigned long ip, unsigned long parent_ip) 1158function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
864{ 1159{
865 struct trace_array *tr = &global_trace; 1160 struct trace_array *tr = &global_trace;
866 struct trace_array_cpu *data; 1161 struct trace_array_cpu *data;
@@ -873,8 +1168,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
873 return; 1168 return;
874 1169
875 pc = preempt_count(); 1170 pc = preempt_count();
876 resched = need_resched(); 1171 resched = ftrace_preempt_disable();
877 preempt_disable_notrace();
878 local_save_flags(flags); 1172 local_save_flags(flags);
879 cpu = raw_smp_processor_id(); 1173 cpu = raw_smp_processor_id();
880 data = tr->data[cpu]; 1174 data = tr->data[cpu];
@@ -884,11 +1178,96 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
884 trace_function(tr, data, ip, parent_ip, flags, pc); 1178 trace_function(tr, data, ip, parent_ip, flags, pc);
885 1179
886 atomic_dec(&data->disabled); 1180 atomic_dec(&data->disabled);
887 if (resched) 1181 ftrace_preempt_enable(resched);
888 preempt_enable_no_resched_notrace(); 1182}
889 else 1183
890 preempt_enable_notrace(); 1184static void
1185function_trace_call(unsigned long ip, unsigned long parent_ip)
1186{
1187 struct trace_array *tr = &global_trace;
1188 struct trace_array_cpu *data;
1189 unsigned long flags;
1190 long disabled;
1191 int cpu;
1192 int pc;
1193
1194 if (unlikely(!ftrace_function_enabled))
1195 return;
1196
1197 /*
1198 * Need to use raw, since this must be called before the
1199 * recursive protection is performed.
1200 */
1201 local_irq_save(flags);
1202 cpu = raw_smp_processor_id();
1203 data = tr->data[cpu];
1204 disabled = atomic_inc_return(&data->disabled);
1205
1206 if (likely(disabled == 1)) {
1207 pc = preempt_count();
1208 trace_function(tr, data, ip, parent_ip, flags, pc);
1209 }
1210
1211 atomic_dec(&data->disabled);
1212 local_irq_restore(flags);
1213}
1214
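function_trace_call_preempt_only() above leans on the ftrace_preempt_disable()/ftrace_preempt_enable() pair instead of the open-coded sequence that was removed. A sketch of what the pair encapsulates, reconstructed from the deleted lines (the real helpers are defined elsewhere in this series, so treat this as an approximation):

static inline int ftrace_preempt_disable(void)
{
	int resched = need_resched();

	preempt_disable_notrace();
	return resched;
}

static inline void ftrace_preempt_enable(int resched)
{
	/* if a reschedule was already pending, do not schedule from inside the tracer */
	if (resched)
		preempt_enable_no_resched_notrace();
	else
		preempt_enable_notrace();
}
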
1215#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1216int trace_graph_entry(struct ftrace_graph_ent *trace)
1217{
1218 struct trace_array *tr = &global_trace;
1219 struct trace_array_cpu *data;
1220 unsigned long flags;
1221 long disabled;
1222 int cpu;
1223 int pc;
1224
1225 if (!ftrace_trace_task(current))
1226 return 0;
1227
1228 if (!ftrace_graph_addr(trace->func))
1229 return 0;
1230
1231 local_irq_save(flags);
1232 cpu = raw_smp_processor_id();
1233 data = tr->data[cpu];
1234 disabled = atomic_inc_return(&data->disabled);
1235 if (likely(disabled == 1)) {
1236 pc = preempt_count();
1237 __trace_graph_entry(tr, data, trace, flags, pc);
1238 }
1239 /* Only do the atomic if it is not already set */
1240 if (!test_tsk_trace_graph(current))
1241 set_tsk_trace_graph(current);
1242 atomic_dec(&data->disabled);
1243 local_irq_restore(flags);
1244
1245 return 1;
1246}
1247
1248void trace_graph_return(struct ftrace_graph_ret *trace)
1249{
1250 struct trace_array *tr = &global_trace;
1251 struct trace_array_cpu *data;
1252 unsigned long flags;
1253 long disabled;
1254 int cpu;
1255 int pc;
1256
1257 local_irq_save(flags);
1258 cpu = raw_smp_processor_id();
1259 data = tr->data[cpu];
1260 disabled = atomic_inc_return(&data->disabled);
1261 if (likely(disabled == 1)) {
1262 pc = preempt_count();
1263 __trace_graph_return(tr, data, trace, flags, pc);
1264 }
1265 if (!trace->depth)
1266 clear_tsk_trace_graph(current);
1267 atomic_dec(&data->disabled);
1268 local_irq_restore(flags);
891} 1269}
1270#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
892 1271
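trace_graph_entry() and trace_graph_return() are the hooks that a graph-based tracer hands to the function-graph infrastructure. A hedged sketch of how a tracer's init might wire them up, assuming the register_ftrace_graph()/unregister_ftrace_graph() helpers from the function-graph patches (return handler first, entry handler second):

static int graph_trace_init(struct trace_array *tr)
{
	int cpu, ret;

	for_each_tracing_cpu(cpu)
		tracing_reset(tr, cpu);

	ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry);
	if (ret)
		return ret;
	tracing_start_cmdline_record();
	return 0;
}

static void graph_trace_reset(struct trace_array *tr)
{
	tracing_stop_cmdline_record();
	unregister_ftrace_graph();
}
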
893static struct ftrace_ops trace_ops __read_mostly = 1272static struct ftrace_ops trace_ops __read_mostly =
894{ 1273{
@@ -898,9 +1277,14 @@ static struct ftrace_ops trace_ops __read_mostly =
898void tracing_start_function_trace(void) 1277void tracing_start_function_trace(void)
899{ 1278{
900 ftrace_function_enabled = 0; 1279 ftrace_function_enabled = 0;
1280
1281 if (trace_flags & TRACE_ITER_PREEMPTONLY)
1282 trace_ops.func = function_trace_call_preempt_only;
1283 else
1284 trace_ops.func = function_trace_call;
1285
901 register_ftrace_function(&trace_ops); 1286 register_ftrace_function(&trace_ops);
902 if (tracer_enabled) 1287 ftrace_function_enabled = 1;
903 ftrace_function_enabled = 1;
904} 1288}
905 1289
906void tracing_stop_function_trace(void) 1290void tracing_stop_function_trace(void)
@@ -912,6 +1296,7 @@ void tracing_stop_function_trace(void)
912 1296
913enum trace_file_type { 1297enum trace_file_type {
914 TRACE_FILE_LAT_FMT = 1, 1298 TRACE_FILE_LAT_FMT = 1,
1299 TRACE_FILE_ANNOTATE = 2,
915}; 1300};
916 1301
917static void trace_iterator_increment(struct trace_iterator *iter, int cpu) 1302static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
@@ -1047,10 +1432,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1047 1432
1048 atomic_inc(&trace_record_cmdline_disabled); 1433 atomic_inc(&trace_record_cmdline_disabled);
1049 1434
1050 /* let the tracer grab locks here if needed */
1051 if (current_trace->start)
1052 current_trace->start(iter);
1053
1054 if (*pos != iter->pos) { 1435 if (*pos != iter->pos) {
1055 iter->ent = NULL; 1436 iter->ent = NULL;
1056 iter->cpu = 0; 1437 iter->cpu = 0;
@@ -1077,14 +1458,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1077 1458
1078static void s_stop(struct seq_file *m, void *p) 1459static void s_stop(struct seq_file *m, void *p)
1079{ 1460{
1080 struct trace_iterator *iter = m->private;
1081
1082 atomic_dec(&trace_record_cmdline_disabled); 1461 atomic_dec(&trace_record_cmdline_disabled);
1083
1084 /* let the tracer release locks here if needed */
1085 if (current_trace && current_trace == iter->trace && iter->trace->stop)
1086 iter->trace->stop(iter);
1087
1088 mutex_unlock(&trace_types_lock); 1462 mutex_unlock(&trace_types_lock);
1089} 1463}
1090 1464
@@ -1143,7 +1517,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1143# define IP_FMT "%016lx" 1517# define IP_FMT "%016lx"
1144#endif 1518#endif
1145 1519
1146static int 1520int
1147seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 1521seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1148{ 1522{
1149 int ret; 1523 int ret;
@@ -1164,6 +1538,78 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1164 return ret; 1538 return ret;
1165} 1539}
1166 1540
1541static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
1542 unsigned long ip, unsigned long sym_flags)
1543{
1544 struct file *file = NULL;
1545 unsigned long vmstart = 0;
1546 int ret = 1;
1547
1548 if (mm) {
1549 const struct vm_area_struct *vma;
1550
1551 down_read(&mm->mmap_sem);
1552 vma = find_vma(mm, ip);
1553 if (vma) {
1554 file = vma->vm_file;
1555 vmstart = vma->vm_start;
1556 }
1557 if (file) {
1558 ret = trace_seq_path(s, &file->f_path);
1559 if (ret)
1560 ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
1561 }
1562 up_read(&mm->mmap_sem);
1563 }
1564 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
1565 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1566 return ret;
1567}
1568
1569static int
1570seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
1571 unsigned long sym_flags)
1572{
1573 struct mm_struct *mm = NULL;
1574 int ret = 1;
1575 unsigned int i;
1576
1577 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
1578 struct task_struct *task;
1579 /*
1580 * we do the lookup on the thread group leader,
1581 * since individual threads might have already quit!
1582 */
1583 rcu_read_lock();
1584 task = find_task_by_vpid(entry->ent.tgid);
1585 if (task)
1586 mm = get_task_mm(task);
1587 rcu_read_unlock();
1588 }
1589
1590 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1591 unsigned long ip = entry->caller[i];
1592
1593 if (ip == ULONG_MAX || !ret)
1594 break;
1595 if (i && ret)
1596 ret = trace_seq_puts(s, " <- ");
1597 if (!ip) {
1598 if (ret)
1599 ret = trace_seq_puts(s, "??");
1600 continue;
1601 }
1602 if (!ret)
1603 break;
1604 if (ret)
1605 ret = seq_print_user_ip(s, mm, ip, sym_flags);
1606 }
1607
1608 if (mm)
1609 mmput(mm);
1610 return ret;
1611}
1612
1167static void print_lat_help_header(struct seq_file *m) 1613static void print_lat_help_header(struct seq_file *m)
1168{ 1614{
1169 seq_puts(m, "# _------=> CPU# \n"); 1615 seq_puts(m, "# _------=> CPU# \n");
@@ -1338,6 +1784,23 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1338 trace_seq_putc(s, '\n'); 1784 trace_seq_putc(s, '\n');
1339} 1785}
1340 1786
1787static void test_cpu_buff_start(struct trace_iterator *iter)
1788{
1789 struct trace_seq *s = &iter->seq;
1790
1791 if (!(trace_flags & TRACE_ITER_ANNOTATE))
1792 return;
1793
1794 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
1795 return;
1796
1797 if (cpu_isset(iter->cpu, iter->started))
1798 return;
1799
1800 cpu_set(iter->cpu, iter->started);
1801 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1802}
1803
1341static enum print_line_t 1804static enum print_line_t
1342print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) 1805print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1343{ 1806{
@@ -1357,6 +1820,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1357 if (entry->type == TRACE_CONT) 1820 if (entry->type == TRACE_CONT)
1358 return TRACE_TYPE_HANDLED; 1821 return TRACE_TYPE_HANDLED;
1359 1822
1823 test_cpu_buff_start(iter);
1824
1360 next_entry = find_next_entry(iter, NULL, &next_ts); 1825 next_entry = find_next_entry(iter, NULL, &next_ts);
1361 if (!next_entry) 1826 if (!next_entry)
1362 next_ts = iter->ts; 1827 next_ts = iter->ts;
@@ -1448,6 +1913,27 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1448 trace_seq_print_cont(s, iter); 1913 trace_seq_print_cont(s, iter);
1449 break; 1914 break;
1450 } 1915 }
1916 case TRACE_BRANCH: {
1917 struct trace_branch *field;
1918
1919 trace_assign_type(field, entry);
1920
1921 trace_seq_printf(s, "[%s] %s:%s:%d\n",
1922 field->correct ? " ok " : " MISS ",
1923 field->func,
1924 field->file,
1925 field->line);
1926 break;
1927 }
1928 case TRACE_USER_STACK: {
1929 struct userstack_entry *field;
1930
1931 trace_assign_type(field, entry);
1932
1933 seq_print_userip_objs(field, s, sym_flags);
1934 trace_seq_putc(s, '\n');
1935 break;
1936 }
1451 default: 1937 default:
1452 trace_seq_printf(s, "Unknown type %d\n", entry->type); 1938 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1453 } 1939 }
@@ -1472,6 +1958,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1472 if (entry->type == TRACE_CONT) 1958 if (entry->type == TRACE_CONT)
1473 return TRACE_TYPE_HANDLED; 1959 return TRACE_TYPE_HANDLED;
1474 1960
1961 test_cpu_buff_start(iter);
1962
1475 comm = trace_find_cmdline(iter->ent->pid); 1963 comm = trace_find_cmdline(iter->ent->pid);
1476 1964
1477 t = ns2usecs(iter->ts); 1965 t = ns2usecs(iter->ts);
@@ -1581,6 +2069,37 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1581 trace_seq_print_cont(s, iter); 2069 trace_seq_print_cont(s, iter);
1582 break; 2070 break;
1583 } 2071 }
2072 case TRACE_GRAPH_RET: {
2073 return print_graph_function(iter);
2074 }
2075 case TRACE_GRAPH_ENT: {
2076 return print_graph_function(iter);
2077 }
2078 case TRACE_BRANCH: {
2079 struct trace_branch *field;
2080
2081 trace_assign_type(field, entry);
2082
2083 trace_seq_printf(s, "[%s] %s:%s:%d\n",
2084 field->correct ? " ok " : " MISS ",
2085 field->func,
2086 field->file,
2087 field->line);
2088 break;
2089 }
2090 case TRACE_USER_STACK: {
2091 struct userstack_entry *field;
2092
2093 trace_assign_type(field, entry);
2094
2095 ret = seq_print_userip_objs(field, s, sym_flags);
2096 if (!ret)
2097 return TRACE_TYPE_PARTIAL_LINE;
2098 ret = trace_seq_putc(s, '\n');
2099 if (!ret)
2100 return TRACE_TYPE_PARTIAL_LINE;
2101 break;
2102 }
1584 } 2103 }
1585 return TRACE_TYPE_HANDLED; 2104 return TRACE_TYPE_HANDLED;
1586} 2105}
@@ -1640,6 +2159,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1640 break; 2159 break;
1641 } 2160 }
1642 case TRACE_SPECIAL: 2161 case TRACE_SPECIAL:
2162 case TRACE_USER_STACK:
1643 case TRACE_STACK: { 2163 case TRACE_STACK: {
1644 struct special_entry *field; 2164 struct special_entry *field;
1645 2165
@@ -1728,6 +2248,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1728 break; 2248 break;
1729 } 2249 }
1730 case TRACE_SPECIAL: 2250 case TRACE_SPECIAL:
2251 case TRACE_USER_STACK:
1731 case TRACE_STACK: { 2252 case TRACE_STACK: {
1732 struct special_entry *field; 2253 struct special_entry *field;
1733 2254
@@ -1755,7 +2276,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1755 return TRACE_TYPE_HANDLED; 2276 return TRACE_TYPE_HANDLED;
1756 2277
1757 SEQ_PUT_FIELD_RET(s, entry->pid); 2278 SEQ_PUT_FIELD_RET(s, entry->pid);
1758 SEQ_PUT_FIELD_RET(s, iter->cpu); 2279 SEQ_PUT_FIELD_RET(s, entry->cpu);
1759 SEQ_PUT_FIELD_RET(s, iter->ts); 2280 SEQ_PUT_FIELD_RET(s, iter->ts);
1760 2281
1761 switch (entry->type) { 2282 switch (entry->type) {
@@ -1782,6 +2303,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1782 break; 2303 break;
1783 } 2304 }
1784 case TRACE_SPECIAL: 2305 case TRACE_SPECIAL:
2306 case TRACE_USER_STACK:
1785 case TRACE_STACK: { 2307 case TRACE_STACK: {
1786 struct special_entry *field; 2308 struct special_entry *field;
1787 2309
@@ -1847,7 +2369,9 @@ static int s_show(struct seq_file *m, void *v)
1847 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2369 seq_printf(m, "# tracer: %s\n", iter->trace->name);
1848 seq_puts(m, "#\n"); 2370 seq_puts(m, "#\n");
1849 } 2371 }
1850 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2372 if (iter->trace && iter->trace->print_header)
2373 iter->trace->print_header(m);
2374 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
1851 /* print nothing if the buffers are empty */ 2375 /* print nothing if the buffers are empty */
1852 if (trace_empty(iter)) 2376 if (trace_empty(iter))
1853 return 0; 2377 return 0;
@@ -1899,6 +2423,15 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1899 iter->trace = current_trace; 2423 iter->trace = current_trace;
1900 iter->pos = -1; 2424 iter->pos = -1;
1901 2425
2426 /* Notify the tracer early; before we stop tracing. */
2427 if (iter->trace && iter->trace->open)
2428 iter->trace->open(iter);
2429
2430 /* Annotate start of buffers if we had overruns */
2431 if (ring_buffer_overruns(iter->tr->buffer))
2432 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2433
2434
1902 for_each_tracing_cpu(cpu) { 2435 for_each_tracing_cpu(cpu) {
1903 2436
1904 iter->buffer_iter[cpu] = 2437 iter->buffer_iter[cpu] =
@@ -1917,13 +2450,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1917 m->private = iter; 2450 m->private = iter;
1918 2451
1919 /* stop the trace while dumping */ 2452 /* stop the trace while dumping */
1920 if (iter->tr->ctrl) { 2453 tracing_stop();
1921 tracer_enabled = 0;
1922 ftrace_function_enabled = 0;
1923 }
1924
1925 if (iter->trace && iter->trace->open)
1926 iter->trace->open(iter);
1927 2454
1928 mutex_unlock(&trace_types_lock); 2455 mutex_unlock(&trace_types_lock);
1929 2456
@@ -1936,6 +2463,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1936 ring_buffer_read_finish(iter->buffer_iter[cpu]); 2463 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1937 } 2464 }
1938 mutex_unlock(&trace_types_lock); 2465 mutex_unlock(&trace_types_lock);
2466 kfree(iter);
1939 2467
1940 return ERR_PTR(-ENOMEM); 2468 return ERR_PTR(-ENOMEM);
1941} 2469}
@@ -1965,14 +2493,7 @@ int tracing_release(struct inode *inode, struct file *file)
1965 iter->trace->close(iter); 2493 iter->trace->close(iter);
1966 2494
1967 /* reenable tracing if it was previously enabled */ 2495 /* reenable tracing if it was previously enabled */
1968 if (iter->tr->ctrl) { 2496 tracing_start();
1969 tracer_enabled = 1;
1970 /*
1971 * It is safe to enable function tracing even if it
1972 * isn't used
1973 */
1974 ftrace_function_enabled = 1;
1975 }
1976 mutex_unlock(&trace_types_lock); 2497 mutex_unlock(&trace_types_lock);
1977 2498
1978 seq_release(inode, file); 2499 seq_release(inode, file);
@@ -2150,7 +2671,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2150 if (err) 2671 if (err)
2151 goto err_unlock; 2672 goto err_unlock;
2152 2673
2153 raw_local_irq_disable(); 2674 local_irq_disable();
2154 __raw_spin_lock(&ftrace_max_lock); 2675 __raw_spin_lock(&ftrace_max_lock);
2155 for_each_tracing_cpu(cpu) { 2676 for_each_tracing_cpu(cpu) {
2156 /* 2677 /*
@@ -2167,7 +2688,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2167 } 2688 }
2168 } 2689 }
2169 __raw_spin_unlock(&ftrace_max_lock); 2690 __raw_spin_unlock(&ftrace_max_lock);
2170 raw_local_irq_enable(); 2691 local_irq_enable();
2171 2692
2172 tracing_cpumask = tracing_cpumask_new; 2693 tracing_cpumask = tracing_cpumask_new;
2173 2694
@@ -2188,13 +2709,16 @@ static struct file_operations tracing_cpumask_fops = {
2188}; 2709};
2189 2710
2190static ssize_t 2711static ssize_t
2191tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, 2712tracing_trace_options_read(struct file *filp, char __user *ubuf,
2192 size_t cnt, loff_t *ppos) 2713 size_t cnt, loff_t *ppos)
2193{ 2714{
2715 int i;
2194 char *buf; 2716 char *buf;
2195 int r = 0; 2717 int r = 0;
2196 int len = 0; 2718 int len = 0;
2197 int i; 2719 u32 tracer_flags = current_trace->flags->val;
2720 struct tracer_opt *trace_opts = current_trace->flags->opts;
2721
2198 2722
2199 /* calculate max size */ 2723 /* calculate max size */
2200 for (i = 0; trace_options[i]; i++) { 2724 for (i = 0; trace_options[i]; i++) {
@@ -2202,6 +2726,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2202 len += 3; /* "no" and space */ 2726 len += 3; /* "no" and space */
2203 } 2727 }
2204 2728
2729 /*
2730 * Increase the size with names of options specific
2731 * of the current tracer.
2732 */
2733 for (i = 0; trace_opts[i].name; i++) {
2734 len += strlen(trace_opts[i].name);
2735 len += 3; /* "no" and space */
2736 }
2737
2205 /* +2 for \n and \0 */ 2738 /* +2 for \n and \0 */
2206 buf = kmalloc(len + 2, GFP_KERNEL); 2739 buf = kmalloc(len + 2, GFP_KERNEL);
2207 if (!buf) 2740 if (!buf)
@@ -2214,6 +2747,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2214 r += sprintf(buf + r, "no%s ", trace_options[i]); 2747 r += sprintf(buf + r, "no%s ", trace_options[i]);
2215 } 2748 }
2216 2749
2750 for (i = 0; trace_opts[i].name; i++) {
2751 if (tracer_flags & trace_opts[i].bit)
2752 r += sprintf(buf + r, "%s ",
2753 trace_opts[i].name);
2754 else
2755 r += sprintf(buf + r, "no%s ",
2756 trace_opts[i].name);
2757 }
2758
2217 r += sprintf(buf + r, "\n"); 2759 r += sprintf(buf + r, "\n");
2218 WARN_ON(r >= len + 2); 2760 WARN_ON(r >= len + 2);
2219 2761
@@ -2224,13 +2766,48 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2224 return r; 2766 return r;
2225} 2767}
2226 2768
2769/* Try to assign a tracer-specific option */
2770static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2771{
2772 struct tracer_flags *trace_flags = trace->flags;
2773 struct tracer_opt *opts = NULL;
2774 int ret = 0, i = 0;
2775 int len;
2776
2777 for (i = 0; trace_flags->opts[i].name; i++) {
2778 opts = &trace_flags->opts[i];
2779 len = strlen(opts->name);
2780
2781 if (strncmp(cmp, opts->name, len) == 0) {
2782 ret = trace->set_flag(trace_flags->val,
2783 opts->bit, !neg);
2784 break;
2785 }
2786 }
2787 /* Not found */
2788 if (!trace_flags->opts[i].name)
2789 return -EINVAL;
2790
2791 /* Refused to handle */
2792 if (ret)
2793 return ret;
2794
2795 if (neg)
2796 trace_flags->val &= ~opts->bit;
2797 else
2798 trace_flags->val |= opts->bit;
2799
2800 return 0;
2801}
2802
2227static ssize_t 2803static ssize_t
2228tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, 2804tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2229 size_t cnt, loff_t *ppos) 2805 size_t cnt, loff_t *ppos)
2230{ 2806{
2231 char buf[64]; 2807 char buf[64];
2232 char *cmp = buf; 2808 char *cmp = buf;
2233 int neg = 0; 2809 int neg = 0;
2810 int ret;
2234 int i; 2811 int i;
2235 2812
2236 if (cnt >= sizeof(buf)) 2813 if (cnt >= sizeof(buf))
@@ -2257,11 +2834,13 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2257 break; 2834 break;
2258 } 2835 }
2259 } 2836 }
2260 /* 2837
2261 * If no option could be set, return an error: 2838 /* If no option could be set, test the specific tracer options */
2262 */ 2839 if (!trace_options[i]) {
2263 if (!trace_options[i]) 2840 ret = set_tracer_option(current_trace, cmp, neg);
2264 return -EINVAL; 2841 if (ret)
2842 return ret;
2843 }
2265 2844
2266 filp->f_pos += cnt; 2845 filp->f_pos += cnt;
2267 2846
@@ -2270,8 +2849,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2270 2849
2271static struct file_operations tracing_iter_fops = { 2850static struct file_operations tracing_iter_fops = {
2272 .open = tracing_open_generic, 2851 .open = tracing_open_generic,
2273 .read = tracing_iter_ctrl_read, 2852 .read = tracing_trace_options_read,
2274 .write = tracing_iter_ctrl_write, 2853 .write = tracing_trace_options_write,
2275}; 2854};
2276 2855
2277static const char readme_msg[] = 2856static const char readme_msg[] =
@@ -2285,9 +2864,9 @@ static const char readme_msg[] =
2285 "# echo sched_switch > /debug/tracing/current_tracer\n" 2864 "# echo sched_switch > /debug/tracing/current_tracer\n"
2286 "# cat /debug/tracing/current_tracer\n" 2865 "# cat /debug/tracing/current_tracer\n"
2287 "sched_switch\n" 2866 "sched_switch\n"
2288 "# cat /debug/tracing/iter_ctrl\n" 2867 "# cat /debug/tracing/trace_options\n"
2289 "noprint-parent nosym-offset nosym-addr noverbose\n" 2868 "noprint-parent nosym-offset nosym-addr noverbose\n"
2290 "# echo print-parent > /debug/tracing/iter_ctrl\n" 2869 "# echo print-parent > /debug/tracing/trace_options\n"
2291 "# echo 1 > /debug/tracing/tracing_enabled\n" 2870 "# echo 1 > /debug/tracing/tracing_enabled\n"
2292 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2871 "# cat /debug/tracing/trace > /tmp/trace.txt\n"
2293 "echo 0 > /debug/tracing/tracing_enabled\n" 2872 "echo 0 > /debug/tracing/tracing_enabled\n"
@@ -2310,11 +2889,10 @@ static ssize_t
2310tracing_ctrl_read(struct file *filp, char __user *ubuf, 2889tracing_ctrl_read(struct file *filp, char __user *ubuf,
2311 size_t cnt, loff_t *ppos) 2890 size_t cnt, loff_t *ppos)
2312{ 2891{
2313 struct trace_array *tr = filp->private_data;
2314 char buf[64]; 2892 char buf[64];
2315 int r; 2893 int r;
2316 2894
2317 r = sprintf(buf, "%ld\n", tr->ctrl); 2895 r = sprintf(buf, "%u\n", tracer_enabled);
2318 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2896 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2319} 2897}
2320 2898
@@ -2342,16 +2920,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2342 val = !!val; 2920 val = !!val;
2343 2921
2344 mutex_lock(&trace_types_lock); 2922 mutex_lock(&trace_types_lock);
2345 if (tr->ctrl ^ val) { 2923 if (tracer_enabled ^ val) {
2346 if (val) 2924 if (val) {
2347 tracer_enabled = 1; 2925 tracer_enabled = 1;
2348 else 2926 if (current_trace->start)
2927 current_trace->start(tr);
2928 tracing_start();
2929 } else {
2349 tracer_enabled = 0; 2930 tracer_enabled = 0;
2350 2931 tracing_stop();
2351 tr->ctrl = val; 2932 if (current_trace->stop)
2352 2933 current_trace->stop(tr);
2353 if (current_trace && current_trace->ctrl_update) 2934 }
2354 current_trace->ctrl_update(tr);
2355 } 2935 }
2356 mutex_unlock(&trace_types_lock); 2936 mutex_unlock(&trace_types_lock);
2357 2937
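With this hunk, flipping tracing_enabled no longer just toggles tr->ctrl: it drives tracer_enabled, calls the current tracer's start()/stop() callbacks, and goes through the new tracing_start()/tracing_stop() helpers. A hypothetical userspace sketch that brackets a workload with it (same /debug mount assumption as the readme):

#include <stdio.h>

static int set_tracing_enabled(int on)
{
	FILE *f = fopen("/debug/tracing/tracing_enabled", "w");

	if (!f)
		return -1;
	fprintf(f, "%d", on);
	fclose(f);
	return 0;
}

int main(void)
{
	set_tracing_enabled(1);
	/* ... run the workload being traced ... */
	set_tracing_enabled(0);
	return 0;
}
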
@@ -2377,29 +2957,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2377 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2957 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2378} 2958}
2379 2959
2380static ssize_t 2960static int tracing_set_tracer(char *buf)
2381tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2382 size_t cnt, loff_t *ppos)
2383{ 2961{
2384 struct trace_array *tr = &global_trace; 2962 struct trace_array *tr = &global_trace;
2385 struct tracer *t; 2963 struct tracer *t;
2386 char buf[max_tracer_type_len+1]; 2964 int ret = 0;
2387 int i;
2388 size_t ret;
2389
2390 ret = cnt;
2391
2392 if (cnt > max_tracer_type_len)
2393 cnt = max_tracer_type_len;
2394
2395 if (copy_from_user(&buf, ubuf, cnt))
2396 return -EFAULT;
2397
2398 buf[cnt] = 0;
2399
2400 /* strip ending whitespace. */
2401 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
2402 buf[i] = 0;
2403 2965
2404 mutex_lock(&trace_types_lock); 2966 mutex_lock(&trace_types_lock);
2405 for (t = trace_types; t; t = t->next) { 2967 for (t = trace_types; t; t = t->next) {
@@ -2413,18 +2975,52 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2413 if (t == current_trace) 2975 if (t == current_trace)
2414 goto out; 2976 goto out;
2415 2977
2978 trace_branch_disable();
2416 if (current_trace && current_trace->reset) 2979 if (current_trace && current_trace->reset)
2417 current_trace->reset(tr); 2980 current_trace->reset(tr);
2418 2981
2419 current_trace = t; 2982 current_trace = t;
2420 if (t->init) 2983 if (t->init) {
2421 t->init(tr); 2984 ret = t->init(tr);
2985 if (ret)
2986 goto out;
2987 }
2422 2988
2989 trace_branch_enable(tr);
2423 out: 2990 out:
2424 mutex_unlock(&trace_types_lock); 2991 mutex_unlock(&trace_types_lock);
2425 2992
2426 if (ret > 0) 2993 return ret;
2427 filp->f_pos += ret; 2994}
2995
2996static ssize_t
2997tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2998 size_t cnt, loff_t *ppos)
2999{
3000 char buf[max_tracer_type_len+1];
3001 int i;
3002 size_t ret;
3003 int err;
3004
3005 ret = cnt;
3006
3007 if (cnt > max_tracer_type_len)
3008 cnt = max_tracer_type_len;
3009
3010 if (copy_from_user(&buf, ubuf, cnt))
3011 return -EFAULT;
3012
3013 buf[cnt] = 0;
3014
3015 /* strip ending whitespace. */
3016 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
3017 buf[i] = 0;
3018
3019 err = tracing_set_tracer(buf);
3020 if (err)
3021 return err;
3022
3023 filp->f_pos += ret;
2428 3024
2429 return ret; 3025 return ret;
2430} 3026}
@@ -2491,6 +3087,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2491 return -ENOMEM; 3087 return -ENOMEM;
2492 3088
2493 mutex_lock(&trace_types_lock); 3089 mutex_lock(&trace_types_lock);
3090
3091 /* trace pipe does not show start of buffer */
3092 cpus_setall(iter->started);
3093
2494 iter->tr = &global_trace; 3094 iter->tr = &global_trace;
2495 iter->trace = current_trace; 3095 iter->trace = current_trace;
2496 filp->private_data = iter; 3096 filp->private_data = iter;
@@ -2666,7 +3266,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
2666 char buf[64]; 3266 char buf[64];
2667 int r; 3267 int r;
2668 3268
2669 r = sprintf(buf, "%lu\n", tr->entries); 3269 r = sprintf(buf, "%lu\n", tr->entries >> 10);
2670 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3270 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2671} 3271}
2672 3272
@@ -2676,8 +3276,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2676{ 3276{
2677 unsigned long val; 3277 unsigned long val;
2678 char buf[64]; 3278 char buf[64];
2679 int ret; 3279 int ret, cpu;
2680 struct trace_array *tr = filp->private_data;
2681 3280
2682 if (cnt >= sizeof(buf)) 3281 if (cnt >= sizeof(buf))
2683 return -EINVAL; 3282 return -EINVAL;
@@ -2697,13 +3296,19 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2697 3296
2698 mutex_lock(&trace_types_lock); 3297 mutex_lock(&trace_types_lock);
2699 3298
2700 if (tr->ctrl) { 3299 tracing_stop();
2701 cnt = -EBUSY; 3300
2702 pr_info("ftrace: please disable tracing" 3301 /* disable all cpu buffers */
2703 " before modifying buffer size\n"); 3302 for_each_tracing_cpu(cpu) {
2704 goto out; 3303 if (global_trace.data[cpu])
3304 atomic_inc(&global_trace.data[cpu]->disabled);
3305 if (max_tr.data[cpu])
3306 atomic_inc(&max_tr.data[cpu]->disabled);
2705 } 3307 }
2706 3308
3309 /* value is in KB */
3310 val <<= 10;
3311
2707 if (val != global_trace.entries) { 3312 if (val != global_trace.entries) {
2708 ret = ring_buffer_resize(global_trace.buffer, val); 3313 ret = ring_buffer_resize(global_trace.buffer, val);
2709 if (ret < 0) { 3314 if (ret < 0) {
@@ -2735,6 +3340,14 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2735 if (tracing_disabled) 3340 if (tracing_disabled)
2736 cnt = -ENOMEM; 3341 cnt = -ENOMEM;
2737 out: 3342 out:
3343 for_each_tracing_cpu(cpu) {
3344 if (global_trace.data[cpu])
3345 atomic_dec(&global_trace.data[cpu]->disabled);
3346 if (max_tr.data[cpu])
3347 atomic_dec(&max_tr.data[cpu]->disabled);
3348 }
3349
3350 tracing_start();
2738 max_tr.entries = global_trace.entries; 3351 max_tr.entries = global_trace.entries;
2739 mutex_unlock(&trace_types_lock); 3352 mutex_unlock(&trace_types_lock);
2740 3353
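The buffer size interface now works in kilobytes: reads report tr->entries >> 10 and writes are shifted left by 10 before the ring buffers are resized, with all per-cpu buffers disabled and tracing stopped for the duration. A hypothetical userspace sketch requesting a 1 MB buffer through the renamed buffer_size_kb file (same /debug mount assumption as above):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/debug/tracing/buffer_size_kb", "w");

	if (!f) {
		perror("buffer_size_kb");
		return 1;
	}
	fprintf(f, "%d", 1024);		/* value is in KB, so this asks for 1 MB */
	fclose(f);
	return 0;
}
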
@@ -2746,7 +3359,7 @@ static int mark_printk(const char *fmt, ...)
2746 int ret; 3359 int ret;
2747 va_list args; 3360 va_list args;
2748 va_start(args, fmt); 3361 va_start(args, fmt);
2749 ret = trace_vprintk(0, fmt, args); 3362 ret = trace_vprintk(0, -1, fmt, args);
2750 va_end(args); 3363 va_end(args);
2751 return ret; 3364 return ret;
2752} 3365}
@@ -2757,9 +3370,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
2757{ 3370{
2758 char *buf; 3371 char *buf;
2759 char *end; 3372 char *end;
2760 struct trace_array *tr = &global_trace;
2761 3373
2762 if (!tr->ctrl || tracing_disabled) 3374 if (tracing_disabled)
2763 return -EINVAL; 3375 return -EINVAL;
2764 3376
2765 if (cnt > TRACE_BUF_SIZE) 3377 if (cnt > TRACE_BUF_SIZE)
@@ -2825,22 +3437,38 @@ static struct file_operations tracing_mark_fops = {
2825 3437
2826#ifdef CONFIG_DYNAMIC_FTRACE 3438#ifdef CONFIG_DYNAMIC_FTRACE
2827 3439
3440int __weak ftrace_arch_read_dyn_info(char *buf, int size)
3441{
3442 return 0;
3443}
3444
2828static ssize_t 3445static ssize_t
2829tracing_read_long(struct file *filp, char __user *ubuf, 3446tracing_read_dyn_info(struct file *filp, char __user *ubuf,
2830 size_t cnt, loff_t *ppos) 3447 size_t cnt, loff_t *ppos)
2831{ 3448{
3449 static char ftrace_dyn_info_buffer[1024];
3450 static DEFINE_MUTEX(dyn_info_mutex);
2832 unsigned long *p = filp->private_data; 3451 unsigned long *p = filp->private_data;
2833 char buf[64]; 3452 char *buf = ftrace_dyn_info_buffer;
3453 int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
2834 int r; 3454 int r;
2835 3455
2836 r = sprintf(buf, "%ld\n", *p); 3456 mutex_lock(&dyn_info_mutex);
3457 r = sprintf(buf, "%ld ", *p);
2837 3458
2838 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3459 r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
3460 buf[r++] = '\n';
3461
3462 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3463
3464 mutex_unlock(&dyn_info_mutex);
3465
3466 return r;
2839} 3467}
2840 3468
2841static struct file_operations tracing_read_long_fops = { 3469static struct file_operations tracing_dyn_info_fops = {
2842 .open = tracing_open_generic, 3470 .open = tracing_open_generic,
2843 .read = tracing_read_long, 3471 .read = tracing_read_dyn_info,
2844}; 3472};
2845#endif 3473#endif
2846 3474
@@ -2881,10 +3509,10 @@ static __init int tracer_init_debugfs(void)
2881 if (!entry) 3509 if (!entry)
2882 pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); 3510 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
2883 3511
2884 entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, 3512 entry = debugfs_create_file("trace_options", 0644, d_tracer,
2885 NULL, &tracing_iter_fops); 3513 NULL, &tracing_iter_fops);
2886 if (!entry) 3514 if (!entry)
2887 pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); 3515 pr_warning("Could not create debugfs 'trace_options' entry\n");
2888 3516
2889 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 3517 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
2890 NULL, &tracing_cpumask_fops); 3518 NULL, &tracing_cpumask_fops);
@@ -2934,11 +3562,11 @@ static __init int tracer_init_debugfs(void)
2934 pr_warning("Could not create debugfs " 3562 pr_warning("Could not create debugfs "
2935 "'trace_pipe' entry\n"); 3563 "'trace_pipe' entry\n");
2936 3564
2937 entry = debugfs_create_file("trace_entries", 0644, d_tracer, 3565 entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
2938 &global_trace, &tracing_entries_fops); 3566 &global_trace, &tracing_entries_fops);
2939 if (!entry) 3567 if (!entry)
2940 pr_warning("Could not create debugfs " 3568 pr_warning("Could not create debugfs "
2941 "'trace_entries' entry\n"); 3569 "'buffer_size_kb' entry\n");
2942 3570
2943 entry = debugfs_create_file("trace_marker", 0220, d_tracer, 3571 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
2944 NULL, &tracing_mark_fops); 3572 NULL, &tracing_mark_fops);
@@ -2949,7 +3577,7 @@ static __init int tracer_init_debugfs(void)
2949#ifdef CONFIG_DYNAMIC_FTRACE 3577#ifdef CONFIG_DYNAMIC_FTRACE
2950 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 3578 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
2951 &ftrace_update_tot_cnt, 3579 &ftrace_update_tot_cnt,
2952 &tracing_read_long_fops); 3580 &tracing_dyn_info_fops);
2953 if (!entry) 3581 if (!entry)
2954 pr_warning("Could not create debugfs " 3582 pr_warning("Could not create debugfs "
2955 "'dyn_ftrace_total_info' entry\n"); 3583 "'dyn_ftrace_total_info' entry\n");
@@ -2960,7 +3588,7 @@ static __init int tracer_init_debugfs(void)
2960 return 0; 3588 return 0;
2961} 3589}
2962 3590
2963int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 3591int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
2964{ 3592{
2965 static DEFINE_SPINLOCK(trace_buf_lock); 3593 static DEFINE_SPINLOCK(trace_buf_lock);
2966 static char trace_buf[TRACE_BUF_SIZE]; 3594 static char trace_buf[TRACE_BUF_SIZE];
@@ -2968,11 +3596,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2968 struct ring_buffer_event *event; 3596 struct ring_buffer_event *event;
2969 struct trace_array *tr = &global_trace; 3597 struct trace_array *tr = &global_trace;
2970 struct trace_array_cpu *data; 3598 struct trace_array_cpu *data;
2971 struct print_entry *entry;
2972 unsigned long flags, irq_flags;
2973 int cpu, len = 0, size, pc; 3599 int cpu, len = 0, size, pc;
3600 struct print_entry *entry;
3601 unsigned long irq_flags;
2974 3602
2975 if (!tr->ctrl || tracing_disabled) 3603 if (tracing_disabled || tracing_selftest_running)
2976 return 0; 3604 return 0;
2977 3605
2978 pc = preempt_count(); 3606 pc = preempt_count();
@@ -2983,7 +3611,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2983 if (unlikely(atomic_read(&data->disabled))) 3611 if (unlikely(atomic_read(&data->disabled)))
2984 goto out; 3612 goto out;
2985 3613
2986 spin_lock_irqsave(&trace_buf_lock, flags); 3614 pause_graph_tracing();
3615 spin_lock_irqsave(&trace_buf_lock, irq_flags);
2987 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 3616 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
2988 3617
2989 len = min(len, TRACE_BUF_SIZE-1); 3618 len = min(len, TRACE_BUF_SIZE-1);
@@ -2994,17 +3623,18 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2994 if (!event) 3623 if (!event)
2995 goto out_unlock; 3624 goto out_unlock;
2996 entry = ring_buffer_event_data(event); 3625 entry = ring_buffer_event_data(event);
2997 tracing_generic_entry_update(&entry->ent, flags, pc); 3626 tracing_generic_entry_update(&entry->ent, irq_flags, pc);
2998 entry->ent.type = TRACE_PRINT; 3627 entry->ent.type = TRACE_PRINT;
2999 entry->ip = ip; 3628 entry->ip = ip;
3629 entry->depth = depth;
3000 3630
3001 memcpy(&entry->buf, trace_buf, len); 3631 memcpy(&entry->buf, trace_buf, len);
3002 entry->buf[len] = 0; 3632 entry->buf[len] = 0;
3003 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 3633 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
3004 3634
3005 out_unlock: 3635 out_unlock:
3006 spin_unlock_irqrestore(&trace_buf_lock, flags); 3636 spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
3007 3637 unpause_graph_tracing();
3008 out: 3638 out:
3009 preempt_enable_notrace(); 3639 preempt_enable_notrace();
3010 3640
@@ -3021,7 +3651,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
3021 return 0; 3651 return 0;
3022 3652
3023 va_start(ap, fmt); 3653 va_start(ap, fmt);
3024 ret = trace_vprintk(ip, fmt, ap); 3654 ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
3025 va_end(ap); 3655 va_end(ap);
3026 return ret; 3656 return ret;
3027} 3657}
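__ftrace_printk() now records the caller's function-graph depth via task_curr_ret_stack() so the graph output can indent the message at the right call level. A hedged sketch of a typical call site through the ftrace_printk() wrapper (the interrupt handler is hypothetical):

#include <linux/ftrace.h>
#include <linux/interrupt.h>

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	/* ends up in the ring buffer as a TRACE_PRINT entry, depth included */
	ftrace_printk("handled irq %d\n", irq);
	return IRQ_HANDLED;
}
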
@@ -3030,7 +3660,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk);
3030static int trace_panic_handler(struct notifier_block *this, 3660static int trace_panic_handler(struct notifier_block *this,
3031 unsigned long event, void *unused) 3661 unsigned long event, void *unused)
3032{ 3662{
3033 ftrace_dump(); 3663 if (ftrace_dump_on_oops)
3664 ftrace_dump();
3034 return NOTIFY_OK; 3665 return NOTIFY_OK;
3035} 3666}
3036 3667
@@ -3046,7 +3677,8 @@ static int trace_die_handler(struct notifier_block *self,
3046{ 3677{
3047 switch (val) { 3678 switch (val) {
3048 case DIE_OOPS: 3679 case DIE_OOPS:
3049 ftrace_dump(); 3680 if (ftrace_dump_on_oops)
3681 ftrace_dump();
3050 break; 3682 break;
3051 default: 3683 default:
3052 break; 3684 break;
@@ -3087,7 +3719,6 @@ trace_printk_seq(struct trace_seq *s)
3087 trace_seq_reset(s); 3719 trace_seq_reset(s);
3088} 3720}
3089 3721
3090
3091void ftrace_dump(void) 3722void ftrace_dump(void)
3092{ 3723{
3093 static DEFINE_SPINLOCK(ftrace_dump_lock); 3724 static DEFINE_SPINLOCK(ftrace_dump_lock);
@@ -3112,6 +3743,9 @@ void ftrace_dump(void)
3112 atomic_inc(&global_trace.data[cpu]->disabled); 3743 atomic_inc(&global_trace.data[cpu]->disabled);
3113 } 3744 }
3114 3745
3746 /* don't look at user memory in panic mode */
3747 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
3748
3115 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 3749 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3116 3750
3117 iter.tr = &global_trace; 3751 iter.tr = &global_trace;
@@ -3205,7 +3839,6 @@ __init static int tracer_alloc_buffers(void)
3205#endif 3839#endif
3206 3840
3207 /* All seems OK, enable tracing */ 3841 /* All seems OK, enable tracing */
3208 global_trace.ctrl = tracer_enabled;
3209 tracing_disabled = 0; 3842 tracing_disabled = 0;
3210 3843
3211 atomic_notifier_chain_register(&panic_notifier_list, 3844 atomic_notifier_chain_register(&panic_notifier_list,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8465ad052707..5ac697065a48 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h>
11 12
12enum trace_type { 13enum trace_type {
13 __TRACE_FIRST_TYPE = 0, 14 __TRACE_FIRST_TYPE = 0,
@@ -21,7 +22,14 @@ enum trace_type {
21 TRACE_SPECIAL, 22 TRACE_SPECIAL,
22 TRACE_MMIO_RW, 23 TRACE_MMIO_RW,
23 TRACE_MMIO_MAP, 24 TRACE_MMIO_MAP,
24 TRACE_BOOT, 25 TRACE_BRANCH,
26 TRACE_BOOT_CALL,
27 TRACE_BOOT_RET,
28 TRACE_GRAPH_RET,
29 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK,
31 TRACE_BTS,
32 TRACE_POWER,
25 33
26 __TRACE_LAST_TYPE 34 __TRACE_LAST_TYPE
27}; 35};
@@ -38,6 +46,7 @@ struct trace_entry {
38 unsigned char flags; 46 unsigned char flags;
39 unsigned char preempt_count; 47 unsigned char preempt_count;
40 int pid; 48 int pid;
49 int tgid;
41}; 50};
42 51
43/* 52/*
@@ -48,6 +57,18 @@ struct ftrace_entry {
48 unsigned long ip; 57 unsigned long ip;
49 unsigned long parent_ip; 58 unsigned long parent_ip;
50}; 59};
60
61/* Function call entry */
62struct ftrace_graph_ent_entry {
63 struct trace_entry ent;
64 struct ftrace_graph_ent graph_ent;
65};
66
67/* Function return entry */
68struct ftrace_graph_ret_entry {
69 struct trace_entry ent;
70 struct ftrace_graph_ret ret;
71};
51extern struct tracer boot_tracer; 72extern struct tracer boot_tracer;
52 73
53/* 74/*
@@ -85,12 +106,18 @@ struct stack_entry {
85 unsigned long caller[FTRACE_STACK_ENTRIES]; 106 unsigned long caller[FTRACE_STACK_ENTRIES];
86}; 107};
87 108
109struct userstack_entry {
110 struct trace_entry ent;
111 unsigned long caller[FTRACE_STACK_ENTRIES];
112};
113
88/* 114/*
89 * ftrace_printk entry: 115 * ftrace_printk entry:
90 */ 116 */
91struct print_entry { 117struct print_entry {
92 struct trace_entry ent; 118 struct trace_entry ent;
93 unsigned long ip; 119 unsigned long ip;
120 int depth;
94 char buf[]; 121 char buf[];
95}; 122};
96 123
@@ -112,9 +139,35 @@ struct trace_mmiotrace_map {
112 struct mmiotrace_map map; 139 struct mmiotrace_map map;
113}; 140};
114 141
115struct trace_boot { 142struct trace_boot_call {
116 struct trace_entry ent; 143 struct trace_entry ent;
117 struct boot_trace initcall; 144 struct boot_trace_call boot_call;
145};
146
147struct trace_boot_ret {
148 struct trace_entry ent;
149 struct boot_trace_ret boot_ret;
150};
151
152#define TRACE_FUNC_SIZE 30
153#define TRACE_FILE_SIZE 20
154struct trace_branch {
155 struct trace_entry ent;
156 unsigned line;
157 char func[TRACE_FUNC_SIZE+1];
158 char file[TRACE_FILE_SIZE+1];
159 char correct;
160};
161
162struct bts_entry {
163 struct trace_entry ent;
164 unsigned long from;
165 unsigned long to;
166};
167
168struct trace_power {
169 struct trace_entry ent;
170 struct power_trace state_data;
118}; 171};
119 172
120/* 173/*
@@ -172,7 +225,6 @@ struct trace_iterator;
172struct trace_array { 225struct trace_array {
173 struct ring_buffer *buffer; 226 struct ring_buffer *buffer;
174 unsigned long entries; 227 unsigned long entries;
175 long ctrl;
176 int cpu; 228 int cpu;
177 cycle_t time_start; 229 cycle_t time_start;
178 struct task_struct *waiter; 230 struct task_struct *waiter;
@@ -212,13 +264,22 @@ extern void __ftrace_bad_type(void);
212 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ 264 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
213 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ 265 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
214 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ 266 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
267 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
215 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 268 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
216 IF_ASSIGN(var, ent, struct special_entry, 0); \ 269 IF_ASSIGN(var, ent, struct special_entry, 0); \
217 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 270 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
218 TRACE_MMIO_RW); \ 271 TRACE_MMIO_RW); \
219 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 272 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
220 TRACE_MMIO_MAP); \ 273 TRACE_MMIO_MAP); \
221 IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ 274 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
275 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
276 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
277 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
278 TRACE_GRAPH_ENT); \
279 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
280 TRACE_GRAPH_RET); \
281 IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
282 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
222 __ftrace_bad_type(); \ 283 __ftrace_bad_type(); \
223 } while (0) 284 } while (0)
224 285
@@ -229,29 +290,56 @@ enum print_line_t {
229 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ 290 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
230}; 291};
231 292
293
294/*
295 * An option specific to a tracer. This is a boolean value.
296 * The bit is the bit mask that selects this option's value in
297 * the flags val of struct tracer_flags.
298 */
299struct tracer_opt {
300 const char *name; /* Will appear on the trace_options file */
301 u32 bit; /* Mask assigned in val field in tracer_flags */
302};
303
304/*
305 * The set of specific options for a tracer. Your tracer
306 * has to set the initial value of the flags val.
307 */
308struct tracer_flags {
309 u32 val;
310 struct tracer_opt *opts;
311};
312
313/* Makes it easier to define a tracer opt */
314#define TRACER_OPT(s, b) .name = #s, .bit = b
315
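A hedged sketch of how a tracer might fill these structures in with the TRACER_OPT() helper; the "foo" tracer, its option names, and the bit values are all hypothetical:

#define TRACE_FOO_VERBOSE	0x1
#define TRACE_FOO_RAW		0x2

static struct tracer_opt foo_opts[] = {
	{ TRACER_OPT(foo-verbose, TRACE_FOO_VERBOSE) },
	{ TRACER_OPT(foo-raw, TRACE_FOO_RAW) },
	{ }	/* terminating entry: set_tracer_option() stops at a NULL name */
};

static struct tracer_flags foo_flags = {
	.val	= TRACE_FOO_VERBOSE,	/* initial value of the flags, as required above */
	.opts	= foo_opts,
};

/* called from set_tracer_option(); returning 0 lets the core flip the bit */
static int foo_set_flag(u32 old_flags, u32 bit, int set)
{
	return 0;
}
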
232/* 316/*
233 * A specific tracer, represented by methods that operate on a trace array: 317 * A specific tracer, represented by methods that operate on a trace array:
234 */ 318 */
235struct tracer { 319struct tracer {
236 const char *name; 320 const char *name;
237 void (*init)(struct trace_array *tr); 321 /* Your tracer should raise a warning if init fails */
322 int (*init)(struct trace_array *tr);
238 void (*reset)(struct trace_array *tr); 323 void (*reset)(struct trace_array *tr);
324 void (*start)(struct trace_array *tr);
325 void (*stop)(struct trace_array *tr);
239 void (*open)(struct trace_iterator *iter); 326 void (*open)(struct trace_iterator *iter);
240 void (*pipe_open)(struct trace_iterator *iter); 327 void (*pipe_open)(struct trace_iterator *iter);
241 void (*close)(struct trace_iterator *iter); 328 void (*close)(struct trace_iterator *iter);
242 void (*start)(struct trace_iterator *iter);
243 void (*stop)(struct trace_iterator *iter);
244 ssize_t (*read)(struct trace_iterator *iter, 329 ssize_t (*read)(struct trace_iterator *iter,
245 struct file *filp, char __user *ubuf, 330 struct file *filp, char __user *ubuf,
246 size_t cnt, loff_t *ppos); 331 size_t cnt, loff_t *ppos);
247 void (*ctrl_update)(struct trace_array *tr);
248#ifdef CONFIG_FTRACE_STARTUP_TEST 332#ifdef CONFIG_FTRACE_STARTUP_TEST
249 int (*selftest)(struct tracer *trace, 333 int (*selftest)(struct tracer *trace,
250 struct trace_array *tr); 334 struct trace_array *tr);
251#endif 335#endif
336 void (*print_header)(struct seq_file *m);
252 enum print_line_t (*print_line)(struct trace_iterator *iter); 337 enum print_line_t (*print_line)(struct trace_iterator *iter);
338 /* If you handled the flag setting, return 0 */
339 int (*set_flag)(u32 old_flags, u32 bit, int set);
253 struct tracer *next; 340 struct tracer *next;
254 int print_max; 341 int print_max;
342 struct tracer_flags *flags;
255}; 343};
256 344
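For orientation, a sketch of the shape a tracer now takes with the int-returning init and the optional set_flag hook; everything here is illustrative and not part of the patch:

/* Illustrative only: skeleton of a tracer using the updated callbacks */
static int example_trace_init(struct trace_array *tr)
{
	/* init now reports failure instead of returning void */
	return 0;
}

static void example_trace_reset(struct trace_array *tr)
{
}

/* Return 0 when the flag change was handled */
static int example_set_flag(u32 old_flags, u32 bit, int set)
{
	return 0;
}

static struct tracer example_tracer __read_mostly = {
	.name		= "example",
	.init		= example_trace_init,
	.reset		= example_trace_reset,
	.set_flag	= example_set_flag,
	.flags		= &example_flags,	/* from the sketch above */
};

Such a tracer would be registered with register_tracer(&example_tracer) from an initcall, as the branch and bts tracers below do.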
257struct trace_seq { 345struct trace_seq {
@@ -279,8 +367,11 @@ struct trace_iterator {
279 unsigned long iter_flags; 367 unsigned long iter_flags;
280 loff_t pos; 368 loff_t pos;
281 long idx; 369 long idx;
370
371 cpumask_t started;
282}; 372};
283 373
374int tracing_is_enabled(void);
284void trace_wake_up(void); 375void trace_wake_up(void);
285void tracing_reset(struct trace_array *tr, int cpu); 376void tracing_reset(struct trace_array *tr, int cpu);
286int tracing_open_generic(struct inode *inode, struct file *filp); 377int tracing_open_generic(struct inode *inode, struct file *filp);
@@ -321,8 +412,17 @@ void trace_function(struct trace_array *tr,
321 unsigned long parent_ip, 412 unsigned long parent_ip,
322 unsigned long flags, int pc); 413 unsigned long flags, int pc);
323 414
415void trace_graph_return(struct ftrace_graph_ret *trace);
416int trace_graph_entry(struct ftrace_graph_ent *trace);
417void trace_bts(struct trace_array *tr,
418 unsigned long from,
419 unsigned long to);
420
324void tracing_start_cmdline_record(void); 421void tracing_start_cmdline_record(void);
325void tracing_stop_cmdline_record(void); 422void tracing_stop_cmdline_record(void);
423void tracing_sched_switch_assign_trace(struct trace_array *tr);
424void tracing_stop_sched_switch_record(void);
425void tracing_start_sched_switch_record(void);
326int register_tracer(struct tracer *type); 426int register_tracer(struct tracer *type);
327void unregister_tracer(struct tracer *type); 427void unregister_tracer(struct tracer *type);
328 428
@@ -358,6 +458,7 @@ struct tracer_switch_ops {
358 struct tracer_switch_ops *next; 458 struct tracer_switch_ops *next;
359}; 459};
360 460
461char *trace_find_cmdline(int pid);
361#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 462#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
362 463
363#ifdef CONFIG_DYNAMIC_FTRACE 464#ifdef CONFIG_DYNAMIC_FTRACE
@@ -383,19 +484,79 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
383 struct trace_array *tr); 484 struct trace_array *tr);
384extern int trace_selftest_startup_sysprof(struct tracer *trace, 485extern int trace_selftest_startup_sysprof(struct tracer *trace,
385 struct trace_array *tr); 486 struct trace_array *tr);
487extern int trace_selftest_startup_branch(struct tracer *trace,
488 struct trace_array *tr);
386#endif /* CONFIG_FTRACE_STARTUP_TEST */ 489#endif /* CONFIG_FTRACE_STARTUP_TEST */
387 490
388extern void *head_page(struct trace_array_cpu *data); 491extern void *head_page(struct trace_array_cpu *data);
389extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); 492extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
390extern void trace_seq_print_cont(struct trace_seq *s, 493extern void trace_seq_print_cont(struct trace_seq *s,
391 struct trace_iterator *iter); 494 struct trace_iterator *iter);
495
496extern int
497seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
498 unsigned long sym_flags);
392extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, 499extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
393 size_t cnt); 500 size_t cnt);
394extern long ns2usecs(cycle_t nsec); 501extern long ns2usecs(cycle_t nsec);
395extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); 502extern int
503trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
396 504
397extern unsigned long trace_flags; 505extern unsigned long trace_flags;
398 506
 507/* Standard output formatting function used for function graph traces */
508#ifdef CONFIG_FUNCTION_GRAPH_TRACER
509extern enum print_line_t print_graph_function(struct trace_iterator *iter);
510
511#ifdef CONFIG_DYNAMIC_FTRACE
512/* TODO: make this variable */
513#define FTRACE_GRAPH_MAX_FUNCS 32
514extern int ftrace_graph_count;
515extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
516
517static inline int ftrace_graph_addr(unsigned long addr)
518{
519 int i;
520
521 if (!ftrace_graph_count || test_tsk_trace_graph(current))
522 return 1;
523
524 for (i = 0; i < ftrace_graph_count; i++) {
525 if (addr == ftrace_graph_funcs[i])
526 return 1;
527 }
528
529 return 0;
530}
531#else
532static inline int ftrace_trace_addr(unsigned long addr)
533{
534 return 1;
535}
536static inline int ftrace_graph_addr(unsigned long addr)
537{
538 return 1;
539}
540#endif /* CONFIG_DYNAMIC_FTRACE */
541
542#else /* CONFIG_FUNCTION_GRAPH_TRACER */
543static inline enum print_line_t
544print_graph_function(struct trace_iterator *iter)
545{
546 return TRACE_TYPE_UNHANDLED;
547}
548#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
549
550extern struct pid *ftrace_pid_trace;
551
552static inline int ftrace_trace_task(struct task_struct *task)
553{
554 if (!ftrace_pid_trace)
555 return 1;
556
557 return test_tsk_trace_trace(task);
558}
559
399/* 560/*
400 * trace_iterator_flags is an enumeration that defines bit 561 * trace_iterator_flags is an enumeration that defines bit
401 * positions into trace_flags that controls the output. 562 * positions into trace_flags that controls the output.
@@ -415,8 +576,92 @@ enum trace_iterator_flags {
415 TRACE_ITER_STACKTRACE = 0x100, 576 TRACE_ITER_STACKTRACE = 0x100,
416 TRACE_ITER_SCHED_TREE = 0x200, 577 TRACE_ITER_SCHED_TREE = 0x200,
417 TRACE_ITER_PRINTK = 0x400, 578 TRACE_ITER_PRINTK = 0x400,
579 TRACE_ITER_PREEMPTONLY = 0x800,
580 TRACE_ITER_BRANCH = 0x1000,
581 TRACE_ITER_ANNOTATE = 0x2000,
582 TRACE_ITER_USERSTACKTRACE = 0x4000,
583 TRACE_ITER_SYM_USEROBJ = 0x8000
418}; 584};
419 585
586/*
587 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
588 * control the output of kernel symbols.
589 */
590#define TRACE_ITER_SYM_MASK \
591 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
592
420extern struct tracer nop_trace; 593extern struct tracer nop_trace;
421 594
595/**
596 * ftrace_preempt_disable - disable preemption scheduler safe
597 *
 598 * When tracing can happen inside the scheduler, there exist
 599 * cases where the tracing might happen before the need_resched
600 * flag is checked. If this happens and the tracer calls
601 * preempt_enable (after a disable), a schedule might take place
602 * causing an infinite recursion.
603 *
 604 * To prevent this, we read the need_resched flag before
605 * disabling preemption. When we want to enable preemption we
 606 * check the flag; if it is set, we call preempt_enable_no_resched.
607 * Otherwise, we call preempt_enable.
608 *
 609 * The rationale for doing the above is that if need_resched is set
610 * and we have yet to reschedule, we are either in an atomic location
611 * (where we do not need to check for scheduling) or we are inside
612 * the scheduler and do not want to resched.
613 */
614static inline int ftrace_preempt_disable(void)
615{
616 int resched;
617
618 resched = need_resched();
619 preempt_disable_notrace();
620
621 return resched;
622}
623
624/**
625 * ftrace_preempt_enable - enable preemption scheduler safe
626 * @resched: the return value from ftrace_preempt_disable
627 *
628 * This is a scheduler safe way to enable preemption and not miss
 629 * any preemption checks. The earlier disable saved the preemption state.
630 * If resched is set, then we were either inside an atomic or
631 * are inside the scheduler (we would have already scheduled
632 * otherwise). In this case, we do not want to call normal
633 * preempt_enable, but preempt_enable_no_resched instead.
634 */
635static inline void ftrace_preempt_enable(int resched)
636{
637 if (resched)
638 preempt_enable_no_resched_notrace();
639 else
640 preempt_enable_notrace();
641}
642
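A hypothetical caller, just to show the intended pairing of the two helpers (the function name is illustrative):

/* Illustrative only: a tracing callback that must not trigger a schedule */
static void example_record(unsigned long ip)
{
	int resched;

	resched = ftrace_preempt_disable();
	/* ... write the event into the ring buffer ... */
	ftrace_preempt_enable(resched);
}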
643#ifdef CONFIG_BRANCH_TRACER
644extern int enable_branch_tracing(struct trace_array *tr);
645extern void disable_branch_tracing(void);
646static inline int trace_branch_enable(struct trace_array *tr)
647{
648 if (trace_flags & TRACE_ITER_BRANCH)
649 return enable_branch_tracing(tr);
650 return 0;
651}
652static inline void trace_branch_disable(void)
653{
654 /* due to races, always disable */
655 disable_branch_tracing();
656}
657#else
658static inline int trace_branch_enable(struct trace_array *tr)
659{
660 return 0;
661}
662static inline void trace_branch_disable(void)
663{
664}
665#endif /* CONFIG_BRANCH_TRACER */
666
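For reference, the way a caller would be expected to use the wrappers above (illustrative, not part of the patch):

/* Illustrative only: tie branch tracing to a tracer's start/stop path */
static int example_start(struct trace_array *tr)
{
	/* no-op unless the TRACE_ITER_BRANCH option is set */
	return trace_branch_enable(tr);
}

static void example_stop(struct trace_array *tr)
{
	/* always safe: disable_branch_tracing() checks its own enable count */
	trace_branch_disable();
}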
422#endif /* _LINUX_KERNEL_TRACE_H */ 667#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d0a5e50eeff2..a4fa2c57e34e 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -13,73 +13,117 @@
13#include "trace.h" 13#include "trace.h"
14 14
15static struct trace_array *boot_trace; 15static struct trace_array *boot_trace;
16static int trace_boot_enabled; 16static bool pre_initcalls_finished;
17 17
18 18/* Tells the boot tracer that the pre_smp_initcalls are finished.
19/* Should be started after do_pre_smp_initcalls() in init/main.c */ 19 * So we are ready.
 20 * It doesn't enable sched events tracing, however;
 21 * you have to call enable_boot_trace() to do so.
22 */
20void start_boot_trace(void) 23void start_boot_trace(void)
21{ 24{
22 trace_boot_enabled = 1; 25 pre_initcalls_finished = true;
23} 26}
24 27
25void stop_boot_trace(void) 28void enable_boot_trace(void)
26{ 29{
27 trace_boot_enabled = 0; 30 if (pre_initcalls_finished)
31 tracing_start_sched_switch_record();
28} 32}
29 33
30void reset_boot_trace(struct trace_array *tr) 34void disable_boot_trace(void)
31{ 35{
32 stop_boot_trace(); 36 if (pre_initcalls_finished)
37 tracing_stop_sched_switch_record();
33} 38}
34 39
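The callers live in init/main.c and are not part of this hunk; a rough sketch of the intended ordering, inferred from the comments above (fn and ret stand for an initcall_t and its int result):

/* Illustrative only: expected call order in the init code */
do_pre_smp_initcalls();
start_boot_trace();		/* pre-SMP initcalls are finished */

/* then, around each remaining initcall: */
enable_boot_trace();		/* start recording sched switches */
ret = do_one_initcall(fn);
disable_boot_trace();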
35static void boot_trace_init(struct trace_array *tr) 40static void reset_boot_trace(struct trace_array *tr)
36{ 41{
37 int cpu; 42 int cpu;
38 boot_trace = tr;
39 43
40 trace_boot_enabled = 0; 44 tr->time_start = ftrace_now(tr->cpu);
45
46 for_each_online_cpu(cpu)
47 tracing_reset(tr, cpu);
48}
49
50static int boot_trace_init(struct trace_array *tr)
51{
52 int cpu;
53 boot_trace = tr;
41 54
42 for_each_cpu_mask(cpu, cpu_possible_map) 55 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu); 56 tracing_reset(tr, cpu);
57
58 tracing_sched_switch_assign_trace(tr);
59 return 0;
44} 60}
45 61
46static void boot_trace_ctrl_update(struct trace_array *tr) 62static enum print_line_t
63initcall_call_print_line(struct trace_iterator *iter)
47{ 64{
48 if (tr->ctrl) 65 struct trace_entry *entry = iter->ent;
49 start_boot_trace(); 66 struct trace_seq *s = &iter->seq;
67 struct trace_boot_call *field;
68 struct boot_trace_call *call;
69 u64 ts;
70 unsigned long nsec_rem;
71 int ret;
72
73 trace_assign_type(field, entry);
74 call = &field->boot_call;
75 ts = iter->ts;
76 nsec_rem = do_div(ts, 1000000000);
77
78 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
79 (unsigned long)ts, nsec_rem, call->func, call->caller);
80
81 if (!ret)
82 return TRACE_TYPE_PARTIAL_LINE;
50 else 83 else
51 stop_boot_trace(); 84 return TRACE_TYPE_HANDLED;
52} 85}
53 86
54static enum print_line_t initcall_print_line(struct trace_iterator *iter) 87static enum print_line_t
88initcall_ret_print_line(struct trace_iterator *iter)
55{ 89{
56 int ret;
57 struct trace_entry *entry = iter->ent; 90 struct trace_entry *entry = iter->ent;
58 struct trace_boot *field = (struct trace_boot *)entry;
59 struct boot_trace *it = &field->initcall;
60 struct trace_seq *s = &iter->seq; 91 struct trace_seq *s = &iter->seq;
61 struct timespec calltime = ktime_to_timespec(it->calltime); 92 struct trace_boot_ret *field;
62 struct timespec rettime = ktime_to_timespec(it->rettime); 93 struct boot_trace_ret *init_ret;
63 94 u64 ts;
64 if (entry->type == TRACE_BOOT) { 95 unsigned long nsec_rem;
65 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", 96 int ret;
66 calltime.tv_sec, 97
67 calltime.tv_nsec, 98 trace_assign_type(field, entry);
68 it->func, it->caller); 99 init_ret = &field->boot_ret;
69 if (!ret) 100 ts = iter->ts;
70 return TRACE_TYPE_PARTIAL_LINE; 101 nsec_rem = do_div(ts, 1000000000);
71 102
72 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " 103 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
73 "returned %d after %lld msecs\n", 104 "returned %d after %llu msecs\n",
74 rettime.tv_sec, 105 (unsigned long) ts,
75 rettime.tv_nsec, 106 nsec_rem,
76 it->func, it->result, it->duration); 107 init_ret->func, init_ret->result, init_ret->duration);
77 108
78 if (!ret) 109 if (!ret)
79 return TRACE_TYPE_PARTIAL_LINE; 110 return TRACE_TYPE_PARTIAL_LINE;
111 else
80 return TRACE_TYPE_HANDLED; 112 return TRACE_TYPE_HANDLED;
113}
114
115static enum print_line_t initcall_print_line(struct trace_iterator *iter)
116{
117 struct trace_entry *entry = iter->ent;
118
119 switch (entry->type) {
120 case TRACE_BOOT_CALL:
121 return initcall_call_print_line(iter);
122 case TRACE_BOOT_RET:
123 return initcall_ret_print_line(iter);
124 default:
125 return TRACE_TYPE_UNHANDLED;
81 } 126 }
82 return TRACE_TYPE_UNHANDLED;
83} 127}
84 128
85struct tracer boot_tracer __read_mostly = 129struct tracer boot_tracer __read_mostly =
@@ -87,27 +131,53 @@ struct tracer boot_tracer __read_mostly =
87 .name = "initcall", 131 .name = "initcall",
88 .init = boot_trace_init, 132 .init = boot_trace_init,
89 .reset = reset_boot_trace, 133 .reset = reset_boot_trace,
90 .ctrl_update = boot_trace_ctrl_update,
91 .print_line = initcall_print_line, 134 .print_line = initcall_print_line,
92}; 135};
93 136
94void trace_boot(struct boot_trace *it, initcall_t fn) 137void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
95{ 138{
96 struct ring_buffer_event *event; 139 struct ring_buffer_event *event;
97 struct trace_boot *entry; 140 struct trace_boot_call *entry;
98 struct trace_array_cpu *data;
99 unsigned long irq_flags; 141 unsigned long irq_flags;
100 struct trace_array *tr = boot_trace; 142 struct trace_array *tr = boot_trace;
101 143
102 if (!trace_boot_enabled) 144 if (!pre_initcalls_finished)
103 return; 145 return;
104 146
105 /* Get its name now since this function could 147 /* Get its name now since this function could
106 * disappear because it is in the .init section. 148 * disappear because it is in the .init section.
107 */ 149 */
108 sprint_symbol(it->func, (unsigned long)fn); 150 sprint_symbol(bt->func, (unsigned long)fn);
151 preempt_disable();
152
153 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
154 &irq_flags);
155 if (!event)
156 goto out;
157 entry = ring_buffer_event_data(event);
158 tracing_generic_entry_update(&entry->ent, 0, 0);
159 entry->ent.type = TRACE_BOOT_CALL;
160 entry->boot_call = *bt;
161 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
162
163 trace_wake_up();
164
165 out:
166 preempt_enable();
167}
168
169void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
170{
171 struct ring_buffer_event *event;
172 struct trace_boot_ret *entry;
173 unsigned long irq_flags;
174 struct trace_array *tr = boot_trace;
175
176 if (!pre_initcalls_finished)
177 return;
178
179 sprint_symbol(bt->func, (unsigned long)fn);
109 preempt_disable(); 180 preempt_disable();
110 data = tr->data[smp_processor_id()];
111 181
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 182 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
113 &irq_flags); 183 &irq_flags);
@@ -115,8 +185,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
115 goto out; 185 goto out;
116 entry = ring_buffer_event_data(event); 186 entry = ring_buffer_event_data(event);
117 tracing_generic_entry_update(&entry->ent, 0, 0); 187 tracing_generic_entry_update(&entry->ent, 0, 0);
118 entry->ent.type = TRACE_BOOT; 188 entry->ent.type = TRACE_BOOT_RET;
119 entry->initcall = *it; 189 entry->boot_ret = *bt;
120 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 190 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
121 191
122 trace_wake_up(); 192 trace_wake_up();
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
new file mode 100644
index 000000000000..6c00feb3bac7
--- /dev/null
+++ b/kernel/trace/trace_branch.c
@@ -0,0 +1,342 @@
1/*
2 * unlikely profiler
3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/kallsyms.h>
7#include <linux/seq_file.h>
8#include <linux/spinlock.h>
9#include <linux/irqflags.h>
10#include <linux/debugfs.h>
11#include <linux/uaccess.h>
12#include <linux/module.h>
13#include <linux/ftrace.h>
14#include <linux/hash.h>
15#include <linux/fs.h>
16#include <asm/local.h>
17#include "trace.h"
18
19#ifdef CONFIG_BRANCH_TRACER
20
21static int branch_tracing_enabled __read_mostly;
22static DEFINE_MUTEX(branch_tracing_mutex);
23static struct trace_array *branch_tracer;
24
25static void
26probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
27{
28 struct trace_array *tr = branch_tracer;
29 struct ring_buffer_event *event;
30 struct trace_branch *entry;
31 unsigned long flags, irq_flags;
32 int cpu, pc;
33 const char *p;
34
35 /*
36 * I would love to save just the ftrace_likely_data pointer, but
37 * this code can also be used by modules. Ugly things can happen
38 * if the module is unloaded, and then we go and read the
39 * pointer. This is slower, but much safer.
40 */
41
42 if (unlikely(!tr))
43 return;
44
45 local_irq_save(flags);
46 cpu = raw_smp_processor_id();
47 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
48 goto out;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
51 &irq_flags);
52 if (!event)
53 goto out;
54
55 pc = preempt_count();
56 entry = ring_buffer_event_data(event);
57 tracing_generic_entry_update(&entry->ent, flags, pc);
58 entry->ent.type = TRACE_BRANCH;
59
60 /* Strip off the path, only save the file */
61 p = f->file + strlen(f->file);
62 while (p >= f->file && *p != '/')
63 p--;
64 p++;
65
66 strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
67 strncpy(entry->file, p, TRACE_FILE_SIZE);
68 entry->func[TRACE_FUNC_SIZE] = 0;
69 entry->file[TRACE_FILE_SIZE] = 0;
70 entry->line = f->line;
71 entry->correct = val == expect;
72
73 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
74
75 out:
76 atomic_dec(&tr->data[cpu]->disabled);
77 local_irq_restore(flags);
78}
79
80static inline
81void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
82{
83 if (!branch_tracing_enabled)
84 return;
85
86 probe_likely_condition(f, val, expect);
87}
88
89int enable_branch_tracing(struct trace_array *tr)
90{
91 int ret = 0;
92
93 mutex_lock(&branch_tracing_mutex);
94 branch_tracer = tr;
95 /*
 96 * Must be seen before enabling. The read side is a plain
 97 * condition check, so we do not need a matching rmb().
98 */
99 smp_wmb();
100 branch_tracing_enabled++;
101 mutex_unlock(&branch_tracing_mutex);
102
103 return ret;
104}
105
106void disable_branch_tracing(void)
107{
108 mutex_lock(&branch_tracing_mutex);
109
110 if (!branch_tracing_enabled)
111 goto out_unlock;
112
113 branch_tracing_enabled--;
114
115 out_unlock:
116 mutex_unlock(&branch_tracing_mutex);
117}
118
119static void start_branch_trace(struct trace_array *tr)
120{
121 enable_branch_tracing(tr);
122}
123
124static void stop_branch_trace(struct trace_array *tr)
125{
126 disable_branch_tracing();
127}
128
129static int branch_trace_init(struct trace_array *tr)
130{
131 int cpu;
132
133 for_each_online_cpu(cpu)
134 tracing_reset(tr, cpu);
135
136 start_branch_trace(tr);
137 return 0;
138}
139
140static void branch_trace_reset(struct trace_array *tr)
141{
142 stop_branch_trace(tr);
143}
144
145struct tracer branch_trace __read_mostly =
146{
147 .name = "branch",
148 .init = branch_trace_init,
149 .reset = branch_trace_reset,
150#ifdef CONFIG_FTRACE_SELFTEST
151 .selftest = trace_selftest_startup_branch,
152#endif
153};
154
155__init static int init_branch_trace(void)
156{
157 return register_tracer(&branch_trace);
158}
159
160device_initcall(init_branch_trace);
161#else
162static inline
163void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
164{
165}
166#endif /* CONFIG_BRANCH_TRACER */
167
168void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
169{
170 /*
171 * I would love to have a trace point here instead, but the
172 * trace point code is so inundated with unlikely and likely
173 * conditions that the recursive nightmare that exists is too
174 * much to try to get working. At least for now.
175 */
176 trace_likely_condition(f, val, expect);
177
178 /* FIXME: Make this atomic! */
179 if (val == expect)
180 f->correct++;
181 else
182 f->incorrect++;
183}
184EXPORT_SYMBOL(ftrace_likely_update);
185
186struct ftrace_pointer {
187 void *start;
188 void *stop;
189 int hit;
190};
191
192static void *
193t_next(struct seq_file *m, void *v, loff_t *pos)
194{
195 const struct ftrace_pointer *f = m->private;
196 struct ftrace_branch_data *p = v;
197
198 (*pos)++;
199
200 if (v == (void *)1)
201 return f->start;
202
203 ++p;
204
205 if ((void *)p >= (void *)f->stop)
206 return NULL;
207
208 return p;
209}
210
211static void *t_start(struct seq_file *m, loff_t *pos)
212{
213 void *t = (void *)1;
214 loff_t l = 0;
215
216 for (; t && l < *pos; t = t_next(m, t, &l))
217 ;
218
219 return t;
220}
221
222static void t_stop(struct seq_file *m, void *p)
223{
224}
225
226static int t_show(struct seq_file *m, void *v)
227{
228 const struct ftrace_pointer *fp = m->private;
229 struct ftrace_branch_data *p = v;
230 const char *f;
231 long percent;
232
233 if (v == (void *)1) {
234 if (fp->hit)
235 seq_printf(m, " miss hit %% ");
236 else
237 seq_printf(m, " correct incorrect %% ");
238 seq_printf(m, " Function "
239 " File Line\n"
240 " ------- --------- - "
241 " -------- "
242 " ---- ----\n");
243 return 0;
244 }
245
246 /* Only print the file, not the path */
247 f = p->file + strlen(p->file);
248 while (f >= p->file && *f != '/')
249 f--;
250 f++;
251
252 /*
253 * The miss is overlayed on correct, and hit on incorrect.
254 */
255 if (p->correct) {
256 percent = p->incorrect * 100;
257 percent /= p->correct + p->incorrect;
258 } else
259 percent = p->incorrect ? 100 : -1;
260
261 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
262 if (percent < 0)
263 seq_printf(m, " X ");
264 else
265 seq_printf(m, "%3ld ", percent);
266 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
267 return 0;
268}
269
270static struct seq_operations tracing_likely_seq_ops = {
271 .start = t_start,
272 .next = t_next,
273 .stop = t_stop,
274 .show = t_show,
275};
276
277static int tracing_branch_open(struct inode *inode, struct file *file)
278{
279 int ret;
280
281 ret = seq_open(file, &tracing_likely_seq_ops);
282 if (!ret) {
283 struct seq_file *m = file->private_data;
284 m->private = (void *)inode->i_private;
285 }
286
287 return ret;
288}
289
290static const struct file_operations tracing_branch_fops = {
291 .open = tracing_branch_open,
292 .read = seq_read,
293 .llseek = seq_lseek,
294};
295
296#ifdef CONFIG_PROFILE_ALL_BRANCHES
297extern unsigned long __start_branch_profile[];
298extern unsigned long __stop_branch_profile[];
299
300static const struct ftrace_pointer ftrace_branch_pos = {
301 .start = __start_branch_profile,
302 .stop = __stop_branch_profile,
303 .hit = 1,
304};
305
306#endif /* CONFIG_PROFILE_ALL_BRANCHES */
307
308extern unsigned long __start_annotated_branch_profile[];
309extern unsigned long __stop_annotated_branch_profile[];
310
311static const struct ftrace_pointer ftrace_annotated_branch_pos = {
312 .start = __start_annotated_branch_profile,
313 .stop = __stop_annotated_branch_profile,
314};
315
316static __init int ftrace_branch_init(void)
317{
318 struct dentry *d_tracer;
319 struct dentry *entry;
320
321 d_tracer = tracing_init_dentry();
322
323 entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
324 (void *)&ftrace_annotated_branch_pos,
325 &tracing_branch_fops);
326 if (!entry)
327 pr_warning("Could not create debugfs "
 328 "'profile_annotated_branch' entry\n");
329
330#ifdef CONFIG_PROFILE_ALL_BRANCHES
331 entry = debugfs_create_file("profile_branch", 0444, d_tracer,
332 (void *)&ftrace_branch_pos,
333 &tracing_branch_fops);
334 if (!entry)
335 pr_warning("Could not create debugfs"
336 " 'profile_branch' entry\n");
337#endif
338
339 return 0;
340}
341
342device_initcall(ftrace_branch_init);
diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c
new file mode 100644
index 000000000000..23b76e4690ef
--- /dev/null
+++ b/kernel/trace/trace_bts.c
@@ -0,0 +1,276 @@
1/*
2 * BTS tracer
3 *
4 * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12#include <linux/kallsyms.h>
13
14#include <asm/ds.h>
15
16#include "trace.h"
17
18
19#define SIZEOF_BTS (1 << 13)
20
21static DEFINE_PER_CPU(struct bts_tracer *, tracer);
22static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
23
24#define this_tracer per_cpu(tracer, smp_processor_id())
25#define this_buffer per_cpu(buffer, smp_processor_id())
26
27
28/*
29 * Information to interpret a BTS record.
30 * This will go into an in-kernel BTS interface.
31 */
32static unsigned char sizeof_field;
33static unsigned long debugctl_mask;
34
35#define sizeof_bts (3 * sizeof_field)
36
37static void bts_trace_cpuinit(struct cpuinfo_x86 *c)
38{
39 switch (c->x86) {
40 case 0x6:
41 switch (c->x86_model) {
42 case 0x0 ... 0xC:
43 break;
44 case 0xD:
45 case 0xE: /* Pentium M */
46 sizeof_field = sizeof(long);
47 debugctl_mask = (1<<6)|(1<<7);
48 break;
49 default:
50 sizeof_field = 8;
51 debugctl_mask = (1<<6)|(1<<7);
52 break;
53 }
54 break;
55 case 0xF:
56 switch (c->x86_model) {
57 case 0x0:
58 case 0x1:
59 case 0x2: /* Netburst */
60 sizeof_field = sizeof(long);
61 debugctl_mask = (1<<2)|(1<<3);
62 break;
63 default:
64 /* sorry, don't know about them */
65 break;
66 }
67 break;
68 default:
69 /* sorry, don't know about them */
70 break;
71 }
72}
73
74static inline void bts_enable(void)
75{
76 unsigned long debugctl;
77
78 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
79 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask);
80}
81
82static inline void bts_disable(void)
83{
84 unsigned long debugctl;
85
86 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
87 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask);
88}
89
90static void bts_trace_reset(struct trace_array *tr)
91{
92 int cpu;
93
94 tr->time_start = ftrace_now(tr->cpu);
95
96 for_each_online_cpu(cpu)
97 tracing_reset(tr, cpu);
98}
99
100static void bts_trace_start_cpu(void *arg)
101{
102 this_tracer =
103 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
104 /* ovfl = */ NULL, /* th = */ (size_t)-1);
105 if (IS_ERR(this_tracer)) {
106 this_tracer = NULL;
107 return;
108 }
109
110 bts_enable();
111}
112
113static void bts_trace_start(struct trace_array *tr)
114{
115 int cpu;
116
117 bts_trace_reset(tr);
118
119 for_each_cpu_mask(cpu, cpu_possible_map)
120 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
121}
122
123static void bts_trace_stop_cpu(void *arg)
124{
125 if (this_tracer) {
126 bts_disable();
127
128 ds_release_bts(this_tracer);
129 this_tracer = NULL;
130 }
131}
132
133static void bts_trace_stop(struct trace_array *tr)
134{
135 int cpu;
136
137 for_each_cpu_mask(cpu, cpu_possible_map)
138 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
139}
140
141static int bts_trace_init(struct trace_array *tr)
142{
143 bts_trace_cpuinit(&boot_cpu_data);
144 bts_trace_reset(tr);
145 bts_trace_start(tr);
146
147 return 0;
148}
149
150static void bts_trace_print_header(struct seq_file *m)
151{
152#ifdef __i386__
153 seq_puts(m, "# CPU# FROM TO FUNCTION\n");
154 seq_puts(m, "# | | | |\n");
155#else
156 seq_puts(m,
157 "# CPU# FROM TO FUNCTION\n");
158 seq_puts(m,
159 "# | | | |\n");
160#endif
161}
162
163static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
164{
165 struct trace_entry *entry = iter->ent;
166 struct trace_seq *seq = &iter->seq;
167 struct bts_entry *it;
168
169 trace_assign_type(it, entry);
170
171 if (entry->type == TRACE_BTS) {
172 int ret;
173#ifdef CONFIG_KALLSYMS
174 char function[KSYM_SYMBOL_LEN];
175 sprint_symbol(function, it->from);
176#else
177 char *function = "<unknown>";
178#endif
179
180 ret = trace_seq_printf(seq, "%4d 0x%lx -> 0x%lx [%s]\n",
181 entry->cpu, it->from, it->to, function);
182 if (!ret)
 183 return TRACE_TYPE_PARTIAL_LINE;
184 return TRACE_TYPE_HANDLED;
185 }
186 return TRACE_TYPE_UNHANDLED;
187}
188
189void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to)
190{
191 struct ring_buffer_event *event;
192 struct bts_entry *entry;
193 unsigned long irq;
194
195 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
196 if (!event)
197 return;
198 entry = ring_buffer_event_data(event);
199 tracing_generic_entry_update(&entry->ent, 0, from);
200 entry->ent.type = TRACE_BTS;
201 entry->ent.cpu = smp_processor_id();
202 entry->from = from;
203 entry->to = to;
204 ring_buffer_unlock_commit(tr->buffer, event, irq);
205}
206
207static void trace_bts_at(struct trace_array *tr, size_t index)
208{
209 const void *raw = NULL;
210 unsigned long from, to;
211 int err;
212
213 err = ds_access_bts(this_tracer, index, &raw);
214 if (err < 0)
215 return;
216
217 from = *(const unsigned long *)raw;
218 to = *(const unsigned long *)((const char *)raw + sizeof_field);
219
220 trace_bts(tr, from, to);
221}
222
223static void trace_bts_cpu(void *arg)
224{
225 struct trace_array *tr = (struct trace_array *) arg;
226 size_t index = 0, end = 0, i;
227 int err;
228
229 if (!this_tracer)
230 return;
231
232 bts_disable();
233
234 err = ds_get_bts_index(this_tracer, &index);
235 if (err < 0)
236 goto out;
237
238 err = ds_get_bts_end(this_tracer, &end);
239 if (err < 0)
240 goto out;
241
242 for (i = index; i < end; i++)
243 trace_bts_at(tr, i);
244
245 for (i = 0; i < index; i++)
246 trace_bts_at(tr, i);
247
248out:
249 bts_enable();
250}
251
252static void trace_bts_prepare(struct trace_iterator *iter)
253{
254 int cpu;
255
256 for_each_cpu_mask(cpu, cpu_possible_map)
257 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
258}
259
260struct tracer bts_tracer __read_mostly =
261{
262 .name = "bts",
263 .init = bts_trace_init,
264 .reset = bts_trace_stop,
265 .print_header = bts_trace_print_header,
266 .print_line = bts_trace_print_line,
267 .start = bts_trace_start,
268 .stop = bts_trace_stop,
269 .open = trace_bts_prepare
270};
271
272__init static int init_bts_trace(void)
273{
274 return register_tracer(&bts_tracer);
275}
276device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0f85a64003d3..e74f6d0a3216 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -42,24 +42,20 @@ static void stop_function_trace(struct trace_array *tr)
42 tracing_stop_cmdline_record(); 42 tracing_stop_cmdline_record();
43} 43}
44 44
45static void function_trace_init(struct trace_array *tr) 45static int function_trace_init(struct trace_array *tr)
46{ 46{
47 if (tr->ctrl) 47 start_function_trace(tr);
48 start_function_trace(tr); 48 return 0;
49} 49}
50 50
51static void function_trace_reset(struct trace_array *tr) 51static void function_trace_reset(struct trace_array *tr)
52{ 52{
53 if (tr->ctrl) 53 stop_function_trace(tr);
54 stop_function_trace(tr);
55} 54}
56 55
57static void function_trace_ctrl_update(struct trace_array *tr) 56static void function_trace_start(struct trace_array *tr)
58{ 57{
59 if (tr->ctrl) 58 function_reset(tr);
60 start_function_trace(tr);
61 else
62 stop_function_trace(tr);
63} 59}
64 60
65static struct tracer function_trace __read_mostly = 61static struct tracer function_trace __read_mostly =
@@ -67,7 +63,7 @@ static struct tracer function_trace __read_mostly =
67 .name = "function", 63 .name = "function",
68 .init = function_trace_init, 64 .init = function_trace_init,
69 .reset = function_trace_reset, 65 .reset = function_trace_reset,
70 .ctrl_update = function_trace_ctrl_update, 66 .start = function_trace_start,
71#ifdef CONFIG_FTRACE_SELFTEST 67#ifdef CONFIG_FTRACE_SELFTEST
72 .selftest = trace_selftest_startup_function, 68 .selftest = trace_selftest_startup_function,
73#endif 69#endif
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
new file mode 100644
index 000000000000..af60eef4cbcc
--- /dev/null
+++ b/kernel/trace/trace_functions_graph.c
@@ -0,0 +1,611 @@
1/*
2 *
3 * Function graph tracer.
4 * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 * Mostly borrowed from function tracer which
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 *
8 */
9#include <linux/debugfs.h>
10#include <linux/uaccess.h>
11#include <linux/ftrace.h>
12#include <linux/fs.h>
13
14#include "trace.h"
15
16#define TRACE_GRAPH_INDENT 2
17
18/* Flag options */
19#define TRACE_GRAPH_PRINT_OVERRUN 0x1
20#define TRACE_GRAPH_PRINT_CPU 0x2
21#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
22#define TRACE_GRAPH_PRINT_PROC 0x8
23
24static struct tracer_opt trace_opts[] = {
25 /* Display overruns ? */
26 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
27 /* Display CPU ? */
28 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
29 /* Display Overhead ? */
30 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
31 /* Display proc name/pid */
32 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
33 { } /* Empty entry */
34};
35
36static struct tracer_flags tracer_flags = {
37 /* Don't display overruns and proc by default */
38 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
39 .opts = trace_opts
40};
41
42/* pid on the last trace processed */
43static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
44
45static int graph_trace_init(struct trace_array *tr)
46{
47 int cpu, ret;
48
49 for_each_online_cpu(cpu)
50 tracing_reset(tr, cpu);
51
52 ret = register_ftrace_graph(&trace_graph_return,
53 &trace_graph_entry);
54 if (ret)
55 return ret;
56 tracing_start_cmdline_record();
57
58 return 0;
59}
60
61static void graph_trace_reset(struct trace_array *tr)
62{
63 tracing_stop_cmdline_record();
64 unregister_ftrace_graph();
65}
66
67static inline int log10_cpu(int nb)
68{
69 if (nb / 100)
70 return 3;
71 if (nb / 10)
72 return 2;
73 return 1;
74}
75
76static enum print_line_t
77print_graph_cpu(struct trace_seq *s, int cpu)
78{
79 int i;
80 int ret;
81 int log10_this = log10_cpu(cpu);
82 int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map));
83
84
85 /*
86 * Start with a space character - to make it stand out
87 * to the right a bit when trace output is pasted into
88 * email:
89 */
90 ret = trace_seq_printf(s, " ");
91
92 /*
93 * Tricky - we space the CPU field according to the max
94 * number of online CPUs. On a 2-cpu system it would take
95 * a maximum of 1 digit - on a 128 cpu system it would
96 * take up to 3 digits:
97 */
98 for (i = 0; i < log10_all - log10_this; i++) {
99 ret = trace_seq_printf(s, " ");
100 if (!ret)
101 return TRACE_TYPE_PARTIAL_LINE;
102 }
103 ret = trace_seq_printf(s, "%d) ", cpu);
104 if (!ret)
105 return TRACE_TYPE_PARTIAL_LINE;
106
107 return TRACE_TYPE_HANDLED;
108}
109
110#define TRACE_GRAPH_PROCINFO_LENGTH 14
111
112static enum print_line_t
113print_graph_proc(struct trace_seq *s, pid_t pid)
114{
115 int i;
116 int ret;
117 int len;
118 char comm[8];
119 int spaces = 0;
120 /* sign + log10(MAX_INT) + '\0' */
121 char pid_str[11];
122
123 strncpy(comm, trace_find_cmdline(pid), 7);
124 comm[7] = '\0';
125 sprintf(pid_str, "%d", pid);
126
127 /* 1 stands for the "-" character */
128 len = strlen(comm) + strlen(pid_str) + 1;
129
130 if (len < TRACE_GRAPH_PROCINFO_LENGTH)
131 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
132
133 /* First spaces to align center */
134 for (i = 0; i < spaces / 2; i++) {
135 ret = trace_seq_printf(s, " ");
136 if (!ret)
137 return TRACE_TYPE_PARTIAL_LINE;
138 }
139
140 ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
141 if (!ret)
142 return TRACE_TYPE_PARTIAL_LINE;
143
144 /* Last spaces to align center */
145 for (i = 0; i < spaces - (spaces / 2); i++) {
146 ret = trace_seq_printf(s, " ");
147 if (!ret)
148 return TRACE_TYPE_PARTIAL_LINE;
149 }
150 return TRACE_TYPE_HANDLED;
151}
152
153
154/* If the pid changed since the last trace, output this event */
155static enum print_line_t
156verif_pid(struct trace_seq *s, pid_t pid, int cpu)
157{
158 pid_t prev_pid;
159 int ret;
160
161 if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
162 return TRACE_TYPE_HANDLED;
163
164 prev_pid = last_pid[cpu];
165 last_pid[cpu] = pid;
166
167/*
168 * Context-switch trace line:
169
170 ------------------------------------------
171 | 1) migration/0--1 => sshd-1755
172 ------------------------------------------
173
174 */
175 ret = trace_seq_printf(s,
176 " ------------------------------------------\n");
177 if (!ret)
 178 return TRACE_TYPE_PARTIAL_LINE;
179
180 ret = print_graph_cpu(s, cpu);
181 if (ret == TRACE_TYPE_PARTIAL_LINE)
 182 return TRACE_TYPE_PARTIAL_LINE;
183
184 ret = print_graph_proc(s, prev_pid);
185 if (ret == TRACE_TYPE_PARTIAL_LINE)
 186 return TRACE_TYPE_PARTIAL_LINE;
187
188 ret = trace_seq_printf(s, " => ");
189 if (!ret)
 190 return TRACE_TYPE_PARTIAL_LINE;
191
192 ret = print_graph_proc(s, pid);
193 if (ret == TRACE_TYPE_PARTIAL_LINE)
 194 return TRACE_TYPE_PARTIAL_LINE;
195
196 ret = trace_seq_printf(s,
197 "\n ------------------------------------------\n\n");
198 if (!ret)
 199 return TRACE_TYPE_PARTIAL_LINE;
200
 201 return TRACE_TYPE_HANDLED;
202}
203
204static bool
205trace_branch_is_leaf(struct trace_iterator *iter,
206 struct ftrace_graph_ent_entry *curr)
207{
208 struct ring_buffer_iter *ring_iter;
209 struct ring_buffer_event *event;
210 struct ftrace_graph_ret_entry *next;
211
212 ring_iter = iter->buffer_iter[iter->cpu];
213
214 if (!ring_iter)
215 return false;
216
217 event = ring_buffer_iter_peek(ring_iter, NULL);
218
219 if (!event)
220 return false;
221
222 next = ring_buffer_event_data(event);
223
224 if (next->ent.type != TRACE_GRAPH_RET)
225 return false;
226
227 if (curr->ent.pid != next->ent.pid ||
228 curr->graph_ent.func != next->ret.func)
229 return false;
230
231 return true;
232}
233
234
235static enum print_line_t
236print_graph_duration(unsigned long long duration, struct trace_seq *s)
237{
238 unsigned long nsecs_rem = do_div(duration, 1000);
239 /* log10(ULONG_MAX) + '\0' */
240 char msecs_str[21];
241 char nsecs_str[5];
242 int ret, len;
243 int i;
244
245 sprintf(msecs_str, "%lu", (unsigned long) duration);
246
247 /* Print msecs */
248 ret = trace_seq_printf(s, msecs_str);
249 if (!ret)
250 return TRACE_TYPE_PARTIAL_LINE;
251
252 len = strlen(msecs_str);
253
 254 /* Print nsecs (we don't want to exceed 7 digits) */
255 if (len < 7) {
256 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
257 ret = trace_seq_printf(s, ".%s", nsecs_str);
258 if (!ret)
259 return TRACE_TYPE_PARTIAL_LINE;
260 len += strlen(nsecs_str);
261 }
262
263 ret = trace_seq_printf(s, " us ");
264 if (!ret)
265 return TRACE_TYPE_PARTIAL_LINE;
266
267 /* Print remaining spaces to fit the row's width */
268 for (i = len; i < 7; i++) {
269 ret = trace_seq_printf(s, " ");
270 if (!ret)
271 return TRACE_TYPE_PARTIAL_LINE;
272 }
273
274 ret = trace_seq_printf(s, "| ");
275 if (!ret)
276 return TRACE_TYPE_PARTIAL_LINE;
277 return TRACE_TYPE_HANDLED;
278
279}
280
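Worked example: a duration of 1234567 ns is split by do_div() into 1234 (the integer part printed before the dot is in microseconds, despite the msecs_str name) and a 567 ns remainder, so the column reads "1234.567 us | "; shorter values are padded with spaces up to the same seven-character width.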
 281/* Signal an execution time overhead to the output */
282static int
283print_graph_overhead(unsigned long long duration, struct trace_seq *s)
284{
 285 /* Duration exceeded 100 usecs */
286 if (duration > 100000ULL)
287 return trace_seq_printf(s, "! ");
288
 289 /* Duration exceeded 10 usecs */
290 if (duration > 10000ULL)
291 return trace_seq_printf(s, "+ ");
292
293 return trace_seq_printf(s, " ");
294}
295
296/* Case of a leaf function on its call entry */
297static enum print_line_t
298print_graph_entry_leaf(struct trace_iterator *iter,
299 struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
300{
301 struct ftrace_graph_ret_entry *ret_entry;
302 struct ftrace_graph_ret *graph_ret;
303 struct ring_buffer_event *event;
304 struct ftrace_graph_ent *call;
305 unsigned long long duration;
306 int ret;
307 int i;
308
309 event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
310 ret_entry = ring_buffer_event_data(event);
311 graph_ret = &ret_entry->ret;
312 call = &entry->graph_ent;
313 duration = graph_ret->rettime - graph_ret->calltime;
314
315 /* Overhead */
316 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
317 ret = print_graph_overhead(duration, s);
318 if (!ret)
319 return TRACE_TYPE_PARTIAL_LINE;
320 }
321
322 /* Duration */
323 ret = print_graph_duration(duration, s);
324 if (ret == TRACE_TYPE_PARTIAL_LINE)
325 return TRACE_TYPE_PARTIAL_LINE;
326
327 /* Function */
328 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
329 ret = trace_seq_printf(s, " ");
330 if (!ret)
331 return TRACE_TYPE_PARTIAL_LINE;
332 }
333
334 ret = seq_print_ip_sym(s, call->func, 0);
335 if (!ret)
336 return TRACE_TYPE_PARTIAL_LINE;
337
338 ret = trace_seq_printf(s, "();\n");
339 if (!ret)
340 return TRACE_TYPE_PARTIAL_LINE;
341
342 return TRACE_TYPE_HANDLED;
343}
344
345static enum print_line_t
346print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
347 struct trace_seq *s)
348{
349 int i;
350 int ret;
351 struct ftrace_graph_ent *call = &entry->graph_ent;
352
353 /* No overhead */
354 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
355 ret = trace_seq_printf(s, " ");
356 if (!ret)
357 return TRACE_TYPE_PARTIAL_LINE;
358 }
359
360 /* No time */
361 ret = trace_seq_printf(s, " | ");
362
363 /* Function */
364 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
365 ret = trace_seq_printf(s, " ");
366 if (!ret)
367 return TRACE_TYPE_PARTIAL_LINE;
368 }
369
370 ret = seq_print_ip_sym(s, call->func, 0);
371 if (!ret)
372 return TRACE_TYPE_PARTIAL_LINE;
373
374 ret = trace_seq_printf(s, "() {\n");
375 if (!ret)
376 return TRACE_TYPE_PARTIAL_LINE;
377
378 return TRACE_TYPE_HANDLED;
379}
380
381static enum print_line_t
382print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
383 struct trace_iterator *iter, int cpu)
384{
385 int ret;
386 struct trace_entry *ent = iter->ent;
387
388 /* Pid */
389 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
390 return TRACE_TYPE_PARTIAL_LINE;
391
392 /* Cpu */
393 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
394 ret = print_graph_cpu(s, cpu);
395 if (ret == TRACE_TYPE_PARTIAL_LINE)
396 return TRACE_TYPE_PARTIAL_LINE;
397 }
398
399 /* Proc */
400 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
401 ret = print_graph_proc(s, ent->pid);
402 if (ret == TRACE_TYPE_PARTIAL_LINE)
403 return TRACE_TYPE_PARTIAL_LINE;
404
405 ret = trace_seq_printf(s, " | ");
406 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE;
408 }
409
410 if (trace_branch_is_leaf(iter, field))
411 return print_graph_entry_leaf(iter, field, s);
412 else
413 return print_graph_entry_nested(field, s);
414
415}
416
417static enum print_line_t
418print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
419 struct trace_entry *ent, int cpu)
420{
421 int i;
422 int ret;
423 unsigned long long duration = trace->rettime - trace->calltime;
424
425 /* Pid */
426 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
427 return TRACE_TYPE_PARTIAL_LINE;
428
429 /* Cpu */
430 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
431 ret = print_graph_cpu(s, cpu);
432 if (ret == TRACE_TYPE_PARTIAL_LINE)
433 return TRACE_TYPE_PARTIAL_LINE;
434 }
435
436 /* Proc */
437 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
438 ret = print_graph_proc(s, ent->pid);
439 if (ret == TRACE_TYPE_PARTIAL_LINE)
440 return TRACE_TYPE_PARTIAL_LINE;
441
442 ret = trace_seq_printf(s, " | ");
443 if (!ret)
444 return TRACE_TYPE_PARTIAL_LINE;
445 }
446
447 /* Overhead */
448 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
449 ret = print_graph_overhead(duration, s);
450 if (!ret)
451 return TRACE_TYPE_PARTIAL_LINE;
452 }
453
454 /* Duration */
455 ret = print_graph_duration(duration, s);
456 if (ret == TRACE_TYPE_PARTIAL_LINE)
457 return TRACE_TYPE_PARTIAL_LINE;
458
459 /* Closing brace */
460 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
461 ret = trace_seq_printf(s, " ");
462 if (!ret)
463 return TRACE_TYPE_PARTIAL_LINE;
464 }
465
466 ret = trace_seq_printf(s, "}\n");
467 if (!ret)
468 return TRACE_TYPE_PARTIAL_LINE;
469
470 /* Overrun */
471 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
472 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
473 trace->overrun);
474 if (!ret)
475 return TRACE_TYPE_PARTIAL_LINE;
476 }
477 return TRACE_TYPE_HANDLED;
478}
479
480static enum print_line_t
481print_graph_comment(struct print_entry *trace, struct trace_seq *s,
482 struct trace_entry *ent, struct trace_iterator *iter)
483{
484 int i;
485 int ret;
486
487 /* Pid */
488 if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
489 return TRACE_TYPE_PARTIAL_LINE;
490
491 /* Cpu */
492 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
493 ret = print_graph_cpu(s, iter->cpu);
494 if (ret == TRACE_TYPE_PARTIAL_LINE)
495 return TRACE_TYPE_PARTIAL_LINE;
496 }
497
498 /* Proc */
499 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
500 ret = print_graph_proc(s, ent->pid);
501 if (ret == TRACE_TYPE_PARTIAL_LINE)
502 return TRACE_TYPE_PARTIAL_LINE;
503
504 ret = trace_seq_printf(s, " | ");
505 if (!ret)
506 return TRACE_TYPE_PARTIAL_LINE;
507 }
508
509 /* No overhead */
510 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
511 ret = trace_seq_printf(s, " ");
512 if (!ret)
513 return TRACE_TYPE_PARTIAL_LINE;
514 }
515
516 /* No time */
517 ret = trace_seq_printf(s, " | ");
518 if (!ret)
519 return TRACE_TYPE_PARTIAL_LINE;
520
521 /* Indentation */
522 if (trace->depth > 0)
523 for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
524 ret = trace_seq_printf(s, " ");
525 if (!ret)
526 return TRACE_TYPE_PARTIAL_LINE;
527 }
528
529 /* The comment */
530 ret = trace_seq_printf(s, "/* %s", trace->buf);
531 if (!ret)
532 return TRACE_TYPE_PARTIAL_LINE;
533
534 if (ent->flags & TRACE_FLAG_CONT)
535 trace_seq_print_cont(s, iter);
536
537 ret = trace_seq_printf(s, " */\n");
538 if (!ret)
539 return TRACE_TYPE_PARTIAL_LINE;
540
541 return TRACE_TYPE_HANDLED;
542}
543
544
545enum print_line_t
546print_graph_function(struct trace_iterator *iter)
547{
548 struct trace_seq *s = &iter->seq;
549 struct trace_entry *entry = iter->ent;
550
551 switch (entry->type) {
552 case TRACE_GRAPH_ENT: {
553 struct ftrace_graph_ent_entry *field;
554 trace_assign_type(field, entry);
555 return print_graph_entry(field, s, iter,
556 iter->cpu);
557 }
558 case TRACE_GRAPH_RET: {
559 struct ftrace_graph_ret_entry *field;
560 trace_assign_type(field, entry);
561 return print_graph_return(&field->ret, s, entry, iter->cpu);
562 }
563 case TRACE_PRINT: {
564 struct print_entry *field;
565 trace_assign_type(field, entry);
566 return print_graph_comment(field, s, entry, iter);
567 }
568 default:
569 return TRACE_TYPE_UNHANDLED;
570 }
571}
572
573static void print_graph_headers(struct seq_file *s)
574{
575 /* 1st line */
576 seq_printf(s, "# ");
577 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
578 seq_printf(s, "CPU ");
579 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
580 seq_printf(s, "TASK/PID ");
581 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
582 seq_printf(s, "OVERHEAD/");
583 seq_printf(s, "DURATION FUNCTION CALLS\n");
584
585 /* 2nd line */
586 seq_printf(s, "# ");
587 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
588 seq_printf(s, "| ");
589 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
590 seq_printf(s, "| | ");
591 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
592 seq_printf(s, "| ");
593 seq_printf(s, "| | | | |\n");
594 } else
595 seq_printf(s, " | | | | |\n");
596}
597static struct tracer graph_trace __read_mostly = {
598 .name = "function_graph",
599 .init = graph_trace_init,
600 .reset = graph_trace_reset,
601 .print_line = print_graph_function,
602 .print_header = print_graph_headers,
603 .flags = &tracer_flags,
604};
605
606static __init int init_graph_trace(void)
607{
608 return register_tracer(&graph_trace);
609}
610
611device_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9c74071c10e0..7c2e326bbc8b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
353} 353}
354#endif /* CONFIG_PREEMPT_TRACER */ 354#endif /* CONFIG_PREEMPT_TRACER */
355 355
356/*
357 * save_tracer_enabled is used to save the state of the tracer_enabled
 358 * variable when we disable it on opening a trace output file.
359 */
360static int save_tracer_enabled;
361
356static void start_irqsoff_tracer(struct trace_array *tr) 362static void start_irqsoff_tracer(struct trace_array *tr)
357{ 363{
358 register_ftrace_function(&trace_ops); 364 register_ftrace_function(&trace_ops);
359 tracer_enabled = 1; 365 if (tracing_is_enabled()) {
366 tracer_enabled = 1;
367 save_tracer_enabled = 1;
368 } else {
369 tracer_enabled = 0;
370 save_tracer_enabled = 0;
371 }
360} 372}
361 373
362static void stop_irqsoff_tracer(struct trace_array *tr) 374static void stop_irqsoff_tracer(struct trace_array *tr)
363{ 375{
364 tracer_enabled = 0; 376 tracer_enabled = 0;
377 save_tracer_enabled = 0;
365 unregister_ftrace_function(&trace_ops); 378 unregister_ftrace_function(&trace_ops);
366} 379}
367 380
@@ -370,53 +383,55 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
370 irqsoff_trace = tr; 383 irqsoff_trace = tr;
371 /* make sure that the tracer is visible */ 384 /* make sure that the tracer is visible */
372 smp_wmb(); 385 smp_wmb();
373 386 start_irqsoff_tracer(tr);
374 if (tr->ctrl)
375 start_irqsoff_tracer(tr);
376} 387}
377 388
378static void irqsoff_tracer_reset(struct trace_array *tr) 389static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 390{
380 if (tr->ctrl) 391 stop_irqsoff_tracer(tr);
381 stop_irqsoff_tracer(tr);
382} 392}
383 393
384static void irqsoff_tracer_ctrl_update(struct trace_array *tr) 394static void irqsoff_tracer_start(struct trace_array *tr)
385{ 395{
386 if (tr->ctrl) 396 tracer_enabled = 1;
387 start_irqsoff_tracer(tr); 397 save_tracer_enabled = 1;
388 else 398}
389 stop_irqsoff_tracer(tr); 399
400static void irqsoff_tracer_stop(struct trace_array *tr)
401{
402 tracer_enabled = 0;
403 save_tracer_enabled = 0;
390} 404}
391 405
392static void irqsoff_tracer_open(struct trace_iterator *iter) 406static void irqsoff_tracer_open(struct trace_iterator *iter)
393{ 407{
394 /* stop the trace while dumping */ 408 /* stop the trace while dumping */
395 if (iter->tr->ctrl) 409 tracer_enabled = 0;
396 stop_irqsoff_tracer(iter->tr);
397} 410}
398 411
399static void irqsoff_tracer_close(struct trace_iterator *iter) 412static void irqsoff_tracer_close(struct trace_iterator *iter)
400{ 413{
401 if (iter->tr->ctrl) 414 /* restart tracing */
402 start_irqsoff_tracer(iter->tr); 415 tracer_enabled = save_tracer_enabled;
403} 416}
404 417
405#ifdef CONFIG_IRQSOFF_TRACER 418#ifdef CONFIG_IRQSOFF_TRACER
406static void irqsoff_tracer_init(struct trace_array *tr) 419static int irqsoff_tracer_init(struct trace_array *tr)
407{ 420{
408 trace_type = TRACER_IRQS_OFF; 421 trace_type = TRACER_IRQS_OFF;
409 422
410 __irqsoff_tracer_init(tr); 423 __irqsoff_tracer_init(tr);
424 return 0;
411} 425}
412static struct tracer irqsoff_tracer __read_mostly = 426static struct tracer irqsoff_tracer __read_mostly =
413{ 427{
414 .name = "irqsoff", 428 .name = "irqsoff",
415 .init = irqsoff_tracer_init, 429 .init = irqsoff_tracer_init,
416 .reset = irqsoff_tracer_reset, 430 .reset = irqsoff_tracer_reset,
431 .start = irqsoff_tracer_start,
432 .stop = irqsoff_tracer_stop,
417 .open = irqsoff_tracer_open, 433 .open = irqsoff_tracer_open,
418 .close = irqsoff_tracer_close, 434 .close = irqsoff_tracer_close,
419 .ctrl_update = irqsoff_tracer_ctrl_update,
420 .print_max = 1, 435 .print_max = 1,
421#ifdef CONFIG_FTRACE_SELFTEST 436#ifdef CONFIG_FTRACE_SELFTEST
422 .selftest = trace_selftest_startup_irqsoff, 437 .selftest = trace_selftest_startup_irqsoff,
@@ -428,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly =
428#endif 443#endif
429 444
430#ifdef CONFIG_PREEMPT_TRACER 445#ifdef CONFIG_PREEMPT_TRACER
431static void preemptoff_tracer_init(struct trace_array *tr) 446static int preemptoff_tracer_init(struct trace_array *tr)
432{ 447{
433 trace_type = TRACER_PREEMPT_OFF; 448 trace_type = TRACER_PREEMPT_OFF;
434 449
435 __irqsoff_tracer_init(tr); 450 __irqsoff_tracer_init(tr);
451 return 0;
436} 452}
437 453
438static struct tracer preemptoff_tracer __read_mostly = 454static struct tracer preemptoff_tracer __read_mostly =
@@ -440,9 +456,10 @@ static struct tracer preemptoff_tracer __read_mostly =
440 .name = "preemptoff", 456 .name = "preemptoff",
441 .init = preemptoff_tracer_init, 457 .init = preemptoff_tracer_init,
442 .reset = irqsoff_tracer_reset, 458 .reset = irqsoff_tracer_reset,
459 .start = irqsoff_tracer_start,
460 .stop = irqsoff_tracer_stop,
443 .open = irqsoff_tracer_open, 461 .open = irqsoff_tracer_open,
444 .close = irqsoff_tracer_close, 462 .close = irqsoff_tracer_close,
445 .ctrl_update = irqsoff_tracer_ctrl_update,
446 .print_max = 1, 463 .print_max = 1,
447#ifdef CONFIG_FTRACE_SELFTEST 464#ifdef CONFIG_FTRACE_SELFTEST
448 .selftest = trace_selftest_startup_preemptoff, 465 .selftest = trace_selftest_startup_preemptoff,
@@ -456,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly =
456#if defined(CONFIG_IRQSOFF_TRACER) && \ 473#if defined(CONFIG_IRQSOFF_TRACER) && \
457 defined(CONFIG_PREEMPT_TRACER) 474 defined(CONFIG_PREEMPT_TRACER)
458 475
459static void preemptirqsoff_tracer_init(struct trace_array *tr) 476static int preemptirqsoff_tracer_init(struct trace_array *tr)
460{ 477{
461 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; 478 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
462 479
463 __irqsoff_tracer_init(tr); 480 __irqsoff_tracer_init(tr);
481 return 0;
464} 482}
465 483
466static struct tracer preemptirqsoff_tracer __read_mostly = 484static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -468,9 +486,10 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
468 .name = "preemptirqsoff", 486 .name = "preemptirqsoff",
469 .init = preemptirqsoff_tracer_init, 487 .init = preemptirqsoff_tracer_init,
470 .reset = irqsoff_tracer_reset, 488 .reset = irqsoff_tracer_reset,
489 .start = irqsoff_tracer_start,
490 .stop = irqsoff_tracer_stop,
471 .open = irqsoff_tracer_open, 491 .open = irqsoff_tracer_open,
472 .close = irqsoff_tracer_close, 492 .close = irqsoff_tracer_close,
473 .ctrl_update = irqsoff_tracer_ctrl_update,
474 .print_max = 1, 493 .print_max = 1,
475#ifdef CONFIG_FTRACE_SELFTEST 494#ifdef CONFIG_FTRACE_SELFTEST
476 .selftest = trace_selftest_startup_preemptirqsoff, 495 .selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index f28484618ff0..2fb6da6523b3 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -18,46 +18,43 @@ struct header_iter {
18 18
19static struct trace_array *mmio_trace_array; 19static struct trace_array *mmio_trace_array;
20static bool overrun_detected; 20static bool overrun_detected;
21static unsigned long prev_overruns;
21 22
22static void mmio_reset_data(struct trace_array *tr) 23static void mmio_reset_data(struct trace_array *tr)
23{ 24{
24 int cpu; 25 int cpu;
25 26
26 overrun_detected = false; 27 overrun_detected = false;
28 prev_overruns = 0;
27 tr->time_start = ftrace_now(tr->cpu); 29 tr->time_start = ftrace_now(tr->cpu);
28 30
29 for_each_online_cpu(cpu) 31 for_each_online_cpu(cpu)
30 tracing_reset(tr, cpu); 32 tracing_reset(tr, cpu);
31} 33}
32 34
33static void mmio_trace_init(struct trace_array *tr) 35static int mmio_trace_init(struct trace_array *tr)
34{ 36{
35 pr_debug("in %s\n", __func__); 37 pr_debug("in %s\n", __func__);
36 mmio_trace_array = tr; 38 mmio_trace_array = tr;
37 if (tr->ctrl) { 39
38 mmio_reset_data(tr); 40 mmio_reset_data(tr);
39 enable_mmiotrace(); 41 enable_mmiotrace();
40 } 42 return 0;
41} 43}
42 44
43static void mmio_trace_reset(struct trace_array *tr) 45static void mmio_trace_reset(struct trace_array *tr)
44{ 46{
45 pr_debug("in %s\n", __func__); 47 pr_debug("in %s\n", __func__);
46 if (tr->ctrl) 48
47 disable_mmiotrace(); 49 disable_mmiotrace();
48 mmio_reset_data(tr); 50 mmio_reset_data(tr);
49 mmio_trace_array = NULL; 51 mmio_trace_array = NULL;
50} 52}
51 53
52static void mmio_trace_ctrl_update(struct trace_array *tr) 54static void mmio_trace_start(struct trace_array *tr)
53{ 55{
54 pr_debug("in %s\n", __func__); 56 pr_debug("in %s\n", __func__);
55 if (tr->ctrl) { 57 mmio_reset_data(tr);
56 mmio_reset_data(tr);
57 enable_mmiotrace();
58 } else {
59 disable_mmiotrace();
60 }
61} 58}
62 59
63static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 60static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
@@ -128,16 +125,12 @@ static void mmio_close(struct trace_iterator *iter)
128 125
129static unsigned long count_overruns(struct trace_iterator *iter) 126static unsigned long count_overruns(struct trace_iterator *iter)
130{ 127{
131 int cpu;
132 unsigned long cnt = 0; 128 unsigned long cnt = 0;
133/* FIXME: */ 129 unsigned long over = ring_buffer_overruns(iter->tr->buffer);
134#if 0 130
135 for_each_online_cpu(cpu) { 131 if (over > prev_overruns)
136 cnt += iter->overrun[cpu]; 132 cnt = over - prev_overruns;
137 iter->overrun[cpu] = 0; 133 prev_overruns = over;
138 }
139#endif
140 (void)cpu;
141 return cnt; 134 return cnt;
142} 135}
143 136
@@ -298,10 +291,10 @@ static struct tracer mmio_tracer __read_mostly =
298 .name = "mmiotrace", 291 .name = "mmiotrace",
299 .init = mmio_trace_init, 292 .init = mmio_trace_init,
300 .reset = mmio_trace_reset, 293 .reset = mmio_trace_reset,
294 .start = mmio_trace_start,
301 .pipe_open = mmio_pipe_open, 295 .pipe_open = mmio_pipe_open,
302 .close = mmio_close, 296 .close = mmio_close,
303 .read = mmio_read, 297 .read = mmio_read,
304 .ctrl_update = mmio_trace_ctrl_update,
305 .print_line = mmio_print_line, 298 .print_line = mmio_print_line,
306}; 299};
307 300
@@ -373,5 +366,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
373 366
374int mmio_trace_printk(const char *fmt, va_list args) 367int mmio_trace_printk(const char *fmt, va_list args)
375{ 368{
376 return trace_vprintk(0, fmt, args); 369 return trace_vprintk(0, -1, fmt, args);
377} 370}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 4592b4862515..b9767acd30ac 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -12,6 +12,27 @@
12 12
13#include "trace.h" 13#include "trace.h"
14 14
15/* Our two options */
16enum {
17 TRACE_NOP_OPT_ACCEPT = 0x1,
18 TRACE_NOP_OPT_REFUSE = 0x2
19};
20
21/* Options for the tracer (see trace_options file) */
22static struct tracer_opt nop_opts[] = {
23 /* Option that will be accepted by set_flag callback */
24 { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) },
25 /* Option that will be refused by set_flag callback */
26 { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) },
27 { } /* Always set a last empty entry */
28};
29
30static struct tracer_flags nop_flags = {
31 /* You can check your flags value here when you want. */
32 .val = 0, /* By default: all flags disabled */
33 .opts = nop_opts
34};
35
15static struct trace_array *ctx_trace; 36static struct trace_array *ctx_trace;
16 37
17static void start_nop_trace(struct trace_array *tr) 38static void start_nop_trace(struct trace_array *tr)
@@ -24,7 +45,7 @@ static void stop_nop_trace(struct trace_array *tr)
24 /* Nothing to do! */ 45 /* Nothing to do! */
25} 46}
26 47
27static void nop_trace_init(struct trace_array *tr) 48static int nop_trace_init(struct trace_array *tr)
28{ 49{
29 int cpu; 50 int cpu;
30 ctx_trace = tr; 51 ctx_trace = tr;
@@ -32,33 +53,53 @@ static void nop_trace_init(struct trace_array *tr)
32 for_each_online_cpu(cpu) 53 for_each_online_cpu(cpu)
33 tracing_reset(tr, cpu); 54 tracing_reset(tr, cpu);
34 55
35 if (tr->ctrl) 56 start_nop_trace(tr);
36 start_nop_trace(tr); 57 return 0;
37} 58}
38 59
39static void nop_trace_reset(struct trace_array *tr) 60static void nop_trace_reset(struct trace_array *tr)
40{ 61{
41 if (tr->ctrl) 62 stop_nop_trace(tr);
42 stop_nop_trace(tr);
43} 63}
44 64
45static void nop_trace_ctrl_update(struct trace_array *tr) 65/* It only serves as a signal handler and a callback to
 66 * accept or refuse the setting of a flag.
67 * If you don't implement it, then the flag setting will be
68 * automatically accepted.
69 */
70static int nop_set_flag(u32 old_flags, u32 bit, int set)
46{ 71{
47 /* When starting a new trace, reset the buffers */ 72 /*
48 if (tr->ctrl) 73 * Note that you don't need to update nop_flags.val yourself.
 49 start_nop_trace(tr); 74 * The tracing API will do it automatically if you return 0
50 else 75 */
51 stop_nop_trace(tr); 76 if (bit == TRACE_NOP_OPT_ACCEPT) {
77 printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept."
78 " Now cat trace_options to see the result\n",
79 set);
80 return 0;
81 }
82
83 if (bit == TRACE_NOP_OPT_REFUSE) {
84 printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
85 "Now cat trace_options to see the result\n",
86 set);
87 return -EINVAL;
88 }
89
90 return 0;
52} 91}
53 92
93
54struct tracer nop_trace __read_mostly = 94struct tracer nop_trace __read_mostly =
55{ 95{
56 .name = "nop", 96 .name = "nop",
57 .init = nop_trace_init, 97 .init = nop_trace_init,
58 .reset = nop_trace_reset, 98 .reset = nop_trace_reset,
59 .ctrl_update = nop_trace_ctrl_update,
60#ifdef CONFIG_FTRACE_SELFTEST 99#ifdef CONFIG_FTRACE_SELFTEST
61 .selftest = trace_selftest_startup_nop, 100 .selftest = trace_selftest_startup_nop,
62#endif 101#endif
102 .flags = &nop_flags,
103 .set_flag = nop_set_flag
63}; 104};
64 105
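
The nop tracer above now doubles as a reference for the per-tracer options mechanism. A hedged sketch of how another tracer could use the same tracer_opt/tracer_flags/set_flag plumbing; the my_* identifiers are invented for illustration and assume #include "trace.h" for TRACER_OPT and the flag structures.

enum {
	MY_OPT_VERBOSE = 0x1,
};

static struct tracer_opt my_opts[] = {
	{ TRACER_OPT(my_verbose, MY_OPT_VERBOSE) },
	{ }	/* terminating empty entry */
};

static struct tracer_flags my_flags = {
	.val	= 0,		/* all options off by default */
	.opts	= my_opts,
};

static int my_set_flag(u32 old_flags, u32 bit, int set)
{
	/* returning 0 accepts the change; the core then updates my_flags.val */
	if (bit == MY_OPT_VERBOSE)
		return 0;
	return -EINVAL;
}

The tracer would then point .flags at &my_flags and .set_flag at my_set_flag in its struct tracer, exactly as nop_trace does above, and the option shows up through the trace_options file.
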
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644
index 000000000000..a7172a352f62
--- /dev/null
+++ b/kernel/trace/trace_power.c
@@ -0,0 +1,179 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <linux/ftrace.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19
20static struct trace_array *power_trace;
21static int __read_mostly trace_power_enabled;
22
23
24static void start_power_trace(struct trace_array *tr)
25{
26 trace_power_enabled = 1;
27}
28
29static void stop_power_trace(struct trace_array *tr)
30{
31 trace_power_enabled = 0;
32}
33
34
35static int power_trace_init(struct trace_array *tr)
36{
37 int cpu;
38 power_trace = tr;
39
40 trace_power_enabled = 1;
41
42 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu);
44 return 0;
45}
46
47static enum print_line_t power_print_line(struct trace_iterator *iter)
48{
49 int ret = 0;
50 struct trace_entry *entry = iter->ent;
 51 struct trace_power *field;
52 struct power_trace *it;
53 struct trace_seq *s = &iter->seq;
54 struct timespec stamp;
55 struct timespec duration;
56
57 trace_assign_type(field, entry);
58 it = &field->state_data;
59 stamp = ktime_to_timespec(it->stamp);
60 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
61
62 if (entry->type == TRACE_POWER) {
63 if (it->type == POWER_CSTATE)
64 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
65 stamp.tv_sec,
66 stamp.tv_nsec,
67 it->state, iter->cpu,
68 duration.tv_sec,
69 duration.tv_nsec);
70 if (it->type == POWER_PSTATE)
71 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
72 stamp.tv_sec,
73 stamp.tv_nsec,
74 it->state, iter->cpu);
75 if (!ret)
76 return TRACE_TYPE_PARTIAL_LINE;
77 return TRACE_TYPE_HANDLED;
78 }
79 return TRACE_TYPE_UNHANDLED;
80}
81
82static struct tracer power_tracer __read_mostly =
83{
84 .name = "power",
85 .init = power_trace_init,
86 .start = start_power_trace,
87 .stop = stop_power_trace,
88 .reset = stop_power_trace,
89 .print_line = power_print_line,
90};
91
92static int init_power_trace(void)
93{
94 return register_tracer(&power_tracer);
95}
96device_initcall(init_power_trace);
97
98void trace_power_start(struct power_trace *it, unsigned int type,
99 unsigned int level)
100{
101 if (!trace_power_enabled)
102 return;
103
104 memset(it, 0, sizeof(struct power_trace));
105 it->state = level;
106 it->type = type;
107 it->stamp = ktime_get();
108}
109EXPORT_SYMBOL_GPL(trace_power_start);
110
111
112void trace_power_end(struct power_trace *it)
113{
114 struct ring_buffer_event *event;
115 struct trace_power *entry;
116 struct trace_array_cpu *data;
117 unsigned long irq_flags;
118 struct trace_array *tr = power_trace;
119
120 if (!trace_power_enabled)
121 return;
122
123 preempt_disable();
124 it->end = ktime_get();
125 data = tr->data[smp_processor_id()];
126
127 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
128 &irq_flags);
129 if (!event)
130 goto out;
131 entry = ring_buffer_event_data(event);
132 tracing_generic_entry_update(&entry->ent, 0, 0);
133 entry->ent.type = TRACE_POWER;
134 entry->state_data = *it;
135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
136
137 trace_wake_up();
138
139 out:
140 preempt_enable();
141}
142EXPORT_SYMBOL_GPL(trace_power_end);
143
144void trace_power_mark(struct power_trace *it, unsigned int type,
145 unsigned int level)
146{
147 struct ring_buffer_event *event;
148 struct trace_power *entry;
149 struct trace_array_cpu *data;
150 unsigned long irq_flags;
151 struct trace_array *tr = power_trace;
152
153 if (!trace_power_enabled)
154 return;
155
156 memset(it, 0, sizeof(struct power_trace));
157 it->state = level;
158 it->type = type;
159 it->stamp = ktime_get();
160 preempt_disable();
161 it->end = it->stamp;
162 data = tr->data[smp_processor_id()];
163
164 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
165 &irq_flags);
166 if (!event)
167 goto out;
168 entry = ring_buffer_event_data(event);
169 tracing_generic_entry_update(&entry->ent, 0, 0);
170 entry->ent.type = TRACE_POWER;
171 entry->state_data = *it;
172 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
173
174 trace_wake_up();
175
176 out:
177 preempt_enable();
178}
179EXPORT_SYMBOL_GPL(trace_power_mark);
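
A hedged sketch of how idle or cpufreq code might call into this new tracer; my_enter_cstate() is hypothetical, and struct power_trace/POWER_CSTATE are assumed to be declared in linux/ftrace.h alongside the prototypes used here.

#include <linux/ftrace.h>

static void my_enter_cstate(int state)
{
	struct power_trace it;

	trace_power_start(&it, POWER_CSTATE, state);	/* timestamp the entry */

	/* ... architecture-specific idle entry would go here ... */

	trace_power_end(&it);	/* records type, state and duration in the ring buffer */
}
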
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index b8f56beb1a62..863390557b44 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@
16 16
17static struct trace_array *ctx_trace; 17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled; 18static int __read_mostly tracer_enabled;
19static atomic_t sched_ref; 19static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex);
20 21
21static void 22static void
22probe_sched_switch(struct rq *__rq, struct task_struct *prev, 23probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -27,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
27 int cpu; 28 int cpu;
28 int pc; 29 int pc;
29 30
30 if (!atomic_read(&sched_ref)) 31 if (!sched_ref)
31 return; 32 return;
32 33
33 tracing_record_cmdline(prev); 34 tracing_record_cmdline(prev);
@@ -123,20 +124,18 @@ static void tracing_sched_unregister(void)
123 124
124static void tracing_start_sched_switch(void) 125static void tracing_start_sched_switch(void)
125{ 126{
126 long ref; 127 mutex_lock(&sched_register_mutex);
127 128 if (!(sched_ref++))
128 ref = atomic_inc_return(&sched_ref);
129 if (ref == 1)
130 tracing_sched_register(); 129 tracing_sched_register();
130 mutex_unlock(&sched_register_mutex);
131} 131}
132 132
133static void tracing_stop_sched_switch(void) 133static void tracing_stop_sched_switch(void)
134{ 134{
135 long ref; 135 mutex_lock(&sched_register_mutex);
136 136 if (!(--sched_ref))
137 ref = atomic_dec_and_test(&sched_ref);
138 if (ref)
139 tracing_sched_unregister(); 137 tracing_sched_unregister();
138 mutex_unlock(&sched_register_mutex);
140} 139}
141 140
142void tracing_start_cmdline_record(void) 141void tracing_start_cmdline_record(void)
@@ -149,40 +148,86 @@ void tracing_stop_cmdline_record(void)
149 tracing_stop_sched_switch(); 148 tracing_stop_sched_switch();
150} 149}
151 150
151/**
152 * tracing_start_sched_switch_record - start tracing context switches
153 *
154 * Turns on context switch tracing for a tracer.
155 */
156void tracing_start_sched_switch_record(void)
157{
158 if (unlikely(!ctx_trace)) {
159 WARN_ON(1);
160 return;
161 }
162
163 tracing_start_sched_switch();
164
165 mutex_lock(&sched_register_mutex);
166 tracer_enabled++;
167 mutex_unlock(&sched_register_mutex);
168}
169
170/**
 171 * tracing_stop_sched_switch_record - stop tracing context switches
172 *
173 * Turns off context switch tracing for a tracer.
174 */
175void tracing_stop_sched_switch_record(void)
176{
177 mutex_lock(&sched_register_mutex);
178 tracer_enabled--;
179 WARN_ON(tracer_enabled < 0);
180 mutex_unlock(&sched_register_mutex);
181
182 tracing_stop_sched_switch();
183}
184
185/**
186 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
187 * @tr: trace array pointer to assign
188 *
189 * Some tracers might want to record the context switches in their
190 * trace. This function lets those tracers assign the trace array
191 * to use.
192 */
193void tracing_sched_switch_assign_trace(struct trace_array *tr)
194{
195 ctx_trace = tr;
196}
197
152static void start_sched_trace(struct trace_array *tr) 198static void start_sched_trace(struct trace_array *tr)
153{ 199{
154 sched_switch_reset(tr); 200 sched_switch_reset(tr);
155 tracing_start_cmdline_record(); 201 tracing_start_sched_switch_record();
156 tracer_enabled = 1;
157} 202}
158 203
159static void stop_sched_trace(struct trace_array *tr) 204static void stop_sched_trace(struct trace_array *tr)
160{ 205{
161 tracer_enabled = 0; 206 tracing_stop_sched_switch_record();
162 tracing_stop_cmdline_record();
163} 207}
164 208
165static void sched_switch_trace_init(struct trace_array *tr) 209static int sched_switch_trace_init(struct trace_array *tr)
166{ 210{
167 ctx_trace = tr; 211 ctx_trace = tr;
168 212 start_sched_trace(tr);
169 if (tr->ctrl) 213 return 0;
170 start_sched_trace(tr);
171} 214}
172 215
173static void sched_switch_trace_reset(struct trace_array *tr) 216static void sched_switch_trace_reset(struct trace_array *tr)
174{ 217{
175 if (tr->ctrl) 218 if (sched_ref)
176 stop_sched_trace(tr); 219 stop_sched_trace(tr);
177} 220}
178 221
179static void sched_switch_trace_ctrl_update(struct trace_array *tr) 222static void sched_switch_trace_start(struct trace_array *tr)
180{ 223{
181 /* When starting a new trace, reset the buffers */ 224 sched_switch_reset(tr);
182 if (tr->ctrl) 225 tracing_start_sched_switch();
183 start_sched_trace(tr); 226}
184 else 227
185 stop_sched_trace(tr); 228static void sched_switch_trace_stop(struct trace_array *tr)
229{
230 tracing_stop_sched_switch();
186} 231}
187 232
188static struct tracer sched_switch_trace __read_mostly = 233static struct tracer sched_switch_trace __read_mostly =
@@ -190,7 +235,8 @@ static struct tracer sched_switch_trace __read_mostly =
190 .name = "sched_switch", 235 .name = "sched_switch",
191 .init = sched_switch_trace_init, 236 .init = sched_switch_trace_init,
192 .reset = sched_switch_trace_reset, 237 .reset = sched_switch_trace_reset,
193 .ctrl_update = sched_switch_trace_ctrl_update, 238 .start = sched_switch_trace_start,
239 .stop = sched_switch_trace_stop,
194#ifdef CONFIG_FTRACE_SELFTEST 240#ifdef CONFIG_FTRACE_SELFTEST
195 .selftest = trace_selftest_startup_sched_switch, 241 .selftest = trace_selftest_startup_sched_switch,
196#endif 242#endif
@@ -198,14 +244,6 @@ static struct tracer sched_switch_trace __read_mostly =
198 244
199__init static int init_sched_switch_trace(void) 245__init static int init_sched_switch_trace(void)
200{ 246{
201 int ret = 0;
202
203 if (atomic_read(&sched_ref))
204 ret = tracing_sched_register();
205 if (ret) {
206 pr_info("error registering scheduler trace\n");
207 return ret;
208 }
209 return register_tracer(&sched_switch_trace); 247 return register_tracer(&sched_switch_trace);
210} 248}
211device_initcall(init_sched_switch_trace); 249device_initcall(init_sched_switch_trace);
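
A hedged sketch of how another tracer could consume the helpers exported above, using tracing_sched_switch_assign_trace() plus the start/stop record calls to fold context switches into its own trace; the my_latency_* names are invented.

static int my_latency_tracer_init(struct trace_array *tr)
{
	/* direct sched-switch events into this tracer's array ... */
	tracing_sched_switch_assign_trace(tr);
	/* ... and arm the sched tracepoint probes */
	tracing_start_sched_switch_record();
	return 0;
}

static void my_latency_tracer_reset(struct trace_array *tr)
{
	tracing_stop_sched_switch_record();
}
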
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3ae93f16b565..0067b49746c1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
50 return; 50 return;
51 51
52 pc = preempt_count(); 52 pc = preempt_count();
53 resched = need_resched(); 53 resched = ftrace_preempt_disable();
54 preempt_disable_notrace();
55 54
56 cpu = raw_smp_processor_id(); 55 cpu = raw_smp_processor_id();
57 data = tr->data[cpu]; 56 data = tr->data[cpu];
@@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
81 out: 80 out:
82 atomic_dec(&data->disabled); 81 atomic_dec(&data->disabled);
83 82
84 /* 83 ftrace_preempt_enable(resched);
85 * To prevent recursion from the scheduler, if the
86 * resched flag was set before we entered, then
87 * don't reschedule.
88 */
89 if (resched)
90 preempt_enable_no_resched_notrace();
91 else
92 preempt_enable_notrace();
93} 84}
94 85
95static struct ftrace_ops trace_ops __read_mostly = 86static struct ftrace_ops trace_ops __read_mostly =
@@ -271,6 +262,12 @@ out:
271 atomic_dec(&wakeup_trace->data[cpu]->disabled); 262 atomic_dec(&wakeup_trace->data[cpu]->disabled);
272} 263}
273 264
265/*
266 * save_tracer_enabled is used to save the state of the tracer_enabled
267 * variable when we disable it when we open a trace output file.
268 */
269static int save_tracer_enabled;
270
274static void start_wakeup_tracer(struct trace_array *tr) 271static void start_wakeup_tracer(struct trace_array *tr)
275{ 272{
276 int ret; 273 int ret;
@@ -309,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
309 306
310 register_ftrace_function(&trace_ops); 307 register_ftrace_function(&trace_ops);
311 308
312 tracer_enabled = 1; 309 if (tracing_is_enabled()) {
310 tracer_enabled = 1;
311 save_tracer_enabled = 1;
312 } else {
313 tracer_enabled = 0;
314 save_tracer_enabled = 0;
315 }
313 316
314 return; 317 return;
315fail_deprobe_wake_new: 318fail_deprobe_wake_new:
@@ -321,49 +324,53 @@ fail_deprobe:
321static void stop_wakeup_tracer(struct trace_array *tr) 324static void stop_wakeup_tracer(struct trace_array *tr)
322{ 325{
323 tracer_enabled = 0; 326 tracer_enabled = 0;
327 save_tracer_enabled = 0;
324 unregister_ftrace_function(&trace_ops); 328 unregister_ftrace_function(&trace_ops);
325 unregister_trace_sched_switch(probe_wakeup_sched_switch); 329 unregister_trace_sched_switch(probe_wakeup_sched_switch);
326 unregister_trace_sched_wakeup_new(probe_wakeup); 330 unregister_trace_sched_wakeup_new(probe_wakeup);
327 unregister_trace_sched_wakeup(probe_wakeup); 331 unregister_trace_sched_wakeup(probe_wakeup);
328} 332}
329 333
330static void wakeup_tracer_init(struct trace_array *tr) 334static int wakeup_tracer_init(struct trace_array *tr)
331{ 335{
332 wakeup_trace = tr; 336 wakeup_trace = tr;
333 337 start_wakeup_tracer(tr);
334 if (tr->ctrl) 338 return 0;
335 start_wakeup_tracer(tr);
336} 339}
337 340
338static void wakeup_tracer_reset(struct trace_array *tr) 341static void wakeup_tracer_reset(struct trace_array *tr)
339{ 342{
340 if (tr->ctrl) { 343 stop_wakeup_tracer(tr);
341 stop_wakeup_tracer(tr); 344 /* make sure we put back any tasks we are tracing */
342 /* make sure we put back any tasks we are tracing */ 345 wakeup_reset(tr);
343 wakeup_reset(tr); 346}
344 } 347
348static void wakeup_tracer_start(struct trace_array *tr)
349{
350 wakeup_reset(tr);
351 tracer_enabled = 1;
352 save_tracer_enabled = 1;
345} 353}
346 354
347static void wakeup_tracer_ctrl_update(struct trace_array *tr) 355static void wakeup_tracer_stop(struct trace_array *tr)
348{ 356{
349 if (tr->ctrl) 357 tracer_enabled = 0;
350 start_wakeup_tracer(tr); 358 save_tracer_enabled = 0;
351 else
352 stop_wakeup_tracer(tr);
353} 359}
354 360
355static void wakeup_tracer_open(struct trace_iterator *iter) 361static void wakeup_tracer_open(struct trace_iterator *iter)
356{ 362{
357 /* stop the trace while dumping */ 363 /* stop the trace while dumping */
358 if (iter->tr->ctrl) 364 tracer_enabled = 0;
359 stop_wakeup_tracer(iter->tr);
360} 365}
361 366
362static void wakeup_tracer_close(struct trace_iterator *iter) 367static void wakeup_tracer_close(struct trace_iterator *iter)
363{ 368{
364 /* forget about any processes we were recording */ 369 /* forget about any processes we were recording */
365 if (iter->tr->ctrl) 370 if (save_tracer_enabled) {
366 start_wakeup_tracer(iter->tr); 371 wakeup_reset(iter->tr);
372 tracer_enabled = 1;
373 }
367} 374}
368 375
369static struct tracer wakeup_tracer __read_mostly = 376static struct tracer wakeup_tracer __read_mostly =
@@ -371,9 +378,10 @@ static struct tracer wakeup_tracer __read_mostly =
371 .name = "wakeup", 378 .name = "wakeup",
372 .init = wakeup_tracer_init, 379 .init = wakeup_tracer_init,
373 .reset = wakeup_tracer_reset, 380 .reset = wakeup_tracer_reset,
381 .start = wakeup_tracer_start,
382 .stop = wakeup_tracer_stop,
374 .open = wakeup_tracer_open, 383 .open = wakeup_tracer_open,
375 .close = wakeup_tracer_close, 384 .close = wakeup_tracer_close,
376 .ctrl_update = wakeup_tracer_ctrl_update,
377 .print_max = 1, 385 .print_max = 1,
378#ifdef CONFIG_FTRACE_SELFTEST 386#ifdef CONFIG_FTRACE_SELFTEST
379 .selftest = trace_selftest_startup_wakeup, 387 .selftest = trace_selftest_startup_wakeup,
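
Both this file and trace_stack.c below replace the open-coded need_resched()/preempt_disable_notrace() dance with the ftrace_preempt_disable()/ftrace_preempt_enable() helpers. A hedged sketch of the pattern in a generic ftrace callback; my_trace_call() is invented for illustration.

static void my_trace_call(unsigned long ip, unsigned long parent_ip)
{
	int resched;

	/* disable preemption, remembering whether a resched was already pending */
	resched = ftrace_preempt_disable();

	/* ... per-cpu recording work, safe against scheduler recursion ... */

	/* re-enable preemption; skips the resched if one was pending on entry */
	ftrace_preempt_enable(resched);
}
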
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 90bc752a7580..88c8eb70f54a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT: 14 case TRACE_PRINT:
15 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
16 case TRACE_BRANCH:
16 return 1; 17 return 1;
17 } 18 }
18 return 0; 19 return 0;
@@ -51,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
51 int cpu, ret = 0; 52 int cpu, ret = 0;
52 53
53 /* Don't allow flipping of max traces now */ 54 /* Don't allow flipping of max traces now */
54 raw_local_irq_save(flags); 55 local_irq_save(flags);
55 __raw_spin_lock(&ftrace_max_lock); 56 __raw_spin_lock(&ftrace_max_lock);
56 57
57 cnt = ring_buffer_entries(tr->buffer); 58 cnt = ring_buffer_entries(tr->buffer);
@@ -62,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
62 break; 63 break;
63 } 64 }
64 __raw_spin_unlock(&ftrace_max_lock); 65 __raw_spin_unlock(&ftrace_max_lock);
65 raw_local_irq_restore(flags); 66 local_irq_restore(flags);
66 67
67 if (count) 68 if (count)
68 *count = cnt; 69 *count = cnt;
@@ -70,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
70 return ret; 71 return ret;
71} 72}
72 73
74static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
75{
76 printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n",
77 trace->name, init_ret);
78}
73#ifdef CONFIG_FUNCTION_TRACER 79#ifdef CONFIG_FUNCTION_TRACER
74 80
75#ifdef CONFIG_DYNAMIC_FTRACE 81#ifdef CONFIG_DYNAMIC_FTRACE
@@ -110,8 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
110 ftrace_set_filter(func_name, strlen(func_name), 1); 116 ftrace_set_filter(func_name, strlen(func_name), 1);
111 117
112 /* enable tracing */ 118 /* enable tracing */
113 tr->ctrl = 1; 119 ret = trace->init(tr);
114 trace->init(tr); 120 if (ret) {
121 warn_failed_init_tracer(trace, ret);
122 goto out;
123 }
115 124
116 /* Sleep for a 1/10 of a second */ 125 /* Sleep for a 1/10 of a second */
117 msleep(100); 126 msleep(100);
@@ -134,13 +143,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
134 msleep(100); 143 msleep(100);
135 144
136 /* stop the tracing. */ 145 /* stop the tracing. */
137 tr->ctrl = 0; 146 tracing_stop();
138 trace->ctrl_update(tr);
139 ftrace_enabled = 0; 147 ftrace_enabled = 0;
140 148
141 /* check the trace buffer */ 149 /* check the trace buffer */
142 ret = trace_test_buffer(tr, &count); 150 ret = trace_test_buffer(tr, &count);
143 trace->reset(tr); 151 trace->reset(tr);
152 tracing_start();
144 153
145 /* we should only have one item */ 154 /* we should only have one item */
146 if (!ret && count != 1) { 155 if (!ret && count != 1) {
@@ -148,6 +157,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
148 ret = -1; 157 ret = -1;
149 goto out; 158 goto out;
150 } 159 }
160
151 out: 161 out:
152 ftrace_enabled = save_ftrace_enabled; 162 ftrace_enabled = save_ftrace_enabled;
153 tracer_enabled = save_tracer_enabled; 163 tracer_enabled = save_tracer_enabled;
@@ -180,18 +190,22 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
180 ftrace_enabled = 1; 190 ftrace_enabled = 1;
181 tracer_enabled = 1; 191 tracer_enabled = 1;
182 192
183 tr->ctrl = 1; 193 ret = trace->init(tr);
184 trace->init(tr); 194 if (ret) {
195 warn_failed_init_tracer(trace, ret);
196 goto out;
197 }
198
185 /* Sleep for a 1/10 of a second */ 199 /* Sleep for a 1/10 of a second */
186 msleep(100); 200 msleep(100);
187 /* stop the tracing. */ 201 /* stop the tracing. */
188 tr->ctrl = 0; 202 tracing_stop();
189 trace->ctrl_update(tr);
190 ftrace_enabled = 0; 203 ftrace_enabled = 0;
191 204
192 /* check the trace buffer */ 205 /* check the trace buffer */
193 ret = trace_test_buffer(tr, &count); 206 ret = trace_test_buffer(tr, &count);
194 trace->reset(tr); 207 trace->reset(tr);
208 tracing_start();
195 209
196 if (!ret && !count) { 210 if (!ret && !count) {
197 printk(KERN_CONT ".. no entries found .."); 211 printk(KERN_CONT ".. no entries found ..");
@@ -223,8 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
223 int ret; 237 int ret;
224 238
225 /* start the tracing */ 239 /* start the tracing */
226 tr->ctrl = 1; 240 ret = trace->init(tr);
227 trace->init(tr); 241 if (ret) {
242 warn_failed_init_tracer(trace, ret);
243 return ret;
244 }
245
228 /* reset the max latency */ 246 /* reset the max latency */
229 tracing_max_latency = 0; 247 tracing_max_latency = 0;
230 /* disable interrupts for a bit */ 248 /* disable interrupts for a bit */
@@ -232,13 +250,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
232 udelay(100); 250 udelay(100);
233 local_irq_enable(); 251 local_irq_enable();
234 /* stop the tracing. */ 252 /* stop the tracing. */
235 tr->ctrl = 0; 253 tracing_stop();
236 trace->ctrl_update(tr);
237 /* check both trace buffers */ 254 /* check both trace buffers */
238 ret = trace_test_buffer(tr, NULL); 255 ret = trace_test_buffer(tr, NULL);
239 if (!ret) 256 if (!ret)
240 ret = trace_test_buffer(&max_tr, &count); 257 ret = trace_test_buffer(&max_tr, &count);
241 trace->reset(tr); 258 trace->reset(tr);
259 tracing_start();
242 260
243 if (!ret && !count) { 261 if (!ret && !count) {
244 printk(KERN_CONT ".. no entries found .."); 262 printk(KERN_CONT ".. no entries found ..");
@@ -259,9 +277,26 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
259 unsigned long count; 277 unsigned long count;
260 int ret; 278 int ret;
261 279
280 /*
281 * Now that the big kernel lock is no longer preemptable,
282 * and this is called with the BKL held, it will always
283 * fail. If preemption is already disabled, simply
284 * pass the test. When the BKL is removed, or becomes
285 * preemptible again, we will once again test this,
286 * so keep it in.
287 */
288 if (preempt_count()) {
289 printk(KERN_CONT "can not test ... force ");
290 return 0;
291 }
292
262 /* start the tracing */ 293 /* start the tracing */
263 tr->ctrl = 1; 294 ret = trace->init(tr);
264 trace->init(tr); 295 if (ret) {
296 warn_failed_init_tracer(trace, ret);
297 return ret;
298 }
299
265 /* reset the max latency */ 300 /* reset the max latency */
266 tracing_max_latency = 0; 301 tracing_max_latency = 0;
267 /* disable preemption for a bit */ 302 /* disable preemption for a bit */
@@ -269,13 +304,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
269 udelay(100); 304 udelay(100);
270 preempt_enable(); 305 preempt_enable();
271 /* stop the tracing. */ 306 /* stop the tracing. */
272 tr->ctrl = 0; 307 tracing_stop();
273 trace->ctrl_update(tr);
274 /* check both trace buffers */ 308 /* check both trace buffers */
275 ret = trace_test_buffer(tr, NULL); 309 ret = trace_test_buffer(tr, NULL);
276 if (!ret) 310 if (!ret)
277 ret = trace_test_buffer(&max_tr, &count); 311 ret = trace_test_buffer(&max_tr, &count);
278 trace->reset(tr); 312 trace->reset(tr);
313 tracing_start();
279 314
280 if (!ret && !count) { 315 if (!ret && !count) {
281 printk(KERN_CONT ".. no entries found .."); 316 printk(KERN_CONT ".. no entries found ..");
@@ -296,9 +331,25 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
296 unsigned long count; 331 unsigned long count;
297 int ret; 332 int ret;
298 333
334 /*
 335 * Now that the big kernel lock is no longer preemptible,
336 * and this is called with the BKL held, it will always
337 * fail. If preemption is already disabled, simply
338 * pass the test. When the BKL is removed, or becomes
339 * preemptible again, we will once again test this,
340 * so keep it in.
341 */
342 if (preempt_count()) {
343 printk(KERN_CONT "can not test ... force ");
344 return 0;
345 }
346
299 /* start the tracing */ 347 /* start the tracing */
300 tr->ctrl = 1; 348 ret = trace->init(tr);
301 trace->init(tr); 349 if (ret) {
350 warn_failed_init_tracer(trace, ret);
351 goto out;
352 }
302 353
303 /* reset the max latency */ 354 /* reset the max latency */
304 tracing_max_latency = 0; 355 tracing_max_latency = 0;
@@ -312,27 +363,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
312 local_irq_enable(); 363 local_irq_enable();
313 364
314 /* stop the tracing. */ 365 /* stop the tracing. */
315 tr->ctrl = 0; 366 tracing_stop();
316 trace->ctrl_update(tr);
317 /* check both trace buffers */ 367 /* check both trace buffers */
318 ret = trace_test_buffer(tr, NULL); 368 ret = trace_test_buffer(tr, NULL);
319 if (ret) 369 if (ret) {
370 tracing_start();
320 goto out; 371 goto out;
372 }
321 373
322 ret = trace_test_buffer(&max_tr, &count); 374 ret = trace_test_buffer(&max_tr, &count);
323 if (ret) 375 if (ret) {
376 tracing_start();
324 goto out; 377 goto out;
378 }
325 379
326 if (!ret && !count) { 380 if (!ret && !count) {
327 printk(KERN_CONT ".. no entries found .."); 381 printk(KERN_CONT ".. no entries found ..");
328 ret = -1; 382 ret = -1;
383 tracing_start();
329 goto out; 384 goto out;
330 } 385 }
331 386
332 /* do the test by disabling interrupts first this time */ 387 /* do the test by disabling interrupts first this time */
333 tracing_max_latency = 0; 388 tracing_max_latency = 0;
334 tr->ctrl = 1; 389 tracing_start();
335 trace->ctrl_update(tr);
336 preempt_disable(); 390 preempt_disable();
337 local_irq_disable(); 391 local_irq_disable();
338 udelay(100); 392 udelay(100);
@@ -341,8 +395,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
341 local_irq_enable(); 395 local_irq_enable();
342 396
343 /* stop the tracing. */ 397 /* stop the tracing. */
344 tr->ctrl = 0; 398 tracing_stop();
345 trace->ctrl_update(tr);
346 /* check both trace buffers */ 399 /* check both trace buffers */
347 ret = trace_test_buffer(tr, NULL); 400 ret = trace_test_buffer(tr, NULL);
348 if (ret) 401 if (ret)
@@ -358,6 +411,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
358 411
359 out: 412 out:
360 trace->reset(tr); 413 trace->reset(tr);
414 tracing_start();
361 tracing_max_latency = save_max; 415 tracing_max_latency = save_max;
362 416
363 return ret; 417 return ret;
@@ -423,8 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
423 wait_for_completion(&isrt); 477 wait_for_completion(&isrt);
424 478
425 /* start the tracing */ 479 /* start the tracing */
426 tr->ctrl = 1; 480 ret = trace->init(tr);
427 trace->init(tr); 481 if (ret) {
482 warn_failed_init_tracer(trace, ret);
483 return ret;
484 }
485
428 /* reset the max latency */ 486 /* reset the max latency */
429 tracing_max_latency = 0; 487 tracing_max_latency = 0;
430 488
@@ -448,8 +506,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
448 msleep(100); 506 msleep(100);
449 507
450 /* stop the tracing. */ 508 /* stop the tracing. */
451 tr->ctrl = 0; 509 tracing_stop();
452 trace->ctrl_update(tr);
453 /* check both trace buffers */ 510 /* check both trace buffers */
454 ret = trace_test_buffer(tr, NULL); 511 ret = trace_test_buffer(tr, NULL);
455 if (!ret) 512 if (!ret)
@@ -457,6 +514,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
457 514
458 515
459 trace->reset(tr); 516 trace->reset(tr);
517 tracing_start();
460 518
461 tracing_max_latency = save_max; 519 tracing_max_latency = save_max;
462 520
@@ -480,16 +538,20 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
480 int ret; 538 int ret;
481 539
482 /* start the tracing */ 540 /* start the tracing */
483 tr->ctrl = 1; 541 ret = trace->init(tr);
484 trace->init(tr); 542 if (ret) {
543 warn_failed_init_tracer(trace, ret);
544 return ret;
545 }
546
485 /* Sleep for a 1/10 of a second */ 547 /* Sleep for a 1/10 of a second */
486 msleep(100); 548 msleep(100);
487 /* stop the tracing. */ 549 /* stop the tracing. */
488 tr->ctrl = 0; 550 tracing_stop();
489 trace->ctrl_update(tr);
490 /* check the trace buffer */ 551 /* check the trace buffer */
491 ret = trace_test_buffer(tr, &count); 552 ret = trace_test_buffer(tr, &count);
492 trace->reset(tr); 553 trace->reset(tr);
554 tracing_start();
493 555
494 if (!ret && !count) { 556 if (!ret && !count) {
495 printk(KERN_CONT ".. no entries found .."); 557 printk(KERN_CONT ".. no entries found ..");
@@ -508,17 +570,48 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
508 int ret; 570 int ret;
509 571
510 /* start the tracing */ 572 /* start the tracing */
511 tr->ctrl = 1; 573 ret = trace->init(tr);
512 trace->init(tr); 574 if (ret) {
575 warn_failed_init_tracer(trace, ret);
576 return 0;
577 }
578
513 /* Sleep for a 1/10 of a second */ 579 /* Sleep for a 1/10 of a second */
514 msleep(100); 580 msleep(100);
515 /* stop the tracing. */ 581 /* stop the tracing. */
516 tr->ctrl = 0; 582 tracing_stop();
517 trace->ctrl_update(tr);
518 /* check the trace buffer */ 583 /* check the trace buffer */
519 ret = trace_test_buffer(tr, &count); 584 ret = trace_test_buffer(tr, &count);
520 trace->reset(tr); 585 trace->reset(tr);
586 tracing_start();
521 587
522 return ret; 588 return ret;
523} 589}
524#endif /* CONFIG_SYSPROF_TRACER */ 590#endif /* CONFIG_SYSPROF_TRACER */
591
592#ifdef CONFIG_BRANCH_TRACER
593int
594trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
595{
596 unsigned long count;
597 int ret;
598
599 /* start the tracing */
600 ret = trace->init(tr);
601 if (ret) {
602 warn_failed_init_tracer(trace, ret);
603 return ret;
604 }
605
606 /* Sleep for a 1/10 of a second */
607 msleep(100);
608 /* stop the tracing. */
609 tracing_stop();
610 /* check the trace buffer */
611 ret = trace_test_buffer(tr, &count);
612 trace->reset(tr);
613 tracing_start();
614
615 return ret;
616}
617#endif /* CONFIG_BRANCH_TRACER */
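
Every selftest above follows the same shape once converted away from tr->ctrl/ctrl_update. A condensed, hedged template of that flow, where the exercise step stands in for whatever each test really does:

static int my_tracer_selftest(struct tracer *trace, struct trace_array *tr)
{
	unsigned long count;
	int ret;

	ret = trace->init(tr);
	if (ret) {
		warn_failed_init_tracer(trace, ret);
		return ret;
	}

	/* ... exercise the tracer: sleep, disable irqs, wake a task, ... */

	tracing_stop();			/* freeze the buffers before reading them */
	ret = trace_test_buffer(tr, &count);
	trace->reset(tr);
	tracing_start();		/* turn global tracing back on for later tests */

	return ret;
}
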
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index be682b62fe58..0b863f2cbc8e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -48,7 +48,7 @@ static inline void check_stack(void)
48 if (!object_is_on_stack(&this_size)) 48 if (!object_is_on_stack(&this_size))
49 return; 49 return;
50 50
51 raw_local_irq_save(flags); 51 local_irq_save(flags);
52 __raw_spin_lock(&max_stack_lock); 52 __raw_spin_lock(&max_stack_lock);
53 53
54 /* a race could have already updated it */ 54 /* a race could have already updated it */
@@ -78,6 +78,7 @@ static inline void check_stack(void)
78 * on a new max, so it is far from a fast path. 78 * on a new max, so it is far from a fast path.
79 */ 79 */
80 while (i < max_stack_trace.nr_entries) { 80 while (i < max_stack_trace.nr_entries) {
81 int found = 0;
81 82
82 stack_dump_index[i] = this_size; 83 stack_dump_index[i] = this_size;
83 p = start; 84 p = start;
@@ -86,17 +87,19 @@ static inline void check_stack(void)
86 if (*p == stack_dump_trace[i]) { 87 if (*p == stack_dump_trace[i]) {
87 this_size = stack_dump_index[i++] = 88 this_size = stack_dump_index[i++] =
88 (top - p) * sizeof(unsigned long); 89 (top - p) * sizeof(unsigned long);
90 found = 1;
89 /* Start the search from here */ 91 /* Start the search from here */
90 start = p + 1; 92 start = p + 1;
91 } 93 }
92 } 94 }
93 95
94 i++; 96 if (!found)
97 i++;
95 } 98 }
96 99
97 out: 100 out:
98 __raw_spin_unlock(&max_stack_lock); 101 __raw_spin_unlock(&max_stack_lock);
99 raw_local_irq_restore(flags); 102 local_irq_restore(flags);
100} 103}
101 104
102static void 105static void
@@ -107,8 +110,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
107 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 110 if (unlikely(!ftrace_enabled || stack_trace_disabled))
108 return; 111 return;
109 112
110 resched = need_resched(); 113 resched = ftrace_preempt_disable();
111 preempt_disable_notrace();
112 114
113 cpu = raw_smp_processor_id(); 115 cpu = raw_smp_processor_id();
114 /* no atomic needed, we only modify this variable by this cpu */ 116 /* no atomic needed, we only modify this variable by this cpu */
@@ -120,10 +122,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
120 out: 122 out:
121 per_cpu(trace_active, cpu)--; 123 per_cpu(trace_active, cpu)--;
122 /* prevent recursion in schedule */ 124 /* prevent recursion in schedule */
123 if (resched) 125 ftrace_preempt_enable(resched);
124 preempt_enable_no_resched_notrace();
125 else
126 preempt_enable_notrace();
127} 126}
128 127
129static struct ftrace_ops trace_ops __read_mostly = 128static struct ftrace_ops trace_ops __read_mostly =
@@ -166,11 +165,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
166 if (ret < 0) 165 if (ret < 0)
167 return ret; 166 return ret;
168 167
169 raw_local_irq_save(flags); 168 local_irq_save(flags);
170 __raw_spin_lock(&max_stack_lock); 169 __raw_spin_lock(&max_stack_lock);
171 *ptr = val; 170 *ptr = val;
172 __raw_spin_unlock(&max_stack_lock); 171 __raw_spin_unlock(&max_stack_lock);
173 raw_local_irq_restore(flags); 172 local_irq_restore(flags);
174 173
175 return count; 174 return count;
176} 175}
@@ -184,11 +183,16 @@ static struct file_operations stack_max_size_fops = {
184static void * 183static void *
185t_next(struct seq_file *m, void *v, loff_t *pos) 184t_next(struct seq_file *m, void *v, loff_t *pos)
186{ 185{
187 long i = (long)m->private; 186 long i;
188 187
189 (*pos)++; 188 (*pos)++;
190 189
191 i++; 190 if (v == SEQ_START_TOKEN)
191 i = 0;
192 else {
193 i = *(long *)v;
194 i++;
195 }
192 196
193 if (i >= max_stack_trace.nr_entries || 197 if (i >= max_stack_trace.nr_entries ||
194 stack_dump_trace[i] == ULONG_MAX) 198 stack_dump_trace[i] == ULONG_MAX)
@@ -201,12 +205,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
201 205
202static void *t_start(struct seq_file *m, loff_t *pos) 206static void *t_start(struct seq_file *m, loff_t *pos)
203{ 207{
204 void *t = &m->private; 208 void *t = SEQ_START_TOKEN;
205 loff_t l = 0; 209 loff_t l = 0;
206 210
207 local_irq_disable(); 211 local_irq_disable();
208 __raw_spin_lock(&max_stack_lock); 212 __raw_spin_lock(&max_stack_lock);
209 213
214 if (*pos == 0)
215 return SEQ_START_TOKEN;
216
210 for (; t && l < *pos; t = t_next(m, t, &l)) 217 for (; t && l < *pos; t = t_next(m, t, &l))
211 ; 218 ;
212 219
@@ -235,10 +242,10 @@ static int trace_lookup_stack(struct seq_file *m, long i)
235 242
236static int t_show(struct seq_file *m, void *v) 243static int t_show(struct seq_file *m, void *v)
237{ 244{
238 long i = *(long *)v; 245 long i;
239 int size; 246 int size;
240 247
241 if (i < 0) { 248 if (v == SEQ_START_TOKEN) {
242 seq_printf(m, " Depth Size Location" 249 seq_printf(m, " Depth Size Location"
243 " (%d entries)\n" 250 " (%d entries)\n"
244 " ----- ---- --------\n", 251 " ----- ---- --------\n",
@@ -246,6 +253,8 @@ static int t_show(struct seq_file *m, void *v)
246 return 0; 253 return 0;
247 } 254 }
248 255
256 i = *(long *)v;
257
249 if (i >= max_stack_trace.nr_entries || 258 if (i >= max_stack_trace.nr_entries ||
250 stack_dump_trace[i] == ULONG_MAX) 259 stack_dump_trace[i] == ULONG_MAX)
251 return 0; 260 return 0;
@@ -275,10 +284,6 @@ static int stack_trace_open(struct inode *inode, struct file *file)
275 int ret; 284 int ret;
276 285
277 ret = seq_open(file, &stack_trace_seq_ops); 286 ret = seq_open(file, &stack_trace_seq_ops);
278 if (!ret) {
279 struct seq_file *m = file->private_data;
280 m->private = (void *)-1;
281 }
282 287
283 return ret; 288 return ret;
284} 289}
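
The seq_file rework above switches from stashing an index in m->private to the standard SEQ_START_TOKEN convention. A hedged, generic sketch of that convention, showing only the start and show callbacks (next/stop omitted, my_* names invented, #include <linux/seq_file.h> and <linux/kernel.h> assumed):

static long my_items[] = { 10, 20, 30 };

static void *my_seq_start(struct seq_file *m, loff_t *pos)
{
	if (*pos == 0)
		return SEQ_START_TOKEN;		/* first call: show() prints a header */
	if (*pos > ARRAY_SIZE(my_items))
		return NULL;
	return &my_items[*pos - 1];
}

static int my_seq_show(struct seq_file *m, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(m, " value\n -----\n");
		return 0;
	}
	seq_printf(m, " %ld\n", *(long *)v);
	return 0;
}
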
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 9587d3bcba55..54960edb96d0 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -261,27 +261,17 @@ static void stop_stack_trace(struct trace_array *tr)
261 mutex_unlock(&sample_timer_lock); 261 mutex_unlock(&sample_timer_lock);
262} 262}
263 263
264static void stack_trace_init(struct trace_array *tr) 264static int stack_trace_init(struct trace_array *tr)
265{ 265{
266 sysprof_trace = tr; 266 sysprof_trace = tr;
267 267
268 if (tr->ctrl) 268 start_stack_trace(tr);
269 start_stack_trace(tr); 269 return 0;
270} 270}
271 271
272static void stack_trace_reset(struct trace_array *tr) 272static void stack_trace_reset(struct trace_array *tr)
273{ 273{
274 if (tr->ctrl) 274 stop_stack_trace(tr);
275 stop_stack_trace(tr);
276}
277
278static void stack_trace_ctrl_update(struct trace_array *tr)
279{
280 /* When starting a new trace, reset the buffers */
281 if (tr->ctrl)
282 start_stack_trace(tr);
283 else
284 stop_stack_trace(tr);
285} 275}
286 276
287static struct tracer stack_trace __read_mostly = 277static struct tracer stack_trace __read_mostly =
@@ -289,7 +279,6 @@ static struct tracer stack_trace __read_mostly =
289 .name = "sysprof", 279 .name = "sysprof",
290 .init = stack_trace_init, 280 .init = stack_trace_init,
291 .reset = stack_trace_reset, 281 .reset = stack_trace_reset,
292 .ctrl_update = stack_trace_ctrl_update,
293#ifdef CONFIG_FTRACE_SELFTEST 282#ifdef CONFIG_FTRACE_SELFTEST
294 .selftest = trace_selftest_startup_sysprof, 283 .selftest = trace_selftest_startup_sysprof,
295#endif 284#endif
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index af8c85664882..79602740bbb5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex);
43 */ 43 */
44#define TRACEPOINT_HASH_BITS 6 44#define TRACEPOINT_HASH_BITS 6
45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) 45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
46static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -54,40 +55,43 @@ struct tracepoint_entry {
54 struct hlist_node hlist; 55 struct hlist_node hlist;
55 void **funcs; 56 void **funcs;
56 int refcount; /* Number of times armed. 0 if disarmed. */ 57 int refcount; /* Number of times armed. 0 if disarmed. */
57 struct rcu_head rcu;
58 void *oldptr;
59 unsigned char rcu_pending:1;
60 char name[0]; 58 char name[0];
61}; 59};
62 60
63static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; 61struct tp_probes {
62 union {
63 struct rcu_head rcu;
64 struct list_head list;
65 } u;
66 void *probes[0];
67};
64 68
65static void free_old_closure(struct rcu_head *head) 69static inline void *allocate_probes(int count)
66{ 70{
67 struct tracepoint_entry *entry = container_of(head, 71 struct tp_probes *p = kmalloc(count * sizeof(void *)
68 struct tracepoint_entry, rcu); 72 + sizeof(struct tp_probes), GFP_KERNEL);
69 kfree(entry->oldptr); 73 return p == NULL ? NULL : p->probes;
70 /* Make sure we free the data before setting the pending flag to 0 */
71 smp_wmb();
72 entry->rcu_pending = 0;
73} 74}
74 75
75static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) 76static void rcu_free_old_probes(struct rcu_head *head)
76{ 77{
77 if (!old) 78 kfree(container_of(head, struct tp_probes, u.rcu));
78 return; 79}
79 entry->oldptr = old; 80
80 entry->rcu_pending = 1; 81static inline void release_probes(void *old)
81 /* write rcu_pending before calling the RCU callback */ 82{
82 smp_wmb(); 83 if (old) {
83 call_rcu_sched(&entry->rcu, free_old_closure); 84 struct tp_probes *tp_probes = container_of(old,
85 struct tp_probes, probes[0]);
86 call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
87 }
84} 88}
85 89
86static void debug_print_probes(struct tracepoint_entry *entry) 90static void debug_print_probes(struct tracepoint_entry *entry)
87{ 91{
88 int i; 92 int i;
89 93
90 if (!tracepoint_debug) 94 if (!tracepoint_debug || !entry->funcs)
91 return; 95 return;
92 96
93 for (i = 0; entry->funcs[i]; i++) 97 for (i = 0; entry->funcs[i]; i++)
@@ -111,12 +115,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 return ERR_PTR(-EEXIST); 115 return ERR_PTR(-EEXIST);
112 } 116 }
113 /* + 2 : one for new probe, one for NULL func */ 117 /* + 2 : one for new probe, one for NULL func */
114 new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); 118 new = allocate_probes(nr_probes + 2);
115 if (new == NULL) 119 if (new == NULL)
116 return ERR_PTR(-ENOMEM); 120 return ERR_PTR(-ENOMEM);
117 if (old) 121 if (old)
118 memcpy(new, old, nr_probes * sizeof(void *)); 122 memcpy(new, old, nr_probes * sizeof(void *));
119 new[nr_probes] = probe; 123 new[nr_probes] = probe;
124 new[nr_probes + 1] = NULL;
120 entry->refcount = nr_probes + 1; 125 entry->refcount = nr_probes + 1;
121 entry->funcs = new; 126 entry->funcs = new;
122 debug_print_probes(entry); 127 debug_print_probes(entry);
@@ -132,7 +137,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
132 old = entry->funcs; 137 old = entry->funcs;
133 138
134 if (!old) 139 if (!old)
135 return NULL; 140 return ERR_PTR(-ENOENT);
136 141
137 debug_print_probes(entry); 142 debug_print_probes(entry);
138 /* (N -> M), (N > 1, M >= 0) probes */ 143 /* (N -> M), (N > 1, M >= 0) probes */
@@ -151,13 +156,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
151 int j = 0; 156 int j = 0;
152 /* N -> M, (N > 1, M > 0) */ 157 /* N -> M, (N > 1, M > 0) */
153 /* + 1 for NULL */ 158 /* + 1 for NULL */
154 new = kzalloc((nr_probes - nr_del + 1) 159 new = allocate_probes(nr_probes - nr_del + 1);
155 * sizeof(void *), GFP_KERNEL);
156 if (new == NULL) 160 if (new == NULL)
157 return ERR_PTR(-ENOMEM); 161 return ERR_PTR(-ENOMEM);
158 for (i = 0; old[i]; i++) 162 for (i = 0; old[i]; i++)
159 if ((probe && old[i] != probe)) 163 if ((probe && old[i] != probe))
160 new[j++] = old[i]; 164 new[j++] = old[i];
165 new[nr_probes - nr_del] = NULL;
161 entry->refcount = nr_probes - nr_del; 166 entry->refcount = nr_probes - nr_del;
162 entry->funcs = new; 167 entry->funcs = new;
163 } 168 }
@@ -215,7 +220,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
215 memcpy(&e->name[0], name, name_len); 220 memcpy(&e->name[0], name, name_len);
216 e->funcs = NULL; 221 e->funcs = NULL;
217 e->refcount = 0; 222 e->refcount = 0;
218 e->rcu_pending = 0;
219 hlist_add_head(&e->hlist, head); 223 hlist_add_head(&e->hlist, head);
220 return e; 224 return e;
221} 225}
@@ -224,32 +228,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
224 * Remove the tracepoint from the tracepoint hash table. Must be called with 228 * Remove the tracepoint from the tracepoint hash table. Must be called with
225 * mutex_lock held. 229 * mutex_lock held.
226 */ 230 */
227static int remove_tracepoint(const char *name) 231static inline void remove_tracepoint(struct tracepoint_entry *e)
228{ 232{
229 struct hlist_head *head;
230 struct hlist_node *node;
231 struct tracepoint_entry *e;
232 int found = 0;
233 size_t len = strlen(name) + 1;
234 u32 hash = jhash(name, len-1, 0);
235
236 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
237 hlist_for_each_entry(e, node, head, hlist) {
238 if (!strcmp(name, e->name)) {
239 found = 1;
240 break;
241 }
242 }
243 if (!found)
244 return -ENOENT;
245 if (e->refcount)
246 return -EBUSY;
247 hlist_del(&e->hlist); 233 hlist_del(&e->hlist);
248 /* Make sure the call_rcu_sched has been executed */
249 if (e->rcu_pending)
250 rcu_barrier_sched();
251 kfree(e); 234 kfree(e);
252 return 0;
253} 235}
254 236
255/* 237/*
@@ -280,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry,
280static void disable_tracepoint(struct tracepoint *elem) 262static void disable_tracepoint(struct tracepoint *elem)
281{ 263{
282 elem->state = 0; 264 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL);
283} 266}
284 267
285/** 268/**
@@ -320,6 +303,23 @@ static void tracepoint_update_probes(void)
320 module_update_tracepoints(); 303 module_update_tracepoints();
321} 304}
322 305
306static void *tracepoint_add_probe(const char *name, void *probe)
307{
308 struct tracepoint_entry *entry;
309 void *old;
310
311 entry = get_tracepoint(name);
312 if (!entry) {
313 entry = add_tracepoint(name);
314 if (IS_ERR(entry))
315 return entry;
316 }
317 old = tracepoint_entry_add_probe(entry, probe);
318 if (IS_ERR(old) && !entry->refcount)
319 remove_tracepoint(entry);
320 return old;
321}
322
323/** 323/**
324 * tracepoint_probe_register - Connect a probe to a tracepoint 324 * tracepoint_probe_register - Connect a probe to a tracepoint
325 * @name: tracepoint name 325 * @name: tracepoint name
@@ -330,44 +330,36 @@ static void tracepoint_update_probes(void)
330 */ 330 */
331int tracepoint_probe_register(const char *name, void *probe) 331int tracepoint_probe_register(const char *name, void *probe)
332{ 332{
333 struct tracepoint_entry *entry;
334 int ret = 0;
335 void *old; 333 void *old;
336 334
337 mutex_lock(&tracepoints_mutex); 335 mutex_lock(&tracepoints_mutex);
338 entry = get_tracepoint(name); 336 old = tracepoint_add_probe(name, probe);
339 if (!entry) {
340 entry = add_tracepoint(name);
341 if (IS_ERR(entry)) {
342 ret = PTR_ERR(entry);
343 goto end;
344 }
345 }
346 /*
347 * If we detect that a call_rcu_sched is pending for this tracepoint,
348 * make sure it's executed now.
349 */
350 if (entry->rcu_pending)
351 rcu_barrier_sched();
352 old = tracepoint_entry_add_probe(entry, probe);
353 if (IS_ERR(old)) {
354 ret = PTR_ERR(old);
355 goto end;
356 }
357 mutex_unlock(&tracepoints_mutex); 337 mutex_unlock(&tracepoints_mutex);
338 if (IS_ERR(old))
339 return PTR_ERR(old);
340
358 tracepoint_update_probes(); /* may update entry */ 341 tracepoint_update_probes(); /* may update entry */
359 mutex_lock(&tracepoints_mutex); 342 release_probes(old);
360 entry = get_tracepoint(name); 343 return 0;
361 WARN_ON(!entry);
362 if (entry->rcu_pending)
363 rcu_barrier_sched();
364 tracepoint_entry_free_old(entry, old);
365end:
366 mutex_unlock(&tracepoints_mutex);
367 return ret;
368} 344}
369EXPORT_SYMBOL_GPL(tracepoint_probe_register); 345EXPORT_SYMBOL_GPL(tracepoint_probe_register);
370 346
347static void *tracepoint_remove_probe(const char *name, void *probe)
348{
349 struct tracepoint_entry *entry;
350 void *old;
351
352 entry = get_tracepoint(name);
353 if (!entry)
354 return ERR_PTR(-ENOENT);
355 old = tracepoint_entry_remove_probe(entry, probe);
356 if (IS_ERR(old))
357 return old;
358 if (!entry->refcount)
359 remove_tracepoint(entry);
360 return old;
361}
362
371/** 363/**
372 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 364 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
373 * @name: tracepoint name 365 * @name: tracepoint name
@@ -380,38 +372,104 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
380 */ 372 */
381int tracepoint_probe_unregister(const char *name, void *probe) 373int tracepoint_probe_unregister(const char *name, void *probe)
382{ 374{
383 struct tracepoint_entry *entry;
384 void *old; 375 void *old;
385 int ret = -ENOENT;
386 376
387 mutex_lock(&tracepoints_mutex); 377 mutex_lock(&tracepoints_mutex);
388 entry = get_tracepoint(name); 378 old = tracepoint_remove_probe(name, probe);
389 if (!entry)
390 goto end;
391 if (entry->rcu_pending)
392 rcu_barrier_sched();
393 old = tracepoint_entry_remove_probe(entry, probe);
394 if (!old) {
395 printk(KERN_WARNING "Warning: Trying to unregister a probe"
396 "that doesn't exist\n");
397 goto end;
398 }
399 mutex_unlock(&tracepoints_mutex); 379 mutex_unlock(&tracepoints_mutex);
380 if (IS_ERR(old))
381 return PTR_ERR(old);
382
400 tracepoint_update_probes(); /* may update entry */ 383 tracepoint_update_probes(); /* may update entry */
384 release_probes(old);
385 return 0;
386}
387EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
388
389static LIST_HEAD(old_probes);
390static int need_update;
391
392static void tracepoint_add_old_probes(void *old)
393{
394 need_update = 1;
395 if (old) {
396 struct tp_probes *tp_probes = container_of(old,
397 struct tp_probes, probes[0]);
398 list_add(&tp_probes->u.list, &old_probes);
399 }
400}
401
402/**
403 * tracepoint_probe_register_noupdate - register a probe but not connect
404 * @name: tracepoint name
405 * @probe: probe handler
406 *
407 * caller must call tracepoint_probe_update_all()
408 */
409int tracepoint_probe_register_noupdate(const char *name, void *probe)
410{
411 void *old;
412
401 mutex_lock(&tracepoints_mutex); 413 mutex_lock(&tracepoints_mutex);
402 entry = get_tracepoint(name); 414 old = tracepoint_add_probe(name, probe);
403 if (!entry) 415 if (IS_ERR(old)) {
404 goto end; 416 mutex_unlock(&tracepoints_mutex);
405 if (entry->rcu_pending) 417 return PTR_ERR(old);
406 rcu_barrier_sched(); 418 }
407 tracepoint_entry_free_old(entry, old); 419 tracepoint_add_old_probes(old);
408 remove_tracepoint(name); /* Ignore busy error message */
409 ret = 0;
410end:
411 mutex_unlock(&tracepoints_mutex); 420 mutex_unlock(&tracepoints_mutex);
412 return ret; 421 return 0;
413} 422}
414EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); 423EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
424
425/**
426 * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect
427 * @name: tracepoint name
428 * @probe: probe function pointer
429 *
430 * caller must call tracepoint_probe_update_all()
431 */
432int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
433{
434 void *old;
435
436 mutex_lock(&tracepoints_mutex);
437 old = tracepoint_remove_probe(name, probe);
438 if (IS_ERR(old)) {
439 mutex_unlock(&tracepoints_mutex);
440 return PTR_ERR(old);
441 }
442 tracepoint_add_old_probes(old);
443 mutex_unlock(&tracepoints_mutex);
444 return 0;
445}
446EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
447
448/**
449 * tracepoint_probe_update_all - update tracepoints
450 */
451void tracepoint_probe_update_all(void)
452{
453 LIST_HEAD(release_probes);
454 struct tp_probes *pos, *next;
455
456 mutex_lock(&tracepoints_mutex);
457 if (!need_update) {
458 mutex_unlock(&tracepoints_mutex);
459 return;
460 }
461 if (!list_empty(&old_probes))
462 list_replace_init(&old_probes, &release_probes);
463 need_update = 0;
464 mutex_unlock(&tracepoints_mutex);
465
466 tracepoint_update_probes();
467 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
468 list_del(&pos->u.list);
469 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
470 }
471}
472EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
415 473
416/** 474/**
417 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. 475 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
@@ -483,3 +541,36 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
483 iter->tracepoint = NULL; 541 iter->tracepoint = NULL;
484} 542}
485EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 543EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
544
545#ifdef CONFIG_MODULES
546
547int tracepoint_module_notify(struct notifier_block *self,
548 unsigned long val, void *data)
549{
550 struct module *mod = data;
551
552 switch (val) {
553 case MODULE_STATE_COMING:
554 tracepoint_update_probe_range(mod->tracepoints,
555 mod->tracepoints + mod->num_tracepoints);
556 break;
557 case MODULE_STATE_GOING:
558 tracepoint_update_probe_range(mod->tracepoints,
559 mod->tracepoints + mod->num_tracepoints);
560 break;
561 }
562 return 0;
563}
564
565struct notifier_block tracepoint_module_nb = {
566 .notifier_call = tracepoint_module_notify,
567 .priority = 0,
568};
569
570static int init_tracepoints(void)
571{
572 return register_module_notifier(&tracepoint_module_nb);
573}
574__initcall(init_tracepoints);
575
576#endif /* CONFIG_MODULES */
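
A hedged sketch of a caller batching several probe registrations with the new _noupdate variants and paying the synchronization cost once via tracepoint_probe_update_all(); the tracepoint names are examples, and my_probe_switch/my_probe_wakeup stand for probe functions declared elsewhere with signatures matching those tracepoints.

static int register_my_probes(void)
{
	int ret;

	ret = tracepoint_probe_register_noupdate("sched_switch", my_probe_switch);
	if (ret)
		return ret;

	ret = tracepoint_probe_register_noupdate("sched_wakeup", my_probe_wakeup);
	if (ret) {
		tracepoint_probe_unregister_noupdate("sched_switch", my_probe_switch);
		tracepoint_probe_update_all();
		return ret;
	}

	/* one pass connects (or disconnects) everything queued above */
	tracepoint_probe_update_all();
	return 0;
}
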
diff --git a/kernel/user.c b/kernel/user.c
index 39d6159fae43..cec2224bc9f5 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -101,6 +101,8 @@ static int sched_create_user(struct user_struct *up)
101 if (IS_ERR(up->tg)) 101 if (IS_ERR(up->tg))
102 rc = -ENOMEM; 102 rc = -ENOMEM;
103 103
104 set_tg_uid(up);
105
104 return rc; 106 return rc;
105} 107}
106 108