aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2008-11-24 13:54:37 -0500
committerThomas Gleixner <tglx@linutronix.de>2008-11-24 13:54:37 -0500
commit3e1d7a6219ab64e13b10b1a77c0625db9a8bd8db (patch)
treec682da7317845d7b1336e3d8498cf83bdf8f5900 /kernel
parent42569c39917a08e8de1e8b5685463be7b74baebd (diff)
parent13d428afc007fcfcd6deeb215618f54cf9c0cae6 (diff)
Merge branch 'linus' into core/futexes
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.freezer2
-rw-r--r--kernel/Makefile11
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/audit_tree.c139
-rw-r--r--kernel/auditfilter.c14
-rw-r--r--kernel/auditsc.c9
-rw-r--r--kernel/cgroup.c291
-rw-r--r--kernel/cgroup_debug.c4
-rw-r--r--kernel/cgroup_freezer.c379
-rw-r--r--kernel/compat.c111
-rw-r--r--kernel/configs.c9
-rw-r--r--kernel/cpu.c27
-rw-r--r--kernel/cpuset.c50
-rw-r--r--kernel/dma-coherent.c2
-rw-r--r--kernel/dma.c2
-rw-r--r--kernel/exec_domain.c33
-rw-r--r--kernel/exit.c52
-rw-r--r--kernel/fork.c113
-rw-r--r--kernel/freezer.c154
-rw-r--r--kernel/futex.c11
-rw-r--r--kernel/hrtimer.c342
-rw-r--r--kernel/irq/autoprobe.c43
-rw-r--r--kernel/irq/chip.c102
-rw-r--r--kernel/irq/handle.c27
-rw-r--r--kernel/irq/internals.h7
-rw-r--r--kernel/irq/manage.c130
-rw-r--r--kernel/irq/migration.c14
-rw-r--r--kernel/irq/proc.c47
-rw-r--r--kernel/irq/resend.c6
-rw-r--r--kernel/irq/spurious.c162
-rw-r--r--kernel/itimer.c33
-rw-r--r--kernel/kallsyms.c18
-rw-r--r--kernel/kexec.c3
-rw-r--r--kernel/kgdb.c3
-rw-r--r--kernel/kmod.c67
-rw-r--r--kernel/kprobes.c25
-rw-r--r--kernel/ksysfs.c35
-rw-r--r--kernel/kthread.c10
-rw-r--r--kernel/lockdep.c17
-rw-r--r--kernel/marker.c36
-rw-r--r--kernel/module.c394
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/panic.c84
-rw-r--r--kernel/params.c276
-rw-r--r--kernel/posix-cpu-timers.c515
-rw-r--r--kernel/posix-timers.c165
-rw-r--r--kernel/power/Kconfig2
-rw-r--r--kernel/power/disk.c13
-rw-r--r--kernel/power/main.c9
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/process.c119
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/power/user.c10
-rw-r--r--kernel/printk.c81
-rw-r--r--kernel/profile.c43
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcuclassic.c337
-rw-r--r--kernel/rcupdate.c19
-rw-r--r--kernel/rcupreempt.c10
-rw-r--r--kernel/rcupreempt_trace.c7
-rw-r--r--kernel/rcutorture.c2
-rw-r--r--kernel/relay.c9
-rw-r--r--kernel/resource.c152
-rw-r--r--kernel/rtmutex.c3
-rw-r--r--kernel/sched.c518
-rw-r--r--kernel/sched_clock.c6
-rw-r--r--kernel/sched_debug.c50
-rw-r--r--kernel/sched_fair.c443
-rw-r--r--kernel/sched_features.h4
-rw-r--r--kernel/sched_idletask.c11
-rw-r--r--kernel/sched_rt.c66
-rw-r--r--kernel/sched_stats.h104
-rw-r--r--kernel/signal.c14
-rw-r--r--kernel/smp.c18
-rw-r--r--kernel/softirq.c157
-rw-r--r--kernel/softlockup.c2
-rw-r--r--kernel/stop_machine.c123
-rw-r--r--kernel/sys.c123
-rw-r--r--kernel/sys_ni.c8
-rw-r--r--kernel/sysctl.c145
-rw-r--r--kernel/time.c18
-rw-r--r--kernel/time/Kconfig1
-rw-r--r--kernel/time/clocksource.c3
-rw-r--r--kernel/time/jiffies.c1
-rw-r--r--kernel/time/ntp.c96
-rw-r--r--kernel/time/tick-broadcast.c19
-rw-r--r--kernel/time/tick-internal.h2
-rw-r--r--kernel/time/tick-sched.c133
-rw-r--r--kernel/time/timekeeping.c122
-rw-r--r--kernel/time/timer_list.c28
-rw-r--r--kernel/timer.c141
-rw-r--r--kernel/trace/Kconfig87
-rw-r--r--kernel/trace/Makefile10
-rw-r--r--kernel/trace/ftrace.c826
-rw-r--r--kernel/trace/ring_buffer.c2186
-rw-r--r--kernel/trace/trace.c1902
-rw-r--r--kernel/trace/trace.h215
-rw-r--r--kernel/trace/trace_boot.c126
-rw-r--r--kernel/trace/trace_functions.c4
-rw-r--r--kernel/trace/trace_irqsoff.c23
-rw-r--r--kernel/trace/trace_mmiotrace.c116
-rw-r--r--kernel/trace/trace_nop.c64
-rw-r--r--kernel/trace/trace_sched_switch.c137
-rw-r--r--kernel/trace/trace_sched_wakeup.c152
-rw-r--r--kernel/trace/trace_selftest.c101
-rw-r--r--kernel/trace/trace_stack.c314
-rw-r--r--kernel/trace/trace_sysprof.c4
-rw-r--r--kernel/tracepoint.c485
-rw-r--r--kernel/user.c4
-rw-r--r--kernel/utsname_sysctl.c5
-rw-r--r--kernel/wait.c14
-rw-r--r--kernel/workqueue.c54
112 files changed, 9540 insertions, 4457 deletions
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
new file mode 100644
index 000000000000..a3bb4cb52539
--- /dev/null
+++ b/kernel/Kconfig.freezer
@@ -0,0 +1,2 @@
1config FREEZER
2 def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df7c3e2..19fad003b19d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,9 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o
13 13
14CFLAGS_REMOVE_sched.o = -mno-spe 14ifdef CONFIG_FUNCTION_TRACER
15
16ifdef CONFIG_FTRACE
17# Do not trace debug files and internal ftrace files 15# Do not trace debug files and internal ftrace files
18CFLAGS_REMOVE_lockdep.o = -pg 16CFLAGS_REMOVE_lockdep.o = -pg
19CFLAGS_REMOVE_lockdep_proc.o = -pg 17CFLAGS_REMOVE_lockdep_proc.o = -pg
@@ -21,9 +19,10 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 19CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 20CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 21CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_sched.o = -mno-spe -pg 22CFLAGS_REMOVE_sched.o = -pg
25endif 23endif
26 24
25obj-$(CONFIG_FREEZER) += freezer.o
27obj-$(CONFIG_PROFILING) += profile.o 26obj-$(CONFIG_PROFILING) += profile.o
28obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 27obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
29obj-$(CONFIG_STACKTRACE) += stacktrace.o 28obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -55,6 +54,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
55obj-$(CONFIG_COMPAT) += compat.o 54obj-$(CONFIG_COMPAT) += compat.o
56obj-$(CONFIG_CGROUPS) += cgroup.o 55obj-$(CONFIG_CGROUPS) += cgroup.o
57obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o 56obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
57obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
58obj-$(CONFIG_CPUSETS) += cpuset.o 58obj-$(CONFIG_CPUSETS) += cpuset.o
59obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 59obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
60obj-$(CONFIG_UTS_NS) += utsname.o 60obj-$(CONFIG_UTS_NS) += utsname.o
@@ -83,9 +83,10 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
83obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 83obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
84obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 84obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
85obj-$(CONFIG_MARKERS) += marker.o 85obj-$(CONFIG_MARKERS) += marker.o
86obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
86obj-$(CONFIG_LATENCYTOP) += latencytop.o 87obj-$(CONFIG_LATENCYTOP) += latencytop.o
87obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o 88obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
88obj-$(CONFIG_FTRACE) += trace/ 89obj-$(CONFIG_FUNCTION_TRACER) += trace/
89obj-$(CONFIG_TRACING) += trace/ 90obj-$(CONFIG_TRACING) += trace/
90obj-$(CONFIG_SMP) += sched_cpupri.o 91obj-$(CONFIG_SMP) += sched_cpupri.o
91 92
diff --git a/kernel/acct.c b/kernel/acct.c
index dd68b9059418..f6006a60df5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -548,7 +548,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
548#endif 548#endif
549 549
550 spin_lock_irq(&current->sighand->siglock); 550 spin_lock_irq(&current->sighand->siglock);
551 tty = current->signal->tty; 551 tty = current->signal->tty; /* Safe as we hold the siglock */
552 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; 552 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
553 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 553 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
554 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 554 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f7921a2ecf16..8b509441f49a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,6 +24,7 @@ struct audit_chunk {
24 struct list_head trees; /* with root here */ 24 struct list_head trees; /* with root here */
25 int dead; 25 int dead;
26 int count; 26 int count;
27 atomic_long_t refs;
27 struct rcu_head head; 28 struct rcu_head head;
28 struct node { 29 struct node {
29 struct list_head list; 30 struct list_head list;
@@ -56,7 +57,8 @@ static LIST_HEAD(prune_list);
56 * tree is refcounted; one reference for "some rules on rules_list refer to 57 * tree is refcounted; one reference for "some rules on rules_list refer to
57 * it", one for each chunk with pointer to it. 58 * it", one for each chunk with pointer to it.
58 * 59 *
59 * chunk is refcounted by embedded inotify_watch. 60 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
61 * of watch contributes 1 to .refs).
60 * 62 *
61 * node.index allows to get from node.list to containing chunk. 63 * node.index allows to get from node.list to containing chunk.
62 * MSB of that sucker is stolen to mark taggings that we might have to 64 * MSB of that sucker is stolen to mark taggings that we might have to
@@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count)
121 INIT_LIST_HEAD(&chunk->hash); 123 INIT_LIST_HEAD(&chunk->hash);
122 INIT_LIST_HEAD(&chunk->trees); 124 INIT_LIST_HEAD(&chunk->trees);
123 chunk->count = count; 125 chunk->count = count;
126 atomic_long_set(&chunk->refs, 1);
124 for (i = 0; i < count; i++) { 127 for (i = 0; i < count; i++) {
125 INIT_LIST_HEAD(&chunk->owners[i].list); 128 INIT_LIST_HEAD(&chunk->owners[i].list);
126 chunk->owners[i].index = i; 129 chunk->owners[i].index = i;
@@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count)
129 return chunk; 132 return chunk;
130} 133}
131 134
132static void __free_chunk(struct rcu_head *rcu) 135static void free_chunk(struct audit_chunk *chunk)
133{ 136{
134 struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
135 int i; 137 int i;
136 138
137 for (i = 0; i < chunk->count; i++) { 139 for (i = 0; i < chunk->count; i++) {
@@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu)
141 kfree(chunk); 143 kfree(chunk);
142} 144}
143 145
144static inline void free_chunk(struct audit_chunk *chunk) 146void audit_put_chunk(struct audit_chunk *chunk)
145{ 147{
146 call_rcu(&chunk->head, __free_chunk); 148 if (atomic_long_dec_and_test(&chunk->refs))
149 free_chunk(chunk);
147} 150}
148 151
149void audit_put_chunk(struct audit_chunk *chunk) 152static void __put_chunk(struct rcu_head *rcu)
150{ 153{
151 put_inotify_watch(&chunk->watch); 154 struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
155 audit_put_chunk(chunk);
152} 156}
153 157
154enum {HASH_SIZE = 128}; 158enum {HASH_SIZE = 128};
@@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
176 180
177 list_for_each_entry_rcu(p, list, hash) { 181 list_for_each_entry_rcu(p, list, hash) {
178 if (p->watch.inode == inode) { 182 if (p->watch.inode == inode) {
179 get_inotify_watch(&p->watch); 183 atomic_long_inc(&p->refs);
180 return p; 184 return p;
181 } 185 }
182 } 186 }
@@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
194 198
195/* tagging and untagging inodes with trees */ 199/* tagging and untagging inodes with trees */
196 200
197static void untag_chunk(struct audit_chunk *chunk, struct node *p) 201static struct audit_chunk *find_chunk(struct node *p)
202{
203 int index = p->index & ~(1U<<31);
204 p -= index;
205 return container_of(p, struct audit_chunk, owners[0]);
206}
207
208static void untag_chunk(struct node *p)
198{ 209{
210 struct audit_chunk *chunk = find_chunk(p);
199 struct audit_chunk *new; 211 struct audit_chunk *new;
200 struct audit_tree *owner; 212 struct audit_tree *owner;
201 int size = chunk->count - 1; 213 int size = chunk->count - 1;
202 int i, j; 214 int i, j;
203 215
216 if (!pin_inotify_watch(&chunk->watch)) {
217 /*
218 * Filesystem is shutting down; all watches are getting
219 * evicted, just take it off the node list for this
220 * tree and let the eviction logics take care of the
221 * rest.
222 */
223 owner = p->owner;
224 if (owner->root == chunk) {
225 list_del_init(&owner->same_root);
226 owner->root = NULL;
227 }
228 list_del_init(&p->list);
229 p->owner = NULL;
230 put_tree(owner);
231 return;
232 }
233
234 spin_unlock(&hash_lock);
235
236 /*
237 * pin_inotify_watch() succeeded, so the watch won't go away
238 * from under us.
239 */
204 mutex_lock(&chunk->watch.inode->inotify_mutex); 240 mutex_lock(&chunk->watch.inode->inotify_mutex);
205 if (chunk->dead) { 241 if (chunk->dead) {
206 mutex_unlock(&chunk->watch.inode->inotify_mutex); 242 mutex_unlock(&chunk->watch.inode->inotify_mutex);
207 return; 243 goto out;
208 } 244 }
209 245
210 owner = p->owner; 246 owner = p->owner;
@@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
221 inotify_evict_watch(&chunk->watch); 257 inotify_evict_watch(&chunk->watch);
222 mutex_unlock(&chunk->watch.inode->inotify_mutex); 258 mutex_unlock(&chunk->watch.inode->inotify_mutex);
223 put_inotify_watch(&chunk->watch); 259 put_inotify_watch(&chunk->watch);
224 return; 260 goto out;
225 } 261 }
226 262
227 new = alloc_chunk(size); 263 new = alloc_chunk(size);
@@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
263 inotify_evict_watch(&chunk->watch); 299 inotify_evict_watch(&chunk->watch);
264 mutex_unlock(&chunk->watch.inode->inotify_mutex); 300 mutex_unlock(&chunk->watch.inode->inotify_mutex);
265 put_inotify_watch(&chunk->watch); 301 put_inotify_watch(&chunk->watch);
266 return; 302 goto out;
267 303
268Fallback: 304Fallback:
269 // do the best we can 305 // do the best we can
@@ -277,6 +313,9 @@ Fallback:
277 put_tree(owner); 313 put_tree(owner);
278 spin_unlock(&hash_lock); 314 spin_unlock(&hash_lock);
279 mutex_unlock(&chunk->watch.inode->inotify_mutex); 315 mutex_unlock(&chunk->watch.inode->inotify_mutex);
316out:
317 unpin_inotify_watch(&chunk->watch);
318 spin_lock(&hash_lock);
280} 319}
281 320
282static int create_chunk(struct inode *inode, struct audit_tree *tree) 321static int create_chunk(struct inode *inode, struct audit_tree *tree)
@@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
387 return 0; 426 return 0;
388} 427}
389 428
390static struct audit_chunk *find_chunk(struct node *p)
391{
392 int index = p->index & ~(1U<<31);
393 p -= index;
394 return container_of(p, struct audit_chunk, owners[0]);
395}
396
397static void kill_rules(struct audit_tree *tree) 429static void kill_rules(struct audit_tree *tree)
398{ 430{
399 struct audit_krule *rule, *next; 431 struct audit_krule *rule, *next;
@@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim)
431 spin_lock(&hash_lock); 463 spin_lock(&hash_lock);
432 while (!list_empty(&victim->chunks)) { 464 while (!list_empty(&victim->chunks)) {
433 struct node *p; 465 struct node *p;
434 struct audit_chunk *chunk;
435 466
436 p = list_entry(victim->chunks.next, struct node, list); 467 p = list_entry(victim->chunks.next, struct node, list);
437 chunk = find_chunk(p);
438 get_inotify_watch(&chunk->watch);
439 spin_unlock(&hash_lock);
440
441 untag_chunk(chunk, p);
442 468
443 put_inotify_watch(&chunk->watch); 469 untag_chunk(p);
444 spin_lock(&hash_lock);
445 } 470 }
446 spin_unlock(&hash_lock); 471 spin_unlock(&hash_lock);
447 put_tree(victim); 472 put_tree(victim);
@@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree)
469 494
470 while (!list_empty(&tree->chunks)) { 495 while (!list_empty(&tree->chunks)) {
471 struct node *node; 496 struct node *node;
472 struct audit_chunk *chunk;
473 497
474 node = list_entry(tree->chunks.next, struct node, list); 498 node = list_entry(tree->chunks.next, struct node, list);
475 499
@@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree)
477 if (!(node->index & (1U<<31))) 501 if (!(node->index & (1U<<31)))
478 break; 502 break;
479 503
480 chunk = find_chunk(node); 504 untag_chunk(node);
481 get_inotify_watch(&chunk->watch);
482 spin_unlock(&hash_lock);
483
484 untag_chunk(chunk, node);
485
486 put_inotify_watch(&chunk->watch);
487 spin_lock(&hash_lock);
488 } 505 }
489 if (!tree->root && !tree->goner) { 506 if (!tree->root && !tree->goner) {
490 tree->goner = 1; 507 tree->goner = 1;
@@ -532,7 +549,7 @@ void audit_trim_trees(void)
532 list_add(&cursor, &tree_list); 549 list_add(&cursor, &tree_list);
533 while (cursor.next != &tree_list) { 550 while (cursor.next != &tree_list) {
534 struct audit_tree *tree; 551 struct audit_tree *tree;
535 struct nameidata nd; 552 struct path path;
536 struct vfsmount *root_mnt; 553 struct vfsmount *root_mnt;
537 struct node *node; 554 struct node *node;
538 struct list_head list; 555 struct list_head list;
@@ -544,12 +561,12 @@ void audit_trim_trees(void)
544 list_add(&cursor, &tree->list); 561 list_add(&cursor, &tree->list);
545 mutex_unlock(&audit_filter_mutex); 562 mutex_unlock(&audit_filter_mutex);
546 563
547 err = path_lookup(tree->pathname, 0, &nd); 564 err = kern_path(tree->pathname, 0, &path);
548 if (err) 565 if (err)
549 goto skip_it; 566 goto skip_it;
550 567
551 root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); 568 root_mnt = collect_mounts(path.mnt, path.dentry);
552 path_put(&nd.path); 569 path_put(&path);
553 if (!root_mnt) 570 if (!root_mnt)
554 goto skip_it; 571 goto skip_it;
555 572
@@ -580,19 +597,19 @@ skip_it:
580} 597}
581 598
582static int is_under(struct vfsmount *mnt, struct dentry *dentry, 599static int is_under(struct vfsmount *mnt, struct dentry *dentry,
583 struct nameidata *nd) 600 struct path *path)
584{ 601{
585 if (mnt != nd->path.mnt) { 602 if (mnt != path->mnt) {
586 for (;;) { 603 for (;;) {
587 if (mnt->mnt_parent == mnt) 604 if (mnt->mnt_parent == mnt)
588 return 0; 605 return 0;
589 if (mnt->mnt_parent == nd->path.mnt) 606 if (mnt->mnt_parent == path->mnt)
590 break; 607 break;
591 mnt = mnt->mnt_parent; 608 mnt = mnt->mnt_parent;
592 } 609 }
593 dentry = mnt->mnt_mountpoint; 610 dentry = mnt->mnt_mountpoint;
594 } 611 }
595 return is_subdir(dentry, nd->path.dentry); 612 return is_subdir(dentry, path->dentry);
596} 613}
597 614
598int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 615int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
@@ -618,7 +635,7 @@ void audit_put_tree(struct audit_tree *tree)
618int audit_add_tree_rule(struct audit_krule *rule) 635int audit_add_tree_rule(struct audit_krule *rule)
619{ 636{
620 struct audit_tree *seed = rule->tree, *tree; 637 struct audit_tree *seed = rule->tree, *tree;
621 struct nameidata nd; 638 struct path path;
622 struct vfsmount *mnt, *p; 639 struct vfsmount *mnt, *p;
623 struct list_head list; 640 struct list_head list;
624 int err; 641 int err;
@@ -637,11 +654,11 @@ int audit_add_tree_rule(struct audit_krule *rule)
637 /* do not set rule->tree yet */ 654 /* do not set rule->tree yet */
638 mutex_unlock(&audit_filter_mutex); 655 mutex_unlock(&audit_filter_mutex);
639 656
640 err = path_lookup(tree->pathname, 0, &nd); 657 err = kern_path(tree->pathname, 0, &path);
641 if (err) 658 if (err)
642 goto Err; 659 goto Err;
643 mnt = collect_mounts(nd.path.mnt, nd.path.dentry); 660 mnt = collect_mounts(path.mnt, path.dentry);
644 path_put(&nd.path); 661 path_put(&path);
645 if (!mnt) { 662 if (!mnt) {
646 err = -ENOMEM; 663 err = -ENOMEM;
647 goto Err; 664 goto Err;
@@ -690,29 +707,29 @@ int audit_tag_tree(char *old, char *new)
690{ 707{
691 struct list_head cursor, barrier; 708 struct list_head cursor, barrier;
692 int failed = 0; 709 int failed = 0;
693 struct nameidata nd; 710 struct path path;
694 struct vfsmount *tagged; 711 struct vfsmount *tagged;
695 struct list_head list; 712 struct list_head list;
696 struct vfsmount *mnt; 713 struct vfsmount *mnt;
697 struct dentry *dentry; 714 struct dentry *dentry;
698 int err; 715 int err;
699 716
700 err = path_lookup(new, 0, &nd); 717 err = kern_path(new, 0, &path);
701 if (err) 718 if (err)
702 return err; 719 return err;
703 tagged = collect_mounts(nd.path.mnt, nd.path.dentry); 720 tagged = collect_mounts(path.mnt, path.dentry);
704 path_put(&nd.path); 721 path_put(&path);
705 if (!tagged) 722 if (!tagged)
706 return -ENOMEM; 723 return -ENOMEM;
707 724
708 err = path_lookup(old, 0, &nd); 725 err = kern_path(old, 0, &path);
709 if (err) { 726 if (err) {
710 drop_collected_mounts(tagged); 727 drop_collected_mounts(tagged);
711 return err; 728 return err;
712 } 729 }
713 mnt = mntget(nd.path.mnt); 730 mnt = mntget(path.mnt);
714 dentry = dget(nd.path.dentry); 731 dentry = dget(path.dentry);
715 path_put(&nd.path); 732 path_put(&path);
716 733
717 if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) 734 if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
718 follow_up(&mnt, &dentry); 735 follow_up(&mnt, &dentry);
@@ -733,7 +750,7 @@ int audit_tag_tree(char *old, char *new)
733 list_add(&cursor, &tree->list); 750 list_add(&cursor, &tree->list);
734 mutex_unlock(&audit_filter_mutex); 751 mutex_unlock(&audit_filter_mutex);
735 752
736 err = path_lookup(tree->pathname, 0, &nd); 753 err = kern_path(tree->pathname, 0, &path);
737 if (err) { 754 if (err) {
738 put_tree(tree); 755 put_tree(tree);
739 mutex_lock(&audit_filter_mutex); 756 mutex_lock(&audit_filter_mutex);
@@ -741,15 +758,15 @@ int audit_tag_tree(char *old, char *new)
741 } 758 }
742 759
743 spin_lock(&vfsmount_lock); 760 spin_lock(&vfsmount_lock);
744 if (!is_under(mnt, dentry, &nd)) { 761 if (!is_under(mnt, dentry, &path)) {
745 spin_unlock(&vfsmount_lock); 762 spin_unlock(&vfsmount_lock);
746 path_put(&nd.path); 763 path_put(&path);
747 put_tree(tree); 764 put_tree(tree);
748 mutex_lock(&audit_filter_mutex); 765 mutex_lock(&audit_filter_mutex);
749 continue; 766 continue;
750 } 767 }
751 spin_unlock(&vfsmount_lock); 768 spin_unlock(&vfsmount_lock);
752 path_put(&nd.path); 769 path_put(&path);
753 770
754 list_for_each_entry(p, &list, mnt_list) { 771 list_for_each_entry(p, &list, mnt_list) {
755 failed = tag_chunk(p->mnt_root->d_inode, tree); 772 failed = tag_chunk(p->mnt_root->d_inode, tree);
@@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
878static void destroy_watch(struct inotify_watch *watch) 895static void destroy_watch(struct inotify_watch *watch)
879{ 896{
880 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 897 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
881 free_chunk(chunk); 898 call_rcu(&chunk->head, __put_chunk);
882} 899}
883 900
884static const struct inotify_operations rtree_inotify_ops = { 901static const struct inotify_operations rtree_inotify_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b7d354e2b0ef..9fd85a4640a0 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list)
1094 list_for_each_entry_safe(p, n, in_list, ilist) { 1094 list_for_each_entry_safe(p, n, in_list, ilist) {
1095 list_del(&p->ilist); 1095 list_del(&p->ilist);
1096 inotify_rm_watch(audit_ih, &p->wdata); 1096 inotify_rm_watch(audit_ih, &p->wdata);
1097 /* the put matching the get in audit_do_del_rule() */ 1097 /* the unpin matching the pin in audit_do_del_rule() */
1098 put_inotify_watch(&p->wdata); 1098 unpin_inotify_watch(&p->wdata);
1099 } 1099 }
1100} 1100}
1101 1101
@@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry,
1389 /* Put parent on the inotify un-registration 1389 /* Put parent on the inotify un-registration
1390 * list. Grab a reference before releasing 1390 * list. Grab a reference before releasing
1391 * audit_filter_mutex, to be released in 1391 * audit_filter_mutex, to be released in
1392 * audit_inotify_unregister(). */ 1392 * audit_inotify_unregister().
1393 list_add(&parent->ilist, &inotify_list); 1393 * If filesystem is going away, just leave
1394 get_inotify_watch(&parent->wdata); 1394 * the sucker alone, eviction will take
1395 * care of it.
1396 */
1397 if (pin_inotify_watch(&parent->wdata))
1398 list_add(&parent->ilist, &inotify_list);
1395 } 1399 }
1396 } 1400 }
1397 } 1401 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 59cedfb040e7..cf5bc2f5f9c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -246,8 +246,8 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
246 unsigned n; 246 unsigned n;
247 if (unlikely(!ctx)) 247 if (unlikely(!ctx))
248 return 0; 248 return 0;
249
250 n = ctx->major; 249 n = ctx->major;
250
251 switch (audit_classify_syscall(ctx->arch, n)) { 251 switch (audit_classify_syscall(ctx->arch, n)) {
252 case 0: /* native */ 252 case 0: /* native */
253 if ((mask & AUDIT_PERM_WRITE) && 253 if ((mask & AUDIT_PERM_WRITE) &&
@@ -1204,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1204 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 1204 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
1205 context->return_code); 1205 context->return_code);
1206 1206
1207 mutex_lock(&tty_mutex); 1207 spin_lock_irq(&tsk->sighand->siglock);
1208 read_lock(&tasklist_lock);
1209 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 1208 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1210 tty = tsk->signal->tty->name; 1209 tty = tsk->signal->tty->name;
1211 else 1210 else
1212 tty = "(none)"; 1211 tty = "(none)";
1213 read_unlock(&tasklist_lock); 1212 spin_unlock_irq(&tsk->sighand->siglock);
1213
1214 audit_log_format(ab, 1214 audit_log_format(ab,
1215 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 1215 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
1216 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 1216 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1230,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1230 context->egid, context->sgid, context->fsgid, tty, 1230 context->egid, context->sgid, context->fsgid, tty,
1231 tsk->sessionid); 1231 tsk->sessionid);
1232 1232
1233 mutex_unlock(&tty_mutex);
1234 1233
1235 audit_log_task_info(ab, tsk); 1234 audit_log_task_info(ab, tsk);
1236 if (context->filterkey) { 1235 if (context->filterkey) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a0123d75ec9a..fe00b3b983a8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg)
241 struct cg_cgroup_link *link; 241 struct cg_cgroup_link *link;
242 struct cg_cgroup_link *saved_link; 242 struct cg_cgroup_link *saved_link;
243 243
244 write_lock(&css_set_lock);
245 hlist_del(&cg->hlist); 244 hlist_del(&cg->hlist);
246 css_set_count--; 245 css_set_count--;
247 246
@@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg)
251 list_del(&link->cgrp_link_list); 250 list_del(&link->cgrp_link_list);
252 kfree(link); 251 kfree(link);
253 } 252 }
254
255 write_unlock(&css_set_lock);
256} 253}
257 254
258static void __release_css_set(struct kref *k, int taskexit) 255static void __put_css_set(struct css_set *cg, int taskexit)
259{ 256{
260 int i; 257 int i;
261 struct css_set *cg = container_of(k, struct css_set, ref); 258 /*
262 259 * Ensure that the refcount doesn't hit zero while any readers
260 * can see it. Similar to atomic_dec_and_lock(), but for an
261 * rwlock
262 */
263 if (atomic_add_unless(&cg->refcount, -1, 1))
264 return;
265 write_lock(&css_set_lock);
266 if (!atomic_dec_and_test(&cg->refcount)) {
267 write_unlock(&css_set_lock);
268 return;
269 }
263 unlink_css_set(cg); 270 unlink_css_set(cg);
271 write_unlock(&css_set_lock);
264 272
265 rcu_read_lock(); 273 rcu_read_lock();
266 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
@@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit)
276 kfree(cg); 284 kfree(cg);
277} 285}
278 286
279static void release_css_set(struct kref *k)
280{
281 __release_css_set(k, 0);
282}
283
284static void release_css_set_taskexit(struct kref *k)
285{
286 __release_css_set(k, 1);
287}
288
289/* 287/*
290 * refcounted get/put for css_set objects 288 * refcounted get/put for css_set objects
291 */ 289 */
292static inline void get_css_set(struct css_set *cg) 290static inline void get_css_set(struct css_set *cg)
293{ 291{
294 kref_get(&cg->ref); 292 atomic_inc(&cg->refcount);
295} 293}
296 294
297static inline void put_css_set(struct css_set *cg) 295static inline void put_css_set(struct css_set *cg)
298{ 296{
299 kref_put(&cg->ref, release_css_set); 297 __put_css_set(cg, 0);
300} 298}
301 299
302static inline void put_css_set_taskexit(struct css_set *cg) 300static inline void put_css_set_taskexit(struct css_set *cg)
303{ 301{
304 kref_put(&cg->ref, release_css_set_taskexit); 302 __put_css_set(cg, 1);
305} 303}
306 304
307/* 305/*
@@ -427,7 +425,7 @@ static struct css_set *find_css_set(
427 return NULL; 425 return NULL;
428 } 426 }
429 427
430 kref_init(&res->ref); 428 atomic_set(&res->refcount, 1);
431 INIT_LIST_HEAD(&res->cg_links); 429 INIT_LIST_HEAD(&res->cg_links);
432 INIT_LIST_HEAD(&res->tasks); 430 INIT_LIST_HEAD(&res->tasks);
433 INIT_HLIST_NODE(&res->hlist); 431 INIT_HLIST_NODE(&res->hlist);
@@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = {
870 .remount_fs = cgroup_remount, 868 .remount_fs = cgroup_remount,
871}; 869};
872 870
871static void init_cgroup_housekeeping(struct cgroup *cgrp)
872{
873 INIT_LIST_HEAD(&cgrp->sibling);
874 INIT_LIST_HEAD(&cgrp->children);
875 INIT_LIST_HEAD(&cgrp->css_sets);
876 INIT_LIST_HEAD(&cgrp->release_list);
877 init_rwsem(&cgrp->pids_mutex);
878}
873static void init_cgroup_root(struct cgroupfs_root *root) 879static void init_cgroup_root(struct cgroupfs_root *root)
874{ 880{
875 struct cgroup *cgrp = &root->top_cgroup; 881 struct cgroup *cgrp = &root->top_cgroup;
@@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
878 root->number_of_cgroups = 1; 884 root->number_of_cgroups = 1;
879 cgrp->root = root; 885 cgrp->root = root;
880 cgrp->top_cgroup = cgrp; 886 cgrp->top_cgroup = cgrp;
881 INIT_LIST_HEAD(&cgrp->sibling); 887 init_cgroup_housekeeping(cgrp);
882 INIT_LIST_HEAD(&cgrp->children);
883 INIT_LIST_HEAD(&cgrp->css_sets);
884 INIT_LIST_HEAD(&cgrp->release_list);
885} 888}
886 889
887static int cgroup_test_super(struct super_block *sb, void *data) 890static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1728,7 +1731,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1728 1731
1729 read_lock(&css_set_lock); 1732 read_lock(&css_set_lock);
1730 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 1733 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
1731 count += atomic_read(&link->cg->ref.refcount); 1734 count += atomic_read(&link->cg->refcount);
1732 } 1735 }
1733 read_unlock(&css_set_lock); 1736 read_unlock(&css_set_lock);
1734 return count; 1737 return count;
@@ -1997,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
1997 * but we cannot guarantee that the information we produce is correct 2000 * but we cannot guarantee that the information we produce is correct
1998 * unless we produce it entirely atomically. 2001 * unless we produce it entirely atomically.
1999 * 2002 *
2000 * Upon tasks file open(), a struct ctr_struct is allocated, that
2001 * will have a pointer to an array (also allocated here). The struct
2002 * ctr_struct * is stored in file->private_data. Its resources will
2003 * be freed by release() when the file is closed. The array is used
2004 * to sprintf the PIDs and then used by read().
2005 */ 2003 */
2006struct ctr_struct {
2007 char *buf;
2008 int bufsz;
2009};
2010 2004
2011/* 2005/*
2012 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2006 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2045,10 +2039,13 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2045 struct cgroup *cgrp; 2039 struct cgroup *cgrp;
2046 struct cgroup_iter it; 2040 struct cgroup_iter it;
2047 struct task_struct *tsk; 2041 struct task_struct *tsk;
2042
2048 /* 2043 /*
2049 * Validate dentry by checking the superblock operations 2044 * Validate dentry by checking the superblock operations,
2045 * and make sure it's a directory.
2050 */ 2046 */
2051 if (dentry->d_sb->s_op != &cgroup_ops) 2047 if (dentry->d_sb->s_op != &cgroup_ops ||
2048 !S_ISDIR(dentry->d_inode->i_mode))
2052 goto err; 2049 goto err;
2053 2050
2054 ret = 0; 2051 ret = 0;
@@ -2088,42 +2085,132 @@ static int cmppid(const void *a, const void *b)
2088 return *(pid_t *)a - *(pid_t *)b; 2085 return *(pid_t *)a - *(pid_t *)b;
2089} 2086}
2090 2087
2088
2091/* 2089/*
2092 * Convert array 'a' of 'npids' pid_t's to a string of newline separated 2090 * seq_file methods for the "tasks" file. The seq_file position is the
2093 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return 2091 * next pid to display; the seq_file iterator is a pointer to the pid
2094 * count 'cnt' of how many chars would be written if buf were large enough. 2092 * in the cgroup->tasks_pids array.
2095 */ 2093 */
2096static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) 2094
2095static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2097{ 2096{
2098 int cnt = 0; 2097 /*
2099 int i; 2098 * Initially we receive a position value that corresponds to
2099 * one more than the last pid shown (or 0 on the first call or
2100 * after a seek to the start). Use a binary-search to find the
2101 * next pid to display, if any
2102 */
2103 struct cgroup *cgrp = s->private;
2104 int index = 0, pid = *pos;
2105 int *iter;
2106
2107 down_read(&cgrp->pids_mutex);
2108 if (pid) {
2109 int end = cgrp->pids_length;
2110
2111 while (index < end) {
2112 int mid = (index + end) / 2;
2113 if (cgrp->tasks_pids[mid] == pid) {
2114 index = mid;
2115 break;
2116 } else if (cgrp->tasks_pids[mid] <= pid)
2117 index = mid + 1;
2118 else
2119 end = mid;
2120 }
2121 }
2122 /* If we're off the end of the array, we're done */
2123 if (index >= cgrp->pids_length)
2124 return NULL;
2125 /* Update the abstract position to be the actual pid that we found */
2126 iter = cgrp->tasks_pids + index;
2127 *pos = *iter;
2128 return iter;
2129}
2100 2130
2101 for (i = 0; i < npids; i++) 2131static void cgroup_tasks_stop(struct seq_file *s, void *v)
2102 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); 2132{
2103 return cnt; 2133 struct cgroup *cgrp = s->private;
2134 up_read(&cgrp->pids_mutex);
2104} 2135}
2105 2136
2137static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2138{
2139 struct cgroup *cgrp = s->private;
2140 int *p = v;
2141 int *end = cgrp->tasks_pids + cgrp->pids_length;
2142
2143 /*
2144 * Advance to the next pid in the array. If this goes off the
2145 * end, we're done
2146 */
2147 p++;
2148 if (p >= end) {
2149 return NULL;
2150 } else {
2151 *pos = *p;
2152 return p;
2153 }
2154}
2155
/* seq_file ->show for the "tasks" file: print the pid at @v on its own line. */
static int cgroup_tasks_show(struct seq_file *s, void *v)
{
	int *pid = v;

	return seq_printf(s, "%d\n", *pid);
}
2160
2161static struct seq_operations cgroup_tasks_seq_operations = {
2162 .start = cgroup_tasks_start,
2163 .stop = cgroup_tasks_stop,
2164 .next = cgroup_tasks_next,
2165 .show = cgroup_tasks_show,
2166};
2167
2168static void release_cgroup_pid_array(struct cgroup *cgrp)
2169{
2170 down_write(&cgrp->pids_mutex);
2171 BUG_ON(!cgrp->pids_use_count);
2172 if (!--cgrp->pids_use_count) {
2173 kfree(cgrp->tasks_pids);
2174 cgrp->tasks_pids = NULL;
2175 cgrp->pids_length = 0;
2176 }
2177 up_write(&cgrp->pids_mutex);
2178}
2179
2180static int cgroup_tasks_release(struct inode *inode, struct file *file)
2181{
2182 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2183
2184 if (!(file->f_mode & FMODE_READ))
2185 return 0;
2186
2187 release_cgroup_pid_array(cgrp);
2188 return seq_release(inode, file);
2189}
2190
2191static struct file_operations cgroup_tasks_operations = {
2192 .read = seq_read,
2193 .llseek = seq_lseek,
2194 .write = cgroup_file_write,
2195 .release = cgroup_tasks_release,
2196};
2197
2106/* 2198/*
2107 * Handle an open on 'tasks' file. Prepare a buffer listing the 2199 * Handle an open on 'tasks' file. Prepare an array containing the
2108 * process id's of tasks currently attached to the cgroup being opened. 2200 * process id's of tasks currently attached to the cgroup being opened.
2109 *
2110 * Does not require any specific cgroup mutexes, and does not take any.
2111 */ 2201 */
2202
2112static int cgroup_tasks_open(struct inode *unused, struct file *file) 2203static int cgroup_tasks_open(struct inode *unused, struct file *file)
2113{ 2204{
2114 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2205 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2115 struct ctr_struct *ctr;
2116 pid_t *pidarray; 2206 pid_t *pidarray;
2117 int npids; 2207 int npids;
2118 char c; 2208 int retval;
2119 2209
2210 /* Nothing to do for write-only files */
2120 if (!(file->f_mode & FMODE_READ)) 2211 if (!(file->f_mode & FMODE_READ))
2121 return 0; 2212 return 0;
2122 2213
2123 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
2124 if (!ctr)
2125 goto err0;
2126
2127 /* 2214 /*
2128 * If cgroup gets more users after we read count, we won't have 2215 * If cgroup gets more users after we read count, we won't have
2129 * enough space - tough. This race is indistinguishable to the 2216 * enough space - tough. This race is indistinguishable to the
@@ -2131,57 +2218,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2131 * show up until sometime later on. 2218 * show up until sometime later on.
2132 */ 2219 */
2133 npids = cgroup_task_count(cgrp); 2220 npids = cgroup_task_count(cgrp);
2134 if (npids) { 2221 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2135 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); 2222 if (!pidarray)
2136 if (!pidarray) 2223 return -ENOMEM;
2137 goto err1; 2224 npids = pid_array_load(pidarray, npids, cgrp);
2138 2225 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2139 npids = pid_array_load(pidarray, npids, cgrp);
2140 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2141
2142 /* Call pid_array_to_buf() twice, first just to get bufsz */
2143 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
2144 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
2145 if (!ctr->buf)
2146 goto err2;
2147 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
2148
2149 kfree(pidarray);
2150 } else {
2151 ctr->buf = NULL;
2152 ctr->bufsz = 0;
2153 }
2154 file->private_data = ctr;
2155 return 0;
2156
2157err2:
2158 kfree(pidarray);
2159err1:
2160 kfree(ctr);
2161err0:
2162 return -ENOMEM;
2163}
2164
2165static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
2166 struct cftype *cft,
2167 struct file *file, char __user *buf,
2168 size_t nbytes, loff_t *ppos)
2169{
2170 struct ctr_struct *ctr = file->private_data;
2171 2226
2172 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); 2227 /*
2173} 2228 * Store the array in the cgroup, freeing the old
2229 * array if necessary
2230 */
2231 down_write(&cgrp->pids_mutex);
2232 kfree(cgrp->tasks_pids);
2233 cgrp->tasks_pids = pidarray;
2234 cgrp->pids_length = npids;
2235 cgrp->pids_use_count++;
2236 up_write(&cgrp->pids_mutex);
2174 2237
2175static int cgroup_tasks_release(struct inode *unused_inode, 2238 file->f_op = &cgroup_tasks_operations;
2176 struct file *file)
2177{
2178 struct ctr_struct *ctr;
2179 2239
2180 if (file->f_mode & FMODE_READ) { 2240 retval = seq_open(file, &cgroup_tasks_seq_operations);
2181 ctr = file->private_data; 2241 if (retval) {
2182 kfree(ctr->buf); 2242 release_cgroup_pid_array(cgrp);
2183 kfree(ctr); 2243 return retval;
2184 } 2244 }
2245 ((struct seq_file *)file->private_data)->private = cgrp;
2185 return 0; 2246 return 0;
2186} 2247}
2187 2248
@@ -2210,7 +2271,6 @@ static struct cftype files[] = {
2210 { 2271 {
2211 .name = "tasks", 2272 .name = "tasks",
2212 .open = cgroup_tasks_open, 2273 .open = cgroup_tasks_open,
2213 .read = cgroup_tasks_read,
2214 .write_u64 = cgroup_tasks_write, 2274 .write_u64 = cgroup_tasks_write,
2215 .release = cgroup_tasks_release, 2275 .release = cgroup_tasks_release,
2216 .private = FILE_TASKLIST, 2276 .private = FILE_TASKLIST,
@@ -2300,10 +2360,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2300 2360
2301 mutex_lock(&cgroup_mutex); 2361 mutex_lock(&cgroup_mutex);
2302 2362
2303 INIT_LIST_HEAD(&cgrp->sibling); 2363 init_cgroup_housekeeping(cgrp);
2304 INIT_LIST_HEAD(&cgrp->children);
2305 INIT_LIST_HEAD(&cgrp->css_sets);
2306 INIT_LIST_HEAD(&cgrp->release_list);
2307 2364
2308 cgrp->parent = parent; 2365 cgrp->parent = parent;
2309 cgrp->root = parent->root; 2366 cgrp->root = parent->root;
@@ -2418,10 +2475,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2418 mutex_unlock(&cgroup_mutex); 2475 mutex_unlock(&cgroup_mutex);
2419 return -EBUSY; 2476 return -EBUSY;
2420 } 2477 }
2421 2478 mutex_unlock(&cgroup_mutex);
2422 parent = cgrp->parent;
2423 root = cgrp->root;
2424 sb = root->sb;
2425 2479
2426 /* 2480 /*
2427 * Call pre_destroy handlers of subsys. Notify subsystems 2481 * Call pre_destroy handlers of subsys. Notify subsystems
@@ -2429,7 +2483,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2429 */ 2483 */
2430 cgroup_call_pre_destroy(cgrp); 2484 cgroup_call_pre_destroy(cgrp);
2431 2485
2432 if (cgroup_has_css_refs(cgrp)) { 2486 mutex_lock(&cgroup_mutex);
2487 parent = cgrp->parent;
2488 root = cgrp->root;
2489 sb = root->sb;
2490
2491 if (atomic_read(&cgrp->count)
2492 || !list_empty(&cgrp->children)
2493 || cgroup_has_css_refs(cgrp)) {
2433 mutex_unlock(&cgroup_mutex); 2494 mutex_unlock(&cgroup_mutex);
2434 return -EBUSY; 2495 return -EBUSY;
2435 } 2496 }
@@ -2443,7 +2504,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2443 list_del(&cgrp->sibling); 2504 list_del(&cgrp->sibling);
2444 spin_lock(&cgrp->dentry->d_lock); 2505 spin_lock(&cgrp->dentry->d_lock);
2445 d = dget(cgrp->dentry); 2506 d = dget(cgrp->dentry);
2446 cgrp->dentry = NULL;
2447 spin_unlock(&d->d_lock); 2507 spin_unlock(&d->d_lock);
2448 2508
2449 cgroup_d_remove_dir(d); 2509 cgroup_d_remove_dir(d);
@@ -2495,8 +2555,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2495int __init cgroup_init_early(void) 2555int __init cgroup_init_early(void)
2496{ 2556{
2497 int i; 2557 int i;
2498 kref_init(&init_css_set.ref); 2558 atomic_set(&init_css_set.refcount, 1);
2499 kref_get(&init_css_set.ref);
2500 INIT_LIST_HEAD(&init_css_set.cg_links); 2559 INIT_LIST_HEAD(&init_css_set.cg_links);
2501 INIT_LIST_HEAD(&init_css_set.tasks); 2560 INIT_LIST_HEAD(&init_css_set.tasks);
2502 INIT_HLIST_NODE(&init_css_set.hlist); 2561 INIT_HLIST_NODE(&init_css_set.hlist);
@@ -2735,6 +2794,8 @@ void cgroup_fork_callbacks(struct task_struct *child)
2735 * Called on every change to mm->owner. mm_init_owner() does not 2794 * Called on every change to mm->owner. mm_init_owner() does not
2736 * invoke this routine, since it assigns the mm->owner the first time 2795 * invoke this routine, since it assigns the mm->owner the first time
2737 * and does not change it. 2796 * and does not change it.
2797 *
2798 * The callbacks are invoked with mmap_sem held in read mode.
2738 */ 2799 */
2739void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) 2800void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2740{ 2801{
@@ -2750,7 +2811,7 @@ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2750 if (oldcgrp == newcgrp) 2811 if (oldcgrp == newcgrp)
2751 continue; 2812 continue;
2752 if (ss->mm_owner_changed) 2813 if (ss->mm_owner_changed)
2753 ss->mm_owner_changed(ss, oldcgrp, newcgrp); 2814 ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
2754 } 2815 }
2755 } 2816 }
2756} 2817}
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index c3dc3aba4c02..daca6209202d 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
57 u64 count; 57 u64 count;
58 58
59 rcu_read_lock(); 59 rcu_read_lock();
60 count = atomic_read(&current->cgroups->ref.refcount); 60 count = atomic_read(&current->cgroups->refcount);
61 rcu_read_unlock(); 61 rcu_read_unlock();
62 return count; 62 return count;
63} 63}
@@ -90,7 +90,7 @@ static struct cftype files[] = {
90 { 90 {
91 .name = "releasable", 91 .name = "releasable",
92 .read_u64 = releasable_read, 92 .read_u64 = releasable_read,
93 } 93 },
94}; 94};
95 95
96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
new file mode 100644
index 000000000000..fb249e2bcada
--- /dev/null
+++ b/kernel/cgroup_freezer.c
@@ -0,0 +1,379 @@
1/*
2 * cgroup_freezer.c - control group freezer subsystem
3 *
4 * Copyright IBM Corporation, 2007
5 *
6 * Author : Cedric Le Goater <clg@fr.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of version 2.1 of the GNU Lesser General Public License
10 * as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it would be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15 */
16
17#include <linux/module.h>
18#include <linux/cgroup.h>
19#include <linux/fs.h>
20#include <linux/uaccess.h>
21#include <linux/freezer.h>
22#include <linux/seq_file.h>
23
/*
 * Freezer state of a cgroup.  The values are used as indices into
 * freezer_state_strs[], so the order of the two must stay in sync.
 */
enum freezer_state {
	CGROUP_THAWED = 0,	/* state assigned by freezer_create() */
	CGROUP_FREEZING,	/* set on entry to try_to_freeze_cgroup() */
	CGROUP_FROZEN,		/* set by update_freezer_state() */
};
29
30struct freezer {
31 struct cgroup_subsys_state css;
32 enum freezer_state state;
33 spinlock_t lock; /* protects _writes_ to state */
34};
35
36static inline struct freezer *cgroup_freezer(
37 struct cgroup *cgroup)
38{
39 return container_of(
40 cgroup_subsys_state(cgroup, freezer_subsys_id),
41 struct freezer, css);
42}
43
44static inline struct freezer *task_freezer(struct task_struct *task)
45{
46 return container_of(task_subsys_state(task, freezer_subsys_id),
47 struct freezer, css);
48}
49
50int cgroup_frozen(struct task_struct *task)
51{
52 struct freezer *freezer;
53 enum freezer_state state;
54
55 task_lock(task);
56 freezer = task_freezer(task);
57 state = freezer->state;
58 task_unlock(task);
59
60 return state == CGROUP_FROZEN;
61}
62
/*
 * Names of the freezer states, indexed by enum freezer_state.  Both the
 * pointers and the strings are constant; the table is only ever read.
 *
 * cgroups_write_string() limits the size of freezer state strings to
 * CGROUP_LOCAL_BUFFER_SIZE
 */
static const char * const freezer_state_strs[] = {
	"THAWED",
	"FREEZING",
	"FROZEN",
};
72
73/*
74 * State diagram
75 * Transitions are caused by userspace writes to the freezer.state file.
76 * The values in parenthesis are state labels. The rest are edge labels.
77 *
78 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
79 *    ^ ^                    |                     |
80 *    | \_______THAWED_______/                     |
81 *    \__________________________THAWED____________/
82 */
83
84struct cgroup_subsys freezer_subsys;
85
86/* Locks taken and their ordering
87 * ------------------------------
88 * css_set_lock
89 * cgroup_mutex (AKA cgroup_lock)
90 * task->alloc_lock (AKA task_lock)
91 * freezer->lock
92 * task->sighand->siglock
93 *
94 * cgroup code forces css_set_lock to be taken before task->alloc_lock
95 *
96 * freezer_create(), freezer_destroy():
97 * cgroup_mutex [ by cgroup core ]
98 *
99 * can_attach():
100 * cgroup_mutex
101 *
102 * cgroup_frozen():
103 * task->alloc_lock (to get task's cgroup)
104 *
105 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
106 * (task->alloc_lock is not taken: the new task is not on the tasklist yet)
107 * freezer->lock
108 * sighand->siglock (if the cgroup is freezing)
109 *
110 * freezer_read():
111 * cgroup_mutex
112 * freezer->lock
113 * read_lock css_set_lock (cgroup iterator start)
114 *
115 * freezer_write() (freeze):
116 * cgroup_mutex
117 * freezer->lock
118 * read_lock css_set_lock (cgroup iterator start)
119 * sighand->siglock
120 *
121 * freezer_write() (unfreeze):
122 * cgroup_mutex
123 * freezer->lock
124 * read_lock css_set_lock (cgroup iterator start)
125 * task->alloc_lock (to prevent races with freeze_task())
126 * sighand->siglock
127 */
128static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
129 struct cgroup *cgroup)
130{
131 struct freezer *freezer;
132
133 freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
134 if (!freezer)
135 return ERR_PTR(-ENOMEM);
136
137 spin_lock_init(&freezer->lock);
138 freezer->state = CGROUP_THAWED;
139 return &freezer->css;
140}
141
/* Free the freezer data that freezer_create() allocated for @cgroup. */
static void freezer_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup)
{
	struct freezer *doomed = cgroup_freezer(cgroup);

	kfree(doomed);
}
147
148/* Task is frozen or will freeze immediately when next it gets woken */
149static bool is_task_frozen_enough(struct task_struct *task)
150{
151 return frozen(task) ||
152 (task_is_stopped_or_traced(task) && freezing(task));
153}
154
155/*
156 * The call to cgroup_lock() in the freezer.state write method prevents
157 * a write to that file racing against an attach, and hence the
158 * can_attach() result will remain valid until the attach completes.
159 */
160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup,
162 struct task_struct *task)
163{
164 struct freezer *freezer;
165
166 /*
167 * Anything frozen can't move or be moved to/from.
168 *
169 * Since orig_freezer->state == FROZEN means that @task has been
170 * frozen, so it's sufficient to check the latter condition.
171 */
172
173 if (is_task_frozen_enough(task))
174 return -EBUSY;
175
176 freezer = cgroup_freezer(new_cgroup);
177 if (freezer->state == CGROUP_FROZEN)
178 return -EBUSY;
179
180 return 0;
181}
182
183static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
184{
185 struct freezer *freezer;
186
187 /*
188 * No lock is needed, since the task isn't on tasklist yet,
189 * so it can't be moved to another cgroup, which means the
190 * freezer won't be removed and will be valid during this
191 * function call.
192 */
193 freezer = task_freezer(task);
194
195 /*
196 * The root cgroup is non-freezable, so we can skip the
197 * following check.
198 */
199 if (!freezer->css.cgroup->parent)
200 return;
201
202 spin_lock_irq(&freezer->lock);
203 BUG_ON(freezer->state == CGROUP_FROZEN);
204
205 /* Locking avoids race with FREEZING -> THAWED transitions. */
206 if (freezer->state == CGROUP_FREEZING)
207 freeze_task(task, true);
208 spin_unlock_irq(&freezer->lock);
209}
210
211/*
212 * caller must hold freezer->lock
213 */
214static void update_freezer_state(struct cgroup *cgroup,
215 struct freezer *freezer)
216{
217 struct cgroup_iter it;
218 struct task_struct *task;
219 unsigned int nfrozen = 0, ntotal = 0;
220
221 cgroup_iter_start(cgroup, &it);
222 while ((task = cgroup_iter_next(cgroup, &it))) {
223 ntotal++;
224 if (is_task_frozen_enough(task))
225 nfrozen++;
226 }
227
228 /*
229 * Transition to FROZEN when no new tasks can be added ensures
230 * that we never exist in the FROZEN state while there are unfrozen
231 * tasks.
232 */
233 if (nfrozen == ntotal)
234 freezer->state = CGROUP_FROZEN;
235 else if (nfrozen > 0)
236 freezer->state = CGROUP_FREEZING;
237 else
238 freezer->state = CGROUP_THAWED;
239 cgroup_iter_end(cgroup, &it);
240}
241
242static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
243 struct seq_file *m)
244{
245 struct freezer *freezer;
246 enum freezer_state state;
247
248 if (!cgroup_lock_live_group(cgroup))
249 return -ENODEV;
250
251 freezer = cgroup_freezer(cgroup);
252 spin_lock_irq(&freezer->lock);
253 state = freezer->state;
254 if (state == CGROUP_FREEZING) {
255 /* We change from FREEZING to FROZEN lazily if the cgroup was
256 * only partially frozen when we exitted write. */
257 update_freezer_state(cgroup, freezer);
258 state = freezer->state;
259 }
260 spin_unlock_irq(&freezer->lock);
261 cgroup_unlock();
262
263 seq_puts(m, freezer_state_strs[state]);
264 seq_putc(m, '\n');
265 return 0;
266}
267
268static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
269{
270 struct cgroup_iter it;
271 struct task_struct *task;
272 unsigned int num_cant_freeze_now = 0;
273
274 freezer->state = CGROUP_FREEZING;
275 cgroup_iter_start(cgroup, &it);
276 while ((task = cgroup_iter_next(cgroup, &it))) {
277 if (!freeze_task(task, true))
278 continue;
279 if (is_task_frozen_enough(task))
280 continue;
281 if (!freezing(task) && !freezer_should_skip(task))
282 num_cant_freeze_now++;
283 }
284 cgroup_iter_end(cgroup, &it);
285
286 return num_cant_freeze_now ? -EBUSY : 0;
287}
288
289static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
290{
291 struct cgroup_iter it;
292 struct task_struct *task;
293
294 cgroup_iter_start(cgroup, &it);
295 while ((task = cgroup_iter_next(cgroup, &it))) {
296 thaw_process(task);
297 }
298 cgroup_iter_end(cgroup, &it);
299
300 freezer->state = CGROUP_THAWED;
301}
302
303static int freezer_change_state(struct cgroup *cgroup,
304 enum freezer_state goal_state)
305{
306 struct freezer *freezer;
307 int retval = 0;
308
309 freezer = cgroup_freezer(cgroup);
310
311 spin_lock_irq(&freezer->lock);
312
313 update_freezer_state(cgroup, freezer);
314 if (goal_state == freezer->state)
315 goto out;
316
317 switch (goal_state) {
318 case CGROUP_THAWED:
319 unfreeze_cgroup(cgroup, freezer);
320 break;
321 case CGROUP_FROZEN:
322 retval = try_to_freeze_cgroup(cgroup, freezer);
323 break;
324 default:
325 BUG();
326 }
327out:
328 spin_unlock_irq(&freezer->lock);
329
330 return retval;
331}
332
333static int freezer_write(struct cgroup *cgroup,
334 struct cftype *cft,
335 const char *buffer)
336{
337 int retval;
338 enum freezer_state goal_state;
339
340 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
341 goal_state = CGROUP_THAWED;
342 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
343 goal_state = CGROUP_FROZEN;
344 else
345 return -EINVAL;
346
347 if (!cgroup_lock_live_group(cgroup))
348 return -ENODEV;
349 retval = freezer_change_state(cgroup, goal_state);
350 cgroup_unlock();
351 return retval;
352}
353
354static struct cftype files[] = {
355 {
356 .name = "state",
357 .read_seq_string = freezer_read,
358 .write_string = freezer_write,
359 },
360};
361
362static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
363{
364 if (!cgroup->parent)
365 return 0;
366 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
367}
368
369struct cgroup_subsys freezer_subsys = {
370 .name = "freezer",
371 .create = freezer_create,
372 .destroy = freezer_destroy,
373 .populate = freezer_populate,
374 .subsys_id = freezer_subsys_id,
375 .can_attach = freezer_can_attach,
376 .attach = NULL,
377 .fork = freezer_fork,
378 .exit = NULL,
379};
diff --git a/kernel/compat.c b/kernel/compat.c
index 32c254a8ab9a..8eafe3eb50d9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,9 +23,68 @@
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h>
26 27
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28 29
30/*
31 * Note that the native side is already converted to a timespec, because
32 * that's what we want anyway.
33 */
34static int compat_get_timeval(struct timespec *o,
35 struct compat_timeval __user *i)
36{
37 long usec;
38
39 if (get_user(o->tv_sec, &i->tv_sec) ||
40 get_user(usec, &i->tv_usec))
41 return -EFAULT;
42 o->tv_nsec = usec * 1000;
43 return 0;
44}
45
46static int compat_put_timeval(struct compat_timeval __user *o,
47 struct timeval *i)
48{
49 return (put_user(i->tv_sec, &o->tv_sec) ||
50 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
51}
52
53asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
54 struct timezone __user *tz)
55{
56 if (tv) {
57 struct timeval ktv;
58 do_gettimeofday(&ktv);
59 if (compat_put_timeval(tv, &ktv))
60 return -EFAULT;
61 }
62 if (tz) {
63 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
64 return -EFAULT;
65 }
66
67 return 0;
68}
69
70asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
71 struct timezone __user *tz)
72{
73 struct timespec kts;
74 struct timezone ktz;
75
76 if (tv) {
77 if (compat_get_timeval(&kts, tv))
78 return -EFAULT;
79 }
80 if (tz) {
81 if (copy_from_user(&ktz, tz, sizeof(ktz)))
82 return -EFAULT;
83 }
84
85 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
86}
87
29int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) 88int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
30{ 89{
31 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || 90 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
@@ -150,49 +209,23 @@ asmlinkage long compat_sys_setitimer(int which,
150 return 0; 209 return 0;
151} 210}
152 211
212static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
213{
214 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
215}
216
153asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) 217asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
154{ 218{
155 /*
156 * In the SMP world we might just be unlucky and have one of
157 * the times increment as we use it. Since the value is an
158 * atomically safe type this is just fine. Conceptually its
159 * as if the syscall took an instant longer to occur.
160 */
161 if (tbuf) { 219 if (tbuf) {
220 struct tms tms;
162 struct compat_tms tmp; 221 struct compat_tms tmp;
163 struct task_struct *tsk = current; 222
164 struct task_struct *t; 223 do_sys_times(&tms);
165 cputime_t utime, stime, cutime, cstime; 224 /* Convert our struct tms to the compat version. */
166 225 tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
167 read_lock(&tasklist_lock); 226 tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
168 utime = tsk->signal->utime; 227 tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
169 stime = tsk->signal->stime; 228 tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
170 t = tsk;
171 do {
172 utime = cputime_add(utime, t->utime);
173 stime = cputime_add(stime, t->stime);
174 t = next_thread(t);
175 } while (t != tsk);
176
177 /*
178 * While we have tasklist_lock read-locked, no dying thread
179 * can be updating current->signal->[us]time. Instead,
180 * we got their counts included in the live thread loop.
181 * However, another thread can come in right now and
182 * do a wait call that updates current->signal->c[us]time.
183 * To make sure we always see that pair updated atomically,
184 * we take the siglock around fetching them.
185 */
186 spin_lock_irq(&tsk->sighand->siglock);
187 cutime = tsk->signal->cutime;
188 cstime = tsk->signal->cstime;
189 spin_unlock_irq(&tsk->sighand->siglock);
190 read_unlock(&tasklist_lock);
191
192 tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
193 tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
194 tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
195 tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
196 if (copy_to_user(tbuf, &tmp, sizeof(tmp))) 229 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
197 return -EFAULT; 230 return -EFAULT;
198 } 231 }
diff --git a/kernel/configs.c b/kernel/configs.c
index 4c345210ed8c..abaee684ecbf 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -54,9 +54,6 @@
54 54
55#ifdef CONFIG_IKCONFIG_PROC 55#ifdef CONFIG_IKCONFIG_PROC
56 56
57/**************************************************/
58/* globals and useful constants */
59
60static ssize_t 57static ssize_t
61ikconfig_read_current(struct file *file, char __user *buf, 58ikconfig_read_current(struct file *file, char __user *buf,
62 size_t len, loff_t * offset) 59 size_t len, loff_t * offset)
@@ -71,9 +68,6 @@ static const struct file_operations ikconfig_file_ops = {
71 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
72}; 69};
73 70
74/***************************************************/
75/* ikconfig_init: start up everything we need to */
76
77static int __init ikconfig_init(void) 71static int __init ikconfig_init(void)
78{ 72{
79 struct proc_dir_entry *entry; 73 struct proc_dir_entry *entry;
@@ -89,9 +83,6 @@ static int __init ikconfig_init(void)
89 return 0; 83 return 0;
90} 84}
91 85
92/***************************************************/
93/* ikconfig_cleanup: clean up our mess */
94
95static void __exit ikconfig_cleanup(void) 86static void __exit ikconfig_cleanup(void)
96{ 87{
97 remove_proc_entry("config.gz", NULL); 88 remove_proc_entry("config.gz", NULL);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f17e9854c246..5a732c5ef08b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
199 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
200 int err; 200 int err;
201 201
202 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
203 param->hcpu);
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
205 err = __cpu_disable(); 203 err = __cpu_disable();
206 if (err < 0) 204 if (err < 0)
207 return err; 205 return err;
208 206
207 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
208 param->hcpu);
209
209 /* Force idle task to run as soon as we yield: it should 210 /* Force idle task to run as soon as we yield: it should
210 immediately notice cpu is offline and die quickly. */ 211 immediately notice cpu is offline and die quickly. */
211 sched_idle_next(); 212 sched_idle_next();
@@ -453,6 +454,25 @@ out:
453} 454}
454#endif /* CONFIG_PM_SLEEP_SMP */ 455#endif /* CONFIG_PM_SLEEP_SMP */
455 456
457/**
458 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
459 * @cpu: cpu that just started
460 *
461 * This function calls the cpu_chain notifiers with CPU_STARTING.
462 * It must be called by the arch code on the new cpu, before the new cpu
463 * enables interrupts and before the "boot" cpu returns from __cpu_up().
464 */
465void notify_cpu_starting(unsigned int cpu)
466{
467 unsigned long val = CPU_STARTING;
468
469#ifdef CONFIG_PM_SLEEP_SMP
470 if (cpu_isset(cpu, frozen_cpus))
471 val = CPU_STARTING_FROZEN;
472#endif /* CONFIG_PM_SLEEP_SMP */
473 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
474}
475
456#endif /* CONFIG_SMP */ 476#endif /* CONFIG_SMP */
457 477
458/* 478/*
@@ -479,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
479#endif 499#endif
480}; 500};
481EXPORT_SYMBOL_GPL(cpu_bit_bitmap); 501EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
502
503const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
504EXPORT_SYMBOL(cpu_all_bits);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 827cd9adccb2..da7ff6137f37 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -36,6 +36,7 @@
36#include <linux/list.h> 36#include <linux/list.h>
37#include <linux/mempolicy.h> 37#include <linux/mempolicy.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/memory.h>
39#include <linux/module.h> 40#include <linux/module.h>
40#include <linux/mount.h> 41#include <linux/mount.h>
41#include <linux/namei.h> 42#include <linux/namei.h>
@@ -587,7 +588,6 @@ static int generate_sched_domains(cpumask_t **domains,
587 int ndoms; /* number of sched domains in result */ 588 int ndoms; /* number of sched domains in result */
588 int nslot; /* next empty doms[] cpumask_t slot */ 589 int nslot; /* next empty doms[] cpumask_t slot */
589 590
590 ndoms = 0;
591 doms = NULL; 591 doms = NULL;
592 dattr = NULL; 592 dattr = NULL;
593 csa = NULL; 593 csa = NULL;
@@ -674,10 +674,8 @@ restart:
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */ 675 */
676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
677 if (!doms) { 677 if (!doms)
678 ndoms = 0;
679 goto done; 678 goto done;
680 }
681 679
682 /* 680 /*
683 * The rest of the code, including the scheduler, can deal with 681 * The rest of the code, including the scheduler, can deal with
@@ -732,6 +730,13 @@ restart:
732done: 730done:
733 kfree(csa); 731 kfree(csa);
734 732
733 /*
734 * Fallback to the default domain if kmalloc() failed.
735 * See comments in partition_sched_domains().
736 */
737 if (doms == NULL)
738 ndoms = 1;
739
735 *domains = doms; 740 *domains = doms;
736 *attributes = dattr; 741 *attributes = dattr;
737 return ndoms; 742 return ndoms;
@@ -1172,7 +1177,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1172{ 1177{
1173 struct cpuset trialcs; 1178 struct cpuset trialcs;
1174 int err; 1179 int err;
1175 int cpus_nonempty, balance_flag_changed; 1180 int balance_flag_changed;
1176 1181
1177 trialcs = *cs; 1182 trialcs = *cs;
1178 if (turning_on) 1183 if (turning_on)
@@ -1184,7 +1189,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1184 if (err < 0) 1189 if (err < 0)
1185 return err; 1190 return err;
1186 1191
1187 cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
1188 balance_flag_changed = (is_sched_load_balance(cs) != 1192 balance_flag_changed = (is_sched_load_balance(cs) !=
1189 is_sched_load_balance(&trialcs)); 1193 is_sched_load_balance(&trialcs));
1190 1194
@@ -1192,7 +1196,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1192 cs->flags = trialcs.flags; 1196 cs->flags = trialcs.flags;
1193 mutex_unlock(&callback_mutex); 1197 mutex_unlock(&callback_mutex);
1194 1198
1195 if (cpus_nonempty && balance_flag_changed) 1199 if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
1196 async_rebuild_sched_domains(); 1200 async_rebuild_sched_domains();
1197 1201
1198 return 0; 1202 return 0;
@@ -1921,7 +1925,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1921 * that has tasks along with an empty 'mems'. But if we did see such 1925 * that has tasks along with an empty 'mems'. But if we did see such
1922 * a cpuset, we'd handle it just like we do if its 'cpus' was empty. 1926 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1923 */ 1927 */
1924static void scan_for_empty_cpusets(const struct cpuset *root) 1928static void scan_for_empty_cpusets(struct cpuset *root)
1925{ 1929{
1926 LIST_HEAD(queue); 1930 LIST_HEAD(queue);
1927 struct cpuset *cp; /* scans cpusets being updated */ 1931 struct cpuset *cp; /* scans cpusets being updated */
@@ -2012,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2012 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2016 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
2013 * See also the previous routine cpuset_track_online_cpus(). 2017 * See also the previous routine cpuset_track_online_cpus().
2014 */ 2018 */
2015void cpuset_track_online_nodes(void) 2019static int cpuset_track_online_nodes(struct notifier_block *self,
2020 unsigned long action, void *arg)
2016{ 2021{
2017 cgroup_lock(); 2022 cgroup_lock();
2018 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2023 switch (action) {
2019 scan_for_empty_cpusets(&top_cpuset); 2024 case MEM_ONLINE:
2025 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2026 break;
2027 case MEM_OFFLINE:
2028 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2029 scan_for_empty_cpusets(&top_cpuset);
2030 break;
2031 default:
2032 break;
2033 }
2020 cgroup_unlock(); 2034 cgroup_unlock();
2035 return NOTIFY_OK;
2021} 2036}
2022#endif 2037#endif
2023 2038
@@ -2033,6 +2048,7 @@ void __init cpuset_init_smp(void)
2033 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2048 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2034 2049
2035 hotcpu_notifier(cpuset_track_online_cpus, 0); 2050 hotcpu_notifier(cpuset_track_online_cpus, 0);
2051 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2036} 2052}
2037 2053
2038/** 2054/**
@@ -2437,19 +2453,15 @@ const struct file_operations proc_cpuset_operations = {
2437void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2453void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2438{ 2454{
2439 seq_printf(m, "Cpus_allowed:\t"); 2455 seq_printf(m, "Cpus_allowed:\t");
2440 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, 2456 seq_cpumask(m, &task->cpus_allowed);
2441 task->cpus_allowed);
2442 seq_printf(m, "\n"); 2457 seq_printf(m, "\n");
2443 seq_printf(m, "Cpus_allowed_list:\t"); 2458 seq_printf(m, "Cpus_allowed_list:\t");
2444 m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, 2459 seq_cpumask_list(m, &task->cpus_allowed);
2445 task->cpus_allowed);
2446 seq_printf(m, "\n"); 2460 seq_printf(m, "\n");
2447 seq_printf(m, "Mems_allowed:\t"); 2461 seq_printf(m, "Mems_allowed:\t");
2448 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, 2462 seq_nodemask(m, &task->mems_allowed);
2449 task->mems_allowed);
2450 seq_printf(m, "\n"); 2463 seq_printf(m, "\n");
2451 seq_printf(m, "Mems_allowed_list:\t"); 2464 seq_printf(m, "Mems_allowed_list:\t");
2452 m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, 2465 seq_nodemask_list(m, &task->mems_allowed);
2453 task->mems_allowed);
2454 seq_printf(m, "\n"); 2466 seq_printf(m, "\n");
2455} 2467}
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index c1d4d5b4c61c..f013a0c2e111 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
124 } 124 }
125 return (mem != NULL); 125 return (mem != NULL);
126} 126}
127EXPORT_SYMBOL(dma_alloc_from_coherent);
127 128
128/** 129/**
129 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool 130 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
@@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
151 } 152 }
152 return 0; 153 return 0;
153} 154}
155EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/dma.c b/kernel/dma.c
index d2c60a822790..f903189c5304 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -1,4 +1,4 @@
1/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ 1/*
2 * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. 2 * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
3 * 3 *
4 * Written by Hennus Bergman, 1992. 4 * Written by Hennus Bergman, 1992.
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0d407e886735..0511716e9424 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -12,7 +12,9 @@
12#include <linux/kmod.h> 12#include <linux/kmod.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/proc_fs.h>
15#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/seq_file.h>
16#include <linux/syscalls.h> 18#include <linux/syscalls.h>
17#include <linux/sysctl.h> 19#include <linux/sysctl.h>
18#include <linux/types.h> 20#include <linux/types.h>
@@ -173,20 +175,39 @@ __set_personality(u_long personality)
173 return 0; 175 return 0;
174} 176}
175 177
176int 178#ifdef CONFIG_PROC_FS
177get_exec_domain_list(char *page) 179static int execdomains_proc_show(struct seq_file *m, void *v)
178{ 180{
179 struct exec_domain *ep; 181 struct exec_domain *ep;
180 int len = 0;
181 182
182 read_lock(&exec_domains_lock); 183 read_lock(&exec_domains_lock);
183 for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next) 184 for (ep = exec_domains; ep; ep = ep->next)
184 len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n", 185 seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
185 ep->pers_low, ep->pers_high, ep->name, 186 ep->pers_low, ep->pers_high, ep->name,
186 module_name(ep->module)); 187 module_name(ep->module));
187 read_unlock(&exec_domains_lock); 188 read_unlock(&exec_domains_lock);
188 return (len); 189 return 0;
190}
191
192static int execdomains_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, execdomains_proc_show, NULL);
195}
196
197static const struct file_operations execdomains_proc_fops = {
198 .open = execdomains_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
203
204static int __init proc_execdomains_init(void)
205{
206 proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
207 return 0;
189} 208}
209module_init(proc_execdomains_init);
210#endif
190 211
191asmlinkage long 212asmlinkage long
192sys_personality(u_long personality) 213sys_personality(u_long personality)
diff --git a/kernel/exit.c b/kernel/exit.c
index 85a83c831856..2d8be7ebb0f7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -40,13 +40,13 @@
40#include <linux/cn_proc.h> 40#include <linux/cn_proc.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/futex.h> 42#include <linux/futex.h>
43#include <linux/compat.h>
44#include <linux/pipe_fs_i.h> 43#include <linux/pipe_fs_i.h>
45#include <linux/audit.h> /* for audit_free() */ 44#include <linux/audit.h> /* for audit_free() */
46#include <linux/resource.h> 45#include <linux/resource.h>
47#include <linux/blkdev.h> 46#include <linux/blkdev.h>
48#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
49#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <trace/sched.h>
50 50
51#include <asm/uaccess.h> 51#include <asm/uaccess.h>
52#include <asm/unistd.h> 52#include <asm/unistd.h>
@@ -112,8 +112,6 @@ static void __exit_signal(struct task_struct *tsk)
112 * We won't ever get here for the group leader, since it 112 * We won't ever get here for the group leader, since it
113 * will have been the last reference on the signal_struct. 113 * will have been the last reference on the signal_struct.
114 */ 114 */
115 sig->utime = cputime_add(sig->utime, task_utime(tsk));
116 sig->stime = cputime_add(sig->stime, task_stime(tsk));
117 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 115 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
118 sig->min_flt += tsk->min_flt; 116 sig->min_flt += tsk->min_flt;
119 sig->maj_flt += tsk->maj_flt; 117 sig->maj_flt += tsk->maj_flt;
@@ -122,7 +120,6 @@ static void __exit_signal(struct task_struct *tsk)
122 sig->inblock += task_io_get_inblock(tsk); 120 sig->inblock += task_io_get_inblock(tsk);
123 sig->oublock += task_io_get_oublock(tsk); 121 sig->oublock += task_io_get_oublock(tsk);
124 task_io_accounting_add(&sig->ioac, &tsk->ioac); 122 task_io_accounting_add(&sig->ioac, &tsk->ioac);
125 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
126 sig = NULL; /* Marker for below. */ 123 sig = NULL; /* Marker for below. */
127 } 124 }
128 125
@@ -143,13 +140,21 @@ static void __exit_signal(struct task_struct *tsk)
143 if (sig) { 140 if (sig) {
144 flush_sigqueue(&sig->shared_pending); 141 flush_sigqueue(&sig->shared_pending);
145 taskstats_tgid_free(sig); 142 taskstats_tgid_free(sig);
143 /*
144 * Make sure ->signal can't go away under rq->lock,
145 * see account_group_exec_runtime().
146 */
147 task_rq_unlock_wait(tsk);
146 __cleanup_signal(sig); 148 __cleanup_signal(sig);
147 } 149 }
148} 150}
149 151
150static void delayed_put_task_struct(struct rcu_head *rhp) 152static void delayed_put_task_struct(struct rcu_head *rhp)
151{ 153{
152 put_task_struct(container_of(rhp, struct task_struct, rcu)); 154 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
155
156 trace_sched_process_free(tsk);
157 put_task_struct(tsk);
153} 158}
154 159
155 160
@@ -640,24 +645,23 @@ retry:
640assign_new_owner: 645assign_new_owner:
641 BUG_ON(c == p); 646 BUG_ON(c == p);
642 get_task_struct(c); 647 get_task_struct(c);
648 read_unlock(&tasklist_lock);
649 down_write(&mm->mmap_sem);
643 /* 650 /*
644 * The task_lock protects c->mm from changing. 651 * The task_lock protects c->mm from changing.
645 * We always want mm->owner->mm == mm 652 * We always want mm->owner->mm == mm
646 */ 653 */
647 task_lock(c); 654 task_lock(c);
648 /*
649 * Delay read_unlock() till we have the task_lock()
650 * to ensure that c does not slip away underneath us
651 */
652 read_unlock(&tasklist_lock);
653 if (c->mm != mm) { 655 if (c->mm != mm) {
654 task_unlock(c); 656 task_unlock(c);
657 up_write(&mm->mmap_sem);
655 put_task_struct(c); 658 put_task_struct(c);
656 goto retry; 659 goto retry;
657 } 660 }
658 cgroup_mm_owner_callbacks(mm->owner, c); 661 cgroup_mm_owner_callbacks(mm->owner, c);
659 mm->owner = c; 662 mm->owner = c;
660 task_unlock(c); 663 task_unlock(c);
664 up_write(&mm->mmap_sem);
661 put_task_struct(c); 665 put_task_struct(c);
662} 666}
663#endif /* CONFIG_MM_OWNER */ 667#endif /* CONFIG_MM_OWNER */
@@ -1054,14 +1058,6 @@ NORET_TYPE void do_exit(long code)
1054 exit_itimers(tsk->signal); 1058 exit_itimers(tsk->signal);
1055 } 1059 }
1056 acct_collect(code, group_dead); 1060 acct_collect(code, group_dead);
1057#ifdef CONFIG_FUTEX
1058 if (unlikely(tsk->robust_list))
1059 exit_robust_list(tsk);
1060#ifdef CONFIG_COMPAT
1061 if (unlikely(tsk->compat_robust_list))
1062 compat_exit_robust_list(tsk);
1063#endif
1064#endif
1065 if (group_dead) 1061 if (group_dead)
1066 tty_audit_exit(); 1062 tty_audit_exit();
1067 if (unlikely(tsk->audit_context)) 1063 if (unlikely(tsk->audit_context))
@@ -1074,6 +1070,8 @@ NORET_TYPE void do_exit(long code)
1074 1070
1075 if (group_dead) 1071 if (group_dead)
1076 acct_process(); 1072 acct_process();
1073 trace_sched_process_exit(tsk);
1074
1077 exit_sem(tsk); 1075 exit_sem(tsk);
1078 exit_files(tsk); 1076 exit_files(tsk);
1079 exit_fs(tsk); 1077 exit_fs(tsk);
@@ -1302,6 +1300,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1302 if (likely(!traced)) { 1300 if (likely(!traced)) {
1303 struct signal_struct *psig; 1301 struct signal_struct *psig;
1304 struct signal_struct *sig; 1302 struct signal_struct *sig;
1303 struct task_cputime cputime;
1305 1304
1306 /* 1305 /*
1307 * The resource counters for the group leader are in its 1306 * The resource counters for the group leader are in its
@@ -1317,20 +1316,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
1317 * need to protect the access to p->parent->signal fields, 1316 * need to protect the access to p->parent->signal fields,
1318 * as other threads in the parent group can be right 1317 * as other threads in the parent group can be right
1319 * here reaping other children at the same time. 1318 * here reaping other children at the same time.
1319 *
1320 * We use thread_group_cputime() to get times for the thread
1321 * group, which consolidates times for all threads in the
1322 * group including the group leader.
1320 */ 1323 */
1321 spin_lock_irq(&p->parent->sighand->siglock); 1324 spin_lock_irq(&p->parent->sighand->siglock);
1322 psig = p->parent->signal; 1325 psig = p->parent->signal;
1323 sig = p->signal; 1326 sig = p->signal;
1327 thread_group_cputime(p, &cputime);
1324 psig->cutime = 1328 psig->cutime =
1325 cputime_add(psig->cutime, 1329 cputime_add(psig->cutime,
1326 cputime_add(p->utime, 1330 cputime_add(cputime.utime,
1327 cputime_add(sig->utime, 1331 sig->cutime));
1328 sig->cutime)));
1329 psig->cstime = 1332 psig->cstime =
1330 cputime_add(psig->cstime, 1333 cputime_add(psig->cstime,
1331 cputime_add(p->stime, 1334 cputime_add(cputime.stime,
1332 cputime_add(sig->stime, 1335 sig->cstime));
1333 sig->cstime)));
1334 psig->cgtime = 1336 psig->cgtime =
1335 cputime_add(psig->cgtime, 1337 cputime_add(psig->cgtime,
1336 cputime_add(p->gtime, 1338 cputime_add(p->gtime,
@@ -1675,6 +1677,8 @@ static long do_wait(enum pid_type type, struct pid *pid, int options,
1675 struct task_struct *tsk; 1677 struct task_struct *tsk;
1676 int retval; 1678 int retval;
1677 1679
1680 trace_sched_process_wait(pid);
1681
1678 add_wait_queue(&current->signal->wait_chldexit,&wait); 1682 add_wait_queue(&current->signal->wait_chldexit,&wait);
1679repeat: 1683repeat:
1680 /* 1684 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe84796..2a372a0e206f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/jiffies.h> 40#include <linux/jiffies.h>
41#include <linux/tracehook.h> 41#include <linux/tracehook.h>
42#include <linux/futex.h> 42#include <linux/futex.h>
43#include <linux/compat.h>
43#include <linux/task_io_accounting_ops.h> 44#include <linux/task_io_accounting_ops.h>
44#include <linux/rcupdate.h> 45#include <linux/rcupdate.h>
45#include <linux/ptrace.h> 46#include <linux/ptrace.h>
@@ -58,6 +59,7 @@
58#include <linux/tty.h> 59#include <linux/tty.h>
59#include <linux/proc_fs.h> 60#include <linux/proc_fs.h>
60#include <linux/blkdev.h> 61#include <linux/blkdev.h>
62#include <trace/sched.h>
61 63
62#include <asm/pgtable.h> 64#include <asm/pgtable.h>
63#include <asm/pgalloc.h> 65#include <asm/pgalloc.h>
@@ -518,6 +520,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
518{ 520{
519 struct completion *vfork_done = tsk->vfork_done; 521 struct completion *vfork_done = tsk->vfork_done;
520 522
523 /* Get rid of any futexes when releasing the mm */
524#ifdef CONFIG_FUTEX
525 if (unlikely(tsk->robust_list))
526 exit_robust_list(tsk);
527#ifdef CONFIG_COMPAT
528 if (unlikely(tsk->compat_robust_list))
529 compat_exit_robust_list(tsk);
530#endif
531#endif
532
521 /* Get rid of any cached register state */ 533 /* Get rid of any cached register state */
522 deactivate_mm(tsk, mm); 534 deactivate_mm(tsk, mm);
523 535
@@ -759,15 +771,44 @@ void __cleanup_sighand(struct sighand_struct *sighand)
759 kmem_cache_free(sighand_cachep, sighand); 771 kmem_cache_free(sighand_cachep, sighand);
760} 772}
761 773
774
775/*
776 * Initialize POSIX timer handling for a thread group.
777 */
778static void posix_cpu_timers_init_group(struct signal_struct *sig)
779{
780 /* Thread group counters. */
781 thread_group_cputime_init(sig);
782
783 /* Expiration times and increments. */
784 sig->it_virt_expires = cputime_zero;
785 sig->it_virt_incr = cputime_zero;
786 sig->it_prof_expires = cputime_zero;
787 sig->it_prof_incr = cputime_zero;
788
789 /* Cached expiration times. */
790 sig->cputime_expires.prof_exp = cputime_zero;
791 sig->cputime_expires.virt_exp = cputime_zero;
792 sig->cputime_expires.sched_exp = 0;
793
794 /* The timer lists. */
795 INIT_LIST_HEAD(&sig->cpu_timers[0]);
796 INIT_LIST_HEAD(&sig->cpu_timers[1]);
797 INIT_LIST_HEAD(&sig->cpu_timers[2]);
798}
799
762static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) 800static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
763{ 801{
764 struct signal_struct *sig; 802 struct signal_struct *sig;
765 int ret; 803 int ret;
766 804
767 if (clone_flags & CLONE_THREAD) { 805 if (clone_flags & CLONE_THREAD) {
768 atomic_inc(&current->signal->count); 806 ret = thread_group_cputime_clone_thread(current);
769 atomic_inc(&current->signal->live); 807 if (likely(!ret)) {
770 return 0; 808 atomic_inc(&current->signal->count);
809 atomic_inc(&current->signal->live);
810 }
811 return ret;
771 } 812 }
772 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 813 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
773 tsk->signal = sig; 814 tsk->signal = sig;
@@ -795,39 +836,25 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
795 sig->it_real_incr.tv64 = 0; 836 sig->it_real_incr.tv64 = 0;
796 sig->real_timer.function = it_real_fn; 837 sig->real_timer.function = it_real_fn;
797 838
798 sig->it_virt_expires = cputime_zero;
799 sig->it_virt_incr = cputime_zero;
800 sig->it_prof_expires = cputime_zero;
801 sig->it_prof_incr = cputime_zero;
802
803 sig->leader = 0; /* session leadership doesn't inherit */ 839 sig->leader = 0; /* session leadership doesn't inherit */
804 sig->tty_old_pgrp = NULL; 840 sig->tty_old_pgrp = NULL;
841 sig->tty = NULL;
805 842
806 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 843 sig->cutime = sig->cstime = cputime_zero;
807 sig->gtime = cputime_zero; 844 sig->gtime = cputime_zero;
808 sig->cgtime = cputime_zero; 845 sig->cgtime = cputime_zero;
809 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 846 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
810 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 847 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
811 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 848 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
812 task_io_accounting_init(&sig->ioac); 849 task_io_accounting_init(&sig->ioac);
813 sig->sum_sched_runtime = 0;
814 INIT_LIST_HEAD(&sig->cpu_timers[0]);
815 INIT_LIST_HEAD(&sig->cpu_timers[1]);
816 INIT_LIST_HEAD(&sig->cpu_timers[2]);
817 taskstats_tgid_init(sig); 850 taskstats_tgid_init(sig);
818 851
819 task_lock(current->group_leader); 852 task_lock(current->group_leader);
820 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 853 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
821 task_unlock(current->group_leader); 854 task_unlock(current->group_leader);
822 855
823 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 856 posix_cpu_timers_init_group(sig);
824 /* 857
825 * New sole thread in the process gets an expiry time
826 * of the whole CPU time limit.
827 */
828 tsk->it_prof_expires =
829 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
830 }
831 acct_init_pacct(&sig->pacct); 858 acct_init_pacct(&sig->pacct);
832 859
833 tty_audit_fork(sig); 860 tty_audit_fork(sig);
@@ -837,7 +864,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
837 864
838void __cleanup_signal(struct signal_struct *sig) 865void __cleanup_signal(struct signal_struct *sig)
839{ 866{
867 thread_group_cputime_free(sig);
840 exit_thread_group_keys(sig); 868 exit_thread_group_keys(sig);
869 tty_kref_put(sig->tty);
841 kmem_cache_free(signal_cachep, sig); 870 kmem_cache_free(signal_cachep, sig);
842} 871}
843 872
@@ -886,6 +915,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
886#endif /* CONFIG_MM_OWNER */ 915#endif /* CONFIG_MM_OWNER */
887 916
888/* 917/*
918 * Initialize POSIX timer handling for a single task.
919 */
920static void posix_cpu_timers_init(struct task_struct *tsk)
921{
922 tsk->cputime_expires.prof_exp = cputime_zero;
923 tsk->cputime_expires.virt_exp = cputime_zero;
924 tsk->cputime_expires.sched_exp = 0;
925 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
926 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
927 INIT_LIST_HEAD(&tsk->cpu_timers[2]);
928}
929
930/*
889 * This creates a new process as a copy of the old one, 931 * This creates a new process as a copy of the old one,
890 * but does not actually start it yet. 932 * but does not actually start it yet.
891 * 933 *
@@ -987,6 +1029,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
987 p->prev_utime = cputime_zero; 1029 p->prev_utime = cputime_zero;
988 p->prev_stime = cputime_zero; 1030 p->prev_stime = cputime_zero;
989 1031
1032 p->default_timer_slack_ns = current->timer_slack_ns;
1033
990#ifdef CONFIG_DETECT_SOFTLOCKUP 1034#ifdef CONFIG_DETECT_SOFTLOCKUP
991 p->last_switch_count = 0; 1035 p->last_switch_count = 0;
992 p->last_switch_timestamp = 0; 1036 p->last_switch_timestamp = 0;
@@ -995,12 +1039,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
995 task_io_accounting_init(&p->ioac); 1039 task_io_accounting_init(&p->ioac);
996 acct_clear_integrals(p); 1040 acct_clear_integrals(p);
997 1041
998 p->it_virt_expires = cputime_zero; 1042 posix_cpu_timers_init(p);
999 p->it_prof_expires = cputime_zero;
1000 p->it_sched_expires = 0;
1001 INIT_LIST_HEAD(&p->cpu_timers[0]);
1002 INIT_LIST_HEAD(&p->cpu_timers[1]);
1003 INIT_LIST_HEAD(&p->cpu_timers[2]);
1004 1043
1005 p->lock_depth = -1; /* -1 = no lock */ 1044 p->lock_depth = -1; /* -1 = no lock */
1006 do_posix_clock_monotonic_gettime(&p->start_time); 1045 do_posix_clock_monotonic_gettime(&p->start_time);
@@ -1201,21 +1240,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1201 if (clone_flags & CLONE_THREAD) { 1240 if (clone_flags & CLONE_THREAD) {
1202 p->group_leader = current->group_leader; 1241 p->group_leader = current->group_leader;
1203 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1242 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1204
1205 if (!cputime_eq(current->signal->it_virt_expires,
1206 cputime_zero) ||
1207 !cputime_eq(current->signal->it_prof_expires,
1208 cputime_zero) ||
1209 current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
1210 !list_empty(&current->signal->cpu_timers[0]) ||
1211 !list_empty(&current->signal->cpu_timers[1]) ||
1212 !list_empty(&current->signal->cpu_timers[2])) {
1213 /*
1214 * Have child wake up on its first tick to check
1215 * for process CPU timers.
1216 */
1217 p->it_prof_expires = jiffies_to_cputime(1);
1218 }
1219 } 1243 }
1220 1244
1221 if (likely(p->pid)) { 1245 if (likely(p->pid)) {
@@ -1227,7 +1251,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1227 p->nsproxy->pid_ns->child_reaper = p; 1251 p->nsproxy->pid_ns->child_reaper = p;
1228 1252
1229 p->signal->leader_pid = pid; 1253 p->signal->leader_pid = pid;
1230 p->signal->tty = current->signal->tty; 1254 tty_kref_put(p->signal->tty);
1255 p->signal->tty = tty_kref_get(current->signal->tty);
1231 set_task_pgrp(p, task_pgrp_nr(current)); 1256 set_task_pgrp(p, task_pgrp_nr(current));
1232 set_task_session(p, task_session_nr(current)); 1257 set_task_session(p, task_session_nr(current));
1233 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1258 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
@@ -1361,6 +1386,8 @@ long do_fork(unsigned long clone_flags,
1361 if (!IS_ERR(p)) { 1386 if (!IS_ERR(p)) {
1362 struct completion vfork; 1387 struct completion vfork;
1363 1388
1389 trace_sched_process_fork(current, p);
1390
1364 nr = task_pid_vnr(p); 1391 nr = task_pid_vnr(p);
1365 1392
1366 if (clone_flags & CLONE_PARENT_SETTID) 1393 if (clone_flags & CLONE_PARENT_SETTID)
diff --git a/kernel/freezer.c b/kernel/freezer.c
new file mode 100644
index 000000000000..2f4936cf7083
--- /dev/null
+++ b/kernel/freezer.c
@@ -0,0 +1,154 @@
1/*
2 * kernel/freezer.c - Function to freeze a process
3 *
4 * Originally from kernel/power/process.c
5 */
6
7#include <linux/interrupt.h>
8#include <linux/suspend.h>
9#include <linux/module.h>
10#include <linux/syscalls.h>
11#include <linux/freezer.h>
12
13/*
14 * freezing is complete, mark current process as frozen
15 */
16static inline void frozen_process(void)
17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN;
20 wmb();
21 }
22 clear_freeze_flag(current);
23}
24
25/* Refrigerator is a place where frozen processes are stored :-). */
26void refrigerator(void)
27{
28 /* Hmm, should we be allowed to suspend when there are realtime
29 processes around? */
30 long save;
31
32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm);
42
43 spin_lock_irq(&current->sighand->siglock);
44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
46
47 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current))
50 break;
51 schedule();
52 }
53 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save);
55}
56EXPORT_SYMBOL(refrigerator);
57
58static void fake_signal_wake_up(struct task_struct *p)
59{
60 unsigned long flags;
61
62 spin_lock_irqsave(&p->sighand->siglock, flags);
63 signal_wake_up(p, 0);
64 spin_unlock_irqrestore(&p->sighand->siglock, flags);
65}
66
67/**
68 * freeze_task - send a freeze request to given task
69 * @p: task to send the request to
70 * @sig_only: if set, the request will only be sent if the task has the
71 * PF_FREEZER_NOSIG flag unset
72 * Return value: 'false', if @sig_only is set and the task has
73 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
74 *
75 * The freeze request is sent by setting the task's TIF_FREEZE flag and
76 * either sending a fake signal to it or waking it up, depending on whether
77 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
78 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
79 * TIF_FREEZE flag will not be set.
80 */
81bool freeze_task(struct task_struct *p, bool sig_only)
82{
83 /*
84 * We first check if the task is freezing and next if it has already
85 * been frozen to avoid the race with frozen_process() which first marks
86 * the task as frozen and next clears its TIF_FREEZE.
87 */
88 if (!freezing(p)) {
89 rmb();
90 if (frozen(p))
91 return false;
92
93 if (!sig_only || should_send_signal(p))
94 set_freeze_flag(p);
95 else
96 return false;
97 }
98
99 if (should_send_signal(p)) {
100 if (!signal_pending(p))
101 fake_signal_wake_up(p);
102 } else if (sig_only) {
103 return false;
104 } else {
105 wake_up_state(p, TASK_INTERRUPTIBLE);
106 }
107
108 return true;
109}
110
111void cancel_freezing(struct task_struct *p)
112{
113 unsigned long flags;
114
115 if (freezing(p)) {
116 pr_debug(" clean up: %s\n", p->comm);
117 clear_freeze_flag(p);
118 spin_lock_irqsave(&p->sighand->siglock, flags);
119 recalc_sigpending_and_wake(p);
120 spin_unlock_irqrestore(&p->sighand->siglock, flags);
121 }
122}
123
124static int __thaw_process(struct task_struct *p)
125{
126 if (frozen(p)) {
127 p->flags &= ~PF_FROZEN;
128 return 1;
129 }
130 clear_freeze_flag(p);
131 return 0;
132}
133
134/*
135 * Wake up a frozen process
136 *
137 * task_lock() is needed to prevent the race with refrigerator() which may
138 * occur if the freezing of tasks fails. Namely, without the lock, if the
139 * freezing of tasks failed, thaw_tasks() might have run before a task in
140 * refrigerator() could call frozen_process(), in which case the task would be
141 * frozen and no one would thaw it.
142 */
143int thaw_process(struct task_struct *p)
144{
145 task_lock(p);
146 if (__thaw_process(p) == 1) {
147 task_unlock(p);
148 wake_up_process(p);
149 return 1;
150 }
151 task_unlock(p);
152 return 0;
153}
154EXPORT_SYMBOL(thaw_process);
diff --git a/kernel/futex.c b/kernel/futex.c
index 62cbd648e28a..e10c5c8786a6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1229,13 +1229,16 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1229 if (!abs_time) 1229 if (!abs_time)
1230 schedule(); 1230 schedule();
1231 else { 1231 else {
1232 unsigned long slack;
1233 slack = current->timer_slack_ns;
1234 if (rt_task(current))
1235 slack = 0;
1232 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, 1236 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
1233 HRTIMER_MODE_ABS); 1237 HRTIMER_MODE_ABS);
1234 hrtimer_init_sleeper(&t, current); 1238 hrtimer_init_sleeper(&t, current);
1235 t.timer.expires = *abs_time; 1239 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
1236 1240
1237 hrtimer_start(&t.timer, t.timer.expires, 1241 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1238 HRTIMER_MODE_ABS);
1239 if (!hrtimer_active(&t.timer)) 1242 if (!hrtimer_active(&t.timer))
1240 t.task = NULL; 1243 t.task = NULL;
1241 1244
@@ -1337,7 +1340,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1337 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, 1340 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1338 HRTIMER_MODE_ABS); 1341 HRTIMER_MODE_ABS);
1339 hrtimer_init_sleeper(to, current); 1342 hrtimer_init_sleeper(to, current);
1340 to->timer.expires = *time; 1343 hrtimer_set_expires(&to->timer, *time);
1341 } 1344 }
1342 1345
1343 q.pi_state = NULL; 1346 q.pi_state = NULL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a74..47e63349d1b2 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -517,7 +517,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
517 if (!base->first) 517 if (!base->first)
518 continue; 518 continue;
519 timer = rb_entry(base->first, struct hrtimer, node); 519 timer = rb_entry(base->first, struct hrtimer, node);
520 expires = ktime_sub(timer->expires, base->offset); 520 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
521 if (expires.tv64 < cpu_base->expires_next.tv64) 521 if (expires.tv64 < cpu_base->expires_next.tv64)
522 cpu_base->expires_next = expires; 522 cpu_base->expires_next = expires;
523 } 523 }
@@ -539,10 +539,10 @@ static int hrtimer_reprogram(struct hrtimer *timer,
539 struct hrtimer_clock_base *base) 539 struct hrtimer_clock_base *base)
540{ 540{
541 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 541 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
542 ktime_t expires = ktime_sub(timer->expires, base->offset); 542 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
543 int res; 543 int res;
544 544
545 WARN_ON_ONCE(timer->expires.tv64 < 0); 545 WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
546 546
547 /* 547 /*
548 * When the callback is running, we do not reprogram the clock event 548 * When the callback is running, we do not reprogram the clock event
@@ -664,25 +664,17 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
664 664
665 /* Timer is expired, act upon the callback mode */ 665 /* Timer is expired, act upon the callback mode */
666 switch(timer->cb_mode) { 666 switch(timer->cb_mode) {
667 case HRTIMER_CB_IRQSAFE_NO_RESTART: 667 case HRTIMER_CB_IRQSAFE_PERCPU:
668 debug_hrtimer_deactivate(timer); 668 case HRTIMER_CB_IRQSAFE_UNLOCKED:
669 /*
670 * We can call the callback from here. No restart
671 * happens, so no danger of recursion
672 */
673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
674 return 1;
675 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
676 /* 669 /*
677 * This is solely for the sched tick emulation with 670 * This is solely for the sched tick emulation with
678 * dynamic tick support to ensure that we do not 671 * dynamic tick support to ensure that we do not
679 * restart the tick right on the edge and end up with 672 * restart the tick right on the edge and end up with
680 * the tick timer in the softirq ! The calling site 673 * the tick timer in the softirq ! The calling site
681 * takes care of this. 674 * takes care of this. Also used for hrtimer sleeper !
682 */ 675 */
683 debug_hrtimer_deactivate(timer); 676 debug_hrtimer_deactivate(timer);
684 return 1; 677 return 1;
685 case HRTIMER_CB_IRQSAFE:
686 case HRTIMER_CB_SOFTIRQ: 678 case HRTIMER_CB_SOFTIRQ:
687 /* 679 /*
688 * Move everything else into the softirq pending list ! 680 * Move everything else into the softirq pending list !
@@ -794,7 +786,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
794 u64 orun = 1; 786 u64 orun = 1;
795 ktime_t delta; 787 ktime_t delta;
796 788
797 delta = ktime_sub(now, timer->expires); 789 delta = ktime_sub(now, hrtimer_get_expires(timer));
798 790
799 if (delta.tv64 < 0) 791 if (delta.tv64 < 0)
800 return 0; 792 return 0;
@@ -806,8 +798,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
806 s64 incr = ktime_to_ns(interval); 798 s64 incr = ktime_to_ns(interval);
807 799
808 orun = ktime_divns(delta, incr); 800 orun = ktime_divns(delta, incr);
809 timer->expires = ktime_add_ns(timer->expires, incr * orun); 801 hrtimer_add_expires_ns(timer, incr * orun);
810 if (timer->expires.tv64 > now.tv64) 802 if (hrtimer_get_expires_tv64(timer) > now.tv64)
811 return orun; 803 return orun;
812 /* 804 /*
813 * This (and the ktime_add() below) is the 805 * This (and the ktime_add() below) is the
@@ -815,7 +807,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
815 */ 807 */
816 orun++; 808 orun++;
817 } 809 }
818 timer->expires = ktime_add_safe(timer->expires, interval); 810 hrtimer_add_expires(timer, interval);
819 811
820 return orun; 812 return orun;
821} 813}
@@ -847,7 +839,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
847 * We dont care about collisions. Nodes with 839 * We dont care about collisions. Nodes with
848 * the same expiry time stay together. 840 * the same expiry time stay together.
849 */ 841 */
850 if (timer->expires.tv64 < entry->expires.tv64) { 842 if (hrtimer_get_expires_tv64(timer) <
843 hrtimer_get_expires_tv64(entry)) {
851 link = &(*link)->rb_left; 844 link = &(*link)->rb_left;
852 } else { 845 } else {
853 link = &(*link)->rb_right; 846 link = &(*link)->rb_right;
@@ -944,9 +937,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
944} 937}
945 938
946/** 939/**
947 * hrtimer_start - (re)start an relative timer on the current CPU 940 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
948 * @timer: the timer to be added 941 * @timer: the timer to be added
949 * @tim: expiry time 942 * @tim: expiry time
943 * @delta_ns: "slack" range for the timer
950 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 944 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
951 * 945 *
952 * Returns: 946 * Returns:
@@ -954,7 +948,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
954 * 1 when the timer was active 948 * 1 when the timer was active
955 */ 949 */
956int 950int
957hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 951hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
952 const enum hrtimer_mode mode)
958{ 953{
959 struct hrtimer_clock_base *base, *new_base; 954 struct hrtimer_clock_base *base, *new_base;
960 unsigned long flags; 955 unsigned long flags;
@@ -982,7 +977,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
982#endif 977#endif
983 } 978 }
984 979
985 timer->expires = tim; 980 hrtimer_set_expires_range_ns(timer, tim, delta_ns);
986 981
987 timer_stats_hrtimer_set_start_info(timer); 982 timer_stats_hrtimer_set_start_info(timer);
988 983
@@ -1015,8 +1010,26 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
1015 1010
1016 return ret; 1011 return ret;
1017} 1012}
1013EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1014
1015/**
1016 * hrtimer_start - (re)start an hrtimer on the current CPU
1017 * @timer: the timer to be added
1018 * @tim: expiry time
1019 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
1020 *
1021 * Returns:
1022 * 0 on success
1023 * 1 when the timer was active
1024 */
1025int
1026hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
1027{
1028 return hrtimer_start_range_ns(timer, tim, 0, mode);
1029}
1018EXPORT_SYMBOL_GPL(hrtimer_start); 1030EXPORT_SYMBOL_GPL(hrtimer_start);
1019 1031
1032
1020/** 1033/**
1021 * hrtimer_try_to_cancel - try to deactivate a timer 1034 * hrtimer_try_to_cancel - try to deactivate a timer
1022 * @timer: hrtimer to stop 1035 * @timer: hrtimer to stop
@@ -1076,7 +1089,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1076 ktime_t rem; 1089 ktime_t rem;
1077 1090
1078 base = lock_hrtimer_base(timer, &flags); 1091 base = lock_hrtimer_base(timer, &flags);
1079 rem = ktime_sub(timer->expires, base->get_time()); 1092 rem = hrtimer_expires_remaining(timer);
1080 unlock_hrtimer_base(timer, &flags); 1093 unlock_hrtimer_base(timer, &flags);
1081 1094
1082 return rem; 1095 return rem;
@@ -1108,7 +1121,7 @@ ktime_t hrtimer_get_next_event(void)
1108 continue; 1121 continue;
1109 1122
1110 timer = rb_entry(base->first, struct hrtimer, node); 1123 timer = rb_entry(base->first, struct hrtimer, node);
1111 delta.tv64 = timer->expires.tv64; 1124 delta.tv64 = hrtimer_get_expires_tv64(timer);
1112 delta = ktime_sub(delta, base->get_time()); 1125 delta = ktime_sub(delta, base->get_time());
1113 if (delta.tv64 < mindelta.tv64) 1126 if (delta.tv64 < mindelta.tv64)
1114 mindelta.tv64 = delta.tv64; 1127 mindelta.tv64 = delta.tv64;
@@ -1187,6 +1200,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1187 enum hrtimer_restart (*fn)(struct hrtimer *); 1200 enum hrtimer_restart (*fn)(struct hrtimer *);
1188 struct hrtimer *timer; 1201 struct hrtimer *timer;
1189 int restart; 1202 int restart;
1203 int emulate_hardirq_ctx = 0;
1190 1204
1191 timer = list_entry(cpu_base->cb_pending.next, 1205 timer = list_entry(cpu_base->cb_pending.next,
1192 struct hrtimer, cb_entry); 1206 struct hrtimer, cb_entry);
@@ -1195,10 +1209,24 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1195 timer_stats_account_hrtimer(timer); 1209 timer_stats_account_hrtimer(timer);
1196 1210
1197 fn = timer->function; 1211 fn = timer->function;
1212 /*
1213 * A timer might have been added to the cb_pending list
1214 * when it was migrated during a cpu-offline operation.
1215 * Emulate hardirq context for such timers.
1216 */
1217 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1218 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
1219 emulate_hardirq_ctx = 1;
1220
1198 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); 1221 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1199 spin_unlock_irq(&cpu_base->lock); 1222 spin_unlock_irq(&cpu_base->lock);
1200 1223
1201 restart = fn(timer); 1224 if (unlikely(emulate_hardirq_ctx)) {
1225 local_irq_disable();
1226 restart = fn(timer);
1227 local_irq_enable();
1228 } else
1229 restart = fn(timer);
1202 1230
1203 spin_lock_irq(&cpu_base->lock); 1231 spin_lock_irq(&cpu_base->lock);
1204 1232
@@ -1245,7 +1273,8 @@ static void __run_hrtimer(struct hrtimer *timer)
1245 timer_stats_account_hrtimer(timer); 1273 timer_stats_account_hrtimer(timer);
1246 1274
1247 fn = timer->function; 1275 fn = timer->function;
1248 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { 1276 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1277 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
1249 /* 1278 /*
1250 * Used for scheduler timers, avoid lock inversion with 1279 * Used for scheduler timers, avoid lock inversion with
1251 * rq->lock and tasklist_lock. 1280 * rq->lock and tasklist_lock.
@@ -1308,10 +1337,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1308 1337
1309 timer = rb_entry(node, struct hrtimer, node); 1338 timer = rb_entry(node, struct hrtimer, node);
1310 1339
1311 if (basenow.tv64 < timer->expires.tv64) { 1340 /*
1341 * The immediate goal for using the softexpires is
1342 * minimizing wakeups, not running timers at the
1343 * earliest interrupt after their soft expiration.
1344 * This allows us to avoid using a Priority Search
1345 * Tree, which can answer a stabbing query for
1346 * overlapping intervals and instead use the simple
1347 * BST we already have.
1348 * We don't add extra wakeups by delaying timers that
1349 * are right-of a not yet expired timer, because that
1350 * timer will have to trigger a wakeup anyway.
1351 */
1352
1353 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
1312 ktime_t expires; 1354 ktime_t expires;
1313 1355
1314 expires = ktime_sub(timer->expires, 1356 expires = ktime_sub(hrtimer_get_expires(timer),
1315 base->offset); 1357 base->offset);
1316 if (expires.tv64 < expires_next.tv64) 1358 if (expires.tv64 < expires_next.tv64)
1317 expires_next = expires; 1359 expires_next = expires;
@@ -1347,6 +1389,30 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1347 raise_softirq(HRTIMER_SOFTIRQ); 1389 raise_softirq(HRTIMER_SOFTIRQ);
1348} 1390}
1349 1391
1392/**
1393 * hrtimer_peek_ahead_timers -- run soft-expired timers now
1394 *
1395 * hrtimer_peek_ahead_timers will peek at the timer queue of
1396 * the current cpu and check if there are any timers for which
1397 * the soft expires time has passed. If any such timers exist,
1398 * they are run immediately and then removed from the timer queue.
1399 *
1400 */
1401void hrtimer_peek_ahead_timers(void)
1402{
1403 struct tick_device *td;
1404 unsigned long flags;
1405
1406 if (!hrtimer_hres_active())
1407 return;
1408
1409 local_irq_save(flags);
1410 td = &__get_cpu_var(tick_cpu_device);
1411 if (td && td->evtdev)
1412 hrtimer_interrupt(td->evtdev);
1413 local_irq_restore(flags);
1414}
1415
1350static void run_hrtimer_softirq(struct softirq_action *h) 1416static void run_hrtimer_softirq(struct softirq_action *h)
1351{ 1417{
1352 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); 1418 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
@@ -1401,9 +1467,7 @@ void hrtimer_run_queues(void)
1401 if (!base->first) 1467 if (!base->first)
1402 continue; 1468 continue;
1403 1469
1404 if (base->get_softirq_time) 1470 if (gettime) {
1405 base->softirq_time = base->get_softirq_time();
1406 else if (gettime) {
1407 hrtimer_get_softirq_time(cpu_base); 1471 hrtimer_get_softirq_time(cpu_base);
1408 gettime = 0; 1472 gettime = 0;
1409 } 1473 }
@@ -1414,7 +1478,8 @@ void hrtimer_run_queues(void)
1414 struct hrtimer *timer; 1478 struct hrtimer *timer;
1415 1479
1416 timer = rb_entry(node, struct hrtimer, node); 1480 timer = rb_entry(node, struct hrtimer, node);
1417 if (base->softirq_time.tv64 <= timer->expires.tv64) 1481 if (base->softirq_time.tv64 <=
1482 hrtimer_get_expires_tv64(timer))
1418 break; 1483 break;
1419 1484
1420 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { 1485 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
@@ -1452,7 +1517,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1452 sl->timer.function = hrtimer_wakeup; 1517 sl->timer.function = hrtimer_wakeup;
1453 sl->task = task; 1518 sl->task = task;
1454#ifdef CONFIG_HIGH_RES_TIMERS 1519#ifdef CONFIG_HIGH_RES_TIMERS
1455 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1520 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1456#endif 1521#endif
1457} 1522}
1458 1523
@@ -1462,7 +1527,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1462 1527
1463 do { 1528 do {
1464 set_current_state(TASK_INTERRUPTIBLE); 1529 set_current_state(TASK_INTERRUPTIBLE);
1465 hrtimer_start(&t->timer, t->timer.expires, mode); 1530 hrtimer_start_expires(&t->timer, mode);
1466 if (!hrtimer_active(&t->timer)) 1531 if (!hrtimer_active(&t->timer))
1467 t->task = NULL; 1532 t->task = NULL;
1468 1533
@@ -1484,7 +1549,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
1484 struct timespec rmt; 1549 struct timespec rmt;
1485 ktime_t rem; 1550 ktime_t rem;
1486 1551
1487 rem = ktime_sub(timer->expires, timer->base->get_time()); 1552 rem = hrtimer_expires_remaining(timer);
1488 if (rem.tv64 <= 0) 1553 if (rem.tv64 <= 0)
1489 return 0; 1554 return 0;
1490 rmt = ktime_to_timespec(rem); 1555 rmt = ktime_to_timespec(rem);
@@ -1503,7 +1568,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1503 1568
1504 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, 1569 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
1505 HRTIMER_MODE_ABS); 1570 HRTIMER_MODE_ABS);
1506 t.timer.expires.tv64 = restart->nanosleep.expires; 1571 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1507 1572
1508 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1573 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1509 goto out; 1574 goto out;
@@ -1528,9 +1593,14 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1528 struct restart_block *restart; 1593 struct restart_block *restart;
1529 struct hrtimer_sleeper t; 1594 struct hrtimer_sleeper t;
1530 int ret = 0; 1595 int ret = 0;
1596 unsigned long slack;
1597
1598 slack = current->timer_slack_ns;
1599 if (rt_task(current))
1600 slack = 0;
1531 1601
1532 hrtimer_init_on_stack(&t.timer, clockid, mode); 1602 hrtimer_init_on_stack(&t.timer, clockid, mode);
1533 t.timer.expires = timespec_to_ktime(*rqtp); 1603 hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
1534 if (do_nanosleep(&t, mode)) 1604 if (do_nanosleep(&t, mode))
1535 goto out; 1605 goto out;
1536 1606
@@ -1550,7 +1620,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1550 restart->fn = hrtimer_nanosleep_restart; 1620 restart->fn = hrtimer_nanosleep_restart;
1551 restart->nanosleep.index = t.timer.base->index; 1621 restart->nanosleep.index = t.timer.base->index;
1552 restart->nanosleep.rmtp = rmtp; 1622 restart->nanosleep.rmtp = rmtp;
1553 restart->nanosleep.expires = t.timer.expires.tv64; 1623 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1554 1624
1555 ret = -ERESTART_RESTARTBLOCK; 1625 ret = -ERESTART_RESTARTBLOCK;
1556out: 1626out:
@@ -1591,49 +1661,123 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1591 1661
1592#ifdef CONFIG_HOTPLUG_CPU 1662#ifdef CONFIG_HOTPLUG_CPU
1593 1663
1594static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1664static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1595 struct hrtimer_clock_base *new_base) 1665 struct hrtimer_clock_base *new_base, int dcpu)
1596{ 1666{
1597 struct hrtimer *timer; 1667 struct hrtimer *timer;
1598 struct rb_node *node; 1668 struct rb_node *node;
1669 int raise = 0;
1599 1670
1600 while ((node = rb_first(&old_base->active))) { 1671 while ((node = rb_first(&old_base->active))) {
1601 timer = rb_entry(node, struct hrtimer, node); 1672 timer = rb_entry(node, struct hrtimer, node);
1602 BUG_ON(hrtimer_callback_running(timer)); 1673 BUG_ON(hrtimer_callback_running(timer));
1603 debug_hrtimer_deactivate(timer); 1674 debug_hrtimer_deactivate(timer);
1604 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1675
1676 /*
1677 * Should not happen. Per CPU timers should be
1678 * canceled _before_ the migration code is called
1679 */
1680 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1681 __remove_hrtimer(timer, old_base,
1682 HRTIMER_STATE_INACTIVE, 0);
1683 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1684 timer, timer->function, dcpu);
1685 continue;
1686 }
1687
1688 /*
1689 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1690 * timer could be seen as !active and just vanish away
1691 * under us on another CPU
1692 */
1693 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1605 timer->base = new_base; 1694 timer->base = new_base;
1606 /* 1695 /*
1607 * Enqueue the timer. Allow reprogramming of the event device 1696 * Enqueue the timer. Allow reprogramming of the event device
1608 */ 1697 */
1609 enqueue_hrtimer(timer, new_base, 1); 1698 enqueue_hrtimer(timer, new_base, 1);
1699
1700#ifdef CONFIG_HIGH_RES_TIMERS
1701 /*
1702 * Happens with high res enabled when the timer was
1703 * already expired and the callback mode is
1704 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1705 * enqueue code does not move them to the soft irq
1706 * pending list for performance/latency reasons, but
1707 * in the migration state, we need to do that
1708 * otherwise we end up with a stale timer.
1709 */
1710 if (timer->state == HRTIMER_STATE_MIGRATE) {
1711 timer->state = HRTIMER_STATE_PENDING;
1712 list_add_tail(&timer->cb_entry,
1713 &new_base->cpu_base->cb_pending);
1714 raise = 1;
1715 }
1716#endif
1717 /* Clear the migration state bit */
1718 timer->state &= ~HRTIMER_STATE_MIGRATE;
1719 }
1720 return raise;
1721}
1722
1723#ifdef CONFIG_HIGH_RES_TIMERS
1724static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1725 struct hrtimer_cpu_base *new_base)
1726{
1727 struct hrtimer *timer;
1728 int raise = 0;
1729
1730 while (!list_empty(&old_base->cb_pending)) {
1731 timer = list_entry(old_base->cb_pending.next,
1732 struct hrtimer, cb_entry);
1733
1734 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1735 timer->base = &new_base->clock_base[timer->base->index];
1736 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1737 raise = 1;
1610 } 1738 }
1739 return raise;
1740}
1741#else
1742static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1743 struct hrtimer_cpu_base *new_base)
1744{
1745 return 0;
1611} 1746}
1747#endif
1612 1748
1613static void migrate_hrtimers(int cpu) 1749static void migrate_hrtimers(int cpu)
1614{ 1750{
1615 struct hrtimer_cpu_base *old_base, *new_base; 1751 struct hrtimer_cpu_base *old_base, *new_base;
1616 int i; 1752 int i, raise = 0;
1617 1753
1618 BUG_ON(cpu_online(cpu)); 1754 BUG_ON(cpu_online(cpu));
1619 old_base = &per_cpu(hrtimer_bases, cpu); 1755 old_base = &per_cpu(hrtimer_bases, cpu);
1620 new_base = &get_cpu_var(hrtimer_bases); 1756 new_base = &get_cpu_var(hrtimer_bases);
1621 1757
1622 tick_cancel_sched_timer(cpu); 1758 tick_cancel_sched_timer(cpu);
1623 1759 /*
1624 local_irq_disable(); 1760 * The caller is globally serialized and nobody else
1625 spin_lock(&new_base->lock); 1761 * takes two locks at once, deadlock is not possible.
1762 */
1763 spin_lock_irq(&new_base->lock);
1626 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1764 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1627 1765
1628 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1766 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1629 migrate_hrtimer_list(&old_base->clock_base[i], 1767 if (migrate_hrtimer_list(&old_base->clock_base[i],
1630 &new_base->clock_base[i]); 1768 &new_base->clock_base[i], cpu))
1769 raise = 1;
1631 } 1770 }
1632 1771
1772 if (migrate_hrtimer_pending(old_base, new_base))
1773 raise = 1;
1774
1633 spin_unlock(&old_base->lock); 1775 spin_unlock(&old_base->lock);
1634 spin_unlock(&new_base->lock); 1776 spin_unlock_irq(&new_base->lock);
1635 local_irq_enable();
1636 put_cpu_var(hrtimer_bases); 1777 put_cpu_var(hrtimer_bases);
1778
1779 if (raise)
1780 hrtimer_raise_softirq();
1637} 1781}
1638#endif /* CONFIG_HOTPLUG_CPU */ 1782#endif /* CONFIG_HOTPLUG_CPU */
1639 1783
@@ -1678,3 +1822,103 @@ void __init hrtimers_init(void)
1678#endif 1822#endif
1679} 1823}
1680 1824
1825/**
1826 * schedule_hrtimeout_range - sleep until timeout
1827 * @expires: timeout value (ktime_t)
1828 * @delta: slack in expires timeout (ktime_t)
1829 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1830 *
1831 * Make the current task sleep until the given expiry time has
1832 * elapsed. The routine will return immediately unless
1833 * the current task state has been set (see set_current_state()).
1834 *
1835 * The @delta argument gives the kernel the freedom to schedule the
1836 * actual wakeup to a time that is both power and performance friendly.
1837 * The kernel gives the normal best effort behavior for "@expires+@delta",
1838 * but may decide to fire the timer earlier, but no earlier than @expires.
1839 *
1840 * You can set the task state as follows -
1841 *
1842 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1843 * pass before the routine returns.
1844 *
1845 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1846 * delivered to the current task.
1847 *
1848 * The current task state is guaranteed to be TASK_RUNNING when this
1849 * routine returns.
1850 *
1851 * Returns 0 when the timer has expired otherwise -EINTR
1852 */
1853int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1854 const enum hrtimer_mode mode)
1855{
1856 struct hrtimer_sleeper t;
1857
1858 /*
1859 * Optimize when a zero timeout value is given. It does not
1860 * matter whether this is an absolute or a relative time.
1861 */
1862 if (expires && !expires->tv64) {
1863 __set_current_state(TASK_RUNNING);
1864 return 0;
1865 }
1866
1867 /*
1868 * A NULL parameter means "infinite"
1869 */
1870 if (!expires) {
1871 schedule();
1872 __set_current_state(TASK_RUNNING);
1873 return -EINTR;
1874 }
1875
1876 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
1877 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1878
1879 hrtimer_init_sleeper(&t, current);
1880
1881 hrtimer_start_expires(&t.timer, mode);
1882 if (!hrtimer_active(&t.timer))
1883 t.task = NULL;
1884
1885 if (likely(t.task))
1886 schedule();
1887
1888 hrtimer_cancel(&t.timer);
1889 destroy_hrtimer_on_stack(&t.timer);
1890
1891 __set_current_state(TASK_RUNNING);
1892
1893 return !t.task ? 0 : -EINTR;
1894}
1895EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1896
1897/**
1898 * schedule_hrtimeout - sleep until timeout
1899 * @expires: timeout value (ktime_t)
1900 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1901 *
1902 * Make the current task sleep until the given expiry time has
1903 * elapsed. The routine will return immediately unless
1904 * the current task state has been set (see set_current_state()).
1905 *
1906 * You can set the task state as follows -
1907 *
1908 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1909 * pass before the routine returns.
1910 *
1911 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1912 * delivered to the current task.
1913 *
1914 * The current task state is guaranteed to be TASK_RUNNING when this
1915 * routine returns.
1916 *
1917 * Returns 0 when the timer has expired otherwise -EINTR
1918 */
1919int __sched schedule_hrtimeout(ktime_t *expires,
1920 const enum hrtimer_mode mode)
1921{
1922 return schedule_hrtimeout_range(expires, 0, mode);
1923}
1924EXPORT_SYMBOL_GPL(schedule_hrtimeout);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 533068cfb607..cc0f7321b8ce 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -30,17 +30,16 @@ static DEFINE_MUTEX(probing_active);
30unsigned long probe_irq_on(void) 30unsigned long probe_irq_on(void)
31{ 31{
32 struct irq_desc *desc; 32 struct irq_desc *desc;
33 unsigned long mask; 33 unsigned long mask = 0;
34 unsigned int i; 34 unsigned int status;
35 int i;
35 36
36 mutex_lock(&probing_active); 37 mutex_lock(&probing_active);
37 /* 38 /*
38 * something may have generated an irq long ago and we want to 39 * something may have generated an irq long ago and we want to
39 * flush such a longstanding irq before considering it as spurious. 40 * flush such a longstanding irq before considering it as spurious.
40 */ 41 */
41 for (i = NR_IRQS-1; i > 0; i--) { 42 for_each_irq_desc_reverse(i, desc) {
42 desc = irq_desc + i;
43
44 spin_lock_irq(&desc->lock); 43 spin_lock_irq(&desc->lock);
45 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 44 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
46 /* 45 /*
@@ -68,9 +67,7 @@ unsigned long probe_irq_on(void)
68 * (we must startup again here because if a longstanding irq 67 * (we must startup again here because if a longstanding irq
69 * happened in the previous stage, it may have masked itself) 68 * happened in the previous stage, it may have masked itself)
70 */ 69 */
71 for (i = NR_IRQS-1; i > 0; i--) { 70 for_each_irq_desc_reverse(i, desc) {
72 desc = irq_desc + i;
73
74 spin_lock_irq(&desc->lock); 71 spin_lock_irq(&desc->lock);
75 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 72 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
76 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 73 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -88,11 +85,7 @@ unsigned long probe_irq_on(void)
88 /* 85 /*
89 * Now filter out any obviously spurious interrupts 86 * Now filter out any obviously spurious interrupts
90 */ 87 */
91 mask = 0; 88 for_each_irq_desc(i, desc) {
92 for (i = 0; i < NR_IRQS; i++) {
93 unsigned int status;
94
95 desc = irq_desc + i;
96 spin_lock_irq(&desc->lock); 89 spin_lock_irq(&desc->lock);
97 status = desc->status; 90 status = desc->status;
98 91
@@ -126,14 +119,11 @@ EXPORT_SYMBOL(probe_irq_on);
126 */ 119 */
127unsigned int probe_irq_mask(unsigned long val) 120unsigned int probe_irq_mask(unsigned long val)
128{ 121{
129 unsigned int mask; 122 unsigned int status, mask = 0;
123 struct irq_desc *desc;
130 int i; 124 int i;
131 125
132 mask = 0; 126 for_each_irq_desc(i, desc) {
133 for (i = 0; i < NR_IRQS; i++) {
134 struct irq_desc *desc = irq_desc + i;
135 unsigned int status;
136
137 spin_lock_irq(&desc->lock); 127 spin_lock_irq(&desc->lock);
138 status = desc->status; 128 status = desc->status;
139 129
@@ -171,20 +161,19 @@ EXPORT_SYMBOL(probe_irq_mask);
171 */ 161 */
172int probe_irq_off(unsigned long val) 162int probe_irq_off(unsigned long val)
173{ 163{
174 int i, irq_found = 0, nr_irqs = 0; 164 int i, irq_found = 0, nr_of_irqs = 0;
175 165 struct irq_desc *desc;
176 for (i = 0; i < NR_IRQS; i++) { 166 unsigned int status;
177 struct irq_desc *desc = irq_desc + i;
178 unsigned int status;
179 167
168 for_each_irq_desc(i, desc) {
180 spin_lock_irq(&desc->lock); 169 spin_lock_irq(&desc->lock);
181 status = desc->status; 170 status = desc->status;
182 171
183 if (status & IRQ_AUTODETECT) { 172 if (status & IRQ_AUTODETECT) {
184 if (!(status & IRQ_WAITING)) { 173 if (!(status & IRQ_WAITING)) {
185 if (!nr_irqs) 174 if (!nr_of_irqs)
186 irq_found = i; 175 irq_found = i;
187 nr_irqs++; 176 nr_of_irqs++;
188 } 177 }
189 desc->status = status & ~IRQ_AUTODETECT; 178 desc->status = status & ~IRQ_AUTODETECT;
190 desc->chip->shutdown(i); 179 desc->chip->shutdown(i);
@@ -193,7 +182,7 @@ int probe_irq_off(unsigned long val)
193 } 182 }
194 mutex_unlock(&probing_active); 183 mutex_unlock(&probing_active);
195 184
196 if (nr_irqs > 1) 185 if (nr_of_irqs > 1)
197 irq_found = -irq_found; 186 irq_found = -irq_found;
198 187
199 return irq_found; 188 return irq_found;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3cd441ebf5d2..10b5092e9bfe 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -24,16 +24,15 @@
24 */ 24 */
25void dynamic_irq_init(unsigned int irq) 25void dynamic_irq_init(unsigned int irq)
26{ 26{
27 struct irq_desc *desc; 27 struct irq_desc *desc = irq_to_desc(irq);
28 unsigned long flags; 28 unsigned long flags;
29 29
30 if (irq >= NR_IRQS) { 30 if (!desc) {
31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); 31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
32 return; 32 return;
33 } 33 }
34 34
35 /* Ensure we don't have left over values from a previous use of this irq */ 35 /* Ensure we don't have left over values from a previous use of this irq */
36 desc = irq_desc + irq;
37 spin_lock_irqsave(&desc->lock, flags); 36 spin_lock_irqsave(&desc->lock, flags);
38 desc->status = IRQ_DISABLED; 37 desc->status = IRQ_DISABLED;
39 desc->chip = &no_irq_chip; 38 desc->chip = &no_irq_chip;
@@ -57,15 +56,14 @@ void dynamic_irq_init(unsigned int irq)
57 */ 56 */
58void dynamic_irq_cleanup(unsigned int irq) 57void dynamic_irq_cleanup(unsigned int irq)
59{ 58{
60 struct irq_desc *desc; 59 struct irq_desc *desc = irq_to_desc(irq);
61 unsigned long flags; 60 unsigned long flags;
62 61
63 if (irq >= NR_IRQS) { 62 if (!desc) {
64 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); 63 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
65 return; 64 return;
66 } 65 }
67 66
68 desc = irq_desc + irq;
69 spin_lock_irqsave(&desc->lock, flags); 67 spin_lock_irqsave(&desc->lock, flags);
70 if (desc->action) { 68 if (desc->action) {
71 spin_unlock_irqrestore(&desc->lock, flags); 69 spin_unlock_irqrestore(&desc->lock, flags);
@@ -78,6 +76,7 @@ void dynamic_irq_cleanup(unsigned int irq)
78 desc->chip_data = NULL; 76 desc->chip_data = NULL;
79 desc->handle_irq = handle_bad_irq; 77 desc->handle_irq = handle_bad_irq;
80 desc->chip = &no_irq_chip; 78 desc->chip = &no_irq_chip;
79 desc->name = NULL;
81 spin_unlock_irqrestore(&desc->lock, flags); 80 spin_unlock_irqrestore(&desc->lock, flags);
82} 81}
83 82
@@ -89,10 +88,10 @@ void dynamic_irq_cleanup(unsigned int irq)
89 */ 88 */
90int set_irq_chip(unsigned int irq, struct irq_chip *chip) 89int set_irq_chip(unsigned int irq, struct irq_chip *chip)
91{ 90{
92 struct irq_desc *desc; 91 struct irq_desc *desc = irq_to_desc(irq);
93 unsigned long flags; 92 unsigned long flags;
94 93
95 if (irq >= NR_IRQS) { 94 if (!desc) {
96 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); 95 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
97 return -EINVAL; 96 return -EINVAL;
98 } 97 }
@@ -100,7 +99,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
100 if (!chip) 99 if (!chip)
101 chip = &no_irq_chip; 100 chip = &no_irq_chip;
102 101
103 desc = irq_desc + irq;
104 spin_lock_irqsave(&desc->lock, flags); 102 spin_lock_irqsave(&desc->lock, flags);
105 irq_chip_set_defaults(chip); 103 irq_chip_set_defaults(chip);
106 desc->chip = chip; 104 desc->chip = chip;
@@ -111,27 +109,27 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
111EXPORT_SYMBOL(set_irq_chip); 109EXPORT_SYMBOL(set_irq_chip);
112 110
113/** 111/**
114 * set_irq_type - set the irq type for an irq 112 * set_irq_type - set the irq trigger type for an irq
115 * @irq: irq number 113 * @irq: irq number
116 * @type: interrupt type - see include/linux/interrupt.h 114 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
117 */ 115 */
118int set_irq_type(unsigned int irq, unsigned int type) 116int set_irq_type(unsigned int irq, unsigned int type)
119{ 117{
120 struct irq_desc *desc; 118 struct irq_desc *desc = irq_to_desc(irq);
121 unsigned long flags; 119 unsigned long flags;
122 int ret = -ENXIO; 120 int ret = -ENXIO;
123 121
124 if (irq >= NR_IRQS) { 122 if (!desc) {
125 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 123 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
126 return -ENODEV; 124 return -ENODEV;
127 } 125 }
128 126
129 desc = irq_desc + irq; 127 if (type == IRQ_TYPE_NONE)
130 if (desc->chip->set_type) { 128 return 0;
131 spin_lock_irqsave(&desc->lock, flags); 129
132 ret = desc->chip->set_type(irq, type); 130 spin_lock_irqsave(&desc->lock, flags);
133 spin_unlock_irqrestore(&desc->lock, flags); 131 ret = __irq_set_trigger(desc, irq, type);
134 } 132 spin_unlock_irqrestore(&desc->lock, flags);
135 return ret; 133 return ret;
136} 134}
137EXPORT_SYMBOL(set_irq_type); 135EXPORT_SYMBOL(set_irq_type);
@@ -145,16 +143,15 @@ EXPORT_SYMBOL(set_irq_type);
145 */ 143 */
146int set_irq_data(unsigned int irq, void *data) 144int set_irq_data(unsigned int irq, void *data)
147{ 145{
148 struct irq_desc *desc; 146 struct irq_desc *desc = irq_to_desc(irq);
149 unsigned long flags; 147 unsigned long flags;
150 148
151 if (irq >= NR_IRQS) { 149 if (!desc) {
152 printk(KERN_ERR 150 printk(KERN_ERR
153 "Trying to install controller data for IRQ%d\n", irq); 151 "Trying to install controller data for IRQ%d\n", irq);
154 return -EINVAL; 152 return -EINVAL;
155 } 153 }
156 154
157 desc = irq_desc + irq;
158 spin_lock_irqsave(&desc->lock, flags); 155 spin_lock_irqsave(&desc->lock, flags);
159 desc->handler_data = data; 156 desc->handler_data = data;
160 spin_unlock_irqrestore(&desc->lock, flags); 157 spin_unlock_irqrestore(&desc->lock, flags);
@@ -171,15 +168,15 @@ EXPORT_SYMBOL(set_irq_data);
171 */ 168 */
172int set_irq_msi(unsigned int irq, struct msi_desc *entry) 169int set_irq_msi(unsigned int irq, struct msi_desc *entry)
173{ 170{
174 struct irq_desc *desc; 171 struct irq_desc *desc = irq_to_desc(irq);
175 unsigned long flags; 172 unsigned long flags;
176 173
177 if (irq >= NR_IRQS) { 174 if (!desc) {
178 printk(KERN_ERR 175 printk(KERN_ERR
179 "Trying to install msi data for IRQ%d\n", irq); 176 "Trying to install msi data for IRQ%d\n", irq);
180 return -EINVAL; 177 return -EINVAL;
181 } 178 }
182 desc = irq_desc + irq; 179
183 spin_lock_irqsave(&desc->lock, flags); 180 spin_lock_irqsave(&desc->lock, flags);
184 desc->msi_desc = entry; 181 desc->msi_desc = entry;
185 if (entry) 182 if (entry)
@@ -197,10 +194,16 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
197 */ 194 */
198int set_irq_chip_data(unsigned int irq, void *data) 195int set_irq_chip_data(unsigned int irq, void *data)
199{ 196{
200 struct irq_desc *desc = irq_desc + irq; 197 struct irq_desc *desc = irq_to_desc(irq);
201 unsigned long flags; 198 unsigned long flags;
202 199
203 if (irq >= NR_IRQS || !desc->chip) { 200 if (!desc) {
201 printk(KERN_ERR
202 "Trying to install chip data for IRQ%d\n", irq);
203 return -EINVAL;
204 }
205
206 if (!desc->chip) {
204 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); 207 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
205 return -EINVAL; 208 return -EINVAL;
206 } 209 }
@@ -218,7 +221,7 @@ EXPORT_SYMBOL(set_irq_chip_data);
218 */ 221 */
219static void default_enable(unsigned int irq) 222static void default_enable(unsigned int irq)
220{ 223{
221 struct irq_desc *desc = irq_desc + irq; 224 struct irq_desc *desc = irq_to_desc(irq);
222 225
223 desc->chip->unmask(irq); 226 desc->chip->unmask(irq);
224 desc->status &= ~IRQ_MASKED; 227 desc->status &= ~IRQ_MASKED;
@@ -236,8 +239,9 @@ static void default_disable(unsigned int irq)
236 */ 239 */
237static unsigned int default_startup(unsigned int irq) 240static unsigned int default_startup(unsigned int irq)
238{ 241{
239 irq_desc[irq].chip->enable(irq); 242 struct irq_desc *desc = irq_to_desc(irq);
240 243
244 desc->chip->enable(irq);
241 return 0; 245 return 0;
242} 246}
243 247
@@ -246,7 +250,7 @@ static unsigned int default_startup(unsigned int irq)
246 */ 250 */
247static void default_shutdown(unsigned int irq) 251static void default_shutdown(unsigned int irq)
248{ 252{
249 struct irq_desc *desc = irq_desc + irq; 253 struct irq_desc *desc = irq_to_desc(irq);
250 254
251 desc->chip->mask(irq); 255 desc->chip->mask(irq);
252 desc->status |= IRQ_MASKED; 256 desc->status |= IRQ_MASKED;
@@ -305,14 +309,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
305{ 309{
306 struct irqaction *action; 310 struct irqaction *action;
307 irqreturn_t action_ret; 311 irqreturn_t action_ret;
308 const unsigned int cpu = smp_processor_id();
309 312
310 spin_lock(&desc->lock); 313 spin_lock(&desc->lock);
311 314
312 if (unlikely(desc->status & IRQ_INPROGRESS)) 315 if (unlikely(desc->status & IRQ_INPROGRESS))
313 goto out_unlock; 316 goto out_unlock;
314 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 317 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
315 kstat_cpu(cpu).irqs[irq]++; 318 kstat_incr_irqs_this_cpu(irq, desc);
316 319
317 action = desc->action; 320 action = desc->action;
318 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 321 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
@@ -344,7 +347,6 @@ out_unlock:
344void 347void
345handle_level_irq(unsigned int irq, struct irq_desc *desc) 348handle_level_irq(unsigned int irq, struct irq_desc *desc)
346{ 349{
347 unsigned int cpu = smp_processor_id();
348 struct irqaction *action; 350 struct irqaction *action;
349 irqreturn_t action_ret; 351 irqreturn_t action_ret;
350 352
@@ -354,7 +356,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
354 if (unlikely(desc->status & IRQ_INPROGRESS)) 356 if (unlikely(desc->status & IRQ_INPROGRESS))
355 goto out_unlock; 357 goto out_unlock;
356 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 358 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
357 kstat_cpu(cpu).irqs[irq]++; 359 kstat_incr_irqs_this_cpu(irq, desc);
358 360
359 /* 361 /*
360 * If its disabled or no action available 362 * If its disabled or no action available
@@ -392,7 +394,6 @@ out_unlock:
392void 394void
393handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 395handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
394{ 396{
395 unsigned int cpu = smp_processor_id();
396 struct irqaction *action; 397 struct irqaction *action;
397 irqreturn_t action_ret; 398 irqreturn_t action_ret;
398 399
@@ -402,7 +403,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
402 goto out; 403 goto out;
403 404
404 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 405 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
405 kstat_cpu(cpu).irqs[irq]++; 406 kstat_incr_irqs_this_cpu(irq, desc);
406 407
407 /* 408 /*
408 * If its disabled or no action available 409 * If its disabled or no action available
@@ -451,8 +452,6 @@ out:
451void 452void
452handle_edge_irq(unsigned int irq, struct irq_desc *desc) 453handle_edge_irq(unsigned int irq, struct irq_desc *desc)
453{ 454{
454 const unsigned int cpu = smp_processor_id();
455
456 spin_lock(&desc->lock); 455 spin_lock(&desc->lock);
457 456
458 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 457 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
@@ -468,8 +467,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
468 mask_ack_irq(desc, irq); 467 mask_ack_irq(desc, irq);
469 goto out_unlock; 468 goto out_unlock;
470 } 469 }
471 470 kstat_incr_irqs_this_cpu(irq, desc);
472 kstat_cpu(cpu).irqs[irq]++;
473 471
474 /* Start handling the irq */ 472 /* Start handling the irq */
475 desc->chip->ack(irq); 473 desc->chip->ack(irq);
@@ -524,7 +522,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
524{ 522{
525 irqreturn_t action_ret; 523 irqreturn_t action_ret;
526 524
527 kstat_this_cpu.irqs[irq]++; 525 kstat_incr_irqs_this_cpu(irq, desc);
528 526
529 if (desc->chip->ack) 527 if (desc->chip->ack)
530 desc->chip->ack(irq); 528 desc->chip->ack(irq);
@@ -541,17 +539,15 @@ void
541__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 539__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
542 const char *name) 540 const char *name)
543{ 541{
544 struct irq_desc *desc; 542 struct irq_desc *desc = irq_to_desc(irq);
545 unsigned long flags; 543 unsigned long flags;
546 544
547 if (irq >= NR_IRQS) { 545 if (!desc) {
548 printk(KERN_ERR 546 printk(KERN_ERR
549 "Trying to install type control for IRQ%d\n", irq); 547 "Trying to install type control for IRQ%d\n", irq);
550 return; 548 return;
551 } 549 }
552 550
553 desc = irq_desc + irq;
554
555 if (!handle) 551 if (!handle)
556 handle = handle_bad_irq; 552 handle = handle_bad_irq;
557 else if (desc->chip == &no_irq_chip) { 553 else if (desc->chip == &no_irq_chip) {
@@ -583,7 +579,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
583 desc->status &= ~IRQ_DISABLED; 579 desc->status &= ~IRQ_DISABLED;
584 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 580 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
585 desc->depth = 0; 581 desc->depth = 0;
586 desc->chip->unmask(irq); 582 desc->chip->startup(irq);
587 } 583 }
588 spin_unlock_irqrestore(&desc->lock, flags); 584 spin_unlock_irqrestore(&desc->lock, flags);
589} 585}
@@ -606,17 +602,14 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
606 602
607void __init set_irq_noprobe(unsigned int irq) 603void __init set_irq_noprobe(unsigned int irq)
608{ 604{
609 struct irq_desc *desc; 605 struct irq_desc *desc = irq_to_desc(irq);
610 unsigned long flags; 606 unsigned long flags;
611 607
612 if (irq >= NR_IRQS) { 608 if (!desc) {
613 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); 609 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
614
615 return; 610 return;
616 } 611 }
617 612
618 desc = irq_desc + irq;
619
620 spin_lock_irqsave(&desc->lock, flags); 613 spin_lock_irqsave(&desc->lock, flags);
621 desc->status |= IRQ_NOPROBE; 614 desc->status |= IRQ_NOPROBE;
622 spin_unlock_irqrestore(&desc->lock, flags); 615 spin_unlock_irqrestore(&desc->lock, flags);
@@ -624,17 +617,14 @@ void __init set_irq_noprobe(unsigned int irq)
624 617
625void __init set_irq_probe(unsigned int irq) 618void __init set_irq_probe(unsigned int irq)
626{ 619{
627 struct irq_desc *desc; 620 struct irq_desc *desc = irq_to_desc(irq);
628 unsigned long flags; 621 unsigned long flags;
629 622
630 if (irq >= NR_IRQS) { 623 if (!desc) {
631 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); 624 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
632
633 return; 625 return;
634 } 626 }
635 627
636 desc = irq_desc + irq;
637
638 spin_lock_irqsave(&desc->lock, flags); 628 spin_lock_irqsave(&desc->lock, flags);
639 desc->status &= ~IRQ_NOPROBE; 629 desc->status &= ~IRQ_NOPROBE;
640 spin_unlock_irqrestore(&desc->lock, flags); 630 spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 5fa6198e9139..c815b42d0f5b 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -25,11 +25,10 @@
25 * 25 *
26 * Handles spurious and unhandled IRQ's. It also prints a debugmessage. 26 * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
27 */ 27 */
28void 28void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
29handle_bad_irq(unsigned int irq, struct irq_desc *desc)
30{ 29{
31 print_irq_desc(irq, desc); 30 print_irq_desc(irq, desc);
32 kstat_this_cpu.irqs[irq]++; 31 kstat_incr_irqs_this_cpu(irq, desc);
33 ack_bad_irq(irq); 32 ack_bad_irq(irq);
34} 33}
35 34
@@ -47,6 +46,9 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
47 * 46 *
48 * Controller mappings for all interrupt sources: 47 * Controller mappings for all interrupt sources:
49 */ 48 */
49int nr_irqs = NR_IRQS;
50EXPORT_SYMBOL_GPL(nr_irqs);
51
50struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 52struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
51 [0 ... NR_IRQS-1] = { 53 [0 ... NR_IRQS-1] = {
52 .status = IRQ_DISABLED, 54 .status = IRQ_DISABLED,
@@ -66,7 +68,9 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
66 */ 68 */
67static void ack_bad(unsigned int irq) 69static void ack_bad(unsigned int irq)
68{ 70{
69 print_irq_desc(irq, irq_desc + irq); 71 struct irq_desc *desc = irq_to_desc(irq);
72
73 print_irq_desc(irq, desc);
70 ack_bad_irq(irq); 74 ack_bad_irq(irq);
71} 75}
72 76
@@ -131,8 +135,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
131 irqreturn_t ret, retval = IRQ_NONE; 135 irqreturn_t ret, retval = IRQ_NONE;
132 unsigned int status = 0; 136 unsigned int status = 0;
133 137
134 handle_dynamic_tick(action);
135
136 if (!(action->flags & IRQF_DISABLED)) 138 if (!(action->flags & IRQF_DISABLED))
137 local_irq_enable_in_hardirq(); 139 local_irq_enable_in_hardirq();
138 140
@@ -165,11 +167,12 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
165 */ 167 */
166unsigned int __do_IRQ(unsigned int irq) 168unsigned int __do_IRQ(unsigned int irq)
167{ 169{
168 struct irq_desc *desc = irq_desc + irq; 170 struct irq_desc *desc = irq_to_desc(irq);
169 struct irqaction *action; 171 struct irqaction *action;
170 unsigned int status; 172 unsigned int status;
171 173
172 kstat_this_cpu.irqs[irq]++; 174 kstat_incr_irqs_this_cpu(irq, desc);
175
173 if (CHECK_IRQ_PER_CPU(desc->status)) { 176 if (CHECK_IRQ_PER_CPU(desc->status)) {
174 irqreturn_t action_ret; 177 irqreturn_t action_ret;
175 178
@@ -256,8 +259,8 @@ out:
256} 259}
257#endif 260#endif
258 261
259#ifdef CONFIG_TRACE_IRQFLAGS
260 262
263#ifdef CONFIG_TRACE_IRQFLAGS
261/* 264/*
262 * lockdep: we want to handle all irq_desc locks as a single lock-class: 265 * lockdep: we want to handle all irq_desc locks as a single lock-class:
263 */ 266 */
@@ -265,10 +268,10 @@ static struct lock_class_key irq_desc_lock_class;
265 268
266void early_init_irq_lock_class(void) 269void early_init_irq_lock_class(void)
267{ 270{
271 struct irq_desc *desc;
268 int i; 272 int i;
269 273
270 for (i = 0; i < NR_IRQS; i++) 274 for_each_irq_desc(i, desc)
271 lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); 275 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
272} 276}
273
274#endif 277#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 08a849a22447..c9767e641980 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -10,12 +10,15 @@ extern void irq_chip_set_defaults(struct irq_chip *chip);
10/* Set default handler: */ 10/* Set default handler: */
11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); 11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
12 12
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags);
15
13#ifdef CONFIG_PROC_FS 16#ifdef CONFIG_PROC_FS
14extern void register_irq_proc(unsigned int irq); 17extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
15extern void register_handler_proc(unsigned int irq, struct irqaction *action); 18extern void register_handler_proc(unsigned int irq, struct irqaction *action);
16extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); 19extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
17#else 20#else
18static inline void register_irq_proc(unsigned int irq) { } 21static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
19static inline void register_handler_proc(unsigned int irq, 22static inline void register_handler_proc(unsigned int irq,
20 struct irqaction *action) { } 23 struct irqaction *action) { }
21static inline void unregister_handler_proc(unsigned int irq, 24static inline void unregister_handler_proc(unsigned int irq,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0314074fa232..c498a1b8c621 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -31,10 +31,10 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL;
31 */ 31 */
32void synchronize_irq(unsigned int irq) 32void synchronize_irq(unsigned int irq)
33{ 33{
34 struct irq_desc *desc = irq_desc + irq; 34 struct irq_desc *desc = irq_to_desc(irq);
35 unsigned int status; 35 unsigned int status;
36 36
37 if (irq >= NR_IRQS) 37 if (!desc)
38 return; 38 return;
39 39
40 do { 40 do {
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(synchronize_irq);
64 */ 64 */
65int irq_can_set_affinity(unsigned int irq) 65int irq_can_set_affinity(unsigned int irq)
66{ 66{
67 struct irq_desc *desc = irq_desc + irq; 67 struct irq_desc *desc = irq_to_desc(irq);
68 68
69 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || 69 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
70 !desc->chip->set_affinity) 70 !desc->chip->set_affinity)
@@ -81,15 +81,21 @@ int irq_can_set_affinity(unsigned int irq)
81 */ 81 */
82int irq_set_affinity(unsigned int irq, cpumask_t cpumask) 82int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
83{ 83{
84 struct irq_desc *desc = irq_desc + irq; 84 struct irq_desc *desc = irq_to_desc(irq);
85 85
86 if (!desc->chip->set_affinity) 86 if (!desc->chip->set_affinity)
87 return -EINVAL; 87 return -EINVAL;
88 88
89 set_balance_irq_affinity(irq, cpumask);
90
91#ifdef CONFIG_GENERIC_PENDING_IRQ 89#ifdef CONFIG_GENERIC_PENDING_IRQ
92 set_pending_irq(irq, cpumask); 90 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
91 unsigned long flags;
92
93 spin_lock_irqsave(&desc->lock, flags);
94 desc->affinity = cpumask;
95 desc->chip->set_affinity(irq, cpumask);
96 spin_unlock_irqrestore(&desc->lock, flags);
97 } else
98 set_pending_irq(irq, cpumask);
93#else 99#else
94 desc->affinity = cpumask; 100 desc->affinity = cpumask;
95 desc->chip->set_affinity(irq, cpumask); 101 desc->chip->set_affinity(irq, cpumask);
@@ -104,16 +110,17 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
104int irq_select_affinity(unsigned int irq) 110int irq_select_affinity(unsigned int irq)
105{ 111{
106 cpumask_t mask; 112 cpumask_t mask;
113 struct irq_desc *desc;
107 114
108 if (!irq_can_set_affinity(irq)) 115 if (!irq_can_set_affinity(irq))
109 return 0; 116 return 0;
110 117
111 cpus_and(mask, cpu_online_map, irq_default_affinity); 118 cpus_and(mask, cpu_online_map, irq_default_affinity);
112 119
113 irq_desc[irq].affinity = mask; 120 desc = irq_to_desc(irq);
114 irq_desc[irq].chip->set_affinity(irq, mask); 121 desc->affinity = mask;
122 desc->chip->set_affinity(irq, mask);
115 123
116 set_balance_irq_affinity(irq, mask);
117 return 0; 124 return 0;
118} 125}
119#endif 126#endif
@@ -133,10 +140,10 @@ int irq_select_affinity(unsigned int irq)
133 */ 140 */
134void disable_irq_nosync(unsigned int irq) 141void disable_irq_nosync(unsigned int irq)
135{ 142{
136 struct irq_desc *desc = irq_desc + irq; 143 struct irq_desc *desc = irq_to_desc(irq);
137 unsigned long flags; 144 unsigned long flags;
138 145
139 if (irq >= NR_IRQS) 146 if (!desc)
140 return; 147 return;
141 148
142 spin_lock_irqsave(&desc->lock, flags); 149 spin_lock_irqsave(&desc->lock, flags);
@@ -162,9 +169,9 @@ EXPORT_SYMBOL(disable_irq_nosync);
162 */ 169 */
163void disable_irq(unsigned int irq) 170void disable_irq(unsigned int irq)
164{ 171{
165 struct irq_desc *desc = irq_desc + irq; 172 struct irq_desc *desc = irq_to_desc(irq);
166 173
167 if (irq >= NR_IRQS) 174 if (!desc)
168 return; 175 return;
169 176
170 disable_irq_nosync(irq); 177 disable_irq_nosync(irq);
@@ -204,10 +211,10 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
204 */ 211 */
205void enable_irq(unsigned int irq) 212void enable_irq(unsigned int irq)
206{ 213{
207 struct irq_desc *desc = irq_desc + irq; 214 struct irq_desc *desc = irq_to_desc(irq);
208 unsigned long flags; 215 unsigned long flags;
209 216
210 if (irq >= NR_IRQS) 217 if (!desc)
211 return; 218 return;
212 219
213 spin_lock_irqsave(&desc->lock, flags); 220 spin_lock_irqsave(&desc->lock, flags);
@@ -216,9 +223,9 @@ void enable_irq(unsigned int irq)
216} 223}
217EXPORT_SYMBOL(enable_irq); 224EXPORT_SYMBOL(enable_irq);
218 225
219int set_irq_wake_real(unsigned int irq, unsigned int on) 226static int set_irq_wake_real(unsigned int irq, unsigned int on)
220{ 227{
221 struct irq_desc *desc = irq_desc + irq; 228 struct irq_desc *desc = irq_to_desc(irq);
222 int ret = -ENXIO; 229 int ret = -ENXIO;
223 230
224 if (desc->chip->set_wake) 231 if (desc->chip->set_wake)
@@ -241,7 +248,7 @@ int set_irq_wake_real(unsigned int irq, unsigned int on)
241 */ 248 */
242int set_irq_wake(unsigned int irq, unsigned int on) 249int set_irq_wake(unsigned int irq, unsigned int on)
243{ 250{
244 struct irq_desc *desc = irq_desc + irq; 251 struct irq_desc *desc = irq_to_desc(irq);
245 unsigned long flags; 252 unsigned long flags;
246 int ret = 0; 253 int ret = 0;
247 254
@@ -281,12 +288,16 @@ EXPORT_SYMBOL(set_irq_wake);
281 */ 288 */
282int can_request_irq(unsigned int irq, unsigned long irqflags) 289int can_request_irq(unsigned int irq, unsigned long irqflags)
283{ 290{
291 struct irq_desc *desc = irq_to_desc(irq);
284 struct irqaction *action; 292 struct irqaction *action;
285 293
286 if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) 294 if (!desc)
295 return 0;
296
297 if (desc->status & IRQ_NOREQUEST)
287 return 0; 298 return 0;
288 299
289 action = irq_desc[irq].action; 300 action = desc->action;
290 if (action) 301 if (action)
291 if (irqflags & action->flags & IRQF_SHARED) 302 if (irqflags & action->flags & IRQF_SHARED)
292 action = NULL; 303 action = NULL;
@@ -305,10 +316,11 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
305 desc->handle_irq = NULL; 316 desc->handle_irq = NULL;
306} 317}
307 318
308static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, 319int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
309 unsigned long flags) 320 unsigned long flags)
310{ 321{
311 int ret; 322 int ret;
323 struct irq_chip *chip = desc->chip;
312 324
313 if (!chip || !chip->set_type) { 325 if (!chip || !chip->set_type) {
314 /* 326 /*
@@ -326,6 +338,11 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
326 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 338 pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
327 (int)(flags & IRQF_TRIGGER_MASK), 339 (int)(flags & IRQF_TRIGGER_MASK),
328 irq, chip->set_type); 340 irq, chip->set_type);
341 else {
342 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
343 desc->status &= ~IRQ_TYPE_SENSE_MASK;
344 desc->status |= flags & IRQ_TYPE_SENSE_MASK;
345 }
329 346
330 return ret; 347 return ret;
331} 348}
@@ -334,16 +351,16 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
334 * Internal function to register an irqaction - typically used to 351 * Internal function to register an irqaction - typically used to
335 * allocate special interrupts that are part of the architecture. 352 * allocate special interrupts that are part of the architecture.
336 */ 353 */
337int setup_irq(unsigned int irq, struct irqaction *new) 354static int
355__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
338{ 356{
339 struct irq_desc *desc = irq_desc + irq;
340 struct irqaction *old, **p; 357 struct irqaction *old, **p;
341 const char *old_name = NULL; 358 const char *old_name = NULL;
342 unsigned long flags; 359 unsigned long flags;
343 int shared = 0; 360 int shared = 0;
344 int ret; 361 int ret;
345 362
346 if (irq >= NR_IRQS) 363 if (!desc)
347 return -EINVAL; 364 return -EINVAL;
348 365
349 if (desc->chip == &no_irq_chip) 366 if (desc->chip == &no_irq_chip)
@@ -404,7 +421,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
404 421
405 /* Setup the type (level, edge polarity) if configured: */ 422 /* Setup the type (level, edge polarity) if configured: */
406 if (new->flags & IRQF_TRIGGER_MASK) { 423 if (new->flags & IRQF_TRIGGER_MASK) {
407 ret = __irq_set_trigger(desc->chip, irq, new->flags); 424 ret = __irq_set_trigger(desc, irq, new->flags);
408 425
409 if (ret) { 426 if (ret) {
410 spin_unlock_irqrestore(&desc->lock, flags); 427 spin_unlock_irqrestore(&desc->lock, flags);
@@ -423,16 +440,21 @@ int setup_irq(unsigned int irq, struct irqaction *new)
423 if (!(desc->status & IRQ_NOAUTOEN)) { 440 if (!(desc->status & IRQ_NOAUTOEN)) {
424 desc->depth = 0; 441 desc->depth = 0;
425 desc->status &= ~IRQ_DISABLED; 442 desc->status &= ~IRQ_DISABLED;
426 if (desc->chip->startup) 443 desc->chip->startup(irq);
427 desc->chip->startup(irq);
428 else
429 desc->chip->enable(irq);
430 } else 444 } else
431 /* Undo nested disables: */ 445 /* Undo nested disables: */
432 desc->depth = 1; 446 desc->depth = 1;
433 447
434 /* Set default affinity mask once everything is setup */ 448 /* Set default affinity mask once everything is setup */
435 irq_select_affinity(irq); 449 irq_select_affinity(irq);
450
451 } else if ((new->flags & IRQF_TRIGGER_MASK)
452 && (new->flags & IRQF_TRIGGER_MASK)
453 != (desc->status & IRQ_TYPE_SENSE_MASK)) {
454 /* hope the handler works with the actual trigger mode... */
455 pr_warning("IRQ %d uses trigger mode %d; requested %d\n",
456 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK),
457 (int)(new->flags & IRQF_TRIGGER_MASK));
436 } 458 }
437 459
438 *p = new; 460 *p = new;
@@ -457,7 +479,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
457 spin_unlock_irqrestore(&desc->lock, flags); 479 spin_unlock_irqrestore(&desc->lock, flags);
458 480
459 new->irq = irq; 481 new->irq = irq;
460 register_irq_proc(irq); 482 register_irq_proc(irq, desc);
461 new->dir = NULL; 483 new->dir = NULL;
462 register_handler_proc(irq, new); 484 register_handler_proc(irq, new);
463 485
@@ -477,6 +499,20 @@ mismatch:
477} 499}
478 500
479/** 501/**
502 * setup_irq - setup an interrupt
503 * @irq: Interrupt line to setup
504 * @act: irqaction for the interrupt
505 *
506 * Used to statically setup interrupts in the early boot process.
507 */
508int setup_irq(unsigned int irq, struct irqaction *act)
509{
510 struct irq_desc *desc = irq_to_desc(irq);
511
512 return __setup_irq(irq, desc, act);
513}
514
515/**
480 * free_irq - free an interrupt 516 * free_irq - free an interrupt
481 * @irq: Interrupt line to free 517 * @irq: Interrupt line to free
482 * @dev_id: Device identity to free 518 * @dev_id: Device identity to free
@@ -492,15 +528,15 @@ mismatch:
492 */ 528 */
493void free_irq(unsigned int irq, void *dev_id) 529void free_irq(unsigned int irq, void *dev_id)
494{ 530{
495 struct irq_desc *desc; 531 struct irq_desc *desc = irq_to_desc(irq);
496 struct irqaction **p; 532 struct irqaction **p;
497 unsigned long flags; 533 unsigned long flags;
498 534
499 WARN_ON(in_interrupt()); 535 WARN_ON(in_interrupt());
500 if (irq >= NR_IRQS) 536
537 if (!desc)
501 return; 538 return;
502 539
503 desc = irq_desc + irq;
504 spin_lock_irqsave(&desc->lock, flags); 540 spin_lock_irqsave(&desc->lock, flags);
505 p = &desc->action; 541 p = &desc->action;
506 for (;;) { 542 for (;;) {
@@ -589,12 +625,14 @@ EXPORT_SYMBOL(free_irq);
589 * IRQF_SHARED Interrupt is shared 625 * IRQF_SHARED Interrupt is shared
590 * IRQF_DISABLED Disable local interrupts while processing 626 * IRQF_DISABLED Disable local interrupts while processing
591 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 627 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
628 * IRQF_TRIGGER_* Specify active edge(s) or level
592 * 629 *
593 */ 630 */
594int request_irq(unsigned int irq, irq_handler_t handler, 631int request_irq(unsigned int irq, irq_handler_t handler,
595 unsigned long irqflags, const char *devname, void *dev_id) 632 unsigned long irqflags, const char *devname, void *dev_id)
596{ 633{
597 struct irqaction *action; 634 struct irqaction *action;
635 struct irq_desc *desc;
598 int retval; 636 int retval;
599 637
600#ifdef CONFIG_LOCKDEP 638#ifdef CONFIG_LOCKDEP
@@ -611,9 +649,12 @@ int request_irq(unsigned int irq, irq_handler_t handler,
611 */ 649 */
612 if ((irqflags & IRQF_SHARED) && !dev_id) 650 if ((irqflags & IRQF_SHARED) && !dev_id)
613 return -EINVAL; 651 return -EINVAL;
614 if (irq >= NR_IRQS) 652
653 desc = irq_to_desc(irq);
654 if (!desc)
615 return -EINVAL; 655 return -EINVAL;
616 if (irq_desc[irq].status & IRQ_NOREQUEST) 656
657 if (desc->status & IRQ_NOREQUEST)
617 return -EINVAL; 658 return -EINVAL;
618 if (!handler) 659 if (!handler)
619 return -EINVAL; 660 return -EINVAL;
@@ -629,26 +670,29 @@ int request_irq(unsigned int irq, irq_handler_t handler,
629 action->next = NULL; 670 action->next = NULL;
630 action->dev_id = dev_id; 671 action->dev_id = dev_id;
631 672
673 retval = __setup_irq(irq, desc, action);
674 if (retval)
675 kfree(action);
676
632#ifdef CONFIG_DEBUG_SHIRQ 677#ifdef CONFIG_DEBUG_SHIRQ
633 if (irqflags & IRQF_SHARED) { 678 if (irqflags & IRQF_SHARED) {
634 /* 679 /*
635 * It's a shared IRQ -- the driver ought to be prepared for it 680 * It's a shared IRQ -- the driver ought to be prepared for it
636 * to happen immediately, so let's make sure.... 681 * to happen immediately, so let's make sure....
637 * We do this before actually registering it, to make sure that 682 * We disable the irq to make sure that a 'real' IRQ doesn't
638 * a 'real' IRQ doesn't run in parallel with our fake 683 * run in parallel with our fake.
639 */ 684 */
640 unsigned long flags; 685 unsigned long flags;
641 686
687 disable_irq(irq);
642 local_irq_save(flags); 688 local_irq_save(flags);
689
643 handler(irq, dev_id); 690 handler(irq, dev_id);
691
644 local_irq_restore(flags); 692 local_irq_restore(flags);
693 enable_irq(irq);
645 } 694 }
646#endif 695#endif
647
648 retval = setup_irq(irq, action);
649 if (retval)
650 kfree(action);
651
652 return retval; 696 return retval;
653} 697}
654EXPORT_SYMBOL(request_irq); 698EXPORT_SYMBOL(request_irq);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 77b7acc875c5..90b920d3f52b 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -3,18 +3,18 @@
3 3
4void set_pending_irq(unsigned int irq, cpumask_t mask) 4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{ 5{
6 struct irq_desc *desc = irq_desc + irq; 6 struct irq_desc *desc = irq_to_desc(irq);
7 unsigned long flags; 7 unsigned long flags;
8 8
9 spin_lock_irqsave(&desc->lock, flags); 9 spin_lock_irqsave(&desc->lock, flags);
10 desc->status |= IRQ_MOVE_PENDING; 10 desc->status |= IRQ_MOVE_PENDING;
11 irq_desc[irq].pending_mask = mask; 11 desc->pending_mask = mask;
12 spin_unlock_irqrestore(&desc->lock, flags); 12 spin_unlock_irqrestore(&desc->lock, flags);
13} 13}
14 14
15void move_masked_irq(int irq) 15void move_masked_irq(int irq)
16{ 16{
17 struct irq_desc *desc = irq_desc + irq; 17 struct irq_desc *desc = irq_to_desc(irq);
18 cpumask_t tmp; 18 cpumask_t tmp;
19 19
20 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 20 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
@@ -30,7 +30,7 @@ void move_masked_irq(int irq)
30 30
31 desc->status &= ~IRQ_MOVE_PENDING; 31 desc->status &= ~IRQ_MOVE_PENDING;
32 32
33 if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) 33 if (unlikely(cpus_empty(desc->pending_mask)))
34 return; 34 return;
35 35
36 if (!desc->chip->set_affinity) 36 if (!desc->chip->set_affinity)
@@ -38,7 +38,7 @@ void move_masked_irq(int irq)
38 38
39 assert_spin_locked(&desc->lock); 39 assert_spin_locked(&desc->lock);
40 40
41 cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); 41 cpus_and(tmp, desc->pending_mask, cpu_online_map);
42 42
43 /* 43 /*
44 * If there was a valid mask to work with, please 44 * If there was a valid mask to work with, please
@@ -55,12 +55,12 @@ void move_masked_irq(int irq)
55 if (likely(!cpus_empty(tmp))) { 55 if (likely(!cpus_empty(tmp))) {
56 desc->chip->set_affinity(irq,tmp); 56 desc->chip->set_affinity(irq,tmp);
57 } 57 }
58 cpus_clear(irq_desc[irq].pending_mask); 58 cpus_clear(desc->pending_mask);
59} 59}
60 60
61void move_native_irq(int irq) 61void move_native_irq(int irq)
62{ 62{
63 struct irq_desc *desc = irq_desc + irq; 63 struct irq_desc *desc = irq_to_desc(irq);
64 64
65 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 65 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
66 return; 66 return;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a09dd29c2fd7..4d161c70ba55 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;
19 19
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_desc + (long)m->private; 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 cpumask_t *mask = &desc->affinity; 23 cpumask_t *mask = &desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -43,7 +43,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
43 cpumask_t new_value; 43 cpumask_t new_value;
44 int err; 44 int err;
45 45
46 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
47 irq_balancing_disabled(irq)) 47 irq_balancing_disabled(irq))
48 return -EIO; 48 return -EIO;
49 49
@@ -132,20 +132,20 @@ static const struct file_operations default_affinity_proc_fops = {
132static int irq_spurious_read(char *page, char **start, off_t off, 132static int irq_spurious_read(char *page, char **start, off_t off,
133 int count, int *eof, void *data) 133 int count, int *eof, void *data)
134{ 134{
135 struct irq_desc *d = &irq_desc[(long) data]; 135 struct irq_desc *desc = irq_to_desc((long) data);
136 return sprintf(page, "count %u\n" 136 return sprintf(page, "count %u\n"
137 "unhandled %u\n" 137 "unhandled %u\n"
138 "last_unhandled %u ms\n", 138 "last_unhandled %u ms\n",
139 d->irq_count, 139 desc->irq_count,
140 d->irqs_unhandled, 140 desc->irqs_unhandled,
141 jiffies_to_msecs(d->last_unhandled)); 141 jiffies_to_msecs(desc->last_unhandled));
142} 142}
143 143
144#define MAX_NAMELEN 128 144#define MAX_NAMELEN 128
145 145
146static int name_unique(unsigned int irq, struct irqaction *new_action) 146static int name_unique(unsigned int irq, struct irqaction *new_action)
147{ 147{
148 struct irq_desc *desc = irq_desc + irq; 148 struct irq_desc *desc = irq_to_desc(irq);
149 struct irqaction *action; 149 struct irqaction *action;
150 unsigned long flags; 150 unsigned long flags;
151 int ret = 1; 151 int ret = 1;
@@ -165,8 +165,9 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
165void register_handler_proc(unsigned int irq, struct irqaction *action) 165void register_handler_proc(unsigned int irq, struct irqaction *action)
166{ 166{
167 char name [MAX_NAMELEN]; 167 char name [MAX_NAMELEN];
168 struct irq_desc *desc = irq_to_desc(irq);
168 169
169 if (!irq_desc[irq].dir || action->dir || !action->name || 170 if (!desc->dir || action->dir || !action->name ||
170 !name_unique(irq, action)) 171 !name_unique(irq, action))
171 return; 172 return;
172 173
@@ -174,36 +175,34 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
174 snprintf(name, MAX_NAMELEN, "%s", action->name); 175 snprintf(name, MAX_NAMELEN, "%s", action->name);
175 176
176 /* create /proc/irq/1234/handler/ */ 177 /* create /proc/irq/1234/handler/ */
177 action->dir = proc_mkdir(name, irq_desc[irq].dir); 178 action->dir = proc_mkdir(name, desc->dir);
178} 179}
179 180
180#undef MAX_NAMELEN 181#undef MAX_NAMELEN
181 182
182#define MAX_NAMELEN 10 183#define MAX_NAMELEN 10
183 184
184void register_irq_proc(unsigned int irq) 185void register_irq_proc(unsigned int irq, struct irq_desc *desc)
185{ 186{
186 char name [MAX_NAMELEN]; 187 char name [MAX_NAMELEN];
187 struct proc_dir_entry *entry; 188 struct proc_dir_entry *entry;
188 189
189 if (!root_irq_dir || 190 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
190 (irq_desc[irq].chip == &no_irq_chip) ||
191 irq_desc[irq].dir)
192 return; 191 return;
193 192
194 memset(name, 0, MAX_NAMELEN); 193 memset(name, 0, MAX_NAMELEN);
195 sprintf(name, "%d", irq); 194 sprintf(name, "%d", irq);
196 195
197 /* create /proc/irq/1234 */ 196 /* create /proc/irq/1234 */
198 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); 197 desc->dir = proc_mkdir(name, root_irq_dir);
199 198
200#ifdef CONFIG_SMP 199#ifdef CONFIG_SMP
201 /* create /proc/irq/<irq>/smp_affinity */ 200 /* create /proc/irq/<irq>/smp_affinity */
202 proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, 201 proc_create_data("smp_affinity", 0600, desc->dir,
203 &irq_affinity_proc_fops, (void *)(long)irq); 202 &irq_affinity_proc_fops, (void *)(long)irq);
204#endif 203#endif
205 204
206 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); 205 entry = create_proc_entry("spurious", 0444, desc->dir);
207 if (entry) { 206 if (entry) {
208 entry->data = (void *)(long)irq; 207 entry->data = (void *)(long)irq;
209 entry->read_proc = irq_spurious_read; 208 entry->read_proc = irq_spurious_read;
@@ -214,11 +213,14 @@ void register_irq_proc(unsigned int irq)
214 213
215void unregister_handler_proc(unsigned int irq, struct irqaction *action) 214void unregister_handler_proc(unsigned int irq, struct irqaction *action)
216{ 215{
217 if (action->dir) 216 if (action->dir) {
218 remove_proc_entry(action->dir->name, irq_desc[irq].dir); 217 struct irq_desc *desc = irq_to_desc(irq);
218
219 remove_proc_entry(action->dir->name, desc->dir);
220 }
219} 221}
220 222
221void register_default_affinity_proc(void) 223static void register_default_affinity_proc(void)
222{ 224{
223#ifdef CONFIG_SMP 225#ifdef CONFIG_SMP
224 proc_create("irq/default_smp_affinity", 0600, NULL, 226 proc_create("irq/default_smp_affinity", 0600, NULL,
@@ -228,7 +230,8 @@ void register_default_affinity_proc(void)
228 230
229void init_irq_proc(void) 231void init_irq_proc(void)
230{ 232{
231 int i; 233 unsigned int irq;
234 struct irq_desc *desc;
232 235
233 /* create /proc/irq */ 236 /* create /proc/irq */
234 root_irq_dir = proc_mkdir("irq", NULL); 237 root_irq_dir = proc_mkdir("irq", NULL);
@@ -240,7 +243,7 @@ void init_irq_proc(void)
240 /* 243 /*
241 * Create entries for all existing IRQs. 244 * Create entries for all existing IRQs.
242 */ 245 */
243 for (i = 0; i < NR_IRQS; i++) 246 for_each_irq_desc(irq, desc)
244 register_irq_proc(i); 247 register_irq_proc(irq, desc);
245} 248}
246 249
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index a8046791ba2d..89c7117acf2b 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -33,10 +33,10 @@ static void resend_irqs(unsigned long arg)
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 int irq; 34 int irq;
35 35
36 while (!bitmap_empty(irqs_resend, NR_IRQS)) { 36 while (!bitmap_empty(irqs_resend, nr_irqs)) {
37 irq = find_first_bit(irqs_resend, NR_IRQS); 37 irq = find_first_bit(irqs_resend, nr_irqs);
38 clear_bit(irq, irqs_resend); 38 clear_bit(irq, irqs_resend);
39 desc = irq_desc + irq; 39 desc = irq_to_desc(irq);
40 local_irq_disable(); 40 local_irq_disable();
41 desc->handle_irq(irq, desc); 41 desc->handle_irq(irq, desc);
42 local_irq_enable(); 42 local_irq_enable();
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index c66d3f10e853..dd364c11e56e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -12,83 +12,122 @@
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/timer.h>
15 16
16static int irqfixup __read_mostly; 17static int irqfixup __read_mostly;
17 18
19#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
20static void poll_spurious_irqs(unsigned long dummy);
21static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
22
18/* 23/*
19 * Recovery handler for misrouted interrupts. 24 * Recovery handler for misrouted interrupts.
20 */ 25 */
21static int misrouted_irq(int irq) 26static int try_one_irq(int irq, struct irq_desc *desc)
22{ 27{
23 int i; 28 struct irqaction *action;
24 int ok = 0; 29 int ok = 0, work = 0;
25 int work = 0; /* Did we do work for a real IRQ */
26
27 for (i = 1; i < NR_IRQS; i++) {
28 struct irq_desc *desc = irq_desc + i;
29 struct irqaction *action;
30
31 if (i == irq) /* Already tried */
32 continue;
33 30
34 spin_lock(&desc->lock); 31 spin_lock(&desc->lock);
35 /* Already running on another processor */ 32 /* Already running on another processor */
36 if (desc->status & IRQ_INPROGRESS) { 33 if (desc->status & IRQ_INPROGRESS) {
37 /* 34 /*
38 * Already running: If it is shared get the other 35 * Already running: If it is shared get the other
39 * CPU to go looking for our mystery interrupt too 36 * CPU to go looking for our mystery interrupt too
40 */ 37 */
41 if (desc->action && (desc->action->flags & IRQF_SHARED)) 38 if (desc->action && (desc->action->flags & IRQF_SHARED))
42 desc->status |= IRQ_PENDING; 39 desc->status |= IRQ_PENDING;
43 spin_unlock(&desc->lock);
44 continue;
45 }
46 /* Honour the normal IRQ locking */
47 desc->status |= IRQ_INPROGRESS;
48 action = desc->action;
49 spin_unlock(&desc->lock); 40 spin_unlock(&desc->lock);
41 return ok;
42 }
43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action;
46 spin_unlock(&desc->lock);
50 47
51 while (action) { 48 while (action) {
52 /* Only shared IRQ handlers are safe to call */ 49 /* Only shared IRQ handlers are safe to call */
53 if (action->flags & IRQF_SHARED) { 50 if (action->flags & IRQF_SHARED) {
54 if (action->handler(i, action->dev_id) == 51 if (action->handler(irq, action->dev_id) ==
55 IRQ_HANDLED) 52 IRQ_HANDLED)
56 ok = 1; 53 ok = 1;
57 }
58 action = action->next;
59 } 54 }
60 local_irq_disable(); 55 action = action->next;
61 /* Now clean up the flags */ 56 }
62 spin_lock(&desc->lock); 57 local_irq_disable();
63 action = desc->action; 58 /* Now clean up the flags */
59 spin_lock(&desc->lock);
60 action = desc->action;
64 61
62 /*
63 * While we were looking for a fixup someone queued a real
64 * IRQ clashing with our walk:
65 */
66 while ((desc->status & IRQ_PENDING) && action) {
65 /* 67 /*
66 * While we were looking for a fixup someone queued a real 68 * Perform real IRQ processing for the IRQ we deferred
67 * IRQ clashing with our walk:
68 */
69 while ((desc->status & IRQ_PENDING) && action) {
70 /*
71 * Perform real IRQ processing for the IRQ we deferred
72 */
73 work = 1;
74 spin_unlock(&desc->lock);
75 handle_IRQ_event(i, action);
76 spin_lock(&desc->lock);
77 desc->status &= ~IRQ_PENDING;
78 }
79 desc->status &= ~IRQ_INPROGRESS;
80 /*
81 * If we did actual work for the real IRQ line we must let the
82 * IRQ controller clean up too
83 */ 69 */
84 if (work && desc->chip && desc->chip->end) 70 work = 1;
85 desc->chip->end(i);
86 spin_unlock(&desc->lock); 71 spin_unlock(&desc->lock);
72 handle_IRQ_event(irq, action);
73 spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING;
75 }
76 desc->status &= ~IRQ_INPROGRESS;
77 /*
78 * If we did actual work for the real IRQ line we must let the
79 * IRQ controller clean up too
80 */
81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq);
83 spin_unlock(&desc->lock);
84
85 return ok;
86}
87
88static int misrouted_irq(int irq)
89{
90 struct irq_desc *desc;
91 int i, ok = 0;
92
93 for_each_irq_desc(i, desc) {
94 if (!i)
95 continue;
96
97 if (i == irq) /* Already tried */
98 continue;
99
100 if (try_one_irq(i, desc))
101 ok = 1;
87 } 102 }
88 /* So the caller can adjust the irq error counts */ 103 /* So the caller can adjust the irq error counts */
89 return ok; 104 return ok;
90} 105}
91 106
107static void poll_spurious_irqs(unsigned long dummy)
108{
109 struct irq_desc *desc;
110 int i;
111
112 for_each_irq_desc(i, desc) {
113 unsigned int status;
114
115 if (!i)
116 continue;
117
118 /* Racy but it doesn't matter */
119 status = desc->status;
120 barrier();
121 if (!(status & IRQ_SPURIOUS_DISABLED))
122 continue;
123
124 try_one_irq(i, desc);
125 }
126
127 mod_timer(&poll_spurious_irq_timer,
128 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
129}
130
92/* 131/*
93 * If 99,900 of the previous 100,000 interrupts have not been handled 132 * If 99,900 of the previous 100,000 interrupts have not been handled
94 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 133 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -137,7 +176,9 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
137 } 176 }
138} 177}
139 178
140static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) 179static inline int
180try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
181 irqreturn_t action_ret)
141{ 182{
142 struct irqaction *action; 183 struct irqaction *action;
143 184
@@ -212,6 +253,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
212 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 253 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
213 desc->depth++; 254 desc->depth++;
214 desc->chip->disable(irq); 255 desc->chip->disable(irq);
256
257 mod_timer(&poll_spurious_irq_timer,
258 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
215 } 259 }
216 desc->irqs_unhandled = 0; 260 desc->irqs_unhandled = 0;
217} 261}
@@ -241,7 +285,7 @@ static int __init irqfixup_setup(char *str)
241 285
242__setup("irqfixup", irqfixup_setup); 286__setup("irqfixup", irqfixup_setup);
243module_param(irqfixup, int, 0644); 287module_param(irqfixup, int, 0644);
244MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); 288MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
245 289
246static int __init irqpoll_setup(char *str) 290static int __init irqpoll_setup(char *str)
247{ 291{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index ab982747d9bd..db7c358b9a02 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value)
55 spin_unlock_irq(&tsk->sighand->siglock); 55 spin_unlock_irq(&tsk->sighand->siglock);
56 break; 56 break;
57 case ITIMER_VIRTUAL: 57 case ITIMER_VIRTUAL:
58 read_lock(&tasklist_lock);
59 spin_lock_irq(&tsk->sighand->siglock); 58 spin_lock_irq(&tsk->sighand->siglock);
60 cval = tsk->signal->it_virt_expires; 59 cval = tsk->signal->it_virt_expires;
61 cinterval = tsk->signal->it_virt_incr; 60 cinterval = tsk->signal->it_virt_incr;
62 if (!cputime_eq(cval, cputime_zero)) { 61 if (!cputime_eq(cval, cputime_zero)) {
63 struct task_struct *t = tsk; 62 struct task_cputime cputime;
64 cputime_t utime = tsk->signal->utime; 63 cputime_t utime;
65 do { 64
66 utime = cputime_add(utime, t->utime); 65 thread_group_cputime(tsk, &cputime);
67 t = next_thread(t); 66 utime = cputime.utime;
68 } while (t != tsk);
69 if (cputime_le(cval, utime)) { /* about to fire */ 67 if (cputime_le(cval, utime)) { /* about to fire */
70 cval = jiffies_to_cputime(1); 68 cval = jiffies_to_cputime(1);
71 } else { 69 } else {
@@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value)
73 } 71 }
74 } 72 }
75 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
76 read_unlock(&tasklist_lock);
77 cputime_to_timeval(cval, &value->it_value); 74 cputime_to_timeval(cval, &value->it_value);
78 cputime_to_timeval(cinterval, &value->it_interval); 75 cputime_to_timeval(cinterval, &value->it_interval);
79 break; 76 break;
80 case ITIMER_PROF: 77 case ITIMER_PROF:
81 read_lock(&tasklist_lock);
82 spin_lock_irq(&tsk->sighand->siglock); 78 spin_lock_irq(&tsk->sighand->siglock);
83 cval = tsk->signal->it_prof_expires; 79 cval = tsk->signal->it_prof_expires;
84 cinterval = tsk->signal->it_prof_incr; 80 cinterval = tsk->signal->it_prof_incr;
85 if (!cputime_eq(cval, cputime_zero)) { 81 if (!cputime_eq(cval, cputime_zero)) {
86 struct task_struct *t = tsk; 82 struct task_cputime times;
87 cputime_t ptime = cputime_add(tsk->signal->utime, 83 cputime_t ptime;
88 tsk->signal->stime); 84
89 do { 85 thread_group_cputime(tsk, &times);
90 ptime = cputime_add(ptime, 86 ptime = cputime_add(times.utime, times.stime);
91 cputime_add(t->utime,
92 t->stime));
93 t = next_thread(t);
94 } while (t != tsk);
95 if (cputime_le(cval, ptime)) { /* about to fire */ 87 if (cputime_le(cval, ptime)) { /* about to fire */
96 cval = jiffies_to_cputime(1); 88 cval = jiffies_to_cputime(1);
97 } else { 89 } else {
@@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value)
99 } 91 }
100 } 92 }
101 spin_unlock_irq(&tsk->sighand->siglock); 93 spin_unlock_irq(&tsk->sighand->siglock);
102 read_unlock(&tasklist_lock);
103 cputime_to_timeval(cval, &value->it_value); 94 cputime_to_timeval(cval, &value->it_value);
104 cputime_to_timeval(cinterval, &value->it_interval); 95 cputime_to_timeval(cinterval, &value->it_interval);
105 break; 96 break;
@@ -185,7 +176,6 @@ again:
185 case ITIMER_VIRTUAL: 176 case ITIMER_VIRTUAL:
186 nval = timeval_to_cputime(&value->it_value); 177 nval = timeval_to_cputime(&value->it_value);
187 ninterval = timeval_to_cputime(&value->it_interval); 178 ninterval = timeval_to_cputime(&value->it_interval);
188 read_lock(&tasklist_lock);
189 spin_lock_irq(&tsk->sighand->siglock); 179 spin_lock_irq(&tsk->sighand->siglock);
190 cval = tsk->signal->it_virt_expires; 180 cval = tsk->signal->it_virt_expires;
191 cinterval = tsk->signal->it_virt_incr; 181 cinterval = tsk->signal->it_virt_incr;
@@ -200,7 +190,6 @@ again:
200 tsk->signal->it_virt_expires = nval; 190 tsk->signal->it_virt_expires = nval;
201 tsk->signal->it_virt_incr = ninterval; 191 tsk->signal->it_virt_incr = ninterval;
202 spin_unlock_irq(&tsk->sighand->siglock); 192 spin_unlock_irq(&tsk->sighand->siglock);
203 read_unlock(&tasklist_lock);
204 if (ovalue) { 193 if (ovalue) {
205 cputime_to_timeval(cval, &ovalue->it_value); 194 cputime_to_timeval(cval, &ovalue->it_value);
206 cputime_to_timeval(cinterval, &ovalue->it_interval); 195 cputime_to_timeval(cinterval, &ovalue->it_interval);
@@ -209,7 +198,6 @@ again:
209 case ITIMER_PROF: 198 case ITIMER_PROF:
210 nval = timeval_to_cputime(&value->it_value); 199 nval = timeval_to_cputime(&value->it_value);
211 ninterval = timeval_to_cputime(&value->it_interval); 200 ninterval = timeval_to_cputime(&value->it_interval);
212 read_lock(&tasklist_lock);
213 spin_lock_irq(&tsk->sighand->siglock); 201 spin_lock_irq(&tsk->sighand->siglock);
214 cval = tsk->signal->it_prof_expires; 202 cval = tsk->signal->it_prof_expires;
215 cinterval = tsk->signal->it_prof_incr; 203 cinterval = tsk->signal->it_prof_incr;
@@ -224,7 +212,6 @@ again:
224 tsk->signal->it_prof_expires = nval; 212 tsk->signal->it_prof_expires = nval;
225 tsk->signal->it_prof_incr = ninterval; 213 tsk->signal->it_prof_incr = ninterval;
226 spin_unlock_irq(&tsk->sighand->siglock); 214 spin_unlock_irq(&tsk->sighand->siglock);
227 read_unlock(&tasklist_lock);
228 if (ovalue) { 215 if (ovalue) {
229 cputime_to_timeval(cval, &ovalue->it_value); 216 cputime_to_timeval(cval, &ovalue->it_value);
230 cputime_to_timeval(cinterval, &ovalue->it_interval); 217 cputime_to_timeval(cinterval, &ovalue->it_interval);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 38fc10ac7541..7b8b0f21a5b1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -260,7 +260,6 @@ const char *kallsyms_lookup(unsigned long addr,
260 /* see if it's in a module */ 260 /* see if it's in a module */
261 return module_address_lookup(addr, symbolsize, offset, modname, 261 return module_address_lookup(addr, symbolsize, offset, modname,
262 namebuf); 262 namebuf);
263 return NULL;
264} 263}
265 264
266int lookup_symbol_name(unsigned long addr, char *symname) 265int lookup_symbol_name(unsigned long addr, char *symname)
@@ -305,17 +304,24 @@ int sprint_symbol(char *buffer, unsigned long address)
305 char *modname; 304 char *modname;
306 const char *name; 305 const char *name;
307 unsigned long offset, size; 306 unsigned long offset, size;
308 char namebuf[KSYM_NAME_LEN]; 307 int len;
309 308
310 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); 309 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
311 if (!name) 310 if (!name)
312 return sprintf(buffer, "0x%lx", address); 311 return sprintf(buffer, "0x%lx", address);
313 312
313 if (name != buffer)
314 strcpy(buffer, name);
315 len = strlen(buffer);
316 buffer += len;
317
314 if (modname) 318 if (modname)
315 return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, 319 len += sprintf(buffer, "+%#lx/%#lx [%s]",
316 size, modname); 320 offset, size, modname);
317 else 321 else
318 return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); 322 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
323
324 return len;
319} 325}
320 326
321/* Look up a kernel symbol and print it to the kernel messages. */ 327/* Look up a kernel symbol and print it to the kernel messages. */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index aef265325cd3..ac0fde7b54d0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -30,6 +30,7 @@
30#include <linux/pm.h> 30#include <linux/pm.h>
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h>
33 34
34#include <asm/page.h> 35#include <asm/page.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
@@ -1371,6 +1372,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1371 VMCOREINFO_SYMBOL(node_online_map); 1372 VMCOREINFO_SYMBOL(node_online_map);
1372 VMCOREINFO_SYMBOL(swapper_pg_dir); 1373 VMCOREINFO_SYMBOL(swapper_pg_dir);
1373 VMCOREINFO_SYMBOL(_stext); 1374 VMCOREINFO_SYMBOL(_stext);
1375 VMCOREINFO_SYMBOL(vmlist);
1374 1376
1375#ifndef CONFIG_NEED_MULTIPLE_NODES 1377#ifndef CONFIG_NEED_MULTIPLE_NODES
1376 VMCOREINFO_SYMBOL(mem_map); 1378 VMCOREINFO_SYMBOL(mem_map);
@@ -1406,6 +1408,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1406 VMCOREINFO_OFFSET(free_area, free_list); 1408 VMCOREINFO_OFFSET(free_area, free_list);
1407 VMCOREINFO_OFFSET(list_head, next); 1409 VMCOREINFO_OFFSET(list_head, next);
1408 VMCOREINFO_OFFSET(list_head, prev); 1410 VMCOREINFO_OFFSET(list_head, prev);
1411 VMCOREINFO_OFFSET(vm_struct, addr);
1409 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1412 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1410 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1413 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1411 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1414 VMCOREINFO_NUMBER(NR_FREE_PAGES);
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 25d955dbb989..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
590 590
591 /* Signal the primary CPU that we are done: */ 591 /* Signal the primary CPU that we are done: */
592 atomic_set(&cpu_in_kgdb[cpu], 0); 592 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog();
593 clocksource_touch_watchdog(); 594 clocksource_touch_watchdog();
594 local_irq_restore(flags); 595 local_irq_restore(flags);
595} 596}
@@ -1432,6 +1433,7 @@ acquirelock:
1432 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
1433 1434
1434 atomic_set(&kgdb_active, -1); 1435 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog();
1435 clocksource_touch_watchdog(); 1437 clocksource_touch_watchdog();
1436 local_irq_restore(flags); 1438 local_irq_restore(flags);
1437 1439
@@ -1524,6 +1526,7 @@ acquirelock:
1524kgdb_restore: 1526kgdb_restore:
1525 /* Free kgdb_active */ 1527 /* Free kgdb_active */
1526 atomic_set(&kgdb_active, -1); 1528 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog();
1527 clocksource_touch_watchdog(); 1530 clocksource_touch_watchdog();
1528 local_irq_restore(flags); 1531 local_irq_restore(flags);
1529 1532
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2456d1a0befb..3d3c3ea3a023 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -113,7 +113,7 @@ int request_module(const char *fmt, ...)
113 return ret; 113 return ret;
114} 114}
115EXPORT_SYMBOL(request_module); 115EXPORT_SYMBOL(request_module);
116#endif /* CONFIG_KMOD */ 116#endif /* CONFIG_MODULES */
117 117
118struct subprocess_info { 118struct subprocess_info {
119 struct work_struct work; 119 struct work_struct work;
@@ -265,7 +265,7 @@ static void __call_usermodehelper(struct work_struct *work)
265 } 265 }
266} 266}
267 267
268#ifdef CONFIG_PM 268#ifdef CONFIG_PM_SLEEP
269/* 269/*
270 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 270 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
271 * (used for preventing user land processes from being created after the user 271 * (used for preventing user land processes from being created after the user
@@ -288,39 +288,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
288 */ 288 */
289#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 289#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
290 290
291static int usermodehelper_pm_callback(struct notifier_block *nfb, 291/**
292 unsigned long action, 292 * usermodehelper_disable - prevent new helpers from being started
293 void *ignored) 293 */
294int usermodehelper_disable(void)
294{ 295{
295 long retval; 296 long retval;
296 297
297 switch (action) { 298 usermodehelper_disabled = 1;
298 case PM_HIBERNATION_PREPARE: 299 smp_mb();
299 case PM_SUSPEND_PREPARE: 300 /*
300 usermodehelper_disabled = 1; 301 * From now on call_usermodehelper_exec() won't start any new
301 smp_mb(); 302 * helpers, so it is sufficient if running_helpers turns out to
302 /* 303 * be zero at one point (it may be increased later, but that
303 * From now on call_usermodehelper_exec() won't start any new 304 * doesn't matter).
304 * helpers, so it is sufficient if running_helpers turns out to 305 */
305 * be zero at one point (it may be increased later, but that 306 retval = wait_event_timeout(running_helpers_waitq,
306 * doesn't matter).
307 */
308 retval = wait_event_timeout(running_helpers_waitq,
309 atomic_read(&running_helpers) == 0, 307 atomic_read(&running_helpers) == 0,
310 RUNNING_HELPERS_TIMEOUT); 308 RUNNING_HELPERS_TIMEOUT);
311 if (retval) { 309 if (retval)
312 return NOTIFY_OK; 310 return 0;
313 } else {
314 usermodehelper_disabled = 0;
315 return NOTIFY_BAD;
316 }
317 case PM_POST_HIBERNATION:
318 case PM_POST_SUSPEND:
319 usermodehelper_disabled = 0;
320 return NOTIFY_OK;
321 }
322 311
323 return NOTIFY_DONE; 312 usermodehelper_disabled = 0;
313 return -EAGAIN;
314}
315
316/**
317 * usermodehelper_enable - allow new helpers to be started again
318 */
319void usermodehelper_enable(void)
320{
321 usermodehelper_disabled = 0;
324} 322}
325 323
326static void helper_lock(void) 324static void helper_lock(void)
@@ -334,18 +332,12 @@ static void helper_unlock(void)
334 if (atomic_dec_and_test(&running_helpers)) 332 if (atomic_dec_and_test(&running_helpers))
335 wake_up(&running_helpers_waitq); 333 wake_up(&running_helpers_waitq);
336} 334}
337 335#else /* CONFIG_PM_SLEEP */
338static void register_pm_notifier_callback(void)
339{
340 pm_notifier(usermodehelper_pm_callback, 0);
341}
342#else /* CONFIG_PM */
343#define usermodehelper_disabled 0 336#define usermodehelper_disabled 0
344 337
345static inline void helper_lock(void) {} 338static inline void helper_lock(void) {}
346static inline void helper_unlock(void) {} 339static inline void helper_unlock(void) {}
347static inline void register_pm_notifier_callback(void) {} 340#endif /* CONFIG_PM_SLEEP */
348#endif /* CONFIG_PM */
349 341
350/** 342/**
351 * call_usermodehelper_setup - prepare to call a usermode helper 343 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -515,5 +507,4 @@ void __init usermodehelper_init(void)
515{ 507{
516 khelper_wq = create_singlethread_workqueue("khelper"); 508 khelper_wq = create_singlethread_workqueue("khelper");
517 BUG_ON(!khelper_wq); 509 BUG_ON(!khelper_wq);
518 register_pm_notifier_callback();
519} 510}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 75bc2cd9ebc6..9f8a3f25259a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,7 +72,7 @@ static bool kprobe_enabled;
72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct { 74static struct {
75 spinlock_t lock ____cacheline_aligned; 75 spinlock_t lock ____cacheline_aligned_in_smp;
76} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 76} kretprobe_table_locks[KPROBE_TABLE_SIZE];
77 77
78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
@@ -404,7 +404,7 @@ void kretprobe_hash_lock(struct task_struct *tsk,
404 spin_lock_irqsave(hlist_lock, *flags); 404 spin_lock_irqsave(hlist_lock, *flags);
405} 405}
406 406
407void kretprobe_table_lock(unsigned long hash, unsigned long *flags) 407static void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
408{ 408{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags); 410 spin_lock_irqsave(hlist_lock, *flags);
@@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p,
613 return -EINVAL; 613 return -EINVAL;
614 p->addr = addr; 614 p->addr = addr;
615 615
616 if (!kernel_text_address((unsigned long) p->addr) || 616 preempt_disable();
617 in_kprobes_functions((unsigned long) p->addr)) 617 if (!__kernel_text_address((unsigned long) p->addr) ||
618 in_kprobes_functions((unsigned long) p->addr)) {
619 preempt_enable();
618 return -EINVAL; 620 return -EINVAL;
621 }
619 622
620 p->mod_refcounted = 0; 623 p->mod_refcounted = 0;
621 624
622 /* 625 /*
623 * Check if are we probing a module. 626 * Check if are we probing a module.
624 */ 627 */
625 probed_mod = module_text_address((unsigned long) p->addr); 628 probed_mod = __module_text_address((unsigned long) p->addr);
626 if (probed_mod) { 629 if (probed_mod) {
627 struct module *calling_mod = module_text_address(called_from); 630 struct module *calling_mod;
631 calling_mod = __module_text_address(called_from);
628 /* 632 /*
629 * We must allow modules to probe themself and in this case 633 * We must allow modules to probe themself and in this case
630 * avoid incrementing the module refcount, so as to allow 634 * avoid incrementing the module refcount, so as to allow
631 * unloading of self probing modules. 635 * unloading of self probing modules.
632 */ 636 */
633 if (calling_mod && calling_mod != probed_mod) { 637 if (calling_mod && calling_mod != probed_mod) {
634 if (unlikely(!try_module_get(probed_mod))) 638 if (unlikely(!try_module_get(probed_mod))) {
639 preempt_enable();
635 return -EINVAL; 640 return -EINVAL;
641 }
636 p->mod_refcounted = 1; 642 p->mod_refcounted = 1;
637 } else 643 } else
638 probed_mod = NULL; 644 probed_mod = NULL;
639 } 645 }
646 preempt_enable();
640 647
641 p->nmissed = 0; 648 p->nmissed = 0;
642 INIT_LIST_HEAD(&p->list); 649 INIT_LIST_HEAD(&p->list);
@@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
718 struct kprobe *old_p; 725 struct kprobe *old_p;
719 726
720 if (p->mod_refcounted) { 727 if (p->mod_refcounted) {
728 /*
729 * Since we've already incremented refcount,
730 * we don't need to disable preemption.
731 */
721 mod = module_text_address((unsigned long)p->addr); 732 mod = module_text_address((unsigned long)p->addr);
722 if (mod) 733 if (mod)
723 module_put(mod); 734 module_put(mod);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e53bc30e9ba5..08dd8ed86c77 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18 19
19#define KERNEL_ATTR_RO(_name) \ 20#define KERNEL_ATTR_RO(_name) \
@@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
53KERNEL_ATTR_RW(uevent_helper); 54KERNEL_ATTR_RW(uevent_helper);
54#endif 55#endif
55 56
57#ifdef CONFIG_PROFILING
58static ssize_t profiling_show(struct kobject *kobj,
59 struct kobj_attribute *attr, char *buf)
60{
61 return sprintf(buf, "%d\n", prof_on);
62}
63static ssize_t profiling_store(struct kobject *kobj,
64 struct kobj_attribute *attr,
65 const char *buf, size_t count)
66{
67 int ret;
68
69 if (prof_on)
70 return -EEXIST;
71 /*
72 * This eventually calls into get_option() which
73 * has a ton of callers and is not const. It is
74 * easiest to cast it away here.
75 */
76 profile_setup((char *)buf);
77 ret = profile_init();
78 if (ret)
79 return ret;
80 ret = create_proc_profile();
81 if (ret)
82 return ret;
83 return count;
84}
85KERNEL_ATTR_RW(profiling);
86#endif
87
56#ifdef CONFIG_KEXEC 88#ifdef CONFIG_KEXEC
57static ssize_t kexec_loaded_show(struct kobject *kobj, 89static ssize_t kexec_loaded_show(struct kobject *kobj,
58 struct kobj_attribute *attr, char *buf) 90 struct kobj_attribute *attr, char *buf)
@@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = {
109 &uevent_seqnum_attr.attr, 141 &uevent_seqnum_attr.attr,
110 &uevent_helper_attr.attr, 142 &uevent_helper_attr.attr,
111#endif 143#endif
144#ifdef CONFIG_PROFILING
145 &profiling_attr.attr,
146#endif
112#ifdef CONFIG_KEXEC 147#ifdef CONFIG_KEXEC
113 &kexec_loaded_attr.attr, 148 &kexec_loaded_attr.attr,
114 &kexec_crash_loaded_attr.attr, 149 &kexec_crash_loaded_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 96cff2f8710b..8e7a7ce3ed0a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <trace/sched.h>
16 17
17#define KTHREAD_NICE_LEVEL (-5) 18#define KTHREAD_NICE_LEVEL (-5)
18 19
@@ -171,12 +172,11 @@ EXPORT_SYMBOL(kthread_create);
171 */ 172 */
172void kthread_bind(struct task_struct *k, unsigned int cpu) 173void kthread_bind(struct task_struct *k, unsigned int cpu)
173{ 174{
174 if (k->state != TASK_UNINTERRUPTIBLE) { 175 /* Must have done schedule() in kthread() before we set_task_cpu */
176 if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
175 WARN_ON(1); 177 WARN_ON(1);
176 return; 178 return;
177 } 179 }
178 /* Must have done schedule() in kthread() before we set_task_cpu */
179 wait_task_inactive(k, 0);
180 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 182 k->rt.nr_cpus_allowed = 1;
@@ -206,6 +206,8 @@ int kthread_stop(struct task_struct *k)
206 /* It could exit after stop_info.k set, but before wake_up_process. */ 206 /* It could exit after stop_info.k set, but before wake_up_process. */
207 get_task_struct(k); 207 get_task_struct(k);
208 208
209 trace_sched_kthread_stop(k);
210
209 /* Must init completion *before* thread sees kthread_stop_info.k */ 211 /* Must init completion *before* thread sees kthread_stop_info.k */
210 init_completion(&kthread_stop_info.done); 212 init_completion(&kthread_stop_info.done);
211 smp_wmb(); 213 smp_wmb();
@@ -221,6 +223,8 @@ int kthread_stop(struct task_struct *k)
221 ret = kthread_stop_info.err; 223 ret = kthread_stop_info.err;
222 mutex_unlock(&kthread_stop_lock); 224 mutex_unlock(&kthread_stop_lock);
223 225
226 trace_sched_kthread_stop_ret(ret);
227
224 return ret; 228 return ret;
225} 229}
226EXPORT_SYMBOL(kthread_stop); 230EXPORT_SYMBOL(kthread_stop);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index dbda475b13bd..06e157119d2b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2169,12 +2169,11 @@ void early_boot_irqs_on(void)
2169/* 2169/*
2170 * Hardirqs will be enabled: 2170 * Hardirqs will be enabled:
2171 */ 2171 */
2172void trace_hardirqs_on_caller(unsigned long a0) 2172void trace_hardirqs_on_caller(unsigned long ip)
2173{ 2173{
2174 struct task_struct *curr = current; 2174 struct task_struct *curr = current;
2175 unsigned long ip;
2176 2175
2177 time_hardirqs_on(CALLER_ADDR0, a0); 2176 time_hardirqs_on(CALLER_ADDR0, ip);
2178 2177
2179 if (unlikely(!debug_locks || current->lockdep_recursion)) 2178 if (unlikely(!debug_locks || current->lockdep_recursion))
2180 return; 2179 return;
@@ -2188,7 +2187,6 @@ void trace_hardirqs_on_caller(unsigned long a0)
2188 } 2187 }
2189 /* we'll do an OFF -> ON transition: */ 2188 /* we'll do an OFF -> ON transition: */
2190 curr->hardirqs_enabled = 1; 2189 curr->hardirqs_enabled = 1;
2191 ip = (unsigned long) __builtin_return_address(0);
2192 2190
2193 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2191 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2194 return; 2192 return;
@@ -2224,11 +2222,11 @@ EXPORT_SYMBOL(trace_hardirqs_on);
2224/* 2222/*
2225 * Hardirqs were disabled: 2223 * Hardirqs were disabled:
2226 */ 2224 */
2227void trace_hardirqs_off_caller(unsigned long a0) 2225void trace_hardirqs_off_caller(unsigned long ip)
2228{ 2226{
2229 struct task_struct *curr = current; 2227 struct task_struct *curr = current;
2230 2228
2231 time_hardirqs_off(CALLER_ADDR0, a0); 2229 time_hardirqs_off(CALLER_ADDR0, ip);
2232 2230
2233 if (unlikely(!debug_locks || current->lockdep_recursion)) 2231 if (unlikely(!debug_locks || current->lockdep_recursion))
2234 return; 2232 return;
@@ -2241,7 +2239,7 @@ void trace_hardirqs_off_caller(unsigned long a0)
2241 * We have done an ON -> OFF transition: 2239 * We have done an ON -> OFF transition:
2242 */ 2240 */
2243 curr->hardirqs_enabled = 0; 2241 curr->hardirqs_enabled = 0;
2244 curr->hardirq_disable_ip = _RET_IP_; 2242 curr->hardirq_disable_ip = ip;
2245 curr->hardirq_disable_event = ++curr->irq_events; 2243 curr->hardirq_disable_event = ++curr->irq_events;
2246 debug_atomic_inc(&hardirqs_off_events); 2244 debug_atomic_inc(&hardirqs_off_events);
2247 } else 2245 } else
@@ -3417,9 +3415,10 @@ retry:
3417 } 3415 }
3418 printk(" ignoring it.\n"); 3416 printk(" ignoring it.\n");
3419 unlock = 0; 3417 unlock = 0;
3418 } else {
3419 if (count != 10)
3420 printk(KERN_CONT " locked it.\n");
3420 } 3421 }
3421 if (count != 10)
3422 printk(" locked it.\n");
3423 3422
3424 do_each_thread(g, p) { 3423 do_each_thread(g, p) {
3425 /* 3424 /*
diff --git a/kernel/marker.c b/kernel/marker.c
index 7d1faecd7a51..e9c6b2bc9400 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -62,7 +62,7 @@ struct marker_entry {
62 int refcount; /* Number of times armed. 0 if disarmed. */ 62 int refcount; /* Number of times armed. 0 if disarmed. */
63 struct rcu_head rcu; 63 struct rcu_head rcu;
64 void *oldptr; 64 void *oldptr;
65 unsigned char rcu_pending:1; 65 int rcu_pending;
66 unsigned char ptype:1; 66 unsigned char ptype:1;
67 char name[0]; /* Contains name'\0'format'\0' */ 67 char name[0]; /* Contains name'\0'format'\0' */
68}; 68};
@@ -103,11 +103,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
103 char ptype; 103 char ptype;
104 104
105 /* 105 /*
106 * preempt_disable does two things : disabling preemption to make sure 106 * rcu_read_lock_sched does two things : disabling preemption to make
107 * the teardown of the callbacks can be done correctly when they are in 107 * sure the teardown of the callbacks can be done correctly when they
108 * modules and they insure RCU read coherency. 108 * are in modules and they insure RCU read coherency.
109 */ 109 */
110 preempt_disable(); 110 rcu_read_lock_sched();
111 ptype = mdata->ptype; 111 ptype = mdata->ptype;
112 if (likely(!ptype)) { 112 if (likely(!ptype)) {
113 marker_probe_func *func; 113 marker_probe_func *func;
@@ -145,7 +145,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
145 va_end(args); 145 va_end(args);
146 } 146 }
147 } 147 }
148 preempt_enable(); 148 rcu_read_unlock_sched();
149} 149}
150EXPORT_SYMBOL_GPL(marker_probe_cb); 150EXPORT_SYMBOL_GPL(marker_probe_cb);
151 151
@@ -162,7 +162,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
162 va_list args; /* not initialized */ 162 va_list args; /* not initialized */
163 char ptype; 163 char ptype;
164 164
165 preempt_disable(); 165 rcu_read_lock_sched();
166 ptype = mdata->ptype; 166 ptype = mdata->ptype;
167 if (likely(!ptype)) { 167 if (likely(!ptype)) {
168 marker_probe_func *func; 168 marker_probe_func *func;
@@ -195,7 +195,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
195 multi[i].func(multi[i].probe_private, call_private, 195 multi[i].func(multi[i].probe_private, call_private,
196 mdata->format, &args); 196 mdata->format, &args);
197 } 197 }
198 preempt_enable(); 198 rcu_read_unlock_sched();
199} 199}
200EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); 200EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
201 201
@@ -560,7 +560,7 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
560 * Disable a marker and its probe callback. 560 * Disable a marker and its probe callback.
561 * Note: only waiting an RCU period after setting elem->call to the empty 561 * Note: only waiting an RCU period after setting elem->call to the empty
562 * function insures that the original callback is not used anymore. This insured 562 * function insures that the original callback is not used anymore. This insured
563 * by preempt_disable around the call site. 563 * by rcu_read_lock_sched around the call site.
564 */ 564 */
565static void disable_marker(struct marker *elem) 565static void disable_marker(struct marker *elem)
566{ 566{
@@ -653,11 +653,17 @@ int marker_probe_register(const char *name, const char *format,
653 entry = get_marker(name); 653 entry = get_marker(name);
654 if (!entry) { 654 if (!entry) {
655 entry = add_marker(name, format); 655 entry = add_marker(name, format);
656 if (IS_ERR(entry)) { 656 if (IS_ERR(entry))
657 ret = PTR_ERR(entry); 657 ret = PTR_ERR(entry);
658 goto end; 658 } else if (format) {
659 } 659 if (!entry->format)
660 ret = marker_set_format(&entry, format);
661 else if (strcmp(entry->format, format))
662 ret = -EPERM;
660 } 663 }
664 if (ret)
665 goto end;
666
661 /* 667 /*
662 * If we detect that a call_rcu is pending for this marker, 668 * If we detect that a call_rcu is pending for this marker,
663 * make sure it's executed now. 669 * make sure it's executed now.
@@ -674,6 +680,8 @@ int marker_probe_register(const char *name, const char *format,
674 mutex_lock(&markers_mutex); 680 mutex_lock(&markers_mutex);
675 entry = get_marker(name); 681 entry = get_marker(name);
676 WARN_ON(!entry); 682 WARN_ON(!entry);
683 if (entry->rcu_pending)
684 rcu_barrier_sched();
677 entry->oldptr = old; 685 entry->oldptr = old;
678 entry->rcu_pending = 1; 686 entry->rcu_pending = 1;
679 /* write rcu_pending before calling the RCU callback */ 687 /* write rcu_pending before calling the RCU callback */
@@ -717,6 +725,8 @@ int marker_probe_unregister(const char *name,
717 entry = get_marker(name); 725 entry = get_marker(name);
718 if (!entry) 726 if (!entry)
719 goto end; 727 goto end;
728 if (entry->rcu_pending)
729 rcu_barrier_sched();
720 entry->oldptr = old; 730 entry->oldptr = old;
721 entry->rcu_pending = 1; 731 entry->rcu_pending = 1;
722 /* write rcu_pending before calling the RCU callback */ 732 /* write rcu_pending before calling the RCU callback */
@@ -795,6 +805,8 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
795 mutex_lock(&markers_mutex); 805 mutex_lock(&markers_mutex);
796 entry = get_marker_from_private_data(probe, probe_private); 806 entry = get_marker_from_private_data(probe, probe_private);
797 WARN_ON(!entry); 807 WARN_ON(!entry);
808 if (entry->rcu_pending)
809 rcu_barrier_sched();
798 entry->oldptr = old; 810 entry->oldptr = old;
799 entry->rcu_pending = 1; 811 entry->rcu_pending = 1;
800 /* write rcu_pending before calling the RCU callback */ 812 /* write rcu_pending before calling the RCU callback */
diff --git a/kernel/module.c b/kernel/module.c
index 9db11911e04b..1f4cc00e0c20 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,11 +20,13 @@
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/kallsyms.h> 22#include <linux/kallsyms.h>
23#include <linux/fs.h>
23#include <linux/sysfs.h> 24#include <linux/sysfs.h>
24#include <linux/kernel.h> 25#include <linux/kernel.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
27#include <linux/elf.h> 28#include <linux/elf.h>
29#include <linux/proc_fs.h>
28#include <linux/seq_file.h> 30#include <linux/seq_file.h>
29#include <linux/syscalls.h> 31#include <linux/syscalls.h>
30#include <linux/fcntl.h> 32#include <linux/fcntl.h>
@@ -42,10 +44,13 @@
42#include <linux/string.h> 44#include <linux/string.h>
43#include <linux/mutex.h> 45#include <linux/mutex.h>
44#include <linux/unwind.h> 46#include <linux/unwind.h>
47#include <linux/rculist.h>
45#include <asm/uaccess.h> 48#include <asm/uaccess.h>
46#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
47#include <linux/license.h> 50#include <linux/license.h>
48#include <asm/sections.h> 51#include <asm/sections.h>
52#include <linux/tracepoint.h>
53#include <linux/ftrace.h>
49 54
50#if 0 55#if 0
51#define DEBUGP printk 56#define DEBUGP printk
@@ -61,7 +66,7 @@
61#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 66#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
62 67
63/* List of modules, protected by module_mutex or preempt_disable 68/* List of modules, protected by module_mutex or preempt_disable
64 * (add/delete uses stop_machine). */ 69 * (delete uses stop_machine/add uses RCU list operations). */
65static DEFINE_MUTEX(module_mutex); 70static DEFINE_MUTEX(module_mutex);
66static LIST_HEAD(modules); 71static LIST_HEAD(modules);
67 72
@@ -100,7 +105,7 @@ static inline int strong_try_module_get(struct module *mod)
100static inline void add_taint_module(struct module *mod, unsigned flag) 105static inline void add_taint_module(struct module *mod, unsigned flag)
101{ 106{
102 add_taint(flag); 107 add_taint(flag);
103 mod->taints |= flag; 108 mod->taints |= (1U << flag);
104} 109}
105 110
106/* 111/*
@@ -130,6 +135,29 @@ static unsigned int find_sec(Elf_Ehdr *hdr,
130 return 0; 135 return 0;
131} 136}
132 137
138/* Find a module section, or NULL. */
139static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs,
140 const char *secstrings, const char *name)
141{
142 /* Section 0 has sh_addr 0. */
143 return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr;
144}
145
146/* Find a module section, or NULL. Fill in number of "objects" in section. */
147static void *section_objs(Elf_Ehdr *hdr,
148 Elf_Shdr *sechdrs,
149 const char *secstrings,
150 const char *name,
151 size_t object_size,
152 unsigned int *num)
153{
154 unsigned int sec = find_sec(hdr, sechdrs, secstrings, name);
155
156 /* Section 0 has sh_addr 0 and sh_size 0. */
157 *num = sechdrs[sec].sh_size / object_size;
158 return (void *)sechdrs[sec].sh_addr;
159}
160
133/* Provided by the linker */ 161/* Provided by the linker */
134extern const struct kernel_symbol __start___ksymtab[]; 162extern const struct kernel_symbol __start___ksymtab[];
135extern const struct kernel_symbol __stop___ksymtab[]; 163extern const struct kernel_symbol __stop___ksymtab[];
@@ -216,7 +244,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
216 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) 244 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
217 return true; 245 return true;
218 246
219 list_for_each_entry(mod, &modules, list) { 247 list_for_each_entry_rcu(mod, &modules, list) {
220 struct symsearch arr[] = { 248 struct symsearch arr[] = {
221 { mod->syms, mod->syms + mod->num_syms, mod->crcs, 249 { mod->syms, mod->syms + mod->num_syms, mod->crcs,
222 NOT_GPL_ONLY, false }, 250 NOT_GPL_ONLY, false },
@@ -784,6 +812,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
784 mutex_lock(&module_mutex); 812 mutex_lock(&module_mutex);
785 /* Store the name of the last unloaded module for diagnostic purposes */ 813 /* Store the name of the last unloaded module for diagnostic purposes */
786 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 814 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
815 unregister_dynamic_debug_module(mod->name);
787 free_module(mod); 816 free_module(mod);
788 817
789 out: 818 out:
@@ -923,7 +952,7 @@ static const char vermagic[] = VERMAGIC_STRING;
923static int try_to_force_load(struct module *mod, const char *symname) 952static int try_to_force_load(struct module *mod, const char *symname)
924{ 953{
925#ifdef CONFIG_MODULE_FORCE_LOAD 954#ifdef CONFIG_MODULE_FORCE_LOAD
926 if (!(tainted & TAINT_FORCED_MODULE)) 955 if (!test_taint(TAINT_FORCED_MODULE))
927 printk("%s: no version for \"%s\" found: kernel tainted.\n", 956 printk("%s: no version for \"%s\" found: kernel tainted.\n",
928 mod->name, symname); 957 mod->name, symname);
929 add_taint_module(mod, TAINT_FORCED_MODULE); 958 add_taint_module(mod, TAINT_FORCED_MODULE);
@@ -1033,7 +1062,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
1033 const unsigned long *crc; 1062 const unsigned long *crc;
1034 1063
1035 ret = find_symbol(name, &owner, &crc, 1064 ret = find_symbol(name, &owner, &crc,
1036 !(mod->taints & TAINT_PROPRIETARY_MODULE), true); 1065 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1037 if (!IS_ERR_VALUE(ret)) { 1066 if (!IS_ERR_VALUE(ret)) {
1038 /* use_module can fail due to OOM, 1067 /* use_module can fail due to OOM,
1039 or module initialization or unloading */ 1068 or module initialization or unloading */
@@ -1173,7 +1202,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1173 while (i-- > 0) 1202 while (i-- > 0)
1174 sysfs_remove_bin_file(notes_attrs->dir, 1203 sysfs_remove_bin_file(notes_attrs->dir,
1175 &notes_attrs->attrs[i]); 1204 &notes_attrs->attrs[i]);
1176 kobject_del(notes_attrs->dir); 1205 kobject_put(notes_attrs->dir);
1177 } 1206 }
1178 kfree(notes_attrs); 1207 kfree(notes_attrs);
1179} 1208}
@@ -1391,17 +1420,6 @@ static void mod_kobject_remove(struct module *mod)
1391} 1420}
1392 1421
1393/* 1422/*
1394 * link the module with the whole machine is stopped with interrupts off
1395 * - this defends against kallsyms not taking locks
1396 */
1397static int __link_module(void *_mod)
1398{
1399 struct module *mod = _mod;
1400 list_add(&mod->list, &modules);
1401 return 0;
1402}
1403
1404/*
1405 * unlink the module with the whole machine is stopped with interrupts off 1423 * unlink the module with the whole machine is stopped with interrupts off
1406 * - this defends against kallsyms not taking locks 1424 * - this defends against kallsyms not taking locks
1407 */ 1425 */
@@ -1429,6 +1447,9 @@ static void free_module(struct module *mod)
1429 /* Module unload stuff */ 1447 /* Module unload stuff */
1430 module_unload_free(mod); 1448 module_unload_free(mod);
1431 1449
1450 /* release any pointers to mcount in this module */
1451 ftrace_release(mod->module_core, mod->core_size);
1452
1432 /* This may be NULL, but that's OK */ 1453 /* This may be NULL, but that's OK */
1433 module_free(mod, mod->module_init); 1454 module_free(mod, mod->module_init);
1434 kfree(mod->args); 1455 kfree(mod->args);
@@ -1634,7 +1655,7 @@ static void set_license(struct module *mod, const char *license)
1634 license = "unspecified"; 1655 license = "unspecified";
1635 1656
1636 if (!license_is_gpl_compatible(license)) { 1657 if (!license_is_gpl_compatible(license)) {
1637 if (!(tainted & TAINT_PROPRIETARY_MODULE)) 1658 if (!test_taint(TAINT_PROPRIETARY_MODULE))
1638 printk(KERN_WARNING "%s: module license '%s' taints " 1659 printk(KERN_WARNING "%s: module license '%s' taints "
1639 "kernel.\n", mod->name, license); 1660 "kernel.\n", mod->name, license);
1640 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1661 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
@@ -1783,6 +1804,21 @@ static inline void add_kallsyms(struct module *mod,
1783} 1804}
1784#endif /* CONFIG_KALLSYMS */ 1805#endif /* CONFIG_KALLSYMS */
1785 1806
1807static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
1808{
1809#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
1810 unsigned int i;
1811
1812 for (i = 0; i < num; i++) {
1813 register_dynamic_debug_module(debug[i].modname,
1814 debug[i].type,
1815 debug[i].logical_modname,
1816 debug[i].flag_names,
1817 debug[i].hash, debug[i].hash2);
1818 }
1819#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
1820}
1821
1786static void *module_alloc_update_bounds(unsigned long size) 1822static void *module_alloc_update_bounds(unsigned long size)
1787{ 1823{
1788 void *ret = module_alloc(size); 1824 void *ret = module_alloc(size);
@@ -1806,35 +1842,18 @@ static noinline struct module *load_module(void __user *umod,
1806 Elf_Ehdr *hdr; 1842 Elf_Ehdr *hdr;
1807 Elf_Shdr *sechdrs; 1843 Elf_Shdr *sechdrs;
1808 char *secstrings, *args, *modmagic, *strtab = NULL; 1844 char *secstrings, *args, *modmagic, *strtab = NULL;
1845 char *staging;
1809 unsigned int i; 1846 unsigned int i;
1810 unsigned int symindex = 0; 1847 unsigned int symindex = 0;
1811 unsigned int strindex = 0; 1848 unsigned int strindex = 0;
1812 unsigned int setupindex; 1849 unsigned int modindex, versindex, infoindex, pcpuindex;
1813 unsigned int exindex;
1814 unsigned int exportindex;
1815 unsigned int modindex;
1816 unsigned int obsparmindex;
1817 unsigned int infoindex;
1818 unsigned int gplindex;
1819 unsigned int crcindex;
1820 unsigned int gplcrcindex;
1821 unsigned int versindex;
1822 unsigned int pcpuindex;
1823 unsigned int gplfutureindex;
1824 unsigned int gplfuturecrcindex;
1825 unsigned int unwindex = 0; 1850 unsigned int unwindex = 0;
1826#ifdef CONFIG_UNUSED_SYMBOLS 1851 unsigned int num_kp, num_mcount;
1827 unsigned int unusedindex; 1852 struct kernel_param *kp;
1828 unsigned int unusedcrcindex;
1829 unsigned int unusedgplindex;
1830 unsigned int unusedgplcrcindex;
1831#endif
1832 unsigned int markersindex;
1833 unsigned int markersstringsindex;
1834 struct module *mod; 1853 struct module *mod;
1835 long err = 0; 1854 long err = 0;
1836 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1855 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1837 struct exception_table_entry *extable; 1856 unsigned long *mseg;
1838 mm_segment_t old_fs; 1857 mm_segment_t old_fs;
1839 1858
1840 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1859 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -1898,6 +1917,7 @@ static noinline struct module *load_module(void __user *umod,
1898 err = -ENOEXEC; 1917 err = -ENOEXEC;
1899 goto free_hdr; 1918 goto free_hdr;
1900 } 1919 }
1920 /* This is temporary: point mod into copy of data. */
1901 mod = (void *)sechdrs[modindex].sh_addr; 1921 mod = (void *)sechdrs[modindex].sh_addr;
1902 1922
1903 if (symindex == 0) { 1923 if (symindex == 0) {
@@ -1907,22 +1927,6 @@ static noinline struct module *load_module(void __user *umod,
1907 goto free_hdr; 1927 goto free_hdr;
1908 } 1928 }
1909 1929
1910 /* Optional sections */
1911 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1912 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1913 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
1914 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1915 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1916 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
1917#ifdef CONFIG_UNUSED_SYMBOLS
1918 unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
1919 unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
1920 unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
1921 unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
1922#endif
1923 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1924 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1925 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
1926 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1930 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1927 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1931 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1928 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1932 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
@@ -1960,6 +1964,14 @@ static noinline struct module *load_module(void __user *umod,
1960 goto free_hdr; 1964 goto free_hdr;
1961 } 1965 }
1962 1966
1967 staging = get_modinfo(sechdrs, infoindex, "staging");
1968 if (staging) {
1969 add_taint_module(mod, TAINT_CRAP);
1970 printk(KERN_WARNING "%s: module is from the staging directory,"
1971 " the quality is unknown, you have been warned.\n",
1972 mod->name);
1973 }
1974
1963 /* Now copy in args */ 1975 /* Now copy in args */
1964 args = strndup_user(uargs, ~0UL >> 1); 1976 args = strndup_user(uargs, ~0UL >> 1);
1965 if (IS_ERR(args)) { 1977 if (IS_ERR(args)) {
@@ -2070,42 +2082,57 @@ static noinline struct module *load_module(void __user *umod,
2070 if (err < 0) 2082 if (err < 0)
2071 goto cleanup; 2083 goto cleanup;
2072 2084
2073 /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ 2085 /* Now we've got everything in the final locations, we can
2074 mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); 2086 * find optional sections. */
2075 mod->syms = (void *)sechdrs[exportindex].sh_addr; 2087 kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
2076 if (crcindex) 2088 &num_kp);
2077 mod->crcs = (void *)sechdrs[crcindex].sh_addr; 2089 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2078 mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); 2090 sizeof(*mod->syms), &mod->num_syms);
2079 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; 2091 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
2080 if (gplcrcindex) 2092 mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
2081 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 2093 sizeof(*mod->gpl_syms),
2082 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / 2094 &mod->num_gpl_syms);
2083 sizeof(*mod->gpl_future_syms); 2095 mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
2084 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; 2096 mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
2085 if (gplfuturecrcindex) 2097 "__ksymtab_gpl_future",
2086 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; 2098 sizeof(*mod->gpl_future_syms),
2099 &mod->num_gpl_future_syms);
2100 mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
2101 "__kcrctab_gpl_future");
2087 2102
2088#ifdef CONFIG_UNUSED_SYMBOLS 2103#ifdef CONFIG_UNUSED_SYMBOLS
2089 mod->num_unused_syms = sechdrs[unusedindex].sh_size / 2104 mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
2090 sizeof(*mod->unused_syms); 2105 "__ksymtab_unused",
2091 mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / 2106 sizeof(*mod->unused_syms),
2092 sizeof(*mod->unused_gpl_syms); 2107 &mod->num_unused_syms);
2093 mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; 2108 mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
2094 if (unusedcrcindex) 2109 "__kcrctab_unused");
2095 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; 2110 mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
2096 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; 2111 "__ksymtab_unused_gpl",
2097 if (unusedgplcrcindex) 2112 sizeof(*mod->unused_gpl_syms),
2098 mod->unused_gpl_crcs 2113 &mod->num_unused_gpl_syms);
2099 = (void *)sechdrs[unusedgplcrcindex].sh_addr; 2114 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2115 "__kcrctab_unused_gpl");
2116#endif
2117
2118#ifdef CONFIG_MARKERS
2119 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
2120 sizeof(*mod->markers), &mod->num_markers);
2121#endif
2122#ifdef CONFIG_TRACEPOINTS
2123 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2124 "__tracepoints",
2125 sizeof(*mod->tracepoints),
2126 &mod->num_tracepoints);
2100#endif 2127#endif
2101 2128
2102#ifdef CONFIG_MODVERSIONS 2129#ifdef CONFIG_MODVERSIONS
2103 if ((mod->num_syms && !crcindex) 2130 if ((mod->num_syms && !mod->crcs)
2104 || (mod->num_gpl_syms && !gplcrcindex) 2131 || (mod->num_gpl_syms && !mod->gpl_crcs)
2105 || (mod->num_gpl_future_syms && !gplfuturecrcindex) 2132 || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
2106#ifdef CONFIG_UNUSED_SYMBOLS 2133#ifdef CONFIG_UNUSED_SYMBOLS
2107 || (mod->num_unused_syms && !unusedcrcindex) 2134 || (mod->num_unused_syms && !mod->unused_crcs)
2108 || (mod->num_unused_gpl_syms && !unusedgplcrcindex) 2135 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2109#endif 2136#endif
2110 ) { 2137 ) {
2111 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); 2138 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
@@ -2114,9 +2141,6 @@ static noinline struct module *load_module(void __user *umod,
2114 goto cleanup; 2141 goto cleanup;
2115 } 2142 }
2116#endif 2143#endif
2117 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
2118 markersstringsindex = find_sec(hdr, sechdrs, secstrings,
2119 "__markers_strings");
2120 2144
2121 /* Now do relocations. */ 2145 /* Now do relocations. */
2122 for (i = 1; i < hdr->e_shnum; i++) { 2146 for (i = 1; i < hdr->e_shnum; i++) {
@@ -2139,22 +2163,16 @@ static noinline struct module *load_module(void __user *umod,
2139 if (err < 0) 2163 if (err < 0)
2140 goto cleanup; 2164 goto cleanup;
2141 } 2165 }
2142#ifdef CONFIG_MARKERS
2143 mod->markers = (void *)sechdrs[markersindex].sh_addr;
2144 mod->num_markers =
2145 sechdrs[markersindex].sh_size / sizeof(*mod->markers);
2146#endif
2147 2166
2148 /* Find duplicate symbols */ 2167 /* Find duplicate symbols */
2149 err = verify_export_symbols(mod); 2168 err = verify_export_symbols(mod);
2150
2151 if (err < 0) 2169 if (err < 0)
2152 goto cleanup; 2170 goto cleanup;
2153 2171
2154 /* Set up and sort exception table */ 2172 /* Set up and sort exception table */
2155 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); 2173 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2156 mod->extable = extable = (void *)sechdrs[exindex].sh_addr; 2174 sizeof(*mod->extable), &mod->num_exentries);
2157 sort_extable(extable, extable + mod->num_exentries); 2175 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2158 2176
2159 /* Finally, copy percpu area over. */ 2177 /* Finally, copy percpu area over. */
2160 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2178 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
@@ -2162,11 +2180,29 @@ static noinline struct module *load_module(void __user *umod,
2162 2180
2163 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2181 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
2164 2182
2183 if (!mod->taints) {
2184 struct mod_debug *debug;
2185 unsigned int num_debug;
2186
2165#ifdef CONFIG_MARKERS 2187#ifdef CONFIG_MARKERS
2166 if (!mod->taints)
2167 marker_update_probe_range(mod->markers, 2188 marker_update_probe_range(mod->markers,
2168 mod->markers + mod->num_markers); 2189 mod->markers + mod->num_markers);
2169#endif 2190#endif
2191 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2192 sizeof(*debug), &num_debug);
2193 dynamic_printk_setup(debug, num_debug);
2194
2195#ifdef CONFIG_TRACEPOINTS
2196 tracepoint_update_probe_range(mod->tracepoints,
2197 mod->tracepoints + mod->num_tracepoints);
2198#endif
2199 }
2200
2201 /* sechdrs[0].sh_size is always zero */
2202 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2203 sizeof(*mseg), &num_mcount);
2204 ftrace_init_module(mseg, mseg + num_mcount);
2205
2170 err = module_finalize(hdr, sechdrs, mod); 2206 err = module_finalize(hdr, sechdrs, mod);
2171 if (err < 0) 2207 if (err < 0)
2172 goto cleanup; 2208 goto cleanup;
@@ -2190,30 +2226,24 @@ static noinline struct module *load_module(void __user *umod,
2190 set_fs(old_fs); 2226 set_fs(old_fs);
2191 2227
2192 mod->args = args; 2228 mod->args = args;
2193 if (obsparmindex) 2229 if (section_addr(hdr, sechdrs, secstrings, "__obsparm"))
2194 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2230 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2195 mod->name); 2231 mod->name);
2196 2232
2197 /* Now sew it into the lists so we can get lockdep and oops 2233 /* Now sew it into the lists so we can get lockdep and oops
2198 * info during argument parsing. Noone should access us, since 2234 * info during argument parsing. Noone should access us, since
2199 * strong_try_module_get() will fail. */ 2235 * strong_try_module_get() will fail.
2200 stop_machine(__link_module, mod, NULL); 2236 * lockdep/oops can run asynchronous, so use the RCU list insertion
2201 2237 * function to insert in a way safe to concurrent readers.
2202 /* Size of section 0 is 0, so this works well if no params */ 2238 * The mutex protects against concurrent writers.
2203 err = parse_args(mod->name, mod->args, 2239 */
2204 (struct kernel_param *) 2240 list_add_rcu(&mod->list, &modules);
2205 sechdrs[setupindex].sh_addr, 2241
2206 sechdrs[setupindex].sh_size 2242 err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
2207 / sizeof(struct kernel_param),
2208 NULL);
2209 if (err < 0) 2243 if (err < 0)
2210 goto unlink; 2244 goto unlink;
2211 2245
2212 err = mod_sysfs_setup(mod, 2246 err = mod_sysfs_setup(mod, kp, num_kp);
2213 (struct kernel_param *)
2214 sechdrs[setupindex].sh_addr,
2215 sechdrs[setupindex].sh_size
2216 / sizeof(struct kernel_param));
2217 if (err < 0) 2247 if (err < 0)
2218 goto unlink; 2248 goto unlink;
2219 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2249 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2236,6 +2266,7 @@ static noinline struct module *load_module(void __user *umod,
2236 cleanup: 2266 cleanup:
2237 kobject_del(&mod->mkobj.kobj); 2267 kobject_del(&mod->mkobj.kobj);
2238 kobject_put(&mod->mkobj.kobj); 2268 kobject_put(&mod->mkobj.kobj);
2269 ftrace_release(mod->module_core, mod->core_size);
2239 free_unload: 2270 free_unload:
2240 module_unload_free(mod); 2271 module_unload_free(mod);
2241 module_free(mod, mod->module_init); 2272 module_free(mod, mod->module_init);
@@ -2401,7 +2432,7 @@ const char *module_address_lookup(unsigned long addr,
2401 const char *ret = NULL; 2432 const char *ret = NULL;
2402 2433
2403 preempt_disable(); 2434 preempt_disable();
2404 list_for_each_entry(mod, &modules, list) { 2435 list_for_each_entry_rcu(mod, &modules, list) {
2405 if (within(addr, mod->module_init, mod->init_size) 2436 if (within(addr, mod->module_init, mod->init_size)
2406 || within(addr, mod->module_core, mod->core_size)) { 2437 || within(addr, mod->module_core, mod->core_size)) {
2407 if (modname) 2438 if (modname)
@@ -2424,7 +2455,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2424 struct module *mod; 2455 struct module *mod;
2425 2456
2426 preempt_disable(); 2457 preempt_disable();
2427 list_for_each_entry(mod, &modules, list) { 2458 list_for_each_entry_rcu(mod, &modules, list) {
2428 if (within(addr, mod->module_init, mod->init_size) || 2459 if (within(addr, mod->module_init, mod->init_size) ||
2429 within(addr, mod->module_core, mod->core_size)) { 2460 within(addr, mod->module_core, mod->core_size)) {
2430 const char *sym; 2461 const char *sym;
@@ -2448,7 +2479,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2448 struct module *mod; 2479 struct module *mod;
2449 2480
2450 preempt_disable(); 2481 preempt_disable();
2451 list_for_each_entry(mod, &modules, list) { 2482 list_for_each_entry_rcu(mod, &modules, list) {
2452 if (within(addr, mod->module_init, mod->init_size) || 2483 if (within(addr, mod->module_init, mod->init_size) ||
2453 within(addr, mod->module_core, mod->core_size)) { 2484 within(addr, mod->module_core, mod->core_size)) {
2454 const char *sym; 2485 const char *sym;
@@ -2475,7 +2506,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2475 struct module *mod; 2506 struct module *mod;
2476 2507
2477 preempt_disable(); 2508 preempt_disable();
2478 list_for_each_entry(mod, &modules, list) { 2509 list_for_each_entry_rcu(mod, &modules, list) {
2479 if (symnum < mod->num_symtab) { 2510 if (symnum < mod->num_symtab) {
2480 *value = mod->symtab[symnum].st_value; 2511 *value = mod->symtab[symnum].st_value;
2481 *type = mod->symtab[symnum].st_info; 2512 *type = mod->symtab[symnum].st_info;
@@ -2518,7 +2549,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2518 ret = mod_find_symname(mod, colon+1); 2549 ret = mod_find_symname(mod, colon+1);
2519 *colon = ':'; 2550 *colon = ':';
2520 } else { 2551 } else {
2521 list_for_each_entry(mod, &modules, list) 2552 list_for_each_entry_rcu(mod, &modules, list)
2522 if ((ret = mod_find_symname(mod, name)) != 0) 2553 if ((ret = mod_find_symname(mod, name)) != 0)
2523 break; 2554 break;
2524 } 2555 }
@@ -2527,23 +2558,6 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2527} 2558}
2528#endif /* CONFIG_KALLSYMS */ 2559#endif /* CONFIG_KALLSYMS */
2529 2560
2530/* Called by the /proc file system to return a list of modules. */
2531static void *m_start(struct seq_file *m, loff_t *pos)
2532{
2533 mutex_lock(&module_mutex);
2534 return seq_list_start(&modules, *pos);
2535}
2536
2537static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2538{
2539 return seq_list_next(p, &modules, pos);
2540}
2541
2542static void m_stop(struct seq_file *m, void *p)
2543{
2544 mutex_unlock(&module_mutex);
2545}
2546
2547static char *module_flags(struct module *mod, char *buf) 2561static char *module_flags(struct module *mod, char *buf)
2548{ 2562{
2549 int bx = 0; 2563 int bx = 0;
@@ -2552,10 +2566,12 @@ static char *module_flags(struct module *mod, char *buf)
2552 mod->state == MODULE_STATE_GOING || 2566 mod->state == MODULE_STATE_GOING ||
2553 mod->state == MODULE_STATE_COMING) { 2567 mod->state == MODULE_STATE_COMING) {
2554 buf[bx++] = '('; 2568 buf[bx++] = '(';
2555 if (mod->taints & TAINT_PROPRIETARY_MODULE) 2569 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
2556 buf[bx++] = 'P'; 2570 buf[bx++] = 'P';
2557 if (mod->taints & TAINT_FORCED_MODULE) 2571 if (mod->taints & (1 << TAINT_FORCED_MODULE))
2558 buf[bx++] = 'F'; 2572 buf[bx++] = 'F';
2573 if (mod->taints & (1 << TAINT_CRAP))
2574 buf[bx++] = 'C';
2559 /* 2575 /*
2560 * TAINT_FORCED_RMMOD: could be added. 2576 * TAINT_FORCED_RMMOD: could be added.
2561 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 2577 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
@@ -2575,6 +2591,24 @@ static char *module_flags(struct module *mod, char *buf)
2575 return buf; 2591 return buf;
2576} 2592}
2577 2593
2594#ifdef CONFIG_PROC_FS
2595/* Called by the /proc file system to return a list of modules. */
2596static void *m_start(struct seq_file *m, loff_t *pos)
2597{
2598 mutex_lock(&module_mutex);
2599 return seq_list_start(&modules, *pos);
2600}
2601
2602static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2603{
2604 return seq_list_next(p, &modules, pos);
2605}
2606
2607static void m_stop(struct seq_file *m, void *p)
2608{
2609 mutex_unlock(&module_mutex);
2610}
2611
2578static int m_show(struct seq_file *m, void *p) 2612static int m_show(struct seq_file *m, void *p)
2579{ 2613{
2580 struct module *mod = list_entry(p, struct module, list); 2614 struct module *mod = list_entry(p, struct module, list);
@@ -2605,13 +2639,33 @@ static int m_show(struct seq_file *m, void *p)
2605 Where refcount is a number or -, and deps is a comma-separated list 2639 Where refcount is a number or -, and deps is a comma-separated list
2606 of depends or -. 2640 of depends or -.
2607*/ 2641*/
2608const struct seq_operations modules_op = { 2642static const struct seq_operations modules_op = {
2609 .start = m_start, 2643 .start = m_start,
2610 .next = m_next, 2644 .next = m_next,
2611 .stop = m_stop, 2645 .stop = m_stop,
2612 .show = m_show 2646 .show = m_show
2613}; 2647};
2614 2648
2649static int modules_open(struct inode *inode, struct file *file)
2650{
2651 return seq_open(file, &modules_op);
2652}
2653
2654static const struct file_operations proc_modules_operations = {
2655 .open = modules_open,
2656 .read = seq_read,
2657 .llseek = seq_lseek,
2658 .release = seq_release,
2659};
2660
2661static int __init proc_modules_init(void)
2662{
2663 proc_create("modules", 0, NULL, &proc_modules_operations);
2664 return 0;
2665}
2666module_init(proc_modules_init);
2667#endif
2668
2615/* Given an address, look for it in the module exception tables. */ 2669/* Given an address, look for it in the module exception tables. */
2616const struct exception_table_entry *search_module_extables(unsigned long addr) 2670const struct exception_table_entry *search_module_extables(unsigned long addr)
2617{ 2671{
@@ -2619,7 +2673,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2619 struct module *mod; 2673 struct module *mod;
2620 2674
2621 preempt_disable(); 2675 preempt_disable();
2622 list_for_each_entry(mod, &modules, list) { 2676 list_for_each_entry_rcu(mod, &modules, list) {
2623 if (mod->num_exentries == 0) 2677 if (mod->num_exentries == 0)
2624 continue; 2678 continue;
2625 2679
@@ -2645,7 +2699,7 @@ int is_module_address(unsigned long addr)
2645 2699
2646 preempt_disable(); 2700 preempt_disable();
2647 2701
2648 list_for_each_entry(mod, &modules, list) { 2702 list_for_each_entry_rcu(mod, &modules, list) {
2649 if (within(addr, mod->module_core, mod->core_size)) { 2703 if (within(addr, mod->module_core, mod->core_size)) {
2650 preempt_enable(); 2704 preempt_enable();
2651 return 1; 2705 return 1;
@@ -2666,7 +2720,7 @@ struct module *__module_text_address(unsigned long addr)
2666 if (addr < module_addr_min || addr > module_addr_max) 2720 if (addr < module_addr_min || addr > module_addr_max)
2667 return NULL; 2721 return NULL;
2668 2722
2669 list_for_each_entry(mod, &modules, list) 2723 list_for_each_entry_rcu(mod, &modules, list)
2670 if (within(addr, mod->module_init, mod->init_text_size) 2724 if (within(addr, mod->module_init, mod->init_text_size)
2671 || within(addr, mod->module_core, mod->core_text_size)) 2725 || within(addr, mod->module_core, mod->core_text_size))
2672 return mod; 2726 return mod;
@@ -2691,8 +2745,11 @@ void print_modules(void)
2691 char buf[8]; 2745 char buf[8];
2692 2746
2693 printk("Modules linked in:"); 2747 printk("Modules linked in:");
2694 list_for_each_entry(mod, &modules, list) 2748 /* Most callers should already have preempt disabled, but make sure */
2749 preempt_disable();
2750 list_for_each_entry_rcu(mod, &modules, list)
2695 printk(" %s%s", mod->name, module_flags(mod, buf)); 2751 printk(" %s%s", mod->name, module_flags(mod, buf));
2752 preempt_enable();
2696 if (last_unloaded_module[0]) 2753 if (last_unloaded_module[0])
2697 printk(" [last unloaded: %s]", last_unloaded_module); 2754 printk(" [last unloaded: %s]", last_unloaded_module);
2698 printk("\n"); 2755 printk("\n");
@@ -2717,3 +2774,50 @@ void module_update_markers(void)
2717 mutex_unlock(&module_mutex); 2774 mutex_unlock(&module_mutex);
2718} 2775}
2719#endif 2776#endif
2777
2778#ifdef CONFIG_TRACEPOINTS
2779void module_update_tracepoints(void)
2780{
2781 struct module *mod;
2782
2783 mutex_lock(&module_mutex);
2784 list_for_each_entry(mod, &modules, list)
2785 if (!mod->taints)
2786 tracepoint_update_probe_range(mod->tracepoints,
2787 mod->tracepoints + mod->num_tracepoints);
2788 mutex_unlock(&module_mutex);
2789}
2790
2791/*
2792 * Returns 0 if current not found.
2793 * Returns 1 if current found.
2794 */
2795int module_get_iter_tracepoints(struct tracepoint_iter *iter)
2796{
2797 struct module *iter_mod;
2798 int found = 0;
2799
2800 mutex_lock(&module_mutex);
2801 list_for_each_entry(iter_mod, &modules, list) {
2802 if (!iter_mod->taints) {
2803 /*
2804 * Sorted module list
2805 */
2806 if (iter_mod < iter->module)
2807 continue;
2808 else if (iter_mod > iter->module)
2809 iter->tracepoint = NULL;
2810 found = tracepoint_get_iter_range(&iter->tracepoint,
2811 iter_mod->tracepoints,
2812 iter_mod->tracepoints
2813 + iter_mod->num_tracepoints);
2814 if (found) {
2815 iter->module = iter_mod;
2816 break;
2817 }
2818 }
2819 }
2820 mutex_unlock(&module_mutex);
2821 return found;
2822}
2823#endif
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 823be11584ef..4282c0a40a57 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -550,7 +550,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
550 550
551static ATOMIC_NOTIFIER_HEAD(die_chain); 551static ATOMIC_NOTIFIER_HEAD(die_chain);
552 552
553int notify_die(enum die_val val, const char *str, 553int notrace notify_die(enum die_val val, const char *str,
554 struct pt_regs *regs, long err, int trap, int sig) 554 struct pt_regs *regs, long err, int trap, int sig)
555{ 555{
556 struct die_args args = { 556 struct die_args args = {
diff --git a/kernel/panic.c b/kernel/panic.c
index 12c5a0a6c89b..6513aac8e992 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,7 +23,7 @@
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24 24
25int panic_on_oops; 25int panic_on_oops;
26int tainted; 26static unsigned long tainted_mask;
27static int pause_on_oops; 27static int pause_on_oops;
28static int pause_on_oops_flag; 28static int pause_on_oops_flag;
29static DEFINE_SPINLOCK(pause_on_oops_lock); 29static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -34,13 +34,6 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
34 34
35EXPORT_SYMBOL(panic_notifier_list); 35EXPORT_SYMBOL(panic_notifier_list);
36 36
37static int __init panic_setup(char *str)
38{
39 panic_timeout = simple_strtoul(str, NULL, 0);
40 return 1;
41}
42__setup("panic=", panic_setup);
43
44static long no_blink(long time) 37static long no_blink(long time)
45{ 38{
46 return 0; 39 return 0;
@@ -143,6 +136,27 @@ NORET_TYPE void panic(const char * fmt, ...)
143 136
144EXPORT_SYMBOL(panic); 137EXPORT_SYMBOL(panic);
145 138
139
140struct tnt {
141 u8 bit;
142 char true;
143 char false;
144};
145
146static const struct tnt tnts[] = {
147 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
148 { TAINT_FORCED_MODULE, 'F', ' ' },
149 { TAINT_UNSAFE_SMP, 'S', ' ' },
150 { TAINT_FORCED_RMMOD, 'R', ' ' },
151 { TAINT_MACHINE_CHECK, 'M', ' ' },
152 { TAINT_BAD_PAGE, 'B', ' ' },
153 { TAINT_USER, 'U', ' ' },
154 { TAINT_DIE, 'D', ' ' },
155 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
156 { TAINT_WARN, 'W', ' ' },
157 { TAINT_CRAP, 'C', ' ' },
158};
159
146/** 160/**
147 * print_tainted - return a string to represent the kernel taint state. 161 * print_tainted - return a string to represent the kernel taint state.
148 * 162 *
@@ -155,44 +169,47 @@ EXPORT_SYMBOL(panic);
155 * 'U' - Userspace-defined naughtiness. 169 * 'U' - Userspace-defined naughtiness.
156 * 'A' - ACPI table overridden. 170 * 'A' - ACPI table overridden.
157 * 'W' - Taint on warning. 171 * 'W' - Taint on warning.
172 * 'C' - modules from drivers/staging are loaded.
158 * 173 *
159 * The string is overwritten by the next call to print_taint(). 174 * The string is overwritten by the next call to print_taint().
160 */ 175 */
161
162const char *print_tainted(void) 176const char *print_tainted(void)
163{ 177{
164 static char buf[20]; 178 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
165 if (tainted) { 179
166 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", 180 if (tainted_mask) {
167 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 181 char *s;
168 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 182 int i;
169 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 183
170 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 184 s = buf + sprintf(buf, "Tainted: ");
171 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 185 for (i = 0; i < ARRAY_SIZE(tnts); i++) {
172 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 186 const struct tnt *t = &tnts[i];
173 tainted & TAINT_USER ? 'U' : ' ', 187 *s++ = test_bit(t->bit, &tainted_mask) ?
174 tainted & TAINT_DIE ? 'D' : ' ', 188 t->true : t->false;
175 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', 189 }
176 tainted & TAINT_WARN ? 'W' : ' '); 190 *s = 0;
177 } 191 } else
178 else
179 snprintf(buf, sizeof(buf), "Not tainted"); 192 snprintf(buf, sizeof(buf), "Not tainted");
180 return(buf); 193 return(buf);
181} 194}
182 195
183void add_taint(unsigned flag) 196int test_taint(unsigned flag)
184{ 197{
185 debug_locks = 0; /* can't trust the integrity of the kernel anymore */ 198 return test_bit(flag, &tainted_mask);
186 tainted |= flag; 199}
200EXPORT_SYMBOL(test_taint);
201
202unsigned long get_taint(void)
203{
204 return tainted_mask;
187} 205}
188EXPORT_SYMBOL(add_taint);
189 206
190static int __init pause_on_oops_setup(char *str) 207void add_taint(unsigned flag)
191{ 208{
192 pause_on_oops = simple_strtoul(str, NULL, 0); 209 debug_locks = 0; /* can't trust the integrity of the kernel anymore */
193 return 1; 210 set_bit(flag, &tainted_mask);
194} 211}
195__setup("pause_on_oops=", pause_on_oops_setup); 212EXPORT_SYMBOL(add_taint);
196 213
197static void spin_msec(int msecs) 214static void spin_msec(int msecs)
198{ 215{
@@ -353,3 +370,6 @@ void __stack_chk_fail(void)
353} 370}
354EXPORT_SYMBOL(__stack_chk_fail); 371EXPORT_SYMBOL(__stack_chk_fail);
355#endif 372#endif
373
374core_param(panic, panic_timeout, int, 0644);
375core_param(pause_on_oops, pause_on_oops, int, 0644);
diff --git a/kernel/params.c b/kernel/params.c
index afc46a23eb6d..a1e3025b19a9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -373,6 +373,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
373} 373}
374 374
375/* sysfs output in /sys/modules/XYZ/parameters/ */ 375/* sysfs output in /sys/modules/XYZ/parameters/ */
376#define to_module_attr(n) container_of(n, struct module_attribute, attr);
377#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
376 378
377extern struct kernel_param __start___param[], __stop___param[]; 379extern struct kernel_param __start___param[], __stop___param[];
378 380
@@ -384,6 +386,7 @@ struct param_attribute
384 386
385struct module_param_attrs 387struct module_param_attrs
386{ 388{
389 unsigned int num;
387 struct attribute_group grp; 390 struct attribute_group grp;
388 struct param_attribute attrs[0]; 391 struct param_attribute attrs[0];
389}; 392};
@@ -434,93 +437,120 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
434 437
435#ifdef CONFIG_SYSFS 438#ifdef CONFIG_SYSFS
436/* 439/*
437 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME 440 * add_sysfs_param - add a parameter to sysfs
438 * @mk: struct module_kobject (contains parent kobject) 441 * @mk: struct module_kobject
439 * @kparam: array of struct kernel_param, the actual parameter definitions 442 * @kparam: the actual parameter definition to add to sysfs
440 * @num_params: number of entries in array 443 * @name: name of parameter
441 * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules"
442 * 444 *
443 * Create a kobject for a (per-module) group of parameters, and create files 445 * Create a kobject if for a (per-module) parameter if mp NULL, and
444 * in sysfs. A pointer to the param_kobject is returned on success, 446 * create file in sysfs. Returns an error on out of memory. Always cleans up
445 * NULL if there's no parameter to export, or other ERR_PTR(err). 447 * if there's an error.
446 */ 448 */
447static __modinit struct module_param_attrs * 449static __modinit int add_sysfs_param(struct module_kobject *mk,
448param_sysfs_setup(struct module_kobject *mk, 450 struct kernel_param *kp,
449 struct kernel_param *kparam, 451 const char *name)
450 unsigned int num_params,
451 unsigned int name_skip)
452{ 452{
453 struct module_param_attrs *mp; 453 struct module_param_attrs *new;
454 unsigned int valid_attrs = 0; 454 struct attribute **attrs;
455 unsigned int i, size[2]; 455 int err, num;
456 struct param_attribute *pattr; 456
457 struct attribute **gattr; 457 /* We don't bother calling this with invisible parameters. */
458 int err; 458 BUG_ON(!kp->perm);
459 459
460 for (i=0; i<num_params; i++) { 460 if (!mk->mp) {
461 if (kparam[i].perm) 461 num = 0;
462 valid_attrs++; 462 attrs = NULL;
463 } else {
464 num = mk->mp->num;
465 attrs = mk->mp->grp.attrs;
463 } 466 }
464 467
465 if (!valid_attrs) 468 /* Enlarge. */
466 return NULL; 469 new = krealloc(mk->mp,
467 470 sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
468 size[0] = ALIGN(sizeof(*mp) + 471 GFP_KERNEL);
469 valid_attrs * sizeof(mp->attrs[0]), 472 if (!new) {
470 sizeof(mp->grp.attrs[0])); 473 kfree(mk->mp);
471 size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); 474 err = -ENOMEM;
472 475 goto fail;
473 mp = kzalloc(size[0] + size[1], GFP_KERNEL);
474 if (!mp)
475 return ERR_PTR(-ENOMEM);
476
477 mp->grp.name = "parameters";
478 mp->grp.attrs = (void *)mp + size[0];
479
480 pattr = &mp->attrs[0];
481 gattr = &mp->grp.attrs[0];
482 for (i = 0; i < num_params; i++) {
483 struct kernel_param *kp = &kparam[i];
484 if (kp->perm) {
485 pattr->param = kp;
486 pattr->mattr.show = param_attr_show;
487 pattr->mattr.store = param_attr_store;
488 pattr->mattr.attr.name = (char *)&kp->name[name_skip];
489 pattr->mattr.attr.mode = kp->perm;
490 *(gattr++) = &(pattr++)->mattr.attr;
491 }
492 } 476 }
493 *gattr = NULL; 477 attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
494 478 if (!attrs) {
495 if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) { 479 err = -ENOMEM;
496 kfree(mp); 480 goto fail_free_new;
497 return ERR_PTR(err);
498 } 481 }
499 return mp; 482
483 /* Sysfs wants everything zeroed. */
484 memset(new, 0, sizeof(*new));
485 memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
486 memset(&attrs[num], 0, sizeof(attrs[num]));
487 new->grp.name = "parameters";
488 new->grp.attrs = attrs;
489
490 /* Tack new one on the end. */
491 new->attrs[num].param = kp;
492 new->attrs[num].mattr.show = param_attr_show;
493 new->attrs[num].mattr.store = param_attr_store;
494 new->attrs[num].mattr.attr.name = (char *)name;
495 new->attrs[num].mattr.attr.mode = kp->perm;
496 new->num = num+1;
497
498 /* Fix up all the pointers, since krealloc can move us */
499 for (num = 0; num < new->num; num++)
500 new->grp.attrs[num] = &new->attrs[num].mattr.attr;
501 new->grp.attrs[num] = NULL;
502
503 mk->mp = new;
504 return 0;
505
506fail_free_new:
507 kfree(new);
508fail:
509 mk->mp = NULL;
510 return err;
500} 511}
501 512
502#ifdef CONFIG_MODULES 513#ifdef CONFIG_MODULES
514static void free_module_param_attrs(struct module_kobject *mk)
515{
516 kfree(mk->mp->grp.attrs);
517 kfree(mk->mp);
518 mk->mp = NULL;
519}
520
503/* 521/*
504 * module_param_sysfs_setup - setup sysfs support for one module 522 * module_param_sysfs_setup - setup sysfs support for one module
505 * @mod: module 523 * @mod: module
506 * @kparam: module parameters (array) 524 * @kparam: module parameters (array)
507 * @num_params: number of module parameters 525 * @num_params: number of module parameters
508 * 526 *
509 * Adds sysfs entries for module parameters, and creates a link from 527 * Adds sysfs entries for module parameters under
510 * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/ 528 * /sys/module/[mod->name]/parameters/
511 */ 529 */
512int module_param_sysfs_setup(struct module *mod, 530int module_param_sysfs_setup(struct module *mod,
513 struct kernel_param *kparam, 531 struct kernel_param *kparam,
514 unsigned int num_params) 532 unsigned int num_params)
515{ 533{
516 struct module_param_attrs *mp; 534 int i, err;
535 bool params = false;
536
537 for (i = 0; i < num_params; i++) {
538 if (kparam[i].perm == 0)
539 continue;
540 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
541 if (err)
542 return err;
543 params = true;
544 }
517 545
518 mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0); 546 if (!params)
519 if (IS_ERR(mp)) 547 return 0;
520 return PTR_ERR(mp);
521 548
522 mod->param_attrs = mp; 549 /* Create the param group. */
523 return 0; 550 err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
551 if (err)
552 free_module_param_attrs(&mod->mkobj);
553 return err;
524} 554}
525 555
526/* 556/*
@@ -532,43 +562,55 @@ int module_param_sysfs_setup(struct module *mod,
532 */ 562 */
533void module_param_sysfs_remove(struct module *mod) 563void module_param_sysfs_remove(struct module *mod)
534{ 564{
535 if (mod->param_attrs) { 565 if (mod->mkobj.mp) {
536 sysfs_remove_group(&mod->mkobj.kobj, 566 sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
537 &mod->param_attrs->grp);
538 /* We are positive that no one is using any param 567 /* We are positive that no one is using any param
539 * attrs at this point. Deallocate immediately. */ 568 * attrs at this point. Deallocate immediately. */
540 kfree(mod->param_attrs); 569 free_module_param_attrs(&mod->mkobj);
541 mod->param_attrs = NULL;
542 } 570 }
543} 571}
544#endif 572#endif
545 573
546/* 574static void __init kernel_add_sysfs_param(const char *name,
547 * kernel_param_sysfs_setup - wrapper for built-in params support 575 struct kernel_param *kparam,
548 */ 576 unsigned int name_skip)
549static void __init kernel_param_sysfs_setup(const char *name,
550 struct kernel_param *kparam,
551 unsigned int num_params,
552 unsigned int name_skip)
553{ 577{
554 struct module_kobject *mk; 578 struct module_kobject *mk;
555 int ret; 579 struct kobject *kobj;
580 int err;
556 581
557 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); 582 kobj = kset_find_obj(module_kset, name);
558 BUG_ON(!mk); 583 if (kobj) {
559 584 /* We already have one. Remove params so we can add more. */
560 mk->mod = THIS_MODULE; 585 mk = to_module_kobject(kobj);
561 mk->kobj.kset = module_kset; 586 /* We need to remove it before adding parameters. */
562 ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); 587 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
563 if (ret) { 588 } else {
564 kobject_put(&mk->kobj); 589 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
565 printk(KERN_ERR "Module '%s' failed to be added to sysfs, " 590 BUG_ON(!mk);
566 "error number %d\n", name, ret); 591
567 printk(KERN_ERR "The system will be unstable now.\n"); 592 mk->mod = THIS_MODULE;
568 return; 593 mk->kobj.kset = module_kset;
594 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
595 "%s", name);
596 if (err) {
597 kobject_put(&mk->kobj);
598 printk(KERN_ERR "Module '%s' failed add to sysfs, "
599 "error number %d\n", name, err);
600 printk(KERN_ERR "The system will be unstable now.\n");
601 return;
602 }
603 /* So that exit path is even. */
604 kobject_get(&mk->kobj);
569 } 605 }
570 param_sysfs_setup(mk, kparam, num_params, name_skip); 606
607 /* These should not fail at boot. */
608 err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
609 BUG_ON(err);
610 err = sysfs_create_group(&mk->kobj, &mk->mp->grp);
611 BUG_ON(err);
571 kobject_uevent(&mk->kobj, KOBJ_ADD); 612 kobject_uevent(&mk->kobj, KOBJ_ADD);
613 kobject_put(&mk->kobj);
572} 614}
573 615
574/* 616/*
@@ -579,60 +621,36 @@ static void __init kernel_param_sysfs_setup(const char *name,
579 * The "module" name (KBUILD_MODNAME) is stored before a dot, the 621 * The "module" name (KBUILD_MODNAME) is stored before a dot, the
580 * "parameter" name is stored behind a dot in kernel_param->name. So, 622 * "parameter" name is stored behind a dot in kernel_param->name. So,
581 * extract the "module" name for all built-in kernel_param-eters, 623 * extract the "module" name for all built-in kernel_param-eters,
582 * and for all who have the same, call kernel_param_sysfs_setup. 624 * and for all who have the same, call kernel_add_sysfs_param.
583 */ 625 */
584static void __init param_sysfs_builtin(void) 626static void __init param_sysfs_builtin(void)
585{ 627{
586 struct kernel_param *kp, *kp_begin = NULL; 628 struct kernel_param *kp;
587 unsigned int i, name_len, count = 0; 629 unsigned int name_len;
588 char modname[MODULE_NAME_LEN + 1] = ""; 630 char modname[MODULE_NAME_LEN];
589 631
590 for (i=0; i < __stop___param - __start___param; i++) { 632 for (kp = __start___param; kp < __stop___param; kp++) {
591 char *dot; 633 char *dot;
592 size_t max_name_len;
593 634
594 kp = &__start___param[i]; 635 if (kp->perm == 0)
595 max_name_len = 636 continue;
596 min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
597 637
598 dot = memchr(kp->name, '.', max_name_len); 638 dot = strchr(kp->name, '.');
599 if (!dot) { 639 if (!dot) {
600 DEBUGP("couldn't find period in first %d characters " 640 /* This happens for core_param() */
601 "of %s\n", MODULE_NAME_LEN, kp->name); 641 strcpy(modname, "kernel");
602 continue; 642 name_len = 0;
603 } 643 } else {
604 name_len = dot - kp->name; 644 name_len = dot - kp->name + 1;
605 645 strlcpy(modname, kp->name, name_len);
606 /* new kbuild_modname? */
607 if (strlen(modname) != name_len
608 || strncmp(modname, kp->name, name_len) != 0) {
609 /* add a new kobject for previous kernel_params. */
610 if (count)
611 kernel_param_sysfs_setup(modname,
612 kp_begin,
613 count,
614 strlen(modname)+1);
615
616 strncpy(modname, kp->name, name_len);
617 modname[name_len] = '\0';
618 count = 0;
619 kp_begin = kp;
620 } 646 }
621 count++; 647 kernel_add_sysfs_param(modname, kp, name_len);
622 } 648 }
623
624 /* last kernel_params need to be registered as well */
625 if (count)
626 kernel_param_sysfs_setup(modname, kp_begin, count,
627 strlen(modname)+1);
628} 649}
629 650
630 651
631/* module-related sysfs stuff */ 652/* module-related sysfs stuff */
632 653
633#define to_module_attr(n) container_of(n, struct module_attribute, attr);
634#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
635
636static ssize_t module_attr_show(struct kobject *kobj, 654static ssize_t module_attr_show(struct kobject *kobj,
637 struct attribute *attr, 655 struct attribute *attr,
638 char *buf) 656 char *buf)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c42a03aef36f..895337b16a24 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,6 +7,93 @@
7#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/math64.h> 8#include <linux/math64.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h>
11
12/*
13 * Allocate the thread_group_cputime structure appropriately and fill in the
14 * current values of the fields. Called from copy_signal() via
15 * thread_group_cputime_clone_thread() when adding a second or subsequent
16 * thread to a thread group. Assumes interrupts are enabled when called.
17 */
18int thread_group_cputime_alloc(struct task_struct *tsk)
19{
20 struct signal_struct *sig = tsk->signal;
21 struct task_cputime *cputime;
22
23 /*
24 * If we have multiple threads and we don't already have a
25 * per-CPU task_cputime struct (checked in the caller), allocate
26 * one and fill it in with the times accumulated so far. We may
27 * race with another thread so recheck after we pick up the sighand
28 * lock.
29 */
30 cputime = alloc_percpu(struct task_cputime);
31 if (cputime == NULL)
32 return -ENOMEM;
33 spin_lock_irq(&tsk->sighand->siglock);
34 if (sig->cputime.totals) {
35 spin_unlock_irq(&tsk->sighand->siglock);
36 free_percpu(cputime);
37 return 0;
38 }
39 sig->cputime.totals = cputime;
40 cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
41 cputime->utime = tsk->utime;
42 cputime->stime = tsk->stime;
43 cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
44 spin_unlock_irq(&tsk->sighand->siglock);
45 return 0;
46}
47
48/**
49 * thread_group_cputime - Sum the thread group time fields across all CPUs.
50 *
51 * @tsk: The task we use to identify the thread group.
52 * @times: task_cputime structure in which we return the summed fields.
53 *
54 * Walk the list of CPUs to sum the per-CPU time fields in the thread group
55 * time structure.
56 */
57void thread_group_cputime(
58 struct task_struct *tsk,
59 struct task_cputime *times)
60{
61 struct signal_struct *sig;
62 int i;
63 struct task_cputime *tot;
64
65 sig = tsk->signal;
66 if (unlikely(!sig) || !sig->cputime.totals) {
67 times->utime = tsk->utime;
68 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return;
71 }
72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i);
76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime;
79 }
80}
81
82/*
83 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
84 */
85void update_rlimit_cpu(unsigned long rlim_new)
86{
87 cputime_t cputime;
88
89 cputime = secs_to_cputime(rlim_new);
90 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
91 cputime_lt(current->signal->it_prof_expires, cputime)) {
92 spin_lock_irq(&current->sighand->siglock);
93 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
94 spin_unlock_irq(&current->sighand->siglock);
95 }
96}
10 97
11static int check_clock(const clockid_t which_clock) 98static int check_clock(const clockid_t which_clock)
12{ 99{
@@ -158,10 +245,6 @@ static inline cputime_t virt_ticks(struct task_struct *p)
158{ 245{
159 return p->utime; 246 return p->utime;
160} 247}
161static inline unsigned long long sched_ns(struct task_struct *p)
162{
163 return task_sched_runtime(p);
164}
165 248
166int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 249int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
167{ 250{
@@ -211,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
211 cpu->cpu = virt_ticks(p); 294 cpu->cpu = virt_ticks(p);
212 break; 295 break;
213 case CPUCLOCK_SCHED: 296 case CPUCLOCK_SCHED:
214 cpu->sched = sched_ns(p); 297 cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
215 break; 298 break;
216 } 299 }
217 return 0; 300 return 0;
@@ -220,59 +303,30 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
220/* 303/*
221 * Sample a process (thread group) clock for the given group_leader task. 304 * Sample a process (thread group) clock for the given group_leader task.
222 * Must be called with tasklist_lock held for reading. 305 * Must be called with tasklist_lock held for reading.
223 * Must be called with tasklist_lock held for reading, and p->sighand->siglock.
224 */ 306 */
225static int cpu_clock_sample_group_locked(unsigned int clock_idx, 307static int cpu_clock_sample_group(const clockid_t which_clock,
226 struct task_struct *p, 308 struct task_struct *p,
227 union cpu_time_count *cpu) 309 union cpu_time_count *cpu)
228{ 310{
229 struct task_struct *t = p; 311 struct task_cputime cputime;
230 switch (clock_idx) { 312
313 thread_group_cputime(p, &cputime);
314 switch (which_clock) {
231 default: 315 default:
232 return -EINVAL; 316 return -EINVAL;
233 case CPUCLOCK_PROF: 317 case CPUCLOCK_PROF:
234 cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); 318 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
235 do {
236 cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
237 t = next_thread(t);
238 } while (t != p);
239 break; 319 break;
240 case CPUCLOCK_VIRT: 320 case CPUCLOCK_VIRT:
241 cpu->cpu = p->signal->utime; 321 cpu->cpu = cputime.utime;
242 do {
243 cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
244 t = next_thread(t);
245 } while (t != p);
246 break; 322 break;
247 case CPUCLOCK_SCHED: 323 case CPUCLOCK_SCHED:
248 cpu->sched = p->signal->sum_sched_runtime; 324 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
249 /* Add in each other live thread. */
250 while ((t = next_thread(t)) != p) {
251 cpu->sched += t->se.sum_exec_runtime;
252 }
253 cpu->sched += sched_ns(p);
254 break; 325 break;
255 } 326 }
256 return 0; 327 return 0;
257} 328}
258 329
259/*
260 * Sample a process (thread group) clock for the given group_leader task.
261 * Must be called with tasklist_lock held for reading.
262 */
263static int cpu_clock_sample_group(const clockid_t which_clock,
264 struct task_struct *p,
265 union cpu_time_count *cpu)
266{
267 int ret;
268 unsigned long flags;
269 spin_lock_irqsave(&p->sighand->siglock, flags);
270 ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
271 cpu);
272 spin_unlock_irqrestore(&p->sighand->siglock, flags);
273 return ret;
274}
275
276 330
277int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 331int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
278{ 332{
@@ -471,80 +525,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
471} 525}
472void posix_cpu_timers_exit_group(struct task_struct *tsk) 526void posix_cpu_timers_exit_group(struct task_struct *tsk)
473{ 527{
474 cleanup_timers(tsk->signal->cpu_timers, 528 struct task_cputime cputime;
475 cputime_add(tsk->utime, tsk->signal->utime),
476 cputime_add(tsk->stime, tsk->signal->stime),
477 tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
478}
479
480
481/*
482 * Set the expiry times of all the threads in the process so one of them
483 * will go off before the process cumulative expiry total is reached.
484 */
485static void process_timer_rebalance(struct task_struct *p,
486 unsigned int clock_idx,
487 union cpu_time_count expires,
488 union cpu_time_count val)
489{
490 cputime_t ticks, left;
491 unsigned long long ns, nsleft;
492 struct task_struct *t = p;
493 unsigned int nthreads = atomic_read(&p->signal->live);
494 529
495 if (!nthreads) 530 thread_group_cputime(tsk, &cputime);
496 return; 531 cleanup_timers(tsk->signal->cpu_timers,
497 532 cputime.utime, cputime.stime, cputime.sum_exec_runtime);
498 switch (clock_idx) {
499 default:
500 BUG();
501 break;
502 case CPUCLOCK_PROF:
503 left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
504 nthreads);
505 do {
506 if (likely(!(t->flags & PF_EXITING))) {
507 ticks = cputime_add(prof_ticks(t), left);
508 if (cputime_eq(t->it_prof_expires,
509 cputime_zero) ||
510 cputime_gt(t->it_prof_expires, ticks)) {
511 t->it_prof_expires = ticks;
512 }
513 }
514 t = next_thread(t);
515 } while (t != p);
516 break;
517 case CPUCLOCK_VIRT:
518 left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
519 nthreads);
520 do {
521 if (likely(!(t->flags & PF_EXITING))) {
522 ticks = cputime_add(virt_ticks(t), left);
523 if (cputime_eq(t->it_virt_expires,
524 cputime_zero) ||
525 cputime_gt(t->it_virt_expires, ticks)) {
526 t->it_virt_expires = ticks;
527 }
528 }
529 t = next_thread(t);
530 } while (t != p);
531 break;
532 case CPUCLOCK_SCHED:
533 nsleft = expires.sched - val.sched;
534 do_div(nsleft, nthreads);
535 nsleft = max_t(unsigned long long, nsleft, 1);
536 do {
537 if (likely(!(t->flags & PF_EXITING))) {
538 ns = t->se.sum_exec_runtime + nsleft;
539 if (t->it_sched_expires == 0 ||
540 t->it_sched_expires > ns) {
541 t->it_sched_expires = ns;
542 }
543 }
544 t = next_thread(t);
545 } while (t != p);
546 break;
547 }
548} 533}
549 534
550static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 535static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -608,29 +593,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
608 default: 593 default:
609 BUG(); 594 BUG();
610 case CPUCLOCK_PROF: 595 case CPUCLOCK_PROF:
611 if (cputime_eq(p->it_prof_expires, 596 if (cputime_eq(p->cputime_expires.prof_exp,
612 cputime_zero) || 597 cputime_zero) ||
613 cputime_gt(p->it_prof_expires, 598 cputime_gt(p->cputime_expires.prof_exp,
614 nt->expires.cpu)) 599 nt->expires.cpu))
615 p->it_prof_expires = nt->expires.cpu; 600 p->cputime_expires.prof_exp =
601 nt->expires.cpu;
616 break; 602 break;
617 case CPUCLOCK_VIRT: 603 case CPUCLOCK_VIRT:
618 if (cputime_eq(p->it_virt_expires, 604 if (cputime_eq(p->cputime_expires.virt_exp,
619 cputime_zero) || 605 cputime_zero) ||
620 cputime_gt(p->it_virt_expires, 606 cputime_gt(p->cputime_expires.virt_exp,
621 nt->expires.cpu)) 607 nt->expires.cpu))
622 p->it_virt_expires = nt->expires.cpu; 608 p->cputime_expires.virt_exp =
609 nt->expires.cpu;
623 break; 610 break;
624 case CPUCLOCK_SCHED: 611 case CPUCLOCK_SCHED:
625 if (p->it_sched_expires == 0 || 612 if (p->cputime_expires.sched_exp == 0 ||
626 p->it_sched_expires > nt->expires.sched) 613 p->cputime_expires.sched_exp >
627 p->it_sched_expires = nt->expires.sched; 614 nt->expires.sched)
615 p->cputime_expires.sched_exp =
616 nt->expires.sched;
628 break; 617 break;
629 } 618 }
630 } else { 619 } else {
631 /* 620 /*
632 * For a process timer, we must balance 621 * For a process timer, set the cached expiration time.
633 * all the live threads' expirations.
634 */ 622 */
635 switch (CPUCLOCK_WHICH(timer->it_clock)) { 623 switch (CPUCLOCK_WHICH(timer->it_clock)) {
636 default: 624 default:
@@ -641,7 +629,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
641 cputime_lt(p->signal->it_virt_expires, 629 cputime_lt(p->signal->it_virt_expires,
642 timer->it.cpu.expires.cpu)) 630 timer->it.cpu.expires.cpu))
643 break; 631 break;
644 goto rebalance; 632 p->signal->cputime_expires.virt_exp =
633 timer->it.cpu.expires.cpu;
634 break;
645 case CPUCLOCK_PROF: 635 case CPUCLOCK_PROF:
646 if (!cputime_eq(p->signal->it_prof_expires, 636 if (!cputime_eq(p->signal->it_prof_expires,
647 cputime_zero) && 637 cputime_zero) &&
@@ -652,13 +642,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
652 if (i != RLIM_INFINITY && 642 if (i != RLIM_INFINITY &&
653 i <= cputime_to_secs(timer->it.cpu.expires.cpu)) 643 i <= cputime_to_secs(timer->it.cpu.expires.cpu))
654 break; 644 break;
655 goto rebalance; 645 p->signal->cputime_expires.prof_exp =
646 timer->it.cpu.expires.cpu;
647 break;
656 case CPUCLOCK_SCHED: 648 case CPUCLOCK_SCHED:
657 rebalance: 649 p->signal->cputime_expires.sched_exp =
658 process_timer_rebalance( 650 timer->it.cpu.expires.sched;
659 timer->it.cpu.task,
660 CPUCLOCK_WHICH(timer->it_clock),
661 timer->it.cpu.expires, now);
662 break; 651 break;
663 } 652 }
664 } 653 }
@@ -969,13 +958,13 @@ static void check_thread_timers(struct task_struct *tsk,
969 struct signal_struct *const sig = tsk->signal; 958 struct signal_struct *const sig = tsk->signal;
970 959
971 maxfire = 20; 960 maxfire = 20;
972 tsk->it_prof_expires = cputime_zero; 961 tsk->cputime_expires.prof_exp = cputime_zero;
973 while (!list_empty(timers)) { 962 while (!list_empty(timers)) {
974 struct cpu_timer_list *t = list_first_entry(timers, 963 struct cpu_timer_list *t = list_first_entry(timers,
975 struct cpu_timer_list, 964 struct cpu_timer_list,
976 entry); 965 entry);
977 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 966 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
978 tsk->it_prof_expires = t->expires.cpu; 967 tsk->cputime_expires.prof_exp = t->expires.cpu;
979 break; 968 break;
980 } 969 }
981 t->firing = 1; 970 t->firing = 1;
@@ -984,13 +973,13 @@ static void check_thread_timers(struct task_struct *tsk,
984 973
985 ++timers; 974 ++timers;
986 maxfire = 20; 975 maxfire = 20;
987 tsk->it_virt_expires = cputime_zero; 976 tsk->cputime_expires.virt_exp = cputime_zero;
988 while (!list_empty(timers)) { 977 while (!list_empty(timers)) {
989 struct cpu_timer_list *t = list_first_entry(timers, 978 struct cpu_timer_list *t = list_first_entry(timers,
990 struct cpu_timer_list, 979 struct cpu_timer_list,
991 entry); 980 entry);
992 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 981 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
993 tsk->it_virt_expires = t->expires.cpu; 982 tsk->cputime_expires.virt_exp = t->expires.cpu;
994 break; 983 break;
995 } 984 }
996 t->firing = 1; 985 t->firing = 1;
@@ -999,13 +988,13 @@ static void check_thread_timers(struct task_struct *tsk,
999 988
1000 ++timers; 989 ++timers;
1001 maxfire = 20; 990 maxfire = 20;
1002 tsk->it_sched_expires = 0; 991 tsk->cputime_expires.sched_exp = 0;
1003 while (!list_empty(timers)) { 992 while (!list_empty(timers)) {
1004 struct cpu_timer_list *t = list_first_entry(timers, 993 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 994 struct cpu_timer_list,
1006 entry); 995 entry);
1007 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { 996 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
1008 tsk->it_sched_expires = t->expires.sched; 997 tsk->cputime_expires.sched_exp = t->expires.sched;
1009 break; 998 break;
1010 } 999 }
1011 t->firing = 1; 1000 t->firing = 1;
@@ -1055,10 +1044,10 @@ static void check_process_timers(struct task_struct *tsk,
1055{ 1044{
1056 int maxfire; 1045 int maxfire;
1057 struct signal_struct *const sig = tsk->signal; 1046 struct signal_struct *const sig = tsk->signal;
1058 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1047 cputime_t utime, ptime, virt_expires, prof_expires;
1059 unsigned long long sum_sched_runtime, sched_expires; 1048 unsigned long long sum_sched_runtime, sched_expires;
1060 struct task_struct *t;
1061 struct list_head *timers = sig->cpu_timers; 1049 struct list_head *timers = sig->cpu_timers;
1050 struct task_cputime cputime;
1062 1051
1063 /* 1052 /*
1064 * Don't sample the current process CPU clocks if there are no timers. 1053 * Don't sample the current process CPU clocks if there are no timers.
@@ -1074,18 +1063,10 @@ static void check_process_timers(struct task_struct *tsk,
1074 /* 1063 /*
1075 * Collect the current process totals. 1064 * Collect the current process totals.
1076 */ 1065 */
1077 utime = sig->utime; 1066 thread_group_cputime(tsk, &cputime);
1078 stime = sig->stime; 1067 utime = cputime.utime;
1079 sum_sched_runtime = sig->sum_sched_runtime; 1068 ptime = cputime_add(utime, cputime.stime);
1080 t = tsk; 1069 sum_sched_runtime = cputime.sum_exec_runtime;
1081 do {
1082 utime = cputime_add(utime, t->utime);
1083 stime = cputime_add(stime, t->stime);
1084 sum_sched_runtime += t->se.sum_exec_runtime;
1085 t = next_thread(t);
1086 } while (t != tsk);
1087 ptime = cputime_add(utime, stime);
1088
1089 maxfire = 20; 1070 maxfire = 20;
1090 prof_expires = cputime_zero; 1071 prof_expires = cputime_zero;
1091 while (!list_empty(timers)) { 1072 while (!list_empty(timers)) {
@@ -1193,60 +1174,18 @@ static void check_process_timers(struct task_struct *tsk,
1193 } 1174 }
1194 } 1175 }
1195 1176
1196 if (!cputime_eq(prof_expires, cputime_zero) || 1177 if (!cputime_eq(prof_expires, cputime_zero) &&
1197 !cputime_eq(virt_expires, cputime_zero) || 1178 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) ||
1198 sched_expires != 0) { 1179 cputime_gt(sig->cputime_expires.prof_exp, prof_expires)))
1199 /* 1180 sig->cputime_expires.prof_exp = prof_expires;
1200 * Rebalance the threads' expiry times for the remaining 1181 if (!cputime_eq(virt_expires, cputime_zero) &&
1201 * process CPU timers. 1182 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1202 */ 1183 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1203 1184 sig->cputime_expires.virt_exp = virt_expires;
1204 cputime_t prof_left, virt_left, ticks; 1185 if (sched_expires != 0 &&
1205 unsigned long long sched_left, sched; 1186 (sig->cputime_expires.sched_exp == 0 ||
1206 const unsigned int nthreads = atomic_read(&sig->live); 1187 sig->cputime_expires.sched_exp > sched_expires))
1207 1188 sig->cputime_expires.sched_exp = sched_expires;
1208 if (!nthreads)
1209 return;
1210
1211 prof_left = cputime_sub(prof_expires, utime);
1212 prof_left = cputime_sub(prof_left, stime);
1213 prof_left = cputime_div_non_zero(prof_left, nthreads);
1214 virt_left = cputime_sub(virt_expires, utime);
1215 virt_left = cputime_div_non_zero(virt_left, nthreads);
1216 if (sched_expires) {
1217 sched_left = sched_expires - sum_sched_runtime;
1218 do_div(sched_left, nthreads);
1219 sched_left = max_t(unsigned long long, sched_left, 1);
1220 } else {
1221 sched_left = 0;
1222 }
1223 t = tsk;
1224 do {
1225 if (unlikely(t->flags & PF_EXITING))
1226 continue;
1227
1228 ticks = cputime_add(cputime_add(t->utime, t->stime),
1229 prof_left);
1230 if (!cputime_eq(prof_expires, cputime_zero) &&
1231 (cputime_eq(t->it_prof_expires, cputime_zero) ||
1232 cputime_gt(t->it_prof_expires, ticks))) {
1233 t->it_prof_expires = ticks;
1234 }
1235
1236 ticks = cputime_add(t->utime, virt_left);
1237 if (!cputime_eq(virt_expires, cputime_zero) &&
1238 (cputime_eq(t->it_virt_expires, cputime_zero) ||
1239 cputime_gt(t->it_virt_expires, ticks))) {
1240 t->it_virt_expires = ticks;
1241 }
1242
1243 sched = t->se.sum_exec_runtime + sched_left;
1244 if (sched_expires && (t->it_sched_expires == 0 ||
1245 t->it_sched_expires > sched)) {
1246 t->it_sched_expires = sched;
1247 }
1248 } while ((t = next_thread(t)) != tsk);
1249 }
1250} 1189}
1251 1190
1252/* 1191/*
@@ -1314,6 +1253,89 @@ out:
1314 ++timer->it_requeue_pending; 1253 ++timer->it_requeue_pending;
1315} 1254}
1316 1255
1256/**
1257 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1258 *
1259 * @cputime: The struct to compare.
1260 *
1261 * Checks @cputime to see if all fields are zero. Returns true if all fields
1262 * are zero, false if any field is nonzero.
1263 */
1264static inline int task_cputime_zero(const struct task_cputime *cputime)
1265{
1266 if (cputime_eq(cputime->utime, cputime_zero) &&
1267 cputime_eq(cputime->stime, cputime_zero) &&
1268 cputime->sum_exec_runtime == 0)
1269 return 1;
1270 return 0;
1271}
1272
1273/**
1274 * task_cputime_expired - Compare two task_cputime entities.
1275 *
1276 * @sample: The task_cputime structure to be checked for expiration.
1277 * @expires: Expiration times, against which @sample will be checked.
1278 *
1279 * Checks @sample against @expires to see if any field of @sample has expired.
1280 * Returns true if any field of the former is greater than the corresponding
1281 * field of the latter if the latter field is set. Otherwise returns false.
1282 */
1283static inline int task_cputime_expired(const struct task_cputime *sample,
1284 const struct task_cputime *expires)
1285{
1286 if (!cputime_eq(expires->utime, cputime_zero) &&
1287 cputime_ge(sample->utime, expires->utime))
1288 return 1;
1289 if (!cputime_eq(expires->stime, cputime_zero) &&
1290 cputime_ge(cputime_add(sample->utime, sample->stime),
1291 expires->stime))
1292 return 1;
1293 if (expires->sum_exec_runtime != 0 &&
1294 sample->sum_exec_runtime >= expires->sum_exec_runtime)
1295 return 1;
1296 return 0;
1297}
1298
1299/**
1300 * fastpath_timer_check - POSIX CPU timers fast path.
1301 *
1302 * @tsk: The task (thread) being checked.
1303 *
1304 * Check the task and thread group timers. If both are zero (there are no
1305 * timers set) return false. Otherwise snapshot the task and thread group
1306 * timers and compare them with the corresponding expiration times. Return
1307 * true if a timer has expired, else return false.
1308 */
1309static inline int fastpath_timer_check(struct task_struct *tsk)
1310{
1311 struct signal_struct *sig;
1312
1313 /* tsk == current, ensure it is safe to use ->signal/sighand */
1314 if (unlikely(tsk->exit_state))
1315 return 0;
1316
1317 if (!task_cputime_zero(&tsk->cputime_expires)) {
1318 struct task_cputime task_sample = {
1319 .utime = tsk->utime,
1320 .stime = tsk->stime,
1321 .sum_exec_runtime = tsk->se.sum_exec_runtime
1322 };
1323
1324 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1325 return 1;
1326 }
1327
1328 sig = tsk->signal;
1329 if (!task_cputime_zero(&sig->cputime_expires)) {
1330 struct task_cputime group_sample;
1331
1332 thread_group_cputime(tsk, &group_sample);
1333 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1334 return 1;
1335 }
1336 return 0;
1337}
1338
1317/* 1339/*
1318 * This is called from the timer interrupt handler. The irq handler has 1340 * This is called from the timer interrupt handler. The irq handler has
1319 * already updated our counts. We need to check if any timers fire now. 1341 * already updated our counts. We need to check if any timers fire now.
@@ -1326,42 +1348,31 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1326 1348
1327 BUG_ON(!irqs_disabled()); 1349 BUG_ON(!irqs_disabled());
1328 1350
1329#define UNEXPIRED(clock) \ 1351 /*
1330 (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ 1352 * The fast path checks that there are no expired thread or thread
1331 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) 1353 * group timers. If that's so, just return.
1332 1354 */
1333 if (UNEXPIRED(prof) && UNEXPIRED(virt) && 1355 if (!fastpath_timer_check(tsk))
1334 (tsk->it_sched_expires == 0 ||
1335 tsk->se.sum_exec_runtime < tsk->it_sched_expires))
1336 return; 1356 return;
1337 1357
1338#undef UNEXPIRED 1358 spin_lock(&tsk->sighand->siglock);
1339
1340 /* 1359 /*
1341 * Double-check with locks held. 1360 * Here we take off tsk->signal->cpu_timers[N] and
1361 * tsk->cpu_timers[N] all the timers that are firing, and
1362 * put them on the firing list.
1342 */ 1363 */
1343 read_lock(&tasklist_lock); 1364 check_thread_timers(tsk, &firing);
1344 if (likely(tsk->signal != NULL)) { 1365 check_process_timers(tsk, &firing);
1345 spin_lock(&tsk->sighand->siglock);
1346 1366
1347 /* 1367 /*
1348 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] 1368 * We must release these locks before taking any timer's lock.
1349 * all the timers that are firing, and put them on the firing list. 1369 * There is a potential race with timer deletion here, as the
1350 */ 1370 * siglock now protects our private firing list. We have set
1351 check_thread_timers(tsk, &firing); 1371 * the firing flag in each timer, so that a deletion attempt
1352 check_process_timers(tsk, &firing); 1372 * that gets the timer lock before we do will give it up and
1353 1373 * spin until we've taken care of that timer below.
1354 /* 1374 */
1355 * We must release these locks before taking any timer's lock. 1375 spin_unlock(&tsk->sighand->siglock);
1356 * There is a potential race with timer deletion here, as the
1357 * siglock now protects our private firing list. We have set
1358 * the firing flag in each timer, so that a deletion attempt
1359 * that gets the timer lock before we do will give it up and
1360 * spin until we've taken care of that timer below.
1361 */
1362 spin_unlock(&tsk->sighand->siglock);
1363 }
1364 read_unlock(&tasklist_lock);
1365 1376
1366 /* 1377 /*
1367 * Now that all the timers on our list have the firing flag, 1378 * Now that all the timers on our list have the firing flag,
@@ -1389,10 +1400,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1389 1400
1390/* 1401/*
1391 * Set one of the process-wide special case CPU timers. 1402 * Set one of the process-wide special case CPU timers.
1392 * The tasklist_lock and tsk->sighand->siglock must be held by the caller. 1403 * The tsk->sighand->siglock must be held by the caller.
1393 * The oldval argument is null for the RLIMIT_CPU timer, where *newval is 1404 * The *newval argument is relative and we update it to be absolute, *oldval
1394 * absolute; non-null for ITIMER_*, where *newval is relative and we update 1405 * is absolute and we update it to be relative.
1395 * it to be absolute, *oldval is absolute and we update it to be relative.
1396 */ 1406 */
1397void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1407void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1398 cputime_t *newval, cputime_t *oldval) 1408 cputime_t *newval, cputime_t *oldval)
@@ -1401,7 +1411,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1401 struct list_head *head; 1411 struct list_head *head;
1402 1412
1403 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1413 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1404 cpu_clock_sample_group_locked(clock_idx, tsk, &now); 1414 cpu_clock_sample_group(clock_idx, tsk, &now);
1405 1415
1406 if (oldval) { 1416 if (oldval) {
1407 if (!cputime_eq(*oldval, cputime_zero)) { 1417 if (!cputime_eq(*oldval, cputime_zero)) {
@@ -1435,13 +1445,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1435 cputime_ge(list_first_entry(head, 1445 cputime_ge(list_first_entry(head,
1436 struct cpu_timer_list, entry)->expires.cpu, 1446 struct cpu_timer_list, entry)->expires.cpu,
1437 *newval)) { 1447 *newval)) {
1438 /* 1448 switch (clock_idx) {
1439 * Rejigger each thread's expiry time so that one will 1449 case CPUCLOCK_PROF:
1440 * notice before we hit the process-cumulative expiry time. 1450 tsk->signal->cputime_expires.prof_exp = *newval;
1441 */ 1451 break;
1442 union cpu_time_count expires = { .sched = 0 }; 1452 case CPUCLOCK_VIRT:
1443 expires.cpu = *newval; 1453 tsk->signal->cputime_expires.virt_exp = *newval;
1444 process_timer_rebalance(tsk, clock_idx, expires, now); 1454 break;
1455 }
1445 } 1456 }
1446} 1457}
1447 1458
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e36d5798cbff..5e79c662294b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -223,6 +223,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
223} 223}
224 224
225/* 225/*
226 * Get monotonic time for posix timers
227 */
228static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
229{
230 getrawmonotonic(tp);
231 return 0;
232}
233
234/*
226 * Initialize everything, well, just everything in Posix clocks/timers ;) 235 * Initialize everything, well, just everything in Posix clocks/timers ;)
227 */ 236 */
228static __init int init_posix_timers(void) 237static __init int init_posix_timers(void)
@@ -235,9 +244,15 @@ static __init int init_posix_timers(void)
235 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
236 .clock_set = do_posix_clock_nosettime, 245 .clock_set = do_posix_clock_nosettime,
237 }; 246 };
247 struct k_clock clock_monotonic_raw = {
248 .clock_getres = hrtimer_get_res,
249 .clock_get = posix_get_monotonic_raw,
250 .clock_set = do_posix_clock_nosettime,
251 };
238 252
239 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 253 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
240 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 254 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
255 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
241 256
242 posix_timers_cache = kmem_cache_create("posix_timers_cache", 257 posix_timers_cache = kmem_cache_create("posix_timers_cache",
243 sizeof (struct k_itimer), 0, SLAB_PANIC, 258 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -298,6 +313,7 @@ void do_schedule_next_timer(struct siginfo *info)
298 313
299int posix_timer_event(struct k_itimer *timr, int si_private) 314int posix_timer_event(struct k_itimer *timr, int si_private)
300{ 315{
316 int shared, ret;
301 /* 317 /*
302 * FIXME: if ->sigq is queued we can race with 318 * FIXME: if ->sigq is queued we can race with
303 * dequeue_signal()->do_schedule_next_timer(). 319 * dequeue_signal()->do_schedule_next_timer().
@@ -311,25 +327,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
311 */ 327 */
312 timr->sigq->info.si_sys_private = si_private; 328 timr->sigq->info.si_sys_private = si_private;
313 329
314 timr->sigq->info.si_signo = timr->it_sigev_signo; 330 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
315 timr->sigq->info.si_code = SI_TIMER; 331 ret = send_sigqueue(timr->sigq, timr->it_process, shared);
316 timr->sigq->info.si_tid = timr->it_id; 332 /* If we failed to send the signal the timer stops. */
317 timr->sigq->info.si_value = timr->it_sigev_value; 333 return ret > 0;
318
319 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
320 struct task_struct *leader;
321 int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
322
323 if (likely(ret >= 0))
324 return ret;
325
326 timr->it_sigev_notify = SIGEV_SIGNAL;
327 leader = timr->it_process->group_leader;
328 put_task_struct(timr->it_process);
329 timr->it_process = leader;
330 }
331
332 return send_sigqueue(timr->sigq, timr->it_process, 1);
333} 334}
334EXPORT_SYMBOL_GPL(posix_timer_event); 335EXPORT_SYMBOL_GPL(posix_timer_event);
335 336
@@ -441,7 +442,7 @@ static struct k_itimer * alloc_posix_timer(void)
441 return tmr; 442 return tmr;
442 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { 443 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
443 kmem_cache_free(posix_timers_cache, tmr); 444 kmem_cache_free(posix_timers_cache, tmr);
444 tmr = NULL; 445 return NULL;
445 } 446 }
446 memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); 447 memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
447 return tmr; 448 return tmr;
@@ -468,11 +469,9 @@ sys_timer_create(const clockid_t which_clock,
468 struct sigevent __user *timer_event_spec, 469 struct sigevent __user *timer_event_spec,
469 timer_t __user * created_timer_id) 470 timer_t __user * created_timer_id)
470{ 471{
471 int error = 0; 472 struct k_itimer *new_timer;
472 struct k_itimer *new_timer = NULL; 473 int error, new_timer_id;
473 int new_timer_id; 474 struct task_struct *process;
474 struct task_struct *process = NULL;
475 unsigned long flags;
476 sigevent_t event; 475 sigevent_t event;
477 int it_id_set = IT_ID_NOT_SET; 476 int it_id_set = IT_ID_NOT_SET;
478 477
@@ -490,12 +489,11 @@ sys_timer_create(const clockid_t which_clock,
490 goto out; 489 goto out;
491 } 490 }
492 spin_lock_irq(&idr_lock); 491 spin_lock_irq(&idr_lock);
493 error = idr_get_new(&posix_timers_id, (void *) new_timer, 492 error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
494 &new_timer_id);
495 spin_unlock_irq(&idr_lock); 493 spin_unlock_irq(&idr_lock);
496 if (error == -EAGAIN) 494 if (error) {
497 goto retry; 495 if (error == -EAGAIN)
498 else if (error) { 496 goto retry;
499 /* 497 /*
500 * Weird looking, but we return EAGAIN if the IDR is 498 * Weird looking, but we return EAGAIN if the IDR is
501 * full (proper POSIX return value for this) 499 * full (proper POSIX return value for this)
@@ -526,67 +524,43 @@ sys_timer_create(const clockid_t which_clock,
526 error = -EFAULT; 524 error = -EFAULT;
527 goto out; 525 goto out;
528 } 526 }
529 new_timer->it_sigev_notify = event.sigev_notify; 527 rcu_read_lock();
530 new_timer->it_sigev_signo = event.sigev_signo; 528 process = good_sigevent(&event);
531 new_timer->it_sigev_value = event.sigev_value; 529 if (process)
532 530 get_task_struct(process);
533 read_lock(&tasklist_lock); 531 rcu_read_unlock();
534 if ((process = good_sigevent(&event))) {
535 /*
536 * We may be setting up this process for another
537 * thread. It may be exiting. To catch this
538 * case the we check the PF_EXITING flag. If
539 * the flag is not set, the siglock will catch
540 * him before it is too late (in exit_itimers).
541 *
542 * The exec case is a bit more invloved but easy
543 * to code. If the process is in our thread
544 * group (and it must be or we would not allow
545 * it here) and is doing an exec, it will cause
546 * us to be killed. In this case it will wait
547 * for us to die which means we can finish this
548 * linkage with our last gasp. I.e. no code :)
549 */
550 spin_lock_irqsave(&process->sighand->siglock, flags);
551 if (!(process->flags & PF_EXITING)) {
552 new_timer->it_process = process;
553 list_add(&new_timer->list,
554 &process->signal->posix_timers);
555 if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
556 get_task_struct(process);
557 spin_unlock_irqrestore(&process->sighand->siglock, flags);
558 } else {
559 spin_unlock_irqrestore(&process->sighand->siglock, flags);
560 process = NULL;
561 }
562 }
563 read_unlock(&tasklist_lock);
564 if (!process) { 532 if (!process) {
565 error = -EINVAL; 533 error = -EINVAL;
566 goto out; 534 goto out;
567 } 535 }
568 } else { 536 } else {
569 new_timer->it_sigev_notify = SIGEV_SIGNAL; 537 event.sigev_notify = SIGEV_SIGNAL;
570 new_timer->it_sigev_signo = SIGALRM; 538 event.sigev_signo = SIGALRM;
571 new_timer->it_sigev_value.sival_int = new_timer->it_id; 539 event.sigev_value.sival_int = new_timer->it_id;
572 process = current->group_leader; 540 process = current->group_leader;
573 spin_lock_irqsave(&process->sighand->siglock, flags); 541 get_task_struct(process);
574 new_timer->it_process = process;
575 list_add(&new_timer->list, &process->signal->posix_timers);
576 spin_unlock_irqrestore(&process->sighand->siglock, flags);
577 } 542 }
578 543
544 new_timer->it_sigev_notify = event.sigev_notify;
545 new_timer->sigq->info.si_signo = event.sigev_signo;
546 new_timer->sigq->info.si_value = event.sigev_value;
547 new_timer->sigq->info.si_tid = new_timer->it_id;
548 new_timer->sigq->info.si_code = SI_TIMER;
549
550 spin_lock_irq(&current->sighand->siglock);
551 new_timer->it_process = process;
552 list_add(&new_timer->list, &current->signal->posix_timers);
553 spin_unlock_irq(&current->sighand->siglock);
554
555 return 0;
579 /* 556 /*
580 * In the case of the timer belonging to another task, after 557 * In the case of the timer belonging to another task, after
581 * the task is unlocked, the timer is owned by the other task 558 * the task is unlocked, the timer is owned by the other task
582 * and may cease to exist at any time. Don't use or modify 559 * and may cease to exist at any time. Don't use or modify
583 * new_timer after the unlock call. 560 * new_timer after the unlock call.
584 */ 561 */
585
586out: 562out:
587 if (error) 563 release_posix_timer(new_timer, it_id_set);
588 release_posix_timer(new_timer, it_id_set);
589
590 return error; 564 return error;
591} 565}
592 566
@@ -597,7 +571,7 @@ out:
597 * the find to the timer lock. To avoid a dead lock, the timer id MUST 571 * the find to the timer lock. To avoid a dead lock, the timer id MUST
598 * be release with out holding the timer lock. 572 * be release with out holding the timer lock.
599 */ 573 */
600static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) 574static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
601{ 575{
602 struct k_itimer *timr; 576 struct k_itimer *timr;
603 /* 577 /*
@@ -605,23 +579,20 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
605 * flags part over to the timer lock. Must not let interrupts in 579 * flags part over to the timer lock. Must not let interrupts in
606 * while we are moving the lock. 580 * while we are moving the lock.
607 */ 581 */
608
609 spin_lock_irqsave(&idr_lock, *flags); 582 spin_lock_irqsave(&idr_lock, *flags);
610 timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); 583 timr = idr_find(&posix_timers_id, (int)timer_id);
611 if (timr) { 584 if (timr) {
612 spin_lock(&timr->it_lock); 585 spin_lock(&timr->it_lock);
613 586 if (timr->it_process &&
614 if ((timr->it_id != timer_id) || !(timr->it_process) || 587 same_thread_group(timr->it_process, current)) {
615 !same_thread_group(timr->it_process, current)) {
616 spin_unlock(&timr->it_lock);
617 spin_unlock_irqrestore(&idr_lock, *flags);
618 timr = NULL;
619 } else
620 spin_unlock(&idr_lock); 588 spin_unlock(&idr_lock);
621 } else 589 return timr;
622 spin_unlock_irqrestore(&idr_lock, *flags); 590 }
591 spin_unlock(&timr->it_lock);
592 }
593 spin_unlock_irqrestore(&idr_lock, *flags);
623 594
624 return timr; 595 return NULL;
625} 596}
626 597
627/* 598/*
@@ -668,7 +639,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
668 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) 639 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
669 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); 640 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
670 641
671 remaining = ktime_sub(timer->expires, now); 642 remaining = ktime_sub(hrtimer_get_expires(timer), now);
672 /* Return 0 only, when the timer is expired and not pending */ 643 /* Return 0 only, when the timer is expired and not pending */
673 if (remaining.tv64 <= 0) { 644 if (remaining.tv64 <= 0) {
674 /* 645 /*
@@ -762,7 +733,7 @@ common_timer_set(struct k_itimer *timr, int flags,
762 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 733 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
763 timr->it.real.timer.function = posix_timer_fn; 734 timr->it.real.timer.function = posix_timer_fn;
764 735
765 timer->expires = timespec_to_ktime(new_setting->it_value); 736 hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
766 737
767 /* Convert interval */ 738 /* Convert interval */
768 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); 739 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
@@ -771,14 +742,12 @@ common_timer_set(struct k_itimer *timr, int flags,
771 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 742 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
772 /* Setup correct expiry time for relative timers */ 743 /* Setup correct expiry time for relative timers */
773 if (mode == HRTIMER_MODE_REL) { 744 if (mode == HRTIMER_MODE_REL) {
774 timer->expires = 745 hrtimer_add_expires(timer, timer->base->get_time());
775 ktime_add_safe(timer->expires,
776 timer->base->get_time());
777 } 746 }
778 return 0; 747 return 0;
779 } 748 }
780 749
781 hrtimer_start(timer, timer->expires, mode); 750 hrtimer_start_expires(timer, mode);
782 return 0; 751 return 0;
783} 752}
784 753
@@ -862,8 +831,7 @@ retry_delete:
862 * This keeps any tasks waiting on the spin lock from thinking 831 * This keeps any tasks waiting on the spin lock from thinking
863 * they got something (see the lock code above). 832 * they got something (see the lock code above).
864 */ 833 */
865 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 834 put_task_struct(timer->it_process);
866 put_task_struct(timer->it_process);
867 timer->it_process = NULL; 835 timer->it_process = NULL;
868 836
869 unlock_timer(timer, flags); 837 unlock_timer(timer, flags);
@@ -890,8 +858,7 @@ retry_delete:
890 * This keeps any tasks waiting on the spin lock from thinking 858 * This keeps any tasks waiting on the spin lock from thinking
891 * they got something (see the lock code above). 859 * they got something (see the lock code above).
892 */ 860 */
893 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 861 put_task_struct(timer->it_process);
894 put_task_struct(timer->it_process);
895 timer->it_process = NULL; 862 timer->it_process = NULL;
896 863
897 unlock_timer(timer, flags); 864 unlock_timer(timer, flags);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index dcd165f92a88..23bd4daeb96b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -96,7 +96,7 @@ config SUSPEND
96 96
97config PM_TEST_SUSPEND 97config PM_TEST_SUSPEND
98 bool "Test suspend/resume and wakealarm during bootup" 98 bool "Test suspend/resume and wakealarm during bootup"
99 depends on SUSPEND && PM_DEBUG && RTC_LIB=y 99 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
100 ---help--- 100 ---help---
101 This option will let you suspend your machine during bootup, and 101 This option will let you suspend your machine during bootup, and
102 make it wake up a few seconds later using an RTC wakeup alarm. 102 make it wake up a few seconds later using an RTC wakeup alarm.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index bbd85c60f741..c9d74083746f 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -14,6 +14,7 @@
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/device.h> 16#include <linux/device.h>
17#include <linux/kmod.h>
17#include <linux/delay.h> 18#include <linux/delay.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
19#include <linux/mount.h> 20#include <linux/mount.h>
@@ -520,6 +521,10 @@ int hibernate(void)
520 if (error) 521 if (error)
521 goto Exit; 522 goto Exit;
522 523
524 error = usermodehelper_disable();
525 if (error)
526 goto Exit;
527
523 /* Allocate memory management structures */ 528 /* Allocate memory management structures */
524 error = create_basic_memory_bitmaps(); 529 error = create_basic_memory_bitmaps();
525 if (error) 530 if (error)
@@ -558,6 +563,7 @@ int hibernate(void)
558 thaw_processes(); 563 thaw_processes();
559 Finish: 564 Finish:
560 free_basic_memory_bitmaps(); 565 free_basic_memory_bitmaps();
566 usermodehelper_enable();
561 Exit: 567 Exit:
562 pm_notifier_call_chain(PM_POST_HIBERNATION); 568 pm_notifier_call_chain(PM_POST_HIBERNATION);
563 pm_restore_console(); 569 pm_restore_console();
@@ -634,6 +640,10 @@ static int software_resume(void)
634 if (error) 640 if (error)
635 goto Finish; 641 goto Finish;
636 642
643 error = usermodehelper_disable();
644 if (error)
645 goto Finish;
646
637 error = create_basic_memory_bitmaps(); 647 error = create_basic_memory_bitmaps();
638 if (error) 648 if (error)
639 goto Finish; 649 goto Finish;
@@ -641,7 +651,7 @@ static int software_resume(void)
641 pr_debug("PM: Preparing processes for restore.\n"); 651 pr_debug("PM: Preparing processes for restore.\n");
642 error = prepare_processes(); 652 error = prepare_processes();
643 if (error) { 653 if (error) {
644 swsusp_close(); 654 swsusp_close(FMODE_READ);
645 goto Done; 655 goto Done;
646 } 656 }
647 657
@@ -656,6 +666,7 @@ static int software_resume(void)
656 thaw_processes(); 666 thaw_processes();
657 Done: 667 Done:
658 free_basic_memory_bitmaps(); 668 free_basic_memory_bitmaps();
669 usermodehelper_enable();
659 Finish: 670 Finish:
660 pm_notifier_call_chain(PM_POST_RESTORE); 671 pm_notifier_call_chain(PM_POST_RESTORE);
661 pm_restore_console(); 672 pm_restore_console();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 540b16b68565..b8f7ce9473e8 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -14,6 +14,7 @@
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/kmod.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
@@ -173,7 +174,7 @@ static void suspend_test_finish(const char *label)
173 * has some performance issues. The stack dump of a WARN_ON 174 * has some performance issues. The stack dump of a WARN_ON
174 * is more likely to get the right attention than a printk... 175 * is more likely to get the right attention than a printk...
175 */ 176 */
176 WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); 177 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
177} 178}
178 179
179#else 180#else
@@ -237,6 +238,10 @@ static int suspend_prepare(void)
237 if (error) 238 if (error)
238 goto Finish; 239 goto Finish;
239 240
241 error = usermodehelper_disable();
242 if (error)
243 goto Finish;
244
240 if (suspend_freeze_processes()) { 245 if (suspend_freeze_processes()) {
241 error = -EAGAIN; 246 error = -EAGAIN;
242 goto Thaw; 247 goto Thaw;
@@ -256,6 +261,7 @@ static int suspend_prepare(void)
256 261
257 Thaw: 262 Thaw:
258 suspend_thaw_processes(); 263 suspend_thaw_processes();
264 usermodehelper_enable();
259 Finish: 265 Finish:
260 pm_notifier_call_chain(PM_POST_SUSPEND); 266 pm_notifier_call_chain(PM_POST_SUSPEND);
261 pm_restore_console(); 267 pm_restore_console();
@@ -376,6 +382,7 @@ int suspend_devices_and_enter(suspend_state_t state)
376static void suspend_finish(void) 382static void suspend_finish(void)
377{ 383{
378 suspend_thaw_processes(); 384 suspend_thaw_processes();
385 usermodehelper_enable();
379 pm_notifier_call_chain(PM_POST_SUSPEND); 386 pm_notifier_call_chain(PM_POST_SUSPEND);
380 pm_restore_console(); 387 pm_restore_console();
381} 388}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index acc0c101dbd5..46b5ec7a3afb 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -153,7 +153,7 @@ extern int swsusp_shrink_memory(void);
153extern void swsusp_free(void); 153extern void swsusp_free(void);
154extern int swsusp_read(unsigned int *flags_p); 154extern int swsusp_read(unsigned int *flags_p);
155extern int swsusp_write(unsigned int flags); 155extern int swsusp_write(unsigned int flags);
156extern void swsusp_close(void); 156extern void swsusp_close(fmode_t);
157 157
158struct timeval; 158struct timeval;
159/* kernel/power/swsusp.c */ 159/* kernel/power/swsusp.c */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 278946aecaf0..ca634019497a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -28,121 +28,6 @@ static inline int freezeable(struct task_struct * p)
28 return 1; 28 return 1;
29} 29}
30 30
31/*
32 * freezing is complete, mark current process as frozen
33 */
34static inline void frozen_process(void)
35{
36 if (!unlikely(current->flags & PF_NOFREEZE)) {
37 current->flags |= PF_FROZEN;
38 wmb();
39 }
40 clear_freeze_flag(current);
41}
42
43/* Refrigerator is place where frozen processes are stored :-). */
44void refrigerator(void)
45{
46 /* Hmm, should we be allowed to suspend when there are realtime
47 processes around? */
48 long save;
49
50 task_lock(current);
51 if (freezing(current)) {
52 frozen_process();
53 task_unlock(current);
54 } else {
55 task_unlock(current);
56 return;
57 }
58 save = current->state;
59 pr_debug("%s entered refrigerator\n", current->comm);
60
61 spin_lock_irq(&current->sighand->siglock);
62 recalc_sigpending(); /* We sent fake signal, clean it up */
63 spin_unlock_irq(&current->sighand->siglock);
64
65 for (;;) {
66 set_current_state(TASK_UNINTERRUPTIBLE);
67 if (!frozen(current))
68 break;
69 schedule();
70 }
71 pr_debug("%s left refrigerator\n", current->comm);
72 __set_current_state(save);
73}
74
75static void fake_signal_wake_up(struct task_struct *p)
76{
77 unsigned long flags;
78
79 spin_lock_irqsave(&p->sighand->siglock, flags);
80 signal_wake_up(p, 0);
81 spin_unlock_irqrestore(&p->sighand->siglock, flags);
82}
83
84static inline bool should_send_signal(struct task_struct *p)
85{
86 return !(p->flags & PF_FREEZER_NOSIG);
87}
88
89/**
90 * freeze_task - send a freeze request to given task
91 * @p: task to send the request to
92 * @sig_only: if set, the request will only be sent if the task has the
93 * PF_FREEZER_NOSIG flag unset
94 * Return value: 'false', if @sig_only is set and the task has
95 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
96 *
97 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
98 * either sending a fake signal to it or waking it up, depending on whether
99 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
100 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
101 * TIF_FREEZE flag will not be set.
102 */
103static bool freeze_task(struct task_struct *p, bool sig_only)
104{
105 /*
106 * We first check if the task is freezing and next if it has already
107 * been frozen to avoid the race with frozen_process() which first marks
108 * the task as frozen and next clears its TIF_FREEZE.
109 */
110 if (!freezing(p)) {
111 rmb();
112 if (frozen(p))
113 return false;
114
115 if (!sig_only || should_send_signal(p))
116 set_freeze_flag(p);
117 else
118 return false;
119 }
120
121 if (should_send_signal(p)) {
122 if (!signal_pending(p))
123 fake_signal_wake_up(p);
124 } else if (sig_only) {
125 return false;
126 } else {
127 wake_up_state(p, TASK_INTERRUPTIBLE);
128 }
129
130 return true;
131}
132
133static void cancel_freezing(struct task_struct *p)
134{
135 unsigned long flags;
136
137 if (freezing(p)) {
138 pr_debug(" clean up: %s\n", p->comm);
139 clear_freeze_flag(p);
140 spin_lock_irqsave(&p->sighand->siglock, flags);
141 recalc_sigpending_and_wake(p);
142 spin_unlock_irqrestore(&p->sighand->siglock, flags);
143 }
144}
145
146static int try_to_freeze_tasks(bool sig_only) 31static int try_to_freeze_tasks(bool sig_only)
147{ 32{
148 struct task_struct *g, *p; 33 struct task_struct *g, *p;
@@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only)
250 if (nosig_only && should_send_signal(p)) 135 if (nosig_only && should_send_signal(p))
251 continue; 136 continue;
252 137
138 if (cgroup_frozen(p))
139 continue;
140
253 thaw_process(p); 141 thaw_process(p);
254 } while_each_thread(g, p); 142 } while_each_thread(g, p);
255 read_unlock(&tasklist_lock); 143 read_unlock(&tasklist_lock);
@@ -264,4 +152,3 @@ void thaw_processes(void)
264 printk("done.\n"); 152 printk("done.\n");
265} 153}
266 154
267EXPORT_SYMBOL(refrigerator);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 80ccac849e46..b7713b53d07a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -172,13 +172,13 @@ static int swsusp_swap_check(void) /* This is called before saving image */
172 return res; 172 return res;
173 173
174 root_swap = res; 174 root_swap = res;
175 res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR); 175 res = blkdev_get(resume_bdev, FMODE_WRITE);
176 if (res) 176 if (res)
177 return res; 177 return res;
178 178
179 res = set_blocksize(resume_bdev, PAGE_SIZE); 179 res = set_blocksize(resume_bdev, PAGE_SIZE);
180 if (res < 0) 180 if (res < 0)
181 blkdev_put(resume_bdev); 181 blkdev_put(resume_bdev, FMODE_WRITE);
182 182
183 return res; 183 return res;
184} 184}
@@ -426,7 +426,7 @@ int swsusp_write(unsigned int flags)
426 426
427 release_swap_writer(&handle); 427 release_swap_writer(&handle);
428 out: 428 out:
429 swsusp_close(); 429 swsusp_close(FMODE_WRITE);
430 return error; 430 return error;
431} 431}
432 432
@@ -574,7 +574,7 @@ int swsusp_read(unsigned int *flags_p)
574 error = load_image(&handle, &snapshot, header->pages - 1); 574 error = load_image(&handle, &snapshot, header->pages - 1);
575 release_swap_reader(&handle); 575 release_swap_reader(&handle);
576 576
577 blkdev_put(resume_bdev); 577 blkdev_put(resume_bdev, FMODE_READ);
578 578
579 if (!error) 579 if (!error)
580 pr_debug("PM: Image successfully loaded\n"); 580 pr_debug("PM: Image successfully loaded\n");
@@ -609,7 +609,7 @@ int swsusp_check(void)
609 return -EINVAL; 609 return -EINVAL;
610 } 610 }
611 if (error) 611 if (error)
612 blkdev_put(resume_bdev); 612 blkdev_put(resume_bdev, FMODE_READ);
613 else 613 else
614 pr_debug("PM: Signature found, resuming\n"); 614 pr_debug("PM: Signature found, resuming\n");
615 } else { 615 } else {
@@ -626,14 +626,14 @@ int swsusp_check(void)
626 * swsusp_close - close swap device. 626 * swsusp_close - close swap device.
627 */ 627 */
628 628
629void swsusp_close(void) 629void swsusp_close(fmode_t mode)
630{ 630{
631 if (IS_ERR(resume_bdev)) { 631 if (IS_ERR(resume_bdev)) {
632 pr_debug("PM: Image device not initialised\n"); 632 pr_debug("PM: Image device not initialised\n");
633 return; 633 return;
634 } 634 }
635 635
636 blkdev_put(resume_bdev); 636 blkdev_put(resume_bdev, mode); /* move up */
637} 637}
638 638
639static int swsusp_header_init(void) 639static int swsusp_header_init(void)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a6332a313262..005b93d839ba 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
212 case SNAPSHOT_FREEZE: 212 case SNAPSHOT_FREEZE:
213 if (data->frozen) 213 if (data->frozen)
214 break; 214 break;
215
215 printk("Syncing filesystems ... "); 216 printk("Syncing filesystems ... ");
216 sys_sync(); 217 sys_sync();
217 printk("done.\n"); 218 printk("done.\n");
218 219
219 error = freeze_processes(); 220 error = usermodehelper_disable();
220 if (error) 221 if (error)
222 break;
223
224 error = freeze_processes();
225 if (error) {
221 thaw_processes(); 226 thaw_processes();
227 usermodehelper_enable();
228 }
222 if (!error) 229 if (!error)
223 data->frozen = 1; 230 data->frozen = 1;
224 break; 231 break;
@@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
227 if (!data->frozen || data->ready) 234 if (!data->frozen || data->ready)
228 break; 235 break;
229 thaw_processes(); 236 thaw_processes();
237 usermodehelper_enable();
230 data->frozen = 0; 238 data->frozen = 0;
231 break; 239 break;
232 240
diff --git a/kernel/printk.c b/kernel/printk.c
index b51b1567bb55..f492f1583d77 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -13,7 +13,7 @@
13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul 13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
14 * manfred@colorfullife.com 14 * manfred@colorfullife.com
15 * Rewrote bits to get rid of console_lock 15 * Rewrote bits to get rid of console_lock
16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au> 16 * 01Mar01 Andrew Morton
17 */ 17 */
18 18
19#include <linux/kernel.h> 19#include <linux/kernel.h>
@@ -233,45 +233,6 @@ static inline void boot_delay_msec(void)
233#endif 233#endif
234 234
235/* 235/*
236 * Return the number of unread characters in the log buffer.
237 */
238static int log_buf_get_len(void)
239{
240 return logged_chars;
241}
242
243/*
244 * Copy a range of characters from the log buffer.
245 */
246int log_buf_copy(char *dest, int idx, int len)
247{
248 int ret, max;
249 bool took_lock = false;
250
251 if (!oops_in_progress) {
252 spin_lock_irq(&logbuf_lock);
253 took_lock = true;
254 }
255
256 max = log_buf_get_len();
257 if (idx < 0 || idx >= max) {
258 ret = -1;
259 } else {
260 if (len > max)
261 len = max;
262 ret = len;
263 idx += (log_end - max);
264 while (len-- > 0)
265 dest[len] = LOG_BUF(idx + len);
266 }
267
268 if (took_lock)
269 spin_unlock_irq(&logbuf_lock);
270
271 return ret;
272}
273
274/*
275 * Commands to do_syslog: 236 * Commands to do_syslog:
276 * 237 *
277 * 0 -- Close the log. Currently a NOP. 238 * 0 -- Close the log. Currently a NOP.
@@ -577,9 +538,6 @@ static int have_callable_console(void)
577 * @fmt: format string 538 * @fmt: format string
578 * 539 *
579 * This is printk(). It can be called from any context. We want it to work. 540 * This is printk(). It can be called from any context. We want it to work.
580 * Be aware of the fact that if oops_in_progress is not set, we might try to
581 * wake klogd up which could deadlock on runqueue lock if printk() is called
582 * from scheduler code.
583 * 541 *
584 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 542 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
585 * call the console drivers. If we fail to get the semaphore we place the output 543 * call the console drivers. If we fail to get the semaphore we place the output
@@ -593,6 +551,8 @@ static int have_callable_console(void)
593 * 551 *
594 * See also: 552 * See also:
595 * printf(3) 553 * printf(3)
554 *
555 * See the vsnprintf() documentation for format string extensions over C99.
596 */ 556 */
597 557
598asmlinkage int printk(const char *fmt, ...) 558asmlinkage int printk(const char *fmt, ...)
@@ -982,10 +942,25 @@ int is_console_locked(void)
982 return console_locked; 942 return console_locked;
983} 943}
984 944
985void wake_up_klogd(void) 945static DEFINE_PER_CPU(int, printk_pending);
946
947void printk_tick(void)
986{ 948{
987 if (!oops_in_progress && waitqueue_active(&log_wait)) 949 if (__get_cpu_var(printk_pending)) {
950 __get_cpu_var(printk_pending) = 0;
988 wake_up_interruptible(&log_wait); 951 wake_up_interruptible(&log_wait);
952 }
953}
954
955int printk_needs_cpu(int cpu)
956{
957 return per_cpu(printk_pending, cpu);
958}
959
960void wake_up_klogd(void)
961{
962 if (waitqueue_active(&log_wait))
963 __raw_get_cpu_var(printk_pending) = 1;
989} 964}
990 965
991/** 966/**
@@ -1291,22 +1266,6 @@ static int __init disable_boot_consoles(void)
1291} 1266}
1292late_initcall(disable_boot_consoles); 1267late_initcall(disable_boot_consoles);
1293 1268
1294/**
1295 * tty_write_message - write a message to a certain tty, not just the console.
1296 * @tty: the destination tty_struct
1297 * @msg: the message to write
1298 *
1299 * This is used for messages that need to be redirected to a specific tty.
1300 * We don't put it into the syslog queue right now maybe in the future if
1301 * really needed.
1302 */
1303void tty_write_message(struct tty_struct *tty, char *msg)
1304{
1305 if (tty && tty->ops->write)
1306 tty->ops->write(tty, msg, strlen(msg));
1307 return;
1308}
1309
1310#if defined CONFIG_PRINTK 1269#if defined CONFIG_PRINTK
1311 1270
1312/* 1271/*
diff --git a/kernel/profile.c b/kernel/profile.c
index cd26bed4cc26..5b7d1ac7124c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -22,6 +22,8 @@
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/slab.h>
26#include <linux/vmalloc.h>
25#include <asm/sections.h> 27#include <asm/sections.h>
26#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
27#include <asm/ptrace.h> 29#include <asm/ptrace.h>
@@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
50static DEFINE_MUTEX(profile_flip_mutex); 52static DEFINE_MUTEX(profile_flip_mutex);
51#endif /* CONFIG_SMP */ 53#endif /* CONFIG_SMP */
52 54
53static int __init profile_setup(char *str) 55int profile_setup(char *str)
54{ 56{
55 static char __initdata schedstr[] = "schedule"; 57 static char schedstr[] = "schedule";
56 static char __initdata sleepstr[] = "sleep"; 58 static char sleepstr[] = "sleep";
57 static char __initdata kvmstr[] = "kvm"; 59 static char kvmstr[] = "kvm";
58 int par; 60 int par;
59 61
60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 62 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -100,14 +102,33 @@ static int __init profile_setup(char *str)
100__setup("profile=", profile_setup); 102__setup("profile=", profile_setup);
101 103
102 104
103void __init profile_init(void) 105int __ref profile_init(void)
104{ 106{
107 int buffer_bytes;
105 if (!prof_on) 108 if (!prof_on)
106 return; 109 return 0;
107 110
108 /* only text is profiled */ 111 /* only text is profiled */
109 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
110 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 return 0;
117 }
118
119 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
120 if (prof_buffer)
121 return 0;
122
123 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO);
124 if (prof_buffer)
125 return 0;
126
127 prof_buffer = vmalloc(buffer_bytes);
128 if (prof_buffer)
129 return 0;
130
131 return -ENOMEM;
111} 132}
112 133
113/* Profile event notifications */ 134/* Profile event notifications */
@@ -523,11 +544,11 @@ static const struct file_operations proc_profile_operations = {
523}; 544};
524 545
525#ifdef CONFIG_SMP 546#ifdef CONFIG_SMP
526static void __init profile_nop(void *unused) 547static inline void profile_nop(void *unused)
527{ 548{
528} 549}
529 550
530static int __init create_hash_tables(void) 551static int create_hash_tables(void)
531{ 552{
532 int cpu; 553 int cpu;
533 554
@@ -575,14 +596,14 @@ out_cleanup:
575#define create_hash_tables() ({ 0; }) 596#define create_hash_tables() ({ 0; })
576#endif 597#endif
577 598
578static int __init create_proc_profile(void) 599int create_proc_profile(void)
579{ 600{
580 struct proc_dir_entry *entry; 601 struct proc_dir_entry *entry;
581 602
582 if (!prof_on) 603 if (!prof_on)
583 return 0; 604 return 0;
584 if (create_hash_tables()) 605 if (create_hash_tables())
585 return -1; 606 return -ENOMEM;
586 entry = proc_create("profile", S_IWUSR | S_IRUGO, 607 entry = proc_create("profile", S_IWUSR | S_IRUGO,
587 NULL, &proc_profile_operations); 608 NULL, &proc_profile_operations);
588 if (!entry) 609 if (!entry)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 356699a96d56..1e68e4c39e2c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -45,7 +45,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
45 * TASK_TRACED, resume it now. 45 * TASK_TRACED, resume it now.
46 * Requires that irqs be disabled. 46 * Requires that irqs be disabled.
47 */ 47 */
48void ptrace_untrace(struct task_struct *child) 48static void ptrace_untrace(struct task_struct *child)
49{ 49{
50 spin_lock(&child->sighand->siglock); 50 spin_lock(&child->sighand->siglock);
51 if (task_is_traced(child)) { 51 if (task_is_traced(child)) {
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index aad93cdc9f68..37f72e551542 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/cpu.h> 48#include <linux/cpu.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/time.h>
50 51
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
@@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
60static struct rcu_ctrlblk rcu_ctrlblk = { 61static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300, 62 .cur = -300,
62 .completed = -300, 63 .completed = -300,
64 .pending = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_MASK_NONE,
65}; 67};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300, 69 .cur = -300,
68 .completed = -300, 70 .completed = -300,
71 .pending = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_MASK_NONE,
71}; 74};
@@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
83{ 86{
84 int cpu; 87 int cpu;
85 cpumask_t cpumask; 88 cpumask_t cpumask;
89 unsigned long flags;
90
86 set_need_resched(); 91 set_need_resched();
92 spin_lock_irqsave(&rcp->lock, flags);
87 if (unlikely(!rcp->signaled)) { 93 if (unlikely(!rcp->signaled)) {
88 rcp->signaled = 1; 94 rcp->signaled = 1;
89 /* 95 /*
@@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
109 for_each_cpu_mask_nr(cpu, cpumask) 115 for_each_cpu_mask_nr(cpu, cpumask)
110 smp_send_reschedule(cpu); 116 smp_send_reschedule(cpu);
111 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags);
112} 119}
113#else 120#else
114static inline void force_quiescent_state(struct rcu_data *rdp, 121static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
118} 125}
119#endif 126#endif
120 127
128static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
129 struct rcu_data *rdp)
130{
131 long batch;
132
133 head->next = NULL;
134 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
135
136 /*
137 * Determine the batch number of this callback.
138 *
139 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
140 * local variable "batch" and emits code like this:
141 * 1) rdp->batch = rcp->cur + 1 # gets old value
142 * ......
143 * 2) rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
144 * then [*nxttail[0], *nxttail[1]) may contain callbacks
145 * that batch# = rdp->batch, see the comment of struct rcu_data.
146 */
147 batch = ACCESS_ONCE(rcp->cur) + 1;
148
149 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
150 /* process callbacks */
151 rdp->nxttail[0] = rdp->nxttail[1];
152 rdp->nxttail[1] = rdp->nxttail[2];
153 if (rcu_batch_after(batch - 1, rdp->batch))
154 rdp->nxttail[0] = rdp->nxttail[2];
155 }
156
157 rdp->batch = batch;
158 *rdp->nxttail[2] = head;
159 rdp->nxttail[2] = &head->next;
160
161 if (unlikely(++rdp->qlen > qhimark)) {
162 rdp->blimit = INT_MAX;
163 force_quiescent_state(rdp, &rcu_ctrlblk);
164 }
165}
166
167#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
168
169static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
170{
171 rcp->gp_start = jiffies;
172 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
173}
174
175static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
176{
177 int cpu;
178 long delta;
179 unsigned long flags;
180
181 /* Only let one CPU complain about others per time interval. */
182
183 spin_lock_irqsave(&rcp->lock, flags);
184 delta = jiffies - rcp->jiffies_stall;
185 if (delta < 2 || rcp->cur != rcp->completed) {
186 spin_unlock_irqrestore(&rcp->lock, flags);
187 return;
188 }
189 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
190 spin_unlock_irqrestore(&rcp->lock, flags);
191
192 /* OK, time to rat on our buddy... */
193
194 printk(KERN_ERR "RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu);
198 }
199 printk(" (detected by %d, t=%ld jiffies)\n",
200 smp_processor_id(), (long)(jiffies - rcp->gp_start));
201}
202
203static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{
205 unsigned long flags;
206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start);
210 dump_stack();
211 spin_lock_irqsave(&rcp->lock, flags);
212 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
213 rcp->jiffies_stall =
214 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
215 spin_unlock_irqrestore(&rcp->lock, flags);
216 set_need_resched(); /* kick ourselves to get things going. */
217}
218
219static void check_cpu_stall(struct rcu_ctrlblk *rcp)
220{
221 long delta;
222
223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
225
226 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp);
228
229 } else if (rcp->cur != rcp->completed && delta >= 2) {
230
231 /* They had two jiffies to dump stack, so complain. */
232 print_other_cpu_stall(rcp);
233 }
234}
235
236#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237
238static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
239{
240}
241
242static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
243{
244}
245
246#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
247
121/** 248/**
122 * call_rcu - Queue an RCU callback for invocation after a grace period. 249 * call_rcu - Queue an RCU callback for invocation after a grace period.
123 * @head: structure to be used for queueing the RCU updates. 250 * @head: structure to be used for queueing the RCU updates.
@@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head,
133 void (*func)(struct rcu_head *rcu)) 260 void (*func)(struct rcu_head *rcu))
134{ 261{
135 unsigned long flags; 262 unsigned long flags;
136 struct rcu_data *rdp;
137 263
138 head->func = func; 264 head->func = func;
139 head->next = NULL;
140 local_irq_save(flags); 265 local_irq_save(flags);
141 rdp = &__get_cpu_var(rcu_data); 266 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
142 *rdp->nxttail = head;
143 rdp->nxttail = &head->next;
144 if (unlikely(++rdp->qlen > qhimark)) {
145 rdp->blimit = INT_MAX;
146 force_quiescent_state(rdp, &rcu_ctrlblk);
147 }
148 local_irq_restore(flags); 267 local_irq_restore(flags);
149} 268}
150EXPORT_SYMBOL_GPL(call_rcu); 269EXPORT_SYMBOL_GPL(call_rcu);
@@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head,
169 void (*func)(struct rcu_head *rcu)) 288 void (*func)(struct rcu_head *rcu))
170{ 289{
171 unsigned long flags; 290 unsigned long flags;
172 struct rcu_data *rdp;
173 291
174 head->func = func; 292 head->func = func;
175 head->next = NULL;
176 local_irq_save(flags); 293 local_irq_save(flags);
177 rdp = &__get_cpu_var(rcu_bh_data); 294 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
178 *rdp->nxttail = head;
179 rdp->nxttail = &head->next;
180
181 if (unlikely(++rdp->qlen > qhimark)) {
182 rdp->blimit = INT_MAX;
183 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
184 }
185
186 local_irq_restore(flags); 295 local_irq_restore(flags);
187} 296}
188EXPORT_SYMBOL_GPL(call_rcu_bh); 297EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
211static inline void raise_rcu_softirq(void) 320static inline void raise_rcu_softirq(void)
212{ 321{
213 raise_softirq(RCU_SOFTIRQ); 322 raise_softirq(RCU_SOFTIRQ);
214 /*
215 * The smp_mb() here is required to ensure that this cpu's
216 * __rcu_process_callbacks() reads the most recently updated
217 * value of rcu->cur.
218 */
219 smp_mb();
220} 323}
221 324
222/* 325/*
@@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void)
225 */ 328 */
226static void rcu_do_batch(struct rcu_data *rdp) 329static void rcu_do_batch(struct rcu_data *rdp)
227{ 330{
331 unsigned long flags;
228 struct rcu_head *next, *list; 332 struct rcu_head *next, *list;
229 int count = 0; 333 int count = 0;
230 334
@@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
239 } 343 }
240 rdp->donelist = list; 344 rdp->donelist = list;
241 345
242 local_irq_disable(); 346 local_irq_save(flags);
243 rdp->qlen -= count; 347 rdp->qlen -= count;
244 local_irq_enable(); 348 local_irq_restore(flags);
245 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) 349 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
246 rdp->blimit = blimit; 350 rdp->blimit = blimit;
247 351
@@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
269 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace 373 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
270 * period (if necessary). 374 * period (if necessary).
271 */ 375 */
376
272/* 377/*
273 * Register a new batch of callbacks, and start it up if there is currently no 378 * Register a new batch of callbacks, and start it up if there is currently no
274 * active batch and the batch to be registered has not already occurred. 379 * active batch and the batch to be registered has not already occurred.
@@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp)
276 */ 381 */
277static void rcu_start_batch(struct rcu_ctrlblk *rcp) 382static void rcu_start_batch(struct rcu_ctrlblk *rcp)
278{ 383{
279 if (rcp->next_pending && 384 if (rcp->cur != rcp->pending &&
280 rcp->completed == rcp->cur) { 385 rcp->completed == rcp->cur) {
281 rcp->next_pending = 0;
282 /*
283 * next_pending == 0 must be visible in
284 * __rcu_process_callbacks() before it can see new value of cur.
285 */
286 smp_wmb();
287 rcp->cur++; 386 rcp->cur++;
387 record_gp_stall_check_time(rcp);
288 388
289 /* 389 /*
290 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a 390 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
322static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, 422static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
323 struct rcu_data *rdp) 423 struct rcu_data *rdp)
324{ 424{
425 unsigned long flags;
426
325 if (rdp->quiescbatch != rcp->cur) { 427 if (rdp->quiescbatch != rcp->cur) {
326 /* start new grace period: */ 428 /* start new grace period: */
327 rdp->qs_pending = 1; 429 rdp->qs_pending = 1;
@@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
345 return; 447 return;
346 rdp->qs_pending = 0; 448 rdp->qs_pending = 0;
347 449
348 spin_lock(&rcp->lock); 450 spin_lock_irqsave(&rcp->lock, flags);
349 /* 451 /*
350 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync 452 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
351 * during cpu startup. Ignore the quiescent state. 453 * during cpu startup. Ignore the quiescent state.
@@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
353 if (likely(rdp->quiescbatch == rcp->cur)) 455 if (likely(rdp->quiescbatch == rcp->cur))
354 cpu_quiet(rdp->cpu, rcp); 456 cpu_quiet(rdp->cpu, rcp);
355 457
356 spin_unlock(&rcp->lock); 458 spin_unlock_irqrestore(&rcp->lock, flags);
357} 459}
358 460
359 461
@@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
364 * which is dead and hence not processing interrupts. 466 * which is dead and hence not processing interrupts.
365 */ 467 */
366static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, 468static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
367 struct rcu_head **tail) 469 struct rcu_head **tail, long batch)
368{ 470{
369 local_irq_disable(); 471 unsigned long flags;
370 *this_rdp->nxttail = list; 472
371 if (list) 473 if (list) {
372 this_rdp->nxttail = tail; 474 local_irq_save(flags);
373 local_irq_enable(); 475 this_rdp->batch = batch;
476 *this_rdp->nxttail[2] = list;
477 this_rdp->nxttail[2] = tail;
478 local_irq_restore(flags);
479 }
374} 480}
375 481
376static void __rcu_offline_cpu(struct rcu_data *this_rdp, 482static void __rcu_offline_cpu(struct rcu_data *this_rdp,
377 struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 483 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
378{ 484{
379 /* if the cpu going offline owns the grace period 485 unsigned long flags;
486
487 /*
488 * if the cpu going offline owns the grace period
380 * we can block indefinitely waiting for it, so flush 489 * we can block indefinitely waiting for it, so flush
381 * it here 490 * it here
382 */ 491 */
383 spin_lock_bh(&rcp->lock); 492 spin_lock_irqsave(&rcp->lock, flags);
384 if (rcp->cur != rcp->completed) 493 if (rcp->cur != rcp->completed)
385 cpu_quiet(rdp->cpu, rcp); 494 cpu_quiet(rdp->cpu, rcp);
386 spin_unlock_bh(&rcp->lock); 495 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
387 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); 496 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
388 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); 497 spin_unlock(&rcp->lock);
389 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
390 498
391 local_irq_disable();
392 this_rdp->qlen += rdp->qlen; 499 this_rdp->qlen += rdp->qlen;
393 local_irq_enable(); 500 local_irq_restore(flags);
394} 501}
395 502
396static void rcu_offline_cpu(int cpu) 503static void rcu_offline_cpu(int cpu)
@@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu)
420static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, 527static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
421 struct rcu_data *rdp) 528 struct rcu_data *rdp)
422{ 529{
423 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { 530 unsigned long flags;
424 *rdp->donetail = rdp->curlist; 531 long completed_snap;
425 rdp->donetail = rdp->curtail;
426 rdp->curlist = NULL;
427 rdp->curtail = &rdp->curlist;
428 }
429 532
430 if (rdp->nxtlist && !rdp->curlist) { 533 if (rdp->nxtlist) {
431 local_irq_disable(); 534 local_irq_save(flags);
432 rdp->curlist = rdp->nxtlist; 535 completed_snap = ACCESS_ONCE(rcp->completed);
433 rdp->curtail = rdp->nxttail;
434 rdp->nxtlist = NULL;
435 rdp->nxttail = &rdp->nxtlist;
436 local_irq_enable();
437 536
438 /* 537 /*
439 * start the next batch of callbacks 538 * move the other grace-period-completed entries to
539 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
440 */ 540 */
541 if (!rcu_batch_before(completed_snap, rdp->batch))
542 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
543 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
544 rdp->nxttail[0] = rdp->nxttail[1];
441 545
442 /* determine batch number */ 546 /*
443 rdp->batch = rcp->cur + 1; 547 * the grace period for entries in
444 /* see the comment and corresponding wmb() in 548 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
445 * the rcu_start_batch() 549 * move these entries to donelist
446 */ 550 */
447 smp_rmb(); 551 if (rdp->nxttail[0] != &rdp->nxtlist) {
552 *rdp->donetail = rdp->nxtlist;
553 rdp->donetail = rdp->nxttail[0];
554 rdp->nxtlist = *rdp->nxttail[0];
555 *rdp->donetail = NULL;
556
557 if (rdp->nxttail[1] == rdp->nxttail[0])
558 rdp->nxttail[1] = &rdp->nxtlist;
559 if (rdp->nxttail[2] == rdp->nxttail[0])
560 rdp->nxttail[2] = &rdp->nxtlist;
561 rdp->nxttail[0] = &rdp->nxtlist;
562 }
563
564 local_irq_restore(flags);
565
566 if (rcu_batch_after(rdp->batch, rcp->pending)) {
567 unsigned long flags2;
448 568
449 if (!rcp->next_pending) {
450 /* and start it/schedule start if it's a new batch */ 569 /* and start it/schedule start if it's a new batch */
451 spin_lock(&rcp->lock); 570 spin_lock_irqsave(&rcp->lock, flags2);
452 rcp->next_pending = 1; 571 if (rcu_batch_after(rdp->batch, rcp->pending)) {
453 rcu_start_batch(rcp); 572 rcp->pending = rdp->batch;
454 spin_unlock(&rcp->lock); 573 rcu_start_batch(rcp);
574 }
575 spin_unlock_irqrestore(&rcp->lock, flags2);
455 } 576 }
456 } 577 }
457 578
@@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
462 583
463static void rcu_process_callbacks(struct softirq_action *unused) 584static void rcu_process_callbacks(struct softirq_action *unused)
464{ 585{
586 /*
587 * Memory references from any prior RCU read-side critical sections
588 * executed by the interrupted code must be seen before any RCU
589 * grace-period manipulations below.
590 */
591
592 smp_mb(); /* See above block comment. */
593
465 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); 594 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
466 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); 595 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
596
597 /*
598 * Memory references from any later RCU read-side critical sections
599 * executed by the interrupted code must be seen after any RCU
600 * grace-period manipulations above.
601 */
602
603 smp_mb(); /* See above block comment. */
467} 604}
468 605
469static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 606static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
470{ 607{
471 /* This cpu has pending rcu entries and the grace period 608 /* Check for CPU stalls, if enabled. */
472 * for them has completed. 609 check_cpu_stall(rcp);
473 */
474 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
475 return 1;
476 610
477 /* This cpu has no pending entries, but there are new entries */ 611 if (rdp->nxtlist) {
478 if (!rdp->curlist && rdp->nxtlist) 612 long completed_snap = ACCESS_ONCE(rcp->completed);
479 return 1; 613
614 /*
615 * This cpu has pending rcu entries and the grace period
616 * for them has completed.
617 */
618 if (!rcu_batch_before(completed_snap, rdp->batch))
619 return 1;
620 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
621 rdp->nxttail[0] != rdp->nxttail[1])
622 return 1;
623 if (rdp->nxttail[0] != &rdp->nxtlist)
624 return 1;
625
626 /*
627 * This cpu has pending rcu entries and the new batch
628 * for them has been neither started nor scheduled to start
629 */
630 if (rcu_batch_after(rdp->batch, rcp->pending))
631 return 1;
632 }
480 633
481 /* This cpu has finished callbacks to invoke */ 634 /* This cpu has finished callbacks to invoke */
482 if (rdp->donelist) 635 if (rdp->donelist)
@@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu)
512 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 665 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
513 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); 666 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
514 667
515 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); 668 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
516} 669}
517 670
671/*
672 * Top-level function driving RCU grace-period detection, normally
673 * invoked from the scheduler-clock interrupt. This function simply
674 * increments counters that are read only from softirq by this same
675 * CPU, so there are no memory barriers required.
676 */
518void rcu_check_callbacks(int cpu, int user) 677void rcu_check_callbacks(int cpu, int user)
519{ 678{
520 if (user || 679 if (user ||
@@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user)
558static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, 717static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
559 struct rcu_data *rdp) 718 struct rcu_data *rdp)
560{ 719{
720 unsigned long flags;
721
722 spin_lock_irqsave(&rcp->lock, flags);
561 memset(rdp, 0, sizeof(*rdp)); 723 memset(rdp, 0, sizeof(*rdp));
562 rdp->curtail = &rdp->curlist; 724 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
563 rdp->nxttail = &rdp->nxtlist;
564 rdp->donetail = &rdp->donelist; 725 rdp->donetail = &rdp->donelist;
565 rdp->quiescbatch = rcp->completed; 726 rdp->quiescbatch = rcp->completed;
566 rdp->qs_pending = 0; 727 rdp->qs_pending = 0;
567 rdp->cpu = cpu; 728 rdp->cpu = cpu;
568 rdp->blimit = blimit; 729 rdp->blimit = blimit;
730 spin_unlock_irqrestore(&rcp->lock, flags);
569} 731}
570 732
571static void __cpuinit rcu_online_cpu(int cpu) 733static void __cpuinit rcu_online_cpu(int cpu)
@@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
610 */ 772 */
611void __init __rcu_init(void) 773void __init __rcu_init(void)
612{ 774{
775#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
776 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
777#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
613 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 778 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
614 (void *)(long)smp_processor_id()); 779 (void *)(long)smp_processor_id());
615 /* Register notifier for non-boot CPUs */ 780 /* Register notifier for non-boot CPUs */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 467d5940f624..ad63af8b2521 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -119,18 +119,19 @@ static void _rcu_barrier(enum rcu_barrier type)
119 /* Take cpucontrol mutex to protect against CPU hotplug */ 119 /* Take cpucontrol mutex to protect against CPU hotplug */
120 mutex_lock(&rcu_barrier_mutex); 120 mutex_lock(&rcu_barrier_mutex);
121 init_completion(&rcu_barrier_completion); 121 init_completion(&rcu_barrier_completion);
122 atomic_set(&rcu_barrier_cpu_count, 0);
123 /* 122 /*
124 * The queueing of callbacks in all CPUs must be atomic with 123 * Initialize rcu_barrier_cpu_count to 1, then invoke
125 * respect to RCU, otherwise one CPU may queue a callback, 124 * rcu_barrier_func() on each CPU, so that each CPU also has
126 * wait for a grace period, decrement barrier count and call 125 * incremented rcu_barrier_cpu_count. Only then is it safe to
127 * complete(), while other CPUs have not yet queued anything. 126 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
128 * So, we need to make sure that grace periods cannot complete 127 * might complete its grace period before all of the other CPUs
129 * until all the callbacks are queued. 128 * did their increment, causing this function to return too
129 * early.
130 */ 130 */
131 rcu_read_lock(); 131 atomic_set(&rcu_barrier_cpu_count, 1);
132 on_each_cpu(rcu_barrier_func, (void *)type, 1); 132 on_each_cpu(rcu_barrier_func, (void *)type, 1);
133 rcu_read_unlock(); 133 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
134 complete(&rcu_barrier_completion);
134 wait_for_completion(&rcu_barrier_completion); 135 wait_for_completion(&rcu_barrier_completion);
135 mutex_unlock(&rcu_barrier_mutex); 136 mutex_unlock(&rcu_barrier_mutex);
136} 137}
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 27827931ca0d..59236e8b9daa 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -54,17 +54,9 @@
54#include <linux/cpu.h> 54#include <linux/cpu.h>
55#include <linux/random.h> 55#include <linux/random.h>
56#include <linux/delay.h> 56#include <linux/delay.h>
57#include <linux/byteorder/swabb.h>
58#include <linux/cpumask.h> 57#include <linux/cpumask.h>
59#include <linux/rcupreempt_trace.h> 58#include <linux/rcupreempt_trace.h>
60 59#include <asm/byteorder.h>
61/*
62 * Macro that prevents the compiler from reordering accesses, but does
63 * absolutely -nothing- to prevent CPUs from reordering. This is used
64 * only to mediate communication between mainline code and hardware
65 * interrupt and NMI handlers.
66 */
67#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
68 60
69/* 61/*
70 * PREEMPT_RCU data structures. 62 * PREEMPT_RCU data structures.
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 5edf82c34bbc..35c2d3360ecf 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -308,11 +308,16 @@ out:
308 308
309static int __init rcupreempt_trace_init(void) 309static int __init rcupreempt_trace_init(void)
310{ 310{
311 int ret;
312
311 mutex_init(&rcupreempt_trace_mutex); 313 mutex_init(&rcupreempt_trace_mutex);
312 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); 314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
313 if (!rcupreempt_trace_buf) 315 if (!rcupreempt_trace_buf)
314 return 1; 316 return 1;
315 return rcupreempt_debugfs_init(); 317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
316} 321}
317 322
318static void __exit rcupreempt_trace_cleanup(void) 323static void __exit rcupreempt_trace_cleanup(void)
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 90b5b123f7a1..85cb90588a55 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -42,10 +42,10 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/cpu.h> 43#include <linux/cpu.h>
44#include <linux/delay.h> 44#include <linux/delay.h>
45#include <linux/byteorder/swabb.h>
46#include <linux/stat.h> 45#include <linux/stat.h>
47#include <linux/srcu.h> 46#include <linux/srcu.h>
48#include <linux/slab.h> 47#include <linux/slab.h>
48#include <asm/byteorder.h>
49 49
50MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/relay.c b/kernel/relay.c
index 8d13a7855c08..32b0befdcb6a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -400,7 +400,7 @@ void relay_reset(struct rchan *chan)
400 } 400 }
401 401
402 mutex_lock(&relay_channels_mutex); 402 mutex_lock(&relay_channels_mutex);
403 for_each_online_cpu(i) 403 for_each_possible_cpu(i)
404 if (chan->buf[i]) 404 if (chan->buf[i])
405 __relay_reset(chan->buf[i], 0); 405 __relay_reset(chan->buf[i], 0);
406 mutex_unlock(&relay_channels_mutex); 406 mutex_unlock(&relay_channels_mutex);
@@ -611,10 +611,9 @@ struct rchan *relay_open(const char *base_filename,
611 return chan; 611 return chan;
612 612
613free_bufs: 613free_bufs:
614 for_each_online_cpu(i) { 614 for_each_possible_cpu(i) {
615 if (!chan->buf[i]) 615 if (chan->buf[i])
616 break; 616 relay_close_buf(chan->buf[i]);
617 relay_close_buf(chan->buf[i]);
618 } 617 }
619 618
620 kref_put(&chan->kref, relay_destroy_channel); 619 kref_put(&chan->kref, relay_destroy_channel);
diff --git a/kernel/resource.c b/kernel/resource.c
index 03d796c1b2e9..4337063663ef 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -17,6 +17,7 @@
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <linux/device.h> 19#include <linux/device.h>
20#include <linux/pfn.h>
20#include <asm/io.h> 21#include <asm/io.h>
21 22
22 23
@@ -38,10 +39,6 @@ EXPORT_SYMBOL(iomem_resource);
38 39
39static DEFINE_RWLOCK(resource_lock); 40static DEFINE_RWLOCK(resource_lock);
40 41
41#ifdef CONFIG_PROC_FS
42
43enum { MAX_IORES_LEVEL = 5 };
44
45static void *r_next(struct seq_file *m, void *v, loff_t *pos) 42static void *r_next(struct seq_file *m, void *v, loff_t *pos)
46{ 43{
47 struct resource *p = v; 44 struct resource *p = v;
@@ -53,6 +50,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
53 return p->sibling; 50 return p->sibling;
54} 51}
55 52
53#ifdef CONFIG_PROC_FS
54
55enum { MAX_IORES_LEVEL = 5 };
56
56static void *r_start(struct seq_file *m, loff_t *pos) 57static void *r_start(struct seq_file *m, loff_t *pos)
57 __acquires(resource_lock) 58 __acquires(resource_lock)
58{ 59{
@@ -516,6 +517,70 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
516 return result; 517 return result;
517} 518}
518 519
520static void __init __reserve_region_with_split(struct resource *root,
521 resource_size_t start, resource_size_t end,
522 const char *name)
523{
524 struct resource *parent = root;
525 struct resource *conflict;
526 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
527
528 if (!res)
529 return;
530
531 res->name = name;
532 res->start = start;
533 res->end = end;
534 res->flags = IORESOURCE_BUSY;
535
536 for (;;) {
537 conflict = __request_resource(parent, res);
538 if (!conflict)
539 break;
540 if (conflict != parent) {
541 parent = conflict;
542 if (!(conflict->flags & IORESOURCE_BUSY))
543 continue;
544 }
545
546 /* Uhhuh, that didn't work out.. */
547 kfree(res);
548 res = NULL;
549 break;
550 }
551
552 if (!res) {
553 /* failed, split and try again */
554
555 /* conflict covered whole area */
556 if (conflict->start <= start && conflict->end >= end)
557 return;
558
559 if (conflict->start > start)
560 __reserve_region_with_split(root, start, conflict->start-1, name);
561 if (!(conflict->flags & IORESOURCE_BUSY)) {
562 resource_size_t common_start, common_end;
563
564 common_start = max(conflict->start, start);
565 common_end = min(conflict->end, end);
566 if (common_start < common_end)
567 __reserve_region_with_split(root, common_start, common_end, name);
568 }
569 if (conflict->end < end)
570 __reserve_region_with_split(root, conflict->end+1, end, name);
571 }
572
573}
574
575void __init reserve_region_with_split(struct resource *root,
576 resource_size_t start, resource_size_t end,
577 const char *name)
578{
579 write_lock(&resource_lock);
580 __reserve_region_with_split(root, start, end, name);
581 write_unlock(&resource_lock);
582}
583
519EXPORT_SYMBOL(adjust_resource); 584EXPORT_SYMBOL(adjust_resource);
520 585
521/** 586/**
@@ -562,33 +627,34 @@ struct resource * __request_region(struct resource *parent,
562{ 627{
563 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
564 629
565 if (res) { 630 if (!res)
566 res->name = name; 631 return NULL;
567 res->start = start;
568 res->end = start + n - 1;
569 res->flags = IORESOURCE_BUSY;
570 632
571 write_lock(&resource_lock); 633 res->name = name;
634 res->start = start;
635 res->end = start + n - 1;
636 res->flags = IORESOURCE_BUSY;
572 637
573 for (;;) { 638 write_lock(&resource_lock);
574 struct resource *conflict;
575 639
576 conflict = __request_resource(parent, res); 640 for (;;) {
577 if (!conflict) 641 struct resource *conflict;
578 break;
579 if (conflict != parent) {
580 parent = conflict;
581 if (!(conflict->flags & IORESOURCE_BUSY))
582 continue;
583 }
584 642
585 /* Uhhuh, that didn't work out.. */ 643 conflict = __request_resource(parent, res);
586 kfree(res); 644 if (!conflict)
587 res = NULL;
588 break; 645 break;
646 if (conflict != parent) {
647 parent = conflict;
648 if (!(conflict->flags & IORESOURCE_BUSY))
649 continue;
589 } 650 }
590 write_unlock(&resource_lock); 651
652 /* Uhhuh, that didn't work out.. */
653 kfree(res);
654 res = NULL;
655 break;
591 } 656 }
657 write_unlock(&resource_lock);
592 return res; 658 return res;
593} 659}
594EXPORT_SYMBOL(__request_region); 660EXPORT_SYMBOL(__request_region);
@@ -763,3 +829,41 @@ static int __init reserve_setup(char *str)
763} 829}
764 830
765__setup("reserve=", reserve_setup); 831__setup("reserve=", reserve_setup);
832
833/*
834 * Check if the requested addr and size spans more than any slot in the
835 * iomem resource tree.
836 */
837int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
838{
839 struct resource *p = &iomem_resource;
840 int err = 0;
841 loff_t l;
842
843 read_lock(&resource_lock);
844 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
845 /*
846 * We can probably skip the resources without
847 * IORESOURCE_IO attribute?
848 */
849 if (p->start >= addr + size)
850 continue;
851 if (p->end < addr)
852 continue;
853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
855 continue;
856 printk(KERN_WARNING "resource map sanity check conflict: "
857 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
858 (unsigned long long)addr,
859 (unsigned long long)(addr + size - 1),
860 (unsigned long long)p->start,
861 (unsigned long long)p->end,
862 p->name);
863 err = -1;
864 break;
865 }
866 read_unlock(&resource_lock);
867
868 return err;
869}
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 6522ae5b14a2..69d9cb921ffa 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -631,8 +631,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
631 631
632 /* Setup the timer, when timeout != NULL */ 632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) { 633 if (unlikely(timeout)) {
634 hrtimer_start(&timeout->timer, timeout->timer.expires, 634 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
635 HRTIMER_MODE_ABS);
636 if (!hrtimer_active(&timeout->timer)) 635 if (!hrtimer_active(&timeout->timer))
637 timeout->task = NULL; 636 timeout->task = NULL;
638 } 637 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 13dd2db9fb2d..9b1e79371c20 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
55#include <linux/cpuset.h> 55#include <linux/cpuset.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 59#include <linux/seq_file.h>
59#include <linux/sysctl.h> 60#include <linux/sysctl.h>
60#include <linux/syscalls.h> 61#include <linux/syscalls.h>
@@ -71,6 +72,7 @@
71#include <linux/debugfs.h> 72#include <linux/debugfs.h>
72#include <linux/ctype.h> 73#include <linux/ctype.h>
73#include <linux/ftrace.h> 74#include <linux/ftrace.h>
75#include <trace/sched.h>
74 76
75#include <asm/tlb.h> 77#include <asm/tlb.h>
76#include <asm/irq_regs.h> 78#include <asm/irq_regs.h>
@@ -201,14 +203,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 203 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 204 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 205 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 206 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
207}
208
209static inline int rt_bandwidth_enabled(void)
210{
211 return sysctl_sched_rt_runtime >= 0;
205} 212}
206 213
207static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 214static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
208{ 215{
209 ktime_t now; 216 ktime_t now;
210 217
211 if (rt_b->rt_runtime == RUNTIME_INF) 218 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
212 return; 219 return;
213 220
214 if (hrtimer_active(&rt_b->rt_period_timer)) 221 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -221,9 +228,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
221 228
222 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 229 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
223 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 230 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
224 hrtimer_start(&rt_b->rt_period_timer, 231 hrtimer_start_expires(&rt_b->rt_period_timer,
225 rt_b->rt_period_timer.expires, 232 HRTIMER_MODE_ABS);
226 HRTIMER_MODE_ABS);
227 } 233 }
228 spin_unlock(&rt_b->rt_runtime_lock); 234 spin_unlock(&rt_b->rt_runtime_lock);
229} 235}
@@ -298,9 +304,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 304static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 305static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
300#endif /* CONFIG_RT_GROUP_SCHED */ 306#endif /* CONFIG_RT_GROUP_SCHED */
301#else /* !CONFIG_FAIR_GROUP_SCHED */ 307#else /* !CONFIG_USER_SCHED */
302#define root_task_group init_task_group 308#define root_task_group init_task_group
303#endif /* CONFIG_FAIR_GROUP_SCHED */ 309#endif /* CONFIG_USER_SCHED */
304 310
305/* task_group_lock serializes add/remove of task groups and also changes to 311/* task_group_lock serializes add/remove of task groups and also changes to
306 * a task group's cpu shares. 312 * a task group's cpu shares.
@@ -380,7 +386,6 @@ struct cfs_rq {
380 386
381 u64 exec_clock; 387 u64 exec_clock;
382 u64 min_vruntime; 388 u64 min_vruntime;
383 u64 pair_start;
384 389
385 struct rb_root tasks_timeline; 390 struct rb_root tasks_timeline;
386 struct rb_node *rb_leftmost; 391 struct rb_node *rb_leftmost;
@@ -392,9 +397,9 @@ struct cfs_rq {
392 * 'curr' points to currently running entity on this cfs_rq. 397 * 'curr' points to currently running entity on this cfs_rq.
393 * It is set to NULL otherwise (i.e when none are currently running). 398 * It is set to NULL otherwise (i.e when none are currently running).
394 */ 399 */
395 struct sched_entity *curr, *next; 400 struct sched_entity *curr, *next, *last;
396 401
397 unsigned long nr_spread_over; 402 unsigned int nr_spread_over;
398 403
399#ifdef CONFIG_FAIR_GROUP_SCHED 404#ifdef CONFIG_FAIR_GROUP_SCHED
400 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 405 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -604,9 +609,9 @@ struct rq {
604 609
605static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 610static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
606 611
607static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 612static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
608{ 613{
609 rq->curr->sched_class->check_preempt_curr(rq, p); 614 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
610} 615}
611 616
612static inline int cpu_of(struct rq *rq) 617static inline int cpu_of(struct rq *rq)
@@ -813,6 +818,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
813unsigned int sysctl_sched_shares_ratelimit = 250000; 818unsigned int sysctl_sched_shares_ratelimit = 250000;
814 819
815/* 820/*
821 * Inject some fuzziness into changing the per-cpu group shares;
822 * this avoids remote rq-locks at the expense of fairness.
823 * default: 4
824 */
825unsigned int sysctl_sched_shares_thresh = 4;
826
827/*
816 * period over which we measure -rt task cpu usage in us. 828 * period over which we measure -rt task cpu usage in us.
817 * default: 1s 829 * default: 1s
818 */ 830 */
@@ -957,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
957 } 969 }
958} 970}
959 971
972void task_rq_unlock_wait(struct task_struct *p)
973{
974 struct rq *rq = task_rq(p);
975
976 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
977 spin_unlock_wait(&rq->lock);
978}
979
960static void __task_rq_unlock(struct rq *rq) 980static void __task_rq_unlock(struct rq *rq)
961 __releases(rq->lock) 981 __releases(rq->lock)
962{ 982{
@@ -1058,7 +1078,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1058 struct hrtimer *timer = &rq->hrtick_timer; 1078 struct hrtimer *timer = &rq->hrtick_timer;
1059 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 1079 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1060 1080
1061 timer->expires = time; 1081 hrtimer_set_expires(timer, time);
1062 1082
1063 if (rq == this_rq()) { 1083 if (rq == this_rq()) {
1064 hrtimer_restart(timer); 1084 hrtimer_restart(timer);
@@ -1102,7 +1122,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1102 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1122 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1103} 1123}
1104 1124
1105static void init_hrtick(void) 1125static inline void init_hrtick(void)
1106{ 1126{
1107} 1127}
1108#endif /* CONFIG_SMP */ 1128#endif /* CONFIG_SMP */
@@ -1119,9 +1139,9 @@ static void init_rq_hrtick(struct rq *rq)
1119 1139
1120 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1140 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1121 rq->hrtick_timer.function = hrtick; 1141 rq->hrtick_timer.function = hrtick;
1122 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1142 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1123} 1143}
1124#else 1144#else /* CONFIG_SCHED_HRTICK */
1125static inline void hrtick_clear(struct rq *rq) 1145static inline void hrtick_clear(struct rq *rq)
1126{ 1146{
1127} 1147}
@@ -1133,7 +1153,7 @@ static inline void init_rq_hrtick(struct rq *rq)
1133static inline void init_hrtick(void) 1153static inline void init_hrtick(void)
1134{ 1154{
1135} 1155}
1136#endif 1156#endif /* CONFIG_SCHED_HRTICK */
1137 1157
1138/* 1158/*
1139 * resched_task - mark a task 'to be rescheduled now'. 1159 * resched_task - mark a task 'to be rescheduled now'.
@@ -1380,38 +1400,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1380 update_load_sub(&rq->load, load); 1400 update_load_sub(&rq->load, load);
1381} 1401}
1382 1402
1383#ifdef CONFIG_SMP 1403#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1384static unsigned long source_load(int cpu, int type); 1404typedef int (*tg_visitor)(struct task_group *, void *);
1385static unsigned long target_load(int cpu, int type);
1386static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1387
1388static unsigned long cpu_avg_load_per_task(int cpu)
1389{
1390 struct rq *rq = cpu_rq(cpu);
1391
1392 if (rq->nr_running)
1393 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1394
1395 return rq->avg_load_per_task;
1396}
1397
1398#ifdef CONFIG_FAIR_GROUP_SCHED
1399
1400typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1401 1405
1402/* 1406/*
1403 * Iterate the full tree, calling @down when first entering a node and @up when 1407 * Iterate the full tree, calling @down when first entering a node and @up when
1404 * leaving it for the final time. 1408 * leaving it for the final time.
1405 */ 1409 */
1406static void 1410static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1407walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1408{ 1411{
1409 struct task_group *parent, *child; 1412 struct task_group *parent, *child;
1413 int ret;
1410 1414
1411 rcu_read_lock(); 1415 rcu_read_lock();
1412 parent = &root_task_group; 1416 parent = &root_task_group;
1413down: 1417down:
1414 (*down)(parent, cpu, sd); 1418 ret = (*down)(parent, data);
1419 if (ret)
1420 goto out_unlock;
1415 list_for_each_entry_rcu(child, &parent->children, siblings) { 1421 list_for_each_entry_rcu(child, &parent->children, siblings) {
1416 parent = child; 1422 parent = child;
1417 goto down; 1423 goto down;
@@ -1419,23 +1425,53 @@ down:
1419up: 1425up:
1420 continue; 1426 continue;
1421 } 1427 }
1422 (*up)(parent, cpu, sd); 1428 ret = (*up)(parent, data);
1429 if (ret)
1430 goto out_unlock;
1423 1431
1424 child = parent; 1432 child = parent;
1425 parent = parent->parent; 1433 parent = parent->parent;
1426 if (parent) 1434 if (parent)
1427 goto up; 1435 goto up;
1436out_unlock:
1428 rcu_read_unlock(); 1437 rcu_read_unlock();
1438
1439 return ret;
1429} 1440}
1430 1441
1442static int tg_nop(struct task_group *tg, void *data)
1443{
1444 return 0;
1445}
1446#endif
1447
1448#ifdef CONFIG_SMP
1449static unsigned long source_load(int cpu, int type);
1450static unsigned long target_load(int cpu, int type);
1451static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1452
1453static unsigned long cpu_avg_load_per_task(int cpu)
1454{
1455 struct rq *rq = cpu_rq(cpu);
1456
1457 if (rq->nr_running)
1458 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1459 else
1460 rq->avg_load_per_task = 0;
1461
1462 return rq->avg_load_per_task;
1463}
1464
1465#ifdef CONFIG_FAIR_GROUP_SCHED
1466
1431static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1467static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1432 1468
1433/* 1469/*
1434 * Calculate and set the cpu's group shares. 1470 * Calculate and set the cpu's group shares.
1435 */ 1471 */
1436static void 1472static void
1437__update_group_shares_cpu(struct task_group *tg, int cpu, 1473update_group_shares_cpu(struct task_group *tg, int cpu,
1438 unsigned long sd_shares, unsigned long sd_rq_weight) 1474 unsigned long sd_shares, unsigned long sd_rq_weight)
1439{ 1475{
1440 int boost = 0; 1476 int boost = 0;
1441 unsigned long shares; 1477 unsigned long shares;
@@ -1466,19 +1502,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1466 * 1502 *
1467 */ 1503 */
1468 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1504 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1505 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1469 1506
1470 /* 1507 if (abs(shares - tg->se[cpu]->load.weight) >
1471 * record the actual number of shares, not the boosted amount. 1508 sysctl_sched_shares_thresh) {
1472 */ 1509 struct rq *rq = cpu_rq(cpu);
1473 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1510 unsigned long flags;
1474 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1475 1511
1476 if (shares < MIN_SHARES) 1512 spin_lock_irqsave(&rq->lock, flags);
1477 shares = MIN_SHARES; 1513 /*
1478 else if (shares > MAX_SHARES) 1514 * record the actual number of shares, not the boosted amount.
1479 shares = MAX_SHARES; 1515 */
1516 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1517 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1480 1518
1481 __set_se_shares(tg->se[cpu], shares); 1519 __set_se_shares(tg->se[cpu], shares);
1520 spin_unlock_irqrestore(&rq->lock, flags);
1521 }
1482} 1522}
1483 1523
1484/* 1524/*
@@ -1486,11 +1526,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1486 * This needs to be done in a bottom-up fashion because the rq weight of a 1526 * This needs to be done in a bottom-up fashion because the rq weight of a
1487 * parent group depends on the shares of its child groups. 1527 * parent group depends on the shares of its child groups.
1488 */ 1528 */
1489static void 1529static int tg_shares_up(struct task_group *tg, void *data)
1490tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1491{ 1530{
1492 unsigned long rq_weight = 0; 1531 unsigned long rq_weight = 0;
1493 unsigned long shares = 0; 1532 unsigned long shares = 0;
1533 struct sched_domain *sd = data;
1494 int i; 1534 int i;
1495 1535
1496 for_each_cpu_mask(i, sd->span) { 1536 for_each_cpu_mask(i, sd->span) {
@@ -1507,14 +1547,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1507 if (!rq_weight) 1547 if (!rq_weight)
1508 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; 1548 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1509 1549
1510 for_each_cpu_mask(i, sd->span) { 1550 for_each_cpu_mask(i, sd->span)
1511 struct rq *rq = cpu_rq(i); 1551 update_group_shares_cpu(tg, i, shares, rq_weight);
1512 unsigned long flags;
1513 1552
1514 spin_lock_irqsave(&rq->lock, flags); 1553 return 0;
1515 __update_group_shares_cpu(tg, i, shares, rq_weight);
1516 spin_unlock_irqrestore(&rq->lock, flags);
1517 }
1518} 1554}
1519 1555
1520/* 1556/*
@@ -1522,10 +1558,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1522 * This needs to be done in a top-down fashion because the load of a child 1558 * This needs to be done in a top-down fashion because the load of a child
1523 * group is a fraction of its parents load. 1559 * group is a fraction of its parents load.
1524 */ 1560 */
1525static void 1561static int tg_load_down(struct task_group *tg, void *data)
1526tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1527{ 1562{
1528 unsigned long load; 1563 unsigned long load;
1564 long cpu = (long)data;
1529 1565
1530 if (!tg->parent) { 1566 if (!tg->parent) {
1531 load = cpu_rq(cpu)->load.weight; 1567 load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1572,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1536 } 1572 }
1537 1573
1538 tg->cfs_rq[cpu]->h_load = load; 1574 tg->cfs_rq[cpu]->h_load = load;
1539}
1540 1575
1541static void 1576 return 0;
1542tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1543{
1544} 1577}
1545 1578
1546static void update_shares(struct sched_domain *sd) 1579static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1583,7 @@ static void update_shares(struct sched_domain *sd)
1550 1583
1551 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1584 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1552 sd->last_update = now; 1585 sd->last_update = now;
1553 walk_tg_tree(tg_nop, tg_shares_up, 0, sd); 1586 walk_tg_tree(tg_nop, tg_shares_up, sd);
1554 } 1587 }
1555} 1588}
1556 1589
@@ -1561,9 +1594,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1561 spin_lock(&rq->lock); 1594 spin_lock(&rq->lock);
1562} 1595}
1563 1596
1564static void update_h_load(int cpu) 1597static void update_h_load(long cpu)
1565{ 1598{
1566 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); 1599 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1567} 1600}
1568 1601
1569#else 1602#else
@@ -1782,7 +1815,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1782 /* 1815 /*
1783 * Buddy candidates are cache hot: 1816 * Buddy candidates are cache hot:
1784 */ 1817 */
1785 if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) 1818 if (sched_feat(CACHE_HOT_BUDDY) &&
1819 (&p->se == cfs_rq_of(&p->se)->next ||
1820 &p->se == cfs_rq_of(&p->se)->last))
1786 return 1; 1821 return 1;
1787 1822
1788 if (p->sched_class != &fair_sched_class) 1823 if (p->sched_class != &fair_sched_class)
@@ -1918,14 +1953,12 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1918 * just go back and repeat. 1953 * just go back and repeat.
1919 */ 1954 */
1920 rq = task_rq_lock(p, &flags); 1955 rq = task_rq_lock(p, &flags);
1956 trace_sched_wait_task(rq, p);
1921 running = task_running(rq, p); 1957 running = task_running(rq, p);
1922 on_rq = p->se.on_rq; 1958 on_rq = p->se.on_rq;
1923 ncsw = 0; 1959 ncsw = 0;
1924 if (!match_state || p->state == match_state) { 1960 if (!match_state || p->state == match_state)
1925 ncsw = p->nivcsw + p->nvcsw; 1961 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1926 if (unlikely(!ncsw))
1927 ncsw = 1;
1928 }
1929 task_rq_unlock(rq, &flags); 1962 task_rq_unlock(rq, &flags);
1930 1963
1931 /* 1964 /*
@@ -2282,10 +2315,8 @@ out_activate:
2282 success = 1; 2315 success = 1;
2283 2316
2284out_running: 2317out_running:
2285 trace_mark(kernel_sched_wakeup, 2318 trace_sched_wakeup(rq, p);
2286 "pid %d state %ld ## rq %p task %p rq->curr %p", 2319 check_preempt_curr(rq, p, sync);
2287 p->pid, p->state, rq, p, rq->curr);
2288 check_preempt_curr(rq, p);
2289 2320
2290 p->state = TASK_RUNNING; 2321 p->state = TASK_RUNNING;
2291#ifdef CONFIG_SMP 2322#ifdef CONFIG_SMP
@@ -2417,10 +2448,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2417 p->sched_class->task_new(rq, p); 2448 p->sched_class->task_new(rq, p);
2418 inc_nr_running(rq); 2449 inc_nr_running(rq);
2419 } 2450 }
2420 trace_mark(kernel_sched_wakeup_new, 2451 trace_sched_wakeup_new(rq, p);
2421 "pid %d state %ld ## rq %p task %p rq->curr %p", 2452 check_preempt_curr(rq, p, 0);
2422 p->pid, p->state, rq, p, rq->curr);
2423 check_preempt_curr(rq, p);
2424#ifdef CONFIG_SMP 2453#ifdef CONFIG_SMP
2425 if (p->sched_class->task_wake_up) 2454 if (p->sched_class->task_wake_up)
2426 p->sched_class->task_wake_up(rq, p); 2455 p->sched_class->task_wake_up(rq, p);
@@ -2592,11 +2621,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2592 struct mm_struct *mm, *oldmm; 2621 struct mm_struct *mm, *oldmm;
2593 2622
2594 prepare_task_switch(rq, prev, next); 2623 prepare_task_switch(rq, prev, next);
2595 trace_mark(kernel_sched_schedule, 2624 trace_sched_switch(rq, prev, next);
2596 "prev_pid %d next_pid %d prev_state %ld "
2597 "## rq %p prev %p next %p",
2598 prev->pid, next->pid, prev->state,
2599 rq, prev, next);
2600 mm = next->mm; 2625 mm = next->mm;
2601 oldmm = prev->active_mm; 2626 oldmm = prev->active_mm;
2602 /* 2627 /*
@@ -2836,6 +2861,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2836 || unlikely(!cpu_active(dest_cpu))) 2861 || unlikely(!cpu_active(dest_cpu)))
2837 goto out; 2862 goto out;
2838 2863
2864 trace_sched_migrate_task(rq, p, dest_cpu);
2839 /* force the process onto the specified CPU */ 2865 /* force the process onto the specified CPU */
2840 if (migrate_task(p, dest_cpu, &req)) { 2866 if (migrate_task(p, dest_cpu, &req)) {
2841 /* Need to wait for migration thread (might exit: take ref). */ 2867 /* Need to wait for migration thread (might exit: take ref). */
@@ -2880,7 +2906,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2880 * Note that idle threads have a prio of MAX_PRIO, for this test 2906 * Note that idle threads have a prio of MAX_PRIO, for this test
2881 * to be always true for them. 2907 * to be always true for them.
2882 */ 2908 */
2883 check_preempt_curr(this_rq, p); 2909 check_preempt_curr(this_rq, p, 0);
2884} 2910}
2885 2911
2886/* 2912/*
@@ -3329,7 +3355,7 @@ small_imbalance:
3329 } else 3355 } else
3330 this_load_per_task = cpu_avg_load_per_task(this_cpu); 3356 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3331 3357
3332 if (max_load - this_load + 2*busiest_load_per_task >= 3358 if (max_load - this_load + busiest_load_per_task >=
3333 busiest_load_per_task * imbn) { 3359 busiest_load_per_task * imbn) {
3334 *imbalance = busiest_load_per_task; 3360 *imbalance = busiest_load_per_task;
3335 return busiest; 3361 return busiest;
@@ -4037,23 +4063,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
4037EXPORT_PER_CPU_SYMBOL(kstat); 4063EXPORT_PER_CPU_SYMBOL(kstat);
4038 4064
4039/* 4065/*
4040 * Return p->sum_exec_runtime plus any more ns on the sched_clock 4066 * Return any ns on the sched_clock that have not yet been banked in
4041 * that have not yet been banked in case the task is currently running. 4067 * @p in case that task is currently running.
4042 */ 4068 */
4043unsigned long long task_sched_runtime(struct task_struct *p) 4069unsigned long long task_delta_exec(struct task_struct *p)
4044{ 4070{
4045 unsigned long flags; 4071 unsigned long flags;
4046 u64 ns, delta_exec;
4047 struct rq *rq; 4072 struct rq *rq;
4073 u64 ns = 0;
4048 4074
4049 rq = task_rq_lock(p, &flags); 4075 rq = task_rq_lock(p, &flags);
4050 ns = p->se.sum_exec_runtime; 4076
4051 if (task_current(rq, p)) { 4077 if (task_current(rq, p)) {
4078 u64 delta_exec;
4079
4052 update_rq_clock(rq); 4080 update_rq_clock(rq);
4053 delta_exec = rq->clock - p->se.exec_start; 4081 delta_exec = rq->clock - p->se.exec_start;
4054 if ((s64)delta_exec > 0) 4082 if ((s64)delta_exec > 0)
4055 ns += delta_exec; 4083 ns = delta_exec;
4056 } 4084 }
4085
4057 task_rq_unlock(rq, &flags); 4086 task_rq_unlock(rq, &flags);
4058 4087
4059 return ns; 4088 return ns;
@@ -4070,6 +4099,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4070 cputime64_t tmp; 4099 cputime64_t tmp;
4071 4100
4072 p->utime = cputime_add(p->utime, cputime); 4101 p->utime = cputime_add(p->utime, cputime);
4102 account_group_user_time(p, cputime);
4073 4103
4074 /* Add user time to cpustat. */ 4104 /* Add user time to cpustat. */
4075 tmp = cputime_to_cputime64(cputime); 4105 tmp = cputime_to_cputime64(cputime);
@@ -4094,6 +4124,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
4094 tmp = cputime_to_cputime64(cputime); 4124 tmp = cputime_to_cputime64(cputime);
4095 4125
4096 p->utime = cputime_add(p->utime, cputime); 4126 p->utime = cputime_add(p->utime, cputime);
4127 account_group_user_time(p, cputime);
4097 p->gtime = cputime_add(p->gtime, cputime); 4128 p->gtime = cputime_add(p->gtime, cputime);
4098 4129
4099 cpustat->user = cputime64_add(cpustat->user, tmp); 4130 cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4129,6 +4160,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4129 } 4160 }
4130 4161
4131 p->stime = cputime_add(p->stime, cputime); 4162 p->stime = cputime_add(p->stime, cputime);
4163 account_group_system_time(p, cputime);
4132 4164
4133 /* Add system time to cpustat. */ 4165 /* Add system time to cpustat. */
4134 tmp = cputime_to_cputime64(cputime); 4166 tmp = cputime_to_cputime64(cputime);
@@ -4170,6 +4202,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4170 4202
4171 if (p == rq->idle) { 4203 if (p == rq->idle) {
4172 p->stime = cputime_add(p->stime, steal); 4204 p->stime = cputime_add(p->stime, steal);
4205 account_group_system_time(p, steal);
4173 if (atomic_read(&rq->nr_iowait) > 0) 4206 if (atomic_read(&rq->nr_iowait) > 0)
4174 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4207 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4175 else 4208 else
@@ -4426,12 +4459,8 @@ need_resched_nonpreemptible:
4426 if (sched_feat(HRTICK)) 4459 if (sched_feat(HRTICK))
4427 hrtick_clear(rq); 4460 hrtick_clear(rq);
4428 4461
4429 /* 4462 spin_lock_irq(&rq->lock);
4430 * Do the rq-clock update outside the rq lock:
4431 */
4432 local_irq_disable();
4433 update_rq_clock(rq); 4463 update_rq_clock(rq);
4434 spin_lock(&rq->lock);
4435 clear_tsk_need_resched(prev); 4464 clear_tsk_need_resched(prev);
4436 4465
4437 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4466 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -4627,6 +4656,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4627} 4656}
4628EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4657EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4629 4658
4659/**
4660 * complete: - signals a single thread waiting on this completion
4661 * @x: holds the state of this particular completion
4662 *
4663 * This will wake up a single thread waiting on this completion. Threads will be
4664 * awakened in the same order in which they were queued.
4665 *
4666 * See also complete_all(), wait_for_completion() and related routines.
4667 */
4630void complete(struct completion *x) 4668void complete(struct completion *x)
4631{ 4669{
4632 unsigned long flags; 4670 unsigned long flags;
@@ -4638,6 +4676,12 @@ void complete(struct completion *x)
4638} 4676}
4639EXPORT_SYMBOL(complete); 4677EXPORT_SYMBOL(complete);
4640 4678
4679/**
4680 * complete_all: - signals all threads waiting on this completion
4681 * @x: holds the state of this particular completion
4682 *
4683 * This will wake up all threads waiting on this particular completion event.
4684 */
4641void complete_all(struct completion *x) 4685void complete_all(struct completion *x)
4642{ 4686{
4643 unsigned long flags; 4687 unsigned long flags;
@@ -4658,10 +4702,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4658 wait.flags |= WQ_FLAG_EXCLUSIVE; 4702 wait.flags |= WQ_FLAG_EXCLUSIVE;
4659 __add_wait_queue_tail(&x->wait, &wait); 4703 __add_wait_queue_tail(&x->wait, &wait);
4660 do { 4704 do {
4661 if ((state == TASK_INTERRUPTIBLE && 4705 if (signal_pending_state(state, current)) {
4662 signal_pending(current)) ||
4663 (state == TASK_KILLABLE &&
4664 fatal_signal_pending(current))) {
4665 timeout = -ERESTARTSYS; 4706 timeout = -ERESTARTSYS;
4666 break; 4707 break;
4667 } 4708 }
@@ -4689,12 +4730,31 @@ wait_for_common(struct completion *x, long timeout, int state)
4689 return timeout; 4730 return timeout;
4690} 4731}
4691 4732
4733/**
4734 * wait_for_completion: - waits for completion of a task
4735 * @x: holds the state of this particular completion
4736 *
4737 * This waits to be signaled for completion of a specific task. It is NOT
4738 * interruptible and there is no timeout.
4739 *
4740 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4741 * and interrupt capability. Also see complete().
4742 */
4692void __sched wait_for_completion(struct completion *x) 4743void __sched wait_for_completion(struct completion *x)
4693{ 4744{
4694 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4745 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4695} 4746}
4696EXPORT_SYMBOL(wait_for_completion); 4747EXPORT_SYMBOL(wait_for_completion);
4697 4748
4749/**
4750 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4751 * @x: holds the state of this particular completion
4752 * @timeout: timeout value in jiffies
4753 *
4754 * This waits for either a completion of a specific task to be signaled or for a
4755 * specified timeout to expire. The timeout is in jiffies. It is not
4756 * interruptible.
4757 */
4698unsigned long __sched 4758unsigned long __sched
4699wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4759wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4700{ 4760{
@@ -4702,6 +4762,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4702} 4762}
4703EXPORT_SYMBOL(wait_for_completion_timeout); 4763EXPORT_SYMBOL(wait_for_completion_timeout);
4704 4764
4765/**
4766 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4767 * @x: holds the state of this particular completion
4768 *
4769 * This waits for completion of a specific task to be signaled. It is
4770 * interruptible.
4771 */
4705int __sched wait_for_completion_interruptible(struct completion *x) 4772int __sched wait_for_completion_interruptible(struct completion *x)
4706{ 4773{
4707 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4774 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4711,6 +4778,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
4711} 4778}
4712EXPORT_SYMBOL(wait_for_completion_interruptible); 4779EXPORT_SYMBOL(wait_for_completion_interruptible);
4713 4780
4781/**
4782 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4783 * @x: holds the state of this particular completion
4784 * @timeout: timeout value in jiffies
4785 *
4786 * This waits for either a completion of a specific task to be signaled or for a
4787 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4788 */
4714unsigned long __sched 4789unsigned long __sched
4715wait_for_completion_interruptible_timeout(struct completion *x, 4790wait_for_completion_interruptible_timeout(struct completion *x,
4716 unsigned long timeout) 4791 unsigned long timeout)
@@ -4719,6 +4794,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
4719} 4794}
4720EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4795EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4721 4796
4797/**
4798 * wait_for_completion_killable: - waits for completion of a task (killable)
4799 * @x: holds the state of this particular completion
4800 *
4801 * This waits to be signaled for completion of a specific task. It can be
4802 * interrupted by a kill signal.
4803 */
4722int __sched wait_for_completion_killable(struct completion *x) 4804int __sched wait_for_completion_killable(struct completion *x)
4723{ 4805{
4724 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4806 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -5121,7 +5203,8 @@ recheck:
5121 * Do not allow realtime tasks into groups that have no runtime 5203 * Do not allow realtime tasks into groups that have no runtime
5122 * assigned. 5204 * assigned.
5123 */ 5205 */
5124 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5206 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5207 task_group(p)->rt_bandwidth.rt_runtime == 0)
5125 return -EPERM; 5208 return -EPERM;
5126#endif 5209#endif
5127 5210
@@ -5787,6 +5870,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5787 struct rq *rq = cpu_rq(cpu); 5870 struct rq *rq = cpu_rq(cpu);
5788 unsigned long flags; 5871 unsigned long flags;
5789 5872
5873 spin_lock_irqsave(&rq->lock, flags);
5874
5790 __sched_fork(idle); 5875 __sched_fork(idle);
5791 idle->se.exec_start = sched_clock(); 5876 idle->se.exec_start = sched_clock();
5792 5877
@@ -5794,7 +5879,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5794 idle->cpus_allowed = cpumask_of_cpu(cpu); 5879 idle->cpus_allowed = cpumask_of_cpu(cpu);
5795 __set_task_cpu(idle, cpu); 5880 __set_task_cpu(idle, cpu);
5796 5881
5797 spin_lock_irqsave(&rq->lock, flags);
5798 rq->curr = rq->idle = idle; 5882 rq->curr = rq->idle = idle;
5799#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5883#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5800 idle->oncpu = 1; 5884 idle->oncpu = 1;
@@ -5957,7 +6041,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5957 set_task_cpu(p, dest_cpu); 6041 set_task_cpu(p, dest_cpu);
5958 if (on_rq) { 6042 if (on_rq) {
5959 activate_task(rq_dest, p, 0); 6043 activate_task(rq_dest, p, 0);
5960 check_preempt_curr(rq_dest, p); 6044 check_preempt_curr(rq_dest, p, 0);
5961 } 6045 }
5962done: 6046done:
5963 ret = 1; 6047 ret = 1;
@@ -6282,7 +6366,7 @@ set_table_entry(struct ctl_table *entry,
6282static struct ctl_table * 6366static struct ctl_table *
6283sd_alloc_ctl_domain_table(struct sched_domain *sd) 6367sd_alloc_ctl_domain_table(struct sched_domain *sd)
6284{ 6368{
6285 struct ctl_table *table = sd_alloc_ctl_entry(12); 6369 struct ctl_table *table = sd_alloc_ctl_entry(13);
6286 6370
6287 if (table == NULL) 6371 if (table == NULL)
6288 return NULL; 6372 return NULL;
@@ -6310,7 +6394,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
6310 sizeof(int), 0644, proc_dointvec_minmax); 6394 sizeof(int), 0644, proc_dointvec_minmax);
6311 set_table_entry(&table[10], "flags", &sd->flags, 6395 set_table_entry(&table[10], "flags", &sd->flags,
6312 sizeof(int), 0644, proc_dointvec_minmax); 6396 sizeof(int), 0644, proc_dointvec_minmax);
6313 /* &table[11] is terminator */ 6397 set_table_entry(&table[11], "name", sd->name,
6398 CORENAME_MAX_SIZE, 0444, proc_dostring);
6399 /* &table[12] is terminator */
6314 6400
6315 return table; 6401 return table;
6316} 6402}
@@ -6802,15 +6888,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6802 struct sched_domain *tmp; 6888 struct sched_domain *tmp;
6803 6889
6804 /* Remove the sched domains which do not contribute to scheduling. */ 6890 /* Remove the sched domains which do not contribute to scheduling. */
6805 for (tmp = sd; tmp; tmp = tmp->parent) { 6891 for (tmp = sd; tmp; ) {
6806 struct sched_domain *parent = tmp->parent; 6892 struct sched_domain *parent = tmp->parent;
6807 if (!parent) 6893 if (!parent)
6808 break; 6894 break;
6895
6809 if (sd_parent_degenerate(tmp, parent)) { 6896 if (sd_parent_degenerate(tmp, parent)) {
6810 tmp->parent = parent->parent; 6897 tmp->parent = parent->parent;
6811 if (parent->parent) 6898 if (parent->parent)
6812 parent->parent->child = tmp; 6899 parent->parent->child = tmp;
6813 } 6900 } else
6901 tmp = tmp->parent;
6814 } 6902 }
6815 6903
6816 if (sd && sd_degenerate(sd)) { 6904 if (sd && sd_degenerate(sd)) {
@@ -7194,13 +7282,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7194 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7282 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7195 */ 7283 */
7196 7284
7285#ifdef CONFIG_SCHED_DEBUG
7286# define SD_INIT_NAME(sd, type) sd->name = #type
7287#else
7288# define SD_INIT_NAME(sd, type) do { } while (0)
7289#endif
7290
7197#define SD_INIT(sd, type) sd_init_##type(sd) 7291#define SD_INIT(sd, type) sd_init_##type(sd)
7292
7198#define SD_INIT_FUNC(type) \ 7293#define SD_INIT_FUNC(type) \
7199static noinline void sd_init_##type(struct sched_domain *sd) \ 7294static noinline void sd_init_##type(struct sched_domain *sd) \
7200{ \ 7295{ \
7201 memset(sd, 0, sizeof(*sd)); \ 7296 memset(sd, 0, sizeof(*sd)); \
7202 *sd = SD_##type##_INIT; \ 7297 *sd = SD_##type##_INIT; \
7203 sd->level = SD_LV_##type; \ 7298 sd->level = SD_LV_##type; \
7299 SD_INIT_NAME(sd, type); \
7204} 7300}
7205 7301
7206SD_INIT_FUNC(CPU) 7302SD_INIT_FUNC(CPU)
@@ -7591,6 +7687,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7591error: 7687error:
7592 free_sched_groups(cpu_map, tmpmask); 7688 free_sched_groups(cpu_map, tmpmask);
7593 SCHED_CPUMASK_FREE((void *)allmasks); 7689 SCHED_CPUMASK_FREE((void *)allmasks);
7690 kfree(rd);
7594 return -ENOMEM; 7691 return -ENOMEM;
7595#endif 7692#endif
7596} 7693}
@@ -7692,13 +7789,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7692 * 7789 *
7693 * The passed in 'doms_new' should be kmalloc'd. This routine takes 7790 * The passed in 'doms_new' should be kmalloc'd. This routine takes
7694 * ownership of it and will kfree it when done with it. If the caller 7791 * ownership of it and will kfree it when done with it. If the caller
7695 * failed the kmalloc call, then it can pass in doms_new == NULL, 7792 * failed the kmalloc call, then it can pass in doms_new == NULL &&
7696 * and partition_sched_domains() will fallback to the single partition 7793 * ndoms_new == 1, and partition_sched_domains() will fallback to
7697 * 'fallback_doms', it also forces the domains to be rebuilt. 7794 * the single partition 'fallback_doms', it also forces the domains
7795 * to be rebuilt.
7698 * 7796 *
7699 * If doms_new==NULL it will be replaced with cpu_online_map. 7797 * If doms_new == NULL it will be replaced with cpu_online_map.
7700 * ndoms_new==0 is a special case for destroying existing domains. 7798 * ndoms_new == 0 is a special case for destroying existing domains,
7701 * It will not create the default domain. 7799 * and it will not create the default domain.
7702 * 7800 *
7703 * Call with hotplug lock held 7801 * Call with hotplug lock held
7704 */ 7802 */
@@ -8242,20 +8340,25 @@ void __might_sleep(char *file, int line)
8242#ifdef in_atomic 8340#ifdef in_atomic
8243 static unsigned long prev_jiffy; /* ratelimiting */ 8341 static unsigned long prev_jiffy; /* ratelimiting */
8244 8342
8245 if ((in_atomic() || irqs_disabled()) && 8343 if ((!in_atomic() && !irqs_disabled()) ||
8246 system_state == SYSTEM_RUNNING && !oops_in_progress) { 8344 system_state != SYSTEM_RUNNING || oops_in_progress)
8247 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8345 return;
8248 return; 8346 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8249 prev_jiffy = jiffies; 8347 return;
8250 printk(KERN_ERR "BUG: sleeping function called from invalid" 8348 prev_jiffy = jiffies;
8251 " context at %s:%d\n", file, line); 8349
8252 printk("in_atomic():%d, irqs_disabled():%d\n", 8350 printk(KERN_ERR
8253 in_atomic(), irqs_disabled()); 8351 "BUG: sleeping function called from invalid context at %s:%d\n",
8254 debug_show_held_locks(current); 8352 file, line);
8255 if (irqs_disabled()) 8353 printk(KERN_ERR
8256 print_irqtrace_events(current); 8354 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8257 dump_stack(); 8355 in_atomic(), irqs_disabled(),
8258 } 8356 current->pid, current->comm);
8357
8358 debug_show_held_locks(current);
8359 if (irqs_disabled())
8360 print_irqtrace_events(current);
8361 dump_stack();
8259#endif 8362#endif
8260} 8363}
8261EXPORT_SYMBOL(__might_sleep); 8364EXPORT_SYMBOL(__might_sleep);
@@ -8753,73 +8856,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
8753static unsigned long to_ratio(u64 period, u64 runtime) 8856static unsigned long to_ratio(u64 period, u64 runtime)
8754{ 8857{
8755 if (runtime == RUNTIME_INF) 8858 if (runtime == RUNTIME_INF)
8756 return 1ULL << 16; 8859 return 1ULL << 20;
8757 8860
8758 return div64_u64(runtime << 16, period); 8861 return div64_u64(runtime << 20, period);
8759} 8862}
8760 8863
8761#ifdef CONFIG_CGROUP_SCHED 8864/* Must be called with tasklist_lock held */
8762static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8865static inline int tg_has_rt_tasks(struct task_group *tg)
8763{ 8866{
8764 struct task_group *tgi, *parent = tg->parent; 8867 struct task_struct *g, *p;
8765 unsigned long total = 0;
8766 8868
8767 if (!parent) { 8869 do_each_thread(g, p) {
8768 if (global_rt_period() < period) 8870 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8769 return 0; 8871 return 1;
8872 } while_each_thread(g, p);
8770 8873
8771 return to_ratio(period, runtime) < 8874 return 0;
8772 to_ratio(global_rt_period(), global_rt_runtime()); 8875}
8773 }
8774 8876
8775 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) 8877struct rt_schedulable_data {
8776 return 0; 8878 struct task_group *tg;
8879 u64 rt_period;
8880 u64 rt_runtime;
8881};
8777 8882
8778 rcu_read_lock(); 8883static int tg_schedulable(struct task_group *tg, void *data)
8779 list_for_each_entry_rcu(tgi, &parent->children, siblings) { 8884{
8780 if (tgi == tg) 8885 struct rt_schedulable_data *d = data;
8781 continue; 8886 struct task_group *child;
8887 unsigned long total, sum = 0;
8888 u64 period, runtime;
8889
8890 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8891 runtime = tg->rt_bandwidth.rt_runtime;
8782 8892
8783 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8893 if (tg == d->tg) {
8784 tgi->rt_bandwidth.rt_runtime); 8894 period = d->rt_period;
8895 runtime = d->rt_runtime;
8785 } 8896 }
8786 rcu_read_unlock();
8787 8897
8788 return total + to_ratio(period, runtime) <= 8898 /*
8789 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8899 * Cannot have more runtime than the period.
8790 parent->rt_bandwidth.rt_runtime); 8900 */
8791} 8901 if (runtime > period && runtime != RUNTIME_INF)
8792#elif defined CONFIG_USER_SCHED 8902 return -EINVAL;
8793static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8794{
8795 struct task_group *tgi;
8796 unsigned long total = 0;
8797 unsigned long global_ratio =
8798 to_ratio(global_rt_period(), global_rt_runtime());
8799 8903
8800 rcu_read_lock(); 8904 /*
8801 list_for_each_entry_rcu(tgi, &task_groups, list) { 8905 * Ensure we don't starve existing RT tasks.
8802 if (tgi == tg) 8906 */
8803 continue; 8907 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8908 return -EBUSY;
8909
8910 total = to_ratio(period, runtime);
8804 8911
8805 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8912 /*
8806 tgi->rt_bandwidth.rt_runtime); 8913 * Nobody can have more than the global setting allows.
8914 */
8915 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8916 return -EINVAL;
8917
8918 /*
8919 * The sum of our children's runtime should not exceed our own.
8920 */
8921 list_for_each_entry_rcu(child, &tg->children, siblings) {
8922 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8923 runtime = child->rt_bandwidth.rt_runtime;
8924
8925 if (child == d->tg) {
8926 period = d->rt_period;
8927 runtime = d->rt_runtime;
8928 }
8929
8930 sum += to_ratio(period, runtime);
8807 } 8931 }
8808 rcu_read_unlock();
8809 8932
8810 return total + to_ratio(period, runtime) < global_ratio; 8933 if (sum > total)
8934 return -EINVAL;
8935
8936 return 0;
8811} 8937}
8812#endif
8813 8938
8814/* Must be called with tasklist_lock held */ 8939static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8815static inline int tg_has_rt_tasks(struct task_group *tg)
8816{ 8940{
8817 struct task_struct *g, *p; 8941 struct rt_schedulable_data data = {
8818 do_each_thread(g, p) { 8942 .tg = tg,
8819 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8943 .rt_period = period,
8820 return 1; 8944 .rt_runtime = runtime,
8821 } while_each_thread(g, p); 8945 };
8822 return 0; 8946
8947 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8823} 8948}
8824 8949
8825static int tg_set_bandwidth(struct task_group *tg, 8950static int tg_set_bandwidth(struct task_group *tg,
@@ -8829,14 +8954,9 @@ static int tg_set_bandwidth(struct task_group *tg,
8829 8954
8830 mutex_lock(&rt_constraints_mutex); 8955 mutex_lock(&rt_constraints_mutex);
8831 read_lock(&tasklist_lock); 8956 read_lock(&tasklist_lock);
8832 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { 8957 err = __rt_schedulable(tg, rt_period, rt_runtime);
8833 err = -EBUSY; 8958 if (err)
8834 goto unlock;
8835 }
8836 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8837 err = -EINVAL;
8838 goto unlock; 8959 goto unlock;
8839 }
8840 8960
8841 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8961 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8842 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8962 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8905,19 +9025,25 @@ long sched_group_rt_period(struct task_group *tg)
8905 9025
8906static int sched_rt_global_constraints(void) 9026static int sched_rt_global_constraints(void)
8907{ 9027{
8908 struct task_group *tg = &root_task_group; 9028 u64 runtime, period;
8909 u64 rt_runtime, rt_period;
8910 int ret = 0; 9029 int ret = 0;
8911 9030
8912 if (sysctl_sched_rt_period <= 0) 9031 if (sysctl_sched_rt_period <= 0)
8913 return -EINVAL; 9032 return -EINVAL;
8914 9033
8915 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 9034 runtime = global_rt_runtime();
8916 rt_runtime = tg->rt_bandwidth.rt_runtime; 9035 period = global_rt_period();
9036
9037 /*
9038 * Sanity check on the sysctl variables.
9039 */
9040 if (runtime > period && runtime != RUNTIME_INF)
9041 return -EINVAL;
8917 9042
8918 mutex_lock(&rt_constraints_mutex); 9043 mutex_lock(&rt_constraints_mutex);
8919 if (!__rt_schedulable(tg, rt_period, rt_runtime)) 9044 read_lock(&tasklist_lock);
8920 ret = -EINVAL; 9045 ret = __rt_schedulable(NULL, 0, 0);
9046 read_unlock(&tasklist_lock);
8921 mutex_unlock(&rt_constraints_mutex); 9047 mutex_unlock(&rt_constraints_mutex);
8922 9048
8923 return ret; 9049 return ret;
@@ -8991,7 +9117,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8991 9117
8992 if (!cgrp->parent) { 9118 if (!cgrp->parent) {
8993 /* This is early initialization for the top cgroup */ 9119 /* This is early initialization for the top cgroup */
8994 init_task_group.css.cgroup = cgrp;
8995 return &init_task_group.css; 9120 return &init_task_group.css;
8996 } 9121 }
8997 9122
@@ -9000,9 +9125,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9000 if (IS_ERR(tg)) 9125 if (IS_ERR(tg))
9001 return ERR_PTR(-ENOMEM); 9126 return ERR_PTR(-ENOMEM);
9002 9127
9003 /* Bind the cgroup to task_group object we just created */
9004 tg->css.cgroup = cgrp;
9005
9006 return &tg->css; 9128 return &tg->css;
9007} 9129}
9008 9130
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e8ab096ddfe3..81787248b60f 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
118 118
119 /* 119 /*
120 * scd->clock = clamp(scd->tick_gtod + delta, 120 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock), 121 * max(scd->tick_gtod, scd->clock),
122 * scd->tick_gtod + TICK_NSEC); 122 * max(scd->clock, scd->tick_gtod + TICK_NSEC));
123 */ 123 */
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
126 min_clock = wrap_max(scd->tick_gtod, scd->clock); 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = scd->tick_gtod + TICK_NSEC; 127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
128 128
129 clock = wrap_max(clock, min_clock); 129 clock = wrap_max(clock, min_clock);
130 clock = wrap_min(clock, max_clock); 130 clock = wrap_min(clock, max_clock);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index bbe6b31c3c56..26ed8e3d1c15 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -144,7 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
144 last = __pick_last_entity(cfs_rq); 144 last = __pick_last_entity(cfs_rq);
145 if (last) 145 if (last)
146 max_vruntime = last->vruntime; 146 max_vruntime = last->vruntime;
147 min_vruntime = rq->cfs.min_vruntime; 147 min_vruntime = cfs_rq->min_vruntime;
148 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 148 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
149 spin_unlock_irqrestore(&rq->lock, flags); 149 spin_unlock_irqrestore(&rq->lock, flags);
150 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 150 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
@@ -161,26 +161,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
161 SPLIT_NS(spread0)); 161 SPLIT_NS(spread0));
162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
164#ifdef CONFIG_SCHEDSTATS
165#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
166
167 P(yld_exp_empty);
168 P(yld_act_empty);
169 P(yld_both_empty);
170 P(yld_count);
171 164
172 P(sched_switch); 165 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
173 P(sched_count);
174 P(sched_goidle);
175
176 P(ttwu_count);
177 P(ttwu_local);
178
179 P(bkl_count);
180
181#undef P
182#endif
183 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
184 cfs_rq->nr_spread_over); 166 cfs_rq->nr_spread_over);
185#ifdef CONFIG_FAIR_GROUP_SCHED 167#ifdef CONFIG_FAIR_GROUP_SCHED
186#ifdef CONFIG_SMP 168#ifdef CONFIG_SMP
@@ -260,6 +242,25 @@ static void print_cpu(struct seq_file *m, int cpu)
260#undef P 242#undef P
261#undef PN 243#undef PN
262 244
245#ifdef CONFIG_SCHEDSTATS
246#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
247
248 P(yld_exp_empty);
249 P(yld_act_empty);
250 P(yld_both_empty);
251 P(yld_count);
252
253 P(sched_switch);
254 P(sched_count);
255 P(sched_goidle);
256
257 P(ttwu_count);
258 P(ttwu_local);
259
260 P(bkl_count);
261
262#undef P
263#endif
263 print_cfs_stats(m, cpu); 264 print_cfs_stats(m, cpu);
264 print_rt_stats(m, cpu); 265 print_rt_stats(m, cpu);
265 266
@@ -319,7 +320,7 @@ static int __init init_sched_debug_procfs(void)
319{ 320{
320 struct proc_dir_entry *pe; 321 struct proc_dir_entry *pe;
321 322
322 pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops); 323 pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
323 if (!pe) 324 if (!pe)
324 return -ENOMEM; 325 return -ENOMEM;
325 return 0; 326 return 0;
@@ -333,12 +334,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
333 unsigned long flags; 334 unsigned long flags;
334 int num_threads = 1; 335 int num_threads = 1;
335 336
336 rcu_read_lock();
337 if (lock_task_sighand(p, &flags)) { 337 if (lock_task_sighand(p, &flags)) {
338 num_threads = atomic_read(&p->signal->count); 338 num_threads = atomic_read(&p->signal->count);
339 unlock_task_sighand(p, &flags); 339 unlock_task_sighand(p, &flags);
340 } 340 }
341 rcu_read_unlock();
342 341
343 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 342 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
344 SEQ_printf(m, 343 SEQ_printf(m,
@@ -424,10 +423,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
424#undef __P 423#undef __P
425 424
426 { 425 {
426 unsigned int this_cpu = raw_smp_processor_id();
427 u64 t0, t1; 427 u64 t0, t1;
428 428
429 t0 = sched_clock(); 429 t0 = cpu_clock(this_cpu);
430 t1 = sched_clock(); 430 t1 = cpu_clock(this_cpu);
431 SEQ_printf(m, "%-35s:%21Ld\n", 431 SEQ_printf(m, "%-35s:%21Ld\n",
432 "clock-delta", (long long)(t1-t0)); 432 "clock-delta", (long long)(t1-t0));
433 } 433 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..98345e45b059 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
76static const struct sched_class fair_sched_class;
77
76/************************************************************** 78/**************************************************************
77 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
78 */ 80 */
@@ -141,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
141 return se->parent; 143 return se->parent;
142} 144}
143 145
146/* return depth at which a sched entity is present in the hierarchy */
147static inline int depth_se(struct sched_entity *se)
148{
149 int depth = 0;
150
151 for_each_sched_entity(se)
152 depth++;
153
154 return depth;
155}
156
157static void
158find_matching_se(struct sched_entity **se, struct sched_entity **pse)
159{
160 int se_depth, pse_depth;
161
162 /*
163 * preemption test can be made between sibling entities who are in the
164 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
165 * both tasks until we find their ancestors who are siblings of common
166 * parent.
167 */
168
169 /* First walk up until both entities are at same depth */
170 se_depth = depth_se(*se);
171 pse_depth = depth_se(*pse);
172
173 while (se_depth > pse_depth) {
174 se_depth--;
175 *se = parent_entity(*se);
176 }
177
178 while (pse_depth > se_depth) {
179 pse_depth--;
180 *pse = parent_entity(*pse);
181 }
182
183 while (!is_same_group(*se, *pse)) {
184 *se = parent_entity(*se);
185 *pse = parent_entity(*pse);
186 }
187}
188
144#else /* CONFIG_FAIR_GROUP_SCHED */ 189#else /* CONFIG_FAIR_GROUP_SCHED */
145 190
146static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 191static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -191,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
191 return NULL; 236 return NULL;
192} 237}
193 238
239static inline void
240find_matching_se(struct sched_entity **se, struct sched_entity **pse)
241{
242}
243
194#endif /* CONFIG_FAIR_GROUP_SCHED */ 244#endif /* CONFIG_FAIR_GROUP_SCHED */
195 245
196 246
@@ -221,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
221 return se->vruntime - cfs_rq->min_vruntime; 271 return se->vruntime - cfs_rq->min_vruntime;
222} 272}
223 273
274static void update_min_vruntime(struct cfs_rq *cfs_rq)
275{
276 u64 vruntime = cfs_rq->min_vruntime;
277
278 if (cfs_rq->curr)
279 vruntime = cfs_rq->curr->vruntime;
280
281 if (cfs_rq->rb_leftmost) {
282 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
283 struct sched_entity,
284 run_node);
285
286 if (vruntime == cfs_rq->min_vruntime)
287 vruntime = se->vruntime;
288 else
289 vruntime = min_vruntime(vruntime, se->vruntime);
290 }
291
292 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
293}
294
224/* 295/*
225 * Enqueue an entity into the rb-tree: 296 * Enqueue an entity into the rb-tree:
226 */ 297 */
@@ -254,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
254 * Maintain a cache of leftmost tree entries (it is frequently 325 * Maintain a cache of leftmost tree entries (it is frequently
255 * used): 326 * used):
256 */ 327 */
257 if (leftmost) { 328 if (leftmost)
258 cfs_rq->rb_leftmost = &se->run_node; 329 cfs_rq->rb_leftmost = &se->run_node;
259 /*
260 * maintain cfs_rq->min_vruntime to be a monotonic increasing
261 * value tracking the leftmost vruntime in the tree.
262 */
263 cfs_rq->min_vruntime =
264 max_vruntime(cfs_rq->min_vruntime, se->vruntime);
265 }
266 330
267 rb_link_node(&se->run_node, parent, link); 331 rb_link_node(&se->run_node, parent, link);
268 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); 332 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,37 +336,25 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
272{ 336{
273 if (cfs_rq->rb_leftmost == &se->run_node) { 337 if (cfs_rq->rb_leftmost == &se->run_node) {
274 struct rb_node *next_node; 338 struct rb_node *next_node;
275 struct sched_entity *next;
276 339
277 next_node = rb_next(&se->run_node); 340 next_node = rb_next(&se->run_node);
278 cfs_rq->rb_leftmost = next_node; 341 cfs_rq->rb_leftmost = next_node;
279
280 if (next_node) {
281 next = rb_entry(next_node,
282 struct sched_entity, run_node);
283 cfs_rq->min_vruntime =
284 max_vruntime(cfs_rq->min_vruntime,
285 next->vruntime);
286 }
287 } 342 }
288 343
289 if (cfs_rq->next == se)
290 cfs_rq->next = NULL;
291
292 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 344 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
293} 345}
294 346
295static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
296{
297 return cfs_rq->rb_leftmost;
298}
299
300static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 347static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
301{ 348{
302 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); 349 struct rb_node *left = cfs_rq->rb_leftmost;
350
351 if (!left)
352 return NULL;
353
354 return rb_entry(left, struct sched_entity, run_node);
303} 355}
304 356
305static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 357static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
306{ 358{
307 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 359 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
308 360
@@ -334,7 +386,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
334#endif 386#endif
335 387
336/* 388/*
337 * delta *= w / rw 389 * delta *= P[w / rw]
338 */ 390 */
339static inline unsigned long 391static inline unsigned long
340calc_delta_weight(unsigned long delta, struct sched_entity *se) 392calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +400,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se)
348} 400}
349 401
350/* 402/*
351 * delta *= rw / w 403 * delta /= w
352 */ 404 */
353static inline unsigned long 405static inline unsigned long
354calc_delta_fair(unsigned long delta, struct sched_entity *se) 406calc_delta_fair(unsigned long delta, struct sched_entity *se)
355{ 407{
356 for_each_sched_entity(se) { 408 if (unlikely(se->load.weight != NICE_0_LOAD))
357 delta = calc_delta_mine(delta, 409 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
358 cfs_rq_of(se)->load.weight, &se->load);
359 }
360 410
361 return delta; 411 return delta;
362} 412}
@@ -386,84 +436,26 @@ static u64 __sched_period(unsigned long nr_running)
386 * We calculate the wall-time slice from the period by taking a part 436 * We calculate the wall-time slice from the period by taking a part
387 * proportional to the weight. 437 * proportional to the weight.
388 * 438 *
389 * s = p*w/rw 439 * s = p*P[w/rw]
390 */ 440 */
391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 441static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
392{ 442{
393 return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
394}
395
396/*
397 * We calculate the vruntime slice of a to be inserted task
398 *
399 * vs = s*rw/w = p
400 */
401static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
402{
403 unsigned long nr_running = cfs_rq->nr_running; 443 unsigned long nr_running = cfs_rq->nr_running;
404 444
405 if (!se->on_rq) 445 if (unlikely(!se->on_rq))
406 nr_running++; 446 nr_running++;
407 447
408 return __sched_period(nr_running); 448 return calc_delta_weight(__sched_period(nr_running), se);
409} 449}
410 450
411/* 451/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in 452 * We calculate the vruntime slice of a to be inserted task
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 * 453 *
454 * vs = s/w
421 */ 455 */
422static unsigned long 456static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
424{ 457{
425 struct load_weight lw = { 458 return calc_delta_fair(sched_slice(cfs_rq, se), se);
426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429
430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load;
432 unsigned long rw = cfs_rq_of(se)->load.weight;
433
434#ifdef CONFIG_FAIR_SCHED_GROUP
435 struct cfs_rq *cfs_rq = se->my_q;
436 struct task_group *tg = NULL
437
438 if (cfs_rq)
439 tg = cfs_rq->tg;
440
441 if (tg && tg->shares < NICE_0_LOAD) {
442 /*
443 * scale shares to what it would have been had
444 * tg->weight been NICE_0_LOAD:
445 *
446 * weight = 1024 * shares / tg->weight
447 */
448 lw.weight *= se->load.weight;
449 lw.weight /= tg->shares;
450
451 lw.inv_weight = 0;
452
453 se_lw = &lw;
454 rw += lw.weight - se->load.weight;
455 } else
456#endif
457
458 if (se->load.weight < NICE_0_LOAD) {
459 se_lw = &lw;
460 rw += NICE_0_LOAD - se->load.weight;
461 }
462
463 delta = calc_delta_mine(delta, rw, se_lw);
464 }
465
466 return delta;
467} 459}
468 460
469/* 461/*
@@ -482,6 +474,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
482 schedstat_add(cfs_rq, exec_clock, delta_exec); 474 schedstat_add(cfs_rq, exec_clock, delta_exec);
483 delta_exec_weighted = calc_delta_fair(delta_exec, curr); 475 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
484 curr->vruntime += delta_exec_weighted; 476 curr->vruntime += delta_exec_weighted;
477 update_min_vruntime(cfs_rq);
485} 478}
486 479
487static void update_curr(struct cfs_rq *cfs_rq) 480static void update_curr(struct cfs_rq *cfs_rq)
@@ -507,6 +500,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
507 struct task_struct *curtask = task_of(curr); 500 struct task_struct *curtask = task_of(curr);
508 501
509 cpuacct_charge(curtask, delta_exec); 502 cpuacct_charge(curtask, delta_exec);
503 account_group_exec_runtime(curtask, delta_exec);
510 } 504 }
511} 505}
512 506
@@ -586,11 +580,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
586 update_load_add(&cfs_rq->load, se->load.weight); 580 update_load_add(&cfs_rq->load, se->load.weight);
587 if (!parent_entity(se)) 581 if (!parent_entity(se))
588 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 582 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
589 if (entity_is_task(se)) 583 if (entity_is_task(se)) {
590 add_cfs_task_weight(cfs_rq, se->load.weight); 584 add_cfs_task_weight(cfs_rq, se->load.weight);
585 list_add(&se->group_node, &cfs_rq->tasks);
586 }
591 cfs_rq->nr_running++; 587 cfs_rq->nr_running++;
592 se->on_rq = 1; 588 se->on_rq = 1;
593 list_add(&se->group_node, &cfs_rq->tasks);
594} 589}
595 590
596static void 591static void
@@ -599,11 +594,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
599 update_load_sub(&cfs_rq->load, se->load.weight); 594 update_load_sub(&cfs_rq->load, se->load.weight);
600 if (!parent_entity(se)) 595 if (!parent_entity(se))
601 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 596 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
602 if (entity_is_task(se)) 597 if (entity_is_task(se)) {
603 add_cfs_task_weight(cfs_rq, -se->load.weight); 598 add_cfs_task_weight(cfs_rq, -se->load.weight);
599 list_del_init(&se->group_node);
600 }
604 cfs_rq->nr_running--; 601 cfs_rq->nr_running--;
605 se->on_rq = 0; 602 se->on_rq = 0;
606 list_del_init(&se->group_node);
607} 603}
608 604
609static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 605static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -668,13 +664,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
668static void 664static void
669place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) 665place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
670{ 666{
671 u64 vruntime; 667 u64 vruntime = cfs_rq->min_vruntime;
672
673 if (first_fair(cfs_rq)) {
674 vruntime = min_vruntime(cfs_rq->min_vruntime,
675 __pick_next_entity(cfs_rq)->vruntime);
676 } else
677 vruntime = cfs_rq->min_vruntime;
678 668
679 /* 669 /*
680 * The 'current' period is already promised to the current tasks, 670 * The 'current' period is already promised to the current tasks,
@@ -683,7 +673,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
683 * stays open at the end. 673 * stays open at the end.
684 */ 674 */
685 if (initial && sched_feat(START_DEBIT)) 675 if (initial && sched_feat(START_DEBIT))
686 vruntime += sched_vslice_add(cfs_rq, se); 676 vruntime += sched_vslice(cfs_rq, se);
687 677
688 if (!initial) { 678 if (!initial) {
689 /* sleeps upto a single latency don't count. */ 679 /* sleeps upto a single latency don't count. */
@@ -726,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
726 __enqueue_entity(cfs_rq, se); 716 __enqueue_entity(cfs_rq, se);
727} 717}
728 718
719static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
720{
721 if (cfs_rq->last == se)
722 cfs_rq->last = NULL;
723
724 if (cfs_rq->next == se)
725 cfs_rq->next = NULL;
726}
727
729static void 728static void
730dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 729dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
731{ 730{
@@ -748,9 +747,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
748#endif 747#endif
749 } 748 }
750 749
750 clear_buddies(cfs_rq, se);
751
751 if (se != cfs_rq->curr) 752 if (se != cfs_rq->curr)
752 __dequeue_entity(cfs_rq, se); 753 __dequeue_entity(cfs_rq, se);
753 account_entity_dequeue(cfs_rq, se); 754 account_entity_dequeue(cfs_rq, se);
755 update_min_vruntime(cfs_rq);
754} 756}
755 757
756/* 758/*
@@ -797,29 +799,18 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
797 se->prev_sum_exec_runtime = se->sum_exec_runtime; 799 se->prev_sum_exec_runtime = se->sum_exec_runtime;
798} 800}
799 801
800static struct sched_entity * 802static int
801pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) 803wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
802{
803 struct rq *rq = rq_of(cfs_rq);
804 u64 pair_slice = rq->clock - cfs_rq->pair_start;
805
806 if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
807 cfs_rq->pair_start = rq->clock;
808 return se;
809 }
810
811 return cfs_rq->next;
812}
813 804
814static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 805static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
815{ 806{
816 struct sched_entity *se = NULL; 807 struct sched_entity *se = __pick_next_entity(cfs_rq);
817 808
818 if (first_fair(cfs_rq)) { 809 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
819 se = __pick_next_entity(cfs_rq); 810 return cfs_rq->next;
820 se = pick_next(cfs_rq, se); 811
821 set_next_entity(cfs_rq, se); 812 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
822 } 813 return cfs_rq->last;
823 814
824 return se; 815 return se;
825} 816}
@@ -904,11 +895,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
904 hrtick_start(rq, delta); 895 hrtick_start(rq, delta);
905 } 896 }
906} 897}
898
899/*
900 * called from enqueue/dequeue and updates the hrtick when the
901 * current task is from our class and nr_running is low enough
902 * to matter.
903 */
904static void hrtick_update(struct rq *rq)
905{
906 struct task_struct *curr = rq->curr;
907
908 if (curr->sched_class != &fair_sched_class)
909 return;
910
911 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
912 hrtick_start_fair(rq, curr);
913}
907#else /* !CONFIG_SCHED_HRTICK */ 914#else /* !CONFIG_SCHED_HRTICK */
908static inline void 915static inline void
909hrtick_start_fair(struct rq *rq, struct task_struct *p) 916hrtick_start_fair(struct rq *rq, struct task_struct *p)
910{ 917{
911} 918}
919
920static inline void hrtick_update(struct rq *rq)
921{
922}
912#endif 923#endif
913 924
914/* 925/*
@@ -929,7 +940,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
929 wakeup = 1; 940 wakeup = 1;
930 } 941 }
931 942
932 hrtick_start_fair(rq, rq->curr); 943 hrtick_update(rq);
933} 944}
934 945
935/* 946/*
@@ -951,7 +962,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
951 sleep = 1; 962 sleep = 1;
952 } 963 }
953 964
954 hrtick_start_fair(rq, rq->curr); 965 hrtick_update(rq);
955} 966}
956 967
957/* 968/*
@@ -971,6 +982,8 @@ static void yield_task_fair(struct rq *rq)
971 if (unlikely(cfs_rq->nr_running == 1)) 982 if (unlikely(cfs_rq->nr_running == 1))
972 return; 983 return;
973 984
985 clear_buddies(cfs_rq, se);
986
974 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { 987 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
975 update_rq_clock(rq); 988 update_rq_clock(rq);
976 /* 989 /*
@@ -1057,8 +1070,6 @@ static inline int wake_idle(int cpu, struct task_struct *p)
1057 1070
1058#ifdef CONFIG_SMP 1071#ifdef CONFIG_SMP
1059 1072
1060static const struct sched_class fair_sched_class;
1061
1062#ifdef CONFIG_FAIR_GROUP_SCHED 1073#ifdef CONFIG_FAIR_GROUP_SCHED
1063/* 1074/*
1064 * effective_load() calculates the load change as seen from the root_task_group 1075 * effective_load() calculates the load change as seen from the root_task_group
@@ -1085,7 +1096,6 @@ static long effective_load(struct task_group *tg, int cpu,
1085 long wl, long wg) 1096 long wl, long wg)
1086{ 1097{
1087 struct sched_entity *se = tg->se[cpu]; 1098 struct sched_entity *se = tg->se[cpu];
1088 long more_w;
1089 1099
1090 if (!tg->parent) 1100 if (!tg->parent)
1091 return wl; 1101 return wl;
@@ -1097,18 +1107,17 @@ static long effective_load(struct task_group *tg, int cpu,
1097 if (!wl && sched_feat(ASYM_EFF_LOAD)) 1107 if (!wl && sched_feat(ASYM_EFF_LOAD))
1098 return wl; 1108 return wl;
1099 1109
1100 /*
1101 * Instead of using this increment, also add the difference
1102 * between when the shares were last updated and now.
1103 */
1104 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1105 wl += more_w;
1106 wg += more_w;
1107
1108 for_each_sched_entity(se) { 1110 for_each_sched_entity(se) {
1109#define D(n) (likely(n) ? (n) : 1)
1110
1111 long S, rw, s, a, b; 1111 long S, rw, s, a, b;
1112 long more_w;
1113
1114 /*
1115 * Instead of using this increment, also add the difference
1116 * between when the shares were last updated and now.
1117 */
1118 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1119 wl += more_w;
1120 wg += more_w;
1112 1121
1113 S = se->my_q->tg->shares; 1122 S = se->my_q->tg->shares;
1114 s = se->my_q->shares; 1123 s = se->my_q->shares;
@@ -1117,7 +1126,11 @@ static long effective_load(struct task_group *tg, int cpu,
1117 a = S*(rw + wl); 1126 a = S*(rw + wl);
1118 b = S*rw + s*wg; 1127 b = S*rw + s*wg;
1119 1128
1120 wl = s*(a-b)/D(b); 1129 wl = s*(a-b);
1130
1131 if (likely(b))
1132 wl /= b;
1133
1121 /* 1134 /*
1122 * Assume the group is already running and will 1135 * Assume the group is already running and will
1123 * thus already be accounted for in the weight. 1136 * thus already be accounted for in the weight.
@@ -1126,7 +1139,6 @@ static long effective_load(struct task_group *tg, int cpu,
1126 * alter the group weight. 1139 * alter the group weight.
1127 */ 1140 */
1128 wg = 0; 1141 wg = 0;
1129#undef D
1130 } 1142 }
1131 1143
1132 return wl; 1144 return wl;
@@ -1143,7 +1155,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1143#endif 1155#endif
1144 1156
1145static int 1157static int
1146wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1158wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1147 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1159 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1148 int idx, unsigned long load, unsigned long this_load, 1160 int idx, unsigned long load, unsigned long this_load,
1149 unsigned int imbalance) 1161 unsigned int imbalance)
@@ -1158,6 +1170,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1158 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1170 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1159 return 0; 1171 return 0;
1160 1172
1173 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1174 p->se.avg_overlap > sysctl_sched_migration_cost))
1175 sync = 0;
1176
1161 /* 1177 /*
1162 * If sync wakeup then subtract the (maximum possible) 1178 * If sync wakeup then subtract the (maximum possible)
1163 * effect of the currently running task from the load 1179 * effect of the currently running task from the load
@@ -1182,17 +1198,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1182 * a reasonable amount of time then attract this newly 1198 * a reasonable amount of time then attract this newly
1183 * woken task: 1199 * woken task:
1184 */ 1200 */
1185 if (sync && balanced) { 1201 if (sync && balanced)
1186 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1202 return 1;
1187 p->se.avg_overlap < sysctl_sched_migration_cost)
1188 return 1;
1189 }
1190 1203
1191 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1204 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1192 tl_per_task = cpu_avg_load_per_task(this_cpu); 1205 tl_per_task = cpu_avg_load_per_task(this_cpu);
1193 1206
1194 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 1207 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
1195 balanced) { 1208 tl_per_task)) {
1196 /* 1209 /*
1197 * This domain has SD_WAKE_AFFINE and 1210 * This domain has SD_WAKE_AFFINE and
1198 * p is cache cold in this domain, and 1211 * p is cache cold in this domain, and
@@ -1211,16 +1224,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1211 struct sched_domain *sd, *this_sd = NULL; 1224 struct sched_domain *sd, *this_sd = NULL;
1212 int prev_cpu, this_cpu, new_cpu; 1225 int prev_cpu, this_cpu, new_cpu;
1213 unsigned long load, this_load; 1226 unsigned long load, this_load;
1214 struct rq *rq, *this_rq; 1227 struct rq *this_rq;
1215 unsigned int imbalance; 1228 unsigned int imbalance;
1216 int idx; 1229 int idx;
1217 1230
1218 prev_cpu = task_cpu(p); 1231 prev_cpu = task_cpu(p);
1219 rq = task_rq(p);
1220 this_cpu = smp_processor_id(); 1232 this_cpu = smp_processor_id();
1221 this_rq = cpu_rq(this_cpu); 1233 this_rq = cpu_rq(this_cpu);
1222 new_cpu = prev_cpu; 1234 new_cpu = prev_cpu;
1223 1235
1236 if (prev_cpu == this_cpu)
1237 goto out;
1224 /* 1238 /*
1225 * 'this_sd' is the first domain that both 1239 * 'this_sd' is the first domain that both
1226 * this_cpu and prev_cpu are present in: 1240 * this_cpu and prev_cpu are present in:
@@ -1248,13 +1262,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1248 load = source_load(prev_cpu, idx); 1262 load = source_load(prev_cpu, idx);
1249 this_load = target_load(this_cpu, idx); 1263 this_load = target_load(this_cpu, idx);
1250 1264
1251 if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1265 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
1252 load, this_load, imbalance)) 1266 load, this_load, imbalance))
1253 return this_cpu; 1267 return this_cpu;
1254 1268
1255 if (prev_cpu == this_cpu)
1256 goto out;
1257
1258 /* 1269 /*
1259 * Start passive balancing when half the imbalance_pct 1270 * Start passive balancing when half the imbalance_pct
1260 * limit is reached. 1271 * limit is reached.
@@ -1280,9 +1291,7 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1280 * More easily preempt - nice tasks, while not making it harder for 1291 * More easily preempt - nice tasks, while not making it harder for
1281 * + nice tasks. 1292 * + nice tasks.
1282 */ 1293 */
1283 if (sched_feat(ASYM_GRAN)) 1294 if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
1284 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
1285 else
1286 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); 1295 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1287 1296
1288 return gran; 1297 return gran;
@@ -1307,7 +1316,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1307{ 1316{
1308 s64 gran, vdiff = curr->vruntime - se->vruntime; 1317 s64 gran, vdiff = curr->vruntime - se->vruntime;
1309 1318
1310 if (vdiff < 0) 1319 if (vdiff <= 0)
1311 return -1; 1320 return -1;
1312 1321
1313 gran = wakeup_gran(curr); 1322 gran = wakeup_gran(curr);
@@ -1317,38 +1326,60 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1317 return 0; 1326 return 0;
1318} 1327}
1319 1328
1320/* return depth at which a sched entity is present in the hierarchy */ 1329static void set_last_buddy(struct sched_entity *se)
1321static inline int depth_se(struct sched_entity *se)
1322{ 1330{
1323 int depth = 0;
1324
1325 for_each_sched_entity(se) 1331 for_each_sched_entity(se)
1326 depth++; 1332 cfs_rq_of(se)->last = se;
1333}
1327 1334
1328 return depth; 1335static void set_next_buddy(struct sched_entity *se)
1336{
1337 for_each_sched_entity(se)
1338 cfs_rq_of(se)->next = se;
1329} 1339}
1330 1340
1331/* 1341/*
1332 * Preempt the current task with a newly woken task if needed: 1342 * Preempt the current task with a newly woken task if needed:
1333 */ 1343 */
1334static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1344static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1335{ 1345{
1336 struct task_struct *curr = rq->curr; 1346 struct task_struct *curr = rq->curr;
1337 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1338 struct sched_entity *se = &curr->se, *pse = &p->se; 1347 struct sched_entity *se = &curr->se, *pse = &p->se;
1339 int se_depth, pse_depth;
1340 1348
1341 if (unlikely(rt_prio(p->prio))) { 1349 if (unlikely(rt_prio(p->prio))) {
1350 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1351
1342 update_rq_clock(rq); 1352 update_rq_clock(rq);
1343 update_curr(cfs_rq); 1353 update_curr(cfs_rq);
1344 resched_task(curr); 1354 resched_task(curr);
1345 return; 1355 return;
1346 } 1356 }
1347 1357
1358 if (unlikely(p->sched_class != &fair_sched_class))
1359 return;
1360
1348 if (unlikely(se == pse)) 1361 if (unlikely(se == pse))
1349 return; 1362 return;
1350 1363
1351 cfs_rq_of(pse)->next = pse; 1364 /*
1365 * Only set the backward buddy when the current task is still on the
1366 * rq. This can happen when a wakeup gets interleaved with schedule on
1367 * the ->pre_schedule() or idle_balance() point, either of which can
1368 * drop the rq lock.
1369 *
1370 * Also, during early boot the idle thread is in the fair class; for
1371 * obvious reasons it's a bad idea to schedule back to the idle thread.
1372 */
1373 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1374 set_last_buddy(se);
1375 set_next_buddy(pse);
1376
1377 /*
1378 * We can come here with TIF_NEED_RESCHED already set from new task
1379 * wake up path.
1380 */
1381 if (test_tsk_need_resched(curr))
1382 return;
1352 1383
1353 /* 1384 /*
1354 * Batch tasks do not preempt (their preemption is driven by 1385 * Batch tasks do not preempt (their preemption is driven by
@@ -1360,34 +1391,26 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1360 if (!sched_feat(WAKEUP_PREEMPT)) 1391 if (!sched_feat(WAKEUP_PREEMPT))
1361 return; 1392 return;
1362 1393
1363 /* 1394 if (sched_feat(WAKEUP_OVERLAP) && (sync ||
1364 * preemption test can be made between sibling entities who are in the 1395 (se->avg_overlap < sysctl_sched_migration_cost &&
1365 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of 1396 pse->avg_overlap < sysctl_sched_migration_cost))) {
1366 * both tasks until we find their ancestors who are siblings of common 1397 resched_task(curr);
1367 * parent. 1398 return;
1368 */ 1399 }
1369 1400
1370 /* First walk up until both entities are at same depth */ 1401 find_matching_se(&se, &pse);
1371 se_depth = depth_se(se);
1372 pse_depth = depth_se(pse);
1373 1402
1374 while (se_depth > pse_depth) { 1403 while (se) {
1375 se_depth--; 1404 BUG_ON(!pse);
1376 se = parent_entity(se);
1377 }
1378 1405
1379 while (pse_depth > se_depth) { 1406 if (wakeup_preempt_entity(se, pse) == 1) {
1380 pse_depth--; 1407 resched_task(curr);
1381 pse = parent_entity(pse); 1408 break;
1382 } 1409 }
1383 1410
1384 while (!is_same_group(se, pse)) {
1385 se = parent_entity(se); 1411 se = parent_entity(se);
1386 pse = parent_entity(pse); 1412 pse = parent_entity(pse);
1387 } 1413 }
1388
1389 if (wakeup_preempt_entity(se, pse) == 1)
1390 resched_task(curr);
1391} 1414}
1392 1415
1393static struct task_struct *pick_next_task_fair(struct rq *rq) 1416static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1401,6 +1424,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1401 1424
1402 do { 1425 do {
1403 se = pick_next_entity(cfs_rq); 1426 se = pick_next_entity(cfs_rq);
1427 set_next_entity(cfs_rq, se);
1404 cfs_rq = group_cfs_rq(se); 1428 cfs_rq = group_cfs_rq(se);
1405 } while (cfs_rq); 1429 } while (cfs_rq);
1406 1430
@@ -1445,19 +1469,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1445 if (next == &cfs_rq->tasks) 1469 if (next == &cfs_rq->tasks)
1446 return NULL; 1470 return NULL;
1447 1471
1448 /* Skip over entities that are not tasks */ 1472 se = list_entry(next, struct sched_entity, group_node);
1449 do { 1473 p = task_of(se);
1450 se = list_entry(next, struct sched_entity, group_node); 1474 cfs_rq->balance_iterator = next->next;
1451 next = next->next;
1452 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1453
1454 if (next == &cfs_rq->tasks)
1455 return NULL;
1456
1457 cfs_rq->balance_iterator = next;
1458
1459 if (entity_is_task(se))
1460 p = task_of(se);
1461 1475
1462 return p; 1476 return p;
1463} 1477}
@@ -1507,7 +1521,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1507 rcu_read_lock(); 1521 rcu_read_lock();
1508 update_h_load(busiest_cpu); 1522 update_h_load(busiest_cpu);
1509 1523
1510 list_for_each_entry(tg, &task_groups, list) { 1524 list_for_each_entry_rcu(tg, &task_groups, list) {
1511 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; 1525 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1512 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 1526 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1513 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 1527 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1620,10 +1634,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1620 * 'current' within the tree based on its new key value. 1634 * 'current' within the tree based on its new key value.
1621 */ 1635 */
1622 swap(curr->vruntime, se->vruntime); 1636 swap(curr->vruntime, se->vruntime);
1637 resched_task(rq->curr);
1623 } 1638 }
1624 1639
1625 enqueue_task_fair(rq, p, 0); 1640 enqueue_task_fair(rq, p, 0);
1626 resched_task(rq->curr);
1627} 1641}
1628 1642
1629/* 1643/*
@@ -1642,7 +1656,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1642 if (p->prio > oldprio) 1656 if (p->prio > oldprio)
1643 resched_task(rq->curr); 1657 resched_task(rq->curr);
1644 } else 1658 } else
1645 check_preempt_curr(rq, p); 1659 check_preempt_curr(rq, p, 0);
1646} 1660}
1647 1661
1648/* 1662/*
@@ -1659,7 +1673,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
1659 if (running) 1673 if (running)
1660 resched_task(rq->curr); 1674 resched_task(rq->curr);
1661 else 1675 else
1662 check_preempt_curr(rq, p); 1676 check_preempt_curr(rq, p, 0);
1663} 1677}
1664 1678
1665/* Account for a task changing its policy or group. 1679/* Account for a task changing its policy or group.
@@ -1693,9 +1707,6 @@ static const struct sched_class fair_sched_class = {
1693 .enqueue_task = enqueue_task_fair, 1707 .enqueue_task = enqueue_task_fair,
1694 .dequeue_task = dequeue_task_fair, 1708 .dequeue_task = dequeue_task_fair,
1695 .yield_task = yield_task_fair, 1709 .yield_task = yield_task_fair,
1696#ifdef CONFIG_SMP
1697 .select_task_rq = select_task_rq_fair,
1698#endif /* CONFIG_SMP */
1699 1710
1700 .check_preempt_curr = check_preempt_wakeup, 1711 .check_preempt_curr = check_preempt_wakeup,
1701 1712
@@ -1703,6 +1714,8 @@ static const struct sched_class fair_sched_class = {
1703 .put_prev_task = put_prev_task_fair, 1714 .put_prev_task = put_prev_task_fair,
1704 1715
1705#ifdef CONFIG_SMP 1716#ifdef CONFIG_SMP
1717 .select_task_rq = select_task_rq_fair,
1718
1706 .load_balance = load_balance_fair, 1719 .load_balance = load_balance_fair,
1707 .move_one_task = move_one_task_fair, 1720 .move_one_task = move_one_task_fair,
1708#endif 1721#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 9353ca78154e..da5d93b5d2c6 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -5,9 +5,11 @@ SCHED_FEAT(START_DEBIT, 1)
5SCHED_FEAT(AFFINE_WAKEUPS, 1) 5SCHED_FEAT(AFFINE_WAKEUPS, 1)
6SCHED_FEAT(CACHE_HOT_BUDDY, 1) 6SCHED_FEAT(CACHE_HOT_BUDDY, 1)
7SCHED_FEAT(SYNC_WAKEUPS, 1) 7SCHED_FEAT(SYNC_WAKEUPS, 1)
8SCHED_FEAT(HRTICK, 1) 8SCHED_FEAT(HRTICK, 0)
9SCHED_FEAT(DOUBLE_TICK, 0) 9SCHED_FEAT(DOUBLE_TICK, 0)
10SCHED_FEAT(ASYM_GRAN, 1) 10SCHED_FEAT(ASYM_GRAN, 1)
11SCHED_FEAT(LB_BIAS, 1) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0)
15SCHED_FEAT(LAST_BUDDY, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..8a21a2e28c13 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
76 if (running) 76 if (running)
77 resched_task(rq->curr); 77 resched_task(rq->curr);
78 else 78 else
79 check_preempt_curr(rq, p); 79 check_preempt_curr(rq, p, 0);
80} 80}
81 81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p, 82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
93 if (p->prio > oldprio) 93 if (p->prio > oldprio)
94 resched_task(rq->curr); 94 resched_task(rq->curr);
95 } else 95 } else
96 check_preempt_curr(rq, p); 96 check_preempt_curr(rq, p, 0);
97} 97}
98 98
99/* 99/*
@@ -105,9 +105,6 @@ static const struct sched_class idle_sched_class = {
105 105
106 /* dequeue is not valid, we print a debug message there: */ 106 /* dequeue is not valid, we print a debug message there: */
107 .dequeue_task = dequeue_task_idle, 107 .dequeue_task = dequeue_task_idle,
108#ifdef CONFIG_SMP
109 .select_task_rq = select_task_rq_idle,
110#endif /* CONFIG_SMP */
111 108
112 .check_preempt_curr = check_preempt_curr_idle, 109 .check_preempt_curr = check_preempt_curr_idle,
113 110
@@ -115,6 +112,8 @@ static const struct sched_class idle_sched_class = {
115 .put_prev_task = put_prev_task_idle, 112 .put_prev_task = put_prev_task_idle,
116 113
117#ifdef CONFIG_SMP 114#ifdef CONFIG_SMP
115 .select_task_rq = select_task_rq_idle,
116
118 .load_balance = load_balance_idle, 117 .load_balance = load_balance_idle,
119 .move_one_task = move_one_task_idle, 118 .move_one_task = move_one_task_idle,
120#endif 119#endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1113157b2058..d9ba9d5f99d6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
102 102
103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
104{ 104{
105 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
105 struct sched_rt_entity *rt_se = rt_rq->rt_se; 106 struct sched_rt_entity *rt_se = rt_rq->rt_se;
106 107
107 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { 108 if (rt_rq->rt_nr_running) {
108 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 109 if (rt_se && !on_rt_rq(rt_se))
109 110 enqueue_rt_entity(rt_se);
110 enqueue_rt_entity(rt_se);
111 if (rt_rq->highest_prio < curr->prio) 111 if (rt_rq->highest_prio < curr->prio)
112 resched_task(curr); 112 resched_task(curr);
113 } 113 }
@@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
231#endif /* CONFIG_RT_GROUP_SCHED */ 231#endif /* CONFIG_RT_GROUP_SCHED */
232 232
233#ifdef CONFIG_SMP 233#ifdef CONFIG_SMP
234/*
235 * We ran out of runtime, see if we can borrow some from our neighbours.
236 */
234static int do_balance_runtime(struct rt_rq *rt_rq) 237static int do_balance_runtime(struct rt_rq *rt_rq)
235{ 238{
236 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 239 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
250 continue; 253 continue;
251 254
252 spin_lock(&iter->rt_runtime_lock); 255 spin_lock(&iter->rt_runtime_lock);
256 /*
257 * Either all rqs have inf runtime and there's nothing to steal
258 * or __disable_runtime() below sets a specific rq to inf to
259 * indicate its been disabled and disalow stealing.
260 */
253 if (iter->rt_runtime == RUNTIME_INF) 261 if (iter->rt_runtime == RUNTIME_INF)
254 goto next; 262 goto next;
255 263
264 /*
265 * From runqueues with spare time, take 1/n part of their
266 * spare time, but no more than our period.
267 */
256 diff = iter->rt_runtime - iter->rt_time; 268 diff = iter->rt_runtime - iter->rt_time;
257 if (diff > 0) { 269 if (diff > 0) {
258 diff = div_u64((u64)diff, weight); 270 diff = div_u64((u64)diff, weight);
@@ -274,6 +286,9 @@ next:
274 return more; 286 return more;
275} 287}
276 288
289/*
290 * Ensure this RQ takes back all the runtime it lent to its neighbours.
291 */
277static void __disable_runtime(struct rq *rq) 292static void __disable_runtime(struct rq *rq)
278{ 293{
279 struct root_domain *rd = rq->rd; 294 struct root_domain *rd = rq->rd;
@@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
289 304
290 spin_lock(&rt_b->rt_runtime_lock); 305 spin_lock(&rt_b->rt_runtime_lock);
291 spin_lock(&rt_rq->rt_runtime_lock); 306 spin_lock(&rt_rq->rt_runtime_lock);
307 /*
308 * Either we're all inf and nobody needs to borrow, or we're
309 * already disabled and thus have nothing to do, or we have
310 * exactly the right amount of runtime to take out.
311 */
292 if (rt_rq->rt_runtime == RUNTIME_INF || 312 if (rt_rq->rt_runtime == RUNTIME_INF ||
293 rt_rq->rt_runtime == rt_b->rt_runtime) 313 rt_rq->rt_runtime == rt_b->rt_runtime)
294 goto balanced; 314 goto balanced;
295 spin_unlock(&rt_rq->rt_runtime_lock); 315 spin_unlock(&rt_rq->rt_runtime_lock);
296 316
317 /*
318 * Calculate the difference between what we started out with
319 * and what we currently have, that's the amount of runtime
320 * we lent and now have to reclaim.
321 */
297 want = rt_b->rt_runtime - rt_rq->rt_runtime; 322 want = rt_b->rt_runtime - rt_rq->rt_runtime;
298 323
324 /*
325 * Greedy reclaim, take back as much as we can.
326 */
299 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu_mask(i, rd->span) {
300 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
301 s64 diff; 329 s64 diff;
302 330
331 /*
332 * Can't reclaim from ourselves or disabled runqueues.
333 */
303 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 334 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
304 continue; 335 continue;
305 336
@@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
319 } 350 }
320 351
321 spin_lock(&rt_rq->rt_runtime_lock); 352 spin_lock(&rt_rq->rt_runtime_lock);
353 /*
354 * We cannot be left wanting - that would mean some runtime
355 * leaked out of the system.
356 */
322 BUG_ON(want); 357 BUG_ON(want);
323balanced: 358balanced:
359 /*
360 * Disable all the borrow logic by pretending we have inf
361 * runtime - in which case borrowing doesn't make sense.
362 */
324 rt_rq->rt_runtime = RUNTIME_INF; 363 rt_rq->rt_runtime = RUNTIME_INF;
325 spin_unlock(&rt_rq->rt_runtime_lock); 364 spin_unlock(&rt_rq->rt_runtime_lock);
326 spin_unlock(&rt_b->rt_runtime_lock); 365 spin_unlock(&rt_b->rt_runtime_lock);
@@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
343 if (unlikely(!scheduler_running)) 382 if (unlikely(!scheduler_running))
344 return; 383 return;
345 384
385 /*
386 * Reset each runqueue's bandwidth settings
387 */
346 for_each_leaf_rt_rq(rt_rq, rq) { 388 for_each_leaf_rt_rq(rt_rq, rq) {
347 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 389 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
348 390
@@ -389,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
389 int i, idle = 1; 431 int i, idle = 1;
390 cpumask_t span; 432 cpumask_t span;
391 433
392 if (rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
393 return 1; 435 return 1;
394 436
395 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
@@ -484,9 +526,14 @@ static void update_curr_rt(struct rq *rq)
484 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 526 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
485 527
486 curr->se.sum_exec_runtime += delta_exec; 528 curr->se.sum_exec_runtime += delta_exec;
529 account_group_exec_runtime(curr, delta_exec);
530
487 curr->se.exec_start = rq->clock; 531 curr->se.exec_start = rq->clock;
488 cpuacct_charge(curr, delta_exec); 532 cpuacct_charge(curr, delta_exec);
489 533
534 if (!rt_bandwidth_enabled())
535 return;
536
490 for_each_sched_rt_entity(rt_se) { 537 for_each_sched_rt_entity(rt_se) {
491 rt_rq = rt_rq_of_se(rt_se); 538 rt_rq = rt_rq_of_se(rt_se);
492 539
@@ -784,7 +831,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
784/* 831/*
785 * Preempt the current task with a newly woken task if needed: 832 * Preempt the current task with a newly woken task if needed:
786 */ 833 */
787static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 834static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
788{ 835{
789 if (p->prio < rq->curr->prio) { 836 if (p->prio < rq->curr->prio) {
790 resched_task(rq->curr); 837 resched_task(rq->curr);
@@ -1413,7 +1460,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1413 p->rt.timeout++; 1460 p->rt.timeout++;
1414 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 1461 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1415 if (p->rt.timeout > next) 1462 if (p->rt.timeout > next)
1416 p->it_sched_expires = p->se.sum_exec_runtime; 1463 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1417 } 1464 }
1418} 1465}
1419 1466
@@ -1457,9 +1504,6 @@ static const struct sched_class rt_sched_class = {
1457 .enqueue_task = enqueue_task_rt, 1504 .enqueue_task = enqueue_task_rt,
1458 .dequeue_task = dequeue_task_rt, 1505 .dequeue_task = dequeue_task_rt,
1459 .yield_task = yield_task_rt, 1506 .yield_task = yield_task_rt,
1460#ifdef CONFIG_SMP
1461 .select_task_rq = select_task_rq_rt,
1462#endif /* CONFIG_SMP */
1463 1507
1464 .check_preempt_curr = check_preempt_curr_rt, 1508 .check_preempt_curr = check_preempt_curr_rt,
1465 1509
@@ -1467,6 +1511,8 @@ static const struct sched_class rt_sched_class = {
1467 .put_prev_task = put_prev_task_rt, 1511 .put_prev_task = put_prev_task_rt,
1468 1512
1469#ifdef CONFIG_SMP 1513#ifdef CONFIG_SMP
1514 .select_task_rq = select_task_rq_rt,
1515
1470 .load_balance = load_balance_rt, 1516 .load_balance = load_balance_rt,
1471 .move_one_task = move_one_task_rt, 1517 .move_one_task = move_one_task_rt,
1472 .set_cpus_allowed = set_cpus_allowed_rt, 1518 .set_cpus_allowed = set_cpus_allowed_rt,
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8385d43987e2..7dbf72a2b02c 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -9,7 +9,7 @@
9static int show_schedstat(struct seq_file *seq, void *v) 9static int show_schedstat(struct seq_file *seq, void *v)
10{ 10{
11 int cpu; 11 int cpu;
12 int mask_len = NR_CPUS/32 * 9; 12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL); 13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14 14
15 if (mask_str == NULL) 15 if (mask_str == NULL)
@@ -90,13 +90,20 @@ static int schedstat_open(struct inode *inode, struct file *file)
90 return res; 90 return res;
91} 91}
92 92
93const struct file_operations proc_schedstat_operations = { 93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open, 94 .open = schedstat_open,
95 .read = seq_read, 95 .read = seq_read,
96 .llseek = seq_lseek, 96 .llseek = seq_lseek,
97 .release = single_release, 97 .release = single_release,
98}; 98};
99 99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106
100/* 107/*
101 * Expects runqueue lock to be held for atomicity of update 108 * Expects runqueue lock to be held for atomicity of update
102 */ 109 */
@@ -270,3 +277,96 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
270#define sched_info_switch(t, next) do { } while (0) 277#define sched_info_switch(t, next) do { } while (0)
271#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 278#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
272 279
280/*
281 * The following are functions that support scheduler-internal time accounting.
282 * These functions are generally called at the timer tick. None of this depends
283 * on CONFIG_SCHEDSTATS.
284 */
285
286/**
287 * account_group_user_time - Maintain utime for a thread group.
288 *
289 * @tsk: Pointer to task structure.
290 * @cputime: Time value by which to increment the utime field of the
291 * thread_group_cputime structure.
292 *
293 * If thread group time is being maintained, get the structure for the
294 * running CPU and update the utime field there.
295 */
296static inline void account_group_user_time(struct task_struct *tsk,
297 cputime_t cputime)
298{
299 struct signal_struct *sig;
300
301 /* tsk == current, ensure it is safe to use ->signal */
302 if (unlikely(tsk->exit_state))
303 return;
304
305 sig = tsk->signal;
306 if (sig->cputime.totals) {
307 struct task_cputime *times;
308
309 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
310 times->utime = cputime_add(times->utime, cputime);
311 put_cpu_no_resched();
312 }
313}
314
315/**
316 * account_group_system_time - Maintain stime for a thread group.
317 *
318 * @tsk: Pointer to task structure.
319 * @cputime: Time value by which to increment the stime field of the
320 * thread_group_cputime structure.
321 *
322 * If thread group time is being maintained, get the structure for the
323 * running CPU and update the stime field there.
324 */
325static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime)
327{
328 struct signal_struct *sig;
329
330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state))
332 return;
333
334 sig = tsk->signal;
335 if (sig->cputime.totals) {
336 struct task_cputime *times;
337
338 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
339 times->stime = cputime_add(times->stime, cputime);
340 put_cpu_no_resched();
341 }
342}
343
344/**
345 * account_group_exec_runtime - Maintain exec runtime for a thread group.
346 *
347 * @tsk: Pointer to task structure.
348 * @ns: Time value by which to increment the sum_exec_runtime field
349 * of the thread_group_cputime structure.
350 *
351 * If thread group time is being maintained, get the structure for the
352 * running CPU and update the sum_exec_runtime field there.
353 */
354static inline void account_group_exec_runtime(struct task_struct *tsk,
355 unsigned long long ns)
356{
357 struct signal_struct *sig;
358
359 sig = tsk->signal;
360 /* see __exit_signal()->task_rq_unlock_wait() */
361 barrier();
362 if (unlikely(!sig))
363 return;
364
365 if (sig->cputime.totals) {
366 struct task_cputime *times;
367
368 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
369 times->sum_exec_runtime += ns;
370 put_cpu_no_resched();
371 }
372}
diff --git a/kernel/signal.c b/kernel/signal.c
index e661b01d340f..4530fc654455 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,6 +27,7 @@
27#include <linux/freezer.h> 27#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <trace/sched.h>
30 31
31#include <asm/param.h> 32#include <asm/param.h>
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
@@ -803,6 +804,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
803 struct sigpending *pending; 804 struct sigpending *pending;
804 struct sigqueue *q; 805 struct sigqueue *q;
805 806
807 trace_sched_signal_send(sig, t);
808
806 assert_spin_locked(&t->sighand->siglock); 809 assert_spin_locked(&t->sighand->siglock);
807 if (!prepare_signal(sig, t)) 810 if (!prepare_signal(sig, t))
808 return 0; 811 return 0;
@@ -1141,7 +1144,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1141 struct task_struct * p; 1144 struct task_struct * p;
1142 1145
1143 for_each_process(p) { 1146 for_each_process(p) {
1144 if (p->pid > 1 && !same_thread_group(p, current)) { 1147 if (task_pid_vnr(p) > 1 &&
1148 !same_thread_group(p, current)) {
1145 int err = group_send_sig_info(sig, info, p); 1149 int err = group_send_sig_info(sig, info, p);
1146 ++count; 1150 ++count;
1147 if (err != -EPERM) 1151 if (err != -EPERM)
@@ -1338,6 +1342,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1338 struct siginfo info; 1342 struct siginfo info;
1339 unsigned long flags; 1343 unsigned long flags;
1340 struct sighand_struct *psig; 1344 struct sighand_struct *psig;
1345 struct task_cputime cputime;
1341 int ret = sig; 1346 int ret = sig;
1342 1347
1343 BUG_ON(sig == -1); 1348 BUG_ON(sig == -1);
@@ -1368,10 +1373,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1368 1373
1369 info.si_uid = tsk->uid; 1374 info.si_uid = tsk->uid;
1370 1375
1371 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1376 thread_group_cputime(tsk, &cputime);
1372 tsk->signal->utime)); 1377 info.si_utime = cputime_to_jiffies(cputime.utime);
1373 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, 1378 info.si_stime = cputime_to_jiffies(cputime.stime);
1374 tsk->signal->stime));
1375 1379
1376 info.si_status = tsk->exit_code & 0x7f; 1380 info.si_status = tsk->exit_code & 0x7f;
1377 if (tsk->exit_code & 0x80) 1381 if (tsk->exit_code & 0x80)
diff --git a/kernel/smp.c b/kernel/smp.c
index f362a8553777..75c8dde58c55 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data)
51{ 51{
52 /* Wait for response */ 52 /* Wait for response */
53 do { 53 do {
54 /*
55 * We need to see the flags store in the IPI handler
56 */
57 smp_mb();
58 if (!(data->flags & CSD_FLAG_WAIT)) 54 if (!(data->flags & CSD_FLAG_WAIT))
59 break; 55 break;
60 cpu_relax(); 56 cpu_relax();
@@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
76 list_add_tail(&data->list, &dst->list); 72 list_add_tail(&data->list, &dst->list);
77 spin_unlock_irqrestore(&dst->lock, flags); 73 spin_unlock_irqrestore(&dst->lock, flags);
78 74
75 /*
76 * Make the list addition visible before sending the ipi.
77 */
78 smp_mb();
79
79 if (ipi) 80 if (ipi)
80 arch_send_call_function_single_ipi(cpu); 81 arch_send_call_function_single_ipi(cpu);
81 82
@@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void)
157 * Need to see other stores to list head for checking whether 158 * Need to see other stores to list head for checking whether
158 * list is empty without holding q->lock 159 * list is empty without holding q->lock
159 */ 160 */
160 smp_mb(); 161 smp_read_barrier_depends();
161 while (!list_empty(&q->list)) { 162 while (!list_empty(&q->list)) {
162 unsigned int data_flags; 163 unsigned int data_flags;
163 164
@@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void)
191 /* 192 /*
192 * See comment on outer loop 193 * See comment on outer loop
193 */ 194 */
194 smp_mb(); 195 smp_read_barrier_depends();
195 } 196 }
196} 197}
197 198
@@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
370 list_add_tail_rcu(&data->csd.list, &call_function_queue); 371 list_add_tail_rcu(&data->csd.list, &call_function_queue);
371 spin_unlock_irqrestore(&call_function_lock, flags); 372 spin_unlock_irqrestore(&call_function_lock, flags);
372 373
374 /*
375 * Make the list addition visible before sending the ipi.
376 */
377 smp_mb();
378
373 /* Send a message to all CPUs in the map */ 379 /* Send a message to all CPUs in the map */
374 arch_send_call_function_ipi(mask); 380 arch_send_call_function_ipi(mask);
375 381
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c506f266a6b9..e7c69a720d69 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,6 +6,8 @@
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 *
10 * Remote softirq infrastructure is by Jens Axboe.
9 */ 11 */
10 12
11#include <linux/module.h> 13#include <linux/module.h>
@@ -46,7 +48,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
46EXPORT_SYMBOL(irq_stat); 48EXPORT_SYMBOL(irq_stat);
47#endif 49#endif
48 50
49static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; 51static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
50 52
51static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 53static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
52 54
@@ -205,7 +207,18 @@ restart:
205 207
206 do { 208 do {
207 if (pending & 1) { 209 if (pending & 1) {
210 int prev_count = preempt_count();
211
208 h->action(h); 212 h->action(h);
213
214 if (unlikely(prev_count != preempt_count())) {
215 printk(KERN_ERR "huh, entered softirq %td %p"
216 "with preempt_count %08x,"
217 " exited with %08x?\n", h - softirq_vec,
218 h->action, prev_count, preempt_count());
219 preempt_count() = prev_count;
220 }
221
209 rcu_bh_qsctr_inc(cpu); 222 rcu_bh_qsctr_inc(cpu);
210 } 223 }
211 h++; 224 h++;
@@ -254,16 +267,13 @@ asmlinkage void do_softirq(void)
254 */ 267 */
255void irq_enter(void) 268void irq_enter(void)
256{ 269{
257#ifdef CONFIG_NO_HZ
258 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
259 if (idle_cpu(cpu) && !in_interrupt()) 271
260 tick_nohz_stop_idle(cpu); 272 if (idle_cpu(cpu) && !in_interrupt()) {
261#endif 273 __irq_enter();
262 __irq_enter(); 274 tick_check_idle(cpu);
263#ifdef CONFIG_NO_HZ 275 } else
264 if (idle_cpu(cpu)) 276 __irq_enter();
265 tick_nohz_update_jiffies();
266#endif
267} 277}
268 278
269#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 279#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -463,17 +473,144 @@ void tasklet_kill(struct tasklet_struct *t)
463 473
464EXPORT_SYMBOL(tasklet_kill); 474EXPORT_SYMBOL(tasklet_kill);
465 475
476DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
477EXPORT_PER_CPU_SYMBOL(softirq_work_list);
478
479static void __local_trigger(struct call_single_data *cp, int softirq)
480{
481 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
482
483 list_add_tail(&cp->list, head);
484
485 /* Trigger the softirq only if the list was previously empty. */
486 if (head->next == &cp->list)
487 raise_softirq_irqoff(softirq);
488}
489
#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
/*
 * Function installed in cp->func by __try_remote_softirq().  @data is the
 * call_single_data itself; it is queued on the local work list of the
 * softirq number kept in cp->priv, with local interrupts disabled.
 */
static void remote_softirq_receive(void *data)
{
	struct call_single_data *cp = data;
	int softirq = cp->priv;
	unsigned long flags;

	local_irq_save(flags);
	__local_trigger(cp, softirq);
	local_irq_restore(flags);
}

/*
 * Hand @cp over to @cpu through __smp_call_function_single().
 *
 * Returns 0 when the request was issued and 1 when @cpu is not online,
 * in which case @cp has not been touched.
 */
static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
{
	if (!cpu_online(cpu))
		return 1;

	cp->func = remote_softirq_receive;
	cp->info = cp;
	cp->flags = 0;
	cp->priv = softirq;

	__smp_call_function_single(cpu, cp);
	return 0;
}
#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
/* No generic SMP helpers: always report that nothing was sent (1). */
static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
{
	return 1;
}
#endif
523
/**
 * __send_remote_softirq - try to schedule softirq work on a remote cpu
 * @cp: private SMP call function data area
 * @cpu: the remote cpu
 * @this_cpu: the currently executing cpu
 * @softirq: the softirq for the work
 *
 * Attempt to schedule softirq work on a remote cpu.  If @cpu is the
 * current cpu, or the remote attempt reports failure, the work is
 * queued up on the local cpu instead.
 *
 * Interrupts must be disabled.
 */
void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
{
	/* Done once the work has been handed to a different cpu. */
	if (cpu != this_cpu && !__try_remote_softirq(cp, cpu, softirq))
		return;

	__local_trigger(cp, softirq);
}
EXPORT_SYMBOL(__send_remote_softirq);
542
/**
 * send_remote_softirq - try to schedule softirq work on a remote cpu
 * @cp: private SMP call function data area
 * @cpu: the remote cpu
 * @softirq: the softirq for the work
 *
 * Wrapper around __send_remote_softirq() that disables interrupts and
 * looks up the current cpu on behalf of the caller.
 */
void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
{
	unsigned long flags;

	local_irq_save(flags);
	__send_remote_softirq(cp, cpu, smp_processor_id(), softirq);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(send_remote_softirq);
563
564static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
565 unsigned long action, void *hcpu)
566{
567 /*
568 * If a CPU goes away, splice its entries to the current CPU
569 * and trigger a run of the softirq
570 */
571 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
572 int cpu = (unsigned long) hcpu;
573 int i;
574
575 local_irq_disable();
576 for (i = 0; i < NR_SOFTIRQS; i++) {
577 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
578 struct list_head *local_head;
579
580 if (list_empty(head))
581 continue;
582
583 local_head = &__get_cpu_var(softirq_work_list[i]);
584 list_splice_init(head, local_head);
585 raise_softirq_irqoff(i);
586 }
587 local_irq_enable();
588 }
589
590 return NOTIFY_OK;
591}
592
593static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
594 .notifier_call = remote_softirq_cpu_notify,
595};
596
466void __init softirq_init(void) 597void __init softirq_init(void)
467{ 598{
468 int cpu; 599 int cpu;
469 600
470 for_each_possible_cpu(cpu) { 601 for_each_possible_cpu(cpu) {
602 int i;
603
471 per_cpu(tasklet_vec, cpu).tail = 604 per_cpu(tasklet_vec, cpu).tail =
472 &per_cpu(tasklet_vec, cpu).head; 605 &per_cpu(tasklet_vec, cpu).head;
473 per_cpu(tasklet_hi_vec, cpu).tail = 606 per_cpu(tasklet_hi_vec, cpu).tail =
474 &per_cpu(tasklet_hi_vec, cpu).head; 607 &per_cpu(tasklet_hi_vec, cpu).head;
608 for (i = 0; i < NR_SOFTIRQS; i++)
609 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
475 } 610 }
476 611
612 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
613
477 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 614 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
478 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 615 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
479} 616}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index cb838ee93a82..3953e4aed733 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
226 * If the system crashed already then all bets are off, 226 * If the system crashed already then all bets are off,
227 * do not report extra hung tasks: 227 * do not report extra hung tasks:
228 */ 228 */
229 if ((tainted & TAINT_DIE) || did_panic) 229 if (test_taint(TAINT_DIE) || did_panic)
230 return; 230 return;
231 231
232 read_lock(&tasklist_lock); 232 read_lock(&tasklist_lock);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index af3c7cea258b..24e8ceacc388 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -37,9 +37,13 @@ struct stop_machine_data {
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
38static unsigned int num_threads; 38static unsigned int num_threads;
39static atomic_t thread_ack; 39static atomic_t thread_ack;
40static struct completion finished;
41static DEFINE_MUTEX(lock); 40static DEFINE_MUTEX(lock);
42 41
42static struct workqueue_struct *stop_machine_wq;
43static struct stop_machine_data active, idle;
44static const cpumask_t *active_cpus;
45static void *stop_machine_work;
46
43static void set_state(enum stopmachine_state newstate) 47static void set_state(enum stopmachine_state newstate)
44{ 48{
45 /* Reset ack counter. */ 49 /* Reset ack counter. */
@@ -51,21 +55,26 @@ static void set_state(enum stopmachine_state newstate)
51/* Last one to ack a state moves to the next state. */ 55/* Last one to ack a state moves to the next state. */
52static void ack_state(void) 56static void ack_state(void)
53{ 57{
54 if (atomic_dec_and_test(&thread_ack)) { 58 if (atomic_dec_and_test(&thread_ack))
55 /* If we're the last one to ack the EXIT, we're finished. */ 59 set_state(state + 1);
56 if (state == STOPMACHINE_EXIT)
57 complete(&finished);
58 else
59 set_state(state + 1);
60 }
61} 60}
62 61
63/* This is the actual thread which stops the CPU. It exits by itself rather 62/* This is the actual function which stops the CPU. It runs
64 * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ 63 * in the context of a dedicated stopmachine workqueue. */
65static int stop_cpu(struct stop_machine_data *smdata) 64static void stop_cpu(struct work_struct *unused)
66{ 65{
67 enum stopmachine_state curstate = STOPMACHINE_NONE; 66 enum stopmachine_state curstate = STOPMACHINE_NONE;
68 67 struct stop_machine_data *smdata = &idle;
68 int cpu = smp_processor_id();
69 int err;
70
71 if (!active_cpus) {
72 if (cpu == first_cpu(cpu_online_map))
73 smdata = &active;
74 } else {
75 if (cpu_isset(cpu, *active_cpus))
76 smdata = &active;
77 }
69 /* Simple state machine */ 78 /* Simple state machine */
70 do { 79 do {
71 /* Chill out and ensure we re-read stopmachine_state. */ 80 /* Chill out and ensure we re-read stopmachine_state. */
@@ -78,9 +87,11 @@ static int stop_cpu(struct stop_machine_data *smdata)
78 hard_irq_disable(); 87 hard_irq_disable();
79 break; 88 break;
80 case STOPMACHINE_RUN: 89 case STOPMACHINE_RUN:
81 /* |= allows error detection if functions on 90 /* On multiple CPUs only a single error code
82 * multiple CPUs. */ 91 * is needed to tell that something failed. */
83 smdata->fnret |= smdata->fn(smdata->data); 92 err = smdata->fn(smdata->data);
93 if (err)
94 smdata->fnret = err;
84 break; 95 break;
85 default: 96 default:
86 break; 97 break;
@@ -90,7 +101,6 @@ static int stop_cpu(struct stop_machine_data *smdata)
90 } while (curstate != STOPMACHINE_EXIT); 101 } while (curstate != STOPMACHINE_EXIT);
91 102
92 local_irq_enable(); 103 local_irq_enable();
93 do_exit(0);
94} 104}
95 105
96/* Callback for CPUs which aren't supposed to do anything. */ 106/* Callback for CPUs which aren't supposed to do anything. */
@@ -101,78 +111,35 @@ static int chill(void *unused)
101 111
102int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
103{ 113{
104 int i, err; 114 struct work_struct *sm_work;
105 struct stop_machine_data active, idle; 115 int i, ret;
106 struct task_struct **threads;
107 116
117 /* Set up initial state. */
118 mutex_lock(&lock);
119 num_threads = num_online_cpus();
120 active_cpus = cpus;
108 active.fn = fn; 121 active.fn = fn;
109 active.data = data; 122 active.data = data;
110 active.fnret = 0; 123 active.fnret = 0;
111 idle.fn = chill; 124 idle.fn = chill;
112 idle.data = NULL; 125 idle.data = NULL;
113 126
114 /* This could be too big for stack on large machines. */
115 threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
116 if (!threads)
117 return -ENOMEM;
118
119 /* Set up initial state. */
120 mutex_lock(&lock);
121 init_completion(&finished);
122 num_threads = num_online_cpus();
123 set_state(STOPMACHINE_PREPARE); 127 set_state(STOPMACHINE_PREPARE);
124 128
125 for_each_online_cpu(i) { 129 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
126 struct stop_machine_data *smdata = &idle;
127 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
128
129 if (!cpus) {
130 if (i == first_cpu(cpu_online_map))
131 smdata = &active;
132 } else {
133 if (cpu_isset(i, *cpus))
134 smdata = &active;
135 }
136
137 threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u",
138 i);
139 if (IS_ERR(threads[i])) {
140 err = PTR_ERR(threads[i]);
141 threads[i] = NULL;
142 goto kill_threads;
143 }
144
145 /* Place it onto correct cpu. */
146 kthread_bind(threads[i], i);
147
148 /* Make it highest prio. */
149 if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param))
150 BUG();
151 }
152
153 /* We've created all the threads. Wake them all: hold this CPU so one
154 * doesn't hit this CPU until we're ready. */ 130 * doesn't hit this CPU until we're ready. */
155 get_cpu(); 131 get_cpu();
156 for_each_online_cpu(i) 132 for_each_online_cpu(i) {
157 wake_up_process(threads[i]); 133 sm_work = percpu_ptr(stop_machine_work, i);
158 134 INIT_WORK(sm_work, stop_cpu);
135 queue_work_on(i, stop_machine_wq, sm_work);
136 }
159 /* This will release the thread on our CPU. */ 137 /* This will release the thread on our CPU. */
160 put_cpu(); 138 put_cpu();
161 wait_for_completion(&finished); 139 flush_workqueue(stop_machine_wq);
162 mutex_unlock(&lock); 140 ret = active.fnret;
163
164 kfree(threads);
165
166 return active.fnret;
167
168kill_threads:
169 for_each_online_cpu(i)
170 if (threads[i])
171 kthread_stop(threads[i]);
172 mutex_unlock(&lock); 141 mutex_unlock(&lock);
173 142 return ret;
174 kfree(threads);
175 return err;
176} 143}
177 144
178int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 145int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
@@ -187,3 +154,11 @@ int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
187 return ret; 154 return ret;
188} 155}
189EXPORT_SYMBOL_GPL(stop_machine); 156EXPORT_SYMBOL_GPL(stop_machine);
157
158static int __init stop_machine_init(void)
159{
160 stop_machine_wq = create_rt_workqueue("kstop");
161 stop_machine_work = alloc_percpu(struct work_struct);
162 return 0;
163}
164core_initcall(stop_machine_init);
diff --git a/kernel/sys.c b/kernel/sys.c
index 038a7bc0901d..31deba8f7d16 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid)
853 return old_fsgid; 853 return old_fsgid;
854} 854}
855 855
856void do_sys_times(struct tms *tms)
857{
858 struct task_cputime cputime;
859 cputime_t cutime, cstime;
860
861 spin_lock_irq(&current->sighand->siglock);
862 thread_group_cputime(current, &cputime);
863 cutime = current->signal->cutime;
864 cstime = current->signal->cstime;
865 spin_unlock_irq(&current->sighand->siglock);
866 tms->tms_utime = cputime_to_clock_t(cputime.utime);
867 tms->tms_stime = cputime_to_clock_t(cputime.stime);
868 tms->tms_cutime = cputime_to_clock_t(cutime);
869 tms->tms_cstime = cputime_to_clock_t(cstime);
870}
871
856asmlinkage long sys_times(struct tms __user * tbuf) 872asmlinkage long sys_times(struct tms __user * tbuf)
857{ 873{
858 /*
859 * In the SMP world we might just be unlucky and have one of
860 * the times increment as we use it. Since the value is an
861 * atomically safe type this is just fine. Conceptually its
862 * as if the syscall took an instant longer to occur.
863 */
864 if (tbuf) { 874 if (tbuf) {
865 struct tms tmp; 875 struct tms tmp;
866 struct task_struct *tsk = current; 876
867 struct task_struct *t; 877 do_sys_times(&tmp);
868 cputime_t utime, stime, cutime, cstime;
869
870 spin_lock_irq(&tsk->sighand->siglock);
871 utime = tsk->signal->utime;
872 stime = tsk->signal->stime;
873 t = tsk;
874 do {
875 utime = cputime_add(utime, t->utime);
876 stime = cputime_add(stime, t->stime);
877 t = next_thread(t);
878 } while (t != tsk);
879
880 cutime = tsk->signal->cutime;
881 cstime = tsk->signal->cstime;
882 spin_unlock_irq(&tsk->sighand->siglock);
883
884 tmp.tms_utime = cputime_to_clock_t(utime);
885 tmp.tms_stime = cputime_to_clock_t(stime);
886 tmp.tms_cutime = cputime_to_clock_t(cutime);
887 tmp.tms_cstime = cputime_to_clock_t(cstime);
888 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 878 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
889 return -EFAULT; 879 return -EFAULT;
890 } 880 }
@@ -1060,9 +1050,7 @@ asmlinkage long sys_setsid(void)
1060 group_leader->signal->leader = 1; 1050 group_leader->signal->leader = 1;
1061 __set_special_pids(sid); 1051 __set_special_pids(sid);
1062 1052
1063 spin_lock(&group_leader->sighand->siglock); 1053 proc_clear_tty(group_leader);
1064 group_leader->signal->tty = NULL;
1065 spin_unlock(&group_leader->sighand->siglock);
1066 1054
1067 err = session; 1055 err = session;
1068out: 1056out:
@@ -1351,8 +1339,10 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1351 down_write(&uts_sem); 1339 down_write(&uts_sem);
1352 errno = -EFAULT; 1340 errno = -EFAULT;
1353 if (!copy_from_user(tmp, name, len)) { 1341 if (!copy_from_user(tmp, name, len)) {
1354 memcpy(utsname()->nodename, tmp, len); 1342 struct new_utsname *u = utsname();
1355 utsname()->nodename[len] = 0; 1343
1344 memcpy(u->nodename, tmp, len);
1345 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1356 errno = 0; 1346 errno = 0;
1357 } 1347 }
1358 up_write(&uts_sem); 1348 up_write(&uts_sem);
@@ -1364,15 +1354,17 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1364asmlinkage long sys_gethostname(char __user *name, int len) 1354asmlinkage long sys_gethostname(char __user *name, int len)
1365{ 1355{
1366 int i, errno; 1356 int i, errno;
1357 struct new_utsname *u;
1367 1358
1368 if (len < 0) 1359 if (len < 0)
1369 return -EINVAL; 1360 return -EINVAL;
1370 down_read(&uts_sem); 1361 down_read(&uts_sem);
1371 i = 1 + strlen(utsname()->nodename); 1362 u = utsname();
1363 i = 1 + strlen(u->nodename);
1372 if (i > len) 1364 if (i > len)
1373 i = len; 1365 i = len;
1374 errno = 0; 1366 errno = 0;
1375 if (copy_to_user(name, utsname()->nodename, i)) 1367 if (copy_to_user(name, u->nodename, i))
1376 errno = -EFAULT; 1368 errno = -EFAULT;
1377 up_read(&uts_sem); 1369 up_read(&uts_sem);
1378 return errno; 1370 return errno;
@@ -1397,8 +1389,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
1397 down_write(&uts_sem); 1389 down_write(&uts_sem);
1398 errno = -EFAULT; 1390 errno = -EFAULT;
1399 if (!copy_from_user(tmp, name, len)) { 1391 if (!copy_from_user(tmp, name, len)) {
1400 memcpy(utsname()->domainname, tmp, len); 1392 struct new_utsname *u = utsname();
1401 utsname()->domainname[len] = 0; 1393
1394 memcpy(u->domainname, tmp, len);
1395 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1402 errno = 0; 1396 errno = 0;
1403 } 1397 }
1404 up_write(&uts_sem); 1398 up_write(&uts_sem);
@@ -1445,21 +1439,28 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
1445asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) 1439asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1446{ 1440{
1447 struct rlimit new_rlim, *old_rlim; 1441 struct rlimit new_rlim, *old_rlim;
1448 unsigned long it_prof_secs;
1449 int retval; 1442 int retval;
1450 1443
1451 if (resource >= RLIM_NLIMITS) 1444 if (resource >= RLIM_NLIMITS)
1452 return -EINVAL; 1445 return -EINVAL;
1453 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1446 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1454 return -EFAULT; 1447 return -EFAULT;
1455 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1456 return -EINVAL;
1457 old_rlim = current->signal->rlim + resource; 1448 old_rlim = current->signal->rlim + resource;
1458 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1449 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1459 !capable(CAP_SYS_RESOURCE)) 1450 !capable(CAP_SYS_RESOURCE))
1460 return -EPERM; 1451 return -EPERM;
1461 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) 1452
1462 return -EPERM; 1453 if (resource == RLIMIT_NOFILE) {
1454 if (new_rlim.rlim_max == RLIM_INFINITY)
1455 new_rlim.rlim_max = sysctl_nr_open;
1456 if (new_rlim.rlim_cur == RLIM_INFINITY)
1457 new_rlim.rlim_cur = sysctl_nr_open;
1458 if (new_rlim.rlim_max > sysctl_nr_open)
1459 return -EPERM;
1460 }
1461
1462 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1463 return -EINVAL;
1463 1464
1464 retval = security_task_setrlimit(resource, &new_rlim); 1465 retval = security_task_setrlimit(resource, &new_rlim);
1465 if (retval) 1466 if (retval)
@@ -1491,18 +1492,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1491 if (new_rlim.rlim_cur == RLIM_INFINITY) 1492 if (new_rlim.rlim_cur == RLIM_INFINITY)
1492 goto out; 1493 goto out;
1493 1494
1494 it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); 1495 update_rlimit_cpu(new_rlim.rlim_cur);
1495 if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
1496 unsigned long rlim_cur = new_rlim.rlim_cur;
1497 cputime_t cputime;
1498
1499 cputime = secs_to_cputime(rlim_cur);
1500 read_lock(&tasklist_lock);
1501 spin_lock_irq(&current->sighand->siglock);
1502 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
1503 spin_unlock_irq(&current->sighand->siglock);
1504 read_unlock(&tasklist_lock);
1505 }
1506out: 1496out:
1507 return 0; 1497 return 0;
1508} 1498}
@@ -1540,11 +1530,8 @@ out:
1540 * 1530 *
1541 */ 1531 */
1542 1532
1543static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, 1533static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
1544 cputime_t *utimep, cputime_t *stimep)
1545{ 1534{
1546 *utimep = cputime_add(*utimep, t->utime);
1547 *stimep = cputime_add(*stimep, t->stime);
1548 r->ru_nvcsw += t->nvcsw; 1535 r->ru_nvcsw += t->nvcsw;
1549 r->ru_nivcsw += t->nivcsw; 1536 r->ru_nivcsw += t->nivcsw;
1550 r->ru_minflt += t->min_flt; 1537 r->ru_minflt += t->min_flt;
@@ -1558,12 +1545,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1558 struct task_struct *t; 1545 struct task_struct *t;
1559 unsigned long flags; 1546 unsigned long flags;
1560 cputime_t utime, stime; 1547 cputime_t utime, stime;
1548 struct task_cputime cputime;
1561 1549
1562 memset((char *) r, 0, sizeof *r); 1550 memset((char *) r, 0, sizeof *r);
1563 utime = stime = cputime_zero; 1551 utime = stime = cputime_zero;
1564 1552
1565 if (who == RUSAGE_THREAD) { 1553 if (who == RUSAGE_THREAD) {
1566 accumulate_thread_rusage(p, r, &utime, &stime); 1554 accumulate_thread_rusage(p, r);
1567 goto out; 1555 goto out;
1568 } 1556 }
1569 1557
@@ -1586,8 +1574,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1586 break; 1574 break;
1587 1575
1588 case RUSAGE_SELF: 1576 case RUSAGE_SELF:
1589 utime = cputime_add(utime, p->signal->utime); 1577 thread_group_cputime(p, &cputime);
1590 stime = cputime_add(stime, p->signal->stime); 1578 utime = cputime_add(utime, cputime.utime);
1579 stime = cputime_add(stime, cputime.stime);
1591 r->ru_nvcsw += p->signal->nvcsw; 1580 r->ru_nvcsw += p->signal->nvcsw;
1592 r->ru_nivcsw += p->signal->nivcsw; 1581 r->ru_nivcsw += p->signal->nivcsw;
1593 r->ru_minflt += p->signal->min_flt; 1582 r->ru_minflt += p->signal->min_flt;
@@ -1596,7 +1585,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1596 r->ru_oublock += p->signal->oublock; 1585 r->ru_oublock += p->signal->oublock;
1597 t = p; 1586 t = p;
1598 do { 1587 do {
1599 accumulate_thread_rusage(t, r, &utime, &stime); 1588 accumulate_thread_rusage(t, r);
1600 t = next_thread(t); 1589 t = next_thread(t);
1601 } while (t != p); 1590 } while (t != p);
1602 break; 1591 break;
@@ -1727,6 +1716,16 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1727 case PR_SET_TSC: 1716 case PR_SET_TSC:
1728 error = SET_TSC_CTL(arg2); 1717 error = SET_TSC_CTL(arg2);
1729 break; 1718 break;
1719 case PR_GET_TIMERSLACK:
1720 error = current->timer_slack_ns;
1721 break;
1722 case PR_SET_TIMERSLACK:
1723 if (arg2 <= 0)
1724 current->timer_slack_ns =
1725 current->default_timer_slack_ns;
1726 else
1727 current->timer_slack_ns = arg2;
1728 break;
1730 default: 1729 default:
1731 error = -EINVAL; 1730 error = -EINVAL;
1732 break; 1731 break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 08d6e1bb99ac..e14a23281707 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,7 +31,7 @@ cond_syscall(sys_socketpair);
31cond_syscall(sys_bind); 31cond_syscall(sys_bind);
32cond_syscall(sys_listen); 32cond_syscall(sys_listen);
33cond_syscall(sys_accept); 33cond_syscall(sys_accept);
34cond_syscall(sys_paccept); 34cond_syscall(sys_accept4);
35cond_syscall(sys_connect); 35cond_syscall(sys_connect);
36cond_syscall(sys_getsockname); 36cond_syscall(sys_getsockname);
37cond_syscall(sys_getpeername); 37cond_syscall(sys_getpeername);
@@ -125,6 +125,12 @@ cond_syscall(sys_vm86old);
125cond_syscall(sys_vm86); 125cond_syscall(sys_vm86);
126cond_syscall(compat_sys_ipc); 126cond_syscall(compat_sys_ipc);
127cond_syscall(compat_sys_sysctl); 127cond_syscall(compat_sys_sysctl);
128cond_syscall(sys_flock);
129cond_syscall(sys_io_setup);
130cond_syscall(sys_io_destroy);
131cond_syscall(sys_io_submit);
132cond_syscall(sys_io_cancel);
133cond_syscall(sys_io_getevents);
128 134
129/* arch-specific weak syscall entries */ 135/* arch-specific weak syscall entries */
130cond_syscall(sys_pciconfig_read); 136cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 50ec0886fa3d..9d048fa2d902 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -80,7 +80,6 @@ extern int pid_max_min, pid_max_max;
80extern int sysctl_drop_caches; 80extern int sysctl_drop_caches;
81extern int percpu_pagelist_fraction; 81extern int percpu_pagelist_fraction;
82extern int compat_log; 82extern int compat_log;
83extern int maps_protect;
84extern int latencytop_enabled; 83extern int latencytop_enabled;
85extern int sysctl_nr_open_min, sysctl_nr_open_max; 84extern int sysctl_nr_open_min, sysctl_nr_open_max;
86#ifdef CONFIG_RCU_TORTURE_TEST 85#ifdef CONFIG_RCU_TORTURE_TEST
@@ -97,7 +96,7 @@ static int sixty = 60;
97static int neg_one = -1; 96static int neg_one = -1;
98#endif 97#endif
99 98
100#ifdef CONFIG_MMU 99#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
101static int two = 2; 100static int two = 2;
102#endif 101#endif
103 102
@@ -118,10 +117,8 @@ extern char modprobe_path[];
118extern int sg_big_buff; 117extern int sg_big_buff;
119#endif 118#endif
120 119
121#ifdef __sparc__ 120#ifdef CONFIG_SPARC
122extern char reboot_command []; 121#include <asm/system.h>
123extern int stop_a_enabled;
124extern int scons_pwroff;
125#endif 122#endif
126 123
127#ifdef __hppa__ 124#ifdef __hppa__
@@ -152,7 +149,7 @@ extern int max_lock_depth;
152#ifdef CONFIG_PROC_SYSCTL 149#ifdef CONFIG_PROC_SYSCTL
153static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 150static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
154 void __user *buffer, size_t *lenp, loff_t *ppos); 151 void __user *buffer, size_t *lenp, loff_t *ppos);
155static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, 152static int proc_taint(struct ctl_table *table, int write, struct file *filp,
156 void __user *buffer, size_t *lenp, loff_t *ppos); 153 void __user *buffer, size_t *lenp, loff_t *ppos);
157#endif 154#endif
158 155
@@ -279,6 +276,16 @@ static struct ctl_table kern_table[] = {
279 }, 276 },
280 { 277 {
281 .ctl_name = CTL_UNNUMBERED, 278 .ctl_name = CTL_UNNUMBERED,
279 .procname = "sched_shares_thresh",
280 .data = &sysctl_sched_shares_thresh,
281 .maxlen = sizeof(unsigned int),
282 .mode = 0644,
283 .proc_handler = &proc_dointvec_minmax,
284 .strategy = &sysctl_intvec,
285 .extra1 = &zero,
286 },
287 {
288 .ctl_name = CTL_UNNUMBERED,
282 .procname = "sched_child_runs_first", 289 .procname = "sched_child_runs_first",
283 .data = &sysctl_sched_child_runs_first, 290 .data = &sysctl_sched_child_runs_first,
284 .maxlen = sizeof(unsigned int), 291 .maxlen = sizeof(unsigned int),
@@ -382,10 +389,9 @@ static struct ctl_table kern_table[] = {
382#ifdef CONFIG_PROC_SYSCTL 389#ifdef CONFIG_PROC_SYSCTL
383 { 390 {
384 .procname = "tainted", 391 .procname = "tainted",
385 .data = &tainted, 392 .maxlen = sizeof(long),
386 .maxlen = sizeof(int),
387 .mode = 0644, 393 .mode = 0644,
388 .proc_handler = &proc_dointvec_taint, 394 .proc_handler = &proc_taint,
389 }, 395 },
390#endif 396#endif
391#ifdef CONFIG_LATENCYTOP 397#ifdef CONFIG_LATENCYTOP
@@ -415,7 +421,7 @@ static struct ctl_table kern_table[] = {
415 .mode = 0644, 421 .mode = 0644,
416 .proc_handler = &proc_dointvec, 422 .proc_handler = &proc_dointvec,
417 }, 423 },
418#ifdef __sparc__ 424#ifdef CONFIG_SPARC
419 { 425 {
420 .ctl_name = KERN_SPARC_REBOOT, 426 .ctl_name = KERN_SPARC_REBOOT,
421 .procname = "reboot-cmd", 427 .procname = "reboot-cmd",
@@ -468,7 +474,7 @@ static struct ctl_table kern_table[] = {
468 .mode = 0644, 474 .mode = 0644,
469 .proc_handler = &proc_dointvec, 475 .proc_handler = &proc_dointvec,
470 }, 476 },
471#ifdef CONFIG_FTRACE 477#ifdef CONFIG_FUNCTION_TRACER
472 { 478 {
473 .ctl_name = CTL_UNNUMBERED, 479 .ctl_name = CTL_UNNUMBERED,
474 .procname = "ftrace_enabled", 480 .procname = "ftrace_enabled",
@@ -810,16 +816,6 @@ static struct ctl_table kern_table[] = {
810 .proc_handler = &proc_dointvec, 816 .proc_handler = &proc_dointvec,
811 }, 817 },
812#endif 818#endif
813#ifdef CONFIG_PROC_FS
814 {
815 .ctl_name = CTL_UNNUMBERED,
816 .procname = "maps_protect",
817 .data = &maps_protect,
818 .maxlen = sizeof(int),
819 .mode = 0644,
820 .proc_handler = &proc_dointvec,
821 },
822#endif
823 { 819 {
824 .ctl_name = CTL_UNNUMBERED, 820 .ctl_name = CTL_UNNUMBERED,
825 .procname = "poweroff_cmd", 821 .procname = "poweroff_cmd",
@@ -847,6 +843,16 @@ static struct ctl_table kern_table[] = {
847 .proc_handler = &proc_dointvec, 843 .proc_handler = &proc_dointvec,
848 }, 844 },
849#endif 845#endif
846#ifdef CONFIG_UNEVICTABLE_LRU
847 {
848 .ctl_name = CTL_UNNUMBERED,
849 .procname = "scan_unevictable_pages",
850 .data = &scan_unevictable_pages,
851 .maxlen = sizeof(scan_unevictable_pages),
852 .mode = 0644,
853 .proc_handler = &scan_unevictable_handler,
854 },
855#endif
850/* 856/*
851 * NOTE: do not add new entries to this table unless you have read 857 * NOTE: do not add new entries to this table unless you have read
852 * Documentation/sysctl/ctl_unnumbered.txt 858 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1261,6 +1267,7 @@ static struct ctl_table fs_table[] = {
1261 .extra1 = &minolduid, 1267 .extra1 = &minolduid,
1262 .extra2 = &maxolduid, 1268 .extra2 = &maxolduid,
1263 }, 1269 },
1270#ifdef CONFIG_FILE_LOCKING
1264 { 1271 {
1265 .ctl_name = FS_LEASES, 1272 .ctl_name = FS_LEASES,
1266 .procname = "leases-enable", 1273 .procname = "leases-enable",
@@ -1269,6 +1276,7 @@ static struct ctl_table fs_table[] = {
1269 .mode = 0644, 1276 .mode = 0644,
1270 .proc_handler = &proc_dointvec, 1277 .proc_handler = &proc_dointvec,
1271 }, 1278 },
1279#endif
1272#ifdef CONFIG_DNOTIFY 1280#ifdef CONFIG_DNOTIFY
1273 { 1281 {
1274 .ctl_name = FS_DIR_NOTIFY, 1282 .ctl_name = FS_DIR_NOTIFY,
@@ -1280,6 +1288,7 @@ static struct ctl_table fs_table[] = {
1280 }, 1288 },
1281#endif 1289#endif
1282#ifdef CONFIG_MMU 1290#ifdef CONFIG_MMU
1291#ifdef CONFIG_FILE_LOCKING
1283 { 1292 {
1284 .ctl_name = FS_LEASE_TIME, 1293 .ctl_name = FS_LEASE_TIME,
1285 .procname = "lease-break-time", 1294 .procname = "lease-break-time",
@@ -1291,6 +1300,8 @@ static struct ctl_table fs_table[] = {
1291 .extra1 = &zero, 1300 .extra1 = &zero,
1292 .extra2 = &two, 1301 .extra2 = &two,
1293 }, 1302 },
1303#endif
1304#ifdef CONFIG_AIO
1294 { 1305 {
1295 .procname = "aio-nr", 1306 .procname = "aio-nr",
1296 .data = &aio_nr, 1307 .data = &aio_nr,
@@ -1305,6 +1316,7 @@ static struct ctl_table fs_table[] = {
1305 .mode = 0644, 1316 .mode = 0644,
1306 .proc_handler = &proc_doulongvec_minmax, 1317 .proc_handler = &proc_doulongvec_minmax,
1307 }, 1318 },
1319#endif /* CONFIG_AIO */
1308#ifdef CONFIG_INOTIFY_USER 1320#ifdef CONFIG_INOTIFY_USER
1309 { 1321 {
1310 .ctl_name = FS_INOTIFY, 1322 .ctl_name = FS_INOTIFY,
@@ -1510,7 +1522,6 @@ void register_sysctl_root(struct ctl_table_root *root)
1510/* Perform the actual read/write of a sysctl table entry. */ 1522/* Perform the actual read/write of a sysctl table entry. */
1511static int do_sysctl_strategy(struct ctl_table_root *root, 1523static int do_sysctl_strategy(struct ctl_table_root *root,
1512 struct ctl_table *table, 1524 struct ctl_table *table,
1513 int __user *name, int nlen,
1514 void __user *oldval, size_t __user *oldlenp, 1525 void __user *oldval, size_t __user *oldlenp,
1515 void __user *newval, size_t newlen) 1526 void __user *newval, size_t newlen)
1516{ 1527{
@@ -1524,8 +1535,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1524 return -EPERM; 1535 return -EPERM;
1525 1536
1526 if (table->strategy) { 1537 if (table->strategy) {
1527 rc = table->strategy(table, name, nlen, oldval, oldlenp, 1538 rc = table->strategy(table, oldval, oldlenp, newval, newlen);
1528 newval, newlen);
1529 if (rc < 0) 1539 if (rc < 0)
1530 return rc; 1540 return rc;
1531 if (rc > 0) 1541 if (rc > 0)
@@ -1535,8 +1545,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1535 /* If there is no strategy routine, or if the strategy returns 1545 /* If there is no strategy routine, or if the strategy returns
1536 * zero, proceed with automatic r/w */ 1546 * zero, proceed with automatic r/w */
1537 if (table->data && table->maxlen) { 1547 if (table->data && table->maxlen) {
1538 rc = sysctl_data(table, name, nlen, oldval, oldlenp, 1548 rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
1539 newval, newlen);
1540 if (rc < 0) 1549 if (rc < 0)
1541 return rc; 1550 return rc;
1542 } 1551 }
@@ -1568,7 +1577,7 @@ repeat:
1568 table = table->child; 1577 table = table->child;
1569 goto repeat; 1578 goto repeat;
1570 } 1579 }
1571 error = do_sysctl_strategy(root, table, name, nlen, 1580 error = do_sysctl_strategy(root, table,
1572 oldval, oldlenp, 1581 oldval, oldlenp,
1573 newval, newlen); 1582 newval, newlen);
1574 return error; 1583 return error;
@@ -2237,49 +2246,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2237 NULL,NULL); 2246 NULL,NULL);
2238} 2247}
2239 2248
2240#define OP_SET 0
2241#define OP_AND 1
2242#define OP_OR 2
2243
2244static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
2245 int *valp,
2246 int write, void *data)
2247{
2248 int op = *(int *)data;
2249 if (write) {
2250 int val = *negp ? -*lvalp : *lvalp;
2251 switch(op) {
2252 case OP_SET: *valp = val; break;
2253 case OP_AND: *valp &= val; break;
2254 case OP_OR: *valp |= val; break;
2255 }
2256 } else {
2257 int val = *valp;
2258 if (val < 0) {
2259 *negp = -1;
2260 *lvalp = (unsigned long)-val;
2261 } else {
2262 *negp = 0;
2263 *lvalp = (unsigned long)val;
2264 }
2265 }
2266 return 0;
2267}
2268
2269/* 2249/*
2270 * Taint values can only be increased 2250 * Taint values can only be increased
2251 * This means we can safely use a temporary.
2271 */ 2252 */
2272static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, 2253static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2273 void __user *buffer, size_t *lenp, loff_t *ppos) 2254 void __user *buffer, size_t *lenp, loff_t *ppos)
2274{ 2255{
2275 int op; 2256 struct ctl_table t;
2257 unsigned long tmptaint = get_taint();
2258 int err;
2276 2259
2277 if (write && !capable(CAP_SYS_ADMIN)) 2260 if (write && !capable(CAP_SYS_ADMIN))
2278 return -EPERM; 2261 return -EPERM;
2279 2262
2280 op = OP_OR; 2263 t = *table;
2281 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2264 t.data = &tmptaint;
2282 do_proc_dointvec_bset_conv,&op); 2265 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
2266 if (err < 0)
2267 return err;
2268
2269 if (write) {
2270 /*
2271 * Poor man's atomic or. Not worth adding a primitive
2272 * to everyone's atomic.h for this
2273 */
2274 int i;
2275 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
2276 if ((tmptaint >> i) & 1)
2277 add_taint(i);
2278 }
2279 }
2280
2281 return err;
2283} 2282}
2284 2283
2285struct do_proc_dointvec_minmax_conv_param { 2284struct do_proc_dointvec_minmax_conv_param {
@@ -2727,7 +2726,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2727 */ 2726 */
2728 2727
2729/* The generic sysctl data routine (used if no strategy routine supplied) */ 2728/* The generic sysctl data routine (used if no strategy routine supplied) */
2730int sysctl_data(struct ctl_table *table, int __user *name, int nlen, 2729int sysctl_data(struct ctl_table *table,
2731 void __user *oldval, size_t __user *oldlenp, 2730 void __user *oldval, size_t __user *oldlenp,
2732 void __user *newval, size_t newlen) 2731 void __user *newval, size_t newlen)
2733{ 2732{
@@ -2761,7 +2760,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
2761} 2760}
2762 2761
2763/* The generic string strategy routine: */ 2762/* The generic string strategy routine: */
2764int sysctl_string(struct ctl_table *table, int __user *name, int nlen, 2763int sysctl_string(struct ctl_table *table,
2765 void __user *oldval, size_t __user *oldlenp, 2764 void __user *oldval, size_t __user *oldlenp,
2766 void __user *newval, size_t newlen) 2765 void __user *newval, size_t newlen)
2767{ 2766{
@@ -2807,7 +2806,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
2807 * are between the minimum and maximum values given in the arrays 2806 * are between the minimum and maximum values given in the arrays
2808 * table->extra1 and table->extra2, respectively. 2807 * table->extra1 and table->extra2, respectively.
2809 */ 2808 */
2810int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, 2809int sysctl_intvec(struct ctl_table *table,
2811 void __user *oldval, size_t __user *oldlenp, 2810 void __user *oldval, size_t __user *oldlenp,
2812 void __user *newval, size_t newlen) 2811 void __user *newval, size_t newlen)
2813{ 2812{
@@ -2843,7 +2842,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
2843} 2842}
2844 2843
2845/* Strategy function to convert jiffies to seconds */ 2844/* Strategy function to convert jiffies to seconds */
2846int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, 2845int sysctl_jiffies(struct ctl_table *table,
2847 void __user *oldval, size_t __user *oldlenp, 2846 void __user *oldval, size_t __user *oldlenp,
2848 void __user *newval, size_t newlen) 2847 void __user *newval, size_t newlen)
2849{ 2848{
@@ -2877,7 +2876,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
2877} 2876}
2878 2877
2879/* Strategy function to convert jiffies to seconds */ 2878/* Strategy function to convert jiffies to seconds */
2880int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, 2879int sysctl_ms_jiffies(struct ctl_table *table,
2881 void __user *oldval, size_t __user *oldlenp, 2880 void __user *oldval, size_t __user *oldlenp,
2882 void __user *newval, size_t newlen) 2881 void __user *newval, size_t newlen)
2883{ 2882{
@@ -2932,35 +2931,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2932 return error; 2931 return error;
2933} 2932}
2934 2933
2935int sysctl_data(struct ctl_table *table, int __user *name, int nlen, 2934int sysctl_data(struct ctl_table *table,
2936 void __user *oldval, size_t __user *oldlenp, 2935 void __user *oldval, size_t __user *oldlenp,
2937 void __user *newval, size_t newlen) 2936 void __user *newval, size_t newlen)
2938{ 2937{
2939 return -ENOSYS; 2938 return -ENOSYS;
2940} 2939}
2941 2940
2942int sysctl_string(struct ctl_table *table, int __user *name, int nlen, 2941int sysctl_string(struct ctl_table *table,
2943 void __user *oldval, size_t __user *oldlenp, 2942 void __user *oldval, size_t __user *oldlenp,
2944 void __user *newval, size_t newlen) 2943 void __user *newval, size_t newlen)
2945{ 2944{
2946 return -ENOSYS; 2945 return -ENOSYS;
2947} 2946}
2948 2947
2949int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, 2948int sysctl_intvec(struct ctl_table *table,
2950 void __user *oldval, size_t __user *oldlenp, 2949 void __user *oldval, size_t __user *oldlenp,
2951 void __user *newval, size_t newlen) 2950 void __user *newval, size_t newlen)
2952{ 2951{
2953 return -ENOSYS; 2952 return -ENOSYS;
2954} 2953}
2955 2954
2956int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, 2955int sysctl_jiffies(struct ctl_table *table,
2957 void __user *oldval, size_t __user *oldlenp, 2956 void __user *oldval, size_t __user *oldlenp,
2958 void __user *newval, size_t newlen) 2957 void __user *newval, size_t newlen)
2959{ 2958{
2960 return -ENOSYS; 2959 return -ENOSYS;
2961} 2960}
2962 2961
2963int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, 2962int sysctl_ms_jiffies(struct ctl_table *table,
2964 void __user *oldval, size_t __user *oldlenp, 2963 void __user *oldval, size_t __user *oldlenp,
2965 void __user *newval, size_t newlen) 2964 void __user *newval, size_t newlen)
2966{ 2965{
diff --git a/kernel/time.c b/kernel/time.c
index 6a08660b4fac..d63a4336fad6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -669,3 +669,21 @@ EXPORT_SYMBOL(get_jiffies_64);
669#endif 669#endif
670 670
671EXPORT_SYMBOL(jiffies); 671EXPORT_SYMBOL(jiffies);
672
673/*
674 * Add two timespec values and do a safety check for overflow.
675 * It's assumed that both values are valid (>= 0)
676 */
677struct timespec timespec_add_safe(const struct timespec lhs,
678 const struct timespec rhs)
679{
680 struct timespec res;
681
682 set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
683 lhs.tv_nsec + rhs.tv_nsec);
684
685 if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
686 res.tv_sec = TIME_T_MAX;
687
688 return res;
689}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8d53106a0a92..95ed42951e0a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -3,7 +3,6 @@
3# 3#
4config TICK_ONESHOT 4config TICK_ONESHOT
5 bool 5 bool
6 default n
7 6
8config NO_HZ 7config NO_HZ
9 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 093d4acf993b..9ed2eec97526 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c)
325 unsigned long flags; 325 unsigned long flags;
326 int ret; 326 int ret;
327 327
328 /* save mult_orig on registration */
329 c->mult_orig = c->mult;
330
328 spin_lock_irqsave(&clocksource_lock, flags); 331 spin_lock_irqsave(&clocksource_lock, flags);
329 ret = clocksource_enqueue(c); 332 ret = clocksource_enqueue(c);
330 if (!ret) 333 if (!ret)
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 4c256fdb8875..1ca99557e929 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
64 .shift = JIFFIES_SHIFT, 65 .shift = JIFFIES_SHIFT,
65}; 66};
66 67
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 1ad46f3df6e7..8ff15e5d486b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,13 +10,13 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/timer.h>
14#include <linux/timex.h> 13#include <linux/timex.h>
15#include <linux/jiffies.h> 14#include <linux/jiffies.h>
16#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/math64.h> 17#include <linux/math64.h>
19#include <linux/clocksource.h> 18#include <linux/clocksource.h>
19#include <linux/workqueue.h>
20#include <asm/timex.h> 20#include <asm/timex.h>
21 21
22/* 22/*
@@ -142,8 +142,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
142 time_state = TIME_OOP; 142 time_state = TIME_OOP;
143 printk(KERN_NOTICE "Clock: " 143 printk(KERN_NOTICE "Clock: "
144 "inserting leap second 23:59:60 UTC\n"); 144 "inserting leap second 23:59:60 UTC\n");
145 leap_timer.expires = ktime_add_ns(leap_timer.expires, 145 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
146 NSEC_PER_SEC);
147 res = HRTIMER_RESTART; 146 res = HRTIMER_RESTART;
148 break; 147 break;
149 case TIME_DEL: 148 case TIME_DEL:
@@ -218,11 +217,11 @@ void second_overflow(void)
218/* Disable the cmos update - used by virtualization and embedded */ 217/* Disable the cmos update - used by virtualization and embedded */
219int no_sync_cmos_clock __read_mostly; 218int no_sync_cmos_clock __read_mostly;
220 219
221static void sync_cmos_clock(unsigned long dummy); 220static void sync_cmos_clock(struct work_struct *work);
222 221
223static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); 222static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
224 223
225static void sync_cmos_clock(unsigned long dummy) 224static void sync_cmos_clock(struct work_struct *work)
226{ 225{
227 struct timespec now, next; 226 struct timespec now, next;
228 int fail = 1; 227 int fail = 1;
@@ -258,13 +257,13 @@ static void sync_cmos_clock(unsigned long dummy)
258 next.tv_sec++; 257 next.tv_sec++;
259 next.tv_nsec -= NSEC_PER_SEC; 258 next.tv_nsec -= NSEC_PER_SEC;
260 } 259 }
261 mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); 260 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
262} 261}
263 262
264static void notify_cmos_timer(void) 263static void notify_cmos_timer(void)
265{ 264{
266 if (!no_sync_cmos_clock) 265 if (!no_sync_cmos_clock)
267 mod_timer(&sync_cmos_timer, jiffies + 1); 266 schedule_delayed_work(&sync_cmos_work, 0);
268} 267}
269 268
270#else 269#else
@@ -277,38 +276,50 @@ static inline void notify_cmos_timer(void) { }
277int do_adjtimex(struct timex *txc) 276int do_adjtimex(struct timex *txc)
278{ 277{
279 struct timespec ts; 278 struct timespec ts;
280 long save_adjust, sec;
281 int result; 279 int result;
282 280
283 /* In order to modify anything, you gotta be super-user! */ 281 /* Validate the data before disabling interrupts */
284 if (txc->modes && !capable(CAP_SYS_TIME)) 282 if (txc->modes & ADJ_ADJTIME) {
285 return -EPERM;
286
287 /* Now we validate the data before disabling interrupts */
288
289 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
290 /* singleshot must not be used with any other mode bits */ 283 /* singleshot must not be used with any other mode bits */
291 if (txc->modes & ~ADJ_OFFSET_SS_READ) 284 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
292 return -EINVAL; 285 return -EINVAL;
286 if (!(txc->modes & ADJ_OFFSET_READONLY) &&
287 !capable(CAP_SYS_TIME))
288 return -EPERM;
289 } else {
290 /* In order to modify anything, you gotta be super-user! */
291 if (txc->modes && !capable(CAP_SYS_TIME))
292 return -EPERM;
293
294 /* if the quartz is off by more than 10% something is VERY wrong! */
295 if (txc->modes & ADJ_TICK &&
296 (txc->tick < 900000/USER_HZ ||
297 txc->tick > 1100000/USER_HZ))
298 return -EINVAL;
299
300 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
301 hrtimer_cancel(&leap_timer);
293 } 302 }
294 303
295 /* if the quartz is off by more than 10% something is VERY wrong ! */
296 if (txc->modes & ADJ_TICK)
297 if (txc->tick < 900000/USER_HZ ||
298 txc->tick > 1100000/USER_HZ)
299 return -EINVAL;
300
301 if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
302 hrtimer_cancel(&leap_timer);
303 getnstimeofday(&ts); 304 getnstimeofday(&ts);
304 305
305 write_seqlock_irq(&xtime_lock); 306 write_seqlock_irq(&xtime_lock);
306 307
307 /* Save for later - semantics of adjtime is to return old value */
308 save_adjust = time_adjust;
309
310 /* If there are input parameters, then process them */ 308 /* If there are input parameters, then process them */
309 if (txc->modes & ADJ_ADJTIME) {
310 long save_adjust = time_adjust;
311
312 if (!(txc->modes & ADJ_OFFSET_READONLY)) {
313 /* adjtime() is independent from ntp_adjtime() */
314 time_adjust = txc->offset;
315 ntp_update_frequency();
316 }
317 txc->offset = save_adjust;
318 goto adj_done;
319 }
311 if (txc->modes) { 320 if (txc->modes) {
321 long sec;
322
312 if (txc->modes & ADJ_STATUS) { 323 if (txc->modes & ADJ_STATUS) {
313 if ((time_status & STA_PLL) && 324 if ((time_status & STA_PLL) &&
314 !(txc->status & STA_PLL)) { 325 !(txc->status & STA_PLL)) {
@@ -375,13 +386,8 @@ int do_adjtimex(struct timex *txc)
375 if (txc->modes & ADJ_TAI && txc->constant > 0) 386 if (txc->modes & ADJ_TAI && txc->constant > 0)
376 time_tai = txc->constant; 387 time_tai = txc->constant;
377 388
378 if (txc->modes & ADJ_OFFSET) { 389 if (txc->modes & ADJ_OFFSET)
379 if (txc->modes == ADJ_OFFSET_SINGLESHOT) 390 ntp_update_offset(txc->offset);
380 /* adjtime() is independent from ntp_adjtime() */
381 time_adjust = txc->offset;
382 else
383 ntp_update_offset(txc->offset);
384 }
385 if (txc->modes & ADJ_TICK) 391 if (txc->modes & ADJ_TICK)
386 tick_usec = txc->tick; 392 tick_usec = txc->tick;
387 393
@@ -389,22 +395,18 @@ int do_adjtimex(struct timex *txc)
389 ntp_update_frequency(); 395 ntp_update_frequency();
390 } 396 }
391 397
398 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
399 NTP_SCALE_SHIFT);
400 if (!(time_status & STA_NANO))
401 txc->offset /= NSEC_PER_USEC;
402
403adj_done:
392 result = time_state; /* mostly `TIME_OK' */ 404 result = time_state; /* mostly `TIME_OK' */
393 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 405 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
394 result = TIME_ERROR; 406 result = TIME_ERROR;
395 407
396 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || 408 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
397 (txc->modes == ADJ_OFFSET_SS_READ)) 409 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
398 txc->offset = save_adjust;
399 else {
400 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
401 NTP_SCALE_SHIFT);
402 if (!(time_status & STA_NANO))
403 txc->offset /= NSEC_PER_USEC;
404 }
405 txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
406 (s64)PPM_SCALE_INV,
407 NTP_SCALE_SHIFT);
408 txc->maxerror = time_maxerror; 410 txc->maxerror = time_maxerror;
409 txc->esterror = time_esterror; 411 txc->esterror = time_esterror;
410 txc->status = time_status; 412 txc->status = time_status;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index bd7034542399..f98a1b7b16e9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -235,7 +235,8 @@ static void tick_do_broadcast_on_off(void *why)
235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
236 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpu_isset(cpu, tick_broadcast_mask)) {
237 cpu_set(cpu, tick_broadcast_mask); 237 cpu_set(cpu, tick_broadcast_mask);
238 if (bc->mode == TICKDEV_MODE_PERIODIC) 238 if (tick_broadcast_device.mode ==
239 TICKDEV_MODE_PERIODIC)
239 clockevents_shutdown(dev); 240 clockevents_shutdown(dev);
240 } 241 }
241 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) 242 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
@@ -245,7 +246,8 @@ static void tick_do_broadcast_on_off(void *why)
245 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
246 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpu_isset(cpu, tick_broadcast_mask)) {
247 cpu_clear(cpu, tick_broadcast_mask); 248 cpu_clear(cpu, tick_broadcast_mask);
248 if (bc->mode == TICKDEV_MODE_PERIODIC) 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC)
249 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
250 } 252 }
251 break; 253 break;
@@ -382,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
382} 384}
383 385
384/* 386/*
387 * Called from irq_enter() when idle was interrupted to reenable the
388 * per cpu device.
389 */
390void tick_check_oneshot_broadcast(int cpu)
391{
392 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
393 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
394
395 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
396 }
397}
398
399/*
385 * Handle oneshot mode broadcasting 400 * Handle oneshot mode broadcasting
386 */ 401 */
387static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 402static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 469248782c23..b1c05bf75ee0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,6 +36,7 @@ extern void tick_broadcast_switch_to_oneshot(void);
36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
38extern int tick_broadcast_oneshot_active(void); 38extern int tick_broadcast_oneshot_active(void);
39extern void tick_check_oneshot_broadcast(int cpu);
39# else /* BROADCAST */ 40# else /* BROADCAST */
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 41static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
41{ 42{
@@ -45,6 +46,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
45static inline void tick_broadcast_switch_to_oneshot(void) { } 46static inline void tick_broadcast_switch_to_oneshot(void) { }
46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 47static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; } 48static inline int tick_broadcast_oneshot_active(void) { return 0; }
49static inline void tick_check_oneshot_broadcast(int cpu) { }
48# endif /* !BROADCAST */ 50# endif /* !BROADCAST */
49 51
50#else /* !ONESHOT */ 52#else /* !ONESHOT */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 39019b3f7621..342fc9ccab46 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/module.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -154,7 +155,7 @@ void tick_nohz_update_jiffies(void)
154 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
155} 156}
156 157
157void tick_nohz_stop_idle(int cpu) 158static void tick_nohz_stop_idle(int cpu)
158{ 159{
159 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
160 161
@@ -190,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
190{ 191{
191 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 192 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
192 193
193 *last_update_time = ktime_to_us(ts->idle_lastupdate); 194 if (!tick_nohz_enabled)
195 return -1;
196
197 if (ts->idle_active)
198 *last_update_time = ktime_to_us(ts->idle_lastupdate);
199 else
200 *last_update_time = ktime_to_us(ktime_get());
201
194 return ktime_to_us(ts->idle_sleeptime); 202 return ktime_to_us(ts->idle_sleeptime);
195} 203}
204EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
196 205
197/** 206/**
198 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 207 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
@@ -261,7 +270,7 @@ void tick_nohz_stop_sched_tick(int inidle)
261 next_jiffies = get_next_timer_interrupt(last_jiffies); 270 next_jiffies = get_next_timer_interrupt(last_jiffies);
262 delta_jiffies = next_jiffies - last_jiffies; 271 delta_jiffies = next_jiffies - last_jiffies;
263 272
264 if (rcu_needs_cpu(cpu)) 273 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
265 delta_jiffies = 1; 274 delta_jiffies = 1;
266 /* 275 /*
267 * Do not stop the tick, if we are only one off 276 * Do not stop the tick, if we are only one off
@@ -291,7 +300,7 @@ void tick_nohz_stop_sched_tick(int inidle)
291 goto out; 300 goto out;
292 } 301 }
293 302
294 ts->idle_tick = ts->sched_timer.expires; 303 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
295 ts->tick_stopped = 1; 304 ts->tick_stopped = 1;
296 ts->idle_jiffies = last_jiffies; 305 ts->idle_jiffies = last_jiffies;
297 rcu_enter_nohz(); 306 rcu_enter_nohz();
@@ -368,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void)
368 return ts->sleep_length; 377 return ts->sleep_length;
369} 378}
370 379
380static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
381{
382 hrtimer_cancel(&ts->sched_timer);
383 hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
384
385 while (1) {
386 /* Forward the time to expire in the future */
387 hrtimer_forward(&ts->sched_timer, now, tick_period);
388
389 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
390 hrtimer_start_expires(&ts->sched_timer,
391 HRTIMER_MODE_ABS);
392 /* Check, if the timer was already in the past */
393 if (hrtimer_active(&ts->sched_timer))
394 break;
395 } else {
396 if (!tick_program_event(
397 hrtimer_get_expires(&ts->sched_timer), 0))
398 break;
399 }
400 /* Update jiffies and reread time */
401 tick_do_update_jiffies64(now);
402 now = ktime_get();
403 }
404}
405
371/** 406/**
372 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 407 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
373 * 408 *
@@ -421,35 +456,16 @@ void tick_nohz_restart_sched_tick(void)
421 */ 456 */
422 ts->tick_stopped = 0; 457 ts->tick_stopped = 0;
423 ts->idle_exittime = now; 458 ts->idle_exittime = now;
424 hrtimer_cancel(&ts->sched_timer);
425 ts->sched_timer.expires = ts->idle_tick;
426 459
427 while (1) { 460 tick_nohz_restart(ts, now);
428 /* Forward the time to expire in the future */
429 hrtimer_forward(&ts->sched_timer, now, tick_period);
430 461
431 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
432 hrtimer_start(&ts->sched_timer,
433 ts->sched_timer.expires,
434 HRTIMER_MODE_ABS);
435 /* Check, if the timer was already in the past */
436 if (hrtimer_active(&ts->sched_timer))
437 break;
438 } else {
439 if (!tick_program_event(ts->sched_timer.expires, 0))
440 break;
441 }
442 /* Update jiffies and reread time */
443 tick_do_update_jiffies64(now);
444 now = ktime_get();
445 }
446 local_irq_enable(); 462 local_irq_enable();
447} 463}
448 464
449static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 465static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
450{ 466{
451 hrtimer_forward(&ts->sched_timer, now, tick_period); 467 hrtimer_forward(&ts->sched_timer, now, tick_period);
452 return tick_program_event(ts->sched_timer.expires, 0); 468 return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
453} 469}
454 470
455/* 471/*
@@ -494,10 +510,6 @@ static void tick_nohz_handler(struct clock_event_device *dev)
494 update_process_times(user_mode(regs)); 510 update_process_times(user_mode(regs));
495 profile_tick(CPU_PROFILING); 511 profile_tick(CPU_PROFILING);
496 512
497 /* Do not restart, when we are in the idle loop */
498 if (ts->tick_stopped)
499 return;
500
501 while (tick_nohz_reprogram(ts, now)) { 513 while (tick_nohz_reprogram(ts, now)) {
502 now = ktime_get(); 514 now = ktime_get();
503 tick_do_update_jiffies64(now); 515 tick_do_update_jiffies64(now);
@@ -532,7 +544,7 @@ static void tick_nohz_switch_to_nohz(void)
532 next = tick_init_jiffy_update(); 544 next = tick_init_jiffy_update();
533 545
534 for (;;) { 546 for (;;) {
535 ts->sched_timer.expires = next; 547 hrtimer_set_expires(&ts->sched_timer, next);
536 if (!tick_program_event(next, 0)) 548 if (!tick_program_event(next, 0))
537 break; 549 break;
538 next = ktime_add(next, tick_period); 550 next = ktime_add(next, tick_period);
@@ -543,6 +555,41 @@ static void tick_nohz_switch_to_nohz(void)
543 smp_processor_id()); 555 smp_processor_id());
544} 556}
545 557
558/*
559 * When NOHZ is enabled and the tick is stopped, we need to kick the
560 * tick timer from irq_enter() so that the jiffies update is kept
561 * alive during long running softirqs. That's ugly as hell, but
562 * correctness is key even if we need to fix the offending softirq in
563 * the first place.
564 *
565 * Note, this is different to tick_nohz_restart. We just kick the
566 * timer and do not touch the other magic bits which need to be done
567 * when idle is left.
568 */
569static void tick_nohz_kick_tick(int cpu)
570{
571#if 0
572 /* Switch back to 2.6.27 behaviour */
573
574 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
575 ktime_t delta, now;
576
577 if (!ts->tick_stopped)
578 return;
579
580 /*
581 * Do not touch the tick device, when the next expiry is either
582 * already reached or less/equal than the tick period.
583 */
584 now = ktime_get();
585 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
586 if (delta.tv64 <= tick_period.tv64)
587 return;
588
589 tick_nohz_restart(ts, now);
590#endif
591}
592
546#else 593#else
547 594
548static inline void tick_nohz_switch_to_nohz(void) { } 595static inline void tick_nohz_switch_to_nohz(void) { }
@@ -550,6 +597,19 @@ static inline void tick_nohz_switch_to_nohz(void) { }
550#endif /* NO_HZ */ 597#endif /* NO_HZ */
551 598
552/* 599/*
600 * Called from irq_enter to notify about the possible interruption of idle()
601 */
602void tick_check_idle(int cpu)
603{
604 tick_check_oneshot_broadcast(cpu);
605#ifdef CONFIG_NO_HZ
606 tick_nohz_stop_idle(cpu);
607 tick_nohz_update_jiffies();
608 tick_nohz_kick_tick(cpu);
609#endif
610}
611
612/*
553 * High resolution timer specific code 613 * High resolution timer specific code
554 */ 614 */
555#ifdef CONFIG_HIGH_RES_TIMERS 615#ifdef CONFIG_HIGH_RES_TIMERS
@@ -602,10 +662,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
602 profile_tick(CPU_PROFILING); 662 profile_tick(CPU_PROFILING);
603 } 663 }
604 664
605 /* Do not restart, when we are in the idle loop */
606 if (ts->tick_stopped)
607 return HRTIMER_NORESTART;
608
609 hrtimer_forward(timer, now, tick_period); 665 hrtimer_forward(timer, now, tick_period);
610 666
611 return HRTIMER_RESTART; 667 return HRTIMER_RESTART;
@@ -625,19 +681,18 @@ void tick_setup_sched_timer(void)
625 */ 681 */
626 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 682 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
627 ts->sched_timer.function = tick_sched_timer; 683 ts->sched_timer.function = tick_sched_timer;
628 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 684 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
629 685
630 /* Get the next period (per cpu) */ 686 /* Get the next period (per cpu) */
631 ts->sched_timer.expires = tick_init_jiffy_update(); 687 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
632 offset = ktime_to_ns(tick_period) >> 1; 688 offset = ktime_to_ns(tick_period) >> 1;
633 do_div(offset, num_possible_cpus()); 689 do_div(offset, num_possible_cpus());
634 offset *= smp_processor_id(); 690 offset *= smp_processor_id();
635 ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); 691 hrtimer_add_expires_ns(&ts->sched_timer, offset);
636 692
637 for (;;) { 693 for (;;) {
638 hrtimer_forward(&ts->sched_timer, now, tick_period); 694 hrtimer_forward(&ts->sched_timer, now, tick_period);
639 hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, 695 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
640 HRTIMER_MODE_ABS);
641 /* Check, if the timer was already in the past */ 696 /* Check, if the timer was already in the past */
642 if (hrtimer_active(&ts->sched_timer)) 697 if (hrtimer_active(&ts->sched_timer))
643 break; 698 break;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e91c29f961c9..e7acfb482a68 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -58,27 +58,26 @@ struct clocksource *clock;
58 58
59#ifdef CONFIG_GENERIC_TIME 59#ifdef CONFIG_GENERIC_TIME
60/** 60/**
61 * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook 61 * clocksource_forward_now - update clock to the current time
62 * 62 *
63 * private function, must hold xtime_lock lock when being 63 * Forward the current clock to update its state since the last call to
64 * called. Returns the number of nanoseconds since the 64 * update_wall_time(). This is useful before significant clock changes,
65 * last call to update_wall_time() (adjusted by NTP scaling) 65 * as it avoids having to deal with this time offset explicitly.
66 */ 66 */
67static inline s64 __get_nsec_offset(void) 67static void clocksource_forward_now(void)
68{ 68{
69 cycle_t cycle_now, cycle_delta; 69 cycle_t cycle_now, cycle_delta;
70 s64 ns_offset; 70 s64 nsec;
71 71
72 /* read clocksource: */
73 cycle_now = clocksource_read(clock); 72 cycle_now = clocksource_read(clock);
74
75 /* calculate the delta since the last update_wall_time: */
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 73 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
74 clock->cycle_last = cycle_now;
77 75
78 /* convert to nanoseconds: */ 76 nsec = cyc2ns(clock, cycle_delta);
79 ns_offset = cyc2ns(clock, cycle_delta); 77 timespec_add_ns(&xtime, nsec);
80 78
81 return ns_offset; 79 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
80 clock->raw_time.tv_nsec += nsec;
82} 81}
83 82
84/** 83/**
@@ -89,6 +88,7 @@ static inline s64 __get_nsec_offset(void)
89 */ 88 */
90void getnstimeofday(struct timespec *ts) 89void getnstimeofday(struct timespec *ts)
91{ 90{
91 cycle_t cycle_now, cycle_delta;
92 unsigned long seq; 92 unsigned long seq;
93 s64 nsecs; 93 s64 nsecs;
94 94
@@ -96,7 +96,15 @@ void getnstimeofday(struct timespec *ts)
96 seq = read_seqbegin(&xtime_lock); 96 seq = read_seqbegin(&xtime_lock);
97 97
98 *ts = xtime; 98 *ts = xtime;
99 nsecs = __get_nsec_offset(); 99
100 /* read clocksource: */
101 cycle_now = clocksource_read(clock);
102
103 /* calculate the delta since the last update_wall_time: */
104 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
105
106 /* convert to nanoseconds: */
107 nsecs = cyc2ns(clock, cycle_delta);
100 108
101 } while (read_seqretry(&xtime_lock, seq)); 109 } while (read_seqretry(&xtime_lock, seq));
102 110
@@ -129,22 +137,22 @@ EXPORT_SYMBOL(do_gettimeofday);
129 */ 137 */
130int do_settimeofday(struct timespec *tv) 138int do_settimeofday(struct timespec *tv)
131{ 139{
140 struct timespec ts_delta;
132 unsigned long flags; 141 unsigned long flags;
133 time_t wtm_sec, sec = tv->tv_sec;
134 long wtm_nsec, nsec = tv->tv_nsec;
135 142
136 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 143 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
137 return -EINVAL; 144 return -EINVAL;
138 145
139 write_seqlock_irqsave(&xtime_lock, flags); 146 write_seqlock_irqsave(&xtime_lock, flags);
140 147
141 nsec -= __get_nsec_offset(); 148 clocksource_forward_now();
149
150 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
151 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
152 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
142 153
143 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); 154 xtime = *tv;
144 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
145 155
146 set_normalized_timespec(&xtime, sec, nsec);
147 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
148 update_xtime_cache(0); 156 update_xtime_cache(0);
149 157
150 clock->error = 0; 158 clock->error = 0;
@@ -170,22 +178,19 @@ EXPORT_SYMBOL(do_settimeofday);
170static void change_clocksource(void) 178static void change_clocksource(void)
171{ 179{
172 struct clocksource *new; 180 struct clocksource *new;
173 cycle_t now;
174 u64 nsec;
175 181
176 new = clocksource_get_next(); 182 new = clocksource_get_next();
177 183
178 if (clock == new) 184 if (clock == new)
179 return; 185 return;
180 186
181 new->cycle_last = 0; 187 clocksource_forward_now();
182 now = clocksource_read(new);
183 nsec = __get_nsec_offset();
184 timespec_add_ns(&xtime, nsec);
185 188
186 clock = new; 189 new->raw_time = clock->raw_time;
187 clock->cycle_last = now;
188 190
191 clock = new;
192 clock->cycle_last = 0;
193 clock->cycle_last = clocksource_read(new);
189 clock->error = 0; 194 clock->error = 0;
190 clock->xtime_nsec = 0; 195 clock->xtime_nsec = 0;
191 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 196 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -200,11 +205,44 @@ static void change_clocksource(void)
200 */ 205 */
201} 206}
202#else 207#else
208static inline void clocksource_forward_now(void) { }
203static inline void change_clocksource(void) { } 209static inline void change_clocksource(void) { }
204static inline s64 __get_nsec_offset(void) { return 0; }
205#endif 210#endif
206 211
207/** 212/**
213 * getrawmonotonic - Returns the raw monotonic time in a timespec
214 * @ts: pointer to the timespec to be set
215 *
216 * Returns the raw monotonic time (completely un-modified by ntp)
217 */
218void getrawmonotonic(struct timespec *ts)
219{
220 unsigned long seq;
221 s64 nsecs;
222 cycle_t cycle_now, cycle_delta;
223
224 do {
225 seq = read_seqbegin(&xtime_lock);
226
227 /* read clocksource: */
228 cycle_now = clocksource_read(clock);
229
230 /* calculate the delta since the last update_wall_time: */
231 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
232
233 /* convert to nanoseconds: */
234 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
235
236 *ts = clock->raw_time;
237
238 } while (read_seqretry(&xtime_lock, seq));
239
240 timespec_add_ns(ts, nsecs);
241}
242EXPORT_SYMBOL(getrawmonotonic);
243
244
245/**
208 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 246 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
209 */ 247 */
210int timekeeping_valid_for_hres(void) 248int timekeeping_valid_for_hres(void)
@@ -265,8 +303,6 @@ void __init timekeeping_init(void)
265static int timekeeping_suspended; 303static int timekeeping_suspended;
266/* time in seconds when suspend began */ 304/* time in seconds when suspend began */
267static unsigned long timekeeping_suspend_time; 305static unsigned long timekeeping_suspend_time;
268/* xtime offset when we went into suspend */
269static s64 timekeeping_suspend_nsecs;
270 306
271/** 307/**
272 * timekeeping_resume - Resumes the generic timekeeping subsystem. 308 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -292,8 +328,6 @@ static int timekeeping_resume(struct sys_device *dev)
292 wall_to_monotonic.tv_sec -= sleep_length; 328 wall_to_monotonic.tv_sec -= sleep_length;
293 total_sleep_time += sleep_length; 329 total_sleep_time += sleep_length;
294 } 330 }
295 /* Make sure that we have the correct xtime reference */
296 timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
297 update_xtime_cache(0); 331 update_xtime_cache(0);
298 /* re-base the last cycle value */ 332 /* re-base the last cycle value */
299 clock->cycle_last = 0; 333 clock->cycle_last = 0;
@@ -319,8 +353,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
319 timekeeping_suspend_time = read_persistent_clock(); 353 timekeeping_suspend_time = read_persistent_clock();
320 354
321 write_seqlock_irqsave(&xtime_lock, flags); 355 write_seqlock_irqsave(&xtime_lock, flags);
322 /* Get the current xtime offset */ 356 clocksource_forward_now();
323 timekeeping_suspend_nsecs = __get_nsec_offset();
324 timekeeping_suspended = 1; 357 timekeeping_suspended = 1;
325 write_sequnlock_irqrestore(&xtime_lock, flags); 358 write_sequnlock_irqrestore(&xtime_lock, flags);
326 359
@@ -454,23 +487,29 @@ void update_wall_time(void)
454#else 487#else
455 offset = clock->cycle_interval; 488 offset = clock->cycle_interval;
456#endif 489#endif
457 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; 490 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
458 491
459 /* normally this loop will run just once, however in the 492 /* normally this loop will run just once, however in the
460 * case of lost or late ticks, it will accumulate correctly. 493 * case of lost or late ticks, it will accumulate correctly.
461 */ 494 */
462 while (offset >= clock->cycle_interval) { 495 while (offset >= clock->cycle_interval) {
463 /* accumulate one interval */ 496 /* accumulate one interval */
464 clock->xtime_nsec += clock->xtime_interval;
465 clock->cycle_last += clock->cycle_interval;
466 offset -= clock->cycle_interval; 497 offset -= clock->cycle_interval;
498 clock->cycle_last += clock->cycle_interval;
467 499
500 clock->xtime_nsec += clock->xtime_interval;
468 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 501 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
469 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 502 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
470 xtime.tv_sec++; 503 xtime.tv_sec++;
471 second_overflow(); 504 second_overflow();
472 } 505 }
473 506
507 clock->raw_time.tv_nsec += clock->raw_interval;
508 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
509 clock->raw_time.tv_nsec -= NSEC_PER_SEC;
510 clock->raw_time.tv_sec++;
511 }
512
474 /* accumulate error between NTP and clock interval */ 513 /* accumulate error between NTP and clock interval */
475 clock->error += tick_length; 514 clock->error += tick_length;
476 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 515 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
@@ -479,9 +518,12 @@ void update_wall_time(void)
479 /* correct the clock when NTP error is too big */ 518 /* correct the clock when NTP error is too big */
480 clocksource_adjust(offset); 519 clocksource_adjust(offset);
481 520
482 /* store full nanoseconds into xtime */ 521 /* store full nanoseconds into xtime after rounding it up and
483 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; 522 * add the remainder to the error difference.
523 */
524 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
484 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 525 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
526 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
485 527
486 update_xtime_cache(cyc2ns(clock, offset)); 528 update_xtime_cache(cyc2ns(clock, offset));
487 529
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a40e20fd0001..a999b92a1277 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym)
47} 47}
48 48
49static void 49static void
50print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) 50print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
51 int idx, u64 now)
51{ 52{
52#ifdef CONFIG_TIMER_STATS 53#ifdef CONFIG_TIMER_STATS
53 char tmp[TASK_COMM_LEN + 1]; 54 char tmp[TASK_COMM_LEN + 1];
54#endif 55#endif
55 SEQ_printf(m, " #%d: ", idx); 56 SEQ_printf(m, " #%d: ", idx);
56 print_name_offset(m, timer); 57 print_name_offset(m, taddr);
57 SEQ_printf(m, ", "); 58 SEQ_printf(m, ", ");
58 print_name_offset(m, timer->function); 59 print_name_offset(m, timer->function);
59 SEQ_printf(m, ", S:%02lx", timer->state); 60 SEQ_printf(m, ", S:%02lx", timer->state);
@@ -65,9 +66,11 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); 66 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
66#endif 67#endif
67 SEQ_printf(m, "\n"); 68 SEQ_printf(m, "\n");
68 SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", 69 SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
69 (unsigned long long)ktime_to_ns(timer->expires), 70 (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
70 (long long)(ktime_to_ns(timer->expires) - now)); 71 (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
72 (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
73 (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
71} 74}
72 75
73static void 76static void
@@ -99,7 +102,7 @@ next_one:
99 tmp = *timer; 102 tmp = *timer;
100 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
101 104
102 print_timer(m, &tmp, i, now); 105 print_timer(m, timer, &tmp, i, now);
103 next++; 106 next++;
104 goto next_one; 107 goto next_one;
105 } 108 }
@@ -109,6 +112,7 @@ next_one:
109static void 112static void
110print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
111{ 114{
115 SEQ_printf(m, " .base: %p\n", base);
112 SEQ_printf(m, " .index: %d\n", 116 SEQ_printf(m, " .index: %d\n",
113 base->index); 117 base->index);
114 SEQ_printf(m, " .resolution: %Lu nsecs\n", 118 SEQ_printf(m, " .resolution: %Lu nsecs\n",
@@ -183,12 +187,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
183 187
184#ifdef CONFIG_GENERIC_CLOCKEVENTS 188#ifdef CONFIG_GENERIC_CLOCKEVENTS
185static void 189static void
186print_tickdevice(struct seq_file *m, struct tick_device *td) 190print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
187{ 191{
188 struct clock_event_device *dev = td->evtdev; 192 struct clock_event_device *dev = td->evtdev;
189 193
190 SEQ_printf(m, "\n"); 194 SEQ_printf(m, "\n");
191 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); 195 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
196 if (cpu < 0)
197 SEQ_printf(m, "Broadcast device\n");
198 else
199 SEQ_printf(m, "Per CPU device: %d\n", cpu);
192 200
193 SEQ_printf(m, "Clock Event Device: "); 201 SEQ_printf(m, "Clock Event Device: ");
194 if (!dev) { 202 if (!dev) {
@@ -222,7 +230,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
222 int cpu; 230 int cpu;
223 231
224#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 232#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
225 print_tickdevice(m, tick_get_broadcast_device()); 233 print_tickdevice(m, tick_get_broadcast_device(), -1);
226 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 234 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
227 tick_get_broadcast_mask()->bits[0]); 235 tick_get_broadcast_mask()->bits[0]);
228#ifdef CONFIG_TICK_ONESHOT 236#ifdef CONFIG_TICK_ONESHOT
@@ -232,7 +240,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
232 SEQ_printf(m, "\n"); 240 SEQ_printf(m, "\n");
233#endif 241#endif
234 for_each_online_cpu(cpu) 242 for_each_online_cpu(cpu)
235 print_tickdevice(m, tick_get_device(cpu)); 243 print_tickdevice(m, tick_get_device(cpu), cpu);
236 SEQ_printf(m, "\n"); 244 SEQ_printf(m, "\n");
237} 245}
238#else 246#else
@@ -244,7 +252,7 @@ static int timer_list_show(struct seq_file *m, void *v)
244 u64 now = ktime_to_ns(ktime_get()); 252 u64 now = ktime_to_ns(ktime_get());
245 int cpu; 253 int cpu;
246 254
247 SEQ_printf(m, "Timer List Version: v0.3\n"); 255 SEQ_printf(m, "Timer List Version: v0.4\n");
248 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 256 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
249 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 257 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
250 258
diff --git a/kernel/timer.c b/kernel/timer.c
index 03bc7f1f1593..dbd50fabe4c7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
112 tbase_get_deferrable(timer->base)); 112 tbase_get_deferrable(timer->base));
113} 113}
114 114
115/** 115static unsigned long round_jiffies_common(unsigned long j, int cpu,
116 * __round_jiffies - function to round jiffies to a full second 116 bool force_up)
117 * @j: the time in (absolute) jiffies that should be rounded
118 * @cpu: the processor number on which the timeout will happen
119 *
120 * __round_jiffies() rounds an absolute time in the future (in jiffies)
121 * up or down to (approximately) full seconds. This is useful for timers
122 * for which the exact time they fire does not matter too much, as long as
123 * they fire approximately every X seconds.
124 *
125 * By rounding these timers to whole seconds, all such timers will fire
126 * at the same time, rather than at various times spread out. The goal
127 * of this is to have the CPU wake up less, which saves power.
128 *
129 * The exact rounding is skewed for each processor to avoid all
130 * processors firing at the exact same time, which could lead
131 * to lock contention or spurious cache line bouncing.
132 *
133 * The return value is the rounded version of the @j parameter.
134 */
135unsigned long __round_jiffies(unsigned long j, int cpu)
136{ 117{
137 int rem; 118 int rem;
138 unsigned long original = j; 119 unsigned long original = j;
@@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
154 * due to delays of the timer irq, long irq off times etc etc) then 135 * due to delays of the timer irq, long irq off times etc etc) then
155 * we should round down to the whole second, not up. Use 1/4th second 136 * we should round down to the whole second, not up. Use 1/4th second
156 * as cutoff for this rounding as an extreme upper bound for this. 137 * as cutoff for this rounding as an extreme upper bound for this.
138 * But never round down if @force_up is set.
157 */ 139 */
158 if (rem < HZ/4) /* round down */ 140 if (rem < HZ/4 && !force_up) /* round down */
159 j = j - rem; 141 j = j - rem;
160 else /* round up */ 142 else /* round up */
161 j = j - rem + HZ; 143 j = j - rem + HZ;
@@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
167 return original; 149 return original;
168 return j; 150 return j;
169} 151}
152
153/**
154 * __round_jiffies - function to round jiffies to a full second
155 * @j: the time in (absolute) jiffies that should be rounded
156 * @cpu: the processor number on which the timeout will happen
157 *
158 * __round_jiffies() rounds an absolute time in the future (in jiffies)
159 * up or down to (approximately) full seconds. This is useful for timers
160 * for which the exact time they fire does not matter too much, as long as
161 * they fire approximately every X seconds.
162 *
163 * By rounding these timers to whole seconds, all such timers will fire
164 * at the same time, rather than at various times spread out. The goal
165 * of this is to have the CPU wake up less, which saves power.
166 *
167 * The exact rounding is skewed for each processor to avoid all
168 * processors firing at the exact same time, which could lead
169 * to lock contention or spurious cache line bouncing.
170 *
171 * The return value is the rounded version of the @j parameter.
172 */
173unsigned long __round_jiffies(unsigned long j, int cpu)
174{
175 return round_jiffies_common(j, cpu, false);
176}
170EXPORT_SYMBOL_GPL(__round_jiffies); 177EXPORT_SYMBOL_GPL(__round_jiffies);
171 178
172/** 179/**
@@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
191 */ 198 */
192unsigned long __round_jiffies_relative(unsigned long j, int cpu) 199unsigned long __round_jiffies_relative(unsigned long j, int cpu)
193{ 200{
194 /* 201 unsigned long j0 = jiffies;
195 * In theory the following code can skip a jiffy in case jiffies 202
196 * increments right between the addition and the later subtraction. 203 /* Use j0 because jiffies might change while we run */
197 * However since the entire point of this function is to use approximate 204 return round_jiffies_common(j + j0, cpu, false) - j0;
198 * timeouts, it's entirely ok to not handle that.
199 */
200 return __round_jiffies(j + jiffies, cpu) - jiffies;
201} 205}
202EXPORT_SYMBOL_GPL(__round_jiffies_relative); 206EXPORT_SYMBOL_GPL(__round_jiffies_relative);
203 207
@@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
218 */ 222 */
219unsigned long round_jiffies(unsigned long j) 223unsigned long round_jiffies(unsigned long j)
220{ 224{
221 return __round_jiffies(j, raw_smp_processor_id()); 225 return round_jiffies_common(j, raw_smp_processor_id(), false);
222} 226}
223EXPORT_SYMBOL_GPL(round_jiffies); 227EXPORT_SYMBOL_GPL(round_jiffies);
224 228
@@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j)
243} 247}
244EXPORT_SYMBOL_GPL(round_jiffies_relative); 248EXPORT_SYMBOL_GPL(round_jiffies_relative);
245 249
250/**
251 * __round_jiffies_up - function to round jiffies up to a full second
252 * @j: the time in (absolute) jiffies that should be rounded
253 * @cpu: the processor number on which the timeout will happen
254 *
255 * This is the same as __round_jiffies() except that it will never
256 * round down. This is useful for timeouts for which the exact time
257 * of firing does not matter too much, as long as they don't fire too
258 * early.
259 */
260unsigned long __round_jiffies_up(unsigned long j, int cpu)
261{
262 return round_jiffies_common(j, cpu, true);
263}
264EXPORT_SYMBOL_GPL(__round_jiffies_up);
265
266/**
267 * __round_jiffies_up_relative - function to round jiffies up to a full second
268 * @j: the time in (relative) jiffies that should be rounded
269 * @cpu: the processor number on which the timeout will happen
270 *
271 * This is the same as __round_jiffies_relative() except that it will never
272 * round down. This is useful for timeouts for which the exact time
273 * of firing does not matter too much, as long as they don't fire too
274 * early.
275 */
276unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
277{
278 unsigned long j0 = jiffies;
279
280 /* Use j0 because jiffies might change while we run */
281 return round_jiffies_common(j + j0, cpu, true) - j0;
282}
283EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
284
285/**
286 * round_jiffies_up - function to round jiffies up to a full second
287 * @j: the time in (absolute) jiffies that should be rounded
288 *
289 * This is the same as round_jiffies() except that it will never
290 * round down. This is useful for timeouts for which the exact time
291 * of firing does not matter too much, as long as they don't fire too
292 * early.
293 */
294unsigned long round_jiffies_up(unsigned long j)
295{
296 return round_jiffies_common(j, raw_smp_processor_id(), true);
297}
298EXPORT_SYMBOL_GPL(round_jiffies_up);
299
300/**
301 * round_jiffies_up_relative - function to round jiffies up to a full second
302 * @j: the time in (relative) jiffies that should be rounded
303 *
304 * This is the same as round_jiffies_relative() except that it will never
305 * round down. This is useful for timeouts for which the exact time
306 * of firing does not matter too much, as long as they don't fire too
307 * early.
308 */
309unsigned long round_jiffies_up_relative(unsigned long j)
310{
311 return __round_jiffies_up_relative(j, raw_smp_processor_id());
312}
313EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
314
246 315
247static inline void set_running_timer(struct tvec_base *base, 316static inline void set_running_timer(struct tvec_base *base,
248 struct timer_list *timer) 317 struct timer_list *timer)
@@ -978,6 +1047,7 @@ void update_process_times(int user_tick)
978 run_local_timers(); 1047 run_local_timers();
979 if (rcu_pending(cpu)) 1048 if (rcu_pending(cpu))
980 rcu_check_callbacks(cpu, user_tick); 1049 rcu_check_callbacks(cpu, user_tick);
1050 printk_tick();
981 scheduler_tick(); 1051 scheduler_tick();
982 run_posix_cpu_timers(p); 1052 run_posix_cpu_timers(p);
983} 1053}
@@ -1435,9 +1505,11 @@ static void __cpuinit migrate_timers(int cpu)
1435 BUG_ON(cpu_online(cpu)); 1505 BUG_ON(cpu_online(cpu));
1436 old_base = per_cpu(tvec_bases, cpu); 1506 old_base = per_cpu(tvec_bases, cpu);
1437 new_base = get_cpu_var(tvec_bases); 1507 new_base = get_cpu_var(tvec_bases);
1438 1508 /*
1439 local_irq_disable(); 1509 * The caller is globally serialized and nobody else
1440 spin_lock(&new_base->lock); 1510 * takes two locks at once, deadlock is not possible.
1511 */
1512 spin_lock_irq(&new_base->lock);
1441 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1513 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1442 1514
1443 BUG_ON(old_base->running_timer); 1515 BUG_ON(old_base->running_timer);
@@ -1452,8 +1524,7 @@ static void __cpuinit migrate_timers(int cpu)
1452 } 1524 }
1453 1525
1454 spin_unlock(&old_base->lock); 1526 spin_unlock(&old_base->lock);
1455 spin_unlock(&new_base->lock); 1527 spin_unlock_irq(&new_base->lock);
1456 local_irq_enable();
1457 put_cpu_var(tvec_bases); 1528 put_cpu_var(tvec_bases);
1458} 1529}
1459#endif /* CONFIG_HOTPLUG_CPU */ 1530#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 263e9e6bbd60..33dbefd471e8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,23 +1,40 @@
1# 1#
2# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: 2# Architectures that offer a FUNCTION_TRACER implementation should
3# select HAVE_FUNCTION_TRACER:
3# 4#
4config HAVE_FTRACE 5
6config NOP_TRACER
7 bool
8
9config HAVE_FUNCTION_TRACER
5 bool 10 bool
6 11
7config HAVE_DYNAMIC_FTRACE 12config HAVE_DYNAMIC_FTRACE
8 bool 13 bool
9 14
15config HAVE_FTRACE_MCOUNT_RECORD
16 bool
17
10config TRACER_MAX_TRACE 18config TRACER_MAX_TRACE
11 bool 19 bool
12 20
21config RING_BUFFER
22 bool
23
13config TRACING 24config TRACING
14 bool 25 bool
15 select DEBUG_FS 26 select DEBUG_FS
16 select STACKTRACE 27 select RING_BUFFER
28 select STACKTRACE if STACKTRACE_SUPPORT
29 select TRACEPOINTS
30 select NOP_TRACER
17 31
18config FTRACE 32menu "Tracers"
33
34config FUNCTION_TRACER
19 bool "Kernel Function Tracer" 35 bool "Kernel Function Tracer"
20 depends on HAVE_FTRACE 36 depends on HAVE_FUNCTION_TRACER
37 depends on DEBUG_KERNEL
21 select FRAME_POINTER 38 select FRAME_POINTER
22 select TRACING 39 select TRACING
23 select CONTEXT_SWITCH_TRACER 40 select CONTEXT_SWITCH_TRACER
@@ -35,7 +52,7 @@ config IRQSOFF_TRACER
35 default n 52 default n
36 depends on TRACE_IRQFLAGS_SUPPORT 53 depends on TRACE_IRQFLAGS_SUPPORT
37 depends on GENERIC_TIME 54 depends on GENERIC_TIME
38 depends on HAVE_FTRACE 55 depends on DEBUG_KERNEL
39 select TRACE_IRQFLAGS 56 select TRACE_IRQFLAGS
40 select TRACING 57 select TRACING
41 select TRACER_MAX_TRACE 58 select TRACER_MAX_TRACE
@@ -58,7 +75,7 @@ config PREEMPT_TRACER
58 default n 75 default n
59 depends on GENERIC_TIME 76 depends on GENERIC_TIME
60 depends on PREEMPT 77 depends on PREEMPT
61 depends on HAVE_FTRACE 78 depends on DEBUG_KERNEL
62 select TRACING 79 select TRACING
63 select TRACER_MAX_TRACE 80 select TRACER_MAX_TRACE
64 help 81 help
@@ -85,7 +102,7 @@ config SYSPROF_TRACER
85 102
86config SCHED_TRACER 103config SCHED_TRACER
87 bool "Scheduling Latency Tracer" 104 bool "Scheduling Latency Tracer"
88 depends on HAVE_FTRACE 105 depends on DEBUG_KERNEL
89 select TRACING 106 select TRACING
90 select CONTEXT_SWITCH_TRACER 107 select CONTEXT_SWITCH_TRACER
91 select TRACER_MAX_TRACE 108 select TRACER_MAX_TRACE
@@ -95,17 +112,56 @@ config SCHED_TRACER
95 112
96config CONTEXT_SWITCH_TRACER 113config CONTEXT_SWITCH_TRACER
97 bool "Trace process context switches" 114 bool "Trace process context switches"
98 depends on HAVE_FTRACE 115 depends on DEBUG_KERNEL
99 select TRACING 116 select TRACING
100 select MARKERS 117 select MARKERS
101 help 118 help
102 This tracer gets called from the context switch and records 119 This tracer gets called from the context switch and records
103 all switching of tasks. 120 all switching of tasks.
104 121
122config BOOT_TRACER
123 bool "Trace boot initcalls"
124 depends on DEBUG_KERNEL
125 select TRACING
126 select CONTEXT_SWITCH_TRACER
127 help
128 This tracer helps developers to optimize boot times: it records
129 the timings of the initcalls and traces key events and the identity
130 of tasks that can cause boot delays, such as context-switches.
131
132 Its aim is to be parsed by the /scripts/bootgraph.pl tool to
133 produce pretty graphics about boot inefficiencies, giving a visual
134 representation of the delays during initcalls - but the raw
135 /debug/tracing/trace text output is readable too.
136
137 ( Note that tracing self tests can't be enabled if this tracer is
138 selected, because the self-tests are an initcall as well and that
139 would invalidate the boot trace. )
140
141config STACK_TRACER
142 bool "Trace max stack"
143 depends on HAVE_FUNCTION_TRACER
144 depends on DEBUG_KERNEL
145 select FUNCTION_TRACER
146 select STACKTRACE
147 help
148 This special tracer records the maximum stack footprint of the
149 kernel and displays it in debugfs/tracing/stack_trace.
150
151 This tracer works by hooking into every function call that the
152 kernel executes, and keeping a maximum stack depth value and
153 stack-trace saved. Because this logic has to execute in every
154 kernel function, all the time, this option can slow down the
155 kernel measurably and is generally intended for kernel
156 developers only.
157
158 Say N if unsure.
159
105config DYNAMIC_FTRACE 160config DYNAMIC_FTRACE
106 bool "enable/disable ftrace tracepoints dynamically" 161 bool "enable/disable ftrace tracepoints dynamically"
107 depends on FTRACE 162 depends on FUNCTION_TRACER
108 depends on HAVE_DYNAMIC_FTRACE 163 depends on HAVE_DYNAMIC_FTRACE
164 depends on DEBUG_KERNEL
109 default y 165 default y
110 help 166 help
111 This option will modify all the calls to ftrace dynamically 167 This option will modify all the calls to ftrace dynamically
@@ -113,7 +169,7 @@ config DYNAMIC_FTRACE
113 with a No-Op instruction) as they are called. A table is 169 with a No-Op instruction) as they are called. A table is
114 created to dynamically enable them again. 170 created to dynamically enable them again.
115 171
116 This way a CONFIG_FTRACE kernel is slightly larger, but otherwise 172 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise
117 has native performance as long as no tracing is active. 173 has native performance as long as no tracing is active.
118 174
119 The changes to the code are done by a kernel thread that 175 The changes to the code are done by a kernel thread that
@@ -121,15 +177,22 @@ config DYNAMIC_FTRACE
121 were made. If so, it runs stop_machine (stops all CPUS) 177 were made. If so, it runs stop_machine (stops all CPUS)
122 and modifies the code to jump over the call to ftrace. 178 and modifies the code to jump over the call to ftrace.
123 179
180config FTRACE_MCOUNT_RECORD
181 def_bool y
182 depends on DYNAMIC_FTRACE
183 depends on HAVE_FTRACE_MCOUNT_RECORD
184
124config FTRACE_SELFTEST 185config FTRACE_SELFTEST
125 bool 186 bool
126 187
127config FTRACE_STARTUP_TEST 188config FTRACE_STARTUP_TEST
128 bool "Perform a startup test on ftrace" 189 bool "Perform a startup test on ftrace"
129 depends on TRACING 190 depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
130 select FTRACE_SELFTEST 191 select FTRACE_SELFTEST
131 help 192 help
132 This option performs a series of startup tests on ftrace. On bootup 193 This option performs a series of startup tests on ftrace. On bootup
133 a series of tests are made to verify that the tracer is 194 a series of tests are made to verify that the tracer is
134 functioning properly. It will do tests on all the configured 195 functioning properly. It will do tests on all the configured
135 tracers of ftrace. 196 tracers of ftrace.
197
198endmenu
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 71d17de17288..c8228b1a49e9 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,7 +1,7 @@
1 1
2# Do not instrument the tracer itself: 2# Do not instrument the tracer itself:
3 3
4ifdef CONFIG_FTRACE 4ifdef CONFIG_FUNCTION_TRACER
5ORIG_CFLAGS := $(KBUILD_CFLAGS) 5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) 6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
7 7
@@ -10,15 +10,19 @@ CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o 10obj-y += trace_selftest_dynamic.o
11endif 11endif
12 12
13obj-$(CONFIG_FTRACE) += libftrace.o 13obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
14obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
14 15
15obj-$(CONFIG_TRACING) += trace.o 16obj-$(CONFIG_TRACING) += trace.o
16obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 17obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
17obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o 18obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
18obj-$(CONFIG_FTRACE) += trace_functions.o 19obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
19obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 20obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
20obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 21obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
21obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o 22obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
23obj-$(CONFIG_NOP_TRACER) += trace_nop.o
24obj-$(CONFIG_STACK_TRACER) += trace_stack.o
22obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 25obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
26obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
23 27
24libftrace-y := ftrace.o 28libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f6e3af31b403..78db083390f0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -25,13 +25,24 @@
25#include <linux/ftrace.h> 25#include <linux/ftrace.h>
26#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/ctype.h> 27#include <linux/ctype.h>
28#include <linux/hash.h>
29#include <linux/list.h> 28#include <linux/list.h>
30 29
31#include <asm/ftrace.h> 30#include <asm/ftrace.h>
32 31
33#include "trace.h" 32#include "trace.h"
34 33
34#define FTRACE_WARN_ON(cond) \
35 do { \
36 if (WARN_ON(cond)) \
37 ftrace_kill(); \
38 } while (0)
39
40#define FTRACE_WARN_ON_ONCE(cond) \
41 do { \
42 if (WARN_ON_ONCE(cond)) \
43 ftrace_kill(); \
44 } while (0)
45
35/* ftrace_enabled is a method to turn ftrace on or off */ 46/* ftrace_enabled is a method to turn ftrace on or off */
36int ftrace_enabled __read_mostly; 47int ftrace_enabled __read_mostly;
37static int last_ftrace_enabled; 48static int last_ftrace_enabled;
@@ -81,7 +92,7 @@ void clear_ftrace_function(void)
81 92
82static int __register_ftrace_function(struct ftrace_ops *ops) 93static int __register_ftrace_function(struct ftrace_ops *ops)
83{ 94{
84 /* Should never be called by interrupts */ 95 /* should not be called from interrupt context */
85 spin_lock(&ftrace_lock); 96 spin_lock(&ftrace_lock);
86 97
87 ops->next = ftrace_list; 98 ops->next = ftrace_list;
@@ -115,6 +126,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
115 struct ftrace_ops **p; 126 struct ftrace_ops **p;
116 int ret = 0; 127 int ret = 0;
117 128
129 /* should not be called from interrupt context */
118 spin_lock(&ftrace_lock); 130 spin_lock(&ftrace_lock);
119 131
120 /* 132 /*
@@ -152,8 +164,17 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
152} 164}
153 165
154#ifdef CONFIG_DYNAMIC_FTRACE 166#ifdef CONFIG_DYNAMIC_FTRACE
167#ifndef CONFIG_FTRACE_MCOUNT_RECORD
168# error Dynamic ftrace depends on MCOUNT_RECORD
169#endif
155 170
156static struct task_struct *ftraced_task; 171/*
172 * Since MCOUNT_ADDR may point to mcount itself, we do not want
173 * to get it confused by reading a reference in the code as we
174 * are parsing on objcopy output of text. Use a variable for
175 * it instead.
176 */
177static unsigned long mcount_addr = MCOUNT_ADDR;
157 178
158enum { 179enum {
159 FTRACE_ENABLE_CALLS = (1 << 0), 180 FTRACE_ENABLE_CALLS = (1 << 0),
@@ -164,15 +185,9 @@ enum {
164}; 185};
165 186
166static int ftrace_filtered; 187static int ftrace_filtered;
167static int tracing_on;
168static int frozen_record_count;
169
170static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
171 188
172static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); 189static LIST_HEAD(ftrace_new_addrs);
173 190
174static DEFINE_SPINLOCK(ftrace_shutdown_lock);
175static DEFINE_MUTEX(ftraced_lock);
176static DEFINE_MUTEX(ftrace_regex_lock); 191static DEFINE_MUTEX(ftrace_regex_lock);
177 192
178struct ftrace_page { 193struct ftrace_page {
@@ -190,16 +205,13 @@ struct ftrace_page {
190static struct ftrace_page *ftrace_pages_start; 205static struct ftrace_page *ftrace_pages_start;
191static struct ftrace_page *ftrace_pages; 206static struct ftrace_page *ftrace_pages;
192 207
193static int ftraced_trigger;
194static int ftraced_suspend;
195static int ftraced_stop;
196
197static int ftrace_record_suspend;
198
199static struct dyn_ftrace *ftrace_free_records; 208static struct dyn_ftrace *ftrace_free_records;
200 209
201 210
202#ifdef CONFIG_KPROBES 211#ifdef CONFIG_KPROBES
212
213static int frozen_record_count;
214
203static inline void freeze_record(struct dyn_ftrace *rec) 215static inline void freeze_record(struct dyn_ftrace *rec)
204{ 216{
205 if (!(rec->flags & FTRACE_FL_FROZEN)) { 217 if (!(rec->flags & FTRACE_FL_FROZEN)) {
@@ -226,79 +238,36 @@ static inline int record_frozen(struct dyn_ftrace *rec)
226# define record_frozen(rec) ({ 0; }) 238# define record_frozen(rec) ({ 0; })
227#endif /* CONFIG_KPROBES */ 239#endif /* CONFIG_KPROBES */
228 240
229int skip_trace(unsigned long ip) 241static void ftrace_free_rec(struct dyn_ftrace *rec)
230{ 242{
231 unsigned long fl; 243 rec->ip = (unsigned long)ftrace_free_records;
232 struct dyn_ftrace *rec; 244 ftrace_free_records = rec;
233 struct hlist_node *t; 245 rec->flags |= FTRACE_FL_FREE;
234 struct hlist_head *head;
235
236 if (frozen_record_count == 0)
237 return 0;
238
239 head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)];
240 hlist_for_each_entry_rcu(rec, t, head, node) {
241 if (rec->ip == ip) {
242 if (record_frozen(rec)) {
243 if (rec->flags & FTRACE_FL_FAILED)
244 return 1;
245
246 if (!(rec->flags & FTRACE_FL_CONVERTED))
247 return 1;
248
249 if (!tracing_on || !ftrace_enabled)
250 return 1;
251
252 if (ftrace_filtered) {
253 fl = rec->flags & (FTRACE_FL_FILTER |
254 FTRACE_FL_NOTRACE);
255 if (!fl || (fl & FTRACE_FL_NOTRACE))
256 return 1;
257 }
258 }
259 break;
260 }
261 }
262
263 return 0;
264} 246}
265 247
266static inline int 248void ftrace_release(void *start, unsigned long size)
267ftrace_ip_in_hash(unsigned long ip, unsigned long key)
268{ 249{
269 struct dyn_ftrace *p; 250 struct dyn_ftrace *rec;
270 struct hlist_node *t; 251 struct ftrace_page *pg;
271 int found = 0; 252 unsigned long s = (unsigned long)start;
272 253 unsigned long e = s + size;
273 hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) { 254 int i;
274 if (p->ip == ip) {
275 found = 1;
276 break;
277 }
278 }
279
280 return found;
281}
282 255
283static inline void 256 if (ftrace_disabled || !start)
284ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) 257 return;
285{
286 hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
287}
288 258
289/* called from kstop_machine */ 259 /* should not be called from interrupt context */
290static inline void ftrace_del_hash(struct dyn_ftrace *node) 260 spin_lock(&ftrace_lock);
291{
292 hlist_del(&node->node);
293}
294 261
295static void ftrace_free_rec(struct dyn_ftrace *rec) 262 for (pg = ftrace_pages_start; pg; pg = pg->next) {
296{ 263 for (i = 0; i < pg->index; i++) {
297 /* no locking, only called from kstop_machine */ 264 rec = &pg->records[i];
298 265
299 rec->ip = (unsigned long)ftrace_free_records; 266 if ((rec->ip >= s) && (rec->ip < e))
300 ftrace_free_records = rec; 267 ftrace_free_rec(rec);
301 rec->flags |= FTRACE_FL_FREE; 268 }
269 }
270 spin_unlock(&ftrace_lock);
302} 271}
303 272
304static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 273static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
@@ -310,10 +279,8 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
310 rec = ftrace_free_records; 279 rec = ftrace_free_records;
311 280
312 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { 281 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
313 WARN_ON_ONCE(1); 282 FTRACE_WARN_ON_ONCE(1);
314 ftrace_free_records = NULL; 283 ftrace_free_records = NULL;
315 ftrace_disabled = 1;
316 ftrace_enabled = 0;
317 return NULL; 284 return NULL;
318 } 285 }
319 286
@@ -323,175 +290,125 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
323 } 290 }
324 291
325 if (ftrace_pages->index == ENTRIES_PER_PAGE) { 292 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
326 if (!ftrace_pages->next) 293 if (!ftrace_pages->next) {
327 return NULL; 294 /* allocate another page */
295 ftrace_pages->next =
296 (void *)get_zeroed_page(GFP_KERNEL);
297 if (!ftrace_pages->next)
298 return NULL;
299 }
328 ftrace_pages = ftrace_pages->next; 300 ftrace_pages = ftrace_pages->next;
329 } 301 }
330 302
331 return &ftrace_pages->records[ftrace_pages->index++]; 303 return &ftrace_pages->records[ftrace_pages->index++];
332} 304}
333 305
334static void 306static struct dyn_ftrace *
335ftrace_record_ip(unsigned long ip) 307ftrace_record_ip(unsigned long ip)
336{ 308{
337 struct dyn_ftrace *node; 309 struct dyn_ftrace *rec;
338 unsigned long flags;
339 unsigned long key;
340 int resched;
341 int atomic;
342 int cpu;
343 310
344 if (!ftrace_enabled || ftrace_disabled) 311 if (!ftrace_enabled || ftrace_disabled)
345 return; 312 return NULL;
346
347 resched = need_resched();
348 preempt_disable_notrace();
349
350 /*
351 * We simply need to protect against recursion.
352 * Use the the raw version of smp_processor_id and not
353 * __get_cpu_var which can call debug hooks that can
354 * cause a recursive crash here.
355 */
356 cpu = raw_smp_processor_id();
357 per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
358 if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
359 goto out;
360
361 if (unlikely(ftrace_record_suspend))
362 goto out;
363
364 key = hash_long(ip, FTRACE_HASHBITS);
365
366 WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
367 313
368 if (ftrace_ip_in_hash(ip, key)) 314 rec = ftrace_alloc_dyn_node(ip);
369 goto out; 315 if (!rec)
370 316 return NULL;
371 atomic = irqs_disabled();
372
373 spin_lock_irqsave(&ftrace_shutdown_lock, flags);
374
375 /* This ip may have hit the hash before the lock */
376 if (ftrace_ip_in_hash(ip, key))
377 goto out_unlock;
378
379 node = ftrace_alloc_dyn_node(ip);
380 if (!node)
381 goto out_unlock;
382
383 node->ip = ip;
384
385 ftrace_add_hash(node, key);
386 317
387 ftraced_trigger = 1; 318 rec->ip = ip;
388 319
389 out_unlock: 320 list_add(&rec->list, &ftrace_new_addrs);
390 spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
391 out:
392 per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
393 321
394 /* prevent recursion with scheduler */ 322 return rec;
395 if (resched)
396 preempt_enable_no_resched_notrace();
397 else
398 preempt_enable_notrace();
399} 323}
400 324
401#define FTRACE_ADDR ((long)(ftrace_caller)) 325#define FTRACE_ADDR ((long)(ftrace_caller))
402 326
403static int 327static int
404__ftrace_replace_code(struct dyn_ftrace *rec, 328__ftrace_replace_code(struct dyn_ftrace *rec,
405 unsigned char *old, unsigned char *new, int enable) 329 unsigned char *nop, int enable)
406{ 330{
407 unsigned long ip, fl; 331 unsigned long ip, fl;
332 unsigned char *call, *old, *new;
408 333
409 ip = rec->ip; 334 ip = rec->ip;
410 335
411 if (ftrace_filtered && enable) { 336 /*
337 * If this record is not to be traced and
338 * it is not enabled then do nothing.
339 *
340 * If this record is not to be traced and
341 * it is enabled then disable it.
342 *
343 */
344 if (rec->flags & FTRACE_FL_NOTRACE) {
345 if (rec->flags & FTRACE_FL_ENABLED)
346 rec->flags &= ~FTRACE_FL_ENABLED;
347 else
348 return 0;
349
350 } else if (ftrace_filtered && enable) {
412 /* 351 /*
413 * If filtering is on: 352 * Filtering is on:
414 *
415 * If this record is set to be filtered and
416 * is enabled then do nothing.
417 *
418 * If this record is set to be filtered and
419 * it is not enabled, enable it.
420 *
421 * If this record is not set to be filtered
422 * and it is not enabled do nothing.
423 *
424 * If this record is set not to trace then
425 * do nothing.
426 *
427 * If this record is set not to trace and
428 * it is enabled then disable it.
429 *
430 * If this record is not set to be filtered and
431 * it is enabled, disable it.
432 */ 353 */
433 354
434 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE | 355 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
435 FTRACE_FL_ENABLED);
436 356
437 if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || 357 /* Record is filtered and enabled, do nothing */
438 (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) || 358 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
439 !fl || (fl == FTRACE_FL_NOTRACE))
440 return 0; 359 return 0;
441 360
442 /* 361 /* Record is not filtered and is not enabled, do nothing */
443 * If it is enabled disable it, 362 if (!fl)
444 * otherwise enable it! 363 return 0;
445 */ 364
446 if (fl & FTRACE_FL_ENABLED) { 365 /* Record is not filtered but enabled, disable it */
447 /* swap new and old */ 366 if (fl == FTRACE_FL_ENABLED)
448 new = old;
449 old = ftrace_call_replace(ip, FTRACE_ADDR);
450 rec->flags &= ~FTRACE_FL_ENABLED; 367 rec->flags &= ~FTRACE_FL_ENABLED;
451 } else { 368 else
452 new = ftrace_call_replace(ip, FTRACE_ADDR); 369 /* Otherwise record is filtered but not enabled, enable it */
453 rec->flags |= FTRACE_FL_ENABLED; 370 rec->flags |= FTRACE_FL_ENABLED;
454 }
455 } else { 371 } else {
372 /* Disable or not filtered */
456 373
457 if (enable) { 374 if (enable) {
458 /* 375 /* if record is enabled, do nothing */
459 * If this record is set not to trace and is
460 * not enabled, do nothing.
461 */
462 fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
463 if (fl == FTRACE_FL_NOTRACE)
464 return 0;
465
466 new = ftrace_call_replace(ip, FTRACE_ADDR);
467 } else
468 old = ftrace_call_replace(ip, FTRACE_ADDR);
469
470 if (enable) {
471 if (rec->flags & FTRACE_FL_ENABLED) 376 if (rec->flags & FTRACE_FL_ENABLED)
472 return 0; 377 return 0;
378
473 rec->flags |= FTRACE_FL_ENABLED; 379 rec->flags |= FTRACE_FL_ENABLED;
380
474 } else { 381 } else {
382
383 /* if record is not enabled, do nothing */
475 if (!(rec->flags & FTRACE_FL_ENABLED)) 384 if (!(rec->flags & FTRACE_FL_ENABLED))
476 return 0; 385 return 0;
386
477 rec->flags &= ~FTRACE_FL_ENABLED; 387 rec->flags &= ~FTRACE_FL_ENABLED;
478 } 388 }
479 } 389 }
480 390
391 call = ftrace_call_replace(ip, FTRACE_ADDR);
392
393 if (rec->flags & FTRACE_FL_ENABLED) {
394 old = nop;
395 new = call;
396 } else {
397 old = call;
398 new = nop;
399 }
400
481 return ftrace_modify_code(ip, old, new); 401 return ftrace_modify_code(ip, old, new);
482} 402}
483 403
484static void ftrace_replace_code(int enable) 404static void ftrace_replace_code(int enable)
485{ 405{
486 int i, failed; 406 int i, failed;
487 unsigned char *new = NULL, *old = NULL; 407 unsigned char *nop = NULL;
488 struct dyn_ftrace *rec; 408 struct dyn_ftrace *rec;
489 struct ftrace_page *pg; 409 struct ftrace_page *pg;
490 410
491 if (enable) 411 nop = ftrace_nop_replace();
492 old = ftrace_nop_replace();
493 else
494 new = ftrace_nop_replace();
495 412
496 for (pg = ftrace_pages_start; pg; pg = pg->next) { 413 for (pg = ftrace_pages_start; pg; pg = pg->next) {
497 for (i = 0; i < pg->index; i++) { 414 for (i = 0; i < pg->index; i++) {
@@ -509,12 +426,11 @@ static void ftrace_replace_code(int enable)
509 unfreeze_record(rec); 426 unfreeze_record(rec);
510 } 427 }
511 428
512 failed = __ftrace_replace_code(rec, old, new, enable); 429 failed = __ftrace_replace_code(rec, nop, enable);
513 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 430 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
514 rec->flags |= FTRACE_FL_FAILED; 431 rec->flags |= FTRACE_FL_FAILED;
515 if ((system_state == SYSTEM_BOOTING) || 432 if ((system_state == SYSTEM_BOOTING) ||
516 !core_kernel_text(rec->ip)) { 433 !core_kernel_text(rec->ip)) {
517 ftrace_del_hash(rec);
518 ftrace_free_rec(rec); 434 ftrace_free_rec(rec);
519 } 435 }
520 } 436 }
@@ -522,13 +438,14 @@ static void ftrace_replace_code(int enable)
522 } 438 }
523} 439}
524 440
525static void ftrace_shutdown_replenish(void) 441static void print_ip_ins(const char *fmt, unsigned char *p)
526{ 442{
527 if (ftrace_pages->next) 443 int i;
528 return;
529 444
530 /* allocate another page */ 445 printk(KERN_CONT "%s", fmt);
531 ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); 446
447 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
448 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
532} 449}
533 450
534static int 451static int
@@ -536,52 +453,59 @@ ftrace_code_disable(struct dyn_ftrace *rec)
536{ 453{
537 unsigned long ip; 454 unsigned long ip;
538 unsigned char *nop, *call; 455 unsigned char *nop, *call;
539 int failed; 456 int ret;
540 457
541 ip = rec->ip; 458 ip = rec->ip;
542 459
543 nop = ftrace_nop_replace(); 460 nop = ftrace_nop_replace();
544 call = ftrace_call_replace(ip, MCOUNT_ADDR); 461 call = ftrace_call_replace(ip, mcount_addr);
462
463 ret = ftrace_modify_code(ip, call, nop);
464 if (ret) {
465 switch (ret) {
466 case -EFAULT:
467 FTRACE_WARN_ON_ONCE(1);
468 pr_info("ftrace faulted on modifying ");
469 print_ip_sym(ip);
470 break;
471 case -EINVAL:
472 FTRACE_WARN_ON_ONCE(1);
473 pr_info("ftrace failed to modify ");
474 print_ip_sym(ip);
475 print_ip_ins(" expected: ", call);
476 print_ip_ins(" actual: ", (unsigned char *)ip);
477 print_ip_ins(" replace: ", nop);
478 printk(KERN_CONT "\n");
479 break;
480 case -EPERM:
481 FTRACE_WARN_ON_ONCE(1);
482 pr_info("ftrace faulted on writing ");
483 print_ip_sym(ip);
484 break;
485 default:
486 FTRACE_WARN_ON_ONCE(1);
487 pr_info("ftrace faulted on unknown error ");
488 print_ip_sym(ip);
489 }
545 490
546 failed = ftrace_modify_code(ip, call, nop);
547 if (failed) {
548 rec->flags |= FTRACE_FL_FAILED; 491 rec->flags |= FTRACE_FL_FAILED;
549 return 0; 492 return 0;
550 } 493 }
551 return 1; 494 return 1;
552} 495}
553 496
554static int __ftrace_update_code(void *ignore);
555
556static int __ftrace_modify_code(void *data) 497static int __ftrace_modify_code(void *data)
557{ 498{
558 unsigned long addr;
559 int *command = data; 499 int *command = data;
560 500
561 if (*command & FTRACE_ENABLE_CALLS) { 501 if (*command & FTRACE_ENABLE_CALLS)
562 /*
563 * Update any recorded ips now that we have the
564 * machine stopped
565 */
566 __ftrace_update_code(NULL);
567 ftrace_replace_code(1); 502 ftrace_replace_code(1);
568 tracing_on = 1; 503 else if (*command & FTRACE_DISABLE_CALLS)
569 } else if (*command & FTRACE_DISABLE_CALLS) {
570 ftrace_replace_code(0); 504 ftrace_replace_code(0);
571 tracing_on = 0;
572 }
573 505
574 if (*command & FTRACE_UPDATE_TRACE_FUNC) 506 if (*command & FTRACE_UPDATE_TRACE_FUNC)
575 ftrace_update_ftrace_func(ftrace_trace_function); 507 ftrace_update_ftrace_func(ftrace_trace_function);
576 508
577 if (*command & FTRACE_ENABLE_MCOUNT) {
578 addr = (unsigned long)ftrace_record_ip;
579 ftrace_mcount_set(&addr);
580 } else if (*command & FTRACE_DISABLE_MCOUNT) {
581 addr = (unsigned long)ftrace_stub;
582 ftrace_mcount_set(&addr);
583 }
584
585 return 0; 509 return 0;
586} 510}
587 511
@@ -590,26 +514,9 @@ static void ftrace_run_update_code(int command)
590 stop_machine(__ftrace_modify_code, &command, NULL); 514 stop_machine(__ftrace_modify_code, &command, NULL);
591} 515}
592 516
593void ftrace_disable_daemon(void)
594{
595 /* Stop the daemon from calling kstop_machine */
596 mutex_lock(&ftraced_lock);
597 ftraced_stop = 1;
598 mutex_unlock(&ftraced_lock);
599
600 ftrace_force_update();
601}
602
603void ftrace_enable_daemon(void)
604{
605 mutex_lock(&ftraced_lock);
606 ftraced_stop = 0;
607 mutex_unlock(&ftraced_lock);
608
609 ftrace_force_update();
610}
611
612static ftrace_func_t saved_ftrace_func; 517static ftrace_func_t saved_ftrace_func;
518static int ftrace_start;
519static DEFINE_MUTEX(ftrace_start_lock);
613 520
614static void ftrace_startup(void) 521static void ftrace_startup(void)
615{ 522{
@@ -618,10 +525,9 @@ static void ftrace_startup(void)
618 if (unlikely(ftrace_disabled)) 525 if (unlikely(ftrace_disabled))
619 return; 526 return;
620 527
621 mutex_lock(&ftraced_lock); 528 mutex_lock(&ftrace_start_lock);
622 ftraced_suspend++; 529 ftrace_start++;
623 if (ftraced_suspend == 1) 530 command |= FTRACE_ENABLE_CALLS;
624 command |= FTRACE_ENABLE_CALLS;
625 531
626 if (saved_ftrace_func != ftrace_trace_function) { 532 if (saved_ftrace_func != ftrace_trace_function) {
627 saved_ftrace_func = ftrace_trace_function; 533 saved_ftrace_func = ftrace_trace_function;
@@ -633,7 +539,7 @@ static void ftrace_startup(void)
633 539
634 ftrace_run_update_code(command); 540 ftrace_run_update_code(command);
635 out: 541 out:
636 mutex_unlock(&ftraced_lock); 542 mutex_unlock(&ftrace_start_lock);
637} 543}
638 544
639static void ftrace_shutdown(void) 545static void ftrace_shutdown(void)
@@ -643,9 +549,9 @@ static void ftrace_shutdown(void)
643 if (unlikely(ftrace_disabled)) 549 if (unlikely(ftrace_disabled))
644 return; 550 return;
645 551
646 mutex_lock(&ftraced_lock); 552 mutex_lock(&ftrace_start_lock);
647 ftraced_suspend--; 553 ftrace_start--;
648 if (!ftraced_suspend) 554 if (!ftrace_start)
649 command |= FTRACE_DISABLE_CALLS; 555 command |= FTRACE_DISABLE_CALLS;
650 556
651 if (saved_ftrace_func != ftrace_trace_function) { 557 if (saved_ftrace_func != ftrace_trace_function) {
@@ -658,7 +564,7 @@ static void ftrace_shutdown(void)
658 564
659 ftrace_run_update_code(command); 565 ftrace_run_update_code(command);
660 out: 566 out:
661 mutex_unlock(&ftraced_lock); 567 mutex_unlock(&ftrace_start_lock);
662} 568}
663 569
664static void ftrace_startup_sysctl(void) 570static void ftrace_startup_sysctl(void)
@@ -668,15 +574,15 @@ static void ftrace_startup_sysctl(void)
668 if (unlikely(ftrace_disabled)) 574 if (unlikely(ftrace_disabled))
669 return; 575 return;
670 576
671 mutex_lock(&ftraced_lock); 577 mutex_lock(&ftrace_start_lock);
672 /* Force update next time */ 578 /* Force update next time */
673 saved_ftrace_func = NULL; 579 saved_ftrace_func = NULL;
674 /* ftraced_suspend is true if we want ftrace running */ 580 /* ftrace_start is true if we want ftrace running */
675 if (ftraced_suspend) 581 if (ftrace_start)
676 command |= FTRACE_ENABLE_CALLS; 582 command |= FTRACE_ENABLE_CALLS;
677 583
678 ftrace_run_update_code(command); 584 ftrace_run_update_code(command);
679 mutex_unlock(&ftraced_lock); 585 mutex_unlock(&ftrace_start_lock);
680} 586}
681 587
682static void ftrace_shutdown_sysctl(void) 588static void ftrace_shutdown_sysctl(void)
@@ -686,153 +592,51 @@ static void ftrace_shutdown_sysctl(void)
686 if (unlikely(ftrace_disabled)) 592 if (unlikely(ftrace_disabled))
687 return; 593 return;
688 594
689 mutex_lock(&ftraced_lock); 595 mutex_lock(&ftrace_start_lock);
690 /* ftraced_suspend is true if ftrace is running */ 596 /* ftrace_start is true if ftrace is running */
691 if (ftraced_suspend) 597 if (ftrace_start)
692 command |= FTRACE_DISABLE_CALLS; 598 command |= FTRACE_DISABLE_CALLS;
693 599
694 ftrace_run_update_code(command); 600 ftrace_run_update_code(command);
695 mutex_unlock(&ftraced_lock); 601 mutex_unlock(&ftrace_start_lock);
696} 602}
697 603
698static cycle_t ftrace_update_time; 604static cycle_t ftrace_update_time;
699static unsigned long ftrace_update_cnt; 605static unsigned long ftrace_update_cnt;
700unsigned long ftrace_update_tot_cnt; 606unsigned long ftrace_update_tot_cnt;
701 607
702static int __ftrace_update_code(void *ignore) 608static int ftrace_update_code(void)
703{ 609{
704 int i, save_ftrace_enabled; 610 struct dyn_ftrace *p, *t;
705 cycle_t start, stop; 611 cycle_t start, stop;
706 struct dyn_ftrace *p;
707 struct hlist_node *t, *n;
708 struct hlist_head *head, temp_list;
709
710 /* Don't be recording funcs now */
711 ftrace_record_suspend++;
712 save_ftrace_enabled = ftrace_enabled;
713 ftrace_enabled = 0;
714 612
715 start = ftrace_now(raw_smp_processor_id()); 613 start = ftrace_now(raw_smp_processor_id());
716 ftrace_update_cnt = 0; 614 ftrace_update_cnt = 0;
717 615
718 /* No locks needed, the machine is stopped! */ 616 list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) {
719 for (i = 0; i < FTRACE_HASHSIZE; i++) {
720 INIT_HLIST_HEAD(&temp_list);
721 head = &ftrace_hash[i];
722
723 /* all CPUS are stopped, we are safe to modify code */
724 hlist_for_each_entry_safe(p, t, n, head, node) {
725 /* Skip over failed records which have not been
726 * freed. */
727 if (p->flags & FTRACE_FL_FAILED)
728 continue;
729
730 /* Unconverted records are always at the head of the
731 * hash bucket. Once we encounter a converted record,
732 * simply skip over to the next bucket. Saves ftraced
733 * some processor cycles (ftrace does its bid for
734 * global warming :-p ). */
735 if (p->flags & (FTRACE_FL_CONVERTED))
736 break;
737 617
738 /* Ignore updates to this record's mcount site. 618 /* If something went wrong, bail without enabling anything */
739 * Reintroduce this record at the head of this 619 if (unlikely(ftrace_disabled))
740 * bucket to attempt to "convert" it again if 620 return -1;
741 * the kprobe on it is unregistered before the
742 * next run. */
743 if (get_kprobe((void *)p->ip)) {
744 ftrace_del_hash(p);
745 INIT_HLIST_NODE(&p->node);
746 hlist_add_head(&p->node, &temp_list);
747 freeze_record(p);
748 continue;
749 } else {
750 unfreeze_record(p);
751 }
752 621
753 /* convert record (i.e, patch mcount-call with NOP) */ 622 list_del_init(&p->list);
754 if (ftrace_code_disable(p)) {
755 p->flags |= FTRACE_FL_CONVERTED;
756 ftrace_update_cnt++;
757 } else {
758 if ((system_state == SYSTEM_BOOTING) ||
759 !core_kernel_text(p->ip)) {
760 ftrace_del_hash(p);
761 ftrace_free_rec(p);
762 }
763 }
764 }
765 623
766 hlist_for_each_entry_safe(p, t, n, &temp_list, node) { 624 /* convert record (i.e, patch mcount-call with NOP) */
767 hlist_del(&p->node); 625 if (ftrace_code_disable(p)) {
768 INIT_HLIST_NODE(&p->node); 626 p->flags |= FTRACE_FL_CONVERTED;
769 hlist_add_head(&p->node, head); 627 ftrace_update_cnt++;
770 } 628 } else
629 ftrace_free_rec(p);
771 } 630 }
772 631
773 stop = ftrace_now(raw_smp_processor_id()); 632 stop = ftrace_now(raw_smp_processor_id());
774 ftrace_update_time = stop - start; 633 ftrace_update_time = stop - start;
775 ftrace_update_tot_cnt += ftrace_update_cnt; 634 ftrace_update_tot_cnt += ftrace_update_cnt;
776 ftraced_trigger = 0;
777
778 ftrace_enabled = save_ftrace_enabled;
779 ftrace_record_suspend--;
780
781 return 0;
782}
783 635
784static int ftrace_update_code(void)
785{
786 if (unlikely(ftrace_disabled) ||
787 !ftrace_enabled || !ftraced_trigger)
788 return 0;
789
790 stop_machine(__ftrace_update_code, NULL, NULL);
791
792 return 1;
793}
794
795static int ftraced(void *ignore)
796{
797 unsigned long usecs;
798
799 while (!kthread_should_stop()) {
800
801 set_current_state(TASK_INTERRUPTIBLE);
802
803 /* check once a second */
804 schedule_timeout(HZ);
805
806 if (unlikely(ftrace_disabled))
807 continue;
808
809 mutex_lock(&ftrace_sysctl_lock);
810 mutex_lock(&ftraced_lock);
811 if (!ftraced_suspend && !ftraced_stop &&
812 ftrace_update_code()) {
813 usecs = nsecs_to_usecs(ftrace_update_time);
814 if (ftrace_update_tot_cnt > 100000) {
815 ftrace_update_tot_cnt = 0;
816 pr_info("hm, dftrace overflow: %lu change%s"
817 " (%lu total) in %lu usec%s\n",
818 ftrace_update_cnt,
819 ftrace_update_cnt != 1 ? "s" : "",
820 ftrace_update_tot_cnt,
821 usecs, usecs != 1 ? "s" : "");
822 ftrace_disabled = 1;
823 WARN_ON_ONCE(1);
824 }
825 }
826 mutex_unlock(&ftraced_lock);
827 mutex_unlock(&ftrace_sysctl_lock);
828
829 ftrace_shutdown_replenish();
830 }
831 __set_current_state(TASK_RUNNING);
832 return 0; 636 return 0;
833} 637}
834 638
835static int __init ftrace_dyn_table_alloc(void) 639static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
836{ 640{
837 struct ftrace_page *pg; 641 struct ftrace_page *pg;
838 int cnt; 642 int cnt;
@@ -859,7 +663,9 @@ static int __init ftrace_dyn_table_alloc(void)
859 663
860 pg = ftrace_pages = ftrace_pages_start; 664 pg = ftrace_pages = ftrace_pages_start;
861 665
862 cnt = NR_TO_INIT / ENTRIES_PER_PAGE; 666 cnt = num_to_init / ENTRIES_PER_PAGE;
667 pr_info("ftrace: allocating %ld entries in %d pages\n",
668 num_to_init, cnt + 1);
863 669
864 for (i = 0; i < cnt; i++) { 670 for (i = 0; i < cnt; i++) {
865 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 671 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -901,6 +707,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
901 707
902 (*pos)++; 708 (*pos)++;
903 709
710 /* should not be called from interrupt context */
711 spin_lock(&ftrace_lock);
904 retry: 712 retry:
905 if (iter->idx >= iter->pg->index) { 713 if (iter->idx >= iter->pg->index) {
906 if (iter->pg->next) { 714 if (iter->pg->next) {
@@ -910,12 +718,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
910 } 718 }
911 } else { 719 } else {
912 rec = &iter->pg->records[iter->idx++]; 720 rec = &iter->pg->records[iter->idx++];
913 if ((!(iter->flags & FTRACE_ITER_FAILURES) && 721 if ((rec->flags & FTRACE_FL_FREE) ||
722
723 (!(iter->flags & FTRACE_ITER_FAILURES) &&
914 (rec->flags & FTRACE_FL_FAILED)) || 724 (rec->flags & FTRACE_FL_FAILED)) ||
915 725
916 ((iter->flags & FTRACE_ITER_FAILURES) && 726 ((iter->flags & FTRACE_ITER_FAILURES) &&
917 (!(rec->flags & FTRACE_FL_FAILED) || 727 !(rec->flags & FTRACE_FL_FAILED)) ||
918 (rec->flags & FTRACE_FL_FREE))) ||
919 728
920 ((iter->flags & FTRACE_ITER_FILTER) && 729 ((iter->flags & FTRACE_ITER_FILTER) &&
921 !(rec->flags & FTRACE_FL_FILTER)) || 730 !(rec->flags & FTRACE_FL_FILTER)) ||
@@ -926,6 +735,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
926 goto retry; 735 goto retry;
927 } 736 }
928 } 737 }
738 spin_unlock(&ftrace_lock);
929 739
930 iter->pos = *pos; 740 iter->pos = *pos;
931 741
@@ -938,13 +748,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
938 void *p = NULL; 748 void *p = NULL;
939 loff_t l = -1; 749 loff_t l = -1;
940 750
941 if (*pos != iter->pos) { 751 if (*pos > iter->pos)
942 for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) 752 *pos = iter->pos;
943 ; 753
944 } else { 754 l = *pos;
945 l = *pos; 755 p = t_next(m, p, &l);
946 p = t_next(m, p, &l);
947 }
948 756
949 return p; 757 return p;
950} 758}
@@ -955,15 +763,21 @@ static void t_stop(struct seq_file *m, void *p)
955 763
956static int t_show(struct seq_file *m, void *v) 764static int t_show(struct seq_file *m, void *v)
957{ 765{
766 struct ftrace_iterator *iter = m->private;
958 struct dyn_ftrace *rec = v; 767 struct dyn_ftrace *rec = v;
959 char str[KSYM_SYMBOL_LEN]; 768 char str[KSYM_SYMBOL_LEN];
769 int ret = 0;
960 770
961 if (!rec) 771 if (!rec)
962 return 0; 772 return 0;
963 773
964 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 774 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
965 775
966 seq_printf(m, "%s\n", str); 776 ret = seq_printf(m, "%s\n", str);
777 if (ret < 0) {
778 iter->pos--;
779 iter->idx--;
780 }
967 781
968 return 0; 782 return 0;
969} 783}
@@ -989,7 +803,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
989 return -ENOMEM; 803 return -ENOMEM;
990 804
991 iter->pg = ftrace_pages_start; 805 iter->pg = ftrace_pages_start;
992 iter->pos = -1; 806 iter->pos = 0;
993 807
994 ret = seq_open(file, &show_ftrace_seq_ops); 808 ret = seq_open(file, &show_ftrace_seq_ops);
995 if (!ret) { 809 if (!ret) {
@@ -1039,8 +853,8 @@ static void ftrace_filter_reset(int enable)
1039 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 853 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1040 unsigned i; 854 unsigned i;
1041 855
1042 /* keep kstop machine from running */ 856 /* should not be called from interrupt context */
1043 preempt_disable(); 857 spin_lock(&ftrace_lock);
1044 if (enable) 858 if (enable)
1045 ftrace_filtered = 0; 859 ftrace_filtered = 0;
1046 pg = ftrace_pages_start; 860 pg = ftrace_pages_start;
@@ -1053,7 +867,7 @@ static void ftrace_filter_reset(int enable)
1053 } 867 }
1054 pg = pg->next; 868 pg = pg->next;
1055 } 869 }
1056 preempt_enable(); 870 spin_unlock(&ftrace_lock);
1057} 871}
1058 872
1059static int 873static int
@@ -1076,7 +890,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1076 890
1077 if (file->f_mode & FMODE_READ) { 891 if (file->f_mode & FMODE_READ) {
1078 iter->pg = ftrace_pages_start; 892 iter->pg = ftrace_pages_start;
1079 iter->pos = -1; 893 iter->pos = 0;
1080 iter->flags = enable ? FTRACE_ITER_FILTER : 894 iter->flags = enable ? FTRACE_ITER_FILTER :
1081 FTRACE_ITER_NOTRACE; 895 FTRACE_ITER_NOTRACE;
1082 896
@@ -1165,8 +979,8 @@ ftrace_match(unsigned char *buff, int len, int enable)
1165 } 979 }
1166 } 980 }
1167 981
1168 /* keep kstop machine from running */ 982 /* should not be called from interrupt context */
1169 preempt_disable(); 983 spin_lock(&ftrace_lock);
1170 if (enable) 984 if (enable)
1171 ftrace_filtered = 1; 985 ftrace_filtered = 1;
1172 pg = ftrace_pages_start; 986 pg = ftrace_pages_start;
@@ -1203,7 +1017,7 @@ ftrace_match(unsigned char *buff, int len, int enable)
1203 } 1017 }
1204 pg = pg->next; 1018 pg = pg->next;
1205 } 1019 }
1206 preempt_enable(); 1020 spin_unlock(&ftrace_lock);
1207} 1021}
1208 1022
1209static ssize_t 1023static ssize_t
@@ -1366,10 +1180,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1366 } 1180 }
1367 1181
1368 mutex_lock(&ftrace_sysctl_lock); 1182 mutex_lock(&ftrace_sysctl_lock);
1369 mutex_lock(&ftraced_lock); 1183 mutex_lock(&ftrace_start_lock);
1370 if (iter->filtered && ftraced_suspend && ftrace_enabled) 1184 if (ftrace_start && ftrace_enabled)
1371 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1185 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1372 mutex_unlock(&ftraced_lock); 1186 mutex_unlock(&ftrace_start_lock);
1373 mutex_unlock(&ftrace_sysctl_lock); 1187 mutex_unlock(&ftrace_sysctl_lock);
1374 1188
1375 kfree(iter); 1189 kfree(iter);
@@ -1389,55 +1203,6 @@ ftrace_notrace_release(struct inode *inode, struct file *file)
1389 return ftrace_regex_release(inode, file, 0); 1203 return ftrace_regex_release(inode, file, 0);
1390} 1204}
1391 1205
1392static ssize_t
1393ftraced_read(struct file *filp, char __user *ubuf,
1394 size_t cnt, loff_t *ppos)
1395{
1396 /* don't worry about races */
1397 char *buf = ftraced_stop ? "disabled\n" : "enabled\n";
1398 int r = strlen(buf);
1399
1400 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1401}
1402
1403static ssize_t
1404ftraced_write(struct file *filp, const char __user *ubuf,
1405 size_t cnt, loff_t *ppos)
1406{
1407 char buf[64];
1408 long val;
1409 int ret;
1410
1411 if (cnt >= sizeof(buf))
1412 return -EINVAL;
1413
1414 if (copy_from_user(&buf, ubuf, cnt))
1415 return -EFAULT;
1416
1417 if (strncmp(buf, "enable", 6) == 0)
1418 val = 1;
1419 else if (strncmp(buf, "disable", 7) == 0)
1420 val = 0;
1421 else {
1422 buf[cnt] = 0;
1423
1424 ret = strict_strtoul(buf, 10, &val);
1425 if (ret < 0)
1426 return ret;
1427
1428 val = !!val;
1429 }
1430
1431 if (val)
1432 ftrace_enable_daemon();
1433 else
1434 ftrace_disable_daemon();
1435
1436 filp->f_pos += cnt;
1437
1438 return cnt;
1439}
1440
1441static struct file_operations ftrace_avail_fops = { 1206static struct file_operations ftrace_avail_fops = {
1442 .open = ftrace_avail_open, 1207 .open = ftrace_avail_open,
1443 .read = seq_read, 1208 .read = seq_read,
@@ -1468,54 +1233,6 @@ static struct file_operations ftrace_notrace_fops = {
1468 .release = ftrace_notrace_release, 1233 .release = ftrace_notrace_release,
1469}; 1234};
1470 1235
1471static struct file_operations ftraced_fops = {
1472 .open = tracing_open_generic,
1473 .read = ftraced_read,
1474 .write = ftraced_write,
1475};
1476
1477/**
1478 * ftrace_force_update - force an update to all recording ftrace functions
1479 */
1480int ftrace_force_update(void)
1481{
1482 int ret = 0;
1483
1484 if (unlikely(ftrace_disabled))
1485 return -ENODEV;
1486
1487 mutex_lock(&ftrace_sysctl_lock);
1488 mutex_lock(&ftraced_lock);
1489
1490 /*
1491 * If ftraced_trigger is not set, then there is nothing
1492 * to update.
1493 */
1494 if (ftraced_trigger && !ftrace_update_code())
1495 ret = -EBUSY;
1496
1497 mutex_unlock(&ftraced_lock);
1498 mutex_unlock(&ftrace_sysctl_lock);
1499
1500 return ret;
1501}
1502
1503static void ftrace_force_shutdown(void)
1504{
1505 struct task_struct *task;
1506 int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
1507
1508 mutex_lock(&ftraced_lock);
1509 task = ftraced_task;
1510 ftraced_task = NULL;
1511 ftraced_suspend = -1;
1512 ftrace_run_update_code(command);
1513 mutex_unlock(&ftraced_lock);
1514
1515 if (task)
1516 kthread_stop(task);
1517}
1518
1519static __init int ftrace_init_debugfs(void) 1236static __init int ftrace_init_debugfs(void)
1520{ 1237{
1521 struct dentry *d_tracer; 1238 struct dentry *d_tracer;
@@ -1546,97 +1263,103 @@ static __init int ftrace_init_debugfs(void)
1546 pr_warning("Could not create debugfs " 1263 pr_warning("Could not create debugfs "
1547 "'set_ftrace_notrace' entry\n"); 1264 "'set_ftrace_notrace' entry\n");
1548 1265
1549 entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
1550 NULL, &ftraced_fops);
1551 if (!entry)
1552 pr_warning("Could not create debugfs "
1553 "'ftraced_enabled' entry\n");
1554 return 0; 1266 return 0;
1555} 1267}
1556 1268
1557fs_initcall(ftrace_init_debugfs); 1269fs_initcall(ftrace_init_debugfs);
1558 1270
1559static int __init ftrace_dynamic_init(void) 1271static int ftrace_convert_nops(unsigned long *start,
1272 unsigned long *end)
1560{ 1273{
1561 struct task_struct *p; 1274 unsigned long *p;
1562 unsigned long addr; 1275 unsigned long addr;
1276 unsigned long flags;
1277
1278 mutex_lock(&ftrace_start_lock);
1279 p = start;
1280 while (p < end) {
1281 addr = ftrace_call_adjust(*p++);
1282 ftrace_record_ip(addr);
1283 }
1284
1285 /* disable interrupts to prevent kstop machine */
1286 local_irq_save(flags);
1287 ftrace_update_code();
1288 local_irq_restore(flags);
1289 mutex_unlock(&ftrace_start_lock);
1290
1291 return 0;
1292}
1293
1294void ftrace_init_module(unsigned long *start, unsigned long *end)
1295{
1296 if (ftrace_disabled || start == end)
1297 return;
1298 ftrace_convert_nops(start, end);
1299}
1300
1301extern unsigned long __start_mcount_loc[];
1302extern unsigned long __stop_mcount_loc[];
1303
1304void __init ftrace_init(void)
1305{
1306 unsigned long count, addr, flags;
1563 int ret; 1307 int ret;
1564 1308
1565 addr = (unsigned long)ftrace_record_ip; 1309 /* Keep the ftrace pointer to the stub */
1310 addr = (unsigned long)ftrace_stub;
1566 1311
1567 stop_machine(ftrace_dyn_arch_init, &addr, NULL); 1312 local_irq_save(flags);
1313 ftrace_dyn_arch_init(&addr);
1314 local_irq_restore(flags);
1568 1315
1569 /* ftrace_dyn_arch_init places the return code in addr */ 1316 /* ftrace_dyn_arch_init places the return code in addr */
1570 if (addr) { 1317 if (addr)
1571 ret = (int)addr;
1572 goto failed; 1318 goto failed;
1573 }
1574 1319
1575 ret = ftrace_dyn_table_alloc(); 1320 count = __stop_mcount_loc - __start_mcount_loc;
1576 if (ret)
1577 goto failed;
1578 1321
1579 p = kthread_run(ftraced, NULL, "ftraced"); 1322 ret = ftrace_dyn_table_alloc(count);
1580 if (IS_ERR(p)) { 1323 if (ret)
1581 ret = -1;
1582 goto failed; 1324 goto failed;
1583 }
1584 1325
1585 last_ftrace_enabled = ftrace_enabled = 1; 1326 last_ftrace_enabled = ftrace_enabled = 1;
1586 ftraced_task = p;
1587 1327
1588 return 0; 1328 ret = ftrace_convert_nops(__start_mcount_loc,
1329 __stop_mcount_loc);
1589 1330
1331 return;
1590 failed: 1332 failed:
1591 ftrace_disabled = 1; 1333 ftrace_disabled = 1;
1592 return ret;
1593} 1334}
1594 1335
1595core_initcall(ftrace_dynamic_init);
1596#else 1336#else
1337
1338static int __init ftrace_nodyn_init(void)
1339{
1340 ftrace_enabled = 1;
1341 return 0;
1342}
1343device_initcall(ftrace_nodyn_init);
1344
1597# define ftrace_startup() do { } while (0) 1345# define ftrace_startup() do { } while (0)
1598# define ftrace_shutdown() do { } while (0) 1346# define ftrace_shutdown() do { } while (0)
1599# define ftrace_startup_sysctl() do { } while (0) 1347# define ftrace_startup_sysctl() do { } while (0)
1600# define ftrace_shutdown_sysctl() do { } while (0) 1348# define ftrace_shutdown_sysctl() do { } while (0)
1601# define ftrace_force_shutdown() do { } while (0)
1602#endif /* CONFIG_DYNAMIC_FTRACE */ 1349#endif /* CONFIG_DYNAMIC_FTRACE */
1603 1350
1604/** 1351/**
1605 * ftrace_kill_atomic - kill ftrace from critical sections 1352 * ftrace_kill - kill ftrace
1606 * 1353 *
1607 * This function should be used by panic code. It stops ftrace 1354 * This function should be used by panic code. It stops ftrace
1608 * but in a not so nice way. If you need to simply kill ftrace 1355 * but in a not so nice way. If you need to simply kill ftrace
1609 * from a non-atomic section, use ftrace_kill. 1356 * from a non-atomic section, use ftrace_kill.
1610 */ 1357 */
1611void ftrace_kill_atomic(void)
1612{
1613 ftrace_disabled = 1;
1614 ftrace_enabled = 0;
1615#ifdef CONFIG_DYNAMIC_FTRACE
1616 ftraced_suspend = -1;
1617#endif
1618 clear_ftrace_function();
1619}
1620
1621/**
1622 * ftrace_kill - totally shutdown ftrace
1623 *
1624 * This is a safety measure. If something was detected that seems
1625 * wrong, calling this function will keep ftrace from doing
1626 * any more modifications, and updates.
1627 * used when something went wrong.
1628 */
1629void ftrace_kill(void) 1358void ftrace_kill(void)
1630{ 1359{
1631 mutex_lock(&ftrace_sysctl_lock);
1632 ftrace_disabled = 1; 1360 ftrace_disabled = 1;
1633 ftrace_enabled = 0; 1361 ftrace_enabled = 0;
1634
1635 clear_ftrace_function(); 1362 clear_ftrace_function();
1636 mutex_unlock(&ftrace_sysctl_lock);
1637
1638 /* Try to totally disable ftrace */
1639 ftrace_force_shutdown();
1640} 1363}
1641 1364
1642/** 1365/**
@@ -1725,3 +1448,4 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1725 mutex_unlock(&ftrace_sysctl_lock); 1448 mutex_unlock(&ftrace_sysctl_lock);
1726 return ret; 1449 return ret;
1727} 1450}
1451
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
new file mode 100644
index 000000000000..f780e9552f91
--- /dev/null
+++ b/kernel/trace/ring_buffer.c
@@ -0,0 +1,2186 @@
1/*
2 * Generic ring buffer
3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/ring_buffer.h>
7#include <linux/spinlock.h>
8#include <linux/debugfs.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/percpu.h>
12#include <linux/mutex.h>
13#include <linux/sched.h> /* used for sched_clock() (for now) */
14#include <linux/init.h>
15#include <linux/hash.h>
16#include <linux/list.h>
17#include <linux/fs.h>
18
19#include "trace.h"
20
21/* Global flag to disable all recording to ring buffers */
22static int ring_buffers_off __read_mostly;
23
24/**
25 * tracing_on - enable all tracing buffers
26 *
27 * This function enables all tracing buffers that may have been
28 * disabled with tracing_off.
29 */
30void tracing_on(void)
31{
32 ring_buffers_off = 0;
33}
34
35/**
36 * tracing_off - turn off all tracing buffers
37 *
38 * This function stops all tracing buffers from recording data.
39 * It does not disable any overhead the tracers themselves may
40 * be causing. This function simply causes all recording to
41 * the ring buffers to fail.
42 */
43void tracing_off(void)
44{
45 ring_buffers_off = 1;
46}
47
48/* Up this if you want to test the TIME_EXTENTS and normalization */
49#define DEBUG_SHIFT 0
50
51/* FIXME!!! */
52u64 ring_buffer_time_stamp(int cpu)
53{
54 u64 time;
55
56 preempt_disable_notrace();
57 /* shift to debug/test normalization and TIME_EXTENTS */
58 time = sched_clock() << DEBUG_SHIFT;
59 preempt_enable_notrace();
60
61 return time;
62}
63
64void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
65{
66 /* Just stupid testing the normalize function and deltas */
67 *ts >>= DEBUG_SHIFT;
68}
69
70#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
71#define RB_ALIGNMENT_SHIFT 2
72#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
73#define RB_MAX_SMALL_DATA 28
74
75enum {
76 RB_LEN_TIME_EXTEND = 8,
77 RB_LEN_TIME_STAMP = 16,
78};
79
80/* inline for ring buffer fast paths */
81static inline unsigned
82rb_event_length(struct ring_buffer_event *event)
83{
84 unsigned length;
85
86 switch (event->type) {
87 case RINGBUF_TYPE_PADDING:
88 /* undefined */
89 return -1;
90
91 case RINGBUF_TYPE_TIME_EXTEND:
92 return RB_LEN_TIME_EXTEND;
93
94 case RINGBUF_TYPE_TIME_STAMP:
95 return RB_LEN_TIME_STAMP;
96
97 case RINGBUF_TYPE_DATA:
98 if (event->len)
99 length = event->len << RB_ALIGNMENT_SHIFT;
100 else
101 length = event->array[0];
102 return length + RB_EVNT_HDR_SIZE;
103 default:
104 BUG();
105 }
106 /* not hit */
107 return 0;
108}
109
/**
 * ring_buffer_event_length - return the length of the event
 * @event: the event to get the length of
 *
 * Non-inline wrapper around rb_event_length().
 */
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
	unsigned length = rb_event_length(event);

	return length;
}
118
119/* inline for ring buffer fast paths */
120static inline void *
121rb_event_data(struct ring_buffer_event *event)
122{
123 BUG_ON(event->type != RINGBUF_TYPE_DATA);
124 /* If length is in len field, then array[0] has the data */
125 if (event->len)
126 return (void *)&event->array[0];
127 /* Otherwise length is in array[0] and array[1] has the data */
128 return (void *)&event->array[1];
129}
130
/**
 * ring_buffer_event_data - return the data of the event
 * @event: the event to get the data from
 *
 * Non-inline wrapper around rb_event_data().
 */
void *ring_buffer_event_data(struct ring_buffer_event *event)
{
	void *data = rb_event_data(event);

	return data;
}
139
140#define for_each_buffer_cpu(buffer, cpu) \
141 for_each_cpu_mask(cpu, buffer->cpumask)
142
143#define TS_SHIFT 27
144#define TS_MASK ((1ULL << TS_SHIFT) - 1)
145#define TS_DELTA_TEST (~TS_MASK)
146
147/*
148 * This hack stolen from mm/slob.c.
149 * We can store per page timing information in the page frame of the page.
150 * Thanks to Peter Zijlstra for suggesting this idea.
151 */
152struct buffer_page {
153 u64 time_stamp; /* page time stamp */
154 local_t write; /* index for next write */
155 local_t commit; /* write committed index */
156 unsigned read; /* index for next read */
157 struct list_head list; /* list of free pages */
158 void *page; /* Actual data page */
159};
160
161/*
162 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
163 * this issue out.
164 */
165static inline void free_buffer_page(struct buffer_page *bpage)
166{
167 if (bpage->page)
168 free_page((unsigned long)bpage->page);
169 kfree(bpage);
170}
171
172/*
173 * We need to fit the time_stamp delta into 27 bits.
174 */
175static inline int test_time_stamp(u64 delta)
176{
177 if (delta & TS_DELTA_TEST)
178 return 1;
179 return 0;
180}
181
182#define BUF_PAGE_SIZE PAGE_SIZE
183
184/*
185 * head_page == tail_page && head == tail then buffer is empty.
186 */
187struct ring_buffer_per_cpu {
188 int cpu; /* CPU this buffer was allocated for */
189 struct ring_buffer *buffer; /* ring buffer this per-CPU buffer is part of */
190 spinlock_t lock;
191 struct lock_class_key lock_key;
192 struct list_head pages; /* list of buffer pages */
193 struct buffer_page *head_page; /* read from head */
194 struct buffer_page *tail_page; /* write to tail */
195 struct buffer_page *commit_page; /* committed pages */
196 struct buffer_page *reader_page; /* allocated separately from the pages list */
197 unsigned long overrun;
198 unsigned long entries;
199 u64 write_stamp;
200 u64 read_stamp;
201 atomic_t record_disabled; /* raised by RB_WARN_ON*() and while pages are added/removed */
202};
203
204struct ring_buffer {
205 unsigned long size;
206 unsigned pages; /* buffer pages per CPU (reader page not counted) */
207 unsigned flags; /* attributes passed to ring_buffer_alloc() */
208 int cpus; /* set to nr_cpu_ids at allocation */
209 cpumask_t cpumask; /* CPUs that have a per-CPU buffer */
210 atomic_t record_disabled;
211
212 struct mutex mutex; /* taken by ring_buffer_resize() */
213
214 struct ring_buffer_per_cpu **buffers; /* per-CPU buffers, indexed by cpu */
215};
216
217struct ring_buffer_iter {
218 struct ring_buffer_per_cpu *cpu_buffer;
219 unsigned long head;
220 struct buffer_page *head_page;
221 u64 read_stamp;
222};
223
/*
 * Sanity-check helpers. When @cond is true they bump
 * @buffer->record_disabled and issue a WARN_ON(1).
 *
 * @buffer may be any expression yielding a pointer to a structure with
 * an atomic_t record_disabled member; it is parenthesized in the
 * expansion so that non-trivial expressions bind correctly.
 */
#define RB_WARN_ON(buffer, cond)				\
	do {							\
		if (unlikely(cond)) {				\
			atomic_inc(&(buffer)->record_disabled);	\
			WARN_ON(1);				\
		}						\
	} while (0)

/* Same as RB_WARN_ON(), but also makes the calling function return -1. */
#define RB_WARN_ON_RET(buffer, cond)				\
	do {							\
		if (unlikely(cond)) {				\
			atomic_inc(&(buffer)->record_disabled);	\
			WARN_ON(1);				\
			return -1;				\
		}						\
	} while (0)

/* Same as RB_WARN_ON(), but acts at most once per expansion site. */
#define RB_WARN_ON_ONCE(buffer, cond)				\
	do {							\
		static int once;				\
		if (unlikely(cond) && !once) {			\
			once++;					\
			atomic_inc(&(buffer)->record_disabled);	\
			WARN_ON(1);				\
		}						\
	} while (0)
250
251/**
252 * check_pages - integrity check of buffer pages
253 * @cpu_buffer: CPU buffer with pages to test
254 *
255 * As a safty measure we check to make sure the data pages have not
256 * been corrupted.
257 */
258static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
259{
260 struct list_head *head = &cpu_buffer->pages;
261 struct buffer_page *page, *tmp;
262
263 RB_WARN_ON_RET(cpu_buffer, head->next->prev != head);
264 RB_WARN_ON_RET(cpu_buffer, head->prev->next != head);
265
266 list_for_each_entry_safe(page, tmp, head, list) {
267 RB_WARN_ON_RET(cpu_buffer,
268 page->list.next->prev != &page->list);
269 RB_WARN_ON_RET(cpu_buffer,
270 page->list.prev->next != &page->list);
271 }
272
273 return 0;
274}
275
276static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
277 unsigned nr_pages)
278{
279 struct list_head *head = &cpu_buffer->pages;
280 struct buffer_page *page, *tmp;
281 unsigned long addr;
282 LIST_HEAD(pages);
283 unsigned i;
284
285 for (i = 0; i < nr_pages; i++) {
286 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
287 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
288 if (!page)
289 goto free_pages;
290 list_add(&page->list, &pages);
291
292 addr = __get_free_page(GFP_KERNEL);
293 if (!addr)
294 goto free_pages;
295 page->page = (void *)addr;
296 }
297
298 list_splice(&pages, head);
299
300 rb_check_pages(cpu_buffer);
301
302 return 0;
303
304 free_pages:
305 list_for_each_entry_safe(page, tmp, &pages, list) {
306 list_del_init(&page->list);
307 free_buffer_page(page);
308 }
309 return -ENOMEM;
310}
311
312static struct ring_buffer_per_cpu *
313rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
314{
315 struct ring_buffer_per_cpu *cpu_buffer;
316 struct buffer_page *page;
317 unsigned long addr;
318 int ret;
319
320 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
321 GFP_KERNEL, cpu_to_node(cpu));
322 if (!cpu_buffer)
323 return NULL;
324
325 cpu_buffer->cpu = cpu;
326 cpu_buffer->buffer = buffer;
327 spin_lock_init(&cpu_buffer->lock);
328 INIT_LIST_HEAD(&cpu_buffer->pages);
329
330 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
331 GFP_KERNEL, cpu_to_node(cpu));
332 if (!page)
333 goto fail_free_buffer;
334
335 cpu_buffer->reader_page = page;
336 addr = __get_free_page(GFP_KERNEL);
337 if (!addr)
338 goto fail_free_reader;
339 page->page = (void *)addr;
340
341 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
342
343 ret = rb_allocate_pages(cpu_buffer, buffer->pages);
344 if (ret < 0)
345 goto fail_free_reader;
346
347 cpu_buffer->head_page
348 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
349 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
350
351 return cpu_buffer;
352
353 fail_free_reader:
354 free_buffer_page(cpu_buffer->reader_page);
355
356 fail_free_buffer:
357 kfree(cpu_buffer);
358 return NULL;
359}
360
361static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
362{
363 struct list_head *head = &cpu_buffer->pages;
364 struct buffer_page *page, *tmp;
365
366 list_del_init(&cpu_buffer->reader_page->list);
367 free_buffer_page(cpu_buffer->reader_page);
368
369 list_for_each_entry_safe(page, tmp, head, list) {
370 list_del_init(&page->list);
371 free_buffer_page(page);
372 }
373 kfree(cpu_buffer);
374}
375
376/*
377 * Causes compile errors if the struct buffer_page gets bigger
378 * than the struct page.
379 */
380extern int ring_buffer_page_too_big(void);
381
382/**
383 * ring_buffer_alloc - allocate a new ring_buffer
384 * @size: the size in bytes that is needed.
385 * @flags: attributes to set for the ring buffer.
386 *
387 * Currently the only flag that is available is the RB_FL_OVERWRITE
388 * flag. This flag means that the buffer will overwrite old data
389 * when the buffer wraps. If this flag is not set, the buffer will
390 * drop data when the tail hits the head.
391 */
392struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
393{
394 struct ring_buffer *buffer;
395 int bsize;
396 int cpu;
397
398 /* Paranoid! Optimizes out when all is well */
399 if (sizeof(struct buffer_page) > sizeof(struct page))
400 ring_buffer_page_too_big();
401
402
403 /* keep it in its own cache line */
404 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
405 GFP_KERNEL);
406 if (!buffer)
407 return NULL;
408
409 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
410 buffer->flags = flags;
411
412 /* need at least two pages */
413 if (buffer->pages == 1)
414 buffer->pages++;
415
416 buffer->cpumask = cpu_possible_map;
417 buffer->cpus = nr_cpu_ids;
418
419 bsize = sizeof(void *) * nr_cpu_ids;
420 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
421 GFP_KERNEL);
422 if (!buffer->buffers)
423 goto fail_free_buffer;
424
425 for_each_buffer_cpu(buffer, cpu) {
426 buffer->buffers[cpu] =
427 rb_allocate_cpu_buffer(buffer, cpu);
428 if (!buffer->buffers[cpu])
429 goto fail_free_buffers;
430 }
431
432 mutex_init(&buffer->mutex);
433
434 return buffer;
435
436 fail_free_buffers:
437 for_each_buffer_cpu(buffer, cpu) {
438 if (buffer->buffers[cpu])
439 rb_free_cpu_buffer(buffer->buffers[cpu]);
440 }
441 kfree(buffer->buffers);
442
443 fail_free_buffer:
444 kfree(buffer);
445 return NULL;
446}
447
448/**
449 * ring_buffer_free - free a ring buffer.
450 * @buffer: the buffer to free.
451 */
452void
453ring_buffer_free(struct ring_buffer *buffer)
454{
455 int cpu;
456
457 for_each_buffer_cpu(buffer, cpu)
458 rb_free_cpu_buffer(buffer->buffers[cpu]);
459
460 kfree(buffer);
461}
462
463static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
464
465static void
466rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
467{
468 struct buffer_page *page;
469 struct list_head *p;
470 unsigned i;
471
472 atomic_inc(&cpu_buffer->record_disabled);
473 synchronize_sched();
474
475 for (i = 0; i < nr_pages; i++) {
476 BUG_ON(list_empty(&cpu_buffer->pages));
477 p = cpu_buffer->pages.next;
478 page = list_entry(p, struct buffer_page, list);
479 list_del_init(&page->list);
480 free_buffer_page(page);
481 }
482 BUG_ON(list_empty(&cpu_buffer->pages));
483
484 rb_reset_cpu(cpu_buffer);
485
486 rb_check_pages(cpu_buffer);
487
488 atomic_dec(&cpu_buffer->record_disabled);
489
490}
491
492static void
493rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
494 struct list_head *pages, unsigned nr_pages)
495{
496 struct buffer_page *page;
497 struct list_head *p;
498 unsigned i;
499
500 atomic_inc(&cpu_buffer->record_disabled);
501 synchronize_sched();
502
503 for (i = 0; i < nr_pages; i++) {
504 BUG_ON(list_empty(pages));
505 p = pages->next;
506 page = list_entry(p, struct buffer_page, list);
507 list_del_init(&page->list);
508 list_add_tail(&page->list, &cpu_buffer->pages);
509 }
510 rb_reset_cpu(cpu_buffer);
511
512 rb_check_pages(cpu_buffer);
513
514 atomic_dec(&cpu_buffer->record_disabled);
515}
516
517/**
518 * ring_buffer_resize - resize the ring buffer
519 * @buffer: the buffer to resize.
520 * @size: the new size.
521 *
522 * The tracer is responsible for making sure that the buffer is
523 * not being used while changing the size.
524 * Note: We may be able to change the above requirement by using
525 * RCU synchronizations.
526 *
527 * Minimum size is 2 * BUF_PAGE_SIZE.
528 *
529 * Returns -1 on failure.
530 */
531int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
532{
533 struct ring_buffer_per_cpu *cpu_buffer;
534 unsigned nr_pages, rm_pages, new_pages;
535 struct buffer_page *page, *tmp;
536 unsigned long buffer_size;
537 unsigned long addr;
538 LIST_HEAD(pages);
539 int i, cpu;
540
541 /*
542 * Always succeed at resizing a non-existent buffer:
543 */
544 if (!buffer)
545 return size;
546
547 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
548 size *= BUF_PAGE_SIZE;
549 buffer_size = buffer->pages * BUF_PAGE_SIZE;
550
551 /* we need a minimum of two pages */
552 if (size < BUF_PAGE_SIZE * 2)
553 size = BUF_PAGE_SIZE * 2;
554
555 if (size == buffer_size)
556 return size;
557
558 mutex_lock(&buffer->mutex);
559
560 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
561
562 if (size < buffer_size) {
563
564 /* easy case, just free pages */
565 BUG_ON(nr_pages >= buffer->pages);
566
567 rm_pages = buffer->pages - nr_pages;
568
569 for_each_buffer_cpu(buffer, cpu) {
570 cpu_buffer = buffer->buffers[cpu];
571 rb_remove_pages(cpu_buffer, rm_pages);
572 }
573 goto out;
574 }
575
576 /*
577 * This is a bit more difficult. We only want to add pages
578 * when we can allocate enough for all CPUs. We do this
579 * by allocating all the pages and storing them on a local
580 * link list. If we succeed in our allocation, then we
581 * add these pages to the cpu_buffers. Otherwise we just free
582 * them all and return -ENOMEM;
583 */
584 BUG_ON(nr_pages <= buffer->pages);
585 new_pages = nr_pages - buffer->pages;
586
587 for_each_buffer_cpu(buffer, cpu) {
588 for (i = 0; i < new_pages; i++) {
589 page = kzalloc_node(ALIGN(sizeof(*page),
590 cache_line_size()),
591 GFP_KERNEL, cpu_to_node(cpu));
592 if (!page)
593 goto free_pages;
594 list_add(&page->list, &pages);
595 addr = __get_free_page(GFP_KERNEL);
596 if (!addr)
597 goto free_pages;
598 page->page = (void *)addr;
599 }
600 }
601
602 for_each_buffer_cpu(buffer, cpu) {
603 cpu_buffer = buffer->buffers[cpu];
604 rb_insert_pages(cpu_buffer, &pages, new_pages);
605 }
606
607 BUG_ON(!list_empty(&pages));
608
609 out:
610 buffer->pages = nr_pages;
611 mutex_unlock(&buffer->mutex);
612
613 return size;
614
615 free_pages:
616 list_for_each_entry_safe(page, tmp, &pages, list) {
617 list_del_init(&page->list);
618 free_buffer_page(page);
619 }
620 mutex_unlock(&buffer->mutex);
621 return -ENOMEM;
622}
623
624static inline int rb_null_event(struct ring_buffer_event *event)
625{
626 return event->type == RINGBUF_TYPE_PADDING;
627}
628
629static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
630{
631 return page->page + index;
632}
633
634static inline struct ring_buffer_event *
635rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
636{
637 return __rb_page_index(cpu_buffer->reader_page,
638 cpu_buffer->reader_page->read);
639}
640
641static inline struct ring_buffer_event *
642rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
643{
644 return __rb_page_index(cpu_buffer->head_page,
645 cpu_buffer->head_page->read);
646}
647
648static inline struct ring_buffer_event *
649rb_iter_head_event(struct ring_buffer_iter *iter)
650{
651 return __rb_page_index(iter->head_page, iter->head);
652}
653
654static inline unsigned rb_page_write(struct buffer_page *bpage)
655{
656 return local_read(&bpage->write);
657}
658
659static inline unsigned rb_page_commit(struct buffer_page *bpage)
660{
661 return local_read(&bpage->commit);
662}
663
/* The size of a page is whatever has been committed to it so far */
static inline unsigned rb_page_size(struct buffer_page *bpage)
{
	unsigned size = rb_page_commit(bpage);

	return size;
}
669
670static inline unsigned
671rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
672{
673 return rb_page_commit(cpu_buffer->commit_page);
674}
675
676static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
677{
678 return rb_page_commit(cpu_buffer->head_page);
679}
680
681/*
682 * When the tail hits the head and the buffer is in overwrite mode,
683 * the head jumps to the next page and all content on the previous
684 * page is discarded. But before doing so, we update the overrun
685 * variable of the buffer.
686 */
687static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
688{
689 struct ring_buffer_event *event;
690 unsigned long head;
691
692 for (head = 0; head < rb_head_size(cpu_buffer);
693 head += rb_event_length(event)) {
694
695 event = __rb_page_index(cpu_buffer->head_page, head);
696 BUG_ON(rb_null_event(event));
697 /* Only count data entries */
698 if (event->type != RINGBUF_TYPE_DATA)
699 continue;
700 cpu_buffer->overrun++;
701 cpu_buffer->entries--;
702 }
703}
704
705static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
706 struct buffer_page **page)
707{
708 struct list_head *p = (*page)->list.next;
709
710 if (p == &cpu_buffer->pages)
711 p = p->next;
712
713 *page = list_entry(p, struct buffer_page, list);
714}
715
716static inline unsigned
717rb_event_index(struct ring_buffer_event *event)
718{
719 unsigned long addr = (unsigned long)event;
720
721 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
722}
723
724static inline int
725rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
726 struct ring_buffer_event *event)
727{
728 unsigned long addr = (unsigned long)event;
729 unsigned long index;
730
731 index = rb_event_index(event);
732 addr &= PAGE_MASK;
733
734 return cpu_buffer->commit_page->page == (void *)addr &&
735 rb_commit_index(cpu_buffer) == index;
736}
737
738static inline void
739rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
740 struct ring_buffer_event *event)
741{
742 unsigned long addr = (unsigned long)event;
743 unsigned long index;
744
745 index = rb_event_index(event);
746 addr &= PAGE_MASK;
747
748 while (cpu_buffer->commit_page->page != (void *)addr) {
749 RB_WARN_ON(cpu_buffer,
750 cpu_buffer->commit_page == cpu_buffer->tail_page);
751 cpu_buffer->commit_page->commit =
752 cpu_buffer->commit_page->write;
753 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
754 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
755 }
756
757 /* Now set the commit to the event's index */
758 local_set(&cpu_buffer->commit_page->commit, index);
759}
760
761static inline void
762rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
763{
764 /*
765 * We only race with interrupts and NMIs on this CPU.
766 * If we own the commit event, then we can commit
767 * all others that interrupted us, since the interruptions
768 * are in stack format (they finish before they come
769 * back to us). This allows us to do a simple loop to
770 * assign the commit to the tail.
771 */
772 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
773 cpu_buffer->commit_page->commit =
774 cpu_buffer->commit_page->write;
775 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
776 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
777 /* add barrier to keep gcc from optimizing too much */
778 barrier();
779 }
780 while (rb_commit_index(cpu_buffer) !=
781 rb_page_write(cpu_buffer->commit_page)) {
782 cpu_buffer->commit_page->commit =
783 cpu_buffer->commit_page->write;
784 barrier();
785 }
786}
787
788static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
789{
790 cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp;
791 cpu_buffer->reader_page->read = 0;
792}
793
794static inline void rb_inc_iter(struct ring_buffer_iter *iter)
795{
796 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
797
798 /*
799 * The iterator could be on the reader page (it starts there).
800 * But the head could have moved, since the reader was
801 * found. Check for this case and assign the iterator
802 * to the head page instead of next.
803 */
804 if (iter->head_page == cpu_buffer->reader_page)
805 iter->head_page = cpu_buffer->head_page;
806 else
807 rb_inc_page(cpu_buffer, &iter->head_page);
808
809 iter->read_stamp = iter->head_page->time_stamp;
810 iter->head = 0;
811}
812
813/**
814 * ring_buffer_update_event - update event type and data
815 * @event: the even to update
816 * @type: the type of event
817 * @length: the size of the event field in the ring buffer
818 *
819 * Update the type and data fields of the event. The length
820 * is the actual size that is written to the ring buffer,
821 * and with this, we can determine what to place into the
822 * data field.
823 */
824static inline void
825rb_update_event(struct ring_buffer_event *event,
826 unsigned type, unsigned length)
827{
828 event->type = type;
829
830 switch (type) {
831
832 case RINGBUF_TYPE_PADDING:
833 break;
834
835 case RINGBUF_TYPE_TIME_EXTEND:
836 event->len =
837 (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
838 >> RB_ALIGNMENT_SHIFT;
839 break;
840
841 case RINGBUF_TYPE_TIME_STAMP:
842 event->len =
843 (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
844 >> RB_ALIGNMENT_SHIFT;
845 break;
846
847 case RINGBUF_TYPE_DATA:
848 length -= RB_EVNT_HDR_SIZE;
849 if (length > RB_MAX_SMALL_DATA) {
850 event->len = 0;
851 event->array[0] = length;
852 } else
853 event->len =
854 (length + (RB_ALIGNMENT-1))
855 >> RB_ALIGNMENT_SHIFT;
856 break;
857 default:
858 BUG();
859 }
860}
861
862static inline unsigned rb_calculate_event_length(unsigned length)
863{
864 struct ring_buffer_event event; /* Used only for sizeof array */
865
866 /* zero length can cause confusions */
867 if (!length)
868 length = 1;
869
870 if (length > RB_MAX_SMALL_DATA)
871 length += sizeof(event.array[0]);
872
873 length += RB_EVNT_HDR_SIZE;
874 length = ALIGN(length, RB_ALIGNMENT);
875
876 return length;
877}
878
879static struct ring_buffer_event *
880__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
881 unsigned type, unsigned long length, u64 *ts)
882{
883 struct buffer_page *tail_page, *head_page, *reader_page;
884 unsigned long tail, write;
885 struct ring_buffer *buffer = cpu_buffer->buffer;
886 struct ring_buffer_event *event;
887 unsigned long flags;
888
889 tail_page = cpu_buffer->tail_page;
890 write = local_add_return(length, &tail_page->write);
891 tail = write - length;
892
893 /* See if we shot pass the end of this buffer page */
894 if (write > BUF_PAGE_SIZE) {
895 struct buffer_page *next_page = tail_page;
896
897 spin_lock_irqsave(&cpu_buffer->lock, flags);
898
899 rb_inc_page(cpu_buffer, &next_page);
900
901 head_page = cpu_buffer->head_page;
902 reader_page = cpu_buffer->reader_page;
903
904 /* we grabbed the lock before incrementing */
905 RB_WARN_ON(cpu_buffer, next_page == reader_page);
906
907 /*
908 * If for some reason, we had an interrupt storm that made
909 * it all the way around the buffer, bail, and warn
910 * about it.
911 */
912 if (unlikely(next_page == cpu_buffer->commit_page)) {
913 WARN_ON_ONCE(1);
914 goto out_unlock;
915 }
916
917 if (next_page == head_page) {
918 if (!(buffer->flags & RB_FL_OVERWRITE)) {
919 /* reset write */
920 if (tail <= BUF_PAGE_SIZE)
921 local_set(&tail_page->write, tail);
922 goto out_unlock;
923 }
924
925 /* tail_page has not moved yet? */
926 if (tail_page == cpu_buffer->tail_page) {
927 /* count overflows */
928 rb_update_overflow(cpu_buffer);
929
930 rb_inc_page(cpu_buffer, &head_page);
931 cpu_buffer->head_page = head_page;
932 cpu_buffer->head_page->read = 0;
933 }
934 }
935
936 /*
937 * If the tail page is still the same as what we think
938 * it is, then it is up to us to update the tail
939 * pointer.
940 */
941 if (tail_page == cpu_buffer->tail_page) {
942 local_set(&next_page->write, 0);
943 local_set(&next_page->commit, 0);
944 cpu_buffer->tail_page = next_page;
945
946 /* reread the time stamp */
947 *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
948 cpu_buffer->tail_page->time_stamp = *ts;
949 }
950
951 /*
952 * The actual tail page has moved forward.
953 */
954 if (tail < BUF_PAGE_SIZE) {
955 /* Mark the rest of the page with padding */
956 event = __rb_page_index(tail_page, tail);
957 event->type = RINGBUF_TYPE_PADDING;
958 }
959
960 if (tail <= BUF_PAGE_SIZE)
961 /* Set the write back to the previous setting */
962 local_set(&tail_page->write, tail);
963
964 /*
965 * If this was a commit entry that failed,
966 * increment that too
967 */
968 if (tail_page == cpu_buffer->commit_page &&
969 tail == rb_commit_index(cpu_buffer)) {
970 rb_set_commit_to_write(cpu_buffer);
971 }
972
973 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
974
975 /* fail and let the caller try again */
976 return ERR_PTR(-EAGAIN);
977 }
978
979 /* We reserved something on the buffer */
980
981 BUG_ON(write > BUF_PAGE_SIZE);
982
983 event = __rb_page_index(tail_page, tail);
984 rb_update_event(event, type, length);
985
986 /*
987 * If this is a commit and the tail is zero, then update
988 * this page's time stamp.
989 */
990 if (!tail && rb_is_commit(cpu_buffer, event))
991 cpu_buffer->commit_page->time_stamp = *ts;
992
993 return event;
994
995 out_unlock:
996 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
997 return NULL;
998}
999
1000static int
1001rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1002 u64 *ts, u64 *delta)
1003{
1004 struct ring_buffer_event *event;
1005 static int once;
1006 int ret;
1007
1008 if (unlikely(*delta > (1ULL << 59) && !once++)) {
1009 printk(KERN_WARNING "Delta way too big! %llu"
1010 " ts=%llu write stamp = %llu\n",
1011 (unsigned long long)*delta,
1012 (unsigned long long)*ts,
1013 (unsigned long long)cpu_buffer->write_stamp);
1014 WARN_ON(1);
1015 }
1016
1017 /*
1018 * The delta is too big, we to add a
1019 * new timestamp.
1020 */
1021 event = __rb_reserve_next(cpu_buffer,
1022 RINGBUF_TYPE_TIME_EXTEND,
1023 RB_LEN_TIME_EXTEND,
1024 ts);
1025 if (!event)
1026 return -EBUSY;
1027
1028 if (PTR_ERR(event) == -EAGAIN)
1029 return -EAGAIN;
1030
1031 /* Only a commited time event can update the write stamp */
1032 if (rb_is_commit(cpu_buffer, event)) {
1033 /*
1034 * If this is the first on the page, then we need to
1035 * update the page itself, and just put in a zero.
1036 */
1037 if (rb_event_index(event)) {
1038 event->time_delta = *delta & TS_MASK;
1039 event->array[0] = *delta >> TS_SHIFT;
1040 } else {
1041 cpu_buffer->commit_page->time_stamp = *ts;
1042 event->time_delta = 0;
1043 event->array[0] = 0;
1044 }
1045 cpu_buffer->write_stamp = *ts;
1046 /* let the caller know this was the commit */
1047 ret = 1;
1048 } else {
1049 /* Darn, this is just wasted space */
1050 event->time_delta = 0;
1051 event->array[0] = 0;
1052 ret = 0;
1053 }
1054
1055 *delta = 0;
1056
1057 return ret;
1058}
1059
1060static struct ring_buffer_event *
1061rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1062 unsigned type, unsigned long length)
1063{
1064 struct ring_buffer_event *event;
1065 u64 ts, delta;
1066 int commit = 0;
1067 int nr_loops = 0;
1068
1069 again:
1070 /*
1071 * We allow for interrupts to reenter here and do a trace.
1072 * If one does, it will cause this original code to loop
1073 * back here. Even with heavy interrupts happening, this
1074 * should only happen a few times in a row. If this happens
1075 * 1000 times in a row, there must be either an interrupt
1076 * storm or we have something buggy.
1077 * Bail!
1078 */
1079 if (unlikely(++nr_loops > 1000)) {
1080 RB_WARN_ON(cpu_buffer, 1);
1081 return NULL;
1082 }
1083
1084 ts = ring_buffer_time_stamp(cpu_buffer->cpu);
1085
1086 /*
1087 * Only the first commit can update the timestamp.
1088 * Yes there is a race here. If an interrupt comes in
1089 * just after the conditional and it traces too, then it
1090 * will also check the deltas. More than one timestamp may
1091 * also be made. But only the entry that did the actual
1092 * commit will be something other than zero.
1093 */
1094 if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
1095 rb_page_write(cpu_buffer->tail_page) ==
1096 rb_commit_index(cpu_buffer)) {
1097
1098 delta = ts - cpu_buffer->write_stamp;
1099
1100 /* make sure this delta is calculated here */
1101 barrier();
1102
1103 /* Did the write stamp get updated already? */
1104 if (unlikely(ts < cpu_buffer->write_stamp))
1105 delta = 0;
1106
1107 if (test_time_stamp(delta)) {
1108
1109 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1110
1111 if (commit == -EBUSY)
1112 return NULL;
1113
1114 if (commit == -EAGAIN)
1115 goto again;
1116
1117 RB_WARN_ON(cpu_buffer, commit < 0);
1118 }
1119 } else
1120 /* Non commits have zero deltas */
1121 delta = 0;
1122
1123 event = __rb_reserve_next(cpu_buffer, type, length, &ts);
1124 if (PTR_ERR(event) == -EAGAIN)
1125 goto again;
1126
1127 if (!event) {
1128 if (unlikely(commit))
1129 /*
1130 * Ouch! We needed a timestamp and it was commited. But
1131 * we didn't get our event reserved.
1132 */
1133 rb_set_commit_to_write(cpu_buffer);
1134 return NULL;
1135 }
1136
1137 /*
1138 * If the timestamp was commited, make the commit our entry
1139 * now so that we will update it when needed.
1140 */
1141 if (commit)
1142 rb_set_commit_event(cpu_buffer, event);
1143 else if (!rb_is_commit(cpu_buffer, event))
1144 delta = 0;
1145
1146 event->time_delta = delta;
1147
1148 return event;
1149}
1150
1151static DEFINE_PER_CPU(int, rb_need_resched);
1152
1153/**
1154 * ring_buffer_lock_reserve - reserve a part of the buffer
1155 * @buffer: the ring buffer to reserve from
1156 * @length: the length of the data to reserve (excluding event header)
1157 * @flags: a pointer to save the interrupt flags
1158 *
1159 * Returns a reseverd event on the ring buffer to copy directly to.
1160 * The user of this interface will need to get the body to write into
1161 * and can use the ring_buffer_event_data() interface.
1162 *
1163 * The length is the length of the data needed, not the event length
1164 * which also includes the event header.
1165 *
1166 * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
1167 * If NULL is returned, then nothing has been allocated or locked.
1168 */
1169struct ring_buffer_event *
1170ring_buffer_lock_reserve(struct ring_buffer *buffer,
1171 unsigned long length,
1172 unsigned long *flags)
1173{
1174 struct ring_buffer_per_cpu *cpu_buffer;
1175 struct ring_buffer_event *event;
1176 int cpu, resched;
1177
1178 if (ring_buffers_off)
1179 return NULL;
1180
1181 if (atomic_read(&buffer->record_disabled))
1182 return NULL;
1183
1184 /* If we are tracing schedule, we don't want to recurse */
1185 resched = need_resched();
1186 preempt_disable_notrace();
1187
1188 cpu = raw_smp_processor_id();
1189
1190 if (!cpu_isset(cpu, buffer->cpumask))
1191 goto out;
1192
1193 cpu_buffer = buffer->buffers[cpu];
1194
1195 if (atomic_read(&cpu_buffer->record_disabled))
1196 goto out;
1197
1198 length = rb_calculate_event_length(length);
1199 if (length > BUF_PAGE_SIZE)
1200 goto out;
1201
1202 event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
1203 if (!event)
1204 goto out;
1205
1206 /*
1207 * Need to store resched state on this cpu.
1208 * Only the first needs to.
1209 */
1210
1211 if (preempt_count() == 1)
1212 per_cpu(rb_need_resched, cpu) = resched;
1213
1214 return event;
1215
1216 out:
1217 if (resched)
1218 preempt_enable_notrace();
1219 else
1220 preempt_enable_notrace();
1221 return NULL;
1222}
1223
1224static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1225 struct ring_buffer_event *event)
1226{
1227 cpu_buffer->entries++;
1228
1229 /* Only process further if we own the commit */
1230 if (!rb_is_commit(cpu_buffer, event))
1231 return;
1232
1233 cpu_buffer->write_stamp += event->time_delta;
1234
1235 rb_set_commit_to_write(cpu_buffer);
1236}
1237
1238/**
1239 * ring_buffer_unlock_commit - commit a reserved
1240 * @buffer: The buffer to commit to
1241 * @event: The event pointer to commit.
1242 * @flags: the interrupt flags received from ring_buffer_lock_reserve.
1243 *
1244 * This commits the data to the ring buffer, and releases any locks held.
1245 *
1246 * Must be paired with ring_buffer_lock_reserve.
1247 */
1248int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1249 struct ring_buffer_event *event,
1250 unsigned long flags)
1251{
1252 struct ring_buffer_per_cpu *cpu_buffer;
1253 int cpu = raw_smp_processor_id();
1254
1255 cpu_buffer = buffer->buffers[cpu];
1256
1257 rb_commit(cpu_buffer, event);
1258
1259 /*
1260 * Only the last preempt count needs to restore preemption.
1261 */
1262 if (preempt_count() == 1) {
1263 if (per_cpu(rb_need_resched, cpu))
1264 preempt_enable_no_resched_notrace();
1265 else
1266 preempt_enable_notrace();
1267 } else
1268 preempt_enable_no_resched_notrace();
1269
1270 return 0;
1271}
1272
1273/**
1274 * ring_buffer_write - write data to the buffer without reserving
1275 * @buffer: The ring buffer to write to.
1276 * @length: The length of the data being written (excluding the event header)
1277 * @data: The data to write to the buffer.
1278 *
1279 * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
1280 * one function. If you already have the data to write to the buffer, it
1281 * may be easier to simply call this function.
1282 *
1283 * Note, like ring_buffer_lock_reserve, the length is the length of the data
1284 * and not the length of the event which would hold the header.
1285 */
1286int ring_buffer_write(struct ring_buffer *buffer,
1287 unsigned long length,
1288 void *data)
1289{
1290 struct ring_buffer_per_cpu *cpu_buffer;
1291 struct ring_buffer_event *event;
1292 unsigned long event_length;
1293 void *body;
1294 int ret = -EBUSY;
1295 int cpu, resched;
1296
1297 if (ring_buffers_off)
1298 return -EBUSY;
1299
1300 if (atomic_read(&buffer->record_disabled))
1301 return -EBUSY;
1302
1303 resched = need_resched();
1304 preempt_disable_notrace();
1305
1306 cpu = raw_smp_processor_id();
1307
1308 if (!cpu_isset(cpu, buffer->cpumask))
1309 goto out;
1310
1311 cpu_buffer = buffer->buffers[cpu];
1312
1313 if (atomic_read(&cpu_buffer->record_disabled))
1314 goto out;
1315
1316 event_length = rb_calculate_event_length(length);
1317 event = rb_reserve_next_event(cpu_buffer,
1318 RINGBUF_TYPE_DATA, event_length);
1319 if (!event)
1320 goto out;
1321
1322 body = rb_event_data(event);
1323
1324 memcpy(body, data, length);
1325
1326 rb_commit(cpu_buffer, event);
1327
1328 ret = 0;
1329 out:
1330 if (resched)
1331 preempt_enable_no_resched_notrace();
1332 else
1333 preempt_enable_notrace();
1334
1335 return ret;
1336}
1337
1338static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1339{
1340 struct buffer_page *reader = cpu_buffer->reader_page;
1341 struct buffer_page *head = cpu_buffer->head_page;
1342 struct buffer_page *commit = cpu_buffer->commit_page;
1343
1344 return reader->read == rb_page_commit(reader) &&
1345 (commit == reader ||
1346 (commit == head &&
1347 head->read == rb_page_commit(commit)));
1348}
1349
1350/**
1351 * ring_buffer_record_disable - stop all writes into the buffer
1352 * @buffer: The ring buffer to stop writes to.
1353 *
1354 * This prevents all writes to the buffer. Any attempt to write
1355 * to the buffer after this will fail and return NULL.
1356 *
1357 * The caller should call synchronize_sched() after this.
1358 */
1359void ring_buffer_record_disable(struct ring_buffer *buffer)
1360{
1361 atomic_inc(&buffer->record_disabled);
1362}
1363
1364/**
1365 * ring_buffer_record_enable - enable writes to the buffer
1366 * @buffer: The ring buffer to enable writes
1367 *
1368 * Note, multiple disables will need the same number of enables
1369 * to truely enable the writing (much like preempt_disable).
1370 */
1371void ring_buffer_record_enable(struct ring_buffer *buffer)
1372{
1373 atomic_dec(&buffer->record_disabled);
1374}
1375
1376/**
1377 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
1378 * @buffer: The ring buffer to stop writes to.
1379 * @cpu: The CPU buffer to stop
1380 *
1381 * This prevents all writes to the buffer. Any attempt to write
1382 * to the buffer after this will fail and return NULL.
1383 *
1384 * The caller should call synchronize_sched() after this.
1385 */
1386void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
1387{
1388 struct ring_buffer_per_cpu *cpu_buffer;
1389
1390 if (!cpu_isset(cpu, buffer->cpumask))
1391 return;
1392
1393 cpu_buffer = buffer->buffers[cpu];
1394 atomic_inc(&cpu_buffer->record_disabled);
1395}
1396
1397/**
1398 * ring_buffer_record_enable_cpu - enable writes to the buffer
1399 * @buffer: The ring buffer to enable writes
1400 * @cpu: The CPU to enable.
1401 *
1402 * Note, multiple disables will need the same number of enables
1403 * to truely enable the writing (much like preempt_disable).
1404 */
1405void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
1406{
1407 struct ring_buffer_per_cpu *cpu_buffer;
1408
1409 if (!cpu_isset(cpu, buffer->cpumask))
1410 return;
1411
1412 cpu_buffer = buffer->buffers[cpu];
1413 atomic_dec(&cpu_buffer->record_disabled);
1414}
1415
1416/**
1417 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
1418 * @buffer: The ring buffer
1419 * @cpu: The per CPU buffer to get the entries from.
1420 */
1421unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1422{
1423 struct ring_buffer_per_cpu *cpu_buffer;
1424
1425 if (!cpu_isset(cpu, buffer->cpumask))
1426 return 0;
1427
1428 cpu_buffer = buffer->buffers[cpu];
1429 return cpu_buffer->entries;
1430}
1431
1432/**
1433 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
1434 * @buffer: The ring buffer
1435 * @cpu: The per CPU buffer to get the number of overruns from
1436 */
1437unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1438{
1439 struct ring_buffer_per_cpu *cpu_buffer;
1440
1441 if (!cpu_isset(cpu, buffer->cpumask))
1442 return 0;
1443
1444 cpu_buffer = buffer->buffers[cpu];
1445 return cpu_buffer->overrun;
1446}
1447
1448/**
1449 * ring_buffer_entries - get the number of entries in a buffer
1450 * @buffer: The ring buffer
1451 *
1452 * Returns the total number of entries in the ring buffer
1453 * (all CPU entries)
1454 */
1455unsigned long ring_buffer_entries(struct ring_buffer *buffer)
1456{
1457 struct ring_buffer_per_cpu *cpu_buffer;
1458 unsigned long entries = 0;
1459 int cpu;
1460
1461 /* if you care about this being correct, lock the buffer */
1462 for_each_buffer_cpu(buffer, cpu) {
1463 cpu_buffer = buffer->buffers[cpu];
1464 entries += cpu_buffer->entries;
1465 }
1466
1467 return entries;
1468}
1469
1470/**
1471 * ring_buffer_overrun_cpu - get the number of overruns in buffer
1472 * @buffer: The ring buffer
1473 *
1474 * Returns the total number of overruns in the ring buffer
1475 * (all CPU entries)
1476 */
1477unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
1478{
1479 struct ring_buffer_per_cpu *cpu_buffer;
1480 unsigned long overruns = 0;
1481 int cpu;
1482
1483 /* if you care about this being correct, lock the buffer */
1484 for_each_buffer_cpu(buffer, cpu) {
1485 cpu_buffer = buffer->buffers[cpu];
1486 overruns += cpu_buffer->overrun;
1487 }
1488
1489 return overruns;
1490}
1491
1492/**
1493 * ring_buffer_iter_reset - reset an iterator
1494 * @iter: The iterator to reset
1495 *
1496 * Resets the iterator, so that it will start from the beginning
1497 * again.
1498 */
1499void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1500{
1501 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1502
1503 /* Iterator usage is expected to have record disabled */
1504 if (list_empty(&cpu_buffer->reader_page->list)) {
1505 iter->head_page = cpu_buffer->head_page;
1506 iter->head = cpu_buffer->head_page->read;
1507 } else {
1508 iter->head_page = cpu_buffer->reader_page;
1509 iter->head = cpu_buffer->reader_page->read;
1510 }
1511 if (iter->head)
1512 iter->read_stamp = cpu_buffer->read_stamp;
1513 else
1514 iter->read_stamp = iter->head_page->time_stamp;
1515}
1516
1517/**
1518 * ring_buffer_iter_empty - check if an iterator has no more to read
1519 * @iter: The iterator to check
1520 */
1521int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
1522{
1523 struct ring_buffer_per_cpu *cpu_buffer;
1524
1525 cpu_buffer = iter->cpu_buffer;
1526
1527 return iter->head_page == cpu_buffer->commit_page &&
1528 iter->head == rb_commit_index(cpu_buffer);
1529}
1530
1531static void
1532rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1533 struct ring_buffer_event *event)
1534{
1535 u64 delta;
1536
1537 switch (event->type) {
1538 case RINGBUF_TYPE_PADDING:
1539 return;
1540
1541 case RINGBUF_TYPE_TIME_EXTEND:
1542 delta = event->array[0];
1543 delta <<= TS_SHIFT;
1544 delta += event->time_delta;
1545 cpu_buffer->read_stamp += delta;
1546 return;
1547
1548 case RINGBUF_TYPE_TIME_STAMP:
1549 /* FIXME: not implemented */
1550 return;
1551
1552 case RINGBUF_TYPE_DATA:
1553 cpu_buffer->read_stamp += event->time_delta;
1554 return;
1555
1556 default:
1557 BUG();
1558 }
1559 return;
1560}
1561
1562static void
1563rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
1564 struct ring_buffer_event *event)
1565{
1566 u64 delta;
1567
1568 switch (event->type) {
1569 case RINGBUF_TYPE_PADDING:
1570 return;
1571
1572 case RINGBUF_TYPE_TIME_EXTEND:
1573 delta = event->array[0];
1574 delta <<= TS_SHIFT;
1575 delta += event->time_delta;
1576 iter->read_stamp += delta;
1577 return;
1578
1579 case RINGBUF_TYPE_TIME_STAMP:
1580 /* FIXME: not implemented */
1581 return;
1582
1583 case RINGBUF_TYPE_DATA:
1584 iter->read_stamp += event->time_delta;
1585 return;
1586
1587 default:
1588 BUG();
1589 }
1590 return;
1591}
1592
1593static struct buffer_page *
1594rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1595{
1596 struct buffer_page *reader = NULL;
1597 unsigned long flags;
1598 int nr_loops = 0;
1599
1600 spin_lock_irqsave(&cpu_buffer->lock, flags);
1601
1602 again:
1603 /*
1604 * This should normally only loop twice. But because the
1605 * start of the reader inserts an empty page, it causes
1606 * a case where we will loop three times. There should be no
1607 * reason to loop four times (that I know of).
1608 */
1609 if (unlikely(++nr_loops > 3)) {
1610 RB_WARN_ON(cpu_buffer, 1);
1611 reader = NULL;
1612 goto out;
1613 }
1614
1615 reader = cpu_buffer->reader_page;
1616
1617 /* If there's more to read, return this page */
1618 if (cpu_buffer->reader_page->read < rb_page_size(reader))
1619 goto out;
1620
1621 /* Never should we have an index greater than the size */
1622 RB_WARN_ON(cpu_buffer,
1623 cpu_buffer->reader_page->read > rb_page_size(reader));
1624
1625 /* check if we caught up to the tail */
1626 reader = NULL;
1627 if (cpu_buffer->commit_page == cpu_buffer->reader_page)
1628 goto out;
1629
1630 /*
1631 * Splice the empty reader page into the list around the head.
1632 * Reset the reader page to size zero.
1633 */
1634
1635 reader = cpu_buffer->head_page;
1636 cpu_buffer->reader_page->list.next = reader->list.next;
1637 cpu_buffer->reader_page->list.prev = reader->list.prev;
1638
1639 local_set(&cpu_buffer->reader_page->write, 0);
1640 local_set(&cpu_buffer->reader_page->commit, 0);
1641
1642 /* Make the reader page now replace the head */
1643 reader->list.prev->next = &cpu_buffer->reader_page->list;
1644 reader->list.next->prev = &cpu_buffer->reader_page->list;
1645
1646 /*
1647 * If the tail is on the reader, then we must set the head
1648 * to the inserted page, otherwise we set it one before.
1649 */
1650 cpu_buffer->head_page = cpu_buffer->reader_page;
1651
1652 if (cpu_buffer->commit_page != reader)
1653 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
1654
1655 /* Finally update the reader page to the new head */
1656 cpu_buffer->reader_page = reader;
1657 rb_reset_reader_page(cpu_buffer);
1658
1659 goto again;
1660
1661 out:
1662 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
1663
1664 return reader;
1665}
1666
1667static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1668{
1669 struct ring_buffer_event *event;
1670 struct buffer_page *reader;
1671 unsigned length;
1672
1673 reader = rb_get_reader_page(cpu_buffer);
1674
1675 /* This function should not be called when buffer is empty */
1676 BUG_ON(!reader);
1677
1678 event = rb_reader_event(cpu_buffer);
1679
1680 if (event->type == RINGBUF_TYPE_DATA)
1681 cpu_buffer->entries--;
1682
1683 rb_update_read_stamp(cpu_buffer, event);
1684
1685 length = rb_event_length(event);
1686 cpu_buffer->reader_page->read += length;
1687}
1688
1689static void rb_advance_iter(struct ring_buffer_iter *iter)
1690{
1691 struct ring_buffer *buffer;
1692 struct ring_buffer_per_cpu *cpu_buffer;
1693 struct ring_buffer_event *event;
1694 unsigned length;
1695
1696 cpu_buffer = iter->cpu_buffer;
1697 buffer = cpu_buffer->buffer;
1698
1699 /*
1700 * Check if we are at the end of the buffer.
1701 */
1702 if (iter->head >= rb_page_size(iter->head_page)) {
1703 BUG_ON(iter->head_page == cpu_buffer->commit_page);
1704 rb_inc_iter(iter);
1705 return;
1706 }
1707
1708 event = rb_iter_head_event(iter);
1709
1710 length = rb_event_length(event);
1711
1712 /*
1713 * This should not be called to advance the header if we are
1714 * at the tail of the buffer.
1715 */
1716 BUG_ON((iter->head_page == cpu_buffer->commit_page) &&
1717 (iter->head + length > rb_commit_index(cpu_buffer)));
1718
1719 rb_update_iter_read_stamp(iter, event);
1720
1721 iter->head += length;
1722
1723 /* check for end of page padding */
1724 if ((iter->head >= rb_page_size(iter->head_page)) &&
1725 (iter->head_page != cpu_buffer->commit_page))
1726 rb_advance_iter(iter);
1727}
1728
1729/**
1730 * ring_buffer_peek - peek at the next event to be read
1731 * @buffer: The ring buffer to read
1732 * @cpu: The cpu to peak at
1733 * @ts: The timestamp counter of this event.
1734 *
1735 * This will return the event that will be read next, but does
1736 * not consume the data.
1737 */
1738struct ring_buffer_event *
1739ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1740{
1741 struct ring_buffer_per_cpu *cpu_buffer;
1742 struct ring_buffer_event *event;
1743 struct buffer_page *reader;
1744 int nr_loops = 0;
1745
1746 if (!cpu_isset(cpu, buffer->cpumask))
1747 return NULL;
1748
1749 cpu_buffer = buffer->buffers[cpu];
1750
1751 again:
1752 /*
1753 * We repeat when a timestamp is encountered. It is possible
1754 * to get multiple timestamps from an interrupt entering just
1755 * as one timestamp is about to be written. The max times
1756 * that this can happen is the number of nested interrupts we
1757 * can have. Nesting 10 deep of interrupts is clearly
1758 * an anomaly.
1759 */
1760 if (unlikely(++nr_loops > 10)) {
1761 RB_WARN_ON(cpu_buffer, 1);
1762 return NULL;
1763 }
1764
1765 reader = rb_get_reader_page(cpu_buffer);
1766 if (!reader)
1767 return NULL;
1768
1769 event = rb_reader_event(cpu_buffer);
1770
1771 switch (event->type) {
1772 case RINGBUF_TYPE_PADDING:
1773 RB_WARN_ON(cpu_buffer, 1);
1774 rb_advance_reader(cpu_buffer);
1775 return NULL;
1776
1777 case RINGBUF_TYPE_TIME_EXTEND:
1778 /* Internal data, OK to advance */
1779 rb_advance_reader(cpu_buffer);
1780 goto again;
1781
1782 case RINGBUF_TYPE_TIME_STAMP:
1783 /* FIXME: not implemented */
1784 rb_advance_reader(cpu_buffer);
1785 goto again;
1786
1787 case RINGBUF_TYPE_DATA:
1788 if (ts) {
1789 *ts = cpu_buffer->read_stamp + event->time_delta;
1790 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
1791 }
1792 return event;
1793
1794 default:
1795 BUG();
1796 }
1797
1798 return NULL;
1799}
1800
1801/**
1802 * ring_buffer_iter_peek - peek at the next event to be read
1803 * @iter: The ring buffer iterator
1804 * @ts: The timestamp counter of this event.
1805 *
1806 * This will return the event that will be read next, but does
1807 * not increment the iterator.
1808 */
1809struct ring_buffer_event *
1810ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1811{
1812 struct ring_buffer *buffer;
1813 struct ring_buffer_per_cpu *cpu_buffer;
1814 struct ring_buffer_event *event;
1815 int nr_loops = 0;
1816
1817 if (ring_buffer_iter_empty(iter))
1818 return NULL;
1819
1820 cpu_buffer = iter->cpu_buffer;
1821 buffer = cpu_buffer->buffer;
1822
1823 again:
1824 /*
1825 * We repeat when a timestamp is encountered. It is possible
1826 * to get multiple timestamps from an interrupt entering just
1827 * as one timestamp is about to be written. The max times
1828 * that this can happen is the number of nested interrupts we
1829 * can have. Nesting 10 deep of interrupts is clearly
1830 * an anomaly.
1831 */
1832 if (unlikely(++nr_loops > 10)) {
1833 RB_WARN_ON(cpu_buffer, 1);
1834 return NULL;
1835 }
1836
1837 if (rb_per_cpu_empty(cpu_buffer))
1838 return NULL;
1839
1840 event = rb_iter_head_event(iter);
1841
1842 switch (event->type) {
1843 case RINGBUF_TYPE_PADDING:
1844 rb_inc_iter(iter);
1845 goto again;
1846
1847 case RINGBUF_TYPE_TIME_EXTEND:
1848 /* Internal data, OK to advance */
1849 rb_advance_iter(iter);
1850 goto again;
1851
1852 case RINGBUF_TYPE_TIME_STAMP:
1853 /* FIXME: not implemented */
1854 rb_advance_iter(iter);
1855 goto again;
1856
1857 case RINGBUF_TYPE_DATA:
1858 if (ts) {
1859 *ts = iter->read_stamp + event->time_delta;
1860 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
1861 }
1862 return event;
1863
1864 default:
1865 BUG();
1866 }
1867
1868 return NULL;
1869}
1870
1871/**
1872 * ring_buffer_consume - return an event and consume it
1873 * @buffer: The ring buffer to get the next event from
1874 *
1875 * Returns the next event in the ring buffer, and that event is consumed.
1876 * Meaning, that sequential reads will keep returning a different event,
1877 * and eventually empty the ring buffer if the producer is slower.
1878 */
1879struct ring_buffer_event *
1880ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
1881{
1882 struct ring_buffer_per_cpu *cpu_buffer;
1883 struct ring_buffer_event *event;
1884
1885 if (!cpu_isset(cpu, buffer->cpumask))
1886 return NULL;
1887
1888 event = ring_buffer_peek(buffer, cpu, ts);
1889 if (!event)
1890 return NULL;
1891
1892 cpu_buffer = buffer->buffers[cpu];
1893 rb_advance_reader(cpu_buffer);
1894
1895 return event;
1896}
1897
1898/**
1899 * ring_buffer_read_start - start a non consuming read of the buffer
1900 * @buffer: The ring buffer to read from
1901 * @cpu: The cpu buffer to iterate over
1902 *
1903 * This starts up an iteration through the buffer. It also disables
1904 * the recording to the buffer until the reading is finished.
1905 * This prevents the reading from being corrupted. This is not
1906 * a consuming read, so a producer is not expected.
1907 *
1908 * Must be paired with ring_buffer_finish.
1909 */
1910struct ring_buffer_iter *
1911ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
1912{
1913 struct ring_buffer_per_cpu *cpu_buffer;
1914 struct ring_buffer_iter *iter;
1915 unsigned long flags;
1916
1917 if (!cpu_isset(cpu, buffer->cpumask))
1918 return NULL;
1919
1920 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
1921 if (!iter)
1922 return NULL;
1923
1924 cpu_buffer = buffer->buffers[cpu];
1925
1926 iter->cpu_buffer = cpu_buffer;
1927
1928 atomic_inc(&cpu_buffer->record_disabled);
1929 synchronize_sched();
1930
1931 spin_lock_irqsave(&cpu_buffer->lock, flags);
1932 ring_buffer_iter_reset(iter);
1933 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
1934
1935 return iter;
1936}
1937
1938/**
1939 * ring_buffer_finish - finish reading the iterator of the buffer
1940 * @iter: The iterator retrieved by ring_buffer_start
1941 *
1942 * This re-enables the recording to the buffer, and frees the
1943 * iterator.
1944 */
1945void
1946ring_buffer_read_finish(struct ring_buffer_iter *iter)
1947{
1948 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1949
1950 atomic_dec(&cpu_buffer->record_disabled);
1951 kfree(iter);
1952}
1953
1954/**
1955 * ring_buffer_read - read the next item in the ring buffer by the iterator
1956 * @iter: The ring buffer iterator
1957 * @ts: The time stamp of the event read.
1958 *
1959 * This reads the next event in the ring buffer and increments the iterator.
1960 */
1961struct ring_buffer_event *
1962ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
1963{
1964 struct ring_buffer_event *event;
1965
1966 event = ring_buffer_iter_peek(iter, ts);
1967 if (!event)
1968 return NULL;
1969
1970 rb_advance_iter(iter);
1971
1972 return event;
1973}
1974
1975/**
1976 * ring_buffer_size - return the size of the ring buffer (in bytes)
1977 * @buffer: The ring buffer.
1978 */
1979unsigned long ring_buffer_size(struct ring_buffer *buffer)
1980{
1981 return BUF_PAGE_SIZE * buffer->pages;
1982}
1983
1984static void
1985rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
1986{
1987 cpu_buffer->head_page
1988 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
1989 local_set(&cpu_buffer->head_page->write, 0);
1990 local_set(&cpu_buffer->head_page->commit, 0);
1991
1992 cpu_buffer->head_page->read = 0;
1993
1994 cpu_buffer->tail_page = cpu_buffer->head_page;
1995 cpu_buffer->commit_page = cpu_buffer->head_page;
1996
1997 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1998 local_set(&cpu_buffer->reader_page->write, 0);
1999 local_set(&cpu_buffer->reader_page->commit, 0);
2000 cpu_buffer->reader_page->read = 0;
2001
2002 cpu_buffer->overrun = 0;
2003 cpu_buffer->entries = 0;
2004}
2005
2006/**
2007 * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
2008 * @buffer: The ring buffer to reset a per cpu buffer of
2009 * @cpu: The CPU buffer to be reset
2010 */
2011void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2012{
2013 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2014 unsigned long flags;
2015
2016 if (!cpu_isset(cpu, buffer->cpumask))
2017 return;
2018
2019 spin_lock_irqsave(&cpu_buffer->lock, flags);
2020
2021 rb_reset_cpu(cpu_buffer);
2022
2023 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
2024}
2025
2026/**
2027 * ring_buffer_reset - reset a ring buffer
2028 * @buffer: The ring buffer to reset all cpu buffers
2029 */
2030void ring_buffer_reset(struct ring_buffer *buffer)
2031{
2032 int cpu;
2033
2034 for_each_buffer_cpu(buffer, cpu)
2035 ring_buffer_reset_cpu(buffer, cpu);
2036}
2037
2038/**
2039 * rind_buffer_empty - is the ring buffer empty?
2040 * @buffer: The ring buffer to test
2041 */
2042int ring_buffer_empty(struct ring_buffer *buffer)
2043{
2044 struct ring_buffer_per_cpu *cpu_buffer;
2045 int cpu;
2046
2047 /* yes this is racy, but if you don't like the race, lock the buffer */
2048 for_each_buffer_cpu(buffer, cpu) {
2049 cpu_buffer = buffer->buffers[cpu];
2050 if (!rb_per_cpu_empty(cpu_buffer))
2051 return 0;
2052 }
2053 return 1;
2054}
2055
2056/**
2057 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
2058 * @buffer: The ring buffer
2059 * @cpu: The CPU buffer to test
2060 */
2061int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2062{
2063 struct ring_buffer_per_cpu *cpu_buffer;
2064
2065 if (!cpu_isset(cpu, buffer->cpumask))
2066 return 1;
2067
2068 cpu_buffer = buffer->buffers[cpu];
2069 return rb_per_cpu_empty(cpu_buffer);
2070}
2071
2072/**
2073 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
2074 * @buffer_a: One buffer to swap with
2075 * @buffer_b: The other buffer to swap with
2076 *
2077 * This function is useful for tracers that want to take a "snapshot"
2078 * of a CPU buffer and has another back up buffer lying around.
2079 * it is expected that the tracer handles the cpu buffer not being
2080 * used at the moment.
2081 */
2082int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2083 struct ring_buffer *buffer_b, int cpu)
2084{
2085 struct ring_buffer_per_cpu *cpu_buffer_a;
2086 struct ring_buffer_per_cpu *cpu_buffer_b;
2087
2088 if (!cpu_isset(cpu, buffer_a->cpumask) ||
2089 !cpu_isset(cpu, buffer_b->cpumask))
2090 return -EINVAL;
2091
2092 /* At least make sure the two buffers are somewhat the same */
2093 if (buffer_a->size != buffer_b->size ||
2094 buffer_a->pages != buffer_b->pages)
2095 return -EINVAL;
2096
2097 cpu_buffer_a = buffer_a->buffers[cpu];
2098 cpu_buffer_b = buffer_b->buffers[cpu];
2099
2100 /*
2101 * We can't do a synchronize_sched here because this
2102 * function can be called in atomic context.
2103 * Normally this will be called from the same CPU as cpu.
2104 * If not it's up to the caller to protect this.
2105 */
2106 atomic_inc(&cpu_buffer_a->record_disabled);
2107 atomic_inc(&cpu_buffer_b->record_disabled);
2108
2109 buffer_a->buffers[cpu] = cpu_buffer_b;
2110 buffer_b->buffers[cpu] = cpu_buffer_a;
2111
2112 cpu_buffer_b->buffer = buffer_a;
2113 cpu_buffer_a->buffer = buffer_b;
2114
2115 atomic_dec(&cpu_buffer_a->record_disabled);
2116 atomic_dec(&cpu_buffer_b->record_disabled);
2117
2118 return 0;
2119}
2120
2121static ssize_t
2122rb_simple_read(struct file *filp, char __user *ubuf,
2123 size_t cnt, loff_t *ppos)
2124{
2125 int *p = filp->private_data;
2126 char buf[64];
2127 int r;
2128
2129 /* !ring_buffers_off == tracing_on */
2130 r = sprintf(buf, "%d\n", !*p);
2131
2132 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2133}
2134
2135static ssize_t
2136rb_simple_write(struct file *filp, const char __user *ubuf,
2137 size_t cnt, loff_t *ppos)
2138{
2139 int *p = filp->private_data;
2140 char buf[64];
2141 long val;
2142 int ret;
2143
2144 if (cnt >= sizeof(buf))
2145 return -EINVAL;
2146
2147 if (copy_from_user(&buf, ubuf, cnt))
2148 return -EFAULT;
2149
2150 buf[cnt] = 0;
2151
2152 ret = strict_strtoul(buf, 10, &val);
2153 if (ret < 0)
2154 return ret;
2155
2156 /* !ring_buffers_off == tracing_on */
2157 *p = !val;
2158
2159 (*ppos)++;
2160
2161 return cnt;
2162}
2163
2164static struct file_operations rb_simple_fops = {
2165 .open = tracing_open_generic,
2166 .read = rb_simple_read,
2167 .write = rb_simple_write,
2168};
2169
2170
2171static __init int rb_init_debugfs(void)
2172{
2173 struct dentry *d_tracer;
2174 struct dentry *entry;
2175
2176 d_tracer = tracing_init_dentry();
2177
2178 entry = debugfs_create_file("tracing_on", 0644, d_tracer,
2179 &ring_buffers_off, &rb_simple_fops);
2180 if (!entry)
2181 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2182
2183 return 0;
2184}
2185
2186fs_initcall(rb_init_debugfs);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8f3fb3db61c3..d86e3252f300 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -14,6 +14,7 @@
14#include <linux/utsrelease.h> 14#include <linux/utsrelease.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/notifier.h>
17#include <linux/debugfs.h> 18#include <linux/debugfs.h>
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/hardirq.h> 20#include <linux/hardirq.h>
@@ -22,6 +23,7 @@
22#include <linux/ftrace.h> 23#include <linux/ftrace.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/kdebug.h>
25#include <linux/ctype.h> 27#include <linux/ctype.h>
26#include <linux/init.h> 28#include <linux/init.h>
27#include <linux/poll.h> 29#include <linux/poll.h>
@@ -31,25 +33,37 @@
31#include <linux/writeback.h> 33#include <linux/writeback.h>
32 34
33#include <linux/stacktrace.h> 35#include <linux/stacktrace.h>
36#include <linux/ring_buffer.h>
37#include <linux/irqflags.h>
34 38
35#include "trace.h" 39#include "trace.h"
36 40
41#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
42
37unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
38unsigned long __read_mostly tracing_thresh; 44unsigned long __read_mostly tracing_thresh;
39 45
40static unsigned long __read_mostly tracing_nr_buffers; 46static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
47
48static inline void ftrace_disable_cpu(void)
49{
50 preempt_disable();
51 local_inc(&__get_cpu_var(ftrace_cpu_disabled));
52}
53
54static inline void ftrace_enable_cpu(void)
55{
56 local_dec(&__get_cpu_var(ftrace_cpu_disabled));
57 preempt_enable();
58}
59
41static cpumask_t __read_mostly tracing_buffer_mask; 60static cpumask_t __read_mostly tracing_buffer_mask;
42 61
43#define for_each_tracing_cpu(cpu) \ 62#define for_each_tracing_cpu(cpu) \
44 for_each_cpu_mask(cpu, tracing_buffer_mask) 63 for_each_cpu_mask(cpu, tracing_buffer_mask)
45 64
46static int trace_alloc_page(void);
47static int trace_free_page(void);
48
49static int tracing_disabled = 1; 65static int tracing_disabled = 1;
50 66
51static unsigned long tracing_pages_allocated;
52
53long 67long
54ns2usecs(cycle_t nsec) 68ns2usecs(cycle_t nsec)
55{ 69{
@@ -60,7 +74,9 @@ ns2usecs(cycle_t nsec)
60 74
61cycle_t ftrace_now(int cpu) 75cycle_t ftrace_now(int cpu)
62{ 76{
63 return cpu_clock(cpu); 77 u64 ts = ring_buffer_time_stamp(cpu);
78 ring_buffer_normalize_time_stamp(cpu, &ts);
79 return ts;
64} 80}
65 81
66/* 82/*
@@ -100,11 +116,18 @@ static int tracer_enabled = 1;
100int ftrace_function_enabled; 116int ftrace_function_enabled;
101 117
102/* 118/*
103 * trace_nr_entries is the number of entries that is allocated 119 * trace_buf_size is the size in bytes that is allocated
104 * for a buffer. Note, the number of entries is always rounded 120 * for a buffer. Note, the number of bytes is always rounded
105 * to ENTRIES_PER_PAGE. 121 * to page size.
122 *
123 * This number is purposely set to a low number of 16384.
124 * If the dump on oops happens, it will be much appreciated
125 * to not have to wait for all that output. Anyway this can be
126 * boot time and run time configurable.
106 */ 127 */
107static unsigned long trace_nr_entries = 65536UL; 128#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */
129
130static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
108 131
109/* trace_types holds a link list of available tracers. */ 132/* trace_types holds a link list of available tracers. */
110static struct tracer *trace_types __read_mostly; 133static struct tracer *trace_types __read_mostly;
@@ -133,24 +156,6 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
133/* trace_flags holds iter_ctrl options */ 156/* trace_flags holds iter_ctrl options */
134unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; 157unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
135 158
136static notrace void no_trace_init(struct trace_array *tr)
137{
138 int cpu;
139
140 ftrace_function_enabled = 0;
141 if(tr->ctrl)
142 for_each_online_cpu(cpu)
143 tracing_reset(tr->data[cpu]);
144 tracer_enabled = 0;
145}
146
147/* dummy trace to disable tracing */
148static struct tracer no_tracer __read_mostly = {
149 .name = "none",
150 .init = no_trace_init
151};
152
153
154/** 159/**
155 * trace_wake_up - wake up tasks waiting for trace input 160 * trace_wake_up - wake up tasks waiting for trace input
156 * 161 *
@@ -167,23 +172,21 @@ void trace_wake_up(void)
167 wake_up(&trace_wait); 172 wake_up(&trace_wait);
168} 173}
169 174
170#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) 175static int __init set_buf_size(char *str)
171
172static int __init set_nr_entries(char *str)
173{ 176{
174 unsigned long nr_entries; 177 unsigned long buf_size;
175 int ret; 178 int ret;
176 179
177 if (!str) 180 if (!str)
178 return 0; 181 return 0;
179 ret = strict_strtoul(str, 0, &nr_entries); 182 ret = strict_strtoul(str, 0, &buf_size);
180 /* nr_entries can not be zero */ 183 /* nr_entries can not be zero */
181 if (ret < 0 || nr_entries == 0) 184 if (ret < 0 || buf_size == 0)
182 return 0; 185 return 0;
183 trace_nr_entries = nr_entries; 186 trace_buf_size = buf_size;
184 return 1; 187 return 1;
185} 188}
186__setup("trace_entries=", set_nr_entries); 189__setup("trace_buf_size=", set_buf_size);
187 190
188unsigned long nsecs_to_usecs(unsigned long nsecs) 191unsigned long nsecs_to_usecs(unsigned long nsecs)
189{ 192{
@@ -191,21 +194,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
191} 194}
192 195
193/* 196/*
194 * trace_flag_type is an enumeration that holds different
195 * states when a trace occurs. These are:
196 * IRQS_OFF - interrupts were disabled
197 * NEED_RESCED - reschedule is requested
198 * HARDIRQ - inside an interrupt handler
199 * SOFTIRQ - inside a softirq handler
200 */
201enum trace_flag_type {
202 TRACE_FLAG_IRQS_OFF = 0x01,
203 TRACE_FLAG_NEED_RESCHED = 0x02,
204 TRACE_FLAG_HARDIRQ = 0x04,
205 TRACE_FLAG_SOFTIRQ = 0x08,
206};
207
208/*
209 * TRACE_ITER_SYM_MASK masks the options in trace_flags that 197 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
210 * control the output of kernel symbols. 198 * control the output of kernel symbols.
211 */ 199 */
@@ -224,6 +212,7 @@ static const char *trace_options[] = {
224 "block", 212 "block",
225 "stacktrace", 213 "stacktrace",
226 "sched-tree", 214 "sched-tree",
215 "ftrace_printk",
227 NULL 216 NULL
228}; 217};
229 218
@@ -266,54 +255,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
266 tracing_record_cmdline(current); 255 tracing_record_cmdline(current);
267} 256}
268 257
269#define CHECK_COND(cond) \
270 if (unlikely(cond)) { \
271 tracing_disabled = 1; \
272 WARN_ON(1); \
273 return -1; \
274 }
275
276/**
277 * check_pages - integrity check of trace buffers
278 *
279 * As a safty measure we check to make sure the data pages have not
280 * been corrupted.
281 */
282int check_pages(struct trace_array_cpu *data)
283{
284 struct page *page, *tmp;
285
286 CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
287 CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
288
289 list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
290 CHECK_COND(page->lru.next->prev != &page->lru);
291 CHECK_COND(page->lru.prev->next != &page->lru);
292 }
293
294 return 0;
295}
296
297/**
298 * head_page - page address of the first page in per_cpu buffer.
299 *
300 * head_page returns the page address of the first page in
301 * a per_cpu buffer. This also preforms various consistency
302 * checks to make sure the buffer has not been corrupted.
303 */
304void *head_page(struct trace_array_cpu *data)
305{
306 struct page *page;
307
308 if (list_empty(&data->trace_pages))
309 return NULL;
310
311 page = list_entry(data->trace_pages.next, struct page, lru);
312 BUG_ON(&page->lru == &data->trace_pages);
313
314 return page_address(page);
315}
316
317/** 258/**
318 * trace_seq_printf - sequence printing of trace information 259 * trace_seq_printf - sequence printing of trace information
319 * @s: trace sequence descriptor 260 * @s: trace sequence descriptor
@@ -395,28 +336,23 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
395 return len; 336 return len;
396} 337}
397 338
398#define HEX_CHARS 17 339#define MAX_MEMHEX_BYTES 8
399static const char hex2asc[] = "0123456789abcdef"; 340#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
400 341
401static int 342static int
402trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) 343trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
403{ 344{
404 unsigned char hex[HEX_CHARS]; 345 unsigned char hex[HEX_CHARS];
405 unsigned char *data = mem; 346 unsigned char *data = mem;
406 unsigned char byte;
407 int i, j; 347 int i, j;
408 348
409 BUG_ON(len >= HEX_CHARS);
410
411#ifdef __BIG_ENDIAN 349#ifdef __BIG_ENDIAN
412 for (i = 0, j = 0; i < len; i++) { 350 for (i = 0, j = 0; i < len; i++) {
413#else 351#else
414 for (i = len-1, j = 0; i >= 0; i--) { 352 for (i = len-1, j = 0; i >= 0; i--) {
415#endif 353#endif
416 byte = data[i]; 354 hex[j++] = hex_asc_hi(data[i]);
417 355 hex[j++] = hex_asc_lo(data[i]);
418 hex[j++] = hex2asc[byte & 0x0f];
419 hex[j++] = hex2asc[byte >> 4];
420 } 356 }
421 hex[j++] = ' '; 357 hex[j++] = ' ';
422 358
@@ -460,34 +396,6 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)
460 trace_seq_reset(s); 396 trace_seq_reset(s);
461} 397}
462 398
463/*
464 * flip the trace buffers between two trace descriptors.
465 * This usually is the buffers between the global_trace and
466 * the max_tr to record a snapshot of a current trace.
467 *
468 * The ftrace_max_lock must be held.
469 */
470static void
471flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
472{
473 struct list_head flip_pages;
474
475 INIT_LIST_HEAD(&flip_pages);
476
477 memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
478 sizeof(struct trace_array_cpu) -
479 offsetof(struct trace_array_cpu, trace_head_idx));
480
481 check_pages(tr1);
482 check_pages(tr2);
483 list_splice_init(&tr1->trace_pages, &flip_pages);
484 list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
485 list_splice_init(&flip_pages, &tr2->trace_pages);
486 BUG_ON(!list_empty(&flip_pages));
487 check_pages(tr1);
488 check_pages(tr2);
489}
490
491/** 399/**
492 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 400 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
493 * @tr: tracer 401 * @tr: tracer
@@ -500,17 +408,17 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
500void 408void
501update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 409update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
502{ 410{
503 struct trace_array_cpu *data; 411 struct ring_buffer *buf = tr->buffer;
504 int i;
505 412
506 WARN_ON_ONCE(!irqs_disabled()); 413 WARN_ON_ONCE(!irqs_disabled());
507 __raw_spin_lock(&ftrace_max_lock); 414 __raw_spin_lock(&ftrace_max_lock);
508 /* clear out all the previous traces */ 415
509 for_each_tracing_cpu(i) { 416 tr->buffer = max_tr.buffer;
510 data = tr->data[i]; 417 max_tr.buffer = buf;
511 flip_trace(max_tr.data[i], data); 418
512 tracing_reset(data); 419 ftrace_disable_cpu();
513 } 420 ring_buffer_reset(tr->buffer);
421 ftrace_enable_cpu();
514 422
515 __update_max_tr(tr, tsk, cpu); 423 __update_max_tr(tr, tsk, cpu);
516 __raw_spin_unlock(&ftrace_max_lock); 424 __raw_spin_unlock(&ftrace_max_lock);
@@ -527,16 +435,19 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
527void 435void
528update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) 436update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
529{ 437{
530 struct trace_array_cpu *data = tr->data[cpu]; 438 int ret;
531 int i;
532 439
533 WARN_ON_ONCE(!irqs_disabled()); 440 WARN_ON_ONCE(!irqs_disabled());
534 __raw_spin_lock(&ftrace_max_lock); 441 __raw_spin_lock(&ftrace_max_lock);
535 for_each_tracing_cpu(i)
536 tracing_reset(max_tr.data[i]);
537 442
538 flip_trace(max_tr.data[cpu], data); 443 ftrace_disable_cpu();
539 tracing_reset(data); 444
445 ring_buffer_reset(max_tr.buffer);
446 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
447
448 ftrace_enable_cpu();
449
450 WARN_ON_ONCE(ret);
540 451
541 __update_max_tr(tr, tsk, cpu); 452 __update_max_tr(tr, tsk, cpu);
542 __raw_spin_unlock(&ftrace_max_lock); 453 __raw_spin_unlock(&ftrace_max_lock);
@@ -573,7 +484,6 @@ int register_tracer(struct tracer *type)
573#ifdef CONFIG_FTRACE_STARTUP_TEST 484#ifdef CONFIG_FTRACE_STARTUP_TEST
574 if (type->selftest) { 485 if (type->selftest) {
575 struct tracer *saved_tracer = current_trace; 486 struct tracer *saved_tracer = current_trace;
576 struct trace_array_cpu *data;
577 struct trace_array *tr = &global_trace; 487 struct trace_array *tr = &global_trace;
578 int saved_ctrl = tr->ctrl; 488 int saved_ctrl = tr->ctrl;
579 int i; 489 int i;
@@ -585,10 +495,7 @@ int register_tracer(struct tracer *type)
585 * If we fail, we do not register this tracer. 495 * If we fail, we do not register this tracer.
586 */ 496 */
587 for_each_tracing_cpu(i) { 497 for_each_tracing_cpu(i) {
588 data = tr->data[i]; 498 tracing_reset(tr, i);
589 if (!head_page(data))
590 continue;
591 tracing_reset(data);
592 } 499 }
593 current_trace = type; 500 current_trace = type;
594 tr->ctrl = 0; 501 tr->ctrl = 0;
@@ -604,10 +511,7 @@ int register_tracer(struct tracer *type)
604 } 511 }
605 /* Only reset on passing, to avoid touching corrupted buffers */ 512 /* Only reset on passing, to avoid touching corrupted buffers */
606 for_each_tracing_cpu(i) { 513 for_each_tracing_cpu(i) {
607 data = tr->data[i]; 514 tracing_reset(tr, i);
608 if (!head_page(data))
609 continue;
610 tracing_reset(data);
611 } 515 }
612 printk(KERN_CONT "PASSED\n"); 516 printk(KERN_CONT "PASSED\n");
613 } 517 }
@@ -653,13 +557,11 @@ void unregister_tracer(struct tracer *type)
653 mutex_unlock(&trace_types_lock); 557 mutex_unlock(&trace_types_lock);
654} 558}
655 559
656void tracing_reset(struct trace_array_cpu *data) 560void tracing_reset(struct trace_array *tr, int cpu)
657{ 561{
658 data->trace_idx = 0; 562 ftrace_disable_cpu();
659 data->overrun = 0; 563 ring_buffer_reset_cpu(tr->buffer, cpu);
660 data->trace_head = data->trace_tail = head_page(data); 564 ftrace_enable_cpu();
661 data->trace_head_idx = 0;
662 data->trace_tail_idx = 0;
663} 565}
664 566
665#define SAVED_CMDLINES 128 567#define SAVED_CMDLINES 128
@@ -745,82 +647,20 @@ void tracing_record_cmdline(struct task_struct *tsk)
745 trace_save_cmdline(tsk); 647 trace_save_cmdline(tsk);
746} 648}
747 649
748static inline struct list_head * 650void
749trace_next_list(struct trace_array_cpu *data, struct list_head *next) 651tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
750{ 652 int pc)
751 /*
752 * Roundrobin - but skip the head (which is not a real page):
753 */
754 next = next->next;
755 if (unlikely(next == &data->trace_pages))
756 next = next->next;
757 BUG_ON(next == &data->trace_pages);
758
759 return next;
760}
761
762static inline void *
763trace_next_page(struct trace_array_cpu *data, void *addr)
764{
765 struct list_head *next;
766 struct page *page;
767
768 page = virt_to_page(addr);
769
770 next = trace_next_list(data, &page->lru);
771 page = list_entry(next, struct page, lru);
772
773 return page_address(page);
774}
775
776static inline struct trace_entry *
777tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
778{
779 unsigned long idx, idx_next;
780 struct trace_entry *entry;
781
782 data->trace_idx++;
783 idx = data->trace_head_idx;
784 idx_next = idx + 1;
785
786 BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
787
788 entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
789
790 if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
791 data->trace_head = trace_next_page(data, data->trace_head);
792 idx_next = 0;
793 }
794
795 if (data->trace_head == data->trace_tail &&
796 idx_next == data->trace_tail_idx) {
797 /* overrun */
798 data->overrun++;
799 data->trace_tail_idx++;
800 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
801 data->trace_tail =
802 trace_next_page(data, data->trace_tail);
803 data->trace_tail_idx = 0;
804 }
805 }
806
807 data->trace_head_idx = idx_next;
808
809 return entry;
810}
811
812static inline void
813tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
814{ 653{
815 struct task_struct *tsk = current; 654 struct task_struct *tsk = current;
816 unsigned long pc;
817
818 pc = preempt_count();
819 655
820 entry->preempt_count = pc & 0xff; 656 entry->preempt_count = pc & 0xff;
821 entry->pid = (tsk) ? tsk->pid : 0; 657 entry->pid = (tsk) ? tsk->pid : 0;
822 entry->t = ftrace_now(raw_smp_processor_id()); 658 entry->flags =
823 entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 659#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
660 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
661#else
662 TRACE_FLAG_IRQS_NOSUPPORT |
663#endif
824 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | 664 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
825 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 665 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
826 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 666 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
@@ -828,145 +668,141 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
828 668
829void 669void
830trace_function(struct trace_array *tr, struct trace_array_cpu *data, 670trace_function(struct trace_array *tr, struct trace_array_cpu *data,
831 unsigned long ip, unsigned long parent_ip, unsigned long flags) 671 unsigned long ip, unsigned long parent_ip, unsigned long flags,
672 int pc)
832{ 673{
833 struct trace_entry *entry; 674 struct ring_buffer_event *event;
675 struct ftrace_entry *entry;
834 unsigned long irq_flags; 676 unsigned long irq_flags;
835 677
836 raw_local_irq_save(irq_flags); 678 /* If we are reading the ring buffer, don't trace */
837 __raw_spin_lock(&data->lock); 679 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
838 entry = tracing_get_trace_entry(tr, data); 680 return;
839 tracing_generic_entry_update(entry, flags); 681
840 entry->type = TRACE_FN; 682 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
841 entry->fn.ip = ip; 683 &irq_flags);
842 entry->fn.parent_ip = parent_ip; 684 if (!event)
843 __raw_spin_unlock(&data->lock); 685 return;
844 raw_local_irq_restore(irq_flags); 686 entry = ring_buffer_event_data(event);
687 tracing_generic_entry_update(&entry->ent, flags, pc);
688 entry->ent.type = TRACE_FN;
689 entry->ip = ip;
690 entry->parent_ip = parent_ip;
691 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
845} 692}
846 693
847void 694void
848ftrace(struct trace_array *tr, struct trace_array_cpu *data, 695ftrace(struct trace_array *tr, struct trace_array_cpu *data,
849 unsigned long ip, unsigned long parent_ip, unsigned long flags) 696 unsigned long ip, unsigned long parent_ip, unsigned long flags,
697 int pc)
850{ 698{
851 if (likely(!atomic_read(&data->disabled))) 699 if (likely(!atomic_read(&data->disabled)))
852 trace_function(tr, data, ip, parent_ip, flags); 700 trace_function(tr, data, ip, parent_ip, flags, pc);
853} 701}
854 702
855#ifdef CONFIG_MMIOTRACE 703static void ftrace_trace_stack(struct trace_array *tr,
856void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, 704 struct trace_array_cpu *data,
857 struct mmiotrace_rw *rw) 705 unsigned long flags,
706 int skip, int pc)
858{ 707{
859 struct trace_entry *entry; 708#ifdef CONFIG_STACKTRACE
709 struct ring_buffer_event *event;
710 struct stack_entry *entry;
711 struct stack_trace trace;
860 unsigned long irq_flags; 712 unsigned long irq_flags;
861 713
862 raw_local_irq_save(irq_flags); 714 if (!(trace_flags & TRACE_ITER_STACKTRACE))
863 __raw_spin_lock(&data->lock); 715 return;
864
865 entry = tracing_get_trace_entry(tr, data);
866 tracing_generic_entry_update(entry, 0);
867 entry->type = TRACE_MMIO_RW;
868 entry->mmiorw = *rw;
869
870 __raw_spin_unlock(&data->lock);
871 raw_local_irq_restore(irq_flags);
872
873 trace_wake_up();
874}
875
876void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
877 struct mmiotrace_map *map)
878{
879 struct trace_entry *entry;
880 unsigned long irq_flags;
881 716
882 raw_local_irq_save(irq_flags); 717 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
883 __raw_spin_lock(&data->lock); 718 &irq_flags);
719 if (!event)
720 return;
721 entry = ring_buffer_event_data(event);
722 tracing_generic_entry_update(&entry->ent, flags, pc);
723 entry->ent.type = TRACE_STACK;
884 724
885 entry = tracing_get_trace_entry(tr, data); 725 memset(&entry->caller, 0, sizeof(entry->caller));
886 tracing_generic_entry_update(entry, 0);
887 entry->type = TRACE_MMIO_MAP;
888 entry->mmiomap = *map;
889 726
890 __raw_spin_unlock(&data->lock); 727 trace.nr_entries = 0;
891 raw_local_irq_restore(irq_flags); 728 trace.max_entries = FTRACE_STACK_ENTRIES;
729 trace.skip = skip;
730 trace.entries = entry->caller;
892 731
893 trace_wake_up(); 732 save_stack_trace(&trace);
894} 733 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
895#endif 734#endif
735}
896 736
897void __trace_stack(struct trace_array *tr, 737void __trace_stack(struct trace_array *tr,
898 struct trace_array_cpu *data, 738 struct trace_array_cpu *data,
899 unsigned long flags, 739 unsigned long flags,
900 int skip) 740 int skip)
901{ 741{
902 struct trace_entry *entry; 742 ftrace_trace_stack(tr, data, flags, skip, preempt_count());
903 struct stack_trace trace;
904
905 if (!(trace_flags & TRACE_ITER_STACKTRACE))
906 return;
907
908 entry = tracing_get_trace_entry(tr, data);
909 tracing_generic_entry_update(entry, flags);
910 entry->type = TRACE_STACK;
911
912 memset(&entry->stack, 0, sizeof(entry->stack));
913
914 trace.nr_entries = 0;
915 trace.max_entries = FTRACE_STACK_ENTRIES;
916 trace.skip = skip;
917 trace.entries = entry->stack.caller;
918
919 save_stack_trace(&trace);
920} 743}
921 744
922void 745static void
923__trace_special(void *__tr, void *__data, 746ftrace_trace_special(void *__tr, void *__data,
924 unsigned long arg1, unsigned long arg2, unsigned long arg3) 747 unsigned long arg1, unsigned long arg2, unsigned long arg3,
748 int pc)
925{ 749{
750 struct ring_buffer_event *event;
926 struct trace_array_cpu *data = __data; 751 struct trace_array_cpu *data = __data;
927 struct trace_array *tr = __tr; 752 struct trace_array *tr = __tr;
928 struct trace_entry *entry; 753 struct special_entry *entry;
929 unsigned long irq_flags; 754 unsigned long irq_flags;
930 755
931 raw_local_irq_save(irq_flags); 756 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
932 __raw_spin_lock(&data->lock); 757 &irq_flags);
933 entry = tracing_get_trace_entry(tr, data); 758 if (!event)
934 tracing_generic_entry_update(entry, 0); 759 return;
935 entry->type = TRACE_SPECIAL; 760 entry = ring_buffer_event_data(event);
936 entry->special.arg1 = arg1; 761 tracing_generic_entry_update(&entry->ent, 0, pc);
937 entry->special.arg2 = arg2; 762 entry->ent.type = TRACE_SPECIAL;
938 entry->special.arg3 = arg3; 763 entry->arg1 = arg1;
939 __trace_stack(tr, data, irq_flags, 4); 764 entry->arg2 = arg2;
940 __raw_spin_unlock(&data->lock); 765 entry->arg3 = arg3;
941 raw_local_irq_restore(irq_flags); 766 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
767 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
942 768
943 trace_wake_up(); 769 trace_wake_up();
944} 770}
945 771
946void 772void
773__trace_special(void *__tr, void *__data,
774 unsigned long arg1, unsigned long arg2, unsigned long arg3)
775{
776 ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count());
777}
778
779void
947tracing_sched_switch_trace(struct trace_array *tr, 780tracing_sched_switch_trace(struct trace_array *tr,
948 struct trace_array_cpu *data, 781 struct trace_array_cpu *data,
949 struct task_struct *prev, 782 struct task_struct *prev,
950 struct task_struct *next, 783 struct task_struct *next,
951 unsigned long flags) 784 unsigned long flags, int pc)
952{ 785{
953 struct trace_entry *entry; 786 struct ring_buffer_event *event;
787 struct ctx_switch_entry *entry;
954 unsigned long irq_flags; 788 unsigned long irq_flags;
955 789
956 raw_local_irq_save(irq_flags); 790 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
957 __raw_spin_lock(&data->lock); 791 &irq_flags);
958 entry = tracing_get_trace_entry(tr, data); 792 if (!event)
959 tracing_generic_entry_update(entry, flags); 793 return;
960 entry->type = TRACE_CTX; 794 entry = ring_buffer_event_data(event);
961 entry->ctx.prev_pid = prev->pid; 795 tracing_generic_entry_update(&entry->ent, flags, pc);
962 entry->ctx.prev_prio = prev->prio; 796 entry->ent.type = TRACE_CTX;
963 entry->ctx.prev_state = prev->state; 797 entry->prev_pid = prev->pid;
964 entry->ctx.next_pid = next->pid; 798 entry->prev_prio = prev->prio;
965 entry->ctx.next_prio = next->prio; 799 entry->prev_state = prev->state;
966 entry->ctx.next_state = next->state; 800 entry->next_pid = next->pid;
967 __trace_stack(tr, data, flags, 5); 801 entry->next_prio = next->prio;
968 __raw_spin_unlock(&data->lock); 802 entry->next_state = next->state;
969 raw_local_irq_restore(irq_flags); 803 entry->next_cpu = task_cpu(next);
804 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
805 ftrace_trace_stack(tr, data, flags, 5, pc);
970} 806}
971 807
972void 808void
@@ -974,25 +810,28 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
974 struct trace_array_cpu *data, 810 struct trace_array_cpu *data,
975 struct task_struct *wakee, 811 struct task_struct *wakee,
976 struct task_struct *curr, 812 struct task_struct *curr,
977 unsigned long flags) 813 unsigned long flags, int pc)
978{ 814{
979 struct trace_entry *entry; 815 struct ring_buffer_event *event;
816 struct ctx_switch_entry *entry;
980 unsigned long irq_flags; 817 unsigned long irq_flags;
981 818
982 raw_local_irq_save(irq_flags); 819 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
983 __raw_spin_lock(&data->lock); 820 &irq_flags);
984 entry = tracing_get_trace_entry(tr, data); 821 if (!event)
985 tracing_generic_entry_update(entry, flags); 822 return;
986 entry->type = TRACE_WAKE; 823 entry = ring_buffer_event_data(event);
987 entry->ctx.prev_pid = curr->pid; 824 tracing_generic_entry_update(&entry->ent, flags, pc);
988 entry->ctx.prev_prio = curr->prio; 825 entry->ent.type = TRACE_WAKE;
989 entry->ctx.prev_state = curr->state; 826 entry->prev_pid = curr->pid;
990 entry->ctx.next_pid = wakee->pid; 827 entry->prev_prio = curr->prio;
991 entry->ctx.next_prio = wakee->prio; 828 entry->prev_state = curr->state;
992 entry->ctx.next_state = wakee->state; 829 entry->next_pid = wakee->pid;
993 __trace_stack(tr, data, flags, 6); 830 entry->next_prio = wakee->prio;
994 __raw_spin_unlock(&data->lock); 831 entry->next_state = wakee->state;
995 raw_local_irq_restore(irq_flags); 832 entry->next_cpu = task_cpu(wakee);
833 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
834 ftrace_trace_stack(tr, data, flags, 6, pc);
996 835
997 trace_wake_up(); 836 trace_wake_up();
998} 837}
@@ -1002,26 +841,24 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1002{ 841{
1003 struct trace_array *tr = &global_trace; 842 struct trace_array *tr = &global_trace;
1004 struct trace_array_cpu *data; 843 struct trace_array_cpu *data;
1005 unsigned long flags;
1006 long disabled;
1007 int cpu; 844 int cpu;
845 int pc;
1008 846
1009 if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) 847 if (tracing_disabled || !tr->ctrl)
1010 return; 848 return;
1011 849
1012 local_irq_save(flags); 850 pc = preempt_count();
851 preempt_disable_notrace();
1013 cpu = raw_smp_processor_id(); 852 cpu = raw_smp_processor_id();
1014 data = tr->data[cpu]; 853 data = tr->data[cpu];
1015 disabled = atomic_inc_return(&data->disabled);
1016 854
1017 if (likely(disabled == 1)) 855 if (likely(!atomic_read(&data->disabled)))
1018 __trace_special(tr, data, arg1, arg2, arg3); 856 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
1019 857
1020 atomic_dec(&data->disabled); 858 preempt_enable_notrace();
1021 local_irq_restore(flags);
1022} 859}
1023 860
1024#ifdef CONFIG_FTRACE 861#ifdef CONFIG_FUNCTION_TRACER
1025static void 862static void
1026function_trace_call(unsigned long ip, unsigned long parent_ip) 863function_trace_call(unsigned long ip, unsigned long parent_ip)
1027{ 864{
@@ -1029,24 +866,28 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
1029 struct trace_array_cpu *data; 866 struct trace_array_cpu *data;
1030 unsigned long flags; 867 unsigned long flags;
1031 long disabled; 868 long disabled;
1032 int cpu; 869 int cpu, resched;
870 int pc;
1033 871
1034 if (unlikely(!ftrace_function_enabled)) 872 if (unlikely(!ftrace_function_enabled))
1035 return; 873 return;
1036 874
1037 if (skip_trace(ip)) 875 pc = preempt_count();
1038 return; 876 resched = need_resched();
1039 877 preempt_disable_notrace();
1040 local_irq_save(flags); 878 local_save_flags(flags);
1041 cpu = raw_smp_processor_id(); 879 cpu = raw_smp_processor_id();
1042 data = tr->data[cpu]; 880 data = tr->data[cpu];
1043 disabled = atomic_inc_return(&data->disabled); 881 disabled = atomic_inc_return(&data->disabled);
1044 882
1045 if (likely(disabled == 1)) 883 if (likely(disabled == 1))
1046 trace_function(tr, data, ip, parent_ip, flags); 884 trace_function(tr, data, ip, parent_ip, flags, pc);
1047 885
1048 atomic_dec(&data->disabled); 886 atomic_dec(&data->disabled);
1049 local_irq_restore(flags); 887 if (resched)
888 preempt_enable_no_resched_notrace();
889 else
890 preempt_enable_notrace();
1050} 891}
1051 892
1052static struct ftrace_ops trace_ops __read_mostly = 893static struct ftrace_ops trace_ops __read_mostly =
@@ -1073,111 +914,96 @@ enum trace_file_type {
1073 TRACE_FILE_LAT_FMT = 1, 914 TRACE_FILE_LAT_FMT = 1,
1074}; 915};
1075 916
1076static struct trace_entry * 917static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
1077trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
1078 struct trace_iterator *iter, int cpu)
1079{ 918{
1080 struct page *page; 919 /* Don't allow ftrace to trace into the ring buffers */
1081 struct trace_entry *array; 920 ftrace_disable_cpu();
1082 921
1083 if (iter->next_idx[cpu] >= tr->entries || 922 iter->idx++;
1084 iter->next_idx[cpu] >= data->trace_idx || 923 if (iter->buffer_iter[iter->cpu])
1085 (data->trace_head == data->trace_tail && 924 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1086 data->trace_head_idx == data->trace_tail_idx))
1087 return NULL;
1088 925
1089 if (!iter->next_page[cpu]) { 926 ftrace_enable_cpu();
1090 /* Initialize the iterator for this cpu trace buffer */ 927}
1091 WARN_ON(!data->trace_tail);
1092 page = virt_to_page(data->trace_tail);
1093 iter->next_page[cpu] = &page->lru;
1094 iter->next_page_idx[cpu] = data->trace_tail_idx;
1095 }
1096 928
1097 page = list_entry(iter->next_page[cpu], struct page, lru); 929static struct trace_entry *
1098 BUG_ON(&data->trace_pages == &page->lru); 930peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
931{
932 struct ring_buffer_event *event;
933 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
934
935 /* Don't allow ftrace to trace into the ring buffers */
936 ftrace_disable_cpu();
937
938 if (buf_iter)
939 event = ring_buffer_iter_peek(buf_iter, ts);
940 else
941 event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
1099 942
1100 array = page_address(page); 943 ftrace_enable_cpu();
1101 944
1102 WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); 945 return event ? ring_buffer_event_data(event) : NULL;
1103 return &array[iter->next_page_idx[cpu]];
1104} 946}
1105 947
1106static struct trace_entry * 948static struct trace_entry *
1107find_next_entry(struct trace_iterator *iter, int *ent_cpu) 949__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1108{ 950{
1109 struct trace_array *tr = iter->tr; 951 struct ring_buffer *buffer = iter->tr->buffer;
1110 struct trace_entry *ent, *next = NULL; 952 struct trace_entry *ent, *next = NULL;
953 u64 next_ts = 0, ts;
1111 int next_cpu = -1; 954 int next_cpu = -1;
1112 int cpu; 955 int cpu;
1113 956
1114 for_each_tracing_cpu(cpu) { 957 for_each_tracing_cpu(cpu) {
1115 if (!head_page(tr->data[cpu])) 958
959 if (ring_buffer_empty_cpu(buffer, cpu))
1116 continue; 960 continue;
1117 ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); 961
962 ent = peek_next_entry(iter, cpu, &ts);
963
1118 /* 964 /*
1119 * Pick the entry with the smallest timestamp: 965 * Pick the entry with the smallest timestamp:
1120 */ 966 */
1121 if (ent && (!next || ent->t < next->t)) { 967 if (ent && (!next || ts < next_ts)) {
1122 next = ent; 968 next = ent;
1123 next_cpu = cpu; 969 next_cpu = cpu;
970 next_ts = ts;
1124 } 971 }
1125 } 972 }
1126 973
1127 if (ent_cpu) 974 if (ent_cpu)
1128 *ent_cpu = next_cpu; 975 *ent_cpu = next_cpu;
1129 976
977 if (ent_ts)
978 *ent_ts = next_ts;
979
1130 return next; 980 return next;
1131} 981}
1132 982
1133static void trace_iterator_increment(struct trace_iterator *iter) 983/* Find the next real entry, without updating the iterator itself */
984static struct trace_entry *
985find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1134{ 986{
1135 iter->idx++; 987 return __find_next_entry(iter, ent_cpu, ent_ts);
1136 iter->next_idx[iter->cpu]++;
1137 iter->next_page_idx[iter->cpu]++;
1138
1139 if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
1140 struct trace_array_cpu *data = iter->tr->data[iter->cpu];
1141
1142 iter->next_page_idx[iter->cpu] = 0;
1143 iter->next_page[iter->cpu] =
1144 trace_next_list(data, iter->next_page[iter->cpu]);
1145 }
1146} 988}
1147 989
1148static void trace_consume(struct trace_iterator *iter) 990/* Find the next real entry, and increment the iterator to the next entry */
991static void *find_next_entry_inc(struct trace_iterator *iter)
1149{ 992{
1150 struct trace_array_cpu *data = iter->tr->data[iter->cpu]; 993 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
1151 994
1152 data->trace_tail_idx++; 995 if (iter->ent)
1153 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { 996 trace_iterator_increment(iter, iter->cpu);
1154 data->trace_tail = trace_next_page(data, data->trace_tail);
1155 data->trace_tail_idx = 0;
1156 }
1157 997
1158 /* Check if we empty it, then reset the index */ 998 return iter->ent ? iter : NULL;
1159 if (data->trace_head == data->trace_tail &&
1160 data->trace_head_idx == data->trace_tail_idx)
1161 data->trace_idx = 0;
1162} 999}
1163 1000
1164static void *find_next_entry_inc(struct trace_iterator *iter) 1001static void trace_consume(struct trace_iterator *iter)
1165{ 1002{
1166 struct trace_entry *next; 1003 /* Don't allow ftrace to trace into the ring buffers */
1167 int next_cpu = -1; 1004 ftrace_disable_cpu();
1168 1005 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
1169 next = find_next_entry(iter, &next_cpu); 1006 ftrace_enable_cpu();
1170
1171 iter->prev_ent = iter->ent;
1172 iter->prev_cpu = iter->cpu;
1173
1174 iter->ent = next;
1175 iter->cpu = next_cpu;
1176
1177 if (next)
1178 trace_iterator_increment(iter);
1179
1180 return next ? iter : NULL;
1181} 1007}
1182 1008
1183static void *s_next(struct seq_file *m, void *v, loff_t *pos) 1009static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1210,7 +1036,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1210 struct trace_iterator *iter = m->private; 1036 struct trace_iterator *iter = m->private;
1211 void *p = NULL; 1037 void *p = NULL;
1212 loff_t l = 0; 1038 loff_t l = 0;
1213 int i; 1039 int cpu;
1214 1040
1215 mutex_lock(&trace_types_lock); 1041 mutex_lock(&trace_types_lock);
1216 1042
@@ -1229,14 +1055,15 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1229 iter->ent = NULL; 1055 iter->ent = NULL;
1230 iter->cpu = 0; 1056 iter->cpu = 0;
1231 iter->idx = -1; 1057 iter->idx = -1;
1232 iter->prev_ent = NULL;
1233 iter->prev_cpu = -1;
1234 1058
1235 for_each_tracing_cpu(i) { 1059 ftrace_disable_cpu();
1236 iter->next_idx[i] = 0; 1060
1237 iter->next_page[i] = NULL; 1061 for_each_tracing_cpu(cpu) {
1062 ring_buffer_iter_reset(iter->buffer_iter[cpu]);
1238 } 1063 }
1239 1064
1065 ftrace_enable_cpu();
1066
1240 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1067 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1241 ; 1068 ;
1242 1069
@@ -1261,17 +1088,20 @@ static void s_stop(struct seq_file *m, void *p)
1261 mutex_unlock(&trace_types_lock); 1088 mutex_unlock(&trace_types_lock);
1262} 1089}
1263 1090
1264#define KRETPROBE_MSG "[unknown/kretprobe'd]"
1265
1266#ifdef CONFIG_KRETPROBES 1091#ifdef CONFIG_KRETPROBES
1267static inline int kretprobed(unsigned long addr) 1092static inline const char *kretprobed(const char *name)
1268{ 1093{
1269 return addr == (unsigned long)kretprobe_trampoline; 1094 static const char tramp_name[] = "kretprobe_trampoline";
1095 int size = sizeof(tramp_name);
1096
1097 if (strncmp(tramp_name, name, size) == 0)
1098 return "[unknown/kretprobe'd]";
1099 return name;
1270} 1100}
1271#else 1101#else
1272static inline int kretprobed(unsigned long addr) 1102static inline const char *kretprobed(const char *name)
1273{ 1103{
1274 return 0; 1104 return name;
1275} 1105}
1276#endif /* CONFIG_KRETPROBES */ 1106#endif /* CONFIG_KRETPROBES */
1277 1107
@@ -1280,10 +1110,13 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
1280{ 1110{
1281#ifdef CONFIG_KALLSYMS 1111#ifdef CONFIG_KALLSYMS
1282 char str[KSYM_SYMBOL_LEN]; 1112 char str[KSYM_SYMBOL_LEN];
1113 const char *name;
1283 1114
1284 kallsyms_lookup(address, NULL, NULL, NULL, str); 1115 kallsyms_lookup(address, NULL, NULL, NULL, str);
1285 1116
1286 return trace_seq_printf(s, fmt, str); 1117 name = kretprobed(str);
1118
1119 return trace_seq_printf(s, fmt, name);
1287#endif 1120#endif
1288 return 1; 1121 return 1;
1289} 1122}
@@ -1294,9 +1127,12 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1294{ 1127{
1295#ifdef CONFIG_KALLSYMS 1128#ifdef CONFIG_KALLSYMS
1296 char str[KSYM_SYMBOL_LEN]; 1129 char str[KSYM_SYMBOL_LEN];
1130 const char *name;
1297 1131
1298 sprint_symbol(str, address); 1132 sprint_symbol(str, address);
1299 return trace_seq_printf(s, fmt, str); 1133 name = kretprobed(str);
1134
1135 return trace_seq_printf(s, fmt, name);
1300#endif 1136#endif
1301 return 1; 1137 return 1;
1302} 1138}
@@ -1330,21 +1166,21 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1330 1166
1331static void print_lat_help_header(struct seq_file *m) 1167static void print_lat_help_header(struct seq_file *m)
1332{ 1168{
1333 seq_puts(m, "# _------=> CPU# \n"); 1169 seq_puts(m, "# _------=> CPU# \n");
1334 seq_puts(m, "# / _-----=> irqs-off \n"); 1170 seq_puts(m, "# / _-----=> irqs-off \n");
1335 seq_puts(m, "# | / _----=> need-resched \n"); 1171 seq_puts(m, "# | / _----=> need-resched \n");
1336 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1172 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1337 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1173 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1338 seq_puts(m, "# |||| / \n"); 1174 seq_puts(m, "# |||| / \n");
1339 seq_puts(m, "# ||||| delay \n"); 1175 seq_puts(m, "# ||||| delay \n");
1340 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1176 seq_puts(m, "# cmd pid ||||| time | caller \n");
1341 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1177 seq_puts(m, "# \\ / ||||| \\ | / \n");
1342} 1178}
1343 1179
1344static void print_func_help_header(struct seq_file *m) 1180static void print_func_help_header(struct seq_file *m)
1345{ 1181{
1346 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 1182 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1347 seq_puts(m, "# | | | | |\n"); 1183 seq_puts(m, "# | | | | |\n");
1348} 1184}
1349 1185
1350 1186
@@ -1355,23 +1191,16 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1355 struct trace_array *tr = iter->tr; 1191 struct trace_array *tr = iter->tr;
1356 struct trace_array_cpu *data = tr->data[tr->cpu]; 1192 struct trace_array_cpu *data = tr->data[tr->cpu];
1357 struct tracer *type = current_trace; 1193 struct tracer *type = current_trace;
1358 unsigned long total = 0; 1194 unsigned long total;
1359 unsigned long entries = 0; 1195 unsigned long entries;
1360 int cpu;
1361 const char *name = "preemption"; 1196 const char *name = "preemption";
1362 1197
1363 if (type) 1198 if (type)
1364 name = type->name; 1199 name = type->name;
1365 1200
1366 for_each_tracing_cpu(cpu) { 1201 entries = ring_buffer_entries(iter->tr->buffer);
1367 if (head_page(tr->data[cpu])) { 1202 total = entries +
1368 total += tr->data[cpu]->trace_idx; 1203 ring_buffer_overruns(iter->tr->buffer);
1369 if (tr->data[cpu]->trace_idx > tr->entries)
1370 entries += tr->entries;
1371 else
1372 entries += tr->data[cpu]->trace_idx;
1373 }
1374 }
1375 1204
1376 seq_printf(m, "%s latency trace v1.1.5 on %s\n", 1205 seq_printf(m, "%s latency trace v1.1.5 on %s\n",
1377 name, UTS_RELEASE); 1206 name, UTS_RELEASE);
@@ -1428,9 +1257,10 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1428 comm = trace_find_cmdline(entry->pid); 1257 comm = trace_find_cmdline(entry->pid);
1429 1258
1430 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); 1259 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
1431 trace_seq_printf(s, "%d", cpu); 1260 trace_seq_printf(s, "%3d", cpu);
1432 trace_seq_printf(s, "%c%c", 1261 trace_seq_printf(s, "%c%c",
1433 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', 1262 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
1263 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
1434 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); 1264 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
1435 1265
1436 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 1266 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
@@ -1457,7 +1287,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1457unsigned long preempt_mark_thresh = 100; 1287unsigned long preempt_mark_thresh = 100;
1458 1288
1459static void 1289static void
1460lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, 1290lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
1461 unsigned long rel_usecs) 1291 unsigned long rel_usecs)
1462{ 1292{
1463 trace_seq_printf(s, " %4lldus", abs_usecs); 1293 trace_seq_printf(s, " %4lldus", abs_usecs);
@@ -1471,34 +1301,76 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
1471 1301
1472static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 1302static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1473 1303
1474static int 1304/*
1305 * The message is supposed to contain an ending newline.
1306 * If the printing stops prematurely, try to add a newline of our own.
1307 */
1308void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1309{
1310 struct trace_entry *ent;
1311 struct trace_field_cont *cont;
1312 bool ok = true;
1313
1314 ent = peek_next_entry(iter, iter->cpu, NULL);
1315 if (!ent || ent->type != TRACE_CONT) {
1316 trace_seq_putc(s, '\n');
1317 return;
1318 }
1319
1320 do {
1321 cont = (struct trace_field_cont *)ent;
1322 if (ok)
1323 ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
1324
1325 ftrace_disable_cpu();
1326
1327 if (iter->buffer_iter[iter->cpu])
1328 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1329 else
1330 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
1331
1332 ftrace_enable_cpu();
1333
1334 ent = peek_next_entry(iter, iter->cpu, NULL);
1335 } while (ent && ent->type == TRACE_CONT);
1336
1337 if (!ok)
1338 trace_seq_putc(s, '\n');
1339}
1340
1341static enum print_line_t
1475print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) 1342print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1476{ 1343{
1477 struct trace_seq *s = &iter->seq; 1344 struct trace_seq *s = &iter->seq;
1478 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1345 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1479 struct trace_entry *next_entry = find_next_entry(iter, NULL); 1346 struct trace_entry *next_entry;
1480 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); 1347 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
1481 struct trace_entry *entry = iter->ent; 1348 struct trace_entry *entry = iter->ent;
1482 unsigned long abs_usecs; 1349 unsigned long abs_usecs;
1483 unsigned long rel_usecs; 1350 unsigned long rel_usecs;
1351 u64 next_ts;
1484 char *comm; 1352 char *comm;
1485 int S, T; 1353 int S, T;
1486 int i; 1354 int i;
1487 unsigned state; 1355 unsigned state;
1488 1356
1357 if (entry->type == TRACE_CONT)
1358 return TRACE_TYPE_HANDLED;
1359
1360 next_entry = find_next_entry(iter, NULL, &next_ts);
1489 if (!next_entry) 1361 if (!next_entry)
1490 next_entry = entry; 1362 next_ts = iter->ts;
1491 rel_usecs = ns2usecs(next_entry->t - entry->t); 1363 rel_usecs = ns2usecs(next_ts - iter->ts);
1492 abs_usecs = ns2usecs(entry->t - iter->tr->time_start); 1364 abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
1493 1365
1494 if (verbose) { 1366 if (verbose) {
1495 comm = trace_find_cmdline(entry->pid); 1367 comm = trace_find_cmdline(entry->pid);
1496 trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" 1368 trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
1497 " %ld.%03ldms (+%ld.%03ldms): ", 1369 " %ld.%03ldms (+%ld.%03ldms): ",
1498 comm, 1370 comm,
1499 entry->pid, cpu, entry->flags, 1371 entry->pid, cpu, entry->flags,
1500 entry->preempt_count, trace_idx, 1372 entry->preempt_count, trace_idx,
1501 ns2usecs(entry->t), 1373 ns2usecs(iter->ts),
1502 abs_usecs/1000, 1374 abs_usecs/1000,
1503 abs_usecs % 1000, rel_usecs/1000, 1375 abs_usecs % 1000, rel_usecs/1000,
1504 rel_usecs % 1000); 1376 rel_usecs % 1000);
@@ -1507,52 +1379,82 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1507 lat_print_timestamp(s, abs_usecs, rel_usecs); 1379 lat_print_timestamp(s, abs_usecs, rel_usecs);
1508 } 1380 }
1509 switch (entry->type) { 1381 switch (entry->type) {
1510 case TRACE_FN: 1382 case TRACE_FN: {
1511 seq_print_ip_sym(s, entry->fn.ip, sym_flags); 1383 struct ftrace_entry *field;
1384
1385 trace_assign_type(field, entry);
1386
1387 seq_print_ip_sym(s, field->ip, sym_flags);
1512 trace_seq_puts(s, " ("); 1388 trace_seq_puts(s, " (");
1513 if (kretprobed(entry->fn.parent_ip)) 1389 seq_print_ip_sym(s, field->parent_ip, sym_flags);
1514 trace_seq_puts(s, KRETPROBE_MSG);
1515 else
1516 seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
1517 trace_seq_puts(s, ")\n"); 1390 trace_seq_puts(s, ")\n");
1518 break; 1391 break;
1392 }
1519 case TRACE_CTX: 1393 case TRACE_CTX:
1520 case TRACE_WAKE: 1394 case TRACE_WAKE: {
1521 T = entry->ctx.next_state < sizeof(state_to_char) ? 1395 struct ctx_switch_entry *field;
1522 state_to_char[entry->ctx.next_state] : 'X'; 1396
1397 trace_assign_type(field, entry);
1398
1399 T = field->next_state < sizeof(state_to_char) ?
1400 state_to_char[field->next_state] : 'X';
1523 1401
1524 state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; 1402 state = field->prev_state ?
1403 __ffs(field->prev_state) + 1 : 0;
1525 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; 1404 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
1526 comm = trace_find_cmdline(entry->ctx.next_pid); 1405 comm = trace_find_cmdline(field->next_pid);
1527 trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", 1406 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
1528 entry->ctx.prev_pid, 1407 field->prev_pid,
1529 entry->ctx.prev_prio, 1408 field->prev_prio,
1530 S, entry->type == TRACE_CTX ? "==>" : " +", 1409 S, entry->type == TRACE_CTX ? "==>" : " +",
1531 entry->ctx.next_pid, 1410 field->next_cpu,
1532 entry->ctx.next_prio, 1411 field->next_pid,
1412 field->next_prio,
1533 T, comm); 1413 T, comm);
1534 break; 1414 break;
1535 case TRACE_SPECIAL: 1415 }
1416 case TRACE_SPECIAL: {
1417 struct special_entry *field;
1418
1419 trace_assign_type(field, entry);
1420
1536 trace_seq_printf(s, "# %ld %ld %ld\n", 1421 trace_seq_printf(s, "# %ld %ld %ld\n",
1537 entry->special.arg1, 1422 field->arg1,
1538 entry->special.arg2, 1423 field->arg2,
1539 entry->special.arg3); 1424 field->arg3);
1540 break; 1425 break;
1541 case TRACE_STACK: 1426 }
1427 case TRACE_STACK: {
1428 struct stack_entry *field;
1429
1430 trace_assign_type(field, entry);
1431
1542 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1432 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1543 if (i) 1433 if (i)
1544 trace_seq_puts(s, " <= "); 1434 trace_seq_puts(s, " <= ");
1545 seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); 1435 seq_print_ip_sym(s, field->caller[i], sym_flags);
1546 } 1436 }
1547 trace_seq_puts(s, "\n"); 1437 trace_seq_puts(s, "\n");
1548 break; 1438 break;
1439 }
1440 case TRACE_PRINT: {
1441 struct print_entry *field;
1442
1443 trace_assign_type(field, entry);
1444
1445 seq_print_ip_sym(s, field->ip, sym_flags);
1446 trace_seq_printf(s, ": %s", field->buf);
1447 if (entry->flags & TRACE_FLAG_CONT)
1448 trace_seq_print_cont(s, iter);
1449 break;
1450 }
1549 default: 1451 default:
1550 trace_seq_printf(s, "Unknown type %d\n", entry->type); 1452 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1551 } 1453 }
1552 return 1; 1454 return TRACE_TYPE_HANDLED;
1553} 1455}
1554 1456
1555static int print_trace_fmt(struct trace_iterator *iter) 1457static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1556{ 1458{
1557 struct trace_seq *s = &iter->seq; 1459 struct trace_seq *s = &iter->seq;
1558 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1460 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1567,90 +1469,123 @@ static int print_trace_fmt(struct trace_iterator *iter)
1567 1469
1568 entry = iter->ent; 1470 entry = iter->ent;
1569 1471
1472 if (entry->type == TRACE_CONT)
1473 return TRACE_TYPE_HANDLED;
1474
1570 comm = trace_find_cmdline(iter->ent->pid); 1475 comm = trace_find_cmdline(iter->ent->pid);
1571 1476
1572 t = ns2usecs(entry->t); 1477 t = ns2usecs(iter->ts);
1573 usec_rem = do_div(t, 1000000ULL); 1478 usec_rem = do_div(t, 1000000ULL);
1574 secs = (unsigned long)t; 1479 secs = (unsigned long)t;
1575 1480
1576 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); 1481 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
1577 if (!ret) 1482 if (!ret)
1578 return 0; 1483 return TRACE_TYPE_PARTIAL_LINE;
1579 ret = trace_seq_printf(s, "[%02d] ", iter->cpu); 1484 ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
1580 if (!ret) 1485 if (!ret)
1581 return 0; 1486 return TRACE_TYPE_PARTIAL_LINE;
1582 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); 1487 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
1583 if (!ret) 1488 if (!ret)
1584 return 0; 1489 return TRACE_TYPE_PARTIAL_LINE;
1585 1490
1586 switch (entry->type) { 1491 switch (entry->type) {
1587 case TRACE_FN: 1492 case TRACE_FN: {
1588 ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); 1493 struct ftrace_entry *field;
1494
1495 trace_assign_type(field, entry);
1496
1497 ret = seq_print_ip_sym(s, field->ip, sym_flags);
1589 if (!ret) 1498 if (!ret)
1590 return 0; 1499 return TRACE_TYPE_PARTIAL_LINE;
1591 if ((sym_flags & TRACE_ITER_PRINT_PARENT) && 1500 if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
1592 entry->fn.parent_ip) { 1501 field->parent_ip) {
1593 ret = trace_seq_printf(s, " <-"); 1502 ret = trace_seq_printf(s, " <-");
1594 if (!ret) 1503 if (!ret)
1595 return 0; 1504 return TRACE_TYPE_PARTIAL_LINE;
1596 if (kretprobed(entry->fn.parent_ip)) 1505 ret = seq_print_ip_sym(s,
1597 ret = trace_seq_puts(s, KRETPROBE_MSG); 1506 field->parent_ip,
1598 else 1507 sym_flags);
1599 ret = seq_print_ip_sym(s, entry->fn.parent_ip,
1600 sym_flags);
1601 if (!ret) 1508 if (!ret)
1602 return 0; 1509 return TRACE_TYPE_PARTIAL_LINE;
1603 } 1510 }
1604 ret = trace_seq_printf(s, "\n"); 1511 ret = trace_seq_printf(s, "\n");
1605 if (!ret) 1512 if (!ret)
1606 return 0; 1513 return TRACE_TYPE_PARTIAL_LINE;
1607 break; 1514 break;
1515 }
1608 case TRACE_CTX: 1516 case TRACE_CTX:
1609 case TRACE_WAKE: 1517 case TRACE_WAKE: {
1610 S = entry->ctx.prev_state < sizeof(state_to_char) ? 1518 struct ctx_switch_entry *field;
1611 state_to_char[entry->ctx.prev_state] : 'X'; 1519
1612 T = entry->ctx.next_state < sizeof(state_to_char) ? 1520 trace_assign_type(field, entry);
1613 state_to_char[entry->ctx.next_state] : 'X'; 1521
1614 ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", 1522 S = field->prev_state < sizeof(state_to_char) ?
1615 entry->ctx.prev_pid, 1523 state_to_char[field->prev_state] : 'X';
1616 entry->ctx.prev_prio, 1524 T = field->next_state < sizeof(state_to_char) ?
1525 state_to_char[field->next_state] : 'X';
1526 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
1527 field->prev_pid,
1528 field->prev_prio,
1617 S, 1529 S,
1618 entry->type == TRACE_CTX ? "==>" : " +", 1530 entry->type == TRACE_CTX ? "==>" : " +",
1619 entry->ctx.next_pid, 1531 field->next_cpu,
1620 entry->ctx.next_prio, 1532 field->next_pid,
1533 field->next_prio,
1621 T); 1534 T);
1622 if (!ret) 1535 if (!ret)
1623 return 0; 1536 return TRACE_TYPE_PARTIAL_LINE;
1624 break; 1537 break;
1625 case TRACE_SPECIAL: 1538 }
1539 case TRACE_SPECIAL: {
1540 struct special_entry *field;
1541
1542 trace_assign_type(field, entry);
1543
1626 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 1544 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1627 entry->special.arg1, 1545 field->arg1,
1628 entry->special.arg2, 1546 field->arg2,
1629 entry->special.arg3); 1547 field->arg3);
1630 if (!ret) 1548 if (!ret)
1631 return 0; 1549 return TRACE_TYPE_PARTIAL_LINE;
1632 break; 1550 break;
1633 case TRACE_STACK: 1551 }
1552 case TRACE_STACK: {
1553 struct stack_entry *field;
1554
1555 trace_assign_type(field, entry);
1556
1634 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1557 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1635 if (i) { 1558 if (i) {
1636 ret = trace_seq_puts(s, " <= "); 1559 ret = trace_seq_puts(s, " <= ");
1637 if (!ret) 1560 if (!ret)
1638 return 0; 1561 return TRACE_TYPE_PARTIAL_LINE;
1639 } 1562 }
1640 ret = seq_print_ip_sym(s, entry->stack.caller[i], 1563 ret = seq_print_ip_sym(s, field->caller[i],
1641 sym_flags); 1564 sym_flags);
1642 if (!ret) 1565 if (!ret)
1643 return 0; 1566 return TRACE_TYPE_PARTIAL_LINE;
1644 } 1567 }
1645 ret = trace_seq_puts(s, "\n"); 1568 ret = trace_seq_puts(s, "\n");
1646 if (!ret) 1569 if (!ret)
1647 return 0; 1570 return TRACE_TYPE_PARTIAL_LINE;
1648 break; 1571 break;
1649 } 1572 }
1650 return 1; 1573 case TRACE_PRINT: {
1574 struct print_entry *field;
1575
1576 trace_assign_type(field, entry);
1577
1578 seq_print_ip_sym(s, field->ip, sym_flags);
1579 trace_seq_printf(s, ": %s", field->buf);
1580 if (entry->flags & TRACE_FLAG_CONT)
1581 trace_seq_print_cont(s, iter);
1582 break;
1583 }
1584 }
1585 return TRACE_TYPE_HANDLED;
1651} 1586}
1652 1587
1653static int print_raw_fmt(struct trace_iterator *iter) 1588static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1654{ 1589{
1655 struct trace_seq *s = &iter->seq; 1590 struct trace_seq *s = &iter->seq;
1656 struct trace_entry *entry; 1591 struct trace_entry *entry;
@@ -1659,47 +1594,77 @@ static int print_raw_fmt(struct trace_iterator *iter)
1659 1594
1660 entry = iter->ent; 1595 entry = iter->ent;
1661 1596
1597 if (entry->type == TRACE_CONT)
1598 return TRACE_TYPE_HANDLED;
1599
1662 ret = trace_seq_printf(s, "%d %d %llu ", 1600 ret = trace_seq_printf(s, "%d %d %llu ",
1663 entry->pid, iter->cpu, entry->t); 1601 entry->pid, iter->cpu, iter->ts);
1664 if (!ret) 1602 if (!ret)
1665 return 0; 1603 return TRACE_TYPE_PARTIAL_LINE;
1666 1604
1667 switch (entry->type) { 1605 switch (entry->type) {
1668 case TRACE_FN: 1606 case TRACE_FN: {
1607 struct ftrace_entry *field;
1608
1609 trace_assign_type(field, entry);
1610
1669 ret = trace_seq_printf(s, "%x %x\n", 1611 ret = trace_seq_printf(s, "%x %x\n",
1670 entry->fn.ip, entry->fn.parent_ip); 1612 field->ip,
1613 field->parent_ip);
1671 if (!ret) 1614 if (!ret)
1672 return 0; 1615 return TRACE_TYPE_PARTIAL_LINE;
1673 break; 1616 break;
1617 }
1674 case TRACE_CTX: 1618 case TRACE_CTX:
1675 case TRACE_WAKE: 1619 case TRACE_WAKE: {
1676 S = entry->ctx.prev_state < sizeof(state_to_char) ? 1620 struct ctx_switch_entry *field;
1677 state_to_char[entry->ctx.prev_state] : 'X'; 1621
1678 T = entry->ctx.next_state < sizeof(state_to_char) ? 1622 trace_assign_type(field, entry);
1679 state_to_char[entry->ctx.next_state] : 'X'; 1623
1624 S = field->prev_state < sizeof(state_to_char) ?
1625 state_to_char[field->prev_state] : 'X';
1626 T = field->next_state < sizeof(state_to_char) ?
1627 state_to_char[field->next_state] : 'X';
1680 if (entry->type == TRACE_WAKE) 1628 if (entry->type == TRACE_WAKE)
1681 S = '+'; 1629 S = '+';
1682 ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", 1630 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
1683 entry->ctx.prev_pid, 1631 field->prev_pid,
1684 entry->ctx.prev_prio, 1632 field->prev_prio,
1685 S, 1633 S,
1686 entry->ctx.next_pid, 1634 field->next_cpu,
1687 entry->ctx.next_prio, 1635 field->next_pid,
1636 field->next_prio,
1688 T); 1637 T);
1689 if (!ret) 1638 if (!ret)
1690 return 0; 1639 return TRACE_TYPE_PARTIAL_LINE;
1691 break; 1640 break;
1641 }
1692 case TRACE_SPECIAL: 1642 case TRACE_SPECIAL:
1693 case TRACE_STACK: 1643 case TRACE_STACK: {
1644 struct special_entry *field;
1645
1646 trace_assign_type(field, entry);
1647
1694 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 1648 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1695 entry->special.arg1, 1649 field->arg1,
1696 entry->special.arg2, 1650 field->arg2,
1697 entry->special.arg3); 1651 field->arg3);
1698 if (!ret) 1652 if (!ret)
1699 return 0; 1653 return TRACE_TYPE_PARTIAL_LINE;
1700 break; 1654 break;
1701 } 1655 }
1702 return 1; 1656 case TRACE_PRINT: {
1657 struct print_entry *field;
1658
1659 trace_assign_type(field, entry);
1660
1661 trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
1662 if (entry->flags & TRACE_FLAG_CONT)
1663 trace_seq_print_cont(s, iter);
1664 break;
1665 }
1666 }
1667 return TRACE_TYPE_HANDLED;
1703} 1668}
1704 1669
1705#define SEQ_PUT_FIELD_RET(s, x) \ 1670#define SEQ_PUT_FIELD_RET(s, x) \
@@ -1710,11 +1675,12 @@ do { \
1710 1675
1711#define SEQ_PUT_HEX_FIELD_RET(s, x) \ 1676#define SEQ_PUT_HEX_FIELD_RET(s, x) \
1712do { \ 1677do { \
1678 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
1713 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ 1679 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
1714 return 0; \ 1680 return 0; \
1715} while (0) 1681} while (0)
1716 1682
1717static int print_hex_fmt(struct trace_iterator *iter) 1683static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1718{ 1684{
1719 struct trace_seq *s = &iter->seq; 1685 struct trace_seq *s = &iter->seq;
1720 unsigned char newline = '\n'; 1686 unsigned char newline = '\n';
@@ -1723,97 +1689,139 @@ static int print_hex_fmt(struct trace_iterator *iter)
1723 1689
1724 entry = iter->ent; 1690 entry = iter->ent;
1725 1691
1692 if (entry->type == TRACE_CONT)
1693 return TRACE_TYPE_HANDLED;
1694
1726 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 1695 SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
1727 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); 1696 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
1728 SEQ_PUT_HEX_FIELD_RET(s, entry->t); 1697 SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
1729 1698
1730 switch (entry->type) { 1699 switch (entry->type) {
1731 case TRACE_FN: 1700 case TRACE_FN: {
1732 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); 1701 struct ftrace_entry *field;
1733 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); 1702
1703 trace_assign_type(field, entry);
1704
1705 SEQ_PUT_HEX_FIELD_RET(s, field->ip);
1706 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
1734 break; 1707 break;
1708 }
1735 case TRACE_CTX: 1709 case TRACE_CTX:
1736 case TRACE_WAKE: 1710 case TRACE_WAKE: {
1737 S = entry->ctx.prev_state < sizeof(state_to_char) ? 1711 struct ctx_switch_entry *field;
1738 state_to_char[entry->ctx.prev_state] : 'X'; 1712
1739 T = entry->ctx.next_state < sizeof(state_to_char) ? 1713 trace_assign_type(field, entry);
1740 state_to_char[entry->ctx.next_state] : 'X'; 1714
1715 S = field->prev_state < sizeof(state_to_char) ?
1716 state_to_char[field->prev_state] : 'X';
1717 T = field->next_state < sizeof(state_to_char) ?
1718 state_to_char[field->next_state] : 'X';
1741 if (entry->type == TRACE_WAKE) 1719 if (entry->type == TRACE_WAKE)
1742 S = '+'; 1720 S = '+';
1743 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); 1721 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
1744 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); 1722 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
1745 SEQ_PUT_HEX_FIELD_RET(s, S); 1723 SEQ_PUT_HEX_FIELD_RET(s, S);
1746 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); 1724 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
1747 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); 1725 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
1748 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); 1726 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
1749 SEQ_PUT_HEX_FIELD_RET(s, T); 1727 SEQ_PUT_HEX_FIELD_RET(s, T);
1750 break; 1728 break;
1729 }
1751 case TRACE_SPECIAL: 1730 case TRACE_SPECIAL:
1752 case TRACE_STACK: 1731 case TRACE_STACK: {
1753 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); 1732 struct special_entry *field;
1754 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); 1733
1755 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); 1734 trace_assign_type(field, entry);
1735
1736 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
1737 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
1738 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
1756 break; 1739 break;
1757 } 1740 }
1741 }
1758 SEQ_PUT_FIELD_RET(s, newline); 1742 SEQ_PUT_FIELD_RET(s, newline);
1759 1743
1760 return 1; 1744 return TRACE_TYPE_HANDLED;
1761} 1745}
1762 1746
1763static int print_bin_fmt(struct trace_iterator *iter) 1747static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1764{ 1748{
1765 struct trace_seq *s = &iter->seq; 1749 struct trace_seq *s = &iter->seq;
1766 struct trace_entry *entry; 1750 struct trace_entry *entry;
1767 1751
1768 entry = iter->ent; 1752 entry = iter->ent;
1769 1753
1754 if (entry->type == TRACE_CONT)
1755 return TRACE_TYPE_HANDLED;
1756
1770 SEQ_PUT_FIELD_RET(s, entry->pid); 1757 SEQ_PUT_FIELD_RET(s, entry->pid);
1771 SEQ_PUT_FIELD_RET(s, entry->cpu); 1758 SEQ_PUT_FIELD_RET(s, entry->cpu);
1772 SEQ_PUT_FIELD_RET(s, entry->t); 1759 SEQ_PUT_FIELD_RET(s, iter->ts);
1773 1760
1774 switch (entry->type) { 1761 switch (entry->type) {
1775 case TRACE_FN: 1762 case TRACE_FN: {
1776 SEQ_PUT_FIELD_RET(s, entry->fn.ip); 1763 struct ftrace_entry *field;
1777 SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); 1764
1765 trace_assign_type(field, entry);
1766
1767 SEQ_PUT_FIELD_RET(s, field->ip);
1768 SEQ_PUT_FIELD_RET(s, field->parent_ip);
1778 break; 1769 break;
1779 case TRACE_CTX: 1770 }
1780 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); 1771 case TRACE_CTX: {
1781 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); 1772 struct ctx_switch_entry *field;
1782 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); 1773
1783 SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); 1774 trace_assign_type(field, entry);
1784 SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); 1775
1785 SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); 1776 SEQ_PUT_FIELD_RET(s, field->prev_pid);
1777 SEQ_PUT_FIELD_RET(s, field->prev_prio);
1778 SEQ_PUT_FIELD_RET(s, field->prev_state);
1779 SEQ_PUT_FIELD_RET(s, field->next_pid);
1780 SEQ_PUT_FIELD_RET(s, field->next_prio);
1781 SEQ_PUT_FIELD_RET(s, field->next_state);
1786 break; 1782 break;
1783 }
1787 case TRACE_SPECIAL: 1784 case TRACE_SPECIAL:
1788 case TRACE_STACK: 1785 case TRACE_STACK: {
1789 SEQ_PUT_FIELD_RET(s, entry->special.arg1); 1786 struct special_entry *field;
1790 SEQ_PUT_FIELD_RET(s, entry->special.arg2); 1787
1791 SEQ_PUT_FIELD_RET(s, entry->special.arg3); 1788 trace_assign_type(field, entry);
1789
1790 SEQ_PUT_FIELD_RET(s, field->arg1);
1791 SEQ_PUT_FIELD_RET(s, field->arg2);
1792 SEQ_PUT_FIELD_RET(s, field->arg3);
1792 break; 1793 break;
1793 } 1794 }
1795 }
1794 return 1; 1796 return 1;
1795} 1797}
1796 1798
1797static int trace_empty(struct trace_iterator *iter) 1799static int trace_empty(struct trace_iterator *iter)
1798{ 1800{
1799 struct trace_array_cpu *data;
1800 int cpu; 1801 int cpu;
1801 1802
1802 for_each_tracing_cpu(cpu) { 1803 for_each_tracing_cpu(cpu) {
1803 data = iter->tr->data[cpu]; 1804 if (iter->buffer_iter[cpu]) {
1804 1805 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
1805 if (head_page(data) && data->trace_idx && 1806 return 0;
1806 (data->trace_tail != data->trace_head || 1807 } else {
1807 data->trace_tail_idx != data->trace_head_idx)) 1808 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
1808 return 0; 1809 return 0;
1810 }
1809 } 1811 }
1812
1810 return 1; 1813 return 1;
1811} 1814}
1812 1815
1813static int print_trace_line(struct trace_iterator *iter) 1816static enum print_line_t print_trace_line(struct trace_iterator *iter)
1814{ 1817{
1815 if (iter->trace && iter->trace->print_line) 1818 enum print_line_t ret;
1816 return iter->trace->print_line(iter); 1819
1820 if (iter->trace && iter->trace->print_line) {
1821 ret = iter->trace->print_line(iter);
1822 if (ret != TRACE_TYPE_UNHANDLED)
1823 return ret;
1824 }
1817 1825
1818 if (trace_flags & TRACE_ITER_BIN) 1826 if (trace_flags & TRACE_ITER_BIN)
1819 return print_bin_fmt(iter); 1827 return print_bin_fmt(iter);
@@ -1869,6 +1877,8 @@ static struct trace_iterator *
1869__tracing_open(struct inode *inode, struct file *file, int *ret) 1877__tracing_open(struct inode *inode, struct file *file, int *ret)
1870{ 1878{
1871 struct trace_iterator *iter; 1879 struct trace_iterator *iter;
1880 struct seq_file *m;
1881 int cpu;
1872 1882
1873 if (tracing_disabled) { 1883 if (tracing_disabled) {
1874 *ret = -ENODEV; 1884 *ret = -ENODEV;
@@ -1889,28 +1899,46 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1889 iter->trace = current_trace; 1899 iter->trace = current_trace;
1890 iter->pos = -1; 1900 iter->pos = -1;
1891 1901
1902 for_each_tracing_cpu(cpu) {
1903
1904 iter->buffer_iter[cpu] =
1905 ring_buffer_read_start(iter->tr->buffer, cpu);
1906
1907 if (!iter->buffer_iter[cpu])
1908 goto fail_buffer;
1909 }
1910
1892 /* TODO stop tracer */ 1911 /* TODO stop tracer */
1893 *ret = seq_open(file, &tracer_seq_ops); 1912 *ret = seq_open(file, &tracer_seq_ops);
1894 if (!*ret) { 1913 if (*ret)
1895 struct seq_file *m = file->private_data; 1914 goto fail_buffer;
1896 m->private = iter;
1897 1915
1898 /* stop the trace while dumping */ 1916 m = file->private_data;
1899 if (iter->tr->ctrl) { 1917 m->private = iter;
1900 tracer_enabled = 0;
1901 ftrace_function_enabled = 0;
1902 }
1903 1918
1904 if (iter->trace && iter->trace->open) 1919 /* stop the trace while dumping */
1905 iter->trace->open(iter); 1920 if (iter->tr->ctrl) {
1906 } else { 1921 tracer_enabled = 0;
1907 kfree(iter); 1922 ftrace_function_enabled = 0;
1908 iter = NULL;
1909 } 1923 }
1924
1925 if (iter->trace && iter->trace->open)
1926 iter->trace->open(iter);
1927
1910 mutex_unlock(&trace_types_lock); 1928 mutex_unlock(&trace_types_lock);
1911 1929
1912 out: 1930 out:
1913 return iter; 1931 return iter;
1932
1933 fail_buffer:
1934 for_each_tracing_cpu(cpu) {
1935 if (iter->buffer_iter[cpu])
1936 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1937 }
1938 mutex_unlock(&trace_types_lock);
1939 kfree(iter);
1940
1941 return ERR_PTR(-ENOMEM);
1914} 1942}
1915 1943
1916int tracing_open_generic(struct inode *inode, struct file *filp) 1944int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -1926,8 +1954,14 @@ int tracing_release(struct inode *inode, struct file *file)
1926{ 1954{
1927 struct seq_file *m = (struct seq_file *)file->private_data; 1955 struct seq_file *m = (struct seq_file *)file->private_data;
1928 struct trace_iterator *iter = m->private; 1956 struct trace_iterator *iter = m->private;
1957 int cpu;
1929 1958
1930 mutex_lock(&trace_types_lock); 1959 mutex_lock(&trace_types_lock);
1960 for_each_tracing_cpu(cpu) {
1961 if (iter->buffer_iter[cpu])
1962 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1963 }
1964
1931 if (iter->trace && iter->trace->close) 1965 if (iter->trace && iter->trace->close)
1932 iter->trace->close(iter); 1966 iter->trace->close(iter);
1933 1967
@@ -2352,6 +2386,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2352 struct tracer *t; 2386 struct tracer *t;
2353 char buf[max_tracer_type_len+1]; 2387 char buf[max_tracer_type_len+1];
2354 int i; 2388 int i;
2389 size_t ret;
2390
2391 ret = cnt;
2355 2392
2356 if (cnt > max_tracer_type_len) 2393 if (cnt > max_tracer_type_len)
2357 cnt = max_tracer_type_len; 2394 cnt = max_tracer_type_len;
@@ -2370,7 +2407,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2370 if (strcmp(t->name, buf) == 0) 2407 if (strcmp(t->name, buf) == 0)
2371 break; 2408 break;
2372 } 2409 }
2373 if (!t || t == current_trace) 2410 if (!t) {
2411 ret = -EINVAL;
2412 goto out;
2413 }
2414 if (t == current_trace)
2374 goto out; 2415 goto out;
2375 2416
2376 if (current_trace && current_trace->reset) 2417 if (current_trace && current_trace->reset)
@@ -2383,9 +2424,10 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2383 out: 2424 out:
2384 mutex_unlock(&trace_types_lock); 2425 mutex_unlock(&trace_types_lock);
2385 2426
2386 filp->f_pos += cnt; 2427 if (ret > 0)
2428 filp->f_pos += ret;
2387 2429
2388 return cnt; 2430 return ret;
2389} 2431}
2390 2432
2391static ssize_t 2433static ssize_t
@@ -2500,20 +2542,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2500 size_t cnt, loff_t *ppos) 2542 size_t cnt, loff_t *ppos)
2501{ 2543{
2502 struct trace_iterator *iter = filp->private_data; 2544 struct trace_iterator *iter = filp->private_data;
2503 struct trace_array_cpu *data;
2504 static cpumask_t mask;
2505 unsigned long flags;
2506#ifdef CONFIG_FTRACE
2507 int ftrace_save;
2508#endif
2509 int cpu;
2510 ssize_t sret; 2545 ssize_t sret;
2511 2546
2512 /* return any leftover data */ 2547 /* return any leftover data */
2513 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 2548 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2514 if (sret != -EBUSY) 2549 if (sret != -EBUSY)
2515 return sret; 2550 return sret;
2516 sret = 0;
2517 2551
2518 trace_seq_reset(&iter->seq); 2552 trace_seq_reset(&iter->seq);
2519 2553
@@ -2524,6 +2558,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2524 goto out; 2558 goto out;
2525 } 2559 }
2526 2560
2561waitagain:
2562 sret = 0;
2527 while (trace_empty(iter)) { 2563 while (trace_empty(iter)) {
2528 2564
2529 if ((filp->f_flags & O_NONBLOCK)) { 2565 if ((filp->f_flags & O_NONBLOCK)) {
@@ -2588,46 +2624,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2588 offsetof(struct trace_iterator, seq)); 2624 offsetof(struct trace_iterator, seq));
2589 iter->pos = -1; 2625 iter->pos = -1;
2590 2626
2591 /*
2592 * We need to stop all tracing on all CPUS to read the
2593 * the next buffer. This is a bit expensive, but is
2594 * not done often. We fill all what we can read,
2595 * and then release the locks again.
2596 */
2597
2598 cpus_clear(mask);
2599 local_irq_save(flags);
2600#ifdef CONFIG_FTRACE
2601 ftrace_save = ftrace_enabled;
2602 ftrace_enabled = 0;
2603#endif
2604 smp_wmb();
2605 for_each_tracing_cpu(cpu) {
2606 data = iter->tr->data[cpu];
2607
2608 if (!head_page(data) || !data->trace_idx)
2609 continue;
2610
2611 atomic_inc(&data->disabled);
2612 cpu_set(cpu, mask);
2613 }
2614
2615 for_each_cpu_mask(cpu, mask) {
2616 data = iter->tr->data[cpu];
2617 __raw_spin_lock(&data->lock);
2618
2619 if (data->overrun > iter->last_overrun[cpu])
2620 iter->overrun[cpu] +=
2621 data->overrun - iter->last_overrun[cpu];
2622 iter->last_overrun[cpu] = data->overrun;
2623 }
2624
2625 while (find_next_entry_inc(iter) != NULL) { 2627 while (find_next_entry_inc(iter) != NULL) {
2626 int ret; 2628 enum print_line_t ret;
2627 int len = iter->seq.len; 2629 int len = iter->seq.len;
2628 2630
2629 ret = print_trace_line(iter); 2631 ret = print_trace_line(iter);
2630 if (!ret) { 2632 if (ret == TRACE_TYPE_PARTIAL_LINE) {
2631 /* don't print partial lines */ 2633 /* don't print partial lines */
2632 iter->seq.len = len; 2634 iter->seq.len = len;
2633 break; 2635 break;
@@ -2639,26 +2641,17 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2639 break; 2641 break;
2640 } 2642 }
2641 2643
2642 for_each_cpu_mask(cpu, mask) {
2643 data = iter->tr->data[cpu];
2644 __raw_spin_unlock(&data->lock);
2645 }
2646
2647 for_each_cpu_mask(cpu, mask) {
2648 data = iter->tr->data[cpu];
2649 atomic_dec(&data->disabled);
2650 }
2651#ifdef CONFIG_FTRACE
2652 ftrace_enabled = ftrace_save;
2653#endif
2654 local_irq_restore(flags);
2655
2656 /* Now copy what we have to the user */ 2644 /* Now copy what we have to the user */
2657 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 2645 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2658 if (iter->seq.readpos >= iter->seq.len) 2646 if (iter->seq.readpos >= iter->seq.len)
2659 trace_seq_reset(&iter->seq); 2647 trace_seq_reset(&iter->seq);
2648
2649 /*
2650 * If there was nothing to send to user, inspite of consuming trace
2651 * entries, go back to wait for more entries.
2652 */
2660 if (sret == -EBUSY) 2653 if (sret == -EBUSY)
2661 sret = 0; 2654 goto waitagain;
2662 2655
2663out: 2656out:
2664 mutex_unlock(&trace_types_lock); 2657 mutex_unlock(&trace_types_lock);
@@ -2684,7 +2677,8 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2684{ 2677{
2685 unsigned long val; 2678 unsigned long val;
2686 char buf[64]; 2679 char buf[64];
2687 int i, ret; 2680 int ret, cpu;
2681 struct trace_array *tr = filp->private_data;
2688 2682
2689 if (cnt >= sizeof(buf)) 2683 if (cnt >= sizeof(buf))
2690 return -EINVAL; 2684 return -EINVAL;
@@ -2704,71 +2698,111 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2704 2698
2705 mutex_lock(&trace_types_lock); 2699 mutex_lock(&trace_types_lock);
2706 2700
2707 if (current_trace != &no_tracer) { 2701 if (tr->ctrl) {
2708 cnt = -EBUSY; 2702 cnt = -EBUSY;
2709 pr_info("ftrace: set current_tracer to none" 2703 pr_info("ftrace: please disable tracing"
2710 " before modifying buffer size\n"); 2704 " before modifying buffer size\n");
2711 goto out; 2705 goto out;
2712 } 2706 }
2713 2707
2714 if (val > global_trace.entries) { 2708 /* disable all cpu buffers */
2715 long pages_requested; 2709 for_each_tracing_cpu(cpu) {
2716 unsigned long freeable_pages; 2710 if (global_trace.data[cpu])
2717 2711 atomic_inc(&global_trace.data[cpu]->disabled);
2718 /* make sure we have enough memory before mapping */ 2712 if (max_tr.data[cpu])
2719 pages_requested = 2713 atomic_inc(&max_tr.data[cpu]->disabled);
2720 (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; 2714 }
2721
2722 /* account for each buffer (and max_tr) */
2723 pages_requested *= tracing_nr_buffers * 2;
2724
2725 /* Check for overflow */
2726 if (pages_requested < 0) {
2727 cnt = -ENOMEM;
2728 goto out;
2729 }
2730
2731 freeable_pages = determine_dirtyable_memory();
2732 2715
2733 /* we only allow to request 1/4 of useable memory */ 2716 if (val != global_trace.entries) {
2734 if (pages_requested > 2717 ret = ring_buffer_resize(global_trace.buffer, val);
2735 ((freeable_pages + tracing_pages_allocated) / 4)) { 2718 if (ret < 0) {
2736 cnt = -ENOMEM; 2719 cnt = ret;
2737 goto out; 2720 goto out;
2738 } 2721 }
2739 2722
2740 while (global_trace.entries < val) { 2723 ret = ring_buffer_resize(max_tr.buffer, val);
2741 if (trace_alloc_page()) { 2724 if (ret < 0) {
2742 cnt = -ENOMEM; 2725 int r;
2743 goto out; 2726 cnt = ret;
2727 r = ring_buffer_resize(global_trace.buffer,
2728 global_trace.entries);
2729 if (r < 0) {
2730 /* AARGH! We are left with different
2731 * size max buffer!!!! */
2732 WARN_ON(1);
2733 tracing_disabled = 1;
2744 } 2734 }
2745 /* double check that we don't go over the known pages */ 2735 goto out;
2746 if (tracing_pages_allocated > pages_requested)
2747 break;
2748 } 2736 }
2749 2737
2750 } else { 2738 global_trace.entries = val;
2751 /* include the number of entries in val (inc of page entries) */
2752 while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
2753 trace_free_page();
2754 } 2739 }
2755 2740
2756 /* check integrity */
2757 for_each_tracing_cpu(i)
2758 check_pages(global_trace.data[i]);
2759
2760 filp->f_pos += cnt; 2741 filp->f_pos += cnt;
2761 2742
2762 /* If check pages failed, return ENOMEM */ 2743 /* If check pages failed, return ENOMEM */
2763 if (tracing_disabled) 2744 if (tracing_disabled)
2764 cnt = -ENOMEM; 2745 cnt = -ENOMEM;
2765 out: 2746 out:
2747 for_each_tracing_cpu(cpu) {
2748 if (global_trace.data[cpu])
2749 atomic_dec(&global_trace.data[cpu]->disabled);
2750 if (max_tr.data[cpu])
2751 atomic_dec(&max_tr.data[cpu]->disabled);
2752 }
2753
2766 max_tr.entries = global_trace.entries; 2754 max_tr.entries = global_trace.entries;
2767 mutex_unlock(&trace_types_lock); 2755 mutex_unlock(&trace_types_lock);
2768 2756
2769 return cnt; 2757 return cnt;
2770} 2758}
2771 2759
2760static int mark_printk(const char *fmt, ...)
2761{
2762 int ret;
2763 va_list args;
2764 va_start(args, fmt);
2765 ret = trace_vprintk(0, fmt, args);
2766 va_end(args);
2767 return ret;
2768}
2769
2770static ssize_t
2771tracing_mark_write(struct file *filp, const char __user *ubuf,
2772 size_t cnt, loff_t *fpos)
2773{
2774 char *buf;
2775 char *end;
2776 struct trace_array *tr = &global_trace;
2777
2778 if (!tr->ctrl || tracing_disabled)
2779 return -EINVAL;
2780
2781 if (cnt > TRACE_BUF_SIZE)
2782 cnt = TRACE_BUF_SIZE;
2783
2784 buf = kmalloc(cnt + 1, GFP_KERNEL);
2785 if (buf == NULL)
2786 return -ENOMEM;
2787
2788 if (copy_from_user(buf, ubuf, cnt)) {
2789 kfree(buf);
2790 return -EFAULT;
2791 }
2792
2793 /* Cut from the first nil or newline. */
2794 buf[cnt] = '\0';
2795 end = strchr(buf, '\n');
2796 if (end)
2797 *end = '\0';
2798
2799 cnt = mark_printk("%s\n", buf);
2800 kfree(buf);
2801 *fpos += cnt;
2802
2803 return cnt;
2804}
2805
2772static struct file_operations tracing_max_lat_fops = { 2806static struct file_operations tracing_max_lat_fops = {
2773 .open = tracing_open_generic, 2807 .open = tracing_open_generic,
2774 .read = tracing_max_lat_read, 2808 .read = tracing_max_lat_read,
@@ -2800,6 +2834,11 @@ static struct file_operations tracing_entries_fops = {
2800 .write = tracing_entries_write, 2834 .write = tracing_entries_write,
2801}; 2835};
2802 2836
2837static struct file_operations tracing_mark_fops = {
2838 .open = tracing_open_generic,
2839 .write = tracing_mark_write,
2840};
2841
2803#ifdef CONFIG_DYNAMIC_FTRACE 2842#ifdef CONFIG_DYNAMIC_FTRACE
2804 2843
2805static ssize_t 2844static ssize_t
@@ -2846,7 +2885,7 @@ struct dentry *tracing_init_dentry(void)
2846#include "trace_selftest.c" 2885#include "trace_selftest.c"
2847#endif 2886#endif
2848 2887
2849static __init void tracer_init_debugfs(void) 2888static __init int tracer_init_debugfs(void)
2850{ 2889{
2851 struct dentry *d_tracer; 2890 struct dentry *d_tracer;
2852 struct dentry *entry; 2891 struct dentry *entry;
@@ -2881,12 +2920,12 @@ static __init void tracer_init_debugfs(void)
2881 entry = debugfs_create_file("available_tracers", 0444, d_tracer, 2920 entry = debugfs_create_file("available_tracers", 0444, d_tracer,
2882 &global_trace, &show_traces_fops); 2921 &global_trace, &show_traces_fops);
2883 if (!entry) 2922 if (!entry)
2884 pr_warning("Could not create debugfs 'trace' entry\n"); 2923 pr_warning("Could not create debugfs 'available_tracers' entry\n");
2885 2924
2886 entry = debugfs_create_file("current_tracer", 0444, d_tracer, 2925 entry = debugfs_create_file("current_tracer", 0444, d_tracer,
2887 &global_trace, &set_tracer_fops); 2926 &global_trace, &set_tracer_fops);
2888 if (!entry) 2927 if (!entry)
2889 pr_warning("Could not create debugfs 'trace' entry\n"); 2928 pr_warning("Could not create debugfs 'current_tracer' entry\n");
2890 2929
2891 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, 2930 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
2892 &tracing_max_latency, 2931 &tracing_max_latency,
@@ -2899,7 +2938,7 @@ static __init void tracer_init_debugfs(void)
2899 &tracing_thresh, &tracing_max_lat_fops); 2938 &tracing_thresh, &tracing_max_lat_fops);
2900 if (!entry) 2939 if (!entry)
2901 pr_warning("Could not create debugfs " 2940 pr_warning("Could not create debugfs "
2902 "'tracing_threash' entry\n"); 2941 "'tracing_thresh' entry\n");
2903 entry = debugfs_create_file("README", 0644, d_tracer, 2942 entry = debugfs_create_file("README", 0644, d_tracer,
2904 NULL, &tracing_readme_fops); 2943 NULL, &tracing_readme_fops);
2905 if (!entry) 2944 if (!entry)
@@ -2909,13 +2948,19 @@ static __init void tracer_init_debugfs(void)
2909 NULL, &tracing_pipe_fops); 2948 NULL, &tracing_pipe_fops);
2910 if (!entry) 2949 if (!entry)
2911 pr_warning("Could not create debugfs " 2950 pr_warning("Could not create debugfs "
2912 "'tracing_threash' entry\n"); 2951 "'trace_pipe' entry\n");
2913 2952
2914 entry = debugfs_create_file("trace_entries", 0644, d_tracer, 2953 entry = debugfs_create_file("trace_entries", 0644, d_tracer,
2915 &global_trace, &tracing_entries_fops); 2954 &global_trace, &tracing_entries_fops);
2916 if (!entry) 2955 if (!entry)
2917 pr_warning("Could not create debugfs " 2956 pr_warning("Could not create debugfs "
2918 "'tracing_threash' entry\n"); 2957 "'trace_entries' entry\n");
2958
2959 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
2960 NULL, &tracing_mark_fops);
2961 if (!entry)
2962 pr_warning("Could not create debugfs "
2963 "'trace_marker' entry\n");
2919 2964
2920#ifdef CONFIG_DYNAMIC_FTRACE 2965#ifdef CONFIG_DYNAMIC_FTRACE
2921 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 2966 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
@@ -2928,230 +2973,263 @@ static __init void tracer_init_debugfs(void)
2928#ifdef CONFIG_SYSPROF_TRACER 2973#ifdef CONFIG_SYSPROF_TRACER
2929 init_tracer_sysprof_debugfs(d_tracer); 2974 init_tracer_sysprof_debugfs(d_tracer);
2930#endif 2975#endif
2976 return 0;
2931} 2977}
2932 2978
2933static int trace_alloc_page(void) 2979int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2934{ 2980{
2981 static DEFINE_SPINLOCK(trace_buf_lock);
2982 static char trace_buf[TRACE_BUF_SIZE];
2983
2984 struct ring_buffer_event *event;
2985 struct trace_array *tr = &global_trace;
2935 struct trace_array_cpu *data; 2986 struct trace_array_cpu *data;
2936 struct page *page, *tmp; 2987 struct print_entry *entry;
2937 LIST_HEAD(pages); 2988 unsigned long flags, irq_flags;
2938 void *array; 2989 int cpu, len = 0, size, pc;
2939 unsigned pages_allocated = 0;
2940 int i;
2941 2990
2942 /* first allocate a page for each CPU */ 2991 if (!tr->ctrl || tracing_disabled)
2943 for_each_tracing_cpu(i) { 2992 return 0;
2944 array = (void *)__get_free_page(GFP_KERNEL);
2945 if (array == NULL) {
2946 printk(KERN_ERR "tracer: failed to allocate page"
2947 "for trace buffer!\n");
2948 goto free_pages;
2949 }
2950 2993
2951 pages_allocated++; 2994 pc = preempt_count();
2952 page = virt_to_page(array); 2995 preempt_disable_notrace();
2953 list_add(&page->lru, &pages); 2996 cpu = raw_smp_processor_id();
2997 data = tr->data[cpu];
2954 2998
2955/* Only allocate if we are actually using the max trace */ 2999 if (unlikely(atomic_read(&data->disabled)))
2956#ifdef CONFIG_TRACER_MAX_TRACE 3000 goto out;
2957 array = (void *)__get_free_page(GFP_KERNEL);
2958 if (array == NULL) {
2959 printk(KERN_ERR "tracer: failed to allocate page"
2960 "for trace buffer!\n");
2961 goto free_pages;
2962 }
2963 pages_allocated++;
2964 page = virt_to_page(array);
2965 list_add(&page->lru, &pages);
2966#endif
2967 }
2968 3001
2969 /* Now that we successfully allocate a page per CPU, add them */ 3002 spin_lock_irqsave(&trace_buf_lock, flags);
2970 for_each_tracing_cpu(i) { 3003 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
2971 data = global_trace.data[i];
2972 page = list_entry(pages.next, struct page, lru);
2973 list_del_init(&page->lru);
2974 list_add_tail(&page->lru, &data->trace_pages);
2975 ClearPageLRU(page);
2976 3004
2977#ifdef CONFIG_TRACER_MAX_TRACE 3005 len = min(len, TRACE_BUF_SIZE-1);
2978 data = max_tr.data[i]; 3006 trace_buf[len] = 0;
2979 page = list_entry(pages.next, struct page, lru);
2980 list_del_init(&page->lru);
2981 list_add_tail(&page->lru, &data->trace_pages);
2982 SetPageLRU(page);
2983#endif
2984 }
2985 tracing_pages_allocated += pages_allocated;
2986 global_trace.entries += ENTRIES_PER_PAGE;
2987 3007
2988 return 0; 3008 size = sizeof(*entry) + len + 1;
3009 event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
3010 if (!event)
3011 goto out_unlock;
3012 entry = ring_buffer_event_data(event);
3013 tracing_generic_entry_update(&entry->ent, flags, pc);
3014 entry->ent.type = TRACE_PRINT;
3015 entry->ip = ip;
2989 3016
2990 free_pages: 3017 memcpy(&entry->buf, trace_buf, len);
2991 list_for_each_entry_safe(page, tmp, &pages, lru) { 3018 entry->buf[len] = 0;
2992 list_del_init(&page->lru); 3019 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
2993 __free_page(page); 3020
2994 } 3021 out_unlock:
2995 return -ENOMEM; 3022 spin_unlock_irqrestore(&trace_buf_lock, flags);
3023
3024 out:
3025 preempt_enable_notrace();
3026
3027 return len;
2996} 3028}
3029EXPORT_SYMBOL_GPL(trace_vprintk);
2997 3030
2998static int trace_free_page(void) 3031int __ftrace_printk(unsigned long ip, const char *fmt, ...)
2999{ 3032{
3000 struct trace_array_cpu *data; 3033 int ret;
3001 struct page *page; 3034 va_list ap;
3002 struct list_head *p;
3003 int i;
3004 int ret = 0;
3005 3035
3006 /* free one page from each buffer */ 3036 if (!(trace_flags & TRACE_ITER_PRINTK))
3007 for_each_tracing_cpu(i) { 3037 return 0;
3008 data = global_trace.data[i];
3009 p = data->trace_pages.next;
3010 if (p == &data->trace_pages) {
3011 /* should never happen */
3012 WARN_ON(1);
3013 tracing_disabled = 1;
3014 ret = -1;
3015 break;
3016 }
3017 page = list_entry(p, struct page, lru);
3018 ClearPageLRU(page);
3019 list_del(&page->lru);
3020 tracing_pages_allocated--;
3021 tracing_pages_allocated--;
3022 __free_page(page);
3023 3038
3024 tracing_reset(data); 3039 va_start(ap, fmt);
3040 ret = trace_vprintk(ip, fmt, ap);
3041 va_end(ap);
3042 return ret;
3043}
3044EXPORT_SYMBOL_GPL(__ftrace_printk);
3025 3045
3026#ifdef CONFIG_TRACER_MAX_TRACE 3046static int trace_panic_handler(struct notifier_block *this,
3027 data = max_tr.data[i]; 3047 unsigned long event, void *unused)
3028 p = data->trace_pages.next; 3048{
3029 if (p == &data->trace_pages) { 3049 ftrace_dump();
3030 /* should never happen */ 3050 return NOTIFY_OK;
3031 WARN_ON(1); 3051}
3032 tracing_disabled = 1;
3033 ret = -1;
3034 break;
3035 }
3036 page = list_entry(p, struct page, lru);
3037 ClearPageLRU(page);
3038 list_del(&page->lru);
3039 __free_page(page);
3040 3052
3041 tracing_reset(data); 3053static struct notifier_block trace_panic_notifier = {
3042#endif 3054 .notifier_call = trace_panic_handler,
3043 } 3055 .next = NULL,
3044 global_trace.entries -= ENTRIES_PER_PAGE; 3056 .priority = 150 /* priority: INT_MAX >= x >= 0 */
3057};
3045 3058
3046 return ret; 3059static int trace_die_handler(struct notifier_block *self,
3060 unsigned long val,
3061 void *data)
3062{
3063 switch (val) {
3064 case DIE_OOPS:
3065 ftrace_dump();
3066 break;
3067 default:
3068 break;
3069 }
3070 return NOTIFY_OK;
3047} 3071}
3048 3072
3049__init static int tracer_alloc_buffers(void) 3073static struct notifier_block trace_die_notifier = {
3074 .notifier_call = trace_die_handler,
3075 .priority = 200
3076};
3077
3078/*
3079 * printk is set to max of 1024, we really don't need it that big.
3080 * Nothing should be printing 1000 characters anyway.
3081 */
3082#define TRACE_MAX_PRINT 1000
3083
3084/*
3085 * Define here KERN_TRACE so that we have one place to modify
3086 * it if we decide to change what log level the ftrace dump
3087 * should be at.
3088 */
3089#define KERN_TRACE KERN_INFO
3090
3091static void
3092trace_printk_seq(struct trace_seq *s)
3050{ 3093{
3051 struct trace_array_cpu *data; 3094 /* Probably should print a warning here. */
3052 void *array; 3095 if (s->len >= 1000)
3053 struct page *page; 3096 s->len = 1000;
3054 int pages = 0;
3055 int ret = -ENOMEM;
3056 int i;
3057 3097
3058 /* TODO: make the number of buffers hot pluggable with CPUS */ 3098 /* should be zero ended, but we are paranoid. */
3059 tracing_nr_buffers = num_possible_cpus(); 3099 s->buffer[s->len] = 0;
3060 tracing_buffer_mask = cpu_possible_map;
3061 3100
3062 /* Allocate the first page for all buffers */ 3101 printk(KERN_TRACE "%s", s->buffer);
3063 for_each_tracing_cpu(i) { 3102
3064 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); 3103 trace_seq_reset(s);
3065 max_tr.data[i] = &per_cpu(max_data, i); 3104}
3066 3105
3067 array = (void *)__get_free_page(GFP_KERNEL);
3068 if (array == NULL) {
3069 printk(KERN_ERR "tracer: failed to allocate page"
3070 "for trace buffer!\n");
3071 goto free_buffers;
3072 }
3073 3106
3074 /* set the array to the list */ 3107void ftrace_dump(void)
3075 INIT_LIST_HEAD(&data->trace_pages); 3108{
3076 page = virt_to_page(array); 3109 static DEFINE_SPINLOCK(ftrace_dump_lock);
3077 list_add(&page->lru, &data->trace_pages); 3110 /* use static because iter can be a bit big for the stack */
3078 /* use the LRU flag to differentiate the two buffers */ 3111 static struct trace_iterator iter;
3079 ClearPageLRU(page); 3112 static cpumask_t mask;
3113 static int dump_ran;
3114 unsigned long flags;
3115 int cnt = 0, cpu;
3080 3116
3081 data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 3117 /* only one dump */
3082 max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 3118 spin_lock_irqsave(&ftrace_dump_lock, flags);
3119 if (dump_ran)
3120 goto out;
3083 3121
3084/* Only allocate if we are actually using the max trace */ 3122 dump_ran = 1;
3085#ifdef CONFIG_TRACER_MAX_TRACE
3086 array = (void *)__get_free_page(GFP_KERNEL);
3087 if (array == NULL) {
3088 printk(KERN_ERR "tracer: failed to allocate page"
3089 "for trace buffer!\n");
3090 goto free_buffers;
3091 }
3092 3123
3093 INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); 3124 /* No turning back! */
3094 page = virt_to_page(array); 3125 ftrace_kill();
3095 list_add(&page->lru, &max_tr.data[i]->trace_pages); 3126
3096 SetPageLRU(page); 3127 for_each_tracing_cpu(cpu) {
3097#endif 3128 atomic_inc(&global_trace.data[cpu]->disabled);
3098 } 3129 }
3099 3130
3131 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3132
3133 iter.tr = &global_trace;
3134 iter.trace = current_trace;
3135
3100 /* 3136 /*
3101 * Since we allocate by orders of pages, we may be able to 3137 * We need to stop all tracing on all CPUS to read the
3102 * round up a bit. 3138 * the next buffer. This is a bit expensive, but is
3139 * not done often. We fill all what we can read,
3140 * and then release the locks again.
3103 */ 3141 */
3104 global_trace.entries = ENTRIES_PER_PAGE;
3105 pages++;
3106 3142
3107 while (global_trace.entries < trace_nr_entries) { 3143 cpus_clear(mask);
3108 if (trace_alloc_page()) 3144
3109 break; 3145 while (!trace_empty(&iter)) {
3110 pages++; 3146
3147 if (!cnt)
3148 printk(KERN_TRACE "---------------------------------\n");
3149
3150 cnt++;
3151
3152 /* reset all but tr, trace, and overruns */
3153 memset(&iter.seq, 0,
3154 sizeof(struct trace_iterator) -
3155 offsetof(struct trace_iterator, seq));
3156 iter.iter_flags |= TRACE_FILE_LAT_FMT;
3157 iter.pos = -1;
3158
3159 if (find_next_entry_inc(&iter) != NULL) {
3160 print_trace_line(&iter);
3161 trace_consume(&iter);
3162 }
3163
3164 trace_printk_seq(&iter.seq);
3111 } 3165 }
3112 max_tr.entries = global_trace.entries;
3113 3166
3114 pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n", 3167 if (!cnt)
3115 pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE); 3168 printk(KERN_TRACE " (ftrace buffer empty)\n");
3116 pr_info(" actual entries %ld\n", global_trace.entries); 3169 else
3170 printk(KERN_TRACE "---------------------------------\n");
3171
3172 out:
3173 spin_unlock_irqrestore(&ftrace_dump_lock, flags);
3174}
3175
3176__init static int tracer_alloc_buffers(void)
3177{
3178 struct trace_array_cpu *data;
3179 int i;
3180
3181 /* TODO: make the number of buffers hot pluggable with CPUS */
3182 tracing_buffer_mask = cpu_possible_map;
3183
3184 global_trace.buffer = ring_buffer_alloc(trace_buf_size,
3185 TRACE_BUFFER_FLAGS);
3186 if (!global_trace.buffer) {
3187 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
3188 WARN_ON(1);
3189 return 0;
3190 }
3191 global_trace.entries = ring_buffer_size(global_trace.buffer);
3192
3193#ifdef CONFIG_TRACER_MAX_TRACE
3194 max_tr.buffer = ring_buffer_alloc(trace_buf_size,
3195 TRACE_BUFFER_FLAGS);
3196 if (!max_tr.buffer) {
3197 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
3198 WARN_ON(1);
3199 ring_buffer_free(global_trace.buffer);
3200 return 0;
3201 }
3202 max_tr.entries = ring_buffer_size(max_tr.buffer);
3203 WARN_ON(max_tr.entries != global_trace.entries);
3204#endif
3117 3205
3118 tracer_init_debugfs(); 3206 /* Allocate the first page for all buffers */
3207 for_each_tracing_cpu(i) {
3208 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
3209 max_tr.data[i] = &per_cpu(max_data, i);
3210 }
3119 3211
3120 trace_init_cmdlines(); 3212 trace_init_cmdlines();
3121 3213
3122 register_tracer(&no_tracer); 3214 register_tracer(&nop_trace);
3123 current_trace = &no_tracer; 3215#ifdef CONFIG_BOOT_TRACER
3216 register_tracer(&boot_tracer);
3217 current_trace = &boot_tracer;
3218 current_trace->init(&global_trace);
3219#else
3220 current_trace = &nop_trace;
3221#endif
3124 3222
3125 /* All seems OK, enable tracing */ 3223 /* All seems OK, enable tracing */
3126 global_trace.ctrl = tracer_enabled; 3224 global_trace.ctrl = tracer_enabled;
3127 tracing_disabled = 0; 3225 tracing_disabled = 0;
3128 3226
3129 return 0; 3227 atomic_notifier_chain_register(&panic_notifier_list,
3130 3228 &trace_panic_notifier);
3131 free_buffers:
3132 for (i-- ; i >= 0; i--) {
3133 struct page *page, *tmp;
3134 struct trace_array_cpu *data = global_trace.data[i];
3135 3229
3136 if (data) { 3230 register_die_notifier(&trace_die_notifier);
3137 list_for_each_entry_safe(page, tmp,
3138 &data->trace_pages, lru) {
3139 list_del_init(&page->lru);
3140 __free_page(page);
3141 }
3142 }
3143 3231
3144#ifdef CONFIG_TRACER_MAX_TRACE 3232 return 0;
3145 data = max_tr.data[i];
3146 if (data) {
3147 list_for_each_entry_safe(page, tmp,
3148 &data->trace_pages, lru) {
3149 list_del_init(&page->lru);
3150 __free_page(page);
3151 }
3152 }
3153#endif
3154 }
3155 return ret;
3156} 3233}
3157fs_initcall(tracer_alloc_buffers); 3234early_initcall(tracer_alloc_buffers);
3235fs_initcall(tracer_init_debugfs);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f69f86788c2b..8465ad052707 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -5,7 +5,9 @@
5#include <asm/atomic.h> 5#include <asm/atomic.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h>
8#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h>
9 11
10enum trace_type { 12enum trace_type {
11 __TRACE_FIRST_TYPE = 0, 13 __TRACE_FIRST_TYPE = 0,
@@ -13,38 +15,60 @@ enum trace_type {
13 TRACE_FN, 15 TRACE_FN,
14 TRACE_CTX, 16 TRACE_CTX,
15 TRACE_WAKE, 17 TRACE_WAKE,
18 TRACE_CONT,
16 TRACE_STACK, 19 TRACE_STACK,
20 TRACE_PRINT,
17 TRACE_SPECIAL, 21 TRACE_SPECIAL,
18 TRACE_MMIO_RW, 22 TRACE_MMIO_RW,
19 TRACE_MMIO_MAP, 23 TRACE_MMIO_MAP,
24 TRACE_BOOT,
20 25
21 __TRACE_LAST_TYPE 26 __TRACE_LAST_TYPE
22}; 27};
23 28
24/* 29/*
30 * The trace entry - the most basic unit of tracing. This is what
31 * is printed in the end as a single line in the trace output, such as:
32 *
33 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
34 */
35struct trace_entry {
36 unsigned char type;
37 unsigned char cpu;
38 unsigned char flags;
39 unsigned char preempt_count;
40 int pid;
41};
42
43/*
25 * Function trace entry - function address and parent function addres: 44 * Function trace entry - function address and parent function addres:
26 */ 45 */
27struct ftrace_entry { 46struct ftrace_entry {
47 struct trace_entry ent;
28 unsigned long ip; 48 unsigned long ip;
29 unsigned long parent_ip; 49 unsigned long parent_ip;
30}; 50};
51extern struct tracer boot_tracer;
31 52
32/* 53/*
33 * Context switch trace entry - which task (and prio) we switched from/to: 54 * Context switch trace entry - which task (and prio) we switched from/to:
34 */ 55 */
35struct ctx_switch_entry { 56struct ctx_switch_entry {
57 struct trace_entry ent;
36 unsigned int prev_pid; 58 unsigned int prev_pid;
37 unsigned char prev_prio; 59 unsigned char prev_prio;
38 unsigned char prev_state; 60 unsigned char prev_state;
39 unsigned int next_pid; 61 unsigned int next_pid;
40 unsigned char next_prio; 62 unsigned char next_prio;
41 unsigned char next_state; 63 unsigned char next_state;
64 unsigned int next_cpu;
42}; 65};
43 66
44/* 67/*
45 * Special (free-form) trace entry: 68 * Special (free-form) trace entry:
46 */ 69 */
47struct special_entry { 70struct special_entry {
71 struct trace_entry ent;
48 unsigned long arg1; 72 unsigned long arg1;
49 unsigned long arg2; 73 unsigned long arg2;
50 unsigned long arg3; 74 unsigned long arg3;
@@ -57,33 +81,62 @@ struct special_entry {
57#define FTRACE_STACK_ENTRIES 8 81#define FTRACE_STACK_ENTRIES 8
58 82
59struct stack_entry { 83struct stack_entry {
84 struct trace_entry ent;
60 unsigned long caller[FTRACE_STACK_ENTRIES]; 85 unsigned long caller[FTRACE_STACK_ENTRIES];
61}; 86};
62 87
63/* 88/*
64 * The trace entry - the most basic unit of tracing. This is what 89 * ftrace_printk entry:
65 * is printed in the end as a single line in the trace output, such as:
66 *
67 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
68 */ 90 */
69struct trace_entry { 91struct print_entry {
70 char type; 92 struct trace_entry ent;
71 char cpu; 93 unsigned long ip;
72 char flags; 94 char buf[];
73 char preempt_count; 95};
74 int pid; 96
75 cycle_t t; 97#define TRACE_OLD_SIZE 88
76 union { 98
77 struct ftrace_entry fn; 99struct trace_field_cont {
78 struct ctx_switch_entry ctx; 100 unsigned char type;
79 struct special_entry special; 101 /* Temporary till we get rid of this completely */
80 struct stack_entry stack; 102 char buf[TRACE_OLD_SIZE - 1];
81 struct mmiotrace_rw mmiorw; 103};
82 struct mmiotrace_map mmiomap; 104
83 }; 105struct trace_mmiotrace_rw {
106 struct trace_entry ent;
107 struct mmiotrace_rw rw;
108};
109
110struct trace_mmiotrace_map {
111 struct trace_entry ent;
112 struct mmiotrace_map map;
113};
114
115struct trace_boot {
116 struct trace_entry ent;
117 struct boot_trace initcall;
118};
119
120/*
121 * trace_flag_type is an enumeration that holds different
122 * states when a trace occurs. These are:
123 * IRQS_OFF - interrupts were disabled
124 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
125 * NEED_RESCED - reschedule is requested
126 * HARDIRQ - inside an interrupt handler
127 * SOFTIRQ - inside a softirq handler
128 * CONT - multiple entries hold the trace item
129 */
130enum trace_flag_type {
131 TRACE_FLAG_IRQS_OFF = 0x01,
132 TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
133 TRACE_FLAG_NEED_RESCHED = 0x04,
134 TRACE_FLAG_HARDIRQ = 0x08,
135 TRACE_FLAG_SOFTIRQ = 0x10,
136 TRACE_FLAG_CONT = 0x20,
84}; 137};
85 138
86#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) 139#define TRACE_BUF_SIZE 1024
87 140
88/* 141/*
89 * The CPU trace array - it consists of thousands of trace entries 142 * The CPU trace array - it consists of thousands of trace entries
@@ -91,16 +144,9 @@ struct trace_entry {
91 * the trace, etc.) 144 * the trace, etc.)
92 */ 145 */
93struct trace_array_cpu { 146struct trace_array_cpu {
94 struct list_head trace_pages;
95 atomic_t disabled; 147 atomic_t disabled;
96 raw_spinlock_t lock;
97 struct lock_class_key lock_key;
98 148
99 /* these fields get copied into max-trace: */ 149 /* these fields get copied into max-trace: */
100 unsigned trace_head_idx;
101 unsigned trace_tail_idx;
102 void *trace_head; /* producer */
103 void *trace_tail; /* consumer */
104 unsigned long trace_idx; 150 unsigned long trace_idx;
105 unsigned long overrun; 151 unsigned long overrun;
106 unsigned long saved_latency; 152 unsigned long saved_latency;
@@ -124,6 +170,7 @@ struct trace_iterator;
124 * They have on/off state as well: 170 * They have on/off state as well:
125 */ 171 */
126struct trace_array { 172struct trace_array {
173 struct ring_buffer *buffer;
127 unsigned long entries; 174 unsigned long entries;
128 long ctrl; 175 long ctrl;
129 int cpu; 176 int cpu;
@@ -132,6 +179,56 @@ struct trace_array {
132 struct trace_array_cpu *data[NR_CPUS]; 179 struct trace_array_cpu *data[NR_CPUS];
133}; 180};
134 181
182#define FTRACE_CMP_TYPE(var, type) \
183 __builtin_types_compatible_p(typeof(var), type *)
184
185#undef IF_ASSIGN
186#define IF_ASSIGN(var, entry, etype, id) \
187 if (FTRACE_CMP_TYPE(var, etype)) { \
188 var = (typeof(var))(entry); \
189 WARN_ON(id && (entry)->type != id); \
190 break; \
191 }
192
193/* Will cause compile errors if type is not found. */
194extern void __ftrace_bad_type(void);
195
196/*
197 * The trace_assign_type is a verifier that the entry type is
198 * the same as the type being assigned. To add new types simply
199 * add a line with the following format:
200 *
201 * IF_ASSIGN(var, ent, type, id);
202 *
203 * Where "type" is the trace type that includes the trace_entry
204 * as the "ent" item. And "id" is the trace identifier that is
205 * used in the trace_type enum.
206 *
207 * If the type can have more than one id, then use zero.
208 */
209#define trace_assign_type(var, ent) \
210 do { \
211 IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
212 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
213 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
214 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
215 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
216 IF_ASSIGN(var, ent, struct special_entry, 0); \
217 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
218 TRACE_MMIO_RW); \
219 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
220 TRACE_MMIO_MAP); \
221 IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
222 __ftrace_bad_type(); \
223 } while (0)
224
225/* Return values for print_line callback */
226enum print_line_t {
227 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
228 TRACE_TYPE_HANDLED = 1,
229 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
230};
231
135/* 232/*
136 * A specific tracer, represented by methods that operate on a trace array: 233 * A specific tracer, represented by methods that operate on a trace array:
137 */ 234 */
@@ -152,7 +249,7 @@ struct tracer {
152 int (*selftest)(struct tracer *trace, 249 int (*selftest)(struct tracer *trace,
153 struct trace_array *tr); 250 struct trace_array *tr);
154#endif 251#endif
155 int (*print_line)(struct trace_iterator *iter); 252 enum print_line_t (*print_line)(struct trace_iterator *iter);
156 struct tracer *next; 253 struct tracer *next;
157 int print_max; 254 int print_max;
158}; 255};
@@ -171,57 +268,58 @@ struct trace_iterator {
171 struct trace_array *tr; 268 struct trace_array *tr;
172 struct tracer *trace; 269 struct tracer *trace;
173 void *private; 270 void *private;
174 long last_overrun[NR_CPUS]; 271 struct ring_buffer_iter *buffer_iter[NR_CPUS];
175 long overrun[NR_CPUS];
176 272
177 /* The below is zeroed out in pipe_read */ 273 /* The below is zeroed out in pipe_read */
178 struct trace_seq seq; 274 struct trace_seq seq;
179 struct trace_entry *ent; 275 struct trace_entry *ent;
180 int cpu; 276 int cpu;
181 277 u64 ts;
182 struct trace_entry *prev_ent;
183 int prev_cpu;
184 278
185 unsigned long iter_flags; 279 unsigned long iter_flags;
186 loff_t pos; 280 loff_t pos;
187 unsigned long next_idx[NR_CPUS];
188 struct list_head *next_page[NR_CPUS];
189 unsigned next_page_idx[NR_CPUS];
190 long idx; 281 long idx;
191}; 282};
192 283
193void tracing_reset(struct trace_array_cpu *data); 284void trace_wake_up(void);
285void tracing_reset(struct trace_array *tr, int cpu);
194int tracing_open_generic(struct inode *inode, struct file *filp); 286int tracing_open_generic(struct inode *inode, struct file *filp);
195struct dentry *tracing_init_dentry(void); 287struct dentry *tracing_init_dentry(void);
196void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 288void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
197 289
290struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
291 struct trace_array_cpu *data);
292void tracing_generic_entry_update(struct trace_entry *entry,
293 unsigned long flags,
294 int pc);
295
198void ftrace(struct trace_array *tr, 296void ftrace(struct trace_array *tr,
199 struct trace_array_cpu *data, 297 struct trace_array_cpu *data,
200 unsigned long ip, 298 unsigned long ip,
201 unsigned long parent_ip, 299 unsigned long parent_ip,
202 unsigned long flags); 300 unsigned long flags, int pc);
203void tracing_sched_switch_trace(struct trace_array *tr, 301void tracing_sched_switch_trace(struct trace_array *tr,
204 struct trace_array_cpu *data, 302 struct trace_array_cpu *data,
205 struct task_struct *prev, 303 struct task_struct *prev,
206 struct task_struct *next, 304 struct task_struct *next,
207 unsigned long flags); 305 unsigned long flags, int pc);
208void tracing_record_cmdline(struct task_struct *tsk); 306void tracing_record_cmdline(struct task_struct *tsk);
209 307
210void tracing_sched_wakeup_trace(struct trace_array *tr, 308void tracing_sched_wakeup_trace(struct trace_array *tr,
211 struct trace_array_cpu *data, 309 struct trace_array_cpu *data,
212 struct task_struct *wakee, 310 struct task_struct *wakee,
213 struct task_struct *cur, 311 struct task_struct *cur,
214 unsigned long flags); 312 unsigned long flags, int pc);
215void trace_special(struct trace_array *tr, 313void trace_special(struct trace_array *tr,
216 struct trace_array_cpu *data, 314 struct trace_array_cpu *data,
217 unsigned long arg1, 315 unsigned long arg1,
218 unsigned long arg2, 316 unsigned long arg2,
219 unsigned long arg3); 317 unsigned long arg3, int pc);
220void trace_function(struct trace_array *tr, 318void trace_function(struct trace_array *tr,
221 struct trace_array_cpu *data, 319 struct trace_array_cpu *data,
222 unsigned long ip, 320 unsigned long ip,
223 unsigned long parent_ip, 321 unsigned long parent_ip,
224 unsigned long flags); 322 unsigned long flags, int pc);
225 323
226void tracing_start_cmdline_record(void); 324void tracing_start_cmdline_record(void);
227void tracing_stop_cmdline_record(void); 325void tracing_stop_cmdline_record(void);
@@ -239,7 +337,7 @@ void update_max_tr_single(struct trace_array *tr,
239 337
240extern cycle_t ftrace_now(int cpu); 338extern cycle_t ftrace_now(int cpu);
241 339
242#ifdef CONFIG_FTRACE 340#ifdef CONFIG_FUNCTION_TRACER
243void tracing_start_function_trace(void); 341void tracing_start_function_trace(void);
244void tracing_stop_function_trace(void); 342void tracing_stop_function_trace(void);
245#else 343#else
@@ -268,51 +366,33 @@ extern unsigned long ftrace_update_tot_cnt;
268extern int DYN_FTRACE_TEST_NAME(void); 366extern int DYN_FTRACE_TEST_NAME(void);
269#endif 367#endif
270 368
271#ifdef CONFIG_MMIOTRACE
272extern void __trace_mmiotrace_rw(struct trace_array *tr,
273 struct trace_array_cpu *data,
274 struct mmiotrace_rw *rw);
275extern void __trace_mmiotrace_map(struct trace_array *tr,
276 struct trace_array_cpu *data,
277 struct mmiotrace_map *map);
278#endif
279
280#ifdef CONFIG_FTRACE_STARTUP_TEST 369#ifdef CONFIG_FTRACE_STARTUP_TEST
281#ifdef CONFIG_FTRACE
282extern int trace_selftest_startup_function(struct tracer *trace, 370extern int trace_selftest_startup_function(struct tracer *trace,
283 struct trace_array *tr); 371 struct trace_array *tr);
284#endif
285#ifdef CONFIG_IRQSOFF_TRACER
286extern int trace_selftest_startup_irqsoff(struct tracer *trace, 372extern int trace_selftest_startup_irqsoff(struct tracer *trace,
287 struct trace_array *tr); 373 struct trace_array *tr);
288#endif
289#ifdef CONFIG_PREEMPT_TRACER
290extern int trace_selftest_startup_preemptoff(struct tracer *trace, 374extern int trace_selftest_startup_preemptoff(struct tracer *trace,
291 struct trace_array *tr); 375 struct trace_array *tr);
292#endif
293#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
294extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, 376extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
295 struct trace_array *tr); 377 struct trace_array *tr);
296#endif
297#ifdef CONFIG_SCHED_TRACER
298extern int trace_selftest_startup_wakeup(struct tracer *trace, 378extern int trace_selftest_startup_wakeup(struct tracer *trace,
299 struct trace_array *tr); 379 struct trace_array *tr);
300#endif 380extern int trace_selftest_startup_nop(struct tracer *trace,
301#ifdef CONFIG_CONTEXT_SWITCH_TRACER 381 struct trace_array *tr);
302extern int trace_selftest_startup_sched_switch(struct tracer *trace, 382extern int trace_selftest_startup_sched_switch(struct tracer *trace,
303 struct trace_array *tr); 383 struct trace_array *tr);
304#endif
305#ifdef CONFIG_SYSPROF_TRACER
306extern int trace_selftest_startup_sysprof(struct tracer *trace, 384extern int trace_selftest_startup_sysprof(struct tracer *trace,
307 struct trace_array *tr); 385 struct trace_array *tr);
308#endif
309#endif /* CONFIG_FTRACE_STARTUP_TEST */ 386#endif /* CONFIG_FTRACE_STARTUP_TEST */
310 387
311extern void *head_page(struct trace_array_cpu *data); 388extern void *head_page(struct trace_array_cpu *data);
312extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); 389extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
390extern void trace_seq_print_cont(struct trace_seq *s,
391 struct trace_iterator *iter);
313extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, 392extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
314 size_t cnt); 393 size_t cnt);
315extern long ns2usecs(cycle_t nsec); 394extern long ns2usecs(cycle_t nsec);
395extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
316 396
317extern unsigned long trace_flags; 397extern unsigned long trace_flags;
318 398
@@ -334,6 +414,9 @@ enum trace_iterator_flags {
334 TRACE_ITER_BLOCK = 0x80, 414 TRACE_ITER_BLOCK = 0x80,
335 TRACE_ITER_STACKTRACE = 0x100, 415 TRACE_ITER_STACKTRACE = 0x100,
336 TRACE_ITER_SCHED_TREE = 0x200, 416 TRACE_ITER_SCHED_TREE = 0x200,
417 TRACE_ITER_PRINTK = 0x400,
337}; 418};
338 419
420extern struct tracer nop_trace;
421
339#endif /* _LINUX_KERNEL_TRACE_H */ 422#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
new file mode 100644
index 000000000000..d0a5e50eeff2
--- /dev/null
+++ b/kernel/trace/trace_boot.c
@@ -0,0 +1,126 @@
1/*
2 * ring buffer based initcalls tracer
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8#include <linux/init.h>
9#include <linux/debugfs.h>
10#include <linux/ftrace.h>
11#include <linux/kallsyms.h>
12
13#include "trace.h"
14
15static struct trace_array *boot_trace;
16static int trace_boot_enabled;
17
18
19/* Should be started after do_pre_smp_initcalls() in init/main.c */
20void start_boot_trace(void)
21{
22 trace_boot_enabled = 1;
23}
24
25void stop_boot_trace(void)
26{
27 trace_boot_enabled = 0;
28}
29
30void reset_boot_trace(struct trace_array *tr)
31{
32 stop_boot_trace();
33}
34
35static void boot_trace_init(struct trace_array *tr)
36{
37 int cpu;
38 boot_trace = tr;
39
40 trace_boot_enabled = 0;
41
42 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu);
44}
45
46static void boot_trace_ctrl_update(struct trace_array *tr)
47{
48 if (tr->ctrl)
49 start_boot_trace();
50 else
51 stop_boot_trace();
52}
53
54static enum print_line_t initcall_print_line(struct trace_iterator *iter)
55{
56 int ret;
57 struct trace_entry *entry = iter->ent;
58 struct trace_boot *field = (struct trace_boot *)entry;
59 struct boot_trace *it = &field->initcall;
60 struct trace_seq *s = &iter->seq;
61 struct timespec calltime = ktime_to_timespec(it->calltime);
62 struct timespec rettime = ktime_to_timespec(it->rettime);
63
64 if (entry->type == TRACE_BOOT) {
65 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
66 calltime.tv_sec,
67 calltime.tv_nsec,
68 it->func, it->caller);
69 if (!ret)
70 return TRACE_TYPE_PARTIAL_LINE;
71
72 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
73 "returned %d after %lld msecs\n",
74 rettime.tv_sec,
75 rettime.tv_nsec,
76 it->func, it->result, it->duration);
77
78 if (!ret)
79 return TRACE_TYPE_PARTIAL_LINE;
80 return TRACE_TYPE_HANDLED;
81 }
82 return TRACE_TYPE_UNHANDLED;
83}
84
85struct tracer boot_tracer __read_mostly =
86{
87 .name = "initcall",
88 .init = boot_trace_init,
89 .reset = reset_boot_trace,
90 .ctrl_update = boot_trace_ctrl_update,
91 .print_line = initcall_print_line,
92};
93
94void trace_boot(struct boot_trace *it, initcall_t fn)
95{
96 struct ring_buffer_event *event;
97 struct trace_boot *entry;
98 struct trace_array_cpu *data;
99 unsigned long irq_flags;
100 struct trace_array *tr = boot_trace;
101
102 if (!trace_boot_enabled)
103 return;
104
105 /* Get its name now since this function could
106 * disappear because it is in the .init section.
107 */
108 sprint_symbol(it->func, (unsigned long)fn);
109 preempt_disable();
110 data = tr->data[smp_processor_id()];
111
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
113 &irq_flags);
114 if (!event)
115 goto out;
116 entry = ring_buffer_event_data(event);
117 tracing_generic_entry_update(&entry->ent, 0, 0);
118 entry->ent.type = TRACE_BOOT;
119 entry->initcall = *it;
120 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
121
122 trace_wake_up();
123
124 out:
125 preempt_enable();
126}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 312144897970..0f85a64003d3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -23,7 +23,7 @@ static void function_reset(struct trace_array *tr)
23 tr->time_start = ftrace_now(tr->cpu); 23 tr->time_start = ftrace_now(tr->cpu);
24 24
25 for_each_online_cpu(cpu) 25 for_each_online_cpu(cpu)
26 tracing_reset(tr->data[cpu]); 26 tracing_reset(tr, cpu);
27} 27}
28 28
29static void start_function_trace(struct trace_array *tr) 29static void start_function_trace(struct trace_array *tr)
@@ -64,7 +64,7 @@ static void function_trace_ctrl_update(struct trace_array *tr)
64 64
65static struct tracer function_trace __read_mostly = 65static struct tracer function_trace __read_mostly =
66{ 66{
67 .name = "ftrace", 67 .name = "function",
68 .init = function_trace_init, 68 .init = function_trace_init,
69 .reset = function_trace_reset, 69 .reset = function_trace_reset,
70 .ctrl_update = function_trace_ctrl_update, 70 .ctrl_update = function_trace_ctrl_update,
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ece6cfb649fa..9c74071c10e0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -63,7 +63,7 @@ irq_trace(void)
63 */ 63 */
64static __cacheline_aligned_in_smp unsigned long max_sequence; 64static __cacheline_aligned_in_smp unsigned long max_sequence;
65 65
66#ifdef CONFIG_FTRACE 66#ifdef CONFIG_FUNCTION_TRACER
67/* 67/*
68 * irqsoff uses its own tracer function to keep the overhead down: 68 * irqsoff uses its own tracer function to keep the overhead down:
69 */ 69 */
@@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
95 disabled = atomic_inc_return(&data->disabled); 95 disabled = atomic_inc_return(&data->disabled);
96 96
97 if (likely(disabled == 1)) 97 if (likely(disabled == 1))
98 trace_function(tr, data, ip, parent_ip, flags); 98 trace_function(tr, data, ip, parent_ip, flags, preempt_count());
99 99
100 atomic_dec(&data->disabled); 100 atomic_dec(&data->disabled);
101} 101}
@@ -104,7 +104,7 @@ static struct ftrace_ops trace_ops __read_mostly =
104{ 104{
105 .func = irqsoff_tracer_call, 105 .func = irqsoff_tracer_call,
106}; 106};
107#endif /* CONFIG_FTRACE */ 107#endif /* CONFIG_FUNCTION_TRACER */
108 108
109/* 109/*
110 * Should this new latency be reported/recorded? 110 * Should this new latency be reported/recorded?
@@ -130,6 +130,7 @@ check_critical_timing(struct trace_array *tr,
130 unsigned long latency, t0, t1; 130 unsigned long latency, t0, t1;
131 cycle_t T0, T1, delta; 131 cycle_t T0, T1, delta;
132 unsigned long flags; 132 unsigned long flags;
133 int pc;
133 134
134 /* 135 /*
135 * usecs conversion is slow so we try to delay the conversion 136 * usecs conversion is slow so we try to delay the conversion
@@ -141,6 +142,8 @@ check_critical_timing(struct trace_array *tr,
141 142
142 local_save_flags(flags); 143 local_save_flags(flags);
143 144
145 pc = preempt_count();
146
144 if (!report_latency(delta)) 147 if (!report_latency(delta))
145 goto out; 148 goto out;
146 149
@@ -150,7 +153,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 153 if (!report_latency(delta))
151 goto out_unlock; 154 goto out_unlock;
152 155
153 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); 156 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
154 157
155 latency = nsecs_to_usecs(delta); 158 latency = nsecs_to_usecs(delta);
156 159
@@ -173,8 +176,8 @@ out_unlock:
173out: 176out:
174 data->critical_sequence = max_sequence; 177 data->critical_sequence = max_sequence;
175 data->preempt_timestamp = ftrace_now(cpu); 178 data->preempt_timestamp = ftrace_now(cpu);
176 tracing_reset(data); 179 tracing_reset(tr, cpu);
177 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); 180 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
178} 181}
179 182
180static inline void 183static inline void
@@ -203,11 +206,11 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
203 data->critical_sequence = max_sequence; 206 data->critical_sequence = max_sequence;
204 data->preempt_timestamp = ftrace_now(cpu); 207 data->preempt_timestamp = ftrace_now(cpu);
205 data->critical_start = parent_ip ? : ip; 208 data->critical_start = parent_ip ? : ip;
206 tracing_reset(data); 209 tracing_reset(tr, cpu);
207 210
208 local_save_flags(flags); 211 local_save_flags(flags);
209 212
210 trace_function(tr, data, ip, parent_ip, flags); 213 trace_function(tr, data, ip, parent_ip, flags, preempt_count());
211 214
212 per_cpu(tracing_cpu, cpu) = 1; 215 per_cpu(tracing_cpu, cpu) = 1;
213 216
@@ -234,14 +237,14 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
234 237
235 data = tr->data[cpu]; 238 data = tr->data[cpu];
236 239
237 if (unlikely(!data) || unlikely(!head_page(data)) || 240 if (unlikely(!data) ||
238 !data->critical_start || atomic_read(&data->disabled)) 241 !data->critical_start || atomic_read(&data->disabled))
239 return; 242 return;
240 243
241 atomic_inc(&data->disabled); 244 atomic_inc(&data->disabled);
242 245
243 local_save_flags(flags); 246 local_save_flags(flags);
244 trace_function(tr, data, ip, parent_ip, flags); 247 trace_function(tr, data, ip, parent_ip, flags, preempt_count());
245 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 248 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
246 data->critical_start = 0; 249 data->critical_start = 0;
247 atomic_dec(&data->disabled); 250 atomic_dec(&data->disabled);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index b13dc19dcbb4..f28484618ff0 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -27,7 +27,7 @@ static void mmio_reset_data(struct trace_array *tr)
27 tr->time_start = ftrace_now(tr->cpu); 27 tr->time_start = ftrace_now(tr->cpu);
28 28
29 for_each_online_cpu(cpu) 29 for_each_online_cpu(cpu)
30 tracing_reset(tr->data[cpu]); 30 tracing_reset(tr, cpu);
31} 31}
32 32
33static void mmio_trace_init(struct trace_array *tr) 33static void mmio_trace_init(struct trace_array *tr)
@@ -130,10 +130,14 @@ static unsigned long count_overruns(struct trace_iterator *iter)
130{ 130{
131 int cpu; 131 int cpu;
132 unsigned long cnt = 0; 132 unsigned long cnt = 0;
133/* FIXME: */
134#if 0
133 for_each_online_cpu(cpu) { 135 for_each_online_cpu(cpu) {
134 cnt += iter->overrun[cpu]; 136 cnt += iter->overrun[cpu];
135 iter->overrun[cpu] = 0; 137 iter->overrun[cpu] = 0;
136 } 138 }
139#endif
140 (void)cpu;
137 return cnt; 141 return cnt;
138} 142}
139 143
@@ -171,17 +175,21 @@ print_out:
171 return (ret == -EBUSY) ? 0 : ret; 175 return (ret == -EBUSY) ? 0 : ret;
172} 176}
173 177
174static int mmio_print_rw(struct trace_iterator *iter) 178static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
175{ 179{
176 struct trace_entry *entry = iter->ent; 180 struct trace_entry *entry = iter->ent;
177 struct mmiotrace_rw *rw = &entry->mmiorw; 181 struct trace_mmiotrace_rw *field;
182 struct mmiotrace_rw *rw;
178 struct trace_seq *s = &iter->seq; 183 struct trace_seq *s = &iter->seq;
179 unsigned long long t = ns2usecs(entry->t); 184 unsigned long long t = ns2usecs(iter->ts);
180 unsigned long usec_rem = do_div(t, 1000000ULL); 185 unsigned long usec_rem = do_div(t, 1000000ULL);
181 unsigned secs = (unsigned long)t; 186 unsigned secs = (unsigned long)t;
182 int ret = 1; 187 int ret = 1;
183 188
184 switch (entry->mmiorw.opcode) { 189 trace_assign_type(field, entry);
190 rw = &field->rw;
191
192 switch (rw->opcode) {
185 case MMIO_READ: 193 case MMIO_READ:
186 ret = trace_seq_printf(s, 194 ret = trace_seq_printf(s,
187 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 195 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
@@ -209,21 +217,25 @@ static int mmio_print_rw(struct trace_iterator *iter)
209 break; 217 break;
210 } 218 }
211 if (ret) 219 if (ret)
212 return 1; 220 return TRACE_TYPE_HANDLED;
213 return 0; 221 return TRACE_TYPE_PARTIAL_LINE;
214} 222}
215 223
216static int mmio_print_map(struct trace_iterator *iter) 224static enum print_line_t mmio_print_map(struct trace_iterator *iter)
217{ 225{
218 struct trace_entry *entry = iter->ent; 226 struct trace_entry *entry = iter->ent;
219 struct mmiotrace_map *m = &entry->mmiomap; 227 struct trace_mmiotrace_map *field;
228 struct mmiotrace_map *m;
220 struct trace_seq *s = &iter->seq; 229 struct trace_seq *s = &iter->seq;
221 unsigned long long t = ns2usecs(entry->t); 230 unsigned long long t = ns2usecs(iter->ts);
222 unsigned long usec_rem = do_div(t, 1000000ULL); 231 unsigned long usec_rem = do_div(t, 1000000ULL);
223 unsigned secs = (unsigned long)t; 232 unsigned secs = (unsigned long)t;
224 int ret = 1; 233 int ret;
225 234
226 switch (entry->mmiorw.opcode) { 235 trace_assign_type(field, entry);
236 m = &field->map;
237
238 switch (m->opcode) {
227 case MMIO_PROBE: 239 case MMIO_PROBE:
228 ret = trace_seq_printf(s, 240 ret = trace_seq_printf(s,
229 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 241 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
@@ -241,20 +253,43 @@ static int mmio_print_map(struct trace_iterator *iter)
241 break; 253 break;
242 } 254 }
243 if (ret) 255 if (ret)
244 return 1; 256 return TRACE_TYPE_HANDLED;
245 return 0; 257 return TRACE_TYPE_PARTIAL_LINE;
258}
259
/*
 * Format a TRACE_PRINT entry as a "MARK <secs>.<usecs> <message>" line in
 * the iterator's trace_seq.
 *
 * The timestamp iter->ts is converted with ns2usecs() and then split by
 * do_div() into whole seconds (left in t) and the microsecond remainder.
 *
 * Returns TRACE_TYPE_PARTIAL_LINE when trace_seq_printf() reports failure
 * (returns 0), otherwise TRACE_TYPE_HANDLED.
 */
260static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
261{
262 struct trace_entry *entry = iter->ent;
	/* The entry is reinterpreted as a print_entry to reach its text buffer. */
263 struct print_entry *print = (struct print_entry *)entry;
264 const char *msg = print->buf;
265 struct trace_seq *s = &iter->seq;
266 unsigned long long t = ns2usecs(iter->ts);
267 unsigned long usec_rem = do_div(t, 1000000ULL);
	/* NOTE(review): secs is plain unsigned but is printed with %lu below - confirm. */
268 unsigned secs = (unsigned long)t;
269 int ret;
270
271 /* The trailing newline must be in the message. */
272 ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
273 if (!ret)
274 return TRACE_TYPE_PARTIAL_LINE;
275
	/* Entries flagged TRACE_FLAG_CONT have continuation data to append. */
276 if (entry->flags & TRACE_FLAG_CONT)
277 trace_seq_print_cont(s, iter);
278
279 return TRACE_TYPE_HANDLED;
246} 280}
247 281
248/* return 0 to abort printing without consuming current entry in pipe mode */ 282static enum print_line_t mmio_print_line(struct trace_iterator *iter)
249static int mmio_print_line(struct trace_iterator *iter)
250{ 283{
251 switch (iter->ent->type) { 284 switch (iter->ent->type) {
252 case TRACE_MMIO_RW: 285 case TRACE_MMIO_RW:
253 return mmio_print_rw(iter); 286 return mmio_print_rw(iter);
254 case TRACE_MMIO_MAP: 287 case TRACE_MMIO_MAP:
255 return mmio_print_map(iter); 288 return mmio_print_map(iter);
289 case TRACE_PRINT:
290 return mmio_print_mark(iter);
256 default: 291 default:
257 return 1; /* ignore unknown entries */ 292 return TRACE_TYPE_HANDLED; /* ignore unknown entries */
258 } 293 }
259} 294}
260 295
@@ -276,6 +311,27 @@ __init static int init_mmio_trace(void)
276} 311}
277device_initcall(init_mmio_trace); 312device_initcall(init_mmio_trace);
278 313
/*
 * Write one TRACE_MMIO_RW entry holding a copy of @rw into the ring buffer
 * of @tr.
 *
 * @tr:   trace array whose buffer receives the entry.
 * @data: per-cpu trace data; not referenced in this function body.
 * @rw:   MMIO read/write record, copied by value into the entry.
 *
 * The entry is silently dropped when no buffer space can be reserved.
 */
314static void __trace_mmiotrace_rw(struct trace_array *tr,
315 struct trace_array_cpu *data,
316 struct mmiotrace_rw *rw)
317{
318 struct ring_buffer_event *event;
319 struct trace_mmiotrace_rw *entry;
320 unsigned long irq_flags;
321
322 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
323 &irq_flags);
324 if (!event)
325 return;
326 entry = ring_buffer_event_data(event);
	/* Generic header: flags passed as 0, preempt count sampled here. */
327 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
328 entry->ent.type = TRACE_MMIO_RW;
329 entry->rw = *rw;
330 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
331
332 trace_wake_up();
333}
334
279void mmio_trace_rw(struct mmiotrace_rw *rw) 335void mmio_trace_rw(struct mmiotrace_rw *rw)
280{ 336{
281 struct trace_array *tr = mmio_trace_array; 337 struct trace_array *tr = mmio_trace_array;
@@ -283,6 +339,27 @@ void mmio_trace_rw(struct mmiotrace_rw *rw)
283 __trace_mmiotrace_rw(tr, data, rw); 339 __trace_mmiotrace_rw(tr, data, rw);
284} 340}
285 341
/*
 * Write one TRACE_MMIO_MAP entry holding a copy of @map into the ring
 * buffer of @tr.
 *
 * @tr:   trace array whose buffer receives the entry.
 * @data: per-cpu trace data; not referenced in this function body.
 * @map:  MMIO mapping record, copied by value into the entry.
 *
 * The entry is silently dropped when no buffer space can be reserved.
 */
342static void __trace_mmiotrace_map(struct trace_array *tr,
343 struct trace_array_cpu *data,
344 struct mmiotrace_map *map)
345{
346 struct ring_buffer_event *event;
347 struct trace_mmiotrace_map *entry;
348 unsigned long irq_flags;
349
350 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
351 &irq_flags);
352 if (!event)
353 return;
354 entry = ring_buffer_event_data(event);
	/* Generic header: flags passed as 0, preempt count sampled here. */
355 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
356 entry->ent.type = TRACE_MMIO_MAP;
357 entry->map = *map;
358 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
359
360 trace_wake_up();
361}
362
286void mmio_trace_mapping(struct mmiotrace_map *map) 363void mmio_trace_mapping(struct mmiotrace_map *map)
287{ 364{
288 struct trace_array *tr = mmio_trace_array; 365 struct trace_array *tr = mmio_trace_array;
@@ -293,3 +370,8 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
293 __trace_mmiotrace_map(tr, data, map); 370 __trace_mmiotrace_map(tr, data, map);
294 preempt_enable(); 371 preempt_enable();
295} 372}
373
/*
 * Forward a printf-style message to trace_vprintk(), passing 0 as its
 * first argument, and return whatever trace_vprintk() returns.
 */
374int mmio_trace_printk(const char *fmt, va_list args)
375{
376 return trace_vprintk(0, fmt, args);
377}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
new file mode 100644
index 000000000000..4592b4862515
--- /dev/null
+++ b/kernel/trace/trace_nop.c
@@ -0,0 +1,64 @@
1/*
2 * nop tracer
3 *
4 * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12
13#include "trace.h"
14
/* Trace array handed to nop_trace_init(); it is stored there and not read elsewhere in this file. */
15static struct trace_array *ctx_trace;
16
/* Start hook of the nop tracer: intentionally empty, @tr is not touched. */
17static void start_nop_trace(struct trace_array *tr)
18{
19 /* Nothing to do! */
20}
21
/* Stop hook of the nop tracer: intentionally empty, @tr is not touched. */
22static void stop_nop_trace(struct trace_array *tr)
23{
24 /* Nothing to do! */
25}
26
/*
 * Init callback of the nop tracer: remember @tr in ctx_trace, reset the
 * trace data of every online CPU, and call the (empty) start hook when
 * tr->ctrl is set.
 */
27static void nop_trace_init(struct trace_array *tr)
28{
29 int cpu;
30 ctx_trace = tr;
31
32 for_each_online_cpu(cpu)
33 tracing_reset(tr, cpu);
34
35 if (tr->ctrl)
36 start_nop_trace(tr);
37}
38
/* Reset callback of the nop tracer: calls the (empty) stop hook when tr->ctrl is set. */
39static void nop_trace_reset(struct trace_array *tr)
40{
41 if (tr->ctrl)
42 stop_nop_trace(tr);
43}
44
/*
 * ctrl_update callback of the nop tracer: dispatch to the start or stop
 * hook according to tr->ctrl.
 */
45static void nop_trace_ctrl_update(struct trace_array *tr)
46{
47 /* Both hooks are empty, so no buffer is reset on this path. */
48 if (tr->ctrl)
49 start_nop_trace(tr);
50 else
51 stop_nop_trace(tr);
52}
53
/*
 * Tracer descriptor named "nop".  It wires up the init, reset and
 * ctrl_update callbacks defined in this file; the selftest hook is only
 * set when CONFIG_FTRACE_SELFTEST is defined.
 */
54struct tracer nop_trace __read_mostly =
55{
56 .name = "nop",
57 .init = nop_trace_init,
58 .reset = nop_trace_reset,
59 .ctrl_update = nop_trace_ctrl_update,
60#ifdef CONFIG_FTRACE_SELFTEST
61 .selftest = trace_selftest_startup_nop,
62#endif
63};
64
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index cb817a209aa0..b8f56beb1a62 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -9,8 +9,8 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/kallsyms.h> 10#include <linux/kallsyms.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/marker.h>
13#include <linux/ftrace.h> 12#include <linux/ftrace.h>
13#include <trace/sched.h>
14 14
15#include "trace.h" 15#include "trace.h"
16 16
@@ -19,15 +19,16 @@ static int __read_mostly tracer_enabled;
19static atomic_t sched_ref; 19static atomic_t sched_ref;
20 20
21static void 21static void
22sched_switch_func(void *private, void *__rq, struct task_struct *prev, 22probe_sched_switch(struct rq *__rq, struct task_struct *prev,
23 struct task_struct *next) 23 struct task_struct *next)
24{ 24{
25 struct trace_array **ptr = private;
26 struct trace_array *tr = *ptr;
27 struct trace_array_cpu *data; 25 struct trace_array_cpu *data;
28 unsigned long flags; 26 unsigned long flags;
29 long disabled;
30 int cpu; 27 int cpu;
28 int pc;
29
30 if (!atomic_read(&sched_ref))
31 return;
31 32
32 tracing_record_cmdline(prev); 33 tracing_record_cmdline(prev);
33 tracing_record_cmdline(next); 34 tracing_record_cmdline(next);
@@ -35,97 +36,41 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev,
35 if (!tracer_enabled) 36 if (!tracer_enabled)
36 return; 37 return;
37 38
39 pc = preempt_count();
38 local_irq_save(flags); 40 local_irq_save(flags);
39 cpu = raw_smp_processor_id(); 41 cpu = raw_smp_processor_id();
40 data = tr->data[cpu]; 42 data = ctx_trace->data[cpu];
41 disabled = atomic_inc_return(&data->disabled);
42 43
43 if (likely(disabled == 1)) 44 if (likely(!atomic_read(&data->disabled)))
44 tracing_sched_switch_trace(tr, data, prev, next, flags); 45 tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc);
45 46
46 atomic_dec(&data->disabled);
47 local_irq_restore(flags); 47 local_irq_restore(flags);
48} 48}
49 49
50static notrace void
51sched_switch_callback(void *probe_data, void *call_data,
52 const char *format, va_list *args)
53{
54 struct task_struct *prev;
55 struct task_struct *next;
56 struct rq *__rq;
57
58 if (!atomic_read(&sched_ref))
59 return;
60
61 /* skip prev_pid %d next_pid %d prev_state %ld */
62 (void)va_arg(*args, int);
63 (void)va_arg(*args, int);
64 (void)va_arg(*args, long);
65 __rq = va_arg(*args, typeof(__rq));
66 prev = va_arg(*args, typeof(prev));
67 next = va_arg(*args, typeof(next));
68
69 /*
70 * If tracer_switch_func only points to the local
71 * switch func, it still needs the ptr passed to it.
72 */
73 sched_switch_func(probe_data, __rq, prev, next);
74}
75
76static void 50static void
77wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct 51probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee)
78 task_struct *curr)
79{ 52{
80 struct trace_array **ptr = private;
81 struct trace_array *tr = *ptr;
82 struct trace_array_cpu *data; 53 struct trace_array_cpu *data;
83 unsigned long flags; 54 unsigned long flags;
84 long disabled; 55 int cpu, pc;
85 int cpu;
86 56
87 if (!tracer_enabled) 57 if (!likely(tracer_enabled))
88 return; 58 return;
89 59
90 tracing_record_cmdline(curr); 60 pc = preempt_count();
61 tracing_record_cmdline(current);
91 62
92 local_irq_save(flags); 63 local_irq_save(flags);
93 cpu = raw_smp_processor_id(); 64 cpu = raw_smp_processor_id();
94 data = tr->data[cpu]; 65 data = ctx_trace->data[cpu];
95 disabled = atomic_inc_return(&data->disabled);
96 66
97 if (likely(disabled == 1)) 67 if (likely(!atomic_read(&data->disabled)))
98 tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); 68 tracing_sched_wakeup_trace(ctx_trace, data, wakee, current,
69 flags, pc);
99 70
100 atomic_dec(&data->disabled);
101 local_irq_restore(flags); 71 local_irq_restore(flags);
102} 72}
103 73
104static notrace void
105wake_up_callback(void *probe_data, void *call_data,
106 const char *format, va_list *args)
107{
108 struct task_struct *curr;
109 struct task_struct *task;
110 struct rq *__rq;
111
112 if (likely(!tracer_enabled))
113 return;
114
115 /* Skip pid %d state %ld */
116 (void)va_arg(*args, int);
117 (void)va_arg(*args, long);
118 /* now get the meat: "rq %p task %p rq->curr %p" */
119 __rq = va_arg(*args, typeof(__rq));
120 task = va_arg(*args, typeof(task));
121 curr = va_arg(*args, typeof(curr));
122
123 tracing_record_cmdline(task);
124 tracing_record_cmdline(curr);
125
126 wakeup_func(probe_data, __rq, task, curr);
127}
128
129static void sched_switch_reset(struct trace_array *tr) 74static void sched_switch_reset(struct trace_array *tr)
130{ 75{
131 int cpu; 76 int cpu;
@@ -133,67 +78,47 @@ static void sched_switch_reset(struct trace_array *tr)
133 tr->time_start = ftrace_now(tr->cpu); 78 tr->time_start = ftrace_now(tr->cpu);
134 79
135 for_each_online_cpu(cpu) 80 for_each_online_cpu(cpu)
136 tracing_reset(tr->data[cpu]); 81 tracing_reset(tr, cpu);
137} 82}
138 83
139static int tracing_sched_register(void) 84static int tracing_sched_register(void)
140{ 85{
141 int ret; 86 int ret;
142 87
143 ret = marker_probe_register("kernel_sched_wakeup", 88 ret = register_trace_sched_wakeup(probe_sched_wakeup);
144 "pid %d state %ld ## rq %p task %p rq->curr %p",
145 wake_up_callback,
146 &ctx_trace);
147 if (ret) { 89 if (ret) {
148 pr_info("wakeup trace: Couldn't add marker" 90 pr_info("wakeup trace: Couldn't activate tracepoint"
149 " probe to kernel_sched_wakeup\n"); 91 " probe to kernel_sched_wakeup\n");
150 return ret; 92 return ret;
151 } 93 }
152 94
153 ret = marker_probe_register("kernel_sched_wakeup_new", 95 ret = register_trace_sched_wakeup_new(probe_sched_wakeup);
154 "pid %d state %ld ## rq %p task %p rq->curr %p",
155 wake_up_callback,
156 &ctx_trace);
157 if (ret) { 96 if (ret) {
158 pr_info("wakeup trace: Couldn't add marker" 97 pr_info("wakeup trace: Couldn't activate tracepoint"
159 " probe to kernel_sched_wakeup_new\n"); 98 " probe to kernel_sched_wakeup_new\n");
160 goto fail_deprobe; 99 goto fail_deprobe;
161 } 100 }
162 101
163 ret = marker_probe_register("kernel_sched_schedule", 102 ret = register_trace_sched_switch(probe_sched_switch);
164 "prev_pid %d next_pid %d prev_state %ld "
165 "## rq %p prev %p next %p",
166 sched_switch_callback,
167 &ctx_trace);
168 if (ret) { 103 if (ret) {
169 pr_info("sched trace: Couldn't add marker" 104 pr_info("sched trace: Couldn't activate tracepoint"
170 " probe to kernel_sched_schedule\n"); 105 " probe to kernel_sched_schedule\n");
171 goto fail_deprobe_wake_new; 106 goto fail_deprobe_wake_new;
172 } 107 }
173 108
174 return ret; 109 return ret;
175fail_deprobe_wake_new: 110fail_deprobe_wake_new:
176 marker_probe_unregister("kernel_sched_wakeup_new", 111 unregister_trace_sched_wakeup_new(probe_sched_wakeup);
177 wake_up_callback,
178 &ctx_trace);
179fail_deprobe: 112fail_deprobe:
180 marker_probe_unregister("kernel_sched_wakeup", 113 unregister_trace_sched_wakeup(probe_sched_wakeup);
181 wake_up_callback,
182 &ctx_trace);
183 return ret; 114 return ret;
184} 115}
185 116
186static void tracing_sched_unregister(void) 117static void tracing_sched_unregister(void)
187{ 118{
188 marker_probe_unregister("kernel_sched_schedule", 119 unregister_trace_sched_switch(probe_sched_switch);
189 sched_switch_callback, 120 unregister_trace_sched_wakeup_new(probe_sched_wakeup);
190 &ctx_trace); 121 unregister_trace_sched_wakeup(probe_sched_wakeup);
191 marker_probe_unregister("kernel_sched_wakeup_new",
192 wake_up_callback,
193 &ctx_trace);
194 marker_probe_unregister("kernel_sched_wakeup",
195 wake_up_callback,
196 &ctx_trace);
197} 122}
198 123
199static void tracing_start_sched_switch(void) 124static void tracing_start_sched_switch(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e303ccb62cdf..3ae93f16b565 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/marker.h> 18#include <trace/sched.h>
19 19
20#include "trace.h" 20#include "trace.h"
21 21
@@ -31,7 +31,7 @@ static raw_spinlock_t wakeup_lock =
31 31
32static void __wakeup_reset(struct trace_array *tr); 32static void __wakeup_reset(struct trace_array *tr);
33 33
34#ifdef CONFIG_FTRACE 34#ifdef CONFIG_FUNCTION_TRACER
35/* 35/*
36 * irqsoff uses its own tracer function to keep the overhead down: 36 * irqsoff uses its own tracer function to keep the overhead down:
37 */ 37 */
@@ -44,10 +44,12 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
44 long disabled; 44 long disabled;
45 int resched; 45 int resched;
46 int cpu; 46 int cpu;
47 int pc;
47 48
48 if (likely(!wakeup_task)) 49 if (likely(!wakeup_task))
49 return; 50 return;
50 51
52 pc = preempt_count();
51 resched = need_resched(); 53 resched = need_resched();
52 preempt_disable_notrace(); 54 preempt_disable_notrace();
53 55
@@ -70,7 +72,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
70 if (task_cpu(wakeup_task) != cpu) 72 if (task_cpu(wakeup_task) != cpu)
71 goto unlock; 73 goto unlock;
72 74
73 trace_function(tr, data, ip, parent_ip, flags); 75 trace_function(tr, data, ip, parent_ip, flags, pc);
74 76
75 unlock: 77 unlock:
76 __raw_spin_unlock(&wakeup_lock); 78 __raw_spin_unlock(&wakeup_lock);
@@ -94,7 +96,7 @@ static struct ftrace_ops trace_ops __read_mostly =
94{ 96{
95 .func = wakeup_tracer_call, 97 .func = wakeup_tracer_call,
96}; 98};
97#endif /* CONFIG_FTRACE */ 99#endif /* CONFIG_FUNCTION_TRACER */
98 100
99/* 101/*
100 * Should this new latency be reported/recorded? 102 * Should this new latency be reported/recorded?
@@ -112,17 +114,18 @@ static int report_latency(cycle_t delta)
112} 114}
113 115
114static void notrace 116static void notrace
115wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, 117probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
116 struct task_struct *next) 118 struct task_struct *next)
117{ 119{
118 unsigned long latency = 0, t0 = 0, t1 = 0; 120 unsigned long latency = 0, t0 = 0, t1 = 0;
119 struct trace_array **ptr = private;
120 struct trace_array *tr = *ptr;
121 struct trace_array_cpu *data; 121 struct trace_array_cpu *data;
122 cycle_t T0, T1, delta; 122 cycle_t T0, T1, delta;
123 unsigned long flags; 123 unsigned long flags;
124 long disabled; 124 long disabled;
125 int cpu; 125 int cpu;
126 int pc;
127
128 tracing_record_cmdline(prev);
126 129
127 if (unlikely(!tracer_enabled)) 130 if (unlikely(!tracer_enabled))
128 return; 131 return;
@@ -139,12 +142,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
139 if (next != wakeup_task) 142 if (next != wakeup_task)
140 return; 143 return;
141 144
145 pc = preempt_count();
146
142 /* The task we are waiting for is waking up */ 147 /* The task we are waiting for is waking up */
143 data = tr->data[wakeup_cpu]; 148 data = wakeup_trace->data[wakeup_cpu];
144 149
145 /* disable local data, not wakeup_cpu data */ 150 /* disable local data, not wakeup_cpu data */
146 cpu = raw_smp_processor_id(); 151 cpu = raw_smp_processor_id();
147 disabled = atomic_inc_return(&tr->data[cpu]->disabled); 152 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
148 if (likely(disabled != 1)) 153 if (likely(disabled != 1))
149 goto out; 154 goto out;
150 155
@@ -155,7 +160,7 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
155 if (unlikely(!tracer_enabled || next != wakeup_task)) 160 if (unlikely(!tracer_enabled || next != wakeup_task))
156 goto out_unlock; 161 goto out_unlock;
157 162
158 trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); 163 trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
159 164
160 /* 165 /*
161 * usecs conversion is slow so we try to delay the conversion 166 * usecs conversion is slow so we try to delay the conversion
@@ -174,39 +179,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
174 t0 = nsecs_to_usecs(T0); 179 t0 = nsecs_to_usecs(T0);
175 t1 = nsecs_to_usecs(T1); 180 t1 = nsecs_to_usecs(T1);
176 181
177 update_max_tr(tr, wakeup_task, wakeup_cpu); 182 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 183
179out_unlock: 184out_unlock:
180 __wakeup_reset(tr); 185 __wakeup_reset(wakeup_trace);
181 __raw_spin_unlock(&wakeup_lock); 186 __raw_spin_unlock(&wakeup_lock);
182 local_irq_restore(flags); 187 local_irq_restore(flags);
183out: 188out:
184 atomic_dec(&tr->data[cpu]->disabled); 189 atomic_dec(&wakeup_trace->data[cpu]->disabled);
185}
186
187static notrace void
188sched_switch_callback(void *probe_data, void *call_data,
189 const char *format, va_list *args)
190{
191 struct task_struct *prev;
192 struct task_struct *next;
193 struct rq *__rq;
194
195 /* skip prev_pid %d next_pid %d prev_state %ld */
196 (void)va_arg(*args, int);
197 (void)va_arg(*args, int);
198 (void)va_arg(*args, long);
199 __rq = va_arg(*args, typeof(__rq));
200 prev = va_arg(*args, typeof(prev));
201 next = va_arg(*args, typeof(next));
202
203 tracing_record_cmdline(prev);
204
205 /*
206 * If tracer_switch_func only points to the local
207 * switch func, it still needs the ptr passed to it.
208 */
209 wakeup_sched_switch(probe_data, __rq, prev, next);
210} 190}
211 191
212static void __wakeup_reset(struct trace_array *tr) 192static void __wakeup_reset(struct trace_array *tr)
@@ -216,7 +196,7 @@ static void __wakeup_reset(struct trace_array *tr)
216 196
217 for_each_possible_cpu(cpu) { 197 for_each_possible_cpu(cpu) {
218 data = tr->data[cpu]; 198 data = tr->data[cpu];
219 tracing_reset(data); 199 tracing_reset(tr, cpu);
220 } 200 }
221 201
222 wakeup_cpu = -1; 202 wakeup_cpu = -1;
@@ -240,19 +220,26 @@ static void wakeup_reset(struct trace_array *tr)
240} 220}
241 221
242static void 222static void
243wakeup_check_start(struct trace_array *tr, struct task_struct *p, 223probe_wakeup(struct rq *rq, struct task_struct *p)
244 struct task_struct *curr)
245{ 224{
246 int cpu = smp_processor_id(); 225 int cpu = smp_processor_id();
247 unsigned long flags; 226 unsigned long flags;
248 long disabled; 227 long disabled;
228 int pc;
229
230 if (likely(!tracer_enabled))
231 return;
232
233 tracing_record_cmdline(p);
234 tracing_record_cmdline(current);
249 235
250 if (likely(!rt_task(p)) || 236 if (likely(!rt_task(p)) ||
251 p->prio >= wakeup_prio || 237 p->prio >= wakeup_prio ||
252 p->prio >= curr->prio) 238 p->prio >= current->prio)
253 return; 239 return;
254 240
255 disabled = atomic_inc_return(&tr->data[cpu]->disabled); 241 pc = preempt_count();
242 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
256 if (unlikely(disabled != 1)) 243 if (unlikely(disabled != 1))
257 goto out; 244 goto out;
258 245
@@ -264,7 +251,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
264 goto out_locked; 251 goto out_locked;
265 252
266 /* reset the trace */ 253 /* reset the trace */
267 __wakeup_reset(tr); 254 __wakeup_reset(wakeup_trace);
268 255
269 wakeup_cpu = task_cpu(p); 256 wakeup_cpu = task_cpu(p);
270 wakeup_prio = p->prio; 257 wakeup_prio = p->prio;
@@ -274,74 +261,37 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
274 261
275 local_save_flags(flags); 262 local_save_flags(flags);
276 263
277 tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); 264 wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
278 trace_function(tr, tr->data[wakeup_cpu], 265 trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu],
279 CALLER_ADDR1, CALLER_ADDR2, flags); 266 CALLER_ADDR1, CALLER_ADDR2, flags, pc);
280 267
281out_locked: 268out_locked:
282 __raw_spin_unlock(&wakeup_lock); 269 __raw_spin_unlock(&wakeup_lock);
283out: 270out:
284 atomic_dec(&tr->data[cpu]->disabled); 271 atomic_dec(&wakeup_trace->data[cpu]->disabled);
285}
286
287static notrace void
288wake_up_callback(void *probe_data, void *call_data,
289 const char *format, va_list *args)
290{
291 struct trace_array **ptr = probe_data;
292 struct trace_array *tr = *ptr;
293 struct task_struct *curr;
294 struct task_struct *task;
295 struct rq *__rq;
296
297 if (likely(!tracer_enabled))
298 return;
299
300 /* Skip pid %d state %ld */
301 (void)va_arg(*args, int);
302 (void)va_arg(*args, long);
303 /* now get the meat: "rq %p task %p rq->curr %p" */
304 __rq = va_arg(*args, typeof(__rq));
305 task = va_arg(*args, typeof(task));
306 curr = va_arg(*args, typeof(curr));
307
308 tracing_record_cmdline(task);
309 tracing_record_cmdline(curr);
310
311 wakeup_check_start(tr, task, curr);
312} 272}
313 273
314static void start_wakeup_tracer(struct trace_array *tr) 274static void start_wakeup_tracer(struct trace_array *tr)
315{ 275{
316 int ret; 276 int ret;
317 277
318 ret = marker_probe_register("kernel_sched_wakeup", 278 ret = register_trace_sched_wakeup(probe_wakeup);
319 "pid %d state %ld ## rq %p task %p rq->curr %p",
320 wake_up_callback,
321 &wakeup_trace);
322 if (ret) { 279 if (ret) {
323 pr_info("wakeup trace: Couldn't add marker" 280 pr_info("wakeup trace: Couldn't activate tracepoint"
324 " probe to kernel_sched_wakeup\n"); 281 " probe to kernel_sched_wakeup\n");
325 return; 282 return;
326 } 283 }
327 284
328 ret = marker_probe_register("kernel_sched_wakeup_new", 285 ret = register_trace_sched_wakeup_new(probe_wakeup);
329 "pid %d state %ld ## rq %p task %p rq->curr %p",
330 wake_up_callback,
331 &wakeup_trace);
332 if (ret) { 286 if (ret) {
333 pr_info("wakeup trace: Couldn't add marker" 287 pr_info("wakeup trace: Couldn't activate tracepoint"
334 " probe to kernel_sched_wakeup_new\n"); 288 " probe to kernel_sched_wakeup_new\n");
335 goto fail_deprobe; 289 goto fail_deprobe;
336 } 290 }
337 291
338 ret = marker_probe_register("kernel_sched_schedule", 292 ret = register_trace_sched_switch(probe_wakeup_sched_switch);
339 "prev_pid %d next_pid %d prev_state %ld "
340 "## rq %p prev %p next %p",
341 sched_switch_callback,
342 &wakeup_trace);
343 if (ret) { 293 if (ret) {
344 pr_info("sched trace: Couldn't add marker" 294 pr_info("sched trace: Couldn't activate tracepoint"
345 " probe to kernel_sched_schedule\n"); 295 " probe to kernel_sched_schedule\n");
346 goto fail_deprobe_wake_new; 296 goto fail_deprobe_wake_new;
347 } 297 }
@@ -363,28 +313,18 @@ static void start_wakeup_tracer(struct trace_array *tr)
363 313
364 return; 314 return;
365fail_deprobe_wake_new: 315fail_deprobe_wake_new:
366 marker_probe_unregister("kernel_sched_wakeup_new", 316 unregister_trace_sched_wakeup_new(probe_wakeup);
367 wake_up_callback,
368 &wakeup_trace);
369fail_deprobe: 317fail_deprobe:
370 marker_probe_unregister("kernel_sched_wakeup", 318 unregister_trace_sched_wakeup(probe_wakeup);
371 wake_up_callback,
372 &wakeup_trace);
373} 319}
374 320
375static void stop_wakeup_tracer(struct trace_array *tr) 321static void stop_wakeup_tracer(struct trace_array *tr)
376{ 322{
377 tracer_enabled = 0; 323 tracer_enabled = 0;
378 unregister_ftrace_function(&trace_ops); 324 unregister_ftrace_function(&trace_ops);
379 marker_probe_unregister("kernel_sched_schedule", 325 unregister_trace_sched_switch(probe_wakeup_sched_switch);
380 sched_switch_callback, 326 unregister_trace_sched_wakeup_new(probe_wakeup);
381 &wakeup_trace); 327 unregister_trace_sched_wakeup(probe_wakeup);
382 marker_probe_unregister("kernel_sched_wakeup_new",
383 wake_up_callback,
384 &wakeup_trace);
385 marker_probe_unregister("kernel_sched_wakeup",
386 wake_up_callback,
387 &wakeup_trace);
388} 328}
389 329
390static void wakeup_tracer_init(struct trace_array *tr) 330static void wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 0911b7e073bf..90bc752a7580 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -9,65 +9,29 @@ static inline int trace_valid_entry(struct trace_entry *entry)
9 case TRACE_FN: 9 case TRACE_FN:
10 case TRACE_CTX: 10 case TRACE_CTX:
11 case TRACE_WAKE: 11 case TRACE_WAKE:
12 case TRACE_CONT:
12 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT:
13 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
14 return 1; 16 return 1;
15 } 17 }
16 return 0; 18 return 0;
17} 19}
18 20
19static int 21static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
20trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
21{ 22{
22 struct trace_entry *entries; 23 struct ring_buffer_event *event;
23 struct page *page; 24 struct trace_entry *entry;
24 int idx = 0;
25 int i;
26 25
27 BUG_ON(list_empty(&data->trace_pages)); 26 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
28 page = list_entry(data->trace_pages.next, struct page, lru); 27 entry = ring_buffer_event_data(event);
29 entries = page_address(page);
30 28
31 check_pages(data); 29 if (!trace_valid_entry(entry)) {
32 if (head_page(data) != entries)
33 goto failed;
34
35 /*
36 * The starting trace buffer always has valid elements,
37 * if any element exists.
38 */
39 entries = head_page(data);
40
41 for (i = 0; i < tr->entries; i++) {
42
43 if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
44 printk(KERN_CONT ".. invalid entry %d ", 30 printk(KERN_CONT ".. invalid entry %d ",
45 entries[idx].type); 31 entry->type);
46 goto failed; 32 goto failed;
47 } 33 }
48
49 idx++;
50 if (idx >= ENTRIES_PER_PAGE) {
51 page = virt_to_page(entries);
52 if (page->lru.next == &data->trace_pages) {
53 if (i != tr->entries - 1) {
54 printk(KERN_CONT ".. entries buffer mismatch");
55 goto failed;
56 }
57 } else {
58 page = list_entry(page->lru.next, struct page, lru);
59 entries = page_address(page);
60 }
61 idx = 0;
62 }
63 }
64
65 page = virt_to_page(entries);
66 if (page->lru.next != &data->trace_pages) {
67 printk(KERN_CONT ".. too many entries");
68 goto failed;
69 } 34 }
70
71 return 0; 35 return 0;
72 36
73 failed: 37 failed:
@@ -89,13 +53,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
89 /* Don't allow flipping of max traces now */ 53 /* Don't allow flipping of max traces now */
90 raw_local_irq_save(flags); 54 raw_local_irq_save(flags);
91 __raw_spin_lock(&ftrace_max_lock); 55 __raw_spin_lock(&ftrace_max_lock);
92 for_each_possible_cpu(cpu) {
93 if (!head_page(tr->data[cpu]))
94 continue;
95 56
96 cnt += tr->data[cpu]->trace_idx; 57 cnt = ring_buffer_entries(tr->buffer);
97 58
98 ret = trace_test_buffer_cpu(tr, tr->data[cpu]); 59 for_each_possible_cpu(cpu) {
60 ret = trace_test_buffer_cpu(tr, cpu);
99 if (ret) 61 if (ret)
100 break; 62 break;
101 } 63 }
@@ -108,7 +70,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
108 return ret; 70 return ret;
109} 71}
110 72
111#ifdef CONFIG_FTRACE 73#ifdef CONFIG_FUNCTION_TRACER
112 74
113#ifdef CONFIG_DYNAMIC_FTRACE 75#ifdef CONFIG_DYNAMIC_FTRACE
114 76
@@ -120,11 +82,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
120 struct trace_array *tr, 82 struct trace_array *tr,
121 int (*func)(void)) 83 int (*func)(void))
122{ 84{
123 unsigned long count;
124 int ret;
125 int save_ftrace_enabled = ftrace_enabled; 85 int save_ftrace_enabled = ftrace_enabled;
126 int save_tracer_enabled = tracer_enabled; 86 int save_tracer_enabled = tracer_enabled;
87 unsigned long count;
127 char *func_name; 88 char *func_name;
89 int ret;
128 90
129 /* The ftrace test PASSED */ 91 /* The ftrace test PASSED */
130 printk(KERN_CONT "PASSED\n"); 92 printk(KERN_CONT "PASSED\n");
@@ -137,13 +99,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
137 /* passed in by parameter to fool gcc from optimizing */ 99 /* passed in by parameter to fool gcc from optimizing */
138 func(); 100 func();
139 101
140 /* update the records */
141 ret = ftrace_force_update();
142 if (ret) {
143 printk(KERN_CONT ".. ftraced failed .. ");
144 return ret;
145 }
146
147 /* 102 /*
148 * Some archs *cough*PowerPC*cough* add characters to the 103 * Some archs *cough*PowerPC*cough* add characters to the
149 * start of the function names. We simply put a '*' to 104 * start of the function names. We simply put a '*' to
@@ -157,6 +112,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
157 /* enable tracing */ 112 /* enable tracing */
158 tr->ctrl = 1; 113 tr->ctrl = 1;
159 trace->init(tr); 114 trace->init(tr);
115
160 /* Sleep for a 1/10 of a second */ 116 /* Sleep for a 1/10 of a second */
161 msleep(100); 117 msleep(100);
162 118
@@ -212,21 +168,14 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
212int 168int
213trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 169trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
214{ 170{
215 unsigned long count;
216 int ret;
217 int save_ftrace_enabled = ftrace_enabled; 171 int save_ftrace_enabled = ftrace_enabled;
218 int save_tracer_enabled = tracer_enabled; 172 int save_tracer_enabled = tracer_enabled;
173 unsigned long count;
174 int ret;
219 175
220 /* make sure msleep has been recorded */ 176 /* make sure msleep has been recorded */
221 msleep(1); 177 msleep(1);
222 178
223 /* force the recorded functions to be traced */
224 ret = ftrace_force_update();
225 if (ret) {
226 printk(KERN_CONT ".. ftraced failed .. ");
227 return ret;
228 }
229
230 /* start the tracing */ 179 /* start the tracing */
231 ftrace_enabled = 1; 180 ftrace_enabled = 1;
232 tracer_enabled = 1; 181 tracer_enabled = 1;
@@ -263,7 +212,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
263 212
264 return ret; 213 return ret;
265} 214}
266#endif /* CONFIG_FTRACE */ 215#endif /* CONFIG_FUNCTION_TRACER */
267 216
268#ifdef CONFIG_IRQSOFF_TRACER 217#ifdef CONFIG_IRQSOFF_TRACER
269int 218int
@@ -415,6 +364,15 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
415} 364}
416#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ 365#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
417 366
367#ifdef CONFIG_NOP_TRACER
/*
 * Startup selftest hook for the nop tracer.  Nothing is checked here;
 * the test unconditionally reports success.
 */
int
trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
{
	return 0;
}
374#endif
375
418#ifdef CONFIG_SCHED_TRACER 376#ifdef CONFIG_SCHED_TRACER
419static int trace_wakeup_test_thread(void *data) 377static int trace_wakeup_test_thread(void *data)
420{ 378{
@@ -486,6 +444,9 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
486 444
487 wake_up_process(p); 445 wake_up_process(p);
488 446
447 /* give a little time to let the thread wake up */
448 msleep(100);
449
489 /* stop the tracing. */ 450 /* stop the tracing. */
490 tr->ctrl = 0; 451 tr->ctrl = 0;
491 trace->ctrl_update(tr); 452 trace->ctrl_update(tr);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
new file mode 100644
index 000000000000..be682b62fe58
--- /dev/null
+++ b/kernel/trace/trace_stack.c
@@ -0,0 +1,314 @@
1/*
2 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
3 *
4 */
5#include <linux/stacktrace.h>
6#include <linux/kallsyms.h>
7#include <linux/seq_file.h>
8#include <linux/spinlock.h>
9#include <linux/uaccess.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/fs.h>
15#include "trace.h"
16
17#define STACK_TRACE_ENTRIES 500
18
19static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
20 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
21static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
22
23static struct stack_trace max_stack_trace = {
24 .max_entries = STACK_TRACE_ENTRIES,
25 .entries = stack_dump_trace,
26};
27
28static unsigned long max_stack_size;
29static raw_spinlock_t max_stack_lock =
30 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
31
32static int stack_trace_disabled __read_mostly;
33static DEFINE_PER_CPU(int, trace_active);
34
35static inline void check_stack(void)
36{
37 unsigned long this_size, flags;
38 unsigned long *p, *top, *start;
39 int i;
40
41 this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
42 this_size = THREAD_SIZE - this_size;
43
44 if (this_size <= max_stack_size)
45 return;
46
47 /* we do not handle interrupt stacks yet */
48 if (!object_is_on_stack(&this_size))
49 return;
50
51 raw_local_irq_save(flags);
52 __raw_spin_lock(&max_stack_lock);
53
54 /* a race could have already updated it */
55 if (this_size <= max_stack_size)
56 goto out;
57
58 max_stack_size = this_size;
59
60 max_stack_trace.nr_entries = 0;
61 max_stack_trace.skip = 3;
62
63 save_stack_trace(&max_stack_trace);
64
65 /*
66 * Now find where in the stack these are.
67 */
68 i = 0;
69 start = &this_size;
70 top = (unsigned long *)
71 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
72
73 /*
74 * Loop through all the entries. One of the entries may
75 * for some reason be missed on the stack, so we may
76 * have to account for them. If they are all there, this
77 * loop will only happen once. This code only takes place
78 * on a new max, so it is far from a fast path.
79 */
80 while (i < max_stack_trace.nr_entries) {
81
82 stack_dump_index[i] = this_size;
83 p = start;
84
85 for (; p < top && i < max_stack_trace.nr_entries; p++) {
86 if (*p == stack_dump_trace[i]) {
87 this_size = stack_dump_index[i++] =
88 (top - p) * sizeof(unsigned long);
89 /* Start the search from here */
90 start = p + 1;
91 }
92 }
93
94 i++;
95 }
96
97 out:
98 __raw_spin_unlock(&max_stack_lock);
99 raw_local_irq_restore(flags);
100}
101
102static void
103stack_trace_call(unsigned long ip, unsigned long parent_ip)
104{
105 int cpu, resched;
106
107 if (unlikely(!ftrace_enabled || stack_trace_disabled))
108 return;
109
110 resched = need_resched();
111 preempt_disable_notrace();
112
113 cpu = raw_smp_processor_id();
114 /* no atomic needed, we only modify this variable by this cpu */
115 if (per_cpu(trace_active, cpu)++ != 0)
116 goto out;
117
118 check_stack();
119
120 out:
121 per_cpu(trace_active, cpu)--;
122 /* prevent recursion in schedule */
123 if (resched)
124 preempt_enable_no_resched_notrace();
125 else
126 preempt_enable_notrace();
127}
128
129static struct ftrace_ops trace_ops __read_mostly =
130{
131 .func = stack_trace_call,
132};
133
134static ssize_t
135stack_max_size_read(struct file *filp, char __user *ubuf,
136 size_t count, loff_t *ppos)
137{
138 unsigned long *ptr = filp->private_data;
139 char buf[64];
140 int r;
141
142 r = snprintf(buf, sizeof(buf), "%ld\n", *ptr);
143 if (r > sizeof(buf))
144 r = sizeof(buf);
145 return simple_read_from_buffer(ubuf, count, ppos, buf, r);
146}
147
148static ssize_t
149stack_max_size_write(struct file *filp, const char __user *ubuf,
150 size_t count, loff_t *ppos)
151{
152 long *ptr = filp->private_data;
153 unsigned long val, flags;
154 char buf[64];
155 int ret;
156
157 if (count >= sizeof(buf))
158 return -EINVAL;
159
160 if (copy_from_user(&buf, ubuf, count))
161 return -EFAULT;
162
163 buf[count] = 0;
164
165 ret = strict_strtoul(buf, 10, &val);
166 if (ret < 0)
167 return ret;
168
169 raw_local_irq_save(flags);
170 __raw_spin_lock(&max_stack_lock);
171 *ptr = val;
172 __raw_spin_unlock(&max_stack_lock);
173 raw_local_irq_restore(flags);
174
175 return count;
176}
177
178static struct file_operations stack_max_size_fops = {
179 .open = tracing_open_generic,
180 .read = stack_max_size_read,
181 .write = stack_max_size_write,
182};
183
184static void *
185t_next(struct seq_file *m, void *v, loff_t *pos)
186{
187 long i = (long)m->private;
188
189 (*pos)++;
190
191 i++;
192
193 if (i >= max_stack_trace.nr_entries ||
194 stack_dump_trace[i] == ULONG_MAX)
195 return NULL;
196
197 m->private = (void *)i;
198
199 return &m->private;
200}
201
202static void *t_start(struct seq_file *m, loff_t *pos)
203{
204 void *t = &m->private;
205 loff_t l = 0;
206
207 local_irq_disable();
208 __raw_spin_lock(&max_stack_lock);
209
210 for (; t && l < *pos; t = t_next(m, t, &l))
211 ;
212
213 return t;
214}
215
216static void t_stop(struct seq_file *m, void *p)
217{
218 __raw_spin_unlock(&max_stack_lock);
219 local_irq_enable();
220}
221
222static int trace_lookup_stack(struct seq_file *m, long i)
223{
224 unsigned long addr = stack_dump_trace[i];
225#ifdef CONFIG_KALLSYMS
226 char str[KSYM_SYMBOL_LEN];
227
228 sprint_symbol(str, addr);
229
230 return seq_printf(m, "%s\n", str);
231#else
232 return seq_printf(m, "%p\n", (void*)addr);
233#endif
234}
235
236static int t_show(struct seq_file *m, void *v)
237{
238 long i = *(long *)v;
239 int size;
240
241 if (i < 0) {
242 seq_printf(m, " Depth Size Location"
243 " (%d entries)\n"
244 " ----- ---- --------\n",
245 max_stack_trace.nr_entries);
246 return 0;
247 }
248
249 if (i >= max_stack_trace.nr_entries ||
250 stack_dump_trace[i] == ULONG_MAX)
251 return 0;
252
253 if (i+1 == max_stack_trace.nr_entries ||
254 stack_dump_trace[i+1] == ULONG_MAX)
255 size = stack_dump_index[i];
256 else
257 size = stack_dump_index[i] - stack_dump_index[i+1];
258
259 seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size);
260
261 trace_lookup_stack(m, i);
262
263 return 0;
264}
265
266static struct seq_operations stack_trace_seq_ops = {
267 .start = t_start,
268 .next = t_next,
269 .stop = t_stop,
270 .show = t_show,
271};
272
273static int stack_trace_open(struct inode *inode, struct file *file)
274{
275 int ret;
276
277 ret = seq_open(file, &stack_trace_seq_ops);
278 if (!ret) {
279 struct seq_file *m = file->private_data;
280 m->private = (void *)-1;
281 }
282
283 return ret;
284}
285
286static struct file_operations stack_trace_fops = {
287 .open = stack_trace_open,
288 .read = seq_read,
289 .llseek = seq_lseek,
290};
291
292static __init int stack_trace_init(void)
293{
294 struct dentry *d_tracer;
295 struct dentry *entry;
296
297 d_tracer = tracing_init_dentry();
298
299 entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
300 &max_stack_size, &stack_max_size_fops);
301 if (!entry)
302 pr_warning("Could not create debugfs 'stack_max_size' entry\n");
303
304 entry = debugfs_create_file("stack_trace", 0444, d_tracer,
305 NULL, &stack_trace_fops);
306 if (!entry)
307 pr_warning("Could not create debugfs 'stack_trace' entry\n");
308
309 register_ftrace_function(&trace_ops);
310
311 return 0;
312}
313
314device_initcall(stack_trace_init);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index bb948e52ce20..9587d3bcba55 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 208}
@@ -241,7 +241,7 @@ static void stack_reset(struct trace_array *tr)
241 tr->time_start = ftrace_now(tr->cpu); 241 tr->time_start = ftrace_now(tr->cpu);
242 242
243 for_each_online_cpu(cpu) 243 for_each_online_cpu(cpu)
244 tracing_reset(tr->data[cpu]); 244 tracing_reset(tr, cpu);
245} 245}
246 246
247static void start_stack_trace(struct trace_array *tr) 247static void start_stack_trace(struct trace_array *tr)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
new file mode 100644
index 000000000000..af8c85664882
--- /dev/null
+++ b/kernel/tracepoint.c
@@ -0,0 +1,485 @@
1/*
2 * Copyright (C) 2008 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/tracepoint.h>
25#include <linux/err.h>
26#include <linux/slab.h>
27
28extern struct tracepoint __start___tracepoints[];
29extern struct tracepoint __stop___tracepoints[];
30
31/* Set to 1 to enable tracepoint debug output */
32static const int tracepoint_debug;
33
34/*
35 * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
36 * builtin and module tracepoints and the hash table.
37 */
38static DEFINE_MUTEX(tracepoints_mutex);
39
40/*
41 * Tracepoint hash table, containing the active tracepoints.
42 * Protected by tracepoints_mutex.
43 */
44#define TRACEPOINT_HASH_BITS 6
45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
46
47/*
48 * Note about RCU :
49 * It is used to to delay the free of multiple probes array until a quiescent
50 * state is reached.
51 * Tracepoint entries modifications are protected by the tracepoints_mutex.
52 */
53struct tracepoint_entry {
54 struct hlist_node hlist;
55 void **funcs;
56 int refcount; /* Number of times armed. 0 if disarmed. */
57 struct rcu_head rcu;
58 void *oldptr;
59 unsigned char rcu_pending:1;
60 char name[0];
61};
62
63static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
64
65static void free_old_closure(struct rcu_head *head)
66{
67 struct tracepoint_entry *entry = container_of(head,
68 struct tracepoint_entry, rcu);
69 kfree(entry->oldptr);
70 /* Make sure we free the data before setting the pending flag to 0 */
71 smp_wmb();
72 entry->rcu_pending = 0;
73}
74
75static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old)
76{
77 if (!old)
78 return;
79 entry->oldptr = old;
80 entry->rcu_pending = 1;
81 /* write rcu_pending before calling the RCU callback */
82 smp_wmb();
83 call_rcu_sched(&entry->rcu, free_old_closure);
84}
85
86static void debug_print_probes(struct tracepoint_entry *entry)
87{
88 int i;
89
90 if (!tracepoint_debug)
91 return;
92
93 for (i = 0; entry->funcs[i]; i++)
94 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]);
95}
96
97static void *
98tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
99{
100 int nr_probes = 0;
101 void **old, **new;
102
103 WARN_ON(!probe);
104
105 debug_print_probes(entry);
106 old = entry->funcs;
107 if (old) {
108 /* (N -> N+1), (N != 0, 1) probes */
109 for (nr_probes = 0; old[nr_probes]; nr_probes++)
110 if (old[nr_probes] == probe)
111 return ERR_PTR(-EEXIST);
112 }
113 /* + 2 : one for new probe, one for NULL func */
114 new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL);
115 if (new == NULL)
116 return ERR_PTR(-ENOMEM);
117 if (old)
118 memcpy(new, old, nr_probes * sizeof(void *));
119 new[nr_probes] = probe;
120 entry->refcount = nr_probes + 1;
121 entry->funcs = new;
122 debug_print_probes(entry);
123 return old;
124}
125
126static void *
127tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
128{
129 int nr_probes = 0, nr_del = 0, i;
130 void **old, **new;
131
132 old = entry->funcs;
133
134 if (!old)
135 return NULL;
136
137 debug_print_probes(entry);
138 /* (N -> M), (N > 1, M >= 0) probes */
139 for (nr_probes = 0; old[nr_probes]; nr_probes++) {
140 if ((!probe || old[nr_probes] == probe))
141 nr_del++;
142 }
143
144 if (nr_probes - nr_del == 0) {
145 /* N -> 0, (N > 1) */
146 entry->funcs = NULL;
147 entry->refcount = 0;
148 debug_print_probes(entry);
149 return old;
150 } else {
151 int j = 0;
152 /* N -> M, (N > 1, M > 0) */
153 /* + 1 for NULL */
154 new = kzalloc((nr_probes - nr_del + 1)
155 * sizeof(void *), GFP_KERNEL);
156 if (new == NULL)
157 return ERR_PTR(-ENOMEM);
158 for (i = 0; old[i]; i++)
159 if ((probe && old[i] != probe))
160 new[j++] = old[i];
161 entry->refcount = nr_probes - nr_del;
162 entry->funcs = new;
163 }
164 debug_print_probes(entry);
165 return old;
166}
167
168/*
169 * Get tracepoint if the tracepoint is present in the tracepoint hash table.
170 * Must be called with tracepoints_mutex held.
171 * Returns NULL if not present.
172 */
173static struct tracepoint_entry *get_tracepoint(const char *name)
174{
175 struct hlist_head *head;
176 struct hlist_node *node;
177 struct tracepoint_entry *e;
178 u32 hash = jhash(name, strlen(name), 0);
179
180 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
181 hlist_for_each_entry(e, node, head, hlist) {
182 if (!strcmp(name, e->name))
183 return e;
184 }
185 return NULL;
186}
187
188/*
189 * Add the tracepoint to the tracepoint hash table. Must be called with
190 * tracepoints_mutex held.
191 */
192static struct tracepoint_entry *add_tracepoint(const char *name)
193{
194 struct hlist_head *head;
195 struct hlist_node *node;
196 struct tracepoint_entry *e;
197 size_t name_len = strlen(name) + 1;
198 u32 hash = jhash(name, name_len-1, 0);
199
200 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
201 hlist_for_each_entry(e, node, head, hlist) {
202 if (!strcmp(name, e->name)) {
203 printk(KERN_NOTICE
204 "tracepoint %s busy\n", name);
205 return ERR_PTR(-EEXIST); /* Already there */
206 }
207 }
208 /*
209 * Using kmalloc here to allocate a variable length element. Could
210 * cause some memory fragmentation if overused.
211 */
212 e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL);
213 if (!e)
214 return ERR_PTR(-ENOMEM);
215 memcpy(&e->name[0], name, name_len);
216 e->funcs = NULL;
217 e->refcount = 0;
218 e->rcu_pending = 0;
219 hlist_add_head(&e->hlist, head);
220 return e;
221}
222
223/*
224 * Remove the tracepoint from the tracepoint hash table. Must be called with
225 * mutex_lock held.
226 */
227static int remove_tracepoint(const char *name)
228{
229 struct hlist_head *head;
230 struct hlist_node *node;
231 struct tracepoint_entry *e;
232 int found = 0;
233 size_t len = strlen(name) + 1;
234 u32 hash = jhash(name, len-1, 0);
235
236 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
237 hlist_for_each_entry(e, node, head, hlist) {
238 if (!strcmp(name, e->name)) {
239 found = 1;
240 break;
241 }
242 }
243 if (!found)
244 return -ENOENT;
245 if (e->refcount)
246 return -EBUSY;
247 hlist_del(&e->hlist);
248 /* Make sure the call_rcu_sched has been executed */
249 if (e->rcu_pending)
250 rcu_barrier_sched();
251 kfree(e);
252 return 0;
253}
254
255/*
256 * Sets the probe callback corresponding to one tracepoint.
257 */
258static void set_tracepoint(struct tracepoint_entry **entry,
259 struct tracepoint *elem, int active)
260{
261 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
262
263 /*
264 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
265 * probe callbacks array is consistent before setting a pointer to it.
266 * This array is referenced by __DO_TRACE from
267 * include/linux/tracepoints.h. A matching smp_read_barrier_depends()
268 * is used.
269 */
270 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
271 elem->state = active;
272}
273
274/*
275 * Disable a tracepoint and its probe callback.
276 * Note: only waiting an RCU period after setting elem->call to the empty
277 * function insures that the original callback is not used anymore. This insured
278 * by preempt_disable around the call site.
279 */
280static void disable_tracepoint(struct tracepoint *elem)
281{
282 elem->state = 0;
283}
284
285/**
286 * tracepoint_update_probe_range - Update a probe range
287 * @begin: beginning of the range
288 * @end: end of the range
289 *
290 * Updates the probe callback corresponding to a range of tracepoints.
291 */
292void tracepoint_update_probe_range(struct tracepoint *begin,
293 struct tracepoint *end)
294{
295 struct tracepoint *iter;
296 struct tracepoint_entry *mark_entry;
297
298 mutex_lock(&tracepoints_mutex);
299 for (iter = begin; iter < end; iter++) {
300 mark_entry = get_tracepoint(iter->name);
301 if (mark_entry) {
302 set_tracepoint(&mark_entry, iter,
303 !!mark_entry->refcount);
304 } else {
305 disable_tracepoint(iter);
306 }
307 }
308 mutex_unlock(&tracepoints_mutex);
309}
310
311/*
312 * Update probes, removing the faulty probes.
313 */
314static void tracepoint_update_probes(void)
315{
316 /* Core kernel tracepoints */
317 tracepoint_update_probe_range(__start___tracepoints,
318 __stop___tracepoints);
319 /* tracepoints in modules. */
320 module_update_tracepoints();
321}
322
323/**
324 * tracepoint_probe_register - Connect a probe to a tracepoint
325 * @name: tracepoint name
326 * @probe: probe handler
327 *
328 * Returns 0 if ok, error value on error.
329 * The probe address must at least be aligned on the architecture pointer size.
330 */
331int tracepoint_probe_register(const char *name, void *probe)
332{
333 struct tracepoint_entry *entry;
334 int ret = 0;
335 void *old;
336
337 mutex_lock(&tracepoints_mutex);
338 entry = get_tracepoint(name);
339 if (!entry) {
340 entry = add_tracepoint(name);
341 if (IS_ERR(entry)) {
342 ret = PTR_ERR(entry);
343 goto end;
344 }
345 }
346 /*
347 * If we detect that a call_rcu_sched is pending for this tracepoint,
348 * make sure it's executed now.
349 */
350 if (entry->rcu_pending)
351 rcu_barrier_sched();
352 old = tracepoint_entry_add_probe(entry, probe);
353 if (IS_ERR(old)) {
354 ret = PTR_ERR(old);
355 goto end;
356 }
357 mutex_unlock(&tracepoints_mutex);
358 tracepoint_update_probes(); /* may update entry */
359 mutex_lock(&tracepoints_mutex);
360 entry = get_tracepoint(name);
361 WARN_ON(!entry);
362 if (entry->rcu_pending)
363 rcu_barrier_sched();
364 tracepoint_entry_free_old(entry, old);
365end:
366 mutex_unlock(&tracepoints_mutex);
367 return ret;
368}
369EXPORT_SYMBOL_GPL(tracepoint_probe_register);
370
371/**
372 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
373 * @name: tracepoint name
374 * @probe: probe function pointer
375 *
376 * We do not need to call a synchronize_sched to make sure the probes have
377 * finished running before doing a module unload, because the module unload
378 * itself uses stop_machine(), which insures that every preempt disabled section
379 * have finished.
380 */
381int tracepoint_probe_unregister(const char *name, void *probe)
382{
383 struct tracepoint_entry *entry;
384 void *old;
385 int ret = -ENOENT;
386
387 mutex_lock(&tracepoints_mutex);
388 entry = get_tracepoint(name);
389 if (!entry)
390 goto end;
391 if (entry->rcu_pending)
392 rcu_barrier_sched();
393 old = tracepoint_entry_remove_probe(entry, probe);
394 if (!old) {
395 printk(KERN_WARNING "Warning: Trying to unregister a probe"
396 "that doesn't exist\n");
397 goto end;
398 }
399 mutex_unlock(&tracepoints_mutex);
400 tracepoint_update_probes(); /* may update entry */
401 mutex_lock(&tracepoints_mutex);
402 entry = get_tracepoint(name);
403 if (!entry)
404 goto end;
405 if (entry->rcu_pending)
406 rcu_barrier_sched();
407 tracepoint_entry_free_old(entry, old);
408 remove_tracepoint(name); /* Ignore busy error message */
409 ret = 0;
410end:
411 mutex_unlock(&tracepoints_mutex);
412 return ret;
413}
414EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
415
/**
 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
 * @tracepoint: current tracepoints (in), next tracepoint (out)
 * @begin: beginning of the range
 * @end: end of the range
 *
 * Returns whether a next tracepoint has been found (1) or not (0).
 * Will return the first tracepoint in the range if the input tracepoint is
 * NULL.
 */
int tracepoint_get_iter_range(struct tracepoint **tracepoint,
	struct tracepoint *begin, struct tracepoint *end)
{
	struct tracepoint *cur = *tracepoint;

	/* no current position: start at @begin unless the range is empty */
	if (!cur && begin != end) {
		*tracepoint = begin;
		return 1;
	}

	/* otherwise the current position must lie inside the range */
	return (cur >= begin && cur < end) ? 1 : 0;
}
EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
438
439static void tracepoint_get_iter(struct tracepoint_iter *iter)
440{
441 int found = 0;
442
443 /* Core kernel tracepoints */
444 if (!iter->module) {
445 found = tracepoint_get_iter_range(&iter->tracepoint,
446 __start___tracepoints, __stop___tracepoints);
447 if (found)
448 goto end;
449 }
450 /* tracepoints in modules. */
451 found = module_get_iter_tracepoints(iter);
452end:
453 if (!found)
454 tracepoint_iter_reset(iter);
455}
456
/* Position @iter on a valid tracepoint; see tracepoint_get_iter(). */
void tracepoint_iter_start(struct tracepoint_iter *iter)
{
	tracepoint_get_iter(iter);
}
EXPORT_SYMBOL_GPL(tracepoint_iter_start);
462
463void tracepoint_iter_next(struct tracepoint_iter *iter)
464{
465 iter->tracepoint++;
466 /*
467 * iter->tracepoint may be invalid because we blindly incremented it.
468 * Make sure it is valid by marshalling on the tracepoints, getting the
469 * tracepoints from following modules if necessary.
470 */
471 tracepoint_get_iter(iter);
472}
473EXPORT_SYMBOL_GPL(tracepoint_iter_next);
474
/* End an iteration; there is currently nothing to release. */
void tracepoint_iter_stop(struct tracepoint_iter *iter)
{
}
EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
479
480void tracepoint_iter_reset(struct tracepoint_iter *iter)
481{
482 iter->module = NULL;
483 iter->tracepoint = NULL;
484}
485EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169{ 169{
170 struct user_struct *up = container_of(kobj, struct user_struct, kobj); 170 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
171 171
172 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); 172 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
173} 173}
174 174
175static ssize_t cpu_rt_runtime_store(struct kobject *kobj, 175static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 unsigned long rt_runtime; 180 unsigned long rt_runtime;
181 int rc; 181 int rc;
182 182
183 sscanf(buf, "%lu", &rt_runtime); 183 sscanf(buf, "%ld", &rt_runtime);
184 184
185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime); 185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
186 186
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4ab9659d269e..3b34b3545936 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -60,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
60 60
61#ifdef CONFIG_SYSCTL_SYSCALL 61#ifdef CONFIG_SYSCTL_SYSCALL
62/* The generic string strategy routine: */ 62/* The generic string strategy routine: */
63static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, 63static int sysctl_uts_string(ctl_table *table,
64 void __user *oldval, size_t __user *oldlenp, 64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen) 65 void __user *newval, size_t newlen)
66{ 66{
@@ -69,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
69 write = newval && newlen; 69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table)); 70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write); 71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, name, nlen, 72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
73 oldval, oldlenp, newval, newlen);
74 put_uts(table, write, uts_table.data); 73 put_uts(table, write, uts_table.data);
75 return r; 74 return r;
76} 75}
diff --git a/kernel/wait.c b/kernel/wait.c
index c275c56cf2d3..cd87131f2fc2 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
72 spin_lock_irqsave(&q->lock, flags); 72 spin_lock_irqsave(&q->lock, flags);
73 if (list_empty(&wait->task_list)) 73 if (list_empty(&wait->task_list))
74 __add_wait_queue(q, wait); 74 __add_wait_queue(q, wait);
75 /* 75 set_current_state(state);
76 * don't alter the task state if this is just going to
77 * queue an async wait queue callback
78 */
79 if (is_sync_wait(wait))
80 set_current_state(state);
81 spin_unlock_irqrestore(&q->lock, flags); 76 spin_unlock_irqrestore(&q->lock, flags);
82} 77}
83EXPORT_SYMBOL(prepare_to_wait); 78EXPORT_SYMBOL(prepare_to_wait);
@@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
91 spin_lock_irqsave(&q->lock, flags); 86 spin_lock_irqsave(&q->lock, flags);
92 if (list_empty(&wait->task_list)) 87 if (list_empty(&wait->task_list))
93 __add_wait_queue_tail(q, wait); 88 __add_wait_queue_tail(q, wait);
94 /* 89 set_current_state(state);
95 * don't alter the task state if this is just going to
96 * queue an async wait queue callback
97 */
98 if (is_sync_wait(wait))
99 set_current_state(state);
100 spin_unlock_irqrestore(&q->lock, flags); 90 spin_unlock_irqrestore(&q->lock, flags);
101} 91}
102EXPORT_SYMBOL(prepare_to_wait_exclusive); 92EXPORT_SYMBOL(prepare_to_wait_exclusive);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4048e92aa04f..d4dc69ddebd7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -9,7 +9,7 @@
9 * Derived from the taskqueue/keventd code by: 9 * Derived from the taskqueue/keventd code by:
10 * 10 *
11 * David Woodhouse <dwmw2@infradead.org> 11 * David Woodhouse <dwmw2@infradead.org>
12 * Andrew Morton <andrewm@uow.edu.au> 12 * Andrew Morton
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de> 13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu> 14 * Theodore Ts'o <tytso@mit.edu>
15 * 15 *
@@ -62,6 +62,7 @@ struct workqueue_struct {
62 const char *name; 62 const char *name;
63 int singlethread; 63 int singlethread;
64 int freezeable; /* Freeze threads during suspend */ 64 int freezeable; /* Freeze threads during suspend */
65 int rt;
65#ifdef CONFIG_LOCKDEP 66#ifdef CONFIG_LOCKDEP
66 struct lockdep_map lockdep_map; 67 struct lockdep_map lockdep_map;
67#endif 68#endif
@@ -766,6 +767,7 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
766 767
767static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 768static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
768{ 769{
770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
769 struct workqueue_struct *wq = cwq->wq; 771 struct workqueue_struct *wq = cwq->wq;
770 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; 772 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
771 struct task_struct *p; 773 struct task_struct *p;
@@ -781,7 +783,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
781 */ 783 */
782 if (IS_ERR(p)) 784 if (IS_ERR(p))
783 return PTR_ERR(p); 785 return PTR_ERR(p);
784 786 if (cwq->wq->rt)
787 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
785 cwq->thread = p; 788 cwq->thread = p;
786 789
787 return 0; 790 return 0;
@@ -801,6 +804,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
801struct workqueue_struct *__create_workqueue_key(const char *name, 804struct workqueue_struct *__create_workqueue_key(const char *name,
802 int singlethread, 805 int singlethread,
803 int freezeable, 806 int freezeable,
807 int rt,
804 struct lock_class_key *key, 808 struct lock_class_key *key,
805 const char *lock_name) 809 const char *lock_name)
806{ 810{
@@ -822,6 +826,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
822 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 826 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
823 wq->singlethread = singlethread; 827 wq->singlethread = singlethread;
824 wq->freezeable = freezeable; 828 wq->freezeable = freezeable;
829 wq->rt = rt;
825 INIT_LIST_HEAD(&wq->list); 830 INIT_LIST_HEAD(&wq->list);
826 831
827 if (singlethread) { 832 if (singlethread) {
@@ -965,6 +970,51 @@ undo:
965 return ret; 970 return ret;
966} 971}
967 972
973#ifdef CONFIG_SMP
/*
 * Bundle used by work_on_cpu() to pass a call through a work item:
 *   work - the work item that gets queued
 *   fn   - function to invoke
 *   arg  - argument handed to @fn
 *   ret  - where the return value of @fn is stored by do_work_for_cpu()
 */
974struct work_for_cpu {
975 struct work_struct work;
976 long (*fn)(void *);
977 void *arg;
978 long ret;
979};
980
/*
 * Work handler installed by work_on_cpu(): recovers the enclosing
 * struct work_for_cpu from the work item, calls its fn with its arg and
 * stores the result in its ret field.
 */
981static void do_work_for_cpu(struct work_struct *w)
982{
983 struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
984
985 wfc->ret = wfc->fn(wfc->arg);
986}
987
988/**
989 * work_on_cpu - run a function in user context on a particular cpu
990 * @cpu: the cpu to run on
991 * @fn: the function to run
992 * @arg: the function arg
993 *
994 * This will return -EINVAL if the cpu is not online, or the return value
995 * of @fn otherwise.
996 */
/*
 * NOTE(review): a -EINVAL result is ambiguous if @fn can itself return
 * -EINVAL; callers cannot tell the two cases apart.
 */
997long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
998{
999 struct work_for_cpu wfc;
1000
1001 INIT_WORK(&wfc.work, do_work_for_cpu);
1002 wfc.fn = fn;
1003 wfc.arg = arg;
/*
 * The online check and the queueing both happen between
 * get_online_cpus() and put_online_cpus(); presumably this keeps @cpu
 * from going offline in between - TODO confirm.
 */
1004 get_online_cpus();
1005 if (unlikely(!cpu_online(cpu)))
1006 wfc.ret = -EINVAL;
1007 else {
/*
 * wfc lives on this stack frame; flush_work() is presumably what
 * guarantees the handler has finished before wfc.ret is read and the
 * frame goes away - verify against flush_work() semantics.
 */
1008 schedule_work_on(cpu, &wfc.work);
1009 flush_work(&wfc.work);
1010 }
1011 put_online_cpus();
1012
1013 return wfc.ret;
1014}
1015EXPORT_SYMBOL_GPL(work_on_cpu);
1016#endif /* CONFIG_SMP */
1017
968void __init init_workqueues(void) 1018void __init init_workqueues(void)
969{ 1019{
970 cpu_populated_map = cpu_online_map; 1020 cpu_populated_map = cpu_online_map;