aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorArnd Bergmann <arnd@arndb.de>2009-06-12 03:53:47 -0400
committerArnd Bergmann <arnd@arndb.de>2009-06-12 05:32:58 -0400
commit5b02ee3d219f9e01b6e9146e25613822cfc2e5ce (patch)
tree7ce9126738c3cf4b37d67170d0e4b34818c057a9 /kernel
parent26a28fa4fea5b8c65713aa50c124f76a88c7924d (diff)
parent8ebf975608aaebd7feb33d77f07ba21a6380e086 (diff)
asm-generic: merge branch 'master' of torvalds/linux-2.6
Fixes a merge conflict against the x86 tree caused by a fix to atomic.h which I renamed to atomic_long.h. Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit_tree.c6
-rw-r--r--kernel/cgroup.c3
-rw-r--r--kernel/compat.c11
-rw-r--r--kernel/cpuset.c2
-rw-r--r--kernel/cred.c4
-rw-r--r--kernel/exit.c23
-rw-r--r--kernel/fork.c31
-rw-r--r--kernel/futex.c1188
-rw-r--r--kernel/irq/Makefile2
-rw-r--r--kernel/irq/chip.c12
-rw-r--r--kernel/irq/handle.c72
-rw-r--r--kernel/irq/internals.h5
-rw-r--r--kernel/irq/manage.c17
-rw-r--r--kernel/irq/migration.c14
-rw-r--r--kernel/irq/numa_migrate.c38
-rw-r--r--kernel/kthread.c5
-rw-r--r--kernel/lockdep.c16
-rw-r--r--kernel/module.c89
-rw-r--r--kernel/mutex.c31
-rw-r--r--kernel/perf_counter.c4260
-rw-r--r--kernel/profile.c6
-rw-r--r--kernel/ptrace.c19
-rw-r--r--kernel/rcupreempt.c8
-rw-r--r--kernel/rcutree.c25
-rw-r--r--kernel/rcutree_trace.c64
-rw-r--r--kernel/rtmutex.c248
-rw-r--r--kernel/rtmutex_common.h8
-rw-r--r--kernel/sched.c444
-rw-r--r--kernel/sched_cpupri.c8
-rw-r--r--kernel/sched_fair.c13
-rw-r--r--kernel/sched_idletask.c3
-rw-r--r--kernel/signal.c71
-rw-r--r--kernel/slow-work.c4
-rw-r--r--kernel/softirq.c9
-rw-r--r--kernel/sys.c7
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c49
-rw-r--r--kernel/time/clocksource.c3
-rw-r--r--kernel/time/timekeeping.c9
-rw-r--r--kernel/timer.c89
-rw-r--r--kernel/trace/Kconfig143
-rw-r--r--kernel/trace/Makefile20
-rw-r--r--kernel/trace/blktrace.c285
-rw-r--r--kernel/trace/events.c14
-rw-r--r--kernel/trace/ftrace.c801
-rw-r--r--kernel/trace/kmemtrace.c10
-rw-r--r--kernel/trace/ring_buffer.c777
-rw-r--r--kernel/trace/ring_buffer_benchmark.c416
-rw-r--r--kernel/trace/trace.c383
-rw-r--r--kernel/trace/trace.h243
-rw-r--r--kernel/trace/trace_boot.c5
-rw-r--r--kernel/trace/trace_branch.c8
-rw-r--r--kernel/trace/trace_event_profile.c24
-rw-r--r--kernel/trace/trace_event_types.h12
-rw-r--r--kernel/trace/trace_events.c839
-rw-r--r--kernel/trace/trace_events_filter.c1204
-rw-r--r--kernel/trace/trace_events_stage_1.h39
-rw-r--r--kernel/trace/trace_events_stage_2.h176
-rw-r--r--kernel/trace/trace_events_stage_3.h281
-rw-r--r--kernel/trace/trace_export.c110
-rw-r--r--kernel/trace/trace_functions_graph.c31
-rw-r--r--kernel/trace/trace_hw_branches.c203
-rw-r--r--kernel/trace/trace_mmiotrace.c6
-rw-r--r--kernel/trace/trace_output.c240
-rw-r--r--kernel/trace/trace_output.h34
-rw-r--r--kernel/trace/trace_power.c8
-rw-r--r--kernel/trace/trace_printk.c6
-rw-r--r--kernel/trace/trace_sched_switch.c12
-rw-r--r--kernel/trace/trace_sched_wakeup.c8
-rw-r--r--kernel/trace/trace_selftest.c58
-rw-r--r--kernel/trace/trace_stack.c15
-rw-r--r--kernel/trace/trace_stat.c208
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_sysprof.c6
-rw-r--r--kernel/trace/trace_workqueue.c25
-rw-r--r--kernel/wait.c2
-rw-r--r--kernel/workqueue.c11
78 files changed, 10934 insertions, 2652 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 42423665660a..90b53f6dc226 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -93,8 +93,10 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
93obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o 93obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
94obj-$(CONFIG_FUNCTION_TRACER) += trace/ 94obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 95obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_X86_DS) += trace/
96obj-$(CONFIG_SMP) += sched_cpupri.o 97obj-$(CONFIG_SMP) += sched_cpupri.o
97obj-$(CONFIG_SLOW_WORK) += slow-work.o 98obj-$(CONFIG_SLOW_WORK) += slow-work.o
99obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
98 100
99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 101ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 102# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6e7351739a82..1f6396d76687 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -568,7 +568,7 @@ void audit_trim_trees(void)
568 if (err) 568 if (err)
569 goto skip_it; 569 goto skip_it;
570 570
571 root_mnt = collect_mounts(path.mnt, path.dentry); 571 root_mnt = collect_mounts(&path);
572 path_put(&path); 572 path_put(&path);
573 if (!root_mnt) 573 if (!root_mnt)
574 goto skip_it; 574 goto skip_it;
@@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
660 err = kern_path(tree->pathname, 0, &path); 660 err = kern_path(tree->pathname, 0, &path);
661 if (err) 661 if (err)
662 goto Err; 662 goto Err;
663 mnt = collect_mounts(path.mnt, path.dentry); 663 mnt = collect_mounts(&path);
664 path_put(&path); 664 path_put(&path);
665 if (!mnt) { 665 if (!mnt) {
666 err = -ENOMEM; 666 err = -ENOMEM;
@@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new)
720 err = kern_path(new, 0, &path); 720 err = kern_path(new, 0, &path);
721 if (err) 721 if (err)
722 return err; 722 return err;
723 tagged = collect_mounts(path.mnt, path.dentry); 723 tagged = collect_mounts(&path);
724 path_put(&path); 724 path_put(&path);
725 if (!tagged) 725 if (!tagged)
726 return -ENOMEM; 726 return -ENOMEM;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7267bfd3765..3fb789f6df94 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h>
49 50
50#include <asm/atomic.h> 51#include <asm/atomic.h>
51 52
@@ -900,6 +901,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
900 struct cgroup *cgrp = &root->top_cgroup; 901 struct cgroup *cgrp = &root->top_cgroup;
901 struct cgroup_sb_opts opts; 902 struct cgroup_sb_opts opts;
902 903
904 lock_kernel();
903 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 905 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
904 mutex_lock(&cgroup_mutex); 906 mutex_lock(&cgroup_mutex);
905 907
@@ -927,6 +929,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
927 kfree(opts.release_agent); 929 kfree(opts.release_agent);
928 mutex_unlock(&cgroup_mutex); 930 mutex_unlock(&cgroup_mutex);
929 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 931 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
932 unlock_kernel();
930 return ret; 933 return ret;
931} 934}
932 935
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460f..f6c204f07ea6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
882 882
883} 883}
884 884
885asmlinkage long
886compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
887 struct compat_siginfo __user *uinfo)
888{
889 siginfo_t info;
890
891 if (copy_siginfo_from_user32(&info, uinfo))
892 return -EFAULT;
893 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
894}
895
885#ifdef __ARCH_WANT_COMPAT_SYS_TIME 896#ifdef __ARCH_WANT_COMPAT_SYS_TIME
886 897
887/* compat_time_t is a 32 bit "long" and needs to get converted. */ 898/* compat_time_t is a 32 bit "long" and needs to get converted. */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026faccca869..d5a7e17474ee 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = {
1857 1857
1858int __init cpuset_init_early(void) 1858int __init cpuset_init_early(void)
1859{ 1859{
1860 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); 1860 alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
1861 1861
1862 top_cpuset.mems_generation = cpuset_mems_generation++; 1862 top_cpuset.mems_generation = cpuset_mems_generation++;
1863 return 0; 1863 return 0;
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a039189d707..1bb4d7e5d616 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds);
167 167
168/* 168/*
169 * Prepare credentials for current to perform an execve() 169 * Prepare credentials for current to perform an execve()
170 * - The caller must hold current->cred_exec_mutex 170 * - The caller must hold current->cred_guard_mutex
171 */ 171 */
172struct cred *prepare_exec_creds(void) 172struct cred *prepare_exec_creds(void)
173{ 173{
@@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
276 struct cred *new; 276 struct cred *new;
277 int ret; 277 int ret;
278 278
279 mutex_init(&p->cred_exec_mutex); 279 mutex_init(&p->cred_guard_mutex);
280 280
281 if ( 281 if (
282#ifdef CONFIG_KEYS 282#ifdef CONFIG_KEYS
diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3b95c6..b6c90b5ef509 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,7 +48,8 @@
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/fs_struct.h> 49#include <linux/fs_struct.h>
50#include <linux/init_task.h> 50#include <linux/init_task.h>
51#include <trace/sched.h> 51#include <linux/perf_counter.h>
52#include <trace/events/sched.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -56,10 +57,6 @@
56#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
57#include "cred-internals.h" 58#include "cred-internals.h"
58 59
59DEFINE_TRACE(sched_process_free);
60DEFINE_TRACE(sched_process_exit);
61DEFINE_TRACE(sched_process_wait);
62
63static void exit_mm(struct task_struct * tsk); 60static void exit_mm(struct task_struct * tsk);
64 61
65static void __unhash_process(struct task_struct *p) 62static void __unhash_process(struct task_struct *p)
@@ -158,6 +155,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
158{ 155{
159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 156 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160 157
158#ifdef CONFIG_PERF_COUNTERS
159 WARN_ON_ONCE(tsk->perf_counter_ctxp);
160#endif
161 trace_sched_process_free(tsk); 161 trace_sched_process_free(tsk);
162 put_task_struct(tsk); 162 put_task_struct(tsk);
163} 163}
@@ -174,6 +174,7 @@ repeat:
174 atomic_dec(&__task_cred(p)->user->processes); 174 atomic_dec(&__task_cred(p)->user->processes);
175 175
176 proc_flush_task(p); 176 proc_flush_task(p);
177
177 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
178 tracehook_finish_release_task(p); 179 tracehook_finish_release_task(p);
179 __exit_signal(p); 180 __exit_signal(p);
@@ -975,16 +976,19 @@ NORET_TYPE void do_exit(long code)
975 module_put(tsk->binfmt->module); 976 module_put(tsk->binfmt->module);
976 977
977 proc_exit_connector(tsk); 978 proc_exit_connector(tsk);
979
980 /*
981 * Flush inherited counters to the parent - before the parent
982 * gets woken up by child-exit notifications.
983 */
984 perf_counter_exit_task(tsk);
985
978 exit_notify(tsk, group_dead); 986 exit_notify(tsk, group_dead);
979#ifdef CONFIG_NUMA 987#ifdef CONFIG_NUMA
980 mpol_put(tsk->mempolicy); 988 mpol_put(tsk->mempolicy);
981 tsk->mempolicy = NULL; 989 tsk->mempolicy = NULL;
982#endif 990#endif
983#ifdef CONFIG_FUTEX 991#ifdef CONFIG_FUTEX
984 /*
985 * This must happen late, after the PID is not
986 * hashed anymore:
987 */
988 if (unlikely(!list_empty(&tsk->pi_state_list))) 992 if (unlikely(!list_empty(&tsk->pi_state_list)))
989 exit_pi_state_list(tsk); 993 exit_pi_state_list(tsk);
990 if (unlikely(current->pi_state_cache)) 994 if (unlikely(current->pi_state_cache))
@@ -1476,6 +1480,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1476 */ 1480 */
1477 if (*notask_error) 1481 if (*notask_error)
1478 *notask_error = ret; 1482 *notask_error = ret;
1483 return 0;
1479 } 1484 }
1480 1485
1481 if (likely(!ptrace) && unlikely(p->ptrace)) { 1486 if (likely(!ptrace) && unlikely(p->ptrace)) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 875ffbdd96d0..4430eb1376f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,8 +61,8 @@
61#include <linux/proc_fs.h> 61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
64#include <trace/sched.h>
65#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_counter.h>
66 66
67#include <asm/pgtable.h> 67#include <asm/pgtable.h>
68#include <asm/pgalloc.h> 68#include <asm/pgalloc.h>
@@ -71,6 +71,8 @@
71#include <asm/cacheflush.h> 71#include <asm/cacheflush.h>
72#include <asm/tlbflush.h> 72#include <asm/tlbflush.h>
73 73
74#include <trace/events/sched.h>
75
74/* 76/*
75 * Protected counters by write_lock_irq(&tasklist_lock) 77 * Protected counters by write_lock_irq(&tasklist_lock)
76 */ 78 */
@@ -83,8 +85,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
83 85
84__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 86__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
85 87
86DEFINE_TRACE(sched_process_fork);
87
88int nr_processes(void) 88int nr_processes(void)
89{ 89{
90 int cpu; 90 int cpu;
@@ -982,6 +982,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
982 if (!p) 982 if (!p)
983 goto fork_out; 983 goto fork_out;
984 984
985 ftrace_graph_init_task(p);
986
985 rt_mutex_init_task(p); 987 rt_mutex_init_task(p);
986 988
987#ifdef CONFIG_PROVE_LOCKING 989#ifdef CONFIG_PROVE_LOCKING
@@ -1089,12 +1091,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1089#ifdef CONFIG_DEBUG_MUTEXES 1091#ifdef CONFIG_DEBUG_MUTEXES
1090 p->blocked_on = NULL; /* not blocked yet */ 1092 p->blocked_on = NULL; /* not blocked yet */
1091#endif 1093#endif
1092 if (unlikely(current->ptrace)) 1094
1093 ptrace_fork(p, clone_flags); 1095 p->bts = NULL;
1094 1096
1095 /* Perform scheduler related setup. Assign this task to a CPU. */ 1097 /* Perform scheduler related setup. Assign this task to a CPU. */
1096 sched_fork(p, clone_flags); 1098 sched_fork(p, clone_flags);
1097 1099
1100 retval = perf_counter_init_task(p);
1101 if (retval)
1102 goto bad_fork_cleanup_policy;
1103
1098 if ((retval = audit_alloc(p))) 1104 if ((retval = audit_alloc(p)))
1099 goto bad_fork_cleanup_policy; 1105 goto bad_fork_cleanup_policy;
1100 /* copy all the process information */ 1106 /* copy all the process information */
@@ -1131,8 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1131 } 1137 }
1132 } 1138 }
1133 1139
1134 ftrace_graph_init_task(p);
1135
1136 p->pid = pid_nr(pid); 1140 p->pid = pid_nr(pid);
1137 p->tgid = p->pid; 1141 p->tgid = p->pid;
1138 if (clone_flags & CLONE_THREAD) 1142 if (clone_flags & CLONE_THREAD)
@@ -1141,7 +1145,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1141 if (current->nsproxy != p->nsproxy) { 1145 if (current->nsproxy != p->nsproxy) {
1142 retval = ns_cgroup_clone(p, pid); 1146 retval = ns_cgroup_clone(p, pid);
1143 if (retval) 1147 if (retval)
1144 goto bad_fork_free_graph; 1148 goto bad_fork_free_pid;
1145 } 1149 }
1146 1150
1147 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1151 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1233,7 +1237,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1233 spin_unlock(&current->sighand->siglock); 1237 spin_unlock(&current->sighand->siglock);
1234 write_unlock_irq(&tasklist_lock); 1238 write_unlock_irq(&tasklist_lock);
1235 retval = -ERESTARTNOINTR; 1239 retval = -ERESTARTNOINTR;
1236 goto bad_fork_free_graph; 1240 goto bad_fork_free_pid;
1237 } 1241 }
1238 1242
1239 if (clone_flags & CLONE_THREAD) { 1243 if (clone_flags & CLONE_THREAD) {
@@ -1268,8 +1272,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1268 cgroup_post_fork(p); 1272 cgroup_post_fork(p);
1269 return p; 1273 return p;
1270 1274
1271bad_fork_free_graph:
1272 ftrace_graph_exit_task(p);
1273bad_fork_free_pid: 1275bad_fork_free_pid:
1274 if (pid != &init_struct_pid) 1276 if (pid != &init_struct_pid)
1275 free_pid(pid); 1277 free_pid(pid);
@@ -1293,6 +1295,7 @@ bad_fork_cleanup_semundo:
1293bad_fork_cleanup_audit: 1295bad_fork_cleanup_audit:
1294 audit_free(p); 1296 audit_free(p);
1295bad_fork_cleanup_policy: 1297bad_fork_cleanup_policy:
1298 perf_counter_free_task(p);
1296#ifdef CONFIG_NUMA 1299#ifdef CONFIG_NUMA
1297 mpol_put(p->mempolicy); 1300 mpol_put(p->mempolicy);
1298bad_fork_cleanup_cgroup: 1301bad_fork_cleanup_cgroup:
@@ -1406,6 +1409,12 @@ long do_fork(unsigned long clone_flags,
1406 if (clone_flags & CLONE_VFORK) { 1409 if (clone_flags & CLONE_VFORK) {
1407 p->vfork_done = &vfork; 1410 p->vfork_done = &vfork;
1408 init_completion(&vfork); 1411 init_completion(&vfork);
1412 } else if (!(clone_flags & CLONE_VM)) {
1413 /*
1414 * vfork will do an exec which will call
1415 * set_task_comm()
1416 */
1417 perf_counter_fork(p);
1409 } 1418 }
1410 1419
1411 audit_finish_fork(p); 1420 audit_finish_fork(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index d546b2d53a62..80b5ce716596 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
19 * PRIVATE futexes by Eric Dumazet 19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 * 21 *
22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23 * Copyright (C) IBM Corporation, 2009
24 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
25 *
22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
23 * enough at me, Linus for the original (flawed) idea, Matthew 27 * enough at me, Linus for the original (flawed) idea, Matthew
24 * Kirkwood for proof-of-concept implementation. 28 * Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
96 */ 100 */
97struct futex_q { 101struct futex_q {
98 struct plist_node list; 102 struct plist_node list;
99 /* There can only be a single waiter */ 103 /* Waiter reference */
100 wait_queue_head_t waiter; 104 struct task_struct *task;
101 105
102 /* Which hash list lock to use: */ 106 /* Which hash list lock to use: */
103 spinlock_t *lock_ptr; 107 spinlock_t *lock_ptr;
@@ -107,7 +111,9 @@ struct futex_q {
107 111
108 /* Optional priority inheritance state: */ 112 /* Optional priority inheritance state: */
109 struct futex_pi_state *pi_state; 113 struct futex_pi_state *pi_state;
110 struct task_struct *task; 114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter;
111 117
112 /* Bitset for the optional bitmasked wakeup */ 118 /* Bitset for the optional bitmasked wakeup */
113 u32 bitset; 119 u32 bitset;
@@ -278,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
278 drop_futex_key_refs(key); 284 drop_futex_key_refs(key);
279} 285}
280 286
287/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in
290 * @key: the futex key (to distinguish it from other futex futex_q's)
291 *
292 * Must be called with the hb lock held.
293 */
294static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
295 union futex_key *key)
296{
297 struct futex_q *this;
298
299 plist_for_each_entry(this, &hb->chain, list) {
300 if (match_futex(&this->key, key))
301 return this;
302 }
303 return NULL;
304}
305
281static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 306static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
282{ 307{
283 u32 curval; 308 u32 curval;
@@ -539,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
539 return 0; 564 return 0;
540} 565}
541 566
567/**
568 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
569 * @uaddr: the pi futex user address
570 * @hb: the pi futex hash bucket
571 * @key: the futex key associated with uaddr and hb
572 * @ps: the pi_state pointer where we store the result of the
573 * lookup
574 * @task: the task to perform the atomic lock work for. This will
575 * be "current" except in the case of requeue pi.
576 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
577 *
578 * Returns:
579 * 0 - ready to wait
580 * 1 - acquired the lock
581 * <0 - error
582 *
583 * The hb->lock and futex_key refs shall be held by the caller.
584 */
585static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
586 union futex_key *key,
587 struct futex_pi_state **ps,
588 struct task_struct *task, int set_waiters)
589{
590 int lock_taken, ret, ownerdied = 0;
591 u32 uval, newval, curval;
592
593retry:
594 ret = lock_taken = 0;
595
596 /*
597 * To avoid races, we attempt to take the lock here again
598 * (by doing a 0 -> TID atomic cmpxchg), while holding all
599 * the locks. It will most likely not succeed.
600 */
601 newval = task_pid_vnr(task);
602 if (set_waiters)
603 newval |= FUTEX_WAITERS;
604
605 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
606
607 if (unlikely(curval == -EFAULT))
608 return -EFAULT;
609
610 /*
611 * Detect deadlocks.
612 */
613 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
614 return -EDEADLK;
615
616 /*
617 * Surprise - we got the lock. Just return to userspace:
618 */
619 if (unlikely(!curval))
620 return 1;
621
622 uval = curval;
623
624 /*
625 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
626 * to wake at the next unlock.
627 */
628 newval = curval | FUTEX_WAITERS;
629
630 /*
631 * There are two cases, where a futex might have no owner (the
632 * owner TID is 0): OWNER_DIED. We take over the futex in this
633 * case. We also do an unconditional take over, when the owner
634 * of the futex died.
635 *
636 * This is safe as we are protected by the hash bucket lock !
637 */
638 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
639 /* Keep the OWNER_DIED bit */
640 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
641 ownerdied = 0;
642 lock_taken = 1;
643 }
644
645 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
646
647 if (unlikely(curval == -EFAULT))
648 return -EFAULT;
649 if (unlikely(curval != uval))
650 goto retry;
651
652 /*
653 * We took the lock due to owner died take over.
654 */
655 if (unlikely(lock_taken))
656 return 1;
657
658 /*
659 * We dont have the lock. Look up the PI state (or create it if
660 * we are the first waiter):
661 */
662 ret = lookup_pi_state(uval, hb, key, ps);
663
664 if (unlikely(ret)) {
665 switch (ret) {
666 case -ESRCH:
667 /*
668 * No owner found for this futex. Check if the
669 * OWNER_DIED bit is set to figure out whether
670 * this is a robust futex or not.
671 */
672 if (get_futex_value_locked(&curval, uaddr))
673 return -EFAULT;
674
675 /*
676 * We simply start over in case of a robust
677 * futex. The code above will take the futex
678 * and return happy.
679 */
680 if (curval & FUTEX_OWNER_DIED) {
681 ownerdied = 1;
682 goto retry;
683 }
684 default:
685 break;
686 }
687 }
688
689 return ret;
690}
691
542/* 692/*
543 * The hash bucket lock must be held when this is called. 693 * The hash bucket lock must be held when this is called.
544 * Afterwards, the futex_q must not be accessed. 694 * Afterwards, the futex_q must not be accessed.
545 */ 695 */
546static void wake_futex(struct futex_q *q) 696static void wake_futex(struct futex_q *q)
547{ 697{
548 plist_del(&q->list, &q->list.plist); 698 struct task_struct *p = q->task;
699
549 /* 700 /*
550 * The lock in wake_up_all() is a crucial memory barrier after the 701 * We set q->lock_ptr = NULL _before_ we wake up the task. If
551 * plist_del() and also before assigning to q->lock_ptr. 702 * a non futex wake up happens on another CPU then the task
703 * might exit and p would dereference a non existing task
704 * struct. Prevent this by holding a reference on p across the
705 * wake up.
552 */ 706 */
553 wake_up(&q->waiter); 707 get_task_struct(p);
708
709 plist_del(&q->list, &q->list.plist);
554 /* 710 /*
555 * The waiting task can free the futex_q as soon as this is written, 711 * The waiting task can free the futex_q as soon as
556 * without taking any locks. This must come last. 712 * q->lock_ptr = NULL is written, without taking any locks. A
557 * 713 * memory barrier is required here to prevent the following
558 * A memory barrier is required here to prevent the following store to 714 * store to lock_ptr from getting ahead of the plist_del.
559 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
560 * end of wake_up() does not prevent this store from moving.
561 */ 715 */
562 smp_wmb(); 716 smp_wmb();
563 q->lock_ptr = NULL; 717 q->lock_ptr = NULL;
718
719 wake_up_state(p, TASK_NORMAL);
720 put_task_struct(p);
564} 721}
565 722
566static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 723static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -689,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
689 846
690 plist_for_each_entry_safe(this, next, head, list) { 847 plist_for_each_entry_safe(this, next, head, list) {
691 if (match_futex (&this->key, &key)) { 848 if (match_futex (&this->key, &key)) {
692 if (this->pi_state) { 849 if (this->pi_state || this->rt_waiter) {
693 ret = -EINVAL; 850 ret = -EINVAL;
694 break; 851 break;
695 } 852 }
@@ -802,24 +959,185 @@ out:
802 return ret; 959 return ret;
803} 960}
804 961
805/* 962/**
806 * Requeue all waiters hashed on one physical page to another 963 * requeue_futex() - Requeue a futex_q from one hb to another
807 * physical page. 964 * @q: the futex_q to requeue
965 * @hb1: the source hash_bucket
966 * @hb2: the target hash_bucket
967 * @key2: the new key for the requeued futex_q
968 */
969static inline
970void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
971 struct futex_hash_bucket *hb2, union futex_key *key2)
972{
973
974 /*
975 * If key1 and key2 hash to the same bucket, no need to
976 * requeue.
977 */
978 if (likely(&hb1->chain != &hb2->chain)) {
979 plist_del(&q->list, &hb1->chain);
980 plist_add(&q->list, &hb2->chain);
981 q->lock_ptr = &hb2->lock;
982#ifdef CONFIG_DEBUG_PI_LIST
983 q->list.plist.lock = &hb2->lock;
984#endif
985 }
986 get_futex_key_refs(key2);
987 q->key = *key2;
988}
989
990/**
991 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
992 * q: the futex_q
993 * key: the key of the requeue target futex
994 *
995 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
996 * target futex if it is uncontended or via a lock steal. Set the futex_q key
997 * to the requeue target futex so the waiter can detect the wakeup on the right
998 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
999 * atomic lock acquisition. Must be called with the q->lock_ptr held.
1000 */
1001static inline
1002void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
1003{
1004 drop_futex_key_refs(&q->key);
1005 get_futex_key_refs(key);
1006 q->key = *key;
1007
1008 WARN_ON(plist_node_empty(&q->list));
1009 plist_del(&q->list, &q->list.plist);
1010
1011 WARN_ON(!q->rt_waiter);
1012 q->rt_waiter = NULL;
1013
1014 wake_up_state(q->task, TASK_NORMAL);
1015}
1016
1017/**
1018 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1019 * @pifutex: the user address of the to futex
1020 * @hb1: the from futex hash bucket, must be locked by the caller
1021 * @hb2: the to futex hash bucket, must be locked by the caller
1022 * @key1: the from futex key
1023 * @key2: the to futex key
1024 * @ps: address to store the pi_state pointer
1025 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1026 *
1027 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1028 * Wake the top waiter if we succeed. If the caller specified set_waiters,
1029 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1030 * hb1 and hb2 must be held by the caller.
1031 *
1032 * Returns:
1033 * 0 - failed to acquire the lock atomicly
1034 * 1 - acquired the lock
1035 * <0 - error
1036 */
1037static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1038 struct futex_hash_bucket *hb1,
1039 struct futex_hash_bucket *hb2,
1040 union futex_key *key1, union futex_key *key2,
1041 struct futex_pi_state **ps, int set_waiters)
1042{
1043 struct futex_q *top_waiter = NULL;
1044 u32 curval;
1045 int ret;
1046
1047 if (get_futex_value_locked(&curval, pifutex))
1048 return -EFAULT;
1049
1050 /*
1051 * Find the top_waiter and determine if there are additional waiters.
1052 * If the caller intends to requeue more than 1 waiter to pifutex,
1053 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1054 * as we have means to handle the possible fault. If not, don't set
1055 * the bit unecessarily as it will force the subsequent unlock to enter
1056 * the kernel.
1057 */
1058 top_waiter = futex_top_waiter(hb1, key1);
1059
1060 /* There are no waiters, nothing for us to do. */
1061 if (!top_waiter)
1062 return 0;
1063
1064 /*
1065 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1066 * the contended case or if set_waiters is 1. The pi_state is returned
1067 * in ps in contended cases.
1068 */
1069 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1070 set_waiters);
1071 if (ret == 1)
1072 requeue_pi_wake_futex(top_waiter, key2);
1073
1074 return ret;
1075}
1076
1077/**
1078 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1079 * uaddr1: source futex user address
1080 * uaddr2: target futex user address
1081 * nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1082 * nr_requeue: number of waiters to requeue (0-INT_MAX)
1083 * requeue_pi: if we are attempting to requeue from a non-pi futex to a
1084 * pi futex (pi to pi requeue is not supported)
1085 *
1086 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1087 * uaddr2 atomically on behalf of the top waiter.
1088 *
1089 * Returns:
1090 * >=0 - on success, the number of tasks requeued or woken
1091 * <0 - on error
808 */ 1092 */
809static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1093static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
810 int nr_wake, int nr_requeue, u32 *cmpval) 1094 int nr_wake, int nr_requeue, u32 *cmpval,
1095 int requeue_pi)
811{ 1096{
812 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1097 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1098 int drop_count = 0, task_count = 0, ret;
1099 struct futex_pi_state *pi_state = NULL;
813 struct futex_hash_bucket *hb1, *hb2; 1100 struct futex_hash_bucket *hb1, *hb2;
814 struct plist_head *head1; 1101 struct plist_head *head1;
815 struct futex_q *this, *next; 1102 struct futex_q *this, *next;
816 int ret, drop_count = 0; 1103 u32 curval2;
1104
1105 if (requeue_pi) {
1106 /*
1107 * requeue_pi requires a pi_state, try to allocate it now
1108 * without any locks in case it fails.
1109 */
1110 if (refill_pi_state_cache())
1111 return -ENOMEM;
1112 /*
1113 * requeue_pi must wake as many tasks as it can, up to nr_wake
1114 * + nr_requeue, since it acquires the rt_mutex prior to
1115 * returning to userspace, so as to not leave the rt_mutex with
1116 * waiters and no owner. However, second and third wake-ups
1117 * cannot be predicted as they involve race conditions with the
1118 * first wake and a fault while looking up the pi_state. Both
1119 * pthread_cond_signal() and pthread_cond_broadcast() should
1120 * use nr_wake=1.
1121 */
1122 if (nr_wake != 1)
1123 return -EINVAL;
1124 }
817 1125
818retry: 1126retry:
1127 if (pi_state != NULL) {
1128 /*
1129 * We will have to lookup the pi_state again, so free this one
1130 * to keep the accounting correct.
1131 */
1132 free_pi_state(pi_state);
1133 pi_state = NULL;
1134 }
1135
819 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1136 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
820 if (unlikely(ret != 0)) 1137 if (unlikely(ret != 0))
821 goto out; 1138 goto out;
822 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ); 1139 ret = get_futex_key(uaddr2, fshared, &key2,
1140 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
823 if (unlikely(ret != 0)) 1141 if (unlikely(ret != 0))
824 goto out_put_key1; 1142 goto out_put_key1;
825 1143
@@ -854,32 +1172,99 @@ retry_private:
854 } 1172 }
855 } 1173 }
856 1174
1175 if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1176 /*
1177 * Attempt to acquire uaddr2 and wake the top waiter. If we
1178 * intend to requeue waiters, force setting the FUTEX_WAITERS
1179 * bit. We force this here where we are able to easily handle
1180 * faults rather than in the requeue loop below.
1181 */
1182 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1183 &key2, &pi_state, nr_requeue);
1184
1185 /*
1186 * At this point the top_waiter has either taken uaddr2 or is
1187 * waiting on it. If the former, then the pi_state will not
1188 * exist yet, look it up one more time to ensure we have a
1189 * reference to it.
1190 */
1191 if (ret == 1) {
1192 WARN_ON(pi_state);
1193 task_count++;
1194 ret = get_futex_value_locked(&curval2, uaddr2);
1195 if (!ret)
1196 ret = lookup_pi_state(curval2, hb2, &key2,
1197 &pi_state);
1198 }
1199
1200 switch (ret) {
1201 case 0:
1202 break;
1203 case -EFAULT:
1204 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2);
1208 if (!ret)
1209 goto retry;
1210 goto out;
1211 case -EAGAIN:
1212 /* The owner was exiting, try again. */
1213 double_unlock_hb(hb1, hb2);
1214 put_futex_key(fshared, &key2);
1215 put_futex_key(fshared, &key1);
1216 cond_resched();
1217 goto retry;
1218 default:
1219 goto out_unlock;
1220 }
1221 }
1222
857 head1 = &hb1->chain; 1223 head1 = &hb1->chain;
858 plist_for_each_entry_safe(this, next, head1, list) { 1224 plist_for_each_entry_safe(this, next, head1, list) {
859 if (!match_futex (&this->key, &key1)) 1225 if (task_count - nr_wake >= nr_requeue)
1226 break;
1227
1228 if (!match_futex(&this->key, &key1))
860 continue; 1229 continue;
861 if (++ret <= nr_wake) { 1230
1231 WARN_ON(!requeue_pi && this->rt_waiter);
1232 WARN_ON(requeue_pi && !this->rt_waiter);
1233
1234 /*
1235 * Wake nr_wake waiters. For requeue_pi, if we acquired the
1236 * lock, we already woke the top_waiter. If not, it will be
1237 * woken by futex_unlock_pi().
1238 */
1239 if (++task_count <= nr_wake && !requeue_pi) {
862 wake_futex(this); 1240 wake_futex(this);
863 } else { 1241 continue;
864 /* 1242 }
865 * If key1 and key2 hash to the same bucket, no need to
866 * requeue.
867 */
868 if (likely(head1 != &hb2->chain)) {
869 plist_del(&this->list, &hb1->chain);
870 plist_add(&this->list, &hb2->chain);
871 this->lock_ptr = &hb2->lock;
872#ifdef CONFIG_DEBUG_PI_LIST
873 this->list.plist.lock = &hb2->lock;
874#endif
875 }
876 this->key = key2;
877 get_futex_key_refs(&key2);
878 drop_count++;
879 1243
880 if (ret - nr_wake >= nr_requeue) 1244 /*
881 break; 1245 * Requeue nr_requeue waiters and possibly one more in the case
1246 * of requeue_pi if we couldn't acquire the lock atomically.
1247 */
1248 if (requeue_pi) {
1249 /* Prepare the waiter to take the rt_mutex. */
1250 atomic_inc(&pi_state->refcount);
1251 this->pi_state = pi_state;
1252 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1253 this->rt_waiter,
1254 this->task, 1);
1255 if (ret == 1) {
1256 /* We got the lock. */
1257 requeue_pi_wake_futex(this, &key2);
1258 continue;
1259 } else if (ret) {
1260 /* -EDEADLK */
1261 this->pi_state = NULL;
1262 free_pi_state(pi_state);
1263 goto out_unlock;
1264 }
882 } 1265 }
1266 requeue_futex(this, hb1, hb2, &key2);
1267 drop_count++;
883 } 1268 }
884 1269
885out_unlock: 1270out_unlock:
@@ -899,7 +1284,9 @@ out_put_keys:
899out_put_key1: 1284out_put_key1:
900 put_futex_key(fshared, &key1); 1285 put_futex_key(fshared, &key1);
901out: 1286out:
902 return ret; 1287 if (pi_state != NULL)
1288 free_pi_state(pi_state);
1289 return ret ? ret : task_count;
903} 1290}
904 1291
905/* The key must be already stored in q->key. */ 1292/* The key must be already stored in q->key. */
@@ -907,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
907{ 1294{
908 struct futex_hash_bucket *hb; 1295 struct futex_hash_bucket *hb;
909 1296
910 init_waitqueue_head(&q->waiter);
911
912 get_futex_key_refs(&q->key); 1297 get_futex_key_refs(&q->key);
913 hb = hash_futex(&q->key); 1298 hb = hash_futex(&q->key);
914 q->lock_ptr = &hb->lock; 1299 q->lock_ptr = &hb->lock;
@@ -1119,35 +1504,149 @@ handle_fault:
1119 */ 1504 */
1120#define FLAGS_SHARED 0x01 1505#define FLAGS_SHARED 0x01
1121#define FLAGS_CLOCKRT 0x02 1506#define FLAGS_CLOCKRT 0x02
1507#define FLAGS_HAS_TIMEOUT 0x04
1122 1508
1123static long futex_wait_restart(struct restart_block *restart); 1509static long futex_wait_restart(struct restart_block *restart);
1124 1510
1125static int futex_wait(u32 __user *uaddr, int fshared, 1511/**
1126 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1512 * fixup_owner() - Post lock pi_state and corner case management
1513 * @uaddr: user address of the futex
1514 * @fshared: whether the futex is shared (1) or not (0)
1515 * @q: futex_q (contains pi_state and access to the rt_mutex)
1516 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1517 *
1518 * After attempting to lock an rt_mutex, this function is called to cleanup
1519 * the pi_state owner as well as handle race conditions that may allow us to
1520 * acquire the lock. Must be called with the hb lock held.
1521 *
1522 * Returns:
1523 * 1 - success, lock taken
1524 * 0 - success, lock not taken
1525 * <0 - on error (-EFAULT)
1526 */
1527static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1528 int locked)
1127{ 1529{
1128 struct task_struct *curr = current; 1530 struct task_struct *owner;
1129 struct restart_block *restart; 1531 int ret = 0;
1130 DECLARE_WAITQUEUE(wait, curr);
1131 struct futex_hash_bucket *hb;
1132 struct futex_q q;
1133 u32 uval;
1134 int ret;
1135 struct hrtimer_sleeper t;
1136 int rem = 0;
1137 1532
1138 if (!bitset) 1533 if (locked) {
1139 return -EINVAL; 1534 /*
1535 * Got the lock. We might not be the anticipated owner if we
1536 * did a lock-steal - fix up the PI-state in that case:
1537 */
1538 if (q->pi_state->owner != current)
1539 ret = fixup_pi_state_owner(uaddr, q, current, fshared);
1540 goto out;
1541 }
1140 1542
1141 q.pi_state = NULL; 1543 /*
1142 q.bitset = bitset; 1544 * Catch the rare case, where the lock was released when we were on the
1143retry: 1545 * way back before we locked the hash bucket.
1144 q.key = FUTEX_KEY_INIT; 1546 */
1145 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); 1547 if (q->pi_state->owner == current) {
1146 if (unlikely(ret != 0)) 1548 /*
1549 * Try to get the rt_mutex now. This might fail as some other
1550 * task acquired the rt_mutex after we removed ourself from the
1551 * rt_mutex waiters list.
1552 */
1553 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1554 locked = 1;
1555 goto out;
1556 }
1557
1558 /*
1559 * pi_state is incorrect, some other task did a lock steal and
1560 * we returned due to timeout or signal without taking the
1561 * rt_mutex. Too late. We can access the rt_mutex_owner without
1562 * locking, as the other task is now blocked on the hash bucket
1563 * lock. Fix the state up.
1564 */
1565 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1566 ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
1147 goto out; 1567 goto out;
1568 }
1148 1569
1149retry_private: 1570 /*
1150 hb = queue_lock(&q); 1571 * Paranoia check. If we did not take the lock, then we should not be
1572 * the owner, nor the pending owner, of the rt_mutex.
1573 */
1574 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1575 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1576 "pi-state %p\n", ret,
1577 q->pi_state->pi_mutex.owner,
1578 q->pi_state->owner);
1579
1580out:
1581 return ret ? ret : locked;
1582}
1583
1584/**
1585 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1586 * @hb: the futex hash bucket, must be locked by the caller
1587 * @q: the futex_q to queue up on
1588 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
1589 */
1590static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1591 struct hrtimer_sleeper *timeout)
1592{
1593 queue_me(q, hb);
1594
1595 /*
1596 * There might have been scheduling since the queue_me(), as we
1597 * cannot hold a spinlock across the get_user() in case it
1598 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
1599 * queueing ourselves into the futex hash. This code thus has to
1600 * rely on the futex_wake() code removing us from hash when it
1601 * wakes us up.
1602 */
1603 set_current_state(TASK_INTERRUPTIBLE);
1604
1605 /* Arm the timer */
1606 if (timeout) {
1607 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1608 if (!hrtimer_active(&timeout->timer))
1609 timeout->task = NULL;
1610 }
1611
1612 /*
1613 * !plist_node_empty() is safe here without any lock.
1614 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1615 */
1616 if (likely(!plist_node_empty(&q->list))) {
1617 /*
1618 * If the timer has already expired, current will already be
1619 * flagged for rescheduling. Only call schedule if there
1620 * is no timeout, or if it has yet to expire.
1621 */
1622 if (!timeout || timeout->task)
1623 schedule();
1624 }
1625 __set_current_state(TASK_RUNNING);
1626}
1627
1628/**
1629 * futex_wait_setup() - Prepare to wait on a futex
1630 * @uaddr: the futex userspace address
1631 * @val: the expected value
1632 * @fshared: whether the futex is shared (1) or not (0)
1633 * @q: the associated futex_q
1634 * @hb: storage for hash_bucket pointer to be returned to caller
1635 *
1636 * Setup the futex_q and locate the hash_bucket. Get the futex value and
1637 * compare it with the expected value. Handle atomic faults internally.
1638 * Return with the hb lock held and a q.key reference on success, and unlocked
1639 * with no q.key reference on failure.
1640 *
1641 * Returns:
1642 * 0 - uaddr contains val and hb has been locked
1643 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1644 */
1645static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1646 struct futex_q *q, struct futex_hash_bucket **hb)
1647{
1648 u32 uval;
1649 int ret;
1151 1650
1152 /* 1651 /*
1153 * Access the page AFTER the hash-bucket is locked. 1652 * Access the page AFTER the hash-bucket is locked.
@@ -1165,95 +1664,83 @@ retry_private:
1165 * A consequence is that futex_wait() can return zero and absorb 1664 * A consequence is that futex_wait() can return zero and absorb
1166 * a wakeup when *uaddr != val on entry to the syscall. This is 1665 * a wakeup when *uaddr != val on entry to the syscall. This is
1167 * rare, but normal. 1666 * rare, but normal.
1168 *
1169 * For shared futexes, we hold the mmap semaphore, so the mapping
1170 * cannot have changed since we looked it up in get_futex_key.
1171 */ 1667 */
1668retry:
1669 q->key = FUTEX_KEY_INIT;
1670 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
1671 if (unlikely(ret != 0))
1672 return ret;
1673
1674retry_private:
1675 *hb = queue_lock(q);
1676
1172 ret = get_futex_value_locked(&uval, uaddr); 1677 ret = get_futex_value_locked(&uval, uaddr);
1173 1678
1174 if (unlikely(ret)) { 1679 if (ret) {
1175 queue_unlock(&q, hb); 1680 queue_unlock(q, *hb);
1176 1681
1177 ret = get_user(uval, uaddr); 1682 ret = get_user(uval, uaddr);
1178 if (ret) 1683 if (ret)
1179 goto out_put_key; 1684 goto out;
1180 1685
1181 if (!fshared) 1686 if (!fshared)
1182 goto retry_private; 1687 goto retry_private;
1183 1688
1184 put_futex_key(fshared, &q.key); 1689 put_futex_key(fshared, &q->key);
1185 goto retry; 1690 goto retry;
1186 } 1691 }
1187 ret = -EWOULDBLOCK;
1188 if (unlikely(uval != val)) {
1189 queue_unlock(&q, hb);
1190 goto out_put_key;
1191 }
1192 1692
1193 /* Only actually queue if *uaddr contained val. */ 1693 if (uval != val) {
1194 queue_me(&q, hb); 1694 queue_unlock(q, *hb);
1695 ret = -EWOULDBLOCK;
1696 }
1195 1697
1196 /* 1698out:
1197 * There might have been scheduling since the queue_me(), as we 1699 if (ret)
1198 * cannot hold a spinlock across the get_user() in case it 1700 put_futex_key(fshared, &q->key);
1199 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1701 return ret;
1200 * queueing ourselves into the futex hash. This code thus has to 1702}
1201 * rely on the futex_wake() code removing us from hash when it
1202 * wakes us up.
1203 */
1204 1703
1205 /* add_wait_queue is the barrier after __set_current_state. */ 1704static int futex_wait(u32 __user *uaddr, int fshared,
1206 __set_current_state(TASK_INTERRUPTIBLE); 1705 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1207 add_wait_queue(&q.waiter, &wait); 1706{
1208 /* 1707 struct hrtimer_sleeper timeout, *to = NULL;
1209 * !plist_node_empty() is safe here without any lock. 1708 struct restart_block *restart;
1210 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1709 struct futex_hash_bucket *hb;
1211 */ 1710 struct futex_q q;
1212 if (likely(!plist_node_empty(&q.list))) { 1711 int ret;
1213 if (!abs_time)
1214 schedule();
1215 else {
1216 hrtimer_init_on_stack(&t.timer,
1217 clockrt ? CLOCK_REALTIME :
1218 CLOCK_MONOTONIC,
1219 HRTIMER_MODE_ABS);
1220 hrtimer_init_sleeper(&t, current);
1221 hrtimer_set_expires_range_ns(&t.timer, *abs_time,
1222 current->timer_slack_ns);
1223
1224 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1225 if (!hrtimer_active(&t.timer))
1226 t.task = NULL;
1227 1712
1228 /* 1713 if (!bitset)
1229 * the timer could have already expired, in which 1714 return -EINVAL;
1230 * case current would be flagged for rescheduling.
1231 * Don't bother calling schedule.
1232 */
1233 if (likely(t.task))
1234 schedule();
1235 1715
1236 hrtimer_cancel(&t.timer); 1716 q.pi_state = NULL;
1717 q.bitset = bitset;
1718 q.rt_waiter = NULL;
1237 1719
1238 /* Flag if a timeout occured */ 1720 if (abs_time) {
1239 rem = (t.task == NULL); 1721 to = &timeout;
1240 1722
1241 destroy_hrtimer_on_stack(&t.timer); 1723 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
1242 } 1724 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1725 hrtimer_init_sleeper(to, current);
1726 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1727 current->timer_slack_ns);
1243 } 1728 }
1244 __set_current_state(TASK_RUNNING);
1245 1729
1246 /* 1730 /* Prepare to wait on uaddr. */
1247 * NOTE: we don't remove ourselves from the waitqueue because 1731 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1248 * we are the only user of it. 1732 if (ret)
1249 */ 1733 goto out;
1734
1735 /* queue_me and wait for wakeup, timeout, or a signal. */
1736 futex_wait_queue_me(hb, &q, to);
1250 1737
1251 /* If we were woken (and unqueued), we succeeded, whatever. */ 1738 /* If we were woken (and unqueued), we succeeded, whatever. */
1252 ret = 0; 1739 ret = 0;
1253 if (!unqueue_me(&q)) 1740 if (!unqueue_me(&q))
1254 goto out_put_key; 1741 goto out_put_key;
1255 ret = -ETIMEDOUT; 1742 ret = -ETIMEDOUT;
1256 if (rem) 1743 if (to && !to->task)
1257 goto out_put_key; 1744 goto out_put_key;
1258 1745
1259 /* 1746 /*
@@ -1270,7 +1757,7 @@ retry_private:
1270 restart->futex.val = val; 1757 restart->futex.val = val;
1271 restart->futex.time = abs_time->tv64; 1758 restart->futex.time = abs_time->tv64;
1272 restart->futex.bitset = bitset; 1759 restart->futex.bitset = bitset;
1273 restart->futex.flags = 0; 1760 restart->futex.flags = FLAGS_HAS_TIMEOUT;
1274 1761
1275 if (fshared) 1762 if (fshared)
1276 restart->futex.flags |= FLAGS_SHARED; 1763 restart->futex.flags |= FLAGS_SHARED;
@@ -1282,6 +1769,10 @@ retry_private:
1282out_put_key: 1769out_put_key:
1283 put_futex_key(fshared, &q.key); 1770 put_futex_key(fshared, &q.key);
1284out: 1771out:
1772 if (to) {
1773 hrtimer_cancel(&to->timer);
1774 destroy_hrtimer_on_stack(&to->timer);
1775 }
1285 return ret; 1776 return ret;
1286} 1777}
1287 1778
@@ -1290,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart)
1290{ 1781{
1291 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1782 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1292 int fshared = 0; 1783 int fshared = 0;
1293 ktime_t t; 1784 ktime_t t, *tp = NULL;
1294 1785
1295 t.tv64 = restart->futex.time; 1786 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1787 t.tv64 = restart->futex.time;
1788 tp = &t;
1789 }
1296 restart->fn = do_no_restart_syscall; 1790 restart->fn = do_no_restart_syscall;
1297 if (restart->futex.flags & FLAGS_SHARED) 1791 if (restart->futex.flags & FLAGS_SHARED)
1298 fshared = 1; 1792 fshared = 1;
1299 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1793 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
1300 restart->futex.bitset, 1794 restart->futex.bitset,
1301 restart->futex.flags & FLAGS_CLOCKRT); 1795 restart->futex.flags & FLAGS_CLOCKRT);
1302} 1796}
@@ -1312,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1312 int detect, ktime_t *time, int trylock) 1806 int detect, ktime_t *time, int trylock)
1313{ 1807{
1314 struct hrtimer_sleeper timeout, *to = NULL; 1808 struct hrtimer_sleeper timeout, *to = NULL;
1315 struct task_struct *curr = current;
1316 struct futex_hash_bucket *hb; 1809 struct futex_hash_bucket *hb;
1317 u32 uval, newval, curval; 1810 u32 uval;
1318 struct futex_q q; 1811 struct futex_q q;
1319 int ret, lock_taken, ownerdied = 0; 1812 int res, ret;
1320 1813
1321 if (refill_pi_state_cache()) 1814 if (refill_pi_state_cache())
1322 return -ENOMEM; 1815 return -ENOMEM;
@@ -1330,6 +1823,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1330 } 1823 }
1331 1824
1332 q.pi_state = NULL; 1825 q.pi_state = NULL;
1826 q.rt_waiter = NULL;
1333retry: 1827retry:
1334 q.key = FUTEX_KEY_INIT; 1828 q.key = FUTEX_KEY_INIT;
1335 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1829 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -1339,81 +1833,15 @@ retry:
1339retry_private: 1833retry_private:
1340 hb = queue_lock(&q); 1834 hb = queue_lock(&q);
1341 1835
1342retry_locked: 1836 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
1343 ret = lock_taken = 0;
1344
1345 /*
1346 * To avoid races, we attempt to take the lock here again
1347 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1348 * the locks. It will most likely not succeed.
1349 */
1350 newval = task_pid_vnr(current);
1351
1352 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1353
1354 if (unlikely(curval == -EFAULT))
1355 goto uaddr_faulted;
1356
1357 /*
1358 * Detect deadlocks. In case of REQUEUE_PI this is a valid
1359 * situation and we return success to user space.
1360 */
1361 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1362 ret = -EDEADLK;
1363 goto out_unlock_put_key;
1364 }
1365
1366 /*
1367 * Surprise - we got the lock. Just return to userspace:
1368 */
1369 if (unlikely(!curval))
1370 goto out_unlock_put_key;
1371
1372 uval = curval;
1373
1374 /*
1375 * Set the WAITERS flag, so the owner will know it has someone
1376 * to wake at next unlock
1377 */
1378 newval = curval | FUTEX_WAITERS;
1379
1380 /*
1381 * There are two cases, where a futex might have no owner (the
1382 * owner TID is 0): OWNER_DIED. We take over the futex in this
1383 * case. We also do an unconditional take over, when the owner
1384 * of the futex died.
1385 *
1386 * This is safe as we are protected by the hash bucket lock !
1387 */
1388 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1389 /* Keep the OWNER_DIED bit */
1390 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1391 ownerdied = 0;
1392 lock_taken = 1;
1393 }
1394
1395 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1396
1397 if (unlikely(curval == -EFAULT))
1398 goto uaddr_faulted;
1399 if (unlikely(curval != uval))
1400 goto retry_locked;
1401
1402 /*
1403 * We took the lock due to owner died take over.
1404 */
1405 if (unlikely(lock_taken))
1406 goto out_unlock_put_key;
1407
1408 /*
1409 * We dont have the lock. Look up the PI state (or create it if
1410 * we are the first waiter):
1411 */
1412 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1413
1414 if (unlikely(ret)) { 1837 if (unlikely(ret)) {
1415 switch (ret) { 1838 switch (ret) {
1416 1839 case 1:
1840 /* We got the lock. */
1841 ret = 0;
1842 goto out_unlock_put_key;
1843 case -EFAULT:
1844 goto uaddr_faulted;
1417 case -EAGAIN: 1845 case -EAGAIN:
1418 /* 1846 /*
1419 * Task is exiting and we just wait for the 1847 * Task is exiting and we just wait for the
@@ -1423,25 +1851,6 @@ retry_locked:
1423 put_futex_key(fshared, &q.key); 1851 put_futex_key(fshared, &q.key);
1424 cond_resched(); 1852 cond_resched();
1425 goto retry; 1853 goto retry;
1426
1427 case -ESRCH:
1428 /*
1429 * No owner found for this futex. Check if the
1430 * OWNER_DIED bit is set to figure out whether
1431 * this is a robust futex or not.
1432 */
1433 if (get_futex_value_locked(&curval, uaddr))
1434 goto uaddr_faulted;
1435
1436 /*
1437 * We simply start over in case of a robust
1438 * futex. The code above will take the futex
1439 * and return happy.
1440 */
1441 if (curval & FUTEX_OWNER_DIED) {
1442 ownerdied = 1;
1443 goto retry_locked;
1444 }
1445 default: 1854 default:
1446 goto out_unlock_put_key; 1855 goto out_unlock_put_key;
1447 } 1856 }
@@ -1465,71 +1874,21 @@ retry_locked:
1465 } 1874 }
1466 1875
1467 spin_lock(q.lock_ptr); 1876 spin_lock(q.lock_ptr);
1468 1877 /*
1469 if (!ret) { 1878 * Fixup the pi_state owner and possibly acquire the lock if we
1470 /* 1879 * haven't already.
1471 * Got the lock. We might not be the anticipated owner 1880 */
1472 * if we did a lock-steal - fix up the PI-state in 1881 res = fixup_owner(uaddr, fshared, &q, !ret);
1473 * that case: 1882 /*
1474 */ 1883 * If fixup_owner() returned an error, propagate that. If it acquired
1475 if (q.pi_state->owner != curr) 1884 * the lock, clear our -ETIMEDOUT or -EINTR.
1476 ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); 1885 */
1477 } else { 1886 if (res)
1478 /* 1887 ret = (res < 0) ? res : 0;
1479 * Catch the rare case, where the lock was released
1480 * when we were on the way back before we locked the
1481 * hash bucket.
1482 */
1483 if (q.pi_state->owner == curr) {
1484 /*
1485 * Try to get the rt_mutex now. This might
1486 * fail as some other task acquired the
1487 * rt_mutex after we removed ourself from the
1488 * rt_mutex waiters list.
1489 */
1490 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1491 ret = 0;
1492 else {
1493 /*
1494 * pi_state is incorrect, some other
1495 * task did a lock steal and we
1496 * returned due to timeout or signal
1497 * without taking the rt_mutex. Too
1498 * late. We can access the
1499 * rt_mutex_owner without locking, as
1500 * the other task is now blocked on
1501 * the hash bucket lock. Fix the state
1502 * up.
1503 */
1504 struct task_struct *owner;
1505 int res;
1506
1507 owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1508 res = fixup_pi_state_owner(uaddr, &q, owner,
1509 fshared);
1510
1511 /* propagate -EFAULT, if the fixup failed */
1512 if (res)
1513 ret = res;
1514 }
1515 } else {
1516 /*
1517 * Paranoia check. If we did not take the lock
1518 * in the trylock above, then we should not be
1519 * the owner of the rtmutex, neither the real
1520 * nor the pending one:
1521 */
1522 if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
1523 printk(KERN_ERR "futex_lock_pi: ret = %d "
1524 "pi-mutex: %p pi-state %p\n", ret,
1525 q.pi_state->pi_mutex.owner,
1526 q.pi_state->owner);
1527 }
1528 }
1529 1888
1530 /* 1889 /*
1531 * If fixup_pi_state_owner() faulted and was unable to handle the 1890 * If fixup_owner() faulted and was unable to handle the fault, unlock
1532 * fault, unlock it and return the fault to userspace. 1891 * it and return the fault to userspace.
1533 */ 1892 */
1534 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) 1893 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
1535 rt_mutex_unlock(&q.pi_state->pi_mutex); 1894 rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1537,9 +1896,7 @@ retry_locked:
1537 /* Unqueue and drop the lock */ 1896 /* Unqueue and drop the lock */
1538 unqueue_me_pi(&q); 1897 unqueue_me_pi(&q);
1539 1898
1540 if (to) 1899 goto out;
1541 destroy_hrtimer_on_stack(&to->timer);
1542 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1543 1900
1544out_unlock_put_key: 1901out_unlock_put_key:
1545 queue_unlock(&q, hb); 1902 queue_unlock(&q, hb);
@@ -1549,7 +1906,7 @@ out_put_key:
1549out: 1906out:
1550 if (to) 1907 if (to)
1551 destroy_hrtimer_on_stack(&to->timer); 1908 destroy_hrtimer_on_stack(&to->timer);
1552 return ret; 1909 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1553 1910
1554uaddr_faulted: 1911uaddr_faulted:
1555 /* 1912 /*
@@ -1572,7 +1929,6 @@ uaddr_faulted:
1572 goto retry; 1929 goto retry;
1573} 1930}
1574 1931
1575
1576/* 1932/*
1577 * Userspace attempted a TID -> 0 atomic transition, and failed. 1933 * Userspace attempted a TID -> 0 atomic transition, and failed.
1578 * This is the in-kernel slowpath: we look up the PI state (if any), 1934 * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1674,6 +2030,229 @@ pi_faulted:
1674 return ret; 2030 return ret;
1675} 2031}
1676 2032
2033/**
2034 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2035 * @hb: the hash_bucket futex_q was original enqueued on
2036 * @q: the futex_q woken while waiting to be requeued
2037 * @key2: the futex_key of the requeue target futex
2038 * @timeout: the timeout associated with the wait (NULL if none)
2039 *
2040 * Detect if the task was woken on the initial futex as opposed to the requeue
2041 * target futex. If so, determine if it was a timeout or a signal that caused
2042 * the wakeup and return the appropriate error code to the caller. Must be
2043 * called with the hb lock held.
2044 *
2045 * Returns
2046 * 0 - no early wakeup detected
2047 * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2048 */
2049static inline
2050int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2051 struct futex_q *q, union futex_key *key2,
2052 struct hrtimer_sleeper *timeout)
2053{
2054 int ret = 0;
2055
2056 /*
2057 * With the hb lock held, we avoid races while we process the wakeup.
2058 * We only need to hold hb (and not hb2) to ensure atomicity as the
2059 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2060 * It can't be requeued from uaddr2 to something else since we don't
2061 * support a PI aware source futex for requeue.
2062 */
2063 if (!match_futex(&q->key, key2)) {
2064 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2065 /*
2066 * We were woken prior to requeue by a timeout or a signal.
2067 * Unqueue the futex_q and determine which it was.
2068 */
2069 plist_del(&q->list, &q->list.plist);
2070 drop_futex_key_refs(&q->key);
2071
2072 if (timeout && !timeout->task)
2073 ret = -ETIMEDOUT;
2074 else
2075 ret = -ERESTARTNOINTR;
2076 }
2077 return ret;
2078}
2079
2080/**
2081 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2082 * @uaddr: the futex we initially wait on (non-pi)
2083 * @fshared: whether the futexes are shared (1) or not (0). They must be
2084 * the same type, no requeueing from private to shared, etc.
2085 * @val: the expected value of uaddr
2086 * @abs_time: absolute timeout
2087 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
2088 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2089 * @uaddr2: the pi futex we will take prior to returning to user-space
2090 *
2091 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2092 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
2093 * complete the acquisition of the rt_mutex prior to returning to userspace.
2094 * This ensures the rt_mutex maintains an owner when it has waiters; without
2095 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2096 * need to.
2097 *
2098 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2099 * via the following:
2100 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2101 * 2) wakeup on uaddr2 after a requeue and subsequent unlock
2102 * 3) signal (before or after requeue)
2103 * 4) timeout (before or after requeue)
2104 *
2105 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
2106 *
2107 * If 2, we may then block on trying to take the rt_mutex and return via:
2108 * 5) successful lock
2109 * 6) signal
2110 * 7) timeout
2111 * 8) other lock acquisition failure
2112 *
2113 * If 6, we setup a restart_block with futex_lock_pi() as the function.
2114 *
2115 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2116 *
2117 * Returns:
2118 * 0 - On success
2119 * <0 - On error
2120 */
2121static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2122 u32 val, ktime_t *abs_time, u32 bitset,
2123 int clockrt, u32 __user *uaddr2)
2124{
2125 struct hrtimer_sleeper timeout, *to = NULL;
2126 struct rt_mutex_waiter rt_waiter;
2127 struct rt_mutex *pi_mutex = NULL;
2128 struct futex_hash_bucket *hb;
2129 union futex_key key2;
2130 struct futex_q q;
2131 int res, ret;
2132
2133 if (!bitset)
2134 return -EINVAL;
2135
2136 if (abs_time) {
2137 to = &timeout;
2138 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
2139 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2140 hrtimer_init_sleeper(to, current);
2141 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2142 current->timer_slack_ns);
2143 }
2144
2145 /*
2146 * The waiter is allocated on our stack, manipulated by the requeue
2147 * code while we sleep on uaddr.
2148 */
2149 debug_rt_mutex_init_waiter(&rt_waiter);
2150 rt_waiter.task = NULL;
2151
2152 q.pi_state = NULL;
2153 q.bitset = bitset;
2154 q.rt_waiter = &rt_waiter;
2155
2156 key2 = FUTEX_KEY_INIT;
2157 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2158 if (unlikely(ret != 0))
2159 goto out;
2160
2161 /* Prepare to wait on uaddr. */
2162 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2163 if (ret)
2164 goto out_key2;
2165
2166 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2167 futex_wait_queue_me(hb, &q, to);
2168
2169 spin_lock(&hb->lock);
2170 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2171 spin_unlock(&hb->lock);
2172 if (ret)
2173 goto out_put_keys;
2174
2175 /*
2176 * In order for us to be here, we know our q.key == key2, and since
2177 * we took the hb->lock above, we also know that futex_requeue() has
2178 * completed and we no longer have to concern ourselves with a wakeup
2179 * race with the atomic proxy lock acquisition by the requeue code.
2180 */
2181
2182 /* Check if the requeue code acquired the second futex for us. */
2183 if (!q.rt_waiter) {
2184 /*
2185 * Got the lock. We might not be the anticipated owner if we
2186 * did a lock-steal - fix up the PI-state in that case.
2187 */
2188 if (q.pi_state && (q.pi_state->owner != current)) {
2189 spin_lock(q.lock_ptr);
2190 ret = fixup_pi_state_owner(uaddr2, &q, current,
2191 fshared);
2192 spin_unlock(q.lock_ptr);
2193 }
2194 } else {
2195 /*
2196 * We have been woken up by futex_unlock_pi(), a timeout, or a
2197 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2198 * the pi_state.
2199 */
2200 WARN_ON(!&q.pi_state);
2201 pi_mutex = &q.pi_state->pi_mutex;
2202 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2203 debug_rt_mutex_free_waiter(&rt_waiter);
2204
2205 spin_lock(q.lock_ptr);
2206 /*
2207 * Fixup the pi_state owner and possibly acquire the lock if we
2208 * haven't already.
2209 */
2210 res = fixup_owner(uaddr2, fshared, &q, !ret);
2211 /*
2212 * If fixup_owner() returned an error, propagate that. If it
2213 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
2214 */
2215 if (res)
2216 ret = (res < 0) ? res : 0;
2217
2218 /* Unqueue and drop the lock. */
2219 unqueue_me_pi(&q);
2220 }
2221
2222 /*
2223 * If fixup_pi_state_owner() faulted and was unable to handle the
2224 * fault, unlock the rt_mutex and return the fault to userspace.
2225 */
2226 if (ret == -EFAULT) {
2227 if (rt_mutex_owner(pi_mutex) == current)
2228 rt_mutex_unlock(pi_mutex);
2229 } else if (ret == -EINTR) {
2230 /*
2231 * We've already been requeued, but we have no way to
2232 * restart by calling futex_lock_pi() directly. We
2233 * could restart the syscall, but that will look at
2234 * the user space value and return right away. So we
2235 * drop back with EWOULDBLOCK to tell user space that
2236 * "val" has been changed. That's the same what the
2237 * "val" has been changed. That's the same as what the
2238 * futex_wait_setup().
2239 */
2240 ret = -EWOULDBLOCK;
2241 }
2242
2243out_put_keys:
2244 put_futex_key(fshared, &q.key);
2245out_key2:
2246 put_futex_key(fshared, &key2);
2247
2248out:
2249 if (to) {
2250 hrtimer_cancel(&to->timer);
2251 destroy_hrtimer_on_stack(&to->timer);
2252 }
2253 return ret;
2254}
2255
1677/* 2256/*
1678 * Support for robust futexes: the kernel cleans up held futexes at 2257 * Support for robust futexes: the kernel cleans up held futexes at
1679 * thread exit time. 2258 * thread exit time.
@@ -1896,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1896 fshared = 1; 2475 fshared = 1;
1897 2476
1898 clockrt = op & FUTEX_CLOCK_REALTIME; 2477 clockrt = op & FUTEX_CLOCK_REALTIME;
1899 if (clockrt && cmd != FUTEX_WAIT_BITSET) 2478 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
1900 return -ENOSYS; 2479 return -ENOSYS;
1901 2480
1902 switch (cmd) { 2481 switch (cmd) {
@@ -1911,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1911 ret = futex_wake(uaddr, fshared, val, val3); 2490 ret = futex_wake(uaddr, fshared, val, val3);
1912 break; 2491 break;
1913 case FUTEX_REQUEUE: 2492 case FUTEX_REQUEUE:
1914 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 2493 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
1915 break; 2494 break;
1916 case FUTEX_CMP_REQUEUE: 2495 case FUTEX_CMP_REQUEUE:
1917 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); 2496 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2497 0);
1918 break; 2498 break;
1919 case FUTEX_WAKE_OP: 2499 case FUTEX_WAKE_OP:
1920 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2500 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1931,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1931 if (futex_cmpxchg_enabled) 2511 if (futex_cmpxchg_enabled)
1932 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2512 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
1933 break; 2513 break;
2514 case FUTEX_WAIT_REQUEUE_PI:
2515 val3 = FUTEX_BITSET_MATCH_ANY;
2516 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
2517 clockrt, uaddr2);
2518 break;
2519 case FUTEX_CMP_REQUEUE_PI:
2520 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2521 1);
2522 break;
1934 default: 2523 default:
1935 ret = -ENOSYS; 2524 ret = -ENOSYS;
1936 } 2525 }
@@ -1948,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1948 int cmd = op & FUTEX_CMD_MASK; 2537 int cmd = op & FUTEX_CMD_MASK;
1949 2538
1950 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 2539 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
1951 cmd == FUTEX_WAIT_BITSET)) { 2540 cmd == FUTEX_WAIT_BITSET ||
2541 cmd == FUTEX_WAIT_REQUEUE_PI)) {
1952 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2542 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1953 return -EFAULT; 2543 return -EFAULT;
1954 if (!timespec_valid(&ts)) 2544 if (!timespec_valid(&ts))
@@ -1960,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1960 tp = &t; 2550 tp = &t;
1961 } 2551 }
1962 /* 2552 /*
1963 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. 2553 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
1964 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 2554 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
1965 */ 2555 */
1966 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 2556 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
1967 cmd == FUTEX_WAKE_OP) 2557 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
1968 val2 = (u32) (unsigned long) utime; 2558 val2 = (u32) (unsigned long) utime;
1969 2559
1970 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2560 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 3394f8f52964..7d047808419d 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o 6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c687ba4363f2..13c68e71b726 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
359 359
360 spin_lock(&desc->lock); 360 spin_lock(&desc->lock);
361 mask_ack_irq(desc, irq); 361 mask_ack_irq(desc, irq);
362 desc = irq_remap_to_desc(irq, desc);
363 362
364 if (unlikely(desc->status & IRQ_INPROGRESS)) 363 if (unlikely(desc->status & IRQ_INPROGRESS))
365 goto out_unlock; 364 goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
438 desc->status &= ~IRQ_INPROGRESS; 437 desc->status &= ~IRQ_INPROGRESS;
439out: 438out:
440 desc->chip->eoi(irq); 439 desc->chip->eoi(irq);
441 desc = irq_remap_to_desc(irq, desc);
442 440
443 spin_unlock(&desc->lock); 441 spin_unlock(&desc->lock);
444} 442}
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
475 !desc->action)) { 473 !desc->action)) {
476 desc->status |= (IRQ_PENDING | IRQ_MASKED); 474 desc->status |= (IRQ_PENDING | IRQ_MASKED);
477 mask_ack_irq(desc, irq); 475 mask_ack_irq(desc, irq);
478 desc = irq_remap_to_desc(irq, desc);
479 goto out_unlock; 476 goto out_unlock;
480 } 477 }
481 kstat_incr_irqs_this_cpu(irq, desc); 478 kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
483 /* Start handling the irq */ 480 /* Start handling the irq */
484 if (desc->chip->ack) 481 if (desc->chip->ack)
485 desc->chip->ack(irq); 482 desc->chip->ack(irq);
486 desc = irq_remap_to_desc(irq, desc);
487 483
488 /* Mark the IRQ currently in progress.*/ 484 /* Mark the IRQ currently in progress.*/
489 desc->status |= IRQ_INPROGRESS; 485 desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
544 if (!noirqdebug) 540 if (!noirqdebug)
545 note_interrupt(irq, desc, action_ret); 541 note_interrupt(irq, desc, action_ret);
546 542
547 if (desc->chip->eoi) { 543 if (desc->chip->eoi)
548 desc->chip->eoi(irq); 544 desc->chip->eoi(irq);
549 desc = irq_remap_to_desc(irq, desc);
550 }
551} 545}
552 546
553void 547void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
582 576
583 /* Uninstall? */ 577 /* Uninstall? */
584 if (handle == handle_bad_irq) { 578 if (handle == handle_bad_irq) {
585 if (desc->chip != &no_irq_chip) { 579 if (desc->chip != &no_irq_chip)
586 mask_ack_irq(desc, irq); 580 mask_ack_irq(desc, irq);
587 desc = irq_remap_to_desc(irq, desc);
588 }
589 desc->status |= IRQ_DISABLED; 581 desc->status |= IRQ_DISABLED;
590 desc->depth = 1; 582 desc->depth = 1;
591 } 583 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 26e08754744f..104578541230 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,14 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/slab.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/random.h> 16#include <linux/random.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
18#include <linux/rculist.h> 19#include <linux/rculist.h>
19#include <linux/hash.h> 20#include <linux/hash.h>
20#include <trace/irq.h>
21#include <linux/bootmem.h> 21#include <linux/bootmem.h>
22#include <trace/events/irq.h>
22 23
23#include "internals.h" 24#include "internals.h"
24 25
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {
81 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 82 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
82}; 83};
83 84
84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 85void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
85{ 86{
86 int node;
87 void *ptr; 87 void *ptr;
88 88
89 node = cpu_to_node(cpu); 89 if (slab_is_available())
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92 else
93 ptr = alloc_bootmem_node(NODE_DATA(node),
94 nr * sizeof(*desc->kstat_irqs));
91 95
92 /* 96 /*
93 * don't overwrite if can not get new one 97 * don't overwrite if can not get new one
94 * init_copy_kstat_irqs() could still use old one 98 * init_copy_kstat_irqs() could still use old one
95 */ 99 */
96 if (ptr) { 100 if (ptr) {
97 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", 101 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
98 cpu, node);
99 desc->kstat_irqs = ptr; 102 desc->kstat_irqs = ptr;
100 } 103 }
101} 104}
102 105
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) 106static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{ 107{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 108 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106 109
107 spin_lock_init(&desc->lock); 110 spin_lock_init(&desc->lock);
108 desc->irq = irq; 111 desc->irq = irq;
109#ifdef CONFIG_SMP 112#ifdef CONFIG_SMP
110 desc->cpu = cpu; 113 desc->node = node;
111#endif 114#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 115 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, cpu, nr_cpu_ids); 116 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) { 117 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n"); 118 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1); 119 BUG_ON(1);
117 } 120 }
118 if (!init_alloc_desc_masks(desc, cpu, false)) { 121 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); 122 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1); 123 BUG_ON(1);
121 } 124 }
122 arch_init_chip_data(desc, cpu); 125 init_desc_masks(desc);
126 arch_init_chip_data(desc, node);
123} 127}
124 128
125/* 129/*
@@ -146,6 +150,7 @@ int __init early_irq_init(void)
146{ 150{
147 struct irq_desc *desc; 151 struct irq_desc *desc;
148 int legacy_count; 152 int legacy_count;
153 int node;
149 int i; 154 int i;
150 155
151 init_irq_default_affinity(); 156 init_irq_default_affinity();
@@ -156,20 +161,21 @@ int __init early_irq_init(void)
156 161
157 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
158 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node;
159 165
160 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
161 irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
162 168
163 /* allocate based on nr_cpu_ids */ 169 /* allocate based on nr_cpu_ids */
164 /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ 170 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
165 kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * 171 sizeof(int), GFP_NOWAIT, node);
166 sizeof(int));
167 172
168 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
169 desc[i].irq = i; 174 desc[i].irq = i;
170 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
171 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
172 init_alloc_desc_masks(&desc[i], 0, true); 177 alloc_desc_masks(&desc[i], node, true);
178 init_desc_masks(&desc[i]);
173 irq_desc_ptrs[i] = desc + i; 179 irq_desc_ptrs[i] = desc + i;
174 } 180 }
175 181
@@ -187,11 +193,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
187 return NULL; 193 return NULL;
188} 194}
189 195
190struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 196struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
191{ 197{
192 struct irq_desc *desc; 198 struct irq_desc *desc;
193 unsigned long flags; 199 unsigned long flags;
194 int node;
195 200
196 if (irq >= nr_irqs) { 201 if (irq >= nr_irqs) {
197 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", 202 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -210,15 +215,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
210 if (desc) 215 if (desc)
211 goto out_unlock; 216 goto out_unlock;
212 217
213 node = cpu_to_node(cpu); 218 if (slab_is_available())
214 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 219 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
215 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", 220 else
216 irq, cpu, node); 221 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
222
223 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
217 if (!desc) { 224 if (!desc) {
218 printk(KERN_ERR "can not alloc irq_desc\n"); 225 printk(KERN_ERR "can not alloc irq_desc\n");
219 BUG_ON(1); 226 BUG_ON(1);
220 } 227 }
221 init_one_irq_desc(irq, desc, cpu); 228 init_one_irq_desc(irq, desc, node);
222 229
223 irq_desc_ptrs[irq] = desc; 230 irq_desc_ptrs[irq] = desc;
224 231
@@ -256,7 +263,8 @@ int __init early_irq_init(void)
256 263
257 for (i = 0; i < count; i++) { 264 for (i = 0; i < count; i++) {
258 desc[i].irq = i; 265 desc[i].irq = i;
259 init_alloc_desc_masks(&desc[i], 0, true); 266 alloc_desc_masks(&desc[i], 0, true);
267 init_desc_masks(&desc[i]);
260 desc[i].kstat_irqs = kstat_irqs_all[i]; 268 desc[i].kstat_irqs = kstat_irqs_all[i];
261 } 269 }
262 return arch_early_irq_init(); 270 return arch_early_irq_init();
@@ -267,7 +275,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
267 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 275 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
268} 276}
269 277
270struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 278struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
271{ 279{
272 return irq_to_desc(irq); 280 return irq_to_desc(irq);
273} 281}
@@ -348,9 +356,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
348 "but no thread function available.", irq, action->name); 356 "but no thread function available.", irq, action->name);
349} 357}
350 358
351DEFINE_TRACE(irq_handler_entry);
352DEFINE_TRACE(irq_handler_exit);
353
354/** 359/**
355 * handle_IRQ_event - irq action chain handler 360 * handle_IRQ_event - irq action chain handler
356 * @irq: the interrupt number 361 * @irq: the interrupt number
@@ -453,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq)
453 /* 458 /*
454 * No locking required for CPU-local interrupts: 459 * No locking required for CPU-local interrupts:
455 */ 460 */
456 if (desc->chip->ack) { 461 if (desc->chip->ack)
457 desc->chip->ack(irq); 462 desc->chip->ack(irq);
458 /* get new one */
459 desc = irq_remap_to_desc(irq, desc);
460 }
461 if (likely(!(desc->status & IRQ_DISABLED))) { 463 if (likely(!(desc->status & IRQ_DISABLED))) {
462 action_ret = handle_IRQ_event(irq, desc->action); 464 action_ret = handle_IRQ_event(irq, desc->action);
463 if (!noirqdebug) 465 if (!noirqdebug)
@@ -468,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq)
468 } 470 }
469 471
470 spin_lock(&desc->lock); 472 spin_lock(&desc->lock);
471 if (desc->chip->ack) { 473 if (desc->chip->ack)
472 desc->chip->ack(irq); 474 desc->chip->ack(irq);
473 desc = irq_remap_to_desc(irq, desc);
474 }
475 /* 475 /*
476 * REPLAY is when Linux resends an IRQ that was dropped earlier 476 * REPLAY is when Linux resends an IRQ that was dropped earlier
477 * WAITING is used by probe to mark irqs that are being tested 477 * WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 01ce20eab38f..73468253143b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 17
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern spinlock_t sparse_irq_lock;
22 22
@@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void
46irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
47
45/* 48/*
46 * Debugging printout: 49 * Debugging printout:
47 */ 50 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2734eca59243..aaf5c9d05770 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83static void 83void
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
85{ 85{
86 struct irqaction *action = desc->action; 86 struct irqaction *action = desc->action;
@@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
109 spin_lock_irqsave(&desc->lock, flags); 109 spin_lock_irqsave(&desc->lock, flags);
110 110
111#ifdef CONFIG_GENERIC_PENDING_IRQ 111#ifdef CONFIG_GENERIC_PENDING_IRQ
112 if (desc->status & IRQ_MOVE_PCNTXT) 112 if (desc->status & IRQ_MOVE_PCNTXT) {
113 desc->chip->set_affinity(irq, cpumask); 113 if (!desc->chip->set_affinity(irq, cpumask)) {
114 cpumask_copy(desc->affinity, cpumask);
115 irq_set_thread_affinity(desc, cpumask);
116 }
117 }
114 else { 118 else {
115 desc->status |= IRQ_MOVE_PENDING; 119 desc->status |= IRQ_MOVE_PENDING;
116 cpumask_copy(desc->pending_mask, cpumask); 120 cpumask_copy(desc->pending_mask, cpumask);
117 } 121 }
118#else 122#else
119 cpumask_copy(desc->affinity, cpumask); 123 if (!desc->chip->set_affinity(irq, cpumask)) {
120 desc->chip->set_affinity(irq, cpumask); 124 cpumask_copy(desc->affinity, cpumask);
125 irq_set_thread_affinity(desc, cpumask);
126 }
121#endif 127#endif
122 irq_set_thread_affinity(desc, cpumask);
123 desc->status |= IRQ_AFFINITY_SET; 128 desc->status |= IRQ_AFFINITY_SET;
124 spin_unlock_irqrestore(&desc->lock, flags); 129 spin_unlock_irqrestore(&desc->lock, flags);
125 return 0; 130 return 0;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e05ad9be43b7..cfe767ca1545 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
1 1
2#include <linux/irq.h> 2#include <linux/irq.h>
3#include <linux/interrupt.h>
4
5#include "internals.h"
3 6
4void move_masked_irq(int irq) 7void move_masked_irq(int irq)
5{ 8{
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
39 * masking the irqs. 42 * masking the irqs.
40 */ 43 */
41 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
42 < nr_cpu_ids)) { 45 < nr_cpu_ids))
43 cpumask_and(desc->affinity, 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
44 desc->pending_mask, cpu_online_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
45 desc->chip->set_affinity(irq, desc->affinity); 48 irq_set_thread_affinity(desc, desc->pending_mask);
46 } 49 }
50
47 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
48} 52}
49 53
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 44bbdcbaf8d2..2f69bee57bf2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
15 15
16static void init_copy_kstat_irqs(struct irq_desc *old_desc, 16static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc, 17 struct irq_desc *desc,
18 int cpu, int nr) 18 int node, int nr)
19{ 19{
20 init_kstat_irqs(desc, cpu, nr); 20 init_kstat_irqs(desc, node, nr);
21 21
22 if (desc->kstat_irqs != old_desc->kstat_irqs) 22 if (desc->kstat_irqs != old_desc->kstat_irqs)
23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, 23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
34} 34}
35 35
36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, 36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
37 struct irq_desc *desc, int cpu) 37 struct irq_desc *desc, int node)
38{ 38{
39 memcpy(desc, old_desc, sizeof(struct irq_desc)); 39 memcpy(desc, old_desc, sizeof(struct irq_desc));
40 if (!init_alloc_desc_masks(desc, cpu, false)) { 40 if (!alloc_desc_masks(desc, node, false)) {
41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " 41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 spin_lock_init(&desc->lock);
46 desc->cpu = cpu; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
49 init_copy_desc_masks(old_desc, desc); 49 init_copy_desc_masks(old_desc, desc);
50 arch_init_copy_chip_data(old_desc, desc, cpu); 50 arch_init_copy_chip_data(old_desc, desc, node);
51 return true; 51 return true;
52} 52}
53 53
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
59} 59}
60 60
61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, 61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
62 int cpu) 62 int node)
63{ 63{
64 struct irq_desc *desc; 64 struct irq_desc *desc;
65 unsigned int irq; 65 unsigned int irq;
66 unsigned long flags; 66 unsigned long flags;
67 int node;
68 67
69 irq = old_desc->irq; 68 irq = old_desc->irq;
70 69
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
76 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
77 goto out_unlock; 76 goto out_unlock;
78 77
79 node = cpu_to_node(cpu);
80 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 78 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
81 if (!desc) { 79 if (!desc) {
82 printk(KERN_ERR "irq %d: can not get new irq_desc " 80 printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
85 desc = old_desc; 83 desc = old_desc;
86 goto out_unlock; 84 goto out_unlock;
87 } 85 }
88 if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { 86 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
89 /* still use old one */ 87 /* still use old one */
90 kfree(desc); 88 kfree(desc);
91 desc = old_desc; 89 desc = old_desc;
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
97 95
98 /* free the old one */ 96 /* free the old one */
99 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
100 spin_unlock(&old_desc->lock);
101 kfree(old_desc); 98 kfree(old_desc);
102 spin_lock(&desc->lock);
103 99
104 return desc; 100 return desc;
105 101
@@ -109,24 +105,14 @@ out_unlock:
109 return desc; 105 return desc;
110} 106}
111 107
112struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
113{ 109{
114 int old_cpu;
115 int node, old_node;
116
117 /* those are all static, do not move them */ 110 /* those are all static, do not move them */
118 if (desc->irq < NR_IRQS_LEGACY) 111 if (desc->irq < NR_IRQS_LEGACY)
119 return desc; 112 return desc;
120 113
121 old_cpu = desc->cpu; 114 if (desc->node != node)
122 if (old_cpu != cpu) { 115 desc = __real_move_irq_desc(desc, node);
123 node = cpu_to_node(cpu);
124 old_node = cpu_to_node(old_cpu);
125 if (old_node != node)
126 desc = __real_move_irq_desc(desc, cpu);
127 else
128 desc->cpu = cpu;
129 }
130 116
131 return desc; 117 return desc;
132} 118}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ebaf8519abf..41c88fe40500 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -13,7 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <trace/sched.h> 16#include <trace/events/sched.h>
17 17
18#define KTHREAD_NICE_LEVEL (-5) 18#define KTHREAD_NICE_LEVEL (-5)
19 19
@@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock);
21static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
22struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
23 23
24DEFINE_TRACE(sched_kthread_stop);
25DEFINE_TRACE(sched_kthread_stop_ret);
26
27struct kthread_create_info 24struct kthread_create_info
28{ 25{
29 /* Information passed to kthread() from kthreadd. */ 26 /* Information passed to kthread() from kthreadd. */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index accb40cdb12a..8bbeef996c76 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,12 +42,14 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <trace/lockdep.h>
46 45
47#include <asm/sections.h> 46#include <asm/sections.h>
48 47
49#include "lockdep_internals.h" 48#include "lockdep_internals.h"
50 49
50#define CREATE_TRACE_POINTS
51#include <trace/events/lockdep.h>
52
51#ifdef CONFIG_PROVE_LOCKING 53#ifdef CONFIG_PROVE_LOCKING
52int prove_locking = 1; 54int prove_locking = 1;
53module_param(prove_locking, int, 0644); 55module_param(prove_locking, int, 0644);
@@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
2935} 2937}
2936EXPORT_SYMBOL_GPL(lock_set_class); 2938EXPORT_SYMBOL_GPL(lock_set_class);
2937 2939
2938DEFINE_TRACE(lock_acquire);
2939
2940/* 2940/*
2941 * We are not always called with irqs disabled - do that here, 2941 * We are not always called with irqs disabled - do that here,
2942 * and also avoid lockdep recursion: 2942 * and also avoid lockdep recursion:
@@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2963} 2963}
2964EXPORT_SYMBOL_GPL(lock_acquire); 2964EXPORT_SYMBOL_GPL(lock_acquire);
2965 2965
2966DEFINE_TRACE(lock_release);
2967
2968void lock_release(struct lockdep_map *lock, int nested, 2966void lock_release(struct lockdep_map *lock, int nested,
2969 unsigned long ip) 2967 unsigned long ip)
2970{ 2968{
@@ -3105,6 +3103,8 @@ found_it:
3105 hlock->holdtime_stamp = now; 3103 hlock->holdtime_stamp = now;
3106 } 3104 }
3107 3105
3106 trace_lock_acquired(lock, ip, waittime);
3107
3108 stats = get_lock_stats(hlock_class(hlock)); 3108 stats = get_lock_stats(hlock_class(hlock));
3109 if (waittime) { 3109 if (waittime) {
3110 if (hlock->read) 3110 if (hlock->read)
@@ -3120,8 +3120,6 @@ found_it:
3120 lock->ip = ip; 3120 lock->ip = ip;
3121} 3121}
3122 3122
3123DEFINE_TRACE(lock_contended);
3124
3125void lock_contended(struct lockdep_map *lock, unsigned long ip) 3123void lock_contended(struct lockdep_map *lock, unsigned long ip)
3126{ 3124{
3127 unsigned long flags; 3125 unsigned long flags;
@@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3143} 3141}
3144EXPORT_SYMBOL_GPL(lock_contended); 3142EXPORT_SYMBOL_GPL(lock_contended);
3145 3143
3146DEFINE_TRACE(lock_acquired);
3147
3148void lock_acquired(struct lockdep_map *lock, unsigned long ip) 3144void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3149{ 3145{
3150 unsigned long flags; 3146 unsigned long flags;
3151 3147
3152 trace_lock_acquired(lock, ip);
3153
3154 if (unlikely(!lock_stat)) 3148 if (unlikely(!lock_stat))
3155 return; 3149 return;
3156 3150
diff --git a/kernel/module.c b/kernel/module.c
index e797812a4d95..35f7de00bf0d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,6 +18,7 @@
18*/ 18*/
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/ftrace_event.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
23#include <linux/fs.h> 24#include <linux/fs.h>
@@ -52,6 +53,7 @@
52#include <linux/ftrace.h> 53#include <linux/ftrace.h>
53#include <linux/async.h> 54#include <linux/async.h>
54#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h>
55 57
56#if 0 58#if 0
57#define DEBUGP printk 59#define DEBUGP printk
@@ -72,6 +74,9 @@ DEFINE_MUTEX(module_mutex);
72EXPORT_SYMBOL_GPL(module_mutex); 74EXPORT_SYMBOL_GPL(module_mutex);
73static LIST_HEAD(modules); 75static LIST_HEAD(modules);
74 76
77/* Block module loading/unloading? */
78int modules_disabled = 0;
79
75/* Waiting for a module to finish initializing? */ 80/* Waiting for a module to finish initializing? */
76static DECLARE_WAIT_QUEUE_HEAD(module_wq); 81static DECLARE_WAIT_QUEUE_HEAD(module_wq);
77 82
@@ -429,6 +434,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
429 unsigned long extra; 434 unsigned long extra;
430 unsigned int i; 435 unsigned int i;
431 void *ptr; 436 void *ptr;
437 int cpu;
432 438
433 if (align > PAGE_SIZE) { 439 if (align > PAGE_SIZE) {
434 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 440 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
@@ -458,6 +464,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
458 if (!split_block(i, size)) 464 if (!split_block(i, size))
459 return NULL; 465 return NULL;
460 466
467 /* add the per-cpu scanning areas */
468 for_each_possible_cpu(cpu)
469 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
470 GFP_KERNEL);
471
461 /* Mark allocated */ 472 /* Mark allocated */
462 pcpu_size[i] = -pcpu_size[i]; 473 pcpu_size[i] = -pcpu_size[i];
463 return ptr; 474 return ptr;
@@ -472,6 +483,7 @@ static void percpu_modfree(void *freeme)
472{ 483{
473 unsigned int i; 484 unsigned int i;
474 void *ptr = __per_cpu_start + block_size(pcpu_size[0]); 485 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
486 int cpu;
475 487
476 /* First entry is core kernel percpu data. */ 488 /* First entry is core kernel percpu data. */
477 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 489 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -483,6 +495,10 @@ static void percpu_modfree(void *freeme)
483 BUG(); 495 BUG();
484 496
485 free: 497 free:
498 /* remove the per-cpu scanning areas */
499 for_each_possible_cpu(cpu)
500 kmemleak_free(freeme + per_cpu_offset(cpu));
501
486 /* Merge with previous? */ 502 /* Merge with previous? */
487 if (pcpu_size[i-1] >= 0) { 503 if (pcpu_size[i-1] >= 0) {
488 pcpu_size[i-1] += pcpu_size[i]; 504 pcpu_size[i-1] += pcpu_size[i];
@@ -777,7 +793,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
777 char name[MODULE_NAME_LEN]; 793 char name[MODULE_NAME_LEN];
778 int ret, forced = 0; 794 int ret, forced = 0;
779 795
780 if (!capable(CAP_SYS_MODULE)) 796 if (!capable(CAP_SYS_MODULE) || modules_disabled)
781 return -EPERM; 797 return -EPERM;
782 798
783 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) 799 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
@@ -1489,9 +1505,6 @@ static void free_module(struct module *mod)
1489 /* Free any allocated parameters. */ 1505 /* Free any allocated parameters. */
1490 destroy_params(mod->kp, mod->num_kp); 1506 destroy_params(mod->kp, mod->num_kp);
1491 1507
1492 /* release any pointers to mcount in this module */
1493 ftrace_release(mod->module_core, mod->core_size);
1494
1495 /* This may be NULL, but that's OK */ 1508 /* This may be NULL, but that's OK */
1496 module_free(mod, mod->module_init); 1509 module_free(mod, mod->module_init);
1497 kfree(mod->args); 1510 kfree(mod->args);
@@ -1878,6 +1891,36 @@ static void *module_alloc_update_bounds(unsigned long size)
1878 return ret; 1891 return ret;
1879} 1892}
1880 1893
1894#ifdef CONFIG_DEBUG_KMEMLEAK
1895static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1896 Elf_Shdr *sechdrs, char *secstrings)
1897{
1898 unsigned int i;
1899
1900 /* only scan the sections containing data */
1901 kmemleak_scan_area(mod->module_core, (unsigned long)mod -
1902 (unsigned long)mod->module_core,
1903 sizeof(struct module), GFP_KERNEL);
1904
1905 for (i = 1; i < hdr->e_shnum; i++) {
1906 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
1907 continue;
1908 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
1909 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
1910 continue;
1911
1912 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
1913 (unsigned long)mod->module_core,
1914 sechdrs[i].sh_size, GFP_KERNEL);
1915 }
1916}
1917#else
1918static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1919 Elf_Shdr *sechdrs, char *secstrings)
1920{
1921}
1922#endif
1923
1881/* Allocate and load the module: note that size of section 0 is always 1924/* Allocate and load the module: note that size of section 0 is always
1882 zero, and we rely on this for optional sections. */ 1925 zero, and we rely on this for optional sections. */
1883static noinline struct module *load_module(void __user *umod, 1926static noinline struct module *load_module(void __user *umod,
@@ -1892,11 +1935,9 @@ static noinline struct module *load_module(void __user *umod,
1892 unsigned int symindex = 0; 1935 unsigned int symindex = 0;
1893 unsigned int strindex = 0; 1936 unsigned int strindex = 0;
1894 unsigned int modindex, versindex, infoindex, pcpuindex; 1937 unsigned int modindex, versindex, infoindex, pcpuindex;
1895 unsigned int num_mcount;
1896 struct module *mod; 1938 struct module *mod;
1897 long err = 0; 1939 long err = 0;
1898 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1940 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1899 unsigned long *mseg;
1900 mm_segment_t old_fs; 1941 mm_segment_t old_fs;
1901 1942
1902 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1943 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2050,6 +2091,12 @@ static noinline struct module *load_module(void __user *umod,
2050 2091
2051 /* Do the allocs. */ 2092 /* Do the allocs. */
2052 ptr = module_alloc_update_bounds(mod->core_size); 2093 ptr = module_alloc_update_bounds(mod->core_size);
2094 /*
2095 * The pointer to this block is stored in the module structure
2096 * which is inside the block. Just mark it as not being a
2097 * leak.
2098 */
2099 kmemleak_not_leak(ptr);
2053 if (!ptr) { 2100 if (!ptr) {
2054 err = -ENOMEM; 2101 err = -ENOMEM;
2055 goto free_percpu; 2102 goto free_percpu;
@@ -2058,6 +2105,13 @@ static noinline struct module *load_module(void __user *umod,
2058 mod->module_core = ptr; 2105 mod->module_core = ptr;
2059 2106
2060 ptr = module_alloc_update_bounds(mod->init_size); 2107 ptr = module_alloc_update_bounds(mod->init_size);
2108 /*
2109 * The pointer to this block is stored in the module structure
2110 * which is inside the block. This block doesn't need to be
2111 * scanned as it contains data and code that will be freed
2112 * after the module is initialized.
2113 */
2114 kmemleak_ignore(ptr);
2061 if (!ptr && mod->init_size) { 2115 if (!ptr && mod->init_size) {
2062 err = -ENOMEM; 2116 err = -ENOMEM;
2063 goto free_core; 2117 goto free_core;
@@ -2088,6 +2142,7 @@ static noinline struct module *load_module(void __user *umod,
2088 } 2142 }
2089 /* Module has been moved. */ 2143 /* Module has been moved. */
2090 mod = (void *)sechdrs[modindex].sh_addr; 2144 mod = (void *)sechdrs[modindex].sh_addr;
2145 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2091 2146
2092#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2147#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2093 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2148 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
@@ -2172,7 +2227,19 @@ static noinline struct module *load_module(void __user *umod,
2172 sizeof(*mod->tracepoints), 2227 sizeof(*mod->tracepoints),
2173 &mod->num_tracepoints); 2228 &mod->num_tracepoints);
2174#endif 2229#endif
2175 2230#ifdef CONFIG_EVENT_TRACING
2231 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2232 "_ftrace_events",
2233 sizeof(*mod->trace_events),
2234 &mod->num_trace_events);
2235#endif
2236#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2237 /* sechdrs[0].sh_size is always zero */
2238 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2239 "__mcount_loc",
2240 sizeof(*mod->ftrace_callsites),
2241 &mod->num_ftrace_callsites);
2242#endif
2176#ifdef CONFIG_MODVERSIONS 2243#ifdef CONFIG_MODVERSIONS
2177 if ((mod->num_syms && !mod->crcs) 2244 if ((mod->num_syms && !mod->crcs)
2178 || (mod->num_gpl_syms && !mod->gpl_crcs) 2245 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2237,11 +2304,6 @@ static noinline struct module *load_module(void __user *umod,
2237 dynamic_debug_setup(debug, num_debug); 2304 dynamic_debug_setup(debug, num_debug);
2238 } 2305 }
2239 2306
2240 /* sechdrs[0].sh_size is always zero */
2241 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2242 sizeof(*mseg), &num_mcount);
2243 ftrace_init_module(mod, mseg, mseg + num_mcount);
2244
2245 err = module_finalize(hdr, sechdrs, mod); 2307 err = module_finalize(hdr, sechdrs, mod);
2246 if (err < 0) 2308 if (err < 0)
2247 goto cleanup; 2309 goto cleanup;
@@ -2302,7 +2364,6 @@ static noinline struct module *load_module(void __user *umod,
2302 cleanup: 2364 cleanup:
2303 kobject_del(&mod->mkobj.kobj); 2365 kobject_del(&mod->mkobj.kobj);
2304 kobject_put(&mod->mkobj.kobj); 2366 kobject_put(&mod->mkobj.kobj);
2305 ftrace_release(mod->module_core, mod->core_size);
2306 free_unload: 2367 free_unload:
2307 module_unload_free(mod); 2368 module_unload_free(mod);
2308#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2369#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
@@ -2336,7 +2397,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2336 int ret = 0; 2397 int ret = 0;
2337 2398
2338 /* Must have permission */ 2399 /* Must have permission */
2339 if (!capable(CAP_SYS_MODULE)) 2400 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2340 return -EPERM; 2401 return -EPERM;
2341 2402
2342 /* Only one module load at a time, please */ 2403 /* Only one module load at a time, please */
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..947b3ad551f8 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
89 * 89 *
90 * This function is similar to (but not equivalent to) down(). 90 * This function is similar to (but not equivalent to) down().
91 */ 91 */
92void inline __sched mutex_lock(struct mutex *lock) 92void __sched mutex_lock(struct mutex *lock)
93{ 93{
94 might_sleep(); 94 might_sleep();
95 /* 95 /*
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
249 249
250 /* didnt get the lock, go to sleep: */ 250 /* didnt get the lock, go to sleep: */
251 spin_unlock_mutex(&lock->wait_lock, flags); 251 spin_unlock_mutex(&lock->wait_lock, flags);
252 __schedule(); 252 preempt_enable_no_resched();
253 schedule();
254 preempt_disable();
253 spin_lock_mutex(&lock->wait_lock, flags); 255 spin_lock_mutex(&lock->wait_lock, flags);
254 } 256 }
255 257
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)
471 473
472 return ret; 474 return ret;
473} 475}
474
475EXPORT_SYMBOL(mutex_trylock); 476EXPORT_SYMBOL(mutex_trylock);
477
478/**
479 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
480 * @cnt: the atomic which we are to dec
481 * @lock: the mutex to return holding if we dec to 0
482 *
483 * return true and hold lock if we dec to 0, return false otherwise
484 */
485int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
486{
487 /* dec if we can't possibly hit 0 */
488 if (atomic_add_unless(cnt, -1, 1))
489 return 0;
490 /* we might hit 0, so take the lock */
491 mutex_lock(lock);
492 if (!atomic_dec_and_test(cnt)) {
493 /* when we actually did the dec, we didn't hit 0 */
494 mutex_unlock(lock);
495 return 0;
496 }
497 /* we hit 0, and we hold the lock */
498 return 1;
499}
500EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..ef5d8a5b2453
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,4260 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45
46/*
47 * perf counter paranoia level:
48 * 0 - not paranoid
49 * 1 - disallow cpu counters to unpriv
50 * 2 - disallow kernel profiling to unpriv
51 */
52int sysctl_perf_counter_paranoid __read_mostly;
53
54static inline bool perf_paranoid_cpu(void)
55{
56 return sysctl_perf_counter_paranoid > 0;
57}
58
59static inline bool perf_paranoid_kernel(void)
60{
61 return sysctl_perf_counter_paranoid > 1;
62}
63
64int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
65
66/*
67 * max perf counter sample rate
68 */
69int sysctl_perf_counter_sample_rate __read_mostly = 100000;
70
71static atomic64_t perf_counter_id;
72
73/*
74 * Lock for (sysadmin-configurable) counter reservations:
75 */
76static DEFINE_SPINLOCK(perf_resource_lock);
77
78/*
79 * Architecture provided APIs - weak aliases:
80 */
81extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
82{
83 return NULL;
84}
85
86void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); }
88
89void __weak hw_perf_counter_setup(int cpu) { barrier(); }
90
91int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader,
93 struct perf_cpu_context *cpuctx,
94 struct perf_counter_context *ctx, int cpu)
95{
96 return 0;
97}
98
99void __weak perf_counter_print_debug(void) { }
100
101static DEFINE_PER_CPU(int, disable_count);
102
103void __perf_disable(void)
104{
105 __get_cpu_var(disable_count)++;
106}
107
108bool __perf_enable(void)
109{
110 return !--__get_cpu_var(disable_count);
111}
112
113void perf_disable(void)
114{
115 __perf_disable();
116 hw_perf_disable();
117}
118
119void perf_enable(void)
120{
121 if (__perf_enable())
122 hw_perf_enable();
123}
124
125static void get_ctx(struct perf_counter_context *ctx)
126{
127 atomic_inc(&ctx->refcount);
128}
129
130static void free_ctx(struct rcu_head *head)
131{
132 struct perf_counter_context *ctx;
133
134 ctx = container_of(head, struct perf_counter_context, rcu_head);
135 kfree(ctx);
136}
137
138static void put_ctx(struct perf_counter_context *ctx)
139{
140 if (atomic_dec_and_test(&ctx->refcount)) {
141 if (ctx->parent_ctx)
142 put_ctx(ctx->parent_ctx);
143 if (ctx->task)
144 put_task_struct(ctx->task);
145 call_rcu(&ctx->rcu_head, free_ctx);
146 }
147}
148
149/*
150 * Get the perf_counter_context for a task and lock it.
151 * This has to cope with the fact that until it is locked,
152 * the context could get moved to another task.
153 */
154static struct perf_counter_context *
155perf_lock_task_context(struct task_struct *task, unsigned long *flags)
156{
157 struct perf_counter_context *ctx;
158
159 rcu_read_lock();
160 retry:
161 ctx = rcu_dereference(task->perf_counter_ctxp);
162 if (ctx) {
163 /*
164 * If this context is a clone of another, it might
165 * get swapped for another underneath us by
166 * perf_counter_task_sched_out, though the
167 * rcu_read_lock() protects us from any context
168 * getting freed. Lock the context and check if it
169 * got swapped before we could get the lock, and retry
170 * if so. If we locked the right context, then it
171 * can't get swapped on us any more.
172 */
173 spin_lock_irqsave(&ctx->lock, *flags);
174 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
175 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry;
177 }
178 }
179 rcu_read_unlock();
180 return ctx;
181}
182
183/*
184 * Get the context for a task and increment its pin_count so it
185 * can't get swapped to another task. This also increments its
186 * reference count so that the context can't get freed.
187 */
188static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
189{
190 struct perf_counter_context *ctx;
191 unsigned long flags;
192
193 ctx = perf_lock_task_context(task, &flags);
194 if (ctx) {
195 ++ctx->pin_count;
196 get_ctx(ctx);
197 spin_unlock_irqrestore(&ctx->lock, flags);
198 }
199 return ctx;
200}
201
202static void perf_unpin_context(struct perf_counter_context *ctx)
203{
204 unsigned long flags;
205
206 spin_lock_irqsave(&ctx->lock, flags);
207 --ctx->pin_count;
208 spin_unlock_irqrestore(&ctx->lock, flags);
209 put_ctx(ctx);
210}
211
212/*
213 * Add a counter to the lists for its context.
214 * Must be called with ctx->mutex and ctx->lock held.
215 */
216static void
217list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
218{
219 struct perf_counter *group_leader = counter->group_leader;
220
221 /*
222 * Depending on whether it is a standalone or sibling counter,
223 * add it straight to the context's counter list, or to the group
224 * leader's sibling list:
225 */
226 if (group_leader == counter)
227 list_add_tail(&counter->list_entry, &ctx->counter_list);
228 else {
229 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
230 group_leader->nr_siblings++;
231 }
232
233 list_add_rcu(&counter->event_entry, &ctx->event_list);
234 ctx->nr_counters++;
235}
236
237/*
238 * Remove a counter from the lists for its context.
239 * Must be called with ctx->mutex and ctx->lock held.
240 */
241static void
242list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
243{
244 struct perf_counter *sibling, *tmp;
245
246 if (list_empty(&counter->list_entry))
247 return;
248 ctx->nr_counters--;
249
250 list_del_init(&counter->list_entry);
251 list_del_rcu(&counter->event_entry);
252
253 if (counter->group_leader != counter)
254 counter->group_leader->nr_siblings--;
255
256 /*
257 * If this was a group counter with sibling counters then
258 * upgrade the siblings to singleton counters by adding them
259 * to the context list directly:
260 */
261 list_for_each_entry_safe(sibling, tmp,
262 &counter->sibling_list, list_entry) {
263
264 list_move_tail(&sibling->list_entry, &ctx->counter_list);
265 sibling->group_leader = sibling;
266 }
267}
268
269static void
270counter_sched_out(struct perf_counter *counter,
271 struct perf_cpu_context *cpuctx,
272 struct perf_counter_context *ctx)
273{
274 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
275 return;
276
277 counter->state = PERF_COUNTER_STATE_INACTIVE;
278 counter->tstamp_stopped = ctx->time;
279 counter->pmu->disable(counter);
280 counter->oncpu = -1;
281
282 if (!is_software_counter(counter))
283 cpuctx->active_oncpu--;
284 ctx->nr_active--;
285 if (counter->attr.exclusive || !cpuctx->active_oncpu)
286 cpuctx->exclusive = 0;
287}
288
289static void
290group_sched_out(struct perf_counter *group_counter,
291 struct perf_cpu_context *cpuctx,
292 struct perf_counter_context *ctx)
293{
294 struct perf_counter *counter;
295
296 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
297 return;
298
299 counter_sched_out(group_counter, cpuctx, ctx);
300
301 /*
302 * Schedule out siblings (if any):
303 */
304 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
305 counter_sched_out(counter, cpuctx, ctx);
306
307 if (group_counter->attr.exclusive)
308 cpuctx->exclusive = 0;
309}
310
311/*
312 * Cross CPU call to remove a performance counter
313 *
314 * We disable the counter on the hardware level first. After that we
315 * remove it from the context list.
316 */
317static void __perf_counter_remove_from_context(void *info)
318{
319 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
320 struct perf_counter *counter = info;
321 struct perf_counter_context *ctx = counter->ctx;
322
323 /*
324 * If this is a task context, we need to check whether it is
325 * the current task context of this cpu. If not it has been
326 * scheduled out before the smp call arrived.
327 */
328 if (ctx->task && cpuctx->task_ctx != ctx)
329 return;
330
331 spin_lock(&ctx->lock);
332 /*
333 * Protect the list operation against NMI by disabling the
334 * counters on a global level.
335 */
336 perf_disable();
337
338 counter_sched_out(counter, cpuctx, ctx);
339
340 list_del_counter(counter, ctx);
341
342 if (!ctx->task) {
343 /*
344 * Allow more per task counters with respect to the
345 * reservation:
346 */
347 cpuctx->max_pertask =
348 min(perf_max_counters - ctx->nr_counters,
349 perf_max_counters - perf_reserved_percpu);
350 }
351
352 perf_enable();
353 spin_unlock(&ctx->lock);
354}
355
356
357/*
358 * Remove the counter from a task's (or a CPU's) list of counters.
359 *
360 * Must be called with ctx->mutex held.
361 *
362 * CPU counters are removed with a smp call. For task counters we only
363 * call when the task is on a CPU.
364 *
365 * If counter->ctx is a cloned context, callers must make sure that
366 * every task struct that counter->ctx->task could possibly point to
367 * remains valid. This is OK when called from perf_release since
368 * that only calls us on the top-level context, which can't be a clone.
369 * When called from perf_counter_exit_task, it's OK because the
370 * context has been detached from its task.
371 */
372static void perf_counter_remove_from_context(struct perf_counter *counter)
373{
374 struct perf_counter_context *ctx = counter->ctx;
375 struct task_struct *task = ctx->task;
376
377 if (!task) {
378 /*
379 * Per cpu counters are removed via an smp call and
380 * the removal is always successful.
381 */
382 smp_call_function_single(counter->cpu,
383 __perf_counter_remove_from_context,
384 counter, 1);
385 return;
386 }
387
388retry:
389 task_oncpu_function_call(task, __perf_counter_remove_from_context,
390 counter);
391
392 spin_lock_irq(&ctx->lock);
393 /*
394 * If the context is active we need to retry the smp call.
395 */
396 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
397 spin_unlock_irq(&ctx->lock);
398 goto retry;
399 }
400
401 /*
402 * The lock prevents this context from being scheduled in, so we
403 * can remove the counter safely if the call above did not
404 * succeed.
405 */
406 if (!list_empty(&counter->list_entry)) {
407 list_del_counter(counter, ctx);
408 }
409 spin_unlock_irq(&ctx->lock);
410}
411
412static inline u64 perf_clock(void)
413{
414 return cpu_clock(smp_processor_id());
415}
416
417/*
418 * Update the record of the current time in a context.
419 */
420static void update_context_time(struct perf_counter_context *ctx)
421{
422 u64 now = perf_clock();
423
424 ctx->time += now - ctx->timestamp;
425 ctx->timestamp = now;
426}
427
428/*
429 * Update the total_time_enabled and total_time_running fields for a counter.
430 */
431static void update_counter_times(struct perf_counter *counter)
432{
433 struct perf_counter_context *ctx = counter->ctx;
434 u64 run_end;
435
436 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
437 return;
438
439 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
440
441 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
442 run_end = counter->tstamp_stopped;
443 else
444 run_end = ctx->time;
445
446 counter->total_time_running = run_end - counter->tstamp_running;
447}
448
449/*
450 * Update total_time_enabled and total_time_running for all counters in a group.
451 */
452static void update_group_times(struct perf_counter *leader)
453{
454 struct perf_counter *counter;
455
456 update_counter_times(leader);
457 list_for_each_entry(counter, &leader->sibling_list, list_entry)
458 update_counter_times(counter);
459}
460
461/*
462 * Cross CPU call to disable a performance counter
463 */
464static void __perf_counter_disable(void *info)
465{
466 struct perf_counter *counter = info;
467 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
468 struct perf_counter_context *ctx = counter->ctx;
469
470 /*
471 * If this is a per-task counter, need to check whether this
472 * counter's task is the current task on this cpu.
473 */
474 if (ctx->task && cpuctx->task_ctx != ctx)
475 return;
476
477 spin_lock(&ctx->lock);
478
479 /*
480 * If the counter is on, turn it off.
481 * If it is in error state, leave it in error state.
482 */
483 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
484 update_context_time(ctx);
485 update_counter_times(counter);
486 if (counter == counter->group_leader)
487 group_sched_out(counter, cpuctx, ctx);
488 else
489 counter_sched_out(counter, cpuctx, ctx);
490 counter->state = PERF_COUNTER_STATE_OFF;
491 }
492
493 spin_unlock(&ctx->lock);
494}
495
496/*
497 * Disable a counter.
498 *
499 * If counter->ctx is a cloned context, callers must make sure that
500 * every task struct that counter->ctx->task could possibly point to
501 * remains valid. This condition is satisfied when called through
502 * perf_counter_for_each_child or perf_counter_for_each because they
503 * hold the top-level counter's child_mutex, so any descendant that
504 * goes to exit will block in sync_child_counter.
505 * When called from perf_pending_counter it's OK because counter->ctx
506 * is the current context on this CPU and preemption is disabled,
507 * hence we can't get into perf_counter_task_sched_out for this context.
508 */
509static void perf_counter_disable(struct perf_counter *counter)
510{
511 struct perf_counter_context *ctx = counter->ctx;
512 struct task_struct *task = ctx->task;
513
514 if (!task) {
515 /*
516 * Disable the counter on the cpu that it's on
517 */
518 smp_call_function_single(counter->cpu, __perf_counter_disable,
519 counter, 1);
520 return;
521 }
522
523 retry:
524 task_oncpu_function_call(task, __perf_counter_disable, counter);
525
526 spin_lock_irq(&ctx->lock);
527 /*
528 * If the counter is still active, we need to retry the cross-call.
529 */
530 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
531 spin_unlock_irq(&ctx->lock);
532 goto retry;
533 }
534
535 /*
536 * Since we have the lock this context can't be scheduled
537 * in, so we can change the state safely.
538 */
539 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
540 update_counter_times(counter);
541 counter->state = PERF_COUNTER_STATE_OFF;
542 }
543
544 spin_unlock_irq(&ctx->lock);
545}
546
547static int
548counter_sched_in(struct perf_counter *counter,
549 struct perf_cpu_context *cpuctx,
550 struct perf_counter_context *ctx,
551 int cpu)
552{
553 if (counter->state <= PERF_COUNTER_STATE_OFF)
554 return 0;
555
556 counter->state = PERF_COUNTER_STATE_ACTIVE;
557 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
558 /*
559 * The new state must be visible before we turn it on in the hardware:
560 */
561 smp_wmb();
562
563 if (counter->pmu->enable(counter)) {
564 counter->state = PERF_COUNTER_STATE_INACTIVE;
565 counter->oncpu = -1;
566 return -EAGAIN;
567 }
568
569 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
570
571 if (!is_software_counter(counter))
572 cpuctx->active_oncpu++;
573 ctx->nr_active++;
574
575 if (counter->attr.exclusive)
576 cpuctx->exclusive = 1;
577
578 return 0;
579}
580
581static int
582group_sched_in(struct perf_counter *group_counter,
583 struct perf_cpu_context *cpuctx,
584 struct perf_counter_context *ctx,
585 int cpu)
586{
587 struct perf_counter *counter, *partial_group;
588 int ret;
589
590 if (group_counter->state == PERF_COUNTER_STATE_OFF)
591 return 0;
592
593 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
594 if (ret)
595 return ret < 0 ? ret : 0;
596
597 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
598 return -EAGAIN;
599
600 /*
601 * Schedule in siblings as one group (if any):
602 */
603 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
604 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
605 partial_group = counter;
606 goto group_error;
607 }
608 }
609
610 return 0;
611
612group_error:
613 /*
614 * Groups can be scheduled in as one unit only, so undo any
615 * partial group before returning:
616 */
617 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
618 if (counter == partial_group)
619 break;
620 counter_sched_out(counter, cpuctx, ctx);
621 }
622 counter_sched_out(group_counter, cpuctx, ctx);
623
624 return -EAGAIN;
625}
626
627/*
628 * Return 1 for a group consisting entirely of software counters,
629 * 0 if the group contains any hardware counters.
630 */
631static int is_software_only_group(struct perf_counter *leader)
632{
633 struct perf_counter *counter;
634
635 if (!is_software_counter(leader))
636 return 0;
637
638 list_for_each_entry(counter, &leader->sibling_list, list_entry)
639 if (!is_software_counter(counter))
640 return 0;
641
642 return 1;
643}
644
645/*
646 * Work out whether we can put this counter group on the CPU now.
647 */
648static int group_can_go_on(struct perf_counter *counter,
649 struct perf_cpu_context *cpuctx,
650 int can_add_hw)
651{
652 /*
653 * Groups consisting entirely of software counters can always go on.
654 */
655 if (is_software_only_group(counter))
656 return 1;
657 /*
658 * If an exclusive group is already on, no other hardware
659 * counters can go on.
660 */
661 if (cpuctx->exclusive)
662 return 0;
663 /*
664 * If this group is exclusive and there are already
665 * counters on the CPU, it can't go on.
666 */
667 if (counter->attr.exclusive && cpuctx->active_oncpu)
668 return 0;
669 /*
670 * Otherwise, try to add it if all previous groups were able
671 * to go on.
672 */
673 return can_add_hw;
674}
675
676static void add_counter_to_ctx(struct perf_counter *counter,
677 struct perf_counter_context *ctx)
678{
679 list_add_counter(counter, ctx);
680 counter->tstamp_enabled = ctx->time;
681 counter->tstamp_running = ctx->time;
682 counter->tstamp_stopped = ctx->time;
683}
684
685/*
686 * Cross CPU call to install and enable a performance counter
687 *
688 * Must be called with ctx->mutex held
689 */
690static void __perf_install_in_context(void *info)
691{
692 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
693 struct perf_counter *counter = info;
694 struct perf_counter_context *ctx = counter->ctx;
695 struct perf_counter *leader = counter->group_leader;
696 int cpu = smp_processor_id();
697 int err;
698
699 /*
700 * If this is a task context, we need to check whether it is
701 * the current task context of this cpu. If not it has been
702 * scheduled out before the smp call arrived.
703 * Or possibly this is the right context but it isn't
704 * on this cpu because it had no counters.
705 */
706 if (ctx->task && cpuctx->task_ctx != ctx) {
707 if (cpuctx->task_ctx || ctx->task != current)
708 return;
709 cpuctx->task_ctx = ctx;
710 }
711
712 spin_lock(&ctx->lock);
713 ctx->is_active = 1;
714 update_context_time(ctx);
715
716 /*
717 * Protect the list operation against NMI by disabling the
718 * counters on a global level. NOP for non NMI based counters.
719 */
720 perf_disable();
721
722 add_counter_to_ctx(counter, ctx);
723
724 /*
725 * Don't put the counter on if it is disabled or if
726 * it is in a group and the group isn't on.
727 */
728 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
729 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
730 goto unlock;
731
732 /*
733 * An exclusive counter can't go on if there are already active
734 * hardware counters, and no hardware counter can go on if there
735 * is already an exclusive counter on.
736 */
737 if (!group_can_go_on(counter, cpuctx, 1))
738 err = -EEXIST;
739 else
740 err = counter_sched_in(counter, cpuctx, ctx, cpu);
741
742 if (err) {
743 /*
744 * This counter couldn't go on. If it is in a group
745 * then we have to pull the whole group off.
746 * If the counter group is pinned then put it in error state.
747 */
748 if (leader != counter)
749 group_sched_out(leader, cpuctx, ctx);
750 if (leader->attr.pinned) {
751 update_group_times(leader);
752 leader->state = PERF_COUNTER_STATE_ERROR;
753 }
754 }
755
756 if (!err && !ctx->task && cpuctx->max_pertask)
757 cpuctx->max_pertask--;
758
759 unlock:
760 perf_enable();
761
762 spin_unlock(&ctx->lock);
763}
764
765/*
766 * Attach a performance counter to a context
767 *
768 * First we add the counter to the list with the hardware enable bit
769 * in counter->hw_config cleared.
770 *
771 * If the counter is attached to a task which is on a CPU we use a smp
772 * call to enable it in the task context. The task might have been
773 * scheduled away, but we check this in the smp call again.
774 *
775 * Must be called with ctx->mutex held.
776 */
777static void
778perf_install_in_context(struct perf_counter_context *ctx,
779 struct perf_counter *counter,
780 int cpu)
781{
782 struct task_struct *task = ctx->task;
783
784 if (!task) {
785 /*
786 * Per cpu counters are installed via an smp call and
787 * the install is always successful.
788 */
789 smp_call_function_single(cpu, __perf_install_in_context,
790 counter, 1);
791 return;
792 }
793
794retry:
795 task_oncpu_function_call(task, __perf_install_in_context,
796 counter);
797
798 spin_lock_irq(&ctx->lock);
799 /*
800 * If the context is active and the counter is not on a list yet, we need to retry the smp call.
801 */
802 if (ctx->is_active && list_empty(&counter->list_entry)) {
803 spin_unlock_irq(&ctx->lock);
804 goto retry;
805 }
806
807 /*
808 * The lock prevents this context from being scheduled in, so we
809 * can add the counter safely if the call above did not
810 * succeed.
811 */
812 if (list_empty(&counter->list_entry))
813 add_counter_to_ctx(counter, ctx);
814 spin_unlock_irq(&ctx->lock);
815}
816
817/*
818 * Cross CPU call to enable a performance counter
819 */
820static void __perf_counter_enable(void *info)
821{
822 struct perf_counter *counter = info;
823 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
824 struct perf_counter_context *ctx = counter->ctx;
825 struct perf_counter *leader = counter->group_leader;
826 int err;
827
828 /*
829 * If this is a per-task counter, need to check whether this
830 * counter's task is the current task on this cpu.
831 */
832 if (ctx->task && cpuctx->task_ctx != ctx) {
833 if (cpuctx->task_ctx || ctx->task != current)
834 return;
835 cpuctx->task_ctx = ctx;
836 }
837
838 spin_lock(&ctx->lock);
839 ctx->is_active = 1;
840 update_context_time(ctx);
841
842 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
843 goto unlock;
844 counter->state = PERF_COUNTER_STATE_INACTIVE;
845 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
846
847 /*
848 * If the counter is in a group and isn't the group leader,
849 * then don't put it on unless the group is on.
850 */
851 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
852 goto unlock;
853
854 if (!group_can_go_on(counter, cpuctx, 1)) {
855 err = -EEXIST;
856 } else {
857 perf_disable();
858 if (counter == leader)
859 err = group_sched_in(counter, cpuctx, ctx,
860 smp_processor_id());
861 else
862 err = counter_sched_in(counter, cpuctx, ctx,
863 smp_processor_id());
864 perf_enable();
865 }
866
867 if (err) {
868 /*
869 * If this counter can't go on and it's part of a
870 * group, then the whole group has to come off.
871 */
872 if (leader != counter)
873 group_sched_out(leader, cpuctx, ctx);
874 if (leader->attr.pinned) {
875 update_group_times(leader);
876 leader->state = PERF_COUNTER_STATE_ERROR;
877 }
878 }
879
880 unlock:
881 spin_unlock(&ctx->lock);
882}
883
884/*
885 * Enable a counter.
886 *
887 * If counter->ctx is a cloned context, callers must make sure that
888 * every task struct that counter->ctx->task could possibly point to
889 * remains valid. This condition is satisfied when called through
890 * perf_counter_for_each_child or perf_counter_for_each as described
891 * for perf_counter_disable.
892 */
893static void perf_counter_enable(struct perf_counter *counter)
894{
895 struct perf_counter_context *ctx = counter->ctx;
896 struct task_struct *task = ctx->task;
897
898 if (!task) {
899 /*
900 * Enable the counter on the cpu that it's on
901 */
902 smp_call_function_single(counter->cpu, __perf_counter_enable,
903 counter, 1);
904 return;
905 }
906
907 spin_lock_irq(&ctx->lock);
908 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
909 goto out;
910
911 /*
912 * If the counter is in error state, clear that first.
913 * That way, if we see the counter in error state below, we
914 * know that it has gone back into error state, as distinct
915 * from the task having been scheduled away before the
916 * cross-call arrived.
917 */
918 if (counter->state == PERF_COUNTER_STATE_ERROR)
919 counter->state = PERF_COUNTER_STATE_OFF;
920
921 retry:
922 spin_unlock_irq(&ctx->lock);
923 task_oncpu_function_call(task, __perf_counter_enable, counter);
924
925 spin_lock_irq(&ctx->lock);
926
927 /*
928 * If the context is active and the counter is still off,
929 * we need to retry the cross-call.
930 */
931 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
932 goto retry;
933
934 /*
935 * Since we have the lock this context can't be scheduled
936 * in, so we can change the state safely.
937 */
938 if (counter->state == PERF_COUNTER_STATE_OFF) {
939 counter->state = PERF_COUNTER_STATE_INACTIVE;
940 counter->tstamp_enabled =
941 ctx->time - counter->total_time_enabled;
942 }
943 out:
944 spin_unlock_irq(&ctx->lock);
945}
946
947static int perf_counter_refresh(struct perf_counter *counter, int refresh)
948{
949 /*
950 * not supported on inherited counters
951 */
952 if (counter->attr.inherit)
953 return -EINVAL;
954
955 atomic_add(refresh, &counter->event_limit);
956 perf_counter_enable(counter);
957
958 return 0;
959}
960
961void __perf_counter_sched_out(struct perf_counter_context *ctx,
962 struct perf_cpu_context *cpuctx)
963{
964 struct perf_counter *counter;
965
966 spin_lock(&ctx->lock);
967 ctx->is_active = 0;
968 if (likely(!ctx->nr_counters))
969 goto out;
970 update_context_time(ctx);
971
972 perf_disable();
973 if (ctx->nr_active) {
974 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
975 if (counter != counter->group_leader)
976 counter_sched_out(counter, cpuctx, ctx);
977 else
978 group_sched_out(counter, cpuctx, ctx);
979 }
980 }
981 perf_enable();
982 out:
983 spin_unlock(&ctx->lock);
984}
985
986/*
987 * Test whether two contexts are equivalent, i.e. whether they
988 * have both been cloned from the same version of the same context
989 * and they both have the same number of enabled counters.
990 * If the number of enabled counters is the same, then the set
991 * of enabled counters should be the same, because these are both
992 * inherited contexts, therefore we can't access individual counters
993 * in them directly with an fd; we can only enable/disable all
994 * counters via prctl, or enable/disable all counters in a family
995 * via ioctl, which will have the same effect on both contexts.
996 */
997static int context_equiv(struct perf_counter_context *ctx1,
998 struct perf_counter_context *ctx2)
999{
1000 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1001 && ctx1->parent_gen == ctx2->parent_gen
1002 && !ctx1->pin_count && !ctx2->pin_count;
1003}
1004
1005/*
1006 * Called from scheduler to remove the counters of the current task,
1007 * with interrupts disabled.
1008 *
1009 * We stop each counter and update the counter value in counter->count.
1010 *
1011 * This does not protect us against NMI, but disable()
1012 * sets the disabled bit in the control field of counter _before_
1013 * accessing the counter control register. If a NMI hits, then it will
1014 * not restart the counter.
1015 */
1016void perf_counter_task_sched_out(struct task_struct *task,
1017 struct task_struct *next, int cpu)
1018{
1019 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1020 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1021 struct perf_counter_context *next_ctx;
1022 struct perf_counter_context *parent;
1023 struct pt_regs *regs;
1024 int do_switch = 1;
1025
1026 regs = task_pt_regs(task);
1027 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1028
1029 if (likely(!ctx || !cpuctx->task_ctx))
1030 return;
1031
1032 update_context_time(ctx);
1033
1034 rcu_read_lock();
1035 parent = rcu_dereference(ctx->parent_ctx);
1036 next_ctx = next->perf_counter_ctxp;
1037 if (parent && next_ctx &&
1038 rcu_dereference(next_ctx->parent_ctx) == parent) {
1039 /*
1040 * Looks like the two contexts are clones, so we might be
1041 * able to optimize the context switch. We lock both
1042 * contexts and check that they are clones under the
1043 * lock (including re-checking that neither has been
1044 * uncloned in the meantime). It doesn't matter which
1045 * order we take the locks because no other cpu could
1046 * be trying to lock both of these tasks.
1047 */
1048 spin_lock(&ctx->lock);
1049 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1050 if (context_equiv(ctx, next_ctx)) {
1051 /*
1052 * XXX do we need a memory barrier of sorts
1053 * wrt to rcu_dereference() of perf_counter_ctxp
1054 */
1055 task->perf_counter_ctxp = next_ctx;
1056 next->perf_counter_ctxp = ctx;
1057 ctx->task = next;
1058 next_ctx->task = task;
1059 do_switch = 0;
1060 }
1061 spin_unlock(&next_ctx->lock);
1062 spin_unlock(&ctx->lock);
1063 }
1064 rcu_read_unlock();
1065
1066 if (do_switch) {
1067 __perf_counter_sched_out(ctx, cpuctx);
1068 cpuctx->task_ctx = NULL;
1069 }
1070}
1071
1072/*
1073 * Called with IRQs disabled
1074 */
1075static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1076{
1077 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1078
1079 if (!cpuctx->task_ctx)
1080 return;
1081
1082 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1083 return;
1084
1085 __perf_counter_sched_out(ctx, cpuctx);
1086 cpuctx->task_ctx = NULL;
1087}
1088
1089/*
1090 * Called with IRQs disabled
1091 */
1092static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1093{
1094 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1095}
1096
1097static void
1098__perf_counter_sched_in(struct perf_counter_context *ctx,
1099 struct perf_cpu_context *cpuctx, int cpu)
1100{
1101 struct perf_counter *counter;
1102 int can_add_hw = 1;
1103
1104 spin_lock(&ctx->lock);
1105 ctx->is_active = 1;
1106 if (likely(!ctx->nr_counters))
1107 goto out;
1108
1109 ctx->timestamp = perf_clock();
1110
1111 perf_disable();
1112
1113 /*
1114 * First go through the list and put on any pinned groups
1115 * in order to give them the best chance of going on.
1116 */
1117 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1118 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1119 !counter->attr.pinned)
1120 continue;
1121 if (counter->cpu != -1 && counter->cpu != cpu)
1122 continue;
1123
1124 if (counter != counter->group_leader)
1125 counter_sched_in(counter, cpuctx, ctx, cpu);
1126 else {
1127 if (group_can_go_on(counter, cpuctx, 1))
1128 group_sched_in(counter, cpuctx, ctx, cpu);
1129 }
1130
1131 /*
1132 * If this pinned group hasn't been scheduled,
1133 * put it in error state.
1134 */
1135 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1136 update_group_times(counter);
1137 counter->state = PERF_COUNTER_STATE_ERROR;
1138 }
1139 }
1140
1141 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1142 /*
1143 * Ignore counters in OFF or ERROR state, and
1144 * ignore pinned counters since we did them already.
1145 */
1146 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1147 counter->attr.pinned)
1148 continue;
1149
1150 /*
1151 * Listen to the 'cpu' scheduling filter constraint
1152 * of counters:
1153 */
1154 if (counter->cpu != -1 && counter->cpu != cpu)
1155 continue;
1156
1157 if (counter != counter->group_leader) {
1158 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1159 can_add_hw = 0;
1160 } else {
1161 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1162 if (group_sched_in(counter, cpuctx, ctx, cpu))
1163 can_add_hw = 0;
1164 }
1165 }
1166 }
1167 perf_enable();
1168 out:
1169 spin_unlock(&ctx->lock);
1170}
1171
1172/*
1173 * Called from scheduler to add the counters of the current task
1174 * with interrupts disabled.
1175 *
1176 * We restore the counter value and then enable it.
1177 *
1178 * This does not protect us against NMI, but enable()
1179 * sets the enabled bit in the control field of counter _before_
1180 * accessing the counter control register. If a NMI hits, then it will
1181 * keep the counter running.
1182 */
1183void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1184{
1185 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1186 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1187
1188 if (likely(!ctx))
1189 return;
1190 if (cpuctx->task_ctx == ctx)
1191 return;
1192 __perf_counter_sched_in(ctx, cpuctx, cpu);
1193 cpuctx->task_ctx = ctx;
1194}
1195
1196static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1197{
1198 struct perf_counter_context *ctx = &cpuctx->ctx;
1199
1200 __perf_counter_sched_in(ctx, cpuctx, cpu);
1201}
1202
/*
 * Sentinel for hw_perf_counter::interrupts: when perf_ctx_adjust_freq()
 * finds this value on the tick it logs and unthrottles the counter.
 */
#define MAX_INTERRUPTS (~0ULL)

/* Forward declarations for the logging helpers used just below. */
static void perf_log_throttle(struct perf_counter *counter, int enable);
static void perf_log_period(struct perf_counter *counter, u64 period);
1207
1208static void perf_adjust_period(struct perf_counter *counter, u64 events)
1209{
1210 struct hw_perf_counter *hwc = &counter->hw;
1211 u64 period, sample_period;
1212 s64 delta;
1213
1214 events *= hwc->sample_period;
1215 period = div64_u64(events, counter->attr.sample_freq);
1216
1217 delta = (s64)(period - hwc->sample_period);
1218 delta = (delta + 7) / 8; /* low pass filter */
1219
1220 sample_period = hwc->sample_period + delta;
1221
1222 if (!sample_period)
1223 sample_period = 1;
1224
1225 perf_log_period(counter, sample_period);
1226
1227 hwc->sample_period = sample_period;
1228}
1229
1230static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1231{
1232 struct perf_counter *counter;
1233 struct hw_perf_counter *hwc;
1234 u64 interrupts, freq;
1235
1236 spin_lock(&ctx->lock);
1237 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1238 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1239 continue;
1240
1241 hwc = &counter->hw;
1242
1243 interrupts = hwc->interrupts;
1244 hwc->interrupts = 0;
1245
1246 /*
1247 * unthrottle counters on the tick
1248 */
1249 if (interrupts == MAX_INTERRUPTS) {
1250 perf_log_throttle(counter, 1);
1251 counter->pmu->unthrottle(counter);
1252 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1253 }
1254
1255 if (!counter->attr.freq || !counter->attr.sample_freq)
1256 continue;
1257
1258 /*
1259 * if the specified freq < HZ then we need to skip ticks
1260 */
1261 if (counter->attr.sample_freq < HZ) {
1262 freq = counter->attr.sample_freq;
1263
1264 hwc->freq_count += freq;
1265 hwc->freq_interrupts += interrupts;
1266
1267 if (hwc->freq_count < HZ)
1268 continue;
1269
1270 interrupts = hwc->freq_interrupts;
1271 hwc->freq_interrupts = 0;
1272 hwc->freq_count -= HZ;
1273 } else
1274 freq = HZ;
1275
1276 perf_adjust_period(counter, freq * interrupts);
1277
1278 /*
1279 * In order to avoid being stalled by an (accidental) huge
1280 * sample period, force reset the sample period if we didn't
1281 * get any events in this freq period.
1282 */
1283 if (!interrupts) {
1284 perf_disable();
1285 counter->pmu->disable(counter);
1286 atomic_set(&hwc->period_left, 0);
1287 counter->pmu->enable(counter);
1288 perf_enable();
1289 }
1290 }
1291 spin_unlock(&ctx->lock);
1292}
1293
1294/*
1295 * Round-robin a context's counters:
1296 */
1297static void rotate_ctx(struct perf_counter_context *ctx)
1298{
1299 struct perf_counter *counter;
1300
1301 if (!ctx->nr_counters)
1302 return;
1303
1304 spin_lock(&ctx->lock);
1305 /*
1306 * Rotate the first entry last (works just fine for group counters too):
1307 */
1308 perf_disable();
1309 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1310 list_move_tail(&counter->list_entry, &ctx->counter_list);
1311 break;
1312 }
1313 perf_enable();
1314
1315 spin_unlock(&ctx->lock);
1316}
1317
1318void perf_counter_task_tick(struct task_struct *curr, int cpu)
1319{
1320 struct perf_cpu_context *cpuctx;
1321 struct perf_counter_context *ctx;
1322
1323 if (!atomic_read(&nr_counters))
1324 return;
1325
1326 cpuctx = &per_cpu(perf_cpu_context, cpu);
1327 ctx = curr->perf_counter_ctxp;
1328
1329 perf_ctx_adjust_freq(&cpuctx->ctx);
1330 if (ctx)
1331 perf_ctx_adjust_freq(ctx);
1332
1333 perf_counter_cpu_sched_out(cpuctx);
1334 if (ctx)
1335 __perf_counter_task_sched_out(ctx);
1336
1337 rotate_ctx(&cpuctx->ctx);
1338 if (ctx)
1339 rotate_ctx(ctx);
1340
1341 perf_counter_cpu_sched_in(cpuctx, cpu);
1342 if (ctx)
1343 perf_counter_task_sched_in(curr, cpu);
1344}
1345
1346/*
1347 * Cross CPU call to read the hardware counter
1348 */
1349static void __read(void *info)
1350{
1351 struct perf_counter *counter = info;
1352 struct perf_counter_context *ctx = counter->ctx;
1353 unsigned long flags;
1354
1355 local_irq_save(flags);
1356 if (ctx->is_active)
1357 update_context_time(ctx);
1358 counter->pmu->read(counter);
1359 update_counter_times(counter);
1360 local_irq_restore(flags);
1361}
1362
1363static u64 perf_counter_read(struct perf_counter *counter)
1364{
1365 /*
1366 * If counter is enabled and currently active on a CPU, update the
1367 * value in the counter structure:
1368 */
1369 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1370 smp_call_function_single(counter->oncpu,
1371 __read, counter, 1);
1372 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1373 update_counter_times(counter);
1374 }
1375
1376 return atomic64_read(&counter->count);
1377}
1378
1379/*
1380 * Initialize the perf_counter context in a task_struct:
1381 */
1382static void
1383__perf_counter_init_context(struct perf_counter_context *ctx,
1384 struct task_struct *task)
1385{
1386 memset(ctx, 0, sizeof(*ctx));
1387 spin_lock_init(&ctx->lock);
1388 mutex_init(&ctx->mutex);
1389 INIT_LIST_HEAD(&ctx->counter_list);
1390 INIT_LIST_HEAD(&ctx->event_list);
1391 atomic_set(&ctx->refcount, 1);
1392 ctx->task = task;
1393}
1394
1395static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1396{
1397 struct perf_counter_context *parent_ctx;
1398 struct perf_counter_context *ctx;
1399 struct perf_cpu_context *cpuctx;
1400 struct task_struct *task;
1401 unsigned long flags;
1402 int err;
1403
1404 /*
1405 * If cpu is not a wildcard then this is a percpu counter:
1406 */
1407 if (cpu != -1) {
1408 /* Must be root to operate on a CPU counter: */
1409 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1410 return ERR_PTR(-EACCES);
1411
1412 if (cpu < 0 || cpu > num_possible_cpus())
1413 return ERR_PTR(-EINVAL);
1414
1415 /*
1416 * We could be clever and allow to attach a counter to an
1417 * offline CPU and activate it when the CPU comes up, but
1418 * that's for later.
1419 */
1420 if (!cpu_isset(cpu, cpu_online_map))
1421 return ERR_PTR(-ENODEV);
1422
1423 cpuctx = &per_cpu(perf_cpu_context, cpu);
1424 ctx = &cpuctx->ctx;
1425 get_ctx(ctx);
1426
1427 return ctx;
1428 }
1429
1430 rcu_read_lock();
1431 if (!pid)
1432 task = current;
1433 else
1434 task = find_task_by_vpid(pid);
1435 if (task)
1436 get_task_struct(task);
1437 rcu_read_unlock();
1438
1439 if (!task)
1440 return ERR_PTR(-ESRCH);
1441
1442 /*
1443 * Can't attach counters to a dying task.
1444 */
1445 err = -ESRCH;
1446 if (task->flags & PF_EXITING)
1447 goto errout;
1448
1449 /* Reuse ptrace permission checks for now. */
1450 err = -EACCES;
1451 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1452 goto errout;
1453
1454 retry:
1455 ctx = perf_lock_task_context(task, &flags);
1456 if (ctx) {
1457 parent_ctx = ctx->parent_ctx;
1458 if (parent_ctx) {
1459 put_ctx(parent_ctx);
1460 ctx->parent_ctx = NULL; /* no longer a clone */
1461 }
1462 /*
1463 * Get an extra reference before dropping the lock so that
1464 * this context won't get freed if the task exits.
1465 */
1466 get_ctx(ctx);
1467 spin_unlock_irqrestore(&ctx->lock, flags);
1468 }
1469
1470 if (!ctx) {
1471 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1472 err = -ENOMEM;
1473 if (!ctx)
1474 goto errout;
1475 __perf_counter_init_context(ctx, task);
1476 get_ctx(ctx);
1477 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1478 /*
1479 * We raced with some other task; use
1480 * the context they set.
1481 */
1482 kfree(ctx);
1483 goto retry;
1484 }
1485 get_task_struct(task);
1486 }
1487
1488 put_task_struct(task);
1489 return ctx;
1490
1491 errout:
1492 put_task_struct(task);
1493 return ERR_PTR(err);
1494}
1495
1496static void free_counter_rcu(struct rcu_head *head)
1497{
1498 struct perf_counter *counter;
1499
1500 counter = container_of(head, struct perf_counter, rcu_head);
1501 if (counter->ns)
1502 put_pid_ns(counter->ns);
1503 kfree(counter);
1504}
1505
1506static void perf_pending_sync(struct perf_counter *counter);
1507
1508static void free_counter(struct perf_counter *counter)
1509{
1510 perf_pending_sync(counter);
1511
1512 atomic_dec(&nr_counters);
1513 if (counter->attr.mmap)
1514 atomic_dec(&nr_mmap_counters);
1515 if (counter->attr.comm)
1516 atomic_dec(&nr_comm_counters);
1517
1518 if (counter->destroy)
1519 counter->destroy(counter);
1520
1521 put_ctx(counter->ctx);
1522 call_rcu(&counter->rcu_head, free_counter_rcu);
1523}
1524
1525/*
1526 * Called when the last reference to the file is gone.
1527 */
1528static int perf_release(struct inode *inode, struct file *file)
1529{
1530 struct perf_counter *counter = file->private_data;
1531 struct perf_counter_context *ctx = counter->ctx;
1532
1533 file->private_data = NULL;
1534
1535 WARN_ON_ONCE(ctx->parent_ctx);
1536 mutex_lock(&ctx->mutex);
1537 perf_counter_remove_from_context(counter);
1538 mutex_unlock(&ctx->mutex);
1539
1540 mutex_lock(&counter->owner->perf_counter_mutex);
1541 list_del_init(&counter->owner_entry);
1542 mutex_unlock(&counter->owner->perf_counter_mutex);
1543 put_task_struct(counter->owner);
1544
1545 free_counter(counter);
1546
1547 return 0;
1548}
1549
1550/*
1551 * Read the performance counter - simple non blocking version for now
1552 */
1553static ssize_t
1554perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1555{
1556 u64 values[3];
1557 int n;
1558
1559 /*
1560 * Return end-of-file for a read on a counter that is in
1561 * error state (i.e. because it was pinned but it couldn't be
1562 * scheduled on to the CPU at some point).
1563 */
1564 if (counter->state == PERF_COUNTER_STATE_ERROR)
1565 return 0;
1566
1567 WARN_ON_ONCE(counter->ctx->parent_ctx);
1568 mutex_lock(&counter->child_mutex);
1569 values[0] = perf_counter_read(counter);
1570 n = 1;
1571 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1572 values[n++] = counter->total_time_enabled +
1573 atomic64_read(&counter->child_total_time_enabled);
1574 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1575 values[n++] = counter->total_time_running +
1576 atomic64_read(&counter->child_total_time_running);
1577 if (counter->attr.read_format & PERF_FORMAT_ID)
1578 values[n++] = counter->id;
1579 mutex_unlock(&counter->child_mutex);
1580
1581 if (count < n * sizeof(u64))
1582 return -EINVAL;
1583 count = n * sizeof(u64);
1584
1585 if (copy_to_user(buf, values, count))
1586 return -EFAULT;
1587
1588 return count;
1589}
1590
1591static ssize_t
1592perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1593{
1594 struct perf_counter *counter = file->private_data;
1595
1596 return perf_read_hw(counter, buf, count);
1597}
1598
1599static unsigned int perf_poll(struct file *file, poll_table *wait)
1600{
1601 struct perf_counter *counter = file->private_data;
1602 struct perf_mmap_data *data;
1603 unsigned int events = POLL_HUP;
1604
1605 rcu_read_lock();
1606 data = rcu_dereference(counter->data);
1607 if (data)
1608 events = atomic_xchg(&data->poll, 0);
1609 rcu_read_unlock();
1610
1611 poll_wait(file, &counter->waitq, wait);
1612
1613 return events;
1614}
1615
1616static void perf_counter_reset(struct perf_counter *counter)
1617{
1618 (void)perf_counter_read(counter);
1619 atomic64_set(&counter->count, 0);
1620 perf_counter_update_userpage(counter);
1621}
1622
1623static void perf_counter_for_each_sibling(struct perf_counter *counter,
1624 void (*func)(struct perf_counter *))
1625{
1626 struct perf_counter_context *ctx = counter->ctx;
1627 struct perf_counter *sibling;
1628
1629 WARN_ON_ONCE(ctx->parent_ctx);
1630 mutex_lock(&ctx->mutex);
1631 counter = counter->group_leader;
1632
1633 func(counter);
1634 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1635 func(sibling);
1636 mutex_unlock(&ctx->mutex);
1637}
1638
1639/*
1640 * Holding the top-level counter's child_mutex means that any
1641 * descendant process that has inherited this counter will block
1642 * in sync_child_counter if it goes to exit, thus satisfying the
1643 * task existence requirements of perf_counter_enable/disable.
1644 */
1645static void perf_counter_for_each_child(struct perf_counter *counter,
1646 void (*func)(struct perf_counter *))
1647{
1648 struct perf_counter *child;
1649
1650 WARN_ON_ONCE(counter->ctx->parent_ctx);
1651 mutex_lock(&counter->child_mutex);
1652 func(counter);
1653 list_for_each_entry(child, &counter->child_list, child_list)
1654 func(child);
1655 mutex_unlock(&counter->child_mutex);
1656}
1657
1658static void perf_counter_for_each(struct perf_counter *counter,
1659 void (*func)(struct perf_counter *))
1660{
1661 struct perf_counter *child;
1662
1663 WARN_ON_ONCE(counter->ctx->parent_ctx);
1664 mutex_lock(&counter->child_mutex);
1665 perf_counter_for_each_sibling(counter, func);
1666 list_for_each_entry(child, &counter->child_list, child_list)
1667 perf_counter_for_each_sibling(child, func);
1668 mutex_unlock(&counter->child_mutex);
1669}
1670
1671static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1672{
1673 struct perf_counter_context *ctx = counter->ctx;
1674 unsigned long size;
1675 int ret = 0;
1676 u64 value;
1677
1678 if (!counter->attr.sample_period)
1679 return -EINVAL;
1680
1681 size = copy_from_user(&value, arg, sizeof(value));
1682 if (size != sizeof(value))
1683 return -EFAULT;
1684
1685 if (!value)
1686 return -EINVAL;
1687
1688 spin_lock_irq(&ctx->lock);
1689 if (counter->attr.freq) {
1690 if (value > sysctl_perf_counter_sample_rate) {
1691 ret = -EINVAL;
1692 goto unlock;
1693 }
1694
1695 counter->attr.sample_freq = value;
1696 } else {
1697 perf_log_period(counter, value);
1698
1699 counter->attr.sample_period = value;
1700 counter->hw.sample_period = value;
1701 }
1702unlock:
1703 spin_unlock_irq(&ctx->lock);
1704
1705 return ret;
1706}
1707
1708static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1709{
1710 struct perf_counter *counter = file->private_data;
1711 void (*func)(struct perf_counter *);
1712 u32 flags = arg;
1713
1714 switch (cmd) {
1715 case PERF_COUNTER_IOC_ENABLE:
1716 func = perf_counter_enable;
1717 break;
1718 case PERF_COUNTER_IOC_DISABLE:
1719 func = perf_counter_disable;
1720 break;
1721 case PERF_COUNTER_IOC_RESET:
1722 func = perf_counter_reset;
1723 break;
1724
1725 case PERF_COUNTER_IOC_REFRESH:
1726 return perf_counter_refresh(counter, arg);
1727
1728 case PERF_COUNTER_IOC_PERIOD:
1729 return perf_counter_period(counter, (u64 __user *)arg);
1730
1731 default:
1732 return -ENOTTY;
1733 }
1734
1735 if (flags & PERF_IOC_FLAG_GROUP)
1736 perf_counter_for_each(counter, func);
1737 else
1738 perf_counter_for_each_child(counter, func);
1739
1740 return 0;
1741}
1742
1743int perf_counter_task_enable(void)
1744{
1745 struct perf_counter *counter;
1746
1747 mutex_lock(&current->perf_counter_mutex);
1748 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1749 perf_counter_for_each_child(counter, perf_counter_enable);
1750 mutex_unlock(&current->perf_counter_mutex);
1751
1752 return 0;
1753}
1754
1755int perf_counter_task_disable(void)
1756{
1757 struct perf_counter *counter;
1758
1759 mutex_lock(&current->perf_counter_mutex);
1760 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1761 perf_counter_for_each_child(counter, perf_counter_disable);
1762 mutex_unlock(&current->perf_counter_mutex);
1763
1764 return 0;
1765}
1766
1767/*
1768 * Callers need to ensure there can be no nesting of this function, otherwise
1769 * the seqlock logic goes bad. We can not serialize this because the arch
1770 * code calls this from NMI context.
1771 */
1772void perf_counter_update_userpage(struct perf_counter *counter)
1773{
1774 struct perf_counter_mmap_page *userpg;
1775 struct perf_mmap_data *data;
1776
1777 rcu_read_lock();
1778 data = rcu_dereference(counter->data);
1779 if (!data)
1780 goto unlock;
1781
1782 userpg = data->user_page;
1783
1784 /*
1785 * Disable preemption so as to not let the corresponding user-space
1786 * spin too long if we get preempted.
1787 */
1788 preempt_disable();
1789 ++userpg->lock;
1790 barrier();
1791 userpg->index = counter->hw.idx;
1792 userpg->offset = atomic64_read(&counter->count);
1793 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1794 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1795
1796 barrier();
1797 ++userpg->lock;
1798 preempt_enable();
1799unlock:
1800 rcu_read_unlock();
1801}
1802
1803static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1804{
1805 struct perf_counter *counter = vma->vm_file->private_data;
1806 struct perf_mmap_data *data;
1807 int ret = VM_FAULT_SIGBUS;
1808
1809 rcu_read_lock();
1810 data = rcu_dereference(counter->data);
1811 if (!data)
1812 goto unlock;
1813
1814 if (vmf->pgoff == 0) {
1815 vmf->page = virt_to_page(data->user_page);
1816 } else {
1817 int nr = vmf->pgoff - 1;
1818
1819 if ((unsigned)nr > data->nr_pages)
1820 goto unlock;
1821
1822 vmf->page = virt_to_page(data->data_pages[nr]);
1823 }
1824 get_page(vmf->page);
1825 ret = 0;
1826unlock:
1827 rcu_read_unlock();
1828
1829 return ret;
1830}
1831
1832static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1833{
1834 struct perf_mmap_data *data;
1835 unsigned long size;
1836 int i;
1837
1838 WARN_ON(atomic_read(&counter->mmap_count));
1839
1840 size = sizeof(struct perf_mmap_data);
1841 size += nr_pages * sizeof(void *);
1842
1843 data = kzalloc(size, GFP_KERNEL);
1844 if (!data)
1845 goto fail;
1846
1847 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1848 if (!data->user_page)
1849 goto fail_user_page;
1850
1851 for (i = 0; i < nr_pages; i++) {
1852 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1853 if (!data->data_pages[i])
1854 goto fail_data_pages;
1855 }
1856
1857 data->nr_pages = nr_pages;
1858 atomic_set(&data->lock, -1);
1859
1860 rcu_assign_pointer(counter->data, data);
1861
1862 return 0;
1863
1864fail_data_pages:
1865 for (i--; i >= 0; i--)
1866 free_page((unsigned long)data->data_pages[i]);
1867
1868 free_page((unsigned long)data->user_page);
1869
1870fail_user_page:
1871 kfree(data);
1872
1873fail:
1874 return -ENOMEM;
1875}
1876
1877static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1878{
1879 struct perf_mmap_data *data;
1880 int i;
1881
1882 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1883
1884 free_page((unsigned long)data->user_page);
1885 for (i = 0; i < data->nr_pages; i++)
1886 free_page((unsigned long)data->data_pages[i]);
1887 kfree(data);
1888}
1889
1890static void perf_mmap_data_free(struct perf_counter *counter)
1891{
1892 struct perf_mmap_data *data = counter->data;
1893
1894 WARN_ON(atomic_read(&counter->mmap_count));
1895
1896 rcu_assign_pointer(counter->data, NULL);
1897 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1898}
1899
1900static void perf_mmap_open(struct vm_area_struct *vma)
1901{
1902 struct perf_counter *counter = vma->vm_file->private_data;
1903
1904 atomic_inc(&counter->mmap_count);
1905}
1906
1907static void perf_mmap_close(struct vm_area_struct *vma)
1908{
1909 struct perf_counter *counter = vma->vm_file->private_data;
1910
1911 WARN_ON_ONCE(counter->ctx->parent_ctx);
1912 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
1913 struct user_struct *user = current_user();
1914
1915 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
1916 vma->vm_mm->locked_vm -= counter->data->nr_locked;
1917 perf_mmap_data_free(counter);
1918 mutex_unlock(&counter->mmap_mutex);
1919 }
1920}
1921
1922static struct vm_operations_struct perf_mmap_vmops = {
1923 .open = perf_mmap_open,
1924 .close = perf_mmap_close,
1925 .fault = perf_mmap_fault,
1926};
1927
1928static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1929{
1930 struct perf_counter *counter = file->private_data;
1931 unsigned long user_locked, user_lock_limit;
1932 struct user_struct *user = current_user();
1933 unsigned long locked, lock_limit;
1934 unsigned long vma_size;
1935 unsigned long nr_pages;
1936 long user_extra, extra;
1937 int ret = 0;
1938
1939 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1940 return -EINVAL;
1941
1942 vma_size = vma->vm_end - vma->vm_start;
1943 nr_pages = (vma_size / PAGE_SIZE) - 1;
1944
1945 /*
1946 * If we have data pages ensure they're a power-of-two number, so we
1947 * can do bitmasks instead of modulo.
1948 */
1949 if (nr_pages != 0 && !is_power_of_2(nr_pages))
1950 return -EINVAL;
1951
1952 if (vma_size != PAGE_SIZE * (1 + nr_pages))
1953 return -EINVAL;
1954
1955 if (vma->vm_pgoff != 0)
1956 return -EINVAL;
1957
1958 WARN_ON_ONCE(counter->ctx->parent_ctx);
1959 mutex_lock(&counter->mmap_mutex);
1960 if (atomic_inc_not_zero(&counter->mmap_count)) {
1961 if (nr_pages != counter->data->nr_pages)
1962 ret = -EINVAL;
1963 goto unlock;
1964 }
1965
1966 user_extra = nr_pages + 1;
1967 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
1968
1969 /*
1970 * Increase the limit linearly with more CPUs:
1971 */
1972 user_lock_limit *= num_online_cpus();
1973
1974 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
1975
1976 extra = 0;
1977 if (user_locked > user_lock_limit)
1978 extra = user_locked - user_lock_limit;
1979
1980 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1981 lock_limit >>= PAGE_SHIFT;
1982 locked = vma->vm_mm->locked_vm + extra;
1983
1984 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1985 ret = -EPERM;
1986 goto unlock;
1987 }
1988
1989 WARN_ON(counter->data);
1990 ret = perf_mmap_data_alloc(counter, nr_pages);
1991 if (ret)
1992 goto unlock;
1993
1994 atomic_set(&counter->mmap_count, 1);
1995 atomic_long_add(user_extra, &user->locked_vm);
1996 vma->vm_mm->locked_vm += extra;
1997 counter->data->nr_locked = extra;
1998unlock:
1999 mutex_unlock(&counter->mmap_mutex);
2000
2001 vma->vm_flags &= ~VM_MAYWRITE;
2002 vma->vm_flags |= VM_RESERVED;
2003 vma->vm_ops = &perf_mmap_vmops;
2004
2005 return ret;
2006}
2007
2008static int perf_fasync(int fd, struct file *filp, int on)
2009{
2010 struct inode *inode = filp->f_path.dentry->d_inode;
2011 struct perf_counter *counter = filp->private_data;
2012 int retval;
2013
2014 mutex_lock(&inode->i_mutex);
2015 retval = fasync_helper(fd, filp, on, &counter->fasync);
2016 mutex_unlock(&inode->i_mutex);
2017
2018 if (retval < 0)
2019 return retval;
2020
2021 return 0;
2022}
2023
2024static const struct file_operations perf_fops = {
2025 .release = perf_release,
2026 .read = perf_read,
2027 .poll = perf_poll,
2028 .unlocked_ioctl = perf_ioctl,
2029 .compat_ioctl = perf_ioctl,
2030 .mmap = perf_mmap,
2031 .fasync = perf_fasync,
2032};
2033
2034/*
2035 * Perf counter wakeup
2036 *
2037 * If there's data, ensure we set the poll() state and publish everything
2038 * to user-space before waking everybody up.
2039 */
2040
2041void perf_counter_wakeup(struct perf_counter *counter)
2042{
2043 wake_up_all(&counter->waitq);
2044
2045 if (counter->pending_kill) {
2046 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2047 counter->pending_kill = 0;
2048 }
2049}
2050
2051/*
2052 * Pending wakeups
2053 *
2054 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2055 *
2056 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2057 * single linked list and use cmpxchg() to add entries lockless.
2058 */
2059
2060static void perf_pending_counter(struct perf_pending_entry *entry)
2061{
2062 struct perf_counter *counter = container_of(entry,
2063 struct perf_counter, pending);
2064
2065 if (counter->pending_disable) {
2066 counter->pending_disable = 0;
2067 perf_counter_disable(counter);
2068 }
2069
2070 if (counter->pending_wakeup) {
2071 counter->pending_wakeup = 0;
2072 perf_counter_wakeup(counter);
2073 }
2074}
2075
2076#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2077
2078static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2079 PENDING_TAIL,
2080};
2081
2082static void perf_pending_queue(struct perf_pending_entry *entry,
2083 void (*func)(struct perf_pending_entry *))
2084{
2085 struct perf_pending_entry **head;
2086
2087 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2088 return;
2089
2090 entry->func = func;
2091
2092 head = &get_cpu_var(perf_pending_head);
2093
2094 do {
2095 entry->next = *head;
2096 } while (cmpxchg(head, entry->next, entry) != entry->next);
2097
2098 set_perf_counter_pending();
2099
2100 put_cpu_var(perf_pending_head);
2101}
2102
2103static int __perf_pending_run(void)
2104{
2105 struct perf_pending_entry *list;
2106 int nr = 0;
2107
2108 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2109 while (list != PENDING_TAIL) {
2110 void (*func)(struct perf_pending_entry *);
2111 struct perf_pending_entry *entry = list;
2112
2113 list = list->next;
2114
2115 func = entry->func;
2116 entry->next = NULL;
2117 /*
2118 * Ensure we observe the unqueue before we issue the wakeup,
2119 * so that we won't be waiting forever.
2120 * -- see perf_not_pending().
2121 */
2122 smp_wmb();
2123
2124 func(entry);
2125 nr++;
2126 }
2127
2128 return nr;
2129}
2130
2131static inline int perf_not_pending(struct perf_counter *counter)
2132{
2133 /*
2134 * If we flush on whatever cpu we run, there is a chance we don't
2135 * need to wait.
2136 */
2137 get_cpu();
2138 __perf_pending_run();
2139 put_cpu();
2140
2141 /*
2142 * Ensure we see the proper queue state before going to sleep
2143 * so that we do not miss the wakeup. -- see perf_pending_handle()
2144 */
2145 smp_rmb();
2146 return counter->pending.next == NULL;
2147}
2148
2149static void perf_pending_sync(struct perf_counter *counter)
2150{
2151 wait_event(counter->waitq, perf_not_pending(counter));
2152}
2153
/* Exported entry point: process the pending list of the current CPU. */
void perf_counter_do_pending(void)
{
	(void)__perf_pending_run();
}
2158
2159/*
2160 * Callchain support -- arch specific
2161 */
2162
2163__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2164{
2165 return NULL;
2166}
2167
2168/*
2169 * Output
2170 */
2171
/* State of one record being written between perf_output_begin()/_end(). */
struct perf_output_handle {
	struct perf_counter	*counter;	/* counter owning the buffer */
	struct perf_mmap_data	*data;		/* buffer being written */
	unsigned long		head, offset;	/* end of reservation, write position */
	int			nmi, overflow;	/* as passed to perf_output_begin() */
	int			locked;		/* set if this handle took data->lock */
	unsigned long		flags;		/* saved IRQ flags */
};
2182
2183static void perf_output_wakeup(struct perf_output_handle *handle)
2184{
2185 atomic_set(&handle->data->poll, POLL_IN);
2186
2187 if (handle->nmi) {
2188 handle->counter->pending_wakeup = 1;
2189 perf_pending_queue(&handle->counter->pending,
2190 perf_pending_counter);
2191 } else
2192 perf_counter_wakeup(handle->counter);
2193}
2194
2195/*
2196 * Curious locking construct.
2197 *
2198 * We need to ensure a later event doesn't publish a head when a former
2199 * event isn't done writing. However since we need to deal with NMIs we
2200 * cannot fully serialize things.
2201 *
2202 * What we do is serialize between CPUs so we only have to deal with NMI
2203 * nesting on a single CPU.
2204 *
2205 * We only publish the head (and generate a wakeup) when the outer-most
2206 * event completes.
2207 */
2208static void perf_output_lock(struct perf_output_handle *handle)
2209{
2210 struct perf_mmap_data *data = handle->data;
2211 int cpu;
2212
2213 handle->locked = 0;
2214
2215 local_irq_save(handle->flags);
2216 cpu = smp_processor_id();
2217
2218 if (in_nmi() && atomic_read(&data->lock) == cpu)
2219 return;
2220
2221 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2222 cpu_relax();
2223
2224 handle->locked = 1;
2225}
2226
2227static void perf_output_unlock(struct perf_output_handle *handle)
2228{
2229 struct perf_mmap_data *data = handle->data;
2230 unsigned long head;
2231 int cpu;
2232
2233 data->done_head = data->head;
2234
2235 if (!handle->locked)
2236 goto out;
2237
2238again:
2239 /*
2240 * The xchg implies a full barrier that ensures all writes are done
2241 * before we publish the new head, matched by a rmb() in userspace when
2242 * reading this position.
2243 */
2244 while ((head = atomic_long_xchg(&data->done_head, 0)))
2245 data->user_page->data_head = head;
2246
2247 /*
2248 * NMI can happen here, which means we can miss a done_head update.
2249 */
2250
2251 cpu = atomic_xchg(&data->lock, -1);
2252 WARN_ON_ONCE(cpu != smp_processor_id());
2253
2254 /*
2255 * Therefore we have to validate we did not indeed do so.
2256 */
2257 if (unlikely(atomic_long_read(&data->done_head))) {
2258 /*
2259 * Since we had it locked, we can lock it again.
2260 */
2261 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2262 cpu_relax();
2263
2264 goto again;
2265 }
2266
2267 if (atomic_xchg(&data->wakeup, 0))
2268 perf_output_wakeup(handle);
2269out:
2270 local_irq_restore(handle->flags);
2271}
2272
2273static int perf_output_begin(struct perf_output_handle *handle,
2274 struct perf_counter *counter, unsigned int size,
2275 int nmi, int overflow)
2276{
2277 struct perf_mmap_data *data;
2278 unsigned int offset, head;
2279
2280 /*
2281 * For inherited counters we send all the output towards the parent.
2282 */
2283 if (counter->parent)
2284 counter = counter->parent;
2285
2286 rcu_read_lock();
2287 data = rcu_dereference(counter->data);
2288 if (!data)
2289 goto out;
2290
2291 handle->data = data;
2292 handle->counter = counter;
2293 handle->nmi = nmi;
2294 handle->overflow = overflow;
2295
2296 if (!data->nr_pages)
2297 goto fail;
2298
2299 perf_output_lock(handle);
2300
2301 do {
2302 offset = head = atomic_long_read(&data->head);
2303 head += size;
2304 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2305
2306 handle->offset = offset;
2307 handle->head = head;
2308
2309 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2310 atomic_set(&data->wakeup, 1);
2311
2312 return 0;
2313
2314fail:
2315 perf_output_wakeup(handle);
2316out:
2317 rcu_read_unlock();
2318
2319 return -ENOSPC;
2320}
2321
2322static void perf_output_copy(struct perf_output_handle *handle,
2323 const void *buf, unsigned int len)
2324{
2325 unsigned int pages_mask;
2326 unsigned int offset;
2327 unsigned int size;
2328 void **pages;
2329
2330 offset = handle->offset;
2331 pages_mask = handle->data->nr_pages - 1;
2332 pages = handle->data->data_pages;
2333
2334 do {
2335 unsigned int page_offset;
2336 int nr;
2337
2338 nr = (offset >> PAGE_SHIFT) & pages_mask;
2339 page_offset = offset & (PAGE_SIZE - 1);
2340 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2341
2342 memcpy(pages[nr] + page_offset, buf, size);
2343
2344 len -= size;
2345 buf += size;
2346 offset += size;
2347 } while (len);
2348
2349 handle->offset = offset;
2350
2351 /*
2352 * Check we didn't copy past our reservation window, taking the
2353 * possible unsigned int wrap into account.
2354 */
2355 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2356}
2357
2358#define perf_output_put(handle, x) \
2359 perf_output_copy((handle), &(x), sizeof(x))
2360
2361static void perf_output_end(struct perf_output_handle *handle)
2362{
2363 struct perf_counter *counter = handle->counter;
2364 struct perf_mmap_data *data = handle->data;
2365
2366 int wakeup_events = counter->attr.wakeup_events;
2367
2368 if (handle->overflow && wakeup_events) {
2369 int events = atomic_inc_return(&data->events);
2370 if (events >= wakeup_events) {
2371 atomic_sub(wakeup_events, &data->events);
2372 atomic_set(&data->wakeup, 1);
2373 }
2374 }
2375
2376 perf_output_unlock(handle);
2377 rcu_read_unlock();
2378}
2379
2380static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2381{
2382 /*
2383 * only top level counters have the pid namespace they were created in
2384 */
2385 if (counter->parent)
2386 counter = counter->parent;
2387
2388 return task_tgid_nr_ns(p, counter->ns);
2389}
2390
2391static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2392{
2393 /*
2394 * only top level counters have the pid namespace they were created in
2395 */
2396 if (counter->parent)
2397 counter = counter->parent;
2398
2399 return task_pid_nr_ns(p, counter->ns);
2400}
2401
2402static void perf_counter_output(struct perf_counter *counter, int nmi,
2403 struct perf_sample_data *data)
2404{
2405 int ret;
2406 u64 sample_type = counter->attr.sample_type;
2407 struct perf_output_handle handle;
2408 struct perf_event_header header;
2409 u64 ip;
2410 struct {
2411 u32 pid, tid;
2412 } tid_entry;
2413 struct {
2414 u64 id;
2415 u64 counter;
2416 } group_entry;
2417 struct perf_callchain_entry *callchain = NULL;
2418 int callchain_size = 0;
2419 u64 time;
2420 struct {
2421 u32 cpu, reserved;
2422 } cpu_entry;
2423
2424 header.type = 0;
2425 header.size = sizeof(header);
2426
2427 header.misc = PERF_EVENT_MISC_OVERFLOW;
2428 header.misc |= perf_misc_flags(data->regs);
2429
2430 if (sample_type & PERF_SAMPLE_IP) {
2431 ip = perf_instruction_pointer(data->regs);
2432 header.type |= PERF_SAMPLE_IP;
2433 header.size += sizeof(ip);
2434 }
2435
2436 if (sample_type & PERF_SAMPLE_TID) {
2437 /* namespace issues */
2438 tid_entry.pid = perf_counter_pid(counter, current);
2439 tid_entry.tid = perf_counter_tid(counter, current);
2440
2441 header.type |= PERF_SAMPLE_TID;
2442 header.size += sizeof(tid_entry);
2443 }
2444
2445 if (sample_type & PERF_SAMPLE_TIME) {
2446 /*
2447 * Maybe do better on x86 and provide cpu_clock_nmi()
2448 */
2449 time = sched_clock();
2450
2451 header.type |= PERF_SAMPLE_TIME;
2452 header.size += sizeof(u64);
2453 }
2454
2455 if (sample_type & PERF_SAMPLE_ADDR) {
2456 header.type |= PERF_SAMPLE_ADDR;
2457 header.size += sizeof(u64);
2458 }
2459
2460 if (sample_type & PERF_SAMPLE_ID) {
2461 header.type |= PERF_SAMPLE_ID;
2462 header.size += sizeof(u64);
2463 }
2464
2465 if (sample_type & PERF_SAMPLE_CPU) {
2466 header.type |= PERF_SAMPLE_CPU;
2467 header.size += sizeof(cpu_entry);
2468
2469 cpu_entry.cpu = raw_smp_processor_id();
2470 }
2471
2472 if (sample_type & PERF_SAMPLE_PERIOD) {
2473 header.type |= PERF_SAMPLE_PERIOD;
2474 header.size += sizeof(u64);
2475 }
2476
2477 if (sample_type & PERF_SAMPLE_GROUP) {
2478 header.type |= PERF_SAMPLE_GROUP;
2479 header.size += sizeof(u64) +
2480 counter->nr_siblings * sizeof(group_entry);
2481 }
2482
2483 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2484 callchain = perf_callchain(data->regs);
2485
2486 if (callchain) {
2487 callchain_size = (1 + callchain->nr) * sizeof(u64);
2488
2489 header.type |= PERF_SAMPLE_CALLCHAIN;
2490 header.size += callchain_size;
2491 }
2492 }
2493
2494 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2495 if (ret)
2496 return;
2497
2498 perf_output_put(&handle, header);
2499
2500 if (sample_type & PERF_SAMPLE_IP)
2501 perf_output_put(&handle, ip);
2502
2503 if (sample_type & PERF_SAMPLE_TID)
2504 perf_output_put(&handle, tid_entry);
2505
2506 if (sample_type & PERF_SAMPLE_TIME)
2507 perf_output_put(&handle, time);
2508
2509 if (sample_type & PERF_SAMPLE_ADDR)
2510 perf_output_put(&handle, data->addr);
2511
2512 if (sample_type & PERF_SAMPLE_ID)
2513 perf_output_put(&handle, counter->id);
2514
2515 if (sample_type & PERF_SAMPLE_CPU)
2516 perf_output_put(&handle, cpu_entry);
2517
2518 if (sample_type & PERF_SAMPLE_PERIOD)
2519 perf_output_put(&handle, data->period);
2520
2521 /*
2522 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
2523 */
2524 if (sample_type & PERF_SAMPLE_GROUP) {
2525 struct perf_counter *leader, *sub;
2526 u64 nr = counter->nr_siblings;
2527
2528 perf_output_put(&handle, nr);
2529
2530 leader = counter->group_leader;
2531 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2532 if (sub != counter)
2533 sub->pmu->read(sub);
2534
2535 group_entry.id = sub->id;
2536 group_entry.counter = atomic64_read(&sub->count);
2537
2538 perf_output_put(&handle, group_entry);
2539 }
2540 }
2541
2542 if (callchain)
2543 perf_output_copy(&handle, callchain, callchain_size);
2544
2545 perf_output_end(&handle);
2546}
2547
2548/*
2549 * fork tracking
2550 */
2551
2552struct perf_fork_event {
2553 struct task_struct *task;
2554
2555 struct {
2556 struct perf_event_header header;
2557
2558 u32 pid;
2559 u32 ppid;
2560 } event;
2561};
2562
2563static void perf_counter_fork_output(struct perf_counter *counter,
2564 struct perf_fork_event *fork_event)
2565{
2566 struct perf_output_handle handle;
2567 int size = fork_event->event.header.size;
2568 struct task_struct *task = fork_event->task;
2569 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2570
2571 if (ret)
2572 return;
2573
2574 fork_event->event.pid = perf_counter_pid(counter, task);
2575 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
2576
2577 perf_output_put(&handle, fork_event->event);
2578 perf_output_end(&handle);
2579}
2580
2581static int perf_counter_fork_match(struct perf_counter *counter)
2582{
2583 if (counter->attr.comm || counter->attr.mmap)
2584 return 1;
2585
2586 return 0;
2587}
2588
2589static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2590 struct perf_fork_event *fork_event)
2591{
2592 struct perf_counter *counter;
2593
2594 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2595 return;
2596
2597 rcu_read_lock();
2598 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2599 if (perf_counter_fork_match(counter))
2600 perf_counter_fork_output(counter, fork_event);
2601 }
2602 rcu_read_unlock();
2603}
2604
2605static void perf_counter_fork_event(struct perf_fork_event *fork_event)
2606{
2607 struct perf_cpu_context *cpuctx;
2608 struct perf_counter_context *ctx;
2609
2610 cpuctx = &get_cpu_var(perf_cpu_context);
2611 perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
2612 put_cpu_var(perf_cpu_context);
2613
2614 rcu_read_lock();
2615 /*
2616 * doesn't really matter which of the child contexts the
2617 * events ends up in.
2618 */
2619 ctx = rcu_dereference(current->perf_counter_ctxp);
2620 if (ctx)
2621 perf_counter_fork_ctx(ctx, fork_event);
2622 rcu_read_unlock();
2623}
2624
2625void perf_counter_fork(struct task_struct *task)
2626{
2627 struct perf_fork_event fork_event;
2628
2629 if (!atomic_read(&nr_comm_counters) &&
2630 !atomic_read(&nr_mmap_counters))
2631 return;
2632
2633 fork_event = (struct perf_fork_event){
2634 .task = task,
2635 .event = {
2636 .header = {
2637 .type = PERF_EVENT_FORK,
2638 .size = sizeof(fork_event.event),
2639 },
2640 },
2641 };
2642
2643 perf_counter_fork_event(&fork_event);
2644}
2645
2646/*
2647 * comm tracking
2648 */
2649
2650struct perf_comm_event {
2651 struct task_struct *task;
2652 char *comm;
2653 int comm_size;
2654
2655 struct {
2656 struct perf_event_header header;
2657
2658 u32 pid;
2659 u32 tid;
2660 } event;
2661};
2662
2663static void perf_counter_comm_output(struct perf_counter *counter,
2664 struct perf_comm_event *comm_event)
2665{
2666 struct perf_output_handle handle;
2667 int size = comm_event->event.header.size;
2668 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2669
2670 if (ret)
2671 return;
2672
2673 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
2674 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
2675
2676 perf_output_put(&handle, comm_event->event);
2677 perf_output_copy(&handle, comm_event->comm,
2678 comm_event->comm_size);
2679 perf_output_end(&handle);
2680}
2681
2682static int perf_counter_comm_match(struct perf_counter *counter)
2683{
2684 if (counter->attr.comm)
2685 return 1;
2686
2687 return 0;
2688}
2689
2690static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
2691 struct perf_comm_event *comm_event)
2692{
2693 struct perf_counter *counter;
2694
2695 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2696 return;
2697
2698 rcu_read_lock();
2699 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2700 if (perf_counter_comm_match(counter))
2701 perf_counter_comm_output(counter, comm_event);
2702 }
2703 rcu_read_unlock();
2704}
2705
2706static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2707{
2708 struct perf_cpu_context *cpuctx;
2709 struct perf_counter_context *ctx;
2710 unsigned int size;
2711 char *comm = comm_event->task->comm;
2712
2713 size = ALIGN(strlen(comm)+1, sizeof(u64));
2714
2715 comm_event->comm = comm;
2716 comm_event->comm_size = size;
2717
2718 comm_event->event.header.size = sizeof(comm_event->event) + size;
2719
2720 cpuctx = &get_cpu_var(perf_cpu_context);
2721 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
2722 put_cpu_var(perf_cpu_context);
2723
2724 rcu_read_lock();
2725 /*
2726 * doesn't really matter which of the child contexts the
2727 * events ends up in.
2728 */
2729 ctx = rcu_dereference(current->perf_counter_ctxp);
2730 if (ctx)
2731 perf_counter_comm_ctx(ctx, comm_event);
2732 rcu_read_unlock();
2733}
2734
2735void perf_counter_comm(struct task_struct *task)
2736{
2737 struct perf_comm_event comm_event;
2738
2739 if (!atomic_read(&nr_comm_counters))
2740 return;
2741
2742 comm_event = (struct perf_comm_event){
2743 .task = task,
2744 .event = {
2745 .header = { .type = PERF_EVENT_COMM, },
2746 },
2747 };
2748
2749 perf_counter_comm_event(&comm_event);
2750}
2751
2752/*
2753 * mmap tracking
2754 */
2755
2756struct perf_mmap_event {
2757 struct vm_area_struct *vma;
2758
2759 const char *file_name;
2760 int file_size;
2761
2762 struct {
2763 struct perf_event_header header;
2764
2765 u32 pid;
2766 u32 tid;
2767 u64 start;
2768 u64 len;
2769 u64 pgoff;
2770 } event;
2771};
2772
2773static void perf_counter_mmap_output(struct perf_counter *counter,
2774 struct perf_mmap_event *mmap_event)
2775{
2776 struct perf_output_handle handle;
2777 int size = mmap_event->event.header.size;
2778 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2779
2780 if (ret)
2781 return;
2782
2783 mmap_event->event.pid = perf_counter_pid(counter, current);
2784 mmap_event->event.tid = perf_counter_tid(counter, current);
2785
2786 perf_output_put(&handle, mmap_event->event);
2787 perf_output_copy(&handle, mmap_event->file_name,
2788 mmap_event->file_size);
2789 perf_output_end(&handle);
2790}
2791
2792static int perf_counter_mmap_match(struct perf_counter *counter,
2793 struct perf_mmap_event *mmap_event)
2794{
2795 if (counter->attr.mmap)
2796 return 1;
2797
2798 return 0;
2799}
2800
2801static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
2802 struct perf_mmap_event *mmap_event)
2803{
2804 struct perf_counter *counter;
2805
2806 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2807 return;
2808
2809 rcu_read_lock();
2810 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2811 if (perf_counter_mmap_match(counter, mmap_event))
2812 perf_counter_mmap_output(counter, mmap_event);
2813 }
2814 rcu_read_unlock();
2815}
2816
2817static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2818{
2819 struct perf_cpu_context *cpuctx;
2820 struct perf_counter_context *ctx;
2821 struct vm_area_struct *vma = mmap_event->vma;
2822 struct file *file = vma->vm_file;
2823 unsigned int size;
2824 char tmp[16];
2825 char *buf = NULL;
2826 const char *name;
2827
2828 if (file) {
2829 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2830 if (!buf) {
2831 name = strncpy(tmp, "//enomem", sizeof(tmp));
2832 goto got_name;
2833 }
2834 name = d_path(&file->f_path, buf, PATH_MAX);
2835 if (IS_ERR(name)) {
2836 name = strncpy(tmp, "//toolong", sizeof(tmp));
2837 goto got_name;
2838 }
2839 } else {
2840 name = arch_vma_name(mmap_event->vma);
2841 if (name)
2842 goto got_name;
2843
2844 if (!vma->vm_mm) {
2845 name = strncpy(tmp, "[vdso]", sizeof(tmp));
2846 goto got_name;
2847 }
2848
2849 name = strncpy(tmp, "//anon", sizeof(tmp));
2850 goto got_name;
2851 }
2852
2853got_name:
2854 size = ALIGN(strlen(name)+1, sizeof(u64));
2855
2856 mmap_event->file_name = name;
2857 mmap_event->file_size = size;
2858
2859 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2860
2861 cpuctx = &get_cpu_var(perf_cpu_context);
2862 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2863 put_cpu_var(perf_cpu_context);
2864
2865 rcu_read_lock();
2866 /*
2867 * doesn't really matter which of the child contexts the
2868 * events ends up in.
2869 */
2870 ctx = rcu_dereference(current->perf_counter_ctxp);
2871 if (ctx)
2872 perf_counter_mmap_ctx(ctx, mmap_event);
2873 rcu_read_unlock();
2874
2875 kfree(buf);
2876}
2877
2878void __perf_counter_mmap(struct vm_area_struct *vma)
2879{
2880 struct perf_mmap_event mmap_event;
2881
2882 if (!atomic_read(&nr_mmap_counters))
2883 return;
2884
2885 mmap_event = (struct perf_mmap_event){
2886 .vma = vma,
2887 .event = {
2888 .header = { .type = PERF_EVENT_MMAP, },
2889 .start = vma->vm_start,
2890 .len = vma->vm_end - vma->vm_start,
2891 .pgoff = vma->vm_pgoff,
2892 },
2893 };
2894
2895 perf_counter_mmap_event(&mmap_event);
2896}
2897
2898/*
2899 * Log sample_period changes so that analyzing tools can re-normalize the
2900 * event flow.
2901 */
2902
2903struct freq_event {
2904 struct perf_event_header header;
2905 u64 time;
2906 u64 id;
2907 u64 period;
2908};
2909
2910static void perf_log_period(struct perf_counter *counter, u64 period)
2911{
2912 struct perf_output_handle handle;
2913 struct freq_event event;
2914 int ret;
2915
2916 if (counter->hw.sample_period == period)
2917 return;
2918
2919 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2920 return;
2921
2922 event = (struct freq_event) {
2923 .header = {
2924 .type = PERF_EVENT_PERIOD,
2925 .misc = 0,
2926 .size = sizeof(event),
2927 },
2928 .time = sched_clock(),
2929 .id = counter->id,
2930 .period = period,
2931 };
2932
2933 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2934 if (ret)
2935 return;
2936
2937 perf_output_put(&handle, event);
2938 perf_output_end(&handle);
2939}
2940
2941/*
2942 * IRQ throttle logging
2943 */
2944
2945static void perf_log_throttle(struct perf_counter *counter, int enable)
2946{
2947 struct perf_output_handle handle;
2948 int ret;
2949
2950 struct {
2951 struct perf_event_header header;
2952 u64 time;
2953 u64 id;
2954 } throttle_event = {
2955 .header = {
2956 .type = PERF_EVENT_THROTTLE + 1,
2957 .misc = 0,
2958 .size = sizeof(throttle_event),
2959 },
2960 .time = sched_clock(),
2961 .id = counter->id,
2962 };
2963
2964 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
2965 if (ret)
2966 return;
2967
2968 perf_output_put(&handle, throttle_event);
2969 perf_output_end(&handle);
2970}
2971
2972/*
2973 * Generic counter overflow handling.
2974 */
2975
2976int perf_counter_overflow(struct perf_counter *counter, int nmi,
2977 struct perf_sample_data *data)
2978{
2979 int events = atomic_read(&counter->event_limit);
2980 int throttle = counter->pmu->unthrottle != NULL;
2981 struct hw_perf_counter *hwc = &counter->hw;
2982 int ret = 0;
2983
2984 if (!throttle) {
2985 hwc->interrupts++;
2986 } else {
2987 if (hwc->interrupts != MAX_INTERRUPTS) {
2988 hwc->interrupts++;
2989 if (HZ * hwc->interrupts >
2990 (u64)sysctl_perf_counter_sample_rate) {
2991 hwc->interrupts = MAX_INTERRUPTS;
2992 perf_log_throttle(counter, 0);
2993 ret = 1;
2994 }
2995 } else {
2996 /*
2997 * Keep re-disabling counters even though on the previous
2998 * pass we disabled it - just in case we raced with a
2999 * sched-in and the counter got enabled again:
3000 */
3001 ret = 1;
3002 }
3003 }
3004
3005 if (counter->attr.freq) {
3006 u64 now = sched_clock();
3007 s64 delta = now - hwc->freq_stamp;
3008
3009 hwc->freq_stamp = now;
3010
3011 if (delta > 0 && delta < TICK_NSEC)
3012 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3013 }
3014
3015 /*
3016 * XXX event_limit might not quite work as expected on inherited
3017 * counters
3018 */
3019
3020 counter->pending_kill = POLL_IN;
3021 if (events && atomic_dec_and_test(&counter->event_limit)) {
3022 ret = 1;
3023 counter->pending_kill = POLL_HUP;
3024 if (nmi) {
3025 counter->pending_disable = 1;
3026 perf_pending_queue(&counter->pending,
3027 perf_pending_counter);
3028 } else
3029 perf_counter_disable(counter);
3030 }
3031
3032 perf_counter_output(counter, nmi, data);
3033 return ret;
3034}
3035
3036/*
3037 * Generic software counter infrastructure
3038 */
3039
3040static void perf_swcounter_update(struct perf_counter *counter)
3041{
3042 struct hw_perf_counter *hwc = &counter->hw;
3043 u64 prev, now;
3044 s64 delta;
3045
3046again:
3047 prev = atomic64_read(&hwc->prev_count);
3048 now = atomic64_read(&hwc->count);
3049 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
3050 goto again;
3051
3052 delta = now - prev;
3053
3054 atomic64_add(delta, &counter->count);
3055 atomic64_sub(delta, &hwc->period_left);
3056}
3057
3058static void perf_swcounter_set_period(struct perf_counter *counter)
3059{
3060 struct hw_perf_counter *hwc = &counter->hw;
3061 s64 left = atomic64_read(&hwc->period_left);
3062 s64 period = hwc->sample_period;
3063
3064 if (unlikely(left <= -period)) {
3065 left = period;
3066 atomic64_set(&hwc->period_left, left);
3067 hwc->last_period = period;
3068 }
3069
3070 if (unlikely(left <= 0)) {
3071 left += period;
3072 atomic64_add(period, &hwc->period_left);
3073 hwc->last_period = period;
3074 }
3075
3076 atomic64_set(&hwc->prev_count, -left);
3077 atomic64_set(&hwc->count, -left);
3078}
3079
3080static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3081{
3082 enum hrtimer_restart ret = HRTIMER_RESTART;
3083 struct perf_sample_data data;
3084 struct perf_counter *counter;
3085 u64 period;
3086
3087 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3088 counter->pmu->read(counter);
3089
3090 data.addr = 0;
3091 data.regs = get_irq_regs();
3092 /*
3093 * In case we exclude kernel IPs or are somehow not in interrupt
3094 * context, provide the next best thing, the user IP.
3095 */
3096 if ((counter->attr.exclude_kernel || !data.regs) &&
3097 !counter->attr.exclude_user)
3098 data.regs = task_pt_regs(current);
3099
3100 if (data.regs) {
3101 if (perf_counter_overflow(counter, 0, &data))
3102 ret = HRTIMER_NORESTART;
3103 }
3104
3105 period = max_t(u64, 10000, counter->hw.sample_period);
3106 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3107
3108 return ret;
3109}
3110
3111static void perf_swcounter_overflow(struct perf_counter *counter,
3112 int nmi, struct pt_regs *regs, u64 addr)
3113{
3114 struct perf_sample_data data = {
3115 .regs = regs,
3116 .addr = addr,
3117 .period = counter->hw.last_period,
3118 };
3119
3120 perf_swcounter_update(counter);
3121 perf_swcounter_set_period(counter);
3122 if (perf_counter_overflow(counter, nmi, &data))
3123 /* soft-disable the counter */
3124 ;
3125
3126}
3127
3128static int perf_swcounter_is_counting(struct perf_counter *counter)
3129{
3130 struct perf_counter_context *ctx;
3131 unsigned long flags;
3132 int count;
3133
3134 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3135 return 1;
3136
3137 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3138 return 0;
3139
3140 /*
3141 * If the counter is inactive, it could be just because
3142 * its task is scheduled out, or because it's in a group
3143 * which could not go on the PMU. We want to count in
3144 * the first case but not the second. If the context is
3145 * currently active then an inactive software counter must
3146 * be the second case. If it's not currently active then
3147 * we need to know whether the counter was active when the
3148 * context was last active, which we can determine by
3149 * comparing counter->tstamp_stopped with ctx->time.
3150 *
3151 * We are within an RCU read-side critical section,
3152 * which protects the existence of *ctx.
3153 */
3154 ctx = counter->ctx;
3155 spin_lock_irqsave(&ctx->lock, flags);
3156 count = 1;
3157 /* Re-check state now we have the lock */
3158 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
3159 counter->ctx->is_active ||
3160 counter->tstamp_stopped < ctx->time)
3161 count = 0;
3162 spin_unlock_irqrestore(&ctx->lock, flags);
3163 return count;
3164}
3165
3166static int perf_swcounter_match(struct perf_counter *counter,
3167 enum perf_type_id type,
3168 u32 event, struct pt_regs *regs)
3169{
3170 if (!perf_swcounter_is_counting(counter))
3171 return 0;
3172
3173 if (counter->attr.type != type)
3174 return 0;
3175 if (counter->attr.config != event)
3176 return 0;
3177
3178 if (regs) {
3179 if (counter->attr.exclude_user && user_mode(regs))
3180 return 0;
3181
3182 if (counter->attr.exclude_kernel && !user_mode(regs))
3183 return 0;
3184 }
3185
3186 return 1;
3187}
3188
3189static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3190 int nmi, struct pt_regs *regs, u64 addr)
3191{
3192 int neg = atomic64_add_negative(nr, &counter->hw.count);
3193
3194 if (counter->hw.sample_period && !neg && regs)
3195 perf_swcounter_overflow(counter, nmi, regs, addr);
3196}
3197
3198static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3199 enum perf_type_id type, u32 event,
3200 u64 nr, int nmi, struct pt_regs *regs,
3201 u64 addr)
3202{
3203 struct perf_counter *counter;
3204
3205 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3206 return;
3207
3208 rcu_read_lock();
3209 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3210 if (perf_swcounter_match(counter, type, event, regs))
3211 perf_swcounter_add(counter, nr, nmi, regs, addr);
3212 }
3213 rcu_read_unlock();
3214}
3215
3216static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3217{
3218 if (in_nmi())
3219 return &cpuctx->recursion[3];
3220
3221 if (in_irq())
3222 return &cpuctx->recursion[2];
3223
3224 if (in_softirq())
3225 return &cpuctx->recursion[1];
3226
3227 return &cpuctx->recursion[0];
3228}
3229
3230static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3231 u64 nr, int nmi, struct pt_regs *regs,
3232 u64 addr)
3233{
3234 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3235 int *recursion = perf_swcounter_recursion_context(cpuctx);
3236 struct perf_counter_context *ctx;
3237
3238 if (*recursion)
3239 goto out;
3240
3241 (*recursion)++;
3242 barrier();
3243
3244 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3245 nr, nmi, regs, addr);
3246 rcu_read_lock();
3247 /*
3248 * doesn't really matter which of the child contexts the
3249 * events ends up in.
3250 */
3251 ctx = rcu_dereference(current->perf_counter_ctxp);
3252 if (ctx)
3253 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
3254 rcu_read_unlock();
3255
3256 barrier();
3257 (*recursion)--;
3258
3259out:
3260 put_cpu_var(perf_cpu_context);
3261}
3262
3263void
3264perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
3265{
3266 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
3267}
3268
/* pmu ->read hook for generic software counters. */
static void perf_swcounter_read(struct perf_counter *counter)
{
	perf_swcounter_update(counter);
}
3273
/*
 * pmu ->enable hook for generic software counters: (re)arm the sample
 * period.  Always succeeds and returns 0.
 */
static int perf_swcounter_enable(struct perf_counter *counter)
{
	perf_swcounter_set_period(counter);

	return 0;
}
3279
/* pmu ->disable hook for generic software counters. */
static void perf_swcounter_disable(struct perf_counter *counter)
{
	perf_swcounter_update(counter);
}
3284
3285static const struct pmu perf_ops_generic = {
3286 .enable = perf_swcounter_enable,
3287 .disable = perf_swcounter_disable,
3288 .read = perf_swcounter_read,
3289};
3290
3291/*
3292 * Software counter: cpu wall time clock
3293 */
3294
3295static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3296{
3297 int cpu = raw_smp_processor_id();
3298 s64 prev;
3299 u64 now;
3300
3301 now = cpu_clock(cpu);
3302 prev = atomic64_read(&counter->hw.prev_count);
3303 atomic64_set(&counter->hw.prev_count, now);
3304 atomic64_add(now - prev, &counter->count);
3305}
3306
3307static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3308{
3309 struct hw_perf_counter *hwc = &counter->hw;
3310 int cpu = raw_smp_processor_id();
3311
3312 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3313 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3314 hwc->hrtimer.function = perf_swcounter_hrtimer;
3315 if (hwc->sample_period) {
3316 u64 period = max_t(u64, 10000, hwc->sample_period);
3317 __hrtimer_start_range_ns(&hwc->hrtimer,
3318 ns_to_ktime(period), 0,
3319 HRTIMER_MODE_REL, 0);
3320 }
3321
3322 return 0;
3323}
3324
3325static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3326{
3327 if (counter->hw.sample_period)
3328 hrtimer_cancel(&counter->hw.hrtimer);
3329 cpu_clock_perf_counter_update(counter);
3330}
3331
/* pmu ->read hook for the cpu clock counter. */
static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
	cpu_clock_perf_counter_update(counter);
}
3336
3337static const struct pmu perf_ops_cpu_clock = {
3338 .enable = cpu_clock_perf_counter_enable,
3339 .disable = cpu_clock_perf_counter_disable,
3340 .read = cpu_clock_perf_counter_read,
3341};
3342
3343/*
3344 * Software counter: task time clock
3345 */
3346
3347static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3348{
3349 u64 prev;
3350 s64 delta;
3351
3352 prev = atomic64_xchg(&counter->hw.prev_count, now);
3353 delta = now - prev;
3354 atomic64_add(delta, &counter->count);
3355}
3356
3357static int task_clock_perf_counter_enable(struct perf_counter *counter)
3358{
3359 struct hw_perf_counter *hwc = &counter->hw;
3360 u64 now;
3361
3362 now = counter->ctx->time;
3363
3364 atomic64_set(&hwc->prev_count, now);
3365 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3366 hwc->hrtimer.function = perf_swcounter_hrtimer;
3367 if (hwc->sample_period) {
3368 u64 period = max_t(u64, 10000, hwc->sample_period);
3369 __hrtimer_start_range_ns(&hwc->hrtimer,
3370 ns_to_ktime(period), 0,
3371 HRTIMER_MODE_REL, 0);
3372 }
3373
3374 return 0;
3375}
3376
3377static void task_clock_perf_counter_disable(struct perf_counter *counter)
3378{
3379 if (counter->hw.sample_period)
3380 hrtimer_cancel(&counter->hw.hrtimer);
3381 task_clock_perf_counter_update(counter, counter->ctx->time);
3382
3383}
3384
3385static void task_clock_perf_counter_read(struct perf_counter *counter)
3386{
3387 u64 time;
3388
3389 if (!in_nmi()) {
3390 update_context_time(counter->ctx);
3391 time = counter->ctx->time;
3392 } else {
3393 u64 now = perf_clock();
3394 u64 delta = now - counter->ctx->timestamp;
3395 time = counter->ctx->time + delta;
3396 }
3397
3398 task_clock_perf_counter_update(counter, time);
3399}
3400
3401static const struct pmu perf_ops_task_clock = {
3402 .enable = task_clock_perf_counter_enable,
3403 .disable = task_clock_perf_counter_disable,
3404 .read = task_clock_perf_counter_read,
3405};
3406
3407/*
3408 * Software counter: cpu migrations
3409 */
3410void perf_counter_task_migration(struct task_struct *task, int cpu)
3411{
3412 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3413 struct perf_counter_context *ctx;
3414
3415 perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
3416 PERF_COUNT_SW_CPU_MIGRATIONS,
3417 1, 1, NULL, 0);
3418
3419 ctx = perf_pin_task_context(task);
3420 if (ctx) {
3421 perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
3422 PERF_COUNT_SW_CPU_MIGRATIONS,
3423 1, 1, NULL, 0);
3424 perf_unpin_context(ctx);
3425 }
3426}
3427
3428#ifdef CONFIG_EVENT_PROFILE
3429void perf_tpcounter_event(int event_id)
3430{
3431 struct pt_regs *regs = get_irq_regs();
3432
3433 if (!regs)
3434 regs = task_pt_regs(current);
3435
3436 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
3437}
3438EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3439
3440extern int ftrace_profile_enable(int);
3441extern void ftrace_profile_disable(int);
3442
3443static void tp_perf_counter_destroy(struct perf_counter *counter)
3444{
3445 ftrace_profile_disable(perf_event_id(&counter->attr));
3446}
3447
3448static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3449{
3450 int event_id = perf_event_id(&counter->attr);
3451 int ret;
3452
3453 ret = ftrace_profile_enable(event_id);
3454 if (ret)
3455 return NULL;
3456
3457 counter->destroy = tp_perf_counter_destroy;
3458
3459 return &perf_ops_generic;
3460}
3461#else
3462static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3463{
3464 return NULL;
3465}
3466#endif
3467
3468static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3469{
3470 const struct pmu *pmu = NULL;
3471
3472 /*
3473 * Software counters (currently) can't in general distinguish
3474 * between user, kernel and hypervisor events.
3475 * However, context switches and cpu migrations are considered
3476 * to be kernel events, and page faults are never hypervisor
3477 * events.
3478 */
3479 switch (counter->attr.config) {
3480 case PERF_COUNT_SW_CPU_CLOCK:
3481 pmu = &perf_ops_cpu_clock;
3482
3483 break;
3484 case PERF_COUNT_SW_TASK_CLOCK:
3485 /*
3486 * If the user instantiates this as a per-cpu counter,
3487 * use the cpu_clock counter instead.
3488 */
3489 if (counter->ctx->task)
3490 pmu = &perf_ops_task_clock;
3491 else
3492 pmu = &perf_ops_cpu_clock;
3493
3494 break;
3495 case PERF_COUNT_SW_PAGE_FAULTS:
3496 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
3497 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3498 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3499 case PERF_COUNT_SW_CPU_MIGRATIONS:
3500 pmu = &perf_ops_generic;
3501 break;
3502 }
3503
3504 return pmu;
3505}
3506
3507/*
3508 * Allocate and initialize a counter structure
3509 */
3510static struct perf_counter *
3511perf_counter_alloc(struct perf_counter_attr *attr,
3512 int cpu,
3513 struct perf_counter_context *ctx,
3514 struct perf_counter *group_leader,
3515 gfp_t gfpflags)
3516{
3517 const struct pmu *pmu;
3518 struct perf_counter *counter;
3519 struct hw_perf_counter *hwc;
3520 long err;
3521
3522 counter = kzalloc(sizeof(*counter), gfpflags);
3523 if (!counter)
3524 return ERR_PTR(-ENOMEM);
3525
3526 /*
3527 * Single counters are their own group leaders, with an
3528 * empty sibling list:
3529 */
3530 if (!group_leader)
3531 group_leader = counter;
3532
3533 mutex_init(&counter->child_mutex);
3534 INIT_LIST_HEAD(&counter->child_list);
3535
3536 INIT_LIST_HEAD(&counter->list_entry);
3537 INIT_LIST_HEAD(&counter->event_entry);
3538 INIT_LIST_HEAD(&counter->sibling_list);
3539 init_waitqueue_head(&counter->waitq);
3540
3541 mutex_init(&counter->mmap_mutex);
3542
3543 counter->cpu = cpu;
3544 counter->attr = *attr;
3545 counter->group_leader = group_leader;
3546 counter->pmu = NULL;
3547 counter->ctx = ctx;
3548 counter->oncpu = -1;
3549
3550 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3551 counter->id = atomic64_inc_return(&perf_counter_id);
3552
3553 counter->state = PERF_COUNTER_STATE_INACTIVE;
3554
3555 if (attr->disabled)
3556 counter->state = PERF_COUNTER_STATE_OFF;
3557
3558 pmu = NULL;
3559
3560 hwc = &counter->hw;
3561 hwc->sample_period = attr->sample_period;
3562 if (attr->freq && attr->sample_freq)
3563 hwc->sample_period = 1;
3564
3565 atomic64_set(&hwc->period_left, hwc->sample_period);
3566
3567 /*
3568 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
3569 */
3570 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
3571 goto done;
3572
3573 if (attr->type == PERF_TYPE_RAW) {
3574 pmu = hw_perf_counter_init(counter);
3575 goto done;
3576 }
3577
3578 switch (attr->type) {
3579 case PERF_TYPE_HARDWARE:
3580 case PERF_TYPE_HW_CACHE:
3581 pmu = hw_perf_counter_init(counter);
3582 break;
3583
3584 case PERF_TYPE_SOFTWARE:
3585 pmu = sw_perf_counter_init(counter);
3586 break;
3587
3588 case PERF_TYPE_TRACEPOINT:
3589 pmu = tp_perf_counter_init(counter);
3590 break;
3591 }
3592done:
3593 err = 0;
3594 if (!pmu)
3595 err = -EINVAL;
3596 else if (IS_ERR(pmu))
3597 err = PTR_ERR(pmu);
3598
3599 if (err) {
3600 if (counter->ns)
3601 put_pid_ns(counter->ns);
3602 kfree(counter);
3603 return ERR_PTR(err);
3604 }
3605
3606 counter->pmu = pmu;
3607
3608 atomic_inc(&nr_counters);
3609 if (counter->attr.mmap)
3610 atomic_inc(&nr_mmap_counters);
3611 if (counter->attr.comm)
3612 atomic_inc(&nr_comm_counters);
3613
3614 return counter;
3615}
3616
3617/**
3618 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
3619 *
3620 * @attr_uptr: event type attributes for monitoring/sampling
3621 * @pid: target pid
3622 * @cpu: target cpu
3623 * @group_fd: group leader counter fd
3624 */
3625SYSCALL_DEFINE5(perf_counter_open,
3626 const struct perf_counter_attr __user *, attr_uptr,
3627 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
3628{
3629 struct perf_counter *counter, *group_leader;
3630 struct perf_counter_attr attr;
3631 struct perf_counter_context *ctx;
3632 struct file *counter_file = NULL;
3633 struct file *group_file = NULL;
3634 int fput_needed = 0;
3635 int fput_needed2 = 0;
3636 int ret;
3637
3638 /* for future expandability... */
3639 if (flags)
3640 return -EINVAL;
3641
3642 if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
3643 return -EFAULT;
3644
3645 if (!attr.exclude_kernel) {
3646 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
3647 return -EACCES;
3648 }
3649
3650 if (attr.freq) {
3651 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
3652 return -EINVAL;
3653 }
3654
3655 /*
3656 * Get the target context (task or percpu):
3657 */
3658 ctx = find_get_context(pid, cpu);
3659 if (IS_ERR(ctx))
3660 return PTR_ERR(ctx);
3661
3662 /*
3663 * Look up the group leader (we will attach this counter to it):
3664 */
3665 group_leader = NULL;
3666 if (group_fd != -1) {
3667 ret = -EINVAL;
3668 group_file = fget_light(group_fd, &fput_needed);
3669 if (!group_file)
3670 goto err_put_context;
3671 if (group_file->f_op != &perf_fops)
3672 goto err_put_context;
3673
3674 group_leader = group_file->private_data;
3675 /*
3676 * Do not allow a recursive hierarchy (this new sibling
3677 * becoming part of another group-sibling):
3678 */
3679 if (group_leader->group_leader != group_leader)
3680 goto err_put_context;
3681 /*
3682 * Do not allow to attach to a group in a different
3683 * task or CPU context:
3684 */
3685 if (group_leader->ctx != ctx)
3686 goto err_put_context;
3687 /*
3688 * Only a group leader can be exclusive or pinned
3689 */
3690 if (attr.exclusive || attr.pinned)
3691 goto err_put_context;
3692 }
3693
3694 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3695 GFP_KERNEL);
3696 ret = PTR_ERR(counter);
3697 if (IS_ERR(counter))
3698 goto err_put_context;
3699
3700 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
3701 if (ret < 0)
3702 goto err_free_put_context;
3703
3704 counter_file = fget_light(ret, &fput_needed2);
3705 if (!counter_file)
3706 goto err_free_put_context;
3707
3708 counter->filp = counter_file;
3709 WARN_ON_ONCE(ctx->parent_ctx);
3710 mutex_lock(&ctx->mutex);
3711 perf_install_in_context(ctx, counter, cpu);
3712 ++ctx->generation;
3713 mutex_unlock(&ctx->mutex);
3714
3715 counter->owner = current;
3716 get_task_struct(current);
3717 mutex_lock(&current->perf_counter_mutex);
3718 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
3719 mutex_unlock(&current->perf_counter_mutex);
3720
3721 fput_light(counter_file, fput_needed2);
3722
3723out_fput:
3724 fput_light(group_file, fput_needed);
3725
3726 return ret;
3727
3728err_free_put_context:
3729 kfree(counter);
3730
3731err_put_context:
3732 put_ctx(ctx);
3733
3734 goto out_fput;
3735}
3736
3737/*
3738 * inherit a counter from parent task to child task:
3739 */
3740static struct perf_counter *
3741inherit_counter(struct perf_counter *parent_counter,
3742 struct task_struct *parent,
3743 struct perf_counter_context *parent_ctx,
3744 struct task_struct *child,
3745 struct perf_counter *group_leader,
3746 struct perf_counter_context *child_ctx)
3747{
3748 struct perf_counter *child_counter;
3749
3750 /*
3751 * Instead of creating recursive hierarchies of counters,
3752 * we link inherited counters back to the original parent,
3753 * which has a filp for sure, which we use as the reference
3754 * count:
3755 */
3756 if (parent_counter->parent)
3757 parent_counter = parent_counter->parent;
3758
3759 child_counter = perf_counter_alloc(&parent_counter->attr,
3760 parent_counter->cpu, child_ctx,
3761 group_leader, GFP_KERNEL);
3762 if (IS_ERR(child_counter))
3763 return child_counter;
3764 get_ctx(child_ctx);
3765
3766 /*
3767 * Make the child state follow the state of the parent counter,
3768 * not its attr.disabled bit. We hold the parent's mutex,
3769 * so we won't race with perf_counter_{en, dis}able_family.
3770 */
3771 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
3772 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
3773 else
3774 child_counter->state = PERF_COUNTER_STATE_OFF;
3775
3776 if (parent_counter->attr.freq)
3777 child_counter->hw.sample_period = parent_counter->hw.sample_period;
3778
3779 /*
3780 * Link it up in the child's context:
3781 */
3782 add_counter_to_ctx(child_counter, child_ctx);
3783
3784 child_counter->parent = parent_counter;
3785 /*
3786 * inherit into child's child as well:
3787 */
3788 child_counter->attr.inherit = 1;
3789
3790 /*
3791 * Get a reference to the parent filp - we will fput it
3792 * when the child counter exits. This is safe to do because
3793 * we are in the parent and we know that the filp still
3794 * exists and has a nonzero count:
3795 */
3796 atomic_long_inc(&parent_counter->filp->f_count);
3797
3798 /*
3799 * Link this into the parent counter's child list
3800 */
3801 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
3802 mutex_lock(&parent_counter->child_mutex);
3803 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
3804 mutex_unlock(&parent_counter->child_mutex);
3805
3806 return child_counter;
3807}
3808
3809static int inherit_group(struct perf_counter *parent_counter,
3810 struct task_struct *parent,
3811 struct perf_counter_context *parent_ctx,
3812 struct task_struct *child,
3813 struct perf_counter_context *child_ctx)
3814{
3815 struct perf_counter *leader;
3816 struct perf_counter *sub;
3817 struct perf_counter *child_ctr;
3818
3819 leader = inherit_counter(parent_counter, parent, parent_ctx,
3820 child, NULL, child_ctx);
3821 if (IS_ERR(leader))
3822 return PTR_ERR(leader);
3823 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
3824 child_ctr = inherit_counter(sub, parent, parent_ctx,
3825 child, leader, child_ctx);
3826 if (IS_ERR(child_ctr))
3827 return PTR_ERR(child_ctr);
3828 }
3829 return 0;
3830}
3831
3832static void sync_child_counter(struct perf_counter *child_counter,
3833 struct perf_counter *parent_counter)
3834{
3835 u64 child_val;
3836
3837 child_val = atomic64_read(&child_counter->count);
3838
3839 /*
3840 * Add back the child's count to the parent's count:
3841 */
3842 atomic64_add(child_val, &parent_counter->count);
3843 atomic64_add(child_counter->total_time_enabled,
3844 &parent_counter->child_total_time_enabled);
3845 atomic64_add(child_counter->total_time_running,
3846 &parent_counter->child_total_time_running);
3847
3848 /*
3849 * Remove this counter from the parent's list
3850 */
3851 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
3852 mutex_lock(&parent_counter->child_mutex);
3853 list_del_init(&child_counter->child_list);
3854 mutex_unlock(&parent_counter->child_mutex);
3855
3856 /*
3857 * Release the parent counter, if this was the last
3858 * reference to it.
3859 */
3860 fput(parent_counter->filp);
3861}
3862
3863static void
3864__perf_counter_exit_task(struct perf_counter *child_counter,
3865 struct perf_counter_context *child_ctx)
3866{
3867 struct perf_counter *parent_counter;
3868
3869 update_counter_times(child_counter);
3870 perf_counter_remove_from_context(child_counter);
3871
3872 parent_counter = child_counter->parent;
3873 /*
3874 * It can happen that parent exits first, and has counters
3875 * that are still around due to the child reference. These
3876 * counters need to be zapped - but otherwise linger.
3877 */
3878 if (parent_counter) {
3879 sync_child_counter(child_counter, parent_counter);
3880 free_counter(child_counter);
3881 }
3882}
3883
3884/*
3885 * When a child task exits, feed back counter values to parent counters.
3886 */
3887void perf_counter_exit_task(struct task_struct *child)
3888{
3889 struct perf_counter *child_counter, *tmp;
3890 struct perf_counter_context *child_ctx;
3891 unsigned long flags;
3892
3893 if (likely(!child->perf_counter_ctxp))
3894 return;
3895
3896 local_irq_save(flags);
3897 /*
3898 * We can't reschedule here because interrupts are disabled,
3899 * and either child is current or it is a task that can't be
3900 * scheduled, so we are now safe from rescheduling changing
3901 * our context.
3902 */
3903 child_ctx = child->perf_counter_ctxp;
3904 __perf_counter_task_sched_out(child_ctx);
3905
3906 /*
3907 * Take the context lock here so that if find_get_context is
3908 * reading child->perf_counter_ctxp, we wait until it has
3909 * incremented the context's refcount before we do put_ctx below.
3910 */
3911 spin_lock(&child_ctx->lock);
3912 child->perf_counter_ctxp = NULL;
3913 if (child_ctx->parent_ctx) {
3914 /*
3915 * This context is a clone; unclone it so it can't get
3916 * swapped to another process while we're removing all
3917 * the counters from it.
3918 */
3919 put_ctx(child_ctx->parent_ctx);
3920 child_ctx->parent_ctx = NULL;
3921 }
3922 spin_unlock(&child_ctx->lock);
3923 local_irq_restore(flags);
3924
3925 /*
3926 * We can recurse on the same lock type through:
3927 *
3928 * __perf_counter_exit_task()
3929 * sync_child_counter()
3930 * fput(parent_counter->filp)
3931 * perf_release()
3932 * mutex_lock(&ctx->mutex)
3933 *
3934 * But since its the parent context it won't be the same instance.
3935 */
3936 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
3937
3938again:
3939 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
3940 list_entry)
3941 __perf_counter_exit_task(child_counter, child_ctx);
3942
3943 /*
3944 * If the last counter was a group counter, it will have appended all
3945 * its siblings to the list, but we obtained 'tmp' before that which
3946 * will still point to the list head terminating the iteration.
3947 */
3948 if (!list_empty(&child_ctx->counter_list))
3949 goto again;
3950
3951 mutex_unlock(&child_ctx->mutex);
3952
3953 put_ctx(child_ctx);
3954}
3955
3956/*
3957 * free an unexposed, unused context as created by inheritance by
3958 * init_task below, used by fork() in case of fail.
3959 */
3960void perf_counter_free_task(struct task_struct *task)
3961{
3962 struct perf_counter_context *ctx = task->perf_counter_ctxp;
3963 struct perf_counter *counter, *tmp;
3964
3965 if (!ctx)
3966 return;
3967
3968 mutex_lock(&ctx->mutex);
3969again:
3970 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
3971 struct perf_counter *parent = counter->parent;
3972
3973 if (WARN_ON_ONCE(!parent))
3974 continue;
3975
3976 mutex_lock(&parent->child_mutex);
3977 list_del_init(&counter->child_list);
3978 mutex_unlock(&parent->child_mutex);
3979
3980 fput(parent->filp);
3981
3982 list_del_counter(counter, ctx);
3983 free_counter(counter);
3984 }
3985
3986 if (!list_empty(&ctx->counter_list))
3987 goto again;
3988
3989 mutex_unlock(&ctx->mutex);
3990
3991 put_ctx(ctx);
3992}
3993
3994/*
3995 * Initialize the perf_counter context in task_struct
3996 */
3997int perf_counter_init_task(struct task_struct *child)
3998{
3999 struct perf_counter_context *child_ctx, *parent_ctx;
4000 struct perf_counter_context *cloned_ctx;
4001 struct perf_counter *counter;
4002 struct task_struct *parent = current;
4003 int inherited_all = 1;
4004 int ret = 0;
4005
4006 child->perf_counter_ctxp = NULL;
4007
4008 mutex_init(&child->perf_counter_mutex);
4009 INIT_LIST_HEAD(&child->perf_counter_list);
4010
4011 if (likely(!parent->perf_counter_ctxp))
4012 return 0;
4013
4014 /*
4015 * This is executed from the parent task context, so inherit
4016 * counters that have been marked for cloning.
4017 * First allocate and initialize a context for the child.
4018 */
4019
4020 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4021 if (!child_ctx)
4022 return -ENOMEM;
4023
4024 __perf_counter_init_context(child_ctx, child);
4025 child->perf_counter_ctxp = child_ctx;
4026 get_task_struct(child);
4027
4028 /*
4029 * If the parent's context is a clone, pin it so it won't get
4030 * swapped under us.
4031 */
4032 parent_ctx = perf_pin_task_context(parent);
4033
4034 /*
4035 * No need to check if parent_ctx != NULL here; since we saw
4036 * it non-NULL earlier, the only reason for it to become NULL
4037 * is if we exit, and since we're currently in the middle of
4038 * a fork we can't be exiting at the same time.
4039 */
4040
4041 /*
4042 * Lock the parent list. No need to lock the child - not PID
4043 * hashed yet and not running, so nobody can access it.
4044 */
4045 mutex_lock(&parent_ctx->mutex);
4046
4047 /*
4048 * We dont have to disable NMIs - we are only looking at
4049 * the list, not manipulating it:
4050 */
4051 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4052 if (counter != counter->group_leader)
4053 continue;
4054
4055 if (!counter->attr.inherit) {
4056 inherited_all = 0;
4057 continue;
4058 }
4059
4060 ret = inherit_group(counter, parent, parent_ctx,
4061 child, child_ctx);
4062 if (ret) {
4063 inherited_all = 0;
4064 break;
4065 }
4066 }
4067
4068 if (inherited_all) {
4069 /*
4070 * Mark the child context as a clone of the parent
4071 * context, or of whatever the parent is a clone of.
4072 * Note that if the parent is a clone, it could get
4073 * uncloned at any point, but that doesn't matter
4074 * because the list of counters and the generation
4075 * count can't have changed since we took the mutex.
4076 */
4077 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4078 if (cloned_ctx) {
4079 child_ctx->parent_ctx = cloned_ctx;
4080 child_ctx->parent_gen = parent_ctx->parent_gen;
4081 } else {
4082 child_ctx->parent_ctx = parent_ctx;
4083 child_ctx->parent_gen = parent_ctx->generation;
4084 }
4085 get_ctx(child_ctx->parent_ctx);
4086 }
4087
4088 mutex_unlock(&parent_ctx->mutex);
4089
4090 perf_unpin_context(parent_ctx);
4091
4092 return ret;
4093}
4094
4095static void __cpuinit perf_counter_init_cpu(int cpu)
4096{
4097 struct perf_cpu_context *cpuctx;
4098
4099 cpuctx = &per_cpu(perf_cpu_context, cpu);
4100 __perf_counter_init_context(&cpuctx->ctx, NULL);
4101
4102 spin_lock(&perf_resource_lock);
4103 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4104 spin_unlock(&perf_resource_lock);
4105
4106 hw_perf_counter_setup(cpu);
4107}
4108
4109#ifdef CONFIG_HOTPLUG_CPU
4110static void __perf_counter_exit_cpu(void *info)
4111{
4112 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4113 struct perf_counter_context *ctx = &cpuctx->ctx;
4114 struct perf_counter *counter, *tmp;
4115
4116 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4117 __perf_counter_remove_from_context(counter);
4118}
4119static void perf_counter_exit_cpu(int cpu)
4120{
4121 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4122 struct perf_counter_context *ctx = &cpuctx->ctx;
4123
4124 mutex_lock(&ctx->mutex);
4125 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4126 mutex_unlock(&ctx->mutex);
4127}
4128#else
/* Without CONFIG_HOTPLUG_CPU there is nothing to tear down. */
static inline void perf_counter_exit_cpu(int cpu)
{
}
4130#endif
4131
4132static int __cpuinit
4133perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4134{
4135 unsigned int cpu = (long)hcpu;
4136
4137 switch (action) {
4138
4139 case CPU_UP_PREPARE:
4140 case CPU_UP_PREPARE_FROZEN:
4141 perf_counter_init_cpu(cpu);
4142 break;
4143
4144 case CPU_DOWN_PREPARE:
4145 case CPU_DOWN_PREPARE_FROZEN:
4146 perf_counter_exit_cpu(cpu);
4147 break;
4148
4149 default:
4150 break;
4151 }
4152
4153 return NOTIFY_OK;
4154}
4155
4156/*
4157 * This has to have a higher priority than migration_notifier in sched.c.
4158 */
4159static struct notifier_block __cpuinitdata perf_cpu_nb = {
4160 .notifier_call = perf_cpu_notify,
4161 .priority = 20,
4162};
4163
4164void __init perf_counter_init(void)
4165{
4166 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4167 (void *)(long)smp_processor_id());
4168 register_cpu_notifier(&perf_cpu_nb);
4169}
4170
4171static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4172{
4173 return sprintf(buf, "%d\n", perf_reserved_percpu);
4174}
4175
4176static ssize_t
4177perf_set_reserve_percpu(struct sysdev_class *class,
4178 const char *buf,
4179 size_t count)
4180{
4181 struct perf_cpu_context *cpuctx;
4182 unsigned long val;
4183 int err, cpu, mpt;
4184
4185 err = strict_strtoul(buf, 10, &val);
4186 if (err)
4187 return err;
4188 if (val > perf_max_counters)
4189 return -EINVAL;
4190
4191 spin_lock(&perf_resource_lock);
4192 perf_reserved_percpu = val;
4193 for_each_online_cpu(cpu) {
4194 cpuctx = &per_cpu(perf_cpu_context, cpu);
4195 spin_lock_irq(&cpuctx->ctx.lock);
4196 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4197 perf_max_counters - perf_reserved_percpu);
4198 cpuctx->max_pertask = mpt;
4199 spin_unlock_irq(&cpuctx->ctx.lock);
4200 }
4201 spin_unlock(&perf_resource_lock);
4202
4203 return count;
4204}
4205
4206static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4207{
4208 return sprintf(buf, "%d\n", perf_overcommit);
4209}
4210
4211static ssize_t
4212perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4213{
4214 unsigned long val;
4215 int err;
4216
4217 err = strict_strtoul(buf, 10, &val);
4218 if (err)
4219 return err;
4220 if (val > 1)
4221 return -EINVAL;
4222
4223 spin_lock(&perf_resource_lock);
4224 perf_overcommit = val;
4225 spin_unlock(&perf_resource_lock);
4226
4227 return count;
4228}
4229
4230static SYSDEV_CLASS_ATTR(
4231 reserve_percpu,
4232 0644,
4233 perf_show_reserve_percpu,
4234 perf_set_reserve_percpu
4235 );
4236
4237static SYSDEV_CLASS_ATTR(
4238 overcommit,
4239 0644,
4240 perf_show_overcommit,
4241 perf_set_overcommit
4242 );
4243
4244static struct attribute *perfclass_attrs[] = {
4245 &attr_reserve_percpu.attr,
4246 &attr_overcommit.attr,
4247 NULL
4248};
4249
4250static struct attribute_group perfclass_attr_group = {
4251 .attrs = perfclass_attrs,
4252 .name = "perf_counters",
4253};
4254
4255static int __init perf_counter_sysfs_init(void)
4256{
4257 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4258 &perfclass_attr_group);
4259}
4260device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e0409bae..28cf26ad2d24 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,12 +111,6 @@ int __ref profile_init(void)
111 /* only text is profiled */ 111 /* only text is profiled */
112 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
117 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
118 return 0;
119 }
120 114
121 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) 115 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
122 return -ENOMEM; 116 return -ENOMEM;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42c317874cfa..f6d8b8cb5e34 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,16 +25,6 @@
25 25
26 26
27/* 27/*
28 * Initialize a new task whose father had been ptraced.
29 *
30 * Called from copy_process().
31 */
32void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
33{
34 arch_ptrace_fork(child, clone_flags);
35}
36
37/*
38 * ptrace a task: make the debugger its new parent and 28 * ptrace a task: make the debugger its new parent and
39 * move it to the ptrace list. 29 * move it to the ptrace list.
40 * 30 *
@@ -185,10 +175,11 @@ int ptrace_attach(struct task_struct *task)
185 if (same_thread_group(task, current)) 175 if (same_thread_group(task, current))
186 goto out; 176 goto out;
187 177
188 /* Protect exec's credential calculations against our interference; 178 /* Protect the target's credential calculations against our
189 * SUID, SGID and LSM creds get determined differently under ptrace. 179 * interference; SUID, SGID and LSM creds get determined differently
180 * under ptrace.
190 */ 181 */
191 retval = mutex_lock_interruptible(&task->cred_exec_mutex); 182 retval = mutex_lock_interruptible(&task->cred_guard_mutex);
192 if (retval < 0) 183 if (retval < 0)
193 goto out; 184 goto out;
194 185
@@ -232,7 +223,7 @@ repeat:
232bad: 223bad:
233 write_unlock_irqrestore(&tasklist_lock, flags); 224 write_unlock_irqrestore(&tasklist_lock, flags);
234 task_unlock(task); 225 task_unlock(task);
235 mutex_unlock(&task->cred_exec_mutex); 226 mutex_unlock(&task->cred_guard_mutex);
236out: 227out:
237 return retval; 228 return retval;
238} 229}
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index ce97a4df64d3..beb0e659adcc 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)
1356 1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; 1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); 1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; 1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq, 1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, 1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret); 1362 ret);
1363 1363
1364 /*
1365 * Signals would prevent us from sleeping, and we cannot
1366 * do much with them in any case. So flush them.
1367 */
1368 if (ret)
1369 flush_signals(current);
1370 couldsleepnext = 0; 1364 couldsleepnext = 0;
1371 1365
1372 } while (!kthread_should_stop()); 1366 } while (!kthread_should_stop());
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d2a372fb0b9b..0dccfbba6d26 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1259 check_cpu_stall(rsp, rdp); 1259 check_cpu_stall(rsp, rdp);
1260 1260
1261 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1261 /* Is the RCU core waiting for a quiescent state from this CPU? */
1262 if (rdp->qs_pending) 1262 if (rdp->qs_pending) {
1263 rdp->n_rp_qs_pending++;
1263 return 1; 1264 return 1;
1265 }
1264 1266
1265 /* Does this CPU have callbacks ready to invoke? */ 1267 /* Does this CPU have callbacks ready to invoke? */
1266 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1268 if (cpu_has_callbacks_ready_to_invoke(rdp)) {
1269 rdp->n_rp_cb_ready++;
1267 return 1; 1270 return 1;
1271 }
1268 1272
1269 /* Has RCU gone idle with this CPU needing another grace period? */ 1273 /* Has RCU gone idle with this CPU needing another grace period? */
1270 if (cpu_needs_another_gp(rsp, rdp)) 1274 if (cpu_needs_another_gp(rsp, rdp)) {
1275 rdp->n_rp_cpu_needs_gp++;
1271 return 1; 1276 return 1;
1277 }
1272 1278
1273 /* Has another RCU grace period completed? */ 1279 /* Has another RCU grace period completed? */
1274 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ 1280 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
1281 rdp->n_rp_gp_completed++;
1275 return 1; 1282 return 1;
1283 }
1276 1284
1277 /* Has a new RCU grace period started? */ 1285 /* Has a new RCU grace period started? */
1278 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ 1286 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
1287 rdp->n_rp_gp_started++;
1279 return 1; 1288 return 1;
1289 }
1280 1290
1281 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1291 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1282 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1292 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1283 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) 1293 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
1294 rdp->n_rp_need_fqs++;
1284 return 1; 1295 return 1;
1296 }
1285 1297
1286 /* nothing to do */ 1298 /* nothing to do */
1299 rdp->n_rp_need_nothing++;
1287 return 0; 1300 return 0;
1288} 1301}
1289 1302
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b1875ba9404..fe1dcdbf1ca3 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {
213 .release = single_release, 213 .release = single_release,
214}; 214};
215 215
216static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; 216static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
217{
218 seq_printf(m, "%3d%cnp=%ld "
219 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
220 rdp->cpu,
221 cpu_is_offline(rdp->cpu) ? '!' : ' ',
222 rdp->n_rcu_pending,
223 rdp->n_rp_qs_pending,
224 rdp->n_rp_cb_ready,
225 rdp->n_rp_cpu_needs_gp,
226 rdp->n_rp_gp_completed,
227 rdp->n_rp_gp_started,
228 rdp->n_rp_need_fqs,
229 rdp->n_rp_need_nothing);
230}
231
232static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
233{
234 int cpu;
235 struct rcu_data *rdp;
236
237 for_each_possible_cpu(cpu) {
238 rdp = rsp->rda[cpu];
239 if (rdp->beenonline)
240 print_one_rcu_pending(m, rdp);
241 }
242}
243
244static int show_rcu_pending(struct seq_file *m, void *unused)
245{
246 seq_puts(m, "rcu:\n");
247 print_rcu_pendings(m, &rcu_state);
248 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state);
250 return 0;
251}
252
253static int rcu_pending_open(struct inode *inode, struct file *file)
254{
255 return single_open(file, show_rcu_pending, NULL);
256}
257
258static struct file_operations rcu_pending_fops = {
259 .owner = THIS_MODULE,
260 .open = rcu_pending_open,
261 .read = seq_read,
262 .llseek = seq_lseek,
263 .release = single_release,
264};
265
266static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272
217static int __init rcuclassic_trace_init(void) 273static int __init rcuclassic_trace_init(void)
218{ 274{
219 rcudir = debugfs_create_dir("rcu", NULL); 275 rcudir = debugfs_create_dir("rcu", NULL);
@@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)
238 NULL, &rcuhier_fops); 294 NULL, &rcuhier_fops);
239 if (!hierdir) 295 if (!hierdir)
240 goto free_out; 296 goto free_out;
297
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir)
301 goto free_out;
241 return 0; 302 return 0;
242free_out: 303free_out:
243 if (datadir) 304 if (datadir)
@@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)
257 debugfs_remove(datadir_csv); 318 debugfs_remove(datadir_csv);
258 debugfs_remove(gpdir); 319 debugfs_remove(gpdir);
259 debugfs_remove(hierdir); 320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
260 debugfs_remove(rcudir); 322 debugfs_remove(rcudir);
261} 323}
262 324
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ffa..820c5af44f3e 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
300 * assigned pending owner [which might not have taken the 300 * assigned pending owner [which might not have taken the
301 * lock yet]: 301 * lock yet]:
302 */ 302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock) 303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
304{ 305{
305 struct task_struct *pendowner = rt_mutex_owner(lock); 306 struct task_struct *pendowner = rt_mutex_owner(lock);
306 struct rt_mutex_waiter *next; 307 struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
309 if (!rt_mutex_owner_pending(lock)) 310 if (!rt_mutex_owner_pending(lock))
310 return 0; 311 return 0;
311 312
312 if (pendowner == current) 313 if (pendowner == task)
313 return 1; 314 return 1;
314 315
315 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 spin_lock_irqsave(&pendowner->pi_lock, flags);
316 if (current->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
317 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
318 return 0; 319 return 0;
319 } 320 }
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
338 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
339 * enqueued on the pending owners pi_waiters queue. So 340 * enqueued on the pending owners pi_waiters queue. So
340 * we have to enqueue this waiter into 341 * we have to enqueue this waiter into
341 * current->pi_waiters list. This covers the case, 342 * task->pi_waiters list. This covers the case,
342 * where current is boosted because it holds another 343 * where task is boosted because it holds another
343 * lock and gets unboosted because the booster is 344 * lock and gets unboosted because the booster is
344 * interrupted, so we would delay a waiter with higher 345 * interrupted, so we would delay a waiter with higher
345 * priority as current->normal_prio. 346 * priority as task->normal_prio.
346 * 347 *
347 * Note: in the rare case of a SCHED_OTHER task changing 348 * Note: in the rare case of a SCHED_OTHER task changing
348 * its priority and thus stealing the lock, next->task 349 * its priority and thus stealing the lock, next->task
349 * might be current: 350 * might be task:
350 */ 351 */
351 if (likely(next->task != current)) { 352 if (likely(next->task != task)) {
352 spin_lock_irqsave(&current->pi_lock, flags); 353 spin_lock_irqsave(&task->pi_lock, flags);
353 plist_add(&next->pi_list_entry, &current->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
354 __rt_mutex_adjust_prio(current); 355 __rt_mutex_adjust_prio(task);
355 spin_unlock_irqrestore(&current->pi_lock, flags); 356 spin_unlock_irqrestore(&task->pi_lock, flags);
356 } 357 }
357 return 1; 358 return 1;
358} 359}
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
389 */ 390 */
390 mark_rt_mutex_waiters(lock); 391 mark_rt_mutex_waiters(lock);
391 392
392 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) 393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
393 return 0; 394 return 0;
394 395
395 /* We got the lock. */ 396 /* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
411 */ 412 */
412static int task_blocks_on_rt_mutex(struct rt_mutex *lock, 413static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
413 struct rt_mutex_waiter *waiter, 414 struct rt_mutex_waiter *waiter,
415 struct task_struct *task,
414 int detect_deadlock) 416 int detect_deadlock)
415{ 417{
416 struct task_struct *owner = rt_mutex_owner(lock); 418 struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
418 unsigned long flags; 420 unsigned long flags;
419 int chain_walk = 0, res; 421 int chain_walk = 0, res;
420 422
421 spin_lock_irqsave(&current->pi_lock, flags); 423 spin_lock_irqsave(&task->pi_lock, flags);
422 __rt_mutex_adjust_prio(current); 424 __rt_mutex_adjust_prio(task);
423 waiter->task = current; 425 waiter->task = task;
424 waiter->lock = lock; 426 waiter->lock = lock;
425 plist_node_init(&waiter->list_entry, current->prio); 427 plist_node_init(&waiter->list_entry, task->prio);
426 plist_node_init(&waiter->pi_list_entry, current->prio); 428 plist_node_init(&waiter->pi_list_entry, task->prio);
427 429
428 /* Get the top priority waiter on the lock */ 430 /* Get the top priority waiter on the lock */
429 if (rt_mutex_has_waiters(lock)) 431 if (rt_mutex_has_waiters(lock))
430 top_waiter = rt_mutex_top_waiter(lock); 432 top_waiter = rt_mutex_top_waiter(lock);
431 plist_add(&waiter->list_entry, &lock->wait_list); 433 plist_add(&waiter->list_entry, &lock->wait_list);
432 434
433 current->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
434 436
435 spin_unlock_irqrestore(&current->pi_lock, flags); 437 spin_unlock_irqrestore(&task->pi_lock, flags);
436 438
437 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
438 spin_lock_irqsave(&owner->pi_lock, flags); 440 spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
460 spin_unlock(&lock->wait_lock); 462 spin_unlock(&lock->wait_lock);
461 463
462 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
463 current); 465 task);
464 466
465 spin_lock(&lock->wait_lock); 467 spin_lock(&lock->wait_lock);
466 468
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
605 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 607 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
606} 608}
607 609
608/* 610/**
609 * Slow path lock function: 611 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
612 * @lock: the rt_mutex to take
613 * @state: the state the task should block in (TASK_INTERRUPTIBLE
614 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 *
619 * lock->wait_lock must be held by the caller.
610 */ 620 */
611static int __sched 621static int __sched
612rt_mutex_slowlock(struct rt_mutex *lock, int state, 622__rt_mutex_slowlock(struct rt_mutex *lock, int state,
613 struct hrtimer_sleeper *timeout, 623 struct hrtimer_sleeper *timeout,
614 int detect_deadlock) 624 struct rt_mutex_waiter *waiter,
625 int detect_deadlock)
615{ 626{
616 struct rt_mutex_waiter waiter;
617 int ret = 0; 627 int ret = 0;
618 628
619 debug_rt_mutex_init_waiter(&waiter);
620 waiter.task = NULL;
621
622 spin_lock(&lock->wait_lock);
623
624 /* Try to acquire the lock again: */
625 if (try_to_take_rt_mutex(lock)) {
626 spin_unlock(&lock->wait_lock);
627 return 0;
628 }
629
630 set_current_state(state);
631
632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) {
634 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
635 if (!hrtimer_active(&timeout->timer))
636 timeout->task = NULL;
637 }
638
639 for (;;) { 629 for (;;) {
640 /* Try to acquire the lock: */ 630 /* Try to acquire the lock: */
641 if (try_to_take_rt_mutex(lock)) 631 if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
656 } 646 }
657 647
658 /* 648 /*
659 * waiter.task is NULL the first time we come here and 649 * waiter->task is NULL the first time we come here and
660 * when we have been woken up by the previous owner 650 * when we have been woken up by the previous owner
661 * but the lock got stolen by a higher prio task. 651 * but the lock got stolen by a higher prio task.
662 */ 652 */
663 if (!waiter.task) { 653 if (!waiter->task) {
664 ret = task_blocks_on_rt_mutex(lock, &waiter, 654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
665 detect_deadlock); 655 detect_deadlock);
666 /* 656 /*
667 * If we got woken up by the owner then start loop 657 * If we got woken up by the owner then start loop
668 * all over without going into schedule to try 658 * all over without going into schedule to try
669 * to get the lock now: 659 * to get the lock now:
670 */ 660 */
671 if (unlikely(!waiter.task)) { 661 if (unlikely(!waiter->task)) {
672 /* 662 /*
673 * Reset the return value. We might 663 * Reset the return value. We might
674 * have returned with -EDEADLK and the 664 * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
684 674
685 spin_unlock(&lock->wait_lock); 675 spin_unlock(&lock->wait_lock);
686 676
687 debug_rt_mutex_print_deadlock(&waiter); 677 debug_rt_mutex_print_deadlock(waiter);
688 678
689 if (waiter.task) 679 if (waiter->task)
690 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
691 681
692 spin_lock(&lock->wait_lock); 682 spin_lock(&lock->wait_lock);
693 set_current_state(state); 683 set_current_state(state);
694 } 684 }
695 685
686 return ret;
687}
688
689/*
690 * Slow path lock function:
691 */
692static int __sched
693rt_mutex_slowlock(struct rt_mutex *lock, int state,
694 struct hrtimer_sleeper *timeout,
695 int detect_deadlock)
696{
697 struct rt_mutex_waiter waiter;
698 int ret = 0;
699
700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702
703 spin_lock(&lock->wait_lock);
704
705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock);
708 return 0;
709 }
710
711 set_current_state(state);
712
713 /* Setup the timer, when timeout != NULL */
714 if (unlikely(timeout)) {
715 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
716 if (!hrtimer_active(&timeout->timer))
717 timeout->task = NULL;
718 }
719
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
721 detect_deadlock);
722
696 set_current_state(TASK_RUNNING); 723 set_current_state(TASK_RUNNING);
697 724
698 if (unlikely(waiter.task)) 725 if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
864EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 891EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
865 892
866/** 893/**
867 * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible 894 * rt_mutex_timed_lock - lock a rt_mutex interruptible
868 * the timeout structure is provided 895 * the timeout structure is provided
869 * by the caller 896 * by the caller
870 * 897 *
871 * @lock: the rt_mutex to be locked 898 * @lock: the rt_mutex to be locked
872 * @timeout: timeout structure or NULL (no timeout) 899 * @timeout: timeout structure or NULL (no timeout)
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
913} 940}
914EXPORT_SYMBOL_GPL(rt_mutex_unlock); 941EXPORT_SYMBOL_GPL(rt_mutex_unlock);
915 942
916/*** 943/**
917 * rt_mutex_destroy - mark a mutex unusable 944 * rt_mutex_destroy - mark a mutex unusable
918 * @lock: the mutex to be destroyed 945 * @lock: the mutex to be destroyed
919 * 946 *
@@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
986} 1013}
987 1014
988/** 1015/**
1016 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1017 * @lock: the rt_mutex to take
1018 * @waiter: the pre-initialized rt_mutex_waiter
1019 * @task: the task to prepare
1020 * @detect_deadlock: perform deadlock detection (1) or not (0)
1021 *
1022 * Returns:
1023 * 0 - task blocked on lock
1024 * 1 - acquired the lock for task, caller should wake it up
1025 * <0 - error
1026 *
1027 * Special API call for FUTEX_REQUEUE_PI support.
1028 */
1029int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1030 struct rt_mutex_waiter *waiter,
1031 struct task_struct *task, int detect_deadlock)
1032{
1033 int ret;
1034
1035 spin_lock(&lock->wait_lock);
1036
1037 mark_rt_mutex_waiters(lock);
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task; drop wait_lock before returning. */
1041 debug_rt_mutex_lock(lock);
1042
1043 rt_mutex_set_owner(lock, task, 0);
1044 spin_unlock(&lock->wait_lock);
1045 rt_mutex_deadlock_account_lock(lock, task);
1046 return 1;
1047 }
1048
1049 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1050
1051
1052 if (ret && !waiter->task) {
1053 /*
1054 * Reset the return value. We might have
1055 * returned with -EDEADLK and the owner
1056 * released the lock while we were walking the
1057 * pi chain. Let the waiter sort it out.
1058 */
1059 ret = 0;
1060 }
1061 spin_unlock(&lock->wait_lock);
1062
1063 debug_rt_mutex_print_deadlock(waiter);
1064
1065 return ret;
1066}
1067
1068/**
989 * rt_mutex_next_owner - return the next owner of the lock 1069 * rt_mutex_next_owner - return the next owner of the lock
990 * 1070 *
991 * @lock: the rt lock query 1071 * @lock: the rt lock query
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1004 1084
1005 return rt_mutex_top_waiter(lock)->task; 1085 return rt_mutex_top_waiter(lock)->task;
1006} 1086}
1087
1088/**
1089 * rt_mutex_finish_proxy_lock() - Complete lock acquisition
1090 * @lock: the rt_mutex we were woken on
1091 * @to: the timeout, null if none. hrtimer should already have
1092 * been started.
1093 * @waiter: the pre-initialized rt_mutex_waiter
1094 * @detect_deadlock: perform deadlock detection (1) or not (0)
1095 *
1096 * Complete the lock acquisition started on our behalf by another thread.
1097 *
1098 * Returns:
1099 * 0 - success
1100 * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
1101 *
1102 * Special API call for PI-futex requeue support
1103 */
1104int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1105 struct hrtimer_sleeper *to,
1106 struct rt_mutex_waiter *waiter,
1107 int detect_deadlock)
1108{
1109 int ret;
1110
1111 spin_lock(&lock->wait_lock);
1112
1113 set_current_state(TASK_INTERRUPTIBLE);
1114
1115 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
1116 detect_deadlock);
1117
1118 set_current_state(TASK_RUNNING);
1119
1120 if (unlikely(waiter->task))
1121 remove_waiter(lock, waiter);
1122
1123 /*
1124 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
1125 * have to fix that up.
1126 */
1127 fixup_rt_mutex_waiters(lock);
1128
1129 spin_unlock(&lock->wait_lock);
1130
1131 /*
1132 * Readjust priority, when we did not get the lock. We might have been
1133 * the pending owner and boosted. Since we did not take the lock, the
1134 * PI boost has to go.
1135 */
1136 if (unlikely(ret))
1137 rt_mutex_adjust_prio(current);
1138
1139 return ret;
1140}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5800ea..97a2f81866af 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 122 struct task_struct *proxy_owner);
123extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
124 struct rt_mutex_waiter *waiter,
125 struct task_struct *task,
126 int detect_deadlock);
127extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
128 struct hrtimer_sleeper *to,
129 struct rt_mutex_waiter *waiter,
130 int detect_deadlock);
123 131
124#ifdef CONFIG_DEBUG_RT_MUTEXES 132#ifdef CONFIG_DEBUG_RT_MUTEXES
125# include "rtmutex-debug.h" 133# include "rtmutex-debug.h"
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..f04aa9664504 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h>
42#include <linux/security.h> 43#include <linux/security.h>
43#include <linux/notifier.h> 44#include <linux/notifier.h>
44#include <linux/profile.h> 45#include <linux/profile.h>
@@ -68,17 +69,18 @@
68#include <linux/pagemap.h> 69#include <linux/pagemap.h>
69#include <linux/hrtimer.h> 70#include <linux/hrtimer.h>
70#include <linux/tick.h> 71#include <linux/tick.h>
71#include <linux/bootmem.h>
72#include <linux/debugfs.h> 72#include <linux/debugfs.h>
73#include <linux/ctype.h> 73#include <linux/ctype.h>
74#include <linux/ftrace.h> 74#include <linux/ftrace.h>
75#include <trace/sched.h>
76 75
77#include <asm/tlb.h> 76#include <asm/tlb.h>
78#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
79 78
80#include "sched_cpupri.h" 79#include "sched_cpupri.h"
81 80
81#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h>
83
82/* 84/*
83 * Convert user-nice values [ -20 ... 0 ... 19 ] 85 * Convert user-nice values [ -20 ... 0 ... 19 ]
84 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 86 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
118 */ 120 */
119#define RUNTIME_INF ((u64)~0ULL) 121#define RUNTIME_INF ((u64)~0ULL)
120 122
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
127#ifdef CONFIG_SMP 123#ifdef CONFIG_SMP
128 124
129static void double_rq_lock(struct rq *rq1, struct rq *rq2); 125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -584,6 +580,7 @@ struct rq {
584 struct load_weight load; 580 struct load_weight load;
585 unsigned long nr_load_updates; 581 unsigned long nr_load_updates;
586 u64 nr_switches; 582 u64 nr_switches;
583 u64 nr_migrations_in;
587 584
588 struct cfs_rq cfs; 585 struct cfs_rq cfs;
589 struct rt_rq rt; 586 struct rt_rq rt;
@@ -630,6 +627,10 @@ struct rq {
630 struct list_head migration_queue; 627 struct list_head migration_queue;
631#endif 628#endif
632 629
630 /* calc_load related fields */
631 unsigned long calc_load_update;
632 long calc_load_active;
633
633#ifdef CONFIG_SCHED_HRTICK 634#ifdef CONFIG_SCHED_HRTICK
634#ifdef CONFIG_SMP 635#ifdef CONFIG_SMP
635 int hrtick_csd_pending; 636 int hrtick_csd_pending;
@@ -692,7 +693,7 @@ static inline int cpu_of(struct rq *rq)
692#define task_rq(p) cpu_rq(task_cpu(p)) 693#define task_rq(p) cpu_rq(task_cpu(p))
693#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 694#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
694 695
695static inline void update_rq_clock(struct rq *rq) 696inline void update_rq_clock(struct rq *rq)
696{ 697{
697 rq->clock = sched_clock_cpu(cpu_of(rq)); 698 rq->clock = sched_clock_cpu(cpu_of(rq));
698} 699}
@@ -1728,6 +1729,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1728} 1729}
1729#endif 1730#endif
1730 1731
1732static void calc_load_account_active(struct rq *this_rq);
1733
1731#include "sched_stats.h" 1734#include "sched_stats.h"
1732#include "sched_idletask.c" 1735#include "sched_idletask.c"
1733#include "sched_fair.c" 1736#include "sched_fair.c"
@@ -1958,7 +1961,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1958 1961
1959 clock_offset = old_rq->clock - new_rq->clock; 1962 clock_offset = old_rq->clock - new_rq->clock;
1960 1963
1961 trace_sched_migrate_task(p, task_cpu(p), new_cpu); 1964 trace_sched_migrate_task(p, new_cpu);
1962 1965
1963#ifdef CONFIG_SCHEDSTATS 1966#ifdef CONFIG_SCHEDSTATS
1964 if (p->se.wait_start) 1967 if (p->se.wait_start)
@@ -1967,12 +1970,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1967 p->se.sleep_start -= clock_offset; 1970 p->se.sleep_start -= clock_offset;
1968 if (p->se.block_start) 1971 if (p->se.block_start)
1969 p->se.block_start -= clock_offset; 1972 p->se.block_start -= clock_offset;
1973#endif
1970 if (old_cpu != new_cpu) { 1974 if (old_cpu != new_cpu) {
1971 schedstat_inc(p, se.nr_migrations); 1975 p->se.nr_migrations++;
1976 new_rq->nr_migrations_in++;
1977#ifdef CONFIG_SCHEDSTATS
1972 if (task_hot(p, old_rq->clock, NULL)) 1978 if (task_hot(p, old_rq->clock, NULL))
1973 schedstat_inc(p, se.nr_forced2_migrations); 1979 schedstat_inc(p, se.nr_forced2_migrations);
1974 }
1975#endif 1980#endif
1981 perf_counter_task_migration(p, new_cpu);
1982 }
1976 p->se.vruntime -= old_cfsrq->min_vruntime - 1983 p->se.vruntime -= old_cfsrq->min_vruntime -
1977 new_cfsrq->min_vruntime; 1984 new_cfsrq->min_vruntime;
1978 1985
@@ -2015,6 +2022,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2015} 2022}
2016 2023
2017/* 2024/*
2025 * wait_task_context_switch - wait for a thread to complete at least one
2026 * context switch.
2027 *
2028 * @p must not be current.
2029 */
2030void wait_task_context_switch(struct task_struct *p)
2031{
2032 unsigned long nvcsw, nivcsw, flags;
2033 int running;
2034 struct rq *rq;
2035
2036 nvcsw = p->nvcsw;
2037 nivcsw = p->nivcsw;
2038 for (;;) {
2039 /*
2040 * The runqueue is assigned before the actual context
2041 * switch. We need to take the runqueue lock.
2042 *
2043 * We could check initially without the lock but it is
2044 * very likely that we need to take the lock in every
2045 * iteration.
2046 */
2047 rq = task_rq_lock(p, &flags);
2048 running = task_running(rq, p);
2049 task_rq_unlock(rq, &flags);
2050
2051 if (likely(!running))
2052 break;
2053 /*
2054 * The switch count is incremented before the actual
2055 * context switch. We thus wait for two switches to be
2056 * sure at least one completed.
2057 */
2058 if ((p->nvcsw - nvcsw) > 1)
2059 break;
2060 if ((p->nivcsw - nivcsw) > 1)
2061 break;
2062
2063 cpu_relax();
2064 }
2065}
2066
2067/*
2018 * wait_task_inactive - wait for a thread to unschedule. 2068 * wait_task_inactive - wait for a thread to unschedule.
2019 * 2069 *
2020 * If @match_state is nonzero, it's the @p->state value just checked and 2070 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2324,6 +2374,27 @@ static int sched_balance_self(int cpu, int flag)
2324 2374
2325#endif /* CONFIG_SMP */ 2375#endif /* CONFIG_SMP */
2326 2376
2377/**
2378 * task_oncpu_function_call - call a function on the cpu on which a task runs
2379 * @p: the task to evaluate
2380 * @func: the function to be called
2381 * @info: the function call argument
2382 *
2383 * Calls the function @func when the task is currently running. This might
2384 * be on the current CPU, which just calls the function directly
2385 */
2386void task_oncpu_function_call(struct task_struct *p,
2387 void (*func) (void *info), void *info)
2388{
2389 int cpu;
2390
2391 preempt_disable();
2392 cpu = task_cpu(p);
2393 if (task_curr(p))
2394 smp_call_function_single(cpu, func, info, 1);
2395 preempt_enable();
2396}
2397
2327/*** 2398/***
2328 * try_to_wake_up - wake up a thread 2399 * try_to_wake_up - wake up a thread
2329 * @p: the to-be-woken-up thread 2400 * @p: the to-be-woken-up thread
@@ -2458,6 +2529,17 @@ out:
2458 return success; 2529 return success;
2459} 2530}
2460 2531
2532/**
2533 * wake_up_process - Wake up a specific process
2534 * @p: The process to be woken up.
2535 *
2536 * Attempt to wake up the nominated process and move it to the set of runnable
2537 * processes. Returns 1 if the process was woken up, 0 if it was already
2538 * running.
2539 *
2540 * It may be assumed that this function implies a write memory barrier before
2541 * changing the task state if and only if any tasks are woken up.
2542 */
2461int wake_up_process(struct task_struct *p) 2543int wake_up_process(struct task_struct *p)
2462{ 2544{
2463 return try_to_wake_up(p, TASK_ALL, 0); 2545 return try_to_wake_up(p, TASK_ALL, 0);
@@ -2480,6 +2562,7 @@ static void __sched_fork(struct task_struct *p)
2480 p->se.exec_start = 0; 2562 p->se.exec_start = 0;
2481 p->se.sum_exec_runtime = 0; 2563 p->se.sum_exec_runtime = 0;
2482 p->se.prev_sum_exec_runtime = 0; 2564 p->se.prev_sum_exec_runtime = 0;
2565 p->se.nr_migrations = 0;
2483 p->se.last_wakeup = 0; 2566 p->se.last_wakeup = 0;
2484 p->se.avg_overlap = 0; 2567 p->se.avg_overlap = 0;
2485 p->se.start_runtime = 0; 2568 p->se.start_runtime = 0;
@@ -2710,6 +2793,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2710 */ 2793 */
2711 prev_state = prev->state; 2794 prev_state = prev->state;
2712 finish_arch_switch(prev); 2795 finish_arch_switch(prev);
2796 perf_counter_task_sched_in(current, cpu_of(rq));
2713 finish_lock_switch(rq, prev); 2797 finish_lock_switch(rq, prev);
2714#ifdef CONFIG_SMP 2798#ifdef CONFIG_SMP
2715 if (post_schedule) 2799 if (post_schedule)
@@ -2766,7 +2850,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2766 * combine the page table reload and the switch backend into 2850 * combine the page table reload and the switch backend into
2767 * one hypercall. 2851 * one hypercall.
2768 */ 2852 */
2769 arch_enter_lazy_cpu_mode(); 2853 arch_start_context_switch(prev);
2770 2854
2771 if (unlikely(!mm)) { 2855 if (unlikely(!mm)) {
2772 next->active_mm = oldmm; 2856 next->active_mm = oldmm;
@@ -2856,19 +2940,81 @@ unsigned long nr_iowait(void)
2856 return sum; 2940 return sum;
2857} 2941}
2858 2942
2859unsigned long nr_active(void) 2943/* Variables and functions for calc_load */
2944static atomic_long_t calc_load_tasks;
2945static unsigned long calc_load_update;
2946unsigned long avenrun[3];
2947EXPORT_SYMBOL(avenrun);
2948
2949/**
2950 * get_avenrun - get the load average array
2951 * @loads: pointer to dest load array
2952 * @offset: offset to add
2953 * @shift: shift count to shift the result left
2954 *
2955 * These values are estimates at best, so no need for locking.
2956 */
2957void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2860{ 2958{
2861 unsigned long i, running = 0, uninterruptible = 0; 2959 loads[0] = (avenrun[0] + offset) << shift;
2960 loads[1] = (avenrun[1] + offset) << shift;
2961 loads[2] = (avenrun[2] + offset) << shift;
2962}
2862 2963
2863 for_each_online_cpu(i) { 2964static unsigned long
2864 running += cpu_rq(i)->nr_running; 2965calc_load(unsigned long load, unsigned long exp, unsigned long active)
2865 uninterruptible += cpu_rq(i)->nr_uninterruptible; 2966{
2866 } 2967 load *= exp;
2968 load += active * (FIXED_1 - exp);
2969 return load >> FSHIFT;
2970}
2867 2971
2868 if (unlikely((long)uninterruptible < 0)) 2972/*
2869 uninterruptible = 0; 2973 * calc_global_load - update the avenrun load estimates 10 ticks after the
2974 * CPUs have updated calc_load_tasks.
2975 */
2976void calc_global_load(void)
2977{
2978 unsigned long upd = calc_load_update + 10;
2979 long active;
2870 2980
2871 return running + uninterruptible; 2981 if (time_before(jiffies, upd))
2982 return;
2983
2984 active = atomic_long_read(&calc_load_tasks);
2985 active = active > 0 ? active * FIXED_1 : 0;
2986
2987 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2988 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2989 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2990
2991 calc_load_update += LOAD_FREQ;
2992}
2993
2994/*
2995 * Either called from update_cpu_load() or from a cpu going idle
2996 */
2997static void calc_load_account_active(struct rq *this_rq)
2998{
2999 long nr_active, delta;
3000
3001 nr_active = this_rq->nr_running;
3002 nr_active += (long) this_rq->nr_uninterruptible;
3003
3004 if (nr_active != this_rq->calc_load_active) {
3005 delta = nr_active - this_rq->calc_load_active;
3006 this_rq->calc_load_active = nr_active;
3007 atomic_long_add(delta, &calc_load_tasks);
3008 }
3009}
3010
3011/*
3012 * Externally visible per-cpu scheduler statistics:
3013 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3014 */
3015u64 cpu_nr_migrations(int cpu)
3016{
3017 return cpu_rq(cpu)->nr_migrations_in;
2872} 3018}
2873 3019
2874/* 3020/*
@@ -2899,6 +3045,11 @@ static void update_cpu_load(struct rq *this_rq)
2899 new_load += scale-1; 3045 new_load += scale-1;
2900 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3046 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2901 } 3047 }
3048
3049 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3050 this_rq->calc_load_update += LOAD_FREQ;
3051 calc_load_account_active(this_rq);
3052 }
2902} 3053}
2903 3054
2904#ifdef CONFIG_SMP 3055#ifdef CONFIG_SMP
@@ -4240,10 +4391,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4240static struct { 4391static struct {
4241 atomic_t load_balancer; 4392 atomic_t load_balancer;
4242 cpumask_var_t cpu_mask; 4393 cpumask_var_t cpu_mask;
4394 cpumask_var_t ilb_grp_nohz_mask;
4243} nohz ____cacheline_aligned = { 4395} nohz ____cacheline_aligned = {
4244 .load_balancer = ATOMIC_INIT(-1), 4396 .load_balancer = ATOMIC_INIT(-1),
4245}; 4397};
4246 4398
4399#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4400/**
4401 * lowest_flag_domain - Return lowest sched_domain containing flag.
4402 * @cpu: The cpu whose lowest level of sched domain is to
4403 * be returned.
4404 * @flag: The flag to check for the lowest sched_domain
4405 * for the given cpu.
4406 *
4407 * Returns the lowest sched_domain of a cpu which contains the given flag.
4408 */
4409static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4410{
4411 struct sched_domain *sd;
4412
4413 for_each_domain(cpu, sd)
4414 if (sd && (sd->flags & flag))
4415 break;
4416
4417 return sd;
4418}
4419
4420/**
4421 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4422 * @cpu: The cpu whose domains we're iterating over.
4423 * @sd: variable holding the value of the power_savings_sd
4424 * for cpu.
4425 * @flag: The flag to filter the sched_domains to be iterated.
4426 *
4427 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4428 * set, starting from the lowest sched_domain to the highest.
4429 */
4430#define for_each_flag_domain(cpu, sd, flag) \
4431 for (sd = lowest_flag_domain(cpu, flag); \
4432 (sd && (sd->flags & flag)); sd = sd->parent)
4433
4434/**
4435 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4436 * @ilb_group: group to be checked for semi-idleness
4437 *
4438 * Returns: 1 if the group is semi-idle. 0 otherwise.
4439 *
4440 * We define a sched_group to be semi idle if it has at least one idle-CPU
4441 * and at least one non-idle CPU. This helper function checks if the given
4442 * sched_group is semi-idle or not.
4443 */
4444static inline int is_semi_idle_group(struct sched_group *ilb_group)
4445{
4446 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4447 sched_group_cpus(ilb_group));
4448
4449 /*
4450 * A sched_group is semi-idle when it has at least one busy cpu
4451 * and at least one idle cpu.
4452 */
4453 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4454 return 0;
4455
4456 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4457 return 0;
4458
4459 return 1;
4460}
4461/**
4462 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4463 * @cpu: The cpu which is nominating a new idle_load_balancer.
4464 *
4465 * Returns: Returns the id of the idle load balancer if it exists,
4466 * Else, returns >= nr_cpu_ids.
4467 *
4468 * This algorithm picks the idle load balancer such that it belongs to a
4469 * semi-idle powersavings sched_domain. The idea is to try and avoid
4470 * completely idle packages/cores just for the purpose of idle load balancing
4471 * when there are other idle cpu's which are better suited for that job.
4472 */
4473static int find_new_ilb(int cpu)
4474{
4475 struct sched_domain *sd;
4476 struct sched_group *ilb_group;
4477
4478 /*
4479 * Have idle load balancer selection from semi-idle packages only
4480 * when power-aware load balancing is enabled
4481 */
4482 if (!(sched_smt_power_savings || sched_mc_power_savings))
4483 goto out_done;
4484
4485 /*
4486 * Optimize for the case when we have no idle CPUs or only one
4487 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4488 */
4489 if (cpumask_weight(nohz.cpu_mask) < 2)
4490 goto out_done;
4491
4492 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4493 ilb_group = sd->groups;
4494
4495 do {
4496 if (is_semi_idle_group(ilb_group))
4497 return cpumask_first(nohz.ilb_grp_nohz_mask);
4498
4499 ilb_group = ilb_group->next;
4500
4501 } while (ilb_group != sd->groups);
4502 }
4503
4504out_done:
4505 return cpumask_first(nohz.cpu_mask);
4506}
4507#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4508static inline int find_new_ilb(int call_cpu)
4509{
4510 return cpumask_first(nohz.cpu_mask);
4511}
4512#endif
4513
4247/* 4514/*
4248 * This routine will try to nominate the ilb (idle load balancing) 4515 * This routine will try to nominate the ilb (idle load balancing)
4249 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 4516 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4565,24 @@ int select_nohz_load_balancer(int stop_tick)
4298 /* make me the ilb owner */ 4565 /* make me the ilb owner */
4299 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 4566 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4300 return 1; 4567 return 1;
4301 } else if (atomic_read(&nohz.load_balancer) == cpu) 4568 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4569 int new_ilb;
4570
4571 if (!(sched_smt_power_savings ||
4572 sched_mc_power_savings))
4573 return 1;
4574 /*
4575 * Check to see if there is a more power-efficient
4576 * ilb.
4577 */
4578 new_ilb = find_new_ilb(cpu);
4579 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4580 atomic_set(&nohz.load_balancer, -1);
4581 resched_cpu(new_ilb);
4582 return 0;
4583 }
4302 return 1; 4584 return 1;
4585 }
4303 } else { 4586 } else {
4304 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 4587 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4305 return 0; 4588 return 0;
@@ -4468,15 +4751,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4468 } 4751 }
4469 4752
4470 if (atomic_read(&nohz.load_balancer) == -1) { 4753 if (atomic_read(&nohz.load_balancer) == -1) {
4471 /* 4754 int ilb = find_new_ilb(cpu);
4472 * simple selection for now: Nominate the
4473 * first cpu in the nohz list to be the next
4474 * ilb owner.
4475 *
4476 * TBD: Traverse the sched domains and nominate
4477 * the nearest cpu in the nohz.cpu_mask.
4478 */
4479 int ilb = cpumask_first(nohz.cpu_mask);
4480 4755
4481 if (ilb < nr_cpu_ids) 4756 if (ilb < nr_cpu_ids)
4482 resched_cpu(ilb); 4757 resched_cpu(ilb);
@@ -4840,6 +5115,8 @@ void scheduler_tick(void)
4840 curr->sched_class->task_tick(rq, curr, 0); 5115 curr->sched_class->task_tick(rq, curr, 0);
4841 spin_unlock(&rq->lock); 5116 spin_unlock(&rq->lock);
4842 5117
5118 perf_counter_task_tick(curr, cpu);
5119
4843#ifdef CONFIG_SMP 5120#ifdef CONFIG_SMP
4844 rq->idle_at_tick = idle_cpu(cpu); 5121 rq->idle_at_tick = idle_cpu(cpu);
4845 trigger_load_balance(rq, cpu); 5122 trigger_load_balance(rq, cpu);
@@ -5007,13 +5284,15 @@ pick_next_task(struct rq *rq)
5007/* 5284/*
5008 * schedule() is the main scheduler function. 5285 * schedule() is the main scheduler function.
5009 */ 5286 */
5010asmlinkage void __sched __schedule(void) 5287asmlinkage void __sched schedule(void)
5011{ 5288{
5012 struct task_struct *prev, *next; 5289 struct task_struct *prev, *next;
5013 unsigned long *switch_count; 5290 unsigned long *switch_count;
5014 struct rq *rq; 5291 struct rq *rq;
5015 int cpu; 5292 int cpu;
5016 5293
5294need_resched:
5295 preempt_disable();
5017 cpu = smp_processor_id(); 5296 cpu = smp_processor_id();
5018 rq = cpu_rq(cpu); 5297 rq = cpu_rq(cpu);
5019 rcu_qsctr_inc(cpu); 5298 rcu_qsctr_inc(cpu);
@@ -5053,6 +5332,7 @@ need_resched_nonpreemptible:
5053 5332
5054 if (likely(prev != next)) { 5333 if (likely(prev != next)) {
5055 sched_info_switch(prev, next); 5334 sched_info_switch(prev, next);
5335 perf_counter_task_sched_out(prev, next, cpu);
5056 5336
5057 rq->nr_switches++; 5337 rq->nr_switches++;
5058 rq->curr = next; 5338 rq->curr = next;
@@ -5070,15 +5350,9 @@ need_resched_nonpreemptible:
5070 5350
5071 if (unlikely(reacquire_kernel_lock(current) < 0)) 5351 if (unlikely(reacquire_kernel_lock(current) < 0))
5072 goto need_resched_nonpreemptible; 5352 goto need_resched_nonpreemptible;
5073}
5074 5353
5075asmlinkage void __sched schedule(void)
5076{
5077need_resched:
5078 preempt_disable();
5079 __schedule();
5080 preempt_enable_no_resched(); 5354 preempt_enable_no_resched();
5081 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 5355 if (need_resched())
5082 goto need_resched; 5356 goto need_resched;
5083} 5357}
5084EXPORT_SYMBOL(schedule); 5358EXPORT_SYMBOL(schedule);
@@ -5221,7 +5495,7 @@ EXPORT_SYMBOL(default_wake_function);
5221 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5495 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5222 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5496 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5223 */ 5497 */
5224void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5498static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5225 int nr_exclusive, int sync, void *key) 5499 int nr_exclusive, int sync, void *key)
5226{ 5500{
5227 wait_queue_t *curr, *next; 5501 wait_queue_t *curr, *next;
@@ -5241,6 +5515,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5241 * @mode: which threads 5515 * @mode: which threads
5242 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5516 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5243 * @key: is directly passed to the wakeup function 5517 * @key: is directly passed to the wakeup function
5518 *
5519 * It may be assumed that this function implies a write memory barrier before
5520 * changing the task state if and only if any tasks are woken up.
5244 */ 5521 */
5245void __wake_up(wait_queue_head_t *q, unsigned int mode, 5522void __wake_up(wait_queue_head_t *q, unsigned int mode,
5246 int nr_exclusive, void *key) 5523 int nr_exclusive, void *key)
@@ -5279,6 +5556,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5279 * with each other. This can prevent needless bouncing between CPUs. 5556 * with each other. This can prevent needless bouncing between CPUs.
5280 * 5557 *
5281 * On UP it can prevent extra preemption. 5558 * On UP it can prevent extra preemption.
5559 *
5560 * It may be assumed that this function implies a write memory barrier before
5561 * changing the task state if and only if any tasks are woken up.
5282 */ 5562 */
5283void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 5563void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5284 int nr_exclusive, void *key) 5564 int nr_exclusive, void *key)
@@ -5315,6 +5595,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5315 * awakened in the same order in which they were queued. 5595 * awakened in the same order in which they were queued.
5316 * 5596 *
5317 * See also complete_all(), wait_for_completion() and related routines. 5597 * See also complete_all(), wait_for_completion() and related routines.
5598 *
5599 * It may be assumed that this function implies a write memory barrier before
5600 * changing the task state if and only if any tasks are woken up.
5318 */ 5601 */
5319void complete(struct completion *x) 5602void complete(struct completion *x)
5320{ 5603{
@@ -5332,6 +5615,9 @@ EXPORT_SYMBOL(complete);
5332 * @x: holds the state of this particular completion 5615 * @x: holds the state of this particular completion
5333 * 5616 *
5334 * This will wake up all threads waiting on this particular completion event. 5617 * This will wake up all threads waiting on this particular completion event.
5618 *
5619 * It may be assumed that this function implies a write memory barrier before
5620 * changing the task state if and only if any tasks are woken up.
5335 */ 5621 */
5336void complete_all(struct completion *x) 5622void complete_all(struct completion *x)
5337{ 5623{
@@ -6490,8 +6776,9 @@ void sched_show_task(struct task_struct *p)
6490#ifdef CONFIG_DEBUG_STACK_USAGE 6776#ifdef CONFIG_DEBUG_STACK_USAGE
6491 free = stack_not_used(p); 6777 free = stack_not_used(p);
6492#endif 6778#endif
6493 printk(KERN_CONT "%5lu %5d %6d\n", free, 6779 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6494 task_pid_nr(p), task_pid_nr(p->real_parent)); 6780 task_pid_nr(p), task_pid_nr(p->real_parent),
6781 (unsigned long)task_thread_info(p)->flags);
6495 6782
6496 show_stack(p, NULL); 6783 show_stack(p, NULL);
6497} 6784}
@@ -6970,6 +7257,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6970 7257
6971 } 7258 }
6972} 7259}
7260
7261/*
7262 * remove the tasks which were accounted by rq from calc_load_tasks.
7263 */
7264static void calc_global_load_remove(struct rq *rq)
7265{
7266 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7267}
6973#endif /* CONFIG_HOTPLUG_CPU */ 7268#endif /* CONFIG_HOTPLUG_CPU */
6974 7269
6975#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 7270#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7499,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7204 /* Update our root-domain */ 7499 /* Update our root-domain */
7205 rq = cpu_rq(cpu); 7500 rq = cpu_rq(cpu);
7206 spin_lock_irqsave(&rq->lock, flags); 7501 spin_lock_irqsave(&rq->lock, flags);
7502 rq->calc_load_update = calc_load_update;
7503 rq->calc_load_active = 0;
7207 if (rq->rd) { 7504 if (rq->rd) {
7208 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7505 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7209 7506
@@ -7243,7 +7540,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7243 cpuset_unlock(); 7540 cpuset_unlock();
7244 migrate_nr_uninterruptible(rq); 7541 migrate_nr_uninterruptible(rq);
7245 BUG_ON(rq->nr_running != 0); 7542 BUG_ON(rq->nr_running != 0);
7246 7543 calc_global_load_remove(rq);
7247 /* 7544 /*
7248 * No need to migrate the tasks: it was best-effort if 7545 * No need to migrate the tasks: it was best-effort if
7249 * they didn't take sched_hotcpu_mutex. Just wake up 7546 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7279,8 +7576,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7279 return NOTIFY_OK; 7576 return NOTIFY_OK;
7280} 7577}
7281 7578
7282/* Register at highest priority so that task migration (migrate_all_tasks) 7579/*
7283 * happens before everything else. 7580 * Register at high priority so that task migration (migrate_all_tasks)
7581 * happens before everything else. This has to be lower priority than
7582 * the notifier in the perf_counter subsystem, though.
7284 */ 7583 */
7285static struct notifier_block __cpuinitdata migration_notifier = { 7584static struct notifier_block __cpuinitdata migration_notifier = {
7286 .notifier_call = migration_call, 7585 .notifier_call = migration_call,
@@ -7525,24 +7824,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7525 7824
7526static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7825static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
7527{ 7826{
7827 gfp_t gfp = GFP_KERNEL;
7828
7528 memset(rd, 0, sizeof(*rd)); 7829 memset(rd, 0, sizeof(*rd));
7529 7830
7530 if (bootmem) { 7831 if (bootmem)
7531 alloc_bootmem_cpumask_var(&def_root_domain.span); 7832 gfp = GFP_NOWAIT;
7532 alloc_bootmem_cpumask_var(&def_root_domain.online);
7533 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
7534 cpupri_init(&rd->cpupri, true);
7535 return 0;
7536 }
7537 7833
7538 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 7834 if (!alloc_cpumask_var(&rd->span, gfp))
7539 goto out; 7835 goto out;
7540 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 7836 if (!alloc_cpumask_var(&rd->online, gfp))
7541 goto free_span; 7837 goto free_span;
7542 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 7838 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
7543 goto free_online; 7839 goto free_online;
7544 7840
7545 if (cpupri_init(&rd->cpupri, false) != 0) 7841 if (cpupri_init(&rd->cpupri, bootmem) != 0)
7546 goto free_rto_mask; 7842 goto free_rto_mask;
7547 return 0; 7843 return 0;
7548 7844
@@ -7753,8 +8049,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7753 8049
7754/* 8050/*
7755 * The cpus mask in sched_group and sched_domain hangs off the end. 8051 * The cpus mask in sched_group and sched_domain hangs off the end.
7756 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space 8052 *
7757 * for nr_cpu_ids < CONFIG_NR_CPUS. 8053 * ( See the comments in include/linux/sched.h:struct sched_group
8054 * and struct sched_domain. )
7758 */ 8055 */
7759struct static_sched_group { 8056struct static_sched_group {
7760 struct sched_group sg; 8057 struct sched_group sg;
@@ -7875,7 +8172,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7875 struct sched_domain *sd; 8172 struct sched_domain *sd;
7876 8173
7877 sd = &per_cpu(phys_domains, j).sd; 8174 sd = &per_cpu(phys_domains, j).sd;
7878 if (j != cpumask_first(sched_group_cpus(sd->groups))) { 8175 if (j != group_first_cpu(sd->groups)) {
7879 /* 8176 /*
7880 * Only add "power" once for each 8177 * Only add "power" once for each
7881 * physical package. 8178 * physical package.
@@ -7953,7 +8250,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7953 8250
7954 WARN_ON(!sd || !sd->groups); 8251 WARN_ON(!sd || !sd->groups);
7955 8252
7956 if (cpu != cpumask_first(sched_group_cpus(sd->groups))) 8253 if (cpu != group_first_cpu(sd->groups))
7957 return; 8254 return;
7958 8255
7959 child = sd->child; 8256 child = sd->child;
@@ -8865,7 +9162,7 @@ void __init sched_init(void)
8865 * we use alloc_bootmem(). 9162 * we use alloc_bootmem().
8866 */ 9163 */
8867 if (alloc_size) { 9164 if (alloc_size) {
8868 ptr = (unsigned long)alloc_bootmem(alloc_size); 9165 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8869 9166
8870#ifdef CONFIG_FAIR_GROUP_SCHED 9167#ifdef CONFIG_FAIR_GROUP_SCHED
8871 init_task_group.se = (struct sched_entity **)ptr; 9168 init_task_group.se = (struct sched_entity **)ptr;
@@ -8938,6 +9235,8 @@ void __init sched_init(void)
8938 rq = cpu_rq(i); 9235 rq = cpu_rq(i);
8939 spin_lock_init(&rq->lock); 9236 spin_lock_init(&rq->lock);
8940 rq->nr_running = 0; 9237 rq->nr_running = 0;
9238 rq->calc_load_active = 0;
9239 rq->calc_load_update = jiffies + LOAD_FREQ;
8941 init_cfs_rq(&rq->cfs, rq); 9240 init_cfs_rq(&rq->cfs, rq);
8942 init_rt_rq(&rq->rt, rq); 9241 init_rt_rq(&rq->rt, rq);
8943#ifdef CONFIG_FAIR_GROUP_SCHED 9242#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8958,7 +9257,7 @@ void __init sched_init(void)
8958 * 1024) and two child groups A0 and A1 (of weight 1024 each), 9257 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8959 * then A0's share of the cpu resource is: 9258 * then A0's share of the cpu resource is:
8960 * 9259 *
8961 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 9260 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8962 * 9261 *
8963 * We achieve this by letting init_task_group's tasks sit 9262 * We achieve this by letting init_task_group's tasks sit
8964 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 9263 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9045,20 +9344,26 @@ void __init sched_init(void)
9045 * when this runqueue becomes "idle". 9344 * when this runqueue becomes "idle".
9046 */ 9345 */
9047 init_idle(current, smp_processor_id()); 9346 init_idle(current, smp_processor_id());
9347
9348 calc_load_update = jiffies + LOAD_FREQ;
9349
9048 /* 9350 /*
9049 * During early bootup we pretend to be a normal task: 9351 * During early bootup we pretend to be a normal task:
9050 */ 9352 */
9051 current->sched_class = &fair_sched_class; 9353 current->sched_class = &fair_sched_class;
9052 9354
9053 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9355 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9054 alloc_bootmem_cpumask_var(&nohz_cpu_mask); 9356 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9055#ifdef CONFIG_SMP 9357#ifdef CONFIG_SMP
9056#ifdef CONFIG_NO_HZ 9358#ifdef CONFIG_NO_HZ
9057 alloc_bootmem_cpumask_var(&nohz.cpu_mask); 9359 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9360 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9058#endif 9361#endif
9059 alloc_bootmem_cpumask_var(&cpu_isolated_map); 9362 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9060#endif /* SMP */ 9363#endif /* SMP */
9061 9364
9365 perf_counter_init();
9366
9062 scheduler_running = 1; 9367 scheduler_running = 1;
9063} 9368}
9064 9369
@@ -9800,6 +10105,13 @@ static int sched_rt_global_constraints(void)
9800 if (sysctl_sched_rt_period <= 0) 10105 if (sysctl_sched_rt_period <= 0)
9801 return -EINVAL; 10106 return -EINVAL;
9802 10107
10108 /*
10109 * There's always some RT tasks in the root group
10110 * -- migration, kstopmachine etc..
10111 */
10112 if (sysctl_sched_rt_runtime == 0)
10113 return -EBUSY;
10114
9803 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10115 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9804 for_each_possible_cpu(i) { 10116 for_each_possible_cpu(i) {
9805 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10117 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 344712a5e3ed..7deffc9f0e5f 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -154,8 +154,12 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
154 */ 154 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
156{ 156{
157 gfp_t gfp = GFP_KERNEL;
157 int i; 158 int i;
158 159
160 if (bootmem)
161 gfp = GFP_NOWAIT;
162
159 memset(cp, 0, sizeof(*cp)); 163 memset(cp, 0, sizeof(*cp));
160 164
161 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 165 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
163 167
164 spin_lock_init(&vec->lock); 168 spin_lock_init(&vec->lock);
165 vec->count = 0; 169 vec->count = 0;
166 if (bootmem) 170 if (!zalloc_cpumask_var(&vec->mask, gfp))
167 alloc_bootmem_cpumask_var(&vec->mask);
168 else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
169 goto cleanup; 171 goto cleanup;
170 } 172 }
171 173
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119..5f9650e8fe75 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1487 1487
1488 find_matching_se(&se, &pse); 1488 find_matching_se(&se, &pse);
1489 1489
1490 while (se) { 1490 BUG_ON(!pse);
1491 BUG_ON(!pse);
1492 1491
1493 if (wakeup_preempt_entity(se, pse) == 1) { 1492 if (wakeup_preempt_entity(se, pse) == 1)
1494 resched_task(curr); 1493 resched_task(curr);
1495 break;
1496 }
1497
1498 se = parent_entity(se);
1499 pse = parent_entity(pse);
1500 }
1501} 1494}
1502 1495
1503static struct task_struct *pick_next_task_fair(struct rq *rq) 1496static struct task_struct *pick_next_task_fair(struct rq *rq)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
22static struct task_struct *pick_next_task_idle(struct rq *rq) 22static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 23{
24 schedstat_inc(rq, sched_goidle); 24 schedstat_inc(rq, sched_goidle);
25 25 /* adjust the active tasks as we might go into a long sleep */
26 calc_load_account_active(rq);
26 return rq->idle; 27 return rq->idle;
27} 28}
28 29
diff --git a/kernel/signal.c b/kernel/signal.c
index d8034737db4c..809a228019ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,7 @@
27#include <linux/freezer.h> 27#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <trace/sched.h> 30#include <trace/events/sched.h>
31 31
32#include <asm/param.h> 32#include <asm/param.h>
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
@@ -41,8 +41,6 @@
41 41
42static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
43 43
44DEFINE_TRACE(sched_signal_send);
45
46static void __user *sig_handler(struct task_struct *t, int sig) 44static void __user *sig_handler(struct task_struct *t, int sig)
47{ 45{
48 return t->sighand->action[sig - 1].sa.sa_handler; 46 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -249,14 +247,19 @@ void flush_sigqueue(struct sigpending *queue)
249/* 247/*
250 * Flush all pending signals for a task. 248 * Flush all pending signals for a task.
251 */ 249 */
250void __flush_signals(struct task_struct *t)
251{
252 clear_tsk_thread_flag(t, TIF_SIGPENDING);
253 flush_sigqueue(&t->pending);
254 flush_sigqueue(&t->signal->shared_pending);
255}
256
252void flush_signals(struct task_struct *t) 257void flush_signals(struct task_struct *t)
253{ 258{
254 unsigned long flags; 259 unsigned long flags;
255 260
256 spin_lock_irqsave(&t->sighand->siglock, flags); 261 spin_lock_irqsave(&t->sighand->siglock, flags);
257 clear_tsk_thread_flag(t, TIF_SIGPENDING); 262 __flush_signals(t);
258 flush_sigqueue(&t->pending);
259 flush_sigqueue(&t->signal->shared_pending);
260 spin_unlock_irqrestore(&t->sighand->siglock, flags); 263 spin_unlock_irqrestore(&t->sighand->siglock, flags);
261} 264}
262 265
@@ -2278,24 +2281,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2278 return kill_something_info(sig, &info, pid); 2281 return kill_something_info(sig, &info, pid);
2279} 2282}
2280 2283
2281static int do_tkill(pid_t tgid, pid_t pid, int sig) 2284static int
2285do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2282{ 2286{
2283 int error;
2284 struct siginfo info;
2285 struct task_struct *p; 2287 struct task_struct *p;
2286 unsigned long flags; 2288 unsigned long flags;
2287 2289 int error = -ESRCH;
2288 error = -ESRCH;
2289 info.si_signo = sig;
2290 info.si_errno = 0;
2291 info.si_code = SI_TKILL;
2292 info.si_pid = task_tgid_vnr(current);
2293 info.si_uid = current_uid();
2294 2290
2295 rcu_read_lock(); 2291 rcu_read_lock();
2296 p = find_task_by_vpid(pid); 2292 p = find_task_by_vpid(pid);
2297 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { 2293 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2298 error = check_kill_permission(sig, &info, p); 2294 error = check_kill_permission(sig, info, p);
2299 /* 2295 /*
2300 * The null signal is a permissions and process existence 2296 * The null signal is a permissions and process existence
2301 * probe. No signal is actually delivered. 2297 * probe. No signal is actually delivered.
@@ -2305,7 +2301,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2305 * signal is private anyway. 2301 * signal is private anyway.
2306 */ 2302 */
2307 if (!error && sig && lock_task_sighand(p, &flags)) { 2303 if (!error && sig && lock_task_sighand(p, &flags)) {
2308 error = specific_send_sig_info(sig, &info, p); 2304 error = specific_send_sig_info(sig, info, p);
2309 unlock_task_sighand(p, &flags); 2305 unlock_task_sighand(p, &flags);
2310 } 2306 }
2311 } 2307 }
@@ -2314,6 +2310,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2314 return error; 2310 return error;
2315} 2311}
2316 2312
2313static int do_tkill(pid_t tgid, pid_t pid, int sig)
2314{
2315 struct siginfo info;
2316
2317 info.si_signo = sig;
2318 info.si_errno = 0;
2319 info.si_code = SI_TKILL;
2320 info.si_pid = task_tgid_vnr(current);
2321 info.si_uid = current_uid();
2322
2323 return do_send_specific(tgid, pid, sig, &info);
2324}
2325
2317/** 2326/**
2318 * sys_tgkill - send signal to one specific thread 2327 * sys_tgkill - send signal to one specific thread
2319 * @tgid: the thread group ID of the thread 2328 * @tgid: the thread group ID of the thread
@@ -2363,6 +2372,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2363 return kill_proc_info(sig, &info, pid); 2372 return kill_proc_info(sig, &info, pid);
2364} 2373}
2365 2374
2375long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2376{
2377 /* This is only valid for single tasks */
2378 if (pid <= 0 || tgid <= 0)
2379 return -EINVAL;
2380
2381 /* Not even root can pretend to send signals from the kernel.
2382 Nor can they impersonate a kill(), which adds source info. */
2383 if (info->si_code >= 0)
2384 return -EPERM;
2385 info->si_signo = sig;
2386
2387 return do_send_specific(tgid, pid, sig, info);
2388}
2389
2390SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2391 siginfo_t __user *, uinfo)
2392{
2393 siginfo_t info;
2394
2395 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2396 return -EFAULT;
2397
2398 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2399}
2400
2366int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 2401int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2367{ 2402{
2368 struct task_struct *t = current; 2403 struct task_struct *t = current;
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b28d19135f43..521ed2004d63 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -372,8 +372,8 @@ static int slow_work_thread(void *_data)
372 vsmax *= atomic_read(&slow_work_thread_count); 372 vsmax *= atomic_read(&slow_work_thread_count);
373 vsmax /= 100; 373 vsmax /= 100;
374 374
375 prepare_to_wait(&slow_work_thread_wq, &wait, 375 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
376 TASK_INTERRUPTIBLE); 376 TASK_INTERRUPTIBLE);
377 if (!freezing(current) && 377 if (!freezing(current) &&
378 !slow_work_threads_should_exit && 378 !slow_work_threads_should_exit &&
379 !slow_work_available(vsmax) && 379 !slow_work_available(vsmax) &&
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b525dd348511..258885a543db 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,7 +24,9 @@
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/tick.h> 26#include <linux/tick.h>
27#include <trace/irq.h> 27
28#define CREATE_TRACE_POINTS
29#include <trace/events/irq.h>
28 30
29#include <asm/irq.h> 31#include <asm/irq.h>
30/* 32/*
@@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);
186 */ 188 */
187#define MAX_SOFTIRQ_RESTART 10 189#define MAX_SOFTIRQ_RESTART 10
188 190
189DEFINE_TRACE(softirq_entry);
190DEFINE_TRACE(softirq_exit);
191
192asmlinkage void __do_softirq(void) 191asmlinkage void __do_softirq(void)
193{ 192{
194 struct softirq_action *h; 193 struct softirq_action *h;
@@ -828,7 +827,7 @@ int __init __weak arch_early_irq_init(void)
828 return 0; 827 return 0;
829} 828}
830 829
831int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) 830int __weak arch_init_chip_data(struct irq_desc *desc, int node)
832{ 831{
833 return 0; 832 return 0;
834} 833}
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf31498..438d99a38c87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h>
17#include <linux/resource.h> 18#include <linux/resource.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1793,6 +1794,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1793 case PR_SET_TSC: 1794 case PR_SET_TSC:
1794 error = SET_TSC_CTL(arg2); 1795 error = SET_TSC_CTL(arg2);
1795 break; 1796 break;
1797 case PR_TASK_PERF_COUNTERS_DISABLE:
1798 error = perf_counter_task_disable();
1799 break;
1800 case PR_TASK_PERF_COUNTERS_ENABLE:
1801 error = perf_counter_task_enable();
1802 break;
1796 case PR_GET_TIMERSLACK: 1803 case PR_GET_TIMERSLACK:
1797 error = current->timer_slack_ns; 1804 error = current->timer_slack_ns;
1798 break; 1805 break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
175cond_syscall(compat_sys_timerfd_gettime); 175cond_syscall(compat_sys_timerfd_gettime);
176cond_syscall(sys_eventfd); 176cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 177cond_syscall(sys_eventfd2);
178
179/* performance counters: */
180cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2970d56fb76..ce664f98e3fb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
49#include <linux/reboot.h> 49#include <linux/reboot.h>
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h> 51#include <linux/slow-work.h>
52#include <linux/perf_counter.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/processor.h> 55#include <asm/processor.h>
@@ -114,6 +115,7 @@ static int ngroups_max = NGROUPS_MAX;
114 115
115#ifdef CONFIG_MODULES 116#ifdef CONFIG_MODULES
116extern char modprobe_path[]; 117extern char modprobe_path[];
118extern int modules_disabled;
117#endif 119#endif
118#ifdef CONFIG_CHR_DEV_SG 120#ifdef CONFIG_CHR_DEV_SG
119extern int sg_big_buff; 121extern int sg_big_buff;
@@ -534,6 +536,17 @@ static struct ctl_table kern_table[] = {
534 .proc_handler = &proc_dostring, 536 .proc_handler = &proc_dostring,
535 .strategy = &sysctl_string, 537 .strategy = &sysctl_string,
536 }, 538 },
539 {
540 .ctl_name = CTL_UNNUMBERED,
541 .procname = "modules_disabled",
542 .data = &modules_disabled,
543 .maxlen = sizeof(int),
544 .mode = 0644,
545 /* only handle a transition from default "0" to "1" */
546 .proc_handler = &proc_dointvec_minmax,
547 .extra1 = &one,
548 .extra2 = &one,
549 },
537#endif 550#endif
538#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 551#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
539 { 552 {
@@ -731,6 +744,14 @@ static struct ctl_table kern_table[] = {
731 }, 744 },
732 { 745 {
733 .ctl_name = CTL_UNNUMBERED, 746 .ctl_name = CTL_UNNUMBERED,
747 .procname = "bootloader_version",
748 .data = &bootloader_version,
749 .maxlen = sizeof (int),
750 .mode = 0444,
751 .proc_handler = &proc_dointvec,
752 },
753 {
754 .ctl_name = CTL_UNNUMBERED,
734 .procname = "kstack_depth_to_print", 755 .procname = "kstack_depth_to_print",
735 .data = &kstack_depth_to_print, 756 .data = &kstack_depth_to_print,
736 .maxlen = sizeof(int), 757 .maxlen = sizeof(int),
@@ -912,6 +933,32 @@ static struct ctl_table kern_table[] = {
912 .child = slow_work_sysctls, 933 .child = slow_work_sysctls,
913 }, 934 },
914#endif 935#endif
936#ifdef CONFIG_PERF_COUNTERS
937 {
938 .ctl_name = CTL_UNNUMBERED,
939 .procname = "perf_counter_paranoid",
940 .data = &sysctl_perf_counter_paranoid,
941 .maxlen = sizeof(sysctl_perf_counter_paranoid),
942 .mode = 0644,
943 .proc_handler = &proc_dointvec,
944 },
945 {
946 .ctl_name = CTL_UNNUMBERED,
947 .procname = "perf_counter_mlock_kb",
948 .data = &sysctl_perf_counter_mlock,
949 .maxlen = sizeof(sysctl_perf_counter_mlock),
950 .mode = 0644,
951 .proc_handler = &proc_dointvec,
952 },
953 {
954 .ctl_name = CTL_UNNUMBERED,
955 .procname = "perf_counter_max_sample_rate",
956 .data = &sysctl_perf_counter_sample_rate,
957 .maxlen = sizeof(sysctl_perf_counter_sample_rate),
958 .mode = 0644,
959 .proc_handler = &proc_dointvec,
960 },
961#endif
915/* 962/*
916 * NOTE: do not add new entries to this table unless you have read 963 * NOTE: do not add new entries to this table unless you have read
917 * Documentation/sysctl/ctl_unnumbered.txt 964 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1225,7 +1272,6 @@ static struct ctl_table vm_table[] = {
1225 .strategy = &sysctl_jiffies, 1272 .strategy = &sysctl_jiffies,
1226 }, 1273 },
1227#endif 1274#endif
1228#ifdef CONFIG_SECURITY
1229 { 1275 {
1230 .ctl_name = CTL_UNNUMBERED, 1276 .ctl_name = CTL_UNNUMBERED,
1231 .procname = "mmap_min_addr", 1277 .procname = "mmap_min_addr",
@@ -1234,7 +1280,6 @@ static struct ctl_table vm_table[] = {
1234 .mode = 0644, 1280 .mode = 0644,
1235 .proc_handler = &proc_doulongvec_minmax, 1281 .proc_handler = &proc_doulongvec_minmax,
1236 }, 1282 },
1237#endif
1238#ifdef CONFIG_NUMA 1283#ifdef CONFIG_NUMA
1239 { 1284 {
1240 .ctl_name = CTL_UNNUMBERED, 1285 .ctl_name = CTL_UNNUMBERED,
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ecfd7b5187e0..80189f6f1c5a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c)
402 unsigned long flags; 402 unsigned long flags;
403 int ret; 403 int ret;
404 404
405 /* save mult_orig on registration */
406 c->mult_orig = c->mult;
407
408 spin_lock_irqsave(&clocksource_lock, flags); 405 spin_lock_irqsave(&clocksource_lock, flags);
409 ret = clocksource_enqueue(c); 406 ret = clocksource_enqueue(c);
410 if (!ret) 407 if (!ret)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..e8c77d9c633a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
22 22
23/* 23/*
24 * This read-write spinlock protects us from races in SMP while 24 * This read-write spinlock protects us from races in SMP while
25 * playing with xtime and avenrun. 25 * playing with xtime.
26 */ 26 */
27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); 27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
28 28
@@ -77,6 +77,10 @@ static void clocksource_forward_now(void)
77 clock->cycle_last = cycle_now; 77 clock->cycle_last = cycle_now;
78 78
79 nsec = cyc2ns(clock, cycle_delta); 79 nsec = cyc2ns(clock, cycle_delta);
80
81 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset();
83
80 timespec_add_ns(&xtime, nsec); 84 timespec_add_ns(&xtime, nsec);
81 85
82 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
@@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts)
111 /* convert to nanoseconds: */ 115 /* convert to nanoseconds: */
112 nsecs = cyc2ns(clock, cycle_delta); 116 nsecs = cyc2ns(clock, cycle_delta);
113 117
118 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset();
120
114 } while (read_seqretry(&xtime_lock, seq)); 121 } while (read_seqretry(&xtime_lock, seq));
115 122
116 timespec_add_ns(ts, nsecs); 123 timespec_add_ns(ts, nsecs);
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..c01e568935ea 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -1123,53 +1124,14 @@ void update_process_times(int user_tick)
1123} 1124}
1124 1125
1125/* 1126/*
1126 * Nr of active tasks - counted in fixed-point numbers
1127 */
1128static unsigned long count_active_tasks(void)
1129{
1130 return nr_active() * FIXED_1;
1131}
1132
1133/*
1134 * Hmm.. Changed this, as the GNU make sources (load.c) seems to
1135 * imply that avenrun[] is the standard name for this kind of thing.
1136 * Nothing else seems to be standardized: the fractional size etc
1137 * all seem to differ on different machines.
1138 *
1139 * Requires xtime_lock to access.
1140 */
1141unsigned long avenrun[3];
1142
1143EXPORT_SYMBOL(avenrun);
1144
1145/*
1146 * calc_load - given tick count, update the avenrun load estimates.
1147 * This is called while holding a write_lock on xtime_lock.
1148 */
1149static inline void calc_load(unsigned long ticks)
1150{
1151 unsigned long active_tasks; /* fixed-point */
1152 static int count = LOAD_FREQ;
1153
1154 count -= ticks;
1155 if (unlikely(count < 0)) {
1156 active_tasks = count_active_tasks();
1157 do {
1158 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1159 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1160 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
1161 count += LOAD_FREQ;
1162 } while (count < 0);
1163 }
1164}
1165
1166/*
1167 * This function runs timers and the timer-tq in bottom half context. 1127 * This function runs timers and the timer-tq in bottom half context.
1168 */ 1128 */
1169static void run_timer_softirq(struct softirq_action *h) 1129static void run_timer_softirq(struct softirq_action *h)
1170{ 1130{
1171 struct tvec_base *base = __get_cpu_var(tvec_bases); 1131 struct tvec_base *base = __get_cpu_var(tvec_bases);
1172 1132
1133 perf_counter_do_pending();
1134
1173 hrtimer_run_pending(); 1135 hrtimer_run_pending();
1174 1136
1175 if (time_after_eq(jiffies, base->timer_jiffies)) 1137 if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1187,16 +1149,6 @@ void run_local_timers(void)
1187} 1149}
1188 1150
1189/* 1151/*
1190 * Called by the timer interrupt. xtime_lock must already be taken
1191 * by the timer IRQ!
1192 */
1193static inline void update_times(unsigned long ticks)
1194{
1195 update_wall_time();
1196 calc_load(ticks);
1197}
1198
1199/*
1200 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1152 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1201 * without sampling the sequence number in xtime_lock. 1153 * without sampling the sequence number in xtime_lock.
1202 * jiffies is defined in the linker script... 1154 * jiffies is defined in the linker script...
@@ -1205,7 +1157,8 @@ static inline void update_times(unsigned long ticks)
1205void do_timer(unsigned long ticks) 1157void do_timer(unsigned long ticks)
1206{ 1158{
1207 jiffies_64 += ticks; 1159 jiffies_64 += ticks;
1208 update_times(ticks); 1160 update_wall_time();
1161 calc_global_load();
1209} 1162}
1210 1163
1211#ifdef __ARCH_WANT_SYS_ALARM 1164#ifdef __ARCH_WANT_SYS_ALARM
@@ -1406,37 +1359,17 @@ int do_sysinfo(struct sysinfo *info)
1406{ 1359{
1407 unsigned long mem_total, sav_total; 1360 unsigned long mem_total, sav_total;
1408 unsigned int mem_unit, bitcount; 1361 unsigned int mem_unit, bitcount;
1409 unsigned long seq; 1362 struct timespec tp;
1410 1363
1411 memset(info, 0, sizeof(struct sysinfo)); 1364 memset(info, 0, sizeof(struct sysinfo));
1412 1365
1413 do { 1366 ktime_get_ts(&tp);
1414 struct timespec tp; 1367 monotonic_to_bootbased(&tp);
1415 seq = read_seqbegin(&xtime_lock); 1368 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1416
1417 /*
1418 * This is annoying. The below is the same thing
1419 * posix_get_clock_monotonic() does, but it wants to
1420 * take the lock which we want to cover the loads stuff
1421 * too.
1422 */
1423
1424 getnstimeofday(&tp);
1425 tp.tv_sec += wall_to_monotonic.tv_sec;
1426 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1427 monotonic_to_bootbased(&tp);
1428 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1429 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1430 tp.tv_sec++;
1431 }
1432 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1433 1369
1434 info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); 1370 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1435 info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1436 info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1437 1371
1438 info->procs = nr_threads; 1372 info->procs = nr_threads;
1439 } while (read_seqretry(&xtime_lock, seq));
1440 1373
1441 si_meminfo(info); 1374 si_meminfo(info);
1442 si_swapinfo(info); 1375 si_swapinfo(info);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 417d1985e299..4a13e5a01ce3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -48,6 +48,21 @@ config FTRACE_NMI_ENTER
48 depends on HAVE_FTRACE_NMI_ENTER 48 depends on HAVE_FTRACE_NMI_ENTER
49 default y 49 default y
50 50
51config EVENT_TRACING
52 select CONTEXT_SWITCH_TRACER
53 bool
54
55config CONTEXT_SWITCH_TRACER
56 select MARKERS
57 bool
58
59# All tracer options should select GENERIC_TRACER. For those options that are
60# enabled by all tracers (context switch and event tracer) they select TRACING.
61# This allows those options to appear when no other tracer is selected. But the
62# options do not appear when something else selects it. We need the two options
63# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
64# hiding of the automatic options.
65
51config TRACING 66config TRACING
52 bool 67 bool
53 select DEBUG_FS 68 select DEBUG_FS
@@ -56,6 +71,11 @@ config TRACING
56 select TRACEPOINTS 71 select TRACEPOINTS
57 select NOP_TRACER 72 select NOP_TRACER
58 select BINARY_PRINTF 73 select BINARY_PRINTF
74 select EVENT_TRACING
75
76config GENERIC_TRACER
77 bool
78 select TRACING
59 79
60# 80#
61# Minimum requirements an architecture has to meet for us to 81# Minimum requirements an architecture has to meet for us to
@@ -73,14 +93,20 @@ config TRACING_SUPPORT
73 93
74if TRACING_SUPPORT 94if TRACING_SUPPORT
75 95
76menu "Tracers" 96menuconfig FTRACE
97 bool "Tracers"
98 default y if DEBUG_KERNEL
99 help
100 Enable the kernel tracing infrastructure.
101
102if FTRACE
77 103
78config FUNCTION_TRACER 104config FUNCTION_TRACER
79 bool "Kernel Function Tracer" 105 bool "Kernel Function Tracer"
80 depends on HAVE_FUNCTION_TRACER 106 depends on HAVE_FUNCTION_TRACER
81 select FRAME_POINTER 107 select FRAME_POINTER
82 select KALLSYMS 108 select KALLSYMS
83 select TRACING 109 select GENERIC_TRACER
84 select CONTEXT_SWITCH_TRACER 110 select CONTEXT_SWITCH_TRACER
85 help 111 help
86 Enable the kernel to trace every kernel function. This is done 112 Enable the kernel to trace every kernel function. This is done
@@ -104,13 +130,14 @@ config FUNCTION_GRAPH_TRACER
104 the return value. This is done by setting the current return 130 the return value. This is done by setting the current return
105 address on the current task structure into a stack of calls. 131 address on the current task structure into a stack of calls.
106 132
133
107config IRQSOFF_TRACER 134config IRQSOFF_TRACER
108 bool "Interrupts-off Latency Tracer" 135 bool "Interrupts-off Latency Tracer"
109 default n 136 default n
110 depends on TRACE_IRQFLAGS_SUPPORT 137 depends on TRACE_IRQFLAGS_SUPPORT
111 depends on GENERIC_TIME 138 depends on GENERIC_TIME
112 select TRACE_IRQFLAGS 139 select TRACE_IRQFLAGS
113 select TRACING 140 select GENERIC_TRACER
114 select TRACER_MAX_TRACE 141 select TRACER_MAX_TRACE
115 help 142 help
116 This option measures the time spent in irqs-off critical 143 This option measures the time spent in irqs-off critical
@@ -131,7 +158,7 @@ config PREEMPT_TRACER
131 default n 158 default n
132 depends on GENERIC_TIME 159 depends on GENERIC_TIME
133 depends on PREEMPT 160 depends on PREEMPT
134 select TRACING 161 select GENERIC_TRACER
135 select TRACER_MAX_TRACE 162 select TRACER_MAX_TRACE
136 help 163 help
137 This option measures the time spent in preemption off critical 164 This option measures the time spent in preemption off critical
@@ -150,7 +177,7 @@ config PREEMPT_TRACER
150config SYSPROF_TRACER 177config SYSPROF_TRACER
151 bool "Sysprof Tracer" 178 bool "Sysprof Tracer"
152 depends on X86 179 depends on X86
153 select TRACING 180 select GENERIC_TRACER
154 select CONTEXT_SWITCH_TRACER 181 select CONTEXT_SWITCH_TRACER
155 help 182 help
156 This tracer provides the trace needed by the 'Sysprof' userspace 183 This tracer provides the trace needed by the 'Sysprof' userspace
@@ -158,40 +185,33 @@ config SYSPROF_TRACER
158 185
159config SCHED_TRACER 186config SCHED_TRACER
160 bool "Scheduling Latency Tracer" 187 bool "Scheduling Latency Tracer"
161 select TRACING 188 select GENERIC_TRACER
162 select CONTEXT_SWITCH_TRACER 189 select CONTEXT_SWITCH_TRACER
163 select TRACER_MAX_TRACE 190 select TRACER_MAX_TRACE
164 help 191 help
165 This tracer tracks the latency of the highest priority task 192 This tracer tracks the latency of the highest priority task
166 to be scheduled in, starting from the point it has woken up. 193 to be scheduled in, starting from the point it has woken up.
167 194
168config CONTEXT_SWITCH_TRACER 195config ENABLE_DEFAULT_TRACERS
169 bool "Trace process context switches" 196 bool "Trace process context switches and events"
170 select TRACING 197 depends on !GENERIC_TRACER
171 select MARKERS
172 help
173 This tracer gets called from the context switch and records
174 all switching of tasks.
175
176config EVENT_TRACER
177 bool "Trace various events in the kernel"
178 select TRACING 198 select TRACING
179 help 199 help
180 This tracer hooks to various trace points in the kernel 200 This tracer hooks to various trace points in the kernel
181 allowing the user to pick and choose which trace point they 201 allowing the user to pick and choose which trace point they
182 want to trace. 202 want to trace. It also includes the sched_switch tracer plugin.
183 203
184config FTRACE_SYSCALLS 204config FTRACE_SYSCALLS
185 bool "Trace syscalls" 205 bool "Trace syscalls"
186 depends on HAVE_FTRACE_SYSCALLS 206 depends on HAVE_FTRACE_SYSCALLS
187 select TRACING 207 select GENERIC_TRACER
188 select KALLSYMS 208 select KALLSYMS
189 help 209 help
190 Basic tracer to catch the syscall entry and exit events. 210 Basic tracer to catch the syscall entry and exit events.
191 211
192config BOOT_TRACER 212config BOOT_TRACER
193 bool "Trace boot initcalls" 213 bool "Trace boot initcalls"
194 select TRACING 214 select GENERIC_TRACER
195 select CONTEXT_SWITCH_TRACER 215 select CONTEXT_SWITCH_TRACER
196 help 216 help
197 This tracer helps developers to optimize boot times: it records 217 This tracer helps developers to optimize boot times: it records
@@ -207,8 +227,36 @@ config BOOT_TRACER
207 to enable this on bootup. 227 to enable this on bootup.
208 228
209config TRACE_BRANCH_PROFILING 229config TRACE_BRANCH_PROFILING
230 bool
231 select GENERIC_TRACER
232
233choice
234 prompt "Branch Profiling"
235 default BRANCH_PROFILE_NONE
236 help
237 The branch profiling is a software profiler. It will add hooks
238 into the C conditionals to test which path a branch takes.
239
240 The likely/unlikely profiler only looks at the conditions that
241 are annotated with a likely or unlikely macro.
242
243 The "all branch" profiler will profile every if statement in the
244 kernel. This profiler will also enable the likely/unlikely
245 profiler as well.
246
247 Either of the above profilers add a bit of overhead to the system.
248 If unsure choose "No branch profiling".
249
250config BRANCH_PROFILE_NONE
251 bool "No branch profiling"
252 help
253 No branch profiling. Branch profiling adds a bit of overhead.
254 Only enable it if you want to analyse the branching behavior.
255 Otherwise keep it disabled.
256
257config PROFILE_ANNOTATED_BRANCHES
210 bool "Trace likely/unlikely profiler" 258 bool "Trace likely/unlikely profiler"
211 select TRACING 259 select TRACE_BRANCH_PROFILING
212 help 260 help
213 This tracer profiles all the likely and unlikely macros 261 This tracer profiles all the likely and unlikely macros
214 in the kernel. It will display the results in: 262 in the kernel. It will display the results in:
@@ -218,11 +266,9 @@ config TRACE_BRANCH_PROFILING
218 Note: this will add a significant overhead, only turn this 266 Note: this will add a significant overhead, only turn this
219 on if you need to profile the system's use of these macros. 267 on if you need to profile the system's use of these macros.
220 268
221 Say N if unsure.
222
223config PROFILE_ALL_BRANCHES 269config PROFILE_ALL_BRANCHES
224 bool "Profile all if conditionals" 270 bool "Profile all if conditionals"
225 depends on TRACE_BRANCH_PROFILING 271 select TRACE_BRANCH_PROFILING
226 help 272 help
227 This tracer profiles all branch conditions. Every if () 273 This tracer profiles all branch conditions. Every if ()
228 taken in the kernel is recorded whether it hit or miss. 274 taken in the kernel is recorded whether it hit or miss.
@@ -230,11 +276,12 @@ config PROFILE_ALL_BRANCHES
230 276
231 /debugfs/tracing/profile_branch 277 /debugfs/tracing/profile_branch
232 278
279 This option also enables the likely/unlikely profiler.
280
233 This configuration, when enabled, will impose a great overhead 281 This configuration, when enabled, will impose a great overhead
234 on the system. This should only be enabled when the system 282 on the system. This should only be enabled when the system
235 is to be analyzed 283 is to be analyzed
236 284endchoice
237 Say N if unsure.
238 285
239config TRACING_BRANCHES 286config TRACING_BRANCHES
240 bool 287 bool
@@ -261,7 +308,7 @@ config BRANCH_TRACER
261config POWER_TRACER 308config POWER_TRACER
262 bool "Trace power consumption behavior" 309 bool "Trace power consumption behavior"
263 depends on X86 310 depends on X86
264 select TRACING 311 select GENERIC_TRACER
265 help 312 help
266 This tracer helps developers to analyze and optimize the kernels 313 This tracer helps developers to analyze and optimize the kernels
267 power management decisions, specifically the C-state and P-state 314 power management decisions, specifically the C-state and P-state
@@ -295,14 +342,14 @@ config STACK_TRACER
295config HW_BRANCH_TRACER 342config HW_BRANCH_TRACER
296 depends on HAVE_HW_BRANCH_TRACER 343 depends on HAVE_HW_BRANCH_TRACER
297 bool "Trace hw branches" 344 bool "Trace hw branches"
298 select TRACING 345 select GENERIC_TRACER
299 help 346 help
300 This tracer records all branches on the system in a circular 347 This tracer records all branches on the system in a circular
301 buffer giving access to the last N branches for each cpu. 348 buffer giving access to the last N branches for each cpu.
302 349
303config KMEMTRACE 350config KMEMTRACE
304 bool "Trace SLAB allocations" 351 bool "Trace SLAB allocations"
305 select TRACING 352 select GENERIC_TRACER
306 help 353 help
307 kmemtrace provides tracing for slab allocator functions, such as 354 kmemtrace provides tracing for slab allocator functions, such as
308 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 355 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -322,7 +369,7 @@ config KMEMTRACE
322 369
323config WORKQUEUE_TRACER 370config WORKQUEUE_TRACER
324 bool "Trace workqueues" 371 bool "Trace workqueues"
325 select TRACING 372 select GENERIC_TRACER
326 help 373 help
327 The workqueue tracer provides some statistical information 374 The workqueue tracer provides some statistical information
328 about each cpu workqueue thread such as the number of the 375 about each cpu workqueue thread such as the number of the
@@ -338,7 +385,7 @@ config BLK_DEV_IO_TRACE
338 select RELAY 385 select RELAY
339 select DEBUG_FS 386 select DEBUG_FS
340 select TRACEPOINTS 387 select TRACEPOINTS
341 select TRACING 388 select GENERIC_TRACER
342 select STACKTRACE 389 select STACKTRACE
343 help 390 help
344 Say Y here if you want to be able to trace the block layer actions 391 Say Y here if you want to be able to trace the block layer actions
@@ -375,6 +422,20 @@ config DYNAMIC_FTRACE
375 were made. If so, it runs stop_machine (stops all CPUS) 422 were made. If so, it runs stop_machine (stops all CPUS)
376 and modifies the code to jump over the call to ftrace. 423 and modifies the code to jump over the call to ftrace.
377 424
425config FUNCTION_PROFILER
426 bool "Kernel function profiler"
427 depends on FUNCTION_TRACER
428 default n
429 help
430 This option enables the kernel function profiler. A file is created
431 in debugfs called function_profile_enabled which defaults to zero.
432 When a 1 is echoed into this file profiling begins, and when a
433 zero is entered, profiling stops. A file in the trace_stats
434 directory called functions, that show the list of functions that
435 have been hit and their counters.
436
437 If in doubt, say N
438
378config FTRACE_MCOUNT_RECORD 439config FTRACE_MCOUNT_RECORD
379 def_bool y 440 def_bool y
380 depends on DYNAMIC_FTRACE 441 depends on DYNAMIC_FTRACE
@@ -385,7 +446,7 @@ config FTRACE_SELFTEST
385 446
386config FTRACE_STARTUP_TEST 447config FTRACE_STARTUP_TEST
387 bool "Perform a startup test on ftrace" 448 bool "Perform a startup test on ftrace"
388 depends on TRACING 449 depends on GENERIC_TRACER
389 select FTRACE_SELFTEST 450 select FTRACE_SELFTEST
390 help 451 help
391 This option performs a series of startup tests on ftrace. On bootup 452 This option performs a series of startup tests on ftrace. On bootup
@@ -396,7 +457,7 @@ config FTRACE_STARTUP_TEST
396config MMIOTRACE 457config MMIOTRACE
397 bool "Memory mapped IO tracing" 458 bool "Memory mapped IO tracing"
398 depends on HAVE_MMIOTRACE_SUPPORT && PCI 459 depends on HAVE_MMIOTRACE_SUPPORT && PCI
399 select TRACING 460 select GENERIC_TRACER
400 help 461 help
401 Mmiotrace traces Memory Mapped I/O access and is meant for 462 Mmiotrace traces Memory Mapped I/O access and is meant for
402 debugging and reverse engineering. It is called from the ioremap 463 debugging and reverse engineering. It is called from the ioremap
@@ -416,7 +477,23 @@ config MMIOTRACE_TEST
416 477
417 Say N, unless you absolutely know what you are doing. 478 Say N, unless you absolutely know what you are doing.
418 479
419endmenu 480config RING_BUFFER_BENCHMARK
481 tristate "Ring buffer benchmark stress tester"
482 depends on RING_BUFFER
483 help
484 This option creates a test to stress the ring buffer and benchmark it.
485 It creates its own ring buffer such that it will not interfer with
486 any other users of the ring buffer (such as ftrace). It then creates
487 a producer and consumer that will run for 10 seconds and sleep for
488 10 seconds. Each interval it will print out the number of events
489 it recorded and give a rough estimate of how long each iteration took.
490
491 It does not disable interrupts or raise its priority, so it may be
492 affected by processes that are running.
493
494 If unsure, say N
495
496endif # FTRACE
420 497
421endif # TRACING_SUPPORT 498endif # TRACING_SUPPORT
422 499
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f5121ec1..844164dca90a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,11 +15,17 @@ ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif 16endif
17 17
18#
19# Make the trace clocks available generally: it's infrastructure
20# relied on by ptrace for example:
21#
22obj-y += trace_clock.o
23
18obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 24obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 25obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
26obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
20 27
21obj-$(CONFIG_TRACING) += trace.o 28obj-$(CONFIG_TRACING) += trace.o
22obj-$(CONFIG_TRACING) += trace_clock.o
23obj-$(CONFIG_TRACING) += trace_output.o 29obj-$(CONFIG_TRACING) += trace_output.o
24obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
25obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
@@ -39,12 +45,14 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
39obj-$(CONFIG_POWER_TRACER) += trace_power.o 45obj-$(CONFIG_POWER_TRACER) += trace_power.o
40obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
41obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
42obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
43obj-$(CONFIG_EVENT_TRACER) += trace_events.o 49ifeq ($(CONFIG_BLOCK),y)
44obj-$(CONFIG_EVENT_TRACER) += events.o 50obj-$(CONFIG_EVENT_TRACING) += blktrace.o
45obj-$(CONFIG_EVENT_TRACER) += trace_export.o 51endif
52obj-$(CONFIG_EVENT_TRACING) += trace_events.o
53obj-$(CONFIG_EVENT_TRACING) += trace_export.o
46obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 54obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
47obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
48obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o 56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
49 57
50libftrace-y := ftrace.o 58libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 921ef5d1f0ba..39af8af6fc30 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,10 +23,14 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <trace/block.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
27
28#include <trace/events/block.h>
29
28#include "trace_output.h" 30#include "trace_output.h"
29 31
32#ifdef CONFIG_BLK_DEV_IO_TRACE
33
30static unsigned int blktrace_seq __read_mostly = 1; 34static unsigned int blktrace_seq __read_mostly = 1;
31 35
32static struct trace_array *blk_tr; 36static struct trace_array *blk_tr;
@@ -147,7 +151,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
147{ 151{
148 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) 152 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
149 return 1; 153 return 1;
150 if (sector < bt->start_lba || sector > bt->end_lba) 154 if (sector && (sector < bt->start_lba || sector > bt->end_lba))
151 return 1; 155 return 1;
152 if (bt->pid && pid != bt->pid) 156 if (bt->pid && pid != bt->pid)
153 return 1; 157 return 1;
@@ -192,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
192 what |= MASK_TC_BIT(rw, DISCARD); 196 what |= MASK_TC_BIT(rw, DISCARD);
193 197
194 pid = tsk->pid; 198 pid = tsk->pid;
195 if (unlikely(act_log_check(bt, what, sector, pid))) 199 if (act_log_check(bt, what, sector, pid))
196 return; 200 return;
197 cpu = raw_smp_processor_id(); 201 cpu = raw_smp_processor_id();
198 202
@@ -262,6 +266,7 @@ static void blk_trace_free(struct blk_trace *bt)
262{ 266{
263 debugfs_remove(bt->msg_file); 267 debugfs_remove(bt->msg_file);
264 debugfs_remove(bt->dropped_file); 268 debugfs_remove(bt->dropped_file);
269 debugfs_remove(bt->dir);
265 relay_close(bt->rchan); 270 relay_close(bt->rchan);
266 free_percpu(bt->sequence); 271 free_percpu(bt->sequence);
267 free_percpu(bt->msg_data); 272 free_percpu(bt->msg_data);
@@ -403,11 +408,29 @@ static struct rchan_callbacks blk_relay_callbacks = {
403 .remove_buf_file = blk_remove_buf_file_callback, 408 .remove_buf_file = blk_remove_buf_file_callback,
404}; 409};
405 410
411static void blk_trace_setup_lba(struct blk_trace *bt,
412 struct block_device *bdev)
413{
414 struct hd_struct *part = NULL;
415
416 if (bdev)
417 part = bdev->bd_part;
418
419 if (part) {
420 bt->start_lba = part->start_sect;
421 bt->end_lba = part->start_sect + part->nr_sects;
422 } else {
423 bt->start_lba = 0;
424 bt->end_lba = -1ULL;
425 }
426}
427
406/* 428/*
407 * Setup everything required to start tracing 429 * Setup everything required to start tracing
408 */ 430 */
409int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 431int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
410 struct blk_user_trace_setup *buts) 432 struct block_device *bdev,
433 struct blk_user_trace_setup *buts)
411{ 434{
412 struct blk_trace *old_bt, *bt = NULL; 435 struct blk_trace *old_bt, *bt = NULL;
413 struct dentry *dir = NULL; 436 struct dentry *dir = NULL;
@@ -480,10 +503,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
480 if (!bt->act_mask) 503 if (!bt->act_mask)
481 bt->act_mask = (u16) -1; 504 bt->act_mask = (u16) -1;
482 505
483 bt->start_lba = buts->start_lba; 506 blk_trace_setup_lba(bt, bdev);
484 bt->end_lba = buts->end_lba; 507
485 if (!bt->end_lba) 508 /* overwrite with user settings */
486 bt->end_lba = -1ULL; 509 if (buts->start_lba)
510 bt->start_lba = buts->start_lba;
511 if (buts->end_lba)
512 bt->end_lba = buts->end_lba;
487 513
488 bt->pid = buts->pid; 514 bt->pid = buts->pid;
489 bt->trace_state = Blktrace_setup; 515 bt->trace_state = Blktrace_setup;
@@ -505,6 +531,7 @@ err:
505} 531}
506 532
507int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 533int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
534 struct block_device *bdev,
508 char __user *arg) 535 char __user *arg)
509{ 536{
510 struct blk_user_trace_setup buts; 537 struct blk_user_trace_setup buts;
@@ -514,7 +541,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
514 if (ret) 541 if (ret)
515 return -EFAULT; 542 return -EFAULT;
516 543
517 ret = do_blk_trace_setup(q, name, dev, &buts); 544 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
518 if (ret) 545 if (ret)
519 return ret; 546 return ret;
520 547
@@ -582,7 +609,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
582 switch (cmd) { 609 switch (cmd) {
583 case BLKTRACESETUP: 610 case BLKTRACESETUP:
584 bdevname(bdev, b); 611 bdevname(bdev, b);
585 ret = blk_trace_setup(q, b, bdev->bd_dev, arg); 612 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
586 break; 613 break;
587 case BLKTRACESTART: 614 case BLKTRACESTART:
588 start = 1; 615 start = 1;
@@ -642,12 +669,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
642 669
643 if (blk_pc_request(rq)) { 670 if (blk_pc_request(rq)) {
644 what |= BLK_TC_ACT(BLK_TC_PC); 671 what |= BLK_TC_ACT(BLK_TC_PC);
645 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, 672 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
646 rq->cmd_len, rq->cmd); 673 what, rq->errors, rq->cmd_len, rq->cmd);
647 } else { 674 } else {
648 what |= BLK_TC_ACT(BLK_TC_FS); 675 what |= BLK_TC_ACT(BLK_TC_FS);
649 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 676 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
650 rw, what, rq->errors, 0, NULL); 677 what, rq->errors, 0, NULL);
651 } 678 }
652} 679}
653 680
@@ -809,7 +836,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
809 * @bio: the source bio 836 * @bio: the source bio
810 * @dev: target device 837 * @dev: target device
811 * @from: source sector 838 * @from: source sector
812 * @to: target sector
813 * 839 *
814 * Description: 840 * Description:
815 * Device mapper or raid target sometimes need to split a bio because 841 * Device mapper or raid target sometimes need to split a bio because
@@ -817,7 +843,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
817 * 843 *
818 **/ 844 **/
819static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 845static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
820 dev_t dev, sector_t from, sector_t to) 846 dev_t dev, sector_t from)
821{ 847{
822 struct blk_trace *bt = q->blk_trace; 848 struct blk_trace *bt = q->blk_trace;
823 struct blk_io_trace_remap r; 849 struct blk_io_trace_remap r;
@@ -825,12 +851,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
825 if (likely(!bt)) 851 if (likely(!bt))
826 return; 852 return;
827 853
828 r.device = cpu_to_be32(dev); 854 r.device_from = cpu_to_be32(dev);
829 r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); 855 r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
830 r.sector = cpu_to_be64(to); 856 r.sector_from = cpu_to_be64(from);
831 857
832 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, 858 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
833 !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); 859 BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
860 sizeof(r), &r);
834} 861}
835 862
836/** 863/**
@@ -854,11 +881,11 @@ void blk_add_driver_data(struct request_queue *q,
854 return; 881 return;
855 882
856 if (blk_pc_request(rq)) 883 if (blk_pc_request(rq))
857 __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, 884 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
858 rq->errors, len, data); 885 BLK_TA_DRV_DATA, rq->errors, len, data);
859 else 886 else
860 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 887 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
861 0, BLK_TA_DRV_DATA, rq->errors, len, data); 888 BLK_TA_DRV_DATA, rq->errors, len, data);
862} 889}
863EXPORT_SYMBOL_GPL(blk_add_driver_data); 890EXPORT_SYMBOL_GPL(blk_add_driver_data);
864 891
@@ -971,6 +998,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
971 return te_blk_io_trace(ent) + 1; 998 return te_blk_io_trace(ent) + 1;
972} 999}
973 1000
1001static inline u32 t_action(const struct trace_entry *ent)
1002{
1003 return te_blk_io_trace(ent)->action;
1004}
1005
1006static inline u32 t_bytes(const struct trace_entry *ent)
1007{
1008 return te_blk_io_trace(ent)->bytes;
1009}
1010
974static inline u32 t_sec(const struct trace_entry *ent) 1011static inline u32 t_sec(const struct trace_entry *ent)
975{ 1012{
976 return te_blk_io_trace(ent)->bytes >> 9; 1013 return te_blk_io_trace(ent)->bytes >> 9;
@@ -996,11 +1033,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
996 struct blk_io_trace_remap *r) 1033 struct blk_io_trace_remap *r)
997{ 1034{
998 const struct blk_io_trace_remap *__r = pdu_start(ent); 1035 const struct blk_io_trace_remap *__r = pdu_start(ent);
999 __u64 sector = __r->sector; 1036 __u64 sector_from = __r->sector_from;
1000 1037
1001 r->device = be32_to_cpu(__r->device);
1002 r->device_from = be32_to_cpu(__r->device_from); 1038 r->device_from = be32_to_cpu(__r->device_from);
1003 r->sector = be64_to_cpu(sector); 1039 r->device_to = be32_to_cpu(__r->device_to);
1040 r->sector_from = be64_to_cpu(sector_from);
1004} 1041}
1005 1042
1006typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); 1043typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1031,36 +1068,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
1031 MAJOR(t->device), MINOR(t->device), act, rwbs); 1068 MAJOR(t->device), MINOR(t->device), act, rwbs);
1032} 1069}
1033 1070
1071static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1072{
1073 const unsigned char *pdu_buf;
1074 int pdu_len;
1075 int i, end, ret;
1076
1077 pdu_buf = pdu_start(ent);
1078 pdu_len = te_blk_io_trace(ent)->pdu_len;
1079
1080 if (!pdu_len)
1081 return 1;
1082
1083 /* find the last zero that needs to be printed */
1084 for (end = pdu_len - 1; end >= 0; end--)
1085 if (pdu_buf[end])
1086 break;
1087 end++;
1088
1089 if (!trace_seq_putc(s, '('))
1090 return 0;
1091
1092 for (i = 0; i < pdu_len; i++) {
1093
1094 ret = trace_seq_printf(s, "%s%02x",
1095 i == 0 ? "" : " ", pdu_buf[i]);
1096 if (!ret)
1097 return ret;
1098
1099 /*
1100 * stop when the rest is just zeroes and indicate so
1101 * with a ".." appended
1102 */
1103 if (i == end && end != pdu_len - 1)
1104 return trace_seq_puts(s, " ..) ");
1105 }
1106
1107 return trace_seq_puts(s, ") ");
1108}
1109
1034static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) 1110static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1035{ 1111{
1036 char cmd[TASK_COMM_LEN]; 1112 char cmd[TASK_COMM_LEN];
1037 1113
1038 trace_find_cmdline(ent->pid, cmd); 1114 trace_find_cmdline(ent->pid, cmd);
1039 1115
1040 if (t_sec(ent)) 1116 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1041 return trace_seq_printf(s, "%llu + %u [%s]\n", 1117 int ret;
1042 t_sector(ent), t_sec(ent), cmd); 1118
1043 return trace_seq_printf(s, "[%s]\n", cmd); 1119 ret = trace_seq_printf(s, "%u ", t_bytes(ent));
1120 if (!ret)
1121 return 0;
1122 ret = blk_log_dump_pdu(s, ent);
1123 if (!ret)
1124 return 0;
1125 return trace_seq_printf(s, "[%s]\n", cmd);
1126 } else {
1127 if (t_sec(ent))
1128 return trace_seq_printf(s, "%llu + %u [%s]\n",
1129 t_sector(ent), t_sec(ent), cmd);
1130 return trace_seq_printf(s, "[%s]\n", cmd);
1131 }
1044} 1132}
1045 1133
1046static int blk_log_with_error(struct trace_seq *s, 1134static int blk_log_with_error(struct trace_seq *s,
1047 const struct trace_entry *ent) 1135 const struct trace_entry *ent)
1048{ 1136{
1049 if (t_sec(ent)) 1137 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1050 return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), 1138 int ret;
1051 t_sec(ent), t_error(ent)); 1139
1052 return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); 1140 ret = blk_log_dump_pdu(s, ent);
1141 if (ret)
1142 return trace_seq_printf(s, "[%d]\n", t_error(ent));
1143 return 0;
1144 } else {
1145 if (t_sec(ent))
1146 return trace_seq_printf(s, "%llu + %u [%d]\n",
1147 t_sector(ent),
1148 t_sec(ent), t_error(ent));
1149 return trace_seq_printf(s, "%llu [%d]\n",
1150 t_sector(ent), t_error(ent));
1151 }
1053} 1152}
1054 1153
1055static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) 1154static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1056{ 1155{
1057 struct blk_io_trace_remap r = { .device = 0, }; 1156 struct blk_io_trace_remap r = { .device_from = 0, };
1058 1157
1059 get_pdu_remap(ent, &r); 1158 get_pdu_remap(ent, &r);
1060 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", 1159 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1061 t_sector(ent), 1160 t_sector(ent), t_sec(ent),
1062 t_sec(ent), MAJOR(r.device), MINOR(r.device), 1161 MAJOR(r.device_from), MINOR(r.device_from),
1063 (unsigned long long)r.sector); 1162 (unsigned long long)r.sector_from);
1064} 1163}
1065 1164
1066static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) 1165static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
@@ -1117,7 +1216,6 @@ static void blk_tracer_print_header(struct seq_file *m)
1117static void blk_tracer_start(struct trace_array *tr) 1216static void blk_tracer_start(struct trace_array *tr)
1118{ 1217{
1119 blk_tracer_enabled = true; 1218 blk_tracer_enabled = true;
1120 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1121} 1219}
1122 1220
1123static int blk_tracer_init(struct trace_array *tr) 1221static int blk_tracer_init(struct trace_array *tr)
@@ -1130,7 +1228,6 @@ static int blk_tracer_init(struct trace_array *tr)
1130static void blk_tracer_stop(struct trace_array *tr) 1228static void blk_tracer_stop(struct trace_array *tr)
1131{ 1229{
1132 blk_tracer_enabled = false; 1230 blk_tracer_enabled = false;
1133 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1134} 1231}
1135 1232
1136static void blk_tracer_reset(struct trace_array *tr) 1233static void blk_tracer_reset(struct trace_array *tr)
@@ -1182,7 +1279,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1182 } 1279 }
1183 1280
1184 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) 1281 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1185 ret = trace_seq_printf(s, "Bad pc action %x\n", what); 1282 ret = trace_seq_printf(s, "Unknown action %x\n", what);
1186 else { 1283 else {
1187 ret = log_action(iter, what2act[what].act[long_act]); 1284 ret = log_action(iter, what2act[what].act[long_act]);
1188 if (ret) 1285 if (ret)
@@ -1195,9 +1292,6 @@ out:
1195static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1292static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1196 int flags) 1293 int flags)
1197{ 1294{
1198 if (!trace_print_context(iter))
1199 return TRACE_TYPE_PARTIAL_LINE;
1200
1201 return print_one_line(iter, false); 1295 return print_one_line(iter, false);
1202} 1296}
1203 1297
@@ -1232,6 +1326,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1232 return print_one_line(iter, true); 1326 return print_one_line(iter, true);
1233} 1327}
1234 1328
1329static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
1330{
1331 /* don't output context-info for blk_classic output */
1332 if (bit == TRACE_BLK_OPT_CLASSIC) {
1333 if (set)
1334 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1335 else
1336 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1337 }
1338 return 0;
1339}
1340
1235static struct tracer blk_tracer __read_mostly = { 1341static struct tracer blk_tracer __read_mostly = {
1236 .name = "blk", 1342 .name = "blk",
1237 .init = blk_tracer_init, 1343 .init = blk_tracer_init,
@@ -1241,6 +1347,7 @@ static struct tracer blk_tracer __read_mostly = {
1241 .print_header = blk_tracer_print_header, 1347 .print_header = blk_tracer_print_header,
1242 .print_line = blk_tracer_print_line, 1348 .print_line = blk_tracer_print_line,
1243 .flags = &blk_tracer_flags, 1349 .flags = &blk_tracer_flags,
1350 .set_flag = blk_tracer_set_flag,
1244}; 1351};
1245 1352
1246static struct trace_event trace_blk_event = { 1353static struct trace_event trace_blk_event = {
@@ -1285,7 +1392,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
1285/* 1392/*
1286 * Setup everything required to start tracing 1393 * Setup everything required to start tracing
1287 */ 1394 */
1288static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) 1395static int blk_trace_setup_queue(struct request_queue *q,
1396 struct block_device *bdev)
1289{ 1397{
1290 struct blk_trace *old_bt, *bt = NULL; 1398 struct blk_trace *old_bt, *bt = NULL;
1291 int ret = -ENOMEM; 1399 int ret = -ENOMEM;
@@ -1298,9 +1406,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1298 if (!bt->msg_data) 1406 if (!bt->msg_data)
1299 goto free_bt; 1407 goto free_bt;
1300 1408
1301 bt->dev = dev; 1409 bt->dev = bdev->bd_dev;
1302 bt->act_mask = (u16)-1; 1410 bt->act_mask = (u16)-1;
1303 bt->end_lba = -1ULL; 1411
1412 blk_trace_setup_lba(bt, bdev);
1304 1413
1305 old_bt = xchg(&q->blk_trace, bt); 1414 old_bt = xchg(&q->blk_trace, bt);
1306 if (old_bt != NULL) { 1415 if (old_bt != NULL) {
@@ -1517,7 +1626,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1517 1626
1518 if (attr == &dev_attr_enable) { 1627 if (attr == &dev_attr_enable) {
1519 if (value) 1628 if (value)
1520 ret = blk_trace_setup_queue(q, bdev->bd_dev); 1629 ret = blk_trace_setup_queue(q, bdev);
1521 else 1630 else
1522 ret = blk_trace_remove_queue(q); 1631 ret = blk_trace_remove_queue(q);
1523 goto out_unlock_bdev; 1632 goto out_unlock_bdev;
@@ -1525,7 +1634,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1525 1634
1526 ret = 0; 1635 ret = 0;
1527 if (q->blk_trace == NULL) 1636 if (q->blk_trace == NULL)
1528 ret = blk_trace_setup_queue(q, bdev->bd_dev); 1637 ret = blk_trace_setup_queue(q, bdev);
1529 1638
1530 if (ret == 0) { 1639 if (ret == 0) {
1531 if (attr == &dev_attr_act_mask) 1640 if (attr == &dev_attr_act_mask)
@@ -1548,3 +1657,77 @@ out:
1548 return ret ? ret : count; 1657 return ret ? ret : count;
1549} 1658}
1550 1659
1660int blk_trace_init_sysfs(struct device *dev)
1661{
1662 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1663}
1664
1665#endif /* CONFIG_BLK_DEV_IO_TRACE */
1666
1667#ifdef CONFIG_EVENT_TRACING
1668
1669void blk_dump_cmd(char *buf, struct request *rq)
1670{
1671 int i, end;
1672 int len = rq->cmd_len;
1673 unsigned char *cmd = rq->cmd;
1674
1675 if (!blk_pc_request(rq)) {
1676 buf[0] = '\0';
1677 return;
1678 }
1679
1680 for (end = len - 1; end >= 0; end--)
1681 if (cmd[end])
1682 break;
1683 end++;
1684
1685 for (i = 0; i < len; i++) {
1686 buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
1687 if (i == end && end != len - 1) {
1688 sprintf(buf, " ..");
1689 break;
1690 }
1691 }
1692}
1693
1694void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1695{
1696 int i = 0;
1697
1698 if (rw & WRITE)
1699 rwbs[i++] = 'W';
1700 else if (rw & 1 << BIO_RW_DISCARD)
1701 rwbs[i++] = 'D';
1702 else if (bytes)
1703 rwbs[i++] = 'R';
1704 else
1705 rwbs[i++] = 'N';
1706
1707 if (rw & 1 << BIO_RW_AHEAD)
1708 rwbs[i++] = 'A';
1709 if (rw & 1 << BIO_RW_BARRIER)
1710 rwbs[i++] = 'B';
1711 if (rw & 1 << BIO_RW_SYNCIO)
1712 rwbs[i++] = 'S';
1713 if (rw & 1 << BIO_RW_META)
1714 rwbs[i++] = 'M';
1715
1716 rwbs[i] = '\0';
1717}
1718
1719void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1720{
1721 int rw = rq->cmd_flags & 0x03;
1722 int bytes;
1723
1724 if (blk_discard_rq(rq))
1725 rw |= (1 << BIO_RW_DISCARD);
1726
1727 bytes = blk_rq_bytes(rq);
1728
1729 blk_fill_rwbs(rwbs, rw, bytes);
1730}
1731
1732#endif /* CONFIG_EVENT_TRACING */
1733
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
deleted file mode 100644
index 246f2aa6dc46..000000000000
--- a/kernel/trace/events.c
+++ /dev/null
@@ -1,14 +0,0 @@
1/*
2 * This is the place to register all trace points as events.
3 */
4
5#include <linux/stringify.h>
6
7#include <trace/trace_events.h>
8
9#include "trace_output.h"
10
11#include "trace_events_stage_1.h"
12#include "trace_events_stage_2.h"
13#include "trace_events_stage_3.h"
14
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f1ed080406c3..bb60732ade0c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,11 +29,13 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/hash.h> 30#include <linux/hash.h>
31 31
32#include <trace/sched.h> 32#include <trace/events/sched.h>
33 33
34#include <asm/ftrace.h> 34#include <asm/ftrace.h>
35#include <asm/setup.h>
35 36
36#include "trace.h" 37#include "trace_output.h"
38#include "trace_stat.h"
37 39
38#define FTRACE_WARN_ON(cond) \ 40#define FTRACE_WARN_ON(cond) \
39 do { \ 41 do { \
@@ -68,7 +70,7 @@ static DEFINE_MUTEX(ftrace_lock);
68 70
69static struct ftrace_ops ftrace_list_end __read_mostly = 71static struct ftrace_ops ftrace_list_end __read_mostly =
70{ 72{
71 .func = ftrace_stub, 73 .func = ftrace_stub,
72}; 74};
73 75
74static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 76static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -240,6 +242,580 @@ static void ftrace_update_pid_func(void)
240#endif 242#endif
241} 243}
242 244
245#ifdef CONFIG_FUNCTION_PROFILER
246struct ftrace_profile {
247 struct hlist_node node;
248 unsigned long ip;
249 unsigned long counter;
250#ifdef CONFIG_FUNCTION_GRAPH_TRACER
251 unsigned long long time;
252#endif
253};
254
255struct ftrace_profile_page {
256 struct ftrace_profile_page *next;
257 unsigned long index;
258 struct ftrace_profile records[];
259};
260
261struct ftrace_profile_stat {
262 atomic_t disabled;
263 struct hlist_head *hash;
264 struct ftrace_profile_page *pages;
265 struct ftrace_profile_page *start;
266 struct tracer_stat stat;
267};
268
269#define PROFILE_RECORDS_SIZE \
270 (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
271
272#define PROFILES_PER_PAGE \
273 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
274
275static int ftrace_profile_bits __read_mostly;
276static int ftrace_profile_enabled __read_mostly;
277
278/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
279static DEFINE_MUTEX(ftrace_profile_lock);
280
281static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
282
283#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
284
285static void *
286function_stat_next(void *v, int idx)
287{
288 struct ftrace_profile *rec = v;
289 struct ftrace_profile_page *pg;
290
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292
293 again:
294 rec++;
295 if ((void *)rec >= (void *)&pg->records[pg->index]) {
296 pg = pg->next;
297 if (!pg)
298 return NULL;
299 rec = &pg->records[0];
300 if (!rec->counter)
301 goto again;
302 }
303
304 return rec;
305}
306
307static void *function_stat_start(struct tracer_stat *trace)
308{
309 struct ftrace_profile_stat *stat =
310 container_of(trace, struct ftrace_profile_stat, stat);
311
312 if (!stat || !stat->start)
313 return NULL;
314
315 return function_stat_next(&stat->start->records[0], 0);
316}
317
318#ifdef CONFIG_FUNCTION_GRAPH_TRACER
319/* function graph compares on total time */
320static int function_stat_cmp(void *p1, void *p2)
321{
322 struct ftrace_profile *a = p1;
323 struct ftrace_profile *b = p2;
324
325 if (a->time < b->time)
326 return -1;
327 if (a->time > b->time)
328 return 1;
329 else
330 return 0;
331}
332#else
333/* not function graph compares against hits */
334static int function_stat_cmp(void *p1, void *p2)
335{
336 struct ftrace_profile *a = p1;
337 struct ftrace_profile *b = p2;
338
339 if (a->counter < b->counter)
340 return -1;
341 if (a->counter > b->counter)
342 return 1;
343 else
344 return 0;
345}
346#endif
347
348static int function_stat_headers(struct seq_file *m)
349{
350#ifdef CONFIG_FUNCTION_GRAPH_TRACER
351 seq_printf(m, " Function "
352 "Hit Time Avg\n"
353 " -------- "
354 "--- ---- ---\n");
355#else
356 seq_printf(m, " Function Hit\n"
357 " -------- ---\n");
358#endif
359 return 0;
360}
361
362static int function_stat_show(struct seq_file *m, void *v)
363{
364 struct ftrace_profile *rec = v;
365 char str[KSYM_SYMBOL_LEN];
366#ifdef CONFIG_FUNCTION_GRAPH_TRACER
367 static DEFINE_MUTEX(mutex);
368 static struct trace_seq s;
369 unsigned long long avg;
370#endif
371
372 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
373 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
374
375#ifdef CONFIG_FUNCTION_GRAPH_TRACER
376 seq_printf(m, " ");
377 avg = rec->time;
378 do_div(avg, rec->counter);
379
380 mutex_lock(&mutex);
381 trace_seq_init(&s);
382 trace_print_graph_duration(rec->time, &s);
383 trace_seq_puts(&s, " ");
384 trace_print_graph_duration(avg, &s);
385 trace_print_seq(m, &s);
386 mutex_unlock(&mutex);
387#endif
388 seq_putc(m, '\n');
389
390 return 0;
391}
392
393static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
394{
395 struct ftrace_profile_page *pg;
396
397 pg = stat->pages = stat->start;
398
399 while (pg) {
400 memset(pg->records, 0, PROFILE_RECORDS_SIZE);
401 pg->index = 0;
402 pg = pg->next;
403 }
404
405 memset(stat->hash, 0,
406 FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
407}
408
409int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
410{
411 struct ftrace_profile_page *pg;
412 int functions;
413 int pages;
414 int i;
415
416 /* If we already allocated, do nothing */
417 if (stat->pages)
418 return 0;
419
420 stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
421 if (!stat->pages)
422 return -ENOMEM;
423
424#ifdef CONFIG_DYNAMIC_FTRACE
425 functions = ftrace_update_tot_cnt;
426#else
427 /*
428 * We do not know the number of functions that exist because
429 * dynamic tracing is what counts them. With past experience
430 * we have around 20K functions. That should be more than enough.
431 * It is highly unlikely we will execute every function in
432 * the kernel.
433 */
434 functions = 20000;
435#endif
436
437 pg = stat->start = stat->pages;
438
439 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
440
441 for (i = 0; i < pages; i++) {
442 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
443 if (!pg->next)
444 goto out_free;
445 pg = pg->next;
446 }
447
448 return 0;
449
450 out_free:
451 pg = stat->start;
452 while (pg) {
453 unsigned long tmp = (unsigned long)pg;
454
455 pg = pg->next;
456 free_page(tmp);
457 }
458
459 free_page((unsigned long)stat->pages);
460 stat->pages = NULL;
461 stat->start = NULL;
462
463 return -ENOMEM;
464}
465
466static int ftrace_profile_init_cpu(int cpu)
467{
468 struct ftrace_profile_stat *stat;
469 int size;
470
471 stat = &per_cpu(ftrace_profile_stats, cpu);
472
473 if (stat->hash) {
474 /* If the profile is already created, simply reset it */
475 ftrace_profile_reset(stat);
476 return 0;
477 }
478
479 /*
480 * We are profiling all functions, but usually only a few thousand
481 * functions are hit. We'll make a hash of 1024 items.
482 */
483 size = FTRACE_PROFILE_HASH_SIZE;
484
485 stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
486
487 if (!stat->hash)
488 return -ENOMEM;
489
490 if (!ftrace_profile_bits) {
491 size--;
492
493 for (; size; size >>= 1)
494 ftrace_profile_bits++;
495 }
496
497 /* Preallocate the function profiling pages */
498 if (ftrace_profile_pages_init(stat) < 0) {
499 kfree(stat->hash);
500 stat->hash = NULL;
501 return -ENOMEM;
502 }
503
504 return 0;
505}
506
507static int ftrace_profile_init(void)
508{
509 int cpu;
510 int ret = 0;
511
512 for_each_online_cpu(cpu) {
513 ret = ftrace_profile_init_cpu(cpu);
514 if (ret)
515 break;
516 }
517
518 return ret;
519}
520
521/* interrupts must be disabled */
522static struct ftrace_profile *
523ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
524{
525 struct ftrace_profile *rec;
526 struct hlist_head *hhd;
527 struct hlist_node *n;
528 unsigned long key;
529
530 key = hash_long(ip, ftrace_profile_bits);
531 hhd = &stat->hash[key];
532
533 if (hlist_empty(hhd))
534 return NULL;
535
536 hlist_for_each_entry_rcu(rec, n, hhd, node) {
537 if (rec->ip == ip)
538 return rec;
539 }
540
541 return NULL;
542}
543
544static void ftrace_add_profile(struct ftrace_profile_stat *stat,
545 struct ftrace_profile *rec)
546{
547 unsigned long key;
548
549 key = hash_long(rec->ip, ftrace_profile_bits);
550 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
551}
552
553/*
554 * The memory is already allocated, this simply finds a new record to use.
555 */
556static struct ftrace_profile *
557ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
558{
559 struct ftrace_profile *rec = NULL;
560
561 /* prevent recursion (from NMIs) */
562 if (atomic_inc_return(&stat->disabled) != 1)
563 goto out;
564
565 /*
566 * Try to find the function again since an NMI
567 * could have added it
568 */
569 rec = ftrace_find_profiled_func(stat, ip);
570 if (rec)
571 goto out;
572
573 if (stat->pages->index == PROFILES_PER_PAGE) {
574 if (!stat->pages->next)
575 goto out;
576 stat->pages = stat->pages->next;
577 }
578
579 rec = &stat->pages->records[stat->pages->index++];
580 rec->ip = ip;
581 ftrace_add_profile(stat, rec);
582
583 out:
584 atomic_dec(&stat->disabled);
585
586 return rec;
587}
588
589static void
590function_profile_call(unsigned long ip, unsigned long parent_ip)
591{
592 struct ftrace_profile_stat *stat;
593 struct ftrace_profile *rec;
594 unsigned long flags;
595
596 if (!ftrace_profile_enabled)
597 return;
598
599 local_irq_save(flags);
600
601 stat = &__get_cpu_var(ftrace_profile_stats);
602 if (!stat->hash || !ftrace_profile_enabled)
603 goto out;
604
605 rec = ftrace_find_profiled_func(stat, ip);
606 if (!rec) {
607 rec = ftrace_profile_alloc(stat, ip);
608 if (!rec)
609 goto out;
610 }
611
612 rec->counter++;
613 out:
614 local_irq_restore(flags);
615}
616
617#ifdef CONFIG_FUNCTION_GRAPH_TRACER
618static int profile_graph_entry(struct ftrace_graph_ent *trace)
619{
620 function_profile_call(trace->func, 0);
621 return 1;
622}
623
624static void profile_graph_return(struct ftrace_graph_ret *trace)
625{
626 struct ftrace_profile_stat *stat;
627 unsigned long long calltime;
628 struct ftrace_profile *rec;
629 unsigned long flags;
630
631 local_irq_save(flags);
632 stat = &__get_cpu_var(ftrace_profile_stats);
633 if (!stat->hash || !ftrace_profile_enabled)
634 goto out;
635
636 calltime = trace->rettime - trace->calltime;
637
638 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
639 int index;
640
641 index = trace->depth;
642
643 /* Append this call time to the parent time to subtract */
644 if (index)
645 current->ret_stack[index - 1].subtime += calltime;
646
647 if (current->ret_stack[index].subtime < calltime)
648 calltime -= current->ret_stack[index].subtime;
649 else
650 calltime = 0;
651 }
652
653 rec = ftrace_find_profiled_func(stat, trace->func);
654 if (rec)
655 rec->time += calltime;
656
657 out:
658 local_irq_restore(flags);
659}
660
661static int register_ftrace_profiler(void)
662{
663 return register_ftrace_graph(&profile_graph_return,
664 &profile_graph_entry);
665}
666
667static void unregister_ftrace_profiler(void)
668{
669 unregister_ftrace_graph();
670}
671#else
672static struct ftrace_ops ftrace_profile_ops __read_mostly =
673{
674 .func = function_profile_call,
675};
676
677static int register_ftrace_profiler(void)
678{
679 return register_ftrace_function(&ftrace_profile_ops);
680}
681
682static void unregister_ftrace_profiler(void)
683{
684 unregister_ftrace_function(&ftrace_profile_ops);
685}
686#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
687
688static ssize_t
689ftrace_profile_write(struct file *filp, const char __user *ubuf,
690 size_t cnt, loff_t *ppos)
691{
692 unsigned long val;
693 char buf[64]; /* big enough to hold a number */
694 int ret;
695
696 if (cnt >= sizeof(buf))
697 return -EINVAL;
698
699 if (copy_from_user(&buf, ubuf, cnt))
700 return -EFAULT;
701
702 buf[cnt] = 0;
703
704 ret = strict_strtoul(buf, 10, &val);
705 if (ret < 0)
706 return ret;
707
708 val = !!val;
709
710 mutex_lock(&ftrace_profile_lock);
711 if (ftrace_profile_enabled ^ val) {
712 if (val) {
713 ret = ftrace_profile_init();
714 if (ret < 0) {
715 cnt = ret;
716 goto out;
717 }
718
719 ret = register_ftrace_profiler();
720 if (ret < 0) {
721 cnt = ret;
722 goto out;
723 }
724 ftrace_profile_enabled = 1;
725 } else {
726 ftrace_profile_enabled = 0;
727 /*
728 * unregister_ftrace_profiler calls stop_machine
729 * so this acts like an synchronize_sched.
730 */
731 unregister_ftrace_profiler();
732 }
733 }
734 out:
735 mutex_unlock(&ftrace_profile_lock);
736
737 filp->f_pos += cnt;
738
739 return cnt;
740}
741
742static ssize_t
743ftrace_profile_read(struct file *filp, char __user *ubuf,
744 size_t cnt, loff_t *ppos)
745{
746 char buf[64]; /* big enough to hold a number */
747 int r;
748
749 r = sprintf(buf, "%u\n", ftrace_profile_enabled);
750 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
751}
752
753static const struct file_operations ftrace_profile_fops = {
754 .open = tracing_open_generic,
755 .read = ftrace_profile_read,
756 .write = ftrace_profile_write,
757};
758
759/* used to initialize the real stat files */
760static struct tracer_stat function_stats __initdata = {
761 .name = "functions",
762 .stat_start = function_stat_start,
763 .stat_next = function_stat_next,
764 .stat_cmp = function_stat_cmp,
765 .stat_headers = function_stat_headers,
766 .stat_show = function_stat_show
767};
768
769static void ftrace_profile_debugfs(struct dentry *d_tracer)
770{
771 struct ftrace_profile_stat *stat;
772 struct dentry *entry;
773 char *name;
774 int ret;
775 int cpu;
776
777 for_each_possible_cpu(cpu) {
778 stat = &per_cpu(ftrace_profile_stats, cpu);
779
780 /* allocate enough for function name + cpu number */
781 name = kmalloc(32, GFP_KERNEL);
782 if (!name) {
783 /*
784 * The files created are permanent, if something happens
785 * we still do not free memory.
786 */
787 kfree(stat);
788 WARN(1,
789 "Could not allocate stat file for cpu %d\n",
790 cpu);
791 return;
792 }
793 stat->stat = function_stats;
794 snprintf(name, 32, "function%d", cpu);
795 stat->stat.name = name;
796 ret = register_stat_tracer(&stat->stat);
797 if (ret) {
798 WARN(1,
799 "Could not register function stat for cpu %d\n",
800 cpu);
801 kfree(name);
802 return;
803 }
804 }
805
806 entry = debugfs_create_file("function_profile_enabled", 0644,
807 d_tracer, NULL, &ftrace_profile_fops);
808 if (!entry)
809 pr_warning("Could not create debugfs "
810 "'function_profile_enabled' entry\n");
811}
812
813#else /* CONFIG_FUNCTION_PROFILER */
814static void ftrace_profile_debugfs(struct dentry *d_tracer)
815{
816}
817#endif /* CONFIG_FUNCTION_PROFILER */
818
243/* set when tracing only a pid */ 819/* set when tracing only a pid */
244struct pid *ftrace_pid_trace; 820struct pid *ftrace_pid_trace;
245static struct pid * const ftrace_swapper_pid = &init_struct_pid; 821static struct pid * const ftrace_swapper_pid = &init_struct_pid;
@@ -261,7 +837,6 @@ struct ftrace_func_probe {
261 struct rcu_head rcu; 837 struct rcu_head rcu;
262}; 838};
263 839
264
265enum { 840enum {
266 FTRACE_ENABLE_CALLS = (1 << 0), 841 FTRACE_ENABLE_CALLS = (1 << 0),
267 FTRACE_DISABLE_CALLS = (1 << 1), 842 FTRACE_DISABLE_CALLS = (1 << 1),
@@ -346,30 +921,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
346 rec->flags |= FTRACE_FL_FREE; 921 rec->flags |= FTRACE_FL_FREE;
347} 922}
348 923
349void ftrace_release(void *start, unsigned long size)
350{
351 struct dyn_ftrace *rec;
352 struct ftrace_page *pg;
353 unsigned long s = (unsigned long)start;
354 unsigned long e = s + size;
355
356 if (ftrace_disabled || !start)
357 return;
358
359 mutex_lock(&ftrace_lock);
360 do_for_each_ftrace_rec(pg, rec) {
361 if ((rec->ip >= s) && (rec->ip < e)) {
362 /*
363 * rec->ip is changed in ftrace_free_rec()
364 * It should not between s and e if record was freed.
365 */
366 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
367 ftrace_free_rec(rec);
368 }
369 } while_for_each_ftrace_rec();
370 mutex_unlock(&ftrace_lock);
371}
372
373static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 924static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
374{ 925{
375 struct dyn_ftrace *rec; 926 struct dyn_ftrace *rec;
@@ -1408,7 +1959,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1408 1959
1409static struct ftrace_ops trace_probe_ops __read_mostly = 1960static struct ftrace_ops trace_probe_ops __read_mostly =
1410{ 1961{
1411 .func = function_trace_probe_call, 1962 .func = function_trace_probe_call,
1412}; 1963};
1413 1964
1414static int ftrace_probe_registered; 1965static int ftrace_probe_registered;
@@ -1823,6 +2374,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
1823 ftrace_set_regex(buf, len, reset, 0); 2374 ftrace_set_regex(buf, len, reset, 0);
1824} 2375}
1825 2376
2377/*
2378 * command line interface to allow users to set filters on boot up.
2379 */
/*
 * Boot-time copies of the "ftrace_notrace=" and "ftrace_filter=" values,
 * each sized as COMMAND_LINE_SIZE and kept in init data.
 */
2380#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
2381static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
2382static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
2383
2384static int __init set_ftrace_notrace(char *str)
2385{
2386 strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
2387 return 1;
2388}
2389__setup("ftrace_notrace=", set_ftrace_notrace);
2390
2391static int __init set_ftrace_filter(char *str)
2392{
2393 strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
2394 return 1;
2395}
2396__setup("ftrace_filter=", set_ftrace_filter);
2397
2398static void __init set_ftrace_early_filter(char *buf, int enable)
2399{
2400 char *func;
2401
2402 while (buf) {
2403 func = strsep(&buf, ",");
2404 ftrace_set_regex(func, strlen(func), 0, enable);
2405 }
2406}
2407
2408static void __init set_ftrace_early_filters(void)
2409{
2410 if (ftrace_filter_buf[0])
2411 set_ftrace_early_filter(ftrace_filter_buf, 1);
2412 if (ftrace_notrace_buf[0])
2413 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2414}
2415
1826static int 2416static int
1827ftrace_regex_release(struct inode *inode, struct file *file, int enable) 2417ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1828{ 2418{
@@ -2128,38 +2718,23 @@ static const struct file_operations ftrace_graph_fops = {
2128 2718
2129static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 2719static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2130{ 2720{
2131 struct dentry *entry;
2132 2721
2133 entry = debugfs_create_file("available_filter_functions", 0444, 2722 trace_create_file("available_filter_functions", 0444,
2134 d_tracer, NULL, &ftrace_avail_fops); 2723 d_tracer, NULL, &ftrace_avail_fops);
2135 if (!entry)
2136 pr_warning("Could not create debugfs "
2137 "'available_filter_functions' entry\n");
2138 2724
2139 entry = debugfs_create_file("failures", 0444, 2725 trace_create_file("failures", 0444,
2140 d_tracer, NULL, &ftrace_failures_fops); 2726 d_tracer, NULL, &ftrace_failures_fops);
2141 if (!entry)
2142 pr_warning("Could not create debugfs 'failures' entry\n");
2143 2727
2144 entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, 2728 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2145 NULL, &ftrace_filter_fops); 2729 NULL, &ftrace_filter_fops);
2146 if (!entry)
2147 pr_warning("Could not create debugfs "
2148 "'set_ftrace_filter' entry\n");
2149 2730
2150 entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, 2731 trace_create_file("set_ftrace_notrace", 0644, d_tracer,
2151 NULL, &ftrace_notrace_fops); 2732 NULL, &ftrace_notrace_fops);
2152 if (!entry)
2153 pr_warning("Could not create debugfs "
2154 "'set_ftrace_notrace' entry\n");
2155 2733
2156#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2734#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2157 entry = debugfs_create_file("set_graph_function", 0444, d_tracer, 2735 trace_create_file("set_graph_function", 0444, d_tracer,
2158 NULL, 2736 NULL,
2159 &ftrace_graph_fops); 2737 &ftrace_graph_fops);
2160 if (!entry)
2161 pr_warning("Could not create debugfs "
2162 "'set_graph_function' entry\n");
2163#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2738#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2164 2739
2165 return 0; 2740 return 0;
@@ -2197,14 +2772,72 @@ static int ftrace_convert_nops(struct module *mod,
2197 return 0; 2772 return 0;
2198} 2773}
2199 2774
2200void ftrace_init_module(struct module *mod, 2775#ifdef CONFIG_MODULES
2201 unsigned long *start, unsigned long *end) 2776void ftrace_release(void *start, void *end)
2777{
2778 struct dyn_ftrace *rec;
2779 struct ftrace_page *pg;
2780 unsigned long s = (unsigned long)start;
2781 unsigned long e = (unsigned long)end;
2782
2783 if (ftrace_disabled || !start || start == end)
2784 return;
2785
2786 mutex_lock(&ftrace_lock);
2787 do_for_each_ftrace_rec(pg, rec) {
2788 if ((rec->ip >= s) && (rec->ip < e)) {
2789 /*
2790 * rec->ip is changed in ftrace_free_rec()
2791 * It should not between s and e if record was freed.
2792 */
2793 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
2794 ftrace_free_rec(rec);
2795 }
2796 } while_for_each_ftrace_rec();
2797 mutex_unlock(&ftrace_lock);
2798}
2799
2800static void ftrace_init_module(struct module *mod,
2801 unsigned long *start, unsigned long *end)
2202{ 2802{
2203 if (ftrace_disabled || start == end) 2803 if (ftrace_disabled || start == end)
2204 return; 2804 return;
2205 ftrace_convert_nops(mod, start, end); 2805 ftrace_convert_nops(mod, start, end);
2206} 2806}
2207 2807
2808static int ftrace_module_notify(struct notifier_block *self,
2809 unsigned long val, void *data)
2810{
2811 struct module *mod = data;
2812
2813 switch (val) {
2814 case MODULE_STATE_COMING:
2815 ftrace_init_module(mod, mod->ftrace_callsites,
2816 mod->ftrace_callsites +
2817 mod->num_ftrace_callsites);
2818 break;
2819 case MODULE_STATE_GOING:
2820 ftrace_release(mod->ftrace_callsites,
2821 mod->ftrace_callsites +
2822 mod->num_ftrace_callsites);
2823 break;
2824 }
2825
2826 return 0;
2827}
2828#else
/* Stub used when CONFIG_MODULES is not set: ignores the event, returns 0. */
2829static int ftrace_module_notify(struct notifier_block *self,
2830 unsigned long val, void *data)
2831{
2832 return 0;
2833}
2834#endif /* CONFIG_MODULES */
2835
/*
 * Notifier block that routes module state changes to
 * ftrace_module_notify(); registered via register_module_notifier().
 */
2836struct notifier_block ftrace_module_nb = {
2837 .notifier_call = ftrace_module_notify,
2838 .priority = 0,
2839};
2840
2208extern unsigned long __start_mcount_loc[]; 2841extern unsigned long __start_mcount_loc[];
2209extern unsigned long __stop_mcount_loc[]; 2842extern unsigned long __stop_mcount_loc[];
2210 2843
@@ -2236,6 +2869,12 @@ void __init ftrace_init(void)
2236 __start_mcount_loc, 2869 __start_mcount_loc,
2237 __stop_mcount_loc); 2870 __stop_mcount_loc);
2238 2871
2872 ret = register_module_notifier(&ftrace_module_nb);
2873 if (ret)
2874 pr_warning("Failed to register trace ftrace module notifier\n");
2875
2876 set_ftrace_early_filters();
2877
2239 return; 2878 return;
2240 failed: 2879 failed:
2241 ftrace_disabled = 1; 2880 ftrace_disabled = 1;
@@ -2417,7 +3056,6 @@ static const struct file_operations ftrace_pid_fops = {
2417static __init int ftrace_init_debugfs(void) 3056static __init int ftrace_init_debugfs(void)
2418{ 3057{
2419 struct dentry *d_tracer; 3058 struct dentry *d_tracer;
2420 struct dentry *entry;
2421 3059
2422 d_tracer = tracing_init_dentry(); 3060 d_tracer = tracing_init_dentry();
2423 if (!d_tracer) 3061 if (!d_tracer)
@@ -2425,11 +3063,11 @@ static __init int ftrace_init_debugfs(void)
2425 3063
2426 ftrace_init_dyn_debugfs(d_tracer); 3064 ftrace_init_dyn_debugfs(d_tracer);
2427 3065
2428 entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, 3066 trace_create_file("set_ftrace_pid", 0644, d_tracer,
2429 NULL, &ftrace_pid_fops); 3067 NULL, &ftrace_pid_fops);
2430 if (!entry) 3068
2431 pr_warning("Could not create debugfs " 3069 ftrace_profile_debugfs(d_tracer);
2432 "'set_ftrace_pid' entry\n"); 3070
2433 return 0; 3071 return 0;
2434} 3072}
2435fs_initcall(ftrace_init_debugfs); 3073fs_initcall(ftrace_init_debugfs);
@@ -2538,7 +3176,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
2538 3176
2539#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3177#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2540 3178
2541static atomic_t ftrace_graph_active; 3179static int ftrace_graph_active;
2542static struct notifier_block ftrace_suspend_notifier; 3180static struct notifier_block ftrace_suspend_notifier;
2543 3181
2544int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 3182int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -2580,12 +3218,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
2580 } 3218 }
2581 3219
2582 if (t->ret_stack == NULL) { 3220 if (t->ret_stack == NULL) {
2583 t->curr_ret_stack = -1;
2584 /* Make sure IRQs see the -1 first: */
2585 barrier();
2586 t->ret_stack = ret_stack_list[start++];
2587 atomic_set(&t->tracing_graph_pause, 0); 3221 atomic_set(&t->tracing_graph_pause, 0);
2588 atomic_set(&t->trace_overrun, 0); 3222 atomic_set(&t->trace_overrun, 0);
3223 t->curr_ret_stack = -1;
3224 /* Make sure the tasks see the -1 first: */
3225 smp_wmb();
3226 t->ret_stack = ret_stack_list[start++];
2589 } 3227 }
2590 } while_each_thread(g, t); 3228 } while_each_thread(g, t);
2591 3229
@@ -2643,8 +3281,10 @@ static int start_graph_tracing(void)
2643 return -ENOMEM; 3281 return -ENOMEM;
2644 3282
2645 /* The cpu_boot init_task->ret_stack will never be freed */ 3283 /* The cpu_boot init_task->ret_stack will never be freed */
2646 for_each_online_cpu(cpu) 3284 for_each_online_cpu(cpu) {
2647 ftrace_graph_init_task(idle_task(cpu)); 3285 if (!idle_task(cpu)->ret_stack)
3286 ftrace_graph_init_task(idle_task(cpu));
3287 }
2648 3288
2649 do { 3289 do {
2650 ret = alloc_retstack_tasklist(ret_stack_list); 3290 ret = alloc_retstack_tasklist(ret_stack_list);
@@ -2690,7 +3330,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2690 mutex_lock(&ftrace_lock); 3330 mutex_lock(&ftrace_lock);
2691 3331
2692 /* we currently allow only one tracer registered at a time */ 3332 /* we currently allow only one tracer registered at a time */
2693 if (atomic_read(&ftrace_graph_active)) { 3333 if (ftrace_graph_active) {
2694 ret = -EBUSY; 3334 ret = -EBUSY;
2695 goto out; 3335 goto out;
2696 } 3336 }
@@ -2698,10 +3338,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2698 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; 3338 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
2699 register_pm_notifier(&ftrace_suspend_notifier); 3339 register_pm_notifier(&ftrace_suspend_notifier);
2700 3340
2701 atomic_inc(&ftrace_graph_active); 3341 ftrace_graph_active++;
2702 ret = start_graph_tracing(); 3342 ret = start_graph_tracing();
2703 if (ret) { 3343 if (ret) {
2704 atomic_dec(&ftrace_graph_active); 3344 ftrace_graph_active--;
2705 goto out; 3345 goto out;
2706 } 3346 }
2707 3347
@@ -2719,10 +3359,10 @@ void unregister_ftrace_graph(void)
2719{ 3359{
2720 mutex_lock(&ftrace_lock); 3360 mutex_lock(&ftrace_lock);
2721 3361
2722 if (!unlikely(atomic_read(&ftrace_graph_active))) 3362 if (unlikely(!ftrace_graph_active))
2723 goto out; 3363 goto out;
2724 3364
2725 atomic_dec(&ftrace_graph_active); 3365 ftrace_graph_active--;
2726 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); 3366 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
2727 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3367 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2728 ftrace_graph_entry = ftrace_graph_entry_stub; 3368 ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -2736,18 +3376,25 @@ void unregister_ftrace_graph(void)
2736/* Allocate a return stack for newly created task */ 3376/* Allocate a return stack for newly created task */
2737void ftrace_graph_init_task(struct task_struct *t) 3377void ftrace_graph_init_task(struct task_struct *t)
2738{ 3378{
2739 if (atomic_read(&ftrace_graph_active)) { 3379 /* Make sure we do not use the parent ret_stack */
2740 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH 3380 t->ret_stack = NULL;
3381
3382 if (ftrace_graph_active) {
3383 struct ftrace_ret_stack *ret_stack;
3384
3385 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
2741 * sizeof(struct ftrace_ret_stack), 3386 * sizeof(struct ftrace_ret_stack),
2742 GFP_KERNEL); 3387 GFP_KERNEL);
2743 if (!t->ret_stack) 3388 if (!ret_stack)
2744 return; 3389 return;
2745 t->curr_ret_stack = -1; 3390 t->curr_ret_stack = -1;
2746 atomic_set(&t->tracing_graph_pause, 0); 3391 atomic_set(&t->tracing_graph_pause, 0);
2747 atomic_set(&t->trace_overrun, 0); 3392 atomic_set(&t->trace_overrun, 0);
2748 t->ftrace_timestamp = 0; 3393 t->ftrace_timestamp = 0;
2749 } else 3394 /* make curr_ret_stack visable before we add the ret_stack */
2750 t->ret_stack = NULL; 3395 smp_wmb();
3396 t->ret_stack = ret_stack;
3397 }
2751} 3398}
2752 3399
2753void ftrace_graph_exit_task(struct task_struct *t) 3400void ftrace_graph_exit_task(struct task_struct *t)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 5011f4d91e37..86cdf671d7e2 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -12,7 +12,7 @@
12#include <linux/dcache.h> 12#include <linux/dcache.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14 14
15#include <trace/kmemtrace.h> 15#include <linux/kmemtrace.h>
16 16
17#include "trace_output.h" 17#include "trace_output.h"
18#include "trace.h" 18#include "trace.h"
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
42 gfp_t gfp_flags, 42 gfp_t gfp_flags,
43 int node) 43 int node)
44{ 44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
45 struct trace_array *tr = kmemtrace_array; 46 struct trace_array *tr = kmemtrace_array;
46 struct kmemtrace_alloc_entry *entry; 47 struct kmemtrace_alloc_entry *entry;
47 struct ring_buffer_event *event; 48 struct ring_buffer_event *event;
@@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
62 entry->gfp_flags = gfp_flags; 63 entry->gfp_flags = gfp_flags;
63 entry->node = node; 64 entry->node = node;
64 65
65 ring_buffer_unlock_commit(tr->buffer, event); 66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
66 68
67 trace_wake_up(); 69 trace_wake_up();
68} 70}
@@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
71 unsigned long call_site, 73 unsigned long call_site,
72 const void *ptr) 74 const void *ptr)
73{ 75{
76 struct ftrace_event_call *call = &event_kmem_free;
74 struct trace_array *tr = kmemtrace_array; 77 struct trace_array *tr = kmemtrace_array;
75 struct kmemtrace_free_entry *entry; 78 struct kmemtrace_free_entry *entry;
76 struct ring_buffer_event *event; 79 struct ring_buffer_event *event;
@@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
86 entry->call_site = call_site; 89 entry->call_site = call_site;
87 entry->ptr = ptr; 90 entry->ptr = ptr;
88 91
89 ring_buffer_unlock_commit(tr->buffer, event); 92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
90 94
91 trace_wake_up(); 95 trace_wake_up();
92} 96}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 960cbf44c844..2e642b2b7253 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -22,6 +22,28 @@
22#include "trace.h" 22#include "trace.h"
23 23
24/* 24/*
25 * The ring buffer header is special. We must keep it up to date manually.
26 */
27int ring_buffer_print_entry_header(struct trace_seq *s)
28{
29 int ret;
30
31 ret = trace_seq_printf(s, "# compressed entry header\n");
32 ret = trace_seq_printf(s, "\ttype_len : 5 bits\n");
33 ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n");
34 ret = trace_seq_printf(s, "\tarray : 32 bits\n");
35 ret = trace_seq_printf(s, "\n");
36 ret = trace_seq_printf(s, "\tpadding : type == %d\n",
37 RINGBUF_TYPE_PADDING);
38 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
39 RINGBUF_TYPE_TIME_EXTEND);
40 ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
41 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
42
43 return ret;
44}
45
46/*
25 * The ring buffer is made up of a list of pages. A separate list of pages is 47 * The ring buffer is made up of a list of pages. A separate list of pages is
26 * allocated for each CPU. A writer may only write to a buffer that is 48 * allocated for each CPU. A writer may only write to a buffer that is
27 * associated with the CPU it is currently executing on. A reader may read 49 * associated with the CPU it is currently executing on. A reader may read
@@ -182,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
182 204
183#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 205#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
184#define RB_ALIGNMENT 4U 206#define RB_ALIGNMENT 4U
185#define RB_MAX_SMALL_DATA 28 207#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
208
209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
186 211
187enum { 212enum {
188 RB_LEN_TIME_EXTEND = 8, 213 RB_LEN_TIME_EXTEND = 8,
@@ -191,48 +216,28 @@ enum {
191 216
192static inline int rb_null_event(struct ring_buffer_event *event) 217static inline int rb_null_event(struct ring_buffer_event *event)
193{ 218{
194 return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; 219 return event->type_len == RINGBUF_TYPE_PADDING
220 && event->time_delta == 0;
195} 221}
196 222
197static inline int rb_discarded_event(struct ring_buffer_event *event) 223static inline int rb_discarded_event(struct ring_buffer_event *event)
198{ 224{
199 return event->type == RINGBUF_TYPE_PADDING && event->time_delta; 225 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
200} 226}
201 227
202static void rb_event_set_padding(struct ring_buffer_event *event) 228static void rb_event_set_padding(struct ring_buffer_event *event)
203{ 229{
204 event->type = RINGBUF_TYPE_PADDING; 230 event->type_len = RINGBUF_TYPE_PADDING;
205 event->time_delta = 0; 231 event->time_delta = 0;
206} 232}
207 233
208/**
209 * ring_buffer_event_discard - discard an event in the ring buffer
210 * @buffer: the ring buffer
211 * @event: the event to discard
212 *
213 * Sometimes a event that is in the ring buffer needs to be ignored.
214 * This function lets the user discard an event in the ring buffer
215 * and then that event will not be read later.
216 *
217 * Note, it is up to the user to be careful with this, and protect
218 * against races. If the user discards an event that has been consumed
219 * it is possible that it could corrupt the ring buffer.
220 */
221void ring_buffer_event_discard(struct ring_buffer_event *event)
222{
223 event->type = RINGBUF_TYPE_PADDING;
224 /* time delta must be non zero */
225 if (!event->time_delta)
226 event->time_delta = 1;
227}
228
229static unsigned 234static unsigned
230rb_event_data_length(struct ring_buffer_event *event) 235rb_event_data_length(struct ring_buffer_event *event)
231{ 236{
232 unsigned length; 237 unsigned length;
233 238
234 if (event->len) 239 if (event->type_len)
235 length = event->len * RB_ALIGNMENT; 240 length = event->type_len * RB_ALIGNMENT;
236 else 241 else
237 length = event->array[0]; 242 length = event->array[0];
238 return length + RB_EVNT_HDR_SIZE; 243 return length + RB_EVNT_HDR_SIZE;
@@ -242,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event)
242static unsigned 247static unsigned
243rb_event_length(struct ring_buffer_event *event) 248rb_event_length(struct ring_buffer_event *event)
244{ 249{
245 switch (event->type) { 250 switch (event->type_len) {
246 case RINGBUF_TYPE_PADDING: 251 case RINGBUF_TYPE_PADDING:
247 if (rb_null_event(event)) 252 if (rb_null_event(event))
248 /* undefined */ 253 /* undefined */
249 return -1; 254 return -1;
250 return rb_event_data_length(event); 255 return event->array[0] + RB_EVNT_HDR_SIZE;
251 256
252 case RINGBUF_TYPE_TIME_EXTEND: 257 case RINGBUF_TYPE_TIME_EXTEND:
253 return RB_LEN_TIME_EXTEND; 258 return RB_LEN_TIME_EXTEND;
@@ -271,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event)
271unsigned ring_buffer_event_length(struct ring_buffer_event *event) 276unsigned ring_buffer_event_length(struct ring_buffer_event *event)
272{ 277{
273 unsigned length = rb_event_length(event); 278 unsigned length = rb_event_length(event);
274 if (event->type != RINGBUF_TYPE_DATA) 279 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
275 return length; 280 return length;
276 length -= RB_EVNT_HDR_SIZE; 281 length -= RB_EVNT_HDR_SIZE;
277 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) 282 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -284,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
284static void * 289static void *
285rb_event_data(struct ring_buffer_event *event) 290rb_event_data(struct ring_buffer_event *event)
286{ 291{
287 BUG_ON(event->type != RINGBUF_TYPE_DATA); 292 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
288 /* If length is in len field, then array[0] has the data */ 293 /* If length is in len field, then array[0] has the data */
289 if (event->len) 294 if (event->type_len)
290 return (void *)&event->array[0]; 295 return (void *)&event->array[0];
291 /* Otherwise length is in array[0] and array[1] has the data */ 296 /* Otherwise length is in array[0] and array[1] has the data */
292 return (void *)&event->array[1]; 297 return (void *)&event->array[1];
@@ -316,9 +321,10 @@ struct buffer_data_page {
316}; 321};
317 322
318struct buffer_page { 323struct buffer_page {
324 struct list_head list; /* list of buffer pages */
319 local_t write; /* index for next write */ 325 local_t write; /* index for next write */
320 unsigned read; /* index for next read */ 326 unsigned read; /* index for next read */
321 struct list_head list; /* list of free pages */ 327 local_t entries; /* entries on this page */
322 struct buffer_data_page *page; /* Actual data page */ 328 struct buffer_data_page *page; /* Actual data page */
323}; 329};
324 330
@@ -361,6 +367,34 @@ static inline int test_time_stamp(u64 delta)
361 367
362#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) 368#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
363 369
370/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
371#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
372
373/* Max number of timestamps that can fit on a page */
374#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
375
376int ring_buffer_print_page_header(struct trace_seq *s)
377{
378 struct buffer_data_page field;
379 int ret;
380
381 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
382 "offset:0;\tsize:%u;\n",
383 (unsigned int)sizeof(field.time_stamp));
384
385 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
386 "offset:%u;\tsize:%u;\n",
387 (unsigned int)offsetof(typeof(field), commit),
388 (unsigned int)sizeof(field.commit));
389
390 ret = trace_seq_printf(s, "\tfield: char data;\t"
391 "offset:%u;\tsize:%u;\n",
392 (unsigned int)offsetof(typeof(field), data),
393 (unsigned int)BUF_PAGE_SIZE);
394
395 return ret;
396}
397
364/* 398/*
365 * head_page == tail_page && head == tail then buffer is empty. 399 * head_page == tail_page && head == tail then buffer is empty.
366 */ 400 */
@@ -375,8 +409,11 @@ struct ring_buffer_per_cpu {
375 struct buffer_page *tail_page; /* write to tail */ 409 struct buffer_page *tail_page; /* write to tail */
376 struct buffer_page *commit_page; /* committed pages */ 410 struct buffer_page *commit_page; /* committed pages */
377 struct buffer_page *reader_page; 411 struct buffer_page *reader_page;
412 unsigned long nmi_dropped;
413 unsigned long commit_overrun;
378 unsigned long overrun; 414 unsigned long overrun;
379 unsigned long entries; 415 unsigned long read;
416 local_t entries;
380 u64 write_stamp; 417 u64 write_stamp;
381 u64 read_stamp; 418 u64 read_stamp;
382 atomic_t record_disabled; 419 atomic_t record_disabled;
@@ -389,6 +426,8 @@ struct ring_buffer {
389 atomic_t record_disabled; 426 atomic_t record_disabled;
390 cpumask_var_t cpumask; 427 cpumask_var_t cpumask;
391 428
429 struct lock_class_key *reader_lock_key;
430
392 struct mutex mutex; 431 struct mutex mutex;
393 432
394 struct ring_buffer_per_cpu **buffers; 433 struct ring_buffer_per_cpu **buffers;
@@ -420,13 +459,18 @@ struct ring_buffer_iter {
420/* Up this if you want to test the TIME_EXTENTS and normalization */ 459/* Up this if you want to test the TIME_EXTENTS and normalization */
421#define DEBUG_SHIFT 0 460#define DEBUG_SHIFT 0
422 461
/*
 * Return the value of @buffer's clock callback shifted left by
 * DEBUG_SHIFT. The @cpu argument is not used in this body, and nothing
 * here disables preemption.
 */
462static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
463{
464 /* shift to debug/test normalization and TIME_EXTENTS */
465 return buffer->clock() << DEBUG_SHIFT;
466}
467
423u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) 468u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
424{ 469{
425 u64 time; 470 u64 time;
426 471
427 preempt_disable_notrace(); 472 preempt_disable_notrace();
428 /* shift to debug/test normalization and TIME_EXTENTS */ 473 time = rb_time_stamp(buffer, cpu);
429 time = buffer->clock() << DEBUG_SHIFT;
430 preempt_enable_no_resched_notrace(); 474 preempt_enable_no_resched_notrace();
431 475
432 return time; 476 return time;
@@ -523,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
523 cpu_buffer->cpu = cpu; 567 cpu_buffer->cpu = cpu;
524 cpu_buffer->buffer = buffer; 568 cpu_buffer->buffer = buffer;
525 spin_lock_init(&cpu_buffer->reader_lock); 569 spin_lock_init(&cpu_buffer->reader_lock);
570 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
526 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 571 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
527 INIT_LIST_HEAD(&cpu_buffer->pages); 572 INIT_LIST_HEAD(&cpu_buffer->pages);
528 573
@@ -593,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self,
593 * when the buffer wraps. If this flag is not set, the buffer will 638 * when the buffer wraps. If this flag is not set, the buffer will
594 * drop data when the tail hits the head. 639 * drop data when the tail hits the head.
595 */ 640 */
596struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) 641struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
642 struct lock_class_key *key)
597{ 643{
598 struct ring_buffer *buffer; 644 struct ring_buffer *buffer;
599 int bsize; 645 int bsize;
@@ -616,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
616 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 662 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
617 buffer->flags = flags; 663 buffer->flags = flags;
618 buffer->clock = trace_clock_local; 664 buffer->clock = trace_clock_local;
665 buffer->reader_lock_key = key;
619 666
620 /* need at least two pages */ 667 /* need at least two pages */
621 if (buffer->pages == 1) 668 if (buffer->pages == 1)
@@ -673,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
673 kfree(buffer); 720 kfree(buffer);
674 return NULL; 721 return NULL;
675} 722}
676EXPORT_SYMBOL_GPL(ring_buffer_alloc); 723EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
677 724
678/** 725/**
679 * ring_buffer_free - free a ring buffer. 726 * ring_buffer_free - free a ring buffer.
@@ -947,31 +994,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
947 return rb_page_commit(cpu_buffer->head_page); 994 return rb_page_commit(cpu_buffer->head_page);
948} 995}
949 996
950/*
951 * When the tail hits the head and the buffer is in overwrite mode,
952 * the head jumps to the next page and all content on the previous
953 * page is discarded. But before doing so, we update the overrun
954 * variable of the buffer.
955 */
956static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
957{
958 struct ring_buffer_event *event;
959 unsigned long head;
960
961 for (head = 0; head < rb_head_size(cpu_buffer);
962 head += rb_event_length(event)) {
963
964 event = __rb_page_index(cpu_buffer->head_page, head);
965 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
966 return;
967 /* Only count data entries */
968 if (event->type != RINGBUF_TYPE_DATA)
969 continue;
970 cpu_buffer->overrun++;
971 cpu_buffer->entries--;
972 }
973}
974
975static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, 997static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
976 struct buffer_page **bpage) 998 struct buffer_page **bpage)
977{ 999{
@@ -991,7 +1013,7 @@ rb_event_index(struct ring_buffer_event *event)
991 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 1013 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
992} 1014}
993 1015
994static int 1016static inline int
995rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 1017rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
996 struct ring_buffer_event *event) 1018 struct ring_buffer_event *event)
997{ 1019{
@@ -1110,28 +1132,21 @@ static void
1110rb_update_event(struct ring_buffer_event *event, 1132rb_update_event(struct ring_buffer_event *event,
1111 unsigned type, unsigned length) 1133 unsigned type, unsigned length)
1112{ 1134{
1113 event->type = type; 1135 event->type_len = type;
1114 1136
1115 switch (type) { 1137 switch (type) {
1116 1138
1117 case RINGBUF_TYPE_PADDING: 1139 case RINGBUF_TYPE_PADDING:
1118 break;
1119
1120 case RINGBUF_TYPE_TIME_EXTEND: 1140 case RINGBUF_TYPE_TIME_EXTEND:
1121 event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
1122 break;
1123
1124 case RINGBUF_TYPE_TIME_STAMP: 1141 case RINGBUF_TYPE_TIME_STAMP:
1125 event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
1126 break; 1142 break;
1127 1143
1128 case RINGBUF_TYPE_DATA: 1144 case 0:
1129 length -= RB_EVNT_HDR_SIZE; 1145 length -= RB_EVNT_HDR_SIZE;
1130 if (length > RB_MAX_SMALL_DATA) { 1146 if (length > RB_MAX_SMALL_DATA)
1131 event->len = 0;
1132 event->array[0] = length; 1147 event->array[0] = length;
1133 } else 1148 else
1134 event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1149 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1135 break; 1150 break;
1136 default: 1151 default:
1137 BUG(); 1152 BUG();
@@ -1155,131 +1170,156 @@ static unsigned rb_calculate_event_length(unsigned length)
1155 return length; 1170 return length;
1156} 1171}
1157 1172
1173
1158static struct ring_buffer_event * 1174static struct ring_buffer_event *
1159__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1175rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1160 unsigned type, unsigned long length, u64 *ts) 1176 unsigned long length, unsigned long tail,
1177 struct buffer_page *commit_page,
1178 struct buffer_page *tail_page, u64 *ts)
1161{ 1179{
1162 struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; 1180 struct buffer_page *next_page, *head_page, *reader_page;
1163 unsigned long tail, write;
1164 struct ring_buffer *buffer = cpu_buffer->buffer; 1181 struct ring_buffer *buffer = cpu_buffer->buffer;
1165 struct ring_buffer_event *event; 1182 struct ring_buffer_event *event;
1166 unsigned long flags;
1167 bool lock_taken = false; 1183 bool lock_taken = false;
1184 unsigned long flags;
1168 1185
1169 commit_page = cpu_buffer->commit_page; 1186 next_page = tail_page;
1170 /* we just need to protect against interrupts */
1171 barrier();
1172 tail_page = cpu_buffer->tail_page;
1173 write = local_add_return(length, &tail_page->write);
1174 tail = write - length;
1175 1187
1176 /* See if we shot pass the end of this buffer page */ 1188 local_irq_save(flags);
1177 if (write > BUF_PAGE_SIZE) { 1189 /*
1178 struct buffer_page *next_page = tail_page; 1190 * Since the write to the buffer is still not
1191 * fully lockless, we must be careful with NMIs.
1192 * The locks in the writers are taken when a write
1193 * crosses to a new page. The locks protect against
1194 * races with the readers (this will soon be fixed
1195 * with a lockless solution).
1196 *
1197 * Because we can not protect against NMIs, and we
1198 * want to keep traces reentrant, we need to manage
1199 * what happens when we are in an NMI.
1200 *
1201 * NMIs can happen after we take the lock.
1202 * If we are in an NMI, only take the lock
1203 * if it is not already taken. Otherwise
1204 * simply fail.
1205 */
1206 if (unlikely(in_nmi())) {
1207 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1208 cpu_buffer->nmi_dropped++;
1209 goto out_reset;
1210 }
1211 } else
1212 __raw_spin_lock(&cpu_buffer->lock);
1179 1213
1180 local_irq_save(flags); 1214 lock_taken = true;
1181 /*
1182 * Since the write to the buffer is still not
1183 * fully lockless, we must be careful with NMIs.
1184 * The locks in the writers are taken when a write
1185 * crosses to a new page. The locks protect against
1186 * races with the readers (this will soon be fixed
1187 * with a lockless solution).
1188 *
1189 * Because we can not protect against NMIs, and we
1190 * want to keep traces reentrant, we need to manage
1191 * what happens when we are in an NMI.
1192 *
1193 * NMIs can happen after we take the lock.
1194 * If we are in an NMI, only take the lock
1195 * if it is not already taken. Otherwise
1196 * simply fail.
1197 */
1198 if (unlikely(in_nmi())) {
1199 if (!__raw_spin_trylock(&cpu_buffer->lock))
1200 goto out_reset;
1201 } else
1202 __raw_spin_lock(&cpu_buffer->lock);
1203 1215
1204 lock_taken = true; 1216 rb_inc_page(cpu_buffer, &next_page);
1205 1217
1206 rb_inc_page(cpu_buffer, &next_page); 1218 head_page = cpu_buffer->head_page;
1219 reader_page = cpu_buffer->reader_page;
1207 1220
1208 head_page = cpu_buffer->head_page; 1221 /* we grabbed the lock before incrementing */
1209 reader_page = cpu_buffer->reader_page; 1222 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1223 goto out_reset;
1210 1224
1211 /* we grabbed the lock before incrementing */ 1225 /*
1212 if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) 1226 * If for some reason, we had an interrupt storm that made
1213 goto out_reset; 1227 * it all the way around the buffer, bail, and warn
1228 * about it.
1229 */
1230 if (unlikely(next_page == commit_page)) {
1231 cpu_buffer->commit_overrun++;
1232 goto out_reset;
1233 }
1214 1234
1215 /* 1235 if (next_page == head_page) {
1216 * If for some reason, we had an interrupt storm that made 1236 if (!(buffer->flags & RB_FL_OVERWRITE))
1217 * it all the way around the buffer, bail, and warn
1218 * about it.
1219 */
1220 if (unlikely(next_page == commit_page)) {
1221 WARN_ON_ONCE(1);
1222 goto out_reset; 1237 goto out_reset;
1223 }
1224 1238
1225 if (next_page == head_page) { 1239 /* tail_page has not moved yet? */
1226 if (!(buffer->flags & RB_FL_OVERWRITE)) 1240 if (tail_page == cpu_buffer->tail_page) {
1227 goto out_reset; 1241 /* count overflows */
1228 1242 cpu_buffer->overrun +=
1229 /* tail_page has not moved yet? */ 1243 local_read(&head_page->entries);
1230 if (tail_page == cpu_buffer->tail_page) {
1231 /* count overflows */
1232 rb_update_overflow(cpu_buffer);
1233 1244
1234 rb_inc_page(cpu_buffer, &head_page); 1245 rb_inc_page(cpu_buffer, &head_page);
1235 cpu_buffer->head_page = head_page; 1246 cpu_buffer->head_page = head_page;
1236 cpu_buffer->head_page->read = 0; 1247 cpu_buffer->head_page->read = 0;
1237 }
1238 } 1248 }
1249 }
1239 1250
1240 /* 1251 /*
1241 * If the tail page is still the same as what we think 1252 * If the tail page is still the same as what we think
1242 * it is, then it is up to us to update the tail 1253 * it is, then it is up to us to update the tail
1243 * pointer. 1254 * pointer.
1244 */ 1255 */
1245 if (tail_page == cpu_buffer->tail_page) { 1256 if (tail_page == cpu_buffer->tail_page) {
1246 local_set(&next_page->write, 0); 1257 local_set(&next_page->write, 0);
1247 local_set(&next_page->page->commit, 0); 1258 local_set(&next_page->entries, 0);
1248 cpu_buffer->tail_page = next_page; 1259 local_set(&next_page->page->commit, 0);
1260 cpu_buffer->tail_page = next_page;
1261
1262 /* reread the time stamp */
1263 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1264 cpu_buffer->tail_page->page->time_stamp = *ts;
1265 }
1249 1266
1250 /* reread the time stamp */ 1267 /*
1251 *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); 1268 * The actual tail page has moved forward.
1252 cpu_buffer->tail_page->page->time_stamp = *ts; 1269 */
1253 } 1270 if (tail < BUF_PAGE_SIZE) {
1271 /* Mark the rest of the page with padding */
1272 event = __rb_page_index(tail_page, tail);
1273 rb_event_set_padding(event);
1274 }
1254 1275
1255 /* 1276 /* Set the write back to the previous setting */
1256 * The actual tail page has moved forward. 1277 local_sub(length, &tail_page->write);
1257 */
1258 if (tail < BUF_PAGE_SIZE) {
1259 /* Mark the rest of the page with padding */
1260 event = __rb_page_index(tail_page, tail);
1261 rb_event_set_padding(event);
1262 }
1263 1278
1264 if (tail <= BUF_PAGE_SIZE) 1279 /*
1265 /* Set the write back to the previous setting */ 1280 * If this was a commit entry that failed,
1266 local_set(&tail_page->write, tail); 1281 * increment that too
1282 */
1283 if (tail_page == cpu_buffer->commit_page &&
1284 tail == rb_commit_index(cpu_buffer)) {
1285 rb_set_commit_to_write(cpu_buffer);
1286 }
1267 1287
1268 /* 1288 __raw_spin_unlock(&cpu_buffer->lock);
1269 * If this was a commit entry that failed, 1289 local_irq_restore(flags);
1270 * increment that too 1290
1271 */ 1291 /* fail and let the caller try again */
1272 if (tail_page == cpu_buffer->commit_page && 1292 return ERR_PTR(-EAGAIN);
1273 tail == rb_commit_index(cpu_buffer)) { 1293
1274 rb_set_commit_to_write(cpu_buffer); 1294 out_reset:
1275 } 1295 /* reset write */
1296 local_sub(length, &tail_page->write);
1276 1297
1298 if (likely(lock_taken))
1277 __raw_spin_unlock(&cpu_buffer->lock); 1299 __raw_spin_unlock(&cpu_buffer->lock);
1278 local_irq_restore(flags); 1300 local_irq_restore(flags);
1301 return NULL;
1302}
1279 1303
1280 /* fail and let the caller try again */ 1304static struct ring_buffer_event *
1281 return ERR_PTR(-EAGAIN); 1305__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1282 } 1306 unsigned type, unsigned long length, u64 *ts)
1307{
1308 struct buffer_page *tail_page, *commit_page;
1309 struct ring_buffer_event *event;
1310 unsigned long tail, write;
1311
1312 commit_page = cpu_buffer->commit_page;
1313 /* we just need to protect against interrupts */
1314 barrier();
1315 tail_page = cpu_buffer->tail_page;
1316 write = local_add_return(length, &tail_page->write);
1317 tail = write - length;
1318
1319 /* See if we shot pass the end of this buffer page */
1320 if (write > BUF_PAGE_SIZE)
1321 return rb_move_tail(cpu_buffer, length, tail,
1322 commit_page, tail_page, ts);
1283 1323
1284 /* We reserved something on the buffer */ 1324 /* We reserved something on the buffer */
1285 1325
@@ -1289,6 +1329,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1289 event = __rb_page_index(tail_page, tail); 1329 event = __rb_page_index(tail_page, tail);
1290 rb_update_event(event, type, length); 1330 rb_update_event(event, type, length);
1291 1331
1332 /* The passed in type is zero for DATA */
1333 if (likely(!type))
1334 local_inc(&tail_page->entries);
1335
1292 /* 1336 /*
1293 * If this is a commit and the tail is zero, then update 1337 * If this is a commit and the tail is zero, then update
1294 * this page's time stamp. 1338 * this page's time stamp.
@@ -1297,16 +1341,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1297 cpu_buffer->commit_page->page->time_stamp = *ts; 1341 cpu_buffer->commit_page->page->time_stamp = *ts;
1298 1342
1299 return event; 1343 return event;
1344}
1300 1345
1301 out_reset: 1346static inline int
1302 /* reset write */ 1347rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1303 if (tail <= BUF_PAGE_SIZE) 1348 struct ring_buffer_event *event)
1304 local_set(&tail_page->write, tail); 1349{
1350 unsigned long new_index, old_index;
1351 struct buffer_page *bpage;
1352 unsigned long index;
1353 unsigned long addr;
1305 1354
1306 if (likely(lock_taken)) 1355 new_index = rb_event_index(event);
1307 __raw_spin_unlock(&cpu_buffer->lock); 1356 old_index = new_index + rb_event_length(event);
1308 local_irq_restore(flags); 1357 addr = (unsigned long)event;
1309 return NULL; 1358 addr &= PAGE_MASK;
1359
1360 bpage = cpu_buffer->tail_page;
1361
1362 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1363 /*
1364 * This is on the tail page. It is possible that
1365 * a write could come in and move the tail page
1366 * and write to the next page. That is fine
1367 * because we just shorten what is on this page.
1368 */
1369 index = local_cmpxchg(&bpage->write, old_index, new_index);
1370 if (index == old_index)
1371 return 1;
1372 }
1373
1374 /* could not discard */
1375 return 0;
1310} 1376}
1311 1377
1312static int 1378static int
@@ -1351,16 +1417,23 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1351 event->array[0] = *delta >> TS_SHIFT; 1417 event->array[0] = *delta >> TS_SHIFT;
1352 } else { 1418 } else {
1353 cpu_buffer->commit_page->page->time_stamp = *ts; 1419 cpu_buffer->commit_page->page->time_stamp = *ts;
1354 event->time_delta = 0; 1420 /* try to discard, since we do not need this */
1355 event->array[0] = 0; 1421 if (!rb_try_to_discard(cpu_buffer, event)) {
1422 /* nope, just zero it */
1423 event->time_delta = 0;
1424 event->array[0] = 0;
1425 }
1356 } 1426 }
1357 cpu_buffer->write_stamp = *ts; 1427 cpu_buffer->write_stamp = *ts;
1358 /* let the caller know this was the commit */ 1428 /* let the caller know this was the commit */
1359 ret = 1; 1429 ret = 1;
1360 } else { 1430 } else {
1361 /* Darn, this is just wasted space */ 1431 /* Try to discard the event */
1362 event->time_delta = 0; 1432 if (!rb_try_to_discard(cpu_buffer, event)) {
1363 event->array[0] = 0; 1433 /* Darn, this is just wasted space */
1434 event->time_delta = 0;
1435 event->array[0] = 0;
1436 }
1364 ret = 0; 1437 ret = 0;
1365 } 1438 }
1366 1439
@@ -1371,13 +1444,14 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1371 1444
1372static struct ring_buffer_event * 1445static struct ring_buffer_event *
1373rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 1446rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1374 unsigned type, unsigned long length) 1447 unsigned long length)
1375{ 1448{
1376 struct ring_buffer_event *event; 1449 struct ring_buffer_event *event;
1377 u64 ts, delta; 1450 u64 ts, delta = 0;
1378 int commit = 0; 1451 int commit = 0;
1379 int nr_loops = 0; 1452 int nr_loops = 0;
1380 1453
1454 length = rb_calculate_event_length(length);
1381 again: 1455 again:
1382 /* 1456 /*
1383 * We allow for interrupts to reenter here and do a trace. 1457 * We allow for interrupts to reenter here and do a trace.
@@ -1391,7 +1465,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1391 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 1465 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1392 return NULL; 1466 return NULL;
1393 1467
1394 ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 1468 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1395 1469
1396 /* 1470 /*
1397 * Only the first commit can update the timestamp. 1471 * Only the first commit can update the timestamp.
@@ -1401,23 +1475,24 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1401 * also be made. But only the entry that did the actual 1475 * also be made. But only the entry that did the actual
1402 * commit will be something other than zero. 1476 * commit will be something other than zero.
1403 */ 1477 */
1404 if (cpu_buffer->tail_page == cpu_buffer->commit_page && 1478 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
1405 rb_page_write(cpu_buffer->tail_page) == 1479 rb_page_write(cpu_buffer->tail_page) ==
1406 rb_commit_index(cpu_buffer)) { 1480 rb_commit_index(cpu_buffer))) {
1481 u64 diff;
1407 1482
1408 delta = ts - cpu_buffer->write_stamp; 1483 diff = ts - cpu_buffer->write_stamp;
1409 1484
1410 /* make sure this delta is calculated here */ 1485 /* make sure this diff is calculated here */
1411 barrier(); 1486 barrier();
1412 1487
1413 /* Did the write stamp get updated already? */ 1488 /* Did the write stamp get updated already? */
1414 if (unlikely(ts < cpu_buffer->write_stamp)) 1489 if (unlikely(ts < cpu_buffer->write_stamp))
1415 delta = 0; 1490 goto get_event;
1416 1491
1417 if (test_time_stamp(delta)) { 1492 delta = diff;
1493 if (unlikely(test_time_stamp(delta))) {
1418 1494
1419 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 1495 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1420
1421 if (commit == -EBUSY) 1496 if (commit == -EBUSY)
1422 return NULL; 1497 return NULL;
1423 1498
@@ -1426,12 +1501,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1426 1501
1427 RB_WARN_ON(cpu_buffer, commit < 0); 1502 RB_WARN_ON(cpu_buffer, commit < 0);
1428 } 1503 }
1429 } else 1504 }
1430 /* Non commits have zero deltas */
1431 delta = 0;
1432 1505
1433 event = __rb_reserve_next(cpu_buffer, type, length, &ts); 1506 get_event:
1434 if (PTR_ERR(event) == -EAGAIN) 1507 event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
1508 if (unlikely(PTR_ERR(event) == -EAGAIN))
1435 goto again; 1509 goto again;
1436 1510
1437 if (!event) { 1511 if (!event) {
@@ -1448,7 +1522,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1448 * If the timestamp was commited, make the commit our entry 1522 * If the timestamp was commited, make the commit our entry
1449 * now so that we will update it when needed. 1523 * now so that we will update it when needed.
1450 */ 1524 */
1451 if (commit) 1525 if (unlikely(commit))
1452 rb_set_commit_event(cpu_buffer, event); 1526 rb_set_commit_event(cpu_buffer, event);
1453 else if (!rb_is_commit(cpu_buffer, event)) 1527 else if (!rb_is_commit(cpu_buffer, event))
1454 delta = 0; 1528 delta = 0;
@@ -1458,6 +1532,36 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1458 return event; 1532 return event;
1459} 1533}
1460 1534
1535#define TRACE_RECURSIVE_DEPTH 16
1536
1537static int trace_recursive_lock(void)
1538{
1539 current->trace_recursion++;
1540
1541 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
1542 return 0;
1543
1544 /* Disable all tracing before we do anything else */
1545 tracing_off_permanent();
1546
1547 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
1548 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
1549 current->trace_recursion,
1550 hardirq_count() >> HARDIRQ_SHIFT,
1551 softirq_count() >> SOFTIRQ_SHIFT,
1552 in_nmi());
1553
1554 WARN_ON_ONCE(1);
1555 return -1;
1556}
1557
1558static void trace_recursive_unlock(void)
1559{
1560 WARN_ON_ONCE(!current->trace_recursion);
1561
1562 current->trace_recursion--;
1563}
1564
1461static DEFINE_PER_CPU(int, rb_need_resched); 1565static DEFINE_PER_CPU(int, rb_need_resched);
1462 1566
1463/** 1567/**
@@ -1491,6 +1595,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1491 /* If we are tracing schedule, we don't want to recurse */ 1595 /* If we are tracing schedule, we don't want to recurse */
1492 resched = ftrace_preempt_disable(); 1596 resched = ftrace_preempt_disable();
1493 1597
1598 if (trace_recursive_lock())
1599 goto out_nocheck;
1600
1494 cpu = raw_smp_processor_id(); 1601 cpu = raw_smp_processor_id();
1495 1602
1496 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 1603 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1501,11 +1608,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1501 if (atomic_read(&cpu_buffer->record_disabled)) 1608 if (atomic_read(&cpu_buffer->record_disabled))
1502 goto out; 1609 goto out;
1503 1610
1504 length = rb_calculate_event_length(length); 1611 if (length > BUF_MAX_DATA_SIZE)
1505 if (length > BUF_PAGE_SIZE)
1506 goto out; 1612 goto out;
1507 1613
1508 event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); 1614 event = rb_reserve_next_event(cpu_buffer, length);
1509 if (!event) 1615 if (!event)
1510 goto out; 1616 goto out;
1511 1617
@@ -1520,6 +1626,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1520 return event; 1626 return event;
1521 1627
1522 out: 1628 out:
1629 trace_recursive_unlock();
1630
1631 out_nocheck:
1523 ftrace_preempt_enable(resched); 1632 ftrace_preempt_enable(resched);
1524 return NULL; 1633 return NULL;
1525} 1634}
@@ -1528,7 +1637,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1528static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 1637static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1529 struct ring_buffer_event *event) 1638 struct ring_buffer_event *event)
1530{ 1639{
1531 cpu_buffer->entries++; 1640 local_inc(&cpu_buffer->entries);
1532 1641
1533 /* Only process further if we own the commit */ 1642 /* Only process further if we own the commit */
1534 if (!rb_is_commit(cpu_buffer, event)) 1643 if (!rb_is_commit(cpu_buffer, event))
@@ -1558,6 +1667,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1558 1667
1559 rb_commit(cpu_buffer, event); 1668 rb_commit(cpu_buffer, event);
1560 1669
1670 trace_recursive_unlock();
1671
1561 /* 1672 /*
1562 * Only the last preempt count needs to restore preemption. 1673 * Only the last preempt count needs to restore preemption.
1563 */ 1674 */
@@ -1570,6 +1681,99 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1570} 1681}
1571EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); 1682EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
1572 1683
1684static inline void rb_event_discard(struct ring_buffer_event *event)
1685{
1686 /* array[0] holds the actual length for the discarded event */
1687 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
1688 event->type_len = RINGBUF_TYPE_PADDING;
1689 /* time delta must be non zero */
1690 if (!event->time_delta)
1691 event->time_delta = 1;
1692}
1693
1694/**
1695 * ring_buffer_event_discard - discard any event in the ring buffer
1696 * @event: the event to discard
1697 *
1698 * Sometimes a event that is in the ring buffer needs to be ignored.
1699 * This function lets the user discard an event in the ring buffer
1700 * and then that event will not be read later.
1701 *
1702 * Note, it is up to the user to be careful with this, and protect
1703 * against races. If the user discards an event that has been consumed
1704 * it is possible that it could corrupt the ring buffer.
1705 */
1706void ring_buffer_event_discard(struct ring_buffer_event *event)
1707{
1708 rb_event_discard(event);
1709}
1710EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
1711
1712/**
1713 * ring_buffer_commit_discard - discard an event that has not been committed
1714 * @buffer: the ring buffer
1715 * @event: non committed event to discard
1716 *
1717 * This is similar to ring_buffer_event_discard but must only be
1718 * performed on an event that has not been committed yet. The difference
1719 * is that this will also try to free the event from the ring buffer
1720 * if another event has not been added behind it.
1721 *
1722 * If another event has been added behind it, it will set the event
1723 * up as discarded, and perform the commit.
1724 *
1725 * If this function is called, do not call ring_buffer_unlock_commit on
1726 * the event.
1727 */
1728void ring_buffer_discard_commit(struct ring_buffer *buffer,
1729 struct ring_buffer_event *event)
1730{
1731 struct ring_buffer_per_cpu *cpu_buffer;
1732 int cpu;
1733
1734 /* The event is discarded regardless */
1735 rb_event_discard(event);
1736
1737 /*
1738 * This must only be called if the event has not been
1739 * committed yet. Thus we can assume that preemption
1740 * is still disabled.
1741 */
1742 RB_WARN_ON(buffer, preemptible());
1743
1744 cpu = smp_processor_id();
1745 cpu_buffer = buffer->buffers[cpu];
1746
1747 if (!rb_try_to_discard(cpu_buffer, event))
1748 goto out;
1749
1750 /*
1751 * The commit is still visible by the reader, so we
1752 * must increment entries.
1753 */
1754 local_inc(&cpu_buffer->entries);
1755 out:
1756 /*
1757 * If a write came in and pushed the tail page
1758 * we still need to update the commit pointer
1759 * if we were the commit.
1760 */
1761 if (rb_is_commit(cpu_buffer, event))
1762 rb_set_commit_to_write(cpu_buffer);
1763
1764 trace_recursive_unlock();
1765
1766 /*
1767 * Only the last preempt count needs to restore preemption.
1768 */
1769 if (preempt_count() == 1)
1770 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
1771 else
1772 preempt_enable_no_resched_notrace();
1773
1774}
1775EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
1776
1573/** 1777/**
1574 * ring_buffer_write - write data to the buffer without reserving 1778 * ring_buffer_write - write data to the buffer without reserving
1575 * @buffer: The ring buffer to write to. 1779 * @buffer: The ring buffer to write to.
@@ -1589,7 +1793,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
1589{ 1793{
1590 struct ring_buffer_per_cpu *cpu_buffer; 1794 struct ring_buffer_per_cpu *cpu_buffer;
1591 struct ring_buffer_event *event; 1795 struct ring_buffer_event *event;
1592 unsigned long event_length;
1593 void *body; 1796 void *body;
1594 int ret = -EBUSY; 1797 int ret = -EBUSY;
1595 int cpu, resched; 1798 int cpu, resched;
@@ -1612,9 +1815,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
1612 if (atomic_read(&cpu_buffer->record_disabled)) 1815 if (atomic_read(&cpu_buffer->record_disabled))
1613 goto out; 1816 goto out;
1614 1817
1615 event_length = rb_calculate_event_length(length); 1818 if (length > BUF_MAX_DATA_SIZE)
1616 event = rb_reserve_next_event(cpu_buffer, 1819 goto out;
1617 RINGBUF_TYPE_DATA, event_length); 1820
1821 event = rb_reserve_next_event(cpu_buffer, length);
1618 if (!event) 1822 if (!event)
1619 goto out; 1823 goto out;
1620 1824
@@ -1728,7 +1932,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1728 return 0; 1932 return 0;
1729 1933
1730 cpu_buffer = buffer->buffers[cpu]; 1934 cpu_buffer = buffer->buffers[cpu];
1731 ret = cpu_buffer->entries; 1935 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
1936 - cpu_buffer->read;
1732 1937
1733 return ret; 1938 return ret;
1734} 1939}
@@ -1755,6 +1960,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1755EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 1960EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1756 1961
1757/** 1962/**
1963 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
1964 * @buffer: The ring buffer
1965 * @cpu: The per CPU buffer to get the number of overruns from
1966 */
1967unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
1968{
1969 struct ring_buffer_per_cpu *cpu_buffer;
1970 unsigned long ret;
1971
1972 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1973 return 0;
1974
1975 cpu_buffer = buffer->buffers[cpu];
1976 ret = cpu_buffer->nmi_dropped;
1977
1978 return ret;
1979}
1980EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
1981
1982/**
1983 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
1984 * @buffer: The ring buffer
1985 * @cpu: The per CPU buffer to get the number of overruns from
1986 */
1987unsigned long
1988ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
1989{
1990 struct ring_buffer_per_cpu *cpu_buffer;
1991 unsigned long ret;
1992
1993 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1994 return 0;
1995
1996 cpu_buffer = buffer->buffers[cpu];
1997 ret = cpu_buffer->commit_overrun;
1998
1999 return ret;
2000}
2001EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
2002
2003/**
1758 * ring_buffer_entries - get the number of entries in a buffer 2004 * ring_buffer_entries - get the number of entries in a buffer
1759 * @buffer: The ring buffer 2005 * @buffer: The ring buffer
1760 * 2006 *
@@ -1770,7 +2016,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
1770 /* if you care about this being correct, lock the buffer */ 2016 /* if you care about this being correct, lock the buffer */
1771 for_each_buffer_cpu(buffer, cpu) { 2017 for_each_buffer_cpu(buffer, cpu) {
1772 cpu_buffer = buffer->buffers[cpu]; 2018 cpu_buffer = buffer->buffers[cpu];
1773 entries += cpu_buffer->entries; 2019 entries += (local_read(&cpu_buffer->entries) -
2020 cpu_buffer->overrun) - cpu_buffer->read;
1774 } 2021 }
1775 2022
1776 return entries; 2023 return entries;
@@ -1862,7 +2109,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1862{ 2109{
1863 u64 delta; 2110 u64 delta;
1864 2111
1865 switch (event->type) { 2112 switch (event->type_len) {
1866 case RINGBUF_TYPE_PADDING: 2113 case RINGBUF_TYPE_PADDING:
1867 return; 2114 return;
1868 2115
@@ -1893,7 +2140,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
1893{ 2140{
1894 u64 delta; 2141 u64 delta;
1895 2142
1896 switch (event->type) { 2143 switch (event->type_len) {
1897 case RINGBUF_TYPE_PADDING: 2144 case RINGBUF_TYPE_PADDING:
1898 return; 2145 return;
1899 2146
@@ -1966,6 +2213,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1966 cpu_buffer->reader_page->list.prev = reader->list.prev; 2213 cpu_buffer->reader_page->list.prev = reader->list.prev;
1967 2214
1968 local_set(&cpu_buffer->reader_page->write, 0); 2215 local_set(&cpu_buffer->reader_page->write, 0);
2216 local_set(&cpu_buffer->reader_page->entries, 0);
1969 local_set(&cpu_buffer->reader_page->page->commit, 0); 2217 local_set(&cpu_buffer->reader_page->page->commit, 0);
1970 2218
1971 /* Make the reader page now replace the head */ 2219 /* Make the reader page now replace the head */
@@ -2008,8 +2256,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2008 2256
2009 event = rb_reader_event(cpu_buffer); 2257 event = rb_reader_event(cpu_buffer);
2010 2258
2011 if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) 2259 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
2012 cpu_buffer->entries--; 2260 || rb_discarded_event(event))
2261 cpu_buffer->read++;
2013 2262
2014 rb_update_read_stamp(cpu_buffer, event); 2263 rb_update_read_stamp(cpu_buffer, event);
2015 2264
@@ -2031,8 +2280,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2031 * Check if we are at the end of the buffer. 2280 * Check if we are at the end of the buffer.
2032 */ 2281 */
2033 if (iter->head >= rb_page_size(iter->head_page)) { 2282 if (iter->head >= rb_page_size(iter->head_page)) {
2034 if (RB_WARN_ON(buffer, 2283 /* discarded commits can make the page empty */
2035 iter->head_page == cpu_buffer->commit_page)) 2284 if (iter->head_page == cpu_buffer->commit_page)
2036 return; 2285 return;
2037 rb_inc_iter(iter); 2286 rb_inc_iter(iter);
2038 return; 2287 return;
@@ -2075,12 +2324,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2075 /* 2324 /*
2076 * We repeat when a timestamp is encountered. It is possible 2325 * We repeat when a timestamp is encountered. It is possible
2077 * to get multiple timestamps from an interrupt entering just 2326 * to get multiple timestamps from an interrupt entering just
2078 * as one timestamp is about to be written. The max times 2327 * as one timestamp is about to be written, or from discarded
2079 * that this can happen is the number of nested interrupts we 2328 * commits. The most that we can have is the number on a single page.
2080 * can have. Nesting 10 deep of interrupts is clearly
2081 * an anomaly.
2082 */ 2329 */
2083 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) 2330 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
2084 return NULL; 2331 return NULL;
2085 2332
2086 reader = rb_get_reader_page(cpu_buffer); 2333 reader = rb_get_reader_page(cpu_buffer);
@@ -2089,7 +2336,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2089 2336
2090 event = rb_reader_event(cpu_buffer); 2337 event = rb_reader_event(cpu_buffer);
2091 2338
2092 switch (event->type) { 2339 switch (event->type_len) {
2093 case RINGBUF_TYPE_PADDING: 2340 case RINGBUF_TYPE_PADDING:
2094 if (rb_null_event(event)) 2341 if (rb_null_event(event))
2095 RB_WARN_ON(cpu_buffer, 1); 2342 RB_WARN_ON(cpu_buffer, 1);
@@ -2146,14 +2393,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2146 2393
2147 again: 2394 again:
2148 /* 2395 /*
2149 * We repeat when a timestamp is encountered. It is possible 2396 * We repeat when a timestamp is encountered.
2150 * to get multiple timestamps from an interrupt entering just 2397 * We can get multiple timestamps by nested interrupts or also
2151 * as one timestamp is about to be written. The max times 2398 * if filtering is on (discarding commits). Since discarding
2152 * that this can happen is the number of nested interrupts we 2399 * commits can be frequent we can get a lot of timestamps.
2153 * can have. Nesting 10 deep of interrupts is clearly 2400 * But we limit them by not adding timestamps if they begin
2154 * an anomaly. 2401 * at the start of a page.
2155 */ 2402 */
2156 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) 2403 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
2157 return NULL; 2404 return NULL;
2158 2405
2159 if (rb_per_cpu_empty(cpu_buffer)) 2406 if (rb_per_cpu_empty(cpu_buffer))
@@ -2161,7 +2408,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2161 2408
2162 event = rb_iter_head_event(iter); 2409 event = rb_iter_head_event(iter);
2163 2410
2164 switch (event->type) { 2411 switch (event->type_len) {
2165 case RINGBUF_TYPE_PADDING: 2412 case RINGBUF_TYPE_PADDING:
2166 if (rb_null_event(event)) { 2413 if (rb_null_event(event)) {
2167 rb_inc_iter(iter); 2414 rb_inc_iter(iter);
@@ -2220,7 +2467,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2220 event = rb_buffer_peek(buffer, cpu, ts); 2467 event = rb_buffer_peek(buffer, cpu, ts);
2221 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2468 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2222 2469
2223 if (event && event->type == RINGBUF_TYPE_PADDING) { 2470 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2224 cpu_relax(); 2471 cpu_relax();
2225 goto again; 2472 goto again;
2226 } 2473 }
@@ -2248,7 +2495,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2248 event = rb_iter_peek(iter, ts); 2495 event = rb_iter_peek(iter, ts);
2249 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2496 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2250 2497
2251 if (event && event->type == RINGBUF_TYPE_PADDING) { 2498 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2252 cpu_relax(); 2499 cpu_relax();
2253 goto again; 2500 goto again;
2254 } 2501 }
@@ -2293,7 +2540,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2293 out: 2540 out:
2294 preempt_enable(); 2541 preempt_enable();
2295 2542
2296 if (event && event->type == RINGBUF_TYPE_PADDING) { 2543 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2297 cpu_relax(); 2544 cpu_relax();
2298 goto again; 2545 goto again;
2299 } 2546 }
@@ -2386,7 +2633,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2386 out: 2633 out:
2387 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2634 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2388 2635
2389 if (event && event->type == RINGBUF_TYPE_PADDING) { 2636 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2390 cpu_relax(); 2637 cpu_relax();
2391 goto again; 2638 goto again;
2392 } 2639 }
@@ -2411,6 +2658,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2411 cpu_buffer->head_page 2658 cpu_buffer->head_page
2412 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 2659 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
2413 local_set(&cpu_buffer->head_page->write, 0); 2660 local_set(&cpu_buffer->head_page->write, 0);
2661 local_set(&cpu_buffer->head_page->entries, 0);
2414 local_set(&cpu_buffer->head_page->page->commit, 0); 2662 local_set(&cpu_buffer->head_page->page->commit, 0);
2415 2663
2416 cpu_buffer->head_page->read = 0; 2664 cpu_buffer->head_page->read = 0;
@@ -2420,11 +2668,15 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2420 2668
2421 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 2669 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
2422 local_set(&cpu_buffer->reader_page->write, 0); 2670 local_set(&cpu_buffer->reader_page->write, 0);
2671 local_set(&cpu_buffer->reader_page->entries, 0);
2423 local_set(&cpu_buffer->reader_page->page->commit, 0); 2672 local_set(&cpu_buffer->reader_page->page->commit, 0);
2424 cpu_buffer->reader_page->read = 0; 2673 cpu_buffer->reader_page->read = 0;
2425 2674
2675 cpu_buffer->nmi_dropped = 0;
2676 cpu_buffer->commit_overrun = 0;
2426 cpu_buffer->overrun = 0; 2677 cpu_buffer->overrun = 0;
2427 cpu_buffer->entries = 0; 2678 cpu_buffer->read = 0;
2679 local_set(&cpu_buffer->entries, 0);
2428 2680
2429 cpu_buffer->write_stamp = 0; 2681 cpu_buffer->write_stamp = 0;
2430 cpu_buffer->read_stamp = 0; 2682 cpu_buffer->read_stamp = 0;
@@ -2443,6 +2695,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2443 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2695 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2444 return; 2696 return;
2445 2697
2698 atomic_inc(&cpu_buffer->record_disabled);
2699
2446 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2700 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2447 2701
2448 __raw_spin_lock(&cpu_buffer->lock); 2702 __raw_spin_lock(&cpu_buffer->lock);
@@ -2452,6 +2706,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2452 __raw_spin_unlock(&cpu_buffer->lock); 2706 __raw_spin_unlock(&cpu_buffer->lock);
2453 2707
2454 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2708 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2709
2710 atomic_dec(&cpu_buffer->record_disabled);
2455} 2711}
2456EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 2712EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
2457 2713
@@ -2578,28 +2834,6 @@ out:
2578} 2834}
2579EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 2835EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2580 2836
2581static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2582 struct buffer_data_page *bpage,
2583 unsigned int offset)
2584{
2585 struct ring_buffer_event *event;
2586 unsigned long head;
2587
2588 __raw_spin_lock(&cpu_buffer->lock);
2589 for (head = offset; head < local_read(&bpage->commit);
2590 head += rb_event_length(event)) {
2591
2592 event = __rb_data_page_index(bpage, head);
2593 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
2594 return;
2595 /* Only count data entries */
2596 if (event->type != RINGBUF_TYPE_DATA)
2597 continue;
2598 cpu_buffer->entries--;
2599 }
2600 __raw_spin_unlock(&cpu_buffer->lock);
2601}
2602
2603/** 2837/**
2604 * ring_buffer_alloc_read_page - allocate a page to read from buffer 2838 * ring_buffer_alloc_read_page - allocate a page to read from buffer
2605 * @buffer: the buffer to allocate for. 2839 * @buffer: the buffer to allocate for.
@@ -2630,6 +2864,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2630 2864
2631 return bpage; 2865 return bpage;
2632} 2866}
2867EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
2633 2868
2634/** 2869/**
2635 * ring_buffer_free_read_page - free an allocated read page 2870 * ring_buffer_free_read_page - free an allocated read page
@@ -2642,6 +2877,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2642{ 2877{
2643 free_page((unsigned long)data); 2878 free_page((unsigned long)data);
2644} 2879}
2880EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
2645 2881
2646/** 2882/**
2647 * ring_buffer_read_page - extract a page from the ring buffer 2883 * ring_buffer_read_page - extract a page from the ring buffer
@@ -2768,16 +3004,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
2768 /* we copied everything to the beginning */ 3004 /* we copied everything to the beginning */
2769 read = 0; 3005 read = 0;
2770 } else { 3006 } else {
3007 /* update the entry counter */
3008 cpu_buffer->read += local_read(&reader->entries);
3009
2771 /* swap the pages */ 3010 /* swap the pages */
2772 rb_init_page(bpage); 3011 rb_init_page(bpage);
2773 bpage = reader->page; 3012 bpage = reader->page;
2774 reader->page = *data_page; 3013 reader->page = *data_page;
2775 local_set(&reader->write, 0); 3014 local_set(&reader->write, 0);
3015 local_set(&reader->entries, 0);
2776 reader->read = 0; 3016 reader->read = 0;
2777 *data_page = bpage; 3017 *data_page = bpage;
2778
2779 /* update the entry counter */
2780 rb_remove_entries(cpu_buffer, bpage, read);
2781 } 3018 }
2782 ret = read; 3019 ret = read;
2783 3020
@@ -2787,6 +3024,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
2787 out: 3024 out:
2788 return ret; 3025 return ret;
2789} 3026}
3027EXPORT_SYMBOL_GPL(ring_buffer_read_page);
2790 3028
2791static ssize_t 3029static ssize_t
2792rb_simple_read(struct file *filp, char __user *ubuf, 3030rb_simple_read(struct file *filp, char __user *ubuf,
@@ -2845,14 +3083,11 @@ static const struct file_operations rb_simple_fops = {
2845static __init int rb_init_debugfs(void) 3083static __init int rb_init_debugfs(void)
2846{ 3084{
2847 struct dentry *d_tracer; 3085 struct dentry *d_tracer;
2848 struct dentry *entry;
2849 3086
2850 d_tracer = tracing_init_dentry(); 3087 d_tracer = tracing_init_dentry();
2851 3088
2852 entry = debugfs_create_file("tracing_on", 0644, d_tracer, 3089 trace_create_file("tracing_on", 0644, d_tracer,
2853 &ring_buffer_flags, &rb_simple_fops); 3090 &ring_buffer_flags, &rb_simple_fops);
2854 if (!entry)
2855 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2856 3091
2857 return 0; 3092 return 0;
2858} 3093}
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644
index 000000000000..8d68e149a8b3
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,416 @@
1/*
2 * ring buffer tester and benchmark
3 *
4 * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/ring_buffer.h>
7#include <linux/completion.h>
8#include <linux/kthread.h>
9#include <linux/module.h>
10#include <linux/time.h>
11
12struct rb_page {
13 u64 ts;
14 local_t commit;
15 char data[4080];
16};
17
18/* run time and sleep time in seconds */
19#define RUN_TIME 10
20#define SLEEP_TIME 10
21
22/* number of events for writer to wake up the reader */
23static int wakeup_interval = 100;
24
25static int reader_finish;
26static struct completion read_start;
27static struct completion read_done;
28
29static struct ring_buffer *buffer;
30static struct task_struct *producer;
31static struct task_struct *consumer;
32static unsigned long read;
33
34static int disable_reader;
35module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer");
37
38static int read_events;
39
40static int kill_test;
41
42#define KILL_TEST() \
43 do { \
44 if (!kill_test) { \
45 kill_test = 1; \
46 WARN_ON(1); \
47 } \
48 } while (0)
49
50enum event_status {
51 EVENT_FOUND,
52 EVENT_DROPPED,
53};
54
55static enum event_status read_event(int cpu)
56{
57 struct ring_buffer_event *event;
58 int *entry;
59 u64 ts;
60
61 event = ring_buffer_consume(buffer, cpu, &ts);
62 if (!event)
63 return EVENT_DROPPED;
64
65 entry = ring_buffer_event_data(event);
66 if (*entry != cpu) {
67 KILL_TEST();
68 return EVENT_DROPPED;
69 }
70
71 read++;
72 return EVENT_FOUND;
73}
74
75static enum event_status read_page(int cpu)
76{
77 struct ring_buffer_event *event;
78 struct rb_page *rpage;
79 unsigned long commit;
80 void *bpage;
81 int *entry;
82 int ret;
83 int inc;
84 int i;
85
86 bpage = ring_buffer_alloc_read_page(buffer);
87 if (!bpage)
88 return EVENT_DROPPED;
89
90 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
91 if (ret >= 0) {
92 rpage = bpage;
93 commit = local_read(&rpage->commit);
94 for (i = 0; i < commit && !kill_test; i += inc) {
95
96 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
97 KILL_TEST();
98 break;
99 }
100
101 inc = -1;
102 event = (void *)&rpage->data[i];
103 switch (event->type_len) {
104 case RINGBUF_TYPE_PADDING:
105 /* We don't expect any padding */
106 KILL_TEST();
107 break;
108 case RINGBUF_TYPE_TIME_EXTEND:
109 inc = 8;
110 break;
111 case 0:
112 entry = ring_buffer_event_data(event);
113 if (*entry != cpu) {
114 KILL_TEST();
115 break;
116 }
117 read++;
118 if (!event->array[0]) {
119 KILL_TEST();
120 break;
121 }
122 inc = event->array[0];
123 break;
124 default:
125 entry = ring_buffer_event_data(event);
126 if (*entry != cpu) {
127 KILL_TEST();
128 break;
129 }
130 read++;
131 inc = ((event->type_len + 1) * 4);
132 }
133 if (kill_test)
134 break;
135
136 if (inc <= 0) {
137 KILL_TEST();
138 break;
139 }
140 }
141 }
142 ring_buffer_free_read_page(buffer, bpage);
143
144 if (ret < 0)
145 return EVENT_DROPPED;
146 return EVENT_FOUND;
147}
148
149static void ring_buffer_consumer(void)
150{
151 /* toggle between reading pages and events */
152 read_events ^= 1;
153
154 read = 0;
155 while (!reader_finish && !kill_test) {
156 int found;
157
158 do {
159 int cpu;
160
161 found = 0;
162 for_each_online_cpu(cpu) {
163 enum event_status stat;
164
165 if (read_events)
166 stat = read_event(cpu);
167 else
168 stat = read_page(cpu);
169
170 if (kill_test)
171 break;
172 if (stat == EVENT_FOUND)
173 found = 1;
174 }
175 } while (found && !kill_test);
176
177 set_current_state(TASK_INTERRUPTIBLE);
178 if (reader_finish)
179 break;
180
181 schedule();
182 __set_current_state(TASK_RUNNING);
183 }
184 reader_finish = 0;
185 complete(&read_done);
186}
187
188static void ring_buffer_producer(void)
189{
190 struct timeval start_tv;
191 struct timeval end_tv;
192 unsigned long long time;
193 unsigned long long entries;
194 unsigned long long overruns;
195 unsigned long missed = 0;
196 unsigned long hit = 0;
197 unsigned long avg;
198 int cnt = 0;
199
200 /*
201 * Hammer the buffer for 10 secs (this may
202 * make the system stall)
203 */
204 pr_info("Starting ring buffer hammer\n");
205 do_gettimeofday(&start_tv);
206 do {
207 struct ring_buffer_event *event;
208 int *entry;
209
210 event = ring_buffer_lock_reserve(buffer, 10);
211 if (!event) {
212 missed++;
213 } else {
214 hit++;
215 entry = ring_buffer_event_data(event);
216 *entry = smp_processor_id();
217 ring_buffer_unlock_commit(buffer, event);
218 }
219 do_gettimeofday(&end_tv);
220
221 cnt++;
222 if (consumer && !(cnt % wakeup_interval))
223 wake_up_process(consumer);
224
225#ifndef CONFIG_PREEMPT
226 /*
227 * If we are a non preempt kernel, the 10 second run will
228 * stop everything while it runs. Instead, we will call
229 * cond_resched and also add any time that was lost by a
230 * rescedule.
231 *
232 * Do a cond resched at the same frequency we would wake up
233 * the reader.
234 */
235 if (cnt % wakeup_interval)
236 cond_resched();
237#endif
238
239 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
240 pr_info("End ring buffer hammer\n");
241
242 if (consumer) {
243 /* Init both completions here to avoid races */
244 init_completion(&read_start);
245 init_completion(&read_done);
246 /* the completions must be visible before the finish var */
247 smp_wmb();
248 reader_finish = 1;
249 /* finish var visible before waking up the consumer */
250 smp_wmb();
251 wake_up_process(consumer);
252 wait_for_completion(&read_done);
253 }
254
255 time = end_tv.tv_sec - start_tv.tv_sec;
256 time *= USEC_PER_SEC;
257 time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
258
259 entries = ring_buffer_entries(buffer);
260 overruns = ring_buffer_overruns(buffer);
261
262 if (kill_test)
263 pr_info("ERROR!\n");
264 pr_info("Time: %lld (usecs)\n", time);
265 pr_info("Overruns: %lld\n", overruns);
266 if (disable_reader)
267 pr_info("Read: (reader disabled)\n");
268 else
269 pr_info("Read: %ld (by %s)\n", read,
270 read_events ? "events" : "pages");
271 pr_info("Entries: %lld\n", entries);
272 pr_info("Total: %lld\n", entries + overruns + read);
273 pr_info("Missed: %ld\n", missed);
274 pr_info("Hit: %ld\n", hit);
275
276 /* Convert time from usecs to millisecs */
277 do_div(time, USEC_PER_MSEC);
278 if (time)
279 hit /= (long)time;
280 else
281 pr_info("TIME IS ZERO??\n");
282
283 pr_info("Entries per millisec: %ld\n", hit);
284
285 if (hit) {
286 /* Calculate the average time in nanosecs */
287 avg = NSEC_PER_MSEC / hit;
288 pr_info("%ld ns per entry\n", avg);
289 }
290
291 if (missed) {
292 if (time)
293 missed /= (long)time;
294
295 pr_info("Total iterations per millisec: %ld\n", hit + missed);
296
297 /* it is possible that hit + missed will overflow and be zero */
298 if (!(hit + missed)) {
299 pr_info("hit + missed overflowed and totalled zero!\n");
300 hit--; /* make it non zero */
301 }
302
303 /* Caculate the average time in nanosecs */
304 avg = NSEC_PER_MSEC / (hit + missed);
305 pr_info("%ld ns per entry\n", avg);
306 }
307}
308
309static void wait_to_die(void)
310{
311 set_current_state(TASK_INTERRUPTIBLE);
312 while (!kthread_should_stop()) {
313 schedule();
314 set_current_state(TASK_INTERRUPTIBLE);
315 }
316 __set_current_state(TASK_RUNNING);
317}
318
319static int ring_buffer_consumer_thread(void *arg)
320{
321 while (!kthread_should_stop() && !kill_test) {
322 complete(&read_start);
323
324 ring_buffer_consumer();
325
326 set_current_state(TASK_INTERRUPTIBLE);
327 if (kthread_should_stop() || kill_test)
328 break;
329
330 schedule();
331 __set_current_state(TASK_RUNNING);
332 }
333 __set_current_state(TASK_RUNNING);
334
335 if (kill_test)
336 wait_to_die();
337
338 return 0;
339}
340
341static int ring_buffer_producer_thread(void *arg)
342{
343 init_completion(&read_start);
344
345 while (!kthread_should_stop() && !kill_test) {
346 ring_buffer_reset(buffer);
347
348 if (consumer) {
349 smp_wmb();
350 wake_up_process(consumer);
351 wait_for_completion(&read_start);
352 }
353
354 ring_buffer_producer();
355
356 pr_info("Sleeping for 10 secs\n");
357 set_current_state(TASK_INTERRUPTIBLE);
358 schedule_timeout(HZ * SLEEP_TIME);
359 __set_current_state(TASK_RUNNING);
360 }
361
362 if (kill_test)
363 wait_to_die();
364
365 return 0;
366}
367
368static int __init ring_buffer_benchmark_init(void)
369{
370 int ret;
371
372 /* make a one meg buffer in overwite mode */
373 buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
374 if (!buffer)
375 return -ENOMEM;
376
377 if (!disable_reader) {
378 consumer = kthread_create(ring_buffer_consumer_thread,
379 NULL, "rb_consumer");
380 ret = PTR_ERR(consumer);
381 if (IS_ERR(consumer))
382 goto out_fail;
383 }
384
385 producer = kthread_run(ring_buffer_producer_thread,
386 NULL, "rb_producer");
387 ret = PTR_ERR(producer);
388
389 if (IS_ERR(producer))
390 goto out_kill;
391
392 return 0;
393
394 out_kill:
395 if (consumer)
396 kthread_stop(consumer);
397
398 out_fail:
399 ring_buffer_free(buffer);
400 return ret;
401}
402
403static void __exit ring_buffer_benchmark_exit(void)
404{
405 kthread_stop(producer);
406 if (consumer)
407 kthread_stop(consumer);
408 ring_buffer_free(buffer);
409}
410
411module_init(ring_buffer_benchmark_init);
412module_exit(ring_buffer_benchmark_exit);
413
414MODULE_AUTHOR("Steven Rostedt");
415MODULE_DESCRIPTION("ring_buffer_benchmark");
416MODULE_LICENSE("GPL");
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cda81ec58d9f..8acd9b81a5d7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -171,6 +171,13 @@ static struct trace_array global_trace;
171 171
172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
173 173
174int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
175 struct ring_buffer_event *event)
176{
177 return filter_check_discard(call, rec, global_trace.buffer, event);
178}
179EXPORT_SYMBOL_GPL(filter_current_check_discard);
180
174cycle_t ftrace_now(int cpu) 181cycle_t ftrace_now(int cpu)
175{ 182{
176 u64 ts; 183 u64 ts;
@@ -255,7 +262,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
255 262
256/* trace_flags holds trace_options default values */ 263/* trace_flags holds trace_options default values */
257unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 264unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
258 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; 265 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
266 TRACE_ITER_GRAPH_TIME;
259 267
260/** 268/**
261 * trace_wake_up - wake up tasks waiting for trace input 269 * trace_wake_up - wake up tasks waiting for trace input
@@ -317,6 +325,7 @@ static const char *trace_options[] = {
317 "latency-format", 325 "latency-format",
318 "global-clock", 326 "global-clock",
319 "sleep-time", 327 "sleep-time",
328 "graph-time",
320 NULL 329 NULL
321}; 330};
322 331
@@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
402 return cnt; 411 return cnt;
403} 412}
404 413
405static void
406trace_print_seq(struct seq_file *m, struct trace_seq *s)
407{
408 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
409
410 s->buffer[len] = 0;
411 seq_puts(m, s->buffer);
412
413 trace_seq_init(s);
414}
415
416/** 414/**
417 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 415 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
418 * @tr: tracer 416 * @tr: tracer
@@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
641 tracing_reset(tr, cpu); 639 tracing_reset(tr, cpu);
642} 640}
643 641
642void tracing_reset_current(int cpu)
643{
644 tracing_reset(&global_trace, cpu);
645}
646
647void tracing_reset_current_online_cpus(void)
648{
649 tracing_reset_online_cpus(&global_trace);
650}
651
644#define SAVED_CMDLINES 128 652#define SAVED_CMDLINES 128
645#define NO_CMDLINE_MAP UINT_MAX 653#define NO_CMDLINE_MAP UINT_MAX
646static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; 654static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -800,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
800 return; 808 return;
801 } 809 }
802 810
811 preempt_disable();
803 __raw_spin_lock(&trace_cmdline_lock); 812 __raw_spin_lock(&trace_cmdline_lock);
804 map = map_pid_to_cmdline[pid]; 813 map = map_pid_to_cmdline[pid];
805 if (map != NO_CMDLINE_MAP) 814 if (map != NO_CMDLINE_MAP)
@@ -808,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
808 strcpy(comm, "<...>"); 817 strcpy(comm, "<...>");
809 818
810 __raw_spin_unlock(&trace_cmdline_lock); 819 __raw_spin_unlock(&trace_cmdline_lock);
820 preempt_enable();
811} 821}
812 822
813void tracing_record_cmdline(struct task_struct *tsk) 823void tracing_record_cmdline(struct task_struct *tsk)
@@ -840,7 +850,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
840} 850}
841 851
842struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 852struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
843 unsigned char type, 853 int type,
844 unsigned long len, 854 unsigned long len,
845 unsigned long flags, int pc) 855 unsigned long flags, int pc)
846{ 856{
@@ -883,30 +893,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
883} 893}
884 894
885struct ring_buffer_event * 895struct ring_buffer_event *
886trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, 896trace_current_buffer_lock_reserve(int type, unsigned long len,
887 unsigned long flags, int pc) 897 unsigned long flags, int pc)
888{ 898{
889 return trace_buffer_lock_reserve(&global_trace, 899 return trace_buffer_lock_reserve(&global_trace,
890 type, len, flags, pc); 900 type, len, flags, pc);
891} 901}
902EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
892 903
893void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 904void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
894 unsigned long flags, int pc) 905 unsigned long flags, int pc)
895{ 906{
896 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 907 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
897} 908}
909EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
898 910
899void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 911void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
900 unsigned long flags, int pc) 912 unsigned long flags, int pc)
901{ 913{
902 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 914 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
915}
916EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
917
918void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
919{
920 ring_buffer_discard_commit(global_trace.buffer, event);
903} 921}
922EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
904 923
905void 924void
906trace_function(struct trace_array *tr, 925trace_function(struct trace_array *tr,
907 unsigned long ip, unsigned long parent_ip, unsigned long flags, 926 unsigned long ip, unsigned long parent_ip, unsigned long flags,
908 int pc) 927 int pc)
909{ 928{
929 struct ftrace_event_call *call = &event_function;
910 struct ring_buffer_event *event; 930 struct ring_buffer_event *event;
911 struct ftrace_entry *entry; 931 struct ftrace_entry *entry;
912 932
@@ -921,7 +941,9 @@ trace_function(struct trace_array *tr,
921 entry = ring_buffer_event_data(event); 941 entry = ring_buffer_event_data(event);
922 entry->ip = ip; 942 entry->ip = ip;
923 entry->parent_ip = parent_ip; 943 entry->parent_ip = parent_ip;
924 ring_buffer_unlock_commit(tr->buffer, event); 944
945 if (!filter_check_discard(call, entry, tr->buffer, event))
946 ring_buffer_unlock_commit(tr->buffer, event);
925} 947}
926 948
927#ifdef CONFIG_FUNCTION_GRAPH_TRACER 949#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -930,6 +952,7 @@ static int __trace_graph_entry(struct trace_array *tr,
930 unsigned long flags, 952 unsigned long flags,
931 int pc) 953 int pc)
932{ 954{
955 struct ftrace_event_call *call = &event_funcgraph_entry;
933 struct ring_buffer_event *event; 956 struct ring_buffer_event *event;
934 struct ftrace_graph_ent_entry *entry; 957 struct ftrace_graph_ent_entry *entry;
935 958
@@ -942,7 +965,8 @@ static int __trace_graph_entry(struct trace_array *tr,
942 return 0; 965 return 0;
943 entry = ring_buffer_event_data(event); 966 entry = ring_buffer_event_data(event);
944 entry->graph_ent = *trace; 967 entry->graph_ent = *trace;
945 ring_buffer_unlock_commit(global_trace.buffer, event); 968 if (!filter_current_check_discard(call, entry, event))
969 ring_buffer_unlock_commit(global_trace.buffer, event);
946 970
947 return 1; 971 return 1;
948} 972}
@@ -952,6 +976,7 @@ static void __trace_graph_return(struct trace_array *tr,
952 unsigned long flags, 976 unsigned long flags,
953 int pc) 977 int pc)
954{ 978{
979 struct ftrace_event_call *call = &event_funcgraph_exit;
955 struct ring_buffer_event *event; 980 struct ring_buffer_event *event;
956 struct ftrace_graph_ret_entry *entry; 981 struct ftrace_graph_ret_entry *entry;
957 982
@@ -964,7 +989,8 @@ static void __trace_graph_return(struct trace_array *tr,
964 return; 989 return;
965 entry = ring_buffer_event_data(event); 990 entry = ring_buffer_event_data(event);
966 entry->ret = *trace; 991 entry->ret = *trace;
967 ring_buffer_unlock_commit(global_trace.buffer, event); 992 if (!filter_current_check_discard(call, entry, event))
993 ring_buffer_unlock_commit(global_trace.buffer, event);
968} 994}
969#endif 995#endif
970 996
@@ -982,6 +1008,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
982 int skip, int pc) 1008 int skip, int pc)
983{ 1009{
984#ifdef CONFIG_STACKTRACE 1010#ifdef CONFIG_STACKTRACE
1011 struct ftrace_event_call *call = &event_kernel_stack;
985 struct ring_buffer_event *event; 1012 struct ring_buffer_event *event;
986 struct stack_entry *entry; 1013 struct stack_entry *entry;
987 struct stack_trace trace; 1014 struct stack_trace trace;
@@ -999,7 +1026,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
999 trace.entries = entry->caller; 1026 trace.entries = entry->caller;
1000 1027
1001 save_stack_trace(&trace); 1028 save_stack_trace(&trace);
1002 ring_buffer_unlock_commit(tr->buffer, event); 1029 if (!filter_check_discard(call, entry, tr->buffer, event))
1030 ring_buffer_unlock_commit(tr->buffer, event);
1003#endif 1031#endif
1004} 1032}
1005 1033
@@ -1024,6 +1052,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1024 unsigned long flags, int pc) 1052 unsigned long flags, int pc)
1025{ 1053{
1026#ifdef CONFIG_STACKTRACE 1054#ifdef CONFIG_STACKTRACE
1055 struct ftrace_event_call *call = &event_user_stack;
1027 struct ring_buffer_event *event; 1056 struct ring_buffer_event *event;
1028 struct userstack_entry *entry; 1057 struct userstack_entry *entry;
1029 struct stack_trace trace; 1058 struct stack_trace trace;
@@ -1045,7 +1074,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1045 trace.entries = entry->caller; 1074 trace.entries = entry->caller;
1046 1075
1047 save_stack_trace_user(&trace); 1076 save_stack_trace_user(&trace);
1048 ring_buffer_unlock_commit(tr->buffer, event); 1077 if (!filter_check_discard(call, entry, tr->buffer, event))
1078 ring_buffer_unlock_commit(tr->buffer, event);
1049#endif 1079#endif
1050} 1080}
1051 1081
@@ -1089,6 +1119,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
1089 struct task_struct *next, 1119 struct task_struct *next,
1090 unsigned long flags, int pc) 1120 unsigned long flags, int pc)
1091{ 1121{
1122 struct ftrace_event_call *call = &event_context_switch;
1092 struct ring_buffer_event *event; 1123 struct ring_buffer_event *event;
1093 struct ctx_switch_entry *entry; 1124 struct ctx_switch_entry *entry;
1094 1125
@@ -1104,7 +1135,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
1104 entry->next_prio = next->prio; 1135 entry->next_prio = next->prio;
1105 entry->next_state = next->state; 1136 entry->next_state = next->state;
1106 entry->next_cpu = task_cpu(next); 1137 entry->next_cpu = task_cpu(next);
1107 trace_buffer_unlock_commit(tr, event, flags, pc); 1138
1139 if (!filter_check_discard(call, entry, tr->buffer, event))
1140 trace_buffer_unlock_commit(tr, event, flags, pc);
1108} 1141}
1109 1142
1110void 1143void
@@ -1113,6 +1146,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1113 struct task_struct *curr, 1146 struct task_struct *curr,
1114 unsigned long flags, int pc) 1147 unsigned long flags, int pc)
1115{ 1148{
1149 struct ftrace_event_call *call = &event_wakeup;
1116 struct ring_buffer_event *event; 1150 struct ring_buffer_event *event;
1117 struct ctx_switch_entry *entry; 1151 struct ctx_switch_entry *entry;
1118 1152
@@ -1129,7 +1163,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1129 entry->next_state = wakee->state; 1163 entry->next_state = wakee->state;
1130 entry->next_cpu = task_cpu(wakee); 1164 entry->next_cpu = task_cpu(wakee);
1131 1165
1132 ring_buffer_unlock_commit(tr->buffer, event); 1166 if (!filter_check_discard(call, entry, tr->buffer, event))
1167 ring_buffer_unlock_commit(tr->buffer, event);
1133 ftrace_trace_stack(tr, flags, 6, pc); 1168 ftrace_trace_stack(tr, flags, 6, pc);
1134 ftrace_trace_userstack(tr, flags, pc); 1169 ftrace_trace_userstack(tr, flags, pc);
1135} 1170}
@@ -1230,11 +1265,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1230 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1265 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
1231 static u32 trace_buf[TRACE_BUF_SIZE]; 1266 static u32 trace_buf[TRACE_BUF_SIZE];
1232 1267
1268 struct ftrace_event_call *call = &event_bprint;
1233 struct ring_buffer_event *event; 1269 struct ring_buffer_event *event;
1234 struct trace_array *tr = &global_trace; 1270 struct trace_array *tr = &global_trace;
1235 struct trace_array_cpu *data; 1271 struct trace_array_cpu *data;
1236 struct bprint_entry *entry; 1272 struct bprint_entry *entry;
1237 unsigned long flags; 1273 unsigned long flags;
1274 int disable;
1238 int resched; 1275 int resched;
1239 int cpu, len = 0, size, pc; 1276 int cpu, len = 0, size, pc;
1240 1277
@@ -1249,7 +1286,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1249 cpu = raw_smp_processor_id(); 1286 cpu = raw_smp_processor_id();
1250 data = tr->data[cpu]; 1287 data = tr->data[cpu];
1251 1288
1252 if (unlikely(atomic_read(&data->disabled))) 1289 disable = atomic_inc_return(&data->disabled);
1290 if (unlikely(disable != 1))
1253 goto out; 1291 goto out;
1254 1292
1255 /* Lockdep uses trace_printk for lock tracing */ 1293 /* Lockdep uses trace_printk for lock tracing */
@@ -1269,13 +1307,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1269 entry->fmt = fmt; 1307 entry->fmt = fmt;
1270 1308
1271 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1309 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1272 ring_buffer_unlock_commit(tr->buffer, event); 1310 if (!filter_check_discard(call, entry, tr->buffer, event))
1311 ring_buffer_unlock_commit(tr->buffer, event);
1273 1312
1274out_unlock: 1313out_unlock:
1275 __raw_spin_unlock(&trace_buf_lock); 1314 __raw_spin_unlock(&trace_buf_lock);
1276 local_irq_restore(flags); 1315 local_irq_restore(flags);
1277 1316
1278out: 1317out:
1318 atomic_dec_return(&data->disabled);
1279 ftrace_preempt_enable(resched); 1319 ftrace_preempt_enable(resched);
1280 unpause_graph_tracing(); 1320 unpause_graph_tracing();
1281 1321
@@ -1288,12 +1328,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1288 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1328 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1289 static char trace_buf[TRACE_BUF_SIZE]; 1329 static char trace_buf[TRACE_BUF_SIZE];
1290 1330
1331 struct ftrace_event_call *call = &event_print;
1291 struct ring_buffer_event *event; 1332 struct ring_buffer_event *event;
1292 struct trace_array *tr = &global_trace; 1333 struct trace_array *tr = &global_trace;
1293 struct trace_array_cpu *data; 1334 struct trace_array_cpu *data;
1294 int cpu, len = 0, size, pc; 1335 int cpu, len = 0, size, pc;
1295 struct print_entry *entry; 1336 struct print_entry *entry;
1296 unsigned long irq_flags; 1337 unsigned long irq_flags;
1338 int disable;
1297 1339
1298 if (tracing_disabled || tracing_selftest_running) 1340 if (tracing_disabled || tracing_selftest_running)
1299 return 0; 1341 return 0;
@@ -1303,7 +1345,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1303 cpu = raw_smp_processor_id(); 1345 cpu = raw_smp_processor_id();
1304 data = tr->data[cpu]; 1346 data = tr->data[cpu];
1305 1347
1306 if (unlikely(atomic_read(&data->disabled))) 1348 disable = atomic_inc_return(&data->disabled);
1349 if (unlikely(disable != 1))
1307 goto out; 1350 goto out;
1308 1351
1309 pause_graph_tracing(); 1352 pause_graph_tracing();
@@ -1323,13 +1366,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1323 1366
1324 memcpy(&entry->buf, trace_buf, len); 1367 memcpy(&entry->buf, trace_buf, len);
1325 entry->buf[len] = 0; 1368 entry->buf[len] = 0;
1326 ring_buffer_unlock_commit(tr->buffer, event); 1369 if (!filter_check_discard(call, entry, tr->buffer, event))
1370 ring_buffer_unlock_commit(tr->buffer, event);
1327 1371
1328 out_unlock: 1372 out_unlock:
1329 __raw_spin_unlock(&trace_buf_lock); 1373 __raw_spin_unlock(&trace_buf_lock);
1330 raw_local_irq_restore(irq_flags); 1374 raw_local_irq_restore(irq_flags);
1331 unpause_graph_tracing(); 1375 unpause_graph_tracing();
1332 out: 1376 out:
1377 atomic_dec_return(&data->disabled);
1333 preempt_enable_notrace(); 1378 preempt_enable_notrace();
1334 1379
1335 return len; 1380 return len;
@@ -1526,12 +1571,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1526 p = s_next(m, p, &l); 1571 p = s_next(m, p, &l);
1527 } 1572 }
1528 1573
1574 trace_event_read_lock();
1529 return p; 1575 return p;
1530} 1576}
1531 1577
1532static void s_stop(struct seq_file *m, void *p) 1578static void s_stop(struct seq_file *m, void *p)
1533{ 1579{
1534 atomic_dec(&trace_record_cmdline_disabled); 1580 atomic_dec(&trace_record_cmdline_disabled);
1581 trace_event_read_unlock();
1535} 1582}
1536 1583
1537static void print_lat_help_header(struct seq_file *m) 1584static void print_lat_help_header(struct seq_file *m)
@@ -1774,6 +1821,7 @@ static int trace_empty(struct trace_iterator *iter)
1774 return 1; 1821 return 1;
1775} 1822}
1776 1823
1824/* Called with trace_event_read_lock() held. */
1777static enum print_line_t print_trace_line(struct trace_iterator *iter) 1825static enum print_line_t print_trace_line(struct trace_iterator *iter)
1778{ 1826{
1779 enum print_line_t ret; 1827 enum print_line_t ret;
@@ -2397,6 +2445,56 @@ static const struct file_operations tracing_readme_fops = {
2397}; 2445};
2398 2446
2399static ssize_t 2447static ssize_t
2448tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2449 size_t cnt, loff_t *ppos)
2450{
2451 char *buf_comm;
2452 char *file_buf;
2453 char *buf;
2454 int len = 0;
2455 int pid;
2456 int i;
2457
2458 file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
2459 if (!file_buf)
2460 return -ENOMEM;
2461
2462 buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
2463 if (!buf_comm) {
2464 kfree(file_buf);
2465 return -ENOMEM;
2466 }
2467
2468 buf = file_buf;
2469
2470 for (i = 0; i < SAVED_CMDLINES; i++) {
2471 int r;
2472
2473 pid = map_cmdline_to_pid[i];
2474 if (pid == -1 || pid == NO_CMDLINE_MAP)
2475 continue;
2476
2477 trace_find_cmdline(pid, buf_comm);
2478 r = sprintf(buf, "%d %s\n", pid, buf_comm);
2479 buf += r;
2480 len += r;
2481 }
2482
2483 len = simple_read_from_buffer(ubuf, cnt, ppos,
2484 file_buf, len);
2485
2486 kfree(file_buf);
2487 kfree(buf_comm);
2488
2489 return len;
2490}
2491
2492static const struct file_operations tracing_saved_cmdlines_fops = {
2493 .open = tracing_open_generic,
2494 .read = tracing_saved_cmdlines_read,
2495};
2496
2497static ssize_t
2400tracing_ctrl_read(struct file *filp, char __user *ubuf, 2498tracing_ctrl_read(struct file *filp, char __user *ubuf,
2401 size_t cnt, loff_t *ppos) 2499 size_t cnt, loff_t *ppos)
2402{ 2500{
@@ -2728,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2728 /* trace pipe does not show start of buffer */ 2826 /* trace pipe does not show start of buffer */
2729 cpumask_setall(iter->started); 2827 cpumask_setall(iter->started);
2730 2828
2829 if (trace_flags & TRACE_ITER_LATENCY_FMT)
2830 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2831
2731 iter->cpu_file = cpu_file; 2832 iter->cpu_file = cpu_file;
2732 iter->tr = &global_trace; 2833 iter->tr = &global_trace;
2733 mutex_init(&iter->mutex); 2834 mutex_init(&iter->mutex);
@@ -2915,6 +3016,7 @@ waitagain:
2915 offsetof(struct trace_iterator, seq)); 3016 offsetof(struct trace_iterator, seq));
2916 iter->pos = -1; 3017 iter->pos = -1;
2917 3018
3019 trace_event_read_lock();
2918 while (find_next_entry_inc(iter) != NULL) { 3020 while (find_next_entry_inc(iter) != NULL) {
2919 enum print_line_t ret; 3021 enum print_line_t ret;
2920 int len = iter->seq.len; 3022 int len = iter->seq.len;
@@ -2931,6 +3033,7 @@ waitagain:
2931 if (iter->seq.len >= cnt) 3033 if (iter->seq.len >= cnt)
2932 break; 3034 break;
2933 } 3035 }
3036 trace_event_read_unlock();
2934 3037
2935 /* Now copy what we have to the user */ 3038 /* Now copy what we have to the user */
2936 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 3039 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -3053,6 +3156,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3053 goto out_err; 3156 goto out_err;
3054 } 3157 }
3055 3158
3159 trace_event_read_lock();
3160
3056 /* Fill as many pages as possible. */ 3161 /* Fill as many pages as possible. */
3057 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3162 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
3058 pages[i] = alloc_page(GFP_KERNEL); 3163 pages[i] = alloc_page(GFP_KERNEL);
@@ -3075,6 +3180,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3075 trace_seq_init(&iter->seq); 3180 trace_seq_init(&iter->seq);
3076 } 3181 }
3077 3182
3183 trace_event_read_unlock();
3078 mutex_unlock(&iter->mutex); 3184 mutex_unlock(&iter->mutex);
3079 3185
3080 spd.nr_pages = i; 3186 spd.nr_pages = i;
@@ -3425,7 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3425 .spd_release = buffer_spd_release, 3531 .spd_release = buffer_spd_release,
3426 }; 3532 };
3427 struct buffer_ref *ref; 3533 struct buffer_ref *ref;
3428 int size, i; 3534 int entries, size, i;
3429 size_t ret; 3535 size_t ret;
3430 3536
3431 if (*ppos & (PAGE_SIZE - 1)) { 3537 if (*ppos & (PAGE_SIZE - 1)) {
@@ -3440,7 +3546,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3440 len &= PAGE_MASK; 3546 len &= PAGE_MASK;
3441 } 3547 }
3442 3548
3443 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { 3549 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3550
3551 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
3444 struct page *page; 3552 struct page *page;
3445 int r; 3553 int r;
3446 3554
@@ -3457,7 +3565,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3457 } 3565 }
3458 3566
3459 r = ring_buffer_read_page(ref->buffer, &ref->page, 3567 r = ring_buffer_read_page(ref->buffer, &ref->page,
3460 len, info->cpu, 0); 3568 len, info->cpu, 1);
3461 if (r < 0) { 3569 if (r < 0) {
3462 ring_buffer_free_read_page(ref->buffer, 3570 ring_buffer_free_read_page(ref->buffer,
3463 ref->page); 3571 ref->page);
@@ -3481,6 +3589,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3481 spd.partial[i].private = (unsigned long)ref; 3589 spd.partial[i].private = (unsigned long)ref;
3482 spd.nr_pages++; 3590 spd.nr_pages++;
3483 *ppos += PAGE_SIZE; 3591 *ppos += PAGE_SIZE;
3592
3593 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3484 } 3594 }
3485 3595
3486 spd.nr_pages = i; 3596 spd.nr_pages = i;
@@ -3508,6 +3618,45 @@ static const struct file_operations tracing_buffers_fops = {
3508 .llseek = no_llseek, 3618 .llseek = no_llseek,
3509}; 3619};
3510 3620
3621static ssize_t
3622tracing_stats_read(struct file *filp, char __user *ubuf,
3623 size_t count, loff_t *ppos)
3624{
3625 unsigned long cpu = (unsigned long)filp->private_data;
3626 struct trace_array *tr = &global_trace;
3627 struct trace_seq *s;
3628 unsigned long cnt;
3629
3630 s = kmalloc(sizeof(*s), GFP_ATOMIC);
3631 if (!s)
3632 return ENOMEM;
3633
3634 trace_seq_init(s);
3635
3636 cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
3637 trace_seq_printf(s, "entries: %ld\n", cnt);
3638
3639 cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
3640 trace_seq_printf(s, "overrun: %ld\n", cnt);
3641
3642 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3643 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3644
3645 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3646 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3647
3648 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3649
3650 kfree(s);
3651
3652 return count;
3653}
3654
3655static const struct file_operations tracing_stats_fops = {
3656 .open = tracing_open_generic,
3657 .read = tracing_stats_read,
3658};
3659
3511#ifdef CONFIG_DYNAMIC_FTRACE 3660#ifdef CONFIG_DYNAMIC_FTRACE
3512 3661
3513int __weak ftrace_arch_read_dyn_info(char *buf, int size) 3662int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3597,7 +3746,7 @@ struct dentry *tracing_dentry_percpu(void)
3597static void tracing_init_debugfs_percpu(long cpu) 3746static void tracing_init_debugfs_percpu(long cpu)
3598{ 3747{
3599 struct dentry *d_percpu = tracing_dentry_percpu(); 3748 struct dentry *d_percpu = tracing_dentry_percpu();
3600 struct dentry *entry, *d_cpu; 3749 struct dentry *d_cpu;
3601 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 3750 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
3602 char cpu_dir[7]; 3751 char cpu_dir[7];
3603 3752
@@ -3612,21 +3761,18 @@ static void tracing_init_debugfs_percpu(long cpu)
3612 } 3761 }
3613 3762
3614 /* per cpu trace_pipe */ 3763 /* per cpu trace_pipe */
3615 entry = debugfs_create_file("trace_pipe", 0444, d_cpu, 3764 trace_create_file("trace_pipe", 0444, d_cpu,
3616 (void *) cpu, &tracing_pipe_fops); 3765 (void *) cpu, &tracing_pipe_fops);
3617 if (!entry)
3618 pr_warning("Could not create debugfs 'trace_pipe' entry\n");
3619 3766
3620 /* per cpu trace */ 3767 /* per cpu trace */
3621 entry = debugfs_create_file("trace", 0644, d_cpu, 3768 trace_create_file("trace", 0644, d_cpu,
3622 (void *) cpu, &tracing_fops); 3769 (void *) cpu, &tracing_fops);
3623 if (!entry) 3770
3624 pr_warning("Could not create debugfs 'trace' entry\n"); 3771 trace_create_file("trace_pipe_raw", 0444, d_cpu,
3772 (void *) cpu, &tracing_buffers_fops);
3625 3773
3626 entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, 3774 trace_create_file("stats", 0444, d_cpu,
3627 (void *) cpu, &tracing_buffers_fops); 3775 (void *) cpu, &tracing_stats_fops);
3628 if (!entry)
3629 pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
3630} 3776}
3631 3777
3632#ifdef CONFIG_FTRACE_SELFTEST 3778#ifdef CONFIG_FTRACE_SELFTEST
@@ -3782,6 +3928,22 @@ static const struct file_operations trace_options_core_fops = {
3782 .write = trace_options_core_write, 3928 .write = trace_options_core_write,
3783}; 3929};
3784 3930
3931struct dentry *trace_create_file(const char *name,
3932 mode_t mode,
3933 struct dentry *parent,
3934 void *data,
3935 const struct file_operations *fops)
3936{
3937 struct dentry *ret;
3938
3939 ret = debugfs_create_file(name, mode, parent, data, fops);
3940 if (!ret)
3941 pr_warning("Could not create debugfs '%s' entry\n", name);
3942
3943 return ret;
3944}
3945
3946
3785static struct dentry *trace_options_init_dentry(void) 3947static struct dentry *trace_options_init_dentry(void)
3786{ 3948{
3787 struct dentry *d_tracer; 3949 struct dentry *d_tracer;
@@ -3809,7 +3971,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
3809 struct tracer_opt *opt) 3971 struct tracer_opt *opt)
3810{ 3972{
3811 struct dentry *t_options; 3973 struct dentry *t_options;
3812 struct dentry *entry;
3813 3974
3814 t_options = trace_options_init_dentry(); 3975 t_options = trace_options_init_dentry();
3815 if (!t_options) 3976 if (!t_options)
@@ -3818,11 +3979,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
3818 topt->flags = flags; 3979 topt->flags = flags;
3819 topt->opt = opt; 3980 topt->opt = opt;
3820 3981
3821 entry = debugfs_create_file(opt->name, 0644, t_options, topt, 3982 topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
3822 &trace_options_fops); 3983 &trace_options_fops);
3823 3984
3824 topt->entry = entry;
3825
3826} 3985}
3827 3986
3828static struct trace_option_dentry * 3987static struct trace_option_dentry *
@@ -3877,123 +4036,84 @@ static struct dentry *
3877create_trace_option_core_file(const char *option, long index) 4036create_trace_option_core_file(const char *option, long index)
3878{ 4037{
3879 struct dentry *t_options; 4038 struct dentry *t_options;
3880 struct dentry *entry;
3881 4039
3882 t_options = trace_options_init_dentry(); 4040 t_options = trace_options_init_dentry();
3883 if (!t_options) 4041 if (!t_options)
3884 return NULL; 4042 return NULL;
3885 4043
3886 entry = debugfs_create_file(option, 0644, t_options, (void *)index, 4044 return trace_create_file(option, 0644, t_options, (void *)index,
3887 &trace_options_core_fops); 4045 &trace_options_core_fops);
3888
3889 return entry;
3890} 4046}
3891 4047
3892static __init void create_trace_options_dir(void) 4048static __init void create_trace_options_dir(void)
3893{ 4049{
3894 struct dentry *t_options; 4050 struct dentry *t_options;
3895 struct dentry *entry;
3896 int i; 4051 int i;
3897 4052
3898 t_options = trace_options_init_dentry(); 4053 t_options = trace_options_init_dentry();
3899 if (!t_options) 4054 if (!t_options)
3900 return; 4055 return;
3901 4056
3902 for (i = 0; trace_options[i]; i++) { 4057 for (i = 0; trace_options[i]; i++)
3903 entry = create_trace_option_core_file(trace_options[i], i); 4058 create_trace_option_core_file(trace_options[i], i);
3904 if (!entry)
3905 pr_warning("Could not create debugfs %s entry\n",
3906 trace_options[i]);
3907 }
3908} 4059}
3909 4060
3910static __init int tracer_init_debugfs(void) 4061static __init int tracer_init_debugfs(void)
3911{ 4062{
3912 struct dentry *d_tracer; 4063 struct dentry *d_tracer;
3913 struct dentry *entry;
3914 int cpu; 4064 int cpu;
3915 4065
3916 d_tracer = tracing_init_dentry(); 4066 d_tracer = tracing_init_dentry();
3917 4067
3918 entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, 4068 trace_create_file("tracing_enabled", 0644, d_tracer,
3919 &global_trace, &tracing_ctrl_fops); 4069 &global_trace, &tracing_ctrl_fops);
3920 if (!entry)
3921 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
3922 4070
3923 entry = debugfs_create_file("trace_options", 0644, d_tracer, 4071 trace_create_file("trace_options", 0644, d_tracer,
3924 NULL, &tracing_iter_fops); 4072 NULL, &tracing_iter_fops);
3925 if (!entry)
3926 pr_warning("Could not create debugfs 'trace_options' entry\n");
3927 4073
3928 create_trace_options_dir(); 4074 trace_create_file("tracing_cpumask", 0644, d_tracer,
4075 NULL, &tracing_cpumask_fops);
4076
4077 trace_create_file("trace", 0644, d_tracer,
4078 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
3929 4079
3930 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 4080 trace_create_file("available_tracers", 0444, d_tracer,
3931 NULL, &tracing_cpumask_fops); 4081 &global_trace, &show_traces_fops);
3932 if (!entry) 4082
3933 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); 4083 trace_create_file("current_tracer", 0644, d_tracer,
3934 4084 &global_trace, &set_tracer_fops);
3935 entry = debugfs_create_file("trace", 0644, d_tracer, 4085
3936 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); 4086 trace_create_file("tracing_max_latency", 0644, d_tracer,
3937 if (!entry) 4087 &tracing_max_latency, &tracing_max_lat_fops);
3938 pr_warning("Could not create debugfs 'trace' entry\n"); 4088
3939 4089 trace_create_file("tracing_thresh", 0644, d_tracer,
3940 entry = debugfs_create_file("available_tracers", 0444, d_tracer, 4090 &tracing_thresh, &tracing_max_lat_fops);
3941 &global_trace, &show_traces_fops); 4091
3942 if (!entry) 4092 trace_create_file("README", 0444, d_tracer,
3943 pr_warning("Could not create debugfs 'available_tracers' entry\n"); 4093 NULL, &tracing_readme_fops);
3944 4094
3945 entry = debugfs_create_file("current_tracer", 0444, d_tracer, 4095 trace_create_file("trace_pipe", 0444, d_tracer,
3946 &global_trace, &set_tracer_fops);
3947 if (!entry)
3948 pr_warning("Could not create debugfs 'current_tracer' entry\n");
3949
3950 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
3951 &tracing_max_latency,
3952 &tracing_max_lat_fops);
3953 if (!entry)
3954 pr_warning("Could not create debugfs "
3955 "'tracing_max_latency' entry\n");
3956
3957 entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
3958 &tracing_thresh, &tracing_max_lat_fops);
3959 if (!entry)
3960 pr_warning("Could not create debugfs "
3961 "'tracing_thresh' entry\n");
3962 entry = debugfs_create_file("README", 0644, d_tracer,
3963 NULL, &tracing_readme_fops);
3964 if (!entry)
3965 pr_warning("Could not create debugfs 'README' entry\n");
3966
3967 entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
3968 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); 4096 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
3969 if (!entry) 4097
3970 pr_warning("Could not create debugfs " 4098 trace_create_file("buffer_size_kb", 0644, d_tracer,
3971 "'trace_pipe' entry\n"); 4099 &global_trace, &tracing_entries_fops);
3972 4100
3973 entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, 4101 trace_create_file("trace_marker", 0220, d_tracer,
3974 &global_trace, &tracing_entries_fops); 4102 NULL, &tracing_mark_fops);
3975 if (!entry) 4103
3976 pr_warning("Could not create debugfs " 4104 trace_create_file("saved_cmdlines", 0444, d_tracer,
3977 "'buffer_size_kb' entry\n"); 4105 NULL, &tracing_saved_cmdlines_fops);
3978
3979 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
3980 NULL, &tracing_mark_fops);
3981 if (!entry)
3982 pr_warning("Could not create debugfs "
3983 "'trace_marker' entry\n");
3984 4106
3985#ifdef CONFIG_DYNAMIC_FTRACE 4107#ifdef CONFIG_DYNAMIC_FTRACE
3986 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4108 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
3987 &ftrace_update_tot_cnt, 4109 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
3988 &tracing_dyn_info_fops);
3989 if (!entry)
3990 pr_warning("Could not create debugfs "
3991 "'dyn_ftrace_total_info' entry\n");
3992#endif 4110#endif
3993#ifdef CONFIG_SYSPROF_TRACER 4111#ifdef CONFIG_SYSPROF_TRACER
3994 init_tracer_sysprof_debugfs(d_tracer); 4112 init_tracer_sysprof_debugfs(d_tracer);
3995#endif 4113#endif
3996 4114
4115 create_trace_options_dir();
4116
3997 for_each_tracing_cpu(cpu) 4117 for_each_tracing_cpu(cpu)
3998 tracing_init_debugfs_percpu(cpu); 4118 tracing_init_debugfs_percpu(cpu);
3999 4119
@@ -4064,7 +4184,8 @@ trace_printk_seq(struct trace_seq *s)
4064 4184
4065static void __ftrace_dump(bool disable_tracing) 4185static void __ftrace_dump(bool disable_tracing)
4066{ 4186{
4067 static DEFINE_SPINLOCK(ftrace_dump_lock); 4187 static raw_spinlock_t ftrace_dump_lock =
4188 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
4068 /* use static because iter can be a bit big for the stack */ 4189 /* use static because iter can be a bit big for the stack */
4069 static struct trace_iterator iter; 4190 static struct trace_iterator iter;
4070 unsigned int old_userobj; 4191 unsigned int old_userobj;
@@ -4073,7 +4194,8 @@ static void __ftrace_dump(bool disable_tracing)
4073 int cnt = 0, cpu; 4194 int cnt = 0, cpu;
4074 4195
4075 /* only one dump */ 4196 /* only one dump */
4076 spin_lock_irqsave(&ftrace_dump_lock, flags); 4197 local_irq_save(flags);
4198 __raw_spin_lock(&ftrace_dump_lock);
4077 if (dump_ran) 4199 if (dump_ran)
4078 goto out; 4200 goto out;
4079 4201
@@ -4145,7 +4267,8 @@ static void __ftrace_dump(bool disable_tracing)
4145 } 4267 }
4146 4268
4147 out: 4269 out:
4148 spin_unlock_irqrestore(&ftrace_dump_lock, flags); 4270 __raw_spin_unlock(&ftrace_dump_lock);
4271 local_irq_restore(flags);
4149} 4272}
4150 4273
4151/* By default: disable tracing after the dump */ 4274/* By default: disable tracing after the dump */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e685ac2b2ba1..6e735d4771f8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,9 +9,12 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h> 11#include <trace/boot.h>
12#include <trace/kmemtrace.h> 12#include <linux/kmemtrace.h>
13#include <trace/power.h> 13#include <trace/power.h>
14 14
15#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h>
17
15enum trace_type { 18enum trace_type {
16 __TRACE_FIRST_TYPE = 0, 19 __TRACE_FIRST_TYPE = 0,
17 20
@@ -42,20 +45,6 @@ enum trace_type {
42}; 45};
43 46
44/* 47/*
45 * The trace entry - the most basic unit of tracing. This is what
46 * is printed in the end as a single line in the trace output, such as:
47 *
48 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
49 */
50struct trace_entry {
51 unsigned char type;
52 unsigned char flags;
53 unsigned char preempt_count;
54 int pid;
55 int tgid;
56};
57
58/*
59 * Function trace entry - function address and parent function addres: 48 * Function trace entry - function address and parent function addres:
60 */ 49 */
61struct ftrace_entry { 50struct ftrace_entry {
@@ -263,8 +252,6 @@ struct trace_array_cpu {
263 char comm[TASK_COMM_LEN]; 252 char comm[TASK_COMM_LEN];
264}; 253};
265 254
266struct trace_iterator;
267
268/* 255/*
269 * The trace array - an array of per-CPU trace arrays. This is the 256 * The trace array - an array of per-CPU trace arrays. This is the
270 * highest level data structure that individual tracers deal with. 257 * highest level data structure that individual tracers deal with.
@@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void);
339 __ftrace_bad_type(); \ 326 __ftrace_bad_type(); \
340 } while (0) 327 } while (0)
341 328
342/* Return values for print_line callback */
343enum print_line_t {
344 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
345 TRACE_TYPE_HANDLED = 1,
346 TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */
347 TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */
348};
349
350
351/* 329/*
352 * An option specific to a tracer. This is a boolean value. 330 * An option specific to a tracer. This is a boolean value.
353 * The bit is the bit index that sets its value on the 331 * The bit is the bit index that sets its value on the
@@ -423,60 +401,30 @@ struct tracer {
423 struct tracer_stat *stats; 401 struct tracer_stat *stats;
424}; 402};
425 403
426struct trace_seq {
427 unsigned char buffer[PAGE_SIZE];
428 unsigned int len;
429 unsigned int readpos;
430};
431
432static inline void
433trace_seq_init(struct trace_seq *s)
434{
435 s->len = 0;
436 s->readpos = 0;
437}
438
439 404
440#define TRACE_PIPE_ALL_CPU -1 405#define TRACE_PIPE_ALL_CPU -1
441 406
442/*
443 * Trace iterator - used by printout routines who present trace
444 * results to users and which routines might sleep, etc:
445 */
446struct trace_iterator {
447 struct trace_array *tr;
448 struct tracer *trace;
449 void *private;
450 int cpu_file;
451 struct mutex mutex;
452 struct ring_buffer_iter *buffer_iter[NR_CPUS];
453
454 /* The below is zeroed out in pipe_read */
455 struct trace_seq seq;
456 struct trace_entry *ent;
457 int cpu;
458 u64 ts;
459
460 unsigned long iter_flags;
461 loff_t pos;
462 long idx;
463
464 cpumask_var_t started;
465};
466
467int tracer_init(struct tracer *t, struct trace_array *tr); 407int tracer_init(struct tracer *t, struct trace_array *tr);
468int tracing_is_enabled(void); 408int tracing_is_enabled(void);
469void trace_wake_up(void); 409void trace_wake_up(void);
470void tracing_reset(struct trace_array *tr, int cpu); 410void tracing_reset(struct trace_array *tr, int cpu);
471void tracing_reset_online_cpus(struct trace_array *tr); 411void tracing_reset_online_cpus(struct trace_array *tr);
412void tracing_reset_current(int cpu);
413void tracing_reset_current_online_cpus(void);
472int tracing_open_generic(struct inode *inode, struct file *filp); 414int tracing_open_generic(struct inode *inode, struct file *filp);
415struct dentry *trace_create_file(const char *name,
416 mode_t mode,
417 struct dentry *parent,
418 void *data,
419 const struct file_operations *fops);
420
473struct dentry *tracing_init_dentry(void); 421struct dentry *tracing_init_dentry(void);
474void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 422void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
475 423
476struct ring_buffer_event; 424struct ring_buffer_event;
477 425
478struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 426struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
479 unsigned char type, 427 int type,
480 unsigned long len, 428 unsigned long len,
481 unsigned long flags, 429 unsigned long flags,
482 int pc); 430 int pc);
@@ -484,14 +432,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
484 struct ring_buffer_event *event, 432 struct ring_buffer_event *event,
485 unsigned long flags, int pc); 433 unsigned long flags, int pc);
486 434
487struct ring_buffer_event *
488trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
489 unsigned long flags, int pc);
490void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
491 unsigned long flags, int pc);
492void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
493 unsigned long flags, int pc);
494
495struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 435struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
496 struct trace_array_cpu *data); 436 struct trace_array_cpu *data);
497 437
@@ -514,7 +454,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
514 struct task_struct *prev, 454 struct task_struct *prev,
515 struct task_struct *next, 455 struct task_struct *next,
516 unsigned long flags, int pc); 456 unsigned long flags, int pc);
517void tracing_record_cmdline(struct task_struct *tsk);
518 457
519void tracing_sched_wakeup_trace(struct trace_array *tr, 458void tracing_sched_wakeup_trace(struct trace_array *tr,
520 struct task_struct *wakee, 459 struct task_struct *wakee,
@@ -599,6 +538,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
599 struct trace_array *tr); 538 struct trace_array *tr);
600extern int trace_selftest_startup_branch(struct tracer *trace, 539extern int trace_selftest_startup_branch(struct tracer *trace,
601 struct trace_array *tr); 540 struct trace_array *tr);
541extern int trace_selftest_startup_hw_branches(struct tracer *trace,
542 struct trace_array *tr);
602#endif /* CONFIG_FTRACE_STARTUP_TEST */ 543#endif /* CONFIG_FTRACE_STARTUP_TEST */
603 544
604extern void *head_page(struct trace_array_cpu *data); 545extern void *head_page(struct trace_array_cpu *data);
@@ -613,6 +554,8 @@ extern unsigned long trace_flags;
613/* Standard output formatting function used for function return traces */ 554/* Standard output formatting function used for function return traces */
614#ifdef CONFIG_FUNCTION_GRAPH_TRACER 555#ifdef CONFIG_FUNCTION_GRAPH_TRACER
615extern enum print_line_t print_graph_function(struct trace_iterator *iter); 556extern enum print_line_t print_graph_function(struct trace_iterator *iter);
557extern enum print_line_t
558trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
616 559
617#ifdef CONFIG_DYNAMIC_FTRACE 560#ifdef CONFIG_DYNAMIC_FTRACE
618/* TODO: make this variable */ 561/* TODO: make this variable */
@@ -644,7 +587,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
644 return 1; 587 return 1;
645} 588}
646#endif /* CONFIG_DYNAMIC_FTRACE */ 589#endif /* CONFIG_DYNAMIC_FTRACE */
647
648#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 590#else /* CONFIG_FUNCTION_GRAPH_TRACER */
649static inline enum print_line_t 591static inline enum print_line_t
650print_graph_function(struct trace_iterator *iter) 592print_graph_function(struct trace_iterator *iter)
@@ -692,6 +634,7 @@ enum trace_iterator_flags {
692 TRACE_ITER_LATENCY_FMT = 0x40000, 634 TRACE_ITER_LATENCY_FMT = 0x40000,
693 TRACE_ITER_GLOBAL_CLK = 0x80000, 635 TRACE_ITER_GLOBAL_CLK = 0x80000,
694 TRACE_ITER_SLEEP_TIME = 0x100000, 636 TRACE_ITER_SLEEP_TIME = 0x100000,
637 TRACE_ITER_GRAPH_TIME = 0x200000,
695}; 638};
696 639
697/* 640/*
@@ -790,103 +733,113 @@ struct ftrace_event_field {
790 char *type; 733 char *type;
791 int offset; 734 int offset;
792 int size; 735 int size;
736 int is_signed;
793}; 737};
794 738
795struct ftrace_event_call { 739struct event_filter {
796 char *name; 740 int n_preds;
797 char *system;
798 struct dentry *dir;
799 int enabled;
800 int (*regfunc)(void);
801 void (*unregfunc)(void);
802 int id;
803 int (*raw_init)(void);
804 int (*show_format)(struct trace_seq *s);
805 int (*define_fields)(void);
806 struct list_head fields;
807 struct filter_pred **preds; 741 struct filter_pred **preds;
808 742 char *filter_string;
809#ifdef CONFIG_EVENT_PROFILE
810 atomic_t profile_count;
811 int (*profile_enable)(struct ftrace_event_call *);
812 void (*profile_disable)(struct ftrace_event_call *);
813#endif
814}; 743};
815 744
816struct event_subsystem { 745struct event_subsystem {
817 struct list_head list; 746 struct list_head list;
818 const char *name; 747 const char *name;
819 struct dentry *entry; 748 struct dentry *entry;
820 struct filter_pred **preds; 749 void *filter;
821}; 750};
822 751
823#define events_for_each(event) \
824 for (event = __start_ftrace_events; \
825 (unsigned long)event < (unsigned long)__stop_ftrace_events; \
826 event++)
827
828#define MAX_FILTER_PRED 8
829
830struct filter_pred; 752struct filter_pred;
831 753
832typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); 754typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
755 int val1, int val2);
833 756
834struct filter_pred { 757struct filter_pred {
835 filter_pred_fn_t fn; 758 filter_pred_fn_t fn;
836 u64 val; 759 u64 val;
837 char *str_val; 760 char str_val[MAX_FILTER_STR_VAL];
838 int str_len; 761 int str_len;
839 char *field_name; 762 char *field_name;
840 int offset; 763 int offset;
841 int not; 764 int not;
842 int or; 765 int op;
843 int compound; 766 int pop_n;
844 int clear;
845}; 767};
846 768
847int trace_define_field(struct ftrace_event_call *call, char *type, 769extern void print_event_filter(struct ftrace_event_call *call,
848 char *name, int offset, int size);
849extern void filter_free_pred(struct filter_pred *pred);
850extern void filter_print_preds(struct filter_pred **preds,
851 struct trace_seq *s); 770 struct trace_seq *s);
852extern int filter_parse(char **pbuf, struct filter_pred *pred); 771extern int apply_event_filter(struct ftrace_event_call *call,
853extern int filter_add_pred(struct ftrace_event_call *call, 772 char *filter_string);
854 struct filter_pred *pred); 773extern int apply_subsystem_event_filter(struct event_subsystem *system,
855extern void filter_free_preds(struct ftrace_event_call *call); 774 char *filter_string);
856extern int filter_match_preds(struct ftrace_event_call *call, void *rec); 775extern void print_subsystem_event_filter(struct event_subsystem *system,
857extern void filter_free_subsystem_preds(struct event_subsystem *system); 776 struct trace_seq *s);
858extern int filter_add_subsystem_pred(struct event_subsystem *system, 777
859 struct filter_pred *pred); 778static inline int
860 779filter_check_discard(struct ftrace_event_call *call, void *rec,
861void event_trace_printk(unsigned long ip, const char *fmt, ...); 780 struct ring_buffer *buffer,
862extern struct ftrace_event_call __start_ftrace_events[]; 781 struct ring_buffer_event *event)
863extern struct ftrace_event_call __stop_ftrace_events[]; 782{
864 783 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
865#define for_each_event(event) \ 784 ring_buffer_discard_commit(buffer, event);
866 for (event = __start_ftrace_events; \ 785 return 1;
867 (unsigned long)event < (unsigned long)__stop_ftrace_events; \ 786 }
868 event++) 787
788 return 0;
789}
790
791#define DEFINE_COMPARISON_PRED(type) \
792static int filter_pred_##type(struct filter_pred *pred, void *event, \
793 int val1, int val2) \
794{ \
795 type *addr = (type *)(event + pred->offset); \
796 type val = (type)pred->val; \
797 int match = 0; \
798 \
799 switch (pred->op) { \
800 case OP_LT: \
801 match = (*addr < val); \
802 break; \
803 case OP_LE: \
804 match = (*addr <= val); \
805 break; \
806 case OP_GT: \
807 match = (*addr > val); \
808 break; \
809 case OP_GE: \
810 match = (*addr >= val); \
811 break; \
812 default: \
813 break; \
814 } \
815 \
816 return match; \
817}
818
819#define DEFINE_EQUALITY_PRED(size) \
820static int filter_pred_##size(struct filter_pred *pred, void *event, \
821 int val1, int val2) \
822{ \
823 u##size *addr = (u##size *)(event + pred->offset); \
824 u##size val = (u##size)pred->val; \
825 int match; \
826 \
827 match = (val == *addr) ^ pred->not; \
828 \
829 return match; \
830}
831
832extern struct mutex event_mutex;
833extern struct list_head ftrace_events;
869 834
870extern const char *__start___trace_bprintk_fmt[]; 835extern const char *__start___trace_bprintk_fmt[];
871extern const char *__stop___trace_bprintk_fmt[]; 836extern const char *__stop___trace_bprintk_fmt[];
872 837
873/* 838#undef TRACE_EVENT_FORMAT
874 * The double __builtin_constant_p is because gcc will give us an error 839#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
875 * if we try to allocate the static variable to fmt if it is not a 840 extern struct ftrace_event_call event_##call;
876 * constant. Even with the outer if statement optimizing out. 841#undef TRACE_EVENT_FORMAT_NOFILTER
877 */ 842#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
878#define event_trace_printk(ip, fmt, args...) \ 843#include "trace_event_types.h"
879do { \
880 __trace_printk_check_format(fmt, ##args); \
881 tracing_record_cmdline(current); \
882 if (__builtin_constant_p(fmt)) { \
883 static const char *trace_printk_fmt \
884 __attribute__((section("__trace_printk_fmt"))) = \
885 __builtin_constant_p(fmt) ? fmt : NULL; \
886 \
887 __trace_bprintk(ip, trace_printk_fmt, ##args); \
888 } else \
889 __trace_printk(ip, fmt, ##args); \
890} while (0)
891 844
892#endif /* _LINUX_KERNEL_TRACE_H */ 845#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7a30fc4c3642..a29ef23ffb47 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/time.h>
12 13
13#include "trace.h" 14#include "trace.h"
14#include "trace_output.h" 15#include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
67 trace_assign_type(field, entry); 68 trace_assign_type(field, entry);
68 call = &field->boot_call; 69 call = &field->boot_call;
69 ts = iter->ts; 70 ts = iter->ts;
70 nsec_rem = do_div(ts, 1000000000); 71 nsec_rem = do_div(ts, NSEC_PER_SEC);
71 72
72 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", 73 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
73 (unsigned long)ts, nsec_rem, call->func, call->caller); 74 (unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
92 trace_assign_type(field, entry); 93 trace_assign_type(field, entry);
93 init_ret = &field->boot_ret; 94 init_ret = &field->boot_ret;
94 ts = iter->ts; 95 ts = iter->ts;
95 nsec_rem = do_div(ts, 1000000000); 96 nsec_rem = do_div(ts, NSEC_PER_SEC);
96 97
97 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " 98 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
98 "returned %d after %llu msecs\n", 99 "returned %d after %llu msecs\n",
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8333715e4066..7a7a9fd249a9 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
30static void 30static void
31probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) 31probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
32{ 32{
33 struct ftrace_event_call *call = &event_branch;
33 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
34 struct ring_buffer_event *event; 35 struct ring_buffer_event *event;
35 struct trace_branch *entry; 36 struct trace_branch *entry;
@@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
73 entry->line = f->line; 74 entry->line = f->line;
74 entry->correct = val == expect; 75 entry->correct = val == expect;
75 76
76 ring_buffer_unlock_commit(tr->buffer, event); 77 if (!filter_check_discard(call, entry, tr->buffer, event))
78 ring_buffer_unlock_commit(tr->buffer, event);
77 79
78 out: 80 out:
79 atomic_dec(&tr->data[cpu]->disabled); 81 atomic_dec(&tr->data[cpu]->disabled);
@@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
271 return 0; 273 return 0;
272} 274}
273 275
274static void *annotated_branch_stat_start(void) 276static void *annotated_branch_stat_start(struct tracer_stat *trace)
275{ 277{
276 return __start_annotated_branch_profile; 278 return __start_annotated_branch_profile;
277} 279}
@@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m)
346 return 0; 348 return 0;
347} 349}
348 350
349static void *all_branch_stat_start(void) 351static void *all_branch_stat_start(struct tracer_stat *trace)
350{ 352{
351 return __start_branch_profile; 353 return __start_branch_profile;
352} 354}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 22cba9970776..5b5895afecfe 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -10,22 +10,30 @@
10int ftrace_profile_enable(int event_id) 10int ftrace_profile_enable(int event_id)
11{ 11{
12 struct ftrace_event_call *event; 12 struct ftrace_event_call *event;
13 int ret = -EINVAL;
13 14
14 for_each_event(event) { 15 mutex_lock(&event_mutex);
15 if (event->id == event_id) 16 list_for_each_entry(event, &ftrace_events, list) {
16 return event->profile_enable(event); 17 if (event->id == event_id) {
18 ret = event->profile_enable(event);
19 break;
20 }
17 } 21 }
22 mutex_unlock(&event_mutex);
18 23
19 return -EINVAL; 24 return ret;
20} 25}
21 26
22void ftrace_profile_disable(int event_id) 27void ftrace_profile_disable(int event_id)
23{ 28{
24 struct ftrace_event_call *event; 29 struct ftrace_event_call *event;
25 30
26 for_each_event(event) { 31 mutex_lock(&event_mutex);
27 if (event->id == event_id) 32 list_for_each_entry(event, &ftrace_events, list) {
28 return event->profile_disable(event); 33 if (event->id == event_id) {
34 event->profile_disable(event);
35 break;
36 }
29 } 37 }
38 mutex_unlock(&event_mutex);
30} 39}
31
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index fd78bee71dd7..5e32e375134d 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") 57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
58); 58);
59 59
60TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, 60TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
61 TRACE_STRUCT( 61 TRACE_STRUCT(
62 TRACE_FIELD(unsigned long, arg1, arg1) 62 TRACE_FIELD(unsigned long, arg1, arg1)
63 TRACE_FIELD(unsigned long, arg2, arg2) 63 TRACE_FIELD(unsigned long, arg2, arg2)
@@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
122TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, 122TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
123 TRACE_STRUCT( 123 TRACE_STRUCT(
124 TRACE_FIELD(unsigned int, line, line) 124 TRACE_FIELD(unsigned int, line, line)
125 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) 125 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
126 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) 126 TRACE_FUNC_SIZE+1, func)
127 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
128 TRACE_FUNC_SIZE+1, file)
127 TRACE_FIELD(char, correct, correct) 129 TRACE_FIELD(char, correct, correct)
128 ), 130 ),
129 TP_RAW_FMT("%u:%s:%s (%u)") 131 TP_RAW_FMT("%u:%s:%s (%u)")
@@ -139,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
139 141
140TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, 142TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
141 TRACE_STRUCT( 143 TRACE_STRUCT(
142 TRACE_FIELD(ktime_t, state_data.stamp, stamp) 144 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
143 TRACE_FIELD(ktime_t, state_data.end, end) 145 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
144 TRACE_FIELD(int, state_data.type, type) 146 TRACE_FIELD(int, state_data.type, type)
145 TRACE_FIELD(int, state_data.state, state) 147 TRACE_FIELD(int, state_data.state, state)
146 ), 148 ),
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 576f4fa2af0d..aa08be69a1b6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,19 +8,25 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/workqueue.h>
12#include <linux/spinlock.h>
13#include <linux/kthread.h>
11#include <linux/debugfs.h> 14#include <linux/debugfs.h>
12#include <linux/uaccess.h> 15#include <linux/uaccess.h>
13#include <linux/module.h> 16#include <linux/module.h>
14#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h>
15 19
16#include "trace_output.h" 20#include "trace_output.h"
17 21
18#define TRACE_SYSTEM "TRACE_SYSTEM" 22#define TRACE_SYSTEM "TRACE_SYSTEM"
19 23
20static DEFINE_MUTEX(event_mutex); 24DEFINE_MUTEX(event_mutex);
25
26LIST_HEAD(ftrace_events);
21 27
22int trace_define_field(struct ftrace_event_call *call, char *type, 28int trace_define_field(struct ftrace_event_call *call, char *type,
23 char *name, int offset, int size) 29 char *name, int offset, int size, int is_signed)
24{ 30{
25 struct ftrace_event_field *field; 31 struct ftrace_event_field *field;
26 32
@@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
38 44
39 field->offset = offset; 45 field->offset = offset;
40 field->size = size; 46 field->size = size;
47 field->is_signed = is_signed;
41 list_add(&field->link, &call->fields); 48 list_add(&field->link, &call->fields);
42 49
43 return 0; 50 return 0;
@@ -51,47 +58,94 @@ err:
51 58
52 return -ENOMEM; 59 return -ENOMEM;
53} 60}
61EXPORT_SYMBOL_GPL(trace_define_field);
54 62
55static void ftrace_clear_events(void) 63#ifdef CONFIG_MODULES
56{
57 struct ftrace_event_call *call = (void *)__start_ftrace_events;
58
59 64
60 while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { 65static void trace_destroy_fields(struct ftrace_event_call *call)
66{
67 struct ftrace_event_field *field, *next;
61 68
62 if (call->enabled) { 69 list_for_each_entry_safe(field, next, &call->fields, link) {
63 call->enabled = 0; 70 list_del(&field->link);
64 call->unregfunc(); 71 kfree(field->type);
65 } 72 kfree(field->name);
66 call++; 73 kfree(field);
67 } 74 }
68} 75}
69 76
77#endif /* CONFIG_MODULES */
78
70static void ftrace_event_enable_disable(struct ftrace_event_call *call, 79static void ftrace_event_enable_disable(struct ftrace_event_call *call,
71 int enable) 80 int enable)
72{ 81{
73
74 switch (enable) { 82 switch (enable) {
75 case 0: 83 case 0:
76 if (call->enabled) { 84 if (call->enabled) {
77 call->enabled = 0; 85 call->enabled = 0;
86 tracing_stop_cmdline_record();
78 call->unregfunc(); 87 call->unregfunc();
79 } 88 }
80 break; 89 break;
81 case 1: 90 case 1:
82 if (!call->enabled) { 91 if (!call->enabled) {
83 call->enabled = 1; 92 call->enabled = 1;
93 tracing_start_cmdline_record();
84 call->regfunc(); 94 call->regfunc();
85 } 95 }
86 break; 96 break;
87 } 97 }
88} 98}
89 99
100static void ftrace_clear_events(void)
101{
102 struct ftrace_event_call *call;
103
104 mutex_lock(&event_mutex);
105 list_for_each_entry(call, &ftrace_events, list) {
106 ftrace_event_enable_disable(call, 0);
107 }
108 mutex_unlock(&event_mutex);
109}
110
111/*
112 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
113 */
114static int __ftrace_set_clr_event(const char *match, const char *sub,
115 const char *event, int set)
116{
117 struct ftrace_event_call *call;
118 int ret = -EINVAL;
119
120 mutex_lock(&event_mutex);
121 list_for_each_entry(call, &ftrace_events, list) {
122
123 if (!call->name || !call->regfunc)
124 continue;
125
126 if (match &&
127 strcmp(match, call->name) != 0 &&
128 strcmp(match, call->system) != 0)
129 continue;
130
131 if (sub && strcmp(sub, call->system) != 0)
132 continue;
133
134 if (event && strcmp(event, call->name) != 0)
135 continue;
136
137 ftrace_event_enable_disable(call, set);
138
139 ret = 0;
140 }
141 mutex_unlock(&event_mutex);
142
143 return ret;
144}
145
90static int ftrace_set_clr_event(char *buf, int set) 146static int ftrace_set_clr_event(char *buf, int set)
91{ 147{
92 struct ftrace_event_call *call = __start_ftrace_events;
93 char *event = NULL, *sub = NULL, *match; 148 char *event = NULL, *sub = NULL, *match;
94 int ret = -EINVAL;
95 149
96 /* 150 /*
97 * The buf format can be <subsystem>:<event-name> 151 * The buf format can be <subsystem>:<event-name>
@@ -117,30 +171,24 @@ static int ftrace_set_clr_event(char *buf, int set)
117 event = NULL; 171 event = NULL;
118 } 172 }
119 173
120 mutex_lock(&event_mutex); 174 return __ftrace_set_clr_event(match, sub, event, set);
121 for_each_event(call) { 175}
122
123 if (!call->name || !call->regfunc)
124 continue;
125
126 if (match &&
127 strcmp(match, call->name) != 0 &&
128 strcmp(match, call->system) != 0)
129 continue;
130
131 if (sub && strcmp(sub, call->system) != 0)
132 continue;
133
134 if (event && strcmp(event, call->name) != 0)
135 continue;
136
137 ftrace_event_enable_disable(call, set);
138
139 ret = 0;
140 }
141 mutex_unlock(&event_mutex);
142 176
143 return ret; 177/**
178 * trace_set_clr_event - enable or disable an event
179 * @system: system name to match (NULL for any system)
180 * @event: event name to match (NULL for all events, within system)
181 * @set: 1 to enable, 0 to disable
182 *
183 * This is a way for other parts of the kernel to enable or disable
184 * event recording.
185 *
186 * Returns 0 on success, -EINVAL if the parameters do not match any
187 * registered events.
188 */
189int trace_set_clr_event(const char *system, const char *event, int set)
190{
191 return __ftrace_set_clr_event(NULL, system, event, set);
144} 192}
145 193
146/* 128 should be much more than enough */ 194/* 128 should be much more than enough */
@@ -224,15 +272,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
224static void * 272static void *
225t_next(struct seq_file *m, void *v, loff_t *pos) 273t_next(struct seq_file *m, void *v, loff_t *pos)
226{ 274{
227 struct ftrace_event_call *call = m->private; 275 struct list_head *list = m->private;
228 struct ftrace_event_call *next = call; 276 struct ftrace_event_call *call;
229 277
230 (*pos)++; 278 (*pos)++;
231 279
232 for (;;) { 280 for (;;) {
233 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) 281 if (list == &ftrace_events)
234 return NULL; 282 return NULL;
235 283
284 call = list_entry(list, struct ftrace_event_call, list);
285
236 /* 286 /*
237 * The ftrace subsystem is for showing formats only. 287 * The ftrace subsystem is for showing formats only.
238 * They can not be enabled or disabled via the event files. 288 * They can not be enabled or disabled via the event files.
@@ -240,45 +290,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
240 if (call->regfunc) 290 if (call->regfunc)
241 break; 291 break;
242 292
243 call++; 293 list = list->next;
244 next = call;
245 } 294 }
246 295
247 m->private = ++next; 296 m->private = list->next;
248 297
249 return call; 298 return call;
250} 299}
251 300
252static void *t_start(struct seq_file *m, loff_t *pos) 301static void *t_start(struct seq_file *m, loff_t *pos)
253{ 302{
303 mutex_lock(&event_mutex);
304 if (*pos == 0)
305 m->private = ftrace_events.next;
254 return t_next(m, NULL, pos); 306 return t_next(m, NULL, pos);
255} 307}
256 308
257static void * 309static void *
258s_next(struct seq_file *m, void *v, loff_t *pos) 310s_next(struct seq_file *m, void *v, loff_t *pos)
259{ 311{
260 struct ftrace_event_call *call = m->private; 312 struct list_head *list = m->private;
261 struct ftrace_event_call *next; 313 struct ftrace_event_call *call;
262 314
263 (*pos)++; 315 (*pos)++;
264 316
265 retry: 317 retry:
266 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) 318 if (list == &ftrace_events)
267 return NULL; 319 return NULL;
268 320
321 call = list_entry(list, struct ftrace_event_call, list);
322
269 if (!call->enabled) { 323 if (!call->enabled) {
270 call++; 324 list = list->next;
271 goto retry; 325 goto retry;
272 } 326 }
273 327
274 next = call; 328 m->private = list->next;
275 m->private = ++next;
276 329
277 return call; 330 return call;
278} 331}
279 332
280static void *s_start(struct seq_file *m, loff_t *pos) 333static void *s_start(struct seq_file *m, loff_t *pos)
281{ 334{
335 mutex_lock(&event_mutex);
336 if (*pos == 0)
337 m->private = ftrace_events.next;
282 return s_next(m, NULL, pos); 338 return s_next(m, NULL, pos);
283} 339}
284 340
@@ -295,12 +351,12 @@ static int t_show(struct seq_file *m, void *v)
295 351
296static void t_stop(struct seq_file *m, void *p) 352static void t_stop(struct seq_file *m, void *p)
297{ 353{
354 mutex_unlock(&event_mutex);
298} 355}
299 356
300static int 357static int
301ftrace_event_seq_open(struct inode *inode, struct file *file) 358ftrace_event_seq_open(struct inode *inode, struct file *file)
302{ 359{
303 int ret;
304 const struct seq_operations *seq_ops; 360 const struct seq_operations *seq_ops;
305 361
306 if ((file->f_mode & FMODE_WRITE) && 362 if ((file->f_mode & FMODE_WRITE) &&
@@ -308,13 +364,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
308 ftrace_clear_events(); 364 ftrace_clear_events();
309 365
310 seq_ops = inode->i_private; 366 seq_ops = inode->i_private;
311 ret = seq_open(file, seq_ops); 367 return seq_open(file, seq_ops);
312 if (!ret) {
313 struct seq_file *m = file->private_data;
314
315 m->private = __start_ftrace_events;
316 }
317 return ret;
318} 368}
319 369
320static ssize_t 370static ssize_t
@@ -374,8 +424,93 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
374 return cnt; 424 return cnt;
375} 425}
376 426
427static ssize_t
428system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
429 loff_t *ppos)
430{
431 const char set_to_char[4] = { '?', '0', '1', 'X' };
432 const char *system = filp->private_data;
433 struct ftrace_event_call *call;
434 char buf[2];
435 int set = 0;
436 int ret;
437
438 mutex_lock(&event_mutex);
439 list_for_each_entry(call, &ftrace_events, list) {
440 if (!call->name || !call->regfunc)
441 continue;
442
443 if (system && strcmp(call->system, system) != 0)
444 continue;
445
446 /*
447 * We need to find out if all the events are set
448 * or if all events or cleared, or if we have
449 * a mixture.
450 */
451 set |= (1 << !!call->enabled);
452
453 /*
454 * If we have a mixture, no need to look further.
455 */
456 if (set == 3)
457 break;
458 }
459 mutex_unlock(&event_mutex);
460
461 buf[0] = set_to_char[set];
462 buf[1] = '\n';
463
464 ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
465
466 return ret;
467}
468
469static ssize_t
470system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
471 loff_t *ppos)
472{
473 const char *system = filp->private_data;
474 unsigned long val;
475 char buf[64];
476 ssize_t ret;
477
478 if (cnt >= sizeof(buf))
479 return -EINVAL;
480
481 if (copy_from_user(&buf, ubuf, cnt))
482 return -EFAULT;
483
484 buf[cnt] = 0;
485
486 ret = strict_strtoul(buf, 10, &val);
487 if (ret < 0)
488 return ret;
489
490 ret = tracing_update_buffers();
491 if (ret < 0)
492 return ret;
493
494 if (val != 0 && val != 1)
495 return -EINVAL;
496
497 ret = __ftrace_set_clr_event(NULL, system, NULL, val);
498 if (ret)
499 goto out;
500
501 ret = cnt;
502
503out:
504 *ppos += cnt;
505
506 return ret;
507}
508
509extern char *__bad_type_size(void);
510
377#undef FIELD 511#undef FIELD
378#define FIELD(type, name) \ 512#define FIELD(type, name) \
513 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
379 #type, "common_" #name, offsetof(typeof(field), name), \ 514 #type, "common_" #name, offsetof(typeof(field), name), \
380 sizeof(field.name) 515 sizeof(field.name)
381 516
@@ -391,7 +526,7 @@ static int trace_write_header(struct trace_seq *s)
391 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 526 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
392 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 527 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
393 "\n", 528 "\n",
394 FIELD(unsigned char, type), 529 FIELD(unsigned short, type),
395 FIELD(unsigned char, flags), 530 FIELD(unsigned char, flags),
396 FIELD(unsigned char, preempt_count), 531 FIELD(unsigned char, preempt_count),
397 FIELD(int, pid), 532 FIELD(int, pid),
@@ -481,7 +616,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
481 616
482 trace_seq_init(s); 617 trace_seq_init(s);
483 618
484 filter_print_preds(call->preds, s); 619 print_event_filter(call, s);
485 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 620 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
486 621
487 kfree(s); 622 kfree(s);
@@ -494,38 +629,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
494 loff_t *ppos) 629 loff_t *ppos)
495{ 630{
496 struct ftrace_event_call *call = filp->private_data; 631 struct ftrace_event_call *call = filp->private_data;
497 char buf[64], *pbuf = buf; 632 char *buf;
498 struct filter_pred *pred;
499 int err; 633 int err;
500 634
501 if (cnt >= sizeof(buf)) 635 if (cnt >= PAGE_SIZE)
502 return -EINVAL; 636 return -EINVAL;
503 637
504 if (copy_from_user(&buf, ubuf, cnt)) 638 buf = (char *)__get_free_page(GFP_TEMPORARY);
505 return -EFAULT; 639 if (!buf)
506 buf[cnt] = '\0';
507
508 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
509 if (!pred)
510 return -ENOMEM; 640 return -ENOMEM;
511 641
512 err = filter_parse(&pbuf, pred); 642 if (copy_from_user(buf, ubuf, cnt)) {
513 if (err < 0) { 643 free_page((unsigned long) buf);
514 filter_free_pred(pred); 644 return -EFAULT;
515 return err;
516 }
517
518 if (pred->clear) {
519 filter_free_preds(call);
520 filter_free_pred(pred);
521 return cnt;
522 } 645 }
646 buf[cnt] = '\0';
523 647
524 err = filter_add_pred(call, pred); 648 err = apply_event_filter(call, buf);
525 if (err < 0) { 649 free_page((unsigned long) buf);
526 filter_free_pred(pred); 650 if (err < 0)
527 return err; 651 return err;
528 }
529 652
530 *ppos += cnt; 653 *ppos += cnt;
531 654
@@ -549,7 +672,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
549 672
550 trace_seq_init(s); 673 trace_seq_init(s);
551 674
552 filter_print_preds(system->preds, s); 675 print_subsystem_event_filter(system, s);
553 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 676 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
554 677
555 kfree(s); 678 kfree(s);
@@ -562,45 +685,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
562 loff_t *ppos) 685 loff_t *ppos)
563{ 686{
564 struct event_subsystem *system = filp->private_data; 687 struct event_subsystem *system = filp->private_data;
565 char buf[64], *pbuf = buf; 688 char *buf;
566 struct filter_pred *pred;
567 int err; 689 int err;
568 690
569 if (cnt >= sizeof(buf)) 691 if (cnt >= PAGE_SIZE)
570 return -EINVAL; 692 return -EINVAL;
571 693
572 if (copy_from_user(&buf, ubuf, cnt)) 694 buf = (char *)__get_free_page(GFP_TEMPORARY);
573 return -EFAULT; 695 if (!buf)
574 buf[cnt] = '\0';
575
576 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
577 if (!pred)
578 return -ENOMEM; 696 return -ENOMEM;
579 697
580 err = filter_parse(&pbuf, pred); 698 if (copy_from_user(buf, ubuf, cnt)) {
581 if (err < 0) { 699 free_page((unsigned long) buf);
582 filter_free_pred(pred); 700 return -EFAULT;
583 return err;
584 }
585
586 if (pred->clear) {
587 filter_free_subsystem_preds(system);
588 filter_free_pred(pred);
589 return cnt;
590 } 701 }
702 buf[cnt] = '\0';
591 703
592 err = filter_add_subsystem_pred(system, pred); 704 err = apply_subsystem_event_filter(system, buf);
593 if (err < 0) { 705 free_page((unsigned long) buf);
594 filter_free_subsystem_preds(system); 706 if (err < 0)
595 filter_free_pred(pred);
596 return err; 707 return err;
597 }
598 708
599 *ppos += cnt; 709 *ppos += cnt;
600 710
601 return cnt; 711 return cnt;
602} 712}
603 713
714static ssize_t
715show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
716{
717 int (*func)(struct trace_seq *s) = filp->private_data;
718 struct trace_seq *s;
719 int r;
720
721 if (*ppos)
722 return 0;
723
724 s = kmalloc(sizeof(*s), GFP_KERNEL);
725 if (!s)
726 return -ENOMEM;
727
728 trace_seq_init(s);
729
730 func(s);
731 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
732
733 kfree(s);
734
735 return r;
736}
737
604static const struct seq_operations show_event_seq_ops = { 738static const struct seq_operations show_event_seq_ops = {
605 .start = t_start, 739 .start = t_start,
606 .next = t_next, 740 .next = t_next,
@@ -658,6 +792,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
658 .write = subsystem_filter_write, 792 .write = subsystem_filter_write,
659}; 793};
660 794
795static const struct file_operations ftrace_system_enable_fops = {
796 .open = tracing_open_generic,
797 .read = system_enable_read,
798 .write = system_enable_write,
799};
800
801static const struct file_operations ftrace_show_header_fops = {
802 .open = tracing_open_generic,
803 .read = show_header,
804};
805
661static struct dentry *event_trace_events_dir(void) 806static struct dentry *event_trace_events_dir(void)
662{ 807{
663 static struct dentry *d_tracer; 808 static struct dentry *d_tracer;
@@ -684,6 +829,7 @@ static struct dentry *
684event_subsystem_dir(const char *name, struct dentry *d_events) 829event_subsystem_dir(const char *name, struct dentry *d_events)
685{ 830{
686 struct event_subsystem *system; 831 struct event_subsystem *system;
832 struct dentry *entry;
687 833
688 /* First see if we did not already create this dir */ 834 /* First see if we did not already create this dir */
689 list_for_each_entry(system, &event_subsystems, list) { 835 list_for_each_entry(system, &event_subsystems, list) {
@@ -707,16 +853,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
707 return d_events; 853 return d_events;
708 } 854 }
709 855
710 system->name = name; 856 system->name = kstrdup(name, GFP_KERNEL);
857 if (!system->name) {
858 debugfs_remove(system->entry);
859 kfree(system);
860 return d_events;
861 }
862
711 list_add(&system->list, &event_subsystems); 863 list_add(&system->list, &event_subsystems);
712 864
713 system->preds = NULL; 865 system->filter = NULL;
866
867 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
868 if (!system->filter) {
869 pr_warning("Could not allocate filter for subsystem "
870 "'%s'\n", name);
871 return system->entry;
872 }
873
874 entry = debugfs_create_file("filter", 0644, system->entry, system,
875 &ftrace_subsystem_filter_fops);
876 if (!entry) {
877 kfree(system->filter);
878 system->filter = NULL;
879 pr_warning("Could not create debugfs "
880 "'%s/filter' entry\n", name);
881 }
882
883 entry = trace_create_file("enable", 0644, system->entry,
884 (void *)system->name,
885 &ftrace_system_enable_fops);
714 886
715 return system->entry; 887 return system->entry;
716} 888}
717 889
718static int 890static int
719event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) 891event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
892 const struct file_operations *id,
893 const struct file_operations *enable,
894 const struct file_operations *filter,
895 const struct file_operations *format)
720{ 896{
721 struct dentry *entry; 897 struct dentry *entry;
722 int ret; 898 int ret;
@@ -725,7 +901,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
725 * If the trace point header did not define TRACE_SYSTEM 901 * If the trace point header did not define TRACE_SYSTEM
726 * then the system would be called "TRACE_SYSTEM". 902 * then the system would be called "TRACE_SYSTEM".
727 */ 903 */
728 if (strcmp(call->system, "TRACE_SYSTEM") != 0) 904 if (strcmp(call->system, TRACE_SYSTEM) != 0)
729 d_events = event_subsystem_dir(call->system, d_events); 905 d_events = event_subsystem_dir(call->system, d_events);
730 906
731 if (call->raw_init) { 907 if (call->raw_init) {
@@ -744,21 +920,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
744 return -1; 920 return -1;
745 } 921 }
746 922
747 if (call->regfunc) { 923 if (call->regfunc)
748 entry = debugfs_create_file("enable", 0644, call->dir, call, 924 entry = trace_create_file("enable", 0644, call->dir, call,
749 &ftrace_enable_fops); 925 enable);
750 if (!entry)
751 pr_warning("Could not create debugfs "
752 "'%s/enable' entry\n", call->name);
753 }
754 926
755 if (call->id) { 927 if (call->id)
756 entry = debugfs_create_file("id", 0444, call->dir, call, 928 entry = trace_create_file("id", 0444, call->dir, call,
757 &ftrace_event_id_fops); 929 id);
758 if (!entry)
759 pr_warning("Could not create debugfs '%s/id' entry\n",
760 call->name);
761 }
762 930
763 if (call->define_fields) { 931 if (call->define_fields) {
764 ret = call->define_fields(); 932 ret = call->define_fields();
@@ -767,32 +935,195 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
767 " events/%s\n", call->name); 935 " events/%s\n", call->name);
768 return ret; 936 return ret;
769 } 937 }
770 entry = debugfs_create_file("filter", 0644, call->dir, call, 938 entry = trace_create_file("filter", 0644, call->dir, call,
771 &ftrace_event_filter_fops); 939 filter);
772 if (!entry)
773 pr_warning("Could not create debugfs "
774 "'%s/filter' entry\n", call->name);
775 } 940 }
776 941
777 /* A trace may not want to export its format */ 942 /* A trace may not want to export its format */
778 if (!call->show_format) 943 if (!call->show_format)
779 return 0; 944 return 0;
780 945
781 entry = debugfs_create_file("format", 0444, call->dir, call, 946 entry = trace_create_file("format", 0444, call->dir, call,
782 &ftrace_event_format_fops); 947 format);
783 if (!entry) 948
784 pr_warning("Could not create debugfs " 949 return 0;
785 "'%s/format' entry\n", call->name); 950}
951
952#define for_each_event(event, start, end) \
953 for (event = start; \
954 (unsigned long)event < (unsigned long)end; \
955 event++)
956
957#ifdef CONFIG_MODULES
958
959static LIST_HEAD(ftrace_module_file_list);
960
961/*
962 * Modules must own their file_operations to keep up with
963 * reference counting.
964 */
965struct ftrace_module_file_ops {
966 struct list_head list;
967 struct module *mod;
968 struct file_operations id;
969 struct file_operations enable;
970 struct file_operations format;
971 struct file_operations filter;
972};
973
974static struct ftrace_module_file_ops *
975trace_create_file_ops(struct module *mod)
976{
977 struct ftrace_module_file_ops *file_ops;
978
979 /*
980 * This is a bit of a PITA. To allow for correct reference
981 * counting, modules must "own" their file_operations.
982 * To do this, we allocate the file operations that will be
983 * used in the event directory.
984 */
985
986 file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
987 if (!file_ops)
988 return NULL;
989
990 file_ops->mod = mod;
991
992 file_ops->id = ftrace_event_id_fops;
993 file_ops->id.owner = mod;
994
995 file_ops->enable = ftrace_enable_fops;
996 file_ops->enable.owner = mod;
997
998 file_ops->filter = ftrace_event_filter_fops;
999 file_ops->filter.owner = mod;
1000
1001 file_ops->format = ftrace_event_format_fops;
1002 file_ops->format.owner = mod;
1003
1004 list_add(&file_ops->list, &ftrace_module_file_list);
1005
1006 return file_ops;
1007}
1008
1009static void trace_module_add_events(struct module *mod)
1010{
1011 struct ftrace_module_file_ops *file_ops = NULL;
1012 struct ftrace_event_call *call, *start, *end;
1013 struct dentry *d_events;
1014
1015 start = mod->trace_events;
1016 end = mod->trace_events + mod->num_trace_events;
1017
1018 if (start == end)
1019 return;
1020
1021 d_events = event_trace_events_dir();
1022 if (!d_events)
1023 return;
1024
1025 for_each_event(call, start, end) {
1026 /* The linker may leave blanks */
1027 if (!call->name)
1028 continue;
1029
1030 /*
1031 * This module has events, create file ops for this module
1032 * if not already done.
1033 */
1034 if (!file_ops) {
1035 file_ops = trace_create_file_ops(mod);
1036 if (!file_ops)
1037 return;
1038 }
1039 call->mod = mod;
1040 list_add(&call->list, &ftrace_events);
1041 event_create_dir(call, d_events,
1042 &file_ops->id, &file_ops->enable,
1043 &file_ops->filter, &file_ops->format);
1044 }
1045}
1046
1047static void trace_module_remove_events(struct module *mod)
1048{
1049 struct ftrace_module_file_ops *file_ops;
1050 struct ftrace_event_call *call, *p;
1051 bool found = false;
1052
1053 down_write(&trace_event_mutex);
1054 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1055 if (call->mod == mod) {
1056 found = true;
1057 ftrace_event_enable_disable(call, 0);
1058 if (call->event)
1059 __unregister_ftrace_event(call->event);
1060 debugfs_remove_recursive(call->dir);
1061 list_del(&call->list);
1062 trace_destroy_fields(call);
1063 destroy_preds(call);
1064 }
1065 }
1066
1067 /* Now free the file_operations */
1068 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1069 if (file_ops->mod == mod)
1070 break;
1071 }
1072 if (&file_ops->list != &ftrace_module_file_list) {
1073 list_del(&file_ops->list);
1074 kfree(file_ops);
1075 }
1076
1077 /*
1078 * It is safest to reset the ring buffer if the module being unloaded
1079 * registered any events.
1080 */
1081 if (found)
1082 tracing_reset_current_online_cpus();
1083 up_write(&trace_event_mutex);
1084}
1085
1086static int trace_module_notify(struct notifier_block *self,
1087 unsigned long val, void *data)
1088{
1089 struct module *mod = data;
1090
1091 mutex_lock(&event_mutex);
1092 switch (val) {
1093 case MODULE_STATE_COMING:
1094 trace_module_add_events(mod);
1095 break;
1096 case MODULE_STATE_GOING:
1097 trace_module_remove_events(mod);
1098 break;
1099 }
1100 mutex_unlock(&event_mutex);
786 1101
787 return 0; 1102 return 0;
788} 1103}
1104#else
1105static int trace_module_notify(struct notifier_block *self,
1106 unsigned long val, void *data)
1107{
1108 return 0;
1109}
1110#endif /* CONFIG_MODULES */
1111
1112struct notifier_block trace_module_nb = {
1113 .notifier_call = trace_module_notify,
1114 .priority = 0,
1115};
1116
1117extern struct ftrace_event_call __start_ftrace_events[];
1118extern struct ftrace_event_call __stop_ftrace_events[];
789 1119
790static __init int event_trace_init(void) 1120static __init int event_trace_init(void)
791{ 1121{
792 struct ftrace_event_call *call = __start_ftrace_events; 1122 struct ftrace_event_call *call;
793 struct dentry *d_tracer; 1123 struct dentry *d_tracer;
794 struct dentry *entry; 1124 struct dentry *entry;
795 struct dentry *d_events; 1125 struct dentry *d_events;
1126 int ret;
796 1127
797 d_tracer = tracing_init_dentry(); 1128 d_tracer = tracing_init_dentry();
798 if (!d_tracer) 1129 if (!d_tracer)
@@ -816,13 +1147,243 @@ static __init int event_trace_init(void)
816 if (!d_events) 1147 if (!d_events)
817 return 0; 1148 return 0;
818 1149
819 for_each_event(call) { 1150 /* ring buffer internal formats */
1151 trace_create_file("header_page", 0444, d_events,
1152 ring_buffer_print_page_header,
1153 &ftrace_show_header_fops);
1154
1155 trace_create_file("header_event", 0444, d_events,
1156 ring_buffer_print_entry_header,
1157 &ftrace_show_header_fops);
1158
1159 trace_create_file("enable", 0644, d_events,
1160 NULL, &ftrace_system_enable_fops);
1161
1162 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
820 /* The linker may leave blanks */ 1163 /* The linker may leave blanks */
821 if (!call->name) 1164 if (!call->name)
822 continue; 1165 continue;
823 event_create_dir(call, d_events); 1166 list_add(&call->list, &ftrace_events);
1167 event_create_dir(call, d_events, &ftrace_event_id_fops,
1168 &ftrace_enable_fops, &ftrace_event_filter_fops,
1169 &ftrace_event_format_fops);
824 } 1170 }
825 1171
1172 ret = register_module_notifier(&trace_module_nb);
1173 if (ret)
1174 pr_warning("Failed to register trace events module notifier\n");
1175
826 return 0; 1176 return 0;
827} 1177}
828fs_initcall(event_trace_init); 1178fs_initcall(event_trace_init);
1179
1180#ifdef CONFIG_FTRACE_STARTUP_TEST
1181
1182static DEFINE_SPINLOCK(test_spinlock);
1183static DEFINE_SPINLOCK(test_spinlock_irq);
1184static DEFINE_MUTEX(test_mutex);
1185
1186static __init void test_work(struct work_struct *dummy)
1187{
1188 spin_lock(&test_spinlock);
1189 spin_lock_irq(&test_spinlock_irq);
1190 udelay(1);
1191 spin_unlock_irq(&test_spinlock_irq);
1192 spin_unlock(&test_spinlock);
1193
1194 mutex_lock(&test_mutex);
1195 msleep(1);
1196 mutex_unlock(&test_mutex);
1197}
1198
1199static __init int event_test_thread(void *unused)
1200{
1201 void *test_malloc;
1202
1203 test_malloc = kmalloc(1234, GFP_KERNEL);
1204 if (!test_malloc)
1205 pr_info("failed to kmalloc\n");
1206
1207 schedule_on_each_cpu(test_work);
1208
1209 kfree(test_malloc);
1210
1211 set_current_state(TASK_INTERRUPTIBLE);
1212 while (!kthread_should_stop())
1213 schedule();
1214
1215 return 0;
1216}
1217
1218/*
1219 * Do various things that may trigger events.
1220 */
1221static __init void event_test_stuff(void)
1222{
1223 struct task_struct *test_thread;
1224
1225 test_thread = kthread_run(event_test_thread, NULL, "test-events");
1226 msleep(1);
1227 kthread_stop(test_thread);
1228}
1229
1230/*
1231 * For every trace event defined, we will test each trace point separately,
1232 * and then by groups, and finally all trace points.
1233 */
1234static __init void event_trace_self_tests(void)
1235{
1236 struct ftrace_event_call *call;
1237 struct event_subsystem *system;
1238 int ret;
1239
1240 pr_info("Running tests on trace events:\n");
1241
1242 list_for_each_entry(call, &ftrace_events, list) {
1243
1244 /* Only test those that have a regfunc */
1245 if (!call->regfunc)
1246 continue;
1247
1248 pr_info("Testing event %s: ", call->name);
1249
1250 /*
1251 * If an event is already enabled, someone is using
1252 * it and the self test should not be on.
1253 */
1254 if (call->enabled) {
1255 pr_warning("Enabled event during self test!\n");
1256 WARN_ON_ONCE(1);
1257 continue;
1258 }
1259
1260 ftrace_event_enable_disable(call, 1);
1261 event_test_stuff();
1262 ftrace_event_enable_disable(call, 0);
1263
1264 pr_cont("OK\n");
1265 }
1266
1267 /* Now test at the sub system level */
1268
1269 pr_info("Running tests on trace event systems:\n");
1270
1271 list_for_each_entry(system, &event_subsystems, list) {
1272
1273 /* the ftrace system is special, skip it */
1274 if (strcmp(system->name, "ftrace") == 0)
1275 continue;
1276
1277 pr_info("Testing event system %s: ", system->name);
1278
1279 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
1280 if (WARN_ON_ONCE(ret)) {
1281 pr_warning("error enabling system %s\n",
1282 system->name);
1283 continue;
1284 }
1285
1286 event_test_stuff();
1287
1288 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
1289 if (WARN_ON_ONCE(ret))
1290 pr_warning("error disabling system %s\n",
1291 system->name);
1292
1293 pr_cont("OK\n");
1294 }
1295
1296 /* Test with all events enabled */
1297
1298 pr_info("Running tests on all trace events:\n");
1299 pr_info("Testing all events: ");
1300
1301 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
1302 if (WARN_ON_ONCE(ret)) {
1303 pr_warning("error enabling all events\n");
1304 return;
1305 }
1306
1307 event_test_stuff();
1308
1309 /* reset sysname */
1310 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
1311 if (WARN_ON_ONCE(ret)) {
1312 pr_warning("error disabling all events\n");
1313 return;
1314 }
1315
1316 pr_cont("OK\n");
1317}
1318
1319#ifdef CONFIG_FUNCTION_TRACER
1320
1321static DEFINE_PER_CPU(atomic_t, test_event_disable);
1322
1323static void
1324function_test_events_call(unsigned long ip, unsigned long parent_ip)
1325{
1326 struct ring_buffer_event *event;
1327 struct ftrace_entry *entry;
1328 unsigned long flags;
1329 long disabled;
1330 int resched;
1331 int cpu;
1332 int pc;
1333
1334 pc = preempt_count();
1335 resched = ftrace_preempt_disable();
1336 cpu = raw_smp_processor_id();
1337 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
1338
1339 if (disabled != 1)
1340 goto out;
1341
1342 local_save_flags(flags);
1343
1344 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
1345 flags, pc);
1346 if (!event)
1347 goto out;
1348 entry = ring_buffer_event_data(event);
1349 entry->ip = ip;
1350 entry->parent_ip = parent_ip;
1351
1352 trace_nowake_buffer_unlock_commit(event, flags, pc);
1353
1354 out:
1355 atomic_dec(&per_cpu(test_event_disable, cpu));
1356 ftrace_preempt_enable(resched);
1357}
1358
1359static struct ftrace_ops trace_ops __initdata =
1360{
1361 .func = function_test_events_call,
1362};
1363
1364static __init void event_trace_self_test_with_function(void)
1365{
1366 register_ftrace_function(&trace_ops);
1367 pr_info("Running tests again, along with the function tracer\n");
1368 event_trace_self_tests();
1369 unregister_ftrace_function(&trace_ops);
1370}
1371#else
1372static __init void event_trace_self_test_with_function(void)
1373{
1374}
1375#endif
1376
1377static __init int event_trace_self_tests_init(void)
1378{
1379
1380 event_trace_self_tests();
1381
1382 event_trace_self_test_with_function();
1383
1384 return 0;
1385}
1386
1387late_initcall(event_trace_self_tests_init);
1388
1389#endif
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e03cbf1e38f3..db6e54bdb596 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,119 +22,297 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/ctype.h> 24#include <linux/ctype.h>
25#include <linux/mutex.h>
25 26
26#include "trace.h" 27#include "trace.h"
27#include "trace_output.h" 28#include "trace_output.h"
28 29
29static int filter_pred_64(struct filter_pred *pred, void *event) 30static DEFINE_MUTEX(filter_mutex);
31
32enum filter_op_ids
33{
34 OP_OR,
35 OP_AND,
36 OP_NE,
37 OP_EQ,
38 OP_LT,
39 OP_LE,
40 OP_GT,
41 OP_GE,
42 OP_NONE,
43 OP_OPEN_PAREN,
44};
45
46struct filter_op {
47 int id;
48 char *string;
49 int precedence;
50};
51
52static struct filter_op filter_ops[] = {
53 { OP_OR, "||", 1 },
54 { OP_AND, "&&", 2 },
55 { OP_NE, "!=", 4 },
56 { OP_EQ, "==", 4 },
57 { OP_LT, "<", 5 },
58 { OP_LE, "<=", 5 },
59 { OP_GT, ">", 5 },
60 { OP_GE, ">=", 5 },
61 { OP_NONE, "OP_NONE", 0 },
62 { OP_OPEN_PAREN, "(", 0 },
63};
64
65enum {
66 FILT_ERR_NONE,
67 FILT_ERR_INVALID_OP,
68 FILT_ERR_UNBALANCED_PAREN,
69 FILT_ERR_TOO_MANY_OPERANDS,
70 FILT_ERR_OPERAND_TOO_LONG,
71 FILT_ERR_FIELD_NOT_FOUND,
72 FILT_ERR_ILLEGAL_FIELD_OP,
73 FILT_ERR_ILLEGAL_INTVAL,
74 FILT_ERR_BAD_SUBSYS_FILTER,
75 FILT_ERR_TOO_MANY_PREDS,
76 FILT_ERR_MISSING_FIELD,
77 FILT_ERR_INVALID_FILTER,
78};
79
80static char *err_text[] = {
81 "No error",
82 "Invalid operator",
83 "Unbalanced parens",
84 "Too many operands",
85 "Operand too long",
86 "Field not found",
87 "Illegal operation for field type",
88 "Illegal integer value",
89 "Couldn't find or set field in one of a subsystem's events",
90 "Too many terms in predicate expression",
91 "Missing field name and/or value",
92 "Meaningless filter expression",
93};
94
95struct opstack_op {
96 int op;
97 struct list_head list;
98};
99
100struct postfix_elt {
101 int op;
102 char *operand;
103 struct list_head list;
104};
105
106struct filter_parse_state {
107 struct filter_op *ops;
108 struct list_head opstack;
109 struct list_head postfix;
110 int lasterr;
111 int lasterr_pos;
112
113 struct {
114 char *string;
115 unsigned int cnt;
116 unsigned int tail;
117 } infix;
118
119 struct {
120 char string[MAX_FILTER_STR_VAL];
121 int pos;
122 unsigned int tail;
123 } operand;
124};
125
126DEFINE_COMPARISON_PRED(s64);
127DEFINE_COMPARISON_PRED(u64);
128DEFINE_COMPARISON_PRED(s32);
129DEFINE_COMPARISON_PRED(u32);
130DEFINE_COMPARISON_PRED(s16);
131DEFINE_COMPARISON_PRED(u16);
132DEFINE_COMPARISON_PRED(s8);
133DEFINE_COMPARISON_PRED(u8);
134
135DEFINE_EQUALITY_PRED(64);
136DEFINE_EQUALITY_PRED(32);
137DEFINE_EQUALITY_PRED(16);
138DEFINE_EQUALITY_PRED(8);
139
140static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
141 void *event __attribute((unused)),
142 int val1, int val2)
143{
144 return val1 && val2;
145}
146
147static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
148 void *event __attribute((unused)),
149 int val1, int val2)
150{
151 return val1 || val2;
152}
153
154/* Filter predicate for fixed sized arrays of characters */
155static int filter_pred_string(struct filter_pred *pred, void *event,
156 int val1, int val2)
30{ 157{
31 u64 *addr = (u64 *)(event + pred->offset); 158 char *addr = (char *)(event + pred->offset);
32 u64 val = (u64)pred->val; 159 int cmp, match;
33 int match; 160
161 cmp = strncmp(addr, pred->str_val, pred->str_len);
34 162
35 match = (val == *addr) ^ pred->not; 163 match = (!cmp) ^ pred->not;
36 164
37 return match; 165 return match;
38} 166}
39 167
40static int filter_pred_32(struct filter_pred *pred, void *event) 168/*
169 * Filter predicate for dynamic sized arrays of characters.
170 * These are implemented through a list of strings at the end
171 * of the entry.
172 * Also each of these strings have a field in the entry which
173 * contains its offset from the beginning of the entry.
174 * We have then first to get this field, dereference it
175 * and add it to the address of the entry, and at last we have
176 * the address of the string.
177 */
178static int filter_pred_strloc(struct filter_pred *pred, void *event,
179 int val1, int val2)
41{ 180{
42 u32 *addr = (u32 *)(event + pred->offset); 181 int str_loc = *(int *)(event + pred->offset);
43 u32 val = (u32)pred->val; 182 char *addr = (char *)(event + str_loc);
44 int match; 183 int cmp, match;
184
185 cmp = strncmp(addr, pred->str_val, pred->str_len);
45 186
46 match = (val == *addr) ^ pred->not; 187 match = (!cmp) ^ pred->not;
47 188
48 return match; 189 return match;
49} 190}
50 191
51static int filter_pred_16(struct filter_pred *pred, void *event) 192static int filter_pred_none(struct filter_pred *pred, void *event,
193 int val1, int val2)
194{
195 return 0;
196}
197
198/* return 1 if event matches, 0 otherwise (discard) */
199int filter_match_preds(struct ftrace_event_call *call, void *rec)
52{ 200{
53 u16 *addr = (u16 *)(event + pred->offset); 201 struct event_filter *filter = call->filter;
54 u16 val = (u16)pred->val; 202 int match, top = 0, val1 = 0, val2 = 0;
55 int match; 203 int stack[MAX_FILTER_PRED];
204 struct filter_pred *pred;
205 int i;
206
207 for (i = 0; i < filter->n_preds; i++) {
208 pred = filter->preds[i];
209 if (!pred->pop_n) {
210 match = pred->fn(pred, rec, val1, val2);
211 stack[top++] = match;
212 continue;
213 }
214 if (pred->pop_n > top) {
215 WARN_ON_ONCE(1);
216 return 0;
217 }
218 val1 = stack[--top];
219 val2 = stack[--top];
220 match = pred->fn(pred, rec, val1, val2);
221 stack[top++] = match;
222 }
56 223
57 match = (val == *addr) ^ pred->not; 224 return stack[--top];
225}
226EXPORT_SYMBOL_GPL(filter_match_preds);
58 227
59 return match; 228static void parse_error(struct filter_parse_state *ps, int err, int pos)
229{
230 ps->lasterr = err;
231 ps->lasterr_pos = pos;
60} 232}
61 233
62static int filter_pred_8(struct filter_pred *pred, void *event) 234static void remove_filter_string(struct event_filter *filter)
63{ 235{
64 u8 *addr = (u8 *)(event + pred->offset); 236 kfree(filter->filter_string);
65 u8 val = (u8)pred->val; 237 filter->filter_string = NULL;
66 int match; 238}
67 239
68 match = (val == *addr) ^ pred->not; 240static int replace_filter_string(struct event_filter *filter,
241 char *filter_string)
242{
243 kfree(filter->filter_string);
244 filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
245 if (!filter->filter_string)
246 return -ENOMEM;
69 247
70 return match; 248 return 0;
71} 249}
72 250
73static int filter_pred_string(struct filter_pred *pred, void *event) 251static int append_filter_string(struct event_filter *filter,
252 char *string)
74{ 253{
75 char *addr = (char *)(event + pred->offset); 254 int newlen;
76 int cmp, match; 255 char *new_filter_string;
77 256
78 cmp = strncmp(addr, pred->str_val, pred->str_len); 257 BUG_ON(!filter->filter_string);
258 newlen = strlen(filter->filter_string) + strlen(string) + 1;
259 new_filter_string = kmalloc(newlen, GFP_KERNEL);
260 if (!new_filter_string)
261 return -ENOMEM;
79 262
80 match = (!cmp) ^ pred->not; 263 strcpy(new_filter_string, filter->filter_string);
264 strcat(new_filter_string, string);
265 kfree(filter->filter_string);
266 filter->filter_string = new_filter_string;
81 267
82 return match; 268 return 0;
83} 269}
84 270
85/* return 1 if event matches, 0 otherwise (discard) */ 271static void append_filter_err(struct filter_parse_state *ps,
86int filter_match_preds(struct ftrace_event_call *call, void *rec) 272 struct event_filter *filter)
87{ 273{
88 int i, matched, and_failed = 0; 274 int pos = ps->lasterr_pos;
89 struct filter_pred *pred; 275 char *buf, *pbuf;
90 276
91 for (i = 0; i < MAX_FILTER_PRED; i++) { 277 buf = (char *)__get_free_page(GFP_TEMPORARY);
92 if (call->preds[i]) { 278 if (!buf)
93 pred = call->preds[i]; 279 return;
94 if (and_failed && !pred->or)
95 continue;
96 matched = pred->fn(pred, rec);
97 if (!matched && !pred->or) {
98 and_failed = 1;
99 continue;
100 } else if (matched && pred->or)
101 return 1;
102 } else
103 break;
104 }
105 280
106 if (and_failed) 281 append_filter_string(filter, "\n");
107 return 0; 282 memset(buf, ' ', PAGE_SIZE);
283 if (pos > PAGE_SIZE - 128)
284 pos = 0;
285 buf[pos] = '^';
286 pbuf = &buf[pos] + 1;
108 287
109 return 1; 288 sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
289 append_filter_string(filter, buf);
290 free_page((unsigned long) buf);
110} 291}
111 292
112void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) 293void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
113{ 294{
114 char *field_name; 295 struct event_filter *filter = call->filter;
115 struct filter_pred *pred;
116 int i;
117 296
118 if (!preds) { 297 mutex_lock(&filter_mutex);
298 if (filter->filter_string)
299 trace_seq_printf(s, "%s\n", filter->filter_string);
300 else
119 trace_seq_printf(s, "none\n"); 301 trace_seq_printf(s, "none\n");
120 return; 302 mutex_unlock(&filter_mutex);
121 } 303}
122 304
123 for (i = 0; i < MAX_FILTER_PRED; i++) { 305void print_subsystem_event_filter(struct event_subsystem *system,
124 if (preds[i]) { 306 struct trace_seq *s)
125 pred = preds[i]; 307{
126 field_name = pred->field_name; 308 struct event_filter *filter = system->filter;
127 if (i) 309
128 trace_seq_printf(s, pred->or ? "|| " : "&& "); 310 mutex_lock(&filter_mutex);
129 trace_seq_printf(s, "%s ", field_name); 311 if (filter->filter_string)
130 trace_seq_printf(s, pred->not ? "!= " : "== "); 312 trace_seq_printf(s, "%s\n", filter->filter_string);
131 if (pred->str_val) 313 else
132 trace_seq_printf(s, "%s\n", pred->str_val); 314 trace_seq_printf(s, "none\n");
133 else 315 mutex_unlock(&filter_mutex);
134 trace_seq_printf(s, "%llu\n", pred->val);
135 } else
136 break;
137 }
138} 316}
139 317
140static struct ftrace_event_field * 318static struct ftrace_event_field *
@@ -150,284 +328,828 @@ find_event_field(struct ftrace_event_call *call, char *name)
150 return NULL; 328 return NULL;
151} 329}
152 330
153void filter_free_pred(struct filter_pred *pred) 331static void filter_free_pred(struct filter_pred *pred)
154{ 332{
155 if (!pred) 333 if (!pred)
156 return; 334 return;
157 335
158 kfree(pred->field_name); 336 kfree(pred->field_name);
159 kfree(pred->str_val);
160 kfree(pred); 337 kfree(pred);
161} 338}
162 339
163void filter_free_preds(struct ftrace_event_call *call) 340static void filter_clear_pred(struct filter_pred *pred)
164{ 341{
165 int i; 342 kfree(pred->field_name);
343 pred->field_name = NULL;
344 pred->str_len = 0;
345}
166 346
167 if (call->preds) { 347static int filter_set_pred(struct filter_pred *dest,
168 for (i = 0; i < MAX_FILTER_PRED; i++) 348 struct filter_pred *src,
169 filter_free_pred(call->preds[i]); 349 filter_pred_fn_t fn)
170 kfree(call->preds); 350{
171 call->preds = NULL; 351 *dest = *src;
352 if (src->field_name) {
353 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
354 if (!dest->field_name)
355 return -ENOMEM;
172 } 356 }
357 dest->fn = fn;
358
359 return 0;
173} 360}
174 361
175void filter_free_subsystem_preds(struct event_subsystem *system) 362static void filter_disable_preds(struct ftrace_event_call *call)
176{ 363{
177 struct ftrace_event_call *call = __start_ftrace_events; 364 struct event_filter *filter = call->filter;
178 int i; 365 int i;
179 366
180 if (system->preds) { 367 call->filter_active = 0;
181 for (i = 0; i < MAX_FILTER_PRED; i++) 368 filter->n_preds = 0;
182 filter_free_pred(system->preds[i]);
183 kfree(system->preds);
184 system->preds = NULL;
185 }
186 369
187 events_for_each(call) { 370 for (i = 0; i < MAX_FILTER_PRED; i++)
188 if (!call->name || !call->regfunc) 371 filter->preds[i]->fn = filter_pred_none;
189 continue; 372}
373
374void destroy_preds(struct ftrace_event_call *call)
375{
376 struct event_filter *filter = call->filter;
377 int i;
190 378
191 if (!strcmp(call->system, system->name)) 379 for (i = 0; i < MAX_FILTER_PRED; i++) {
192 filter_free_preds(call); 380 if (filter->preds[i])
381 filter_free_pred(filter->preds[i]);
193 } 382 }
383 kfree(filter->preds);
384 kfree(filter);
385 call->filter = NULL;
194} 386}
195 387
196static int __filter_add_pred(struct ftrace_event_call *call, 388int init_preds(struct ftrace_event_call *call)
197 struct filter_pred *pred)
198{ 389{
390 struct event_filter *filter;
391 struct filter_pred *pred;
199 int i; 392 int i;
200 393
201 if (call->preds && !pred->compound) 394 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
202 filter_free_preds(call); 395 if (!call->filter)
396 return -ENOMEM;
203 397
204 if (!call->preds) { 398 call->filter_active = 0;
205 call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), 399 filter->n_preds = 0;
206 GFP_KERNEL); 400
207 if (!call->preds) 401 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
208 return -ENOMEM; 402 if (!filter->preds)
209 } 403 goto oom;
210 404
211 for (i = 0; i < MAX_FILTER_PRED; i++) { 405 for (i = 0; i < MAX_FILTER_PRED; i++) {
212 if (!call->preds[i]) { 406 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
213 call->preds[i] = pred; 407 if (!pred)
214 return 0; 408 goto oom;
409 pred->fn = filter_pred_none;
410 filter->preds[i] = pred;
411 }
412
413 return 0;
414
415oom:
416 destroy_preds(call);
417
418 return -ENOMEM;
419}
420EXPORT_SYMBOL_GPL(init_preds);
421
422static void filter_free_subsystem_preds(struct event_subsystem *system)
423{
424 struct event_filter *filter = system->filter;
425 struct ftrace_event_call *call;
426 int i;
427
428 if (filter->n_preds) {
429 for (i = 0; i < filter->n_preds; i++)
430 filter_free_pred(filter->preds[i]);
431 kfree(filter->preds);
432 filter->preds = NULL;
433 filter->n_preds = 0;
434 }
435
436 mutex_lock(&event_mutex);
437 list_for_each_entry(call, &ftrace_events, list) {
438 if (!call->define_fields)
439 continue;
440
441 if (!strcmp(call->system, system->name)) {
442 filter_disable_preds(call);
443 remove_filter_string(call->filter);
215 } 444 }
216 } 445 }
446 mutex_unlock(&event_mutex);
447}
448
449static int filter_add_pred_fn(struct filter_parse_state *ps,
450 struct ftrace_event_call *call,
451 struct filter_pred *pred,
452 filter_pred_fn_t fn)
453{
454 struct event_filter *filter = call->filter;
455 int idx, err;
456
457 if (filter->n_preds == MAX_FILTER_PRED) {
458 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
459 return -ENOSPC;
460 }
461
462 idx = filter->n_preds;
463 filter_clear_pred(filter->preds[idx]);
464 err = filter_set_pred(filter->preds[idx], pred, fn);
465 if (err)
466 return err;
217 467
218 return -ENOSPC; 468 filter->n_preds++;
469 call->filter_active = 1;
470
471 return 0;
219} 472}
220 473
474enum {
475 FILTER_STATIC_STRING = 1,
476 FILTER_DYN_STRING
477};
478
221static int is_string_field(const char *type) 479static int is_string_field(const char *type)
222{ 480{
481 if (strstr(type, "__data_loc") && strstr(type, "char"))
482 return FILTER_DYN_STRING;
483
223 if (strchr(type, '[') && strstr(type, "char")) 484 if (strchr(type, '[') && strstr(type, "char"))
224 return 1; 485 return FILTER_STATIC_STRING;
225 486
226 return 0; 487 return 0;
227} 488}
228 489
229int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) 490static int is_legal_op(struct ftrace_event_field *field, int op)
230{ 491{
231 struct ftrace_event_field *field; 492 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
232 493 return 0;
233 field = find_event_field(call, pred->field_name);
234 if (!field)
235 return -EINVAL;
236 494
237 pred->offset = field->offset; 495 return 1;
496}
238 497
239 if (is_string_field(field->type)) { 498static filter_pred_fn_t select_comparison_fn(int op, int field_size,
240 if (!pred->str_val) 499 int field_is_signed)
241 return -EINVAL; 500{
242 pred->fn = filter_pred_string; 501 filter_pred_fn_t fn = NULL;
243 pred->str_len = field->size;
244 return __filter_add_pred(call, pred);
245 } else {
246 if (pred->str_val)
247 return -EINVAL;
248 }
249 502
250 switch (field->size) { 503 switch (field_size) {
251 case 8: 504 case 8:
252 pred->fn = filter_pred_64; 505 if (op == OP_EQ || op == OP_NE)
506 fn = filter_pred_64;
507 else if (field_is_signed)
508 fn = filter_pred_s64;
509 else
510 fn = filter_pred_u64;
253 break; 511 break;
254 case 4: 512 case 4:
255 pred->fn = filter_pred_32; 513 if (op == OP_EQ || op == OP_NE)
514 fn = filter_pred_32;
515 else if (field_is_signed)
516 fn = filter_pred_s32;
517 else
518 fn = filter_pred_u32;
256 break; 519 break;
257 case 2: 520 case 2:
258 pred->fn = filter_pred_16; 521 if (op == OP_EQ || op == OP_NE)
522 fn = filter_pred_16;
523 else if (field_is_signed)
524 fn = filter_pred_s16;
525 else
526 fn = filter_pred_u16;
259 break; 527 break;
260 case 1: 528 case 1:
261 pred->fn = filter_pred_8; 529 if (op == OP_EQ || op == OP_NE)
530 fn = filter_pred_8;
531 else if (field_is_signed)
532 fn = filter_pred_s8;
533 else
534 fn = filter_pred_u8;
262 break; 535 break;
263 default:
264 return -EINVAL;
265 } 536 }
266 537
267 return __filter_add_pred(call, pred); 538 return fn;
268} 539}
269 540
270static struct filter_pred *copy_pred(struct filter_pred *pred) 541static int filter_add_pred(struct filter_parse_state *ps,
542 struct ftrace_event_call *call,
543 struct filter_pred *pred)
271{ 544{
272 struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); 545 struct ftrace_event_field *field;
273 if (!new_pred) 546 filter_pred_fn_t fn;
274 return NULL; 547 unsigned long long val;
548 int string_type;
549
550 pred->fn = filter_pred_none;
551
552 if (pred->op == OP_AND) {
553 pred->pop_n = 2;
554 return filter_add_pred_fn(ps, call, pred, filter_pred_and);
555 } else if (pred->op == OP_OR) {
556 pred->pop_n = 2;
557 return filter_add_pred_fn(ps, call, pred, filter_pred_or);
558 }
559
560 field = find_event_field(call, pred->field_name);
561 if (!field) {
562 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
563 return -EINVAL;
564 }
275 565
276 memcpy(new_pred, pred, sizeof(*pred)); 566 pred->offset = field->offset;
277 567
278 if (pred->field_name) { 568 if (!is_legal_op(field, pred->op)) {
279 new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); 569 parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
280 if (!new_pred->field_name) { 570 return -EINVAL;
281 kfree(new_pred);
282 return NULL;
283 }
284 } 571 }
285 572
286 if (pred->str_val) { 573 string_type = is_string_field(field->type);
287 new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); 574 if (string_type) {
288 if (!new_pred->str_val) { 575 if (string_type == FILTER_STATIC_STRING)
289 filter_free_pred(new_pred); 576 fn = filter_pred_string;
290 return NULL; 577 else
578 fn = filter_pred_strloc;
579 pred->str_len = field->size;
580 if (pred->op == OP_NE)
581 pred->not = 1;
582 return filter_add_pred_fn(ps, call, pred, fn);
583 } else {
584 if (strict_strtoull(pred->str_val, 0, &val)) {
585 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
586 return -EINVAL;
291 } 587 }
588 pred->val = val;
589 }
590
591 fn = select_comparison_fn(pred->op, field->size, field->is_signed);
592 if (!fn) {
593 parse_error(ps, FILT_ERR_INVALID_OP, 0);
594 return -EINVAL;
292 } 595 }
293 596
294 return new_pred; 597 if (pred->op == OP_NE)
598 pred->not = 1;
599
600 return filter_add_pred_fn(ps, call, pred, fn);
295} 601}
296 602
297int filter_add_subsystem_pred(struct event_subsystem *system, 603static int filter_add_subsystem_pred(struct filter_parse_state *ps,
298 struct filter_pred *pred) 604 struct event_subsystem *system,
605 struct filter_pred *pred,
606 char *filter_string)
299{ 607{
300 struct ftrace_event_call *call = __start_ftrace_events; 608 struct event_filter *filter = system->filter;
301 struct filter_pred *event_pred; 609 struct ftrace_event_call *call;
302 int i; 610 int err = 0;
303
304 if (system->preds && !pred->compound)
305 filter_free_subsystem_preds(system);
306 611
307 if (!system->preds) { 612 if (!filter->preds) {
308 system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), 613 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
309 GFP_KERNEL); 614 GFP_KERNEL);
310 if (!system->preds) 615
616 if (!filter->preds)
311 return -ENOMEM; 617 return -ENOMEM;
312 } 618 }
313 619
314 for (i = 0; i < MAX_FILTER_PRED; i++) { 620 if (filter->n_preds == MAX_FILTER_PRED) {
315 if (!system->preds[i]) { 621 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
316 system->preds[i] = pred; 622 return -ENOSPC;
317 break;
318 }
319 } 623 }
320 624
321 if (i == MAX_FILTER_PRED) 625 filter->preds[filter->n_preds] = pred;
322 return -ENOSPC; 626 filter->n_preds++;
323 627
324 events_for_each(call) { 628 mutex_lock(&event_mutex);
325 int err; 629 list_for_each_entry(call, &ftrace_events, list) {
326 630
327 if (!call->name || !call->regfunc) 631 if (!call->define_fields)
328 continue; 632 continue;
329 633
330 if (strcmp(call->system, system->name)) 634 if (strcmp(call->system, system->name))
331 continue; 635 continue;
332 636
333 if (!find_event_field(call, pred->field_name)) 637 err = filter_add_pred(ps, call, pred);
334 continue; 638 if (err) {
639 mutex_unlock(&event_mutex);
640 filter_free_subsystem_preds(system);
641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
642 goto out;
643 }
644 replace_filter_string(call->filter, filter_string);
645 }
646 mutex_unlock(&event_mutex);
647out:
648 return err;
649}
335 650
336 event_pred = copy_pred(pred); 651static void parse_init(struct filter_parse_state *ps,
337 if (!event_pred) 652 struct filter_op *ops,
338 goto oom; 653 char *infix_string)
654{
655 memset(ps, '\0', sizeof(*ps));
339 656
340 err = filter_add_pred(call, event_pred); 657 ps->infix.string = infix_string;
341 if (err) 658 ps->infix.cnt = strlen(infix_string);
342 filter_free_pred(event_pred); 659 ps->ops = ops;
343 if (err == -ENOMEM) 660
344 goto oom; 661 INIT_LIST_HEAD(&ps->opstack);
662 INIT_LIST_HEAD(&ps->postfix);
663}
664
665static char infix_next(struct filter_parse_state *ps)
666{
667 ps->infix.cnt--;
668
669 return ps->infix.string[ps->infix.tail++];
670}
671
672static char infix_peek(struct filter_parse_state *ps)
673{
674 if (ps->infix.tail == strlen(ps->infix.string))
675 return 0;
676
677 return ps->infix.string[ps->infix.tail];
678}
679
680static void infix_advance(struct filter_parse_state *ps)
681{
682 ps->infix.cnt--;
683 ps->infix.tail++;
684}
685
686static inline int is_precedence_lower(struct filter_parse_state *ps,
687 int a, int b)
688{
689 return ps->ops[a].precedence < ps->ops[b].precedence;
690}
691
692static inline int is_op_char(struct filter_parse_state *ps, char c)
693{
694 int i;
695
696 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
697 if (ps->ops[i].string[0] == c)
698 return 1;
345 } 699 }
346 700
347 return 0; 701 return 0;
702}
348 703
349oom: 704static int infix_get_op(struct filter_parse_state *ps, char firstc)
350 system->preds[i] = NULL; 705{
351 return -ENOMEM; 706 char nextc = infix_peek(ps);
707 char opstr[3];
708 int i;
709
710 opstr[0] = firstc;
711 opstr[1] = nextc;
712 opstr[2] = '\0';
713
714 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
715 if (!strcmp(opstr, ps->ops[i].string)) {
716 infix_advance(ps);
717 return ps->ops[i].id;
718 }
719 }
720
721 opstr[1] = '\0';
722
723 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
724 if (!strcmp(opstr, ps->ops[i].string))
725 return ps->ops[i].id;
726 }
727
728 return OP_NONE;
352} 729}
353 730
354int filter_parse(char **pbuf, struct filter_pred *pred) 731static inline void clear_operand_string(struct filter_parse_state *ps)
355{ 732{
356 char *tmp, *tok, *val_str = NULL; 733 memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
357 int tok_n = 0; 734 ps->operand.tail = 0;
735}
358 736
359 /* field ==/!= number, or/and field ==/!= number, number */ 737static inline int append_operand_char(struct filter_parse_state *ps, char c)
360 while ((tok = strsep(pbuf, " \n"))) { 738{
361 if (tok_n == 0) { 739 if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
362 if (!strcmp(tok, "0")) { 740 return -EINVAL;
363 pred->clear = 1; 741
364 return 0; 742 ps->operand.string[ps->operand.tail++] = c;
365 } else if (!strcmp(tok, "&&")) { 743
366 pred->or = 0; 744 return 0;
367 pred->compound = 1; 745}
368 } else if (!strcmp(tok, "||")) { 746
369 pred->or = 1; 747static int filter_opstack_push(struct filter_parse_state *ps, int op)
370 pred->compound = 1; 748{
371 } else 749 struct opstack_op *opstack_op;
372 pred->field_name = tok; 750
373 tok_n = 1; 751 opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
752 if (!opstack_op)
753 return -ENOMEM;
754
755 opstack_op->op = op;
756 list_add(&opstack_op->list, &ps->opstack);
757
758 return 0;
759}
760
761static int filter_opstack_empty(struct filter_parse_state *ps)
762{
763 return list_empty(&ps->opstack);
764}
765
766static int filter_opstack_top(struct filter_parse_state *ps)
767{
768 struct opstack_op *opstack_op;
769
770 if (filter_opstack_empty(ps))
771 return OP_NONE;
772
773 opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
774
775 return opstack_op->op;
776}
777
778static int filter_opstack_pop(struct filter_parse_state *ps)
779{
780 struct opstack_op *opstack_op;
781 int op;
782
783 if (filter_opstack_empty(ps))
784 return OP_NONE;
785
786 opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
787 op = opstack_op->op;
788 list_del(&opstack_op->list);
789
790 kfree(opstack_op);
791
792 return op;
793}
794
/* Drain the operator stack, popping (and thereby freeing) every entry. */
static void filter_opstack_clear(struct filter_parse_state *ps)
{
	for (;;) {
		if (filter_opstack_empty(ps))
			break;
		filter_opstack_pop(ps);
	}
}
800
801static char *curr_operand(struct filter_parse_state *ps)
802{
803 return ps->operand.string;
804}
805
806static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
807{
808 struct postfix_elt *elt;
809
810 elt = kmalloc(sizeof(*elt), GFP_KERNEL);
811 if (!elt)
812 return -ENOMEM;
813
814 elt->op = OP_NONE;
815 elt->operand = kstrdup(operand, GFP_KERNEL);
816 if (!elt->operand) {
817 kfree(elt);
818 return -ENOMEM;
819 }
820
821 list_add_tail(&elt->list, &ps->postfix);
822
823 return 0;
824}
825
826static int postfix_append_op(struct filter_parse_state *ps, int op)
827{
828 struct postfix_elt *elt;
829
830 elt = kmalloc(sizeof(*elt), GFP_KERNEL);
831 if (!elt)
832 return -ENOMEM;
833
834 elt->op = op;
835 elt->operand = NULL;
836
837 list_add_tail(&elt->list, &ps->postfix);
838
839 return 0;
840}
841
842static void postfix_clear(struct filter_parse_state *ps)
843{
844 struct postfix_elt *elt;
845
846 while (!list_empty(&ps->postfix)) {
847 elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
848 kfree(elt->operand);
849 list_del(&elt->list);
850 }
851}
852
853static int filter_parse(struct filter_parse_state *ps)
854{
855 int in_string = 0;
856 int op, top_op;
857 char ch;
858
859 while ((ch = infix_next(ps))) {
860 if (ch == '"') {
861 in_string ^= 1;
374 continue; 862 continue;
375 } 863 }
376 if (tok_n == 1) { 864
377 if (!pred->field_name) 865 if (in_string)
378 pred->field_name = tok; 866 goto parse_operand;
379 else if (!strcmp(tok, "!=")) 867
380 pred->not = 1; 868 if (isspace(ch))
381 else if (!strcmp(tok, "==")) 869 continue;
382 pred->not = 0; 870
383 else { 871 if (is_op_char(ps, ch)) {
384 pred->field_name = NULL; 872 op = infix_get_op(ps, ch);
873 if (op == OP_NONE) {
874 parse_error(ps, FILT_ERR_INVALID_OP, 0);
385 return -EINVAL; 875 return -EINVAL;
386 } 876 }
387 tok_n = 2; 877
878 if (strlen(curr_operand(ps))) {
879 postfix_append_operand(ps, curr_operand(ps));
880 clear_operand_string(ps);
881 }
882
883 while (!filter_opstack_empty(ps)) {
884 top_op = filter_opstack_top(ps);
885 if (!is_precedence_lower(ps, top_op, op)) {
886 top_op = filter_opstack_pop(ps);
887 postfix_append_op(ps, top_op);
888 continue;
889 }
890 break;
891 }
892
893 filter_opstack_push(ps, op);
388 continue; 894 continue;
389 } 895 }
390 if (tok_n == 2) { 896
391 if (pred->compound) { 897 if (ch == '(') {
392 if (!strcmp(tok, "!=")) 898 filter_opstack_push(ps, OP_OPEN_PAREN);
393 pred->not = 1; 899 continue;
394 else if (!strcmp(tok, "==")) 900 }
395 pred->not = 0; 901
396 else { 902 if (ch == ')') {
397 pred->field_name = NULL; 903 if (strlen(curr_operand(ps))) {
398 return -EINVAL; 904 postfix_append_operand(ps, curr_operand(ps));
399 } 905 clear_operand_string(ps);
400 } else { 906 }
401 val_str = tok; 907
402 break; /* done */ 908 top_op = filter_opstack_pop(ps);
909 while (top_op != OP_NONE) {
910 if (top_op == OP_OPEN_PAREN)
911 break;
912 postfix_append_op(ps, top_op);
913 top_op = filter_opstack_pop(ps);
914 }
915 if (top_op == OP_NONE) {
916 parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
917 return -EINVAL;
403 } 918 }
404 tok_n = 3;
405 continue; 919 continue;
406 } 920 }
407 if (tok_n == 3) { 921parse_operand:
408 val_str = tok; 922 if (append_operand_char(ps, ch)) {
409 break; /* done */ 923 parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
924 return -EINVAL;
410 } 925 }
411 } 926 }
412 927
413 if (!val_str) { 928 if (strlen(curr_operand(ps)))
414 pred->field_name = NULL; 929 postfix_append_operand(ps, curr_operand(ps));
415 return -EINVAL; 930
931 while (!filter_opstack_empty(ps)) {
932 top_op = filter_opstack_pop(ps);
933 if (top_op == OP_NONE)
934 break;
935 if (top_op == OP_OPEN_PAREN) {
936 parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
937 return -EINVAL;
938 }
939 postfix_append_op(ps, top_op);
416 } 940 }
417 941
418 pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); 942 return 0;
419 if (!pred->field_name) 943}
420 return -ENOMEM;
421 944
422 pred->val = simple_strtoull(val_str, &tmp, 0); 945static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
423 if (tmp == val_str) { 946{
424 pred->str_val = kstrdup(val_str, GFP_KERNEL); 947 struct filter_pred *pred;
425 if (!pred->str_val) 948
426 return -ENOMEM; 949 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
427 } else if (*tmp != '\0') 950 if (!pred)
951 return NULL;
952
953 pred->field_name = kstrdup(operand1, GFP_KERNEL);
954 if (!pred->field_name) {
955 kfree(pred);
956 return NULL;
957 }
958
959 strcpy(pred->str_val, operand2);
960 pred->str_len = strlen(operand2);
961
962 pred->op = op;
963
964 return pred;
965}
966
967static struct filter_pred *create_logical_pred(int op)
968{
969 struct filter_pred *pred;
970
971 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
972 if (!pred)
973 return NULL;
974
975 pred->op = op;
976
977 return pred;
978}
979
980static int check_preds(struct filter_parse_state *ps)
981{
982 int n_normal_preds = 0, n_logical_preds = 0;
983 struct postfix_elt *elt;
984
985 list_for_each_entry(elt, &ps->postfix, list) {
986 if (elt->op == OP_NONE)
987 continue;
988
989 if (elt->op == OP_AND || elt->op == OP_OR) {
990 n_logical_preds++;
991 continue;
992 }
993 n_normal_preds++;
994 }
995
996 if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
997 parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
428 return -EINVAL; 998 return -EINVAL;
999 }
429 1000
430 return 0; 1001 return 0;
431} 1002}
432 1003
1004static int replace_preds(struct event_subsystem *system,
1005 struct ftrace_event_call *call,
1006 struct filter_parse_state *ps,
1007 char *filter_string)
1008{
1009 char *operand1 = NULL, *operand2 = NULL;
1010 struct filter_pred *pred;
1011 struct postfix_elt *elt;
1012 int err;
1013
1014 err = check_preds(ps);
1015 if (err)
1016 return err;
1017
1018 list_for_each_entry(elt, &ps->postfix, list) {
1019 if (elt->op == OP_NONE) {
1020 if (!operand1)
1021 operand1 = elt->operand;
1022 else if (!operand2)
1023 operand2 = elt->operand;
1024 else {
1025 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1026 return -EINVAL;
1027 }
1028 continue;
1029 }
1030
1031 if (elt->op == OP_AND || elt->op == OP_OR) {
1032 pred = create_logical_pred(elt->op);
1033 if (call) {
1034 err = filter_add_pred(ps, call, pred);
1035 filter_free_pred(pred);
1036 } else
1037 err = filter_add_subsystem_pred(ps, system,
1038 pred, filter_string);
1039 if (err)
1040 return err;
1041
1042 operand1 = operand2 = NULL;
1043 continue;
1044 }
1045
1046 if (!operand1 || !operand2) {
1047 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1048 return -EINVAL;
1049 }
1050
1051 pred = create_pred(elt->op, operand1, operand2);
1052 if (call) {
1053 err = filter_add_pred(ps, call, pred);
1054 filter_free_pred(pred);
1055 } else
1056 err = filter_add_subsystem_pred(ps, system, pred,
1057 filter_string);
1058 if (err)
1059 return err;
1060
1061 operand1 = operand2 = NULL;
1062 }
1063
1064 return 0;
1065}
1066
1067int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1068{
1069 int err;
1070
1071 struct filter_parse_state *ps;
1072
1073 mutex_lock(&filter_mutex);
1074
1075 if (!strcmp(strstrip(filter_string), "0")) {
1076 filter_disable_preds(call);
1077 remove_filter_string(call->filter);
1078 mutex_unlock(&filter_mutex);
1079 return 0;
1080 }
1081
1082 err = -ENOMEM;
1083 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1084 if (!ps)
1085 goto out_unlock;
1086
1087 filter_disable_preds(call);
1088 replace_filter_string(call->filter, filter_string);
1089
1090 parse_init(ps, filter_ops, filter_string);
1091 err = filter_parse(ps);
1092 if (err) {
1093 append_filter_err(ps, call->filter);
1094 goto out;
1095 }
1096
1097 err = replace_preds(NULL, call, ps, filter_string);
1098 if (err)
1099 append_filter_err(ps, call->filter);
1100
1101out:
1102 filter_opstack_clear(ps);
1103 postfix_clear(ps);
1104 kfree(ps);
1105out_unlock:
1106 mutex_unlock(&filter_mutex);
1107
1108 return err;
1109}
1110
1111int apply_subsystem_event_filter(struct event_subsystem *system,
1112 char *filter_string)
1113{
1114 int err;
1115
1116 struct filter_parse_state *ps;
1117
1118 mutex_lock(&filter_mutex);
1119
1120 if (!strcmp(strstrip(filter_string), "0")) {
1121 filter_free_subsystem_preds(system);
1122 remove_filter_string(system->filter);
1123 mutex_unlock(&filter_mutex);
1124 return 0;
1125 }
1126
1127 err = -ENOMEM;
1128 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1129 if (!ps)
1130 goto out_unlock;
1131
1132 filter_free_subsystem_preds(system);
1133 replace_filter_string(system->filter, filter_string);
1134
1135 parse_init(ps, filter_ops, filter_string);
1136 err = filter_parse(ps);
1137 if (err) {
1138 append_filter_err(ps, system->filter);
1139 goto out;
1140 }
1141
1142 err = replace_preds(system, NULL, ps, filter_string);
1143 if (err)
1144 append_filter_err(ps, system->filter);
1145
1146out:
1147 filter_opstack_clear(ps);
1148 postfix_clear(ps);
1149 kfree(ps);
1150out_unlock:
1151 mutex_unlock(&filter_mutex);
1152
1153 return err;
1154}
433 1155
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
deleted file mode 100644
index 38985f9b379c..000000000000
--- a/kernel/trace/trace_events_stage_1.h
+++ /dev/null
@@ -1,39 +0,0 @@
1/*
2 * Stage 1 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * struct ftrace_raw_<call> {
7 * struct trace_entry ent;
8 * <type> <item>;
9 * <type2> <item2>[<len>];
10 * [...]
11 * };
12 *
13 * The <type> <item> is created by the __field(type, item) macro or
14 * the __array(type2, item2, len) macro.
15 * We simply do "type item;", and that will create the fields
16 * in the structure.
17 */
18
19#undef TRACE_FORMAT
20#define TRACE_FORMAT(call, proto, args, fmt)
21
22#undef __array
23#define __array(type, item, len) type item[len];
24
25#undef __field
26#define __field(type, item) type item;
27
28#undef TP_STRUCT__entry
29#define TP_STRUCT__entry(args...) args
30
31#undef TRACE_EVENT
32#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
33 struct ftrace_raw_##name { \
34 struct trace_entry ent; \
35 tstruct \
36 }; \
37 static struct ftrace_event_call event_##name
38
39#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
deleted file mode 100644
index d363c6672c6c..000000000000
--- a/kernel/trace/trace_events_stage_2.h
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Stage 2 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * enum print_line_t
7 * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
8 * {
9 * struct trace_seq *s = &iter->seq;
10 * struct ftrace_raw_<call> *field; <-- defined in stage 1
11 * struct trace_entry *entry;
12 * int ret;
13 *
14 * entry = iter->ent;
15 *
16 * if (entry->type != event_<call>.id) {
17 * WARN_ON_ONCE(1);
18 * return TRACE_TYPE_UNHANDLED;
19 * }
20 *
21 * field = (typeof(field))entry;
22 *
23 * ret = trace_seq_printf(s, <TP_printk> "\n");
24 * if (!ret)
25 * return TRACE_TYPE_PARTIAL_LINE;
26 *
27 * return TRACE_TYPE_HANDLED;
28 * }
29 *
30 * This is the method used to print the raw event to the trace
31 * output format. Note, this is not needed if the data is read
32 * in binary.
33 */
34
35#undef __entry
36#define __entry field
37
38#undef TP_printk
39#define TP_printk(fmt, args...) fmt "\n", args
40
41#undef TRACE_EVENT
42#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
43enum print_line_t \
44ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
45{ \
46 struct trace_seq *s = &iter->seq; \
47 struct ftrace_raw_##call *field; \
48 struct trace_entry *entry; \
49 int ret; \
50 \
51 entry = iter->ent; \
52 \
53 if (entry->type != event_##call.id) { \
54 WARN_ON_ONCE(1); \
55 return TRACE_TYPE_UNHANDLED; \
56 } \
57 \
58 field = (typeof(field))entry; \
59 \
60 ret = trace_seq_printf(s, #call ": " print); \
61 if (!ret) \
62 return TRACE_TYPE_PARTIAL_LINE; \
63 \
64 return TRACE_TYPE_HANDLED; \
65}
66
67#include <trace/trace_event_types.h>
68
69/*
70 * Setup the showing format of trace point.
71 *
72 * int
73 * ftrace_format_##call(struct trace_seq *s)
74 * {
75 * struct ftrace_raw_##call field;
76 * int ret;
77 *
78 * ret = trace_seq_printf(s, #type " " #item ";"
79 * " offset:%u; size:%u;\n",
80 * offsetof(struct ftrace_raw_##call, item),
81 * sizeof(field.type));
82 *
83 * }
84 */
85
86#undef TP_STRUCT__entry
87#define TP_STRUCT__entry(args...) args
88
89#undef __field
90#define __field(type, item) \
91 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
92 "offset:%u;\tsize:%u;\n", \
93 (unsigned int)offsetof(typeof(field), item), \
94 (unsigned int)sizeof(field.item)); \
95 if (!ret) \
96 return 0;
97
98#undef __array
99#define __array(type, item, len) \
100 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
101 "offset:%u;\tsize:%u;\n", \
102 (unsigned int)offsetof(typeof(field), item), \
103 (unsigned int)sizeof(field.item)); \
104 if (!ret) \
105 return 0;
106
107#undef __entry
108#define __entry REC
109
110#undef TP_printk
111#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
112
113#undef TP_fast_assign
114#define TP_fast_assign(args...) args
115
116#undef TRACE_EVENT
117#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
118static int \
119ftrace_format_##call(struct trace_seq *s) \
120{ \
121 struct ftrace_raw_##call field; \
122 int ret; \
123 \
124 tstruct; \
125 \
126 trace_seq_printf(s, "\nprint fmt: " print); \
127 \
128 return ret; \
129}
130
131#include <trace/trace_event_types.h>
132
133#undef __field
134#define __field(type, item) \
135 ret = trace_define_field(event_call, #type, #item, \
136 offsetof(typeof(field), item), \
137 sizeof(field.item)); \
138 if (ret) \
139 return ret;
140
141#undef __array
142#define __array(type, item, len) \
143 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
144 offsetof(typeof(field), item), \
145 sizeof(field.item)); \
146 if (ret) \
147 return ret;
148
149#define __common_field(type, item) \
150 ret = trace_define_field(event_call, #type, "common_" #item, \
151 offsetof(typeof(field.ent), item), \
152 sizeof(field.ent.item)); \
153 if (ret) \
154 return ret;
155
156#undef TRACE_EVENT
157#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
158int \
159ftrace_define_fields_##call(void) \
160{ \
161 struct ftrace_raw_##call field; \
162 struct ftrace_event_call *event_call = &event_##call; \
163 int ret; \
164 \
165 __common_field(unsigned char, type); \
166 __common_field(unsigned char, flags); \
167 __common_field(unsigned char, preempt_count); \
168 __common_field(int, pid); \
169 __common_field(int, tgid); \
170 \
171 tstruct; \
172 \
173 return ret; \
174}
175
176#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
deleted file mode 100644
index 9d2fa78cecca..000000000000
--- a/kernel/trace/trace_events_stage_3.h
+++ /dev/null
@@ -1,281 +0,0 @@
1/*
2 * Stage 3 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * static void ftrace_event_<call>(proto)
7 * {
8 * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
9 * }
10 *
11 * static int ftrace_reg_event_<call>(void)
12 * {
13 * int ret;
14 *
15 * ret = register_trace_<call>(ftrace_event_<call>);
16 * if (ret)
17 * pr_info("event trace: Could not activate trace point "
18 * "probe to <call>");
19 * return ret;
20 * }
21 *
22 * static void ftrace_unreg_event_<call>(void)
23 * {
24 * unregister_trace_<call>(ftrace_event_<call>);
25 * }
26 *
27 * For those macros defined with TRACE_FORMAT:
28 *
29 * static struct ftrace_event_call __used
30 * __attribute__((__aligned__(4)))
31 * __attribute__((section("_ftrace_events"))) event_<call> = {
32 * .name = "<call>",
33 * .regfunc = ftrace_reg_event_<call>,
34 * .unregfunc = ftrace_unreg_event_<call>,
35 * }
36 *
37 *
38 * For those macros defined with TRACE_EVENT:
39 *
40 * static struct ftrace_event_call event_<call>;
41 *
42 * static void ftrace_raw_event_<call>(proto)
43 * {
44 * struct ring_buffer_event *event;
45 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
46 * unsigned long irq_flags;
47 * int pc;
48 *
49 * local_save_flags(irq_flags);
50 * pc = preempt_count();
51 *
52 * event = trace_current_buffer_lock_reserve(event_<call>.id,
53 * sizeof(struct ftrace_raw_<call>),
54 * irq_flags, pc);
55 * if (!event)
56 * return;
57 * entry = ring_buffer_event_data(event);
58 *
59 * <assign>; <-- Here we assign the entries by the __field and
60 * __array macros.
61 *
62 * trace_current_buffer_unlock_commit(event, irq_flags, pc);
63 * }
64 *
65 * static int ftrace_raw_reg_event_<call>(void)
66 * {
67 * int ret;
68 *
69 * ret = register_trace_<call>(ftrace_raw_event_<call>);
70 * if (ret)
71 * pr_info("event trace: Could not activate trace point "
72 * "probe to <call>");
73 * return ret;
74 * }
75 *
76 * static void ftrace_unreg_event_<call>(void)
77 * {
78 * unregister_trace_<call>(ftrace_raw_event_<call>);
79 * }
80 *
81 * static struct trace_event ftrace_event_type_<call> = {
82 * .trace = ftrace_raw_output_<call>, <-- stage 2
83 * };
84 *
85 * static int ftrace_raw_init_event_<call>(void)
86 * {
87 * int id;
88 *
89 * id = register_ftrace_event(&ftrace_event_type_<call>);
90 * if (!id)
91 * return -ENODEV;
92 * event_<call>.id = id;
93 * return 0;
94 * }
95 *
96 * static struct ftrace_event_call __used
97 * __attribute__((__aligned__(4)))
98 * __attribute__((section("_ftrace_events"))) event_<call> = {
99 * .name = "<call>",
100 * .system = "<system>",
101 * .raw_init = ftrace_raw_init_event_<call>,
102 * .regfunc = ftrace_reg_event_<call>,
103 * .unregfunc = ftrace_unreg_event_<call>,
104 * .show_format = ftrace_format_<call>,
105 * }
106 *
107 */
108
109#undef TP_FMT
110#define TP_FMT(fmt, args...) fmt "\n", ##args
111
112#ifdef CONFIG_EVENT_PROFILE
113#define _TRACE_PROFILE(call, proto, args) \
114static void ftrace_profile_##call(proto) \
115{ \
116 extern void perf_tpcounter_event(int); \
117 perf_tpcounter_event(event_##call.id); \
118} \
119 \
120static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
121{ \
122 int ret = 0; \
123 \
124 if (!atomic_inc_return(&call->profile_count)) \
125 ret = register_trace_##call(ftrace_profile_##call); \
126 \
127 return ret; \
128} \
129 \
130static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
131{ \
132 if (atomic_add_negative(-1, &call->profile_count)) \
133 unregister_trace_##call(ftrace_profile_##call); \
134}
135
136#define _TRACE_PROFILE_INIT(call) \
137 .profile_count = ATOMIC_INIT(-1), \
138 .profile_enable = ftrace_profile_enable_##call, \
139 .profile_disable = ftrace_profile_disable_##call,
140
141#else
142#define _TRACE_PROFILE(call, proto, args)
143#define _TRACE_PROFILE_INIT(call)
144#endif
145
146#define _TRACE_FORMAT(call, proto, args, fmt) \
147static void ftrace_event_##call(proto) \
148{ \
149 event_trace_printk(_RET_IP_, #call ": " fmt); \
150} \
151 \
152static int ftrace_reg_event_##call(void) \
153{ \
154 int ret; \
155 \
156 ret = register_trace_##call(ftrace_event_##call); \
157 if (ret) \
158 pr_info("event trace: Could not activate trace point " \
159 "probe to " #call "\n"); \
160 return ret; \
161} \
162 \
163static void ftrace_unreg_event_##call(void) \
164{ \
165 unregister_trace_##call(ftrace_event_##call); \
166} \
167 \
168static struct ftrace_event_call event_##call; \
169 \
170static int ftrace_init_event_##call(void) \
171{ \
172 int id; \
173 \
174 id = register_ftrace_event(NULL); \
175 if (!id) \
176 return -ENODEV; \
177 event_##call.id = id; \
178 return 0; \
179}
180
181#undef TRACE_FORMAT
182#define TRACE_FORMAT(call, proto, args, fmt) \
183_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \
184_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
185static struct ftrace_event_call __used \
186__attribute__((__aligned__(4))) \
187__attribute__((section("_ftrace_events"))) event_##call = { \
188 .name = #call, \
189 .system = __stringify(TRACE_SYSTEM), \
190 .raw_init = ftrace_init_event_##call, \
191 .regfunc = ftrace_reg_event_##call, \
192 .unregfunc = ftrace_unreg_event_##call, \
193 _TRACE_PROFILE_INIT(call) \
194}
195
196#undef __entry
197#define __entry entry
198
199#undef TRACE_EVENT
200#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
201_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
202 \
203static struct ftrace_event_call event_##call; \
204 \
205static void ftrace_raw_event_##call(proto) \
206{ \
207 struct ftrace_event_call *call = &event_##call; \
208 struct ring_buffer_event *event; \
209 struct ftrace_raw_##call *entry; \
210 unsigned long irq_flags; \
211 int pc; \
212 \
213 local_save_flags(irq_flags); \
214 pc = preempt_count(); \
215 \
216 event = trace_current_buffer_lock_reserve(event_##call.id, \
217 sizeof(struct ftrace_raw_##call), \
218 irq_flags, pc); \
219 if (!event) \
220 return; \
221 entry = ring_buffer_event_data(event); \
222 \
223 assign; \
224 \
225 if (call->preds && !filter_match_preds(call, entry)) \
226 ring_buffer_event_discard(event); \
227 \
228 trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
229 \
230} \
231 \
232static int ftrace_raw_reg_event_##call(void) \
233{ \
234 int ret; \
235 \
236 ret = register_trace_##call(ftrace_raw_event_##call); \
237 if (ret) \
238 pr_info("event trace: Could not activate trace point " \
239 "probe to " #call "\n"); \
240 return ret; \
241} \
242 \
243static void ftrace_raw_unreg_event_##call(void) \
244{ \
245 unregister_trace_##call(ftrace_raw_event_##call); \
246} \
247 \
248static struct trace_event ftrace_event_type_##call = { \
249 .trace = ftrace_raw_output_##call, \
250}; \
251 \
252static int ftrace_raw_init_event_##call(void) \
253{ \
254 int id; \
255 \
256 id = register_ftrace_event(&ftrace_event_type_##call); \
257 if (!id) \
258 return -ENODEV; \
259 event_##call.id = id; \
260 INIT_LIST_HEAD(&event_##call.fields); \
261 return 0; \
262} \
263 \
264static struct ftrace_event_call __used \
265__attribute__((__aligned__(4))) \
266__attribute__((section("_ftrace_events"))) event_##call = { \
267 .name = #call, \
268 .system = __stringify(TRACE_SYSTEM), \
269 .raw_init = ftrace_raw_init_event_##call, \
270 .regfunc = ftrace_raw_reg_event_##call, \
271 .unregfunc = ftrace_raw_unreg_event_##call, \
272 .show_format = ftrace_format_##call, \
273 .define_fields = ftrace_define_fields_##call, \
274 _TRACE_PROFILE_INIT(call) \
275}
276
277#include <trace/trace_event_types.h>
278
279#undef _TRACE_PROFILE
280#undef _TRACE_PROFILE_INIT
281
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 07a22c33ebf3..d06cf898dc86 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -19,8 +19,12 @@
19#undef TRACE_STRUCT 19#undef TRACE_STRUCT
20#define TRACE_STRUCT(args...) args 20#define TRACE_STRUCT(args...) args
21 21
22extern void __bad_type_size(void);
23
22#undef TRACE_FIELD 24#undef TRACE_FIELD
23#define TRACE_FIELD(type, item, assign) \ 25#define TRACE_FIELD(type, item, assign) \
26 if (sizeof(type) != sizeof(field.item)) \
27 __bad_type_size(); \
24 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
25 "offset:%u;\tsize:%u;\n", \ 29 "offset:%u;\tsize:%u;\n", \
26 (unsigned int)offsetof(typeof(field), item), \ 30 (unsigned int)offsetof(typeof(field), item), \
@@ -30,7 +34,7 @@
30 34
31 35
32#undef TRACE_FIELD_SPECIAL 36#undef TRACE_FIELD_SPECIAL
33#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ 37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
34 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
35 "offset:%u;\tsize:%u;\n", \ 39 "offset:%u;\tsize:%u;\n", \
36 (unsigned int)offsetof(typeof(field), item), \ 40 (unsigned int)offsetof(typeof(field), item), \
@@ -46,6 +50,9 @@
46 if (!ret) \ 50 if (!ret) \
47 return 0; 51 return 0;
48 52
53#undef TRACE_FIELD_SIGN
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
55 TRACE_FIELD(type, item, assign)
49 56
50#undef TP_RAW_FMT 57#undef TP_RAW_FMT
51#define TP_RAW_FMT(args...) args 58#define TP_RAW_FMT(args...) args
@@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s) \
65 return ret; \ 72 return ret; \
66} 73}
67 74
75#undef TRACE_EVENT_FORMAT_NOFILTER
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
77 tpfmt) \
78static int \
79ftrace_format_##call(struct trace_seq *s) \
80{ \
81 struct args field; \
82 int ret; \
83 \
84 tstruct; \
85 \
86 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
87 \
88 return ret; \
89}
90
68#include "trace_event_types.h" 91#include "trace_event_types.h"
69 92
70#undef TRACE_ZERO_CHAR 93#undef TRACE_ZERO_CHAR
@@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \
78#define TRACE_FIELD(type, item, assign)\ 101#define TRACE_FIELD(type, item, assign)\
79 entry->item = assign; 102 entry->item = assign;
80 103
104#undef TRACE_FIELD_SIGN
105#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
106 TRACE_FIELD(type, item, assign)
107
81#undef TP_CMD 108#undef TP_CMD
82#define TP_CMD(cmd...) cmd 109#define TP_CMD(cmd...) cmd
83 110
@@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s) \
85#define TRACE_ENTRY entry 112#define TRACE_ENTRY entry
86 113
87#undef TRACE_FIELD_SPECIAL 114#undef TRACE_FIELD_SPECIAL
88#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ 115#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
89 cmd; 116 cmd;
90 117
91#undef TRACE_EVENT_FORMAT 118#undef TRACE_EVENT_FORMAT
92#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
120int ftrace_define_fields_##call(void); \
121static int ftrace_raw_init_event_##call(void); \
122 \
123struct ftrace_event_call __used \
124__attribute__((__aligned__(4))) \
125__attribute__((section("_ftrace_events"))) event_##call = { \
126 .name = #call, \
127 .id = proto, \
128 .system = __stringify(TRACE_SYSTEM), \
129 .raw_init = ftrace_raw_init_event_##call, \
130 .show_format = ftrace_format_##call, \
131 .define_fields = ftrace_define_fields_##call, \
132}; \
133static int ftrace_raw_init_event_##call(void) \
134{ \
135 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \
138} \
139
140#undef TRACE_EVENT_FORMAT_NOFILTER
141#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
142 tpfmt) \
93 \ 143 \
94static struct ftrace_event_call __used \ 144struct ftrace_event_call __used \
95__attribute__((__aligned__(4))) \ 145__attribute__((__aligned__(4))) \
96__attribute__((section("_ftrace_events"))) event_##call = { \ 146__attribute__((section("_ftrace_events"))) event_##call = { \
97 .name = #call, \ 147 .name = #call, \
98 .id = proto, \ 148 .id = proto, \
99 .system = __stringify(TRACE_SYSTEM), \ 149 .system = __stringify(TRACE_SYSTEM), \
100 .show_format = ftrace_format_##call, \ 150 .show_format = ftrace_format_##call, \
151};
152
153#include "trace_event_types.h"
154
155#undef TRACE_FIELD
156#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \
160 if (ret) \
161 return ret;
162
163#undef TRACE_FIELD_SPECIAL
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \
168 if (ret) \
169 return ret;
170
171#undef TRACE_FIELD_SIGN
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \
176 if (ret) \
177 return ret;
178
179#undef TRACE_FIELD_ZERO_CHAR
180#define TRACE_FIELD_ZERO_CHAR(item)
181
182#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \
185ftrace_define_fields_##call(void) \
186{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \
189 int ret; \
190 \
191 __common_field(unsigned char, type, 0); \
192 __common_field(unsigned char, flags, 0); \
193 __common_field(unsigned char, preempt_count, 0); \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \
197 tstruct; \
198 \
199 return ret; \
101} 200}
201
202#undef TRACE_EVENT_FORMAT_NOFILTER
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
204 tpfmt)
205
102#include "trace_event_types.h" 206#include "trace_event_types.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e7b3a7..8b592418d8b2 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
65 if (!current->ret_stack) 65 if (!current->ret_stack)
66 return -EBUSY; 66 return -EBUSY;
67 67
68 /*
69 * We must make sure the ret_stack is tested before we read
70 * anything else.
71 */
72 smp_rmb();
73
68 /* The return trace stack is full */ 74 /* The return trace stack is full */
69 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { 75 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
70 atomic_inc(&current->trace_overrun); 76 atomic_inc(&current->trace_overrun);
@@ -78,13 +84,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
78 current->ret_stack[index].ret = ret; 84 current->ret_stack[index].ret = ret;
79 current->ret_stack[index].func = func; 85 current->ret_stack[index].func = func;
80 current->ret_stack[index].calltime = calltime; 86 current->ret_stack[index].calltime = calltime;
87 current->ret_stack[index].subtime = 0;
81 *depth = index; 88 *depth = index;
82 89
83 return 0; 90 return 0;
84} 91}
85 92
86/* Retrieve a function return address to the trace stack on thread info.*/ 93/* Retrieve a function return address to the trace stack on thread info.*/
87void 94static void
88ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) 95ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
89{ 96{
90 int index; 97 int index;
@@ -104,9 +111,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
104 trace->calltime = current->ret_stack[index].calltime; 111 trace->calltime = current->ret_stack[index].calltime;
105 trace->overrun = atomic_read(&current->trace_overrun); 112 trace->overrun = atomic_read(&current->trace_overrun);
106 trace->depth = index; 113 trace->depth = index;
107 barrier();
108 current->curr_ret_stack--;
109
110} 114}
111 115
112/* 116/*
@@ -121,6 +125,8 @@ unsigned long ftrace_return_to_handler(void)
121 ftrace_pop_return_trace(&trace, &ret); 125 ftrace_pop_return_trace(&trace, &ret);
122 trace.rettime = trace_clock_local(); 126 trace.rettime = trace_clock_local();
123 ftrace_graph_return(&trace); 127 ftrace_graph_return(&trace);
128 barrier();
129 current->curr_ret_stack--;
124 130
125 if (unlikely(!ret)) { 131 if (unlikely(!ret)) {
126 ftrace_graph_stop(); 132 ftrace_graph_stop();
@@ -426,8 +432,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
426 return TRACE_TYPE_HANDLED; 432 return TRACE_TYPE_HANDLED;
427} 433}
428 434
429static enum print_line_t 435enum print_line_t
430print_graph_duration(unsigned long long duration, struct trace_seq *s) 436trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
431{ 437{
432 unsigned long nsecs_rem = do_div(duration, 1000); 438 unsigned long nsecs_rem = do_div(duration, 1000);
433 /* log10(ULONG_MAX) + '\0' */ 439 /* log10(ULONG_MAX) + '\0' */
@@ -464,12 +470,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
464 if (!ret) 470 if (!ret)
465 return TRACE_TYPE_PARTIAL_LINE; 471 return TRACE_TYPE_PARTIAL_LINE;
466 } 472 }
473 return TRACE_TYPE_HANDLED;
474}
475
476static enum print_line_t
477print_graph_duration(unsigned long long duration, struct trace_seq *s)
478{
479 int ret;
480
481 ret = trace_print_graph_duration(duration, s);
482 if (ret != TRACE_TYPE_HANDLED)
483 return ret;
467 484
468 ret = trace_seq_printf(s, "| "); 485 ret = trace_seq_printf(s, "| ");
469 if (!ret) 486 if (!ret)
470 return TRACE_TYPE_PARTIAL_LINE; 487 return TRACE_TYPE_PARTIAL_LINE;
471 return TRACE_TYPE_HANDLED;
472 488
489 return TRACE_TYPE_HANDLED;
473} 490}
474 491
475/* Case of a leaf function on its call entry */ 492/* Case of a leaf function on its call entry */
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c2347f..ca7d7c4d0c2a 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,10 +1,9 @@
1/* 1/*
2 * h/w branch tracer for x86 based on bts 2 * h/w branch tracer for x86 based on BTS
3 * 3 *
4 * Copyright (C) 2008-2009 Intel Corporation. 4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009 5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */ 6 */
7#include <linux/spinlock.h>
8#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
9#include <linux/debugfs.h> 8#include <linux/debugfs.h>
10#include <linux/ftrace.h> 9#include <linux/ftrace.h>
@@ -15,110 +14,119 @@
15 14
16#include <asm/ds.h> 15#include <asm/ds.h>
17 16
18#include "trace.h"
19#include "trace_output.h" 17#include "trace_output.h"
18#include "trace.h"
20 19
21 20
22#define SIZEOF_BTS (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
23 22
24/*
25 * The tracer lock protects the below per-cpu tracer array.
26 * It needs to be held to:
27 * - start tracing on all cpus
28 * - stop tracing on all cpus
29 * - start tracing on a single hotplug cpu
30 * - stop tracing on a single hotplug cpu
31 * - read the trace from all cpus
32 * - read the trace from a single cpu
33 */
34static DEFINE_SPINLOCK(bts_tracer_lock);
35static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, tracer);
36static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
37 25
38#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(tracer, smp_processor_id())
39#define this_buffer per_cpu(buffer, smp_processor_id())
40 27
41static int __read_mostly trace_hw_branches_enabled; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
42static struct trace_array *hw_branch_trace __read_mostly; 30static struct trace_array *hw_branch_trace __read_mostly;
43 31
44 32
45/* 33static void bts_trace_init_cpu(int cpu)
46 * Start tracing on the current cpu.
47 * The argument is ignored.
48 *
49 * pre: bts_tracer_lock must be locked.
50 */
51static void bts_trace_start_cpu(void *arg)
52{ 34{
53 if (this_tracer) 35 per_cpu(tracer, cpu) =
54 ds_release_bts(this_tracer); 36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
55 37 NULL, (size_t)-1, BTS_KERNEL);
56 this_tracer = 38
57 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, 39 if (IS_ERR(per_cpu(tracer, cpu)))
58 /* ovfl = */ NULL, /* th = */ (size_t)-1, 40 per_cpu(tracer, cpu) = NULL;
59 BTS_KERNEL);
60 if (IS_ERR(this_tracer)) {
61 this_tracer = NULL;
62 return;
63 }
64} 41}
65 42
66static void bts_trace_start(struct trace_array *tr) 43static int bts_trace_init(struct trace_array *tr)
67{ 44{
68 spin_lock(&bts_tracer_lock); 45 int cpu;
46
47 hw_branch_trace = tr;
48 trace_hw_branches_enabled = 0;
69 49
70 on_each_cpu(bts_trace_start_cpu, NULL, 1); 50 get_online_cpus();
71 trace_hw_branches_enabled = 1; 51 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu);
72 53
73 spin_unlock(&bts_tracer_lock); 54 if (likely(per_cpu(tracer, cpu)))
55 trace_hw_branches_enabled = 1;
56 }
57 trace_hw_branches_suspended = 0;
58 put_online_cpus();
59
60 /* If we could not enable tracing on at least one cpu, we fail. */
61 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
74} 62}
75 63
76/* 64static void bts_trace_reset(struct trace_array *tr)
77 * Stop tracing on the current cpu.
78 * The argument is ignored.
79 *
80 * pre: bts_tracer_lock must be locked.
81 */
82static void bts_trace_stop_cpu(void *arg)
83{ 65{
84 if (this_tracer) { 66 int cpu;
85 ds_release_bts(this_tracer); 67
86 this_tracer = NULL; 68 get_online_cpus();
69 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu));
72 per_cpu(tracer, cpu) = NULL;
73 }
87 } 74 }
75 trace_hw_branches_enabled = 0;
76 trace_hw_branches_suspended = 0;
77 put_online_cpus();
88} 78}
89 79
90static void bts_trace_stop(struct trace_array *tr) 80static void bts_trace_start(struct trace_array *tr)
91{ 81{
92 spin_lock(&bts_tracer_lock); 82 int cpu;
93 83
94 trace_hw_branches_enabled = 0; 84 get_online_cpus();
95 on_each_cpu(bts_trace_stop_cpu, NULL, 1); 85 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu));
88 trace_hw_branches_suspended = 0;
89 put_online_cpus();
90}
96 91
97 spin_unlock(&bts_tracer_lock); 92static void bts_trace_stop(struct trace_array *tr)
93{
94 int cpu;
95
96 get_online_cpus();
97 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu));
100 trace_hw_branches_suspended = 1;
101 put_online_cpus();
98} 102}
99 103
100static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, 104static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
101 unsigned long action, void *hcpu) 105 unsigned long action, void *hcpu)
102{ 106{
103 unsigned int cpu = (unsigned long)hcpu; 107 int cpu = (long)hcpu;
104
105 spin_lock(&bts_tracer_lock);
106
107 if (!trace_hw_branches_enabled)
108 goto out;
109 108
110 switch (action) { 109 switch (action) {
111 case CPU_ONLINE: 110 case CPU_ONLINE:
112 case CPU_DOWN_FAILED: 111 case CPU_DOWN_FAILED:
113 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); 112 /* The notification is sent with interrupts enabled. */
113 if (trace_hw_branches_enabled) {
114 bts_trace_init_cpu(cpu);
115
116 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu));
119 }
114 break; 120 break;
121
115 case CPU_DOWN_PREPARE: 122 case CPU_DOWN_PREPARE:
116 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); 123 /* The notification is sent with interrupts enabled. */
117 break; 124 if (likely(per_cpu(tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu));
126 per_cpu(tracer, cpu) = NULL;
127 }
118 } 128 }
119 129
120 out:
121 spin_unlock(&bts_tracer_lock);
122 return NOTIFY_DONE; 130 return NOTIFY_DONE;
123} 131}
124 132
@@ -126,20 +134,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
126 .notifier_call = bts_hotcpu_handler 134 .notifier_call = bts_hotcpu_handler
127}; 135};
128 136
129static int bts_trace_init(struct trace_array *tr)
130{
131 hw_branch_trace = tr;
132
133 bts_trace_start(tr);
134
135 return 0;
136}
137
138static void bts_trace_reset(struct trace_array *tr)
139{
140 bts_trace_stop(tr);
141}
142
143static void bts_trace_print_header(struct seq_file *m) 137static void bts_trace_print_header(struct seq_file *m)
144{ 138{
145 seq_puts(m, "# CPU# TO <- FROM\n"); 139 seq_puts(m, "# CPU# TO <- FROM\n");
@@ -147,10 +141,10 @@ static void bts_trace_print_header(struct seq_file *m)
147 141
148static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) 142static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
149{ 143{
144 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
150 struct trace_entry *entry = iter->ent; 145 struct trace_entry *entry = iter->ent;
151 struct trace_seq *seq = &iter->seq; 146 struct trace_seq *seq = &iter->seq;
152 struct hw_branch_entry *it; 147 struct hw_branch_entry *it;
153 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
154 148
155 trace_assign_type(it, entry); 149 trace_assign_type(it, entry);
156 150
@@ -168,6 +162,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
168 162
169void trace_hw_branch(u64 from, u64 to) 163void trace_hw_branch(u64 from, u64 to)
170{ 164{
165 struct ftrace_event_call *call = &event_hw_branch;
171 struct trace_array *tr = hw_branch_trace; 166 struct trace_array *tr = hw_branch_trace;
172 struct ring_buffer_event *event; 167 struct ring_buffer_event *event;
173 struct hw_branch_entry *entry; 168 struct hw_branch_entry *entry;
@@ -194,7 +189,8 @@ void trace_hw_branch(u64 from, u64 to)
194 entry->ent.type = TRACE_HW_BRANCHES; 189 entry->ent.type = TRACE_HW_BRANCHES;
195 entry->from = from; 190 entry->from = from;
196 entry->to = to; 191 entry->to = to;
197 trace_buffer_unlock_commit(tr, event, 0, 0); 192 if (!filter_check_discard(call, entry, tr->buffer, event))
193 trace_buffer_unlock_commit(tr, event, 0, 0);
198 194
199 out: 195 out:
200 atomic_dec(&tr->data[cpu]->disabled); 196 atomic_dec(&tr->data[cpu]->disabled);
@@ -224,11 +220,11 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
224/* 220/*
225 * Collect the trace on the current cpu and write it into the ftrace buffer. 221 * Collect the trace on the current cpu and write it into the ftrace buffer.
226 * 222 *
227 * pre: bts_tracer_lock must be locked 223 * pre: tracing must be suspended on the current cpu
228 */ 224 */
229static void trace_bts_cpu(void *arg) 225static void trace_bts_cpu(void *arg)
230{ 226{
231 struct trace_array *tr = (struct trace_array *) arg; 227 struct trace_array *tr = (struct trace_array *)arg;
232 const struct bts_trace *trace; 228 const struct bts_trace *trace;
233 unsigned char *at; 229 unsigned char *at;
234 230
@@ -241,10 +237,9 @@ static void trace_bts_cpu(void *arg)
241 if (unlikely(!this_tracer)) 237 if (unlikely(!this_tracer))
242 return; 238 return;
243 239
244 ds_suspend_bts(this_tracer);
245 trace = ds_read_bts(this_tracer); 240 trace = ds_read_bts(this_tracer);
246 if (!trace) 241 if (!trace)
247 goto out; 242 return;
248 243
249 for (at = trace->ds.top; (void *)at < trace->ds.end; 244 for (at = trace->ds.top; (void *)at < trace->ds.end;
250 at += trace->ds.size) 245 at += trace->ds.size)
@@ -253,18 +248,27 @@ static void trace_bts_cpu(void *arg)
253 for (at = trace->ds.begin; (void *)at < trace->ds.top; 248 for (at = trace->ds.begin; (void *)at < trace->ds.top;
254 at += trace->ds.size) 249 at += trace->ds.size)
255 trace_bts_at(trace, at); 250 trace_bts_at(trace, at);
256
257out:
258 ds_resume_bts(this_tracer);
259} 251}
260 252
261static void trace_bts_prepare(struct trace_iterator *iter) 253static void trace_bts_prepare(struct trace_iterator *iter)
262{ 254{
263 spin_lock(&bts_tracer_lock); 255 int cpu;
264 256
257 get_online_cpus();
258 for_each_online_cpu(cpu)
259 if (likely(per_cpu(tracer, cpu)))
260 ds_suspend_bts(per_cpu(tracer, cpu));
261 /*
262 * We need to collect the trace on the respective cpu since ftrace
263 * implicitly adds the record for the current cpu.
264 * Once that is more flexible, we could collect the data from any cpu.
265 */
265 on_each_cpu(trace_bts_cpu, iter->tr, 1); 266 on_each_cpu(trace_bts_cpu, iter->tr, 1);
266 267
267 spin_unlock(&bts_tracer_lock); 268 for_each_online_cpu(cpu)
269 if (likely(per_cpu(tracer, cpu)))
270 ds_resume_bts(per_cpu(tracer, cpu));
271 put_online_cpus();
268} 272}
269 273
270static void trace_bts_close(struct trace_iterator *iter) 274static void trace_bts_close(struct trace_iterator *iter)
@@ -274,11 +278,11 @@ static void trace_bts_close(struct trace_iterator *iter)
274 278
275void trace_hw_branch_oops(void) 279void trace_hw_branch_oops(void)
276{ 280{
277 spin_lock(&bts_tracer_lock); 281 if (this_tracer) {
278 282 ds_suspend_bts_noirq(this_tracer);
279 trace_bts_cpu(hw_branch_trace); 283 trace_bts_cpu(hw_branch_trace);
280 284 ds_resume_bts_noirq(this_tracer);
281 spin_unlock(&bts_tracer_lock); 285 }
282} 286}
283 287
284struct tracer bts_tracer __read_mostly = 288struct tracer bts_tracer __read_mostly =
@@ -291,7 +295,10 @@ struct tracer bts_tracer __read_mostly =
291 .start = bts_trace_start, 295 .start = bts_trace_start,
292 .stop = bts_trace_stop, 296 .stop = bts_trace_stop,
293 .open = trace_bts_prepare, 297 .open = trace_bts_prepare,
294 .close = trace_bts_close 298 .close = trace_bts_close,
299#ifdef CONFIG_FTRACE_SELFTEST
300 .selftest = trace_selftest_startup_hw_branches,
301#endif /* CONFIG_FTRACE_SELFTEST */
295}; 302};
296 303
297__init static int init_bts_trace(void) 304__init static int init_bts_trace(void)
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 8e37fcddd8b4..d53b45ed0806 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,8 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/time.h>
13
12#include <asm/atomic.h> 14#include <asm/atomic.h>
13 15
14#include "trace.h" 16#include "trace.h"
@@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
174 struct mmiotrace_rw *rw; 176 struct mmiotrace_rw *rw;
175 struct trace_seq *s = &iter->seq; 177 struct trace_seq *s = &iter->seq;
176 unsigned long long t = ns2usecs(iter->ts); 178 unsigned long long t = ns2usecs(iter->ts);
177 unsigned long usec_rem = do_div(t, 1000000ULL); 179 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
178 unsigned secs = (unsigned long)t; 180 unsigned secs = (unsigned long)t;
179 int ret = 1; 181 int ret = 1;
180 182
@@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
221 struct mmiotrace_map *m; 223 struct mmiotrace_map *m;
222 struct trace_seq *s = &iter->seq; 224 struct trace_seq *s = &iter->seq;
223 unsigned long long t = ns2usecs(iter->ts); 225 unsigned long long t = ns2usecs(iter->ts);
224 unsigned long usec_rem = do_div(t, 1000000ULL); 226 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
225 unsigned secs = (unsigned long)t; 227 unsigned secs = (unsigned long)t;
226 int ret; 228 int ret;
227 229
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 64b54a59c55b..7938f3ae93e3 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,11 +14,25 @@
14/* must be a power of 2 */ 14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128 15#define EVENT_HASHSIZE 128
16 16
17static DEFINE_MUTEX(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
18static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
19 23
20static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
21 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29
30 s->buffer[len] = 0;
31 seq_puts(m, s->buffer);
32
33 trace_seq_init(s);
34}
35
22enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 36enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
23{ 37{
24 struct trace_seq *s = &iter->seq; 38 struct trace_seq *s = &iter->seq;
@@ -84,6 +98,39 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
84 98
85 return len; 99 return len;
86} 100}
101EXPORT_SYMBOL_GPL(trace_seq_printf);
102
103/**
104 * trace_seq_vprintf - sequence printing of trace information
105 * @s: trace sequence descriptor
106 * @fmt: printf format string
107 *
108 * The tracer may use either sequence operations or its own
109 * copy to user routines. To simplify formatting of a trace
110 * trace_seq_printf is used to store strings into a special
111 * buffer (@s). Then the output may be either used by
112 * the sequencer or pulled into another buffer.
113 */
114int
115trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
116{
117 int len = (PAGE_SIZE - 1) - s->len;
118 int ret;
119
120 if (!len)
121 return 0;
122
123 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
124
125 /* If we can't write it all, don't bother writing anything */
126 if (ret >= len)
127 return 0;
128
129 s->len += ret;
130
131 return len;
132}
133EXPORT_SYMBOL_GPL(trace_seq_vprintf);
87 134
88int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) 135int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
89{ 136{
@@ -201,6 +248,67 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
201 return 0; 248 return 0;
202} 249}
203 250
251const char *
252ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
253 unsigned long flags,
254 const struct trace_print_flags *flag_array)
255{
256 unsigned long mask;
257 const char *str;
258 const char *ret = p->buffer + p->len;
259 int i;
260
261 for (i = 0; flag_array[i].name && flags; i++) {
262
263 mask = flag_array[i].mask;
264 if ((flags & mask) != mask)
265 continue;
266
267 str = flag_array[i].name;
268 flags &= ~mask;
269 if (p->len && delim)
270 trace_seq_puts(p, delim);
271 trace_seq_puts(p, str);
272 }
273
274 /* check for left over flags */
275 if (flags) {
276 if (p->len && delim)
277 trace_seq_puts(p, delim);
278 trace_seq_printf(p, "0x%lx", flags);
279 }
280
281 trace_seq_putc(p, 0);
282
283 return ret;
284}
285EXPORT_SYMBOL(ftrace_print_flags_seq);
286
287const char *
288ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
289 const struct trace_print_flags *symbol_array)
290{
291 int i;
292 const char *ret = p->buffer + p->len;
293
294 for (i = 0; symbol_array[i].name; i++) {
295
296 if (val != symbol_array[i].mask)
297 continue;
298
299 trace_seq_puts(p, symbol_array[i].name);
300 break;
301 }
302
303 if (!p->len)
304 trace_seq_printf(p, "0x%lx", val);
305
306 trace_seq_putc(p, 0);
307
308 return ret;
309}
310EXPORT_SYMBOL(ftrace_print_symbols_seq);
311
204#ifdef CONFIG_KRETPROBES 312#ifdef CONFIG_KRETPROBES
205static inline const char *kretprobed(const char *name) 313static inline const char *kretprobed(const char *name)
206{ 314{
@@ -311,17 +419,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
311 419
312 if (ip == ULONG_MAX || !ret) 420 if (ip == ULONG_MAX || !ret)
313 break; 421 break;
314 if (i && ret) 422 if (ret)
315 ret = trace_seq_puts(s, " <- "); 423 ret = trace_seq_puts(s, " => ");
316 if (!ip) { 424 if (!ip) {
317 if (ret) 425 if (ret)
318 ret = trace_seq_puts(s, "??"); 426 ret = trace_seq_puts(s, "??");
427 if (ret)
428 ret = trace_seq_puts(s, "\n");
319 continue; 429 continue;
320 } 430 }
321 if (!ret) 431 if (!ret)
322 break; 432 break;
323 if (ret) 433 if (ret)
324 ret = seq_print_user_ip(s, mm, ip, sym_flags); 434 ret = seq_print_user_ip(s, mm, ip, sym_flags);
435 ret = trace_seq_puts(s, "\n");
325 } 436 }
326 437
327 if (mm) 438 if (mm)
@@ -455,6 +566,7 @@ static int task_state_char(unsigned long state)
455 * @type: the type of event to look for 566 * @type: the type of event to look for
456 * 567 *
457 * Returns an event of type @type otherwise NULL 568 * Returns an event of type @type otherwise NULL
569 * Called with trace_event_read_lock() held.
458 */ 570 */
459struct trace_event *ftrace_find_event(int type) 571struct trace_event *ftrace_find_event(int type)
460{ 572{
@@ -464,7 +576,7 @@ struct trace_event *ftrace_find_event(int type)
464 576
465 key = type & (EVENT_HASHSIZE - 1); 577 key = type & (EVENT_HASHSIZE - 1);
466 578
467 hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { 579 hlist_for_each_entry(event, n, &event_hash[key], node) {
468 if (event->type == type) 580 if (event->type == type)
469 return event; 581 return event;
470 } 582 }
@@ -472,6 +584,46 @@ struct trace_event *ftrace_find_event(int type)
472 return NULL; 584 return NULL;
473} 585}
474 586
587static LIST_HEAD(ftrace_event_list);
588
589static int trace_search_list(struct list_head **list)
590{
591 struct trace_event *e;
592 int last = __TRACE_LAST_TYPE;
593
594 if (list_empty(&ftrace_event_list)) {
595 *list = &ftrace_event_list;
596 return last + 1;
597 }
598
599 /*
600 * We used up all possible max events,
601 * let's see if somebody freed one.
602 */
603 list_for_each_entry(e, &ftrace_event_list, list) {
604 if (e->type != last + 1)
605 break;
606 last++;
607 }
608
609 /* Did we use up all 65 thousand events??? */
610 if ((last + 1) > FTRACE_MAX_EVENT)
611 return 0;
612
613 *list = &e->list;
614 return last + 1;
615}
616
617void trace_event_read_lock(void)
618{
619 down_read(&trace_event_mutex);
620}
621
622void trace_event_read_unlock(void)
623{
624 up_read(&trace_event_mutex);
625}
626
475/** 627/**
476 * register_ftrace_event - register output for an event type 628 * register_ftrace_event - register output for an event type
477 * @event: the event type to register 629 * @event: the event type to register
@@ -492,22 +644,42 @@ int register_ftrace_event(struct trace_event *event)
492 unsigned key; 644 unsigned key;
493 int ret = 0; 645 int ret = 0;
494 646
495 mutex_lock(&trace_event_mutex); 647 down_write(&trace_event_mutex);
496 648
497 if (!event) { 649 if (WARN_ON(!event))
498 ret = next_event_type++;
499 goto out; 650 goto out;
500 }
501 651
502 if (!event->type) 652 INIT_LIST_HEAD(&event->list);
503 event->type = next_event_type++; 653
504 else if (event->type > __TRACE_LAST_TYPE) { 654 if (!event->type) {
655 struct list_head *list = NULL;
656
657 if (next_event_type > FTRACE_MAX_EVENT) {
658
659 event->type = trace_search_list(&list);
660 if (!event->type)
661 goto out;
662
663 } else {
664
665 event->type = next_event_type++;
666 list = &ftrace_event_list;
667 }
668
669 if (WARN_ON(ftrace_find_event(event->type)))
670 goto out;
671
672 list_add_tail(&event->list, list);
673
674 } else if (event->type > __TRACE_LAST_TYPE) {
505 printk(KERN_WARNING "Need to add type to trace.h\n"); 675 printk(KERN_WARNING "Need to add type to trace.h\n");
506 WARN_ON(1); 676 WARN_ON(1);
507 }
508
509 if (ftrace_find_event(event->type))
510 goto out; 677 goto out;
678 } else {
679 /* Is this event already used */
680 if (ftrace_find_event(event->type))
681 goto out;
682 }
511 683
512 if (event->trace == NULL) 684 if (event->trace == NULL)
513 event->trace = trace_nop_print; 685 event->trace = trace_nop_print;
@@ -520,14 +692,25 @@ int register_ftrace_event(struct trace_event *event)
520 692
521 key = event->type & (EVENT_HASHSIZE - 1); 693 key = event->type & (EVENT_HASHSIZE - 1);
522 694
523 hlist_add_head_rcu(&event->node, &event_hash[key]); 695 hlist_add_head(&event->node, &event_hash[key]);
524 696
525 ret = event->type; 697 ret = event->type;
526 out: 698 out:
527 mutex_unlock(&trace_event_mutex); 699 up_write(&trace_event_mutex);
528 700
529 return ret; 701 return ret;
530} 702}
703EXPORT_SYMBOL_GPL(register_ftrace_event);
704
705/*
706 * Used by module code with the trace_event_mutex held for write.
707 */
708int __unregister_ftrace_event(struct trace_event *event)
709{
710 hlist_del(&event->node);
711 list_del(&event->list);
712 return 0;
713}
531 714
532/** 715/**
533 * unregister_ftrace_event - remove a no longer used event 716 * unregister_ftrace_event - remove a no longer used event
@@ -535,12 +718,13 @@ int register_ftrace_event(struct trace_event *event)
535 */ 718 */
536int unregister_ftrace_event(struct trace_event *event) 719int unregister_ftrace_event(struct trace_event *event)
537{ 720{
538 mutex_lock(&trace_event_mutex); 721 down_write(&trace_event_mutex);
539 hlist_del(&event->node); 722 __unregister_ftrace_event(event);
540 mutex_unlock(&trace_event_mutex); 723 up_write(&trace_event_mutex);
541 724
542 return 0; 725 return 0;
543} 726}
727EXPORT_SYMBOL_GPL(unregister_ftrace_event);
544 728
545/* 729/*
546 * Standard events 730 * Standard events
@@ -833,14 +1017,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
833 1017
834 trace_assign_type(field, iter->ent); 1018 trace_assign_type(field, iter->ent);
835 1019
1020 if (!trace_seq_puts(s, "<stack trace>\n"))
1021 goto partial;
836 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1022 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
837 if (i) { 1023 if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
838 if (!trace_seq_puts(s, " <= ")) 1024 break;
839 goto partial; 1025 if (!trace_seq_puts(s, " => "))
1026 goto partial;
840 1027
841 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1028 if (!seq_print_ip_sym(s, field->caller[i], flags))
842 goto partial; 1029 goto partial;
843 }
844 if (!trace_seq_puts(s, "\n")) 1030 if (!trace_seq_puts(s, "\n"))
845 goto partial; 1031 goto partial;
846 } 1032 }
@@ -868,10 +1054,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
868 1054
869 trace_assign_type(field, iter->ent); 1055 trace_assign_type(field, iter->ent);
870 1056
871 if (!seq_print_userip_objs(field, s, flags)) 1057 if (!trace_seq_puts(s, "<user stack trace>\n"))
872 goto partial; 1058 goto partial;
873 1059
874 if (!trace_seq_putc(s, '\n')) 1060 if (!seq_print_userip_objs(field, s, flags))
875 goto partial; 1061 goto partial;
876 1062
877 return TRACE_TYPE_HANDLED; 1063 return TRACE_TYPE_HANDLED;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index e0bde39c2dd9..d38bec4a9c30 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,41 +1,17 @@
1#ifndef __TRACE_EVENTS_H 1#ifndef __TRACE_EVENTS_H
2#define __TRACE_EVENTS_H 2#define __TRACE_EVENTS_H
3 3
4#include <linux/trace_seq.h>
4#include "trace.h" 5#include "trace.h"
5 6
6typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
7 int flags);
8
9struct trace_event {
10 struct hlist_node node;
11 int type;
12 trace_print_func trace;
13 trace_print_func raw;
14 trace_print_func hex;
15 trace_print_func binary;
16};
17
18extern enum print_line_t 7extern enum print_line_t
19trace_print_bprintk_msg_only(struct trace_iterator *iter); 8trace_print_bprintk_msg_only(struct trace_iterator *iter);
20extern enum print_line_t 9extern enum print_line_t
21trace_print_printk_msg_only(struct trace_iterator *iter); 10trace_print_printk_msg_only(struct trace_iterator *iter);
22 11
23extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
24 __attribute__ ((format (printf, 2, 3)));
25extern int
26trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
27extern int 12extern int
28seq_print_ip_sym(struct trace_seq *s, unsigned long ip, 13seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
29 unsigned long sym_flags); 14 unsigned long sym_flags);
30extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
31 size_t cnt);
32extern int trace_seq_puts(struct trace_seq *s, const char *str);
33extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
34extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
35extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
36 size_t len);
37extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
38extern int trace_seq_path(struct trace_seq *s, struct path *path);
39extern int seq_print_userip_objs(const struct userstack_entry *entry, 15extern int seq_print_userip_objs(const struct userstack_entry *entry,
40 struct trace_seq *s, unsigned long sym_flags); 16 struct trace_seq *s, unsigned long sym_flags);
41extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, 17extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
@@ -44,13 +20,17 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
44extern int trace_print_context(struct trace_iterator *iter); 20extern int trace_print_context(struct trace_iterator *iter);
45extern int trace_print_lat_context(struct trace_iterator *iter); 21extern int trace_print_lat_context(struct trace_iterator *iter);
46 22
23extern void trace_event_read_lock(void);
24extern void trace_event_read_unlock(void);
47extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
48extern int register_ftrace_event(struct trace_event *event);
49extern int unregister_ftrace_event(struct trace_event *event);
50 26
51extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
52 int flags); 28 int flags);
53 29
30/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event);
32extern struct rw_semaphore trace_event_mutex;
33
54#define MAX_MEMHEX_BYTES 8 34#define MAX_MEMHEX_BYTES 8
55#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) 35#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
56 36
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 118439709fb7..8a30d9874cd4 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,
36 36
37static void probe_power_end(struct power_trace *it) 37static void probe_power_end(struct power_trace *it)
38{ 38{
39 struct ftrace_event_call *call = &event_power;
39 struct ring_buffer_event *event; 40 struct ring_buffer_event *event;
40 struct trace_power *entry; 41 struct trace_power *entry;
41 struct trace_array_cpu *data; 42 struct trace_array_cpu *data;
@@ -54,7 +55,8 @@ static void probe_power_end(struct power_trace *it)
54 goto out; 55 goto out;
55 entry = ring_buffer_event_data(event); 56 entry = ring_buffer_event_data(event);
56 entry->state_data = *it; 57 entry->state_data = *it;
57 trace_buffer_unlock_commit(tr, event, 0, 0); 58 if (!filter_check_discard(call, entry, tr->buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0);
58 out: 60 out:
59 preempt_enable(); 61 preempt_enable();
60} 62}
@@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)
62static void probe_power_mark(struct power_trace *it, unsigned int type, 64static void probe_power_mark(struct power_trace *it, unsigned int type,
63 unsigned int level) 65 unsigned int level)
64{ 66{
67 struct ftrace_event_call *call = &event_power;
65 struct ring_buffer_event *event; 68 struct ring_buffer_event *event;
66 struct trace_power *entry; 69 struct trace_power *entry;
67 struct trace_array_cpu *data; 70 struct trace_array_cpu *data;
@@ -84,7 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
84 goto out; 87 goto out;
85 entry = ring_buffer_event_data(event); 88 entry = ring_buffer_event_data(event);
86 entry->state_data = *it; 89 entry->state_data = *it;
87 trace_buffer_unlock_commit(tr, event, 0, 0); 90 if (!filter_check_discard(call, entry, tr->buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0);
88 out: 92 out:
89 preempt_enable(); 93 preempt_enable();
90} 94}
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index eb81556107fe..9bece9687b62 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = {
245static __init int init_trace_printk_function_export(void) 245static __init int init_trace_printk_function_export(void)
246{ 246{
247 struct dentry *d_tracer; 247 struct dentry *d_tracer;
248 struct dentry *entry;
249 248
250 d_tracer = tracing_init_dentry(); 249 d_tracer = tracing_init_dentry();
251 if (!d_tracer) 250 if (!d_tracer)
252 return 0; 251 return 0;
253 252
254 entry = debugfs_create_file("printk_formats", 0444, d_tracer, 253 trace_create_file("printk_formats", 0444, d_tracer,
255 NULL, &ftrace_formats_fops); 254 NULL, &ftrace_formats_fops);
256 if (!entry)
257 pr_warning("Could not create debugfs "
258 "'printk_formats' entry\n");
259 255
260 return 0; 256 return 0;
261} 257}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9117cea6f1ae..a98106dd979c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -10,7 +10,7 @@
10#include <linux/kallsyms.h> 10#include <linux/kallsyms.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/ftrace.h> 12#include <linux/ftrace.h>
13#include <trace/sched.h> 13#include <trace/events/sched.h>
14 14
15#include "trace.h" 15#include "trace.h"
16 16
@@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
29 int cpu; 29 int cpu;
30 int pc; 30 int pc;
31 31
32 if (!sched_ref || sched_stopped) 32 if (unlikely(!sched_ref))
33 return; 33 return;
34 34
35 tracing_record_cmdline(prev); 35 tracing_record_cmdline(prev);
36 tracing_record_cmdline(next); 36 tracing_record_cmdline(next);
37 37
38 if (!tracer_enabled) 38 if (!tracer_enabled || sched_stopped)
39 return; 39 return;
40 40
41 pc = preempt_count(); 41 pc = preempt_count();
@@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
56 unsigned long flags; 56 unsigned long flags;
57 int cpu, pc; 57 int cpu, pc;
58 58
59 if (!likely(tracer_enabled)) 59 if (unlikely(!sched_ref))
60 return; 60 return;
61 61
62 pc = preempt_count();
63 tracing_record_cmdline(current); 62 tracing_record_cmdline(current);
64 63
65 if (sched_stopped) 64 if (!tracer_enabled || sched_stopped)
66 return; 65 return;
67 66
67 pc = preempt_count();
68 local_irq_save(flags); 68 local_irq_save(flags);
69 cpu = raw_smp_processor_id(); 69 cpu = raw_smp_processor_id();
70 data = ctx_trace->data[cpu]; 70 data = ctx_trace->data[cpu];
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5bc00e8f153e..eacb27225173 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <trace/sched.h> 18#include <trace/events/sched.h>
19 19
20#include "trace.h" 20#include "trace.h"
21 21
@@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
138 138
139 pc = preempt_count(); 139 pc = preempt_count();
140 140
141 /* The task we are waiting for is waking up */
142 data = wakeup_trace->data[wakeup_cpu];
143
144 /* disable local data, not wakeup_cpu data */ 141 /* disable local data, not wakeup_cpu data */
145 cpu = raw_smp_processor_id(); 142 cpu = raw_smp_processor_id();
146 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 143 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
154 if (unlikely(!tracer_enabled || next != wakeup_task)) 151 if (unlikely(!tracer_enabled || next != wakeup_task))
155 goto out_unlock; 152 goto out_unlock;
156 153
154 /* The task we are waiting for is waking up */
155 data = wakeup_trace->data[wakeup_cpu];
156
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 159
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 08f4eb2763d1..00dd6485bdd7 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
16 case TRACE_BRANCH: 16 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES:
19 return 1; 20 return 1;
20 } 21 }
21 return 0; 22 return 0;
@@ -188,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
188#else 189#else
189# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) 190# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
190#endif /* CONFIG_DYNAMIC_FTRACE */ 191#endif /* CONFIG_DYNAMIC_FTRACE */
192
191/* 193/*
192 * Simple verification test of ftrace function tracer. 194 * Simple verification test of ftrace function tracer.
193 * Enable ftrace, sleep 1/10 second, and then read the trace 195 * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -749,3 +751,59 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
749 return ret; 751 return ret;
750} 752}
751#endif /* CONFIG_BRANCH_TRACER */ 753#endif /* CONFIG_BRANCH_TRACER */
754
755#ifdef CONFIG_HW_BRANCH_TRACER
756int
757trace_selftest_startup_hw_branches(struct tracer *trace,
758 struct trace_array *tr)
759{
760 struct trace_iterator *iter;
761 struct tracer tracer;
762 unsigned long count;
763 int ret;
764
765 if (!trace->open) {
766 printk(KERN_CONT "missing open function...");
767 return -1;
768 }
769
770 ret = tracer_init(trace, tr);
771 if (ret) {
772 warn_failed_init_tracer(trace, ret);
773 return ret;
774 }
775
776 /*
777 * The hw-branch tracer needs to collect the trace from the various
778 * cpu trace buffers - before tracing is stopped.
779 */
780 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
781 if (!iter)
782 return -ENOMEM;
783
784 memcpy(&tracer, trace, sizeof(tracer));
785
786 iter->trace = &tracer;
787 iter->tr = tr;
788 iter->pos = -1;
789 mutex_init(&iter->mutex);
790
791 trace->open(iter);
792
793 mutex_destroy(&iter->mutex);
794 kfree(iter);
795
796 tracing_stop();
797
798 ret = trace_test_buffer(tr, &count);
799 trace->reset(tr);
800 tracing_start();
801
802 if (!ret && !count) {
803 printk(KERN_CONT "no entries found..");
804 ret = -1;
805 }
806
807 return ret;
808}
809#endif /* CONFIG_HW_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c750f65f9661..2d7aebd71dbd 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
265 seq_printf(m, " Depth Size Location" 265 seq_printf(m, " Depth Size Location"
266 " (%d entries)\n" 266 " (%d entries)\n"
267 " ----- ---- --------\n", 267 " ----- ---- --------\n",
268 max_stack_trace.nr_entries); 268 max_stack_trace.nr_entries - 1);
269 269
270 if (!stack_tracer_enabled && !max_stack_size) 270 if (!stack_tracer_enabled && !max_stack_size)
271 print_disabled(m); 271 print_disabled(m);
@@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace);
352static __init int stack_trace_init(void) 352static __init int stack_trace_init(void)
353{ 353{
354 struct dentry *d_tracer; 354 struct dentry *d_tracer;
355 struct dentry *entry;
356 355
357 d_tracer = tracing_init_dentry(); 356 d_tracer = tracing_init_dentry();
358 357
359 entry = debugfs_create_file("stack_max_size", 0644, d_tracer, 358 trace_create_file("stack_max_size", 0644, d_tracer,
360 &max_stack_size, &stack_max_size_fops); 359 &max_stack_size, &stack_max_size_fops);
361 if (!entry)
362 pr_warning("Could not create debugfs 'stack_max_size' entry\n");
363 360
364 entry = debugfs_create_file("stack_trace", 0444, d_tracer, 361 trace_create_file("stack_trace", 0444, d_tracer,
365 NULL, &stack_trace_fops); 362 NULL, &stack_trace_fops);
366 if (!entry)
367 pr_warning("Could not create debugfs 'stack_trace' entry\n");
368 363
369 if (stack_tracer_enabled) 364 if (stack_tracer_enabled)
370 register_ftrace_function(&trace_ops); 365 register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index acdebd771a93..c00643733f4c 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Infrastructure for statistic tracing (histogram output). 2 * Infrastructure for statistic tracing (histogram output).
3 * 3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> 4 * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
5 * 5 *
6 * Based on the code from trace_branch.c which is 6 * Based on the code from trace_branch.c which is
7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
@@ -10,22 +10,27 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/rbtree.h>
13#include <linux/debugfs.h> 14#include <linux/debugfs.h>
14#include "trace_stat.h" 15#include "trace_stat.h"
15#include "trace.h" 16#include "trace.h"
16 17
17 18
18/* List of stat entries from a tracer */ 19/*
19struct trace_stat_list { 20 * List of stat red-black nodes from a tracer
20 struct list_head list; 21 * We use a such tree to sort quickly the stat
22 * entries from the tracer.
23 */
24struct stat_node {
25 struct rb_node node;
21 void *stat; 26 void *stat;
22}; 27};
23 28
24/* A stat session is the stats output in one file */ 29/* A stat session is the stats output in one file */
25struct tracer_stat_session { 30struct stat_session {
26 struct list_head session_list; 31 struct list_head session_list;
27 struct tracer_stat *ts; 32 struct tracer_stat *ts;
28 struct list_head stat_list; 33 struct rb_root stat_root;
29 struct mutex stat_mutex; 34 struct mutex stat_mutex;
30 struct dentry *file; 35 struct dentry *file;
31}; 36};
@@ -37,18 +42,48 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
37/* The root directory for all stat files */ 42/* The root directory for all stat files */
38static struct dentry *stat_dir; 43static struct dentry *stat_dir;
39 44
45/*
46 * Iterate through the rbtree using a post order traversal path
47 * to release the next node.
48 * It won't necessary release one at each iteration
49 * but it will at least advance closer to the next one
50 * to be released.
51 */
52static struct rb_node *release_next(struct rb_node *node)
53{
54 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node);
56
57 if (node->rb_left)
58 return node->rb_left;
59 else if (node->rb_right)
60 return node->rb_right;
61 else {
62 if (!parent)
63 ;
64 else if (parent->rb_left == node)
65 parent->rb_left = NULL;
66 else
67 parent->rb_right = NULL;
68
69 snode = container_of(node, struct stat_node, node);
70 kfree(snode);
71
72 return parent;
73 }
74}
40 75
41static void reset_stat_session(struct tracer_stat_session *session) 76static void reset_stat_session(struct stat_session *session)
42{ 77{
43 struct trace_stat_list *node, *next; 78 struct rb_node *node = session->stat_root.rb_node;
44 79
45 list_for_each_entry_safe(node, next, &session->stat_list, list) 80 while (node)
46 kfree(node); 81 node = release_next(node);
47 82
48 INIT_LIST_HEAD(&session->stat_list); 83 session->stat_root = RB_ROOT;
49} 84}
50 85
51static void destroy_session(struct tracer_stat_session *session) 86static void destroy_session(struct stat_session *session)
52{ 87{
53 debugfs_remove(session->file); 88 debugfs_remove(session->file);
54 reset_stat_session(session); 89 reset_stat_session(session);
@@ -56,25 +91,60 @@ static void destroy_session(struct tracer_stat_session *session)
56 kfree(session); 91 kfree(session);
57} 92}
58 93
94typedef int (*cmp_stat_t)(void *, void *);
95
96static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
97{
98 struct rb_node **new = &(root->rb_node), *parent = NULL;
99 struct stat_node *data;
100
101 data = kzalloc(sizeof(*data), GFP_KERNEL);
102 if (!data)
103 return -ENOMEM;
104 data->stat = stat;
105
106 /*
107 * Figure out where to put new node
108 * This is a descendent sorting
109 */
110 while (*new) {
111 struct stat_node *this;
112 int result;
113
114 this = container_of(*new, struct stat_node, node);
115 result = cmp(data->stat, this->stat);
116
117 parent = *new;
118 if (result >= 0)
119 new = &((*new)->rb_left);
120 else
121 new = &((*new)->rb_right);
122 }
123
124 rb_link_node(&data->node, parent, new);
125 rb_insert_color(&data->node, root);
126 return 0;
127}
128
59/* 129/*
60 * For tracers that don't provide a stat_cmp callback. 130 * For tracers that don't provide a stat_cmp callback.
61 * This one will force an immediate insertion on tail of 131 * This one will force an insertion as right-most node
62 * the list. 132 * in the rbtree.
63 */ 133 */
64static int dummy_cmp(void *p1, void *p2) 134static int dummy_cmp(void *p1, void *p2)
65{ 135{
66 return 1; 136 return -1;
67} 137}
68 138
69/* 139/*
70 * Initialize the stat list at each trace_stat file opening. 140 * Initialize the stat rbtree at each trace_stat file opening.
71 * All of these copies and sorting are required on all opening 141 * All of these copies and sorting are required on all opening
72 * since the stats could have changed between two file sessions. 142 * since the stats could have changed between two file sessions.
73 */ 143 */
74static int stat_seq_init(struct tracer_stat_session *session) 144static int stat_seq_init(struct stat_session *session)
75{ 145{
76 struct trace_stat_list *iter_entry, *new_entry;
77 struct tracer_stat *ts = session->ts; 146 struct tracer_stat *ts = session->ts;
147 struct rb_root *root = &session->stat_root;
78 void *stat; 148 void *stat;
79 int ret = 0; 149 int ret = 0;
80 int i; 150 int i;
@@ -85,29 +155,16 @@ static int stat_seq_init(struct tracer_stat_session *session)
85 if (!ts->stat_cmp) 155 if (!ts->stat_cmp)
86 ts->stat_cmp = dummy_cmp; 156 ts->stat_cmp = dummy_cmp;
87 157
88 stat = ts->stat_start(); 158 stat = ts->stat_start(ts);
89 if (!stat) 159 if (!stat)
90 goto exit; 160 goto exit;
91 161
92 /* 162 ret = insert_stat(root, stat, ts->stat_cmp);
93 * The first entry. Actually this is the second, but the first 163 if (ret)
94 * one (the stat_list head) is pointless.
95 */
96 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
97 if (!new_entry) {
98 ret = -ENOMEM;
99 goto exit; 164 goto exit;
100 }
101
102 INIT_LIST_HEAD(&new_entry->list);
103
104 list_add(&new_entry->list, &session->stat_list);
105
106 new_entry->stat = stat;
107 165
108 /* 166 /*
109 * Iterate over the tracer stat entries and store them in a sorted 167 * Iterate over the tracer stat entries and store them in an rbtree.
110 * list.
111 */ 168 */
112 for (i = 1; ; i++) { 169 for (i = 1; ; i++) {
113 stat = ts->stat_next(stat, i); 170 stat = ts->stat_next(stat, i);
@@ -116,36 +173,16 @@ static int stat_seq_init(struct tracer_stat_session *session)
116 if (!stat) 173 if (!stat)
117 break; 174 break;
118 175
119 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); 176 ret = insert_stat(root, stat, ts->stat_cmp);
120 if (!new_entry) { 177 if (ret)
121 ret = -ENOMEM; 178 goto exit_free_rbtree;
122 goto exit_free_list;
123 }
124
125 INIT_LIST_HEAD(&new_entry->list);
126 new_entry->stat = stat;
127
128 list_for_each_entry_reverse(iter_entry, &session->stat_list,
129 list) {
130
131 /* Insertion with a descendent sorting */
132 if (ts->stat_cmp(iter_entry->stat,
133 new_entry->stat) >= 0) {
134
135 list_add(&new_entry->list, &iter_entry->list);
136 break;
137 }
138 }
139
140 /* The current larger value */
141 if (list_empty(&new_entry->list))
142 list_add(&new_entry->list, &session->stat_list);
143 } 179 }
180
144exit: 181exit:
145 mutex_unlock(&session->stat_mutex); 182 mutex_unlock(&session->stat_mutex);
146 return ret; 183 return ret;
147 184
148exit_free_list: 185exit_free_rbtree:
149 reset_stat_session(session); 186 reset_stat_session(session);
150 mutex_unlock(&session->stat_mutex); 187 mutex_unlock(&session->stat_mutex);
151 return ret; 188 return ret;
@@ -154,38 +191,51 @@ exit_free_list:
154 191
155static void *stat_seq_start(struct seq_file *s, loff_t *pos) 192static void *stat_seq_start(struct seq_file *s, loff_t *pos)
156{ 193{
157 struct tracer_stat_session *session = s->private; 194 struct stat_session *session = s->private;
195 struct rb_node *node;
196 int i;
158 197
159 /* Prevent from tracer switch or stat_list modification */ 198 /* Prevent from tracer switch or rbtree modification */
160 mutex_lock(&session->stat_mutex); 199 mutex_lock(&session->stat_mutex);
161 200
162 /* If we are in the beginning of the file, print the headers */ 201 /* If we are in the beginning of the file, print the headers */
163 if (!*pos && session->ts->stat_headers) 202 if (!*pos && session->ts->stat_headers) {
203 (*pos)++;
164 return SEQ_START_TOKEN; 204 return SEQ_START_TOKEN;
205 }
165 206
166 return seq_list_start(&session->stat_list, *pos); 207 node = rb_first(&session->stat_root);
208 for (i = 0; node && i < *pos; i++)
209 node = rb_next(node);
210
211 (*pos)++;
212
213 return node;
167} 214}
168 215
169static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) 216static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
170{ 217{
171 struct tracer_stat_session *session = s->private; 218 struct stat_session *session = s->private;
219 struct rb_node *node = p;
220
221 (*pos)++;
172 222
173 if (p == SEQ_START_TOKEN) 223 if (p == SEQ_START_TOKEN)
174 return seq_list_start(&session->stat_list, *pos); 224 return rb_first(&session->stat_root);
175 225
176 return seq_list_next(p, &session->stat_list, pos); 226 return rb_next(node);
177} 227}
178 228
179static void stat_seq_stop(struct seq_file *s, void *p) 229static void stat_seq_stop(struct seq_file *s, void *p)
180{ 230{
181 struct tracer_stat_session *session = s->private; 231 struct stat_session *session = s->private;
182 mutex_unlock(&session->stat_mutex); 232 mutex_unlock(&session->stat_mutex);
183} 233}
184 234
185static int stat_seq_show(struct seq_file *s, void *v) 235static int stat_seq_show(struct seq_file *s, void *v)
186{ 236{
187 struct tracer_stat_session *session = s->private; 237 struct stat_session *session = s->private;
188 struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); 238 struct stat_node *l = container_of(v, struct stat_node, node);
189 239
190 if (v == SEQ_START_TOKEN) 240 if (v == SEQ_START_TOKEN)
191 return session->ts->stat_headers(s); 241 return session->ts->stat_headers(s);
@@ -205,7 +255,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
205{ 255{
206 int ret; 256 int ret;
207 257
208 struct tracer_stat_session *session = inode->i_private; 258 struct stat_session *session = inode->i_private;
209 259
210 ret = seq_open(file, &trace_stat_seq_ops); 260 ret = seq_open(file, &trace_stat_seq_ops);
211 if (!ret) { 261 if (!ret) {
@@ -218,11 +268,11 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
218} 268}
219 269
220/* 270/*
221 * Avoid consuming memory with our now useless list. 271 * Avoid consuming memory with our now useless rbtree.
222 */ 272 */
223static int tracing_stat_release(struct inode *i, struct file *f) 273static int tracing_stat_release(struct inode *i, struct file *f)
224{ 274{
225 struct tracer_stat_session *session = i->i_private; 275 struct stat_session *session = i->i_private;
226 276
227 mutex_lock(&session->stat_mutex); 277 mutex_lock(&session->stat_mutex);
228 reset_stat_session(session); 278 reset_stat_session(session);
@@ -251,7 +301,7 @@ static int tracing_stat_init(void)
251 return 0; 301 return 0;
252} 302}
253 303
254static int init_stat_file(struct tracer_stat_session *session) 304static int init_stat_file(struct stat_session *session)
255{ 305{
256 if (!stat_dir && tracing_stat_init()) 306 if (!stat_dir && tracing_stat_init())
257 return -ENODEV; 307 return -ENODEV;
@@ -266,7 +316,7 @@ static int init_stat_file(struct tracer_stat_session *session)
266 316
267int register_stat_tracer(struct tracer_stat *trace) 317int register_stat_tracer(struct tracer_stat *trace)
268{ 318{
269 struct tracer_stat_session *session, *node, *tmp; 319 struct stat_session *session, *node;
270 int ret; 320 int ret;
271 321
272 if (!trace) 322 if (!trace)
@@ -277,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace)
277 327
278 /* Already registered? */ 328 /* Already registered? */
279 mutex_lock(&all_stat_sessions_mutex); 329 mutex_lock(&all_stat_sessions_mutex);
280 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { 330 list_for_each_entry(node, &all_stat_sessions, session_list) {
281 if (node->ts == trace) { 331 if (node->ts == trace) {
282 mutex_unlock(&all_stat_sessions_mutex); 332 mutex_unlock(&all_stat_sessions_mutex);
283 return -EINVAL; 333 return -EINVAL;
@@ -286,15 +336,13 @@ int register_stat_tracer(struct tracer_stat *trace)
286 mutex_unlock(&all_stat_sessions_mutex); 336 mutex_unlock(&all_stat_sessions_mutex);
287 337
288 /* Init the session */ 338 /* Init the session */
289 session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); 339 session = kzalloc(sizeof(*session), GFP_KERNEL);
290 if (!session) 340 if (!session)
291 return -ENOMEM; 341 return -ENOMEM;
292 342
293 session->ts = trace; 343 session->ts = trace;
294 INIT_LIST_HEAD(&session->session_list); 344 INIT_LIST_HEAD(&session->session_list);
295 INIT_LIST_HEAD(&session->stat_list);
296 mutex_init(&session->stat_mutex); 345 mutex_init(&session->stat_mutex);
297 session->file = NULL;
298 346
299 ret = init_stat_file(session); 347 ret = init_stat_file(session);
300 if (ret) { 348 if (ret) {
@@ -312,7 +360,7 @@ int register_stat_tracer(struct tracer_stat *trace)
312 360
313void unregister_stat_tracer(struct tracer_stat *trace) 361void unregister_stat_tracer(struct tracer_stat *trace)
314{ 362{
315 struct tracer_stat_session *node, *tmp; 363 struct stat_session *node, *tmp;
316 364
317 mutex_lock(&all_stat_sessions_mutex); 365 mutex_lock(&all_stat_sessions_mutex);
318 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { 366 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 202274cf7f3d..f3546a2cd826 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -12,7 +12,7 @@ struct tracer_stat {
12 /* The name of your stat file */ 12 /* The name of your stat file */
13 const char *name; 13 const char *name;
14 /* Iteration over statistic entries */ 14 /* Iteration over statistic entries */
15 void *(*stat_start)(void); 15 void *(*stat_start)(struct tracer_stat *trace);
16 void *(*stat_next)(void *prev, int idx); 16 void *(*stat_next)(void *prev, int idx);
17 /* Compare two entries for stats sorting */ 17 /* Compare two entries for stats sorting */
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 91fd19c2149f..e04b76cc238a 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = {
321 321
322void init_tracer_sysprof_debugfs(struct dentry *d_tracer) 322void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
323{ 323{
324 struct dentry *entry;
325 324
326 entry = debugfs_create_file("sysprof_sample_period", 0644, 325 trace_create_file("sysprof_sample_period", 0644,
327 d_tracer, NULL, &sysprof_sample_fops); 326 d_tracer, NULL, &sysprof_sample_fops);
328 if (entry)
329 return;
330 pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
331} 327}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 797201e4a137..97fcea4acce1 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8 8
9#include <trace/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include "trace_stat.h" 12#include "trace_stat.h"
@@ -16,8 +16,6 @@
16/* A cpu workqueue thread */ 16/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 17struct cpu_workqueue_stats {
18 struct list_head list; 18 struct list_head list;
19/* Useful to know if we print the cpu headers */
20 bool first_entry;
21 int cpu; 19 int cpu;
22 pid_t pid; 20 pid_t pid;
23/* Can be inserted from interrupt or user context, need to be atomic */ 21/* Can be inserted from interrupt or user context, need to be atomic */
@@ -47,12 +45,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
47 struct work_struct *work) 45 struct work_struct *work)
48{ 46{
49 int cpu = cpumask_first(&wq_thread->cpus_allowed); 47 int cpu = cpumask_first(&wq_thread->cpus_allowed);
50 struct cpu_workqueue_stats *node, *next; 48 struct cpu_workqueue_stats *node;
51 unsigned long flags; 49 unsigned long flags;
52 50
53 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 51 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
54 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, 52 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
55 list) {
56 if (node->pid == wq_thread->pid) { 53 if (node->pid == wq_thread->pid) {
57 atomic_inc(&node->inserted); 54 atomic_inc(&node->inserted);
58 goto found; 55 goto found;
@@ -69,12 +66,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,
69 struct work_struct *work) 66 struct work_struct *work)
70{ 67{
71 int cpu = cpumask_first(&wq_thread->cpus_allowed); 68 int cpu = cpumask_first(&wq_thread->cpus_allowed);
72 struct cpu_workqueue_stats *node, *next; 69 struct cpu_workqueue_stats *node;
73 unsigned long flags; 70 unsigned long flags;
74 71
75 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 72 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
76 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, 73 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
77 list) {
78 if (node->pid == wq_thread->pid) { 74 if (node->pid == wq_thread->pid) {
79 node->executed++; 75 node->executed++;
80 goto found; 76 goto found;
@@ -105,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
105 cws->pid = wq_thread->pid; 101 cws->pid = wq_thread->pid;
106 102
107 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
108 if (list_empty(&workqueue_cpu_stat(cpu)->list))
109 cws->first_entry = true;
110 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); 104 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
111 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 105 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
112} 106}
@@ -152,7 +146,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
152 return ret; 146 return ret;
153} 147}
154 148
155static void *workqueue_stat_start(void) 149static void *workqueue_stat_start(struct tracer_stat *trace)
156{ 150{
157 int cpu; 151 int cpu;
158 void *ret = NULL; 152 void *ret = NULL;
@@ -191,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)
191static int workqueue_stat_show(struct seq_file *s, void *p) 185static int workqueue_stat_show(struct seq_file *s, void *p)
192{ 186{
193 struct cpu_workqueue_stats *cws = p; 187 struct cpu_workqueue_stats *cws = p;
194 unsigned long flags;
195 int cpu = cws->cpu;
196 struct pid *pid; 188 struct pid *pid;
197 struct task_struct *tsk; 189 struct task_struct *tsk;
198 190
199 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
200 if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
201 seq_printf(s, "\n");
202 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
203
204 pid = find_get_pid(cws->pid); 191 pid = find_get_pid(cws->pid);
205 if (pid) { 192 if (pid) {
206 tsk = get_pid_task(pid, PIDTYPE_PID); 193 tsk = get_pid_task(pid, PIDTYPE_PID);
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c8..ea7c3b4275cf 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
154 if (!list_empty(&wait->task_list)) 154 if (!list_empty(&wait->task_list))
155 list_del_init(&wait->task_list); 155 list_del_init(&wait->task_list);
156 else if (waitqueue_active(q)) 156 else if (waitqueue_active(q))
157 __wake_up_common(q, mode, 1, 0, key); 157 __wake_up_locked_key(q, mode, key);
158 spin_unlock_irqrestore(&q->lock, flags); 158 spin_unlock_irqrestore(&q->lock, flags);
159} 159}
160EXPORT_SYMBOL(abort_exclusive_wait); 160EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f71fb2a08950..0668795d8818 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,7 +33,8 @@
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 35#include <linux/lockdep.h>
36#include <trace/workqueue.h> 36#define CREATE_TRACE_POINTS
37#include <trace/events/workqueue.h>
37 38
38/* 39/*
39 * The per-CPU workqueue (if single thread, we always use the first 40 * The per-CPU workqueue (if single thread, we always use the first
@@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
124 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 125 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
125} 126}
126 127
127DEFINE_TRACE(workqueue_insertion);
128
129static void insert_work(struct cpu_workqueue_struct *cwq, 128static void insert_work(struct cpu_workqueue_struct *cwq,
130 struct work_struct *work, struct list_head *head) 129 struct work_struct *work, struct list_head *head)
131{ 130{
@@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
262} 261}
263EXPORT_SYMBOL_GPL(queue_delayed_work_on); 262EXPORT_SYMBOL_GPL(queue_delayed_work_on);
264 263
265DEFINE_TRACE(workqueue_execution);
266
267static void run_workqueue(struct cpu_workqueue_struct *cwq) 264static void run_workqueue(struct cpu_workqueue_struct *cwq)
268{ 265{
269 spin_lock_irq(&cwq->lock); 266 spin_lock_irq(&cwq->lock);
@@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
753 return cwq; 750 return cwq;
754} 751}
755 752
756DEFINE_TRACE(workqueue_creation);
757
758static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 753static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
759{ 754{
760 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 755 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
860} 855}
861EXPORT_SYMBOL_GPL(__create_workqueue_key); 856EXPORT_SYMBOL_GPL(__create_workqueue_key);
862 857
863DEFINE_TRACE(workqueue_destruction);
864
865static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) 858static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
866{ 859{
867 /* 860 /*